3 * (c) Copyright 2007-2008 by Intra2net AG
18 #include <stringfunc.hxx>
// digit tables for hex conversion: the index of a character equals its nibble value (0..15);
// one table for lower case and one for upper case output.
29 const std::string hexDigitsLower("0123456789abcdef");
30 const std::string hexDigitsUpper("0123456789ABCDEF");
/**
 * @brief functor which converts a single character to upper case.
 *
 * The character is handed to std::toupper as unsigned char: passing a negative
 * value (any 8 bit character where char is signed) to the <cctype> functions
 * is undefined behaviour.
 */
struct UpperFunc
{
    /**
     * @param c the character to convert.
     * @return the upper case counterpart of @a c (or @a c itself if there is none).
     */
    char operator() (char c) const
    {
        return static_cast<char>( std::toupper( static_cast<unsigned char>(c) ) );
    }
}; // eo struct UpperFunc
/**
 * @brief functor which converts a single character to lower case.
 *
 * The character is handed to std::tolower as unsigned char: passing a negative
 * value (any 8 bit character where char is signed) to the <cctype> functions
 * is undefined behaviour.
 */
struct LowerFunc
{
    /**
     * @param c the character to convert.
     * @return the lower case counterpart of @a c (or @a c itself if there is none).
     */
    char operator() (char c) const
    {
        return static_cast<char>( std::tolower( static_cast<unsigned char>(c) ) );
    }
}; // eo struct LowerFunc
51 } // eo namespace <anonymous>
56 * default list of Whitespaces (" \t\r\n");
// the characters are: blank, horizontal tab, carriage return and line feed.
58 const std::string Whitespaces = " \t\r\n";
61 * default list of lineendings ("\r\n");
// the characters are: carriage return and line feed.
63 const std::string LineEndings= "\r\n";
/**
 * @brief tests whether a string starts with a given prefix.
 * @param[in] str the string to examine.
 * @param[in] prefix the prefix to look for.
 * @return @a true iff @a prefix is not empty and @a str begins with it.
 */
bool has_prefix(const std::string& str, const std::string& prefix)
{
    const std::string::size_type prefix_len = prefix.size();
    // an empty prefix never matches, a prefix longer than the string cannot match
    const bool comparable = (prefix_len != 0) && (prefix_len <= str.size());
    return comparable && (str.compare(0, prefix_len, prefix) == 0);
} // eo has_prefix(const std::string&,const std::string&)
/**
 * @brief tests whether a string ends with a given suffix.
 * @param[in] str the string to examine.
 * @param[in] suffix the suffix to look for.
 * @return @a true iff @a suffix is not empty and @a str ends with it.
 */
bool has_suffix(const std::string& str, const std::string& suffix)
{
    const std::string::size_type suffix_len = suffix.size();
    // an empty suffix never matches, a suffix longer than the string cannot match
    const bool comparable = (suffix_len != 0) && (suffix_len <= str.size());
    if (!comparable)
    {
        return false;
    }
    const std::string::size_type tail_start = str.size() - suffix_len;
    return str.compare(tail_start, suffix_len, suffix) == 0;
} // eo has_suffix(const std::string&,const std::string&)
100 * cut off characters from a given list from front and end of a string.
101 * @param[in,out] str the string which should be trimmed.
102 * @param charlist the list of characters to remove from beginning and end of string
103 * @return the result string.
// Works on str in place; the return type is a std::string by value, so callers get a copy.
105 std::string trim_mod(std::string& str, const std::string& charlist)
107 // first: trim the beginning:
// pos: index of the first character that is NOT contained in charlist (npos if there is none)
108 std::string::size_type pos= str.find_first_not_of (charlist);
109 if (pos == std::string::npos)
111 // whole string consists of charlist (or is already empty)
117 // str starts with charlist
120 // now let's look at the tail:
// +1 turns the index of the last character to keep into the index of the first one to drop
121 pos= str.find_last_not_of(charlist) +1; // note: we already know there is at least one other char!
122 if ( pos < str.size() )
// cut off everything behind the last character that is not in charlist
124 str.erase(pos, str.size()-pos);
127 } // eo trim_mod(std::string&,const std::string&)
132 * removes last character from a string when it is in a list of chars to be removed.
133 * @param[in,out] str the string.
134 * @param what the list of chars which will be tested for.
135 * @return the resulting string with last char removed (if applicable)
// Works on str in place; at most one character (the last one) is removed.
137 std::string chomp_mod(std::string& str, const std::string& what)
// guard: with an empty string or an empty character list there is nothing to test
// (presumably str is returned unchanged in that case -- confirm)
139 if (str.empty() || what.empty() )
// is the last character of str one of the characters listed in what?
143 if (what.find(str.at (str.size()-1) ) != std::string::npos)
// erase from the last index up to the end, i.e. exactly one character
145 str.erase(str.size() - 1);
148 } // eo chomp_mod(std::string&,const std::string&)
152 * @brief converts a string to lower case.
153 * @param[in,out] str the string to modify.
// Converts str in place; the return type is a std::string by value.
156 std::string to_lower_mod(std::string& str)
// input and output range are identical: every character is overwritten with LowerFunc's result
158 std::transform(str.begin(), str.end(), str.begin(), LowerFunc() );
160 } // eo to_lower_mod(std::string&)
164 * @brief converts a string to upper case.
165 * @param[in,out] str the string to modify.
// Converts str in place; the return type is a std::string by value.
168 std::string to_upper_mod(std::string& str)
// input and output range are identical: every character is overwritten with UpperFunc's result
170 std::transform( str.begin(), str.end(), str.begin(), UpperFunc() );
172 } // eo to_upper_mod(std::string&)
/**
 * @brief returns a copy of a string without leading and trailing characters from a given list.
 * @param str the string which should be trimmed (left untouched).
 * @param charlist the characters to strip from the beginning and the end.
 * @return the trimmed copy; empty if @a str consists of characters from @a charlist only.
 */
std::string trim (const std::string& str, const std::string& charlist)
{
    const std::string::size_type first_kept = str.find_first_not_of(charlist);
    if (first_kept != std::string::npos)
    {
        // at least one character survives, so the search from the back must succeed, too
        const std::string::size_type last_kept = str.find_last_not_of(charlist);
        const std::string::size_type kept_len = last_kept - first_kept + 1;
        return str.substr(first_kept, kept_len);
    }
    // nothing but characters from charlist (or an empty string)
    return std::string();
} // eo trim(const std::string&,const std::string&)
198 * removes last character from a string when it is in a list of chars to be removed.
199 * @param str the string.
200 * @param what the list of chars which will be tested for.
201 * @return the resulting string with last char removed (if applicable)
// Non-modifying variant: str is const, the result is a new string.
203 std::string chomp (const std::string& str, const std::string& what)
// guard: with an empty string or an empty character list there is nothing to test
// (presumably str is returned as is in that case -- confirm)
205 if (str.empty() || what.empty() )
// is the last character of str one of the characters listed in what?
209 if (what.find(str.at (str.size()-1) ) != std::string::npos)
// copy of str without its last character
211 return str.substr(0, str.size()-1);
214 } // eo chomp(const std::string&,const std::string&)
218 * @brief returns a lower case version of a given string.
219 * @param str the string
220 * @return the lower case version of the string
222 std::string to_lower (const std::string& str)
224 std::string result(str);
225 return to_lower_mod(result);
226 } // eo to_lower(const std::string&)
230 * @brief returns a upper case version of a given string.
231 * @param str the string
232 * @return the upper case version of the string
234 std::string to_upper(const std::string& str)
236 std::string result(str);
237 return to_upper_mod(result);
238 } // eo to_upper(const std::string&)
243 * @brief removes a given suffix from a string.
244 * @param str the string.
245 * @param suffix the suffix which should be removed if the string ends with it.
246 * @return the string without the suffix.
248 * If the string ends with the suffix, it is removed. If the string doesn't end
249 * with the suffix the original string is returned.
251 std::string remove_suffix(const std::string& str, const std::string& suffix)
// has_suffix() also covers the length check, so the subtraction below cannot underflow
253 if (has_suffix(str,suffix) )
// keep everything in front of the suffix
255 return str.substr(0, str.size()-suffix.size() );
258 } // eo remove_suffix(const std::string&,const std::string&)
263 * @brief removes a given prefix from a string.
264 * @param str the string.
265 * @param prefix the prefix which should be removed if the string begins with it.
266 * @return the string without the prefix.
268 * If the string begins with the prefix, it is removed. If the string doesn't begin
269 * with the prefix the original string is returned.
271 std::string remove_prefix(const std::string& str, const std::string& prefix)
// has_prefix() also covers the length check, so prefix.size() is a valid start index below
273 if (has_prefix(str,prefix) )
// keep everything behind the prefix
275 return str.substr( prefix.size() );
278 } // eo remove_prefix(const std::string&,const std::string&)
282 * split a string to key and value delimited by a given delimiter.
283 * The resulting key and value strings are trimmed (Whitespaces removed at beginning and end).
284 * @param str the string which should be split.
285 * @param[out] key the resulting key
286 * @param[out] value the resulting value
287 * @param delimiter the delimiter between key and value; default is '='.
288 * @return @a true if the split was successful.
291 const std::string& str,
// only the FIRST occurrence of the delimiter is used; without one the split fails
296 std::string::size_type pos = str.find (delimiter);
297 if (pos == std::string::npos) return false;
// key: everything in front of the delimiter
298 key= str.substr(0,pos);
// value: everything behind the delimiter (pos+1 skips the single delimiter character)
299 value= str.substr(pos+1);
303 } // eo pair_split(const std::string&,std::string&,std::string&,char)
307 * splits a string by given delimiter
309 * @param[in] str the string which should be splitted.
310 * @param[out] result the list resulting from splitting @a str.
311 * @param[in] delimiter the delimiter (word/phrase) at which @a str should be splitted.
312 * @param[in] omit_empty should empty parts not be stored?
313 * @param[in] trim_list list of characters the parts should be trimmed by.
314 * (empty string results in no trim)
317 const std::string& str,
318 std::list<std::string>& result,
319 const std::string& delimiter,
321 const std::string& trim_list
324 std::string::size_type pos, last_pos=0;
325 bool delimiter_found= false;
326 while ( last_pos < str.size() && last_pos != std::string::npos)
328 pos= str.find(delimiter, last_pos);
330 if (pos == std::string::npos)
332 part= str.substr(last_pos);
333 delimiter_found= false;
337 part= str.substr(last_pos, pos-last_pos);
338 delimiter_found=true;
340 if (pos != std::string::npos)
342 last_pos= pos+ delimiter.size();
346 last_pos= std::string::npos;
348 if (!trim_list.empty() ) trim_mod (part, trim_list);
349 if (omit_empty && part.empty() ) continue;
350 result.push_back( part );
352 // if the string ends with a delimiter we need to append an empty string if no omit_empty
354 // (this way we keep the split result consistent to a join operation)
355 if (delimiter_found && !omit_empty)
357 result.push_back("");
359 } // eo split_string(const std::string&,std::list< std::string >&,const std::string&,bool,const std::string&)
363 * splits a string by a given delimiter
364 * @param str the string which should be split.
365 * @param delimiter the delimiter (word/phrase) at which @a str should be split.
366 * @param[in] omit_empty should empty parts not be stored?
367 * @param[in] trim_list list of characters the parts should be trimmed by.
368 * (empty string results in no trim)
369 * @return the list resulting from splitting @a str.
// Convenience overload: collects the parts in a local list instead of a caller supplied one.
371 std::list<std::string> split_string(
372 const std::string& str,
373 const std::string& delimiter,
375 const std::string& trim_list
378 std::list<std::string> result;
// the actual work is done by the overload which fills a list passed by reference
379 split_string(str, result, delimiter, omit_empty, trim_list);
381 } // eo split_string(const std::string&,const std::string&,bool,const std::string&)
385 * @brief joins a list of strings into a single string.
387 * This function is (basically) the reverse operation of @a split_string.
389 * @param parts the list of strings.
390 * @param delimiter the delimiter which is inserted between the strings.
391 * @return the joined string.
393 std::string join_string(
394 const std::list< std::string >& parts,
395 const std::string& delimiter
// an empty list is skipped entirely
399 if (! parts.empty() )
401 std::list< std::string >::const_iterator it= parts.begin();
// pre-increment inside the condition: the loop body starts with the SECOND element
// (presumably the first element is stored before the loop -- confirm)
403 while ( ++it != parts.end() )
410 } // eo join_string(const std::list< std::string >&,const std::string&)
420 * @brief returns a hex string from a binary string.
421 * @param str the (binary) string
422 * @param upper_case_digits determine whether to use upper case characters for digits A-F.
423 * @return the string in hex notation.
425 std::string convert_binary_to_hex(
426 const std::string& str,
427 bool upper_case_digits
// pick the digit table once, in front of the loop
431 std::string hexDigits(upper_case_digits ? hexDigitsUpper : hexDigitsLower);
432 for ( std::string::const_iterator it= str.begin();
// two digits per input byte: the upper nibble first ...
// (the mask is applied after the shift, so only the lowest four bits select the digit)
436 result.push_back( hexDigits[ ( (*it) >> 4) & 0x0f ] );
// ... then the lower nibble
437 result.push_back( hexDigits[ (*it) & 0x0f ] );
440 } // eo convert_binary_to_hex(const std::string&,bool)
444 * @brief converts a hex digit string to binary string.
445 * @param str hex digit string
446 * @return the binary string.
448 * The hex digit string may contain white spaces or colons which are treated
449 * as delimiters between hex digit groups.
451 * @todo rework the handling of half nibbles (consistency)!
453 std::string convert_hex_to_binary(
454 const std::string& str
// NOTE(review): dynamic exception specifications were removed from the language in C++17;
// this line only compiles with older language standards.
456 throw (std::runtime_error)
// hasNibble: a single hex digit has been read and is waiting for its partner
460 bool hasNibble= false;
// lastWasWS: starts as true, i.e. the beginning of the string counts like a delimiter
461 bool lastWasWS= true;
462 for ( std::string::const_iterator it= str.begin();
// look the character up in both digit tables; the index found is the nibble value
466 std::string::size_type p = hexDigitsLower.find( *it );
467 if (p== std::string::npos)
469 p= hexDigitsUpper.find( *it );
// still npos: the character is no hex digit at all
471 if (p == std::string::npos)
473 if ( ( Whitespaces.find( *it ) != std::string::npos) // is it a whitespace?
474 or ( *it == ':') // or a colon?
477 // we treat that as a valid delimiter:
480 // 1 nibble before WS is treated as lower part:
489 if (p == std::string::npos )
// neither hex digit nor delimiter; the complete input string is appended to the message
491 throw runtime_error("illegal character in hex digit string: " + str);
505 //we already had a nibble, so a char is complete now:
506 result.push_back( c );
511 // this is the first nibble of a new char:
517 //well, there is one nibble left
518 // let's do some heuristics:
521 // if the preceding character was a white space (or a colon)
522 // we treat the nibble as lower part:
523 //( this is consistent with shortened hex notations where leading zeros are not noted)
524 result.push_back( c );
528 // if it was part of a hex digit chain, we treat it as UPPER part (!!)
529 result.push_back( c << 4 );
533 } // eo convert_hex_to_binary(const std::string&)
536 } // eo namespace I2n
namespace
{

/**
 * @brief lets one call site work with both common iconv() prototypes.
 *
 * Depending on the platform the input buffer argument of iconv() is declared
 * as @c char** or as @c const @c char**. This adapter converts implicitly to
 * whichever type the prototype asks for.
 */
class IconvInput
{
    public:
        explicit IconvInput(const char **input) : Input(input) {}
        operator const char**() const { return Input; }
        operator char**() const { return const_cast<char**>(Input); }
    private:
        const char **Input;
}; // eo class IconvInput


/**
 * @brief converts a string from one character encoding to another (best effort).
 * @param to_code name of the target encoding.
 * @param from_code name of the source encoding.
 * @param input the string to convert.
 * @param growth size of the output buffer as a multiple of the input size.
 * @param open_error message of the exception thrown if the conversion is not available.
 * @return the converted string. Conversion errors are not reported: the result
 * ends where the conversion stopped (invalid input or output buffer exhausted).
 * Like the buffer based implementation it replaces, the result also ends at the
 * first NUL character.
 * @throw std::runtime_error if the conversion is not available or memory is exhausted.
 */
std::string convert_encoding(
    const char *to_code,
    const char *from_code,
    const std::string& input,
    size_t growth,
    const char *open_error)
{
    iconv_t converter = iconv_open(to_code, from_code);
    if (converter == (iconv_t)-1)
        throw std::runtime_error(open_error);

    size_t in_size = input.size();
    const size_t capacity = in_size * growth;
    size_t out_size = capacity;

    char *buf = static_cast<char*>( malloc(capacity+1) );
    if (buf == NULL)
    {
        // do not leak the conversion descriptor on the error path
        iconv_close(converter);
        throw std::runtime_error("out of memory for iconv buffer");
    }

    const char *in = input.c_str();
    char *out = buf;
    // the return value is ignored on purpose: a partial result is better than none
    iconv(converter, IconvInput(&in), &in_size, &out, &out_size);
    iconv_close(converter);

    // out_size is the space left, so (capacity - out_size) bytes were written
    buf[capacity - out_size] = 0;

    std::string result;
    try
    {
        result = buf;
    }
    catch (...)
    {
        free(buf);
        throw;
    }
    free(buf);

    return result;
} // eo convert_encoding

} // eo namespace <anonymous>


/**
 * @brief converts a string from ISO-8859-1 to UTF-8.
 * @param isostring the ISO-8859-1 encoded string.
 * @return the UTF-8 encoded string.
 * @throw std::runtime_error if the conversion is not available or memory is exhausted.
 */
std::string iso_to_utf8(const std::string& isostring)
{
    return convert_encoding("UTF-8", "ISO-8859-1", isostring, 4,
                            "iconv can't convert from ISO-8859-1 to UTF-8");
}


/**
 * @brief converts a string from UTF-8 to ISO-8859-1.
 * @param utf8string the UTF-8 encoded string.
 * @return the ISO-8859-1 encoded string; it ends at the first character which
 * cannot be represented in ISO-8859-1.
 * @throw std::runtime_error if the conversion is not available or memory is exhausted.
 */
std::string utf8_to_iso(const std::string& utf8string)
{
    return convert_encoding("ISO-8859-1", "UTF-8", utf8string, 1,
                            "iconv can't convert from UTF-8 to ISO-8859-1");
}


/**
 * @brief converts a UTF-8 string to a zero terminated buffer of UCS-4 (little endian) characters.
 * @param utf8string the UTF-8 encoded string.
 * @return a buffer allocated with malloc(); the caller has to release it with free().
 * @throw std::runtime_error if the conversion is not available, fails or memory is exhausted.
 */
wchar_t* utf8_to_wbuf(const std::string& utf8string)
{
    iconv_t utf82wstr = iconv_open("UCS-4LE","UTF-8");

    if (utf82wstr == (iconv_t)-1)
        throw std::runtime_error("iconv can't convert from UTF-8 to UCS-4");

    size_t in_size = utf8string.size();
    // every input byte yields at most one character; +1 for the terminator
    const size_t capacity = (in_size+1)*sizeof(wchar_t);
    size_t out_size = capacity;

    wchar_t *buf = static_cast<wchar_t*>( malloc(capacity) );
    if (buf == NULL)
    {
        iconv_close(utf82wstr);
        throw std::runtime_error("out of memory for iconv buffer");
    }

    const char *in = utf8string.c_str();
    char *out = reinterpret_cast<char*>(buf);
    const size_t rc = iconv(utf82wstr, IconvInput(&in), &in_size, &out, &out_size);
    iconv_close(utf82wstr);

    if (rc == static_cast<size_t>(-1))
    {
        // the buffer is never handed out in this case, so release it here
        free(buf);
        throw std::runtime_error("error converting char encodings");
    }

    buf[ (capacity-out_size) /sizeof(wchar_t) ]=0;

    return buf;
}


/**
 * @brief converts a string from UTF-7-IMAP (modified UTF-7 of IMAP mailbox names) to UTF-8.
 * @param utf7imapstring the UTF-7-IMAP encoded string.
 * @return the UTF-8 encoded string.
 * @throw std::runtime_error if the conversion is not available or memory is exhausted.
 */
std::string utf7imap_to_utf8(const std::string& utf7imapstring)
{
    return convert_encoding("UTF-8", "UTF-7-IMAP", utf7imapstring, 4,
                            "iconv can't convert from UTF-7-IMAP to UTF-8");
}


/**
 * @brief converts a string from UTF-8 to UTF-7-IMAP (modified UTF-7 of IMAP mailbox names).
 * @param utf8string the UTF-8 encoded string.
 * @return the UTF-7-IMAP encoded string.
 * @throw std::runtime_error if the conversion is not available or memory is exhausted.
 */
std::string utf8_to_utf7imap(const std::string& utf8string)
{
    // UTF-7 is base64 encoded, a buffer 10x as large
    // as the utf-8 buffer should be enough. If not the string will be truncated.
    return convert_encoding("UTF-7-IMAP", "UTF-8", utf8string, 10,
                            "iconv can't convert from UTF-8 to UTF-7-IMAP");
}
689 // Tokenize string by (html) tags
// Appends (text, is_tag) pairs to tokenized; tokenized is not cleared here.
690 void tokenize_by_tag(vector<pair<string,bool> > &tokenized, const std::string &input)
692 string::size_type pos, len = input.size();
693 bool inside_tag = false;
696 for (pos = 0; pos < len; pos++)
// '<' opens a tag
698 if (input[pos] == '<')
// text collected in front of the '<' becomes a non-tag token (second == false)
702 if (!current.empty() )
704 tokenized.push_back( make_pair(current, false) );
708 current += input[pos];
// '>' closes a tag only if one was opened before
710 else if (input[pos] == '>' && inside_tag)
712 current += input[pos];
// the collected text including '>' becomes a tag token (second == true)
714 if (!current.empty() )
716 tokenized.push_back( make_pair(current, true) );
721 current += input[pos];
724 // String left over in buffer?
// whatever is left is stored as non-tag token; that includes a tag which was never closed
725 if (!current.empty() )
726 tokenized.push_back( make_pair(current, false) );
727 } // eo tokenize_by_tag
730 std::string strip_html_tags(const std::string &input)
732 // Pair first: string, second: isTag
733 vector<pair<string,bool> > tokenized;
734 tokenize_by_tag (tokenized, input);
// the end iterator is fetched once; the vector is not modified while iterating
737 vector<pair<string,bool> >::const_iterator token, tokens_end = tokenized.end();
738 for (token = tokenized.begin(); token != tokens_end; token++)
// NOTE(review): presumably only tokens which are no tags get appended -- confirm
// against the condition guarding this statement.
740 output += token->first;
743 } // eo strip_html_tags
746 // Smart-encode HTML entities: tags are kept as they are, other text goes through html_entities()
747 string smart_html_entities(const std::string &input)
749 // Pair first: string, second: isTag
750 vector<pair<string,bool> > tokenized;
751 tokenize_by_tag (tokenized, input);
// the end iterator is fetched once; the vector is not modified while iterating
754 vector<pair<string,bool> >::const_iterator token, tokens_end = tokenized.end();
755 for (token = tokenized.begin(); token != tokens_end; token++)
757 // keep HTML tags as they are
759 output += token->first;
// everything else gets encoded
761 output += html_entities(token->first);
// Scans str from the front for a byte with a value above 127.
// NOTE(review): html_entities() compares the result with string::npos, so npos is
// presumably the "not found" value and the position p the "found" value -- confirm.
768 string::size_type find_8bit(const std::string &str)
770 string::size_type l=str.size();
771 for (string::size_type p=0; p < l; p++)
// the cast avoids negative values for 8 bit characters where char is signed
772 if (static_cast<unsigned char>(str[p]) > 127)
778 // encoded UTF-8 chars into HTML entities
779 string html_entities(std::string str)
782 replace_all (str, "&", "&");
783 replace_all (str, "\"", """);
784 replace_all (str, "<", "<");
785 replace_all (str, ">", ">");
788 replace_all (str, "\xC3\xA4", "ä");
789 replace_all (str, "\xC3\xB6", "ö");
790 replace_all (str, "\xC3\xBC", "ü");
791 replace_all (str, "\xC3\x84", "Ä");
792 replace_all (str, "\xC3\x96", "Ö");
793 replace_all (str, "\xC3\x9C", "Ü");
796 replace_all (str, "\xC3\x9F", "ß");
798 // conversion of remaining non-ASCII chars needed?
799 // just do if needed because of performance
800 if (find_8bit(str) != string::npos)
802 // convert to fixed-size encoding UTF-32
803 wchar_t* wbuf=utf8_to_wbuf(str);
804 ostringstream target;
806 // replace all non-ASCII chars with HTML representation
807 for (int p=0; wbuf[p] != 0; p++)
809 unsigned int c=wbuf[p];
812 target << static_cast<unsigned char>(c);
814 target << "&#" << c << ';';
823 } // eo html_entities(std::string)
// Overload for two C strings ("ist" = current text, "soll" = desired text).
826 bool replace_all(string &base, const char *ist, const char *soll)
// forwards to the overload taking two string pointers
// (i and s are presumably std::string copies of ist and soll -- confirm)
830 return replace_all(base,&i,&s);
// Overload for a string to search for and a C string as replacement.
833 bool replace_all(string &base, const string &ist, const char *soll)
// forwards to the overload taking two string pointers
// (s is presumably a std::string copy of soll -- confirm)
836 return replace_all(base,&ist,&s);
// Overload for two string pointers; both are dereferenced without a check, so they must not be NULL.
839 bool replace_all(string &base, const string *ist, const string *soll)
// forwards to the overload taking two string references, which does the work
841 return replace_all(base,*ist,*soll);
// Overload for a C string to search for and a string pointer as replacement.
844 bool replace_all(string &base, const char *ist, const string *soll)
// forwards to the overload taking two string pointers
// (i is presumably a std::string copy of ist -- confirm)
847 return replace_all(base,&i,soll);
// Replaces occurrences of ist in base by soll; base is modified in place.
850 bool replace_all(string &base, const string &ist, const string &soll)
// found_ist: presumably the return value (true if at least one replacement happened) -- confirm
852 bool found_ist = false;
// a: position where the next search starts
853 string::size_type a=0;
// an empty search string is rejected with an exception
856 throw runtime_error ("replace_all called with empty search string");
858 while ( (a=base.find(ist,a) ) != string::npos)
// exchange the match at position a
860 base.replace(a,ist.size(),soll);
/**
 * @brief returns a lower case version of a string.
 * @param src the source string (left untouched).
 * @return the lower case version of @a src.
 *
 * Each character is handed to std::tolower as unsigned char: passing a negative
 * value (any 8 bit character where char is signed) is undefined behaviour.
 */
std::string to_lower(const std::string &src)
{
    std::string dst(src);

    std::string::size_type pos, end = dst.size();
    for (pos = 0; pos < end; pos++)
        dst[pos] = static_cast<char>( std::tolower( static_cast<unsigned char>(dst[pos]) ) );

    return dst;
}
/**
 * @brief returns an upper case version of a string.
 * @param src the source string (left untouched).
 * @return the upper case version of @a src.
 *
 * Each character is handed to std::toupper as unsigned char: passing a negative
 * value (any 8 bit character where char is signed) is undefined behaviour.
 */
std::string to_upper(const std::string &src)
{
    std::string dst(src);

    std::string::size_type pos, end = dst.size();
    for (pos = 0; pos < end; pos++)
        dst[pos] = static_cast<char>( std::toupper( static_cast<unsigned char>(dst[pos]) ) );

    return dst;
}
// Formats a byte count as text with a unit suffix. The suffixes are passed through i18n().
892 string nice_unit_format(int input)
// floating point division by ten
// NOTE(review): looks like the step that produces one decimal place -- confirm
907 tmp = float (tmp) /float (10);
// fixed point notation for the number written to out
912 out.setf (ios::fixed);
// one output statement per unit
917 out << size << i18n (" Bytes");
920 out << size << i18n (" KBytes");
923 out << size << i18n (" MBytes");
926 out << size << i18n (" GBytes");
929 out << size << i18n (" TBytes");
932 out << size << i18n (" PBytes");
935 out << size << i18n (" EBytes");
938 out << size << i18n (" ZBytes");
941 out << size << i18n (" YBytes");
// fallback without a named unit: the magnitude is written as power of ten (three digits per step)
944 out << size << "*10^" << (sizecount*3)<< i18n (" Bytes");
949 } // eo nice_unit_format(int input)
// Escapes a string in three passes over the working copy out.
952 string escape(const string &s)
// pass 1: double quotes and backslashes
// (presumably each gets a backslash put in front -- confirm)
958 while ( (p=out.find_first_of("\"\\",p) ) !=out.npos)
// pass 2: a carriage return becomes the two characters backslash and 'r'
965 while ( (p=out.find_first_of("\r",p) ) !=out.npos)
967 out.replace (p,1,"\\r");
// pass 3: a line feed becomes the two characters backslash and 'n'
972 while ( (p=out.find_first_of("\n",p) ) !=out.npos)
974 out.replace (p,1,"\\n");
981 } // eo escape(const std::string&)
// Reads a quoted, escaped string out of s, beginning at index startpos.
// endpos is an output parameter, see below.
984 string descape(const string &s, int startpos, int &endpos)
// the character at startpos has to be the opening double quote
// (s.at() itself throws std::out_of_range if startpos is not a valid index)
988 if (s.at(startpos) != '"')
989 throw out_of_range("value not type escaped string");
// work on everything behind the opening quote
991 out=s.substr(startpos+1);
992 string::size_type p=0;
994 // search for the end of the string
995 while ( (p=out.find("\"",p) ) !=out.npos)
1000 // the " might be escaped with a backslash
// walk backwards over the backslashes directly in front of the quote
1001 while (e>=0 && out.at (e) =='\\')
1003 if (escaped == false)
1017 // we now have the end of the string
1018 out=out.substr(0,p);
1020 // tell calling prog about the endposition
// p is an index into out, which starts one character behind startpos; hence the +1
1021 endpos=startpos+p+1;
1023 // descape all \ stuff inside the string now
1025 while ( (p=out.find_first_of("\\",p) ) !=out.npos)
// the character behind the backslash selects the replacement;
// NOTE(review): out.at(p+1) throws std::out_of_range if the backslash is the last character
1027 switch (out.at(p+1) )
// backslash + 'r' becomes a carriage return
1030 out.replace(p,2,"\r");
// backslash + 'n' becomes a line feed
1033 out.replace(p,2,"\n");
1042 } // eo descape(const std::string&,int,int&)
1045 string escape_shellarg(const string &input)
1047 string output = "'";
1048 string::const_iterator it, it_end = input.end();
1049 for (it = input.begin(); it != it_end; it++)