2 The software in this package is distributed under the GNU General
3 Public License version 2 (with a special exception described below).
5 A copy of GNU General Public License (GPL) is included in this distribution,
6 in the file COPYING.GPL.
8 As a special exception, if other files instantiate templates or use macros
9 or inline functions from this file, or you compile this file and link it
10 with other works to produce a work based on this file, this file
11 does not by itself cause the resulting work to be covered
12 by the GNU General Public License.
14 However the source code for this file must still be made available
15 in accordance with section (3) of the GNU General Public License.
17 This exception does not invalidate any other reasons why a work based
18 on this file might be covered by the GNU General Public License.
22 * (c) Copyright 2007-2008 by Intra2net AG
36 #include <stringfunc.hxx>
47 const std::string hexDigitsLower("0123456789abcdef");
48 const std::string hexDigitsUpper("0123456789ABCDEF");
53 char operator() (char c)
55 return std::toupper(c);
57 }; // eo struct UpperFunc
62 char operator() (char c)
64 return std::tolower(c);
66 }; // eo struct LowerFunc
69 } // eo namespace <anonymous>
74 * default list of Whitespaces (" \t\r\n");
76 const std::string Whitespaces = " \t\r\n";
79 * default list of lineendings ("\r\n");
81 const std::string LineEndings= "\r\n";
/**
 * @brief tests whether a string starts with a given prefix.
 * @param str the string which is examined.
 * @param prefix the prefix to look for.
 * @return @a true iff the prefix is not empty and the string begins with that prefix.
 */
bool has_prefix(const std::string& str, const std::string& prefix)
{
    const std::string::size_type prefix_len = prefix.size();

    // an empty prefix never matches; neither does one longer than the string
    // (the latter also covers the empty-string case)
    if (prefix_len == 0 || str.size() < prefix_len)
    {
        return false;
    }
    return str.compare(0, prefix_len, prefix) == 0;
} // eo has_prefix(const std::string&,const std::string&)
/**
 * @brief tests whether a string ends with a given suffix.
 * @param str the string which is examined.
 * @param suffix the suffix to look for.
 * @return @a true iff the suffix is not empty and the string ends with that suffix.
 */
bool has_suffix(const std::string& str, const std::string& suffix)
{
    const std::string::size_type suffix_len = suffix.size();

    // an empty suffix never matches; neither does one longer than the string
    // (the latter also covers the empty-string case)
    if (suffix_len == 0 || str.size() < suffix_len)
    {
        return false;
    }
    const std::string::size_type tail_start = str.size() - suffix_len;
    return str.compare(tail_start, suffix_len, suffix) == 0;
} // eo has_suffix(const std::string&,const std::string&)
/**
 * @brief strips characters from a given list from both ends of a string (in place).
 * @param[in,out] str the string which should be trimmed.
 * @param charlist the list of characters to remove from beginning and end of string
 * @return the result string (a copy of the modified @a str).
 */
std::string trim_mod(std::string& str, const std::string& charlist)
{
    const std::string::size_type first_kept = str.find_first_not_of(charlist);
    if (first_kept == std::string::npos)
    {
        // whole string consists of charlist (or is already empty)
        str.clear();
        return str;
    }

    // at least one character survives, so find_last_not_of cannot fail here;
    // cut the tail first so that first_kept stays valid for the front cut
    const std::string::size_type last_kept = str.find_last_not_of(charlist);
    str.erase(last_kept + 1);
    str.erase(0, first_kept);

    return str;
} // eo trim_mod(std::string&,const std::string&)
/**
 * @brief drops the last character of a string if it is one of a given set (in place).
 * @param[in,out] str the string.
 * @param what the list of chars which will be tested for.
 * @return the resulting string with last char removed (if applicable)
 */
std::string chomp_mod(std::string& str, const std::string& what)
{
    if (!str.empty() && !what.empty() )
    {
        const char last_char = *str.rbegin();
        if (what.find(last_char) != std::string::npos)
        {
            str.resize(str.size() - 1);
        }
    }
    return str;
} // eo chomp_mod(std::string&,const std::string&)
/**
 * @brief converts a string to lower case (in place).
 * @param[in,out] str the string to modify.
 * @return the modified string (a copy of @a str after conversion).
 *
 * Each byte is passed to std::tolower() as an unsigned char value; passing a
 * plain (possibly negative) char is undefined behaviour for 8 bit characters.
 */
std::string to_lower_mod(std::string& str)
{
    for (std::string::iterator it = str.begin(); it != str.end(); ++it)
    {
        *it = static_cast<char>( std::tolower( static_cast<unsigned char>(*it) ) );
    }
    return str;
} // eo to_lower_mod(std::string&)
/**
 * @brief converts a string to upper case (in place).
 * @param[in,out] str the string to modify.
 * @return the modified string (a copy of @a str after conversion).
 *
 * Each byte is passed to std::toupper() as an unsigned char value; passing a
 * plain (possibly negative) char is undefined behaviour for 8 bit characters.
 */
std::string to_upper_mod(std::string& str)
{
    for (std::string::iterator it = str.begin(); it != str.end(); ++it)
    {
        *it = static_cast<char>( std::toupper( static_cast<unsigned char>(*it) ) );
    }
    return str;
} // eo to_upper_mod(std::string&)
/**
 * @brief returns a copy of a string with characters from a given list stripped from both ends.
 * @param str the string which should be trimmed.
 * @param charlist the list of characters to remove from beginning and end of string
 * @return the result string.
 */
std::string trim (const std::string& str, const std::string& charlist)
{
    const std::string::size_type head = str.find_first_not_of(charlist);
    if (head == std::string::npos)
    {
        // nothing but charlist characters (or an empty string): nothing remains
        return std::string();
    }

    // a surviving character exists, so the search from the end succeeds as well
    const std::string::size_type tail = str.find_last_not_of(charlist);
    return std::string(str, head, tail - head + 1);
} // eo trim(const std::string&,const std::string&)
/**
 * @brief returns a copy of a string with its last character dropped if it is one of a given set.
 * @param str the string.
 * @param what the list of chars which will be tested for.
 * @return the resulting string with last char removed (if applicable)
 */
std::string chomp (const std::string& str, const std::string& what)
{
    const bool drop_last =
        !str.empty()
        && !what.empty()
        && what.find( *str.rbegin() ) != std::string::npos;

    if (drop_last)
    {
        return std::string(str, 0, str.size() - 1);
    }
    return str;
} // eo chomp(const std::string&,const std::string&)
236 * @brief returns a lower case version of a given string.
237 * @param str the string
238 * @return the lower case version of the string
240 std::string to_lower (const std::string& str)
242 std::string result(str);
243 return to_lower_mod(result);
244 } // eo to_lower(const std::string&)
248 * @brief returns a upper case version of a given string.
249 * @param str the string
250 * @return the upper case version of the string
252 std::string to_upper(const std::string& str)
254 std::string result(str);
255 return to_upper_mod(result);
256 } // eo to_upper(const std::string&)
261 * @brief removes a given suffix from a string.
262 * @param str the string.
263 * @param suffix the suffix which should be removed if the string ends with it.
264 * @return the string without the suffix.
266 * If the string ends with the suffix, it is removed. If the the string doesn't end
267 * with the suffix the original string is returned.
269 std::string remove_suffix(const std::string& str, const std::string& suffix)
271 if (has_suffix(str,suffix) )
273 return str.substr(0, str.size()-suffix.size() );
276 } // eo remove_suffix(const std::string&,const std::string&)
281 * @brief removes a given prefix from a string.
282 * @param str the string.
283 * @param prefix the prefix which should be removed if the string begins with it.
284 * @return the string without the prefix.
286 * If the string begins with the prefix, it is removed. If the the string doesn't begin
287 * with the prefix the original string is returned.
289 std::string remove_prefix(const std::string& str, const std::string& prefix)
291 if (has_prefix(str,prefix) )
293 return str.substr( prefix.size() );
296 } // eo remove_prefix(const std::string&,const std::string&)
300 * split a string to key and value delimited by a given delimiter.
301 * The resulting key and value strings are trimmed (Whitespaces removed at beginning and end).
302 * @param str the string which should be splitted.
303 * @param[out] key the resulting key
304 * @param[out] value the resulting value
305 * @param delimiter the delimiter between key and value; default is '='.
306 * @return @a true if the split was successful.
309 const std::string& str,
314 std::string::size_type pos = str.find (delimiter);
315 if (pos == std::string::npos) return false;
316 key= str.substr(0,pos);
317 value= str.substr(pos+1);
321 } // eo pair_split(const std::string&,std::string&,std::string&,char)
325 * splits a string by given delimiter
327 * @param[in] str the string which should be splitted.
328 * @param[out] result the list resulting from splitting @a str.
329 * @param[in] delimiter the delimiter (word/phrase) at which @a str should be splitted.
330 * @param[in] omit_empty should empty parts not be stored?
331 * @param[in] trim_list list of characters the parts should be trimmed by.
332 * (empty string results in no trim)
335 const std::string& str,
336 std::list<std::string>& result,
337 const std::string& delimiter,
339 const std::string& trim_list
342 std::string::size_type pos, last_pos=0;
343 bool delimiter_found= false;
344 while ( last_pos < str.size() && last_pos != std::string::npos)
346 pos= str.find(delimiter, last_pos);
348 if (pos == std::string::npos)
350 part= str.substr(last_pos);
351 delimiter_found= false;
355 part= str.substr(last_pos, pos-last_pos);
356 delimiter_found=true;
358 if (pos != std::string::npos)
360 last_pos= pos+ delimiter.size();
364 last_pos= std::string::npos;
366 if (!trim_list.empty() ) trim_mod (part, trim_list);
367 if (omit_empty && part.empty() ) continue;
368 result.push_back( part );
370 // if the string ends with a delimiter we need to append an empty string if no omit_empty
372 // (this way we keep the split result consistent to a join operation)
373 if (delimiter_found && !omit_empty)
375 result.push_back("");
377 } // eo split_string(const std::string&,std::list< std::string >&,const std::string&,bool,const std::string&)
381 * splits a string by a given delimiter
382 * @param str the string which should be splitted.
383 * @param delimiter delimiter the delimiter (word/phrase) at which @a str should be splitted.
384 * @param[in] omit_empty should empty parts not be stored?
385 * @param[in] trim_list list of characters the parts should be trimmed by.
386 * (empty string results in no trim)
387 * @return the list resulting from splitting @a str.
389 std::list<std::string> split_string(
390 const std::string& str,
391 const std::string& delimiter,
393 const std::string& trim_list
396 std::list<std::string> result;
397 split_string(str, result, delimiter, omit_empty, trim_list);
399 } // eo split_string(const std::string&,const std::string&,bool,const std::string&)
/**
 * @brief joins a list of strings into a single string.
 *
 * This function is (basically) the reverse operation of @a split_string.
 *
 * @param parts the list of strings.
 * @param delimiter the delimiter which is inserted between the strings.
 * @return the joined string.
 */
std::string join_string(
    const std::list< std::string >& parts,
    const std::string& delimiter
)
{
    std::string joined;
    for (std::list< std::string >::const_iterator part= parts.begin();
         part != parts.end();
         ++part)
    {
        // the delimiter goes between parts only, never before the first one
        if (part != parts.begin() )
        {
            joined += delimiter;
        }
        joined += *part;
    }
    return joined;
} // eo join_string(const std::list< std::string >&,const std::string&)
438 * @brief returns a hex string from a binary string.
439 * @param str the (binary) string
440 * @param upper_case_digits determine whether to use upper case characters for digits A-F.
441 * @return the string in hex notation.
443 std::string convert_binary_to_hex(
444 const std::string& str,
445 bool upper_case_digits
449 std::string hexDigits(upper_case_digits ? hexDigitsUpper : hexDigitsLower);
450 for ( std::string::const_iterator it= str.begin();
454 result.push_back( hexDigits[ ( (*it) >> 4) & 0x0f ] );
455 result.push_back( hexDigits[ (*it) & 0x0f ] );
458 } // eo convert_binary_to_hex(const std::string&,bool)
462 * @brief converts a hex digit string to binary string.
463 * @param str hex digit string
464 * @return the binary string.
466 * The hex digit string may contains white spaces or colons which are treated
467 * as delimiters between hex digit groups.
469 * @todo rework the handling of half nibbles (consistency)!
471 std::string convert_hex_to_binary(
472 const std::string& str
474 throw (std::runtime_error)
478 bool hasNibble= false;
479 bool lastWasWS= true;
480 for ( std::string::const_iterator it= str.begin();
484 std::string::size_type p = hexDigitsLower.find( *it );
485 if (p== std::string::npos)
487 p= hexDigitsUpper.find( *it );
489 if (p == std::string::npos)
491 if ( ( Whitespaces.find( *it ) != std::string::npos) // is it a whitespace?
492 or ( *it == ':') // or a colon?
495 // we treat that as a valid delimiter:
498 // 1 nibble before WS is treate as lower part:
507 if (p == std::string::npos )
509 throw runtime_error("illegal character in hex digit string: " + str);
523 //we already had a nibble, so a char is complete now:
524 result.push_back( c );
529 // this is the first nibble of a new char:
535 //well, there is one nibble left
536 // let's do some heuristics:
539 // if the preceeding character was a white space (or a colon)
540 // we treat the nibble as lower part:
541 //( this is consistent with shortened hex notations where leading zeros are not noted)
542 result.push_back( c );
546 // if it was part of a hex digit chain, we treat it as UPPER part (!!)
547 result.push_back( c << 4 );
551 } // eo convert_hex_to_binary(const std::string&)
554 } // eo namespace I2n
/**
 * @brief converts an ISO-8859-1 encoded string to UTF-8 using iconv.
 * @param isostring the ISO-8859-1 encoded input.
 * @return the UTF-8 encoded string (whatever iconv managed to convert).
 * @throw std::runtime_error if the conversion is not available or the
 *        conversion buffer cannot be allocated.
 */
std::string iso_to_utf8(const std::string& isostring)
{
    iconv_t i2utf8 = iconv_open("UTF-8", "ISO-8859-1");

    // check the conversion descriptor (not the function itself)
    if (i2utf8 == (iconv_t)-1)
        throw std::runtime_error("iconv can't convert from ISO-8859-1 to UTF-8");

    size_t in_size = isostring.size();
    // output buffer is sized at 4 bytes per input byte
    size_t out_size = in_size*4;
    const size_t capacity = out_size;

    char *buf = static_cast<char *>( malloc(capacity+1) );
    if (buf == NULL)
    {
        // do not leak the conversion descriptor on the error path
        iconv_close(i2utf8);
        throw std::runtime_error("out of memory for iconv buffer");
    }

    char *in = const_cast<char *>( isostring.c_str() );
    char *out = buf;
    // best effort: the result of iconv() is deliberately not checked,
    // the output simply ends where the conversion stopped
    iconv(i2utf8, &in, &in_size, &out, &out_size);

    // out_size now holds the unused space; terminate after the written bytes
    buf[capacity-out_size] = 0;

    std::string result(buf);

    free(buf);
    iconv_close(i2utf8);

    return result;
}
/**
 * @brief converts a UTF-8 encoded string to ISO-8859-1 using iconv.
 * @param utf8string the UTF-8 encoded input.
 * @return the ISO-8859-1 encoded string (whatever iconv managed to convert).
 * @throw std::runtime_error if the conversion is not available or the
 *        conversion buffer cannot be allocated.
 */
std::string utf8_to_iso(const std::string& utf8string)
{
    iconv_t utf82iso = iconv_open("ISO-8859-1","UTF-8");

    if (utf82iso == (iconv_t)-1)
        throw std::runtime_error("iconv can't convert from UTF-8 to ISO-8859-1");

    size_t in_size = utf8string.size();
    // output buffer is sized at one byte per input byte
    size_t out_size = in_size;
    const size_t capacity = out_size;

    char *buf = static_cast<char *>( malloc(capacity+1) );
    if (buf == NULL)
    {
        // do not leak the conversion descriptor on the error path
        iconv_close(utf82iso);
        throw std::runtime_error("out of memory for iconv buffer");
    }

    char *in = const_cast<char *>( utf8string.c_str() );
    char *out = buf;
    // best effort: the result of iconv() is deliberately not checked,
    // the output simply ends where the conversion stopped
    iconv(utf82iso, &in, &in_size, &out, &out_size);

    // out_size now holds the unused space; terminate after the written bytes
    buf[capacity-out_size] = 0;

    std::string result(buf);

    free(buf);
    iconv_close(utf82iso);

    return result;
}
/**
 * @brief converts a UTF-8 encoded string to a zero terminated wide character buffer.
 * @param utf8string the UTF-8 encoded input.
 * @return pointer to a buffer allocated with malloc() holding the converted
 *         characters followed by a terminating 0 (presumably the caller has to
 *         release it with free()).
 * @throw std::runtime_error if the conversion is not available, the buffer
 *        cannot be allocated or the input cannot be converted.
 *
 * The conversion target is "UCS-4LE", i.e. the result is only meaningful where
 * wchar_t is a 4 byte little endian type.
 */
wchar_t* utf8_to_wbuf(const std::string& utf8string)
{
    iconv_t utf82wstr = iconv_open("UCS-4LE","UTF-8");

    if (utf82wstr == (iconv_t)-1)
        throw std::runtime_error("iconv can't convert from UTF-8 to UCS-4");

    size_t in_size = utf8string.size();
    // one wide char per input byte plus the terminator
    size_t out_size = (in_size+1)*sizeof(wchar_t);
    const size_t capacity = out_size;

    wchar_t *buf = static_cast<wchar_t *>( malloc(capacity) );
    if (buf == NULL)
    {
        // do not leak the conversion descriptor on the error path
        iconv_close(utf82wstr);
        throw std::runtime_error("out of memory for iconv buffer");
    }

    char *in = const_cast<char *>( utf8string.c_str() );
    char *out = reinterpret_cast<char *>( buf );
    if (iconv(utf82wstr, &in, &in_size, &out, &out_size) == (size_t)-1)
    {
        // release buffer and descriptor before reporting the failure
        free(buf);
        iconv_close(utf82wstr);
        throw std::runtime_error("error converting char encodings");
    }

    // out_size now holds the unused bytes; terminate after the written characters
    buf[ (capacity-out_size) / sizeof(wchar_t) ] = 0;

    iconv_close(utf82wstr);

    return buf;
}
/**
 * @brief converts a string from the "UTF-7-IMAP" encoding to UTF-8 using iconv.
 * @param utf7imapstring the UTF-7-IMAP encoded input.
 * @return the UTF-8 encoded string (whatever iconv managed to convert).
 * @throw std::runtime_error if the iconv in use does not offer this conversion
 *        or the conversion buffer cannot be allocated.
 */
std::string utf7imap_to_utf8(const std::string& utf7imapstring)
{
    iconv_t utf7imap2utf8 = iconv_open("UTF-8","UTF-7-IMAP");

    if (utf7imap2utf8 == (iconv_t)-1)
        throw std::runtime_error("iconv can't convert from UTF-7-IMAP to UTF-8");

    size_t in_size = utf7imapstring.size();
    // output buffer is sized at 4 bytes per input byte
    size_t out_size = in_size*4;
    const size_t capacity = out_size;

    char *buf = static_cast<char *>( malloc(capacity+1) );
    if (buf == NULL)
    {
        // do not leak the conversion descriptor on the error path
        iconv_close(utf7imap2utf8);
        throw std::runtime_error("out of memory for iconv buffer");
    }

    char *in = const_cast<char *>( utf7imapstring.c_str() );
    char *out = buf;
    // best effort: the result of iconv() is deliberately not checked,
    // the output simply ends where the conversion stopped
    iconv(utf7imap2utf8, &in, &in_size, &out, &out_size);

    // out_size now holds the unused space; terminate after the written bytes
    buf[capacity-out_size] = 0;

    std::string result(buf);

    free(buf);
    iconv_close(utf7imap2utf8);

    return result;
}
/**
 * @brief converts a UTF-8 encoded string to the "UTF-7-IMAP" encoding using iconv.
 * @param utf8string the UTF-8 encoded input.
 * @return the UTF-7-IMAP encoded string (whatever iconv managed to convert).
 * @throw std::runtime_error if the iconv in use does not offer this conversion
 *        or the conversion buffer cannot be allocated.
 */
std::string utf8_to_utf7imap(const std::string& utf8string)
{
    iconv_t utf82utf7imap = iconv_open("UTF-7-IMAP", "UTF-8");

    if (utf82utf7imap == (iconv_t)-1)
        throw std::runtime_error("iconv can't convert from UTF-8 to UTF-7-IMAP");

    // UTF-7 is base64 encoded, a buffer 10x as large
    // as the utf-8 buffer should be enough. If not the string will be truncated.
    size_t in_size = utf8string.size();
    size_t out_size = in_size*10;
    const size_t capacity = out_size;

    char *buf = static_cast<char *>( malloc(capacity+1) );
    if (buf == NULL)
    {
        // do not leak the conversion descriptor on the error path
        iconv_close(utf82utf7imap);
        throw std::runtime_error("out of memory for iconv buffer");
    }

    char *in = const_cast<char *>( utf8string.c_str() );
    char *out = buf;
    // best effort: the result of iconv() is deliberately not checked,
    // the output simply ends where the conversion stopped
    iconv(utf82utf7imap, &in, &in_size, &out, &out_size);

    // out_size now holds the unused space; terminate after the written bytes
    buf[capacity-out_size] = 0;

    std::string result(buf);

    free(buf);
    iconv_close(utf82utf7imap);

    return result;
}
/**
 * @brief cuts a string into tokens at (html) tag boundaries.
 * @param[out] tokenized list the tokens are appended to; the first element of each
 *             pair is the token text, the second is @a true iff the token is a tag.
 * @param input the string to tokenize.
 *
 * A tag token starts at a '<' and ends with the next '>'. A '<' seen while a
 * tag is still open flushes the text collected so far as a non-tag token.
 * Text left over at the end (including an unterminated tag) is stored as non-tag token.
 */
void tokenize_by_tag(std::vector<std::pair<std::string,bool> > &tokenized, const std::string &input)
{
    std::string pending;
    bool in_tag = false;

    for (std::string::const_iterator ch = input.begin(); ch != input.end(); ++ch)
    {
        const bool opens_tag = (*ch == '<');
        const bool closes_tag = !opens_tag && (*ch == '>') && in_tag;

        if (opens_tag)
        {
            in_tag = true;

            // everything collected before the '<' is a token of its own
            if (!pending.empty() )
            {
                tokenized.push_back( std::make_pair(pending, false) );
                pending.clear();
            }
        }

        pending += *ch;

        if (closes_tag)
        {
            // pending holds at least the '>' here, so it is never empty
            in_tag = false;
            tokenized.push_back( std::make_pair(pending, true) );
            pending.clear();
        }
    }

    // String left over in buffer?
    if (!pending.empty() )
        tokenized.push_back( std::make_pair(pending, false) );
} // eo tokenize_by_tag
748 std::string strip_html_tags(const std::string &input)
750 // Pair first: string, second: isTag
751 vector<pair<string,bool> > tokenized;
752 tokenize_by_tag (tokenized, input);
755 vector<pair<string,bool> >::const_iterator token, tokens_end = tokenized.end();
756 for (token = tokenized.begin(); token != tokens_end; ++token)
758 output += token->first;
761 } // eo strip_html_tags
764 // Smart-encode HTML en
765 string smart_html_entities(const std::string &input)
767 // Pair first: string, second: isTag
768 vector<pair<string,bool> > tokenized;
769 tokenize_by_tag (tokenized, input);
772 vector<pair<string,bool> >::const_iterator token, tokens_end = tokenized.end();
773 for (token = tokenized.begin(); token != tokens_end; ++token)
775 // keep HTML tags as they are
777 output += token->first;
779 output += html_entities(token->first);
/**
 * @brief searches for the first character outside the 7 bit ASCII range.
 * @param str the string to search.
 * @return position of the first byte with a value above 127,
 *         or std::string::npos if there is none.
 */
std::string::size_type find_8bit(const std::string &str)
{
    for (std::string::const_iterator it = str.begin(); it != str.end(); ++it)
    {
        // compare as unsigned value; plain char may be signed
        if (static_cast<unsigned char>(*it) > 127)
            return static_cast<std::string::size_type>(it - str.begin() );
    }

    return std::string::npos;
}
796 // encoded UTF-8 chars into HTML entities
797 string html_entities(std::string str)
800 replace_all (str, "&", "&");
801 replace_all (str, "<", "<");
802 replace_all (str, ">", ">");
803 replace_all (str, "\"", """);
804 replace_all (str, "'", "'");
805 replace_all (str, "/", "/");
808 replace_all (str, "\xC3\xA4", "ä");
809 replace_all (str, "\xC3\xB6", "ö");
810 replace_all (str, "\xC3\xBC", "ü");
811 replace_all (str, "\xC3\x84", "Ä");
812 replace_all (str, "\xC3\x96", "Ö");
813 replace_all (str, "\xC3\x9C", "Ü");
816 replace_all (str, "\xC3\x9F", "ß");
818 // conversion of remaining non-ASCII chars needed?
819 // just do if needed because of performance
820 if (find_8bit(str) != string::npos)
822 // convert to fixed-size encoding UTF-32
823 wchar_t* wbuf=utf8_to_wbuf(str);
824 ostringstream target;
826 // replace all non-ASCII chars with HTML representation
827 for (int p=0; wbuf[p] != 0; p++)
829 unsigned int c=wbuf[p];
832 target << static_cast<unsigned char>(c);
834 target << "&#" << c << ';';
843 } // eo html_entities(std::string)
845 // convert HTML entities to something that can be viewed on a basic text console (restricted to ASCII-7)
846 string html_entities_to_console(std::string str)
849 replace_all (str, "&", "&");
850 replace_all (str, "<", "<");
851 replace_all (str, ">", ">");
852 replace_all (str, """, "\"");
853 replace_all (str, "'", "'");
854 replace_all (str, "/", "/");
857 replace_all (str, "ä", "ae");
858 replace_all (str, "ö", "oe");
859 replace_all (str, "ü", "ue");
860 replace_all (str, "Ä", "Ae");
861 replace_all (str, "Ö", "Oe");
862 replace_all (str, "Ü", "Ue");
865 replace_all (str, "ß", "ss");
870 bool replace_all(string &base, const char *ist, const char *soll)
874 return replace_all(base,&i,&s);
877 bool replace_all(string &base, const string &ist, const char *soll)
880 return replace_all(base,&ist,&s);
883 bool replace_all(string &base, const string *ist, const string *soll)
885 return replace_all(base,*ist,*soll);
888 bool replace_all(string &base, const char *ist, const string *soll)
891 return replace_all(base,&i,soll);
/**
 * @brief replaces every occurrence of a search text within a string.
 * @param[in,out] base the string to modify.
 * @param ist the text to search for; must not be empty.
 * @param soll the text to insert instead.
 * @return @a true iff at least one occurrence was replaced.
 * @throw std::runtime_error if @a ist is empty.
 *
 * The search continues behind the inserted text, so a replacement that
 * contains the search text is not replaced again.
 */
bool replace_all(std::string &base, const std::string &ist, const std::string &soll)
{
    if (ist.empty() )
        throw std::runtime_error ("replace_all called with empty search string");

    bool replaced_any = false;
    for (std::string::size_type hit = base.find(ist);
         hit != std::string::npos;
         hit = base.find(ist, hit + soll.size() ) )
    {
        base.replace(hit, ist.size(), soll);
        replaced_any = true;
    }

    return replaced_any;
}
/**
 * @brief replaces all characters that could be problematic or impose a security risk when being logged
 * @param str the original string
 * @param replace_with the character to replace the unsafe chars with
 * @return a string that is safe to send to syslog or other logfiles
 *
 * All chars between 0x20 (space) and 0x7E (~) (including) are considered safe for logging.
 * See e.g. RFC 5424, section 8.2 or the posix character class "printable".
 * This eliminates all possible problems with NUL, control characters, 8 bit chars, UTF8.
 */
std::string sanitize_for_logging(const std::string &str, const char replace_with)
{
    std::string output(str);

    for (std::string::iterator it = output.begin(); it != output.end(); ++it)
    {
        // 8 bit chars fail this test no matter whether plain char is signed
        // (negative, so below 0x20) or unsigned (above 0x7E)
        const bool safe = (*it >= 0x20) && (*it <= 0x7E);
        if (!safe)
            *it = replace_with;
    }

    return output;
}
/**
 * @brief returns a lower case version of a given string.
 * @param src the string
 * @return a copy of @a src with every character passed through tolower().
 *
 * Characters are handed to tolower() as unsigned char values; passing a plain
 * (possibly negative) char is undefined behaviour for 8 bit characters.
 */
std::string to_lower(const std::string &src)
{
    std::string dst(src);

    const std::string::size_type end = dst.size();
    for (std::string::size_type pos = 0; pos < end; pos++)
        dst[pos] = static_cast<char>( std::tolower( static_cast<unsigned char>(dst[pos]) ) );

    return dst;
}
/**
 * @brief returns an upper case version of a given string.
 * @param src the string
 * @return a copy of @a src with every character passed through toupper().
 *
 * Characters are handed to toupper() as unsigned char values; passing a plain
 * (possibly negative) char is undefined behaviour for 8 bit characters.
 */
std::string to_upper(const std::string &src)
{
    std::string dst(src);

    const std::string::size_type end = dst.size();
    for (std::string::size_type pos = 0; pos < end; pos++)
        dst[pos] = static_cast<char>( std::toupper( static_cast<unsigned char>(dst[pos]) ) );

    return dst;
}
959 const int MAX_UNIT_FORMAT_SYMBOLS = 6;
961 const string shortUnitFormatSymbols[MAX_UNIT_FORMAT_SYMBOLS] = {
970 const string longUnitFormatSymbols[MAX_UNIT_FORMAT_SYMBOLS] = {
972 i18n_noop(" KBytes"),
973 i18n_noop(" MBytes"),
974 i18n_noop(" GBytes"),
975 i18n_noop(" TBytes"),
/**
 * @brief rounds a number "half up" to a fraction given by a multiplier.
 * @param number the number to round.
 * @param rounding_multiplier the number is rounded to multiples of 1/rounding_multiplier
 *        (e.g. 10 keeps one decimal place).
 * @return the rounded number.
 *
 * The scaled value is truncated via a conversion to int64_t after adding 0.5.
 */
long double rounding_upwards(
    const long double number,
    const int rounding_multiplier
)
{
    long double scaled = number * rounding_multiplier;
    scaled += 0.5;

    // the integer conversion cuts off the fractional part
    const int64_t whole = static_cast<int64_t>(scaled);

    return static_cast<long double>(whole) / static_cast<long double>(rounding_multiplier);
}
995 string nice_unit_format(
997 const UnitFormat format,
1001 // select the system of units (decimal or binary)
1003 if (base == UnitBase1000)
1012 long double size = input;
1014 // check the size of the input number to fit in the appropriate symbol
1016 while (size > multiple)
1018 size = size / multiple;
1021 // rollback to the previous values and stop the loop when cannot
1022 // represent the number length.
1023 if (sizecount >= MAX_UNIT_FORMAT_SYMBOLS)
1025 size = size * multiple;
1031 // round the input number "half up" to multiples of 10
1032 const int rounding_multiplier = 10;
1033 size = rounding_upwards(size, rounding_multiplier);
1035 // format the input number, placing the appropriate symbol
1037 out.setf (ios::fixed);
1038 if (format == ShortUnitFormat)
1041 out << size << i18n( shortUnitFormatSymbols[sizecount].c_str() );
1046 out << size << i18n( longUnitFormatSymbols[sizecount].c_str() );
1050 } // eo nice_unit_format(int input)
1053 string escape(const string &s)
1056 string::size_type p;
1059 while ( (p=out.find_first_of("\"\\",p) ) !=out.npos)
1061 out.insert (p,"\\");
1066 while ( (p=out.find_first_of("\r",p) ) !=out.npos)
1068 out.replace (p,1,"\\r");
1073 while ( (p=out.find_first_of("\n",p) ) !=out.npos)
1075 out.replace (p,1,"\\n");
1082 } // eo scape(const std::string&)
1085 string descape(const string &s, int startpos, int &endpos)
1089 if (s.at(startpos) != '"')
1090 throw out_of_range("value not type escaped string");
1092 out=s.substr(startpos+1);
1093 string::size_type p=0;
1095 // search for the end of the string
1096 while ( (p=out.find("\"",p) ) !=out.npos)
1101 // the " might be escaped with a backslash
1102 while (e>=0 && out.at (e) =='\\')
1104 if (escaped == false)
1118 // we now have the end of the string
1119 out=out.substr(0,p);
1121 // tell calling prog about the endposition
1122 endpos=startpos+p+1;
1124 // descape all \ stuff inside the string now
1126 while ( (p=out.find_first_of("\\",p) ) !=out.npos)
1128 switch (out.at(p+1) )
1131 out.replace(p,2,"\r");
1134 out.replace(p,2,"\n");
1143 } // eo descape(const std::string&,int,int&)
1146 string escape_shellarg(const string &input)
1148 string output = "'";
1149 string::const_iterator it, it_end = input.end();
1150 for (it = input.begin(); it != it_end; ++it)