Fix unittests
[libi2ncommon] / src / stringfunc.cpp
CommitLineData
0e23f538
TJ
1/*
2The software in this package is distributed under the GNU General
3Public License version 2 (with a special exception described below).
4
5A copy of GNU General Public License (GPL) is included in this distribution,
6in the file COPYING.GPL.
7
8As a special exception, if other files instantiate templates or use macros
9or inline functions from this file, or you compile this file and link it
10with other works to produce a work based on this file, this file
11does not by itself cause the resulting work to be covered
12by the GNU General Public License.
13
14However the source code for this file must still be made available
15in accordance with section (3) of the GNU General Public License.
16
17This exception does not invalidate any other reasons why a work based
18on this file might be covered by the GNU General Public License.
19*/
6a93d84a
TJ
20/** @file
21 *
22 * (c) Copyright 2007-2008 by Intra2net AG
6a93d84a 23 */
e93545dd
GE
24
25#include <iostream>
26#include <string>
27#include <sstream>
28#include <stdexcept>
5efd35b1 29#include <algorithm>
5cd64148 30#include <cmath> // for round()
e93545dd 31
a5f3af6e 32#include <wchar.h>
e93545dd
GE
33#include <stdlib.h>
34#include <iconv.h>
35#include <i18n.h>
36
5cd64148
CH
37#include <boost/numeric/conversion/cast.hpp>
38
e93545dd
GE
39#include <stringfunc.hxx>
40
41using namespace std;
42
6ab3bc95
RP
43namespace I2n
44{
6a93d84a
TJ
45
46
6ab3bc95
RP
47namespace
48{
6a93d84a
TJ
49
50const std::string hexDigitsLower("0123456789abcdef");
51const std::string hexDigitsUpper("0123456789ABCDEF");
52
53
/**
 * @brief functor which converts a single character to upper case.
 *
 * The character is handed to std::toupper() as unsigned char value; passing a
 * negative char (8 bit characters on platforms where char is signed) to the
 * character classification functions is undefined behavior.
 */
struct UpperFunc
{
    char operator() (char c) const
    {
        return static_cast<char>( std::toupper( static_cast<unsigned char>(c) ) );
    }
}; // eo struct UpperFunc
61
62
/**
 * @brief functor which converts a single character to lower case.
 *
 * The character is handed to std::tolower() as unsigned char value; passing a
 * negative char (8 bit characters on platforms where char is signed) to the
 * character classification functions is undefined behavior.
 */
struct LowerFunc
{
    char operator() (char c) const
    {
        return static_cast<char>( std::tolower( static_cast<unsigned char>(c) ) );
    }
}; // eo struct LowerFunc
70
71
72} // eo namespace <anonymous>
73
74
75
76/**
6ab3bc95 77 * default list of Whitespaces (" \t\r\n");
6a93d84a 78 */
6ab3bc95 79const std::string Whitespaces = " \t\r\n";
6a93d84a
TJ
80
81/**
82 * default list of lineendings ("\r\n");
83 */
6ab3bc95 84const std::string LineEndings= "\r\n";
6a93d84a
TJ
85
86
87
/**
 * @brief tests whether a string starts with a given prefix.
 * @param str the string which is examined.
 * @param prefix the prefix to look for.
 * @return @a true iff @a prefix is not empty and @a str begins with it.
 */
bool has_prefix(const std::string& str, const std::string& prefix)
{
    const std::string::size_type prefix_len= prefix.size();
    // an empty prefix never matches; a prefix longer than the string can't match
    // (this also rejects an empty string since prefix_len is at least 1 then).
    if (prefix_len == 0 || prefix_len > str.size() )
    {
        return false;
    }
    return str.compare(0, prefix_len, prefix) == 0;
} // eo has_prefix(const std::string&,const std::string&)
6a93d84a
TJ
102
103
/**
 * @brief tests whether a string ends with a given suffix.
 * @param str the string which is examined.
 * @param suffix the suffix to look for.
 * @return @a true iff @a suffix is not empty and @a str ends with it.
 */
bool has_suffix(const std::string& str, const std::string& suffix)
{
    const std::string::size_type suffix_len= suffix.size();
    // an empty suffix never matches; a suffix longer than the string can't match
    // (this also rejects an empty string since suffix_len is at least 1 then).
    if (suffix_len == 0 || suffix_len > str.size() )
    {
        return false;
    }
    return str.compare(str.size() - suffix_len, suffix_len, suffix) == 0;
} // eo has_suffix(const std::string&,const std::string&)
6a93d84a
TJ
118
119
/**
 * @brief strips characters of a given list from both ends of a string (in place).
 * @param[in,out] str the string which gets trimmed.
 * @param charlist the characters which are removed from the beginning and the end.
 * @return a copy of the trimmed string.
 */
std::string trim_mod(std::string& str, const std::string& charlist)
{
    const std::string::size_type first= str.find_first_not_of(charlist);
    if (first == std::string::npos)
    {
        // nothing but characters from charlist (or the string is empty)
        str.clear();
        return str;
    }
    // there is at least one character to keep, so this search can't fail:
    const std::string::size_type last= str.find_last_not_of(charlist);
    // cut the tail first; that way the index of the head stays valid.
    str.erase(last + 1);
    str.erase(0, first);
    return str;
} // eo trim_mod(std::string&,const std::string&)
6a93d84a
TJ
149
150
151
/**
 * @brief removes the last character of a string if it is part of a given list (in place).
 * @param[in,out] str the string.
 * @param what the list of characters the last character is tested against.
 * @return a copy of the resulting string (last character removed if applicable).
 */
std::string chomp_mod(std::string& str, const std::string& what)
{
    if (!str.empty() && !what.empty() )
    {
        const std::string::size_type last= str.size() - 1;
        if (what.find( str[last] ) != std::string::npos)
        {
            str.erase(last);
        }
    }
    return str;
} // eo chomp_mod(std::string&,const std::string&)
6a93d84a
TJ
170
171
172/**
173 * @brief converts a string to lower case.
174 * @param[in,out] str the string to modify.
175 * @return the string
176 */
6ab3bc95 177std::string to_lower_mod(std::string& str)
6a93d84a 178{
6ab3bc95
RP
179 std::transform(str.begin(), str.end(), str.begin(), LowerFunc() );
180 return str;
181} // eo to_lower_mod(std::string&)
6a93d84a
TJ
182
183
184/**
185 * @brief converts a string to upper case.
186 * @param[in,out] str the string to modify.
187 * @return the string
188 */
6ab3bc95 189std::string to_upper_mod(std::string& str)
6a93d84a 190{
6ab3bc95
RP
191 std::transform( str.begin(), str.end(), str.begin(), UpperFunc() );
192 return str;
193} // eo to_upper_mod(std::string&)
6a93d84a
TJ
194
195
196
/**
 * @brief returns a copy of a string with characters of a given list stripped from both ends.
 * @param str the string which should be trimmed.
 * @param charlist the characters which are removed from the beginning and the end.
 * @return the trimmed string.
 */
std::string trim (const std::string& str, const std::string& charlist)
{
    const std::string::size_type first= str.find_first_not_of(charlist);
    if (first != std::string::npos)
    {
        // at least one character survives, so the reverse search succeeds, too.
        const std::string::size_type last= str.find_last_not_of(charlist);
        return std::string(str, first, last - first + 1);
    }
    // nothing but characters from charlist (or the string is empty)
    return std::string();
} // eo trim(const std::string&,const std::string&)
216
217
/**
 * @brief returns a copy of a string with the last character removed if it is part of a given list.
 * @param str the string.
 * @param what the list of characters the last character is tested against.
 * @return the resulting string (last character removed if applicable).
 */
std::string chomp (const std::string& str, const std::string& what)
{
    const bool strip_last= !str.empty()
                           && !what.empty()
                           && what.find( str[str.size() - 1] ) != std::string::npos;
    return strip_last ? std::string(str, 0, str.size() - 1) : str;
} // eo chomp(const std::string&,const std::string&)
236
237
238/**
239 * @brief returns a lower case version of a given string.
240 * @param str the string
241 * @return the lower case version of the string
242 */
6ab3bc95 243std::string to_lower (const std::string& str)
6a93d84a 244{
6ab3bc95
RP
245 std::string result(str);
246 return to_lower_mod(result);
247} // eo to_lower(const std::string&)
6a93d84a
TJ
248
249
250/**
251 * @brief returns a upper case version of a given string.
252 * @param str the string
253 * @return the upper case version of the string
254 */
6ab3bc95 255std::string to_upper(const std::string& str)
6a93d84a 256{
6ab3bc95
RP
257 std::string result(str);
258 return to_upper_mod(result);
259} // eo to_upper(const std::string&)
6a93d84a
TJ
260
261
262
263/**
264 * @brief removes a given suffix from a string.
265 * @param str the string.
266 * @param suffix the suffix which should be removed if the string ends with it.
267 * @return the string without the suffix.
268 *
269 * If the string ends with the suffix, it is removed. If the the string doesn't end
270 * with the suffix the original string is returned.
271 */
6ab3bc95 272std::string remove_suffix(const std::string& str, const std::string& suffix)
6a93d84a 273{
6ab3bc95
RP
274 if (has_suffix(str,suffix) )
275 {
276 return str.substr(0, str.size()-suffix.size() );
277 }
278 return str;
279} // eo remove_suffix(const std::string&,const std::string&)
6a93d84a
TJ
280
281
282
283/**
284 * @brief removes a given prefix from a string.
285 * @param str the string.
286 * @param prefix the prefix which should be removed if the string begins with it.
287 * @return the string without the prefix.
288 *
289 * If the string begins with the prefix, it is removed. If the the string doesn't begin
290 * with the prefix the original string is returned.
291 */
6ab3bc95 292std::string remove_prefix(const std::string& str, const std::string& prefix)
6a93d84a 293{
6ab3bc95
RP
294 if (has_prefix(str,prefix) )
295 {
296 return str.substr( prefix.size() );
297 }
298 return str;
299} // eo remove_prefix(const std::string&,const std::string&)
6a93d84a
TJ
300
301
302/**
303 * split a string to key and value delimited by a given delimiter.
6ab3bc95 304 * The resulting key and value strings are trimmed (Whitespaces removed at beginning and end).
6a93d84a
TJ
305 * @param str the string which should be splitted.
306 * @param[out] key the resulting key
307 * @param[out] value the resulting value
308 * @param delimiter the delimiter between key and value; default is '='.
309 * @return @a true if the split was successful.
310 */
6ab3bc95
RP
311bool pair_split(
312 const std::string& str,
313 std::string& key,
314 std::string& value,
315 char delimiter)
316{
317 std::string::size_type pos = str.find (delimiter);
318 if (pos == std::string::npos) return false;
319 key= str.substr(0,pos);
320 value= str.substr(pos+1);
321 trim_mod(key);
322 trim_mod(value);
323 return true;
324} // eo pair_split(const std::string&,std::string&,std::string&,char)
6a93d84a
TJ
325
326
327/**
328 * splits a string by given delimiter
329 *
330 * @param[in] str the string which should be splitted.
331 * @param[out] result the list resulting from splitting @a str.
332 * @param[in] delimiter the delimiter (word/phrase) at which @a str should be splitted.
333 * @param[in] omit_empty should empty parts not be stored?
334 * @param[in] trim_list list of characters the parts should be trimmed by.
335 * (empty string results in no trim)
336 */
6ab3bc95
RP
337void split_string(
338 const std::string& str,
339 std::list<std::string>& result,
340 const std::string& delimiter,
341 bool omit_empty,
342 const std::string& trim_list
6a93d84a
TJ
343)
344{
6ab3bc95
RP
345 std::string::size_type pos, last_pos=0;
346 bool delimiter_found= false;
347 while ( last_pos < str.size() && last_pos != std::string::npos)
348 {
349 pos= str.find(delimiter, last_pos);
350 std::string part;
351 if (pos == std::string::npos)
352 {
353 part= str.substr(last_pos);
354 delimiter_found= false;
355 }
356 else
357 {
358 part= str.substr(last_pos, pos-last_pos);
359 delimiter_found=true;
360 }
361 if (pos != std::string::npos)
362 {
363 last_pos= pos+ delimiter.size();
364 }
365 else
366 {
367 last_pos= std::string::npos;
368 }
369 if (!trim_list.empty() ) trim_mod (part, trim_list);
370 if (omit_empty && part.empty() ) continue;
371 result.push_back( part );
372 }
373 // if the string ends with a delimiter we need to append an empty string if no omit_empty
374 // was given.
375 // (this way we keep the split result consistent to a join operation)
376 if (delimiter_found && !omit_empty)
377 {
378 result.push_back("");
379 }
380} // eo split_string(const std::string&,std::list< std::string >&,const std::string&,bool,const std::string&)
6a93d84a
TJ
381
382
383/**
384 * splits a string by a given delimiter
385 * @param str the string which should be splitted.
386 * @param delimiter delimiter the delimiter (word/phrase) at which @a str should be splitted.
387 * @param[in] omit_empty should empty parts not be stored?
388 * @param[in] trim_list list of characters the parts should be trimmed by.
389 * (empty string results in no trim)
390 * @return the list resulting from splitting @a str.
391 */
6ab3bc95
RP
392std::list<std::string> split_string(
393 const std::string& str,
394 const std::string& delimiter,
395 bool omit_empty,
396 const std::string& trim_list
6a93d84a
TJ
397)
398{
6ab3bc95
RP
399 std::list<std::string> result;
400 split_string(str, result, delimiter, omit_empty, trim_list);
401 return result;
402} // eo split_string(const std::string&,const std::string&,bool,const std::string&)
6a93d84a
TJ
403
404
/**
 * @brief joins a list of strings into a single string.
 *
 * This function is (basically) the reverse operation of @a split_string.
 *
 * @param parts the list of strings.
 * @param delimiter the delimiter which is inserted between the strings.
 * @return the joined string (empty if @a parts is empty).
 */
std::string join_string(
    const std::list< std::string >& parts,
    const std::string& delimiter
)
{
    std::string joined;
    for (std::list< std::string >::const_iterator it= parts.begin();
         it != parts.end();
         ++it)
    {
        // the delimiter goes in front of every part but the first one
        if (it != parts.begin() )
        {
            joined+= delimiter;
        }
        joined+= *it;
    }
    return joined;
} // eo join_string(const std::list< std::string >&,const std::string&)
6a93d84a
TJ
432
433
434
435/*
436** conversions
437*/
438
439
440/**
441 * @brief returns a hex string from a binary string.
442 * @param str the (binary) string
443 * @param upper_case_digits determine whether to use upper case characters for digits A-F.
444 * @return the string in hex notation.
445 */
6ab3bc95
RP
446std::string convert_binary_to_hex(
447 const std::string& str,
448 bool upper_case_digits
6a93d84a
TJ
449)
450{
6ab3bc95
RP
451 std::string result;
452 std::string hexDigits(upper_case_digits ? hexDigitsUpper : hexDigitsLower);
453 for ( std::string::const_iterator it= str.begin();
454 it != str.end();
455 ++it)
456 {
457 result.push_back( hexDigits[ ( (*it) >> 4) & 0x0f ] );
458 result.push_back( hexDigits[ (*it) & 0x0f ] );
459 }
460 return result;
461} // eo convert_binary_to_hex(const std::string&,bool)
6a93d84a
TJ
462
463
464/**
465 * @brief converts a hex digit string to binary string.
466 * @param str hex digit string
467 * @return the binary string.
468 *
469 * The hex digit string may contains white spaces or colons which are treated
470 * as delimiters between hex digit groups.
471 *
472 * @todo rework the handling of half nibbles (consistency)!
473 */
6ab3bc95
RP
474std::string convert_hex_to_binary(
475 const std::string& str
6a93d84a 476)
6ab3bc95
RP
477throw (std::runtime_error)
478{
479 std::string result;
480 char c= 0;
481 bool hasNibble= false;
482 bool lastWasWS= true;
483 for ( std::string::const_iterator it= str.begin();
484 it != str.end();
485 ++it)
486 {
487 std::string::size_type p = hexDigitsLower.find( *it );
488 if (p== std::string::npos)
489 {
490 p= hexDigitsUpper.find( *it );
491 }
492 if (p == std::string::npos)
493 {
494 if ( ( Whitespaces.find( *it ) != std::string::npos) // is it a whitespace?
6a93d84a 495 or ( *it == ':') // or a colon?
6ab3bc95
RP
496 )
497 {
498 // we treat that as a valid delimiter:
499 if (hasNibble)
6a93d84a 500 {
6ab3bc95
RP
501 // 1 nibble before WS is treate as lower part:
502 result.push_back(c);
503 // reset state:
504 hasNibble= false;
6a93d84a 505 }
6ab3bc95
RP
506 lastWasWS= true;
507 continue;
508 }
509 }
510 if (p == std::string::npos )
511 {
512 throw runtime_error("illegal character in hex digit string: " + str);
513 }
514 lastWasWS= false;
515 if (hasNibble)
516 {
517 c<<=4;
518 }
519 else
520 {
521 c=0;
522 }
523 c+= (p & 0x0f);
524 if (hasNibble)
525 {
526 //we already had a nibble, so a char is complete now:
527 result.push_back( c );
528 hasNibble=false;
529 }
530 else
531 {
532 // this is the first nibble of a new char:
533 hasNibble=true;
534 }
535 }
536 if (hasNibble)
537 {
538 //well, there is one nibble left
539 // let's do some heuristics:
540 if (lastWasWS)
541 {
542 // if the preceeding character was a white space (or a colon)
543 // we treat the nibble as lower part:
544 //( this is consistent with shortened hex notations where leading zeros are not noted)
545 result.push_back( c );
546 }
547 else
548 {
549 // if it was part of a hex digit chain, we treat it as UPPER part (!!)
550 result.push_back( c << 4 );
551 }
552 }
553 return result;
554} // eo convert_hex_to_binary(const std::string&)
555
556
557} // eo namespace I2n
558
559
560
6a93d84a 561
e93545dd
GE
/**
 * @brief converts a string from ISO-8859-1 to UTF-8.
 * @param isostring the ISO-8859-1 encoded string.
 * @return the UTF-8 encoded string. If iconv stops on an error, the part
 *  converted so far is returned.
 * @throws std::runtime_error if iconv does not provide the conversion or the
 *  conversion buffer can't be allocated.
 */
std::string iso_to_utf8(const std::string& isostring)
{
    iconv_t i2utf8 = iconv_open("UTF-8", "ISO-8859-1");

    // check the conversion descriptor (not the address of this function)
    if (i2utf8 == (iconv_t)-1)
        throw std::runtime_error("iconv can't convert from ISO-8859-1 to UTF-8");

    size_t in_size = isostring.size();
    const size_t buf_size = in_size*4;
    size_t out_size = buf_size;

    char *buf = static_cast<char *>( malloc(buf_size+1) );
    if (buf == NULL)
    {
        // don't leak the conversion descriptor
        iconv_close(i2utf8);
        throw std::runtime_error("out of memory for iconv buffer");
    }

    // iconv() wants a non-const input pointer but does not modify the input
    char *in = const_cast<char *>( isostring.c_str() );
    char *out = buf;
    iconv(i2utf8, &in, &in_size, &out, &out_size);

    // copy by length; that way embedded NUL characters don't truncate the result
    std::string result(buf, buf_size - out_size);

    free(buf);
    iconv_close(i2utf8);

    return result;
}
591
/**
 * @brief converts a string from UTF-8 to ISO-8859-1.
 * @param utf8string the UTF-8 encoded string.
 * @return the ISO-8859-1 encoded string. If iconv stops on an error (e.g. a
 *  character which is not convertible), the part converted so far is returned.
 * @throws std::runtime_error if iconv does not provide the conversion or the
 *  conversion buffer can't be allocated.
 */
std::string utf8_to_iso(const std::string& utf8string)
{
    iconv_t utf82iso = iconv_open("ISO-8859-1","UTF-8");

    if (utf82iso == (iconv_t)-1)
        throw std::runtime_error("iconv can't convert from UTF-8 to ISO-8859-1");

    size_t in_size = utf8string.size();
    const size_t buf_size = in_size;
    size_t out_size = buf_size;

    char *buf = static_cast<char *>( malloc(buf_size+1) );
    if (buf == NULL)
    {
        // don't leak the conversion descriptor
        iconv_close(utf82iso);
        throw std::runtime_error("out of memory for iconv buffer");
    }

    // iconv() wants a non-const input pointer but does not modify the input
    char *in = const_cast<char *>( utf8string.c_str() );
    char *out = buf;
    iconv(utf82iso, &in, &in_size, &out, &out_size);

    // copy by length; that way embedded NUL characters don't truncate the result
    std::string result(buf, buf_size - out_size);

    free(buf);
    iconv_close(utf82iso);

    return result;
}
621
a5f3af6e
GE
/**
 * @brief converts an UTF-8 string into a zero terminated wide character buffer.
 *
 * The conversion target is "UCS-4LE"; the result is stored in a wchar_t array.
 * NOTE(review): this presumably expects a 4 byte little endian wchar_t - confirm
 * for other platforms.
 *
 * @param utf8string the UTF-8 encoded string.
 * @return buffer allocated with malloc(); the caller has to release it with free().
 * @throws std::runtime_error if iconv does not provide the conversion, the
 *  buffer can't be allocated or the conversion fails.
 */
wchar_t* utf8_to_wbuf(const std::string& utf8string)
{
    iconv_t utf82wstr = iconv_open("UCS-4LE","UTF-8");

    if (utf82wstr == (iconv_t)-1)
        throw std::runtime_error("iconv can't convert from UTF-8 to UCS-4");

    size_t in_size = utf8string.size();
    // one wide character per input byte is the worst case; +1 for the terminator
    const size_t buf_size = (in_size+1)*sizeof(wchar_t);
    size_t out_size = buf_size;

    wchar_t *buf = static_cast<wchar_t *>( malloc(buf_size) );
    if (buf == NULL)
    {
        // don't leak the conversion descriptor
        iconv_close(utf82wstr);
        throw std::runtime_error("out of memory for iconv buffer");
    }

    // iconv() wants a non-const input pointer but does not modify the input
    char *in = const_cast<char *>( utf8string.c_str() );
    char *out = reinterpret_cast<char *>( buf );
    const size_t rc = iconv(utf82wstr, &in, &in_size, &out, &out_size);

    iconv_close(utf82wstr);

    if (rc == (size_t)-1)
    {
        // don't leak the buffer; nobody else knows about it
        free(buf);
        throw std::runtime_error("error converting char encodings");
    }

    // terminate behind the last converted character
    buf[ (buf_size - out_size) / sizeof(wchar_t) ] = 0;

    return buf;
}
647
/**
 * @brief converts a string from UTF-7-IMAP to UTF-8.
 * @param utf7imapstring the UTF-7-IMAP encoded string.
 * @return the UTF-8 encoded string. If iconv stops on an error, the part
 *  converted so far is returned.
 * @throws std::runtime_error if iconv does not provide the conversion or the
 *  conversion buffer can't be allocated.
 */
std::string utf7imap_to_utf8(const std::string& utf7imapstring)
{
    iconv_t utf7imap2utf8 = iconv_open("UTF-8","UTF-7-IMAP");

    if (utf7imap2utf8 == (iconv_t)-1)
        throw std::runtime_error("iconv can't convert from UTF-7-IMAP to UTF-8");

    size_t in_size = utf7imapstring.size();
    const size_t buf_size = in_size*4;
    size_t out_size = buf_size;

    char *buf = static_cast<char *>( malloc(buf_size+1) );
    if (buf == NULL)
    {
        // don't leak the conversion descriptor
        iconv_close(utf7imap2utf8);
        throw std::runtime_error("out of memory for iconv buffer");
    }

    // iconv() wants a non-const input pointer but does not modify the input
    char *in = const_cast<char *>( utf7imapstring.c_str() );
    char *out = buf;
    iconv(utf7imap2utf8, &in, &in_size, &out, &out_size);

    // copy by length; that way embedded NUL characters don't truncate the result
    std::string result(buf, buf_size - out_size);

    free(buf);
    iconv_close(utf7imap2utf8);

    return result;
}
677
6a2b6dd1
TJ
/**
 * @brief converts a string from UTF-8 to UTF-7-IMAP.
 * @param utf8string the UTF-8 encoded string.
 * @return the UTF-7-IMAP encoded string. If iconv stops on an error or the
 *  buffer is too small, the part converted so far is returned.
 * @throws std::runtime_error if iconv does not provide the conversion or the
 *  conversion buffer can't be allocated.
 */
std::string utf8_to_utf7imap(const std::string& utf8string)
{
    iconv_t utf82utf7imap = iconv_open("UTF-7-IMAP", "UTF-8");

    if (utf82utf7imap == (iconv_t)-1)
        throw std::runtime_error("iconv can't convert from UTF-8 to UTF-7-IMAP");

    // UTF-7 is base64 encoded, a buffer 10x as large
    // as the utf-8 buffer should be enough. If not the string will be truncated.
    size_t in_size = utf8string.size();
    const size_t buf_size = in_size*10;
    size_t out_size = buf_size;

    char *buf = static_cast<char *>( malloc(buf_size+1) );
    if (buf == NULL)
    {
        // don't leak the conversion descriptor
        iconv_close(utf82utf7imap);
        throw std::runtime_error("out of memory for iconv buffer");
    }

    // iconv() wants a non-const input pointer but does not modify the input
    char *in = const_cast<char *>( utf8string.c_str() );
    char *out = buf;
    iconv(utf82utf7imap, &in, &in_size, &out, &out_size);

    // copy by length; that way embedded NUL characters don't truncate the result
    std::string result(buf, buf_size - out_size);

    free(buf);
    iconv_close(utf82utf7imap);

    return result;
}
709
118e216e
TJ
/**
 * @brief splits a string into (html) tags and the text between them.
 *
 * A tag token starts with '<' and ends with the next '>'. Everything else
 * is a text token; this includes a started tag which is interrupted by
 * another '<' or by the end of the input.
 *
 * @param[out] tokenized the tokens are appended here;
 *  pair first: the token, second: @a true if the token is a tag.
 * @param input the string which gets tokenized.
 */
void tokenize_by_tag(std::vector<std::pair<std::string,bool> > &tokenized, const std::string &input)
{
    std::string current;
    bool inside_tag = false;

    for (std::string::const_iterator it = input.begin(); it != input.end(); ++it)
    {
        const char ch = *it;

        if (ch == '<')
        {
            // a new tag starts; flush what was collected so far as text
            if (!current.empty() )
            {
                tokenized.push_back( std::make_pair(current, false) );
                current.clear();
            }
            inside_tag = true;
            current += ch;
            continue;
        }

        current += ch;
        if (ch == '>' && inside_tag)
        {
            // the tag is complete
            tokenized.push_back( std::make_pair(current, true) );
            current.clear();
            inside_tag = false;
        }
    }

    // String left over in buffer?
    if (!current.empty() )
        tokenized.push_back( std::make_pair(current, false) );
} // eo tokenize_by_tag
118e216e 749
118e216e
TJ
750
751std::string strip_html_tags(const std::string &input)
752{
6ab3bc95
RP
753 // Pair first: string, second: isTag
754 vector<pair<string,bool> > tokenized;
755 tokenize_by_tag (tokenized, input);
118e216e 756
6ab3bc95
RP
757 string output;
758 vector<pair<string,bool> >::const_iterator token, tokens_end = tokenized.end();
83d700e9 759 for (token = tokenized.begin(); token != tokens_end; ++token)
6ab3bc95
RP
760 if (!token->second)
761 output += token->first;
762
763 return output;
764} // eo strip_html_tags
118e216e 765
118e216e
TJ
766
767// Smart-encode HTML en
768string smart_html_entities(const std::string &input)
769{
6ab3bc95
RP
770 // Pair first: string, second: isTag
771 vector<pair<string,bool> > tokenized;
772 tokenize_by_tag (tokenized, input);
773
774 string output;
775 vector<pair<string,bool> >::const_iterator token, tokens_end = tokenized.end();
83d700e9 776 for (token = tokenized.begin(); token != tokens_end; ++token)
6ab3bc95
RP
777 {
778 // keep HTML tags as they are
779 if (token->second)
780 output += token->first;
781 else
782 output += html_entities(token->first);
783 }
784
785 return output;
118e216e
TJ
786}
787
6ab3bc95 788
a5f3af6e
GE
/**
 * @brief searches the first character which is not 7 bit clean.
 * @param str the string.
 * @return position of the first character with a value above 127 or
 *  std::string::npos if there is none.
 */
std::string::size_type find_8bit(const std::string &str)
{
    for (std::string::const_iterator it = str.begin(); it != str.end(); ++it)
    {
        if (static_cast<unsigned char>(*it) > 127)
            return static_cast<std::string::size_type>( it - str.begin() );
    }

    return std::string::npos;
}
798
118e216e
TJ
799// encoded UTF-8 chars into HTML entities
800string html_entities(std::string str)
801{
6ab3bc95
RP
802 // Normal chars
803 replace_all (str, "&", "&amp;");
6ab3bc95
RP
804 replace_all (str, "<", "&lt;");
805 replace_all (str, ">", "&gt;");
980577e1
TJ
806 replace_all (str, "\"", "&quot;");
807 replace_all (str, "'", "&#x27;");
808 replace_all (str, "/", "&#x2F;");
6ab3bc95
RP
809
810 // Umlauts
811 replace_all (str, "\xC3\xA4", "&auml;");
812 replace_all (str, "\xC3\xB6", "&ouml;");
813 replace_all (str, "\xC3\xBC", "&uuml;");
814 replace_all (str, "\xC3\x84", "&Auml;");
815 replace_all (str, "\xC3\x96", "&Ouml;");
816 replace_all (str, "\xC3\x9C", "&Uuml;");
817
818 // Misc
819 replace_all (str, "\xC3\x9F", "&szlig;");
820
821 // conversion of remaining non-ASCII chars needed?
822 // just do if needed because of performance
823 if (find_8bit(str) != string::npos)
824 {
825 // convert to fixed-size encoding UTF-32
826 wchar_t* wbuf=utf8_to_wbuf(str);
827 ostringstream target;
828
829 // replace all non-ASCII chars with HTML representation
830 for (int p=0; wbuf[p] != 0; p++)
831 {
832 unsigned int c=wbuf[p];
833
834 if (c <= 127)
835 target << static_cast<unsigned char>(c);
836 else
837 target << "&#" << c << ';';
838 }
839
840 free(wbuf);
841
842 str=target.str();
843 }
844
845 return str;
846} // eo html_entities(std::string)
847
554f813d
GE
848// convert HTML entities to something that can be viewed on a basic text console (restricted to ASCII-7)
849string html_entities_to_console(std::string str)
850{
851 // Normal chars
852 replace_all (str, "&amp;", "&");
853 replace_all (str, "&lt;", "<");
854 replace_all (str, "&gt;", ">");
855 replace_all (str, "&quot;", "\"");
856 replace_all (str, "&#x27;", "'");
857 replace_all (str, "&#x2F;", "/");
858
859 // Umlauts
860 replace_all (str, "&auml;", "ae");
861 replace_all (str, "&ouml;", "oe");
862 replace_all (str, "&uuml;", "ue");
863 replace_all (str, "&Auml;", "Ae");
864 replace_all (str, "&Ouml;", "Oe");
865 replace_all (str, "&Uuml;", "Ue");
866
867 // Misc
868 replace_all (str, "&szlig;", "ss");
869
870 return str;
871}
118e216e 872
e93545dd
GE
873bool replace_all(string &base, const char *ist, const char *soll)
874{
6ab3bc95
RP
875 string i=ist;
876 string s=soll;
877 return replace_all(base,&i,&s);
e93545dd
GE
878}
879
880bool replace_all(string &base, const string &ist, const char *soll)
881{
6ab3bc95
RP
882 string s=soll;
883 return replace_all(base,&ist,&s);
e93545dd
GE
884}
885
886bool replace_all(string &base, const string *ist, const string *soll)
887{
6ab3bc95 888 return replace_all(base,*ist,*soll);
e93545dd
GE
889}
890
891bool replace_all(string &base, const char *ist, const string *soll)
892{
6ab3bc95
RP
893 string i=ist;
894 return replace_all(base,&i,soll);
e93545dd
GE
895}
896
/**
 * @brief replaces all occurrences of a search string.
 *
 * The search continues behind each inserted replacement, so a replacement
 * which contains the search string is not replaced again.
 *
 * @param[in,out] base the string which gets modified.
 * @param ist the string to search for.
 * @param soll the replacement.
 * @return @a true if at least one occurrence was replaced.
 * @throws std::runtime_error if the search string is empty.
 */
bool replace_all(std::string &base, const std::string &ist, const std::string &soll)
{
    if (ist.empty() )
        throw std::runtime_error ("replace_all called with empty search string");

    bool replaced = false;
    for (std::string::size_type pos = base.find(ist);
         pos != std::string::npos;
         pos = base.find(ist, pos + soll.size() ) )
    {
        base.replace(pos, ist.size(), soll);
        replaced = true;
    }

    return replaced;
}
914
b953bf36
GE
/**
 * @brief replaces all characters that could be problematic or impose a security risk when being logged
 * @param str the original string
 * @param replace_with the character to replace the unsafe chars with
 * @return a string that is safe to send to syslog or other logfiles
 *
 * All chars between 0x20 (space) and 0x7E (~) (including) are considered safe for logging.
 * See e.g. RFC 5424, section 8.2 or the posix character class "printable".
 * This eliminates all possible problems with NUL, control characters, 8 bit chars, UTF8.
 */
std::string sanitize_for_logging(const std::string &str, const char replace_with)
{
    std::string output(str);

    for (std::string::iterator it = output.begin(); it != output.end(); ++it)
    {
        const bool safe = (*it >= 0x20 && *it <= 0x7E);
        if (!safe)
            *it = replace_with;
    }

    return output;
}
937
e5b21dbb 938#if 0
e93545dd
GE
939string to_lower(const string &src)
940{
6ab3bc95 941 string dst = src;
e93545dd 942
6ab3bc95
RP
943 string::size_type pos, end = dst.size();
944 for (pos = 0; pos < end; pos++)
945 dst[pos] = tolower(dst[pos]);
e93545dd 946
6ab3bc95 947 return dst;
e93545dd
GE
948}
949
950string to_upper(const string &src)
951{
6ab3bc95 952 string dst = src;
e93545dd 953
6ab3bc95
RP
954 string::size_type pos, end = dst.size();
955 for (pos = 0; pos < end; pos++)
956 dst[pos] = toupper(dst[pos]);
e93545dd 957
6ab3bc95 958 return dst;
e93545dd 959}
e5b21dbb 960#endif
e93545dd 961
83809f5e 962const int MAX_UNIT_FORMAT_SYMBOLS = 6;
d1ea9075 963
2cb9a9c5 964const string shortUnitFormatSymbols[MAX_UNIT_FORMAT_SYMBOLS] = {
d1ea9075
GMF
965 " B",
966 " KB",
967 " MB",
968 " GB",
969 " TB",
83809f5e 970 " PB"
d1ea9075
GMF
971};
972
2cb9a9c5 973const string longUnitFormatSymbols[MAX_UNIT_FORMAT_SYMBOLS] = {
5cb766b9
GMF
974 i18n_noop(" Bytes"),
975 i18n_noop(" KBytes"),
976 i18n_noop(" MBytes"),
977 i18n_noop(" GBytes"),
978 i18n_noop(" TBytes"),
83809f5e 979 i18n_noop(" PBytes")
d1ea9075
GMF
980};
981
72a94426
GMF
982
/**
 * @brief rounds a number "half up" to a fraction given by a multiplier.
 *
 * The number is scaled by the multiplier, rounded to the nearest integer
 * (halves are rounded upwards) and scaled back; e.g. a multiplier of 10
 * rounds to one decimal place.
 * std::floor() is used instead of a cast to an integer type: the cast
 * truncates towards zero (which rounds negative numbers wrongly) and is
 * undefined for values outside of the range of the integer type.
 *
 * @param number the number to round.
 * @param rounding_multiplier the scale factor which selects the precision.
 * @return the rounded number.
 */
long double rounding_upwards(
    const long double number,
    const int rounding_multiplier
)
{
    const long double scaled = number * rounding_multiplier;
    return std::floor(scaled + 0.5L) / static_cast<long double>(rounding_multiplier);
}
996
997
81267544
GMF
998string nice_unit_format(
999 const int64_t input,
70fc0674
GMF
1000 const UnitFormat format,
1001 const UnitBase base
81267544 1002)
6ab3bc95 1003{
d1ea9075 1004 // select the system of units (decimal or binary)
81267544 1005 int multiple = 0;
a398513a 1006 if (base == UnitBase1000)
81267544
GMF
1007 {
1008 multiple = 1000;
1009 }
1010 else
1011 {
1012 multiple = 1024;
1013 }
1014
1015 long double size = input;
6ab3bc95 1016
d1ea9075
GMF
1017 // check the size of the input number to fit in the appropriate symbol
1018 int sizecount = 0;
81267544 1019 while (size > multiple)
6ab3bc95 1020 {
81267544
GMF
1021 size = size / multiple;
1022 sizecount++;
83809f5e
GMF
1023
1024 // rollback to the previous values and stop the loop when cannot
1025 // represent the number length.
1026 if (sizecount >= MAX_UNIT_FORMAT_SYMBOLS)
1027 {
1028 size = size * multiple;
1029 sizecount--;
1030 break;
1031 }
6ab3bc95
RP
1032 }
1033
a398513a
GMF
1034 // round the input number "half up" to multiples of 10
1035 const int rounding_multiplier = 10;
72a94426 1036 size = rounding_upwards(size, rounding_multiplier);
6ab3bc95 1037
d1ea9075 1038 // format the input number, placing the appropriate symbol
6ab3bc95 1039 ostringstream out;
6ab3bc95 1040 out.setf (ios::fixed);
a398513a 1041 if (format == ShortUnitFormat)
d1ea9075
GMF
1042 {
1043 out.precision(1);
68d37a5c 1044 out << size << i18n( shortUnitFormatSymbols[sizecount].c_str() );
d1ea9075
GMF
1045 }
1046 else
6ab3bc95 1047 {
d1ea9075 1048 out.precision (2);
68d37a5c 1049 out << size << i18n( longUnitFormatSymbols[sizecount].c_str() );
6ab3bc95
RP
1050 }
1051
1052 return out.str();
1053} // eo nice_unit_format(int input)
1054
e93545dd 1055
5cd64148
CH
1056string nice_unit_format(
1057 const double input,
1058 const UnitFormat format,
1059 const UnitBase base
1060)
1061{
1062 // round as double and cast to int64_t
1063 // cast raised overflow error near max val of int64_t (~9.2e18, see unittest)
1064 int64_t input_casted_and_rounded =
1065 boost::numeric_cast<int64_t>( round(input) );
1066
1067 // now call other
1068 return nice_unit_format( input_casted_and_rounded, format, base );
1069} // eo nice_unit_format(double input)
1070
1071
47c07fba
GE
/**
 * @brief escapes a string and encloses it in double quotes.
 *
 * Double quotes and backslashes get a backslash prepended; carriage return
 * and line feed are written as "\r" and "\n" (backslash followed by letter).
 *
 * @param s the string.
 * @return the escaped and quoted string.
 */
std::string escape(const std::string &s)
{
    std::string out;
    out += '"';

    for (std::string::size_type i = 0; i < s.size(); ++i)
    {
        const char ch = s[i];
        switch (ch)
        {
            case '"':
            case '\\':
                out += '\\';
                out += ch;
                break;
            case '\r':
                out += "\\r";
                break;
            case '\n':
                out += "\\n";
                break;
            default:
                out += ch;
        }
    }

    out += '"';
    return out;
} // eo escape(const std::string&)
47c07fba 1102
47c07fba 1103
6ab3bc95
RP
/**
 * @brief extracts and unescapes a double quoted string (reverse operation of escape()).
 * @param s the string containing the quoted string.
 * @param startpos position of the opening double quote within @a s.
 * @param[out] endpos position of the closing double quote within @a s.
 * @return the unescaped content between the quotes.
 * @throws std::out_of_range if there is no double quote at @a startpos
 *  (also raised by the string accessors, e.g. if the content ends with a
 *  single backslash).
 */
std::string descape(const std::string &s, int startpos, int &endpos)
{
    if (s.at(startpos) != '"')
        throw std::out_of_range("value not type escaped string");

    std::string out = s.substr(startpos+1);
    std::string::size_type p = 0;

    // search for the end of the string
    while ( (p = out.find('"', p) ) != std::string::npos)
    {
        // the " might be escaped with a backslash; it is escaped iff the
        // number of backslashes directly in front of it is odd.
        std::string::size_type backslashes = 0;
        while (backslashes < p && out[p - 1 - backslashes] == '\\')
            ++backslashes;

        if (backslashes % 2 == 0)
            break;

        ++p;
    }

    // we now have the end of the string
    out = out.substr(0, p);

    // tell calling prog about the endposition
    endpos = startpos+p+1;

    // descape all \ stuff inside the string now
    for (p = out.find('\\'); p != std::string::npos; p = out.find('\\', p + 1) )
    {
        switch (out.at(p+1) )
        {
            case 'r':
                out.replace(p, 2, "\r");
                break;
            case 'n':
                out.replace(p, 2, "\n");
                break;
            default:
                // drop the backslash; the escaped character stays and is skipped
                out.erase(p, 1);
        }
    }

    return out;
} // eo descape(const std::string&,int,int&)
47c07fba 1163
e93545dd 1164
47c07fba
GE
/**
 * @brief encloses a string in single quotes for use as shell argument.
 *
 * Every single quote within the input is written as '\'' (close the quoted
 * part, an escaped quote, reopen the quoted part).
 *
 * @param input the string.
 * @return the quoted string.
 */
std::string escape_shellarg(const std::string &input)
{
    std::string output(1, '\'');

    for (std::string::size_type i = 0; i < input.size(); ++i)
    {
        const char ch = input[i];
        if (ch == '\'')
            output += "'\\''";
        else
            output += ch;
    }

    output += '\'';
    return output;
}