Switch to Intra2net rpm group
[libi2ncommon] / src / stringfunc.cpp
CommitLineData
0e23f538
TJ
1/*
2The software in this package is distributed under the GNU General
3Public License version 2 (with a special exception described below).
4
5A copy of GNU General Public License (GPL) is included in this distribution,
6in the file COPYING.GPL.
7
8As a special exception, if other files instantiate templates or use macros
9or inline functions from this file, or you compile this file and link it
10with other works to produce a work based on this file, this file
11does not by itself cause the resulting work to be covered
12by the GNU General Public License.
13
14However the source code for this file must still be made available
15in accordance with section (3) of the GNU General Public License.
16
17This exception does not invalidate any other reasons why a work based
18on this file might be covered by the GNU General Public License.
19*/
6a93d84a
TJ
20/** @file
21 *
22 * (c) Copyright 2007-2008 by Intra2net AG
6a93d84a 23 */
e93545dd
GE
24
25#include <iostream>
26#include <string>
27#include <sstream>
28#include <stdexcept>
5efd35b1 29#include <algorithm>
e93545dd 30
a5f3af6e 31#include <wchar.h>
e93545dd
GE
32#include <stdlib.h>
33#include <iconv.h>
34#include <i18n.h>
35
36#include <stringfunc.hxx>
37
38using namespace std;
39
6ab3bc95
RP
40namespace I2n
41{
6a93d84a
TJ
42
43
6ab3bc95
RP
namespace
{

/// hex digits indexed by nibble value (lower case variant).
const std::string hexDigitsLower("0123456789abcdef");
/// hex digits indexed by nibble value (upper case variant).
const std::string hexDigitsUpper("0123456789ABCDEF");


/**
 * @brief functor which converts a single character to upper case.
 *
 * Intended for use with std::transform.
 */
struct UpperFunc
{
    char operator() (char c) const
    {
        // std::toupper requires a value representable as unsigned char (or EOF);
        // passing a negative plain char is undefined behaviour.
        return static_cast<char>( std::toupper( static_cast<unsigned char>(c) ) );
    }
}; // eo struct UpperFunc


/**
 * @brief functor which converts a single character to lower case.
 *
 * Intended for use with std::transform.
 */
struct LowerFunc
{
    char operator() (char c) const
    {
        // see UpperFunc: the argument has to be converted to unsigned char first.
        return static_cast<char>( std::tolower( static_cast<unsigned char>(c) ) );
    }
}; // eo struct LowerFunc


} // eo namespace <anonymous>
70
71
72
/**
 * default list of whitespace characters: blank, tab, carriage return, line feed.
 */
const std::string Whitespaces(" \t\r\n");

/**
 * default list of line ending characters: carriage return, line feed.
 */
const std::string LineEndings("\r\n");
6a93d84a
TJ
82
83
84
/**
 * @brief checks if a string begins with a given prefix.
 * @param str the string which is tested.
 * @param prefix the prefix which should be tested for.
 * @return @a true iff the prefix is not empty and the string begins with that prefix.
 */
bool has_prefix(const std::string& str, const std::string& prefix)
{
    const std::string::size_type prefix_len = prefix.size();
    // an empty prefix never matches; a prefix longer than the string cannot match
    // (the latter also covers an empty string).
    if (prefix_len == 0 || prefix_len > str.size())
    {
        return false;
    }
    return str.compare(0, prefix_len, prefix) == 0;
} // eo has_prefix(const std::string&,const std::string&)
6a93d84a
TJ
99
100
/**
 * @brief checks if a string ends with a given suffix.
 * @param str the string which is tested.
 * @param suffix the suffix which should be tested for.
 * @return @a true iff the suffix is not empty and the string ends with that suffix.
 */
bool has_suffix(const std::string& str, const std::string& suffix)
{
    const std::string::size_type suffix_len = suffix.size();
    // an empty suffix never matches; a suffix longer than the string cannot match
    // (the latter also covers an empty string).
    if (suffix_len == 0 || suffix_len > str.size())
    {
        return false;
    }
    return str.compare(str.size() - suffix_len, suffix_len, suffix) == 0;
} // eo has_suffix(const std::string&,const std::string&)
6a93d84a
TJ
115
116
/**
 * @brief removes characters of a given list from the beginning and the end of a string (in place).
 * @param[in,out] str the string which should be trimmed.
 * @param charlist the list of characters to remove from beginning and end of the string.
 * @return a copy of the trimmed string.
 */
std::string trim_mod(std::string& str, const std::string& charlist)
{
    const std::string::size_type first_kept = str.find_first_not_of(charlist);
    if (first_kept == std::string::npos)
    {
        // nothing to keep: the string is empty or consists of list characters only.
        str.clear();
        return str;
    }
    // there is at least one character to keep, so this search cannot fail:
    const std::string::size_type last_kept = str.find_last_not_of(charlist);
    // cut the tail first (keeps the head index valid), then the head:
    str.erase(last_kept + 1);
    str.erase(0, first_kept);
    return str;
} // eo trim_mod(std::string&,const std::string&)
6a93d84a
TJ
146
147
148
/**
 * @brief removes the last character of a string (in place) if it is in a list of characters.
 * @param[in,out] str the string.
 * @param what the list of characters which will be tested for.
 * @return a copy of the resulting string (last character removed if applicable).
 *
 * At most one character is removed.
 */
std::string chomp_mod(std::string& str, const std::string& what)
{
    if (!str.empty() && !what.empty())
    {
        const std::string::size_type last = str.size() - 1;
        if (what.find(str[last]) != std::string::npos)
        {
            str.resize(last);
        }
    }
    return str;
} // eo chomp_mod(std::string&,const std::string&)
6a93d84a
TJ
167
168
169/**
170 * @brief converts a string to lower case.
171 * @param[in,out] str the string to modify.
172 * @return the string
173 */
6ab3bc95 174std::string to_lower_mod(std::string& str)
6a93d84a 175{
6ab3bc95
RP
176 std::transform(str.begin(), str.end(), str.begin(), LowerFunc() );
177 return str;
178} // eo to_lower_mod(std::string&)
6a93d84a
TJ
179
180
181/**
182 * @brief converts a string to upper case.
183 * @param[in,out] str the string to modify.
184 * @return the string
185 */
6ab3bc95 186std::string to_upper_mod(std::string& str)
6a93d84a 187{
6ab3bc95
RP
188 std::transform( str.begin(), str.end(), str.begin(), UpperFunc() );
189 return str;
190} // eo to_upper_mod(std::string&)
6a93d84a
TJ
191
192
193
/**
 * @brief returns a copy of a string with characters of a given list removed from both ends.
 * @param str the string which should be trimmed.
 * @param charlist the list of characters to remove from beginning and end of the string.
 * @return the trimmed string.
 */
std::string trim (const std::string& str, const std::string& charlist)
{
    const std::string::size_type first_kept = str.find_first_not_of(charlist);
    if (first_kept != std::string::npos)
    {
        // at least one character is kept, so the backward search cannot fail:
        const std::string::size_type last_kept = str.find_last_not_of(charlist);
        return std::string(str, first_kept, last_kept - first_kept + 1);
    }
    // the string is empty or consists of list characters only.
    return std::string();
} // eo trim(const std::string&,const std::string&)
213
214
/**
 * @brief returns a copy of a string without its last character if that character is in a given list.
 * @param str the string.
 * @param what the list of characters which will be tested for.
 * @return the resulting string (last character removed if applicable).
 *
 * At most one character is removed.
 */
std::string chomp (const std::string& str, const std::string& what)
{
    const bool strip_last =
        !str.empty()
        && !what.empty()
        && what.find( str[str.size() - 1] ) != std::string::npos;

    return strip_last ? std::string(str, 0, str.size() - 1) : str;
} // eo chomp(const std::string&,const std::string&)
233
234
235/**
236 * @brief returns a lower case version of a given string.
237 * @param str the string
238 * @return the lower case version of the string
239 */
6ab3bc95 240std::string to_lower (const std::string& str)
6a93d84a 241{
6ab3bc95
RP
242 std::string result(str);
243 return to_lower_mod(result);
244} // eo to_lower(const std::string&)
6a93d84a
TJ
245
246
247/**
248 * @brief returns a upper case version of a given string.
249 * @param str the string
250 * @return the upper case version of the string
251 */
6ab3bc95 252std::string to_upper(const std::string& str)
6a93d84a 253{
6ab3bc95
RP
254 std::string result(str);
255 return to_upper_mod(result);
256} // eo to_upper(const std::string&)
6a93d84a
TJ
257
258
259
260/**
261 * @brief removes a given suffix from a string.
262 * @param str the string.
263 * @param suffix the suffix which should be removed if the string ends with it.
264 * @return the string without the suffix.
265 *
266 * If the string ends with the suffix, it is removed. If the the string doesn't end
267 * with the suffix the original string is returned.
268 */
6ab3bc95 269std::string remove_suffix(const std::string& str, const std::string& suffix)
6a93d84a 270{
6ab3bc95
RP
271 if (has_suffix(str,suffix) )
272 {
273 return str.substr(0, str.size()-suffix.size() );
274 }
275 return str;
276} // eo remove_suffix(const std::string&,const std::string&)
6a93d84a
TJ
277
278
279
280/**
281 * @brief removes a given prefix from a string.
282 * @param str the string.
283 * @param prefix the prefix which should be removed if the string begins with it.
284 * @return the string without the prefix.
285 *
286 * If the string begins with the prefix, it is removed. If the the string doesn't begin
287 * with the prefix the original string is returned.
288 */
6ab3bc95 289std::string remove_prefix(const std::string& str, const std::string& prefix)
6a93d84a 290{
6ab3bc95
RP
291 if (has_prefix(str,prefix) )
292 {
293 return str.substr( prefix.size() );
294 }
295 return str;
296} // eo remove_prefix(const std::string&,const std::string&)
6a93d84a
TJ
297
298
299/**
300 * split a string to key and value delimited by a given delimiter.
6ab3bc95 301 * The resulting key and value strings are trimmed (Whitespaces removed at beginning and end).
6a93d84a
TJ
302 * @param str the string which should be splitted.
303 * @param[out] key the resulting key
304 * @param[out] value the resulting value
305 * @param delimiter the delimiter between key and value; default is '='.
306 * @return @a true if the split was successful.
307 */
6ab3bc95
RP
308bool pair_split(
309 const std::string& str,
310 std::string& key,
311 std::string& value,
312 char delimiter)
313{
314 std::string::size_type pos = str.find (delimiter);
315 if (pos == std::string::npos) return false;
316 key= str.substr(0,pos);
317 value= str.substr(pos+1);
318 trim_mod(key);
319 trim_mod(value);
320 return true;
321} // eo pair_split(const std::string&,std::string&,std::string&,char)
6a93d84a
TJ
322
323
324/**
325 * splits a string by given delimiter
326 *
327 * @param[in] str the string which should be splitted.
328 * @param[out] result the list resulting from splitting @a str.
329 * @param[in] delimiter the delimiter (word/phrase) at which @a str should be splitted.
330 * @param[in] omit_empty should empty parts not be stored?
331 * @param[in] trim_list list of characters the parts should be trimmed by.
332 * (empty string results in no trim)
333 */
6ab3bc95
RP
334void split_string(
335 const std::string& str,
336 std::list<std::string>& result,
337 const std::string& delimiter,
338 bool omit_empty,
339 const std::string& trim_list
6a93d84a
TJ
340)
341{
6ab3bc95
RP
342 std::string::size_type pos, last_pos=0;
343 bool delimiter_found= false;
344 while ( last_pos < str.size() && last_pos != std::string::npos)
345 {
346 pos= str.find(delimiter, last_pos);
347 std::string part;
348 if (pos == std::string::npos)
349 {
350 part= str.substr(last_pos);
351 delimiter_found= false;
352 }
353 else
354 {
355 part= str.substr(last_pos, pos-last_pos);
356 delimiter_found=true;
357 }
358 if (pos != std::string::npos)
359 {
360 last_pos= pos+ delimiter.size();
361 }
362 else
363 {
364 last_pos= std::string::npos;
365 }
366 if (!trim_list.empty() ) trim_mod (part, trim_list);
367 if (omit_empty && part.empty() ) continue;
368 result.push_back( part );
369 }
370 // if the string ends with a delimiter we need to append an empty string if no omit_empty
371 // was given.
372 // (this way we keep the split result consistent to a join operation)
373 if (delimiter_found && !omit_empty)
374 {
375 result.push_back("");
376 }
377} // eo split_string(const std::string&,std::list< std::string >&,const std::string&,bool,const std::string&)
6a93d84a
TJ
378
379
380/**
381 * splits a string by a given delimiter
382 * @param str the string which should be splitted.
383 * @param delimiter delimiter the delimiter (word/phrase) at which @a str should be splitted.
384 * @param[in] omit_empty should empty parts not be stored?
385 * @param[in] trim_list list of characters the parts should be trimmed by.
386 * (empty string results in no trim)
387 * @return the list resulting from splitting @a str.
388 */
6ab3bc95
RP
389std::list<std::string> split_string(
390 const std::string& str,
391 const std::string& delimiter,
392 bool omit_empty,
393 const std::string& trim_list
6a93d84a
TJ
394)
395{
6ab3bc95
RP
396 std::list<std::string> result;
397 split_string(str, result, delimiter, omit_empty, trim_list);
398 return result;
399} // eo split_string(const std::string&,const std::string&,bool,const std::string&)
6a93d84a
TJ
400
401
/**
 * @brief joins a list of strings into a single string.
 *
 * This function is (basically) the reverse operation of @a split_string.
 *
 * @param parts the list of strings.
 * @param delimiter the delimiter which is inserted between the strings.
 * @return the joined string (empty if @a parts is empty).
 */
std::string join_string(
    const std::list< std::string >& parts,
    const std::string& delimiter
)
{
    std::string joined;
    for (std::list< std::string >::const_iterator part = parts.begin();
         part != parts.end();
         ++part)
    {
        // the delimiter goes between the parts only, not in front of the first one.
        if (part != parts.begin())
        {
            joined += delimiter;
        }
        joined += *part;
    }
    return joined;
} // eo join_string(const std::list< std::string >&,const std::string&)
6a93d84a
TJ
429
430
431
432/*
433** conversions
434*/
435
436
437/**
438 * @brief returns a hex string from a binary string.
439 * @param str the (binary) string
440 * @param upper_case_digits determine whether to use upper case characters for digits A-F.
441 * @return the string in hex notation.
442 */
6ab3bc95
RP
443std::string convert_binary_to_hex(
444 const std::string& str,
445 bool upper_case_digits
6a93d84a
TJ
446)
447{
6ab3bc95
RP
448 std::string result;
449 std::string hexDigits(upper_case_digits ? hexDigitsUpper : hexDigitsLower);
450 for ( std::string::const_iterator it= str.begin();
451 it != str.end();
452 ++it)
453 {
454 result.push_back( hexDigits[ ( (*it) >> 4) & 0x0f ] );
455 result.push_back( hexDigits[ (*it) & 0x0f ] );
456 }
457 return result;
458} // eo convert_binary_to_hex(const std::string&,bool)
6a93d84a
TJ
459
460
461/**
462 * @brief converts a hex digit string to binary string.
463 * @param str hex digit string
464 * @return the binary string.
465 *
466 * The hex digit string may contains white spaces or colons which are treated
467 * as delimiters between hex digit groups.
468 *
469 * @todo rework the handling of half nibbles (consistency)!
470 */
6ab3bc95
RP
471std::string convert_hex_to_binary(
472 const std::string& str
6a93d84a 473)
6ab3bc95
RP
474throw (std::runtime_error)
475{
476 std::string result;
477 char c= 0;
478 bool hasNibble= false;
479 bool lastWasWS= true;
480 for ( std::string::const_iterator it= str.begin();
481 it != str.end();
482 ++it)
483 {
484 std::string::size_type p = hexDigitsLower.find( *it );
485 if (p== std::string::npos)
486 {
487 p= hexDigitsUpper.find( *it );
488 }
489 if (p == std::string::npos)
490 {
491 if ( ( Whitespaces.find( *it ) != std::string::npos) // is it a whitespace?
6a93d84a 492 or ( *it == ':') // or a colon?
6ab3bc95
RP
493 )
494 {
495 // we treat that as a valid delimiter:
496 if (hasNibble)
6a93d84a 497 {
6ab3bc95
RP
498 // 1 nibble before WS is treate as lower part:
499 result.push_back(c);
500 // reset state:
501 hasNibble= false;
6a93d84a 502 }
6ab3bc95
RP
503 lastWasWS= true;
504 continue;
505 }
506 }
507 if (p == std::string::npos )
508 {
509 throw runtime_error("illegal character in hex digit string: " + str);
510 }
511 lastWasWS= false;
512 if (hasNibble)
513 {
514 c<<=4;
515 }
516 else
517 {
518 c=0;
519 }
520 c+= (p & 0x0f);
521 if (hasNibble)
522 {
523 //we already had a nibble, so a char is complete now:
524 result.push_back( c );
525 hasNibble=false;
526 }
527 else
528 {
529 // this is the first nibble of a new char:
530 hasNibble=true;
531 }
532 }
533 if (hasNibble)
534 {
535 //well, there is one nibble left
536 // let's do some heuristics:
537 if (lastWasWS)
538 {
539 // if the preceeding character was a white space (or a colon)
540 // we treat the nibble as lower part:
541 //( this is consistent with shortened hex notations where leading zeros are not noted)
542 result.push_back( c );
543 }
544 else
545 {
546 // if it was part of a hex digit chain, we treat it as UPPER part (!!)
547 result.push_back( c << 4 );
548 }
549 }
550 return result;
551} // eo convert_hex_to_binary(const std::string&)
552
553
554} // eo namespace I2n
555
556
557
6a93d84a 558
e93545dd
GE
/**
 * @brief converts an ISO-8859-1 encoded string to UTF-8.
 * @param isostring the ISO-8859-1 encoded input.
 * @return the UTF-8 encoded string.
 * @throw std::runtime_error if iconv does not offer the conversion or the
 *        work buffer cannot be allocated.
 *
 * An error reported by iconv() during the conversion is not treated as fatal:
 * whatever was converted up to that point is returned.
 */
std::string iso_to_utf8(const std::string& isostring)
{
    iconv_t i2utf8 = iconv_open("UTF-8", "ISO-8859-1");

    // check the descriptor (not the address of this function!)
    if (i2utf8 == (iconv_t)-1)
        throw std::runtime_error("iconv can't convert from ISO-8859-1 to UTF-8");

    size_t in_size = isostring.size();
    const size_t buf_size = in_size * 4;
    size_t out_size = buf_size;

    char *buf = static_cast<char *>( malloc(buf_size + 1) );
    if (buf == NULL)
    {
        // don't leak the conversion descriptor
        iconv_close(i2utf8);
        throw std::runtime_error("out of memory for iconv buffer");
    }

    // iconv() does not modify the input, but its interface wants a non-const pointer
    char *in = const_cast<char *>( isostring.c_str() );
    char *out = buf;
    iconv(i2utf8, &in, &in_size, &out, &out_size);

    // take exactly the bytes written (keeps embedded NUL characters intact)
    std::string result(buf, buf_size - out_size);

    free(buf);
    iconv_close(i2utf8);

    return result;
}
588
/**
 * @brief converts a UTF-8 encoded string to ISO-8859-1.
 * @param utf8string the UTF-8 encoded input.
 * @return the ISO-8859-1 encoded string.
 * @throw std::runtime_error if iconv does not offer the conversion or the
 *        work buffer cannot be allocated.
 *
 * An error reported by iconv() during the conversion (e.g. a character
 * which cannot be converted) is not treated as fatal: whatever was
 * converted up to that point is returned.
 */
std::string utf8_to_iso(const std::string& utf8string)
{
    iconv_t utf82iso = iconv_open("ISO-8859-1", "UTF-8");

    if (utf82iso == (iconv_t)-1)
        throw std::runtime_error("iconv can't convert from UTF-8 to ISO-8859-1");

    size_t in_size = utf8string.size();
    // the output buffer is sized like the input
    const size_t buf_size = in_size;
    size_t out_size = buf_size;

    char *buf = static_cast<char *>( malloc(buf_size + 1) );
    if (buf == NULL)
    {
        // don't leak the conversion descriptor
        iconv_close(utf82iso);
        throw std::runtime_error("out of memory for iconv buffer");
    }

    // iconv() does not modify the input, but its interface wants a non-const pointer
    char *in = const_cast<char *>( utf8string.c_str() );
    char *out = buf;
    iconv(utf82iso, &in, &in_size, &out, &out_size);

    // take exactly the bytes written (keeps embedded NUL characters intact)
    std::string result(buf, buf_size - out_size);

    free(buf);
    iconv_close(utf82iso);

    return result;
}
618
a5f3af6e
GE
/**
 * @brief converts a UTF-8 encoded string to a zero terminated wide character buffer.
 * @param utf8string the UTF-8 encoded input.
 * @return pointer to a buffer allocated with malloc(); the caller has to
 *         release it with free().
 * @throw std::runtime_error if iconv does not offer the conversion, the
 *        buffer cannot be allocated or the conversion fails.
 *
 * The conversion target is "UCS-4LE".
 * NOTE(review): this presumably relies on wchar_t being a 4 byte little
 * endian type on the target platform - confirm for new platforms.
 */
wchar_t* utf8_to_wbuf(const std::string& utf8string)
{
    iconv_t utf82wstr = iconv_open("UCS-4LE", "UTF-8");

    if (utf82wstr == (iconv_t)-1)
        throw std::runtime_error("iconv can't convert from UTF-8 to UCS-4");

    size_t in_size = utf8string.size();
    // one wide character per input byte is the worst case; plus terminator
    const size_t buf_size = (in_size + 1) * sizeof(wchar_t);
    size_t out_size = buf_size;

    wchar_t *buf = static_cast<wchar_t *>( malloc(buf_size) );
    if (buf == NULL)
    {
        // don't leak the conversion descriptor
        iconv_close(utf82wstr);
        throw std::runtime_error("out of memory for iconv buffer");
    }

    // iconv() does not modify the input, but its interface wants a non-const pointer
    char *in = const_cast<char *>( utf8string.c_str() );
    char *out = reinterpret_cast<char *>(buf);
    if (iconv(utf82wstr, &in, &in_size, &out, &out_size) == (size_t)-1)
    {
        // release everything acquired so far before reporting the error
        free(buf);
        iconv_close(utf82wstr);
        throw std::runtime_error("error converting char encodings");
    }

    // terminate behind the last character written
    buf[ (buf_size - out_size) / sizeof(wchar_t) ] = 0;

    iconv_close(utf82wstr);

    return buf;
}
644
/**
 * @brief converts a string from the "UTF-7-IMAP" encoding to UTF-8.
 * @param utf7imapstring the UTF-7-IMAP encoded input.
 * @return the UTF-8 encoded string.
 * @throw std::runtime_error if iconv does not offer the conversion or the
 *        work buffer cannot be allocated.
 *
 * An error reported by iconv() during the conversion is not treated as fatal:
 * whatever was converted up to that point is returned.
 */
std::string utf7imap_to_utf8(const std::string& utf7imapstring)
{
    iconv_t utf7imap2utf8 = iconv_open("UTF-8", "UTF-7-IMAP");

    if (utf7imap2utf8 == (iconv_t)-1)
        throw std::runtime_error("iconv can't convert from UTF-7-IMAP to UTF-8");

    size_t in_size = utf7imapstring.size();
    const size_t buf_size = in_size * 4;
    size_t out_size = buf_size;

    char *buf = static_cast<char *>( malloc(buf_size + 1) );
    if (buf == NULL)
    {
        // don't leak the conversion descriptor
        iconv_close(utf7imap2utf8);
        throw std::runtime_error("out of memory for iconv buffer");
    }

    // iconv() does not modify the input, but its interface wants a non-const pointer
    char *in = const_cast<char *>( utf7imapstring.c_str() );
    char *out = buf;
    iconv(utf7imap2utf8, &in, &in_size, &out, &out_size);

    // take exactly the bytes written (keeps embedded NUL characters intact)
    std::string result(buf, buf_size - out_size);

    free(buf);
    iconv_close(utf7imap2utf8);

    return result;
}
674
6a2b6dd1
TJ
/**
 * @brief converts a UTF-8 encoded string to the "UTF-7-IMAP" encoding.
 * @param utf8string the UTF-8 encoded input.
 * @return the UTF-7-IMAP encoded string.
 * @throw std::runtime_error if iconv does not offer the conversion or the
 *        work buffer cannot be allocated.
 *
 * An error reported by iconv() during the conversion is not treated as fatal:
 * whatever was converted up to that point is returned.
 */
std::string utf8_to_utf7imap(const std::string& utf8string)
{
    iconv_t utf82utf7imap = iconv_open("UTF-7-IMAP", "UTF-8");

    if (utf82utf7imap == (iconv_t)-1)
        throw std::runtime_error("iconv can't convert from UTF-8 to UTF-7-IMAP");

    // UTF-7 is base64 encoded, a buffer 10x as large
    // as the utf-8 buffer should be enough. If not the string will be truncated.
    size_t in_size = utf8string.size();
    const size_t buf_size = in_size * 10;
    size_t out_size = buf_size;

    char *buf = static_cast<char *>( malloc(buf_size + 1) );
    if (buf == NULL)
    {
        // don't leak the conversion descriptor
        iconv_close(utf82utf7imap);
        throw std::runtime_error("out of memory for iconv buffer");
    }

    // iconv() does not modify the input, but its interface wants a non-const pointer
    char *in = const_cast<char *>( utf8string.c_str() );
    char *out = buf;
    iconv(utf82utf7imap, &in, &in_size, &out, &out_size);

    // The target encoding is stateful: a call without input makes iconv write
    // the sequence which returns to the initial shift state (i.e. terminates
    // a base64 run which is still open at the end of the input).
    iconv(utf82utf7imap, NULL, NULL, &out, &out_size);

    // take exactly the bytes written
    std::string result(buf, buf_size - out_size);

    free(buf);
    iconv_close(utf82utf7imap);

    return result;
}
706
118e216e
TJ
/**
 * @brief splits a string into (html) tag tokens and text tokens.
 * @param[out] tokenized the tokens are appended here; first: the token text,
 *             second: @a true if the token is a tag ("<...>").
 * @param input the string to tokenize.
 *
 * A '<' always starts a new token. Text collected before it is stored as
 * non-tag token, even if it is an opened tag which was never closed.
 * A '>' outside of a tag is ordinary text.
 */
void tokenize_by_tag(std::vector<std::pair<std::string,bool> > &tokenized, const std::string &input)
{
    std::string pending;
    bool in_tag = false;

    for (std::string::const_iterator ch = input.begin(); ch != input.end(); ++ch)
    {
        if (*ch == '<')
        {
            // flush what was collected so far as plain text
            if (!pending.empty() )
            {
                tokenized.push_back( std::make_pair(pending, false) );
                pending.clear();
            }
            in_tag = true;
            pending.push_back(*ch);
            continue;
        }

        pending.push_back(*ch);

        if (*ch == '>' && in_tag)
        {
            // the tag is complete
            in_tag = false;
            tokenized.push_back( std::make_pair(pending, true) );
            pending.clear();
        }
    }

    // String left over in buffer?
    if (!pending.empty() )
    {
        tokenized.push_back( std::make_pair(pending, false) );
    }
} // eo tokenize_by_tag
118e216e 746
118e216e
TJ
747
748std::string strip_html_tags(const std::string &input)
749{
6ab3bc95
RP
750 // Pair first: string, second: isTag
751 vector<pair<string,bool> > tokenized;
752 tokenize_by_tag (tokenized, input);
118e216e 753
6ab3bc95
RP
754 string output;
755 vector<pair<string,bool> >::const_iterator token, tokens_end = tokenized.end();
83d700e9 756 for (token = tokenized.begin(); token != tokens_end; ++token)
6ab3bc95
RP
757 if (!token->second)
758 output += token->first;
759
760 return output;
761} // eo strip_html_tags
118e216e 762
118e216e
TJ
763
764// Smart-encode HTML en
765string smart_html_entities(const std::string &input)
766{
6ab3bc95
RP
767 // Pair first: string, second: isTag
768 vector<pair<string,bool> > tokenized;
769 tokenize_by_tag (tokenized, input);
770
771 string output;
772 vector<pair<string,bool> >::const_iterator token, tokens_end = tokenized.end();
83d700e9 773 for (token = tokenized.begin(); token != tokens_end; ++token)
6ab3bc95
RP
774 {
775 // keep HTML tags as they are
776 if (token->second)
777 output += token->first;
778 else
779 output += html_entities(token->first);
780 }
781
782 return output;
118e216e
TJ
783}
784
6ab3bc95 785
a5f3af6e
GE
/**
 * @brief searches for the first character outside of the 7 bit ASCII range.
 * @param str the string to examine.
 * @return index of the first character with a value above 127 or
 *         std::string::npos if there is none.
 */
std::string::size_type find_8bit(const std::string &str)
{
    for (std::string::const_iterator ch = str.begin(); ch != str.end(); ++ch)
    {
        // values above 127 are exactly those with the highest bit set
        if (static_cast<unsigned char>(*ch) & 0x80)
        {
            return static_cast<std::string::size_type>(ch - str.begin());
        }
    }

    return std::string::npos;
}
795
118e216e
TJ
796// encoded UTF-8 chars into HTML entities
797string html_entities(std::string str)
798{
6ab3bc95
RP
799 // Normal chars
800 replace_all (str, "&", "&amp;");
6ab3bc95
RP
801 replace_all (str, "<", "&lt;");
802 replace_all (str, ">", "&gt;");
980577e1
TJ
803 replace_all (str, "\"", "&quot;");
804 replace_all (str, "'", "&#x27;");
805 replace_all (str, "/", "&#x2F;");
6ab3bc95
RP
806
807 // Umlauts
808 replace_all (str, "\xC3\xA4", "&auml;");
809 replace_all (str, "\xC3\xB6", "&ouml;");
810 replace_all (str, "\xC3\xBC", "&uuml;");
811 replace_all (str, "\xC3\x84", "&Auml;");
812 replace_all (str, "\xC3\x96", "&Ouml;");
813 replace_all (str, "\xC3\x9C", "&Uuml;");
814
815 // Misc
816 replace_all (str, "\xC3\x9F", "&szlig;");
817
818 // conversion of remaining non-ASCII chars needed?
819 // just do if needed because of performance
820 if (find_8bit(str) != string::npos)
821 {
822 // convert to fixed-size encoding UTF-32
823 wchar_t* wbuf=utf8_to_wbuf(str);
824 ostringstream target;
825
826 // replace all non-ASCII chars with HTML representation
827 for (int p=0; wbuf[p] != 0; p++)
828 {
829 unsigned int c=wbuf[p];
830
831 if (c <= 127)
832 target << static_cast<unsigned char>(c);
833 else
834 target << "&#" << c << ';';
835 }
836
837 free(wbuf);
838
839 str=target.str();
840 }
841
842 return str;
843} // eo html_entities(std::string)
844
554f813d
GE
845// convert HTML entities to something that can be viewed on a basic text console (restricted to ASCII-7)
846string html_entities_to_console(std::string str)
847{
848 // Normal chars
849 replace_all (str, "&amp;", "&");
850 replace_all (str, "&lt;", "<");
851 replace_all (str, "&gt;", ">");
852 replace_all (str, "&quot;", "\"");
853 replace_all (str, "&#x27;", "'");
854 replace_all (str, "&#x2F;", "/");
855
856 // Umlauts
857 replace_all (str, "&auml;", "ae");
858 replace_all (str, "&ouml;", "oe");
859 replace_all (str, "&uuml;", "ue");
860 replace_all (str, "&Auml;", "Ae");
861 replace_all (str, "&Ouml;", "Oe");
862 replace_all (str, "&Uuml;", "Ue");
863
864 // Misc
865 replace_all (str, "&szlig;", "ss");
866
867 return str;
868}
118e216e 869
e93545dd
GE
870bool replace_all(string &base, const char *ist, const char *soll)
871{
6ab3bc95
RP
872 string i=ist;
873 string s=soll;
874 return replace_all(base,&i,&s);
e93545dd
GE
875}
876
877bool replace_all(string &base, const string &ist, const char *soll)
878{
6ab3bc95
RP
879 string s=soll;
880 return replace_all(base,&ist,&s);
e93545dd
GE
881}
882
883bool replace_all(string &base, const string *ist, const string *soll)
884{
6ab3bc95 885 return replace_all(base,*ist,*soll);
e93545dd
GE
886}
887
888bool replace_all(string &base, const char *ist, const string *soll)
889{
6ab3bc95
RP
890 string i=ist;
891 return replace_all(base,&i,soll);
e93545dd
GE
892}
893
/**
 * @brief replaces all occurrences of a text within a string.
 * @param[in,out] base the string to modify.
 * @param ist the text to search for; must not be empty.
 * @param soll the replacement text.
 * @return @a true if at least one occurrence was replaced.
 * @throw std::runtime_error if @a ist is empty.
 *
 * The search continues behind each inserted replacement, i.e. the
 * replacement text itself is never searched.
 */
bool replace_all(std::string &base, const std::string &ist, const std::string &soll)
{
    if (ist.empty() )
        throw std::runtime_error("replace_all called with empty search string");

    bool replaced_any = false;

    for (std::string::size_type at = base.find(ist);
         at != std::string::npos;
         at = base.find(ist, at + soll.size()) )
    {
        base.replace(at, ist.size(), soll);
        replaced_any = true;
    }

    return replaced_any;
}
911
b953bf36
GE
/**
 * @brief replaces all characters that could be problematic or impose a security risk when being logged
 * @param str the original string
 * @param replace_with the character to replace the unsafe chars with
 * @return a string that is safe to send to syslog or other logfiles
 *
 * All chars between 0x20 (space) and 0x7E (~) (including) are considered safe for logging.
 * See e.g. RFC 5424, section 8.2 or the posix character class "printable".
 * This eliminates all possible problems with NUL, control characters, 8 bit chars, UTF8.
 *
 */
std::string sanitize_for_logging(const std::string &str, const char replace_with)
{
    std::string sanitized(str);

    for (std::string::iterator ch = sanitized.begin(); ch != sanitized.end(); ++ch)
    {
        // works for signed as well as unsigned plain char: 8 bit characters
        // are either negative or greater than 0x7E.
        const bool safe = (*ch >= 0x20 && *ch <= 0x7E);
        if (!safe)
        {
            *ch = replace_with;
        }
    }

    return sanitized;
}
934
#if 0
// Disabled implementations of to_lower() / to_upper() outside of namespace I2n;
// functions with these names are defined in namespace I2n in this file.

// returns a lower case copy of src
std::string to_lower(const std::string &src)
{
    std::string dst(src);

    for (std::string::iterator ch = dst.begin(); ch != dst.end(); ++ch)
        *ch = tolower(*ch);

    return dst;
}

// returns an upper case copy of src
std::string to_upper(const std::string &src)
{
    std::string dst(src);

    for (std::string::iterator ch = dst.begin(); ch != dst.end(); ++ch)
        *ch = toupper(*ch);

    return dst;
}
#endif
e93545dd 958
83809f5e 959const int MAX_UNIT_FORMAT_SYMBOLS = 6;
d1ea9075 960
2cb9a9c5 961const string shortUnitFormatSymbols[MAX_UNIT_FORMAT_SYMBOLS] = {
d1ea9075
GMF
962 " B",
963 " KB",
964 " MB",
965 " GB",
966 " TB",
83809f5e 967 " PB"
d1ea9075
GMF
968};
969
2cb9a9c5 970const string longUnitFormatSymbols[MAX_UNIT_FORMAT_SYMBOLS] = {
5cb766b9
GMF
971 i18n_noop(" Bytes"),
972 i18n_noop(" KBytes"),
973 i18n_noop(" MBytes"),
974 i18n_noop(" GBytes"),
975 i18n_noop(" TBytes"),
83809f5e 976 i18n_noop(" PBytes")
d1ea9075
GMF
977};
978
72a94426
GMF
979
/**
 * @brief rounds a number "half up" to a multiple of 1/@a rounding_multiplier.
 * @param number the number to round.
 * @param rounding_multiplier reciprocal of the rounding step, e.g. 10 rounds
 *        to one decimal place; must not be 0.
 * @return the rounded number.
 *
 * The scaled number has to fit into an int64_t.
 */
long double rounding_upwards(
    long double number,
    const int rounding_multiplier
)
{
    const long double scaled = number * rounding_multiplier + 0.5;

    // the conversion to integer truncates toward zero ...
    int64_t rounded = (int64_t) (scaled);
    // ... which is the wrong direction for negative values (e.g. -5.0 would
    // end up as -4.9); step down once to get a real "floor".
    if ( (long double) (rounded) > scaled)
    {
        --rounded;
    }

    return (long double) (rounded) / (long double) (rounding_multiplier);
}
993
994
81267544
GMF
995string nice_unit_format(
996 const int64_t input,
70fc0674
GMF
997 const UnitFormat format,
998 const UnitBase base
81267544 999)
6ab3bc95 1000{
d1ea9075 1001 // select the system of units (decimal or binary)
81267544 1002 int multiple = 0;
a398513a 1003 if (base == UnitBase1000)
81267544
GMF
1004 {
1005 multiple = 1000;
1006 }
1007 else
1008 {
1009 multiple = 1024;
1010 }
1011
1012 long double size = input;
6ab3bc95 1013
d1ea9075
GMF
1014 // check the size of the input number to fit in the appropriate symbol
1015 int sizecount = 0;
81267544 1016 while (size > multiple)
6ab3bc95 1017 {
81267544
GMF
1018 size = size / multiple;
1019 sizecount++;
83809f5e
GMF
1020
1021 // rollback to the previous values and stop the loop when cannot
1022 // represent the number length.
1023 if (sizecount >= MAX_UNIT_FORMAT_SYMBOLS)
1024 {
1025 size = size * multiple;
1026 sizecount--;
1027 break;
1028 }
6ab3bc95
RP
1029 }
1030
a398513a
GMF
1031 // round the input number "half up" to multiples of 10
1032 const int rounding_multiplier = 10;
72a94426 1033 size = rounding_upwards(size, rounding_multiplier);
6ab3bc95 1034
d1ea9075 1035 // format the input number, placing the appropriate symbol
6ab3bc95 1036 ostringstream out;
6ab3bc95 1037 out.setf (ios::fixed);
a398513a 1038 if (format == ShortUnitFormat)
d1ea9075
GMF
1039 {
1040 out.precision(1);
68d37a5c 1041 out << size << i18n( shortUnitFormatSymbols[sizecount].c_str() );
d1ea9075
GMF
1042 }
1043 else
6ab3bc95 1044 {
d1ea9075 1045 out.precision (2);
68d37a5c 1046 out << size << i18n( longUnitFormatSymbols[sizecount].c_str() );
6ab3bc95
RP
1047 }
1048
1049 return out.str();
1050} // eo nice_unit_format(int input)
1051
e93545dd 1052
47c07fba
GE
/**
 * @brief converts a string to an escaped, double quoted string.
 * @param s the string.
 * @return the string enclosed in double quotes; double quotes and
 *         backslashes within are prefixed with a backslash, carriage return
 *         and line feed are written as "\r" and "\n".
 */
std::string escape(const std::string &s)
{
    std::string out;
    out.reserve(s.size() + 2);

    out += '"';
    for (std::string::const_iterator ch = s.begin(); ch != s.end(); ++ch)
    {
        switch (*ch)
        {
            case '"':
            case '\\':
                out += '\\';
                out += *ch;
                break;
            case '\r':
                out += "\\r";
                break;
            case '\n':
                out += "\\n";
                break;
            default:
                out += *ch;
        }
    }
    out += '"';

    return out;
} // eo escape(const std::string&)
47c07fba 1083
47c07fba 1084
6ab3bc95
RP
/**
 * @brief extracts and decodes an escaped, double quoted string.
 * @param s the string containing the escaped string.
 * @param startpos position of the opening double quote within @a s.
 * @param[out] endpos position of the closing double quote within @a s.
 * @return the decoded content between the quotes: "\r" and "\n" become
 *         carriage return and line feed, any other character behind a
 *         backslash is taken literally.
 * @throw std::out_of_range if there is no double quote at @a startpos (or
 *        @a startpos is outside of @a s) or if a backslash is the very last
 *        character of the extracted text.
 *
 * If no closing quote is found, the rest of @a s is decoded; @a endpos is
 * not meaningful in that case.
 */
std::string descape(const std::string &s, int startpos, int &endpos)
{
    if (s.at(startpos) != '"')
        throw std::out_of_range("value not type escaped string");

    // everything behind the opening quote
    std::string out(s, startpos + 1);

    // search for the end of the string: the first quote which is not escaped
    std::string::size_type quote = out.find('"');
    while (quote != std::string::npos)
    {
        // count the backslashes directly in front of the quote; an odd
        // number means the quote itself is escaped.
        std::string::size_type backslashes = 0;
        while (backslashes < quote && out[quote - 1 - backslashes] == '\\')
        {
            ++backslashes;
        }

        if (backslashes % 2 == 0)
            break;

        quote = out.find('"', quote + 1);
    }

    // we now have the end of the string
    if (quote != std::string::npos)
    {
        out.erase(quote);
    }

    // tell calling prog about the endposition
    endpos = startpos + quote + 1;

    // descape all \ stuff inside the string now
    for (std::string::size_type bs = out.find('\\');
         bs != std::string::npos;
         bs = out.find('\\', bs + 1) )
    {
        const char code = out.at(bs + 1);
        if (code == 'r')
            out.replace(bs, 2, "\r");
        else if (code == 'n')
            out.replace(bs, 2, "\n");
        else
            out.erase(bs, 1);    // keep the escaped character, drop the backslash
    }

    return out;
} // eo descape(const std::string&,int,int&)
47c07fba 1144
e93545dd 1145
47c07fba
GE
/**
 * @brief quotes a string for use as a single argument on a shell command line.
 * @param input the string.
 * @return the string enclosed in single quotes; every single quote within
 *         is written as '\'' (close the quoting, escaped quote, reopen the quoting).
 */
std::string escape_shellarg(const std::string &input)
{
    std::string quoted(1, '\'');

    for (std::string::size_type i = 0; i < input.size(); ++i)
    {
        if (input[i] == '\'')
        {
            // close the quoting and add an escaped quote; the quote character
            // appended below reopens the quoting.
            quoted.append("'\\'");
        }
        quoted.push_back(input[i]);
    }

    quoted.push_back('\'');
    return quoted;
}