Create functions find/remove_html_comments
[libi2ncommon] / src / stringfunc.cpp
CommitLineData
0e23f538
TJ
1/*
2The software in this package is distributed under the GNU General
3Public License version 2 (with a special exception described below).
4
5A copy of GNU General Public License (GPL) is included in this distribution,
6in the file COPYING.GPL.
7
8As a special exception, if other files instantiate templates or use macros
9or inline functions from this file, or you compile this file and link it
10with other works to produce a work based on this file, this file
11does not by itself cause the resulting work to be covered
12by the GNU General Public License.
13
14However the source code for this file must still be made available
15in accordance with section (3) of the GNU General Public License.
16
17This exception does not invalidate any other reasons why a work based
18on this file might be covered by the GNU General Public License.
19*/
6a93d84a
TJ
20/** @file
21 *
22 * (c) Copyright 2007-2008 by Intra2net AG
6a93d84a 23 */
e93545dd
GE
24
25#include <iostream>
26#include <string>
27#include <sstream>
28#include <stdexcept>
5efd35b1 29#include <algorithm>
5cd64148 30#include <cmath> // for round()
e93545dd 31
a5f3af6e 32#include <wchar.h>
e93545dd
GE
33#include <stdlib.h>
34#include <iconv.h>
35#include <i18n.h>
36
5cd64148 37#include <boost/numeric/conversion/cast.hpp>
3f5c5ccd 38#include <boost/foreach.hpp>
5cd64148 39
e93545dd
GE
40#include <stringfunc.hxx>
41
42using namespace std;
43
6ab3bc95
RP
44namespace I2n
45{
6a93d84a
TJ
46
47
6ab3bc95
RP
namespace
{

/// Hex digits in lower case; the index of a character is its nibble value (0..15).
const std::string hexDigitsLower("0123456789abcdef");
/// Hex digits in upper case; the index of a character is its nibble value (0..15).
const std::string hexDigitsUpper("0123456789ABCDEF");


/**
 * @brief functor which converts a single character to upper case.
 *
 * Uses std::toupper(int), i.e. the conversion follows the current C locale.
 */
struct UpperFunc
{
    char operator() (char c) const
    {
        // std::toupper() is only defined for values representable as unsigned char
        // (or EOF); a negative plain char must therefore be converted first.
        return static_cast<char>( std::toupper( static_cast<unsigned char>(c) ) );
    }
}; // eo struct UpperFunc


/**
 * @brief functor which converts a single character to lower case.
 *
 * Uses std::tolower(int), i.e. the conversion follows the current C locale.
 */
struct LowerFunc
{
    char operator() (char c) const
    {
        // see UpperFunc: avoid passing a negative value to the <cctype> function.
        return static_cast<char>( std::tolower( static_cast<unsigned char>(c) ) );
    }
}; // eo struct LowerFunc


} // eo namespace <anonymous>
74
75
76
/**
 * default set of whitespace characters: blank, tab, carriage return and line feed.
 */
const std::string Whitespaces(" \t\r\n");

/**
 * default set of line ending characters: carriage return and line feed.
 */
const std::string LineEndings("\r\n");
6a93d84a
TJ
86
87
88
/**
 * @brief tests whether a string starts with a given prefix.
 * @param str the string which is examined.
 * @param prefix the prefix to look for.
 * @return @a true iff @a prefix is not empty and @a str begins with it.
 */
bool has_prefix(const std::string& str, const std::string& prefix)
{
    // an empty prefix never matches; a prefix longer than the string cannot match
    // (the length test also rejects an empty str once the prefix is known to be non-empty).
    return !prefix.empty()
        && prefix.size() <= str.size()
        && str.compare(0, prefix.size(), prefix) == 0;
} // eo has_prefix(const std::string&,const std::string&)
6a93d84a
TJ
103
104
/**
 * @brief tests whether a string ends with a given suffix.
 * @param str the string which is examined.
 * @param suffix the suffix to look for.
 * @return @a true iff @a suffix is not empty and @a str ends with it.
 */
bool has_suffix(const std::string& str, const std::string& suffix)
{
    // an empty suffix never matches; a suffix longer than the string cannot match.
    return !suffix.empty()
        && suffix.size() <= str.size()
        && str.compare(str.size() - suffix.size(), suffix.size(), suffix) == 0;
} // eo has_suffix(const std::string&,const std::string&)
6a93d84a
TJ
119
120
/**
 * @brief strips characters of a given set from both ends of a string (in place).
 * @param[in,out] str the string which is trimmed.
 * @param charlist the characters to strip from the beginning and the end of @a str.
 * @return a copy of the trimmed string.
 */
std::string trim_mod(std::string& str, const std::string& charlist)
{
    const std::string::size_type first_kept= str.find_first_not_of(charlist);
    if (first_kept == std::string::npos)
    {
        // nothing but characters from charlist (or the string is empty already)
        str.clear();
        return str;
    }
    // at least one character survives, so the search from the end succeeds as well.
    const std::string::size_type last_kept= str.find_last_not_of(charlist);
    // drop the tail first; that way first_kept still refers to the same character.
    str.erase(last_kept + 1);
    str.erase(0, first_kept);
    return str;
} // eo trim_mod(std::string&,const std::string&)
6a93d84a
TJ
150
151
152
/**
 * @brief removes the last character of a string if it belongs to a given set (in place).
 * @param[in,out] str the string.
 * @param what the set of characters which are removed when found at the end.
 * @return a copy of the resulting string (at most one character shorter than before).
 */
std::string chomp_mod(std::string& str, const std::string& what)
{
    if (!str.empty() && !what.empty() )
    {
        const std::string::size_type last_idx= str.size() - 1;
        if (what.find( str[last_idx] ) != std::string::npos)
        {
            str.erase(last_idx);
        }
    }
    return str;
} // eo chomp_mod(std::string&,const std::string&)
6a93d84a
TJ
171
172
173/**
174 * @brief converts a string to lower case.
175 * @param[in,out] str the string to modify.
176 * @return the string
177 */
6ab3bc95 178std::string to_lower_mod(std::string& str)
6a93d84a 179{
6ab3bc95
RP
180 std::transform(str.begin(), str.end(), str.begin(), LowerFunc() );
181 return str;
182} // eo to_lower_mod(std::string&)
6a93d84a
TJ
183
184
185/**
186 * @brief converts a string to upper case.
187 * @param[in,out] str the string to modify.
188 * @return the string
189 */
6ab3bc95 190std::string to_upper_mod(std::string& str)
6a93d84a 191{
6ab3bc95
RP
192 std::transform( str.begin(), str.end(), str.begin(), UpperFunc() );
193 return str;
194} // eo to_upper_mod(std::string&)
6a93d84a
TJ
195
196
197
/**
 * @brief returns a copy of a string with characters of a given set stripped from both ends.
 * @param str the string which should be trimmed.
 * @param charlist the characters to strip from the beginning and the end.
 * @return the trimmed string.
 */
std::string trim (const std::string& str, const std::string& charlist)
{
    const std::string::size_type first_kept= str.find_first_not_of(charlist);
    if (first_kept == std::string::npos)
    {
        // only characters from charlist (or an empty string): nothing remains
        return std::string();
    }
    // there is at least one character to keep, so this search cannot fail
    const std::string::size_type last_kept= str.find_last_not_of(charlist);
    return std::string(str, first_kept, last_kept - first_kept + 1);
} // eo trim(const std::string&,const std::string&)
217
218
/**
 * @brief returns a copy of a string without its last character if that character belongs to a given set.
 * @param str the string.
 * @param what the set of characters which are removed when found at the end.
 * @return the resulting string (at most one character shorter than @a str).
 */
std::string chomp (const std::string& str, const std::string& what)
{
    std::string result(str);
    if (!result.empty() && !what.empty() )
    {
        const std::string::size_type last_idx= result.size() - 1;
        if (what.find( result[last_idx] ) != std::string::npos)
        {
            result.erase(last_idx);
        }
    }
    return result;
} // eo chomp(const std::string&,const std::string&)
237
238
239/**
240 * @brief returns a lower case version of a given string.
241 * @param str the string
242 * @return the lower case version of the string
243 */
6ab3bc95 244std::string to_lower (const std::string& str)
6a93d84a 245{
6ab3bc95
RP
246 std::string result(str);
247 return to_lower_mod(result);
248} // eo to_lower(const std::string&)
6a93d84a
TJ
249
250
251/**
252 * @brief returns a upper case version of a given string.
253 * @param str the string
254 * @return the upper case version of the string
255 */
6ab3bc95 256std::string to_upper(const std::string& str)
6a93d84a 257{
6ab3bc95
RP
258 std::string result(str);
259 return to_upper_mod(result);
260} // eo to_upper(const std::string&)
6a93d84a
TJ
261
262
263
264/**
265 * @brief removes a given suffix from a string.
266 * @param str the string.
267 * @param suffix the suffix which should be removed if the string ends with it.
268 * @return the string without the suffix.
269 *
270 * If the string ends with the suffix, it is removed. If the the string doesn't end
271 * with the suffix the original string is returned.
272 */
6ab3bc95 273std::string remove_suffix(const std::string& str, const std::string& suffix)
6a93d84a 274{
6ab3bc95
RP
275 if (has_suffix(str,suffix) )
276 {
277 return str.substr(0, str.size()-suffix.size() );
278 }
279 return str;
280} // eo remove_suffix(const std::string&,const std::string&)
6a93d84a
TJ
281
282
283
284/**
285 * @brief removes a given prefix from a string.
286 * @param str the string.
287 * @param prefix the prefix which should be removed if the string begins with it.
288 * @return the string without the prefix.
289 *
290 * If the string begins with the prefix, it is removed. If the the string doesn't begin
291 * with the prefix the original string is returned.
292 */
6ab3bc95 293std::string remove_prefix(const std::string& str, const std::string& prefix)
6a93d84a 294{
6ab3bc95
RP
295 if (has_prefix(str,prefix) )
296 {
297 return str.substr( prefix.size() );
298 }
299 return str;
300} // eo remove_prefix(const std::string&,const std::string&)
6a93d84a
TJ
301
302
303/**
304 * split a string to key and value delimited by a given delimiter.
6ab3bc95 305 * The resulting key and value strings are trimmed (Whitespaces removed at beginning and end).
6a93d84a
TJ
306 * @param str the string which should be splitted.
307 * @param[out] key the resulting key
308 * @param[out] value the resulting value
309 * @param delimiter the delimiter between key and value; default is '='.
310 * @return @a true if the split was successful.
311 */
6ab3bc95
RP
312bool pair_split(
313 const std::string& str,
314 std::string& key,
315 std::string& value,
316 char delimiter)
317{
318 std::string::size_type pos = str.find (delimiter);
319 if (pos == std::string::npos) return false;
320 key= str.substr(0,pos);
321 value= str.substr(pos+1);
322 trim_mod(key);
323 trim_mod(value);
324 return true;
325} // eo pair_split(const std::string&,std::string&,std::string&,char)
6a93d84a
TJ
326
327
328/**
329 * splits a string by given delimiter
330 *
331 * @param[in] str the string which should be splitted.
332 * @param[out] result the list resulting from splitting @a str.
333 * @param[in] delimiter the delimiter (word/phrase) at which @a str should be splitted.
334 * @param[in] omit_empty should empty parts not be stored?
335 * @param[in] trim_list list of characters the parts should be trimmed by.
336 * (empty string results in no trim)
337 */
6ab3bc95
RP
338void split_string(
339 const std::string& str,
340 std::list<std::string>& result,
341 const std::string& delimiter,
342 bool omit_empty,
343 const std::string& trim_list
6a93d84a
TJ
344)
345{
6ab3bc95
RP
346 std::string::size_type pos, last_pos=0;
347 bool delimiter_found= false;
348 while ( last_pos < str.size() && last_pos != std::string::npos)
349 {
350 pos= str.find(delimiter, last_pos);
351 std::string part;
352 if (pos == std::string::npos)
353 {
354 part= str.substr(last_pos);
355 delimiter_found= false;
356 }
357 else
358 {
359 part= str.substr(last_pos, pos-last_pos);
360 delimiter_found=true;
361 }
362 if (pos != std::string::npos)
363 {
364 last_pos= pos+ delimiter.size();
365 }
366 else
367 {
368 last_pos= std::string::npos;
369 }
370 if (!trim_list.empty() ) trim_mod (part, trim_list);
371 if (omit_empty && part.empty() ) continue;
372 result.push_back( part );
373 }
374 // if the string ends with a delimiter we need to append an empty string if no omit_empty
375 // was given.
376 // (this way we keep the split result consistent to a join operation)
377 if (delimiter_found && !omit_empty)
378 {
379 result.push_back("");
380 }
381} // eo split_string(const std::string&,std::list< std::string >&,const std::string&,bool,const std::string&)
6a93d84a
TJ
382
383
384/**
385 * splits a string by a given delimiter
386 * @param str the string which should be splitted.
387 * @param delimiter delimiter the delimiter (word/phrase) at which @a str should be splitted.
388 * @param[in] omit_empty should empty parts not be stored?
389 * @param[in] trim_list list of characters the parts should be trimmed by.
390 * (empty string results in no trim)
391 * @return the list resulting from splitting @a str.
392 */
6ab3bc95
RP
393std::list<std::string> split_string(
394 const std::string& str,
395 const std::string& delimiter,
396 bool omit_empty,
397 const std::string& trim_list
6a93d84a
TJ
398)
399{
6ab3bc95
RP
400 std::list<std::string> result;
401 split_string(str, result, delimiter, omit_empty, trim_list);
402 return result;
403} // eo split_string(const std::string&,const std::string&,bool,const std::string&)
6a93d84a
TJ
404
405
/**
 * @brief concatenates a list of strings into a single string.
 *
 * This function is (basically) the reverse operation of @a split_string.
 *
 * @param parts the list of strings.
 * @param delimiter the delimiter which is inserted between two consecutive strings.
 * @return the joined string; empty if @a parts is empty.
 */
std::string join_string(
    const std::list< std::string >& parts,
    const std::string& delimiter
)
{
    std::string joined;
    for (std::list< std::string >::const_iterator part= parts.begin();
         part != parts.end();
         ++part)
    {
        // the delimiter goes in front of every part except the first one
        if (part != parts.begin() )
        {
            joined+= delimiter;
        }
        joined+= *part;
    }
    return joined;
} // eo join_string(const std::list< std::string >&,const std::string&)
6a93d84a
TJ
433
434
376ec4fa
CH
/**
 * @brief concatenates a vector of strings into a single string.
 *
 * Same as join_string for a list, except that it takes a vector.
 *
 * @param parts the vector of strings.
 * @param delimiter the delimiter which is inserted between two consecutive strings.
 * @return the joined string; empty if @a parts is empty.
 */
std::string join_string(
    const std::vector< std::string >& parts,
    const std::string& delimiter
)
{
    std::string joined;
    const std::vector< std::string >::size_type count= parts.size();
    for (std::vector< std::string >::size_type idx= 0; idx < count; ++idx)
    {
        // the delimiter goes in front of every part except the first one
        if (idx > 0)
        {
            joined+= delimiter;
        }
        joined+= parts[idx];
    }
    return joined;
} // eo join_string(const std::vector< std::string >&,const std::string&)
454
455
6a93d84a
TJ
456
457/*
458** conversions
459*/
460
461
462/**
463 * @brief returns a hex string from a binary string.
464 * @param str the (binary) string
465 * @param upper_case_digits determine whether to use upper case characters for digits A-F.
466 * @return the string in hex notation.
467 */
6ab3bc95
RP
468std::string convert_binary_to_hex(
469 const std::string& str,
470 bool upper_case_digits
6a93d84a
TJ
471)
472{
6ab3bc95
RP
473 std::string result;
474 std::string hexDigits(upper_case_digits ? hexDigitsUpper : hexDigitsLower);
475 for ( std::string::const_iterator it= str.begin();
476 it != str.end();
477 ++it)
478 {
479 result.push_back( hexDigits[ ( (*it) >> 4) & 0x0f ] );
480 result.push_back( hexDigits[ (*it) & 0x0f ] );
481 }
482 return result;
483} // eo convert_binary_to_hex(const std::string&,bool)
6a93d84a
TJ
484
485
486/**
487 * @brief converts a hex digit string to binary string.
488 * @param str hex digit string
489 * @return the binary string.
490 *
491 * The hex digit string may contains white spaces or colons which are treated
492 * as delimiters between hex digit groups.
493 *
494 * @todo rework the handling of half nibbles (consistency)!
495 */
6ab3bc95
RP
496std::string convert_hex_to_binary(
497 const std::string& str
6a93d84a 498)
6ab3bc95
RP
499throw (std::runtime_error)
500{
501 std::string result;
502 char c= 0;
503 bool hasNibble= false;
504 bool lastWasWS= true;
505 for ( std::string::const_iterator it= str.begin();
506 it != str.end();
507 ++it)
508 {
509 std::string::size_type p = hexDigitsLower.find( *it );
510 if (p== std::string::npos)
511 {
512 p= hexDigitsUpper.find( *it );
513 }
514 if (p == std::string::npos)
515 {
516 if ( ( Whitespaces.find( *it ) != std::string::npos) // is it a whitespace?
6a93d84a 517 or ( *it == ':') // or a colon?
6ab3bc95
RP
518 )
519 {
520 // we treat that as a valid delimiter:
521 if (hasNibble)
6a93d84a 522 {
6ab3bc95
RP
523 // 1 nibble before WS is treate as lower part:
524 result.push_back(c);
525 // reset state:
526 hasNibble= false;
6a93d84a 527 }
6ab3bc95
RP
528 lastWasWS= true;
529 continue;
530 }
531 }
532 if (p == std::string::npos )
533 {
534 throw runtime_error("illegal character in hex digit string: " + str);
535 }
536 lastWasWS= false;
537 if (hasNibble)
538 {
539 c<<=4;
540 }
541 else
542 {
543 c=0;
544 }
545 c+= (p & 0x0f);
546 if (hasNibble)
547 {
548 //we already had a nibble, so a char is complete now:
549 result.push_back( c );
550 hasNibble=false;
551 }
552 else
553 {
554 // this is the first nibble of a new char:
555 hasNibble=true;
556 }
557 }
558 if (hasNibble)
559 {
560 //well, there is one nibble left
561 // let's do some heuristics:
562 if (lastWasWS)
563 {
564 // if the preceeding character was a white space (or a colon)
565 // we treat the nibble as lower part:
566 //( this is consistent with shortened hex notations where leading zeros are not noted)
567 result.push_back( c );
568 }
569 else
570 {
571 // if it was part of a hex digit chain, we treat it as UPPER part (!!)
572 result.push_back( c << 4 );
573 }
574 }
575 return result;
576} // eo convert_hex_to_binary(const std::string&)
577
578
579} // eo namespace I2n
580
581
582
6a93d84a 583
e93545dd
GE
/**
 * @brief converts a string from ISO-8859-1 to UTF-8 using iconv.
 * @param isostring the string in ISO-8859-1 encoding.
 * @return the string in UTF-8 encoding. The result of iconv() is not checked: if the
 *         conversion stops early, the part converted so far is returned. The result is
 *         built from a C string and therefore ends at the first NUL byte of the output.
 * @throw std::runtime_error if iconv does not support the conversion or if the
 *        conversion buffer cannot be allocated.
 */
std::string iso_to_utf8(const std::string& isostring)
{
    iconv_t i2utf8 = iconv_open("UTF-8", "ISO-8859-1");

    // check the conversion descriptor (not the address of this function)
    if (i2utf8 == (iconv_t)-1)
        throw std::runtime_error("iconv can't convert from ISO-8859-1 to UTF-8");

    size_t in_size = isostring.size();
    const size_t buf_size = in_size*4;
    size_t out_size = buf_size;

    char *buf = (char *)malloc(buf_size+1);
    if (buf == NULL)
    {
        // do not leak the conversion descriptor
        iconv_close(i2utf8);
        throw std::runtime_error("out of memory for iconv buffer");
    }

    char *in = (char *)isostring.c_str();
    char *out = buf;
    iconv(i2utf8, &in, &in_size, &out, &out_size);

    // out_size now holds the unused space; terminate directly behind the converted data
    buf[buf_size-out_size] = 0;

    iconv_close(i2utf8);

    std::string result;
    try
    {
        result = buf;
    }
    catch (...)
    {
        free(buf);
        throw;
    }
    free(buf);

    return result;
}
613
/**
 * @brief converts a string from UTF-8 to ISO-8859-1 using iconv.
 * @param utf8string the string in UTF-8 encoding.
 * @return the string in ISO-8859-1 encoding. The result of iconv() is not checked: if the
 *         conversion stops early, the part converted so far is returned. The result is
 *         built from a C string and therefore ends at the first NUL byte of the output.
 * @throw std::runtime_error if iconv does not support the conversion or if the
 *        conversion buffer cannot be allocated.
 */
std::string utf8_to_iso(const std::string& utf8string)
{
    iconv_t utf82iso = iconv_open("ISO-8859-1","UTF-8");

    if (utf82iso == (iconv_t)-1)
        throw std::runtime_error("iconv can't convert from UTF-8 to ISO-8859-1");

    size_t in_size = utf8string.size();
    const size_t buf_size = in_size;
    size_t out_size = buf_size;

    char *buf = (char *)malloc(buf_size+1);
    if (buf == NULL)
    {
        // do not leak the conversion descriptor
        iconv_close(utf82iso);
        throw std::runtime_error("out of memory for iconv buffer");
    }

    char *in = (char *)utf8string.c_str();
    char *out = buf;
    iconv(utf82iso, &in, &in_size, &out, &out_size);

    // out_size now holds the unused space; terminate directly behind the converted data
    buf[buf_size-out_size] = 0;

    iconv_close(utf82iso);

    std::string result;
    try
    {
        result = buf;
    }
    catch (...)
    {
        free(buf);
        throw;
    }
    free(buf);

    return result;
}
643
a5f3af6e
GE
/**
 * @brief converts a UTF-8 string into a zero terminated wchar_t buffer (UCS-4LE) using iconv.
 * @param utf8string the string in UTF-8 encoding.
 * @return a buffer allocated with malloc(); the caller has to release it with free().
 * @throw std::runtime_error if iconv does not support the conversion, if the buffer
 *        cannot be allocated or if the conversion fails.
 *
 * NOTE(review): the output encoding is fixed to "UCS-4LE" while the buffer is sized and
 * indexed in units of wchar_t; this presumably relies on a 4 byte little endian wchar_t.
 */
wchar_t* utf8_to_wbuf(const std::string& utf8string)
{
    iconv_t utf82wstr = iconv_open("UCS-4LE","UTF-8");

    if (utf82wstr == (iconv_t)-1)
        throw std::runtime_error("iconv can't convert from UTF-8 to UCS-4");

    size_t in_size = utf8string.size();
    // one output character per input byte is the worst case; +1 for the terminator
    const size_t buf_size = (in_size+1)*sizeof(wchar_t);
    size_t out_size = buf_size;

    wchar_t *buf = (wchar_t *)malloc(buf_size);
    if (buf == NULL)
    {
        // do not leak the conversion descriptor
        iconv_close(utf82wstr);
        throw std::runtime_error("out of memory for iconv buffer");
    }

    char *in = (char *)utf8string.c_str();
    char *out = (char*) buf;
    const size_t rc = iconv(utf82wstr, &in, &in_size, &out, &out_size);

    // the descriptor is not needed any more, regardless of the outcome
    iconv_close(utf82wstr);

    if (rc == (size_t)-1)
    {
        // do not leak the buffer when bailing out
        free(buf);
        throw std::runtime_error("error converting char encodings");
    }

    // out_size now holds the unused space (in bytes); terminate behind the converted data
    buf[ (buf_size-out_size) / sizeof(wchar_t) ] = 0;

    return buf;
}
669
/**
 * @brief converts a string from UTF-7-IMAP to UTF-8 using iconv.
 * @param utf7imapstring the string in UTF-7-IMAP encoding.
 * @return the string in UTF-8 encoding. The result of iconv() is not checked: if the
 *         conversion stops early, the part converted so far is returned. The result is
 *         built from a C string and therefore ends at the first NUL byte of the output.
 * @throw std::runtime_error if iconv does not support the conversion or if the
 *        conversion buffer cannot be allocated.
 */
std::string utf7imap_to_utf8(const std::string& utf7imapstring)
{
    iconv_t utf7imap2utf8 = iconv_open("UTF-8","UTF-7-IMAP");

    if (utf7imap2utf8 == (iconv_t)-1)
        throw std::runtime_error("iconv can't convert from UTF-7-IMAP to UTF-8");

    size_t in_size = utf7imapstring.size();
    const size_t buf_size = in_size*4;
    size_t out_size = buf_size;

    char *buf = (char *)malloc(buf_size+1);
    if (buf == NULL)
    {
        // do not leak the conversion descriptor
        iconv_close(utf7imap2utf8);
        throw std::runtime_error("out of memory for iconv buffer");
    }

    char *in = (char *)utf7imapstring.c_str();
    char *out = buf;
    iconv(utf7imap2utf8, &in, &in_size, &out, &out_size);

    // out_size now holds the unused space; terminate directly behind the converted data
    buf[buf_size-out_size] = 0;

    iconv_close(utf7imap2utf8);

    std::string result;
    try
    {
        result = buf;
    }
    catch (...)
    {
        free(buf);
        throw;
    }
    free(buf);

    return result;
}
699
6a2b6dd1
TJ
/**
 * @brief converts a string from UTF-8 to UTF-7-IMAP using iconv.
 * @param utf8string the string in UTF-8 encoding.
 * @return the string in UTF-7-IMAP encoding. The result of iconv() is not checked: if the
 *         conversion stops early (e.g. because the output buffer is too small), the part
 *         converted so far is returned.
 * @throw std::runtime_error if iconv does not support the conversion or if the
 *        conversion buffer cannot be allocated.
 */
std::string utf8_to_utf7imap(const std::string& utf8string)
{
    iconv_t utf82utf7imap = iconv_open("UTF-7-IMAP", "UTF-8");

    if (utf82utf7imap == (iconv_t)-1)
        throw std::runtime_error("iconv can't convert from UTF-8 to UTF-7-IMAP");

    // UTF-7 is base64 encoded, a buffer 10x as large
    // as the utf-8 buffer should be enough. If not the string will be truncated.
    size_t in_size = utf8string.size();
    const size_t buf_size = in_size*10;
    size_t out_size = buf_size;

    char *buf = (char *)malloc(buf_size+1);
    if (buf == NULL)
    {
        // do not leak the conversion descriptor
        iconv_close(utf82utf7imap);
        throw std::runtime_error("out of memory for iconv buffer");
    }

    char *in = (char *)utf8string.c_str();
    char *out = buf;
    iconv(utf82utf7imap, &in, &in_size, &out, &out_size);

    // out_size now holds the unused space; terminate directly behind the converted data
    buf[buf_size-out_size] = 0;

    iconv_close(utf82utf7imap);

    std::string result;
    try
    {
        result = buf;
    }
    catch (...)
    {
        free(buf);
        throw;
    }
    free(buf);

    return result;
}
731
118e216e
TJ
/**
 * @brief splits a string into (html) tag tokens and text tokens.
 * @param[out] tokenized the tokens are appended here; first: the token text,
 *             second: @a true if the token is a tag (starts with '<' and ends with '>').
 * @param input the string to tokenize.
 *
 * A '<' always starts a new token. A '>' closes a tag only if a '<' was seen before;
 * otherwise it is ordinary text. An unterminated tag at the end of the input (or in
 * front of another '<') is stored as a non-tag token.
 */
void tokenize_by_tag(std::vector<std::pair<std::string,bool> > &tokenized, const std::string &input)
{
    std::string pending;
    bool in_tag = false;

    for (std::string::const_iterator ch = input.begin(); ch != input.end(); ++ch)
    {
        if (*ch == '<')
        {
            // whatever has been collected so far ends here and is not a (complete) tag
            if (!pending.empty() )
                tokenized.push_back( std::make_pair(pending, false) );

            pending.assign(1, *ch);
            in_tag = true;
        }
        else if (*ch == '>' && in_tag)
        {
            // the tag is complete
            pending += *ch;
            tokenized.push_back( std::make_pair(pending, true) );
            pending.clear();
            in_tag = false;
        }
        else
        {
            pending += *ch;
        }
    }

    // anything left over is stored as non-tag
    if (!pending.empty() )
        tokenized.push_back( std::make_pair(pending, false) );
} // eo tokenize_by_tag
118e216e 771
118e216e
TJ
772
773std::string strip_html_tags(const std::string &input)
774{
6ab3bc95
RP
775 // Pair first: string, second: isTag
776 vector<pair<string,bool> > tokenized;
777 tokenize_by_tag (tokenized, input);
118e216e 778
6ab3bc95
RP
779 string output;
780 vector<pair<string,bool> >::const_iterator token, tokens_end = tokenized.end();
83d700e9 781 for (token = tokenized.begin(); token != tokens_end; ++token)
6ab3bc95
RP
782 if (!token->second)
783 output += token->first;
784
785 return output;
786} // eo strip_html_tags
118e216e 787
118e216e
TJ
788
789// Smart-encode HTML en
790string smart_html_entities(const std::string &input)
791{
6ab3bc95
RP
792 // Pair first: string, second: isTag
793 vector<pair<string,bool> > tokenized;
794 tokenize_by_tag (tokenized, input);
795
796 string output;
797 vector<pair<string,bool> >::const_iterator token, tokens_end = tokenized.end();
83d700e9 798 for (token = tokenized.begin(); token != tokens_end; ++token)
6ab3bc95
RP
799 {
800 // keep HTML tags as they are
801 if (token->second)
802 output += token->first;
803 else
804 output += html_entities(token->first);
805 }
806
807 return output;
118e216e
TJ
808}
809
6ab3bc95 810
a5f3af6e
GE
/**
 * @brief finds the first character of a string which does not fit into 7 bits.
 * @param str the string to search.
 * @return the index of the first character with a value above 127, or
 *         std::string::npos if there is none.
 */
std::string::size_type find_8bit(const std::string &str)
{
    std::string::size_type idx = 0;
    for (std::string::const_iterator ch = str.begin(); ch != str.end(); ++ch, ++idx)
    {
        // bit 7 set <=> value above 127
        if ( (static_cast<unsigned char>(*ch) & 0x80) != 0)
            return idx;
    }

    return std::string::npos;
}
820
118e216e
TJ
821// encoded UTF-8 chars into HTML entities
822string html_entities(std::string str)
823{
6ab3bc95
RP
824 // Normal chars
825 replace_all (str, "&", "&amp;");
6ab3bc95
RP
826 replace_all (str, "<", "&lt;");
827 replace_all (str, ">", "&gt;");
980577e1
TJ
828 replace_all (str, "\"", "&quot;");
829 replace_all (str, "'", "&#x27;");
830 replace_all (str, "/", "&#x2F;");
6ab3bc95
RP
831
832 // Umlauts
833 replace_all (str, "\xC3\xA4", "&auml;");
834 replace_all (str, "\xC3\xB6", "&ouml;");
835 replace_all (str, "\xC3\xBC", "&uuml;");
836 replace_all (str, "\xC3\x84", "&Auml;");
837 replace_all (str, "\xC3\x96", "&Ouml;");
838 replace_all (str, "\xC3\x9C", "&Uuml;");
839
840 // Misc
841 replace_all (str, "\xC3\x9F", "&szlig;");
842
843 // conversion of remaining non-ASCII chars needed?
844 // just do if needed because of performance
845 if (find_8bit(str) != string::npos)
846 {
847 // convert to fixed-size encoding UTF-32
848 wchar_t* wbuf=utf8_to_wbuf(str);
849 ostringstream target;
850
851 // replace all non-ASCII chars with HTML representation
852 for (int p=0; wbuf[p] != 0; p++)
853 {
854 unsigned int c=wbuf[p];
855
856 if (c <= 127)
857 target << static_cast<unsigned char>(c);
858 else
859 target << "&#" << c << ';';
860 }
861
862 free(wbuf);
863
864 str=target.str();
865 }
866
867 return str;
868} // eo html_entities(std::string)
869
554f813d
GE
870// convert HTML entities to something that can be viewed on a basic text console (restricted to ASCII-7)
871string html_entities_to_console(std::string str)
872{
873 // Normal chars
874 replace_all (str, "&amp;", "&");
875 replace_all (str, "&lt;", "<");
876 replace_all (str, "&gt;", ">");
877 replace_all (str, "&quot;", "\"");
878 replace_all (str, "&#x27;", "'");
879 replace_all (str, "&#x2F;", "/");
880
881 // Umlauts
882 replace_all (str, "&auml;", "ae");
883 replace_all (str, "&ouml;", "oe");
884 replace_all (str, "&uuml;", "ue");
885 replace_all (str, "&Auml;", "Ae");
886 replace_all (str, "&Ouml;", "Oe");
887 replace_all (str, "&Uuml;", "Ue");
888
889 // Misc
890 replace_all (str, "&szlig;", "ss");
891
892 return str;
893}
118e216e 894
3f5c5ccd
CH
895// find_html_comments + remove_html_comments(str, comments)
896void remove_html_comments(string &str)
897{
898 vector<CommentZone> comments;
899 find_html_comments(str, comments);
900 remove_html_comments(str, comments);
901}
902
903// find all html comments, behaving correctly if they are nested; ignores comment tags ("<!--FOO .... BAR-->")
904// If there are invalid comments ("-->" before "<!--" or different number of closing and opening tags),
905// then the unknown index of corresponding start/end tag will be represented by a string::npos
906// Indices are from start of start tag until first index after closing tag
907void find_html_comments(const std::string &str, vector<CommentZone> &comments)
908{
909 static const string START = "<!--";
910 static const string CLOSE = "-->";
911 static const string::size_type START_LEN = START.length();
912 static const string::size_type CLOSE_LEN = CLOSE.length();
913
914 // in order to find nested comments, need either recursion or a stack
915 vector<string::size_type> starts; // stack of start tags
916
917 string::size_type pos = 0;
918 string::size_type len = str.length();
919 string::size_type next_start, next_close;
920
921 while (pos < len) // not really needed but just in case
922 {
923 next_start = str.find(START, pos);
924 next_close = str.find(CLOSE, pos);
925
926 if ( (next_start == string::npos) && (next_close == string::npos) )
927 break; // we are done
928
929 else if ( (next_start == string::npos) || (next_close < next_start) ) // close one comment (pop)
930 {
931 if (starts.empty()) // closing tag without a start
932 comments.push_back(CommentZone(string::npos, next_close+CLOSE_LEN));
933 else
934 {
935 comments.push_back(CommentZone(starts.back(), next_close+CLOSE_LEN));
936 starts.pop_back();
937 }
938 pos = next_close + CLOSE_LEN;
939 }
940
941 else if ( (next_close == string::npos) || (next_start < next_close) ) // start a new comment (push)
942 {
943 starts.push_back(next_start);
944 pos = next_start + START_LEN;
945 }
946 }
947
948 // add comments that have no closing tag from back to front (important for remove_html_comments!)
949 while (!starts.empty())
950 {
951 comments.push_back(CommentZone(starts.back(), string::npos));
952 starts.pop_back();
953 }
954}
955
956// remove all html comments foundby find_html_comments
957void remove_html_comments(std::string &str, const vector<CommentZone> &comments)
958{
959 // remember position where last removal started
960 string::size_type last_removal_start = str.length();
961
962 // Go from back to front to not mess up indices.
963 // This requires that bigger comments, that contain smaller comments, come AFTER
964 // the small contained comments in the comments vector (i.e. comments are ordered by
965 // their closing tag, not their opening tag). This is true for results from find_html_comments
966 BOOST_REVERSE_FOREACH(const CommentZone &comment, comments)
967 {
968 if (comment.first == string::npos)
969 {
970 str = str.replace(0, comment.second, ""); // comment starts "before" str --> delete from start
971 break; // there can be no more
972 }
973 else if (comment.first >= last_removal_start)
974 {
975 continue; // this comment is inside another comment that we have removed already
976 }
977 else if (comment.second == string::npos) // comment ends "after" str --> delete until end
978 {
979 str = str.replace(comment.first, string::npos, "");
980 last_removal_start = comment.first;
981 }
982 else
983 {
984 str = str.replace(comment.first, comment.second-comment.first, "");
985 last_removal_start = comment.first;
986 }
987 }
988}
989
e93545dd
GE
/**
 * @brief replaces every occurrence of a search string within a string (in place).
 * @param[in,out] base the string which is modified.
 * @param ist the string to search for; must not be empty.
 * @param soll the replacement.
 * @return @a true if at least one occurrence was replaced.
 * @throw std::runtime_error if @a ist is empty.
 *
 * The search continues behind each inserted replacement, i.e. replaced text is never
 * searched again.
 */
bool replace_all(std::string &base, const std::string &ist, const std::string &soll)
{
    if (ist.empty() )
        throw std::runtime_error ("replace_all called with empty search string");

    bool replaced = false;
    for (std::string::size_type hit = base.find(ist, 0);
         hit != std::string::npos;
         hit = base.find(ist, hit + soll.size() ) )
    {
        base.replace(hit, ist.size(), soll);
        replaced = true;
    }

    return replaced;
}

/// Convenience overload of replace_all() taking both strings as pointers to std::string.
bool replace_all(std::string &base, const std::string *ist, const std::string *soll)
{
    const std::string &search = *ist;
    const std::string &replacement = *soll;
    return replace_all(base, search, replacement);
}

/// Convenience overload of replace_all() taking both strings as C strings.
bool replace_all(std::string &base, const char *ist, const char *soll)
{
    const std::string search(ist);
    const std::string replacement(soll);
    return replace_all(base, search, replacement);
}

/// Convenience overload of replace_all() taking the replacement as C string.
bool replace_all(std::string &base, const std::string &ist, const char *soll)
{
    const std::string replacement(soll);
    return replace_all(base, ist, replacement);
}

/// Convenience overload of replace_all() taking a C string and a pointer to std::string.
bool replace_all(std::string &base, const char *ist, const std::string *soll)
{
    const std::string search(ist);
    return replace_all(base, search, *soll);
}
1031
b953bf36
GE
/**
 * @brief replaces all characters that could be problematic or impose a security risk when being logged
 * @param str the original string
 * @param replace_with the character which takes the place of every unsafe character
 * @return a string that is safe to send to syslog or other logfiles
 *
 * Only the characters from 0x20 (space) up to and including 0x7E (~) are considered safe
 * for logging; everything else is replaced.
 * See e.g. RFC 5424, section 8.2 or the posix character class "printable".
 * This eliminates all possible problems with NUL, control characters, 8 bit chars, UTF8.
 */
std::string sanitize_for_logging(const std::string &str, const char replace_with)
{
    std::string output;
    output.reserve(str.size());

    for (std::string::const_iterator ch = str.begin(); ch != str.end(); ++ch)
    {
        const bool unsafe = (*ch < 0x20) || (*ch > 0x7E);
        output += unsafe ? replace_with : *ch;
    }

    return output;
}
1054
e5b21dbb 1055#if 0
// Everything up to the matching #endif is excluded from compilation.
// It holds byte-wise to_lower()/to_upper() implementations that copy the
// input and convert each character with tolower()/toupper().
e93545dd
GE
1056string to_lower(const string &src)
1057{
6ab3bc95 1058 string dst = src;
e93545dd 1059
6ab3bc95
RP
1060 string::size_type pos, end = dst.size();
1061 for (pos = 0; pos < end; pos++)
1062 dst[pos] = tolower(dst[pos]);
e93545dd 1063
6ab3bc95 1064 return dst;
e93545dd
GE
1065}
1066
1067string to_upper(const string &src)
1068{
6ab3bc95 1069 string dst = src;
e93545dd 1070
6ab3bc95
RP
1071 string::size_type pos, end = dst.size();
1072 for (pos = 0; pos < end; pos++)
1073 dst[pos] = toupper(dst[pos]);
e93545dd 1074
6ab3bc95 1075 return dst;
e93545dd 1076}
e5b21dbb 1077#endif
e93545dd 1078
/// number of entries in each of the unit symbol tables
const int MAX_UNIT_FORMAT_SYMBOLS = 6;

/// short unit suffixes; nice_unit_format() indexes this table with the
/// number of times the value was divided by the unit base
const string shortUnitFormatSymbols[MAX_UNIT_FORMAT_SYMBOLS] =
{
    " B", " KB", " MB", " GB", " TB", " PB"
};
1089
2cb9a9c5 1090const string longUnitFormatSymbols[MAX_UNIT_FORMAT_SYMBOLS] = {
5cb766b9
GMF
1091 i18n_noop(" Bytes"),
1092 i18n_noop(" KBytes"),
1093 i18n_noop(" MBytes"),
1094 i18n_noop(" GBytes"),
1095 i18n_noop(" TBytes"),
83809f5e 1096 i18n_noop(" PBytes")
d1ea9075
GMF
1097};
1098
72a94426
GMF
1099
/**
 * @brief rounds a number "half up" to a fixed decimal precision
 * @param number the value to round
 * @param rounding_multiplier selects the precision: the result is a multiple
 *        of 1/rounding_multiplier (10 keeps one decimal place); must not be 0
 * @return @a number rounded to the nearest multiple of 1/rounding_multiplier,
 *         ties are rounded towards positive infinity
 *
 * The rounding uses floor() instead of a cast to an integer type: a cast
 * truncates towards zero, which rounds negative values the wrong way
 * (e.g. -1500 became -1499.9), and is undefined for values that do not fit
 * into the integer type.
 */
long double rounding_upwards(
    const long double number,
    const int rounding_multiplier
)
{
    const long double multiplier = rounding_multiplier;

    // shift the wanted decimal place in front of the point, round, shift back
    return std::floor(number * multiplier + 0.5L) / multiplier;
}
1113
1114
81267544
GMF
1115string nice_unit_format(
1116 const int64_t input,
70fc0674
GMF
1117 const UnitFormat format,
1118 const UnitBase base
81267544 1119)
6ab3bc95 1120{
d1ea9075 1121 // select the system of units (decimal or binary)
81267544 1122 int multiple = 0;
a398513a 1123 if (base == UnitBase1000)
81267544
GMF
1124 {
1125 multiple = 1000;
1126 }
1127 else
1128 {
1129 multiple = 1024;
1130 }
1131
1132 long double size = input;
6ab3bc95 1133
d1ea9075
GMF
1134 // check the size of the input number to fit in the appropriate symbol
1135 int sizecount = 0;
81267544 1136 while (size > multiple)
6ab3bc95 1137 {
81267544
GMF
1138 size = size / multiple;
1139 sizecount++;
83809f5e
GMF
1140
1141 // rollback to the previous values and stop the loop when cannot
1142 // represent the number length.
1143 if (sizecount >= MAX_UNIT_FORMAT_SYMBOLS)
1144 {
1145 size = size * multiple;
1146 sizecount--;
1147 break;
1148 }
6ab3bc95
RP
1149 }
1150
a398513a
GMF
1151 // round the input number "half up" to multiples of 10
1152 const int rounding_multiplier = 10;
72a94426 1153 size = rounding_upwards(size, rounding_multiplier);
6ab3bc95 1154
d1ea9075 1155 // format the input number, placing the appropriate symbol
6ab3bc95 1156 ostringstream out;
6ab3bc95 1157 out.setf (ios::fixed);
a398513a 1158 if (format == ShortUnitFormat)
d1ea9075
GMF
1159 {
1160 out.precision(1);
68d37a5c 1161 out << size << i18n( shortUnitFormatSymbols[sizecount].c_str() );
d1ea9075
GMF
1162 }
1163 else
6ab3bc95 1164 {
d1ea9075 1165 out.precision (2);
68d37a5c 1166 out << size << i18n( longUnitFormatSymbols[sizecount].c_str() );
6ab3bc95
RP
1167 }
1168
1169 return out.str();
1170} // eo nice_unit_format(int input)
1171
e93545dd 1172
5cd64148
CH
1173string nice_unit_format(
1174 const double input,
1175 const UnitFormat format,
1176 const UnitBase base
1177)
1178{
1179 // round as double and cast to int64_t
1180 // cast raised overflow error near max val of int64_t (~9.2e18, see unittest)
1181 int64_t input_casted_and_rounded =
1182 boost::numeric_cast<int64_t>( round(input) );
1183
1184 // now call other
1185 return nice_unit_format( input_casted_and_rounded, format, base );
1186} // eo nice_unit_format(double input)
1187
1188
47c07fba
GE
/**
 * @brief converts a string into its quoted, escaped representation
 * @param s the string to escape
 * @return @a s enclosed in double quotes, with a backslash in front of each
 *         double quote and backslash, and carriage return / line feed
 *         written as the two character sequences \\r and \\n
 */
string escape(const string &s)
{
    string out;
    out.reserve(s.size() + 2);

    out += '"';
    for (string::size_type i = 0; i < s.size(); ++i)
    {
        const char c = s[i];
        switch (c)
        {
            case '"':
            case '\\':
                out += '\\';
                out += c;
                break;
            case '\r':
                out += "\\r";
                break;
            case '\n':
                out += "\\n";
                break;
            default:
                out += c;
        }
    }
    out += '"';

    return out;
} // eo escape(const std::string&)
47c07fba 1219
47c07fba 1220
6ab3bc95
RP
/**
 * @brief extracts a quoted, escaped string and removes the escaping
 * @param s the text containing the escaped string
 * @param startpos position of the opening double quote within @a s
 * @param[out] endpos is set to startpos + 1 + offset of the closing quote
 *        behind the opening one, i.e. the position of the closing double
 *        quote within @a s; if no closing quote exists the value is computed
 *        from string::npos and carries no useful position
 * @return the text between the quotes with \\r and \\n turned into carriage
 *         return and line feed, and the backslash removed from every other
 *         backslash sequence
 * @throw out_of_range if @a s has no double quote at @a startpos (or
 *        @a startpos is outside of @a s), or if a backslash is the last
 *        character of the extracted text
 */
string descape(const string &s, int startpos, int &endpos)
{
    if (s.at(startpos) != '"')
        throw out_of_range("value not type escaped string");

    string out = s.substr(startpos + 1);

    // search for the closing quote: the first one not escaped by a backslash
    string::size_type p = 0;
    while ((p = out.find('"', p)) != string::npos)
    {
        // an odd number of backslashes directly in front escapes the quote
        string::size_type backslashes = 0;
        while (backslashes < p && out[p - 1 - backslashes] == '\\')
            ++backslashes;

        if (backslashes % 2 == 0)
            break;

        ++p;
    }

    // cut off the closing quote and everything behind it
    out = out.substr(0, p);

    // tell the caller where the string ended
    endpos = startpos + p + 1;

    // resolve the backslash sequences inside the string
    for (string::size_type pos = out.find('\\');
         pos != string::npos;
         pos = out.find('\\', pos + 1))
    {
        const char code = out.at(pos + 1);

        if (code == 'r')
            out.replace(pos, 2, "\r");
        else if (code == 'n')
            out.replace(pos, 2, "\n");
        else
            out.erase(pos, 1);   // keep the escaped character itself
    }

    return out;
} // eo descape(const std::string&,int,int&)
47c07fba 1280
e93545dd 1281
47c07fba
GE
/**
 * @brief encloses a string in single quotes for use as shell argument
 * @param input the string to quote
 * @return @a input enclosed in single quotes; every single quote inside is
 *         written as '\\'' (close the quoting, escaped quote, reopen it)
 */
string escape_shellarg(const string &input)
{
    string output(1, '\'');

    for (string::size_type i = 0; i < input.size(); ++i)
    {
        const char c = input[i];

        // the quote character appended below completes the '\'' sequence
        if (c == '\'')
            output.append("'\\'");

        output.push_back(c);
    }

    output.push_back('\'');
    return output;
}