Create vector-result-version of split_string with unit test
[libi2ncommon] / src / stringfunc.cpp
CommitLineData
0e23f538
TJ
1/*
2The software in this package is distributed under the GNU General
3Public License version 2 (with a special exception described below).
4
5A copy of GNU General Public License (GPL) is included in this distribution,
6in the file COPYING.GPL.
7
8As a special exception, if other files instantiate templates or use macros
9or inline functions from this file, or you compile this file and link it
10with other works to produce a work based on this file, this file
11does not by itself cause the resulting work to be covered
12by the GNU General Public License.
13
14However the source code for this file must still be made available
15in accordance with section (3) of the GNU General Public License.
16
17This exception does not invalidate any other reasons why a work based
18on this file might be covered by the GNU General Public License.
19*/
6a93d84a
TJ
20/** @file
21 *
22 * (c) Copyright 2007-2008 by Intra2net AG
6a93d84a 23 */
e93545dd
GE
24
25#include <iostream>
26#include <string>
27#include <sstream>
28#include <stdexcept>
5efd35b1 29#include <algorithm>
5cd64148 30#include <cmath> // for round()
e93545dd 31
a5f3af6e 32#include <wchar.h>
e93545dd
GE
33#include <stdlib.h>
34#include <iconv.h>
35#include <i18n.h>
36
5cd64148
CH
37#include <boost/numeric/conversion/cast.hpp>
38
e93545dd
GE
39#include <stringfunc.hxx>
40
41using namespace std;
42
6ab3bc95
RP
43namespace I2n
44{
6a93d84a
TJ
45
46
6ab3bc95
RP
namespace
{

/// Lookup tables: index is the nibble value (0..15), element is its hex digit.
const std::string hexDigitsLower("0123456789abcdef");
const std::string hexDigitsUpper("0123456789ABCDEF");


/**
 * @brief functor which converts a single character to upper case.
 *
 * Intended for use with std::transform on a std::string.
 */
struct UpperFunc
{
    char operator() (char c) const
    {
        // std::toupper() is only defined for values representable as unsigned char
        // (or EOF); a negative plain char must be converted first.
        return static_cast<char>( std::toupper( static_cast<unsigned char>(c) ) );
    }
}; // eo struct UpperFunc


/**
 * @brief functor which converts a single character to lower case.
 *
 * Intended for use with std::transform on a std::string.
 */
struct LowerFunc
{
    char operator() (char c) const
    {
        // see UpperFunc: avoid passing a negative value to std::tolower().
        return static_cast<char>( std::tolower( static_cast<unsigned char>(c) ) );
    }
}; // eo struct LowerFunc


} // eo namespace <anonymous>
73
74
75
/**
 * default list of whitespace characters: space, tab, carriage return and line feed (" \t\r\n").
 */
const std::string Whitespaces = " \t\r\n";

/**
 * default list of line ending characters: carriage return and line feed ("\r\n").
 */
const std::string LineEndings= "\r\n";
6a93d84a
TJ
85
86
87
/**
 * @brief checks if a string begins with a given prefix.
 * @param str the string which is tested (not modified).
 * @param prefix the prefix which should be tested for.
 * @return @a true iff the prefix is not empty and the string begins with that prefix.
 */
bool has_prefix(const std::string& str, const std::string& prefix)
{
    // an empty prefix never matches by contract; a prefix longer than the string cannot
    // match (this also covers the empty string, since the prefix is non-empty here).
    if (prefix.empty() || prefix.size() > str.size() )
    {
        return false;
    }
    return str.compare(0, prefix.size(), prefix) == 0;
} // eo has_prefix(const std::string&,const std::string&)
6a93d84a
TJ
102
103
/**
 * @brief checks if a string ends with a given suffix.
 * @param str the string which is tested (not modified).
 * @param suffix the suffix which should be tested for.
 * @return @a true iff the suffix is not empty and the string ends with that suffix.
 */
bool has_suffix(const std::string& str, const std::string& suffix)
{
    // an empty suffix never matches by contract; a suffix longer than the string cannot
    // match (this also covers the empty string, since the suffix is non-empty here).
    if (suffix.empty() || suffix.size() > str.size() )
    {
        return false;
    }
    return str.compare(str.size() - suffix.size(), suffix.size(), suffix) == 0;
} // eo has_suffix(const std::string&,const std::string&)
6a93d84a
TJ
118
119
/**
 * cut off characters from a given list from front and end of a string (in place).
 * @param[in,out] str the string which should be trimmed.
 * @param charlist the list of characters to remove from beginning and end of string
 * @return a copy of the trimmed string.
 */
std::string trim_mod(std::string& str, const std::string& charlist)
{
    const std::string::size_type first= str.find_first_not_of(charlist);
    if (first == std::string::npos)
    {
        // whole string consists of chars from charlist (or is already empty)
        str.clear();
        return str;
    }
    // there is at least one char to keep, so find_last_not_of cannot return npos here.
    const std::string::size_type last= str.find_last_not_of(charlist);
    // cut the tail first so that the index of the head is still valid afterwards:
    str.erase(last + 1);
    str.erase(0, first);
    return str;
} // eo trim_mod(std::string&,const std::string&)
6a93d84a
TJ
149
150
151
/**
 * removes last character from a string when it is in a list of chars to be removed.
 *
 * At most one character is removed, even if the string ends with several
 * characters from the list.
 *
 * @param[in,out] str the string.
 * @param what the list of chars which will be tested for.
 * @return a copy of the resulting string with last char removed (if applicable)
 */
std::string chomp_mod(std::string& str, const std::string& what)
{
    if (!str.empty() && !what.empty() )
    {
        const std::string::size_type last= str.size() - 1;
        if (what.find( str[last] ) != std::string::npos)
        {
            str.erase(last);
        }
    }
    return str;
} // eo chomp_mod(std::string&,const std::string&)
6a93d84a
TJ
170
171
172/**
173 * @brief converts a string to lower case.
174 * @param[in,out] str the string to modify.
175 * @return the string
176 */
6ab3bc95 177std::string to_lower_mod(std::string& str)
6a93d84a 178{
6ab3bc95
RP
179 std::transform(str.begin(), str.end(), str.begin(), LowerFunc() );
180 return str;
181} // eo to_lower_mod(std::string&)
6a93d84a
TJ
182
183
184/**
185 * @brief converts a string to upper case.
186 * @param[in,out] str the string to modify.
187 * @return the string
188 */
6ab3bc95 189std::string to_upper_mod(std::string& str)
6a93d84a 190{
6ab3bc95
RP
191 std::transform( str.begin(), str.end(), str.begin(), UpperFunc() );
192 return str;
193} // eo to_upper_mod(std::string&)
6a93d84a
TJ
194
195
196
/**
 * cut off characters from a given list from front and end of a string.
 * @param str the string which should be trimmed (not modified).
 * @param charlist the list of characters to remove from beginning and end of string
 * @return the trimmed string.
 */
std::string trim (const std::string& str, const std::string& charlist)
{
    const std::string::size_type first= str.find_first_not_of(charlist);
    if (first == std::string::npos)
    {
        // whole string consists of chars from charlist (or is already empty)
        return std::string();
    }
    // at least one char survives, so the search from the end always succeeds:
    const std::string::size_type last= str.find_last_not_of(charlist);
    return str.substr(first, last - first + 1);
} // eo trim(const std::string&,const std::string&)
216
217
/**
 * removes last character from a string when it is in a list of chars to be removed.
 *
 * At most one character is removed.
 *
 * @param str the string (not modified).
 * @param what the list of chars which will be tested for.
 * @return the resulting string with last char removed (if applicable)
 */
std::string chomp (const std::string& str, const std::string& what)
{
    if (!str.empty() && !what.empty() )
    {
        const std::string::size_type last= str.size() - 1;
        if (what.find( str[last] ) != std::string::npos)
        {
            return str.substr(0, last);
        }
    }
    return str;
} // eo chomp(const std::string&,const std::string&)
236
237
238/**
239 * @brief returns a lower case version of a given string.
240 * @param str the string
241 * @return the lower case version of the string
242 */
6ab3bc95 243std::string to_lower (const std::string& str)
6a93d84a 244{
6ab3bc95
RP
245 std::string result(str);
246 return to_lower_mod(result);
247} // eo to_lower(const std::string&)
6a93d84a
TJ
248
249
250/**
251 * @brief returns a upper case version of a given string.
252 * @param str the string
253 * @return the upper case version of the string
254 */
6ab3bc95 255std::string to_upper(const std::string& str)
6a93d84a 256{
6ab3bc95
RP
257 std::string result(str);
258 return to_upper_mod(result);
259} // eo to_upper(const std::string&)
6a93d84a
TJ
260
261
262
263/**
264 * @brief removes a given suffix from a string.
265 * @param str the string.
266 * @param suffix the suffix which should be removed if the string ends with it.
267 * @return the string without the suffix.
268 *
269 * If the string ends with the suffix, it is removed. If the the string doesn't end
270 * with the suffix the original string is returned.
271 */
6ab3bc95 272std::string remove_suffix(const std::string& str, const std::string& suffix)
6a93d84a 273{
6ab3bc95
RP
274 if (has_suffix(str,suffix) )
275 {
276 return str.substr(0, str.size()-suffix.size() );
277 }
278 return str;
279} // eo remove_suffix(const std::string&,const std::string&)
6a93d84a
TJ
280
281
282
283/**
284 * @brief removes a given prefix from a string.
285 * @param str the string.
286 * @param prefix the prefix which should be removed if the string begins with it.
287 * @return the string without the prefix.
288 *
289 * If the string begins with the prefix, it is removed. If the the string doesn't begin
290 * with the prefix the original string is returned.
291 */
6ab3bc95 292std::string remove_prefix(const std::string& str, const std::string& prefix)
6a93d84a 293{
6ab3bc95
RP
294 if (has_prefix(str,prefix) )
295 {
296 return str.substr( prefix.size() );
297 }
298 return str;
299} // eo remove_prefix(const std::string&,const std::string&)
6a93d84a
TJ
300
301
302/**
303 * split a string to key and value delimited by a given delimiter.
6ab3bc95 304 * The resulting key and value strings are trimmed (Whitespaces removed at beginning and end).
6a93d84a
TJ
305 * @param str the string which should be splitted.
306 * @param[out] key the resulting key
307 * @param[out] value the resulting value
308 * @param delimiter the delimiter between key and value; default is '='.
309 * @return @a true if the split was successful.
310 */
6ab3bc95
RP
311bool pair_split(
312 const std::string& str,
313 std::string& key,
314 std::string& value,
315 char delimiter)
316{
317 std::string::size_type pos = str.find (delimiter);
318 if (pos == std::string::npos) return false;
319 key= str.substr(0,pos);
320 value= str.substr(pos+1);
321 trim_mod(key);
322 trim_mod(value);
323 return true;
324} // eo pair_split(const std::string&,std::string&,std::string&,char)
6a93d84a
TJ
325
326
327/**
328 * splits a string by given delimiter
329 *
330 * @param[in] str the string which should be splitted.
331 * @param[out] result the list resulting from splitting @a str.
332 * @param[in] delimiter the delimiter (word/phrase) at which @a str should be splitted.
333 * @param[in] omit_empty should empty parts not be stored?
334 * @param[in] trim_list list of characters the parts should be trimmed by.
335 * (empty string results in no trim)
336 */
6ab3bc95
RP
337void split_string(
338 const std::string& str,
339 std::list<std::string>& result,
340 const std::string& delimiter,
341 bool omit_empty,
342 const std::string& trim_list
6a93d84a
TJ
343)
344{
6ab3bc95
RP
345 std::string::size_type pos, last_pos=0;
346 bool delimiter_found= false;
347 while ( last_pos < str.size() && last_pos != std::string::npos)
348 {
349 pos= str.find(delimiter, last_pos);
350 std::string part;
351 if (pos == std::string::npos)
352 {
353 part= str.substr(last_pos);
354 delimiter_found= false;
355 }
356 else
357 {
358 part= str.substr(last_pos, pos-last_pos);
359 delimiter_found=true;
360 }
361 if (pos != std::string::npos)
362 {
363 last_pos= pos+ delimiter.size();
364 }
365 else
366 {
367 last_pos= std::string::npos;
368 }
369 if (!trim_list.empty() ) trim_mod (part, trim_list);
370 if (omit_empty && part.empty() ) continue;
371 result.push_back( part );
372 }
373 // if the string ends with a delimiter we need to append an empty string if no omit_empty
374 // was given.
375 // (this way we keep the split result consistent to a join operation)
376 if (delimiter_found && !omit_empty)
377 {
378 result.push_back("");
379 }
380} // eo split_string(const std::string&,std::list< std::string >&,const std::string&,bool,const std::string&)
6a93d84a
TJ
381
382
338da253
CH
383/** call split_string with list<string>, converts result to vector; vector is clear()-ed first
384 *
385 * Note: Uses 3 O(n)-operations: list.size, vector.resize and std::swap_ranges;
386 * not sure whether there is a better way to do this
387 * */
388void split_string(
389 const std::string& str,
390 std::vector<std::string>& result,
391 const std::string& delimiter,
392 bool omit_empty,
393 const std::string& trim_list
394)
395{
396 std::list<std::string> tmp;
397 split_string(str, tmp, delimiter, omit_empty, trim_list);
398 std::size_t size = tmp.size(); // this is O(n)
399 result.clear();
400 result.resize(size); // also O(n)
401 std::swap_ranges(tmp.begin(), tmp.end(), result.begin()); // also O(n)
402}
403
6a93d84a
TJ
404/**
405 * splits a string by a given delimiter
406 * @param str the string which should be splitted.
407 * @param delimiter delimiter the delimiter (word/phrase) at which @a str should be splitted.
408 * @param[in] omit_empty should empty parts not be stored?
409 * @param[in] trim_list list of characters the parts should be trimmed by.
410 * (empty string results in no trim)
411 * @return the list resulting from splitting @a str.
412 */
6ab3bc95
RP
413std::list<std::string> split_string(
414 const std::string& str,
415 const std::string& delimiter,
416 bool omit_empty,
417 const std::string& trim_list
6a93d84a
TJ
418)
419{
6ab3bc95
RP
420 std::list<std::string> result;
421 split_string(str, result, delimiter, omit_empty, trim_list);
422 return result;
423} // eo split_string(const std::string&,const std::string&,bool,const std::string&)
6a93d84a
TJ
424
425
/**
 * @brief joins a list of strings into a single string.
 *
 * This function is (basically) the reverse operation of @a split_string.
 * An empty list yields an empty string.
 *
 * @param parts the list of strings.
 * @param delimiter the delimiter which is inserted between the strings.
 * @return the joined string.
 */
std::string join_string(
    const std::list< std::string >& parts,
    const std::string& delimiter
)
{
    std::string result;
    for (std::list< std::string >::const_iterator it= parts.begin();
         it != parts.end();
         ++it)
    {
        // the delimiter goes between parts only, not before the first one
        if (it != parts.begin() )
        {
            result+= delimiter;
        }
        result+= *it;
    }
    return result;
} // eo join_string(const std::list< std::string >&,const std::string&)
6a93d84a
TJ
453
454
376ec4fa
CH
/**
 * @brief joins a vector of strings into a single string.
 *
 * Same as join_string for a list, except that it takes a vector.
 * An empty vector yields an empty string.
 *
 * @param parts the vector of strings.
 * @param delimiter the delimiter which is inserted between the strings.
 * @return the joined string.
 */
std::string join_string(
    const std::vector< std::string >& parts,
    const std::string& delimiter
)
{
    std::string result;
    for (std::vector< std::string >::const_iterator it= parts.begin();
         it != parts.end();
         ++it)
    {
        // the delimiter goes between parts only, not before the first one
        if (it != parts.begin() )
        {
            result+= delimiter;
        }
        result+= *it;
    }
    return result;
} // eo join_string(const std::vector< std::string >&,const std::string&)
474
475
6a93d84a
TJ
476
477/*
478** conversions
479*/
480
481
482/**
483 * @brief returns a hex string from a binary string.
484 * @param str the (binary) string
485 * @param upper_case_digits determine whether to use upper case characters for digits A-F.
486 * @return the string in hex notation.
487 */
6ab3bc95
RP
488std::string convert_binary_to_hex(
489 const std::string& str,
490 bool upper_case_digits
6a93d84a
TJ
491)
492{
6ab3bc95
RP
493 std::string result;
494 std::string hexDigits(upper_case_digits ? hexDigitsUpper : hexDigitsLower);
495 for ( std::string::const_iterator it= str.begin();
496 it != str.end();
497 ++it)
498 {
499 result.push_back( hexDigits[ ( (*it) >> 4) & 0x0f ] );
500 result.push_back( hexDigits[ (*it) & 0x0f ] );
501 }
502 return result;
503} // eo convert_binary_to_hex(const std::string&,bool)
6a93d84a
TJ
504
505
506/**
507 * @brief converts a hex digit string to binary string.
508 * @param str hex digit string
509 * @return the binary string.
510 *
511 * The hex digit string may contains white spaces or colons which are treated
512 * as delimiters between hex digit groups.
513 *
514 * @todo rework the handling of half nibbles (consistency)!
515 */
6ab3bc95
RP
516std::string convert_hex_to_binary(
517 const std::string& str
6a93d84a 518)
6ab3bc95
RP
519throw (std::runtime_error)
520{
521 std::string result;
522 char c= 0;
523 bool hasNibble= false;
524 bool lastWasWS= true;
525 for ( std::string::const_iterator it= str.begin();
526 it != str.end();
527 ++it)
528 {
529 std::string::size_type p = hexDigitsLower.find( *it );
530 if (p== std::string::npos)
531 {
532 p= hexDigitsUpper.find( *it );
533 }
534 if (p == std::string::npos)
535 {
536 if ( ( Whitespaces.find( *it ) != std::string::npos) // is it a whitespace?
6a93d84a 537 or ( *it == ':') // or a colon?
6ab3bc95
RP
538 )
539 {
540 // we treat that as a valid delimiter:
541 if (hasNibble)
6a93d84a 542 {
6ab3bc95
RP
543 // 1 nibble before WS is treate as lower part:
544 result.push_back(c);
545 // reset state:
546 hasNibble= false;
6a93d84a 547 }
6ab3bc95
RP
548 lastWasWS= true;
549 continue;
550 }
551 }
552 if (p == std::string::npos )
553 {
554 throw runtime_error("illegal character in hex digit string: " + str);
555 }
556 lastWasWS= false;
557 if (hasNibble)
558 {
559 c<<=4;
560 }
561 else
562 {
563 c=0;
564 }
565 c+= (p & 0x0f);
566 if (hasNibble)
567 {
568 //we already had a nibble, so a char is complete now:
569 result.push_back( c );
570 hasNibble=false;
571 }
572 else
573 {
574 // this is the first nibble of a new char:
575 hasNibble=true;
576 }
577 }
578 if (hasNibble)
579 {
580 //well, there is one nibble left
581 // let's do some heuristics:
582 if (lastWasWS)
583 {
584 // if the preceeding character was a white space (or a colon)
585 // we treat the nibble as lower part:
586 //( this is consistent with shortened hex notations where leading zeros are not noted)
587 result.push_back( c );
588 }
589 else
590 {
591 // if it was part of a hex digit chain, we treat it as UPPER part (!!)
592 result.push_back( c << 4 );
593 }
594 }
595 return result;
596} // eo convert_hex_to_binary(const std::string&)
597
598
599} // eo namespace I2n
600
601
602
6a93d84a 603
e93545dd
GE
/**
 * @brief converts a string from ISO-8859-1 to UTF-8 using iconv.
 * @param isostring the ISO-8859-1 encoded string.
 * @return the UTF-8 encoded string.
 * @throw std::runtime_error if iconv does not provide the conversion or the
 *  conversion buffer can't be allocated.
 *
 * The result of the iconv() call itself is not checked: if the conversion stops
 * early, whatever was converted up to that point is returned.
 */
std::string iso_to_utf8(const std::string& isostring)
{
    iconv_t i2utf8 = iconv_open("UTF-8", "ISO-8859-1");

    // check the conversion descriptor returned by iconv_open
    if (i2utf8 == (iconv_t)-1)
        throw std::runtime_error("iconv can't convert from ISO-8859-1 to UTF-8");

    size_t in_size= isostring.size();
    // the output buffer is sized at 4 output bytes per input byte
    const size_t buf_size= in_size*4;
    size_t out_size= buf_size;

    char *buf = (char *)malloc(buf_size+1);
    if (buf == NULL)
    {
        // don't leak the conversion descriptor
        iconv_close(i2utf8);
        throw std::runtime_error("out of memory for iconv buffer");
    }

    // iconv takes a non-const input pointer
    char *in = const_cast<char *>( isostring.c_str() );
    char *out = buf;
    iconv(i2utf8, &in, &in_size, &out, &out_size);

    // out_size now holds the unused rest of the buffer; copy by length so that
    // NUL bytes within the data do not cut off the result.
    std::string result(buf, buf_size - out_size);

    free(buf);
    iconv_close(i2utf8);

    return result;
}
633
/**
 * @brief converts a string from UTF-8 to ISO-8859-1 using iconv.
 * @param utf8string the UTF-8 encoded string.
 * @return the ISO-8859-1 encoded string.
 * @throw std::runtime_error if iconv does not provide the conversion or the
 *  conversion buffer can't be allocated.
 *
 * The result of the iconv() call itself is not checked: if the conversion stops
 * early (e.g. at input iconv can't convert), whatever was converted up to that
 * point is returned.
 */
std::string utf8_to_iso(const std::string& utf8string)
{
    iconv_t utf82iso = iconv_open("ISO-8859-1","UTF-8");

    if (utf82iso == (iconv_t)-1)
        throw std::runtime_error("iconv can't convert from UTF-8 to ISO-8859-1");

    size_t in_size= utf8string.size();
    // the output buffer is sized at one output byte per input byte
    const size_t buf_size= in_size;
    size_t out_size= buf_size;

    char *buf = (char *)malloc(buf_size+1);
    if (buf == NULL)
    {
        // don't leak the conversion descriptor
        iconv_close(utf82iso);
        throw std::runtime_error("out of memory for iconv buffer");
    }

    // iconv takes a non-const input pointer
    char *in = const_cast<char *>( utf8string.c_str() );
    char *out = buf;
    iconv(utf82iso, &in, &in_size, &out, &out_size);

    // out_size now holds the unused rest of the buffer; copy by length so that
    // NUL bytes within the data do not cut off the result.
    std::string result(buf, buf_size - out_size);

    free(buf);
    iconv_close(utf82iso);

    return result;
}
663
a5f3af6e
GE
/**
 * @brief converts an UTF-8 string to a zero terminated wide character buffer using iconv.
 * @param utf8string the UTF-8 encoded string.
 * @return pointer to a malloc()-ed, zero terminated buffer; the caller has to free() it.
 * @throw std::runtime_error if iconv does not provide the conversion, the buffer
 *  can't be allocated or the conversion fails.
 *
 * NOTE(review): the target encoding is "UCS-4LE" while the buffer is accessed as wchar_t;
 * this presumably expects a 4 byte little endian wchar_t -- confirm for other platforms.
 */
wchar_t* utf8_to_wbuf(const std::string& utf8string)
{
    iconv_t utf82wstr = iconv_open("UCS-4LE","UTF-8");

    if (utf82wstr == (iconv_t)-1)
        throw std::runtime_error("iconv can't convert from UTF-8 to UCS-4");

    size_t in_size= utf8string.size();
    // one wide char per input byte plus one for the terminator
    const size_t buf_size= (in_size+1)*sizeof(wchar_t);
    size_t out_size= buf_size;

    wchar_t *buf = (wchar_t *)malloc(buf_size);
    if (buf == NULL)
    {
        // don't leak the conversion descriptor
        iconv_close(utf82wstr);
        throw std::runtime_error("out of memory for iconv buffer");
    }

    // iconv takes a non-const input pointer
    char *in = const_cast<char *>( utf8string.c_str() );
    char *out = (char*) buf;
    if (iconv(utf82wstr, &in, &in_size, &out, &out_size) == (size_t)-1)
    {
        // release everything we acquired before reporting the error
        free(buf);
        iconv_close(utf82wstr);
        throw std::runtime_error("error converting char encodings");
    }

    // out_size holds the unused bytes; terminate right behind the converted chars
    buf[ (buf_size - out_size) / sizeof(wchar_t) ]= 0;

    iconv_close(utf82wstr);

    return buf;
}
689
/**
 * @brief converts a string from UTF-7-IMAP to UTF-8 using iconv.
 * @param utf7imapstring the UTF-7-IMAP encoded string.
 * @return the UTF-8 encoded string.
 * @throw std::runtime_error if iconv does not provide the conversion or the
 *  conversion buffer can't be allocated.
 *
 * The result of the iconv() call itself is not checked: if the conversion stops
 * early, whatever was converted up to that point is returned.
 */
std::string utf7imap_to_utf8(const std::string& utf7imapstring)
{
    iconv_t utf7imap2utf8 = iconv_open("UTF-8","UTF-7-IMAP");

    if (utf7imap2utf8 == (iconv_t)-1)
        throw std::runtime_error("iconv can't convert from UTF-7-IMAP to UTF-8");

    size_t in_size= utf7imapstring.size();
    // the output buffer is sized at 4 output bytes per input byte
    const size_t buf_size= in_size*4;
    size_t out_size= buf_size;

    char *buf = (char *)malloc(buf_size+1);
    if (buf == NULL)
    {
        // don't leak the conversion descriptor
        iconv_close(utf7imap2utf8);
        throw std::runtime_error("out of memory for iconv buffer");
    }

    // iconv takes a non-const input pointer
    char *in = const_cast<char *>( utf7imapstring.c_str() );
    char *out = buf;
    iconv(utf7imap2utf8, &in, &in_size, &out, &out_size);

    // out_size now holds the unused rest of the buffer; copy by length so that
    // NUL bytes within the data do not cut off the result.
    std::string result(buf, buf_size - out_size);

    free(buf);
    iconv_close(utf7imap2utf8);

    return result;
}
719
6a2b6dd1
TJ
/**
 * @brief converts a string from UTF-8 to UTF-7-IMAP using iconv.
 * @param utf8string the UTF-8 encoded string.
 * @return the UTF-7-IMAP encoded string.
 * @throw std::runtime_error if iconv does not provide the conversion or the
 *  conversion buffer can't be allocated.
 *
 * The result of the iconv() call itself is not checked: if the conversion stops
 * early (e.g. because the output buffer is too small), whatever was converted
 * up to that point is returned.
 */
std::string utf8_to_utf7imap(const std::string& utf8string)
{
    iconv_t utf82utf7imap = iconv_open("UTF-7-IMAP", "UTF-8");

    if (utf82utf7imap == (iconv_t)-1)
        throw std::runtime_error("iconv can't convert from UTF-8 to UTF-7-IMAP");

    // UTF-7 is base64 encoded, a buffer 10x as large
    // as the utf-8 buffer should be enough. If not the string will be truncated.
    size_t in_size= utf8string.size();
    const size_t buf_size= in_size*10;
    size_t out_size= buf_size;

    char *buf = (char *)malloc(buf_size+1);
    if (buf == NULL)
    {
        // don't leak the conversion descriptor
        iconv_close(utf82utf7imap);
        throw std::runtime_error("out of memory for iconv buffer");
    }

    // iconv takes a non-const input pointer
    char *in = const_cast<char *>( utf8string.c_str() );
    char *out = buf;
    iconv(utf82utf7imap, &in, &in_size, &out, &out_size);

    // out_size now holds the unused rest of the buffer; copy by length so that
    // NUL bytes within the data do not cut off the result.
    std::string result(buf, buf_size - out_size);

    free(buf);
    iconv_close(utf82utf7imap);

    return result;
}
751
118e216e
TJ
/**
 * @brief tokenizes a string by (html) tags.
 *
 * The input is cut into a sequence of tokens which are appended to @a tokenized.
 * Each token is a pair: first is the text, second tells if the text is a tag
 * (i.e. starts with '<' and ends with the next '>').
 *
 * A '<' always starts a new token. If a tag is not closed before the next '<'
 * or before the end of the input, its text is stored as non-tag token.
 * A '>' outside of a tag is ordinary text.
 *
 * @param[out] tokenized the vector the tokens are appended to.
 * @param input the string to tokenize.
 */
void tokenize_by_tag(std::vector<std::pair<std::string,bool> > &tokenized, const std::string &input)
{
    bool inside_tag = false;
    std::string current;

    for (std::string::const_iterator it = input.begin(); it != input.end(); ++it)
    {
        const char ch = *it;
        if (ch == '<')
        {
            // whatever was collected so far ends here (it is no complete tag)
            if (!current.empty() )
            {
                tokenized.push_back( std::make_pair(current, false) );
                current.clear();
            }
            inside_tag = true;
            current += ch;
        }
        else if (ch == '>' && inside_tag)
        {
            // the tag is complete (current holds at least "<>")
            current += ch;
            inside_tag = false;
            tokenized.push_back( std::make_pair(current, true) );
            current.clear();
        }
        else
        {
            current += ch;
        }
    }

    // String left over in buffer?
    if (!current.empty() )
        tokenized.push_back( std::make_pair(current, false) );
} // eo tokenize_by_tag
118e216e 791
118e216e
TJ
792
793std::string strip_html_tags(const std::string &input)
794{
6ab3bc95
RP
795 // Pair first: string, second: isTag
796 vector<pair<string,bool> > tokenized;
797 tokenize_by_tag (tokenized, input);
118e216e 798
6ab3bc95
RP
799 string output;
800 vector<pair<string,bool> >::const_iterator token, tokens_end = tokenized.end();
83d700e9 801 for (token = tokenized.begin(); token != tokens_end; ++token)
6ab3bc95
RP
802 if (!token->second)
803 output += token->first;
804
805 return output;
806} // eo strip_html_tags
118e216e 807
118e216e
TJ
808
809// Smart-encode HTML en
810string smart_html_entities(const std::string &input)
811{
6ab3bc95
RP
812 // Pair first: string, second: isTag
813 vector<pair<string,bool> > tokenized;
814 tokenize_by_tag (tokenized, input);
815
816 string output;
817 vector<pair<string,bool> >::const_iterator token, tokens_end = tokenized.end();
83d700e9 818 for (token = tokenized.begin(); token != tokens_end; ++token)
6ab3bc95
RP
819 {
820 // keep HTML tags as they are
821 if (token->second)
822 output += token->first;
823 else
824 output += html_entities(token->first);
825 }
826
827 return output;
118e216e
TJ
828}
829
6ab3bc95 830
a5f3af6e
GE
/**
 * @brief searches for the first char outside of the 7 bit ASCII range.
 * @param str the string to search in.
 * @return position of the first char with a value above 127
 *  or std::string::npos if there is no such char.
 */
std::string::size_type find_8bit(const std::string &str)
{
    for (std::string::size_type pos = 0; pos != str.size(); ++pos)
    {
        // compare as unsigned value: plain char may be signed
        if (static_cast<unsigned char>(str[pos]) > 127)
            return pos;
    }

    return std::string::npos;
} // eo find_8bit
840
118e216e
TJ
841// encoded UTF-8 chars into HTML entities
842string html_entities(std::string str)
843{
6ab3bc95
RP
844 // Normal chars
845 replace_all (str, "&", "&amp;");
6ab3bc95
RP
846 replace_all (str, "<", "&lt;");
847 replace_all (str, ">", "&gt;");
980577e1
TJ
848 replace_all (str, "\"", "&quot;");
849 replace_all (str, "'", "&#x27;");
850 replace_all (str, "/", "&#x2F;");
6ab3bc95
RP
851
852 // Umlauts
853 replace_all (str, "\xC3\xA4", "&auml;");
854 replace_all (str, "\xC3\xB6", "&ouml;");
855 replace_all (str, "\xC3\xBC", "&uuml;");
856 replace_all (str, "\xC3\x84", "&Auml;");
857 replace_all (str, "\xC3\x96", "&Ouml;");
858 replace_all (str, "\xC3\x9C", "&Uuml;");
859
860 // Misc
861 replace_all (str, "\xC3\x9F", "&szlig;");
862
863 // conversion of remaining non-ASCII chars needed?
864 // just do if needed because of performance
865 if (find_8bit(str) != string::npos)
866 {
867 // convert to fixed-size encoding UTF-32
868 wchar_t* wbuf=utf8_to_wbuf(str);
869 ostringstream target;
870
871 // replace all non-ASCII chars with HTML representation
872 for (int p=0; wbuf[p] != 0; p++)
873 {
874 unsigned int c=wbuf[p];
875
876 if (c <= 127)
877 target << static_cast<unsigned char>(c);
878 else
879 target << "&#" << c << ';';
880 }
881
882 free(wbuf);
883
884 str=target.str();
885 }
886
887 return str;
888} // eo html_entities(std::string)
889
554f813d
GE
890// convert HTML entities to something that can be viewed on a basic text console (restricted to ASCII-7)
891string html_entities_to_console(std::string str)
892{
893 // Normal chars
894 replace_all (str, "&amp;", "&");
895 replace_all (str, "&lt;", "<");
896 replace_all (str, "&gt;", ">");
897 replace_all (str, "&quot;", "\"");
898 replace_all (str, "&#x27;", "'");
899 replace_all (str, "&#x2F;", "/");
900
901 // Umlauts
902 replace_all (str, "&auml;", "ae");
903 replace_all (str, "&ouml;", "oe");
904 replace_all (str, "&uuml;", "ue");
905 replace_all (str, "&Auml;", "Ae");
906 replace_all (str, "&Ouml;", "Oe");
907 replace_all (str, "&Uuml;", "Ue");
908
909 // Misc
910 replace_all (str, "&szlig;", "ss");
911
912 return str;
913}
118e216e 914
e93545dd
GE
/**
 * @brief replaces all occurrences of a search string within a string (in place).
 *
 * The search continues behind each inserted replacement, i.e. the replacement
 * text itself is never searched again.
 *
 * @param[in,out] base the string to modify.
 * @param ist the string to search for; must not be empty.
 * @param soll the replacement.
 * @return @a true iff at least one occurrence was replaced.
 * @throw std::runtime_error if @a ist is empty.
 */
bool replace_all(std::string &base, const std::string &ist, const std::string &soll)
{
    if (ist.empty() )
        throw std::runtime_error ("replace_all called with empty search string");

    bool found_ist = false;
    std::string::size_type pos = 0;
    while ( (pos = base.find(ist, pos) ) != std::string::npos)
    {
        base.replace(pos, ist.size(), soll);
        // continue behind the replacement
        pos += soll.size();
        found_ist = true;
    }

    return found_ist;
}

/**
 * @brief convenience overload of replace_all taking C strings.
 * @param ist zero terminated search string (must not be a null pointer).
 * @param soll zero terminated replacement (must not be a null pointer).
 */
bool replace_all(std::string &base, const char *ist, const char *soll)
{
    return replace_all(base, std::string(ist), std::string(soll));
}

/**
 * @brief convenience overload of replace_all taking the replacement as C string.
 * @param soll zero terminated replacement (must not be a null pointer).
 */
bool replace_all(std::string &base, const std::string &ist, const char *soll)
{
    return replace_all(base, ist, std::string(soll));
}

/**
 * @brief convenience overload of replace_all taking pointers to strings.
 * @param ist pointer to the search string (must not be a null pointer).
 * @param soll pointer to the replacement (must not be a null pointer).
 */
bool replace_all(std::string &base, const std::string *ist, const std::string *soll)
{
    return replace_all(base, *ist, *soll);
}

/**
 * @brief convenience overload of replace_all taking a C string and a pointer to a string.
 * @param ist zero terminated search string (must not be a null pointer).
 * @param soll pointer to the replacement (must not be a null pointer).
 */
bool replace_all(std::string &base, const char *ist, const std::string *soll)
{
    return replace_all(base, std::string(ist), *soll);
}
956
b953bf36
GE
/**
 * @brief replaces all characters that could be problematic or impose a security risk when being logged
 * @param str the original string
 * @param replace_with the character to replace the unsafe chars with
 * @return a string that is safe to send to syslog or other logfiles
 *
 * All chars between 0x20 (space) and 0x7E (~) (including) are considered safe for logging.
 * See e.g. RFC 5424, section 8.2 or the posix character class "printable".
 * This eliminates all possible problems with NUL, control characters, 8 bit chars, UTF8.
 *
 */
std::string sanitize_for_logging(const std::string &str, const char replace_with)
{
    std::string output(str);

    for (std::string::iterator it = output.begin(); it != output.end(); ++it)
    {
        // 8 bit chars fail one of the two tests, whether plain char is signed or not
        const bool is_safe = !(*it < 0x20 || *it > 0x7E);
        if (!is_safe)
            *it = replace_with;
    }

    return output;
}
979
#if 0
// Disabled legacy implementations of to_lower() / to_upper(); never compiled.
// Kept for reference only.
string to_lower(const string &src)
{
    string dst = src;

    string::size_type pos, end = dst.size();
    for (pos = 0; pos < end; pos++)
        dst[pos] = tolower(dst[pos]);

    return dst;
}

string to_upper(const string &src)
{
    string dst = src;

    string::size_type pos, end = dst.size();
    for (pos = 0; pos < end; pos++)
        dst[pos] = toupper(dst[pos]);

    return dst;
}
#endif
e93545dd 1003
83809f5e 1004const int MAX_UNIT_FORMAT_SYMBOLS = 6;
d1ea9075 1005
2cb9a9c5 1006const string shortUnitFormatSymbols[MAX_UNIT_FORMAT_SYMBOLS] = {
d1ea9075
GMF
1007 " B",
1008 " KB",
1009 " MB",
1010 " GB",
1011 " TB",
83809f5e 1012 " PB"
d1ea9075
GMF
1013};
1014
2cb9a9c5 1015const string longUnitFormatSymbols[MAX_UNIT_FORMAT_SYMBOLS] = {
5cb766b9
GMF
1016 i18n_noop(" Bytes"),
1017 i18n_noop(" KBytes"),
1018 i18n_noop(" MBytes"),
1019 i18n_noop(" GBytes"),
1020 i18n_noop(" TBytes"),
83809f5e 1021 i18n_noop(" PBytes")
d1ea9075
GMF
1022};
1023
72a94426
GMF
1024
/**
 * @brief rounds a number "half up" to a multiple of 1/rounding_multiplier.
 * @param number the number to round.
 * @param rounding_multiplier the reciprocal of the rounding step
 *  (e.g. 10 rounds to one decimal place); must not be 0.
 * @return the rounded number.
 */
long double rounding_upwards(
    const long double number,
    const int rounding_multiplier
)
{
    // scale, round half up to the next integral value, scale back.
    // floor() is used instead of a cast to an integer type: a cast truncates towards
    // zero (wrong result for negative numbers) and is limited to the integer's range.
    const long double scaled = number * rounding_multiplier;
    const long double rounded = std::floor(scaled + 0.5);

    return rounded / (long double) (rounding_multiplier);
}
1038
1039
81267544
GMF
1040string nice_unit_format(
1041 const int64_t input,
70fc0674
GMF
1042 const UnitFormat format,
1043 const UnitBase base
81267544 1044)
6ab3bc95 1045{
d1ea9075 1046 // select the system of units (decimal or binary)
81267544 1047 int multiple = 0;
a398513a 1048 if (base == UnitBase1000)
81267544
GMF
1049 {
1050 multiple = 1000;
1051 }
1052 else
1053 {
1054 multiple = 1024;
1055 }
1056
1057 long double size = input;
6ab3bc95 1058
d1ea9075
GMF
1059 // check the size of the input number to fit in the appropriate symbol
1060 int sizecount = 0;
81267544 1061 while (size > multiple)
6ab3bc95 1062 {
81267544
GMF
1063 size = size / multiple;
1064 sizecount++;
83809f5e
GMF
1065
1066 // rollback to the previous values and stop the loop when cannot
1067 // represent the number length.
1068 if (sizecount >= MAX_UNIT_FORMAT_SYMBOLS)
1069 {
1070 size = size * multiple;
1071 sizecount--;
1072 break;
1073 }
6ab3bc95
RP
1074 }
1075
a398513a
GMF
1076 // round the input number "half up" to multiples of 10
1077 const int rounding_multiplier = 10;
72a94426 1078 size = rounding_upwards(size, rounding_multiplier);
6ab3bc95 1079
d1ea9075 1080 // format the input number, placing the appropriate symbol
6ab3bc95 1081 ostringstream out;
6ab3bc95 1082 out.setf (ios::fixed);
a398513a 1083 if (format == ShortUnitFormat)
d1ea9075
GMF
1084 {
1085 out.precision(1);
68d37a5c 1086 out << size << i18n( shortUnitFormatSymbols[sizecount].c_str() );
d1ea9075
GMF
1087 }
1088 else
6ab3bc95 1089 {
d1ea9075 1090 out.precision (2);
68d37a5c 1091 out << size << i18n( longUnitFormatSymbols[sizecount].c_str() );
6ab3bc95
RP
1092 }
1093
1094 return out.str();
1095} // eo nice_unit_format(int input)
1096
e93545dd 1097
5cd64148
CH
1098string nice_unit_format(
1099 const double input,
1100 const UnitFormat format,
1101 const UnitBase base
1102)
1103{
1104 // round as double and cast to int64_t
1105 // cast raised overflow error near max val of int64_t (~9.2e18, see unittest)
1106 int64_t input_casted_and_rounded =
1107 boost::numeric_cast<int64_t>( round(input) );
1108
1109 // now call other
1110 return nice_unit_format( input_casted_and_rounded, format, base );
1111} // eo nice_unit_format(double input)
1112
1113
47c07fba
GE
/**
 * @brief Escapes a string and wraps it in double quotes.
 *
 * Double quotes and backslashes are prefixed with a backslash, carriage
 * return and line feed are written as the two-character sequences
 * backslash-r and backslash-n. All other characters are copied unchanged.
 *
 * @param s the string to escape.
 * @return the escaped string, enclosed in double quotes.
 */
std::string escape(const std::string &s)
{
    std::string out;
    out.reserve(s.size() + 2);

    out += '"';
    for (std::string::const_iterator it = s.begin(); it != s.end(); ++it)
    {
        switch (*it)
        {
            case '"':
            case '\\':
                // keep the character, guarded by a backslash
                out += '\\';
                out += *it;
                break;
            case '\r':
                out += "\\r";
                break;
            case '\n':
                out += "\\n";
                break;
            default:
                out += *it;
        }
    }
    out += '"';

    return out;
} // eo escape(const std::string&)
47c07fba 1144
47c07fba 1145
6ab3bc95
RP
1146string descape(const string &s, int startpos, int &endpos)
1147{
1148 string out;
1149
1150 if (s.at(startpos) != '"')
1151 throw out_of_range("value not type escaped string");
1152
1153 out=s.substr(startpos+1);
1154 string::size_type p=0;
1155
1156 // search for the end of the string
1157 while ( (p=out.find("\"",p) ) !=out.npos)
1158 {
1159 int e=p-1;
1160 bool escaped=false;
1161
1162 // the " might be escaped with a backslash
1163 while (e>=0 && out.at (e) =='\\')
1164 {
1165 if (escaped == false)
1166 escaped=true;
1167 else
1168 escaped=false;
1169
1170 e--;
1171 }
1172
1173 if (escaped==false)
1174 break;
1175 else
1176 p++;
1177 }
1178
1179 // we now have the end of the string
1180 out=out.substr(0,p);
1181
1182 // tell calling prog about the endposition
1183 endpos=startpos+p+1;
1184
1185 // descape all \ stuff inside the string now
1186 p=0;
1187 while ( (p=out.find_first_of("\\",p) ) !=out.npos)
1188 {
1189 switch (out.at(p+1) )
1190 {
1191 case 'r':
47c07fba
GE
1192 out.replace(p,2,"\r");
1193 break;
6ab3bc95 1194 case 'n':
47c07fba
GE
1195 out.replace(p,2,"\n");
1196 break;
6ab3bc95 1197 default:
47c07fba 1198 out.erase(p,1);
6ab3bc95
RP
1199 }
1200 p++;
1201 }
1202
1203 return out;
1204} // eo descape(const std::string&,int,int&)
47c07fba 1205
e93545dd 1206
47c07fba
GE
/**
 * @brief Wraps a string in single quotes for use as a shell argument.
 *
 * Every single quote inside the input is replaced by the sequence
 * quote, backslash, quote, quote: it closes the quoted section, adds an
 * escaped quote and opens a new quoted section. All other characters
 * are copied unchanged.
 *
 * @param input the argument to quote.
 * @return the single-quoted argument.
 */
std::string escape_shellarg(const std::string &input)
{
    std::string quoted(1, '\'');

    // copy the text between the single quotes chunk by chunk
    std::string::size_type chunk_start = 0;
    std::string::size_type quote_pos;
    while ( (quote_pos = input.find('\'', chunk_start) ) != std::string::npos)
    {
        quoted.append(input, chunk_start, quote_pos - chunk_start);
        quoted += "'\\''";
        chunk_start = quote_pos + 1;
    }

    // remainder behind the last single quote (or the whole input)
    quoted.append(input, chunk_start, std::string::npos);
    quoted += '\'';

    return quoted;
}