Increase version to 2.9
[libi2ncommon] / src / stringfunc.cpp
CommitLineData
0e23f538
TJ
1/*
2The software in this package is distributed under the GNU General
3Public License version 2 (with a special exception described below).
4
5A copy of GNU General Public License (GPL) is included in this distribution,
6in the file COPYING.GPL.
7
8As a special exception, if other files instantiate templates or use macros
9or inline functions from this file, or you compile this file and link it
10with other works to produce a work based on this file, this file
11does not by itself cause the resulting work to be covered
12by the GNU General Public License.
13
14However the source code for this file must still be made available
15in accordance with section (3) of the GNU General Public License.
16
17This exception does not invalidate any other reasons why a work based
18on this file might be covered by the GNU General Public License.
19*/
6a93d84a
TJ
20/** @file
21 *
22 * (c) Copyright 2007-2008 by Intra2net AG
6a93d84a 23 */
e93545dd
GE
24
25#include <iostream>
26#include <string>
27#include <sstream>
28#include <stdexcept>
5efd35b1 29#include <algorithm>
5cd64148 30#include <cmath> // for round()
e93545dd 31
a5f3af6e 32#include <wchar.h>
e93545dd
GE
33#include <stdlib.h>
34#include <iconv.h>
35#include <i18n.h>
36
5cd64148 37#include <boost/numeric/conversion/cast.hpp>
3f5c5ccd 38#include <boost/foreach.hpp>
5cd64148 39
e93545dd
GE
40#include <stringfunc.hxx>
41
42using namespace std;
43
6ab3bc95
RP
44namespace I2n
45{
6a93d84a
TJ
46
47
6ab3bc95
RP
namespace
{

/// digit characters for the nibble values 0..15 (lower case variant).
const std::string hexDigitsLower("0123456789abcdef");
/// digit characters for the nibble values 0..15 (upper case variant).
const std::string hexDigitsUpper("0123456789ABCDEF");


/**
 * @brief functor which converts a single character to upper case.
 *
 * Intended for use with std::transform on a std::string.
 */
struct UpperFunc
{
    char operator() (char c)
    {
        // std::toupper is only defined for values representable as unsigned char
        // (or EOF); a negative plain char must be converted first.
        return static_cast<char>( std::toupper( static_cast<unsigned char>(c) ) );
    }
}; // eo struct UpperFunc


/**
 * @brief functor which converts a single character to lower case.
 *
 * Intended for use with std::transform on a std::string.
 */
struct LowerFunc
{
    char operator() (char c)
    {
        // see UpperFunc: go through unsigned char to stay within defined behaviour.
        return static_cast<char>( std::tolower( static_cast<unsigned char>(c) ) );
    }
}; // eo struct LowerFunc


} // eo namespace <anonymous>
74
75
76
/**
 * default set of whitespace characters: blank, tab, carriage return and line feed.
 */
const std::string Whitespaces(" \t\r\n");

/**
 * default set of line ending characters: carriage return and line feed.
 */
const std::string LineEndings("\r\n");
6a93d84a
TJ
86
87
88
/**
 * @brief tests whether a string starts with a given prefix.
 * @param str the string which is tested.
 * @param prefix the prefix to look for.
 * @return @a true iff @a prefix is not empty and @a str begins with it.
 */
bool has_prefix(const std::string& str, const std::string& prefix)
{
    const std::string::size_type prefix_len= prefix.size();
    // an empty prefix never matches and a prefix longer than the string can't match
    // (the latter also covers an empty str, since prefix_len is at least 1 then).
    if (prefix_len == 0 || prefix_len > str.size() )
    {
        return false;
    }
    return std::equal(prefix.begin(), prefix.end(), str.begin() );
} // eo has_prefix(const std::string&,const std::string&)
6a93d84a
TJ
103
104
/**
 * @brief tests whether a string ends with a given suffix.
 * @param str the string which is tested.
 * @param suffix the suffix to look for.
 * @return @a true iff @a suffix is not empty and @a str ends with it.
 */
bool has_suffix(const std::string& str, const std::string& suffix)
{
    const std::string::size_type suffix_len= suffix.size();
    // an empty suffix never matches and a suffix longer than the string can't match
    // (the latter also covers an empty str, since suffix_len is at least 1 then).
    if (suffix_len == 0 || suffix_len > str.size() )
    {
        return false;
    }
    // compare back to front; only suffix_len characters of str are visited.
    return std::equal(suffix.rbegin(), suffix.rend(), str.rbegin() );
} // eo has_suffix(const std::string&,const std::string&)
6a93d84a
TJ
119
120
/**
 * @brief strips characters of a given set from both ends of a string (in place).
 * @param[in,out] str the string which is trimmed.
 * @param charlist the characters to strip from the beginning and the end of @a str.
 * @return a copy of the trimmed string.
 */
std::string trim_mod(std::string& str, const std::string& charlist)
{
    const std::string::size_type first_kept= str.find_first_not_of(charlist);
    if (first_kept == std::string::npos)
    {
        // nothing survives: the string is empty or consists of charlist characters only.
        str.clear();
        return str;
    }
    // there is at least one character to keep, so this search can't fail.
    const std::string::size_type last_kept= str.find_last_not_of(charlist);
    // cut the tail first; the positions in front of it stay valid.
    str.erase(last_kept + 1);
    str.erase(0, first_kept);
    return str;
} // eo trim_mod(std::string&,const std::string&)
6a93d84a
TJ
150
151
152
/**
 * @brief drops the last character of a string if it is one of a given set (in place).
 * @param[in,out] str the string.
 * @param what the set of characters which get removed.
 * @return a copy of the resulting string (at most one character shorter than before).
 */
std::string chomp_mod(std::string& str, const std::string& what)
{
    if (!str.empty() && !what.empty() )
    {
        const std::string::size_type last_index= str.size() - 1;
        if (what.find( str[last_index] ) != std::string::npos)
        {
            str.resize(last_index);
        }
    }
    return str;
} // eo chomp_mod(std::string&,const std::string&)
6a93d84a
TJ
171
172
173/**
174 * @brief converts a string to lower case.
175 * @param[in,out] str the string to modify.
176 * @return the string
177 */
6ab3bc95 178std::string to_lower_mod(std::string& str)
6a93d84a 179{
6ab3bc95
RP
180 std::transform(str.begin(), str.end(), str.begin(), LowerFunc() );
181 return str;
182} // eo to_lower_mod(std::string&)
6a93d84a
TJ
183
184
185/**
186 * @brief converts a string to upper case.
187 * @param[in,out] str the string to modify.
188 * @return the string
189 */
6ab3bc95 190std::string to_upper_mod(std::string& str)
6a93d84a 191{
6ab3bc95
RP
192 std::transform( str.begin(), str.end(), str.begin(), UpperFunc() );
193 return str;
194} // eo to_upper_mod(std::string&)
6a93d84a
TJ
195
196
197
/**
 * @brief returns a copy of a string with characters of a given set stripped from both ends.
 * @param str the string which should be trimmed.
 * @param charlist the characters to strip from the beginning and the end of @a str.
 * @return the trimmed string.
 */
std::string trim (const std::string& str, const std::string& charlist)
{
    const std::string::size_type first_kept= str.find_first_not_of(charlist);
    if (first_kept == std::string::npos)
    {
        // empty input or nothing but charlist characters.
        return std::string();
    }
    // at least one character is kept, so this search can't fail.
    const std::string::size_type last_kept= str.find_last_not_of(charlist);
    return std::string(str.begin() + first_kept, str.begin() + last_kept + 1);
} // eo trim(const std::string&,const std::string&)
217
218
/**
 * @brief returns a copy of a string without its last character if that one is in a given set.
 * @param str the string.
 * @param what the set of characters which get removed.
 * @return the resulting string (at most one character shorter than @a str).
 */
std::string chomp (const std::string& str, const std::string& what)
{
    const bool drop_last=
        !str.empty()
        && !what.empty()
        && what.find( *str.rbegin() ) != std::string::npos;
    if (!drop_last)
    {
        return str;
    }
    return std::string(str, 0, str.size() - 1);
} // eo chomp(const std::string&,const std::string&)
237
238
239/**
240 * @brief returns a lower case version of a given string.
241 * @param str the string
242 * @return the lower case version of the string
243 */
6ab3bc95 244std::string to_lower (const std::string& str)
6a93d84a 245{
6ab3bc95
RP
246 std::string result(str);
247 return to_lower_mod(result);
248} // eo to_lower(const std::string&)
6a93d84a
TJ
249
250
251/**
252 * @brief returns a upper case version of a given string.
253 * @param str the string
254 * @return the upper case version of the string
255 */
6ab3bc95 256std::string to_upper(const std::string& str)
6a93d84a 257{
6ab3bc95
RP
258 std::string result(str);
259 return to_upper_mod(result);
260} // eo to_upper(const std::string&)
6a93d84a
TJ
261
262
263
264/**
265 * @brief removes a given suffix from a string.
266 * @param str the string.
267 * @param suffix the suffix which should be removed if the string ends with it.
268 * @return the string without the suffix.
269 *
270 * If the string ends with the suffix, it is removed. If the the string doesn't end
271 * with the suffix the original string is returned.
272 */
6ab3bc95 273std::string remove_suffix(const std::string& str, const std::string& suffix)
6a93d84a 274{
6ab3bc95
RP
275 if (has_suffix(str,suffix) )
276 {
277 return str.substr(0, str.size()-suffix.size() );
278 }
279 return str;
280} // eo remove_suffix(const std::string&,const std::string&)
6a93d84a
TJ
281
282
283
284/**
285 * @brief removes a given prefix from a string.
286 * @param str the string.
287 * @param prefix the prefix which should be removed if the string begins with it.
288 * @return the string without the prefix.
289 *
290 * If the string begins with the prefix, it is removed. If the the string doesn't begin
291 * with the prefix the original string is returned.
292 */
6ab3bc95 293std::string remove_prefix(const std::string& str, const std::string& prefix)
6a93d84a 294{
6ab3bc95
RP
295 if (has_prefix(str,prefix) )
296 {
297 return str.substr( prefix.size() );
298 }
299 return str;
300} // eo remove_prefix(const std::string&,const std::string&)
6a93d84a
TJ
301
302
303/**
304 * split a string to key and value delimited by a given delimiter.
6ab3bc95 305 * The resulting key and value strings are trimmed (Whitespaces removed at beginning and end).
6a93d84a
TJ
306 * @param str the string which should be splitted.
307 * @param[out] key the resulting key
308 * @param[out] value the resulting value
309 * @param delimiter the delimiter between key and value; default is '='.
310 * @return @a true if the split was successful.
311 */
6ab3bc95
RP
312bool pair_split(
313 const std::string& str,
314 std::string& key,
315 std::string& value,
316 char delimiter)
317{
318 std::string::size_type pos = str.find (delimiter);
319 if (pos == std::string::npos) return false;
320 key= str.substr(0,pos);
321 value= str.substr(pos+1);
322 trim_mod(key);
323 trim_mod(value);
324 return true;
325} // eo pair_split(const std::string&,std::string&,std::string&,char)
6a93d84a
TJ
326
327
328/**
329 * splits a string by given delimiter
330 *
331 * @param[in] str the string which should be splitted.
332 * @param[out] result the list resulting from splitting @a str.
333 * @param[in] delimiter the delimiter (word/phrase) at which @a str should be splitted.
334 * @param[in] omit_empty should empty parts not be stored?
335 * @param[in] trim_list list of characters the parts should be trimmed by.
336 * (empty string results in no trim)
337 */
6ab3bc95
RP
338void split_string(
339 const std::string& str,
340 std::list<std::string>& result,
341 const std::string& delimiter,
342 bool omit_empty,
343 const std::string& trim_list
6a93d84a
TJ
344)
345{
6ab3bc95
RP
346 std::string::size_type pos, last_pos=0;
347 bool delimiter_found= false;
348 while ( last_pos < str.size() && last_pos != std::string::npos)
349 {
350 pos= str.find(delimiter, last_pos);
351 std::string part;
352 if (pos == std::string::npos)
353 {
354 part= str.substr(last_pos);
355 delimiter_found= false;
356 }
357 else
358 {
359 part= str.substr(last_pos, pos-last_pos);
360 delimiter_found=true;
361 }
362 if (pos != std::string::npos)
363 {
364 last_pos= pos+ delimiter.size();
365 }
366 else
367 {
368 last_pos= std::string::npos;
369 }
370 if (!trim_list.empty() ) trim_mod (part, trim_list);
371 if (omit_empty && part.empty() ) continue;
372 result.push_back( part );
373 }
374 // if the string ends with a delimiter we need to append an empty string if no omit_empty
375 // was given.
376 // (this way we keep the split result consistent to a join operation)
377 if (delimiter_found && !omit_empty)
378 {
379 result.push_back("");
380 }
381} // eo split_string(const std::string&,std::list< std::string >&,const std::string&,bool,const std::string&)
6a93d84a
TJ
382
383
338da253
CH
384/** call split_string with list<string>, converts result to vector; vector is clear()-ed first
385 *
386 * Note: Uses 3 O(n)-operations: list.size, vector.resize and std::swap_ranges;
387 * not sure whether there is a better way to do this
388 * */
389void split_string(
390 const std::string& str,
391 std::vector<std::string>& result,
392 const std::string& delimiter,
393 bool omit_empty,
394 const std::string& trim_list
395)
396{
397 std::list<std::string> tmp;
398 split_string(str, tmp, delimiter, omit_empty, trim_list);
399 std::size_t size = tmp.size(); // this is O(n)
400 result.clear();
401 result.resize(size); // also O(n)
402 std::swap_ranges(tmp.begin(), tmp.end(), result.begin()); // also O(n)
403}
404
6a93d84a
TJ
405/**
406 * splits a string by a given delimiter
407 * @param str the string which should be splitted.
408 * @param delimiter delimiter the delimiter (word/phrase) at which @a str should be splitted.
409 * @param[in] omit_empty should empty parts not be stored?
410 * @param[in] trim_list list of characters the parts should be trimmed by.
411 * (empty string results in no trim)
412 * @return the list resulting from splitting @a str.
413 */
6ab3bc95
RP
414std::list<std::string> split_string(
415 const std::string& str,
416 const std::string& delimiter,
417 bool omit_empty,
418 const std::string& trim_list
6a93d84a
TJ
419)
420{
6ab3bc95
RP
421 std::list<std::string> result;
422 split_string(str, result, delimiter, omit_empty, trim_list);
423 return result;
424} // eo split_string(const std::string&,const std::string&,bool,const std::string&)
6a93d84a
TJ
425
426
/**
 * @brief concatenates a list of strings, separated by a delimiter.
 *
 * This function is (basically) the reverse operation of @a split_string.
 *
 * @param parts the list of strings.
 * @param delimiter the delimiter which is inserted between two neighbouring strings.
 * @return the joined string; empty if @a parts is empty.
 */
std::string join_string(
    const std::list< std::string >& parts,
    const std::string& delimiter
)
{
    std::string joined;
    for (std::list< std::string >::const_iterator part= parts.begin();
         part != parts.end();
         ++part)
    {
        // the delimiter goes in front of every part except the first one.
        if (part != parts.begin() )
        {
            joined+= delimiter;
        }
        joined+= *part;
    }
    return joined;
} // eo join_string(const std::list< std::string >&,const std::string&)
6a93d84a
TJ
454
455
376ec4fa
CH
/**
 * @brief concatenates a vector of strings, separated by a delimiter.
 *
 * Same contract as the std::list overload of join_string.
 *
 * @param parts the strings to join.
 * @param delimiter the delimiter which is inserted between two neighbouring strings.
 * @return the joined string; empty if @a parts is empty.
 */
std::string join_string(
    const std::vector< std::string >& parts,
    const std::string& delimiter
)
{
    std::string joined;
    for (std::vector< std::string >::size_type index= 0; index < parts.size(); ++index)
    {
        // the delimiter goes in front of every part except the first one.
        if (index > 0)
        {
            joined+= delimiter;
        }
        joined+= parts[index];
    }
    return joined;
} // eo join_string(const std::vector< std::string >&,const std::string&)
475
476
6a93d84a
TJ
477
478/*
479** conversions
480*/
481
482
483/**
484 * @brief returns a hex string from a binary string.
485 * @param str the (binary) string
486 * @param upper_case_digits determine whether to use upper case characters for digits A-F.
487 * @return the string in hex notation.
488 */
6ab3bc95
RP
489std::string convert_binary_to_hex(
490 const std::string& str,
491 bool upper_case_digits
6a93d84a
TJ
492)
493{
6ab3bc95
RP
494 std::string result;
495 std::string hexDigits(upper_case_digits ? hexDigitsUpper : hexDigitsLower);
496 for ( std::string::const_iterator it= str.begin();
497 it != str.end();
498 ++it)
499 {
500 result.push_back( hexDigits[ ( (*it) >> 4) & 0x0f ] );
501 result.push_back( hexDigits[ (*it) & 0x0f ] );
502 }
503 return result;
504} // eo convert_binary_to_hex(const std::string&,bool)
6a93d84a
TJ
505
506
507/**
508 * @brief converts a hex digit string to binary string.
509 * @param str hex digit string
510 * @return the binary string.
511 *
512 * The hex digit string may contains white spaces or colons which are treated
513 * as delimiters between hex digit groups.
514 *
515 * @todo rework the handling of half nibbles (consistency)!
516 */
6ab3bc95
RP
517std::string convert_hex_to_binary(
518 const std::string& str
6a93d84a 519)
6ab3bc95
RP
520throw (std::runtime_error)
521{
522 std::string result;
523 char c= 0;
524 bool hasNibble= false;
525 bool lastWasWS= true;
526 for ( std::string::const_iterator it= str.begin();
527 it != str.end();
528 ++it)
529 {
530 std::string::size_type p = hexDigitsLower.find( *it );
531 if (p== std::string::npos)
532 {
533 p= hexDigitsUpper.find( *it );
534 }
535 if (p == std::string::npos)
536 {
537 if ( ( Whitespaces.find( *it ) != std::string::npos) // is it a whitespace?
6a93d84a 538 or ( *it == ':') // or a colon?
6ab3bc95
RP
539 )
540 {
541 // we treat that as a valid delimiter:
542 if (hasNibble)
6a93d84a 543 {
6ab3bc95
RP
544 // 1 nibble before WS is treate as lower part:
545 result.push_back(c);
546 // reset state:
547 hasNibble= false;
6a93d84a 548 }
6ab3bc95
RP
549 lastWasWS= true;
550 continue;
551 }
552 }
553 if (p == std::string::npos )
554 {
555 throw runtime_error("illegal character in hex digit string: " + str);
556 }
557 lastWasWS= false;
558 if (hasNibble)
559 {
560 c<<=4;
561 }
562 else
563 {
564 c=0;
565 }
566 c+= (p & 0x0f);
567 if (hasNibble)
568 {
569 //we already had a nibble, so a char is complete now:
570 result.push_back( c );
571 hasNibble=false;
572 }
573 else
574 {
575 // this is the first nibble of a new char:
576 hasNibble=true;
577 }
578 }
579 if (hasNibble)
580 {
581 //well, there is one nibble left
582 // let's do some heuristics:
583 if (lastWasWS)
584 {
585 // if the preceeding character was a white space (or a colon)
586 // we treat the nibble as lower part:
587 //( this is consistent with shortened hex notations where leading zeros are not noted)
588 result.push_back( c );
589 }
590 else
591 {
592 // if it was part of a hex digit chain, we treat it as UPPER part (!!)
593 result.push_back( c << 4 );
594 }
595 }
596 return result;
597} // eo convert_hex_to_binary(const std::string&)
598
599
1a0267e5
CH
/**
 * @brief names of the STL container templates whose allocator argument gets shortened.
 * @return reference to a function-local static list which is filled with
 *         "std::list" and "std::vector" whenever it is found empty.
 */
static std::list<std::string>& alloc_template_starts()
{
    static std::list<std::string> starts;
    if (starts.empty() )
    {
        static const char* const names[]= { "std::list", "std::vector" };
        starts.assign(names, names + sizeof(names) / sizeof(names[0]) );
    }
    return starts;
}
610
611string shorten_stl_types(const string &input)
612{
613 string output = input;
614
615 // first: replace fixed string for std::string
616 replace_all(output, "std::basic_string<char, std::char_traits<char>, std::allocator<char> >",
617 "std::string");
618
619 // loop over list/vector/... that have an allocator, e.g.
620 // std::list< some_type_here, std::allocator<some_type_here> >
621 string::size_type start, comma, end, len, start_text_len;
622 int n_open_brackets;
623 string allocator_text;
624 BOOST_FOREACH(const string &start_text, alloc_template_starts())
625 {
626 start = 0;
627 comma = 0;
628 end = 0;
629 start_text_len = start_text.length();
630 while( (start=output.find(start_text+"<", start)) != string::npos )
631 {
632 len = output.length();
633 start += start_text_len+1; // start next iter and tests here after opening bracket
634
635 // now comes the tricky part: find matching ',' and the closing '>' even if "subtype" is template again
636 comma = start;
637 n_open_brackets = 1; // the bracket right after start_text counts as first
638 while (comma < len && n_open_brackets > 0)
639 {
640 if (output[comma] == ',' && n_open_brackets == 1)
641 break;
642 else if (output[comma] == '<')
643 ++n_open_brackets;
644 else if (output[comma] == '>')
645 --n_open_brackets;
646 ++comma;
647 }
648 end = comma+1;
649 while (end < len && n_open_brackets > 0)
650 {
651 if (output[end] == '<')
652 ++n_open_brackets;
653 else if (output[end] == '>')
654 {
655 --n_open_brackets;
656 if (n_open_brackets == 0)
657 break; // do not increment end
658 }
659 ++end;
660 }
661
662 // check that start < comma < end < len && n_open_brackets == 0
663 if (start >= comma || comma >= end || end >= len || n_open_brackets != 0)
664 continue; // input seems to be of unexpected form
665
666 // check that type in allocator is same as until comma
667 string type = output.substr(start, comma-start);
668 if (type[type.length()-1] == '>')
669 allocator_text = string("std::allocator<") + type + " > ";
670 else
671 allocator_text = string("std::allocator<") + type + "> ";
672 if (output.substr(comma+2, end-comma-2) == allocator_text)
673 output.replace(comma+2, end-comma-2, "_alloc_");
674 }
675 }
676
677 return output;
678}
679
6ab3bc95
RP
680} // eo namespace I2n
681
682
683
6a93d84a 684
e93545dd
GE
/**
 * @brief converts an ISO-8859-1 encoded string to UTF-8.
 * @param isostring the ISO-8859-1 encoded input.
 * @return the UTF-8 encoded string. The result of iconv() is not checked; if the
 *         conversion stops early, the part converted so far is returned.
 * @throw std::runtime_error if the iconv converter can't be opened or the
 *        conversion buffer can't be allocated.
 */
std::string iso_to_utf8(const std::string& isostring)
{
    iconv_t i2utf8 = iconv_open("UTF-8", "ISO-8859-1");

    // check the descriptor itself (not the address of this function) for the error value
    if (i2utf8 == (iconv_t)-1)
        throw std::runtime_error("iconv can't convert from ISO-8859-1 to UTF-8");

    size_t in_size = isostring.size();
    const size_t buf_size = in_size*4;
    size_t out_size = buf_size;

    char *buf = (char *)malloc(buf_size+1);
    if (buf == NULL)
    {
        iconv_close(i2utf8); // don't leak the descriptor
        throw std::runtime_error("out of memory for iconv buffer");
    }

    char *in = const_cast<char *>(isostring.c_str());
    char *out = buf;
    iconv(i2utf8, &in, &in_size, &out, &out_size);
    iconv_close(i2utf8);

    std::string result;
    try
    {
        // take the number of bytes written (instead of relying on a NUL terminator),
        // so that NUL bytes within the data don't cut the result short.
        result.assign(buf, buf_size - out_size);
    }
    catch (...)
    {
        free(buf);
        throw;
    }
    free(buf);

    return result;
}
714
/**
 * @brief converts a UTF-8 encoded string to ISO-8859-1.
 * @param utf8string the UTF-8 encoded input.
 * @return the ISO-8859-1 encoded string. The result of iconv() is not checked; if
 *         the conversion stops early (e.g. at a character iconv can't convert),
 *         the part converted so far is returned.
 * @throw std::runtime_error if the iconv converter can't be opened or the
 *        conversion buffer can't be allocated.
 */
std::string utf8_to_iso(const std::string& utf8string)
{
    iconv_t utf82iso = iconv_open("ISO-8859-1","UTF-8");

    if (utf82iso == (iconv_t)-1)
        throw std::runtime_error("iconv can't convert from UTF-8 to ISO-8859-1");

    size_t in_size = utf8string.size();
    // the ISO-8859-1 form is never longer than the UTF-8 form
    const size_t buf_size = in_size;
    size_t out_size = buf_size;

    char *buf = (char *)malloc(buf_size+1);
    if (buf == NULL)
    {
        iconv_close(utf82iso); // don't leak the descriptor
        throw std::runtime_error("out of memory for iconv buffer");
    }

    char *in = const_cast<char *>(utf8string.c_str());
    char *out = buf;
    iconv(utf82iso, &in, &in_size, &out, &out_size);
    iconv_close(utf82iso);

    std::string result;
    try
    {
        // take the number of bytes written (instead of relying on a NUL terminator),
        // so that NUL bytes within the data don't cut the result short.
        result.assign(buf, buf_size - out_size);
    }
    catch (...)
    {
        free(buf);
        throw;
    }
    free(buf);

    return result;
}
744
a5f3af6e
GE
/**
 * @brief converts a UTF-8 encoded string into a zero terminated wchar_t buffer.
 *
 * The conversion target is "UCS-4LE"; NOTE(review): this presumes that wchar_t is
 * a 4 byte little endian type on the target platform - confirm for new platforms.
 *
 * @param utf8string the UTF-8 encoded input.
 * @return a buffer obtained from malloc() holding the converted characters followed
 *         by a 0 terminator. The caller has to release it with free().
 * @throw std::runtime_error if the iconv converter can't be opened, the buffer
 *        can't be allocated or the conversion fails (e.g. invalid UTF-8 input).
 */
wchar_t* utf8_to_wbuf(const std::string& utf8string)
{
    iconv_t utf82wstr = iconv_open("UCS-4LE","UTF-8");

    if (utf82wstr == (iconv_t)-1)
        throw std::runtime_error("iconv can't convert from UTF-8 to UCS-4");

    size_t in_size = utf8string.size();
    // one wide character per input byte is the worst case; plus one for the terminator
    const size_t buf_size = (in_size+1)*sizeof(wchar_t);
    size_t out_size = buf_size;

    wchar_t *buf = (wchar_t *)malloc(buf_size);
    if (buf == NULL)
    {
        iconv_close(utf82wstr); // don't leak the descriptor
        throw std::runtime_error("out of memory for iconv buffer");
    }

    char *in = const_cast<char *>(utf8string.c_str());
    char *out = (char*) buf;
    const size_t rc = iconv(utf82wstr, &in, &in_size, &out, &out_size);
    iconv_close(utf82wstr);

    if (rc == (size_t)-1)
    {
        free(buf); // nobody else could release it once we throw
        throw std::runtime_error("error converting char encodings");
    }

    // terminate behind the last character written
    buf[ (buf_size - out_size) / sizeof(wchar_t) ] = 0;

    return buf;
}
770
/**
 * @brief converts a string in the modified UTF-7 encoding used by IMAP to UTF-8.
 * @param utf7imapstring the "UTF-7-IMAP" encoded input.
 * @return the UTF-8 encoded string. The result of iconv() is not checked; if the
 *         conversion stops early, the part converted so far is returned.
 * @throw std::runtime_error if the iconv converter can't be opened (e.g. the iconv
 *        implementation doesn't know "UTF-7-IMAP") or the conversion buffer can't
 *        be allocated.
 */
std::string utf7imap_to_utf8(const std::string& utf7imapstring)
{
    iconv_t utf7imap2utf8 = iconv_open("UTF-8","UTF-7-IMAP");

    if (utf7imap2utf8 == (iconv_t)-1)
        throw std::runtime_error("iconv can't convert from UTF-7-IMAP to UTF-8");

    size_t in_size = utf7imapstring.size();
    const size_t buf_size = in_size*4;
    size_t out_size = buf_size;

    char *buf = (char *)malloc(buf_size+1);
    if (buf == NULL)
    {
        iconv_close(utf7imap2utf8); // don't leak the descriptor
        throw std::runtime_error("out of memory for iconv buffer");
    }

    char *in = const_cast<char *>(utf7imapstring.c_str());
    char *out = buf;
    iconv(utf7imap2utf8, &in, &in_size, &out, &out_size);
    iconv_close(utf7imap2utf8);

    std::string result;
    try
    {
        // take the number of bytes written (instead of relying on a NUL terminator),
        // so that NUL bytes within the data don't cut the result short.
        result.assign(buf, buf_size - out_size);
    }
    catch (...)
    {
        free(buf);
        throw;
    }
    free(buf);

    return result;
}
800
6a2b6dd1
TJ
/**
 * @brief converts a UTF-8 encoded string to the modified UTF-7 encoding used by IMAP.
 * @param utf8string the UTF-8 encoded input.
 * @return the "UTF-7-IMAP" encoded string. The result of iconv() is not checked; if
 *         the conversion stops early, the part converted so far is returned.
 * @throw std::runtime_error if the iconv converter can't be opened (e.g. the iconv
 *        implementation doesn't know "UTF-7-IMAP") or the conversion buffer can't
 *        be allocated.
 */
std::string utf8_to_utf7imap(const std::string& utf8string)
{
    iconv_t utf82utf7imap = iconv_open("UTF-7-IMAP", "UTF-8");

    if (utf82utf7imap == (iconv_t)-1)
        throw std::runtime_error("iconv can't convert from UTF-8 to UTF-7-IMAP");

    // UTF-7 is base64 encoded, a buffer 10x as large
    // as the utf-8 buffer should be enough. If not the string will be truncated.
    size_t in_size = utf8string.size();
    const size_t buf_size = in_size*10;
    size_t out_size = buf_size;

    char *buf = (char *)malloc(buf_size+1);
    if (buf == NULL)
    {
        iconv_close(utf82utf7imap); // don't leak the descriptor
        throw std::runtime_error("out of memory for iconv buffer");
    }

    char *in = const_cast<char *>(utf8string.c_str());
    char *out = buf;
    iconv(utf82utf7imap, &in, &in_size, &out, &out_size);
    // UTF-7 is a stateful encoding: a call without input makes iconv write the
    // sequence which returns to the initial state (it terminates a base64 run
    // which is still open at the end of the input).
    iconv(utf82utf7imap, NULL, NULL, &out, &out_size);
    iconv_close(utf82utf7imap);

    std::string result;
    try
    {
        // take the number of bytes written (instead of relying on a NUL terminator),
        // so that NUL bytes within the data don't cut the result short.
        result.assign(buf, buf_size - out_size);
    }
    catch (...)
    {
        free(buf);
        throw;
    }
    free(buf);

    return result;
}
832
118e216e
TJ
/**
 * @brief cuts a string into tag and non-tag pieces.
 *
 * A tag piece starts at a '<' and extends up to (and including) the next '>'.
 * Everything else - including a '<' which is not closed before the next '<' or
 * the end of the input - is handed out as non-tag piece.
 *
 * @param[out] tokenized the pieces are appended here in input order; the bool of
 *             each pair is @a true for tag pieces.
 * @param input the string to tokenize.
 */
void tokenize_by_tag(std::vector<std::pair<std::string,bool> > &tokenized, const std::string &input)
{
    std::string pending;   // piece which is currently collected
    bool in_tag = false;   // pending started with a '<' which is not closed yet

    for (std::string::const_iterator ch = input.begin(); ch != input.end(); ++ch)
    {
        if (*ch == '<')
        {
            // a (new) tag begins: whatever was collected so far does not count as tag
            if (!pending.empty() )
            {
                tokenized.push_back( std::make_pair(pending, false) );
                pending.clear();
            }
            in_tag = true;
            pending += *ch;
            continue;
        }

        pending += *ch;
        if (*ch == '>' && in_tag)
        {
            // tag complete
            tokenized.push_back( std::make_pair(pending, true) );
            pending.clear();
            in_tag = false;
        }
    }

    // leftover at the end of the input
    if (!pending.empty() )
        tokenized.push_back( std::make_pair(pending, false) );
} // eo tokenize_by_tag
118e216e 872
118e216e
TJ
873
874std::string strip_html_tags(const std::string &input)
875{
6ab3bc95
RP
876 // Pair first: string, second: isTag
877 vector<pair<string,bool> > tokenized;
878 tokenize_by_tag (tokenized, input);
118e216e 879
6ab3bc95
RP
880 string output;
881 vector<pair<string,bool> >::const_iterator token, tokens_end = tokenized.end();
83d700e9 882 for (token = tokenized.begin(); token != tokens_end; ++token)
6ab3bc95
RP
883 if (!token->second)
884 output += token->first;
885
886 return output;
887} // eo strip_html_tags
118e216e 888
118e216e
TJ
889
890// Smart-encode HTML en
891string smart_html_entities(const std::string &input)
892{
6ab3bc95
RP
893 // Pair first: string, second: isTag
894 vector<pair<string,bool> > tokenized;
895 tokenize_by_tag (tokenized, input);
896
897 string output;
898 vector<pair<string,bool> >::const_iterator token, tokens_end = tokenized.end();
83d700e9 899 for (token = tokenized.begin(); token != tokens_end; ++token)
6ab3bc95
RP
900 {
901 // keep HTML tags as they are
902 if (token->second)
903 output += token->first;
904 else
905 output += html_entities(token->first);
906 }
907
908 return output;
118e216e
TJ
909}
910
6ab3bc95 911
a5f3af6e
GE
/**
 * @brief looks for the first byte of a string which is outside the 7 bit ASCII range.
 * @param str the string to examine.
 * @return the index of the first byte with a value above 127, or
 *         std::string::npos if there is none.
 */
std::string::size_type find_8bit(const std::string &str)
{
    for (std::string::const_iterator ch = str.begin(); ch != str.end(); ++ch)
    {
        // bytes above 127 are exactly those which have the top bit set
        if ( (*ch & 0x80) != 0 )
            return static_cast<std::string::size_type>(ch - str.begin());
    }

    return std::string::npos;
}
921
118e216e
TJ
922// encoded UTF-8 chars into HTML entities
923string html_entities(std::string str)
924{
6ab3bc95
RP
925 // Normal chars
926 replace_all (str, "&", "&amp;");
6ab3bc95
RP
927 replace_all (str, "<", "&lt;");
928 replace_all (str, ">", "&gt;");
980577e1
TJ
929 replace_all (str, "\"", "&quot;");
930 replace_all (str, "'", "&#x27;");
931 replace_all (str, "/", "&#x2F;");
6ab3bc95
RP
932
933 // Umlauts
934 replace_all (str, "\xC3\xA4", "&auml;");
935 replace_all (str, "\xC3\xB6", "&ouml;");
936 replace_all (str, "\xC3\xBC", "&uuml;");
937 replace_all (str, "\xC3\x84", "&Auml;");
938 replace_all (str, "\xC3\x96", "&Ouml;");
939 replace_all (str, "\xC3\x9C", "&Uuml;");
940
941 // Misc
942 replace_all (str, "\xC3\x9F", "&szlig;");
943
944 // conversion of remaining non-ASCII chars needed?
945 // just do if needed because of performance
946 if (find_8bit(str) != string::npos)
947 {
948 // convert to fixed-size encoding UTF-32
949 wchar_t* wbuf=utf8_to_wbuf(str);
950 ostringstream target;
951
952 // replace all non-ASCII chars with HTML representation
953 for (int p=0; wbuf[p] != 0; p++)
954 {
955 unsigned int c=wbuf[p];
956
957 if (c <= 127)
958 target << static_cast<unsigned char>(c);
959 else
960 target << "&#" << c << ';';
961 }
962
963 free(wbuf);
964
965 str=target.str();
966 }
967
968 return str;
969} // eo html_entities(std::string)
970
554f813d
GE
971// convert HTML entities to something that can be viewed on a basic text console (restricted to ASCII-7)
972string html_entities_to_console(std::string str)
973{
974 // Normal chars
975 replace_all (str, "&amp;", "&");
976 replace_all (str, "&lt;", "<");
977 replace_all (str, "&gt;", ">");
978 replace_all (str, "&quot;", "\"");
979 replace_all (str, "&#x27;", "'");
980 replace_all (str, "&#x2F;", "/");
981
982 // Umlauts
983 replace_all (str, "&auml;", "ae");
984 replace_all (str, "&ouml;", "oe");
985 replace_all (str, "&uuml;", "ue");
986 replace_all (str, "&Auml;", "Ae");
987 replace_all (str, "&Ouml;", "Oe");
988 replace_all (str, "&Uuml;", "Ue");
989
990 // Misc
991 replace_all (str, "&szlig;", "ss");
992
993 return str;
994}
118e216e 995
3f5c5ccd
CH
996// find_html_comments + remove_html_comments(str, comments)
997void remove_html_comments(string &str)
998{
46dd1321 999 vector<CommentZone> comments = find_html_comments(str);
3f5c5ccd
CH
1000 remove_html_comments(str, comments);
1001}
1002
1003// find all html comments, behaving correctly if they are nested; ignores comment tags ("<!--FOO .... BAR-->")
1004// If there are invalid comments ("-->" before "<!--" or different number of closing and opening tags),
1005// then the unknown index of corresponding start/end tag will be represented by a string::npos
1006// Indices are from start of start tag until first index after closing tag
46dd1321 1007vector<CommentZone> find_html_comments(const std::string &str)
3f5c5ccd
CH
1008{
1009 static const string START = "<!--";
1010 static const string CLOSE = "-->";
1011 static const string::size_type START_LEN = START.length();
1012 static const string::size_type CLOSE_LEN = CLOSE.length();
1013
46dd1321
TJ
1014 vector<CommentZone> comments;
1015
3f5c5ccd
CH
1016 // in order to find nested comments, need either recursion or a stack
1017 vector<string::size_type> starts; // stack of start tags
1018
1019 string::size_type pos = 0;
1020 string::size_type len = str.length();
1021 string::size_type next_start, next_close;
1022
1023 while (pos < len) // not really needed but just in case
1024 {
1025 next_start = str.find(START, pos);
1026 next_close = str.find(CLOSE, pos);
1027
1028 if ( (next_start == string::npos) && (next_close == string::npos) )
1029 break; // we are done
1030
1031 else if ( (next_start == string::npos) || (next_close < next_start) ) // close one comment (pop)
1032 {
1033 if (starts.empty()) // closing tag without a start
1034 comments.push_back(CommentZone(string::npos, next_close+CLOSE_LEN));
1035 else
1036 {
1037 comments.push_back(CommentZone(starts.back(), next_close+CLOSE_LEN));
1038 starts.pop_back();
1039 }
1040 pos = next_close + CLOSE_LEN;
1041 }
1042
1043 else if ( (next_close == string::npos) || (next_start < next_close) ) // start a new comment (push)
1044 {
1045 starts.push_back(next_start);
1046 pos = next_start + START_LEN;
1047 }
1048 }
1049
1050 // add comments that have no closing tag from back to front (important for remove_html_comments!)
1051 while (!starts.empty())
1052 {
1053 comments.push_back(CommentZone(starts.back(), string::npos));
1054 starts.pop_back();
1055 }
46dd1321
TJ
1056
1057 return comments;
3f5c5ccd
CH
1058}
1059
1060// remove all html comments foundby find_html_comments
1061void remove_html_comments(std::string &str, const vector<CommentZone> &comments)
1062{
1063 // remember position where last removal started
1064 string::size_type last_removal_start = str.length();
1065
1066 // Go from back to front to not mess up indices.
1067 // This requires that bigger comments, that contain smaller comments, come AFTER
1068 // the small contained comments in the comments vector (i.e. comments are ordered by
1069 // their closing tag, not their opening tag). This is true for results from find_html_comments
1070 BOOST_REVERSE_FOREACH(const CommentZone &comment, comments)
1071 {
1072 if (comment.first == string::npos)
1073 {
1074 str = str.replace(0, comment.second, ""); // comment starts "before" str --> delete from start
1075 break; // there can be no more
1076 }
1077 else if (comment.first >= last_removal_start)
1078 {
1079 continue; // this comment is inside another comment that we have removed already
1080 }
1081 else if (comment.second == string::npos) // comment ends "after" str --> delete until end
1082 {
1083 str = str.replace(comment.first, string::npos, "");
1084 last_removal_start = comment.first;
1085 }
1086 else
1087 {
1088 str = str.replace(comment.first, comment.second-comment.first, "");
1089 last_removal_start = comment.first;
1090 }
1091 }
1092}
1093
e93545dd
GE
1094bool replace_all(string &base, const char *ist, const char *soll)
1095{
6ab3bc95
RP
1096 string i=ist;
1097 string s=soll;
1098 return replace_all(base,&i,&s);
e93545dd
GE
1099}
1100
1101bool replace_all(string &base, const string &ist, const char *soll)
1102{
6ab3bc95
RP
1103 string s=soll;
1104 return replace_all(base,&ist,&s);
e93545dd
GE
1105}
1106
1107bool replace_all(string &base, const string *ist, const string *soll)
1108{
6ab3bc95 1109 return replace_all(base,*ist,*soll);
e93545dd
GE
1110}
1111
1112bool replace_all(string &base, const char *ist, const string *soll)
1113{
6ab3bc95
RP
1114 string i=ist;
1115 return replace_all(base,&i,soll);
e93545dd
GE
1116}
1117
/**
 * @brief replaces all occurrences of a string within another string
 * @param base the string to modify in place
 * @param ist the text to search for; must not be empty
 * @param soll the text to insert instead
 * @return true if at least one replacement was done
 * @throws std::runtime_error if @a ist is empty
 *
 * The search continues behind the inserted text, so occurrences of @a ist
 * inside @a soll are not replaced again.
 */
bool replace_all(std::string &base, const std::string &ist, const std::string &soll)
{
    if (ist.empty())
        throw std::runtime_error("replace_all called with empty search string");

    bool replaced_any = false;

    for (std::string::size_type hit = base.find(ist);
         hit != std::string::npos;
         hit = base.find(ist, hit + soll.size()))
    {
        base.replace(hit, ist.size(), soll);
        replaced_any = true;
    }

    return replaced_any;
}
1135
b953bf36
GE
/**
 * @brief replaces all characters that could be problematic or a security risk when being logged
 * @param str the original string
 * @param replace_with the character that takes the place of every unsafe character
 * @return a copy of @a str that is safe to send to syslog or other logfiles
 *
 * Only the characters from 0x20 (space) up to and including 0x7E (~) are considered safe.
 * See e.g. RFC 5424, section 8.2 or the posix character class "printable".
 * Everything else (NUL, control characters, 8 bit characters and therefore
 * all UTF-8 multibyte sequences) is replaced.
 */
std::string sanitize_for_logging(const std::string &str, const char replace_with)
{
    std::string cleaned(str);

    for (std::string::iterator it = cleaned.begin(); it != cleaned.end(); ++it)
    {
        const char current = *it;
        const bool is_safe = (current >= 0x20) && (current <= 0x7E);
        if (!is_safe)
            *it = replace_with;
    }

    return cleaned;
}
1158
e5b21dbb 1159#if 0
e93545dd
GE
/**
 * @brief returns a lower case copy of a string (currently compiled out by the surrounding "#if 0")
 * @param src the original string
 * @return copy of @a src with tolower() applied to every character
 */
std::string to_lower(const std::string &src)
{
    std::string lowered(src);

    for (std::string::iterator ch = lowered.begin(); ch != lowered.end(); ++ch)
        *ch = tolower(*ch);

    return lowered;
}
1170
/**
 * @brief returns an upper case copy of a string (currently compiled out by the surrounding "#if 0")
 * @param src the original string
 * @return copy of @a src with toupper() applied to every character
 */
std::string to_upper(const std::string &src)
{
    std::string raised(src);

    for (std::string::iterator ch = raised.begin(); ch != raised.end(); ++ch)
        *ch = toupper(*ch);

    return raised;
}
e5b21dbb 1181#endif
e93545dd 1182
83809f5e 1183const int MAX_UNIT_FORMAT_SYMBOLS = 6;
d1ea9075 1184
2cb9a9c5 1185const string shortUnitFormatSymbols[MAX_UNIT_FORMAT_SYMBOLS] = {
d1ea9075
GMF
1186 " B",
1187 " KB",
1188 " MB",
1189 " GB",
1190 " TB",
83809f5e 1191 " PB"
d1ea9075
GMF
1192};
1193
2cb9a9c5 1194const string longUnitFormatSymbols[MAX_UNIT_FORMAT_SYMBOLS] = {
5cb766b9
GMF
1195 i18n_noop(" Bytes"),
1196 i18n_noop(" KBytes"),
1197 i18n_noop(" MBytes"),
1198 i18n_noop(" GBytes"),
1199 i18n_noop(" TBytes"),
83809f5e 1200 i18n_noop(" PBytes")
d1ea9075
GMF
1201};
1202
72a94426 1203
/**
 * @brief rounds a number "half up" to a fixed fraction
 * @param number the value to round
 * @param rounding_multiplier number of steps per unit, e.g. 10 rounds to one decimal place
 * @return @a number rounded to the nearest multiple of 1/rounding_multiplier;
 *         values exactly half way between two steps are rounded towards +infinity
 *
 * floor() is used on purpose instead of a cast to an integer type: such a cast
 * truncates towards zero, which rounds negative values wrongly (-5 turned into -4.9),
 * and it overflows for values outside of the integer range.
 */
static long double rounding_upwards(
        const long double number,
        const int rounding_multiplier
)
{
    const long double scaled = number * rounding_multiplier + 0.5L;

    return std::floor(scaled) / static_cast<long double>(rounding_multiplier);
}
1217
1218
81267544
GMF
1219string nice_unit_format(
1220 const int64_t input,
70fc0674
GMF
1221 const UnitFormat format,
1222 const UnitBase base
81267544 1223)
6ab3bc95 1224{
d1ea9075 1225 // select the system of units (decimal or binary)
81267544 1226 int multiple = 0;
a398513a 1227 if (base == UnitBase1000)
81267544
GMF
1228 {
1229 multiple = 1000;
1230 }
1231 else
1232 {
1233 multiple = 1024;
1234 }
1235
1236 long double size = input;
6ab3bc95 1237
d1ea9075
GMF
1238 // check the size of the input number to fit in the appropriate symbol
1239 int sizecount = 0;
81267544 1240 while (size > multiple)
6ab3bc95 1241 {
81267544
GMF
1242 size = size / multiple;
1243 sizecount++;
83809f5e
GMF
1244
1245 // rollback to the previous values and stop the loop when cannot
1246 // represent the number length.
1247 if (sizecount >= MAX_UNIT_FORMAT_SYMBOLS)
1248 {
1249 size = size * multiple;
1250 sizecount--;
1251 break;
1252 }
6ab3bc95
RP
1253 }
1254
a398513a
GMF
1255 // round the input number "half up" to multiples of 10
1256 const int rounding_multiplier = 10;
72a94426 1257 size = rounding_upwards(size, rounding_multiplier);
6ab3bc95 1258
d1ea9075 1259 // format the input number, placing the appropriate symbol
6ab3bc95 1260 ostringstream out;
6ab3bc95 1261 out.setf (ios::fixed);
a398513a 1262 if (format == ShortUnitFormat)
d1ea9075
GMF
1263 {
1264 out.precision(1);
68d37a5c 1265 out << size << i18n( shortUnitFormatSymbols[sizecount].c_str() );
d1ea9075
GMF
1266 }
1267 else
6ab3bc95 1268 {
d1ea9075 1269 out.precision (2);
68d37a5c 1270 out << size << i18n( longUnitFormatSymbols[sizecount].c_str() );
6ab3bc95
RP
1271 }
1272
1273 return out.str();
1274} // eo nice_unit_format(int input)
1275
e93545dd 1276
5cd64148
CH
1277string nice_unit_format(
1278 const double input,
1279 const UnitFormat format,
1280 const UnitBase base
1281)
1282{
1283 // round as double and cast to int64_t
1284 // cast raised overflow error near max val of int64_t (~9.2e18, see unittest)
1285 int64_t input_casted_and_rounded =
1286 boost::numeric_cast<int64_t>( round(input) );
1287
1288 // now call other
1289 return nice_unit_format( input_casted_and_rounded, format, base );
1290} // eo nice_unit_format(double input)
1291
1292
47c07fba
GE
/**
 * @brief converts a string into its escaped, double quoted representation
 * @param s the original string
 * @return @a s enclosed in double quotes, with
 *         - a backslash put in front of every double quote and every backslash
 *         - every carriage return replaced by the two characters \r
 *         - every line feed replaced by the two characters \n
 */
std::string escape(const std::string &s)
{
    std::string out;
    out += '"';

    for (std::string::const_iterator it = s.begin(); it != s.end(); ++it)
    {
        switch (*it)
        {
            case '"':
            case '\\':
                out += '\\';
                out += *it;
                break;
            case '\r':
                out += "\\r";
                break;
            case '\n':
                out += "\\n";
                break;
            default:
                out += *it;
        }
    }

    out += '"';
    return out;
} // eo escape(const std::string&)
47c07fba 1323
47c07fba 1324
6ab3bc95
RP
/**
 * @brief extracts and unescapes a double quoted string that starts at a given position
 * @param s the text that contains the quoted string
 * @param startpos index of the opening double quote within @a s
 * @param[out] endpos is set to startpos + (offset of the closing quote behind the opening quote) + 1,
 *             which is the index of the closing double quote within @a s when one is found
 * @return the content between the quotes: \r and \n sequences are turned into carriage
 *         return and line feed, from any other backslash sequence only the backslash is removed
 * @throws std::out_of_range if @a startpos is not inside @a s, if the character at
 *         @a startpos is not a double quote, or if the content ends with a single backslash
 */
std::string descape(const std::string &s, int startpos, int &endpos)
{
    if (s.at(startpos) != '"')
        throw std::out_of_range("value not type escaped string");

    std::string out = s.substr(startpos + 1);

    // search for the closing quote: it is the first quote that is
    // preceded by an even number of backslashes (i.e. not escaped itself)
    std::string::size_type quote = out.find('"');
    while (quote != std::string::npos)
    {
        std::string::size_type backslashes = 0;
        while (backslashes < quote && out[quote - 1 - backslashes] == '\\')
            ++backslashes;

        if (backslashes % 2 == 0)
            break;

        quote = out.find('"', quote + 1);
    }

    // cut off everything from the closing quote on (nothing if there is none)
    if (quote != std::string::npos)
        out.erase(quote);

    // tell the caller about the end position
    endpos = startpos + quote + 1;

    // resolve all backslash sequences inside the string now
    for (std::string::size_type slash = out.find('\\');
         slash != std::string::npos;
         slash = out.find('\\', slash + 1))
    {
        const char code = out.at(slash + 1);

        if (code == 'r')
            out.replace(slash, 2, "\r");
        else if (code == 'n')
            out.replace(slash, 2, "\n");
        else
            out.erase(slash, 1);    // keep the escaped character, the loop continues behind it
    }

    return out;
} // eo descape(const std::string&,int,int&)
47c07fba 1384
e93545dd 1385
47c07fba
GE
/**
 * @brief quotes a string so that it can be used as a single shell argument
 * @param input the original string
 * @return @a input enclosed in single quotes; every single quote inside of it
 *         is replaced by the sequence '\'' (close quote, escaped quote, reopen quote)
 */
std::string escape_shellarg(const std::string &input)
{
    std::string quoted(1, '\'');

    for (std::string::size_type i = 0; i < input.size(); ++i)
    {
        const char current = input[i];

        if (current == '\'')
            quoted += "'\\''";
        else
            quoted += current;
    }

    quoted += '\'';
    return quoted;
}