libi2ncommon: (reinhard) modified stringfunc (and some others) according to our new...
[libi2ncommon] / src / stringfunc.cpp
CommitLineData
6a93d84a
TJ
1/** @file
2 *
3 * (c) Copyright 2007-2008 by Intra2net AG
6ab3bc95 4 *
6a93d84a
TJ
5 * info@intra2net.com
6 */
e93545dd
GE
7
8#include <iostream>
9#include <string>
10#include <sstream>
11#include <stdexcept>
12
a5f3af6e 13#include <wchar.h>
e93545dd
GE
14#include <stdlib.h>
15#include <iconv.h>
16#include <i18n.h>
17
18#include <stringfunc.hxx>
19
20using namespace std;
21
6ab3bc95
RP
22namespace I2n
23{
6a93d84a
TJ
24
25
6ab3bc95
RP
namespace
{

/// Lookup tables mapping a nibble value (0..15) to its hex digit character.
const std::string hexDigitsLower("0123456789abcdef");
const std::string hexDigitsUpper("0123456789ABCDEF");


/**
 * @brief functor which converts a single character to upper case.
 *
 * The character is passed to std::toupper as unsigned char value; handing over a
 * negative plain char (i.e. any byte >= 0x80 where char is signed) is undefined.
 */
struct UpperFunc
{
    char operator() (char c) const
    {
        return static_cast<char>( std::toupper( static_cast<unsigned char>(c) ) );
    }
}; // eo struct UpperFunc


/**
 * @brief functor which converts a single character to lower case.
 *
 * The character is passed to std::tolower as unsigned char value; handing over a
 * negative plain char (i.e. any byte >= 0x80 where char is signed) is undefined.
 */
struct LowerFunc
{
    char operator() (char c) const
    {
        return static_cast<char>( std::tolower( static_cast<unsigned char>(c) ) );
    }
}; // eo struct LowerFunc


} // eo namespace <anonymous>
52
53
54
/**
 * default set of whitespace characters: blank, tab, carriage return and line feed.
 */
const std::string Whitespaces(" \t\r\n");

/**
 * default set of line ending characters: carriage return and line feed.
 */
const std::string LineEndings("\r\n");
6a93d84a
TJ
64
65
66
/**
 * @brief checks if a string begins with a given prefix.
 * @param str the string which is tested.
 * @param prefix the prefix which should be tested for.
 * @return @a true iff the prefix is not empty and the string begins with that prefix.
 */
bool has_prefix(const std::string& str, const std::string& prefix)
{
    const std::string::size_type prefix_len = prefix.size();
    // an empty prefix never matches, and a prefix longer than the string cannot match
    // (this also covers the empty string, since prefix_len is > 0 at that point).
    if (prefix_len == 0 || prefix_len > str.size() )
    {
        return false;
    }
    return str.compare(0, prefix_len, prefix) == 0;
} // eo has_prefix(const std::string&,const std::string&)
6a93d84a
TJ
81
82
/**
 * @brief checks if a string ends with a given suffix.
 * @param str the string which is tested.
 * @param suffix the suffix which should be tested for.
 * @return @a true iff the suffix is not empty and the string ends with that suffix.
 */
bool has_suffix(const std::string& str, const std::string& suffix)
{
    const std::string::size_type suffix_len = suffix.size();
    // an empty suffix never matches, and a suffix longer than the string cannot match
    // (this also covers the empty string, since suffix_len is > 0 at that point).
    if (suffix_len == 0 || suffix_len > str.size() )
    {
        return false;
    }
    return str.compare(str.size() - suffix_len, suffix_len, suffix) == 0;
} // eo has_suffix(const std::string&,const std::string&)
6a93d84a
TJ
97
98
/**
 * @brief removes characters of a given list from the front and the end of a string (in place).
 * @param[in,out] str the string which should be trimmed.
 * @param charlist the list of characters to remove from beginning and end of the string.
 * @return a copy of the trimmed string.
 */
std::string trim_mod(std::string& str, const std::string& charlist)
{
    const std::string::size_type first_kept = str.find_first_not_of(charlist);
    if (first_kept == std::string::npos)
    {
        // nothing to keep: the string is empty or consists of charlist characters only.
        str.clear();
        return str;
    }
    // there is at least one character to keep, so this search cannot fail:
    const std::string::size_type last_kept = str.find_last_not_of(charlist);
    // cut the tail first; the head positions stay valid that way.
    str.erase(last_kept + 1);
    str.erase(0, first_kept);
    return str;
} // eo trim_mod(std::string&,const std::string&)
6a93d84a
TJ
128
129
130
/**
 * @brief removes the last character of a string if it is contained in a given list (in place).
 * @param[in,out] str the string.
 * @param what the list of characters which are removed when found at the end.
 * @return a copy of the resulting string (last character removed if applicable).
 *
 * At most one character is removed.
 */
std::string chomp_mod(std::string& str, const std::string& what)
{
    if (!str.empty() && !what.empty() )
    {
        const char last_char = *str.rbegin();
        if (what.find(last_char) != std::string::npos)
        {
            str.erase(str.size() - 1);
        }
    }
    return str;
} // eo chomp_mod(std::string&,const std::string&)
6a93d84a
TJ
149
150
151/**
152 * @brief converts a string to lower case.
153 * @param[in,out] str the string to modify.
154 * @return the string
155 */
6ab3bc95 156std::string to_lower_mod(std::string& str)
6a93d84a 157{
6ab3bc95
RP
158 std::transform(str.begin(), str.end(), str.begin(), LowerFunc() );
159 return str;
160} // eo to_lower_mod(std::string&)
6a93d84a
TJ
161
162
163/**
164 * @brief converts a string to upper case.
165 * @param[in,out] str the string to modify.
166 * @return the string
167 */
6ab3bc95 168std::string to_upper_mod(std::string& str)
6a93d84a 169{
6ab3bc95
RP
170 std::transform( str.begin(), str.end(), str.begin(), UpperFunc() );
171 return str;
172} // eo to_upper_mod(std::string&)
6a93d84a
TJ
173
174
175
/**
 * @brief returns a copy of a string with characters of a given list removed from front and end.
 * @param str the string which should be trimmed.
 * @param charlist the list of characters to remove from beginning and end of the string.
 * @return the trimmed string.
 */
std::string trim (const std::string& str, const std::string& charlist)
{
    const std::string::size_type first_kept = str.find_first_not_of(charlist);
    if (first_kept != std::string::npos)
    {
        // at least one character survives, so the search from the end succeeds, too.
        const std::string::size_type last_kept = str.find_last_not_of(charlist);
        return std::string(str, first_kept, last_kept - first_kept + 1);
    }
    // the string is empty or consists of charlist characters only.
    return std::string();
} // eo trim(const std::string&,const std::string&)
195
196
/**
 * @brief returns a copy of a string with the last character removed if it is in a given list.
 * @param str the string.
 * @param what the list of characters which are removed when found at the end.
 * @return the resulting string (last character removed if applicable).
 *
 * At most one character is removed.
 */
std::string chomp (const std::string& str, const std::string& what)
{
    const bool removable =
        !str.empty()
        && !what.empty()
        && what.find( *str.rbegin() ) != std::string::npos;
    return removable ? std::string(str, 0, str.size() - 1) : str;
} // eo chomp(const std::string&,const std::string&)
215
216
217/**
218 * @brief returns a lower case version of a given string.
219 * @param str the string
220 * @return the lower case version of the string
221 */
6ab3bc95 222std::string to_lower (const std::string& str)
6a93d84a 223{
6ab3bc95
RP
224 std::string result(str);
225 return to_lower_mod(result);
226} // eo to_lower(const std::string&)
6a93d84a
TJ
227
228
229/**
230 * @brief returns a upper case version of a given string.
231 * @param str the string
232 * @return the upper case version of the string
233 */
6ab3bc95 234std::string to_upper(const std::string& str)
6a93d84a 235{
6ab3bc95
RP
236 std::string result(str);
237 return to_upper_mod(result);
238} // eo to_upper(const std::string&)
6a93d84a
TJ
239
240
241
242/**
243 * @brief removes a given suffix from a string.
244 * @param str the string.
245 * @param suffix the suffix which should be removed if the string ends with it.
246 * @return the string without the suffix.
247 *
248 * If the string ends with the suffix, it is removed. If the the string doesn't end
249 * with the suffix the original string is returned.
250 */
6ab3bc95 251std::string remove_suffix(const std::string& str, const std::string& suffix)
6a93d84a 252{
6ab3bc95
RP
253 if (has_suffix(str,suffix) )
254 {
255 return str.substr(0, str.size()-suffix.size() );
256 }
257 return str;
258} // eo remove_suffix(const std::string&,const std::string&)
6a93d84a
TJ
259
260
261
262/**
263 * @brief removes a given prefix from a string.
264 * @param str the string.
265 * @param prefix the prefix which should be removed if the string begins with it.
266 * @return the string without the prefix.
267 *
268 * If the string begins with the prefix, it is removed. If the the string doesn't begin
269 * with the prefix the original string is returned.
270 */
6ab3bc95 271std::string remove_prefix(const std::string& str, const std::string& prefix)
6a93d84a 272{
6ab3bc95
RP
273 if (has_prefix(str,prefix) )
274 {
275 return str.substr( prefix.size() );
276 }
277 return str;
278} // eo remove_prefix(const std::string&,const std::string&)
6a93d84a
TJ
279
280
281/**
282 * split a string to key and value delimited by a given delimiter.
6ab3bc95 283 * The resulting key and value strings are trimmed (Whitespaces removed at beginning and end).
6a93d84a
TJ
284 * @param str the string which should be splitted.
285 * @param[out] key the resulting key
286 * @param[out] value the resulting value
287 * @param delimiter the delimiter between key and value; default is '='.
288 * @return @a true if the split was successful.
289 */
6ab3bc95
RP
290bool pair_split(
291 const std::string& str,
292 std::string& key,
293 std::string& value,
294 char delimiter)
295{
296 std::string::size_type pos = str.find (delimiter);
297 if (pos == std::string::npos) return false;
298 key= str.substr(0,pos);
299 value= str.substr(pos+1);
300 trim_mod(key);
301 trim_mod(value);
302 return true;
303} // eo pair_split(const std::string&,std::string&,std::string&,char)
6a93d84a
TJ
304
305
306/**
307 * splits a string by given delimiter
308 *
309 * @param[in] str the string which should be splitted.
310 * @param[out] result the list resulting from splitting @a str.
311 * @param[in] delimiter the delimiter (word/phrase) at which @a str should be splitted.
312 * @param[in] omit_empty should empty parts not be stored?
313 * @param[in] trim_list list of characters the parts should be trimmed by.
314 * (empty string results in no trim)
315 */
6ab3bc95
RP
316void split_string(
317 const std::string& str,
318 std::list<std::string>& result,
319 const std::string& delimiter,
320 bool omit_empty,
321 const std::string& trim_list
6a93d84a
TJ
322)
323{
6ab3bc95
RP
324 std::string::size_type pos, last_pos=0;
325 bool delimiter_found= false;
326 while ( last_pos < str.size() && last_pos != std::string::npos)
327 {
328 pos= str.find(delimiter, last_pos);
329 std::string part;
330 if (pos == std::string::npos)
331 {
332 part= str.substr(last_pos);
333 delimiter_found= false;
334 }
335 else
336 {
337 part= str.substr(last_pos, pos-last_pos);
338 delimiter_found=true;
339 }
340 if (pos != std::string::npos)
341 {
342 last_pos= pos+ delimiter.size();
343 }
344 else
345 {
346 last_pos= std::string::npos;
347 }
348 if (!trim_list.empty() ) trim_mod (part, trim_list);
349 if (omit_empty && part.empty() ) continue;
350 result.push_back( part );
351 }
352 // if the string ends with a delimiter we need to append an empty string if no omit_empty
353 // was given.
354 // (this way we keep the split result consistent to a join operation)
355 if (delimiter_found && !omit_empty)
356 {
357 result.push_back("");
358 }
359} // eo split_string(const std::string&,std::list< std::string >&,const std::string&,bool,const std::string&)
6a93d84a
TJ
360
361
362/**
363 * splits a string by a given delimiter
364 * @param str the string which should be splitted.
365 * @param delimiter delimiter the delimiter (word/phrase) at which @a str should be splitted.
366 * @param[in] omit_empty should empty parts not be stored?
367 * @param[in] trim_list list of characters the parts should be trimmed by.
368 * (empty string results in no trim)
369 * @return the list resulting from splitting @a str.
370 */
6ab3bc95
RP
371std::list<std::string> split_string(
372 const std::string& str,
373 const std::string& delimiter,
374 bool omit_empty,
375 const std::string& trim_list
6a93d84a
TJ
376)
377{
6ab3bc95
RP
378 std::list<std::string> result;
379 split_string(str, result, delimiter, omit_empty, trim_list);
380 return result;
381} // eo split_string(const std::string&,const std::string&,bool,const std::string&)
6a93d84a
TJ
382
383
/**
 * @brief joins a list of strings into a single string.
 *
 * This function is (basically) the reverse operation of @a split_string.
 *
 * @param parts the list of strings.
 * @param delimiter the delimiter which is inserted between the strings.
 * @return the joined string (empty if the list is empty).
 */
std::string join_string(
    const std::list< std::string >& parts,
    const std::string& delimiter
)
{
    std::string joined;
    for (std::list< std::string >::const_iterator part= parts.begin();
         part != parts.end();
         ++part)
    {
        // the delimiter goes between the parts only, not in front of the first one.
        if (part != parts.begin() )
        {
            joined+= delimiter;
        }
        joined+= *part;
    }
    return joined;
} // eo join_string(const std::list< std::string >&,const std::string&)
6a93d84a
TJ
411
412
413
414/*
415** conversions
416*/
417
418
419/**
420 * @brief returns a hex string from a binary string.
421 * @param str the (binary) string
422 * @param upper_case_digits determine whether to use upper case characters for digits A-F.
423 * @return the string in hex notation.
424 */
6ab3bc95
RP
425std::string convert_binary_to_hex(
426 const std::string& str,
427 bool upper_case_digits
6a93d84a
TJ
428)
429{
6ab3bc95
RP
430 std::string result;
431 std::string hexDigits(upper_case_digits ? hexDigitsUpper : hexDigitsLower);
432 for ( std::string::const_iterator it= str.begin();
433 it != str.end();
434 ++it)
435 {
436 result.push_back( hexDigits[ ( (*it) >> 4) & 0x0f ] );
437 result.push_back( hexDigits[ (*it) & 0x0f ] );
438 }
439 return result;
440} // eo convert_binary_to_hex(const std::string&,bool)
6a93d84a
TJ
441
442
443/**
444 * @brief converts a hex digit string to binary string.
445 * @param str hex digit string
446 * @return the binary string.
447 *
448 * The hex digit string may contains white spaces or colons which are treated
449 * as delimiters between hex digit groups.
450 *
451 * @todo rework the handling of half nibbles (consistency)!
452 */
6ab3bc95
RP
453std::string convert_hex_to_binary(
454 const std::string& str
6a93d84a 455)
6ab3bc95
RP
456throw (std::runtime_error)
457{
458 std::string result;
459 char c= 0;
460 bool hasNibble= false;
461 bool lastWasWS= true;
462 for ( std::string::const_iterator it= str.begin();
463 it != str.end();
464 ++it)
465 {
466 std::string::size_type p = hexDigitsLower.find( *it );
467 if (p== std::string::npos)
468 {
469 p= hexDigitsUpper.find( *it );
470 }
471 if (p == std::string::npos)
472 {
473 if ( ( Whitespaces.find( *it ) != std::string::npos) // is it a whitespace?
6a93d84a 474 or ( *it == ':') // or a colon?
6ab3bc95
RP
475 )
476 {
477 // we treat that as a valid delimiter:
478 if (hasNibble)
6a93d84a 479 {
6ab3bc95
RP
480 // 1 nibble before WS is treate as lower part:
481 result.push_back(c);
482 // reset state:
483 hasNibble= false;
6a93d84a 484 }
6ab3bc95
RP
485 lastWasWS= true;
486 continue;
487 }
488 }
489 if (p == std::string::npos )
490 {
491 throw runtime_error("illegal character in hex digit string: " + str);
492 }
493 lastWasWS= false;
494 if (hasNibble)
495 {
496 c<<=4;
497 }
498 else
499 {
500 c=0;
501 }
502 c+= (p & 0x0f);
503 if (hasNibble)
504 {
505 //we already had a nibble, so a char is complete now:
506 result.push_back( c );
507 hasNibble=false;
508 }
509 else
510 {
511 // this is the first nibble of a new char:
512 hasNibble=true;
513 }
514 }
515 if (hasNibble)
516 {
517 //well, there is one nibble left
518 // let's do some heuristics:
519 if (lastWasWS)
520 {
521 // if the preceeding character was a white space (or a colon)
522 // we treat the nibble as lower part:
523 //( this is consistent with shortened hex notations where leading zeros are not noted)
524 result.push_back( c );
525 }
526 else
527 {
528 // if it was part of a hex digit chain, we treat it as UPPER part (!!)
529 result.push_back( c << 4 );
530 }
531 }
532 return result;
533} // eo convert_hex_to_binary(const std::string&)
534
535
536} // eo namespace I2n
537
538
539
6a93d84a 540
e93545dd
GE
namespace
{

/**
 * @brief adapter for the input buffer argument of iconv().
 *
 * Depending on the iconv implementation the second parameter of iconv() is declared
 * either as "char **" or as "const char **". This wrapper converts implicitly to both.
 */
class IconvInputPtr
{
    public:
        explicit IconvInputPtr(const char** ptr) : Ptr(ptr) {}
        operator const char**() const { return Ptr; }
        operator char**() const { return const_cast<char**>(Ptr); }
    private:
        const char** Ptr;
}; // eo class IconvInputPtr


/**
 * @brief converts a string from one character encoding to another using iconv.
 * @param to_code iconv name of the target encoding.
 * @param from_code iconv name of the source encoding.
 * @param input the string to convert.
 * @param growth_factor the output buffer holds growth_factor times the input size.
 * @param open_error message of the exception thrown when the conversion is not available.
 * @return the converted string. If the conversion stops early (invalid input or output
 *  buffer too small), the part which was converted up to that point is returned.
 * @throw std::runtime_error if the conversion is not available or the buffer
 *  cannot be allocated.
 */
std::string iconv_convert(
    const char* to_code,
    const char* from_code,
    const std::string& input,
    size_t growth_factor,
    const char* open_error
)
{
    iconv_t cd = iconv_open(to_code, from_code);
    if (cd == (iconv_t)-1)
        throw std::runtime_error(open_error);

    const size_t capacity = input.size() * growth_factor;

    // one spare byte guarantees a non-empty buffer even for empty input
    std::string buffer;
    try
    {
        buffer.resize(capacity + 1);
    }
    catch (...)
    {
        iconv_close(cd);
        throw std::runtime_error("out of memory for iconv buffer");
    }

    const char *in = input.c_str();
    size_t in_left = input.size();
    char *out = &buffer[0];
    size_t out_left = capacity;

    const size_t rc = iconv(cd, IconvInputPtr(&in), &in_left, &out, &out_left);
    if (rc != static_cast<size_t>(-1) )
    {
        // let stateful encodings (like UTF-7) write their closing shift sequence
        iconv(cd, NULL, NULL, &out, &out_left);
    }
    iconv_close(cd);

    // keep exactly the bytes iconv produced (this includes embedded NUL characters)
    buffer.resize(capacity - out_left);
    return buffer;
} // eo iconv_convert

} // eo namespace <anonymous>


/**
 * @brief converts an ISO-8859-1 encoded string to UTF-8.
 * @param isostring the ISO-8859-1 encoded string.
 * @return the UTF-8 encoded string.
 * @throw std::runtime_error if iconv does not provide the conversion or
 *  the buffer cannot be allocated.
 */
std::string iso_to_utf8(const std::string& isostring)
{
    return iconv_convert("UTF-8", "ISO-8859-1", isostring, 4,
                         "iconv can't convert from ISO-8859-1 to UTF-8");
}


/**
 * @brief converts a UTF-8 encoded string to ISO-8859-1.
 * @param utf8string the UTF-8 encoded string.
 * @return the ISO-8859-1 encoded string; the conversion stops at the first character
 *  iconv cannot convert.
 * @throw std::runtime_error if iconv does not provide the conversion or
 *  the buffer cannot be allocated.
 */
std::string utf8_to_iso(const std::string& utf8string)
{
    return iconv_convert("ISO-8859-1", "UTF-8", utf8string, 1,
                         "iconv can't convert from UTF-8 to ISO-8859-1");
}


/**
 * @brief converts a UTF-8 encoded string to a zero terminated wide character buffer.
 * @param utf8string the UTF-8 encoded string.
 * @return buffer allocated with malloc() holding the "UCS-4LE" encoded characters
 *  followed by a terminating 0. The caller has to release it with free().
 * @throw std::runtime_error if iconv does not provide the conversion, the buffer
 *  cannot be allocated or the conversion fails.
 */
wchar_t* utf8_to_wbuf(const std::string& utf8string)
{
    iconv_t utf82wstr = iconv_open("UCS-4LE","UTF-8");
    if (utf82wstr == (iconv_t)-1)
        throw std::runtime_error("iconv can't convert from UTF-8 to UCS-4");

    // one wide character per input byte is the upper bound; plus one for the terminator
    const size_t capacity = (utf8string.size()+1) * sizeof(wchar_t);

    wchar_t *buf = static_cast<wchar_t*>( malloc(capacity) );
    if (buf == NULL)
    {
        iconv_close(utf82wstr);
        throw std::runtime_error("out of memory for iconv buffer");
    }

    const char *in = utf8string.c_str();
    size_t in_left = utf8string.size();
    char *out = reinterpret_cast<char*>(buf);
    size_t out_left = capacity;

    const size_t rc = iconv(utf82wstr, IconvInputPtr(&in), &in_left, &out, &out_left);
    iconv_close(utf82wstr);

    if (rc == static_cast<size_t>(-1) )
    {
        free(buf);
        throw std::runtime_error("error converting char encodings");
    }

    buf[ (capacity - out_left) / sizeof(wchar_t) ] = 0;
    return buf;
}


/**
 * @brief converts a string from the "UTF-7-IMAP" encoding to UTF-8.
 * @param utf7imapstring the UTF-7-IMAP encoded string.
 * @return the UTF-8 encoded string.
 * @throw std::runtime_error if iconv does not provide the conversion or
 *  the buffer cannot be allocated.
 */
std::string utf7imap_to_utf8(const std::string& utf7imapstring)
{
    return iconv_convert("UTF-8", "UTF-7-IMAP", utf7imapstring, 4,
                         "iconv can't convert from UTF-7-IMAP to UTF-8");
}


/**
 * @brief converts a UTF-8 encoded string to the "UTF-7-IMAP" encoding.
 * @param utf8string the UTF-8 encoded string.
 * @return the UTF-7-IMAP encoded string.
 * @throw std::runtime_error if iconv does not provide the conversion or
 *  the buffer cannot be allocated.
 */
std::string utf8_to_utf7imap(const std::string& utf8string)
{
    // UTF-7 is base64 encoded, a buffer 10x as large
    // as the utf-8 buffer should be enough. If not the string will be truncated.
    return iconv_convert("UTF-7-IMAP", "UTF-8", utf8string, 10,
                         "iconv can't convert from UTF-8 to UTF-7-IMAP");
}
688
118e216e
TJ
/**
 * @brief tokenizes a string by (html) tags.
 * @param[out] tokenized list to which the tokens are appended; the first element of each
 *  pair is the token text, the second one tells whether the token is a tag.
 * @param input the string to tokenize.
 *
 * A tag starts with '<' and ends with the next '>'. A tag which is not closed before
 * the next '<' (or the end of the input) is stored as plain text.
 */
void tokenize_by_tag(std::vector<std::pair<std::string,bool> > &tokenized, const std::string &input)
{
    std::string pending;
    bool tag_open = false;

    for (std::string::const_iterator ch = input.begin(); ch != input.end(); ++ch)
    {
        if (*ch == '<')
        {
            // everything collected so far counts as plain text
            if (!pending.empty() )
            {
                tokenized.push_back( std::make_pair(pending, false) );
                pending.clear();
            }
            tag_open = true;
            pending.push_back('<');
            continue;
        }

        pending.push_back(*ch);

        if (*ch == '>' && tag_open)
        {
            // the tag is complete
            tokenized.push_back( std::make_pair(pending, true) );
            pending.clear();
            tag_open = false;
        }
    }

    // String left over in buffer?
    if (!pending.empty() )
        tokenized.push_back( std::make_pair(pending, false) );
} // eo tokenize_by_tag
118e216e 728
118e216e
TJ
729
730std::string strip_html_tags(const std::string &input)
731{
6ab3bc95
RP
732 // Pair first: string, second: isTag
733 vector<pair<string,bool> > tokenized;
734 tokenize_by_tag (tokenized, input);
118e216e 735
6ab3bc95
RP
736 string output;
737 vector<pair<string,bool> >::const_iterator token, tokens_end = tokenized.end();
738 for (token = tokenized.begin(); token != tokens_end; token++)
739 if (!token->second)
740 output += token->first;
741
742 return output;
743} // eo strip_html_tags
118e216e 744
118e216e
TJ
745
746// Smart-encode HTML en
747string smart_html_entities(const std::string &input)
748{
6ab3bc95
RP
749 // Pair first: string, second: isTag
750 vector<pair<string,bool> > tokenized;
751 tokenize_by_tag (tokenized, input);
752
753 string output;
754 vector<pair<string,bool> >::const_iterator token, tokens_end = tokenized.end();
755 for (token = tokenized.begin(); token != tokens_end; token++)
756 {
757 // keep HTML tags as they are
758 if (token->second)
759 output += token->first;
760 else
761 output += html_entities(token->first);
762 }
763
764 return output;
118e216e
TJ
765}
766
6ab3bc95 767
a5f3af6e
GE
/**
 * @brief searches the first character of a string which is not a 7 bit (ASCII) character.
 * @param str the string.
 * @return the position of the first byte with a value above 127 or
 *  std::string::npos if there is none.
 */
std::string::size_type find_8bit(const std::string &str)
{
    std::string::size_type pos = 0;
    for (std::string::const_iterator ch = str.begin(); ch != str.end(); ++ch, ++pos)
    {
        if (static_cast<unsigned char>(*ch) > 127)
            return pos;
    }

    return std::string::npos;
}
777
118e216e
TJ
778// encoded UTF-8 chars into HTML entities
779string html_entities(std::string str)
780{
6ab3bc95
RP
781 // Normal chars
782 replace_all (str, "&", "&amp;");
783 replace_all (str, "\"", "&quot;");
784 replace_all (str, "<", "&lt;");
785 replace_all (str, ">", "&gt;");
786
787 // Umlauts
788 replace_all (str, "\xC3\xA4", "&auml;");
789 replace_all (str, "\xC3\xB6", "&ouml;");
790 replace_all (str, "\xC3\xBC", "&uuml;");
791 replace_all (str, "\xC3\x84", "&Auml;");
792 replace_all (str, "\xC3\x96", "&Ouml;");
793 replace_all (str, "\xC3\x9C", "&Uuml;");
794
795 // Misc
796 replace_all (str, "\xC3\x9F", "&szlig;");
797
798 // conversion of remaining non-ASCII chars needed?
799 // just do if needed because of performance
800 if (find_8bit(str) != string::npos)
801 {
802 // convert to fixed-size encoding UTF-32
803 wchar_t* wbuf=utf8_to_wbuf(str);
804 ostringstream target;
805
806 // replace all non-ASCII chars with HTML representation
807 for (int p=0; wbuf[p] != 0; p++)
808 {
809 unsigned int c=wbuf[p];
810
811 if (c <= 127)
812 target << static_cast<unsigned char>(c);
813 else
814 target << "&#" << c << ';';
815 }
816
817 free(wbuf);
818
819 str=target.str();
820 }
821
822 return str;
823} // eo html_entities(std::string)
824
118e216e 825
e93545dd
GE
826bool replace_all(string &base, const char *ist, const char *soll)
827{
6ab3bc95
RP
828 string i=ist;
829 string s=soll;
830 return replace_all(base,&i,&s);
e93545dd
GE
831}
832
833bool replace_all(string &base, const string &ist, const char *soll)
834{
6ab3bc95
RP
835 string s=soll;
836 return replace_all(base,&ist,&s);
e93545dd
GE
837}
838
839bool replace_all(string &base, const string *ist, const string *soll)
840{
6ab3bc95 841 return replace_all(base,*ist,*soll);
e93545dd
GE
842}
843
844bool replace_all(string &base, const char *ist, const string *soll)
845{
6ab3bc95
RP
846 string i=ist;
847 return replace_all(base,&i,soll);
e93545dd
GE
848}
849
850bool replace_all(string &base, const string &ist, const string &soll)
851{
6ab3bc95
RP
852 bool found_ist = false;
853 string::size_type a=0;
854
855 if (ist.empty() )
856 throw runtime_error ("replace_all called with empty search string");
e93545dd 857
6ab3bc95
RP
858 while ( (a=base.find(ist,a) ) != string::npos)
859 {
860 base.replace(a,ist.size(),soll);
861 a=a+soll.size();
862 found_ist = true;
863 }
1ec2064e 864
6ab3bc95 865 return found_ist;
e93545dd
GE
866}
867
/**
 * @brief returns a lower case version of a given string.
 * @param src the string.
 * @return copy of the string with each character converted by tolower.
 */
std::string to_lower(const std::string &src)
{
    std::string dst = src;

    for (std::string::iterator ch = dst.begin(); ch != dst.end(); ++ch)
    {
        // tolower requires a value representable as unsigned char; passing a negative
        // plain char (any byte >= 0x80 where char is signed) is undefined.
        *ch = static_cast<char>( std::tolower( static_cast<unsigned char>(*ch) ) );
    }

    return dst;
}
878
/**
 * @brief returns an upper case version of a given string.
 * @param src the string.
 * @return copy of the string with each character converted by toupper.
 */
std::string to_upper(const std::string &src)
{
    std::string dst = src;

    for (std::string::iterator ch = dst.begin(); ch != dst.end(); ++ch)
    {
        // toupper requires a value representable as unsigned char; passing a negative
        // plain char (any byte >= 0x80 where char is signed) is undefined.
        *ch = static_cast<char>( std::toupper( static_cast<unsigned char>(*ch) ) );
    }

    return dst;
}
889
6ab3bc95
RP
890string nice_unit_format(int input)
891{
892 float size = input;
893 int sizecount = 0;
894
895 while (size > 1000)
896 {
897 size = size / 1000;
898 sizecount++;
899 }
900
901 float tmp; // round
902 tmp = size*10;
903 tmp += 0.5;
904 tmp = int (tmp);
905 tmp = float (tmp) /float (10);
906 size = tmp;
907
908 ostringstream out;
909
910 out.setf (ios::fixed);
911 out.precision (2);
912 switch (sizecount)
913 {
914 case 0:
915 out << size << i18n (" Bytes");
916 break;
917 case 1:
918 out << size << i18n (" KBytes");
919 break;
920 case 2:
921 out << size << i18n (" MBytes");
922 break;
923 case 3:
924 out << size << i18n (" GBytes");
925 break;
926 case 4:
927 out << size << i18n (" TBytes");
928 break;
929 case 5:
930 out << size << i18n (" PBytes");
931 break;
932 case 6:
933 out << size << i18n (" EBytes");
934 break;
935 default:
936 out << size << "*10^" << (sizecount*3)<< i18n (" Bytes");
937 break;
938 }
939
940 return out.str();
941} // eo nice_unit_format(int input)
942
e93545dd 943
47c07fba
GE
/**
 * @brief escapes a string and encloses it in double quotes.
 * @param s the string.
 * @return the escaped string: double quotes and backslashes are prefixed with a
 *  backslash, carriage return and line feed are written as \\r and \\n, and the
 *  result is enclosed in double quotes.
 */
std::string escape(const std::string &s)
{
    std::string out;
    out.reserve(s.size() + 2);

    out += '"';
    for (std::string::const_iterator ch = s.begin(); ch != s.end(); ++ch)
    {
        switch (*ch)
        {
            case '"':
            case '\\':
                out += '\\';
                out += *ch;
                break;
            case '\r':
                out += "\\r";
                break;
            case '\n':
                out += "\\n";
                break;
            default:
                out += *ch;
                break;
        }
    }
    out += '"';

    return out;
} // eo escape(const std::string&)
47c07fba 974
47c07fba 975
6ab3bc95
RP
976string descape(const string &s, int startpos, int &endpos)
977{
978 string out;
979
980 if (s.at(startpos) != '"')
981 throw out_of_range("value not type escaped string");
982
983 out=s.substr(startpos+1);
984 string::size_type p=0;
985
986 // search for the end of the string
987 while ( (p=out.find("\"",p) ) !=out.npos)
988 {
989 int e=p-1;
990 bool escaped=false;
991
992 // the " might be escaped with a backslash
993 while (e>=0 && out.at (e) =='\\')
994 {
995 if (escaped == false)
996 escaped=true;
997 else
998 escaped=false;
999
1000 e--;
1001 }
1002
1003 if (escaped==false)
1004 break;
1005 else
1006 p++;
1007 }
1008
1009 // we now have the end of the string
1010 out=out.substr(0,p);
1011
1012 // tell calling prog about the endposition
1013 endpos=startpos+p+1;
1014
1015 // descape all \ stuff inside the string now
1016 p=0;
1017 while ( (p=out.find_first_of("\\",p) ) !=out.npos)
1018 {
1019 switch (out.at(p+1) )
1020 {
1021 case 'r':
47c07fba
GE
1022 out.replace(p,2,"\r");
1023 break;
6ab3bc95 1024 case 'n':
47c07fba
GE
1025 out.replace(p,2,"\n");
1026 break;
6ab3bc95 1027 default:
47c07fba 1028 out.erase(p,1);
6ab3bc95
RP
1029 }
1030 p++;
1031 }
1032
1033 return out;
1034} // eo descape(const std::string&,int,int&)
47c07fba 1035
e93545dd 1036
47c07fba
GE
/**
 * @brief quotes a string for use as a single shell argument.
 * @param input the string.
 * @return the string enclosed in single quotes; each single quote within the string
 *  is written as '\'' (close the quotes, escaped quote, reopen the quotes).
 */
std::string escape_shellarg(const std::string &input)
{
    std::string output(1, '\'');

    for (std::string::size_type idx = 0; idx < input.size(); ++idx)
    {
        const char ch = input[idx];
        if (ch == '\'')
            output.append("'\\'");

        output.push_back(ch);
    }

    output.push_back('\'');
    return output;
}