developer.intra2net.com Git - libi2ncommon/blob - src/stringfunc.cpp

   1 /*
   2 The software in this package is distributed under the GNU General
   3 Public License version 2 (with a special exception described below).
   4
   5 A copy of GNU General Public License (GPL) is included in this distribution,
   6 in the file COPYING.GPL.
   7
   8 As a special exception, if other files instantiate templates or use macros
   9 or inline functions from this file, or you compile this file and link it
  10 with other works to produce a work based on this file, this file
  11 does not by itself cause the resulting work to be covered
  12 by the GNU General Public License.
  13
  14 However the source code for this file must still be made available
  15 in accordance with section (3) of the GNU General Public License.
  16
  17 This exception does not invalidate any other reasons why a work based
  18 on this file might be covered by the GNU General Public License.
  19 */
  20 /** @file
  21  *
  22  * (c) Copyright 2007-2008 by Intra2net AG
  23  */
  24
  25 #include <iostream>
  26 #include <string>
  27 #include <sstream>
  28 #include <stdexcept>
  29 #include <algorithm>
  30 #include <cmath>    // for round()
  31
  32 #include <wchar.h>
  33 #include <stdlib.h>
  34 #include <iconv.h>
  35 #include <i18n.h>
  36
  37 #include <boost/numeric/conversion/cast.hpp>
  38 #include <boost/foreach.hpp>
  39
  40 #include <stringfunc.hxx>
  41
  42 using namespace std;
  43
  44 namespace I2n
  45 {
  46
  47
  48 namespace
  49 {
  50
  51 const std::string hexDigitsLower("0123456789abcdef");
  52 const std::string hexDigitsUpper("0123456789ABCDEF");
  53
  54
  55 struct UpperFunc
  56 {
  57    char operator() (char c)
  58    {
  59       return std::toupper(c);
  60    }
  61 }; // eo struct UpperFunc
  62
  63
  64 struct LowerFunc
  65 {
  66    char operator() (char c)
  67    {
  68       return std::tolower(c);
  69    }
  70 }; // eo struct LowerFunc
  71
  72
  73 } // eo namespace <anonymous>
  74
  75
  76
  77 /**
  78  * default list of Whitespaces (" \t\r\n");
  79  */
  80 const std::string Whitespaces = " \t\r\n";
  81
  82 /**
  83  * default list of lineendings ("\r\n");
  84  */
  85 const std::string LineEndings= "\r\n";
  86
  87
  88
  89 /**
  90  * @brief checks if a string begins with a given prefix.
  91  * @param[in,out] str the string which is tested
  92  * @param prefix the prefix which should be tested for.
  93  * @return @a true iff the prefix is not empty and the string begins with that prefix.
  94  */
  95 bool has_prefix(const std::string& str, const std::string& prefix)
  96 {
  97    if (prefix.empty() || str.empty() || str.size() < prefix.size() )
  98    {
  99       return false;
 100    }
 101    return str.compare(0, prefix.size(), prefix) == 0;
 102 } // eo has_prefix(const std::string&,const std::string&)
 103
 104
 105 /**
 106  * @brief checks if a string ends with a given suffix.
 107  * @param[in,out] str the string which is tested
 108  * @param suffix the suffix which should be tested for.
 109  * @return @a true iff the suffix is not empty and the string ends with that suffix.
 110  */
 111 bool has_suffix(const std::string& str, const std::string& suffix)
 112 {
 113    if (suffix.empty() || str.empty() || str.size() < suffix.size() )
 114    {
 115       return false;
 116    }
 117    return str.compare(str.size() - suffix.size(), suffix.size(), suffix) == 0;
 118 } // eo has_suffix(const std::string&,const std::string&)
 119
 120
 121 /**
 122  * cut off characters from a given list from front and end of a string.
 123  * @param[in,out] str the string which should be trimmed.
 124  * @param charlist the list of characters to remove from beginning and end of string
 125  * @return the result string.
 126  */
 127 std::string trim_mod(std::string& str, const std::string& charlist)
 128 {
 129    // first: trim the beginning:
 130    std::string::size_type pos= str.find_first_not_of (charlist);
 131    if (pos == std::string::npos)
 132    {
 133       // whole string consists of charlist (or is already empty)
 134       str.clear();
 135       return str;
 136    }
 137    else if (pos>0)
 138    {
 139       // str starts with charlist
 140       str.erase(0,pos);
 141    }
 142    // now let's look at the tail:
 143    pos= str.find_last_not_of(charlist) +1;  // note: we already know there is at least one other char!
 144    if ( pos < str.size() )
 145    {
 146       str.erase(pos, str.size()-pos);
 147    }
 148    return str;
 149 } // eo trim_mod(std::string&,const std::string&)
 150
 151
 152
 153 /**
 154  * removes last character from a string when it is in a list of chars to be removed.
 155  * @param[in,out] str the string.
 156  * @param what the list of chars which will be tested for.
 157  * @return the resulting string with last char removed (if applicable)
 158  */
 159 std::string chomp_mod(std::string& str, const std::string& what)
 160 {
 161    if (str.empty() || what.empty() )
 162    {
 163       return str;
 164    }
 165    if (what.find(str.at (str.size()-1) ) != std::string::npos)
 166    {
 167       str.erase(str.size() - 1);
 168    }
 169    return str;
 170 } // eo chomp_mod(std::string&,const std::string&)
 171
 172
 173 /**
 174  * @brief converts a string to lower case.
 175  * @param[in,out] str the string to modify.
 176  * @return the string
 177  */
 178 std::string to_lower_mod(std::string& str)
 179 {
 180    std::transform(str.begin(), str.end(), str.begin(), LowerFunc() );
 181    return str;
 182 } // eo to_lower_mod(std::string&)
 183
 184
 185 /**
 186  * @brief converts a string to upper case.
 187  * @param[in,out] str the string to modify.
 188  * @return the string
 189  */
 190 std::string to_upper_mod(std::string& str)
 191 {
 192    std::transform( str.begin(), str.end(), str.begin(), UpperFunc() );
 193    return str;
 194 } // eo to_upper_mod(std::string&)
 195
 196
 197
 198 /**
 199  * cut off characters from a given list from front and end of a string.
 200  * @param str the string which should be trimmed.
 201  * @param charlist the list of characters to remove from beginning and end of string
 202  * @return the result string.
 203  */
 204 std::string trim (const std::string& str, const std::string& charlist)
 205 {
 206    // first: trim the beginning:
 207    std::string::size_type pos0= str.find_first_not_of(charlist);
 208    if (pos0 == std::string::npos)
 209    {
 210       // whole string consists of charlist (or is already empty)
 211       return std::string();
 212    }
 213    // now let's look at the end:
 214    std::string::size_type pos1= str.find_last_not_of(charlist);
 215    return str.substr(pos0, pos1 - pos0 + 1);
 216 } // eo trim(const std:.string&,const std::string&)
 217
 218
 219 /**
 220  * removes last character from a string when it is in a list of chars to be removed.
 221  * @param str the string.
 222  * @param what the list of chars which will be tested for.
 223  * @return the resulting string with last char removed (if applicable)
 224  */
 225 std::string chomp (const std::string& str, const std::string& what)
 226 {
 227    if (str.empty() || what.empty() )
 228    {
 229       return str;
 230    }
 231    if (what.find(str.at (str.size()-1) ) != std::string::npos)
 232    {
 233       return str.substr(0, str.size()-1);
 234    }
 235    return str;
 236 } // eo chomp(const std:.string&,const std::string&)
 237
 238
 239 /**
 240  * @brief returns a lower case version of a given string.
 241  * @param str the string
 242  * @return the lower case version of the string
 243  */
 244 std::string to_lower (const std::string& str)
 245 {
 246    std::string result(str);
 247    return to_lower_mod(result);
 248 } // eo to_lower(const std::string&)
 249
 250
 251 /**
 252  * @brief returns a upper case version of a given string.
 253  * @param str the string
 254  * @return the upper case version of the string
 255  */
 256 std::string to_upper(const std::string& str)
 257 {
 258    std::string result(str);
 259    return to_upper_mod(result);
 260 } // eo to_upper(const std::string&)
 261
 262
 263
 264 /**
 265  * @brief removes a given suffix from a string.
 266  * @param str the string.
 267  * @param suffix the suffix which should be removed if the string ends with it.
 268  * @return the string without the suffix.
 269  *
 270  * If the string ends with the suffix, it is removed. If the the string doesn't end
 271  * with the suffix the original string is returned.
 272  */
 273 std::string remove_suffix(const std::string& str, const std::string& suffix)
 274 {
 275    if (has_suffix(str,suffix) )
 276    {
 277       return str.substr(0, str.size()-suffix.size() );
 278    }
 279    return str;
 280 } // eo remove_suffix(const std::string&,const std::string&)
 281
 282
 283
 284 /**
 285  * @brief removes a given prefix from a string.
 286  * @param str the string.
 287  * @param prefix the prefix which should be removed if the string begins with it.
 288  * @return the string without the prefix.
 289  *
 290  * If the string begins with the prefix, it is removed. If the the string doesn't begin
 291  * with the prefix the original string is returned.
 292  */
 293 std::string remove_prefix(const std::string& str, const std::string& prefix)
 294 {
 295    if (has_prefix(str,prefix) )
 296    {
 297       return str.substr( prefix.size() );
 298    }
 299    return str;
 300 } // eo remove_prefix(const std::string&,const std::string&)
 301
 302
 303 /**
 304  * split a string to key and value delimited by a given delimiter.
 305  * The resulting key and value strings are trimmed (Whitespaces removed at beginning and end).
 306  * @param str the string which should be splitted.
 307  * @param[out] key the resulting key
 308  * @param[out] value the resulting value
 309  * @param delimiter the delimiter between key and value; default is '='.
 310  * @return @a true if the split was successful.
 311  */
 312 bool pair_split(
 313    const std::string& str,
 314    std::string& key,
 315    std::string& value,
 316    char delimiter)
 317 {
 318    std::string::size_type pos = str.find (delimiter);
 319    if (pos == std::string::npos) return false;
 320    key= str.substr(0,pos);
 321    value= str.substr(pos+1);
 322    trim_mod(key);
 323    trim_mod(value);
 324    return true;
 325 } // eo pair_split(const std::string&,std::string&,std::string&,char)
 326
 327
 328 /**
 329  * splits a string by given delimiter
 330  *
 331  * @param[in] str the string which should be splitted.
 332  * @param[out] result the list resulting from splitting  @a str.
 333  * @param[in] delimiter the delimiter (word/phrase) at which @a str should be splitted.
 334  * @param[in] omit_empty should empty parts not be stored?
 335  * @param[in] trim_list list of characters the parts should be trimmed by.
 336  *  (empty string results in no trim)
 337  */
 338 void split_string(
 339    const std::string& str,
 340    std::list<std::string>& result,
 341    const std::string& delimiter,
 342    bool omit_empty,
 343    const std::string& trim_list
 344 )
 345 {
 346    std::string::size_type pos, last_pos=0;
 347    bool delimiter_found= false;
 348    while ( last_pos < str.size()  && last_pos != std::string::npos)
 349    {
 350       pos= str.find(delimiter, last_pos);
 351       std::string part;
 352       if (pos == std::string::npos)
 353       {
 354          part= str.substr(last_pos);
 355          delimiter_found= false;
 356       }
 357       else
 358       {
 359          part= str.substr(last_pos, pos-last_pos);
 360          delimiter_found=true;
 361       }
 362       if (pos != std::string::npos)
 363       {
 364          last_pos= pos+ delimiter.size();
 365       }
 366       else
 367       {
 368          last_pos= std::string::npos;
 369       }
 370       if (!trim_list.empty() ) trim_mod (part, trim_list);
 371       if (omit_empty && part.empty() ) continue;
 372       result.push_back( part );
 373    }
 374    // if the string ends with a delimiter we need to append an empty string if no omit_empty
 375    // was given.
 376    // (this way we keep the split result consistent to a join operation)
 377    if (delimiter_found && !omit_empty)
 378    {
 379       result.push_back("");
 380    }
 381 } // eo split_string(const std::string&,std::list< std::string >&,const std::string&,bool,const std::string&)
 382
 383
 384 /**
 385  * splits a string by a given delimiter
 386  * @param str the string which should be splitted.
 387  * @param delimiter delimiter the delimiter (word/phrase) at which @a str should be splitted.
 388  * @param[in] omit_empty should empty parts not be stored?
 389  * @param[in] trim_list list of characters the parts should be trimmed by.
 390  *  (empty string results in no trim)
 391  * @return the list resulting from splitting @a str.
 392  */
 393 std::list<std::string> split_string(
 394    const std::string& str,
 395    const std::string& delimiter,
 396    bool omit_empty,
 397    const std::string& trim_list
 398 )
 399 {
 400    std::list<std::string> result;
 401    split_string(str, result, delimiter, omit_empty, trim_list);
 402    return result;
 403 } // eo split_string(const std::string&,const std::string&,bool,const std::string&)
 404
 405
 406 /**
 407  * @brief joins a list of strings into a single string.
 408  *
 409  * This funtion is (basically) the reverse operation of @a split_string.
 410  *
 411  * @param parts the list of strings.
 412  * @param delimiter the delimiter which is inserted between the strings.
 413  * @return the joined string.
 414  */
 415 std::string join_string(
 416    const std::list< std::string >& parts,
 417    const std::string& delimiter
 418 )
 419 {
 420    std::string result;
 421    if (! parts.empty() )
 422    {
 423       std::list< std::string >::const_iterator it= parts.begin();
 424       result = *it;
 425       while ( ++it != parts.end() )
 426       {
 427          result+= delimiter;
 428          result+= *it;
 429       }
 430    }
 431    return result;
 432 } // eo join_string(const std::list< std::string >&,const std::string&)
 433
 434
 435 /** @brief same as join_string for list, except uses a vector */
 436 std::string join_string(
 437    const std::vector< std::string >& parts,
 438    const std::string& delimiter
 439 )
 440 {
 441    std::string result;
 442    if (! parts.empty() )
 443    {
 444       std::vector< std::string >::const_iterator it= parts.begin();
 445       result = *it;
 446       while ( ++it != parts.end() )
 447       {
 448          result+= delimiter;
 449          result+= *it;
 450       }
 451    }
 452    return result;
 453 } // eo join_string(const std::vector< std::string >&,const std::string&)
 454
 455
 456
 457 /*
 458 ** conversions
 459 */
 460
 461
 462 /**
 463  * @brief returns a hex string from a binary string.
 464  * @param str the (binary) string
 465  * @param upper_case_digits determine whether to use upper case characters for digits A-F.
 466  * @return the string in hex notation.
 467  */
 468 std::string convert_binary_to_hex(
 469    const std::string& str,
 470    bool upper_case_digits
 471 )
 472 {
 473    std::string result;
 474    std::string hexDigits(upper_case_digits ? hexDigitsUpper : hexDigitsLower);
 475    for ( std::string::const_iterator it= str.begin();
 476          it != str.end();
 477          ++it)
 478    {
 479       result.push_back( hexDigits[ ( (*it) >> 4) & 0x0f ] );
 480       result.push_back( hexDigits[ (*it) & 0x0f ] );
 481    }
 482    return result;
 483 } // eo convert_binary_to_hex(const std::string&,bool)
 484
 485
 486 /**
 487  * @brief converts a hex digit string to binary string.
 488  * @param str hex digit string
 489  * @return the binary string.
 490  *
 491  * The hex digit string may contains white spaces or colons which are treated
 492  * as delimiters between hex digit groups.
 493  *
 494  * @todo rework the handling of half nibbles (consistency)!
 495  */
 496 std::string convert_hex_to_binary(
 497    const std::string& str
 498 )
 499 throw (std::runtime_error)
 500 {
 501    std::string result;
 502    char c= 0;
 503    bool hasNibble= false;
 504    bool lastWasWS= true;
 505    for ( std::string::const_iterator it= str.begin();
 506          it != str.end();
 507          ++it)
 508    {
 509       std::string::size_type p = hexDigitsLower.find( *it );
 510       if (p== std::string::npos)
 511       {
 512          p= hexDigitsUpper.find( *it );
 513       }
 514       if (p == std::string::npos)
 515       {
 516          if (   ( Whitespaces.find( *it ) != std::string::npos) // is it a whitespace?
 517                 or ( *it == ':') // or a colon?
 518             )
 519          {
 520             // we treat that as a valid delimiter:
 521             if (hasNibble)
 522             {
 523                // 1 nibble before WS is treate as lower part:
 524                result.push_back(c);
 525                // reset state:
 526                hasNibble= false;
 527             }
 528             lastWasWS= true;
 529             continue;
 530          }
 531       }
 532       if (p == std::string::npos )
 533       {
 534          throw runtime_error("illegal character in hex digit string: " + str);
 535       }
 536       lastWasWS= false;
 537       if (hasNibble)
 538       {
 539          c<<=4;
 540       }
 541       else
 542       {
 543          c=0;
 544       }
 545       c+= (p & 0x0f);
 546       if (hasNibble)
 547       {
 548          //we already had a nibble, so a char is complete now:
 549          result.push_back( c );
 550          hasNibble=false;
 551       }
 552       else
 553       {
 554          // this is the first nibble of a new char:
 555          hasNibble=true;
 556       }
 557    }
 558    if (hasNibble)
 559    {
 560       //well, there is one nibble left
 561       // let's do some heuristics:
 562       if (lastWasWS)
 563       {
 564          // if the preceeding character was a white space (or a colon)
 565          // we treat the nibble as lower part:
 566          //( this is consistent with shortened hex notations where leading zeros are not noted)
 567          result.push_back( c );
 568       }
 569       else
 570       {
 571          // if it was part of a hex digit chain, we treat it as UPPER part (!!)
 572          result.push_back( c << 4 );
 573       }
 574    }
 575    return result;
 576 } // eo convert_hex_to_binary(const std::string&)
 577
 578
 579 } // eo namespace I2n
 580
 581
 582
 583
 584 std::string iso_to_utf8(const std::string& isostring)
 585 {
 586    string result;
 587
 588    iconv_t i2utf8 = iconv_open("UTF-8", "ISO-8859-1");
 589
 590    if (iso_to_utf8 == (iconv_t)-1)
 591       throw runtime_error("iconv can't convert from ISO-8859-1 to UTF-8");
 592
 593    size_t in_size=isostring.size();
 594    size_t out_size=in_size*4;
 595
 596    char *buf = (char *)malloc(out_size+1);
 597    if (buf == NULL)
 598       throw runtime_error("out of memory for iconv buffer");
 599
 600    char *in = (char *)isostring.c_str();
 601    char *out = buf;
 602    iconv(i2utf8, &in, &in_size, &out, &out_size);
 603
 604    buf[isostring.size()*4-out_size]=0;
 605
 606    result=buf;
 607
 608    free(buf);
 609    iconv_close(i2utf8);
 610
 611    return result;
 612 }
 613
 614 std::string utf8_to_iso(const std::string& utf8string)
 615 {
 616    string result;
 617
 618    iconv_t utf82iso = iconv_open("ISO-8859-1","UTF-8");
 619
 620    if (utf82iso == (iconv_t)-1)
 621       throw runtime_error("iconv can't convert from UTF-8 to ISO-8859-1");
 622
 623    size_t in_size=utf8string.size();
 624    size_t out_size=in_size;
 625
 626    char *buf = (char *)malloc(out_size+1);
 627    if (buf == NULL)
 628       throw runtime_error("out of memory for iconv buffer");
 629
 630    char *in = (char *)utf8string.c_str();
 631    char *out = buf;
 632    iconv(utf82iso, &in, &in_size, &out, &out_size);
 633
 634    buf[utf8string.size()-out_size]=0;
 635
 636    result=buf;
 637
 638    free(buf);
 639    iconv_close(utf82iso);
 640
 641    return result;
 642 }
 643
 644 wchar_t* utf8_to_wbuf(const std::string& utf8string)
 645 {
 646    iconv_t utf82wstr = iconv_open("UCS-4LE","UTF-8");
 647
 648    if (utf82wstr == (iconv_t)-1)
 649       throw runtime_error("iconv can't convert from UTF-8 to UCS-4");
 650
 651    size_t in_size=utf8string.size();
 652    size_t out_size= (in_size+1)*sizeof(wchar_t);
 653
 654    wchar_t *buf = (wchar_t *)malloc(out_size);
 655    if (buf == NULL)
 656       throw runtime_error("out of memory for iconv buffer");
 657
 658    char *in = (char *)utf8string.c_str();
 659    char *out = (char*) buf;
 660    if (iconv(utf82wstr, &in, &in_size, &out, &out_size) == (size_t)-1)
 661       throw runtime_error("error converting char encodings");
 662
 663    buf[ ( (utf8string.size()+1)*sizeof(wchar_t)-out_size) /sizeof(wchar_t) ]=0;
 664
 665    iconv_close(utf82wstr);
 666
 667    return buf;
 668 }
 669
 670 std::string utf7imap_to_utf8(const std::string& utf7imapstring)
 671 {
 672    string result;
 673
 674    iconv_t utf7imap2utf8 = iconv_open("UTF-8","UTF-7-IMAP");
 675
 676    if (utf7imap2utf8 == (iconv_t)-1)
 677       throw runtime_error("iconv can't convert from UTF-7-IMAP to UTF-8");
 678
 679    size_t in_size=utf7imapstring.size();
 680    size_t out_size=in_size*4;
 681
 682    char *buf = (char *)malloc(out_size+1);
 683    if (buf == NULL)
 684       throw runtime_error("out of memory for iconv buffer");
 685
 686    char *in = (char *)utf7imapstring.c_str();
 687    char *out = buf;
 688    iconv(utf7imap2utf8, &in, &in_size, &out, &out_size);
 689
 690    buf[utf7imapstring.size()*4-out_size]=0;
 691
 692    result=buf;
 693
 694    free(buf);
 695    iconv_close(utf7imap2utf8);
 696
 697    return result;
 698 }
 699
 700 std::string utf8_to_utf7imap(const std::string& utf8string)
 701 {
 702    string result;
 703
 704    iconv_t utf82utf7imap = iconv_open("UTF-7-IMAP", "UTF-8");
 705
 706    if (utf82utf7imap == (iconv_t)-1)
 707       throw runtime_error("iconv can't convert from UTF-7-IMAP to UTF-8");
 708
 709    // UTF-7 is base64 encoded, a buffer 10x as large
 710    // as the utf-8 buffer should be enough. If not the string will be truncated.
 711    size_t in_size=utf8string.size();
 712    size_t out_size=in_size*10;
 713
 714    char *buf = (char *)malloc(out_size+1);
 715    if (buf == NULL)
 716       throw runtime_error("out of memory for iconv buffer");
 717
 718    char *in = (char *)utf8string.c_str();
 719    char *out = buf;
 720    iconv(utf82utf7imap, &in, &in_size, &out, &out_size);
 721
 722    buf[utf8string.size()*10-out_size]= 0;
 723
 724    result=buf;
 725
 726    free(buf);
 727    iconv_close(utf82utf7imap);
 728
 729    return result;
 730 }
 731
 732 // Tokenize string by (html) tags
 733 void tokenize_by_tag(vector<pair<string,bool> > &tokenized, const std::string &input)
 734 {
 735    string::size_type pos, len = input.size();
 736    bool inside_tag = false;
 737    string current;
 738
 739    for (pos = 0; pos < len; pos++)
 740    {
 741       if (input[pos] == '<')
 742       {
 743          inside_tag = true;
 744
 745          if (!current.empty() )
 746          {
 747             tokenized.push_back( make_pair(current, false) );
 748             current = "";
 749          }
 750
 751          current += input[pos];
 752       }
 753       else if (input[pos] == '>' && inside_tag)
 754       {
 755          current += input[pos];
 756          inside_tag = false;
 757          if (!current.empty() )
 758          {
 759             tokenized.push_back( make_pair(current, true) );
 760             current = "";
 761          }
 762       }
 763       else
 764          current += input[pos];
 765    }
 766
 767    // String left over in buffer?
 768    if (!current.empty() )
 769       tokenized.push_back( make_pair(current, false) );
 770 } // eo tokenize_by_tag
 771
 772
 773 std::string strip_html_tags(const std::string &input)
 774 {
 775    // Pair first: string, second: isTag
 776    vector<pair<string,bool> > tokenized;
 777    tokenize_by_tag (tokenized, input);
 778
 779    string output;
 780    vector<pair<string,bool> >::const_iterator token, tokens_end = tokenized.end();
 781    for (token = tokenized.begin(); token != tokens_end; ++token)
 782       if (!token->second)
 783          output += token->first;
 784
 785    return output;
 786 } // eo strip_html_tags
 787
 788
 789 // Smart-encode HTML en
 790 string smart_html_entities(const std::string &input)
 791 {
 792    // Pair first: string, second: isTag
 793    vector<pair<string,bool> > tokenized;
 794    tokenize_by_tag (tokenized, input);
 795
 796    string output;
 797    vector<pair<string,bool> >::const_iterator token, tokens_end = tokenized.end();
 798    for (token = tokenized.begin(); token != tokens_end; ++token)
 799    {
 800       // keep HTML tags as they are
 801       if (token->second)
 802          output += token->first;
 803       else
 804          output += html_entities(token->first);
 805    }
 806
 807    return output;
 808 }
 809
 810
 811 string::size_type find_8bit(const std::string &str)
 812 {
 813    string::size_type l=str.size();
 814    for (string::size_type p=0; p < l; p++)
 815       if (static_cast<unsigned char>(str[p]) > 127)
 816          return p;
 817
 818    return string::npos;
 819 }
 820
 821 // encoded UTF-8 chars into HTML entities
 822 string html_entities(std::string str)
 823 {
 824    // Normal chars
 825    replace_all (str, "&", "&amp;");
 826    replace_all (str, "<", "&lt;");
 827    replace_all (str, ">", "&gt;");
 828    replace_all (str, "\"", "&quot;");
 829    replace_all (str, "'", "&#x27;");
 830    replace_all (str, "/", "&#x2F;");
 831
 832    // Umlauts
 833    replace_all (str, "\xC3\xA4", "&auml;");
 834    replace_all (str, "\xC3\xB6", "&ouml;");
 835    replace_all (str, "\xC3\xBC", "&uuml;");
 836    replace_all (str, "\xC3\x84", "&Auml;");
 837    replace_all (str, "\xC3\x96", "&Ouml;");
 838    replace_all (str, "\xC3\x9C", "&Uuml;");
 839
 840    // Misc
 841    replace_all (str, "\xC3\x9F", "&szlig;");
 842
 843    // conversion of remaining non-ASCII chars needed?
 844    // just do if needed because of performance
 845    if (find_8bit(str) != string::npos)
 846    {
 847       // convert to fixed-size encoding UTF-32
 848       wchar_t* wbuf=utf8_to_wbuf(str);
 849       ostringstream target;
 850
 851       // replace all non-ASCII chars with HTML representation
 852       for (int p=0; wbuf[p] != 0; p++)
 853       {
 854          unsigned int c=wbuf[p];
 855
 856          if (c <= 127)
 857             target << static_cast<unsigned char>(c);
 858          else
 859             target << "&#" << c << ';';
 860       }
 861
 862       free(wbuf);
 863
 864       str=target.str();
 865    }
 866
 867    return str;
 868 } // eo html_entities(std::string)
 869
 870 // convert HTML entities to something that can be viewed on a basic text console (restricted to ASCII-7)
 871 string html_entities_to_console(std::string str)
 872 {
 873    // Normal chars
 874    replace_all (str, "&amp;", "&");
 875    replace_all (str, "&lt;", "<");
 876    replace_all (str, "&gt;", ">");
 877    replace_all (str, "&quot;", "\"");
 878    replace_all (str, "&#x27;", "'");
 879    replace_all (str, "&#x2F;", "/");
 880
 881    // Umlauts
 882    replace_all (str, "&auml;", "ae");
 883    replace_all (str, "&ouml;", "oe");
 884    replace_all (str, "&uuml;", "ue");
 885    replace_all (str, "&Auml;", "Ae");
 886    replace_all (str, "&Ouml;", "Oe");
 887    replace_all (str, "&Uuml;", "Ue");
 888
 889    // Misc
 890    replace_all (str, "&szlig;", "ss");
 891
 892    return str;
 893 }
 894
 895 // find_html_comments + remove_html_comments(str, comments)
 896 void remove_html_comments(string &str)
 897 {
 898     vector<CommentZone> comments = find_html_comments(str);
 899     remove_html_comments(str, comments);
 900 }
 901
 902 // find all html comments, behaving correctly if they are nested; ignores comment tags ("<!--FOO .... BAR-->")
 903 // If there are invalid comments ("-->" before "<!--" or different number of closing and opening tags),
 904 // then the unknown index of corresponding start/end tag will be represented by a string::npos
 905 // Indices are from start of start tag until first index after closing tag
 906 vector<CommentZone> find_html_comments(const std::string &str)
 907 {
 908     static const string START = "<!--";
 909     static const string CLOSE = "-->";
 910     static const string::size_type START_LEN = START.length();
 911     static const string::size_type CLOSE_LEN = CLOSE.length();
 912
 913     vector<CommentZone> comments;
 914
 915     // in order to find nested comments, need either recursion or a stack
 916     vector<string::size_type> starts;      // stack of start tags
 917
 918     string::size_type pos = 0;
 919     string::size_type len = str.length();
 920     string::size_type next_start, next_close;
 921
 922     while (pos < len)     // not really needed but just in case
 923     {
 924         next_start = str.find(START, pos);
 925         next_close = str.find(CLOSE, pos);
 926
 927         if ( (next_start == string::npos) && (next_close == string::npos) )
 928             break;   // we are done
 929
 930         else if ( (next_start == string::npos) || (next_close < next_start) )  // close one comment (pop)
 931         {
 932             if (starts.empty())    // closing tag without a start
 933                 comments.push_back(CommentZone(string::npos, next_close+CLOSE_LEN));
 934             else
 935             {
 936                 comments.push_back(CommentZone(starts.back(), next_close+CLOSE_LEN));
 937                 starts.pop_back();
 938             }
 939             pos = next_close + CLOSE_LEN;
 940         }
 941
 942         else if ( (next_close == string::npos) || (next_start < next_close) )  // start a new comment (push)
 943         {
 944             starts.push_back(next_start);
 945             pos = next_start + START_LEN;
 946         }
 947     }
 948
 949     // add comments that have no closing tag from back to front (important for remove_html_comments!)
 950     while (!starts.empty())
 951     {
 952         comments.push_back(CommentZone(starts.back(), string::npos));
 953         starts.pop_back();
 954     }
 955
 956     return comments;
 957 }
 958
 959 // remove all html comments foundby find_html_comments
 960 void remove_html_comments(std::string &str, const vector<CommentZone> &comments)
 961 {
 962     // remember position where last removal started
 963     string::size_type last_removal_start = str.length();
 964
 965     // Go from back to front to not mess up indices.
 966     // This requires that bigger comments, that contain smaller comments, come AFTER
 967     // the small contained comments in the comments vector (i.e. comments are ordered by
 968     // their closing tag, not their opening tag). This is true for results from find_html_comments
 969     BOOST_REVERSE_FOREACH(const CommentZone &comment, comments)
 970     {
 971         if (comment.first == string::npos)
 972         {
 973             str = str.replace(0, comment.second, "");   // comment starts "before" str --> delete from start
 974             break;   // there can be no more
 975         }
 976         else if (comment.first >= last_removal_start)
 977         {
 978             continue;    // this comment is inside another comment that we have removed already
 979         }
 980         else if (comment.second == string::npos)   // comment ends "after" str --> delete until end
 981         {
 982             str = str.replace(comment.first, string::npos, "");
 983             last_removal_start = comment.first;
 984         }
 985         else
 986         {
 987             str = str.replace(comment.first, comment.second-comment.first, "");
 988             last_removal_start = comment.first;
 989         }
 990     }
 991 }
 992
 993 bool replace_all(string &base, const char *ist, const char *soll)
 994 {
 995    string i=ist;
 996    string s=soll;
 997    return replace_all(base,&i,&s);
 998 }
 999
1000 bool replace_all(string &base, const string &ist, const char *soll)
1001 {
1002    string s=soll;
1003    return replace_all(base,&ist,&s);
1004 }
1005
1006 bool replace_all(string &base, const string *ist, const string *soll)
1007 {
1008    return replace_all(base,*ist,*soll);
1009 }
1010
1011 bool replace_all(string &base, const char *ist, const string *soll)
1012 {
1013    string i=ist;
1014    return replace_all(base,&i,soll);
1015 }
1016
/**
 * @brief replaces every occurrence of ist within base by soll
 * @param base the string to modify in place
 * @param ist the text to search for; must not be empty
 * @param soll the text to insert instead
 * @return true if at least one occurrence was replaced, false otherwise
 * @throws runtime_error if ist is empty
 *
 * The search continues behind the inserted text, so occurrences of ist inside soll
 * are not replaced again.
 */
bool replace_all(string &base, const string &ist, const string &soll)
{
   if (ist.empty() )
      throw runtime_error ("replace_all called with empty search string");

   bool replaced_any = false;

   for (string::size_type hit = base.find(ist);
        hit != string::npos;
        hit = base.find(ist, hit + soll.size() ) )
   {
      base.replace(hit, ist.size(), soll);
      replaced_any = true;
   }

   return replaced_any;
}
1034
/**
 * @brief replaces all characters that could be problematic or impose a security risk when being logged
 * @param str the original string
 * @param replace_with the character to replace the unsafe chars with
 * @return a copy of str that is safe to send to syslog or other logfiles
 *
 * Only the chars from 0x20 (space) up to and including 0x7E (~) are considered safe for logging.
 * See e.g. RFC 5424, section 8.2 or the posix character class "printable".
 * Everything else (NUL, control characters, 8 bit chars, UTF8 sequences) is replaced.
 */
std::string sanitize_for_logging(const std::string &str, const char replace_with)
{
    std::string sanitized(str);

    for (std::string::iterator it = sanitized.begin(); it != sanitized.end(); ++it)
    {
        // 8 bit chars fail this test no matter if plain char is signed or unsigned
        const bool printable_ascii = (*it >= 0x20 && *it <= 0x7E);
        if (!printable_ascii)
            *it = replace_with;
    }

    return sanitized;
}
1057
// Disabled code: everything between "#if 0" and the matching "#endif" is skipped
// by the preprocessor, so the two functions below are not compiled.
#if 0
// returns a copy of src with tolower() applied to every char
string to_lower(const string &src)
{
   string dst = src;

   string::size_type pos, end = dst.size();
   for (pos = 0; pos < end; pos++)
      dst[pos] = tolower(dst[pos]);

   return dst;
}

// returns a copy of src with toupper() applied to every char
string to_upper(const string &src)
{
   string dst = src;

   string::size_type pos, end = dst.size();
   for (pos = 0; pos < end; pos++)
      dst[pos] = toupper(dst[pos]);

   return dst;
}
#endif
1081
// number of entries in each of the unit symbol tables below
const int MAX_UNIT_FORMAT_SYMBOLS = 6;

// unit symbols used by nice_unit_format for ShortUnitFormat;
// the index is the number of times the value was divided by the unit base
const string shortUnitFormatSymbols[MAX_UNIT_FORMAT_SYMBOLS] = {
        " B",
        " KB",
        " MB",
        " GB",
        " TB",
        " PB"
};

// unit symbols used by nice_unit_format for every other format, same indexing as above.
// The texts are wrapped in i18n_noop() here and passed through i18n() when they are printed
// (presumably i18n_noop only marks them for translation -- see i18n.h).
const string longUnitFormatSymbols[MAX_UNIT_FORMAT_SYMBOLS] = {
        i18n_noop(" Bytes"),
        i18n_noop(" KBytes"),
        i18n_noop(" MBytes"),
        i18n_noop(" GBytes"),
        i18n_noop(" TBytes"),
        i18n_noop(" PBytes")
};
1101
1102
/**
 * @brief rounds a number "half up" to the nearest multiple of 1/rounding_multiplier
 * @param number the value to round
 * @param rounding_multiplier number of steps per unit, e.g. 10 rounds to one decimal place;
 *        must not be 0 (the result is divided by it)
 * @return the rounded value
 */
long double rounding_upwards(
        const long double number,
        const int rounding_multiplier
)
{
    const long double scaled = number * rounding_multiplier;

    // floor(x + 0.5) rounds half up for negative values as well (a cast to an integer
    // would truncate towards zero) and cannot overflow like a conversion to int64_t
    const long double rounded = std::floor(scaled + 0.5L);

    return rounded / static_cast<long double>(rounding_multiplier);
}
1116
1117
1118 string nice_unit_format(
1119         const int64_t input,
1120         const UnitFormat format,
1121         const UnitBase base
1122 )
1123 {
1124    // select the system of units (decimal or binary)
1125    int multiple = 0;
1126    if (base == UnitBase1000)
1127    {
1128        multiple = 1000;
1129    }
1130    else
1131    {
1132        multiple = 1024;
1133    }
1134
1135    long double size = input;
1136
1137    // check the size of the input number to fit in the appropriate symbol
1138    int sizecount = 0;
1139    while (size > multiple)
1140    {
1141        size = size / multiple;
1142        sizecount++;
1143
1144        // rollback to the previous values and stop the loop when cannot
1145        // represent the number length.
1146        if (sizecount >= MAX_UNIT_FORMAT_SYMBOLS)
1147        {
1148            size = size * multiple;
1149            sizecount--;
1150            break;
1151        }
1152    }
1153
1154    // round the input number "half up" to multiples of 10
1155    const int rounding_multiplier = 10;
1156    size = rounding_upwards(size, rounding_multiplier);
1157
1158    // format the input number, placing the appropriate symbol
1159    ostringstream out;
1160    out.setf (ios::fixed);
1161    if (format == ShortUnitFormat)
1162    {
1163        out.precision(1);
1164        out << size << i18n( shortUnitFormatSymbols[sizecount].c_str() );
1165    }
1166    else
1167    {
1168        out.precision (2);
1169        out << size << i18n( longUnitFormatSymbols[sizecount].c_str() );
1170    }
1171
1172    return out.str();
1173 } // eo nice_unit_format(int input)
1174
1175
1176 string nice_unit_format(
1177         const double input,
1178         const UnitFormat format,
1179         const UnitBase base
1180 )
1181 {
1182     // round as double and cast to int64_t
1183     // cast raised overflow error near max val of int64_t (~9.2e18, see unittest)
1184     int64_t input_casted_and_rounded =
1185         boost::numeric_cast<int64_t>( round(input) );
1186
1187     // now call other
1188     return nice_unit_format( input_casted_and_rounded, format, base );
1189 } // eo nice_unit_format(double input)
1190
1191
/**
 * @brief converts a string into a quoted and escaped representation
 * @param s the string to escape
 * @return s enclosed in double quotes; every double quote and backslash is prefixed
 *         with a backslash, carriage return becomes \r and newline becomes \n (two chars each)
 */
string escape(const string &s)
{
   string out;
   out.reserve(s.size() + 2);   // at least the two enclosing quotes are added

   out += '"';
   for (string::const_iterator it = s.begin(); it != s.end(); ++it)
   {
      switch (*it)
      {
         case '"':
         case '\\':
            out += '\\';
            out += *it;
            break;
         case '\r':
            out += "\\r";
            break;
         case '\n':
            out += "\\n";
            break;
         default:
            out += *it;
      }
   }
   out += '"';

   return out;
} // eo escape(const std::string&)
1222
1223
/**
 * @brief extracts and unescapes a double quoted string as produced by escape()
 * @param s the text containing the escaped string
 * @param startpos position of the opening double quote within s
 * @param[out] endpos is set to the position of the closing double quote within s
 * @return the unescaped content between the quotes: \r and \n become carriage return
 *         and newline, for every other backslash sequence the backslash is dropped
 * @throws out_of_range if startpos is outside of s, if there is no double quote at
 *         startpos or if the string has no closing double quote
 */
string descape(const string &s, int startpos, int &endpos)
{
   if (s.at(startpos) != '"')
      throw out_of_range("value not type escaped string");

   string out = s.substr(startpos+1);

   // search for the end of the string
   string::size_type p = 0;
   while ( (p = out.find('"', p) ) != string::npos)
   {
      // the " might be escaped with a backslash: it only is if the number
      // of backslashes directly in front of it is odd
      string::size_type backslashes = 0;
      while (backslashes < p && out[p - 1 - backslashes] == '\\')
         ++backslashes;

      if (backslashes % 2 == 0)
         break;

      ++p;
   }

   // without a closing quote there is no valid end position to report
   if (p == string::npos)
      throw out_of_range("escaped string is not terminated");

   // we now have the end of the string
   out.erase(p);

   // tell calling prog about the endposition
   endpos = startpos + static_cast<int>(p) + 1;

   // descape all \ stuff inside the string now
   p = 0;
   while ( (p = out.find('\\', p) ) != string::npos)
   {
      switch (out.at(p+1) )
      {
         case 'r':
            out.replace(p, 2, "\r");
            break;
         case 'n':
            out.replace(p, 2, "\n");
            break;
         default:
            // drop the backslash, the char behind it moves to position p
            out.erase(p, 1);
      }
      // step over the char that was just produced so it is not interpreted again
      ++p;
   }

   return out;
} // eo descape(const std::string&,int,int&)
1283
1284
/**
 * @brief encloses a string in single quotes so it can be used as one shell argument
 * @param input the raw argument
 * @return input wrapped in single quotes; every single quote inside input is
 *         written as '\'' (close the quoting, escaped quote, open the quoting again)
 */
string escape_shellarg(const string &input)
{
   string quoted(1, '\'');

   for (string::size_type idx = 0; idx < input.size(); ++idx)
   {
      const char current = input[idx];

      // leave the quoting and add a backslash; the quote char itself is appended below
      // and is followed by the quote that opens the quoting again
      if (current == '\'')
         quoted.append("'\\'");

      quoted.push_back(current);
   }

   quoted.push_back('\'');
   return quoted;
}