developer.intra2net.com Git - libi2ncommon/blob - src/stringfunc.cpp

   1 /*
   2 The software in this package is distributed under the GNU General
   3 Public License version 2 (with a special exception described below).
   4
   5 A copy of GNU General Public License (GPL) is included in this distribution,
   6 in the file COPYING.GPL.
   7
   8 As a special exception, if other files instantiate templates or use macros
   9 or inline functions from this file, or you compile this file and link it
  10 with other works to produce a work based on this file, this file
  11 does not by itself cause the resulting work to be covered
  12 by the GNU General Public License.
  13
  14 However the source code for this file must still be made available
  15 in accordance with section (3) of the GNU General Public License.
  16
  17 This exception does not invalidate any other reasons why a work based
  18 on this file might be covered by the GNU General Public License.
  19 */
  20 /** @file
  21  *
  22  * (c) Copyright 2007-2008 by Intra2net AG
  23  */
  24
  25 #include <iostream>
  26 #include <string>
  27 #include <sstream>
  28 #include <stdexcept>
  29 #include <algorithm>
  30 #include <cmath>    // for round()
  31
  32 #include <wchar.h>
  33 #include <stdlib.h>
  34 #include <iconv.h>
  35 #include <i18n.h>
  36
  37 #include <boost/numeric/conversion/cast.hpp>
  38
  39 #include <stringfunc.hxx>
  40
  41 using namespace std;
  42
  43 namespace I2n
  44 {
  45
  46
  namespace
  {

  /// Hex digit alphabets; the character at index i is the digit for nibble value i (0..15).
  const std::string hexDigitsLower("0123456789abcdef");
  const std::string hexDigitsUpper("0123456789ABCDEF");


  /**
   * @brief functor mapping a single char to its upper case form via std::toupper.
   */
  struct UpperFunc
  {
     char operator() (char c)
     {
        // std::toupper is only defined for values representable as unsigned char
        // (or EOF); a negative plain char (any 8 bit byte where char is signed)
        // must therefore be converted first.
        return static_cast<char>( std::toupper( static_cast<unsigned char>(c) ) );
     }
  }; // eo struct UpperFunc


  /**
   * @brief functor mapping a single char to its lower case form via std::tolower.
   */
  struct LowerFunc
  {
     char operator() (char c)
     {
        // same precondition as in UpperFunc: go through unsigned char.
        return static_cast<char>( std::tolower( static_cast<unsigned char>(c) ) );
     }
  }; // eo struct LowerFunc


  } // eo namespace <anonymous>
  73
  74
  75
  /**
   * Default set of whitespace characters: blank, tab, carriage return and line feed.
   */
  const std::string Whitespaces(" \t\r\n");

  /**
   * Default set of line ending characters: carriage return and line feed.
   */
  const std::string LineEndings("\r\n");
  85
  86
  87
  /**
   * @brief tests whether a string starts with a given prefix.
   * @param str the string to examine.
   * @param prefix the prefix to look for.
   * @return @a true iff @a prefix is not empty and @a str begins with it.
   */
  bool has_prefix(const std::string& str, const std::string& prefix)
  {
     const std::string::size_type prefix_len = prefix.size();
     if (prefix_len == 0 || prefix_len > str.size())
     {
        // an empty prefix never matches; a prefix longer than the string cannot match
        return false;
     }
     return str.compare(0, prefix_len, prefix) == 0;
  } // eo has_prefix(const std::string&,const std::string&)
 102
 103
 /**
  * @brief tests whether a string ends with a given suffix.
  * @param str the string to examine.
  * @param suffix the suffix to look for.
  * @return @a true iff @a suffix is not empty and @a str ends with it.
  */
 bool has_suffix(const std::string& str, const std::string& suffix)
 {
    if (suffix.empty() || suffix.size() > str.size())
    {
       // an empty suffix never matches; a suffix longer than the string cannot match
       return false;
    }
    // compare back to front; the guard above ensures str has enough characters
    return std::equal(suffix.rbegin(), suffix.rend(), str.rbegin());
 } // eo has_suffix(const std::string&,const std::string&)
 118
 119
 /**
  * @brief strips characters of a given set from both ends of a string (in place).
  * @param[in,out] str the string to trim; it is modified.
  * @param charlist the characters to strip from the beginning and the end.
  * @return a copy of the trimmed string.
  */
 std::string trim_mod(std::string& str, const std::string& charlist)
 {
    const std::string::size_type first_kept = str.find_first_not_of(charlist);
    if (first_kept == std::string::npos)
    {
       // nothing survives: the string is empty or made up of charlist characters only
       str.clear();
    }
    else
    {
       // at least one character is kept, so find_last_not_of cannot fail here
       const std::string::size_type last_kept = str.find_last_not_of(charlist);
       str.erase(last_kept + 1);   // cut the tail first so first_kept stays valid
       str.erase(0, first_kept);   // then cut the head
    }
    return str;
 } // eo trim_mod(std::string&,const std::string&)
 149
 150
 151
 /**
  * @brief drops the last character of a string if it belongs to a given set (in place).
  * @param[in,out] str the string; it is modified.
  * @param what the set of characters which get removed.
  * @return a copy of the resulting string.
  */
 std::string chomp_mod(std::string& str, const std::string& what)
 {
    if (!str.empty() && !what.empty())
    {
       const std::string::size_type last = str.size() - 1;
       if (what.find(str[last]) != std::string::npos)
       {
          str.erase(last);
       }
    }
    return str;
 } // eo chomp_mod(std::string&,const std::string&)
 170
 171
 172 /**
 173  * @brief converts a string to lower case.
 174  * @param[in,out] str the string to modify.
 175  * @return the string
 176  */
 177 std::string to_lower_mod(std::string& str)
 178 {
 179    std::transform(str.begin(), str.end(), str.begin(), LowerFunc() );
 180    return str;
 181 } // eo to_lower_mod(std::string&)
 182
 183
 184 /**
 185  * @brief converts a string to upper case.
 186  * @param[in,out] str the string to modify.
 187  * @return the string
 188  */
 189 std::string to_upper_mod(std::string& str)
 190 {
 191    std::transform( str.begin(), str.end(), str.begin(), UpperFunc() );
 192    return str;
 193 } // eo to_upper_mod(std::string&)
 194
 195
 196
 /**
  * @brief returns a copy of a string with characters of a given set stripped from both ends.
  * @param str the string to trim (left untouched).
  * @param charlist the characters to strip from the beginning and the end.
  * @return the trimmed copy.
  */
 std::string trim (const std::string& str, const std::string& charlist)
 {
    const std::string::size_type first_kept = str.find_first_not_of(charlist);
    if (first_kept != std::string::npos)
    {
       // a kept character exists, so the search from the end succeeds as well
       const std::string::size_type last_kept = str.find_last_not_of(charlist);
       return std::string(str, first_kept, last_kept - first_kept + 1);
    }
    // empty input, or input made up of charlist characters only
    return std::string();
 } // eo trim(const std::string&,const std::string&)
 216
 217
 /**
  * @brief returns a copy of a string without its last character if that character belongs to a given set.
  * @param str the string (left untouched).
  * @param what the set of characters which get removed.
  * @return the resulting copy.
  */
 std::string chomp (const std::string& str, const std::string& what)
 {
    const bool strip_last =
       !str.empty()
       && !what.empty()
       && what.find(*str.rbegin()) != std::string::npos;
    return strip_last ? std::string(str, 0, str.size() - 1) : str;
 } // eo chomp(const std::string&,const std::string&)
 236
 237
 238 /**
 239  * @brief returns a lower case version of a given string.
 240  * @param str the string
 241  * @return the lower case version of the string
 242  */
 243 std::string to_lower (const std::string& str)
 244 {
 245    std::string result(str);
 246    return to_lower_mod(result);
 247 } // eo to_lower(const std::string&)
 248
 249
 250 /**
 251  * @brief returns a upper case version of a given string.
 252  * @param str the string
 253  * @return the upper case version of the string
 254  */
 255 std::string to_upper(const std::string& str)
 256 {
 257    std::string result(str);
 258    return to_upper_mod(result);
 259 } // eo to_upper(const std::string&)
 260
 261
 262
 263 /**
 264  * @brief removes a given suffix from a string.
 265  * @param str the string.
 266  * @param suffix the suffix which should be removed if the string ends with it.
 267  * @return the string without the suffix.
 268  *
 269  * If the string ends with the suffix, it is removed. If the the string doesn't end
 270  * with the suffix the original string is returned.
 271  */
 272 std::string remove_suffix(const std::string& str, const std::string& suffix)
 273 {
 274    if (has_suffix(str,suffix) )
 275    {
 276       return str.substr(0, str.size()-suffix.size() );
 277    }
 278    return str;
 279 } // eo remove_suffix(const std::string&,const std::string&)
 280
 281
 282
 283 /**
 284  * @brief removes a given prefix from a string.
 285  * @param str the string.
 286  * @param prefix the prefix which should be removed if the string begins with it.
 287  * @return the string without the prefix.
 288  *
 289  * If the string begins with the prefix, it is removed. If the the string doesn't begin
 290  * with the prefix the original string is returned.
 291  */
 292 std::string remove_prefix(const std::string& str, const std::string& prefix)
 293 {
 294    if (has_prefix(str,prefix) )
 295    {
 296       return str.substr( prefix.size() );
 297    }
 298    return str;
 299 } // eo remove_prefix(const std::string&,const std::string&)
 300
 301
 302 /**
 303  * split a string to key and value delimited by a given delimiter.
 304  * The resulting key and value strings are trimmed (Whitespaces removed at beginning and end).
 305  * @param str the string which should be splitted.
 306  * @param[out] key the resulting key
 307  * @param[out] value the resulting value
 308  * @param delimiter the delimiter between key and value; default is '='.
 309  * @return @a true if the split was successful.
 310  */
 311 bool pair_split(
 312    const std::string& str,
 313    std::string& key,
 314    std::string& value,
 315    char delimiter)
 316 {
 317    std::string::size_type pos = str.find (delimiter);
 318    if (pos == std::string::npos) return false;
 319    key= str.substr(0,pos);
 320    value= str.substr(pos+1);
 321    trim_mod(key);
 322    trim_mod(value);
 323    return true;
 324 } // eo pair_split(const std::string&,std::string&,std::string&,char)
 325
 326
 327 /**
 328  * splits a string by given delimiter
 329  *
 330  * @param[in] str the string which should be splitted.
 331  * @param[out] result the list resulting from splitting  @a str.
 332  * @param[in] delimiter the delimiter (word/phrase) at which @a str should be splitted.
 333  * @param[in] omit_empty should empty parts not be stored?
 334  * @param[in] trim_list list of characters the parts should be trimmed by.
 335  *  (empty string results in no trim)
 336  */
 337 void split_string(
 338    const std::string& str,
 339    std::list<std::string>& result,
 340    const std::string& delimiter,
 341    bool omit_empty,
 342    const std::string& trim_list
 343 )
 344 {
 345    std::string::size_type pos, last_pos=0;
 346    bool delimiter_found= false;
 347    while ( last_pos < str.size()  && last_pos != std::string::npos)
 348    {
 349       pos= str.find(delimiter, last_pos);
 350       std::string part;
 351       if (pos == std::string::npos)
 352       {
 353          part= str.substr(last_pos);
 354          delimiter_found= false;
 355       }
 356       else
 357       {
 358          part= str.substr(last_pos, pos-last_pos);
 359          delimiter_found=true;
 360       }
 361       if (pos != std::string::npos)
 362       {
 363          last_pos= pos+ delimiter.size();
 364       }
 365       else
 366       {
 367          last_pos= std::string::npos;
 368       }
 369       if (!trim_list.empty() ) trim_mod (part, trim_list);
 370       if (omit_empty && part.empty() ) continue;
 371       result.push_back( part );
 372    }
 373    // if the string ends with a delimiter we need to append an empty string if no omit_empty
 374    // was given.
 375    // (this way we keep the split result consistent to a join operation)
 376    if (delimiter_found && !omit_empty)
 377    {
 378       result.push_back("");
 379    }
 380 } // eo split_string(const std::string&,std::list< std::string >&,const std::string&,bool,const std::string&)
 381
 382
 383 /**
 384  * splits a string by a given delimiter
 385  * @param str the string which should be splitted.
 386  * @param delimiter delimiter the delimiter (word/phrase) at which @a str should be splitted.
 387  * @param[in] omit_empty should empty parts not be stored?
 388  * @param[in] trim_list list of characters the parts should be trimmed by.
 389  *  (empty string results in no trim)
 390  * @return the list resulting from splitting @a str.
 391  */
 392 std::list<std::string> split_string(
 393    const std::string& str,
 394    const std::string& delimiter,
 395    bool omit_empty,
 396    const std::string& trim_list
 397 )
 398 {
 399    std::list<std::string> result;
 400    split_string(str, result, delimiter, omit_empty, trim_list);
 401    return result;
 402 } // eo split_string(const std::string&,const std::string&,bool,const std::string&)
 403
 404
 /**
  * @brief concatenates a list of strings, separated by a delimiter.
  *
  * This function is (basically) the reverse operation of @a split_string.
  *
  * @param parts the strings to join.
  * @param delimiter the text inserted between two consecutive strings.
  * @return the joined string; empty if @a parts is empty.
  */
 std::string join_string(
    const std::list< std::string >& parts,
    const std::string& delimiter
 )
 {
    std::string joined;
    for (std::list< std::string >::const_iterator it= parts.begin();
         it != parts.end();
         ++it)
    {
       if (it != parts.begin() )
       {
          // every element but the first is preceded by the delimiter
          joined+= delimiter;
       }
       joined+= *it;
    }
    return joined;
 } // eo join_string(const std::list< std::string >&,const std::string&)
 432
 433
 /**
  * @brief concatenates a vector of strings, separated by a delimiter.
  * @param parts the strings to join.
  * @param delimiter the text inserted between two consecutive strings.
  * @return the joined string; empty if @a parts is empty.
  */
 std::string join_string(
    const std::vector< std::string >& parts,
    const std::string& delimiter
 )
 {
    std::string joined;
    for (std::vector< std::string >::size_type idx= 0; idx < parts.size(); ++idx)
    {
       if (idx > 0)
       {
          // every element but the first is preceded by the delimiter
          joined+= delimiter;
       }
       joined+= parts[idx];
    }
    return joined;
 } // eo join_string(const std::vector< std::string >&,const std::string&)
 453
 454
 455
 456 /*
 457 ** conversions
 458 */
 459
 460
 461 /**
 462  * @brief returns a hex string from a binary string.
 463  * @param str the (binary) string
 464  * @param upper_case_digits determine whether to use upper case characters for digits A-F.
 465  * @return the string in hex notation.
 466  */
 467 std::string convert_binary_to_hex(
 468    const std::string& str,
 469    bool upper_case_digits
 470 )
 471 {
 472    std::string result;
 473    std::string hexDigits(upper_case_digits ? hexDigitsUpper : hexDigitsLower);
 474    for ( std::string::const_iterator it= str.begin();
 475          it != str.end();
 476          ++it)
 477    {
 478       result.push_back( hexDigits[ ( (*it) >> 4) & 0x0f ] );
 479       result.push_back( hexDigits[ (*it) & 0x0f ] );
 480    }
 481    return result;
 482 } // eo convert_binary_to_hex(const std::string&,bool)
 483
 484
 485 /**
 486  * @brief converts a hex digit string to binary string.
 487  * @param str hex digit string
 488  * @return the binary string.
 489  *
 490  * The hex digit string may contains white spaces or colons which are treated
 491  * as delimiters between hex digit groups.
 492  *
 493  * @todo rework the handling of half nibbles (consistency)!
 494  */
 495 std::string convert_hex_to_binary(
 496    const std::string& str
 497 )
 498 throw (std::runtime_error)
 499 {
 500    std::string result;
 501    char c= 0;
 502    bool hasNibble= false;
 503    bool lastWasWS= true;
 504    for ( std::string::const_iterator it= str.begin();
 505          it != str.end();
 506          ++it)
 507    {
 508       std::string::size_type p = hexDigitsLower.find( *it );
 509       if (p== std::string::npos)
 510       {
 511          p= hexDigitsUpper.find( *it );
 512       }
 513       if (p == std::string::npos)
 514       {
 515          if (   ( Whitespaces.find( *it ) != std::string::npos) // is it a whitespace?
 516                 or ( *it == ':') // or a colon?
 517             )
 518          {
 519             // we treat that as a valid delimiter:
 520             if (hasNibble)
 521             {
 522                // 1 nibble before WS is treate as lower part:
 523                result.push_back(c);
 524                // reset state:
 525                hasNibble= false;
 526             }
 527             lastWasWS= true;
 528             continue;
 529          }
 530       }
 531       if (p == std::string::npos )
 532       {
 533          throw runtime_error("illegal character in hex digit string: " + str);
 534       }
 535       lastWasWS= false;
 536       if (hasNibble)
 537       {
 538          c<<=4;
 539       }
 540       else
 541       {
 542          c=0;
 543       }
 544       c+= (p & 0x0f);
 545       if (hasNibble)
 546       {
 547          //we already had a nibble, so a char is complete now:
 548          result.push_back( c );
 549          hasNibble=false;
 550       }
 551       else
 552       {
 553          // this is the first nibble of a new char:
 554          hasNibble=true;
 555       }
 556    }
 557    if (hasNibble)
 558    {
 559       //well, there is one nibble left
 560       // let's do some heuristics:
 561       if (lastWasWS)
 562       {
 563          // if the preceeding character was a white space (or a colon)
 564          // we treat the nibble as lower part:
 565          //( this is consistent with shortened hex notations where leading zeros are not noted)
 566          result.push_back( c );
 567       }
 568       else
 569       {
 570          // if it was part of a hex digit chain, we treat it as UPPER part (!!)
 571          result.push_back( c << 4 );
 572       }
 573    }
 574    return result;
 575 } // eo convert_hex_to_binary(const std::string&)
 576
 577
 578 } // eo namespace I2n
 579
 580
 581
 582
 /**
  * @brief converts a string from ISO-8859-1 to UTF-8 using iconv.
  * @param isostring the ISO-8859-1 encoded string.
  * @return the UTF-8 encoded string. If iconv reports an error during the
  *         conversion, the part converted so far is returned (best effort).
  * @throw std::runtime_error if the conversion cannot be set up or the
  *        work buffer cannot be allocated.
  */
 std::string iso_to_utf8(const std::string& isostring)
 {
    iconv_t i2utf8 = iconv_open("UTF-8", "ISO-8859-1");

    // check the descriptor (the original compared the function name instead)
    if (i2utf8 == (iconv_t)-1)
       throw std::runtime_error("iconv can't convert from ISO-8859-1 to UTF-8");

    size_t in_size = isostring.size();
    const size_t buf_size = in_size*4;   // generous upper bound for the output
    size_t out_size = buf_size;

    char *buf = static_cast<char *>(malloc(buf_size+1));
    if (buf == NULL)
    {
       iconv_close(i2utf8);   // don't leak the descriptor
       throw std::runtime_error("out of memory for iconv buffer");
    }

    // iconv takes a non-const input pointer
    char *in = const_cast<char *>(isostring.c_str());
    char *out = buf;
    // the return value is ignored on purpose: keep whatever could be converted
    iconv(i2utf8, &in, &in_size, &out, &out_size);

    // buf_size - out_size bytes were written; copy by length so that embedded
    // NUL bytes do not cut the result short
    std::string result(buf, buf_size - out_size);

    free(buf);
    iconv_close(i2utf8);

    return result;
 }
 612
 /**
  * @brief converts a string from UTF-8 to ISO-8859-1 using iconv.
  * @param utf8string the UTF-8 encoded string.
  * @return the ISO-8859-1 encoded string. If iconv reports an error during the
  *         conversion, the part converted so far is returned (best effort).
  * @throw std::runtime_error if the conversion cannot be set up or the
  *        work buffer cannot be allocated.
  */
 std::string utf8_to_iso(const std::string& utf8string)
 {
    iconv_t utf82iso = iconv_open("ISO-8859-1","UTF-8");

    if (utf82iso == (iconv_t)-1)
       throw std::runtime_error("iconv can't convert from UTF-8 to ISO-8859-1");

    size_t in_size = utf8string.size();
    const size_t buf_size = in_size;   // output buffer is as large as the input
    size_t out_size = buf_size;

    char *buf = static_cast<char *>(malloc(buf_size+1));
    if (buf == NULL)
    {
       iconv_close(utf82iso);   // don't leak the descriptor
       throw std::runtime_error("out of memory for iconv buffer");
    }

    // iconv takes a non-const input pointer
    char *in = const_cast<char *>(utf8string.c_str());
    char *out = buf;
    // the return value is ignored on purpose: keep whatever could be converted
    iconv(utf82iso, &in, &in_size, &out, &out_size);

    // buf_size - out_size bytes were written; copy by length so that embedded
    // NUL bytes do not cut the result short
    std::string result(buf, buf_size - out_size);

    free(buf);
    iconv_close(utf82iso);

    return result;
 }
 642
 /**
  * @brief converts a UTF-8 string into a zero terminated buffer of wide characters.
  *
  * The conversion target is "UCS-4LE".
  * NOTE(review): this presumably assumes a 4 byte, little endian wchar_t - confirm
  * for the supported platforms.
  *
  * @param utf8string the UTF-8 encoded string.
  * @return a buffer allocated with malloc(); the caller has to release it with free().
  * @throw std::runtime_error if the conversion cannot be set up, the buffer cannot
  *        be allocated or the input cannot be converted.
  */
 wchar_t* utf8_to_wbuf(const std::string& utf8string)
 {
    iconv_t utf82wstr = iconv_open("UCS-4LE","UTF-8");

    if (utf82wstr == (iconv_t)-1)
       throw std::runtime_error("iconv can't convert from UTF-8 to UCS-4");

    size_t in_size = utf8string.size();
    // one wide char per input byte plus one for the terminator
    const size_t buf_size = (in_size+1)*sizeof(wchar_t);
    size_t out_size = buf_size;

    wchar_t *buf = static_cast<wchar_t *>(malloc(buf_size));
    if (buf == NULL)
    {
       iconv_close(utf82wstr);   // don't leak the descriptor
       throw std::runtime_error("out of memory for iconv buffer");
    }

    // iconv takes a non-const input pointer and works on bytes
    char *in = const_cast<char *>(utf8string.c_str());
    char *out = reinterpret_cast<char *>(buf);
    const size_t rc = iconv(utf82wstr, &in, &in_size, &out, &out_size);

    // the descriptor is not needed any more, no matter how the conversion went
    iconv_close(utf82wstr);

    if (rc == (size_t)-1)
    {
       free(buf);   // don't leak the buffer on failure
       throw std::runtime_error("error converting char encodings");
    }

    // terminate right behind the last wide char written
    buf[ (buf_size - out_size) / sizeof(wchar_t) ] = 0;

    return buf;
 }
 668
 /**
  * @brief converts a string from UTF-7-IMAP to UTF-8 using iconv.
  * @param utf7imapstring the UTF-7-IMAP encoded string.
  * @return the UTF-8 encoded string. If iconv reports an error during the
  *         conversion, the part converted so far is returned (best effort).
  * @throw std::runtime_error if the conversion cannot be set up (e.g. the iconv
  *        implementation does not know "UTF-7-IMAP") or the work buffer cannot
  *        be allocated.
  */
 std::string utf7imap_to_utf8(const std::string& utf7imapstring)
 {
    iconv_t utf7imap2utf8 = iconv_open("UTF-8","UTF-7-IMAP");

    if (utf7imap2utf8 == (iconv_t)-1)
       throw std::runtime_error("iconv can't convert from UTF-7-IMAP to UTF-8");

    size_t in_size = utf7imapstring.size();
    const size_t buf_size = in_size*4;   // generous upper bound for the output
    size_t out_size = buf_size;

    char *buf = static_cast<char *>(malloc(buf_size+1));
    if (buf == NULL)
    {
       iconv_close(utf7imap2utf8);   // don't leak the descriptor
       throw std::runtime_error("out of memory for iconv buffer");
    }

    // iconv takes a non-const input pointer
    char *in = const_cast<char *>(utf7imapstring.c_str());
    char *out = buf;
    // the return value is ignored on purpose: keep whatever could be converted
    iconv(utf7imap2utf8, &in, &in_size, &out, &out_size);

    // buf_size - out_size bytes were written; copy by length so that embedded
    // NUL bytes do not cut the result short
    std::string result(buf, buf_size - out_size);

    free(buf);
    iconv_close(utf7imap2utf8);

    return result;
 }
 698
 /**
  * @brief converts a string from UTF-8 to UTF-7-IMAP using iconv.
  * @param utf8string the UTF-8 encoded string.
  * @return the UTF-7-IMAP encoded string. If iconv reports an error during the
  *         conversion (including a too small output buffer), the part converted
  *         so far is returned (best effort).
  * @throw std::runtime_error if the conversion cannot be set up (e.g. the iconv
  *        implementation does not know "UTF-7-IMAP") or the work buffer cannot
  *        be allocated.
  */
 std::string utf8_to_utf7imap(const std::string& utf8string)
 {
    iconv_t utf82utf7imap = iconv_open("UTF-7-IMAP", "UTF-8");

    if (utf82utf7imap == (iconv_t)-1)
       throw std::runtime_error("iconv can't convert from UTF-8 to UTF-7-IMAP");

    // UTF-7 is base64 encoded, a buffer 10x as large
    // as the utf-8 buffer should be enough. If not the string will be truncated.
    size_t in_size = utf8string.size();
    const size_t buf_size = in_size*10;
    size_t out_size = buf_size;

    char *buf = static_cast<char *>(malloc(buf_size+1));
    if (buf == NULL)
    {
       iconv_close(utf82utf7imap);   // don't leak the descriptor
       throw std::runtime_error("out of memory for iconv buffer");
    }

    // iconv takes a non-const input pointer
    char *in = const_cast<char *>(utf8string.c_str());
    char *out = buf;
    // the return values are ignored on purpose: keep whatever could be converted
    iconv(utf82utf7imap, &in, &in_size, &out, &out_size);

    // UTF-7 is a stateful encoding: a call without input makes iconv write the
    // bytes needed to return to the initial shift state (pending base64 bits and
    // the end of the base64 section). Without it, input ending in a non-ASCII
    // character yields an incomplete result.
    iconv(utf82utf7imap, NULL, NULL, &out, &out_size);

    // buf_size - out_size bytes were written
    std::string result(buf, buf_size - out_size);

    free(buf);
    iconv_close(utf82utf7imap);

    return result;
 }
 730
 /**
  * @brief splits a string into (html) tags and the text between them.
  *
  * Each token is appended to @a tokenized as a pair: first is the token text,
  * second is @a true for a tag (a sequence starting with '<' and ending with the
  * next '>') and @a false for anything else. A '<' always starts a new token;
  * text collected before it - including an unfinished tag - is stored as non-tag.
  *
  * @param[out] tokenized the vector the tokens are appended to.
  * @param input the string to tokenize.
  */
 void tokenize_by_tag(vector<pair<string,bool> > &tokenized, const std::string &input)
 {
    bool inside_tag = false;
    std::string current;

    for (std::string::const_iterator it = input.begin(); it != input.end(); ++it)
    {
       const char ch = *it;
       const bool opens_tag = (ch == '<');
       const bool closes_tag = (ch == '>') && inside_tag;

       // text collected in front of a '<' forms a token of its own
       if (opens_tag && !current.empty() )
       {
          tokenized.push_back( std::make_pair(current, false) );
          current.clear();
       }

       current += ch;

       if (opens_tag)
       {
          inside_tag = true;
       }
       else if (closes_tag)
       {
          // the tag is complete (current holds at least the '>')
          tokenized.push_back( std::make_pair(current, true) );
          current.clear();
          inside_tag = false;
       }
    }

    // whatever is left over (plain text or an unfinished tag) counts as non-tag
    if (!current.empty() )
       tokenized.push_back( std::make_pair(current, false) );
 } // eo tokenize_by_tag
 770
 771
 772 std::string strip_html_tags(const std::string &input)
 773 {
 774    // Pair first: string, second: isTag
 775    vector<pair<string,bool> > tokenized;
 776    tokenize_by_tag (tokenized, input);
 777
 778    string output;
 779    vector<pair<string,bool> >::const_iterator token, tokens_end = tokenized.end();
 780    for (token = tokenized.begin(); token != tokens_end; ++token)
 781       if (!token->second)
 782          output += token->first;
 783
 784    return output;
 785 } // eo strip_html_tags
 786
 787
 788 // Smart-encode HTML en
 789 string smart_html_entities(const std::string &input)
 790 {
 791    // Pair first: string, second: isTag
 792    vector<pair<string,bool> > tokenized;
 793    tokenize_by_tag (tokenized, input);
 794
 795    string output;
 796    vector<pair<string,bool> >::const_iterator token, tokens_end = tokenized.end();
 797    for (token = tokenized.begin(); token != tokens_end; ++token)
 798    {
 799       // keep HTML tags as they are
 800       if (token->second)
 801          output += token->first;
 802       else
 803          output += html_entities(token->first);
 804    }
 805
 806    return output;
 807 }
 808
 809
 /**
  * @brief finds the first byte of a string which is not 7 bit ASCII.
  * @param str the string to search.
  * @return the position of the first byte with a value above 127,
  *         or string::npos if there is none.
  */
 string::size_type find_8bit(const std::string &str)
 {
    std::string::size_type pos = 0;
    for (std::string::const_iterator it = str.begin(); it != str.end(); ++it, ++pos)
    {
       // values above 127 are exactly those with the top bit set
       if ( (static_cast<unsigned char>(*it) & 0x80) != 0)
          return pos;
    }

    return std::string::npos;
 }
 819
 820 // encoded UTF-8 chars into HTML entities
 821 string html_entities(std::string str)
 822 {
 823    // Normal chars
 824    replace_all (str, "&", "&amp;");
 825    replace_all (str, "<", "&lt;");
 826    replace_all (str, ">", "&gt;");
 827    replace_all (str, "\"", "&quot;");
 828    replace_all (str, "'", "&#x27;");
 829    replace_all (str, "/", "&#x2F;");
 830
 831    // Umlauts
 832    replace_all (str, "\xC3\xA4", "&auml;");
 833    replace_all (str, "\xC3\xB6", "&ouml;");
 834    replace_all (str, "\xC3\xBC", "&uuml;");
 835    replace_all (str, "\xC3\x84", "&Auml;");
 836    replace_all (str, "\xC3\x96", "&Ouml;");
 837    replace_all (str, "\xC3\x9C", "&Uuml;");
 838
 839    // Misc
 840    replace_all (str, "\xC3\x9F", "&szlig;");
 841
 842    // conversion of remaining non-ASCII chars needed?
 843    // just do if needed because of performance
 844    if (find_8bit(str) != string::npos)
 845    {
 846       // convert to fixed-size encoding UTF-32
 847       wchar_t* wbuf=utf8_to_wbuf(str);
 848       ostringstream target;
 849
 850       // replace all non-ASCII chars with HTML representation
 851       for (int p=0; wbuf[p] != 0; p++)
 852       {
 853          unsigned int c=wbuf[p];
 854
 855          if (c <= 127)
 856             target << static_cast<unsigned char>(c);
 857          else
 858             target << "&#" << c << ';';
 859       }
 860
 861       free(wbuf);
 862
 863       str=target.str();
 864    }
 865
 866    return str;
 867 } // eo html_entities(std::string)
 868
 869 // convert HTML entities to something that can be viewed on a basic text console (restricted to ASCII-7)
 870 string html_entities_to_console(std::string str)
 871 {
 872    // Normal chars
 873    replace_all (str, "&amp;", "&");
 874    replace_all (str, "&lt;", "<");
 875    replace_all (str, "&gt;", ">");
 876    replace_all (str, "&quot;", "\"");
 877    replace_all (str, "&#x27;", "'");
 878    replace_all (str, "&#x2F;", "/");
 879
 880    // Umlauts
 881    replace_all (str, "&auml;", "ae");
 882    replace_all (str, "&ouml;", "oe");
 883    replace_all (str, "&uuml;", "ue");
 884    replace_all (str, "&Auml;", "Ae");
 885    replace_all (str, "&Ouml;", "Oe");
 886    replace_all (str, "&Uuml;", "Ue");
 887
 888    // Misc
 889    replace_all (str, "&szlig;", "ss");
 890
 891    return str;
 892 }
 893
 /**
  * @brief replaces every occurrence of a search string within a string (in place).
  *
  * The search continues behind each inserted replacement, so text introduced by
  * a replacement is never matched again.
  *
  * @param[in,out] base the string to work on.
  * @param ist the text to search for; must not be empty.
  * @param soll the replacement text.
  * @return @a true if at least one occurrence was replaced.
  * @throw std::runtime_error if @a ist is empty.
  */
 bool replace_all(string &base, const string &ist, const string &soll)
 {
    if (ist.empty() )
       throw std::runtime_error ("replace_all called with empty search string");

    bool replaced = false;
    for (std::string::size_type pos = base.find(ist);
         pos != std::string::npos;
         pos = base.find(ist, pos + soll.size() ) )
    {
       base.replace(pos, ist.size(), soll);
       replaced = true;
    }

    return replaced;
 }

 /** @brief same as above; search and replacement text are passed as pointers (must not be NULL). */
 bool replace_all(string &base, const string *ist, const string *soll)
 {
    const std::string &search = *ist;
    const std::string &replacement = *soll;
    return replace_all(base, search, replacement);
 }

 /** @brief same as above; both texts are passed as C strings. */
 bool replace_all(string &base, const char *ist, const char *soll)
 {
    return replace_all(base, std::string(ist), std::string(soll) );
 }

 /** @brief same as above; the replacement text is passed as C string. */
 bool replace_all(string &base, const string &ist, const char *soll)
 {
    return replace_all(base, ist, std::string(soll) );
 }

 /** @brief same as above; the search text is passed as C string, the replacement as pointer. */
 bool replace_all(string &base, const char *ist, const string *soll)
 {
    return replace_all(base, std::string(ist), *soll);
 }
 935
 /**
  * @brief replaces all characters that could be problematic or impose a security risk when being logged
  * @param str the original string
  * @param replace_with the character every unsafe char is replaced with
  * @return a copy of @a str that is safe to send to syslog or other logfiles
  *
  * Only the chars from 0x20 (space) up to and including 0x7E (~) are considered
  * safe for logging; everything else is replaced.
  * See e.g. RFC 5424, section 8.2 or the posix character class "printable".
  * This eliminates all possible problems with NUL, control characters, 8 bit chars, UTF8.
  *
  */
 std::string sanitize_for_logging(const std::string &str, const char replace_with)
 {
     std::string output(str);

     for (std::string::iterator it = output.begin(); it != output.end(); ++it)
     {
         const bool is_safe = (*it >= 0x20) && (*it <= 0x7E);
         if (!is_safe)
             *it = replace_with;
     }

     return output;
 }
 958
 959 #if 0
 960 string to_lower(const string &src)
 961 {
 962    string dst = src;
 963
 964    string::size_type pos, end = dst.size();
 965    for (pos = 0; pos < end; pos++)
 966       dst[pos] = tolower(dst[pos]);
 967
 968    return dst;
 969 }
 970
 971 string to_upper(const string &src)
 972 {
 973    string dst = src;
 974
 975    string::size_type pos, end = dst.size();
 976    for (pos = 0; pos < end; pos++)
 977       dst[pos] = toupper(dst[pos]);
 978
 979    return dst;
 980 }
 981 #endif
 982
 983 const int MAX_UNIT_FORMAT_SYMBOLS = 6;
 984
 985 const string shortUnitFormatSymbols[MAX_UNIT_FORMAT_SYMBOLS] = {
 986         " B",
 987         " KB",
 988         " MB",
 989         " GB",
 990         " TB",
 991         " PB"
 992 };
 993
 994 const string longUnitFormatSymbols[MAX_UNIT_FORMAT_SYMBOLS] = {
 995         i18n_noop(" Bytes"),
 996         i18n_noop(" KBytes"),
 997         i18n_noop(" MBytes"),
 998         i18n_noop(" GBytes"),
 999         i18n_noop(" TBytes"),
1000         i18n_noop(" PBytes")
1001 };
1002
1003
/**
 * @brief rounds a number "half up" to a fraction given by a multiplier.
 *
 * The number is scaled by @a rounding_multiplier, rounded to the nearest integer
 * (ties go up) and scaled back; a multiplier of 10 thus rounds to one decimal.
 *
 * @param number the number to round.
 * @param rounding_multiplier the scale factor; must not be 0.
 * @return the rounded number.
 */
long double rounding_upwards(
        const long double number,
        const int rounding_multiplier
)
{
    const long double scaled = number * rounding_multiplier;

    // floor(x + 0.5) rounds half up for positive and negative values alike.
    // A truncating integer cast would round negative values towards zero
    // (e.g. -5 would become -4.9) and overflow for very large values.
    const long double rounded = std::floor(scaled + 0.5L);

    return rounded / static_cast<long double>(rounding_multiplier);
}
1017
1018
1019 string nice_unit_format(
1020         const int64_t input,
1021         const UnitFormat format,
1022         const UnitBase base
1023 )
1024 {
1025    // select the system of units (decimal or binary)
1026    int multiple = 0;
1027    if (base == UnitBase1000)
1028    {
1029        multiple = 1000;
1030    }
1031    else
1032    {
1033        multiple = 1024;
1034    }
1035
1036    long double size = input;
1037
1038    // check the size of the input number to fit in the appropriate symbol
1039    int sizecount = 0;
1040    while (size > multiple)
1041    {
1042        size = size / multiple;
1043        sizecount++;
1044
1045        // rollback to the previous values and stop the loop when cannot
1046        // represent the number length.
1047        if (sizecount >= MAX_UNIT_FORMAT_SYMBOLS)
1048        {
1049            size = size * multiple;
1050            sizecount--;
1051            break;
1052        }
1053    }
1054
1055    // round the input number "half up" to multiples of 10
1056    const int rounding_multiplier = 10;
1057    size = rounding_upwards(size, rounding_multiplier);
1058
1059    // format the input number, placing the appropriate symbol
1060    ostringstream out;
1061    out.setf (ios::fixed);
1062    if (format == ShortUnitFormat)
1063    {
1064        out.precision(1);
1065        out << size << i18n( shortUnitFormatSymbols[sizecount].c_str() );
1066    }
1067    else
1068    {
1069        out.precision (2);
1070        out << size << i18n( longUnitFormatSymbols[sizecount].c_str() );
1071    }
1072
1073    return out.str();
1074 } // eo nice_unit_format(int input)
1075
1076
1077 string nice_unit_format(
1078         const double input,
1079         const UnitFormat format,
1080         const UnitBase base
1081 )
1082 {
1083     // round as double and cast to int64_t
1084     // cast raised overflow error near max val of int64_t (~9.2e18, see unittest)
1085     int64_t input_casted_and_rounded =
1086         boost::numeric_cast<int64_t>( round(input) );
1087
1088     // now call other
1089     return nice_unit_format( input_casted_and_rounded, format, base );
1090 } // eo nice_unit_format(double input)
1091
1092
/**
 * @brief converts a string into its quoted, escaped representation.
 *
 * Double quotes and backslashes get a backslash prepended, carriage return
 * and line feed are replaced by the two character sequences \\r and \\n,
 * and the result is enclosed in double quotes.
 *
 * @param s the string to escape.
 * @return the escaped string including the surrounding double quotes.
 */
string escape(const string &s)
{
   // build the result in a single pass by appending; inserting into the
   // middle of the string for every special character is quadratic.
   string out;
   out.reserve (s.size() + 2);

   out += '"';
   for (string::size_type pos = 0; pos < s.size(); ++pos)
   {
      const char c = s[pos];
      switch (c)
      {
         case '"':
         case '\\':
            out += '\\';
            out += c;
            break;
         case '\r':
            out += "\\r";
            break;
         case '\n':
            out += "\\n";
            break;
         default:
            out += c;
      }
   }
   out += '"';

   return out;
} // eo escape(const std::string&)
1123
1124
/**
 * @brief extracts and unescapes a double quoted string.
 *
 * Starting at the opening quote at @a startpos the closing quote is searched
 * for; a quote preceded by an odd number of backslashes counts as escaped
 * and is skipped. Inside the extracted text \\r and \\n are turned into
 * carriage return and line feed; for any other escaped character the
 * backslash is dropped and the character kept.
 *
 * @param s the string containing the quoted value.
 * @param startpos position of the opening double quote within @a s.
 * @param endpos set to the position of the closing quote within @a s.
 *        NOTE(review): if no closing quote exists the position arithmetic is
 *        done with npos and wraps around; do not rely on the value then.
 * @return the unescaped text between the quotes.
 * @throws std::out_of_range if the character at @a startpos is not a double
 *         quote; the at() calls also throw it for an invalid @a startpos or
 *         a backslash at the very end of the extracted text.
 */
string descape(const string &s, int startpos, int &endpos)
{
   if (s.at(startpos) != '"')
      throw out_of_range("value not type escaped string");

   // everything behind the opening quote
   string out = s.substr(startpos+1);

   // look for the first quote that is not escaped
   string::size_type p = 0;
   for (;;)
   {
      p = out.find('"', p);
      if (p == out.npos)
         break;

      // every backslash directly in front of the quote flips the state
      bool escaped = false;
      for (int e = p-1; e >= 0 && out.at (e) == '\\'; --e)
         escaped = !escaped;

      if (!escaped)
         break;

      // escaped quote: continue searching behind it
      ++p;
   }

   // cut off the closing quote and whatever follows
   out = out.substr(0,p);

   // report the end position to the caller
   endpos = startpos+p+1;

   // resolve the backslash sequences; the search continues one behind the
   // position of the handled backslash, which skips the character that the
   // sequence produced
   for (string::size_type bs = out.find('\\');
        bs != out.npos;
        bs = out.find('\\', bs+1))
   {
      const char code = out.at(bs+1);
      if (code == 'r')
         out.replace(bs,2,"\r");
      else if (code == 'n')
         out.replace(bs,2,"\n");
      else
         out.erase(bs,1);
   }

   return out;
} // eo descape(const std::string&,int,int&)
1184
1185
/**
 * @brief wraps a string in single quotes for use as a shell argument.
 *
 * Each single quote inside the input is written as '\'' (close the quoted
 * part, emit an escaped quote, open a new quoted part); every other
 * character is copied unchanged.
 *
 * @param input the raw argument.
 * @return the single quoted argument.
 */
string escape_shellarg(const string &input)
{
   string output(1, '\'');

   // copy the text between single quotes in chunks
   string::size_type chunk_start = 0;
   string::size_type quote_pos;
   while ( (quote_pos = input.find('\'', chunk_start)) != string::npos)
   {
      output.append(input, chunk_start, quote_pos - chunk_start);
      output += "'\\''";
      chunk_start = quote_pos + 1;
   }

   // remainder behind the last single quote (or the whole input)
   output.append(input, chunk_start, string::npos);
   output += '\'';

   return output;
}