developer.intra2net.com Git - libi2ncommon/blob - src/stringfunc.cpp

   1 /*
   2 The software in this package is distributed under the GNU General
   3 Public License version 2 (with a special exception described below).
   4
   5 A copy of GNU General Public License (GPL) is included in this distribution,
   6 in the file COPYING.GPL.
   7
   8 As a special exception, if other files instantiate templates or use macros
   9 or inline functions from this file, or you compile this file and link it
  10 with other works to produce a work based on this file, this file
  11 does not by itself cause the resulting work to be covered
  12 by the GNU General Public License.
  13
  14 However the source code for this file must still be made available
  15 in accordance with section (3) of the GNU General Public License.
  16
  17 This exception does not invalidate any other reasons why a work based
  18 on this file might be covered by the GNU General Public License.
  19 */
  20 /** @file
  21  *
  22  * (c) Copyright 2007-2008 by Intra2net AG
  23  */
  24
  25 #include <iostream>
  26 #include <string>
  27 #include <sstream>
  28 #include <stdexcept>
  29 #include <algorithm>
  30 #include <cmath>    // for round()
  31
  32 #include <wchar.h>
  33 #include <stdlib.h>
  34 #include <iconv.h>
  35 #include <i18n.h>
  36
  37 #include <boost/numeric/conversion/cast.hpp>
  38
  39 #include <stringfunc.hxx>
  40
  41 using namespace std;
  42
  43 namespace I2n
  44 {
  45
  46
  47 namespace
  48 {
  49
  50 const std::string hexDigitsLower("0123456789abcdef");
  51 const std::string hexDigitsUpper("0123456789ABCDEF");
  52
  53
  54 struct UpperFunc
  55 {
  56    char operator() (char c)
  57    {
  58       return std::toupper(c);
  59    }
  60 }; // eo struct UpperFunc
  61
  62
  63 struct LowerFunc
  64 {
  65    char operator() (char c)
  66    {
  67       return std::tolower(c);
  68    }
  69 }; // eo struct LowerFunc
  70
  71
  72 } // eo namespace <anonymous>
  73
  74
  75
  76 /**
  77  * default list of Whitespaces (" \t\r\n");
  78  */
  79 const std::string Whitespaces = " \t\r\n";
  80
  81 /**
  82  * default list of lineendings ("\r\n");
  83  */
  84 const std::string LineEndings= "\r\n";
  85
  86
  87
  88 /**
  89  * @brief checks if a string begins with a given prefix.
  90  * @param[in,out] str the string which is tested
  91  * @param prefix the prefix which should be tested for.
  92  * @return @a true iff the prefix is not empty and the string begins with that prefix.
  93  */
  94 bool has_prefix(const std::string& str, const std::string& prefix)
  95 {
  96    if (prefix.empty() || str.empty() || str.size() < prefix.size() )
  97    {
  98       return false;
  99    }
 100    return str.compare(0, prefix.size(), prefix) == 0;
 101 } // eo has_prefix(const std::string&,const std::string&)
 102
 103
 104 /**
 105  * @brief checks if a string ends with a given suffix.
 106  * @param[in,out] str the string which is tested
 107  * @param suffix the suffix which should be tested for.
 108  * @return @a true iff the suffix is not empty and the string ends with that suffix.
 109  */
 110 bool has_suffix(const std::string& str, const std::string& suffix)
 111 {
 112    if (suffix.empty() || str.empty() || str.size() < suffix.size() )
 113    {
 114       return false;
 115    }
 116    return str.compare(str.size() - suffix.size(), suffix.size(), suffix) == 0;
 117 } // eo has_suffix(const std::string&,const std::string&)
 118
 119
 120 /**
 121  * cut off characters from a given list from front and end of a string.
 122  * @param[in,out] str the string which should be trimmed.
 123  * @param charlist the list of characters to remove from beginning and end of string
 124  * @return the result string.
 125  */
 126 std::string trim_mod(std::string& str, const std::string& charlist)
 127 {
 128    // first: trim the beginning:
 129    std::string::size_type pos= str.find_first_not_of (charlist);
 130    if (pos == std::string::npos)
 131    {
 132       // whole string consists of charlist (or is already empty)
 133       str.clear();
 134       return str;
 135    }
 136    else if (pos>0)
 137    {
 138       // str starts with charlist
 139       str.erase(0,pos);
 140    }
 141    // now let's look at the tail:
 142    pos= str.find_last_not_of(charlist) +1;  // note: we already know there is at least one other char!
 143    if ( pos < str.size() )
 144    {
 145       str.erase(pos, str.size()-pos);
 146    }
 147    return str;
 148 } // eo trim_mod(std::string&,const std::string&)
 149
 150
 151
 152 /**
 153  * removes last character from a string when it is in a list of chars to be removed.
 154  * @param[in,out] str the string.
 155  * @param what the list of chars which will be tested for.
 156  * @return the resulting string with last char removed (if applicable)
 157  */
 158 std::string chomp_mod(std::string& str, const std::string& what)
 159 {
 160    if (str.empty() || what.empty() )
 161    {
 162       return str;
 163    }
 164    if (what.find(str.at (str.size()-1) ) != std::string::npos)
 165    {
 166       str.erase(str.size() - 1);
 167    }
 168    return str;
 169 } // eo chomp_mod(std::string&,const std::string&)
 170
 171
 172 /**
 173  * @brief converts a string to lower case.
 174  * @param[in,out] str the string to modify.
 175  * @return the string
 176  */
 177 std::string to_lower_mod(std::string& str)
 178 {
 179    std::transform(str.begin(), str.end(), str.begin(), LowerFunc() );
 180    return str;
 181 } // eo to_lower_mod(std::string&)
 182
 183
 184 /**
 185  * @brief converts a string to upper case.
 186  * @param[in,out] str the string to modify.
 187  * @return the string
 188  */
 189 std::string to_upper_mod(std::string& str)
 190 {
 191    std::transform( str.begin(), str.end(), str.begin(), UpperFunc() );
 192    return str;
 193 } // eo to_upper_mod(std::string&)
 194
 195
 196
 197 /**
 198  * cut off characters from a given list from front and end of a string.
 199  * @param str the string which should be trimmed.
 200  * @param charlist the list of characters to remove from beginning and end of string
 201  * @return the result string.
 202  */
 203 std::string trim (const std::string& str, const std::string& charlist)
 204 {
 205    // first: trim the beginning:
 206    std::string::size_type pos0= str.find_first_not_of(charlist);
 207    if (pos0 == std::string::npos)
 208    {
 209       // whole string consists of charlist (or is already empty)
 210       return std::string();
 211    }
 212    // now let's look at the end:
 213    std::string::size_type pos1= str.find_last_not_of(charlist);
 214    return str.substr(pos0, pos1 - pos0 + 1);
 215 } // eo trim(const std:.string&,const std::string&)
 216
 217
 218 /**
 219  * removes last character from a string when it is in a list of chars to be removed.
 220  * @param str the string.
 221  * @param what the list of chars which will be tested for.
 222  * @return the resulting string with last char removed (if applicable)
 223  */
 224 std::string chomp (const std::string& str, const std::string& what)
 225 {
 226    if (str.empty() || what.empty() )
 227    {
 228       return str;
 229    }
 230    if (what.find(str.at (str.size()-1) ) != std::string::npos)
 231    {
 232       return str.substr(0, str.size()-1);
 233    }
 234    return str;
 235 } // eo chomp(const std:.string&,const std::string&)
 236
 237
 238 /**
 239  * @brief returns a lower case version of a given string.
 240  * @param str the string
 241  * @return the lower case version of the string
 242  */
 243 std::string to_lower (const std::string& str)
 244 {
 245    std::string result(str);
 246    return to_lower_mod(result);
 247 } // eo to_lower(const std::string&)
 248
 249
 250 /**
 251  * @brief returns a upper case version of a given string.
 252  * @param str the string
 253  * @return the upper case version of the string
 254  */
 255 std::string to_upper(const std::string& str)
 256 {
 257    std::string result(str);
 258    return to_upper_mod(result);
 259 } // eo to_upper(const std::string&)
 260
 261
 262
 263 /**
 264  * @brief removes a given suffix from a string.
 265  * @param str the string.
 266  * @param suffix the suffix which should be removed if the string ends with it.
 267  * @return the string without the suffix.
 268  *
 269  * If the string ends with the suffix, it is removed. If the the string doesn't end
 270  * with the suffix the original string is returned.
 271  */
 272 std::string remove_suffix(const std::string& str, const std::string& suffix)
 273 {
 274    if (has_suffix(str,suffix) )
 275    {
 276       return str.substr(0, str.size()-suffix.size() );
 277    }
 278    return str;
 279 } // eo remove_suffix(const std::string&,const std::string&)
 280
 281
 282
 283 /**
 284  * @brief removes a given prefix from a string.
 285  * @param str the string.
 286  * @param prefix the prefix which should be removed if the string begins with it.
 287  * @return the string without the prefix.
 288  *
 289  * If the string begins with the prefix, it is removed. If the the string doesn't begin
 290  * with the prefix the original string is returned.
 291  */
 292 std::string remove_prefix(const std::string& str, const std::string& prefix)
 293 {
 294    if (has_prefix(str,prefix) )
 295    {
 296       return str.substr( prefix.size() );
 297    }
 298    return str;
 299 } // eo remove_prefix(const std::string&,const std::string&)
 300
 301
 302 /**
 303  * split a string to key and value delimited by a given delimiter.
 304  * The resulting key and value strings are trimmed (Whitespaces removed at beginning and end).
 305  * @param str the string which should be splitted.
 306  * @param[out] key the resulting key
 307  * @param[out] value the resulting value
 308  * @param delimiter the delimiter between key and value; default is '='.
 309  * @return @a true if the split was successful.
 310  */
 311 bool pair_split(
 312    const std::string& str,
 313    std::string& key,
 314    std::string& value,
 315    char delimiter)
 316 {
 317    std::string::size_type pos = str.find (delimiter);
 318    if (pos == std::string::npos) return false;
 319    key= str.substr(0,pos);
 320    value= str.substr(pos+1);
 321    trim_mod(key);
 322    trim_mod(value);
 323    return true;
 324 } // eo pair_split(const std::string&,std::string&,std::string&,char)
 325
 326
 327 /**
 328  * splits a string by given delimiter
 329  *
 330  * @param[in] str the string which should be splitted.
 331  * @param[out] result the list resulting from splitting  @a str.
 332  * @param[in] delimiter the delimiter (word/phrase) at which @a str should be splitted.
 333  * @param[in] omit_empty should empty parts not be stored?
 334  * @param[in] trim_list list of characters the parts should be trimmed by.
 335  *  (empty string results in no trim)
 336  */
 337 void split_string(
 338    const std::string& str,
 339    std::list<std::string>& result,
 340    const std::string& delimiter,
 341    bool omit_empty,
 342    const std::string& trim_list
 343 )
 344 {
 345    std::string::size_type pos, last_pos=0;
 346    bool delimiter_found= false;
 347    while ( last_pos < str.size()  && last_pos != std::string::npos)
 348    {
 349       pos= str.find(delimiter, last_pos);
 350       std::string part;
 351       if (pos == std::string::npos)
 352       {
 353          part= str.substr(last_pos);
 354          delimiter_found= false;
 355       }
 356       else
 357       {
 358          part= str.substr(last_pos, pos-last_pos);
 359          delimiter_found=true;
 360       }
 361       if (pos != std::string::npos)
 362       {
 363          last_pos= pos+ delimiter.size();
 364       }
 365       else
 366       {
 367          last_pos= std::string::npos;
 368       }
 369       if (!trim_list.empty() ) trim_mod (part, trim_list);
 370       if (omit_empty && part.empty() ) continue;
 371       result.push_back( part );
 372    }
 373    // if the string ends with a delimiter we need to append an empty string if no omit_empty
 374    // was given.
 375    // (this way we keep the split result consistent to a join operation)
 376    if (delimiter_found && !omit_empty)
 377    {
 378       result.push_back("");
 379    }
 380 } // eo split_string(const std::string&,std::list< std::string >&,const std::string&,bool,const std::string&)
 381
 382
 383 /**
 384  * splits a string by a given delimiter
 385  * @param str the string which should be splitted.
 386  * @param delimiter delimiter the delimiter (word/phrase) at which @a str should be splitted.
 387  * @param[in] omit_empty should empty parts not be stored?
 388  * @param[in] trim_list list of characters the parts should be trimmed by.
 389  *  (empty string results in no trim)
 390  * @return the list resulting from splitting @a str.
 391  */
 392 std::list<std::string> split_string(
 393    const std::string& str,
 394    const std::string& delimiter,
 395    bool omit_empty,
 396    const std::string& trim_list
 397 )
 398 {
 399    std::list<std::string> result;
 400    split_string(str, result, delimiter, omit_empty, trim_list);
 401    return result;
 402 } // eo split_string(const std::string&,const std::string&,bool,const std::string&)
 403
 404
 405 /**
 406  * @brief joins a list of strings into a single string.
 407  *
 408  * This funtion is (basically) the reverse operation of @a split_string.
 409  *
 410  * @param parts the list of strings.
 411  * @param delimiter the delimiter which is inserted between the strings.
 412  * @return the joined string.
 413  */
 414 std::string join_string(
 415    const std::list< std::string >& parts,
 416    const std::string& delimiter
 417 )
 418 {
 419    std::string result;
 420    if (! parts.empty() )
 421    {
 422       std::list< std::string >::const_iterator it= parts.begin();
 423       result = *it;
 424       while ( ++it != parts.end() )
 425       {
 426          result+= delimiter;
 427          result+= *it;
 428       }
 429    }
 430    return result;
 431 } // eo join_string(const std::list< std::string >&,const std::string&)
 432
 433
 434
 435 /*
 436 ** conversions
 437 */
 438
 439
 440 /**
 441  * @brief returns a hex string from a binary string.
 442  * @param str the (binary) string
 443  * @param upper_case_digits determine whether to use upper case characters for digits A-F.
 444  * @return the string in hex notation.
 445  */
 446 std::string convert_binary_to_hex(
 447    const std::string& str,
 448    bool upper_case_digits
 449 )
 450 {
 451    std::string result;
 452    std::string hexDigits(upper_case_digits ? hexDigitsUpper : hexDigitsLower);
 453    for ( std::string::const_iterator it= str.begin();
 454          it != str.end();
 455          ++it)
 456    {
 457       result.push_back( hexDigits[ ( (*it) >> 4) & 0x0f ] );
 458       result.push_back( hexDigits[ (*it) & 0x0f ] );
 459    }
 460    return result;
 461 } // eo convert_binary_to_hex(const std::string&,bool)
 462
 463
 464 /**
 465  * @brief converts a hex digit string to binary string.
 466  * @param str hex digit string
 467  * @return the binary string.
 468  *
 469  * The hex digit string may contains white spaces or colons which are treated
 470  * as delimiters between hex digit groups.
 471  *
 472  * @todo rework the handling of half nibbles (consistency)!
 473  */
 474 std::string convert_hex_to_binary(
 475    const std::string& str
 476 )
 477 throw (std::runtime_error)
 478 {
 479    std::string result;
 480    char c= 0;
 481    bool hasNibble= false;
 482    bool lastWasWS= true;
 483    for ( std::string::const_iterator it= str.begin();
 484          it != str.end();
 485          ++it)
 486    {
 487       std::string::size_type p = hexDigitsLower.find( *it );
 488       if (p== std::string::npos)
 489       {
 490          p= hexDigitsUpper.find( *it );
 491       }
 492       if (p == std::string::npos)
 493       {
 494          if (   ( Whitespaces.find( *it ) != std::string::npos) // is it a whitespace?
 495                 or ( *it == ':') // or a colon?
 496             )
 497          {
 498             // we treat that as a valid delimiter:
 499             if (hasNibble)
 500             {
 501                // 1 nibble before WS is treate as lower part:
 502                result.push_back(c);
 503                // reset state:
 504                hasNibble= false;
 505             }
 506             lastWasWS= true;
 507             continue;
 508          }
 509       }
 510       if (p == std::string::npos )
 511       {
 512          throw runtime_error("illegal character in hex digit string: " + str);
 513       }
 514       lastWasWS= false;
 515       if (hasNibble)
 516       {
 517          c<<=4;
 518       }
 519       else
 520       {
 521          c=0;
 522       }
 523       c+= (p & 0x0f);
 524       if (hasNibble)
 525       {
 526          //we already had a nibble, so a char is complete now:
 527          result.push_back( c );
 528          hasNibble=false;
 529       }
 530       else
 531       {
 532          // this is the first nibble of a new char:
 533          hasNibble=true;
 534       }
 535    }
 536    if (hasNibble)
 537    {
 538       //well, there is one nibble left
 539       // let's do some heuristics:
 540       if (lastWasWS)
 541       {
 542          // if the preceeding character was a white space (or a colon)
 543          // we treat the nibble as lower part:
 544          //( this is consistent with shortened hex notations where leading zeros are not noted)
 545          result.push_back( c );
 546       }
 547       else
 548       {
 549          // if it was part of a hex digit chain, we treat it as UPPER part (!!)
 550          result.push_back( c << 4 );
 551       }
 552    }
 553    return result;
 554 } // eo convert_hex_to_binary(const std::string&)
 555
 556
 557 } // eo namespace I2n
 558
 559
 560
 561
 562 std::string iso_to_utf8(const std::string& isostring)
 563 {
 564    string result;
 565
 566    iconv_t i2utf8 = iconv_open("UTF-8", "ISO-8859-1");
 567
 568    if (iso_to_utf8 == (iconv_t)-1)
 569       throw runtime_error("iconv can't convert from ISO-8859-1 to UTF-8");
 570
 571    size_t in_size=isostring.size();
 572    size_t out_size=in_size*4;
 573
 574    char *buf = (char *)malloc(out_size+1);
 575    if (buf == NULL)
 576       throw runtime_error("out of memory for iconv buffer");
 577
 578    char *in = (char *)isostring.c_str();
 579    char *out = buf;
 580    iconv(i2utf8, &in, &in_size, &out, &out_size);
 581
 582    buf[isostring.size()*4-out_size]=0;
 583
 584    result=buf;
 585
 586    free(buf);
 587    iconv_close(i2utf8);
 588
 589    return result;
 590 }
 591
 592 std::string utf8_to_iso(const std::string& utf8string)
 593 {
 594    string result;
 595
 596    iconv_t utf82iso = iconv_open("ISO-8859-1","UTF-8");
 597
 598    if (utf82iso == (iconv_t)-1)
 599       throw runtime_error("iconv can't convert from UTF-8 to ISO-8859-1");
 600
 601    size_t in_size=utf8string.size();
 602    size_t out_size=in_size;
 603
 604    char *buf = (char *)malloc(out_size+1);
 605    if (buf == NULL)
 606       throw runtime_error("out of memory for iconv buffer");
 607
 608    char *in = (char *)utf8string.c_str();
 609    char *out = buf;
 610    iconv(utf82iso, &in, &in_size, &out, &out_size);
 611
 612    buf[utf8string.size()-out_size]=0;
 613
 614    result=buf;
 615
 616    free(buf);
 617    iconv_close(utf82iso);
 618
 619    return result;
 620 }
 621
 622 wchar_t* utf8_to_wbuf(const std::string& utf8string)
 623 {
 624    iconv_t utf82wstr = iconv_open("UCS-4LE","UTF-8");
 625
 626    if (utf82wstr == (iconv_t)-1)
 627       throw runtime_error("iconv can't convert from UTF-8 to UCS-4");
 628
 629    size_t in_size=utf8string.size();
 630    size_t out_size= (in_size+1)*sizeof(wchar_t);
 631
 632    wchar_t *buf = (wchar_t *)malloc(out_size);
 633    if (buf == NULL)
 634       throw runtime_error("out of memory for iconv buffer");
 635
 636    char *in = (char *)utf8string.c_str();
 637    char *out = (char*) buf;
 638    if (iconv(utf82wstr, &in, &in_size, &out, &out_size) == (size_t)-1)
 639       throw runtime_error("error converting char encodings");
 640
 641    buf[ ( (utf8string.size()+1)*sizeof(wchar_t)-out_size) /sizeof(wchar_t) ]=0;
 642
 643    iconv_close(utf82wstr);
 644
 645    return buf;
 646 }
 647
 648 std::string utf7imap_to_utf8(const std::string& utf7imapstring)
 649 {
 650    string result;
 651
 652    iconv_t utf7imap2utf8 = iconv_open("UTF-8","UTF-7-IMAP");
 653
 654    if (utf7imap2utf8 == (iconv_t)-1)
 655       throw runtime_error("iconv can't convert from UTF-7-IMAP to UTF-8");
 656
 657    size_t in_size=utf7imapstring.size();
 658    size_t out_size=in_size*4;
 659
 660    char *buf = (char *)malloc(out_size+1);
 661    if (buf == NULL)
 662       throw runtime_error("out of memory for iconv buffer");
 663
 664    char *in = (char *)utf7imapstring.c_str();
 665    char *out = buf;
 666    iconv(utf7imap2utf8, &in, &in_size, &out, &out_size);
 667
 668    buf[utf7imapstring.size()*4-out_size]=0;
 669
 670    result=buf;
 671
 672    free(buf);
 673    iconv_close(utf7imap2utf8);
 674
 675    return result;
 676 }
 677
 678 std::string utf8_to_utf7imap(const std::string& utf8string)
 679 {
 680    string result;
 681
 682    iconv_t utf82utf7imap = iconv_open("UTF-7-IMAP", "UTF-8");
 683
 684    if (utf82utf7imap == (iconv_t)-1)
 685       throw runtime_error("iconv can't convert from UTF-7-IMAP to UTF-8");
 686
 687    // UTF-7 is base64 encoded, a buffer 10x as large
 688    // as the utf-8 buffer should be enough. If not the string will be truncated.
 689    size_t in_size=utf8string.size();
 690    size_t out_size=in_size*10;
 691
 692    char *buf = (char *)malloc(out_size+1);
 693    if (buf == NULL)
 694       throw runtime_error("out of memory for iconv buffer");
 695
 696    char *in = (char *)utf8string.c_str();
 697    char *out = buf;
 698    iconv(utf82utf7imap, &in, &in_size, &out, &out_size);
 699
 700    buf[utf8string.size()*10-out_size]= 0;
 701
 702    result=buf;
 703
 704    free(buf);
 705    iconv_close(utf82utf7imap);
 706
 707    return result;
 708 }
 709
 710 // Tokenize string by (html) tags
 711 void tokenize_by_tag(vector<pair<string,bool> > &tokenized, const std::string &input)
 712 {
 713    string::size_type pos, len = input.size();
 714    bool inside_tag = false;
 715    string current;
 716
 717    for (pos = 0; pos < len; pos++)
 718    {
 719       if (input[pos] == '<')
 720       {
 721          inside_tag = true;
 722
 723          if (!current.empty() )
 724          {
 725             tokenized.push_back( make_pair(current, false) );
 726             current = "";
 727          }
 728
 729          current += input[pos];
 730       }
 731       else if (input[pos] == '>' && inside_tag)
 732       {
 733          current += input[pos];
 734          inside_tag = false;
 735          if (!current.empty() )
 736          {
 737             tokenized.push_back( make_pair(current, true) );
 738             current = "";
 739          }
 740       }
 741       else
 742          current += input[pos];
 743    }
 744
 745    // String left over in buffer?
 746    if (!current.empty() )
 747       tokenized.push_back( make_pair(current, false) );
 748 } // eo tokenize_by_tag
 749
 750
 751 std::string strip_html_tags(const std::string &input)
 752 {
 753    // Pair first: string, second: isTag
 754    vector<pair<string,bool> > tokenized;
 755    tokenize_by_tag (tokenized, input);
 756
 757    string output;
 758    vector<pair<string,bool> >::const_iterator token, tokens_end = tokenized.end();
 759    for (token = tokenized.begin(); token != tokens_end; ++token)
 760       if (!token->second)
 761          output += token->first;
 762
 763    return output;
 764 } // eo strip_html_tags
 765
 766
 767 // Smart-encode HTML en
 768 string smart_html_entities(const std::string &input)
 769 {
 770    // Pair first: string, second: isTag
 771    vector<pair<string,bool> > tokenized;
 772    tokenize_by_tag (tokenized, input);
 773
 774    string output;
 775    vector<pair<string,bool> >::const_iterator token, tokens_end = tokenized.end();
 776    for (token = tokenized.begin(); token != tokens_end; ++token)
 777    {
 778       // keep HTML tags as they are
 779       if (token->second)
 780          output += token->first;
 781       else
 782          output += html_entities(token->first);
 783    }
 784
 785    return output;
 786 }
 787
 788
 789 string::size_type find_8bit(const std::string &str)
 790 {
 791    string::size_type l=str.size();
 792    for (string::size_type p=0; p < l; p++)
 793       if (static_cast<unsigned char>(str[p]) > 127)
 794          return p;
 795
 796    return string::npos;
 797 }
 798
 799 // encoded UTF-8 chars into HTML entities
 800 string html_entities(std::string str)
 801 {
 802    // Normal chars
 803    replace_all (str, "&", "&amp;");
 804    replace_all (str, "<", "&lt;");
 805    replace_all (str, ">", "&gt;");
 806    replace_all (str, "\"", "&quot;");
 807    replace_all (str, "'", "&#x27;");
 808    replace_all (str, "/", "&#x2F;");
 809
 810    // Umlauts
 811    replace_all (str, "\xC3\xA4", "&auml;");
 812    replace_all (str, "\xC3\xB6", "&ouml;");
 813    replace_all (str, "\xC3\xBC", "&uuml;");
 814    replace_all (str, "\xC3\x84", "&Auml;");
 815    replace_all (str, "\xC3\x96", "&Ouml;");
 816    replace_all (str, "\xC3\x9C", "&Uuml;");
 817
 818    // Misc
 819    replace_all (str, "\xC3\x9F", "&szlig;");
 820
 821    // conversion of remaining non-ASCII chars needed?
 822    // just do if needed because of performance
 823    if (find_8bit(str) != string::npos)
 824    {
 825       // convert to fixed-size encoding UTF-32
 826       wchar_t* wbuf=utf8_to_wbuf(str);
 827       ostringstream target;
 828
 829       // replace all non-ASCII chars with HTML representation
 830       for (int p=0; wbuf[p] != 0; p++)
 831       {
 832          unsigned int c=wbuf[p];
 833
 834          if (c <= 127)
 835             target << static_cast<unsigned char>(c);
 836          else
 837             target << "&#" << c << ';';
 838       }
 839
 840       free(wbuf);
 841
 842       str=target.str();
 843    }
 844
 845    return str;
 846 } // eo html_entities(std::string)
 847
 848 // convert HTML entities to something that can be viewed on a basic text console (restricted to ASCII-7)
 849 string html_entities_to_console(std::string str)
 850 {
 851    // Normal chars
 852    replace_all (str, "&amp;", "&");
 853    replace_all (str, "&lt;", "<");
 854    replace_all (str, "&gt;", ">");
 855    replace_all (str, "&quot;", "\"");
 856    replace_all (str, "&#x27;", "'");
 857    replace_all (str, "&#x2F;", "/");
 858
 859    // Umlauts
 860    replace_all (str, "&auml;", "ae");
 861    replace_all (str, "&ouml;", "oe");
 862    replace_all (str, "&uuml;", "ue");
 863    replace_all (str, "&Auml;", "Ae");
 864    replace_all (str, "&Ouml;", "Oe");
 865    replace_all (str, "&Uuml;", "Ue");
 866
 867    // Misc
 868    replace_all (str, "&szlig;", "ss");
 869
 870    return str;
 871 }
 872
 873 bool replace_all(string &base, const char *ist, const char *soll)
 874 {
 875    string i=ist;
 876    string s=soll;
 877    return replace_all(base,&i,&s);
 878 }
 879
 880 bool replace_all(string &base, const string &ist, const char *soll)
 881 {
 882    string s=soll;
 883    return replace_all(base,&ist,&s);
 884 }
 885
 886 bool replace_all(string &base, const string *ist, const string *soll)
 887 {
 888    return replace_all(base,*ist,*soll);
 889 }
 890
 891 bool replace_all(string &base, const char *ist, const string *soll)
 892 {
 893    string i=ist;
 894    return replace_all(base,&i,soll);
 895 }
 896
 897 bool replace_all(string &base, const string &ist, const string &soll)
 898 {
 899    bool found_ist = false;
 900    string::size_type a=0;
 901
 902    if (ist.empty() )
 903       throw runtime_error ("replace_all called with empty search string");
 904
 905    while ( (a=base.find(ist,a) ) != string::npos)
 906    {
 907       base.replace(a,ist.size(),soll);
 908       a=a+soll.size();
 909       found_ist = true;
 910    }
 911
 912    return found_ist;
 913 }
 914
 915 /**
 916  * @brief replaces all characters that could be problematic or impose a security risk when being logged
 917  * @param str the original string
 918  * @param replace_with the character to replace the unsafe chars with
 919  * @return a string that is safe to send to syslog or other logfiles
 920  *
 921  * All chars between 0x20 (space) and 0x7E (~) (including) are considered safe for logging.
 922  * See e.g. RFC 5424, section 8.2 or the posix character class "printable".
 923  * This eliminates all possible problems with NUL, control characters, 8 bit chars, UTF8.
 924  *
 925  */
 926 std::string sanitize_for_logging(const std::string &str, const char replace_with)
 927 {
 928     std::string output=str;
 929
 930     const string::size_type len = output.size();
 931     for (std::string::size_type p=0; p < len; p++)
 932         if (output[p] < 0x20 || output[p] > 0x7E)
 933             output[p]=replace_with;
 934
 935     return output;
 936 }
 937
 938 #if 0
 939 string to_lower(const string &src)
 940 {
 941    string dst = src;
 942
 943    string::size_type pos, end = dst.size();
 944    for (pos = 0; pos < end; pos++)
 945       dst[pos] = tolower(dst[pos]);
 946
 947    return dst;
 948 }
 949
 950 string to_upper(const string &src)
 951 {
 952    string dst = src;
 953
 954    string::size_type pos, end = dst.size();
 955    for (pos = 0; pos < end; pos++)
 956       dst[pos] = toupper(dst[pos]);
 957
 958    return dst;
 959 }
 960 #endif
 961
 962 const int MAX_UNIT_FORMAT_SYMBOLS = 6;
 963
 964 const string shortUnitFormatSymbols[MAX_UNIT_FORMAT_SYMBOLS] = {
 965         " B",
 966         " KB",
 967         " MB",
 968         " GB",
 969         " TB",
 970         " PB"
 971 };
 972
 973 const string longUnitFormatSymbols[MAX_UNIT_FORMAT_SYMBOLS] = {
 974         i18n_noop(" Bytes"),
 975         i18n_noop(" KBytes"),
 976         i18n_noop(" MBytes"),
 977         i18n_noop(" GBytes"),
 978         i18n_noop(" TBytes"),
 979         i18n_noop(" PBytes")
 980 };
 981
 982
 983 long double rounding_upwards(
 984         const long double number,
 985         const int rounding_multiplier
 986 )
 987 {
 988     long double rounded_number;
 989     rounded_number = number * rounding_multiplier;
 990     rounded_number += 0.5;
 991     rounded_number = (int64_t) (rounded_number);
 992     rounded_number = (long double) (rounded_number) / (long double) (rounding_multiplier);
 993
 994     return rounded_number;
 995 }
 996
 997
 998 string nice_unit_format(
 999         const int64_t input,
1000         const UnitFormat format,
1001         const UnitBase base
1002 )
1003 {
1004    // select the system of units (decimal or binary)
1005    int multiple = 0;
1006    if (base == UnitBase1000)
1007    {
1008        multiple = 1000;
1009    }
1010    else
1011    {
1012        multiple = 1024;
1013    }
1014
1015    long double size = input;
1016
1017    // check the size of the input number to fit in the appropriate symbol
1018    int sizecount = 0;
1019    while (size > multiple)
1020    {
1021        size = size / multiple;
1022        sizecount++;
1023
1024        // rollback to the previous values and stop the loop when cannot
1025        // represent the number length.
1026        if (sizecount >= MAX_UNIT_FORMAT_SYMBOLS)
1027        {
1028            size = size * multiple;
1029            sizecount--;
1030            break;
1031        }
1032    }
1033
1034    // round the input number "half up" to multiples of 10
1035    const int rounding_multiplier = 10;
1036    size = rounding_upwards(size, rounding_multiplier);
1037
1038    // format the input number, placing the appropriate symbol
1039    ostringstream out;
1040    out.setf (ios::fixed);
1041    if (format == ShortUnitFormat)
1042    {
1043        out.precision(1);
1044        out << size << i18n( shortUnitFormatSymbols[sizecount].c_str() );
1045    }
1046    else
1047    {
1048        out.precision (2);
1049        out << size << i18n( longUnitFormatSymbols[sizecount].c_str() );
1050    }
1051
1052    return out.str();
1053 } // eo nice_unit_format(int input)
1054
1055
1056 string nice_unit_format(
1057         const double input,
1058         const UnitFormat format,
1059         const UnitBase base
1060 )
1061 {
1062     // round as double and cast to int64_t
1063     // cast raised overflow error near max val of int64_t (~9.2e18, see unittest)
1064     int64_t input_casted_and_rounded =
1065         boost::numeric_cast<int64_t>( round(input) );
1066
1067     // now call other
1068     return nice_unit_format( input_casted_and_rounded, format, base );
1069 } // eo nice_unit_format(double input)
1070
1071
/**
 * @brief converts a string into its quoted, escaped representation.
 *
 * Double quotes and backslashes get a backslash in front of them, carriage
 * return and line feed are written as \r and \n. The result is enclosed in
 * double quotes.
 *
 * @param s the string to escape.
 * @return the escaped string including the enclosing double quotes.
 */
std::string escape(const std::string &s)
{
   std::string quoted(1, '"');

   for (std::string::const_iterator it = s.begin(); it != s.end(); ++it)
   {
      switch (*it)
      {
         case '"':
         case '\\':
            quoted += '\\';
            quoted += *it;
            break;
         case '\r':
            quoted += "\\r";
            break;
         case '\n':
            quoted += "\\n";
            break;
         default:
            quoted += *it;
      }
   }

   quoted += '"';

   return quoted;
} // eo escape(const std::string&)
1102
1103
/**
 * @brief extracts and decodes a quoted, escaped string.
 *
 * The escaped string has to start with a double quote at @a startpos and ends
 * at the next double quote which is not escaped by a backslash. Within the
 * string \r and \n are decoded to carriage return and line feed; any other
 * character following a backslash is taken literally.
 *
 * @param s the string containing the escaped string.
 * @param startpos the position of the opening double quote within @a s.
 * @param[out] endpos is set to startpos + (offset of the closing quote within
 *  the text behind the opening quote) + 1, i.e. the position of the closing
 *  double quote within @a s when one is found.
 * @return the decoded string.
 * @throw std::out_of_range if there is no double quote at @a startpos or if
 *  the extracted text ends with a single backslash.
 */
std::string descape(const std::string &s, int startpos, int &endpos)
{
   if (s.at(startpos) != '"')
      throw std::out_of_range("value not type escaped string");

   std::string out = s.substr(startpos+1);

   // search for the end of the string: the first quote which is preceded by an
   // even number of backslashes (an odd number means the quote itself is escaped)
   std::string::size_type quote_pos = out.find('"');
   while (quote_pos != std::string::npos)
   {
      std::string::size_type backslashes = 0;
      while (backslashes < quote_pos && out[quote_pos - 1 - backslashes] == '\\')
         ++backslashes;

      if (backslashes % 2 == 0)
         break;

      quote_pos = out.find('"', quote_pos + 1);
   }

   // we now have the end of the string
   out = out.substr(0, quote_pos);

   // tell calling prog about the endposition
   endpos = startpos+quote_pos+1;

   // descape all \ stuff inside the string now
   std::string plain;
   for (std::string::size_type idx = 0; idx < out.size(); ++idx)
   {
      if (out[idx] != '\\')
      {
         plain += out[idx];
         continue;
      }

      // at() throws if the backslash is the last character
      const char escaped_char = out.at(idx + 1);
      ++idx;
      switch (escaped_char)
      {
         case 'r':
            plain += '\r';
            break;
         case 'n':
            plain += '\n';
            break;
         default:
            plain += escaped_char;
      }
   }

   return plain;
} // eo descape(const std::string&,int,int&)
1163
1164
/**
 * @brief encloses a string in single quotes for the use as shell argument.
 *
 * Every single quote inside the string is written as '\'' (close the quoted
 * section, an escaped single quote, reopen the quoted section).
 *
 * @param input the string to quote.
 * @return the quoted string.
 */
std::string escape_shellarg(const std::string &input)
{
   std::string quoted(1, '\'');

   for (std::string::size_type idx = 0; idx < input.size(); ++idx)
   {
      const char ch = input[idx];
      if (ch == '\'')
         quoted.append("'\\''");
      else
         quoted.push_back(ch);
   }

   quoted.push_back('\'');
   return quoted;
}