developer.intra2net.com Git - libi2ncommon/blob - src/stringfunc.cpp

   1 /*
   2 The software in this package is distributed under the GNU General
   3 Public License version 2 (with a special exception described below).
   4
   5 A copy of GNU General Public License (GPL) is included in this distribution,
   6 in the file COPYING.GPL.
   7
   8 As a special exception, if other files instantiate templates or use macros
   9 or inline functions from this file, or you compile this file and link it
  10 with other works to produce a work based on this file, this file
  11 does not by itself cause the resulting work to be covered
  12 by the GNU General Public License.
  13
  14 However the source code for this file must still be made available
  15 in accordance with section (3) of the GNU General Public License.
  16
  17 This exception does not invalidate any other reasons why a work based
  18 on this file might be covered by the GNU General Public License.
  19 */
  20 /** @file
  21  *
  22  * (c) Copyright 2007-2008 by Intra2net AG
  23  */
  24
  25 #include <iostream>
  26 #include <string>
  27 #include <sstream>
  28 #include <stdexcept>
  29 #include <algorithm>
  30 #include <cmath>    // for round()
  31
  32 #include <wchar.h>
  33 #include <stdlib.h>
  34 #include <iconv.h>
  35 #include <i18n.h>
  36
  37 #include <boost/numeric/conversion/cast.hpp>
  38 #include <boost/foreach.hpp>
  39
  40 #include <stringfunc.hxx>
  41
  42 using namespace std;
  43
  44 namespace I2n
  45 {
  46
  47
  48 namespace
  49 {
  50
  51 const std::string hexDigitsLower("0123456789abcdef");
  52 const std::string hexDigitsUpper("0123456789ABCDEF");
  53
  54
  55 struct UpperFunc
  56 {
  57    char operator() (char c)
  58    {
  59       return std::toupper(c);
  60    }
  61 }; // eo struct UpperFunc
  62
  63
  64 struct LowerFunc
  65 {
  66    char operator() (char c)
  67    {
  68       return std::tolower(c);
  69    }
  70 }; // eo struct LowerFunc
  71
  72
  73 } // eo namespace <anonymous>
  74
  75
  76
  77 /**
  78  * default list of Whitespaces (" \t\r\n");
  79  */
  80 const std::string Whitespaces = " \t\r\n";
  81
  82 /**
  83  * default list of lineendings ("\r\n");
  84  */
  85 const std::string LineEndings= "\r\n";
  86
  87
  88
  89 /**
  90  * @brief checks if a string begins with a given prefix.
  91  * @param[in,out] str the string which is tested
  92  * @param prefix the prefix which should be tested for.
  93  * @return @a true iff the prefix is not empty and the string begins with that prefix.
  94  */
  95 bool has_prefix(const std::string& str, const std::string& prefix)
  96 {
  97    if (prefix.empty() || str.empty() || str.size() < prefix.size() )
  98    {
  99       return false;
 100    }
 101    return str.compare(0, prefix.size(), prefix) == 0;
 102 } // eo has_prefix(const std::string&,const std::string&)
 103
 104
 105 /**
 106  * @brief checks if a string ends with a given suffix.
 107  * @param[in,out] str the string which is tested
 108  * @param suffix the suffix which should be tested for.
 109  * @return @a true iff the suffix is not empty and the string ends with that suffix.
 110  */
 111 bool has_suffix(const std::string& str, const std::string& suffix)
 112 {
 113    if (suffix.empty() || str.empty() || str.size() < suffix.size() )
 114    {
 115       return false;
 116    }
 117    return str.compare(str.size() - suffix.size(), suffix.size(), suffix) == 0;
 118 } // eo has_suffix(const std::string&,const std::string&)
 119
 120
 121 /**
 122  * cut off characters from a given list from front and end of a string.
 123  * @param[in,out] str the string which should be trimmed.
 124  * @param charlist the list of characters to remove from beginning and end of string
 125  * @return the result string.
 126  */
 127 std::string trim_mod(std::string& str, const std::string& charlist)
 128 {
 129    // first: trim the beginning:
 130    std::string::size_type pos= str.find_first_not_of (charlist);
 131    if (pos == std::string::npos)
 132    {
 133       // whole string consists of charlist (or is already empty)
 134       str.clear();
 135       return str;
 136    }
 137    else if (pos>0)
 138    {
 139       // str starts with charlist
 140       str.erase(0,pos);
 141    }
 142    // now let's look at the tail:
 143    pos= str.find_last_not_of(charlist) +1;  // note: we already know there is at least one other char!
 144    if ( pos < str.size() )
 145    {
 146       str.erase(pos, str.size()-pos);
 147    }
 148    return str;
 149 } // eo trim_mod(std::string&,const std::string&)
 150
 151
 152
 153 /**
 154  * removes last character from a string when it is in a list of chars to be removed.
 155  * @param[in,out] str the string.
 156  * @param what the list of chars which will be tested for.
 157  * @return the resulting string with last char removed (if applicable)
 158  */
 159 std::string chomp_mod(std::string& str, const std::string& what)
 160 {
 161    if (str.empty() || what.empty() )
 162    {
 163       return str;
 164    }
 165    if (what.find(str.at (str.size()-1) ) != std::string::npos)
 166    {
 167       str.erase(str.size() - 1);
 168    }
 169    return str;
 170 } // eo chomp_mod(std::string&,const std::string&)
 171
 172
 173 /**
 174  * @brief converts a string to lower case.
 175  * @param[in,out] str the string to modify.
 176  * @return the string
 177  */
 178 std::string to_lower_mod(std::string& str)
 179 {
 180    std::transform(str.begin(), str.end(), str.begin(), LowerFunc() );
 181    return str;
 182 } // eo to_lower_mod(std::string&)
 183
 184
 185 /**
 186  * @brief converts a string to upper case.
 187  * @param[in,out] str the string to modify.
 188  * @return the string
 189  */
 190 std::string to_upper_mod(std::string& str)
 191 {
 192    std::transform( str.begin(), str.end(), str.begin(), UpperFunc() );
 193    return str;
 194 } // eo to_upper_mod(std::string&)
 195
 196
 197
 198 /**
 199  * cut off characters from a given list from front and end of a string.
 200  * @param str the string which should be trimmed.
 201  * @param charlist the list of characters to remove from beginning and end of string
 202  * @return the result string.
 203  */
 204 std::string trim (const std::string& str, const std::string& charlist)
 205 {
 206    // first: trim the beginning:
 207    std::string::size_type pos0= str.find_first_not_of(charlist);
 208    if (pos0 == std::string::npos)
 209    {
 210       // whole string consists of charlist (or is already empty)
 211       return std::string();
 212    }
 213    // now let's look at the end:
 214    std::string::size_type pos1= str.find_last_not_of(charlist);
 215    return str.substr(pos0, pos1 - pos0 + 1);
 216 } // eo trim(const std:.string&,const std::string&)
 217
 218
 219 /**
 220  * removes last character from a string when it is in a list of chars to be removed.
 221  * @param str the string.
 222  * @param what the list of chars which will be tested for.
 223  * @return the resulting string with last char removed (if applicable)
 224  */
 225 std::string chomp (const std::string& str, const std::string& what)
 226 {
 227    if (str.empty() || what.empty() )
 228    {
 229       return str;
 230    }
 231    if (what.find(str.at (str.size()-1) ) != std::string::npos)
 232    {
 233       return str.substr(0, str.size()-1);
 234    }
 235    return str;
 236 } // eo chomp(const std:.string&,const std::string&)
 237
 238
 239 /**
 240  * @brief returns a lower case version of a given string.
 241  * @param str the string
 242  * @return the lower case version of the string
 243  */
 244 std::string to_lower (const std::string& str)
 245 {
 246    std::string result(str);
 247    return to_lower_mod(result);
 248 } // eo to_lower(const std::string&)
 249
 250
 251 /**
 252  * @brief returns a upper case version of a given string.
 253  * @param str the string
 254  * @return the upper case version of the string
 255  */
 256 std::string to_upper(const std::string& str)
 257 {
 258    std::string result(str);
 259    return to_upper_mod(result);
 260 } // eo to_upper(const std::string&)
 261
 262
 263
 264 /**
 265  * @brief removes a given suffix from a string.
 266  * @param str the string.
 267  * @param suffix the suffix which should be removed if the string ends with it.
 268  * @return the string without the suffix.
 269  *
 270  * If the string ends with the suffix, it is removed. If the the string doesn't end
 271  * with the suffix the original string is returned.
 272  */
 273 std::string remove_suffix(const std::string& str, const std::string& suffix)
 274 {
 275    if (has_suffix(str,suffix) )
 276    {
 277       return str.substr(0, str.size()-suffix.size() );
 278    }
 279    return str;
 280 } // eo remove_suffix(const std::string&,const std::string&)
 281
 282
 283
 284 /**
 285  * @brief removes a given prefix from a string.
 286  * @param str the string.
 287  * @param prefix the prefix which should be removed if the string begins with it.
 288  * @return the string without the prefix.
 289  *
 290  * If the string begins with the prefix, it is removed. If the the string doesn't begin
 291  * with the prefix the original string is returned.
 292  */
 293 std::string remove_prefix(const std::string& str, const std::string& prefix)
 294 {
 295    if (has_prefix(str,prefix) )
 296    {
 297       return str.substr( prefix.size() );
 298    }
 299    return str;
 300 } // eo remove_prefix(const std::string&,const std::string&)
 301
 302
 303 /**
 304  * split a string to key and value delimited by a given delimiter.
 305  * The resulting key and value strings are trimmed (Whitespaces removed at beginning and end).
 306  * @param str the string which should be splitted.
 307  * @param[out] key the resulting key
 308  * @param[out] value the resulting value
 309  * @param delimiter the delimiter between key and value; default is '='.
 310  * @return @a true if the split was successful.
 311  */
 312 bool pair_split(
 313    const std::string& str,
 314    std::string& key,
 315    std::string& value,
 316    char delimiter)
 317 {
 318    std::string::size_type pos = str.find (delimiter);
 319    if (pos == std::string::npos) return false;
 320    key= str.substr(0,pos);
 321    value= str.substr(pos+1);
 322    trim_mod(key);
 323    trim_mod(value);
 324    return true;
 325 } // eo pair_split(const std::string&,std::string&,std::string&,char)
 326
 327
 328 /**
 329  * splits a string by given delimiter
 330  *
 331  * @param[in] str the string which should be splitted.
 332  * @param[out] result the list resulting from splitting  @a str.
 333  * @param[in] delimiter the delimiter (word/phrase) at which @a str should be splitted.
 334  * @param[in] omit_empty should empty parts not be stored?
 335  * @param[in] trim_list list of characters the parts should be trimmed by.
 336  *  (empty string results in no trim)
 337  */
 338 void split_string(
 339    const std::string& str,
 340    std::list<std::string>& result,
 341    const std::string& delimiter,
 342    bool omit_empty,
 343    const std::string& trim_list
 344 )
 345 {
 346    std::string::size_type pos, last_pos=0;
 347    bool delimiter_found= false;
 348    while ( last_pos < str.size()  && last_pos != std::string::npos)
 349    {
 350       pos= str.find(delimiter, last_pos);
 351       std::string part;
 352       if (pos == std::string::npos)
 353       {
 354          part= str.substr(last_pos);
 355          delimiter_found= false;
 356       }
 357       else
 358       {
 359          part= str.substr(last_pos, pos-last_pos);
 360          delimiter_found=true;
 361       }
 362       if (pos != std::string::npos)
 363       {
 364          last_pos= pos+ delimiter.size();
 365       }
 366       else
 367       {
 368          last_pos= std::string::npos;
 369       }
 370       if (!trim_list.empty() ) trim_mod (part, trim_list);
 371       if (omit_empty && part.empty() ) continue;
 372       result.push_back( part );
 373    }
 374    // if the string ends with a delimiter we need to append an empty string if no omit_empty
 375    // was given.
 376    // (this way we keep the split result consistent to a join operation)
 377    if (delimiter_found && !omit_empty)
 378    {
 379       result.push_back("");
 380    }
 381 } // eo split_string(const std::string&,std::list< std::string >&,const std::string&,bool,const std::string&)
 382
 383
 384 /** call split_string with list<string>, converts result to vector; vector is clear()-ed first
 385  *
 386  * Note: Uses 3 O(n)-operations: list.size, vector.resize and std::swap_ranges;
 387  *       not sure whether there is a better way to do this
 388  * */
 389 void split_string(
 390    const std::string& str,
 391    std::vector<std::string>& result,
 392    const std::string& delimiter,
 393    bool omit_empty,
 394    const std::string& trim_list
 395 )
 396 {
 397     std::list<std::string> tmp;
 398     split_string(str, tmp, delimiter, omit_empty, trim_list);
 399     std::size_t size = tmp.size();   // this is O(n)
 400     result.clear();
 401     result.resize(size);             // also O(n)
 402     std::swap_ranges(tmp.begin(), tmp.end(), result.begin());   // also O(n)
 403 }
 404
 405 /**
 406  * splits a string by a given delimiter
 407  * @param str the string which should be splitted.
 408  * @param delimiter delimiter the delimiter (word/phrase) at which @a str should be splitted.
 409  * @param[in] omit_empty should empty parts not be stored?
 410  * @param[in] trim_list list of characters the parts should be trimmed by.
 411  *  (empty string results in no trim)
 412  * @return the list resulting from splitting @a str.
 413  */
 414 std::list<std::string> split_string(
 415    const std::string& str,
 416    const std::string& delimiter,
 417    bool omit_empty,
 418    const std::string& trim_list
 419 )
 420 {
 421    std::list<std::string> result;
 422    split_string(str, result, delimiter, omit_empty, trim_list);
 423    return result;
 424 } // eo split_string(const std::string&,const std::string&,bool,const std::string&)
 425
 426
 427 /**
 428  * @brief joins a list of strings into a single string.
 429  *
 430  * This funtion is (basically) the reverse operation of @a split_string.
 431  *
 432  * @param parts the list of strings.
 433  * @param delimiter the delimiter which is inserted between the strings.
 434  * @return the joined string.
 435  */
 436 std::string join_string(
 437    const std::list< std::string >& parts,
 438    const std::string& delimiter
 439 )
 440 {
 441    std::string result;
 442    if (! parts.empty() )
 443    {
 444       std::list< std::string >::const_iterator it= parts.begin();
 445       result = *it;
 446       while ( ++it != parts.end() )
 447       {
 448          result+= delimiter;
 449          result+= *it;
 450       }
 451    }
 452    return result;
 453 } // eo join_string(const std::list< std::string >&,const std::string&)
 454
 455
 456 /** @brief same as join_string for list, except uses a vector */
 457 std::string join_string(
 458    const std::vector< std::string >& parts,
 459    const std::string& delimiter
 460 )
 461 {
 462    std::string result;
 463    if (! parts.empty() )
 464    {
 465       std::vector< std::string >::const_iterator it= parts.begin();
 466       result = *it;
 467       while ( ++it != parts.end() )
 468       {
 469          result+= delimiter;
 470          result+= *it;
 471       }
 472    }
 473    return result;
 474 } // eo join_string(const std::vector< std::string >&,const std::string&)
 475
 476
 477
 478 /*
 479 ** conversions
 480 */
 481
 482
 483 /**
 484  * @brief returns a hex string from a binary string.
 485  * @param str the (binary) string
 486  * @param upper_case_digits determine whether to use upper case characters for digits A-F.
 487  * @return the string in hex notation.
 488  */
 489 std::string convert_binary_to_hex(
 490    const std::string& str,
 491    bool upper_case_digits
 492 )
 493 {
 494    std::string result;
 495    std::string hexDigits(upper_case_digits ? hexDigitsUpper : hexDigitsLower);
 496    for ( std::string::const_iterator it= str.begin();
 497          it != str.end();
 498          ++it)
 499    {
 500       result.push_back( hexDigits[ ( (*it) >> 4) & 0x0f ] );
 501       result.push_back( hexDigits[ (*it) & 0x0f ] );
 502    }
 503    return result;
 504 } // eo convert_binary_to_hex(const std::string&,bool)
 505
 506
 507 /**
 508  * @brief converts a hex digit string to binary string.
 509  * @param str hex digit string
 510  * @return the binary string.
 511  *
 512  * The hex digit string may contains white spaces or colons which are treated
 513  * as delimiters between hex digit groups.
 514  *
 515  * @todo rework the handling of half nibbles (consistency)!
 516  */
 517 std::string convert_hex_to_binary(
 518    const std::string& str
 519 )
 520 throw (std::runtime_error)
 521 {
 522    std::string result;
 523    char c= 0;
 524    bool hasNibble= false;
 525    bool lastWasWS= true;
 526    for ( std::string::const_iterator it= str.begin();
 527          it != str.end();
 528          ++it)
 529    {
 530       std::string::size_type p = hexDigitsLower.find( *it );
 531       if (p== std::string::npos)
 532       {
 533          p= hexDigitsUpper.find( *it );
 534       }
 535       if (p == std::string::npos)
 536       {
 537          if (   ( Whitespaces.find( *it ) != std::string::npos) // is it a whitespace?
 538                 or ( *it == ':') // or a colon?
 539             )
 540          {
 541             // we treat that as a valid delimiter:
 542             if (hasNibble)
 543             {
 544                // 1 nibble before WS is treate as lower part:
 545                result.push_back(c);
 546                // reset state:
 547                hasNibble= false;
 548             }
 549             lastWasWS= true;
 550             continue;
 551          }
 552       }
 553       if (p == std::string::npos )
 554       {
 555          throw runtime_error("illegal character in hex digit string: " + str);
 556       }
 557       lastWasWS= false;
 558       if (hasNibble)
 559       {
 560          c<<=4;
 561       }
 562       else
 563       {
 564          c=0;
 565       }
 566       c+= (p & 0x0f);
 567       if (hasNibble)
 568       {
 569          //we already had a nibble, so a char is complete now:
 570          result.push_back( c );
 571          hasNibble=false;
 572       }
 573       else
 574       {
 575          // this is the first nibble of a new char:
 576          hasNibble=true;
 577       }
 578    }
 579    if (hasNibble)
 580    {
 581       //well, there is one nibble left
 582       // let's do some heuristics:
 583       if (lastWasWS)
 584       {
 585          // if the preceeding character was a white space (or a colon)
 586          // we treat the nibble as lower part:
 587          //( this is consistent with shortened hex notations where leading zeros are not noted)
 588          result.push_back( c );
 589       }
 590       else
 591       {
 592          // if it was part of a hex digit chain, we treat it as UPPER part (!!)
 593          result.push_back( c << 4 );
 594       }
 595    }
 596    return result;
 597 } // eo convert_hex_to_binary(const std::string&)
 598
 599
 600 } // eo namespace I2n
 601
 602
 603
 604
 605 std::string iso_to_utf8(const std::string& isostring)
 606 {
 607    string result;
 608
 609    iconv_t i2utf8 = iconv_open("UTF-8", "ISO-8859-1");
 610
 611    if (iso_to_utf8 == (iconv_t)-1)
 612       throw runtime_error("iconv can't convert from ISO-8859-1 to UTF-8");
 613
 614    size_t in_size=isostring.size();
 615    size_t out_size=in_size*4;
 616
 617    char *buf = (char *)malloc(out_size+1);
 618    if (buf == NULL)
 619       throw runtime_error("out of memory for iconv buffer");
 620
 621    char *in = (char *)isostring.c_str();
 622    char *out = buf;
 623    iconv(i2utf8, &in, &in_size, &out, &out_size);
 624
 625    buf[isostring.size()*4-out_size]=0;
 626
 627    result=buf;
 628
 629    free(buf);
 630    iconv_close(i2utf8);
 631
 632    return result;
 633 }
 634
 635 std::string utf8_to_iso(const std::string& utf8string)
 636 {
 637    string result;
 638
 639    iconv_t utf82iso = iconv_open("ISO-8859-1","UTF-8");
 640
 641    if (utf82iso == (iconv_t)-1)
 642       throw runtime_error("iconv can't convert from UTF-8 to ISO-8859-1");
 643
 644    size_t in_size=utf8string.size();
 645    size_t out_size=in_size;
 646
 647    char *buf = (char *)malloc(out_size+1);
 648    if (buf == NULL)
 649       throw runtime_error("out of memory for iconv buffer");
 650
 651    char *in = (char *)utf8string.c_str();
 652    char *out = buf;
 653    iconv(utf82iso, &in, &in_size, &out, &out_size);
 654
 655    buf[utf8string.size()-out_size]=0;
 656
 657    result=buf;
 658
 659    free(buf);
 660    iconv_close(utf82iso);
 661
 662    return result;
 663 }
 664
 665 wchar_t* utf8_to_wbuf(const std::string& utf8string)
 666 {
 667    iconv_t utf82wstr = iconv_open("UCS-4LE","UTF-8");
 668
 669    if (utf82wstr == (iconv_t)-1)
 670       throw runtime_error("iconv can't convert from UTF-8 to UCS-4");
 671
 672    size_t in_size=utf8string.size();
 673    size_t out_size= (in_size+1)*sizeof(wchar_t);
 674
 675    wchar_t *buf = (wchar_t *)malloc(out_size);
 676    if (buf == NULL)
 677       throw runtime_error("out of memory for iconv buffer");
 678
 679    char *in = (char *)utf8string.c_str();
 680    char *out = (char*) buf;
 681    if (iconv(utf82wstr, &in, &in_size, &out, &out_size) == (size_t)-1)
 682       throw runtime_error("error converting char encodings");
 683
 684    buf[ ( (utf8string.size()+1)*sizeof(wchar_t)-out_size) /sizeof(wchar_t) ]=0;
 685
 686    iconv_close(utf82wstr);
 687
 688    return buf;
 689 }
 690
 691 std::string utf7imap_to_utf8(const std::string& utf7imapstring)
 692 {
 693    string result;
 694
 695    iconv_t utf7imap2utf8 = iconv_open("UTF-8","UTF-7-IMAP");
 696
 697    if (utf7imap2utf8 == (iconv_t)-1)
 698       throw runtime_error("iconv can't convert from UTF-7-IMAP to UTF-8");
 699
 700    size_t in_size=utf7imapstring.size();
 701    size_t out_size=in_size*4;
 702
 703    char *buf = (char *)malloc(out_size+1);
 704    if (buf == NULL)
 705       throw runtime_error("out of memory for iconv buffer");
 706
 707    char *in = (char *)utf7imapstring.c_str();
 708    char *out = buf;
 709    iconv(utf7imap2utf8, &in, &in_size, &out, &out_size);
 710
 711    buf[utf7imapstring.size()*4-out_size]=0;
 712
 713    result=buf;
 714
 715    free(buf);
 716    iconv_close(utf7imap2utf8);
 717
 718    return result;
 719 }
 720
 721 std::string utf8_to_utf7imap(const std::string& utf8string)
 722 {
 723    string result;
 724
 725    iconv_t utf82utf7imap = iconv_open("UTF-7-IMAP", "UTF-8");
 726
 727    if (utf82utf7imap == (iconv_t)-1)
 728       throw runtime_error("iconv can't convert from UTF-7-IMAP to UTF-8");
 729
 730    // UTF-7 is base64 encoded, a buffer 10x as large
 731    // as the utf-8 buffer should be enough. If not the string will be truncated.
 732    size_t in_size=utf8string.size();
 733    size_t out_size=in_size*10;
 734
 735    char *buf = (char *)malloc(out_size+1);
 736    if (buf == NULL)
 737       throw runtime_error("out of memory for iconv buffer");
 738
 739    char *in = (char *)utf8string.c_str();
 740    char *out = buf;
 741    iconv(utf82utf7imap, &in, &in_size, &out, &out_size);
 742
 743    buf[utf8string.size()*10-out_size]= 0;
 744
 745    result=buf;
 746
 747    free(buf);
 748    iconv_close(utf82utf7imap);
 749
 750    return result;
 751 }
 752
 753 // Tokenize string by (html) tags
 754 void tokenize_by_tag(vector<pair<string,bool> > &tokenized, const std::string &input)
 755 {
 756    string::size_type pos, len = input.size();
 757    bool inside_tag = false;
 758    string current;
 759
 760    for (pos = 0; pos < len; pos++)
 761    {
 762       if (input[pos] == '<')
 763       {
 764          inside_tag = true;
 765
 766          if (!current.empty() )
 767          {
 768             tokenized.push_back( make_pair(current, false) );
 769             current = "";
 770          }
 771
 772          current += input[pos];
 773       }
 774       else if (input[pos] == '>' && inside_tag)
 775       {
 776          current += input[pos];
 777          inside_tag = false;
 778          if (!current.empty() )
 779          {
 780             tokenized.push_back( make_pair(current, true) );
 781             current = "";
 782          }
 783       }
 784       else
 785          current += input[pos];
 786    }
 787
 788    // String left over in buffer?
 789    if (!current.empty() )
 790       tokenized.push_back( make_pair(current, false) );
 791 } // eo tokenize_by_tag
 792
 793
 794 std::string strip_html_tags(const std::string &input)
 795 {
 796    // Pair first: string, second: isTag
 797    vector<pair<string,bool> > tokenized;
 798    tokenize_by_tag (tokenized, input);
 799
 800    string output;
 801    vector<pair<string,bool> >::const_iterator token, tokens_end = tokenized.end();
 802    for (token = tokenized.begin(); token != tokens_end; ++token)
 803       if (!token->second)
 804          output += token->first;
 805
 806    return output;
 807 } // eo strip_html_tags
 808
 809
 810 // Smart-encode HTML en
 811 string smart_html_entities(const std::string &input)
 812 {
 813    // Pair first: string, second: isTag
 814    vector<pair<string,bool> > tokenized;
 815    tokenize_by_tag (tokenized, input);
 816
 817    string output;
 818    vector<pair<string,bool> >::const_iterator token, tokens_end = tokenized.end();
 819    for (token = tokenized.begin(); token != tokens_end; ++token)
 820    {
 821       // keep HTML tags as they are
 822       if (token->second)
 823          output += token->first;
 824       else
 825          output += html_entities(token->first);
 826    }
 827
 828    return output;
 829 }
 830
 831
 832 string::size_type find_8bit(const std::string &str)
 833 {
 834    string::size_type l=str.size();
 835    for (string::size_type p=0; p < l; p++)
 836       if (static_cast<unsigned char>(str[p]) > 127)
 837          return p;
 838
 839    return string::npos;
 840 }
 841
 842 // encoded UTF-8 chars into HTML entities
 843 string html_entities(std::string str)
 844 {
 845    // Normal chars
 846    replace_all (str, "&", "&amp;");
 847    replace_all (str, "<", "&lt;");
 848    replace_all (str, ">", "&gt;");
 849    replace_all (str, "\"", "&quot;");
 850    replace_all (str, "'", "&#x27;");
 851    replace_all (str, "/", "&#x2F;");
 852
 853    // Umlauts
 854    replace_all (str, "\xC3\xA4", "&auml;");
 855    replace_all (str, "\xC3\xB6", "&ouml;");
 856    replace_all (str, "\xC3\xBC", "&uuml;");
 857    replace_all (str, "\xC3\x84", "&Auml;");
 858    replace_all (str, "\xC3\x96", "&Ouml;");
 859    replace_all (str, "\xC3\x9C", "&Uuml;");
 860
 861    // Misc
 862    replace_all (str, "\xC3\x9F", "&szlig;");
 863
 864    // conversion of remaining non-ASCII chars needed?
 865    // just do if needed because of performance
 866    if (find_8bit(str) != string::npos)
 867    {
 868       // convert to fixed-size encoding UTF-32
 869       wchar_t* wbuf=utf8_to_wbuf(str);
 870       ostringstream target;
 871
 872       // replace all non-ASCII chars with HTML representation
 873       for (int p=0; wbuf[p] != 0; p++)
 874       {
 875          unsigned int c=wbuf[p];
 876
 877          if (c <= 127)
 878             target << static_cast<unsigned char>(c);
 879          else
 880             target << "&#" << c << ';';
 881       }
 882
 883       free(wbuf);
 884
 885       str=target.str();
 886    }
 887
 888    return str;
 889 } // eo html_entities(std::string)
 890
 891 // convert HTML entities to something that can be viewed on a basic text console (restricted to ASCII-7)
 892 string html_entities_to_console(std::string str)
 893 {
 894    // Normal chars
 895    replace_all (str, "&amp;", "&");
 896    replace_all (str, "&lt;", "<");
 897    replace_all (str, "&gt;", ">");
 898    replace_all (str, "&quot;", "\"");
 899    replace_all (str, "&#x27;", "'");
 900    replace_all (str, "&#x2F;", "/");
 901
 902    // Umlauts
 903    replace_all (str, "&auml;", "ae");
 904    replace_all (str, "&ouml;", "oe");
 905    replace_all (str, "&uuml;", "ue");
 906    replace_all (str, "&Auml;", "Ae");
 907    replace_all (str, "&Ouml;", "Oe");
 908    replace_all (str, "&Uuml;", "Ue");
 909
 910    // Misc
 911    replace_all (str, "&szlig;", "ss");
 912
 913    return str;
 914 }
 915
 916 // find_html_comments + remove_html_comments(str, comments)
 917 void remove_html_comments(string &str)
 918 {
 919     vector<CommentZone> comments = find_html_comments(str);
 920     remove_html_comments(str, comments);
 921 }
 922
 923 // find all html comments, behaving correctly if they are nested; ignores comment tags ("<!--FOO .... BAR-->")
 924 // If there are invalid comments ("-->" before "<!--" or different number of closing and opening tags),
 925 // then the unknown index of corresponding start/end tag will be represented by a string::npos
 926 // Indices are from start of start tag until first index after closing tag
 927 vector<CommentZone> find_html_comments(const std::string &str)
 928 {
 929     static const string START = "<!--";
 930     static const string CLOSE = "-->";
 931     static const string::size_type START_LEN = START.length();
 932     static const string::size_type CLOSE_LEN = CLOSE.length();
 933
 934     vector<CommentZone> comments;
 935
 936     // in order to find nested comments, need either recursion or a stack
 937     vector<string::size_type> starts;      // stack of start tags
 938
 939     string::size_type pos = 0;
 940     string::size_type len = str.length();
 941     string::size_type next_start, next_close;
 942
 943     while (pos < len)     // not really needed but just in case
 944     {
 945         next_start = str.find(START, pos);
 946         next_close = str.find(CLOSE, pos);
 947
 948         if ( (next_start == string::npos) && (next_close == string::npos) )
 949             break;   // we are done
 950
 951         else if ( (next_start == string::npos) || (next_close < next_start) )  // close one comment (pop)
 952         {
 953             if (starts.empty())    // closing tag without a start
 954                 comments.push_back(CommentZone(string::npos, next_close+CLOSE_LEN));
 955             else
 956             {
 957                 comments.push_back(CommentZone(starts.back(), next_close+CLOSE_LEN));
 958                 starts.pop_back();
 959             }
 960             pos = next_close + CLOSE_LEN;
 961         }
 962
 963         else if ( (next_close == string::npos) || (next_start < next_close) )  // start a new comment (push)
 964         {
 965             starts.push_back(next_start);
 966             pos = next_start + START_LEN;
 967         }
 968     }
 969
 970     // add comments that have no closing tag from back to front (important for remove_html_comments!)
 971     while (!starts.empty())
 972     {
 973         comments.push_back(CommentZone(starts.back(), string::npos));
 974         starts.pop_back();
 975     }
 976
 977     return comments;
 978 }
 979
 980 // remove all html comments foundby find_html_comments
 981 void remove_html_comments(std::string &str, const vector<CommentZone> &comments)
 982 {
 983     // remember position where last removal started
 984     string::size_type last_removal_start = str.length();
 985
 986     // Go from back to front to not mess up indices.
 987     // This requires that bigger comments, that contain smaller comments, come AFTER
 988     // the small contained comments in the comments vector (i.e. comments are ordered by
 989     // their closing tag, not their opening tag). This is true for results from find_html_comments
 990     BOOST_REVERSE_FOREACH(const CommentZone &comment, comments)
 991     {
 992         if (comment.first == string::npos)
 993         {
 994             str = str.replace(0, comment.second, "");   // comment starts "before" str --> delete from start
 995             break;   // there can be no more
 996         }
 997         else if (comment.first >= last_removal_start)
 998         {
 999             continue;    // this comment is inside another comment that we have removed already
1000         }
1001         else if (comment.second == string::npos)   // comment ends "after" str --> delete until end
1002         {
1003             str = str.replace(comment.first, string::npos, "");
1004             last_removal_start = comment.first;
1005         }
1006         else
1007         {
1008             str = str.replace(comment.first, comment.second-comment.first, "");
1009             last_removal_start = comment.first;
1010         }
1011     }
1012 }
1013
1014 bool replace_all(string &base, const char *ist, const char *soll)
1015 {
1016    string i=ist;
1017    string s=soll;
1018    return replace_all(base,&i,&s);
1019 }
1020
1021 bool replace_all(string &base, const string &ist, const char *soll)
1022 {
1023    string s=soll;
1024    return replace_all(base,&ist,&s);
1025 }
1026
1027 bool replace_all(string &base, const string *ist, const string *soll)
1028 {
1029    return replace_all(base,*ist,*soll);
1030 }
1031
1032 bool replace_all(string &base, const char *ist, const string *soll)
1033 {
1034    string i=ist;
1035    return replace_all(base,&i,soll);
1036 }
1037
/**
 * @brief replaces every occurrence of @a ist in @a base with @a soll
 * @param base the string to modify in place
 * @param ist the text to search for; must not be empty
 * @param soll the text to insert instead
 * @return true if at least one occurrence was replaced
 * @throws runtime_error if @a ist is empty
 *
 * The search resumes behind each inserted replacement, so text introduced by
 * @a soll is never matched again.
 */
bool replace_all(string &base, const string &ist, const string &soll)
{
   if (ist.empty() )
      throw runtime_error ("replace_all called with empty search string");

   bool replaced_any = false;

   // sizes are read on every round on purpose (not hoisted), exactly when needed
   for (string::size_type hit = base.find(ist);
        hit != string::npos;
        hit = base.find(ist, hit + soll.size() ) )
   {
      base.replace(hit, ist.size(), soll);
      replaced_any = true;
   }

   return replaced_any;
}
1055
/**
 * @brief replaces every character that is not plain printable ASCII so the result can be logged safely
 * @param str the original string
 * @param replace_with the character substituted for each unsafe character
 * @return a copy of @a str that is safe to send to syslog or other logfiles
 *
 * Only characters from 0x20 (space) up to and including 0x7E (~) are kept.
 * See e.g. RFC 5424, section 8.2 or the posix character class "printable".
 * NUL, control characters, DEL and all 8 bit characters (and therefore every
 * byte of a multi-byte UTF-8 sequence) are replaced.
 */
std::string sanitize_for_logging(const std::string &str, const char replace_with)
{
    std::string cleaned(str);

    for (std::string::iterator it = cleaned.begin(); it != cleaned.end(); ++it)
    {
        // holds for signed and unsigned char alike: 8 bit values are either
        // negative (below 0x20) or above 0x7E
        const bool printable = (*it >= 0x20) && (*it <= 0x7E);
        if (!printable)
            *it = replace_with;
    }

    return cleaned;
}
1078
// The two helpers below are excluded from compilation by "#if 0".
// NOTE(review): presumably kept for reference only and superseded elsewhere -- confirm before removing.
#if 0
// Returns a copy of src with tolower() applied to every character.
string to_lower(const string &src)
{
   string dst = src;

   // the length is captured once; the loop never changes it
   string::size_type pos, end = dst.size();
   for (pos = 0; pos < end; pos++)
      dst[pos] = tolower(dst[pos]);

   return dst;
}

// Returns a copy of src with toupper() applied to every character.
string to_upper(const string &src)
{
   string dst = src;

   // the length is captured once; the loop never changes it
   string::size_type pos, end = dst.size();
   for (pos = 0; pos < end; pos++)
      dst[pos] = toupper(dst[pos]);

   return dst;
}
#endif
1102
// Number of entries in each of the unit symbol tables below; nice_unit_format()
// uses it to stop scaling once the largest symbol is reached.
const int MAX_UNIT_FORMAT_SYMBOLS = 6;

// Short unit suffixes (with leading blank), indexed by how many times
// nice_unit_format() divided the value by the unit multiple.
// NOTE(review): these are passed through i18n() when printed but are not wrapped
// in i18n_noop() like the long symbols -- confirm whether that is intended.
const string shortUnitFormatSymbols[MAX_UNIT_FORMAT_SYMBOLS] = {
        " B",
        " KB",
        " MB",
        " GB",
        " TB",
        " PB"
};

// Long unit suffixes (with leading blank), indexed the same way as the short ones.
// i18n_noop() presumably only marks the literals for translation extraction;
// the actual lookup happens via i18n() in nice_unit_format().
const string longUnitFormatSymbols[MAX_UNIT_FORMAT_SYMBOLS] = {
        i18n_noop(" Bytes"),
        i18n_noop(" KBytes"),
        i18n_noop(" MBytes"),
        i18n_noop(" GBytes"),
        i18n_noop(" TBytes"),
        i18n_noop(" PBytes")
};
1122
1123
/**
 * @brief rounds a number "half up" to the nearest multiple of 1/rounding_multiplier
 * @param number the value to round
 * @param rounding_multiplier reciprocal of the rounding step
 *        (10 rounds to tenths, 100 to hundredths, ...)
 * @return the rounded value
 *
 * Implemented as floor(x + 0.5) on the scaled value. The former cast to int64_t
 * truncated toward zero, which rounded negative values to the wrong neighbour
 * (e.g. -3 with multiplier 10 became -2.9), and was undefined as soon as the
 * scaled value left the int64_t range. Results for non-negative values that fit
 * into int64_t are unchanged.
 */
long double rounding_upwards(
        const long double number,
        const int rounding_multiplier
)
{
    const long double scaled = number * rounding_multiplier;
    const long double rounded = std::floor(scaled + 0.5L);

    return rounded / static_cast<long double>(rounding_multiplier);
}
1137
1138
1139 string nice_unit_format(
1140         const int64_t input,
1141         const UnitFormat format,
1142         const UnitBase base
1143 )
1144 {
1145    // select the system of units (decimal or binary)
1146    int multiple = 0;
1147    if (base == UnitBase1000)
1148    {
1149        multiple = 1000;
1150    }
1151    else
1152    {
1153        multiple = 1024;
1154    }
1155
1156    long double size = input;
1157
1158    // check the size of the input number to fit in the appropriate symbol
1159    int sizecount = 0;
1160    while (size > multiple)
1161    {
1162        size = size / multiple;
1163        sizecount++;
1164
1165        // rollback to the previous values and stop the loop when cannot
1166        // represent the number length.
1167        if (sizecount >= MAX_UNIT_FORMAT_SYMBOLS)
1168        {
1169            size = size * multiple;
1170            sizecount--;
1171            break;
1172        }
1173    }
1174
1175    // round the input number "half up" to multiples of 10
1176    const int rounding_multiplier = 10;
1177    size = rounding_upwards(size, rounding_multiplier);
1178
1179    // format the input number, placing the appropriate symbol
1180    ostringstream out;
1181    out.setf (ios::fixed);
1182    if (format == ShortUnitFormat)
1183    {
1184        out.precision(1);
1185        out << size << i18n( shortUnitFormatSymbols[sizecount].c_str() );
1186    }
1187    else
1188    {
1189        out.precision (2);
1190        out << size << i18n( longUnitFormatSymbols[sizecount].c_str() );
1191    }
1192
1193    return out.str();
1194 } // eo nice_unit_format(int input)
1195
1196
1197 string nice_unit_format(
1198         const double input,
1199         const UnitFormat format,
1200         const UnitBase base
1201 )
1202 {
1203     // round as double and cast to int64_t
1204     // cast raised overflow error near max val of int64_t (~9.2e18, see unittest)
1205     int64_t input_casted_and_rounded =
1206         boost::numeric_cast<int64_t>( round(input) );
1207
1208     // now call other
1209     return nice_unit_format( input_casted_and_rounded, format, base );
1210 } // eo nice_unit_format(double input)
1211
1212
/**
 * @brief converts a string into its quoted, backslash-escaped form
 * @param s the raw string
 * @return @a s enclosed in double quotes, with every double quote and backslash
 *         prefixed by a backslash, and carriage return / line feed written as
 *         the two-character sequences \\r and \\n
 */
string escape(const string &s)
{
   string out;
   out.reserve (s.size() + 2);

   out += '"';
   for (string::const_iterator it = s.begin(); it != s.end(); ++it)
   {
      switch (*it)
      {
         case '"':
         case '\\':
            // keep the character, but protect it with a backslash
            out += '\\';
            out += *it;
            break;
         case '\r':
            out += "\\r";
            break;
         case '\n':
            out += "\\n";
            break;
         default:
            out += *it;
      }
   }
   out += '"';

   return out;
} // eo escape(const std::string&)
1243
1244
/**
 * @brief extracts and unescapes a double-quoted, backslash-escaped string
 * @param s the text containing the escaped string
 * @param startpos index of the opening double quote within @a s
 * @param endpos output: index of the closing double quote within @a s; if no
 *        unescaped closing quote exists the remainder of @a s is used and
 *        @a endpos ends up equal to @a startpos (unsigned wrap-around of npos + 1)
 * @return the unescaped content between the quotes
 * @throws out_of_range if the character at @a startpos is not a double quote,
 *         if @a startpos lies outside @a s, or if the content ends in a lone backslash
 */
string descape(const string &s, int startpos, int &endpos)
{
   if (s.at(startpos) != '"')
      throw out_of_range("value not type escaped string");

   // everything behind the opening quote
   const string tail = s.substr(startpos+1);

   // look for the first quote preceded by an even number of backslashes
   string::size_type quote = 0;
   while ( (quote = tail.find('"', quote) ) != string::npos)
   {
      string::size_type backslashes = 0;
      while (backslashes < quote && tail[quote - 1 - backslashes] == '\\')
         ++backslashes;

      if (backslashes % 2 == 0)
         break;      // not escaped: this is the terminator

      ++quote;       // escaped quote belongs to the content, keep searching
   }

   // cut off the terminator and whatever follows it
   string out = tail.substr(0, quote);

   // tell calling prog about the endposition
   endpos=startpos+quote+1;

   // resolve the backslash sequences in place; each round continues behind
   // the character the sequence collapsed into
   for (string::size_type slash = out.find('\\');
        slash != string::npos;
        slash = out.find('\\', slash + 1) )
   {
      const char code = out.at(slash + 1);

      if (code == 'r')
         out.replace(slash, 2, "\r");
      else if (code == 'n')
         out.replace(slash, 2, "\n");
      else
         out.erase(slash, 1);   // drop the backslash, keep the escaped character
   }

   return out;
} // eo descape(const std::string&,int,int&)
1304
1305
/**
 * @brief wraps a string in single quotes for use as one shell argument
 * @param input the raw argument
 * @return @a input enclosed in single quotes; every embedded single quote is
 *         written as '\'' (close the quoting, emit an escaped quote, reopen)
 */
string escape_shellarg(const string &input)
{
   string quoted(1, '\'');

   for (string::size_type idx = 0; idx < input.size(); ++idx)
   {
      const char c = input[idx];
      if (c == '\'')
         quoted.append("'\\''");
      else
         quoted.push_back(c);
   }

   quoted.push_back('\'');
   return quoted;
}