developer.intra2net.com Git - libi2ncommon/blob - src/stringfunc.cpp

   1 /*
   2 The software in this package is distributed under the GNU General
   3 Public License version 2 (with a special exception described below).
   4
   5 A copy of GNU General Public License (GPL) is included in this distribution,
   6 in the file COPYING.GPL.
   7
   8 As a special exception, if other files instantiate templates or use macros
   9 or inline functions from this file, or you compile this file and link it
  10 with other works to produce a work based on this file, this file
  11 does not by itself cause the resulting work to be covered
  12 by the GNU General Public License.
  13
  14 However the source code for this file must still be made available
  15 in accordance with section (3) of the GNU General Public License.
  16
  17 This exception does not invalidate any other reasons why a work based
  18 on this file might be covered by the GNU General Public License.
  19 */
  20 /** @file
  21  *
  22  * (c) Copyright 2007-2008 by Intra2net AG
  23  */
  24
  25 #include <iostream>
  26 #include <string>
  27 #include <sstream>
  28 #include <stdexcept>
  29 #include <algorithm>
  30 #include <cmath>    // for round()
  31
  32 #include <wchar.h>
  33 #include <stdlib.h>
  34 #include <iconv.h>
  35 #include <i18n.h>
  36
  37 #include <boost/numeric/conversion/cast.hpp>
  38
  39 #include <stringfunc.hxx>
  40
  41 using namespace std;
  42
  43 namespace I2n
  44 {
  45
  46
  47 namespace
  48 {
  49
  50 const std::string hexDigitsLower("0123456789abcdef");
  51 const std::string hexDigitsUpper("0123456789ABCDEF");
  52
  53
  54 struct UpperFunc
  55 {
  56    char operator() (char c)
  57    {
  58       return std::toupper(c);
  59    }
  60 }; // eo struct UpperFunc
  61
  62
  63 struct LowerFunc
  64 {
  65    char operator() (char c)
  66    {
  67       return std::tolower(c);
  68    }
  69 }; // eo struct LowerFunc
  70
  71
  72 } // eo namespace <anonymous>
  73
  74
  75
/**
 * Default set of whitespace characters: space, tab, carriage return and line feed.
 */
const std::string Whitespaces = " \t\r\n";

/**
 * Default set of line ending characters: carriage return and line feed.
 */
const std::string LineEndings= "\r\n";
  85
  86
  87
  88 /**
  89  * @brief checks if a string begins with a given prefix.
  90  * @param[in,out] str the string which is tested
  91  * @param prefix the prefix which should be tested for.
  92  * @return @a true iff the prefix is not empty and the string begins with that prefix.
  93  */
  94 bool has_prefix(const std::string& str, const std::string& prefix)
  95 {
  96    if (prefix.empty() || str.empty() || str.size() < prefix.size() )
  97    {
  98       return false;
  99    }
 100    return str.compare(0, prefix.size(), prefix) == 0;
 101 } // eo has_prefix(const std::string&,const std::string&)
 102
 103
 104 /**
 105  * @brief checks if a string ends with a given suffix.
 106  * @param[in,out] str the string which is tested
 107  * @param suffix the suffix which should be tested for.
 108  * @return @a true iff the suffix is not empty and the string ends with that suffix.
 109  */
 110 bool has_suffix(const std::string& str, const std::string& suffix)
 111 {
 112    if (suffix.empty() || str.empty() || str.size() < suffix.size() )
 113    {
 114       return false;
 115    }
 116    return str.compare(str.size() - suffix.size(), suffix.size(), suffix) == 0;
 117 } // eo has_suffix(const std::string&,const std::string&)
 118
 119
 120 /**
 121  * cut off characters from a given list from front and end of a string.
 122  * @param[in,out] str the string which should be trimmed.
 123  * @param charlist the list of characters to remove from beginning and end of string
 124  * @return the result string.
 125  */
 126 std::string trim_mod(std::string& str, const std::string& charlist)
 127 {
 128    // first: trim the beginning:
 129    std::string::size_type pos= str.find_first_not_of (charlist);
 130    if (pos == std::string::npos)
 131    {
 132       // whole string consists of charlist (or is already empty)
 133       str.clear();
 134       return str;
 135    }
 136    else if (pos>0)
 137    {
 138       // str starts with charlist
 139       str.erase(0,pos);
 140    }
 141    // now let's look at the tail:
 142    pos= str.find_last_not_of(charlist) +1;  // note: we already know there is at least one other char!
 143    if ( pos < str.size() )
 144    {
 145       str.erase(pos, str.size()-pos);
 146    }
 147    return str;
 148 } // eo trim_mod(std::string&,const std::string&)
 149
 150
 151
 152 /**
 153  * removes last character from a string when it is in a list of chars to be removed.
 154  * @param[in,out] str the string.
 155  * @param what the list of chars which will be tested for.
 156  * @return the resulting string with last char removed (if applicable)
 157  */
 158 std::string chomp_mod(std::string& str, const std::string& what)
 159 {
 160    if (str.empty() || what.empty() )
 161    {
 162       return str;
 163    }
 164    if (what.find(str.at (str.size()-1) ) != std::string::npos)
 165    {
 166       str.erase(str.size() - 1);
 167    }
 168    return str;
 169 } // eo chomp_mod(std::string&,const std::string&)
 170
 171
 172 /**
 173  * @brief converts a string to lower case.
 174  * @param[in,out] str the string to modify.
 175  * @return the string
 176  */
 177 std::string to_lower_mod(std::string& str)
 178 {
 179    std::transform(str.begin(), str.end(), str.begin(), LowerFunc() );
 180    return str;
 181 } // eo to_lower_mod(std::string&)
 182
 183
 184 /**
 185  * @brief converts a string to upper case.
 186  * @param[in,out] str the string to modify.
 187  * @return the string
 188  */
 189 std::string to_upper_mod(std::string& str)
 190 {
 191    std::transform( str.begin(), str.end(), str.begin(), UpperFunc() );
 192    return str;
 193 } // eo to_upper_mod(std::string&)
 194
 195
 196
 197 /**
 198  * cut off characters from a given list from front and end of a string.
 199  * @param str the string which should be trimmed.
 200  * @param charlist the list of characters to remove from beginning and end of string
 201  * @return the result string.
 202  */
 203 std::string trim (const std::string& str, const std::string& charlist)
 204 {
 205    // first: trim the beginning:
 206    std::string::size_type pos0= str.find_first_not_of(charlist);
 207    if (pos0 == std::string::npos)
 208    {
 209       // whole string consists of charlist (or is already empty)
 210       return std::string();
 211    }
 212    // now let's look at the end:
 213    std::string::size_type pos1= str.find_last_not_of(charlist);
 214    return str.substr(pos0, pos1 - pos0 + 1);
 215 } // eo trim(const std:.string&,const std::string&)
 216
 217
 218 /**
 219  * removes last character from a string when it is in a list of chars to be removed.
 220  * @param str the string.
 221  * @param what the list of chars which will be tested for.
 222  * @return the resulting string with last char removed (if applicable)
 223  */
 224 std::string chomp (const std::string& str, const std::string& what)
 225 {
 226    if (str.empty() || what.empty() )
 227    {
 228       return str;
 229    }
 230    if (what.find(str.at (str.size()-1) ) != std::string::npos)
 231    {
 232       return str.substr(0, str.size()-1);
 233    }
 234    return str;
 235 } // eo chomp(const std:.string&,const std::string&)
 236
 237
 238 /**
 239  * @brief returns a lower case version of a given string.
 240  * @param str the string
 241  * @return the lower case version of the string
 242  */
 243 std::string to_lower (const std::string& str)
 244 {
 245    std::string result(str);
 246    return to_lower_mod(result);
 247 } // eo to_lower(const std::string&)
 248
 249
 250 /**
 251  * @brief returns a upper case version of a given string.
 252  * @param str the string
 253  * @return the upper case version of the string
 254  */
 255 std::string to_upper(const std::string& str)
 256 {
 257    std::string result(str);
 258    return to_upper_mod(result);
 259 } // eo to_upper(const std::string&)
 260
 261
 262
 263 /**
 264  * @brief removes a given suffix from a string.
 265  * @param str the string.
 266  * @param suffix the suffix which should be removed if the string ends with it.
 267  * @return the string without the suffix.
 268  *
 269  * If the string ends with the suffix, it is removed. If the the string doesn't end
 270  * with the suffix the original string is returned.
 271  */
 272 std::string remove_suffix(const std::string& str, const std::string& suffix)
 273 {
 274    if (has_suffix(str,suffix) )
 275    {
 276       return str.substr(0, str.size()-suffix.size() );
 277    }
 278    return str;
 279 } // eo remove_suffix(const std::string&,const std::string&)
 280
 281
 282
 283 /**
 284  * @brief removes a given prefix from a string.
 285  * @param str the string.
 286  * @param prefix the prefix which should be removed if the string begins with it.
 287  * @return the string without the prefix.
 288  *
 289  * If the string begins with the prefix, it is removed. If the the string doesn't begin
 290  * with the prefix the original string is returned.
 291  */
 292 std::string remove_prefix(const std::string& str, const std::string& prefix)
 293 {
 294    if (has_prefix(str,prefix) )
 295    {
 296       return str.substr( prefix.size() );
 297    }
 298    return str;
 299 } // eo remove_prefix(const std::string&,const std::string&)
 300
 301
 302 /**
 303  * split a string to key and value delimited by a given delimiter.
 304  * The resulting key and value strings are trimmed (Whitespaces removed at beginning and end).
 305  * @param str the string which should be splitted.
 306  * @param[out] key the resulting key
 307  * @param[out] value the resulting value
 308  * @param delimiter the delimiter between key and value; default is '='.
 309  * @return @a true if the split was successful.
 310  */
 311 bool pair_split(
 312    const std::string& str,
 313    std::string& key,
 314    std::string& value,
 315    char delimiter)
 316 {
 317    std::string::size_type pos = str.find (delimiter);
 318    if (pos == std::string::npos) return false;
 319    key= str.substr(0,pos);
 320    value= str.substr(pos+1);
 321    trim_mod(key);
 322    trim_mod(value);
 323    return true;
 324 } // eo pair_split(const std::string&,std::string&,std::string&,char)
 325
 326
 327 /**
 328  * splits a string by given delimiter
 329  *
 330  * @param[in] str the string which should be splitted.
 331  * @param[out] result the list resulting from splitting  @a str.
 332  * @param[in] delimiter the delimiter (word/phrase) at which @a str should be splitted.
 333  * @param[in] omit_empty should empty parts not be stored?
 334  * @param[in] trim_list list of characters the parts should be trimmed by.
 335  *  (empty string results in no trim)
 336  */
 337 void split_string(
 338    const std::string& str,
 339    std::list<std::string>& result,
 340    const std::string& delimiter,
 341    bool omit_empty,
 342    const std::string& trim_list
 343 )
 344 {
 345    std::string::size_type pos, last_pos=0;
 346    bool delimiter_found= false;
 347    while ( last_pos < str.size()  && last_pos != std::string::npos)
 348    {
 349       pos= str.find(delimiter, last_pos);
 350       std::string part;
 351       if (pos == std::string::npos)
 352       {
 353          part= str.substr(last_pos);
 354          delimiter_found= false;
 355       }
 356       else
 357       {
 358          part= str.substr(last_pos, pos-last_pos);
 359          delimiter_found=true;
 360       }
 361       if (pos != std::string::npos)
 362       {
 363          last_pos= pos+ delimiter.size();
 364       }
 365       else
 366       {
 367          last_pos= std::string::npos;
 368       }
 369       if (!trim_list.empty() ) trim_mod (part, trim_list);
 370       if (omit_empty && part.empty() ) continue;
 371       result.push_back( part );
 372    }
 373    // if the string ends with a delimiter we need to append an empty string if no omit_empty
 374    // was given.
 375    // (this way we keep the split result consistent to a join operation)
 376    if (delimiter_found && !omit_empty)
 377    {
 378       result.push_back("");
 379    }
 380 } // eo split_string(const std::string&,std::list< std::string >&,const std::string&,bool,const std::string&)
 381
 382
 383 /** call split_string with list<string>, converts result to vector; vector is clear()-ed first
 384  *
 385  * Note: Uses 3 O(n)-operations: list.size, vector.resize and std::swap_ranges;
 386  *       not sure whether there is a better way to do this
 387  * */
 388 void split_string(
 389    const std::string& str,
 390    std::vector<std::string>& result,
 391    const std::string& delimiter,
 392    bool omit_empty,
 393    const std::string& trim_list
 394 )
 395 {
 396     std::list<std::string> tmp;
 397     split_string(str, tmp, delimiter, omit_empty, trim_list);
 398     std::size_t size = tmp.size();   // this is O(n)
 399     result.clear();
 400     result.resize(size);             // also O(n)
 401     std::swap_ranges(tmp.begin(), tmp.end(), result.begin());   // also O(n)
 402 }
 403
 404 /**
 405  * splits a string by a given delimiter
 406  * @param str the string which should be splitted.
 407  * @param delimiter delimiter the delimiter (word/phrase) at which @a str should be splitted.
 408  * @param[in] omit_empty should empty parts not be stored?
 409  * @param[in] trim_list list of characters the parts should be trimmed by.
 410  *  (empty string results in no trim)
 411  * @return the list resulting from splitting @a str.
 412  */
 413 std::list<std::string> split_string(
 414    const std::string& str,
 415    const std::string& delimiter,
 416    bool omit_empty,
 417    const std::string& trim_list
 418 )
 419 {
 420    std::list<std::string> result;
 421    split_string(str, result, delimiter, omit_empty, trim_list);
 422    return result;
 423 } // eo split_string(const std::string&,const std::string&,bool,const std::string&)
 424
 425
 426 /**
 427  * @brief joins a list of strings into a single string.
 428  *
 429  * This funtion is (basically) the reverse operation of @a split_string.
 430  *
 431  * @param parts the list of strings.
 432  * @param delimiter the delimiter which is inserted between the strings.
 433  * @return the joined string.
 434  */
 435 std::string join_string(
 436    const std::list< std::string >& parts,
 437    const std::string& delimiter
 438 )
 439 {
 440    std::string result;
 441    if (! parts.empty() )
 442    {
 443       std::list< std::string >::const_iterator it= parts.begin();
 444       result = *it;
 445       while ( ++it != parts.end() )
 446       {
 447          result+= delimiter;
 448          result+= *it;
 449       }
 450    }
 451    return result;
 452 } // eo join_string(const std::list< std::string >&,const std::string&)
 453
 454
 455 /** @brief same as join_string for list, except uses a vector */
 456 std::string join_string(
 457    const std::vector< std::string >& parts,
 458    const std::string& delimiter
 459 )
 460 {
 461    std::string result;
 462    if (! parts.empty() )
 463    {
 464       std::vector< std::string >::const_iterator it= parts.begin();
 465       result = *it;
 466       while ( ++it != parts.end() )
 467       {
 468          result+= delimiter;
 469          result+= *it;
 470       }
 471    }
 472    return result;
 473 } // eo join_string(const std::vector< std::string >&,const std::string&)
 474
 475
 476
 477 /*
 478 ** conversions
 479 */
 480
 481
 482 /**
 483  * @brief returns a hex string from a binary string.
 484  * @param str the (binary) string
 485  * @param upper_case_digits determine whether to use upper case characters for digits A-F.
 486  * @return the string in hex notation.
 487  */
 488 std::string convert_binary_to_hex(
 489    const std::string& str,
 490    bool upper_case_digits
 491 )
 492 {
 493    std::string result;
 494    std::string hexDigits(upper_case_digits ? hexDigitsUpper : hexDigitsLower);
 495    for ( std::string::const_iterator it= str.begin();
 496          it != str.end();
 497          ++it)
 498    {
 499       result.push_back( hexDigits[ ( (*it) >> 4) & 0x0f ] );
 500       result.push_back( hexDigits[ (*it) & 0x0f ] );
 501    }
 502    return result;
 503 } // eo convert_binary_to_hex(const std::string&,bool)
 504
 505
 506 /**
 507  * @brief converts a hex digit string to binary string.
 508  * @param str hex digit string
 509  * @return the binary string.
 510  *
 511  * The hex digit string may contains white spaces or colons which are treated
 512  * as delimiters between hex digit groups.
 513  *
 514  * @todo rework the handling of half nibbles (consistency)!
 515  */
 516 std::string convert_hex_to_binary(
 517    const std::string& str
 518 )
 519 throw (std::runtime_error)
 520 {
 521    std::string result;
 522    char c= 0;
 523    bool hasNibble= false;
 524    bool lastWasWS= true;
 525    for ( std::string::const_iterator it= str.begin();
 526          it != str.end();
 527          ++it)
 528    {
 529       std::string::size_type p = hexDigitsLower.find( *it );
 530       if (p== std::string::npos)
 531       {
 532          p= hexDigitsUpper.find( *it );
 533       }
 534       if (p == std::string::npos)
 535       {
 536          if (   ( Whitespaces.find( *it ) != std::string::npos) // is it a whitespace?
 537                 or ( *it == ':') // or a colon?
 538             )
 539          {
 540             // we treat that as a valid delimiter:
 541             if (hasNibble)
 542             {
 543                // 1 nibble before WS is treate as lower part:
 544                result.push_back(c);
 545                // reset state:
 546                hasNibble= false;
 547             }
 548             lastWasWS= true;
 549             continue;
 550          }
 551       }
 552       if (p == std::string::npos )
 553       {
 554          throw runtime_error("illegal character in hex digit string: " + str);
 555       }
 556       lastWasWS= false;
 557       if (hasNibble)
 558       {
 559          c<<=4;
 560       }
 561       else
 562       {
 563          c=0;
 564       }
 565       c+= (p & 0x0f);
 566       if (hasNibble)
 567       {
 568          //we already had a nibble, so a char is complete now:
 569          result.push_back( c );
 570          hasNibble=false;
 571       }
 572       else
 573       {
 574          // this is the first nibble of a new char:
 575          hasNibble=true;
 576       }
 577    }
 578    if (hasNibble)
 579    {
 580       //well, there is one nibble left
 581       // let's do some heuristics:
 582       if (lastWasWS)
 583       {
 584          // if the preceeding character was a white space (or a colon)
 585          // we treat the nibble as lower part:
 586          //( this is consistent with shortened hex notations where leading zeros are not noted)
 587          result.push_back( c );
 588       }
 589       else
 590       {
 591          // if it was part of a hex digit chain, we treat it as UPPER part (!!)
 592          result.push_back( c << 4 );
 593       }
 594    }
 595    return result;
 596 } // eo convert_hex_to_binary(const std::string&)
 597
 598
 599 } // eo namespace I2n
 600
 601
 602
 603
 604 std::string iso_to_utf8(const std::string& isostring)
 605 {
 606    string result;
 607
 608    iconv_t i2utf8 = iconv_open("UTF-8", "ISO-8859-1");
 609
 610    if (iso_to_utf8 == (iconv_t)-1)
 611       throw runtime_error("iconv can't convert from ISO-8859-1 to UTF-8");
 612
 613    size_t in_size=isostring.size();
 614    size_t out_size=in_size*4;
 615
 616    char *buf = (char *)malloc(out_size+1);
 617    if (buf == NULL)
 618       throw runtime_error("out of memory for iconv buffer");
 619
 620    char *in = (char *)isostring.c_str();
 621    char *out = buf;
 622    iconv(i2utf8, &in, &in_size, &out, &out_size);
 623
 624    buf[isostring.size()*4-out_size]=0;
 625
 626    result=buf;
 627
 628    free(buf);
 629    iconv_close(i2utf8);
 630
 631    return result;
 632 }
 633
 634 std::string utf8_to_iso(const std::string& utf8string)
 635 {
 636    string result;
 637
 638    iconv_t utf82iso = iconv_open("ISO-8859-1","UTF-8");
 639
 640    if (utf82iso == (iconv_t)-1)
 641       throw runtime_error("iconv can't convert from UTF-8 to ISO-8859-1");
 642
 643    size_t in_size=utf8string.size();
 644    size_t out_size=in_size;
 645
 646    char *buf = (char *)malloc(out_size+1);
 647    if (buf == NULL)
 648       throw runtime_error("out of memory for iconv buffer");
 649
 650    char *in = (char *)utf8string.c_str();
 651    char *out = buf;
 652    iconv(utf82iso, &in, &in_size, &out, &out_size);
 653
 654    buf[utf8string.size()-out_size]=0;
 655
 656    result=buf;
 657
 658    free(buf);
 659    iconv_close(utf82iso);
 660
 661    return result;
 662 }
 663
 664 wchar_t* utf8_to_wbuf(const std::string& utf8string)
 665 {
 666    iconv_t utf82wstr = iconv_open("UCS-4LE","UTF-8");
 667
 668    if (utf82wstr == (iconv_t)-1)
 669       throw runtime_error("iconv can't convert from UTF-8 to UCS-4");
 670
 671    size_t in_size=utf8string.size();
 672    size_t out_size= (in_size+1)*sizeof(wchar_t);
 673
 674    wchar_t *buf = (wchar_t *)malloc(out_size);
 675    if (buf == NULL)
 676       throw runtime_error("out of memory for iconv buffer");
 677
 678    char *in = (char *)utf8string.c_str();
 679    char *out = (char*) buf;
 680    if (iconv(utf82wstr, &in, &in_size, &out, &out_size) == (size_t)-1)
 681       throw runtime_error("error converting char encodings");
 682
 683    buf[ ( (utf8string.size()+1)*sizeof(wchar_t)-out_size) /sizeof(wchar_t) ]=0;
 684
 685    iconv_close(utf82wstr);
 686
 687    return buf;
 688 }
 689
 690 std::string utf7imap_to_utf8(const std::string& utf7imapstring)
 691 {
 692    string result;
 693
 694    iconv_t utf7imap2utf8 = iconv_open("UTF-8","UTF-7-IMAP");
 695
 696    if (utf7imap2utf8 == (iconv_t)-1)
 697       throw runtime_error("iconv can't convert from UTF-7-IMAP to UTF-8");
 698
 699    size_t in_size=utf7imapstring.size();
 700    size_t out_size=in_size*4;
 701
 702    char *buf = (char *)malloc(out_size+1);
 703    if (buf == NULL)
 704       throw runtime_error("out of memory for iconv buffer");
 705
 706    char *in = (char *)utf7imapstring.c_str();
 707    char *out = buf;
 708    iconv(utf7imap2utf8, &in, &in_size, &out, &out_size);
 709
 710    buf[utf7imapstring.size()*4-out_size]=0;
 711
 712    result=buf;
 713
 714    free(buf);
 715    iconv_close(utf7imap2utf8);
 716
 717    return result;
 718 }
 719
 720 std::string utf8_to_utf7imap(const std::string& utf8string)
 721 {
 722    string result;
 723
 724    iconv_t utf82utf7imap = iconv_open("UTF-7-IMAP", "UTF-8");
 725
 726    if (utf82utf7imap == (iconv_t)-1)
 727       throw runtime_error("iconv can't convert from UTF-7-IMAP to UTF-8");
 728
 729    // UTF-7 is base64 encoded, a buffer 10x as large
 730    // as the utf-8 buffer should be enough. If not the string will be truncated.
 731    size_t in_size=utf8string.size();
 732    size_t out_size=in_size*10;
 733
 734    char *buf = (char *)malloc(out_size+1);
 735    if (buf == NULL)
 736       throw runtime_error("out of memory for iconv buffer");
 737
 738    char *in = (char *)utf8string.c_str();
 739    char *out = buf;
 740    iconv(utf82utf7imap, &in, &in_size, &out, &out_size);
 741
 742    buf[utf8string.size()*10-out_size]= 0;
 743
 744    result=buf;
 745
 746    free(buf);
 747    iconv_close(utf82utf7imap);
 748
 749    return result;
 750 }
 751
 752 // Tokenize string by (html) tags
 753 void tokenize_by_tag(vector<pair<string,bool> > &tokenized, const std::string &input)
 754 {
 755    string::size_type pos, len = input.size();
 756    bool inside_tag = false;
 757    string current;
 758
 759    for (pos = 0; pos < len; pos++)
 760    {
 761       if (input[pos] == '<')
 762       {
 763          inside_tag = true;
 764
 765          if (!current.empty() )
 766          {
 767             tokenized.push_back( make_pair(current, false) );
 768             current = "";
 769          }
 770
 771          current += input[pos];
 772       }
 773       else if (input[pos] == '>' && inside_tag)
 774       {
 775          current += input[pos];
 776          inside_tag = false;
 777          if (!current.empty() )
 778          {
 779             tokenized.push_back( make_pair(current, true) );
 780             current = "";
 781          }
 782       }
 783       else
 784          current += input[pos];
 785    }
 786
 787    // String left over in buffer?
 788    if (!current.empty() )
 789       tokenized.push_back( make_pair(current, false) );
 790 } // eo tokenize_by_tag
 791
 792
 793 std::string strip_html_tags(const std::string &input)
 794 {
 795    // Pair first: string, second: isTag
 796    vector<pair<string,bool> > tokenized;
 797    tokenize_by_tag (tokenized, input);
 798
 799    string output;
 800    vector<pair<string,bool> >::const_iterator token, tokens_end = tokenized.end();
 801    for (token = tokenized.begin(); token != tokens_end; ++token)
 802       if (!token->second)
 803          output += token->first;
 804
 805    return output;
 806 } // eo strip_html_tags
 807
 808
 809 // Smart-encode HTML en
 810 string smart_html_entities(const std::string &input)
 811 {
 812    // Pair first: string, second: isTag
 813    vector<pair<string,bool> > tokenized;
 814    tokenize_by_tag (tokenized, input);
 815
 816    string output;
 817    vector<pair<string,bool> >::const_iterator token, tokens_end = tokenized.end();
 818    for (token = tokenized.begin(); token != tokens_end; ++token)
 819    {
 820       // keep HTML tags as they are
 821       if (token->second)
 822          output += token->first;
 823       else
 824          output += html_entities(token->first);
 825    }
 826
 827    return output;
 828 }
 829
 830
 831 string::size_type find_8bit(const std::string &str)
 832 {
 833    string::size_type l=str.size();
 834    for (string::size_type p=0; p < l; p++)
 835       if (static_cast<unsigned char>(str[p]) > 127)
 836          return p;
 837
 838    return string::npos;
 839 }
 840
 841 // encoded UTF-8 chars into HTML entities
 842 string html_entities(std::string str)
 843 {
 844    // Normal chars
 845    replace_all (str, "&", "&amp;");
 846    replace_all (str, "<", "&lt;");
 847    replace_all (str, ">", "&gt;");
 848    replace_all (str, "\"", "&quot;");
 849    replace_all (str, "'", "&#x27;");
 850    replace_all (str, "/", "&#x2F;");
 851
 852    // Umlauts
 853    replace_all (str, "\xC3\xA4", "&auml;");
 854    replace_all (str, "\xC3\xB6", "&ouml;");
 855    replace_all (str, "\xC3\xBC", "&uuml;");
 856    replace_all (str, "\xC3\x84", "&Auml;");
 857    replace_all (str, "\xC3\x96", "&Ouml;");
 858    replace_all (str, "\xC3\x9C", "&Uuml;");
 859
 860    // Misc
 861    replace_all (str, "\xC3\x9F", "&szlig;");
 862
 863    // conversion of remaining non-ASCII chars needed?
 864    // just do if needed because of performance
 865    if (find_8bit(str) != string::npos)
 866    {
 867       // convert to fixed-size encoding UTF-32
 868       wchar_t* wbuf=utf8_to_wbuf(str);
 869       ostringstream target;
 870
 871       // replace all non-ASCII chars with HTML representation
 872       for (int p=0; wbuf[p] != 0; p++)
 873       {
 874          unsigned int c=wbuf[p];
 875
 876          if (c <= 127)
 877             target << static_cast<unsigned char>(c);
 878          else
 879             target << "&#" << c << ';';
 880       }
 881
 882       free(wbuf);
 883
 884       str=target.str();
 885    }
 886
 887    return str;
 888 } // eo html_entities(std::string)
 889
 890 // convert HTML entities to something that can be viewed on a basic text console (restricted to ASCII-7)
 891 string html_entities_to_console(std::string str)
 892 {
 893    // Normal chars
 894    replace_all (str, "&amp;", "&");
 895    replace_all (str, "&lt;", "<");
 896    replace_all (str, "&gt;", ">");
 897    replace_all (str, "&quot;", "\"");
 898    replace_all (str, "&#x27;", "'");
 899    replace_all (str, "&#x2F;", "/");
 900
 901    // Umlauts
 902    replace_all (str, "&auml;", "ae");
 903    replace_all (str, "&ouml;", "oe");
 904    replace_all (str, "&uuml;", "ue");
 905    replace_all (str, "&Auml;", "Ae");
 906    replace_all (str, "&Ouml;", "Oe");
 907    replace_all (str, "&Uuml;", "Ue");
 908
 909    // Misc
 910    replace_all (str, "&szlig;", "ss");
 911
 912    return str;
 913 }
 914
 915 bool replace_all(string &base, const char *ist, const char *soll)
 916 {
 917    string i=ist;
 918    string s=soll;
 919    return replace_all(base,&i,&s);
 920 }
 921
 922 bool replace_all(string &base, const string &ist, const char *soll)
 923 {
 924    string s=soll;
 925    return replace_all(base,&ist,&s);
 926 }
 927
 928 bool replace_all(string &base, const string *ist, const string *soll)
 929 {
 930    return replace_all(base,*ist,*soll);
 931 }
 932
 933 bool replace_all(string &base, const char *ist, const string *soll)
 934 {
 935    string i=ist;
 936    return replace_all(base,&i,soll);
 937 }
 938
 939 bool replace_all(string &base, const string &ist, const string &soll)
 940 {
 941    bool found_ist = false;
 942    string::size_type a=0;
 943
 944    if (ist.empty() )
 945       throw runtime_error ("replace_all called with empty search string");
 946
 947    while ( (a=base.find(ist,a) ) != string::npos)
 948    {
 949       base.replace(a,ist.size(),soll);
 950       a=a+soll.size();
 951       found_ist = true;
 952    }
 953
 954    return found_ist;
 955 }
 956
 957 /**
 958  * @brief replaces all characters that could be problematic or impose a security risk when being logged
 959  * @param str the original string
 960  * @param replace_with the character to replace the unsafe chars with
 961  * @return a string that is safe to send to syslog or other logfiles
 962  *
 963  * All chars between 0x20 (space) and 0x7E (~) (including) are considered safe for logging.
 964  * See e.g. RFC 5424, section 8.2 or the posix character class "printable".
 965  * This eliminates all possible problems with NUL, control characters, 8 bit chars, UTF8.
 966  *
 967  */
 968 std::string sanitize_for_logging(const std::string &str, const char replace_with)
 969 {
 970     std::string output=str;
 971
 972     const string::size_type len = output.size();
 973     for (std::string::size_type p=0; p < len; p++)
 974         if (output[p] < 0x20 || output[p] > 0x7E)
 975             output[p]=replace_with;
 976
 977     return output;
 978 }
 979
// Disabled code (compiled out by "#if 0"): global to_lower() / to_upper()
// implementations working on a copy of the string with a plain loop.
// Functions with the same names are defined in namespace I2n in this file.
#if 0
string to_lower(const string &src)
{
   string dst = src;

   string::size_type pos, end = dst.size();
   for (pos = 0; pos < end; pos++)
      dst[pos] = tolower(dst[pos]);

   return dst;
}

string to_upper(const string &src)
{
   string dst = src;

   string::size_type pos, end = dst.size();
   for (pos = 0; pos < end; pos++)
      dst[pos] = toupper(dst[pos]);

   return dst;
}
#endif
1003
// Number of entries in each of the unit symbol tables below.
const int MAX_UNIT_FORMAT_SYMBOLS = 6;

// Short unit symbols (with leading space); nice_unit_format() indexes this
// table by the number of times the value was divided by the unit base.
const string shortUnitFormatSymbols[MAX_UNIT_FORMAT_SYMBOLS] = {
        " B",
        " KB",
        " MB",
        " GB",
        " TB",
        " PB"
};

// Long unit symbols (with leading space), indexed like the short ones.
// The strings are wrapped in i18n_noop() here and passed through i18n()
// where they are used (presumably marking them for translation only --
// confirm against i18n.h).
const string longUnitFormatSymbols[MAX_UNIT_FORMAT_SYMBOLS] = {
        i18n_noop(" Bytes"),
        i18n_noop(" KBytes"),
        i18n_noop(" MBytes"),
        i18n_noop(" GBytes"),
        i18n_noop(" TBytes"),
        i18n_noop(" PBytes")
};
1023
1024
/**
 * @brief rounds a number "half up" to a multiple of 1/rounding_multiplier.
 *
 * The number is scaled by @a rounding_multiplier, 0.5 is added and the
 * result is rounded down to the next integer with floor(). Truncating
 * towards zero instead would give wrong results for negative numbers
 * (e.g. -5 with multiplier 10 would become -4.9).
 *
 * @param number the number to round.
 * @param rounding_multiplier the scale, e.g. 10 for one decimal place;
 *        must not be 0.
 * @return the rounded number.
 */
long double rounding_upwards(
        const long double number,
        const int rounding_multiplier
)
{
    const long double scale = rounding_multiplier;
    const long double scaled = std::floor(number * scale + 0.5L);

    return scaled / scale;
}
1038
1039
1040 string nice_unit_format(
1041         const int64_t input,
1042         const UnitFormat format,
1043         const UnitBase base
1044 )
1045 {
1046    // select the system of units (decimal or binary)
1047    int multiple = 0;
1048    if (base == UnitBase1000)
1049    {
1050        multiple = 1000;
1051    }
1052    else
1053    {
1054        multiple = 1024;
1055    }
1056
1057    long double size = input;
1058
1059    // check the size of the input number to fit in the appropriate symbol
1060    int sizecount = 0;
1061    while (size > multiple)
1062    {
1063        size = size / multiple;
1064        sizecount++;
1065
1066        // rollback to the previous values and stop the loop when cannot
1067        // represent the number length.
1068        if (sizecount >= MAX_UNIT_FORMAT_SYMBOLS)
1069        {
1070            size = size * multiple;
1071            sizecount--;
1072            break;
1073        }
1074    }
1075
1076    // round the input number "half up" to multiples of 10
1077    const int rounding_multiplier = 10;
1078    size = rounding_upwards(size, rounding_multiplier);
1079
1080    // format the input number, placing the appropriate symbol
1081    ostringstream out;
1082    out.setf (ios::fixed);
1083    if (format == ShortUnitFormat)
1084    {
1085        out.precision(1);
1086        out << size << i18n( shortUnitFormatSymbols[sizecount].c_str() );
1087    }
1088    else
1089    {
1090        out.precision (2);
1091        out << size << i18n( longUnitFormatSymbols[sizecount].c_str() );
1092    }
1093
1094    return out.str();
1095 } // eo nice_unit_format(int input)
1096
1097
1098 string nice_unit_format(
1099         const double input,
1100         const UnitFormat format,
1101         const UnitBase base
1102 )
1103 {
1104     // round as double and cast to int64_t
1105     // cast raised overflow error near max val of int64_t (~9.2e18, see unittest)
1106     int64_t input_casted_and_rounded =
1107         boost::numeric_cast<int64_t>( round(input) );
1108
1109     // now call other
1110     return nice_unit_format( input_casted_and_rounded, format, base );
1111 } // eo nice_unit_format(double input)
1112
1113
/**
 * @brief escapes a string and encloses it in double quotes.
 * @param s the string to escape.
 * @return the quoted string: '"' and '\' are prefixed with a backslash,
 *         carriage return becomes the two chars "\r" and line feed the two
 *         chars "\n". All other chars are copied unchanged.
 *
 * The result is built in a single pass over the input instead of repeatedly
 * inserting into the middle of the string.
 */
std::string escape(const std::string &s)
{
   std::string out;
   out.reserve (s.size() + 2); // at least the input plus both quotes

   out += '"';

   for (std::string::size_type pos = 0; pos < s.size(); ++pos)
   {
      const char c = s[pos];
      switch (c)
      {
         case '"':
         case '\\':
            out += '\\';
            out += c;
            break;
         case '\r':
            out += "\\r";
            break;
         case '\n':
            out += "\\n";
            break;
         default:
            out += c;
      }
   }

   out += '"';

   return out;
} // eo escape(const std::string&)
1144
1145
/**
 * @brief extracts and unescapes a double quoted string.
 * @param s the text containing the quoted string.
 * @param startpos index of the opening quote within @a s.
 * @param[out] endpos set to the index of the closing quote within @a s.
 * @return the unescaped content between the quotes.
 * @throw std::out_of_range if @a startpos is outside of @a s, if the char at
 *        @a startpos is not a double quote, or if the extracted content ends
 *        with a single backslash.
 *
 * If no closing quote exists the whole remainder of @a s is taken as content
 * and the unsigned arithmetic for @a endpos wraps around, which leaves
 * @a endpos equal to @a startpos.
 */
std::string descape(const std::string &s, int startpos, int &endpos)
{
   // at() itself throws when startpos is not a valid index
   if (s.at(startpos) != '"')
      throw std::out_of_range("value not type escaped string");

   const std::string remainder = s.substr(startpos+1);

   // find the first quote that is not escaped by a backslash
   std::string::size_type quote_pos = 0;
   for (;;)
   {
      quote_pos = remainder.find("\"", quote_pos);
      if (quote_pos == std::string::npos)
         break;

      // count the backslashes directly in front of the quote
      std::string::size_type backslashes = 0;
      while (backslashes < quote_pos
             && remainder[quote_pos - 1 - backslashes] == '\\')
         ++backslashes;

      // an even number of backslashes escape each other, not the quote
      if (backslashes % 2 == 0)
         break;

      ++quote_pos;
   }

   // quote_pos is npos here if the string is not terminated
   const std::string raw = remainder.substr(0, quote_pos);

   // tell the caller where the string ended
   endpos = startpos + quote_pos + 1;

   // resolve the backslash sequences
   std::string out;
   for (std::string::size_type pos = 0; pos < raw.size(); ++pos)
   {
      if (raw[pos] != '\\')
      {
         out += raw[pos];
         continue;
      }

      // at() throws if the backslash is the very last char
      const char escaped = raw.at(pos+1);
      if (escaped == 'r')
         out += '\r';
      else if (escaped == 'n')
         out += '\n';
      else
         out += escaped;

      // the escaped char has been consumed as well
      ++pos;
   }

   return out;
} // eo descape(const std::string&,int,int&)
1205
1206
/**
 * @brief wraps a string in single quotes for use as a shell argument.
 * @param input the raw argument.
 * @return @a input enclosed in single quotes; every single quote inside is
 *         replaced by the sequence '\'' (close quote, escaped quote, reopen).
 */
std::string escape_shellarg(const std::string &input)
{
   std::string output("'");

   for (std::string::size_type pos = 0; pos < input.size(); ++pos)
   {
      const char c = input[pos];
      if (c == '\'')
         output.append("'\\''");
      else
         output.push_back(c);
   }

   output.push_back('\'');
   return output;
}