developer.intra2net.com Git - libi2ncommon/blob - src/stringfunc.cpp

   1 /*
   2 The software in this package is distributed under the GNU General
   3 Public License version 2 (with a special exception described below).
   4
   5 A copy of GNU General Public License (GPL) is included in this distribution,
   6 in the file COPYING.GPL.
   7
   8 As a special exception, if other files instantiate templates or use macros
   9 or inline functions from this file, or you compile this file and link it
  10 with other works to produce a work based on this file, this file
  11 does not by itself cause the resulting work to be covered
  12 by the GNU General Public License.
  13
  14 However the source code for this file must still be made available
  15 in accordance with section (3) of the GNU General Public License.
  16
  17 This exception does not invalidate any other reasons why a work based
  18 on this file might be covered by the GNU General Public License.
  19 */
  20 /** @file
  21  *
  22  * (c) Copyright 2007-2008 by Intra2net AG
  23  */
  24
  25 #include <iostream>
  26 #include <string>
  27 #include <sstream>
  28 #include <stdexcept>
  29 #include <algorithm>
  30
  31 #include <wchar.h>
  32 #include <stdlib.h>
  33 #include <iconv.h>
  34 #include <i18n.h>
  35
  36 #include <stringfunc.hxx>
  37
  38 using namespace std;
  39
  40 namespace I2n
  41 {
  42
  43
  44 namespace
  45 {
  46
  47 const std::string hexDigitsLower("0123456789abcdef");
  48 const std::string hexDigitsUpper("0123456789ABCDEF");
  49
  50
  51 struct UpperFunc
  52 {
  53    char operator() (char c)
  54    {
  55       return std::toupper(c);
  56    }
  57 }; // eo struct UpperFunc
  58
  59
  60 struct LowerFunc
  61 {
  62    char operator() (char c)
  63    {
  64       return std::tolower(c);
  65    }
  66 }; // eo struct LowerFunc
  67
  68
  69 } // eo namespace <anonymous>
  70
  71
  72
  73 /**
  74  * default list of Whitespaces (" \t\r\n");
  75  */
  76 const std::string Whitespaces = " \t\r\n";
  77
  78 /**
  79  * default list of lineendings ("\r\n");
  80  */
  81 const std::string LineEndings= "\r\n";
  82
  83
  84
  85 /**
  86  * @brief checks if a string begins with a given prefix.
  87  * @param[in,out] str the string which is tested
  88  * @param prefix the prefix which should be tested for.
  89  * @return @a true iff the prefix is not empty and the string begins with that prefix.
  90  */
  91 bool has_prefix(const std::string& str, const std::string& prefix)
  92 {
  93    if (prefix.empty() || str.empty() || str.size() < prefix.size() )
  94    {
  95       return false;
  96    }
  97    return str.compare(0, prefix.size(), prefix) == 0;
  98 } // eo has_prefix(const std::string&,const std::string&)
  99
 100
 101 /**
 102  * @brief checks if a string ends with a given suffix.
 103  * @param[in,out] str the string which is tested
 104  * @param suffix the suffix which should be tested for.
 105  * @return @a true iff the suffix is not empty and the string ends with that suffix.
 106  */
 107 bool has_suffix(const std::string& str, const std::string& suffix)
 108 {
 109    if (suffix.empty() || str.empty() || str.size() < suffix.size() )
 110    {
 111       return false;
 112    }
 113    return str.compare(str.size() - suffix.size(), suffix.size(), suffix) == 0;
 114 } // eo has_suffix(const std::string&,const std::string&)
 115
 116
 117 /**
 118  * cut off characters from a given list from front and end of a string.
 119  * @param[in,out] str the string which should be trimmed.
 120  * @param charlist the list of characters to remove from beginning and end of string
 121  * @return the result string.
 122  */
 123 std::string trim_mod(std::string& str, const std::string& charlist)
 124 {
 125    // first: trim the beginning:
 126    std::string::size_type pos= str.find_first_not_of (charlist);
 127    if (pos == std::string::npos)
 128    {
 129       // whole string consists of charlist (or is already empty)
 130       str.clear();
 131       return str;
 132    }
 133    else if (pos>0)
 134    {
 135       // str starts with charlist
 136       str.erase(0,pos);
 137    }
 138    // now let's look at the tail:
 139    pos= str.find_last_not_of(charlist) +1;  // note: we already know there is at least one other char!
 140    if ( pos < str.size() )
 141    {
 142       str.erase(pos, str.size()-pos);
 143    }
 144    return str;
 145 } // eo trim_mod(std::string&,const std::string&)
 146
 147
 148
 149 /**
 150  * removes last character from a string when it is in a list of chars to be removed.
 151  * @param[in,out] str the string.
 152  * @param what the list of chars which will be tested for.
 153  * @return the resulting string with last char removed (if applicable)
 154  */
 155 std::string chomp_mod(std::string& str, const std::string& what)
 156 {
 157    if (str.empty() || what.empty() )
 158    {
 159       return str;
 160    }
 161    if (what.find(str.at (str.size()-1) ) != std::string::npos)
 162    {
 163       str.erase(str.size() - 1);
 164    }
 165    return str;
 166 } // eo chomp_mod(std::string&,const std::string&)
 167
 168
 169 /**
 170  * @brief converts a string to lower case.
 171  * @param[in,out] str the string to modify.
 172  * @return the string
 173  */
 174 std::string to_lower_mod(std::string& str)
 175 {
 176    std::transform(str.begin(), str.end(), str.begin(), LowerFunc() );
 177    return str;
 178 } // eo to_lower_mod(std::string&)
 179
 180
 181 /**
 182  * @brief converts a string to upper case.
 183  * @param[in,out] str the string to modify.
 184  * @return the string
 185  */
 186 std::string to_upper_mod(std::string& str)
 187 {
 188    std::transform( str.begin(), str.end(), str.begin(), UpperFunc() );
 189    return str;
 190 } // eo to_upper_mod(std::string&)
 191
 192
 193
 194 /**
 195  * cut off characters from a given list from front and end of a string.
 196  * @param str the string which should be trimmed.
 197  * @param charlist the list of characters to remove from beginning and end of string
 198  * @return the result string.
 199  */
 200 std::string trim (const std::string& str, const std::string& charlist)
 201 {
 202    // first: trim the beginning:
 203    std::string::size_type pos0= str.find_first_not_of(charlist);
 204    if (pos0 == std::string::npos)
 205    {
 206       // whole string consists of charlist (or is already empty)
 207       return std::string();
 208    }
 209    // now let's look at the end:
 210    std::string::size_type pos1= str.find_last_not_of(charlist);
 211    return str.substr(pos0, pos1 - pos0 + 1);
 212 } // eo trim(const std:.string&,const std::string&)
 213
 214
 215 /**
 216  * removes last character from a string when it is in a list of chars to be removed.
 217  * @param str the string.
 218  * @param what the list of chars which will be tested for.
 219  * @return the resulting string with last char removed (if applicable)
 220  */
 221 std::string chomp (const std::string& str, const std::string& what)
 222 {
 223    if (str.empty() || what.empty() )
 224    {
 225       return str;
 226    }
 227    if (what.find(str.at (str.size()-1) ) != std::string::npos)
 228    {
 229       return str.substr(0, str.size()-1);
 230    }
 231    return str;
 232 } // eo chomp(const std:.string&,const std::string&)
 233
 234
 235 /**
 236  * @brief returns a lower case version of a given string.
 237  * @param str the string
 238  * @return the lower case version of the string
 239  */
 240 std::string to_lower (const std::string& str)
 241 {
 242    std::string result(str);
 243    return to_lower_mod(result);
 244 } // eo to_lower(const std::string&)
 245
 246
 247 /**
 248  * @brief returns a upper case version of a given string.
 249  * @param str the string
 250  * @return the upper case version of the string
 251  */
 252 std::string to_upper(const std::string& str)
 253 {
 254    std::string result(str);
 255    return to_upper_mod(result);
 256 } // eo to_upper(const std::string&)
 257
 258
 259
 260 /**
 261  * @brief removes a given suffix from a string.
 262  * @param str the string.
 263  * @param suffix the suffix which should be removed if the string ends with it.
 264  * @return the string without the suffix.
 265  *
 266  * If the string ends with the suffix, it is removed. If the the string doesn't end
 267  * with the suffix the original string is returned.
 268  */
 269 std::string remove_suffix(const std::string& str, const std::string& suffix)
 270 {
 271    if (has_suffix(str,suffix) )
 272    {
 273       return str.substr(0, str.size()-suffix.size() );
 274    }
 275    return str;
 276 } // eo remove_suffix(const std::string&,const std::string&)
 277
 278
 279
 280 /**
 281  * @brief removes a given prefix from a string.
 282  * @param str the string.
 283  * @param prefix the prefix which should be removed if the string begins with it.
 284  * @return the string without the prefix.
 285  *
 286  * If the string begins with the prefix, it is removed. If the the string doesn't begin
 287  * with the prefix the original string is returned.
 288  */
 289 std::string remove_prefix(const std::string& str, const std::string& prefix)
 290 {
 291    if (has_prefix(str,prefix) )
 292    {
 293       return str.substr( prefix.size() );
 294    }
 295    return str;
 296 } // eo remove_prefix(const std::string&,const std::string&)
 297
 298
 299 /**
 300  * split a string to key and value delimited by a given delimiter.
 301  * The resulting key and value strings are trimmed (Whitespaces removed at beginning and end).
 302  * @param str the string which should be splitted.
 303  * @param[out] key the resulting key
 304  * @param[out] value the resulting value
 305  * @param delimiter the delimiter between key and value; default is '='.
 306  * @return @a true if the split was successful.
 307  */
 308 bool pair_split(
 309    const std::string& str,
 310    std::string& key,
 311    std::string& value,
 312    char delimiter)
 313 {
 314    std::string::size_type pos = str.find (delimiter);
 315    if (pos == std::string::npos) return false;
 316    key= str.substr(0,pos);
 317    value= str.substr(pos+1);
 318    trim_mod(key);
 319    trim_mod(value);
 320    return true;
 321 } // eo pair_split(const std::string&,std::string&,std::string&,char)
 322
 323
 324 /**
 325  * splits a string by given delimiter
 326  *
 327  * @param[in] str the string which should be splitted.
 328  * @param[out] result the list resulting from splitting  @a str.
 329  * @param[in] delimiter the delimiter (word/phrase) at which @a str should be splitted.
 330  * @param[in] omit_empty should empty parts not be stored?
 331  * @param[in] trim_list list of characters the parts should be trimmed by.
 332  *  (empty string results in no trim)
 333  */
 334 void split_string(
 335    const std::string& str,
 336    std::list<std::string>& result,
 337    const std::string& delimiter,
 338    bool omit_empty,
 339    const std::string& trim_list
 340 )
 341 {
 342    std::string::size_type pos, last_pos=0;
 343    bool delimiter_found= false;
 344    while ( last_pos < str.size()  && last_pos != std::string::npos)
 345    {
 346       pos= str.find(delimiter, last_pos);
 347       std::string part;
 348       if (pos == std::string::npos)
 349       {
 350          part= str.substr(last_pos);
 351          delimiter_found= false;
 352       }
 353       else
 354       {
 355          part= str.substr(last_pos, pos-last_pos);
 356          delimiter_found=true;
 357       }
 358       if (pos != std::string::npos)
 359       {
 360          last_pos= pos+ delimiter.size();
 361       }
 362       else
 363       {
 364          last_pos= std::string::npos;
 365       }
 366       if (!trim_list.empty() ) trim_mod (part, trim_list);
 367       if (omit_empty && part.empty() ) continue;
 368       result.push_back( part );
 369    }
 370    // if the string ends with a delimiter we need to append an empty string if no omit_empty
 371    // was given.
 372    // (this way we keep the split result consistent to a join operation)
 373    if (delimiter_found && !omit_empty)
 374    {
 375       result.push_back("");
 376    }
 377 } // eo split_string(const std::string&,std::list< std::string >&,const std::string&,bool,const std::string&)
 378
 379
 380 /**
 381  * splits a string by a given delimiter
 382  * @param str the string which should be splitted.
 383  * @param delimiter delimiter the delimiter (word/phrase) at which @a str should be splitted.
 384  * @param[in] omit_empty should empty parts not be stored?
 385  * @param[in] trim_list list of characters the parts should be trimmed by.
 386  *  (empty string results in no trim)
 387  * @return the list resulting from splitting @a str.
 388  */
 389 std::list<std::string> split_string(
 390    const std::string& str,
 391    const std::string& delimiter,
 392    bool omit_empty,
 393    const std::string& trim_list
 394 )
 395 {
 396    std::list<std::string> result;
 397    split_string(str, result, delimiter, omit_empty, trim_list);
 398    return result;
 399 } // eo split_string(const std::string&,const std::string&,bool,const std::string&)
 400
 401
 402 /**
 403  * @brief joins a list of strings into a single string.
 404  *
 405  * This funtion is (basically) the reverse operation of @a split_string.
 406  *
 407  * @param parts the list of strings.
 408  * @param delimiter the delimiter which is inserted between the strings.
 409  * @return the joined string.
 410  */
 411 std::string join_string(
 412    const std::list< std::string >& parts,
 413    const std::string& delimiter
 414 )
 415 {
 416    std::string result;
 417    if (! parts.empty() )
 418    {
 419       std::list< std::string >::const_iterator it= parts.begin();
 420       result = *it;
 421       while ( ++it != parts.end() )
 422       {
 423          result+= delimiter;
 424          result+= *it;
 425       }
 426    }
 427    return result;
 428 } // eo join_string(const std::list< std::string >&,const std::string&)
 429
 430
 431
 432 /*
 433 ** conversions
 434 */
 435
 436
 437 /**
 438  * @brief returns a hex string from a binary string.
 439  * @param str the (binary) string
 440  * @param upper_case_digits determine whether to use upper case characters for digits A-F.
 441  * @return the string in hex notation.
 442  */
 443 std::string convert_binary_to_hex(
 444    const std::string& str,
 445    bool upper_case_digits
 446 )
 447 {
 448    std::string result;
 449    std::string hexDigits(upper_case_digits ? hexDigitsUpper : hexDigitsLower);
 450    for ( std::string::const_iterator it= str.begin();
 451          it != str.end();
 452          ++it)
 453    {
 454       result.push_back( hexDigits[ ( (*it) >> 4) & 0x0f ] );
 455       result.push_back( hexDigits[ (*it) & 0x0f ] );
 456    }
 457    return result;
 458 } // eo convert_binary_to_hex(const std::string&,bool)
 459
 460
 461 /**
 462  * @brief converts a hex digit string to binary string.
 463  * @param str hex digit string
 464  * @return the binary string.
 465  *
 466  * The hex digit string may contains white spaces or colons which are treated
 467  * as delimiters between hex digit groups.
 468  *
 469  * @todo rework the handling of half nibbles (consistency)!
 470  */
 471 std::string convert_hex_to_binary(
 472    const std::string& str
 473 )
 474 throw (std::runtime_error)
 475 {
 476    std::string result;
 477    char c= 0;
 478    bool hasNibble= false;
 479    bool lastWasWS= true;
 480    for ( std::string::const_iterator it= str.begin();
 481          it != str.end();
 482          ++it)
 483    {
 484       std::string::size_type p = hexDigitsLower.find( *it );
 485       if (p== std::string::npos)
 486       {
 487          p= hexDigitsUpper.find( *it );
 488       }
 489       if (p == std::string::npos)
 490       {
 491          if (   ( Whitespaces.find( *it ) != std::string::npos) // is it a whitespace?
 492                 or ( *it == ':') // or a colon?
 493             )
 494          {
 495             // we treat that as a valid delimiter:
 496             if (hasNibble)
 497             {
 498                // 1 nibble before WS is treate as lower part:
 499                result.push_back(c);
 500                // reset state:
 501                hasNibble= false;
 502             }
 503             lastWasWS= true;
 504             continue;
 505          }
 506       }
 507       if (p == std::string::npos )
 508       {
 509          throw runtime_error("illegal character in hex digit string: " + str);
 510       }
 511       lastWasWS= false;
 512       if (hasNibble)
 513       {
 514          c<<=4;
 515       }
 516       else
 517       {
 518          c=0;
 519       }
 520       c+= (p & 0x0f);
 521       if (hasNibble)
 522       {
 523          //we already had a nibble, so a char is complete now:
 524          result.push_back( c );
 525          hasNibble=false;
 526       }
 527       else
 528       {
 529          // this is the first nibble of a new char:
 530          hasNibble=true;
 531       }
 532    }
 533    if (hasNibble)
 534    {
 535       //well, there is one nibble left
 536       // let's do some heuristics:
 537       if (lastWasWS)
 538       {
 539          // if the preceeding character was a white space (or a colon)
 540          // we treat the nibble as lower part:
 541          //( this is consistent with shortened hex notations where leading zeros are not noted)
 542          result.push_back( c );
 543       }
 544       else
 545       {
 546          // if it was part of a hex digit chain, we treat it as UPPER part (!!)
 547          result.push_back( c << 4 );
 548       }
 549    }
 550    return result;
 551 } // eo convert_hex_to_binary(const std::string&)
 552
 553
 554 } // eo namespace I2n
 555
 556
 557
 558
 559 std::string iso_to_utf8(const std::string& isostring)
 560 {
 561    string result;
 562
 563    iconv_t i2utf8 = iconv_open("UTF-8", "ISO-8859-1");
 564
 565    if (iso_to_utf8 == (iconv_t)-1)
 566       throw runtime_error("iconv can't convert from ISO-8859-1 to UTF-8");
 567
 568    size_t in_size=isostring.size();
 569    size_t out_size=in_size*4;
 570
 571    char *buf = (char *)malloc(out_size+1);
 572    if (buf == NULL)
 573       throw runtime_error("out of memory for iconv buffer");
 574
 575    char *in = (char *)isostring.c_str();
 576    char *out = buf;
 577    iconv(i2utf8, &in, &in_size, &out, &out_size);
 578
 579    buf[isostring.size()*4-out_size]=0;
 580
 581    result=buf;
 582
 583    free(buf);
 584    iconv_close(i2utf8);
 585
 586    return result;
 587 }
 588
 589 std::string utf8_to_iso(const std::string& utf8string)
 590 {
 591    string result;
 592
 593    iconv_t utf82iso = iconv_open("ISO-8859-1","UTF-8");
 594
 595    if (utf82iso == (iconv_t)-1)
 596       throw runtime_error("iconv can't convert from UTF-8 to ISO-8859-1");
 597
 598    size_t in_size=utf8string.size();
 599    size_t out_size=in_size;
 600
 601    char *buf = (char *)malloc(out_size+1);
 602    if (buf == NULL)
 603       throw runtime_error("out of memory for iconv buffer");
 604
 605    char *in = (char *)utf8string.c_str();
 606    char *out = buf;
 607    iconv(utf82iso, &in, &in_size, &out, &out_size);
 608
 609    buf[utf8string.size()-out_size]=0;
 610
 611    result=buf;
 612
 613    free(buf);
 614    iconv_close(utf82iso);
 615
 616    return result;
 617 }
 618
 619 wchar_t* utf8_to_wbuf(const std::string& utf8string)
 620 {
 621    iconv_t utf82wstr = iconv_open("UCS-4LE","UTF-8");
 622
 623    if (utf82wstr == (iconv_t)-1)
 624       throw runtime_error("iconv can't convert from UTF-8 to UCS-4");
 625
 626    size_t in_size=utf8string.size();
 627    size_t out_size= (in_size+1)*sizeof(wchar_t);
 628
 629    wchar_t *buf = (wchar_t *)malloc(out_size);
 630    if (buf == NULL)
 631       throw runtime_error("out of memory for iconv buffer");
 632
 633    char *in = (char *)utf8string.c_str();
 634    char *out = (char*) buf;
 635    if (iconv(utf82wstr, &in, &in_size, &out, &out_size) == (size_t)-1)
 636       throw runtime_error("error converting char encodings");
 637
 638    buf[ ( (utf8string.size()+1)*sizeof(wchar_t)-out_size) /sizeof(wchar_t) ]=0;
 639
 640    iconv_close(utf82wstr);
 641
 642    return buf;
 643 }
 644
 645 std::string utf7imap_to_utf8(const std::string& utf7imapstring)
 646 {
 647    string result;
 648
 649    iconv_t utf7imap2utf8 = iconv_open("UTF-8","UTF-7-IMAP");
 650
 651    if (utf7imap2utf8 == (iconv_t)-1)
 652       throw runtime_error("iconv can't convert from UTF-7-IMAP to UTF-8");
 653
 654    size_t in_size=utf7imapstring.size();
 655    size_t out_size=in_size*4;
 656
 657    char *buf = (char *)malloc(out_size+1);
 658    if (buf == NULL)
 659       throw runtime_error("out of memory for iconv buffer");
 660
 661    char *in = (char *)utf7imapstring.c_str();
 662    char *out = buf;
 663    iconv(utf7imap2utf8, &in, &in_size, &out, &out_size);
 664
 665    buf[utf7imapstring.size()*4-out_size]=0;
 666
 667    result=buf;
 668
 669    free(buf);
 670    iconv_close(utf7imap2utf8);
 671
 672    return result;
 673 }
 674
 675 std::string utf8_to_utf7imap(const std::string& utf8string)
 676 {
 677    string result;
 678
 679    iconv_t utf82utf7imap = iconv_open("UTF-7-IMAP", "UTF-8");
 680
 681    if (utf82utf7imap == (iconv_t)-1)
 682       throw runtime_error("iconv can't convert from UTF-7-IMAP to UTF-8");
 683
 684    // UTF-7 is base64 encoded, a buffer 10x as large
 685    // as the utf-8 buffer should be enough. If not the string will be truncated.
 686    size_t in_size=utf8string.size();
 687    size_t out_size=in_size*10;
 688
 689    char *buf = (char *)malloc(out_size+1);
 690    if (buf == NULL)
 691       throw runtime_error("out of memory for iconv buffer");
 692
 693    char *in = (char *)utf8string.c_str();
 694    char *out = buf;
 695    iconv(utf82utf7imap, &in, &in_size, &out, &out_size);
 696
 697    buf[utf8string.size()*10-out_size]= 0;
 698
 699    result=buf;
 700
 701    free(buf);
 702    iconv_close(utf82utf7imap);
 703
 704    return result;
 705 }
 706
 707 // Tokenize string by (html) tags
 708 void tokenize_by_tag(vector<pair<string,bool> > &tokenized, const std::string &input)
 709 {
 710    string::size_type pos, len = input.size();
 711    bool inside_tag = false;
 712    string current;
 713
 714    for (pos = 0; pos < len; pos++)
 715    {
 716       if (input[pos] == '<')
 717       {
 718          inside_tag = true;
 719
 720          if (!current.empty() )
 721          {
 722             tokenized.push_back( make_pair(current, false) );
 723             current = "";
 724          }
 725
 726          current += input[pos];
 727       }
 728       else if (input[pos] == '>' && inside_tag)
 729       {
 730          current += input[pos];
 731          inside_tag = false;
 732          if (!current.empty() )
 733          {
 734             tokenized.push_back( make_pair(current, true) );
 735             current = "";
 736          }
 737       }
 738       else
 739          current += input[pos];
 740    }
 741
 742    // String left over in buffer?
 743    if (!current.empty() )
 744       tokenized.push_back( make_pair(current, false) );
 745 } // eo tokenize_by_tag
 746
 747
 748 std::string strip_html_tags(const std::string &input)
 749 {
 750    // Pair first: string, second: isTag
 751    vector<pair<string,bool> > tokenized;
 752    tokenize_by_tag (tokenized, input);
 753
 754    string output;
 755    vector<pair<string,bool> >::const_iterator token, tokens_end = tokenized.end();
 756    for (token = tokenized.begin(); token != tokens_end; ++token)
 757       if (!token->second)
 758          output += token->first;
 759
 760    return output;
 761 } // eo strip_html_tags
 762
 763
 764 // Smart-encode HTML en
 765 string smart_html_entities(const std::string &input)
 766 {
 767    // Pair first: string, second: isTag
 768    vector<pair<string,bool> > tokenized;
 769    tokenize_by_tag (tokenized, input);
 770
 771    string output;
 772    vector<pair<string,bool> >::const_iterator token, tokens_end = tokenized.end();
 773    for (token = tokenized.begin(); token != tokens_end; ++token)
 774    {
 775       // keep HTML tags as they are
 776       if (token->second)
 777          output += token->first;
 778       else
 779          output += html_entities(token->first);
 780    }
 781
 782    return output;
 783 }
 784
 785
 786 string::size_type find_8bit(const std::string &str)
 787 {
 788    string::size_type l=str.size();
 789    for (string::size_type p=0; p < l; p++)
 790       if (static_cast<unsigned char>(str[p]) > 127)
 791          return p;
 792
 793    return string::npos;
 794 }
 795
 796 // encoded UTF-8 chars into HTML entities
 797 string html_entities(std::string str)
 798 {
 799    // Normal chars
 800    replace_all (str, "&", "&amp;");
 801    replace_all (str, "<", "&lt;");
 802    replace_all (str, ">", "&gt;");
 803    replace_all (str, "\"", "&quot;");
 804    replace_all (str, "'", "&#x27;");
 805    replace_all (str, "/", "&#x2F;");
 806
 807    // Umlauts
 808    replace_all (str, "\xC3\xA4", "&auml;");
 809    replace_all (str, "\xC3\xB6", "&ouml;");
 810    replace_all (str, "\xC3\xBC", "&uuml;");
 811    replace_all (str, "\xC3\x84", "&Auml;");
 812    replace_all (str, "\xC3\x96", "&Ouml;");
 813    replace_all (str, "\xC3\x9C", "&Uuml;");
 814
 815    // Misc
 816    replace_all (str, "\xC3\x9F", "&szlig;");
 817
 818    // conversion of remaining non-ASCII chars needed?
 819    // just do if needed because of performance
 820    if (find_8bit(str) != string::npos)
 821    {
 822       // convert to fixed-size encoding UTF-32
 823       wchar_t* wbuf=utf8_to_wbuf(str);
 824       ostringstream target;
 825
 826       // replace all non-ASCII chars with HTML representation
 827       for (int p=0; wbuf[p] != 0; p++)
 828       {
 829          unsigned int c=wbuf[p];
 830
 831          if (c <= 127)
 832             target << static_cast<unsigned char>(c);
 833          else
 834             target << "&#" << c << ';';
 835       }
 836
 837       free(wbuf);
 838
 839       str=target.str();
 840    }
 841
 842    return str;
 843 } // eo html_entities(std::string)
 844
 845 // convert HTML entities to something that can be viewed on a basic text console (restricted to ASCII-7)
 846 string html_entities_to_console(std::string str)
 847 {
 848    // Normal chars
 849    replace_all (str, "&amp;", "&");
 850    replace_all (str, "&lt;", "<");
 851    replace_all (str, "&gt;", ">");
 852    replace_all (str, "&quot;", "\"");
 853    replace_all (str, "&#x27;", "'");
 854    replace_all (str, "&#x2F;", "/");
 855
 856    // Umlauts
 857    replace_all (str, "&auml;", "ae");
 858    replace_all (str, "&ouml;", "oe");
 859    replace_all (str, "&uuml;", "ue");
 860    replace_all (str, "&Auml;", "Ae");
 861    replace_all (str, "&Ouml;", "Oe");
 862    replace_all (str, "&Uuml;", "Ue");
 863
 864    // Misc
 865    replace_all (str, "&szlig;", "ss");
 866
 867    return str;
 868 }
 869
 870 bool replace_all(string &base, const char *ist, const char *soll)
 871 {
 872    string i=ist;
 873    string s=soll;
 874    return replace_all(base,&i,&s);
 875 }
 876
 877 bool replace_all(string &base, const string &ist, const char *soll)
 878 {
 879    string s=soll;
 880    return replace_all(base,&ist,&s);
 881 }
 882
 883 bool replace_all(string &base, const string *ist, const string *soll)
 884 {
 885    return replace_all(base,*ist,*soll);
 886 }
 887
 888 bool replace_all(string &base, const char *ist, const string *soll)
 889 {
 890    string i=ist;
 891    return replace_all(base,&i,soll);
 892 }
 893
 894 bool replace_all(string &base, const string &ist, const string &soll)
 895 {
 896    bool found_ist = false;
 897    string::size_type a=0;
 898
 899    if (ist.empty() )
 900       throw runtime_error ("replace_all called with empty search string");
 901
 902    while ( (a=base.find(ist,a) ) != string::npos)
 903    {
 904       base.replace(a,ist.size(),soll);
 905       a=a+soll.size();
 906       found_ist = true;
 907    }
 908
 909    return found_ist;
 910 }
 911
 912 /**
 913  * @brief replaces all characters that could be problematic or impose a security risk when being logged
 914  * @param str the original string
 915  * @param replace_with the character to replace the unsafe chars with
 916  * @return a string that is safe to send to syslog or other logfiles
 917  *
 918  * All chars between 0x20 (space) and 0x7E (~) (including) are considered safe for logging.
 919  * See e.g. RFC 5424, section 8.2 or the posix character class "printable".
 920  * This eliminates all possible problems with NUL, control characters, 8 bit chars, UTF8.
 921  *
 922  */
 923 std::string sanitize_for_logging(const std::string &str, const char replace_with)
 924 {
 925     std::string output=str;
 926
 927     const string::size_type len = output.size();
 928     for (std::string::size_type p=0; p < len; p++)
 929         if (output[p] < 0x20 || output[p] > 0x7E)
 930             output[p]=replace_with;
 931
 932     return output;
 933 }
 934
 935 #if 0
 936 string to_lower(const string &src)
 937 {
 938    string dst = src;
 939
 940    string::size_type pos, end = dst.size();
 941    for (pos = 0; pos < end; pos++)
 942       dst[pos] = tolower(dst[pos]);
 943
 944    return dst;
 945 }
 946
 947 string to_upper(const string &src)
 948 {
 949    string dst = src;
 950
 951    string::size_type pos, end = dst.size();
 952    for (pos = 0; pos < end; pos++)
 953       dst[pos] = toupper(dst[pos]);
 954
 955    return dst;
 956 }
 957 #endif
 958
// Number of unit symbols per table; both tables below have exactly this many entries.
const int MAX_UNIT_FORMAT_SYMBOLS = 6;

// Short unit symbols. The index is the number of times the value was divided
// by the unit base (0: bytes, 1: kilo, 2: mega, ...). Each symbol carries the
// leading blank which separates it from the number.
const string shortUnitFormatSymbols[MAX_UNIT_FORMAT_SYMBOLS] = {
        " B",
        " KB",
        " MB",
        " GB",
        " TB",
        " PB"
};

// Long unit symbols, indexed like the short ones.
// NOTE(review): i18n_noop presumably only marks the literals for translation;
// the lookup happens via i18n() in nice_unit_format() - confirm against i18n.h.
const string longUnitFormatSymbols[MAX_UNIT_FORMAT_SYMBOLS] = {
        i18n_noop(" Bytes"),
        i18n_noop(" KBytes"),
        i18n_noop(" MBytes"),
        i18n_noop(" GBytes"),
        i18n_noop(" TBytes"),
        i18n_noop(" PBytes")
};
 978
 979
 980 long double rounding_upwards(
 981         long double number,
 982         const int rounding_multiplier
 983 )
 984 {
 985     long double rounded_number;
 986     rounded_number = number * rounding_multiplier;
 987     rounded_number += 0.5;
 988     rounded_number = (int64_t) (rounded_number);
 989     rounded_number = (long double) (rounded_number) / (long double) (rounding_multiplier);
 990
 991     return rounded_number;
 992 }
 993
 994
 995 string nice_unit_format(
 996         const int64_t input,
 997         const UnitFormat format,
 998         const UnitBase base
 999 )
1000 {
1001    // select the system of units (decimal or binary)
1002    int multiple = 0;
1003    if (base == UnitBase1000)
1004    {
1005        multiple = 1000;
1006    }
1007    else
1008    {
1009        multiple = 1024;
1010    }
1011
1012    long double size = input;
1013
1014    // check the size of the input number to fit in the appropriate symbol
1015    int sizecount = 0;
1016    while (size > multiple)
1017    {
1018        size = size / multiple;
1019        sizecount++;
1020
1021        // rollback to the previous values and stop the loop when cannot
1022        // represent the number length.
1023        if (sizecount >= MAX_UNIT_FORMAT_SYMBOLS)
1024        {
1025            size = size * multiple;
1026            sizecount--;
1027            break;
1028        }
1029    }
1030
1031    // round the input number "half up" to multiples of 10
1032    const int rounding_multiplier = 10;
1033    size = rounding_upwards(size, rounding_multiplier);
1034
1035    // format the input number, placing the appropriate symbol
1036    ostringstream out;
1037    out.setf (ios::fixed);
1038    if (format == ShortUnitFormat)
1039    {
1040        out.precision(1);
1041        out << size << i18n( shortUnitFormatSymbols[sizecount].c_str() );
1042    }
1043    else
1044    {
1045        out.precision (2);
1046        out << size << i18n( longUnitFormatSymbols[sizecount].c_str() );
1047    }
1048
1049    return out.str();
1050 } // eo nice_unit_format(int input)
1051
1052
/**
 * @brief escapes a string and encloses it in double quotes.
 * @param s the string to escape.
 * @return the escaped string: double quotes and backslashes are prefixed with
 *  a backslash, carriage return and line feed are written as "\r" and "\n"
 *  (two characters each), and the whole result is wrapped in double quotes.
 */
std::string escape(const std::string &s)
{
   std::string out;
   out.reserve(s.size() + 2);

   out += '"';
   for (std::string::const_iterator it = s.begin(); it != s.end(); ++it)
   {
      switch (*it)
      {
         case '"':
         case '\\':
            out += '\\';
            out += *it;
            break;
         case '\r':
            out += "\\r";
            break;
         case '\n':
            out += "\\n";
            break;
         default:
            out += *it;
      }
   }
   out += '"';

   return out;
} // eo escape(const std::string&)
1083
1084
/**
 * @brief extracts and unescapes a double quoted string (the reverse of escape()).
 * @param s the string which contains the quoted string.
 * @param startpos the position of the opening double quote within @a s.
 * @param[out] endpos the position of the closing double quote within @a s.
 * @return the unescaped content between the quotes.
 * @throw std::out_of_range if there is no double quote at @a startpos (or
 *  @a startpos is outside of @a s) or if the content ends with a single backslash.
 *
 * "\r" and "\n" sequences become carriage return and line feed; for any other
 * escaped character just the backslash is removed. If there is no closing
 * quote, everything behind the opening quote is taken as content.
 */
std::string descape(const std::string &s, int startpos, int &endpos)
{
   if (s.at(startpos) != '"')
      throw std::out_of_range("value not type escaped string");

   std::string out = s.substr(startpos+1);

   // search for the closing quote: the first quote which is not escaped
   std::string::size_type p = 0;
   while ( (p = out.find('"', p) ) != std::string::npos)
   {
      // count the backslashes directly in front of the quote; an odd number
      // means the quote itself is escaped.
      std::string::size_type backslashes = 0;
      while (backslashes < p && out[p - 1 - backslashes] == '\\')
         ++backslashes;

      if (backslashes % 2 == 0)
         break;

      ++p;
   }

   // cut off the closing quote and everything behind it
   out = out.substr(0,p);

   // tell calling prog about the endposition
   endpos = startpos+p+1;

   // resolve the escape sequences inside the string
   p = 0;
   while ( (p = out.find('\\', p) ) != std::string::npos)
   {
      // at() throws if the backslash is the last character
      const char code = out.at(p+1);
      if (code == 'r')
         out.replace(p,2,"\r");
      else if (code == 'n')
         out.replace(p,2,"\n");
      else
         out.erase(p,1);

      // continue behind the character which was just produced / kept
      ++p;
   }

   return out;
} // eo descape(const std::string&,int,int&)
1144
1145
/**
 * @brief wraps a string in single quotes for use as a shell argument.
 * @param input the string to escape.
 * @return the string enclosed in single quotes; each single quote inside of
 *  it is written as '\'' (close the quoting, escaped quote, reopen the quoting).
 */
std::string escape_shellarg(const std::string &input)
{
   std::string output(1, '\'');

   for (std::string::size_type i = 0; i < input.size(); ++i)
   {
      const char ch = input[i];
      if (ch == '\'')
         output.append("'\\'");   // the quote itself follows below

      output.push_back(ch);
   }

   output.push_back('\'');
   return output;
}