developer.intra2net.com Git - libi2ncommon/blob - src/stringfunc.cpp

   1 /** @file
   2  *
   3  * (c) Copyright 2007-2008 by Intra2net AG
   4  *
   5  * info@intra2net.com
   6  */
   7
#include <algorithm>
#include <cctype>
#include <iostream>
#include <sstream>
#include <stdexcept>
#include <string>

#include <wchar.h>
#include <stdlib.h>
#include <iconv.h>
#include <i18n.h>

#include <stringfunc.hxx>
  20
  21 using namespace std;
  22
  23 namespace I2n
  24 {
  25
  26
  27 namespace
  28 {
  29
  30 const std::string hexDigitsLower("0123456789abcdef");
  31 const std::string hexDigitsUpper("0123456789ABCDEF");
  32
  33
  34 struct UpperFunc
  35 {
  36    char operator() (char c)
  37    {
  38       return std::toupper(c);
  39    }
  40 }; // eo struct UpperFunc
  41
  42
  43 struct LowerFunc
  44 {
  45    char operator() (char c)
  46    {
  47       return std::tolower(c);
  48    }
  49 }; // eo struct LowerFunc
  50
  51
  52 } // eo namespace <anonymous>
  53
  54
  55
  56 /**
  57  * default list of Whitespaces (" \t\r\n");
  58  */
  59 const std::string Whitespaces = " \t\r\n";
  60
  61 /**
  62  * default list of lineendings ("\r\n");
  63  */
  64 const std::string LineEndings= "\r\n";
  65
  66
  67
  68 /**
  69  * @brief checks if a string begins with a given prefix.
  70  * @param[in,out] str the string which is tested
  71  * @param prefix the prefix which should be tested for.
  72  * @return @a true iff the prefix is not empty and the string begins with that prefix.
  73  */
  74 bool has_prefix(const std::string& str, const std::string& prefix)
  75 {
  76    if (prefix.empty() || str.empty() || str.size() < prefix.size() )
  77    {
  78       return false;
  79    }
  80    return str.compare(0, prefix.size(), prefix) == 0;
  81 } // eo has_prefix(const std::string&,const std::string&)
  82
  83
  84 /**
  85  * @brief checks if a string ends with a given suffix.
  86  * @param[in,out] str the string which is tested
  87  * @param suffix the suffix which should be tested for.
  88  * @return @a true iff the suffix is not empty and the string ends with that suffix.
  89  */
  90 bool has_suffix(const std::string& str, const std::string& suffix)
  91 {
  92    if (suffix.empty() || str.empty() || str.size() < suffix.size() )
  93    {
  94       return false;
  95    }
  96    return str.compare(str.size() - suffix.size(), suffix.size(), suffix) == 0;
  97 } // eo has_suffix(const std::string&,const std::string&)
  98
  99
 100 /**
 101  * cut off characters from a given list from front and end of a string.
 102  * @param[in,out] str the string which should be trimmed.
 103  * @param charlist the list of characters to remove from beginning and end of string
 104  * @return the result string.
 105  */
 106 std::string trim_mod(std::string& str, const std::string& charlist)
 107 {
 108    // first: trim the beginning:
 109    std::string::size_type pos= str.find_first_not_of (charlist);
 110    if (pos == std::string::npos)
 111    {
 112       // whole string consists of charlist (or is already empty)
 113       str.clear();
 114       return str;
 115    }
 116    else if (pos>0)
 117    {
 118       // str starts with charlist
 119       str.erase(0,pos);
 120    }
 121    // now let's look at the tail:
 122    pos= str.find_last_not_of(charlist) +1;  // note: we already know there is at least one other char!
 123    if ( pos < str.size() )
 124    {
 125       str.erase(pos, str.size()-pos);
 126    }
 127    return str;
 128 } // eo trim_mod(std::string&,const std::string&)
 129
 130
 131
 132 /**
 133  * removes last character from a string when it is in a list of chars to be removed.
 134  * @param[in,out] str the string.
 135  * @param what the list of chars which will be tested for.
 136  * @return the resulting string with last char removed (if applicable)
 137  */
 138 std::string chomp_mod(std::string& str, const std::string& what)
 139 {
 140    if (str.empty() || what.empty() )
 141    {
 142       return str;
 143    }
 144    if (what.find(str.at (str.size()-1) ) != std::string::npos)
 145    {
 146       str.erase(str.size() - 1);
 147    }
 148    return str;
 149 } // eo chomp_mod(std::string&,const std::string&)
 150
 151
 152 /**
 153  * @brief converts a string to lower case.
 154  * @param[in,out] str the string to modify.
 155  * @return the string
 156  */
 157 std::string to_lower_mod(std::string& str)
 158 {
 159    std::transform(str.begin(), str.end(), str.begin(), LowerFunc() );
 160    return str;
 161 } // eo to_lower_mod(std::string&)
 162
 163
 164 /**
 165  * @brief converts a string to upper case.
 166  * @param[in,out] str the string to modify.
 167  * @return the string
 168  */
 169 std::string to_upper_mod(std::string& str)
 170 {
 171    std::transform( str.begin(), str.end(), str.begin(), UpperFunc() );
 172    return str;
 173 } // eo to_upper_mod(std::string&)
 174
 175
 176
 177 /**
 178  * cut off characters from a given list from front and end of a string.
 179  * @param str the string which should be trimmed.
 180  * @param charlist the list of characters to remove from beginning and end of string
 181  * @return the result string.
 182  */
 183 std::string trim (const std::string& str, const std::string& charlist)
 184 {
 185    // first: trim the beginning:
 186    std::string::size_type pos0= str.find_first_not_of(charlist);
 187    if (pos0 == std::string::npos)
 188    {
 189       // whole string consists of charlist (or is already empty)
 190       return std::string();
 191    }
 192    // now let's look at the end:
 193    std::string::size_type pos1= str.find_last_not_of(charlist);
 194    return str.substr(pos0, pos1 - pos0 + 1);
 195 } // eo trim(const std:.string&,const std::string&)
 196
 197
 198 /**
 199  * removes last character from a string when it is in a list of chars to be removed.
 200  * @param str the string.
 201  * @param what the list of chars which will be tested for.
 202  * @return the resulting string with last char removed (if applicable)
 203  */
 204 std::string chomp (const std::string& str, const std::string& what)
 205 {
 206    if (str.empty() || what.empty() )
 207    {
 208       return str;
 209    }
 210    if (what.find(str.at (str.size()-1) ) != std::string::npos)
 211    {
 212       return str.substr(0, str.size()-1);
 213    }
 214    return str;
 215 } // eo chomp(const std:.string&,const std::string&)
 216
 217
 218 /**
 219  * @brief returns a lower case version of a given string.
 220  * @param str the string
 221  * @return the lower case version of the string
 222  */
 223 std::string to_lower (const std::string& str)
 224 {
 225    std::string result(str);
 226    return to_lower_mod(result);
 227 } // eo to_lower(const std::string&)
 228
 229
 230 /**
 231  * @brief returns a upper case version of a given string.
 232  * @param str the string
 233  * @return the upper case version of the string
 234  */
 235 std::string to_upper(const std::string& str)
 236 {
 237    std::string result(str);
 238    return to_upper_mod(result);
 239 } // eo to_upper(const std::string&)
 240
 241
 242
 243 /**
 244  * @brief removes a given suffix from a string.
 245  * @param str the string.
 246  * @param suffix the suffix which should be removed if the string ends with it.
 247  * @return the string without the suffix.
 248  *
 249  * If the string ends with the suffix, it is removed. If the the string doesn't end
 250  * with the suffix the original string is returned.
 251  */
 252 std::string remove_suffix(const std::string& str, const std::string& suffix)
 253 {
 254    if (has_suffix(str,suffix) )
 255    {
 256       return str.substr(0, str.size()-suffix.size() );
 257    }
 258    return str;
 259 } // eo remove_suffix(const std::string&,const std::string&)
 260
 261
 262
 263 /**
 264  * @brief removes a given prefix from a string.
 265  * @param str the string.
 266  * @param prefix the prefix which should be removed if the string begins with it.
 267  * @return the string without the prefix.
 268  *
 269  * If the string begins with the prefix, it is removed. If the the string doesn't begin
 270  * with the prefix the original string is returned.
 271  */
 272 std::string remove_prefix(const std::string& str, const std::string& prefix)
 273 {
 274    if (has_prefix(str,prefix) )
 275    {
 276       return str.substr( prefix.size() );
 277    }
 278    return str;
 279 } // eo remove_prefix(const std::string&,const std::string&)
 280
 281
 282 /**
 283  * split a string to key and value delimited by a given delimiter.
 284  * The resulting key and value strings are trimmed (Whitespaces removed at beginning and end).
 285  * @param str the string which should be splitted.
 286  * @param[out] key the resulting key
 287  * @param[out] value the resulting value
 288  * @param delimiter the delimiter between key and value; default is '='.
 289  * @return @a true if the split was successful.
 290  */
 291 bool pair_split(
 292    const std::string& str,
 293    std::string& key,
 294    std::string& value,
 295    char delimiter)
 296 {
 297    std::string::size_type pos = str.find (delimiter);
 298    if (pos == std::string::npos) return false;
 299    key= str.substr(0,pos);
 300    value= str.substr(pos+1);
 301    trim_mod(key);
 302    trim_mod(value);
 303    return true;
 304 } // eo pair_split(const std::string&,std::string&,std::string&,char)
 305
 306
 307 /**
 308  * splits a string by given delimiter
 309  *
 310  * @param[in] str the string which should be splitted.
 311  * @param[out] result the list resulting from splitting  @a str.
 312  * @param[in] delimiter the delimiter (word/phrase) at which @a str should be splitted.
 313  * @param[in] omit_empty should empty parts not be stored?
 314  * @param[in] trim_list list of characters the parts should be trimmed by.
 315  *  (empty string results in no trim)
 316  */
 317 void split_string(
 318    const std::string& str,
 319    std::list<std::string>& result,
 320    const std::string& delimiter,
 321    bool omit_empty,
 322    const std::string& trim_list
 323 )
 324 {
 325    std::string::size_type pos, last_pos=0;
 326    bool delimiter_found= false;
 327    while ( last_pos < str.size()  && last_pos != std::string::npos)
 328    {
 329       pos= str.find(delimiter, last_pos);
 330       std::string part;
 331       if (pos == std::string::npos)
 332       {
 333          part= str.substr(last_pos);
 334          delimiter_found= false;
 335       }
 336       else
 337       {
 338          part= str.substr(last_pos, pos-last_pos);
 339          delimiter_found=true;
 340       }
 341       if (pos != std::string::npos)
 342       {
 343          last_pos= pos+ delimiter.size();
 344       }
 345       else
 346       {
 347          last_pos= std::string::npos;
 348       }
 349       if (!trim_list.empty() ) trim_mod (part, trim_list);
 350       if (omit_empty && part.empty() ) continue;
 351       result.push_back( part );
 352    }
 353    // if the string ends with a delimiter we need to append an empty string if no omit_empty
 354    // was given.
 355    // (this way we keep the split result consistent to a join operation)
 356    if (delimiter_found && !omit_empty)
 357    {
 358       result.push_back("");
 359    }
 360 } // eo split_string(const std::string&,std::list< std::string >&,const std::string&,bool,const std::string&)
 361
 362
 363 /**
 364  * splits a string by a given delimiter
 365  * @param str the string which should be splitted.
 366  * @param delimiter delimiter the delimiter (word/phrase) at which @a str should be splitted.
 367  * @param[in] omit_empty should empty parts not be stored?
 368  * @param[in] trim_list list of characters the parts should be trimmed by.
 369  *  (empty string results in no trim)
 370  * @return the list resulting from splitting @a str.
 371  */
 372 std::list<std::string> split_string(
 373    const std::string& str,
 374    const std::string& delimiter,
 375    bool omit_empty,
 376    const std::string& trim_list
 377 )
 378 {
 379    std::list<std::string> result;
 380    split_string(str, result, delimiter, omit_empty, trim_list);
 381    return result;
 382 } // eo split_string(const std::string&,const std::string&,bool,const std::string&)
 383
 384
 385 /**
 386  * @brief joins a list of strings into a single string.
 387  *
 388  * This funtion is (basically) the reverse operation of @a split_string.
 389  *
 390  * @param parts the list of strings.
 391  * @param delimiter the delimiter which is inserted between the strings.
 392  * @return the joined string.
 393  */
 394 std::string join_string(
 395    const std::list< std::string >& parts,
 396    const std::string& delimiter
 397 )
 398 {
 399    std::string result;
 400    if (! parts.empty() )
 401    {
 402       std::list< std::string >::const_iterator it= parts.begin();
 403       result = *it;
 404       while ( ++it != parts.end() )
 405       {
 406          result+= delimiter;
 407          result+= *it;
 408       }
 409    }
 410    return result;
 411 } // eo join_string(const std::list< std::string >&,const std::string&)
 412
 413
 414
 415 /*
 416 ** conversions
 417 */
 418
 419
 420 /**
 421  * @brief returns a hex string from a binary string.
 422  * @param str the (binary) string
 423  * @param upper_case_digits determine whether to use upper case characters for digits A-F.
 424  * @return the string in hex notation.
 425  */
 426 std::string convert_binary_to_hex(
 427    const std::string& str,
 428    bool upper_case_digits
 429 )
 430 {
 431    std::string result;
 432    std::string hexDigits(upper_case_digits ? hexDigitsUpper : hexDigitsLower);
 433    for ( std::string::const_iterator it= str.begin();
 434          it != str.end();
 435          ++it)
 436    {
 437       result.push_back( hexDigits[ ( (*it) >> 4) & 0x0f ] );
 438       result.push_back( hexDigits[ (*it) & 0x0f ] );
 439    }
 440    return result;
 441 } // eo convert_binary_to_hex(const std::string&,bool)
 442
 443
/**
 * @brief converts a hex digit string to binary string.
 * @param str hex digit string
 * @return the binary string.
 * @throw std::runtime_error if @a str contains a character which is neither a hex digit
 *  (lower or upper case), a character from @a Whitespaces, nor a colon.
 *
 * The hex digit string may contain white spaces or colons which are treated
 * as delimiters between hex digit groups.
 *
 * Two consecutive hex digits form one byte (first digit is the high nibble).
 * A single digit which is directly followed by a delimiter is stored as the low
 * nibble of a byte. A single digit at the very end of the string is stored as
 * the high nibble of a byte.
 *
 * @todo rework the handling of half nibbles (consistency)!
 */
std::string convert_hex_to_binary(
   const std::string& str
)
throw (std::runtime_error)
{
   std::string result;
   char c= 0;               // the byte currently being assembled
   bool hasNibble= false;   // true while c holds exactly one (pending) nibble
   bool lastWasWS= true;    // true if the previous character was a delimiter (or at start)
   for ( std::string::const_iterator it= str.begin();
         it != str.end();
         ++it)
   {
      // the position within the digit table is the value of the digit:
      std::string::size_type p = hexDigitsLower.find( *it );
      if (p== std::string::npos)
      {
         p= hexDigitsUpper.find( *it );
      }
      if (p == std::string::npos)
      {
         if (   ( Whitespaces.find( *it ) != std::string::npos) // is it a whitespace?
                or ( *it == ':') // or a colon?
            )
         {
            // we treat that as a valid delimiter:
            if (hasNibble)
            {
               // a single nibble before a delimiter is treated as lower part:
               result.push_back(c);
               // reset state:
               hasNibble= false;
            }
            lastWasWS= true;
            continue;
         }
      }
      if (p == std::string::npos )
      {
         // neither a hex digit nor a delimiter
         throw runtime_error("illegal character in hex digit string: " + str);
      }
      lastWasWS= false;
      if (hasNibble)
      {
         // make room for the second nibble:
         c<<=4;
      }
      else
      {
         c=0;
      }
      c+= (p & 0x0f);
      if (hasNibble)
      {
         // we already had a nibble, so a char is complete now:
         result.push_back( c );
         hasNibble=false;
      }
      else
      {
         // this is the first nibble of a new char:
         hasNibble=true;
      }
   }
   if (hasNibble)
   {
      // well, there is one nibble left
      // let's do some heuristics:
      // NOTE(review): hasNibble is only set while processing a hex digit, which also
      // clears lastWasWS, and every delimiter resets hasNibble; so lastWasWS is
      // always false here and the first branch below is never taken.
      if (lastWasWS)
      {
         // if the preceding character was a white space (or a colon)
         // we treat the nibble as lower part:
         // (this is consistent with shortened hex notations where leading zeros are not noted)
         result.push_back( c );
      }
      else
      {
         // if it was part of a hex digit chain, we treat it as UPPER part (!!)
         result.push_back( c << 4 );
      }
   }
   return result;
} // eo convert_hex_to_binary(const std::string&)
 535
 536
 537 } // eo namespace I2n
 538
 539
 540
 541
 542 std::string iso_to_utf8(const std::string& isostring)
 543 {
 544    string result;
 545
 546    iconv_t i2utf8 = iconv_open("UTF-8", "ISO-8859-1");
 547
 548    if (iso_to_utf8 == (iconv_t)-1)
 549       throw runtime_error("iconv can't convert from ISO-8859-1 to UTF-8");
 550
 551    size_t in_size=isostring.size();
 552    size_t out_size=in_size*4;
 553
 554    char *buf = (char *)malloc(out_size+1);
 555    if (buf == NULL)
 556       throw runtime_error("out of memory for iconv buffer");
 557
 558    char *in = (char *)isostring.c_str();
 559    char *out = buf;
 560    iconv(i2utf8, &in, &in_size, &out, &out_size);
 561
 562    buf[isostring.size()*4-out_size]=0;
 563
 564    result=buf;
 565
 566    free(buf);
 567    iconv_close(i2utf8);
 568
 569    return result;
 570 }
 571
 572 std::string utf8_to_iso(const std::string& utf8string)
 573 {
 574    string result;
 575
 576    iconv_t utf82iso = iconv_open("ISO-8859-1","UTF-8");
 577
 578    if (utf82iso == (iconv_t)-1)
 579       throw runtime_error("iconv can't convert from UTF-8 to ISO-8859-1");
 580
 581    size_t in_size=utf8string.size();
 582    size_t out_size=in_size;
 583
 584    char *buf = (char *)malloc(out_size+1);
 585    if (buf == NULL)
 586       throw runtime_error("out of memory for iconv buffer");
 587
 588    char *in = (char *)utf8string.c_str();
 589    char *out = buf;
 590    iconv(utf82iso, &in, &in_size, &out, &out_size);
 591
 592    buf[utf8string.size()-out_size]=0;
 593
 594    result=buf;
 595
 596    free(buf);
 597    iconv_close(utf82iso);
 598
 599    return result;
 600 }
 601
 602 wchar_t* utf8_to_wbuf(const std::string& utf8string)
 603 {
 604    iconv_t utf82wstr = iconv_open("UCS-4LE","UTF-8");
 605
 606    if (utf82wstr == (iconv_t)-1)
 607       throw runtime_error("iconv can't convert from UTF-8 to UCS-4");
 608
 609    size_t in_size=utf8string.size();
 610    size_t out_size= (in_size+1)*sizeof(wchar_t);
 611
 612    wchar_t *buf = (wchar_t *)malloc(out_size);
 613    if (buf == NULL)
 614       throw runtime_error("out of memory for iconv buffer");
 615
 616    char *in = (char *)utf8string.c_str();
 617    char *out = (char*) buf;
 618    if (iconv(utf82wstr, &in, &in_size, &out, &out_size) == -1)
 619       throw runtime_error("error converting char encodings");
 620
 621    buf[ ( (utf8string.size()+1)*sizeof(wchar_t)-out_size) /sizeof(wchar_t) ]=0;
 622
 623    iconv_close(utf82wstr);
 624
 625    return buf;
 626 }
 627
 628 std::string utf7imap_to_utf8(const std::string& utf7imapstring)
 629 {
 630    string result;
 631
 632    iconv_t utf7imap2utf8 = iconv_open("UTF-8","UTF-7-IMAP");
 633
 634    if (utf7imap2utf8 == (iconv_t)-1)
 635       throw runtime_error("iconv can't convert from UTF-7-IMAP to UTF-8");
 636
 637    size_t in_size=utf7imapstring.size();
 638    size_t out_size=in_size*4;
 639
 640    char *buf = (char *)malloc(out_size+1);
 641    if (buf == NULL)
 642       throw runtime_error("out of memory for iconv buffer");
 643
 644    char *in = (char *)utf7imapstring.c_str();
 645    char *out = buf;
 646    iconv(utf7imap2utf8, &in, &in_size, &out, &out_size);
 647
 648    buf[utf7imapstring.size()*4-out_size]=0;
 649
 650    result=buf;
 651
 652    free(buf);
 653    iconv_close(utf7imap2utf8);
 654
 655    return result;
 656 }
 657
 658 std::string utf8_to_utf7imap(const std::string& utf8string)
 659 {
 660    string result;
 661
 662    iconv_t utf82utf7imap = iconv_open("UTF-7-IMAP", "UTF-8");
 663
 664    if (utf82utf7imap == (iconv_t)-1)
 665       throw runtime_error("iconv can't convert from UTF-7-IMAP to UTF-8");
 666
 667    // UTF-7 is base64 encoded, a buffer 10x as large
 668    // as the utf-8 buffer should be enough. If not the string will be truncated.
 669    size_t in_size=utf8string.size();
 670    size_t out_size=in_size*10;
 671
 672    char *buf = (char *)malloc(out_size+1);
 673    if (buf == NULL)
 674       throw runtime_error("out of memory for iconv buffer");
 675
 676    char *in = (char *)utf8string.c_str();
 677    char *out = buf;
 678    iconv(utf82utf7imap, &in, &in_size, &out, &out_size);
 679
 680    buf[utf8string.size()*10-out_size]= 0;
 681
 682    result=buf;
 683
 684    free(buf);
 685    iconv_close(utf82utf7imap);
 686
 687    return result;
 688 }
 689
 690 // Tokenize string by (html) tags
 691 void tokenize_by_tag(vector<pair<string,bool> > &tokenized, const std::string &input)
 692 {
 693    string::size_type pos, len = input.size();
 694    bool inside_tag = false;
 695    string current;
 696
 697    for (pos = 0; pos < len; pos++)
 698    {
 699       if (input[pos] == '<')
 700       {
 701          inside_tag = true;
 702
 703          if (!current.empty() )
 704          {
 705             tokenized.push_back( make_pair(current, false) );
 706             current = "";
 707          }
 708
 709          current += input[pos];
 710       }
 711       else if (input[pos] == '>' && inside_tag)
 712       {
 713          current += input[pos];
 714          inside_tag = false;
 715          if (!current.empty() )
 716          {
 717             tokenized.push_back( make_pair(current, true) );
 718             current = "";
 719          }
 720       }
 721       else
 722          current += input[pos];
 723    }
 724
 725    // String left over in buffer?
 726    if (!current.empty() )
 727       tokenized.push_back( make_pair(current, false) );
 728 } // eo tokenize_by_tag
 729
 730
 731 std::string strip_html_tags(const std::string &input)
 732 {
 733    // Pair first: string, second: isTag
 734    vector<pair<string,bool> > tokenized;
 735    tokenize_by_tag (tokenized, input);
 736
 737    string output;
 738    vector<pair<string,bool> >::const_iterator token, tokens_end = tokenized.end();
 739    for (token = tokenized.begin(); token != tokens_end; token++)
 740       if (!token->second)
 741          output += token->first;
 742
 743    return output;
 744 } // eo strip_html_tags
 745
 746
 747 // Smart-encode HTML en
 748 string smart_html_entities(const std::string &input)
 749 {
 750    // Pair first: string, second: isTag
 751    vector<pair<string,bool> > tokenized;
 752    tokenize_by_tag (tokenized, input);
 753
 754    string output;
 755    vector<pair<string,bool> >::const_iterator token, tokens_end = tokenized.end();
 756    for (token = tokenized.begin(); token != tokens_end; token++)
 757    {
 758       // keep HTML tags as they are
 759       if (token->second)
 760          output += token->first;
 761       else
 762          output += html_entities(token->first);
 763    }
 764
 765    return output;
 766 }
 767
 768
 769 string::size_type find_8bit(const std::string &str)
 770 {
 771    string::size_type l=str.size();
 772    for (string::size_type p=0; p < l; p++)
 773       if (static_cast<unsigned char>(str[p]) > 127)
 774          return p;
 775
 776    return string::npos;
 777 }
 778
 779 // encoded UTF-8 chars into HTML entities
 780 string html_entities(std::string str)
 781 {
 782    // Normal chars
 783    replace_all (str, "&", "&amp;");
 784    replace_all (str, "<", "&lt;");
 785    replace_all (str, ">", "&gt;");
 786    replace_all (str, "\"", "&quot;");
 787    replace_all (str, "'", "&#x27;");
 788    replace_all (str, "/", "&#x2F;");
 789
 790    // Umlauts
 791    replace_all (str, "\xC3\xA4", "&auml;");
 792    replace_all (str, "\xC3\xB6", "&ouml;");
 793    replace_all (str, "\xC3\xBC", "&uuml;");
 794    replace_all (str, "\xC3\x84", "&Auml;");
 795    replace_all (str, "\xC3\x96", "&Ouml;");
 796    replace_all (str, "\xC3\x9C", "&Uuml;");
 797
 798    // Misc
 799    replace_all (str, "\xC3\x9F", "&szlig;");
 800
 801    // conversion of remaining non-ASCII chars needed?
 802    // just do if needed because of performance
 803    if (find_8bit(str) != string::npos)
 804    {
 805       // convert to fixed-size encoding UTF-32
 806       wchar_t* wbuf=utf8_to_wbuf(str);
 807       ostringstream target;
 808
 809       // replace all non-ASCII chars with HTML representation
 810       for (int p=0; wbuf[p] != 0; p++)
 811       {
 812          unsigned int c=wbuf[p];
 813
 814          if (c <= 127)
 815             target << static_cast<unsigned char>(c);
 816          else
 817             target << "&#" << c << ';';
 818       }
 819
 820       free(wbuf);
 821
 822       str=target.str();
 823    }
 824
 825    return str;
 826 } // eo html_entities(std::string)
 827
 828
 829 bool replace_all(string &base, const char *ist, const char *soll)
 830 {
 831    string i=ist;
 832    string s=soll;
 833    return replace_all(base,&i,&s);
 834 }
 835
 836 bool replace_all(string &base, const string &ist, const char *soll)
 837 {
 838    string s=soll;
 839    return replace_all(base,&ist,&s);
 840 }
 841
 842 bool replace_all(string &base, const string *ist, const string *soll)
 843 {
 844    return replace_all(base,*ist,*soll);
 845 }
 846
 847 bool replace_all(string &base, const char *ist, const string *soll)
 848 {
 849    string i=ist;
 850    return replace_all(base,&i,soll);
 851 }
 852
 853 bool replace_all(string &base, const string &ist, const string &soll)
 854 {
 855    bool found_ist = false;
 856    string::size_type a=0;
 857
 858    if (ist.empty() )
 859       throw runtime_error ("replace_all called with empty search string");
 860
 861    while ( (a=base.find(ist,a) ) != string::npos)
 862    {
 863       base.replace(a,ist.size(),soll);
 864       a=a+soll.size();
 865       found_ist = true;
 866    }
 867
 868    return found_ist;
 869 }
 870
// Disabled code: everything between "#if 0" and "#endif" is not compiled.
// It holds index-loop based variants of to_lower() / to_upper() at global scope;
// functions with these names are defined in namespace I2n further up in this file.
#if 0
string to_lower(const string &src)
{
   string dst = src;

   string::size_type pos, end = dst.size();
   for (pos = 0; pos < end; pos++)
      dst[pos] = tolower(dst[pos]);

   return dst;
}

string to_upper(const string &src)
{
   string dst = src;

   string::size_type pos, end = dst.size();
   for (pos = 0; pos < end; pos++)
      dst[pos] = toupper(dst[pos]);

   return dst;
}
#endif
 894
 895
 896 const int MAX_SYMBOL_FORMATS = 9;
 897
 898 const string symbolFormatShort[MAX_SYMBOL_FORMATS] = {
 899         " B",
 900         " KB",
 901         " MB",
 902         " GB",
 903         " TB",
 904         " PB",
 905         " EB",
 906         " ZB",
 907         " YB"
 908 };
 909
 910 const string symbolFormatLong[MAX_SYMBOL_FORMATS] = {
 911         " Bytes",
 912         " KBytes",
 913         " MBytes",
 914         " GBytes",
 915         " TBytes",
 916         " PBytes",
 917         " EBytes",
 918         " ZBytes",
 919         " YBytes"
 920 };
 921
 922 string nice_unit_format(
 923         const int64_t input,
 924         const UnitBase base,
 925         const UnitFormat format
 926 )
 927 {
 928    // select the system of units (decimal or binary)
 929    int multiple = 0;
 930    if (base == UnitBase1000)
 931    {
 932        multiple = 1000;
 933    }
 934    else
 935    {
 936        multiple = 1024;
 937    }
 938
 939    long double size = input;
 940
 941    // check the size of the input number to fit in the appropriate symbol
 942    int sizecount = 0;
 943    while (size > multiple)
 944    {
 945        size = size / multiple;
 946        sizecount++;
 947    }
 948
 949    // round the input number "half up" to multiples of 10
 950    const int rounding_multiplier = 10;
 951    long double tmp;
 952    tmp = size * rounding_multiplier;
 953    tmp += 0.5;
 954    tmp = (int64_t) (tmp);
 955    tmp = (long double) (tmp) / (long double) (rounding_multiplier);
 956    size = tmp;
 957
 958    // format the input number, placing the appropriate symbol
 959    ostringstream out;
 960    out.setf (ios::fixed);
 961    if (format == ShortUnitFormat)
 962    {
 963        out.precision(1);
 964        out << size << i18n( symbolFormatShort[sizecount].c_str() );
 965    }
 966    else
 967    {
 968        out.precision (2);
 969        out << size << i18n( symbolFormatLong[sizecount].c_str() );
 970    }
 971
 972    return out.str();
 973 } // eo nice_unit_format(int input)
 974
 975
 976 string escape(const string &s)
 977 {
 978    string out(s);
 979    string::size_type p;
 980
 981    p=0;
 982    while ( (p=out.find_first_of("\"\\",p) ) !=out.npos)
 983    {
 984       out.insert (p,"\\");
 985       p+=2;
 986    }
 987
 988    p=0;
 989    while ( (p=out.find_first_of("\r",p) ) !=out.npos)
 990    {
 991       out.replace (p,1,"\\r");
 992       p+=2;
 993    }
 994
 995    p=0;
 996    while ( (p=out.find_first_of("\n",p) ) !=out.npos)
 997    {
 998       out.replace (p,1,"\\n");
 999       p+=2;
1000    }
1001
1002    out='"'+out+'"';
1003
1004    return out;
1005 } // eo scape(const std::string&)
1006
1007
/**
 * @brief extracts and decodes an escaped, double quoted string.
 * @param s the string which contains the escaped string.
 * @param startpos the position of the opening double quote within @a s.
 * @param[out] endpos receives @a startpos plus the offset of the closing double
 *  quote (the first one which is not escaped by a backslash).
 * @return the decoded text between the quotes: "\r" and "\n" sequences become
 *  carriage return and line feed, any other backslash is dropped and the
 *  character behind it is kept.
 * @throw std::out_of_range if the character at @a startpos is not a double
 *  quote, if @a startpos is outside of @a s, or if the text ends with a single
 *  backslash.
 */
std::string descape(const std::string &s, int startpos, int &endpos)
{
   if (s.at(startpos) != '"')
      throw std::out_of_range("value not type escaped string");

   std::string out = s.substr(startpos+1);

   // search for the closing quote
   std::string::size_type quote = 0;
   while ( (quote = out.find('"', quote) ) != std::string::npos)
   {
      // the quote is escaped if an odd number of backslashes precedes it
      std::string::size_type backslashes = 0;
      while (backslashes < quote && out[quote - 1 - backslashes] == '\\')
         ++backslashes;

      if (backslashes % 2 == 0)
         break;

      ++quote;
   }

   // cut off everything from the closing quote on
   out = out.substr(0, quote);

   // tell the caller where the string ends
   endpos = startpos + quote + 1;

   // decode all backslash sequences inside the string now
   std::string::size_type backslash = 0;
   while ( (backslash = out.find('\\', backslash) ) != std::string::npos)
   {
      const char escaped = out.at(backslash + 1);
      if (escaped == 'r')
         out.replace(backslash, 2, "\r");
      else if (escaped == 'n')
         out.replace(backslash, 2, "\n");
      else
         out.erase(backslash, 1);

      // continue behind the decoded character
      ++backslash;
   }

   return out;
} // eo descape(const std::string&,int,int&)
1067
1068
/**
 * @brief encloses a string in single quotes for use as shell argument.
 * @param input the string.
 * @return the string enclosed in single quotes; every single quote inside of
 *  the string is written as the sequence '\'' (close quote, escaped quote,
 *  open quote).
 */
std::string escape_shellarg(const std::string &input)
{
   std::string quoted(1, '\'');

   for (std::string::size_type idx = 0; idx < input.size(); ++idx)
   {
      const char ch = input[idx];

      // leave the quoted section and add an escaped quote; the quote appended
      // below opens the quoted section again
      if (ch == '\'')
         quoted += "'\\'";

      quoted += ch;
   }

   quoted += '\'';
   return quoted;
}