// [libi2ncommon] / src / stringfunc.cpp

/*
The software in this package is distributed under the GNU General
Public License version 2 (with a special exception described below).

A copy of GNU General Public License (GPL) is included in this distribution,
in the file COPYING.GPL.

As a special exception, if other files instantiate templates or use macros
or inline functions from this file, or you compile this file and link it
with other works to produce a work based on this file, this file
does not by itself cause the resulting work to be covered
by the GNU General Public License.

However the source code for this file must still be made available
in accordance with section (3) of the GNU General Public License.

This exception does not invalidate any other reasons why a work based
on this file might be covered by the GNU General Public License.
*/
/** @file
 *
 * (c) Copyright 2007-2008 by Intra2net AG
 */

#include <iostream>
#include <string>
#include <sstream>
#include <stdexcept>
#include <algorithm>
#include <cmath>    // for round()

#include <wchar.h>
#include <stdlib.h>
#include <iconv.h>
#include <i18n.h>

#include <boost/numeric/conversion/cast.hpp>
#include <boost/foreach.hpp>

#include <stringfunc.hxx>

using namespace std;

namespace I2n
{


namespace
{

/// hex digit lookup tables; the index is the nibble value (0..15).
const std::string hexDigitsLower("0123456789abcdef");
const std::string hexDigitsUpper("0123456789ABCDEF");


/**
 * @brief functor which maps a single character to its upper case form.
 *
 * The character is passed to std::toupper() as unsigned char value since
 * passing a negative plain char to the <cctype> functions is undefined behaviour.
 */
struct UpperFunc
{
   char operator() (char c) const
   {
      return static_cast<char>( std::toupper( static_cast<unsigned char>(c) ) );
   }
}; // eo struct UpperFunc


/**
 * @brief functor which maps a single character to its lower case form.
 *
 * The character is passed to std::tolower() as unsigned char value since
 * passing a negative plain char to the <cctype> functions is undefined behaviour.
 */
struct LowerFunc
{
   char operator() (char c) const
   {
      return static_cast<char>( std::tolower( static_cast<unsigned char>(c) ) );
   }
}; // eo struct LowerFunc


} // eo namespace <anonymous>


/**
 * default list of Whitespaces (" \t\r\n");
 * i.e. space, horizontal tab, carriage return and line feed.
 * convert_hex_to_binary() in this file accepts these characters as delimiters.
 */
const std::string Whitespaces = " \t\r\n";

/**
 * default list of lineendings ("\r\n");
 * i.e. carriage return and line feed.
 */
const std::string LineEndings= "\r\n";


/**
 * @brief tests whether a string starts with a given prefix.
 * @param str the string which is inspected.
 * @param prefix the prefix to look for.
 * @return @a true iff @a prefix is not empty and @a str begins with it.
 */
bool has_prefix(const std::string& str, const std::string& prefix)
{
   const std::string::size_type prefix_len= prefix.size();
   // an empty prefix never matches; a prefix longer than the string
   // (which includes the case of an empty string) cannot match either.
   if (prefix_len == 0 || prefix_len > str.size() )
   {
      return false;
   }
   return 0 == str.compare(0, prefix_len, prefix);
} // eo has_prefix(const std::string&,const std::string&)


/**
 * @brief tests whether a string ends with a given suffix.
 * @param str the string which is inspected.
 * @param suffix the suffix to look for.
 * @return @a true iff @a suffix is not empty and @a str ends with it.
 */
bool has_suffix(const std::string& str, const std::string& suffix)
{
   const std::string::size_type suffix_len= suffix.size();
   // an empty suffix never matches; a suffix longer than the string
   // (which includes the case of an empty string) cannot match either.
   if (suffix_len == 0 || suffix_len > str.size() )
   {
      return false;
   }
   const std::string::size_type tail_start= str.size() - suffix_len;
   return 0 == str.compare(tail_start, suffix_len, suffix);
} // eo has_suffix(const std::string&,const std::string&)


/**
 * strips all leading and trailing characters contained in a character list
 * from a string (in place).
 * @param[in,out] str the string which is trimmed.
 * @param charlist the characters which are removed from both ends of the string.
 * @return a copy of the trimmed string.
 */
std::string trim_mod(std::string& str, const std::string& charlist)
{
   const std::string::size_type first_kept= str.find_first_not_of(charlist);
   if (first_kept == std::string::npos)
   {
      // nothing to keep: the string is empty or consists of charlist chars only
      str.clear();
      return str;
   }
   // at least one character survives, so the search from the end succeeds, too.
   const std::string::size_type last_kept= str.find_last_not_of(charlist);
   // cut the tail first; that way the head index is still valid afterwards.
   str.erase(last_kept + 1);
   str.erase(0, first_kept);
   return str;
} // eo trim_mod(std::string&,const std::string&)


/**
 * drops the last character of a string (in place) if that character is
 * contained in a given character list.
 * @param[in,out] str the string.
 * @param what the list of characters which qualify for removal.
 * @return a copy of the resulting string.
 */
std::string chomp_mod(std::string& str, const std::string& what)
{
   if (!str.empty() && !what.empty() )
   {
      const std::string::size_type last= str.size() - 1;
      if (what.find( str[last] ) != std::string::npos)
      {
         str.erase(last);
      }
   }
   return str;
} // eo chomp_mod(std::string&,const std::string&)


/**
 * @brief converts a string to lower case.
 * @param[in,out] str the string to modify.
 * @return the string
 */
std::string to_lower_mod(std::string& str)
{
   std::transform(str.begin(), str.end(), str.begin(), LowerFunc() );
   return str;
} // eo to_lower_mod(std::string&)


/**
 * @brief converts a string to upper case.
 * @param[in,out] str the string to modify.
 * @return the string
 */
std::string to_upper_mod(std::string& str)
{
   std::transform( str.begin(), str.end(), str.begin(), UpperFunc() );
   return str;
} // eo to_upper_mod(std::string&)


/**
 * returns a copy of a string without the leading and trailing characters
 * which are contained in a given character list.
 * @param str the string which should be trimmed.
 * @param charlist the characters which are removed from both ends of the string.
 * @return the trimmed string.
 */
std::string trim (const std::string& str, const std::string& charlist)
{
   const std::string::size_type first= str.find_first_not_of(charlist);
   if (first != std::string::npos)
   {
      // there is something to keep; so the search from the end succeeds, too.
      const std::string::size_type last= str.find_last_not_of(charlist);
      return std::string(str, first, last - first + 1);
   }
   // empty string or only characters from charlist
   return std::string();
} // eo trim(const std::string&,const std::string&)


/**
 * returns a copy of a string whose last character is dropped if that
 * character is contained in a given character list.
 * @param str the string.
 * @param what the list of characters which qualify for removal.
 * @return the resulting string (last char removed if applicable).
 */
std::string chomp (const std::string& str, const std::string& what)
{
   const bool removable=
      !str.empty()
      && !what.empty()
      && what.find( *str.rbegin() ) != std::string::npos;
   return removable ? str.substr(0, str.size()-1) : str;
} // eo chomp(const std::string&,const std::string&)


/**
 * @brief creates a lower case copy of a string.
 * @param str the string
 * @return the lower case version of the string
 */
std::string to_lower (const std::string& str)
{
   std::string lower_copy= str;
   to_lower_mod(lower_copy);
   return lower_copy;
} // eo to_lower(const std::string&)


/**
 * @brief creates an upper case copy of a string.
 * @param str the string
 * @return the upper case version of the string
 */
std::string to_upper(const std::string& str)
{
   std::string upper_copy= str;
   to_upper_mod(upper_copy);
   return upper_copy;
} // eo to_upper(const std::string&)


/**
 * @brief strips a given suffix from a string.
 * @param str the string.
 * @param suffix the suffix which is removed if the string ends with it.
 * @return the string without the suffix.
 *
 * The decision is delegated to has_suffix(); if it reports no match
 * the string is returned unchanged.
 */
std::string remove_suffix(const std::string& str, const std::string& suffix)
{
   if (!has_suffix(str,suffix) )
   {
      return str;
   }
   return std::string(str, 0, str.size()-suffix.size() );
} // eo remove_suffix(const std::string&,const std::string&)


/**
 * @brief strips a given prefix from a string.
 * @param str the string.
 * @param prefix the prefix which is removed if the string begins with it.
 * @return the string without the prefix.
 *
 * The decision is delegated to has_prefix(); if it reports no match
 * the string is returned unchanged.
 */
std::string remove_prefix(const std::string& str, const std::string& prefix)
{
   if (!has_prefix(str,prefix) )
   {
      return str;
   }
   return std::string(str, prefix.size() );
} // eo remove_prefix(const std::string&,const std::string&)


/**
 * splits a string into key and value at the first occurrence of a delimiter.
 * Both parts are passed through trim_mod() (called without an explicit character list).
 * @param str the string which should be split.
 * @param[out] key the resulting key (untouched if the delimiter is missing)
 * @param[out] value the resulting value (untouched if the delimiter is missing)
 * @param delimiter the delimiter between key and value.
 * @return @a true if the delimiter was found and the split was done.
 */
bool pair_split(
   const std::string& str,
   std::string& key,
   std::string& value,
   char delimiter)
{
   const std::string::size_type delim_pos= str.find (delimiter);
   if (delim_pos == std::string::npos)
   {
      return false;
   }
   key= str.substr(0, delim_pos);
   value= str.substr(delim_pos + 1);
   trim_mod(key);
   trim_mod(value);
   return true;
} // eo pair_split(const std::string&,std::string&,std::string&,char)


/**
 * splits a string by given delimiter
 *
 * @param[in] str the string which should be splitted.
 * @param[out] result the list resulting from splitting @a str (parts are appended).
 * @param[in] delimiter the delimiter (word/phrase) at which @a str should be splitted.
 *  An empty delimiter cannot split anything; in that case a non-empty @a str is
 *  treated as one single part.
 * @param[in] omit_empty should empty parts not be stored?
 * @param[in] trim_list list of characters the parts should be trimmed by.
 *  (empty string results in no trim)
 *
 * An empty @a str yields no parts at all. If @a str ends with the delimiter an empty
 * part is appended (unless @a omit_empty is set); this keeps the result consistent
 * with a join operation.
 */
void split_string(
   const std::string& str,
   std::list<std::string>& result,
   const std::string& delimiter,
   bool omit_empty,
   const std::string& trim_list
)
{
   if (delimiter.empty() )
   {
      // find() reports an empty needle at the search position itself, so the
      // loop below would never advance (endless loop). Handle this case explicitly.
      if (!str.empty() )
      {
         std::string part(str);
         if (!trim_list.empty() ) trim_mod (part, trim_list);
         if (!omit_empty || !part.empty() )
         {
            result.push_back( part );
         }
      }
      return;
   }

   std::string::size_type last_pos= 0;
   bool delimiter_found= false;
   while (last_pos < str.size() )
   {
      const std::string::size_type pos= str.find(delimiter, last_pos);
      delimiter_found= (pos != std::string::npos);
      std::string part;
      if (delimiter_found)
      {
         part= str.substr(last_pos, pos-last_pos);
         last_pos= pos + delimiter.size();
      }
      else
      {
         // no further delimiter: the remainder is the last part
         part= str.substr(last_pos);
         last_pos= str.size();
      }
      if (!trim_list.empty() ) trim_mod (part, trim_list);
      if (omit_empty && part.empty() ) continue;
      result.push_back( part );
   }
   // if the string ends with a delimiter we need to append an empty string if no omit_empty
   // was given.
   // (this way we keep the split result consistent to a join operation)
   if (delimiter_found && !omit_empty)
   {
      result.push_back("");
   }
} // eo split_string(const std::string&,std::list< std::string >&,const std::string&,bool,const std::string&)


/**
 * splits a string by a given delimiter and returns the parts.
 *
 * Convenience variant which forwards to the split_string() overload
 * with the output list parameter.
 *
 * @param str the string which should be splitted.
 * @param delimiter the delimiter (word/phrase) at which @a str should be splitted.
 * @param[in] omit_empty should empty parts not be stored?
 * @param[in] trim_list list of characters the parts should be trimmed by.
 *  (empty string results in no trim)
 * @return the list resulting from splitting @a str.
 */
std::list<std::string> split_string(
   const std::string& str,
   const std::string& delimiter,
   bool omit_empty,
   const std::string& trim_list
)
{
   std::list<std::string> parts;
   split_string(str, parts, delimiter, omit_empty, trim_list);
   return parts;
} // eo split_string(const std::string&,const std::string&,bool,const std::string&)


/**
 * @brief concatenates a list of strings, separated by a delimiter.
 *
 * This function is (basically) the reverse operation of @a split_string.
 *
 * @param parts the list of strings.
 * @param delimiter the delimiter which is inserted between two neighbouring strings.
 * @return the joined string (empty if @a parts is empty).
 */
std::string join_string(
   const std::list< std::string >& parts,
   const std::string& delimiter
)
{
   std::string joined;
   for (std::list< std::string >::const_iterator it= parts.begin();
        it != parts.end();
        ++it)
   {
      // the delimiter goes in front of every part but the first one
      if (it != parts.begin() )
      {
         joined+= delimiter;
      }
      joined+= *it;
   }
   return joined;
} // eo join_string(const std::list< std::string >&,const std::string&)


/**
 * @brief concatenates a vector of strings, separated by a delimiter.
 *
 * Same as join_string for a list, except that it takes a vector.
 *
 * @param parts the vector of strings.
 * @param delimiter the delimiter which is inserted between two neighbouring strings.
 * @return the joined string (empty if @a parts is empty).
 */
std::string join_string(
   const std::vector< std::string >& parts,
   const std::string& delimiter
)
{
   std::string joined;
   for (std::vector< std::string >::size_type idx= 0; idx < parts.size(); ++idx)
   {
      // the delimiter goes in front of every part but the first one
      if (idx > 0)
      {
         joined+= delimiter;
      }
      joined+= parts[idx];
   }
   return joined;
} // eo join_string(const std::vector< std::string >&,const std::string&)


/*
** conversions
*/


/**
 * @brief returns a hex string from a binary string.
 * @param str the (binary) string
 * @param upper_case_digits determine whether to use upper case characters for digits A-F.
 * @return the string in hex notation.
 */
std::string convert_binary_to_hex(
   const std::string& str,
   bool upper_case_digits
)
{
   std::string result;
   std::string hexDigits(upper_case_digits ? hexDigitsUpper : hexDigitsLower);
   for ( std::string::const_iterator it= str.begin();
         it != str.end();
         ++it)
   {
      result.push_back( hexDigits[ ( (*it) >> 4) & 0x0f ] );
      result.push_back( hexDigits[ (*it) & 0x0f ] );
   }
   return result;
} // eo convert_binary_to_hex(const std::string&,bool)


/**
 * @brief converts a hex digit string to binary string.
 * @param str hex digit string
 * @return the binary string.
 * @throw std::runtime_error if @a str contains a character which is neither
 *  a hex digit (lower or upper case) nor a whitespace nor a colon.
 *
 * The hex digit string may contains white spaces or colons which are treated
 * as delimiters between hex digit groups.
 *
 * Two consecutive hex digits form one byte (first digit is the high nibble).
 * A single digit which is directly followed by a delimiter is stored as a byte
 * with that digit as low nibble (e.g. "a:b" results in 0x0a 0x0b).
 * A single digit left over at the end of the string is stored as HIGH nibble
 * (e.g. "abc" results in 0xab 0xc0).
 *
 * @todo rework the handling of half nibbles (consistency)!
 */
std::string convert_hex_to_binary(
   const std::string& str
)
throw (std::runtime_error)
{
   std::string result;
   char c= 0;               // the byte which is currently assembled
   bool hasNibble= false;   // true while c holds the first nibble of a not yet completed byte
   bool lastWasWS= true;    // true if the most recently handled character was a delimiter
   for ( std::string::const_iterator it= str.begin();
         it != str.end();
         ++it)
   {
      // determine the digit value; position in the digit tables == value of the digit
      std::string::size_type p = hexDigitsLower.find( *it );
      if (p== std::string::npos)
      {
         p= hexDigitsUpper.find( *it );
      }
      if (p == std::string::npos)
      {
         if (   ( Whitespaces.find( *it ) != std::string::npos) // is it a whitespace?
                or ( *it == ':') // or a colon?
            )
         {
            // we treat that as a valid delimiter:
            if (hasNibble)
            {
               // 1 nibble before WS is treated as lower part:
               result.push_back(c);
               // reset state:
               hasNibble= false;
            }
            lastWasWS= true;
            continue;
         }
      }
      if (p == std::string::npos )
      {
         // neither a hex digit nor a delimiter
         throw runtime_error("illegal character in hex digit string: " + str);
      }
      lastWasWS= false;
      if (hasNibble)
      {
         // second digit: move the first one to the high nibble
         c<<=4;
      }
      else
      {
         c=0;
      }
      c+= (p & 0x0f);
      if (hasNibble)
      {
         //we already had a nibble, so a char is complete now:
         result.push_back( c );
         hasNibble=false;
      }
      else
      {
         // this is the first nibble of a new char:
         hasNibble=true;
      }
   }
   if (hasNibble)
   {
      //well, there is one nibble left
      // let's do some heuristics:
      // NOTE(review): lastWasWS is cleared for every hex digit and a delimiter
      // resets hasNibble; so this first branch looks unreachable and a left over
      // nibble always ends up in the else branch. To be confirmed when the @todo is done.
      if (lastWasWS)
      {
         // if the preceding character was a white space (or a colon)
         // we treat the nibble as lower part:
         //( this is consistent with shortened hex notations where leading zeros are not noted)
         result.push_back( c );
      }
      else
      {
         // if it was part of a hex digit chain, we treat it as UPPER part (!!)
         result.push_back( c << 4 );
      }
   }
   return result;
} // eo convert_hex_to_binary(const std::string&)


} // eo namespace I2n


// Convert an ISO-8859-1 (latin1) encoded string to UTF-8 using iconv.
// Throws std::runtime_error if the converter or the work buffer cannot be set up.
// A failure of the conversion itself is not reported: the function returns
// whatever iconv has converted up to that point.
std::string iso_to_utf8(const std::string& isostring)
{
   iconv_t i2utf8 = iconv_open("UTF-8", "ISO-8859-1");

   // check the conversion descriptor we just opened
   if (i2utf8 == (iconv_t)-1)
      throw std::runtime_error("iconv can't convert from ISO-8859-1 to UTF-8");

   size_t in_size=isostring.size();
   // generous upper bound for the UTF-8 output
   const size_t buf_size=in_size*4;
   size_t out_size=buf_size;

   char *buf = (char *)malloc(buf_size+1);
   if (buf == NULL)
   {
      // don't leak the conversion descriptor
      iconv_close(i2utf8);
      throw std::runtime_error("out of memory for iconv buffer");
   }

   // iconv takes a non-const input pointer but does not modify the input
   char *in = const_cast<char *>(isostring.c_str());
   char *out = buf;
   iconv(i2utf8, &in, &in_size, &out, &out_size);
   iconv_close(i2utf8);

   // iconv decreased out_size by the number of bytes written; using the length
   // (instead of a NUL terminator) keeps embedded NUL characters intact.
   buf[buf_size-out_size]=0;
   std::string result;
   try
   {
      result.assign(buf, buf_size-out_size);
   }
   catch (...)
   {
      free(buf);
      throw;
   }

   free(buf);

   return result;
}

// Convert an UTF-8 encoded string to ISO-8859-1 (latin1) using iconv.
// Throws std::runtime_error if the converter or the work buffer cannot be set up.
// A failure of the conversion itself (e.g. a character iconv cannot convert)
// is not reported: the function returns whatever iconv has converted up to that point.
std::string utf8_to_iso(const std::string& utf8string)
{
   iconv_t utf82iso = iconv_open("ISO-8859-1","UTF-8");

   if (utf82iso == (iconv_t)-1)
      throw std::runtime_error("iconv can't convert from UTF-8 to ISO-8859-1");

   size_t in_size=utf8string.size();
   // the single byte output never needs more bytes than the UTF-8 input
   const size_t buf_size=in_size;
   size_t out_size=buf_size;

   char *buf = (char *)malloc(buf_size+1);
   if (buf == NULL)
   {
      // don't leak the conversion descriptor
      iconv_close(utf82iso);
      throw std::runtime_error("out of memory for iconv buffer");
   }

   // iconv takes a non-const input pointer but does not modify the input
   char *in = const_cast<char *>(utf8string.c_str());
   char *out = buf;
   iconv(utf82iso, &in, &in_size, &out, &out_size);
   iconv_close(utf82iso);

   // iconv decreased out_size by the number of bytes written; using the length
   // (instead of a NUL terminator) keeps embedded NUL characters intact.
   buf[buf_size-out_size]=0;
   std::string result;
   try
   {
      result.assign(buf, buf_size-out_size);
   }
   catch (...)
   {
      free(buf);
      throw;
   }

   free(buf);

   return result;
}

// Convert an UTF-8 encoded string to a zero terminated buffer of UCS-4LE code points.
// The buffer is allocated with malloc(); the caller has to release it with free().
// Throws std::runtime_error if the converter or the buffer cannot be set up or if
// iconv reports a conversion error (nothing is leaked in these cases).
// NOTE(review): the iconv output is stored in wchar_t elements; this assumes a
// 4 byte, little endian wchar_t -- confirm for all target platforms.
wchar_t* utf8_to_wbuf(const std::string& utf8string)
{
   iconv_t utf82wstr = iconv_open("UCS-4LE","UTF-8");

   if (utf82wstr == (iconv_t)-1)
      throw std::runtime_error("iconv can't convert from UTF-8 to UCS-4");

   size_t in_size=utf8string.size();
   // at most one code point per input byte plus the terminator
   const size_t buf_size= (in_size+1)*sizeof(wchar_t);
   size_t out_size= buf_size;

   wchar_t *buf = (wchar_t *)malloc(buf_size);
   if (buf == NULL)
   {
      // don't leak the conversion descriptor
      iconv_close(utf82wstr);
      throw std::runtime_error("out of memory for iconv buffer");
   }

   // iconv takes a non-const input pointer but does not modify the input
   char *in = const_cast<char *>(utf8string.c_str());
   char *out = reinterpret_cast<char *>(buf);
   const size_t rc = iconv(utf82wstr, &in, &in_size, &out, &out_size);

   // the descriptor is no longer needed, regardless of the result
   iconv_close(utf82wstr);

   if (rc == (size_t)-1)
   {
      free(buf);
      throw std::runtime_error("error converting char encodings");
   }

   // iconv decreased out_size by the number of bytes written
   buf[ (buf_size-out_size) /sizeof(wchar_t) ]=0;

   return buf;
}

// Convert a string from the iconv encoding "UTF-7-IMAP" to UTF-8.
// Throws std::runtime_error if the converter (the encoding has to be known to
// the iconv implementation) or the work buffer cannot be set up.
// A failure of the conversion itself is not reported: the function returns
// whatever iconv has converted up to that point.
std::string utf7imap_to_utf8(const std::string& utf7imapstring)
{
   iconv_t utf7imap2utf8 = iconv_open("UTF-8","UTF-7-IMAP");

   if (utf7imap2utf8 == (iconv_t)-1)
      throw std::runtime_error("iconv can't convert from UTF-7-IMAP to UTF-8");

   size_t in_size=utf7imapstring.size();
   // generous upper bound for the UTF-8 output
   const size_t buf_size=in_size*4;
   size_t out_size=buf_size;

   char *buf = (char *)malloc(buf_size+1);
   if (buf == NULL)
   {
      // don't leak the conversion descriptor
      iconv_close(utf7imap2utf8);
      throw std::runtime_error("out of memory for iconv buffer");
   }

   // iconv takes a non-const input pointer but does not modify the input
   char *in = const_cast<char *>(utf7imapstring.c_str());
   char *out = buf;
   iconv(utf7imap2utf8, &in, &in_size, &out, &out_size);
   iconv_close(utf7imap2utf8);

   // iconv decreased out_size by the number of bytes written; using the length
   // (instead of a NUL terminator) keeps embedded NUL characters intact.
   buf[buf_size-out_size]=0;
   std::string result;
   try
   {
      result.assign(buf, buf_size-out_size);
   }
   catch (...)
   {
      free(buf);
      throw;
   }

   free(buf);

   return result;
}

// Convert an UTF-8 encoded string to the iconv encoding "UTF-7-IMAP".
// Throws std::runtime_error if the converter (the encoding has to be known to
// the iconv implementation) or the work buffer cannot be set up.
// A failure of the conversion itself is not reported: the function returns
// whatever iconv has converted up to that point.
std::string utf8_to_utf7imap(const std::string& utf8string)
{
   iconv_t utf82utf7imap = iconv_open("UTF-7-IMAP", "UTF-8");

   if (utf82utf7imap == (iconv_t)-1)
      throw std::runtime_error("iconv can't convert from UTF-8 to UTF-7-IMAP");

   // UTF-7 is base64 encoded, a buffer 10x as large
   // as the utf-8 buffer should be enough. If not the string will be truncated.
   size_t in_size=utf8string.size();
   const size_t buf_size=in_size*10;
   size_t out_size=buf_size;

   char *buf = (char *)malloc(buf_size+1);
   if (buf == NULL)
   {
      // don't leak the conversion descriptor
      iconv_close(utf82utf7imap);
      throw std::runtime_error("out of memory for iconv buffer");
   }

   // iconv takes a non-const input pointer but does not modify the input
   char *in = const_cast<char *>(utf8string.c_str());
   char *out = buf;
   iconv(utf82utf7imap, &in, &in_size, &out, &out_size);
   // UTF-7 is a stateful encoding: a call without input makes iconv write the
   // bytes which bring the output back to its initial shift state (i.e. it
   // terminates a still open base64 section). Writes nothing if not needed.
   iconv(utf82utf7imap, NULL, NULL, &out, &out_size);
   iconv_close(utf82utf7imap);

   // iconv decreased out_size by the number of bytes written
   buf[buf_size-out_size]= 0;
   std::string result;
   try
   {
      result.assign(buf, buf_size-out_size);
   }
   catch (...)
   {
      free(buf);
      throw;
   }

   free(buf);

   return result;
}

// Split a string into (html) tag tokens and text tokens.
// Every token is appended to "tokenized" as pair: first = text of the token,
// second = true if the token is a complete tag ("<" up to the next ">").
// A tag which is not closed (another "<" or the end of input comes first)
// is stored as text token.
void tokenize_by_tag(std::vector<std::pair<std::string,bool> > &tokenized, const std::string &input)
{
   std::string pending;
   bool in_tag = false;

   for (std::string::const_iterator it = input.begin(); it != input.end(); ++it)
   {
      const char ch = *it;

      if (ch == '<')
      {
         // a (new) tag starts: everything collected so far is plain text
         if (!pending.empty() )
         {
            tokenized.push_back( std::make_pair(pending, false) );
            pending.clear();
         }
         in_tag = true;
         pending += ch;
         continue;
      }

      pending += ch;

      if (ch == '>' && in_tag)
      {
         // the tag is complete now
         in_tag = false;
         tokenized.push_back( std::make_pair(pending, true) );
         pending.clear();
      }
   }

   // whatever is left over counts as text
   if (!pending.empty() )
      tokenized.push_back( std::make_pair(pending, false) );
} // eo tokenize_by_tag


// Return the input without (html) tags: the input is split by
// tokenize_by_tag() and only the non-tag tokens are concatenated.
std::string strip_html_tags(const std::string &input)
{
   // Pair first: string, second: isTag
   typedef std::vector<std::pair<std::string,bool> > TokenList;

   TokenList tokens;
   tokenize_by_tag (tokens, input);

   std::string text_only;
   for (TokenList::size_type idx = 0; idx < tokens.size(); ++idx)
   {
      if (tokens[idx].second)
         continue;   // skip tags
      text_only += tokens[idx].first;
   }

   return text_only;
} // eo strip_html_tags


// Smart-encode HTML entities: the input is split by tokenize_by_tag();
// tag tokens are copied verbatim, all other tokens are passed through html_entities().
std::string smart_html_entities(const std::string &input)
{
   // Pair first: string, second: isTag
   typedef std::vector<std::pair<std::string,bool> > TokenList;

   TokenList tokens;
   tokenize_by_tag (tokens, input);

   std::string encoded;
   for (TokenList::size_type idx = 0; idx < tokens.size(); ++idx)
   {
      const std::pair<std::string,bool> &token = tokens[idx];
      if (!token.second)
         encoded += html_entities(token.first);
      else
         encoded += token.first;   // keep HTML tags as they are
   }

   return encoded;
}


// Return the index of the first character with a value above 127 (i.e. a
// non 7 bit ASCII character) or string::npos if there is none.
std::string::size_type find_8bit(const std::string &str)
{
   std::string::size_type index = 0;
   for (std::string::const_iterator it = str.begin(); it != str.end(); ++it, ++index)
   {
      // values above 127 are exactly those with the highest bit of the byte set
      if ( (static_cast<unsigned char>(*it) & 0x80) != 0)
         return index;
   }

   return std::string::npos;
}

// Encode an UTF-8 string for use in HTML:
// - & < > " ' / and the german umlauts / sharp s are replaced by entities
// - all remaining non-ASCII characters are replaced by numeric entities (&#NNN;)
std::string html_entities(std::string str)
{
   // search / replacement pairs; applied in this order ("&" has to be the first one
   // since the replacements themselves contain ampersands)
   static const char* const replacements[][2] =
   {
      // Normal chars
      { "&", "&amp;" },
      { "<", "&lt;" },
      { ">", "&gt;" },
      { "\"", "&quot;" },
      { "'", "&#x27;" },
      { "/", "&#x2F;" },
      // Umlauts
      { "\xC3\xA4", "&auml;" },
      { "\xC3\xB6", "&ouml;" },
      { "\xC3\xBC", "&uuml;" },
      { "\xC3\x84", "&Auml;" },
      { "\xC3\x96", "&Ouml;" },
      { "\xC3\x9C", "&Uuml;" },
      // Misc
      { "\xC3\x9F", "&szlig;" }
   };
   const size_t replacement_count = sizeof(replacements) / sizeof(replacements[0]);

   for (size_t idx = 0; idx < replacement_count; ++idx)
      replace_all (str, replacements[idx][0], replacements[idx][1]);

   // conversion of remaining non-ASCII chars is only done if needed (performance)
   if (find_8bit(str) == std::string::npos)
      return str;

   // convert to a fixed-size encoding, one element per character
   wchar_t* wide = utf8_to_wbuf(str);
   std::ostringstream encoded;

   // replace all non-ASCII chars with their numeric HTML representation
   for (const wchar_t* cur = wide; *cur != 0; ++cur)
   {
      const unsigned int code = *cur;

      if (code > 127)
         encoded << "&#" << code << ';';
      else
         encoded << static_cast<unsigned char>(code);
   }

   // the buffer is released with free() here
   free(wide);

   str = encoded.str();
   return str;
} // eo html_entities(std::string)

// convert HTML entities to something that can be viewed on a basic text console (restricted to ASCII-7)
// Only the named / hex entities listed below are decoded; umlauts and the
// sharp s are transliterated (ae, oe, ue, ..., ss).
std::string html_entities_to_console(std::string str)
{
   // Normal chars (except "&amp;", see below)
   replace_all (str, "&lt;", "<");
   replace_all (str, "&gt;", ">");
   replace_all (str, "&quot;", "\"");
   replace_all (str, "&#x27;", "'");
   replace_all (str, "&#x2F;", "/");

   // Umlauts
   replace_all (str, "&auml;", "ae");
   replace_all (str, "&ouml;", "oe");
   replace_all (str, "&uuml;", "ue");
   replace_all (str, "&Auml;", "Ae");
   replace_all (str, "&Ouml;", "Oe");
   replace_all (str, "&Uuml;", "Ue");

   // Misc
   replace_all (str, "&szlig;", "ss");

   // The ampersand has to be decoded last: an ampersand produced here must not
   // start another entity which gets decoded a second time
   // (e.g. "&amp;lt;" is the text "&lt;" and not "<").
   replace_all (str, "&amp;", "&");

   return str;
}

// Remove all html comments from str (in place); convenience wrapper:
// find_html_comments(str, zones) followed by remove_html_comments(str, zones)
void remove_html_comments(std::string &str)
{
    std::vector<CommentZone> zones;
    find_html_comments(str, zones);
    remove_html_comments(str, zones);
}

// Find all html comments ("<!--" ... "-->") in str, handling nested comments.
// Each comment is appended to "comments" as zone: first = index of the start of
// the start tag, second = first index after the closing tag.
// Invalid comments are reported, too: a closing tag without a start tag gets
// string::npos as first, a start tag without a closing tag gets string::npos as second.
// Order of the result: zones with a closing tag in the order of their closing tags,
// followed by the unclosed ones from the last start tag to the first.
void find_html_comments(const std::string &str, std::vector<CommentZone> &comments)
{
    static const std::string OPEN_TAG = "<!--";
    static const std::string CLOSE_TAG = "-->";

    // start tags which still wait for their closing tag (innermost one is last)
    std::vector<std::string::size_type> open_starts;

    const std::string::size_type total = str.length();
    std::string::size_type cursor = 0;

    while (cursor < total)
    {
        const std::string::size_type open_at = str.find(OPEN_TAG, cursor);
        const std::string::size_type close_at = str.find(CLOSE_TAG, cursor);

        if (open_at == std::string::npos && close_at == std::string::npos)
            break;   // no more tags

        const bool close_comes_first =
            (open_at == std::string::npos) || (close_at < open_at);

        if (!close_comes_first)
        {
            // a new comment starts (push)
            open_starts.push_back(open_at);
            cursor = open_at + OPEN_TAG.length();
            continue;
        }

        // a comment ends (pop); without a pending start tag the start is unknown
        const std::string::size_type zone_end = close_at + CLOSE_TAG.length();
        std::string::size_type zone_start = std::string::npos;
        if (!open_starts.empty())
        {
            zone_start = open_starts.back();
            open_starts.pop_back();
        }
        comments.push_back(CommentZone(zone_start, zone_end));
        cursor = zone_end;
    }

    // comments without closing tag: from back to front (important for remove_html_comments!)
    for ( ; !open_starts.empty(); open_starts.pop_back())
        comments.push_back(CommentZone(open_starts.back(), std::string::npos));
}

// remove all html comments foundby find_html_comments
void remove_html_comments(std::string &str, const vector<CommentZone> &comments)
{
    // remember position where last removal started
    string::size_type last_removal_start = str.length();

    // Go from back to front to not mess up indices.
    // This requires that bigger comments, that contain smaller comments, come AFTER
    // the small contained comments in the comments vector (i.e. comments are ordered by
    // their closing tag, not their opening tag). This is true for results from find_html_comments
    BOOST_REVERSE_FOREACH(const CommentZone &comment, comments)
    {
        if (comment.first == string::npos)
        {
            str = str.replace(0, comment.second, "");   // comment starts "before" str --> delete from start
            break;   // there can be no more
        }
        else if (comment.first >= last_removal_start)
        {
            continue;    // this comment is inside another comment that we have removed already
        }
        else if (comment.second == string::npos)   // comment ends "after" str --> delete until end
        {
            str = str.replace(comment.first, string::npos, "");
            last_removal_start = comment.first;
        }
        else
        {
            str = str.replace(comment.first, comment.second-comment.first, "");
            last_removal_start = comment.first;
        }
    }
}

// replace_all for two C strings: wraps both into strings and forwards to
// the overload taking string pointers. Returns the result of that overload.
bool replace_all(std::string &base, const char *ist, const char *soll)
{
   std::string search(ist);
   std::string replacement(soll);
   return replace_all(base, &search, &replacement);
}

// replace_all for a search string and a C string replacement: wraps the
// replacement into a string and forwards to the overload taking string pointers.
bool replace_all(std::string &base, const std::string &ist, const char *soll)
{
   std::string replacement(soll);
   return replace_all(base, &ist, &replacement);
}

// replace_all for string pointers: dereferences both pointers (which
// therefore must not be NULL) and forwards to the overload taking references.
bool replace_all(std::string &base, const std::string *ist, const std::string *soll)
{
   const std::string &search = *ist;
   const std::string &replacement = *soll;
   return replace_all(base, search, replacement);
}

// replace_all for a C string search and a string pointer replacement: wraps the
// search text into a string and forwards to the overload taking string pointers.
bool replace_all(std::string &base, const char *ist, const std::string *soll)
{
   std::string search(ist);
   return replace_all(base, &search, soll);
}

// Replace every occurrence of "ist" in "base" by "soll" (in place).
// The search continues behind the inserted text, so replacements are never scanned again.
// Returns true if at least one occurrence was replaced.
// Throws runtime_error if the search string is empty.
bool replace_all(std::string &base, const std::string &ist, const std::string &soll)
{
   if (ist.empty() )
      throw std::runtime_error ("replace_all called with empty search string");

   bool replaced_any = false;

   for (std::string::size_type hit = base.find(ist, 0);
        hit != std::string::npos;
        hit = base.find(ist, hit + soll.size() ) )
   {
      base.replace(hit, ist.size(), soll);
      replaced_any = true;
   }

   return replaced_any;
}

/**
 * @brief replaces all characters which might cause trouble (or a security risk) in log files
 * @param str the original string
 * @param replace_with the character which is put in place of every unsafe character
 * @return a copy of @a str which is safe to send to syslog or other logfiles
 *
 * Only the characters from 0x20 (space) up to and including 0x7E (~) are kept.
 * See e.g. RFC 5424, section 8.2 or the posix character class "printable".
 * Everything else (NUL, control characters, 8 bit chars and thus all bytes of
 * UTF8 multi byte sequences) is replaced.
 *
 */
std::string sanitize_for_logging(const std::string &str, const char replace_with)
{
    std::string output(str);

    for (std::string::iterator it = output.begin(); it != output.end(); ++it)
    {
        const unsigned char value = static_cast<unsigned char>(*it);
        const bool is_safe = (value >= 0x20) && (value <= 0x7E);
        if (!is_safe)
            *it = replace_with;
    }

    return output;
}

// Disabled (compiled out via "#if 0") loop based variants of to_lower() / to_upper().
// The active implementations are I2n::to_lower() and I2n::to_upper() further up in this file.
#if 0
string to_lower(const string &src)
{
   string dst = src;

   string::size_type pos, end = dst.size();
   for (pos = 0; pos < end; pos++)
      dst[pos] = tolower(dst[pos]);

   return dst;
}

string to_upper(const string &src)
{
   string dst = src;

   string::size_type pos, end = dst.size();
   for (pos = 0; pos < end; pos++)
      dst[pos] = toupper(dst[pos]);

   return dst;
}
#endif

// Number of entries in each of the unit symbol tables below;
// nice_unit_format() never scales a value beyond the last entry.
const int MAX_UNIT_FORMAT_SYMBOLS = 6;

// Short unit symbols (with leading space), indexed by the number of times
// nice_unit_format() divided the value by the unit base (0 = bytes ... 5 = peta bytes).
const string shortUnitFormatSymbols[MAX_UNIT_FORMAT_SYMBOLS] = {
        " B",
        " KB",
        " MB",
        " GB",
        " TB",
        " PB"
};

// Long unit symbols (with leading space), same indexing as the short ones.
// nice_unit_format() passes the selected symbol through i18n(); i18n_noop()
// presumably only marks the literals for translation (see i18n.h).
const string longUnitFormatSymbols[MAX_UNIT_FORMAT_SYMBOLS] = {
        i18n_noop(" Bytes"),
        i18n_noop(" KBytes"),
        i18n_noop(" MBytes"),
        i18n_noop(" GBytes"),
        i18n_noop(" TBytes"),
        i18n_noop(" PBytes")
};


// Round a number "half up" to a multiple of 1/rounding_multiplier
// (e.g. rounding_multiplier 10 rounds to one decimal place: 0.25 -> 0.3).
// floor() is used instead of an integer cast: the cast truncates towards zero
// and therefore gave wrong results for negative numbers (-5.0 became -4.9);
// it also avoids the limited range of an integer type.
long double rounding_upwards(
        const long double number,
        const int rounding_multiplier
)
{
    const long double scaled = number * rounding_multiplier;
    const long double rounded = std::floor(scaled + 0.5);

    return rounded / (long double) (rounding_multiplier);
}


string nice_unit_format(
        const int64_t input,
        const UnitFormat format,
        const UnitBase base
)
{
   // select the system of units (decimal or binary)
   int multiple = 0;
   if (base == UnitBase1000)
   {
       multiple = 1000;
   }
   else
   {
       multiple = 1024;
   }

   long double size = input;

   // check the size of the input number to fit in the appropriate symbol
   int sizecount = 0;
   while (size > multiple)
   {
       size = size / multiple;
       sizecount++;

       // rollback to the previous values and stop the loop when cannot
       // represent the number length.
       if (sizecount >= MAX_UNIT_FORMAT_SYMBOLS)
       {
           size = size * multiple;
           sizecount--;
           break;
       }
   }

   // round the input number "half up" to multiples of 10
   const int rounding_multiplier = 10;
   size = rounding_upwards(size, rounding_multiplier);

   // format the input number, placing the appropriate symbol
   ostringstream out;
   out.setf (ios::fixed);
   if (format == ShortUnitFormat)
   {
       out.precision(1);
       out << size << i18n( shortUnitFormatSymbols[sizecount].c_str() );
   }
   else
   {
       out.precision (2);
       out << size << i18n( longUnitFormatSymbols[sizecount].c_str() );
   }

   return out.str();
} // eo nice_unit_format(int input)


// Variant of nice_unit_format() for floating point input: the input is rounded
// to the nearest integer and handed to the int64_t variant.
// boost::numeric_cast does the (checked) conversion; it raised an overflow
// error near the max value of int64_t (~9.2e18, see unittest).
std::string nice_unit_format(
        const double input,
        const UnitFormat format,
        const UnitBase base
)
{
    // round as double first, then convert
    const double rounded = round(input);
    const int64_t whole_bytes = boost::numeric_cast<int64_t>( rounded );

    return nice_unit_format( whole_bytes, format, base );
} // eo nice_unit_format(double input)


// Return the string enclosed in double quotes with special characters escaped:
// a backslash is put in front of every double quote and every backslash,
// carriage return becomes \r and line feed becomes \n (two characters each).
std::string escape(const std::string &s)
{
   std::string out("\"");

   for (std::string::const_iterator it = s.begin(); it != s.end(); ++it)
   {
      switch (*it)
      {
         case '"':
         case '\\':
            out += '\\';
            out += *it;
            break;
         case '\r':
            out += "\\r";
            break;
         case '\n':
            out += "\\n";
            break;
         default:
            out += *it;
      }
   }

   out += '"';

   return out;
} // eo escape(const std::string&)


// Extract and decode an escaped string (see escape()) which starts at s[startpos].
// s:        the text containing the escaped string
// startpos: index of the opening double quote
// endpos:   [out] set to startpos + (offset of the closing quote behind the opening quote) + 1
// Returns the decoded content between the quotes: \r and \n become carriage return
// and line feed, every other backslash is dropped and the following character is kept.
// Throws out_of_range if there is no double quote at startpos (this includes a
// startpos outside of s) or if the content ends with a single backslash.
std::string descape(const std::string &s, int startpos, int &endpos)
{
   if (s.at(startpos) != '"')
      throw std::out_of_range("value not type escaped string");

   std::string out = s.substr(startpos+1);

   // search for the closing quote, i.e. the first quote which is not escaped
   std::string::size_type quote = 0;
   while ( (quote = out.find('"', quote) ) != std::string::npos)
   {
      // count the backslashes directly in front of the quote:
      // an odd number means that the quote itself is escaped
      std::string::size_type backslashes = 0;
      while (backslashes < quote && out[quote - 1 - backslashes] == '\\')
         ++backslashes;

      if (backslashes % 2 == 0)
         break;

      ++quote;
   }

   // cut off everything starting with the closing quote
   out = out.substr(0, quote);

   // tell calling prog about the endposition
   endpos = startpos+quote+1;

   // decode all backslash sequences inside the string now
   for (std::string::size_type esc = out.find('\\');
        esc != std::string::npos;
        esc = out.find('\\', esc + 1) )
   {
      const char escaped_char = out.at(esc+1);
      if (escaped_char == 'r')
         out.replace(esc, 2, "\r");
      else if (escaped_char == 'n')
         out.replace(esc, 2, "\n");
      else
         out.erase(esc, 1);   // keep the escaped character itself
   }

   return out;
} // eo descape(const std::string&,int,int&)


// Return the input enclosed in single quotes; every single quote inside the
// input is replaced by the sequence '\'' (close the quoting, an escaped
// single quote, reopen the quoting).
std::string escape_shellarg(const std::string &input)
{
   std::string output(1, '\'');

   for (std::string::size_type idx = 0; idx < input.size(); ++idx)
   {
      const char ch = input[idx];
      if (ch == '\'')
         output.append("'\\''");
      else
         output.push_back(ch);
   }

   output.push_back('\'');
   return output;
}