// [libi2ncommon] / src / stringfunc.cpp

/** @file
 *
 * (c) Copyright 2007-2008 by Intra2net AG
 * 
 * info@intra2net.com
 */

#include <iostream>
#include <string>
#include <sstream>
#include <stdexcept>

#include <wchar.h>
#include <stdlib.h>
#include <iconv.h>
#include <i18n.h>

#include <stringfunc.hxx>

using namespace std;

namespace i2n {


namespace {

/**
 * digit tables for the hex conversions; the position of a character within
 * the table is the value of the nibble it stands for.
 */
const std::string hexDigitsLower("0123456789abcdef");
const std::string hexDigitsUpper("0123456789ABCDEF");


/**
 * @brief functor converting a single character to upper case.
 *
 * The character is handed to std::toupper() as unsigned char: passing a
 * negative value (every byte above 0x7f where char is a signed type) to the
 * character classification functions is undefined behaviour.
 */
struct UpperFunc
{
    char operator() (char c) const
    {
        return static_cast<char>( std::toupper( static_cast<unsigned char>(c) ) );
    }
}; // eo struct UpperFunc


/**
 * @brief functor converting a single character to lower case.
 *
 * See UpperFunc for the reason of the detour via unsigned char.
 */
struct LowerFunc
{
    char operator() (char c) const
    {
        return static_cast<char>( std::tolower( static_cast<unsigned char>(c) ) );
    }
}; // eo struct LowerFunc


} // eo namespace <anonymous>


/**
 * @brief default list of whitespace characters: blank, tab, carriage return and line feed.
 *
 * Within this file it is used by hexToBinary() to recognize the delimiters
 * between hex digit groups.
 * NOTE(review): presumably also the default character list of the trim functions;
 * the default arguments are not declared in this file - confirm against the header.
 */
const std::string whitespaces = " \t\r\n";

/**
 * @brief default list of line ending characters: carriage return and line feed.
 *
 * NOTE(review): not referenced within this file; presumably the default character
 * list of the chomp functions - confirm against the header.
 */
const std::string lineends= "\r\n";


/**
 * @brief tests whether a string starts with a given prefix.
 * @param str the string to examine.
 * @param prefix the prefix to look for.
 * @return @a true iff @a prefix is not empty and @a str starts with it.
 *
 * An empty prefix never matches.
 */
bool hasPrefix(const std::string& str, const std::string& prefix)
{
    const std::string::size_type prefix_len= prefix.size();
    // an empty prefix is defined not to match; a prefix longer than the string
    // cannot match (which also rules out the empty string).
    const bool can_match= (prefix_len > 0) && (prefix_len <= str.size());
    return can_match && (str.compare(0, prefix_len, prefix) == 0);
} // eo hasPrefix(const std::string&,const std::string&)


/**
 * @brief tests whether a string ends with a given suffix.
 * @param str the string to examine.
 * @param suffix the suffix to look for.
 * @return @a true iff @a suffix is not empty and @a str ends with it.
 *
 * An empty suffix never matches.
 */
bool hasSuffix(const std::string& str, const std::string& suffix)
{
    const std::string::size_type suffix_len= suffix.size();
    if (suffix_len == 0 || suffix_len > str.size())
    {
        // nothing to look for, or the suffix does not even fit into the string
        return false;
    }
    const std::string::size_type tail_start= str.size() - suffix_len;
    return str.compare(tail_start, suffix_len, suffix) == 0;
} // eo hasSuffix(const std::string&,const std::string&)


/**
 * @brief strips characters of a given list from both ends of a string (in place).
 * @param[in,out] str the string which gets trimmed.
 * @param charlist the characters to strip from the beginning and the end of the string.
 * @return a copy of the trimmed string.
 *
 * Characters of @a charlist in the middle of the string are kept. A string which
 * consists of such characters only ends up empty.
 */
std::string trimMod(std::string& str, const std::string& charlist)
{
    const std::string::size_type first_kept= str.find_first_not_of(charlist);
    if (first_kept == std::string::npos)
    {
        // nothing worth keeping (or the string was empty to begin with)
        str.clear();
    }
    else
    {
        // there is at least one character to keep, so the reverse search succeeds.
        const std::string::size_type last_kept= str.find_last_not_of(charlist);
        // cut the tail first; that way first_kept stays a valid position.
        str.erase(last_kept + 1);
        str.erase(0, first_kept);
    }
    return str;
} // eo trimMod(std::string&,const std::string&)


/**
 * @brief drops the last character of a string if it is one of a given list (in place).
 * @param[in,out] str the string to modify.
 * @param what the characters which are removed when found at the end of the string.
 * @return a copy of the resulting string.
 *
 * At most one character is removed.
 */
std::string chompMod(std::string& str, const std::string& what)
{
    if (!str.empty() && !what.empty())
    {
        const std::string::size_type last= str.size() - 1;
        if (what.find( str[last] ) != std::string::npos)
        {
            str.erase(last);
        }
    }
    return str;
} // eo chompMod(std::string&,const std::string&)


/**
 * @brief converts a string to lower case.
 * @param[in,out] str the string to modify.
 * @return the string
 */
std::string lowerMod(std::string& str)
{
    std::transform( str.begin(), str.end(), str.begin(), LowerFunc() );
    return str;
} // eo lowerMod(std::string&)


/**
 * @brief converts a string to upper case.
 * @param[in,out] str the string to modify.
 * @return the string
 */
std::string upperMod(std::string& str)
{
    std::transform( str.begin(), str.end(), str.begin(), UpperFunc() );
    return str;
} // eo upperMod(std::string&)


/**
 * @brief returns a copy of a string with characters of a given list stripped from both ends.
 * @param str the string to trim (left untouched).
 * @param charlist the characters to strip from the beginning and the end of the string.
 * @return the trimmed copy; empty if @a str consists of characters from @a charlist only.
 */
std::string trim(const std::string& str, const std::string& charlist)
{
    const std::string::size_type first= str.find_first_not_of(charlist);
    if (first != std::string::npos)
    {
        // at least one character survives, so the search from the end succeeds, too.
        const std::string::size_type last= str.find_last_not_of(charlist);
        return std::string(str, first, last - first + 1);
    }
    // only characters from the list (or nothing at all)
    return std::string();
} // eo trim(const std::string&,const std::string&)


/**
 * @brief returns a copy of a string without its last character if that one is in a given list.
 * @param str the string (left untouched).
 * @param what the characters which are removed when found at the end of the string.
 * @return the resulting string; at most one character shorter than @a str.
 */
std::string chomp(const std::string& str, const std::string& what)
{
    const bool removable= !str.empty()
                       && !what.empty()
                       && what.find( *str.rbegin() ) != std::string::npos;
    return removable ? str.substr(0, str.size() - 1) : str;
} // eo chomp(const std::string&,const std::string&)


/**
 * @brief creates a lower case copy of a string.
 * @param str the string (left untouched).
 * @return the lower case version of @a str.
 */
std::string lower(const std::string& str)
{
    std::string converted= str;
    lowerMod(converted);
    return converted;
} // eo lower(const std::string&)


/**
 * @brief creates an upper case copy of a string.
 * @param str the string (left untouched).
 * @return the upper case version of @a str.
 */
std::string upper(const std::string& str)
{
    std::string converted= str;
    upperMod(converted);
    return converted;
} // eo upper(const std::string&)


/**
 * @brief returns a string without a given suffix.
 * @param str the string.
 * @param suffix the suffix to cut off.
 * @return @a str shortened by @a suffix if it ends with it; an unmodified copy otherwise.
 *
 * Whether the string ends with the suffix is decided by hasSuffix().
 */
std::string removeSuffix(const std::string& str, const std::string& suffix)
{
    if (!hasSuffix(str,suffix))
    {
        return str;
    }
    const std::string::size_type kept= str.size() - suffix.size();
    return std::string(str, 0, kept);
} // eo removeSuffix(const std::string&,const std::string&)


/**
 * @brief returns a string without a given prefix.
 * @param str the string.
 * @param prefix the prefix to cut off.
 * @return @a str without the leading @a prefix if it begins with it; an unmodified copy otherwise.
 *
 * Whether the string begins with the prefix is decided by hasPrefix().
 */
std::string removePrefix(const std::string& str, const std::string& prefix)
{
    if (!hasPrefix(str,prefix))
    {
        return str;
    }
    // everything behind the prefix
    return std::string(str, prefix.size());
} // eo removePrefix(const std::string&,const std::string&)


/**
 * @brief splits a string into a key and a value at the first occurrence of a delimiter.
 * @param str the string which should be split.
 * @param[out] key receives the part in front of the delimiter.
 * @param[out] value receives the part behind the delimiter.
 * @param delimiter the character separating key and value.
 * @return @a true if the delimiter was found and the string was split.
 *
 * Key and value are passed through trimMod() (using its default character list)
 * before they are returned. If the delimiter does not occur in @a str, @a key and
 * @a value are left untouched.
 */
bool pairSplit(
    const std::string& str,
    std::string& key,
    std::string& value,
    char delimiter)
{
    const std::string::size_type delim_pos = str.find(delimiter);
    if (delim_pos != std::string::npos)
    {
        key= str.substr(0, delim_pos);
        value= str.substr(delim_pos + 1);
        trimMod(key);
        trimMod(value);
        return true;
    }
    return false;
} // eo pairSplit(const std::string&,std::string&,std::string&,char)


/**
 * @brief splits a string at each occurrence of a delimiter.
 *
 * @param[in] str the string which should be split.
 * @param[out] result the list the parts are appended to (existing entries are kept).
 * @param[in] delimiter the delimiter (word/phrase) at which @a str should be split.
 * @param[in] omit_empty should empty parts not be stored?
 * @param[in] trim_list list of characters the parts should be trimmed by.
 *  (empty string results in no trim)
 *
 * Trimming happens before the check for empty parts. An empty @a str yields no part
 * at all. If @a str ends with the delimiter and @a omit_empty is not set, a final empty
 * part is appended; this keeps the result consistent with a join operation.
 *
 * An empty @a delimiter cannot split anything: the (trimmed) string is stored as a single
 * part. (Searching for an empty delimiter matches everywhere without consuming input,
 * which used to result in an endless loop.)
 */
void splitString(
    const std::string& str,
    std::list<std::string>& result,
    const std::string& delimiter,
    bool omit_empty,
    const std::string& trim_list
)
{
    if (str.empty())
    {
        // nothing to split; not even an empty part is produced.
        return;
    }
    if (delimiter.empty())
    {
        // find() reports an empty needle at every position and the scan below would
        // never advance; treat the whole string as the one and only part instead.
        std::string whole(str);
        if (!trim_list.empty()) trimMod(whole, trim_list);
        if (!omit_empty || !whole.empty())
        {
            result.push_back( whole );
        }
        return;
    }

    std::string::size_type part_begin= 0;
    bool ends_with_delimiter= false;
    while (part_begin < str.size())
    {
        const std::string::size_type delim_pos= str.find(delimiter, part_begin);
        const bool delimiter_found= (delim_pos != std::string::npos);

        std::string part= delimiter_found
            ? str.substr(part_begin, delim_pos - part_begin)
            : str.substr(part_begin);

        // continue behind the delimiter; without a delimiter this was the last part.
        part_begin= delimiter_found ? delim_pos + delimiter.size() : str.size();
        // when the loop ends, this tells whether the last thing seen was a delimiter:
        ends_with_delimiter= delimiter_found;

        if (!trim_list.empty()) trimMod(part, trim_list);
        if (omit_empty && part.empty()) continue;
        result.push_back( part );
    }
    // if the string ends with a delimiter we need to append an empty string if no omit_empty
    // was given.
    // (this way we keep the split result consistent to a join operation)
    if (ends_with_delimiter && !omit_empty)
    {
        result.push_back("");
    }
} // eo splitString(const std::string&,std::list< std::string >&,const std::string&,bool,const std::string&)


/**
 * @brief splits a string at each occurrence of a delimiter and returns the parts.
 * @param str the string which should be split.
 * @param delimiter the delimiter (word/phrase) at which @a str should be split.
 * @param[in] omit_empty should empty parts not be stored?
 * @param[in] trim_list list of characters the parts should be trimmed by.
 *  (empty string results in no trim)
 * @return the list of parts.
 *
 * Convenience variant of the splitString() overload which fills a list passed by the caller.
 */
std::list<std::string> splitString(
    const std::string& str,
    const std::string& delimiter,
    bool omit_empty,
    const std::string& trim_list
)
{
    std::list<std::string> parts;
    splitString(str, parts, delimiter, omit_empty, trim_list);
    return parts;
} // eo splitString(const std::string&,const std::string&,bool,const std::string&)


/**
 * @brief concatenates a list of strings, separated by a delimiter.
 *
 * This function is (basically) the reverse operation of @a splitString.
 *
 * @param parts the list of strings.
 * @param delimiter the delimiter which is inserted between two neighbouring strings.
 * @return the joined string; empty if @a parts is empty.
 */
std::string joinString(
    const std::list< std::string >& parts,
    const std::string& delimiter
)
{
    std::string joined;
    for (std::list< std::string >::const_iterator part= parts.begin();
         part != parts.end();
         ++part)
    {
        // the delimiter goes between the parts only, not in front of the first one.
        if (part != parts.begin())
        {
            joined.append(delimiter);
        }
        joined.append(*part);
    }
    return joined;
} // eo joinString(const std::list< std::string >&,const std::string&)


/*
** conversions
*/


/**
 * @brief returns a hex string from a binary string.
 * @param str the (binary) string
 * @param upper_case_digits determine whether to use upper case characters for digits A-F.
 * @return the string in hex notation.
 */
std::string binaryToHex(
    const std::string& str,
    bool upper_case_digits
)
{
    std::string result;
    std::string hexDigits( upper_case_digits ? hexDigitsUpper : hexDigitsLower);
    for(std::string::const_iterator it= str.begin();
        it != str.end();
        ++it)
    {
        result.push_back( hexDigits[ ((*it) >> 4) & 0x0f ] );
        result.push_back( hexDigits[ (*it) & 0x0f ] );
    }
    return result;
} // eo binaryToHex(const std::string&,bool)


/**
 * @brief converts a hex digit string to binary string.
 * @param str hex digit string
 * @return the binary string.
 * @throw std::runtime_error if @a str contains a character which is neither a hex
 *  digit (lower or upper case) nor a character from @a whitespaces nor a colon.
 *
 * The hex digit string may contain white spaces or colons which are treated
 * as delimiters between hex digit groups.
 *
 * Two consecutive hex digits form one byte (high nibble first). A single digit
 * which is directly followed by a delimiter is stored as the low nibble of a byte
 * (e.g. "a:" results in 0x0a). A single digit at the very end of the string is
 * stored as the high nibble (e.g. "abc" results in 0xab 0xc0).
 *
 * @todo rework the handling of half nibbles (consistency)!
 */
std::string hexToBinary(
    const std::string& str
)
throw(std::runtime_error)
{
    std::string result;
    char c= 0;                 // the byte which is currently assembled
    bool hasNibble= false;     // true while c holds the first digit of a byte only
    bool lastWasWS= true;      // true iff the previously handled character was a delimiter
    for(std::string::const_iterator it= str.begin();
        it != str.end();
        ++it)
    {
        // the position within the digit table is the value of the digit:
        std::string::size_type p = hexDigitsLower.find( *it );
        if (p== std::string::npos)
        {
            p= hexDigitsUpper.find( *it );
        }
        if (p == std::string::npos)
        {
            if (   ( whitespaces.find( *it ) != std::string::npos) // is it a whitespace?
                or ( *it == ':') // or a colon?
               )
            {
                // we treat that as a valid delimiter:
                if (hasNibble)
                {
                    // a single nibble in front of a delimiter is treated as lower part:
                    result.push_back(c);
                    // reset state:
                    hasNibble= false;
                }
                lastWasWS= true;
                continue;
            }
        }
        if (p == std::string::npos )
        {
            // neither a hex digit nor a delimiter
            throw runtime_error("illegal character in hex digit string: " + str);
        }
        lastWasWS= false;
        if (hasNibble)
        {
            // second digit: move the first one to the upper half
            c<<=4;
        }
        else
        {
            c=0;
        }
        c+= (p & 0x0f);
        if (hasNibble)
        {
            // we already had a nibble, so a char is complete now:
            result.push_back( c );
            hasNibble=false;
        }
        else
        {
            // this is the first nibble of a new char:
            hasNibble=true;
        }
    }
    if (hasNibble)
    {
        // well, there is one nibble left
        // let's do some heuristics:
        // NOTE(review): every hex digit clears lastWasWS and every delimiter clears
        // hasNibble, so lastWasWS is always false when this point is reached with a
        // pending nibble; the first branch below is never taken with the current logic.
        if (lastWasWS)
        {
            // if the preceding character was a white space (or a colon)
            // we treat the nibble as lower part:
            // (this is consistent with shortened hex notations where leading zeros are not noted)
            result.push_back( c );
        }
        else
        {
            // if it was part of a hex digit chain, we treat it as UPPER part (!!)
            result.push_back( c << 4 );
        }
    }
    return result;
} // eo hexToBinary(const std::string&)


} // eo namespace i2n

namespace {

/**
 * @brief owns an iconv conversion descriptor and closes it when going out of scope.
 *
 * The descriptor is opened by the constructor and closed by the destructor, so it is
 * released on every way out of a conversion function, including the throwing ones.
 */
class IconvDescriptor
{
public:
    /**
     * @param to_code name of the target encoding (passed to iconv_open()).
     * @param from_code name of the source encoding (passed to iconv_open()).
     * @param error_text message of the exception thrown when the conversion is unavailable.
     * @throw std::runtime_error if iconv_open() fails.
     */
    IconvDescriptor(const char *to_code, const char *from_code, const char *error_text)
        : Handle( iconv_open(to_code, from_code) )
    {
        // (iconv_t)-1 is the failure value of iconv_open()
        if (Handle == (iconv_t)-1)
            throw std::runtime_error(error_text);
    }

    ~IconvDescriptor()
    {
        iconv_close(Handle);
    }

    iconv_t get() const
    {
        return Handle;
    }

private:
    // not copyable: a descriptor has exactly one owner
    IconvDescriptor(const IconvDescriptor&);
    IconvDescriptor& operator=(const IconvDescriptor&);

    iconv_t Handle;
};

/**
 * @brief passes the input pointer to iconv() regardless of the prototype in use.
 *
 * Depending on the C library, the input buffer parameter of iconv() is declared as
 * "char **" or as "const char **". The two conversion operators let the compiler pick
 * whichever the local prototype asks for.
 */
class IconvInput
{
public:
    explicit IconvInput(const char **input) : Input(input) {}
    operator const char**() const { return Input; }
    operator char**() const { return const_cast<char**>(Input); }

private:
    const char **Input;
};

/**
 * @brief converts a string from one encoding to another using iconv.
 * @param to_code name of the target encoding.
 * @param from_code name of the source encoding.
 * @param open_error message of the exception thrown when the conversion is unavailable.
 * @param input the string to convert.
 * @param max_growth size of the output buffer as a multiple of the input size.
 * @return the converted string.
 * @throw std::runtime_error if the conversion is not available or the buffer
 *  cannot be allocated.
 *
 * The conversion is best effort: the return value of iconv() is not evaluated and
 * whatever was written to the output buffer is returned. The result is built from
 * the number of bytes written, so NUL bytes within the output do not cut it short.
 */
std::string convert_encoding(
    const char *to_code,
    const char *from_code,
    const char *open_error,
    const std::string& input,
    size_t max_growth)
{
    IconvDescriptor converter(to_code, from_code, open_error);

    size_t in_left = input.size();
    const size_t capacity = in_left * max_growth;
    if (capacity == 0)
        return std::string();

    std::string buffer;
    try
    {
        buffer.resize(capacity);
    }
    catch (const std::exception&)
    {
        throw std::runtime_error("out of memory for iconv buffer");
    }

    const char *in = input.data();
    char *out = &buffer[0];
    size_t out_left = capacity;
    iconv(converter.get(), IconvInput(&in), &in_left, &out, &out_left);

    // Stateful target encodings (like UTF-7) may need a closing sequence to get back
    // to their initial shift state; for stateless encodings this call writes nothing.
    iconv(converter.get(), NULL, NULL, &out, &out_left);

    buffer.resize(capacity - out_left);
    return buffer;
}

} // eo namespace <anonymous>

/**
 * @brief converts a string from ISO-8859-1 to UTF-8.
 * @param isostring the ISO-8859-1 encoded string.
 * @return the UTF-8 encoded string.
 * @throw std::runtime_error if iconv does not offer the conversion or memory runs out.
 */
std::string iso_to_utf8(const std::string& isostring)
{
    return convert_encoding("UTF-8", "ISO-8859-1",
                            "iconv can't convert from ISO-8859-1 to UTF-8",
                            isostring, 4);
}

/**
 * @brief converts a string from UTF-8 to ISO-8859-1.
 * @param utf8string the UTF-8 encoded string.
 * @return the ISO-8859-1 encoded string. The conversion is best effort: iconv errors
 *  are not reported, the part converted so far is returned.
 * @throw std::runtime_error if iconv does not offer the conversion or memory runs out.
 */
std::string utf8_to_iso(const std::string& utf8string)
{
    // the output buffer is as large as the input
    return convert_encoding("ISO-8859-1", "UTF-8",
                            "iconv can't convert from UTF-8 to ISO-8859-1",
                            utf8string, 1);
}

/**
 * @brief converts a UTF-8 string to a zero terminated buffer of wide characters.
 * @param utf8string the UTF-8 encoded string.
 * @return buffer allocated with malloc(); the caller has to release it with free().
 * @throw std::runtime_error if iconv does not offer the conversion, memory runs out
 *  or the conversion fails.
 *
 * The target encoding is "UCS-4LE".
 * NOTE(review): this assumes wchar_t is a 4 byte little endian type - confirm for
 * every platform this is built for.
 */
wchar_t* utf8_to_wbuf(const std::string& utf8string)
{
    IconvDescriptor converter("UCS-4LE", "UTF-8",
                              "iconv can't convert from UTF-8 to UCS-4");

    size_t in_left = utf8string.size();
    // one wide character per input byte is the upper limit; plus one for the terminator
    const size_t capacity = (in_left + 1) * sizeof(wchar_t);
    size_t out_left = capacity;

    wchar_t *buf = static_cast<wchar_t*>( malloc(capacity) );
    if (buf == NULL)
        throw std::runtime_error("out of memory for iconv buffer");

    const char *in = utf8string.c_str();
    char *out = reinterpret_cast<char*>(buf);
    if (iconv(converter.get(), IconvInput(&in), &in_left, &out, &out_left) == (size_t)-1)
    {
        // do not leak the buffer; the descriptor is closed by its owner
        free(buf);
        throw std::runtime_error("error converting char encodings");
    }

    buf[(capacity - out_left) / sizeof(wchar_t)] = 0;

    return buf;
}

/**
 * @brief converts a string from UTF-7-IMAP (modified UTF-7 as used by IMAP) to UTF-8.
 * @param utf7imapstring the UTF-7-IMAP encoded string.
 * @return the UTF-8 encoded string. The conversion is best effort: iconv errors
 *  are not reported, the part converted so far is returned.
 * @throw std::runtime_error if iconv does not offer the conversion or memory runs out.
 */
std::string utf7imap_to_utf8(const std::string& utf7imapstring)
{
    return convert_encoding("UTF-8", "UTF-7-IMAP",
                            "iconv can't convert from UTF-7-IMAP to UTF-8",
                            utf7imapstring, 4);
}

/**
 * @brief converts a string from UTF-8 to UTF-7-IMAP (modified UTF-7 as used by IMAP).
 * @param utf8string the UTF-8 encoded string.
 * @return the UTF-7-IMAP encoded string.
 * @throw std::runtime_error if iconv does not offer the conversion or memory runs out.
 */
std::string utf8_to_utf7imap(const std::string& utf8string)
{
    // UTF-7 is base64 encoded, a buffer 10x as large
    // as the utf-8 buffer should be enough. If not the string will be truncated.
    return convert_encoding("UTF-7-IMAP", "UTF-8",
                            "iconv can't convert from UTF-8 to UTF-7-IMAP",
                            utf8string, 10);
}

// Split a string into (html) tags and the text between them.
// Every token is appended to "tokenized" as a pair: first is the text of the
// token, second tells whether it is a complete tag ("<" up to the next ">").
// A "<" always starts a new token; a tag which is never closed is stored as non-tag.
void tokenize_by_tag(std::vector<std::pair<std::string,bool> > &tokenized, const std::string &input)
{
    std::string token;
    bool in_tag = false;

    for (std::string::const_iterator ch = input.begin(); ch != input.end(); ++ch) {
        const bool opens_tag = (*ch == '<');
        const bool closes_tag = !opens_tag && in_tag && (*ch == '>');

        if (opens_tag && !token.empty()) {
            // whatever was collected up to here ends in front of the "<"
            tokenized.push_back(std::make_pair(token, false));
            token.clear();
        }

        token += *ch;

        if (opens_tag) {
            in_tag = true;
        } else if (closes_tag) {
            // the ">" completes the tag
            in_tag = false;
            tokenized.push_back(std::make_pair(token, true));
            token.clear();
        }
    }

    // Something left which was not terminated by a tag?
    if (!token.empty())
        tokenized.push_back(std::make_pair(token, false));
}

// Remove all (html) tags from a string; only the text between the tags is kept.
// Tags are detected by tokenize_by_tag().
std::string strip_html_tags(const std::string &input)
{
    // Pair first: string, second: isTag
    typedef std::vector<std::pair<std::string,bool> > TokenList;

    TokenList tokens;
    tokenize_by_tag(tokens, input);

    std::string text_only;
    for (TokenList::const_iterator it = tokens.begin(); it != tokens.end(); ++it) {
        const bool is_tag = it->second;
        if (is_tag)
            continue;

        text_only += it->first;
    }

    return text_only;
}

// Smart-encode HTML entities: the text between (html) tags is passed
// through html_entities(), the tags themselves are copied unchanged.
std::string smart_html_entities(const std::string &input)
{
    // Pair first: string, second: isTag
    typedef std::vector<std::pair<std::string,bool> > TokenList;

    TokenList tokens;
    tokenize_by_tag(tokens, input);

    std::string encoded;
    for (TokenList::const_iterator it = tokens.begin(); it != tokens.end(); ++it) {
        const bool is_tag = it->second;
        // keep HTML tags as they are, encode everything else
        encoded += is_tag ? it->first : html_entities(it->first);
    }

    return encoded;
}

// Find the first character outside the 7 bit range (value above 127).
// Returns its position or string::npos if there is no such character.
std::string::size_type find_8bit(const std::string &str)
{
    std::string::size_type pos = 0;

    // skip everything which does not have the highest bit set
    while (pos < str.size() && (str[pos] & 0x80) == 0)
        ++pos;

    return (pos < str.size()) ? pos : std::string::npos;
}

// Encode a UTF-8 string for use in HTML:
// - &, ", < and > as well as the german umlauts and the sharp s become named entities
// - every other non-ASCII character becomes a numeric entity (&#<code>;)
std::string html_entities(std::string str)
{
    // Applied in exactly this order: "&" has to be handled first, the
    // entities inserted by the other replacements contain "&" themselves.
    static const char * const named_entities[][2] = {
        // Normal chars
        { "&", "&amp;" },
        { "\"", "&quot;" },
        { "<", "&lt;" },
        { ">", "&gt;" },
        // Umlauts
        { "\xC3\xA4", "&auml;" },
        { "\xC3\xB6", "&ouml;" },
        { "\xC3\xBC", "&uuml;" },
        { "\xC3\x84", "&Auml;" },
        { "\xC3\x96", "&Ouml;" },
        { "\xC3\x9C", "&Uuml;" },
        // Misc
        { "\xC3\x9F", "&szlig;" }
    };
    const size_t entity_count = sizeof(named_entities) / sizeof(named_entities[0]);

    for (size_t i = 0; i < entity_count; ++i)
        replace_all (str, named_entities[i][0], named_entities[i][1]);

    // Only pure ASCII left? Then the (expensive) conversion below is not needed.
    if (find_8bit(str) == std::string::npos)
        return str;

    // convert to fixed-size encoding UTF-32
    wchar_t *wide = utf8_to_wbuf(str);
    std::ostringstream encoded;

    // replace all non-ASCII chars with HTML representation
    for (const wchar_t *cur = wide; *cur != 0; ++cur)
    {
        const unsigned int code = *cur;

        if (code > 127)
            encoded << "&#" << code << ';';
        else
            encoded << static_cast<unsigned char>(code);
    }

    // the buffer returned by utf8_to_wbuf() has to be released with free()
    free(wide);

    return encoded.str();
}

bool replace_all(string &base, const char *ist, const char *soll)
{
    string i=ist;
    string s=soll;
    return replace_all(base,&i,&s);
}

bool replace_all(string &base, const string &ist, const char *soll)
{
    string s=soll;
    return replace_all(base,&ist,&s);
}

// Convenience overload: search text and replacement given as pointers to
// strings; both pointers are dereferenced without a check.
// See replace_all(string&, const string&, const string&).
bool replace_all(std::string &base, const std::string *ist, const std::string *soll)
{
    const std::string &search = *ist;
    const std::string &replacement = *soll;
    return replace_all(base, search, replacement);
}

bool replace_all(string &base, const char *ist, const string *soll)
{
    string i=ist;
    return replace_all(base,&i,soll);
}

// Replace every occurrence of "ist" within "base" by "soll" (in place).
// The search continues behind the inserted text, so a replacement which
// contains the search text is not replaced again.
// Returns true if at least one occurrence was replaced.
// Throws runtime_error if the search text is empty.
bool replace_all(std::string &base, const std::string &ist, const std::string &soll)
{
    if (ist.empty())
        throw std::runtime_error("replace_all called with empty search string");

    bool replaced = false;

    for (std::string::size_type hit = base.find(ist, 0);
         hit != std::string::npos;
         hit = base.find(ist, hit + soll.size()))
    {
        base.replace(hit, ist.size(), soll);
        replaced = true;
    }

    return replaced;
}

// Return a lower case copy of a string.
// The conversion is done character by character with tolower(), so only
// single byte characters known to the current C locale are converted.
std::string to_lower(const std::string &src)
{
    std::string dst = src;

    for (std::string::iterator it = dst.begin(); it != dst.end(); ++it) {
        // tolower() must not be called with a negative value (which is what a
        // byte above 0x7f is if char is signed), so go via unsigned char.
        *it = static_cast<char>(std::tolower(static_cast<unsigned char>(*it)));
    }

    return dst;
}

// Return an upper case copy of a string.
// The conversion is done character by character with toupper(), so only
// single byte characters known to the current C locale are converted.
std::string to_upper(const std::string &src)
{
    std::string dst = src;

    for (std::string::iterator it = dst.begin(); it != dst.end(); ++it) {
        // toupper() must not be called with a negative value (which is what a
        // byte above 0x7f is if char is signed), so go via unsigned char.
        *it = static_cast<char>(std::toupper(static_cast<unsigned char>(*it)));
    }

    return dst;
}

// Format a number of bytes for display, e.g. as "1.50 KBytes".
// The value is divided by 1000 as long as it is larger than 1000, rounded
// to one decimal place and printed with two decimal places followed by the
// (translated) unit.
std::string nice_unit_format (int input) {
    float scaled = input;
    int magnitude = 0;

    // number of divisions = index of the unit
    for (; scaled > 1000; ++magnitude)
        scaled = scaled / 1000;

    // round to one decimal place
    float tenths = scaled * 10;
    tenths += 0.5;
    scaled = float(int(tenths)) / float(10);

    std::ostringstream out;

    out.setf (std::ios::fixed);
    out.precision(2);
    out << scaled;

    switch (magnitude) {
    case 1:
        out << i18n(" KBytes");
        break;
    case 2:
        out << i18n(" MBytes");
        break;
    case 3:
        out << i18n(" Gbytes");
        break;
    default:
        out << i18n(" Bytes");
        break;
    }

    return out.str();
}

// Put a string into double quotes and escape its content:
// - a backslash is put in front of every double quote and backslash
// - carriage return becomes \r, line feed becomes \n (two characters each)
// descape() is the reverse operation.
std::string escape(const std::string &s)
{
    std::string quoted("\"");

    for (std::string::const_iterator ch = s.begin(); ch != s.end(); ++ch)
    {
        switch (*ch)
        {
        case '"':
        case '\\':
            quoted += '\\';
            quoted += *ch;
            break;
        case '\r':
            quoted += "\\r";
            break;
        case '\n':
            quoted += "\\n";
            break;
        default:
            quoted += *ch;
        }
    }

    quoted += '"';

    return quoted;
}

// Extract a quoted and escaped string (see escape()) which starts at position
// "startpos" of "s" and remove the quotes and the escaping.
// "endpos" receives the position of the closing quote within "s".
// Throws out_of_range if there is no double quote at "startpos" (or "startpos"
// is outside of the string).
// If there is no unescaped closing quote, the rest of the string is taken.
std::string descape(const std::string &s, int startpos, int &endpos)
{
    if (s.at(startpos) != '"')
        throw std::out_of_range("value not type escaped string");

    // everything behind the opening quote
    std::string out(s, startpos + 1);

    // search for the closing quote: the first quote which is not escaped
    std::string::size_type closing = 0;
    for (;;)
    {
        closing = out.find('"', closing);
        if (closing == std::string::npos)
            break;

        // count the backslashes directly in front of the quote; an odd number
        // means the quote itself is escaped (pairs of backslashes are escaped
        // backslashes)
        std::string::size_type backslashes = 0;
        while (backslashes < closing && out[closing - backslashes - 1] == '\\')
            ++backslashes;

        if (backslashes % 2 == 0)
            break;

        ++closing;
    }

    // tell calling prog about the endposition
    endpos = startpos + closing + 1;

    // cut off the closing quote and everything behind it
    if (closing != std::string::npos)
        out.erase(closing);

    // descape all \ stuff inside the string now; each escape sequence is
    // replaced by a single character, the search continues behind it
    for (std::string::size_type esc = out.find('\\', 0);
         esc != std::string::npos;
         esc = out.find('\\', esc + 1))
    {
        const char code = out.at(esc + 1);

        if (code == 'r')
            out.replace(esc, 2, "\r");
        else if (code == 'n')
            out.replace(esc, 2, "\n");
        else
            out.erase(esc, 1);
    }

    return out;
}

// Quote a string for use as a single argument on a shell command line:
// the string is put into single quotes, every single quote inside of it
// is replaced by '\'' (close quoting, escaped quote, reopen quoting).
std::string escape_shellarg(const std::string &input)
{
    std::string quoted(1, '\'');

    for (std::string::size_type i = 0; i < input.size(); ++i) {
        if (input[i] == '\'')
            quoted.append("'\\''");
        else
            quoted.push_back(input[i]);
    }

    quoted.push_back('\'');
    return quoted;
}