Create functions find/remove_html_comments

[libi2ncommon] / src / stringfunc.cpp
diff --git a/src/stringfunc.cpp b/src/stringfunc.cpp

index f9c7d95..63424bc 100644 (file)
--- a/src/stringfunc.cpp
+++ b/src/stringfunc.cpp
@@ -1,8 +1,25 @@
+/*
+The software in this package is distributed under the GNU General
+Public License version 2 (with a special exception described below).
+
+A copy of GNU General Public License (GPL) is included in this distribution,
+in the file COPYING.GPL.
+
+As a special exception, if other files instantiate templates or use macros
+or inline functions from this file, or you compile this file and link it
+with other works to produce a work based on this file, this file
+does not by itself cause the resulting work to be covered
+by the GNU General Public License.
+
+However the source code for this file must still be made available
+in accordance with section (3) of the GNU General Public License.
+
+This exception does not invalidate any other reasons why a work based
+on this file might be covered by the GNU General Public License.
+*/
 /** @file
  *
  * (c) Copyright 2007-2008 by Intra2net AG
- *
- * info@intra2net.com
  */
 
 #include <iostream>
@@ -10,12 +27,16 @@
 #include <sstream>
 #include <stdexcept>
 #include <algorithm>
+#include <cmath>    // for round()
 
 #include <wchar.h>
 #include <stdlib.h>
 #include <iconv.h>
 #include <i18n.h>
 
+#include <boost/numeric/conversion/cast.hpp>
+#include <boost/foreach.hpp>
+
 #include <stringfunc.hxx>
 
 using namespace std;
@@ -411,6 +432,27 @@ std::string join_string(
 } // eo join_string(const std::list< std::string >&,const std::string&)
 
 
+/** @brief Vector overload of join_string: concatenates all strings in @a parts, placing @a delimiter between consecutive elements; returns an empty string if @a parts is empty. */
+std::string join_string(
+   const std::vector< std::string >& parts,
+   const std::string& delimiter
+)
+{
+   std::string result;
+   if (! parts.empty() )
+   {
+      std::vector< std::string >::const_iterator it= parts.begin();
+      result = *it;   // first element is copied without a leading delimiter
+      while ( ++it != parts.end() )
+      {
+         result+= delimiter;
+         result+= *it;
+      }
+   }
+   return result;
+} // eo join_string(const std::vector< std::string >&,const std::string&)
+
+
 
 /*
 ** conversions
@@ -615,7 +657,7 @@ wchar_t* utf8_to_wbuf(const std::string& utf8string)
 
    char *in = (char *)utf8string.c_str();
    char *out = (char*) buf;
-   if (iconv(utf82wstr, &in, &in_size, &out, &out_size) == -1)
+   if (iconv(utf82wstr, &in, &in_size, &out, &out_size) == (size_t)-1)
       throw runtime_error("error converting char encodings");
 
    buf[ ( (utf8string.size()+1)*sizeof(wchar_t)-out_size) /sizeof(wchar_t) ]=0;
@@ -736,7 +778,7 @@ std::string strip_html_tags(const std::string &input)
 
    string output;
    vector<pair<string,bool> >::const_iterator token, tokens_end = tokenized.end();
-   for (token = tokenized.begin(); token != tokens_end; token++)
+   for (token = tokenized.begin(); token != tokens_end; ++token)
       if (!token->second)
          output += token->first;
 
@@ -753,7 +795,7 @@ string smart_html_entities(const std::string &input)
 
    string output;
    vector<pair<string,bool> >::const_iterator token, tokens_end = tokenized.end();
-   for (token = tokenized.begin(); token != tokens_end; token++)
+   for (token = tokenized.begin(); token != tokens_end; ++token)
    {
       // keep HTML tags as they are
       if (token->second)
@@ -825,6 +867,125 @@ string html_entities(std::string str)
    return str;
 } // eo html_entities(std::string)
 
+// Convert HTML entities in str to plain ASCII-7 text for a basic text console (umlauts are transliterated, e.g. "&auml;" -> "ae"); returns the converted copy
+string html_entities_to_console(std::string str)
+{
+   // Normal chars ("&amp;" is intentionally handled at the very end, see below)
+   replace_all (str, "&lt;", "<");
+   replace_all (str, "&gt;", ">");
+   replace_all (str, "&quot;", "\"");
+   replace_all (str, "&#x27;", "'");
+   replace_all (str, "&#x2F;", "/");
+
+   // Umlauts
+   replace_all (str, "&auml;", "ae");
+   replace_all (str, "&ouml;", "oe");
+   replace_all (str, "&uuml;", "ue");
+   replace_all (str, "&Auml;", "Ae");
+   replace_all (str, "&Ouml;", "Oe");
+   replace_all (str, "&Uuml;", "Ue");
+
+   // Misc
+   replace_all (str, "&szlig;", "ss");
+   replace_all (str, "&amp;", "&");   // must be last: otherwise "&amp;lt;" would be decoded twice and end up as "<" instead of "&lt;"
+
+   return str;
+}
+
+// Convenience wrapper: strips HTML comments from str in place by running find_html_comments and then remove_html_comments(str, comments)
+void remove_html_comments(string &str)
+{
+    vector<CommentZone> comments;
+    find_html_comments(str, comments);
+    remove_html_comments(str, comments);
+}
+
+// Find all HTML comments in str and append one CommentZone(start index, end index) per comment to comments.
+// Nested comments are paired via a stack of start tags; text inside the tags ("<!--FOO .... BAR-->") is not interpreted.
+// For invalid comments ("-->" without an open "<!--", or "<!--" that is never closed) the unknown index of the
+// missing start/end tag is string::npos. Indices run from the start of "<!--" to the first index after "-->".
+void find_html_comments(const std::string &str, vector<CommentZone> &comments)
+{
+    static const string START = "<!--";
+    static const string CLOSE = "-->";
+    static const string::size_type START_LEN = START.length();
+    static const string::size_type CLOSE_LEN = CLOSE.length();
+
+    // in order to find nested comments, need either recursion or a stack
+    vector<string::size_type> starts;      // stack of start tags
+
+    string::size_type pos = 0;
+    string::size_type len = str.length();
+    string::size_type next_start, next_close;
+
+    while (pos < len)     // not really needed but just in case
+    {
+        next_start = str.find(START, pos);
+        next_close = str.find(CLOSE, pos);
+
+        if ( (next_start == string::npos) && (next_close == string::npos) )
+            break;   // we are done
+
+        else if ( (next_start == string::npos) || (next_close < next_start) )  // close one comment (pop)
+        {
+            if (starts.empty())    // closing tag without a start
+                comments.push_back(CommentZone(string::npos, next_close+CLOSE_LEN));
+            else
+            {
+                comments.push_back(CommentZone(starts.back(), next_close+CLOSE_LEN));
+                starts.pop_back();
+            }
+            pos = next_close + CLOSE_LEN;   // continue searching right after the closing tag
+        }
+
+        else if ( (next_close == string::npos) || (next_start < next_close) )  // start a new comment (push)
+        {
+            starts.push_back(next_start);
+            pos = next_start + START_LEN;   // continue searching right after the start tag
+        }
+    }
+
+    // add comments that have no closing tag from back to front (important for remove_html_comments!)
+    while (!starts.empty())
+    {
+        comments.push_back(CommentZone(starts.back(), string::npos));
+        starts.pop_back();
+    }
+}
+
+// Remove all HTML comments found by find_html_comments from str in place; comments holds the zones to erase (string::npos marks a missing start/end tag)
+void remove_html_comments(std::string &str, const vector<CommentZone> &comments)
+{
+    // remember position where last removal started
+    string::size_type last_removal_start = str.length();
+
+    // Go from back to front to not mess up indices.
+    // This requires that bigger comments, that contain smaller comments, come AFTER
+    // the small contained comments in the comments vector (i.e. comments are ordered by
+    // their closing tag, not their opening tag). This is true for results from find_html_comments
+    BOOST_REVERSE_FOREACH(const CommentZone &comment, comments)
+    {
+        if (comment.first == string::npos)
+        {
+            str = str.replace(0, comment.second, "");   // comment starts "before" str --> delete from start
+            break;   // there can be no more
+        }
+        else if (comment.first >= last_removal_start)
+        {
+            continue;    // this comment is inside another comment that we have removed already
+        }
+        else if (comment.second == string::npos)   // comment ends "after" str --> delete until end
+        {
+            str = str.replace(comment.first, string::npos, "");
+            last_removal_start = comment.first;
+        }
+        else
+        {
+            str = str.replace(comment.first, comment.second-comment.first, "");   // regular comment: erase [first, second)
+            last_removal_start = comment.first;
+        }
+    }
+}
 
 bool replace_all(string &base, const char *ist, const char *soll)
 {
@@ -868,6 +1029,29 @@ bool replace_all(string &base, const string &ist, const string &soll)
    return found_ist;
 }
 
+/**
+ * @brief replaces all characters that could be problematic or pose a security risk when being logged
+ * @param str the original string (left unmodified; a copy is sanitized)
+ * @param replace_with the character to replace the unsafe chars with
+ * @return a string that is safe to send to syslog or other logfiles
+ *
+ * All chars between 0x20 (space) and 0x7E (~) (inclusive) are considered safe for logging.
+ * See e.g. RFC 5424, section 8.2 or the POSIX character class "printable".
+ * This eliminates all possible problems with NUL, control characters, 8 bit chars, UTF-8.
+ *
+ */
+std::string sanitize_for_logging(const std::string &str, const char replace_with)
+{
+    std::string output=str;
+
+    const string::size_type len = output.size();
+    for (std::string::size_type p=0; p < len; p++)
+        if (output[p] < 0x20 || output[p] > 0x7E)   // outside the printable ASCII range
+            output[p]=replace_with;
+
+    return output;
+}
+
 #if 0
 string to_lower(const string &src)
 {
@@ -892,14 +1076,51 @@ string to_upper(const string &src)
 }
 #endif
 
+const int MAX_UNIT_FORMAT_SYMBOLS = 6;   // number of entries in each of the two unit symbol tables below
+
+const string shortUnitFormatSymbols[MAX_UNIT_FORMAT_SYMBOLS] = {   // short unit suffixes, indexed by the number of divisions done in nice_unit_format
+        " B",
+        " KB",
+        " MB",
+        " GB",
+        " TB",
+        " PB"
+};
+
+const string longUnitFormatSymbols[MAX_UNIT_FORMAT_SYMBOLS] = {   // long unit suffixes, same indexing; nice_unit_format passes the chosen entry through i18n()
+        i18n_noop(" Bytes"),
+        i18n_noop(" KBytes"),
+        i18n_noop(" MBytes"),
+        i18n_noop(" GBytes"),
+        i18n_noop(" TBytes"),
+        i18n_noop(" PBytes")
+};
+
+
+long double rounding_upwards(
+        const long double number,
+        const int rounding_multiplier
+)
+{
+    // Rounds number "half up" to the nearest multiple of 1/rounding_multiplier (e.g. 10 -> one decimal place).
+    // floor() is used instead of an int64_t cast: the cast truncates toward zero, which rounds negative
+    // values in the wrong direction, and it is undefined for values outside the int64_t range.
+    long double rounded_number = number * rounding_multiplier;
+    rounded_number = std::floor(rounded_number + 0.5L);
+
+    return rounded_number / static_cast<long double>(rounding_multiplier);
+}
+
+
 string nice_unit_format(
         const int64_t input,
-        const UnitSystem system
+        const UnitFormat format,
+        const UnitBase base
 )
 {
+   // select the system of units (decimal or binary)
    int multiple = 0;
-
-   if (system == US_SI)
+   if (base == UnitBase1000)
    {
        multiple = 1000;
    }
@@ -909,63 +1130,62 @@ string nice_unit_format(
    }
 
    long double size = input;
-   int sizecount = 0;
 
+   // check the size of the input number to fit in the appropriate symbol
+   int sizecount = 0;
    while (size > multiple)
    {
        size = size / multiple;
        sizecount++;
+
+       // rollback to the previous values and stop the loop when cannot
+       // represent the number length.
+       if (sizecount >= MAX_UNIT_FORMAT_SYMBOLS)
+       {
+           size = size * multiple;
+           sizecount--;
+           break;
+       }
    }
 
-   long double tmp;                       // round
-   tmp = size * 10;
-   tmp += 0.5;
-   tmp = (int64_t) (tmp);
-   tmp = (long double) (tmp) / (long double) (10);
-   size = tmp;
+   // round the input number "half up" to multiples of 10
+   const int rounding_multiplier = 10;
+   size = rounding_upwards(size, rounding_multiplier);
 
+   // format the input number, placing the appropriate symbol
    ostringstream out;
-
    out.setf (ios::fixed);
-   out.precision (2);
-   switch (sizecount)
+   if (format == ShortUnitFormat)
    {
-      case 0:
-         out << size << i18n (" Bytes");
-         break;
-      case 1:
-         out << size << i18n (" KBytes");
-         break;
-      case 2:
-         out << size << i18n (" MBytes");
-         break;
-      case 3:
-         out << size << i18n (" GBytes");
-         break;
-      case 4:
-         out << size << i18n (" TBytes");
-         break;
-      case 5:
-         out << size << i18n (" PBytes");
-         break;
-      case 6:
-         out << size << i18n (" EBytes");
-         break;
-      case 7:
-         out << size << i18n (" ZBytes");
-         break;
-      case 8:
-         out << size << i18n (" YBytes");
-         break;
-      default:
-         out << size << "*10^" << (sizecount*3)<< i18n (" Bytes");
-         break;
+       out.precision(1);
+       out << size << i18n( shortUnitFormatSymbols[sizecount].c_str() );
+   }
+   else
+   {
+       out.precision (2);
+       out << size << i18n( longUnitFormatSymbols[sizecount].c_str() );
    }
 
    return out.str();
 } // eo nice_unit_format(int input)
 
 
+string nice_unit_format(
+        const double input,
+        const UnitFormat format,
+        const UnitBase base
+)
+{
+    // Round to the nearest integer while still a double, then convert to int64_t with a range check:
+    // the cast raised an overflow error near the max value of int64_t (~9.2e18, see unittest)
+    int64_t input_casted_and_rounded =
+        boost::numeric_cast<int64_t>( round(input) );
+
+    // delegate the actual formatting to the int64_t overload of nice_unit_format
+    return nice_unit_format( input_casted_and_rounded, format, base );
+} // eo nice_unit_format(double input)
+
+
 string escape(const string &s)
 {
    string out(s);
@@ -1063,7 +1283,7 @@ string escape_shellarg(const string &input)
 {
    string output = "'";
    string::const_iterator it, it_end = input.end();
-   for (it = input.begin(); it != it_end; it++)
+   for (it = input.begin(); it != it_end; ++it)
    {
       if ( (*it) == '\'')
          output += "'\\'";