Create functions find/remove_html_comments

author Christian Herdtweck <christian.herdtweck@intra2net.com>

Wed, 8 Feb 2017 15:22:34 +0000 (16:22 +0100)

committer Thomas Jarosch <thomas.jarosch@intra2net.com>

Wed, 22 Mar 2017 09:50:44 +0000 (10:50 +0100)
author Christian Herdtweck <christian.herdtweck@intra2net.com>
Wed, 8 Feb 2017 15:22:34 +0000 (16:22 +0100)
committer Thomas Jarosch <thomas.jarosch@intra2net.com>
Wed, 22 Mar 2017 09:50:44 +0000 (10:50 +0100)
diff --git a/src/stringfunc.cpp b/src/stringfunc.cpp

index c50eb24..63424bc 100644 (file)
--- a/src/stringfunc.cpp
+++ b/src/stringfunc.cpp
@@ -35,6 +35,7 @@ on this file might be covered by the GNU General Public License.
 #include <i18n.h>
 
 #include <boost/numeric/conversion/cast.hpp>
+#include <boost/foreach.hpp>
 
 #include <stringfunc.hxx>
 
@@ -891,6 +892,101 @@ string html_entities_to_console(std::string str)
    return str;
 }
 
+// Convenience wrapper: find_html_comments followed by remove_html_comments(str, comments)
+void remove_html_comments(string &str)   // modifies str in place
+{
+    vector<CommentZone> comments;   // zones collected by find_html_comments
+    find_html_comments(str, comments);
+    remove_html_comments(str, comments);   // two-argument overload does the actual erasing
+}
+
+// Find all html comments, behaving correctly if they are nested; ignores comment tags ("<!--FOO .... BAR-->").
+// If there are invalid comments ("-->" before "<!--", or a different number of closing and opening tags),
+// then the unknown index of the missing start/end tag is represented by string::npos.
+// Indices run from the start of the start tag to the first index after the closing tag.
+void find_html_comments(const std::string &str, vector<CommentZone> &comments)   // appends to comments; existing entries are kept
+{
+    static const string START = "<!--";
+    static const string CLOSE = "-->";
+    static const string::size_type START_LEN = START.length();
+    static const string::size_type CLOSE_LEN = CLOSE.length();
+
+    // nested comments are matched with an explicit stack instead of recursion
+    vector<string::size_type> starts;      // indices of start tags not yet closed; back() is the innermost one
+
+    string::size_type pos = 0;   // index from which the next search begins
+    string::size_type len = str.length();
+    string::size_type next_start, next_close;   // string::npos when no further tag of that kind exists
+
+    while (pos < len)     // redundant bound (find() from the end returns npos and hits the break), kept as a safeguard
+    {
+        next_start = str.find(START, pos);
+        next_close = str.find(CLOSE, pos);
+
+        if ( (next_start == string::npos) && (next_close == string::npos) )
+            break;   // no tags left: we are done
+
+        else if ( (next_start == string::npos) || (next_close < next_start) )  // closing tag comes first: close one comment (pop)
+        {
+            if (starts.empty())    // closing tag without a start: start index is unknown (npos)
+                comments.push_back(CommentZone(string::npos, next_close+CLOSE_LEN));
+            else
+            {
+                comments.push_back(CommentZone(starts.back(), next_close+CLOSE_LEN));   // pair with innermost open start
+                starts.pop_back();
+            }
+            pos = next_close + CLOSE_LEN;   // continue right after the closing tag
+        }
+
+        else if ( (next_close == string::npos) || (next_start < next_close) )  // start tag comes first: start a new comment (push)
+        {
+            starts.push_back(next_start);
+            pos = next_start + START_LEN;   // continue right after the start tag
+        }
+    }
+
+    // add comments without closing tag, largest start index first, so the outermost is last (important for remove_html_comments!)
+    while (!starts.empty())
+    {
+        comments.push_back(CommentZone(starts.back(), string::npos));   // end index is unknown (npos)
+        starts.pop_back();
+    }
+}
+
+// remove all html comments found by find_html_comments
+void remove_html_comments(std::string &str, const vector<CommentZone> &comments)   // modifies str in place
+{
+    // start index of the most recent removal; str.length() means nothing has been removed yet
+    string::size_type last_removal_start = str.length();
+
+    // Go from back to front so that indices of zones still to be processed stay valid.
+    // This requires that bigger comments, that contain smaller comments, come AFTER
+    // the small contained comments in the comments vector (i.e. comments are ordered by
+    // their closing tag, not their opening tag). This is true for results from find_html_comments
+    BOOST_REVERSE_FOREACH(const CommentZone &comment, comments)
+    {
+        if (comment.first == string::npos)   // unknown start: closing tag had no start tag
+        {
+            str = str.replace(0, comment.second, "");   // comment starts "before" str --> delete from start
+            break;   // zones still unvisited end earlier (given find_html_comments ordering), so they are gone already
+        }
+        else if (comment.first >= last_removal_start)
+        {
+            continue;    // this comment is inside another comment that we have removed already
+        }
+        else if (comment.second == string::npos)   // comment ends "after" str --> delete until end
+        {
+            str = str.replace(comment.first, string::npos, "");   // replace() returns str itself, so this is a self-assignment
+            last_removal_start = comment.first;
+        }
+        else
+        {
+            str = str.replace(comment.first, comment.second-comment.first, "");   // erase [first, second)
+            last_removal_start = comment.first;
+        }
+    }
+}
+
 bool replace_all(string &base, const char *ist, const char *soll)
 {
    string i=ist;
diff --git a/src/stringfunc.hxx b/src/stringfunc.hxx

index 67c38e7..5d3455c 100644 (file)
--- a/src/stringfunc.hxx
+++ b/src/stringfunc.hxx
@@ -299,6 +299,11 @@ std::string smart_html_entities(const std::string &input);
 std::string html_entities(std::string str);
 std::string html_entities_to_console(std::string str);
 
+typedef std::pair<std::string::size_type, std::string::size_type> CommentZone;   // (index of start tag, first index after closing tag); string::npos = unknown
+void find_html_comments(const std::string &str, std::vector<CommentZone> &result);   // appends the zones found in str to result
+void remove_html_comments(std::string &str);   // finds and removes all comments from str in place
+void remove_html_comments(std::string &str, const std::vector<CommentZone> &comments);   // removes the given zones from str in place
+
 std::string sanitize_for_logging(const std::string &str, const char replace_with='?');
 
 std::string escape(const std::string &s);
author	Christian Herdtweck <christian.herdtweck@intra2net.com>
	Wed, 8 Feb 2017 15:22:34 +0000 (16:22 +0100)
committer	Thomas Jarosch <thomas.jarosch@intra2net.com>
	Wed, 22 Mar 2017 09:50:44 +0000 (10:50 +0100)
src/stringfunc.cpp		patch \| blob \| blame \| history
src/stringfunc.hxx		patch \| blob \| blame \| history