From: Juliana Rodrigueiro
Date: Wed, 8 Aug 2018 11:47:57 +0000 (+0200)
Subject: Implement restric_html method
X-Git-Url: http://developer.intra2net.com/git/?a=commitdiff_plain;h=c5d9be573510f2f3e0d8fce7b067bef8210fc177;p=libi2ncommon
Implement restric_html method
---
diff --git a/src/restricted_html.cpp b/src/restricted_html.cpp
index d59bcff..6796d1e 100644
--- a/src/restricted_html.cpp
+++ b/src/restricted_html.cpp
@@ -27,17 +27,171 @@ on this file might be covered by the GNU General Public License.
#include
#include
#include
+#include
+
+#include
#include
#include
using namespace std;
+// A piece of tokenized input: first = the text, second = true when the text is a tag.
+typedef pair<string, bool> TOKEN;
+
+// Link prefixes that link_sanitizer accepts for absolute links.
+const vector<string> ALLOWED_PROTOCOLS = {"http://", "https://"};
+// Bare, lower-case names of the tags that may appear in the output.
+const vector<string> ALLOWED_TAGS = {"h1", "h2", "h3", "h4", "h5", "h6",
+                                     "a", "p", "br", "i", "ul", "li",
+                                     "table", "tr", "th", "td"
+                                    };
+const string AHREF = "
+ * returns
+ *
+ * @param tag html "a" tag.
+ * @return true if link inside "a" tag is valid. False otherwise.
+ */
+bool link_sanitizer(string &tag)
+{
+    // Expected shape of tag: AHREF, the link (optionally quoted), an optional
+    // single space and the closing '>'. On success an absolute link is rewritten
+    // to go through the redirector; a link starting with '/' leaves tag unchanged.
+    // On failure tag is never modified, so callers can still report the original.
+    if (tag.size() < AHREF.size())
+        return false; // Too short to carry a link; also keeps substr() from throwing.
+
+    string link = tag.substr(AHREF.size());
+    if (!link.empty() && (link[0] == '"' || link[0] == '\''))
+    {
+        // The value has to be closed by the same quote character that opened it.
+        // A tag with mismatched quotes would otherwise be passed on unchanged for
+        // '/' links and leave an unterminated attribute in the output.
+        const size_t pos = link.find(link[0], 1);
+        if (pos == string::npos)
+            return false; // Quotation mark never closes.
+
+        const string end(link, pos + 1);
+        if (end != " >" && end != ">")
+            return false; // Probably extra attributes.
+        link = link.substr(1, pos - 1);
+    }
+    else
+    {
+        // Unquoted value: a space is only tolerated directly before the last character.
+        const size_t space = link.find(' ');
+        if (space != string::npos && space != link.size() - 2)
+            return false; // Probably extra attributes.
+
+        link = link.substr(0, space);
+    }
+
+    // Only allowed protocols or links starting with '/' are accepted.
+    const bool has_allowed_protocol = is_protocol_allowed(link);
+    if (!has_allowed_protocol && (link.empty() || link[0] != '/'))
+        return false;
+
+    if (link.find_first_not_of(SAFE_URL_CHARS) != string::npos)
+        return false;
+
+    // Every check passed: only now touch the caller's tag.
+    if (has_allowed_protocol)
+        tag = AHREF + "\"" + REDIRECT_PREFIX + link + "\" " + TARGET_BLANK + ">";
+
+    return true;
+}
+
+/**
+ * @brief Tell whether a tag belongs to the whitelist of allowed tags.
+ * The characters "<", ">" and "/" are removed and the remainder is lower-cased
+ * before it is compared with the whitelist, so opening, closing and
+ * self-closing spellings are treated alike.
+ * A tag containing attributes is not accepted.
+ * Example:
+ * "<p>", "</P>" or "<br/>" returns true.
+ * "<p class='x'>" returns false.
+ *
+ * @param tag html tag to be verified.
+ * @return true if it is an allowed tag, false otherwise.
+ */
+bool is_tag_allowed(string tag)
+{
+    // Each call removes one distinct character, so their order is irrelevant.
+    replace_all(tag, "/", "");
+    replace_all(tag, ">", "");
+    replace_all(tag, "<", "");
+    to_lower_mod(tag);
+
+    for (const string &candidate : ALLOWED_TAGS)
+    {
+        if (tag == candidate)
+            return true;
+    }
+
+    return false;
+}
+
+/**
+ * @brief Reduce a tag to its bare, lower-case element name.
+ * Applies the same normalisation as is_tag_allowed(): "<", ">" and "/" are
+ * dropped and the remainder is lower-cased.
+ *
+ * @param tag html tag in opening, closing or self-closing spelling.
+ * @return the normalised name.
+ */
+static string normalized_tag_name(string tag)
+{
+    replace_all(tag, "<", "");
+    replace_all(tag, ">", "");
+    replace_all(tag, "/", "");
+    to_lower_mod(tag);
+    return tag;
+}
+
+/**
+ * @brief Restrict html code to the whitelisted tags and sanitized links.
+ * Text outside of tags is passed through html_entities(). Whitelisted tags are
+ * copied to the output; "a" tags carrying a href are copied only if
+ * link_sanitizer() accepts them. Closing tags are matched against the open tags
+ * without regard to case; a closing tag that does not match the innermost open
+ * tag is rejected, and tags still open at the end are closed automatically.
+ *
+ * @param html_code_orig input html code.
+ * @param strip if true rejected tags are dropped, otherwise an error is thrown.
+ * @return output html code.
+ * @throws runtime_error if strip is false and a tag is rejected.
+ */
+const string restrict_html(const string &html_code_orig, bool strip)
+{
+    string html_code(html_code_orig);
+    remove_html_comments(html_code);
+    vector<TOKEN> tokenized;
+    tokenize_by_tag(tokenized, html_code);
+    string result;
+
+    // Closing tags the output still owes, innermost last. They are kept in
+    // canonical lower-case form so matching does not depend on the input's case.
+    vector<string> expected_tags;
+    BOOST_FOREACH(const TOKEN &token, tokenized)
+    {
+        const string &text = token.first;
+        if (!token.second)
+        {
+            // Not a tag: plain text goes through html_entities().
+            result += html_entities(text);
+            continue;
+        }
+
+        if (is_tag_allowed(text))
+        {
+            const string closing = "</" + normalized_tag_name(text) + ">";
+            if (text.compare(0, 2, "</") != 0)
+            {
+                result += text;
+                // "br" never gets a closing tag, however it was spelled
+                // ("<br>", "<BR>", "<br/>").
+                if (closing != "</br>")
+                    expected_tags.push_back(closing);
+                continue;
+            }
+            else if (!expected_tags.empty() && expected_tags.back() == closing)
+            {
+                result += text;
+                expected_tags.pop_back();
+                continue;
+            }
+            // An allowed closing tag that does not match the innermost open tag
+            // falls through and is treated as invalid.
+        }
+
+        if (to_lower(text).compare(0, AHREF.size(), AHREF) == 0)
+        {
+            // Sanitize a copy so the error message below always shows the tag
+            // exactly as it appeared in the input.
+            string link_tag(text);
+            if (link_sanitizer(link_tag))
+            {
+                result += link_tag;
+                expected_tags.push_back("</a>");
+                continue;
+            }
+        }
+
+        if (!strip)
+            throw runtime_error("Invalid tag: " + text);
+
+    }
+
+    // One or more tags were not closed: close them, innermost first.
+    BOOST_REVERSE_FOREACH(const string &closing, expected_tags)
+    {
+        result += closing;
+    }
+
+    // NOTE(review): going through c_str() cuts the output at the first NUL byte.
+    // Kept as in the original; confirm whether that truncation is intended.
+    return result.c_str();
+}
/**
* @brief Replace all "+" characters found in s to spaces (" ").
diff --git a/src/restricted_html.hpp b/src/restricted_html.hpp
index 09b3d14..ed5711e 100644
--- a/src/restricted_html.hpp
+++ b/src/restricted_html.hpp
@@ -20,6 +20,26 @@ on this file might be covered by the GNU General Public License.
/** @file
* @brief restricts html code to an allowed group of tags.
*
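+ * Example of acceptable html code:
+ *
+ <h1>Restrict HTML</h1>
+ <table>
+ <tr>
+ <th>Tags</th>
+ <th>Protocols</th>
+ </tr>
+ <tr>
+ <td><a href="http://example.com">Link</a></td>
+ <td>$10000</td>
+ </tr>
+ </table>
+ <p>Paragraph with <a href="/index.html">Link</a>.</p>
+
+
+ *
+ *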
* @copyright © Copyright 2017 Intra2net AG
*
*/
@@ -31,7 +51,20 @@ on this file might be covered by the GNU General Public License.
namespace I2n
{
-
+ /**
+ * @brief Restricts html code to a small list of allowed tags.
+ * The attribute "href" from the tag "a" has its value sanitized and if it
+ * contains unsafe characters, the tag is stripped.
+ * A link with an acceptable protocol gets a redirector prefixed, a link starting
+ * with "/" is kept as it is, and any other link causes the tag to be stripped.
+ * Any other attributes found will result in the tag being stripped.
+ * Any comments will be excluded.
+ * Closing tags that were never opened are stripped, and tags that were left
+ * open are closed at the end of the output.
+ *
+ * @param html_code_orig input html code. Not case sensitive.
+ * @param strip if true, disallowed tags are stripped; otherwise an error is thrown. Defaults to true.
+ * @return output html code
+ * @throws std::runtime_error if strip is false and a tag is rejected.
+ */
+ const std::string restrict_html(const std::string &html_code_orig, bool strip=true);
std::string decode_url(std::string s);