From c5d9be573510f2f3e0d8fce7b067bef8210fc177 Mon Sep 17 00:00:00 2001 From: Juliana Rodrigueiro Date: Wed, 8 Aug 2018 13:47:57 +0200 Subject: [PATCH] Implement restrict_html method --- src/restricted_html.cpp | 154 +++++++++++++++++++++++++++++++++++++++++++++++ src/restricted_html.hpp | 35 ++++++++++- 2 files changed, 188 insertions(+), 1 deletions(-) diff --git a/src/restricted_html.cpp b/src/restricted_html.cpp index d59bcff..6796d1e 100644 --- a/src/restricted_html.cpp +++ b/src/restricted_html.cpp @@ -27,17 +27,171 @@ on this file might be covered by the GNU General Public License. #include #include #include +#include + +#include #include #include using namespace std; +typedef pair TOKEN; + +const vector ALLOWED_PROTOCOLS = {"http://", "https://"}; +const vector ALLOWED_TAGS = {"h1", "h2", "h3", "h4", "h5", "h6", + "a", "p", "br", "i", "ul", "li", + "table", "tr", "th", "td" + }; +const string AHREF = " + * returns + * + * @param tag html "a" tag. + * @return true if link inside "a" tag is valid. False otherwise. + */ +bool link_sanitizer(string &tag) +{ + // tag = + string link = tag.substr(AHREF.size()); + if (link.find_first_of("\"\'") == 0) + { + size_t pos = link.find_first_of("\"\'", 1); + if (pos == string::npos) + return false; // Quotation mark never closes. + + string end(link, pos+1); + if (end.compare(" >") != 0 && end.compare(">") != 0) + return false; //Probably extra attributes. + link = link.substr(1, pos -1); + } + else + { + size_t space = link.find_first_of(" "); + if (space != link.size()-2 && space != string::npos ) + return false; //Probably extra attributes. + + link = link.substr(0, space); + } + + if (is_protocol_allowed(link)) + tag = AHREF + "\"" + REDIRECT_PREFIX + link + "\" " + TARGET_BLANK + ">"; + else if (link[0] != '/') + return false; + + if (link.find_first_not_of(SAFE_URL_CHARS) != string::npos) + return false; + + return true; +} + +/** + * @brief Check if tag is in a whitelist of allowed tags. 
+ * Does not accept a tag containing attributes. + * Example: + *

or

returns true. + *
returns false. + * + * @param tag html tag to be verified. + * @return true if is an allowed tag, false otherwise. + */ +bool is_tag_allowed(string tag) +{ + replace_all(tag, "<", ""); + replace_all(tag, ">", ""); + replace_all(tag, "/", ""); + to_lower_mod(tag); + + BOOST_FOREACH(const string &a_tag, ALLOWED_TAGS) + { + if (tag.compare(a_tag) == 0) + return true; + } + + return false; +} + +const string restrict_html(const string &html_code_orig, bool strip) +{ + string html_code(html_code_orig); + remove_html_comments (html_code); + vector tokenized; + tokenize_by_tag (tokenized, html_code); + string result = ""; + + vector expected_tags; + BOOST_FOREACH(TOKEN s, tokenized) + { + if (!s.second) + { + result = result + html_entities(s.first); + continue; + } + if (is_tag_allowed(s.first) ) + { + if (s.first.compare(0, 2, "") != 0) + expected_tags.push_back(s.first.insert(1,"/")); + continue; + } + else if (expected_tags.size() > 0 && expected_tags.back().compare(s.first) == 0) + { + result = result + s.first; + expected_tags.pop_back(); + continue; + } + } + + if (to_lower(s.first).compare(0, AHREF.size(), AHREF) == 0 && link_sanitizer(s.first)) + { + result = result + s.first; + expected_tags.push_back("
"); + continue; + } + + if (!strip) + throw runtime_error("Invalid tag: " + s.first); + + } + if (expected_tags.size() > 0) //One or more tags were not closed. + BOOST_REVERSE_FOREACH(const string &s, expected_tags) + { + result = result + s; + } + + return result.c_str(); +} /** * @brief Replace all "+" characters found in s to spaces (" "). diff --git a/src/restricted_html.hpp b/src/restricted_html.hpp index 09b3d14..ed5711e 100644 --- a/src/restricted_html.hpp +++ b/src/restricted_html.hpp @@ -20,6 +20,26 @@ on this file might be covered by the GNU General Public License. /** @file * @brief restricts html code to an allowed group of tags. * + * Example of acceptable html code: + * +

Restrict HTML

+ + + + + + + + + +
TagsProtocols
Link$10000
+

Paragraph with Link.

+
+
    +
  • Tags
  • +
+ * + * + * @copyright © Copyright 2017 Intra2net AG * */ @@ -31,7 +51,20 @@ on this file might be covered by the GNU General Public License. namespace I2n { - + /** + * @brief Restricts html code to a small list of allowed tags. + * The attribute "href" from the tag "a" has its value sanitized and if it + * contains unsafe characters, the tag is stripped. + * The link sanitizer adds a redirector in case of an acceptable protocol, and strips the tag otherwise. + * Any other attributes found will result in the tag being stripped. + * Any comments will be excluded. + * Strips closing tags that were never opened and closes tags that were left open. + * + * @param html_code_orig input html code. Case-insensitive. + * @param strip if true, disallowed tags are stripped; otherwise an error is thrown. Defaults to true. + * @return output html code + */ + const std::string restrict_html(const std::string &html_code_orig, bool strip=true); std::string decode_url(std::string s); -- 1.7.1