#include <string>
#include <sstream>
#include <iomanip>
+#include <vector>
+
+#include <boost/foreach.hpp>
#include <stringfunc.hxx>
#include <restricted_html.hpp>
using namespace std;
+typedef pair<string,bool> TOKEN; // first: token text; second: true when restrict_html() handles the token as a tag, false when it is emitted as escaped text
+
+const vector<string> ALLOWED_PROTOCOLS = {"http://", "https://"}; // link prefixes accepted by is_protocol_allowed()
+const vector<string> ALLOWED_TAGS = {"h1", "h2", "h3", "h4", "h5", "h6", // tag names accepted (without attributes) by is_tag_allowed()
+ "a", "p", "br", "i", "ul", "li",
+ "table", "tr", "th", "td"
+ };
+const string AHREF = "<a href="; // prefix used to recognise an anchor tag that carries a link
+const string SAFE_URL_CHARS = "$-_.+!*'(),;/?:@=&abcdefghijklmnopqrstuvwxyz" // the only characters link_sanitizer() tolerates inside a link
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
+const string REDIRECT_PREFIX = "/arnie?form=redirect&url="; // prepended by link_sanitizer() to links with an allowed protocol
+const string TARGET_BLANK = "target=_blank"; // attribute appended by link_sanitizer() to rewritten anchor tags
+
namespace I2n
{
+ /**
+ * @brief Compare the protocol found in a link against a whitelist.
+ *
+ * URI schemes are case-insensitive, so the link is lower-cased before it is
+ * compared with the (lower-case) entries of ALLOWED_PROTOCOLS. This keeps the
+ * check consistent with the case-insensitive tag detection of the caller.
+ *
+ * @param link url that will be checked.
+ * @return true if link starts with an allowed protocol, false otherwise.
+ */
+bool is_protocol_allowed(const string &link)
+{
+    const string lowered = to_lower(link);
+
+    BOOST_FOREACH(const string &protocol, ALLOWED_PROTOCOLS)
+    {
+        if (has_prefix(lowered, protocol))
+            return true;
+    }
+    return false;
+}
+
+/**
+ * @brief Verifies if a html "a" tag has a valid link and sanitizes it if necessary.
+ * Modifies the tag and adds the redirector prefix if the link has a valid protocol.
+ * Example: <a href="http://somelink.com">
+ * returns <a href="/arnie?form=redirect&url=http://somelink.com" target=_blank>
+ *
+ * Accepted forms are <a href="link">, <a href='link'> and <a href=link>, each
+ * optionally with one space before the closing ">". Any other attribute makes
+ * the tag invalid. Links without an allowed protocol are only accepted when
+ * they are local paths (starting with a single "/"); such tags are left as is.
+ * The tag is only modified when the function returns true.
+ *
+ * NOTE(review): the link is appended to the redirector query string without
+ * URL-encoding, so a "&" inside the link ends the "url" parameter early;
+ * confirm that the redirector copes with this.
+ *
+ * @param tag html "a" tag, expected to start with "<a href=" (any case).
+ * @return true if link inside "a" tag is valid. False otherwise.
+ */
+bool link_sanitizer(string &tag)
+{
+    if (tag.size() < AHREF.size())
+        return false;
+
+    // Everything after `<a href=`, e.g. `"somelink">`.
+    const string rest = tag.substr(AHREF.size());
+    if (rest.empty())
+        return false;
+
+    string link;
+    string tail; // what follows the link; must be the end of the tag
+
+    const char quote = rest[0];
+    if (quote == '"' || quote == '\'')
+    {
+        // The value ends at the next occurrence of the SAME quote character;
+        // accepting the other one would let unbalanced quotes through.
+        const size_t closing = rest.find(quote, 1);
+        if (closing == string::npos)
+            return false; // Quotation mark never closes.
+
+        link = rest.substr(1, closing - 1);
+        tail = rest.substr(closing + 1);
+    }
+    else
+    {
+        // Unquoted value: runs up to the first space or the end of the tag.
+        const size_t end = rest.find_first_of(" >");
+        if (end == string::npos)
+            return false; // Tag never closes.
+
+        link = rest.substr(0, end);
+        tail = rest.substr(end);
+    }
+
+    if (tail != ">" && tail != " >")
+        return false; // Probably extra attributes.
+
+    if (link.empty())
+        return false;
+
+    // Validate before touching `tag`, so a rejected tag is left unmodified.
+    if (link.find_first_not_of(SAFE_URL_CHARS) != string::npos)
+        return false;
+
+    if (is_protocol_allowed(link))
+    {
+        tag = AHREF + "\"" + REDIRECT_PREFIX + link + "\" " + TARGET_BLANK + ">";
+        return true;
+    }
+
+    // Without an allowed protocol only local paths are accepted. "//host" is a
+    // protocol-relative link to another host and would bypass the redirector.
+    if (link[0] != '/' || link.compare(0, 2, "//") == 0)
+        return false;
+
+    return true;
+}
+
+/**
+ * @brief Check if tag is in a whitelist of allowed tags.
+ * Does not accept a tag containing attributes.
+ * Only one pair of angle brackets and one closing ("</p>") or self-closing
+ * ("<br/>") slash are removed before the comparison, so malformed tags such
+ * as "<t/d>" or "<//p>" are rejected. The comparison ignores case.
+ * Example:
+ * <h1> or </p> returns true.
+ * <br size="param"> returns false.
+ *
+ * @param tag html tag to be verified.
+ * @return true if is an allowed tag, false otherwise.
+ */
+bool is_tag_allowed(string tag)
+{
+    // Strip the enclosing angle brackets (at most one of each).
+    if (!tag.empty() && tag[0] == '<')
+        tag.erase(0, 1);
+    if (!tag.empty() && tag[tag.size() - 1] == '>')
+        tag.erase(tag.size() - 1);
+
+    // Strip the slash of a closing tag, or else that of a self-closing tag.
+    if (!tag.empty() && tag[0] == '/')
+        tag.erase(0, 1);
+    else if (!tag.empty() && tag[tag.size() - 1] == '/')
+        tag.erase(tag.size() - 1);
+
+    to_lower_mod(tag);
+
+    BOOST_FOREACH(const string &a_tag, ALLOWED_TAGS)
+    {
+        if (tag == a_tag)
+            return true;
+    }
+
+    return false;
+}
+
+/**
+ * @brief Restricts html code to the whitelisted tags.
+ * Comments are removed, text between tags is passed through html_entities(),
+ * whitelisted tags are copied, anchor tags are passed through
+ * link_sanitizer(), and tags that are still open at the end are closed.
+ * Closing tags are matched against their opening tag ignoring case;
+ * automatically added closing tags are written in lower case.
+ *
+ * @param html_code_orig input html code.
+ * @param strip if true disallowed tags are dropped, otherwise an exception is thrown.
+ * @return output html code.
+ * @throws runtime_error if strip is false and a disallowed tag is found.
+ */
+const string restrict_html(const string &html_code_orig, bool strip)
+{
+    string html_code(html_code_orig);
+    remove_html_comments(html_code);
+
+    vector<TOKEN> tokenized;
+    tokenize_by_tag(tokenized, html_code);
+
+    string result;
+    vector<string> expected_tags; // lower-case closing tags still owed, innermost last
+
+    BOOST_FOREACH(const TOKEN &token, tokenized)
+    {
+        const string &text = token.first;
+
+        if (!token.second)
+        {
+            // Plain text between tags.
+            result += html_entities(text);
+            continue;
+        }
+
+        const string lowered = to_lower(text);
+
+        if (is_tag_allowed(text))
+        {
+            if (lowered.compare(0, 2, "</") != 0)
+            {
+                result += text;
+
+                // Derive the closing tag that is owed for this opening tag.
+                string closing = lowered;
+                // "<br/>" style: drop the self-closing slash first, otherwise
+                // a bogus "</br/>" would be expected and emitted.
+                if (closing.size() >= 3
+                    && closing.compare(closing.size() - 2, 2, "/>") == 0)
+                    closing.erase(closing.size() - 2, 1);
+
+                // "br" never has a closing tag.
+                if (closing.compare("<br>") != 0)
+                    expected_tags.push_back(closing.insert(1, "/"));
+                continue;
+            }
+            else if (!expected_tags.empty() && expected_tags.back() == lowered)
+            {
+                result += text;
+                expected_tags.pop_back();
+                continue;
+            }
+        }
+
+        if (lowered.compare(0, AHREF.size(), AHREF) == 0)
+        {
+            // Sanitize a copy so the error message below shows the original tag.
+            string anchor(text);
+            if (link_sanitizer(anchor))
+            {
+                result += anchor;
+                expected_tags.push_back("</a>");
+                continue;
+            }
+        }
+
+        if (!strip)
+            throw runtime_error("Invalid tag: " + text);
+        // Otherwise the tag is silently dropped.
+    }
+
+    // Close whatever is still open, innermost tag first.
+    BOOST_REVERSE_FOREACH(const string &closing_tag, expected_tags)
+    {
+        result += closing_tag;
+    }
+
+    return result;
+}
/**
* @brief Replace all "+" characters found in s to spaces (" ").
/** @file
* @brief restricts html code to an allowed group of tags.
*
+ * Example of acceptable html code:
+ *
+ <h1>Restrict HTML</h1>
+ <table>
+ <tr>
+ <th>Tags</th>
+ <th>Protocols</th>
+ </tr>
+ <tr>
+ <td>Link</td>
+ <td>$10000</td>
+ </tr>
+ </table>
+ <p>Paragraph with <a href="https://example.de"><i>Link</i></a>.</p>
+ <br>
+ <ul>
+ <li>Tags</li>
+ </ul>
+ *
+ *
* @copyright © Copyright 2017 Intra2net AG
*
*/
namespace I2n
{
-
+ /**
+ * @brief Restricts html code to a small list of allowed tags.
+ * The attribute "href" from the tag "a" has its value sanitized and if it
+ * contains unsafe characters, the tag is stripped.
+ * The link sanitizer adds a redirector in case of an acceptable protocol, strips otherwise.
+ * Any other attributes found will result in the tag being stripped.
+ * Any comments will be excluded.
+ * Closing tags that were never opened are stripped and tags that were left open are closed.
+ *
+ * @param html_code_orig input html code. Tag names are matched against the whitelist case-insensitively.
+ * @param strip if true disallowed tags are stripped otherwise throw an error. Defaults to true.
+ * @return output html code
+ * @throws std::runtime_error if strip is false and a disallowed tag is found.
+ */
+ const std::string restrict_html(const std::string &html_code_orig, bool strip=true);
std::string decode_url(std::string s);