Fix other minor things.
/** @file
* @brief restricts html messages to an allowed group of tags.
*
+ * See examples of acceptable and non-acceptable HTML code in the unit tests
+ * for this module.
+ *
* @copyright © Copyright 2017 Intra2net AG
*
*/
-#include <string>
-#include <sstream>
#include <iomanip>
+#include <set>
+#include <sstream>
+#include <string>
#include <vector>
#include <pcrecpp.h>
#include <boost/foreach.hpp>
+#include <boost/assign/list_of.hpp>
#include <stringfunc.hxx>
#include <restricted_html.hpp>
using namespace std;
-typedef pair<string,bool> TOKEN;
-const vector<string> ALLOWED_TAGS = {"h1", "h2", "h3", "h4", "h5", "h6",
- "a", "p", "br", "i", "ul", "li",
- "table", "tr", "th", "td"
- };
+namespace I2n
+{
+
+namespace
+{
+
+/**
+* @brief Token is composed by a key string and a flag signalizing if it
+* is an html tag, ie., starts with "<" and finishes with ">".
+* Token is the type value returned by the I2n::tokenize_by_tag function.
+*/
+typedef pair<string,bool> Token;
+
+*/
+
+const set<string> ALLOWED_TAGS = boost::assign::list_of("h1")("h2")("h3")("h4")
+ ("h5")("h6")("a")("p")
+ ("br")("i")("ul")("li")
+ ("tr")("th")("td")
+ ("table");
const string AHREF = "<a href=";
const string REDIRECT_PREFIX = "/arnie?form=redirect&url=";
const string TARGET_BLANK = "target=_blank";
-const pcrecpp::RE SAFE_URL("^http(s)?:\\/\\/[\\w.-]+(?:\\.[\\w\\.-]+)+"
- "[\\w\\-\\._~:/?#@!\\$&'\\(\\)\\*\\+,;=.]+$");
+const pcrecpp::RE SAFE_URL("^(http(s?):\\/\\/)(([a-zA-Z0-9\\.\\-\\_]+(\\.[a-zA-"
+ "Z]{2,3})+)|((?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-"
+ "9]?)\\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)"
+ ":[0-9]{1,5}))(\\/[a-zA-Z0-9\\@\\+\\_\\-\\.\\/\\?\\%"
+ "\\#\\&\\=]*)?$");
-
-namespace I2n
-{
+} // eo namespace <anonymous>
/**
* @brief Verifies if a html "a" tag has a valid link and sanitize it if necessary.
* Example: <a href="http://somelink.com">
* returns <a href="/arnie?form=redirect&url=http://somelink.com" target=_blank>
*
- * @param tag html "a" tag.
- * @return true if link inside "a" tag is valid. False otherwise.
+ * @param tag (output) html "a" tag. If valid, it may be modified to regularize
+ * and encode the URL.
+ * @param redirect_prefix Prefix that will be added to the url in case it is valid.
+ * @return bool true if link inside "a" tag is valid. False otherwise.
*/
-bool link_sanitizer(string &tag)
+bool link_sanitizer(string &tag, const std::string &redirect_prefix)
{
// tag = <a href="somelink">
string link = tag.substr(AHREF.size());
- if (link.find_first_of("\"") == 0)
+ if (link.find('\"') == 0)
{
- size_t pos = link.find_first_of("\"", 1);
+ size_t pos = link.find('\"', 1);
if (pos == string::npos)
return false; // Quotation mark never closes.
string end(link, pos+1);
- if (end.compare(" >") != 0 && end.compare(">") != 0)
+ if (end != " >" && end != ">")
return false; //Probably extra attributes. Or " inside the link (invalid).
link = link.substr(1, pos -1);
link = link.substr(0, space);
}
+ // Check if the link already has a known prefix.
+ // See "InvalidRedirection" unit test.
+ if (has_prefix(link, redirect_prefix))
+ link = remove_prefix(link, redirect_prefix);
if (has_prefix(link, REDIRECT_PREFIX))
link = remove_prefix(link, REDIRECT_PREFIX);
if (link.find("javascript:") != string::npos)
return false;
- tag = AHREF + "\"" + REDIRECT_PREFIX + encode_url(link) + "\" " + TARGET_BLANK + ">";
+ tag = AHREF + "\"" + redirect_prefix + encode_url(link) + "\" "
+ + TARGET_BLANK + ">";
return true;
}
replace_all(tag, "/", "");
to_lower_mod(tag);
- BOOST_FOREACH(const string &a_tag, ALLOWED_TAGS)
- {
- if (tag.compare(a_tag) == 0)
- return true;
- }
-
- return false;
+ return ALLOWED_TAGS.find(tag) != ALLOWED_TAGS.end();
}
-const string restrict_html(const string &html_code_orig, bool strip)
+/**
+* @brief Restricts html code to a small list of allowed tags.
+* The attribute "href" from the tag "a" has its value sanitized and if it
+* contains unsafe characters, the tag is stripped.
+* The link sanitizer adds a redirector in case of an acceptable protocol, strip
+* otherwise.
+* Any other attributes found will result in the tag being stripped.
+* Any comments will be excluded.
+* Strip closing tags that were not open and close tags that were not closed.
+*
+* @param html_code_orig input html code. Non case sensitive.
+* @param redirect_prefix Prefix that will be added to the url in case of a valid
+* html "a" tag.
+* @param strip if true disallowed tags are stripped otherwise throw an error.
+* Defaults to true.
+* @return output html code
+*/
+const string restrict_html(const string &html_code_orig,
+ const std::string &redirect_prefix,
+ bool strip)
{
string html_code(html_code_orig);
remove_html_comments (html_code);
- vector<TOKEN > tokenized;
+ vector<Token > tokenized;
tokenize_by_tag (tokenized, html_code);
string result = "";
vector<string> expected_tags;
- BOOST_FOREACH(const TOKEN &s, tokenized)
+ BOOST_FOREACH(const Token &s, tokenized)
{
if (!s.second)
{
}
if (is_tag_allowed(s.first) )
{
+ // Checks if this tag (s.first) is not a closing tag.
if (s.first.compare(0, 2, "</") != 0)
{
result = result + s.first;
- if (to_lower(s.first).compare("<br>") != 0)
+ // The tag "<br>" does not have a closing tag, so don't push it
+ // into the expected tags vector.
+ if (to_lower(s.first) != "<br>")
expected_tags.push_back("</"+s.first.substr(1));
continue;
}
- else if (expected_tags.size() > 0 && expected_tags.back().compare(s.first) == 0)
+ // If it is a closing tag, check if this is the expected one.
+ // We can't accept an closing tag that was never opened.
+ else if (expected_tags.size() > 0 && expected_tags.back() == s.first)
{
+ // When the closing tag is indeed what we expect, pop it out
+ // from the LIFO queue.
result = result + s.first;
expected_tags.pop_back();
continue;
}
}
+ // Copy s.first to a string since it might be modified by link_sanitizer.
string tag = s.first;
- if (to_lower(s.first).compare(0, AHREF.size(), AHREF) == 0 && link_sanitizer(tag))
+ if (to_lower_mod(tag).compare(0, AHREF.size(), AHREF) == 0
+ && link_sanitizer(tag, redirect_prefix))
{
result = result + tag;
expected_tags.push_back("</a>");
return result.c_str();
}
+const string restrict_html(const string &html_code_orig, bool strip)
+{
+ return restrict_html(html_code_orig, REDIRECT_PREFIX, strip);
+}
+
/**
* @brief Replace all "+" characters found in s to spaces (" ").
*
/** @file
* @brief restricts html code to an allowed group of tags.
*
- * Example of acceptable html code:
- *
- <h1>Restrict HTML</h1>
- <table>
- <tr>
- <th>Tags</th>
- <th>Protocos</th>
- </tr>
- <tr>
- <td>Link</td>
- <td>$10000</td>
- </tr>
- </table>
- <p>Paragraph with <a href="https://example.de"><i>Link</i></a>.</p>
- <br>
- <ul>
- <li>Tags</li>
- </ul>
- *
- *
* @copyright © Copyright 2017 Intra2net AG
*
*/
namespace I2n
{
- /**
- * @brief Restricts html code to a small list of allowed tags.
- * The attribute "href" from the tag "a" has its value sanitized and if it
- * contains unsafe characters, the tag is stripped.
- * The link sanitizer adds a redirector in case of an acceptable protocol, strip otherwise.
- * Any other attributes found will result in the tag being stripped.
- * Any comments will be excluded.
- * Strip closing tags that were not open and close tags that were not closed.
- *
- * @param html_code_orig input html code. Non case sensitive.
- * @param strip if true disallowed tags are stripped otherwise throw an error. Defaults to true.
- * @return output html code
- */
- const std::string restrict_html(const std::string &html_code_orig, bool strip=true);
+
+ const std::string restrict_html(const std::string &html_code_orig,
+ const std::string &redirect_prefix,
+ bool strip=true);
+ const std::string restrict_html(const std::string &html_code_orig,
+ bool strip=true);
std::string decode_url(std::string s);
BOOST_AUTO_TEST_CASE(BasicTest)
{
- string output = restrict_html("<h1>Table</h1><table> <tr><th>Month</th> <th>Savings</th> </tr> <tr> <td>January</td> <td>$100</td> </tr></table> <p>Paragraph with a <a href=\"https://example.de\"><i>Acceptable Link</i></a>.</p> <ul> <li>Coffee</li> <li>Tea</li> <li>Milk</li></ul>");
- BOOST_CHECK_EQUAL(string("<h1>Table</h1><table> <tr><th>Month</th> <th>Savings</th> </tr> <tr> <td>January</td> <td>$100</td> </tr></table> <p>Paragraph with a <a href=\"/arnie?form=redirect&url=https%3A%2F%2Fexample%2Ede\" target=_blank><i>Acceptable Link</i></a>.</p> <ul> <li>Coffee</li> <li>Tea</li> <li>Milk</li></ul>"), output);
+ string output = restrict_html("<h1>Table</h1><table> <tr><th>Month</th> "
+ "<th>Savings</th> </tr> <tr> <td>January"
+ "</td> <td>$100</td> </tr></table> <p>Par"
+ "agraph with a <a href=\"https://example.de\""
+ "><i>Acceptable Link</i></a>.</p> <ul> <li>C"
+ "offee</li> <li>Tea</li> <li>Milk</li></ul"
+ ">");
+ BOOST_CHECK_EQUAL(string("<h1>Table</h1><table> <tr><th>Month</th> <th>"
+ "Savings</th> </tr> <tr> <td>January</td> "
+ " <td>$100</td> </tr></table> <p>Paragraph with "
+ "a <a href=\"/arnie?form=redirect&url=https%3A%2"
+ "F%2Fexample%2Ede\" target=_blank><i>Acceptable L"
+ "ink</i></a>.</p> <ul> <li>Coffee</li> <li>Tea"
+ "</li> <li>Milk</li></ul>"), output);
+}
+
+BOOST_AUTO_TEST_CASE(SpecialCharacters)
+{
+ string output = restrict_html("<h1>Letters: äöü</h1>");
+ BOOST_CHECK_EQUAL(string("<h1>Letters: äöü</h1>"), output);
+}
+
+BOOST_AUTO_TEST_CASE(SpecialCharacters2)
+{
+ string output = restrict_html("<h1>Letters: äöü</h1>");
+ BOOST_CHECK_EQUAL(string("<h1>Letters: äöü</h1>"), output);
+}
+
+BOOST_AUTO_TEST_CASE(SpecialCharacters3)
+{
+ string output = restrict_html("<h1>Letters: <br> Ampersand (&) test </h1>"
+ "<p> Others: ' / " }");
+ BOOST_CHECK_EQUAL(string("<h1>Letters: <br> Ampersand (&) test </h1>"
+ "<p> Others: ' / " }</p>"),
+ output);
}
BOOST_AUTO_TEST_CASE(ScriptInjection)
BOOST_AUTO_TEST_CASE(ExtraAttribute)
{
- string output = restrict_html("<a onclick=\"evil\" href=\"http://i2n.de/\">test</a>");
+ string output = restrict_html("<a onclick=\"evil\" href=\"http://i2n.de/\">"
+ "test</a>");
BOOST_CHECK_EQUAL(string("test"), output);
}
BOOST_AUTO_TEST_CASE(ExtraAttribute2)
{
- string output = restrict_html("<a href=\"http://site.com/dir\" onclick=\"evil\">test</a>");
+ string output = restrict_html("<a href=\"http://site.com/dir\" onclick=\"ev"
+ "il\">test</a>");
BOOST_CHECK_EQUAL(string("test"), output);
}
BOOST_AUTO_TEST_CASE(ExtraAttribute3)
{
- string output = restrict_html("<a href=\"http://site.com/dir\"onclick=\"evil\">test</a>");
+ string output = restrict_html("<a href=\"http://site.com/dir\"onclick=\"evi"
+ "l\">test</a>");
BOOST_CHECK_EQUAL(string("test"), output);
}
BOOST_AUTO_TEST_CASE(AhrefLink)
{
string output = restrict_html("<a href=\"http://i2n.de/\" >test</a>");
- BOOST_CHECK_EQUAL(string("<a href=\"/arnie?form=redirect&url=http%3A%2F%2Fi2n%2Ede%2F\" target=_blank>test</a>"), output);
+ BOOST_CHECK_EQUAL(string("<a href=\"/arnie?form=redirect&url=http%3A%2F%2Fi"
+ "2n%2Ede%2F\" target=_blank>test</a>"), output);
}
BOOST_AUTO_TEST_CASE(AhrefLink2)
{
- string output = restrict_html("<a href=http://;href=javascript:alert(String.fromCharCode(88,83,83)) target=_blank>test</a>");
+ string output = restrict_html("<a href=http://;href=javascript:alert(String"
+ ".fromCharCode(88,83,83)) target=_blank>test</a>");
+ BOOST_CHECK_EQUAL(string("test"), output);
+}
+
+BOOST_AUTO_TEST_CASE(AhrefLink3)
+{
+ string output = restrict_html("<a href=\"http://www.test.com/form\">test</a>");
+ BOOST_CHECK_EQUAL(string("<a href=\"/arnie?form=redirect&url=http%3A%2F%2Fw"
+ "ww%2Etest%2Ecom%2Fform\" target=_blank>test</a>"),
+ output);
+}
+
+BOOST_AUTO_TEST_CASE(AhrefLinkIPAddress)
+{
+ string output = restrict_html("<a href=\"http://192.168.10.10:8080/form\">"
+ "test</a>");
+ BOOST_CHECK_EQUAL(string("<a href=\"/arnie?form=redirect&url=http%3A%2F%2F"
+ "192%2E168%2E10%2E10%3A8080%2Fform\" target=_blan"
+ "k>test</a>"), output);
+}
+
+BOOST_AUTO_TEST_CASE(AhrefLinkIPAddressNoPort)
+{
+ string output = restrict_html("<a href=\"http://192.168.10.10\">test</a>");
BOOST_CHECK_EQUAL(string("test"), output);
}
BOOST_AUTO_TEST_CASE(AhrefProtocol)
{
string output = restrict_html("<a href=\"http://www.foo.com\">foo</a>");
- BOOST_CHECK_EQUAL(string("<a href=\"/arnie?form=redirect&url=http%3A%2F%2Fwww%2Efoo%2Ecom\" target=_blank>foo</a>"), output);
+ BOOST_CHECK_EQUAL(string("<a href=\"/arnie?form=redirect&url=http%3A%2F%2F"
+ "www%2Efoo%2Ecom\" target=_blank>foo</a>"), output);
}
BOOST_AUTO_TEST_CASE(AhrefWrongProtocol)
BOOST_AUTO_TEST_CASE(UnclosedTags)
{
string output = restrict_html("<table><tr><td><h1>Test</h1>");
- BOOST_CHECK_EQUAL(string("<table><tr><td><h1>Test</h1></td></tr></table>"), output);
+ BOOST_CHECK_EQUAL(string("<table><tr><td><h1>Test</h1></td></tr></table>"),
+ output);
}
BOOST_AUTO_TEST_CASE(UnopenedTags)
BOOST_AUTO_TEST_CASE(UnsafeURLChars2)
{
- string output = restrict_html("<a href=http://aa.com\nonclick=\\u0061> Test Me!!</a>");
+ string output = restrict_html("<a href=http://aa.com\nonclick=\\u0061> Test"
+ " Me!!</a>");
BOOST_CHECK_EQUAL(string(" Test Me!!"), output);
}
BOOST_AUTO_TEST_CASE(EncodedURL)
{
- string output = restrict_html("<A HREF=\"http://%77%77%77%2E%67%6F%6F%67%6C%65%2E%63%6F%6D\">test</A>");
- BOOST_CHECK_EQUAL(string("<a href=\"/arnie?form=redirect&url=http%3A%2F%2Fwww%2Egoogle%2Ecom\" target=_blank>test</a>"), output);
+ string output = restrict_html("<A HREF=\"http://%77%77%77%2E%67%6F%6F%67%6C"
+ "%65%2E%63%6F%6D\">test</A>");
+ BOOST_CHECK_EQUAL(string("<a href=\"/arnie?form=redirect&url=http%3A%2F%2Fw"
+ "ww%2Egoogle%2Ecom\" target=_blank>test</a>"),
+ output);
}
BOOST_AUTO_TEST_CASE(InvalidRedirection)
{
- string output = restrict_html("<a href=\"/arnie?form=redirect&url=javascript:alert(String.fromCharCode(88,83,83))\">test</a>");
+ string output = restrict_html("<a href=\"/arnie?form=redirect&url=javascrip"
+ "t:alert(String.fromCharCode(88,83,83))\">tes"
+ "t</a>");
BOOST_CHECK_EQUAL(string("test"), output);
}
BOOST_AUTO_TEST_CASE(InvalidRedirection2)
{
- string output = restrict_html("<a href=/arnie?form=redirect&url=http://something.com;href=javascript:alert(1)>test</a>");
+ string output = restrict_html("<a href=/arnie?form=redirect&url=http://some"
+ "thing.com;href=javascript:alert(1)>test</a>");
BOOST_CHECK_EQUAL(string("test"), output);
}
BOOST_AUTO_TEST_CASE(InvalidTag)
{
- string output = restrict_html("<STYLE TYPE=\"text/javascript\">alert('XSS');</STYLE>");
+ string output = restrict_html("<STYLE TYPE=\"text/javascript\">alert('XSS')"
+ ";</STYLE>");
BOOST_CHECK_EQUAL(string("alert('XSS');"), output);
}
BOOST_AUTO_TEST_CASE(InvalidTag2)
{
- string output = restrict_html("<IMG SRC=javascript:alert(String.fromCharCode(88,83,83))>");
+ string output = restrict_html("<IMG SRC=javascript:alert(String.fromCharCod"
+ "e(88,83,83))>");
BOOST_CHECK_EQUAL(string(""), output);
}
+BOOST_AUTO_TEST_CASE(NonStripMode)
+{
+ BOOST_CHECK_THROW(restrict_html("<a href=\"http://site.com/dir\"onclick=\"e"
+ "vil\">test</a>", false), runtime_error);
+}
+
+BOOST_AUTO_TEST_CASE(NonStripMode2)
+{
+ BOOST_CHECK_THROW(restrict_html("<IMG SRC=javascript:alert(String.fromCharC"
+ "ode(88,83,83))>", false), runtime_error);
+}
+
+BOOST_AUTO_TEST_CASE(NonStripMode3)
+{
+ BOOST_CHECK_NO_THROW(restrict_html("<a href=\"http://192.168.10.10:8080/for"
+ "m\">test</a>", false));
+}
+
BOOST_AUTO_TEST_CASE(DecodeStringURL)
{
string output = decode_url("%77%77%77%2E%67%6F%6F%67%6C%65%2E%63%6F%6D");