From: Juliana Rodrigueiro Date: Wed, 8 Aug 2018 12:50:40 +0000 (+0200) Subject: Improve url regex and add documentation X-Git-Url: http://developer.intra2net.com/git/?a=commitdiff_plain;h=596f2c353d21d020c2399d066189fb53339cca50;p=libi2ncommon Improve url regex and add documentation Fix other minor things. --- diff --git a/src/restricted_html.cpp b/src/restricted_html.cpp index 52e68ec..28c1eed 100644 --- a/src/restricted_html.cpp +++ b/src/restricted_html.cpp @@ -20,38 +20,59 @@ on this file might be covered by the GNU General Public License. /** @file * @brief restricts html messages to an allowed group of tags. * + * See examples of acceptable and non-acceptable HTML code in the unit tests + * for this module. + * * @copyright © Copyright 2017 Intra2net AG * */ -#include -#include #include +#include +#include +#include #include #include #include +#include #include #include using namespace std; -typedef pair TOKEN; -const vector ALLOWED_TAGS = {"h1", "h2", "h3", "h4", "h5", "h6", - "a", "p", "br", "i", "ul", "li", - "table", "tr", "th", "td" - }; +namespace I2n +{ + +namespace +{ + +/** +* @brief Token is composed by a key string and a flag signalizing if it +* is an html tag, ie., starts with "<" and finishes with ">". +* Token is the type value returned by the I2n::tokenize_by_tag function. +*/ +typedef pair Token; + +*/ + +const set ALLOWED_TAGS = boost::assign::list_of("h1")("h2")("h3")("h4") + ("h5")("h6")("a")("p") + ("br")("i")("ul")("li") + ("tr")("th")("td") + ("table"); const string AHREF = " /** * @brief Verifies if a html "a" tag has a valid link and sanitize it if necessary. @@ -59,21 +80,23 @@ namespace I2n * Example: * returns * - * @param tag html "a" tag. - * @return true if link inside "a" tag is valid. False otherwise. + * @param tag (output) html "a" tag. If valid, it may be modified to regularize + * and encode the URL. + * @param redirect_prefix Prefix that will be added to the url in case it is valid. + * @return bool true if link inside "a" tag is valid. False otherwise. */ -bool link_sanitizer(string &tag) +bool link_sanitizer(string &tag, const std::string &redirect_prefix) { // tag = string link = tag.substr(AHREF.size()); - if (link.find_first_of("\"") == 0) + if (link.find('\"') == 0) { - size_t pos = link.find_first_of("\"", 1); + size_t pos = link.find('\"', 1); if (pos == string::npos) return false; // Quotation mark never closes. string end(link, pos+1); - if (end.compare(" >") != 0 && end.compare(">") != 0) + if (end != " >" && end != ">") return false; //Probably extra attributes. Or " inside the link (invalid). link = link.substr(1, pos -1); @@ -87,6 +110,10 @@ bool link_sanitizer(string &tag) link = link.substr(0, space); } + // Check if the link already has a known prefix. + // See "InvalidRedirection" unit test. + if (has_prefix(link, redirect_prefix)) + link = remove_prefix(link, redirect_prefix); if (has_prefix(link, REDIRECT_PREFIX)) link = remove_prefix(link, REDIRECT_PREFIX); @@ -98,7 +125,8 @@ bool link_sanitizer(string &tag) if (link.find("javascript:") != string::npos) return false; - tag = AHREF + "\"" + REDIRECT_PREFIX + encode_url(link) + "\" " + TARGET_BLANK + ">"; + tag = AHREF + "\"" + redirect_prefix + encode_url(link) + "\" " + + TARGET_BLANK + ">"; return true; } @@ -120,25 +148,38 @@ bool is_tag_allowed(string tag) replace_all(tag, "/", ""); to_lower_mod(tag); - BOOST_FOREACH(const string &a_tag, ALLOWED_TAGS) - { - if (tag.compare(a_tag) == 0) - return true; - } - - return false; + return ALLOWED_TAGS.find(tag) != ALLOWED_TAGS.end(); } -const string restrict_html(const string &html_code_orig, bool strip) +/** +* @brief Restricts html code to a small list of allowed tags. +* The attribute "href" from the tag "a" has its value sanitized and if it +* contains unsafe characters, the tag is stripped. +* The link sanitizer adds a redirector in case of an acceptable protocol, strip +* otherwise. +* Any other attributes found will result in the tag being stripped. +* Any comments will be excluded. +* Strip closing tags that were not open and close tags that were not closed. +* +* @param html_code_orig input html code. Non case sensitive. +* @param redirect_prefix Prefix that will be added to the url in case of a valid +* html "a" tag. +* @param strip if true disallowed tags are stripped otherwise throw an error. +* Defaults to true. +* @return output html code +*/ +const string restrict_html(const string &html_code_orig, + const std::string &redirect_prefix, + bool strip) { string html_code(html_code_orig); remove_html_comments (html_code); - vector tokenized; + vector tokenized; tokenize_by_tag (tokenized, html_code); string result = ""; vector expected_tags; - BOOST_FOREACH(const TOKEN &s, tokenized) + BOOST_FOREACH(const Token &s, tokenized) { if (!s.second) { @@ -147,23 +188,32 @@ const string restrict_html(const string &html_code_orig, bool strip) } if (is_tag_allowed(s.first) ) { + // Checks if this tag (s.first) is not a closing tag. if (s.first.compare(0, 2, "") != 0) + // The tag "
" does not have a closing tag, so don't push it + // into the expected tags vector. + if (to_lower(s.first) != "
") expected_tags.push_back(" 0 && expected_tags.back().compare(s.first) == 0) + // If it is a closing tag, check if this is the expected one. + // We can't accept an closing tag that was never opened. + else if (expected_tags.size() > 0 && expected_tags.back() == s.first) { + // When the closing tag is indeed what we expect, pop it out + // from the LIFO queue. result = result + s.first; expected_tags.pop_back(); continue; } } + // Copy s.first to a string since it might be modified by link_sanitizer. string tag = s.first; - if (to_lower(s.first).compare(0, AHREF.size(), AHREF) == 0 && link_sanitizer(tag)) + if (to_lower_mod(tag).compare(0, AHREF.size(), AHREF) == 0 + && link_sanitizer(tag, redirect_prefix)) { result = result + tag; expected_tags.push_back("
"); @@ -185,6 +235,11 @@ const string restrict_html(const string &html_code_orig, bool strip) return result.c_str(); } +const string restrict_html(const string &html_code_orig, bool strip) +{ + return restrict_html(html_code_orig, REDIRECT_PREFIX, strip); +} + /** * @brief Replace all "+" characters found in s to spaces (" "). * diff --git a/src/restricted_html.hpp b/src/restricted_html.hpp index 00140ac..fb1277d 100644 --- a/src/restricted_html.hpp +++ b/src/restricted_html.hpp @@ -20,26 +20,6 @@ on this file might be covered by the GNU General Public License. /** @file * @brief restricts html code to an allowed group of tags. * - * Example of acceptable html code: - * -

Restrict HTML

- - - - - - - - - -
TagsProtocos
Link$10000
-

Paragraph with Link.

-
-
    -
  • Tags
  • -
- * - * * @copyright © Copyright 2017 Intra2net AG * */ @@ -51,20 +31,12 @@ on this file might be covered by the GNU General Public License. namespace I2n { - /** - * @brief Restricts html code to a small list of allowed tags. - * The attribute "href" from the tag "a" has its value sanitized and if it - * contains unsafe characters, the tag is stripped. - * The link sanitizer adds a redirector in case of an acceptable protocol, strip otherwise. - * Any other attributes found will result in the tag being stripped. - * Any comments will be excluded. - * Strip closing tags that were not open and close tags that were not closed. - * - * @param html_code_orig input html code. Non case sensitive. - * @param strip if true disallowed tags are stripped otherwise throw an error. Defaults to true. - * @return output html code - */ - const std::string restrict_html(const std::string &html_code_orig, bool strip=true); + + const std::string restrict_html(const std::string &html_code_orig, + const std::string &redirect_prefix, + bool strip=true); + const std::string restrict_html(const std::string &html_code_orig, + bool strip=true); std::string decode_url(std::string s); diff --git a/test/test_restricted_html.cpp b/test/test_restricted_html.cpp index b4d21ae..65f4dcd 100644 --- a/test/test_restricted_html.cpp +++ b/test/test_restricted_html.cpp @@ -35,8 +35,41 @@ BOOST_AUTO_TEST_SUITE(test_restricted_html) BOOST_AUTO_TEST_CASE(BasicTest) { - string output = restrict_html("

Table

Month Savings
January $100

Paragraph with a Acceptable Link.

  • Coffee
  • Tea
  • Milk
"); - BOOST_CHECK_EQUAL(string("

Table

Month Savings
January $100

Paragraph with a Acceptable Link.

  • Coffee
  • Tea
  • Milk
"), output); + string output = restrict_html("

Table

" + "
MonthSavings
January" + " $100

Par" + "agraph with a Acceptable Link.

  • C" + "offee
  • Tea
  • Milk
  • "); + BOOST_CHECK_EQUAL(string("

    Table

    " + "
    Month " + "Savings
    January$100

    Paragraph with " + "a Acceptable L" + "ink.

    • Coffee
    • Tea" + "
    • Milk
    "), output); +} + +BOOST_AUTO_TEST_CASE(SpecialCharacters) +{ + string output = restrict_html("

    Letters: äöü

    "); + BOOST_CHECK_EQUAL(string("

    Letters: äöü

    "), output); +} + +BOOST_AUTO_TEST_CASE(SpecialCharacters2) +{ + string output = restrict_html("

    Letters: äöü

    "); + BOOST_CHECK_EQUAL(string("

    Letters: äöü

    "), output); +} + +BOOST_AUTO_TEST_CASE(SpecialCharacters3) +{ + string output = restrict_html("

    Letters:
    Ampersand (&) test

    " + "

    Others: ' / " }"); + BOOST_CHECK_EQUAL(string("

    Letters:
    Ampersand (&) test

    " + "

    Others: ' / " }

    "), + output); } BOOST_AUTO_TEST_CASE(ScriptInjection) @@ -71,38 +104,67 @@ BOOST_AUTO_TEST_CASE(NestedScript4) BOOST_AUTO_TEST_CASE(ExtraAttribute) { - string output = restrict_html("test"); + string output = restrict_html("" + "test"); BOOST_CHECK_EQUAL(string("test"), output); } BOOST_AUTO_TEST_CASE(ExtraAttribute2) { - string output = restrict_html("test"); + string output = restrict_html("test"); BOOST_CHECK_EQUAL(string("test"), output); } BOOST_AUTO_TEST_CASE(ExtraAttribute3) { - string output = restrict_html("test"); + string output = restrict_html("test"); BOOST_CHECK_EQUAL(string("test"), output); } BOOST_AUTO_TEST_CASE(AhrefLink) { string output = restrict_html("test"); - BOOST_CHECK_EQUAL(string("test"), output); + BOOST_CHECK_EQUAL(string("test"), output); } BOOST_AUTO_TEST_CASE(AhrefLink2) { - string output = restrict_html("test"); + string output = restrict_html("test"); + BOOST_CHECK_EQUAL(string("test"), output); +} + +BOOST_AUTO_TEST_CASE(AhrefLink3) +{ + string output = restrict_html("test"); + BOOST_CHECK_EQUAL(string("test"), + output); +} + +BOOST_AUTO_TEST_CASE(AhrefLinkIPAddress) +{ + string output = restrict_html("" + "test"); + BOOST_CHECK_EQUAL(string("test"), output); +} + +BOOST_AUTO_TEST_CASE(AhrefLinkIPAddressNoPort) +{ + string output = restrict_html("test"); BOOST_CHECK_EQUAL(string("test"), output); } BOOST_AUTO_TEST_CASE(AhrefProtocol) { string output = restrict_html("foo"); - BOOST_CHECK_EQUAL(string("foo"), output); + BOOST_CHECK_EQUAL(string("foo"), output); } BOOST_AUTO_TEST_CASE(AhrefWrongProtocol) @@ -114,7 +176,8 @@ BOOST_AUTO_TEST_CASE(AhrefWrongProtocol) BOOST_AUTO_TEST_CASE(UnclosedTags) { string output = restrict_html("

    Test

    "); - BOOST_CHECK_EQUAL(string("

    Test

    "), output); + BOOST_CHECK_EQUAL(string("

    Test

    "), + output); } BOOST_AUTO_TEST_CASE(UnopenedTags) @@ -131,40 +194,67 @@ BOOST_AUTO_TEST_CASE(UnsafeURLChars) BOOST_AUTO_TEST_CASE(UnsafeURLChars2) { - string output = restrict_html(" Test Me!!"); + string output = restrict_html(" Test" + " Me!!"); BOOST_CHECK_EQUAL(string(" Test Me!!"), output); } BOOST_AUTO_TEST_CASE(EncodedURL) { - string output = restrict_html("test"); - BOOST_CHECK_EQUAL(string("test"), output); + string output = restrict_html("test"); + BOOST_CHECK_EQUAL(string("test"), + output); } BOOST_AUTO_TEST_CASE(InvalidRedirection) { - string output = restrict_html("test"); + string output = restrict_html("tes" + "t"); BOOST_CHECK_EQUAL(string("test"), output); } BOOST_AUTO_TEST_CASE(InvalidRedirection2) { - string output = restrict_html("test"); + string output = restrict_html("test"); BOOST_CHECK_EQUAL(string("test"), output); } BOOST_AUTO_TEST_CASE(InvalidTag) { - string output = restrict_html(""); + string output = restrict_html(""); BOOST_CHECK_EQUAL(string("alert('XSS');"), output); } BOOST_AUTO_TEST_CASE(InvalidTag2) { - string output = restrict_html(""); + string output = restrict_html(""); BOOST_CHECK_EQUAL(string(""), output); } +BOOST_AUTO_TEST_CASE(NonStripMode) +{ + BOOST_CHECK_THROW(restrict_html("test", false), runtime_error); +} + +BOOST_AUTO_TEST_CASE(NonStripMode2) +{ + BOOST_CHECK_THROW(restrict_html("", false), runtime_error); +} + +BOOST_AUTO_TEST_CASE(NonStripMode3) +{ + BOOST_CHECK_NO_THROW(restrict_html("test", false)); +} + BOOST_AUTO_TEST_CASE(DecodeStringURL) { string output = decode_url("%77%77%77%2E%67%6F%6F%67%6C%65%2E%63%6F%6D");