Improve url regex and add documentation
authorJuliana Rodrigueiro <juliana.rodrigueiro@intra2net.com>
Wed, 8 Aug 2018 12:50:40 +0000 (14:50 +0200)
committerJuliana Rodrigueiro <juliana.rodrigueiro@intra2net.com>
Wed, 19 Sep 2018 15:46:17 +0000 (17:46 +0200)
Fix other minor things.

src/restricted_html.cpp
src/restricted_html.hpp
test/test_restricted_html.cpp

index 52e68ec..28c1eed 100644 (file)
@@ -20,38 +20,59 @@ on this file might be covered by the GNU General Public License.
 /** @file
  * @brief restricts html messages to an allowed group of tags.
  *
+ * See examples of acceptable and non-acceptable HTML code in the unit tests
+ * for this module.
+ *
  * @copyright &copy; Copyright 2017 Intra2net AG
  *
  */
 
-#include <string>
-#include <sstream>
 #include <iomanip>
+#include <set>
+#include <sstream>
+#include <string>
 #include <vector>
 
 #include <pcrecpp.h>
 #include <boost/foreach.hpp>
+#include <boost/assign/list_of.hpp>
 
 #include <stringfunc.hxx>
 #include <restricted_html.hpp>
 
 using namespace std;
 
-typedef pair<string,bool> TOKEN;
 
-const vector<string> ALLOWED_TAGS = {"h1", "h2", "h3", "h4", "h5", "h6",
-                                     "a", "p", "br", "i", "ul", "li",
-                                     "table", "tr", "th", "td"
-                                    };
+namespace I2n
+{
+
+namespace
+{
+
+/**
+* @brief Token is composed by a key string and a flag signalizing if it
+* is an html tag, ie., starts with "<" and finishes with ">".
+* Token is the type value returned by the I2n::tokenize_by_tag function.
+*/
+typedef pair<string,bool> Token;
+
+*/
+
+const set<string> ALLOWED_TAGS = boost::assign::list_of("h1")("h2")("h3")("h4")
+                                                       ("h5")("h6")("a")("p")
+                                                       ("br")("i")("ul")("li")
+                                                       ("tr")("th")("td")
+                                                       ("table");
 const string AHREF = "<a href=";
 const string REDIRECT_PREFIX = "/arnie?form=redirect&url=";
 const string TARGET_BLANK = "target=_blank";
-const pcrecpp::RE SAFE_URL("^http(s)?:\\/\\/[\\w.-]+(?:\\.[\\w\\.-]+)+"
-                           "[\\w\\-\\._~:/?#@!\\$&'\\(\\)\\*\\+,;=.]+$");
+const pcrecpp::RE SAFE_URL("^(http(s?):\\/\\/)(([a-zA-Z0-9\\.\\-\\_]+(\\.[a-zA-"
+                           "Z]{2,3})+)|((?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-"
+                           "9]?)\\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)"
+                           ":[0-9]{1,5}))(\\/[a-zA-Z0-9\\@\\+\\_\\-\\.\\/\\?\\%"
+                           "\\#\\&\\=]*)?$");
 
-
-namespace I2n
-{
+} // eo namespace <anonymous>
 
 /**
  * @brief Verifies if a html "a" tag has a valid link and sanitize it if necessary.
@@ -59,21 +80,23 @@ namespace I2n
  * Example: <a href="http://somelink.com">
  * returns <a href="/arnie?form=redirect&url=http://somelink.com" target=_blank>
  *
- * @param tag html "a" tag.
- * @return true if link inside "a" tag is valid. False otherwise.
+ * @param tag (output) html "a" tag. If valid, it may be modified to regularize
+ * and encode the URL.
+ * @param redirect_prefix Prefix that will be added to the url in case it is valid.
+ * @return bool true if link inside "a" tag is valid. False otherwise.
  */
-bool link_sanitizer(string &tag)
+bool link_sanitizer(string &tag, const std::string &redirect_prefix)
 {
     // tag = <a href="somelink">
     string link = tag.substr(AHREF.size());
-    if (link.find_first_of("\"") == 0)
+    if (link.find('\"') == 0)
     {
-        size_t pos = link.find_first_of("\"", 1);
+        size_t pos = link.find('\"', 1);
         if (pos == string::npos)
             return false; // Quotation mark never closes.
 
         string end(link, pos+1);
-        if (end.compare(" >") != 0 && end.compare(">") != 0)
+        if (end != " >" && end != ">")
             return false; //Probably extra attributes. Or " inside the link (invalid).
 
         link = link.substr(1, pos -1);
@@ -87,6 +110,10 @@ bool link_sanitizer(string &tag)
         link = link.substr(0, space);
     }
 
+    // Check if the link already has a known prefix.
+    // See "InvalidRedirection" unit test.
+    if (has_prefix(link, redirect_prefix))
+        link = remove_prefix(link, redirect_prefix);
     if (has_prefix(link, REDIRECT_PREFIX))
         link = remove_prefix(link, REDIRECT_PREFIX);
 
@@ -98,7 +125,8 @@ bool link_sanitizer(string &tag)
     if (link.find("javascript:") != string::npos)
         return false;
 
-    tag = AHREF + "\"" + REDIRECT_PREFIX + encode_url(link) + "\" " + TARGET_BLANK + ">";
+    tag = AHREF + "\"" + redirect_prefix + encode_url(link) + "\" "
+        + TARGET_BLANK + ">";
 
     return true;
 }
@@ -120,25 +148,38 @@ bool is_tag_allowed(string tag)
     replace_all(tag, "/", "");
     to_lower_mod(tag);
 
-    BOOST_FOREACH(const string &a_tag, ALLOWED_TAGS)
-    {
-        if (tag.compare(a_tag) == 0)
-            return true;
-    }
-
-    return false;
+    return ALLOWED_TAGS.find(tag) != ALLOWED_TAGS.end();
 }
 
-const string restrict_html(const string &html_code_orig, bool strip)
+/**
+* @brief Restricts html code to a small list of allowed tags.
+* The attribute "href" from the tag "a" has its value sanitized and if it
+* contains unsafe characters, the tag is stripped.
+* The link sanitizer adds a redirector in case of an acceptable protocol, strip
+* otherwise.
+* Any other attributes found will result in the tag being stripped.
+* Any comments will be excluded.
+* Strip closing tags that were not open and close tags that were not closed.
+*
+* @param html_code_orig input html code. Non case sensitive.
+* @param redirect_prefix Prefix that will be added to the url in case of a valid
+* html "a" tag.
+* @param strip if true disallowed tags are stripped otherwise throw an error.
+* Defaults to true.
+* @return output html code
+*/
+const string restrict_html(const string &html_code_orig,
+                           const std::string &redirect_prefix,
+                           bool strip)
 {
     string html_code(html_code_orig);
     remove_html_comments (html_code);
-    vector<TOKEN > tokenized;
+    vector<Token > tokenized;
     tokenize_by_tag (tokenized, html_code);
     string result = "";
 
     vector<string> expected_tags;
-    BOOST_FOREACH(const TOKEN &s, tokenized)
+    BOOST_FOREACH(const Token &s, tokenized)
     {
         if (!s.second)
         {
@@ -147,23 +188,32 @@ const string restrict_html(const string &html_code_orig, bool strip)
         }
         if (is_tag_allowed(s.first) )
         {
+            // Checks if this tag (s.first) is not a closing tag.
             if (s.first.compare(0, 2, "</") != 0)
             {
                 result = result + s.first;
-                if (to_lower(s.first).compare("<br>") != 0)
+                // The tag "<br>" does not have a closing tag, so don't push it
+                // into the expected tags vector.
+                if (to_lower(s.first) != "<br>")
                     expected_tags.push_back("</"+s.first.substr(1));
                 continue;
             }
-            else if (expected_tags.size() > 0 && expected_tags.back().compare(s.first) == 0)
+            // If it is a closing tag, check if this is the expected one.
+            // We can't accept an closing tag that was never opened.
+            else if (expected_tags.size() > 0 && expected_tags.back() == s.first)
             {
+                // When the closing tag is indeed what we expect, pop it out
+                // from the LIFO queue.
                 result = result + s.first;
                 expected_tags.pop_back();
                 continue;
             }
         }
 
+        // Copy s.first to a string since it might be modified by link_sanitizer.
         string tag = s.first;
-        if (to_lower(s.first).compare(0, AHREF.size(), AHREF) == 0 && link_sanitizer(tag))
+        if (to_lower_mod(tag).compare(0, AHREF.size(), AHREF) == 0
+            && link_sanitizer(tag, redirect_prefix))
         {
             result = result + tag;
             expected_tags.push_back("</a>");
@@ -185,6 +235,11 @@ const string restrict_html(const string &html_code_orig, bool strip)
     return result.c_str();
 }
 
+const string restrict_html(const string &html_code_orig, bool strip)
+{
+    return restrict_html(html_code_orig, REDIRECT_PREFIX, strip);
+}
+
 /**
  * @brief Replace all "+" characters found in s to spaces (" ").
  *
index 00140ac..fb1277d 100644 (file)
@@ -20,26 +20,6 @@ on this file might be covered by the GNU General Public License.
 /** @file
  * @brief restricts html code to an allowed group of tags.
  *
- * Example of acceptable html code:
- *
-    <h1>Restrict HTML</h1>
-    <table>
-        <tr>
-            <th>Tags</th>
-            <th>Protocos</th>
-        </tr>
-        <tr>
-            <td>Link</td>
-            <td>$10000</td>
-        </tr>
-    </table>
-    <p>Paragraph with <a href="https://example.de"><i>Link</i></a>.</p>
-    <br>
-    <ul>
-        <li>Tags</li>
-    </ul>
- *
- *
  * @copyright &copy; Copyright 2017 Intra2net AG
  *
  */
@@ -51,20 +31,12 @@ on this file might be covered by the GNU General Public License.
 
 namespace I2n
 {
-    /**
-     * @brief Restricts html code to a small list of allowed tags.
-     * The attribute "href" from the tag "a" has its value sanitized and if it
-     * contains unsafe characters, the tag is stripped.
-     * The link sanitizer adds a redirector in case of an acceptable protocol, strip otherwise.
-     * Any other attributes found will result in the tag being stripped.
-     * Any comments will be excluded.
-     * Strip closing tags that were not open and close tags that were not closed.
-     *
-     * @param html_code_orig input html code. Non case sensitive.
-     * @param strip if true disallowed tags are stripped otherwise throw an error. Defaults to true.
-     * @return output html code
-     */
-    const std::string restrict_html(const std::string &html_code_orig, bool strip=true);
+
+    const std::string restrict_html(const std::string &html_code_orig,
+                                    const std::string &redirect_prefix,
+                                    bool strip=true);
+    const std::string restrict_html(const std::string &html_code_orig,
+                                    bool strip=true);
 
     std::string decode_url(std::string s);
 
index b4d21ae..65f4dcd 100644 (file)
@@ -35,8 +35,41 @@ BOOST_AUTO_TEST_SUITE(test_restricted_html)
 
 BOOST_AUTO_TEST_CASE(BasicTest)
 {
-    string output = restrict_html("<h1>Table</h1><table>  <tr><th>Month</th>    <th>Savings</th>  </tr>  <tr>    <td>January</td>    <td>$100</td>  </tr></table> <p>Paragraph with a <a href=\"https://example.de\"><i>Acceptable Link</i></a>.</p> <ul>  <li>Coffee</li>   <li>Tea</li>  <li>Milk</li></ul>");
-    BOOST_CHECK_EQUAL(string("<h1>Table</h1><table>  <tr><th>Month</th>    <th>Savings</th>  </tr>  <tr>    <td>January</td>    <td>$100</td>  </tr></table> <p>Paragraph with a <a href=\"/arnie?form=redirect&url=https%3A%2F%2Fexample%2Ede\" target=_blank><i>Acceptable Link</i></a>.</p> <ul>  <li>Coffee</li>   <li>Tea</li>  <li>Milk</li></ul>"), output);
+    string output = restrict_html("<h1>Table</h1><table>  <tr><th>Month</th>   "
+                                  "<th>Savings</th>  </tr>  <tr>    <td>January"
+                                  "</td>    <td>$100</td>  </tr></table> <p>Par"
+                                  "agraph with a  <a href=\"https://example.de\""
+                                  "><i>Acceptable Link</i></a>.</p> <ul>  <li>C"
+                                  "offee</li>   <li>Tea</li>  <li>Milk</li></ul"
+                                  ">");
+    BOOST_CHECK_EQUAL(string("<h1>Table</h1><table>  <tr><th>Month</th>   <th>"
+                             "Savings</th>  </tr>  <tr>    <td>January</td>   "
+                             " <td>$100</td>  </tr></table> <p>Paragraph with "
+                             "a  <a href=\"/arnie?form=redirect&url=https%3A%2"
+                             "F%2Fexample%2Ede\" target=_blank><i>Acceptable L"
+                             "ink</i></a>.</p> <ul>  <li>Coffee</li>   <li>Tea"
+                             "</li>  <li>Milk</li></ul>"), output);
+}
+
+BOOST_AUTO_TEST_CASE(SpecialCharacters)
+{
+    string output = restrict_html("<h1>Letters:&nbsp;&auml;&ouml;&uuml;</h1>");
+    BOOST_CHECK_EQUAL(string("<h1>Letters:&nbsp;&auml;&ouml;&uuml;</h1>"), output);
+}
+
+BOOST_AUTO_TEST_CASE(SpecialCharacters2)
+{
+    string output = restrict_html("<h1>Letters: äöü</h1>");
+    BOOST_CHECK_EQUAL(string("<h1>Letters: &auml;&ouml;&uuml;</h1>"), output);
+}
+
+BOOST_AUTO_TEST_CASE(SpecialCharacters3)
+{
+    string output = restrict_html("<h1>Letters: <br> Ampersand (&) test </h1>"
+                                  "<p> Others: &#x27; &#x2F; &quot; &#125;");
+    BOOST_CHECK_EQUAL(string("<h1>Letters: <br> Ampersand (&amp;) test </h1>"
+                             "<p> Others: &#x27; &#x2F; &quot; &#125;</p>"),
+                      output);
 }
 
 BOOST_AUTO_TEST_CASE(ScriptInjection)
@@ -71,38 +104,67 @@ BOOST_AUTO_TEST_CASE(NestedScript4)
 
 BOOST_AUTO_TEST_CASE(ExtraAttribute)
 {
-    string output = restrict_html("<a onclick=\"evil\" href=\"http://i2n.de/\">test</a>");
+    string output = restrict_html("<a onclick=\"evil\" href=\"http://i2n.de/\">"
+                                  "test</a>");
     BOOST_CHECK_EQUAL(string("test"), output);
 }
 
 BOOST_AUTO_TEST_CASE(ExtraAttribute2)
 {
-    string output = restrict_html("<a href=\"http://site.com/dir\" onclick=\"evil\">test</a>");
+    string output = restrict_html("<a href=\"http://site.com/dir\" onclick=\"ev"
+                                  "il\">test</a>");
     BOOST_CHECK_EQUAL(string("test"), output);
 }
 
 BOOST_AUTO_TEST_CASE(ExtraAttribute3)
 {
-    string output = restrict_html("<a href=\"http://site.com/dir\"onclick=\"evil\">test</a>");
+    string output = restrict_html("<a href=\"http://site.com/dir\"onclick=\"evi"
+                                  "l\">test</a>");
     BOOST_CHECK_EQUAL(string("test"), output);
 }
 
 BOOST_AUTO_TEST_CASE(AhrefLink)
 {
     string output = restrict_html("<a href=\"http://i2n.de/\" >test</a>");
-    BOOST_CHECK_EQUAL(string("<a href=\"/arnie?form=redirect&url=http%3A%2F%2Fi2n%2Ede%2F\" target=_blank>test</a>"), output);
+    BOOST_CHECK_EQUAL(string("<a href=\"/arnie?form=redirect&url=http%3A%2F%2Fi"
+                             "2n%2Ede%2F\" target=_blank>test</a>"), output);
 }
 
 BOOST_AUTO_TEST_CASE(AhrefLink2)
 {
-    string output = restrict_html("<a href=http://;href=javascript:alert(String.fromCharCode(88,83,83)) target=_blank>test</a>");
+    string output = restrict_html("<a href=http://;href=javascript:alert(String"
+                                  ".fromCharCode(88,83,83)) target=_blank>test</a>");
+    BOOST_CHECK_EQUAL(string("test"), output);
+}
+
+BOOST_AUTO_TEST_CASE(AhrefLink3)
+{
+    string output = restrict_html("<a href=\"http://www.test.com/form\">test</a>");
+    BOOST_CHECK_EQUAL(string("<a href=\"/arnie?form=redirect&url=http%3A%2F%2Fw"
+                             "ww%2Etest%2Ecom%2Fform\" target=_blank>test</a>"),
+                      output);
+}
+
+BOOST_AUTO_TEST_CASE(AhrefLinkIPAddress)
+{
+    string output = restrict_html("<a href=\"http://192.168.10.10:8080/form\">"
+                                  "test</a>");
+    BOOST_CHECK_EQUAL(string("<a href=\"/arnie?form=redirect&url=http%3A%2F%2F"
+                             "192%2E168%2E10%2E10%3A8080%2Fform\" target=_blan"
+                             "k>test</a>"), output);
+}
+
+BOOST_AUTO_TEST_CASE(AhrefLinkIPAddressNoPort)
+{
+    string output = restrict_html("<a href=\"http://192.168.10.10\">test</a>");
     BOOST_CHECK_EQUAL(string("test"), output);
 }
 
 BOOST_AUTO_TEST_CASE(AhrefProtocol)
 {
     string output = restrict_html("<a href=\"http://www.foo.com\">foo</a>");
-    BOOST_CHECK_EQUAL(string("<a href=\"/arnie?form=redirect&url=http%3A%2F%2Fwww%2Efoo%2Ecom\" target=_blank>foo</a>"), output);
+    BOOST_CHECK_EQUAL(string("<a href=\"/arnie?form=redirect&url=http%3A%2F%2F"
+                             "www%2Efoo%2Ecom\" target=_blank>foo</a>"), output);
 }
 
 BOOST_AUTO_TEST_CASE(AhrefWrongProtocol)
@@ -114,7 +176,8 @@ BOOST_AUTO_TEST_CASE(AhrefWrongProtocol)
 BOOST_AUTO_TEST_CASE(UnclosedTags)
 {
     string output = restrict_html("<table><tr><td><h1>Test</h1>");
-    BOOST_CHECK_EQUAL(string("<table><tr><td><h1>Test</h1></td></tr></table>"), output);
+    BOOST_CHECK_EQUAL(string("<table><tr><td><h1>Test</h1></td></tr></table>"),
+                      output);
 }
 
 BOOST_AUTO_TEST_CASE(UnopenedTags)
@@ -131,40 +194,67 @@ BOOST_AUTO_TEST_CASE(UnsafeURLChars)
 
 BOOST_AUTO_TEST_CASE(UnsafeURLChars2)
 {
-    string output = restrict_html("<a href=http://aa.com\nonclick=\\u0061> Test Me!!</a>");
+    string output = restrict_html("<a href=http://aa.com\nonclick=\\u0061> Test"
+                                  " Me!!</a>");
     BOOST_CHECK_EQUAL(string(" Test Me!!"), output);
 }
 
 BOOST_AUTO_TEST_CASE(EncodedURL)
 {
-    string output = restrict_html("<A HREF=\"http://%77%77%77%2E%67%6F%6F%67%6C%65%2E%63%6F%6D\">test</A>");
-    BOOST_CHECK_EQUAL(string("<a href=\"/arnie?form=redirect&url=http%3A%2F%2Fwww%2Egoogle%2Ecom\" target=_blank>test</a>"), output);
+    string output = restrict_html("<A HREF=\"http://%77%77%77%2E%67%6F%6F%67%6C"
+                                  "%65%2E%63%6F%6D\">test</A>");
+    BOOST_CHECK_EQUAL(string("<a href=\"/arnie?form=redirect&url=http%3A%2F%2Fw"
+                             "ww%2Egoogle%2Ecom\" target=_blank>test</a>"),
+                      output);
 }
 
 BOOST_AUTO_TEST_CASE(InvalidRedirection)
 {
-    string output = restrict_html("<a href=\"/arnie?form=redirect&url=javascript:alert(String.fromCharCode(88,83,83))\">test</a>");
+    string output = restrict_html("<a href=\"/arnie?form=redirect&url=javascrip"
+                                  "t:alert(String.fromCharCode(88,83,83))\">tes"
+                                  "t</a>");
     BOOST_CHECK_EQUAL(string("test"), output);
 }
 
 BOOST_AUTO_TEST_CASE(InvalidRedirection2)
 {
-    string output = restrict_html("<a href=/arnie?form=redirect&url=http://something.com;href=javascript:alert(1)>test</a>");
+    string output = restrict_html("<a href=/arnie?form=redirect&url=http://some"
+                                  "thing.com;href=javascript:alert(1)>test</a>");
     BOOST_CHECK_EQUAL(string("test"), output);
 }
 
 BOOST_AUTO_TEST_CASE(InvalidTag)
 {
-    string output = restrict_html("<STYLE TYPE=\"text/javascript\">alert('XSS');</STYLE>");
+    string output = restrict_html("<STYLE TYPE=\"text/javascript\">alert('XSS')"
+                                  ";</STYLE>");
     BOOST_CHECK_EQUAL(string("alert(&#x27;XSS&#x27;);"), output);
 }
 
 BOOST_AUTO_TEST_CASE(InvalidTag2)
 {
-    string output = restrict_html("<IMG SRC=javascript:alert(String.fromCharCode(88,83,83))>");
+    string output = restrict_html("<IMG SRC=javascript:alert(String.fromCharCod"
+                                  "e(88,83,83))>");
     BOOST_CHECK_EQUAL(string(""), output);
 }
 
+BOOST_AUTO_TEST_CASE(NonStripMode)
+{
+    BOOST_CHECK_THROW(restrict_html("<a href=\"http://site.com/dir\"onclick=\"e"
+                                    "vil\">test</a>", false), runtime_error);
+}
+
+BOOST_AUTO_TEST_CASE(NonStripMode2)
+{
+    BOOST_CHECK_THROW(restrict_html("<IMG SRC=javascript:alert(String.fromCharC"
+                                    "ode(88,83,83))>", false), runtime_error);
+}
+
+BOOST_AUTO_TEST_CASE(NonStripMode3)
+{
+    BOOST_CHECK_NO_THROW(restrict_html("<a href=\"http://192.168.10.10:8080/for"
+                                       "m\">test</a>", false));
+}
+
 BOOST_AUTO_TEST_CASE(DecodeStringURL)
 {
     string output = decode_url("%77%77%77%2E%67%6F%6F%67%6C%65%2E%63%6F%6D");