Handle tag validation according to its white list group
authorJuliana Rodrigueiro <juliana.rodrigueiro@intra2net.com>
Wed, 19 Sep 2018 15:03:24 +0000 (17:03 +0200)
committerJuliana Rodrigueiro <juliana.rodrigueiro@intra2net.com>
Wed, 19 Sep 2018 15:49:34 +0000 (17:49 +0200)
Make the whole validation process dependent on the group type
rather than on specific tags and their particularities.

src/restricted_html.cpp

index 0aeab2d..ee91156 100644 (file)
@@ -92,6 +92,157 @@ const pcrecpp::RE SAFE_URL("^(http(s?):\\/\\/)(([a-zA-Z0-9\\.\\-\\_]+(\\.[a-zA-"
 
 } // eo namespace <anonymous>
 
+template < typename CONT >
+bool is_contained(const string &name, const CONT &container)
+{
+    return container.find(name) != container.end();
+}
+
+class HtmlTag
+{
+private:
+    const string FullTag;
+    string Name;
+    string Attributes;
+    bool IsAllowed;
+
+    /**
+    * @brief Checks whether this tag fits in one of the three groups of
+    * white-listed tags and populates the private variables on the way.
+    *
+    * @return bool True if tag and its attributes are valid.
+    */
+    bool process_tag()
+    {
+        string name = FullTag;
+
+        if (is_end_tag())
+            name = remove_prefix(name, "</");
+        else
+            name = remove_prefix(name, "<");
+
+        // Spaces after "<" or "</" are invalid.
+        if (name.substr(0,1).find_first_of(I2n::Whitespaces) != string::npos)
+            return false;
+
+        to_lower_mod(name);
+
+        // If this tag has the self-closing notation, it must be contained in
+        // the allowed void tags list.
+        if (has_suffix(FullTag, "/>"))
+        {
+            name = remove_suffix(name, "/>");
+            trim_mod(name);
+
+            if (is_contained(name, ALLOWED_VOID))
+            {
+                Name = name;
+                return true;
+            }
+
+            return false;
+        }
+
+        name = remove_suffix(name, ">");
+        trim_mod(name);
+
+        // Try to match any allowed tag.
+        if (is_contained(name, ALLOWED_NORMAL)
+            || is_contained(name, ALLOWED_VOID)
+            || is_contained(name, ALLOWED_WITH_ATTR))
+        {
+            Name = name;
+            return true;
+        }
+
+        // It may be a tag with attributes which requires special handling.
+        string tmp_name, tmp_attr;
+        if (!pair_split(name, tmp_name, tmp_attr, ' '))
+            return false;
+
+        // But end tags should not have attributes.
+        if (is_end_tag())
+            return false;
+
+        if (!is_contained(tmp_name, ALLOWED_WITH_ATTR))
+            return false;
+
+        Name = tmp_name;
+
+        // The tag is contained in the allowed tags list, but all attributes
+        // and their content should also be allowed and valid.
+        list<string> attrs = split_string(tmp_attr, " ", true, Whitespaces);
+
+        // Map of allowed attributes of the given tag.
+        const map< string, AttributeHandler> &attrs_handlers
+            = ALLOWED_WITH_ATTR.find(tmp_name)->second;
+
+        BOOST_FOREACH(const string &attribute, attrs)
+        {
+            string attr_name = "", attr_value = "";
+            if (!pair_split(attribute, attr_name, attr_value, '='))
+                attr_name = attribute;
+
+            if (!is_contained(attr_name, attrs_handlers))
+                return false;
+
+            // Execute the AttributeHandler for the given attribute.
+            if (!attrs_handlers.find(attr_name)->second(attr_value))
+                return false;
+
+            if (!Attributes.empty())
+                Attributes += " ";
+
+            Attributes += attr_name + "=" + attr_value;
+        }
+
+        return true;
+    }
+
+public:
+    HtmlTag(const string &tag)
+        : FullTag(tag)
+        , Name("")
+        , Attributes("")
+    {
+        IsAllowed = process_tag();
+    }
+
+    bool is_allowed() const
+    {
+        return IsAllowed;
+    }
+
+    const string get_name() const
+    {
+        return Name;
+    }
+
+    bool is_end_tag() const
+    {
+        if (has_prefix(FullTag, "</"))
+            return true;
+        return false;
+    }
+
+    bool is_void() const
+    {
+        if (is_contained(Name, ALLOWED_VOID))
+            return true;
+        return false;
+    }
+
+    const string sanitized() const
+    {
+        if (is_end_tag()
+            || !is_contained(Name, ALLOWED_WITH_ATTR))
+            return to_lower(FullTag);
+
+        return "<" + Name + " " + Attributes + ">";
+    }
+
+};
+
 /**
  * @brief Verifies if a html "a" tag has a valid link and sanitize it if necessary.
  * Modify tag and add redirector prefix if link has a valid protocol.
@@ -150,26 +301,6 @@ bool link_sanitizer(string &tag, const std::string &redirect_prefix)
 }
 
 /**
- * @brief Check if tag is in a whitelist of allowed tags.
- * Does not accept a tag containing attributes.
- * Example:
- * <h1> or </p> returns true.
- * <br size="param"> returns false.
- *
- * @param tag html tag to be verified.
- * @return true if is an allowed tag, false otherwise.
- */
-bool is_tag_allowed(string tag)
-{
-    replace_all(tag, "<", "");
-    replace_all(tag, ">", "");
-    replace_all(tag, "/", "");
-    to_lower_mod(tag);
-
-    return ALLOWED_TAGS.find(tag) != ALLOWED_TAGS.end();
-}
-
-/**
 * @brief Restricts html code to a small list of allowed tags.
 * The attribute "href" from the tag "a" has its value sanitized and if it
 * contains unsafe characters, the tag is stripped.
@@ -201,55 +332,43 @@ const string restrict_html(const string &html_code_orig,
     {
         if (!s.second)
         {
-            result = result + html_entities(s.first);
+            result = result + html_entities(s.first, true);
             continue;
         }
-        if (is_tag_allowed(s.first) )
+
+        HtmlTag tag(s.first);
+
+        if (tag.is_allowed())
         {
-            // Checks if this tag (s.first) is not a closing tag.
-            if (s.first.compare(0, 2, "</") != 0)
-            {
-                result = result + s.first;
-                // The tag "<br>" does not have a closing tag, so don't push it
-                // into the expected tags vector.
-                if (to_lower(s.first) != "<br>")
-                    expected_tags.push_back("</"+s.first.substr(1));
-                continue;
-            }
             // If it is a closing tag, check if this is the expected one.
             // We can't accept an closing tag that was never opened.
-            else if (expected_tags.size() > 0 && expected_tags.back() == s.first)
+            if (tag.is_end_tag())
             {
-                // When the closing tag is indeed what we expect, pop it out
-                // from the LIFO queue.
-                result = result + s.first;
-                expected_tags.pop_back();
-                continue;
+                if (expected_tags.size() > 0 && expected_tags.back() == tag.get_name())
+                    expected_tags.pop_back();
+                else if (!strip)
+                    throw runtime_error("Invalid tag: " + s.first);
+                else
+                    continue;
+            }
+            else if (!tag.is_void())
+            {
+                expected_tags.push_back(tag.get_name());
             }
-        }
 
-        // Copy s.first to a string since it might be modified by link_sanitizer.
-        string tag = s.first;
-        if (to_lower_mod(tag).compare(0, AHREF.size(), AHREF) == 0
-            && link_sanitizer(tag, redirect_prefix))
-        {
-            result = result + tag;
-            expected_tags.push_back("</a>");
+            // Add a sanitized version of the tag to the result string.
+            result += tag.sanitized();
             continue;
         }
 
         if (!strip)
             throw runtime_error("Invalid tag: " + s.first);
-
-    }
-    if (expected_tags.size() > 0) //One or more tags were not closed.
-    {
-        BOOST_REVERSE_FOREACH(const string &s, expected_tags)
-        {
-            result = result + s;
-        }
     }
 
+    // One or more tags were not closed.
+    BOOST_REVERSE_FOREACH(const string &close_tag, expected_tags)
+        result = result + "</" + close_tag + ">";
+
     return result.c_str();
 }