From 2e5ff9d444b4f606097ad6f29dc888e44f0aa546 Mon Sep 17 00:00:00 2001 From: Juliana Rodrigueiro Date: Wed, 8 Aug 2018 14:42:55 +0200 Subject: [PATCH] Identify and skip html entities in the input --- src/stringfunc.cpp | 32 +++++++++++++++++++++++++++++--- src/stringfunc.hxx | 2 +- 2 files changed, 30 insertions(+), 4 deletions(-) diff --git a/src/stringfunc.cpp b/src/stringfunc.cpp index d611abf..bd66e9b 100644 --- a/src/stringfunc.cpp +++ b/src/stringfunc.cpp @@ -42,6 +42,7 @@ on this file might be covered by the GNU General Public License. #include #include #include +#include #include @@ -1018,11 +1019,36 @@ string::size_type find_8bit(const std::string &str) return string::npos; } -// encoded UTF-8 chars into HTML entities -string html_entities(std::string str) +/** +* @brief Encoded UTF-8 chars into HTML entities. +* +* @param[in,out] str the string that will have its special characters replaced +* by HTML entities. +* @param expect_amp When true, expect to find ampersand characters and HTML +* entities, replace any additional special characters. +* @return std::string same as str. 
+*/ +string html_entities(std::string str, bool expect_amp) { + if (expect_amp) + { + const string amp = "&amp;"; + const pcrecpp::RE re_amp("^&((#([0-9]{3}|x[0-9a-fA-F]{2}))|[a-zA-Z]{2,6});"); + string::size_type pos = 0; + while ((pos = str.find("&", pos)) != string::npos) + { + if (!re_amp.PartialMatch(str.substr(pos))) + str.replace(pos, 1, amp); + + pos++; + } + } + else + { + replace_all (str, "&", "&amp;"); + } + // Normal chars - replace_all (str, "&", "&amp;"); replace_all (str, "<", "&lt;"); replace_all (str, ">", "&gt;"); replace_all (str, "\"", "&quot;"); diff --git a/src/stringfunc.hxx b/src/stringfunc.hxx index d6cd254..459d283 100644 --- a/src/stringfunc.hxx +++ b/src/stringfunc.hxx @@ -357,7 +357,7 @@ void tokenize_by_tag(std::vector > &tokenized, const std::string &input); std::string strip_html_tags(const std::string &input); std::string smart_html_entities(const std::string &input); -std::string html_entities(std::string str); +std::string html_entities(std::string str, bool expect_amp=false); std::string html_entities_to_console(std::string str); typedef std::pair CommentZone; -- 1.7.1