From d9766f9d74cbff2b6b39439c28e97371f1c14e21 Mon Sep 17 00:00:00 2001 From: Juliana Rodrigueiro Date: Wed, 8 Aug 2018 14:38:35 +0200 Subject: [PATCH] Use regex to identify urls Fix typos and adapt tests as well. --- CMakeLists.txt | 4 +++ src/CMakeLists.txt | 3 +- src/restricted_html.cpp | 43 +++++++++++++++------------------------- test/test_restricted_html.cpp | 41 ++++++++++++++++++--------------------- 4 files changed, 41 insertions(+), 50 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 18d53b6..017244a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -152,6 +152,10 @@ pkg_check_modules(OPENSSL REQUIRED openssl) INCLUDE_DIRECTORIES(${OPENSSL_INCLUDE_DIRS}) LINK_DIRECTORIES(${OPENSSL_LIBRARY_DIRS}) +pkg_check_modules(PCRECPP REQUIRED libpcrecpp) +include_directories(${PCRECPP_INCLUDE_DIRS}) +link_directories(${PCRECPP_LIBRARY_DIRS}) + # pkgconfig output set(prefix ${CMAKE_INSTALL_PREFIX}) set(exec_prefix ${CMAKE_INSTALL_PREFIX}/bin) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 2ef9dc0..292fd26 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -64,7 +64,8 @@ target_link_libraries(i2ncommon ${Boost_IOSTREAMS_LIBRARIES} ${Boost_THREAD_LIBRARIES} ${ICONV_LIBRARIES} - ${OPENSSL_LIBRARIES}) + ${OPENSSL_LIBRARIES} + ${PCRECPP_LIBRARIES}) set_target_properties(i2ncommon PROPERTIES VERSION ${VERSION} SOVERSION 7) set_target_properties(i2ncommon PROPERTIES OUTPUT_NAME i2ncommon CLEAN_DIRECT_OUTPUT 1) diff --git a/src/restricted_html.cpp b/src/restricted_html.cpp index 428edca..705d927 100644 --- a/src/restricted_html.cpp +++ b/src/restricted_html.cpp @@ -29,6 +29,7 @@ on this file might be covered by the GNU General Public License. #include #include +#include #include #include @@ -38,40 +39,23 @@ using namespace std; typedef pair TOKEN; -const vector ALLOWED_PROTOCOLS = {"http://", "https://"}; const vector ALLOWED_TAGS = {"h1", "h2", "h3", "h4", "h5", "h6", "a", "p", "br", "i", "ul", "li", "table", "tr", "th", "td" }; const string AHREF = " * returns * @@ -82,15 +66,15 @@ bool link_sanitizer(string &tag) { // tag = string link = tag.substr(AHREF.size()); - if (link.find_first_of("\"\'") == 0) + if (link.find_first_of("\"") == 0) { - size_t pos = link.find_first_of("\"\'", 1); + size_t pos = link.find_first_of("\"", 1); if (pos == string::npos) return false; // Quotation mark never closes. string end(link, pos+1); if (end.compare(" >") != 0 && end.compare(">") != 0) - return false; //Probably extra attributes. + return false; //Probably extra attributes. Or " inside the link (invalid). link = link.substr(1, pos -1); } @@ -103,14 +87,19 @@ bool link_sanitizer(string &tag) link = link.substr(0, space); } - if (is_protocol_allowed(link)) - tag = AHREF + "\"" + REDIRECT_PREFIX + link + "\" " + TARGET_BLANK + ">"; - else if (link[0] != '/') + if (has_prefix(link, REDIRECT_PREFIX)) + link = remove_prefix(link, REDIRECT_PREFIX); + + link = decode_url(link); + + if (!SAFE_URL.FullMatch(link)) return false; - if (link.find_first_not_of(SAFE_URL_CHARS) != string::npos) + if (link.find("javascript:") != string::npos) return false; + tag = AHREF + "\"" + REDIRECT_PREFIX + link + "\" " + TARGET_BLANK + ">"; + return true; } diff --git a/test/test_restricted_html.cpp b/test/test_restricted_html.cpp index c209b63..b32e403 100644 --- a/test/test_restricted_html.cpp +++ b/test/test_restricted_html.cpp @@ -32,11 +32,7 @@ using namespace std; using namespace I2n; BOOST_AUTO_TEST_SUITE(test_restricted_html) -/** - * TODO Create more tests for: - * html comments removed - * Test the transformation from non asccii to html_entities - */ + BOOST_AUTO_TEST_CASE(BasicTest) { string output = restrict_html("

Table

Month Savings
January $100

Paragraph with a Acceptable Link.

  • Coffee
  • Tea
  • Milk
"); @@ -73,45 +69,45 @@ BOOST_AUTO_TEST_CASE(NestedScript4) BOOST_CHECK_EQUAL(string("<scri<script>pt>alert(1)"), output); } -BOOST_AUTO_TEST_CASE(AhrefLink) +BOOST_AUTO_TEST_CASE(ExtraAttribute) { - string output = restrict_html("test"); + string output = restrict_html("test"); BOOST_CHECK_EQUAL(string("test"), output); } -BOOST_AUTO_TEST_CASE(AhrefLink2) -{ - string output = restrict_html("test"); - BOOST_CHECK_EQUAL(string("test"), output); -} - -BOOST_AUTO_TEST_CASE(AhrefLink3) +BOOST_AUTO_TEST_CASE(ExtraAttribute2) { string output = restrict_html("test"); BOOST_CHECK_EQUAL(string("test"), output); } -BOOST_AUTO_TEST_CASE(AhrefLink4) +BOOST_AUTO_TEST_CASE(ExtraAttribute3) { string output = restrict_html("test"); BOOST_CHECK_EQUAL(string("test"), output); } -BOOST_AUTO_TEST_CASE(AhrefLink5) +BOOST_AUTO_TEST_CASE(AhrefLink) { - string output = restrict_html("\" Test Me"); - BOOST_CHECK_EQUAL(string("" Test Me"), output); + string output = restrict_html("test"); + BOOST_CHECK_EQUAL(string("test"), output); +} + +BOOST_AUTO_TEST_CASE(AhrefLink2) +{ + string output = restrict_html("test"); + BOOST_CHECK_EQUAL(string("test"), output); } BOOST_AUTO_TEST_CASE(AhrefProtocol) { - string output = restrict_html("foo"); - BOOST_CHECK_EQUAL(string("foo"), output); + string output = restrict_html("foo"); + BOOST_CHECK_EQUAL(string("foo"), output); } BOOST_AUTO_TEST_CASE(AhrefWrongProtocol) { - string output = restrict_html("foo"); + string output = restrict_html("foo"); BOOST_CHECK_EQUAL(string("foo"), output); } @@ -135,7 +131,7 @@ BOOST_AUTO_TEST_CASE(UnsafeURLChars) BOOST_AUTO_TEST_CASE(UnsafeURLChars2) { - string output = restrict_html(" Test Me!!"); + string output = restrict_html(" Test Me!!"); BOOST_CHECK_EQUAL(string(" Test Me!!"), output); } @@ -164,6 +160,7 @@ BOOST_AUTO_TEST_CASE(EncodeStringURL2) string output = encode_url("http://www.google.com/