From 118e216ee5f4246b6ea2492a7e91d4b374335c0b Mon Sep 17 00:00:00 2001 From: Thomas Jarosch Date: Tue, 28 Mar 2006 15:19:30 +0000 Subject: [PATCH] libi2ncommon: (tomj) smart HTML entities engine --- libi2ncommon.kdevelop | 2 +- src/stringfunc.cpp | 184 ++++++++++++++++++++++++++++++++++++------------ src/stringfunc.hxx | 8 ++- test/Makefile.am | 2 +- test/stringfunc.cpp | 102 +++++++++++++++++++++++++++ 5 files changed, 248 insertions(+), 50 deletions(-) create mode 100644 test/stringfunc.cpp diff --git a/libi2ncommon.kdevelop b/libi2ncommon.kdevelop index 4f98356..01beb7f 100644 --- a/libi2ncommon.kdevelop +++ b/libi2ncommon.kdevelop @@ -21,7 +21,7 @@ true executable - + diff --git a/src/stringfunc.cpp b/src/stringfunc.cpp index 199b277..2cf0ddc 100644 --- a/src/stringfunc.cpp +++ b/src/stringfunc.cpp @@ -22,15 +22,15 @@ using namespace std; std::string iso_to_utf8(const std::string& isostring) { string result; - + iconv_t i2utf8 = iconv_open ("UTF-8", "ISO-8859-1"); - + if (iso_to_utf8 == (iconv_t)-1) throw runtime_error("iconv can't convert from ISO-8859-1 to UTF-8"); - + size_t in_size=isostring.size(); size_t out_size=in_size*4; - + char *buf = (char *)malloc(out_size+1); if (buf == NULL) throw runtime_error("out of memory for iconv buffer"); @@ -38,29 +38,29 @@ std::string iso_to_utf8(const std::string& isostring) const char *in = isostring.c_str(); char *out = buf; iconv (i2utf8, &in, &in_size, &out, &out_size); - + buf[isostring.size()*4-out_size]=0; - + result=buf; - + free(buf); iconv_close (i2utf8); - + return result; } std::string utf8_to_iso(const std::string& utf8string) { string result; - + iconv_t utf82iso = iconv_open ("ISO-8859-1","UTF-8"); - + if (utf82iso == (iconv_t)-1) throw runtime_error("iconv can't convert from UTF-8 to ISO-8859-1"); - + size_t in_size=utf8string.size(); size_t out_size=in_size; - + char *buf = (char *)malloc(out_size+1); if (buf == NULL) throw runtime_error("out of memory for iconv buffer"); @@ -68,53 +68,29 @@ std::string utf8_to_iso(const std::string& utf8string) const char *in = utf8string.c_str(); char *out = buf; iconv (utf82iso, &in, &in_size, &out, &out_size); - + buf[utf8string.size()-out_size]=0; - + result=buf; - + free(buf); iconv_close (utf82iso); return result; } -std::string iso_to_html(const std::string& isostring, bool showerr_bug) -{ - string result = isostring; - - // TODO: This needs to be removed soon by a proper - // HTML quoted chars engine. Then we can also remove ü from i18n files. - if (!showerr_bug) { - replace_all (result, "\"", """); - replace_all (result, "&", "&"); - replace_all (result, "<", "<"); - replace_all (result, ">", ">"); - } - - replace_all (result, "ä", "ä"); - replace_all (result, "ö", "ö"); - replace_all (result, "ü", "ü"); - replace_all (result, "Ä", "Ä"); - replace_all (result, "Ö", "Ö"); - replace_all (result, "Ü", "Ü"); - replace_all (result, "ß", "ß"); - - return result; -} - std::string utf7imap_to_iso(const std::string& utf7imapstring) { string result; - + iconv_t utf7imap2iso = iconv_open ("ISO-8859-1","UTF-7-IMAP"); - + if (utf7imap2iso == (iconv_t)-1) throw runtime_error("iconv can't convert from UTF-7-IMAP to ISO-8859-1"); - + size_t in_size=utf7imapstring.size(); size_t out_size=in_size; - + char *buf = (char *)malloc(out_size+1); if (buf == NULL) throw runtime_error("out of memory for iconv buffer"); @@ -122,17 +98,133 @@ std::string utf7imap_to_iso(const std::string& utf7imapstring) const char *in = utf7imapstring.c_str(); char *out = buf; iconv (utf7imap2iso, &in, &in_size, &out, &out_size); - + buf[utf7imapstring.size()-out_size]=0; - + result=buf; - + free(buf); iconv_close (utf7imap2iso); return result; } +// DEPRECATED, WILL BE REMOVED TOMORROW! +std::string iso_to_html(const std::string& isostring, bool showerr_bug) +{ + string result = isostring; + + // TODO: This needs to be removed soon by a proper + // HTML quoted chars engine. Then we can also remove ü from i18n files. + if (!showerr_bug) { + replace_all (result, "&", "&"); + replace_all (result, "\"", """); + replace_all (result, "<", "<"); + replace_all (result, ">", ">"); + } + + replace_all (result, utf8_to_iso("ä"), "ä"); + replace_all (result, utf8_to_iso("ö"), "ö"); + replace_all (result, utf8_to_iso("ü"), "ü"); + replace_all (result, utf8_to_iso("Ä"), "Ä"); + replace_all (result, utf8_to_iso("Ö"), "Ö"); + replace_all (result, utf8_to_iso("Ü"), "Ü"); + replace_all (result, utf8_to_iso("ß"), "ß"); + + return result; +} + +// Tokenize string by (html) tags +void tokenize_by_tag(vector > &tokenized, const std::string &input) +{ + string::size_type pos, len = input.size(); + bool inside_tag = false; + string current; + + for (pos = 0; pos < len; pos++) { + if (input[pos] == '<') { + inside_tag = true; + + if (!current.empty()) { + tokenized.push_back(make_pair(current, false)); + current = ""; + } + + current += input[pos]; + } else if (input[pos] == '>' && inside_tag) { + current += input[pos]; + inside_tag = false; + if (!current.empty()) { + tokenized.push_back(make_pair(current, true)); + current = ""; + } + } else + current += input[pos]; + } + + // String left over in buffer? + if (!current.empty()) + tokenized.push_back(make_pair(current, false)); +} + +std::string strip_html_tags(const std::string &input) +{ + // Pair first: string, second: isTag + vector > tokenized; + tokenize_by_tag(tokenized, input); + + string output; + vector >::const_iterator token, tokens_end = tokenized.end(); + for (token = tokenized.begin(); token != tokens_end; token++) + if (!token->second) + output += token->first; + + return output; +} + +// Smart-encode HTML en +string smart_html_entities(const std::string &input) +{ + // Pair first: string, second: isTag + vector > tokenized; + tokenize_by_tag(tokenized, input); + + string output; + vector >::const_iterator token, tokens_end = tokenized.end(); + for (token = tokenized.begin(); token != tokens_end; token++) { + // keep HTML tags as they are + if (token->second) + output += token->first; + else + output += html_entities(token->first); + } + + return output; +} + +// encoded UTF-8 chars into HTML entities +string html_entities(std::string str) +{ + // Normal chars + replace_all (str, "&", "&"); + replace_all (str, "\"", """); + replace_all (str, "<", "<"); + replace_all (str, ">", ">"); + + // Umlauts + replace_all (str, "ä", "ä"); + replace_all (str, "ö", "ö"); + replace_all (str, "ü", "ü"); + replace_all (str, "Ä", "Ä"); + replace_all (str, "Ö", "Ö"); + replace_all (str, "Ü", "Ü"); + + // Misc + replace_all (str, "ß", "ß"); + + return str; +} + bool replace_all(string &base, const char *ist, const char *soll) { string i=ist; diff --git a/src/stringfunc.hxx b/src/stringfunc.hxx index d7c4c10..0fda2d1 100644 --- a/src/stringfunc.hxx +++ b/src/stringfunc.hxx @@ -22,10 +22,14 @@ bool replace_all(std::string &base, const std::string &ist, const std::string &s std::string iso_to_utf8(const std::string& isostring); std::string utf8_to_iso(const std::string& utf8string); -std::string iso_to_html(const std::string& isostring, bool showerr_bug=false); - std::string utf7imap_to_iso(const std::string &utf7imapstring); +std::string iso_to_html(const std::string& isostring, bool showerr_bug); // DEPRECATED! + +std::string strip_html_tags(const std::string &input); +std::string smart_html_entities(const std::string &input); +std::string html_entities(std::string str); + std::string escape(const std::string &s); std::string descape(const std::string &s, int startpos, int &endpos); diff --git a/test/Makefile.am b/test/Makefile.am index e939b4d..4aa118b 100644 --- a/test/Makefile.am +++ b/test/Makefile.am @@ -1,7 +1,7 @@ INCLUDES = -I$(top_srcdir)/src @CPPUNIT_CFLAGS@ METASOURCES = AUTO check_PROGRAMS = test -test_SOURCES = ip_range.cpp test.cpp +test_SOURCES = stringfunc.cpp ip_range.cpp test.cpp test_LDADD = $(top_builddir)/src/libi2ncommon.la @CPPUNIT_LIBS@ TESTS = test diff --git a/test/stringfunc.cpp b/test/stringfunc.cpp new file mode 100644 index 0000000..6759959 --- /dev/null +++ b/test/stringfunc.cpp @@ -0,0 +1,102 @@ +/*************************************************************************** + * Copyright (C) 2006 by Intra2net AG * + * info@intra2net.com * + * * + ***************************************************************************/ + +// #include +#include +// #include +// #include + +#include +#include +#include + +#include + +using namespace std; +using namespace CppUnit; + +class stringfunc : public TestFixture +{ + CPPUNIT_TEST_SUITE(stringfunc); + + CPPUNIT_TEST(smart_html_entites1); + CPPUNIT_TEST(smart_html_entites2); + CPPUNIT_TEST(smart_html_entites3); + CPPUNIT_TEST(smart_html_entites4); + CPPUNIT_TEST(smart_html_entites5); + CPPUNIT_TEST(smart_html_entites6); + CPPUNIT_TEST(smart_html_entites7); + CPPUNIT_TEST(strip_html_tags1); + CPPUNIT_TEST(strip_html_tags2); + + CPPUNIT_TEST_SUITE_END(); + + public: + void smart_html_entites1() + { + string output = smart_html_entities("Test"); + + CPPUNIT_ASSERT_EQUAL(string("Test"), output); + } + + void smart_html_entites2() + { + string output = smart_html_entities("Täst"); + + CPPUNIT_ASSERT_EQUAL(string("Täst"), output); + } + + void smart_html_entites3() + { + string output = smart_html_entities("<>"); + + CPPUNIT_ASSERT_EQUAL(string("<>"), output); + } + + void smart_html_entites4() + { + string output = smart_html_entities("<ümlaut>"); + + CPPUNIT_ASSERT_EQUAL(string("<ümlaut>"), output); + } + + void smart_html_entites5() + { + string output = smart_html_entities("Test<ümlaut>Blä"); + + CPPUNIT_ASSERT_EQUAL(string("Test<ümlaut>Blä"), output); + } + + void smart_html_entites6() + { + string output = smart_html_entities("System > Einstellungen"); + + CPPUNIT_ASSERT_EQUAL(string("System > Einstellungen"), output); + } + + void smart_html_entites7() + { + string output = smart_html_entities("Finden Sie auf der Seite \"System > Einstellungen\". Oder etwa nicht?"); + + CPPUNIT_ASSERT_EQUAL(string("Finden Sie auf der Seite "System > Einstellungen". Oder etwa nicht?"), output); + } + + void strip_html_tags1() + { + string output = strip_html_tags("Was für ein schöner Tag, finden Sie nicht?"); + + CPPUNIT_ASSERT_EQUAL(string("Was für ein schöner Tag, finden Sie nicht?"), output); + } + + void strip_html_tags2() + { + string output = strip_html_tags("Was für ein schöner Tag, finden Sie nicht?"); + + CPPUNIT_ASSERT_EQUAL(string("Was für ein schöner Tag, finden Sie nicht?"), output); + } +}; + +CPPUNIT_TEST_SUITE_REGISTRATION(stringfunc); -- 1.7.1