/**
 * Convert an ISO-8859-1 (Latin-1) encoded string to UTF-8 using iconv.
 *
 * The return value of iconv() is not checked: the function returns whatever
 * iconv wrote to the output buffer.
 *
 * @param isostring Input bytes, interpreted as ISO-8859-1.
 * @return The converted bytes; embedded NUL bytes are kept.
 * @throws std::runtime_error if iconv cannot provide the conversion or the
 *         output buffer cannot be allocated.
 */
std::string iso_to_utf8(const std::string& isostring)
{
    // POSIX declares the input argument of iconv() as `char **`, some older
    // libiconv builds as `const char **`. This wrapper converts to whichever
    // type the visible prototype asks for.
    struct iconv_input
    {
        const char **ptr;
        explicit iconv_input(const char **p) : ptr(p) {}
        operator const char **() const { return ptr; }
        operator char **() const { return const_cast<char **>(ptr); }
    };

    iconv_t i2utf8 = iconv_open ("UTF-8", "ISO-8859-1");

    // Test the descriptor itself; comparing the function name never fires.
    if (i2utf8 == (iconv_t)-1)
        throw std::runtime_error("iconv can't convert from ISO-8859-1 to UTF-8");

    size_t in_size = isostring.size();
    // Generous upper bound for the output: four bytes per input byte.
    const size_t buf_size = in_size * 4;
    size_t out_size = buf_size;

    char *buf = static_cast<char *>(malloc(buf_size + 1));
    if (buf == NULL)
    {
        // Do not leak the conversion descriptor on the error path.
        iconv_close (i2utf8);
        throw std::runtime_error("out of memory for iconv buffer");
    }

    const char *in = isostring.data();
    char *out = buf;
    iconv (i2utf8, iconv_input(&in), &in_size, &out, &out_size);
    iconv_close (i2utf8);

    // iconv decrements out_size by the number of bytes written. Copy exactly
    // that many bytes so that embedded NULs do not cut the result short.
    std::string result;
    try
    {
        result.assign(buf, buf_size - out_size);
    }
    catch (...)
    {
        free(buf);
        throw;
    }

    free(buf);
    return result;
}
/**
 * Convert a UTF-8 encoded string to ISO-8859-1 (Latin-1) using iconv.
 *
 * The return value of iconv() is not checked: the function returns whatever
 * iconv wrote to the output buffer.
 *
 * @param utf8string Input bytes, interpreted as UTF-8.
 * @return The converted bytes; embedded NUL bytes are kept.
 * @throws std::runtime_error if iconv cannot provide the conversion or the
 *         output buffer cannot be allocated.
 */
std::string utf8_to_iso(const std::string& utf8string)
{
    // POSIX declares the input argument of iconv() as `char **`, some older
    // libiconv builds as `const char **`. This wrapper converts to whichever
    // type the visible prototype asks for.
    struct iconv_input
    {
        const char **ptr;
        explicit iconv_input(const char **p) : ptr(p) {}
        operator const char **() const { return ptr; }
        operator char **() const { return const_cast<char **>(ptr); }
    };

    iconv_t utf82iso = iconv_open ("ISO-8859-1","UTF-8");

    if (utf82iso == (iconv_t)-1)
        throw std::runtime_error("iconv can't convert from UTF-8 to ISO-8859-1");

    size_t in_size = utf8string.size();
    // The output buffer is sized to the input length.
    const size_t buf_size = in_size;
    size_t out_size = buf_size;

    char *buf = static_cast<char *>(malloc(buf_size + 1));
    if (buf == NULL)
    {
        // Do not leak the conversion descriptor on the error path.
        iconv_close (utf82iso);
        throw std::runtime_error("out of memory for iconv buffer");
    }

    const char *in = utf8string.data();
    char *out = buf;
    iconv (utf82iso, iconv_input(&in), &in_size, &out, &out_size);
    iconv_close (utf82iso);

    // iconv decrements out_size by the number of bytes written. Copy exactly
    // that many bytes so that embedded NULs do not cut the result short.
    std::string result;
    try
    {
        result.assign(buf, buf_size - out_size);
    }
    catch (...)
    {
        free(buf);
        throw;
    }

    free(buf);
    return result;
}
-std::string iso_to_html(const std::string& isostring, bool showerr_bug)
-{
- string result = isostring;
-
- // TODO: This needs to be removed soon by a proper
- // HTML quoted chars engine. Then we can also remove ü from i18n files.
- if (!showerr_bug) {
- replace_all (result, "\"", """);
- replace_all (result, "&", "&");
- replace_all (result, "<", "<");
- replace_all (result, ">", ">");
- }
-
- replace_all (result, "ä", "ä");
- replace_all (result, "ö", "ö");
- replace_all (result, "ü", "ü");
- replace_all (result, "Ä", "Ä");
- replace_all (result, "Ö", "Ö");
- replace_all (result, "Ü", "Ü");
- replace_all (result, "ß", "ß");
-
- return result;
-}
-
/**
 * Convert a string from the "UTF-7-IMAP" iconv encoding to ISO-8859-1.
 *
 * The return value of iconv() is not checked: the function returns whatever
 * iconv wrote to the output buffer.
 *
 * @param utf7imapstring Input bytes in the UTF-7-IMAP encoding.
 * @return The converted bytes; embedded NUL bytes are kept.
 * @throws std::runtime_error if iconv cannot provide the conversion (not
 *         every iconv implementation knows "UTF-7-IMAP") or the output
 *         buffer cannot be allocated.
 */
std::string utf7imap_to_iso(const std::string& utf7imapstring)
{
    // POSIX declares the input argument of iconv() as `char **`, some older
    // libiconv builds as `const char **`. This wrapper converts to whichever
    // type the visible prototype asks for.
    struct iconv_input
    {
        const char **ptr;
        explicit iconv_input(const char **p) : ptr(p) {}
        operator const char **() const { return ptr; }
        operator char **() const { return const_cast<char **>(ptr); }
    };

    iconv_t utf7imap2iso = iconv_open ("ISO-8859-1","UTF-7-IMAP");

    if (utf7imap2iso == (iconv_t)-1)
        throw std::runtime_error("iconv can't convert from UTF-7-IMAP to ISO-8859-1");

    size_t in_size = utf7imapstring.size();
    // The output buffer is sized to the input length.
    const size_t buf_size = in_size;
    size_t out_size = buf_size;

    char *buf = static_cast<char *>(malloc(buf_size + 1));
    if (buf == NULL)
    {
        // Do not leak the conversion descriptor on the error path.
        iconv_close (utf7imap2iso);
        throw std::runtime_error("out of memory for iconv buffer");
    }

    const char *in = utf7imapstring.data();
    char *out = buf;
    iconv (utf7imap2iso, iconv_input(&in), &in_size, &out, &out_size);
    iconv_close (utf7imap2iso);

    // iconv decrements out_size by the number of bytes written. Copy exactly
    // that many bytes so that embedded NULs do not cut the result short.
    std::string result;
    try
    {
        result.assign(buf, buf_size - out_size);
    }
    catch (...)
    {
        free(buf);
        throw;
    }

    free(buf);
    return result;
}
// DEPRECATED, WILL BE REMOVED TOMORROW!
/**
 * Encode an ISO-8859-1 string for HTML output.
 *
 * TODO: This needs to be replaced soon by a proper HTML quoted chars engine.
 * Then we can also remove &uuml; from i18n files.
 *
 * @param isostring   Input bytes, interpreted as ISO-8859-1.
 * @param showerr_bug When true, the markup characters & " < > are passed
 *                    through unescaped and only the German special
 *                    characters are encoded.
 * @return The encoded string.
 */
std::string iso_to_html(const std::string& isostring, bool showerr_bug)
{
    std::string result;
    result.reserve(isostring.size());

    // Single pass over the input. Because every input byte is looked at
    // exactly once, an inserted '&' is never escaped a second time.
    for (std::string::size_type pos = 0; pos < isostring.size(); ++pos)
    {
        const char *entity = NULL;

        switch (static_cast<unsigned char>(isostring[pos]))
        {
            // Markup characters (skipped in showerr_bug mode)
            case '&':  if (!showerr_bug) entity = "&amp;";  break;
            case '"':  if (!showerr_bug) entity = "&quot;"; break;
            case '<':  if (!showerr_bug) entity = "&lt;";   break;
            case '>':  if (!showerr_bug) entity = "&gt;";   break;

            // German special characters, given as their ISO-8859-1 byte
            // values so the match does not depend on the source encoding.
            case 0xE4: entity = "&auml;";  break;   // a umlaut
            case 0xF6: entity = "&ouml;";  break;   // o umlaut
            case 0xFC: entity = "&uuml;";  break;   // u umlaut
            case 0xC4: entity = "&Auml;";  break;   // A umlaut
            case 0xD6: entity = "&Ouml;";  break;   // O umlaut
            case 0xDC: entity = "&Uuml;";  break;   // U umlaut
            case 0xDF: entity = "&szlig;"; break;   // sharp s
            default: break;
        }

        if (entity != NULL)
            result += entity;
        else
            result += isostring[pos];
    }

    return result;
}
+
/**
 * Tokenize a string by (HTML) tags.
 *
 * The input is split into consecutive pieces that are appended to
 * @a tokenized in order. A piece running from a '<' up to and including the
 * next '>' is marked as a tag (second == true); everything else is text
 * (second == false). A '<' that is followed by another '<' or by the end of
 * the input before any '>' starts a text piece, and a '>' outside of a tag
 * is ordinary text. Empty pieces are never emitted.
 *
 * @param tokenized Output list; pieces are appended, existing entries stay.
 * @param input     String to split.
 */
void tokenize_by_tag(std::vector<std::pair<std::string,bool> > &tokenized, const std::string &input)
{
    const std::string::size_type len = input.size();
    std::string::size_type token_start = 0;     // first char of the pending piece
    bool inside_tag = false;

    for (std::string::size_type pos = 0; pos < len; ++pos)
    {
        if (input[pos] == '<')
        {
            // Whatever is pending (text or an unfinished tag) ends here
            // and counts as text.
            if (pos > token_start)
                tokenized.push_back(std::make_pair(input.substr(token_start, pos - token_start), false));

            token_start = pos;
            inside_tag = true;
        }
        else if (input[pos] == '>' && inside_tag)
        {
            // The piece holds at least "<" and ">", so it is never empty.
            tokenized.push_back(std::make_pair(input.substr(token_start, pos - token_start + 1), true));

            token_start = pos + 1;
            inside_tag = false;
        }
    }

    // String left over? Also covers a tag that was never closed.
    if (token_start < len)
        tokenized.push_back(std::make_pair(input.substr(token_start), false));
}
+
+std::string strip_html_tags(const std::string &input)
+{
+ // Pair first: string, second: isTag
+ vector<pair<string,bool> > tokenized;
+ tokenize_by_tag(tokenized, input);
+
+ string output;
+ vector<pair<string,bool> >::const_iterator token, tokens_end = tokenized.end();
+ for (token = tokenized.begin(); token != tokens_end; token++)
+ if (!token->second)
+ output += token->first;
+
+ return output;
+}
+
+// Smart-encode HTML en
+string smart_html_entities(const std::string &input)
+{
+ // Pair first: string, second: isTag
+ vector<pair<string,bool> > tokenized;
+ tokenize_by_tag(tokenized, input);
+
+ string output;
+ vector<pair<string,bool> >::const_iterator token, tokens_end = tokenized.end();
+ for (token = tokenized.begin(); token != tokens_end; token++) {
+ // keep HTML tags as they are
+ if (token->second)
+ output += token->first;
+ else
+ output += html_entities(token->first);
+ }
+
+ return output;
+}
+
/**
 * Encode characters of a UTF-8 string as HTML entities.
 *
 * The markup characters & " < > and the German special characters
 * (a/o/u umlauts in both cases, sharp s) are replaced by their named
 * entities; every other byte is copied unchanged.
 *
 * @param str UTF-8 encoded input.
 * @return The encoded string.
 */
std::string html_entities(std::string str)
{
    std::string encoded;
    encoded.reserve(str.size());

    // Single pass over the input. Because every input byte is looked at
    // exactly once, an inserted '&' is never escaped a second time.
    for (std::string::size_type pos = 0; pos < str.size(); ++pos)
    {
        const char *entity = NULL;

        switch (static_cast<unsigned char>(str[pos]))
        {
            // Normal chars
            case '&': entity = "&amp;";  break;
            case '"': entity = "&quot;"; break;
            case '<': entity = "&lt;";   break;
            case '>': entity = "&gt;";   break;

            // All handled special characters are two-byte UTF-8 sequences
            // with the lead byte 0xC3; the second byte selects the character.
            case 0xC3:
                if (pos + 1 < str.size())
                {
                    switch (static_cast<unsigned char>(str[pos + 1]))
                    {
                        // Umlauts
                        case 0xA4: entity = "&auml;";  break;   // a umlaut
                        case 0xB6: entity = "&ouml;";  break;   // o umlaut
                        case 0xBC: entity = "&uuml;";  break;   // u umlaut
                        case 0x84: entity = "&Auml;";  break;   // A umlaut
                        case 0x96: entity = "&Ouml;";  break;   // O umlaut
                        case 0x9C: entity = "&Uuml;";  break;   // U umlaut

                        // Misc
                        case 0x9F: entity = "&szlig;"; break;   // sharp s
                        default: break;
                    }

                    // The entity replaces both bytes of the sequence.
                    if (entity != NULL)
                        ++pos;
                }
                break;

            default: break;
        }

        if (entity != NULL)
            encoded += entity;
        else
            encoded += str[pos];
    }

    return encoded;
}
+
bool replace_all(string &base, const char *ist, const char *soll)
{
string i=ist;
--- /dev/null
+/***************************************************************************
+ * Copyright (C) 2006 by Intra2net AG *
+ * info@intra2net.com *
+ * *
+ ***************************************************************************/
+
+// #include <iostream>
+#include <string>
+// #include <sstream>
+// #include <stdexcept>
+
+#include <cppunit/extensions/TestFactoryRegistry.h>
+#include <cppunit/ui/text/TestRunner.h>
+#include <cppunit/extensions/HelperMacros.h>
+
+#include <stringfunc.hxx>
+
+using namespace std;
+using namespace CppUnit;
+
+class stringfunc : public TestFixture
+{
+ CPPUNIT_TEST_SUITE(stringfunc);
+
+ CPPUNIT_TEST(smart_html_entites1);
+ CPPUNIT_TEST(smart_html_entites2);
+ CPPUNIT_TEST(smart_html_entites3);
+ CPPUNIT_TEST(smart_html_entites4);
+ CPPUNIT_TEST(smart_html_entites5);
+ CPPUNIT_TEST(smart_html_entites6);
+ CPPUNIT_TEST(smart_html_entites7);
+ CPPUNIT_TEST(strip_html_tags1);
+ CPPUNIT_TEST(strip_html_tags2);
+
+ CPPUNIT_TEST_SUITE_END();
+
+ public:
+ void smart_html_entites1()
+ {
+ string output = smart_html_entities("Test");
+
+ CPPUNIT_ASSERT_EQUAL(string("Test"), output);
+ }
+
+ void smart_html_entites2()
+ {
+ string output = smart_html_entities("Täst");
+
+ CPPUNIT_ASSERT_EQUAL(string("Täst"), output);
+ }
+
+ void smart_html_entites3()
+ {
+ string output = smart_html_entities("<>");
+
+ CPPUNIT_ASSERT_EQUAL(string("<>"), output);
+ }
+
+ void smart_html_entites4()
+ {
+ string output = smart_html_entities("<ümlaut>");
+
+ CPPUNIT_ASSERT_EQUAL(string("<ümlaut>"), output);
+ }
+
+ void smart_html_entites5()
+ {
+ string output = smart_html_entities("Test<ümlaut>Blä");
+
+ CPPUNIT_ASSERT_EQUAL(string("Test<ümlaut>Blä"), output);
+ }
+
+ void smart_html_entites6()
+ {
+ string output = smart_html_entities("System > Einstellungen");
+
+ CPPUNIT_ASSERT_EQUAL(string("System > Einstellungen"), output);
+ }
+
+ void smart_html_entites7()
+ {
+ string output = smart_html_entities("Finden Sie <b>auf</b> der Seite <a href=\"fdslfsl\">\"System > Einstellungen\"</a>. Oder etwa nicht?");
+
+ CPPUNIT_ASSERT_EQUAL(string("Finden Sie <b>auf</b> der Seite <a href=\"fdslfsl\">"System > Einstellungen"</a>. Oder etwa nicht?"), output);
+ }
+
+ void strip_html_tags1()
+ {
+ string output = strip_html_tags("Was für ein schöner Tag, finden Sie nicht?");
+
+ CPPUNIT_ASSERT_EQUAL(string("Was für ein schöner Tag, finden Sie nicht?"), output);
+ }
+
+ void strip_html_tags2()
+ {
+ string output = strip_html_tags("Was für ein <a href=\"wikipedia\" target=\"new\">schöner Tag</a>, finden Sie nicht?");
+
+ CPPUNIT_ASSERT_EQUAL(string("Was für ein schöner Tag, finden Sie nicht?"), output);
+ }
+};
+
+CPPUNIT_TEST_SUITE_REGISTRATION(stringfunc);