libi2ncommon: (tomj) smart HTML entities engine
[libi2ncommon] / src / stringfunc.cpp
CommitLineData
e93545dd
GE
1/***************************************************************************
2 escape.cpp - escaping of strings
3 -------------------
4 begin : Sun Nov 14 1999
5 copyright : (C) 1999 by Intra2net AG
6 email : info@intra2net.com
7 ***************************************************************************/
8
9#include <iostream>
10#include <string>
11#include <sstream>
12#include <stdexcept>
13
14#include <stdlib.h>
15#include <iconv.h>
16#include <i18n.h>
17
18#include <stringfunc.hxx>
19
20using namespace std;
21
// Convert an ISO-8859-1 (Latin-1) encoded string to UTF-8 using iconv.
//
// isostring: input bytes, interpreted as ISO-8859-1.
// Returns the UTF-8 encoded bytes that iconv produced. The return value of
// iconv() itself is not inspected, so if the conversion stops early only the
// part converted so far is returned. Embedded NUL bytes are preserved.
// Throws std::runtime_error if the conversion descriptor cannot be opened
// or the work buffer cannot be allocated.
std::string iso_to_utf8(const std::string& isostring)
{
    // Releases the descriptor and the work buffer on every exit path,
    // including the exceptions thrown below.
    struct Cleanup
    {
        iconv_t handle;
        char *buffer;
        explicit Cleanup(iconv_t h) : handle(h), buffer(NULL) {}
        ~Cleanup()
        {
            free(buffer);           // free(NULL) is a no-op
            iconv_close(handle);
        }
    };

    iconv_t i2utf8 = iconv_open ("UTF-8", "ISO-8859-1");

    // Check the descriptor itself (the old code compared the function name).
    if (i2utf8 == (iconv_t)-1)
        throw std::runtime_error("iconv can't convert from ISO-8859-1 to UTF-8");

    Cleanup guard(i2utf8);

    size_t in_size = isostring.size();
    // Generous upper bound for the output: four bytes per input byte.
    const size_t capacity = in_size * 4;
    size_t out_size = capacity;

    guard.buffer = static_cast<char *>(malloc(capacity + 1));
    if (guard.buffer == NULL)
        throw std::runtime_error("out of memory for iconv buffer");

    // POSIX iconv() takes a non-const char**; the input is not modified.
    char *in = const_cast<char *>(isostring.data());
    char *out = guard.buffer;
    iconv (i2utf8, &in, &in_size, &out, &out_size);

    // out_size now holds the unused space; the difference is the number of
    // bytes written. Copy by length so embedded NUL bytes survive.
    return std::string(guard.buffer, capacity - out_size);
}
51
// Convert a UTF-8 encoded string to ISO-8859-1 (Latin-1) using iconv.
//
// utf8string: input bytes, interpreted as UTF-8.
// Returns the ISO-8859-1 bytes that iconv produced. The return value of
// iconv() itself is not inspected, so if the conversion stops early (for
// example on a sequence iconv rejects) only the part converted so far is
// returned. Embedded NUL bytes are preserved.
// Throws std::runtime_error if the conversion descriptor cannot be opened
// or the work buffer cannot be allocated.
std::string utf8_to_iso(const std::string& utf8string)
{
    // Releases the descriptor and the work buffer on every exit path,
    // including the exceptions thrown below.
    struct Cleanup
    {
        iconv_t handle;
        char *buffer;
        explicit Cleanup(iconv_t h) : handle(h), buffer(NULL) {}
        ~Cleanup()
        {
            free(buffer);           // free(NULL) is a no-op
            iconv_close(handle);
        }
    };

    iconv_t utf82iso = iconv_open ("ISO-8859-1","UTF-8");

    if (utf82iso == (iconv_t)-1)
        throw std::runtime_error("iconv can't convert from UTF-8 to ISO-8859-1");

    Cleanup guard(utf82iso);

    size_t in_size = utf8string.size();
    // The output buffer is sized to one output byte per input byte.
    const size_t capacity = in_size;
    size_t out_size = capacity;

    guard.buffer = static_cast<char *>(malloc(capacity + 1));
    if (guard.buffer == NULL)
        throw std::runtime_error("out of memory for iconv buffer");

    // POSIX iconv() takes a non-const char**; the input is not modified.
    char *in = const_cast<char *>(utf8string.data());
    char *out = guard.buffer;
    iconv (utf82iso, &in, &in_size, &out, &out_size);

    // out_size now holds the unused space; the difference is the number of
    // bytes written. Copy by length so embedded NUL bytes survive.
    return std::string(guard.buffer, capacity - out_size);
}
81
d116a071
TJ
// Convert a string from the iconv charset "UTF-7-IMAP" to ISO-8859-1.
//
// utf7imapstring: input bytes, interpreted by iconv as "UTF-7-IMAP".
// Returns the ISO-8859-1 bytes that iconv produced. The return value of
// iconv() itself is not inspected, so if the conversion stops early only the
// part converted so far is returned. Embedded NUL bytes are preserved.
// Throws std::runtime_error if the iconv implementation does not provide
// this conversion or the work buffer cannot be allocated.
std::string utf7imap_to_iso(const std::string& utf7imapstring)
{
    // Releases the descriptor and the work buffer on every exit path,
    // including the exceptions thrown below.
    struct Cleanup
    {
        iconv_t handle;
        char *buffer;
        explicit Cleanup(iconv_t h) : handle(h), buffer(NULL) {}
        ~Cleanup()
        {
            free(buffer);           // free(NULL) is a no-op
            iconv_close(handle);
        }
    };

    iconv_t utf7imap2iso = iconv_open ("ISO-8859-1","UTF-7-IMAP");

    if (utf7imap2iso == (iconv_t)-1)
        throw std::runtime_error("iconv can't convert from UTF-7-IMAP to ISO-8859-1");

    Cleanup guard(utf7imap2iso);

    size_t in_size = utf7imapstring.size();
    // The output buffer is sized to one output byte per input byte.
    const size_t capacity = in_size;
    size_t out_size = capacity;

    guard.buffer = static_cast<char *>(malloc(capacity + 1));
    if (guard.buffer == NULL)
        throw std::runtime_error("out of memory for iconv buffer");

    // POSIX iconv() takes a non-const char**; the input is not modified.
    char *in = const_cast<char *>(utf7imapstring.data());
    char *out = guard.buffer;
    iconv (utf7imap2iso, &in, &in_size, &out, &out_size);

    // out_size now holds the unused space; the difference is the number of
    // bytes written. Copy by length so embedded NUL bytes survive.
    return std::string(guard.buffer, capacity - out_size);
}
111
118e216e
TJ
112// DEPRECATED, WILL BE REMOVED TOMORROW!
113std::string iso_to_html(const std::string& isostring, bool showerr_bug)
114{
115 string result = isostring;
116
117 // TODO: This needs to be removed soon by a proper
118 // HTML quoted chars engine. Then we can also remove &uuml; from i18n files.
119 if (!showerr_bug) {
120 replace_all (result, "&", "&amp;");
121 replace_all (result, "\"", "&quot;");
122 replace_all (result, "<", "&lt;");
123 replace_all (result, ">", "&gt;");
124 }
125
126 replace_all (result, utf8_to_iso("ä"), "&auml;");
127 replace_all (result, utf8_to_iso("ö"), "&ouml;");
128 replace_all (result, utf8_to_iso("ü"), "&uuml;");
129 replace_all (result, utf8_to_iso("Ä"), "&Auml;");
130 replace_all (result, utf8_to_iso("Ö"), "&Ouml;");
131 replace_all (result, utf8_to_iso("Ü"), "&Uuml;");
132 replace_all (result, utf8_to_iso("ß"), "&szlig;");
133
134 return result;
135}
136
// Tokenize string by (html) tags.
//
// Splits input into consecutive pieces and appends them to tokenized as
// (text, is_tag) pairs. A piece running from '<' up to and including the
// next '>' is marked as a tag (true); everything else is marked as plain
// text (false). A '<' always starts a new piece, so an unfinished tag that
// is followed by another '<' or by the end of the input is emitted as
// plain text.
void tokenize_by_tag(std::vector<std::pair<std::string,bool> > &tokenized, const std::string &input)
{
    std::string pending;
    bool in_tag = false;

    for (std::string::const_iterator ch = input.begin(); ch != input.end(); ++ch) {
        const char c = *ch;

        if (c == '<') {
            // Flush whatever was collected so far as non-tag text.
            if (!pending.empty()) {
                tokenized.push_back(std::make_pair(pending, false));
                pending.clear();
            }

            in_tag = true;
            pending += c;
            continue;
        }

        pending += c;

        // A '>' only closes a piece when a '<' opened it.
        if (c == '>' && in_tag) {
            tokenized.push_back(std::make_pair(pending, true));
            pending.clear();
            in_tag = false;
        }
    }

    // String left over in buffer?
    if (!pending.empty())
        tokenized.push_back(std::make_pair(pending, false));
}
169
170std::string strip_html_tags(const std::string &input)
171{
172 // Pair first: string, second: isTag
173 vector<pair<string,bool> > tokenized;
174 tokenize_by_tag(tokenized, input);
175
176 string output;
177 vector<pair<string,bool> >::const_iterator token, tokens_end = tokenized.end();
178 for (token = tokenized.begin(); token != tokens_end; token++)
179 if (!token->second)
180 output += token->first;
181
182 return output;
183}
184
185// Smart-encode HTML en
186string smart_html_entities(const std::string &input)
187{
188 // Pair first: string, second: isTag
189 vector<pair<string,bool> > tokenized;
190 tokenize_by_tag(tokenized, input);
191
192 string output;
193 vector<pair<string,bool> >::const_iterator token, tokens_end = tokenized.end();
194 for (token = tokenized.begin(); token != tokens_end; token++) {
195 // keep HTML tags as they are
196 if (token->second)
197 output += token->first;
198 else
199 output += html_entities(token->first);
200 }
201
202 return output;
203}
204
205// encoded UTF-8 chars into HTML entities
206string html_entities(std::string str)
207{
208 // Normal chars
209 replace_all (str, "&", "&amp;");
210 replace_all (str, "\"", "&quot;");
211 replace_all (str, "<", "&lt;");
212 replace_all (str, ">", "&gt;");
213
214 // Umlauts
215 replace_all (str, "ä", "&auml;");
216 replace_all (str, "ö", "&ouml;");
217 replace_all (str, "ü", "&uuml;");
218 replace_all (str, "Ä", "&Auml;");
219 replace_all (str, "Ö", "&Ouml;");
220 replace_all (str, "Ü", "&Uuml;");
221
222 // Misc
223 replace_all (str, "ß", "&szlig;");
224
225 return str;
226}
227
e93545dd
GE
228bool replace_all(string &base, const char *ist, const char *soll)
229{
230 string i=ist;
231 string s=soll;
232 return replace_all(base,&i,&s);
233}
234
235bool replace_all(string &base, const string &ist, const char *soll)
236{
237 string s=soll;
238 return replace_all(base,&ist,&s);
239}
240
241bool replace_all(string &base, const string *ist, const string *soll)
242{
243 return replace_all(base,*ist,*soll);
244}
245
246bool replace_all(string &base, const char *ist, const string *soll)
247{
248 string i=ist;
249 return replace_all(base,&i,soll);
250}
251
// Replace every occurrence of ist inside base by soll (in place).
//
// The search resumes right behind each inserted replacement, so text that
// was just inserted is never matched again.
// Returns true when at least one replacement was made.
// Throws std::runtime_error when ist is empty.
bool replace_all(std::string &base, const std::string &ist, const std::string &soll)
{
    if (ist.empty())
        throw std::runtime_error("replace_all called with empty search string");

    bool replaced = false;

    for (std::string::size_type hit = base.find(ist, 0);
         hit != std::string::npos;
         hit = base.find(ist, hit + soll.size()))
    {
        base.replace(hit, ist.size(), soll);
        replaced = true;
    }

    return replaced;
}
269
// Return a copy of src with every byte passed through tolower().
//
// The conversion works byte by byte and depends on the current C locale;
// multi-byte encodings are not decoded.
std::string to_lower(const std::string &src)
{
    std::string dst = src;

    for (std::string::size_type pos = 0, end = dst.size(); pos < end; ++pos) {
        // tolower() requires a value representable as unsigned char (or EOF);
        // handing it a negative plain char is undefined behaviour.
        const unsigned char byte = static_cast<unsigned char>(dst[pos]);
        dst[pos] = static_cast<char>(tolower(byte));
    }

    return dst;
}
280
// Return a copy of src with every byte passed through toupper().
//
// The conversion works byte by byte and depends on the current C locale;
// multi-byte encodings are not decoded.
std::string to_upper(const std::string &src)
{
    std::string dst = src;

    for (std::string::size_type pos = 0, end = dst.size(); pos < end; ++pos) {
        // toupper() requires a value representable as unsigned char (or EOF);
        // handing it a negative plain char is undefined behaviour.
        const unsigned char byte = static_cast<unsigned char>(dst[pos]);
        dst[pos] = static_cast<char>(toupper(byte));
    }

    return dst;
}
291
292string nice_unit_format (int input) {
293 float size = input;
294 int sizecount = 0;
295
296 while (size > 1000) {
297 size = size / 1000;
298 sizecount++;
299 }
300
301 float tmp; // round
302 tmp = size*10;
303 tmp += 0.5;
304 tmp = int (tmp);
305 tmp = float(tmp)/float(10);
306 size = tmp;
307
308 ostringstream out;
309
310 out.setf (ios::fixed);
311 out.precision(2);
312 switch (sizecount) {
313 case 1:
314 out << size << i18n(" KBytes");
315 break;
316 case 2:
317 out << size << i18n(" MBytes");
318 break;
319 case 3:
320 out << size << i18n(" Gbytes");
321 break;
322 default:
323 out << size << i18n(" Bytes");
324 break;
325 }
326
327 return out.str();
328}
329
47c07fba
GE
// Turn s into a double-quoted, backslash-escaped string.
//
// Every '"' and '\' is prefixed with a backslash, carriage return becomes
// the two characters \r and line feed becomes \n. The result is wrapped in
// double quotes. descape() is the inverse operation.
std::string escape(const std::string &s)
{
    std::string quoted;
    quoted.reserve(s.size() + 2);

    quoted += '"';

    for (std::string::size_type idx = 0; idx < s.size(); ++idx) {
        const char c = s[idx];

        switch (c) {
            case '"':
            case '\\':
                quoted += '\\';
                quoted += c;
                break;
            case '\r':
                quoted += "\\r";
                break;
            case '\n':
                quoted += "\\n";
                break;
            default:
                quoted += c;
                break;
        }
    }

    quoted += '"';

    return quoted;
}
360
// Return the index of the first '"' in text that is not escaped, or
// std::string::npos when there is none. A quote counts as escaped when it
// is directly preceded by an odd number of backslashes.
static std::string::size_type descape_find_closing_quote(const std::string &text)
{
    std::string::size_type quote = 0;

    while ((quote = text.find('"', quote)) != std::string::npos)
    {
        // Count the backslashes sitting directly in front of the quote.
        std::string::size_type backslashes = 0;
        while (backslashes < quote && text[quote - 1 - backslashes] == '\\')
            ++backslashes;

        if (backslashes % 2 == 0)
            return quote;

        // escaped quote, keep looking behind it
        ++quote;
    }

    return std::string::npos;
}

// Resolve the backslash sequences in text in place: \r becomes carriage
// return, \n becomes line feed, any other escaped character stands for
// itself. A backslash at the very end makes std::string::at() throw
// std::out_of_range.
static void descape_unescape(std::string &text)
{
    std::string::size_type pos = 0;

    while ((pos = text.find('\\', pos)) != std::string::npos)
    {
        const char code = text.at(pos + 1);

        if (code == 'r')
            text.replace(pos, 2, "\r");
        else if (code == 'n')
            text.replace(pos, 2, "\n");
        else
            text.erase(pos, 1);

        // step over the character that was just produced / kept
        ++pos;
    }
}

// Decode a string written by escape().
//
// s:        text containing the escaped string.
// startpos: index of the opening double quote inside s.
// endpos:   receives the index of the closing double quote inside s
//           (computed as startpos + offset + 1; when no closing quote
//           exists the offset is npos and the sum wraps around).
// Returns the decoded content between the quotes; without a closing quote
// everything behind the opening quote is decoded.
// Throws std::out_of_range when s[startpos] is not a double quote, when
// startpos is outside of s, or when the content ends in a lone backslash.
std::string descape(const std::string &s, int startpos, int &endpos)
{
    if (s.at(startpos) != '"')
        throw std::out_of_range("value not type escaped string");

    const std::string rest = s.substr(startpos+1);

    // search for the end of the string
    const std::string::size_type closing = descape_find_closing_quote(rest);

    // tell calling prog about the endposition
    endpos=startpos+closing+1;

    // descape all \ stuff inside the string now
    std::string out = rest.substr(0, closing);
    descape_unescape(out);

    return out;
}
e93545dd 420
47c07fba
GE
// Wrap input in single quotes so it can be used as one shell argument.
//
// Each single quote inside input is written as '\'' (close the quoted
// section, emit an escaped quote, reopen the quoted section).
// Note: an empty input yields an empty string, not a pair of quotes.
std::string escape_shellarg(const std::string &input)
{
    if (input.empty())
        return "";

    std::string quoted(1, '\'');

    for (std::string::size_type idx = 0; idx < input.size(); ++idx) {
        const char c = input[idx];

        if (c == '\'')
            quoted += "'\\''";
        else
            quoted += c;
    }

    quoted += '\'';
    return quoted;
}