Commit | Line | Data |
---|---|---|
e93545dd GE |
1 | /*************************************************************************** |
2 | escape.cpp - escaping of strings | |
3 | ------------------- | |
4 | begin : Sun Nov 14 1999 | |
5 | copyright : (C) 1999 by Intra2net AG | |
6 | email : info@intra2net.com | |
7 | ***************************************************************************/ | |
8 | ||
9 | #include <iostream> | |
10 | #include <string> | |
11 | #include <sstream> | |
12 | #include <stdexcept> | |
13 | ||
a5f3af6e | 14 | #include <wchar.h> |
e93545dd GE |
15 | #include <stdlib.h> |
16 | #include <iconv.h> | |
17 | #include <i18n.h> | |
18 | ||
19 | #include <stringfunc.hxx> | |
20 | ||
21 | using namespace std; | |
22 | ||
// Convert an ISO-8859-1 encoded string to UTF-8.
//
// The conversion is best effort: the return value of iconv() is not
// treated as fatal, so whatever was converted before an error is returned.
//
// @param isostring  input bytes in ISO-8859-1
// @return           UTF-8 encoded copy of the input
// @throws std::runtime_error if iconv does not offer the conversion or the
//         work buffer cannot be allocated
std::string iso_to_utf8(const std::string& isostring)
{
    iconv_t i2utf8 = iconv_open ("UTF-8", "ISO-8859-1");

    // Check the descriptor itself (not the address of this function)
    // against the iconv_open() error value.
    if (i2utf8 == (iconv_t)-1)
        throw std::runtime_error("iconv can't convert from ISO-8859-1 to UTF-8");

    size_t in_size = isostring.size();
    const size_t buf_size = in_size*4;
    size_t out_size = buf_size;

    char *buf = (char *)malloc(buf_size+1);
    if (buf == NULL)
    {
        // don't leak the conversion descriptor on the error path
        iconv_close (i2utf8);
        throw std::runtime_error("out of memory for iconv buffer");
    }

    // POSIX iconv() takes a non-const input pointer but does not modify the input
    char *in = const_cast<char *>(isostring.c_str());
    char *out = buf;
    iconv (i2utf8, &in, &in_size, &out, &out_size);
    iconv_close (i2utf8);

    // iconv() decrements out_size by the number of bytes written; copy exactly
    // that many so embedded NUL bytes don't cut the result short.
    std::string result(buf, buf_size-out_size);
    free(buf);

    return result;
}
52 | ||
// Convert a UTF-8 encoded string to ISO-8859-1.
//
// The conversion is best effort: the return value of iconv() is not
// treated as fatal, so whatever was converted before an error is returned.
//
// @param utf8string  input bytes in UTF-8
// @return            ISO-8859-1 encoded copy of the input
// @throws std::runtime_error if iconv does not offer the conversion or the
//         work buffer cannot be allocated
std::string utf8_to_iso(const std::string& utf8string)
{
    iconv_t utf82iso = iconv_open ("ISO-8859-1","UTF-8");

    if (utf82iso == (iconv_t)-1)
        throw std::runtime_error("iconv can't convert from UTF-8 to ISO-8859-1");

    size_t in_size = utf8string.size();
    // the output buffer is sized like the input
    const size_t buf_size = in_size;
    size_t out_size = buf_size;

    char *buf = (char *)malloc(buf_size+1);
    if (buf == NULL)
    {
        // don't leak the conversion descriptor on the error path
        iconv_close (utf82iso);
        throw std::runtime_error("out of memory for iconv buffer");
    }

    // POSIX iconv() takes a non-const input pointer but does not modify the input
    char *in = const_cast<char *>(utf8string.c_str());
    char *out = buf;
    iconv (utf82iso, &in, &in_size, &out, &out_size);
    iconv_close (utf82iso);

    // copy exactly the number of bytes iconv() produced
    std::string result(buf, buf_size-out_size);
    free(buf);

    return result;
}
82 | ||
a5f3af6e GE |
// Convert a UTF-8 string into a zero-terminated buffer of wide characters.
//
// The target encoding requested from iconv is "UCS-4LE" and the result is
// accessed as wchar_t, so this relies on wchar_t being a 4-byte
// little-endian type on the platform.
//
// @param utf8string  input bytes in UTF-8
// @return            buffer allocated with malloc(); the caller must free() it
// @throws std::runtime_error if iconv does not offer the conversion, the
//         buffer cannot be allocated or the input cannot be converted
wchar_t* utf8_to_wbuf(const std::string& utf8string)
{
    iconv_t utf82wstr = iconv_open ("UCS-4LE","UTF-8");

    if (utf82wstr == (iconv_t)-1)
        throw std::runtime_error("iconv can't convert from UTF-8 to UCS-4");

    size_t in_size = utf8string.size();
    // one output character per input byte at most, plus the terminator
    const size_t buf_size = (in_size+1)*sizeof(wchar_t);
    size_t out_size = buf_size;

    wchar_t *buf = (wchar_t *)malloc(buf_size);
    if (buf == NULL)
    {
        // don't leak the conversion descriptor on the error path
        iconv_close (utf82wstr);
        throw std::runtime_error("out of memory for iconv buffer");
    }

    // POSIX iconv() takes a non-const input pointer but does not modify the input
    char *in = const_cast<char *>(utf8string.c_str());
    char *out = (char*)buf;
    const size_t rc = iconv (utf82wstr, &in, &in_size, &out, &out_size);

    // the descriptor is no longer needed, whether the conversion worked or not
    iconv_close (utf82wstr);

    if (rc == (size_t)-1)
    {
        // release the buffer before reporting the failure
        free(buf);
        throw std::runtime_error("error converting char encodings");
    }

    // terminate directly behind the last converted character
    buf[(buf_size-out_size)/sizeof(wchar_t)] = 0;

    return buf;
}
108 | ||
// Convert a string from the "UTF-7-IMAP" iconv encoding to UTF-8.
//
// The conversion is best effort: the return value of iconv() is not
// treated as fatal, so whatever was converted before an error is returned.
//
// @param utf7imapstring  input in UTF-7-IMAP
// @return                UTF-8 encoded copy of the input
// @throws std::runtime_error if iconv does not offer the conversion or the
//         work buffer cannot be allocated
std::string utf7imap_to_utf8(const std::string& utf7imapstring)
{
    iconv_t utf7imap2utf8 = iconv_open ("UTF-8","UTF-7-IMAP");

    if (utf7imap2utf8 == (iconv_t)-1)
        throw std::runtime_error("iconv can't convert from UTF-7-IMAP to UTF-8");

    size_t in_size = utf7imapstring.size();
    const size_t buf_size = in_size*4;
    size_t out_size = buf_size;

    char *buf = (char *)malloc(buf_size+1);
    if (buf == NULL)
    {
        // don't leak the conversion descriptor on the error path
        iconv_close (utf7imap2utf8);
        throw std::runtime_error("out of memory for iconv buffer");
    }

    // POSIX iconv() takes a non-const input pointer but does not modify the input
    char *in = const_cast<char *>(utf7imapstring.c_str());
    char *out = buf;
    iconv (utf7imap2utf8, &in, &in_size, &out, &out_size);
    iconv_close (utf7imap2utf8);

    // copy exactly the number of bytes iconv() produced
    std::string result(buf, buf_size-out_size);
    free(buf);

    return result;
}
138 | ||
6a2b6dd1 TJ |
// Convert a UTF-8 string to the "UTF-7-IMAP" iconv encoding.
//
// The conversion is best effort: the return value of iconv() is not
// treated as fatal, so whatever was converted before an error is returned.
//
// @param utf8string  input bytes in UTF-8
// @return            UTF-7-IMAP encoded copy of the input
// @throws std::runtime_error if iconv does not offer the conversion or the
//         work buffer cannot be allocated
std::string utf8_to_utf7imap(const std::string& utf8string)
{
    iconv_t utf82utf7imap = iconv_open ("UTF-7-IMAP", "UTF-8");

    if (utf82utf7imap == (iconv_t)-1)
        throw std::runtime_error("iconv can't convert from UTF-8 to UTF-7-IMAP");

    // UTF-7 is base64 encoded, a buffer 10x as large
    // as the utf-8 buffer should be enough. If not the string will be truncated.
    size_t in_size = utf8string.size();
    const size_t buf_size = in_size*10;
    size_t out_size = buf_size;

    char *buf = (char *)malloc(buf_size+1);
    if (buf == NULL)
    {
        // don't leak the conversion descriptor on the error path
        iconv_close (utf82utf7imap);
        throw std::runtime_error("out of memory for iconv buffer");
    }

    // POSIX iconv() takes a non-const input pointer but does not modify the input
    char *in = const_cast<char *>(utf8string.c_str());
    char *out = buf;
    iconv (utf82utf7imap, &in, &in_size, &out, &out_size);

    // UTF-7 is a stateful encoding: a call without input makes iconv() emit
    // the pending shift sequence, so a trailing base64 run gets terminated.
    iconv (utf82utf7imap, NULL, NULL, &out, &out_size);
    iconv_close (utf82utf7imap);

    // copy exactly the number of bytes iconv() produced
    std::string result(buf, buf_size-out_size);
    free(buf);

    return result;
}
170 | ||
118e216e TJ |
// Split a string into alternating text and (html) tag tokens.
//
// Tokens are appended to 'tokenized' as (text, is_tag) pairs. A tag token
// runs from a '<' up to and including the next '>'. Everything else,
// including an unterminated "<..." fragment, is appended with is_tag == false.
void tokenize_by_tag(std::vector<std::pair<std::string,bool> > &tokenized, const std::string &input)
{
    std::string pending;
    bool in_tag = false;

    for (std::string::const_iterator ch = input.begin(); ch != input.end(); ++ch)
    {
        const bool opens_tag = (*ch == '<');

        if (opens_tag)
        {
            in_tag = true;

            // whatever was collected before the '<' is not a complete tag
            if (!pending.empty())
            {
                tokenized.push_back(std::make_pair(pending, false));
                pending.clear();
            }
        }

        pending += *ch;

        // a '>' only closes a tag if one was opened before
        if (*ch == '>' && in_tag)
        {
            in_tag = false;
            tokenized.push_back(std::make_pair(pending, true));
            pending.clear();
        }
    }

    // flush the remainder as plain text
    if (!pending.empty())
        tokenized.push_back(std::make_pair(pending, false));
}
203 | ||
204 | std::string strip_html_tags(const std::string &input) | |
205 | { | |
206 | // Pair first: string, second: isTag | |
207 | vector<pair<string,bool> > tokenized; | |
208 | tokenize_by_tag(tokenized, input); | |
209 | ||
210 | string output; | |
211 | vector<pair<string,bool> >::const_iterator token, tokens_end = tokenized.end(); | |
212 | for (token = tokenized.begin(); token != tokens_end; token++) | |
213 | if (!token->second) | |
214 | output += token->first; | |
215 | ||
216 | return output; | |
217 | } | |
218 | ||
219 | // Smart-encode HTML en | |
220 | string smart_html_entities(const std::string &input) | |
221 | { | |
222 | // Pair first: string, second: isTag | |
223 | vector<pair<string,bool> > tokenized; | |
224 | tokenize_by_tag(tokenized, input); | |
225 | ||
226 | string output; | |
227 | vector<pair<string,bool> >::const_iterator token, tokens_end = tokenized.end(); | |
228 | for (token = tokenized.begin(); token != tokens_end; token++) { | |
229 | // keep HTML tags as they are | |
230 | if (token->second) | |
231 | output += token->first; | |
232 | else | |
233 | output += html_entities(token->first); | |
234 | } | |
235 | ||
236 | return output; | |
237 | } | |
238 | ||
a5f3af6e GE |
// Find the first byte with a value above 127 (i.e. outside 7-bit ASCII).
//
// @return index of that byte, or std::string::npos if there is none
std::string::size_type find_8bit(const std::string &str)
{
    for (std::string::const_iterator it = str.begin(); it != str.end(); ++it)
    {
        // values above 127 are exactly those with the top bit set
        if ((static_cast<unsigned char>(*it) & 0x80) != 0)
            return static_cast<std::string::size_type>(it - str.begin());
    }

    return std::string::npos;
}
248 | ||
118e216e TJ |
249 | // encoded UTF-8 chars into HTML entities |
250 | string html_entities(std::string str) | |
251 | { | |
252 | // Normal chars | |
253 | replace_all (str, "&", "&"); | |
254 | replace_all (str, "\"", """); | |
255 | replace_all (str, "<", "<"); | |
256 | replace_all (str, ">", ">"); | |
257 | ||
258 | // Umlauts | |
a5f3af6e GE |
259 | replace_all (str, "\xC3\xA4", "ä"); |
260 | replace_all (str, "\xC3\xB6", "ö"); | |
261 | replace_all (str, "\xC3\xBC", "ü"); | |
262 | replace_all (str, "\xC3\x84", "Ä"); | |
263 | replace_all (str, "\xC3\x96", "Ö"); | |
264 | replace_all (str, "\xC3\x9C", "Ü"); | |
118e216e TJ |
265 | |
266 | // Misc | |
a5f3af6e GE |
267 | replace_all (str, "\xC3\x9F", "ß"); |
268 | ||
269 | // conversion of remaining non-ASCII chars needed? | |
270 | // just do if needed because of performance | |
271 | if (find_8bit(str) != string::npos) | |
272 | { | |
273 | // convert to fixed-size encoding UTF-32 | |
274 | wchar_t* wbuf=utf8_to_wbuf(str); | |
275 | ostringstream target; | |
276 | ||
277 | // replace all non-ASCII chars with HTML representation | |
278 | for (int p=0; wbuf[p] != 0; p++) | |
279 | { | |
280 | unsigned int c=wbuf[p]; | |
281 | ||
282 | if (c <= 127) | |
283 | target << static_cast<unsigned char>(c); | |
284 | else | |
285 | target << "&#" << c << ';'; | |
286 | } | |
287 | ||
288 | free(wbuf); | |
289 | ||
290 | str=target.str(); | |
291 | } | |
118e216e TJ |
292 | |
293 | return str; | |
294 | } | |
295 | ||
e93545dd GE |
296 | bool replace_all(string &base, const char *ist, const char *soll) |
297 | { | |
298 | string i=ist; | |
299 | string s=soll; | |
300 | return replace_all(base,&i,&s); | |
301 | } | |
302 | ||
303 | bool replace_all(string &base, const string &ist, const char *soll) | |
304 | { | |
305 | string s=soll; | |
306 | return replace_all(base,&ist,&s); | |
307 | } | |
308 | ||
309 | bool replace_all(string &base, const string *ist, const string *soll) | |
310 | { | |
311 | return replace_all(base,*ist,*soll); | |
312 | } | |
313 | ||
314 | bool replace_all(string &base, const char *ist, const string *soll) | |
315 | { | |
316 | string i=ist; | |
317 | return replace_all(base,&i,soll); | |
318 | } | |
319 | ||
// Replace every occurrence of 'ist' in 'base' with 'soll'.
//
// The search continues behind each inserted replacement, so text that was
// just inserted is never matched again.
//
// @return true if at least one occurrence was replaced
// @throws std::runtime_error if 'ist' is empty
bool replace_all(std::string &base, const std::string &ist, const std::string &soll)
{
    if (ist.empty())
        throw std::runtime_error("replace_all called with empty search string");

    bool replaced_any = false;

    for (std::string::size_type hit = base.find(ist, 0);
         hit != std::string::npos;
         hit = base.find(ist, hit + soll.size()))
    {
        base.replace(hit, ist.size(), soll);
        replaced_any = true;
    }

    return replaced_any;
}
337 | ||
// Return a copy of 'src' with every byte passed through tolower().
//
// The conversion works byte by byte, so only characters that tolower()
// maps as single bytes are changed.
std::string to_lower(const std::string &src)
{
    std::string dst = src;

    std::string::size_type pos, end = dst.size();
    for (pos = 0; pos < end; pos++)
    {
        // tolower() must not be called with a negative value; go through
        // unsigned char so bytes above 127 are passed as 128..255
        dst[pos] = static_cast<char>(tolower(static_cast<unsigned char>(dst[pos])));
    }

    return dst;
}
348 | ||
// Return a copy of 'src' with every byte passed through toupper().
//
// The conversion works byte by byte, so only characters that toupper()
// maps as single bytes are changed.
std::string to_upper(const std::string &src)
{
    std::string dst = src;

    std::string::size_type pos, end = dst.size();
    for (pos = 0; pos < end; pos++)
    {
        // toupper() must not be called with a negative value; go through
        // unsigned char so bytes above 127 are passed as 128..255
        dst[pos] = static_cast<char>(toupper(static_cast<unsigned char>(dst[pos])));
    }

    return dst;
}
359 | ||
360 | string nice_unit_format (int input) { | |
361 | float size = input; | |
362 | int sizecount = 0; | |
363 | ||
364 | while (size > 1000) { | |
365 | size = size / 1000; | |
366 | sizecount++; | |
367 | } | |
368 | ||
369 | float tmp; // round | |
370 | tmp = size*10; | |
371 | tmp += 0.5; | |
372 | tmp = int (tmp); | |
373 | tmp = float(tmp)/float(10); | |
374 | size = tmp; | |
375 | ||
376 | ostringstream out; | |
377 | ||
378 | out.setf (ios::fixed); | |
379 | out.precision(2); | |
380 | switch (sizecount) { | |
381 | case 1: | |
382 | out << size << i18n(" KBytes"); | |
383 | break; | |
384 | case 2: | |
385 | out << size << i18n(" MBytes"); | |
386 | break; | |
387 | case 3: | |
388 | out << size << i18n(" Gbytes"); | |
389 | break; | |
390 | default: | |
391 | out << size << i18n(" Bytes"); | |
392 | break; | |
393 | } | |
394 | ||
395 | return out.str(); | |
396 | } | |
397 | ||
47c07fba GE |
// Turn a string into its quoted, escaped representation.
//
// A backslash is put in front of every double quote and backslash, carriage
// return and newline are written as the two characters \r and \n, and the
// whole result is wrapped in double quotes.
std::string escape(const std::string &s)
{
    std::string out;
    out += '"';

    for (std::string::const_iterator ch = s.begin(); ch != s.end(); ++ch)
    {
        switch (*ch)
        {
            case '"':
            case '\\':
                out += '\\';
                out += *ch;
                break;
            case '\r':
                out += "\\r";
                break;
            case '\n':
                out += "\\n";
                break;
            default:
                out += *ch;
                break;
        }
    }

    out += '"';

    return out;
}
428 | ||
// Decode a quoted, escaped string that starts at s[startpos].
//
// @param s         text containing the escaped string
// @param startpos  index of the opening double quote
// @param endpos    set to startpos + (offset of the closing quote) + 1
// @return          the decoded content between the quotes
// @throws std::out_of_range if s[startpos] is not a double quote (or
//         startpos is outside of s), or if a backslash is the last character
//         of the extracted content
std::string descape(const std::string &s, int startpos, int &endpos)
{
    if (s.at(startpos) != '"')
        throw std::out_of_range("value not type escaped string");

    std::string out = s.substr(startpos+1);

    // search for the end of the string: the first quote that is preceded
    // by an even number of backslashes (an odd number means it is escaped)
    std::string::size_type quote = out.find('"', 0);
    while (quote != std::string::npos)
    {
        std::string::size_type backslashes = 0;
        while (backslashes < quote && out.at(quote-1-backslashes) == '\\')
            ++backslashes;

        if (backslashes % 2 == 0)
            break;

        quote = out.find('"', quote+1);
    }

    // drop the closing quote and everything behind it
    if (quote != std::string::npos)
        out.erase(quote);

    // tell the caller about the end position
    endpos = startpos+quote+1;

    // resolve the backslash sequences inside the string
    for (std::string::size_type bs = out.find('\\', 0);
         bs != std::string::npos;
         bs = out.find('\\', bs+1))
    {
        const char code = out.at(bs+1);

        if (code == 'r')
            out.replace(bs, 2, "\r");
        else if (code == 'n')
            out.replace(bs, 2, "\n");
        else
            out.erase(bs, 1);   // keep the escaped character itself
    }

    return out;
}
e93545dd | 488 | |
47c07fba GE |
// Wrap a string in single quotes for use as a shell argument.
//
// Every single quote inside the input is written as '\'' (close the quoted
// section, emit an escaped quote, open a new quoted section).
std::string escape_shellarg(const std::string &input)
{
    std::string quoted(1, '\'');

    for (std::string::size_type idx = 0; idx < input.size(); ++idx)
    {
        const char ch = input[idx];

        // the quote character itself is appended below and reopens the quoting
        if (ch == '\'')
            quoted.append("'\\'");

        quoted.push_back(ch);
    }

    quoted.push_back('\'');
    return quoted;
}
502 | } |