Commit | Line | Data |
---|---|---|
e93545dd GE |
1 | /*************************************************************************** |
2 | escape.cpp - escaping of strings | |
3 | ------------------- | |
4 | begin : Sun Nov 14 1999 | |
5 | copyright : (C) 1999 by Intra2net AG | |
6 | email : info@intra2net.com | |
7 | ***************************************************************************/ | |
8 | ||
9 | #include <iostream> | |
10 | #include <string> | |
11 | #include <sstream> | |
12 | #include <stdexcept> | |
13 | ||
a5f3af6e | 14 | #include <wchar.h> |
e93545dd GE |
15 | #include <stdlib.h> |
16 | #include <iconv.h> | |
17 | #include <i18n.h> | |
18 | ||
19 | #include <stringfunc.hxx> | |
20 | ||
21 | using namespace std; | |
22 | ||
// Convert an ISO-8859-1 encoded string to UTF-8 using iconv.
//
// isostring: bytes to convert, interpreted as ISO-8859-1.
// Returns everything iconv managed to convert (the return value of
// iconv() itself is not inspected). Embedded NUL bytes are preserved.
// Throws std::runtime_error if the conversion descriptor cannot be opened
// or the output buffer cannot be allocated.
std::string iso_to_utf8(const std::string& isostring)
{
    // Four output bytes are reserved per input byte; the extra byte keeps
    // the buffer non-empty for empty input. Allocating before opening the
    // descriptor means an allocation failure cannot leak the descriptor.
    const size_t capacity = isostring.size() * 4;
    std::vector<char> buf;
    try
    {
        buf.resize(capacity + 1);
    }
    catch (const std::exception &)
    {
        throw std::runtime_error("out of memory for iconv buffer");
    }

    iconv_t i2utf8 = iconv_open("UTF-8", "ISO-8859-1");

    // Check the descriptor itself (not the name of this function)
    if (i2utf8 == (iconv_t)-1)
        throw std::runtime_error("iconv can't convert from ISO-8859-1 to UTF-8");

    // POSIX declares the iconv() input as char**; the input is only read.
    char *in = const_cast<char *>(isostring.data());
    size_t in_left = isostring.size();
    char *out = &buf[0];
    size_t out_left = capacity;

    iconv(i2utf8, &in, &in_left, &out, &out_left);
    iconv_close(i2utf8);

    // out_left is what remains unused, so the difference is the output length
    return std::string(&buf[0], capacity - out_left);
}
52 | ||
// Convert a UTF-8 encoded string to ISO-8859-1 using iconv.
//
// utf8string: bytes to convert, interpreted as UTF-8.
// Returns everything iconv managed to convert; the return value of iconv()
// is not inspected, so a failed conversion yields the part converted so far.
// Embedded NUL bytes are preserved.
// Throws std::runtime_error if the conversion descriptor cannot be opened
// or the output buffer cannot be allocated.
std::string utf8_to_iso(const std::string& utf8string)
{
    // One output byte is reserved per input byte; the extra byte keeps the
    // buffer non-empty for empty input. Allocating before opening the
    // descriptor means an allocation failure cannot leak the descriptor.
    const size_t capacity = utf8string.size();
    std::vector<char> buf;
    try
    {
        buf.resize(capacity + 1);
    }
    catch (const std::exception &)
    {
        throw std::runtime_error("out of memory for iconv buffer");
    }

    iconv_t utf82iso = iconv_open("ISO-8859-1", "UTF-8");

    if (utf82iso == (iconv_t)-1)
        throw std::runtime_error("iconv can't convert from UTF-8 to ISO-8859-1");

    // POSIX declares the iconv() input as char**; the input is only read.
    char *in = const_cast<char *>(utf8string.data());
    size_t in_left = utf8string.size();
    char *out = &buf[0];
    size_t out_left = capacity;

    iconv(utf82iso, &in, &in_left, &out, &out_left);
    iconv_close(utf82iso);

    // out_left is what remains unused, so the difference is the output length
    return std::string(&buf[0], capacity - out_left);
}
82 | ||
a5f3af6e GE |
// Convert a UTF-8 string into a zero-terminated wchar_t buffer via iconv
// ("UTF-8" -> "UCS-4LE").
//
// Returns a buffer obtained from malloc(); the caller has to free() it.
// Throws std::runtime_error if the descriptor cannot be opened, the buffer
// cannot be allocated, or iconv reports a conversion error. Nothing is
// leaked on any of these paths.
//
// NOTE(review): the target charset is fixed to UCS-4LE while the buffer is
// addressed as wchar_t; this presumably expects a 4-byte little-endian
// wchar_t - confirm for other platforms.
wchar_t* utf8_to_wbuf(const std::string& utf8string)
{
    iconv_t utf82wstr = iconv_open("UCS-4LE", "UTF-8");

    if (utf82wstr == (iconv_t)-1)
        throw std::runtime_error("iconv can't convert from UTF-8 to UCS-4");

    // One wchar_t per input byte plus one for the terminator
    const size_t capacity = (utf8string.size() + 1) * sizeof(wchar_t);

    wchar_t *buf = static_cast<wchar_t *>(malloc(capacity));
    if (buf == NULL)
    {
        iconv_close(utf82wstr);
        throw std::runtime_error("out of memory for iconv buffer");
    }

    // POSIX declares the iconv() input as char**; the input is only read.
    char *in = const_cast<char *>(utf8string.data());
    size_t in_left = utf8string.size();
    char *out = reinterpret_cast<char *>(buf);
    size_t out_left = capacity;

    const size_t rc = iconv(utf82wstr, &in, &in_left, &out, &out_left);

    // The descriptor is no longer needed whether or not the call succeeded
    iconv_close(utf82wstr);

    if (rc == (size_t)-1)
    {
        free(buf);
        throw std::runtime_error("error converting char encodings");
    }

    // Terminate directly behind the last converted character
    buf[(capacity - out_left) / sizeof(wchar_t)] = 0;

    return buf;
}
108 | ||
// Convert a string from the "UTF-7-IMAP" charset to UTF-8 using iconv.
//
// utf7imapstring: bytes to convert.
// Returns everything iconv managed to convert; the return value of iconv()
// is not inspected, so a failed conversion yields the part converted so far.
// Throws std::runtime_error if the conversion descriptor cannot be opened
// or the output buffer cannot be allocated.
//
// NOTE(review): whether "UTF-7-IMAP" is known depends on the iconv
// implementation in use - confirm for the target platform.
std::string utf7imap_to_utf8(const std::string& utf7imapstring)
{
    // Four output bytes are reserved per input byte; the extra byte keeps
    // the buffer non-empty for empty input. Allocating before opening the
    // descriptor means an allocation failure cannot leak the descriptor.
    const size_t capacity = utf7imapstring.size() * 4;
    std::vector<char> buf;
    try
    {
        buf.resize(capacity + 1);
    }
    catch (const std::exception &)
    {
        throw std::runtime_error("out of memory for iconv buffer");
    }

    iconv_t utf7imap2utf8 = iconv_open("UTF-8", "UTF-7-IMAP");

    if (utf7imap2utf8 == (iconv_t)-1)
        throw std::runtime_error("iconv can't convert from UTF-7-IMAP to UTF-8");

    // POSIX declares the iconv() input as char**; the input is only read.
    char *in = const_cast<char *>(utf7imapstring.data());
    size_t in_left = utf7imapstring.size();
    char *out = &buf[0];
    size_t out_left = capacity;

    iconv(utf7imap2utf8, &in, &in_left, &out, &out_left);
    iconv_close(utf7imap2utf8);

    // out_left is what remains unused, so the difference is the output length
    return std::string(&buf[0], capacity - out_left);
}
138 | ||
// Split a string into text and (html) tag tokens.
// Every token is appended to 'tokenized' as a pair: first is the token
// text, second is true when the token is a complete "<...>" tag.
void tokenize_by_tag(std::vector<std::pair<std::string,bool> > &tokenized, const std::string &input)
{
    std::string pending;
    bool in_tag = false;

    for (std::string::const_iterator ch = input.begin(); ch != input.end(); ++ch) {
        if (*ch == '<') {
            // A '<' always starts a new token; whatever was collected so
            // far is emitted as plain text
            if (!pending.empty()) {
                tokenized.push_back(std::make_pair(pending, false));
                pending.clear();
            }

            in_tag = true;
            pending += *ch;
            continue;
        }

        pending += *ch;

        // A '>' only closes a token when a tag is actually open
        if (*ch == '>' && in_tag) {
            in_tag = false;
            tokenized.push_back(std::make_pair(pending, true));
            pending.clear();
        }
    }

    // Trailing text (or an unterminated tag) is emitted as plain text
    if (!pending.empty())
        tokenized.push_back(std::make_pair(pending, false));
}
171 | ||
172 | std::string strip_html_tags(const std::string &input) | |
173 | { | |
174 | // Pair first: string, second: isTag | |
175 | vector<pair<string,bool> > tokenized; | |
176 | tokenize_by_tag(tokenized, input); | |
177 | ||
178 | string output; | |
179 | vector<pair<string,bool> >::const_iterator token, tokens_end = tokenized.end(); | |
180 | for (token = tokenized.begin(); token != tokens_end; token++) | |
181 | if (!token->second) | |
182 | output += token->first; | |
183 | ||
184 | return output; | |
185 | } | |
186 | ||
187 | // Smart-encode HTML en | |
188 | string smart_html_entities(const std::string &input) | |
189 | { | |
190 | // Pair first: string, second: isTag | |
191 | vector<pair<string,bool> > tokenized; | |
192 | tokenize_by_tag(tokenized, input); | |
193 | ||
194 | string output; | |
195 | vector<pair<string,bool> >::const_iterator token, tokens_end = tokenized.end(); | |
196 | for (token = tokenized.begin(); token != tokens_end; token++) { | |
197 | // keep HTML tags as they are | |
198 | if (token->second) | |
199 | output += token->first; | |
200 | else | |
201 | output += html_entities(token->first); | |
202 | } | |
203 | ||
204 | return output; | |
205 | } | |
206 | ||
a5f3af6e GE |
// Return the position of the first byte with a value above 127,
// or std::string::npos if the string contains no such byte.
std::string::size_type find_8bit(const std::string &str)
{
    std::string::size_type index = 0;

    for (std::string::const_iterator it = str.begin(); it != str.end(); ++it, ++index) {
        // a set high bit is the same as a value above 127
        if ((static_cast<unsigned char>(*it) & 0x80) != 0)
            return index;
    }

    return std::string::npos;
}
216 | ||
118e216e TJ |
217 | // encoded UTF-8 chars into HTML entities |
218 | string html_entities(std::string str) | |
219 | { | |
220 | // Normal chars | |
221 | replace_all (str, "&", "&"); | |
222 | replace_all (str, "\"", """); | |
223 | replace_all (str, "<", "<"); | |
224 | replace_all (str, ">", ">"); | |
225 | ||
226 | // Umlauts | |
a5f3af6e GE |
227 | replace_all (str, "\xC3\xA4", "ä"); |
228 | replace_all (str, "\xC3\xB6", "ö"); | |
229 | replace_all (str, "\xC3\xBC", "ü"); | |
230 | replace_all (str, "\xC3\x84", "Ä"); | |
231 | replace_all (str, "\xC3\x96", "Ö"); | |
232 | replace_all (str, "\xC3\x9C", "Ü"); | |
118e216e TJ |
233 | |
234 | // Misc | |
a5f3af6e GE |
235 | replace_all (str, "\xC3\x9F", "ß"); |
236 | ||
237 | // conversion of remaining non-ASCII chars needed? | |
238 | // just do if needed because of performance | |
239 | if (find_8bit(str) != string::npos) | |
240 | { | |
241 | // convert to fixed-size encoding UTF-32 | |
242 | wchar_t* wbuf=utf8_to_wbuf(str); | |
243 | ostringstream target; | |
244 | ||
245 | // replace all non-ASCII chars with HTML representation | |
246 | for (int p=0; wbuf[p] != 0; p++) | |
247 | { | |
248 | unsigned int c=wbuf[p]; | |
249 | ||
250 | if (c <= 127) | |
251 | target << static_cast<unsigned char>(c); | |
252 | else | |
253 | target << "&#" << c << ';'; | |
254 | } | |
255 | ||
256 | free(wbuf); | |
257 | ||
258 | str=target.str(); | |
259 | } | |
118e216e TJ |
260 | |
261 | return str; | |
262 | } | |
263 | ||
e93545dd GE |
264 | bool replace_all(string &base, const char *ist, const char *soll) |
265 | { | |
266 | string i=ist; | |
267 | string s=soll; | |
268 | return replace_all(base,&i,&s); | |
269 | } | |
270 | ||
271 | bool replace_all(string &base, const string &ist, const char *soll) | |
272 | { | |
273 | string s=soll; | |
274 | return replace_all(base,&ist,&s); | |
275 | } | |
276 | ||
277 | bool replace_all(string &base, const string *ist, const string *soll) | |
278 | { | |
279 | return replace_all(base,*ist,*soll); | |
280 | } | |
281 | ||
282 | bool replace_all(string &base, const char *ist, const string *soll) | |
283 | { | |
284 | string i=ist; | |
285 | return replace_all(base,&i,soll); | |
286 | } | |
287 | ||
// Replace every occurrence of 'ist' in 'base' with 'soll'.
// The search continues behind each inserted replacement, so text that is
// part of a replacement is never matched again.
// Returns true if at least one occurrence was replaced.
// Throws std::runtime_error if 'ist' is empty.
bool replace_all(std::string &base, const std::string &ist, const std::string &soll)
{
    if (ist.empty())
        throw std::runtime_error("replace_all called with empty search string");

    bool replaced_any = false;

    for (std::string::size_type hit = base.find(ist, 0);
         hit != std::string::npos;
         hit = base.find(ist, hit + soll.size()))
    {
        base.replace(hit, ist.size(), soll);
        replaced_any = true;
    }

    return replaced_any;
}
305 | ||
// Return a copy of 'src' with every byte passed through tolower().
std::string to_lower(const std::string &src)
{
    std::string dst = src;

    std::string::size_type pos, end = dst.size();
    for (pos = 0; pos < end; pos++)
    {
        // tolower() requires a value representable as unsigned char (or
        // EOF); a plain char above 127 may be negative, so convert first.
        dst[pos] = static_cast<char>(tolower(static_cast<unsigned char>(dst[pos])));
    }

    return dst;
}
316 | ||
// Return a copy of 'src' with every byte passed through toupper().
std::string to_upper(const std::string &src)
{
    std::string dst = src;

    std::string::size_type pos, end = dst.size();
    for (pos = 0; pos < end; pos++)
    {
        // toupper() requires a value representable as unsigned char (or
        // EOF); a plain char above 127 may be negative, so convert first.
        dst[pos] = static_cast<char>(toupper(static_cast<unsigned char>(dst[pos])));
    }

    return dst;
}
327 | ||
328 | string nice_unit_format (int input) { | |
329 | float size = input; | |
330 | int sizecount = 0; | |
331 | ||
332 | while (size > 1000) { | |
333 | size = size / 1000; | |
334 | sizecount++; | |
335 | } | |
336 | ||
337 | float tmp; // round | |
338 | tmp = size*10; | |
339 | tmp += 0.5; | |
340 | tmp = int (tmp); | |
341 | tmp = float(tmp)/float(10); | |
342 | size = tmp; | |
343 | ||
344 | ostringstream out; | |
345 | ||
346 | out.setf (ios::fixed); | |
347 | out.precision(2); | |
348 | switch (sizecount) { | |
349 | case 1: | |
350 | out << size << i18n(" KBytes"); | |
351 | break; | |
352 | case 2: | |
353 | out << size << i18n(" MBytes"); | |
354 | break; | |
355 | case 3: | |
356 | out << size << i18n(" Gbytes"); | |
357 | break; | |
358 | default: | |
359 | out << size << i18n(" Bytes"); | |
360 | break; | |
361 | } | |
362 | ||
363 | return out.str(); | |
364 | } | |
365 | ||
47c07fba GE |
// Return 's' as a double-quoted, escaped string:
// '"' and '\' get a backslash prepended, carriage return and newline are
// written as the two-character sequences \r and \n, and the result is
// enclosed in double quotes.
std::string escape(const std::string &s)
{
    std::string out;
    out += '"';

    for (std::string::const_iterator ch = s.begin(); ch != s.end(); ++ch)
    {
        switch (*ch)
        {
            case '"':
            case '\\':
                out += '\\';
                out += *ch;
                break;
            case '\r':
                out += "\\r";
                break;
            case '\n':
                out += "\\n";
                break;
            default:
                out += *ch;
        }
    }

    out += '"';

    return out;
}
396 | ||
// Decode a double-quoted, escaped string that starts at s[startpos].
//
// s:        text containing the quoted string
// startpos: index of the opening double quote
// endpos:   set to startpos + offset of the closing quote + 1, where the
//           offset is counted from the character after the opening quote
// Returns the text between the quotes with \r and \n turned into carriage
// return / newline and the backslash removed from every other sequence.
// Throws std::out_of_range if s[startpos] is not a double quote (or the
// position is outside of s).
std::string descape(const std::string &s, int startpos, int &endpos)
{
    if (s.at(startpos) != '"')
        throw std::out_of_range("value not type escaped string");

    std::string out = s.substr(startpos+1);

    // search for the end of the string
    std::string::size_type quote = 0;
    while ((quote = out.find("\"", quote)) != std::string::npos)
    {
        // the " is escaped when an odd number of backslashes precedes it
        std::string::size_type backslashes = 0;
        while (backslashes < quote && out[quote - 1 - backslashes] == '\\')
            ++backslashes;

        if (backslashes % 2 == 0)
            break;

        ++quote;
    }

    // we now have the end of the string
    out = out.substr(0, quote);

    // tell calling prog about the endposition
    endpos = startpos + quote + 1;

    // descape all \ stuff inside the string now
    for (std::string::size_type bs = out.find_first_of("\\", 0);
         bs != std::string::npos;
         bs = out.find_first_of("\\", bs + 1))
    {
        const char next = out.at(bs + 1);

        if (next == 'r')
            out.replace(bs, 2, "\r");
        else if (next == 'n')
            out.replace(bs, 2, "\n");
        else
            out.erase(bs, 1);   // keep the escaped char, drop the backslash
    }

    return out;
}
e93545dd | 456 | |
47c07fba GE |
// Wrap 'input' in single quotes. Every single quote inside the input is
// written as '\'' (close the quoting, an escaped quote, reopen the quoting).
std::string escape_shellarg(const std::string &input)
{
    std::string output(1, '\'');

    for (std::string::size_type i = 0; i < input.size(); ++i) {
        const char c = input[i];

        // the quote character itself is appended below and reopens the quoting
        if (c == '\'')
            output.append("'\\'");

        output.push_back(c);
    }

    output.push_back('\'');
    return output;
}