libi2ncommon: (tomj) fix typo
[libi2ncommon] / src / stringfunc.cpp
CommitLineData
e93545dd
GE
1/***************************************************************************
2 escape.cpp - escaping of strings
3 -------------------
4 begin : Sun Nov 14 1999
5 copyright : (C) 1999 by Intra2net AG
6 email : info@intra2net.com
7 ***************************************************************************/
8
9#include <iostream>
10#include <string>
11#include <sstream>
12#include <stdexcept>
13
a5f3af6e 14#include <wchar.h>
e93545dd
GE
15#include <stdlib.h>
16#include <iconv.h>
17#include <i18n.h>
18
19#include <stringfunc.hxx>
20
21using namespace std;
22
/**
 * Convert an ISO-8859-1 (Latin-1) encoded string to UTF-8 via iconv.
 *
 * @param isostring Input bytes, interpreted as ISO-8859-1.
 * @return The UTF-8 encoded text. If iconv() stops before the end of the
 *         input, whatever was converted up to that point is returned.
 * @throw std::runtime_error if iconv cannot provide the conversion or the
 *        output buffer cannot be allocated.
 */
std::string iso_to_utf8(const std::string& isostring)
{
    iconv_t i2utf8 = iconv_open("UTF-8", "ISO-8859-1");

    // Check the descriptor returned by iconv_open(); testing anything else
    // (e.g. this function's own name) would never notice a failed open.
    if (i2utf8 == (iconv_t)-1)
        throw std::runtime_error("iconv can't convert from ISO-8859-1 to UTF-8");

    size_t in_size = isostring.size();
    const size_t buf_size = in_size * 4;
    size_t out_size = buf_size;

    // +1 keeps the request non-zero for empty input (malloc(0) may return NULL)
    char *buf = (char *)malloc(buf_size + 1);
    if (buf == NULL)
    {
        iconv_close(i2utf8);    // do not leak the conversion descriptor
        throw std::runtime_error("out of memory for iconv buffer");
    }

    // iconv() declares its input pointer as `char **` on some platforms and
    // as `const char **` on others; this adapter converts to whichever the
    // local prototype asks for.
    struct InputPtr
    {
        const char **ptr;
        operator const char **() const { return ptr; }
        operator char **() const { return const_cast<char **>(ptr); }
    };

    const char *in = isostring.data();
    const InputPtr in_arg = { &in };
    char *out = buf;

    // Return value deliberately ignored: a partial conversion is still returned.
    iconv(i2utf8, in_arg, &in_size, &out, &out_size);

    // iconv() reduced out_size by the number of bytes it wrote. Copy by
    // length so NUL bytes in the converted text do not cut the result short.
    std::string result(buf, buf_size - out_size);

    free(buf);
    iconv_close(i2utf8);

    return result;
}
52
/**
 * Convert a UTF-8 encoded string to ISO-8859-1 (Latin-1) via iconv.
 *
 * @param utf8string Input bytes, interpreted as UTF-8.
 * @return The ISO-8859-1 encoded text. If iconv() stops early (for example
 *         at a character it cannot convert), the part converted so far is
 *         returned.
 * @throw std::runtime_error if iconv cannot provide the conversion or the
 *        output buffer cannot be allocated.
 */
std::string utf8_to_iso(const std::string& utf8string)
{
    iconv_t utf82iso = iconv_open("ISO-8859-1", "UTF-8");

    if (utf82iso == (iconv_t)-1)
        throw std::runtime_error("iconv can't convert from UTF-8 to ISO-8859-1");

    // The Latin-1 form never needs more bytes than the UTF-8 form.
    size_t in_size = utf8string.size();
    const size_t buf_size = in_size;
    size_t out_size = buf_size;

    // +1 keeps the request non-zero for empty input (malloc(0) may return NULL)
    char *buf = (char *)malloc(buf_size + 1);
    if (buf == NULL)
    {
        iconv_close(utf82iso);  // do not leak the conversion descriptor
        throw std::runtime_error("out of memory for iconv buffer");
    }

    // iconv() declares its input pointer as `char **` on some platforms and
    // as `const char **` on others; this adapter converts to whichever the
    // local prototype asks for.
    struct InputPtr
    {
        const char **ptr;
        operator const char **() const { return ptr; }
        operator char **() const { return const_cast<char **>(ptr); }
    };

    const char *in = utf8string.data();
    const InputPtr in_arg = { &in };
    char *out = buf;

    // Return value deliberately ignored: a partial conversion is still returned.
    iconv(utf82iso, in_arg, &in_size, &out, &out_size);

    // iconv() reduced out_size by the number of bytes it wrote. Copy by
    // length so NUL bytes in the converted text do not cut the result short.
    std::string result(buf, buf_size - out_size);

    free(buf);
    iconv_close(utf82iso);

    return result;
}
82
a5f3af6e
GE
/**
 * Convert a UTF-8 string into a freshly allocated, zero-terminated buffer
 * of wide characters.
 *
 * The buffer is obtained with malloc(); the caller owns it and must release
 * it with free().
 *
 * NOTE(review): iconv is asked for "UCS-4LE" and the bytes are stored
 * directly in wchar_t elements, which presumes a 4-byte little-endian
 * wchar_t -- confirm this holds on every target platform.
 *
 * @param utf8string Input bytes, interpreted as UTF-8.
 * @return Pointer to the zero-terminated wide-character buffer.
 * @throw std::runtime_error if iconv cannot provide the conversion, memory
 *        is exhausted, or the input cannot be converted. Nothing is leaked
 *        in any of these cases.
 */
wchar_t* utf8_to_wbuf(const std::string& utf8string)
{
    iconv_t utf82wstr = iconv_open("UCS-4LE", "UTF-8");

    if (utf82wstr == (iconv_t)-1)
        throw std::runtime_error("iconv can't convert from UTF-8 to UCS-4");

    // Worst case is one wide character per input byte; one additional
    // element is reserved for the terminating zero.
    size_t in_size = utf8string.size();
    const size_t payload_bytes = in_size * sizeof(wchar_t);
    size_t out_size = payload_bytes;

    wchar_t *buf = (wchar_t *)malloc(payload_bytes + sizeof(wchar_t));
    if (buf == NULL)
    {
        iconv_close(utf82wstr); // do not leak the conversion descriptor
        throw std::runtime_error("out of memory for iconv buffer");
    }

    // iconv() declares its input pointer as `char **` on some platforms and
    // as `const char **` on others; this adapter converts to whichever the
    // local prototype asks for.
    struct InputPtr
    {
        const char **ptr;
        operator const char **() const { return ptr; }
        operator char **() const { return const_cast<char **>(ptr); }
    };

    const char *in = utf8string.data();
    const InputPtr in_arg = { &in };
    char *out = (char *)buf;

    if (iconv(utf82wstr, in_arg, &in_size, &out, &out_size) == (size_t)-1)
    {
        // release everything acquired so far before reporting the error
        free(buf);
        iconv_close(utf82wstr);
        throw std::runtime_error("error converting char encodings");
    }

    // iconv() reduced out_size by the bytes written; terminate right after them
    buf[(payload_bytes - out_size) / sizeof(wchar_t)] = 0;

    iconv_close(utf82wstr);

    return buf;
}
108
/**
 * Convert a string from the "UTF-7-IMAP" encoding to UTF-8 via iconv.
 *
 * @param utf7imapstring Input bytes, interpreted as UTF-7-IMAP.
 * @return The UTF-8 encoded text. If iconv() stops before the end of the
 *         input, whatever was converted up to that point is returned.
 * @throw std::runtime_error if the local iconv has no UTF-7-IMAP to UTF-8
 *        conversion or the output buffer cannot be allocated.
 */
std::string utf7imap_to_utf8(const std::string& utf7imapstring)
{
    iconv_t utf7imap2utf8 = iconv_open("UTF-8", "UTF-7-IMAP");

    if (utf7imap2utf8 == (iconv_t)-1)
        throw std::runtime_error("iconv can't convert from UTF-7-IMAP to UTF-8");

    size_t in_size = utf7imapstring.size();
    const size_t buf_size = in_size * 4;
    size_t out_size = buf_size;

    // +1 keeps the request non-zero for empty input (malloc(0) may return NULL)
    char *buf = (char *)malloc(buf_size + 1);
    if (buf == NULL)
    {
        iconv_close(utf7imap2utf8); // do not leak the conversion descriptor
        throw std::runtime_error("out of memory for iconv buffer");
    }

    // iconv() declares its input pointer as `char **` on some platforms and
    // as `const char **` on others; this adapter converts to whichever the
    // local prototype asks for.
    struct InputPtr
    {
        const char **ptr;
        operator const char **() const { return ptr; }
        operator char **() const { return const_cast<char **>(ptr); }
    };

    const char *in = utf7imapstring.data();
    const InputPtr in_arg = { &in };
    char *out = buf;

    // Return value deliberately ignored: a partial conversion is still returned.
    iconv(utf7imap2utf8, in_arg, &in_size, &out, &out_size);

    // iconv() reduced out_size by the number of bytes it wrote. Copy by
    // length so NUL bytes in the converted text do not cut the result short.
    std::string result(buf, buf_size - out_size);

    free(buf);
    iconv_close(utf7imap2utf8);

    return result;
}
138
6a2b6dd1
TJ
/**
 * Convert a UTF-8 string to the "UTF-7-IMAP" encoding via iconv.
 *
 * @param utf8string Input bytes, interpreted as UTF-8.
 * @return The UTF-7-IMAP encoded text. If iconv() stops before the end of
 *         the input, or the output buffer turns out too small, the part
 *         produced so far is returned.
 * @throw std::runtime_error if the local iconv has no UTF-8 to UTF-7-IMAP
 *        conversion or the output buffer cannot be allocated.
 */
std::string utf8_to_utf7imap(const std::string& utf8string)
{
    iconv_t utf82utf7imap = iconv_open("UTF-7-IMAP", "UTF-8");

    if (utf82utf7imap == (iconv_t)-1)
        throw std::runtime_error("iconv can't convert from UTF-8 to UTF-7-IMAP");

    // UTF-7 is base64 encoded, a buffer 10x as large as the UTF-8 input
    // should be enough. If not, the result is truncated.
    size_t in_size = utf8string.size();
    const size_t buf_size = in_size * 10;
    size_t out_size = buf_size;

    // +1 keeps the request non-zero for empty input (malloc(0) may return NULL)
    char *buf = (char *)malloc(buf_size + 1);
    if (buf == NULL)
    {
        iconv_close(utf82utf7imap); // do not leak the conversion descriptor
        throw std::runtime_error("out of memory for iconv buffer");
    }

    // iconv() declares its input pointer as `char **` on some platforms and
    // as `const char **` on others; this adapter converts to whichever the
    // local prototype asks for.
    struct InputPtr
    {
        const char **ptr;
        operator const char **() const { return ptr; }
        operator char **() const { return const_cast<char **>(ptr); }
    };

    const char *in = utf8string.data();
    const InputPtr in_arg = { &in };
    char *out = buf;

    if (iconv(utf82utf7imap, in_arg, &in_size, &out, &out_size) != (size_t)-1)
    {
        // UTF-7 is a stateful encoding: a call without input makes iconv
        // write the pending shift sequence, so text ending in a non-ASCII
        // character is terminated properly.
        iconv(utf82utf7imap, NULL, NULL, &out, &out_size);
    }

    // iconv() reduced out_size by the number of bytes it wrote.
    std::string result(buf, buf_size - out_size);

    free(buf);
    iconv_close(utf82utf7imap);

    return result;
}
170
118e216e
TJ
/**
 * Split a string into alternating text and (HTML) tag tokens.
 *
 * Tokens are appended to @a tokenized in input order; existing entries are
 * kept. The bool of each pair is true for a complete tag ("<...>") and
 * false for everything else, including a tag that is opened but never
 * closed.
 *
 * @param tokenized Output list of (token, is_tag) pairs.
 * @param input     String to split.
 */
void tokenize_by_tag(std::vector<std::pair<std::string,bool> > &tokenized, const std::string &input)
{
    std::string pending;
    bool in_tag = false;

    for (std::string::const_iterator ch = input.begin(); ch != input.end(); ++ch)
    {
        const char c = *ch;

        if (c == '<')
        {
            // whatever was collected up to here is finished as non-tag token
            if (!pending.empty())
            {
                tokenized.push_back(std::make_pair(pending, false));
                pending.clear();
            }

            in_tag = true;
            pending += c;
            continue;
        }

        pending += c;

        // a '>' only closes a token when a tag was opened before
        if (c == '>' && in_tag)
        {
            in_tag = false;
            tokenized.push_back(std::make_pair(pending, true));
            pending.clear();
        }
    }

    // trailing text (or an unclosed tag) is reported as non-tag token
    if (!pending.empty())
        tokenized.push_back(std::make_pair(pending, false));
}
203
204std::string strip_html_tags(const std::string &input)
205{
206 // Pair first: string, second: isTag
207 vector<pair<string,bool> > tokenized;
208 tokenize_by_tag(tokenized, input);
209
210 string output;
211 vector<pair<string,bool> >::const_iterator token, tokens_end = tokenized.end();
212 for (token = tokenized.begin(); token != tokens_end; token++)
213 if (!token->second)
214 output += token->first;
215
216 return output;
217}
218
219// Smart-encode HTML en
220string smart_html_entities(const std::string &input)
221{
222 // Pair first: string, second: isTag
223 vector<pair<string,bool> > tokenized;
224 tokenize_by_tag(tokenized, input);
225
226 string output;
227 vector<pair<string,bool> >::const_iterator token, tokens_end = tokenized.end();
228 for (token = tokenized.begin(); token != tokens_end; token++) {
229 // keep HTML tags as they are
230 if (token->second)
231 output += token->first;
232 else
233 output += html_entities(token->first);
234 }
235
236 return output;
237}
238
a5f3af6e
GE
/**
 * Find the first byte with the high bit set (value above 127).
 *
 * @param str String to scan.
 * @return Index of the first such byte, or std::string::npos if every byte
 *         is plain 7-bit.
 */
std::string::size_type find_8bit(const std::string &str)
{
    std::string::size_type index = 0;

    for (std::string::const_iterator it = str.begin(); it != str.end(); ++it, ++index)
    {
        // bit 7 is set exactly for the byte values 128..255
        if ((*it & 0x80) != 0)
            return index;
    }

    return std::string::npos;
}
248
118e216e
TJ
249// encoded UTF-8 chars into HTML entities
250string html_entities(std::string str)
251{
252 // Normal chars
253 replace_all (str, "&", "&amp;");
254 replace_all (str, "\"", "&quot;");
255 replace_all (str, "<", "&lt;");
256 replace_all (str, ">", "&gt;");
257
258 // Umlauts
a5f3af6e
GE
259 replace_all (str, "\xC3\xA4", "&auml;");
260 replace_all (str, "\xC3\xB6", "&ouml;");
261 replace_all (str, "\xC3\xBC", "&uuml;");
262 replace_all (str, "\xC3\x84", "&Auml;");
263 replace_all (str, "\xC3\x96", "&Ouml;");
264 replace_all (str, "\xC3\x9C", "&Uuml;");
118e216e
TJ
265
266 // Misc
a5f3af6e
GE
267 replace_all (str, "\xC3\x9F", "&szlig;");
268
269 // conversion of remaining non-ASCII chars needed?
270 // just do if needed because of performance
271 if (find_8bit(str) != string::npos)
272 {
273 // convert to fixed-size encoding UTF-32
274 wchar_t* wbuf=utf8_to_wbuf(str);
275 ostringstream target;
276
277 // replace all non-ASCII chars with HTML representation
278 for (int p=0; wbuf[p] != 0; p++)
279 {
280 unsigned int c=wbuf[p];
281
282 if (c <= 127)
283 target << static_cast<unsigned char>(c);
284 else
285 target << "&#" << c << ';';
286 }
287
288 free(wbuf);
289
290 str=target.str();
291 }
118e216e
TJ
292
293 return str;
294}
295
e93545dd
GE
/**
 * Replace every occurrence of @a ist in @a base with @a soll.
 *
 * The search continues behind each inserted replacement, so text brought in
 * by @a soll is never matched again.
 *
 * @param base String that is modified in place.
 * @param ist  Text to search for; must not be empty.
 * @param soll Text to insert instead.
 * @return true if at least one occurrence was replaced.
 * @throw std::runtime_error if @a ist is empty.
 */
bool replace_all(std::string &base, const std::string &ist, const std::string &soll)
{
    if (ist.empty())
        throw std::runtime_error("replace_all called with empty search string");

    bool replaced = false;

    for (std::string::size_type hit = base.find(ist);
         hit != std::string::npos;
         hit = base.find(ist, hit + soll.size()))
    {
        base.replace(hit, ist.size(), soll);
        replaced = true;
    }

    return replaced;
}

// Convenience overload: search and replacement text given as std::string pointers.
bool replace_all(std::string &base, const std::string *ist, const std::string *soll)
{
    return replace_all(base, *ist, *soll);
}

// Convenience overload: search and replacement text given as C strings.
bool replace_all(std::string &base, const char *ist, const char *soll)
{
    return replace_all(base, std::string(ist), std::string(soll));
}

// Convenience overload: replacement text given as C string.
bool replace_all(std::string &base, const std::string &ist, const char *soll)
{
    return replace_all(base, ist, std::string(soll));
}

// Convenience overload: search text as C string, replacement as std::string pointer.
bool replace_all(std::string &base, const char *ist, const std::string *soll)
{
    return replace_all(base, std::string(ist), *soll);
}
337
/**
 * Return a copy of a string with every character converted by tolower().
 *
 * @param src String to convert.
 * @return Lower-case copy; characters tolower() does not map are unchanged.
 */
std::string to_lower(const std::string &src)
{
    std::string dst = src;

    const std::string::size_type end = dst.size();
    for (std::string::size_type pos = 0; pos < end; pos++)
    {
        // tolower() is only defined for EOF and unsigned char values, so a
        // (possibly negative) plain char must not be passed directly.
        dst[pos] = static_cast<char>(tolower(static_cast<unsigned char>(dst[pos])));
    }

    return dst;
}
348
/**
 * Return a copy of a string with every character converted by toupper().
 *
 * @param src String to convert.
 * @return Upper-case copy; characters toupper() does not map are unchanged.
 */
std::string to_upper(const std::string &src)
{
    std::string dst = src;

    const std::string::size_type end = dst.size();
    for (std::string::size_type pos = 0; pos < end; pos++)
    {
        // toupper() is only defined for EOF and unsigned char values, so a
        // (possibly negative) plain char must not be passed directly.
        dst[pos] = static_cast<char>(toupper(static_cast<unsigned char>(dst[pos])));
    }

    return dst;
}
359
360string nice_unit_format (int input) {
361 float size = input;
362 int sizecount = 0;
363
364 while (size > 1000) {
365 size = size / 1000;
366 sizecount++;
367 }
368
369 float tmp; // round
370 tmp = size*10;
371 tmp += 0.5;
372 tmp = int (tmp);
373 tmp = float(tmp)/float(10);
374 size = tmp;
375
376 ostringstream out;
377
378 out.setf (ios::fixed);
379 out.precision(2);
380 switch (sizecount) {
381 case 1:
382 out << size << i18n(" KBytes");
383 break;
384 case 2:
385 out << size << i18n(" MBytes");
386 break;
387 case 3:
388 out << size << i18n(" Gbytes");
389 break;
390 default:
391 out << size << i18n(" Bytes");
392 break;
393 }
394
395 return out.str();
396}
397
47c07fba
GE
/**
 * Turn a string into its quoted, escaped form.
 *
 * Double quotes and backslashes get a backslash in front, carriage return
 * and line feed are written as the two-character sequences \r and \n, and
 * the whole result is wrapped in double quotes.
 *
 * @param s String to escape.
 * @return The escaped string including the surrounding quotes.
 */
std::string escape(const std::string &s)
{
    std::string quoted;
    quoted.reserve(s.size() + 2);

    quoted += '"';

    for (std::string::size_type i = 0; i < s.size(); ++i)
    {
        const char c = s[i];

        switch (c)
        {
            case '"':
            case '\\':
                quoted += '\\';
                quoted += c;
                break;
            case '\r':
                quoted += "\\r";
                break;
            case '\n':
                quoted += "\\n";
                break;
            default:
                quoted += c;
        }
    }

    quoted += '"';

    return quoted;
}
428
/**
 * Parse a quoted, escaped string (as produced by escape()) out of @a s.
 *
 * @param s        Text containing the escaped string.
 * @param startpos Index of the opening double quote within @a s.
 * @param endpos   Receives startpos + (offset of the closing quote behind
 *                 the opening quote) + 1, i.e. the index of the closing
 *                 quote in @a s when one was found.
 * @return The unescaped content between the quotes. If no closing quote is
 *         found, everything behind the opening quote is used.
 * @throw std::out_of_range if @a startpos is outside of @a s, if the
 *        character at @a startpos is not a double quote, or if the content
 *        ends in a lone backslash.
 */
std::string descape(const std::string &s, int startpos, int &endpos)
{
    if (s.at(startpos) != '"')
        throw std::out_of_range("value not type escaped string");

    std::string body = s.substr(startpos + 1);

    // Locate the closing quote: the first '"' with an even number of
    // backslashes directly in front of it (an odd number escapes the quote).
    std::string::size_type quote = 0;
    while ((quote = body.find('"', quote)) != std::string::npos)
    {
        std::string::size_type backslashes = 0;
        while (backslashes < quote && body[quote - 1 - backslashes] == '\\')
            ++backslashes;

        if (backslashes % 2 == 0)
            break;

        ++quote;
    }

    // cut off the closing quote and everything behind it
    if (quote != std::string::npos)
        body.erase(quote);

    // tell the caller where the string ended
    endpos = startpos + quote + 1;

    // resolve the backslash sequences
    std::string plain;
    for (std::string::size_type i = 0; i < body.size(); ++i)
    {
        if (body[i] != '\\')
        {
            plain += body[i];
            continue;
        }

        // at() throws when the backslash is the very last character
        const char code = body.at(i + 1);
        ++i;

        if (code == 'r')
            plain += '\r';
        else if (code == 'n')
            plain += '\n';
        else
            plain += code;      // any other character stands for itself
    }

    return plain;
}
e93545dd 488
47c07fba
GE
/**
 * Quote a string so it can be used as a single shell argument.
 *
 * The text is wrapped in single quotes. Every single quote inside it is
 * written as '\'' (close the quoting, an escaped quote, reopen the quoting).
 *
 * @param input Text to quote.
 * @return The quoted argument.
 */
std::string escape_shellarg(const std::string &input)
{
    std::string quoted(1, '\'');

    for (std::string::size_type i = 0; i < input.size(); ++i)
    {
        const char c = input[i];

        if (c == '\'')
            quoted.append("'\\''");
        else
            quoted.push_back(c);
    }

    quoted.push_back('\'');
    return quoted;
}