ALL INTRA2NET-MADE PACKAGES: (gerd) update license
[libi2ncommon] / src / stringfunc.cpp
CommitLineData
e93545dd
GE
1/***************************************************************************
 stringfunc.cpp - string conversion and escaping helpers (formerly escape.cpp)
3 -------------------
4 begin : Sun Nov 14 1999
5 copyright : (C) 1999 by Intra2net AG
6 email : info@intra2net.com
7 ***************************************************************************/
8
9#include <iostream>
10#include <string>
11#include <sstream>
12#include <stdexcept>
13
a5f3af6e 14#include <wchar.h>
e93545dd
GE
15#include <stdlib.h>
16#include <iconv.h>
17#include <i18n.h>
18
19#include <stringfunc.hxx>
20
21using namespace std;
22
// Convert an ISO-8859-1 encoded string to UTF-8 using iconv.
//
// isostring: bytes to convert, interpreted as ISO-8859-1
// returns:   the UTF-8 encoded text; embedded NUL bytes are kept
// throws:    std::runtime_error if the converter cannot be opened or the
//            output buffer cannot be allocated
//
// The return value of iconv() is deliberately not checked: if the conversion
// stops early, the output produced up to that point is returned.
std::string iso_to_utf8(const std::string& isostring)
{
    iconv_t i2utf8 = iconv_open("UTF-8", "ISO-8859-1");

    // Test the descriptor that was just opened. Comparing the function name
    // against (iconv_t)-1 can never detect a failed iconv_open().
    if (i2utf8 == (iconv_t)-1)
        throw std::runtime_error("iconv can't convert from ISO-8859-1 to UTF-8");

    size_t in_size = isostring.size();
    const size_t capacity = in_size * 4;
    size_t out_size = capacity;

    // The result string doubles as the output buffer, so nothing has to be
    // freed by hand. One spare byte keeps &result[0] valid for empty input.
    std::string result;
    try
    {
        result.resize(capacity + 1);
    }
    catch (...)
    {
        // do not leak the conversion descriptor on the error path
        iconv_close(i2utf8);
        throw std::runtime_error("out of memory for iconv buffer");
    }

    // POSIX declares the input pointer as char**; the input bytes are only read.
    char *in = const_cast<char *>(isostring.c_str());
    char *out = &result[0];
    iconv(i2utf8, &in, &in_size, &out, &out_size);
    iconv_close(i2utf8);

    // iconv() decreases out_size by the number of bytes it wrote.
    result.resize(capacity - out_size);
    return result;
}
52
// Convert a UTF-8 encoded string to ISO-8859-1 using iconv.
//
// utf8string: bytes to convert, interpreted as UTF-8
// returns:    the ISO-8859-1 encoded text; embedded NUL bytes are kept
// throws:     std::runtime_error if the converter cannot be opened or the
//             output buffer cannot be allocated
//
// The return value of iconv() is deliberately not checked: if the conversion
// stops early (for example on input it cannot convert), the output produced
// up to that point is returned.
std::string utf8_to_iso(const std::string& utf8string)
{
    iconv_t utf82iso = iconv_open("ISO-8859-1", "UTF-8");

    if (utf82iso == (iconv_t)-1)
        throw std::runtime_error("iconv can't convert from UTF-8 to ISO-8859-1");

    size_t in_size = utf8string.size();
    // the output buffer is sized like the input
    const size_t capacity = in_size;
    size_t out_size = capacity;

    // The result string doubles as the output buffer, so nothing has to be
    // freed by hand. One spare byte keeps &result[0] valid for empty input.
    std::string result;
    try
    {
        result.resize(capacity + 1);
    }
    catch (...)
    {
        // do not leak the conversion descriptor on the error path
        iconv_close(utf82iso);
        throw std::runtime_error("out of memory for iconv buffer");
    }

    // POSIX declares the input pointer as char**; the input bytes are only read.
    char *in = const_cast<char *>(utf8string.c_str());
    char *out = &result[0];
    iconv(utf82iso, &in, &in_size, &out, &out_size);
    iconv_close(utf82iso);

    // iconv() decreases out_size by the number of bytes it wrote.
    result.resize(capacity - out_size);
    return result;
}
82
a5f3af6e
GE
// Convert a UTF-8 string into a zero-terminated buffer of wide characters.
//
// utf8string: bytes to convert, interpreted as UTF-8
// returns:    a buffer obtained from malloc(); the caller must release it
//             with free()
// throws:     std::runtime_error if the converter cannot be opened, the
//             buffer cannot be allocated or iconv() reports an error
//
// NOTE(review): the target encoding is fixed to "UCS-4LE" while the result is
// read as wchar_t, which presumably assumes a 4-byte little-endian wchar_t -
// confirm for every supported platform.
wchar_t* utf8_to_wbuf(const std::string& utf8string)
{
    iconv_t utf82wstr = iconv_open("UCS-4LE", "UTF-8");

    if (utf82wstr == (iconv_t)-1)
        throw std::runtime_error("iconv can't convert from UTF-8 to UCS-4");

    size_t in_size = utf8string.size();
    // one wide character per input byte plus room for the terminator
    const size_t capacity = (in_size + 1) * sizeof(wchar_t);
    size_t out_size = capacity;

    wchar_t *buf = static_cast<wchar_t *>(malloc(capacity));
    if (buf == NULL)
    {
        // do not leak the conversion descriptor on the error path
        iconv_close(utf82wstr);
        throw std::runtime_error("out of memory for iconv buffer");
    }

    // POSIX declares the input pointer as char**; the input bytes are only read.
    char *in = const_cast<char *>(utf8string.c_str());
    char *out = reinterpret_cast<char *>(buf);
    const size_t rc = iconv(utf82wstr, &in, &in_size, &out, &out_size);

    // the descriptor is no longer needed, whatever the outcome was
    iconv_close(utf82wstr);

    if (rc == (size_t)-1)
    {
        // release the buffer before reporting the failure
        free(buf);
        throw std::runtime_error("error converting char encodings");
    }

    // terminate right behind the last converted character
    buf[(capacity - out_size) / sizeof(wchar_t)] = 0;

    return buf;
}
108
// Convert a string from the "UTF-7-IMAP" encoding to UTF-8 using iconv.
//
// utf7imapstring: bytes to convert, interpreted as UTF-7-IMAP
// returns:        the UTF-8 encoded text; embedded NUL bytes are kept
// throws:         std::runtime_error if the converter cannot be opened (for
//                 example because iconv does not know "UTF-7-IMAP") or the
//                 output buffer cannot be allocated
//
// The return value of iconv() is deliberately not checked: if the conversion
// stops early, the output produced up to that point is returned.
std::string utf7imap_to_utf8(const std::string& utf7imapstring)
{
    iconv_t utf7imap2utf8 = iconv_open("UTF-8", "UTF-7-IMAP");

    if (utf7imap2utf8 == (iconv_t)-1)
        throw std::runtime_error("iconv can't convert from UTF-7-IMAP to UTF-8");

    size_t in_size = utf7imapstring.size();
    const size_t capacity = in_size * 4;
    size_t out_size = capacity;

    // The result string doubles as the output buffer, so nothing has to be
    // freed by hand. One spare byte keeps &result[0] valid for empty input.
    std::string result;
    try
    {
        result.resize(capacity + 1);
    }
    catch (...)
    {
        // do not leak the conversion descriptor on the error path
        iconv_close(utf7imap2utf8);
        throw std::runtime_error("out of memory for iconv buffer");
    }

    // POSIX declares the input pointer as char**; the input bytes are only read.
    char *in = const_cast<char *>(utf7imapstring.c_str());
    char *out = &result[0];
    iconv(utf7imap2utf8, &in, &in_size, &out, &out_size);
    iconv_close(utf7imap2utf8);

    // iconv() decreases out_size by the number of bytes it wrote.
    result.resize(capacity - out_size);
    return result;
}
138
// Split a string into alternating text and (HTML) tag tokens.
//
// Tokens are appended to "tokenized"; existing entries are left untouched.
// pair.first is the token text, pair.second is true for a tag. A tag runs
// from a '<' up to and including the next '>'. A '<' that is never closed
// by a '>' (including one that is followed by another '<') ends up in a
// token marked as text.
void tokenize_by_tag(std::vector<std::pair<std::string,bool> > &tokenized, const std::string &input)
{
    std::string chunk;
    bool in_tag = false;

    for (std::string::const_iterator it = input.begin(); it != input.end(); ++it)
    {
        const char c = *it;

        if (c == '<')
        {
            // whatever was collected so far is finished and counts as text
            if (!chunk.empty())
            {
                tokenized.push_back(std::make_pair(chunk, false));
                chunk.clear();
            }

            in_tag = true;
            chunk += c;
            continue;
        }

        chunk += c;

        // a '>' only terminates a token while a tag is open
        if (c == '>' && in_tag)
        {
            tokenized.push_back(std::make_pair(chunk, true));
            chunk.clear();
            in_tag = false;
        }
    }

    // trailing characters that were never terminated by '>'
    if (!chunk.empty())
        tokenized.push_back(std::make_pair(chunk, false));
}
171
172std::string strip_html_tags(const std::string &input)
173{
174 // Pair first: string, second: isTag
175 vector<pair<string,bool> > tokenized;
176 tokenize_by_tag(tokenized, input);
177
178 string output;
179 vector<pair<string,bool> >::const_iterator token, tokens_end = tokenized.end();
180 for (token = tokenized.begin(); token != tokens_end; token++)
181 if (!token->second)
182 output += token->first;
183
184 return output;
185}
186
187// Smart-encode HTML en
188string smart_html_entities(const std::string &input)
189{
190 // Pair first: string, second: isTag
191 vector<pair<string,bool> > tokenized;
192 tokenize_by_tag(tokenized, input);
193
194 string output;
195 vector<pair<string,bool> >::const_iterator token, tokens_end = tokenized.end();
196 for (token = tokenized.begin(); token != tokens_end; token++) {
197 // keep HTML tags as they are
198 if (token->second)
199 output += token->first;
200 else
201 output += html_entities(token->first);
202 }
203
204 return output;
205}
206
a5f3af6e
GE
// Return the position of the first byte with a value above 127, or
// std::string::npos if the string contains no such byte.
std::string::size_type find_8bit(const std::string &str)
{
    for (std::string::const_iterator it = str.begin(); it != str.end(); ++it)
    {
        // look at the byte as unsigned so that 8-bit values compare above 127
        const unsigned char byte = static_cast<unsigned char>(*it);
        if (byte >= 128)
            return static_cast<std::string::size_type>(it - str.begin());
    }

    return std::string::npos;
}
216
118e216e
TJ
217// encoded UTF-8 chars into HTML entities
218string html_entities(std::string str)
219{
220 // Normal chars
221 replace_all (str, "&", "&amp;");
222 replace_all (str, "\"", "&quot;");
223 replace_all (str, "<", "&lt;");
224 replace_all (str, ">", "&gt;");
225
226 // Umlauts
a5f3af6e
GE
227 replace_all (str, "\xC3\xA4", "&auml;");
228 replace_all (str, "\xC3\xB6", "&ouml;");
229 replace_all (str, "\xC3\xBC", "&uuml;");
230 replace_all (str, "\xC3\x84", "&Auml;");
231 replace_all (str, "\xC3\x96", "&Ouml;");
232 replace_all (str, "\xC3\x9C", "&Uuml;");
118e216e
TJ
233
234 // Misc
a5f3af6e
GE
235 replace_all (str, "\xC3\x9F", "&szlig;");
236
237 // conversion of remaining non-ASCII chars needed?
238 // just do if needed because of performance
239 if (find_8bit(str) != string::npos)
240 {
241 // convert to fixed-size encoding UTF-32
242 wchar_t* wbuf=utf8_to_wbuf(str);
243 ostringstream target;
244
245 // replace all non-ASCII chars with HTML representation
246 for (int p=0; wbuf[p] != 0; p++)
247 {
248 unsigned int c=wbuf[p];
249
250 if (c <= 127)
251 target << static_cast<unsigned char>(c);
252 else
253 target << "&#" << c << ';';
254 }
255
256 free(wbuf);
257
258 str=target.str();
259 }
118e216e
TJ
260
261 return str;
262}
263
e93545dd
GE
264bool replace_all(string &base, const char *ist, const char *soll)
265{
266 string i=ist;
267 string s=soll;
268 return replace_all(base,&i,&s);
269}
270
271bool replace_all(string &base, const string &ist, const char *soll)
272{
273 string s=soll;
274 return replace_all(base,&ist,&s);
275}
276
277bool replace_all(string &base, const string *ist, const string *soll)
278{
279 return replace_all(base,*ist,*soll);
280}
281
282bool replace_all(string &base, const char *ist, const string *soll)
283{
284 string i=ist;
285 return replace_all(base,&i,soll);
286}
287
// Replace every occurrence of "ist" in "base" by "soll", in place.
//
// The search continues behind each inserted replacement, so text inserted
// by a replacement is never searched again.
//
// returns: true if at least one occurrence was replaced
// throws:  std::runtime_error if "ist" is empty
bool replace_all(std::string &base, const std::string &ist, const std::string &soll)
{
    if (ist.empty())
        throw std::runtime_error("replace_all called with empty search string");

    bool replaced_any = false;

    for (std::string::size_type hit = base.find(ist);
         hit != std::string::npos;
         hit = base.find(ist, hit + soll.size()))
    {
        base.replace(hit, ist.size(), soll);
        replaced_any = true;
    }

    return replaced_any;
}
305
// Return a copy of "src" in which every byte has been passed through
// tolower(). The conversion works byte by byte.
std::string to_lower(const std::string &src)
{
    std::string dst = src;

    for (std::string::size_type pos = 0; pos < dst.size(); ++pos)
    {
        // tolower() is only defined for values representable as unsigned
        // char (or EOF); a plain char can be negative for 8-bit input.
        dst[pos] = static_cast<char>(tolower(static_cast<unsigned char>(dst[pos])));
    }

    return dst;
}
316
// Return a copy of "src" in which every byte has been passed through
// toupper(). The conversion works byte by byte.
std::string to_upper(const std::string &src)
{
    std::string dst = src;

    for (std::string::size_type pos = 0; pos < dst.size(); ++pos)
    {
        // toupper() is only defined for values representable as unsigned
        // char (or EOF); a plain char can be negative for 8-bit input.
        dst[pos] = static_cast<char>(toupper(static_cast<unsigned char>(dst[pos])));
    }

    return dst;
}
327
328string nice_unit_format (int input) {
329 float size = input;
330 int sizecount = 0;
331
332 while (size > 1000) {
333 size = size / 1000;
334 sizecount++;
335 }
336
337 float tmp; // round
338 tmp = size*10;
339 tmp += 0.5;
340 tmp = int (tmp);
341 tmp = float(tmp)/float(10);
342 size = tmp;
343
344 ostringstream out;
345
346 out.setf (ios::fixed);
347 out.precision(2);
348 switch (sizecount) {
349 case 1:
350 out << size << i18n(" KBytes");
351 break;
352 case 2:
353 out << size << i18n(" MBytes");
354 break;
355 case 3:
356 out << size << i18n(" Gbytes");
357 break;
358 default:
359 out << size << i18n(" Bytes");
360 break;
361 }
362
363 return out.str();
364}
365
47c07fba
GE
// Turn a string into a quoted, escaped representation.
//
// A backslash is put in front of every double quote and backslash, carriage
// return becomes the two characters \r, line feed becomes \n, and the
// result is wrapped in double quotes.
std::string escape(const std::string &s)
{
    std::string out;
    out.reserve(s.size() + 2);

    out += '"';

    for (std::string::const_iterator it = s.begin(); it != s.end(); ++it)
    {
        switch (*it)
        {
            case '"':
            case '\\':
                out += '\\';
                out += *it;
                break;
            case '\r':
                out += "\\r";
                break;
            case '\n':
                out += "\\n";
                break;
            default:
                out += *it;
                break;
        }
    }

    out += '"';

    return out;
}
396
// Read a quoted, escaped string starting at s[startpos] and return its
// unescaped content.
//
// s:        text containing the escaped string
// startpos: position of the opening double quote
// endpos:   set to the position of the closing double quote; if no closing
//           quote exists, the rest of "s" is taken as content and the
//           unsigned arithmetic below wraps around, which appears to leave
//           endpos equal to startpos
// returns:  the content with \r turned into carriage return, \n into line
//           feed and every other backslash-escaped character taken literally
// throws:   std::out_of_range if startpos is outside of "s", if s[startpos]
//           is not a double quote, or if the content ends with a backslash
//           that has nothing to escape
std::string descape(const std::string &s, int startpos, int &endpos)
{
    if (s.at(startpos) != '"')
        throw std::out_of_range("value not type escaped string");

    const std::string rest = s.substr(startpos + 1);

    // search for the closing quote
    std::string::size_type closing = 0;
    while ((closing = rest.find('"', closing)) != std::string::npos)
    {
        // count the backslashes directly in front of the quote: an odd
        // number means the quote itself is escaped
        std::string::size_type backslashes = 0;
        while (backslashes < closing && rest[closing - 1 - backslashes] == '\\')
            ++backslashes;

        if (backslashes % 2 == 0)
            break;

        ++closing;
    }

    // everything in front of the closing quote is the escaped content
    const std::string raw = rest.substr(0, closing);

    // tell the caller where the string ended
    endpos = startpos + closing + 1;

    // resolve the backslash sequences
    std::string out;
    for (std::string::size_type i = 0; i < raw.size(); ++i)
    {
        if (raw[i] != '\\')
        {
            out += raw[i];
            continue;
        }

        // at() throws if the backslash is the last character
        const char escaped = raw.at(i + 1);
        if (escaped == 'r')
            out += "\r";
        else if (escaped == 'n')
            out += "\n";
        else
            out += escaped;

        // the escaped character has been consumed as well
        ++i;
    }

    return out;
}
e93545dd 456
47c07fba
GE
// Wrap a string in single quotes for use as a shell argument.
//
// Every single quote inside the input is written as '\'' : the quoted
// section is closed, an escaped quote is emitted and the quoting is opened
// again.
std::string escape_shellarg(const std::string &input)
{
    std::string quoted(1, '\'');

    for (std::string::size_type i = 0; i < input.size(); ++i)
    {
        const char c = input[i];

        // close the quoting and escape; the quote appended below reopens it
        if (c == '\'')
            quoted += "'\\'";

        quoted += c;
    }

    quoted += '\'';
    return quoted;
}
470}