comparison src/vbuf.c @ 116:ed2a260bbb98 stable-0-6-25

improve handling of content-type charset values in mime parts
author Carl Byington <carl@five-ten-sg.com>
date Fri, 16 Jan 2009 15:23:52 -0800
parents cb14583c119a
children 6395ced2b8b2
comparison
equal deleted inserted replaced
115:7689c006b166 116:ed2a260bbb98
38 38
39 nextr = memchr(vs->b, '\r', vs->dlen); 39 nextr = memchr(vs->b, '\r', vs->dlen);
40 nextn = memchr(vs->b, '\n', vs->dlen); 40 nextn = memchr(vs->b, '\n', vs->dlen);
41 41
42 //case 1: UNIX, we find \n first 42 //case 1: UNIX, we find \n first
43 if (nextn && (nextr == NULL || nextr > nextn)) { 43 if (nextn && (!nextr || (nextr > nextn))) {
44 return nextn - vs->b; 44 return nextn - vs->b;
45 } 45 }
46 //case 2: DOS, we find \r\n 46 //case 2: DOS, we find \r\n
47 if (NULL != nextr && NULL != nextn && 1 == (char *) nextn - (char *) nextr) { 47 if (nextr && nextn && (nextn-nextr == 1)) {
48 return nextr - vs->b; 48 return nextr - vs->b;
49 } 49 }
50 //case 3: we find nothing 50 //case 3: we find nothing
51 51
52 return -1; 52 return -1;
53 } 53 }
54 54
55 55
56 // UTF8 <-> UTF16 <-> ISO8859 Character set conversion functions and (ack) their globals 56 // UTF8 <-> UTF16 <-> ISO8859 Character set conversion functions and (ack) their globals
57 57
58 //TODO: the following should not be
59 char *wwbuf = NULL;
60 size_t nwwbuf = 0;
61 static int unicode_up = 0; 58 static int unicode_up = 0;
62 iconv_t i16to8, i8to16, i8859_1to8, i8toi8859_1; 59 static iconv_t i16to8;
60 static const char *target_charset = NULL;
61 static iconv_t i8totarget;
63 62
64 63
65 void unicode_init() 64 void unicode_init()
66 { 65 {
67 char *wipe = ""; 66 if (unicode_up) unicode_close();
68 char dump[4]; 67 i16to8 = iconv_open("UTF-8", "UTF-16LE");
69 68 if (i16to8 == (iconv_t)-1) {
70 if (unicode_up) 69 fprintf(stderr, "Couldn't open iconv descriptor for UTF-16LE to UTF-8.\n");
71 unicode_close();
72
73 if ((iconv_t) - 1 == (i16to8 = iconv_open("UTF-8", "UTF-16LE"))) {
74 fprintf(stderr, "doexport(): Couldn't open iconv descriptor for UTF-16LE to UTF-8.\n");
75 exit(1); 70 exit(1);
76 } 71 }
77
78 if ((iconv_t) - 1 == (i8to16 = iconv_open("UTF-16LE", "UTF-8"))) {
79 fprintf(stderr, "doexport(): Couldn't open iconv descriptor for UTF-8 to UTF-16LE.\n");
80 exit(2);
81 }
82 //iconv will prefix output with an FF FE (utf-16 start seq), the following dumps that.
83 memset(dump, 'x', 4);
84 ASSERT(0 == utf8to16(wipe, 1, dump, 4), "unicode_init(): attempt to dump FF FE failed.");
85
86 if ((iconv_t) - 1 == (i8859_1to8 = iconv_open("UTF-8", "ISO_8859-1"))) {
87 fprintf(stderr, "doexport(): Couldn't open iconv descriptor for ASCII to UTF-8.\n");
88 exit(1);
89 }
90
91 if ((iconv_t) - 1 == (i8toi8859_1 = iconv_open("ISO_8859-1", "UTF-8"))) {
92 fprintf(stderr, "doexport(): Couldn't open iconv descriptor for UTF-8 to ASCII.\n");
93 exit(1);
94 }
95
96 unicode_up = 1; 72 unicode_up = 1;
97 } 73 }
98 74
99 75
100 void unicode_close() 76 void unicode_close()
101 { 77 {
78 iconv_close(i16to8);
79 if (target_charset) {
80 iconv_close(i8totarget);
81 free((char *)target_charset);
82 target_charset = NULL;
83 }
102 unicode_up = 0; 84 unicode_up = 0;
103 iconv_close(i8to16); 85 }
104 iconv_close(i16to8); 86
105 iconv_close(i8859_1to8); 87
106 iconv_close(i8toi8859_1); 88 int utf16_is_terminated(const char *str, int length)
107 }
108
109
110 int utf16_is_terminated(char *str, int length)
111 { 89 {
112 VSTR_STATIC(errbuf, 100); 90 VSTR_STATIC(errbuf, 100);
113 int len = -1; 91 int len = -1;
114 int i; 92 int i;
115 for (i = 0; i < length; i += 2) { 93 for (i = 0; i < length; i += 2) {
125 103
126 return (-1 == len) ? 0 : 1; 104 return (-1 == len) ? 0 : 1;
127 } 105 }
128 106
129 107
130 int vb_utf16to8(vbuf * dest, char *buf, int len) 108 size_t vb_utf16to8(vbuf *dest, const char *inbuf, int iblen)
131 { 109 {
132 size_t inbytesleft = len; 110 size_t inbytesleft = iblen;
133 char *inbuf = buf; 111 size_t icresult = (size_t)-1;
134 size_t icresult = (size_t)-1;
135 VBUF_STATIC(dumpster, 100);
136
137 size_t outbytesleft = 0; 112 size_t outbytesleft = 0;
138 char *outbuf = NULL; 113 char *outbuf = NULL;
139 114
140 ASSERT(unicode_up, "vb_utf16to8() called before unicode started."); 115 ASSERT(unicode_up, "vb_utf16to8() called before unicode started.");
141 116
142 if (2 > dest->blen) 117 if (2 > dest->blen) vbresize(dest, 2);
143 vbresize(dest, 2);
144 dest->dlen = 0; 118 dest->dlen = 0;
145 119
146 //Bad Things can happen if a non-zero-terminated utf16 string comes through here 120 //Bad Things can happen if a non-zero-terminated utf16 string comes through here
147 if (!utf16_is_terminated(buf, len)) 121 if (!utf16_is_terminated(inbuf, iblen))
148 return -1; 122 return (size_t)-1;
149 123
150 do { 124 do {
151 outbytesleft = dest->blen - dest->dlen; 125 outbytesleft = dest->blen - dest->dlen;
152 outbuf = dest->b + dest->dlen; 126 outbuf = dest->b + dest->dlen;
153 icresult = iconv(i16to8, &inbuf, &inbytesleft, &outbuf, &outbytesleft); 127 icresult = iconv(i16to8, (ICONV_CONST char**)&inbuf, &inbytesleft, &outbuf, &outbytesleft);
154 dest->dlen = outbuf - dest->b; 128 dest->dlen = outbuf - dest->b;
155 vbgrow(dest, inbytesleft); 129 vbgrow(dest, inbytesleft);
156 } while ((size_t)-1 == icresult && E2BIG == errno); 130 } while ((size_t)-1 == icresult && E2BIG == errno);
157 131
158 if (0 != vb_utf8to16T(dumpster, dest->b, dest->dlen))
159 DIE(("Reverse conversion failed."));
160
161 if (icresult == (size_t)-1) {
162 //TODO: error
163 //ERR_UNIX( errno, "vb_utf16to8():iconv failure: %s", strerror( errno ) );
164 unicode_init();
165 return -1;
166 /*
167 fprintf(stderr, " attempted to convert:\n");
168 hexdump( (char*)cin, 0, inlen, 1 );
169 fprintf(stderr, " result:\n");
170 hexdump( (char*)bout->b, 0, bout->dlen, 1 );
171 fprintf(stderr, " MyDirtyOut:\n");
172 for( i=0; i<inlen; i++) {
173 if( inbuf[i] != '\0' ) fprintf(stderr, "%c", inbuf[i] );
174 }
175
176 fprintf( stderr, "\n" );
177 raise( SIGSEGV );
178 exit(1);
179 */
180 }
181
182 if (icresult) {
183 //ERR_UNIX( EILSEQ, "Uhhhh...vb_utf16to8() returning icresult == %d", icresult );
184 return -1;
185 }
186 return icresult;
187 }
188
189
190 int utf8to16(char *inbuf_o, int iblen, char *outbuf_o, int oblen) // iblen, oblen: bytes including \0
191 {
192 //TODO: this is *only* used to dump the utf16 preamble now...
193 //TODO: This (and 8to16) are the most horrible things I have ever seen...
194 size_t inbytesleft = 0;
195 size_t outbytesleft = oblen;
196 char *inbuf = inbuf_o;
197 char *outbuf = outbuf_o;
198 size_t icresult = (size_t)-1;
199 char *stend;
200
201 stend = memchr(inbuf_o, '\0', iblen);
202 ASSERT(NULL != stend, "utf8to16(): in string not zero terminated.");
203 inbytesleft = (stend - inbuf_o + 1 < iblen) ? stend - inbuf_o + 1 : iblen;
204 icresult = iconv(i8to16, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
205
206 if (icresult == (size_t)-1) {
207 DIE(("iconv failure(%d): %s\n", errno, strerror(errno)));
208 }
209 if (icresult > (size_t)INT_MAX) {
210 return (-1);
211 }
212 return (int) icresult;
213 }
214
215
216 int vb_utf8to16T(vbuf * bout, char *cin, int inlen)
217 {
218 //TODO: This (and 8to16) are the most horrible things I have ever seen...
219 size_t inbytesleft = inlen;
220 char *inbuf = cin;
221 //int rlen = -1, tlen;
222 size_t icresult = (size_t)-1;
223 size_t outbytesleft = 0;
224 char *outbuf = NULL;
225
226 if (2 > bout->blen)
227 vbresize(bout, 2);
228 bout->dlen = 0;
229
230 do {
231 outbytesleft = bout->blen - bout->dlen;
232 outbuf = bout->b + bout->dlen;
233 icresult = iconv(i8to16, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
234 bout->dlen = outbuf - bout->b;
235 vbgrow(bout, 20);
236 } while ((size_t)-1 == icresult && E2BIG == errno);
237
238 if (icresult == (size_t)-1) { 132 if (icresult == (size_t)-1) {
239 WARN(("iconv failure: %s", strerror(errno))); 133 WARN(("iconv failure: %s", strerror(errno)));
240 unicode_init(); 134 unicode_init();
241 return -1; 135 return (size_t)-1;
242 } 136 }
243 if (icresult > (size_t) INT_MAX) { 137 return (icresult) ? (size_t)-1 : 0;
244 return (-1); 138 }
245 } 139
246 return icresult; 140
247 } 141 size_t vb_utf8to8bit(vbuf *dest, const char *inbuf, int iblen, const char* charset)
248 142 {
249 143 size_t inbytesleft = iblen;
250 /* Quick and dirty UNICODE to std. ascii */ 144 size_t icresult = (size_t)-1;
251 void cheap_uni2ascii(char *src, char *dest, int l) 145 size_t outbytesleft = 0;
252 { 146 char *outbuf = NULL;
253 147
254 for (; l > 0; l -= 2) { 148 if (!target_charset || (target_charset && strcasecmp(target_charset, charset))) {
255 *dest = *src; 149 if (target_charset) {
256 dest++; 150 iconv_close(i8totarget);
257 src += 2; 151 free((char *)target_charset);
258 } 152 }
259 *dest = 0; 153 target_charset = strdup(charset);
260 } 154 i8totarget = iconv_open(target_charset, "UTF-8");
261 155 if (i8totarget == (iconv_t)-1) {
262 156 fprintf(stderr, "Couldn't open iconv descriptor for UTF-8 to %s.\n", target_charset);
263 /* Quick and dirty ascii to unicode */ 157 return (size_t)-1;
264 void cheap_ascii2uni(char *src, char *dest, int l) 158 }
265 { 159 }
266 for (; l > 0; l--) { 160
267 *dest++ = *src++; 161 if (2 > dest->blen) vbresize(dest, 2);
268 *dest++ = 0; 162 dest->dlen = 0;
269 163
270 } 164 do {
165 outbytesleft = dest->blen - dest->dlen;
166 outbuf = dest->b + dest->dlen;
167 icresult = iconv(i8totarget, (ICONV_CONST char**)&inbuf, &inbytesleft, &outbuf, &outbytesleft);
168 dest->dlen = outbuf - dest->b;
169 vbgrow(dest, 20);
170 } while ((size_t)-1 == icresult && E2BIG == errno);
171
172 if (icresult == (size_t)-1) {
173 WARN(("iconv failure: %s", strerror(errno)));
174 unicode_init();
175 return (size_t)-1;
176 }
177 return (icresult) ? (size_t)-1 : 0;
271 } 178 }
272 179
273 180
274 vbuf *vballoc(size_t len) 181 vbuf *vballoc(size_t len)
275 { 182 {
607 vbgrow((vbuf *) vs, size); 514 vbgrow((vbuf *) vs, size);
608 } 515 }
609 } 516 }
610 517
611 518
612 void vshexdump(vstr * vs, char *b, size_t start, size_t stop, int ascii) 519 void vshexdump(vstr * vs, const char *b, size_t start, size_t stop, int ascii)
613 { 520 {
614 char c; 521 char c;
615 int diff, i; 522 int diff, i;
616 523
617 while (start < stop) { 524 while (start < stop) {