Mercurial > libpst
view src/vbuf.c @ 142:2189a6b8134e
improve character set handling - don't try to convert utf-8 to single byte for fields that were not originally unicode.
if the conversion fails, leave the data in utf-8.
author | Carl Byington <carl@five-ten-sg.com> |
---|---|
date | Mon, 23 Feb 2009 20:40:51 -0800 |
parents | fc11b1d1ad34 |
children | cda7c812ec01 |
line wrap: on
line source
#include "define.h" #define ASSERT(x,...) { if( !(x) ) DIE(( __VA_ARGS__)); } int skip_nl(char *s) { if (s[0] == '\n') return 1; if (s[0] == '\r' && s[1] == '\n') return 2; if (s[0] == '\0') return 0; return -1; } int find_nl(vstr * vs) { char *nextr, *nextn; nextr = memchr(vs->b, '\r', vs->dlen); nextn = memchr(vs->b, '\n', vs->dlen); //case 1: UNIX, we find \n first if (nextn && (!nextr || (nextr > nextn))) { return nextn - vs->b; } //case 2: DOS, we find \r\n if (nextr && nextn && (nextn-nextr == 1)) { return nextr - vs->b; } //case 3: we find nothing return -1; } // UTF8 <-> UTF16 <-> ISO8859 Character set conversion functions and (ack) their globals static int unicode_up = 0; static iconv_t i16to8; static const char *target_charset = NULL; static int target_open = 0; static iconv_t i8totarget; void unicode_init() { if (unicode_up) unicode_close(); i16to8 = iconv_open("UTF-8", "UTF-16LE"); if (i16to8 == (iconv_t)-1) { fprintf(stderr, "Couldn't open iconv descriptor for UTF-16LE to UTF-8.\n"); exit(1); } unicode_up = 1; } void unicode_close() { iconv_close(i16to8); if (target_open) { iconv_close(i8totarget); free((char *)target_charset); target_charset = NULL; target_open = 0; } unicode_up = 0; } int utf16_is_terminated(const char *str, int length) { VSTR_STATIC(errbuf, 100); int len = -1; int i; for (i = 0; i < length; i += 2) { if (str[i] == 0 && str[i + 1] == 0) { len = i; } } if (-1 == len) { vshexdump(errbuf, str, 0, length, 1); DEBUG_WARN(("String is not zero terminated (probably broken data from registry) %s.\n", errbuf->b)); } return (-1 == len) ? 0 : 1; } size_t vb_utf16to8(vbuf *dest, const char *inbuf, int iblen) { size_t inbytesleft = iblen; size_t icresult = (size_t)-1; size_t outbytesleft = 0; char *outbuf = NULL; ASSERT(unicode_up, "vb_utf16to8() called before unicode started."); if (2 > dest->blen) vbresize(dest, 2); dest->dlen = 0; //Bad Things can happen if a non-zero-terminated utf16 string comes through here if (!utf16_is_terminated(inbuf, iblen)) return (size_t)-1; do { outbytesleft = dest->blen - dest->dlen; outbuf = dest->b + dest->dlen; icresult = iconv(i16to8, (ICONV_CONST char**)&inbuf, &inbytesleft, &outbuf, &outbytesleft); dest->dlen = outbuf - dest->b; vbgrow(dest, inbytesleft); } while ((size_t)-1 == icresult && E2BIG == errno); if (icresult == (size_t)-1) { DEBUG_WARN(("iconv failure: %s\n", strerror(errno))); unicode_init(); return (size_t)-1; } return (icresult) ? (size_t)-1 : 0; } size_t vb_utf8to8bit(vbuf *dest, const char *inbuf, int iblen, const char* charset) { size_t inbytesleft = iblen; size_t icresult = (size_t)-1; size_t outbytesleft = 0; char *outbuf = NULL; if (!target_charset || strcasecmp(target_charset, charset)) { if (target_open) { iconv_close(i8totarget); free((char *)target_charset); } target_charset = strdup(charset); target_open = 1; i8totarget = iconv_open(target_charset, "UTF-8"); if (i8totarget == (iconv_t)-1) { target_open = 0; fprintf(stderr, "Couldn't open iconv descriptor for UTF-8 to %s.\n", target_charset); return (size_t)-1; } } if (!target_open) return (size_t)-1; // previous failure to open the target if (2 > dest->blen) vbresize(dest, 2); dest->dlen = 0; do { outbytesleft = dest->blen - dest->dlen; outbuf = dest->b + dest->dlen; icresult = iconv(i8totarget, (ICONV_CONST char**)&inbuf, &inbytesleft, &outbuf, &outbytesleft); dest->dlen = outbuf - dest->b; vbgrow(dest, 20); } while ((size_t)-1 == icresult && E2BIG == errno); if (icresult == (size_t)-1) { WARN(("iconv failure: %s\n", strerror(errno))); unicode_init(); return (size_t)-1; } return (icresult) ? (size_t)-1 : 0; } vbuf *vballoc(size_t len) { struct varbuf *result = malloc(sizeof(struct varbuf)); if (result) { result->dlen = 0; result->blen = 0; result->buf = NULL; vbresize(result, len); } else DIE(("malloc() failure")); return result; } void vbcheck(vbuf * vb) { ASSERT(vb->b >= vb->buf, "vbcheck(): data not inside buffer"); ASSERT((size_t)(vb->b - vb->buf) <= vb->blen, "vbcheck(): vb->b outside of buffer range."); ASSERT(vb->dlen <= vb->blen, "vbcheck(): data length > buffer length."); ASSERT(vb->blen < 1024 * 1024, "vbcheck(): blen is a bit large...hmmm."); } void vbfree(vbuf * vb) { free(vb->buf); free(vb); } void vbclear(struct varbuf *vb) // ditch the data, keep the buffer { vbresize(vb, 0); } void vbresize(struct varbuf *vb, size_t len) // DESTRUCTIVELY grow or shrink buffer { vb->dlen = 0; if (vb->blen >= len) { vb->b = vb->buf; return; } vb->buf = realloc(vb->buf, len); vb->b = vb->buf; vb->blen = len; } size_t vbavail(vbuf * vb) { return vb->blen - vb->dlen - (size_t)(vb->b - vb->buf); } //void vbdump( vbuf *vb ) // TODO: to stdout? Yuck //{ // printf("vb dump-------------\n"); // printf("dlen: %d\n", vb->dlen ); // printf("blen: %d\n", vb->blen ); // printf("b - buf: %d\n", vb->b - vb->buf ); // printf("buf:\n"); // hexdump( vb->buf, 0, vb->blen, 1 ); // printf("b:\n"); // hexdump( vb->b, 0, vb->dlen, 1 ); // printf("^^^^^^^^^^^^^^^^^^^^\n"); //} void vbgrow(struct varbuf *vb, size_t len) // out: vbavail(vb) >= len, data are preserved { if (0 == len) return; if (0 == vb->blen) { vbresize(vb, len); return; } if (vb->dlen + len > vb->blen) { if (vb->dlen + len < vb->blen * 1.5) len = vb->blen * 1.5; char *nb = malloc(vb->blen + len); if (!nb) DIE(("malloc() failure")); vb->blen = vb->blen + len; memcpy(nb, vb->b, vb->dlen); free(vb->buf); vb->buf = nb; vb->b = vb->buf; } else { if (vb->b != vb->buf) memcpy(vb->buf, vb->b, vb->dlen); } vb->b = vb->buf; ASSERT(vbavail(vb) >= len, "vbgrow(): I have failed in my mission."); } void vbset(vbuf * vb, void *b, size_t len) // set vbuf b size=len, resize if necessary, relen = how much to over-allocate { vbresize(vb, len); memcpy(vb->b, b, len); vb->dlen = len; } void vsskipws(vstr * vs) { char *p = vs->b; while ((size_t)(p - vs->b) < vs->dlen && isspace(p[0])) p++; vbskip((vbuf *) vs, p - vs->b); } // append len bytes of b to vbuf, resize if necessary void vbappend(struct varbuf *vb, void *b, size_t len) { if (0 == vb->dlen) { vbset(vb, b, len); return; } vbgrow(vb, len); memcpy(vb->b + vb->dlen, b, len); vb->dlen += len; } // dumps the first skip bytes from vbuf void vbskip(struct varbuf *vb, size_t skip) { ASSERT(skip <= vb->dlen, "vbskip(): Attempt to seek past end of buffer."); vb->b += skip; vb->dlen -= skip; } // overwrite vbdest with vbsrc void vboverwrite(struct varbuf *vbdest, struct varbuf *vbsrc) { vbresize(vbdest, vbsrc->blen); memcpy(vbdest->b, vbsrc->b, vbsrc->dlen); vbdest->blen = vbsrc->blen; vbdest->dlen = vbsrc->dlen; } vstr *vsalloc(size_t len) { vstr *result = (vstr *) vballoc(len + 1); vsset(result, ""); return result; } char *vsstr(vstr * vs) { return vs->b; } size_t vslen(vstr * vs) { return strlen(vsstr(vs)); } void vsfree(vstr * vs) { vbfree((vbuf *) vs); } void vscharcat(vstr * vb, int ch) { vbgrow((vbuf *) vb, 1); vb->b[vb->dlen - 1] = ch; vb->b[vb->dlen] = '\0'; vb->dlen++; } // prependappend string str to vbuf, vbuf must already contain a valid string void vsnprepend(vstr * vb, char *str, size_t len) { ASSERT(vb->b[vb->dlen - 1] == '\0', "vsncat(): attempt to append string to non-string."); size_t sl = strlen(str); size_t n = (sl < len) ? sl : len; vbgrow((vbuf *) vb, n + 1); memmove(vb->b + n, vb->b, vb->dlen - 1); memcpy(vb->b, str, n); vb->dlen += n; vb->b[vb->dlen - 1] = '\0'; } // len < dlen-1 -> skip len chars, else DIE void vsskip(vstr * vs, size_t len) { ASSERT(len < vs->dlen - 1, "Attempt to skip past end of string"); vbskip((vbuf *) vs, len); } // in: vb->b == "stuff\nmore_stuff"; out: vb->b == "more_stuff" int vsskipline(vstr * vs) { int nloff = find_nl(vs); int nll = skip_nl(vs->b + nloff); if (nloff < 0) { //TODO: error printf("vb_skipline(): there seems to be no newline here.\n"); return -1; } if (nll < 0) { //TODO: error printf("vb_skipline(): there seems to be no newline here...except there should be. :P\n"); return -1; } memmove(vs->b, vs->b + nloff + nll, vs->dlen - nloff - nll); vs->dlen -= nloff + nll; return 0; } int vscatprintf(vstr * vs, char *fmt, ...) { int size; va_list ap; /* Guess we need no more than 100 bytes. */ //vsresize( vb, 100 ); if (!vs->b || vs->dlen == 0) { vsset(vs, ""); } while (1) { /* Try to print in the allocated space. */ va_start(ap, fmt); size = vsnprintf(vs->b + vs->dlen - 1, vs->blen - vs->dlen, fmt, ap); va_end(ap); /* If that worked, return the string. */ if ((size > -1) && ((size_t)size < vs->blen - vs->dlen)) { vs->dlen += size; return size; } /* Else try again with more space. */ if (size >= 0) /* glibc 2.1 */ vbgrow((vbuf *) vs, size + 1); /* precisely what is needed */ else /* glibc 2.0 */ vbgrow((vbuf *) vs, vs->blen); } } // returns the last character stored in a vstr int vslast(vstr * vs) { if (vs->dlen < 1) return -1; if (vs->b[vs->dlen - 1] != '\0') return -1; if (vs->dlen == 1) return '\0'; return vs->b[vs->dlen - 2]; } // print over vb void vs_printf(vstr * vs, char *fmt, ...) { int size; va_list ap; /* Guess we need no more than 100 bytes. */ vbresize((vbuf *) vs, 100); while (1) { /* Try to print in the allocated space. */ va_start(ap, fmt); size = vsnprintf(vs->b, vs->blen, fmt, ap); va_end(ap); /* If that worked, return the string. */ if ((size > -1) && ((size_t)size < vs->blen)) { vs->dlen = size + 1; return; } /* Else try again with more space. */ if (size >= 0) /* glibc 2.1 */ vbresize((vbuf *) vs, size + 1); /* precisely what is needed */ else /* glibc 2.0 */ vbresize((vbuf *) vs, vs->blen * 2); } } // printf append to vs void vs_printfa(vstr * vs, char *fmt, ...) { int size; va_list ap; if (vs->blen - vs->dlen < 50) vbgrow((vbuf *) vs, 100); while (1) { /* Try to print in the allocated space. */ va_start(ap, fmt); size = vsnprintf(vs->b + vs->dlen - 1, vs->blen - vs->dlen + 1, fmt, ap); va_end(ap); /* If that worked, return the string. */ if ((size > -1) && ((size_t)size < vs->blen)) { vs->dlen += size; return; } /* Else try again with more space. */ if (size >= 0) /* glibc 2.1 */ vbgrow((vbuf *) vs, size + 1 - vs->dlen); /* precisely what is needed */ else /* glibc 2.0 */ vbgrow((vbuf *) vs, size); } } void vshexdump(vstr * vs, const char *b, size_t start, size_t stop, int ascii) { char c; int diff, i; while (start < stop) { diff = stop - start; if (diff > 16) diff = 16; vs_printfa(vs, ":%08X ", start); for (i = 0; i < diff; i++) { if (8 == i) vs_printfa(vs, " "); vs_printfa(vs, "%02X ", (unsigned char) *(b + start + i)); } if (ascii) { for (i = diff; i < 16; i++) vs_printfa(vs, " "); for (i = 0; i < diff; i++) { c = *(b + start + i); vs_printfa(vs, "%c", isprint(c) ? c : '.'); } } vs_printfa(vs, "\n"); start += 16; } } void vsset(vstr * vs, char *s) // Store string s in vs { vsnset(vs, s, strlen(s)); } void vsnset(vstr * vs, char *s, size_t n) // Store string s in vs { vbresize((vbuf *) vs, n + 1); memcpy(vs->b, s, n); vs->b[n] = '\0'; vs->dlen = n + 1; } void vsgrow(vstr * vs, size_t len) // grow buffer by len bytes, data are preserved { vbgrow((vbuf *) vs, len); } size_t vsavail(vstr * vs) { return vbavail((vbuf *) vs); } void vsnset16(vstr * vs, char *s, size_t len) // Like vbstrnset, but for UTF16 { vbresize((vbuf *) vs, len + 1); memcpy(vs->b, s, len); vs->b[len] = '\0'; vs->dlen = len + 1; vs->b[len] = '\0'; } void vscat(vstr * vs, char *str) { vsncat(vs, str, strlen(str)); } int vscmp(vstr * vs, char *str) { return strcmp(vs->b, str); } void vsncat(vstr * vs, char *str, size_t len) // append string str to vstr, vstr must already contain a valid string { ASSERT(vs->b[vs->dlen - 1] == '\0', "vsncat(): attempt to append string to non-string."); size_t sl = strlen(str); size_t n = (sl < len) ? sl : len; //string append vbgrow((vbuf *) vs, n + 1); memcpy(vs->b + vs->dlen - 1, str, n); vs->dlen += n; vs->b[vs->dlen - 1] = '\0'; } void vstrunc(vstr * v, size_t off) // Drop chars [off..dlen] { if (off >= v->dlen - 1) return; //nothing to do v->b[off] = '\0'; v->dlen = off + 1; }