changeset 116:ed2a260bbb98 stable-0-6-25

improve handling of content-type charset values in mime parts
author Carl Byington <carl@five-ten-sg.com>
date Fri, 16 Jan 2009 15:23:52 -0800
parents 7689c006b166
children 0a3d854b53f6
files ChangeLog NEWS configure.in libpst.spec.in regression/regression-tests.bash src/libpst.c src/libpst.h src/readpst.c src/vbuf.c src/vbuf.h
diffstat 10 files changed, 177 insertions(+), 241 deletions(-) [+]
line wrap: on
line diff
--- a/ChangeLog	Thu Dec 11 12:06:03 2008 -0800
+++ b/ChangeLog	Fri Jan 16 15:23:52 2009 -0800
@@ -1,3 +1,7 @@
+LibPST 0.6.25 (2009-01-16)
+===============================
+        * improve handling of content-type charset values in mime parts
+
 LibPST 0.6.24 (2008-12-11)
 ===============================
         * patch from Chris Eagle to build on cygwin
--- a/NEWS	Thu Dec 11 12:06:03 2008 -0800
+++ b/NEWS	Fri Jan 16 15:23:52 2009 -0800
@@ -1,3 +1,4 @@
+0.6.25  2009-01-16 improve handling of content-type charset values in mime parts
 0.6.24  2008-12-11 patch from Chris Eagle to build on cygwin
 0.6.23  2008-12-04 bump version to avoid cvs tagging mistake in fedora
 0.6.22  2008-11-28 process emails with type PST_TYPE_OTHER, fix malloc error and possible segfault
--- a/configure.in	Thu Dec 11 12:06:03 2008 -0800
+++ b/configure.in	Fri Jan 16 15:23:52 2009 -0800
@@ -1,5 +1,5 @@
 AC_PREREQ(2.59)
-AC_INIT(libpst,0.6.24,carl@five-ten-sg.com)
+AC_INIT(libpst,0.6.25,carl@five-ten-sg.com)
 AC_CONFIG_SRCDIR([config.h.in])
 AC_CONFIG_HEADER([config.h])
 
--- a/libpst.spec.in	Thu Dec 11 12:06:03 2008 -0800
+++ b/libpst.spec.in	Fri Jan 16 15:23:52 2009 -0800
@@ -47,6 +47,9 @@
 
 
 %changelog
+* Fri Jan 16 2009 Carl Byington <carl@five-ten-sg.com> - 0.6.25-1
+- improve handling of content-type charset values in mime parts
+
 * Thu Dec 11 2008 Carl Byington <carl@five-ten-sg.com> - 0.6.24-1
 - patch from Chris Eagle to build on cygwin
 
--- a/regression/regression-tests.bash	Thu Dec 11 12:06:03 2008 -0800
+++ b/regression/regression-tests.bash	Fri Jan 16 15:23:52 2009 -0800
@@ -19,7 +19,7 @@
     fn="$2"
     rm -rf output$n
     mkdir output$n
-    $val ../src/readpst -cv -o output$n -d dumper $fn >$fn.pst.err 2>&1
+    $val ../src/readpst -cv -o output$n -d dumper $fn >$fn.err 2>&1
          ../src/readpstlog -f I dumper >$fn.log
     #$val ../src/pst2ldif -b 'o=ams-cc.com, c=US' -c 'newPerson' -o $fn >$fn.ldif.err 2>&1
     #$val ../src/pst2ldif -b 'o=ams-cc.com, c=US' -c 'inetOrgPerson' $fn >$fn.ldif2.err 2>&1
@@ -40,19 +40,20 @@
     dodii 2 sample_64.pst
     dodii 3 test.pst
     dodii 4 big_mail.pst
-elif [ "$1" == "flow" ]; then
-    dopst 11 flow.pst
 else
-    dopst  1 ams.pst
-   #dopst  2 sample_64.pst
-   #dopst  3 test.pst
-   #dopst  4 big_mail.pst
-   #dopst  5 mbmg.archive.pst
-   #dopst  6 Single2003-read.pst
-   #dopst  7 Single2003-unread.pst
-   #dopst  8 ol2k3high.pst
-   #dopst  9 ol97high.pst
-   #dopst 10 returned_message.pst
-   #dopst 11 flow.pst
+   #dopst   1 ams.pst
+   #dopst   2 sample_64.pst
+   #dopst   3 test.pst
+   #dopst   4 big_mail.pst
+   #dopst   5 mbmg.archive.pst
+   #dopst   6 Single2003-read.pst
+   #dopst   7 Single2003-unread.pst
+   #dopst   8 ol2k3high.pst
+   #dopst   9 ol97high.pst
+   #dopst  10 returned_message.pst
+   #dopst  11 flow.pst
+   #dopst  12 test-html.pst
+   dopst  13 test-text.pst
+   #dopst  14 joe.romanowski.pst
 fi
 
--- a/src/libpst.c	Thu Dec 11 12:06:03 2008 -0800
+++ b/src/libpst.c	Fri Jan 16 15:23:52 2009 -0800
@@ -1606,6 +1606,7 @@
                 }
                 if (table_rec.ref_type == (uint16_t)0x1f) {
                     // there is more to do for the type 0x1f unicode strings
+                    size_t rc;
                     static vbuf *strbuf = NULL;
                     static vbuf *unibuf = NULL;
                     if (!strbuf) strbuf=vballoc((size_t)1024);
@@ -1620,11 +1621,17 @@
                     vbappend(strbuf, "\0\0", (size_t)2);
                     DEBUG_INDEX(("Iconv in:\n"));
                     DEBUG_HEXDUMPC(strbuf->b, strbuf->dlen, 0x10);
-                    (void)vb_utf16to8(unibuf, strbuf->b, strbuf->dlen);
-                    free(na_ptr->items[x]->data);
-                    na_ptr->items[x]->size = unibuf->dlen;
-                    na_ptr->items[x]->data = xmalloc(unibuf->dlen);
-                    memcpy(na_ptr->items[x]->data, unibuf->b, unibuf->dlen);
+                    rc = vb_utf16to8(unibuf, strbuf->b, strbuf->dlen);
+                    if (rc == (size_t)-1) {
+                        free(unibuf->b);
+                        DEBUG_EMAIL(("Failed to convert utf-16 to utf-8\n"));
+                    }
+                    else {
+                        free(na_ptr->items[x]->data);
+                        na_ptr->items[x]->size = unibuf->dlen;
+                        na_ptr->items[x]->data = xmalloc(unibuf->dlen);
+                        memcpy(na_ptr->items[x]->data, unibuf->b, unibuf->dlen);
+                    }
                     DEBUG_INDEX(("Iconv out:\n"));
                     DEBUG_HEXDUMPC(na_ptr->items[x]->data, na_ptr->items[x]->size, 0x10);
                 }
@@ -1732,6 +1739,22 @@
                         ef->next = item->extra_fields;
                         item->extra_fields = ef;
                         DEBUG_EMAIL(("\"%s\" = \"%s\"\n", ef->field_name, ef->value));
+                        if (strcmp(ef->field_name, "content-type") == 0) {
+                            char *p = strstr(ef->value, "charset=\"");
+                            if (p) {
+                                p += 9; // skip over charset="
+                                char *pp = strchr(p, '"');
+                                if (pp) {
+                                    *pp = '\0';
+                                    char *set = strdup(p);
+                                    *pp = '"';
+                                    MALLOC_EMAIL(item);
+                                    if (item->email->body_charset) free(item->email->body_charset);
+                                    item->email->body_charset = set;
+                                    DEBUG_EMAIL(("body charset %s from content-type extra field\n", set));
+                                }
+                            }
+                        }
                     }
                     else {
                         DEBUG_EMAIL(("NULL extra field\n"));
@@ -2209,8 +2232,7 @@
                     DEBUG_EMAIL(("Plain Text body - "));
                     MALLOC_EMAIL(item);
                     LIST_COPY(item->email->body, (char*));
-                    //DEBUG_EMAIL("%s\n", item->email->body);
-                    DEBUG_EMAIL(("NOT PRINTED\n"));
+                    DEBUG_EMAIL(("%s\n", item->email->body));
                     break;
                 case 0x1006: // PR_RTF_SYNC_BODY_CRC
                     DEBUG_EMAIL(("RTF Sync Body CRC - "));
@@ -2261,8 +2283,7 @@
                     DEBUG_EMAIL(("HTML body - "));
                     MALLOC_EMAIL(item);
                     LIST_COPY(item->email->htmlbody, (char*));
-                    //  DEBUG_EMAIL(("%s\n", item->email->htmlbody));
-                    DEBUG_EMAIL(("NOT PRINTED\n"));
+                    DEBUG_EMAIL(("%s\n", item->email->htmlbody));
                     break;
                 case 0x1035: // Message ID
                     DEBUG_EMAIL(("Message ID - "));
@@ -3699,6 +3720,7 @@
         if (item->email) {
             SAFE_FREE(item->email->arrival_date);
             SAFE_FREE(item->email->body);
+            SAFE_FREE(item->email->body_charset);
             SAFE_FREE(item->email->cc_address);
             SAFE_FREE(item->email->bcc_address);
             SAFE_FREE(item->email->common_name);
--- a/src/libpst.h	Thu Dec 11 12:06:03 2008 -0800
+++ b/src/libpst.h	Fri Jan 16 15:23:52 2009 -0800
@@ -211,6 +211,7 @@
     FILETIME *arrival_date;
     int       autoforward;            // 1 = true, 0 = not set, -1 = false
     char     *body;
+    char     *body_charset;           // null if not specified
     char     *cc_address;
     char     *bcc_address;
     char     *common_name;
--- a/src/readpst.c	Thu Dec 11 12:06:03 2008 -0800
+++ b/src/readpst.c	Fri Jan 16 15:23:52 2009 -0800
@@ -6,6 +6,7 @@
  */
 #include "define.h"
 #include "libstrfunc.h"
+#include "vbuf.h"
 #include "libpst.h"
 #include "common.h"
 #include "timeconv.h"
@@ -762,11 +763,11 @@
         char *attach_filename;
         fprintf(f_output, "\n--%s\n", boundary);
         if (!current_attach->mimetype) {
-            fprintf(f_output, "Content-type: %s\n", MIME_TYPE_DEFAULT);
+            fprintf(f_output, "Content-Type: %s\n", MIME_TYPE_DEFAULT);
         } else {
-            fprintf(f_output, "Content-type: %s\n", current_attach->mimetype);
+            fprintf(f_output, "Content-Type: %s\n", current_attach->mimetype);
         }
-        fprintf(f_output, "Content-transfer-encoding: base64\n");
+        fprintf(f_output, "Content-Transfer-Encoding: base64\n");
         // If there is a long filename (filename2) use that, otherwise
         // use the 8.3 filename (filename1)
         if (current_attach->filename2) {
@@ -822,7 +823,7 @@
         // see if there is a boundary variable there
         // this search MUST be made case insensitive (DONE).
         // Also, we should check to find out if we are looking
-        // at the boundary associated with content-type, and that
+        // at the boundary associated with Content-Type, and that
         // the content type really is multipart
 
         removeCR(item->email->header);
@@ -1024,12 +1025,10 @@
         // in the headers above.
         if (item->attach) {
             // write the boundary stuff if we have attachments
-            fprintf(f_output, "Content-type: multipart/mixed;\n\tboundary=\"%s\"\n", boundary);
-        } else if (boundary) {
-            // else if we have multipart/alternative then tell it so
-            fprintf(f_output, "Content-type: multipart/alternative;\n\tboundary=\"%s\"\n", boundary);
-        } else if (item->email->htmlbody) {
-            fprintf(f_output, "Content-type: text/html\n");
+            fprintf(f_output, "Content-Type: multipart/mixed;\n\tboundary=\"%s\"\n", boundary);
+        } else {
+            // otherwise we have multipart/alternative, so say so
+            fprintf(f_output, "Content-Type: multipart/alternative;\n\tboundary=\"%s\"\n", boundary);
         }
     }
     fprintf(f_output, "\n");    // start the body
@@ -1037,12 +1036,57 @@
 
     if (item->email->body) {
         if (boundary) {
+            // try to find the charset for this body part
+            const char *def = "utf-8";
+            // it seems that if (item->email->body_charset) is set, then
+            // we actually have utf8 plain body text. If that is not set
+            // we have plain body text in an 8 bit charset specified in
+            // the headers.
+            char *c = my_stristr(item->email->header, "\nContent-Type:");
+            if (c) {
+                c++;
+                char *n = my_stristr(c, "\n");  // termination on the content type
+                if (n) {
+                    char *s = my_stristr(c, "; charset=");
+                    if (s && (s < n)) {
+                        char *e;
+                        s += 10;    // skip over "; charset="
+                        if (*s == '"') {
+                            s++;
+                            e = my_stristr(s, "\"");
+                        }
+                        else {
+                            e = my_stristr(s, ";");
+                        }
+                        if (!e || (e > n)) e = n;   // use the trailing lf as terminator if nothing better
+                        *e = '\0';      // corrupt the header, but we have already printed it
+                        def = s;
+                        DEBUG_EMAIL(("body charset %s from headers\n", def));
+                    }
+                }
+            }
             fprintf(f_output, "\n--%s\n", boundary);
-            fprintf(f_output, "Content-type: text/plain\n");
+            fprintf(f_output, "Content-Type: text/plain; charset=\"%s\"\n", def);
             if (base64_body)
                 fprintf(f_output, "Content-Transfer-Encoding: base64\n");
             fprintf(f_output, "\n");
         }
+        else if (item->email->body_charset && (strcasecmp("utf-8",item->email->body_charset))) {
+            // try to convert to the specified charset since it is not utf-8
+            size_t rc;
+            DEBUG_EMAIL(("Convert plain text utf-8 to %s\n", item->email->body_charset));
+            vbuf *newer = vballoc(2);
+            rc = vb_utf8to8bit(newer, item->email->body, strlen(item->email->body) + 1, item->email->body_charset);
+            if (rc == (size_t)-1) {
+                free(newer->b);
+                DEBUG_EMAIL(("Failed to convert plain text utf-8 to %s\n", item->email->body_charset));
+            }
+            else {
+                free(item->email->body);
+                item->email->body = newer->b;
+            }
+            free(newer);
+        }
         removeCR(item->email->body);
         if (base64_body) {
             char *enc = base64_encode(item->email->body, strlen(item->email->body));
@@ -1058,8 +1102,10 @@
 
     if (item->email->htmlbody) {
         if (boundary) {
+            const char *def = "utf-8";
+            if (item->email->body_charset) def = item->email->body_charset;
             fprintf(f_output, "\n--%s\n", boundary);
-            fprintf(f_output, "Content-type: text/html\n");
+            fprintf(f_output, "Content-Type: text/html; charset=\"%s\"\n", def);
             if (base64_body) fprintf(f_output, "Content-Transfer-Encoding: base64\n");
             fprintf(f_output, "\n");
         }
--- a/src/vbuf.c	Thu Dec 11 12:06:03 2008 -0800
+++ b/src/vbuf.c	Fri Jan 16 15:23:52 2009 -0800
@@ -40,11 +40,11 @@
     nextn = memchr(vs->b, '\n', vs->dlen);
 
     //case 1: UNIX, we find \n first
-    if (nextn && (nextr == NULL || nextr > nextn)) {
+    if (nextn && (!nextr || (nextr > nextn))) {
         return nextn - vs->b;
     }
     //case 2: DOS, we find \r\n
-    if (NULL != nextr && NULL != nextn && 1 == (char *) nextn - (char *) nextr) {
+    if (nextr && nextn && (nextn-nextr == 1)) {
         return nextr - vs->b;
     }
     //case 3: we find nothing
@@ -55,59 +55,37 @@
 
 //  UTF8 <-> UTF16 <-> ISO8859 Character set conversion functions and (ack) their globals
 
-//TODO: the following should not be
-char *wwbuf = NULL;
-size_t nwwbuf = 0;
 static int unicode_up = 0;
-iconv_t i16to8, i8to16, i8859_1to8, i8toi8859_1;
+static iconv_t i16to8;
+static const char *target_charset = NULL;
+static iconv_t    i8totarget;
 
 
 void unicode_init()
 {
-    char *wipe = "";
-    char dump[4];
-
-    if (unicode_up)
-        unicode_close();
-
-    if ((iconv_t) - 1 == (i16to8 = iconv_open("UTF-8", "UTF-16LE"))) {
-        fprintf(stderr, "doexport(): Couldn't open iconv descriptor for UTF-16LE to UTF-8.\n");
+    if (unicode_up) unicode_close();
+    i16to8 = iconv_open("UTF-8", "UTF-16LE");
+    if (i16to8 == (iconv_t)-1) {
+        fprintf(stderr, "Couldn't open iconv descriptor for UTF-16LE to UTF-8.\n");
         exit(1);
     }
-
-    if ((iconv_t) - 1 == (i8to16 = iconv_open("UTF-16LE", "UTF-8"))) {
-        fprintf(stderr, "doexport(): Couldn't open iconv descriptor for UTF-8 to UTF-16LE.\n");
-        exit(2);
-    }
-    //iconv will prefix output with an FF FE (utf-16 start seq), the following dumps that.
-    memset(dump, 'x', 4);
-    ASSERT(0 == utf8to16(wipe, 1, dump, 4), "unicode_init(): attempt to dump FF FE failed.");
-
-    if ((iconv_t) - 1 == (i8859_1to8 = iconv_open("UTF-8", "ISO_8859-1"))) {
-        fprintf(stderr, "doexport(): Couldn't open iconv descriptor for ASCII to UTF-8.\n");
-        exit(1);
-    }
-
-    if ((iconv_t) - 1 == (i8toi8859_1 = iconv_open("ISO_8859-1", "UTF-8"))) {
-        fprintf(stderr, "doexport(): Couldn't open iconv descriptor for UTF-8 to ASCII.\n");
-        exit(1);
-    }
-
     unicode_up = 1;
 }
 
 
 void unicode_close()
 {
+    iconv_close(i16to8);
+    if (target_charset) {
+        iconv_close(i8totarget);
+        free((char *)target_charset);
+        target_charset = NULL;
+    }
     unicode_up = 0;
-    iconv_close(i8to16);
-    iconv_close(i16to8);
-    iconv_close(i8859_1to8);
-    iconv_close(i8toi8859_1);
 }
 
 
-int utf16_is_terminated(char *str, int length)
+int utf16_is_terminated(const char *str, int length)
 {
     VSTR_STATIC(errbuf, 100);
     int len = -1;
@@ -127,147 +105,76 @@
 }
 
 
-int vb_utf16to8(vbuf * dest, char *buf, int len)
+size_t vb_utf16to8(vbuf *dest, const char *inbuf, int iblen)
 {
-    size_t inbytesleft = len;
-    char *inbuf = buf;
-    size_t icresult = (size_t)-1;
-    VBUF_STATIC(dumpster, 100);
-
+    size_t inbytesleft  = iblen;
+    size_t icresult     = (size_t)-1;
     size_t outbytesleft = 0;
-    char *outbuf = NULL;
+    char *outbuf        = NULL;
 
     ASSERT(unicode_up, "vb_utf16to8() called before unicode started.");
 
-    if (2 > dest->blen)
-        vbresize(dest, 2);
+    if (2 > dest->blen) vbresize(dest, 2);
     dest->dlen = 0;
 
     //Bad Things can happen if a non-zero-terminated utf16 string comes through here
-    if (!utf16_is_terminated(buf, len))
-        return -1;
+    if (!utf16_is_terminated(inbuf, iblen))
+        return (size_t)-1;
 
     do {
         outbytesleft = dest->blen - dest->dlen;
         outbuf = dest->b + dest->dlen;
-        icresult = iconv(i16to8, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
+        icresult = iconv(i16to8, (ICONV_CONST char**)&inbuf, &inbytesleft, &outbuf, &outbytesleft);
         dest->dlen = outbuf - dest->b;
         vbgrow(dest, inbytesleft);
     } while ((size_t)-1 == icresult && E2BIG == errno);
 
-    if (0 != vb_utf8to16T(dumpster, dest->b, dest->dlen))
-        DIE(("Reverse conversion failed."));
-
-    if (icresult == (size_t)-1) {
-        //TODO: error
-        //ERR_UNIX( errno, "vb_utf16to8():iconv failure: %s", strerror( errno ) );
-        unicode_init();
-        return -1;
-        /*
-           fprintf(stderr, "  attempted to convert:\n");
-           hexdump( (char*)cin, 0, inlen, 1 );
-           fprintf(stderr, "  result:\n");
-           hexdump( (char*)bout->b, 0, bout->dlen, 1 );
-           fprintf(stderr, "  MyDirtyOut:\n");
-           for( i=0; i<inlen; i++) {
-           if( inbuf[i] != '\0' ) fprintf(stderr, "%c", inbuf[i] );
-           }
-
-           fprintf( stderr, "\n" );
-           raise( SIGSEGV );
-           exit(1);
-         */
-    }
-
-    if (icresult) {
-        //ERR_UNIX( EILSEQ, "Uhhhh...vb_utf16to8() returning icresult == %d", icresult );
-        return -1;
-    }
-    return icresult;
-}
-
-
-int utf8to16(char *inbuf_o, int iblen, char *outbuf_o, int oblen)       // iblen, oblen: bytes including \0
-{
-    //TODO: this is *only* used to dump the utf16 preamble now...
-    //TODO: This (and 8to16) are the most horrible things I have ever seen...
-    size_t inbytesleft = 0;
-    size_t outbytesleft = oblen;
-    char *inbuf = inbuf_o;
-    char *outbuf = outbuf_o;
-    size_t icresult = (size_t)-1;
-    char *stend;
-
-    stend = memchr(inbuf_o, '\0', iblen);
-    ASSERT(NULL != stend, "utf8to16(): in string not zero terminated.");
-    inbytesleft = (stend - inbuf_o + 1 < iblen) ? stend - inbuf_o + 1 : iblen;
-    icresult = iconv(i8to16, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
-
-    if (icresult == (size_t)-1) {
-        DIE(("iconv failure(%d): %s\n", errno, strerror(errno)));
-    }
-    if (icresult > (size_t)INT_MAX) {
-        return (-1);
-    }
-    return (int) icresult;
-}
-
-
-int vb_utf8to16T(vbuf * bout, char *cin, int inlen)
-{
-    //TODO: This (and 8to16) are the most horrible things I have ever seen...
-    size_t inbytesleft = inlen;
-    char *inbuf = cin;
-    //int rlen = -1, tlen;
-    size_t icresult = (size_t)-1;
-    size_t outbytesleft = 0;
-    char *outbuf = NULL;
-
-    if (2 > bout->blen)
-        vbresize(bout, 2);
-    bout->dlen = 0;
-
-    do {
-        outbytesleft = bout->blen - bout->dlen;
-        outbuf = bout->b + bout->dlen;
-        icresult = iconv(i8to16, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
-        bout->dlen = outbuf - bout->b;
-        vbgrow(bout, 20);
-    } while ((size_t)-1 == icresult && E2BIG == errno);
-
     if (icresult == (size_t)-1) {
         WARN(("iconv failure: %s", strerror(errno)));
         unicode_init();
-        return -1;
+        return (size_t)-1;
     }
-    if (icresult > (size_t) INT_MAX) {
-        return (-1);
-    }
-    return icresult;
+    return (icresult) ? (size_t)-1 : 0;
 }
 
 
-/* Quick and dirty UNICODE to std. ascii */
-void cheap_uni2ascii(char *src, char *dest, int l)
+size_t vb_utf8to8bit(vbuf *dest, const char *inbuf, int iblen, const char* charset)
 {
+    size_t inbytesleft  = iblen;
+    size_t icresult     = (size_t)-1;
+    size_t outbytesleft = 0;
+    char *outbuf        = NULL;
 
-    for (; l > 0; l -= 2) {
-        *dest = *src;
-        dest++;
-        src += 2;
+    if (!target_charset || (target_charset && strcasecmp(target_charset, charset))) {
+        if (target_charset) {
+            iconv_close(i8totarget);
+            free((char *)target_charset);
+        }
+        target_charset = strdup(charset);
+        i8totarget = iconv_open(target_charset, "UTF-8");
+        if (i8totarget == (iconv_t)-1) {
+            fprintf(stderr, "Couldn't open iconv descriptor for UTF-8 to %s.\n", target_charset);
+            return (size_t)-1;
+        }
     }
-    *dest = 0;
-}
 
+    if (2 > dest->blen) vbresize(dest, 2);
+    dest->dlen = 0;
 
-/* Quick and dirty ascii to unicode */
-void cheap_ascii2uni(char *src, char *dest, int l)
-{
-    for (; l > 0; l--) {
-        *dest++ = *src++;
-        *dest++ = 0;
+    do {
+        outbytesleft = dest->blen - dest->dlen;
+        outbuf = dest->b + dest->dlen;
+        icresult = iconv(i8totarget, (ICONV_CONST char**)&inbuf, &inbytesleft, &outbuf, &outbytesleft);
+        dest->dlen = outbuf - dest->b;
+        vbgrow(dest, 20);
+    } while ((size_t)-1 == icresult && E2BIG == errno);
 
+    if (icresult == (size_t)-1) {
+        WARN(("iconv failure: %s", strerror(errno)));
+        unicode_init();
+        return (size_t)-1;
     }
+    return (icresult) ? (size_t)-1 : 0;
 }
 
 
@@ -609,7 +516,7 @@
 }
 
 
-void vshexdump(vstr * vs, char *b, size_t start, size_t stop, int ascii)
+void vshexdump(vstr * vs, const char *b, size_t start, size_t stop, int ascii)
 {
     char c;
     int diff, i;
--- a/src/vbuf.h	Thu Dec 11 12:06:03 2008 -0800
+++ b/src/vbuf.h	Fri Jan 16 15:23:52 2009 -0800
@@ -17,19 +17,6 @@
 #include <stdarg.h>
 /***************************************************/
 
-// Tokenizer const TOK_EMPTY, TOK_ELEMENT, DELIM
-#define DELIM '\\'
-
-#define TOK_EMPTY	0
-#define TOK_DELIM	1
-#define TOK_PARENT	2
-#define TOK_CURRENT	3
-#define TOK_ELEMENT	4
-
-#define TOK_ERROR	10
-#define TOK_BUF_SMALL	11
-
-
 
 // Variable-length buffers
 struct varbuf {
@@ -55,6 +42,9 @@
 #define VBUF_STATIC(x,y) static vbuf *x = NULL; if(!x) x = vballoc(y);
 #define VSTR_STATIC(x,y) static vstr *x = NULL; if(!x) x = vsalloc(y);
 
+int skip_nl( char *s );  // returns the width of the newline at s[0]
+int find_nl( vstr *vs ); // find a newline (unix \n or dos \r\n) in vs->b
+
 // vbuf functions
 struct varbuf *vballoc( size_t len );
 void vbfree(      vbuf *vb );
@@ -86,57 +76,18 @@
 void vsskipws(    vstr *vs );
 void vs_printf(   vstr *vs, char *fmt, ... );
 void vs_printfa(  vstr *vs, char *fmt, ... );
-void vshexdump(   vstr *vs, char *b, size_t start, size_t stop, int ascii );
+void vshexdump(   vstr *vs, const char *b, size_t start, size_t stop, int ascii );
 int  vscatprintf( vstr *vs, char *fmt, ... );
 void vsvprintf(   vstr *vs, char *fmt, va_list ap );
 void vstrunc(     vstr *vs, size_t off ); // Drop chars [off..dlen]
 int  vslast(      vstr *vs ); // returns the last character stored in a vstr string
 void vscharcat(   vstr *vs, int ch );
-int  vsutf16(     vstr *vs, vbuf *in ); //in: in=zero-terminated utf16; out: vs=utf8; returns: 0 on success, else on fail
 
-int vs_parse_escaped_string( vstr *vs, char *str, size_t len );
-
-
-/*
- * Windows unicode output trash - this stuff sucks
- * TODO: most of this should not be here
- */
 
 void unicode_init();
 void unicode_close();
-int utf16_write( FILE* stream, const void *buf, size_t count );
-int utf16_fprintf( FILE* stream, const char *fmt, ... );
-int utf16to8( char *inbuf_o, char *outbuf_o, int length );
-int utf8to16( char *inbuf_o, int iblen, char *outbuf_o, int oblen);
-int vb_utf8to16T( vbuf *bout, char *cin, int inlen );
-int vb_utf16to8( vbuf *dest, char *buf, int len );
-int iso8859_1to8( char *inbuf_o, char *outbuf_o, int length );
-int utf8toascii( const char *inbuf_o, char *outbuf_o, int length );
-
-/* dump ascii hex in windoze format */
-void winhex(FILE* stream, unsigned char *hbuf, int start, int stop, int loff);
-void winhex8(FILE *stream, unsigned char *hbuf, int start, int stop, int loff );
-
-void vbwinhex8(vbuf *vb, unsigned char *hbuf, int start, int stop, int loff );
-
-/* general search routine, find something in something else */
-int find_in_buf(char *buf, char *what, int sz, int len, int start);
+size_t vb_utf16to8(vbuf *dest, const char *inbuf, int iblen);
+size_t vb_utf8to8bit(vbuf *dest, const char *inbuf, int iblen, const char* charset);
 
-/* Get INTEGER from memory. This is probably low-endian specific? */
-int get_int( char *array );
-
-int find_nl( vstr *vs ); // find newline of type type in b
-int skip_nl( char *s ); // returns the width of the newline at s[0]
-//int vb_readline( struct varbuf *vb, int *ctype, FILE *in ); // read *AT LEAST* one full line of data from in
 int vb_skipline( struct varbuf *vb ); // in: vb->b == "stuff\nmore_stuff"; out: vb->b == "more_stuff"
-/* Get a string of HEX bytes (space separated),
- * or if first char is ' get an ASCII string instead.  */
-int gethexorstr(char **c, char *wb);
-char *esc_index( char *s, int c ); // just like index(3), but works on strings with escape sequences
-char *esc_rindex( char *s, int c ); // just like rindex(3), but works on strings with escape sequences
-
-char *tok_esc_char( char *s, int *is_esc, int *c );
-int vb_path_token( vbuf *tok, char **path ); // returns things like TOK_EMPTY, TOK_ERROR, complete list at top
-
-int gettoken( char *tok, int len, char **path, char delim ); // Path tokenizer: increments path, dumps token in tok
 #endif