view src/vbuf.c @ 355:d1f930be4711

From Jeffrey Morlan: pst_build_id_ptr and pst_build_desc_ptr require that the first child of a BTree page have the same starting ID as the page itself. This is not required by the spec, and is not true in many real-world PSTs (presumably, the original first child of the page got deleted). Because of this, many emails are not being extracted from these PSTs. It also triggers an infinite loop in lspst (a separate bug, also fixed).
author Carl Byington <carl@five-ten-sg.com>
date Wed, 06 Jul 2016 10:12:22 -0700
parents 4fd5197aacc2
children
line wrap: on
line source


#include "define.h"

// Set to 1 by pst_unicode_init() and back to 0 by pst_unicode_close().
static int unicode_up = 0;
// Descriptor opened by pst_unicode_init() with iconv_open("utf-8", "utf-16le").
static iconv_t i16to8;
// strdup()ed name of the charset the two target descriptors below were last
// opened for; NULL while no charset has been selected.
static const char *target_charset = NULL;
// Non-zero while i8totarget (utf-8 -> target charset) is open.
static int         target_open_from = 0;
// Non-zero while target2i8 (target charset -> utf-8) is open.
static int         target_open_to   = 0;
// Descriptor converting utf-8 to target_charset; (iconv_t)-1 when not open.
static iconv_t     i8totarget = (iconv_t)-1;
// Descriptor converting target_charset to utf-8; (iconv_t)-1 when not open.
static iconv_t     target2i8  = (iconv_t)-1;


/* Invoke DIE() with the remaining arguments when condition x is false.
 * The body is wrapped in do { } while (0) so that "ASSERT(...);" expands to
 * exactly one statement and stays correct inside an unbraced if/else.
 */
#define ASSERT(x,...) do { if( !(x) ) DIE(( __VA_ARGS__)); } while (0)


/** DESTRUCTIVELY grow or shrink buffer
 *
 *  Any stored data is discarded (dlen becomes 0) and vb->b is rewound to the
 *  start of the allocation. The allocation itself is only ever enlarged: when
 *  it already holds at least len bytes it is reused unchanged.
 *
 *  @param vb   buffer to reset
 *  @param len  capacity in bytes required after the call
 */
static void   pst_vbresize(pst_vbuf *vb, size_t len);
static void pst_vbresize(pst_vbuf *vb, size_t len)
{
    void *grown;

    vb->dlen = 0;

    if (vb->blen >= len) {
        vb->b = vb->buf;
        return;
    }

    // Go through a temporary so that a failed realloc() is detected instead of
    // leaving vb->buf NULL while vb->blen claims len usable bytes.
    grown = realloc(vb->buf, len);
    if (!grown) DIE(("realloc() failure"));
    vb->buf  = grown;
    vb->b    = vb->buf;
    vb->blen = len;
}


/** Number of unused bytes behind the stored data: the allocation size minus
 *  the bytes in use minus the offset of vb->b from the allocation start.
 */
static size_t pst_vbavail(pst_vbuf * vb);
static size_t pst_vbavail(pst_vbuf * vb)
{
    const size_t lead = (size_t)(vb->b - vb->buf);

    return vb->blen - vb->dlen - lead;
}


/** Make the cached target descriptors match charset.
 *
 *  Nothing happens when charset equals (case-insensitively) the charset the
 *  descriptors were last opened for. Otherwise the old descriptors are closed
 *  and both directions (utf-8 -> charset and charset -> utf-8) are reopened.
 *  target_open_from / target_open_to tell the caller which direction is
 *  usable afterwards; both stay 0 when the charset name cannot be copied.
 *
 *  @param charset  name of the 8 bit charset; must not be NULL
 */
static void open_targets(const char* charset);
static void open_targets(const char* charset)
{
    if (target_charset && !strcasecmp(target_charset, charset)) return;

    if (target_open_from) iconv_close(i8totarget);
    if (target_open_to)   iconv_close(target2i8);
    free((char *)target_charset);

    // Start from a known "nothing open" state so every early exit below
    // leaves the flags and descriptors consistent.
    i8totarget       = (iconv_t)-1;
    target2i8        = (iconv_t)-1;
    target_open_from = 0;
    target_open_to   = 0;

    target_charset = strdup(charset);
    if (!target_charset) {
        // Without a copy of the name there is nothing valid to hand to
        // iconv_open(); leave both directions closed so callers fail cleanly.
        DEBUG_WARN(("Couldn't allocate memory for charset name %s.\n", charset));
        return;
    }

    i8totarget = iconv_open(target_charset, "utf-8");
    if (i8totarget == (iconv_t)-1) {
        DEBUG_WARN(("Couldn't open iconv descriptor for utf-8 to %s.\n", target_charset));
    } else {
        target_open_from = 1;
    }

    target2i8 = iconv_open("utf-8", target_charset);
    if (target2i8 == (iconv_t)-1) {
        DEBUG_WARN(("Couldn't open iconv descriptor for %s to utf-8.\n", target_charset));
    } else {
        target_open_to = 1;
    }
}


/** Run iblen bytes of inbuf through the given iconv descriptor into dest.
 *
 *  dest is reset to a capacity of 2*iblen and enlarged by twice the number
 *  of unconverted input bytes each time iconv() reports E2BIG.
 *
 *  @return 0 on success; (size_t)-1 when iconv() fails for any other reason,
 *          in which case pst_unicode_init() is called before returning.
 */
static size_t sbcs_conversion(pst_vbuf *dest, const char *inbuf, int iblen, iconv_t conversion);
static size_t sbcs_conversion(pst_vbuf *dest, const char *inbuf, int iblen, iconv_t conversion)
{
    size_t remaining = iblen;
    size_t rc;
    int    saved_errno;

    DEBUG_ENT("sbcs_conversion");
    pst_vbresize(dest, 2*iblen);

    for (;;) {
        size_t room = dest->blen - dest->dlen;
        char  *out  = dest->b + dest->dlen;

        rc          = iconv(conversion, (ICONV_CONST char**)&inbuf, &remaining, &out, &room);
        saved_errno = errno;    // capture before anything else can overwrite it
        dest->dlen  = out - dest->b;
        if (remaining) pst_vbgrow(dest, 2*remaining);

        // Only an output-buffer-full condition is worth another pass.
        if (rc != (size_t)-1 || saved_errno != E2BIG) break;
    }

    if (rc != (size_t)-1) {
        DEBUG_RET();
        return 0;
    }

    DEBUG_WARN(("iconv failure: %s\n", strerror(saved_errno)));
    pst_unicode_init();
    DEBUG_RET();
    return (size_t)-1;
}


/** Close every iconv descriptor held by this file and forget the cached
 *  target charset, returning all file-level state to "nothing open".
 */
static void pst_unicode_close();
static void pst_unicode_close()
{
    // pst_unicode_init() can leave i16to8 at (iconv_t)-1 when iconv_open()
    // fails; that value must never be handed to iconv_close().
    if (i16to8 != (iconv_t)-1) iconv_close(i16to8);
    i16to8 = (iconv_t)-1;

    if (target_open_from) iconv_close(i8totarget);
    if (target_open_to)   iconv_close(target2i8);
    free((char *)target_charset);

    // Do not keep closed descriptor values around.
    i8totarget       = (iconv_t)-1;
    target2i8        = (iconv_t)-1;
    target_charset   = NULL;
    target_open_from = 0;
    target_open_to   = 0;
    unicode_up = 0;
}


/** Check whether a utf-16 byte string contains a terminator.
 *
 *  Looks at the two-byte units starting at even offsets and reports whether
 *  any of them is 0x0000. A warning is logged when none is found.
 *
 *  @param str     bytes to inspect
 *  @param length  number of readable bytes in str
 *  @return 1 when a zero unit is present, 0 otherwise
 */
static int utf16_is_terminated(const char *str, int length);
static int utf16_is_terminated(const char *str, int length)
{
    int i;

    // Require a complete two-byte unit: with an odd length the last lone byte
    // is skipped instead of reading str[length], which is out of bounds.
    for (i = 0; i + 1 < length; i += 2) {
        if (str[i] == 0 && str[i + 1] == 0) {
            return 1;
        }
    }

    DEBUG_WARN(("utf16 string is not zero terminated\n"));
    return 0;
}


/** Allocate a new, empty vbuf with room for len bytes.
 *  An allocation failure of the vbuf itself is reported through DIE().
 */
pst_vbuf *pst_vballoc(size_t len)
{
    pst_vbuf *vb = pst_malloc(sizeof(pst_vbuf));

    if (!vb) {
        DIE(("malloc() failure"));
        return vb;
    }

    // Start from an empty, unallocated state and let pst_vbresize() obtain
    // the storage.
    vb->buf  = NULL;
    vb->blen = 0;
    vb->dlen = 0;
    pst_vbresize(vb, len);
    return vb;
}


/** out: vbavail(vb) >= len, data are preserved
 *
 *  A zero len is a no-op, and a buffer without any allocation is simply
 *  sized to len. When dlen + len does not fit into the current allocation,
 *  a new one of blen + len bytes is obtained (len is first raised to
 *  1.5 * blen if dlen + len is below that), the stored bytes are copied over
 *  and the old allocation is freed. Otherwise the stored bytes are moved to
 *  the start of the existing allocation. In every case vb->b ends up equal
 *  to vb->buf.
 *
 *  @param vb   buffer to enlarge
 *  @param len  number of free bytes required behind the stored data
 */
void pst_vbgrow(pst_vbuf *vb, size_t len)
{
    if (0 == len)
        return;

    if (0 == vb->blen) {
        pst_vbresize(vb, len);
        return;
    }

    if (vb->dlen + len > vb->blen) {
        if (vb->dlen + len < vb->blen * 1.5)
            len = vb->blen * 1.5;
        char *nb = pst_malloc(vb->blen + len);
        if (!nb) DIE(("malloc() failure"));
        vb->blen = vb->blen + len;
        // Distinct allocations, so a plain memcpy() is fine here.
        memcpy(nb, vb->b, vb->dlen);

        free(vb->buf);
        vb->buf = nb;
    } else if (vb->b != vb->buf) {
        // Source and destination live in the same allocation and overlap
        // whenever dlen exceeds the offset of b, so memmove() is required.
        memmove(vb->buf, vb->b, vb->dlen);
    }

    vb->b = vb->buf;

    ASSERT(pst_vbavail(vb) >= len, "vbgrow(): I have failed in my mission.");
}


/** Replace the contents of vb with the len bytes at b, resizing if necessary.
 *  Previously stored data is discarded.
 *
 *  @param vb   destination buffer
 *  @param b    source bytes
 *  @param len  number of bytes to copy from b
 */
void pst_vbset(pst_vbuf * vb, void *b, size_t len)
{
    pst_vbresize(vb, len);
    // With len == 0 the buffer may never have been allocated (vb->b NULL),
    // and passing a null pointer to memcpy() is undefined even for a
    // zero-length copy, so skip the call entirely.
    if (len) memcpy(vb->b, b, len);
    vb->dlen = len;
}


/** Append len bytes of b to the data already held in vb, enlarging the
 *  buffer if necessary. An empty vb is filled through pst_vbset() instead.
 */
void pst_vbappend(pst_vbuf *vb, void *b, size_t len)
{
    if (vb->dlen != 0) {
        pst_vbgrow(vb, len);
        memcpy(vb->b + vb->dlen, b, len);
        vb->dlen += len;
    } else {
        pst_vbset(vb, b, len);
    }
}


/** (Re)open the utf-16le to utf-8 descriptor.
 *
 *  Any state left from a previous successful initialisation is closed first.
 *  unicode_up is set only when iconv_open() succeeds, so pst_vb_utf16to8()
 *  can rely on it to refuse conversions through an invalid descriptor.
 */
void pst_unicode_init()
{
    if (unicode_up) pst_unicode_close();
    i16to8 = iconv_open("utf-8", "utf-16le");
    if (i16to8 == (iconv_t)-1) {
        DEBUG_WARN(("Couldn't open iconv descriptor for utf-16le to utf-8.\n"));
        unicode_up = 0;
        return;
    }
    unicode_up = 1;
}


/** Convert iblen bytes of utf-16le text at inbuf into utf-8 stored in dest.
 *
 *  dest is reset to a capacity of iblen and enlarged by the number of
 *  unconverted input bytes each time iconv() reports E2BIG.
 *
 *  @return 0 on success; (size_t)-1 when unicode support is not up, when the
 *          input holds no zero terminator, when iconv() fails (the descriptor
 *          is then reopened via pst_unicode_init()), or when iconv() returns
 *          a non-zero count.
 */
size_t pst_vb_utf16to8(pst_vbuf *dest, const char *inbuf, int iblen)
{
    size_t remaining = iblen;
    size_t rc;
    int    saved_errno;

    if (!unicode_up) return (size_t)-1;   // failure to open iconv
    pst_vbresize(dest, iblen);

    //Bad Things can happen if a non-zero-terminated utf16 string comes through here
    if (!utf16_is_terminated(inbuf, iblen))
        return (size_t)-1;

    for (;;) {
        size_t room = dest->blen - dest->dlen;
        char  *out  = dest->b + dest->dlen;

        rc          = iconv(i16to8, (ICONV_CONST char**)&inbuf, &remaining, &out, &room);
        saved_errno = errno;    // capture before anything else can overwrite it
        dest->dlen  = out - dest->b;
        if (remaining) pst_vbgrow(dest, remaining);

        // Only an output-buffer-full condition is worth another pass.
        if (rc != (size_t)-1 || saved_errno != E2BIG) break;
    }

    if (rc == (size_t)-1) {
        DEBUG_WARN(("iconv failure: %s\n", strerror(saved_errno)));
        pst_unicode_init();
        return (size_t)-1;
    }
    return (rc == 0) ? 0 : (size_t)-1;
}


/** Convert iblen bytes of utf-8 text at inbuf into the given 8 bit charset,
 *  storing the result in dest.
 *
 *  @return the result of sbcs_conversion(), or (size_t)-1 when the
 *          utf-8 -> charset descriptor could not be opened.
 */
size_t pst_vb_utf8to8bit(pst_vbuf *dest, const char *inbuf, int iblen, const char* charset)
{
    size_t rc = (size_t)-1;     // failure to open the target

    open_targets(charset);
    if (target_open_from) {
        rc = sbcs_conversion(dest, inbuf, iblen, i8totarget);
    }
    return rc;
}


/** Convert iblen bytes of text in the given 8 bit charset at inbuf into
 *  utf-8, storing the result in dest.
 *
 *  @return the result of sbcs_conversion(), or (size_t)-1 when the
 *          charset -> utf-8 descriptor could not be opened.
 */
size_t pst_vb_8bit2utf8(pst_vbuf *dest, const char *inbuf, int iblen, const char* charset)
{
    size_t rc = (size_t)-1;     // failure to open the target

    open_targets(charset);
    if (target_open_to) {
        rc = sbcs_conversion(dest, inbuf, iblen, target2i8);
    }
    return rc;
}