view src/pst2dii.cpp.in @ 359:a3e674fade6c

From Jeffrey Morlan: pst_parse_block misreads Table Contexts (aka "type 2") with a multi-block Row Matrix ("ind2"). Rows are never split between blocks - every block except the last has padding at the end which should be ignored. I've only seen this affect the recipients table, but presumably it could affect attachments too. This was causing out-of-bounds memory ranges to be returned from pst_getBlockOffset and later access; patch fixes both the table reading issue and adds a missing bounds check to pst_getBlockOffset (so as not to risk a segfault if the PST is corrupted).
author Carl Byington <carl@five-ten-sg.com>
date Wed, 06 Jul 2016 10:20:12 -0700
parents cb67b335afcc
children
line wrap: on
line source

/*

Copyright (c) 2008 Carl Byington - 510 Software Group, released under
the GPL version 2 or any later version at your choice available at
http://www.fsf.org/licenses/gpl.txt

Based on readpst.c by David Smith

*/
#include <iostream>
#include <string>
#include <vector>

using namespace std;

extern "C" {
    #include "define.h"
    #include "lzfu.h"
}

// Per-folder bookkeeping used while walking the pst descriptor tree.
struct file_ll {
    string  name;           // folder path, parent names joined with '/'
    int32_t stored_count;   // item count reported by the folder record
    int32_t email_count;    // emails written out from this folder
    int32_t skip_count;     // items that were skipped or could not be parsed
    int32_t type;           // item type of the folder record

    // start with an empty name and all counters cleared
    file_ll()
        : stored_count(0),
          email_count(0),
          skip_count(0),
          type(0)
    {
    }
};

// global settings (mostly filled in from the command line in main)
const char*    convert = "@CONVERT@";     // fully qualified path of the ImageMagick convert program (substituted at configure time)
const char*    prog_name = NULL;          // our arg0 name
const char*    bates_prefix = "";         // string to prefix bates numbers (-B)
int            bates_index = 0;           // current bates sequence (-b sets the start), bumped once per page
const char*    output_directory = ".";    // directory that receives pdf and attachment files (-o)
const char*    output_file = "load.dii";  // name of the dii load file (-O)
char*    font_file = NULL;          // ttf font used for rendering (-f, mandatory)
int      bates_color = 0xff0000;    // color of bates header stamp, 0xRRGGBB (-c)
int      email_sequence = 0;        // current pdf sequence number
char*    pdf_name = NULL;           // current pdf file name, allocated in open_pdf and freed in close_pdf
FILE*    dii_file = NULL;           // the output dii load file
pst_file pstfile;                   // the input pst file

// pdf writer globals
bool     pdf_open = false;          // is pdf writer started (not referenced by the code visible in this file)
char*    pst_folder;                // current folder line, stamped at the top of every page
int      page_sequence;             // current page number within the current pdf
string   conversion;                // conversion command, built up page by page and run in close_pdf
vector<string>  png_names;          // page image files of the current pdf, removed after conversion

// png writer globals
bool     png_open = false;          // is current page open
int      line_height;               // in pixels
int      char_width;                // in pixels
int      col_number, col_max;       // in characters
int      line_number, line_max;     // lines per page
int      x_position, y_position;    // in pixels
int      black, red;                // text colors (gd color indexes; "red" is really the bates color)
gdImagePtr  image;                  // current gd image

// page geometry; DPI is handed to gd as both the horizontal and vertical resolution
const int DPI         = 300;
const double sz       = 10.0;       // font size argument passed to gdImageStringFTEx
const int margin      = DPI/2;      // page margin in pixels
const int LINE_SIZE   = 2000;       // size of the scratch buffers used to format single lines
const int PAGE_WIDTH  = DPI*17/2;   // page width in pixels
const int PAGE_HEIGHT = DPI*11;     // page height in pixels

// size of the c_time buffers that receive strftime formatted email dates
#define C_TIME_SIZE 500

// open_png and close_png are used by print_pdf_short before they are defined
static void open_png();
static void close_png();


// Print the program version and the byte order this build was compiled for.
static void version();
static void version()
{
    // pick the banner at compile time; unsupported byte orders fail the build
    const char *endian_banner =
#if BYTE_ORDER == BIG_ENDIAN
        "Big Endian implementation being used.\n";
#elif BYTE_ORDER == LITTLE_ENDIAN
        "Little Endian implementation being used.\n";
#else
#  error "Byte order not supported by this library"
#endif
    printf("pst2dii v%s\n", VERSION);
    printf("%s", endian_banner);
}


// Print the version banner followed by the command line help text.
static void usage();
static void usage()
{
    // help lines that follow the "Usage:" line, printed verbatim in order
    static const char *const help_lines[] = {
        "\t-f ttf-font-file  \t- Set the font file\n",
        "OPTIONS:\n",
        "\t-B bates-prefix   \t- Set the bates prefix string\n",
        "\t-O dii-output-file\t- Set the dii load file output filename\n",
        "\t-V                \t- Version. Display program version\n",
        "\t-b bates-number   \t- Set the starting bates sequence number\n",
        "\t-c bates-color    \t- Specify the color of the bates stamps as 6 digit hex\n",
        "\t-d filename       \t- Debug to file.\n",
        "\t-h                \t- Help. This screen\n",
        "\t-o dirname        \t- Output directory to write files to.\n",
    };
    const size_t help_count = sizeof(help_lines) / sizeof(help_lines[0]);

    version();
    printf("Usage: %s -f ttf-font-file [OPTIONS] {PST FILENAME}\n", prog_name);
    for (size_t i = 0; i < help_count; ++i) {
        printf("%s", help_lines[i]);
    }
}


// Strip every '\r' from the string in place (so "\r\n" becomes "\n").
// Returns the same pointer that was passed in.
static char *removeCR (char *c);
static char *removeCR (char *c) {
    DEBUG_ENT("removeCR");
    char *out = c;      // next position to write a kept character
    for (const char *in = c; *in != '\0'; ++in) {
        if (*in != '\r') {
            *out++ = *in;
        }
    }
    *out = '\0';
    DEBUG_RET();
    return c;
}


// The sole purpose of this function is to bypass the pseudo-header prologue
// that Microsoft Outlook inserts at the beginning of the internet email
// headers for emails stored in their "Personal Folders" files.
//
// Returns a pointer into headers: the character after the first newline when
// the text starts with the prologue, otherwise headers itself. If the
// prologue is present but is not followed by any newline, the result points
// at the terminating NUL (there are no real headers). A NULL argument is
// returned unchanged.
static char *skip_header_prologue(char *headers);
static char *skip_header_prologue(char *headers) {
    const char *bad = "Microsoft Mail Internet Headers";
    if (!headers) {
        return headers;
    }
    if (strncmp(headers, bad, strlen(bad)) == 0) {
        // Found the offensive header prologue
        char *pc = strchr(headers, '\n');
        if (!pc) {
            // prologue line is all there is; never step past the terminator
            return headers + strlen(headers);
        }
        return pc + 1;
    }
    return headers;
}


// Make fname usable as a single file name component: every blank, '/', '\\'
// and ':' is replaced by an underscore. fname is only reassigned when at
// least one character had to be replaced, and is left untouched if the
// working copy cannot be allocated.
static void check_filename(string &fname);
static void check_filename(string &fname) {
    char *copy = strdup(fname.c_str());
    DEBUG_ENT("check_filename");
    if (copy) {
        bool changed = false;
        for (char *hit = strpbrk(copy, " /\\:"); hit; hit = strpbrk(hit, " /\\:")) {
            *hit = '_';     // overwrite the unwanted character
            changed = true;
        }
        if (changed) fname = string(copy);
        free(copy);
    }
    DEBUG_RET();
}


static string write_separate_attachment(string fname, pst_item_attach* current_attach, int attach_num, pst_file* pst);
static string write_separate_attachment(string fname, pst_item_attach* current_attach, int attach_num, pst_file* pst)
{
    FILE *fp = NULL;
    int x = 0;
    char *temp = NULL;

    // If there is a long filename (filename2) use that, otherwise
    // use the 8.3 filename (filename1)
    char *attach_filename = (current_attach->filename2.str) ? current_attach->filename2.str
                                                            : current_attach->filename1.str;
    DEBUG_ENT("write_separate_attachment");
    check_filename(fname);
    const char* f_name = fname.c_str();
    DEBUG_INFO(("dirname=%s, pathname=%s, filename=%s\n", output_directory, f_name, attach_filename));
    int len = strlen(output_directory) + 1 + strlen(f_name) + 15;
    if (!attach_filename) {
        // generate our own (dummy) filename for the attachment
        temp = (char*)pst_malloc(len);
        sprintf(temp, "%s/%s_attach%i", output_directory, f_name, attach_num);
    } else {
        // have an attachment name, make sure it's unique
        temp = (char*)pst_malloc(len+strlen(attach_filename));
        do {
            if (fp) fclose(fp);
            if (x == 0)
                sprintf(temp, "%s/%s_%s", output_directory, f_name, attach_filename);
            else
                sprintf(temp, "%s/%s_%s-%i", output_directory, f_name, attach_filename, x);
        } while ((fp = fopen(temp, "r")) && ++x < 99999999);
        if (x > 99999999) {
            DIE(("error finding attachment name. exhausted possibilities to %s\n", temp));
        }
    }
    DEBUG_INFO(("Saving attachment to %s\n", temp));
    if (!(fp = fopen(temp, "wb"))) {
        DEBUG_WARN(("write_separate_attachment: Cannot open attachment save file \"%s\"\n", temp));
    } else {
        (void)pst_attach_to_file(pst, current_attach, fp);
        fclose(fp);
    }
    string rc(temp);
    if (temp) free(temp);
    DEBUG_RET();
    return rc;
}


static void print_pdf_short(const char *line, int len, int color);
static void print_pdf_short(const char *line, int len, int color)
{
    if (line_number >= line_max) {
        close_png();
        open_png();
    }
    int brect[8];
    gdFTStringExtra strex;
    strex.flags       = gdFTEX_RESOLUTION;
    strex.linespacing = 1.20;
    strex.charmap     = 0;
    strex.hdpi        = DPI;
    strex.vdpi        = DPI;
    char xline[len+1];
    memcpy(xline, line, len);
    xline[len] = '\0';
    char *p;
    char *l = xline;
    while ((p = strchr(l, '&'))) {
        *p = '\0';
        char *err = gdImageStringFTEx(image, &brect[0], color, font_file, sz, 0.0, x_position, y_position, l, &strex);
        if (err) printf("%s", err);
        x_position += (brect[2]-brect[6]);
        l = p+1;
        err = gdImageStringFTEx(image, &brect[0], color, font_file, sz, 0.0, x_position, y_position, (char*)"&amp;", &strex);
        if (err) printf("%s", err);
        x_position += (brect[2]-brect[6]);
    }
    char *err = gdImageStringFTEx(image, &brect[0], color, font_file, sz, 0.0, x_position, y_position, l, &strex);
    if (err) printf("%s", err);
    x_position += (brect[2]-brect[6]);
    col_number += len;
}


// Move the output position to the start of the next line on the page.
static void new_line();
static void new_line()
{
    // back to the left margin, column zero
    col_number = 0;
    x_position = margin;
    // one line further down
    ++line_number;
    y_position += line_height;
}


// Print text that contains no newline characters, wrapping onto following
// lines whenever the current line is full. Each leading tab is printed as
// four blanks.
static void print_pdf_single(const char *line, int color);
static void print_pdf_single(const char *line, int color)
{
    static const char tab_blanks[] = "    ";

    // expand the leading tabs
    for (; *line == '\t'; ++line) {
        print_pdf_short(tab_blanks, 4, color);
        if (col_number >= col_max) new_line();
    }

    // emit the rest in chunks that fit into what is left of each line
    for (int remaining = strlen(line); remaining; ) {
        int room  = col_max - col_number;   // number of chars that will fit on this line
        int chunk = (remaining > room) ? room : remaining;
        print_pdf_short(line, chunk, color);
        line      += chunk;
        remaining -= chunk;
        if (remaining) new_line();
    }
}


// Print possibly multi-line text to the page image only (nothing is written
// to the dii file). The text is split at each '\n'; the buffer is modified
// temporarily while a line is printed and restored afterwards.
static void print_pdf_only(char *line, int color);
static void print_pdf_only(char *line, int color)
{
    for (char *eol = strchr(line, '\n'); eol; eol = strchr(line, '\n')) {
        *eol = '\0';                    // isolate this line ...
        print_pdf_single(line, color);
        *eol = '\n';                    // ... and put the newline back
        new_line();
        line = eol+1;
    }
    // trailing text after the last newline (may be empty)
    print_pdf_single(line, color);
}


// Write the text to the dii load file and also print it, in black, on the
// page image.
static void print_pdf(char *line);
static void print_pdf(char *line)
{
    const size_t length = strlen(line);
    pst_fwrite(line, 1, length, dii_file);
    print_pdf_only(line, black);
}


static void open_png()
{
    if (!png_open) {
        png_open  = true;
        int brect[8];
        image = gdImageCreate(PAGE_WIDTH, PAGE_HEIGHT);
                gdImageColorAllocate(image, 255, 255, 255); // background color first one allocated
        black = gdImageColorAllocate(image, 0, 0, 0);
        int r = (bates_color & 0xff0000) >> 16;
        int g = (bates_color & 0x00ff00) >> 8;
        int b = (bates_color & 0x0000ff);
        red   = gdImageColorAllocate(image, r, g, b);

        gdFTStringExtra strex;
        strex.flags       = gdFTEX_RESOLUTION;
        strex.linespacing = 1.20;
        strex.charmap     = 0;
        strex.hdpi        = DPI;
        strex.vdpi        = DPI;

        char line[LINE_SIZE];
        char *err = gdImageStringFTEx(NULL, &brect[0], black, font_file, sz, 0.0, margin, margin, (char*)"LMgqQ", &strex);
        if (err) printf("%s", err);
        line_height = (brect[3]-brect[7]) * 12/10;
        char_width  = (brect[2]-brect[6]) / 5;
        col_number  = 0;
        col_max     = (PAGE_WIDTH - margin*2) / char_width;
        line_number = 0;
        line_max    = (PAGE_HEIGHT - margin*2) / line_height;
        x_position  = margin;
        y_position  = margin + line_height;
        snprintf(line, sizeof(line), "%s%06d\n", bates_prefix, bates_index++);
        print_pdf_only(line, red);
        print_pdf_only(pst_folder, red);
    }
}


static void close_png()
{
    if (png_open) {
        png_open = false;
        int len = 4 + 11 + 4 +1;
        char *fn = (char*)pst_malloc(len);
        snprintf(fn, len, "page%d.png", ++page_sequence);
        FILE *pngout = fopen(fn, "wb");
        if (pngout) {
            gdImagePng(image, pngout);
            fclose(pngout);
        }
        gdImageDestroy(image); // free memory
        png_names.push_back(fn);
        conversion += string(" ") + fn;
        free(fn);
    }
}


// Begin a new pdf document for one email: remember the folder line that is
// stamped on every page, reset the page counter and the conversion command,
// open the first page, write the "@T" record with the document id to the dii
// file and leave the full pdf path in pdf_name (released by close_pdf).
//
//   line  folder description; the pointer is kept in pst_folder, so the
//         buffer must stay valid until the document is closed
static void open_pdf(char *line);
static void open_pdf(char *line)
{
    pst_folder    = line;
    page_sequence = 0;
    conversion    = string(convert);
    png_names.clear();
    open_png();
    // Room for "<dir>/dii<number>.pdf". The number is printed with at least
    // six digits but may need up to 11 characters for an int, so size the
    // buffer for the worst case instead of truncating the name.
    int len = strlen(output_directory) + 1 + 3 + 11 + 4 + 1;
    pdf_name = (char*)pst_malloc(len);
    snprintf(pdf_name, len, "dii%06d", ++email_sequence);
    fprintf(dii_file, "\n@T %s\n", pdf_name);
    snprintf(pdf_name, len, "%s/dii%06d.pdf", output_directory, email_sequence);
}


// Finish the current pdf document: close the last page, run the conversion
// command (convert program, page images, pdf name) through the shell, delete
// the page images, write the "@D" record and release pdf_name.
// The exit status of the conversion command is ignored.
static void close_pdf();
static void close_pdf()
{
    close_png();

    // the pdf file is the final argument of the conversion command
    conversion += string(" ") + pdf_name;
    (void)system(conversion.c_str());

    // the page images are only intermediate files
    for (size_t i = 0; i < png_names.size(); ++i) {
        remove(png_names[i].c_str());
    }

    fprintf(dii_file, "@D %s\n", pdf_name);
    free(pdf_name);
}


// Write "@<tag> <value>" to the dii file; nothing is written for a NULL value.
static void write_simple(const char *tag, const char *value);
static void write_simple(const char *tag, const char *value)
{
    if (!value) return;
    fprintf(dii_file, "@%s %s\n", tag, value);
}


// Write "@<tag> <value>" to the dii file (std::string flavour, always written).
static void write_simple(const char *tag, string value);
static void write_simple(const char *tag, string value)
{
    const char *text = value.c_str();
    fprintf(dii_file, "@%s %s\n", tag, text);
}


// Write "@<tag> "<value>" <<value2>>" to the dii file, or just the quoted
// value when value2 is NULL; nothing is written for a NULL value.
static void write_simple(const char *tag, const char *value, const char *value2);
static void write_simple(const char *tag, const char *value, const char *value2)
{
    if (!value) return;
    if (!value2) {
        fprintf(dii_file, "@%s \"%s\"\n", tag, value);
        return;
    }
    fprintf(dii_file, "@%s \"%s\" <%s>\n", tag, value, value2);
}


// Return the value of the first "<field>: " header found in headers, i.e.
// the text up to (not including) the end of that line. The field name must
// match exactly (case sensitive) at the start of a line, including the very
// first line. Continuation lines are not joined.
//
//   headers  NUL terminated header text, may be NULL; it is not modified
//   field    header name without the trailing colon
//
// Returns an empty string when headers is NULL or the field is not present.
static string extract_header(char *headers, const char *field);
static string extract_header(char *headers, const char *field)
{
    string rc;
    if (!headers || !field) return rc;

    const string key = string(field) + ": ";
    const char *value = NULL;
    if (strncmp(headers, key.c_str(), key.size()) == 0) {
        // the field is on the first line, there is no preceding newline
        value = headers + key.size();
    } else {
        const string needle = "\n" + key;
        const char *hit = strstr(headers, needle.c_str());
        if (hit) value = hit + needle.size();
    }

    if (value) {
        const char *eol = strchr(value, '\n');
        rc = eol ? string(value, eol - value) : string(value);
    }
    return rc;
}


// Write one email: its dii records (folder, addresses, dates, subject,
// message id, read flag, header, body, attachments, bates range) go to the
// dii file, and the header and body text are rendered into a new pdf.
//
//   f     the folder the email lives in (its name is used for the records
//         and for attachment file names)
//   item  the parsed email; header and body strings are modified in place
//         (carriage returns removed, header cut at the first empty line)
//   pst   the pst file, needed to extract attachment data
static void write_normal_email(file_ll &f, pst_item* item, pst_file* pst);
static void write_normal_email(file_ll &f, pst_item* item, pst_file* pst)
{
    DEBUG_ENT("write_normal_email");
    char *soh = NULL;  // real start of headers.
    if (item->email->header.str) {
        // some of the headers we get from the file are not properly defined.
        // they can contain some email stuff too. We will cut off the header
        // when we see a \n\n or \r\n\r\n
        removeCR(item->email->header.str);
        char *temp = strstr(item->email->header.str, "\n\n");
        if (temp) {
            DEBUG_INFO(("Found body text in header\n"));
            temp[1] = '\0'; // stop after first \n
        }
        soh = skip_header_prologue(item->email->header.str);
    }

    char folder_line[LINE_SIZE];
    char line[LINE_SIZE];
    // reset pdf writer to new file
    int bates = bates_index;    // save starting index
    snprintf(folder_line, sizeof(folder_line), "pst folder = %s\n", f.name.c_str());
    open_pdf(folder_line);

    // start printing this email
    fprintf(dii_file, "@FOLDERNAME %s\n", f.name.c_str());
    // Prefer the addresses from the internet headers; without headers there
    // is nothing to search, so fall back to the MAPI properties right away.
    string myfrom = soh ? extract_header(soh, "From") : string();
    string myto   = soh ? extract_header(soh, "To")   : string();
    string mycc   = soh ? extract_header(soh, "Cc")   : string();
    string mybcc  = soh ? extract_header(soh, "Bcc")  : string();
    if (myfrom.empty()) write_simple("FROM", item->email->outlook_sender_name.str, item->email->sender_address.str);
    else                write_simple("FROM", myfrom);
    if (myto.empty())   write_simple("TO",   item->email->sentto_address.str,      item->email->recip_address.str);
    else                write_simple("TO", myto);
    if (mycc.empty())   write_simple("CC",   item->email->cc_address.str);
    else                write_simple("CC", mycc);
    if (mybcc.empty())  write_simple("BCC",  item->email->bcc_address.str);
    else                write_simple("BCC", mybcc);
    if (item->email->sent_date) {
        time_t t = pst_fileTimeToUnixTime(item->email->sent_date);
        char c_time[C_TIME_SIZE];
        strftime(c_time, C_TIME_SIZE, "%F", gmtime(&t));
        write_simple("DATESENT", c_time);
        strftime(c_time, C_TIME_SIZE, "%T+0000", gmtime(&t));
        write_simple("TIMESENT", c_time);
    }
    if (item->email->arrival_date) {
        time_t t = pst_fileTimeToUnixTime(item->email->arrival_date);
        char c_time[C_TIME_SIZE];
        strftime(c_time, C_TIME_SIZE, "%F", gmtime(&t));
        write_simple("DATERCVD", c_time);
        strftime(c_time, C_TIME_SIZE, "%T+0000", gmtime(&t));
        write_simple("TIMERCVD", c_time);
    }
    if (item->subject.str) {
        write_simple("SUBJECT", item->subject.str);
    }
    write_simple("MSGID", item->email->messageid.str);
    write_simple("READ", (item->flags & 1) ? "Y" : "N");

    DEBUG_INFO(("About to print Header\n"));
    fprintf(dii_file, "@HEADER\n");

    if (item->subject.str) {
        DEBUG_INFO(("item->subject = %s\n", item->subject.str));
    }

    if (soh) {
        // Now, write out the header...
        print_pdf(soh);
        int len = strlen(soh);
        if (!len || (soh[len-1] != '\n')) {
            // make sure the header block ends with a newline
            snprintf(line, sizeof(line), "\n");
            print_pdf(line);
        }

    } else {
        //make up our own headers
        // any of these strings may be missing; never hand NULL to %s
        const char *sender_name = item->email->outlook_sender_name.str;
        if (!sender_name) sender_name = "";
        const char *temp = item->email->outlook_sender.str;
        if (!temp) temp = "";
        snprintf(line, sizeof(line), "From: \"%s\" <%s>\n", sender_name, temp);
        print_pdf(line);

        if (item->subject.str) {
            snprintf(line, sizeof(line), "Subject: %s\n", item->subject.str);
        } else {
            snprintf(line, sizeof(line), "Subject: \n");
        }
        print_pdf(line);

        const char *sent_to = item->email->sentto_address.str;
        if (!sent_to) sent_to = "";
        snprintf(line, sizeof(line), "To: %s\n", sent_to);
        print_pdf(line);

        if (item->email->cc_address.str) {
            snprintf(line, sizeof(line), "Cc: %s\n", item->email->cc_address.str);
            print_pdf(line);
        }

        if (item->email->sent_date) {
            time_t em_time = pst_fileTimeToUnixTime(item->email->sent_date);
            char c_time[C_TIME_SIZE];
            strftime(c_time, C_TIME_SIZE, "%a, %d %b %Y %H:%M:%S %z", gmtime(&em_time));
            snprintf(line, sizeof(line), "Date: %s\n", c_time);
            print_pdf(line);
        }
    }
    // blank line between header and body, on the page only
    snprintf(line, sizeof(line), "\n");
    print_pdf_only(line, black);
    fprintf(dii_file, "@HEADER-END\n");

    DEBUG_INFO(("About to print Body\n"));
    fprintf(dii_file, "@EMAIL-BODY\n");
    if (item->body.str) {
        removeCR(item->body.str);
        print_pdf(item->body.str);
    } else if (item->email->htmlbody.str) {
        removeCR(item->email->htmlbody.str);
        print_pdf(item->email->htmlbody.str);
    } else if (item->email->encrypted_body.data || item->email->encrypted_htmlbody.data) {
        char ln[LINE_SIZE];
        snprintf(ln, sizeof(ln), "%s", "The body of this email is encrypted. This isn't supported yet, but the body is now an attachment\n");
        print_pdf(ln);
    }
    fprintf(dii_file, "@EMAIL-END\n");

    // save every attachment that has data (inline or referenced by id)
    int attach_num = 0;
    for (pst_item_attach* attach = item->attach; attach; attach = attach->next) {
        pst_convert_utf8_null(item, &attach->filename1);
        pst_convert_utf8_null(item, &attach->filename2);
        pst_convert_utf8_null(item, &attach->mimetype);
        DEBUG_INFO(("Attempting Attachment encoding\n"));
        if (attach->data.data || attach->i_id) {
            string an = write_separate_attachment(f.name, attach, ++attach_num, pst);
            fprintf(dii_file, "@EATTACH %s\n", an.c_str());
        }
    }
    close_pdf();
    // bates numbers consumed by the pages of this email
    fprintf(dii_file, "@BATESBEG %d\n", bates);
    fprintf(dii_file, "@BATESEND %d\n", bates_index-1);
    DEBUG_RET();
}


// Fill in the bookkeeping record for a folder that is about to be processed.
//
//   f       record to initialize (name, type and stored item count)
//   parent  record of the enclosing folder, or NULL at the top level
//   item    the folder item; its file_as string is converted to utf8 first
//
// The name is the parent's name, a '/', and the folder's own file_as name.
static void create_enter_dir(file_ll &f, file_ll *parent, pst_item *item);
static void create_enter_dir(file_ll &f, file_ll *parent, pst_item *item)
{
    pst_convert_utf8(item, &item->file_as);

    string prefix;
    if (parent) {
        prefix = parent->name + "/";
    }
    f.name = prefix + string(item->file_as.str);

    if (item->folder) {
        f.stored_count = item->folder->item_count;
    } else {
        f.stored_count = 0;
    }
    f.type = item->type;
}


// Counterpart of create_enter_dir, called when a folder has been processed.
// Nothing needs to be done at the moment.
static void close_enter_dir(file_ll &f);
static void close_enter_dir(file_ll &f)
{
    (void)f;    // intentionally unused
}


// Walk one level of the descriptor tree, recursing into non-empty folders
// and writing out every email that is found.
//
//   outeritem  the folder item whose children are processed
//   parent     bookkeeping record of the enclosing folder, NULL at the top
//   d_ptr      first child descriptor; its siblings are followed via next
static void process(pst_item *outeritem, file_ll *parent, pst_desc_tree *d_ptr);
static void process(pst_item *outeritem, file_ll *parent, pst_desc_tree *d_ptr)
{
    file_ll ff;
    DEBUG_ENT("process");
    create_enter_dir(ff, parent, outeritem);
    for (; d_ptr; d_ptr = d_ptr->next) {
        if (!d_ptr->desc) continue;     // nothing to parse for this node

        pst_item *item = pst_parse_item(&pstfile, d_ptr, NULL);
        DEBUG_INFO(("item pointer is %p\n", item));
        if (!item) {
            ff.skip_count++;
            DEBUG_INFO(("A NULL item was seen\n"));
            continue;
        }

        const bool is_folder = item->folder && item->file_as.str && d_ptr->child;
        const bool is_email  = item->email && (item->type == PST_TYPE_NOTE || item->type == PST_TYPE_SCHEDULE || item->type == PST_TYPE_REPORT);
        if (is_folder) {
            //if this is a non-empty folder, we want to recurse into it
            fprintf(stderr, "entering folder %s\n", item->file_as.str);
            process(item, &ff, d_ptr->child);
        } else if (is_email) {
            ff.email_count++;
            write_normal_email(ff, item, &pstfile);
        } else {
            ff.skip_count++;    // other mapi objects
        }
        pst_freeItem(item);
    }
    close_enter_dir(ff);
    DEBUG_RET();
}


// Program entry point: parse the command line, open the pst file, locate the
// top of the folder tree and convert every email below it, writing the dii
// load file plus one pdf per email.
//
// Exit status: 0 on success; 1 for bad options or a missing font file;
// 2 when no pst file name is given; 3 when the dii output file cannot be
// created. Failures reported through RET_DERROR / DIE terminate as defined
// by those macros.
int main(int argc, char* const* argv)
{
    pst_desc_tree *d_ptr;
    char *fname = NULL;
    int c;      // getopt returns int; a plain char can never equal -1 where char is unsigned
    char *d_log = NULL;
    prog_name = argv[0];
    pst_item *item = NULL;
    int rc = 0;

    while ((c = getopt(argc, argv, "B:b:c:d:f:o:O:Vh"))!= -1) {
        switch (c) {
        case 'B':
            bates_prefix = optarg;
            break;
        case 'b':
            bates_index = atoi(optarg);
            break;
        case 'c':
            bates_color = (int)strtol(optarg, (char**)NULL, 16);
            break;
        case 'f':
            font_file = optarg;
            break;
        case 'o':
            output_directory = optarg;
            break;
        case 'O':
            output_file = optarg;
            break;
        case 'd':
            d_log = optarg;
            break;
        case 'h':
            usage();
            exit(0);
            break;
        case 'V':
            version();
            exit(0);
            break;
        default:
            usage();
            exit(1);
            break;
        }
    }

    // the font file is mandatory, nothing can be rendered without it
    if (!font_file) {
        usage();
        exit(1);
    }

    // the first non-option argument is the pst file
    if (argc > optind) {
        fname = argv[optind];
    } else {
        usage();
        exit(2);
    }


    #ifdef DEBUG_ALL
        // force a log file
        if (!d_log) d_log = (char*)"pst2dii.log";
    #endif
    DEBUG_INIT(d_log, NULL);
    DEBUG_ENT("main");
    RET_DERROR(pst_open(&pstfile, fname, NULL), 1, ("Error opening File\n"));
    RET_DERROR(pst_load_index(&pstfile), 2, ("Index Error\n"));

    pst_load_extended_attributes(&pstfile);

    d_ptr = pstfile.d_head; // first record is main record
    item  = (pst_item*)pst_parse_item(&pstfile, d_ptr, NULL);
    if (!item || !item->message_store) {
        DEBUG_RET();
        DIE(("Could not get root record\n"));
    }

    d_ptr = pst_getTopOfFolders(&pstfile, item);
    if (!d_ptr) {
        DEBUG_RET();
        DIE(("Top of folders record not found. Cannot continue\n"));
    }

    dii_file = fopen(output_file, "wb");
    if (dii_file) {
        process(item, NULL, d_ptr->child);  // do the children of TOPF
        fclose(dii_file);
    } else {
        // do not pretend success when nothing could be written
        fprintf(stderr, "%s: cannot open dii output file %s\n", prog_name, output_file);
        rc = 3;
    }
    pst_freeItem(item);
    pst_close(&pstfile);
    DEBUG_RET();
    return rc;
}