Mercurial > libpst
view src/pst2dii.cpp.in @ 363:3a1d25c579c6 stable-0-6-68
allow folders containing multiple item types; better detection of valid internet headers
author | Carl Byington <carl@five-ten-sg.com> |
---|---|
date | Mon, 29 Aug 2016 09:50:24 -0700 |
parents | cb67b335afcc |
children |
line wrap: on
line source
/* Copyright (c) 2008 Carl Byington - 510 Software Group, released under the GPL version 2 or any later version at your choice available at http://www.fsf.org/licenses/gpl.txt Based on readpst.c by David Smith */ #include <iostream> #include <string> #include <vector> using namespace std; extern "C" { #include "define.h" #include "lzfu.h" } struct file_ll { string name; int32_t stored_count; int32_t email_count; int32_t skip_count; int32_t type; file_ll() { stored_count = 0; email_count = 0; skip_count = 0; type = 0; }; }; // global settings const char* convert = "@CONVERT@"; // fully qualified path of the convert program from image magick const char* prog_name = NULL; // our arg0 name const char* bates_prefix = ""; // string to prefix bates numbers int bates_index = 0; // current bates sequence const char* output_directory = "."; const char* output_file = "load.dii"; char* font_file = NULL; int bates_color = 0xff0000; // color of bates header stamp int email_sequence = 0; // current pdf sequence number char* pdf_name = NULL; // current pdf file name FILE* dii_file = NULL; // the output dii load file pst_file pstfile; // the input pst file // pdf writer globals bool pdf_open = false; // is pdf writer started char* pst_folder; // current folder name int page_sequence; // current page number string conversion; // conversion command vector<string> png_names; // png writer globals bool png_open = false; // is current page open int line_height; // in pixels int char_width; // in pixels int col_number, col_max; // in characters int line_number, line_max; // lines per page int x_position, y_position; // in pixels int black, red; // text colors gdImagePtr image; // current gd image const int DPI = 300; const double sz = 10.0; const int margin = DPI/2; const int LINE_SIZE = 2000; const int PAGE_WIDTH = DPI*17/2; const int PAGE_HEIGHT = DPI*11; // max size of the c_time char*. It will store the date of the email #define C_TIME_SIZE 500 static void open_png(); static void close_png(); static void version(); static void version() { printf("pst2dii v%s\n", VERSION); #if BYTE_ORDER == BIG_ENDIAN printf("Big Endian implementation being used.\n"); #elif BYTE_ORDER == LITTLE_ENDIAN printf("Little Endian implementation being used.\n"); #else # error "Byte order not supported by this library" #endif } static void usage(); static void usage() { version(); printf("Usage: %s -f ttf-font-file [OPTIONS] {PST FILENAME}\n", prog_name); printf("\t-f ttf-font-file \t- Set the font file\n"); printf("OPTIONS:\n"); printf("\t-B bates-prefix \t- Set the bates prefix string\n"); printf("\t-O dii-output-file\t- Set the dii load file output filename\n"); printf("\t-V \t- Version. Display program version\n"); printf("\t-b bates-number \t- Set the starting bates sequence number\n"); printf("\t-c bates-color \t- Specify the color of the bates stamps as 6 digit hex\n"); printf("\t-d filename \t- Debug to file.\n"); printf("\t-h \t- Help. This screen\n"); printf("\t-o dirname \t- Output directory to write files to.\n"); } static char *removeCR (char *c); static char *removeCR (char *c) { // converts /r/n to /n char *a, *b; DEBUG_ENT("removeCR"); a = b = c; while (*a != '\0') { *b = *a; if (*a != '\r') b++; a++; } *b = '\0'; DEBUG_RET(); return c; } // The sole purpose of this function is to bypass the pseudo-header prologue // that Microsoft Outlook inserts at the beginning of the internet email // headers for emails stored in their "Personal Folders" files. static char *skip_header_prologue(char *headers); static char *skip_header_prologue(char *headers) { const char *bad = "Microsoft Mail Internet Headers"; if (strncmp(headers, bad, strlen(bad)) == 0) { // Found the offensive header prologue char *pc = strchr(headers, '\n'); return pc + 1; } return headers; } static void check_filename(string &fname); static void check_filename(string &fname) { char *t = strdup(fname.c_str()); DEBUG_ENT("check_filename"); if (!t) { DEBUG_RET(); return; } char *tt = t; bool fixed = false; while ((t = strpbrk(t, " /\\:"))) { // while there are characters in the second string that we don't want *t = '_'; //replace them with an underscore fixed = true; } if (fixed) fname = string(tt); free(tt); DEBUG_RET(); } static string write_separate_attachment(string fname, pst_item_attach* current_attach, int attach_num, pst_file* pst); static string write_separate_attachment(string fname, pst_item_attach* current_attach, int attach_num, pst_file* pst) { FILE *fp = NULL; int x = 0; char *temp = NULL; // If there is a long filename (filename2) use that, otherwise // use the 8.3 filename (filename1) char *attach_filename = (current_attach->filename2.str) ? current_attach->filename2.str : current_attach->filename1.str; DEBUG_ENT("write_separate_attachment"); check_filename(fname); const char* f_name = fname.c_str(); DEBUG_INFO(("dirname=%s, pathname=%s, filename=%s\n", output_directory, f_name, attach_filename)); int len = strlen(output_directory) + 1 + strlen(f_name) + 15; if (!attach_filename) { // generate our own (dummy) filename for the attachment temp = (char*)pst_malloc(len); sprintf(temp, "%s/%s_attach%i", output_directory, f_name, attach_num); } else { // have an attachment name, make sure it's unique temp = (char*)pst_malloc(len+strlen(attach_filename)); do { if (fp) fclose(fp); if (x == 0) sprintf(temp, "%s/%s_%s", output_directory, f_name, attach_filename); else sprintf(temp, "%s/%s_%s-%i", output_directory, f_name, attach_filename, x); } while ((fp = fopen(temp, "r")) && ++x < 99999999); if (x > 99999999) { DIE(("error finding attachment name. exhausted possibilities to %s\n", temp)); } } DEBUG_INFO(("Saving attachment to %s\n", temp)); if (!(fp = fopen(temp, "wb"))) { DEBUG_WARN(("write_separate_attachment: Cannot open attachment save file \"%s\"\n", temp)); } else { (void)pst_attach_to_file(pst, current_attach, fp); fclose(fp); } string rc(temp); if (temp) free(temp); DEBUG_RET(); return rc; } static void print_pdf_short(const char *line, int len, int color); static void print_pdf_short(const char *line, int len, int color) { if (line_number >= line_max) { close_png(); open_png(); } int brect[8]; gdFTStringExtra strex; strex.flags = gdFTEX_RESOLUTION; strex.linespacing = 1.20; strex.charmap = 0; strex.hdpi = DPI; strex.vdpi = DPI; char xline[len+1]; memcpy(xline, line, len); xline[len] = '\0'; char *p; char *l = xline; while ((p = strchr(l, '&'))) { *p = '\0'; char *err = gdImageStringFTEx(image, &brect[0], color, font_file, sz, 0.0, x_position, y_position, l, &strex); if (err) printf("%s", err); x_position += (brect[2]-brect[6]); l = p+1; err = gdImageStringFTEx(image, &brect[0], color, font_file, sz, 0.0, x_position, y_position, (char*)"&", &strex); if (err) printf("%s", err); x_position += (brect[2]-brect[6]); } char *err = gdImageStringFTEx(image, &brect[0], color, font_file, sz, 0.0, x_position, y_position, l, &strex); if (err) printf("%s", err); x_position += (brect[2]-brect[6]); col_number += len; } static void new_line(); static void new_line() { y_position += line_height; line_number += 1; x_position = margin; col_number = 0; } static void print_pdf_single(const char *line, int color); static void print_pdf_single(const char *line, int color) { while (*line == '\t') { char blanks[5]; memset(blanks, ' ', 5); print_pdf_short(blanks, 4, color); line++; if (col_number >= col_max) new_line(); } int n = strlen(line); while (n) { int m = col_max - col_number; // number of chars that will fit on this line m = (n > m) ? m : n; print_pdf_short(line, m, color); line += m; n -= m; if (n) new_line(); } } static void print_pdf_only(char *line, int color); static void print_pdf_only(char *line, int color) { char *p; while ((p = strchr(line, '\n'))) { *p = '\0'; print_pdf_single(line, color); *p = '\n'; line = p+1; new_line(); } print_pdf_single(line, color); } static void print_pdf(char *line); static void print_pdf(char *line) { pst_fwrite(line, 1, strlen(line), dii_file); print_pdf_only(line, black); } static void open_png() { if (!png_open) { png_open = true; int brect[8]; image = gdImageCreate(PAGE_WIDTH, PAGE_HEIGHT); gdImageColorAllocate(image, 255, 255, 255); // background color first one allocated black = gdImageColorAllocate(image, 0, 0, 0); int r = (bates_color & 0xff0000) >> 16; int g = (bates_color & 0x00ff00) >> 8; int b = (bates_color & 0x0000ff); red = gdImageColorAllocate(image, r, g, b); gdFTStringExtra strex; strex.flags = gdFTEX_RESOLUTION; strex.linespacing = 1.20; strex.charmap = 0; strex.hdpi = DPI; strex.vdpi = DPI; char line[LINE_SIZE]; char *err = gdImageStringFTEx(NULL, &brect[0], black, font_file, sz, 0.0, margin, margin, (char*)"LMgqQ", &strex); if (err) printf("%s", err); line_height = (brect[3]-brect[7]) * 12/10; char_width = (brect[2]-brect[6]) / 5; col_number = 0; col_max = (PAGE_WIDTH - margin*2) / char_width; line_number = 0; line_max = (PAGE_HEIGHT - margin*2) / line_height; x_position = margin; y_position = margin + line_height; snprintf(line, sizeof(line), "%s%06d\n", bates_prefix, bates_index++); print_pdf_only(line, red); print_pdf_only(pst_folder, red); } } static void close_png() { if (png_open) { png_open = false; int len = 4 + 11 + 4 +1; char *fn = (char*)pst_malloc(len); snprintf(fn, len, "page%d.png", ++page_sequence); FILE *pngout = fopen(fn, "wb"); if (pngout) { gdImagePng(image, pngout); fclose(pngout); } gdImageDestroy(image); // free memory png_names.push_back(fn); conversion += string(" ") + fn; free(fn); } } static void open_pdf(char *line); static void open_pdf(char *line) { pst_folder = line; page_sequence = 0; conversion = string(convert); png_names.clear(); open_png(); /* Note; allocating the largest string length to pdf_name */ int len = strlen(output_directory) + 4 + 6 + 4 + 1; pdf_name = (char*)pst_malloc(len); snprintf(pdf_name, 3 + 6 + 1, "dii%06d", ++email_sequence); fprintf(dii_file, "\n@T %s\n", pdf_name); snprintf(pdf_name, len, "%s/dii%06d.pdf", output_directory, email_sequence); } static void close_pdf(); static void close_pdf() { close_png(); conversion += string(" ") + pdf_name; (void)system(conversion.c_str()); for (vector<string>::iterator i=png_names.begin(); i!=png_names.end(); i++) { remove((*i).c_str()); } fprintf(dii_file, "@D %s\n", pdf_name); free(pdf_name); } static void write_simple(const char *tag, const char *value); static void write_simple(const char *tag, const char *value) { if (value) fprintf(dii_file, "@%s %s\n", tag, value); } static void write_simple(const char *tag, string value); static void write_simple(const char *tag, string value) { fprintf(dii_file, "@%s %s\n", tag, value.c_str()); } static void write_simple(const char *tag, const char *value, const char *value2); static void write_simple(const char *tag, const char *value, const char *value2) { if (value) { if (value2) fprintf(dii_file, "@%s \"%s\" <%s>\n", tag, value, value2); else fprintf(dii_file, "@%s \"%s\"\n", tag, value); } } static string extract_header(char *headers, const char *field); static string extract_header(char *headers, const char *field) { string rc; int len = strlen(field) + 4; char f[len]; snprintf(f, len, "\n%s: ", field); char *p = strstr(headers, f); if (p) { p += strlen(f); char *n = strchr(p, '\n'); if (n) { *n = '\0'; rc = string(p); *n = '\n'; } else { rc = string(p); } } return rc; } static void write_normal_email(file_ll &f, pst_item* item, pst_file* pst); static void write_normal_email(file_ll &f, pst_item* item, pst_file* pst) { DEBUG_ENT("write_normal_email"); char *soh = NULL; // real start of headers. if (item->email->header.str) { // some of the headers we get from the file are not properly defined. // they can contain some email stuff too. We will cut off the header // when we see a \n\n or \r\n\r\n removeCR(item->email->header.str); char *temp = strstr(item->email->header.str, "\n\n"); if (temp) { DEBUG_INFO(("Found body text in header\n")); temp[1] = '\0'; // stop after first \n } soh = skip_header_prologue(item->email->header.str); } char folder_line[LINE_SIZE]; char line[LINE_SIZE]; // reset pdf writer to new file int bates = bates_index; // save starting index snprintf(folder_line, sizeof(folder_line), "pst folder = %s\n", f.name.c_str()); open_pdf(folder_line); // start printing this email fprintf(dii_file, "@FOLDERNAME %s\n", f.name.c_str()); string myfrom = extract_header(soh, "From"); string myto = extract_header(soh, "To"); string mycc = extract_header(soh, "Cc"); string mybcc = extract_header(soh, "Bcc"); if (myfrom.empty()) write_simple("FROM", item->email->outlook_sender_name.str, item->email->sender_address.str); else write_simple("FROM", myfrom); if (myto.empty()) write_simple("TO", item->email->sentto_address.str, item->email->recip_address.str); else write_simple("TO", myto); if (mycc.empty()) write_simple("CC", item->email->cc_address.str); else write_simple("CC", mycc); if (mybcc.empty()) write_simple("BCC", item->email->bcc_address.str); else write_simple("BCC", mybcc); if (item->email->sent_date) { time_t t = pst_fileTimeToUnixTime(item->email->sent_date); char c_time[C_TIME_SIZE]; strftime(c_time, C_TIME_SIZE, "%F", gmtime(&t)); write_simple("DATESENT", c_time); strftime(c_time, C_TIME_SIZE, "%T+0000", gmtime(&t)); write_simple("TIMESENT", c_time); } if (item->email->arrival_date) { time_t t = pst_fileTimeToUnixTime(item->email->arrival_date); char c_time[C_TIME_SIZE]; strftime(c_time, C_TIME_SIZE, "%F", gmtime(&t)); write_simple("DATERCVD", c_time); strftime(c_time, C_TIME_SIZE, "%T+0000", gmtime(&t)); write_simple("TIMERCVD", c_time); } if (item->subject.str) { write_simple("SUBJECT", item->subject.str); } write_simple("MSGID", item->email->messageid.str); write_simple("READ", (item->flags & 1) ? "Y" : "N"); DEBUG_INFO(("About to print Header\n")); fprintf(dii_file, "@HEADER\n"); if (item && item->subject.str) { DEBUG_INFO(("item->subject = %s\n", item->subject.str)); } if (soh) { // Now, write out the header... print_pdf(soh); int len = strlen(soh); if (!len || (soh[len-1] != '\n')) { snprintf(line, sizeof(line), "\n"); print_pdf(line); } } else { //make up our own headers const char *temp = item->email->outlook_sender.str; if (!temp) temp = ""; snprintf(line, sizeof(line), "From: \"%s\" <%s>\n", item->email->outlook_sender_name.str, temp); print_pdf(line); if (item->subject.str) { snprintf(line, sizeof(line), "Subject: %s\n", item->subject.str); } else { snprintf(line, sizeof(line), "Subject: \n"); } print_pdf(line); snprintf(line, sizeof(line), "To: %s\n", item->email->sentto_address.str); print_pdf(line); if (item->email->cc_address.str) { snprintf(line, sizeof(line), "Cc: %s\n", item->email->cc_address.str); print_pdf(line); } if (item->email->sent_date) { time_t em_time = pst_fileTimeToUnixTime(item->email->sent_date); char c_time[C_TIME_SIZE]; strftime(c_time, C_TIME_SIZE, "%a, %d %b %Y %H:%M:%S %z", gmtime(&em_time)); snprintf(line, sizeof(line), "Date: %s\n", c_time); print_pdf(line); } } snprintf(line, sizeof(line), "\n"); print_pdf_only(line, black); fprintf(dii_file, "@HEADER-END\n"); DEBUG_INFO(("About to print Body\n")); fprintf(dii_file, "@EMAIL-BODY\n"); if (item->body.str) { removeCR(item->body.str); print_pdf(item->body.str); } else if (item->email->htmlbody.str) { removeCR(item->email->htmlbody.str); print_pdf(item->email->htmlbody.str); } else if (item->email->encrypted_body.data || item->email->encrypted_htmlbody.data) { char ln[LINE_SIZE]; snprintf(ln, sizeof(ln), "%s", "The body of this email is encrypted. This isn't supported yet, but the body is now an attachment\n"); print_pdf(ln); } fprintf(dii_file, "@EMAIL-END\n"); int attach_num = 0; for (pst_item_attach* attach = item->attach; attach; attach = attach->next) { pst_convert_utf8_null(item, &attach->filename1); pst_convert_utf8_null(item, &attach->filename2); pst_convert_utf8_null(item, &attach->mimetype); DEBUG_INFO(("Attempting Attachment encoding\n")); if (attach->data.data || attach->i_id) { string an = write_separate_attachment(f.name, attach, ++attach_num, pst); fprintf(dii_file, "@EATTACH %s\n", an.c_str()); } } close_pdf(); fprintf(dii_file, "@BATESBEG %d\n", bates); fprintf(dii_file, "@BATESEND %d\n", bates_index-1); DEBUG_RET(); } static void create_enter_dir(file_ll &f, file_ll *parent, pst_item *item); static void create_enter_dir(file_ll &f, file_ll *parent, pst_item *item) { pst_convert_utf8(item, &item->file_as); f.type = item->type; f.stored_count = (item->folder) ? item->folder->item_count : 0; f.name = ((parent) ? parent->name + "/" : "") + string(item->file_as.str); } static void close_enter_dir(file_ll &f); static void close_enter_dir(file_ll &f) { } static void process(pst_item *outeritem, file_ll *parent, pst_desc_tree *d_ptr); static void process(pst_item *outeritem, file_ll *parent, pst_desc_tree *d_ptr) { file_ll ff; pst_item *item = NULL; DEBUG_ENT("process"); create_enter_dir(ff, parent, outeritem); for (; d_ptr; d_ptr = d_ptr->next) { if (d_ptr->desc) { item = pst_parse_item(&pstfile, d_ptr, NULL); DEBUG_INFO(("item pointer is %p\n", item)); if (item) { if (item->folder && item->file_as.str && d_ptr->child ) { //if this is a non-empty folder, we want to recurse into it fprintf(stderr, "entering folder %s\n", item->file_as.str); process(item, &ff, d_ptr->child); } else if (item->email && (item->type == PST_TYPE_NOTE || item->type == PST_TYPE_SCHEDULE || item->type == PST_TYPE_REPORT)) { ff.email_count++; write_normal_email(ff, item, &pstfile); } else { ff.skip_count++; // other mapi objects } pst_freeItem(item); } else { ff.skip_count++; DEBUG_INFO(("A NULL item was seen\n")); } } } close_enter_dir(ff); DEBUG_RET(); } int main(int argc, char* const* argv) { pst_desc_tree *d_ptr; char *fname = NULL; char c; char *d_log = NULL; prog_name = argv[0]; pst_item *item = NULL; while ((c = getopt(argc, argv, "B:b:c:d:f:o:O:Vh"))!= -1) { switch (c) { case 'B': bates_prefix = optarg; break; case 'b': bates_index = atoi(optarg); break; case 'c': bates_color = (int)strtol(optarg, (char**)NULL, 16); break; case 'f': font_file = optarg; break; case 'o': output_directory = optarg; break; case 'O': output_file = optarg; break; case 'd': d_log = optarg; break; case 'h': usage(); exit(0); break; case 'V': version(); exit(0); break; default: usage(); exit(1); break; } } if (!font_file) { usage(); exit(1); } if (argc > optind) { fname = argv[optind]; } else { usage(); exit(2); } #ifdef DEBUG_ALL // force a log file if (!d_log) d_log = "pst2dii.log"; #endif DEBUG_INIT(d_log, NULL); DEBUG_ENT("main"); RET_DERROR(pst_open(&pstfile, fname, NULL), 1, ("Error opening File\n")); RET_DERROR(pst_load_index(&pstfile), 2, ("Index Error\n")); pst_load_extended_attributes(&pstfile); d_ptr = pstfile.d_head; // first record is main record item = (pst_item*)pst_parse_item(&pstfile, d_ptr, NULL); if (!item || !item->message_store) { DEBUG_RET(); DIE(("Could not get root record\n")); } d_ptr = pst_getTopOfFolders(&pstfile, item); if (!d_ptr) { DEBUG_RET(); DIE(("Top of folders record not found. Cannot continue\n")); } dii_file = fopen(output_file, "wb"); if (dii_file) { process(item, NULL, d_ptr->child); // do the children of TOPF pst_freeItem(item); pst_close(&pstfile); fclose(dii_file); } DEBUG_RET(); return 0; }