# HG changeset patch # User Carl Byington # Date 1425923278 25200 # Node ID a8577226f7a91eb11b2ca61bdcb717bee8bbc248 # Parent aedcf979f4392e8492292a58a0e4f49cf4bd285f fixes from AJ Shankar for attachment processing and body encodings that contain embedded null chars diff -r aedcf979f439 -r a8577226f7a9 AUTHORS --- a/AUTHORS Mon Mar 09 08:49:47 2015 -0700 +++ b/AUTHORS Mon Mar 09 10:47:58 2015 -0700 @@ -37,6 +37,7 @@ Svante Signell Dominique Leuenberger a.k.a. Dimstar Daniel Gryniewicz + AJ Shankar Testing team: Mac OSX - Michael Watson diff -r aedcf979f439 -r a8577226f7a9 libpst.spec.in --- a/libpst.spec.in Mon Mar 09 08:49:47 2015 -0700 +++ b/libpst.spec.in Mon Mar 09 10:47:58 2015 -0700 @@ -163,6 +163,8 @@ * Mon Mar 09 2015 Carl Byington 0.6.64-1 - fix line wrap on python provides_exclude_from - fix unchecked errors found by cppcheck +- AJ Shankar fixes for attachment processing and body encodings + that contain embedded null chars. * Mon Jan 26 2015 Petr Machata - 0.6.63-5 - Rebuild for boost 1.57.0 diff -r aedcf979f439 -r a8577226f7a9 regression/regression-tests.bash --- a/regression/regression-tests.bash Mon Mar 09 08:49:47 2015 -0700 +++ b/regression/regression-tests.bash Mon Mar 09 10:47:58 2015 -0700 @@ -65,12 +65,13 @@ #$val ../src/readpst $jobs -te -r -D -cv -o output$n -d $ba.log $fn >$ba.err 2>&1 ## normal recursive dump + char='BIG-5' char='us-ascii' acc="-a '.xls,.doc'" acc='' - #char='BIG-5' - echo $val ../src/readpst $acc -C $char -j 0 -r -cv -o output$n -d $ba.log $fn - $val ../src/readpst $acc -C $char -j 0 -r -cv -o output$n -d $ba.log $fn >$ba.err 2>&1 + utf='-8' + echo $val ../src/readpst $utf $acc -C $char -j 0 -r -cv -o output$n -d $ba.log $fn + $val ../src/readpst $utf $acc -C $char -j 0 -r -cv -o output$n -d $ba.log $fn >$ba.err 2>&1 ## separate mode with filename extensions and .msg files #echo $val ../src/readpst $jobs -r -m -D -cv -o output$n -d $ba.log $fn @@ -108,36 +109,36 @@ [ "$2" == "reg" ] && regression="yes" [ "$regression" == "yes" ] && val="" -#$func 1 ams.pst -#$func 2 sample_64.pst -#$func 3 test.pst -#$func 4 big_mail.pst +$func 1 ams.pst +$func 2 sample_64.pst +$func 3 test.pst +$func 4 big_mail.pst $func 5 mbmg.archive.pst -#$func 6 Single2003-read.pst -#$func 7 Single2003-unread.pst -#$func 8 ol2k3high.pst -#$func 9 ol97high.pst -#$func 10 returned_message.pst -#$func 11 flow.pst -#$func 12 test-html.pst -#$func 13 test-text.pst -#$func 14 joe.romanowski.pst -#$func 15 hourig1.pst -#$func 16 test-mac.pst -#$func 18 spam.pst -#$func 19 rendgen.pst # single email appointment -#$func 20 rendgen2.pst # email appointment with no termination date -#$func 21 rendgen3.pst # mime signed email -#$func 22 rendgen4.pst # appointment test cases -#$func 23 rendgen5.pst # appointment test cases -#$func 24 paul.sheer.pst # embedded rfc822 attachment -#$func 25 jerry.pst # non ascii subject lines -#$func 26 phill.bertolus.pst # possible segfault in forked process, cannot reproduce -#$func 27 kaiser.pst # appointments with other character sets -#$func 28 pstsample.pst # character set issue -#$func 29 pstsample2.pst # embedded image in rtf data -#$func 30 pstsample3.pst # exports of rtf and html -#$func 31 Journal_Archives_08_29_2010.pst +$func 6 Single2003-read.pst +$func 7 Single2003-unread.pst +$func 8 ol2k3high.pst +$func 9 ol97high.pst +$func 10 returned_message.pst +$func 11 flow.pst +$func 12 test-html.pst +$func 13 test-text.pst +$func 14 joe.romanowski.pst +$func 15 hourig1.pst +$func 16 test-mac.pst +$func 18 spam.pst +$func 19 rendgen.pst # single email appointment +$func 20 rendgen2.pst # email appointment with no termination date +$func 21 rendgen3.pst # mime signed email +$func 22 rendgen4.pst # appointment test cases +$func 23 rendgen5.pst # appointment test cases +$func 24 paul.sheer.pst # embedded rfc822 attachment +$func 25 jerry.pst # non ascii subject lines +$func 26 phill.bertolus.pst # possible segfault in forked process, cannot reproduce +$func 27 kaiser.pst # appointments with other character sets +$func 28 pstsample.pst # character set issue +$func 29 pstsample2.pst # embedded image in rtf data +$func 30 pstsample3.pst # exports of rtf and html +$func 31 Journal_Archives_08_29_2010.pst [ -n "$val" ] && grep 'lost:' *err | grep -v 'lost: 0 ' diff -r aedcf979f439 -r a8577226f7a9 src/libpst.c --- a/src/libpst.c Mon Mar 09 08:49:47 2015 -0700 +++ b/src/libpst.c Mon Mar 09 10:47:58 2015 -0700 @@ -1304,7 +1304,10 @@ DEBUG_INFO(("ATTACHMENT processing attachment\n")); list = pst_parse_block(pf, id2_ptr->id->i_id, id2_head); if (!list) { - DEBUG_WARN(("ERROR error processing main attachment record\n")); + if (item->flags & PST_FLAG_HAS_ATTACHMENT) { + // Only report an error if we expected to see an attachment table and didn't. + DEBUG_WARN(("ERROR error processing main attachment record\n")); + } if (!m_head) pst_free_id2(id2_head); DEBUG_RET(); return item; @@ -1351,7 +1354,9 @@ continue; } pst_free_list(list); - id2_ptr = pst_getID2(id2_head, attach->id2_val); + // As per 2.4.6.2 in the spec, the attachment data is stored as a child of the + // attachment object, so we pass in id2_ptr as the head to search from. + id2_ptr = pst_getID2(id2_ptr, attach->id2_val); if (id2_ptr) { DEBUG_WARN(("second pass attachment updating id2 %#"PRIx64" found i_id %#"PRIx64"\n", attach->id2_val, id2_ptr->id->i_id)); // i_id has been updated to the datablock containing the attachment data diff -r aedcf979f439 -r a8577226f7a9 src/readpst.c --- a/src/readpst.c Mon Mar 09 08:49:47 2015 -0700 +++ b/src/readpst.c Mon Mar 09 10:47:58 2015 -0700 @@ -54,7 +54,7 @@ char* header_get_field(char *header, char *field); char* header_end_field(char *field); void header_strip_field(char *header, char *field); -int test_base64(char *body); +int test_base64(char *body, size_t len); void find_html_charset(char *html, char *charset, size_t charsetlen); void find_rfc822_headers(char** extra_mime_headers); void write_body_part(FILE* f_output, pst_string *body, char *mime, char *charset, char *boundary, pst_file* pst); @@ -132,6 +132,7 @@ int output_type_mode = 0xff; // Default to all. int contact_mode_specified = 0; int overwrite = 0; +int prefer_utf8 = 0; int save_rtf_body = 1; int file_name_len = 10; // enough room for MODE_SPEARATE file name pst_file pstfile; @@ -452,7 +453,7 @@ } // command-line option handling - while ((c = getopt(argc, argv, "a:bC:c:Dd:emhj:kMo:qrSt:uVw"))!= -1) { + while ((c = getopt(argc, argv, "a:bC:c:Dd:emhj:kMo:qrSt:uVw8"))!= -1) { switch (c) { case 'a': if (optarg) { @@ -587,6 +588,9 @@ case 'w': overwrite = 1; break; + case '8': + prefer_utf8 = 1; + break; default: usage(); exit(1); @@ -758,6 +762,7 @@ printf("\t-t[eajc]\t- Set the output type list. e = email, a = attachment, j = journal, c = contact\n"); printf("\t-u\t- Thunderbird mode. Write two extra .size and .type files\n"); printf("\t-w\t- Overwrite any output mbox files\n"); + printf("\t-8\t- Output bodies in UTF-8, rather than original encoding, if UTF-8 version is available\n"); printf("\n"); printf("Only one of -M -S -e -k -m -r should be specified\n"); DEBUG_RET(); @@ -1267,7 +1272,7 @@ if (!e || (e > n)) e = n; // use the trailing lf as terminator if nothing better save = *e; *e = '\0'; - snprintf(body_subfield, size_subfield, "%s", s); // copy the subfield to our buffer + snprintf(body_subfield, size_subfield, "%s", s); // copy the subfield to our buffer *e = save; DEBUG_INFO(("body %s %s from headers\n", subfield, body_subfield)); } @@ -1316,12 +1321,12 @@ } -int test_base64(char *body) +int test_base64(char *body, size_t len) { int b64 = 0; uint8_t *b = (uint8_t *)body; DEBUG_ENT("test_base64"); - while (*b) { + while (len--) { if ((*b < 32) && (*b != 9) && (*b != 10)) { DEBUG_INFO(("found base64 byte %d\n", (int)*b)); DEBUG_HEXDUMPC(body, strlen(body), 0x10); @@ -1401,37 +1406,44 @@ void write_body_part(FILE* f_output, pst_string *body, char *mime, char *charset, char *boundary, pst_file* pst) { DEBUG_ENT("write_body_part"); + removeCR(body->str); + size_t body_len = strlen(body->str); + if (body->is_utf8 && (strcasecmp("utf-8", charset))) { - // try to convert to the specified charset since the target - // is not utf-8, and the data came from a unicode (utf16) field - // and is now in utf-8. - size_t rc; - DEBUG_INFO(("Convert %s utf-8 to %s\n", mime, charset)); - pst_vbuf *newer = pst_vballoc(2); - rc = pst_vb_utf8to8bit(newer, body->str, strlen(body->str), charset); - if (rc == (size_t)-1) { - // unable to convert, change the charset to utf8 - free(newer->b); - DEBUG_INFO(("Failed to convert %s utf-8 to %s\n", mime, charset)); + if (prefer_utf8) { charset = "utf-8"; + } else { + // try to convert to the specified charset since the target + // is not utf-8, and the data came from a unicode (utf16) field + // and is now in utf-8. + size_t rc; + DEBUG_INFO(("Convert %s utf-8 to %s\n", mime, charset)); + pst_vbuf *newer = pst_vballoc(2); + rc = pst_vb_utf8to8bit(newer, body->str, body_len, charset); + if (rc == (size_t)-1) { + // unable to convert, change the charset to utf8 + free(newer->b); + DEBUG_INFO(("Failed to convert %s utf-8 to %s\n", mime, charset)); + charset = "utf-8"; + } else { + // null terminate the output string + pst_vbgrow(newer, 1); + newer->b[newer->dlen] = '\0'; + free(body->str); + body->str = newer->b; + body_len = newer->dlen; + } + free(newer); } - else { - // null terminate the output string - pst_vbgrow(newer, 1); - newer->b[newer->dlen] = '\0'; - free(body->str); - body->str = newer->b; - } - free(newer); } - removeCR(body->str); - int base64 = test_base64(body->str); + int base64 = test_base64(body->str, body_len); fprintf(f_output, "\n--%s\n", boundary); fprintf(f_output, "Content-Type: %s; charset=\"%s\"\n", mime, charset); if (base64) fprintf(f_output, "Content-Transfer-Encoding: base64\n"); fprintf(f_output, "\n"); + // Any body that uses an encoding with NULLs, e.g. UTF16, will be base64-encoded here. if (base64) { - char *enc = pst_base64_encode(body->str, strlen(body->str)); + char *enc = pst_base64_encode(body->str, body_len); if (enc) { write_email_body(f_output, enc); fprintf(f_output, "\n"); @@ -1535,9 +1547,9 @@ if (c_time) c_time[strlen(c_time)-1] = '\0'; //remove end \n else - c_time = "Fri Dec 28 12:06:21 2001"; + c_time = "Thu Jan 1 00:00:00 1970"; } else - c_time = "Fri Dec 28 12:06:21 2001"; + c_time = "Thu Jan 1 00:00:00 1970"; // create our MIME boundaries here. snprintf(boundary, sizeof(boundary), "--boundary-LibPST-iamunique-%i_-_-", rand());