changeset 345:a8577226f7a9

Fixes from AJ Shankar for attachment processing, and for body encodings that contain embedded NUL characters
author Carl Byington <carl@five-ten-sg.com>
date Mon, 09 Mar 2015 10:47:58 -0700
parents aedcf979f439
children afa777d5bedf
files AUTHORS libpst.spec.in regression/regression-tests.bash src/libpst.c src/readpst.c
diffstat 5 files changed, 84 insertions(+), 63 deletions(-) [+]
line wrap: on
line diff
--- a/AUTHORS	Mon Mar 09 08:49:47 2015 -0700
+++ b/AUTHORS	Mon Mar 09 10:47:58 2015 -0700
@@ -37,6 +37,7 @@
     Svante Signell <svante.signell@telia.com>
     Dominique Leuenberger a.k.a. Dimstar <dimstar@opensuse.org>
     Daniel Gryniewicz <dang@linuxbox.com>
+    AJ Shankar <aj@everlaw.com>
 
 Testing team:
     Mac OSX - Michael Watson <mike@mikeandgayle.com>
--- a/libpst.spec.in	Mon Mar 09 08:49:47 2015 -0700
+++ b/libpst.spec.in	Mon Mar 09 10:47:58 2015 -0700
@@ -163,6 +163,8 @@
 * Mon Mar 09 2015 Carl Byington <carl@five-ten-sg.com> 0.6.64-1
 - fix line wrap on python provides_exclude_from
 - fix unchecked errors found by cppcheck
+- AJ Shankar fixes for attachment processing and body encodings
+  that contain embedded null chars.
 
 * Mon Jan 26 2015 Petr Machata <pmachata@redhat.com> - 0.6.63-5
 - Rebuild for boost 1.57.0
--- a/regression/regression-tests.bash	Mon Mar 09 08:49:47 2015 -0700
+++ b/regression/regression-tests.bash	Mon Mar 09 10:47:58 2015 -0700
@@ -65,12 +65,13 @@
             #$val ../src/readpst $jobs -te -r -D -cv -o output$n -d $ba.log $fn >$ba.err 2>&1
 
             ## normal recursive dump
+            char='BIG-5'
             char='us-ascii'
             acc="-a '.xls,.doc'"
             acc=''
-            #char='BIG-5'
-            echo $val ../src/readpst $acc -C $char -j 0 -r -cv -o output$n -d $ba.log $fn
-                 $val ../src/readpst $acc -C $char -j 0 -r -cv -o output$n -d $ba.log $fn >$ba.err 2>&1
+            utf='-8'
+            echo $val ../src/readpst $utf $acc -C $char -j 0 -r -cv -o output$n -d $ba.log $fn
+                 $val ../src/readpst $utf $acc -C $char -j 0 -r -cv -o output$n -d $ba.log $fn >$ba.err 2>&1
 
             ## separate mode with filename extensions and .msg files
             #echo $val ../src/readpst $jobs     -r -m -D -cv -o output$n -d $ba.log $fn
@@ -108,36 +109,36 @@
 [ "$2" == "reg" ] && regression="yes"
 [ "$regression" == "yes" ] && val=""
 
-#$func   1 ams.pst
-#$func   2 sample_64.pst
-#$func   3 test.pst
-#$func   4 big_mail.pst
+$func   1 ams.pst
+$func   2 sample_64.pst
+$func   3 test.pst
+$func   4 big_mail.pst
 $func   5 mbmg.archive.pst
-#$func   6 Single2003-read.pst
-#$func   7 Single2003-unread.pst
-#$func   8 ol2k3high.pst
-#$func   9 ol97high.pst
-#$func  10 returned_message.pst
-#$func  11 flow.pst
-#$func  12 test-html.pst
-#$func  13 test-text.pst
-#$func  14 joe.romanowski.pst
-#$func  15 hourig1.pst
-#$func  16 test-mac.pst
-#$func  18 spam.pst
-#$func  19 rendgen.pst           # single email appointment
-#$func  20 rendgen2.pst          # email appointment with no termination date
-#$func  21 rendgen3.pst          # mime signed email
-#$func  22 rendgen4.pst          # appointment test cases
-#$func  23 rendgen5.pst          # appointment test cases
-#$func  24 paul.sheer.pst        # embedded rfc822 attachment
-#$func  25 jerry.pst             # non ascii subject lines
-#$func  26 phill.bertolus.pst    # possible segfault in forked process, cannot reproduce
-#$func  27 kaiser.pst            # appointments with other character sets
-#$func  28 pstsample.pst         # character set issue
-#$func  29 pstsample2.pst        # embedded image in rtf data
-#$func  30 pstsample3.pst        # exports of rtf and html
-#$func  31 Journal_Archives_08_29_2010.pst
+$func   6 Single2003-read.pst
+$func   7 Single2003-unread.pst
+$func   8 ol2k3high.pst
+$func   9 ol97high.pst
+$func  10 returned_message.pst
+$func  11 flow.pst
+$func  12 test-html.pst
+$func  13 test-text.pst
+$func  14 joe.romanowski.pst
+$func  15 hourig1.pst
+$func  16 test-mac.pst
+$func  18 spam.pst
+$func  19 rendgen.pst           # single email appointment
+$func  20 rendgen2.pst          # email appointment with no termination date
+$func  21 rendgen3.pst          # mime signed email
+$func  22 rendgen4.pst          # appointment test cases
+$func  23 rendgen5.pst          # appointment test cases
+$func  24 paul.sheer.pst        # embedded rfc822 attachment
+$func  25 jerry.pst             # non ascii subject lines
+$func  26 phill.bertolus.pst    # possible segfault in forked process, cannot reproduce
+$func  27 kaiser.pst            # appointments with other character sets
+$func  28 pstsample.pst         # character set issue
+$func  29 pstsample2.pst        # embedded image in rtf data
+$func  30 pstsample3.pst        # exports of rtf and html
+$func  31 Journal_Archives_08_29_2010.pst
 
 [ -n "$val" ] && grep 'lost:' *err | grep -v 'lost: 0 '
 
--- a/src/libpst.c	Mon Mar 09 08:49:47 2015 -0700
+++ b/src/libpst.c	Mon Mar 09 10:47:58 2015 -0700
@@ -1304,7 +1304,10 @@
         DEBUG_INFO(("ATTACHMENT processing attachment\n"));
         list = pst_parse_block(pf, id2_ptr->id->i_id, id2_head);
         if (!list) {
-            DEBUG_WARN(("ERROR error processing main attachment record\n"));
+            if (item->flags & PST_FLAG_HAS_ATTACHMENT) {
+                // Only report an error if we expected to see an attachment table and didn't.
+                DEBUG_WARN(("ERROR error processing main attachment record\n"));
+            }
             if (!m_head) pst_free_id2(id2_head);
             DEBUG_RET();
             return item;
@@ -1351,7 +1354,9 @@
                     continue;
                 }
                 pst_free_list(list);
-                id2_ptr = pst_getID2(id2_head, attach->id2_val);
+                // As per 2.4.6.2 in the spec, the attachment data is stored as a child of the
+                // attachment object, so we pass in id2_ptr as the head to search from.
+                id2_ptr = pst_getID2(id2_ptr, attach->id2_val);
                 if (id2_ptr) {
                     DEBUG_WARN(("second pass attachment updating id2 %#"PRIx64" found i_id %#"PRIx64"\n", attach->id2_val, id2_ptr->id->i_id));
                     // i_id has been updated to the datablock containing the attachment data
--- a/src/readpst.c	Mon Mar 09 08:49:47 2015 -0700
+++ b/src/readpst.c	Mon Mar 09 10:47:58 2015 -0700
@@ -54,7 +54,7 @@
 char*     header_get_field(char *header, char *field);
 char*     header_end_field(char *field);
 void      header_strip_field(char *header, char *field);
-int       test_base64(char *body);
+int       test_base64(char *body, size_t len);
 void      find_html_charset(char *html, char *charset, size_t charsetlen);
 void      find_rfc822_headers(char** extra_mime_headers);
 void      write_body_part(FILE* f_output, pst_string *body, char *mime, char *charset, char *boundary, pst_file* pst);
@@ -132,6 +132,7 @@
 int         output_type_mode = 0xff;    // Default to all.
 int         contact_mode_specified = 0;
 int         overwrite = 0;
+int         prefer_utf8 = 0;
 int         save_rtf_body = 1;
 int         file_name_len = 10;     // enough room for MODE_SPEARATE file name
 pst_file    pstfile;
@@ -452,7 +453,7 @@
     }
 
     // command-line option handling
-    while ((c = getopt(argc, argv, "a:bC:c:Dd:emhj:kMo:qrSt:uVw"))!= -1) {
+    while ((c = getopt(argc, argv, "a:bC:c:Dd:emhj:kMo:qrSt:uVw8"))!= -1) {
         switch (c) {
         case 'a':
             if (optarg) {
@@ -587,6 +588,9 @@
         case 'w':
             overwrite = 1;
             break;
+        case '8':
+            prefer_utf8 = 1;
+            break;
         default:
             usage();
             exit(1);
@@ -758,6 +762,7 @@
     printf("\t-t[eajc]\t- Set the output type list. e = email, a = attachment, j = journal, c = contact\n");
     printf("\t-u\t- Thunderbird mode. Write two extra .size and .type files\n");
     printf("\t-w\t- Overwrite any output mbox files\n");
+    printf("\t-8\t- Output bodies in UTF-8, rather than original encoding, if UTF-8 version is available\n");
     printf("\n");
     printf("Only one of -M -S -e -k -m -r should be specified\n");
     DEBUG_RET();
@@ -1267,7 +1272,7 @@
         if (!e || (e > n)) e = n;   // use the trailing lf as terminator if nothing better
         save = *e;
         *e = '\0';
-            snprintf(body_subfield, size_subfield, "%s", s);  // copy the subfield to our buffer
+        snprintf(body_subfield, size_subfield, "%s", s);  // copy the subfield to our buffer
         *e = save;
         DEBUG_INFO(("body %s %s from headers\n", subfield, body_subfield));
     }
@@ -1316,12 +1321,12 @@
 }
 
 
-int  test_base64(char *body)
+int  test_base64(char *body, size_t len)
 {
     int b64 = 0;
     uint8_t *b = (uint8_t *)body;
     DEBUG_ENT("test_base64");
-    while (*b) {
+    while (len--) {
         if ((*b < 32) && (*b != 9) && (*b != 10)) {
             DEBUG_INFO(("found base64 byte %d\n", (int)*b));
             DEBUG_HEXDUMPC(body, strlen(body), 0x10);
@@ -1401,37 +1406,44 @@
 void write_body_part(FILE* f_output, pst_string *body, char *mime, char *charset, char *boundary, pst_file* pst)
 {
     DEBUG_ENT("write_body_part");
+    removeCR(body->str);
+    size_t body_len = strlen(body->str);
+
     if (body->is_utf8 && (strcasecmp("utf-8", charset))) {
-        // try to convert to the specified charset since the target
-        // is not utf-8, and the data came from a unicode (utf16) field
-        // and is now in utf-8.
-        size_t rc;
-        DEBUG_INFO(("Convert %s utf-8 to %s\n", mime, charset));
-        pst_vbuf *newer = pst_vballoc(2);
-        rc = pst_vb_utf8to8bit(newer, body->str, strlen(body->str), charset);
-        if (rc == (size_t)-1) {
-            // unable to convert, change the charset to utf8
-            free(newer->b);
-            DEBUG_INFO(("Failed to convert %s utf-8 to %s\n", mime, charset));
+        if (prefer_utf8) {
             charset = "utf-8";
+        } else {
+            // try to convert to the specified charset since the target
+            // is not utf-8, and the data came from a unicode (utf16) field
+            // and is now in utf-8.
+            size_t rc;
+            DEBUG_INFO(("Convert %s utf-8 to %s\n", mime, charset));
+            pst_vbuf *newer = pst_vballoc(2);
+            rc = pst_vb_utf8to8bit(newer, body->str, body_len, charset);
+            if (rc == (size_t)-1) {
+                // unable to convert, change the charset to utf8
+                free(newer->b);
+                DEBUG_INFO(("Failed to convert %s utf-8 to %s\n", mime, charset));
+                charset = "utf-8";
+            } else {
+                // null terminate the output string
+                pst_vbgrow(newer, 1);
+                newer->b[newer->dlen] = '\0';
+                free(body->str);
+                body->str = newer->b;
+                body_len = newer->dlen;
+            }
+            free(newer);
         }
-        else {
-            // null terminate the output string
-            pst_vbgrow(newer, 1);
-            newer->b[newer->dlen] = '\0';
-            free(body->str);
-            body->str = newer->b;
-        }
-        free(newer);
     }
-    removeCR(body->str);
-    int base64 = test_base64(body->str);
+    int base64 = test_base64(body->str, body_len);
     fprintf(f_output, "\n--%s\n", boundary);
     fprintf(f_output, "Content-Type: %s; charset=\"%s\"\n", mime, charset);
     if (base64) fprintf(f_output, "Content-Transfer-Encoding: base64\n");
     fprintf(f_output, "\n");
+    // Any body that uses an encoding with NULLs, e.g. UTF16, will be base64-encoded here.
     if (base64) {
-        char *enc = pst_base64_encode(body->str, strlen(body->str));
+        char *enc = pst_base64_encode(body->str, body_len);
         if (enc) {
             write_email_body(f_output, enc);
             fprintf(f_output, "\n");
@@ -1535,9 +1547,9 @@
         if (c_time)
             c_time[strlen(c_time)-1] = '\0'; //remove end \n
         else
-            c_time = "Fri Dec 28 12:06:21 2001";
+            c_time = "Thu Jan 1 00:00:00 1970";
     } else
-        c_time = "Fri Dec 28 12:06:21 2001";
+        c_time = "Thu Jan 1 00:00:00 1970";
 
     // create our MIME boundaries here.
     snprintf(boundary, sizeof(boundary), "--boundary-LibPST-iamunique-%i_-_-", rand());