diff src/readpst.c @ 345:a8577226f7a9

fixes from AJ Shankar for attachment processing and body encodings that contain embedded null chars
author Carl Byington <carl@five-ten-sg.com>
date Mon, 09 Mar 2015 10:47:58 -0700
parents aedcf979f439
children a57c15b3108a
line wrap: on
line diff
--- a/src/readpst.c	Mon Mar 09 08:49:47 2015 -0700
+++ b/src/readpst.c	Mon Mar 09 10:47:58 2015 -0700
@@ -54,7 +54,7 @@
 char*     header_get_field(char *header, char *field);
 char*     header_end_field(char *field);
 void      header_strip_field(char *header, char *field);
-int       test_base64(char *body);
+int       test_base64(char *body, size_t len);
 void      find_html_charset(char *html, char *charset, size_t charsetlen);
 void      find_rfc822_headers(char** extra_mime_headers);
 void      write_body_part(FILE* f_output, pst_string *body, char *mime, char *charset, char *boundary, pst_file* pst);
@@ -132,6 +132,7 @@
 int         output_type_mode = 0xff;    // Default to all.
 int         contact_mode_specified = 0;
 int         overwrite = 0;
+int         prefer_utf8 = 0;
 int         save_rtf_body = 1;
 int         file_name_len = 10;     // enough room for MODE_SPEARATE file name
 pst_file    pstfile;
@@ -452,7 +453,7 @@
     }
 
     // command-line option handling
-    while ((c = getopt(argc, argv, "a:bC:c:Dd:emhj:kMo:qrSt:uVw"))!= -1) {
+    while ((c = getopt(argc, argv, "a:bC:c:Dd:emhj:kMo:qrSt:uVw8"))!= -1) {
         switch (c) {
         case 'a':
             if (optarg) {
@@ -587,6 +588,9 @@
         case 'w':
             overwrite = 1;
             break;
+        case '8':
+            prefer_utf8 = 1;
+            break;
         default:
             usage();
             exit(1);
@@ -758,6 +762,7 @@
     printf("\t-t[eajc]\t- Set the output type list. e = email, a = attachment, j = journal, c = contact\n");
     printf("\t-u\t- Thunderbird mode. Write two extra .size and .type files\n");
     printf("\t-w\t- Overwrite any output mbox files\n");
+    printf("\t-8\t- Output bodies in UTF-8, rather than original encoding, if UTF-8 version is available\n");
     printf("\n");
     printf("Only one of -M -S -e -k -m -r should be specified\n");
     DEBUG_RET();
@@ -1267,7 +1272,7 @@
         if (!e || (e > n)) e = n;   // use the trailing lf as terminator if nothing better
         save = *e;
         *e = '\0';
-            snprintf(body_subfield, size_subfield, "%s", s);  // copy the subfield to our buffer
+        snprintf(body_subfield, size_subfield, "%s", s);  // copy the subfield to our buffer
         *e = save;
         DEBUG_INFO(("body %s %s from headers\n", subfield, body_subfield));
     }
@@ -1316,12 +1321,12 @@
 }
 
 
-int  test_base64(char *body)
+int  test_base64(char *body, size_t len)
 {
     int b64 = 0;
     uint8_t *b = (uint8_t *)body;
     DEBUG_ENT("test_base64");
-    while (*b) {
+    while (len--) {
         if ((*b < 32) && (*b != 9) && (*b != 10)) {
             DEBUG_INFO(("found base64 byte %d\n", (int)*b));
             DEBUG_HEXDUMPC(body, strlen(body), 0x10);
@@ -1401,37 +1406,44 @@
 void write_body_part(FILE* f_output, pst_string *body, char *mime, char *charset, char *boundary, pst_file* pst)
 {
     DEBUG_ENT("write_body_part");
+    removeCR(body->str);
+    size_t body_len = strlen(body->str);
+
     if (body->is_utf8 && (strcasecmp("utf-8", charset))) {
-        // try to convert to the specified charset since the target
-        // is not utf-8, and the data came from a unicode (utf16) field
-        // and is now in utf-8.
-        size_t rc;
-        DEBUG_INFO(("Convert %s utf-8 to %s\n", mime, charset));
-        pst_vbuf *newer = pst_vballoc(2);
-        rc = pst_vb_utf8to8bit(newer, body->str, strlen(body->str), charset);
-        if (rc == (size_t)-1) {
-            // unable to convert, change the charset to utf8
-            free(newer->b);
-            DEBUG_INFO(("Failed to convert %s utf-8 to %s\n", mime, charset));
+        if (prefer_utf8) {
             charset = "utf-8";
+        } else {
+            // try to convert to the specified charset since the target
+            // is not utf-8, and the data came from a unicode (utf16) field
+            // and is now in utf-8.
+            size_t rc;
+            DEBUG_INFO(("Convert %s utf-8 to %s\n", mime, charset));
+            pst_vbuf *newer = pst_vballoc(2);
+            rc = pst_vb_utf8to8bit(newer, body->str, body_len, charset);
+            if (rc == (size_t)-1) {
+                // unable to convert, change the charset to utf8
+                free(newer->b);
+                DEBUG_INFO(("Failed to convert %s utf-8 to %s\n", mime, charset));
+                charset = "utf-8";
+            } else {
+                // null terminate the output string
+                pst_vbgrow(newer, 1);
+                newer->b[newer->dlen] = '\0';
+                free(body->str);
+                body->str = newer->b;
+                body_len = newer->dlen;
+            }
+            free(newer);
         }
-        else {
-            // null terminate the output string
-            pst_vbgrow(newer, 1);
-            newer->b[newer->dlen] = '\0';
-            free(body->str);
-            body->str = newer->b;
-        }
-        free(newer);
     }
-    removeCR(body->str);
-    int base64 = test_base64(body->str);
+    int base64 = test_base64(body->str, body_len);
     fprintf(f_output, "\n--%s\n", boundary);
     fprintf(f_output, "Content-Type: %s; charset=\"%s\"\n", mime, charset);
     if (base64) fprintf(f_output, "Content-Transfer-Encoding: base64\n");
     fprintf(f_output, "\n");
+    // Any body whose encoding contains NUL bytes (e.g. UTF-16) is base64-encoded here.
     if (base64) {
-        char *enc = pst_base64_encode(body->str, strlen(body->str));
+        char *enc = pst_base64_encode(body->str, body_len);
         if (enc) {
             write_email_body(f_output, enc);
             fprintf(f_output, "\n");
@@ -1535,9 +1547,9 @@
         if (c_time)
             c_time[strlen(c_time)-1] = '\0'; //remove end \n
         else
-            c_time = "Fri Dec 28 12:06:21 2001";
+            c_time = "Thu Jan 1 00:00:00 1970";
     } else
-        c_time = "Fri Dec 28 12:06:21 2001";
+        c_time = "Thu Jan 1 00:00:00 1970";
 
     // create our MIME boundaries here.
     snprintf(boundary, sizeof(boundary), "--boundary-LibPST-iamunique-%i_-_-", rand());