changeset 257:c947b8812120

rfc2047 and rfc2231 encoding for non-ascii headers and attachment filenames
author Carl Byington <carl@five-ten-sg.com>
date Fri, 24 Dec 2010 19:26:05 -0800
parents a863de65e5b8
children 8ad8fd1c5451
files ChangeLog NEWS configure.in libpst.spec.in regression/regression-tests.bash src/libpst.c src/libpst.h src/libstrfunc.c src/libstrfunc.h src/readpst.c
diffstat 10 files changed, 161 insertions(+), 38 deletions(-) [+]
line wrap: on
line diff
--- a/ChangeLog	Mon Sep 13 15:10:52 2010 -0700
+++ b/ChangeLog	Fri Dec 24 19:26:05 2010 -0800
@@ -1,3 +1,7 @@
+LibPST 0.6.50 (2010-12-24)
+===============================
+    * rfc2047 and rfc2231 encoding for non-ascii headers and attachment filenames
+
 LibPST 0.6.49 (2010-09-13)
 ===============================
     * fix to ignore embedded objects that are not email messages
--- a/NEWS	Mon Sep 13 15:10:52 2010 -0700
+++ b/NEWS	Fri Dec 24 19:26:05 2010 -0800
@@ -1,3 +1,4 @@
+0.6.50  2010-12-24 rfc2047 and rfc2231 encoding for non-ascii headers and attachment filenames
 0.6.49  2010-09-13 fix to ignore embedded objects that are not email messages
 0.6.48  2010-09-02 fix for broken internet headers from Outlook, change to mboxrd quoting
 0.6.47  2010-05-07 patches from Kenneth Berland for solaris
--- a/configure.in	Mon Sep 13 15:10:52 2010 -0700
+++ b/configure.in	Fri Dec 24 19:26:05 2010 -0800
@@ -1,5 +1,5 @@
 AC_PREREQ(2.59)
-AC_INIT(libpst,0.6.49,carl@five-ten-sg.com)
+AC_INIT(libpst,0.6.50,carl@five-ten-sg.com)
 AC_CONFIG_SRCDIR([src/libpst.c])
 AC_CONFIG_HEADER([config.h])
 AM_INIT_AUTOMAKE
@@ -19,7 +19,7 @@
 #  6. libtool will build libpst.so.x.y.z where the SONAME is libpst.so.x
 #     and x=current-age, y=age, z=revision
 
-libpst_version_info='4:4:0'
+libpst_version_info='5:0:1'
 AC_SUBST(LIBPST_VERSION_INFO, [$libpst_version_info])
 libpst_so_major='4'
 AC_SUBST(LIBPST_SO_MAJOR, [$libpst_so_major])
@@ -34,6 +34,7 @@
 # 0.6.47    libpst.so.4     libpst.so.4.0.2
 # 0.6.48    libpst.so.4     libpst.so.4.0.3
 # 0.6.49    libpst.so.4     libpst.so.4.0.4
+# 0.6.50    libpst.so.4     libpst.so.4.1.0
 
 
 
--- a/libpst.spec.in	Mon Sep 13 15:10:52 2010 -0700
+++ b/libpst.spec.in	Fri Dec 24 19:26:05 2010 -0800
@@ -147,6 +147,10 @@
 
 
 %changelog
+* Fri Dec 24 2010 Carl Byington <carl@five-ten-sg.com> - 0.6.50-1
+- rfc2047 and rfc2231 encoding for non-ascii headers and
+  attachment filenames.
+
 * Mon Sep 13 2010 Carl Byington <carl@five-ten-sg.com> - 0.6.49-1
 - fix to ignore embedded objects that are not email messages
   fedora bugzilla 633498
--- a/regression/regression-tests.bash	Mon Sep 13 15:10:52 2010 -0700
+++ b/regression/regression-tests.bash	Fri Dec 24 19:26:05 2010 -0800
@@ -101,30 +101,31 @@
 [ "$2" == "reg" ] && regression="yes"
 [ "$regression" == "yes" ] && val=""
 
-$func   1 ams.pst
-$func   2 sample_64.pst
-$func   3 test.pst
-$func   4 big_mail.pst
-$func   5 mbmg.archive.pst
-$func   6 Single2003-read.pst
-$func   7 Single2003-unread.pst
-$func   8 ol2k3high.pst
-$func   9 ol97high.pst
-$func  10 returned_message.pst
-$func  11 flow.pst
-$func  12 test-html.pst
-$func  13 test-text.pst
-$func  14 joe.romanowski.pst
-$func  15 hourig1.pst
-$func  16 test-mac.pst
-$func  17 harris.pst
-$func  18 spam.pst
-$func  19 rendgen.pst       # single email appointment
-$func  20 rendgen2.pst      # email appointment with no termination date
-$func  21 rendgen3.pst      # mime signed email
-$func  22 rendgen4.pst      # appointment test cases
-$func  23 rendgen5.pst      # appointment test cases
-$func  24 paul.sheer.pst    # embedded rfc822 attachment
+#$func   1 ams.pst
+#$func   2 sample_64.pst
+#$func   3 test.pst
+#$func   4 big_mail.pst
+#$func   5 mbmg.archive.pst
+#$func   6 Single2003-read.pst
+#$func   7 Single2003-unread.pst
+##$func   8 ol2k3high.pst
+##$func   9 ol97high.pst
+#$func  10 returned_message.pst
+##$func  11 flow.pst
+#$func  12 test-html.pst
+#$func  13 test-text.pst
+#$func  14 joe.romanowski.pst
+#$func  15 hourig1.pst
+#$func  16 test-mac.pst
+##$func  17 harris.pst
+##$func  18 spam.pst
+#$func  19 rendgen.pst       # single email appointment
+#$func  20 rendgen2.pst      # email appointment with no termination date
+#$func  21 rendgen3.pst      # mime signed email
+#$func  22 rendgen4.pst      # appointment test cases
+#$func  23 rendgen5.pst      # appointment test cases
+#$func  24 paul.sheer.pst    # embedded rfc822 attachment
+$func  25 jerry.pst         # non ascii subject lines
 
 [ -n "$val" ] && grep 'lost:' *err | grep -v 'lost: 0 '
 
--- a/src/libpst.c	Mon Sep 13 15:10:52 2010 -0700
+++ b/src/libpst.c	Fri Dec 24 19:26:05 2010 -0800
@@ -4385,6 +4385,76 @@
 }
 
 
+/** Replace str with its rfc2231 extended parameter value encoding.
+ *
+ *  The result is the prefix utf-8'' followed by the original bytes, where
+ *  every byte that is not an rfc2231 attribute-char (space, controls, DEL,
+ *  non-ascii, asterisk, apostrophe, percent and the rfc2045 tspecials) is
+ *  written as %XX with two uppercase hex digits. No charset conversion is
+ *  done here. The old buffer is freed and replaced by a new allocation.
+ *
+ *  @param str   pointer to the mapi string of interest; str->str must not
+ *               be null
+ */
+void pst_rfc2231(pst_string *str) {
+    static const char prefix[]  = "utf-8''";
+    static const char special[] = "*'%()<>@,;:\\\"/[]?=";
+    static const char hex[]     = "0123456789ABCDEF";
+    const uint8_t *y = (const uint8_t *)str->str;
+    // worst case every byte expands to three, plus the prefix and the null
+    char *buffer = pst_malloc(3*strlen(str->str) + sizeof(prefix));
+    char *z = buffer;
+    memcpy(z, prefix, sizeof(prefix) - 1);
+    z += sizeof(prefix) - 1;
+    for (; *y; y++) {
+        if ((*y <= 32) || (*y >= 127) || strchr(special, *y)) {
+            *(z++) = '%';
+            *(z++) = hex[*y >> 4];      // always two digits, zero padded
+            *(z++) = hex[*y & 0x0f];
+        }
+        else {
+            *(z++) = (char)*y;
+        }
+    }
+    *z = '\0';
+    free(str->str);
+    str->str = buffer;
+}
+
+
+/** Convert str to utf8, then rfc2047 encode it, or quote it if it has spaces.
+ *  A null string is left untouched.
+ *  @param item        pointer to the containing mapi item
+ *  @param str         pointer to the mapi string of interest
+ *  @param needs_quote nonzero to double quote unencoded text with spaces
+ */
+void pst_rfc2047(pst_item *item, pst_string *str, int needs_quote) {
+    int has_space = 0, needs_coding = 0;
+    const int8_t *x;
+    pst_convert_utf8(item, str);
+    if (!str->str) return;  // pst_convert_utf8 preserves null strings
+    for (x = (int8_t *)str->str; *x; x++) {
+        if (*x == 32) has_space = 1;
+        if (*x < 32)  needs_coding = 1; // signed: controls and bytes >= 0x80
+    }
+    if (needs_coding) {
+        char *enc = pst_base64_encode_single(str->str, strlen(str->str));
+        int n = strlen(enc) + 20;       // room for the =?utf-8?B? ... ?= wrapper
+        free(str->str);
+        str->str = pst_malloc(n);
+        snprintf(str->str, n, "=?utf-8?B?%s?=", enc);
+        free(enc);
+    }
+    else if (has_space && needs_quote) {
+        int n = strlen(str->str) + 10;  // room for both quotes and the null
+        char *buffer = pst_malloc(n);
+        snprintf(buffer, n, "\"%s\"", str->str);
+        free(str->str);
+        str->str = buffer;
+    }
+}
+
+
 /** Convert str to utf8 if possible; null strings are preserved.
  *
  *  @param item  pointer to the containing mapi item
--- a/src/libpst.h	Mon Sep 13 15:10:52 2010 -0700
+++ b/src/libpst.h	Fri Dec 24 19:26:05 2010 -0800
@@ -1070,6 +1070,20 @@
 const char*     pst_default_charset(pst_item *item, int buflen, char* result);
 
 
+/** Convert str to rfc2231 encoding of str
+ *  @param str   pointer to the mapi string of interest; its buffer is
+ *               freed and replaced by the encoded text
+ */
+void            pst_rfc2231(pst_string *str);
+
+
+/** Convert str to utf8 and then rfc2047 encode it, or quote it if it contains spaces
+ *  @param item        pointer to the containing mapi item
+ *  @param str         pointer to the mapi string of interest
+ *  @param needs_quote nonzero to double quote unencoded text that contains spaces */
+void            pst_rfc2047(pst_item *item, pst_string *str, int needs_quote);
+
+
 /** Convert str to utf8 if possible; null strings are preserved.
  * @param item  pointer to the containing mapi item
  * @param str   pointer to the mapi string of interest
--- a/src/libstrfunc.c	Mon Sep 13 15:10:52 2010 -0700
+++ b/src/libstrfunc.c	Fri Dec 24 19:26:05 2010 -0800
@@ -14,7 +14,7 @@
         *line_count = 0;
     }
     *(*ou)++ = data;
-    (*line_count)++;
+    if (*line_count >= 0) (*line_count)++;
 }
 
 
@@ -25,6 +25,13 @@
 }
 
 
+/** base64 encode data with the line counter disabled; the caller frees the result. */
+char *pst_base64_encode_single(void *data, size_t size)
+{
+    int disabled = -1;  // a negative count is never incremented by the encoder
+    return pst_base64_encode_multiple(data, size, &disabled);
+}
+
 char *pst_base64_encode_multiple(void *data, size_t size, int *line_count)
 {
     char *output;
--- a/src/libstrfunc.h	Mon Sep 13 15:10:52 2010 -0700
+++ b/src/libstrfunc.h	Fri Dec 24 19:26:05 2010 -0800
@@ -5,6 +5,7 @@
 #include "common.h"
 
 char *pst_base64_encode(void *data, size_t size);
+char *pst_base64_encode_single(void *data, size_t size);
 char *pst_base64_encode_multiple(void *data, size_t size, int *line_count);
 
 #endif
--- a/src/readpst.c	Mon Sep 13 15:10:52 2010 -0700
+++ b/src/readpst.c	Fri Dec 24 19:26:05 2010 -0800
@@ -813,7 +813,7 @@
         if (y == 0)
             snprintf(dir_name, dirsize, "%s", dir);
         else
-      snprintf(dir_name, dirsize, "%s" SEP_MAIL_FILE_TEMPLATE, dir, y, ""); // enough for 9 digits allocated above
+            snprintf(dir_name, dirsize, "%s" SEP_MAIL_FILE_TEMPLATE, dir, y, ""); // enough for 9 digits allocated above
 
         check_filename(dir_name);
         DEBUG_INFO(("about to try creating %s\n", dir_name));
@@ -1050,13 +1050,19 @@
     }
     fprintf(f_output, "Content-Transfer-Encoding: base64\n");
 
-    // If there is a long filename (filename2) use that, otherwise
-    // use the 8.3 filename (filename1)
-    attach_filename = (attach->filename2.str) ? attach->filename2.str : attach->filename1.str;
-    if (!attach_filename) {
+    if (attach->filename2.str) {
+        // use the long filename, converted to proper encoding if needed.
+        // it is already utf8
+        pst_rfc2231(&attach->filename2);
+        fprintf(f_output, "Content-Disposition: attachment; \n        filename*=%s\n\n", attach->filename2.str);
+    }
+    else if (attach->filename1.str) {
+        // short filename never needs encoding
+        fprintf(f_output, "Content-Disposition: attachment; filename=\"%s\"\n\n", attach->filename1.str);
+    }
+    else {
+        // no filename is inline
         fprintf(f_output, "Content-Disposition: inline\n\n");
-    } else {
-        fprintf(f_output, "Content-Disposition: attachment; filename=\"%s\"\n\n", attach_filename);
     }
 
     (void)pst_attach_to_file_base64(pst, attach, f_output);
@@ -1154,7 +1160,7 @@
     int b64 = 0;
     uint8_t *b = (uint8_t *)body;
     DEBUG_ENT("test_base64");
-    while (*b != 0) {
+    while (*b) {
         if ((*b < 32) && (*b != 9) && (*b != 10)) {
             DEBUG_INFO(("found base64 byte %d\n", (int)*b));
             DEBUG_HEXDUMPC(body, strlen(body), 0x10);
@@ -1453,6 +1459,18 @@
             fprintf(f_output, "%s", headers);
             // make sure the headers end with a \n
             if (headers[len-1] != '\n') fprintf(f_output, "\n");
+            //char *h = headers;
+            //while (*h) {
+            //    char *e = strchr(h, '\n');
+            //    int   d = 1;    // normally e points to trailing \n
+            //    if (!e) {
+            //        e = h + strlen(h);  // e points to trailing null
+            //        d = 0;
+            //    }
+            //    // we could do rfc2047 encoding here if needed
+            //    fprintf(f_output, "%.*s\n", (int)(e-h), h);
+            //    h = e + d;
+            //}
         }
     }
 
@@ -1460,7 +1478,8 @@
 
     if (!has_from) {
         if (item->email->outlook_sender_name.str){
-            fprintf(f_output, "From: \"%s\" <%s>\n", item->email->outlook_sender_name.str, sender);
+            pst_rfc2047(item, &item->email->outlook_sender_name, 1);
+            fprintf(f_output, "From: %s <%s>\n", item->email->outlook_sender_name.str, sender);
         } else {
             fprintf(f_output, "From: <%s>\n", sender);
         }
@@ -1468,6 +1487,7 @@
 
     if (!has_subject) {
         if (item->subject.str) {
+            pst_rfc2047(item, &item->subject, 0);
             fprintf(f_output, "Subject: %s\n", item->subject.str);
         } else {
             fprintf(f_output, "Subject: \n");
@@ -1475,12 +1495,12 @@
     }
 
     if (!has_to && item->email->sentto_address.str) {
-        pst_convert_utf8(item, &item->email->sentto_address);
+        pst_rfc2047(item, &item->email->sentto_address, 0);
         fprintf(f_output, "To: %s\n", item->email->sentto_address.str);
     }
 
     if (!has_cc && item->email->cc_address.str) {
-        pst_convert_utf8(item, &item->email->cc_address);
+        pst_rfc2047(item, &item->email->cc_address, 0);
         fprintf(f_output, "Cc: %s\n", item->email->cc_address.str);
     }