changeset 298:201464dd356e

add default character set for items where the pst file does not specify a character set
author Carl Byington <carl@five-ten-sg.com>
date Tue, 02 Aug 2011 17:02:39 -0700
parents 8b3a827b71f4
children 1ddc61fd6189
files python/python-libpst.cpp regression/regression-tests.bash src/dumpblocks.c src/getidblock.c src/libpst.c src/libpst.h src/lspst.c src/pst2dii.cpp.in src/pst2ldif.cpp src/readpst.c
diffstat 10 files changed, 54 insertions(+), 25 deletions(-) [+]
line wrap: on
line diff
--- a/python/python-libpst.cpp	Thu Jul 28 17:28:49 2011 -0700
+++ b/python/python-libpst.cpp	Tue Aug 02 17:02:39 2011 -0700
@@ -27,7 +27,7 @@
 
 class pst {
 public:
-                    pst(const string filename);
+                    pst(const string filename, const string charset);
     virtual         ~pst();
     pst_desc_tree*  pst_getTopOfFolders();
     ppst_binary     pst_attach_to_mem(pst_item_attach *attach);
@@ -61,9 +61,8 @@
 };
 
 
-pst::pst(const string filename) {
-    char *f = (char *)filename.c_str(); // ok, since pst_open does not actually modify this buffer, and newer versions will change the signature to const anyway
-    is_open = (::pst_open(&pf, f) == 0);
+pst::pst(const string filename, const string charset) {
+    is_open = (::pst_open(&pf, filename.c_str(), charset.c_str()) == 0);
     root = NULL;
     topf = NULL;
     if (is_open) {
@@ -602,7 +601,7 @@
         .def_readonly("ind_type",    &pst_file::ind_type)
         ;
 
-    class_<pst>("pst", init<string>())
+    class_<pst>("pst", init<string,string>())
         .def("pst_getTopOfFolders",         &pst::pst_getTopOfFolders, return_value_policy<reference_existing_object>())
         .def("pst_attach_to_mem",           &pst::pst_attach_to_mem)
         .def("pst_attach_to_file",          &pst::pst_attach_to_file)
--- a/regression/regression-tests.bash	Thu Jul 28 17:28:49 2011 -0700
+++ b/regression/regression-tests.bash	Tue Aug 02 17:02:39 2011 -0700
@@ -71,7 +71,7 @@
             #$val ../src/readpst $jobs     -r -e -D -cv -o output$n -d $ba.log $fn >$ba.err 2>&1
 
             ## separate mode where we decode all attachments to binary files
-            $val ../src/readpst $jobs     -r -S -D -cv -o output$n -d $ba.log $fn >$ba.err 2>&1
+            $val ../src/readpst $jobs      -r -S -D -cv -o output$n -d $ba.log $fn >$ba.err 2>&1
 
             ## testing idblock
             #../src/getidblock -p $fn 0 >$ba.fulldump
@@ -127,6 +127,7 @@
 #$func  25 jerry.pst             # non ascii subject lines
 #$func  26 phill.bertolus.pst    # possible segfault in forked process, cannot reproduce
 #$func  27 justin.phelps.pst     # segfault?
+$func  27 kaiser.pst            # appointments with other character sets
 
 
 [ -n "$val" ] && grep 'lost:' *err | grep -v 'lost: 0 '
--- a/src/dumpblocks.c	Thu Jul 28 17:28:49 2011 -0700
+++ b/src/dumpblocks.c	Tue Aug 02 17:02:39 2011 -0700
@@ -33,7 +33,7 @@
     DEBUG_ENT("main");
 
     printf("Opening file %s\n", file);
-    if (pst_open(&pstfile, file)) {
+    if (pst_open(&pstfile, file, NULL)) {
         printf("Failed to open file %s\n", file);
         exit(1);
     }
--- a/src/getidblock.c	Thu Jul 28 17:28:49 2011 -0700
+++ b/src/getidblock.c	Tue Aug 02 17:02:39 2011 -0700
@@ -117,7 +117,7 @@
 
     DEBUG_INFO(("Opening file\n"));
     memset(&pstfile, 0, sizeof(pstfile));
-    if (pst_open(&pstfile, fname)) {
+    if (pst_open(&pstfile, fname, NULL)) {
         DIE(("Error opening file\n"));
     }
 
--- a/src/libpst.c	Thu Jul 28 17:28:49 2011 -0700
+++ b/src/libpst.c	Tue Aug 02 17:02:39 2011 -0700
@@ -290,7 +290,7 @@
 
 
 
-int pst_open(pst_file *pf, const char *name) {
+int pst_open(pst_file *pf, const char *name, const char *charset) {
     int32_t sig;
 
     pst_unicode_init();
@@ -303,6 +303,7 @@
         return -1;
     }
     memset(pf, 0, sizeof(*pf));
+    pf->charset = charset;
 
     if ((pf->fp = fopen(name, "rb")) == NULL) {
         perror("Error opening PST file");
@@ -1245,6 +1246,7 @@
 
     item = (pst_item*) pst_malloc(sizeof(pst_item));
     memset(item, 0, sizeof(pst_item));
+    item->pf = pf;
 
     if (pst_process(d_ptr->desc->i_id, list, item, NULL)) {
         DEBUG_WARN(("pst_process() returned non-zero value. That is an error\n"));
@@ -4357,10 +4359,11 @@
  *  @return default character set as a string useable by iconv()
  */
 const char*    pst_default_charset(pst_item *item, int buflen, char* result) {
-    return (item->body_charset.str) ? item->body_charset.str :
-           (item->message_codepage) ? codepage(item->message_codepage, buflen, result) :
-           (item->internet_cpid)    ? codepage(item->internet_cpid, buflen, result) :
-           "utf-8";
+    return (item->body_charset.str)         ? item->body_charset.str :  // a charset string stored on the item wins
+           (item->message_codepage)         ? codepage(item->message_codepage, buflen, result) :  // else item->message_codepage, via codepage()
+           (item->internet_cpid)            ? codepage(item->internet_cpid, buflen, result) :  // else item->internet_cpid, via codepage()
+           (item->pf && item->pf->charset)  ? item->pf->charset :  // else the file-wide default handed to pst_open(), if one was set
+           "iso-8859-1";  // last resort; this fallback was "utf-8" before this change
 }
 
 
@@ -4451,15 +4454,25 @@
  *  @param str   pointer to the mapi string of interest
  */
 void pst_convert_utf8(pst_item *item, pst_string *str) {
+    DEBUG_ENT("pst_convert_utf8");
     char buffer[30];
-    if (str->is_utf8) return;
+    if (str->is_utf8) {
+        DEBUG_WARN(("Already utf8\n"));
+        DEBUG_RET();
+        return;
+    }
     if (!str->str) {
         str->str = strdup("");
+        DEBUG_WARN(("null to empty string\n"));
+        DEBUG_RET();
         return;
     }
     const char *charset = pst_default_charset(item, sizeof(buffer), buffer);
-    if (!strcasecmp("utf-8", charset)) return;  // already utf8
-    DEBUG_ENT("pst_convert_utf8");
+    DEBUG_WARN(("default charset is %s\n", charset));
+    if (!strcasecmp("utf-8", charset)) {
+        DEBUG_RET();
+        return;
+    }
     pst_vbuf *newer = pst_vballoc(2);
     size_t rc = pst_vb_8bit2utf8(newer, str->str, strlen(str->str) + 1, charset);
     if (rc == (size_t)-1) {
--- a/src/libpst.h	Thu Jul 28 17:28:49 2011 -0700
+++ b/src/libpst.h	Tue Aug 02 17:02:39 2011 -0700
@@ -760,6 +760,8 @@
  *  each major mapi item type. It represents a complete mapi object.
  */
 typedef struct pst_item {
+    /** pointer to the pst_file */
+    struct pst_file        *pf;
     /** block id that can be used to generate uid */
     uint64_t               block_id;
     /** email mapi elements */
@@ -878,6 +880,8 @@
     char*   cwd;
     /** original file name when the file was opened */
     char*   fname;
+    /** default character set for items without one; may be NULL. Borrowed from the caller of pst_open() (stored, not copied), so it must outlive this pst_file */
+    const char*   charset;
     /** the head and tail of the linked list of index structures */
     pst_index_ll *i_head, *i_tail;
     /** the head and tail of the top level of the descriptor tree */
@@ -916,12 +920,13 @@
 
 
 /** Open a pst file.
- * @param pf   pointer to uninitialized pst_file structure. This structure
- *             will be filled in by this function.
- * @param name name of the file, suitable for fopen().
+ * @param pf       pointer to uninitialized pst_file structure. This structure
+ *                 will be filled in by this function.
+ * @param name     name of the file, suitable for fopen().
+ * @param charset  default charset for items with an unspecified character set; may be NULL. The pointer is stored in pf, not copied, so it must stay valid while pf is in use.
  * @return 0 if ok, -1 if error
  */
-int             pst_open(pst_file *pf, const char *name);
+int             pst_open(pst_file *pf, const char *name, const char *charset);
 
 
 /** Reopen the pst file after a fork
--- a/src/lspst.c	Thu Jul 28 17:28:49 2011 -0700
+++ b/src/lspst.c	Tue Aug 02 17:02:39 2011 -0700
@@ -212,7 +212,7 @@
 	}
 
     // Open PST file
-    if (pst_open(&pstfile, argv[optind])) DIE(("Error opening File\n"));
+    if (pst_open(&pstfile, argv[optind], NULL)) DIE(("Error opening File\n"));
 
     // Load PST index
     if (pst_load_index(&pstfile)) DIE(("Index Error\n"));
--- a/src/pst2dii.cpp.in	Thu Jul 28 17:28:49 2011 -0700
+++ b/src/pst2dii.cpp.in	Tue Aug 02 17:02:39 2011 -0700
@@ -694,7 +694,7 @@
     #endif
     DEBUG_INIT(d_log, NULL);
     DEBUG_ENT("main");
-    RET_DERROR(pst_open(&pstfile, fname), 1, ("Error opening File\n"));
+    RET_DERROR(pst_open(&pstfile, fname, NULL), 1, ("Error opening File\n"));
     RET_DERROR(pst_load_index(&pstfile), 2, ("Index Error\n"));
 
     pst_load_extended_attributes(&pstfile);
--- a/src/pst2ldif.cpp	Thu Jul 28 17:28:49 2011 -0700
+++ b/src/pst2ldif.cpp	Tue Aug 02 17:02:39 2011 -0700
@@ -629,7 +629,7 @@
     #endif
     DEBUG_INIT(d_log, NULL);
     DEBUG_ENT("main");
-    RET_DERROR(pst_open(&pstfile, fname), 1, ("Error opening File\n"));
+    RET_DERROR(pst_open(&pstfile, fname, NULL), 1, ("Error opening File\n"));
     RET_DERROR(pst_load_index(&pstfile), 2, ("Index Error\n"));
 
     pst_load_extended_attributes(&pstfile);
--- a/src/readpst.c	Thu Jul 28 17:28:49 2011 -0700
+++ b/src/readpst.c	Tue Aug 02 17:02:39 2011 -0700
@@ -133,6 +133,7 @@
 int         file_name_len = 10;     // enough room for MODE_SPEARATE file name
 pst_file    pstfile;
 regex_t     meta_charset_pattern;
+char*       default_charset = NULL;
 
 int         number_processors = 1;  // number of cpus we have
 int         max_children  = 0;      // based on number of cpus and command line args
@@ -443,11 +444,20 @@
     }
 
     // command-line option handling
-    while ((c = getopt(argc, argv, "bc:Dd:ehj:kMo:qrSt:uVw"))!= -1) {
+    while ((c = getopt(argc, argv, "bC:c:Dd:ehj:kMo:qrSt:uVw"))!= -1) {
         switch (c) {
         case 'b':
             save_rtf_body = 0;
             break;
+        case 'C':
+            if (optarg) {
+                default_charset = optarg;
+            }
+            else {
+                usage();
+                exit(0);
+            }
+            break;
         case 'c':
             if (optarg && optarg[0]=='v') {
                 contact_mode=CMODE_VCARD;
@@ -597,7 +607,7 @@
 
     if (output_mode != OUTPUT_QUIET) printf("Opening PST file and indexes...\n");
 
-    RET_DERROR(pst_open(&pstfile, fname), 1, ("Error opening File\n"));
+    RET_DERROR(pst_open(&pstfile, fname, default_charset), 1, ("Error opening File\n"));
     RET_DERROR(pst_load_index(&pstfile), 2, ("Index Error\n"));
 
     pst_load_extended_attributes(&pstfile);
@@ -699,6 +709,7 @@
     printf("Usage: %s [OPTIONS] {PST FILENAME}\n", prog_name);
     printf("OPTIONS:\n");
     printf("\t-V\t- Version. Display program version\n");
+    printf("\t-C charset\t- character set for items with unspecified character set\n");
     printf("\t-D\t- Include deleted items in output\n");
     printf("\t-M\t- Write emails in the MH (rfc822) format\n");
     printf("\t-S\t- Separate. Write emails in the separate format\n");