diff src/pst2ldif.cpp @ 151:cda7c812ec01

track character set individually for each mapi element
author Carl Byington <carl@five-ten-sg.com>
date Sun, 08 Mar 2009 14:35:26 -0700
parents fdc58ad2c758
children 252ad66d3d6e
line wrap: on
line diff
--- a/src/pst2ldif.cpp	Thu Mar 05 08:23:32 2009 -0800
+++ b/src/pst2ldif.cpp	Sun Mar 08 14:35:26 2009 -0700
@@ -24,19 +24,19 @@
 void       version(void);
 char       *check_filename(char *fname);
 void        print_ldif_single(const char *attr, const char *value);
-void        print_ldif_address(const char *attr, int nvalues, char *value, ...);
-void        print_ldif_dn(const char *attr, const char *value, const char *base);
-void        print_ldif_multi(const char *dn, const char *value);
-void        print_ldif_two(const char *attr, const char *value1, const char *value2);
+void        print_ldif_single(const char *attr, pst_string value);
+void        print_ldif_address(const char *attr, int nvalues, pst_string value, ...);
+void        print_ldif_dn(const char *attr, pst_string value, const char *base);
+void        print_ldif_multi(const char *dn, pst_string value);
+void        print_ldif_two(const char *attr, pst_string value1, pst_string value2);
 void        print_escaped_dn(const char *value);
-void        build_cn(char *cn, size_t len, int nvalues, char *value, ...);
+void        build_cn(char *cn, size_t len, int nvalues, pst_string value, ...);
 
 char *prog_name;
 pst_file pstfile;
 bool    old_schema            = false;
 char    *ldap_base            = NULL;   // 'o=some.domain.tld,c=US'
 int     ldif_extra_line_count = 0;
-iconv_t cd                    = 0;      // Character set conversion descriptor
 vector<string> ldap_class;              // 'newPerson' or 'inetOrgPerson'
 vector<string> ldif_extra_line;         // 'o: myorg'
 
@@ -108,21 +108,64 @@
 
 static void process(pst_desc_ll *d_ptr);
 static void process(pst_desc_ll *d_ptr) {
+    DEBUG_ENT("process");
     pst_item *item = NULL;
     while (d_ptr) {
         if (d_ptr->desc) {
             item = pst_parse_item(&pstfile, d_ptr, NULL);
             DEBUG_INFO(("item pointer is %p\n", item));
             if (item) {
-                if (item->folder && d_ptr->child && strcasecmp(item->file_as, "Deleted Items")) {
+                if (item->folder && d_ptr->child && item->file_as.str && strcasecmp(item->file_as.str, "Deleted Items")) {
                     //if this is a non-empty folder other than deleted items, we want to recurse into it
-                    fprintf(stderr, "entering folder %s\n", item->file_as);
+                    fprintf(stderr, "entering folder %s\n", item->file_as.str);
                     process(d_ptr->child);
 
                 } else if (item->contact && (item->type == PST_TYPE_CONTACT)) {
                     // deal with a contact
                     char cn[1000];
 
+                    // convert everything to utf8
+                    pst_convert_utf8_null(item, &item->contact->display_name_prefix);
+                    pst_convert_utf8_null(item, &item->contact->first_name);
+                    pst_convert_utf8_null(item, &item->contact->surname);
+                    pst_convert_utf8_null(item, &item->contact->suffix);
+                    pst_convert_utf8_null(item, &item->contact->company_name);
+                    pst_convert_utf8_null(item, &item->contact->job_title);
+                    pst_convert_utf8_null(item, &item->contact->address1);
+                    pst_convert_utf8_null(item, &item->contact->address2);
+                    pst_convert_utf8_null(item, &item->contact->address3);
+                    pst_convert_utf8_null(item, &item->contact->address1a);
+                    pst_convert_utf8_null(item, &item->contact->address2a);
+                    pst_convert_utf8_null(item, &item->contact->address3a);
+                    pst_convert_utf8_null(item, &item->contact->business_address);
+                    pst_convert_utf8_null(item, &item->contact->business_po_box);
+                    pst_convert_utf8_null(item, &item->contact->business_street);
+                    pst_convert_utf8_null(item, &item->contact->business_city);
+                    pst_convert_utf8_null(item, &item->contact->business_state);
+                    pst_convert_utf8_null(item, &item->contact->business_postal_code);
+                    pst_convert_utf8_null(item, &item->contact->home_address);
+                    pst_convert_utf8_null(item, &item->contact->home_po_box);
+                    pst_convert_utf8_null(item, &item->contact->home_street);
+                    pst_convert_utf8_null(item, &item->contact->home_city);
+                    pst_convert_utf8_null(item, &item->contact->home_state);
+                    pst_convert_utf8_null(item, &item->contact->home_postal_code);
+                    pst_convert_utf8_null(item, &item->contact->other_address);
+                    pst_convert_utf8_null(item, &item->contact->other_po_box);
+                    pst_convert_utf8_null(item, &item->contact->other_street);
+                    pst_convert_utf8_null(item, &item->contact->other_city);
+                    pst_convert_utf8_null(item, &item->contact->other_state);
+                    pst_convert_utf8_null(item, &item->contact->other_postal_code);
+                    pst_convert_utf8_null(item, &item->contact->business_fax);
+                    pst_convert_utf8_null(item, &item->contact->home_fax);
+                    pst_convert_utf8_null(item, &item->contact->business_phone);
+                    pst_convert_utf8_null(item, &item->contact->home_phone);
+                    pst_convert_utf8_null(item, &item->contact->car_phone);
+                    pst_convert_utf8_null(item, &item->contact->mobile_phone);
+                    pst_convert_utf8_null(item, &item->contact->other_phone);
+                    pst_convert_utf8_null(item, &item->contact->business_homepage);
+                    pst_convert_utf8_null(item, &item->contact->personal_homepage);
+                    pst_convert_utf8_null(item, &item->comment);
+
                     build_cn(cn, sizeof(cn), 4,
                         item->contact->display_name_prefix,
                         item->contact->first_name,
@@ -130,93 +173,94 @@
                         item->contact->suffix);
                     if (cn[0] != 0) {
                         // have a valid cn
-                        const char *ucn = unique_string(cn);
+                        pst_string ucn;
+                        ucn.str = (char*)unique_string(cn);
 
                         print_ldif_dn("dn", ucn, ldap_base);
                         print_ldif_single("cn", ucn);
-                        if (item->contact->first_name) {
+                        if (item->contact->first_name.str) {
                             print_ldif_two("givenName",
                                            item->contact->display_name_prefix,
                                            item->contact->first_name);
                         }
-                        if (item->contact->surname) {
+                        if (item->contact->surname.str) {
                             print_ldif_two("sn",
                                            item->contact->surname,
                                            item->contact->suffix);
                         }
-                        else if (item->contact->company_name) {
+                        else if (item->contact->company_name.str) {
                             print_ldif_single("sn", item->contact->company_name);
                         }
                         else
                             print_ldif_single("sn", ucn); // use cn as sn if we cannot find something better
 
                         if (old_schema) {
-                            if (item->contact->job_title)
+                            if (item->contact->job_title.str)
                                 print_ldif_single("personalTitle", item->contact->job_title);
-                            if (item->contact->company_name)
+                            if (item->contact->company_name.str)
                                 print_ldif_single("company", item->contact->company_name);
                         }
                         else {
                             // new schema
-                            if (item->contact->job_title)
+                            if (item->contact->job_title.str)
                                 print_ldif_single("title", item->contact->job_title);
-                            if (item->contact->company_name)
+                            if (item->contact->company_name.str)
                                 print_ldif_single("o", item->contact->company_name);
                         }
-                        if (item->contact->address1  && *item->contact->address1)
+                        if (item->contact->address1.str  && *item->contact->address1.str)
                             print_ldif_single("mail", item->contact->address1);
-                        if (item->contact->address2  && *item->contact->address2)
+                        if (item->contact->address2.str  && *item->contact->address2.str)
                             print_ldif_single("mail", item->contact->address2);
-                        if (item->contact->address3  && *item->contact->address3)
+                        if (item->contact->address3.str  && *item->contact->address3.str)
                             print_ldif_single("mail", item->contact->address3);
-                        if (item->contact->address1a && *item->contact->address1a)
+                        if (item->contact->address1a.str && *item->contact->address1a.str)
                             print_ldif_single("mail", item->contact->address1a);
-                        if (item->contact->address2a && *item->contact->address2a)
+                        if (item->contact->address2a.str && *item->contact->address2a.str)
                             print_ldif_single("mail", item->contact->address2a);
-                        if (item->contact->address3a && *item->contact->address3a)
+                        if (item->contact->address3a.str && *item->contact->address3a.str)
                             print_ldif_single("mail", item->contact->address3a);
 
                         if (old_schema) {
-                            if (item->contact->business_address) {
-                                if (item->contact->business_po_box)
+                            if (item->contact->business_address.str) {
+                                if (item->contact->business_po_box.str)
                                     print_ldif_single("postalAddress", item->contact->business_po_box);
-                                if (item->contact->business_street)
+                                if (item->contact->business_street.str)
                                     print_ldif_multi("postalAddress", item->contact->business_street);
-                                if (item->contact->business_city)
+                                if (item->contact->business_city.str)
                                     print_ldif_single("l", item->contact->business_city);
-                                if (item->contact->business_state)
+                                if (item->contact->business_state.str)
                                     print_ldif_single("st", item->contact->business_state);
-                                if (item->contact->business_postal_code)
+                                if (item->contact->business_postal_code.str)
                                     print_ldif_single("postalCode", item->contact->business_postal_code);
                             }
-                            else if (item->contact->home_address) {
-                                if (item->contact->home_po_box)
+                            else if (item->contact->home_address.str) {
+                                if (item->contact->home_po_box.str)
                                     print_ldif_single("postalAddress", item->contact->home_po_box);
-                                if (item->contact->home_street)
+                                if (item->contact->home_street.str)
                                     print_ldif_multi("postalAddress", item->contact->home_street);
-                                if (item->contact->home_city)
+                                if (item->contact->home_city.str)
                                     print_ldif_single("l", item->contact->home_city);
-                                if (item->contact->home_state)
+                                if (item->contact->home_state.str)
                                     print_ldif_single("st", item->contact->home_state);
-                                if (item->contact->home_postal_code)
+                                if (item->contact->home_postal_code.str)
                                     print_ldif_single("postalCode", item->contact->home_postal_code);
                             }
-                            else if (item->contact->other_address) {
-                                if (item->contact->other_po_box)
+                            else if (item->contact->other_address.str) {
+                                if (item->contact->other_po_box.str)
                                     print_ldif_single("postalAddress", item->contact->other_po_box);
-                                if (item->contact->other_street)
+                                if (item->contact->other_street.str)
                                     print_ldif_multi("postalAddress", item->contact->other_street);
-                                if (item->contact->other_city)
+                                if (item->contact->other_city.str)
                                     print_ldif_single("l", item->contact->other_city);
-                                if (item->contact->other_state)
+                                if (item->contact->other_state.str)
                                     print_ldif_single("st", item->contact->other_state);
-                                if (item->contact->other_postal_code)
+                                if (item->contact->other_postal_code.str)
                                     print_ldif_single("postalCode", item->contact->other_postal_code);
                             }
                         }
                         else {
                             // new schema, with proper RFC4517 postal addresses
-                            if (item->contact->business_address) {
+                            if (item->contact->business_address.str) {
                                 print_ldif_address("postalAddress", 6,
                                     item->contact->business_po_box,
                                     item->contact->business_street,
@@ -224,22 +268,22 @@
                                     item->contact->business_state,
                                     item->contact->business_postal_code,
                                     item->contact->business_country);
-                                if (item->contact->business_city)
+                                if (item->contact->business_city.str)
                                     print_ldif_single("l", item->contact->business_city);
-                                if (item->contact->business_state)
+                                if (item->contact->business_state.str)
                                     print_ldif_single("st", item->contact->business_state);
-                                if (item->contact->business_postal_code)
+                                if (item->contact->business_postal_code.str)
                                     print_ldif_single("postalCode", item->contact->business_postal_code);
                             }
-                            else if (item->contact->home_address) {
-                                if (item->contact->home_city)
+                            else if (item->contact->home_address.str) {
+                                if (item->contact->home_city.str)
                                     print_ldif_single("l", item->contact->home_city);
-                                if (item->contact->home_state)
+                                if (item->contact->home_state.str)
                                     print_ldif_single("st", item->contact->home_state);
-                                if (item->contact->home_postal_code)
+                                if (item->contact->home_postal_code.str)
                                     print_ldif_single("postalCode", item->contact->home_postal_code);
                             }
-                            else if (item->contact->other_address) {
+                            else if (item->contact->other_address.str) {
                                 print_ldif_address("postalAddress", 6,
                                     item->contact->other_po_box,
                                     item->contact->other_street,
@@ -247,14 +291,14 @@
                                     item->contact->other_state,
                                     item->contact->other_postal_code,
                                     item->contact->other_country);
-                                if (item->contact->other_city)
+                                if (item->contact->other_city.str)
                                     print_ldif_single("l", item->contact->other_city);
-                                if (item->contact->other_state)
+                                if (item->contact->other_state.str)
                                     print_ldif_single("st", item->contact->other_state);
-                                if (item->contact->other_postal_code)
+                                if (item->contact->other_postal_code.str)
                                     print_ldif_single("postalCode", item->contact->other_postal_code);
                             }
-                            if (item->contact->home_address) {
+                            if (item->contact->home_address.str) {
                                 print_ldif_address("homePostalAddress", 6,
                                     item->contact->home_po_box,
                                     item->contact->home_street,
@@ -265,31 +309,31 @@
                             }
                         }
 
-                        if (item->contact->business_fax)
+                        if (item->contact->business_fax.str)
                             print_ldif_single("facsimileTelephoneNumber", item->contact->business_fax);
-                        else if (item->contact->home_fax)
+                        else if (item->contact->home_fax.str)
                             print_ldif_single("facsimileTelephoneNumber", item->contact->home_fax);
 
-                        if (item->contact->business_phone)
+                        if (item->contact->business_phone.str)
                             print_ldif_single("telephoneNumber", item->contact->business_phone);
-                        if (item->contact->home_phone)
+                        if (item->contact->home_phone.str)
                             print_ldif_single("homePhone", item->contact->home_phone);
 
-                        if (item->contact->car_phone)
+                        if (item->contact->car_phone.str)
                             print_ldif_single("mobile", item->contact->car_phone);
-                        else if (item->contact->mobile_phone)
+                        else if (item->contact->mobile_phone.str)
                             print_ldif_single("mobile", item->contact->mobile_phone);
-                        else if (item->contact->other_phone)
+                        else if (item->contact->other_phone.str)
                             print_ldif_single("mobile", item->contact->other_phone);
 
                         if (!old_schema) {
-                            if (item->contact->business_homepage)
+                            if (item->contact->business_homepage.str)
                                 print_ldif_single("labeledURI", item->contact->business_homepage);
-                            if (item->contact->personal_homepage)
+                            if (item->contact->personal_homepage.str)
                                 print_ldif_single("labeledURI", item->contact->personal_homepage);
                         }
 
-                        if (item->comment)
+                        if (item->comment.str)
                             print_ldif_single("description", item->comment);
 
                         for (vector<string>::size_type i=0; i<ldap_class.size(); i++)
@@ -305,6 +349,13 @@
         }
         d_ptr = d_ptr->next;
     }
+    DEBUG_RET();
+}
+
+
+void print_ldif_single(const char *attr, pst_string value)
+{
+    print_ldif_single(attr, value.str);
 }
 
 
@@ -315,7 +366,6 @@
 {
     size_t len;
     bool is_safe_string = true;
-    bool needs_code_conversion = false;
     bool space_flag = false;
 
     // Strip leading spaces
@@ -341,7 +391,6 @@
         }
         else {
             if ((ch & 0x80) == 0x80) {
-                needs_code_conversion = true;
                 is_safe_string = false;
             }
             if (space_flag) {
@@ -354,37 +403,20 @@
     *p = 0;
     if (is_safe_string) {
         printf("%s: %s\n", attr, &buffer[0]);
-        return;
     }
-
-    if (needs_code_conversion && cd != 0) {
-        size_t inlen = p - &buffer[0];
-        size_t utf8_len = 2 * inlen + 1;
-        vector<char> utf8_buffer(utf8_len);
-        char *utf8_p = &utf8_buffer[0];
-
-        iconv(cd, NULL, NULL, NULL, NULL);
-        p = &buffer[0];
-        int ret = iconv(cd, (ICONV_CONST char**)&p, &inlen, &utf8_p, &utf8_len);
-
-        if (ret >= 0) {
-            *utf8_p = 0;
-            p = base64_encode(&utf8_buffer[0], utf8_p - &utf8_buffer[0]);
-        }
-        else
-            p = base64_encode(&buffer[0], buffer.size());
+    else {
+        p = base64_encode(&buffer[0], buffer.size());
+        printf("%s:: %s\n", attr, p);
+        free(p);
     }
-    else
-        p = base64_encode(&buffer[0], buffer.size());
-    printf("%s:: %s\n", attr, p);
-    free(p);
 }
 
 
 // Combines values representing address lines into an address,i
 // lines separated with "$" as per PostalAddress syntax in RFC4517
-void print_ldif_address(const char *attr, int nvalues, char *value, ...)
+void print_ldif_address(const char *attr, int nvalues, pst_string value, ...)
 {
+    DEBUG_ENT("print_ldif_address");
     bool space_flag = false;
     bool newline_flag = false;
     char *address = NULL;    // Buffer where address is built up
@@ -393,26 +425,26 @@
     va_list ap;
 
     va_start(ap, value);
+    while (!value.str) {
+        nvalues--;
+        if (nvalues == 0) {    // Nothing at all to do!
+            va_end(ap);
+            DEBUG_RET();
+            return;
+        }
+        value = va_arg(ap, pst_string);
+    }
 
-    while (!value) {
-       nvalues--;
-       if (nvalues == 0) {    // Nothing at all to do!
-           va_end(ap);
-           return;
-       }
-       value = va_arg(ap, char *);
-    }
     for (;;) {
-        char ch = *value++;
+        char ch = *(value.str)++;
 
-        if (ch == 0 || ch == '\n') {
+        if (ch == 0) {
             do {
-                value = NULL;
                 nvalues--;
                 if (nvalues == 0) break;
-                value = va_arg(ap, char *);
-            } while (!value);
-            if (!value) break;
+                value = va_arg(ap, pst_string);
+            } while (!value.str);
+            if (!nvalues || !value.str) break;
             space_flag = true;
             newline_flag = true;
         }
@@ -449,46 +481,48 @@
     address[i] = 0;
     print_ldif_single(attr, address);
     free(address);
+    DEBUG_RET();
 }
 
 
-void print_ldif_multi(const char *dn, const char *value)
+void print_ldif_multi(const char *dn, pst_string value)
 {
-    const char *n;
-    while ((n = strchr(value, '\n'))) {
-        print_ldif_single(dn, value);
-        value = n + 1;
+    char *n;
+    char *valuestr = value.str;
+    while ((n = strchr(valuestr, '\n'))) {
+        print_ldif_single(dn, valuestr);
+        valuestr = n + 1;
     }
-    print_ldif_single(dn, value);
+    print_ldif_single(dn, valuestr);
 }
 
 
-void print_ldif_two(const char *attr, const char *value1, const char *value2)
+void print_ldif_two(const char *attr, pst_string value1, pst_string value2)
 {
     size_t len1, len2;
-    if (value1 && *value1)
-        len1 = strlen(value1);
+    if (value1.str && *value1.str)
+        len1 = strlen(value1.str);
     else {
         print_ldif_single(attr, value2);
         return;
     }
 
-    if (value2 && *value2)
-        len2 = strlen(value2);
+    if (value2.str && *value2.str)
+        len2 = strlen(value2.str);
     else {
         print_ldif_single(attr, value1);
         return;
     }
 
     vector<char> value(len1 + len2 + 2);
-    memcpy(&value[0], value1, len1);
+    memcpy(&value[0], value1.str, len1);
     value[len1] = ' ';
-    memcpy(&value[0] + len1 + 1, value2, len2 + 1);
+    memcpy(&value[0] + len1 + 1, value2.str, len2 + 1);
     print_ldif_single(attr, &value[0]);
 }
 
 
-void build_cn(char *cn, size_t len, int nvalues, char *value, ...)
+void build_cn(char *cn, size_t len, int nvalues, pst_string value, ...)
 {
     bool space_flag = false;
     size_t i = 0;
@@ -496,26 +530,25 @@
 
     va_start(ap, value);
 
-    while (!value) {
+    while (!value.str) {
        nvalues--;
        if (nvalues == 0) {
            cn[0] = 0;   // Just a terminating NUL
            va_end(ap);
            return;
        }
-       value = va_arg(ap, char *);
+       value = va_arg(ap, pst_string);
     }
     for (;;) {
-        char ch = *value++;
+        char ch = *(value.str)++;
 
         if (ch == 0 || ch == '\n') {
             do {
-                value = NULL;
                 nvalues--;
                 if (nvalues == 0) break;
-                value = va_arg(ap, char *);
-            } while (!value);
-            if (!value) break;
+                value = va_arg(ap, pst_string);
+            } while (!value.str);
+            if (!nvalues || !value.str) break;
             space_flag = true;
         }
         else if (ch == '\r')
@@ -549,7 +582,7 @@
     prog_name = argv[0];
     pst_item *item = NULL;
 
-    while ((c = getopt(argc, argv, "b:c:C:d:l:oVh"))!= -1) {
+    while ((c = getopt(argc, argv, "b:c:d:l:oVh"))!= -1) {
         switch (c) {
         case 'b':
             ldap_base = optarg;
@@ -557,14 +590,6 @@
         case 'c':
             ldap_class.push_back(string(optarg));
             break;
-        case 'C':
-            cd = iconv_open("UTF-8", optarg);
-            if (cd == (iconv_t)(-1)) {
-                fprintf(stderr, "I don't know character set \"%s\"!\n\n", optarg);
-                fprintf(stderr, "Type: \"iconv --list\" to get list of known character sets\n");
-                return 1;
-            }
-            break;
         case 'd':
             d_log = optarg;
             break;
@@ -645,8 +670,6 @@
     pst_close(&pstfile);
     DEBUG_RET();
     free_strings(all_strings);
-    if (cd) iconv_close(cd);
-
     return 0;
 }
 
@@ -656,7 +679,6 @@
     printf("Usage: %s [OPTIONS] {PST FILENAME}\n", prog_name);
     printf("OPTIONS:\n");
     printf("\t-V\t- Version. Display program version\n");
-    printf("\t-C charset\t- assumed character set of non-ASCII characters\n");
     printf("\t-b ldapbase\t- set the LDAP base value\n");
     printf("\t-c class\t- set the class of the LDAP objects (may contain more than one)\n");
     printf("\t-d <filename>\t- Debug to file. This is a binary log. Use readpstlog to print it\n");
@@ -695,14 +717,15 @@
 
 
 // This function escapes Distinguished Names (as per RFC4514)
-void print_ldif_dn(const char *attr, const char *value, const char *base)
+void print_ldif_dn(const char *attr, pst_string value, const char *base)
 {
     printf("dn: cn=");
+    const char *valuestr = value.str;
     // remove leading spaces (RFC says escape them)
-    while (*value == ' ')
-        value++;
+    while (*valuestr == ' ')
+        valuestr++;
 
-    print_escaped_dn(value);
+    print_escaped_dn(valuestr);
     if (base && base[0]) {
         printf(", %s", base);
     }
@@ -714,34 +737,6 @@
 void print_escaped_dn(const char *value)
 {
     char ch;
-    bool needs_code_conversion = false;
-    char *utf8_buffer = NULL;
-
-    // First do a quick scan to see if any code conversion is required
-    if (cd) {
-        const char *p = value;
-        while (*p) {
-            if (*p++ & 0x80) {
-                needs_code_conversion = true;
-                break;
-            }
-        }
-    }
-
-    if (needs_code_conversion) {
-        size_t inlen = strlen(value);
-        size_t utf8_len = 2 * inlen + 1;
-        char *p = (char *)value;
-        char *utf8_p = utf8_buffer;
-
-        utf8_buffer = (char *)malloc(utf8_len);
-        utf8_p = utf8_buffer;
-        iconv(cd, NULL, NULL, NULL, NULL);
-        if (iconv(cd, (ICONV_CONST char**)&p, &inlen, &utf8_p, &utf8_len) >= 0) {
-            *utf8_p = 0;
-            value = utf8_buffer;
-        }
-    }
 
     // escape initial '#' and space
     if (*value == '#' || *value == ' ')
@@ -765,6 +760,5 @@
                 putchar(ch);
         }
     }
-    if (utf8_buffer) free((void *)utf8_buffer);
     return;
 }