view src/pst2ldif.cpp @ 308:97c53c6868ab

add -m option to readpst to create Outlook .msg files
author Carl Byington <carl@five-ten-sg.com>
date Sun, 13 Dec 2009 14:48:20 -0800
parents 67b24d6a45d6
children 898118c3675e
line wrap: on
line source

/*

Copyright (c) 2004 Carl Byington - 510 Software Group, released under
the GPL version 2 or any later version at your choice available at
http://www.fsf.org/licenses/gpl.txt

Based on readpst.c by David Smith

*/

using namespace std;

// needed for std c++ collections
#include <set>
#include <vector>
#include <string>

extern "C" {
    #include "define.h"
    #include "lzfu.h"
}

// Forward declarations for functions that are defined further down in this
// file but are called before their definition.

// command line help and version banners, both written to stdout
void       usage(void);
void       version(void);
// replaces every '/', '\' and ':' in fname with '_' (in place), returns fname
char       *check_filename(char *fname);
// print one "attr: value" line, base64 encoded when the value is not a
// SAFE-STRING; the pst_string overload forwards value.str
void        print_ldif_single(const char *attr, const char *value);
void        print_ldif_single(const char *attr, pst_string value);
// join up to nvalues address components with '$' and print them as one value
void        print_ldif_address(const char *attr, int nvalues, pst_string value, ...);
// print the "dn: cn=<escaped value>, <base>" line that starts an entry
void        print_ldif_dn(const char *attr, pst_string value, const char *base);
// print one attribute line per '\n' separated line of value
void        print_ldif_multi(const char *dn, pst_string value);
// print value1 and value2 joined by a single space as one attribute value
void        print_ldif_two(const char *attr, pst_string value1, pst_string value2);
// write value to stdout with the distinguished name escaping applied
void        print_escaped_dn(const char *value);
// build a space separated common name from up to nvalues components into
// the caller supplied buffer cn of len bytes
void        build_cn(char *cn, size_t len, int nvalues, pst_string value, ...);

// Program wide state, filled in by main() from the command line.
char *prog_name;                        // argv[0], shown in the usage text
pst_file pstfile;                       // the pst file opened by main() and read by process()
bool    old_schema            = false;  // set by -o
char    *ldap_base            = NULL;   // set by -b, 'o=some.domain.tld,c=US'
int     ldif_extra_line_count = 0;      // not referenced anywhere else in the visible source
vector<string> ldap_class;              // one entry per -c, 'newPerson' or 'inetOrgPerson'
vector<string> ldif_extra_line;         // one entry per -l, 'o: myorg'


////////////////////////////////////////////////
// case insensitive "less than" for C strings, used as the
// ordering predicate of our string set
struct ltstr {
    bool operator()(const char* s1, const char* s2) const {
        const int order = strcasecmp(s1, s2);
        return (order < 0);
    }
};
// A set of C string pointers ordered (and therefore de-duplicated) without
// regard to case by ltstr. The set stores only the pointers; the strings
// are allocated by register_string() and released by free_strings().
typedef set<const char *, ltstr>    string_set;
// The one static set that holds every cn value handed out so far, so that
// unique_string() can detect a name that was already used.
static string_set all_strings;


////////////////////////////////////////////////
// release every string held by a set and then empty the set
//
static void free_strings(string_set &s);
static void free_strings(string_set &s)
{
    string_set::iterator cur = s.begin();
    const string_set::iterator last = s.end();
    while (cur != last) {
        // the set stores pointers to const, free() wants a plain pointer
        free((void*)*cur);
        ++cur;
    }
    // drop the now dangling pointers
    s.clear();
}


////////////////////////////////////////////////
// return the set's own copy of name, adding a freshly
// allocated duplicate when the set has no equal string yet
//
static const char* register_string(string_set &s, const char *name);
static const char* register_string(string_set &s, const char *name) {
    const string_set::const_iterator found = s.find(name);
    if (found == s.end()) {
        // first time we see this name, the set keeps the duplicate
        char *copy = strdup(name);
        s.insert(copy);
        return copy;
    }
    return *found;
}


////////////////////////////////////////////////
// register a string in the global set of strings
//
static const char* register_string(const char *name);
static const char* register_string(const char *name) {
    const char *registered = register_string(all_strings, name);
    return registered;
}


////////////////////////////////////////////////
// make a unique string
//
static const char* unique_string(const char *name);
static const char* unique_string(const char *name) {
    int  unique = 2;
    string_set::iterator i = all_strings.find(name);
    if (i == all_strings.end()) return register_string(name);
    while (true) {
        vector<char> n(strlen(name)+10);
        snprintf(&n[0], n.size(), "%s %d", name, unique++);
        string_set::iterator i = all_strings.find(&n[0]);
        if (i == all_strings.end()) return register_string(&n[0]);
    }
}


static void process(pst_desc_tree *d_ptr);
static void process(pst_desc_tree *d_ptr) {
    DEBUG_ENT("process");
    pst_item *item = NULL;
    while (d_ptr) {
        if (d_ptr->desc) {
            item = pst_parse_item(&pstfile, d_ptr, NULL);
            DEBUG_INFO(("item pointer is %p\n", item));
            if (item) {
                if (item->folder && d_ptr->child && item->file_as.str && strcasecmp(item->file_as.str, "Deleted Items")) {
                    //if this is a non-empty folder other than deleted items, we want to recurse into it
                    fprintf(stderr, "entering folder %s\n", item->file_as.str);
                    process(d_ptr->child);

                } else if (item->contact && (item->type == PST_TYPE_CONTACT)) {
                    // deal with a contact
                    char cn[1000];

                    // convert everything to utf8
                    pst_convert_utf8_null(item, &item->contact->display_name_prefix);
                    pst_convert_utf8_null(item, &item->contact->first_name);
                    pst_convert_utf8_null(item, &item->contact->surname);
                    pst_convert_utf8_null(item, &item->contact->suffix);
                    pst_convert_utf8_null(item, &item->contact->company_name);
                    pst_convert_utf8_null(item, &item->contact->job_title);
                    pst_convert_utf8_null(item, &item->contact->address1);
                    pst_convert_utf8_null(item, &item->contact->address2);
                    pst_convert_utf8_null(item, &item->contact->address3);
                    pst_convert_utf8_null(item, &item->contact->address1a);
                    pst_convert_utf8_null(item, &item->contact->address2a);
                    pst_convert_utf8_null(item, &item->contact->address3a);
                    pst_convert_utf8_null(item, &item->contact->business_address);
                    pst_convert_utf8_null(item, &item->contact->business_po_box);
                    pst_convert_utf8_null(item, &item->contact->business_street);
                    pst_convert_utf8_null(item, &item->contact->business_city);
                    pst_convert_utf8_null(item, &item->contact->business_state);
                    pst_convert_utf8_null(item, &item->contact->business_postal_code);
                    pst_convert_utf8_null(item, &item->contact->home_address);
                    pst_convert_utf8_null(item, &item->contact->home_po_box);
                    pst_convert_utf8_null(item, &item->contact->home_street);
                    pst_convert_utf8_null(item, &item->contact->home_city);
                    pst_convert_utf8_null(item, &item->contact->home_state);
                    pst_convert_utf8_null(item, &item->contact->home_postal_code);
                    pst_convert_utf8_null(item, &item->contact->other_address);
                    pst_convert_utf8_null(item, &item->contact->other_po_box);
                    pst_convert_utf8_null(item, &item->contact->other_street);
                    pst_convert_utf8_null(item, &item->contact->other_city);
                    pst_convert_utf8_null(item, &item->contact->other_state);
                    pst_convert_utf8_null(item, &item->contact->other_postal_code);
                    pst_convert_utf8_null(item, &item->contact->business_fax);
                    pst_convert_utf8_null(item, &item->contact->home_fax);
                    pst_convert_utf8_null(item, &item->contact->business_phone);
                    pst_convert_utf8_null(item, &item->contact->home_phone);
                    pst_convert_utf8_null(item, &item->contact->car_phone);
                    pst_convert_utf8_null(item, &item->contact->mobile_phone);
                    pst_convert_utf8_null(item, &item->contact->other_phone);
                    pst_convert_utf8_null(item, &item->contact->business_homepage);
                    pst_convert_utf8_null(item, &item->contact->personal_homepage);
                    pst_convert_utf8_null(item, &item->comment);

                    build_cn(cn, sizeof(cn), 4,
                        item->contact->display_name_prefix,
                        item->contact->first_name,
                        item->contact->surname,
                        item->contact->suffix);
                    if (cn[0] != 0) {
                        // have a valid cn
                        pst_string ucn;
                        ucn.str     = (char*)unique_string(cn);
                        ucn.is_utf8 = 1;    // all the components are already utf8

                        print_ldif_dn("dn", ucn, ldap_base);
                        print_ldif_single("cn", ucn);
                        if (item->contact->first_name.str) {
                            print_ldif_two("givenName",
                                           item->contact->display_name_prefix,
                                           item->contact->first_name);
                        }
                        if (item->contact->surname.str) {
                            print_ldif_two("sn",
                                           item->contact->surname,
                                           item->contact->suffix);
                        }
                        else if (item->contact->company_name.str) {
                            print_ldif_single("sn", item->contact->company_name);
                        }
                        else
                            print_ldif_single("sn", ucn); // use cn as sn if we cannot find something better

                        if (old_schema) {
                            if (item->contact->job_title.str)
                                print_ldif_single("personalTitle", item->contact->job_title);
                            if (item->contact->company_name.str)
                                print_ldif_single("company", item->contact->company_name);
                        }
                        else {
                            // new schema
                            if (item->contact->job_title.str)
                                print_ldif_single("title", item->contact->job_title);
                            if (item->contact->company_name.str)
                                print_ldif_single("o", item->contact->company_name);
                        }
                        if (item->contact->address1.str  && *item->contact->address1.str)
                            print_ldif_single("mail", item->contact->address1);
                        if (item->contact->address2.str  && *item->contact->address2.str)
                            print_ldif_single("mail", item->contact->address2);
                        if (item->contact->address3.str  && *item->contact->address3.str)
                            print_ldif_single("mail", item->contact->address3);
                        if (item->contact->address1a.str && *item->contact->address1a.str)
                            print_ldif_single("mail", item->contact->address1a);
                        if (item->contact->address2a.str && *item->contact->address2a.str)
                            print_ldif_single("mail", item->contact->address2a);
                        if (item->contact->address3a.str && *item->contact->address3a.str)
                            print_ldif_single("mail", item->contact->address3a);

                        if (old_schema) {
                            if (item->contact->business_address.str) {
                                if (item->contact->business_po_box.str)
                                    print_ldif_single("postalAddress", item->contact->business_po_box);
                                if (item->contact->business_street.str)
                                    print_ldif_multi("postalAddress", item->contact->business_street);
                                if (item->contact->business_city.str)
                                    print_ldif_single("l", item->contact->business_city);
                                if (item->contact->business_state.str)
                                    print_ldif_single("st", item->contact->business_state);
                                if (item->contact->business_postal_code.str)
                                    print_ldif_single("postalCode", item->contact->business_postal_code);
                            }
                            else if (item->contact->home_address.str) {
                                if (item->contact->home_po_box.str)
                                    print_ldif_single("postalAddress", item->contact->home_po_box);
                                if (item->contact->home_street.str)
                                    print_ldif_multi("postalAddress", item->contact->home_street);
                                if (item->contact->home_city.str)
                                    print_ldif_single("l", item->contact->home_city);
                                if (item->contact->home_state.str)
                                    print_ldif_single("st", item->contact->home_state);
                                if (item->contact->home_postal_code.str)
                                    print_ldif_single("postalCode", item->contact->home_postal_code);
                            }
                            else if (item->contact->other_address.str) {
                                if (item->contact->other_po_box.str)
                                    print_ldif_single("postalAddress", item->contact->other_po_box);
                                if (item->contact->other_street.str)
                                    print_ldif_multi("postalAddress", item->contact->other_street);
                                if (item->contact->other_city.str)
                                    print_ldif_single("l", item->contact->other_city);
                                if (item->contact->other_state.str)
                                    print_ldif_single("st", item->contact->other_state);
                                if (item->contact->other_postal_code.str)
                                    print_ldif_single("postalCode", item->contact->other_postal_code);
                            }
                        }
                        else {
                            // new schema, with proper RFC4517 postal addresses
                            if (item->contact->business_address.str) {
                                print_ldif_address("postalAddress", 6,
                                    item->contact->business_po_box,
                                    item->contact->business_street,
                                    item->contact->business_city,
                                    item->contact->business_state,
                                    item->contact->business_postal_code,
                                    item->contact->business_country);
                                if (item->contact->business_city.str)
                                    print_ldif_single("l", item->contact->business_city);
                                if (item->contact->business_state.str)
                                    print_ldif_single("st", item->contact->business_state);
                                if (item->contact->business_postal_code.str)
                                    print_ldif_single("postalCode", item->contact->business_postal_code);
                            }
                            else if (item->contact->home_address.str) {
                                if (item->contact->home_city.str)
                                    print_ldif_single("l", item->contact->home_city);
                                if (item->contact->home_state.str)
                                    print_ldif_single("st", item->contact->home_state);
                                if (item->contact->home_postal_code.str)
                                    print_ldif_single("postalCode", item->contact->home_postal_code);
                            }
                            else if (item->contact->other_address.str) {
                                print_ldif_address("postalAddress", 6,
                                    item->contact->other_po_box,
                                    item->contact->other_street,
                                    item->contact->other_city,
                                    item->contact->other_state,
                                    item->contact->other_postal_code,
                                    item->contact->other_country);
                                if (item->contact->other_city.str)
                                    print_ldif_single("l", item->contact->other_city);
                                if (item->contact->other_state.str)
                                    print_ldif_single("st", item->contact->other_state);
                                if (item->contact->other_postal_code.str)
                                    print_ldif_single("postalCode", item->contact->other_postal_code);
                            }
                            if (item->contact->home_address.str) {
                                print_ldif_address("homePostalAddress", 6,
                                    item->contact->home_po_box,
                                    item->contact->home_street,
                                    item->contact->home_city,
                                    item->contact->home_state,
                                    item->contact->home_postal_code,
                                    item->contact->home_country);
                            }
                        }

                        if (item->contact->business_fax.str)
                            print_ldif_single("facsimileTelephoneNumber", item->contact->business_fax);
                        else if (item->contact->home_fax.str)
                            print_ldif_single("facsimileTelephoneNumber", item->contact->home_fax);

                        if (item->contact->business_phone.str)
                            print_ldif_single("telephoneNumber", item->contact->business_phone);
                        if (item->contact->home_phone.str)
                            print_ldif_single("homePhone", item->contact->home_phone);

                        if (item->contact->car_phone.str)
                            print_ldif_single("mobile", item->contact->car_phone);
                        else if (item->contact->mobile_phone.str)
                            print_ldif_single("mobile", item->contact->mobile_phone);
                        else if (item->contact->other_phone.str)
                            print_ldif_single("mobile", item->contact->other_phone);

                        if (!old_schema) {
                            if (item->contact->business_homepage.str)
                                print_ldif_single("labeledURI", item->contact->business_homepage);
                            if (item->contact->personal_homepage.str)
                                print_ldif_single("labeledURI", item->contact->personal_homepage);
                        }

                        if (item->comment.str)
                            print_ldif_single("description", item->comment);

                        for (vector<string>::size_type i=0; i<ldap_class.size(); i++)
                            print_ldif_single("objectClass", ldap_class[i].c_str());
                        printf("\n");
                    }
                }
                else {
                    DEBUG_INFO(("item is not a contact\n"));
                }
            }
            pst_freeItem(item);
        }
        d_ptr = d_ptr->next;
    }
    DEBUG_RET();
}


void print_ldif_single(const char *attr, pst_string value)
{
    print_ldif_single(attr, value.str);
}


// Prints an attribute together with its value.
// If the value isn't a "SAFE STRING" (as defined in RFC2849),
// then it is output as a BASE-64 encoded value.
//
// Only the first line of value is used: output stops at the first '\n'.
// Carriage returns are dropped, leading and trailing spaces are removed
// and every run of inner spaces is collapsed into a single space.
//
// attr  - attribute name, printed verbatim
// value - NUL terminated value; a NULL value prints nothing at all
void print_ldif_single(const char *attr, const char *value)
{
    if (!value) return;     // absent value, there is nothing to print

    bool is_safe_string = true;
    bool space_flag = false;

    // Strip leading spaces
    while (*value == ' ') value++;
    // the cleaned up copy can never be longer than the input
    vector<char> buffer(strlen(value) + 1);
    char *p = &buffer[0];

    for (;;) {
        char ch = *value++;

        if (ch == 0 || ch == '\n')
            break;
        if (ch == '\r')
            continue;
        if (ch == ' ') {
            // remember the space, it is only written when more text follows
            space_flag = true;
            continue;
        }
        if ((ch & 0x80) == 0x80) {
            // bytes outside of 7 bit ascii are never safe
            is_safe_string = false;
        }
        if (space_flag) {
            *p++ = ' ';
            space_flag = false;
        }
        *p++ = ch;
    }
    *p = 0;
    const size_t used = p - &buffer[0];

    // Characters that are safe, but not safe as the initial character.
    // This looks at the text we are about to print rather than at the
    // raw input, since dropped carriage returns can change what comes first.
    if (buffer[0] == ' ' || buffer[0] == ':' || buffer[0] == '<')
        is_safe_string = false;

    if (is_safe_string) {
        printf("%s: %s\n", attr, &buffer[0]);
    }
    else {
        // encode only the bytes of the value, not the terminating NUL
        // or the unused remainder of the buffer
        char *encoded = pst_base64_encode(&buffer[0], used);
        if (encoded) {
            printf("%s:: %s\n", attr, encoded);
            free(encoded);
        }
    }
}


// Combines values representing address lines into an address,
// lines separated with "$" as per PostalAddress syntax in RFC4517.
//
// attr    - attribute name to print the address under
// nvalues - number of pst_string arguments that follow, counting value
// value   - first address component, further components follow as
//           additional pst_string arguments
//
// Components with a NULL str are skipped. Inside a component a newline
// starts a new address line, carriage returns are dropped and runs of
// spaces are collapsed. Empty lines are never produced, and nothing is
// printed when no component contributes any text. A literal '$' or '\'
// is written as "\24" or "\5C", the escapes RFC4517 defines for them.
void print_ldif_address(const char *attr, int nvalues, pst_string value, ...)
{
    DEBUG_ENT("print_ldif_address");
    bool space_flag = false;
    bool newline_flag = false;
    std::string address;     // the address built up so far
    va_list ap;

    va_start(ap, value);
    // find the first component that is present
    while (!value.str) {
        if (--nvalues <= 0) {    // Nothing at all to do!
            va_end(ap);
            DEBUG_RET();
            return;
        }
        value = va_arg(ap, pst_string);
    }

    for (;;) {
        char ch = *(value.str)++;

        if (ch == 0) {
            // end of this component, move on to the next one that is present
            do {
                if (--nvalues <= 0) break;
                value = va_arg(ap, pst_string);
            } while (!value.str);
            if (nvalues <= 0 || !value.str) break;
            space_flag = true;
            newline_flag = true;
        }
        else if (ch == '\r')
            continue;
        else if (ch == '\n') {
            newline_flag = true;
            continue;
        }
        else if (ch == ' ') {
            space_flag = true;
            continue;
        }
        else {
            // separators are only written in front of real text, and never
            // at the very start, so there are no empty lines or stray spaces
            if (newline_flag) {
                if (!address.empty()) address += '$';
                newline_flag = false;
                space_flag   = false;
            }
            else if (space_flag) {
                if (!address.empty()) address += ' ';
                space_flag   = false;
            }
            if      (ch == '$')  address += "\\24";
            else if (ch == '\\') address += "\\5C";
            else                 address += ch;
        }
    }
    va_end(ap);
    if (!address.empty()) print_ldif_single(attr, address.c_str());
    DEBUG_RET();
}


// Prints a multi line value as one attribute line per line of text.
// print_ldif_single() stops at the first newline of what it is given,
// so it is simply handed the start of each line in turn.
void print_ldif_multi(const char *dn, pst_string value)
{
    const char *line = value.str;
    for (const char *eol = strchr(line, '\n'); eol; eol = strchr(line, '\n')) {
        print_ldif_single(dn, line);
        line = eol + 1;
    }
    // whatever follows the last newline, or the whole value if there is none
    print_ldif_single(dn, line);
}


// Prints "value1 value2" as a single attribute value. When one of the
// two is missing or empty, the other one is printed on its own; value1
// is checked first.
void print_ldif_two(const char *attr, pst_string value1, pst_string value2)
{
    const bool have_first  = value1.str && *value1.str;
    if (!have_first) {
        print_ldif_single(attr, value2);
        return;
    }

    const bool have_second = value2.str && *value2.str;
    if (!have_second) {
        print_ldif_single(attr, value1);
        return;
    }

    // both are present, join them with one space
    std::string joined(value1.str);
    joined += ' ';
    joined += value2.str;
    print_ldif_single(attr, joined.c_str());
}


// Builds a common name from its components, separated by single spaces.
//
// cn      - buffer that receives the NUL terminated result
// len     - size of cn in bytes; nothing is written when it is zero
// nvalues - number of pst_string arguments that follow, counting value
// value   - first component, further components follow as additional
//           pst_string arguments
//
// Components with a NULL str are skipped. Only the first line of each
// component is used, carriage returns are dropped and runs of spaces are
// collapsed. The result is silently truncated when it does not fit, and
// cn is left empty when no component contributes any text.
void build_cn(char *cn, size_t len, int nvalues, pst_string value, ...)
{
    bool space_flag = false;
    size_t i = 0;
    va_list ap;

    if (!cn || len == 0) return;    // not even room for the terminator
    cn[0] = 0;                      // Just a terminating NUL
    if (nvalues <= 0) return;       // no components at all

    va_start(ap, value);

    // find the first component that is present
    while (!value.str) {
        if (--nvalues <= 0) {
            va_end(ap);
            return;
        }
        value = va_arg(ap, pst_string);
    }
    for (;;) {
        char ch = *(value.str)++;

        if (ch == 0 || ch == '\n') {
            // done with this component, move on to the next one that is present
            do {
                if (--nvalues <= 0) break;
                value = va_arg(ap, pst_string);
            } while (!value.str);
            if (nvalues <= 0 || !value.str) break;
            space_flag = true;
        }
        else if (ch == '\r')
            continue;
        else if (ch == ' ') {
            space_flag = true;
            continue;
        }
        else {
            // The size checks are written as additions on i, since
            // subtracting from the unsigned len would wrap for a tiny buffer.
            if (space_flag) {
                if (i > 0) {
                    // need room for the space and at least one more character
                    if (i + 2 < len) cn[i++] = ' ';
                    else             break;
                }
                space_flag = false;
            }
            if (i + 1 < len) cn[i++] = ch;
            else             break;
        }
    }
    cn[i] = 0;
    va_end(ap);
}


int main(int argc, char* const* argv) {
    pst_desc_tree *d_ptr;
    char *fname = NULL;
    int c;
    char *d_log = NULL;
    prog_name = argv[0];
    pst_item *item = NULL;

    while ((c = getopt(argc, argv, "b:c:d:l:oVh"))!= -1) {
        switch (c) {
        case 'b':
            ldap_base = optarg;
            break;
        case 'c':
            ldap_class.push_back(string(optarg));
            break;
        case 'd':
            d_log = optarg;
            break;
        case 'h':
            usage();
            exit(0);
            break;
        case 'l':
            ldif_extra_line.push_back(string(optarg));
            break;
        case 'o':
            old_schema = true;
            break;
        case 'V':
            version();
            exit(0);
            break;
        default:
            usage();
            exit(1);
            break;
        }
    }

    if ((argc > optind) && (ldap_base)) {
        fname = argv[optind];
    } else {
        usage();
        exit(2);
    }

    #ifdef DEBUG_ALL
        // force a log file
        if (!d_log) d_log = "pst2ldif.log";
    #endif
    DEBUG_INIT(d_log, NULL);
    DEBUG_ENT("main");
    RET_DERROR(pst_open(&pstfile, fname), 1, ("Error opening File\n"));
    RET_DERROR(pst_load_index(&pstfile), 2, ("Index Error\n"));

    pst_load_extended_attributes(&pstfile);

    d_ptr = pstfile.d_head; // first record is main record
    item  = (pst_item*)pst_parse_item(&pstfile, d_ptr, NULL);
    if (!item || !item->message_store) {
        DEBUG_RET();
        DIE(("main: Could not get root record\n"));
    }

    d_ptr = pst_getTopOfFolders(&pstfile, item);
    if (!d_ptr) {
        DEBUG_RET();
        DIE(("Top of folders record not found. Cannot continue\n"));
    }

    pst_freeItem(item);

    if (old_schema && (strlen(ldap_base) > 2)) {
        char *ldap_org = strdup(ldap_base+2); // assume first 2 chars are o=
        char *temp = strchr(ldap_org, ',');
        if (temp) {
            *temp = '\0';
            // write the ldap header
            printf("dn: %s\n", ldap_base);
            printf("o: %s\n", ldap_org);
            printf("objectClass: organization\n\n");
            printf("dn: cn=root, %s\n", ldap_base);
            printf("cn: root\n");
            printf("sn: root\n");
            for (vector<string>::size_type i=0; i<ldap_class.size(); i++)
                print_ldif_single("objectClass", ldap_class[i].c_str());
            printf("\n");
        }
        free(ldap_org); // found by cppcheck
    }

    process(d_ptr->child);  // do the children of TOPF
    pst_close(&pstfile);
    DEBUG_RET();
    free_strings(all_strings);
    return 0;
}


// Prints the version banner followed by the command line help.
void usage(void) {
    // one line of help per option, in the order in which they are shown
    static const char *const option_help[] = {
        "\t-V\t- Version. Display program version\n",
        "\t-b ldapbase\t- set the LDAP base value\n",
        "\t-c class\t- set the class of the LDAP objects (may contain more than one)\n",
        "\t-d <filename>\t- Debug to file.\n",
        "\t-h\t- Help. This screen\n",
        "\t-l line\t- extra line to insert in the LDIF file for each contact\n",
        "\t-o\t- use old schema, default is new schema\n"
    };

    version();
    printf("Usage: %s [OPTIONS] {PST FILENAME}\n", prog_name);
    fputs("OPTIONS:\n", stdout);
    for (size_t k=0; k<sizeof(option_help)/sizeof(option_help[0]); k++) {
        fputs(option_help[k], stdout);
    }
}


// Prints the program version, the byte order this binary was built for
// and, when built with gcc, the compiler version and build time stamp.
void version(void) {
    // pick the byte order text at compile time
#if BYTE_ORDER == BIG_ENDIAN
    static const char endian_note[] = "Big Endian implementation being used.\n";
#elif BYTE_ORDER == LITTLE_ENDIAN
    static const char endian_note[] = "Little Endian implementation being used.\n";
#else
#  error "Byte order not supported by this library"
#endif

    printf("pst2ldif v%s\n", VERSION);
    fputs(endian_note, stdout);
#ifdef __GNUC__
    printf("GCC %d.%d : %s %s\n", __GNUC__, __GNUC_MINOR__, __DATE__, __TIME__);
#endif
}


// Replaces every '/', '\' and ':' in fname with an underscore. The
// string is modified in place and fname itself is returned; a NULL
// fname is returned unchanged.
char *check_filename(char *fname) {
    static const char unwanted[] = "/\\:";

    if (fname == NULL) return fname;

    // hop from one unwanted character to the next
    for (char *bad = strpbrk(fname, unwanted); bad != NULL; bad = strpbrk(bad + 1, unwanted)) {
        *bad = '_';
    }
    return fname;
}


// Prints the "dn:" line that starts an entry, in the form
//     dn: cn=<escaped value>, <base>
// The ", <base>" part is left out when base is NULL or empty.
// Note that attr is not used, the line always starts with "dn: cn=".
void print_ldif_dn(const char *attr, pst_string value, const char *base)
{
    // leading spaces are removed (the RFC says to escape them)
    const char *cn = value.str;
    while (*cn == ' ') {
        ++cn;
    }

    printf("dn: cn=");
    print_escaped_dn(cn);
    if (base && base[0]) {
        printf(", %s", base);
    }
    printf("\n");
}


// Writes value to stdout with the escaping used for distinguished names:
// a leading '#' or space gets a backslash in front, bytes with the high
// bit set and control characters are written as a backslash followed by
// two hex digits, and the characters \ " + , ; < > get a backslash in
// front of them. Everything else is written unchanged.
void print_escaped_dn(const char *value)
{
    // escape initial '#' and space
    if (*value == '#' || *value == ' ') {
        putchar('\\');
    }

    for (const char *p = value; *p != 0; ++p) {
        const char ch = *p;

        if (((ch & 0x80) != 0) || (ch <= 0x1F)) {
            // Print as escaped hex digits
            printf("\\%2.2X", ch & 0xFF);
            continue;
        }
        // ch is never NUL here, so strchr cannot match the terminator
        if (strchr("\\\"+,;<>", ch) != NULL) {
            putchar('\\');
        }
        putchar(ch);
    }
}