Mercurial > libpst
view src/pst2ldif.cpp @ 355:d1f930be4711
From Jeffrey Morlan:
pst_build_id_ptr and pst_build_desc_ptr require that the first child
of a BTree page have the same starting ID as itself. This is not
required by the spec, and is not true in many real-world PSTs
(presumably, the original first child of the page got
deleted). Because of this, many emails are not being extracted from
these PSTs. It also triggers an infinite loop in lspst (a separate
bug, also fixed)
author | Carl Byington <carl@five-ten-sg.com> |
---|---|
date | Wed, 06 Jul 2016 10:12:22 -0700 |
parents | aedcf979f439 |
children |
line wrap: on
line source
/* Copyright (c) 2004 Carl Byington - 510 Software Group, released under the GPL version 2 or any later version at your choice available at http://www.fsf.org/licenses/gpl.txt Based on readpst.c by David Smith */ using namespace std; // needed for std c++ collections #include <set> #include <vector> #include <string> extern "C" { #include "define.h" #include "lzfu.h" } void usage(void); void version(void); char *check_filename(char *fname); void print_ldif_single(const char *attr, const char *value); void print_ldif_single(const char *attr, pst_string value); void print_ldif_address(const char *attr, int nvalues, pst_string value, ...); void print_ldif_dn(const char *attr, pst_string value, const char *base); void print_ldif_multi(const char *dn, pst_string value); void print_ldif_two(const char *attr, pst_string value1, pst_string value2); void print_escaped_dn(const char *value); void build_cn(char *cn, size_t len, int nvalues, pst_string value, ...); char *prog_name; pst_file pstfile; bool old_schema = false; char *ldap_base = NULL; // 'o=some.domain.tld,c=US' int ldif_extra_line_count = 0; vector<string> ldap_class; // 'newPerson' or 'inetOrgPerson' vector<string> ldif_extra_line; // 'o: myorg' //////////////////////////////////////////////// // define our ordering struct ltstr { bool operator()(const char* s1, const char* s2) const { return strcasecmp(s1, s2) < 0; } }; // define our set typedef set<const char *, ltstr> string_set; // make a static set to hold the cn values static string_set all_strings; //////////////////////////////////////////////// // helper to free all the strings in a set // static void free_strings(string_set &s); static void free_strings(string_set &s) { if (s.empty()) return; for (string_set::iterator i=s.begin(); i!=s.end(); i++) { free((void*)*i); } s.clear(); } //////////////////////////////////////////////// // helper to register a string in a string set // static const char* register_string(string_set &s, const char *name); static const char* register_string(string_set &s, const char *name) { string_set::const_iterator i = s.find(name); if (i != s.end()) return *i; char *x = strdup(name); s.insert(x); return x; } //////////////////////////////////////////////// // register a global string // static const char* register_string(const char *name); static const char* register_string(const char *name) { return register_string(all_strings, name); } //////////////////////////////////////////////// // make a unique string // static const char* unique_string(const char *name); static const char* unique_string(const char *name) { int unique = 2; string_set::iterator i = all_strings.find(name); if (i == all_strings.end()) return register_string(name); while (true) { vector<char> n(strlen(name)+10); snprintf(&n[0], n.size(), "%s %d", name, unique++); string_set::iterator i = all_strings.find(&n[0]); if (i == all_strings.end()) return register_string(&n[0]); } } static void process(pst_desc_tree *d_ptr); static void process(pst_desc_tree *d_ptr) { DEBUG_ENT("process"); pst_item *item = NULL; while (d_ptr) { if (d_ptr->desc) { item = pst_parse_item(&pstfile, d_ptr, NULL); DEBUG_INFO(("item pointer is %p\n", item)); if (item) { if (item->folder && d_ptr->child && item->file_as.str && strcasecmp(item->file_as.str, "Deleted Items")) { //if this is a non-empty folder other than deleted items, we want to recurse into it fprintf(stderr, "entering folder %s\n", item->file_as.str); process(d_ptr->child); } else if (item->contact && (item->type == PST_TYPE_CONTACT)) { // deal with a contact char cn[1000]; // convert everything to utf8 pst_convert_utf8_null(item, &item->contact->display_name_prefix); pst_convert_utf8_null(item, &item->contact->first_name); pst_convert_utf8_null(item, &item->contact->surname); pst_convert_utf8_null(item, &item->contact->suffix); pst_convert_utf8_null(item, &item->contact->company_name); pst_convert_utf8_null(item, &item->contact->job_title); pst_convert_utf8_null(item, &item->contact->address1); pst_convert_utf8_null(item, &item->contact->address2); pst_convert_utf8_null(item, &item->contact->address3); pst_convert_utf8_null(item, &item->contact->address1a); pst_convert_utf8_null(item, &item->contact->address2a); pst_convert_utf8_null(item, &item->contact->address3a); pst_convert_utf8_null(item, &item->contact->business_address); pst_convert_utf8_null(item, &item->contact->business_po_box); pst_convert_utf8_null(item, &item->contact->business_street); pst_convert_utf8_null(item, &item->contact->business_city); pst_convert_utf8_null(item, &item->contact->business_state); pst_convert_utf8_null(item, &item->contact->business_postal_code); pst_convert_utf8_null(item, &item->contact->home_address); pst_convert_utf8_null(item, &item->contact->home_po_box); pst_convert_utf8_null(item, &item->contact->home_street); pst_convert_utf8_null(item, &item->contact->home_city); pst_convert_utf8_null(item, &item->contact->home_state); pst_convert_utf8_null(item, &item->contact->home_postal_code); pst_convert_utf8_null(item, &item->contact->other_address); pst_convert_utf8_null(item, &item->contact->other_po_box); pst_convert_utf8_null(item, &item->contact->other_street); pst_convert_utf8_null(item, &item->contact->other_city); pst_convert_utf8_null(item, &item->contact->other_state); pst_convert_utf8_null(item, &item->contact->other_postal_code); pst_convert_utf8_null(item, &item->contact->business_fax); pst_convert_utf8_null(item, &item->contact->home_fax); pst_convert_utf8_null(item, &item->contact->business_phone); pst_convert_utf8_null(item, &item->contact->home_phone); pst_convert_utf8_null(item, &item->contact->car_phone); pst_convert_utf8_null(item, &item->contact->mobile_phone); pst_convert_utf8_null(item, &item->contact->other_phone); pst_convert_utf8_null(item, &item->contact->business_homepage); pst_convert_utf8_null(item, &item->contact->personal_homepage); pst_convert_utf8_null(item, &item->comment); build_cn(cn, sizeof(cn), 4, item->contact->display_name_prefix, item->contact->first_name, item->contact->surname, item->contact->suffix); if (cn[0] != 0) { // have a valid cn pst_string ucn; ucn.str = (char*)unique_string(cn); ucn.is_utf8 = 1; // all the components are already utf8 print_ldif_dn("dn", ucn, ldap_base); print_ldif_single("cn", ucn); if (item->contact->first_name.str) { print_ldif_two("givenName", item->contact->display_name_prefix, item->contact->first_name); } if (item->contact->surname.str) { print_ldif_two("sn", item->contact->surname, item->contact->suffix); } else if (item->contact->company_name.str) { print_ldif_single("sn", item->contact->company_name); } else print_ldif_single("sn", ucn); // use cn as sn if we cannot find something better if (old_schema) { if (item->contact->job_title.str) print_ldif_single("personalTitle", item->contact->job_title); if (item->contact->company_name.str) print_ldif_single("company", item->contact->company_name); } else { // new schema if (item->contact->job_title.str) print_ldif_single("title", item->contact->job_title); if (item->contact->company_name.str) print_ldif_single("o", item->contact->company_name); } if (item->contact->address1.str && *item->contact->address1.str) print_ldif_single("mail", item->contact->address1); if (item->contact->address2.str && *item->contact->address2.str) print_ldif_single("mail", item->contact->address2); if (item->contact->address3.str && *item->contact->address3.str) print_ldif_single("mail", item->contact->address3); if (item->contact->address1a.str && *item->contact->address1a.str) print_ldif_single("mail", item->contact->address1a); if (item->contact->address2a.str && *item->contact->address2a.str) print_ldif_single("mail", item->contact->address2a); if (item->contact->address3a.str && *item->contact->address3a.str) print_ldif_single("mail", item->contact->address3a); if (old_schema) { if (item->contact->business_address.str) { if (item->contact->business_po_box.str) print_ldif_single("postalAddress", item->contact->business_po_box); if (item->contact->business_street.str) print_ldif_multi("postalAddress", item->contact->business_street); if (item->contact->business_city.str) print_ldif_single("l", item->contact->business_city); if (item->contact->business_state.str) print_ldif_single("st", item->contact->business_state); if (item->contact->business_postal_code.str) print_ldif_single("postalCode", item->contact->business_postal_code); } else if (item->contact->home_address.str) { if (item->contact->home_po_box.str) print_ldif_single("postalAddress", item->contact->home_po_box); if (item->contact->home_street.str) print_ldif_multi("postalAddress", item->contact->home_street); if (item->contact->home_city.str) print_ldif_single("l", item->contact->home_city); if (item->contact->home_state.str) print_ldif_single("st", item->contact->home_state); if (item->contact->home_postal_code.str) print_ldif_single("postalCode", item->contact->home_postal_code); } else if (item->contact->other_address.str) { if (item->contact->other_po_box.str) print_ldif_single("postalAddress", item->contact->other_po_box); if (item->contact->other_street.str) print_ldif_multi("postalAddress", item->contact->other_street); if (item->contact->other_city.str) print_ldif_single("l", item->contact->other_city); if (item->contact->other_state.str) print_ldif_single("st", item->contact->other_state); if (item->contact->other_postal_code.str) print_ldif_single("postalCode", item->contact->other_postal_code); } } else { // new schema, with proper RFC4517 postal addresses if (item->contact->business_address.str) { print_ldif_address("postalAddress", 6, item->contact->business_po_box, item->contact->business_street, item->contact->business_city, item->contact->business_state, item->contact->business_postal_code, item->contact->business_country); if (item->contact->business_city.str) print_ldif_single("l", item->contact->business_city); if (item->contact->business_state.str) print_ldif_single("st", item->contact->business_state); if (item->contact->business_postal_code.str) print_ldif_single("postalCode", item->contact->business_postal_code); } else if (item->contact->home_address.str) { if (item->contact->home_city.str) print_ldif_single("l", item->contact->home_city); if (item->contact->home_state.str) print_ldif_single("st", item->contact->home_state); if (item->contact->home_postal_code.str) print_ldif_single("postalCode", item->contact->home_postal_code); } else if (item->contact->other_address.str) { print_ldif_address("postalAddress", 6, item->contact->other_po_box, item->contact->other_street, item->contact->other_city, item->contact->other_state, item->contact->other_postal_code, item->contact->other_country); if (item->contact->other_city.str) print_ldif_single("l", item->contact->other_city); if (item->contact->other_state.str) print_ldif_single("st", item->contact->other_state); if (item->contact->other_postal_code.str) print_ldif_single("postalCode", item->contact->other_postal_code); } if (item->contact->home_address.str) { print_ldif_address("homePostalAddress", 6, item->contact->home_po_box, item->contact->home_street, item->contact->home_city, item->contact->home_state, item->contact->home_postal_code, item->contact->home_country); } } if (item->contact->business_fax.str) print_ldif_single("facsimileTelephoneNumber", item->contact->business_fax); else if (item->contact->home_fax.str) print_ldif_single("facsimileTelephoneNumber", item->contact->home_fax); if (item->contact->business_phone.str) print_ldif_single("telephoneNumber", item->contact->business_phone); if (item->contact->home_phone.str) print_ldif_single("homePhone", item->contact->home_phone); if (item->contact->car_phone.str) print_ldif_single("mobile", item->contact->car_phone); else if (item->contact->mobile_phone.str) print_ldif_single("mobile", item->contact->mobile_phone); else if (item->contact->other_phone.str) print_ldif_single("mobile", item->contact->other_phone); if (!old_schema) { if (item->contact->business_homepage.str) print_ldif_single("labeledURI", item->contact->business_homepage); if (item->contact->personal_homepage.str) print_ldif_single("labeledURI", item->contact->personal_homepage); } if (item->comment.str) print_ldif_single("description", item->comment); for (vector<string>::size_type i=0; i<ldap_class.size(); i++) print_ldif_single("objectClass", ldap_class[i].c_str()); printf("\n"); } } else { DEBUG_INFO(("item is not a contact\n")); } } pst_freeItem(item); } d_ptr = d_ptr->next; } DEBUG_RET(); } void print_ldif_single(const char *attr, pst_string value) { print_ldif_single(attr, value.str); } // Prints an attribute together with its value. // If the value isn't a "SAFE STRING" (as defined in RFC2849), // then it is output as a BASE-64 encoded value void print_ldif_single(const char *attr, const char *value) { size_t len; bool is_safe_string = true; bool space_flag = false; // Strip leading spaces while (*value == ' ') value++; len = strlen(value) + 1; vector<char> buffer(len); char *p = &buffer[0]; // See if "value" is a "SAFE STRING" // First check characters that are safe but not safe as initial characters if (*value == ':' || *value == '<') is_safe_string = false; for (;;) { char ch = *value++; if (ch == 0 || ch == '\n') break; else if (ch == '\r') continue; else if (ch == ' ') { space_flag = true; continue; } else { if ((ch & 0x80) == 0x80) { is_safe_string = false; } if (space_flag) { *p++ = ' '; space_flag = false; } *p++ = ch; } } *p = 0; if (is_safe_string) { printf("%s: %s\n", attr, &buffer[0]); } else { p = pst_base64_encode(&buffer[0], buffer.size()); printf("%s:: %s\n", attr, p); free(p); } } // Combines values representing address lines into an address,i // lines separated with "$" as per PostalAddress syntax in RFC4517 void print_ldif_address(const char *attr, int nvalues, pst_string value, ...) { DEBUG_ENT("print_ldif_address"); bool space_flag = false; bool newline_flag = false; char *address = NULL; // Buffer where address is built up int len = 0; // Length of buffer int i = 0; // Index of next character position in buffer va_list ap; va_start(ap, value); while (!value.str) { nvalues--; if (nvalues == 0) { // Nothing at all to do! va_end(ap); DEBUG_RET(); return; } value = va_arg(ap, pst_string); } for (;;) { char ch = *(value.str)++; if (ch == 0) { do { nvalues--; if (nvalues == 0) break; value = va_arg(ap, pst_string); } while (!value.str); if (!nvalues || !value.str) break; space_flag = true; newline_flag = true; } else if (ch == '\r') continue; else if (ch == '\n') { newline_flag = true; continue; } else if (ch == ' ') { space_flag = true; continue; } else { if (i > (len-5)) { len += 256; char *addr = (char *)realloc(address, len); // cppcheck found unchecked error if (!addr) exit(3); address = addr; } if (newline_flag) { address[i++] = '$'; newline_flag = false; space_flag = false; } else if (space_flag) { address[i++] = ' '; space_flag = false; } if (ch == '$' || ch == '\\') address[i++] = '\\'; address[i++] = ch; } } va_end(ap); if (i == 0) return; // Nothing to do address[i] = 0; print_ldif_single(attr, address); free(address); DEBUG_RET(); } void print_ldif_multi(const char *dn, pst_string value) { char *n; char *valuestr = value.str; while ((n = strchr(valuestr, '\n'))) { print_ldif_single(dn, valuestr); valuestr = n + 1; } print_ldif_single(dn, valuestr); } void print_ldif_two(const char *attr, pst_string value1, pst_string value2) { size_t len1, len2; if (value1.str && *value1.str) len1 = strlen(value1.str); else { print_ldif_single(attr, value2); return; } if (value2.str && *value2.str) len2 = strlen(value2.str); else { print_ldif_single(attr, value1); return; } vector<char> value(len1 + len2 + 2); memcpy(&value[0], value1.str, len1); value[len1] = ' '; memcpy(&value[0] + len1 + 1, value2.str, len2 + 1); print_ldif_single(attr, &value[0]); } void build_cn(char *cn, size_t len, int nvalues, pst_string value, ...) { bool space_flag = false; size_t i = 0; va_list ap; va_start(ap, value); while (!value.str) { nvalues--; if (nvalues == 0) { cn[0] = 0; // Just a terminating NUL va_end(ap); return; } value = va_arg(ap, pst_string); } for (;;) { char ch = *(value.str)++; if (ch == 0 || ch == '\n') { do { nvalues--; if (nvalues == 0) break; value = va_arg(ap, pst_string); } while (!value.str); if (!nvalues || !value.str) break; space_flag = true; } else if (ch == '\r') continue; else if (ch == ' ') { space_flag = true; continue; } else { if (space_flag) { if (i > 0) { if (i < (len - 2)) cn[i++] = ' '; else break; } space_flag = false; } if (i < (len - 1)) cn[i++] = ch; else break; } } cn[i] = 0; va_end(ap); } int main(int argc, char* const* argv) { pst_desc_tree *d_ptr; char *fname = NULL; int c; char *d_log = NULL; prog_name = argv[0]; pst_item *item = NULL; while ((c = getopt(argc, argv, "b:c:d:l:oVh"))!= -1) { switch (c) { case 'b': ldap_base = optarg; break; case 'c': ldap_class.push_back(string(optarg)); break; case 'd': d_log = optarg; break; case 'h': usage(); exit(0); break; case 'l': ldif_extra_line.push_back(string(optarg)); break; case 'o': old_schema = true; break; case 'V': version(); exit(0); break; default: usage(); exit(1); break; } } if ((argc > optind) && (ldap_base)) { fname = argv[optind]; } else { usage(); exit(2); } #ifdef DEBUG_ALL // force a log file if (!d_log) d_log = "pst2ldif.log"; #endif DEBUG_INIT(d_log, NULL); DEBUG_ENT("main"); RET_DERROR(pst_open(&pstfile, fname, NULL), 1, ("Error opening File\n")); RET_DERROR(pst_load_index(&pstfile), 2, ("Index Error\n")); pst_load_extended_attributes(&pstfile); d_ptr = pstfile.d_head; // first record is main record item = (pst_item*)pst_parse_item(&pstfile, d_ptr, NULL); if (!item || !item->message_store) { DEBUG_RET(); DIE(("main: Could not get root record\n")); } d_ptr = pst_getTopOfFolders(&pstfile, item); if (!d_ptr) { DEBUG_RET(); DIE(("Top of folders record not found. Cannot continue\n")); } pst_freeItem(item); if (old_schema && (strlen(ldap_base) > 2)) { char *ldap_org = strdup(ldap_base+2); // assume first 2 chars are o= char *temp = strchr(ldap_org, ','); if (temp) { *temp = '\0'; // write the ldap header printf("dn: %s\n", ldap_base); printf("o: %s\n", ldap_org); printf("objectClass: organization\n\n"); printf("dn: cn=root, %s\n", ldap_base); printf("cn: root\n"); printf("sn: root\n"); for (vector<string>::size_type i=0; i<ldap_class.size(); i++) print_ldif_single("objectClass", ldap_class[i].c_str()); printf("\n"); } free(ldap_org); // found by cppcheck } process(d_ptr->child); // do the children of TOPF pst_close(&pstfile); DEBUG_RET(); free_strings(all_strings); return 0; } void usage(void) { version(); printf("Usage: %s [OPTIONS] {PST FILENAME}\n", prog_name); printf("OPTIONS:\n"); printf("\t-V\t- Version. Display program version\n"); printf("\t-b ldapbase\t- set the LDAP base value\n"); printf("\t-c class\t- set the class of the LDAP objects (may contain more than one)\n"); printf("\t-d <filename>\t- Debug to file.\n"); printf("\t-h\t- Help. This screen\n"); printf("\t-l line\t- extra line to insert in the LDIF file for each contact\n"); printf("\t-o\t- use old schema, default is new schema\n"); } void version(void) { printf("pst2ldif v%s\n", VERSION); #if BYTE_ORDER == BIG_ENDIAN printf("Big Endian implementation being used.\n"); #elif BYTE_ORDER == LITTLE_ENDIAN printf("Little Endian implementation being used.\n"); #else # error "Byte order not supported by this library" #endif } char *check_filename(char *fname) { char *t = fname; if (t == NULL) { return fname; } while ((t = strpbrk(t, "/\\:"))) { // while there are characters in the second string that we don't want *t = '_'; //replace them with an underscore } return fname; } // This function escapes Distinguished Names (as per RFC4514) void print_ldif_dn(const char *attr, pst_string value, const char *base) { printf("dn: cn="); const char *valuestr = value.str; // remove leading spaces (RFC says escape them) while (*valuestr == ' ') valuestr++; print_escaped_dn(valuestr); if (base && base[0]) { printf(", %s", base); } printf("\n"); return; } void print_escaped_dn(const char *value) { char ch; // escape initial '#' and space if (*value == '#' || *value == ' ') putchar('\\'); while ((ch = *value++) != 0) { if (((ch & 0x80) != 0) || (ch <= 0x1F)) // Print as escaped hex digits printf("\\%2.2X", ch & 0xFF); else switch (ch) { case '\\': case '"' : case '+' : case ',' : case ';' : case '<' : case '>' : putchar('\\'); // Fall through default: putchar(ch); } } return; }