changeset 360:26c48ea9d896

From Jeffrey Morlan: pst_build_id_ptr reads the Block BTree into a linked list, which pst_getID does a linear scan through. For large PSTs that have millions of blocks, this is extremely slow - almost all time is spent in pst_getID. Since the BTree entries must be in order, this can be dramatically improved by reading into an array and using binary search.
author Carl Byington <carl@five-ten-sg.com>
date Wed, 06 Jul 2016 10:21:08 -0700 (2016-07-06)
parents a3e674fade6c
children a2da2bbe393a
files python/python-libpst.cpp src/dumpblocks.c src/getidblock.c src/libpst.c src/libpst.h
diffstat 5 files changed, 25 insertions(+), 40 deletions(-) [+]
line wrap: on
line diff
--- a/python/python-libpst.cpp	Wed Jul 06 10:20:12 2016 -0700
+++ b/python/python-libpst.cpp	Wed Jul 06 10:21:08 2016 -0700
@@ -269,7 +269,6 @@
         .def_readonly("offset",                 &pst_index_ll::offset)
         .def_readonly("size",                   &pst_index_ll::size)
         .def_readonly("u1",                     &pst_index_ll::u1)
-        .add_property("next",                   make_getter(&pst_index_ll::next, return_value_policy<reference_existing_object>()))
         ;
 
     class_<pst_id2_tree>("pst_id2_tree")
@@ -586,8 +585,6 @@
     class_<pst_file>("pst_file")
         .def_readonly("cwd",                    &pst_file::cwd)
         .def_readonly("fname",                  &pst_file::fname)
-        .add_property("i_head",                 make_getter(&pst_file::i_head, return_value_policy<reference_existing_object>()))
-        .add_property("i_tail",                 make_getter(&pst_file::i_tail, return_value_policy<reference_existing_object>()))
         .add_property("d_head",                 make_getter(&pst_file::d_head, return_value_policy<reference_existing_object>()))
         .add_property("d_tail",                 make_getter(&pst_file::d_tail, return_value_policy<reference_existing_object>()))
         .add_property("x_head",                 make_getter(&pst_file::x_head, return_value_policy<reference_existing_object>()))
--- a/src/dumpblocks.c	Wed Jul 06 10:20:12 2016 -0700
+++ b/src/dumpblocks.c	Wed Jul 06 10:21:08 2016 -0700
@@ -5,7 +5,7 @@
 int main(int argc, char* const* argv)
 {
     pst_file pstfile;
-    pst_index_ll *ptr;
+    size_t i;
     char *outdir = NULL, *file = NULL, *outname = NULL;
     char *buf = NULL;
     int c;
@@ -50,12 +50,11 @@
             exit(1);
         }
 
-    ptr = pstfile.i_head;
     outname = (char *) pst_malloc(OUT_BUF);
     printf("Saving blocks\n");
-    while (ptr != NULL) {
-        size_t c;
-        c = pst_ff_getIDblock_dec(&pstfile, ptr->i_id, &buf);
+    for (i = 0; i < pstfile.i_count; i++) {
+        pst_index_ll *ptr = &pstfile.i_table[i];
+        size_t c = pst_ff_getIDblock_dec(&pstfile, ptr->i_id, &buf);
         if (c) {
             snprintf(outname, OUT_BUF, "%#"PRIx64, ptr->i_id);
             if ((fp = fopen(outname, "wb")) == NULL) {
@@ -67,7 +66,6 @@
         } else {
             printf("Failed to read block i_id %#"PRIx64"\n", ptr->i_id);
         }
-        ptr = ptr->next;
     }
     pst_close(&pstfile);
     DEBUG_RET();
--- a/src/getidblock.c	Wed Jul 06 10:20:12 2016 -0700
+++ b/src/getidblock.c	Wed Jul 06 10:21:08 2016 -0700
@@ -130,10 +130,9 @@
         dumper(i_id);
     }
     else {
-        pst_index_ll *ptr = pstfile.i_head;
-        while (ptr) {
-            dumper(ptr->i_id);
-            ptr = ptr->next;
+        size_t i;
+        for (i = 0; i < pstfile.i_count; i++) {
+            dumper(pstfile.i_table[i].i_id);
         }
         dump_desc(pstfile.d_head, NULL);
     }
--- a/src/libpst.c	Wed Jul 06 10:20:12 2016 -0700
+++ b/src/libpst.c	Wed Jul 06 10:21:08 2016 -0700
@@ -268,7 +268,6 @@
 static void             pst_free_attach(pst_item_attach *attach);
 static void             pst_free_desc (pst_desc_tree *head);
 static void             pst_free_id2(pst_id2_tree * head);
-static void             pst_free_id (pst_index_ll *head);
 static void             pst_free_list(pst_mapi_object *list);
 static void             pst_free_xattrib(pst_x_attrib_ll *x);
 static size_t           pst_getAtPos(pst_file *pf, int64_t pos, void* buf, size_t size);
@@ -404,8 +403,8 @@
     // free the paths
     free(pf->cwd);
     free(pf->fname);
-    // we must free the id linklist and the desc tree
-    pst_free_id(pf->i_head);
+    // we must free the id array and the desc tree
+    free(pf->i_table);
     pst_free_desc(pf->d_head);
     pst_free_xattrib(pf->x_head);
     DEBUG_RET();
@@ -1043,15 +1042,15 @@
                 return -1;
             }
             old = index.id;
-            i_ptr = (pst_index_ll*) pst_malloc(sizeof(pst_index_ll));
+            if (pf->i_count == pf->i_capacity) {
+                pf->i_capacity += (pf->i_capacity >> 1) + 16; // arbitrary growth rate
+                pf->i_table = pst_realloc(pf->i_table, pf->i_capacity * sizeof(pst_index_ll));
+            }
+            i_ptr = &pf->i_table[pf->i_count++];
             i_ptr->i_id   = index.id;
             i_ptr->offset = index.offset;
             i_ptr->u1     = index.u1;
             i_ptr->size   = index.size;
-            i_ptr->next   = NULL;
-            if (pf->i_tail)  pf->i_tail->next = i_ptr;
-            if (!pf->i_head) pf->i_head = i_ptr;
-            pf->i_tail = i_ptr;
         }
     } else {
         // this node contains node pointers
@@ -3210,18 +3209,6 @@
 }
 
 
-static void pst_free_id (pst_index_ll *head) {
-    pst_index_ll *t;
-    DEBUG_ENT("pst_free_id");
-    while (head) {
-        t = head->next;
-        free(head);
-        head = t;
-    }
-    DEBUG_RET();
-}
-
-
 static void pst_free_desc (pst_desc_tree *head) {
     pst_desc_tree *t;
     DEBUG_ENT("pst_free_desc");
@@ -3620,6 +3607,13 @@
 }
 
 
+static int pst_getID_compare(const void *key, const void *entry) {
+    uint64_t key_id = *(const uint64_t*)key;
+    uint64_t entry_id = ((const pst_index_ll*)entry)->i_id;
+    return (key_id > entry_id) - (key_id < entry_id);
+}
+
+
 /** */
 pst_index_ll* pst_getID(pst_file* pf, uint64_t i_id) {
     pst_index_ll *ptr;
@@ -3634,10 +3628,7 @@
     i_id -= (i_id & 1);
 
     DEBUG_INFO(("Trying to find %#"PRIx64"\n", i_id));
-    ptr = pf->i_head;
-    while (ptr && (ptr->i_id != i_id)) {
-        ptr = ptr->next;
-    }
+    ptr = bsearch(&i_id, pf->i_table, pf->i_count, sizeof *pf->i_table, pst_getID_compare);
     if (ptr) {DEBUG_INFO(("Found Value %#"PRIx64"\n", i_id));            }
     else     {DEBUG_INFO(("ERROR: Value %#"PRIx64" not found\n", i_id)); }
     DEBUG_RET();
--- a/src/libpst.h	Wed Jul 06 10:20:12 2016 -0700
+++ b/src/libpst.h	Wed Jul 06 10:21:08 2016 -0700
@@ -105,7 +105,6 @@
     uint64_t offset;
     uint64_t size;
     int64_t  u1;
-    struct pst_index_ll *next;
 } pst_index_ll;
 
 
@@ -894,8 +893,9 @@
     char*   fname;
     /** default character set for items without one */
     const char*   charset;
-    /** the head and tail of the linked list of index structures */
-    pst_index_ll *i_head, *i_tail;
+    /** the array of index structures */
+    pst_index_ll *i_table;
+    size_t i_count, i_capacity;
     /** the head and tail of the top level of the descriptor tree */
     pst_desc_tree  *d_head, *d_tail;
     /** the head of the extended attributes linked list */