libpst

changeset 360:26c48ea9d896

From Jeffrey Morlan: pst_build_id_ptr reads the Block BTree into a linked list, which pst_getID does a linear scan through. For large PSTs that have millions of blocks, this is extremely slow - almost all time is spent in pst_getID. Since the BTree entries must be in order, this can be dramatically improved by reading into an array and using binary search.
author Carl Byington <carl@five-ten-sg.com>
date Wed, 06 Jul 2016 10:21:08 -0700
parents a3e674fade6c
children a2da2bbe393a
files python/python-libpst.cpp src/dumpblocks.c src/getidblock.c src/libpst.c src/libpst.h
diffstat 5 files changed, 25 insertions(+), 40 deletions(-) [+]
line diff
     1.1 --- a/python/python-libpst.cpp	Wed Jul 06 10:20:12 2016 -0700
     1.2 +++ b/python/python-libpst.cpp	Wed Jul 06 10:21:08 2016 -0700
     1.3 @@ -269,7 +269,6 @@
     1.4          .def_readonly("offset",                 &pst_index_ll::offset)
     1.5          .def_readonly("size",                   &pst_index_ll::size)
     1.6          .def_readonly("u1",                     &pst_index_ll::u1)
     1.7 -        .add_property("next",                   make_getter(&pst_index_ll::next, return_value_policy<reference_existing_object>()))
     1.8          ;
     1.9  
    1.10      class_<pst_id2_tree>("pst_id2_tree")
    1.11 @@ -586,8 +585,6 @@
    1.12      class_<pst_file>("pst_file")
    1.13          .def_readonly("cwd",                    &pst_file::cwd)
    1.14          .def_readonly("fname",                  &pst_file::fname)
    1.15 -        .add_property("i_head",                 make_getter(&pst_file::i_head, return_value_policy<reference_existing_object>()))
    1.16 -        .add_property("i_tail",                 make_getter(&pst_file::i_tail, return_value_policy<reference_existing_object>()))
    1.17          .add_property("d_head",                 make_getter(&pst_file::d_head, return_value_policy<reference_existing_object>()))
    1.18          .add_property("d_tail",                 make_getter(&pst_file::d_tail, return_value_policy<reference_existing_object>()))
    1.19          .add_property("x_head",                 make_getter(&pst_file::x_head, return_value_policy<reference_existing_object>()))
     2.1 --- a/src/dumpblocks.c	Wed Jul 06 10:20:12 2016 -0700
     2.2 +++ b/src/dumpblocks.c	Wed Jul 06 10:21:08 2016 -0700
     2.3 @@ -5,7 +5,7 @@
     2.4  int main(int argc, char* const* argv)
     2.5  {
     2.6      pst_file pstfile;
     2.7 -    pst_index_ll *ptr;
     2.8 +    size_t i;
     2.9      char *outdir = NULL, *file = NULL, *outname = NULL;
    2.10      char *buf = NULL;
    2.11      int c;
    2.12 @@ -50,12 +50,11 @@
    2.13              exit(1);
    2.14          }
    2.15  
    2.16 -    ptr = pstfile.i_head;
    2.17      outname = (char *) pst_malloc(OUT_BUF);
    2.18      printf("Saving blocks\n");
    2.19 -    while (ptr != NULL) {
    2.20 -        size_t c;
    2.21 -        c = pst_ff_getIDblock_dec(&pstfile, ptr->i_id, &buf);
    2.22 +    for (i = 0; i < pstfile.i_count; i++) {
    2.23 +        pst_index_ll *ptr = &pstfile.i_table[i];
    2.24 +        size_t c = pst_ff_getIDblock_dec(&pstfile, ptr->i_id, &buf);
    2.25          if (c) {
    2.26              snprintf(outname, OUT_BUF, "%#"PRIx64, ptr->i_id);
    2.27              if ((fp = fopen(outname, "wb")) == NULL) {
    2.28 @@ -67,7 +66,6 @@
    2.29          } else {
    2.30              printf("Failed to read block i_id %#"PRIx64"\n", ptr->i_id);
    2.31          }
    2.32 -        ptr = ptr->next;
    2.33      }
    2.34      pst_close(&pstfile);
    2.35      DEBUG_RET();
     3.1 --- a/src/getidblock.c	Wed Jul 06 10:20:12 2016 -0700
     3.2 +++ b/src/getidblock.c	Wed Jul 06 10:21:08 2016 -0700
     3.3 @@ -130,10 +130,9 @@
     3.4          dumper(i_id);
     3.5      }
     3.6      else {
     3.7 -        pst_index_ll *ptr = pstfile.i_head;
     3.8 -        while (ptr) {
     3.9 -            dumper(ptr->i_id);
    3.10 -            ptr = ptr->next;
    3.11 +        size_t i;
    3.12 +        for (i = 0; i < pstfile.i_count; i++) {
    3.13 +            dumper(pstfile.i_table[i].i_id);
    3.14          }
    3.15          dump_desc(pstfile.d_head, NULL);
    3.16      }
     4.1 --- a/src/libpst.c	Wed Jul 06 10:20:12 2016 -0700
     4.2 +++ b/src/libpst.c	Wed Jul 06 10:21:08 2016 -0700
     4.3 @@ -268,7 +268,6 @@
     4.4  static void             pst_free_attach(pst_item_attach *attach);
     4.5  static void             pst_free_desc (pst_desc_tree *head);
     4.6  static void             pst_free_id2(pst_id2_tree * head);
     4.7 -static void             pst_free_id (pst_index_ll *head);
     4.8  static void             pst_free_list(pst_mapi_object *list);
     4.9  static void             pst_free_xattrib(pst_x_attrib_ll *x);
    4.10  static size_t           pst_getAtPos(pst_file *pf, int64_t pos, void* buf, size_t size);
    4.11 @@ -404,8 +403,8 @@
    4.12      // free the paths
    4.13      free(pf->cwd);
    4.14      free(pf->fname);
    4.15 -    // we must free the id linklist and the desc tree
    4.16 -    pst_free_id(pf->i_head);
    4.17 +    // we must free the id array and the desc tree
    4.18 +    free(pf->i_table);
    4.19      pst_free_desc(pf->d_head);
    4.20      pst_free_xattrib(pf->x_head);
    4.21      DEBUG_RET();
    4.22 @@ -1043,15 +1042,15 @@
    4.23                  return -1;
    4.24              }
    4.25              old = index.id;
    4.26 -            i_ptr = (pst_index_ll*) pst_malloc(sizeof(pst_index_ll));
    4.27 +            if (pf->i_count == pf->i_capacity) {
    4.28 +                pf->i_capacity += (pf->i_capacity >> 1) + 16; // arbitrary growth rate
    4.29 +                pf->i_table = pst_realloc(pf->i_table, pf->i_capacity * sizeof(pst_index_ll));
    4.30 +            }
    4.31 +            i_ptr = &pf->i_table[pf->i_count++];
    4.32              i_ptr->i_id   = index.id;
    4.33              i_ptr->offset = index.offset;
    4.34              i_ptr->u1     = index.u1;
    4.35              i_ptr->size   = index.size;
    4.36 -            i_ptr->next   = NULL;
    4.37 -            if (pf->i_tail)  pf->i_tail->next = i_ptr;
    4.38 -            if (!pf->i_head) pf->i_head = i_ptr;
    4.39 -            pf->i_tail = i_ptr;
    4.40          }
    4.41      } else {
    4.42          // this node contains node pointers
    4.43 @@ -3210,18 +3209,6 @@
    4.44  }
    4.45  
    4.46  
    4.47 -static void pst_free_id (pst_index_ll *head) {
    4.48 -    pst_index_ll *t;
    4.49 -    DEBUG_ENT("pst_free_id");
    4.50 -    while (head) {
    4.51 -        t = head->next;
    4.52 -        free(head);
    4.53 -        head = t;
    4.54 -    }
    4.55 -    DEBUG_RET();
    4.56 -}
    4.57 -
    4.58 -
    4.59  static void pst_free_desc (pst_desc_tree *head) {
    4.60      pst_desc_tree *t;
    4.61      DEBUG_ENT("pst_free_desc");
    4.62 @@ -3620,6 +3607,13 @@
    4.63  }
    4.64  
    4.65  
    4.66 +static int pst_getID_compare(const void *key, const void *entry) {
    4.67 +    uint64_t key_id = *(const uint64_t*)key;
    4.68 +    uint64_t entry_id = ((const pst_index_ll*)entry)->i_id;
    4.69 +    return (key_id > entry_id) - (key_id < entry_id);
    4.70 +}
    4.71 +
    4.72 +
    4.73  /** */
    4.74  pst_index_ll* pst_getID(pst_file* pf, uint64_t i_id) {
    4.75      pst_index_ll *ptr;
    4.76 @@ -3634,10 +3628,7 @@
    4.77      i_id -= (i_id & 1);
    4.78  
    4.79      DEBUG_INFO(("Trying to find %#"PRIx64"\n", i_id));
    4.80 -    ptr = pf->i_head;
    4.81 -    while (ptr && (ptr->i_id != i_id)) {
    4.82 -        ptr = ptr->next;
    4.83 -    }
    4.84 +    ptr = bsearch(&i_id, pf->i_table, pf->i_count, sizeof *pf->i_table, pst_getID_compare);
    4.85      if (ptr) {DEBUG_INFO(("Found Value %#"PRIx64"\n", i_id));            }
    4.86      else     {DEBUG_INFO(("ERROR: Value %#"PRIx64" not found\n", i_id)); }
    4.87      DEBUG_RET();
     5.1 --- a/src/libpst.h	Wed Jul 06 10:20:12 2016 -0700
     5.2 +++ b/src/libpst.h	Wed Jul 06 10:21:08 2016 -0700
     5.3 @@ -105,7 +105,6 @@
     5.4      uint64_t offset;
     5.5      uint64_t size;
     5.6      int64_t  u1;
     5.7 -    struct pst_index_ll *next;
     5.8  } pst_index_ll;
     5.9  
    5.10  
    5.11 @@ -894,8 +893,9 @@
    5.12      char*   fname;
    5.13      /** default character set for items without one */
    5.14      const char*   charset;
    5.15 -    /** the head and tail of the linked list of index structures */
    5.16 -    pst_index_ll *i_head, *i_tail;
    5.17 +    /** the array of index structures */
    5.18 +    pst_index_ll *i_table;
    5.19 +    size_t i_count, i_capacity;
    5.20      /** the head and tail of the top level of the descriptor tree */
    5.21      pst_desc_tree  *d_head, *d_tail;
    5.22      /** the head of the extended attributes linked list */