Mercurial > libpst
comparison src/readpst.c @ 345:a8577226f7a9
Fixes from AJ Shankar for attachment processing and for body encodings that contain embedded null characters
author | Carl Byington <carl@five-ten-sg.com> |
---|---|
date | Mon, 09 Mar 2015 10:47:58 -0700 |
parents | aedcf979f439 |
children | a57c15b3108a |
comparison
equal
deleted
inserted
replaced
344:aedcf979f439 | 345:a8577226f7a9 |
---|---|
52 void header_has_field(char *header, char *field, int *flag); | 52 void header_has_field(char *header, char *field, int *flag); |
53 void header_get_subfield(char *field, const char *subfield, char *body_subfield, size_t size_subfield); | 53 void header_get_subfield(char *field, const char *subfield, char *body_subfield, size_t size_subfield); |
54 char* header_get_field(char *header, char *field); | 54 char* header_get_field(char *header, char *field); |
55 char* header_end_field(char *field); | 55 char* header_end_field(char *field); |
56 void header_strip_field(char *header, char *field); | 56 void header_strip_field(char *header, char *field); |
57 int test_base64(char *body); | 57 int test_base64(char *body, size_t len); |
58 void find_html_charset(char *html, char *charset, size_t charsetlen); | 58 void find_html_charset(char *html, char *charset, size_t charsetlen); |
59 void find_rfc822_headers(char** extra_mime_headers); | 59 void find_rfc822_headers(char** extra_mime_headers); |
60 void write_body_part(FILE* f_output, pst_string *body, char *mime, char *charset, char *boundary, pst_file* pst); | 60 void write_body_part(FILE* f_output, pst_string *body, char *mime, char *charset, char *boundary, pst_file* pst); |
61 void write_schedule_part_data(FILE* f_output, pst_item* item, const char* sender, const char* method); | 61 void write_schedule_part_data(FILE* f_output, pst_item* item, const char* sender, const char* method); |
62 void write_schedule_part(FILE* f_output, pst_item* item, const char* sender, const char* boundary); | 62 void write_schedule_part(FILE* f_output, pst_item* item, const char* sender, const char* boundary); |
130 int contact_mode = CMODE_VCARD; | 130 int contact_mode = CMODE_VCARD; |
131 int deleted_mode = DMODE_EXCLUDE; | 131 int deleted_mode = DMODE_EXCLUDE; |
132 int output_type_mode = 0xff; // Default to all. | 132 int output_type_mode = 0xff; // Default to all. |
133 int contact_mode_specified = 0; | 133 int contact_mode_specified = 0; |
134 int overwrite = 0; | 134 int overwrite = 0; |
135 int prefer_utf8 = 0; | |
135 int save_rtf_body = 1; | 136 int save_rtf_body = 1; |
136 int file_name_len = 10; // enough room for MODE_SPEARATE file name | 137 int file_name_len = 10; // enough room for MODE_SPEARATE file name |
137 pst_file pstfile; | 138 pst_file pstfile; |
138 regex_t meta_charset_pattern; | 139 regex_t meta_charset_pattern; |
139 char* default_charset = NULL; | 140 char* default_charset = NULL; |
450 printf("cannot compile regex pattern to find content charset in html bodies\n"); | 451 printf("cannot compile regex pattern to find content charset in html bodies\n"); |
451 exit(3); | 452 exit(3); |
452 } | 453 } |
453 | 454 |
454 // command-line option handling | 455 // command-line option handling |
455 while ((c = getopt(argc, argv, "a:bC:c:Dd:emhj:kMo:qrSt:uVw"))!= -1) { | 456 while ((c = getopt(argc, argv, "a:bC:c:Dd:emhj:kMo:qrSt:uVw8"))!= -1) { |
456 switch (c) { | 457 switch (c) { |
457 case 'a': | 458 case 'a': |
458 if (optarg) { | 459 if (optarg) { |
459 int n = strlen(optarg); | 460 int n = strlen(optarg); |
460 acceptable_extensions = (char*)pst_malloc(n+2); | 461 acceptable_extensions = (char*)pst_malloc(n+2); |
584 version(); | 585 version(); |
585 exit(0); | 586 exit(0); |
586 break; | 587 break; |
587 case 'w': | 588 case 'w': |
588 overwrite = 1; | 589 overwrite = 1; |
590 break; | |
591 case '8': | |
592 prefer_utf8 = 1; | |
589 break; | 593 break; |
590 default: | 594 default: |
591 usage(); | 595 usage(); |
592 exit(1); | 596 exit(1); |
593 break; | 597 break; |
756 printf("\t-q\t- Quiet. Only print error messages\n"); | 760 printf("\t-q\t- Quiet. Only print error messages\n"); |
757 printf("\t-r\t- Recursive. Output in a recursive format\n"); | 761 printf("\t-r\t- Recursive. Output in a recursive format\n"); |
758 printf("\t-t[eajc]\t- Set the output type list. e = email, a = attachment, j = journal, c = contact\n"); | 762 printf("\t-t[eajc]\t- Set the output type list. e = email, a = attachment, j = journal, c = contact\n"); |
759 printf("\t-u\t- Thunderbird mode. Write two extra .size and .type files\n"); | 763 printf("\t-u\t- Thunderbird mode. Write two extra .size and .type files\n"); |
760 printf("\t-w\t- Overwrite any output mbox files\n"); | 764 printf("\t-w\t- Overwrite any output mbox files\n"); |
765 printf("\t-8\t- Output bodies in UTF-8, rather than original encoding, if UTF-8 version is available\n"); | |
761 printf("\n"); | 766 printf("\n"); |
762 printf("Only one of -M -S -e -k -m -r should be specified\n"); | 767 printf("Only one of -M -S -e -k -m -r should be specified\n"); |
763 DEBUG_RET(); | 768 DEBUG_RET(); |
764 } | 769 } |
765 | 770 |
1265 if (e && f && (f < e)) e = f; | 1270 if (e && f && (f < e)) e = f; |
1266 } | 1271 } |
1267 if (!e || (e > n)) e = n; // use the trailing lf as terminator if nothing better | 1272 if (!e || (e > n)) e = n; // use the trailing lf as terminator if nothing better |
1268 save = *e; | 1273 save = *e; |
1269 *e = '\0'; | 1274 *e = '\0'; |
1270 snprintf(body_subfield, size_subfield, "%s", s); // copy the subfield to our buffer | 1275 snprintf(body_subfield, size_subfield, "%s", s); // copy the subfield to our buffer |
1271 *e = save; | 1276 *e = save; |
1272 DEBUG_INFO(("body %s %s from headers\n", subfield, body_subfield)); | 1277 DEBUG_INFO(("body %s %s from headers\n", subfield, body_subfield)); |
1273 } | 1278 } |
1274 DEBUG_RET(); | 1279 DEBUG_RET(); |
1275 } | 1280 } |
1314 } | 1319 } |
1315 } | 1320 } |
1316 } | 1321 } |
1317 | 1322 |
1318 | 1323 |
1319 int test_base64(char *body) | 1324 int test_base64(char *body, size_t len) |
1320 { | 1325 { |
1321 int b64 = 0; | 1326 int b64 = 0; |
1322 uint8_t *b = (uint8_t *)body; | 1327 uint8_t *b = (uint8_t *)body; |
1323 DEBUG_ENT("test_base64"); | 1328 DEBUG_ENT("test_base64"); |
1324 while (*b) { | 1329 while (len--) { |
1325 if ((*b < 32) && (*b != 9) && (*b != 10)) { | 1330 if ((*b < 32) && (*b != 9) && (*b != 10)) { |
1326 DEBUG_INFO(("found base64 byte %d\n", (int)*b)); | 1331 DEBUG_INFO(("found base64 byte %d\n", (int)*b)); |
1327 DEBUG_HEXDUMPC(body, strlen(body), 0x10); | 1332 DEBUG_HEXDUMPC(body, strlen(body), 0x10); |
1328 b64 = 1; | 1333 b64 = 1; |
1329 break; | 1334 break; |
1399 | 1404 |
1400 | 1405 |
1401 void write_body_part(FILE* f_output, pst_string *body, char *mime, char *charset, char *boundary, pst_file* pst) | 1406 void write_body_part(FILE* f_output, pst_string *body, char *mime, char *charset, char *boundary, pst_file* pst) |
1402 { | 1407 { |
1403 DEBUG_ENT("write_body_part"); | 1408 DEBUG_ENT("write_body_part"); |
1409 removeCR(body->str); | |
1410 size_t body_len = strlen(body->str); | |
1411 | |
1404 if (body->is_utf8 && (strcasecmp("utf-8", charset))) { | 1412 if (body->is_utf8 && (strcasecmp("utf-8", charset))) { |
1405 // try to convert to the specified charset since the target | 1413 if (prefer_utf8) { |
1406 // is not utf-8, and the data came from a unicode (utf16) field | |
1407 // and is now in utf-8. | |
1408 size_t rc; | |
1409 DEBUG_INFO(("Convert %s utf-8 to %s\n", mime, charset)); | |
1410 pst_vbuf *newer = pst_vballoc(2); | |
1411 rc = pst_vb_utf8to8bit(newer, body->str, strlen(body->str), charset); | |
1412 if (rc == (size_t)-1) { | |
1413 // unable to convert, change the charset to utf8 | |
1414 free(newer->b); | |
1415 DEBUG_INFO(("Failed to convert %s utf-8 to %s\n", mime, charset)); | |
1416 charset = "utf-8"; | 1414 charset = "utf-8"; |
1417 } | 1415 } else { |
1418 else { | 1416 // try to convert to the specified charset since the target |
1419 // null terminate the output string | 1417 // is not utf-8, and the data came from a unicode (utf16) field |
1420 pst_vbgrow(newer, 1); | 1418 // and is now in utf-8. |
1421 newer->b[newer->dlen] = '\0'; | 1419 size_t rc; |
1422 free(body->str); | 1420 DEBUG_INFO(("Convert %s utf-8 to %s\n", mime, charset)); |
1423 body->str = newer->b; | 1421 pst_vbuf *newer = pst_vballoc(2); |
1424 } | 1422 rc = pst_vb_utf8to8bit(newer, body->str, body_len, charset); |
1425 free(newer); | 1423 if (rc == (size_t)-1) { |
1426 } | 1424 // unable to convert, change the charset to utf8 |
1427 removeCR(body->str); | 1425 free(newer->b); |
1428 int base64 = test_base64(body->str); | 1426 DEBUG_INFO(("Failed to convert %s utf-8 to %s\n", mime, charset)); |
1427 charset = "utf-8"; | |
1428 } else { | |
1429 // null terminate the output string | |
1430 pst_vbgrow(newer, 1); | |
1431 newer->b[newer->dlen] = '\0'; | |
1432 free(body->str); | |
1433 body->str = newer->b; | |
1434 body_len = newer->dlen; | |
1435 } | |
1436 free(newer); | |
1437 } | |
1438 } | |
1439 int base64 = test_base64(body->str, body_len); | |
1429 fprintf(f_output, "\n--%s\n", boundary); | 1440 fprintf(f_output, "\n--%s\n", boundary); |
1430 fprintf(f_output, "Content-Type: %s; charset=\"%s\"\n", mime, charset); | 1441 fprintf(f_output, "Content-Type: %s; charset=\"%s\"\n", mime, charset); |
1431 if (base64) fprintf(f_output, "Content-Transfer-Encoding: base64\n"); | 1442 if (base64) fprintf(f_output, "Content-Transfer-Encoding: base64\n"); |
1432 fprintf(f_output, "\n"); | 1443 fprintf(f_output, "\n"); |
1444 // Any body that uses an encoding with NULLs, e.g. UTF16, will be base64-encoded here. | |
1433 if (base64) { | 1445 if (base64) { |
1434 char *enc = pst_base64_encode(body->str, strlen(body->str)); | 1446 char *enc = pst_base64_encode(body->str, body_len); |
1435 if (enc) { | 1447 if (enc) { |
1436 write_email_body(f_output, enc); | 1448 write_email_body(f_output, enc); |
1437 fprintf(f_output, "\n"); | 1449 fprintf(f_output, "\n"); |
1438 free(enc); | 1450 free(enc); |
1439 } | 1451 } |
1533 em_time = pst_fileTimeToUnixTime(item->email->sent_date); | 1545 em_time = pst_fileTimeToUnixTime(item->email->sent_date); |
1534 c_time = ctime(&em_time); | 1546 c_time = ctime(&em_time); |
1535 if (c_time) | 1547 if (c_time) |
1536 c_time[strlen(c_time)-1] = '\0'; //remove end \n | 1548 c_time[strlen(c_time)-1] = '\0'; //remove end \n |
1537 else | 1549 else |
1538 c_time = "Fri Dec 28 12:06:21 2001"; | 1550 c_time = "Thu Jan 1 00:00:00 1970"; |
1539 } else | 1551 } else |
1540 c_time = "Fri Dec 28 12:06:21 2001"; | 1552 c_time = "Thu Jan 1 00:00:00 1970"; |
1541 | 1553 |
1542 // create our MIME boundaries here. | 1554 // create our MIME boundaries here. |
1543 snprintf(boundary, sizeof(boundary), "--boundary-LibPST-iamunique-%i_-_-", rand()); | 1555 snprintf(boundary, sizeof(boundary), "--boundary-LibPST-iamunique-%i_-_-", rand()); |
1544 snprintf(altboundary, sizeof(altboundary), "alt-%s", boundary); | 1556 snprintf(altboundary, sizeof(altboundary), "alt-%s", boundary); |
1545 | 1557 |