comparison src/readpst.c @ 345:a8577226f7a9

fixes from AJ Shankar for attachment processing and body encodings that contain embedded null chars
author Carl Byington <carl@five-ten-sg.com>
date Mon, 09 Mar 2015 10:47:58 -0700
parents aedcf979f439
children a57c15b3108a
comparison
equal deleted inserted replaced
344:aedcf979f439 345:a8577226f7a9
52 void header_has_field(char *header, char *field, int *flag); 52 void header_has_field(char *header, char *field, int *flag);
53 void header_get_subfield(char *field, const char *subfield, char *body_subfield, size_t size_subfield); 53 void header_get_subfield(char *field, const char *subfield, char *body_subfield, size_t size_subfield);
54 char* header_get_field(char *header, char *field); 54 char* header_get_field(char *header, char *field);
55 char* header_end_field(char *field); 55 char* header_end_field(char *field);
56 void header_strip_field(char *header, char *field); 56 void header_strip_field(char *header, char *field);
57 int test_base64(char *body); 57 int test_base64(char *body, size_t len);
58 void find_html_charset(char *html, char *charset, size_t charsetlen); 58 void find_html_charset(char *html, char *charset, size_t charsetlen);
59 void find_rfc822_headers(char** extra_mime_headers); 59 void find_rfc822_headers(char** extra_mime_headers);
60 void write_body_part(FILE* f_output, pst_string *body, char *mime, char *charset, char *boundary, pst_file* pst); 60 void write_body_part(FILE* f_output, pst_string *body, char *mime, char *charset, char *boundary, pst_file* pst);
61 void write_schedule_part_data(FILE* f_output, pst_item* item, const char* sender, const char* method); 61 void write_schedule_part_data(FILE* f_output, pst_item* item, const char* sender, const char* method);
62 void write_schedule_part(FILE* f_output, pst_item* item, const char* sender, const char* boundary); 62 void write_schedule_part(FILE* f_output, pst_item* item, const char* sender, const char* boundary);
130 int contact_mode = CMODE_VCARD; 130 int contact_mode = CMODE_VCARD;
131 int deleted_mode = DMODE_EXCLUDE; 131 int deleted_mode = DMODE_EXCLUDE;
132 int output_type_mode = 0xff; // Default to all. 132 int output_type_mode = 0xff; // Default to all.
133 int contact_mode_specified = 0; 133 int contact_mode_specified = 0;
134 int overwrite = 0; 134 int overwrite = 0;
135 int prefer_utf8 = 0;
135 int save_rtf_body = 1; 136 int save_rtf_body = 1;
136 int file_name_len = 10; // enough room for MODE_SEPARATE file name 137 int file_name_len = 10; // enough room for MODE_SEPARATE file name
137 pst_file pstfile; 138 pst_file pstfile;
138 regex_t meta_charset_pattern; 139 regex_t meta_charset_pattern;
139 char* default_charset = NULL; 140 char* default_charset = NULL;
450 printf("cannot compile regex pattern to find content charset in html bodies\n"); 451 printf("cannot compile regex pattern to find content charset in html bodies\n");
451 exit(3); 452 exit(3);
452 } 453 }
453 454
454 // command-line option handling 455 // command-line option handling
455 while ((c = getopt(argc, argv, "a:bC:c:Dd:emhj:kMo:qrSt:uVw"))!= -1) { 456 while ((c = getopt(argc, argv, "a:bC:c:Dd:emhj:kMo:qrSt:uVw8"))!= -1) {
456 switch (c) { 457 switch (c) {
457 case 'a': 458 case 'a':
458 if (optarg) { 459 if (optarg) {
459 int n = strlen(optarg); 460 int n = strlen(optarg);
460 acceptable_extensions = (char*)pst_malloc(n+2); 461 acceptable_extensions = (char*)pst_malloc(n+2);
584 version(); 585 version();
585 exit(0); 586 exit(0);
586 break; 587 break;
587 case 'w': 588 case 'w':
588 overwrite = 1; 589 overwrite = 1;
590 break;
591 case '8':
592 prefer_utf8 = 1;
589 break; 593 break;
590 default: 594 default:
591 usage(); 595 usage();
592 exit(1); 596 exit(1);
593 break; 597 break;
756 printf("\t-q\t- Quiet. Only print error messages\n"); 760 printf("\t-q\t- Quiet. Only print error messages\n");
757 printf("\t-r\t- Recursive. Output in a recursive format\n"); 761 printf("\t-r\t- Recursive. Output in a recursive format\n");
758 printf("\t-t[eajc]\t- Set the output type list. e = email, a = attachment, j = journal, c = contact\n"); 762 printf("\t-t[eajc]\t- Set the output type list. e = email, a = attachment, j = journal, c = contact\n");
759 printf("\t-u\t- Thunderbird mode. Write two extra .size and .type files\n"); 763 printf("\t-u\t- Thunderbird mode. Write two extra .size and .type files\n");
760 printf("\t-w\t- Overwrite any output mbox files\n"); 764 printf("\t-w\t- Overwrite any output mbox files\n");
765 printf("\t-8\t- Output bodies in UTF-8, rather than original encoding, if UTF-8 version is available\n");
761 printf("\n"); 766 printf("\n");
762 printf("Only one of -M -S -e -k -m -r should be specified\n"); 767 printf("Only one of -M -S -e -k -m -r should be specified\n");
763 DEBUG_RET(); 768 DEBUG_RET();
764 } 769 }
765 770
1265 if (e && f && (f < e)) e = f; 1270 if (e && f && (f < e)) e = f;
1266 } 1271 }
1267 if (!e || (e > n)) e = n; // use the trailing lf as terminator if nothing better 1272 if (!e || (e > n)) e = n; // use the trailing lf as terminator if nothing better
1268 save = *e; 1273 save = *e;
1269 *e = '\0'; 1274 *e = '\0';
1270 snprintf(body_subfield, size_subfield, "%s", s); // copy the subfield to our buffer 1275 snprintf(body_subfield, size_subfield, "%s", s); // copy the subfield to our buffer
1271 *e = save; 1276 *e = save;
1272 DEBUG_INFO(("body %s %s from headers\n", subfield, body_subfield)); 1277 DEBUG_INFO(("body %s %s from headers\n", subfield, body_subfield));
1273 } 1278 }
1274 DEBUG_RET(); 1279 DEBUG_RET();
1275 } 1280 }
1314 } 1319 }
1315 } 1320 }
1316 } 1321 }
1317 1322
1318 1323
1319 int test_base64(char *body) 1324 int test_base64(char *body, size_t len)
1320 { 1325 {
1321 int b64 = 0; 1326 int b64 = 0;
1322 uint8_t *b = (uint8_t *)body; 1327 uint8_t *b = (uint8_t *)body;
1323 DEBUG_ENT("test_base64"); 1328 DEBUG_ENT("test_base64");
1324 while (*b) { 1329 while (len--) {
1325 if ((*b < 32) && (*b != 9) && (*b != 10)) { 1330 if ((*b < 32) && (*b != 9) && (*b != 10)) {
1326 DEBUG_INFO(("found base64 byte %d\n", (int)*b)); 1331 DEBUG_INFO(("found base64 byte %d\n", (int)*b));
1327 DEBUG_HEXDUMPC(body, strlen(body), 0x10); 1332 DEBUG_HEXDUMPC(body, strlen(body), 0x10);
1328 b64 = 1; 1333 b64 = 1;
1329 break; 1334 break;
1399 1404
1400 1405
1401 void write_body_part(FILE* f_output, pst_string *body, char *mime, char *charset, char *boundary, pst_file* pst) 1406 void write_body_part(FILE* f_output, pst_string *body, char *mime, char *charset, char *boundary, pst_file* pst)
1402 { 1407 {
1403 DEBUG_ENT("write_body_part"); 1408 DEBUG_ENT("write_body_part");
1409 removeCR(body->str);
1410 size_t body_len = strlen(body->str);
1411
1404 if (body->is_utf8 && (strcasecmp("utf-8", charset))) { 1412 if (body->is_utf8 && (strcasecmp("utf-8", charset))) {
1405 // try to convert to the specified charset since the target 1413 if (prefer_utf8) {
1406 // is not utf-8, and the data came from a unicode (utf16) field
1407 // and is now in utf-8.
1408 size_t rc;
1409 DEBUG_INFO(("Convert %s utf-8 to %s\n", mime, charset));
1410 pst_vbuf *newer = pst_vballoc(2);
1411 rc = pst_vb_utf8to8bit(newer, body->str, strlen(body->str), charset);
1412 if (rc == (size_t)-1) {
1413 // unable to convert, change the charset to utf8
1414 free(newer->b);
1415 DEBUG_INFO(("Failed to convert %s utf-8 to %s\n", mime, charset));
1416 charset = "utf-8"; 1414 charset = "utf-8";
1417 } 1415 } else {
1418 else { 1416 // try to convert to the specified charset since the target
1419 // null terminate the output string 1417 // is not utf-8, and the data came from a unicode (utf16) field
1420 pst_vbgrow(newer, 1); 1418 // and is now in utf-8.
1421 newer->b[newer->dlen] = '\0'; 1419 size_t rc;
1422 free(body->str); 1420 DEBUG_INFO(("Convert %s utf-8 to %s\n", mime, charset));
1423 body->str = newer->b; 1421 pst_vbuf *newer = pst_vballoc(2);
1424 } 1422 rc = pst_vb_utf8to8bit(newer, body->str, body_len, charset);
1425 free(newer); 1423 if (rc == (size_t)-1) {
1426 } 1424 // unable to convert, change the charset to utf8
1427 removeCR(body->str); 1425 free(newer->b);
1428 int base64 = test_base64(body->str); 1426 DEBUG_INFO(("Failed to convert %s utf-8 to %s\n", mime, charset));
1427 charset = "utf-8";
1428 } else {
1429 // null terminate the output string
1430 pst_vbgrow(newer, 1);
1431 newer->b[newer->dlen] = '\0';
1432 free(body->str);
1433 body->str = newer->b;
1434 body_len = newer->dlen;
1435 }
1436 free(newer);
1437 }
1438 }
1439 int base64 = test_base64(body->str, body_len);
1429 fprintf(f_output, "\n--%s\n", boundary); 1440 fprintf(f_output, "\n--%s\n", boundary);
1430 fprintf(f_output, "Content-Type: %s; charset=\"%s\"\n", mime, charset); 1441 fprintf(f_output, "Content-Type: %s; charset=\"%s\"\n", mime, charset);
1431 if (base64) fprintf(f_output, "Content-Transfer-Encoding: base64\n"); 1442 if (base64) fprintf(f_output, "Content-Transfer-Encoding: base64\n");
1432 fprintf(f_output, "\n"); 1443 fprintf(f_output, "\n");
1444 // Any body whose encoding contains embedded NUL bytes (e.g. UTF-16) will be base64-encoded here.
1433 if (base64) { 1445 if (base64) {
1434 char *enc = pst_base64_encode(body->str, strlen(body->str)); 1446 char *enc = pst_base64_encode(body->str, body_len);
1435 if (enc) { 1447 if (enc) {
1436 write_email_body(f_output, enc); 1448 write_email_body(f_output, enc);
1437 fprintf(f_output, "\n"); 1449 fprintf(f_output, "\n");
1438 free(enc); 1450 free(enc);
1439 } 1451 }
1533 em_time = pst_fileTimeToUnixTime(item->email->sent_date); 1545 em_time = pst_fileTimeToUnixTime(item->email->sent_date);
1534 c_time = ctime(&em_time); 1546 c_time = ctime(&em_time);
1535 if (c_time) 1547 if (c_time)
1536 c_time[strlen(c_time)-1] = '\0'; //remove end \n 1548 c_time[strlen(c_time)-1] = '\0'; //remove end \n
1537 else 1549 else
1538 c_time = "Fri Dec 28 12:06:21 2001"; 1550 c_time = "Thu Jan 1 00:00:00 1970";
1539 } else 1551 } else
1540 c_time = "Fri Dec 28 12:06:21 2001"; 1552 c_time = "Thu Jan 1 00:00:00 1970";
1541 1553
1542 // create our MIME boundaries here. 1554 // create our MIME boundaries here.
1543 snprintf(boundary, sizeof(boundary), "--boundary-LibPST-iamunique-%i_-_-", rand()); 1555 snprintf(boundary, sizeof(boundary), "--boundary-LibPST-iamunique-%i_-_-", rand());
1544 snprintf(altboundary, sizeof(altboundary), "alt-%s", boundary); 1556 snprintf(altboundary, sizeof(altboundary), "alt-%s", boundary);
1545 1557