Mercurial > libpst
comparison src/readpst.c @ 142:2189a6b8134e
Improve character set handling: don't try to convert UTF-8 to a single-byte character set for fields that were not originally Unicode.
If the conversion fails, leave the data in UTF-8.
author | Carl Byington <carl@five-ten-sg.com> |
---|---|
date | Mon, 23 Feb 2009 20:40:51 -0800 |
parents | fd4297884319 |
children | fdc58ad2c758 |
comparison
equal
deleted
inserted
replaced
141:fd4297884319 | 142:2189a6b8134e |
---|---|
49 char* header_end_field(char *field); | 49 char* header_end_field(char *field); |
50 void header_strip_field(char *header, char *field); | 50 void header_strip_field(char *header, char *field); |
51 int test_base64(char *body); | 51 int test_base64(char *body); |
52 void find_html_charset(char *html, char *charset, size_t charsetlen); | 52 void find_html_charset(char *html, char *charset, size_t charsetlen); |
53 void find_rfc822_headers(char** extra_mime_headers); | 53 void find_rfc822_headers(char** extra_mime_headers); |
54 void write_body_part(FILE* f_output, char *body, char *mime, char *charset, char *boundary); | 54 void write_body_part(FILE* f_output, char *body, int32_t body_was_unicode, char *mime, char *charset, char *boundary, pst_file* pst); |
55 void write_normal_email(FILE* f_output, char f_name[], pst_item* item, int mode, int mode_MH, pst_file* pst, int save_rtf, char** extra_mime_headers); | 55 void write_normal_email(FILE* f_output, char f_name[], pst_item* item, int mode, int mode_MH, pst_file* pst, int save_rtf, char** extra_mime_headers); |
56 void write_vcard(FILE* f_output, pst_item_contact* contact, char comment[]); | 56 void write_vcard(FILE* f_output, pst_item_contact* contact, char comment[]); |
57 void write_appointment(FILE* f_output, pst_item_appointment* appointment, | 57 void write_appointment(FILE* f_output, pst_item_appointment* appointment, |
58 pst_item_email* email, FILETIME* create_date, FILETIME* modify_date); | 58 pst_item_email* email, FILETIME* create_date, FILETIME* modify_date); |
59 void create_enter_dir(struct file_ll* f, pst_item *item); | 59 void create_enter_dir(struct file_ll* f, pst_item *item); |
134 if (!d_ptr->desc) { | 134 if (!d_ptr->desc) { |
135 DEBUG_WARN(("main: ERROR ?? item's desc record is NULL\n")); | 135 DEBUG_WARN(("main: ERROR ?? item's desc record is NULL\n")); |
136 ff.skip_count++; | 136 ff.skip_count++; |
137 } | 137 } |
138 else { | 138 else { |
139 DEBUG_MAIN(("main: Desc Email ID %#x [d_ptr->id = %#x]\n", d_ptr->desc->id, d_ptr->id)); | 139 DEBUG_MAIN(("main: Desc Email ID %#"PRIx64" [d_ptr->id = %#"PRIx64"]\n", d_ptr->desc->id, d_ptr->id)); |
140 | 140 |
141 item = pst_parse_item(&pstfile, d_ptr); | 141 item = pst_parse_item(&pstfile, d_ptr); |
142 DEBUG_MAIN(("main: About to process item\n")); | 142 DEBUG_MAIN(("main: About to process item\n")); |
143 if (item && item->email && item->email->subject && item->email->subject->subj) { | 143 if (item && item->email && item->email->subject && item->email->subject->subj) { |
144 DEBUG_EMAIL(("item->email->subject = %p\n", item->email->subject)); | 144 DEBUG_EMAIL(("item->email->subject->subj = %s\n", item->email->subject->subj)); |
145 DEBUG_EMAIL(("item->email->subject->subj = %p\n", item->email->subject->subj)); | |
146 } | 145 } |
147 if (item) { | 146 if (item) { |
148 if (item->folder && d_ptr->child && (deleted_mode == DMODE_INCLUDE || strcasecmp(item->file_as, "Deleted Items"))) { | 147 if (item->folder && d_ptr->child && (deleted_mode == DMODE_INCLUDE || strcasecmp(item->file_as, "Deleted Items"))) { |
149 //if this is a non-empty folder other than deleted items, we want to recurse into it | 148 //if this is a non-empty folder other than deleted items, we want to recurse into it |
150 if (output_mode != OUTPUT_QUIET) printf("Processing Folder \"%s\"\n", item->file_as); | 149 if (output_mode != OUTPUT_QUIET) printf("Processing Folder \"%s\"\n", item->file_as); |
764 void write_inline_attachment(FILE* f_output, pst_item_attach* attach, char *boundary, pst_file* pst) | 763 void write_inline_attachment(FILE* f_output, pst_item_attach* attach, char *boundary, pst_file* pst) |
765 { | 764 { |
766 char *attach_filename; | 765 char *attach_filename; |
767 char *enc = NULL; // base64 encoded attachment | 766 char *enc = NULL; // base64 encoded attachment |
768 DEBUG_ENT("write_inline_attachment"); | 767 DEBUG_ENT("write_inline_attachment"); |
769 DEBUG_EMAIL(("Attachment Size is %i\n", attach->size)); | 768 DEBUG_EMAIL(("Attachment Size is %i, pointer %p, id %d\n", attach->size, attach->data, attach->id_val)); |
770 DEBUG_EMAIL(("Attachment Pointer is %p\n", attach->data)); | |
771 if (attach->data) { | 769 if (attach->data) { |
772 enc = base64_encode (attach->data, attach->size); | 770 enc = base64_encode (attach->data, attach->size); |
773 if (!enc) { | 771 if (!enc) { |
774 DEBUG_EMAIL(("ERROR base64_encode returned NULL. Must have failed\n")); | 772 DEBUG_EMAIL(("ERROR base64_encode returned NULL. Must have failed\n")); |
773 DEBUG_RET(); | |
774 return; | |
775 } | |
776 } | |
777 else { | |
778 // make sure we can fetch data from the id | |
779 pst_index_ll *ptr = pst_getID(pst, attach->id_val); | |
780 if (!ptr) { | |
781 DEBUG_WARN(("Couldn't find ID pointer. Cannot save attachment to file\n")); | |
775 DEBUG_RET(); | 782 DEBUG_RET(); |
776 return; | 783 return; |
777 } | 784 } |
778 } | 785 } |
779 | 786 |
962 DEBUG_EMAIL(("found 822 headers\n%s\n", headers)); | 969 DEBUG_EMAIL(("found 822 headers\n%s\n", headers)); |
963 break; | 970 break; |
964 } | 971 } |
965 } | 972 } |
966 } | 973 } |
967 DEBUG_EMAIL(("skipping to next block after\n%s\n", headers)); | 974 //DEBUG_EMAIL(("skipping to next block after\n%s\n", headers)); |
968 headers = temp+2; // skip to next chunk of headers | 975 headers = temp+2; // skip to next chunk of headers |
969 } | 976 } |
970 *extra_mime_headers = headers; | 977 *extra_mime_headers = headers; |
971 } | 978 } |
972 DEBUG_RET(); | 979 DEBUG_RET(); |
973 } | 980 } |
974 | 981 |
975 | 982 |
976 void write_body_part(FILE* f_output, char *body, char *mime, char *charset, char *boundary) | 983 void write_body_part(FILE* f_output, char *body, int32_t body_was_unicode, char *mime, char *charset, char *boundary, pst_file* pst) |
977 { | 984 { |
978 char *needfree = NULL; | 985 char *needfree = NULL; |
979 DEBUG_ENT("write_body_part"); | 986 DEBUG_ENT("write_body_part"); |
980 if (strcasecmp("utf-8", charset)) { | 987 if (body_was_unicode && (strcasecmp("utf-8", charset))) { |
981 // try to convert to the specified charset since it is not utf-8 | 988 // try to convert to the specified charset since the target |
989 // is not utf-8, and the data came from a unicode (utf16) field | |
990 // and is now in utf-8. | |
982 size_t rc; | 991 size_t rc; |
983 DEBUG_EMAIL(("Convert %s utf-8 to %s\n", mime, charset)); | 992 DEBUG_EMAIL(("Convert %s utf-8 to %s\n", mime, charset)); |
984 vbuf *newer = vballoc(2); | 993 vbuf *newer = vballoc(2); |
985 rc = vb_utf8to8bit(newer, body, strlen(body) + 1, charset); | 994 rc = vb_utf8to8bit(newer, body, strlen(body) + 1, charset); |
986 if (rc == (size_t)-1) { | 995 if (rc == (size_t)-1) { |
987 // unable to convert, maybe it is already in that character set | 996 // unable to convert, change the charset to utf8 |
988 free(newer->b); | 997 free(newer->b); |
989 DEBUG_EMAIL(("Failed to convert %s utf-8 to %s\n", mime, charset)); | 998 DEBUG_EMAIL(("Failed to convert %s utf-8 to %s\n", mime, charset)); |
999 charset = "utf-8"; | |
990 } | 1000 } |
991 else { | 1001 else { |
992 needfree = body = newer->b; | 1002 needfree = body = newer->b; |
993 } | 1003 } |
994 free(newer); | 1004 free(newer); |
1010 else { | 1020 else { |
1011 write_email_body(f_output, body); | 1021 write_email_body(f_output, body); |
1012 } | 1022 } |
1013 if (needfree) free(needfree); | 1023 if (needfree) free(needfree); |
1014 DEBUG_RET(); | 1024 DEBUG_RET(); |
1025 } | |
1026 | |
1027 | |
1028 const char* codepage(int cp) { | |
1029 static char buffer[20]; | |
1030 switch (cp) { | |
1031 case 932 : return "iso-2022-jp"; | |
1032 case 936 : return "gb2313"; | |
1033 case 950 : return "big5"; | |
1034 case 20127 : return "us-ascii"; | |
1035 case 20269 : return "iso-6937"; | |
1036 case 20865 : return "iso-8859-15"; | |
1037 case 20866 : return "koi8-r"; | |
1038 case 21866 : return "koi8-u"; | |
1039 case 28591 : return "iso-8859-1"; | |
1040 case 28592 : return "iso-8859-2"; | |
1041 case 28595 : return "iso-8859-5"; | |
1042 case 28596 : return "iso-8859-6"; | |
1043 case 28597 : return "iso-8859-7"; | |
1044 case 28598 : return "iso-8859-8"; | |
1045 case 28599 : return "iso-8859-9"; | |
1046 case 50220 : return "iso-2022-jp"; | |
1047 case 50221 : return "csiso2022jp"; | |
1048 case 51932 : return "euc-jp"; | |
1049 case 51949 : return "euc-kr"; | |
1050 case 65000 : return "utf-7"; | |
1051 case 65001 : return "utf-8"; | |
1052 default : | |
1053 snprintf(buffer, sizeof(buffer), "cp%d", cp); | |
1054 return buffer; | |
1055 } | |
1056 return NULL; | |
1015 } | 1057 } |
1016 | 1058 |
1017 | 1059 |
1018 void write_normal_email(FILE* f_output, char f_name[], pst_item* item, int mode, int mode_MH, pst_file* pst, int save_rtf, char** extra_mime_headers) | 1060 void write_normal_email(FILE* f_output, char f_name[], pst_item* item, int mode, int mode_MH, pst_file* pst, int save_rtf, char** extra_mime_headers) |
1019 { | 1061 { |
1030 int has_from, has_subject, has_to, has_cc, has_date, has_msgid; | 1072 int has_from, has_subject, has_to, has_cc, has_date, has_msgid; |
1031 has_from = has_subject = has_to = has_cc = has_date = has_msgid = 0; | 1073 has_from = has_subject = has_to = has_cc = has_date = has_msgid = 0; |
1032 DEBUG_ENT("write_normal_email"); | 1074 DEBUG_ENT("write_normal_email"); |
1033 | 1075 |
1034 // setup default body character set and report type | 1076 // setup default body character set and report type |
1035 snprintf(body_charset, sizeof(body_charset), "%s", (item->email->body_charset) ? item->email->body_charset : "utf-8"); | 1077 snprintf(body_charset, sizeof(body_charset), "%s", |
1078 (item->email->body_charset) ? item->email->body_charset : | |
1079 (item->email->message_codepage) ? codepage(item->email->message_codepage) : | |
1080 (item->email->internet_cpid) ? codepage(item->email->internet_cpid) : | |
1081 "utf-8"); | |
1036 body_report[0] = '\0'; | 1082 body_report[0] = '\0'; |
1037 | 1083 |
1038 // setup default sender | 1084 // setup default sender |
1039 if (item->email->sender_address && strchr(item->email->sender_address, '@')) { | 1085 if (item->email->sender_address && strchr(item->email->sender_address, '@')) { |
1040 temp = item->email->sender_address; | 1086 temp = item->email->sender_address; |
1121 if (item && item->email && item->email->subject && item->email->subject->subj) { | 1167 if (item && item->email && item->email->subject && item->email->subject->subj) { |
1122 DEBUG_EMAIL(("item->email->subject->subj = %s\n", item->email->subject->subj)); | 1168 DEBUG_EMAIL(("item->email->subject->subj = %s\n", item->email->subject->subj)); |
1123 } | 1169 } |
1124 | 1170 |
1125 if (mode != MODE_SEPARATE) { | 1171 if (mode != MODE_SEPARATE) { |
1126 // most modes need this separator line | 1172 // most modes need this separator line. |
1127 fprintf(f_output, "From %s %s\n", sender, c_time); | 1173 // procmail produces this separator without the quotes around the |
1174 // sender email address, but apparently some Mac email client needs | |
1175 // those quotes, and they don't seem to cause problems for anyone else. | |
1176 fprintf(f_output, "From \"%s\" %s\n", sender, c_time); | |
1128 } | 1177 } |
1129 | 1178 |
1130 // print the supplied email headers | 1179 // print the supplied email headers |
1131 if (headers) { | 1180 if (headers) { |
1132 int len; | 1181 int len; |
1196 } | 1245 } |
1197 fprintf(f_output, "\n"); // end of headers, start of body | 1246 fprintf(f_output, "\n"); // end of headers, start of body |
1198 | 1247 |
1199 // now dump the body parts | 1248 // now dump the body parts |
1200 if (item->email->body) { | 1249 if (item->email->body) { |
1201 write_body_part(f_output, item->email->body, "text/plain", body_charset, boundary); | 1250 write_body_part(f_output, item->email->body, item->email->body_was_unicode, "text/plain", body_charset, boundary, pst); |
1251 } | |
1252 | |
1253 if ((item->email->report_text) && (body_report[0] != '\0')) { | |
1254 write_body_part(f_output, item->email->report_text, item->email->report_was_unicode, "text/plain", body_charset, boundary, pst); | |
1255 fprintf(f_output, "\n"); | |
1202 } | 1256 } |
1203 | 1257 |
1204 if (item->email->htmlbody) { | 1258 if (item->email->htmlbody) { |
1205 find_html_charset(item->email->htmlbody, body_charset, sizeof(body_charset)); | 1259 find_html_charset(item->email->htmlbody, body_charset, sizeof(body_charset)); |
1206 write_body_part(f_output, item->email->htmlbody, "text/html", body_charset, boundary); | 1260 write_body_part(f_output, item->email->htmlbody, item->email->htmlbody_was_unicode, "text/html", body_charset, boundary, pst); |
1207 } | 1261 } |
1208 | 1262 |
1209 if (item->email->rtf_compressed && save_rtf) { | 1263 if (item->email->rtf_compressed && save_rtf) { |
1210 pst_item_attach* attach = (pst_item_attach*)xmalloc(sizeof(pst_item_attach)); | 1264 pst_item_attach* attach = (pst_item_attach*)xmalloc(sizeof(pst_item_attach)); |
1211 DEBUG_EMAIL(("Adding RTF body as attachment\n")); | 1265 DEBUG_EMAIL(("Adding RTF body as attachment\n")); |