comparison src/readpst.c @ 142:2189a6b8134e

Improve character set handling: don't try to convert UTF-8 to a single-byte charset for fields that were not originally Unicode. If the conversion fails, leave the data in UTF-8.
author Carl Byington <carl@five-ten-sg.com>
date Mon, 23 Feb 2009 20:40:51 -0800
parents fd4297884319
children fdc58ad2c758
comparison
equal deleted inserted replaced
141:fd4297884319 142:2189a6b8134e
49 char* header_end_field(char *field); 49 char* header_end_field(char *field);
50 void header_strip_field(char *header, char *field); 50 void header_strip_field(char *header, char *field);
51 int test_base64(char *body); 51 int test_base64(char *body);
52 void find_html_charset(char *html, char *charset, size_t charsetlen); 52 void find_html_charset(char *html, char *charset, size_t charsetlen);
53 void find_rfc822_headers(char** extra_mime_headers); 53 void find_rfc822_headers(char** extra_mime_headers);
54 void write_body_part(FILE* f_output, char *body, char *mime, char *charset, char *boundary); 54 void write_body_part(FILE* f_output, char *body, int32_t body_was_unicode, char *mime, char *charset, char *boundary, pst_file* pst);
55 void write_normal_email(FILE* f_output, char f_name[], pst_item* item, int mode, int mode_MH, pst_file* pst, int save_rtf, char** extra_mime_headers); 55 void write_normal_email(FILE* f_output, char f_name[], pst_item* item, int mode, int mode_MH, pst_file* pst, int save_rtf, char** extra_mime_headers);
56 void write_vcard(FILE* f_output, pst_item_contact* contact, char comment[]); 56 void write_vcard(FILE* f_output, pst_item_contact* contact, char comment[]);
57 void write_appointment(FILE* f_output, pst_item_appointment* appointment, 57 void write_appointment(FILE* f_output, pst_item_appointment* appointment,
58 pst_item_email* email, FILETIME* create_date, FILETIME* modify_date); 58 pst_item_email* email, FILETIME* create_date, FILETIME* modify_date);
59 void create_enter_dir(struct file_ll* f, pst_item *item); 59 void create_enter_dir(struct file_ll* f, pst_item *item);
134 if (!d_ptr->desc) { 134 if (!d_ptr->desc) {
135 DEBUG_WARN(("main: ERROR ?? item's desc record is NULL\n")); 135 DEBUG_WARN(("main: ERROR ?? item's desc record is NULL\n"));
136 ff.skip_count++; 136 ff.skip_count++;
137 } 137 }
138 else { 138 else {
139 DEBUG_MAIN(("main: Desc Email ID %#x [d_ptr->id = %#x]\n", d_ptr->desc->id, d_ptr->id)); 139 DEBUG_MAIN(("main: Desc Email ID %#"PRIx64" [d_ptr->id = %#"PRIx64"]\n", d_ptr->desc->id, d_ptr->id));
140 140
141 item = pst_parse_item(&pstfile, d_ptr); 141 item = pst_parse_item(&pstfile, d_ptr);
142 DEBUG_MAIN(("main: About to process item\n")); 142 DEBUG_MAIN(("main: About to process item\n"));
143 if (item && item->email && item->email->subject && item->email->subject->subj) { 143 if (item && item->email && item->email->subject && item->email->subject->subj) {
144 DEBUG_EMAIL(("item->email->subject = %p\n", item->email->subject)); 144 DEBUG_EMAIL(("item->email->subject->subj = %s\n", item->email->subject->subj));
145 DEBUG_EMAIL(("item->email->subject->subj = %p\n", item->email->subject->subj));
146 } 145 }
147 if (item) { 146 if (item) {
148 if (item->folder && d_ptr->child && (deleted_mode == DMODE_INCLUDE || strcasecmp(item->file_as, "Deleted Items"))) { 147 if (item->folder && d_ptr->child && (deleted_mode == DMODE_INCLUDE || strcasecmp(item->file_as, "Deleted Items"))) {
149 //if this is a non-empty folder other than deleted items, we want to recurse into it 148 //if this is a non-empty folder other than deleted items, we want to recurse into it
150 if (output_mode != OUTPUT_QUIET) printf("Processing Folder \"%s\"\n", item->file_as); 149 if (output_mode != OUTPUT_QUIET) printf("Processing Folder \"%s\"\n", item->file_as);
764 void write_inline_attachment(FILE* f_output, pst_item_attach* attach, char *boundary, pst_file* pst) 763 void write_inline_attachment(FILE* f_output, pst_item_attach* attach, char *boundary, pst_file* pst)
765 { 764 {
766 char *attach_filename; 765 char *attach_filename;
767 char *enc = NULL; // base64 encoded attachment 766 char *enc = NULL; // base64 encoded attachment
768 DEBUG_ENT("write_inline_attachment"); 767 DEBUG_ENT("write_inline_attachment");
769 DEBUG_EMAIL(("Attachment Size is %i\n", attach->size)); 768 DEBUG_EMAIL(("Attachment Size is %i, pointer %p, id %d\n", attach->size, attach->data, attach->id_val));
770 DEBUG_EMAIL(("Attachment Pointer is %p\n", attach->data));
771 if (attach->data) { 769 if (attach->data) {
772 enc = base64_encode (attach->data, attach->size); 770 enc = base64_encode (attach->data, attach->size);
773 if (!enc) { 771 if (!enc) {
774 DEBUG_EMAIL(("ERROR base64_encode returned NULL. Must have failed\n")); 772 DEBUG_EMAIL(("ERROR base64_encode returned NULL. Must have failed\n"));
773 DEBUG_RET();
774 return;
775 }
776 }
777 else {
778 // make sure we can fetch data from the id
779 pst_index_ll *ptr = pst_getID(pst, attach->id_val);
780 if (!ptr) {
781 DEBUG_WARN(("Couldn't find ID pointer. Cannot save attachment to file\n"));
775 DEBUG_RET(); 782 DEBUG_RET();
776 return; 783 return;
777 } 784 }
778 } 785 }
779 786
962 DEBUG_EMAIL(("found 822 headers\n%s\n", headers)); 969 DEBUG_EMAIL(("found 822 headers\n%s\n", headers));
963 break; 970 break;
964 } 971 }
965 } 972 }
966 } 973 }
967 DEBUG_EMAIL(("skipping to next block after\n%s\n", headers)); 974 //DEBUG_EMAIL(("skipping to next block after\n%s\n", headers));
968 headers = temp+2; // skip to next chunk of headers 975 headers = temp+2; // skip to next chunk of headers
969 } 976 }
970 *extra_mime_headers = headers; 977 *extra_mime_headers = headers;
971 } 978 }
972 DEBUG_RET(); 979 DEBUG_RET();
973 } 980 }
974 981
975 982
976 void write_body_part(FILE* f_output, char *body, char *mime, char *charset, char *boundary) 983 void write_body_part(FILE* f_output, char *body, int32_t body_was_unicode, char *mime, char *charset, char *boundary, pst_file* pst)
977 { 984 {
978 char *needfree = NULL; 985 char *needfree = NULL;
979 DEBUG_ENT("write_body_part"); 986 DEBUG_ENT("write_body_part");
980 if (strcasecmp("utf-8", charset)) { 987 if (body_was_unicode && (strcasecmp("utf-8", charset))) {
981 // try to convert to the specified charset since it is not utf-8 988 // try to convert to the specified charset since the target
989 // is not utf-8, and the data came from a unicode (utf16) field
990 // and is now in utf-8.
982 size_t rc; 991 size_t rc;
983 DEBUG_EMAIL(("Convert %s utf-8 to %s\n", mime, charset)); 992 DEBUG_EMAIL(("Convert %s utf-8 to %s\n", mime, charset));
984 vbuf *newer = vballoc(2); 993 vbuf *newer = vballoc(2);
985 rc = vb_utf8to8bit(newer, body, strlen(body) + 1, charset); 994 rc = vb_utf8to8bit(newer, body, strlen(body) + 1, charset);
986 if (rc == (size_t)-1) { 995 if (rc == (size_t)-1) {
987 // unable to convert, maybe it is already in that character set 996 // unable to convert, change the charset to utf8
988 free(newer->b); 997 free(newer->b);
989 DEBUG_EMAIL(("Failed to convert %s utf-8 to %s\n", mime, charset)); 998 DEBUG_EMAIL(("Failed to convert %s utf-8 to %s\n", mime, charset));
999 charset = "utf-8";
990 } 1000 }
991 else { 1001 else {
992 needfree = body = newer->b; 1002 needfree = body = newer->b;
993 } 1003 }
994 free(newer); 1004 free(newer);
1010 else { 1020 else {
1011 write_email_body(f_output, body); 1021 write_email_body(f_output, body);
1012 } 1022 }
1013 if (needfree) free(needfree); 1023 if (needfree) free(needfree);
1014 DEBUG_RET(); 1024 DEBUG_RET();
1025 }
1026
1027
/**
 * Map a numeric Windows code page identifier to a charset name string.
 *
 * @param cp  numeric code page identifier
 * @return    for the identifiers listed below, a pointer to a constant
 *            string; for any other value, a pointer to a function-local
 *            static buffer holding "cp<number>". That buffer is overwritten
 *            by the next call that takes the fallback path, so the result
 *            must be copied before calling again, and the function is not
 *            reentrant. Never returns NULL.
 */
const char* codepage(int cp) {
    static char buffer[20];
    switch (cp) {
        // NOTE(review): Microsoft lists code page 932 as Shift_JIS; the
        // iso-2022-jp mapping is kept unchanged here - confirm intent.
        case   932 : return "iso-2022-jp";
        case   936 : return "gb2312";       // was misspelled "gb2313"
        case   950 : return "big5";
        case 20127 : return "us-ascii";
        case 20269 : return "iso-6937";
        case 20865 : return "iso-8859-15";  // kept for backward compatibility
        case 20866 : return "koi8-r";
        case 21866 : return "koi8-u";
        case 28591 : return "iso-8859-1";
        case 28592 : return "iso-8859-2";
        case 28595 : return "iso-8859-5";
        case 28596 : return "iso-8859-6";
        case 28597 : return "iso-8859-7";
        case 28598 : return "iso-8859-8";
        case 28599 : return "iso-8859-9";
        case 28605 : return "iso-8859-15";  // the Windows identifier for Latin 9
        case 50220 : return "iso-2022-jp";
        case 50221 : return "csiso2022jp";
        case 51932 : return "euc-jp";
        case 51949 : return "euc-kr";
        case 65000 : return "utf-7";
        case 65001 : return "utf-8";
        default    : break;
    }
    // unknown code page: fall back to the generic "cp<number>" name
    snprintf(buffer, sizeof(buffer), "cp%d", cp);
    return buffer;
}
1016 1058
1017 1059
1018 void write_normal_email(FILE* f_output, char f_name[], pst_item* item, int mode, int mode_MH, pst_file* pst, int save_rtf, char** extra_mime_headers) 1060 void write_normal_email(FILE* f_output, char f_name[], pst_item* item, int mode, int mode_MH, pst_file* pst, int save_rtf, char** extra_mime_headers)
1019 { 1061 {
1030 int has_from, has_subject, has_to, has_cc, has_date, has_msgid; 1072 int has_from, has_subject, has_to, has_cc, has_date, has_msgid;
1031 has_from = has_subject = has_to = has_cc = has_date = has_msgid = 0; 1073 has_from = has_subject = has_to = has_cc = has_date = has_msgid = 0;
1032 DEBUG_ENT("write_normal_email"); 1074 DEBUG_ENT("write_normal_email");
1033 1075
1034 // setup default body character set and report type 1076 // setup default body character set and report type
1035 snprintf(body_charset, sizeof(body_charset), "%s", (item->email->body_charset) ? item->email->body_charset : "utf-8"); 1077 snprintf(body_charset, sizeof(body_charset), "%s",
1078 (item->email->body_charset) ? item->email->body_charset :
1079 (item->email->message_codepage) ? codepage(item->email->message_codepage) :
1080 (item->email->internet_cpid) ? codepage(item->email->internet_cpid) :
1081 "utf-8");
1036 body_report[0] = '\0'; 1082 body_report[0] = '\0';
1037 1083
1038 // setup default sender 1084 // setup default sender
1039 if (item->email->sender_address && strchr(item->email->sender_address, '@')) { 1085 if (item->email->sender_address && strchr(item->email->sender_address, '@')) {
1040 temp = item->email->sender_address; 1086 temp = item->email->sender_address;
1121 if (item && item->email && item->email->subject && item->email->subject->subj) { 1167 if (item && item->email && item->email->subject && item->email->subject->subj) {
1122 DEBUG_EMAIL(("item->email->subject->subj = %s\n", item->email->subject->subj)); 1168 DEBUG_EMAIL(("item->email->subject->subj = %s\n", item->email->subject->subj));
1123 } 1169 }
1124 1170
1125 if (mode != MODE_SEPARATE) { 1171 if (mode != MODE_SEPARATE) {
1126 // most modes need this separator line 1172 // most modes need this separator line.
1127 fprintf(f_output, "From %s %s\n", sender, c_time); 1173 // procmail produces this separator without the quotes around the
1174 // sender email address, but apparently some Mac email client needs
1175 // those quotes, and they don't seem to cause problems for anyone else.
1176 fprintf(f_output, "From \"%s\" %s\n", sender, c_time);
1128 } 1177 }
1129 1178
1130 // print the supplied email headers 1179 // print the supplied email headers
1131 if (headers) { 1180 if (headers) {
1132 int len; 1181 int len;
1196 } 1245 }
1197 fprintf(f_output, "\n"); // end of headers, start of body 1246 fprintf(f_output, "\n"); // end of headers, start of body
1198 1247
1199 // now dump the body parts 1248 // now dump the body parts
1200 if (item->email->body) { 1249 if (item->email->body) {
1201 write_body_part(f_output, item->email->body, "text/plain", body_charset, boundary); 1250 write_body_part(f_output, item->email->body, item->email->body_was_unicode, "text/plain", body_charset, boundary, pst);
1251 }
1252
1253 if ((item->email->report_text) && (body_report[0] != '\0')) {
1254 write_body_part(f_output, item->email->report_text, item->email->report_was_unicode, "text/plain", body_charset, boundary, pst);
1255 fprintf(f_output, "\n");
1202 } 1256 }
1203 1257
1204 if (item->email->htmlbody) { 1258 if (item->email->htmlbody) {
1205 find_html_charset(item->email->htmlbody, body_charset, sizeof(body_charset)); 1259 find_html_charset(item->email->htmlbody, body_charset, sizeof(body_charset));
1206 write_body_part(f_output, item->email->htmlbody, "text/html", body_charset, boundary); 1260 write_body_part(f_output, item->email->htmlbody, item->email->htmlbody_was_unicode, "text/html", body_charset, boundary, pst);
1207 } 1261 }
1208 1262
1209 if (item->email->rtf_compressed && save_rtf) { 1263 if (item->email->rtf_compressed && save_rtf) {
1210 pst_item_attach* attach = (pst_item_attach*)xmalloc(sizeof(pst_item_attach)); 1264 pst_item_attach* attach = (pst_item_attach*)xmalloc(sizeof(pst_item_attach));
1211 DEBUG_EMAIL(("Adding RTF body as attachment\n")); 1265 DEBUG_EMAIL(("Adding RTF body as attachment\n"));