Mercurial > libpst
diff src/readpst.c @ 142:2189a6b8134e
improve character set handling - don't try to convert utf-8 to single byte for fields that were not originally unicode.
if the conversion fails, leave the data in utf-8.
author | Carl Byington <carl@five-ten-sg.com> |
---|---|
date | Mon, 23 Feb 2009 20:40:51 -0800 |
parents | fd4297884319 |
children | fdc58ad2c758 |
line wrap: on
line diff
--- a/src/readpst.c Sat Feb 14 11:02:37 2009 -0800 +++ b/src/readpst.c Mon Feb 23 20:40:51 2009 -0800 @@ -51,7 +51,7 @@ int test_base64(char *body); void find_html_charset(char *html, char *charset, size_t charsetlen); void find_rfc822_headers(char** extra_mime_headers); -void write_body_part(FILE* f_output, char *body, char *mime, char *charset, char *boundary); +void write_body_part(FILE* f_output, char *body, int32_t body_was_unicode, char *mime, char *charset, char *boundary, pst_file* pst); void write_normal_email(FILE* f_output, char f_name[], pst_item* item, int mode, int mode_MH, pst_file* pst, int save_rtf, char** extra_mime_headers); void write_vcard(FILE* f_output, pst_item_contact* contact, char comment[]); void write_appointment(FILE* f_output, pst_item_appointment* appointment, @@ -136,13 +136,12 @@ ff.skip_count++; } else { - DEBUG_MAIN(("main: Desc Email ID %#x [d_ptr->id = %#x]\n", d_ptr->desc->id, d_ptr->id)); + DEBUG_MAIN(("main: Desc Email ID %#"PRIx64" [d_ptr->id = %#"PRIx64"]\n", d_ptr->desc->id, d_ptr->id)); item = pst_parse_item(&pstfile, d_ptr); DEBUG_MAIN(("main: About to process item\n")); if (item && item->email && item->email->subject && item->email->subject->subj) { - DEBUG_EMAIL(("item->email->subject = %p\n", item->email->subject)); - DEBUG_EMAIL(("item->email->subject->subj = %p\n", item->email->subject->subj)); + DEBUG_EMAIL(("item->email->subject->subj = %s\n", item->email->subject->subj)); } if (item) { if (item->folder && d_ptr->child && (deleted_mode == DMODE_INCLUDE || strcasecmp(item->file_as, "Deleted Items"))) { @@ -766,8 +765,7 @@ char *attach_filename; char *enc = NULL; // base64 encoded attachment DEBUG_ENT("write_inline_attachment"); - DEBUG_EMAIL(("Attachment Size is %i\n", attach->size)); - DEBUG_EMAIL(("Attachment Pointer is %p\n", attach->data)); + DEBUG_EMAIL(("Attachment Size is %i, pointer %p, id %d\n", attach->size, attach->data, attach->id_val)); if (attach->data) { enc = base64_encode (attach->data, attach->size); if (!enc) { @@ -776,6 +774,15 @@ return; } } + else { + // make sure we can fetch data from the id + pst_index_ll *ptr = pst_getID(pst, attach->id_val); + if (!ptr) { + DEBUG_WARN(("Couldn't find ID pointer. Cannot save attachment to file\n")); + DEBUG_RET(); + return; + } + } fprintf(f_output, "\n--%s\n", boundary); if (!attach->mimetype) { @@ -964,7 +971,7 @@ } } } - DEBUG_EMAIL(("skipping to next block after\n%s\n", headers)); + //DEBUG_EMAIL(("skipping to next block after\n%s\n", headers)); headers = temp+2; // skip to next chunk of headers } *extra_mime_headers = headers; @@ -973,20 +980,23 @@ } -void write_body_part(FILE* f_output, char *body, char *mime, char *charset, char *boundary) +void write_body_part(FILE* f_output, char *body, int32_t body_was_unicode, char *mime, char *charset, char *boundary, pst_file* pst) { char *needfree = NULL; DEBUG_ENT("write_body_part"); - if (strcasecmp("utf-8", charset)) { - // try to convert to the specified charset since it is not utf-8 + if (body_was_unicode && (strcasecmp("utf-8", charset))) { + // try to convert to the specified charset since the target + // is not utf-8, and the data came from a unicode (utf16) field + // and is now in utf-8. size_t rc; DEBUG_EMAIL(("Convert %s utf-8 to %s\n", mime, charset)); vbuf *newer = vballoc(2); rc = vb_utf8to8bit(newer, body, strlen(body) + 1, charset); if (rc == (size_t)-1) { - // unable to convert, maybe it is already in that character set + // unable to convert, change the charset to utf8 free(newer->b); DEBUG_EMAIL(("Failed to convert %s utf-8 to %s\n", mime, charset)); + charset = "utf-8"; } else { needfree = body = newer->b; @@ -1015,6 +1025,38 @@ } +const char* codepage(int cp) { + static char buffer[20]; + switch (cp) { + case 932 : return "iso-2022-jp"; + case 936 : return "gb2313"; + case 950 : return "big5"; + case 20127 : return "us-ascii"; + case 20269 : return "iso-6937"; + case 20865 : return "iso-8859-15"; + case 20866 : return "koi8-r"; + case 21866 : return "koi8-u"; + case 28591 : return "iso-8859-1"; + case 28592 : return "iso-8859-2"; + case 28595 : return "iso-8859-5"; + case 28596 : return "iso-8859-6"; + case 28597 : return "iso-8859-7"; + case 28598 : return "iso-8859-8"; + case 28599 : return "iso-8859-9"; + case 50220 : return "iso-2022-jp"; + case 50221 : return "csiso2022jp"; + case 51932 : return "euc-jp"; + case 51949 : return "euc-kr"; + case 65000 : return "utf-7"; + case 65001 : return "utf-8"; + default : + snprintf(buffer, sizeof(buffer), "cp%d", cp); + return buffer; + } + return NULL; +} + + void write_normal_email(FILE* f_output, char f_name[], pst_item* item, int mode, int mode_MH, pst_file* pst, int save_rtf, char** extra_mime_headers) { char boundary[60]; @@ -1032,7 +1074,11 @@ DEBUG_ENT("write_normal_email"); // setup default body character set and report type - snprintf(body_charset, sizeof(body_charset), "%s", (item->email->body_charset) ? item->email->body_charset : "utf-8"); + snprintf(body_charset, sizeof(body_charset), "%s", + (item->email->body_charset) ? item->email->body_charset : + (item->email->message_codepage) ? codepage(item->email->message_codepage) : + (item->email->internet_cpid) ? codepage(item->email->internet_cpid) : + "utf-8"); body_report[0] = '\0'; // setup default sender @@ -1123,8 +1169,11 @@ } if (mode != MODE_SEPARATE) { - // most modes need this separator line - fprintf(f_output, "From %s %s\n", sender, c_time); + // most modes need this separator line. + // procmail produces this separator without the quotes around the + // sender email address, but apparently some Mac email client needs + // those quotes, and they don't seem to cause problems for anyone else. + fprintf(f_output, "From \"%s\" %s\n", sender, c_time); } // print the supplied email headers @@ -1198,12 +1247,17 @@ // now dump the body parts if (item->email->body) { - write_body_part(f_output, item->email->body, "text/plain", body_charset, boundary); + write_body_part(f_output, item->email->body, item->email->body_was_unicode, "text/plain", body_charset, boundary, pst); + } + + if ((item->email->report_text) && (body_report[0] != '\0')) { + write_body_part(f_output, item->email->report_text, item->email->report_was_unicode, "text/plain", body_charset, boundary, pst); + fprintf(f_output, "\n"); } if (item->email->htmlbody) { find_html_charset(item->email->htmlbody, body_charset, sizeof(body_charset)); - write_body_part(f_output, item->email->htmlbody, "text/html", body_charset, boundary); + write_body_part(f_output, item->email->htmlbody, item->email->htmlbody_was_unicode, "text/html", body_charset, boundary, pst); } if (item->email->rtf_compressed && save_rtf) {