2 * Incoming Mail Checker: Charsets
4 * (c) 2007 Martin Mares <mj@ucw.cz>
6 * The code for parsing rfc2047 encoding of headers has been adapted
7 * from the Mutt 1.5.16 MUA. Here is the original copyright message:
9 * Copyright (C) 1996-2000 Michael R. Elkins <me@mutt.org>
10 * Copyright (C) 2000-2001 Edmund Grimley Evans <edmundo@rano.org>
12 * This program is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU General Public License as published by
14 * the Free Software Foundation; either version 2 of the License, or
15 * (at your option) any later version.
17 * This program is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, write to the Free Software
24 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
/* Name of the locale's character set, obtained from nl_langinfo(CODESET) by
 * the initialisation code in this file and reset to NULL when that name is
 * empty. rfc2047_decode_word() only attempts a charset conversion when this
 * pointer is non-NULL. */
42 static char *system_charset;
/* Bounded string copy: copy at most C bytes of B into A, then force a NUL
 * into the last byte A[C-1], so the result is always terminated (B is
 * truncated if it does not fit). C must be at least 1.
 *
 * Every argument and the expansion as a whole are parenthesized so the macro
 * behaves as one expression whatever is passed in (e.g. `cond ? a : b` as the
 * destination) and wherever it is used. A and C are still evaluated twice, so
 * do not pass expressions with side effects. */
#define strfcpy(A,B,C) (strncpy((A),(B),(C)), *((A)+(C)-1)=0)
/* Hex-digit lookup indexed by 7-bit character code: '0'-'9' map to 0-9,
 * 'A'-'F' and 'a'-'f' both map to 10-15, and every other entry is -1.
 * Each row below covers 16 consecutive character codes. */
52 static int Index_hex[128] = {
53 -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
54 -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
55 -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
/* codes 0x30-0x3f: the decimal digits */
56 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,-1,-1, -1,-1,-1,-1,
/* codes 0x40-0x4f: upper-case 'A'-'F' */
57 -1,10,11,12, 13,14,15,-1, -1,-1,-1,-1, -1,-1,-1,-1,
58 -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
/* codes 0x60-0x6f: lower-case 'a'-'f' */
59 -1,10,11,12, 13,14,15,-1, -1,-1,-1,-1, -1,-1,-1,-1,
60 -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1
/* Base64 alphabet lookup indexed by 7-bit character code: 'A'-'Z' map to
 * 0-25, 'a'-'z' to 26-51, '0'-'9' to 52-61, '+' to 62 and '/' to 63. Every
 * other entry, including the padding character '=', is -1.
 * Each row below covers 16 consecutive character codes. */
63 static int Index_64[128] = {
64 -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
65 -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
/* codes 0x20-0x2f: '+' and '/' */
66 -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,62, -1,-1,-1,63,
/* codes 0x30-0x3f: the decimal digits */
67 52,53,54,55, 56,57,58,59, 60,61,-1,-1, -1,-1,-1,-1,
/* codes 0x40-0x5f: upper-case letters */
68 -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10, 11,12,13,14,
69 15,16,17,18, 19,20,21,22, 23,24,25,-1, -1,-1,-1,-1,
/* codes 0x60-0x7f: lower-case letters */
70 -1,26,27,28, 29,30,31,32, 33,34,35,36, 37,38,39,40,
71 41,42,43,44, 45,46,47,48, 49,50,51,-1, -1,-1,-1,-1
/* Look up the value of one hex digit / one base64 character; both yield -1
 * for a character that is not part of the respective alphabet.
 * The tables hold only 128 entries and the index is not range-checked here,
 * so the argument must already be known to be a 7-bit value. The decoder in
 * this file tests `c & ~127` before using either macro. */
74 #define hexval(c) Index_hex[(unsigned int)(c)]
75 #define base64val(c) Index_64[(unsigned int)(c)]
/* Constant switch tested by rfc2047_decode() before its linear-white-space
 * handling; because it is defined as 1, those branches are always taken. */
77 #define option_OPTIGNORELWS 1
/* Convert the flen bytes at f from charset `from` to charset `to` using
 * iconv. The caller in this file treats a return value of (size_t)-1 as
 * failure.
 * NOTE(review): the statements that size the output buffer and store into
 * *t / *tlen are not visible in this excerpt; presumably they receive the
 * converted buffer and its length -- confirm against the full function. */
79 static size_t convert_string (char *f, size_t flen,
80 const char *from, const char *to,
81 char **t, size_t *tlen)
/* iconv_open() takes the target charset first and the source charset second. */
88 cd = iconv_open (to, from);
89 if (cd == (iconv_t)(-1))
/* buf is the start of the output buffer (obl bytes); ob is the write cursor
 * that iconv() advances while decreasing obl. */
92 ob = buf = xmalloc (obl);
93 n = iconv (cd, &f, &flen, &ob, &obl);
/* Fail if the conversion itself failed, or if the follow-up call with no
 * input (which emits any pending reset sequence) failed. */
94 if (n == (size_t)(-1) || iconv (cd, 0, 0, &ob, &obl) == (size_t)(-1))
/* Resize to the number of bytes written plus one extra byte. */
106 buf = xrealloc (buf, ob - buf + 1);
/* Decode a single RFC 2047 encoded word starting at s and copy the result
 * into d, using at most len bytes including the terminating NUL.
 * The caller in this file treats a negative return value as failure.
 * NOTE(review): only part of the body is visible in this excerpt; comments
 * below describe the visible statements only. */
113 static int rfc2047_decode_word (char *d, const char *s, size_t len)
115 const char *pp, *pp1;
118 int enc = 0, count = 0;
119 char *charset = NULL;
/* Scratch buffer for the decoded bytes, sized from the length of the encoded
 * input. NOTE(review): no extra byte is reserved for a terminator here;
 * presumably safe because the decoded text is shorter than its encoded
 * form -- confirm. */
121 pd = d0 = xmalloc (strlen (s));
/* Walk the '?'-separated fields of the word; pp is the start of the current
 * field and pp1 the '?' that ends it. */
123 for (pp = s; (pp1 = strchr (pp, '?')); pp = pp1 + 1)
129 /* ignore language specification a la RFC 2231 */
/* Look for a '*' inside the charset field. */
131 if ((t1 = memchr (pp, '*', t - pp)))
/* Keep a NUL-terminated private copy of the charset name (bytes pp..t). */
133 charset = xmalloc (t - pp + 1);
134 memcpy (charset, pp, t - pp);
135 charset[t-pp] = '\0';
/* The encoding field is a single letter, matched case-insensitively. */
138 if (toupper ((unsigned char) *pp) == 'Q')
139 enc = ENCQUOTEDPRINTABLE;
140 else if (toupper ((unsigned char) *pp) == 'B')
/* Payload, "Q" encoding. */
150 if (enc == ENCQUOTEDPRINTABLE)
152 for (; pp < pp1; pp++)
/* '=' followed by two hex digits is one encoded byte. The `& ~127` tests
 * reject non-7-bit characters before they are used as table indices. */
156 else if (*pp == '=' &&
157 (!(pp[1] & ~127) && hexval(pp[1]) != -1) &&
158 (!(pp[2] & ~127) && hexval(pp[2]) != -1))
/* High nibble from the first digit, low nibble from the second. */
160 *pd++ = (hexval(pp[1]) << 4) | hexval(pp[2]);
/* Payload, "B" (base64) encoding. */
168 else if (enc == ENCBASE64)
172 for (; pp < pp1; pp++)
/* Non-7-bit characters and characters outside the base64 alphabet are
 * singled out here; c receives the table value of valid ones. */
176 if ((*pp & ~127) || (c = base64val(*pp)) == -1)
/* Emit one byte built from the pending bits in b and the top bits of c. */
181 *pd++ = b | (c >> k);
/* Convert to the system charset only when both charsets are known. */
196 if (charset && system_charset)
200 if (convert_string (d0, strlen(d0), charset, system_charset, &dnew, &dlen) == (size_t) -1)
202 debug("Charset conversion failure: <%s> from %s (%m)\n", d0, charset);
/* Hand the result to the caller, truncated to len bytes and NUL-terminated. */
210 strfcpy (d, d0, len);
217 * Find the start and end of the first encoded word in the string.
218 * We use the grammar in section 2 of RFC 2047, but the "encoding"
219 * must be B or Q. Also, we don't require the encoded word to be
220 * separated by linear-white-space (section 5(1)).
/* Scan s for the first substring shaped like "=?charset?B-or-Q?payload?=".
 * The caller in this file treats a NULL return as "no encoded word" and
 * otherwise uses the returned pointer as the start of the word.
 * NOTE(review): the statements that set *x and return are not visible in
 * this excerpt; presumably *x receives the end of the word -- confirm. */
222 static const char *find_encoded_word (const char *s, const char **x)
/* Try every occurrence of the "=?" opener in turn. */
227 while ((p = strstr (q, "=?")))
/* Charset token: printable ASCII other than space and the listed specials. */
230 0x20 < *q && *q < 0x7f && !strchr ("()<>@,;:\"/[]?.=", *q);
/* The token must be followed by "?B?" or "?Q?" (either case).
 * NOTE(review): strchr() also matches the terminating NUL, so q[1] == '\0'
 * passes the middle test and q[2] is then read -- looks like a one-byte
 * over-read on input ending in "=?charset?"; confirm. */
233 if (q[0] != '?' || !strchr ("BbQq", q[1]) || q[2] != '?')
235 /* non-strict check since many MUAs will not encode spaces and question marks */
/* Payload: skip printable ASCII (space included) up to the "?=" closer. */
236 for (q = q + 3; 0x20 <= *q && *q < 0x7f && (*q != '?' || q[1] != '='); q++)
/* Not a valid word unless the scan stopped exactly on "?=". */
238 if (q[0] != '?' || q[1] != '=')
251 /* return length of linear-white-space */
/* Measures the run of space, tab, CR and LF characters at the start of the
 * n-byte region beginning at s.
 * NOTE(review): the initialisation of p, the action taken when the run ends
 * in CR or LF, and the return statement are not visible in this excerpt. */
252 static size_t lwslen (const char *s, size_t n)
/* Advance p over white-space characters, never past s + n. */
260 for (; p < s + n; p++)
261 if (!strchr (" \t\r\n", *p))
/* Number of bytes skipped before the first non-white-space character. */
263 len = (size_t)(p - s);
266 if (strchr ("\r\n", *(p-1))) /* LWS doesn't end with CRLF */
271 /* return length of linear-white-space : reverse */
/* Counterpart of lwslen() that works backwards from the last byte of the
 * n-byte region beginning at s, measuring trailing white space.
 * NOTE(review): the loop header, the action taken when the region ends in
 * CR or LF, and the return statement are not visible in this excerpt. */
272 static size_t lwsrlen (const char *s, size_t n)
/* Start at the final byte of the region. */
274 const char *p = s + n - 1;
280 if (strchr ("\r\n", *p)) /* LWS doesn't end with CRLF */
284 if (!strchr (" \t\r\n", *p))
/* Number of bytes between p and the final byte of the region. */
286 len = (size_t)(s + n - 1 - p);
292 /* try to decode anything that looks like a valid RFC2047 encoded
293 * header field, ignoring RFC822 parsing rules
/* Decode every RFC 2047 encoded word found in the string *pd, copying the
 * text between words through (with white-space adjustments).
 * NOTE(review): the end of the function is not visible in this excerpt;
 * presumably *pd is replaced by the decoded buffer -- confirm. */
295 static void rfc2047_decode (char **pd)
/* Set once at least one encoded word has been decoded. */
299 int found_encoded = 0;
/* Output budget: four bytes per input byte, plus one for the terminator. */
307 dlen = 4 * strlen (s); /* should be enough */
308 d = d0 = xmalloc (dlen + 1);
/* Consume the input until it ends or the output budget is exhausted. */
310 while (*s && dlen > 0)
/* On success p is the start of the next encoded word and q is filled in by
 * find_encoded_word(). */
312 if (!(p = find_encoded_word (s, &q)))
314 /* no encoded words */
315 if (option_OPTIGNORELWS)
/* Leading white space after a previous encoded word is replaced by a single
 * space in the output. */
318 if (found_encoded && (m = lwslen (s, n)) != 0)
321 *d = ' ', d++, dlen--;
/* Copy the remaining plain text, bounded by the remaining budget. */
325 strncpy (d, s, dlen);
/* Length of the plain text that precedes the encoded word. */
332 n = (size_t) (p - s);
333 /* ignore spaces between encoded word
334 * and linear-white-space between encoded word and *text */
335 if (option_OPTIGNORELWS)
337 if (found_encoded && (m = lwslen (s, n)) != 0)
340 *d = ' ', d++, dlen--;
/* m is the length of the text before the word, minus its trailing white space. */
344 if ((m = n - lwsrlen (s, n)) != 0)
352 *d = ' ', d++, dlen--;
/* True unless the text between two encoded words is white space only. */
355 else if (!found_encoded || strspn (s, " \t\r\n") != n)
/* Decode the word at p directly into the output cursor. */
365 if (rfc2047_decode_word (d, p, dlen) < 0)
379 /* Initialize the whole machinery */
/* NOTE(review): the signature of this initialisation routine is not visible
 * in this excerpt; only its visible statements are described. */
/* Adopt the character-type locale configured in the environment. */
383 setlocale(LC_CTYPE, "");
/* Ask the locale for the name of its character set; an empty name is
 * treated as "unknown" and recorded as NULL. */
384 system_charset = nl_langinfo(CODESET);
385 if (!system_charset[0])
386 system_charset = NULL;
389 /* FIXME: Use iconvctl() if available? */
/* Build "<charset>//TRANSLIT": 11 extra bytes hold the 10-character suffix
 * and the terminating NUL. */
390 char *t = xmalloc(strlen(system_charset) + 11);
391 sprintf(t, "%s//TRANSLIT", system_charset);
394 debug("Charset is %s\n", system_charset);
/* Processes the multibyte string `add` one character at a time, using
 * *ppos and term as the bounds of the destination.
 * NOTE(review): most of the body is not visible in this excerpt; how each
 * decoded character is written and how *ppos is updated should be confirmed
 * against the full function. */
398 add_snippet(char **ppos, char *term, char *add)
/* Reset mbtowc()'s internal conversion state before decoding. */
402 mbtowc(NULL, NULL, 0);
/* Continue only while a maximum-length multibyte character fits before term. */
404 while (pos + MB_CUR_MAX < term)
/* Decode the next character of `add`; l is mbtowc()'s result. */
407 int l = mbtowc(&c, add, MB_CUR_MAX);
/* Byte-oriented variant of add_snippet(): its loop runs while the write
 * position is below term and `add` has not reached its terminating NUL.
 * NOTE(review): the loop body and the update of *ppos are not visible in
 * this excerpt. */
434 add_snippet_raw(char **ppos, char *term, char *add)
437 while (pos < term && *add)
/* Append a header value that may contain RFC 2047 encoded words: the text is
 * duplicated, decoded and then passed to add_snippet().
 * NOTE(review): whether the duplicate is released afterwards is not visible
 * in this excerpt. */
444 add_subject_snippet(char **ppos, char *term, char *add)
/* Work on a private copy so the caller's string is left untouched. */
446 char *buf = xstrdup(add);
447 rfc2047_decode(&buf);
448 add_snippet(ppos, term, buf);
/* Append a rendering of the address header `add`, built from the parsed
 * address's personal name and/or mailbox as selected by add_personal and
 * add_mbox.
 * NOTE(review): several branch bodies are not visible in this excerpt; the
 * comments below describe the visible statements only. */
453 add_addr_snippet(char **ppos, char *term, char *add, int add_mbox, int add_personal)
455 ADDRESS *addr = rfc822_parse_adrlist(NULL, add);
/* Parse-failure path: log the error and fall back to treating the raw text
 * like a subject line. */
458 debug("%s: Cannot parse address (%s)\n", add, rfc822_error(RFC822Error));
459 add_subject_snippet(ppos, term, add);
/* The personal name may carry RFC 2047 encoded words; decode it in place. */
462 rfc2047_decode(&addr->personal);
463 // debug("%s: pers=%s mbox=%s\n", add, addr->personal, addr->mailbox);
/* Detect a missing or empty mailbox / personal name. */
464 if (!addr->mailbox || !addr->mailbox[0])
466 if (!addr->personal || !addr->personal[0])
468 if (addr->mailbox && addr->mailbox[0])
/* Locate the '@' separating the local part from the domain. */
470 char *c = strchr(addr->mailbox, '@');
477 if (add_mbox || add_personal)
480 add_snippet(ppos, term, addr->personal);
/* When both parts are shown, the mailbox is wrapped in " <" and ">". */
481 if (add_mbox && add_personal)
482 add_snippet_raw(ppos, term, " <");
484 add_snippet(ppos, term, addr->mailbox);
485 if (add_mbox && add_personal)
486 add_snippet_raw(ppos, term, ">");
/* Placeholder text emitted when neither part is shown. */
489 add_snippet_raw(ppos, term, "???");
490 rfc822_free_address(&addr);