charset.c

   1 /*
   2  *      Incoming Mail Checker: Charsets
   3  *
   4  *      (c) 2007 Martin Mares <mj@ucw.cz>
   5  *
   6  * The code for parsing rfc2047 encoding of headers has been adapted
   7  * from the Mutt 1.5.16 MUA. Here is the original copyright message:
   8  *
   9  * Copyright (C) 1996-2000 Michael R. Elkins <me@mutt.org>
  10  * Copyright (C) 2000-2001 Edmund Grimley Evans <edmundo@rano.org>
  11  *
  12  *     This program is free software; you can redistribute it and/or modify
  13  *     it under the terms of the GNU General Public License as published by
  14  *     the Free Software Foundation; either version 2 of the License, or
  15  *     (at your option) any later version.
  16  *
  17  *     This program is distributed in the hope that it will be useful,
  18  *     but WITHOUT ANY WARRANTY; without even the implied warranty of
  19  *     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20  *     GNU General Public License for more details.
  21  *
  22  *     You should have received a copy of the GNU General Public License
  23  *     along with this program; if not, write to the Free Software
  24  *     Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
  25  */
  26
  27 #include "util.h"
  28 #include "rfc822.h"
  29 #include "charset.h"
  30
  31 #include <ctype.h>
  32 #include <errno.h>
  33 #include <stdio.h>
  34 #include <stdlib.h>
  35 #include <string.h>
  36 #include <wchar.h>
  37 #include <wctype.h>
  38 #include <locale.h>
  39 #include <langinfo.h>
  40 #include <iconv.h>
  41
  42 static char *system_charset;
  43
  44 #define strfcpy(A,B,C) strncpy(A,B,C), *(A+(C)-1)=0
  45
  46 enum encoding {
  47   ENCOTHER,
  48   ENCQUOTEDPRINTABLE,
  49   ENCBASE64,
  50 };
  51
  52 static int Index_hex[128] = {
  53     -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
  54     -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
  55     -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
  56      0, 1, 2, 3,  4, 5, 6, 7,  8, 9,-1,-1, -1,-1,-1,-1,
  57     -1,10,11,12, 13,14,15,-1, -1,-1,-1,-1, -1,-1,-1,-1,
  58     -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
  59     -1,10,11,12, 13,14,15,-1, -1,-1,-1,-1, -1,-1,-1,-1,
  60     -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1
  61 };
  62
  63 static int Index_64[128] = {
  64     -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
  65     -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
  66     -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,62, -1,-1,-1,63,
  67     52,53,54,55, 56,57,58,59, 60,61,-1,-1, -1,-1,-1,-1,
  68     -1, 0, 1, 2,  3, 4, 5, 6,  7, 8, 9,10, 11,12,13,14,
  69     15,16,17,18, 19,20,21,22, 23,24,25,-1, -1,-1,-1,-1,
  70     -1,26,27,28, 29,30,31,32, 33,34,35,36, 37,38,39,40,
  71     41,42,43,44, 45,46,47,48, 49,50,51,-1, -1,-1,-1,-1
  72 };
  73
  74 #define hexval(c) Index_hex[(unsigned int)(c)]
  75 #define base64val(c) Index_64[(unsigned int)(c)]
  76
  77 #define option_OPTIGNORELWS 1
  78
  79 static size_t convert_string (char *f, size_t flen,
  80                               const char *from, const char *to,
  81                               char **t, size_t *tlen)
  82 {
  83   iconv_t cd;
  84   char *buf, *ob;
  85   size_t obl, n;
  86   int e;
  87
  88   cd = iconv_open (to, from);
  89   if (cd == (iconv_t)(-1))
  90     return (size_t)(-1);
  91   obl = 4 * flen + 1;
  92   ob = buf = xmalloc (obl);
  93   n = iconv (cd, &f, &flen, &ob, &obl);
  94   if (n == (size_t)(-1) || iconv (cd, 0, 0, &ob, &obl) == (size_t)(-1))
  95   {
  96     e = errno;
  97     free(buf);
  98     iconv_close (cd);
  99     errno = e;
 100     return (size_t)(-1);
 101   }
 102   *ob = '\0';
 103
 104   *tlen = ob - buf;
 105
 106   buf = xrealloc (buf, ob - buf + 1);
 107   *t = buf;
 108   iconv_close (cd);
 109
 110   return n;
 111 }
 112
 113 static int rfc2047_decode_word (char *d, const char *s, size_t len)
 114 {
 115   const char *pp, *pp1;
 116   char *pd, *d0;
 117   const char *t, *t1;
 118   int enc = 0, count = 0;
 119   char *charset = NULL;
 120
 121   pd = d0 = xmalloc (strlen (s));
 122
 123   for (pp = s; (pp1 = strchr (pp, '?')); pp = pp1 + 1)
 124   {
 125     count++;
 126     switch (count)
 127     {
 128       case 2:
 129         /* ignore language specification a la RFC 2231 */
 130         t = pp1;
 131         if ((t1 = memchr (pp, '*', t - pp)))
 132           t = t1;
 133         charset = xmalloc (t - pp + 1);
 134         memcpy (charset, pp, t - pp);
 135         charset[t-pp] = '\0';
 136         break;
 137       case 3:
 138         if (toupper ((unsigned char) *pp) == 'Q')
 139           enc = ENCQUOTEDPRINTABLE;
 140         else if (toupper ((unsigned char) *pp) == 'B')
 141           enc = ENCBASE64;
 142         else
 143         {
 144           free(charset);
 145           free(d0);
 146           return (-1);
 147         }
 148         break;
 149       case 4:
 150         if (enc == ENCQUOTEDPRINTABLE)
 151         {
 152           for (; pp < pp1; pp++)
 153           {
 154             if (*pp == '_')
 155               *pd++ = ' ';
 156             else if (*pp == '=' &&
 157                      (!(pp[1] & ~127) && hexval(pp[1]) != -1) &&
 158                      (!(pp[2] & ~127) && hexval(pp[2]) != -1))
 159             {
 160               *pd++ = (hexval(pp[1]) << 4) | hexval(pp[2]);
 161               pp += 2;
 162             }
 163             else
 164               *pd++ = *pp;
 165           }
 166           *pd = 0;
 167         }
 168         else if (enc == ENCBASE64)
 169         {
 170           int c, b = 0, k = 0;
 171
 172           for (; pp < pp1; pp++)
 173           {
 174             if (*pp == '=')
 175               break;
 176             if ((*pp & ~127) || (c = base64val(*pp)) == -1)
 177               continue;
 178             if (k + 6 >= 8)
 179             {
 180               k -= 2;
 181               *pd++ = b | (c >> k);
 182               b = c << (8 - k);
 183             }
 184             else
 185             {
 186               b |= c << (k + 2);
 187               k += 6;
 188             }
 189           }
 190           *pd = 0;
 191         }
 192         break;
 193     }
 194   }
 195
 196   if (charset && system_charset)
 197     {
 198       char *dnew;
 199       size_t dlen;
 200       if (convert_string (d0, strlen(d0), charset, system_charset, &dnew, &dlen) == (size_t) -1)
 201         {
 202           debug("Charset conversion failure: <%s> from %s (%m)\n", d0, charset);
 203           free (charset);
 204           free (d0);
 205           return (-1);
 206         }
 207       free (d0);
 208       d0 = dnew;
 209     }
 210   strfcpy (d, d0, len);
 211   free (charset);
 212   free (d0);
 213   return (0);
 214 }
 215
 216 /*
 217  * Find the start and end of the first encoded word in the string.
 218  * We use the grammar in section 2 of RFC 2047, but the "encoding"
 219  * must be B or Q. Also, we don't require the encoded word to be
 220  * separated by linear-white-space (section 5(1)).
 221  */
 222 static const char *find_encoded_word (const char *s, const char **x)
 223 {
 224   const char *p, *q;
 225
 226   q = s;
 227   while ((p = strstr (q, "=?")))
 228   {
 229     for (q = p + 2;
 230          0x20 < *q && *q < 0x7f && !strchr ("()<>@,;:\"/[]?.=", *q);
 231          q++)
 232       ;
 233     if (q[0] != '?' || !strchr ("BbQq", q[1]) || q[2] != '?')
 234       continue;
 235     /* non-strict check since many MUAs will not encode spaces and question marks */
 236     for (q = q + 3; 0x20 <= *q && *q < 0x7f && (*q != '?' || q[1] != '='); q++)
 237       ;
 238     if (q[0] != '?' || q[1] != '=')
 239     {
 240       --q;
 241       continue;
 242     }
 243
 244     *x = q + 2;
 245     return p;
 246   }
 247
 248   return 0;
 249 }
 250
 251 /* return length of linear-white-space */
 252 static size_t lwslen (const char *s, size_t n)
 253 {
 254   const char *p = s;
 255   size_t len = n;
 256
 257   if (n <= 0)
 258     return 0;
 259
 260   for (; p < s + n; p++)
 261     if (!strchr (" \t\r\n", *p))
 262     {
 263       len = (size_t)(p - s);
 264       break;
 265     }
 266   if (strchr ("\r\n", *(p-1))) /* LWS doesn't end with CRLF */
 267     len = (size_t)0;
 268   return len;
 269 }
 270
 271 /* return length of linear-white-space : reverse */
 272 static size_t lwsrlen (const char *s, size_t n)
 273 {
 274   const char *p = s + n - 1;
 275   size_t len = n;
 276
 277   if (n <= 0)
 278     return 0;
 279
 280   if (strchr ("\r\n", *p)) /* LWS doesn't end with CRLF */
 281     return (size_t)0;
 282
 283   for (; p >= s; p--)
 284     if (!strchr (" \t\r\n", *p))
 285     {
 286       len = (size_t)(s + n - 1 - p);
 287       break;
 288     }
 289   return len;
 290 }
 291
 292 /* try to decode anything that looks like a valid RFC2047 encoded
 293  * header field, ignoring RFC822 parsing rules
 294  */
 295 static void rfc2047_decode (char **pd)
 296 {
 297   const char *p, *q;
 298   size_t m, n;
 299   int found_encoded = 0;
 300   char *d0, *d;
 301   const char *s = *pd;
 302   size_t dlen;
 303
 304   if (!s || !*s)
 305     return;
 306
 307   dlen = 4 * strlen (s); /* should be enough */
 308   d = d0 = xmalloc (dlen + 1);
 309
 310   while (*s && dlen > 0)
 311   {
 312     if (!(p = find_encoded_word (s, &q)))
 313     {
 314       /* no encoded words */
 315       if (option_OPTIGNORELWS)
 316       {
 317         n = strlen (s);
 318         if (found_encoded && (m = lwslen (s, n)) != 0)
 319         {
 320           if (m != n)
 321             *d = ' ', d++, dlen--;
 322           s += m;
 323         }
 324       }
 325       strncpy (d, s, dlen);
 326       d += dlen;
 327       break;
 328     }
 329
 330     if (p != s)
 331     {
 332       n = (size_t) (p - s);
 333       /* ignore spaces between encoded word
 334        * and linear-white-space between encoded word and *text */
 335       if (option_OPTIGNORELWS)
 336       {
 337         if (found_encoded && (m = lwslen (s, n)) != 0)
 338         {
 339           if (m != n)
 340             *d = ' ', d++, dlen--;
 341           n -= m, s += m;
 342         }
 343
 344         if ((m = n - lwsrlen (s, n)) != 0)
 345         {
 346           if (m > dlen)
 347             m = dlen;
 348           memcpy (d, s, m);
 349           d += m;
 350           dlen -= m;
 351           if (m != n)
 352             *d = ' ', d++, dlen--;
 353         }
 354       }
 355       else if (!found_encoded || strspn (s, " \t\r\n") != n)
 356       {
 357         if (n > dlen)
 358           n = dlen;
 359         memcpy (d, s, n);
 360         d += n;
 361         dlen -= n;
 362       }
 363     }
 364
 365     if (rfc2047_decode_word (d, p, dlen) < 0)
 366     {
 367       n = q - p;
 368       if (n > dlen)
 369         n = dlen;
 370       memcpy (d, p, n);
 371     }
 372     found_encoded = 1;
 373     s = q;
 374     n = strlen (d);
 375     dlen -= n;
 376     d += n;
 377   }
 378   *d = 0;
 379
 380   free (*pd);
 381   *pd = d0;
 382 }
 383
 384 /* Initialize the whole machinery */
 385 void
 386 charset_init(void)
 387 {
 388   setlocale(LC_CTYPE, "");
 389   system_charset = nl_langinfo(CODESET);
 390   if (!system_charset[0])
 391     system_charset = NULL;
 392   if (system_charset)
 393     {
 394       /* FIXME: Use iconvctl() if available? */
 395       char *t = xmalloc(strlen(system_charset) + 11);
 396       sprintf(t, "%s//TRANSLIT", system_charset);
 397       system_charset = t;
 398     }
 399   debug("Charset is %s\n", system_charset);
 400 }
 401
 402 void
 403 add_snippet(char **ppos, char *term, char *add)
 404 {
 405   char *pos = *ppos;
 406   int space = 1;
 407   mbtowc(NULL, NULL, 0);
 408
 409   while (pos + MB_CUR_MAX < term)
 410     {
 411       wchar_t c;
 412       int l = mbtowc(&c, add, MB_CUR_MAX);
 413       if (!l)
 414         break;
 415       if (l < 0)
 416         {
 417           l = 1;
 418           c = '?';
 419         }
 420       add += l;
 421       if (!iswprint(c))
 422         c = '?';
 423       if (iswspace(c))
 424         {
 425           if (space)
 426             continue;
 427           space = 1;
 428         }
 429       else
 430         space = 0;
 431       l = wctomb(pos, c);
 432       pos += l;
 433     }
 434   *ppos = pos;
 435   *pos = 0;
 436 }
 437
 438 void
 439 add_snippet_raw(char **ppos, char *term, char *add)
 440 {
 441   char *pos = *ppos;
 442   while (pos < term && *add)
 443     *pos++ = *add++;
 444   *ppos = pos;
 445   *pos = 0;
 446 }
 447
 448 void
 449 add_subject_snippet(char **ppos, char *term, char *add)
 450 {
 451   char *buf = xstrdup(add);
 452   rfc2047_decode(&buf);
 453   add_snippet(ppos, term, buf);
 454   free(buf);
 455 }
 456
 457 void
 458 add_addr_snippet(char **ppos, char *term, char *add, int add_mbox, int add_personal)
 459 {
 460   ADDRESS *addr = rfc822_parse_adrlist(NULL, add);
 461   if (!addr)
 462     {
 463       debug("%s: Cannot parse address (%s)\n", add, rfc822_error(RFC822Error));
 464       add_subject_snippet(ppos, term, add);
 465       return;
 466     }
 467   rfc2047_decode(&addr->personal);
 468   // debug("%s: pers=%s mbox=%s\n", add, addr->personal, addr->mailbox);
 469   if (!addr->mailbox || !addr->mailbox[0])
 470     add_mbox = 0;
 471   if (!addr->personal || !addr->personal[0])
 472     {
 473       if (addr->mailbox && addr->mailbox[0])
 474         {
 475           char *c = strchr(addr->mailbox, '@');
 476           if (c)
 477             *c = 0;
 478           add_mbox = 1;
 479         }
 480       add_personal = 0;
 481     }
 482   if (add_mbox || add_personal)
 483     {
 484       if (add_personal)
 485         add_snippet(ppos, term, addr->personal);
 486       if (add_mbox && add_personal)
 487         add_snippet_raw(ppos, term, " <");
 488       if (add_mbox)
 489         add_snippet(ppos, term, addr->mailbox);
 490       if (add_mbox && add_personal)
 491         add_snippet_raw(ppos, term, ">");
 492     }
 493   else
 494     add_snippet_raw(ppos, term, "???");
 495   rfc822_free_address(&addr);
 496 }