* (c) 2012 Martin Mares <mj@ucw.cz>
*/
+#define _GNU_SOURCE
+
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdarg.h>
#include <getopt.h>
+#include <wchar.h>
+#include <locale.h>
#include <pcre.h>
fprintf(stderr, "Warning at line %d: ", line_number);
va_list args;
va_start(args, msg);
- vfprintf(stderr, args, msg);
+ vfprintf(stderr, msg, args);
va_end(args);
fputc('\n', stderr);
}
static int next_line(void)
{
for (;;) {
- int c = getchar();
+ int c = getchar_unlocked();
if (c == '\r')
continue;
if (c < 0)
}
}
+/*
+ * Count the characters (as opposed to bytes) in field @f of the current
+ * input line.  The f->len bytes starting at position f->start_pos of in_line
+ * are decoded with mbrlen(), i.e., according to the multibyte encoding of
+ * the current LC_CTYPE locale.
+ *
+ * Bytes which cannot be decoded (an invalid sequence, a sequence truncated
+ * by the end of the field, or an embedded NUL) are counted as one character
+ * each, so the whole field is always accounted for and the result never
+ * exceeds f->len.
+ *
+ * Note that this counts characters, not display columns: wide and
+ * zero-width characters are still counted as one.
+ */
+static int field_chars(struct field *f)
+{
+	unsigned char *s = line_nth(&in_line, f->start_pos);
+	mbstate_t mbs;
+	memset(&mbs, 0, sizeof(mbs));
+
+	int chars = 0;
+	int i = 0;
+	while (i < f->len) {
+		size_t k = mbrlen((char *) s + i, f->len - i, &mbs);
+		if (k == (size_t) -1 || k == (size_t) -2 || !k) {
+			// Invalid or incomplete sequence, or a NUL byte: the conversion
+			// state is unspecified after an error, so start afresh and
+			// treat the offending byte as a single character.
+			memset(&mbs, 0, sizeof(mbs));
+			k = 1;
+		}
+		i += (int) k;
+		chars++;
+	}
+
+	return chars;
+}
+
/*** CSV/TSV back-end */
static int csv_read(void)
{
int quoted = 0;
for (;;) {
- int c = getchar();
+ int c = getchar_unlocked();
int i = line_count(&in_line);
restart:
if (c == '\r')
}
if (quoted) {
if (c == in_format->quote) {
- c = getchar();
+ c = getchar_unlocked();
if (c != in_format->quote) {
quoted = 0;
goto restart;
}
}
if (i)
- putchar(out_format->fs);
+ putchar_unlocked(out_format->fs);
if (need_quotes)
- putchar(out_format->quote);
+ putchar_unlocked(out_format->quote);
for (int j=0; j < f->len; j++) {
int c = line[f->start_pos + j];
if (c == out_format->fs && !need_quotes)
warn(out_format, "Field separator found inside field and quoting is turned off.");
if (c == out_format->quote)
- putchar(c);
- putchar(c);
+ putchar_unlocked(c);
+ putchar_unlocked(c);
}
if (need_quotes)
- putchar(out_format->quote);
+ putchar_unlocked(out_format->quote);
}
- putchar('\n');
+ putchar_unlocked('\n');
}
/*** White-space back-end ***/
if (i)
printf("%*s", out_format->table_sep, "");
struct field *f = fields_nth(&in_fields, i);
- int w = *intarray_nth(&in_format->column_widths, i);
- if (f->len > w) {
- warn(out_format, "Internal error: Wrongly calculated column width (%d > %d)", f->len, w);
- w = f->len;
+ int fw = field_chars(f);
+ int cw = *intarray_nth(&in_format->column_widths, i);
+ if (fw > cw) {
+ warn(out_format, "Internal error: Wrongly calculated column width (%d > %d)", fw, cw);
+ cw = fw;
}
- int j = 0;
unsigned char *p = line_nth(&in_line, f->start_pos);
- while (j < f->len) {
- putchar(*p++);
- j++;
- }
- while (j < w) {
- putchar(' ');
- j++;
+ for (int j = 0; j < f->len; j++)
+ putchar_unlocked(p[j]);
+ while (fw < cw) {
+ putchar_unlocked(' ');
+ fw++;
}
}
- putchar('\n');
+ putchar_unlocked('\n');
}
/*** Temporary file back-end ***/
FILE *tf = in_format->tmp_file;
for (;;) {
- int c = fgetc(tf);
+ int c = getc_unlocked(tf);
if (c < 0)
return 0;
if (c == 0xff)
return 1;
if (c == 0xfe) {
- c = fgetc(tf);
- c = (c << 8) | fgetc(tf);
- c = (c << 8) | fgetc(tf);
- c = (c << 8) | fgetc(tf);
+ c = getc_unlocked(tf);
+ c = (c << 8) | getc_unlocked(tf);
+ c = (c << 8) | getc_unlocked(tf);
+ c = (c << 8) | getc_unlocked(tf);
}
new_field(line_count(&in_line));
in_field->len = c;
while (c--) {
- int x = fgetc(tf);
+ int x = getc_unlocked(tf);
if (x < 0) {
warn(in_format, "Truncated temporary file");
return 0;
for (int i = 0; i < fields_count(&in_fields); i++) {
struct field *f = fields_nth(&in_fields, i);
if (f->len < 0xfe)
- fputc(f->len, tf);
+ putc_unlocked(f->len, tf);
else {
- fputc(0xfe, tf);
- fputc((f->len >> 24) & 0xff, tf);
- fputc((f->len >> 16) & 0xff, tf);
- fputc((f->len >> 8) & 0xff, tf);
- fputc(f->len & 0xff, tf);
+ putc_unlocked(0xfe, tf);
+ putc_unlocked((f->len >> 24) & 0xff, tf);
+ putc_unlocked((f->len >> 16) & 0xff, tf);
+ putc_unlocked((f->len >> 8) & 0xff, tf);
+ putc_unlocked(f->len & 0xff, tf);
}
unsigned char *p = line_nth(&in_line, f->start_pos);
for (int j = 0; j < f->len; j++)
- fputc(*p++, tf);
+ putc_unlocked(*p++, tf);
intarray_t *w = &out_format->column_widths;
while (i >= intarray_count(w))
*intarray_push(w) = 0;
- if (*intarray_nth(w, i) < f->len)
- *intarray_nth(w, i) = f->len;
+ int fw = field_chars(f);
+ if (*intarray_nth(w, i) < fw)
+ *intarray_nth(w, i) = fw;
}
- fputc(0xff, tf);
+ putc_unlocked(0xff, tf);
}
/*** Transforms ***/
{
struct format *final_format = out_format;
+ // We need to use character set info from the current locale
+ setlocale(LC_CTYPE, "");
+
// Pass 1: Set up writer of intermediate format
out_format = xmalloc_zero(sizeof(*out_format));
out_format->id = FORM_TMP;