* (c) 2012 Martin Mares <mj@ucw.cz>
*/
+#define _GNU_SOURCE
+
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdarg.h>
#include <getopt.h>
+#include <wchar.h>
+#include <locale.h>
#include <pcre.h>
return p;
}
+/*
+ * Allocate a block of `bytes` bytes with xmalloc() and fill it with zeroes.
+ * The pointer obtained from xmalloc() is returned to the caller.
+ */
+static void *xmalloc_zero(size_t bytes)
+{
+	// memset() returns its destination pointer, so allocation and
+	// clearing can be chained into a single expression.
+	return memset(xmalloc(bytes), 0, bytes);
+}
+
static void *xrealloc(void *old, size_t bytes)
{
void *p = realloc(old, bytes);
static inline type *name##_nth(name##_t *b, int n) { return &b->start[n]; } \
// end
+DECLARE_BUF(intarray, int);	// Growable buffer of ints: intarray_t and the intarray_*() helpers (holds per-column widths)
+
/*** Formats and their parameters ***/
enum format_id {
FORM_CSV,
FORM_WS,
FORM_REGEX,
+ FORM_TMP,
+ FORM_TABLE,
};
struct format {
int quiet;
int (*read_line)(void);
void (*write_line)(void);
+ int needs_two_passes;
+
// CSV backend:
int always_quote;
+
+ // WS backend:
+ int strict_ws;
+
// regex backend:
pcre *pcre;
pcre_extra *pcre_extra;
+
+ // Temporary file backend:
+ FILE *tmp_file;
+ intarray_t column_widths;
+
+ // Table backend:
+ int table_sep;
};
static struct format *in_format, *out_format;
+static int want_trim;
struct field {
int start_pos;
fprintf(stderr, "Warning at line %d: ", line_number);
va_list args;
va_start(args, msg);
- vfprintf(stderr, args, msg);
+ vfprintf(stderr, msg, args);
va_end(args);
fputc('\n', stderr);
}
static int next_line(void)
{
for (;;) {
- int c = getchar();
+ int c = getchar_unlocked();
if (c == '\r')
continue;
if (c < 0)
}
}
+/*
+ * Count the multibyte characters stored in field `f` of the current input line.
+ *
+ * The bytes of the field are walked with mbrlen(), starting from a zeroed
+ * conversion state. Counting stops early at the first position where
+ * mbrlen() does not report a positive character length, so the result
+ * may cover only a prefix of the field.
+ */
+static int field_chars(struct field *f)
+{
+	unsigned char *text = line_nth(&in_line, f->start_pos);
+	mbstate_t state;
+	memset(&state, 0, sizeof(state));
+
+	int count = 0;
+	for (int pos = 0; pos < f->len; count++) {
+		size_t step = mbrlen((char *) text + pos, f->len - pos, &state);
+		// Viewed as an int, the error returns (size_t) -1 and (size_t) -2
+		// are expected to come out negative; 0 stands for a NUL character.
+		if ((int) step <= 0)
+			break;
+		pos += step;
+	}
+
+	return count;
+}
+
+/*** CSV/TSV back-end ***/
+
static int csv_read(void)
{
int quoted = 0;
for (;;) {
- int c = getchar();
+ int c = getchar_unlocked();
int i = line_count(&in_line);
restart:
if (c == '\r')
}
if (quoted) {
if (c == in_format->quote) {
- c = getchar();
+ c = getchar_unlocked();
if (c != in_format->quote) {
quoted = 0;
goto restart;
}
}
if (i)
- putchar(out_format->fs);
+ putchar_unlocked(out_format->fs);
if (need_quotes)
- putchar(out_format->quote);
+ putchar_unlocked(out_format->quote);
for (int j=0; j < f->len; j++) {
int c = line[f->start_pos + j];
if (c == out_format->fs && !need_quotes)
warn(out_format, "Field separator found inside field and quoting is turned off.");
if (c == out_format->quote)
- putchar(c);
- putchar(c);
+ putchar_unlocked(c);
+ putchar_unlocked(c);
}
if (need_quotes)
- putchar(out_format->quote);
+ putchar_unlocked(out_format->quote);
}
- putchar('\n');
+ putchar_unlocked('\n');
}
+/*** White-space back-end ***/
+
static int ws_read(void)
{
if (!next_line())
ws++;
} else {
if (ws) {
- new_field(i);
+ if (!in_field->start_pos &&
+ !in_field->len &&
+ !in_format->strict_ws)
+ in_field->start_pos = i;
+ else
+ new_field(i);
ws = 0;
}
in_field->len++;
}
}
- if (ws)
+ if (ws && in_format->strict_ws)
new_field(n);
return 1;
}
+/*** Regex back-end ***/
+
static const char *regex_set(struct format *f, char *rx)
{
const char *err;
}
}
+/*** Table back-end ***/
+
+/*
+ * Write the current line to standard output as one row of a table.
+ *
+ * Columns are separated by out_format->table_sep spaces and every field is
+ * padded with spaces on the right up to the width recorded for its column
+ * in in_format->column_widths. Widths are measured in characters as counted
+ * by field_chars(), while the field itself is copied byte by byte.
+ */
+static void table_write(void)
+{
+	int n = fields_count(&in_fields);
+
+	for (int col = 0; col < n; col++) {
+		struct field *f = fields_nth(&in_fields, col);
+
+		// The separator goes in front of every column except the first one
+		if (col > 0)
+			printf("%*s", out_format->table_sep, "");
+
+		int used = field_chars(f);
+		int width = *intarray_nth(&in_format->column_widths, col);
+		if (used > width) {
+			warn(out_format, "Internal error: Wrongly calculated column width (%d > %d)", used, width);
+			width = used;
+		}
+
+		unsigned char *src = line_nth(&in_line, f->start_pos);
+		for (int left = f->len; left > 0; left--)
+			putchar_unlocked(*src++);
+
+		// Fill the rest of the column with spaces
+		for (int pad = width - used; pad > 0; pad--)
+			putchar_unlocked(' ');
+	}
+	putchar_unlocked('\n');
+}
+
+/*** Temporary file back-end ***/
+
+/*
+ * Read one line back from the temporary file of in_format.
+ *
+ * Layout of the stream as consumed here: every field is a length prefix
+ * followed by that many bytes of data. A prefix byte below 0xfe is the
+ * length itself, 0xfe introduces a 4-byte big-endian length, and 0xff
+ * terminates the line.
+ *
+ * Each field is appended to in_line and registered by new_field().
+ *
+ * Returns 1 when a line terminator was reached, 0 on end of file or when
+ * the file turns out to be truncated or corrupted (a warning is issued
+ * in the latter two cases).
+ */
+static int tmp_read(void)
+{
+	FILE *tf = in_format->tmp_file;
+
+	for (;;) {
+		int c = getc_unlocked(tf);
+		if (c < 0)
+			return 0;
+		if (c == 0xff)
+			return 1;
+		if (c == 0xfe) {
+			// Every byte of the long length has to be checked for EOF:
+			// shifting an EOF (-1) into the value would be undefined
+			// behaviour and would produce a garbage length.
+			unsigned int len = 0;
+			for (int k = 0; k < 4; k++) {
+				int b = getc_unlocked(tf);
+				if (b < 0) {
+					warn(in_format, "Truncated temporary file");
+					return 0;
+				}
+				len = (len << 8) | (unsigned int) b;
+			}
+			// tmp_write() stores f->len, presumably a non-negative int,
+			// so a set top bit cannot come from a valid file. Letting it
+			// through would make the copy loop below run (almost) forever.
+			if (len > 0x7fffffffU) {
+				warn(in_format, "Corrupted temporary file");
+				return 0;
+			}
+			c = (int) len;
+		}
+		new_field(line_count(&in_line));
+		in_field->len = c;
+		while (c--) {
+			int x = getc_unlocked(tf);
+			if (x < 0) {
+				warn(in_format, "Truncated temporary file");
+				return 0;
+			}
+			*line_push(&in_line) = x;
+		}
+	}
+}
+
+/*
+ * Append the current line to the temporary file of out_format and update
+ * the per-column maximum widths in out_format->column_widths.
+ *
+ * Every field is written as a length prefix followed by its bytes: lengths
+ * below 0xfe take a single byte, anything else is written as 0xfe plus
+ * a 4-byte big-endian length. The line is terminated by a 0xff byte.
+ * Column widths are measured by field_chars().
+ */
+static void tmp_write(void)
+{
+	FILE *tf = out_format->tmp_file;
+	intarray_t *widths = &out_format->column_widths;
+	int n = fields_count(&in_fields);
+
+	for (int col = 0; col < n; col++) {
+		struct field *f = fields_nth(&in_fields, col);
+		int len = f->len;
+
+		if (len >= 0xfe) {
+			// Long form: marker byte, then the length from the most
+			// significant byte down to the least significant one
+			putc_unlocked(0xfe, tf);
+			for (int shift = 24; shift >= 0; shift -= 8)
+				putc_unlocked((len >> shift) & 0xff, tf);
+		} else
+			putc_unlocked(len, tf);
+
+		unsigned char *data = line_nth(&in_line, f->start_pos);
+		for (int j = 0; j < len; j++)
+			putc_unlocked(data[j], tf);
+
+		// Make sure a width slot exists for this column
+		while (intarray_count(widths) <= col)
+			*intarray_push(widths) = 0;
+
+		int chars = field_chars(f);
+		int *slot = intarray_nth(widths, col);
+		if (*slot < chars)
+			*slot = chars;
+	}
+	putc_unlocked(0xff, tf);
+}
+
/*** Transforms ***/
static void trim_fields(void)
}
}
+/*** Processing of files ***/
+
+/*
+ * Run one complete pass over the input: read lines with the reader of
+ * in_format until it reports that there is nothing more to read, and
+ * pass each of them to the writer of out_format.
+ *
+ * line_number is restarted for the pass and holds the 1-based number
+ * of the line being processed.
+ */
+static void one_pass(void)
+{
+	for (line_number = 1; ; line_number++) {
+		// Start every line with empty buffers and no current field
+		fields_reset(&in_fields);
+		line_reset(&in_line);
+		in_field = NULL;
+
+		if (!in_format->read_line())
+			break;
+
+		if (want_trim)
+			trim_fields();
+
+		fields_reset(&out_fields);
+		select_fields();
+
+		out_format->write_line();
+	}
+}
+
+/*
+ * Process the input in two passes, for output formats which set
+ * needs_two_passes.
+ *
+ * Pass 1 reads the real input and stores it through the temporary-file
+ * back-end (FORM_TMP), which also collects the column widths.
+ * Pass 2 reads the temporary file back and writes it with the originally
+ * requested output format.
+ *
+ * The program exits with status 1 if the temporary file cannot be created
+ * or written.
+ */
+static void two_pass(void)
+{
+	struct format *final_format = out_format;
+
+	// We need to use character set info from the current locale
+	setlocale(LC_CTYPE, "");
+
+	// Pass 1: Set up writer of intermediate format
+	out_format = xmalloc_zero(sizeof(*out_format));
+	out_format->id = FORM_TMP;
+	out_format->read_line = tmp_read;
+	out_format->write_line = tmp_write;
+	out_format->tmp_file = tmpfile();
+	if (!out_format->tmp_file) {
+		// Without this check, tmp_write() would write to a NULL stream
+		perror("Cannot create temporary file");
+		exit(1);
+	}
+	intarray_init(&out_format->column_widths);
+	one_pass();
+
+	// Write errors have to be detected here: rewind() below clears
+	// the error indicator of the stream.
+	if (fflush(out_format->tmp_file) || ferror(out_format->tmp_file)) {
+		fprintf(stderr, "Error writing temporary file\n");
+		exit(1);
+	}
+
+	// Pass 2: Set up reader of intermediate format
+	in_format = out_format;
+	rewind(in_format->tmp_file);
+	out_format = final_format;
+	one_pass();
+	fclose(in_format->tmp_file);
+}
+
/*** Parsing of arguments ***/
static void usage(void)
-t, --tsv TAB-separated values (default)\n\
-c, --csv Comma-separated values\n\
-w, --ws Values separated by arbitrary whitespace\n\
+-W, --strict-ws Like --ws, but recognize empty columns at start/end\n\
-r, --regex=<rx> Separator given by Perl regular expression (input only)\n\
+ --table Format a table (output only)\n\
\n\
Format parameters:\n\
-d, --fs=<char> Delimiter of fields\n\
-q, --quiet Do not show warnings\n\
--always-quote Put quotes around all fields (CSV output only)\n\
+ --table-sep=<n> Separate table columns by <n> spaces (default: 2)\n\
\n\
Other options:\n\
--trim Trim leading and trailing whitespaces in fields\n\
exit(1);
}
-static const char short_options[] = "cd:qr:tw";
+static const char short_options[] = "cd:qr:twW";
enum long_options {
OPT_HELP = 256,
OPT_TRIM,
OPT_ALWAYS_QUOTE,
+ OPT_TABLE,
+ OPT_TABLE_SEP,
};
static const struct option long_options[] = {
{ "fs", 1, NULL, 'd' },
{ "quiet", 0, NULL, 'q' },
{ "regex", 1, NULL, 'r' },
+ { "strict-ws", 0, NULL, 'W' },
+ { "table", 0, NULL, OPT_TABLE },
+ { "table-sep", 1, NULL, OPT_TABLE_SEP },
{ "trim", 0, NULL, OPT_TRIM },
{ "tsv", 0, NULL, 't' },
{ "ws", 0, NULL, 'w' },
static void set_format(int format_id)
{
- struct format *f = xmalloc(sizeof(*f));
- memset(f, 0, sizeof(*f));
+ struct format *f = xmalloc_zero(sizeof(*f));
f->id = format_id;
switch (format_id) {
case FORM_REGEX:
f->read_line = regex_read;
break;
+ case FORM_TABLE:
+ f->write_line = table_write;
+ f->needs_two_passes = 1;
+ f->table_sep = 2;
+ break;
}
if (!in_format)
else if (!out_format)
out_format = f;
else
- bad_args("At most two format may be given.");
+ bad_args("At most two formats may be given.");
}
static struct format *current_format(void)
int main(int argc, char **argv)
{
int opt;
- int want_trim = 0;
const char *err;
while ((opt = getopt_long(argc, argv, short_options, long_options, NULL)) >= 0)
case 'w':
set_format(FORM_WS);
break;
+ case 'W':
+ set_format(FORM_WS);
+ current_format()->strict_ws = 1;
+ break;
case OPT_ALWAYS_QUOTE:
if (current_format()->id != FORM_CSV)
bad_args("--always-quote makes sense only for CSV.");
case OPT_TRIM:
want_trim = 1;
break;
+ case OPT_TABLE:
+ set_format(FORM_TABLE);
+ break;
+ case OPT_TABLE_SEP:
+ current_format()->table_sep = atoi(optarg);
+ break;
default:
bad_args(NULL);
}
fields_init(&out_fields);
line_init(&in_line);
- for (;;) {
- line_number++;
- fields_reset(&in_fields);
- line_reset(&in_line);
- in_field = NULL;
- if (!in_format->read_line())
- break;
-
- if (want_trim)
- trim_fields();
-
- fields_reset(&out_fields);
- select_fields();
-
- out_format->write_line();
- }
-
+ if (out_format->needs_two_passes)
+ two_pass();
+ else
+ one_pass();
return 0;
}