#include <string.h>
#include <stdarg.h>
#include <getopt.h>
+#include <wchar.h>
+#include <locale.h>
#include <pcre.h>
return p;
}
+/* Allocate `bytes` bytes with xmalloc() and clear all of them to zero. */
+static void *xmalloc_zero(size_t bytes)
+{
+	// memset() returns its first argument, i.e. the fresh block
+	return memset(xmalloc(bytes), 0, bytes);
+}
+
static void *xrealloc(void *old, size_t bytes)
{
void *p = realloc(old, bytes);
static inline type *name##_nth(name##_t *b, int n) { return &b->start[n]; } \
// end
+DECLARE_BUF(intarray, int);
+
/*** Formats and their parameters ***/
+// Identifies the kind of a format (stored in the `id` member of struct format)
enum format_id {
FORM_CSV,
FORM_WS,
FORM_REGEX,
+ FORM_TMP,	// intermediate format set up by two_pass() (tmp_read/tmp_write)
+ FORM_TABLE,	// selected by --table; written by table_write()
};
+// Settings and line callbacks of one format; in_format and out_format point to these
struct format {
int quiet;
int (*read_line)(void);
void (*write_line)(void);
+ int needs_two_passes;	// nonzero: main() calls two_pass() instead of one_pass()
+
// CSV backend:
int always_quote;
+
+ // WS backend:
+ int strict_ws;	// set by -W: empty columns at line start/end are kept as fields
+
// regex backend:
pcre *pcre;
pcre_extra *pcre_extra;
+
+ // Temporary file backend:
+ FILE *tmp_file;	// written by tmp_write(), read back by tmp_read()
+ intarray_t column_widths;	// per column: largest field_chars() seen by tmp_write()
+
+ // Table backend:
+ int table_sep;	// number of spaces table_write() prints between columns
};
static struct format *in_format, *out_format;
+static int want_trim;
struct field {
int start_pos;
static line_t in_line;
static int line_number;
+// Append an empty field to in_fields and make it the current in_field.
+// pos is the offset in in_line at which the field's bytes start.
-static void new_field(void)
+static void new_field(int pos)
{
in_field = fields_push(&in_fields);
- in_field->start_pos = line_count(&in_line);
+ in_field->start_pos = pos;
in_field->len = 0;
}
+// Make sure there is a current field: if in_field is NULL, start one at offset pos
-static void ensure_field(void)
+static void ensure_field(int pos)
{
if (!in_field)
- new_field();
+ new_field(pos);
}
static void warn(struct format *fmt, char *msg, ...)
}
}
+/*
+ * Count the characters (not bytes) of field f, decoding its bytes in
+ * in_line as multibyte text of the current LC_CTYPE locale.  The result
+ * is used as the field's width when laying out table columns.
+ *
+ * Bytes which cannot be decoded (invalid or truncated sequence) and NUL
+ * bytes are counted as one character each, so that a malformed field is
+ * still measured over its whole length instead of being cut off at the
+ * first problem.
+ */
+static int field_chars(struct field *f)
+{
+	unsigned char *s = line_nth(&in_line, f->start_pos);
+	mbstate_t mbs;
+	memset(&mbs, 0, sizeof(mbs));
+
+	int chars = 0;
+	int i = 0;
+	while (i < f->len) {
+		size_t k = mbrlen((char *) s + i, f->len - i, &mbs);
+		if (k == (size_t) -1 || k == (size_t) -2 || !k) {
+			// The conversion state is unspecified after an error: restart it
+			memset(&mbs, 0, sizeof(mbs));
+			k = 1;
+		}
+		i += k;
+		chars++;
+	}
+
+	return chars;
+}
+
+/*** CSV/TSV back-end */
+
static int csv_read(void)
{
int quoted = 0;
for (;;) {
int c = getchar();
+ int i = line_count(&in_line);
restart:
if (c == '\r')
continue;
quoted = 1;
continue;
} else if (c == in_format->fs && !quoted) {
- ensure_field();
- new_field();
+ ensure_field(i);
+ new_field(i);
continue;
}
- ensure_field();
+ ensure_field(i);
*line_push(&in_line) = c;
in_field->len++;
}
putchar('\n');
}
+/*** White-space back-end ***/
+
+// Read one line via next_line() and split it into fields at runs of whitespace.
+// Fields are recorded as (start_pos, len) ranges of in_line; no bytes are copied.
+// Returns 0 when next_line() returns 0, otherwise 1.
static int ws_read(void)
{
+ if (!next_line())
+ return 0;
+
+ unsigned char *line = line_first(&in_line);
+ int n = line_count(&in_line);
+ // An empty line yields a line with no fields at all
+ if (!n)
+ return 1;
+
int ws = 0;
- for (;;) {
- int c = getchar();
- if (c < 0)
- return !!fields_count(&in_fields);
- if (c == '\r')
- continue;
- if (c == '\n')
- return 1;
+ // The first field starts at offset 0; ws counts the whitespace seen since the last field byte
+ new_field(0);
+ for (int i=0; i<n; i++) {
+ int c = line[i];
if (is_ws(c)) {
- ensure_field();
- if (!ws)
- new_field();
ws++;
} else {
- ensure_field();
- *line_push(&in_line) = c;
+ if (ws) {
+ // Non-strict mode: leading whitespace only moves the start of the
+ // still empty first field. Otherwise the whitespace ended a field.
+ if (!in_field->start_pos &&
+ !in_field->len &&
+ !in_format->strict_ws)
+ in_field->start_pos = i;
+ else
+ new_field(i);
+ ws = 0;
+ }
in_field->len++;
- ws = 0;
}
}
+
+ // Strict mode: trailing whitespace is followed by one more (empty) field
+ if (ws && in_format->strict_ws)
+ new_field(n);
+ return 1;
}
+/*** Regex back-end ***/
+
static const char *regex_set(struct format *f, char *rx)
{
const char *err;
if (sep != PCRE_ERROR_NOMATCH)
warn(in_format, "PCRE matching error %d", sep);
// No further occurrence of the separator: the rest is a single field
- new_field();
- in_field->start_pos = i;
+ new_field(i);
in_field->len = n - i;
return 1;
}
- new_field();
- in_field->start_pos = i;
+ new_field(i);
in_field->len = ovec[0] - i;
i = ovec[1];
}
}
+/*** Table back-end ***/
+
+/*
+ * Write the current input line as one table row on stdout.
+ *
+ * Columns are separated by out_format->table_sep spaces and every field
+ * is padded with spaces up to the width stored for its column in
+ * in_format->column_widths.  A field wider than the stored width is
+ * reported by a warning and printed unpadded.
+ */
+static void table_write(void)
+{
+	int count = fields_count(&in_fields);
+
+	for (int col = 0; col < count; col++) {
+		struct field *fld = fields_nth(&in_fields, col);
+		int have = field_chars(fld);
+		int want = *intarray_nth(&in_format->column_widths, col);
+
+		if (col > 0)
+			printf("%*s", out_format->table_sep, "");
+
+		if (have > want) {
+			warn(out_format, "Internal error: Wrongly calculated column width (%d > %d)", have, want);
+			want = have;
+		}
+
+		// The field's bytes go out unchanged; only the padding is computed in characters
+		if (fld->len > 0)
+			fwrite(line_nth(&in_line, fld->start_pos), 1, fld->len, stdout);
+		for (int pad = want - have; pad > 0; pad--)
+			putchar(' ');
+	}
+	putchar('\n');
+}
+
+/*** Temporary file back-end ***/
+
+/*
+ * Read one line back from the temporary file produced by tmp_write().
+ *
+ * Each field is stored as a length followed by that many raw bytes.
+ * Lengths below 0xfe occupy a single byte; otherwise a 0xfe byte is
+ * followed by the length as four bytes, most significant first.
+ * A 0xff byte terminates the line.
+ *
+ * Returns 1 after a complete line.  Returns 0 at end of file; if the end
+ * comes in the middle of a field or the stored length is not plausible,
+ * a warning is issued before returning 0.
+ */
+static int tmp_read(void)
+{
+	FILE *tf = in_format->tmp_file;
+
+	for (;;) {
+		int c = fgetc(tf);
+		if (c < 0)
+			return 0;
+		if (c == 0xff)
+			return 1;
+
+		unsigned int len = c;
+		if (c == 0xfe) {
+			// Assemble the long length unsigned and check every byte for EOF
+			len = 0;
+			for (int k = 0; k < 4; k++) {
+				int b = fgetc(tf);
+				if (b < 0) {
+					warn(in_format, "Truncated temporary file");
+					return 0;
+				}
+				len = (len << 8) | (unsigned int) b;
+			}
+			// Field lengths are kept in an int, larger values cannot be genuine
+			if (len > 0x7fffffff) {
+				warn(in_format, "Corrupted temporary file");
+				return 0;
+			}
+		}
+
+		new_field(line_count(&in_line));
+		in_field->len = len;
+		while (len--) {
+			int x = fgetc(tf);
+			if (x < 0) {
+				warn(in_format, "Truncated temporary file");
+				return 0;
+			}
+			*line_push(&in_line) = x;
+		}
+	}
+}
+
+/*
+ * Append the current input line to the temporary file of out_format and
+ * update the per-column maximum widths in out_format->column_widths.
+ *
+ * Every field is written as its length (one byte if below 0xfe, else
+ * 0xfe plus four bytes, most significant first) followed by its bytes;
+ * the line is closed by a 0xff byte.
+ */
+static void tmp_write(void)
+{
+	FILE *tf = out_format->tmp_file;
+	intarray_t *widths = &out_format->column_widths;
+	int count = fields_count(&in_fields);
+
+	for (int col = 0; col < count; col++) {
+		struct field *fld = fields_nth(&in_fields, col);
+		int len = fld->len;
+
+		if (len >= 0xfe) {
+			fputc(0xfe, tf);
+			for (int shift = 24; shift >= 0; shift -= 8)
+				fputc((len >> shift) & 0xff, tf);
+		} else
+			fputc(len, tf);
+
+		if (len > 0)
+			fwrite(line_nth(&in_line, fld->start_pos), 1, len, tf);
+
+		// Grow the width table so that it has a slot for this column
+		while (intarray_count(widths) <= col)
+			*intarray_push(widths) = 0;
+
+		int *slot = intarray_nth(widths, col);
+		int chars = field_chars(fld);
+		if (chars > *slot)
+			*slot = chars;
+	}
+	fputc(0xff, tf);
+}
+
/*** Transforms ***/
static void trim_fields(void)
}
}
+/*** Processing of files ***/
+
+/*
+ * Process the whole input once: read lines with in_format->read_line()
+ * until it returns 0 and hand each of them, after optional trimming and
+ * field selection, to out_format->write_line().  line_number counts the
+ * lines from 1.
+ */
+static void one_pass(void)
+{
+	for (line_number = 1; ; line_number++) {
+		// Start every line with empty buffers and no current field
+		in_field = NULL;
+		line_reset(&in_line);
+		fields_reset(&in_fields);
+
+		if (!in_format->read_line())
+			return;
+
+		if (want_trim)
+			trim_fields();
+
+		fields_reset(&out_fields);
+		select_fields();
+
+		out_format->write_line();
+	}
+}
+
+/*
+ * Process the input in two passes, for output formats which need to know
+ * the whole input before they can write anything (needs_two_passes).
+ *
+ * Pass 1 reads the real input and stores it in a temporary file using the
+ * FORM_TMP format, which also collects the column widths.  Pass 2 reads
+ * the temporary file back and feeds the real output format.
+ *
+ * Exits with status 1 if the temporary file cannot be created or written.
+ */
+static void two_pass(void)
+{
+	struct format *final_format = out_format;
+
+	// We need to use character set info from the current locale
+	setlocale(LC_CTYPE, "");
+
+	// Pass 1: Set up writer of intermediate format
+	struct format *tmp = xmalloc_zero(sizeof(*tmp));
+	tmp->id = FORM_TMP;
+	tmp->read_line = tmp_read;
+	tmp->write_line = tmp_write;
+	tmp->tmp_file = tmpfile();
+	if (!tmp->tmp_file) {
+		perror("Cannot create temporary file");
+		exit(1);
+	}
+	intarray_init(&tmp->column_widths);
+	out_format = tmp;
+	one_pass();
+
+	// rewind() clears the error indicator, so write errors must be checked first
+	if (fflush(tmp->tmp_file) || ferror(tmp->tmp_file)) {
+		fprintf(stderr, "Error writing temporary file\n");
+		exit(1);
+	}
+
+	// Pass 2: Set up reader of intermediate format
+	in_format = tmp;
+	rewind(in_format->tmp_file);
+	out_format = final_format;
+	one_pass();
+	fclose(in_format->tmp_file);
+}
+
/*** Parsing of arguments ***/
static void usage(void)
-t, --tsv TAB-separated values (default)\n\
-c, --csv Comma-separated values\n\
-w, --ws Values separated by arbitrary whitespace\n\
+-W, --strict-ws Like --ws, but recognize empty columns at start/end\n\
-r, --regex=<rx> Separator given by Perl regular expression (input only)\n\
+ --table Format a table (output only)\n\
\n\
Format parameters:\n\
-d, --fs=<char> Delimiter of fields\n\
-q, --quiet Do not show warnings\n\
--always-quote Put quotes around all fields (CSV output only)\n\
+ --table-sep=<n> Separate table columns by <n> spaces (default: 2)\n\
\n\
Other options:\n\
--trim Trim leading and trailing whitespaces in fields\n\
exit(1);
}
-static const char short_options[] = "cd:qr:tw";
+static const char short_options[] = "cd:qr:twW";
+// Codes of options which have no short form; they start at 256, above any single-character code
enum long_options {
OPT_HELP = 256,
OPT_TRIM,
OPT_ALWAYS_QUOTE,
+ OPT_TABLE,	// --table
+ OPT_TABLE_SEP,	// --table-sep=<n>
};
static const struct option long_options[] = {
{ "fs", 1, NULL, 'd' },
{ "quiet", 0, NULL, 'q' },
{ "regex", 1, NULL, 'r' },
+ { "strict-ws", 0, NULL, 'W' },
+ { "table", 0, NULL, OPT_TABLE },
+ { "table-sep", 1, NULL, OPT_TABLE_SEP },
{ "trim", 0, NULL, OPT_TRIM },
{ "tsv", 0, NULL, 't' },
{ "ws", 0, NULL, 'w' },
static void set_format(int format_id)
{
- struct format *f = xmalloc(sizeof(*f));
- memset(f, 0, sizeof(*f));
+ struct format *f = xmalloc_zero(sizeof(*f));
f->id = format_id;
switch (format_id) {
case FORM_REGEX:
f->read_line = regex_read;
break;
+ case FORM_TABLE:
+ f->write_line = table_write;
+ f->needs_two_passes = 1;
+ f->table_sep = 2;
+ break;
}
if (!in_format)
else if (!out_format)
out_format = f;
else
- bad_args("At most two format may be given.");
+ bad_args("At most two formats may be given.");
}
static struct format *current_format(void)
int main(int argc, char **argv)
{
int opt;
- int want_trim = 0;
const char *err;
while ((opt = getopt_long(argc, argv, short_options, long_options, NULL)) >= 0)
case 'w':
set_format(FORM_WS);
break;
+ case 'W':
+ set_format(FORM_WS);
+ current_format()->strict_ws = 1;
+ break;
case OPT_ALWAYS_QUOTE:
if (current_format()->id != FORM_CSV)
bad_args("--always-quote makes sense only for CSV.");
case OPT_TRIM:
want_trim = 1;
break;
+ case OPT_TABLE:
+ set_format(FORM_TABLE);
+ break;
+ case OPT_TABLE_SEP:
+ current_format()->table_sep = atoi(optarg);
+ break;
default:
bad_args(NULL);
}
fields_init(&out_fields);
line_init(&in_line);
- for (;;) {
- line_number++;
- fields_reset(&in_fields);
- line_reset(&in_line);
- in_field = NULL;
- if (!in_format->read_line())
- break;
-
- if (want_trim)
- trim_fields();
-
- fields_reset(&out_fields);
- select_fields();
-
- out_format->write_line();
- }
-
+ if (out_format->needs_two_passes)
+ two_pass();
+ else
+ one_pass();
return 0;
}