#include <stdio.h>
#include <stdlib.h>
#include <string.h>
+#include <stdarg.h>
#include <getopt.h>
+#include <pcre.h>
+
/*** Memory allocation ***/
static void *xmalloc(size_t bytes)
return p;
}
+/* Allocate `bytes` bytes with xmalloc() and clear all of them to zero. */
+static void *xmalloc_zero(size_t bytes)
+{
+	/* memset() returns its destination pointer, i.e. the freshly cleared block. */
+	return memset(xmalloc(bytes), 0, bytes);
+}
+
static void *xrealloc(void *old, size_t bytes)
{
void *p = realloc(old, bytes);
if (b->count >= b->max) name##_extend(b); \
return &b->start[b->count++]; \
} \
+ static inline type *name##_first(name##_t *b) { return b->start; } \
static inline type *name##_nth(name##_t *b, int n) { return &b->start[n]; } \
// end
+DECLARE_BUF(intarray, int);
+
/*** Formats and their parameters ***/
enum format_id {
FORM_TSV,
FORM_CSV,
FORM_WS,
+ FORM_REGEX,	// fields split by a regex separator; set_format() installs only a reader for it
+ FORM_TMP,	// intermediate temporary-file format created internally by two_pass()
+ FORM_TABLE,	// aligned table; set_format() installs only a writer and requests two passes
};
struct format {
enum format_id id;
int fs;
int quote;
+ int quiet;	// non-zero: warn() prints nothing for this format
int (*read_line)(void);
void (*write_line)(void);
+ int needs_two_passes;	// non-zero on the output format makes main() call two_pass() instead of one_pass()
+
+ // CSV backend:
+ int always_quote;	// csv_write() quotes every field when set
+
+ // WS backend:
+ int strict_ws;	// ws_read() keeps empty fields caused by leading/trailing whitespace
+
+ // regex backend:
+ pcre *pcre;	// separator pattern compiled by regex_set()
+ pcre_extra *pcre_extra;	// result of pcre_study() on that pattern; may be NULL
+
+ // Temporary file backend:
+ FILE *tmp_file;	// stream written by tmp_write() and read back by tmp_read()
+ intarray_t column_widths;	// per column index: largest field length seen by tmp_write()
+
+ // Table backend:
+ int table_sep;	// width of the blank gap table_write() prints between columns
};
static struct format *in_format, *out_format;
+static int want_trim;
struct field {
int start_pos;
static fields_t in_fields, out_fields;
static struct field *in_field;
static line_t in_line;
+static int line_number;
-static void new_field(void)
+static void new_field(int pos)	// Push a new empty field onto in_fields, starting at offset pos, and make it in_field
{
in_field = fields_push(&in_fields);
- in_field->start_pos = line_count(&in_line);
+ in_field->start_pos = pos;	// the caller now supplies the start offset explicitly
in_field->len = 0;
}
-static void ensure_field(void)
+static void ensure_field(int pos)	// Open a field at pos only when there is no current field (in_field is NULL)
{
if (!in_field)
- new_field();
+ new_field(pos);
+}
+
+/*
+ * Print a printf-style warning to stderr, prefixed with the current value
+ * of line_number and terminated by a newline.
+ *
+ * fmt: format whose `quiet` flag decides whether anything is printed.
+ * msg: printf-style format string, followed by its arguments.
+ */
+static void warn(struct format *fmt, char *msg, ...)
+{
+	if (fmt->quiet)
+		return;
+
+	fprintf(stderr, "Warning at line %d: ", line_number);
+	va_list args;
+	va_start(args, msg);
+	// vfprintf() expects the format string second and the va_list last.
+	vfprintf(stderr, msg, args);
+	va_end(args);
+	fputc('\n', stderr);
}
+/*
+ * Append the next line of standard input to in_line, dropping every '\r'
+ * and the terminating '\n'.
+ *
+ * Returns 1 when a '\n' was reached.  At end of input it returns 1 if
+ * in_line is non-empty and 0 otherwise.
+ */
+static int next_line(void)
+{
+	int ch;
+
+	while ((ch = getchar()) >= 0) {
+		if (ch == '\n')
+			return 1;
+		if (ch != '\r')
+			*line_push(&in_line) = ch;
+	}
+
+	// End of input: report a line only if some characters were collected.
+	return line_count(&in_line) != 0;
+}
+
+/*** CSV/TSV back-end ***/
+
static int csv_read(void)
{
int quoted = 0;
- // FIXME: Complain if closing quote is missing?
for (;;) {
int c = getchar();
+ int i = line_count(&in_line);
restart:
- if (c < 0)
- return !!fields_count(&in_fields);
if (c == '\r')
continue;
- if (c == '\n')
- return 1;
+ if (c < 0 || c == '\n') {
+ if (quoted)
+ warn(in_format, "Missing closing quote.");
+ if (c < 0)
+ return !!fields_count(&in_fields);
+ else
+ return 1;
+ }
if (quoted) {
if (c == in_format->quote) {
c = getchar();
quoted = 1;
continue;
} else if (c == in_format->fs && !quoted) {
- ensure_field();
- new_field();
+ ensure_field(i);
+ new_field(i);
continue;
}
- ensure_field();
+ ensure_field(i);
*line_push(&in_line) = c;
in_field->len++;
}
static void csv_write(void)
{
- unsigned char *line = line_nth(&in_line, 0);
+ unsigned char *line = line_first(&in_line);
int n = fields_count(&out_fields);
for (int i=0; i<n; i++) {
struct field *f = fields_nth(&out_fields, i);
int need_quotes = 0;
if (out_format->quote >= 0) {
- for (int j=0; j < f->len; j++) {
+ need_quotes = out_format->always_quote;
+ for (int j=0; !need_quotes && j < f->len; j++) {
int c = line[f->start_pos + j];
- if (c == out_format->fs || c == out_format->quote) {
+ if (c == out_format->fs || c == out_format->quote)
need_quotes = 1;
- break;
- }
}
}
if (i)
putchar(out_format->quote);
for (int j=0; j < f->len; j++) {
int c = line[f->start_pos + j];
+ if (c == out_format->fs && !need_quotes)
+ warn(out_format, "Field separator found inside field and quoting is turned off.");
if (c == out_format->quote)
putchar(c);
putchar(c);
putchar('\n');
}
+/*** White-space back-end ***/
+
static int ws_read(void)
{
+ if (!next_line())	// ws_read: fetch one line and split it into whitespace-separated fields; 0 means end of input
+ return 0;
+
+ unsigned char *line = line_first(&in_line);
+ int n = line_count(&in_line);
+ if (!n)	// empty line: reported as a line with no fields at all
+ return 1;
+
int ws = 0;
- for (;;) {
- int c = getchar();
- if (c < 0)
- return !!fields_count(&in_fields);
- if (c == '\r')
- continue;
- if (c == '\n')
- return 1;
+ new_field(0);	// the first field always starts at the beginning of the line
+ for (int i=0; i<n; i++) {
+ int c = line[i];
if (is_ws(c)) {
- ensure_field();
- if (!ws)
- new_field();
ws++;
} else {
- ensure_field();
- *line_push(&in_line) = c;
+ if (ws) {	// first non-whitespace character after a run of whitespace
+ if (!in_field->start_pos &&	// current field is still the empty one opened at offset 0,
+ !in_field->len &&	// so the line began with whitespace
+ !in_format->strict_ws)
+ in_field->start_pos = i;	// non-strict mode: skip the leading whitespace instead of keeping an empty first field
+ else
+ new_field(i);
+ ws = 0;
+ }
in_field->len++;
- ws = 0;
}
}
+
+ if (ws && in_format->strict_ws)	// strict mode: trailing whitespace is followed by an empty last field
+ new_field(n);
+ return 1;
+}
+
+/*** Regex back-end ***/
+
+/*
+ * Compile the separator pattern rx into f->pcre and study it into
+ * f->pcre_extra.
+ *
+ * Returns the PCRE error message when compilation fails.  When
+ * pcre_study() yields no extra data, whatever message pointer it stored is
+ * returned (NULL unless studying failed, per the PCRE documentation).
+ * Returns NULL otherwise.
+ */
+static const char *regex_set(struct format *f, char *rx)
+{
+	const char *message;
+	int offset;
+
+	f->pcre = pcre_compile(rx, PCRE_DOLLAR_ENDONLY, &message, &offset, NULL);
+	if (f->pcre) {
+		f->pcre_extra = pcre_study(f->pcre, 0, &message);
+		if (f->pcre_extra)
+			message = NULL;
+	}
+	return message;
+}
+
+/*
+ * Read one input line and split it into fields at every match of the
+ * separator regex stored in in_format.
+ *
+ * Returns 0 at end of input and 1 otherwise.  An empty line yields no
+ * fields.  Text after the last separator (possibly empty) forms the last
+ * field.
+ */
+static int regex_read(void)
+{
+	if (!next_line())
+		return 0;
+
+	unsigned char *c = line_first(&in_line);
+	int n = line_count(&in_line);
+	if (!n)
+		return 1;
+
+	int i = 0;
+	for (;;) {
+		int ovec[3];
+		// PCRE_NOTEMPTY: a separator must consume at least one character.
+		// Otherwise a pattern that can match the empty string (e.g. "x*")
+		// would match at offset i forever and the loop would never advance.
+		int sep = pcre_exec(in_format->pcre, in_format->pcre_extra, (char *) c, n, i, PCRE_NOTEMPTY, ovec, 3);
+		// The ovec[1] test is a safety net guaranteeing progress of i.
+		if (sep < 0 || ovec[1] <= i) {
+			if (sep < 0 && sep != PCRE_ERROR_NOMATCH)
+				warn(in_format, "PCRE matching error %d", sep);
+			// No further occurrence of the separator: the rest is a single field
+			new_field(i);
+			in_field->len = n - i;
+			return 1;
+		}
+		// Field runs from the previous position up to the start of the separator.
+		new_field(i);
+		in_field->len = ovec[0] - i;
+		i = ovec[1];
+	}
+}
+
+/*** Table back-end ***/
+
+/*
+ * Write the current line as one table row on stdout.  Every field is
+ * padded with spaces to the width recorded for its column in
+ * in_format->column_widths, and columns are separated by
+ * out_format->table_sep spaces.
+ *
+ * NOTE(review): this iterates in_fields, not out_fields, so the result of
+ * select_fields() does not appear to influence the row -- confirm intended.
+ */
+static void table_write(void)
+{
+	for (int col = 0; col < fields_count(&in_fields); col++) {
+		if (col > 0)
+			printf("%*s", out_format->table_sep, "");
+
+		struct field *f = fields_nth(&in_fields, col);
+		int width = *intarray_nth(&in_format->column_widths, col);
+		if (width < f->len) {
+			warn(out_format, "Internal error: Wrongly calculated column width (%d > %d)", f->len, width);
+			width = f->len;
+		}
+
+		// Field contents first, then blanks up to the column width.
+		const unsigned char *text = line_nth(&in_line, f->start_pos);
+		int printed;
+		for (printed = 0; printed < f->len; printed++)
+			putchar(text[printed]);
+		for (; printed < width; printed++)
+			putchar(' ');
+	}
+	putchar('\n');
+}
+
+/*** Temporary file back-end ***/
+
+/*
+ * Read one record from the temporary file of in_format, as produced by
+ * tmp_write().  Each field is a length followed by that many raw bytes.
+ * The length is a single byte below 0xfe, or the marker 0xfe followed by
+ * four bytes, most significant first.  The byte 0xff terminates the record.
+ *
+ * Field bytes are appended to in_line and one field is pushed per length.
+ * Returns 1 when a complete record was read, and 0 at end of file or when
+ * the file turns out to be truncated (a warning is issued in that case).
+ */
+static int tmp_read(void)
+{
+	FILE *tf = in_format->tmp_file;
+
+	for (;;) {
+		int c = fgetc(tf);
+		if (c < 0)
+			return 0;
+		if (c == 0xff)
+			return 1;
+		if (c == 0xfe) {
+			// Long form: assemble the four length bytes.  Stop at EOF
+			// instead of shifting a negative value into the length.
+			c = 0;
+			for (int k = 0; k < 4; k++) {
+				int b = fgetc(tf);
+				if (b < 0) {
+					warn(in_format, "Truncated temporary file");
+					return 0;
+				}
+				c = (c << 8) | b;
+			}
+		}
+		new_field(line_count(&in_line));
+		in_field->len = c;
+		while (c--) {
+			int x = fgetc(tf);
+			if (x < 0) {
+				warn(in_format, "Truncated temporary file");
+				return 0;
+			}
+			*line_push(&in_line) = x;
+		}
+	}
+}
+
+/*
+ * Append the current line to the temporary file of out_format as one
+ * record.  Each field is a length followed by its raw bytes.  The length
+ * is a single byte when below 0xfe, otherwise the marker 0xfe plus four
+ * bytes, most significant first.  The byte 0xff ends the record.
+ *
+ * Also keeps out_format->column_widths up to date: entry i holds the
+ * largest length seen so far for field index i.
+ *
+ * NOTE(review): this iterates in_fields, not out_fields, so the result of
+ * select_fields() does not appear to influence what is stored -- confirm
+ * intended.
+ */
+static void tmp_write(void)
+{
+	FILE *out = out_format->tmp_file;
+	intarray_t *widths = &out_format->column_widths;
+
+	for (int col = 0; col < fields_count(&in_fields); col++) {
+		struct field *f = fields_nth(&in_fields, col);
+		int len = f->len;
+
+		if (len >= 0xfe) {
+			fputc(0xfe, out);
+			for (int shift = 24; shift >= 0; shift -= 8)
+				fputc((len >> shift) & 0xff, out);
+		} else
+			fputc(len, out);
+
+		const unsigned char *src = line_nth(&in_line, f->start_pos);
+		for (int k = 0; k < len; k++)
+			fputc(src[k], out);
+
+		// Grow the width table until it has a slot for this column.
+		while (intarray_count(widths) <= col)
+			*intarray_push(widths) = 0;
+		int *slot = intarray_nth(widths, col);
+		if (*slot < len)
+			*slot = len;
+	}
+	fputc(0xff, out);
}
/*** Transforms ***/
static void trim_fields(void)
{
- unsigned char *line = line_nth(&in_line, 0);
+ unsigned char *line = line_first(&in_line);
for (int i = 0; i < fields_count(&in_fields); i++) {
struct field *f = fields_nth(&in_fields, i);
while (f->len && is_ws(line[f->start_pos]))
}
}
+/*** Processing of files ***/
+
+/*
+ * Run the read -> (trim) -> select -> write pipeline over the whole input
+ * of in_format, one line at a time.  line_number counts lines from 1 and
+ * is what warn() reports.
+ */
+static void one_pass(void)
+{
+	for (line_number = 1; ; line_number++) {
+		// Forget everything that belonged to the previous line.
+		fields_reset(&in_fields);
+		line_reset(&in_line);
+		in_field = NULL;
+
+		// A reader returning 0 signals the end of input.
+		if (!in_format->read_line())
+			return;
+
+		if (want_trim)
+			trim_fields();
+
+		fields_reset(&out_fields);
+		select_fields();
+
+		out_format->write_line();
+	}
+}
+
+/*
+ * Process the input in two passes for output formats that need to see all
+ * data before writing anything (needs_two_passes).
+ *
+ * Pass 1 reads the real input and stores it through the temporary-file
+ * format, which also collects the column widths.  Pass 2 reads the
+ * temporary file back and writes it with the requested output format.
+ * Exits with status 1 if the temporary file cannot be created or written.
+ */
+static void two_pass(void)
+{
+	struct format *final_format = out_format;
+
+	// Pass 1: Set up writer of intermediate format
+	out_format = xmalloc_zero(sizeof(*out_format));
+	out_format->id = FORM_TMP;
+	out_format->read_line = tmp_read;
+	out_format->write_line = tmp_write;
+	out_format->tmp_file = tmpfile();
+	if (!out_format->tmp_file) {
+		// tmpfile() returns NULL on failure; tmp_write() would crash on it.
+		perror("xsv: Cannot create temporary file");
+		exit(1);
+	}
+	intarray_init(&out_format->column_widths);
+	one_pass();
+
+	// tmp_write() does not check its fputc() calls, and rewind() below
+	// clears the stream's error indicator, so detect write failures here.
+	if (fflush(out_format->tmp_file) == EOF || ferror(out_format->tmp_file)) {
+		fprintf(stderr, "xsv: Error writing temporary file\n");
+		exit(1);
+	}
+
+	// Pass 2: Set up reader of intermediate format
+	in_format = out_format;
+	rewind(in_format->tmp_file);
+	out_format = final_format;
+	one_pass();
+	fclose(in_format->tmp_file);
+}
+
/*** Parsing of arguments ***/
static void usage(void)
-t, --tsv TAB-separated values (default)\n\
-c, --csv Comma-separated values\n\
-w, --ws Values separated by arbitrary whitespace\n\
+-W, --strict-ws Like --ws, but recognize empty columns at start/end\n\
+-r, --regex=<rx> Separator given by Perl regular expression (input only)\n\
+ --table Format a table (output only)\n\
\n\
Format parameters:\n\
-d, --fs=<char> Delimiter of fields\n\
+-q, --quiet Do not show warnings\n\
+ --always-quote Put quotes around all fields (CSV output only)\n\
+ --table-sep=<n> Separate table columns by <n> spaces (default: 2)\n\
\n\
Other options:\n\
--trim Trim leading and trailing whitespaces in fields\n\
exit(0);
}
-static void bad_args(char *msg)
+static void bad_args(const char *msg, ...)	// Report a usage error and exit(1); msg is a printf-style format, NULL prints only the --help hint
{
- if (msg)
- fprintf(stderr, "xsv: %s\n", msg);
+ if (msg) {
+ va_list args;
+ va_start(args, msg);
+ fprintf(stderr, "xsv: ");
+ vfprintf(stderr, msg, args);	// msg is interpreted as a format string, so a literal '%' in it must be written as "%%"
+ fputc('\n', stderr);
+ va_end(args);
+ }
fprintf(stderr, "Try `xsv --help' for more information.\n");
exit(1);
}
-static const char short_options[] = "cd:tw";
+static const char short_options[] = "cd:qr:twW";	// getopt option string; a ':' after a letter (d, r) means the option takes an argument
enum long_options {
OPT_HELP = 256,
- OPT_TRIM = 257,
+ OPT_TRIM,	// implicit value: one more than OPT_HELP, i.e. still 257
+ OPT_ALWAYS_QUOTE,	// --always-quote
+ OPT_TABLE,	// --table
+ OPT_TABLE_SEP,	// --table-sep=<n>
};
static const struct option long_options[] = {
+ { "always-quote", 0, NULL, OPT_ALWAYS_QUOTE },
{ "csv", 0, NULL, 'c' },
{ "fs", 1, NULL, 'd' },
+ { "quiet", 0, NULL, 'q' },
+ { "regex", 1, NULL, 'r' },
+ { "strict-ws", 0, NULL, 'W' },
+ { "table", 0, NULL, OPT_TABLE },
+ { "table-sep", 1, NULL, OPT_TABLE_SEP },
{ "trim", 0, NULL, OPT_TRIM },
{ "tsv", 0, NULL, 't' },
{ "ws", 0, NULL, 'w' },
static void set_format(int format_id)
{
- struct format *f = xmalloc(sizeof(*f));
- memset(f, 0, sizeof(*f));
+ struct format *f = xmalloc_zero(sizeof(*f));
f->id = format_id;
switch (format_id) {
f->read_line = ws_read;
f->write_line = csv_write;
break;
+ case FORM_REGEX:
+ f->read_line = regex_read;
+ break;
+ case FORM_TABLE:
+ f->write_line = table_write;
+ f->needs_two_passes = 1;
+ f->table_sep = 2;
+ break;
}
if (!in_format)
else if (!out_format)
out_format = f;
else
- bad_args("At most two format may be given.");
+ bad_args("At most two formats may be given.");
}
static struct format *current_format(void)
int main(int argc, char **argv)
{
int opt;
- int want_trim = 0;
+ const char *err;
while ((opt = getopt_long(argc, argv, short_options, long_options, NULL)) >= 0)
switch (opt) {
else
bad_args("No field delimiter given.");
break;
+ case 'q':
+ current_format()->quiet = 1;
+ break;
+ case 'r':
+ set_format(FORM_REGEX);
+ err = regex_set(current_format(), optarg);
+ if (err)
+ bad_args("Error compiling regex: %s", err);
+ break;
case 't':
set_format(FORM_TSV);
break;
case 'w':
set_format(FORM_WS);
break;
+ case 'W':
+ set_format(FORM_WS);
+ current_format()->strict_ws = 1;
+ break;
+ case OPT_ALWAYS_QUOTE:
+ if (current_format()->id != FORM_CSV)
+ bad_args("--always-quote makes sense only for CSV.");
+ current_format()->always_quote = 1;
+ break;
case OPT_HELP:
usage();
case OPT_TRIM:
want_trim = 1;
break;
+ case OPT_TABLE:
+ set_format(FORM_TABLE);
+ break;
+ case OPT_TABLE_SEP:
+ current_format()->table_sep = atoi(optarg);
+ break;
default:
bad_args(NULL);
}
current_format();
if (!out_format)
out_format = in_format;
+ if (!in_format->read_line)
+ bad_args("Write-only format selected for input.");
+ if (!out_format->write_line)
+ bad_args("Read-only format selected for output.");
for (int i = optind; i < argc; i++) {
- char *err = parse_selector(argv[i]);
+ err = parse_selector(argv[i]);
if (err)
bad_args(err);
}
fields_init(&out_fields);
line_init(&in_line);
- for (;;) {
- fields_reset(&in_fields);
- line_reset(&in_line);
- in_field = NULL;
- if (!in_format->read_line())
- break;
-
- if (want_trim)
- trim_fields();
-
- fields_reset(&out_fields);
- select_fields();
-
- out_format->write_line();
- }
-
+ if (out_format->needs_two_passes)
+ two_pass();
+ else
+ one_pass();
return 0;
}