/*
- * A Swiss-Army Knife for CSV-like Files
+ * The Swiss-Army Knife for CSV-like Files
*
* (c) 2012 Martin Mares <mj@ucw.cz>
*/
-#define _GNU_SOURCE
-
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define UNUSED
#endif
+static void select_fields(void);
+static void select_all_fields(void);
+
/*** General functions ***/
static void NONRET die(char *msg, ...)
int fs;
int quote;
int quiet;
+ int sloppy;
int (*read_line)(struct format *fmt);
void (*write_line)(struct format *fmt);
void (*write_grid)(struct format *fmt, int pos); // -1=above, 1=below, 0=after header
// CSV backend:
int always_quote;
- // WS backend:
- int strict_ws;
-
// regex backend:
pcre *pcre;
pcre_extra *pcre_extra;
};
static struct format *in_format, *out_format;
-static int want_trim;
+static int want_trim, want_equalize, want_stats;
struct field {
int start_pos;
new_field(pos);
}
-// FIXME: Use elsewhere
static unsigned char *get_field(fields_t *fields, int i, int *len)
{
struct field *f = fields_nth(fields, i);
static void update_stats(void)
{
+ if (!want_stats)
+ return;
+
for (int i = 0; i < fields_count(&out_fields); i++) {
struct field *f = fields_nth(&out_fields, i);
intarray_t *w = &column_widths;
static void csv_write(struct format *fmt)
{
- unsigned char *line = line_first(&in_line);
- int n = fields_count(&out_fields);
- for (int i=0; i<n; i++) {
- struct field *f = fields_nth(&out_fields, i);
+ for (int i=0; i < fields_count(&out_fields); i++) {
+ int len;
+ unsigned char *p = get_field(&out_fields, i, &len);
+
int need_quotes = 0;
if (fmt->quote >= 0) {
need_quotes = fmt->always_quote;
- for (int j=0; !need_quotes && j < f->len; j++) {
- int c = line[f->start_pos + j];
- if (c == fmt->fs || c == fmt->quote)
+ for (int j=0; !need_quotes && j < len; j++) {
+ if (p[j] == fmt->fs || p[j] == fmt->quote)
need_quotes = 1;
}
}
putchar_unlocked(fmt->fs);
if (need_quotes)
putchar_unlocked(fmt->quote);
- for (int j=0; j < f->len; j++) {
- int c = line[f->start_pos + j];
+ for (int j=0; j < len; j++) {
+ int c = p[j];
if (c == fmt->fs && !need_quotes)
warn(fmt, "Field separator found inside field and quoting is turned off.");
if (c == fmt->quote)
if (ws) {
if (!in_field->start_pos &&
!in_field->len &&
- !fmt->strict_ws)
+ fmt->sloppy)
in_field->start_pos = i;
else
new_field(i);
}
}
- if (ws && fmt->strict_ws)
+ if (ws && !fmt->sloppy)
new_field(n);
return 1;
}
int i = 0;
for (;;) {
int ovec[3];
- int sep = pcre_exec(fmt->pcre, fmt->pcre_extra, (char *) c, n, i, 0, ovec, 3);
- if (sep < 0) {
- if (sep != PCRE_ERROR_NOMATCH)
- warn(fmt, "PCRE matching error %d", sep);
+ int err = pcre_exec(fmt->pcre, fmt->pcre_extra, (char *) c, n, i, 0, ovec, 3);
+ if (err < 0) {
+ if (err != PCRE_ERROR_NOMATCH)
+ warn(fmt, "PCRE matching error %d", err);
// No further occurrence of the separator: the rest is a single field
+ if (!fmt->sloppy || i < n) {
+ new_field(i);
+ in_field->len = n - i;
+ }
+ return 1;
+ }
+ if (ovec[0] == ovec[1]) {
+ warn(fmt, "Regular expression matched an empty separator.");
new_field(i);
in_field->len = n - i;
return 1;
}
- new_field(i);
- in_field->len = ovec[0] - i;
+ if (!fmt->sloppy || ovec[0]) {
+ new_field(i);
+ in_field->len = ovec[0] - i;
+ }
i = ovec[1];
}
}
unsigned char *p = get_field(&out_fields, i, &len);
fw = field_chars(fields_nth(&out_fields, i));
if (fw > cw) {
- warn(fmt, "Internal error: Wrongly calculated column width (%d > %d)", fw, cw);
+ warn(fmt, "Internal error: Wrongly calculated width of column %d (%d > %d)", i, fw, cw);
cw = fw;
}
while (len--)
- putchar(*p++);
+ putchar_unlocked(*p++);
}
while (fw < cw) {
putchar_unlocked(' ');
for (int i = 0; i < intarray_count(&column_widths); i++) {
putchar_unlocked('+');
- int w = fmt->table_sep + *intarray_nth(&column_widths, i); // FIXME: Avoid the *
+ int w = fmt->table_sep + *intarray_nth(&column_widths, i);
while (w--)
- putchar('-');
+ putchar_unlocked('-');
}
putchar_unlocked('+');
putchar_unlocked('\n');
in_field->len = c;
while (c--) {
int x = getc_unlocked(tf);
- if (x < 0) {
- warn(fmt, "Truncated temporary file");
- return 0;
- }
+ if (x < 0)
+ die("Truncated temporary file");
*line_push(&in_line) = x;
}
}
FILE *tf = fmt->tmp_file;
for (int i = 0; i < fields_count(&out_fields); i++) {
- struct field *f = fields_nth(&out_fields, i);
- if (f->len < 0xfe)
- putc_unlocked(f->len, tf);
+ int len;
+ unsigned char *p = get_field(&out_fields, i, &len);
+
+ if (len < 0xfe)
+ putc_unlocked(len, tf);
else {
putc_unlocked(0xfe, tf);
- putc_unlocked((f->len >> 24) & 0xff, tf);
- putc_unlocked((f->len >> 16) & 0xff, tf);
- putc_unlocked((f->len >> 8) & 0xff, tf);
- putc_unlocked(f->len & 0xff, tf);
+ putc_unlocked((len >> 24) & 0xff, tf);
+ putc_unlocked((len >> 16) & 0xff, tf);
+ putc_unlocked((len >> 8) & 0xff, tf);
+ putc_unlocked(len & 0xff, tf);
}
- unsigned char *p = line_nth(&in_line, f->start_pos);
- for (int j = 0; j < f->len; j++)
+ while (len--)
putc_unlocked(*p++, tf);
}
putc_unlocked(0xff, tf);
}
}
+static void equalize_fields(void) // Pad out_fields with empty fields until it has as many entries as column_widths
+{
+ while (fields_count(&out_fields) < intarray_count(&column_widths)) { // NOTE(review): column_widths presumably holds one entry per column seen so far -- confirm against update_stats()
+ struct field *f = fields_push(&out_fields);
+ f->start_pos = f->len = 0; // zero length: the appended field carries no characters
+ }
+}
+
/*** Field names and headers ***/
struct field_names {
return;
}
+ int want_select_fields = 0;
if (out_format->set_field_names) {
struct field_names *fn = xmalloc_zero(sizeof(*fn));
out_format->field_names = fn;
add_field_names(fn, out_format->set_field_names);
- } else if (in_format->field_names)
+ } else if (in_format->field_names) {
out_format->field_names = in_format->field_names;
- else
+ want_select_fields = 1;
+ } else
die("Output header requested, but no field names specified");
line_reset(&in_line);
- fields_reset(&out_fields);
+ fields_reset(&in_fields);
struct field_names *fn = out_format->field_names;
for (int i = 0; i < stringarray_count(&fn->names); i++) {
- struct field *f = fields_push(&out_fields);
+ struct field *f = fields_push(&in_fields);
f->start_pos = line_count(&in_line);
f->len = 0;
char *s = *stringarray_nth(&fn->names, i);
}
}
+ fields_reset(&out_fields);
+ if (want_select_fields)
+ select_fields();
+ else
+ select_all_fields();
+
// This is tricky: when we are formatting a table, field names are normally
// calculated in pass 1, but the header is written in pass 2, so we have to
// update column statistics, because field name can be too wide to fit.
+ want_stats++;
update_stats();
+ want_stats--;
+ if (want_equalize)
+ equalize_fields();
write_grid(-1);
write_line();
write_grid(0);
if (in_format->field_names && (f = find_field_by_name(in_format->field_names, str)) > 0)
return f;
- die("Unknown field %s", str);
+ die("Unknown field `%s'", str);
}
static char *parse_selector(char *str)
else
select_all_fields();
- if (out_format->needs_stats)
- update_stats();
-
+ if (want_equalize && (pass & 2))
+ equalize_fields();
+ update_stats();
write_line();
}
out_format->read_line = tmp_read;
out_format->write_line = tmp_write;
out_format->tmp_file = tmpfile();
- out_format->needs_stats = final_format->needs_stats;
out_format->field_names = in_format->field_names;
one_pass(1);
rewind(in_format->tmp_file);
line_number = 0;
out_format = final_format;
- out_format->needs_stats = 0;
+ want_stats = 0;
one_pass(2);
fclose(in_format->tmp_file);
}
Usage: xsv <in-format> [<out-format>] <options> [<fields>]\n\
\n\
Formats:\n\
--t, --tsv TAB-separated values (default)\n\
+-t, --tsv Tab-separated values (default)\n\
-c, --csv Comma-separated values\n\
-w, --ws Values separated by arbitrary whitespace\n\
--W, --strict-ws Like --ws, but recognize empty columns at start/end\n\
-r, --regex=<rx> Separator given by Perl regular expression (input only)\n\
--table Format a table (output only)\n\
\n\
--always-quote Put quotes around all fields (CSV output only)\n\
--table-sep=<n> Separate table columns by <n> spaces (default: 2)\n\
--grid Separate table columns by grid lines\n\
+-s, --sloppy Ignore separators at the start/end of line (ws/regex only)\n\
\n\
Other options:\n\
--trim Trim leading and trailing whitespaces in fields\n\
+ --equalize Pad all lines to the maximum number of fields\n\
");
exit(0);
}
enum long_options {
OPT_HELP = 256,
+ OPT_VERSION, // --version: print the version string and exit
OPT_TRIM,
OPT_ALWAYS_QUOTE,
OPT_TABLE,
OPT_TABLE_SEP,
OPT_GRID,
+ OPT_EQUALIZE, // --equalize: sets want_equalize
};
static const struct option long_options[] = {
{ "always-quote", 0, NULL, OPT_ALWAYS_QUOTE },
{ "csv", 0, NULL, 'c' },
+ { "equalize", 0, NULL, OPT_EQUALIZE }, // global flag, handled by setting want_equalize
{ "fields", 1, NULL, 'f' },
{ "fs", 1, NULL, 'd' },
{ "grid", 0, NULL, OPT_GRID },
{ "header", 0, NULL, 'h' },
+ { "help", 0, NULL, OPT_HELP }, // moved up from the end so the table stays in alphabetical order
{ "quiet", 0, NULL, 'q' },
{ "regex", 1, NULL, 'r' },
- { "strict-ws", 0, NULL, 'W' },
+ { "sloppy", 0, NULL, 's' }, // replaces --strict-ws with the opposite polarity: strict parsing is now the default
{ "table", 0, NULL, OPT_TABLE },
{ "table-sep", 1, NULL, OPT_TABLE_SEP },
{ "trim", 0, NULL, OPT_TRIM },
{ "tsv", 0, NULL, 't' },
+ { "version", 0, NULL, OPT_VERSION }, // prints "This is xsv version ..." and exits
{ "ws", 0, NULL, 'w' },
- { "help", 0, NULL, OPT_HELP },
{ NULL, 0, NULL, 0 },
};
if (err)
bad_args("Error compiling regex: %s", err);
break;
+ case 's':
+ if (current_format()->id != FORM_WS && current_format()->id != FORM_REGEX)
+ bad_args("--sloppy makes sense only for --ws or --regex.");
+ current_format()->sloppy = 1;
+ break;
case 't':
set_format(FORM_TSV);
break;
case 'w':
set_format(FORM_WS);
break;
- case 'W':
- set_format(FORM_WS);
- current_format()->strict_ws = 1;
- break;
case OPT_ALWAYS_QUOTE:
if (current_format()->id != FORM_CSV)
- bad_args("--always-quote makes sense only for CSV.");
+ bad_args("--always-quote makes sense only for --csv.");
current_format()->always_quote = 1;
break;
case OPT_HELP:
usage();
+ case OPT_VERSION:
+ puts("This is xsv version " VERSION ".");
+ exit(0);
case OPT_TRIM:
want_trim = 1;
break;
case OPT_GRID:
current_format()->table_grid = 1;
break;
+ case OPT_EQUALIZE:
+ want_equalize = 1;
+ break;
default:
bad_args(NULL);
}
}
finish_parse_selectors();
- if (out_format->needs_stats)
+ want_stats = out_format->needs_stats | want_equalize;
+ if (want_stats)
two_pass();
else
one_pass(3);