X-Git-Url: http://mj.ucw.cz/gitweb/?a=blobdiff_plain;f=xsv.c;h=4615d708dadc869542510462f0f8b4d0b660c9d3;hb=24c52bb479bd520afb6618b3adefbc425ab93522;hp=8fd72779d23e5bd11ff80711e2453adc44e31853;hpb=60cd9ac013a349e45a50c0b2d17b0b30931d19c2;p=xsv.git

diff --git a/xsv.c b/xsv.c
index 8fd7277..4615d70 100644
--- a/xsv.c
+++ b/xsv.c
@@ -24,6 +24,9 @@
 #define UNUSED
 #endif
 
+static void select_fields(void);
+static void select_all_fields(void);
+
 /*** General functions ***/
 
 static void NONRET die(char *msg, ...)
@@ -99,6 +102,7 @@ struct format {
 	int fs;
 	int quote;
 	int quiet;
+	int sloppy;
 	int (*read_line)(struct format *fmt);
 	void (*write_line)(struct format *fmt);
 	void (*write_grid)(struct format *fmt, int pos);	// -1=above, 1=below, 0=after header
@@ -112,9 +116,6 @@ struct format {
 	// CSV backend:
 	int always_quote;
 
-	// WS backend:
-	int strict_ws;
-
 	// regex backend:
 	pcre *pcre;
 	pcre_extra *pcre_extra;
@@ -359,7 +360,7 @@ static int ws_read(struct format *fmt)
 			if (ws) {
 				if (!in_field->start_pos &&
 				    !in_field->len &&
-				    !fmt->strict_ws)
+				    fmt->sloppy)
 					in_field->start_pos = i;
 				else
 					new_field(i);
@@ -369,7 +370,7 @@ static int ws_read(struct format *fmt)
 		}
 	}
 
-	if (ws && fmt->strict_ws)
+	if (ws && !fmt->sloppy)
 		new_field(n);
 	return 1;
 }
@@ -404,17 +405,21 @@ static int regex_read(struct format *fmt)
 	int i = 0;
 	for (;;) {
 		int ovec[3];
-		int sep = pcre_exec(fmt->pcre, fmt->pcre_extra, (char *) c, n, i, 0, ovec, 3);
-		if (sep < 0) {
-			if (sep != PCRE_ERROR_NOMATCH)
-				warn(fmt, "PCRE matching error %d", sep);
+		int err = pcre_exec(fmt->pcre, fmt->pcre_extra, (char *) c, n, i, 0, ovec, 3);
+		if (err < 0) {
+			if (err != PCRE_ERROR_NOMATCH)
+				warn(fmt, "PCRE matching error %d", err);
 			// No further occurrence of the separator: the rest is a single field
-			new_field(i);
-			in_field->len = n - i;
+			if (!fmt->sloppy || i < n) {
+				new_field(i);
+				in_field->len = n - i;
+			}
 			return 1;
 		}
-		new_field(i);
-		in_field->len = ovec[0] - i;
+		if (!fmt->sloppy || ovec[0]) {
+			new_field(i);
+			in_field->len = ovec[0] - i;
+		}
 		i = ovec[1];
 	}
 }
@@ -609,20 +614,22 @@ static void write_header(void)
 		return;
 	}
 
+	int want_select_fields = 0;
 	if (out_format->set_field_names) {
 		struct field_names *fn = xmalloc_zero(sizeof(*fn));
 		out_format->field_names = fn;
 		add_field_names(fn, out_format->set_field_names);
-	} else if (in_format->field_names)
+	} else if (in_format->field_names) {
 		out_format->field_names = in_format->field_names;
-	else
+		want_select_fields = 1;
+	} else
 		die("Output header requested, but no field names specified");
 
 	line_reset(&in_line);
-	fields_reset(&out_fields);
+	fields_reset(&in_fields);
 	struct field_names *fn = out_format->field_names;
 	for (int i = 0; i < stringarray_count(&fn->names); i++) {
-		struct field *f = fields_push(&out_fields);
+		struct field *f = fields_push(&in_fields);
 		f->start_pos = line_count(&in_line);
 		f->len = 0;
 		char *s = *stringarray_nth(&fn->names, i);
@@ -632,6 +639,11 @@ static void write_header(void)
 		}
 	}
 
+	if (want_select_fields)
+		select_fields();
+	else
+		select_all_fields();
+
 	// This is tricky: when we are formatting a table, field names are normally
 	// calculated in pass 1, but the header is written in pass 2, so we have to
 	// update column statistics, because field name can be too wide to fit.
@@ -694,7 +706,7 @@ static int parse_field(char *str)
 	if (in_format->field_names && (f = find_field_by_name(in_format->field_names, str)) > 0)
 		return f;
 
-	die("Unknown field %s", str);
+	die("Unknown field `%s'", str);
 }
 
 static char *parse_selector(char *str)
@@ -814,7 +826,6 @@ Formats:\n\
 -t, --tsv		TAB-separated values (default)\n\
 -c, --csv		Comma-separated values\n\
 -w, --ws		Values separated by arbitrary whitespace\n\
--W, --strict-ws		Like --ws, but recognize empty columns at start/end\n\
 -r, --regex=<rx>	Separator given by Perl regular expression (input only)\n\
     --table		Format a table (output only)\n\
 \n\
@@ -826,6 +837,7 @@ Format parameters:\n\
     --always-quote	Put quotes around all fields (CSV output only)\n\
     --table-sep=<n>	Separate table columns by <n> spaces (default: 2)\n\
     --grid		Separate table columns by grid lines\n\
+-s, --sloppy		Ignore separators at the start/end of line (ws/regex only)\n\
 \n\
 Other options:\n\
     --trim		Trim leading and trailing whitespaces in fields\n\
@@ -870,7 +882,7 @@ static const struct option long_options[] = {
 	{ "header",		0,	NULL,	'h' },
 	{ "quiet",		0,	NULL,	'q' },
 	{ "regex",		1,	NULL,	'r' },
-	{ "strict-ws",		0,	NULL,	'W' },
+	{ "sloppy",		0,	NULL,	's' },
 	{ "table",		0,	NULL,	OPT_TABLE },
 	{ "table-sep",		1,	NULL,	OPT_TABLE_SEP },
 	{ "trim",		0,	NULL,	OPT_TRIM },
@@ -964,19 +976,20 @@ int main(int argc, char **argv)
 				if (err)
 					bad_args("Error compiling regex: %s", err);
 				break;
+			case 's':
+				if (current_format()->id != FORM_WS && current_format()->id != FORM_REGEX)
+					bad_args("--sloppy makes sense only for --ws or --regex.");
+				current_format()->sloppy = 1;
+				break;
 			case 't':
 				set_format(FORM_TSV);
 				break;
 			case 'w':
 				set_format(FORM_WS);
 				break;
-			case 'W':
-				set_format(FORM_WS);
-				current_format()->strict_ws = 1;
-				break;
 			case OPT_ALWAYS_QUOTE:
 				if (current_format()->id != FORM_CSV)
-					bad_args("--always-quote makes sense only for CSV.");
+					bad_args("--always-quote makes sense only for --csv.");
 				current_format()->always_quote = 1;
 				break;
 			case OPT_HELP: