[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[SCM] gawk branch, feature/bwk-csv, created. gawk-4.1.0-4896-g1ee8627c
From: |
Arnold Robbins |
Subject: |
[SCM] gawk branch, feature/bwk-csv, created. gawk-4.1.0-4896-g1ee8627c |
Date: |
Mon, 29 Aug 2022 08:00:00 -0400 (EDT) |
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "gawk".
The branch, feature/bwk-csv has been created
at 1ee8627c7bb42dad235c66e62050bf61f59cbb6e (commit)
- Log -----------------------------------------------------------------
http://git.sv.gnu.org/cgit/gawk.git/commit/?id=1ee8627c7bb42dad235c66e62050bf61f59cbb6e
commit 1ee8627c7bb42dad235c66e62050bf61f59cbb6e
Author: Arnold D. Robbins <arnold@skeeve.com>
Date: Mon Aug 29 14:59:29 2022 +0300
First cut at CSV support a la BWK awk.
diff --git a/ChangeLog b/ChangeLog
index 46286435..9090eaca 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,9 @@
+2022-08-29 Arnold D. Robbins <arnold@skeeve.com>
+
+ * field.c (comma_parse_field, set_comma_field): New functions.
+ (set_FS): If FS is "," and not posix mode, use CSV parsing.
+ Code follows what BWK's new code does.
+
2022-08-25 Arnold D. Robbins <arnold@skeeve.com>
* awkgram.y (yyerror): Exit at the end, to make syntax errors
diff --git a/field.c b/field.c
index 0232cd5a..755a3fd4 100644
--- a/field.c
+++ b/field.c
@@ -59,12 +59,15 @@ static long sc_parse_field(long, char **, int, NODE *,
Regexp *, Setfunc, NODE *, NODE *, bool);
static long fw_parse_field(long, char **, int, NODE *,
Regexp *, Setfunc, NODE *, NODE *, bool);
+static long comma_parse_field(long, char **, int, NODE *,
+ Regexp *, Setfunc, NODE *, NODE *, bool);
static const awk_fieldwidth_info_t *api_fw = NULL;
static long fpat_parse_field(long, char **, int, NODE *,
Regexp *, Setfunc, NODE *, NODE *, bool);
static void set_element(long num, char * str, long len, NODE *arr);
static void grow_fields_arr(long num);
static void set_field(long num, char *str, long len, NODE *dummy);
+static void set_comma_field(long num, char *str, long len, NODE *dummy);
static void purge_record(void);
static char *parse_extent; /* marks where to restart parse of record */
@@ -147,6 +150,26 @@ set_field(long num,
n->flags = (STRCUR|STRING|USER_INPUT); /* do not set MALLOC */
}
+/*
+ * set_comma_field --- set the value of a particular field, coming from CSV
+ *
+ * num is the field number, str/len the already-unquoted field text.
+ * Unlike set_field(), which leaves MALLOC clear, this routine stores a
+ * private, NUL-terminated copy of the text and sets MALLOC: the CSV
+ * parser hands us a scratch buffer that it reuses for the next field,
+ * so the field node cannot simply point at str.
+ *
+ * The copy is made directly into the field node.  Going through
+ * make_string() and then taking over its stptr would strand the
+ * temporary NODE that make_string() returns, once per field per record.
+ *
+ * NOTE(review): the copy is presumably released by whatever clears a
+ * field node that has MALLOC set --- confirm against the record reset code.
+ */
+
+/*ARGSUSED*/
+static void
+set_comma_field(long num,
+	char *str,
+	long len,
+	NODE *dummy ATTRIBUTE_UNUSED)	/* just to make interface same as set_element */
+{
+	NODE *n;
+	char *copy;
+
+	/* one extra byte so that the stored string is always NUL-terminated */
+	emalloc(copy, char *, (size_t) len + 1, "set_comma_field");
+	memcpy(copy, str, (size_t) len);
+	copy[len] = '\0';
+
+	if (num > nf_high_water)
+		grow_fields_arr(num);
+	n = fields_arr[num];
+	n->stptr = copy;
+	n->stlen = len;
+	n->flags = (STRCUR|STRING|USER_INPUT|MALLOC);
+}
+
/* rebuild_record --- Someone assigned a value to $(something).
Fix up $0 to be right */
@@ -740,6 +763,98 @@ sc_parse_field(long up_to, /* parse only up to this field
number */
return nf;
}
+/*
+ * comma_parse_field --- CSV parsing same as BWK awk.
+ *
+ * This is called both from get_field() and from do_split()
+ * via (*parse_field)(). This variation is for when FS is a comma,
+ * we do very basic CSV parsing, the same as BWK awk:
+ *
+ *	- fields are separated by commas;
+ *	- a field whose first character is a double quote runs up to the
+ *	  next double quote that is followed by a comma or that is the
+ *	  last character of the input; the enclosing quotes are dropped;
+ *	- inside such a field, "" stands for one double quote;
+ *	- a double quote anywhere else is an ordinary character;
+ *	- a quoted field with no closing quote runs to the end of the input.
+ *
+ * Each field is unquoted into a scratch buffer and handed to (*set)()
+ * together with its length.  The return value is the updated field
+ * count; on return *buf points to where parsing stopped.
+ *
+ * fs and sep_arr are accepted only to match the other parse routines;
+ * they are not used here.
+ */
+static long
+comma_parse_field(long up_to,	/* parse only up to this field number */
+	char **buf,	/* on input: string to parse; on output: point to start next */
+	int len,
+	NODE *fs ATTRIBUTE_UNUSED,
+	Regexp *rp ATTRIBUTE_UNUSED,
+	Setfunc set,	/* routine to set the value of the parsed field */
+	NODE *n,
+	NODE *sep_arr ATTRIBUTE_UNUSED,  /* array of field separators (maybe NULL) */
+	bool in_middle ATTRIBUTE_UNUSED)
+{
+	char *scan = *buf;
+	static const char comma = ',';
+	long nf = parse_high_water;
+	char *end = scan + len;
+
+	static char *newfield = NULL;	/* scratch buffer, kept between calls */
+	static size_t buflen = 0;	/* its current size in bytes */
+
+	/*
+	 * Unquoting never lengthens the text: every byte stored in newfield
+	 * uses up at least one byte of input.  So a buffer with room for the
+	 * whole input plus a terminating NUL can hold any single field, and
+	 * sizing it once here replaces a bounds check on every byte stored.
+	 * Nothing in the old buffer is needed, so it is simply replaced.
+	 */
+	if (newfield == NULL || (size_t) len >= buflen) {
+		size_t newlen = (buflen > 0 ? buflen : BUFSIZ);
+		char *bigger;
+
+		while (newlen <= (size_t) len)
+			newlen *= 2;
+		emalloc(bigger, char *, newlen, "comma_parse_field");
+		free(newfield);		/* free(NULL) is harmless on the first call */
+		newfield = bigger;
+		buflen = newlen;
+	}
+
+	if (set == set_field)	// not an array element
+		set = set_comma_field;	// fields must own a copy of the scratch buffer
+
+	if (up_to == UNLIMITED)
+		nf = 0;
+
+	if (len == 0) {
+		/* empty input still yields one (empty) field */
+		newfield[0] = '\0';
+		(*set)(++nf, newfield, 0L, n);
+		return nf;
+	}
+
+	while (nf < up_to) {
+		char *new_end = newfield;	/* next free byte in the scratch buffer */
+
+		if (scan < end && *scan == '"') {
+			/* quoted field */
+			scan++;		/* step over the opening quote */
+			while (scan < end) {
+				/* never look at scan[1] when scan is the last byte */
+				bool at_last = (scan + 1 == end);
+
+				if (*scan != '"') {
+					*new_end++ = *scan++;
+				} else if (! at_last && scan[1] == '"') {
+					/* "" -> " */
+					*new_end++ = '"';
+					scan += 2;
+				} else if (at_last || scan[1] == comma) {
+					/* close of quoted string */
+					scan++;
+					break;
+				} else {
+					/* lone quote in mid-field: keep it as data */
+					*new_end++ = *scan++;
+				}
+			}
+		} else {
+			/* unquoted field: everything up to the next comma */
+			while (scan < end && *scan != comma)
+				*new_end++ = *scan++;
+		}
+
+		/* the length is passed explicitly; the NUL is a courtesy */
+		*new_end = '\0';
+		(*set)(++nf, newfield, (long)(new_end - newfield), n);
+
+		if (scan == end)
+			break;
+
+		if (scan == *buf) {
+			/*
+			 * Nothing was consumed, so the first field parsed by
+			 * this call was empty and scan sits on its comma.
+			 * Step over it; if that was the last character, the
+			 * next iteration sets the final empty field.
+			 */
+			scan++;
+			continue;
+		}
+
+		scan++;		/* step over the comma that ended the field */
+		if (scan == end) {	/* FS at end of record */
+			newfield[0] = '\0';
+			(*set)(++nf, newfield, 0L, n);
+			break;
+		}
+	}
+
+	*buf = scan;
+	return nf;
+}
+
/*
* calc_mbslen --- calculate the length in bytes of a multi-byte string
* containing len characters.
@@ -1309,7 +1424,8 @@ set_FS()
save_rs = dupnode(RS_node->var_value);
resave_fs = true;
- /* If FS_re_no_case assignment is fatal (make_regexp in remake_re)
+ /*
+ * If FS_re_no_case assignment is fatal (make_regexp in remake_re)
* FS_regexp will be NULL with a non-null FS_re_yes_case.
* refree() handles null argument; no need for `if (FS_regexp != NULL)'
below.
* Please do not remerge.
@@ -1363,6 +1479,8 @@ choose_fs_function:
else if (fs->stptr[0] == '\\')
/* same special case */
strcpy(buf, "[\\\\]");
+ else if (fs->stptr[0] == ',' && ! do_posix)
+ set_parser(comma_parse_field);
else
set_parser(sc_parse_field);
}
diff --git a/pc/ChangeLog b/pc/ChangeLog
index 9b09d8e9..f5d09d28 100644
--- a/pc/ChangeLog
+++ b/pc/ChangeLog
@@ -1,3 +1,7 @@
+2022-08-29 Arnold D. Robbins <arnold@skeeve.com>
+
+ * Makefile.tst: Regenerated.
+
2022-08-25 Arnold D. Robbins <arnold@skeeve.com>
* Makefile.tst: Regenerated.
diff --git a/pc/Makefile.tst b/pc/Makefile.tst
index e8e75e0f..1ffd3f0d 100644
--- a/pc/Makefile.tst
+++ b/pc/Makefile.tst
@@ -188,7 +188,7 @@ GAWK_EXT_TESTS = \
aadelete1 aadelete2 aarray1 aasort aasorti argtest arraysort \
arraysort2 arraytype asortbool backw badargs beginfile1 beginfile2 \
binmode1 charasbytes clos1way clos1way2 clos1way3 clos1way4 \
- clos1way5 clos1way6 colonwarn commas crlf dbugeval dbugeval2 \
+ clos1way5 clos1way6 colonwarn commas crlf csv1 dbugeval dbugeval2 \
dbugeval3 dbugtypedre1 dbugtypedre2 delsub devfd devfd1 devfd2 \
dfacheck1 dumpvars errno exit fieldwdth forcenum fpat1 fpat2 \
fpat3 fpat4 fpat5 fpat6 fpat7 fpat8 fpat9 fpatnull fsfwfs functab1 \
@@ -2704,6 +2704,11 @@ crlf:
@-AWKPATH="$(srcdir)" $(AWK) -f $@.awk >_$@ 2>&1 || echo EXIT CODE:
$$? >>_$@
@-$(CMP) "$(srcdir)"/$@.ok _$@ && rm -f _$@
+csv1:
+ @echo $@
+ @-AWKPATH="$(srcdir)" $(AWK) -f $@.awk < "$(srcdir)"/$@.in >_$@ 2>&1
|| echo EXIT CODE: $$? >>_$@
+ @-$(CMP) "$(srcdir)"/$@.ok _$@ && rm -f _$@
+
dbugeval2:
@echo $@
@-AWKPATH="$(srcdir)" $(AWK) -f $@.awk --debug < "$(srcdir)"/$@.in
>_$@ 2>&1 || echo EXIT CODE: $$? >>_$@
diff --git a/test/ChangeLog b/test/ChangeLog
index 3a2691ed..c8f37a30 100644
--- a/test/ChangeLog
+++ b/test/ChangeLog
@@ -1,3 +1,8 @@
+2022-08-29 Arnold D. Robbins <arnold@skeeve.com>
+
+ * Makefile.am (EXTRA_DIST): csv1: New test.
+ * csv1.awk, csv1.in, csv1.ok: New files.
+
2022-08-25 Arnold D. Robbins <arnold@skeeve.com>
* Makefile.am (EXTRA_DIST): nsbad2, nsbad3: new tests.
diff --git a/test/Makefile.am b/test/Makefile.am
index 962885b5..36068728 100644
--- a/test/Makefile.am
+++ b/test/Makefile.am
@@ -223,6 +223,9 @@ EXTRA_DIST = \
convfmt.ok \
crlf.awk \
crlf.ok \
+ csv1.awk \
+ csv1.in \
+ csv1.ok \
datanonl.awk \
datanonl.in \
datanonl.ok \
@@ -1478,7 +1481,7 @@ GAWK_EXT_TESTS = \
aadelete1 aadelete2 aarray1 aasort aasorti argtest arraysort \
arraysort2 arraytype asortbool backw badargs beginfile1 beginfile2 \
binmode1 charasbytes clos1way clos1way2 clos1way3 clos1way4 \
- clos1way5 clos1way6 colonwarn commas crlf dbugeval dbugeval2 \
+ clos1way5 clos1way6 colonwarn commas crlf csv1 dbugeval dbugeval2 \
dbugeval3 dbugtypedre1 dbugtypedre2 delsub devfd devfd1 devfd2 \
dfacheck1 dumpvars errno exit fieldwdth forcenum fpat1 fpat2 \
fpat3 fpat4 fpat5 fpat6 fpat7 fpat8 fpat9 fpatnull fsfwfs functab1 \
diff --git a/test/Makefile.in b/test/Makefile.in
index f7ac3cfa..a2057e72 100644
--- a/test/Makefile.in
+++ b/test/Makefile.in
@@ -491,6 +491,9 @@ EXTRA_DIST = \
convfmt.ok \
crlf.awk \
crlf.ok \
+ csv1.awk \
+ csv1.in \
+ csv1.ok \
datanonl.awk \
datanonl.in \
datanonl.ok \
@@ -1746,7 +1749,7 @@ GAWK_EXT_TESTS = \
aadelete1 aadelete2 aarray1 aasort aasorti argtest arraysort \
arraysort2 arraytype asortbool backw badargs beginfile1 beginfile2 \
binmode1 charasbytes clos1way clos1way2 clos1way3 clos1way4 \
- clos1way5 clos1way6 colonwarn commas crlf dbugeval dbugeval2 \
+ clos1way5 clos1way6 colonwarn commas crlf csv1 dbugeval dbugeval2 \
dbugeval3 dbugtypedre1 dbugtypedre2 delsub devfd devfd1 devfd2 \
dfacheck1 dumpvars errno exit fieldwdth forcenum fpat1 fpat2 \
fpat3 fpat4 fpat5 fpat6 fpat7 fpat8 fpat9 fpatnull fsfwfs functab1 \
@@ -4445,6 +4448,11 @@ crlf:
@-AWKPATH="$(srcdir)" $(AWK) -f $@.awk >_$@ 2>&1 || echo EXIT CODE:
$$? >>_$@
@-$(CMP) "$(srcdir)"/$@.ok _$@ && rm -f _$@
+csv1:
+ @echo $@
+ @-AWKPATH="$(srcdir)" $(AWK) -f $@.awk < "$(srcdir)"/$@.in >_$@ 2>&1
|| echo EXIT CODE: $$? >>_$@
+ @-$(CMP) "$(srcdir)"/$@.ok _$@ && rm -f _$@
+
dbugeval2:
@echo $@
@-AWKPATH="$(srcdir)" $(AWK) -f $@.awk --debug < "$(srcdir)"/$@.in
>_$@ 2>&1 || echo EXIT CODE: $$? >>_$@
diff --git a/test/Maketests b/test/Maketests
index 8b88ed83..a4d685a7 100644
--- a/test/Maketests
+++ b/test/Maketests
@@ -1402,6 +1402,11 @@ crlf:
@-AWKPATH="$(srcdir)" $(AWK) -f $@.awk >_$@ 2>&1 || echo EXIT CODE:
$$? >>_$@
@-$(CMP) "$(srcdir)"/$@.ok _$@ && rm -f _$@
+csv1:
+ @echo $@
+ @-AWKPATH="$(srcdir)" $(AWK) -f $@.awk < "$(srcdir)"/$@.in >_$@ 2>&1
|| echo EXIT CODE: $$? >>_$@
+ @-$(CMP) "$(srcdir)"/$@.ok _$@ && rm -f _$@
+
dbugeval2:
@echo $@
@-AWKPATH="$(srcdir)" $(AWK) -f $@.awk --debug < "$(srcdir)"/$@.in
>_$@ 2>&1 || echo EXIT CODE: $$? >>_$@
diff --git a/test/csv1.awk b/test/csv1.awk
new file mode 100644
index 00000000..12bbf1e5
--- /dev/null
+++ b/test/csv1.awk
@@ -0,0 +1,10 @@
+BEGIN {
+ FS = ","	# a lone comma as FS selects gawk's CSV field parser (unless in POSIX mode)
+}
+
+{
+ printf(" \t%s\t", $0)	# first the record exactly as read ...
+ for (i = 1; i <= NF; i++)
+ printf("[%s]", $i)	# ... then each parsed field in brackets, so empty fields show up
+ print ""	# end the output line
+}
diff --git a/test/csv1.in b/test/csv1.in
new file mode 100644
index 00000000..620b2ab3
--- /dev/null
+++ b/test/csv1.in
@@ -0,0 +1,26 @@
+a
+ a
+,a
+ , a
+a,b
+a,b,c
+""
+"abc"
+"a""b"
+"a","b"
+a""b
+"a,b"
+""""
+""""""
+"""x"""
+,,""
+a""b
+a"b
+a''b
+"abc
+,,
+a,
+"",
+,
+"abc",def
+
diff --git a/test/csv1.ok b/test/csv1.ok
new file mode 100644
index 00000000..178daed1
--- /dev/null
+++ b/test/csv1.ok
@@ -0,0 +1,26 @@
+ a [a]
+ a [ a]
+ ,a [][a]
+ , a [ ][ a]
+ a,b [a][b]
+ a,b,c [a][b][c]
+ "" []
+ "abc" [abc]
+ "a""b" [a"b]
+ "a","b" [a][b]
+ a""b [a""b]
+ "a,b" [a,b]
+ """" ["]
+ """""" [""]
+ """x""" ["x"]
+ ,,"" [][][]
+ a""b [a""b]
+ a"b [a"b]
+ a''b [a''b]
+ "abc [abc]
+ ,, [][][]
+ a, [a][]
+ "", [][]
+ , [][]
+ "abc",def [abc][def]
+ []
-----------------------------------------------------------------------
hooks/post-receive
--
gawk
[Prev in Thread] |
Current Thread |
[Next in Thread] |
- [SCM] gawk branch, feature/bwk-csv, created. gawk-4.1.0-4896-g1ee8627c,
Arnold Robbins <=