Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

csv quote-pending bug fix #89

Merged
merged 3 commits into from
Dec 9, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion app/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -333,7 +333,6 @@ ${JQ_SRC}: ${JQ_TARBALL}
mv $@-tmp/jq-1.6 $@
rm -rf $@-tmp


lib-jq: ${JQ_LIB}
@echo "Using jq library ${JQ_LIB}"

Expand Down
15 changes: 13 additions & 2 deletions app/test/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -232,15 +232,26 @@ test-2tsv test-sql test-flatten test-pretty : test-%: ${BUILD_DIR}/bin/zsv_%${EX
${CMP} ${TMP_DIR}/[email protected] expected/[email protected] && ${TEST_PASS} || ${TEST_FAIL})
# @if [ "$@" = "test-2tsv" ] ; then echo "TO DO: update 2tsv to output Excel format-- see data/Excel.tsv"; fi

test-serialize : test-%: ${BUILD_DIR}/bin/zsv_%${EXE}
${THIS_MAKEFILE_DIR}/../../data/quoted5.csv: ${THIS_MAKEFILE_DIR}/../../data/quoted5.csv.bz2
bzip2 -d -c $< > $@

test-serialize-quoted: ${BUILD_DIR}/bin/zsv_serialize${EXE} ${THIS_MAKEFILE_DIR}/../../data/quoted5.csv
@${TEST_INIT}
@rm -f ${TMP_DIR}/[email protected]
@(${PREFIX} $< ${THIS_MAKEFILE_DIR}/../../data/quoted5.csv | grep xxxxxx | grep Produktinfo ${REDIRECT1} ${TMP_DIR}/[email protected] && \
${CMP} ${TMP_DIR}/[email protected] expected/[email protected] && ${TEST_PASS} || ${TEST_FAIL})

test-serialize : test-%: ${BUILD_DIR}/bin/zsv_%${EXE} test-serialize-quoted
@${TEST_INIT}
@( ( ! [ -s "${TEST_DATA_DIR}/test/$*.csv" ] ) && echo "No test input for $*") || \
(${PREFIX} $< < ${TEST_DATA_DIR}/test/$*.csv ${REDIRECT1} ${TMP_DIR}/[email protected] && \
${CMP} ${TMP_DIR}/[email protected] expected/[email protected] && ${TEST_PASS} || ${TEST_FAIL})

(${PREFIX} $< -p < ${TEST_DATA_DIR}/test/$*.csv ${REDIRECT1} ${TMP_DIR}/[email protected] && \
@(${PREFIX} $< -p < ${TEST_DATA_DIR}/test/$*.csv ${REDIRECT1} ${TMP_DIR}/[email protected] && \
${CMP} ${TMP_DIR}/[email protected] expected/[email protected] && ${TEST_PASS} || ${TEST_FAIL})

@(${PREFIX} $< -p < ${TEST_DATA_DIR}/test/$*.csv ${REDIRECT1} ${TMP_DIR}/[email protected] && \
${CMP} ${TMP_DIR}/[email protected] expected/[email protected] && ${TEST_PASS} || ${TEST_FAIL})

test-sql: test-sql2 test-sql3 test-sql4
test-sql2: ${BUILD_DIR}/bin/zsv_sql${EXE}
Expand Down
1 change: 1 addition & 0 deletions app/test/expected/test-serialize-quoted.out
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
xxxxxxyy,Produktinfo,"4__________________________________________________________________a________=""____-____________________________________-_____,""____________=""____-____________________________-_____,""____________=""____-____________________________-_____,""____________=""____-_______________________b____________-_____,""_______________________-_________________/______/______/______/______/________/_________________=""____-_______________________c____________-_____,""____________=""____-____________________________-_____,""____________=""____-____________________________-_____,""____________=""____-_________________________d__________-_____,""____________,_____,__________________,_______________,__________,_______________,___.__/______/______/______/______/_________________=""____-________________________e___________-_____,""____________=""____-____________________________-_____,""____________=""____-____________________________-_____,""____________=""____-_________________________f__________-_____,""_________________,_____________________________,____________/______/______/______/________________=""____""____________=""____-_________________________g__________-_____,""____________=""____-____________________________-_____,""____________=""____-____________________________-_____,""____________=""____-_________________________h__________-_____,""___________________________________________________,________________%___________________%____________/______/______/______/______/________________=""____-_________________________i__________-_____,""____________=""____-____________________________-_____,""____________=""____-____________________________-_____,""____________=""____-_________________________j__________-_____,""__________-_______________________________________.____/__/______/______/______/______/______________=""____-_________________________k__________-_____,""____________=""____-________________________l___________-_____,""____________=""____-____________________________-_____,""____________=""____-____________________________-_____,""____________=""____-_________________________m__________-_____,""__/______/______/______/______/__________________________=""____-____________________________________-_____,""_______________________/______/___________________=""____-____________________________________-_____,""___________________/______/___________________=""____-____________________________________-_____,""__________________/______(_________________,_______)_/___________________=""____-____________________________________-_____,""______________,____(_______________,_)____/__/______/____________________________,___________,_/_______________________,________________________=""____-____________________________________-_____,""____________=""____-____________________________-_____,""____________=""____-____________________________-_____,""____________=""____-____________________________________-_____,""________________=""____-_______-_____,""__/______/_____/______/______/______/______/_____/__________________=""____-____________________________________-_____,""____________=""____-____________________________-_____,""____________=""____-____________________________-_____,""____________=""____-____________________________________-_____,""____________=""____-____________________________________-_____,""__________,__________,__________________________-_________._/______/______/______/______/______/_________________=""____-____________________________________-_____,""____________=""____-____________________________-_____,""____________=""____-____________________________-_____,""____________=""____-____________________________________-_____,""____________=""____-____________________________________-_____,""________!_____________,__________________________._/______/______/______/______/______/____/______/______/______/______/______/______/______/______/____"
Binary file added data/quoted5.csv.bz2
Binary file not shown.
1 change: 1 addition & 0 deletions include/zsv/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ struct zsv_cell {
# define ZSV_PARSER_QUOTE_CLOSED 2 /* value was quoted */
# define ZSV_PARSER_QUOTE_NEEDED 4 /* value contains delimiter or dbl-quote */
# define ZSV_PARSER_QUOTE_EMBEDDED 8 /* value contains dbl-quote */
# define ZSV_PARSER_QUOTE_PENDING 16 /* only used internally by parser */
/**
* quoted flags enable additional efficiency, in particular when input data will
* be output as text (csv, json etc), by indicating whether the cell contents may
Expand Down
6 changes: 1 addition & 5 deletions src/zsv.c
Original file line number Diff line number Diff line change
Expand Up @@ -41,10 +41,7 @@ inline static size_t scanner_pre_parse(struct zsv_scanner *scanner) {
scanner->last = scanner->buff.buff[scanner->old_bytes_read-1];
if(scanner->row_start < scanner->old_bytes_read) {
size_t len = scanner->old_bytes_read - scanner->row_start;
if(len < scanner->row_start)
memcpy(scanner->buff.buff, scanner->buff.buff + scanner->row_start, len);
else
memmove(scanner->buff.buff, scanner->buff.buff + scanner->row_start, len);
memmove(scanner->buff.buff, scanner->buff.buff + scanner->row_start, len);
scanner->partial_row_length = len;
} else {
scanner->cell_start = 0;
Expand Down Expand Up @@ -79,7 +76,6 @@ inline static size_t scanner_pre_parse(struct zsv_scanner *scanner) {
return capacity;
}


/**
* Read the next chunk of data from our input stream and parse it, calling our
* custom handlers as each cell and row are parsed
Expand Down
35 changes: 16 additions & 19 deletions src/zsv_internal.c
Original file line number Diff line number Diff line change
Expand Up @@ -526,26 +526,23 @@ static void collate_header_row(void *ctx) {
if(!scanner->opts.header_span) {
// finished with header; combine all rows into a single row
set_callbacks(scanner);
// if(VERY_LIKELY(scanner->opts.row_handler != NULL || scanner->opts.cell_handler != NULL
// || scanner->mode == ZSV_MODE_DELIM_PULL)) {
if(scanner->collate_header) {
size_t offset = 0;
for(size_t i = 0; i < scanner->collate_header->column_count; i++) {
size_t len_plus1 = scanner->collate_header->lengths[i];
scanner->row.cells[i].str = scanner->collate_header->buff.buff + offset;
if(len_plus1) {
scanner->row.cells[i].len = len_plus1 - 1;
scanner->row.cells[i].quoted = 1;
} else
scanner->row.cells[i].len = 0;
offset += len_plus1;
}
if(scanner->collate_header) {
size_t offset = 0;
for(size_t i = 0; i < scanner->collate_header->column_count; i++) {
size_t len_plus1 = scanner->collate_header->lengths[i];
scanner->row.cells[i].str = scanner->collate_header->buff.buff + offset;
if(len_plus1) {
scanner->row.cells[i].len = len_plus1 - 1;
scanner->row.cells[i].quoted = 1;
} else
scanner->row.cells[i].len = 0;
offset += len_plus1;
}
}

apply_callbacks(scanner);
if(scanner->mode != ZSV_MODE_DELIM_PULL)
collate_header_destroy(&scanner->collate_header);
// }
apply_callbacks(scanner);
if(scanner->mode != ZSV_MODE_DELIM_PULL)
collate_header_destroy(&scanner->collate_header);
}
}

Expand Down Expand Up @@ -580,7 +577,7 @@ static void zsv_throwaway_row(void *ctx) {

static int zsv_scanner_init(struct zsv_scanner *scanner,
struct zsv_opts *opts) {
size_t need_buff_size = 0; // opts->buffsize
size_t need_buff_size = 0;
if(opts->buffsize < opts->max_row_size * 2)
need_buff_size = opts->max_row_size * 2;
opts->delimiter = opts->delimiter ? opts->delimiter : ',';
Expand Down
19 changes: 11 additions & 8 deletions src/zsv_scan_delim.c
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,6 @@ static enum zsv_status ZSV_SCAN_DELIM(struct zsv_scanner *scanner,
skip_next_delim = 0;
bytes_chunk_end = bytes_read >= sizeof(zsv_uc_vector) ? bytes_read - sizeof(zsv_uc_vector) + 1 : 0;
delimiter = scanner->opts.delimiter;

scanner->partial_row_length = 0;

// to do: move into one-time execution code?
Expand All @@ -82,12 +81,15 @@ static enum zsv_status ZSV_SCAN_DELIM(struct zsv_scanner *scanner,
memset(&v.nl, '\n', sizeof(zsv_uc_vector)); // ascii code 10
memset(&v.cr, '\r', sizeof(zsv_uc_vector)); // ascii code 13
memset(&v.qt, scanner->opts.no_quotes > 0 ? 0 : '"', sizeof(v.qt));
// case "hel"|"o": check if we have an embedded dbl-quote past the initial opening quote, which was
// split between the last buffer and this one e.g. "hel""o" where the last buffer ended
// with "hel" and this one starts with "o"
if((scanner->quoted & ZSV_PARSER_QUOTE_UNCLOSED)
&& i > scanner->cell_start + 1 // case "|hello": need the + 1 in case split after first char of quoted value e.g. "hello" => " and hello"
&& scanner->last == quote) {

if(scanner->quoted & ZSV_PARSER_QUOTE_PENDING) {
// if we're here, then the last chunk we read ended with a lone quote char inside
// a quoted cell, and we are waiting to find out whether it is followed by
// another dbl-quote e.g. if the end of the last chunk is |, we had:
// ...,"hel"|"o"
// ...,"hel"|,...
// ...,"hel"|p,...
scanner->quoted -= ZSV_PARSER_QUOTE_PENDING;
if(buff[i] != quote) {
scanner->quoted |= ZSV_PARSER_QUOTE_CLOSED;
scanner->quoted -= ZSV_PARSER_QUOTE_UNCLOSED;
Expand Down Expand Up @@ -229,7 +231,8 @@ static enum zsv_status ZSV_SCAN_DELIM(struct zsv_scanner *scanner,
scanner->quoted |= ZSV_PARSER_QUOTE_EMBEDDED;
skip_next_delim = 1;
}
}
} else // we are at the end of this input chunk
scanner->quoted |= ZSV_PARSER_QUOTE_PENDING;
} else {
// cell_length > 0 and cell did not start w quote, so
// we have a quote in middle of an unquoted cell
Expand Down