diff --git a/app/Makefile b/app/Makefile index e540a31d..507474bf 100755 --- a/app/Makefile +++ b/app/Makefile @@ -333,7 +333,6 @@ ${JQ_SRC}: ${JQ_TARBALL} mv $@-tmp/jq-1.6 $@ rm -rf $@-tmp - lib-jq: ${JQ_LIB} @echo "Using jq library ${JQ_LIB}" diff --git a/app/test/Makefile b/app/test/Makefile index da4676ec..c675739d 100644 --- a/app/test/Makefile +++ b/app/test/Makefile @@ -232,15 +232,26 @@ test-2tsv test-sql test-flatten test-pretty : test-%: ${BUILD_DIR}/bin/zsv_%${EX ${CMP} ${TMP_DIR}/$@.out expected/$@.out && ${TEST_PASS} || ${TEST_FAIL}) # @if [ "$@" = "test-2tsv" ] ; then echo "TO DO: update 2tsv to output Excel format-- see data/Excel.tsv"; fi -test-serialize : test-%: ${BUILD_DIR}/bin/zsv_%${EXE} +${THIS_MAKEFILE_DIR}/../../data/quoted5.csv: ${THIS_MAKEFILE_DIR}/../../data/quoted5.csv.bz2 + bzip2 -d -c $< > $@ + +test-serialize-quoted: ${BUILD_DIR}/bin/zsv_serialize${EXE} ${THIS_MAKEFILE_DIR}/../../data/quoted5.csv + @${TEST_INIT} + @rm -f ${TMP_DIR}/$@.out + @(${PREFIX} $< ${THIS_MAKEFILE_DIR}/../../data/quoted5.csv | grep xxxxxx | grep Produktinfo ${REDIRECT1} ${TMP_DIR}/$@.out && \ + ${CMP} ${TMP_DIR}/$@.out expected/$@.out && ${TEST_PASS} || ${TEST_FAIL}) + +test-serialize : test-%: ${BUILD_DIR}/bin/zsv_%${EXE} test-serialize-quoted @${TEST_INIT} @( ( ! [ -s "${TEST_DATA_DIR}/test/$*.csv" ] ) && echo "No test input for $*") || \ (${PREFIX} $< < ${TEST_DATA_DIR}/test/$*.csv ${REDIRECT1} ${TMP_DIR}/$@.out && \ ${CMP} ${TMP_DIR}/$@.out expected/$@.out && ${TEST_PASS} || ${TEST_FAIL}) - (${PREFIX} $< -p < ${TEST_DATA_DIR}/test/$*.csv ${REDIRECT1} ${TMP_DIR}/$@-2.out && \ + @(${PREFIX} $< -p < ${TEST_DATA_DIR}/test/$*.csv ${REDIRECT1} ${TMP_DIR}/$@-2.out && \ ${CMP} ${TMP_DIR}/$@-2.out expected/$@-2.out && ${TEST_PASS} || ${TEST_FAIL}) + @(${PREFIX} $< -p < ${TEST_DATA_DIR}/test/$*.csv ${REDIRECT1} ${TMP_DIR}/$@-2.out && \ + ${CMP} ${TMP_DIR}/$@-2.out expected/$@-2.out && ${TEST_PASS} || ${TEST_FAIL}) test-sql: test-sql2 test-sql3 test-sql4 test-sql2: ${BUILD_DIR}/bin/zsv_sql${EXE} diff --git a/app/test/expected/test-serialize-quoted.out b/app/test/expected/test-serialize-quoted.out new file mode 100644 index 00000000..d75efb7f --- /dev/null +++ b/app/test/expected/test-serialize-quoted.out @@ -0,0 +1 @@ +xxxxxxyy,Produktinfo,"4__________________________________________________________________a________=""____-____________________________________-_____,""____________=""____-____________________________-_____,""____________=""____-____________________________-_____,""____________=""____-_______________________b____________-_____,""_______________________-_________________/______/______/______/______/________/_________________=""____-_______________________c____________-_____,""____________=""____-____________________________-_____,""____________=""____-____________________________-_____,""____________=""____-_________________________d__________-_____,""____________,_____,__________________,_______________,__________,_______________,___.__/______/______/______/______/_________________=""____-________________________e___________-_____,""____________=""____-____________________________-_____,""____________=""____-____________________________-_____,""____________=""____-_________________________f__________-_____,""_________________,_____________________________,____________/______/______/______/________________=""____""____________=""____-_________________________g__________-_____,""____________=""____-____________________________-_____,""____________=""____-____________________________-_____,""____________=""____-_________________________h__________-_____,""___________________________________________________,________________%___________________%____________/______/______/______/______/________________=""____-_________________________i__________-_____,""____________=""____-____________________________-_____,""____________=""____-____________________________-_____,""____________=""____-_________________________j__________-_____,""__________-_______________________________________.____/__/______/______/______/______/______________=""____-_________________________k__________-_____,""____________=""____-________________________l___________-_____,""____________=""____-____________________________-_____,""____________=""____-____________________________-_____,""____________=""____-_________________________m__________-_____,""__/______/______/______/______/__________________________=""____-____________________________________-_____,""_______________________/______/___________________=""____-____________________________________-_____,""___________________/______/___________________=""____-____________________________________-_____,""__________________/______(_________________,_______)_/___________________=""____-____________________________________-_____,""______________,____(_______________,_)____/__/______/____________________________,___________,_/_______________________,________________________=""____-____________________________________-_____,""____________=""____-____________________________-_____,""____________=""____-____________________________-_____,""____________=""____-____________________________________-_____,""________________=""____-_______-_____,""__/______/_____/______/______/______/______/_____/__________________=""____-____________________________________-_____,""____________=""____-____________________________-_____,""____________=""____-____________________________-_____,""____________=""____-____________________________________-_____,""____________=""____-____________________________________-_____,""__________,__________,__________________________-_________._/______/______/______/______/______/_________________=""____-____________________________________-_____,""____________=""____-____________________________-_____,""____________=""____-____________________________-_____,""____________=""____-____________________________________-_____,""____________=""____-____________________________________-_____,""________!_____________,__________________________._/______/______/______/______/______/____/______/______/______/______/______/______/______/______/____" diff --git a/data/quoted5.csv.bz2 b/data/quoted5.csv.bz2 new file mode 100644 index 00000000..0f8ad1cb Binary files /dev/null and b/data/quoted5.csv.bz2 differ diff --git a/include/zsv/common.h b/include/zsv/common.h index cb5ec776..8649d4a9 100644 --- a/include/zsv/common.h +++ b/include/zsv/common.h @@ -57,6 +57,7 @@ struct zsv_cell { # define ZSV_PARSER_QUOTE_CLOSED 2 /* value was quoted */ # define ZSV_PARSER_QUOTE_NEEDED 4 /* value contains delimiter or dbl-quote */ # define ZSV_PARSER_QUOTE_EMBEDDED 8 /* value contains dbl-quote */ +# define ZSV_PARSER_QUOTE_PENDING 16 /* only used internally by parser */ /** * quoted flags enable additional efficiency, in particular when input data will * be output as text (csv, json etc), by indicating whether the cell contents may diff --git a/src/zsv.c b/src/zsv.c index 4c41922e..52ecc8b9 100644 --- a/src/zsv.c +++ b/src/zsv.c @@ -41,10 +41,7 @@ inline static size_t scanner_pre_parse(struct zsv_scanner *scanner) { scanner->last = scanner->buff.buff[scanner->old_bytes_read-1]; if(scanner->row_start < scanner->old_bytes_read) { size_t len = scanner->old_bytes_read - scanner->row_start; - if(len < scanner->row_start) - memcpy(scanner->buff.buff, scanner->buff.buff + scanner->row_start, len); - else - memmove(scanner->buff.buff, scanner->buff.buff + scanner->row_start, len); + memmove(scanner->buff.buff, scanner->buff.buff + scanner->row_start, len); scanner->partial_row_length = len; } else { scanner->cell_start = 0; @@ -79,7 +76,6 @@ inline static size_t scanner_pre_parse(struct zsv_scanner *scanner) { return capacity; } - /** * Read the next chunk of data from our input stream and parse it, calling our * custom handlers as each cell and row are parsed diff --git a/src/zsv_internal.c b/src/zsv_internal.c index 5856e230..ae705254 100644 --- a/src/zsv_internal.c +++ b/src/zsv_internal.c @@ -526,26 +526,23 @@ static void collate_header_row(void *ctx) { if(!scanner->opts.header_span) { // finished with header; combine all rows into a single row set_callbacks(scanner); -// if(VERY_LIKELY(scanner->opts.row_handler != NULL || scanner->opts.cell_handler != NULL -// || scanner->mode == ZSV_MODE_DELIM_PULL)) { - if(scanner->collate_header) { - size_t offset = 0; - for(size_t i = 0; i < scanner->collate_header->column_count; i++) { - size_t len_plus1 = scanner->collate_header->lengths[i]; - scanner->row.cells[i].str = scanner->collate_header->buff.buff + offset; - if(len_plus1) { - scanner->row.cells[i].len = len_plus1 - 1; - scanner->row.cells[i].quoted = 1; - } else - scanner->row.cells[i].len = 0; - offset += len_plus1; - } + if(scanner->collate_header) { + size_t offset = 0; + for(size_t i = 0; i < scanner->collate_header->column_count; i++) { + size_t len_plus1 = scanner->collate_header->lengths[i]; + scanner->row.cells[i].str = scanner->collate_header->buff.buff + offset; + if(len_plus1) { + scanner->row.cells[i].len = len_plus1 - 1; + scanner->row.cells[i].quoted = 1; + } else + scanner->row.cells[i].len = 0; + offset += len_plus1; } + } - apply_callbacks(scanner); - if(scanner->mode != ZSV_MODE_DELIM_PULL) - collate_header_destroy(&scanner->collate_header); -// } + apply_callbacks(scanner); + if(scanner->mode != ZSV_MODE_DELIM_PULL) + collate_header_destroy(&scanner->collate_header); } } @@ -580,7 +577,7 @@ static void zsv_throwaway_row(void *ctx) { static int zsv_scanner_init(struct zsv_scanner *scanner, struct zsv_opts *opts) { - size_t need_buff_size = 0; // opts->buffsize + size_t need_buff_size = 0; if(opts->buffsize < opts->max_row_size * 2) need_buff_size = opts->max_row_size * 2; opts->delimiter = opts->delimiter ? opts->delimiter : ','; diff --git a/src/zsv_scan_delim.c b/src/zsv_scan_delim.c index caef8a02..7fe9f1aa 100644 --- a/src/zsv_scan_delim.c +++ b/src/zsv_scan_delim.c @@ -72,7 +72,6 @@ static enum zsv_status ZSV_SCAN_DELIM(struct zsv_scanner *scanner, skip_next_delim = 0; bytes_chunk_end = bytes_read >= sizeof(zsv_uc_vector) ? bytes_read - sizeof(zsv_uc_vector) + 1 : 0; delimiter = scanner->opts.delimiter; - scanner->partial_row_length = 0; // to do: move into one-time execution code? @@ -82,12 +81,15 @@ static enum zsv_status ZSV_SCAN_DELIM(struct zsv_scanner *scanner, memset(&v.nl, '\n', sizeof(zsv_uc_vector)); // ascii code 10 memset(&v.cr, '\r', sizeof(zsv_uc_vector)); // ascii code 13 memset(&v.qt, scanner->opts.no_quotes > 0 ? 0 : '"', sizeof(v.qt)); - // case "hel"|"o": check if we have an embedded dbl-quote past the initial opening quote, which was - // split between the last buffer and this one e.g. "hel""o" where the last buffer ended - // with "hel" and this one starts with "o" - if((scanner->quoted & ZSV_PARSER_QUOTE_UNCLOSED) - && i > scanner->cell_start + 1 // case "|hello": need the + 1 in case split after first char of quoted value e.g. "hello" => " and hello" - && scanner->last == quote) { + + if(scanner->quoted & ZSV_PARSER_QUOTE_PENDING) { + // if we're here, then the last chunk we read ended with a lone quote char inside + // a quoted cell, and we are waiting to find out whether it is followed by + // another dbl-quote e.g. if the end of the last chunk is |, we had: + // ...,"hel"|"o" + // ...,"hel"|,... + // ...,"hel"|p,... + scanner->quoted -= ZSV_PARSER_QUOTE_PENDING; if(buff[i] != quote) { scanner->quoted |= ZSV_PARSER_QUOTE_CLOSED; scanner->quoted -= ZSV_PARSER_QUOTE_UNCLOSED; @@ -229,7 +231,8 @@ static enum zsv_status ZSV_SCAN_DELIM(struct zsv_scanner *scanner, scanner->quoted |= ZSV_PARSER_QUOTE_EMBEDDED; skip_next_delim = 1; } - } + } else // we are at the end of this input chunk + scanner->quoted |= ZSV_PARSER_QUOTE_PENDING; } else { // cell_length > 0 and cell did not start w quote, so // we have a quote in middle of an unquoted cell