From b0189ea1d10af0fb6621ac54dc2d878c9451893b Mon Sep 17 00:00:00 2001 From: liquidaty Date: Sun, 5 Feb 2023 15:05:18 -0800 Subject: [PATCH 1/4] update fixed-auto move --header-row to general option fix insert-header not working with pull --- app/builtin/help.c | 18 ++-- app/select-pull.c | 8 -- app/select.c | 147 +++++++++++++++----------- app/test/Makefile | 5 + app/test/expected/test-11-select.out | 2 + app/utils/arg.c | 149 ++++++++++++++------------- include/zsv/common.h | 2 + src/zsv.c | 33 ++++-- 8 files changed, 207 insertions(+), 157 deletions(-) create mode 100644 app/test/expected/test-11-select.out diff --git a/app/builtin/help.c b/app/builtin/help.c index fcfd13d6..86bbc3b2 100644 --- a/app/builtin/help.c +++ b/app/builtin/help.c @@ -38,16 +38,18 @@ static int main_help(int argc, const char *argv[]) { " -L,--limit-rows : limit processing to the given number of rows (including any header row(s))", #endif " -c,--max-column-count : set the maximum number of columns parsed per row. defaults to 1024", - " -r,--max-row-size : set the minimum supported maximum row size. defaults to 64k", - " -B,--buff-size : set internal buffer size. defaults to 256k", - " -t,--tab-delim: set column delimiter to tab", - " -O,--other-delim : set column delimiter to specified character", - " -q,--no-quote: turn off quote handling", - " -R,--skip-head : skip specified number of initial rows", - " -d,--header-row-span : apply header depth (rowspan) of n", + " -r,--max-row-size : set the minimum supported maximum row size. defaults to 64k", + " -B,--buff-size : set internal buffer size. defaults to 256k", + " -t,--tab-delim : set column delimiter to tab", + " -O,--other-delim : set column delimiter to specified character", + " -q,--no-quote : turn off quote handling", + " -R,--skip-head : skip specified number of initial rows", + " -d,--header-row-span : apply header depth (rowspan) of n", " -u,--malformed-utf8-replacement : replacement string (can be empty) in case of malformed UTF8 input", " (default for \"desc\" commamnd is '?')", - " -S,--keep-blank-headers: disable default behavior of ignoring leading blank rows", + " -S,--keep-blank-headers : disable default behavior of ignoring leading blank rows", + " -0,--header-row
: insert the provided CSV as the first row (in position 0)", + " e.g. --header-row 'col1,col2,\"my col 3\"'", " -v,--verbose: verbose output", "", "Commands that parse CSV or other tabular data:", diff --git a/app/select-pull.c b/app/select-pull.c index d0dc3381..35a1ab40 100644 --- a/app/select-pull.c +++ b/app/select-pull.c @@ -585,7 +585,6 @@ int ZSV_MAIN_FUNC(ZSV_COMMAND)(int argc, const char *argv[], struct zsv_opts *op const char *input_path = NULL; struct zsv_csv_writer_options writer_opts = zsv_writer_get_default_opts(); int col_index_arg_i = 0; - const char *insert_header_row = NULL; enum zsv_status stat = zsv_status_ok; for(int arg_i = 1; stat == zsv_status_ok && arg_i < argc; arg_i++) { if(!strcmp(argv[arg_i], "--")) { @@ -655,12 +654,6 @@ int ZSV_MAIN_FUNC(ZSV_COMMAND)(int argc, const char *argv[], struct zsv_opts *op data.whitspace_clean_flags = 1; } else if(!strcmp(argv[arg_i], "-W") || !strcmp(argv[arg_i], "--no-trim")) { data.no_trim_whitespace = 1; - } else if(!strcmp(argv[arg_i], "--header-row")) { - arg_i++; - if(!(arg_i < argc)) - stat = zsv_printerr(1, "%s option requires a header row value such as 'column_name1,\"column name 2\"'", argv[arg_i-1]); - else - insert_header_row = argv[arg_i]; } else if(!strcmp(argv[arg_i], "--sample-every")) { arg_i++; if(!(arg_i < argc)) @@ -748,7 +741,6 @@ int ZSV_MAIN_FUNC(ZSV_COMMAND)(int argc, const char *argv[], struct zsv_opts *op stat = zsv_status_memory; else { zsv_parser parser; - data.opts->insert_header_row = insert_header_row; if(zsv_new_with_properties(data.opts, input_path, opts_used, &parser) == zsv_status_ok) { // all done with diff --git a/app/select.c b/app/select.c index a5139e4b..2aad5bb6 100644 --- a/app/select.c +++ b/app/select.c @@ -529,14 +529,11 @@ const char *zsv_select_usage_msg[] = { #endif " -H,--head : (head) only process the first n rows of data", " selected from all rows in the input", - " --header-row
: insert the provided CSV as the first row", - " e.g. --header-row 'col1,col2,\"my col 3\"'", " -s,--search : only output rows with at least one cell containing" " value", // to do: " -s,--search //modifiers: search on regex pattern; modifiers include 'g' (global) and 'i' (case-insensitive)", " --sample-every : output a sample consisting of the first row, then every nth row", " --sample-pct : output a randomly-selected sample (32 bits of randomness) of n percent of the input rows", - " -d,--header-row-span : apply header depth (rowspan) of n", " --distinct : skip subsequent occurrences of columns with the same name", " --merge : merge subsequent occurrences of columns with the same", " name, outputting first non-null value", @@ -601,68 +598,96 @@ static void zsv_select_cleanup(struct zsv_select_data *data) { * ----COLUMN1----COLUMN2-----COLUMN3---- * * Approach: - * - find each instance of white followed by not-white, but ignore the first instance of it + * - read the first [256k] of data [to do: alternatively, read only the first line] + * - merge all read lines into a single line where line[i] = 'x' for each position i at which + * a non-white char appeared in any scanned line + * - from the merged line, find each instance of white followed by not-white, + * but ignore the first instance of it */ -static enum zsv_status auto_detect_fixed_column_sizes(struct fixed *fixed, struct zsv_opts *opts, char **scanned, char verbose) { +static enum zsv_status auto_detect_fixed_column_sizes(struct fixed *fixed, struct zsv_opts *opts, + unsigned char *buff, size_t buffsize, size_t *buff_used, + char verbose) { + char only_first_line = 0; // to do: make this an option + enum zsv_status stat = zsv_status_ok; + fixed->count = 0; - unsigned buffsize = 1024*1024; // 1MB - char *buff = calloc(buffsize, sizeof(*buff)); - if(!buff) - return zsv_status_memory; + char *line = calloc(buffsize, sizeof(*buff)); + if(!line) { + stat = zsv_status_memory; + goto auto_detect_fixed_column_sizes_exit; + } + memset(line, ' ', buffsize); - int c; - size_t i; - char was_space = 1; + *buff_used = fread(buff, 1, buffsize, opts->stream); + if(!*buff_used) { + stat = zsv_status_no_more_input; + goto auto_detect_fixed_column_sizes_exit; + } + + size_t line_end = 0; + size_t line_cursor = 0; char first = 1; - for(i = 0; i < buffsize-1; i++) { - c = fgetc(opts->stream); - if(c == EOF || c == '\n') + char was_space = 1; + char was_line_end = 0; + for(size_t i = 0; i < *buff_used && (!only_first_line || !line_end); i++, line_cursor = was_line_end ? 0 : line_cursor + 1) { + was_line_end = 0; + // to do: support multi-byte unicode chars? + switch(buff[i]) { + case '\n': + case '\r': + if(line_cursor > line_end) + line_end = line_cursor; + was_line_end = 1; + was_space = 1; + break; + case '\t': + case '\v': + case '\f': + case ' ': + was_space = 1; break; - buff[i] = c; - if(!isspace(c)) { + default: + line[line_cursor] = 'x'; if(was_space) { - if(first) - first = 0; - else - fixed->count++; + if(!line_end) { // only count columns for the first line + if(first) + first = 0; + else + fixed->count++; + } } was_space = 0; - } else - was_space = 1; + } } if(!first) fixed->count++; - if(c != '\n' || !fixed->count) { - free(buff); - return zsv_status_error; + if(!line_end) { + stat = zsv_status_error; + goto auto_detect_fixed_column_sizes_exit; } - // free unused memory - char *buff_tmp = realloc(buff, i+1); - if(buff_tmp) - buff = buff_tmp; - *scanned = buff; - buffsize = i; + if(verbose) + fprintf(stderr, "Calculating %zu columns from line:\n%.*s\n", fixed->count, (int)line_end, line); - // set offset values + // allocate offsets free(fixed->offsets); fixed->offsets = malloc(fixed->count * sizeof(*fixed->offsets)); - if(!fixed->offsets) - return zsv_status_memory; + if(!fixed->offsets) { + stat = zsv_status_memory; + goto auto_detect_fixed_column_sizes_exit; + } + // now we have our merged line, so calculate the sizes // do the loop again, but assign values this time int count = 0; was_space = 1; first = 1; if(verbose) fprintf(stderr, "Running --fixed "); - for(i = 0; i < buffsize; i++) { - c = buff[i]; - if(c == EOF || c == '\0') - break; - buff[i] = c; - if(!isspace(c)) { + size_t i; + for(i = 0; i < line_end; i++) { + if(line[i] == 'x') { if(was_space) { if(first) first = 0; @@ -683,7 +708,10 @@ static enum zsv_status auto_detect_fixed_column_sizes(struct fixed *fixed, struc } if(verbose) fprintf(stderr, "\n"); - return zsv_status_ok; + + auto_detect_fixed_column_sizes_exit: + free(line); + return stat; } @@ -699,8 +727,8 @@ int ZSV_MAIN_FUNC(ZSV_COMMAND)(int argc, const char *argv[], struct zsv_opts *op const char *input_path = NULL; struct zsv_csv_writer_options writer_opts = zsv_writer_get_default_opts(); int col_index_arg_i = 0; - const char *insert_header_row = NULL; - char *fixed_auto_scanned_buff = NULL; + unsigned char *preview_buff = NULL; + size_t preview_buff_len = 0; enum zsv_status stat = zsv_status_ok; for(int arg_i = 1; stat == zsv_status_ok && arg_i < argc; arg_i++) { @@ -773,12 +801,6 @@ int ZSV_MAIN_FUNC(ZSV_COMMAND)(int argc, const char *argv[], struct zsv_opts *op data.whitspace_clean_flags = 1; } else if(!strcmp(argv[arg_i], "-W") || !strcmp(argv[arg_i], "--no-trim")) { data.no_trim_whitespace = 1; - } else if(!strcmp(argv[arg_i], "--header-row")) { - arg_i++; - if(!(arg_i < argc)) - stat = zsv_printerr(1, "%s option requires a header row value such as 'column_name1,\"column name 2\"'", argv[arg_i-1]); - else - insert_header_row = argv[arg_i]; } else if(!strcmp(argv[arg_i], "--sample-every")) { arg_i++; if(!(arg_i < argc)) @@ -853,12 +875,15 @@ int ZSV_MAIN_FUNC(ZSV_COMMAND)(int argc, const char *argv[], struct zsv_opts *op if(stat == zsv_status_ok && fixed_auto) { if(data.fixed.offsets) stat = zsv_printerr(zsv_status_error, "Please specify either --fixed-auto or --fixed, but not both"); - else if(insert_header_row) + else if(data.opts->insert_header_row) stat = zsv_printerr(zsv_status_error, "--fixed-auto can not be specified together with --header-row"); else { - stat = auto_detect_fixed_column_sizes(&data.fixed, data.opts, &fixed_auto_scanned_buff, opts->verbose); - if(fixed_auto_scanned_buff) - data.opts->insert_header_row = fixed_auto_scanned_buff; + size_t buffsize = 1024*256; // read the first + preview_buff = calloc(buffsize, sizeof(*preview_buff)); + if(!preview_buff) + stat = zsv_printerr(zsv_status_memory, "Out of memory!"); + else + stat = auto_detect_fixed_column_sizes(&data.fixed, data.opts, preview_buff, buffsize, &preview_buff_len, opts->verbose); } } @@ -879,9 +904,6 @@ int ZSV_MAIN_FUNC(ZSV_COMMAND)(int argc, const char *argv[], struct zsv_opts *op else { data.opts->row_handler = zsv_select_header_row; data.opts->ctx = &data; - if(!data.opts->insert_header_row) - data.opts->insert_header_row = insert_header_row; - if(zsv_new_with_properties(data.opts, input_path, opts_used, &data.parser) == zsv_status_ok) { // all done with @@ -903,16 +925,19 @@ int ZSV_MAIN_FUNC(ZSV_COMMAND)(int argc, const char *argv[], struct zsv_opts *op // process the input data zsv_handle_ctrl_c_signal(); - enum zsv_status status; - while(!zsv_signal_interrupted && !data.cancelled && (status = zsv_parse_more(data.parser)) == zsv_status_ok) - ; + enum zsv_status status = zsv_status_ok; + if(preview_buff && preview_buff_len) + status = zsv_parse_bytes(data.parser, preview_buff, preview_buff_len); + while(status == zsv_status_ok + && !zsv_signal_interrupted && !data.cancelled) + status = zsv_parse_more(data.parser); zsv_finish(data.parser); zsv_delete(data.parser); } } } - free(fixed_auto_scanned_buff); + free(preview_buff); zsv_select_cleanup(&data); if(writer_opts.stream && writer_opts.stream != stdout) fclose(writer_opts.stream); diff --git a/app/test/Makefile b/app/test/Makefile index bd731c01..75b13e33 100644 --- a/app/test/Makefile +++ b/app/test/Makefile @@ -198,6 +198,11 @@ test-10-select: test-10-% : ${BUILD_DIR}/bin/zsv_%${EXE} test-10-select-pull: @echo 'N/a (test-10-select-pull)' +test-11-select test-11-select-pull: test-11-% : ${BUILD_DIR}/bin/zsv_%${EXE} + @${TEST_INIT} + @${PREFIX} (echo "A1,B1" | $< --header-row "column1,column2") > /tmp/$@.out + @cmp /tmp/$@.out expected/test-11-select.out && ${TEST_PASS} || ${TEST_FAIL} + test-fixed-1-select: ${BUILD_DIR}/bin/zsv_select${EXE} @${TEST_INIT} @${PREFIX} $< ${TEST_DATA_DIR}/fixed.csv --fixed 3,7,12,18,20,21,22 ${REDIRECT} ${TMP_DIR}/$@.out diff --git a/app/test/expected/test-11-select.out b/app/test/expected/test-11-select.out new file mode 100644 index 00000000..26c8ee7c --- /dev/null +++ b/app/test/expected/test-11-select.out @@ -0,0 +1,2 @@ +column1,column2 +A1,B1 diff --git a/app/utils/arg.c b/app/utils/arg.c index b4be7fdc..b91f1574 100644 --- a/app/utils/arg.c +++ b/app/utils/arg.c @@ -115,10 +115,12 @@ void zsv_set_default_completed_callback(zsv_completed_callback cb, void *ctx) { * -O,--other-delim * -q,--no-quote * -R,--skip-head : skip specified number of initial rows - * -d,--header-row-span : apply header depth (rowspan) of n + * -d,--header-row-span : apply header depth (rowspan) of n * -u,--malformed-utf8-replacement : replacement string (can be empty) in case of malformed UTF8 input * (default for "desc" commamnd is '?') - * -S,--keep-blank-headers: disable default behavior of ignoring leading blank rows + * -S,--keep-blank-headers : disable default behavior of ignoring leading blank rows + * -0,--header-row
: insert the provided CSV as the first row (in position 0) + * e.g. --header-row 'col1,col2,\"my col 3\"'", * -v,--verbose * * @param argc count of args to process @@ -141,24 +143,13 @@ enum zsv_status zsv_args_to_opts(int argc, const char *argv[], char *opts_used ) { #ifdef ZSV_EXTRAS - static const char *short_args = "BcrtOqvRdSuL"; + static const char *short_args = "BcrtOqvRdSu0L"; #else - static const char *short_args = "BcrtOqvRdSu"; + static const char *short_args = "BcrtOqvRdSu0"; #endif assert(strlen(short_args) < ZSV_OPTS_SIZE_MAX); - *opts_out = zsv_get_default_opts(); - int options_start = 1; // skip this many args before we start looking for options - int err = 0; - int new_argc = 0; - for(; new_argc < options_start && new_argc < argc; new_argc++) - argv_out[new_argc] = argv[new_argc]; - if(opts_used) { - memset(opts_used, ' ', ZSV_OPTS_SIZE_MAX-1); - opts_used[ZSV_OPTS_SIZE_MAX-1] = '\0'; - } - - static const char *long_args[] = { + static const char *long_args[] = { // "buff-size", "max-column-count", "max-row-size", @@ -170,12 +161,24 @@ enum zsv_status zsv_args_to_opts(int argc, const char *argv[], "header-row-span", "keep-blank-headers", "malformed-utf8-replacement", + "header-row", #ifdef ZSV_EXTRAS "limit-rows", #endif NULL }; + *opts_out = zsv_get_default_opts(); + int options_start = 1; // skip this many args before we start looking for options + int err = 0; + int new_argc = 0; + for(; new_argc < options_start && new_argc < argc; new_argc++) + argv_out[new_argc] = argv[new_argc]; + if(opts_used) { + memset(opts_used, ' ', ZSV_OPTS_SIZE_MAX-1); + opts_used[ZSV_OPTS_SIZE_MAX-1] = '\0'; + } + for(int i = options_start; !err && i < argc; i++) { char arg = 0; if(*argv[i] != '-') { /* pass this option through */ @@ -218,67 +221,73 @@ enum zsv_status zsv_args_to_opts(int argc, const char *argv[], case 'R': case 'd': case 'u': + case '0': if(++i >= argc) err = fprintf(stderr, "Error: option %s requires a value\n", argv[i-1]); - else if(arg == 'O') { + else { const char *val = argv[i]; - if(strlen(val) != 1 || *val == 0) - err = fprintf(stderr, "Error: delimiter '%s' may only be a single ascii character", val); - else if(strchr("\n\r\"", *val)) - err = fprintf(stderr, "Error: column delimiter may not be '\\n', '\\r' or '\"'\n"); + if(arg == 'O') { + if(strlen(val) != 1 || *val == 0) + err = fprintf(stderr, "Error: delimiter '%s' may only be a single ascii character", val); + else if(strchr("\n\r\"", *val)) + err = fprintf(stderr, "Error: column delimiter may not be '\\n', '\\r' or '\"'\n"); else opts_out->delimiter = *val; - } else if(arg == 'u') { - const char *val = argv[i]; - if(!strcmp(val, "none")) - opts_out->malformed_utf8_replace = ZSV_MALFORMED_UTF8_DO_NOT_REPLACE; - else if(!*val) - opts_out->malformed_utf8_replace = ZSV_MALFORMED_UTF8_REMOVE; - else if(strlen(val) > 2 || *val < 0) - err = fprintf(stderr, "Error: %s value must be a single-byte UTF8 char, empty string or 'none'\n", argv[i-1]); - else - opts_out->malformed_utf8_replace = *val; - } else { - const char *val = argv[i]; - /* arg = 'B', 'c', 'r', 'R', 'd', or 'L' (ZSV_EXTRAS only) */ - long n = atol(val); - if(n < 0) - err = fprintf(stderr, "Error: option %s value may not be less than zero (got %li\n", val, n); -#ifdef ZSV_EXTRAS - else if(arg == 'L') { - if(n < 1) - err = fprintf(stderr, "Error: max rows may not be less than 1 (got %s)\n", val); - else - opts_out->max_rows = n; - } else -#endif - if(arg == 'B') { - if(n < ZSV_MIN_SCANNER_BUFFSIZE) - err = fprintf(stderr, "Error: buff size may not be less than %u (got %s)\n", - ZSV_MIN_SCANNER_BUFFSIZE, val); - else - opts_out->buffsize = n; - } else if(arg == 'c') { - if(n < 8) - err = fprintf(stderr, "Error: max column count may not be less than 8 (got %s)\n", val); + } else if(arg == 'u') { + if(!strcmp(val, "none")) + opts_out->malformed_utf8_replace = ZSV_MALFORMED_UTF8_DO_NOT_REPLACE; + else if(!*val) + opts_out->malformed_utf8_replace = ZSV_MALFORMED_UTF8_REMOVE; + else if(strlen(val) > 2 || *val < 0) + err = fprintf(stderr, "Error: %s value must be a single-byte UTF8 char, empty string or 'none'\n", argv[i-1]); else - opts_out->max_columns = n; - } else if(arg == 'r') { - if(n < ZSV_ROW_MAX_SIZE_MIN) - err = fprintf(stderr, "Error: max row size size may not be less than %u (got %s)\n", - ZSV_ROW_MAX_SIZE_MIN, val); + opts_out->malformed_utf8_replace = *val; + } else if(arg == '0') { + if(*val == 0) + err = fprintf(stderr, "Invalid empty Inserted header row\n"); else - opts_out->max_row_size = n; - } else if(arg == 'd') { - if(n < 8 && n >= 0) - opts_out->header_span = n; - else - err = fprintf(stderr, "Error: header_span must be an integer between 0 and 8\n"); - } else if(arg == 'R') { - if(n >= 0) - opts_out->rows_to_ignore = n; - else - err = fprintf(stderr, "Error: rows_to_skip must be >= 0\n"); + opts_out->insert_header_row = argv[i]; + } else { + /* arg = 'B', 'c', 'r', 'R', 'd', or 'L' (ZSV_EXTRAS only) */ + long n = atol(val); + if(n < 0) + err = fprintf(stderr, "Error: option %s value may not be less than zero (got %li\n", val, n); +#ifdef ZSV_EXTRAS + else if(arg == 'L') { + if(n < 1) + err = fprintf(stderr, "Error: max rows may not be less than 1 (got %s)\n", val); + else + opts_out->max_rows = n; + } else +#endif + if(arg == 'B') { + if(n < ZSV_MIN_SCANNER_BUFFSIZE) + err = fprintf(stderr, "Error: buff size may not be less than %u (got %s)\n", + ZSV_MIN_SCANNER_BUFFSIZE, val); + else + opts_out->buffsize = n; + } else if(arg == 'c') { + if(n < 8) + err = fprintf(stderr, "Error: max column count may not be less than 8 (got %s)\n", val); + else + opts_out->max_columns = n; + } else if(arg == 'r') { + if(n < ZSV_ROW_MAX_SIZE_MIN) + err = fprintf(stderr, "Error: max row size size may not be less than %u (got %s)\n", + ZSV_ROW_MAX_SIZE_MIN, val); + else + opts_out->max_row_size = n; + } else if(arg == 'd') { + if(n < 8 && n >= 0) + opts_out->header_span = n; + else + err = fprintf(stderr, "Error: header_span must be an integer between 0 and 8\n"); + } else if(arg == 'R') { + if(n >= 0) + opts_out->rows_to_ignore = n; + else + err = fprintf(stderr, "Error: rows_to_skip must be >= 0\n"); + } } } break; diff --git a/include/zsv/common.h b/include/zsv/common.h index 40806d9d..7f4a3bf2 100644 --- a/include/zsv/common.h +++ b/include/zsv/common.h @@ -197,6 +197,8 @@ struct zsv_opts { * if the actual data does not have a header row with column names, the caller * should provide one (in CSV format) which will be treated as if it was the * first row of data + * + * cli option: -0,--header-row */ const char *insert_header_row; diff --git a/src/zsv.c b/src/zsv.c index b9ad2640..48a98506 100644 --- a/src/zsv.c +++ b/src/zsv.c @@ -116,22 +116,31 @@ inline static size_t scanner_pre_parse(struct zsv_scanner *scanner) { return capacity; } +/** + * apply --header-row option + */ +static enum zsv_status zsv_insert_string(struct zsv_scanner *scanner) { + // to do: replace below with + // return parse_bytes(scanner, bytes, len); + size_t len = strlen(scanner->insert_string); + if(len > scanner->buff.size - scanner->partial_row_length) + len = scanner->buff.size - 1; // to do: throw an error instead + memcpy(scanner->buff.buff + scanner->partial_row_length, scanner->insert_string, len); + if(scanner->buff.buff[len] != '\n') + scanner->buff.buff[len] = '\n'; + enum zsv_status stat = zsv_scan(scanner, scanner->buff.buff, len + 1); + scanner->insert_string = NULL; + return stat; +} + /** * Read the next chunk of data from our input stream and parse it, calling our * custom handlers as each cell and row are parsed */ ZSV_EXPORT enum zsv_status zsv_parse_more(struct zsv_scanner *scanner) { - if(scanner->insert_string) { - size_t len = strlen(scanner->insert_string); - if(len > scanner->buff.size - scanner->partial_row_length) - len = scanner->buff.size - 1; // to do: throw an error instead - memcpy(scanner->buff.buff + scanner->partial_row_length, scanner->insert_string, len); - if(scanner->buff.buff[len] != '\n') - scanner->buff.buff[len] = '\n'; - zsv_scan(scanner, scanner->buff.buff, len + 1); - scanner->insert_string = NULL; - } + if(VERY_UNLIKELY(scanner->insert_string != NULL)) + zsv_insert_string(scanner); size_t capacity = scanner_pre_parse(scanner); size_t bytes_read; @@ -194,6 +203,10 @@ enum zsv_status zsv_next_row(zsv_parser parser) { parser->mode = ZSV_MODE_DELIM_PULL; zsv_set_row_handler(parser, zsv_pull_row); zsv_set_context(parser, parser); + if(parser->insert_string != NULL) + parser->pull.stat = zsv_insert_string(parser); + if(parser->pull.stat == zsv_status_row) + return parser->pull.stat; } if(VERY_LIKELY(parser->pull.stat == zsv_status_row)) parser->pull.stat = zsv_scan_delim_pull(parser, parser->pull.buff, parser->pull.bytes_read); From 75aa9283f66be9b8305204b5e643864f7274834d Mon Sep 17 00:00:00 2001 From: liquidaty Date: Sun, 5 Feb 2023 16:19:34 -0800 Subject: [PATCH 2/4] test: update expected help messages --- .../test/expected/zsvext-test-3.out | 36 ++++++++++--------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/app/ext_example/test/expected/zsvext-test-3.out b/app/ext_example/test/expected/zsvext-test-3.out index acb3229a..efea2a6f 100644 --- a/app/ext_example/test/expected/zsvext-test-3.out +++ b/app/ext_example/test/expected/zsvext-test-3.out @@ -20,16 +20,18 @@ Usage: Options common to all commands except `prop`, `rm` and `jq`: -L,--limit-rows : limit processing to the given number of rows (including any header row(s)) -c,--max-column-count : set the maximum number of columns parsed per row. defaults to 1024 - -r,--max-row-size : set the minimum supported maximum row size. defaults to 64k - -B,--buff-size : set internal buffer size. defaults to 256k - -t,--tab-delim: set column delimiter to tab - -O,--other-delim : set column delimiter to specified character - -q,--no-quote: turn off quote handling - -R,--skip-head : skip specified number of initial rows - -d,--header-row-span : apply header depth (rowspan) of n + -r,--max-row-size : set the minimum supported maximum row size. defaults to 64k + -B,--buff-size : set internal buffer size. defaults to 256k + -t,--tab-delim : set column delimiter to tab + -O,--other-delim : set column delimiter to specified character + -q,--no-quote : turn off quote handling + -R,--skip-head : skip specified number of initial rows + -d,--header-row-span : apply header depth (rowspan) of n -u,--malformed-utf8-replacement : replacement string (can be empty) in case of malformed UTF8 input (default for "desc" commamnd is '?') - -S,--keep-blank-headers: disable default behavior of ignoring leading blank rows + -S,--keep-blank-headers : disable default behavior of ignoring leading blank rows + -0,--header-row
: insert the provided CSV as the first row (in position 0) + e.g. --header-row 'col1,col2,"my col 3"' -v,--verbose: verbose output Commands that parse CSV or other tabular data: @@ -75,16 +77,18 @@ Usage: Options common to all commands except `prop`, `rm` and `jq`: -L,--limit-rows : limit processing to the given number of rows (including any header row(s)) -c,--max-column-count : set the maximum number of columns parsed per row. defaults to 1024 - -r,--max-row-size : set the minimum supported maximum row size. defaults to 64k - -B,--buff-size : set internal buffer size. defaults to 256k - -t,--tab-delim: set column delimiter to tab - -O,--other-delim : set column delimiter to specified character - -q,--no-quote: turn off quote handling - -R,--skip-head : skip specified number of initial rows - -d,--header-row-span : apply header depth (rowspan) of n + -r,--max-row-size : set the minimum supported maximum row size. defaults to 64k + -B,--buff-size : set internal buffer size. defaults to 256k + -t,--tab-delim : set column delimiter to tab + -O,--other-delim : set column delimiter to specified character + -q,--no-quote : turn off quote handling + -R,--skip-head : skip specified number of initial rows + -d,--header-row-span : apply header depth (rowspan) of n -u,--malformed-utf8-replacement : replacement string (can be empty) in case of malformed UTF8 input (default for "desc" commamnd is '?') - -S,--keep-blank-headers: disable default behavior of ignoring leading blank rows + -S,--keep-blank-headers : disable default behavior of ignoring leading blank rows + -0,--header-row
: insert the provided CSV as the first row (in position 0) + e.g. --header-row 'col1,col2,"my col 3"' -v,--verbose: verbose output Commands that parse CSV or other tabular data: From 68fe038c6e80cc8a8b7a9f9d49b04313065ff65f Mon Sep 17 00:00:00 2001 From: liquidaty Date: Sun, 5 Feb 2023 16:34:04 -0800 Subject: [PATCH 3/4] fix a bug when invalid offset given for --fixed add another fixed test add all fixed tests for select-pull --- app/select.c | 2 +- app/test/Makefile | 16 +++++++++------- app/test/expected/test-fixed-3-select.out | 2 ++ data/fixed-auto2.txt | 2 ++ 4 files changed, 14 insertions(+), 8 deletions(-) create mode 100644 app/test/expected/test-fixed-3-select.out create mode 100644 data/fixed-auto2.txt diff --git a/app/select.c b/app/select.c index 2aad5bb6..f62e95cc 100644 --- a/app/select.c +++ b/app/select.c @@ -755,7 +755,7 @@ int ZSV_MAIN_FUNC(ZSV_COMMAND)(int argc, const char *argv[], struct zsv_opts *op for(const char *end = argv[arg_i]; ; end++) { if(*end == ',' || *end == '\0') { if(!sscanf(start, "%zu,", &data.fixed.offsets[count++])) { - stat = zsv_printerr(1, "Invalid offset: %s.*\n", end - start, start); + stat = zsv_printerr(1, "Invalid offset: %.*s\n", end - start, start); break; } else if(*end == '\0') break; diff --git a/app/test/Makefile b/app/test/Makefile index 75b13e33..3a22298d 100644 --- a/app/test/Makefile +++ b/app/test/Makefile @@ -136,7 +136,7 @@ test-2-count test-2-count-pull: ${BUILD_DIR}/bin/zsv_count${EXE} ${TEST_DATA_DIR @for x in 5000 5002 5004 5006 5008 5010 5013 5015 5017 5019 5021 5101 5105 5111 5113 5115 5117 5119 5121 5123 5125 5127 5129 5131 5211 5213 5215 5217 5311 5313 5315 5317 5413 5431 5433 5455 6133 ; do $< -r $$x ${TEST_DATA_DIR}/test/buffsplit_quote.csv ; done > ${TMP_DIR}/$@.out @${CMP} ${TMP_DIR}/$@.out expected/test-2-count.out && ${TEST_PASS} || ${TEST_FAIL} -test-select test-select-pull: test-% : test-n-% test-6-% test-7-% test-8-% test-9-% test-10-% test-quotebuff-% test-fixed-1-% test-fixed-2-% test-merge-% +test-select test-select-pull: test-% : test-n-% test-6-% test-7-% test-8-% test-9-% test-10-% test-quotebuff-% test-fixed-1-% test-fixed-2-% test-fixed-3-% test-merge-% test-merge-select test-merge-select-pull: test-merge-% : ${BUILD_DIR}/bin/zsv_%${EXE} @${TEST_INIT} @@ -203,18 +203,20 @@ test-11-select test-11-select-pull: test-11-% : ${BUILD_DIR}/bin/zsv_%${EXE} @${PREFIX} (echo "A1,B1" | $< --header-row "column1,column2") > /tmp/$@.out @cmp /tmp/$@.out expected/test-11-select.out && ${TEST_PASS} || ${TEST_FAIL} -test-fixed-1-select: ${BUILD_DIR}/bin/zsv_select${EXE} +test-fixed-1-select test-fixed-1-select-pull: ${BUILD_DIR}/bin/zsv_select${EXE} @${TEST_INIT} @${PREFIX} $< ${TEST_DATA_DIR}/fixed.csv --fixed 3,7,12,18,20,21,22 ${REDIRECT} ${TMP_DIR}/$@.out - @${CMP} ${TMP_DIR}/$@.out expected/$@.out && ${TEST_PASS} || ${TEST_FAIL} + @${CMP} ${TMP_DIR}/$@.out expected/test-fixed-1-select.out && ${TEST_PASS} || ${TEST_FAIL} -test-fixed-2-select: ${BUILD_DIR}/bin/zsv_select${EXE} +test-fixed-2-select test-fixed-2-select-pull: ${BUILD_DIR}/bin/zsv_select${EXE} @${TEST_INIT} @${PREFIX} $< ${TEST_DATA_DIR}/fixed-auto.txt --fixed-auto ${REDIRECT} ${TMP_DIR}/$@.out - @${CMP} ${TMP_DIR}/$@.out expected/$@.out && ${TEST_PASS} || ${TEST_FAIL} + @${CMP} ${TMP_DIR}/$@.out expected/test-fixed-2-select.out && ${TEST_PASS} || ${TEST_FAIL} -test-fixed-1-select-pull test-fixed-2-select-pull: - @echo "Skipping $@" +test-fixed-3-select test-fixed-3-select-pull: ${BUILD_DIR}/bin/zsv_select${EXE} + @${TEST_INIT} + @${PREFIX} $< ${TEST_DATA_DIR}/fixed-auto2.txt --fixed-auto ${REDIRECT} ${TMP_DIR}/$@.out + @${CMP} ${TMP_DIR}/$@.out expected/test-fixed-3-select.out && ${TEST_PASS} || ${TEST_FAIL} test-blank-leading-rows: test-blank-leading-rows-1 test-blank-leading-rows-2 test-blank-leading-rows-3 test-blank-leading-rows-4 diff --git a/app/test/expected/test-fixed-3-select.out b/app/test/expected/test-fixed-3-select.out new file mode 100644 index 00000000..c6f81661 --- /dev/null +++ b/app/test/expected/test-fixed-3-select.out @@ -0,0 +1,2 @@ +originator,loanid,origdate,lifecap,lifefloor,teaser,prod +Retail,1,8/2/2005,18,3,0,HELOC diff --git a/data/fixed-auto2.txt b/data/fixed-auto2.txt new file mode 100644 index 00000000..71797533 --- /dev/null +++ b/data/fixed-auto2.txt @@ -0,0 +1,2 @@ +originator loanid origdate lifecap lifefloor teaser prod +Retail 1 8/2/2005 18 3 0 HELOC From f17c48942589f88966ed68ed94e130955a901560 Mon Sep 17 00:00:00 2001 From: liquidaty Date: Sun, 5 Feb 2023 19:29:28 -0800 Subject: [PATCH 4/4] select --fixed: add another test, fix a bug where final fix-width cell may have trailing newline --- app/test/Makefile | 7 ++++++- app/test/expected/test-fixed-4-select.out | 2 ++ data/fixed-auto3.txt | 2 ++ src/zsv.c | 2 ++ 4 files changed, 12 insertions(+), 1 deletion(-) create mode 100644 app/test/expected/test-fixed-4-select.out create mode 100644 data/fixed-auto3.txt diff --git a/app/test/Makefile b/app/test/Makefile index 3a22298d..fb883ad8 100644 --- a/app/test/Makefile +++ b/app/test/Makefile @@ -136,7 +136,7 @@ test-2-count test-2-count-pull: ${BUILD_DIR}/bin/zsv_count${EXE} ${TEST_DATA_DIR @for x in 5000 5002 5004 5006 5008 5010 5013 5015 5017 5019 5021 5101 5105 5111 5113 5115 5117 5119 5121 5123 5125 5127 5129 5131 5211 5213 5215 5217 5311 5313 5315 5317 5413 5431 5433 5455 6133 ; do $< -r $$x ${TEST_DATA_DIR}/test/buffsplit_quote.csv ; done > ${TMP_DIR}/$@.out @${CMP} ${TMP_DIR}/$@.out expected/test-2-count.out && ${TEST_PASS} || ${TEST_FAIL} -test-select test-select-pull: test-% : test-n-% test-6-% test-7-% test-8-% test-9-% test-10-% test-quotebuff-% test-fixed-1-% test-fixed-2-% test-fixed-3-% test-merge-% +test-select test-select-pull: test-% : test-n-% test-6-% test-7-% test-8-% test-9-% test-10-% test-quotebuff-% test-fixed-1-% test-fixed-2-% test-fixed-3-% test-fixed-4-% test-merge-% test-merge-select test-merge-select-pull: test-merge-% : ${BUILD_DIR}/bin/zsv_%${EXE} @${TEST_INIT} @@ -218,6 +218,11 @@ test-fixed-3-select test-fixed-3-select-pull: ${BUILD_DIR}/bin/zsv_select${EXE} @${PREFIX} $< ${TEST_DATA_DIR}/fixed-auto2.txt --fixed-auto ${REDIRECT} ${TMP_DIR}/$@.out @${CMP} ${TMP_DIR}/$@.out expected/test-fixed-3-select.out && ${TEST_PASS} || ${TEST_FAIL} +test-fixed-4-select test-fixed-4-select-pull: ${BUILD_DIR}/bin/zsv_select${EXE} + @${TEST_INIT} + @${PREFIX} $< ${TEST_DATA_DIR}/fixed-auto3.txt --fixed-auto ${REDIRECT} ${TMP_DIR}/$@.out + @${CMP} ${TMP_DIR}/$@.out expected/test-fixed-4-select.out && ${TEST_PASS} || ${TEST_FAIL} + test-blank-leading-rows: test-blank-leading-rows-1 test-blank-leading-rows-2 test-blank-leading-rows-3 test-blank-leading-rows-4 diff --git a/app/test/expected/test-fixed-4-select.out b/app/test/expected/test-fixed-4-select.out new file mode 100644 index 00000000..1d66310b --- /dev/null +++ b/app/test/expected/test-fixed-4-select.out @@ -0,0 +1,2 @@ +originator,loanid,origdate,lifecap,lifefloor,teaser,prodtype +Retail,1,8/2/2005,18,3,0,HELOC diff --git a/data/fixed-auto3.txt b/data/fixed-auto3.txt new file mode 100644 index 00000000..07e1bccb --- /dev/null +++ b/data/fixed-auto3.txt @@ -0,0 +1,2 @@ +originator loanid origdate lifecap lifefloor teaser prodtype +Retail 1 8/2/2005 18 3 0 HELOC diff --git a/src/zsv.c b/src/zsv.c index 48a98506..ee99cb76 100644 --- a/src/zsv.c +++ b/src/zsv.c @@ -377,6 +377,8 @@ enum zsv_status zsv_finish(struct zsv_scanner *scanner) { return zsv_status_error; if(!scanner->abort) { if(scanner->mode == ZSV_MODE_FIXED) { + if(scanner->partial_row_length && memchr("\n\r", scanner->buff.buff[scanner->partial_row_length-1], 2)) + scanner->partial_row_length--; if(scanner->partial_row_length) return row_fx(scanner, scanner->buff.buff, 0, scanner->partial_row_length); return zsv_status_ok;