From a89b197938f93ae0e5102c17cb8ea606ee1f3d51 Mon Sep 17 00:00:00 2001
From: liquidaty
Date: Wed, 25 Jan 2023 14:29:12 -0800
Subject: [PATCH] select: add --fixed-auto option (#105)

* select: add --fixed-auto option and test
---
 app/select.c                              | 128 +++++++++++++++++++++-
 app/test/Makefile                         |  16 ++-
 app/test/expected/test-fixed-2-select.out |   3 +
 data/fixed-auto.txt                       |   3 +
 4 files changed, 139 insertions(+), 11 deletions(-)
 create mode 100644 app/test/expected/test-fixed-2-select.out
 create mode 100644 data/fixed-auto.txt

diff --git a/app/select.c b/app/select.c
index 7b3b0d3d..a5139e4b 100644
--- a/app/select.c
+++ b/app/select.c
@@ -43,6 +43,11 @@ struct zsv_select_uint_list {
   unsigned int value;
 };
 
+struct fixed {
+  size_t *offsets;
+  size_t count;
+};
+
 struct zsv_select_data {
   FILE *in;
   unsigned int current_column_ix;
@@ -92,10 +97,7 @@ struct zsv_select_data {
 
   size_t overflow_size;
 
-  struct {
-    size_t *offsets;
-    size_t count;
-  } fixed;
+  struct fixed fixed;
 
   unsigned char whitspace_clean_flags;
 
@@ -519,7 +521,9 @@ const char *zsv_select_usage_msg[] = {
   "",
   "Options:",
   " -b,--with-bom: output with BOM",
-  " --fixed : parse as fixed-width text; use given comma-separated list of positive integers for cell end indexes",
+  " --fixed : parse as fixed-width text; use given comma-separated list of positive integers for cell end indexes",
+  " --fixed-auto : parse as fixed-width text; derive widths from first row in input data (up to max 1MB size)",
+  " assumes ASCII whitespace; multi-byte whitespace is not counted as whitespace",
 #ifndef ZSV_CLI
   " -v,--verbose : verbose output",
 #endif
@@ -591,18 +595,113 @@ static void zsv_select_cleanup(struct zsv_select_data *data) {
   free(data->fixed.offsets);
 }
 
+/**
+ * Scan a header line for fixed-width column boundaries: each column ends where
+ * the next column name starts (whitespace followed by non-whitespace, ignoring
+ * the first such transition) and the last column ends at the end of the line,
+ * e.g. (dash = whitespace): ----COLUMN1----COLUMN2-----COLUMN3----
+ *
+ * @param line    header bytes (need not be NUL-terminated)
+ * @param len     number of bytes in line
+ * @param offsets if non-NULL, receives the end offset of each column
+ * @return        number of columns found (0 if line is empty or all whitespace)
+ */
+static size_t fixed_scan_header(const char *line, size_t len, size_t *offsets) {
+  size_t count = 0;
+  char was_space = 1, first = 1;
+  for(size_t i = 0; i < len; i++) {
+    // cast is required: passing a negative char to isspace() is undefined
+    char is_space = isspace((unsigned char)line[i]) != 0;
+    if(!is_space && was_space) {
+      if(first)
+        first = 0;
+      else {
+        if(offsets)
+          offsets[count] = i;
+        count++;
+      }
+    }
+    was_space = is_space;
+  }
+  if(!first) {
+    if(offsets)
+      offsets[count] = len;
+    count++;
+  }
+  return count;
+}
+
+/**
+ * Derive fixed-width column end offsets from the first line of opts->stream.
+ * Reads at most 1MB and stops at the first newline, which is consumed.
+ *
+ * @param fixed   on success, receives the offsets (previous offsets are freed)
+ * @param opts    parser options; only opts->stream is read
+ * @param scanned on success, receives the consumed line, NUL-terminated and
+ *                without its newline; the caller must free() it
+ * @param verbose if non-zero, print the detected offsets to stderr
+ * @return zsv_status_ok; zsv_status_memory if allocation fails; zsv_status_error
+ *         if no newline is found within the limit or the line has no columns
+ */
+static enum zsv_status auto_detect_fixed_column_sizes(struct fixed *fixed, struct zsv_opts *opts, char **scanned, char verbose) {
+  const size_t max_len = 1024*1024; // 1MB, including the NUL terminator
+  fixed->count = 0;
+  char *buff = calloc(max_len, sizeof(*buff));
+  if(!buff)
+    return zsv_status_memory;
+
+  int c = EOF;
+  size_t len = 0;
+  while(len < max_len - 1 && (c = fgetc(opts->stream)) != EOF && c != '\n')
+    buff[len++] = (char)c;
+
+  size_t count = fixed_scan_header(buff, len, NULL);
+  if(c != '\n' || !count) {
+    free(buff);
+    return zsv_status_error;
+  }
+
+  size_t *offsets = malloc(count * sizeof(*offsets));
+  if(!offsets) {
+    free(buff);
+    return zsv_status_memory;
+  }
+  fixed_scan_header(buff, len, offsets);
+
+  // release unused memory; buff[len] is already NUL thanks to calloc
+  char *shrunk = realloc(buff, len + 1);
+  if(shrunk)
+    buff = shrunk;
+  *scanned = buff;
+
+  free(fixed->offsets);
+  fixed->offsets = offsets;
+  fixed->count = count;
+
+  if(verbose) {
+    fprintf(stderr, "Running --fixed ");
+    for(size_t i = 0; i < count; i++)
+      fprintf(stderr, "%s%zu", i ? "," : "", offsets[i]);
+    fprintf(stderr, "\n");
+  }
+  return zsv_status_ok;
+}
+
 int ZSV_MAIN_FUNC(ZSV_COMMAND)(int argc, const char *argv[], struct zsv_opts *opts, const char *opts_used) {
   if(argc > 1 && (!strcmp(argv[1], "-h") || !strcmp(argv[1], "--help"))) {
     zsv_select_usage();
     return zsv_status_ok;
   }
 
+  char fixed_auto = 0;
   struct zsv_select_data data = { 0 };
   data.opts = opts;
   const char *input_path = NULL;
   struct zsv_csv_writer_options writer_opts = zsv_writer_get_default_opts();
   int col_index_arg_i = 0;
   const char *insert_header_row = NULL;
+  char *fixed_auto_scanned_buff = NULL;
+
   enum zsv_status stat = zsv_status_ok;
   for(int arg_i = 1; stat == zsv_status_ok && arg_i < argc; arg_i++) {
     if(!strcmp(argv[arg_i], "--")) {
@@ -611,6 +710,8 @@ int ZSV_MAIN_FUNC(ZSV_COMMAND)(int argc, const char *argv[], struct zsv_opts *op
     }
     if(!strcmp(argv[arg_i], "-b") || !strcmp(argv[arg_i], "--with-bom"))
       writer_opts.with_bom = 1;
+    else if(!strcmp(argv[arg_i], "--fixed-auto"))
+      fixed_auto = 1;
     else if(!strcmp(argv[arg_i], "--fixed")) {
       if(++arg_i >= argc)
         stat = zsv_printerr(1, "%s option requires parameter", argv[arg_i-1]);
@@ -749,6 +850,18 @@ int ZSV_MAIN_FUNC(ZSV_COMMAND)(int argc, const char *argv[], struct zsv_opts *op
 #endif
   }
 
+  if(stat == zsv_status_ok && fixed_auto) {
+    if(data.fixed.offsets)
+      stat = zsv_printerr(zsv_status_error, "Please specify either --fixed-auto or --fixed, but not both");
+    else if(insert_header_row)
+      stat = zsv_printerr(zsv_status_error, "--fixed-auto can not be specified together with --header-row");
+    else {
+      stat = auto_detect_fixed_column_sizes(&data.fixed, data.opts, &fixed_auto_scanned_buff, opts->verbose);
+      if(fixed_auto_scanned_buff)
+        data.opts->insert_header_row = fixed_auto_scanned_buff;
+    }
+  }
+
   if(stat == zsv_status_ok) {
     if(!col_index_arg_i)
       data.col_argc = 0;
@@ -766,7 +879,9 @@ int ZSV_MAIN_FUNC(ZSV_COMMAND)(int argc, const char *argv[], struct zsv_opts *op
     else {
data.opts->row_handler = zsv_select_header_row; data.opts->ctx = &data; - data.opts->insert_header_row = insert_header_row; + if(!data.opts->insert_header_row) + data.opts->insert_header_row = insert_header_row; + if(zsv_new_with_properties(data.opts, input_path, opts_used, &data.parser) == zsv_status_ok) { // all done with @@ -797,6 +912,7 @@ int ZSV_MAIN_FUNC(ZSV_COMMAND)(int argc, const char *argv[], struct zsv_opts *op } } } + free(fixed_auto_scanned_buff); zsv_select_cleanup(&data); if(writer_opts.stream && writer_opts.stream != stdout) fclose(writer_opts.stream); diff --git a/app/test/Makefile b/app/test/Makefile index 7afeb3eb..bd731c01 100644 --- a/app/test/Makefile +++ b/app/test/Makefile @@ -136,7 +136,7 @@ test-2-count test-2-count-pull: ${BUILD_DIR}/bin/zsv_count${EXE} ${TEST_DATA_DIR @for x in 5000 5002 5004 5006 5008 5010 5013 5015 5017 5019 5021 5101 5105 5111 5113 5115 5117 5119 5121 5123 5125 5127 5129 5131 5211 5213 5215 5217 5311 5313 5315 5317 5413 5431 5433 5455 6133 ; do $< -r $$x ${TEST_DATA_DIR}/test/buffsplit_quote.csv ; done > ${TMP_DIR}/$@.out @${CMP} ${TMP_DIR}/$@.out expected/test-2-count.out && ${TEST_PASS} || ${TEST_FAIL} -test-select test-select-pull: test-% : test-n-% test-6-% test-7-% test-8-% test-9-% test-10-% test-quotebuff-% test-fixed-1-% test-merge-% +test-select test-select-pull: test-% : test-n-% test-6-% test-7-% test-8-% test-9-% test-10-% test-quotebuff-% test-fixed-1-% test-fixed-2-% test-merge-% test-merge-select test-merge-select-pull: test-merge-% : ${BUILD_DIR}/bin/zsv_%${EXE} @${TEST_INIT} @@ -198,13 +198,19 @@ test-10-select: test-10-% : ${BUILD_DIR}/bin/zsv_%${EXE} test-10-select-pull: @echo 'N/a (test-10-select-pull)' -test-fixed-1-select-pull: - @echo "Skipping test-fixed-1-select-pull" - test-fixed-1-select: ${BUILD_DIR}/bin/zsv_select${EXE} @${TEST_INIT} @${PREFIX} $< ${TEST_DATA_DIR}/fixed.csv --fixed 3,7,12,18,20,21,22 ${REDIRECT} ${TMP_DIR}/$@.out - @${CMP} ${TMP_DIR}/$@.out 
expected/test-fixed-1-select.out && ${TEST_PASS} || ${TEST_FAIL} + @${CMP} ${TMP_DIR}/$@.out expected/$@.out && ${TEST_PASS} || ${TEST_FAIL} + +test-fixed-2-select: ${BUILD_DIR}/bin/zsv_select${EXE} + @${TEST_INIT} + @${PREFIX} $< ${TEST_DATA_DIR}/fixed-auto.txt --fixed-auto ${REDIRECT} ${TMP_DIR}/$@.out + @${CMP} ${TMP_DIR}/$@.out expected/$@.out && ${TEST_PASS} || ${TEST_FAIL} + +test-fixed-1-select-pull test-fixed-2-select-pull: + @echo "Skipping $@" + test-blank-leading-rows: test-blank-leading-rows-1 test-blank-leading-rows-2 test-blank-leading-rows-3 test-blank-leading-rows-4 diff --git a/app/test/expected/test-fixed-2-select.out b/app/test/expected/test-fixed-2-select.out new file mode 100644 index 00000000..a11c44ac --- /dev/null +++ b/app/test/expected/test-fixed-2-select.out @@ -0,0 +1,3 @@ +XXXXXX,XX,XXXX,XXXX,XXXX,XXXX,XXXXX,XXX +AAAAAAA,AAAAA,AAAAAAAA,AAAAA,AAAAAA,AAAAAA AAAAA,AA,AAAAA +AAAAAAA,AAAAAA,AAAAAAAA,AAAAA,AAAAAA,AAAAA AAAAA,AA,AAAAA diff --git a/data/fixed-auto.txt b/data/fixed-auto.txt new file mode 100644 index 00000000..a9c82a84 --- /dev/null +++ b/data/fixed-auto.txt @@ -0,0 +1,3 @@ +XXXXXX XX XXXX XXXX XXXX XXXX XXXXX XXX +AAAAAAA AAAAA AAAAAAAA AAAAA AAAAAA AAAAAA AAAAA AA AAAAA +AAAAAAA AAAAAA AAAAAAAA AAAAA AAAAAA AAAAA AAAAA AA AAAAA