Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

select: add --fixed-auto option #105

Merged
merged 3 commits into from
Jan 25, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
128 changes: 122 additions & 6 deletions app/select.c
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,11 @@ struct zsv_select_uint_list {
unsigned int value;
};

/* Column layout used when the input is parsed as fixed-width text */
struct fixed {
/* heap-allocated array of cell end indexes (released with free() during cleanup) */
size_t *offsets;
/* number of entries in offsets */
size_t count;
};

struct zsv_select_data {
FILE *in;
unsigned int current_column_ix;
Expand Down Expand Up @@ -92,10 +97,7 @@ struct zsv_select_data {

size_t overflow_size;

struct {
size_t *offsets;
size_t count;
} fixed;
struct fixed fixed;

unsigned char whitspace_clean_flags;

Expand Down Expand Up @@ -519,7 +521,9 @@ const char *zsv_select_usage_msg[] = {
"",
"Options:",
" -b,--with-bom: output with BOM",
" --fixed <offset1,offset2,offset3>: parse as fixed-width text; use given comma-separated list of positive integers for cell end indexes",
" --fixed <offset1,offset2,..>: parse as fixed-width text; use given comma-separated list of positive integers for cell end indexes",
" --fixed-auto : parse as fixed-width text; derive widths from first row in input data (up to max 1MB size)",
" assumes ASCII whitespace; multi-byte whitespace is not counted as whitespace",
#ifndef ZSV_CLI
" -v,--verbose : verbose output",
#endif
Expand Down Expand Up @@ -591,18 +595,113 @@ static void zsv_select_cleanup(struct zsv_select_data *data) {
free(data->fixed.offsets);
}

/**
 * Locate the end position of every column in a fixed-width header line.
 *
 * A column ends where the next column's name begins, i.e. at each
 * whitespace -> non-whitespace transition other than the very first one.
 * The last column ends at the end of the line (`len`).
 *
 * The same routine is used both to count columns and to fill the offsets
 * array, so the two results can never disagree.
 *
 * @param line    header line bytes (need not be NUL-terminated)
 * @param len     number of bytes in line
 * @param offsets if non-NULL, receives one end position per column; must have
 *                room for the value returned by a prior call with NULL
 * @return        number of columns found (0 if the line is all whitespace)
 */
static size_t zsv_select_fixed_column_ends(const char *line, size_t len, size_t *offsets) {
  size_t count = 0;
  char was_space = 1;
  char first = 1;
  for(size_t i = 0; i < len; i++) {
    // cast: passing a negative char value to isspace() is undefined behavior
    if(isspace((unsigned char)line[i])) {
      was_space = 1;
      continue;
    }
    if(was_space) {
      if(first)
        first = 0; // start of the first column is not the end of any column
      else {
        if(offsets)
          offsets[count] = i;
        count++;
      }
    }
    was_space = 0;
  }
  if(!first) { // at least one column was seen: the last one ends at end-of-line
    if(offsets)
      offsets[count] = len;
    count++;
  }
  return count;
}

/**
 * Derive fixed-width column end positions from the first line of the input,
 * where that line has the form below (dash = ASCII whitespace):
 *   ----COLUMN1----COLUMN2-----COLUMN3----
 *
 * Reads from opts->stream up to and including the first newline; the line
 * (excluding the newline) must be shorter than 1MB.
 *
 * @param fixed   receives the offsets array and count; any previous offsets
 *                array is freed. On failure, count is 0
 * @param opts    parser options; only opts->stream is read here
 * @param scanned on zsv_status_ok (and on zsv_status_memory raised while
 *                allocating offsets) receives a NUL-terminated copy of the
 *                consumed line, which the caller must free(); untouched otherwise
 * @param verbose if non-zero, the detected offsets are printed to stderr
 * @return zsv_status_ok on success, zsv_status_memory on allocation failure,
 *         zsv_status_error if no newline-terminated line was found within the
 *         size limit or the line contained no column names
 */
static enum zsv_status auto_detect_fixed_column_sizes(struct fixed *fixed, struct zsv_opts *opts, char **scanned, char verbose) {
  const size_t max_line = 1024 * 1024; // 1MB, including the NUL terminator

  fixed->count = 0;
  char *buff = calloc(max_line, sizeof(*buff));
  if(!buff)
    return zsv_status_memory;

  // read the first line; calloc guarantees the result is NUL-terminated
  int c = EOF;
  size_t len = 0;
  while(len < max_line - 1) {
    c = fgetc(opts->stream);
    if(c == EOF || c == '\n')
      break;
    buff[len++] = (char)c;
  }

  size_t count = zsv_select_fixed_column_ends(buff, len, NULL);
  if(c != '\n' || !count) {
    free(buff);
    return zsv_status_error;
  }

  // free unused memory; if shrinking fails the original buffer is still valid
  char *buff_tmp = realloc(buff, len + 1);
  if(buff_tmp)
    buff = buff_tmp;
  *scanned = buff; // ownership passes to the caller from here on

  // set offset values
  free(fixed->offsets);
  fixed->offsets = malloc(count * sizeof(*fixed->offsets));
  if(!fixed->offsets)
    return zsv_status_memory;

  // positions are computed over the full line length, so bytes such as 0xFF
  // or an embedded NUL cannot cut the second pass short of the counted total
  fixed->count = zsv_select_fixed_column_ends(buff, len, fixed->offsets);

  if(verbose) {
    fprintf(stderr, "Running --fixed ");
    for(size_t j = 0; j < fixed->count; j++)
      fprintf(stderr, "%s%zu", j ? "," : "", fixed->offsets[j]);
    fprintf(stderr, "\n");
  }
  return zsv_status_ok;
}


int ZSV_MAIN_FUNC(ZSV_COMMAND)(int argc, const char *argv[], struct zsv_opts *opts, const char *opts_used) {
if(argc > 1 && (!strcmp(argv[1], "-h") || !strcmp(argv[1], "--help"))) {
zsv_select_usage();
return zsv_status_ok;
}

char fixed_auto = 0;
struct zsv_select_data data = { 0 };
data.opts = opts;
const char *input_path = NULL;
struct zsv_csv_writer_options writer_opts = zsv_writer_get_default_opts();
int col_index_arg_i = 0;
const char *insert_header_row = NULL;
char *fixed_auto_scanned_buff = NULL;

enum zsv_status stat = zsv_status_ok;
for(int arg_i = 1; stat == zsv_status_ok && arg_i < argc; arg_i++) {
if(!strcmp(argv[arg_i], "--")) {
Expand All @@ -611,6 +710,8 @@ int ZSV_MAIN_FUNC(ZSV_COMMAND)(int argc, const char *argv[], struct zsv_opts *op
}
if(!strcmp(argv[arg_i], "-b") || !strcmp(argv[arg_i], "--with-bom"))
writer_opts.with_bom = 1;
else if(!strcmp(argv[arg_i], "--fixed-auto"))
fixed_auto = 1;
else if(!strcmp(argv[arg_i], "--fixed")) {
if(++arg_i >= argc)
stat = zsv_printerr(1, "%s option requires parameter", argv[arg_i-1]);
Expand Down Expand Up @@ -749,6 +850,18 @@ int ZSV_MAIN_FUNC(ZSV_COMMAND)(int argc, const char *argv[], struct zsv_opts *op
#endif
}

if(stat == zsv_status_ok && fixed_auto) {
if(data.fixed.offsets)
stat = zsv_printerr(zsv_status_error, "Please specify either --fixed-auto or --fixed, but not both");
else if(insert_header_row)
stat = zsv_printerr(zsv_status_error, "--fixed-auto can not be specified together with --header-row");
else {
stat = auto_detect_fixed_column_sizes(&data.fixed, data.opts, &fixed_auto_scanned_buff, opts->verbose);
if(fixed_auto_scanned_buff)
data.opts->insert_header_row = fixed_auto_scanned_buff;
}
}

if(stat == zsv_status_ok) {
if(!col_index_arg_i)
data.col_argc = 0;
Expand All @@ -766,7 +879,9 @@ int ZSV_MAIN_FUNC(ZSV_COMMAND)(int argc, const char *argv[], struct zsv_opts *op
else {
data.opts->row_handler = zsv_select_header_row;
data.opts->ctx = &data;
data.opts->insert_header_row = insert_header_row;
if(!data.opts->insert_header_row)
data.opts->insert_header_row = insert_header_row;

if(zsv_new_with_properties(data.opts, input_path, opts_used, &data.parser)
== zsv_status_ok) {
// all done with
Expand Down Expand Up @@ -797,6 +912,7 @@ int ZSV_MAIN_FUNC(ZSV_COMMAND)(int argc, const char *argv[], struct zsv_opts *op
}
}
}
free(fixed_auto_scanned_buff);
zsv_select_cleanup(&data);
if(writer_opts.stream && writer_opts.stream != stdout)
fclose(writer_opts.stream);
Expand Down
16 changes: 11 additions & 5 deletions app/test/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ test-2-count test-2-count-pull: ${BUILD_DIR}/bin/zsv_count${EXE} ${TEST_DATA_DIR
@for x in 5000 5002 5004 5006 5008 5010 5013 5015 5017 5019 5021 5101 5105 5111 5113 5115 5117 5119 5121 5123 5125 5127 5129 5131 5211 5213 5215 5217 5311 5313 5315 5317 5413 5431 5433 5455 6133 ; do $< -r $$x ${TEST_DATA_DIR}/test/buffsplit_quote.csv ; done > ${TMP_DIR}/[email protected]
@${CMP} ${TMP_DIR}/[email protected] expected/test-2-count.out && ${TEST_PASS} || ${TEST_FAIL}

test-select test-select-pull: test-% : test-n-% test-6-% test-7-% test-8-% test-9-% test-10-% test-quotebuff-% test-fixed-1-% test-merge-%
test-select test-select-pull: test-% : test-n-% test-6-% test-7-% test-8-% test-9-% test-10-% test-quotebuff-% test-fixed-1-% test-fixed-2-% test-merge-%

test-merge-select test-merge-select-pull: test-merge-% : ${BUILD_DIR}/bin/zsv_%${EXE}
@${TEST_INIT}
Expand Down Expand Up @@ -198,13 +198,19 @@ test-10-select: test-10-% : ${BUILD_DIR}/bin/zsv_%${EXE}
test-10-select-pull:
@echo 'N/a (test-10-select-pull)'

test-fixed-1-select-pull:
@echo "Skipping test-fixed-1-select-pull"

test-fixed-1-select: ${BUILD_DIR}/bin/zsv_select${EXE}
@${TEST_INIT}
@${PREFIX} $< ${TEST_DATA_DIR}/fixed.csv --fixed 3,7,12,18,20,21,22 ${REDIRECT} ${TMP_DIR}/[email protected]
@${CMP} ${TMP_DIR}/[email protected] expected/test-fixed-1-select.out && ${TEST_PASS} || ${TEST_FAIL}
@${CMP} ${TMP_DIR}/[email protected] expected/[email protected] && ${TEST_PASS} || ${TEST_FAIL}

# Run zsv_select with --fixed-auto on the fixed-width sample file and compare
# the captured output against the expected file named after this target
test-fixed-2-select: ${BUILD_DIR}/bin/zsv_select${EXE}
	@${TEST_INIT}
	@${PREFIX} $< ${TEST_DATA_DIR}/fixed-auto.txt --fixed-auto ${REDIRECT} ${TMP_DIR}/$@.out
	@${CMP} ${TMP_DIR}/$@.out expected/$@.out && ${TEST_PASS} || ${TEST_FAIL}

test-fixed-1-select-pull test-fixed-2-select-pull:
@echo "Skipping $@"


test-blank-leading-rows: test-blank-leading-rows-1 test-blank-leading-rows-2 test-blank-leading-rows-3 test-blank-leading-rows-4

Expand Down
3 changes: 3 additions & 0 deletions app/test/expected/test-fixed-2-select.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
XXXXXX,XX,XXXX,XXXX,XXXX,XXXX,XXXXX,XXX
AAAAAAA,AAAAA,AAAAAAAA,AAAAA,AAAAAA,AAAAAA AAAAA,AA,AAAAA
AAAAAAA,AAAAAA,AAAAAAAA,AAAAA,AAAAAA,AAAAA AAAAA,AA,AAAAA
3 changes: 3 additions & 0 deletions data/fixed-auto.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
XXXXXX XX XXXX XXXX XXXX XXXX XXXXX XXX
AAAAAAA AAAAA AAAAAAAA AAAAA AAAAAA AAAAAA AAAAA AA AAAAA
AAAAAAA AAAAAA AAAAAAAA AAAAA AAAAAA AAAAA AAAAA AA AAAAA