Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Compare: add --tolerance option to ignore differences between numeric strings within a numeric tolerance #168

Merged
merged 4 commits into from
Apr 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 43 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -225,17 +225,20 @@ for speed and ease of development for extending and/or customizing to your needs

* `echo`: read CSV from stdin and write it back out to stdout. This is mostly
useful for demonstrating how to use the API and also how to create a plug-in,
and has some limited utility beyond that e.g. for adding/removing the UTF8
BOM, or cleaning up bad UTF8
and has several uses beyond that including adding/removing BOM,
cleaning up bad UTF8,
whitespace or blank column trimming,
limiting output to a contiguous data block, skipping leading garbage, and even
providing substitution values without modifying the underlying source
* `select`: re-shape CSV by skipping leading garbage, combining header rows into
a single header, selecting or excluding specified columns, removing duplicate
columns, sampling, searching and more
* `sql`: run ad-hoc SQL query on a CSV file
columns, sampling, converting from fixed-width input, searching and more
* `sql`: treat one or more CSV files like database tables and query with SQL
* `desc`: provide a quick description of your table data
* `pretty`: format for console (fixed-width) display, or convert to markdown
format
* `2json`: convert CSV to JSON. Optionally, output in [database schema](docs/db.schema.json)
* `2tsv`: convert CSV to TSV
* `2tsv`: convert to TSV (tab-delimited) format
* `compare`: compare two or more tables of data and output the differences
* `paste` (alpha): horizontally paste two tables together (given inputs X and Y,
output 1...N rows where each row contains all columns of X in row N, followed by all columns of Y in row N)
Expand Down Expand Up @@ -264,6 +267,41 @@ zsv sql my_population_data.csv "select * from data where population > 100000"

### Using the API

Simple API usage examples include:

Pull parsing:
```
zsv_parser parser = zsv_new(...);
while(zsv_next_row(parser) == zsv_status_row) { /* for each row */
// do something
size_t cell_count = zsv_cell_count(parser);
for(size_t i = 0; i < cell_count; i++) {
struct zsv_cell c = zsv_get_cell(parser, i);
fprintf(stderr, "Cell: %.*s\n", c.len, c.str);
...
}
```

Push parsing:
```
static void my_row_handler(void *ctx) {
zsv_parser p = ctx;
size_t cell_count = zsv_cell_count(p);
for(size_t i = 0, j = zsv_cell_count(p); i < j; i++) {
...
}
}

int main() {
zsv_parser p = zsv_new(NULL);
zsv_set_row_handler(p, my_row_handler);
zsv_set_context(p, p);

enum zsv_status stat;
while((stat = zsv_parse_more(p)) == zsv_status_ok) ;

```

Full application code examples can be found at [examples/lib/README.md](examples/lib/README.md).

An example of using the API, compiled to wasm and called via Javascript,
Expand Down
66 changes: 47 additions & 19 deletions app/compare.c
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <math.h>
#include <jsonwriter.h>

#include <sqlite3.h>
Expand Down Expand Up @@ -227,25 +228,23 @@ static void zsv_compare_print_row(struct zsv_compare_data *data,

#define ZSV_COMPARE_MISSING "Missing"

// if(last_ix + 1 < data->input_count) {
// if we don't have data from every input, then output "Missing" for missing inputs
char got_missing = 0;
for(unsigned i = 0; i < data->input_count; i++) {
struct zsv_compare_input *input = data->inputs_to_sort[i];
if(i > last_ix) {
got_missing = 1;
unsigned input_ix = input->index;
values[input_ix].str = (unsigned char *)ZSV_COMPARE_MISSING;
values[input_ix].len = strlen(ZSV_COMPARE_MISSING);
}
}
if(got_missing) {
const unsigned char *key_names = data->print_key_col_names ? zsv_compare_combined_key_names(data) : (const unsigned char *)"<key>";
zsv_compare_output_tuple(data, key_input, key_names, values, 1);
// reset values
memset(values, 0, data->input_count * sizeof(*values));
// if we don't have data from every input, then output "Missing" for missing inputs
char got_missing = 0;
for(unsigned i = 0; i < data->input_count; i++) {
struct zsv_compare_input *input = data->inputs_to_sort[i];
if(i > last_ix) {
got_missing = 1;
unsigned input_ix = input->index;
values[input_ix].str = (unsigned char *)ZSV_COMPARE_MISSING;
values[input_ix].len = strlen(ZSV_COMPARE_MISSING);
}
// }
}
if(got_missing) {
const unsigned char *key_names = data->print_key_col_names ? zsv_compare_combined_key_names(data) : (const unsigned char *)"<key>";
zsv_compare_output_tuple(data, key_input, key_names, values, 1);
// reset values
memset(values, 0, data->input_count * sizeof(*values));
}

// for each output column
zsv_compare_unique_colname *output_col = data->output_colnames_first;
Expand All @@ -272,8 +271,23 @@ static void zsv_compare_print_row(struct zsv_compare_data *data,
if(!output_col)
output_col = input->output_colnames[input_col_ix];
values[input_ix] = data->get_cell(input, input_col_ix);
if(i > 0 && !different && data->cmp(data->cmp_ctx, values[first_input_ix], values[input_ix], data, input_col_ix))
if(i > 0 && !different && data->cmp(data->cmp_ctx, values[first_input_ix], values[input_ix], data, input_col_ix)) {
different = 1;
if(data->tolerance.value
&& values[first_input_ix].len < ZSV_COMPARE_MAX_NUMBER_BUFF_LEN
&& values[input_ix].len < ZSV_COMPARE_MAX_NUMBER_BUFF_LEN) {
// check if both are numbers with a difference less than the given tolerance
double d1, d2;
memcpy(data->tolerance.str1, values[first_input_ix].str, values[first_input_ix].len);
data->tolerance.str1[values[first_input_ix].len] = '\0';
memcpy(data->tolerance.str2, values[input_ix].str, values[input_ix].len);
data->tolerance.str2[values[input_ix].len] = '\0';
if(!zsv_strtod_exact(data->tolerance.str1, &d1)
&& !zsv_strtod_exact(data->tolerance.str2, &d2)
&& fabs(d1 - d2) < data->tolerance.value)
different = 0;
}
}
}
}

Expand Down Expand Up @@ -608,6 +622,10 @@ static int compare_usage() {
" --sort : sort on keys before comparing",
" --sort-in-memory : for sorting, use in-memory instead of temporary db",
" (see https://www.sqlite.org/inmemorydb.html)",
" --tolerance <value>: ignore differences where both values are numeric",
" strings with values differing by less than the given",
" amount e.g. --tolerance 0.01 will ignore differences",
" of numeric strings such as 123.45 vs 123.44",
" --json : output as JSON",
" --json-compact : output as compact JSON",
" --json-object : output as an array of objects",
Expand Down Expand Up @@ -695,6 +713,16 @@ int ZSV_MAIN_FUNC(ZSV_COMMAND)(int argc, const char *argv[], struct zsv_opts *op
data->added_colcount++;
}
}
} else if(!strcmp(arg, "--tolerance")) {
const char *next_arg = zsv_next_arg(++arg_i, argc, argv, &err);
if(next_arg) {
if(zsv_strtod_exact(next_arg, &data->tolerance.value))
fprintf(stderr, "Invalid numeric value: %s\n", next_arg), err = 1;
else if(data->tolerance.value < 0)
fprintf(stderr, "Tolerance must be greater than zero (got %s)\n", next_arg), err = 1;
else
data->tolerance.value = nextafterf(data->tolerance.value, INFINITY);
}
} else if(!strcmp(arg, "--sort")) {
data->sort = 1;
} else if(!strcmp(arg, "--json")) {
Expand Down
6 changes: 6 additions & 0 deletions app/compare_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,12 @@ struct zsv_compare_data {

sqlite3 *sort_db; // used when --sort option was specified

struct {
double value;
#define ZSV_COMPARE_MAX_NUMBER_BUFF_LEN 128
char str1[ZSV_COMPARE_MAX_NUMBER_BUFF_LEN];
char str2[ZSV_COMPARE_MAX_NUMBER_BUFF_LEN];
} tolerance;
struct {
char type; // 'j' for json
union {
Expand Down
96 changes: 90 additions & 6 deletions app/echo.c
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

#include <zsv/utils/compiler.h>
#include <zsv/utils/writer.h>
#include <zsv/utils/file.h>
#include <zsv/utils/string.h>
#include <zsv/utils/mem.h>

Expand Down Expand Up @@ -52,8 +53,13 @@ struct zsv_echo_data {

unsigned char *skip_until_prefix;
size_t skip_until_prefix_len;

char *tmp_fn;
unsigned max_nonempty_cols;
unsigned char trim_white:1;
unsigned char _:7;
unsigned char trim_columns:1;
unsigned char contiguous:1;
unsigned char _:5;
};

/**
Expand Down Expand Up @@ -86,17 +92,37 @@ void zsv_echo_get_next_overwrite(struct zsv_echo_data *data) {
}
}

/**
 * Row handler that tracks the widest "used" row width seen so far.
 *
 * For the current row, finds the 1-based position of the right-most cell
 * whose length is non-zero (after whitespace trimming, when data->trim_white
 * is set) and raises data->max_nonempty_cols to that position if it is larger
 * than the value already recorded.
 *
 * @param hook pointer to the struct zsv_echo_data for this run
 */
static void zsv_echo_get_max_nonempty_cols(void *hook) {
  struct zsv_echo_data *data = hook;
  const size_t cell_total = zsv_cell_count(data->parser);
  unsigned last_nonempty = 0; // 1-based index of right-most non-empty cell; 0 if none

  size_t col = 0;
  while(col < cell_total) {
    struct zsv_cell cell = zsv_get_cell(data->parser, col);
    col++; // from here on, col is the 1-based position of the cell just fetched
    if(UNLIKELY(data->trim_white))
      cell.str = (unsigned char *)zsv_strtrim(cell.str, &cell.len);
    if(cell.len)
      last_nonempty = col;
  }

  if(last_nonempty > data->max_nonempty_cols)
    data->max_nonempty_cols = last_nonempty;
}

static void zsv_echo_row(void *hook) {
struct zsv_echo_data *data = hook;
size_t j = zsv_cell_count(data->parser);
if(UNLIKELY(data->trim_columns && j > data->max_nonempty_cols))
j = data->max_nonempty_cols;

if(VERY_UNLIKELY(data->row_ix == 0)) { // header
for(size_t i = 0, j = zsv_cell_count(data->parser); i < j; i++) {
for(size_t i = 0; i < j; i++) {
struct zsv_cell cell = zsv_get_cell(data->parser, i);
if(UNLIKELY(data->trim_white))
cell.str = (unsigned char *)zsv_strtrim(cell.str, &cell.len);
zsv_writer_cell(data->csv_writer, i == 0, cell.str, cell.len, cell.quoted);
}
} else if(VERY_UNLIKELY(data->contiguous && zsv_row_is_blank(data->parser))) {
zsv_abort(data->parser);
} else {
for(size_t i = 0, j = zsv_cell_count(data->parser); i < j; i++) {
for(size_t i = 0; i < j; i++) {
if(VERY_UNLIKELY(data->overwrite.row_ix == data->row_ix && data->overwrite.col_ix == i)) {
zsv_writer_cell(data->csv_writer, i == 0, data->overwrite.str, data->overwrite.len, 1);
zsv_echo_get_next_overwrite(data);
Expand Down Expand Up @@ -132,6 +158,8 @@ const char *zsv_echo_usage_msg[] = {
"Options:",
" -b : output with BOM",
" --trim : trim whitespace",
" --trim-columns : trim blank columns",
" --contiguous : stop output upon scanning an entire row of blank values",
" --skip-until <value>: ignore all leading rows until the first row whose first column starts with the given value ",
" --overwrite <source>: overwrite cells using given source. Source may be:",
" - sqlite3://<filename>[?sql=<query>]",
Expand All @@ -157,6 +185,11 @@ static void zsv_echo_cleanup(struct zsv_echo_data *data) {
fclose(data->in);
if(data->o.sqlite3.db)
sqlite3_close(data->o.sqlite3.db);

if(data->tmp_fn) {
remove(data->tmp_fn);
free(data->tmp_fn);
}
}

#define zsv_echo_sqlite3_prefix "sqlite3://"
Expand Down Expand Up @@ -229,6 +262,10 @@ int ZSV_MAIN_FUNC(ZSV_COMMAND)(int argc, const char *argv[], struct zsv_opts *op
const char *arg = argv[arg_i];
if(!strcmp(arg, "-b"))
writer_opts.with_bom = 1;
else if(!strcmp(arg, "--contiguous"))
data.contiguous = 1;
else if(!strcmp(arg, "--trim-columns"))
data.trim_columns = 1;
else if(!strcmp(arg, "--trim"))
data.trim_white = 1;
else if(!strcmp(arg, "--skip-until")) {
Expand Down Expand Up @@ -288,10 +325,57 @@ int ZSV_MAIN_FUNC(ZSV_COMMAND)(int argc, const char *argv[], struct zsv_opts *op
return 1;
}

unsigned char buff[4096];
if(data.skip_until_prefix)
opts->row_handler = zsv_echo_row_skip_until;
else
else {
if(data.trim_columns) {
// first, save the file if it is stdin
if(data.in == stdin) {
if(!(data.tmp_fn = zsv_get_temp_filename("zsv_echo_XXXXXXXX"))) {
zsv_echo_cleanup(&data);
return 1;
}

FILE *f = fopen(data.tmp_fn, "wb");
if(!f) {
perror(data.tmp_fn);
zsv_echo_cleanup(&data);
return 1;
} else {
size_t bytes_read;
while((bytes_read = fread(buff, 1, sizeof(buff), data.in)) > 0)
fwrite(buff, 1, bytes_read, f);
fclose(f);
if(!(data.in = fopen(data.tmp_fn, "rb"))) {
perror(data.tmp_fn);
zsv_echo_cleanup(&data);
return 1;
}
}
}
// next, determine the max number of columns from the left that contains data
struct zsv_opts tmp_opts = *opts;
tmp_opts.row_handler = zsv_echo_get_max_nonempty_cols;
tmp_opts.stream = data.in;
tmp_opts.ctx = &data;
if(zsv_new_with_properties(&tmp_opts, custom_prop_handler, data.input_path, opts_used, &data.parser) != zsv_status_ok) {
zsv_echo_cleanup(&data);
return 1;
} else {
// find the max nonempty col count
enum zsv_status status;
while(!zsv_signal_interrupted && (status = zsv_parse_more(data.parser)) == zsv_status_ok) ;
zsv_finish(data.parser);
zsv_delete(data.parser);
data.parser = NULL;

// re-open the input again
data.in = fopen(data.tmp_fn ? data.tmp_fn : data.input_path, "rb");
}
}
opts->row_handler = zsv_echo_row;
}
opts->stream = data.in;
opts->ctx = &data;
data.csv_writer = zsv_writer_new(&writer_opts);
Expand All @@ -314,8 +398,8 @@ int ZSV_MAIN_FUNC(ZSV_COMMAND)(int argc, const char *argv[], struct zsv_opts *op
}

// create a local csv writer buff for faster performance
unsigned char writer_buff[64];
zsv_writer_set_temp_buff(data.csv_writer, writer_buff, sizeof(writer_buff));
// unsigned char writer_buff[64];
zsv_writer_set_temp_buff(data.csv_writer, buff, sizeof(buff));

// process the input data.
zsv_handle_ctrl_c_signal();
Expand Down
Loading