Skip to content

Commit

Permalink
Compare: add --tolerance option to ignore differences between numeric…
Browse files Browse the repository at this point in the history
… strings within a numeric tolerance (#168)

* compare: add --tolerance option
  • Loading branch information
liquidaty authored Apr 17, 2024
1 parent 75f2adf commit 6021ae2
Show file tree
Hide file tree
Showing 13 changed files with 159 additions and 25 deletions.
48 changes: 43 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -225,17 +225,20 @@ for speed and ease of development for extending and/or customizing to your needs

* `echo`: read CSV from stdin and write it back out to stdout. This is mostly
useful for demonstrating how to use the API and also how to create a plug-in,
and has some limited utility beyond that e.g. for adding/removing the UTF8
BOM, or cleaning up bad UTF8
and has several uses beyond that including adding/removing BOM,
cleaning up bad UTF8,
whitespace or blank column trimming,
limiting output to a contiguous data block, skipping leading garbage, and even
providing substitution values without modifying the underlying source
* `select`: re-shape CSV by skipping leading garbage, combining header rows into
a single header, selecting or excluding specified columns, removing duplicate
columns, sampling, searching and more
* `sql`: run ad-hoc SQL query on a CSV file
columns, sampling, converting from fixed-width input, searching and more
* `sql`: treat one or more CSV files like database tables and query with SQL
* `desc`: provide a quick description of your table data
* `pretty`: format for console (fixed-width) display, or convert to markdown
format
* `2json`: convert CSV to JSON. Optionally, output in [database schema](docs/db.schema.json)
* `2tsv`: convert CSV to TSV
* `2tsv`: convert to TSV (tab-delimited) format
* `compare`: compare two or more tables of data and output the differences
* `paste` (alpha): horizontally paste two tables together (given inputs X and Y,
output 1...N rows where each row contains all columns of X in row N, followed by all columns of Y in row N)
Expand Down Expand Up @@ -264,6 +267,41 @@ zsv sql my_population_data.csv "select * from data where population > 100000"

### Using the API

Simple API usage examples include:

Pull parsing:
```
zsv_parser parser = zsv_new(...);
while(zsv_next_row(parser) == zsv_status_row) { /* for each row */
// do something
size_t cell_count = zsv_cell_count(parser);
for(size_t i = 0; i < cell_count; i++) {
struct zsv_cell c = zsv_get_cell(parser, i);
fprintf(stderr, "Cell: %.*s\n", (int)c.len, c.str);
...
}
}
```

Push parsing:
```
static void my_row_handler(void *ctx) {
zsv_parser p = ctx;
size_t cell_count = zsv_cell_count(p);
for(size_t i = 0, j = zsv_cell_count(p); i < j; i++) {
...
}
}
int main() {
zsv_parser p = zsv_new(NULL);
zsv_set_row_handler(p, my_row_handler);
zsv_set_context(p, p);
enum zsv_status stat;
while((stat = zsv_parse_more(p)) == zsv_status_ok) ;
}
```

Full application code examples can be found at [examples/lib/README.md](examples/lib/README.md).

An example of using the API, compiled to wasm and called via Javascript,
Expand Down
66 changes: 47 additions & 19 deletions app/compare.c
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <math.h>
#include <jsonwriter.h>

#include <sqlite3.h>
Expand Down Expand Up @@ -227,25 +228,23 @@ static void zsv_compare_print_row(struct zsv_compare_data *data,

#define ZSV_COMPARE_MISSING "Missing"

// if(last_ix + 1 < data->input_count) {
// if we don't have data from every input, then output "Missing" for missing inputs
char got_missing = 0;
for(unsigned i = 0; i < data->input_count; i++) {
struct zsv_compare_input *input = data->inputs_to_sort[i];
if(i > last_ix) {
got_missing = 1;
unsigned input_ix = input->index;
values[input_ix].str = (unsigned char *)ZSV_COMPARE_MISSING;
values[input_ix].len = strlen(ZSV_COMPARE_MISSING);
}
}
if(got_missing) {
const unsigned char *key_names = data->print_key_col_names ? zsv_compare_combined_key_names(data) : (const unsigned char *)"<key>";
zsv_compare_output_tuple(data, key_input, key_names, values, 1);
// reset values
memset(values, 0, data->input_count * sizeof(*values));
// if we don't have data from every input, then output "Missing" for missing inputs
char got_missing = 0;
for(unsigned i = 0; i < data->input_count; i++) {
struct zsv_compare_input *input = data->inputs_to_sort[i];
if(i > last_ix) {
got_missing = 1;
unsigned input_ix = input->index;
values[input_ix].str = (unsigned char *)ZSV_COMPARE_MISSING;
values[input_ix].len = strlen(ZSV_COMPARE_MISSING);
}
// }
}
if(got_missing) {
const unsigned char *key_names = data->print_key_col_names ? zsv_compare_combined_key_names(data) : (const unsigned char *)"<key>";
zsv_compare_output_tuple(data, key_input, key_names, values, 1);
// reset values
memset(values, 0, data->input_count * sizeof(*values));
}

// for each output column
zsv_compare_unique_colname *output_col = data->output_colnames_first;
Expand All @@ -272,8 +271,23 @@ static void zsv_compare_print_row(struct zsv_compare_data *data,
if(!output_col)
output_col = input->output_colnames[input_col_ix];
values[input_ix] = data->get_cell(input, input_col_ix);
if(i > 0 && !different && data->cmp(data->cmp_ctx, values[first_input_ix], values[input_ix], data, input_col_ix))
if(i > 0 && !different && data->cmp(data->cmp_ctx, values[first_input_ix], values[input_ix], data, input_col_ix)) {
different = 1;
if(data->tolerance.value
&& values[first_input_ix].len < ZSV_COMPARE_MAX_NUMBER_BUFF_LEN
&& values[input_ix].len < ZSV_COMPARE_MAX_NUMBER_BUFF_LEN) {
// check if both are numbers with a difference less than the given tolerance
double d1, d2;
memcpy(data->tolerance.str1, values[first_input_ix].str, values[first_input_ix].len);
data->tolerance.str1[values[first_input_ix].len] = '\0';
memcpy(data->tolerance.str2, values[input_ix].str, values[input_ix].len);
data->tolerance.str2[values[input_ix].len] = '\0';
if(!zsv_strtod_exact(data->tolerance.str1, &d1)
&& !zsv_strtod_exact(data->tolerance.str2, &d2)
&& fabs(d1 - d2) < data->tolerance.value)
different = 0;
}
}
}
}

Expand Down Expand Up @@ -608,6 +622,10 @@ static int compare_usage() {
" --sort : sort on keys before comparing",
" --sort-in-memory : for sorting, use in-memory instead of temporary db",
" (see https://www.sqlite.org/inmemorydb.html)",
" --tolerance <value>: ignore differences where both values are numeric",
" strings with values differing by less than the given",
" amount e.g. --tolerance 0.01 will ignore differences",
" of numeric strings such as 123.45 vs 123.44",
" --json : output as JSON",
" --json-compact : output as compact JSON",
" --json-object : output as an array of objects",
Expand Down Expand Up @@ -695,6 +713,16 @@ int ZSV_MAIN_FUNC(ZSV_COMMAND)(int argc, const char *argv[], struct zsv_opts *op
data->added_colcount++;
}
}
} else if(!strcmp(arg, "--tolerance")) {
const char *next_arg = zsv_next_arg(++arg_i, argc, argv, &err);
if(next_arg) {
if(zsv_strtod_exact(next_arg, &data->tolerance.value))
fprintf(stderr, "Invalid numeric value: %s\n", next_arg), err = 1;
else if(data->tolerance.value < 0)
fprintf(stderr, "Tolerance must be greater than zero (got %s)\n", next_arg), err = 1;
else
data->tolerance.value = nextafterf(data->tolerance.value, INFINITY);
}
} else if(!strcmp(arg, "--sort")) {
data->sort = 1;
} else if(!strcmp(arg, "--json")) {
Expand Down
6 changes: 6 additions & 0 deletions app/compare_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,12 @@ struct zsv_compare_data {

sqlite3 *sort_db; // used when --sort option was specified

// state for the --tolerance option
struct {
// threshold set from the --tolerance argument: when two cell values both parse
// as numbers and differ by less than this amount, the difference is ignored.
// a value of zero skips the tolerance check entirely
double value;
// size of the scratch buffers below; a cell value whose length is not less
// than this is never considered for the tolerance check
#define ZSV_COMPARE_MAX_NUMBER_BUFF_LEN 128
// scratch buffers that receive NUL-terminated copies of the two cell values
// being compared, so that they can be passed to zsv_strtod_exact()
char str1[ZSV_COMPARE_MAX_NUMBER_BUFF_LEN];
char str2[ZSV_COMPARE_MAX_NUMBER_BUFF_LEN];
} tolerance;
struct {
char type; // 'j' for json
union {
Expand Down
15 changes: 15 additions & 0 deletions app/test/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -495,6 +495,20 @@ test-desc: test-%: ${BUILD_DIR}/bin/zsv_%${EXE}
@(${PREFIX} $< < ${TEST_DATA_DIR}/test/$*-trim.csv ${REDIRECT2} ${TMP_DIR}/$@.trim && \
${CMP} ${TMP_DIR}/$@.trim expected/$@.trim && ${TEST_PASS} || ${TEST_FAIL})

test-compare-tolerance: ${BUILD_DIR}/bin/zsv_compare${EXE}
@(${PREFIX} $< ../../data/compare/tolerance1.csv ../../data/compare/tolerance2.csv ${REDIRECT1} ${TMP_DIR}/$@.out1 && \
${CMP} ${TMP_DIR}/$@.out1 expected/$@.out1 && ${TEST_PASS} || ${TEST_FAIL})

@(${PREFIX} $< --tolerance 0.001 ../../data/compare/tolerance1.csv ../../data/compare/tolerance2.csv ${REDIRECT1} ${TMP_DIR}/$@.out2 && \
${CMP} ${TMP_DIR}/$@.out2 expected/$@.out2 && ${TEST_PASS} || ${TEST_FAIL})

@(${PREFIX} $< --tolerance 0.0001 ../../data/compare/tolerance1.csv ../../data/compare/tolerance2.csv ${REDIRECT1} ${TMP_DIR}/$@.out3 && \
${CMP} ${TMP_DIR}/$@.out3 expected/$@.out3 && ${TEST_PASS} || ${TEST_FAIL})

@(${PREFIX} $< --tolerance 0.00001 ../../data/compare/tolerance1.csv ../../data/compare/tolerance2.csv ${REDIRECT1} ${TMP_DIR}/$@.out4 && \
${CMP} ${TMP_DIR}/$@.out4 expected/$@.out4 && ${TEST_PASS} || ${TEST_FAIL})


test-compare: test-%: ${BUILD_DIR}/bin/zsv_%${EXE}
@${TEST_INIT}
@(${PREFIX} $< compare/t1.csv compare/t2.csv compare/t3.csv ${REDIRECT1} ${TMP_DIR}/$@.out && \
Expand Down Expand Up @@ -529,3 +543,4 @@ test-compare: test-%: ${BUILD_DIR}/bin/zsv_%${EXE}

@(${PREFIX} $< ../../data/compare/t1.csv ../../data/compare/t2.csv --add AccentCity --sort -k country -k city ${REDIRECT1} ${TMP_DIR}/[email protected] && \
${CMP} ${TMP_DIR}/[email protected] expected/[email protected] && ${TEST_PASS} || ${TEST_FAIL})

5 changes: 5 additions & 0 deletions app/test/expected/test-compare-tolerance.out1
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
Row #,Column,../../data/compare/tolerance1.csv,../../data/compare/tolerance2.csv
1,A,1,1.01
1,B,1,1.001
1,C,1,1.0001
1,D,1,1.00009
2 changes: 2 additions & 0 deletions app/test/expected/test-compare-tolerance.out2
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Row #,Column,../../data/compare/tolerance1.csv,../../data/compare/tolerance2.csv
1,A,1,1.01
3 changes: 3 additions & 0 deletions app/test/expected/test-compare-tolerance.out3
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Row #,Column,../../data/compare/tolerance1.csv,../../data/compare/tolerance2.csv
1,A,1,1.01
1,B,1,1.001
5 changes: 5 additions & 0 deletions app/test/expected/test-compare-tolerance.out4
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
Row #,Column,../../data/compare/tolerance1.csv,../../data/compare/tolerance2.csv
1,A,1,1.01
1,B,1,1.001
1,C,1,1.0001
1,D,1,1.00009
10 changes: 10 additions & 0 deletions app/utils/string.c
Original file line number Diff line number Diff line change
Expand Up @@ -330,6 +330,16 @@ size_t zsv_strunescape_backslash(unsigned char *s, size_t len) {
return j;
}

/*
 * zsv_strtod_exact(): convert the entire string `s` to a double
 *
 * The conversion succeeds only if `s` is non-empty and strtod() consumes every
 * character up to the terminating NUL. `*d` is written only on success, so the
 * caller's previous value survives a failed conversion
 *
 * @param s NUL-terminated string to convert; NULL is treated as an error
 * @param d receives the converted value on success; untouched on error
 *
 * @returns 0 on success, non-zero on error
 */
int zsv_strtod_exact(const char *s, double *d) {
  if(!s || !*s)
    return 1;
  char *end;
  // parse into a local so that a partial parse (e.g. "1.5x") does not leak
  // a half-converted value to the caller
  double value = strtod(s, &end);
  // reject input where nothing was consumed or where unparsed characters remain
  if(end == s || *end)
    return 1;
  *d = value;
  return 0;
}


#ifndef ZSV_STRING_LIB_ONLY
struct zsv_cell zsv_get_cell_trimmed(zsv_parser parser, size_t ix) {
struct zsv_cell c = zsv_get_cell(parser, ix);
Expand Down
2 changes: 2 additions & 0 deletions data/compare/tolerance1.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
A,B,C,D
1,1,1,1
2 changes: 2 additions & 0 deletions data/compare/tolerance2.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
A,B,C,D
1.01,1.001,1.0001,1.00009
8 changes: 7 additions & 1 deletion examples/lib/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,13 @@ returns `zsv_status_row` until no more rows are left to parse
```
zsv_parser parser = zsv_new(...);
while(zsv_next_row(parser) == zsv_status_row) { /* for each row */
// do something
// do something
size_t cell_count = zsv_cell_count(parser);
for(size_t i = 0; i < cell_count; i++) {
struct zsv_cell c = zsv_get_cell(parser, i);
fprintf(stderr, "Cell: %.*s\n", (int)c.len, c.str);
...
}
}
```

Expand Down
12 changes: 12 additions & 0 deletions include/zsv/utils/string.h
Original file line number Diff line number Diff line change
Expand Up @@ -118,8 +118,20 @@ size_t zsv_strnext_is_sign(const unsigned char *s, size_t len);
*/
size_t zsv_strnext_is_currency(const unsigned char *s, size_t len);


/*
 * Convert a NUL-terminated string to a double. The entire string must be
 * consumed by the conversion: an empty string, or any characters left over
 * after the number, is reported as an error
 * @param s string to convert
 * @param d pointer that receives the converted value; only meaningful when
 *          the function returns 0
 *
 * @returns 0 on success, non-zero on error
 */
int zsv_strtod_exact(const char *s, double *d);

/*
 * `zsv_get_cell_trimmed` is equivalent to `zsv_get_cell`, except that it
 * returns a value with leading and trailing whitespace removed
 * @param parser parser to fetch the cell from
 * @param ix     index of the cell to fetch
 */
#include <zsv.h>
Expand Down

0 comments on commit 6021ae2

Please sign in to comment.