liquidaty · liquidaty · Apr 17, 2024 · Apr 15, 2024 · Apr 15, 2024 · Apr 16, 2024
diff --git a/README.md b/README.md
@@ -225,17 +225,20 @@ for speed and ease of development for extending and/or customizing to your needs
 
 * `echo`: read CSV from stdin and write it back out to stdout. This is mostly
   useful for demonstrating how to use the API and also how to create a plug-in,
-  and has some limited utility beyond that e.g. for adding/removing the UTF8
-  BOM, or cleaning up bad UTF8
+  and has several uses beyond that including adding/removing BOM,
+  cleaning up bad UTF8,
+  whitespace or blank column trimming,
+  limiting output to a contiguous data block, skipping leading garbage, and even
+  providing substitution values without modifying the underlying source
 * `select`: re-shape CSV by skipping leading garbage, combining header rows into
   a single header, selecting or excluding specified columns, removing duplicate
-  columns, sampling, searching and more
-* `sql`: run ad-hoc SQL query on a CSV file
+  columns, sampling, converting from fixed-width input, searching and more
+* `sql`: treat one or more CSV files like database tables and query with SQL
 * `desc`: provide a quick description of your table data
 * `pretty`: format for console (fixed-width) display, or convert to markdown
   format
 * `2json`: convert CSV to JSON. Optionally, output in [database schema](docs/db.schema.json)
-* `2tsv`: convert CSV to TSV
+* `2tsv`: convert to TSV (tab-delimited) format
 * `compare`: compare two or more tables of data and output the differences
 * `paste` (alpha): horizontally paste two tables together (given inputs X and Y,
    output 1...N rows where each row all columns of X in row N, followed by all columns of Y in row N)
@@ -264,6 +267,41 @@ zsv sql my_population_data.csv "select * from data where population > 100000"
 
 ### Using the API
 
+Simple API usage examples include:
+
+Pull parsing:
+```
+zsv_parser parser = zsv_new(...);
+while(zsv_next_row(parser) == zsv_status_row) { /* for each row */
+  size_t cell_count = zsv_cell_count(parser);
+  for(size_t i = 0; i < cell_count; i++) {
+    struct zsv_cell c = zsv_get_cell(parser, i);
+    fprintf(stderr, "Cell: %.*s\n", (int)c.len, c.str);
+    ... // do something with each cell
+  }
+}
+```
+
+Push parsing:
+```
+static void my_row_handler(void *ctx) {
+  zsv_parser p = ctx;
+  size_t cell_count = zsv_cell_count(p);
+  for(size_t i = 0; i < cell_count; i++) {
+    ...
+  }
+}
+
+int main() {
+  zsv_parser p = zsv_new(NULL);
+  zsv_set_row_handler(p, my_row_handler);
+  zsv_set_context(p, p);
+
+  enum zsv_status stat;
+  while((stat = zsv_parse_more(p)) == zsv_status_ok) ;
+}
+```
+
 Full application code examples can be found at [examples/lib/README.md](examples/lib/README.md).
 
 An example of using the API, compiled to wasm and called via Javascript,

diff --git a/app/compare.c b/app/compare.c
@@ -9,6 +9,7 @@
 #include <stdio.h>
 #include <string.h>
 #include <stdlib.h>
+#include <math.h>
 #include <jsonwriter.h>
 
 #include <sqlite3.h>
@@ -227,25 +228,23 @@ static void zsv_compare_print_row(struct zsv_compare_data *data,
 
 #define ZSV_COMPARE_MISSING "Missing"
 
-//  if(last_ix + 1 < data->input_count) {
-    // if we don't have data from every input, then output "Missing" for missing inputs
-    char got_missing = 0;
-    for(unsigned i = 0; i < data->input_count; i++) {
-      struct zsv_compare_input *input = data->inputs_to_sort[i];
-      if(i > last_ix) {
-        got_missing = 1;
-        unsigned input_ix = input->index;
-        values[input_ix].str = (unsigned char *)ZSV_COMPARE_MISSING;
-        values[input_ix].len = strlen(ZSV_COMPARE_MISSING);
-      }
-    }
-    if(got_missing) {
-      const unsigned char *key_names = data->print_key_col_names ? zsv_compare_combined_key_names(data) : (const unsigned char *)"<key>";
-      zsv_compare_output_tuple(data, key_input, key_names, values, 1);
-      // reset values
-      memset(values, 0, data->input_count * sizeof(*values));
+  // if we don't have data from every input, then output "Missing" for missing inputs
+  char got_missing = 0;
+  for(unsigned i = 0; i < data->input_count; i++) {
+    struct zsv_compare_input *input = data->inputs_to_sort[i];
+    if(i > last_ix) {
+      got_missing = 1;
+      unsigned input_ix = input->index;
+      values[input_ix].str = (unsigned char *)ZSV_COMPARE_MISSING;
+      values[input_ix].len = strlen(ZSV_COMPARE_MISSING);
     }
-//  }
+  }
+  if(got_missing) {
+    const unsigned char *key_names = data->print_key_col_names ? zsv_compare_combined_key_names(data) : (const unsigned char *)"<key>";
+    zsv_compare_output_tuple(data, key_input, key_names, values, 1);
+    // reset values
+    memset(values, 0, data->input_count * sizeof(*values));
+  }
 
   // for each output column
   zsv_compare_unique_colname *output_col = data->output_colnames_first;
@@ -272,8 +271,23 @@ static void zsv_compare_print_row(struct zsv_compare_data *data,
         if(!output_col)
           output_col = input->output_colnames[input_col_ix];
         values[input_ix] = data->get_cell(input, input_col_ix);
-        if(i > 0 && !different && data->cmp(data->cmp_ctx, values[first_input_ix], values[input_ix], data, input_col_ix))
+        if(i > 0 && !different && data->cmp(data->cmp_ctx, values[first_input_ix], values[input_ix], data, input_col_ix)) {
           different = 1;
+          if(data->tolerance.value
+             && values[first_input_ix].len < ZSV_COMPARE_MAX_NUMBER_BUFF_LEN
+             && values[input_ix].len < ZSV_COMPARE_MAX_NUMBER_BUFF_LEN) {
+            // check if both are numbers with a difference less than the given tolerance            
+            double d1, d2;
+            memcpy(data->tolerance.str1, values[first_input_ix].str, values[first_input_ix].len);
+            data->tolerance.str1[values[first_input_ix].len] = '\0';
+            memcpy(data->tolerance.str2, values[input_ix].str, values[input_ix].len);
+            data->tolerance.str2[values[input_ix].len] = '\0';
+            if(!zsv_strtod_exact(data->tolerance.str1, &d1)
+               && !zsv_strtod_exact(data->tolerance.str2, &d2)
+               && fabs(d1 - d2) < data->tolerance.value)
+              different = 0;
+          }
+        }
       }
     }
 
@@ -608,6 +622,10 @@ static int compare_usage() {
     "  --sort             : sort on keys before comparing",
     "  --sort-in-memory   : for sorting,  use in-memory instead of temporary db",
     "                       (see https://www.sqlite.org/inmemorydb.html)",
+    "  --tolerance <value>: ignore differences where both values are numeric",
+    "                       strings with values differing by less than the given",
+    "                       amount e.g. --tolerance 0.01 will ignore differences",
+    "                       of numeric strings such as 123.45 vs 123.44",
     "  --json             : output as JSON",
     "  --json-compact     : output as compact JSON",
     "  --json-object      : output as an array of objects",
@@ -695,6 +713,16 @@ int ZSV_MAIN_FUNC(ZSV_COMMAND)(int argc, const char *argv[], struct zsv_opts *op
             data->added_colcount++;
         }
       }
+    } else if(!strcmp(arg, "--tolerance")) {
+      const char *next_arg = zsv_next_arg(++arg_i, argc, argv, &err);
+      if(next_arg) {
+        if(zsv_strtod_exact(next_arg, &data->tolerance.value)) // non-zero return is treated as "not a number"
+          fprintf(stderr, "Invalid numeric value: %s\n", next_arg), err = 1;
+        else if(data->tolerance.value < 0) // zero is accepted; only negative values are rejected
+          fprintf(stderr, "Tolerance must not be negative (got %s)\n", next_arg), err = 1;
+        else // step up by one float increment; presumably so the strict `<` test in the row comparison admits a difference equal to the tolerance
+          data->tolerance.value = nextafterf(data->tolerance.value, INFINITY);
+      }
     } else if(!strcmp(arg, "--sort")) {
       data->sort = 1;
     } else if(!strcmp(arg, "--json")) {

diff --git a/app/compare_internal.h b/app/compare_internal.h
@@ -106,6 +106,12 @@ struct zsv_compare_data {
 
   sqlite3 *sort_db; // used when --sort option was specified
 
+  struct {
+    double value;
+#define ZSV_COMPARE_MAX_NUMBER_BUFF_LEN 128
+    char   str1[ZSV_COMPARE_MAX_NUMBER_BUFF_LEN];
+    char   str2[ZSV_COMPARE_MAX_NUMBER_BUFF_LEN];
+  } tolerance;
   struct {
     char type; // 'j' for json
     union {

diff --git a/app/echo.c b/app/echo.c
@@ -18,6 +18,7 @@
 
 #include <zsv/utils/compiler.h>
 #include <zsv/utils/writer.h>
+#include <zsv/utils/file.h>
 #include <zsv/utils/string.h>
 #include <zsv/utils/mem.h>
 
@@ -52,8 +53,13 @@ struct zsv_echo_data {
 
   unsigned char *skip_until_prefix;
   size_t skip_until_prefix_len;
+
+  char *tmp_fn;
+  unsigned max_nonempty_cols;
   unsigned char trim_white:1;
-  unsigned char _:7;
+  unsigned char trim_columns:1;
+  unsigned char contiguous:1;
+  unsigned char _:5;
 };
 
 /**
@@ -86,17 +92,37 @@ void zsv_echo_get_next_overwrite(struct zsv_echo_data *data) {
   }
 }
 
+static void zsv_echo_get_max_nonempty_cols(void *hook) {
+  struct zsv_echo_data *data = hook;
+  unsigned last_nonempty = 0; // 1-based position of the last cell in this row that has content
+  const size_t cell_total = zsv_cell_count(data->parser);
+  for(size_t col = 0; col < cell_total; col++) {
+    struct zsv_cell cell = zsv_get_cell(data->parser, col);
+    if(UNLIKELY(data->trim_white)) // trim first so the length test below sees the trimmed length
+      cell.str = (unsigned char *)zsv_strtrim(cell.str, &cell.len);
+    if(cell.len > 0)
+      last_nonempty = col + 1;
+  }
+  if(last_nonempty > data->max_nonempty_cols) data->max_nonempty_cols = last_nonempty; // keep the running maximum across rows
+}
+
 static void zsv_echo_row(void *hook) {
   struct zsv_echo_data *data = hook;
+  size_t j = zsv_cell_count(data->parser);
+  if(UNLIKELY(data->trim_columns && j > data->max_nonempty_cols))
+    j = data->max_nonempty_cols;
+
   if(VERY_UNLIKELY(data->row_ix == 0)) { // header
-    for(size_t i = 0, j = zsv_cell_count(data->parser); i < j; i++) {
+    for(size_t i = 0; i < j; i++) {
       struct zsv_cell cell = zsv_get_cell(data->parser, i);
       if(UNLIKELY(data->trim_white))
         cell.str = (unsigned char *)zsv_strtrim(cell.str, &cell.len);
       zsv_writer_cell(data->csv_writer, i == 0, cell.str, cell.len, cell.quoted);
     }
+  } else if(VERY_UNLIKELY(data->contiguous && zsv_row_is_blank(data->parser))) {
+    zsv_abort(data->parser);
   } else {
-    for(size_t i = 0, j = zsv_cell_count(data->parser); i < j; i++) {
+    for(size_t i = 0; i < j; i++) {
       if(VERY_UNLIKELY(data->overwrite.row_ix == data->row_ix && data->overwrite.col_ix == i)) {
         zsv_writer_cell(data->csv_writer, i == 0, data->overwrite.str, data->overwrite.len, 1);
         zsv_echo_get_next_overwrite(data);
@@ -132,6 +158,8 @@ const char *zsv_echo_usage_msg[] = {
   "Options:",
   "  -b                  : output with BOM",
   "  --trim              : trim whitespace",
+  "  --trim-columns      : trim blank columns",
+  "  --contiguous        : stop output upon scanning an entire row of blank values",
   "  --skip-until <value>: ignore all leading rows until the first row whose first column starts with the given value ",
   "  --overwrite <source>: overwrite cells using given source. Source may be:",
   "                        - sqlite3://<filename>[?sql=<query>]",
@@ -157,6 +185,11 @@ static void zsv_echo_cleanup(struct zsv_echo_data *data) {
     fclose(data->in);
   if(data->o.sqlite3.db)
     sqlite3_close(data->o.sqlite3.db);
+
+  if(data->tmp_fn) {
+    remove(data->tmp_fn);
+    free(data->tmp_fn);
+  }
 }
 
 #define zsv_echo_sqlite3_prefix "sqlite3://"
@@ -229,6 +262,10 @@ int ZSV_MAIN_FUNC(ZSV_COMMAND)(int argc, const char *argv[], struct zsv_opts *op
     const char *arg = argv[arg_i];
     if(!strcmp(arg, "-b"))
       writer_opts.with_bom = 1;
+    else if(!strcmp(arg, "--contiguous"))
+      data.contiguous = 1;
+    else if(!strcmp(arg, "--trim-columns"))
+      data.trim_columns = 1;
     else if(!strcmp(arg, "--trim"))
       data.trim_white = 1;
     else if(!strcmp(arg, "--skip-until")) {
@@ -288,10 +325,57 @@ int ZSV_MAIN_FUNC(ZSV_COMMAND)(int argc, const char *argv[], struct zsv_opts *op
     return 1;
   }
 
+  unsigned char buff[4096];
   if(data.skip_until_prefix)
     opts->row_handler = zsv_echo_row_skip_until;
-  else
+  else {
+    if(data.trim_columns) {
+      // first, save the file if it is stdin
+      if(data.in == stdin) {
+        if(!(data.tmp_fn = zsv_get_temp_filename("zsv_echo_XXXXXXXX"))) {
+          zsv_echo_cleanup(&data);
+          return 1;
+        }
+
+        FILE *f = fopen(data.tmp_fn, "wb");
+        if(!f) {
+          perror(data.tmp_fn);
+          zsv_echo_cleanup(&data);
+          return 1;
+        } else {
+          size_t bytes_read;
+          while((bytes_read = fread(buff, 1, sizeof(buff), data.in)) > 0)
+            fwrite(buff, 1, bytes_read, f);
+          fclose(f);
+          if(!(data.in = fopen(data.tmp_fn, "rb"))) {
+            perror(data.tmp_fn);
+            zsv_echo_cleanup(&data);
+            return 1;
+          }
+        }
+      }
+      // next, determine the max number of columns from the left that contains data
+      struct zsv_opts tmp_opts = *opts;
+      tmp_opts.row_handler = zsv_echo_get_max_nonempty_cols;
+      tmp_opts.stream = data.in;
+      tmp_opts.ctx = &data;
+      if(zsv_new_with_properties(&tmp_opts, custom_prop_handler, data.input_path, opts_used, &data.parser) != zsv_status_ok) {
+        zsv_echo_cleanup(&data);
+        return 1;
+      } else {
+        // find the max nonempty col count
+        enum zsv_status status;
+        while(!zsv_signal_interrupted && (status = zsv_parse_more(data.parser)) == zsv_status_ok) ;
+        zsv_finish(data.parser);
+        zsv_delete(data.parser);
+        data.parser = NULL;
+
+        fclose(data.in); // done with the first-pass handle (never stdin here: stdin was copied to tmp_fn above); without this the FILE leaks when re-opened
+        data.in = fopen(data.tmp_fn ? data.tmp_fn : data.input_path, "rb");
+      }
+    }
     opts->row_handler = zsv_echo_row;
+  }
   opts->stream = data.in;
   opts->ctx = &data;
   data.csv_writer = zsv_writer_new(&writer_opts);
@@ -314,8 +398,8 @@ int ZSV_MAIN_FUNC(ZSV_COMMAND)(int argc, const char *argv[], struct zsv_opts *op
   }
 
   // create a local csv writer buff for faster performance
-  unsigned char writer_buff[64];
-  zsv_writer_set_temp_buff(data.csv_writer, writer_buff, sizeof(writer_buff));
+  // re-use `buff` (declared above for the stdin copy) rather than a separate small writer buffer
+  zsv_writer_set_temp_buff(data.csv_writer, buff, sizeof(buff));
 
   // process the input data.
   zsv_handle_ctrl_c_signal();