From 1a770f420291996f56a3f28fea3f40563f424d20 Mon Sep 17 00:00:00 2001 From: Richard Palethorpe Date: Fri, 1 Nov 2024 13:00:13 +0000 Subject: [PATCH] Rework temporary filter file into the index --- app/sheet.c | 11 +++-- app/sheet/index.c | 108 +++++++++++++++++++++++++++++++----------- app/sheet/index.h | 10 +++- app/sheet/read-data.c | 91 ++++++++++++----------------------- 4 files changed, 124 insertions(+), 96 deletions(-) diff --git a/app/sheet.c b/app/sheet.c index 0b037871..3be90f67 100644 --- a/app/sheet.c +++ b/app/sheet.c @@ -62,8 +62,8 @@ struct zsvsheet_opts { #include "sheet/read-data.c" #include "sheet/key-bindings.c" -static void display_buffer_subtable(struct zsvsheet_ui_buffer *ui_buffer, size_t rownum_col_offset, size_t input_header_span, - struct zsvsheet_display_dimensions *ddims); +static void display_buffer_subtable(struct zsvsheet_ui_buffer *ui_buffer, size_t rownum_col_offset, + size_t input_header_span, struct zsvsheet_display_dimensions *ddims); struct zsvsheet_display_info { int update_buffer; @@ -609,7 +609,8 @@ int ZSV_MAIN_FUNC(ZSV_COMMAND)(int argc, const char *argv[], struct zsv_opts *op handler_state.display_info.update_buffer = false; pthread_mutex_lock(&current_ui_buffer->mutex); - if (current_ui_buffer->index_ready && current_ui_buffer->dimensions.row_count != current_ui_buffer->index->row_count) { + if (current_ui_buffer->index_ready && + current_ui_buffer->dimensions.row_count != current_ui_buffer->index->row_count) { current_ui_buffer->dimensions.row_count = current_ui_buffer->index->row_count; handler_state.display_info.update_buffer = true; } @@ -683,8 +684,8 @@ const char *display_cell(struct zsvsheet_buffer *buff, size_t data_row, size_t d return str; } -static void display_buffer_subtable(struct zsvsheet_ui_buffer *ui_buffer, size_t rownum_col_offset, size_t input_header_span, - struct zsvsheet_display_dimensions *ddims) { +static void display_buffer_subtable(struct zsvsheet_ui_buffer *ui_buffer, size_t rownum_col_offset, + 
size_t input_header_span, struct zsvsheet_display_dimensions *ddims) { struct zsvsheet_buffer *buffer = ui_buffer->buffer; size_t start_row = ui_buffer->buff_offset.row; size_t buffer_used_row_count = ui_buffer->buff_used_rows; diff --git a/app/sheet/index.c b/app/sheet/index.c index 93c65d7b..45406237 100644 --- a/app/sheet/index.c +++ b/app/sheet/index.c @@ -5,16 +5,18 @@ #include #include "index.h" +#include "zsv/utils/file.h" +#include "zsv/utils/writer.h" static struct zsvsheet_index *add_line_end(struct zsvsheet_index *ix, uint64_t end) { size_t len = ix->line_end_len, cap = ix->line_end_capacity; - + if (len >= cap) { cap *= 2; - ix = realloc(ix, sizeof(*ix) + cap*sizeof(ix->line_ends[0])); + ix = realloc(ix, sizeof(*ix) + cap * sizeof(ix->line_ends[0])); if (!ix) return NULL; - + ix->line_end_capacity = cap; } @@ -28,74 +30,126 @@ static void build_memory_index_row_handler(void *ctx) { struct zsvsheet_indexer *ixr = ctx; struct zsvsheet_index *ix = ixr->ix; uint64_t line_end = zsv_cum_scanned_length(ixr->parser) + 1; - - if(!ixr->ix->header_line_end) { + size_t col_count = zsv_cell_count(ixr->parser); + + if (ixr->filter) { + if (col_count == 0) + return; + + if (ixr->ix->header_line_end) { + struct zsv_cell first_cell = zsv_get_cell(ixr->parser, 0); + struct zsv_cell last_cell = zsv_get_cell(ixr->parser, col_count - 1); + + if (!memmem(first_cell.str, last_cell.str - first_cell.str + last_cell.len, ixr->filter, ixr->filter_len)) + return; + } + + for (size_t i = 0; i < col_count; i++) { + struct zsv_cell cell = zsv_get_cell(ixr->parser, i); + zsv_writer_cell(ixr->writer, i == 0, cell.str, cell.len, cell.quoted); + } + } + + if (!ixr->ix->header_line_end) { ix->header_line_end = line_end; - } else if((ix->row_count & ((1 << LINE_END_SHIFT) - 1)) == 0) { + } else if ((ix->row_count & ((1 << LINE_END_SHIFT) - 1)) == 0) { + if (ixr->filter) { + if (zsv_writer_flush(ixr->writer) != zsv_writer_status_ok) { + zsv_abort(ixr->parser); + return; + } + line_end = 
ftell(ixr->filter_stream); + } + ix = add_line_end(ix, line_end); if (!ix) { zsv_abort(ixr->parser); return; } - + ixr->ix = ix; } ix->row_count++; } -enum zsvsheet_index_status build_memory_index(struct zsvsheet_index_opts *optsp, struct zsvsheet_index **index_out) { - struct zsvsheet_indexer ixr = {0}; +enum zsvsheet_index_status build_memory_index(struct zsvsheet_index_opts *optsp) { + struct zsvsheet_indexer ixr = { + .filter = optsp->row_filter, + .filter_len = optsp->row_filter ? strlen(optsp->row_filter) : 0, + }; enum zsvsheet_index_status ret = zsvsheet_index_status_error; struct zsv_opts *zopts = optsp->zsv_optsp; struct zsv_opts ix_zopts = {0}; + char *temp_filename; + FILE *temp_f = NULL; memcpy(&ix_zopts, zopts, sizeof(ix_zopts)); - + FILE *fp = fopen(optsp->filename, "rb"); if (!fp) - goto close_file; + return ret; ix_zopts.ctx = &ixr; ix_zopts.stream = fp; ix_zopts.row_handler = build_memory_index_row_handler; - enum zsv_status zst = zsv_new_with_properties(&ix_zopts, optsp->custom_prop_handler, - optsp->filename, optsp->opts_used, - &ixr.parser); + enum zsv_status zst = + zsv_new_with_properties(&ix_zopts, optsp->custom_prop_handler, optsp->filename, optsp->opts_used, &ixr.parser); if (zst != zsv_status_ok) - goto free_parser; - + goto out; + + if (optsp->row_filter) { + zsv_csv_writer temp_file_writer = NULL; + unsigned char temp_buff[8196]; + + temp_filename = zsv_get_temp_filename("zsvsheet_filter_XXXXXXXX"); + if (!temp_filename) + return ret; + + *optsp->temp_filename = temp_filename; + + struct zsv_csv_writer_options writer_opts = {0}; + if (!(writer_opts.stream = temp_f = fopen(temp_filename, "wb"))) + return ret; + if (!(temp_file_writer = zsv_writer_new(&writer_opts))) + goto out; + + zsv_writer_set_temp_buff(temp_file_writer, temp_buff, sizeof(temp_buff)); + ixr.writer = temp_file_writer; + ixr.filter_stream = temp_f; + } + const size_t initial_cap = 256; ixr.ix = malloc(sizeof(*ixr.ix) + initial_cap * sizeof(size_t)); if (!ixr.ix) - goto 
free_parser; + goto out; memset(ixr.ix, 0, sizeof(*ixr.ix)); ixr.ix->line_end_capacity = initial_cap; - - while ((zst = zsv_parse_more(ixr.parser)) == zsv_status_ok) - ; + + while ((zst = zsv_parse_more(ixr.parser)) == zsv_status_ok) + ; zsv_finish(ixr.parser); - - if (zst == zsv_status_no_more_input) { + + if (zst == zsv_status_no_more_input) { ret = zsvsheet_index_status_ok; - *index_out = ixr.ix; + *optsp->index = ixr.ix; } else free(ixr.ix); -free_parser: +out: zsv_delete(ixr.parser); - -close_file: fclose(fp); + if (temp_f) + fclose(temp_f); return ret; } void get_memory_index(struct zsvsheet_index *ix, uint64_t row, off_t *offset_out, size_t *remaining_rows_out) { const size_t i = row >> LINE_END_SHIFT; - + *offset_out = (off_t)ix->line_ends[i]; *remaining_rows_out = row & ((1 << LINE_END_SHIFT) - 1); } diff --git a/app/sheet/index.h b/app/sheet/index.h index 7dffd357..559cb98d 100644 --- a/app/sheet/index.h +++ b/app/sheet/index.h @@ -6,13 +6,14 @@ #include #include "zsv.h" +#include "zsv/utils/writer.h" // Decides the number of rows we skip when storing the line end // 1 << 10 = 1024 means that we store every 1024th line end #define LINE_END_SHIFT 10 #define LINE_END_N (1 << LINE_END_SHIFT) -enum zsvsheet_index_status{ +enum zsvsheet_index_status { zsvsheet_index_status_ok = 0, zsvsheet_index_status_memory, zsvsheet_index_status_error, @@ -30,11 +31,16 @@ struct zsvsheet_index { struct zsvsheet_indexer { zsv_parser parser; struct zsvsheet_index *ix; + const char *filter; + size_t filter_len; + zsv_csv_writer writer; + FILE *filter_stream; }; struct zsvsheet_index_opts { pthread_mutex_t *mutexp; const char *filename; + char **temp_filename; const char *row_filter; struct zsv_opts *zsv_optsp; struct zsvsheet_index **index; @@ -44,7 +50,7 @@ struct zsvsheet_index_opts { const char *opts_used; }; -enum zsvsheet_index_status build_memory_index(struct zsvsheet_index_opts *optsp, struct zsvsheet_index **index_out); +enum zsvsheet_index_status 
build_memory_index(struct zsvsheet_index_opts *optsp); void get_memory_index(struct zsvsheet_index *ix, uint64_t row, off_t *offset_out, size_t *remaining_rows_out); #endif diff --git a/app/sheet/read-data.c b/app/sheet/read-data.c index b55ac565..e2c4f410 100644 --- a/app/sheet/read-data.c +++ b/app/sheet/read-data.c @@ -25,12 +25,13 @@ static char *zsvsheet_found_in_row(zsv_parser parser, size_t col_count, const ch static void *get_data_index(void *d); -static void get_data_index_async(struct zsvsheet_ui_buffer *uibuffp, const char *filename, struct zsv_opts *optsp, const char *row_filter, - struct zsv_prop_handler *custom_prop_handler, const char *opts_used , pthread_mutex_t *mutexp -) { +static void get_data_index_async(struct zsvsheet_ui_buffer *uibuffp, const char *filename, struct zsv_opts *optsp, + const char *row_filter, struct zsv_prop_handler *custom_prop_handler, + const char *opts_used, pthread_mutex_t *mutexp) { struct zsvsheet_index_opts *gdi = calloc(1, sizeof(*gdi)); gdi->mutexp = mutexp; gdi->filename = filename; + gdi->temp_filename = &uibuffp->temp_filename; gdi->zsv_optsp = optsp; gdi->row_filter = row_filter; gdi->index = &uibuffp->index; @@ -44,9 +45,8 @@ static void get_data_index_async(struct zsvsheet_ui_buffer *uibuffp, const char static int read_data(struct zsvsheet_ui_buffer **uibufferp, // a new zsvsheet_ui_buffer will be allocated struct zsvsheet_ui_buffer_opts *uibopts, // if *uibufferp == NULL and uibopts != NULL - size_t start_row, size_t start_col, size_t header_span, - struct zsvsheet_opts *zsvsheet_opts, struct zsv_prop_handler *custom_prop_handler, - const char *opts_used) { + size_t start_row, size_t start_col, size_t header_span, struct zsvsheet_opts *zsvsheet_opts, + struct zsv_prop_handler *custom_prop_handler, const char *opts_used) { (void)(index); // to do const char *filename = (uibufferp && *uibufferp) ? (*uibufferp)->filename : uibopts ? 
uibopts->filename : NULL; struct zsv_opts opts = {0}; @@ -58,16 +58,28 @@ static int read_data(struct zsvsheet_ui_buffer **uibufferp, // a new zsvsheet_ size_t remaining_rows_to_skip = start_row; size_t remaining_header_to_skip = header_span; size_t original_row_num = 0; - + const char *row_filter = uibuff ? uibuff->row_filter : NULL; + size_t row_filter_len = row_filter ? strlen(row_filter) : 0; + FILE *fp; + assert(filename != NULL); - FILE *fp = fopen(filename, "rb"); + fp = fopen(filename, "rb"); if (!fp) return errno; opts.stream = fp; // Input file stream - + if (uibuff) { pthread_mutex_lock(&uibuff->mutex); + if (uibuff->index_ready && row_filter) { + fclose(fp); + fp = fopen(uibuff->temp_filename, "rb"); + if (!fp) { + pthread_mutex_unlock(&uibuff->mutex); + return errno; + } + } + if (uibuff->index_ready && start_row > LINE_END_N) { off_t file_start; get_memory_index(uibuff->index, start_row - LINE_END_N, &file_start, &remaining_rows_to_skip); @@ -76,9 +88,12 @@ static int read_data(struct zsvsheet_ui_buffer **uibufferp, // a new zsvsheet_ return errno; } - if (!isspace(fgetc(fp))) { - if (fseek(fp, file_start, SEEK_SET)) - return errno; + // original csv files can have two char line endings + if (!row_filter) { + if (!isspace(fgetc(fp))) { + if (fseek(fp, file_start, SEEK_SET)) + return errno; + } } remaining_header_to_skip = 0; @@ -98,12 +113,7 @@ static int read_data(struct zsvsheet_ui_buffer **uibufferp, // a new zsvsheet_ size_t find_len = zsvsheet_opts->find ? strlen(zsvsheet_opts->find) : 0; size_t rows_searched = 0; - const char *row_filter = uibuff ? uibuff->row_filter : NULL; - size_t row_filter_len = row_filter ? strlen(row_filter) : 0; zsvsheet_buffer_t buffer = uibuff ? 
uibuff->buffer : NULL; - FILE *temp_f = NULL; - unsigned char temp_buff[4096]; - zsv_csv_writer temp_file_writer = NULL; while (zsv_next_row(parser) == zsv_status_row && (rows_read == 0 || rows_read < zsvsheet_buffer_rows(buffer))) { // for each row @@ -175,59 +185,18 @@ static int read_data(struct zsvsheet_ui_buffer **uibufferp, // a new zsvsheet_ zsvsheet_buffer_write_cell_w_len(buffer, rows_read, i + rownum_column_offset, c.str, c.len); } - // if we have a row filter, write it to a temp file - // later if needed this could be optimized in general and where the filtered data is small enough to not need - // indexing - if (row_filter) { - if (rows_read == 0 && uibuff != NULL && uibuff->temp_filename == NULL) { - uibuff->temp_filename = zsv_get_temp_filename("zsvsheet_filter_XXXXXXXX"); - if (!uibuff->temp_filename) - ; // to do: handle out-of-memory error - else { - struct zsv_csv_writer_options writer_opts = {0}; - if (!(writer_opts.stream = temp_f = fopen(uibuff->temp_filename, "wb"))) - ; // to do: handle fopen error - else if (!(temp_file_writer = zsv_writer_new(&writer_opts))) - ; // to do: handle zsv_writer_new error - else { - zsv_writer_set_temp_buff(temp_file_writer, temp_buff, sizeof(temp_buff)); - } - } - } - if (temp_file_writer) { - for (size_t i = 0; i < col_count; i++) { - struct zsv_cell cell = zsv_get_cell(parser, i); - zsv_writer_cell(temp_file_writer, i == 0, cell.str, cell.len, cell.quoted); - } - } - } rows_read++; } fclose(fp); zsv_delete(parser); - if (temp_file_writer) { // finish writing the filtered data to temp file - // to do: do this in a separate thread - while (zsv_next_row(parser) == zsv_status_row) { - size_t col_count = zsv_cell_count(parser); - for (size_t i = 0; i < col_count; i++) { - struct zsv_cell cell = zsv_get_cell(parser, i); - zsv_writer_cell(temp_file_writer, i == 0, cell.str, cell.len, cell.quoted); - } - } - zsv_writer_delete(temp_file_writer); - } - if (temp_f) - fclose(temp_f); - if (uibuff) { if 
(!uibuff->index_started) { uibuff->buff_used_rows = rows_read; uibuff->index_started = 1; if (original_row_num > 1 && (row_filter == NULL || rows_read > 0)) { opts.stream = NULL; - get_data_index_async(uibuff, filename, &opts, row_filter, - custom_prop_handler, opts_used , &uibuff->mutex); + get_data_index_async(uibuff, filename, &opts, row_filter, custom_prop_handler, opts_used, &uibuff->mutex); } } @@ -255,8 +224,7 @@ static void *get_data_index(void *gdi) { pthread_mutex_t *mutexp = d->mutexp; int *errp = d->errp; - struct zsvsheet_index *ix; - enum zsvsheet_index_status ix_status = build_memory_index(d, &ix); + enum zsvsheet_index_status ix_status = build_memory_index(d); if (ix_status != zsvsheet_index_status_ok) { pthread_mutex_lock(mutexp); @@ -267,7 +235,6 @@ static void *get_data_index(void *gdi) { } pthread_mutex_lock(mutexp); - *d->index = ix; *d->index_ready = 1; free(d); pthread_mutex_unlock(mutexp);