Skip to content

Commit

Permalink
sheet/index: Fix building the index with a filter
Browse files Browse the repository at this point in the history
Previously, when a filter was active, the indexer recorded file offsets
from the original file. Instead, it should record offsets in the
temporary filtered file.

The writer library doesn't record or expose the total number of bytes
written, and flushing to the stream and then retrieving the offset would
be expensive and invasive. So this splits filtering and indexing into two
separate operations.
  • Loading branch information
richiejp committed Nov 8, 2024
1 parent 1a8e4d3 commit 7c54f2f
Show file tree
Hide file tree
Showing 3 changed files with 75 additions and 23 deletions.
72 changes: 49 additions & 23 deletions app/sheet/index.c
Original file line number Diff line number Diff line change
Expand Up @@ -9,29 +9,34 @@

#include "index.h"

static void build_memory_index_row_handler(void *ctx) {
static void save_filtered_file_row_handler(void *ctx) {
struct zsvsheet_indexer *ixr = ctx;
struct zsv_index *ix = ixr->ix;
zsv_parser parser = ixr->parser;
size_t col_count = zsv_cell_count(parser);

if (ixr->filter) {
if (col_count == 0)
return;
if (col_count == 0)
return;

if (ixr->ix->header_line_end) {
struct zsv_cell first_cell = zsv_get_cell(parser, 0);
struct zsv_cell last_cell = zsv_get_cell(parser, col_count - 1);
if (ixr->seen_header) {
struct zsv_cell first_cell = zsv_get_cell(parser, 0);
struct zsv_cell last_cell = zsv_get_cell(parser, col_count - 1);

if (!memmem(first_cell.str, last_cell.str - first_cell.str + last_cell.len, ixr->filter, ixr->filter_len))
return;
}
if (!memmem(first_cell.str, last_cell.str - first_cell.str + last_cell.len, ixr->filter, ixr->filter_len))
return;
} else {
ixr->seen_header = 1;
}

for (size_t i = 0; i < col_count; i++) {
struct zsv_cell cell = zsv_get_cell(parser, i);
zsv_writer_cell(ixr->writer, i == 0, cell.str, cell.len, cell.quoted);
}
for (size_t i = 0; i < col_count; i++) {
struct zsv_cell cell = zsv_get_cell(parser, i);
zsv_writer_cell(ixr->writer, i == 0, cell.str, cell.len, cell.quoted);
}
}

static void build_memory_index_row_handler(void *ctx) {
struct zsvsheet_indexer *ixr = ctx;
struct zsv_index *ix = ixr->ix;
zsv_parser parser = ixr->parser;

if (zsv_index_add_row(ix, parser) != zsv_index_status_ok)
zsv_abort(parser);
Expand All @@ -54,32 +59,53 @@ enum zsv_index_status build_memory_index(struct zsvsheet_index_opts *optsp) {

ix_zopts.ctx = &ixr;
ix_zopts.stream = fp;
ix_zopts.row_handler = build_memory_index_row_handler;

enum zsv_status zst =
zsv_new_with_properties(&ix_zopts, optsp->custom_prop_handler, optsp->filename, optsp->opts_used, &ixr.parser);
if (zst != zsv_status_ok)
goto out;

if (optsp->row_filter) {

temp_filename = zsv_get_temp_filename("zsvsheet_filter_XXXXXXXX");
if (!temp_filename)
return ret;

*optsp->temp_filename = temp_filename;

struct zsv_csv_writer_options writer_opts = {0};
if (!(writer_opts.stream = temp_f = fopen(temp_filename, "wb")))
if (!(writer_opts.stream = temp_f = fopen(temp_filename, "w+")))
return ret;
if (!(temp_file_writer = zsv_writer_new(&writer_opts)))
goto out;

zsv_writer_set_temp_buff(temp_file_writer, temp_buff, sizeof(temp_buff));
ixr.writer = temp_file_writer;
ixr.filter_stream = temp_f;
ix_zopts.row_handler = save_filtered_file_row_handler;

enum zsv_status zst =
zsv_new_with_properties(&ix_zopts, optsp->custom_prop_handler, optsp->filename, optsp->opts_used, &ixr.parser);
if (zst != zsv_status_ok)
goto out;

while ((zst = zsv_parse_more(ixr.parser)) == zsv_status_ok)
;

if (zst != zsv_status_no_more_input)
goto out;

zsv_finish(ixr.parser);
zsv_delete(ixr.parser);
zsv_writer_delete(temp_file_writer);
temp_file_writer = NULL;
if (fseek(temp_f, 0, SEEK_SET))
goto out;

ix_zopts.stream = temp_f;
}

ix_zopts.row_handler = build_memory_index_row_handler;

enum zsv_status zst =
zsv_new_with_properties(&ix_zopts, optsp->custom_prop_handler, optsp->filename, optsp->opts_used, &ixr.parser);
if (zst != zsv_status_ok)
goto out;

ixr.ix = zsv_index_new();
if (!ixr.ix)
goto out;
Expand Down
1 change: 1 addition & 0 deletions app/sheet/index.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ struct zsvsheet_indexer {
size_t filter_len;
zsv_csv_writer writer;
FILE *filter_stream;
char seen_header;
};

struct zsvsheet_index_opts {
Expand Down
25 changes: 25 additions & 0 deletions app/test/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -649,3 +649,28 @@ test-sheet-7: ${BUILD_DIR}/bin/zsv_sheet${EXE}
tmux capture-pane -t $@ -p ${REDIRECT1} ${TMP_DIR}/$@.out && \
tmux send-keys -t $@ "q" && \
${CMP} ${TMP_DIR}/$@.out expected/$@.out && ${TEST_PASS} || ${TEST_FAIL})

# test-sheet-7: runs zsv_sheet (with `-d 3`) on mixed-line-endings.csv inside a
# detached 80x5 tmux session named after the target ($@), sends the key
# sequence G g g C-u / 1234 Enter, captures the pane to ${TMP_DIR}/$@.out,
# sends "q", and compares the capture against expected/$@.out. ${TEST_PASS}
# runs only if every step in the && chain succeeds; otherwise ${TEST_FAIL} runs.
# Side effect: overwrites ~/.tmux.conf with a single default-terminal setting.
# NOTE(review): the recipe immediately above this one is already labelled
# test-sheet-7, so this looks like a second definition of the same target -
# confirm whether a different target name was intended.
test-sheet-7: ${BUILD_DIR}/bin/zsv_sheet${EXE}
@${TEST_INIT}
@echo 'set-option default-terminal "tmux-256color"' > ~/.tmux.conf
@(tmux new-session -x 80 -y 5 -d -s $@ "${PREFIX} $< -d 3 ${TEST_DATA_DIR}/test/mixed-line-endings.csv" && \
sleep 0.5 && \
tmux send-keys -t $@ "G" "g" "g" "C-u" "/" "1234" "Enter" && \
sleep 0.5 && \
tmux capture-pane -t $@ -p ${REDIRECT1} ${TMP_DIR}/$@.out && \
tmux send-keys -t $@ "q" && \
${CMP} ${TMP_DIR}/$@.out expected/$@.out && ${TEST_PASS} || ${TEST_FAIL})

# test-sheet-8: runs zsv_sheet on worldcitiespop_mil.csv inside a detached
# 160x5 tmux session named after the target ($@). It sends "f" "e" Enter
# (presumably applies a row filter on "e" - confirm against the zsv_sheet key
# bindings), waits, then sends G C-u k, captures the pane to
# ${TMP_DIR}/$@.out, sends "q", and compares the capture against
# expected/$@.out. ${TEST_PASS} runs only if every step in the && chain
# succeeds; otherwise ${TEST_FAIL} runs.
# Side effect: overwrites ~/.tmux.conf with a single default-terminal setting.
# NOTE(review): the input file is given without a ${TEST_DATA_DIR} prefix -
# verify that it is present where the command is run.
test-sheet-8: ${BUILD_DIR}/bin/zsv_sheet${EXE}
@${TEST_INIT}
@echo 'set-option default-terminal "tmux-256color"' > ~/.tmux.conf
@(tmux new-session -x 160 -y 5 -d -s $@ "${PREFIX} $< worldcitiespop_mil.csv" && \
sleep 0.5 && \
tmux send-keys -t $@ "f" "e" "Enter" && \
sleep 0.5 && \
tmux send-keys -t $@ "G" "C-u" "k" && \
sleep 0.5 && \
tmux capture-pane -t $@ -p ${REDIRECT1} ${TMP_DIR}/$@.out && \
tmux send-keys -t $@ "q" && \
${CMP} ${TMP_DIR}/$@.out expected/$@.out && ${TEST_PASS} || ${TEST_FAIL})

0 comments on commit 7c54f2f

Please sign in to comment.