Skip to content

Commit

Permalink
sheet/index: Fix building the index with a filter
Browse files Browse the repository at this point in the history
Previously the indexer was using the file offsets from the original
file when the filter was active. Instead it should use offsets in the
temporary filtered file.

The writer library doesn't record or expose the total number of bytes
written, and flushing the stream and then retrieving the offset would be
expensive and invasive. So this splits filtering and indexing into two
operations.
  • Loading branch information
richiejp committed Nov 8, 2024
1 parent 1a8e4d3 commit 98df7ec
Show file tree
Hide file tree
Showing 4 changed files with 81 additions and 24 deletions.
72 changes: 49 additions & 23 deletions app/sheet/index.c
Original file line number Diff line number Diff line change
Expand Up @@ -9,29 +9,34 @@

#include "index.h"

static void build_memory_index_row_handler(void *ctx) {
static void save_filtered_file_row_handler(void *ctx) {
struct zsvsheet_indexer *ixr = ctx;
struct zsv_index *ix = ixr->ix;
zsv_parser parser = ixr->parser;
size_t col_count = zsv_cell_count(parser);

if (ixr->filter) {
if (col_count == 0)
return;
if (col_count == 0)
return;

if (ixr->ix->header_line_end) {
struct zsv_cell first_cell = zsv_get_cell(parser, 0);
struct zsv_cell last_cell = zsv_get_cell(parser, col_count - 1);
if (ixr->seen_header) {
struct zsv_cell first_cell = zsv_get_cell(parser, 0);
struct zsv_cell last_cell = zsv_get_cell(parser, col_count - 1);

if (!memmem(first_cell.str, last_cell.str - first_cell.str + last_cell.len, ixr->filter, ixr->filter_len))
return;
}
if (!memmem(first_cell.str, last_cell.str - first_cell.str + last_cell.len, ixr->filter, ixr->filter_len))
return;
} else {
ixr->seen_header = 1;
}

for (size_t i = 0; i < col_count; i++) {
struct zsv_cell cell = zsv_get_cell(parser, i);
zsv_writer_cell(ixr->writer, i == 0, cell.str, cell.len, cell.quoted);
}
for (size_t i = 0; i < col_count; i++) {
struct zsv_cell cell = zsv_get_cell(parser, i);
zsv_writer_cell(ixr->writer, i == 0, cell.str, cell.len, cell.quoted);
}
}

static void build_memory_index_row_handler(void *ctx) {
struct zsvsheet_indexer *ixr = ctx;
struct zsv_index *ix = ixr->ix;
zsv_parser parser = ixr->parser;

if (zsv_index_add_row(ix, parser) != zsv_index_status_ok)
zsv_abort(parser);
Expand All @@ -54,32 +59,53 @@ enum zsv_index_status build_memory_index(struct zsvsheet_index_opts *optsp) {

ix_zopts.ctx = &ixr;
ix_zopts.stream = fp;
ix_zopts.row_handler = build_memory_index_row_handler;

enum zsv_status zst =
zsv_new_with_properties(&ix_zopts, optsp->custom_prop_handler, optsp->filename, optsp->opts_used, &ixr.parser);
if (zst != zsv_status_ok)
goto out;

if (optsp->row_filter) {

temp_filename = zsv_get_temp_filename("zsvsheet_filter_XXXXXXXX");
if (!temp_filename)
return ret;

*optsp->temp_filename = temp_filename;

struct zsv_csv_writer_options writer_opts = {0};
if (!(writer_opts.stream = temp_f = fopen(temp_filename, "wb")))
if (!(writer_opts.stream = temp_f = fopen(temp_filename, "w+")))
return ret;
if (!(temp_file_writer = zsv_writer_new(&writer_opts)))
goto out;

zsv_writer_set_temp_buff(temp_file_writer, temp_buff, sizeof(temp_buff));
ixr.writer = temp_file_writer;
ixr.filter_stream = temp_f;
ix_zopts.row_handler = save_filtered_file_row_handler;

enum zsv_status zst =
zsv_new_with_properties(&ix_zopts, optsp->custom_prop_handler, optsp->filename, optsp->opts_used, &ixr.parser);
if (zst != zsv_status_ok)
goto out;

while ((zst = zsv_parse_more(ixr.parser)) == zsv_status_ok)
;

if (zst != zsv_status_no_more_input)
goto out;

zsv_finish(ixr.parser);
zsv_delete(ixr.parser);
zsv_writer_delete(temp_file_writer);
temp_file_writer = NULL;
if (fseek(temp_f, 0, SEEK_SET))
goto out;

ix_zopts.stream = temp_f;
}

ix_zopts.row_handler = build_memory_index_row_handler;

enum zsv_status zst =
zsv_new_with_properties(&ix_zopts, optsp->custom_prop_handler, optsp->filename, optsp->opts_used, &ixr.parser);
if (zst != zsv_status_ok)
goto out;

ixr.ix = zsv_index_new();
if (!ixr.ix)
goto out;
Expand Down
1 change: 1 addition & 0 deletions app/sheet/index.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ struct zsvsheet_indexer {
size_t filter_len;
zsv_csv_writer writer;
FILE *filter_stream;
char seen_header;
};

struct zsvsheet_index_opts {
Expand Down
27 changes: 26 additions & 1 deletion app/test/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -579,7 +579,7 @@ test-compare: test-%: ${BUILD_DIR}/bin/zsv_%${EXE}

test-sheet: test-%: ${BUILD_DIR}/bin/zsv_%${EXE} worldcitiespop_mil.csv test-sheet-all

test-sheet-all: test-sheet-1 test-sheet-2 test-sheet-3 test-sheet-4 test-sheet-5 test-sheet-6 test-sheet-7
test-sheet-all: test-sheet-1 test-sheet-2 test-sheet-3 test-sheet-4 test-sheet-5 test-sheet-6 test-sheet-7 test-sheet-8
@(for SESSION in $^; do ! tmux kill-session -t "$$SESSION" 2>/dev/null; done && ${TEST_PASS} || ${TEST_FAIL})

test-sheet-1: ${BUILD_DIR}/bin/zsv_sheet${EXE}
Expand Down Expand Up @@ -649,3 +649,28 @@ test-sheet-7: ${BUILD_DIR}/bin/zsv_sheet${EXE}
tmux capture-pane -t $@ -p ${REDIRECT1} ${TMP_DIR}/$@.out && \
tmux send-keys -t $@ "q" && \
${CMP} ${TMP_DIR}/$@.out expected/$@.out && ${TEST_PASS} || ${TEST_FAIL})

# test-sheet-7: starts zsv_sheet with "-d 3" on test/mixed-line-endings.csv in a
# detached 80x5 tmux session named after the target, sends the keys
# G, g, g, C-u, /, "1234", Enter, captures the pane to ${TMP_DIR}/test-sheet-7.out,
# sends "q", and compares the capture with expected/test-sheet-7.out.
# Side effect: the echo below overwrites ~/.tmux.conf.
# NOTE(review): a target named test-sheet-7 already appears immediately before
# this one in this Makefile; this looks like a duplicate definition -- confirm
# whether it is intentional or should be removed.
test-sheet-7: ${BUILD_DIR}/bin/zsv_sheet${EXE}
@${TEST_INIT}
@echo 'set-option default-terminal "tmux-256color"' > ~/.tmux.conf
@(tmux new-session -x 80 -y 5 -d -s $@ "${PREFIX} $< -d 3 ${TEST_DATA_DIR}/test/mixed-line-endings.csv" && \
sleep 0.5 && \
tmux send-keys -t $@ "G" "g" "g" "C-u" "/" "1234" "Enter" && \
sleep 0.5 && \
tmux capture-pane -t $@ -p ${REDIRECT1} ${TMP_DIR}/$@.out && \
tmux send-keys -t $@ "q" && \
${CMP} ${TMP_DIR}/$@.out expected/$@.out && ${TEST_PASS} || ${TEST_FAIL})

# test-sheet-8: starts zsv_sheet on worldcitiespop_mil.csv in a detached 160x5
# tmux session named after the target, sends the keys f, e, Enter, waits, then
# sends G, C-u, k, captures the pane to ${TMP_DIR}/test-sheet-8.out, sends "q",
# and compares the capture with expected/test-sheet-8.out.
# Presumably "f" opens the row-filter prompt and "e" is the filter text, so the
# test exercises navigation near the end of a filtered view -- confirm against
# the zsv_sheet key bindings.
# Side effect: the echo below overwrites ~/.tmux.conf.
test-sheet-8: ${BUILD_DIR}/bin/zsv_sheet${EXE}
@${TEST_INIT}
@echo 'set-option default-terminal "tmux-256color"' > ~/.tmux.conf
@(tmux new-session -x 160 -y 5 -d -s $@ "${PREFIX} $< worldcitiespop_mil.csv" && \
sleep 0.5 && \
tmux send-keys -t $@ "f" "e" "Enter" && \
sleep 0.5 && \
tmux send-keys -t $@ "G" "C-u" "k" && \
sleep 0.5 && \
tmux capture-pane -t $@ -p ${REDIRECT1} ${TMP_DIR}/$@.out && \
tmux send-keys -t $@ "q" && \
${CMP} ${TMP_DIR}/$@.out expected/$@.out && ${TEST_PASS} || ${TEST_FAIL})

5 changes: 5 additions & 0 deletions app/test/expected/test-sheet-8.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
Row # Country City AccentCity Region Population Latitude Longitude
493034 gb ruthven Ruthven V3 57.066667 -4.033333
493035 gb rutlandshire Rutlandshire L4 52.666667 -.666667
493036 gb rutupiae Rutupiæ G5 51.283333 1.333333
(493039 filtered rows) 493035

0 comments on commit 98df7ec

Please sign in to comment.