Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] Create in-memory index for sheet command #254

Merged
merged 22 commits into from
Nov 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -63,3 +63,5 @@ tmp
include/zsv.h
app/test/worldcitiespop_mil.csv
data/quoted5.csv
compile_commands.json
.cache
9 changes: 7 additions & 2 deletions app/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -245,7 +245,7 @@ CFLAGS+=${YAJL_HELPER_INCLUDE} ${YAJL_INCLUDE}

# jq
JQ_TARBALL=${THIS_MAKEFILE_DIR}/external/jq-1.6.tar.gz
JQ_SRC=${BUILD_DIR}/external/jq-src
JQ_SRC=${BUILD_DIR}-external/jq-src

JQ_PREFIX ?=
ifeq ($(JQ_PREFIX),)
Expand Down Expand Up @@ -408,7 +408,12 @@ lib-jq: ${JQ_LIB}
@echo "Using jq library ${JQ_LIB}"

${JQ_BUNDLE_LIB}: ${JQ_SRC} # -D_REENTRANT needed for clang to not break
cd ${JQ_SRC} \
@if [ -e ${JQ_SRC}-conf ]; then \
rm -rf ${JQ_SRC}-conf; \
fi
# copy the directory because its timestamp gets updated after libjq is built
@cp -a ${JQ_SRC} ${JQ_SRC}-conf
cd ${JQ_SRC}-conf \
&& CC="${CC}" CFLAGS="${CFLAGS} -D_REENTRANT" ./configure \
--prefix="${JQ_PREFIX}" \
--disable-maintainer-mode \
Expand Down
56 changes: 21 additions & 35 deletions app/sheet.c
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,7 @@

#include <locale.h>
#include <wchar.h>
#ifdef ZSVSHEET_USE_THREADS
#include <pthread.h>
#endif

#define ZSV_COMMAND sheet
#include "zsv_command.h"
Expand All @@ -58,6 +56,7 @@ struct zsvsheet_opts {

#include "sheet/utf8-width.c"
#include "sheet/ui_buffer.c"
#include "sheet/index.c"
#include "sheet/read-data.c"
#include "sheet/key-bindings.c"

Expand Down Expand Up @@ -90,7 +89,7 @@ struct zsvsheet_builtin_proc_state {
const char *opts_used;
};

int get_subcommand(const char *prompt, char *buff, size_t buffsize, int footer_row) {
static void get_subcommand(const char *prompt, char *buff, size_t buffsize, int footer_row) {
*buff = '\0';
// this is a hack to blank-out the currently-selected cell value
int max_screen_width = 256; // to do: don't assume this
Expand Down Expand Up @@ -126,7 +125,6 @@ int get_subcommand(const char *prompt, char *buff, size_t buffsize, int footer_r
}
// Ignore other keys
}
return 0;
}

zsvsheet_handler_status zsvsheet_ext_prompt(struct zsvsheet_proc_context *ctx, char *buffer, size_t bufsz,
Expand Down Expand Up @@ -209,36 +207,6 @@ void zsvsheet_set_status(const struct zsvsheet_display_dimensions *ddims, int ov
#include "sheet/file.c"
#include "sheet/usage.c"

enum zsvsheet_status zsvsheet_key_handler(struct zsvsheet_key_handler_data *khd, int ch, char *cmdbuff,
size_t cmdbuff_sz, // subcommand buffer
struct zsvsheet_ui_buffer **base_ui_buffer,
struct zsvsheet_ui_buffer **current_ui_buffer,
const struct zsvsheet_display_dimensions *display_dims,
struct zsv_prop_handler *custom_prop_handler, const char *opts_used) {
struct zsvsheet_subcommand_handler_context sctx = {0};
sctx.ch = ch;
sctx.ui_buffers = *base_ui_buffer;
sctx.current_ui_buffer = *current_ui_buffer;
zsvsheet_handler_status status = khd->subcommand_handler(&sctx);
if (status == zsvsheet_handler_status_ok) {
get_subcommand(sctx.prompt, cmdbuff, cmdbuff_sz, (int)(display_dims->rows - display_dims->footer_span));
if (*cmdbuff == '\0')
return zsvsheet_status_continue;
struct zsvsheet_handler_context ctx = {.subcommand_value = cmdbuff,
.ch = ch,
.ui_buffers = {.base = base_ui_buffer, .current = current_ui_buffer},
.display_dims = display_dims,
.custom_prop_handler = custom_prop_handler,
.opts_used = opts_used};
status = khd->handler(&ctx);
}
if (status == zsvsheet_handler_status_ok)
return zsvsheet_status_ok;
if (status == zsvsheet_handler_status_ignore)
return zsvsheet_status_continue;
return zsvsheet_status_error;
}

struct zsvsheet_key_handler_data *zsvsheet_key_handlers = NULL;
struct zsvsheet_key_handler_data **zsvsheet_next_key_handler = &zsvsheet_key_handlers;

Expand Down Expand Up @@ -639,10 +607,28 @@ int ZSV_MAIN_FUNC(ZSV_COMMAND)(int argc, const char *argv[], struct zsv_opts *op

while (true) {
ch = getch();
if (ch == ERR) {
pthread_mutex_lock(&current_ui_buffer->mutex);
if (current_ui_buffer && current_ui_buffer->status) {
zsvsheet_set_status(&display_dims, 1, current_ui_buffer->status);
display_buffer_subtable(current_ui_buffer, header_span, &display_dims);
cbreak();
}
pthread_mutex_unlock(&current_ui_buffer->mutex);
continue;
}

zsvsheet_set_status(&display_dims, 1, "");
handler_state.display_info.update_buffer = false;

pthread_mutex_lock(&current_ui_buffer->mutex);
if (current_ui_buffer->index_ready &&
current_ui_buffer->dimensions.row_count != current_ui_buffer->index->row_count) {
current_ui_buffer->dimensions.row_count = current_ui_buffer->index->row_count;
handler_state.display_info.update_buffer = true;
}
pthread_mutex_unlock(&current_ui_buffer->mutex);

status = zsvsheet_key_press(ch, &handler_state);
if (status == zsvsheet_handler_status_exit)
break;
Expand All @@ -652,7 +638,7 @@ int ZSV_MAIN_FUNC(ZSV_COMMAND)(int argc, const char *argv[], struct zsv_opts *op
if (handler_state.display_info.update_buffer && current_ui_buffer->filename) {
struct zsvsheet_opts zsvsheet_opts = {0};
if (read_data(&current_ui_buffer, NULL, current_ui_buffer->input_offset.row, current_ui_buffer->input_offset.col,
header_span, current_ui_buffer->dimensions.index, &zsvsheet_opts, custom_prop_handler, opts_used)) {
header_span, &zsvsheet_opts, custom_prop_handler, opts_used)) {
zsvsheet_set_status(&display_dims, 1, "Unexpected error!"); // to do: better error message
continue;
}
Expand Down
2 changes: 1 addition & 1 deletion app/sheet/buffer.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

#define ZSVSHEET_BUFFER_DEFAULT_CELL_BUFF_LEN 16
#define ZSVSHEET_BUFFER_DEFAULT_MAX_CELL_LEN 32768 - 1
#define ZSVSHEET_BUFFER_DEFAULT_ROW_COUNT 1000
#define ZSVSHEET_BUFFER_DEFAULT_ROW_COUNT 1024

typedef struct zsvsheet_buffer *zsvsheet_buffer_t;

Expand Down
2 changes: 1 addition & 1 deletion app/sheet/file.c
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ int zsvsheet_ui_buffer_open_file(const char *filename, const struct zsv_opts *zs
int err = 0;
struct zsvsheet_ui_buffer *tmp_ui_buffer = NULL;
uibopts.row_filter = row_filter;
if ((err = read_data(&tmp_ui_buffer, &uibopts, 0, 0, 0, NULL, &zsvsheet_opts, custom_prop_handler, opts_used)) != 0 ||
if ((err = read_data(&tmp_ui_buffer, &uibopts, 0, 0, 0, &zsvsheet_opts, custom_prop_handler, opts_used)) != 0 ||
!tmp_ui_buffer || !tmp_ui_buffer->buff_used_rows) {
zsvsheet_ui_buffer_delete(tmp_ui_buffer);
if (err)
Expand Down
156 changes: 156 additions & 0 deletions app/sheet/index.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <zsv.h>
#include <zsv/utils/prop.h>

#include "index.h"
#include "zsv/utils/file.h"
#include "zsv/utils/writer.h"

// Append one line-end offset to the index, growing the trailing
// line_ends[] array (by doubling) when it is full.
//
// ix   index to append to; may be moved by realloc()
// end  offset to store in the next free line_ends[] slot
//
// Returns the (possibly relocated) index on success. Returns NULL when the
// array cannot be grown; in that case the block passed in as `ix` has NOT
// been freed or modified and is still owned by the caller.
static struct zsvsheet_index *add_line_end(struct zsvsheet_index *ix, uint64_t end) {
  const size_t len = ix->line_end_len;

  if (len >= ix->line_end_capacity) {
    // Start from 1 so that a zero capacity still grows instead of staying 0
    size_t new_cap = ix->line_end_capacity ? ix->line_end_capacity : 1;

    // Refuse to grow if the doubled allocation size would overflow size_t
    if (new_cap > (SIZE_MAX - sizeof(*ix)) / sizeof(ix->line_ends[0]) / 2)
      return NULL;
    new_cap *= 2;

    struct zsvsheet_index *grown = realloc(ix, sizeof(*ix) + new_cap * sizeof(ix->line_ends[0]));
    if (!grown)
      return NULL;

    ix = grown;
    ix->line_end_capacity = new_cap;
  }

  ix->line_ends[len] = end;
  ix->line_end_len = len + 1;

  return ix;
}

// zsv row handler used while building the in-memory index.
//
// ctx is the struct zsvsheet_indexer driving the parse. For every row it:
//  - when a filter is set: drops rows with no cells, drops non-header rows
//    that do not contain the filter text, and copies the remaining rows to
//    the indexer's CSV writer;
//  - records the end offset of the first row as header_line_end;
//  - appends a checkpoint offset to line_ends[] whenever row_count is a
//    multiple of LINE_END_N;
//  - increments row_count.
//
// On a flush, ftell or allocation failure the parse is stopped with
// zsv_abort() and the row is not counted.
static void build_memory_index_row_handler(void *ctx) {
  struct zsvsheet_indexer *ixr = ctx;
  struct zsvsheet_index *ix = ixr->ix;
  uint64_t line_end = zsv_cum_scanned_length(ixr->parser) + 1;
  size_t col_count = zsv_cell_count(ixr->parser);

  if (ixr->filter) {
    if (col_count == 0)
      return;

    // The first row (header) is always kept; later rows must contain the filter text
    if (ix->header_line_end) {
      struct zsv_cell first_cell = zsv_get_cell(ixr->parser, 0);
      struct zsv_cell last_cell = zsv_get_cell(ixr->parser, col_count - 1);

      // Search the span from the start of the first cell to the end of the last cell
      // NOTE(review): this relies on the row's cells sharing one buffer -- confirm
      if (!memmem(first_cell.str, last_cell.str - first_cell.str + last_cell.len, ixr->filter, ixr->filter_len))
        return;
    }

    for (size_t i = 0; i < col_count; i++) {
      struct zsv_cell cell = zsv_get_cell(ixr->parser, i);
      zsv_writer_cell(ixr->writer, i == 0, cell.str, cell.len, cell.quoted);
    }
  }

  if (!ix->header_line_end) {
    // NOTE(review): with a filter set this is still an offset into the source
    // input, while the checkpoints below are offsets into the filtered
    // stream -- confirm that readers of the index expect this
    ix->header_line_end = line_end;
  } else if ((ix->row_count & (LINE_END_N - 1)) == 0) {
    if (ixr->filter) {
      // Checkpoints must refer to the filtered output, so flush and ask the
      // stream where it currently is
      if (zsv_writer_flush(ixr->writer) != zsv_writer_status_ok) {
        zsv_abort(ixr->parser);
        return;
      }

      // ftell() reports failure as -1; do not store that as an offset
      long filtered_pos = ftell(ixr->filter_stream);
      if (filtered_pos < 0) {
        zsv_abort(ixr->parser);
        return;
      }
      line_end = (uint64_t)filtered_pos;
    }

    ix = add_line_end(ix, line_end);
    if (!ix) {
      // ixr->ix still points at the old, valid index
      zsv_abort(ixr->parser);
      return;
    }

    // The index may have been moved by realloc()
    ixr->ix = ix;
  }

  ix->row_count++;
}

enum zsvsheet_index_status build_memory_index(struct zsvsheet_index_opts *optsp) {
struct zsvsheet_indexer ixr = {0};
ixr.filter = optsp->row_filter;
ixr.filter_len = optsp->row_filter ? strlen(optsp->row_filter) : 0;

enum zsvsheet_index_status ret = zsvsheet_index_status_error;
struct zsv_opts ix_zopts = optsp->zsv_opts;
char *temp_filename;
FILE *temp_f = NULL;
FILE *fp = fopen(optsp->filename, "rb");
if (!fp)
return ret;

ix_zopts.ctx = &ixr;
ix_zopts.stream = fp;
ix_zopts.row_handler = build_memory_index_row_handler;

enum zsv_status zst =
zsv_new_with_properties(&ix_zopts, optsp->custom_prop_handler, optsp->filename, optsp->opts_used, &ixr.parser);
if (zst != zsv_status_ok)
goto out;

if (optsp->row_filter) {
zsv_csv_writer temp_file_writer = NULL;
unsigned char temp_buff[8196];

temp_filename = zsv_get_temp_filename("zsvsheet_filter_XXXXXXXX");
if (!temp_filename)
return ret;

*optsp->temp_filename = temp_filename;

struct zsv_csv_writer_options writer_opts = {0};
if (!(writer_opts.stream = temp_f = fopen(temp_filename, "wb")))
return ret;
if (!(temp_file_writer = zsv_writer_new(&writer_opts)))
goto out;

zsv_writer_set_temp_buff(temp_file_writer, temp_buff, sizeof(temp_buff));
ixr.writer = temp_file_writer;
ixr.filter_stream = temp_f;
}

const size_t initial_cap = 256;
ixr.ix = malloc(sizeof(*ixr.ix) + initial_cap * sizeof(size_t));
if (!ixr.ix)
goto out;
memset(ixr.ix, 0, sizeof(*ixr.ix));
ixr.ix->line_end_capacity = initial_cap;

while ((zst = zsv_parse_more(ixr.parser)) == zsv_status_ok)
;

zsv_finish(ixr.parser);

if (zst == zsv_status_no_more_input) {
ret = zsvsheet_index_status_ok;
*optsp->index = ixr.ix;
} else
free(ixr.ix);

out:
zsv_delete(ixr.parser);
fclose(fp);
if (temp_f)
fclose(temp_f);

return ret;
}

// Translate a row number into a stored offset plus the number of rows that
// still have to be skipped from that offset.
//
// ix                  index to look the row up in
// row                 row number to locate
// offset_out          receives header_line_end for rows up to LINE_END_N,
//                     otherwise the checkpoint stored for the row's bucket
// remaining_rows_out  receives the number of rows left to skip after
//                     positioning at *offset_out
//
// No bounds check is made against ix->line_end_len; the caller must pass a
// row covered by the index.
void get_memory_index(struct zsvsheet_index *ix, uint64_t row, off_t *offset_out, size_t *remaining_rows_out) {
  if (row > LINE_END_N) {
    // Bucket 0 holds the checkpoint for rows LINE_END_N + 1 .. 2 * LINE_END_N - 1, and so on
    const size_t bucket = (size_t)((row >> LINE_END_SHIFT) - 1);

    *remaining_rows_out = row % LINE_END_N;
    *offset_out = (off_t)ix->line_ends[bucket];
  } else {
    // No checkpoint precedes these rows: start right after the header
    *remaining_rows_out = row;
    *offset_out = (off_t)ix->header_line_end;
  }
}
57 changes: 57 additions & 0 deletions app/sheet/index.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
#ifndef SHEET_INDEX_H
#define SHEET_INDEX_H

#include <stdint.h>
#include <stdio.h>
#include <pthread.h>

#include "zsv.h"
#include "zsv/utils/writer.h"

// Decides the number of rows we skip when storing the line end
// 1 << 10 = 1024 means that we store every 1024th line end
#define LINE_END_SHIFT 10
#define LINE_END_N (1 << LINE_END_SHIFT)

// Result codes for the index functions declared in this header
enum zsvsheet_index_status {
// success
zsvsheet_index_status_ok = 0,
// presumably an allocation failure -- TODO confirm against callers
zsvsheet_index_status_memory,
// generic failure
zsvsheet_index_status_error,
// presumably invalid UTF-8 input -- TODO confirm against callers
zsvsheet_index_status_utf8,
};

// Sparse row index: one offset for the end of the header row plus one
// checkpoint offset per LINE_END_N rows, stored in a trailing flexible array
struct zsvsheet_index {
// offset just past the end of the first (header) row; 0 until that row is seen
uint64_t header_line_end;
// number of rows counted so far, header row included
uint64_t row_count;
// number of line_ends[] slots allocated
size_t line_end_capacity;
// number of line_ends[] slots in use
size_t line_end_len;
// checkpoint offsets; the struct must be allocated with room for
// line_end_capacity entries
uint64_t line_ends[];
};

// Working state shared between the index builder and its row handler
struct zsvsheet_indexer {
// parser whose rows are being indexed
zsv_parser parser;
// index under construction; may be replaced when it is reallocated
struct zsvsheet_index *ix;
// text a row must contain to be kept, or NULL for no filtering
const char *filter;
// length of filter in bytes
size_t filter_len;
// CSV writer that receives the rows kept by the filter
zsv_csv_writer writer;
// stream whose position is recorded as the checkpoint offset when filtering
FILE *filter_stream;
};

// Arguments for building an index
struct zsvsheet_index_opts {
// NOTE(review): presumably guards the state shared with the UI thread -- confirm against caller
pthread_mutex_t *mutexp;
// path of the input file to index
const char *filename;
// out: name of the temporary file holding filtered rows (only used with row_filter)
char **temp_filename;
// text rows must contain to be kept, or NULL to index every row
const char *row_filter;
// parser options; copied and then given a ctx, stream and row handler for indexing
struct zsv_opts zsv_opts;
// out: receives the built index on success
struct zsvsheet_index **index;
// NOTE(review): presumably set once the index can be used -- confirm against caller
unsigned char *index_ready;
// NOTE(review): UI buffer the index belongs to -- confirm against caller
struct zsvsheet_ui_buffer *uib;
// NOTE(review): presumably receives an error code -- confirm against caller
int *errp;
// passed through to zsv_new_with_properties()
struct zsv_prop_handler *custom_prop_handler;
// passed through to zsv_new_with_properties()
const char *opts_used;
};

// Build an index for optsp->filename. On success returns
// zsvsheet_index_status_ok and stores the new index in *optsp->index.
enum zsvsheet_index_status build_memory_index(struct zsvsheet_index_opts *optsp);
// Look up `row` in `ix`: *offset_out receives the stored offset to start
// reading from and *remaining_rows_out the number of rows still to skip
// from that offset.
void get_memory_index(struct zsvsheet_index *ix, uint64_t row, off_t *offset_out, size_t *remaining_rows_out);

#endif
Loading