Skip to content

Commit

Permalink
[WIP] Create in-memory index for sheet command (#254)
Browse files Browse the repository at this point in the history
* Add in-memory index to sheet command

TODO: modify the build configuration to disallow building the sheet command when threading is unavailable

---------
Co-authored-by: liquidaty <[email protected]>
  • Loading branch information
richiejp authored Nov 3, 2024
1 parent 86a37e9 commit bc61cf2
Show file tree
Hide file tree
Showing 10 changed files with 356 additions and 220 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -63,3 +63,5 @@ tmp
include/zsv.h
app/test/worldcitiespop_mil.csv
data/quoted5.csv
compile_commands.json
.cache
9 changes: 7 additions & 2 deletions app/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -245,7 +245,7 @@ CFLAGS+=${YAJL_HELPER_INCLUDE} ${YAJL_INCLUDE}

# jq
JQ_TARBALL=${THIS_MAKEFILE_DIR}/external/jq-1.6.tar.gz
JQ_SRC=${BUILD_DIR}/external/jq-src
JQ_SRC=${BUILD_DIR}-external/jq-src

JQ_PREFIX ?=
ifeq ($(JQ_PREFIX),)
Expand Down Expand Up @@ -408,7 +408,12 @@ lib-jq: ${JQ_LIB}
@echo "Using jq library ${JQ_LIB}"

${JQ_BUNDLE_LIB}: ${JQ_SRC} # -D_REENTRANT needed for clang to not break
cd ${JQ_SRC} \
@if [ -e ${JQ_SRC}-conf ]; then \
rm -rf ${JQ_SRC}-conf; \
fi
# copy the directory because its timestamp gets updated after libjq is built
@cp -a ${JQ_SRC} ${JQ_SRC}-conf
cd ${JQ_SRC}-conf \
&& CC="${CC}" CFLAGS="${CFLAGS} -D_REENTRANT" ./configure \
--prefix="${JQ_PREFIX}" \
--disable-maintainer-mode \
Expand Down
56 changes: 21 additions & 35 deletions app/sheet.c
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,7 @@

#include <locale.h>
#include <wchar.h>
#ifdef ZSVSHEET_USE_THREADS
#include <pthread.h>
#endif

#define ZSV_COMMAND sheet
#include "zsv_command.h"
Expand All @@ -58,6 +56,7 @@ struct zsvsheet_opts {

#include "sheet/utf8-width.c"
#include "sheet/ui_buffer.c"
#include "sheet/index.c"
#include "sheet/read-data.c"
#include "sheet/key-bindings.c"

Expand Down Expand Up @@ -90,7 +89,7 @@ struct zsvsheet_builtin_proc_state {
const char *opts_used;
};

int get_subcommand(const char *prompt, char *buff, size_t buffsize, int footer_row) {
static void get_subcommand(const char *prompt, char *buff, size_t buffsize, int footer_row) {
*buff = '\0';
// this is a hack to blank-out the currently-selected cell value
int max_screen_width = 256; // to do: don't assume this
Expand Down Expand Up @@ -126,7 +125,6 @@ int get_subcommand(const char *prompt, char *buff, size_t buffsize, int footer_r
}
// Ignore other keys
}
return 0;
}

zsvsheet_handler_status zsvsheet_ext_prompt(struct zsvsheet_proc_context *ctx, char *buffer, size_t bufsz,
Expand Down Expand Up @@ -209,36 +207,6 @@ void zsvsheet_set_status(const struct zsvsheet_display_dimensions *ddims, int ov
#include "sheet/file.c"
#include "sheet/usage.c"

enum zsvsheet_status zsvsheet_key_handler(struct zsvsheet_key_handler_data *khd, int ch, char *cmdbuff,
size_t cmdbuff_sz, // subcommand buffer
struct zsvsheet_ui_buffer **base_ui_buffer,
struct zsvsheet_ui_buffer **current_ui_buffer,
const struct zsvsheet_display_dimensions *display_dims,
struct zsv_prop_handler *custom_prop_handler, const char *opts_used) {
struct zsvsheet_subcommand_handler_context sctx = {0};
sctx.ch = ch;
sctx.ui_buffers = *base_ui_buffer;
sctx.current_ui_buffer = *current_ui_buffer;
zsvsheet_handler_status status = khd->subcommand_handler(&sctx);
if (status == zsvsheet_handler_status_ok) {
get_subcommand(sctx.prompt, cmdbuff, cmdbuff_sz, (int)(display_dims->rows - display_dims->footer_span));
if (*cmdbuff == '\0')
return zsvsheet_status_continue;
struct zsvsheet_handler_context ctx = {.subcommand_value = cmdbuff,
.ch = ch,
.ui_buffers = {.base = base_ui_buffer, .current = current_ui_buffer},
.display_dims = display_dims,
.custom_prop_handler = custom_prop_handler,
.opts_used = opts_used};
status = khd->handler(&ctx);
}
if (status == zsvsheet_handler_status_ok)
return zsvsheet_status_ok;
if (status == zsvsheet_handler_status_ignore)
return zsvsheet_status_continue;
return zsvsheet_status_error;
}

struct zsvsheet_key_handler_data *zsvsheet_key_handlers = NULL;
struct zsvsheet_key_handler_data **zsvsheet_next_key_handler = &zsvsheet_key_handlers;

Expand Down Expand Up @@ -639,10 +607,28 @@ int ZSV_MAIN_FUNC(ZSV_COMMAND)(int argc, const char *argv[], struct zsv_opts *op

while (true) {
ch = getch();
if (ch == ERR) {
pthread_mutex_lock(&current_ui_buffer->mutex);
if (current_ui_buffer && current_ui_buffer->status) {
zsvsheet_set_status(&display_dims, 1, current_ui_buffer->status);
display_buffer_subtable(current_ui_buffer, header_span, &display_dims);
cbreak();
}
pthread_mutex_unlock(&current_ui_buffer->mutex);
continue;
}

zsvsheet_set_status(&display_dims, 1, "");
handler_state.display_info.update_buffer = false;

pthread_mutex_lock(&current_ui_buffer->mutex);
if (current_ui_buffer->index_ready &&
current_ui_buffer->dimensions.row_count != current_ui_buffer->index->row_count) {
current_ui_buffer->dimensions.row_count = current_ui_buffer->index->row_count;
handler_state.display_info.update_buffer = true;
}
pthread_mutex_unlock(&current_ui_buffer->mutex);

status = zsvsheet_key_press(ch, &handler_state);
if (status == zsvsheet_handler_status_exit)
break;
Expand All @@ -652,7 +638,7 @@ int ZSV_MAIN_FUNC(ZSV_COMMAND)(int argc, const char *argv[], struct zsv_opts *op
if (handler_state.display_info.update_buffer && current_ui_buffer->filename) {
struct zsvsheet_opts zsvsheet_opts = {0};
if (read_data(&current_ui_buffer, NULL, current_ui_buffer->input_offset.row, current_ui_buffer->input_offset.col,
header_span, current_ui_buffer->dimensions.index, &zsvsheet_opts, custom_prop_handler, opts_used)) {
header_span, &zsvsheet_opts, custom_prop_handler, opts_used)) {
zsvsheet_set_status(&display_dims, 1, "Unexpected error!"); // to do: better error message
continue;
}
Expand Down
2 changes: 1 addition & 1 deletion app/sheet/buffer.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

#define ZSVSHEET_BUFFER_DEFAULT_CELL_BUFF_LEN 16
#define ZSVSHEET_BUFFER_DEFAULT_MAX_CELL_LEN 32768 - 1
#define ZSVSHEET_BUFFER_DEFAULT_ROW_COUNT 1000
#define ZSVSHEET_BUFFER_DEFAULT_ROW_COUNT 1024

typedef struct zsvsheet_buffer *zsvsheet_buffer_t;

Expand Down
2 changes: 1 addition & 1 deletion app/sheet/file.c
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ int zsvsheet_ui_buffer_open_file(const char *filename, const struct zsv_opts *zs
int err = 0;
struct zsvsheet_ui_buffer *tmp_ui_buffer = NULL;
uibopts.row_filter = row_filter;
if ((err = read_data(&tmp_ui_buffer, &uibopts, 0, 0, 0, NULL, &zsvsheet_opts, custom_prop_handler, opts_used)) != 0 ||
if ((err = read_data(&tmp_ui_buffer, &uibopts, 0, 0, 0, &zsvsheet_opts, custom_prop_handler, opts_used)) != 0 ||
!tmp_ui_buffer || !tmp_ui_buffer->buff_used_rows) {
zsvsheet_ui_buffer_delete(tmp_ui_buffer);
if (err)
Expand Down
156 changes: 156 additions & 0 deletions app/sheet/index.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <zsv.h>
#include <zsv/utils/prop.h>

#include "index.h"
#include "zsv/utils/file.h"
#include "zsv/utils/writer.h"

/*
 * Append one line-end offset to the index, growing the allocation when the
 * line_ends array is full.
 *
 * The index is a single heap block ending in a flexible array, so growing it
 * may move it: the caller must continue with the returned pointer.
 *
 * Returns the (possibly relocated) index on success. Returns NULL when the
 * array cannot be grown (allocation failure or size overflow); in that case
 * the block passed in is untouched and still owned by the caller.
 */
static struct zsvsheet_index *add_line_end(struct zsvsheet_index *ix, uint64_t end) {
  const size_t len = ix->line_end_len;
  const size_t cap = ix->line_end_capacity;

  if (len >= cap) {
    const size_t elem_size = sizeof(ix->line_ends[0]);
    // doubling a zero capacity would never make room, so start from 1
    const size_t new_cap = cap ? cap * 2 : 1;

    // reject a wrapped/insufficient capacity and a byte count that would overflow
    if (new_cap <= len || new_cap > (SIZE_MAX - sizeof(*ix)) / elem_size)
      return NULL;

    // keep the original pointer valid for the caller if realloc fails
    struct zsvsheet_index *grown = realloc(ix, sizeof(*ix) + new_cap * elem_size);
    if (!grown)
      return NULL;

    ix = grown;
    ix->line_end_capacity = new_cap;
  }

  ix->line_ends[len] = end;
  ix->line_end_len++;

  return ix;
}

/*
 * Parser row callback used while building the in-memory index.
 *
 * `ctx` is the struct zsvsheet_indexer driving the parse. For each row:
 *  - with a filter set, empty rows are skipped; after the header has been
 *    seen, rows that do not contain the filter bytes are skipped; every row
 *    that remains (including the header) is written to ixr->writer;
 *  - the first row handled records header_line_end;
 *  - afterwards, whenever row_count is a multiple of LINE_END_N, the current
 *    line end is appended to the index. With a filter, the stored offset is
 *    the position in the filtered output stream rather than in the input;
 *  - row_count is incremented for every row that reaches the end.
 *
 * On a writer flush failure, a failed ftell(), or a failure to grow the index,
 * the parse is aborted via zsv_abort().
 */
static void build_memory_index_row_handler(void *ctx) {
  struct zsvsheet_indexer *ixr = ctx;
  struct zsvsheet_index *ix = ixr->ix;
  // NOTE(review): the +1 presumably steps past the row terminator -- confirm against the parser docs
  uint64_t line_end = zsv_cum_scanned_length(ixr->parser) + 1;
  size_t col_count = zsv_cell_count(ixr->parser);

  if (ixr->filter) {
    if (col_count == 0)
      return;

    // the header row (header_line_end still 0) is always kept
    if (ixr->ix->header_line_end) {
      struct zsv_cell first_cell = zsv_get_cell(ixr->parser, 0);
      struct zsv_cell last_cell = zsv_get_cell(ixr->parser, col_count - 1);

      // search the byte span from the start of the first cell to the end of the last one
      // NOTE(review): assumes the row's cells lie in one contiguous buffer -- confirm
      if (!memmem(first_cell.str, last_cell.str - first_cell.str + last_cell.len, ixr->filter, ixr->filter_len))
        return;
    }

    for (size_t i = 0; i < col_count; i++) {
      struct zsv_cell cell = zsv_get_cell(ixr->parser, i);
      zsv_writer_cell(ixr->writer, i == 0, cell.str, cell.len, cell.quoted);
    }
  }

  if (!ixr->ix->header_line_end) {
    ix->header_line_end = line_end;
  } else if ((ix->row_count & (LINE_END_N - 1)) == 0) {
    if (ixr->filter) {
      // flush so that the stream position reflects everything written so far
      if (zsv_writer_flush(ixr->writer) != zsv_writer_status_ok) {
        zsv_abort(ixr->parser);
        return;
      }

      // ftell() reports failure as -1; never store that as an offset
      long filtered_pos = ftell(ixr->filter_stream);
      if (filtered_pos < 0) {
        zsv_abort(ixr->parser);
        return;
      }
      line_end = (uint64_t)filtered_pos;
    }

    ix = add_line_end(ix, line_end);
    if (!ix) {
      // ixr->ix still points at the old, valid block
      zsv_abort(ixr->parser);
      return;
    }

    // the index may have been moved by the reallocation
    ixr->ix = ix;
  }

  ix->row_count++;
}

enum zsvsheet_index_status build_memory_index(struct zsvsheet_index_opts *optsp) {
struct zsvsheet_indexer ixr = {0};
ixr.filter = optsp->row_filter;
ixr.filter_len = optsp->row_filter ? strlen(optsp->row_filter) : 0;

enum zsvsheet_index_status ret = zsvsheet_index_status_error;
struct zsv_opts ix_zopts = optsp->zsv_opts;
char *temp_filename;
FILE *temp_f = NULL;
FILE *fp = fopen(optsp->filename, "rb");
if (!fp)
return ret;

ix_zopts.ctx = &ixr;
ix_zopts.stream = fp;
ix_zopts.row_handler = build_memory_index_row_handler;

enum zsv_status zst =
zsv_new_with_properties(&ix_zopts, optsp->custom_prop_handler, optsp->filename, optsp->opts_used, &ixr.parser);
if (zst != zsv_status_ok)
goto out;

if (optsp->row_filter) {
zsv_csv_writer temp_file_writer = NULL;
unsigned char temp_buff[8196];

temp_filename = zsv_get_temp_filename("zsvsheet_filter_XXXXXXXX");
if (!temp_filename)
return ret;

*optsp->temp_filename = temp_filename;

struct zsv_csv_writer_options writer_opts = {0};
if (!(writer_opts.stream = temp_f = fopen(temp_filename, "wb")))
return ret;
if (!(temp_file_writer = zsv_writer_new(&writer_opts)))
goto out;

zsv_writer_set_temp_buff(temp_file_writer, temp_buff, sizeof(temp_buff));
ixr.writer = temp_file_writer;
ixr.filter_stream = temp_f;
}

const size_t initial_cap = 256;
ixr.ix = malloc(sizeof(*ixr.ix) + initial_cap * sizeof(size_t));
if (!ixr.ix)
goto out;
memset(ixr.ix, 0, sizeof(*ixr.ix));
ixr.ix->line_end_capacity = initial_cap;

while ((zst = zsv_parse_more(ixr.parser)) == zsv_status_ok)
;

zsv_finish(ixr.parser);

if (zst == zsv_status_no_more_input) {
ret = zsvsheet_index_status_ok;
*optsp->index = ixr.ix;
} else
free(ixr.ix);

out:
zsv_delete(ixr.parser);
fclose(fp);
if (temp_f)
fclose(temp_f);

return ret;
}

/*
 * Look up where to start reading in order to reach `row`.
 *
 * On return, *offset_out holds a stored offset from the index and
 * *remaining_rows_out holds the number of rows that still have to be skipped
 * from that offset.
 *
 * Rows up to and including LINE_END_N are served from the end of the header
 * line; later rows use the stored line end at slot
 * (row - LINE_END_N) / LINE_END_N. No bounds check is made against
 * ix->line_end_len.
 */
void get_memory_index(struct zsvsheet_index *ix, uint64_t row, off_t *offset_out, size_t *remaining_rows_out) {
  if (row > LINE_END_N) {
    const size_t slot = (row - LINE_END_N) / LINE_END_N;

    *offset_out = (off_t)ix->line_ends[slot];
    *remaining_rows_out = row % LINE_END_N;
  } else {
    // no stored line end precedes this row: start right after the header
    *offset_out = (off_t)ix->header_line_end;
    *remaining_rows_out = row;
  }
}
57 changes: 57 additions & 0 deletions app/sheet/index.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
#ifndef SHEET_INDEX_H
#define SHEET_INDEX_H

#include <stdint.h>
#include <stdio.h>
#include <pthread.h>

#include "zsv.h"
#include "zsv/utils/writer.h"

// Sampling interval of the index: only every LINE_END_N-th line end is stored,
// so the index stays small and a lookup skips at most a bounded number of rows.
// LINE_END_SHIFT is log2 of the interval: 1 << 10 = 1024 means that we store
// every 1024th line end. The interval is a power of two, which lets the code
// use shifts and masks (LINE_END_N - 1) on row numbers.
#define LINE_END_SHIFT 10
#define LINE_END_N (1 << LINE_END_SHIFT)

// Result codes for index operations.
// build_memory_index() returns either zsvsheet_index_status_ok or
// zsvsheet_index_status_error.
// NOTE(review): the _memory and _utf8 codes are presumably for allocation and
// encoding failures reported elsewhere -- confirm against their users.
enum zsvsheet_index_status {
zsvsheet_index_status_ok = 0,
zsvsheet_index_status_memory,
zsvsheet_index_status_error,
zsvsheet_index_status_utf8,
};

// In-memory index of a file: a header followed by a growable array of sampled
// line-end offsets. The array is a flexible array member, so the whole index
// lives in one heap block that may move when it is grown.
struct zsvsheet_index {
// offset recorded for the end of the first (header) row; 0 until that row has been seen
uint64_t header_line_end;
// number of rows counted by the indexer (the header row is counted too)
uint64_t row_count;
// number of elements allocated for line_ends
size_t line_end_capacity;
// number of elements of line_ends currently in use
size_t line_end_len;
// sampled line-end offsets, one per LINE_END_N rows
uint64_t line_ends[];
};

// Working state shared between build_memory_index() and its row callback
// while a file is being indexed.
struct zsvsheet_indexer {
// parser that is scanning the input file
zsv_parser parser;
// index being built; updated by the row callback when the block is reallocated
struct zsvsheet_index *ix;
// optional byte string that a row must contain to be kept; NULL for no filtering
const char *filter;
// length of filter in bytes (0 when filter is NULL)
size_t filter_len;
// CSV writer that receives the kept rows when a filter is set
zsv_csv_writer writer;
// stream behind writer; its position is used as the line end when filtering
FILE *filter_stream;
};

// Inputs and outputs of build_memory_index().
// NOTE(review): mutexp, index_ready, uib and errp are not touched by
// build_memory_index() itself; they are presumably used by the code that runs
// the indexer on a separate thread -- confirm against the caller.
struct zsvsheet_index_opts {
pthread_mutex_t *mutexp;
// path of the file to index
const char *filename;
// out: receives the name of the temporary filtered file when row_filter is set
char **temp_filename;
// optional row filter; NULL to index every row
const char *row_filter;
// parser options; copied, then the ctx, stream and row handler are overridden
struct zsv_opts zsv_opts;
// out: receives the finished index on success
struct zsvsheet_index **index;
unsigned char *index_ready;
struct zsvsheet_ui_buffer *uib;
int *errp;
// passed through to zsv_new_with_properties()
struct zsv_prop_handler *custom_prop_handler;
// passed through to zsv_new_with_properties()
const char *opts_used;
};

// Parse optsp->filename completely and, on success, store a newly allocated
// index in *optsp->index. Returns zsvsheet_index_status_ok on success and
// zsvsheet_index_status_error otherwise.
enum zsvsheet_index_status build_memory_index(struct zsvsheet_index_opts *optsp);
// Look up `row` in the index: writes the stored offset to start reading from
// to *offset_out and the number of rows still to skip from there to
// *remaining_rows_out.
void get_memory_index(struct zsvsheet_index *ix, uint64_t row, off_t *offset_out, size_t *remaining_rows_out);

#endif
Loading

0 comments on commit bc61cf2

Please sign in to comment.