Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

2tsv: add --convert-to-space option #250

Merged
merged 1 commit into from
Oct 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 25 additions & 10 deletions app/2tsv.c
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ struct static_buff {
// State handed to the 2tsv row/cell handlers (zsv_2tsv_row, zsv_2tsv_cell)
struct zsv_2tsv_data {
// parser handle; zsv_2tsv_row fetches each cell from it with zsv_get_cell()
zsv_parser parser;
// output buffer that zsv_2tsv_write() appends converted cells and delimiters to
struct static_buff out;
// set to 1 when --convert-to-space is given on the command line; when set,
// an embedded tab, newline or carriage return is written as a single space
// instead of the two-character escape \t, \n or \r
unsigned char convert_to_space : 1;
// remaining 7 bits; not referenced in the visible code (presumably padding)
unsigned char _ : 7;
};

__attribute__((always_inline)) static inline void zsv_2tsv_flush(struct static_buff *b) {
Expand Down Expand Up @@ -57,7 +59,8 @@ static inline void zsv_2tsv_write(struct static_buff *b, const unsigned char *s,
// - else, return allocated char * of converted tsv (caller must free), and update *len
// - on error, set *err
__attribute__((always_inline)) static inline unsigned char *zsv_to_tsv(const unsigned char *utf8, size_t *len,
enum zsv_2tsv_status *err) {
enum zsv_2tsv_status *err,
char convert_to_space) {
// replace tab, newline, carriage return or backslash with \t, \n, \r or an escaped backslash
size_t do_convert = 0;
for (size_t i = 0; i < *len; i++) {
Expand All @@ -74,8 +77,12 @@ __attribute__((always_inline)) static inline unsigned char *zsv_to_tsv(const uns
size_t j = 0;
for (size_t i = 0; i < *len; i++) {
if (UNLIKELY(utf8[i] == '\t' || utf8[i] == '\n' || utf8[i] == '\r' || utf8[i] == '\\')) {
converted[j++] = '\\';
converted[j++] = utf8[i] == '\t' ? 't' : utf8[i] == '\n' ? 'n' : utf8[i] == '\r' ? 'r' : '\\';
if (convert_to_space)
converted[j++] = utf8[i] == '\\' ? '\\' : ' ';
else {
converted[j++] = '\\';
converted[j++] = utf8[i] == '\t' ? 't' : utf8[i] == '\n' ? 'n' : utf8[i] == '\r' ? 'r' : '\\';
}
} else
converted[j++] = utf8[i];
}
Expand All @@ -86,7 +93,8 @@ __attribute__((always_inline)) static inline unsigned char *zsv_to_tsv(const uns
}

__attribute__((always_inline)) static inline void zsv_2tsv_cell(struct zsv_2tsv_data *data, unsigned char *utf8_value,
size_t len, char no_newline_or_slash) {
size_t len, char no_newline_or_slash,
char convert_to_space) {
// output cell contents (converted if necessary)
if (len) {
enum zsv_2tsv_status err = zsv_2tsv_status_ok;
Expand All @@ -96,7 +104,7 @@ __attribute__((always_inline)) static inline void zsv_2tsv_cell(struct zsv_2tsv_
}

// if we're here, there is either definitely an embedded tab, or maybe an embedded \n or \r
unsigned char *converted = zsv_to_tsv(utf8_value, &len, &err);
unsigned char *converted = zsv_to_tsv(utf8_value, &len, &err, convert_to_space);
if (converted != NULL) {
zsv_2tsv_write(&data->out, converted, len);
free(converted);
Expand All @@ -119,11 +127,12 @@ static void zsv_2tsv_row(void *ctx) {
if (LIKELY(!(memchr(start, '\n', row_len) || memchr(start, '\r', row_len) || memchr(start, '\\', row_len))))
no_newline_or_slash = 1;

zsv_2tsv_cell(ctx, cell.str, cell.len, no_newline_or_slash);
const char convert_to_space = data->convert_to_space;
zsv_2tsv_cell(ctx, cell.str, cell.len, no_newline_or_slash, convert_to_space);
for (unsigned int i = 1; i < cols; i++) {
zsv_2tsv_write(&data->out, (const unsigned char *)"\t", 1);
cell = zsv_get_cell(data->parser, i);
zsv_2tsv_cell(ctx, cell.str, cell.len, no_newline_or_slash);
zsv_2tsv_cell(ctx, cell.str, cell.len, no_newline_or_slash, convert_to_space);
}
}
zsv_2tsv_write(&data->out, (const unsigned char *)"\n", 1);
Expand All @@ -135,8 +144,12 @@ int zsv_2tsv_usage(int rc) {
" text processing. By default, embedded tabs or multilines will be escaped",
" to \\t, \\n or \\r, respectively",
"",
"Usage: " APPNAME " [filename] [-o <output_filename>]",
"Usage: " APPNAME " [filename] [options]",
" e.g. " APPNAME " < file.csv > file.tsv",
"",
"Options:",
" -o <file> : save output to <file>",
" --convert-to-space: convert embedded \\t, \\n or \\r to space",
NULL,
};

Expand All @@ -152,9 +165,11 @@ int ZSV_MAIN_FUNC(ZSV_COMMAND)(int argc, const char *argv[], struct zsv_opts *op
const char *input_path = NULL;
int err = 0;
for (int i = 1; !err && i < argc; i++) {
if (!strcmp(argv[i], "--help") || !strcmp(argv[i], "-h")) {
if (!strcmp(argv[i], "--help") || !strcmp(argv[i], "-h"))
return zsv_2tsv_usage(0);
} else if (!strcmp(argv[i], "-o") || !strcmp(argv[i], "--output")) {
if (!strcmp(argv[i], "--convert-to-space"))
data.convert_to_space = 1;
else if (!strcmp(argv[i], "-o") || !strcmp(argv[i], "--output")) {
if (++i >= argc)
fprintf(stderr, "%s option requires a filename value\n", argv[i - 1]), err = 1;
else if (data.out.stream && data.out.stream != stdout)
Expand Down
7 changes: 6 additions & 1 deletion app/test/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -391,14 +391,19 @@ test-pretty-escape-chars: ${BUILD_DIR}/bin/zsv_pretty${EXE}
@(${PREFIX} $< ${ARGS-$*} < ${TEST_DATA_DIR}/test/pretty-escape.csv -M ${REDIRECT1} ${TMP_DIR}/$@.out && \
${CMP} ${TMP_DIR}/$@.out expected/$@.out && ${TEST_PASS} || ${TEST_FAIL})

test-2tsv: test-2tsv-1 test-2tsv-2
test-2tsv: test-2tsv-1 test-2tsv-2 test-2tsv-3

# Static pattern rule for test-2tsv-1 and test-2tsv-2: $* is the stem after
# "test-" (e.g. 2tsv-1) and $@ is the full target name (e.g. test-2tsv-1).
# Prints "No test input for 2tsv" when test/$*.csv is missing or empty, then
# feeds that CSV to zsv_2tsv, captures the output in ${TMP_DIR}/$@.out and
# compares it with expected/$@.out.
test-2tsv-1 test-2tsv-2: test-% : ${BUILD_DIR}/bin/zsv_2tsv${EXE}
	@${TEST_INIT}
	@( ( ! [ -s "${TEST_DATA_DIR}/test/$*.csv" ] ) && echo "No test input for 2tsv" && exit 1) || \
	(${PREFIX} $< ${ARGS-$*} < ${TEST_DATA_DIR}/test/$*.csv ${REDIRECT1} ${TMP_DIR}/$@.out && \
	${CMP} ${TMP_DIR}/$@.out expected/$@.out && ${TEST_PASS} || ${TEST_FAIL})

# Runs zsv_2tsv with --convert-to-space on the 2tsv-2.csv test input, captures
# the output in ${TMP_DIR}/$@.out and compares it with expected/$@.out
# ($@ expands to the target name, test-2tsv-3).
test-2tsv-3: ${BUILD_DIR}/bin/zsv_2tsv${EXE}
	@${TEST_INIT}
	@(${PREFIX} $< --convert-to-space < ${TEST_DATA_DIR}/test/2tsv-2.csv ${REDIRECT1} ${TMP_DIR}/$@.out && \
	${CMP} ${TMP_DIR}/$@.out expected/$@.out && ${TEST_PASS} || ${TEST_FAIL})

${THIS_MAKEFILE_DIR}/../../data/quoted5.csv: ${THIS_MAKEFILE_DIR}/../../data/quoted5.csv.bz2
bzip2 -d -c $< > $@

Expand Down
2 changes: 2 additions & 0 deletions app/test/expected/test-2tsv-3.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
A B C
D D E F