Skip to content

Commit

Permalink
update zsv_strencode to add optional callback invoked on bad utf8 (#66)
Browse files Browse the repository at this point in the history
* update zsv_strencode to add optional callback invoked on bad utf8
* replace func() with func(void) to suppress warnings w/ certain compilers (e.g. emcc)
  • Loading branch information
liquidaty authored Oct 12, 2022
1 parent 2d4c4fb commit 65d86fa
Show file tree
Hide file tree
Showing 6 changed files with 29 additions and 20 deletions.
17 changes: 9 additions & 8 deletions app/select.c
Original file line number Diff line number Diff line change
Expand Up @@ -261,15 +261,12 @@ static void zsv_select_add_search(struct zsv_select_data *data, const char *valu
__attribute__((always_inline)) static inline unsigned char *
zsv_select_cell_clean(struct zsv_select_data *data, unsigned char *utf8_value, char quoted, size_t *lenp) {
size_t len = *lenp;
if(LIKELY(data->any_clean == 0))
return utf8_value;

// to do: option to replace or warn non-printable chars 0 - 31:
// vectorized scan
// replace or warn if found

if(UNLIKELY(data->malformed_utf8_replace != NULL))
len = zsv_strencode(utf8_value, len, *data->malformed_utf8_replace);
len = zsv_strencode(utf8_value, len, *data->malformed_utf8_replace, NULL, NULL);

if(UNLIKELY(!data->no_trim_whitespace))
utf8_value = (unsigned char *)zsv_strtrim(utf8_value, &len);
Expand Down Expand Up @@ -306,7 +303,8 @@ static inline char zsv_select_row_search_hit(struct zsv_select_data *data) {
unsigned int j = zsv_cell_count(data->parser);
for(unsigned int i = 0; i < j; i++) {
struct zsv_cell cell = zsv_get_cell(data->parser, i);
cell.str = zsv_select_cell_clean(data, cell.str, cell.quoted, &cell.len);
if(UNLIKELY(data->any_clean != 0))
cell.str = zsv_select_cell_clean(data, cell.str, cell.quoted, &cell.len);
if(cell.len) {
for(struct zsv_select_search_str *ss = data->search_strings; ss; ss = ss->next)
if(ss->value && *ss->value && memmem(cell.str, cell.len, ss->value, ss->len))
Expand Down Expand Up @@ -392,14 +390,16 @@ static void zsv_select_output_data_row(struct zsv_select_data *data) {
for(unsigned int i = 0; i < cnt; i++) { // for each output column
unsigned int in_ix = data->out2in[i].ix;
struct zsv_cell cell = zsv_get_cell(data->parser, in_ix);
cell.str = zsv_select_cell_clean(data, cell.str, cell.quoted, &cell.len);
if(UNLIKELY(data->any_clean != 0))
cell.str = zsv_select_cell_clean(data, cell.str, cell.quoted, &cell.len);
if(VERY_UNLIKELY(data->distinct == ZSV_SELECT_DISTINCT_MERGE)) {
if(UNLIKELY(cell.len == 0)) {
for(struct zsv_select_uint_list *ix = data->out2in[i].merge.indexes; ix; ix = ix->next) {
unsigned int m_ix = ix->value;
cell = zsv_get_cell(data->parser, m_ix);
if(cell.len) {
cell.str = zsv_select_cell_clean(data, cell.str, cell.quoted, &cell.len);
if(UNLIKELY(data->any_clean != 0))
cell.str = zsv_select_cell_clean(data, cell.str, cell.quoted, &cell.len);
if(cell.len)
break;
}
Expand Down Expand Up @@ -476,7 +476,8 @@ static void zsv_select_header_row(void *ctx) {
unsigned int max_header_ix = 0;
for(unsigned int i = 0; i < cols; i++) {
struct zsv_cell cell = zsv_get_cell(data->parser, i);
cell.str = zsv_select_cell_clean(data, cell.str, cell.quoted, &cell.len);
if(UNLIKELY(data->any_clean != 0))
cell.str = zsv_select_cell_clean(data, cell.str, cell.quoted, &cell.len);
if(i < data->opts->max_columns) {
data->header_names[i] = zsv_memdup(cell.str, cell.len);
max_header_ix = i+1;
Expand Down
9 changes: 7 additions & 2 deletions app/utils/string.c
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ unsigned char *zsv_strtolowercase(const unsigned char *s, size_t *lenp) {
fprintf(stderr, "Warning: malformed UTF8 '%.*s'\n", (int)len_orig, s);
memcpy(tmp_s, s, len_orig);
tmp_s[len_orig] = '\0';
*lenp = zsv_strencode(tmp_s, len_orig, '?');
*lenp = zsv_strencode(tmp_s, len_orig, '?', NULL, NULL);
new_s = utf8proc_tolower_str(tmp_s, lenp);
free(tmp_s);
}
Expand Down Expand Up @@ -300,14 +300,17 @@ size_t zsv_strwhite(unsigned char *s, size_t len, unsigned int flags) {
// replace any non-conforming utf8 with the specified char, or
// remove from the string (and shorten the string) if replace = 0.
// if malformed_handler is non-NULL, it is invoked each time malformed utf8 is found.
// returns the length of the valid string
size_t zsv_strencode(unsigned char *s, size_t n, unsigned char replace) {
size_t zsv_strencode(unsigned char *s, size_t n, unsigned char replace,
int (*malformed_handler)(void *, const unsigned char *s, size_t n, size_t offset), void *handler_ctx) {
size_t new_len = 0;
int clen;
for(size_t i2 = 0; i2 < n; i2 += (size_t)clen) {
clen = ZSV_UTF8_CHARLEN(s[i2]);
if(LIKELY(clen == 1))
s[new_len++] = s[i2];
else if(UNLIKELY(clen < 0) || UNLIKELY(i2 + clen >= n)) {
if(malformed_handler)
malformed_handler(handler_ctx, s, n, new_len);
if(replace)
s[new_len++] = replace;
clen = 1;
Expand All @@ -320,6 +323,8 @@ size_t zsv_strencode(unsigned char *s, size_t n, unsigned char replace) {
memmove(s + new_len, s + i2, clen);
new_len += clen;
} else { /* invalid; valid_n smaller than expected */
if(malformed_handler)
malformed_handler(handler_ctx, s, n, new_len);
memset(s + new_len, replace, valid_n);
new_len += valid_n;
clen = valid_n;
Expand Down
2 changes: 1 addition & 1 deletion include/zsv/api.h
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ struct zsv_cell zsv_get_cell(zsv_parser parser, size_t index);
* Get the library version
*/
ZSV_EXPORT
const char *zsv_lib_version();
const char *zsv_lib_version(void);

/**
* Change a parser's row handler. This function may be called at any time
Expand Down
4 changes: 2 additions & 2 deletions include/zsv/utils/arg.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,9 @@ char havearg(const char *arg,
*/
void zsv_set_default_opts(struct zsv_opts);

struct zsv_opts zsv_get_default_opts();
struct zsv_opts zsv_get_default_opts(void);

void zsv_clear_default_opts();
void zsv_clear_default_opts(void);

# ifdef ZSV_EXTRAS

Expand Down
15 changes: 9 additions & 6 deletions include/zsv/utils/string.h
Original file line number Diff line number Diff line change
Expand Up @@ -54,13 +54,16 @@ size_t zsv_strwhite(unsigned char *s, size_t len, unsigned int flags);
* Force a string to conform to UTF8 encoding. Replaces any non-conforming utf8
* with the specified char, or removes from the string (and shortens the string)
* if replace = 0
* @param s input string
* @param n length (in bytes) of input
* @param replace the character to replace any malformed UTF8 bytes with, or 0
* to remove and shorten the result
* @return the length of the valid string
* @param s input string. invalid UTF8 bytes will be overwritten
* @param n length (in bytes) of input
* @param replace the character to replace any malformed UTF8 bytes with, or 0
* to remove and shorten the result
* @param malformed_handler optional callback (may be NULL) invoked upon scanning malformed UTF8
* @param handler_ctx context pointer passed as the first argument to malformed_handler
* @return length of the valid string
*/
size_t zsv_strencode(unsigned char *s, size_t n, unsigned char replace);
size_t zsv_strencode(unsigned char *s, size_t n, unsigned char replace,
int (*malformed_handler)(void *, const unsigned char *s, size_t n, size_t offset), void *handler_ctx);

size_t zsv_strip_trailing_zeros(const char *s, size_t len);

Expand Down
2 changes: 1 addition & 1 deletion src/zsv.c
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
#include "zsv_internal.c"

ZSV_EXPORT
const char *zsv_lib_version() {
const char *zsv_lib_version(void) {
return VERSION;
}

Expand Down

0 comments on commit 65d86fa

Please sign in to comment.