Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions pandas/_libs/include/pandas/parser/pd_parser.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,10 @@ typedef struct {
int (*parser_trim_buffers)(parser_t *);
int (*tokenize_all_rows)(parser_t *, const char *);
int (*tokenize_nrows)(parser_t *, size_t, const char *);
int64_t (*str_to_int64)(const char *, int64_t, int64_t, int *, char);
int64_t (*str_to_int64)(const char *, int64_t, int64_t, TokenizerError *,
char);
uint64_t (*str_to_uint64)(uint_state *, const char *, int64_t, uint64_t,
int *, char);
TokenizerError *, char);
double (*xstrtod)(const char *, char **, char, char, char, int, int *, int *);
double (*precise_xstrtod)(const char *, char **, char, char, char, int, int *,
int *);
Expand Down
15 changes: 9 additions & 6 deletions pandas/_libs/include/pandas/parser/tokenizer.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,6 @@ See LICENSE for the license
#define PY_SSIZE_T_CLEAN
#include <Python.h>

#define ERROR_NO_DIGITS 1
#define ERROR_OVERFLOW 2
#define ERROR_INVALID_CHARS 3

#include <stdint.h>

#define STREAM_INIT_SIZE 32
Expand Down Expand Up @@ -50,6 +46,13 @@ See LICENSE for the license
* duplication of some file I/O.
*/

/*
 * Status code written by str_to_int64() and str_to_uint64() through their
 * `error` out-parameter.
 *
 * TOKENIZER_OK is zero, so callers can compare the result against it to
 * detect any failure.
 */
typedef enum {
  TOKENIZER_OK = 0,
  ERROR_NO_DIGITS = 1,
  ERROR_OVERFLOW = 2,
  ERROR_INVALID_CHARS = 3,
} TokenizerError;

typedef enum {
START_RECORD,
START_FIELD,
Expand Down Expand Up @@ -209,9 +212,9 @@ void uint_state_init(uint_state *self);
int uint64_conflict(uint_state *self);

uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,
uint64_t uint_max, int *error, char tsep);
uint64_t uint_max, TokenizerError *error, char tsep);
int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
int *error, char tsep);
TokenizerError *error, char tsep);
double xstrtod(const char *p, char **q, char decimal, char sci, char tsep,
int skip_trailing, int *error, int *maybe_int);
double precise_xstrtod(const char *p, char **q, char decimal, char sci,
Expand Down
138 changes: 50 additions & 88 deletions pandas/_libs/parsers.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,10 @@ cdef extern from "pandas/parser/tokenizer.h":
SKIP_LINE
FINISHED

enum: ERROR_OVERFLOW
# Cython-side declaration of the C TokenizerError enum; lists TOKENIZER_OK,
# ERROR_OVERFLOW and ERROR_INVALID_CHARS.
ctypedef enum TokenizerError:
TOKENIZER_OK,
ERROR_OVERFLOW,
ERROR_INVALID_CHARS

ctypedef enum BadLineHandleMethod:
ERROR,
Expand Down Expand Up @@ -282,9 +285,9 @@ cdef extern from "pandas/parser/pd_parser.h":
int tokenize_nrows(parser_t *self, size_t nrows, const char *encoding_errors) nogil

int64_t str_to_int64(char *p_item, int64_t int_min,
int64_t int_max, int *error, char tsep) nogil
int64_t int_max, TokenizerError *error, char tsep) nogil
uint64_t str_to_uint64(uint_state *state, char *p_item, int64_t int_max,
uint64_t uint_max, int *error, char tsep) nogil
uint64_t uint_max, TokenizerError *error, char tsep) nogil

double xstrtod(const char *p, char **q, char decimal,
char sci, char tsep, int skip_trailing,
Expand Down Expand Up @@ -1058,7 +1061,7 @@ cdef class TextReader:
if col_dtype is not None:
col_res, na_count = self._convert_with_dtype(
col_dtype, i, start, end, na_filter,
1, na_hashset, na_fset)
1, na_hashset, na_fset, False)

# Fallback on the parse (e.g. we requested int dtype,
# but it's actually a float).
Expand All @@ -1069,30 +1072,34 @@ cdef class TextReader:
return self._string_convert(i, start, end, na_filter, na_hashset)
else:
col_res = None
maybe_int = True
for dt in self.dtype_cast_order:
if (dt.kind in "iu" and
self._column_has_float(i, start, end, na_filter, na_hashset)):
if not maybe_int and dt.kind in "iu":
continue

try:
col_res, na_count = self._convert_with_dtype(
dt, i, start, end, na_filter, 0, na_hashset, na_fset)
except ValueError:
# This error is raised from trying to convert to uint64,
# and we discover that we cannot convert to any numerical
# dtype successfully. As a result, we leave the data
# column AS IS with object dtype.
col_res, na_count = self._convert_with_dtype(
np.dtype("object"), i, start, end, 0,
0, na_hashset, na_fset)
dt, i, start, end, na_filter, 0, na_hashset, na_fset, True)
except ValueError as e:
if str(e) == "Number is not int":
maybe_int = False
continue
else:
# This error is raised from trying to convert to uint64,
# and we discover that we cannot convert to any numerical
# dtype successfully. As a result, we leave the data
# column AS IS with object dtype.
col_res, na_count = self._convert_with_dtype(
np.dtype("object"), i, start, end, 0,
0, na_hashset, na_fset, False)
except OverflowError:
try:
col_res, na_count = _try_pylong(self.parser, i, start,
end, na_filter, na_hashset)
except ValueError:
col_res, na_count = self._convert_with_dtype(
np.dtype("object"), i, start, end, 0,
0, na_hashset, na_fset)
0, na_hashset, na_fset, False)

if col_res is not None:
break
Expand Down Expand Up @@ -1140,7 +1147,7 @@ cdef class TextReader:
bint na_filter,
bint user_dtype,
kh_str_starts_t *na_hashset,
set na_fset):
set na_fset, bint raise_on_float):
if isinstance(dtype, CategoricalDtype):
# TODO: I suspect that _categorical_convert could be
# optimized when dtype is an instance of CategoricalDtype
Expand Down Expand Up @@ -1181,14 +1188,14 @@ cdef class TextReader:

elif dtype.kind in "iu":
try:
result, na_count = _try_int64(self.parser, i, start,
end, na_filter, na_hashset)
result, na_count = _try_int64(self.parser, i, start, end,
na_filter, na_hashset, raise_on_float)
if user_dtype and na_count is not None:
if na_count > 0:
raise ValueError(f"Integer column has NA values in column {i}")
except OverflowError:
result = _try_uint64(self.parser, i, start, end,
na_filter, na_hashset)
na_filter, na_hashset, raise_on_float)
na_count = 0

if result is not None and dtype != "int64":
Expand Down Expand Up @@ -1351,59 +1358,6 @@ cdef class TextReader:
else:
return None

cdef bint _column_has_float(self, Py_ssize_t col,
                            int64_t start, int64_t end,
                            bint na_filter, kh_str_starts_t *na_hashset):
    """
    Decide whether column ``col`` holds a float-looking token in the rows
    from ``start`` up to (not including) ``end``.

    Tokens are examined in order, one character at a time. A token made
    up only of leading ' ', '+', '-' characters followed by digits settles
    nothing and the scan moves on to the next token. The first character
    that settles the question ends the whole scan.

    Returns
    -------
    bint
        True when the parser's decimal character is reached, or an 'e'/'E'
        is reached after the token's first digit. False when a non-numeric
        character is reached first, or when no token settles the question.
    """
    cdef:
        Py_ssize_t row, j, nrows = end - start
        coliter_t it
        const char *word = NULL
        const char *leading_chars = " +-"
        const char *digits = "0123456789"
        const char *exponent_chars = "eE"
        char null_byte = 0

    coliter_setup(&it, self.parser, col, start)

    for row in range(nrows):
        COLITER_NEXT(it, word)

        # Tokens matched by the NA set are skipped without being inspected.
        if na_filter and kh_get_str_starts_item(na_hashset, word):
            continue

        found_first_digit = False
        j = 0
        while word[j] != null_byte:
            # The decimal character wins regardless of what came before it.
            if word[j] == self.parser.decimal:
                return True

            if not found_first_digit:
                # Still in the prefix: only sign/space characters or the
                # first digit are acceptable here.
                if word[j] in leading_chars:
                    pass
                elif word[j] in digits:
                    found_first_digit = True
                else:
                    # word isn't numeric
                    return False
            elif word[j] in exponent_chars:
                # digits followed by an exponent marker indicate a float
                return True
            elif word[j] not in digits:
                # started out numeric, but this character says otherwise
                return False
            elif word[j] in digits:
                pass
            else:
                raise AssertionError(
                    f"Unhandled case {word[j]=} {found_first_digit=}"
                )
            j += 1

    return False

# Factor out code common to TextReader.__dealloc__ and TextReader.close
# It cannot be a class method, since calling self.close() in __dealloc__
# which causes a class attribute lookup and violates best practices
Expand Down Expand Up @@ -1800,7 +1754,8 @@ cdef int _try_double_nogil(parser_t *parser,

cdef _try_uint64(parser_t *parser, int64_t col,
int64_t line_start, int64_t line_end,
bint na_filter, kh_str_starts_t *na_hashset):
bint na_filter, kh_str_starts_t *na_hashset,
bint raise_on_float):
cdef:
int error
Py_ssize_t lines
Expand All @@ -1822,7 +1777,10 @@ cdef _try_uint64(parser_t *parser, int64_t col,
if error == ERROR_OVERFLOW:
# Can't get the word variable
raise OverflowError("Overflow")
return None
elif raise_on_float and error == ERROR_INVALID_CHARS:
raise ValueError("Number is not int")
elif not raise_on_float or error != ERROR_INVALID_CHARS:
return None

if uint64_conflict(&state):
raise ValueError("Cannot convert to numerical dtype")
Expand All @@ -1839,7 +1797,7 @@ cdef int _try_uint64_nogil(parser_t *parser, int64_t col,
const kh_str_starts_t *na_hashset,
uint64_t *data, uint_state *state) nogil:
cdef:
int error
TokenizerError error = TOKENIZER_OK
Py_ssize_t i, lines = line_end - line_start
coliter_t it
const char *word = NULL
Expand Down Expand Up @@ -1872,9 +1830,10 @@ cdef int _try_uint64_nogil(parser_t *parser, int64_t col,

cdef _try_int64(parser_t *parser, int64_t col,
int64_t line_start, int64_t line_end,
bint na_filter, kh_str_starts_t *na_hashset):
bint na_filter, kh_str_starts_t *na_hashset, bint raise_on_float):
cdef:
int error, na_count = 0
TokenizerError error = TOKENIZER_OK
int na_count = 0
Py_ssize_t lines
coliter_t it
int64_t *data
Expand All @@ -1892,18 +1851,21 @@ cdef _try_int64(parser_t *parser, int64_t col,
if error == ERROR_OVERFLOW:
# Can't get the word variable
raise OverflowError("Overflow")
return None, None
elif raise_on_float and error == ERROR_INVALID_CHARS:
raise ValueError("Number is not int")
elif not raise_on_float or error != ERROR_INVALID_CHARS:
return None, None

return result, na_count


cdef int _try_int64_nogil(parser_t *parser, int64_t col,
int64_t line_start,
int64_t line_end, bint na_filter,
const kh_str_starts_t *na_hashset, int64_t NA,
int64_t *data, int *na_count) nogil:
cdef TokenizerError _try_int64_nogil(parser_t *parser, int64_t col,
int64_t line_start,
int64_t line_end, bint na_filter,
const kh_str_starts_t *na_hashset, int64_t NA,
int64_t *data, int *na_count) nogil:
cdef:
int error
TokenizerError error = TOKENIZER_OK
Py_ssize_t i, lines = line_end - line_start
coliter_t it
const char *word = NULL
Expand All @@ -1922,17 +1884,17 @@ cdef int _try_int64_nogil(parser_t *parser, int64_t col,

data[i] = str_to_int64(word, INT64_MIN, INT64_MAX,
&error, parser.thousands)
if error != 0:
if error != TOKENIZER_OK:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This pattern would definitely be cleaner with a macro that returns on a non-zero error code (doing that in a follow-up PR is fine).

return error
else:
for i in range(lines):
COLITER_NEXT(it, word)
data[i] = str_to_int64(word, INT64_MIN, INT64_MAX,
&error, parser.thousands)
if error != 0:
if error != TOKENIZER_OK:
return error

return 0
return error

cdef _try_pylong(parser_t *parser, Py_ssize_t col,
int64_t line_start, int64_t line_end,
Expand Down
Loading
Loading