@@ -36,6 +36,7 @@ from cpython.ref cimport (
3636from cpython.unicode cimport (
3737 PyUnicode_AsUTF8String,
3838 PyUnicode_Decode,
39+ PyUnicode_DecodeUTF8,
3940)
4041
4142
@@ -321,7 +322,6 @@ cdef class TextReader:
321322 bint na_filter, keep_default_na, verbose, has_usecols, has_mi_columns
322323 uint64_t parser_start
323324 list clocks
324- char * c_encoding
325325 const char * encoding_errors
326326 kh_str_starts_t * false_set
327327 kh_str_starts_t * true_set
@@ -381,7 +381,6 @@ cdef class TextReader:
381381 encoding_errors = b" strict" ):
382382
383383 # set encoding for native Python and C library
384- self .c_encoding = NULL
385384 if isinstance (encoding_errors, str ):
386385 encoding_errors = encoding_errors.encode(" utf-8" )
387386 Py_INCREF(encoding_errors)
@@ -638,7 +637,6 @@ cdef class TextReader:
638637 char * word
639638 object name, old_name
640639 uint64_t hr, data_line = 0
641- StringPath path = _string_path(self .c_encoding)
642640 list header = []
643641 set unnamed_cols = set ()
644642
@@ -678,8 +676,8 @@ cdef class TextReader:
678676 for i in range (field_count):
679677 word = self .parser.words[start + i]
680678
681- name = PyUnicode_Decode (word, strlen(word),
682- self .c_encoding, self .encoding_errors)
679+ name = PyUnicode_DecodeUTF8 (word, strlen(word),
680+ self .encoding_errors)
683681
684682 # We use this later when collecting placeholder names.
685683 old_name = name
@@ -987,8 +985,7 @@ cdef class TextReader:
987985 f" for column {name} - only the converter will "
988986 f" be used" ), ParserWarning,
989987 stacklevel = 5 )
990- results[i] = _apply_converter(conv, self .parser, i, start, end,
991- self .c_encoding)
988+ results[i] = _apply_converter(conv, self .parser, i, start, end)
992989 continue
993990
994991 # Collect the list of NaN values associated with the column.
@@ -1102,8 +1099,7 @@ cdef class TextReader:
11021099 # TODO: I suspect that _categorical_convert could be
11031100 # optimized when dtype is an instance of CategoricalDtype
11041101 codes, cats, na_count = _categorical_convert(
1105- self .parser, i, start, end, na_filter,
1106- na_hashset, self .c_encoding)
1102+ self .parser, i, start, end, na_filter, na_hashset)
11071103
11081104 # Method accepts list of strings, not encoded ones.
11091105 true_values = [x.decode() for x in self .true_values]
@@ -1199,14 +1195,8 @@ cdef class TextReader:
11991195 cdef _string_convert(self , Py_ssize_t i, int64_t start, int64_t end,
12001196 bint na_filter, kh_str_starts_t * na_hashset):
12011197
1202- cdef StringPath path = _string_path(self .c_encoding)
1203-
1204- if path == UTF8:
1205- return _string_box_utf8(self .parser, i, start, end, na_filter,
1206- na_hashset, self .encoding_errors)
1207- elif path == ENCODED:
1208- return _string_box_decode(self .parser, i, start, end,
1209- na_filter, na_hashset, self .c_encoding)
1198+ return _string_box_utf8(self .parser, i, start, end, na_filter,
1199+ na_hashset, self .encoding_errors)
12101200
12111201 def _get_converter (self , i , name ):
12121202 if self .converters is None :
@@ -1336,18 +1326,6 @@ def _maybe_upcast(arr):
13361326 return arr
13371327
13381328
1339- cdef enum StringPath:
1340- UTF8
1341- ENCODED
1342-
1343-
1344- # factored out logic to pick string converter
1345- cdef inline StringPath _string_path(char * encoding):
1346- if encoding != NULL and encoding != b" utf-8" :
1347- return ENCODED
1348- return UTF8
1349-
1350-
13511329# ----------------------------------------------------------------------
13521330# Type conversions / inference support code
13531331
@@ -1406,68 +1384,10 @@ cdef _string_box_utf8(parser_t *parser, int64_t col,
14061384 return result, na_count
14071385
14081386
1409- cdef _string_box_decode(parser_t * parser, int64_t col,
1410- int64_t line_start, int64_t line_end,
1411- bint na_filter, kh_str_starts_t * na_hashset,
1412- char * encoding):
1413- cdef:
1414- int na_count = 0
1415- Py_ssize_t i, size, lines
1416- coliter_t it
1417- const char * word = NULL
1418- ndarray[object ] result
1419-
1420- int ret = 0
1421- kh_strbox_t * table
1422-
1423- char * errors = " strict"
1424-
1425- object pyval
1426-
1427- object NA = na_values[np.object_]
1428- khiter_t k
1429-
1430- table = kh_init_strbox()
1431- lines = line_end - line_start
1432- result = np.empty(lines, dtype = np.object_)
1433- coliter_setup(& it, parser, col, line_start)
1434-
1435- for i in range (lines):
1436- COLITER_NEXT(it, word)
1437-
1438- if na_filter:
1439- if kh_get_str_starts_item(na_hashset, word):
1440- # in the hash table
1441- na_count += 1
1442- result[i] = NA
1443- continue
1444-
1445- k = kh_get_strbox(table, word)
1446-
1447- # in the hash table
1448- if k != table.n_buckets:
1449- # this increments the refcount, but need to test
1450- pyval = < object > table.vals[k]
1451- else :
1452- # box it. new ref?
1453- size = strlen(word)
1454- pyval = PyUnicode_Decode(word, size, encoding, errors)
1455-
1456- k = kh_put_strbox(table, word, & ret)
1457- table.vals[k] = < PyObject * > pyval
1458-
1459- result[i] = pyval
1460-
1461- kh_destroy_strbox(table)
1462-
1463- return result, na_count
1464-
1465-
14661387@ cython.boundscheck (False )
14671388cdef _categorical_convert(parser_t * parser, int64_t col,
14681389 int64_t line_start, int64_t line_end,
1469- bint na_filter, kh_str_starts_t * na_hashset,
1470- char * encoding):
1390+ bint na_filter, kh_str_starts_t * na_hashset):
14711391 " Convert column data into codes, categories"
14721392 cdef:
14731393 int na_count = 0
@@ -1480,7 +1400,6 @@ cdef _categorical_convert(parser_t *parser, int64_t col,
14801400 int64_t current_category = 0
14811401
14821402 char * errors = " strict"
1483- StringPath path = _string_path(encoding)
14841403
14851404 int ret = 0
14861405 kh_str_t * table
@@ -1516,16 +1435,9 @@ cdef _categorical_convert(parser_t *parser, int64_t col,
15161435
15171436 # parse and box categories to python strings
15181437 result = np.empty(table.n_occupied, dtype = np.object_)
1519- if path == ENCODED:
1520- for k in range (table.n_buckets):
1521- if kh_exist_str(table, k):
1522- size = strlen(table.keys[k])
1523- result[table.vals[k]] = PyUnicode_Decode(
1524- table.keys[k], size, encoding, errors)
1525- elif path == UTF8:
1526- for k in range (table.n_buckets):
1527- if kh_exist_str(table, k):
1528- result[table.vals[k]] = PyUnicode_FromString(table.keys[k])
1438+ for k in range (table.n_buckets):
1439+ if kh_exist_str(table, k):
1440+ result[table.vals[k]] = PyUnicode_FromString(table.keys[k])
15291441
15301442 kh_destroy_str(table)
15311443 return np.asarray(codes), result, na_count
@@ -2064,13 +1976,11 @@ for k in list(na_values):
20641976
20651977
20661978cdef _apply_converter(object f, parser_t * parser, int64_t col,
2067- int64_t line_start, int64_t line_end,
2068- char * c_encoding):
1979+ int64_t line_start, int64_t line_end):
20691980 cdef:
20701981 Py_ssize_t i, lines
20711982 coliter_t it
20721983 const char * word = NULL
2073- char * errors = " strict"
20741984 ndarray[object ] result
20751985 object val
20761986
@@ -2079,17 +1989,10 @@ cdef _apply_converter(object f, parser_t *parser, int64_t col,
20791989
20801990 coliter_setup(& it, parser, col, line_start)
20811991
2082- if c_encoding == NULL or c_encoding == b' utf-8' :
2083- for i in range (lines):
2084- COLITER_NEXT(it, word)
2085- val = PyUnicode_FromString(word)
2086- result[i] = f(val)
2087- else :
2088- for i in range (lines):
2089- COLITER_NEXT(it, word)
2090- val = PyUnicode_Decode(word, strlen(word),
2091- c_encoding, errors)
2092- result[i] = f(val)
1992+ for i in range (lines):
1993+ COLITER_NEXT(it, word)
1994+ val = PyUnicode_FromString(word)
1995+ result[i] = f(val)
20931996
20941997 return lib.maybe_convert_objects(result)
20951998
0 commit comments