diff --git a/pandas/tests/io/parser/__init__.py b/pandas/tests/io/parser/__init__.py
deleted file mode 100644
index e69de29bb2d1d..0000000000000
diff --git a/pandas/tests/io/parser/common/__init__.py b/pandas/tests/io/parser/common/__init__.py
deleted file mode 100644
index e69de29bb2d1d..0000000000000
diff --git a/pandas/tests/io/parser/common/test_chunksize.py b/pandas/tests/io/parser/common/test_chunksize.py
deleted file mode 100644
index 6be7269cb8433..0000000000000
--- a/pandas/tests/io/parser/common/test_chunksize.py
+++ /dev/null
@@ -1,282 +0,0 @@
-"""
-Tests that work on both the Python and C engines but do not have a
-specific classification into the other test modules.
-"""
-from io import StringIO
-
-import numpy as np
-import pytest
-
-from pandas.errors import DtypeWarning
-
-from pandas import (
-    DataFrame,
-    concat,
-)
-import pandas._testing as tm
-
-pytestmark = pytest.mark.usefixtures("pyarrow_skip")
-
-
-@pytest.mark.parametrize("index_col", [0, "index"])
-def test_read_chunksize_with_index(all_parsers, index_col):
-    parser = all_parsers
-    data = """index,A,B,C,D
-foo,2,3,4,5
-bar,7,8,9,10
-baz,12,13,14,15
-qux,12,13,14,15
-foo2,12,13,14,15
-bar2,12,13,14,15
-"""
-
-    expected = DataFrame(
-        [
-            ["foo", 2, 3, 4, 5],
-            ["bar", 7, 8, 9, 10],
-            ["baz", 12, 13, 14, 15],
-            ["qux", 12, 13, 14, 15],
-            ["foo2", 12, 13, 14, 15],
-            ["bar2", 12, 13, 14, 15],
-        ],
-        columns=["index", "A", "B", "C", "D"],
-    )
-    expected = expected.set_index("index")
-
-    with parser.read_csv(StringIO(data), index_col=0, chunksize=2) as reader:
-        chunks = list(reader)
-    tm.assert_frame_equal(chunks[0], expected[:2])
-    tm.assert_frame_equal(chunks[1], expected[2:4])
-    tm.assert_frame_equal(chunks[2], expected[4:])
-
-
-@pytest.mark.parametrize("chunksize", [1.3, "foo", 0])
-def test_read_chunksize_bad(all_parsers, chunksize):
-    data = """index,A,B,C,D
-foo,2,3,4,5
-bar,7,8,9,10
-baz,12,13,14,15
-qux,12,13,14,15
-foo2,12,13,14,15
-bar2,12,13,14,15
-"""
-    parser = all_parsers
-    msg = r"'chunksize' must be an integer >=1"
-
-    with pytest.raises(ValueError, match=msg):
-        with parser.read_csv(StringIO(data), chunksize=chunksize) as _:
-            pass
-
-
-@pytest.mark.parametrize("chunksize", [2, 8])
-def test_read_chunksize_and_nrows(all_parsers, chunksize):
-    # see gh-15755
-    data = """index,A,B,C,D
-foo,2,3,4,5
-bar,7,8,9,10
-baz,12,13,14,15
-qux,12,13,14,15
-foo2,12,13,14,15
-bar2,12,13,14,15
-"""
-    parser = all_parsers
-    kwargs = {"index_col": 0, "nrows": 5}
-
-    expected = parser.read_csv(StringIO(data), **kwargs)
-    with parser.read_csv(StringIO(data), chunksize=chunksize, **kwargs) as reader:
-        tm.assert_frame_equal(concat(reader), expected)
-
-
-def test_read_chunksize_and_nrows_changing_size(all_parsers):
-    data = """index,A,B,C,D
-foo,2,3,4,5
-bar,7,8,9,10
-baz,12,13,14,15
-qux,12,13,14,15
-foo2,12,13,14,15
-bar2,12,13,14,15
-"""
-    parser = all_parsers
-    kwargs = {"index_col": 0, "nrows": 5}
-
-    expected = parser.read_csv(StringIO(data), **kwargs)
-    with parser.read_csv(StringIO(data), chunksize=8, **kwargs) as reader:
-        tm.assert_frame_equal(reader.get_chunk(size=2), expected.iloc[:2])
-        tm.assert_frame_equal(reader.get_chunk(size=4), expected.iloc[2:5])
-
-        with pytest.raises(StopIteration, match=""):
-            reader.get_chunk(size=3)
-
-
-def test_get_chunk_passed_chunksize(all_parsers):
-    parser = all_parsers
-    data = """A,B,C
-1,2,3
-4,5,6
-7,8,9
-1,2,3"""
-
-    with parser.read_csv(StringIO(data), chunksize=2) as reader:
-        result = reader.get_chunk()
-
-    expected = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["A", "B", "C"])
-    tm.assert_frame_equal(result, expected)
-
-
-@pytest.mark.parametrize("kwargs", [{}, {"index_col": 0}])
-def test_read_chunksize_compat(all_parsers, kwargs):
-    # see gh-12185
-    data = """index,A,B,C,D
-foo,2,3,4,5
-bar,7,8,9,10
-baz,12,13,14,15
-qux,12,13,14,15
-foo2,12,13,14,15
-bar2,12,13,14,15
-"""
-    parser = all_parsers
-    result = parser.read_csv(StringIO(data), **kwargs)
-    with parser.read_csv(StringIO(data), chunksize=2, **kwargs) as reader:
-        tm.assert_frame_equal(concat(reader), result)
-
-
-def test_read_chunksize_jagged_names(all_parsers):
-    # see gh-23509
-    parser = all_parsers
-    data = "\n".join(["0"] * 7 + [",".join(["0"] * 10)])
-
-    expected = DataFrame([[0] + [np.nan] * 9] * 7 + [[0] * 10])
-    with parser.read_csv(StringIO(data), names=range(10), chunksize=4) as reader:
-        result = concat(reader)
-    tm.assert_frame_equal(result, expected)
-
-
-def test_chunk_begins_with_newline_whitespace(all_parsers):
-    # see gh-10022
-    parser = all_parsers
-    data = "\n hello\nworld\n"
-
-    result = parser.read_csv(StringIO(data), header=None)
-    expected = DataFrame([" hello", "world"])
-    tm.assert_frame_equal(result, expected)
-
-
-@pytest.mark.slow
-def test_chunks_have_consistent_numerical_type(all_parsers):
-    parser = all_parsers
-    integers = [str(i) for i in range(499999)]
-    data = "a\n" + "\n".join(integers + ["1.0", "2.0"] + integers)
-
-    # Coercions should work without warnings.
-    with tm.assert_produces_warning(None):
-        result = parser.read_csv(StringIO(data))
-
-    assert type(result.a[0]) is np.float64
-    assert result.a.dtype == float
-
-
-def test_warn_if_chunks_have_mismatched_type(all_parsers):
-    warning_type = None
-    parser = all_parsers
-    size = 10000
-
-    # see gh-3866: if chunks are different types and can't
-    # be coerced using numerical types, then issue warning.
-    if parser.engine == "c" and parser.low_memory:
-        warning_type = DtypeWarning
-        # Use larger size to hit warning path
-        size = 499999
-
-    integers = [str(i) for i in range(size)]
-    data = "a\n" + "\n".join(integers + ["a", "b"] + integers)
-
-    buf = StringIO(data)
-
-    df = parser.read_csv_check_warnings(
-        warning_type,
-        r"Columns \(0\) have mixed types. "
-        "Specify dtype option on import or set low_memory=False.",
-        buf,
-    )
-
-    assert df.a.dtype == object
-
-
-@pytest.mark.parametrize("iterator", [True, False])
-def test_empty_with_nrows_chunksize(all_parsers, iterator):
-    # see gh-9535
-    parser = all_parsers
-    expected = DataFrame(columns=["foo", "bar"])
-
-    nrows = 10
-    data = StringIO("foo,bar\n")
-
-    if iterator:
-        with parser.read_csv(data, chunksize=nrows) as reader:
-            result = next(iter(reader))
-    else:
-        result = parser.read_csv(data, nrows=nrows)
-
-    tm.assert_frame_equal(result, expected)
-
-
-def test_read_csv_memory_growth_chunksize(all_parsers):
-    # see gh-24805
-    #
-    # Let's just make sure that we don't crash
-    # as we iteratively process all chunks.
-    parser = all_parsers
-
-    with tm.ensure_clean() as path:
-        with open(path, "w", encoding="utf-8") as f:
-            for i in range(1000):
-                f.write(str(i) + "\n")
-
-        with parser.read_csv(path, chunksize=20) as result:
-            for _ in result:
-                pass
-
-
-def test_chunksize_with_usecols_second_block_shorter(all_parsers):
-    # GH#21211
-    parser = all_parsers
-    data = """1,2,3,4
-5,6,7,8
-9,10,11
-"""
-
-    result_chunks = parser.read_csv(
-        StringIO(data),
-        names=["a", "b"],
-        chunksize=2,
-        usecols=[0, 1],
-        header=None,
-    )
-
-    expected_frames = [
-        DataFrame({"a": [1, 5], "b": [2, 6]}),
-        DataFrame({"a": [9], "b": [10]}, index=[2]),
-    ]
-
-    for i, result in enumerate(result_chunks):
-        tm.assert_frame_equal(result, expected_frames[i])
-
-
-def test_chunksize_second_block_shorter(all_parsers):
-    # GH#21211
-    parser = all_parsers
-    data = """a,b,c,d
-1,2,3,4
-5,6,7,8
-9,10,11
-"""
-
-    result_chunks = parser.read_csv(StringIO(data), chunksize=2)
-
-    expected_frames = [
-        DataFrame({"a": [1, 5], "b": [2, 6], "c": [3, 7], "d": [4, 8]}),
-        DataFrame({"a": [9], "b": [10], "c": [11], "d": [np.nan]}, index=[2]),
-    ]
-
-    for i, result in enumerate(result_chunks):
-        tm.assert_frame_equal(result, expected_frames[i])
diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py
deleted file mode 100644
index 9083d725887f1..0000000000000
--- a/pandas/tests/io/parser/common/test_common_basic.py
+++ /dev/null
@@ -1,864 +0,0 @@
-"""
-Tests that work on both the Python and C engines but do not have a
-specific classification into the other test modules.
-"""
-from datetime import datetime
-from inspect import signature
-from io import StringIO
-import os
-from pathlib import Path
-import sys
-
-import numpy as np
-import pytest
-
-from pandas.errors import (
-    EmptyDataError,
-    ParserError,
-    ParserWarning,
-)
-
-from pandas import (
-    DataFrame,
-    Index,
-    Timestamp,
-    compat,
-)
-import pandas._testing as tm
-
-from pandas.io.parsers import TextFileReader
-from pandas.io.parsers.c_parser_wrapper import CParserWrapper
-
-xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
-skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
-
-
-def test_override_set_noconvert_columns():
-    # see gh-17351
-    #
-    # Usecols needs to be sorted in _set_noconvert_columns based
-    # on the test_usecols_with_parse_dates test from test_usecols.py
-    class MyTextFileReader(TextFileReader):
-        def __init__(self) -> None:
-            self._currow = 0
-            self.squeeze = False
-
-    class MyCParserWrapper(CParserWrapper):
-        def _set_noconvert_columns(self):
-            if self.usecols_dtype == "integer":
-                # self.usecols is a set, which is documented as unordered
-                # but in practice, a CPython set of integers is sorted.
-                # In other implementations this assumption does not hold.
-                # The following code simulates a different order, which
-                # before GH 17351 would cause the wrong columns to be
-                # converted via the parse_dates parameter
-                self.usecols = list(self.usecols)
-                self.usecols.reverse()
-            return CParserWrapper._set_noconvert_columns(self)
-
-    data = """a,b,c,d,e
-0,1,2014-01-01,09:00,4
-0,1,2014-01-02,10:00,4"""
-
-    parse_dates = [[1, 2]]
-    cols = {
-        "a": [0, 0],
-        "c_d": [Timestamp("2014-01-01 09:00:00"), Timestamp("2014-01-02 10:00:00")],
-    }
-    expected = DataFrame(cols, columns=["c_d", "a"])
-
-    parser = MyTextFileReader()
-    parser.options = {
-        "usecols": [0, 2, 3],
-        "parse_dates": parse_dates,
-        "delimiter": ",",
-    }
-    parser.engine = "c"
-    parser._engine = MyCParserWrapper(StringIO(data), **parser.options)
-
-    result = parser.read()
-    tm.assert_frame_equal(result, expected)
-
-
-def test_read_csv_local(all_parsers, csv1):
-    prefix = "file:///" if compat.is_platform_windows() else "file://"
-    parser = all_parsers
-
-    fname = prefix + str(os.path.abspath(csv1))
-    result = parser.read_csv(fname, index_col=0, parse_dates=True)
-
-    expected = DataFrame(
-        [
-            [0.980269, 3.685731, -0.364216805298, -1.159738],
-            [1.047916, -0.041232, -0.16181208307, 0.212549],
-            [0.498581, 0.731168, -0.537677223318, 1.346270],
-            [1.120202, 1.567621, 0.00364077397681, 0.675253],
-            [-0.487094, 0.571455, -1.6116394093, 0.103469],
-            [0.836649, 0.246462, 0.588542635376, 1.062782],
-            [-0.157161, 1.340307, 1.1957779562, -1.097007],
-        ],
-        columns=["A", "B", "C", "D"],
-        index=Index(
-            [
-                datetime(2000, 1, 3),
-                datetime(2000, 1, 4),
-                datetime(2000, 1, 5),
-                datetime(2000, 1, 6),
-                datetime(2000, 1, 7),
-                datetime(2000, 1, 10),
-                datetime(2000, 1, 11),
-            ],
-            name="index",
-        ),
-    )
-    tm.assert_frame_equal(result, expected)
-
-
-@xfail_pyarrow
-def test_1000_sep(all_parsers):
-    parser = all_parsers
-    data = """A|B|C
-1|2,334|5
-10|13|10.
-""" - expected = DataFrame({"A": [1, 10], "B": [2334, 13], "C": [5, 10.0]}) - - result = parser.read_csv(StringIO(data), sep="|", thousands=",") - tm.assert_frame_equal(result, expected) - - -@xfail_pyarrow -def test_unnamed_columns(all_parsers): - data = """A,B,C,, -1,2,3,4,5 -6,7,8,9,10 -11,12,13,14,15 -""" - parser = all_parsers - expected = DataFrame( - [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10], [11, 12, 13, 14, 15]], - dtype=np.int64, - columns=["A", "B", "C", "Unnamed: 3", "Unnamed: 4"], - ) - result = parser.read_csv(StringIO(data)) - tm.assert_frame_equal(result, expected) - - -def test_csv_mixed_type(all_parsers): - data = """A,B,C -a,1,2 -b,3,4 -c,4,5 -""" - parser = all_parsers - expected = DataFrame({"A": ["a", "b", "c"], "B": [1, 3, 4], "C": [2, 4, 5]}) - result = parser.read_csv(StringIO(data)) - tm.assert_frame_equal(result, expected) - - -@xfail_pyarrow -def test_read_csv_low_memory_no_rows_with_index(all_parsers): - # see gh-21141 - parser = all_parsers - - if not parser.low_memory: - pytest.skip("This is a low-memory specific test") - - data = """A,B,C -1,1,1,2 -2,2,3,4 -3,3,4,5 -""" - result = parser.read_csv(StringIO(data), low_memory=True, index_col=0, nrows=0) - expected = DataFrame(columns=["A", "B", "C"]) - tm.assert_frame_equal(result, expected) - - -def test_read_csv_dataframe(all_parsers, csv1): - parser = all_parsers - result = parser.read_csv(csv1, index_col=0, parse_dates=True) - - expected = DataFrame( - [ - [0.980269, 3.685731, -0.364216805298, -1.159738], - [1.047916, -0.041232, -0.16181208307, 0.212549], - [0.498581, 0.731168, -0.537677223318, 1.346270], - [1.120202, 1.567621, 0.00364077397681, 0.675253], - [-0.487094, 0.571455, -1.6116394093, 0.103469], - [0.836649, 0.246462, 0.588542635376, 1.062782], - [-0.157161, 1.340307, 1.1957779562, -1.097007], - ], - columns=["A", "B", "C", "D"], - index=Index( - [ - datetime(2000, 1, 3), - datetime(2000, 1, 4), - datetime(2000, 1, 5), - datetime(2000, 1, 6), - datetime(2000, 1, 7), - datetime(2000, 1, 10), - datetime(2000, 1, 11), - ], - name="index", - ), - ) - tm.assert_frame_equal(result, expected) - - -@xfail_pyarrow -@pytest.mark.parametrize("nrows", [3, 3.0]) -def test_read_nrows(all_parsers, nrows): - # see gh-10476 - data = """index,A,B,C,D -foo,2,3,4,5 -bar,7,8,9,10 -baz,12,13,14,15 -qux,12,13,14,15 -foo2,12,13,14,15 -bar2,12,13,14,15 -""" - expected = DataFrame( - [["foo", 2, 3, 4, 5], ["bar", 7, 8, 9, 10], ["baz", 12, 13, 14, 15]], - columns=["index", "A", "B", "C", "D"], - ) - parser = all_parsers - - result = parser.read_csv(StringIO(data), nrows=nrows) - tm.assert_frame_equal(result, expected) - - -@xfail_pyarrow -@pytest.mark.parametrize("nrows", [1.2, "foo", -1]) -def test_read_nrows_bad(all_parsers, nrows): - data = """index,A,B,C,D -foo,2,3,4,5 -bar,7,8,9,10 -baz,12,13,14,15 -qux,12,13,14,15 -foo2,12,13,14,15 -bar2,12,13,14,15 -""" - msg = r"'nrows' must be an integer >=0" - parser = all_parsers - - with pytest.raises(ValueError, match=msg): - parser.read_csv(StringIO(data), nrows=nrows) - - -def test_nrows_skipfooter_errors(all_parsers): - msg = "'skipfooter' not supported with 'nrows'" - data = "a\n1\n2\n3\n4\n5\n6" - parser = all_parsers - - with pytest.raises(ValueError, match=msg): - parser.read_csv(StringIO(data), skipfooter=1, nrows=5) - - -@xfail_pyarrow -def test_missing_trailing_delimiters(all_parsers): - parser = all_parsers - data = """A,B,C,D -1,2,3,4 -1,3,3, -1,4,5""" - - result = parser.read_csv(StringIO(data)) - expected = DataFrame( - [[1, 2, 3, 4], [1, 3, 3, np.nan], [1, 4, 5, 
np.nan]], - columns=["A", "B", "C", "D"], - ) - tm.assert_frame_equal(result, expected) - - -@xfail_pyarrow -def test_skip_initial_space(all_parsers): - data = ( - '"09-Apr-2012", "01:10:18.300", 2456026.548822908, 12849, ' - "1.00361, 1.12551, 330.65659, 0355626618.16711, 73.48821, " - "314.11625, 1917.09447, 179.71425, 80.000, 240.000, -350, " - "70.06056, 344.98370, 1, 1, -0.689265, -0.692787, " - "0.212036, 14.7674, 41.605, -9999.0, -9999.0, " - "-9999.0, -9999.0, -9999.0, -9999.0, 000, 012, 128" - ) - parser = all_parsers - - result = parser.read_csv( - StringIO(data), - names=list(range(33)), - header=None, - na_values=["-9999.0"], - skipinitialspace=True, - ) - expected = DataFrame( - [ - [ - "09-Apr-2012", - "01:10:18.300", - 2456026.548822908, - 12849, - 1.00361, - 1.12551, - 330.65659, - 355626618.16711, - 73.48821, - 314.11625, - 1917.09447, - 179.71425, - 80.0, - 240.0, - -350, - 70.06056, - 344.9837, - 1, - 1, - -0.689265, - -0.692787, - 0.212036, - 14.7674, - 41.605, - np.nan, - np.nan, - np.nan, - np.nan, - np.nan, - np.nan, - 0, - 12, - 128, - ] - ] - ) - tm.assert_frame_equal(result, expected) - - -@xfail_pyarrow -def test_trailing_delimiters(all_parsers): - # see gh-2442 - data = """A,B,C -1,2,3, -4,5,6, -7,8,9,""" - parser = all_parsers - result = parser.read_csv(StringIO(data), index_col=False) - - expected = DataFrame({"A": [1, 4, 7], "B": [2, 5, 8], "C": [3, 6, 9]}) - tm.assert_frame_equal(result, expected) - - -def test_escapechar(all_parsers): - # https://stackoverflow.com/questions/13824840/feature-request-for- - # pandas-read-csv - data = '''SEARCH_TERM,ACTUAL_URL -"bra tv board","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord" -"tv p\xc3\xa5 hjul","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord" -"SLAGBORD, \\"Bergslagen\\", IKEA:s 1700-tals series","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord"''' # noqa: E501 - - parser = all_parsers - result = parser.read_csv( - StringIO(data), escapechar="\\", quotechar='"', encoding="utf-8" - ) - - assert result["SEARCH_TERM"][2] == 'SLAGBORD, "Bergslagen", IKEA:s 1700-tals series' - - tm.assert_index_equal(result.columns, Index(["SEARCH_TERM", "ACTUAL_URL"])) - - -@xfail_pyarrow -def test_ignore_leading_whitespace(all_parsers): - # see gh-3374, gh-6607 - parser = all_parsers - data = " a b c\n 1 2 3\n 4 5 6\n 7 8 9" - result = parser.read_csv(StringIO(data), sep=r"\s+") - - expected = DataFrame({"a": [1, 4, 7], "b": [2, 5, 8], "c": [3, 6, 9]}) - tm.assert_frame_equal(result, expected) - - -@xfail_pyarrow -@pytest.mark.parametrize("usecols", [None, [0, 1], ["a", "b"]]) -def test_uneven_lines_with_usecols(all_parsers, usecols): - # see gh-12203 - parser = all_parsers - data = r"""a,b,c -0,1,2 -3,4,5,6,7 -8,9,10""" - - if usecols is None: - # Make sure that an error is still raised - # when the "usecols" parameter is not provided. 
-        msg = r"Expected \d+ fields in line \d+, saw \d+"
-        with pytest.raises(ParserError, match=msg):
-            parser.read_csv(StringIO(data))
-    else:
-        expected = DataFrame({"a": [0, 3, 8], "b": [1, 4, 9]})
-
-        result = parser.read_csv(StringIO(data), usecols=usecols)
-        tm.assert_frame_equal(result, expected)
-
-
-@xfail_pyarrow
-@pytest.mark.parametrize(
-    "data,kwargs,expected",
-    [
-        # First, check to see that the response of parser when faced with no
-        # provided columns raises the correct error, with or without usecols.
-        ("", {}, None),
-        ("", {"usecols": ["X"]}, None),
-        (
-            ",,",
-            {"names": ["Dummy", "X", "Dummy_2"], "usecols": ["X"]},
-            DataFrame(columns=["X"], index=[0], dtype=np.float64),
-        ),
-        (
-            "",
-            {"names": ["Dummy", "X", "Dummy_2"], "usecols": ["X"]},
-            DataFrame(columns=["X"]),
-        ),
-    ],
-)
-def test_read_empty_with_usecols(all_parsers, data, kwargs, expected):
-    # see gh-12493
-    parser = all_parsers
-
-    if expected is None:
-        msg = "No columns to parse from file"
-        with pytest.raises(EmptyDataError, match=msg):
-            parser.read_csv(StringIO(data), **kwargs)
-    else:
-        result = parser.read_csv(StringIO(data), **kwargs)
-        tm.assert_frame_equal(result, expected)
-
-
-@xfail_pyarrow
-@pytest.mark.parametrize(
-    "kwargs,expected",
-    [
-        # gh-8661, gh-8679: this should ignore six lines, including
-        # lines with trailing whitespace and blank lines.
-        (
-            {
-                "header": None,
-                "delim_whitespace": True,
-                "skiprows": [0, 1, 2, 3, 5, 6],
-                "skip_blank_lines": True,
-            },
-            DataFrame([[1.0, 2.0, 4.0], [5.1, np.nan, 10.0]]),
-        ),
-        # gh-8983: test skipping set of rows after a row with trailing spaces.
-        (
-            {
-                "delim_whitespace": True,
-                "skiprows": [1, 2, 3, 5, 6],
-                "skip_blank_lines": True,
-            },
-            DataFrame({"A": [1.0, 5.1], "B": [2.0, np.nan], "C": [4.0, 10]}),
-        ),
-    ],
-)
-def test_trailing_spaces(all_parsers, kwargs, expected):
-    data = "A B C  \nrandom line with trailing spaces    \nskip\n1,2,3\n1,2.,4.\nrandom line with trailing tabs\t\t\t\n   \n5.1,NaN,10.0\n"  # noqa: E501
-    parser = all_parsers
-
-    result = parser.read_csv(StringIO(data.replace(",", "  ")), **kwargs)
-    tm.assert_frame_equal(result, expected)
-
-
-def test_raise_on_sep_with_delim_whitespace(all_parsers):
-    # see gh-6607
-    data = "a b c\n1 2 3"
-    parser = all_parsers
-
-    with pytest.raises(ValueError, match="you can only specify one"):
-        parser.read_csv(StringIO(data), sep=r"\s", delim_whitespace=True)
-
-
-def test_read_filepath_or_buffer(all_parsers):
-    # see gh-43366
-    parser = all_parsers
-
-    with pytest.raises(TypeError, match="Expected file path name or file-like"):
-        parser.read_csv(filepath_or_buffer=b"input")
-
-
-@xfail_pyarrow
-@pytest.mark.parametrize("delim_whitespace", [True, False])
-def test_single_char_leading_whitespace(all_parsers, delim_whitespace):
-    # see gh-9710
-    parser = all_parsers
-    data = """\
-MyColumn
-a
-b
-a
-b\n"""
-
-    expected = DataFrame({"MyColumn": list("abab")})
-    result = parser.read_csv(
-        StringIO(data), skipinitialspace=True, delim_whitespace=delim_whitespace
-    )
-    tm.assert_frame_equal(result, expected)
-
-
-# Skip for now, actually only one test fails though, but its tricky to xfail
-@skip_pyarrow
-@pytest.mark.parametrize(
-    "sep,skip_blank_lines,exp_data",
-    [
-        (",", True, [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0], [-70.0, 0.4, 1.0]]),
-        (r"\s+", True, [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0], [-70.0, 0.4, 1.0]]),
-        (
-            ",",
-            False,
-            [
-                [1.0, 2.0, 4.0],
-                [np.nan, np.nan, np.nan],
-                [np.nan, np.nan, np.nan],
-                [5.0, np.nan, 10.0],
-                [np.nan, np.nan, np.nan],
-                [-70.0, 0.4, 1.0],
-            ],
-        ),
-    ],
-)
-def test_empty_lines(all_parsers, sep, skip_blank_lines, exp_data):
-    parser = all_parsers
-    data = """\
-A,B,C
-1,2.,4.
-
-
-5.,NaN,10.0
-
--70,.4,1
-"""
-
-    if sep == r"\s+":
-        data = data.replace(",", "  ")
-
-    result = parser.read_csv(
-        StringIO(data), sep=sep, skip_blank_lines=skip_blank_lines
-    )
-    expected = DataFrame(exp_data, columns=["A", "B", "C"])
-    tm.assert_frame_equal(result, expected)
-
-
-@xfail_pyarrow
-def test_whitespace_lines(all_parsers):
-    parser = all_parsers
-    data = """
-
-\t \t\t
-\t
-A,B,C
-\t 1,2.,4.
-5.,NaN,10.0
-"""
-    expected = DataFrame([[1, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"])
-    result = parser.read_csv(StringIO(data))
-    tm.assert_frame_equal(result, expected)
-
-
-@xfail_pyarrow
-@pytest.mark.parametrize(
-    "data,expected",
-    [
-        (
-            """ A B C D
-a 1 2 3 4
-b 1 2 3 4
-c 1 2 3 4
-""",
-            DataFrame(
-                [[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]],
-                columns=["A", "B", "C", "D"],
-                index=["a", "b", "c"],
-            ),
-        ),
-        (
-            " a b c\n1 2 3 \n4 5 6\n 7 8 9",
-            DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["a", "b", "c"]),
-        ),
-    ],
-)
-def test_whitespace_regex_separator(all_parsers, data, expected):
-    # see gh-6607
-    parser = all_parsers
-    result = parser.read_csv(StringIO(data), sep=r"\s+")
-    tm.assert_frame_equal(result, expected)
-
-
-def test_sub_character(all_parsers, csv_dir_path):
-    # see gh-16893
-    filename = os.path.join(csv_dir_path, "sub_char.csv")
-    expected = DataFrame([[1, 2, 3]], columns=["a", "\x1ab", "c"])
-
-    parser = all_parsers
-    result = parser.read_csv(filename)
-    tm.assert_frame_equal(result, expected)
-
-
-@pytest.mark.parametrize("filename", ["sé-es-vé.csv", "ru-sй.csv", "中文文件名.csv"])
-def test_filename_with_special_chars(all_parsers, filename):
-    # see gh-15086.
-    parser = all_parsers
-    df = DataFrame({"a": [1, 2, 3]})
-
-    with tm.ensure_clean(filename) as path:
-        df.to_csv(path, index=False)
-
-        result = parser.read_csv(path)
-        tm.assert_frame_equal(result, df)
-
-
-def test_read_table_same_signature_as_read_csv(all_parsers):
-    # GH-34976
-    parser = all_parsers
-
-    table_sign = signature(parser.read_table)
-    csv_sign = signature(parser.read_csv)
-
-    assert table_sign.parameters.keys() == csv_sign.parameters.keys()
-    assert table_sign.return_annotation == csv_sign.return_annotation
-
-    for key, csv_param in csv_sign.parameters.items():
-        table_param = table_sign.parameters[key]
-        if key == "sep":
-            assert csv_param.default == ","
-            assert table_param.default == "\t"
-            assert table_param.annotation == csv_param.annotation
-            assert table_param.kind == csv_param.kind
-            continue
-
-        assert table_param == csv_param
-
-
-def test_read_table_equivalency_to_read_csv(all_parsers):
-    # see gh-21948
-    # As of 0.25.0, read_table is undeprecated
-    parser = all_parsers
-    data = "a\tb\n1\t2\n3\t4"
-    expected = parser.read_csv(StringIO(data), sep="\t")
-    result = parser.read_table(StringIO(data))
-    tm.assert_frame_equal(result, expected)
-
-
-@pytest.mark.parametrize("read_func", ["read_csv", "read_table"])
-def test_read_csv_and_table_sys_setprofile(all_parsers, read_func):
-    # GH#41069
-    parser = all_parsers
-    data = "a b\n0 1"
-
-    sys.setprofile(lambda *a, **k: None)
-    result = getattr(parser, read_func)(StringIO(data))
-    sys.setprofile(None)
-
-    expected = DataFrame({"a b": ["0 1"]})
-    tm.assert_frame_equal(result, expected)
-
-
-@xfail_pyarrow
-def test_first_row_bom(all_parsers):
-    # see gh-26545
-    parser = all_parsers
-    data = '''\ufeff"Head1"\t"Head2"\t"Head3"'''
-
-    result = parser.read_csv(StringIO(data), delimiter="\t")
-    expected = DataFrame(columns=["Head1", "Head2", "Head3"])
-    tm.assert_frame_equal(result, expected)
-
-
-@xfail_pyarrow
-def test_first_row_bom_unquoted(all_parsers):
-    # see gh-36343
-    parser = all_parsers
-    data = """\ufeffHead1\tHead2\tHead3"""
-
-    result = parser.read_csv(StringIO(data), delimiter="\t")
-    expected = DataFrame(columns=["Head1", "Head2", "Head3"])
-    tm.assert_frame_equal(result, expected)
-
-
-@xfail_pyarrow
-@pytest.mark.parametrize("nrows", range(1, 6))
-def test_blank_lines_between_header_and_data_rows(all_parsers, nrows):
-    # GH 28071
-    ref = DataFrame(
-        [[np.nan, np.nan], [np.nan, np.nan], [1, 2], [np.nan, np.nan], [3, 4]],
-        columns=list("ab"),
-    )
-    csv = "\nheader\n\na,b\n\n\n1,2\n\n3,4"
-    parser = all_parsers
-    df = parser.read_csv(StringIO(csv), header=3, nrows=nrows, skip_blank_lines=False)
-    tm.assert_frame_equal(df, ref[:nrows])
-
-
-@xfail_pyarrow
-def test_no_header_two_extra_columns(all_parsers):
-    # GH 26218
-    column_names = ["one", "two", "three"]
-    ref = DataFrame([["foo", "bar", "baz"]], columns=column_names)
-    stream = StringIO("foo,bar,baz,bam,blah")
-    parser = all_parsers
-    df = parser.read_csv_check_warnings(
-        ParserWarning,
-        "Length of header or names does not match length of data. "
-        "This leads to a loss of data with index_col=False.",
-        stream,
-        header=None,
-        names=column_names,
-        index_col=False,
-    )
-    tm.assert_frame_equal(df, ref)
-
-
-def test_read_csv_names_not_accepting_sets(all_parsers):
-    # GH 34946
-    data = """\
-    1,2,3
-    4,5,6\n"""
-    parser = all_parsers
-    with pytest.raises(ValueError, match="Names should be an ordered collection."):
-        parser.read_csv(StringIO(data), names=set("QAZ"))
-
-
-@xfail_pyarrow
-def test_read_table_delim_whitespace_default_sep(all_parsers):
-    # GH: 35958
-    f = StringIO("a b c\n1 -2 -3\n4 5 6")
-    parser = all_parsers
-    result = parser.read_table(f, delim_whitespace=True)
-    expected = DataFrame({"a": [1, 4], "b": [-2, 5], "c": [-3, 6]})
-    tm.assert_frame_equal(result, expected)
-
-
-@pytest.mark.parametrize("delimiter", [",", "\t"])
-def test_read_csv_delim_whitespace_non_default_sep(all_parsers, delimiter):
-    # GH: 35958
-    f = StringIO("a b c\n1 -2 -3\n4 5 6")
-    parser = all_parsers
-    msg = (
-        "Specified a delimiter with both sep and "
-        "delim_whitespace=True; you can only specify one."
-    )
-    with pytest.raises(ValueError, match=msg):
-        parser.read_csv(f, delim_whitespace=True, sep=delimiter)
-
-    with pytest.raises(ValueError, match=msg):
-        parser.read_csv(f, delim_whitespace=True, delimiter=delimiter)
-
-
-def test_read_csv_delimiter_and_sep_no_default(all_parsers):
-    # GH#39823
-    f = StringIO("a,b\n1,2")
-    parser = all_parsers
-    msg = "Specified a sep and a delimiter; you can only specify one."
-    with pytest.raises(ValueError, match=msg):
-        parser.read_csv(f, sep=" ", delimiter=".")
-
-
-@pytest.mark.parametrize("kwargs", [{"delimiter": "\n"}, {"sep": "\n"}])
-def test_read_csv_line_break_as_separator(kwargs, all_parsers):
-    # GH#43528
-    parser = all_parsers
-    data = """a,b,c
-1,2,3
-    """
-    msg = (
-        r"Specified \\n as separator or delimiter. This forces the python engine "
-        r"which does not accept a line terminator. Hence it is not allowed to use "
-        r"the line terminator as separator."
-    )
-    with pytest.raises(ValueError, match=msg):
-        parser.read_csv(StringIO(data), **kwargs)
-
-
-@pytest.mark.parametrize("delimiter", [",", "\t"])
-def test_read_table_delim_whitespace_non_default_sep(all_parsers, delimiter):
-    # GH: 35958
-    f = StringIO("a b c\n1 -2 -3\n4 5 6")
-    parser = all_parsers
-    msg = (
-        "Specified a delimiter with both sep and "
-        "delim_whitespace=True; you can only specify one."
- ) - with pytest.raises(ValueError, match=msg): - parser.read_table(f, delim_whitespace=True, sep=delimiter) - - with pytest.raises(ValueError, match=msg): - parser.read_table(f, delim_whitespace=True, delimiter=delimiter) - - -@xfail_pyarrow -def test_dict_keys_as_names(all_parsers): - # GH: 36928 - data = "1,2" - - keys = {"a": int, "b": int}.keys() - parser = all_parsers - - result = parser.read_csv(StringIO(data), names=keys) - expected = DataFrame({"a": [1], "b": [2]}) - tm.assert_frame_equal(result, expected) - - -@xfail_pyarrow -def test_encoding_surrogatepass(all_parsers): - # GH39017 - parser = all_parsers - content = b"\xed\xbd\xbf" - decoded = content.decode("utf-8", errors="surrogatepass") - expected = DataFrame({decoded: [decoded]}, index=[decoded * 2]) - expected.index.name = decoded * 2 - - with tm.ensure_clean() as path: - Path(path).write_bytes( - content * 2 + b"," + content + b"\n" + content * 2 + b"," + content - ) - df = parser.read_csv(path, encoding_errors="surrogatepass", index_col=0) - tm.assert_frame_equal(df, expected) - with pytest.raises(UnicodeDecodeError, match="'utf-8' codec can't decode byte"): - parser.read_csv(path) - - -def test_malformed_second_line(all_parsers): - # see GH14782 - parser = all_parsers - data = "\na\nb\n" - result = parser.read_csv(StringIO(data), skip_blank_lines=False, header=1) - expected = DataFrame({"a": ["b"]}) - tm.assert_frame_equal(result, expected) - - -@xfail_pyarrow -def test_short_single_line(all_parsers): - # GH 47566 - parser = all_parsers - columns = ["a", "b", "c"] - data = "1,2" - result = parser.read_csv(StringIO(data), header=None, names=columns) - expected = DataFrame({"a": [1], "b": [2], "c": [np.nan]}) - tm.assert_frame_equal(result, expected) - - -@xfail_pyarrow -def test_short_multi_line(all_parsers): - # GH 47566 - parser = all_parsers - columns = ["a", "b", "c"] - data = "1,2\n1,2" - result = parser.read_csv(StringIO(data), header=None, names=columns) - expected = DataFrame({"a": [1, 1], "b": [2, 2], "c": [np.nan, np.nan]}) - tm.assert_frame_equal(result, expected) - - -def test_read_seek(all_parsers): - # GH48646 - parser = all_parsers - prefix = "### DATA\n" - content = "nkey,value\ntables,rectangular\n" - with tm.ensure_clean() as path: - Path(path).write_text(prefix + content) - with open(path, encoding="utf-8") as file: - file.readline() - actual = parser.read_csv(file) - expected = parser.read_csv(StringIO(content)) - tm.assert_frame_equal(actual, expected) diff --git a/pandas/tests/io/parser/common/test_data_list.py b/pandas/tests/io/parser/common/test_data_list.py deleted file mode 100644 index 8d484bba1cb9d..0000000000000 --- a/pandas/tests/io/parser/common/test_data_list.py +++ /dev/null @@ -1,87 +0,0 @@ -""" -Tests that work on both the Python and C engines but do not have a -specific classification into the other test modules. 
-""" -import csv -from io import StringIO - -import pytest - -from pandas import DataFrame -import pandas._testing as tm - -from pandas.io.parsers import TextParser - -xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") - - -@xfail_pyarrow -def test_read_data_list(all_parsers): - parser = all_parsers - kwargs = {"index_col": 0} - data = "A,B,C\nfoo,1,2,3\nbar,4,5,6" - - data_list = [["A", "B", "C"], ["foo", "1", "2", "3"], ["bar", "4", "5", "6"]] - expected = parser.read_csv(StringIO(data), **kwargs) - - with TextParser(data_list, chunksize=2, **kwargs) as parser: - result = parser.read() - - tm.assert_frame_equal(result, expected) - - -def test_reader_list(all_parsers): - data = """index,A,B,C,D -foo,2,3,4,5 -bar,7,8,9,10 -baz,12,13,14,15 -qux,12,13,14,15 -foo2,12,13,14,15 -bar2,12,13,14,15 -""" - parser = all_parsers - kwargs = {"index_col": 0} - - lines = list(csv.reader(StringIO(data))) - with TextParser(lines, chunksize=2, **kwargs) as reader: - chunks = list(reader) - - expected = parser.read_csv(StringIO(data), **kwargs) - - tm.assert_frame_equal(chunks[0], expected[:2]) - tm.assert_frame_equal(chunks[1], expected[2:4]) - tm.assert_frame_equal(chunks[2], expected[4:]) - - -def test_reader_list_skiprows(all_parsers): - data = """index,A,B,C,D -foo,2,3,4,5 -bar,7,8,9,10 -baz,12,13,14,15 -qux,12,13,14,15 -foo2,12,13,14,15 -bar2,12,13,14,15 -""" - parser = all_parsers - kwargs = {"index_col": 0} - - lines = list(csv.reader(StringIO(data))) - with TextParser(lines, chunksize=2, skiprows=[1], **kwargs) as reader: - chunks = list(reader) - - expected = parser.read_csv(StringIO(data), **kwargs) - - tm.assert_frame_equal(chunks[0], expected[1:3]) - - -def test_read_csv_parse_simple_list(all_parsers): - parser = all_parsers - data = """foo -bar baz -qux foo -foo -bar""" - - result = parser.read_csv(StringIO(data), header=None) - expected = DataFrame(["foo", "bar baz", "qux foo", "foo", "bar"]) - tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/common/test_decimal.py b/pandas/tests/io/parser/common/test_decimal.py deleted file mode 100644 index 72d4eb2c69845..0000000000000 --- a/pandas/tests/io/parser/common/test_decimal.py +++ /dev/null @@ -1,63 +0,0 @@ -""" -Tests that work on both the Python and C engines but do not have a -specific classification into the other test modules. -""" -from io import StringIO - -import pytest - -from pandas import DataFrame -import pandas._testing as tm - -xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") - - -@xfail_pyarrow -@pytest.mark.parametrize( - "data,thousands,decimal", - [ - ( - """A|B|C -1|2,334.01|5 -10|13|10. 
-""", - ",", - ".", - ), - ( - """A|B|C -1|2.334,01|5 -10|13|10, -""", - ".", - ",", - ), - ], -) -def test_1000_sep_with_decimal(all_parsers, data, thousands, decimal): - parser = all_parsers - expected = DataFrame({"A": [1, 10], "B": [2334.01, 13], "C": [5, 10.0]}) - - result = parser.read_csv( - StringIO(data), sep="|", thousands=thousands, decimal=decimal - ) - tm.assert_frame_equal(result, expected) - - -def test_euro_decimal_format(all_parsers): - parser = all_parsers - data = """Id;Number1;Number2;Text1;Text2;Number3 -1;1521,1541;187101,9543;ABC;poi;4,738797819 -2;121,12;14897,76;DEF;uyt;0,377320872 -3;878,158;108013,434;GHI;rez;2,735694704""" - - result = parser.read_csv(StringIO(data), sep=";", decimal=",") - expected = DataFrame( - [ - [1, 1521.1541, 187101.9543, "ABC", "poi", 4.738797819], - [2, 121.12, 14897.76, "DEF", "uyt", 0.377320872], - [3, 878.158, 108013.434, "GHI", "rez", 2.735694704], - ], - columns=["Id", "Number1", "Number2", "Text1", "Text2", "Number3"], - ) - tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/common/test_file_buffer_url.py b/pandas/tests/io/parser/common/test_file_buffer_url.py deleted file mode 100644 index ba196a532adf6..0000000000000 --- a/pandas/tests/io/parser/common/test_file_buffer_url.py +++ /dev/null @@ -1,423 +0,0 @@ -""" -Tests that work on both the Python and C engines but do not have a -specific classification into the other test modules. -""" -from io import ( - BytesIO, - StringIO, -) -import os -import platform -from urllib.error import URLError -import uuid - -import pytest - -from pandas.errors import ( - EmptyDataError, - ParserError, -) -import pandas.util._test_decorators as td - -from pandas import DataFrame -import pandas._testing as tm - -# TODO(1.4) Please xfail individual tests at release time -# instead of skip -pytestmark = pytest.mark.usefixtures("pyarrow_skip") - - -@pytest.mark.network -@tm.network( - url=( - "https://raw.githubusercontent.com/pandas-dev/pandas/main/" - "pandas/tests/io/parser/data/salaries.csv" - ), - check_before_test=True, -) -def test_url(all_parsers, csv_dir_path): - parser = all_parsers - kwargs = {"sep": "\t"} - - url = ( - "https://raw.githubusercontent.com/pandas-dev/pandas/main/" - "pandas/tests/io/parser/data/salaries.csv" - ) - url_result = parser.read_csv(url, **kwargs) - - local_path = os.path.join(csv_dir_path, "salaries.csv") - local_result = parser.read_csv(local_path, **kwargs) - tm.assert_frame_equal(url_result, local_result) - - -@pytest.mark.slow -def test_local_file(all_parsers, csv_dir_path): - parser = all_parsers - kwargs = {"sep": "\t"} - - local_path = os.path.join(csv_dir_path, "salaries.csv") - local_result = parser.read_csv(local_path, **kwargs) - url = "file://localhost/" + local_path - - try: - url_result = parser.read_csv(url, **kwargs) - tm.assert_frame_equal(url_result, local_result) - except URLError: - # Fails on some systems. 
- pytest.skip("Failing on: " + " ".join(platform.uname())) - - -def test_path_path_lib(all_parsers): - parser = all_parsers - df = tm.makeDataFrame() - result = tm.round_trip_pathlib(df.to_csv, lambda p: parser.read_csv(p, index_col=0)) - tm.assert_frame_equal(df, result) - - -def test_path_local_path(all_parsers): - parser = all_parsers - df = tm.makeDataFrame() - result = tm.round_trip_localpath( - df.to_csv, lambda p: parser.read_csv(p, index_col=0) - ) - tm.assert_frame_equal(df, result) - - -def test_nonexistent_path(all_parsers): - # gh-2428: pls no segfault - # gh-14086: raise more helpful FileNotFoundError - # GH#29233 "File foo" instead of "File b'foo'" - parser = all_parsers - path = f"{uuid.uuid4()}.csv" - - msg = r"\[Errno 2\]" - with pytest.raises(FileNotFoundError, match=msg) as e: - parser.read_csv(path) - assert path == e.value.filename - - -@td.skip_if_windows # os.chmod does not work in windows -def test_no_permission(all_parsers): - # GH 23784 - parser = all_parsers - - msg = r"\[Errno 13\]" - with tm.ensure_clean() as path: - os.chmod(path, 0) # make file unreadable - - # verify that this process cannot open the file (not running as sudo) - try: - with open(path, encoding="utf-8"): - pass - pytest.skip("Running as sudo.") - except PermissionError: - pass - - with pytest.raises(PermissionError, match=msg) as e: - parser.read_csv(path) - assert path == e.value.filename - - -@pytest.mark.parametrize( - "data,kwargs,expected,msg", - [ - # gh-10728: WHITESPACE_LINE - ( - "a,b,c\n4,5,6\n ", - {}, - DataFrame([[4, 5, 6]], columns=["a", "b", "c"]), - None, - ), - # gh-10548: EAT_LINE_COMMENT - ( - "a,b,c\n4,5,6\n#comment", - {"comment": "#"}, - DataFrame([[4, 5, 6]], columns=["a", "b", "c"]), - None, - ), - # EAT_CRNL_NOP - ( - "a,b,c\n4,5,6\n\r", - {}, - DataFrame([[4, 5, 6]], columns=["a", "b", "c"]), - None, - ), - # EAT_COMMENT - ( - "a,b,c\n4,5,6#comment", - {"comment": "#"}, - DataFrame([[4, 5, 6]], columns=["a", "b", "c"]), - None, - ), - # SKIP_LINE - ( - "a,b,c\n4,5,6\nskipme", - {"skiprows": [2]}, - DataFrame([[4, 5, 6]], columns=["a", "b", "c"]), - None, - ), - # EAT_LINE_COMMENT - ( - "a,b,c\n4,5,6\n#comment", - {"comment": "#", "skip_blank_lines": False}, - DataFrame([[4, 5, 6]], columns=["a", "b", "c"]), - None, - ), - # IN_FIELD - ( - "a,b,c\n4,5,6\n ", - {"skip_blank_lines": False}, - DataFrame([["4", 5, 6], [" ", None, None]], columns=["a", "b", "c"]), - None, - ), - # EAT_CRNL - ( - "a,b,c\n4,5,6\n\r", - {"skip_blank_lines": False}, - DataFrame([[4, 5, 6], [None, None, None]], columns=["a", "b", "c"]), - None, - ), - # ESCAPED_CHAR - ( - "a,b,c\n4,5,6\n\\", - {"escapechar": "\\"}, - None, - "(EOF following escape character)|(unexpected end of data)", - ), - # ESCAPE_IN_QUOTED_FIELD - ( - 'a,b,c\n4,5,6\n"\\', - {"escapechar": "\\"}, - None, - "(EOF inside string starting at row 2)|(unexpected end of data)", - ), - # IN_QUOTED_FIELD - ( - 'a,b,c\n4,5,6\n"', - {"escapechar": "\\"}, - None, - "(EOF inside string starting at row 2)|(unexpected end of data)", - ), - ], - ids=[ - "whitespace-line", - "eat-line-comment", - "eat-crnl-nop", - "eat-comment", - "skip-line", - "eat-line-comment", - "in-field", - "eat-crnl", - "escaped-char", - "escape-in-quoted-field", - "in-quoted-field", - ], -) -def test_eof_states(all_parsers, data, kwargs, expected, msg): - # see gh-10728, gh-10548 - parser = all_parsers - - if expected is None: - with pytest.raises(ParserError, match=msg): - parser.read_csv(StringIO(data), **kwargs) - else: - result = parser.read_csv(StringIO(data), 
**kwargs) - tm.assert_frame_equal(result, expected) - - -def test_temporary_file(all_parsers): - # see gh-13398 - parser = all_parsers - data = "0 0" - - with tm.ensure_clean(mode="w+", return_filelike=True) as new_file: - new_file.write(data) - new_file.flush() - new_file.seek(0) - - result = parser.read_csv(new_file, sep=r"\s+", header=None) - - expected = DataFrame([[0, 0]]) - tm.assert_frame_equal(result, expected) - - -def test_internal_eof_byte(all_parsers): - # see gh-5500 - parser = all_parsers - data = "a,b\n1\x1a,2" - - expected = DataFrame([["1\x1a", 2]], columns=["a", "b"]) - result = parser.read_csv(StringIO(data)) - tm.assert_frame_equal(result, expected) - - -def test_internal_eof_byte_to_file(all_parsers): - # see gh-16559 - parser = all_parsers - data = b'c1,c2\r\n"test \x1a test", test\r\n' - expected = DataFrame([["test \x1a test", " test"]], columns=["c1", "c2"]) - path = f"__{uuid.uuid4()}__.csv" - - with tm.ensure_clean(path) as path: - with open(path, "wb") as f: - f.write(data) - - result = parser.read_csv(path) - tm.assert_frame_equal(result, expected) - - -def test_file_handle_string_io(all_parsers): - # gh-14418 - # - # Don't close user provided file handles. - parser = all_parsers - data = "a,b\n1,2" - - fh = StringIO(data) - parser.read_csv(fh) - assert not fh.closed - - -def test_file_handles_with_open(all_parsers, csv1): - # gh-14418 - # - # Don't close user provided file handles. - parser = all_parsers - - for mode in ["r", "rb"]: - with open(csv1, mode, encoding="utf-8" if mode == "r" else None) as f: - parser.read_csv(f) - assert not f.closed - - -def test_invalid_file_buffer_class(all_parsers): - # see gh-15337 - class InvalidBuffer: - pass - - parser = all_parsers - msg = "Invalid file path or buffer object type" - - with pytest.raises(ValueError, match=msg): - parser.read_csv(InvalidBuffer()) - - -def test_invalid_file_buffer_mock(all_parsers): - # see gh-15337 - parser = all_parsers - msg = "Invalid file path or buffer object type" - - class Foo: - pass - - with pytest.raises(ValueError, match=msg): - parser.read_csv(Foo()) - - -def test_valid_file_buffer_seems_invalid(all_parsers): - # gh-16135: we want to ensure that "tell" and "seek" - # aren't actually being used when we call `read_csv` - # - # Thus, while the object may look "invalid" (these - # methods are attributes of the `StringIO` class), - # it is still a valid file-object for our purposes. - class NoSeekTellBuffer(StringIO): - def tell(self): - raise AttributeError("No tell method") - - def seek(self, pos, whence=0): - raise AttributeError("No seek method") - - data = "a\n1" - parser = all_parsers - expected = DataFrame({"a": [1]}) - - result = parser.read_csv(NoSeekTellBuffer(data)) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("io_class", [StringIO, BytesIO]) -@pytest.mark.parametrize("encoding", [None, "utf-8"]) -def test_read_csv_file_handle(all_parsers, io_class, encoding): - """ - Test whether read_csv does not close user-provided file handles. - - GH 36980 - """ - parser = all_parsers - expected = DataFrame({"a": [1], "b": [2]}) - - content = "a,b\n1,2" - handle = io_class(content.encode("utf-8") if io_class == BytesIO else content) - - tm.assert_frame_equal(parser.read_csv(handle, encoding=encoding), expected) - assert not handle.closed - - -def test_memory_map_compression(all_parsers, compression): - """ - Support memory map for compressed files. 
-
-    GH 37621
-    """
-    parser = all_parsers
-    expected = DataFrame({"a": [1], "b": [2]})
-
-    with tm.ensure_clean() as path:
-        expected.to_csv(path, index=False, compression=compression)
-
-        tm.assert_frame_equal(
-            parser.read_csv(path, memory_map=True, compression=compression),
-            expected,
-        )
-
-
-def test_context_manager(all_parsers, datapath):
-    # make sure that opened files are closed
-    parser = all_parsers
-
-    path = datapath("io", "data", "csv", "iris.csv")
-
-    reader = parser.read_csv(path, chunksize=1)
-    assert not reader.handles.handle.closed
-    try:
-        with reader:
-            next(reader)
-            assert False
-    except AssertionError:
-        assert reader.handles.handle.closed
-
-
-def test_context_manageri_user_provided(all_parsers, datapath):
-    # make sure that user-provided handles are not closed
-    parser = all_parsers
-
-    with open(datapath("io", "data", "csv", "iris.csv"), encoding="utf-8") as path:
-        reader = parser.read_csv(path, chunksize=1)
-        assert not reader.handles.handle.closed
-        try:
-            with reader:
-                next(reader)
-                assert False
-        except AssertionError:
-            assert not reader.handles.handle.closed
-
-
-def test_file_descriptor_leak(all_parsers, using_copy_on_write):
-    # GH 31488
-    parser = all_parsers
-    with tm.ensure_clean() as path:
-        with pytest.raises(EmptyDataError, match="No columns to parse from file"):
-            parser.read_csv(path)
-
-
-def test_memory_map(all_parsers, csv_dir_path):
-    mmap_file = os.path.join(csv_dir_path, "test_mmap.csv")
-    parser = all_parsers
-
-    expected = DataFrame(
-        {"a": [1, 2, 3], "b": ["one", "two", "three"], "c": ["I", "II", "III"]}
-    )
-
-    result = parser.read_csv(mmap_file, memory_map=True)
-    tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/io/parser/common/test_float.py b/pandas/tests/io/parser/common/test_float.py
deleted file mode 100644
index 2ca98de914f9e..0000000000000
--- a/pandas/tests/io/parser/common/test_float.py
+++ /dev/null
@@ -1,65 +0,0 @@
-"""
-Tests that work on both the Python and C engines but do not have a
-specific classification into the other test modules.
-""" -from io import StringIO - -import numpy as np -import pytest - -from pandas.compat import is_platform_linux - -from pandas import DataFrame -import pandas._testing as tm - -pytestmark = pytest.mark.usefixtures("pyarrow_skip") - - -def test_float_parser(all_parsers): - # see gh-9565 - parser = all_parsers - data = "45e-1,4.5,45.,inf,-inf" - result = parser.read_csv(StringIO(data), header=None) - - expected = DataFrame([[float(s) for s in data.split(",")]]) - tm.assert_frame_equal(result, expected) - - -def test_scientific_no_exponent(all_parsers_all_precisions): - # see gh-12215 - df = DataFrame.from_dict({"w": ["2e"], "x": ["3E"], "y": ["42e"], "z": ["632E"]}) - data = df.to_csv(index=False) - parser, precision = all_parsers_all_precisions - - df_roundtrip = parser.read_csv(StringIO(data), float_precision=precision) - tm.assert_frame_equal(df_roundtrip, df) - - -@pytest.mark.parametrize("neg_exp", [-617, -100000, -99999999999999999]) -def test_very_negative_exponent(all_parsers_all_precisions, neg_exp): - # GH#38753 - parser, precision = all_parsers_all_precisions - - data = f"data\n10E{neg_exp}" - result = parser.read_csv(StringIO(data), float_precision=precision) - expected = DataFrame({"data": [0.0]}) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("exp", [999999999999999999, -999999999999999999]) -def test_too_many_exponent_digits(all_parsers_all_precisions, exp, request): - # GH#38753 - parser, precision = all_parsers_all_precisions - data = f"data\n10E{exp}" - result = parser.read_csv(StringIO(data), float_precision=precision) - if precision == "round_trip": - if exp == 999999999999999999 and is_platform_linux(): - mark = pytest.mark.xfail(reason="GH38794, on Linux gives object result") - request.node.add_marker(mark) - - value = np.inf if exp > 0 else 0.0 - expected = DataFrame({"data": [value]}) - else: - expected = DataFrame({"data": [f"10E{exp}"]}) - - tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/common/test_index.py b/pandas/tests/io/parser/common/test_index.py deleted file mode 100644 index 69afb9fe56472..0000000000000 --- a/pandas/tests/io/parser/common/test_index.py +++ /dev/null @@ -1,299 +0,0 @@ -""" -Tests that work on both the Python and C engines but do not have a -specific classification into the other test modules. 
-""" -from datetime import datetime -from io import StringIO -import os - -import pytest - -from pandas import ( - DataFrame, - Index, - MultiIndex, -) -import pandas._testing as tm - -xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") - -# GH#43650: Some expected failures with the pyarrow engine can occasionally -# cause a deadlock instead, so we skip these instead of xfailing -skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") - - -@pytest.mark.parametrize( - "data,kwargs,expected", - [ - ( - """foo,2,3,4,5 -bar,7,8,9,10 -baz,12,13,14,15 -qux,12,13,14,15 -foo2,12,13,14,15 -bar2,12,13,14,15 -""", - {"index_col": 0, "names": ["index", "A", "B", "C", "D"]}, - DataFrame( - [ - [2, 3, 4, 5], - [7, 8, 9, 10], - [12, 13, 14, 15], - [12, 13, 14, 15], - [12, 13, 14, 15], - [12, 13, 14, 15], - ], - index=Index(["foo", "bar", "baz", "qux", "foo2", "bar2"], name="index"), - columns=["A", "B", "C", "D"], - ), - ), - ( - """foo,one,2,3,4,5 -foo,two,7,8,9,10 -foo,three,12,13,14,15 -bar,one,12,13,14,15 -bar,two,12,13,14,15 -""", - {"index_col": [0, 1], "names": ["index1", "index2", "A", "B", "C", "D"]}, - DataFrame( - [ - [2, 3, 4, 5], - [7, 8, 9, 10], - [12, 13, 14, 15], - [12, 13, 14, 15], - [12, 13, 14, 15], - ], - index=MultiIndex.from_tuples( - [ - ("foo", "one"), - ("foo", "two"), - ("foo", "three"), - ("bar", "one"), - ("bar", "two"), - ], - names=["index1", "index2"], - ), - columns=["A", "B", "C", "D"], - ), - ), - ], -) -def test_pass_names_with_index(all_parsers, data, kwargs, expected): - parser = all_parsers - result = parser.read_csv(StringIO(data), **kwargs) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("index_col", [[0, 1], [1, 0]]) -def test_multi_index_no_level_names(all_parsers, index_col): - data = """index1,index2,A,B,C,D -foo,one,2,3,4,5 -foo,two,7,8,9,10 -foo,three,12,13,14,15 -bar,one,12,13,14,15 -bar,two,12,13,14,15 -""" - headless_data = "\n".join(data.split("\n")[1:]) - - names = ["A", "B", "C", "D"] - parser = all_parsers - - result = parser.read_csv( - StringIO(headless_data), index_col=index_col, header=None, names=names - ) - expected = parser.read_csv(StringIO(data), index_col=index_col) - - # No index names in headless data. 
-    expected.index.names = [None] * 2
-    tm.assert_frame_equal(result, expected)
-
-
-@xfail_pyarrow
-def test_multi_index_no_level_names_implicit(all_parsers):
-    parser = all_parsers
-    data = """A,B,C,D
-foo,one,2,3,4,5
-foo,two,7,8,9,10
-foo,three,12,13,14,15
-bar,one,12,13,14,15
-bar,two,12,13,14,15
-"""
-
-    result = parser.read_csv(StringIO(data))
-    expected = DataFrame(
-        [
-            [2, 3, 4, 5],
-            [7, 8, 9, 10],
-            [12, 13, 14, 15],
-            [12, 13, 14, 15],
-            [12, 13, 14, 15],
-        ],
-        columns=["A", "B", "C", "D"],
-        index=MultiIndex.from_tuples(
-            [
-                ("foo", "one"),
-                ("foo", "two"),
-                ("foo", "three"),
-                ("bar", "one"),
-                ("bar", "two"),
-            ]
-        ),
-    )
-    tm.assert_frame_equal(result, expected)
-
-
-@xfail_pyarrow
-@pytest.mark.parametrize(
-    "data,expected,header",
-    [
-        ("a,b", DataFrame(columns=["a", "b"]), [0]),
-        (
-            "a,b\nc,d",
-            DataFrame(columns=MultiIndex.from_tuples([("a", "c"), ("b", "d")])),
-            [0, 1],
-        ),
-    ],
-)
-@pytest.mark.parametrize("round_trip", [True, False])
-def test_multi_index_blank_df(all_parsers, data, expected, header, round_trip):
-    # see gh-14545
-    parser = all_parsers
-    data = expected.to_csv(index=False) if round_trip else data
-
-    result = parser.read_csv(StringIO(data), header=header)
-    tm.assert_frame_equal(result, expected)
-
-
-@xfail_pyarrow
-def test_no_unnamed_index(all_parsers):
-    parser = all_parsers
-    data = """ id c0 c1 c2
-0 1 0 a b
-1 2 0 c d
-2 2 2 e f
-"""
-    result = parser.read_csv(StringIO(data), sep=" ")
-    expected = DataFrame(
-        [[0, 1, 0, "a", "b"], [1, 2, 0, "c", "d"], [2, 2, 2, "e", "f"]],
-        columns=["Unnamed: 0", "id", "c0", "c1", "c2"],
-    )
-    tm.assert_frame_equal(result, expected)
-
-
-def test_read_duplicate_index_explicit(all_parsers):
-    data = """index,A,B,C,D
-foo,2,3,4,5
-bar,7,8,9,10
-baz,12,13,14,15
-qux,12,13,14,15
-foo,12,13,14,15
-bar,12,13,14,15
-"""
-    parser = all_parsers
-    result = parser.read_csv(StringIO(data), index_col=0)
-
-    expected = DataFrame(
-        [
-            [2, 3, 4, 5],
-            [7, 8, 9, 10],
-            [12, 13, 14, 15],
-            [12, 13, 14, 15],
-            [12, 13, 14, 15],
-            [12, 13, 14, 15],
-        ],
-        columns=["A", "B", "C", "D"],
-        index=Index(["foo", "bar", "baz", "qux", "foo", "bar"], name="index"),
-    )
-    tm.assert_frame_equal(result, expected)
-
-
-@xfail_pyarrow
-def test_read_duplicate_index_implicit(all_parsers):
-    data = """A,B,C,D
-foo,2,3,4,5
-bar,7,8,9,10
-baz,12,13,14,15
-qux,12,13,14,15
-foo,12,13,14,15
-bar,12,13,14,15
-"""
-    parser = all_parsers
-    result = parser.read_csv(StringIO(data))
-
-    expected = DataFrame(
-        [
-            [2, 3, 4, 5],
-            [7, 8, 9, 10],
-            [12, 13, 14, 15],
-            [12, 13, 14, 15],
-            [12, 13, 14, 15],
-            [12, 13, 14, 15],
-        ],
-        columns=["A", "B", "C", "D"],
-        index=Index(["foo", "bar", "baz", "qux", "foo", "bar"]),
-    )
-    tm.assert_frame_equal(result, expected)
-
-
-@xfail_pyarrow
-def test_read_csv_no_index_name(all_parsers, csv_dir_path):
-    parser = all_parsers
-    csv2 = os.path.join(csv_dir_path, "test2.csv")
-    result = parser.read_csv(csv2, index_col=0, parse_dates=True)
-
-    expected = DataFrame(
-        [
-            [0.980269, 3.685731, -0.364216805298, -1.159738, "foo"],
-            [1.047916, -0.041232, -0.16181208307, 0.212549, "bar"],
-            [0.498581, 0.731168, -0.537677223318, 1.346270, "baz"],
-            [1.120202, 1.567621, 0.00364077397681, 0.675253, "qux"],
-            [-0.487094, 0.571455, -1.6116394093, 0.103469, "foo2"],
-        ],
-        columns=["A", "B", "C", "D", "E"],
-        index=Index(
-            [
-                datetime(2000, 1, 3),
-                datetime(2000, 1, 4),
-                datetime(2000, 1, 5),
-                datetime(2000, 1, 6),
-                datetime(2000, 1, 7),
-            ]
-        ),
-    )
-    tm.assert_frame_equal(result, expected)
-
-
-@xfail_pyarrow
-def test_empty_with_index(all_parsers):
-    # see gh-10184
-    data = "x,y"
-    parser = all_parsers
-    result = parser.read_csv(StringIO(data), index_col=0)
-
-    expected = DataFrame(columns=["y"], index=Index([], name="x"))
-    tm.assert_frame_equal(result, expected)
-
-
-@skip_pyarrow
-def test_empty_with_multi_index(all_parsers):
-    # see gh-10467
-    data = "x,y,z"
-    parser = all_parsers
-    result = parser.read_csv(StringIO(data), index_col=["x", "y"])
-
-    expected = DataFrame(
-        columns=["z"], index=MultiIndex.from_arrays([[]] * 2, names=["x", "y"])
-    )
-    tm.assert_frame_equal(result, expected)
-
-
-@skip_pyarrow
-def test_empty_with_reversed_multi_index(all_parsers):
-    data = "x,y,z"
-    parser = all_parsers
-    result = parser.read_csv(StringIO(data), index_col=[1, 0])
-
-    expected = DataFrame(
-        columns=["z"], index=MultiIndex.from_arrays([[]] * 2, names=["y", "x"])
-    )
-    tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/io/parser/common/test_inf.py b/pandas/tests/io/parser/common/test_inf.py
deleted file mode 100644
index d43fb2f5187e1..0000000000000
--- a/pandas/tests/io/parser/common/test_inf.py
+++ /dev/null
@@ -1,68 +0,0 @@
-"""
-Tests that work on both the Python and C engines but do not have a
-specific classification into the other test modules.
-"""
-from io import StringIO
-
-import numpy as np
-import pytest
-
-from pandas import (
-    DataFrame,
-    option_context,
-)
-import pandas._testing as tm
-
-xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
-
-
-@xfail_pyarrow
-@pytest.mark.parametrize("na_filter", [True, False])
-def test_inf_parsing(all_parsers, na_filter):
-    parser = all_parsers
-    data = """\
-,A
-a,inf
-b,-inf
-c,+Inf
-d,-Inf
-e,INF
-f,-INF
-g,+INf
-h,-INf
-i,inF
-j,-inF"""
-    expected = DataFrame(
-        {"A": [float("inf"), float("-inf")] * 5},
-        index=["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"],
-    )
-    result = parser.read_csv(StringIO(data), index_col=0, na_filter=na_filter)
-    tm.assert_frame_equal(result, expected)
-
-
-@xfail_pyarrow
-@pytest.mark.parametrize("na_filter", [True, False])
-def test_infinity_parsing(all_parsers, na_filter):
-    parser = all_parsers
-    data = """\
-,A
-a,Infinity
-b,-Infinity
-c,+Infinity
-"""
-    expected = DataFrame(
-        {"A": [float("infinity"), float("-infinity"), float("+infinity")]},
-        index=["a", "b", "c"],
-    )
-    result = parser.read_csv(StringIO(data), index_col=0, na_filter=na_filter)
-    tm.assert_frame_equal(result, expected)
-
-
-def test_read_csv_with_use_inf_as_na(all_parsers):
-    # https://github.com/pandas-dev/pandas/issues/35493
-    parser = all_parsers
-    data = "1.0\nNaN\n3.0"
-    with option_context("use_inf_as_na", True):
-        result = parser.read_csv(StringIO(data), header=None)
-    expected = DataFrame([1.0, np.nan, 3.0])
-    tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/io/parser/common/test_ints.py b/pandas/tests/io/parser/common/test_ints.py
deleted file mode 100644
index e3159ef3e6a42..0000000000000
--- a/pandas/tests/io/parser/common/test_ints.py
+++ /dev/null
@@ -1,215 +0,0 @@
-"""
-Tests that work on both the Python and C engines but do not have a
-specific classification into the other test modules.
-""" -from io import StringIO - -import numpy as np -import pytest - -from pandas import ( - DataFrame, - Series, -) -import pandas._testing as tm - -# GH#43650: Some expected failures with the pyarrow engine can occasionally -# cause a deadlock instead, so we skip these instead of xfailing -skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") - - -def test_int_conversion(all_parsers): - data = """A,B -1.0,1 -2.0,2 -3.0,3 -""" - parser = all_parsers - result = parser.read_csv(StringIO(data)) - - expected = DataFrame([[1.0, 1], [2.0, 2], [3.0, 3]], columns=["A", "B"]) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - "data,kwargs,expected", - [ - ( - "A,B\nTrue,1\nFalse,2\nTrue,3", - {}, - DataFrame([[True, 1], [False, 2], [True, 3]], columns=["A", "B"]), - ), - ( - "A,B\nYES,1\nno,2\nyes,3\nNo,3\nYes,3", - {"true_values": ["yes", "Yes", "YES"], "false_values": ["no", "NO", "No"]}, - DataFrame( - [[True, 1], [False, 2], [True, 3], [False, 3], [True, 3]], - columns=["A", "B"], - ), - ), - ( - "A,B\nTRUE,1\nFALSE,2\nTRUE,3", - {}, - DataFrame([[True, 1], [False, 2], [True, 3]], columns=["A", "B"]), - ), - ( - "A,B\nfoo,bar\nbar,foo", - {"true_values": ["foo"], "false_values": ["bar"]}, - DataFrame([[True, False], [False, True]], columns=["A", "B"]), - ), - ], -) -def test_parse_bool(all_parsers, data, kwargs, expected): - parser = all_parsers - result = parser.read_csv(StringIO(data), **kwargs) - tm.assert_frame_equal(result, expected) - - -def test_parse_integers_above_fp_precision(all_parsers): - data = """Numbers -17007000002000191 -17007000002000191 -17007000002000191 -17007000002000191 -17007000002000192 -17007000002000192 -17007000002000192 -17007000002000192 -17007000002000192 -17007000002000194""" - parser = all_parsers - result = parser.read_csv(StringIO(data)) - expected = DataFrame( - { - "Numbers": [ - 17007000002000191, - 17007000002000191, - 17007000002000191, - 17007000002000191, - 17007000002000192, - 17007000002000192, - 17007000002000192, - 17007000002000192, - 17007000002000192, - 17007000002000194, - ] - } - ) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow # Flaky -@pytest.mark.parametrize("sep", [" ", r"\s+"]) -def test_integer_overflow_bug(all_parsers, sep): - # see gh-2601 - data = "65248E10 11\n55555E55 22\n" - parser = all_parsers - - result = parser.read_csv(StringIO(data), header=None, sep=sep) - expected = DataFrame([[6.5248e14, 11], [5.5555e59, 22]]) - tm.assert_frame_equal(result, expected) - - -def test_int64_min_issues(all_parsers): - # see gh-2599 - parser = all_parsers - data = "A,B\n0,0\n0," - result = parser.read_csv(StringIO(data)) - - expected = DataFrame({"A": [0, 0], "B": [0, np.nan]}) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -@pytest.mark.parametrize("conv", [None, np.int64, np.uint64]) -def test_int64_overflow(all_parsers, conv): - data = """ID -00013007854817840016671868 -00013007854817840016749251 -00013007854817840016754630 -00013007854817840016781876 -00013007854817840017028824 -00013007854817840017963235 -00013007854817840018860166""" - parser = all_parsers - - if conv is None: - # 13007854817840016671868 > UINT64_MAX, so this - # will overflow and return object as the dtype. 
- result = parser.read_csv(StringIO(data)) - expected = DataFrame( - [ - "00013007854817840016671868", - "00013007854817840016749251", - "00013007854817840016754630", - "00013007854817840016781876", - "00013007854817840017028824", - "00013007854817840017963235", - "00013007854817840018860166", - ], - columns=["ID"], - ) - tm.assert_frame_equal(result, expected) - else: - # 13007854817840016671868 > UINT64_MAX, so attempts - # to cast to either int64 or uint64 will result in - # an OverflowError being raised. - msg = ( - "(Python int too large to convert to C long)|" - "(long too big to convert)|" - "(int too big to convert)" - ) - - with pytest.raises(OverflowError, match=msg): - parser.read_csv(StringIO(data), converters={"ID": conv}) - - -@skip_pyarrow -@pytest.mark.parametrize( - "val", [np.iinfo(np.uint64).max, np.iinfo(np.int64).max, np.iinfo(np.int64).min] -) -def test_int64_uint64_range(all_parsers, val): - # These numbers fall right inside the int64-uint64 - # range, so they should be parsed as string. - parser = all_parsers - result = parser.read_csv(StringIO(str(val)), header=None) - - expected = DataFrame([val]) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -@pytest.mark.parametrize( - "val", [np.iinfo(np.uint64).max + 1, np.iinfo(np.int64).min - 1] -) -def test_outside_int64_uint64_range(all_parsers, val): - # These numbers fall just outside the int64-uint64 - # range, so they should be parsed as string. - parser = all_parsers - result = parser.read_csv(StringIO(str(val)), header=None) - - expected = DataFrame([str(val)]) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -@pytest.mark.parametrize("exp_data", [[str(-1), str(2**63)], [str(2**63), str(-1)]]) -def test_numeric_range_too_wide(all_parsers, exp_data): - # No numerical dtype can hold both negative and uint64 - # values, so they should be cast as string. - parser = all_parsers - data = "\n".join(exp_data) - expected = DataFrame(exp_data) - - result = parser.read_csv(StringIO(data), header=None) - tm.assert_frame_equal(result, expected) - - -def test_integer_precision(all_parsers): - # Gh 7072 - s = """1,1;0;0;0;1;1;3844;3844;3844;1;1;1;1;1;1;0;0;1;1;0;0,,,4321583677327450765 -5,1;0;0;0;1;1;843;843;843;1;1;1;1;1;1;0;0;1;1;0;0,64.0,;,4321113141090630389""" - parser = all_parsers - result = parser.read_csv(StringIO(s), header=None)[4] - expected = Series([4321583677327450765, 4321113141090630389], name=4) - tm.assert_series_equal(result, expected) diff --git a/pandas/tests/io/parser/common/test_iterator.py b/pandas/tests/io/parser/common/test_iterator.py deleted file mode 100644 index 58e5886aedd6b..0000000000000 --- a/pandas/tests/io/parser/common/test_iterator.py +++ /dev/null @@ -1,108 +0,0 @@ -""" -Tests that work on both the Python and C engines but do not have a -specific classification into the other test modules. 
-""" -from io import StringIO - -import pytest - -from pandas import ( - DataFrame, - concat, -) -import pandas._testing as tm - -pytestmark = pytest.mark.usefixtures("pyarrow_skip") - - -def test_iterator(all_parsers): - # see gh-6607 - data = """index,A,B,C,D -foo,2,3,4,5 -bar,7,8,9,10 -baz,12,13,14,15 -qux,12,13,14,15 -foo2,12,13,14,15 -bar2,12,13,14,15 -""" - parser = all_parsers - kwargs = {"index_col": 0} - - expected = parser.read_csv(StringIO(data), **kwargs) - with parser.read_csv(StringIO(data), iterator=True, **kwargs) as reader: - first_chunk = reader.read(3) - tm.assert_frame_equal(first_chunk, expected[:3]) - - last_chunk = reader.read(5) - tm.assert_frame_equal(last_chunk, expected[3:]) - - -def test_iterator2(all_parsers): - parser = all_parsers - data = """A,B,C -foo,1,2,3 -bar,4,5,6 -baz,7,8,9 -""" - - with parser.read_csv(StringIO(data), iterator=True) as reader: - result = list(reader) - - expected = DataFrame( - [[1, 2, 3], [4, 5, 6], [7, 8, 9]], - index=["foo", "bar", "baz"], - columns=["A", "B", "C"], - ) - tm.assert_frame_equal(result[0], expected) - - -def test_iterator_stop_on_chunksize(all_parsers): - # gh-3967: stopping iteration when chunksize is specified - parser = all_parsers - data = """A,B,C -foo,1,2,3 -bar,4,5,6 -baz,7,8,9 -""" - - with parser.read_csv(StringIO(data), chunksize=1) as reader: - result = list(reader) - - assert len(result) == 3 - expected = DataFrame( - [[1, 2, 3], [4, 5, 6], [7, 8, 9]], - index=["foo", "bar", "baz"], - columns=["A", "B", "C"], - ) - tm.assert_frame_equal(concat(result), expected) - - -@pytest.mark.parametrize( - "kwargs", [{"iterator": True, "chunksize": 1}, {"iterator": True}, {"chunksize": 1}] -) -def test_iterator_skipfooter_errors(all_parsers, kwargs): - msg = "'skipfooter' not supported for iteration" - parser = all_parsers - data = "a\n1\n2" - - with pytest.raises(ValueError, match=msg): - with parser.read_csv(StringIO(data), skipfooter=1, **kwargs) as _: - pass - - -def test_iteration_open_handle(all_parsers): - parser = all_parsers - kwargs = {"header": None} - - with tm.ensure_clean() as path: - with open(path, "w", encoding="utf-8") as f: - f.write("AAA\nBBB\nCCC\nDDD\nEEE\nFFF\nGGG") - - with open(path, encoding="utf-8") as f: - for line in f: - if "CCC" in line: - break - - result = parser.read_csv(f, **kwargs) - expected = DataFrame({0: ["DDD", "EEE", "FFF", "GGG"]}) - tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/common/test_read_errors.py b/pandas/tests/io/parser/common/test_read_errors.py deleted file mode 100644 index 817daad9849c0..0000000000000 --- a/pandas/tests/io/parser/common/test_read_errors.py +++ /dev/null @@ -1,274 +0,0 @@ -""" -Tests that work on both the Python and C engines but do not have a -specific classification into the other test modules. -""" -import codecs -import csv -from io import StringIO -import os -from pathlib import Path -import warnings - -import numpy as np -import pytest - -from pandas.compat import PY311 -from pandas.errors import ( - EmptyDataError, - ParserError, -) - -from pandas import DataFrame -import pandas._testing as tm - -pytestmark = pytest.mark.usefixtures("pyarrow_skip") - - -def test_empty_decimal_marker(all_parsers): - data = """A|B|C -1|2,334|5 -10|13|10. 
-""" - # Parsers support only length-1 decimals - msg = "Only length-1 decimal markers supported" - parser = all_parsers - - with pytest.raises(ValueError, match=msg): - parser.read_csv(StringIO(data), decimal="") - - -def test_bad_stream_exception(all_parsers, csv_dir_path): - # see gh-13652 - # - # This test validates that both the Python engine and C engine will - # raise UnicodeDecodeError instead of C engine raising ParserError - # and swallowing the exception that caused read to fail. - path = os.path.join(csv_dir_path, "sauron.SHIFT_JIS.csv") - codec = codecs.lookup("utf-8") - utf8 = codecs.lookup("utf-8") - parser = all_parsers - msg = "'utf-8' codec can't decode byte" - - # Stream must be binary UTF8. - with open(path, "rb") as handle, codecs.StreamRecoder( - handle, utf8.encode, utf8.decode, codec.streamreader, codec.streamwriter - ) as stream: - with pytest.raises(UnicodeDecodeError, match=msg): - parser.read_csv(stream) - - -def test_malformed(all_parsers): - # see gh-6607 - parser = all_parsers - data = """ignore -A,B,C -1,2,3 # comment -1,2,3,4,5 -2,3,4 -""" - msg = "Expected 3 fields in line 4, saw 5" - with pytest.raises(ParserError, match=msg): - parser.read_csv(StringIO(data), header=1, comment="#") - - -@pytest.mark.parametrize("nrows", [5, 3, None]) -def test_malformed_chunks(all_parsers, nrows): - data = """ignore -A,B,C -skip -1,2,3 -3,5,10 # comment -1,2,3,4,5 -2,3,4 -""" - parser = all_parsers - msg = "Expected 3 fields in line 6, saw 5" - with parser.read_csv( - StringIO(data), header=1, comment="#", iterator=True, chunksize=1, skiprows=[2] - ) as reader: - with pytest.raises(ParserError, match=msg): - reader.read(nrows) - - -def test_catch_too_many_names(all_parsers): - # see gh-5156 - data = """\ -1,2,3 -4,,6 -7,8,9 -10,11,12\n""" - parser = all_parsers - msg = ( - "Too many columns specified: expected 4 and found 3" - if parser.engine == "c" - else "Number of passed names did not match " - "number of header fields in the file" - ) - - with pytest.raises(ValueError, match=msg): - parser.read_csv(StringIO(data), header=0, names=["a", "b", "c", "d"]) - - -@pytest.mark.parametrize("nrows", [0, 1, 2, 3, 4, 5]) -def test_raise_on_no_columns(all_parsers, nrows): - parser = all_parsers - data = "\n" * nrows - - msg = "No columns to parse from file" - with pytest.raises(EmptyDataError, match=msg): - parser.read_csv(StringIO(data)) - - -def test_unexpected_keyword_parameter_exception(all_parsers): - # GH-34976 - parser = all_parsers - - msg = "{}\\(\\) got an unexpected keyword argument 'foo'" - with pytest.raises(TypeError, match=msg.format("read_csv")): - parser.read_csv("foo.csv", foo=1) - with pytest.raises(TypeError, match=msg.format("read_table")): - parser.read_table("foo.tsv", foo=1) - - -def test_suppress_error_output(all_parsers, capsys): - # see gh-15925 - parser = all_parsers - data = "a\n1\n1,2,3\n4\n5,6,7" - expected = DataFrame({"a": [1, 4]}) - - result = parser.read_csv(StringIO(data), on_bad_lines="skip") - tm.assert_frame_equal(result, expected) - - captured = capsys.readouterr() - assert captured.err == "" - - -def test_error_bad_lines(all_parsers): - # see gh-15925 - parser = all_parsers - data = "a\n1\n1,2,3\n4\n5,6,7" - - msg = "Expected 1 fields in line 3, saw 3" - with pytest.raises(ParserError, match=msg): - parser.read_csv(StringIO(data), on_bad_lines="error") - - -def test_warn_bad_lines(all_parsers, capsys): - # see gh-15925 - parser = all_parsers - data = "a\n1\n1,2,3\n4\n5,6,7" - expected = DataFrame({"a": [1, 4]}) - - result = 
parser.read_csv(StringIO(data), on_bad_lines="warn") - tm.assert_frame_equal(result, expected) - - captured = capsys.readouterr() - assert "Skipping line 3" in captured.err - assert "Skipping line 5" in captured.err - - -def test_read_csv_wrong_num_columns(all_parsers): - # Too few columns. - data = """A,B,C,D,E,F -1,2,3,4,5,6 -6,7,8,9,10,11,12 -11,12,13,14,15,16 -""" - parser = all_parsers - msg = "Expected 6 fields in line 3, saw 7" - - with pytest.raises(ParserError, match=msg): - parser.read_csv(StringIO(data)) - - -def test_null_byte_char(request, all_parsers): - # see gh-2741 - data = "\x00,foo" - names = ["a", "b"] - parser = all_parsers - - if parser.engine == "c" or (parser.engine == "python" and PY311): - if parser.engine == "python" and PY311: - request.node.add_marker( - pytest.mark.xfail( - reason="In Python 3.11, this is read as an empty character not null" - ) - ) - expected = DataFrame([[np.nan, "foo"]], columns=names) - out = parser.read_csv(StringIO(data), names=names) - tm.assert_frame_equal(out, expected) - else: - msg = "NULL byte detected" - with pytest.raises(ParserError, match=msg): - parser.read_csv(StringIO(data), names=names) - - -def test_open_file(request, all_parsers): - # GH 39024 - parser = all_parsers - if parser.engine == "c": - request.node.add_marker( - pytest.mark.xfail( - reason=f"{parser.engine} engine does not support sep=None " - f"with delim_whitespace=False" - ) - ) - - with tm.ensure_clean() as path: - file = Path(path) - file.write_bytes(b"\xe4\na\n1") - - with warnings.catch_warnings(record=True) as record: - # should not trigger a ResourceWarning - warnings.simplefilter("always", category=ResourceWarning) - with pytest.raises(csv.Error, match="Could not determine delimiter"): - parser.read_csv(file, sep=None, encoding_errors="replace") - assert len(record) == 0, record[0].message - - -def test_invalid_on_bad_line(all_parsers): - parser = all_parsers - data = "a\n1\n1,2,3\n4\n5,6,7" - with pytest.raises(ValueError, match="Argument abc is invalid for on_bad_lines"): - parser.read_csv(StringIO(data), on_bad_lines="abc") - - -def test_bad_header_uniform_error(all_parsers): - parser = all_parsers - data = "+++123456789...\ncol1,col2,col3,col4\n1,2,3,4\n" - msg = "Expected 2 fields in line 2, saw 4" - if parser.engine == "c": - msg = ( - "Could not construct index. Requested to use 1 " - "number of columns, but 3 left to parse." - ) - - with pytest.raises(ParserError, match=msg): - parser.read_csv(StringIO(data), index_col=0, on_bad_lines="error") - - -def test_on_bad_lines_warn_correct_formatting(all_parsers, capsys): - # see gh-15925 - parser = all_parsers - data = """1,2 -a,b -a,b,c -a,b,d -a,b -""" - expected = DataFrame({"1": "a", "2": ["b"] * 2}) - - result = parser.read_csv(StringIO(data), on_bad_lines="warn") - tm.assert_frame_equal(result, expected) - - captured = capsys.readouterr() - if parser.engine == "c": - warn = """Skipping line 3: expected 2 fields, saw 3 -Skipping line 4: expected 2 fields, saw 3 - -""" - else: - warn = """Skipping line 3: Expected 2 fields in line 3, saw 3 -Skipping line 4: Expected 2 fields in line 4, saw 3 -""" - assert captured.err == warn diff --git a/pandas/tests/io/parser/common/test_verbose.py b/pandas/tests/io/parser/common/test_verbose.py deleted file mode 100644 index 335065db974dc..0000000000000 --- a/pandas/tests/io/parser/common/test_verbose.py +++ /dev/null @@ -1,55 +0,0 @@ -""" -Tests that work on both the Python and C engines but do not have a -specific classification into the other test modules. 
-""" -from io import StringIO - -import pytest - -pytestmark = pytest.mark.usefixtures("pyarrow_skip") - - -def test_verbose_read(all_parsers, capsys): - parser = all_parsers - data = """a,b,c,d -one,1,2,3 -one,1,2,3 -,1,2,3 -one,1,2,3 -,1,2,3 -,1,2,3 -one,1,2,3 -two,1,2,3""" - - # Engines are verbose in different ways. - parser.read_csv(StringIO(data), verbose=True) - captured = capsys.readouterr() - - if parser.engine == "c": - assert "Tokenization took:" in captured.out - assert "Parser memory cleanup took:" in captured.out - else: # Python engine - assert captured.out == "Filled 3 NA values in column a\n" - - -def test_verbose_read2(all_parsers, capsys): - parser = all_parsers - data = """a,b,c,d -one,1,2,3 -two,1,2,3 -three,1,2,3 -four,1,2,3 -five,1,2,3 -,1,2,3 -seven,1,2,3 -eight,1,2,3""" - - parser.read_csv(StringIO(data), verbose=True, index_col=0) - captured = capsys.readouterr() - - # Engines are verbose in different ways. - if parser.engine == "c": - assert "Tokenization took:" in captured.out - assert "Parser memory cleanup took:" in captured.out - else: # Python engine - assert captured.out == "Filled 1 NA values in column a\n" diff --git a/pandas/tests/io/parser/conftest.py b/pandas/tests/io/parser/conftest.py deleted file mode 100644 index 3ab40ff846cb6..0000000000000 --- a/pandas/tests/io/parser/conftest.py +++ /dev/null @@ -1,297 +0,0 @@ -from __future__ import annotations - -import os - -import pytest - -from pandas.compat._optional import VERSIONS - -from pandas import ( - read_csv, - read_table, -) -import pandas._testing as tm - - -class BaseParser: - engine: str | None = None - low_memory = True - float_precision_choices: list[str | None] = [] - - def update_kwargs(self, kwargs): - kwargs = kwargs.copy() - kwargs.update({"engine": self.engine, "low_memory": self.low_memory}) - - return kwargs - - def read_csv(self, *args, **kwargs): - kwargs = self.update_kwargs(kwargs) - return read_csv(*args, **kwargs) - - def read_csv_check_warnings( - self, warn_type: type[Warning], warn_msg: str, *args, **kwargs - ): - # We need to check the stacklevel here instead of in the tests - # since this is where read_csv is called and where the warning - # should point to. - kwargs = self.update_kwargs(kwargs) - with tm.assert_produces_warning(warn_type, match=warn_msg): - return read_csv(*args, **kwargs) - - def read_table(self, *args, **kwargs): - kwargs = self.update_kwargs(kwargs) - return read_table(*args, **kwargs) - - def read_table_check_warnings( - self, warn_type: type[Warning], warn_msg: str, *args, **kwargs - ): - # We need to check the stacklevel here instead of in the tests - # since this is where read_table is called and where the warning - # should point to. - kwargs = self.update_kwargs(kwargs) - with tm.assert_produces_warning(warn_type, match=warn_msg): - return read_table(*args, **kwargs) - - -class CParser(BaseParser): - engine = "c" - float_precision_choices = [None, "high", "round_trip"] - - -class CParserHighMemory(CParser): - low_memory = False - - -class CParserLowMemory(CParser): - low_memory = True - - -class PythonParser(BaseParser): - engine = "python" - float_precision_choices = [None] - - -class PyArrowParser(BaseParser): - engine = "pyarrow" - float_precision_choices = [None] - - -@pytest.fixture -def csv_dir_path(datapath): - """ - The directory path to the data files needed for parser tests. - """ - return datapath("io", "parser", "data") - - -@pytest.fixture -def csv1(datapath): - """ - The path to the data file "test1.csv" needed for parser tests. 
- """ - return os.path.join(datapath("io", "data", "csv"), "test1.csv") - - -_cParserHighMemory = CParserHighMemory -_cParserLowMemory = CParserLowMemory -_pythonParser = PythonParser -_pyarrowParser = PyArrowParser - -_py_parsers_only = [_pythonParser] -_c_parsers_only = [_cParserHighMemory, _cParserLowMemory] -_pyarrow_parsers_only = [pytest.param(_pyarrowParser, marks=pytest.mark.single_cpu)] - -_all_parsers = [*_c_parsers_only, *_py_parsers_only, *_pyarrow_parsers_only] - -_py_parser_ids = ["python"] -_c_parser_ids = ["c_high", "c_low"] -_pyarrow_parsers_ids = ["pyarrow"] - -_all_parser_ids = [*_c_parser_ids, *_py_parser_ids, *_pyarrow_parsers_ids] - - -@pytest.fixture(params=_all_parsers, ids=_all_parser_ids) -def all_parsers(request): - """ - Fixture all of the CSV parsers. - """ - parser = request.param() - if parser.engine == "pyarrow": - pytest.importorskip("pyarrow", VERSIONS["pyarrow"]) - # Try finding a way to disable threads all together - # for more stable CI runs - import pyarrow - - pyarrow.set_cpu_count(1) - return parser - - -@pytest.fixture(params=_c_parsers_only, ids=_c_parser_ids) -def c_parser_only(request): - """ - Fixture all of the CSV parsers using the C engine. - """ - return request.param() - - -@pytest.fixture(params=_py_parsers_only, ids=_py_parser_ids) -def python_parser_only(request): - """ - Fixture all of the CSV parsers using the Python engine. - """ - return request.param() - - -@pytest.fixture(params=_pyarrow_parsers_only, ids=_pyarrow_parsers_ids) -def pyarrow_parser_only(request): - """ - Fixture all of the CSV parsers using the Pyarrow engine. - """ - return request.param() - - -def _get_all_parser_float_precision_combinations(): - """ - Return all allowable parser and float precision - combinations and corresponding ids. - """ - params = [] - ids = [] - for parser, parser_id in zip(_all_parsers, _all_parser_ids): - if hasattr(parser, "values"): - # Wrapped in pytest.param, get the actual parser back - parser = parser.values[0] - for precision in parser.float_precision_choices: - # Re-wrap in pytest.param for pyarrow - mark = pytest.mark.single_cpu if parser.engine == "pyarrow" else () - param = pytest.param((parser(), precision), marks=mark) - params.append(param) - ids.append(f"{parser_id}-{precision}") - - return {"params": params, "ids": ids} - - -@pytest.fixture( - params=_get_all_parser_float_precision_combinations()["params"], - ids=_get_all_parser_float_precision_combinations()["ids"], -) -def all_parsers_all_precisions(request): - """ - Fixture for all allowable combinations of parser - and float precision - """ - return request.param - - -_utf_values = [8, 16, 32] - -_encoding_seps = ["", "-", "_"] -_encoding_prefixes = ["utf", "UTF"] - -_encoding_fmts = [ - f"{prefix}{sep}{{0}}" for sep in _encoding_seps for prefix in _encoding_prefixes -] - - -@pytest.fixture(params=_utf_values) -def utf_value(request): - """ - Fixture for all possible integer values for a UTF encoding. - """ - return request.param - - -@pytest.fixture(params=_encoding_fmts) -def encoding_fmt(request): - """ - Fixture for all possible string formats of a UTF encoding. 
- """ - return request.param - - -@pytest.fixture( - params=[ - ("-1,0", -1.0), - ("-1,2e0", -1.2), - ("-1e0", -1.0), - ("+1e0", 1.0), - ("+1e+0", 1.0), - ("+1e-1", 0.1), - ("+,1e1", 1.0), - ("+1,e0", 1.0), - ("-,1e1", -1.0), - ("-1,e0", -1.0), - ("0,1", 0.1), - ("1,", 1.0), - (",1", 0.1), - ("-,1", -0.1), - ("1_,", 1.0), - ("1_234,56", 1234.56), - ("1_234,56e0", 1234.56), - # negative cases; must not parse as float - ("_", "_"), - ("-_", "-_"), - ("-_1", "-_1"), - ("-_1e0", "-_1e0"), - ("_1", "_1"), - ("_1,", "_1,"), - ("_1,_", "_1,_"), - ("_1e0", "_1e0"), - ("1,2e_1", "1,2e_1"), - ("1,2e1_0", "1,2e1_0"), - ("1,_2", "1,_2"), - (",1__2", ",1__2"), - (",1e", ",1e"), - ("-,1e", "-,1e"), - ("1_000,000_000", "1_000,000_000"), - ("1,e1_2", "1,e1_2"), - ("e11,2", "e11,2"), - ("1e11,2", "1e11,2"), - ("1,2,2", "1,2,2"), - ("1,2_1", "1,2_1"), - ("1,2e-10e1", "1,2e-10e1"), - ("--1,2", "--1,2"), - ("1a_2,1", "1a_2,1"), - ("1,2E-1", 0.12), - ("1,2E1", 12.0), - ] -) -def numeric_decimal(request): - """ - Fixture for all numeric formats which should get recognized. The first entry - represents the value to read while the second represents the expected result. - """ - return request.param - - -@pytest.fixture -def pyarrow_xfail(request): - """ - Fixture that xfails a test if the engine is pyarrow. - """ - if "all_parsers" in request.fixturenames: - parser = request.getfixturevalue("all_parsers") - elif "all_parsers_all_precisions" in request.fixturenames: - # Return value is tuple of (engine, precision) - parser = request.getfixturevalue("all_parsers_all_precisions")[0] - else: - return - if parser.engine == "pyarrow": - mark = pytest.mark.xfail(reason="pyarrow doesn't support this.") - request.node.add_marker(mark) - - -@pytest.fixture -def pyarrow_skip(request): - """ - Fixture that skips a test if the engine is pyarrow. 
- """ - if "all_parsers" in request.fixturenames: - parser = request.getfixturevalue("all_parsers") - elif "all_parsers_all_precisions" in request.fixturenames: - # Return value is tuple of (engine, precision) - parser = request.getfixturevalue("all_parsers_all_precisions")[0] - else: - return - if parser.engine == "pyarrow": - pytest.skip("pyarrow doesn't support this.") diff --git a/pandas/tests/io/parser/data/items.jsonl b/pandas/tests/io/parser/data/items.jsonl deleted file mode 100644 index f784d37befa82..0000000000000 --- a/pandas/tests/io/parser/data/items.jsonl +++ /dev/null @@ -1,2 +0,0 @@ -{"a": 1, "b": 2} -{"b":2, "a" :1} diff --git a/pandas/tests/io/parser/data/salaries.csv b/pandas/tests/io/parser/data/salaries.csv deleted file mode 100644 index 85631704ff6e0..0000000000000 --- a/pandas/tests/io/parser/data/salaries.csv +++ /dev/null @@ -1,47 +0,0 @@ -S X E M -13876 1 1 1 -11608 1 3 0 -18701 1 3 1 -11283 1 2 0 -11767 1 3 0 -20872 2 2 1 -11772 2 2 0 -10535 2 1 0 -12195 2 3 0 -12313 3 2 0 -14975 3 1 1 -21371 3 2 1 -19800 3 3 1 -11417 4 1 0 -20263 4 3 1 -13231 4 3 0 -12884 4 2 0 -13245 5 2 0 -13677 5 3 0 -15965 5 1 1 -12336 6 1 0 -21352 6 3 1 -13839 6 2 0 -22884 6 2 1 -16978 7 1 1 -14803 8 2 0 -17404 8 1 1 -22184 8 3 1 -13548 8 1 0 -14467 10 1 0 -15942 10 2 0 -23174 10 3 1 -23780 10 2 1 -25410 11 2 1 -14861 11 1 0 -16882 12 2 0 -24170 12 3 1 -15990 13 1 0 -26330 13 2 1 -17949 14 2 0 -25685 15 3 1 -27837 16 2 1 -18838 16 2 0 -17483 16 1 0 -19207 17 2 0 -19346 20 1 0 diff --git a/pandas/tests/io/parser/data/salaries.csv.bz2 b/pandas/tests/io/parser/data/salaries.csv.bz2 deleted file mode 100644 index a68b4e62bf34a..0000000000000 Binary files a/pandas/tests/io/parser/data/salaries.csv.bz2 and /dev/null differ diff --git a/pandas/tests/io/parser/data/salaries.csv.gz b/pandas/tests/io/parser/data/salaries.csv.gz deleted file mode 100644 index 629de9703d345..0000000000000 Binary files a/pandas/tests/io/parser/data/salaries.csv.gz and /dev/null differ diff --git a/pandas/tests/io/parser/data/salaries.csv.xz b/pandas/tests/io/parser/data/salaries.csv.xz deleted file mode 100644 index 40df8e8f936dc..0000000000000 Binary files a/pandas/tests/io/parser/data/salaries.csv.xz and /dev/null differ diff --git a/pandas/tests/io/parser/data/salaries.csv.zip b/pandas/tests/io/parser/data/salaries.csv.zip deleted file mode 100644 index 294f65b36771d..0000000000000 Binary files a/pandas/tests/io/parser/data/salaries.csv.zip and /dev/null differ diff --git a/pandas/tests/io/parser/data/salaries.csv.zst b/pandas/tests/io/parser/data/salaries.csv.zst deleted file mode 100644 index 20c9ed8a7e39f..0000000000000 Binary files a/pandas/tests/io/parser/data/salaries.csv.zst and /dev/null differ diff --git a/pandas/tests/io/parser/data/sauron.SHIFT_JIS.csv b/pandas/tests/io/parser/data/sauron.SHIFT_JIS.csv deleted file mode 100644 index 218ddf333ef52..0000000000000 --- a/pandas/tests/io/parser/data/sauron.SHIFT_JIS.csv +++ /dev/null @@ -1,14 +0,0 @@ -num, text -1,TEiSauronAACkȂn̎ - OI3019N325j́AJEREREg[L̒𕑑Ƃwzrbg̖`xww֕xwV}̕x̓olB -2,wzrbg̖`xɌŷulȂtvifwzrbgV[Yx̎ł́ulilN}T[jvjƂ͔ނ̂ƂłB -3,̑҂łww֕xɂẮu‚̎wցithe One Ringjv̍AuiDark LordjvAu̎ҁithe Onej[1]vƂēoꂷBOjɂwV}̕xł́A̖SX̍ł͂鑤߂łB -4,TE͌AA_inj̑nSVgI푰ACkÄłA僁R[̔tɉSđ—AA_ɊQȂ݂ƂȂB -5,uTEvƂ̓NEFŁug̖т̂悾‚́vƂӖłAV_œl̈Ӗł閼OuSTEAvƌĂ΂邱ƂB -6,́ATEA݌Gtɂ閼łAww֕x쒆ɂăAŚuiTEj͎̖{̖͎gȂAɏɏo肷邱ƂȂvƔĂB -7,̂قAIɃGtɑ΂Ď̂Ƃ閼ɁAuAi^[iNjvAuA^miMȍ׍HtjvAuAEfBiAẺljvB -8,I̍̃TÉA݂ɕϐg\͂ĂB -9,̔\͂gΌڗ킵hȊO𑕂ƂA܂ȘTzƂɕς邱ƂłAGt狰ꂽB -10,IɈ‚̎wւグTÉA̗͂̎wւŐ鎖₻̏L҂xzł悤ɂȂB -11,܂ÂłтĂwւ艽xłh邱ƂłB -12,k[m[v̍ۂɔ̂j󂳂ꂽ́AxƔϐg邱Ƃ͂łȂȂÄӂ̋̂悤Ȍ낵pƂȂȂƂB 
-13,܂΂΁u܂Ԃ̂Ȃ΂ɉꂽځvƂSە\őꂽB diff --git a/pandas/tests/io/parser/data/sub_char.csv b/pandas/tests/io/parser/data/sub_char.csv deleted file mode 100644 index ff1fa777832c7..0000000000000 --- a/pandas/tests/io/parser/data/sub_char.csv +++ /dev/null @@ -1,2 +0,0 @@ -a,"b",c -1,2,3 \ No newline at end of file diff --git a/pandas/tests/io/parser/data/tar_csv.tar b/pandas/tests/io/parser/data/tar_csv.tar deleted file mode 100644 index d1819550e0a00..0000000000000 Binary files a/pandas/tests/io/parser/data/tar_csv.tar and /dev/null differ diff --git a/pandas/tests/io/parser/data/tar_csv.tar.gz b/pandas/tests/io/parser/data/tar_csv.tar.gz deleted file mode 100644 index 80505d345f1e2..0000000000000 Binary files a/pandas/tests/io/parser/data/tar_csv.tar.gz and /dev/null differ diff --git a/pandas/tests/io/parser/data/test2.csv b/pandas/tests/io/parser/data/test2.csv deleted file mode 100644 index 6f9141152001a..0000000000000 --- a/pandas/tests/io/parser/data/test2.csv +++ /dev/null @@ -1,6 +0,0 @@ -A,B,C,D,E -2000-01-03 00:00:00,0.980268513777,3.68573087906,-0.364216805298,-1.15973806169,foo -2000-01-04 00:00:00,1.04791624281,-0.0412318367011,-0.16181208307,0.212549316967,bar -2000-01-05 00:00:00,0.498580885705,0.731167677815,-0.537677223318,1.34627041952,baz -2000-01-06 00:00:00,1.12020151869,1.56762092543,0.00364077397681,0.67525259227,qux -2000-01-07 00:00:00,-0.487094399463,0.571454623474,-1.6116394093,0.103468562917,foo2 diff --git a/pandas/tests/io/parser/data/test_mmap.csv b/pandas/tests/io/parser/data/test_mmap.csv deleted file mode 100644 index 2885fc2bfbd69..0000000000000 --- a/pandas/tests/io/parser/data/test_mmap.csv +++ /dev/null @@ -1,4 +0,0 @@ -a,b,c -1,one,I -2,two,II -3,three,III diff --git a/pandas/tests/io/parser/data/unicode_series.csv b/pandas/tests/io/parser/data/unicode_series.csv deleted file mode 100644 index 2485e149edb06..0000000000000 --- a/pandas/tests/io/parser/data/unicode_series.csv +++ /dev/null @@ -1,18 +0,0 @@ -1617,King of New York (1990) -1618,All Things Fair (1996) -1619,"Sixth Man, The (1997)" -1620,Butterfly Kiss (1995) -1621,"Paris, France (1993)" -1622,"Crmonie, La (1995)" -1623,Hush (1998) -1624,Nightwatch (1997) -1625,Nobody Loves Me (Keiner liebt mich) (1994) -1626,"Wife, The (1995)" -1627,Lamerica (1994) -1628,Nico Icon (1995) -1629,"Silence of the Palace, The (Saimt el Qusur) (1994)" -1630,"Slingshot, The (1993)" -1631,Land and Freedom (Tierra y libertad) (1995) -1632, kldum klaka (Cold Fever) (1994) -1633,Etz Hadomim Tafus (Under the Domin Tree) (1994) -1634,Two Friends (1986) diff --git a/pandas/tests/io/parser/data/utf16_ex.txt b/pandas/tests/io/parser/data/utf16_ex.txt deleted file mode 100644 index f0b452a2bd5ff..0000000000000 Binary files a/pandas/tests/io/parser/data/utf16_ex.txt and /dev/null differ diff --git a/pandas/tests/io/parser/data/utf16_ex_small.zip b/pandas/tests/io/parser/data/utf16_ex_small.zip deleted file mode 100644 index b0560c1b1f6c4..0000000000000 Binary files a/pandas/tests/io/parser/data/utf16_ex_small.zip and /dev/null differ diff --git a/pandas/tests/io/parser/data/utf32_ex_small.zip b/pandas/tests/io/parser/data/utf32_ex_small.zip deleted file mode 100644 index 9a6d5c08da9db..0000000000000 Binary files a/pandas/tests/io/parser/data/utf32_ex_small.zip and /dev/null differ diff --git a/pandas/tests/io/parser/data/utf8_ex_small.zip b/pandas/tests/io/parser/data/utf8_ex_small.zip deleted file mode 100644 index a4c5440bdffa7..0000000000000 Binary files a/pandas/tests/io/parser/data/utf8_ex_small.zip and /dev/null differ diff --git 
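[Editor's note — illustration only, not part of the diff: the conftest.py deleted above runs every test against the C (high- and low-memory), Python, and pyarrow engines by wrapping read_csv in small per-engine classes exposed as pytest fixtures. A minimal sketch of that pattern, using hypothetical demo names (DemoParser, demo_parsers) rather than pandas' actual fixture objects:

    import pytest
    import pandas as pd
    from io import StringIO

    class DemoParser:
        """Bind one engine and inject it into every read_csv call."""

        def __init__(self, engine):
            self.engine = engine

        def read_csv(self, *args, **kwargs):
            # Mirrors BaseParser.update_kwargs above: the engine choice is
            # centralized here so tests stay engine-agnostic.
            kwargs.setdefault("engine", self.engine)
            return pd.read_csv(*args, **kwargs)

    @pytest.fixture(params=["c", "python"], ids=["c", "python"])
    def demo_parsers(request):
        # Tests requesting this fixture run once per engine id.
        return DemoParser(request.param)

    def test_roundtrip(demo_parsers):
        df = demo_parsers.read_csv(StringIO("a,b\n1,2"))
        assert df.shape == (1, 2)
]
diff --git 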
a/pandas/tests/io/parser/dtypes/__init__.py b/pandas/tests/io/parser/dtypes/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/pandas/tests/io/parser/dtypes/test_categorical.py b/pandas/tests/io/parser/dtypes/test_categorical.py deleted file mode 100644 index 33422d41c2f93..0000000000000 --- a/pandas/tests/io/parser/dtypes/test_categorical.py +++ /dev/null @@ -1,309 +0,0 @@ -""" -Tests dtype specification during parsing -for all of the parsers defined in parsers.py -""" -from io import StringIO -import os - -import numpy as np -import pytest - -from pandas.core.dtypes.dtypes import CategoricalDtype - -import pandas as pd -from pandas import ( - Categorical, - DataFrame, - Timestamp, -) -import pandas._testing as tm - -xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") -skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") - - -@xfail_pyarrow -@pytest.mark.parametrize( - "dtype", - [ - "category", - CategoricalDtype(), - {"a": "category", "b": "category", "c": CategoricalDtype()}, - ], -) -def test_categorical_dtype(all_parsers, dtype): - # see gh-10153 - parser = all_parsers - data = """a,b,c -1,a,3.4 -1,a,3.4 -2,b,4.5""" - expected = DataFrame( - { - "a": Categorical(["1", "1", "2"]), - "b": Categorical(["a", "a", "b"]), - "c": Categorical(["3.4", "3.4", "4.5"]), - } - ) - actual = parser.read_csv(StringIO(data), dtype=dtype) - tm.assert_frame_equal(actual, expected) - - -@skip_pyarrow # Flaky -@pytest.mark.parametrize("dtype", [{"b": "category"}, {1: "category"}]) -def test_categorical_dtype_single(all_parsers, dtype): - # see gh-10153 - parser = all_parsers - data = """a,b,c -1,a,3.4 -1,a,3.4 -2,b,4.5""" - expected = DataFrame( - {"a": [1, 1, 2], "b": Categorical(["a", "a", "b"]), "c": [3.4, 3.4, 4.5]} - ) - actual = parser.read_csv(StringIO(data), dtype=dtype) - tm.assert_frame_equal(actual, expected) - - -@xfail_pyarrow -def test_categorical_dtype_unsorted(all_parsers): - # see gh-10153 - parser = all_parsers - data = """a,b,c -1,b,3.4 -1,b,3.4 -2,a,4.5""" - expected = DataFrame( - { - "a": Categorical(["1", "1", "2"]), - "b": Categorical(["b", "b", "a"]), - "c": Categorical(["3.4", "3.4", "4.5"]), - } - ) - actual = parser.read_csv(StringIO(data), dtype="category") - tm.assert_frame_equal(actual, expected) - - -@xfail_pyarrow -def test_categorical_dtype_missing(all_parsers): - # see gh-10153 - parser = all_parsers - data = """a,b,c -1,b,3.4 -1,nan,3.4 -2,a,4.5""" - expected = DataFrame( - { - "a": Categorical(["1", "1", "2"]), - "b": Categorical(["b", np.nan, "a"]), - "c": Categorical(["3.4", "3.4", "4.5"]), - } - ) - actual = parser.read_csv(StringIO(data), dtype="category") - tm.assert_frame_equal(actual, expected) - - -@xfail_pyarrow -@pytest.mark.slow -def test_categorical_dtype_high_cardinality_numeric(all_parsers): - # see gh-18186 - parser = all_parsers - data = np.sort([str(i) for i in range(524289)]) - expected = DataFrame({"a": Categorical(data, ordered=True)}) - - actual = parser.read_csv(StringIO("a\n" + "\n".join(data)), dtype="category") - actual["a"] = actual["a"].cat.reorder_categories( - np.sort(actual.a.cat.categories), ordered=True - ) - tm.assert_frame_equal(actual, expected) - - -def test_categorical_dtype_utf16(all_parsers, csv_dir_path): - # see gh-10153 - pth = os.path.join(csv_dir_path, "utf16_ex.txt") - parser = all_parsers - encoding = "utf-16" - sep = "\t" - - expected = parser.read_csv(pth, sep=sep, encoding=encoding) - expected = expected.apply(Categorical) - - actual = parser.read_csv(pth, sep=sep, 
encoding=encoding, dtype="category") - tm.assert_frame_equal(actual, expected) - - -@xfail_pyarrow -def test_categorical_dtype_chunksize_infer_categories(all_parsers): - # see gh-10153 - parser = all_parsers - data = """a,b -1,a -1,b -1,b -2,c""" - expecteds = [ - DataFrame({"a": [1, 1], "b": Categorical(["a", "b"])}), - DataFrame({"a": [1, 2], "b": Categorical(["b", "c"])}, index=[2, 3]), - ] - with parser.read_csv( - StringIO(data), dtype={"b": "category"}, chunksize=2 - ) as actuals: - for actual, expected in zip(actuals, expecteds): - tm.assert_frame_equal(actual, expected) - - -@xfail_pyarrow -def test_categorical_dtype_chunksize_explicit_categories(all_parsers): - # see gh-10153 - parser = all_parsers - data = """a,b -1,a -1,b -1,b -2,c""" - cats = ["a", "b", "c"] - expecteds = [ - DataFrame({"a": [1, 1], "b": Categorical(["a", "b"], categories=cats)}), - DataFrame( - {"a": [1, 2], "b": Categorical(["b", "c"], categories=cats)}, - index=[2, 3], - ), - ] - dtype = CategoricalDtype(cats) - with parser.read_csv(StringIO(data), dtype={"b": dtype}, chunksize=2) as actuals: - for actual, expected in zip(actuals, expecteds): - tm.assert_frame_equal(actual, expected) - - -def test_categorical_dtype_latin1(all_parsers, csv_dir_path): - # see gh-10153 - pth = os.path.join(csv_dir_path, "unicode_series.csv") - parser = all_parsers - encoding = "latin-1" - - expected = parser.read_csv(pth, header=None, encoding=encoding) - expected[1] = Categorical(expected[1]) - - actual = parser.read_csv(pth, header=None, encoding=encoding, dtype={1: "category"}) - tm.assert_frame_equal(actual, expected) - - -@pytest.mark.parametrize("ordered", [False, True]) -@pytest.mark.parametrize( - "categories", - [["a", "b", "c"], ["a", "c", "b"], ["a", "b", "c", "d"], ["c", "b", "a"]], -) -def test_categorical_category_dtype(all_parsers, categories, ordered): - parser = all_parsers - data = """a,b -1,a -1,b -1,b -2,c""" - expected = DataFrame( - { - "a": [1, 1, 1, 2], - "b": Categorical( - ["a", "b", "b", "c"], categories=categories, ordered=ordered - ), - } - ) - - dtype = {"b": CategoricalDtype(categories=categories, ordered=ordered)} - result = parser.read_csv(StringIO(data), dtype=dtype) - tm.assert_frame_equal(result, expected) - - -def test_categorical_category_dtype_unsorted(all_parsers): - parser = all_parsers - data = """a,b -1,a -1,b -1,b -2,c""" - dtype = CategoricalDtype(["c", "b", "a"]) - expected = DataFrame( - { - "a": [1, 1, 1, 2], - "b": Categorical(["a", "b", "b", "c"], categories=["c", "b", "a"]), - } - ) - - result = parser.read_csv(StringIO(data), dtype={"b": dtype}) - tm.assert_frame_equal(result, expected) - - -def test_categorical_coerces_numeric(all_parsers): - parser = all_parsers - dtype = {"b": CategoricalDtype([1, 2, 3])} - - data = "b\n1\n1\n2\n3" - expected = DataFrame({"b": Categorical([1, 1, 2, 3])}) - - result = parser.read_csv(StringIO(data), dtype=dtype) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow # Flaky -def test_categorical_coerces_datetime(all_parsers): - parser = all_parsers - dti = pd.DatetimeIndex(["2017-01-01", "2018-01-01", "2019-01-01"], freq=None) - dtype = {"b": CategoricalDtype(dti)} - - data = "b\n2017-01-01\n2018-01-01\n2019-01-01" - expected = DataFrame({"b": Categorical(dtype["b"].categories)}) - - result = parser.read_csv(StringIO(data), dtype=dtype) - tm.assert_frame_equal(result, expected) - - -def test_categorical_coerces_timestamp(all_parsers): - parser = all_parsers - dtype = {"b": CategoricalDtype([Timestamp("2014")])} - - data = 
"b\n2014-01-01\n2014-01-01" - expected = DataFrame({"b": Categorical([Timestamp("2014")] * 2)}) - - result = parser.read_csv(StringIO(data), dtype=dtype) - tm.assert_frame_equal(result, expected) - - -def test_categorical_coerces_timedelta(all_parsers): - parser = all_parsers - dtype = {"b": CategoricalDtype(pd.to_timedelta(["1H", "2H", "3H"]))} - - data = "b\n1H\n2H\n3H" - expected = DataFrame({"b": Categorical(dtype["b"].categories)}) - - result = parser.read_csv(StringIO(data), dtype=dtype) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - "data", - [ - "b\nTrue\nFalse\nNA\nFalse", - "b\ntrue\nfalse\nNA\nfalse", - "b\nTRUE\nFALSE\nNA\nFALSE", - "b\nTrue\nFalse\nNA\nFALSE", - ], -) -def test_categorical_dtype_coerces_boolean(all_parsers, data): - # see gh-20498 - parser = all_parsers - dtype = {"b": CategoricalDtype([False, True])} - expected = DataFrame({"b": Categorical([True, False, None, False])}) - - result = parser.read_csv(StringIO(data), dtype=dtype) - tm.assert_frame_equal(result, expected) - - -def test_categorical_unexpected_categories(all_parsers): - parser = all_parsers - dtype = {"b": CategoricalDtype(["a", "b", "d", "e"])} - - data = "b\nd\na\nc\nd" # Unexpected c - expected = DataFrame({"b": Categorical(list("dacd"), dtype=dtype["b"])}) - - result = parser.read_csv(StringIO(data), dtype=dtype) - tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py deleted file mode 100644 index 915cc9a9a1f95..0000000000000 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ /dev/null @@ -1,540 +0,0 @@ -""" -Tests dtype specification during parsing -for all of the parsers defined in parsers.py -""" -from collections import defaultdict -from io import StringIO - -import numpy as np -import pytest - -from pandas.errors import ParserWarning - -import pandas as pd -from pandas import ( - DataFrame, - Timestamp, -) -import pandas._testing as tm -from pandas.core.arrays import ( - ArrowStringArray, - IntegerArray, - StringArray, -) - - -@pytest.mark.parametrize("dtype", [str, object]) -@pytest.mark.parametrize("check_orig", [True, False]) -@pytest.mark.usefixtures("pyarrow_xfail") -def test_dtype_all_columns(all_parsers, dtype, check_orig): - # see gh-3795, gh-6607 - parser = all_parsers - - df = DataFrame( - np.random.rand(5, 2).round(4), - columns=list("AB"), - index=["1A", "1B", "1C", "1D", "1E"], - ) - - with tm.ensure_clean("__passing_str_as_dtype__.csv") as path: - df.to_csv(path) - - result = parser.read_csv(path, dtype=dtype, index_col=0) - - if check_orig: - expected = df.copy() - result = result.astype(float) - else: - expected = df.astype(str) - - tm.assert_frame_equal(result, expected) - - -@pytest.mark.usefixtures("pyarrow_xfail") -def test_dtype_per_column(all_parsers): - parser = all_parsers - data = """\ -one,two -1,2.5 -2,3.5 -3,4.5 -4,5.5""" - expected = DataFrame( - [[1, "2.5"], [2, "3.5"], [3, "4.5"], [4, "5.5"]], columns=["one", "two"] - ) - expected["one"] = expected["one"].astype(np.float64) - expected["two"] = expected["two"].astype(object) - - result = parser.read_csv(StringIO(data), dtype={"one": np.float64, 1: str}) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.usefixtures("pyarrow_xfail") -def test_invalid_dtype_per_column(all_parsers): - parser = all_parsers - data = """\ -one,two -1,2.5 -2,3.5 -3,4.5 -4,5.5""" - - with pytest.raises(TypeError, match="data type [\"']foo[\"'] not understood"): - 
parser.read_csv(StringIO(data), dtype={"one": "foo", 1: "int"}) - - -@pytest.mark.usefixtures("pyarrow_xfail") -def test_raise_on_passed_int_dtype_with_nas(all_parsers): - # see gh-2631 - parser = all_parsers - data = """YEAR, DOY, a -2001,106380451,10 -2001,,11 -2001,106380451,67""" - - msg = ( - "Integer column has NA values" - if parser.engine == "c" - else "Unable to convert column DOY" - ) - with pytest.raises(ValueError, match=msg): - parser.read_csv(StringIO(data), dtype={"DOY": np.int64}, skipinitialspace=True) - - -@pytest.mark.usefixtures("pyarrow_xfail") -def test_dtype_with_converters(all_parsers): - parser = all_parsers - data = """a,b -1.1,2.2 -1.2,2.3""" - - # Dtype spec ignored if converted specified. - result = parser.read_csv_check_warnings( - ParserWarning, - "Both a converter and dtype were specified for column a " - "- only the converter will be used.", - StringIO(data), - dtype={"a": "i8"}, - converters={"a": lambda x: str(x)}, - ) - expected = DataFrame({"a": ["1.1", "1.2"], "b": [2.2, 2.3]}) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - "dtype", list(np.typecodes["AllInteger"] + np.typecodes["Float"]) -) -def test_numeric_dtype(all_parsers, dtype): - data = "0\n1" - parser = all_parsers - expected = DataFrame([0, 1], dtype=dtype) - - result = parser.read_csv(StringIO(data), header=None, dtype=dtype) - tm.assert_frame_equal(expected, result) - - -@pytest.mark.usefixtures("pyarrow_xfail") -def test_boolean_dtype(all_parsers): - parser = all_parsers - data = "\n".join( - [ - "a", - "True", - "TRUE", - "true", - "1", - "1.0", - "False", - "FALSE", - "false", - "0", - "0.0", - "NaN", - "nan", - "NA", - "null", - "NULL", - ] - ) - - result = parser.read_csv(StringIO(data), dtype="boolean") - expected = DataFrame( - { - "a": pd.array( - [ - True, - True, - True, - True, - True, - False, - False, - False, - False, - False, - None, - None, - None, - None, - None, - ], - dtype="boolean", - ) - } - ) - - tm.assert_frame_equal(result, expected) - - -@pytest.mark.usefixtures("pyarrow_xfail") -def test_delimiter_with_usecols_and_parse_dates(all_parsers): - # GH#35873 - result = all_parsers.read_csv( - StringIO('"dump","-9,1","-9,1",20101010'), - engine="python", - names=["col", "col1", "col2", "col3"], - usecols=["col1", "col2", "col3"], - parse_dates=["col3"], - decimal=",", - ) - expected = DataFrame( - {"col1": [-9.1], "col2": [-9.1], "col3": [Timestamp("2010-10-10")]} - ) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("thousands", ["_", None]) -def test_decimal_and_exponential( - request, python_parser_only, numeric_decimal, thousands -): - # GH#31920 - decimal_number_check(request, python_parser_only, numeric_decimal, thousands, None) - - -@pytest.mark.parametrize("thousands", ["_", None]) -@pytest.mark.parametrize("float_precision", [None, "legacy", "high", "round_trip"]) -def test_1000_sep_decimal_float_precision( - request, c_parser_only, numeric_decimal, float_precision, thousands -): - # test decimal and thousand sep handling in across 'float_precision' - # parsers - decimal_number_check( - request, c_parser_only, numeric_decimal, thousands, float_precision - ) - text, value = numeric_decimal - text = " " + text + " " - if isinstance(value, str): # the negative cases (parse as text) - value = " " + value + " " - decimal_number_check( - request, c_parser_only, (text, value), thousands, float_precision - ) - - -def decimal_number_check(request, parser, numeric_decimal, thousands, float_precision): - # GH#31920 - value 
= numeric_decimal[0] - if thousands is None and value in ("1_,", "1_234,56", "1_234,56e0"): - request.node.add_marker( - pytest.mark.xfail(reason=f"thousands={thousands} and sep is in {value}") - ) - df = parser.read_csv( - StringIO(value), - float_precision=float_precision, - sep="|", - thousands=thousands, - decimal=",", - header=None, - ) - val = df.iloc[0, 0] - assert val == numeric_decimal[1] - - -@pytest.mark.parametrize("float_precision", [None, "legacy", "high", "round_trip"]) -def test_skip_whitespace(c_parser_only, float_precision): - DATA = """id\tnum\t -1\t1.2 \t -1\t 2.1\t -2\t 1\t -2\t 1.2 \t -""" - df = c_parser_only.read_csv( - StringIO(DATA), - float_precision=float_precision, - sep="\t", - header=0, - dtype={1: np.float64}, - ) - tm.assert_series_equal(df.iloc[:, 1], pd.Series([1.2, 2.1, 1.0, 1.2], name="num")) - - -@pytest.mark.usefixtures("pyarrow_xfail") -def test_true_values_cast_to_bool(all_parsers): - # GH#34655 - text = """a,b -yes,xxx -no,yyy -1,zzz -0,aaa - """ - parser = all_parsers - result = parser.read_csv( - StringIO(text), - true_values=["yes"], - false_values=["no"], - dtype={"a": "boolean"}, - ) - expected = DataFrame( - {"a": [True, False, True, False], "b": ["xxx", "yyy", "zzz", "aaa"]} - ) - expected["a"] = expected["a"].astype("boolean") - tm.assert_frame_equal(result, expected) - - -@pytest.mark.usefixtures("pyarrow_xfail") -@pytest.mark.parametrize("dtypes, exp_value", [({}, "1"), ({"a.1": "int64"}, 1)]) -def test_dtype_mangle_dup_cols(all_parsers, dtypes, exp_value): - # GH#35211 - parser = all_parsers - data = """a,a\n1,1""" - dtype_dict = {"a": str, **dtypes} - # GH#42462 - dtype_dict_copy = dtype_dict.copy() - result = parser.read_csv(StringIO(data), dtype=dtype_dict) - expected = DataFrame({"a": ["1"], "a.1": [exp_value]}) - assert dtype_dict == dtype_dict_copy, "dtype dict changed" - tm.assert_frame_equal(result, expected) - - -@pytest.mark.usefixtures("pyarrow_xfail") -def test_dtype_mangle_dup_cols_single_dtype(all_parsers): - # GH#42022 - parser = all_parsers - data = """a,a\n1,1""" - result = parser.read_csv(StringIO(data), dtype=str) - expected = DataFrame({"a": ["1"], "a.1": ["1"]}) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.usefixtures("pyarrow_xfail") -def test_dtype_multi_index(all_parsers): - # GH 42446 - parser = all_parsers - data = "A,B,B\nX,Y,Z\n1,2,3" - - result = parser.read_csv( - StringIO(data), - header=list(range(2)), - dtype={ - ("A", "X"): np.int32, - ("B", "Y"): np.int32, - ("B", "Z"): np.float32, - }, - ) - - expected = DataFrame( - { - ("A", "X"): np.int32([1]), - ("B", "Y"): np.int32([2]), - ("B", "Z"): np.float32([3]), - } - ) - - tm.assert_frame_equal(result, expected) - - -def test_nullable_int_dtype(all_parsers, any_int_ea_dtype): - # GH 25472 - parser = all_parsers - dtype = any_int_ea_dtype - - data = """a,b,c -,3,5 -1,,6 -2,4,""" - expected = DataFrame( - { - "a": pd.array([pd.NA, 1, 2], dtype=dtype), - "b": pd.array([3, pd.NA, 4], dtype=dtype), - "c": pd.array([5, 6, pd.NA], dtype=dtype), - } - ) - actual = parser.read_csv(StringIO(data), dtype=dtype) - tm.assert_frame_equal(actual, expected) - - -@pytest.mark.usefixtures("pyarrow_xfail") -@pytest.mark.parametrize("default", ["float", "float64"]) -def test_dtypes_defaultdict(all_parsers, default): - # GH#41574 - data = """a,b -1,2 -""" - dtype = defaultdict(lambda: default, a="int64") - parser = all_parsers - result = parser.read_csv(StringIO(data), dtype=dtype) - expected = DataFrame({"a": [1], "b": 2.0}) - tm.assert_frame_equal(result, 
expected) - - -@pytest.mark.usefixtures("pyarrow_xfail") -def test_dtypes_defaultdict_mangle_dup_cols(all_parsers): - # GH#41574 - data = """a,b,a,b,b.1 -1,2,3,4,5 -""" - dtype = defaultdict(lambda: "float64", a="int64") - dtype["b.1"] = "int64" - parser = all_parsers - result = parser.read_csv(StringIO(data), dtype=dtype) - expected = DataFrame({"a": [1], "b": [2.0], "a.1": [3], "b.2": [4.0], "b.1": [5]}) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.usefixtures("pyarrow_xfail") -def test_dtypes_defaultdict_invalid(all_parsers): - # GH#41574 - data = """a,b -1,2 -""" - dtype = defaultdict(lambda: "invalid_dtype", a="int64") - parser = all_parsers - with pytest.raises(TypeError, match="not understood"): - parser.read_csv(StringIO(data), dtype=dtype) - - -def test_dtype_backend(all_parsers): - # GH#36712 - - parser = all_parsers - - data = """a,b,c,d,e,f,g,h,i,j -1,2.5,True,a,,,,,12-31-2019, -3,4.5,False,b,6,7.5,True,a,12-31-2019, -""" - result = parser.read_csv( - StringIO(data), dtype_backend="numpy_nullable", parse_dates=["i"] - ) - expected = DataFrame( - { - "a": pd.Series([1, 3], dtype="Int64"), - "b": pd.Series([2.5, 4.5], dtype="Float64"), - "c": pd.Series([True, False], dtype="boolean"), - "d": pd.Series(["a", "b"], dtype="string"), - "e": pd.Series([pd.NA, 6], dtype="Int64"), - "f": pd.Series([pd.NA, 7.5], dtype="Float64"), - "g": pd.Series([pd.NA, True], dtype="boolean"), - "h": pd.Series([pd.NA, "a"], dtype="string"), - "i": pd.Series([Timestamp("2019-12-31")] * 2), - "j": pd.Series([pd.NA, pd.NA], dtype="Int64"), - } - ) - tm.assert_frame_equal(result, expected) - - -def test_dtype_backend_and_dtype(all_parsers): - # GH#36712 - - parser = all_parsers - - data = """a,b -1,2.5 -, -""" - result = parser.read_csv( - StringIO(data), dtype_backend="numpy_nullable", dtype="float64" - ) - expected = DataFrame({"a": [1.0, np.nan], "b": [2.5, np.nan]}) - tm.assert_frame_equal(result, expected) - - -def test_dtype_backend_string(all_parsers, string_storage): - # GH#36712 - pa = pytest.importorskip("pyarrow") - - with pd.option_context("mode.string_storage", string_storage): - parser = all_parsers - - data = """a,b -a,x -b, -""" - result = parser.read_csv(StringIO(data), dtype_backend="numpy_nullable") - - if string_storage == "python": - expected = DataFrame( - { - "a": StringArray(np.array(["a", "b"], dtype=np.object_)), - "b": StringArray(np.array(["x", pd.NA], dtype=np.object_)), - } - ) - else: - expected = DataFrame( - { - "a": ArrowStringArray(pa.array(["a", "b"])), - "b": ArrowStringArray(pa.array(["x", None])), - } - ) - tm.assert_frame_equal(result, expected) - - -def test_dtype_backend_ea_dtype_specified(all_parsers): - # GH#491496 - data = """a,b -1,2 -""" - parser = all_parsers - result = parser.read_csv( - StringIO(data), dtype="Int64", dtype_backend="numpy_nullable" - ) - expected = DataFrame({"a": [1], "b": 2}, dtype="Int64") - tm.assert_frame_equal(result, expected) - - -def test_dtype_backend_pyarrow(all_parsers, request): - # GH#36712 - pa = pytest.importorskip("pyarrow") - parser = all_parsers - - data = """a,b,c,d,e,f,g,h,i,j -1,2.5,True,a,,,,,12-31-2019, -3,4.5,False,b,6,7.5,True,a,12-31-2019, -""" - result = parser.read_csv(StringIO(data), dtype_backend="pyarrow", parse_dates=["i"]) - expected = DataFrame( - { - "a": pd.Series([1, 3], dtype="int64[pyarrow]"), - "b": pd.Series([2.5, 4.5], dtype="float64[pyarrow]"), - "c": pd.Series([True, False], dtype="bool[pyarrow]"), - "d": pd.Series(["a", "b"], dtype=pd.ArrowDtype(pa.string())), - "e": 
pd.Series([pd.NA, 6], dtype="int64[pyarrow]"), - "f": pd.Series([pd.NA, 7.5], dtype="float64[pyarrow]"), - "g": pd.Series([pd.NA, True], dtype="bool[pyarrow]"), - "h": pd.Series( - [pd.NA, "a"], - dtype=pd.ArrowDtype(pa.string()), - ), - "i": pd.Series([Timestamp("2019-12-31")] * 2), - "j": pd.Series([pd.NA, pd.NA], dtype="null[pyarrow]"), - } - ) - tm.assert_frame_equal(result, expected) - - -def test_ea_int_avoid_overflow(all_parsers): - # GH#32134 - parser = all_parsers - data = """a,b -1,1 -,1 -1582218195625938945,1 -""" - result = parser.read_csv(StringIO(data), dtype={"a": "Int64"}) - expected = DataFrame( - { - "a": IntegerArray( - np.array([1, 1, 1582218195625938945]), np.array([False, True, False]) - ), - "b": 1, - } - ) - tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/dtypes/test_empty.py b/pandas/tests/io/parser/dtypes/test_empty.py deleted file mode 100644 index 1f709a3cd8f28..0000000000000 --- a/pandas/tests/io/parser/dtypes/test_empty.py +++ /dev/null @@ -1,174 +0,0 @@ -""" -Tests dtype specification during parsing -for all of the parsers defined in parsers.py -""" -from io import StringIO - -import numpy as np -import pytest - -from pandas import ( - Categorical, - DataFrame, - Index, - MultiIndex, - Series, - concat, -) -import pandas._testing as tm - -# TODO(1.4): Change me into individual xfails at release time -pytestmark = pytest.mark.usefixtures("pyarrow_skip") - - -def test_dtype_all_columns_empty(all_parsers): - # see gh-12048 - parser = all_parsers - result = parser.read_csv(StringIO("A,B"), dtype=str) - - expected = DataFrame({"A": [], "B": []}, dtype=str) - tm.assert_frame_equal(result, expected) - - -def test_empty_pass_dtype(all_parsers): - parser = all_parsers - - data = "one,two" - result = parser.read_csv(StringIO(data), dtype={"one": "u1"}) - - expected = DataFrame( - {"one": np.empty(0, dtype="u1"), "two": np.empty(0, dtype=object)}, - ) - tm.assert_frame_equal(result, expected) - - -def test_empty_with_index_pass_dtype(all_parsers): - parser = all_parsers - - data = "one,two" - result = parser.read_csv( - StringIO(data), index_col=["one"], dtype={"one": "u1", 1: "f"} - ) - - expected = DataFrame( - {"two": np.empty(0, dtype="f")}, index=Index([], dtype="u1", name="one") - ) - tm.assert_frame_equal(result, expected) - - -def test_empty_with_multi_index_pass_dtype(all_parsers): - parser = all_parsers - - data = "one,two,three" - result = parser.read_csv( - StringIO(data), index_col=["one", "two"], dtype={"one": "u1", 1: "f8"} - ) - - exp_idx = MultiIndex.from_arrays( - [np.empty(0, dtype="u1"), np.empty(0, dtype=np.float64)], - names=["one", "two"], - ) - expected = DataFrame({"three": np.empty(0, dtype=object)}, index=exp_idx) - tm.assert_frame_equal(result, expected) - - -def test_empty_with_mangled_column_pass_dtype_by_names(all_parsers): - parser = all_parsers - - data = "one,one" - result = parser.read_csv(StringIO(data), dtype={"one": "u1", "one.1": "f"}) - - expected = DataFrame( - {"one": np.empty(0, dtype="u1"), "one.1": np.empty(0, dtype="f")}, - ) - tm.assert_frame_equal(result, expected) - - -def test_empty_with_mangled_column_pass_dtype_by_indexes(all_parsers): - parser = all_parsers - - data = "one,one" - result = parser.read_csv(StringIO(data), dtype={0: "u1", 1: "f"}) - - expected = DataFrame( - {"one": np.empty(0, dtype="u1"), "one.1": np.empty(0, dtype="f")}, - ) - tm.assert_frame_equal(result, expected) - - -def test_empty_with_dup_column_pass_dtype_by_indexes(all_parsers): - # see gh-9424 - parser = 
all_parsers - expected = concat( - [Series([], name="one", dtype="u1"), Series([], name="one.1", dtype="f")], - axis=1, - ) - - data = "one,one" - result = parser.read_csv(StringIO(data), dtype={0: "u1", 1: "f"}) - tm.assert_frame_equal(result, expected) - - -def test_empty_with_dup_column_pass_dtype_by_indexes_raises(all_parsers): - # see gh-9424 - parser = all_parsers - expected = concat( - [Series([], name="one", dtype="u1"), Series([], name="one.1", dtype="f")], - axis=1, - ) - expected.index = expected.index.astype(object) - - with pytest.raises(ValueError, match="Duplicate names"): - data = "" - parser.read_csv(StringIO(data), names=["one", "one"], dtype={0: "u1", 1: "f"}) - - -@pytest.mark.parametrize( - "dtype,expected", - [ - (np.float64, DataFrame(columns=["a", "b"], dtype=np.float64)), - ( - "category", - DataFrame({"a": Categorical([]), "b": Categorical([])}), - ), - ( - {"a": "category", "b": "category"}, - DataFrame({"a": Categorical([]), "b": Categorical([])}), - ), - ("datetime64[ns]", DataFrame(columns=["a", "b"], dtype="datetime64[ns]")), - ( - "timedelta64[ns]", - DataFrame( - { - "a": Series([], dtype="timedelta64[ns]"), - "b": Series([], dtype="timedelta64[ns]"), - }, - ), - ), - ( - {"a": np.int64, "b": np.int32}, - DataFrame( - {"a": Series([], dtype=np.int64), "b": Series([], dtype=np.int32)}, - ), - ), - ( - {0: np.int64, 1: np.int32}, - DataFrame( - {"a": Series([], dtype=np.int64), "b": Series([], dtype=np.int32)}, - ), - ), - ( - {"a": np.int64, 1: np.int32}, - DataFrame( - {"a": Series([], dtype=np.int64), "b": Series([], dtype=np.int32)}, - ), - ), - ], -) -def test_empty_dtype(all_parsers, dtype, expected): - # see gh-14712 - parser = all_parsers - data = "a,b" - - result = parser.read_csv(StringIO(data), header=0, dtype=dtype) - tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_c_parser_only.py b/pandas/tests/io/parser/test_c_parser_only.py deleted file mode 100644 index 818c4f3522606..0000000000000 --- a/pandas/tests/io/parser/test_c_parser_only.py +++ /dev/null @@ -1,688 +0,0 @@ -""" -Tests that apply specifically to the CParser. Unless specifically stated -as a CParser-specific issue, the goal is to eventually move as many of -these tests out of this module as soon as the Python parser can accept -further arguments when parsing. -""" -from decimal import Decimal -from io import ( - BytesIO, - StringIO, - TextIOWrapper, -) -import mmap -import os -import tarfile - -import numpy as np -import pytest - -from pandas.compat import is_ci_environment -from pandas.compat.numpy import np_version_gte1p24 -from pandas.errors import ParserError -import pandas.util._test_decorators as td - -from pandas import ( - DataFrame, - concat, -) -import pandas._testing as tm - - -@pytest.mark.parametrize( - "malformed", - ["1\r1\r1\r 1\r 1\r", "1\r1\r1\r 1\r 1\r11\r", "1\r1\r1\r 1\r 1\r11\r1\r"], - ids=["words pointer", "stream pointer", "lines pointer"], -) -def test_buffer_overflow(c_parser_only, malformed): - # see gh-9205: test certain malformed input files that cause - # buffer overflows in tokenizer.c - msg = "Buffer overflow caught - possible malformed input file." - parser = c_parser_only - - with pytest.raises(ParserError, match=msg): - parser.read_csv(StringIO(malformed)) - - -def test_buffer_rd_bytes(c_parser_only): - # see gh-12098: src->buffer in the C parser can be freed twice leading - # to a segfault if a corrupt gzip file is read with 'read_csv', and the - # buffer is filled more than once before gzip raises an Exception. 
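-    #
-    # The shape of the guard, in outline (a sketch only, not part of the
-    # test; `corrupt_gzip_text` is a hypothetical stand-in for the payload
-    # built below): feeding the same corrupt payload repeatedly must only
-    # ever raise an ordinary Python exception, never crash the interpreter.
-    #
-    #     for _ in range(100):
-    #         try:
-    #             pd.read_csv(StringIO(corrupt_gzip_text), compression="gzip")
-    #         except Exception:
-    #             pass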
- - data = ( - "\x1F\x8B\x08\x00\x00\x00\x00\x00\x00\x03\xED\xC3\x41\x09" - "\x00\x00\x08\x00\xB1\xB7\xB6\xBA\xFE\xA5\xCC\x21\x6C\xB0" - "\xA6\x4D" + "\x55" * 267 + "\x7D\xF7\x00\x91\xE0\x47\x97\x14\x38\x04\x00" - "\x1f\x8b\x08\x00VT\x97V\x00\x03\xed]\xefO" - ) - parser = c_parser_only - - for _ in range(100): - try: - parser.read_csv_check_warnings( - RuntimeWarning, - "compression has no effect when passing a non-binary object as input", - StringIO(data), - compression="gzip", - delim_whitespace=True, - ) - except Exception: - pass - - -def test_delim_whitespace_custom_terminator(c_parser_only): - # See gh-12912 - data = "a b c~1 2 3~4 5 6~7 8 9" - parser = c_parser_only - - df = parser.read_csv(StringIO(data), lineterminator="~", delim_whitespace=True) - expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["a", "b", "c"]) - tm.assert_frame_equal(df, expected) - - -def test_dtype_and_names_error(c_parser_only): - # see gh-8833: passing both dtype and names - # resulting in an error reporting issue - parser = c_parser_only - data = """ -1.0 1 -2.0 2 -3.0 3 -""" - # base cases - result = parser.read_csv(StringIO(data), sep=r"\s+", header=None) - expected = DataFrame([[1.0, 1], [2.0, 2], [3.0, 3]]) - tm.assert_frame_equal(result, expected) - - result = parser.read_csv(StringIO(data), sep=r"\s+", header=None, names=["a", "b"]) - expected = DataFrame([[1.0, 1], [2.0, 2], [3.0, 3]], columns=["a", "b"]) - tm.assert_frame_equal(result, expected) - - # fallback casting - result = parser.read_csv( - StringIO(data), sep=r"\s+", header=None, names=["a", "b"], dtype={"a": np.int32} - ) - expected = DataFrame([[1, 1], [2, 2], [3, 3]], columns=["a", "b"]) - expected["a"] = expected["a"].astype(np.int32) - tm.assert_frame_equal(result, expected) - - data = """ -1.0 1 -nan 2 -3.0 3 -""" - # fallback casting, but not castable - warning = RuntimeWarning if np_version_gte1p24 else None - with pytest.raises(ValueError, match="cannot safely convert"): - with tm.assert_produces_warning(warning, check_stacklevel=False): - parser.read_csv( - StringIO(data), - sep=r"\s+", - header=None, - names=["a", "b"], - dtype={"a": np.int32}, - ) - - -@pytest.mark.parametrize( - "match,kwargs", - [ - # For each of these cases, all of the dtypes are valid, just unsupported. 
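-        # For example, dtype={"A": "datetime64"} is a perfectly valid NumPy
-        # dtype, but read_csv rejects it with a TypeError; the supported
-        # route for datetime columns is parse_dates=["A"].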
- ( - ( - "the dtype datetime64 is not supported for parsing, " - "pass this column using parse_dates instead" - ), - {"dtype": {"A": "datetime64", "B": "float64"}}, - ), - ( - ( - "the dtype datetime64 is not supported for parsing, " - "pass this column using parse_dates instead" - ), - {"dtype": {"A": "datetime64", "B": "float64"}, "parse_dates": ["B"]}, - ), - ( - "the dtype timedelta64 is not supported for parsing", - {"dtype": {"A": "timedelta64", "B": "float64"}}, - ), - ( - f"the dtype {tm.ENDIAN}U8 is not supported for parsing", - {"dtype": {"A": "U8"}}, - ), - ], - ids=["dt64-0", "dt64-1", "td64", f"{tm.ENDIAN}U8"], -) -def test_unsupported_dtype(c_parser_only, match, kwargs): - parser = c_parser_only - df = DataFrame( - np.random.rand(5, 2), columns=list("AB"), index=["1A", "1B", "1C", "1D", "1E"] - ) - - with tm.ensure_clean("__unsupported_dtype__.csv") as path: - df.to_csv(path) - - with pytest.raises(TypeError, match=match): - parser.read_csv(path, index_col=0, **kwargs) - - -@td.skip_if_32bit -@pytest.mark.slow -def test_precise_conversion(c_parser_only): - parser = c_parser_only - - normal_errors = [] - precise_errors = [] - - def error(val: float, actual_val: Decimal) -> Decimal: - return abs(Decimal(f"{val:.100}") - actual_val) - - # test numbers between 1 and 2 - for num in np.linspace(1.0, 2.0, num=500): - # 25 decimal digits of precision - text = f"a\n{num:.25}" - - normal_val = float( - parser.read_csv(StringIO(text), float_precision="legacy")["a"][0] - ) - precise_val = float( - parser.read_csv(StringIO(text), float_precision="high")["a"][0] - ) - roundtrip_val = float( - parser.read_csv(StringIO(text), float_precision="round_trip")["a"][0] - ) - actual_val = Decimal(text[2:]) - - normal_errors.append(error(normal_val, actual_val)) - precise_errors.append(error(precise_val, actual_val)) - - # round-trip should match float() - assert roundtrip_val == float(text[2:]) - - assert sum(precise_errors) <= sum(normal_errors) - assert max(precise_errors) <= max(normal_errors) - - -def test_usecols_dtypes(c_parser_only): - parser = c_parser_only - data = """\ -1,2,3 -4,5,6 -7,8,9 -10,11,12""" - - result = parser.read_csv( - StringIO(data), - usecols=(0, 1, 2), - names=("a", "b", "c"), - header=None, - converters={"a": str}, - dtype={"b": int, "c": float}, - ) - result2 = parser.read_csv( - StringIO(data), - usecols=(0, 2), - names=("a", "b", "c"), - header=None, - converters={"a": str}, - dtype={"b": int, "c": float}, - ) - - assert (result.dtypes == [object, int, float]).all() - assert (result2.dtypes == [object, float]).all() - - -def test_disable_bool_parsing(c_parser_only): - # see gh-2090 - - parser = c_parser_only - data = """A,B,C -Yes,No,Yes -No,Yes,Yes -Yes,,Yes -No,No,No""" - - result = parser.read_csv(StringIO(data), dtype=object) - assert (result.dtypes == object).all() - - result = parser.read_csv(StringIO(data), dtype=object, na_filter=False) - assert result["B"][2] == "" - - -def test_custom_lineterminator(c_parser_only): - parser = c_parser_only - data = "a,b,c~1,2,3~4,5,6" - - result = parser.read_csv(StringIO(data), lineterminator="~") - expected = parser.read_csv(StringIO(data.replace("~", "\n"))) - - tm.assert_frame_equal(result, expected) - - -def test_parse_ragged_csv(c_parser_only): - parser = c_parser_only - data = """1,2,3 -1,2,3,4 -1,2,3,4,5 -1,2 -1,2,3,4""" - - nice_data = """1,2,3,, -1,2,3,4, -1,2,3,4,5 -1,2,,, -1,2,3,4,""" - result = parser.read_csv( - StringIO(data), header=None, names=["a", "b", "c", "d", "e"] - ) - - expected = parser.read_csv( - 
StringIO(nice_data), header=None, names=["a", "b", "c", "d", "e"]
-    )
-
-    tm.assert_frame_equal(result, expected)
-
-    # Too many columns: this would cause a segfault if not handled carefully.
-    data = "1,2\n3,4,5"
-
-    result = parser.read_csv(StringIO(data), header=None, names=range(50))
-    expected = parser.read_csv(StringIO(data), header=None, names=range(3)).reindex(
-        columns=range(50)
-    )
-
-    tm.assert_frame_equal(result, expected)
-
-
-def test_tokenize_CR_with_quoting(c_parser_only):
-    # see gh-3453
-    parser = c_parser_only
-    data = ' a,b,c\r"a,b","e,d","f,f"'
-
-    result = parser.read_csv(StringIO(data), header=None)
-    expected = parser.read_csv(StringIO(data.replace("\r", "\n")), header=None)
-    tm.assert_frame_equal(result, expected)
-
-    result = parser.read_csv(StringIO(data))
-    expected = parser.read_csv(StringIO(data.replace("\r", "\n")))
-    tm.assert_frame_equal(result, expected)
-
-
-@pytest.mark.slow
-def test_grow_boundary_at_cap(c_parser_only):
-    # See gh-12494
-    #
-    # The cause of the error was that the C parser
-    # was not increasing the buffer size when
-    # the desired space would fill the buffer
-    # to capacity, which would later cause a
-    # buffer overflow error when checking the
-    # EOF terminator of the CSV stream.
-    parser = c_parser_only
-
-    def test_empty_header_read(count):
-        with StringIO("," * count) as s:
-            expected = DataFrame(columns=[f"Unnamed: {i}" for i in range(count + 1)])
-            df = parser.read_csv(s)
-            tm.assert_frame_equal(df, expected)
-
-    for cnt in range(1, 101):
-        test_empty_header_read(cnt)
-
-
-def test_parse_trim_buffers(c_parser_only):
-    # This test is part of a bugfix for gh-13703. It attempts
-    # to stress the system memory allocator, to cause it to move the
-    # stream buffer and either let the OS reclaim the region, or let
-    # other memory requests of the parser otherwise modify the contents
-    # of the memory space where it was formerly located.
-    # This test is designed to cause a `segfault` with unpatched
-    # `tokenizer.c`. Sometimes the test fails with a `segfault`, other
-    # times it fails due to memory corruption, which causes the
-    # loaded DataFrame to differ from the expected one.
-
-    parser = c_parser_only
-
-    # Generate a large mixed-type CSV file on-the-fly (one record is
-    # approx 1.5KiB).
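-    # With chunksize=128 and n_lines = 2 * 128 + 15 = 271 records (set
-    # below), the reader consumes two full 128-row chunks and then a small
-    # 15-row residual chunk, which is the pattern that forces the
-    # buffer-trimming code path this test exercises.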
- record_ = ( - """9999-9,99:99,,,,ZZ,ZZ,,,ZZZ-ZZZZ,.Z-ZZZZ,-9.99,,,9.99,Z""" - """ZZZZ,,-99,9,ZZZ-ZZZZ,ZZ-ZZZZ,,9.99,ZZZ-ZZZZZ,ZZZ-ZZZZZ,""" - """ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,9""" - """99,ZZZ-ZZZZ,,ZZ-ZZZZ,,,,,ZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZ,,,9,9,""" - """9,9,99,99,999,999,ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZ,9,ZZ-ZZZZ,9.""" - """99,ZZ-ZZZZ,ZZ-ZZZZ,,,,ZZZZ,,,ZZ,ZZ,,,,,,,,,,,,,9,,,999.""" - """99,999.99,,,ZZZZZ,,,Z9,,,,,,,ZZZ,ZZZ,,,,,,,,,,,ZZZZZ,ZZ""" - """ZZZ,ZZZ-ZZZZZZ,ZZZ-ZZZZZZ,ZZ-ZZZZ,ZZ-ZZZZ,ZZ-ZZZZ,ZZ-ZZ""" - """ZZ,,,999999,999999,ZZZ,ZZZ,,,ZZZ,ZZZ,999.99,999.99,,,,Z""" - """ZZ-ZZZ,ZZZ-ZZZ,-9.99,-9.99,9,9,,99,,9.99,9.99,9,9,9.99,""" - """9.99,,,,9.99,9.99,,99,,99,9.99,9.99,,,ZZZ,ZZZ,,999.99,,""" - """999.99,ZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,,,ZZZZZ,ZZZZZ,ZZZ,ZZZ,9,9,""" - """,,,,,ZZZ-ZZZZ,ZZZ999Z,,,999.99,,999.99,ZZZ-ZZZZ,,,9.999""" - """,9.999,9.999,9.999,-9.999,-9.999,-9.999,-9.999,9.999,9.""" - """999,9.999,9.999,9.999,9.999,9.999,9.999,99999,ZZZ-ZZZZ,""" - """,9.99,ZZZ,,,,,,,,ZZZ,,,,,9,,,,9,,,,,,,,,,ZZZ-ZZZZ,ZZZ-Z""" - """ZZZ,,ZZZZZ,ZZZZZ,ZZZZZ,ZZZZZ,,,9.99,,ZZ-ZZZZ,ZZ-ZZZZ,ZZ""" - """,999,,,,ZZ-ZZZZ,ZZZ,ZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,,,99.99,99.99""" - """,,,9.99,9.99,9.99,9.99,ZZZ-ZZZZ,,,ZZZ-ZZZZZ,,,,,-9.99,-""" - """9.99,-9.99,-9.99,,,,,,,,,ZZZ-ZZZZ,,9,9.99,9.99,99ZZ,,-9""" - """.99,-9.99,ZZZ-ZZZZ,,,,,,,ZZZ-ZZZZ,9.99,9.99,9999,,,,,,,""" - """,,,-9.9,Z/Z-ZZZZ,999.99,9.99,,999.99,ZZ-ZZZZ,ZZ-ZZZZ,9.""" - """99,9.99,9.99,9.99,9.99,9.99,,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZ""" - """ZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ,ZZZ,ZZZ,ZZZ,9.99,,,-9.99,ZZ""" - """-ZZZZ,-999.99,,-9999,,999.99,,,,999.99,99.99,,,ZZ-ZZZZZ""" - """ZZZ,ZZ-ZZZZ-ZZZZZZZ,,,,ZZ-ZZ-ZZZZZZZZ,ZZZZZZZZ,ZZZ-ZZZZ""" - """,9999,999.99,ZZZ-ZZZZ,-9.99,-9.99,ZZZ-ZZZZ,99:99:99,,99""" - """,99,,9.99,,-99.99,,,,,,9.99,ZZZ-ZZZZ,-9.99,-9.99,9.99,9""" - """.99,,ZZZ,,,,,,,ZZZ,ZZZ,,,,,""" - ) - - # Set the number of lines so that a call to `parser_trim_buffers` - # is triggered: after a couple of full chunks are consumed a - # relatively small 'residual' chunk would cause reallocation - # within the parser. - chunksize, n_lines = 128, 2 * 128 + 15 - csv_data = "\n".join([record_] * n_lines) + "\n" - - # We will use StringIO to load the CSV from this text buffer. - # pd.read_csv() will iterate over the file in chunks and will - # finally read a residual chunk of really small size. - - # Generate the expected output: manually create the dataframe - # by splitting by comma and repeating the `n_lines` times. - row = tuple(val_ if val_ else np.nan for val_ in record_.split(",")) - expected = DataFrame( - [row for _ in range(n_lines)], dtype=object, columns=None, index=None - ) - - # Iterate over the CSV file in chunks of `chunksize` lines - with parser.read_csv( - StringIO(csv_data), header=None, dtype=object, chunksize=chunksize - ) as chunks_: - result = concat(chunks_, axis=0, ignore_index=True) - - # Check for data corruption if there was no segfault - tm.assert_frame_equal(result, expected) - - # This extra test was added to replicate the fault in gh-5291. - # Force 'utf-8' encoding, so that `_string_convert` would take - # a different execution branch. 
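-    # The decoded result must match the same `expected` frame: forcing the
-    # encoding changes only the internal string-conversion path, not the
-    # parsed values.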
- with parser.read_csv( - StringIO(csv_data), - header=None, - dtype=object, - chunksize=chunksize, - encoding="utf_8", - ) as chunks_: - result = concat(chunks_, axis=0, ignore_index=True) - tm.assert_frame_equal(result, expected) - - -def test_internal_null_byte(c_parser_only): - # see gh-14012 - # - # The null byte ('\x00') should not be used as a - # true line terminator, escape character, or comment - # character, only as a placeholder to indicate that - # none was specified. - # - # This test should be moved to test_common.py ONLY when - # Python's csv class supports parsing '\x00'. - parser = c_parser_only - - names = ["a", "b", "c"] - data = "1,2,3\n4,\x00,6\n7,8,9" - expected = DataFrame([[1, 2.0, 3], [4, np.nan, 6], [7, 8, 9]], columns=names) - - result = parser.read_csv(StringIO(data), names=names) - tm.assert_frame_equal(result, expected) - - -def test_read_nrows_large(c_parser_only): - # gh-7626 - Read only nrows of data in for large inputs (>262144b) - parser = c_parser_only - header_narrow = "\t".join(["COL_HEADER_" + str(i) for i in range(10)]) + "\n" - data_narrow = "\t".join(["somedatasomedatasomedata1" for _ in range(10)]) + "\n" - header_wide = "\t".join(["COL_HEADER_" + str(i) for i in range(15)]) + "\n" - data_wide = "\t".join(["somedatasomedatasomedata2" for _ in range(15)]) + "\n" - test_input = header_narrow + data_narrow * 1050 + header_wide + data_wide * 2 - - df = parser.read_csv(StringIO(test_input), sep="\t", nrows=1010) - - assert df.size == 1010 * 10 - - -def test_float_precision_round_trip_with_text(c_parser_only): - # see gh-15140 - parser = c_parser_only - df = parser.read_csv(StringIO("a"), header=None, float_precision="round_trip") - tm.assert_frame_equal(df, DataFrame({0: ["a"]})) - - -def test_large_difference_in_columns(c_parser_only): - # see gh-14125 - parser = c_parser_only - - count = 10000 - large_row = ("X," * count)[:-1] + "\n" - normal_row = "XXXXXX XXXXXX,111111111111111\n" - test_input = (large_row + normal_row * 6)[:-1] - - result = parser.read_csv(StringIO(test_input), header=None, usecols=[0]) - rows = test_input.split("\n") - - expected = DataFrame([row.split(",")[0] for row in rows]) - tm.assert_frame_equal(result, expected) - - -def test_data_after_quote(c_parser_only): - # see gh-15910 - parser = c_parser_only - - data = 'a\n1\n"b"a' - result = parser.read_csv(StringIO(data)) - - expected = DataFrame({"a": ["1", "ba"]}) - tm.assert_frame_equal(result, expected) - - -def test_comment_whitespace_delimited(c_parser_only, capsys): - parser = c_parser_only - test_input = """\ -1 2 -2 2 3 -3 2 3 # 3 fields -4 2 3# 3 fields -5 2 # 2 fields -6 2# 2 fields -7 # 1 field, NaN -8# 1 field, NaN -9 2 3 # skipped line -# comment""" - df = parser.read_csv( - StringIO(test_input), - comment="#", - header=None, - delimiter="\\s+", - skiprows=0, - on_bad_lines="warn", - ) - captured = capsys.readouterr() - # skipped lines 2, 3, 4, 9 - for line_num in (2, 3, 4, 9): - assert f"Skipping line {line_num}" in captured.err - expected = DataFrame([[1, 2], [5, 2], [6, 2], [7, np.nan], [8, np.nan]]) - tm.assert_frame_equal(df, expected) - - -def test_file_like_no_next(c_parser_only): - # gh-16530: the file-like need not have a "next" or "__next__" - # attribute despite having an "__iter__" attribute. - # - # NOTE: This is only true for the C engine, not Python engine. 
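-    # (Presumably this is because the C reader pulls bytes via `.read()`,
-    # while the Python engine hands the object to `csv.reader`, which
-    # iterates it and therefore needs `__next__`.)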
- class NoNextBuffer(StringIO): - def __next__(self): - raise AttributeError("No next method") - - next = __next__ - - parser = c_parser_only - data = "a\n1" - - expected = DataFrame({"a": [1]}) - result = parser.read_csv(NoNextBuffer(data)) - - tm.assert_frame_equal(result, expected) - - -def test_buffer_rd_bytes_bad_unicode(c_parser_only): - # see gh-22748 - t = BytesIO(b"\xB0") - t = TextIOWrapper(t, encoding="ascii", errors="surrogateescape") - msg = "'utf-8' codec can't encode character" - with pytest.raises(UnicodeError, match=msg): - c_parser_only.read_csv(t, encoding="UTF-8") - - -@pytest.mark.parametrize("tar_suffix", [".tar", ".tar.gz"]) -def test_read_tarfile(c_parser_only, csv_dir_path, tar_suffix): - # see gh-16530 - # - # Unfortunately, Python's CSV library can't handle - # tarfile objects (expects string, not bytes when - # iterating through a file-like). - parser = c_parser_only - tar_path = os.path.join(csv_dir_path, "tar_csv" + tar_suffix) - - with tarfile.open(tar_path, "r") as tar: - data_file = tar.extractfile("tar_data.csv") - - out = parser.read_csv(data_file) - expected = DataFrame({"a": [1]}) - tm.assert_frame_equal(out, expected) - - -@pytest.mark.single_cpu -@pytest.mark.skipif(is_ci_environment(), reason="Too memory intensive for CI.") -def test_bytes_exceed_2gb(c_parser_only): - # see gh-16798 - # - # Read from a "CSV" that has a column larger than 2GB. - parser = c_parser_only - - if parser.low_memory: - pytest.skip("not a low_memory test") - - # csv takes 10 seconds to construct, spikes memory to 8GB+, the whole test - # spikes up to 10.4GB on the c_high case - csv = StringIO("strings\n" + "\n".join(["x" * (1 << 20) for _ in range(2100)])) - df = parser.read_csv(csv) - assert not df.empty - - -def test_chunk_whitespace_on_boundary(c_parser_only): - # see gh-9735: this issue is C parser-specific (bug when - # parsing whitespace and characters at chunk boundary) - # - # This test case has a field too large for the Python parser / CSV library. - parser = c_parser_only - - chunk1 = "a" * (1024 * 256 - 2) + "\na" - chunk2 = "\n a" - result = parser.read_csv(StringIO(chunk1 + chunk2), header=None) - - expected = DataFrame(["a" * (1024 * 256 - 2), "a", " a"]) - tm.assert_frame_equal(result, expected) - - -def test_file_handles_mmap(c_parser_only, csv1): - # gh-14418 - # - # Don't close user provided file handles. - parser = c_parser_only - - with open(csv1, encoding="utf-8") as f: - with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as m: - parser.read_csv(m) - assert not m.closed - - -def test_file_binary_mode(c_parser_only): - # see gh-23779 - parser = c_parser_only - expected = DataFrame([[1, 2, 3], [4, 5, 6]]) - - with tm.ensure_clean() as path: - with open(path, "w", encoding="utf-8") as f: - f.write("1,2,3\n4,5,6") - - with open(path, "rb") as f: - result = parser.read_csv(f, header=None) - tm.assert_frame_equal(result, expected) - - -def test_unix_style_breaks(c_parser_only): - # GH 11020 - parser = c_parser_only - with tm.ensure_clean() as path: - with open(path, "w", newline="\n", encoding="utf-8") as f: - f.write("blah\n\ncol_1,col_2,col_3\n\n") - result = parser.read_csv(path, skiprows=2, encoding="utf-8", engine="c") - expected = DataFrame(columns=["col_1", "col_2", "col_3"]) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("float_precision", [None, "legacy", "high", "round_trip"]) -@pytest.mark.parametrize( - "data,thousands,decimal", - [ - ( - """A|B|C -1|2,334.01|5 -10|13|10. 
-""", - ",", - ".", - ), - ( - """A|B|C -1|2.334,01|5 -10|13|10, -""", - ".", - ",", - ), - ], -) -def test_1000_sep_with_decimal( - c_parser_only, data, thousands, decimal, float_precision -): - parser = c_parser_only - expected = DataFrame({"A": [1, 10], "B": [2334.01, 13], "C": [5, 10.0]}) - - result = parser.read_csv( - StringIO(data), - sep="|", - thousands=thousands, - decimal=decimal, - float_precision=float_precision, - ) - tm.assert_frame_equal(result, expected) - - -def test_float_precision_options(c_parser_only): - # GH 17154, 36228 - parser = c_parser_only - s = "foo\n243.164\n" - df = parser.read_csv(StringIO(s)) - df2 = parser.read_csv(StringIO(s), float_precision="high") - - tm.assert_frame_equal(df, df2) - - df3 = parser.read_csv(StringIO(s), float_precision="legacy") - - assert not df.iloc[0, 0] == df3.iloc[0, 0] - - msg = "Unrecognized float_precision option: junk" - - with pytest.raises(ValueError, match=msg): - parser.read_csv(StringIO(s), float_precision="junk") diff --git a/pandas/tests/io/parser/test_comment.py b/pandas/tests/io/parser/test_comment.py deleted file mode 100644 index 9a14e67c154b6..0000000000000 --- a/pandas/tests/io/parser/test_comment.py +++ /dev/null @@ -1,168 +0,0 @@ -""" -Tests that comments are properly handled during parsing -for all of the parsers defined in parsers.py -""" -from io import StringIO - -import numpy as np -import pytest - -from pandas import DataFrame -import pandas._testing as tm - -pytestmark = pytest.mark.usefixtures("pyarrow_skip") - - -@pytest.mark.parametrize("na_values", [None, ["NaN"]]) -def test_comment(all_parsers, na_values): - parser = all_parsers - data = """A,B,C -1,2.,4.#hello world -5.,NaN,10.0 -""" - expected = DataFrame( - [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"] - ) - result = parser.read_csv(StringIO(data), comment="#", na_values=na_values) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - "read_kwargs", [{}, {"lineterminator": "*"}, {"delim_whitespace": True}] -) -def test_line_comment(all_parsers, read_kwargs, request): - parser = all_parsers - data = """# empty -A,B,C -1,2.,4.#hello world -#ignore this line -5.,NaN,10.0 -""" - if read_kwargs.get("delim_whitespace"): - data = data.replace(",", " ") - elif read_kwargs.get("lineterminator"): - if parser.engine != "c": - mark = pytest.mark.xfail( - reason="Custom terminator not supported with Python engine" - ) - request.node.add_marker(mark) - - data = data.replace("\n", read_kwargs.get("lineterminator")) - - read_kwargs["comment"] = "#" - result = parser.read_csv(StringIO(data), **read_kwargs) - - expected = DataFrame( - [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"] - ) - tm.assert_frame_equal(result, expected) - - -def test_comment_skiprows(all_parsers): - parser = all_parsers - data = """# empty -random line -# second empty line -1,2,3 -A,B,C -1,2.,4. -5.,NaN,10.0 -""" - # This should ignore the first four lines (including comments). - expected = DataFrame( - [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"] - ) - result = parser.read_csv(StringIO(data), comment="#", skiprows=4) - tm.assert_frame_equal(result, expected) - - -def test_comment_header(all_parsers): - parser = all_parsers - data = """# empty -# second empty line -1,2,3 -A,B,C -1,2.,4. -5.,NaN,10.0 -""" - # Header should begin at the second non-comment line. 
- expected = DataFrame( - [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"] - ) - result = parser.read_csv(StringIO(data), comment="#", header=1) - tm.assert_frame_equal(result, expected) - - -def test_comment_skiprows_header(all_parsers): - parser = all_parsers - data = """# empty -# second empty line -# third empty line -X,Y,Z -1,2,3 -A,B,C -1,2.,4. -5.,NaN,10.0 -""" - # Skiprows should skip the first 4 lines (including comments), - # while header should start from the second non-commented line, - # starting with line 5. - expected = DataFrame( - [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"] - ) - result = parser.read_csv(StringIO(data), comment="#", skiprows=4, header=1) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("comment_char", ["#", "~", "&", "^", "*", "@"]) -def test_custom_comment_char(all_parsers, comment_char): - parser = all_parsers - data = "a,b,c\n1,2,3#ignore this!\n4,5,6#ignorethistoo" - result = parser.read_csv( - StringIO(data.replace("#", comment_char)), comment=comment_char - ) - - expected = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["a", "b", "c"]) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("header", ["infer", None]) -def test_comment_first_line(all_parsers, header): - # see gh-4623 - parser = all_parsers - data = "# notes\na,b,c\n# more notes\n1,2,3" - - if header is None: - expected = DataFrame({0: ["a", "1"], 1: ["b", "2"], 2: ["c", "3"]}) - else: - expected = DataFrame([[1, 2, 3]], columns=["a", "b", "c"]) - - result = parser.read_csv(StringIO(data), comment="#", header=header) - tm.assert_frame_equal(result, expected) - - -def test_comment_char_in_default_value(all_parsers, request): - # GH#34002 - if all_parsers.engine == "c": - reason = "see gh-34002: works on the python engine but not the c engine" - # NA value containing comment char is interpreted as comment - request.node.add_marker(pytest.mark.xfail(reason=reason, raises=AssertionError)) - parser = all_parsers - - data = ( - "# this is a comment\n" - "col1,col2,col3,col4\n" - "1,2,3,4#inline comment\n" - "4,5#,6,10\n" - "7,8,#N/A,11\n" - ) - result = parser.read_csv(StringIO(data), comment="#", na_values="#N/A") - expected = DataFrame( - { - "col1": [1, 4, 7], - "col2": [2, 5, 8], - "col3": [3.0, np.nan, np.nan], - "col4": [4.0, np.nan, 11.0], - } - ) - tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_compression.py b/pandas/tests/io/parser/test_compression.py deleted file mode 100644 index bcba9c4a1823d..0000000000000 --- a/pandas/tests/io/parser/test_compression.py +++ /dev/null @@ -1,213 +0,0 @@ -""" -Tests compressed data parsing functionality for all -of the parsers defined in parsers.py -""" - -import os -from pathlib import Path -import tarfile -import zipfile - -import pytest - -from pandas import DataFrame -import pandas._testing as tm -from pandas.tests.io.test_compression import _compression_to_extension - -skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") - - -@pytest.fixture(params=[True, False]) -def buffer(request): - return request.param - - -@pytest.fixture -def parser_and_data(all_parsers, csv1): - parser = all_parsers - - with open(csv1, "rb") as f: - data = f.read() - expected = parser.read_csv(csv1) - - return parser, data, expected - - -@skip_pyarrow -@pytest.mark.parametrize("compression", ["zip", "infer", "zip2"]) -def test_zip(parser_and_data, compression): - parser, data, expected = parser_and_data - - with tm.ensure_clean("test_file.zip") as path: - with 
zipfile.ZipFile(path, mode="w") as tmp: - tmp.writestr("test_file", data) - - if compression == "zip2": - with open(path, "rb") as f: - result = parser.read_csv(f, compression="zip") - else: - result = parser.read_csv(path, compression=compression) - - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -@pytest.mark.parametrize("compression", ["zip", "infer"]) -def test_zip_error_multiple_files(parser_and_data, compression): - parser, data, expected = parser_and_data - - with tm.ensure_clean("combined_zip.zip") as path: - inner_file_names = ["test_file", "second_file"] - - with zipfile.ZipFile(path, mode="w") as tmp: - for file_name in inner_file_names: - tmp.writestr(file_name, data) - - with pytest.raises(ValueError, match="Multiple files"): - parser.read_csv(path, compression=compression) - - -@skip_pyarrow -def test_zip_error_no_files(parser_and_data): - parser, _, _ = parser_and_data - - with tm.ensure_clean() as path: - with zipfile.ZipFile(path, mode="w"): - pass - - with pytest.raises(ValueError, match="Zero files"): - parser.read_csv(path, compression="zip") - - -@skip_pyarrow -def test_zip_error_invalid_zip(parser_and_data): - parser, _, _ = parser_and_data - - with tm.ensure_clean() as path: - with open(path, "rb") as f: - with pytest.raises(zipfile.BadZipFile, match="File is not a zip file"): - parser.read_csv(f, compression="zip") - - -@skip_pyarrow -@pytest.mark.parametrize("filename", [None, "test.{ext}"]) -def test_compression(request, parser_and_data, compression_only, buffer, filename): - parser, data, expected = parser_and_data - compress_type = compression_only - - ext = _compression_to_extension[compress_type] - filename = filename if filename is None else filename.format(ext=ext) - - if filename and buffer: - request.node.add_marker( - pytest.mark.xfail( - reason="Cannot deduce compression from buffer of compressed data." - ) - ) - - with tm.ensure_clean(filename=filename) as path: - tm.write_to_compressed(compress_type, path, data) - compression = "infer" if filename else compress_type - - if buffer: - with open(path, "rb") as f: - result = parser.read_csv(f, compression=compression) - else: - result = parser.read_csv(path, compression=compression) - - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -@pytest.mark.parametrize("ext", [None, "gz", "bz2"]) -def test_infer_compression(all_parsers, csv1, buffer, ext): - # see gh-9770 - parser = all_parsers - kwargs = {"index_col": 0, "parse_dates": True} - - expected = parser.read_csv(csv1, **kwargs) - kwargs["compression"] = "infer" - - if buffer: - with open(csv1, encoding="utf-8") as f: - result = parser.read_csv(f, **kwargs) - else: - ext = "." 
+ ext if ext else "" - result = parser.read_csv(csv1 + ext, **kwargs) - - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -def test_compression_utf_encoding(all_parsers, csv_dir_path, utf_value, encoding_fmt): - # see gh-18071, gh-24130 - parser = all_parsers - encoding = encoding_fmt.format(utf_value) - path = os.path.join(csv_dir_path, f"utf{utf_value}_ex_small.zip") - - result = parser.read_csv(path, encoding=encoding, compression="zip", sep="\t") - expected = DataFrame( - { - "Country": ["Venezuela", "Venezuela"], - "Twitter": ["Hugo Chávez Frías", "Henrique Capriles R."], - } - ) - - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -@pytest.mark.parametrize("invalid_compression", ["sfark", "bz3", "zipper"]) -def test_invalid_compression(all_parsers, invalid_compression): - parser = all_parsers - compress_kwargs = {"compression": invalid_compression} - - msg = f"Unrecognized compression type: {invalid_compression}" - - with pytest.raises(ValueError, match=msg): - parser.read_csv("test_file.zip", **compress_kwargs) - - -@skip_pyarrow -def test_compression_tar_archive(all_parsers, csv_dir_path): - parser = all_parsers - path = os.path.join(csv_dir_path, "tar_csv.tar.gz") - df = parser.read_csv(path) - assert list(df.columns) == ["a"] - - -def test_ignore_compression_extension(all_parsers): - parser = all_parsers - df = DataFrame({"a": [0, 1]}) - with tm.ensure_clean("test.csv") as path_csv: - with tm.ensure_clean("test.csv.zip") as path_zip: - # make sure to create un-compressed file with zip extension - df.to_csv(path_csv, index=False) - Path(path_zip).write_text( - Path(path_csv).read_text(encoding="utf-8"), encoding="utf-8" - ) - - tm.assert_frame_equal(parser.read_csv(path_zip, compression=None), df) - - -@skip_pyarrow -def test_writes_tar_gz(all_parsers): - parser = all_parsers - data = DataFrame( - { - "Country": ["Venezuela", "Venezuela"], - "Twitter": ["Hugo Chávez Frías", "Henrique Capriles R."], - } - ) - with tm.ensure_clean("test.tar.gz") as tar_path: - data.to_csv(tar_path, index=False) - - # test that read_csv infers .tar.gz to gzip: - tm.assert_frame_equal(parser.read_csv(tar_path), data) - - # test that file is indeed gzipped: - with tarfile.open(tar_path, "r:gz") as tar: - result = parser.read_csv( - tar.extractfile(tar.getnames()[0]), compression="infer" - ) - tm.assert_frame_equal(result, data) diff --git a/pandas/tests/io/parser/test_concatenate_chunks.py b/pandas/tests/io/parser/test_concatenate_chunks.py deleted file mode 100644 index 1bae2317a2fc6..0000000000000 --- a/pandas/tests/io/parser/test_concatenate_chunks.py +++ /dev/null @@ -1,36 +0,0 @@ -import numpy as np -import pytest - -from pandas.errors import DtypeWarning - -import pandas._testing as tm -from pandas.core.arrays import ArrowExtensionArray - -from pandas.io.parsers.c_parser_wrapper import _concatenate_chunks - - -def test_concatenate_chunks_pyarrow(): - # GH#51876 - pa = pytest.importorskip("pyarrow") - chunks = [ - {0: ArrowExtensionArray(pa.array([1.5, 2.5]))}, - {0: ArrowExtensionArray(pa.array([1, 2]))}, - ] - result = _concatenate_chunks(chunks) - expected = ArrowExtensionArray(pa.array([1.5, 2.5, 1.0, 2.0])) - tm.assert_extension_array_equal(result[0], expected) - - -def test_concatenate_chunks_pyarrow_strings(): - # GH#51876 - pa = pytest.importorskip("pyarrow") - chunks = [ - {0: ArrowExtensionArray(pa.array([1.5, 2.5]))}, - {0: ArrowExtensionArray(pa.array(["a", "b"]))}, - ] - with tm.assert_produces_warning(DtypeWarning, match="have mixed types"): - result = 
_concatenate_chunks(chunks)
-    expected = np.concatenate(
-        [np.array([1.5, 2.5], dtype=object), np.array(["a", "b"])]
-    )
-    tm.assert_numpy_array_equal(result[0], expected)
diff --git a/pandas/tests/io/parser/test_converters.py b/pandas/tests/io/parser/test_converters.py
deleted file mode 100644
index 85f3db0398080..0000000000000
--- a/pandas/tests/io/parser/test_converters.py
+++ /dev/null
@@ -1,203 +0,0 @@
-"""
-Tests column conversion functionality during parsing
-for all of the parsers defined in parsers.py
-"""
-from io import StringIO
-
-from dateutil.parser import parse
-import numpy as np
-import pytest
-
-import pandas as pd
-from pandas import (
-    DataFrame,
-    Index,
-)
-import pandas._testing as tm
-
-pytestmark = pytest.mark.usefixtures("pyarrow_skip")
-
-
-def test_converters_type_must_be_dict(all_parsers):
-    parser = all_parsers
-    data = """index,A,B,C,D
-foo,2,3,4,5
-"""
-
-    with pytest.raises(TypeError, match="Type converters.+"):
-        parser.read_csv(StringIO(data), converters=0)
-
-
-@pytest.mark.parametrize("column", [3, "D"])
-@pytest.mark.parametrize(
-    "converter", [parse, lambda x: int(x.split("/")[2])]  # Produce integer.
-)
-def test_converters(all_parsers, column, converter):
-    parser = all_parsers
-    data = """A,B,C,D
-a,1,2,01/01/2009
-b,3,4,01/02/2009
-c,4,5,01/03/2009
-"""
-    result = parser.read_csv(StringIO(data), converters={column: converter})
-
-    expected = parser.read_csv(StringIO(data))
-    expected["D"] = expected["D"].map(converter)
-
-    tm.assert_frame_equal(result, expected)
-
-
-def test_converters_no_implicit_conv(all_parsers):
-    # see gh-2184
-    parser = all_parsers
-    data = """000102,1.2,A\n001245,2,B"""
-
-    converters = {0: lambda x: x.strip()}
-    result = parser.read_csv(StringIO(data), header=None, converters=converters)
-
-    # Column 0 should not be cast to numeric and should remain as object.
-    expected = DataFrame([["000102", 1.2, "A"], ["001245", 2, "B"]])
-    tm.assert_frame_equal(result, expected)
-
-
-def test_converters_euro_decimal_format(all_parsers):
-    # see gh-583
-    converters = {}
-    parser = all_parsers
-
-    data = """Id;Number1;Number2;Text1;Text2;Number3
-1;1521,1541;187101,9543;ABC;poi;4,7387
-2;121,12;14897,76;DEF;uyt;0,3773
-3;878,158;108013,434;GHI;rez;2,7356"""
-    converters["Number1"] = converters["Number2"] = converters[
-        "Number3"
-    ] = lambda x: float(x.replace(",", "."))
-
-    result = parser.read_csv(StringIO(data), sep=";", converters=converters)
-    expected = DataFrame(
-        [
-            [1, 1521.1541, 187101.9543, "ABC", "poi", 4.7387],
-            [2, 121.12, 14897.76, "DEF", "uyt", 0.3773],
-            [3, 878.158, 108013.434, "GHI", "rez", 2.7356],
-        ],
-        columns=["Id", "Number1", "Number2", "Text1", "Text2", "Number3"],
-    )
-    tm.assert_frame_equal(result, expected)
-
-
-def test_converters_corner_with_nans(all_parsers):
-    parser = all_parsers
-    data = """id,score,days
-1,2,12
-2,2-5,
-3,,14+
-4,6-12,2"""
-
-    # Example converters.
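-    # A sketch of the intended conversions (values from the data above):
-    #   convert_days("12")   -> 12
-    #   convert_days("14+")  -> 15      # trailing "+" bumps the value by one
-    #   convert_days("")     -> np.nan
-    #   convert_score("2-5") -> 3.5     # midpoint of the range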
- def convert_days(x): - x = x.strip() - - if not x: - return np.nan - - is_plus = x.endswith("+") - - if is_plus: - x = int(x[:-1]) + 1 - else: - x = int(x) - - return x - - def convert_days_sentinel(x): - x = x.strip() - - if not x: - return np.nan - - is_plus = x.endswith("+") - - if is_plus: - x = int(x[:-1]) + 1 - else: - x = int(x) - - return x - - def convert_score(x): - x = x.strip() - - if not x: - return np.nan - - if x.find("-") > 0: - val_min, val_max = map(int, x.split("-")) - val = 0.5 * (val_min + val_max) - else: - val = float(x) - - return val - - results = [] - - for day_converter in [convert_days, convert_days_sentinel]: - result = parser.read_csv( - StringIO(data), - converters={"score": convert_score, "days": day_converter}, - na_values=["", None], - ) - assert pd.isna(result["days"][1]) - results.append(result) - - tm.assert_frame_equal(results[0], results[1]) - - -@pytest.mark.parametrize("conv_f", [lambda x: x, str]) -def test_converter_index_col_bug(all_parsers, conv_f): - # see gh-1835 , GH#40589 - parser = all_parsers - data = "A;B\n1;2\n3;4" - - rs = parser.read_csv( - StringIO(data), sep=";", index_col="A", converters={"A": conv_f} - ) - - xp = DataFrame({"B": [2, 4]}, index=Index(["1", "3"], name="A", dtype="object")) - tm.assert_frame_equal(rs, xp) - - -def test_converter_identity_object(all_parsers): - # GH#40589 - parser = all_parsers - data = "A,B\n1,2\n3,4" - - rs = parser.read_csv(StringIO(data), converters={"A": lambda x: x}) - - xp = DataFrame({"A": ["1", "3"], "B": [2, 4]}) - tm.assert_frame_equal(rs, xp) - - -def test_converter_multi_index(all_parsers): - # GH 42446 - parser = all_parsers - data = "A,B,B\nX,Y,Z\n1,2,3" - - result = parser.read_csv( - StringIO(data), - header=list(range(2)), - converters={ - ("A", "X"): np.int32, - ("B", "Y"): np.int32, - ("B", "Z"): np.float32, - }, - ) - - expected = DataFrame( - { - ("A", "X"): np.int32([1]), - ("B", "Y"): np.int32([2]), - ("B", "Z"): np.float32([3]), - } - ) - - tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_dialect.py b/pandas/tests/io/parser/test_dialect.py deleted file mode 100644 index 7d2bb6c083cda..0000000000000 --- a/pandas/tests/io/parser/test_dialect.py +++ /dev/null @@ -1,156 +0,0 @@ -""" -Tests that dialects are properly handled during parsing -for all of the parsers defined in parsers.py -""" - -import csv -from io import StringIO - -import pytest - -from pandas.errors import ParserWarning - -from pandas import DataFrame -import pandas._testing as tm - -pytestmark = pytest.mark.usefixtures("pyarrow_skip") - - -@pytest.fixture -def custom_dialect(): - dialect_name = "weird" - dialect_kwargs = { - "doublequote": False, - "escapechar": "~", - "delimiter": ":", - "skipinitialspace": False, - "quotechar": "~", - "quoting": 3, - } - return dialect_name, dialect_kwargs - - -def test_dialect(all_parsers): - parser = all_parsers - data = """\ -label1,label2,label3 -index1,"a,c,e -index2,b,d,f -""" - - dia = csv.excel() - dia.quoting = csv.QUOTE_NONE - df = parser.read_csv(StringIO(data), dialect=dia) - - data = """\ -label1,label2,label3 -index1,a,c,e -index2,b,d,f -""" - exp = parser.read_csv(StringIO(data)) - exp.replace("a", '"a', inplace=True) - tm.assert_frame_equal(df, exp) - - -def test_dialect_str(all_parsers): - dialect_name = "mydialect" - parser = all_parsers - data = """\ -fruit:vegetable -apple:broccoli -pear:tomato -""" - exp = DataFrame({"fruit": ["apple", "pear"], "vegetable": ["broccoli", "tomato"]}) - - with tm.with_csv_dialect(dialect_name, 
delimiter=":"): - df = parser.read_csv(StringIO(data), dialect=dialect_name) - tm.assert_frame_equal(df, exp) - - -def test_invalid_dialect(all_parsers): - class InvalidDialect: - pass - - data = "a\n1" - parser = all_parsers - msg = "Invalid dialect" - - with pytest.raises(ValueError, match=msg): - parser.read_csv(StringIO(data), dialect=InvalidDialect) - - -@pytest.mark.parametrize( - "arg", - [None, "doublequote", "escapechar", "skipinitialspace", "quotechar", "quoting"], -) -@pytest.mark.parametrize("value", ["dialect", "default", "other"]) -def test_dialect_conflict_except_delimiter(all_parsers, custom_dialect, arg, value): - # see gh-23761. - dialect_name, dialect_kwargs = custom_dialect - parser = all_parsers - - expected = DataFrame({"a": [1], "b": [2]}) - data = "a:b\n1:2" - - warning_klass = None - kwds = {} - - # arg=None tests when we pass in the dialect without any other arguments. - if arg is not None: - if value == "dialect": # No conflict --> no warning. - kwds[arg] = dialect_kwargs[arg] - elif value == "default": # Default --> no warning. - from pandas.io.parsers.base_parser import parser_defaults - - kwds[arg] = parser_defaults[arg] - else: # Non-default + conflict with dialect --> warning. - warning_klass = ParserWarning - kwds[arg] = "blah" - - with tm.with_csv_dialect(dialect_name, **dialect_kwargs): - result = parser.read_csv_check_warnings( - warning_klass, - "Conflicting values for", - StringIO(data), - dialect=dialect_name, - **kwds, - ) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - "kwargs,warning_klass", - [ - ({"sep": ","}, None), # sep is default --> sep_override=True - ({"sep": "."}, ParserWarning), # sep isn't default --> sep_override=False - ({"delimiter": ":"}, None), # No conflict - ({"delimiter": None}, None), # Default arguments --> sep_override=True - ({"delimiter": ","}, ParserWarning), # Conflict - ({"delimiter": "."}, ParserWarning), # Conflict - ], - ids=[ - "sep-override-true", - "sep-override-false", - "delimiter-no-conflict", - "delimiter-default-arg", - "delimiter-conflict", - "delimiter-conflict2", - ], -) -def test_dialect_conflict_delimiter(all_parsers, custom_dialect, kwargs, warning_klass): - # see gh-23761. 
- dialect_name, dialect_kwargs = custom_dialect - parser = all_parsers - - expected = DataFrame({"a": [1], "b": [2]}) - data = "a:b\n1:2" - - with tm.with_csv_dialect(dialect_name, **dialect_kwargs): - result = parser.read_csv_check_warnings( - warning_klass, - "Conflicting values for 'delimiter'", - StringIO(data), - dialect=dialect_name, - **kwargs, - ) - tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_encoding.py b/pandas/tests/io/parser/test_encoding.py deleted file mode 100644 index 31c7994f39058..0000000000000 --- a/pandas/tests/io/parser/test_encoding.py +++ /dev/null @@ -1,318 +0,0 @@ -""" -Tests encoding functionality during parsing -for all of the parsers defined in parsers.py -""" -from io import ( - BytesIO, - TextIOWrapper, -) -import os -import tempfile -import uuid - -import numpy as np -import pytest - -from pandas import ( - DataFrame, - read_csv, -) -import pandas._testing as tm - -skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") -xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") - - -def test_bytes_io_input(all_parsers): - encoding = "cp1255" - parser = all_parsers - - data = BytesIO("שלום:1234\n562:123".encode(encoding)) - result = parser.read_csv(data, sep=":", encoding=encoding) - - expected = DataFrame([[562, 123]], columns=["שלום", "1234"]) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -def test_read_csv_unicode(all_parsers): - parser = all_parsers - data = BytesIO("\u0141aski, Jan;1".encode()) - - result = parser.read_csv(data, sep=";", encoding="utf-8", header=None) - expected = DataFrame([["\u0141aski, Jan", 1]]) - tm.assert_frame_equal(result, expected) - - -@xfail_pyarrow -@pytest.mark.parametrize("sep", [",", "\t"]) -@pytest.mark.parametrize("encoding", ["utf-16", "utf-16le", "utf-16be"]) -def test_utf16_bom_skiprows(all_parsers, sep, encoding): - # see gh-2298 - parser = all_parsers - data = """skip this -skip this too -A,B,C -1,2,3 -4,5,6""".replace( - ",", sep - ) - path = f"__{uuid.uuid4()}__.csv" - kwargs = {"sep": sep, "skiprows": 2} - utf8 = "utf-8" - - with tm.ensure_clean(path) as path: - bytes_data = data.encode(encoding) - - with open(path, "wb") as f: - f.write(bytes_data) - - with TextIOWrapper(BytesIO(data.encode(utf8)), encoding=utf8) as bytes_buffer: - result = parser.read_csv(path, encoding=encoding, **kwargs) - expected = parser.read_csv(bytes_buffer, encoding=utf8, **kwargs) - tm.assert_frame_equal(result, expected) - - -def test_utf16_example(all_parsers, csv_dir_path): - path = os.path.join(csv_dir_path, "utf16_ex.txt") - parser = all_parsers - result = parser.read_csv(path, encoding="utf-16", sep="\t") - assert len(result) == 50 - - -def test_unicode_encoding(all_parsers, csv_dir_path): - path = os.path.join(csv_dir_path, "unicode_series.csv") - parser = all_parsers - - result = parser.read_csv(path, header=None, encoding="latin-1") - result = result.set_index(0) - got = result[1][1632] - - expected = "\xc1 k\xf6ldum klaka (Cold Fever) (1994)" - assert got == expected - - -@pytest.mark.parametrize( - "data,kwargs,expected", - [ - # Basic test - ("a\n1", {}, DataFrame({"a": [1]})), - # "Regular" quoting - ('"a"\n1', {"quotechar": '"'}, DataFrame({"a": [1]})), - # Test in a data row instead of header - ("b\n1", {"names": ["a"]}, DataFrame({"a": ["b", "1"]})), - # Test in empty data row with skipping - ("\n1", {"names": ["a"], "skip_blank_lines": True}, DataFrame({"a": [1]})), - # Test in empty data row without skipping - ( - "\n1", - {"names": ["a"], "skip_blank_lines": False}, - 
DataFrame({"a": [np.nan, 1]}), - ), - ], -) -def test_utf8_bom(all_parsers, data, kwargs, expected, request): - # see gh-4793 - parser = all_parsers - bom = "\ufeff" - utf8 = "utf-8" - - def _encode_data_with_bom(_data): - bom_data = (bom + _data).encode(utf8) - return BytesIO(bom_data) - - if ( - parser.engine == "pyarrow" - and data == "\n1" - and kwargs.get("skip_blank_lines", True) - ): - # Manually xfail, since we don't have mechanism to xfail specific version - request.node.add_marker( - pytest.mark.xfail(reason="Pyarrow can't read blank lines") - ) - - result = parser.read_csv(_encode_data_with_bom(data), encoding=utf8, **kwargs) - tm.assert_frame_equal(result, expected) - - -def test_read_csv_utf_aliases(all_parsers, utf_value, encoding_fmt): - # see gh-13549 - expected = DataFrame({"mb_num": [4.8], "multibyte": ["test"]}) - parser = all_parsers - - encoding = encoding_fmt.format(utf_value) - data = "mb_num,multibyte\n4.8,test".encode(encoding) - - result = parser.read_csv(BytesIO(data), encoding=encoding) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - "file_path,encoding", - [ - (("io", "data", "csv", "test1.csv"), "utf-8"), - (("io", "parser", "data", "unicode_series.csv"), "latin-1"), - (("io", "parser", "data", "sauron.SHIFT_JIS.csv"), "shiftjis"), - ], -) -def test_binary_mode_file_buffers(all_parsers, file_path, encoding, datapath): - # gh-23779: Python csv engine shouldn't error on files opened in binary. - # gh-31575: Python csv engine shouldn't error on files opened in raw binary. - parser = all_parsers - - fpath = datapath(*file_path) - expected = parser.read_csv(fpath, encoding=encoding) - - with open(fpath, encoding=encoding) as fa: - result = parser.read_csv(fa) - assert not fa.closed - tm.assert_frame_equal(expected, result) - - with open(fpath, mode="rb") as fb: - result = parser.read_csv(fb, encoding=encoding) - assert not fb.closed - tm.assert_frame_equal(expected, result) - - with open(fpath, mode="rb", buffering=0) as fb: - result = parser.read_csv(fb, encoding=encoding) - assert not fb.closed - tm.assert_frame_equal(expected, result) - - -@skip_pyarrow -@pytest.mark.parametrize("pass_encoding", [True, False]) -def test_encoding_temp_file(all_parsers, utf_value, encoding_fmt, pass_encoding): - # see gh-24130 - parser = all_parsers - encoding = encoding_fmt.format(utf_value) - - expected = DataFrame({"foo": ["bar"]}) - - with tm.ensure_clean(mode="w+", encoding=encoding, return_filelike=True) as f: - f.write("foo\nbar") - f.seek(0) - - result = parser.read_csv(f, encoding=encoding if pass_encoding else None) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -def test_encoding_named_temp_file(all_parsers): - # see gh-31819 - parser = all_parsers - encoding = "shift-jis" - - title = "てすと" - data = "こむ" - - expected = DataFrame({title: [data]}) - - with tempfile.NamedTemporaryFile() as f: - f.write(f"{title}\n{data}".encode(encoding)) - - f.seek(0) - - result = parser.read_csv(f, encoding=encoding) - tm.assert_frame_equal(result, expected) - assert not f.closed - - -@pytest.mark.parametrize( - "encoding", ["utf-8", "utf-16", "utf-16-be", "utf-16-le", "utf-32"] -) -def test_parse_encoded_special_characters(encoding): - # GH16218 Verify parsing of data with encoded special characters - # Data contains a Unicode 'FULLWIDTH COLON' (U+FF1A) at position (0,"a") - data = "a\tb\n:foo\t0\nbar\t1\nbaz\t2" - encoded_data = BytesIO(data.encode(encoding)) - result = read_csv(encoded_data, delimiter="\t", encoding=encoding) - - expected = 
DataFrame( - data=[[":foo", 0], ["bar", 1], ["baz", 2]], - columns=["a", "b"], - ) - tm.assert_frame_equal(result, expected) - - -@xfail_pyarrow -@pytest.mark.parametrize("encoding", ["utf-8", None, "utf-16", "cp1255", "latin-1"]) -def test_encoding_memory_map(all_parsers, encoding): - # GH40986 - parser = all_parsers - expected = DataFrame( - { - "name": ["Raphael", "Donatello", "Miguel Angel", "Leonardo"], - "mask": ["red", "purple", "orange", "blue"], - "weapon": ["sai", "bo staff", "nunchunk", "katana"], - } - ) - with tm.ensure_clean() as file: - expected.to_csv(file, index=False, encoding=encoding) - df = parser.read_csv(file, encoding=encoding, memory_map=True) - tm.assert_frame_equal(df, expected) - - -@xfail_pyarrow -def test_chunk_splits_multibyte_char(all_parsers): - """ - Chunk splits a multibyte character with memory_map=True - - GH 43540 - """ - parser = all_parsers - # DEFAULT_CHUNKSIZE = 262144, defined in parsers.pyx - df = DataFrame(data=["a" * 127] * 2048) - - # Put two-bytes utf-8 encoded character "ą" at the end of chunk - # utf-8 encoding of "ą" is b'\xc4\x85' - df.iloc[2047] = "a" * 127 + "ą" - with tm.ensure_clean("bug-gh43540.csv") as fname: - df.to_csv(fname, index=False, header=False, encoding="utf-8") - dfr = parser.read_csv(fname, header=None, memory_map=True, engine="c") - tm.assert_frame_equal(dfr, df) - - -@xfail_pyarrow -def test_readcsv_memmap_utf8(all_parsers): - """ - GH 43787 - - Test correct handling of UTF-8 chars when memory_map=True and encoding is UTF-8 - """ - lines = [] - line_length = 128 - start_char = " " - end_char = "\U00010080" - # This for loop creates a list of 128-char strings - # consisting of consecutive Unicode chars - for lnum in range(ord(start_char), ord(end_char), line_length): - line = "".join([chr(c) for c in range(lnum, lnum + 0x80)]) + "\n" - try: - line.encode("utf-8") - except UnicodeEncodeError: - continue - lines.append(line) - parser = all_parsers - df = DataFrame(lines) - with tm.ensure_clean("utf8test.csv") as fname: - df.to_csv(fname, index=False, header=False, encoding="utf-8") - dfr = parser.read_csv( - fname, header=None, memory_map=True, engine="c", encoding="utf-8" - ) - tm.assert_frame_equal(df, dfr) - - -@pytest.mark.usefixtures("pyarrow_xfail") -@pytest.mark.parametrize("mode", ["w+b", "w+t"]) -def test_not_readable(all_parsers, mode): - # GH43439 - parser = all_parsers - content = b"abcd" - if "t" in mode: - content = "abcd" - with tempfile.SpooledTemporaryFile(mode=mode) as handle: - handle.write(content) - handle.seek(0) - df = parser.read_csv(handle) - expected = DataFrame([], columns=["abcd"]) - tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/io/parser/test_header.py b/pandas/tests/io/parser/test_header.py deleted file mode 100644 index 5cb54bb4e2916..0000000000000 --- a/pandas/tests/io/parser/test_header.py +++ /dev/null @@ -1,660 +0,0 @@ -""" -Tests that the file header is properly handled or inferred -during parsing for all of the parsers defined in parsers.py -""" - -from collections import namedtuple -from io import StringIO - -import numpy as np -import pytest - -from pandas.errors import ParserError - -from pandas import ( - DataFrame, - Index, - MultiIndex, -) -import pandas._testing as tm - -# TODO(1.4): Change me to xfails at release time -skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") - - -@skip_pyarrow -def test_read_with_bad_header(all_parsers): - parser = all_parsers - msg = r"but only \d+ lines in file" - - with pytest.raises(ValueError, match=msg): - s = StringIO(",,") 
- parser.read_csv(s, header=[10]) - - -def test_negative_header(all_parsers): - # see gh-27779 - parser = all_parsers - data = """1,2,3,4,5 -6,7,8,9,10 -11,12,13,14,15 -""" - with pytest.raises( - ValueError, - match="Passing negative integer to header is invalid. " - "For no header, use header=None instead", - ): - parser.read_csv(StringIO(data), header=-1) - - -@pytest.mark.parametrize("header", [([-1, 2, 4]), ([-5, 0])]) -def test_negative_multi_index_header(all_parsers, header): - # see gh-27779 - parser = all_parsers - data = """1,2,3,4,5 - 6,7,8,9,10 - 11,12,13,14,15 - """ - with pytest.raises( - ValueError, match="cannot specify multi-index header with negative integers" - ): - parser.read_csv(StringIO(data), header=header) - - -@pytest.mark.parametrize("header", [True, False]) -def test_bool_header_arg(all_parsers, header): - # see gh-6114 - parser = all_parsers - data = """\ -MyColumn -a -b -a -b""" - msg = "Passing a bool to header is invalid" - with pytest.raises(TypeError, match=msg): - parser.read_csv(StringIO(data), header=header) - - -@skip_pyarrow -def test_header_with_index_col(all_parsers): - parser = all_parsers - data = """foo,1,2,3 -bar,4,5,6 -baz,7,8,9 -""" - names = ["A", "B", "C"] - result = parser.read_csv(StringIO(data), names=names) - - expected = DataFrame( - [[1, 2, 3], [4, 5, 6], [7, 8, 9]], - index=["foo", "bar", "baz"], - columns=["A", "B", "C"], - ) - tm.assert_frame_equal(result, expected) - - -def test_header_not_first_line(all_parsers): - parser = all_parsers - data = """got,to,ignore,this,line -got,to,ignore,this,line -index,A,B,C,D -foo,2,3,4,5 -bar,7,8,9,10 -baz,12,13,14,15 -""" - data2 = """index,A,B,C,D -foo,2,3,4,5 -bar,7,8,9,10 -baz,12,13,14,15 -""" - - result = parser.read_csv(StringIO(data), header=2, index_col=0) - expected = parser.read_csv(StringIO(data2), header=0, index_col=0) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -def test_header_multi_index(all_parsers): - parser = all_parsers - expected = tm.makeCustomDataframe(5, 3, r_idx_nlevels=2, c_idx_nlevels=4) - - data = """\ -C0,,C_l0_g0,C_l0_g1,C_l0_g2 - -C1,,C_l1_g0,C_l1_g1,C_l1_g2 -C2,,C_l2_g0,C_l2_g1,C_l2_g2 -C3,,C_l3_g0,C_l3_g1,C_l3_g2 -R0,R1,,, -R_l0_g0,R_l1_g0,R0C0,R0C1,R0C2 -R_l0_g1,R_l1_g1,R1C0,R1C1,R1C2 -R_l0_g2,R_l1_g2,R2C0,R2C1,R2C2 -R_l0_g3,R_l1_g3,R3C0,R3C1,R3C2 -R_l0_g4,R_l1_g4,R4C0,R4C1,R4C2 -""" - result = parser.read_csv(StringIO(data), header=[0, 1, 2, 3], index_col=[0, 1]) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - "kwargs,msg", - [ - ( - {"index_col": ["foo", "bar"]}, - ( - "index_col must only contain " - "row numbers when specifying " - "a multi-index header" - ), - ), - ( - {"index_col": [0, 1], "names": ["foo", "bar"]}, - ("cannot specify names when specifying a multi-index header"), - ), - ( - {"index_col": [0, 1], "usecols": ["foo", "bar"]}, - ("cannot specify usecols when specifying a multi-index header"), - ), - ], -) -def test_header_multi_index_invalid(all_parsers, kwargs, msg): - data = """\ -C0,,C_l0_g0,C_l0_g1,C_l0_g2 - -C1,,C_l1_g0,C_l1_g1,C_l1_g2 -C2,,C_l2_g0,C_l2_g1,C_l2_g2 -C3,,C_l3_g0,C_l3_g1,C_l3_g2 -R0,R1,,, -R_l0_g0,R_l1_g0,R0C0,R0C1,R0C2 -R_l0_g1,R_l1_g1,R1C0,R1C1,R1C2 -R_l0_g2,R_l1_g2,R2C0,R2C1,R2C2 -R_l0_g3,R_l1_g3,R3C0,R3C1,R3C2 -R_l0_g4,R_l1_g4,R4C0,R4C1,R4C2 -""" - parser = all_parsers - - with pytest.raises(ValueError, match=msg): - parser.read_csv(StringIO(data), header=[0, 1, 2, 3], **kwargs) - - -_TestTuple = namedtuple("_TestTuple", ["first", "second"]) - - -@skip_pyarrow 
-@pytest.mark.parametrize( - "kwargs", - [ - {"header": [0, 1]}, - { - "skiprows": 3, - "names": [ - ("a", "q"), - ("a", "r"), - ("a", "s"), - ("b", "t"), - ("c", "u"), - ("c", "v"), - ], - }, - { - "skiprows": 3, - "names": [ - _TestTuple("a", "q"), - _TestTuple("a", "r"), - _TestTuple("a", "s"), - _TestTuple("b", "t"), - _TestTuple("c", "u"), - _TestTuple("c", "v"), - ], - }, - ], -) -def test_header_multi_index_common_format1(all_parsers, kwargs): - parser = all_parsers - expected = DataFrame( - [[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]], - index=["one", "two"], - columns=MultiIndex.from_tuples( - [("a", "q"), ("a", "r"), ("a", "s"), ("b", "t"), ("c", "u"), ("c", "v")] - ), - ) - data = """,a,a,a,b,c,c -,q,r,s,t,u,v -,,,,,, -one,1,2,3,4,5,6 -two,7,8,9,10,11,12""" - - result = parser.read_csv(StringIO(data), index_col=0, **kwargs) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -@pytest.mark.parametrize( - "kwargs", - [ - {"header": [0, 1]}, - { - "skiprows": 2, - "names": [ - ("a", "q"), - ("a", "r"), - ("a", "s"), - ("b", "t"), - ("c", "u"), - ("c", "v"), - ], - }, - { - "skiprows": 2, - "names": [ - _TestTuple("a", "q"), - _TestTuple("a", "r"), - _TestTuple("a", "s"), - _TestTuple("b", "t"), - _TestTuple("c", "u"), - _TestTuple("c", "v"), - ], - }, - ], -) -def test_header_multi_index_common_format2(all_parsers, kwargs): - parser = all_parsers - expected = DataFrame( - [[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]], - index=["one", "two"], - columns=MultiIndex.from_tuples( - [("a", "q"), ("a", "r"), ("a", "s"), ("b", "t"), ("c", "u"), ("c", "v")] - ), - ) - data = """,a,a,a,b,c,c -,q,r,s,t,u,v -one,1,2,3,4,5,6 -two,7,8,9,10,11,12""" - - result = parser.read_csv(StringIO(data), index_col=0, **kwargs) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -@pytest.mark.parametrize( - "kwargs", - [ - {"header": [0, 1]}, - { - "skiprows": 2, - "names": [ - ("a", "q"), - ("a", "r"), - ("a", "s"), - ("b", "t"), - ("c", "u"), - ("c", "v"), - ], - }, - { - "skiprows": 2, - "names": [ - _TestTuple("a", "q"), - _TestTuple("a", "r"), - _TestTuple("a", "s"), - _TestTuple("b", "t"), - _TestTuple("c", "u"), - _TestTuple("c", "v"), - ], - }, - ], -) -def test_header_multi_index_common_format3(all_parsers, kwargs): - parser = all_parsers - expected = DataFrame( - [[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]], - index=["one", "two"], - columns=MultiIndex.from_tuples( - [("a", "q"), ("a", "r"), ("a", "s"), ("b", "t"), ("c", "u"), ("c", "v")] - ), - ) - expected = expected.reset_index(drop=True) - data = """a,a,a,b,c,c -q,r,s,t,u,v -1,2,3,4,5,6 -7,8,9,10,11,12""" - - result = parser.read_csv(StringIO(data), index_col=None, **kwargs) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -def test_header_multi_index_common_format_malformed1(all_parsers): - parser = all_parsers - expected = DataFrame( - np.array([[2, 3, 4, 5, 6], [8, 9, 10, 11, 12]], dtype="int64"), - index=Index([1, 7]), - columns=MultiIndex( - levels=[["a", "b", "c"], ["r", "s", "t", "u", "v"]], - codes=[[0, 0, 1, 2, 2], [0, 1, 2, 3, 4]], - names=["a", "q"], - ), - ) - data = """a,a,a,b,c,c -q,r,s,t,u,v -1,2,3,4,5,6 -7,8,9,10,11,12""" - - result = parser.read_csv(StringIO(data), header=[0, 1], index_col=0) - tm.assert_frame_equal(expected, result) - - -@skip_pyarrow -def test_header_multi_index_common_format_malformed2(all_parsers): - parser = all_parsers - expected = DataFrame( - np.array([[2, 3, 4, 5, 6], [8, 9, 10, 11, 12]], dtype="int64"), - index=Index([1, 7]), - columns=MultiIndex( - levels=[["a", "b", "c"], 
["r", "s", "t", "u", "v"]], - codes=[[0, 0, 1, 2, 2], [0, 1, 2, 3, 4]], - names=[None, "q"], - ), - ) - - data = """,a,a,b,c,c -q,r,s,t,u,v -1,2,3,4,5,6 -7,8,9,10,11,12""" - - result = parser.read_csv(StringIO(data), header=[0, 1], index_col=0) - tm.assert_frame_equal(expected, result) - - -@skip_pyarrow -def test_header_multi_index_common_format_malformed3(all_parsers): - parser = all_parsers - expected = DataFrame( - np.array([[3, 4, 5, 6], [9, 10, 11, 12]], dtype="int64"), - index=MultiIndex(levels=[[1, 7], [2, 8]], codes=[[0, 1], [0, 1]]), - columns=MultiIndex( - levels=[["a", "b", "c"], ["s", "t", "u", "v"]], - codes=[[0, 1, 2, 2], [0, 1, 2, 3]], - names=[None, "q"], - ), - ) - data = """,a,a,b,c,c -q,r,s,t,u,v -1,2,3,4,5,6 -7,8,9,10,11,12""" - - result = parser.read_csv(StringIO(data), header=[0, 1], index_col=[0, 1]) - tm.assert_frame_equal(expected, result) - - -@skip_pyarrow -def test_header_multi_index_blank_line(all_parsers): - # GH 40442 - parser = all_parsers - data = [[None, None], [1, 2], [3, 4]] - columns = MultiIndex.from_tuples([("a", "A"), ("b", "B")]) - expected = DataFrame(data, columns=columns) - data = "a,b\nA,B\n,\n1,2\n3,4" - result = parser.read_csv(StringIO(data), header=[0, 1]) - tm.assert_frame_equal(expected, result) - - -@skip_pyarrow -@pytest.mark.parametrize( - "data,header", [("1,2,3\n4,5,6", None), ("foo,bar,baz\n1,2,3\n4,5,6", 0)] -) -def test_header_names_backward_compat(all_parsers, data, header): - # see gh-2539 - parser = all_parsers - expected = parser.read_csv(StringIO("1,2,3\n4,5,6"), names=["a", "b", "c"]) - - result = parser.read_csv(StringIO(data), names=["a", "b", "c"], header=header) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -@pytest.mark.parametrize("kwargs", [{}, {"index_col": False}]) -def test_read_only_header_no_rows(all_parsers, kwargs): - # See gh-7773 - parser = all_parsers - expected = DataFrame(columns=["a", "b", "c"]) - - result = parser.read_csv(StringIO("a,b,c"), **kwargs) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - "kwargs,names", - [ - ({}, [0, 1, 2, 3, 4]), - ( - {"names": ["foo", "bar", "baz", "quux", "panda"]}, - ["foo", "bar", "baz", "quux", "panda"], - ), - ], -) -def test_no_header(all_parsers, kwargs, names): - parser = all_parsers - data = """1,2,3,4,5 -6,7,8,9,10 -11,12,13,14,15 -""" - expected = DataFrame( - [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10], [11, 12, 13, 14, 15]], columns=names - ) - result = parser.read_csv(StringIO(data), header=None, **kwargs) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("header", [["a", "b"], "string_header"]) -def test_non_int_header(all_parsers, header): - # see gh-16338 - msg = "header must be integer or list of integers" - data = """1,2\n3,4""" - parser = all_parsers - - with pytest.raises(ValueError, match=msg): - parser.read_csv(StringIO(data), header=header) - - -@skip_pyarrow -def test_singleton_header(all_parsers): - # see gh-7757 - data = """a,b,c\n0,1,2\n1,2,3""" - parser = all_parsers - - expected = DataFrame({"a": [0, 1], "b": [1, 2], "c": [2, 3]}) - result = parser.read_csv(StringIO(data), header=[0]) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -@pytest.mark.parametrize( - "data,expected", - [ - ( - "A,A,A,B\none,one,one,two\n0,40,34,0.1", - DataFrame( - [[0, 40, 34, 0.1]], - columns=MultiIndex.from_tuples( - [("A", "one"), ("A", "one.1"), ("A", "one.2"), ("B", "two")] - ), - ), - ), - ( - "A,A,A,B\none,one,one.1,two\n0,40,34,0.1", - DataFrame( - [[0, 40, 34, 0.1]], - 
columns=MultiIndex.from_tuples( - [("A", "one"), ("A", "one.1"), ("A", "one.1.1"), ("B", "two")] - ), - ), - ), - ( - "A,A,A,B,B\none,one,one.1,two,two\n0,40,34,0.1,0.1", - DataFrame( - [[0, 40, 34, 0.1, 0.1]], - columns=MultiIndex.from_tuples( - [ - ("A", "one"), - ("A", "one.1"), - ("A", "one.1.1"), - ("B", "two"), - ("B", "two.1"), - ] - ), - ), - ), - ], -) -def test_mangles_multi_index(all_parsers, data, expected): - # see gh-18062 - parser = all_parsers - - result = parser.read_csv(StringIO(data), header=[0, 1]) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -@pytest.mark.parametrize("index_col", [None, [0]]) -@pytest.mark.parametrize( - "columns", [None, (["", "Unnamed"]), (["Unnamed", ""]), (["Unnamed", "NotUnnamed"])] -) -def test_multi_index_unnamed(all_parsers, index_col, columns): - # see gh-23687 - # - # When specifying a multi-index header, make sure that - # we don't error just because one of the rows in our header - # has ALL column names containing the string "Unnamed". The - # correct condition to check is whether the row contains - # ALL columns that did not have names (and instead were given - # placeholder ones). - parser = all_parsers - header = [0, 1] - - if index_col is None: - data = ",".join(columns or ["", ""]) + "\n0,1\n2,3\n4,5\n" - else: - data = ",".join([""] + (columns or ["", ""])) + "\n,0,1\n0,2,3\n1,4,5\n" - - result = parser.read_csv(StringIO(data), header=header, index_col=index_col) - exp_columns = [] - - if columns is None: - columns = ["", "", ""] - - for i, col in enumerate(columns): - if not col: # Unnamed. - col = f"Unnamed: {i if index_col is None else i + 1}_level_0" - - exp_columns.append(col) - - columns = MultiIndex.from_tuples(zip(exp_columns, ["0", "1"])) - expected = DataFrame([[2, 3], [4, 5]], columns=columns) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -def test_names_longer_than_header_but_equal_with_data_rows(all_parsers): - # GH#38453 - parser = all_parsers - data = """a, b -1,2,3 -5,6,4 -""" - result = parser.read_csv(StringIO(data), header=0, names=["A", "B", "C"]) - expected = DataFrame({"A": [1, 5], "B": [2, 6], "C": [3, 4]}) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -def test_read_csv_multiindex_columns(all_parsers): - # GH#6051 - parser = all_parsers - - s1 = "Male, Male, Male, Female, Female\nR, R, L, R, R\n.86, .67, .88, .78, .81" - s2 = ( - "Male, Male, Male, Female, Female\n" - "R, R, L, R, R\n" - ".86, .67, .88, .78, .81\n" - ".86, .67, .88, .78, .82" - ) - - mi = MultiIndex.from_tuples( - [ - ("Male", "R"), - (" Male", " R"), - (" Male", " L"), - (" Female", " R"), - (" Female", " R.1"), - ] - ) - expected = DataFrame( - [[0.86, 0.67, 0.88, 0.78, 0.81], [0.86, 0.67, 0.88, 0.78, 0.82]], columns=mi - ) - - df1 = parser.read_csv(StringIO(s1), header=[0, 1]) - tm.assert_frame_equal(df1, expected.iloc[:1]) - df2 = parser.read_csv(StringIO(s2), header=[0, 1]) - tm.assert_frame_equal(df2, expected) - - -@skip_pyarrow -def test_read_csv_multi_header_length_check(all_parsers): - # GH#43102 - parser = all_parsers - - case = """row11,row12,row13 -row21,row22, row23 -row31,row32 -""" - - with pytest.raises( - ParserError, match="Header rows must have an equal number of columns." 
- ): - parser.read_csv(StringIO(case), header=[0, 2]) - - -@skip_pyarrow -def test_header_none_and_implicit_index(all_parsers): - # GH#22144 - parser = all_parsers - data = "x,1,5\ny,2\nz,3\n" - result = parser.read_csv(StringIO(data), names=["a", "b"], header=None) - expected = DataFrame( - {"a": [1, 2, 3], "b": [5, np.nan, np.nan]}, index=["x", "y", "z"] - ) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -def test_header_none_and_implicit_index_in_second_row(all_parsers): - # GH#22144 - parser = all_parsers - data = "x,1\ny,2,5\nz,3\n" - with pytest.raises(ParserError, match="Expected 2 fields in line 2, saw 3"): - parser.read_csv(StringIO(data), names=["a", "b"], header=None) - - -@skip_pyarrow -def test_header_none_and_on_bad_lines_skip(all_parsers): - # GH#22144 - parser = all_parsers - data = "x,1\ny,2,5\nz,3\n" - result = parser.read_csv( - StringIO(data), names=["a", "b"], header=None, on_bad_lines="skip" - ) - expected = DataFrame({"a": ["x", "z"], "b": [1, 3]}) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -def test_header_missing_rows(all_parsers): - # GH#47400 - parser = all_parsers - data = """a,b -1,2 -""" - msg = r"Passed header=\[0,1,2\], len of 3, but only 2 lines in file" - with pytest.raises(ValueError, match=msg): - parser.read_csv(StringIO(data), header=[0, 1, 2]) diff --git a/pandas/tests/io/parser/test_index_col.py b/pandas/tests/io/parser/test_index_col.py deleted file mode 100644 index 13c4216710f84..0000000000000 --- a/pandas/tests/io/parser/test_index_col.py +++ /dev/null @@ -1,355 +0,0 @@ -""" -Tests that the specified index column (a.k.a "index_col") -is properly handled or inferred during parsing for all of -the parsers defined in parsers.py -""" -from io import StringIO - -import numpy as np -import pytest - -from pandas import ( - DataFrame, - Index, - MultiIndex, -) -import pandas._testing as tm - -# TODO(1.4): Change me to xfails at release time -skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") - - -@pytest.mark.parametrize("with_header", [True, False]) -def test_index_col_named(all_parsers, with_header): - parser = all_parsers - no_header = """\ -KORD1,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 -KORD2,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 -KORD3,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 -KORD4,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 -KORD5,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 -KORD6,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000""" - header = "ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir\n" - - if with_header: - data = header + no_header - - result = parser.read_csv(StringIO(data), index_col="ID") - expected = parser.read_csv(StringIO(data), header=0).set_index("ID") - tm.assert_frame_equal(result, expected) - else: - data = no_header - msg = "Index ID invalid" - - with pytest.raises(ValueError, match=msg): - parser.read_csv(StringIO(data), index_col="ID") - - -def test_index_col_named2(all_parsers): - parser = all_parsers - data = """\ -1,2,3,4,hello -5,6,7,8,world -9,10,11,12,foo -""" - - expected = DataFrame( - {"a": [1, 5, 9], "b": [2, 6, 10], "c": [3, 7, 11], "d": [4, 8, 12]}, - index=Index(["hello", "world", "foo"], name="message"), - ) - names = ["a", "b", "c", "d", "message"] - - result = parser.read_csv(StringIO(data), names=names, index_col=["message"]) - tm.assert_frame_equal(result, expected) - - -def 
test_index_col_is_true(all_parsers): - # see gh-9798 - data = "a,b\n1,2" - parser = all_parsers - - msg = "The value of index_col couldn't be 'True'" - with pytest.raises(ValueError, match=msg): - parser.read_csv(StringIO(data), index_col=True) - - -@skip_pyarrow -def test_infer_index_col(all_parsers): - data = """A,B,C -foo,1,2,3 -bar,4,5,6 -baz,7,8,9 -""" - parser = all_parsers - result = parser.read_csv(StringIO(data)) - - expected = DataFrame( - [[1, 2, 3], [4, 5, 6], [7, 8, 9]], - index=["foo", "bar", "baz"], - columns=["A", "B", "C"], - ) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -@pytest.mark.parametrize( - "index_col,kwargs", - [ - (None, {"columns": ["x", "y", "z"]}), - (False, {"columns": ["x", "y", "z"]}), - (0, {"columns": ["y", "z"], "index": Index([], name="x")}), - (1, {"columns": ["x", "z"], "index": Index([], name="y")}), - ("x", {"columns": ["y", "z"], "index": Index([], name="x")}), - ("y", {"columns": ["x", "z"], "index": Index([], name="y")}), - ( - [0, 1], - { - "columns": ["z"], - "index": MultiIndex.from_arrays([[]] * 2, names=["x", "y"]), - }, - ), - ( - ["x", "y"], - { - "columns": ["z"], - "index": MultiIndex.from_arrays([[]] * 2, names=["x", "y"]), - }, - ), - ( - [1, 0], - { - "columns": ["z"], - "index": MultiIndex.from_arrays([[]] * 2, names=["y", "x"]), - }, - ), - ( - ["y", "x"], - { - "columns": ["z"], - "index": MultiIndex.from_arrays([[]] * 2, names=["y", "x"]), - }, - ), - ], -) -def test_index_col_empty_data(all_parsers, index_col, kwargs): - data = "x,y,z" - parser = all_parsers - result = parser.read_csv(StringIO(data), index_col=index_col) - - expected = DataFrame(**kwargs) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -def test_empty_with_index_col_false(all_parsers): - # see gh-10413 - data = "x,y" - parser = all_parsers - result = parser.read_csv(StringIO(data), index_col=False) - - expected = DataFrame(columns=["x", "y"]) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -@pytest.mark.parametrize( - "index_names", - [ - ["", ""], - ["foo", ""], - ["", "bar"], - ["foo", "bar"], - ["NotReallyUnnamed", "Unnamed: 0"], - ], -) -def test_multi_index_naming(all_parsers, index_names): - parser = all_parsers - - # We don't want empty index names being replaced with "Unnamed: 0" - data = ",".join(index_names + ["col\na,c,1\na,d,2\nb,c,3\nb,d,4"]) - result = parser.read_csv(StringIO(data), index_col=[0, 1]) - - expected = DataFrame( - {"col": [1, 2, 3, 4]}, index=MultiIndex.from_product([["a", "b"], ["c", "d"]]) - ) - expected.index.names = [name if name else None for name in index_names] - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -def test_multi_index_naming_not_all_at_beginning(all_parsers): - parser = all_parsers - data = ",Unnamed: 2,\na,c,1\na,d,2\nb,c,3\nb,d,4" - result = parser.read_csv(StringIO(data), index_col=[0, 2]) - - expected = DataFrame( - {"Unnamed: 2": ["c", "d", "c", "d"]}, - index=MultiIndex( - levels=[["a", "b"], [1, 2, 3, 4]], codes=[[0, 0, 1, 1], [0, 1, 2, 3]] - ), - ) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -def test_no_multi_index_level_names_empty(all_parsers): - # GH 10984 - parser = all_parsers - midx = MultiIndex.from_tuples([("A", 1, 2), ("A", 1, 2), ("B", 1, 2)]) - expected = DataFrame(np.random.randn(3, 3), index=midx, columns=["x", "y", "z"]) - with tm.ensure_clean() as path: - expected.to_csv(path) - result = parser.read_csv(path, index_col=[0, 1, 2]) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -def 
test_header_with_index_col(all_parsers):
- # GH 33476
- parser = all_parsers
- data = """
-I11,A,A
-I12,B,B
-I2,1,3
-"""
- midx = MultiIndex.from_tuples([("A", "B"), ("A", "B.1")], names=["I11", "I12"])
- idx = Index(["I2"])
- expected = DataFrame([[1, 3]], index=idx, columns=midx)
-
- result = parser.read_csv(StringIO(data), index_col=0, header=[0, 1])
- tm.assert_frame_equal(result, expected)
-
- col_idx = Index(["A", "A.1"])
- idx = Index(["I12", "I2"], name="I11")
- expected = DataFrame([["B", "B"], ["1", "3"]], index=idx, columns=col_idx)
-
- result = parser.read_csv(StringIO(data), index_col="I11", header=0)
- tm.assert_frame_equal(result, expected)
-
-
-@pytest.mark.slow
-def test_index_col_large_csv(all_parsers):
- # https://github.com/pandas-dev/pandas/issues/37094
- parser = all_parsers
-
- N = 1_000_001
- df = DataFrame({"a": range(N), "b": np.random.randn(N)})
-
- with tm.ensure_clean() as path:
- df.to_csv(path, index=False)
- result = parser.read_csv(path, index_col=[0])
-
- tm.assert_frame_equal(result, df.set_index("a"))
-
-
-@skip_pyarrow
-def test_index_col_multiindex_columns_no_data(all_parsers):
- # GH#38292
- parser = all_parsers
- result = parser.read_csv(
- StringIO("a0,a1,a2\nb0,b1,b2\n"), header=[0, 1], index_col=0
- )
- expected = DataFrame(
- [],
- index=Index([]),
- columns=MultiIndex.from_arrays(
- [["a1", "a2"], ["b1", "b2"]], names=["a0", "b0"]
- ),
- )
- tm.assert_frame_equal(result, expected)
-
-
-@skip_pyarrow
-def test_index_col_header_no_data(all_parsers):
- # GH#38292
- parser = all_parsers
- result = parser.read_csv(StringIO("a0,a1,a2\n"), header=[0], index_col=0)
- expected = DataFrame(
- [],
- columns=["a1", "a2"],
- index=Index([], name="a0"),
- )
- tm.assert_frame_equal(result, expected)
-
-
-@skip_pyarrow
-def test_multiindex_columns_no_data(all_parsers):
- # GH#38292
- parser = all_parsers
- result = parser.read_csv(StringIO("a0,a1,a2\nb0,b1,b2\n"), header=[0, 1])
- expected = DataFrame(
- [], columns=MultiIndex.from_arrays([["a0", "a1", "a2"], ["b0", "b1", "b2"]])
- )
- tm.assert_frame_equal(result, expected)
-
-
-@skip_pyarrow
-def test_multiindex_columns_index_col_with_data(all_parsers):
- # GH#38292
- parser = all_parsers
- result = parser.read_csv(
- StringIO("a0,a1,a2\nb0,b1,b2\ndata,data,data"), header=[0, 1], index_col=0
- )
- expected = DataFrame(
- [["data", "data"]],
- columns=MultiIndex.from_arrays(
- [["a1", "a2"], ["b1", "b2"]], names=["a0", "b0"]
- ),
- index=Index(["data"]),
- )
- tm.assert_frame_equal(result, expected)
-
-
-@skip_pyarrow
-def test_infer_types_boolean_sum(all_parsers):
- # GH#44079
- parser = all_parsers
- result = parser.read_csv(
- StringIO("0,1"),
- names=["a", "b"],
- index_col=["a"],
- dtype={"a": "UInt8"},
- )
- expected = DataFrame(
- data={
- "a": [
- 0,
- ],
- "b": [1],
- }
- ).set_index("a")
- # Not checking index type now, because the C parser will return an
- # index column of dtype 'object', and the Python parser will return an
- # index column of dtype 'int64'.
- tm.assert_frame_equal(result, expected, check_index_type=False) - - -@skip_pyarrow -@pytest.mark.parametrize("dtype, val", [(object, "01"), ("int64", 1)]) -def test_specify_dtype_for_index_col(all_parsers, dtype, val): - # GH#9435 - data = "a,b\n01,2" - parser = all_parsers - result = parser.read_csv(StringIO(data), index_col="a", dtype={"a": dtype}) - expected = DataFrame({"b": [2]}, index=Index([val], name="a")) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -def test_multiindex_columns_not_leading_index_col(all_parsers): - # GH#38549 - parser = all_parsers - data = """a,b,c,d -e,f,g,h -x,y,1,2 -""" - result = parser.read_csv( - StringIO(data), - header=[0, 1], - index_col=1, - ) - cols = MultiIndex.from_tuples( - [("a", "e"), ("c", "g"), ("d", "h")], names=["b", "f"] - ) - expected = DataFrame([["x", 1, 2]], columns=cols, index=["y"]) - tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_mangle_dupes.py b/pandas/tests/io/parser/test_mangle_dupes.py deleted file mode 100644 index 5709e7e4027e8..0000000000000 --- a/pandas/tests/io/parser/test_mangle_dupes.py +++ /dev/null @@ -1,165 +0,0 @@ -""" -Tests that duplicate columns are handled appropriately when parsed by the -CSV engine. In general, the expected result is that they are either thoroughly -de-duplicated (if mangling requested) or ignored otherwise. -""" -from io import StringIO - -import pytest - -from pandas import DataFrame -import pandas._testing as tm - -skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") - - -@skip_pyarrow -def test_basic(all_parsers): - parser = all_parsers - - data = "a,a,b,b,b\n1,2,3,4,5" - result = parser.read_csv(StringIO(data), sep=",") - - expected = DataFrame([[1, 2, 3, 4, 5]], columns=["a", "a.1", "b", "b.1", "b.2"]) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -def test_basic_names(all_parsers): - # See gh-7160 - parser = all_parsers - - data = "a,b,a\n0,1,2\n3,4,5" - expected = DataFrame([[0, 1, 2], [3, 4, 5]], columns=["a", "b", "a.1"]) - - result = parser.read_csv(StringIO(data)) - tm.assert_frame_equal(result, expected) - - -def test_basic_names_raise(all_parsers): - # See gh-7160 - parser = all_parsers - - data = "0,1,2\n3,4,5" - with pytest.raises(ValueError, match="Duplicate names"): - parser.read_csv(StringIO(data), names=["a", "b", "a"]) - - -@skip_pyarrow -@pytest.mark.parametrize( - "data,expected", - [ - ("a,a,a.1\n1,2,3", DataFrame([[1, 2, 3]], columns=["a", "a.2", "a.1"])), - ( - "a,a,a.1,a.1.1,a.1.1.1,a.1.1.1.1\n1,2,3,4,5,6", - DataFrame( - [[1, 2, 3, 4, 5, 6]], - columns=["a", "a.2", "a.1", "a.1.1", "a.1.1.1", "a.1.1.1.1"], - ), - ), - ( - "a,a,a.3,a.1,a.2,a,a\n1,2,3,4,5,6,7", - DataFrame( - [[1, 2, 3, 4, 5, 6, 7]], - columns=["a", "a.4", "a.3", "a.1", "a.2", "a.5", "a.6"], - ), - ), - ], -) -def test_thorough_mangle_columns(all_parsers, data, expected): - # see gh-17060 - parser = all_parsers - - result = parser.read_csv(StringIO(data)) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -@pytest.mark.parametrize( - "data,names,expected", - [ - ( - "a,b,b\n1,2,3", - ["a.1", "a.1", "a.1.1"], - DataFrame( - [["a", "b", "b"], ["1", "2", "3"]], columns=["a.1", "a.1.1", "a.1.1.1"] - ), - ), - ( - "a,b,c,d,e,f\n1,2,3,4,5,6", - ["a", "a", "a.1", "a.1.1", "a.1.1.1", "a.1.1.1.1"], - DataFrame( - [["a", "b", "c", "d", "e", "f"], ["1", "2", "3", "4", "5", "6"]], - columns=["a", "a.1", "a.1.1", "a.1.1.1", "a.1.1.1.1", "a.1.1.1.1.1"], - ), - ), - ( - "a,b,c,d,e,f,g\n1,2,3,4,5,6,7", - ["a", "a", "a.3", "a.1", "a.2", "a", 
"a"], - DataFrame( - [ - ["a", "b", "c", "d", "e", "f", "g"], - ["1", "2", "3", "4", "5", "6", "7"], - ], - columns=["a", "a.1", "a.3", "a.1.1", "a.2", "a.2.1", "a.3.1"], - ), - ), - ], -) -def test_thorough_mangle_names(all_parsers, data, names, expected): - # see gh-17095 - parser = all_parsers - - with pytest.raises(ValueError, match="Duplicate names"): - parser.read_csv(StringIO(data), names=names) - - -@skip_pyarrow -def test_mangled_unnamed_placeholders(all_parsers): - # xref gh-13017 - orig_key = "0" - parser = all_parsers - - orig_value = [1, 2, 3] - df = DataFrame({orig_key: orig_value}) - - # This test recursively updates `df`. - for i in range(3): - expected = DataFrame() - - for j in range(i + 1): - col_name = "Unnamed: 0" + f".{1*j}" * min(j, 1) - expected.insert(loc=0, column=col_name, value=[0, 1, 2]) - - expected[orig_key] = orig_value - df = parser.read_csv(StringIO(df.to_csv())) - - tm.assert_frame_equal(df, expected) - - -@skip_pyarrow -def test_mangle_dupe_cols_already_exists(all_parsers): - # GH#14704 - parser = all_parsers - - data = "a,a,a.1,a,a.3,a.1,a.1.1\n1,2,3,4,5,6,7" - result = parser.read_csv(StringIO(data)) - expected = DataFrame( - [[1, 2, 3, 4, 5, 6, 7]], - columns=["a", "a.2", "a.1", "a.4", "a.3", "a.1.2", "a.1.1"], - ) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -def test_mangle_dupe_cols_already_exists_unnamed_col(all_parsers): - # GH#14704 - parser = all_parsers - - data = ",Unnamed: 0,,Unnamed: 2\n1,2,3,4" - result = parser.read_csv(StringIO(data)) - expected = DataFrame( - [[1, 2, 3, 4]], - columns=["Unnamed: 0.1", "Unnamed: 0", "Unnamed: 2.1", "Unnamed: 2"], - ) - tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_multi_thread.py b/pandas/tests/io/parser/test_multi_thread.py deleted file mode 100644 index ab278470934a5..0000000000000 --- a/pandas/tests/io/parser/test_multi_thread.py +++ /dev/null @@ -1,154 +0,0 @@ -""" -Tests multithreading behaviour for reading and -parsing files for each parser defined in parsers.py -""" -from contextlib import ExitStack -from io import BytesIO -from multiprocessing.pool import ThreadPool - -import numpy as np -import pytest - -import pandas as pd -from pandas import DataFrame -import pandas._testing as tm - -# We'll probably always skip these for pyarrow -# Maybe we'll add our own tests for pyarrow too -pytestmark = pytest.mark.usefixtures("pyarrow_skip") - - -def _construct_dataframe(num_rows): - """ - Construct a DataFrame for testing. - - Parameters - ---------- - num_rows : int - The number of rows for our DataFrame. - - Returns - ------- - df : DataFrame - """ - df = DataFrame(np.random.rand(num_rows, 5), columns=list("abcde")) - df["foo"] = "foo" - df["bar"] = "bar" - df["baz"] = "baz" - df["date"] = pd.date_range("20000101 09:00:00", periods=num_rows, freq="s") - df["int"] = np.arange(num_rows, dtype="int64") - return df - - -@pytest.mark.slow -def test_multi_thread_string_io_read_csv(all_parsers): - # see gh-11786 - parser = all_parsers - max_row_range = 10000 - num_files = 100 - - bytes_to_df = [ - "\n".join([f"{i:d},{i:d},{i:d}" for i in range(max_row_range)]).encode() - for _ in range(num_files) - ] - - # Read all files in many threads. 
- with ExitStack() as stack: - files = [stack.enter_context(BytesIO(b)) for b in bytes_to_df] - - pool = stack.enter_context(ThreadPool(8)) - - results = pool.map(parser.read_csv, files) - first_result = results[0] - - for result in results: - tm.assert_frame_equal(first_result, result) - - -def _generate_multi_thread_dataframe(parser, path, num_rows, num_tasks): - """ - Generate a DataFrame via multi-thread. - - Parameters - ---------- - parser : BaseParser - The parser object to use for reading the data. - path : str - The location of the CSV file to read. - num_rows : int - The number of rows to read per task. - num_tasks : int - The number of tasks to use for reading this DataFrame. - - Returns - ------- - df : DataFrame - """ - - def reader(arg): - """ - Create a reader for part of the CSV. - - Parameters - ---------- - arg : tuple - A tuple of the following: - - * start : int - The starting row to start for parsing CSV - * nrows : int - The number of rows to read. - - Returns - ------- - df : DataFrame - """ - start, nrows = arg - - if not start: - return parser.read_csv( - path, index_col=0, header=0, nrows=nrows, parse_dates=["date"] - ) - - return parser.read_csv( - path, - index_col=0, - header=None, - skiprows=int(start) + 1, - nrows=nrows, - parse_dates=[9], - ) - - tasks = [ - (num_rows * i // num_tasks, num_rows // num_tasks) for i in range(num_tasks) - ] - - with ThreadPool(processes=num_tasks) as pool: - results = pool.map(reader, tasks) - - header = results[0].columns - - for r in results[1:]: - r.columns = header - - final_dataframe = pd.concat(results) - return final_dataframe - - -@pytest.mark.slow -def test_multi_thread_path_multipart_read_csv(all_parsers): - # see gh-11786 - num_tasks = 4 - num_rows = 100000 - - parser = all_parsers - file_name = "__thread_pool_reader__.csv" - df = _construct_dataframe(num_rows) - - with tm.ensure_clean(file_name) as path: - df.to_csv(path) - - final_dataframe = _generate_multi_thread_dataframe( - parser, path, num_rows, num_tasks - ) - tm.assert_frame_equal(df, final_dataframe) diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py deleted file mode 100644 index 9a16ec5a50d36..0000000000000 --- a/pandas/tests/io/parser/test_na_values.py +++ /dev/null @@ -1,671 +0,0 @@ -""" -Tests that NA values are properly handled during -parsing for all of the parsers defined in parsers.py -""" -from io import StringIO - -import numpy as np -import pytest - -from pandas._libs.parsers import STR_NA_VALUES - -from pandas import ( - DataFrame, - Index, - MultiIndex, -) -import pandas._testing as tm - -skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") -xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") - - -def test_string_nas(all_parsers): - parser = all_parsers - data = """A,B,C -a,b,c -d,,f -,g,h -""" - result = parser.read_csv(StringIO(data)) - expected = DataFrame( - [["a", "b", "c"], ["d", np.nan, "f"], [np.nan, "g", "h"]], - columns=["A", "B", "C"], - ) - if parser.engine == "pyarrow": - expected.loc[2, "A"] = None - expected.loc[1, "B"] = None - tm.assert_frame_equal(result, expected) - - -def test_detect_string_na(all_parsers): - parser = all_parsers - data = """A,B -foo,bar -NA,baz -NaN,nan -""" - expected = DataFrame( - [["foo", "bar"], [np.nan, "baz"], [np.nan, np.nan]], columns=["A", "B"] - ) - if parser.engine == "pyarrow": - expected.loc[[1, 2], "A"] = None - expected.loc[2, "B"] = None - result = parser.read_csv(StringIO(data)) - tm.assert_frame_equal(result, expected) - - 
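-# For the parametrized test below: na_values entries are matched against both
-# the raw string and the parsed numeric value, so "-999", -999 and -999.0
-# should all mark the same cells as NaN.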
-@skip_pyarrow -@pytest.mark.parametrize( - "na_values", - [ - ["-999.0", "-999"], - [-999, -999.0], - [-999.0, -999], - ["-999.0"], - ["-999"], - [-999.0], - [-999], - ], -) -@pytest.mark.parametrize( - "data", - [ - """A,B --999,1.2 -2,-999 -3,4.5 -""", - """A,B --999,1.200 -2,-999.000 -3,4.500 -""", - ], -) -def test_non_string_na_values(all_parsers, data, na_values): - # see gh-3611: with an odd float format, we can't match - # the string "999.0" exactly but still need float matching - parser = all_parsers - expected = DataFrame([[np.nan, 1.2], [2.0, np.nan], [3.0, 4.5]], columns=["A", "B"]) - - result = parser.read_csv(StringIO(data), na_values=na_values) - tm.assert_frame_equal(result, expected) - - -def test_default_na_values(all_parsers): - _NA_VALUES = { - "-1.#IND", - "1.#QNAN", - "1.#IND", - "-1.#QNAN", - "#N/A", - "N/A", - "n/a", - "NA", - "", - "#NA", - "NULL", - "null", - "NaN", - "nan", - "-NaN", - "-nan", - "#N/A N/A", - "", - "None", - } - assert _NA_VALUES == STR_NA_VALUES - - parser = all_parsers - nv = len(_NA_VALUES) - - def f(i, v): - if i == 0: - buf = "" - elif i > 0: - buf = "".join([","] * i) - - buf = f"{buf}{v}" - - if i < nv - 1: - joined = "".join([","] * (nv - i - 1)) - buf = f"{buf}{joined}" - - return buf - - data = StringIO("\n".join([f(i, v) for i, v in enumerate(_NA_VALUES)])) - expected = DataFrame(np.nan, columns=range(nv), index=range(nv)) - - result = parser.read_csv(data, header=None) - tm.assert_frame_equal(result, expected) - - -# TODO: needs skiprows list support in pyarrow -@skip_pyarrow -@pytest.mark.parametrize("na_values", ["baz", ["baz"]]) -def test_custom_na_values(all_parsers, na_values): - parser = all_parsers - data = """A,B,C -ignore,this,row -1,NA,3 --1.#IND,5,baz -7,8,NaN -""" - expected = DataFrame( - [[1.0, np.nan, 3], [np.nan, 5, np.nan], [7, 8, np.nan]], columns=["A", "B", "C"] - ) - result = parser.read_csv(StringIO(data), na_values=na_values, skiprows=[1]) - tm.assert_frame_equal(result, expected) - - -def test_bool_na_values(all_parsers): - data = """A,B,C -True,False,True -NA,True,False -False,NA,True""" - parser = all_parsers - result = parser.read_csv(StringIO(data)) - expected = DataFrame( - { - "A": np.array([True, np.nan, False], dtype=object), - "B": np.array([False, True, np.nan], dtype=object), - "C": [True, False, True], - } - ) - if parser.engine == "pyarrow": - expected.loc[1, "A"] = None - expected.loc[2, "B"] = None - tm.assert_frame_equal(result, expected) - - -# TODO: Needs pyarrow support for dictionary in na_values -@skip_pyarrow -def test_na_value_dict(all_parsers): - data = """A,B,C -foo,bar,NA -bar,foo,foo -foo,bar,NA -bar,foo,foo""" - parser = all_parsers - df = parser.read_csv(StringIO(data), na_values={"A": ["foo"], "B": ["bar"]}) - expected = DataFrame( - { - "A": [np.nan, "bar", np.nan, "bar"], - "B": [np.nan, "foo", np.nan, "foo"], - "C": [np.nan, "foo", np.nan, "foo"], - } - ) - tm.assert_frame_equal(df, expected) - - -@pytest.mark.parametrize( - "index_col,expected", - [ - ( - [0], - DataFrame({"b": [np.nan], "c": [1], "d": [5]}, index=Index([0], name="a")), - ), - ( - [0, 2], - DataFrame( - {"b": [np.nan], "d": [5]}, - index=MultiIndex.from_tuples([(0, 1)], names=["a", "c"]), - ), - ), - ( - ["a", "c"], - DataFrame( - {"b": [np.nan], "d": [5]}, - index=MultiIndex.from_tuples([(0, 1)], names=["a", "c"]), - ), - ), - ], -) -def test_na_value_dict_multi_index(all_parsers, index_col, expected): - data = """\ -a,b,c,d -0,NA,1,5 -""" - parser = all_parsers - result = parser.read_csv(StringIO(data), 
na_values=set(), index_col=index_col)
- tm.assert_frame_equal(result, expected)
-
-
-# TODO: xfail components of this test, the first one passes
-@skip_pyarrow
-@pytest.mark.parametrize(
- "kwargs,expected",
- [
- (
- {},
- DataFrame(
- {
- "A": ["a", "b", np.nan, "d", "e", np.nan, "g"],
- "B": [1, 2, 3, 4, 5, 6, 7],
- "C": ["one", "two", "three", np.nan, "five", np.nan, "seven"],
- }
- ),
- ),
- (
- {"na_values": {"A": [], "C": []}, "keep_default_na": False},
- DataFrame(
- {
- "A": ["a", "b", "", "d", "e", "nan", "g"],
- "B": [1, 2, 3, 4, 5, 6, 7],
- "C": ["one", "two", "three", "nan", "five", "", "seven"],
- }
- ),
- ),
- (
- {"na_values": ["a"], "keep_default_na": False},
- DataFrame(
- {
- "A": [np.nan, "b", "", "d", "e", "nan", "g"],
- "B": [1, 2, 3, 4, 5, 6, 7],
- "C": ["one", "two", "three", "nan", "five", "", "seven"],
- }
- ),
- ),
- (
- {"na_values": {"A": [], "C": []}},
- DataFrame(
- {
- "A": ["a", "b", np.nan, "d", "e", np.nan, "g"],
- "B": [1, 2, 3, 4, 5, 6, 7],
- "C": ["one", "two", "three", np.nan, "five", np.nan, "seven"],
- }
- ),
- ),
- ],
-)
-def test_na_values_keep_default(all_parsers, kwargs, expected):
- data = """\
-A,B,C
-a,1,one
-b,2,two
-,3,three
-d,4,nan
-e,5,five
-nan,6,
-g,7,seven
-"""
- parser = all_parsers
- result = parser.read_csv(StringIO(data), **kwargs)
- tm.assert_frame_equal(result, expected)
-
-
-def test_no_na_values_no_keep_default(all_parsers):
- # see gh-4318: passing na_values=None and
- # keep_default_na=False yields 'None' as a na_value
- data = """\
-A,B,C
-a,1,None
-b,2,two
-,3,None
-d,4,nan
-e,5,five
-nan,6,
-g,7,seven
-"""
- parser = all_parsers
- result = parser.read_csv(StringIO(data), keep_default_na=False)
-
- expected = DataFrame(
- {
- "A": ["a", "b", "", "d", "e", "nan", "g"],
- "B": [1, 2, 3, 4, 5, 6, 7],
- "C": ["None", "two", "None", "nan", "five", "", "seven"],
- }
- )
- tm.assert_frame_equal(result, expected)
-
-
-# TODO: Blocked on na_values dict support in pyarrow
-@skip_pyarrow
-def test_no_keep_default_na_dict_na_values(all_parsers):
- # see gh-19227
- data = "a,b\n,2"
- parser = all_parsers
- result = parser.read_csv(
- StringIO(data), na_values={"b": ["2"]}, keep_default_na=False
- )
- expected = DataFrame({"a": [""], "b": [np.nan]})
- tm.assert_frame_equal(result, expected)
-
-
-# TODO: Blocked on na_values dict support in pyarrow
-@skip_pyarrow
-def test_no_keep_default_na_dict_na_scalar_values(all_parsers):
- # see gh-19227
- #
- # Scalar values shouldn't cause the parsing to crash or fail.
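- # A bare scalar like {"b": 2} should behave the same as the list form
- # {"b": [2]}.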
- data = "a,b\n1,2" - parser = all_parsers - df = parser.read_csv(StringIO(data), na_values={"b": 2}, keep_default_na=False) - expected = DataFrame({"a": [1], "b": [np.nan]}) - tm.assert_frame_equal(df, expected) - - -# TODO: Blocked on na_values dict support in pyarrow -@skip_pyarrow -@pytest.mark.parametrize("col_zero_na_values", [113125, "113125"]) -def test_no_keep_default_na_dict_na_values_diff_reprs(all_parsers, col_zero_na_values): - # see gh-19227 - data = """\ -113125,"blah","/blaha",kjsdkj,412.166,225.874,214.008 -729639,"qwer","",asdfkj,466.681,,252.373 -""" - parser = all_parsers - expected = DataFrame( - { - 0: [np.nan, 729639.0], - 1: [np.nan, "qwer"], - 2: ["/blaha", np.nan], - 3: ["kjsdkj", "asdfkj"], - 4: [412.166, 466.681], - 5: ["225.874", ""], - 6: [np.nan, 252.373], - } - ) - - result = parser.read_csv( - StringIO(data), - header=None, - keep_default_na=False, - na_values={2: "", 6: "214.008", 1: "blah", 0: col_zero_na_values}, - ) - tm.assert_frame_equal(result, expected) - - -# TODO: Empty null_values doesn't work properly on pyarrow -@skip_pyarrow -@pytest.mark.parametrize( - "na_filter,row_data", - [ - (True, [[1, "A"], [np.nan, np.nan], [3, "C"]]), - (False, [["1", "A"], ["nan", "B"], ["3", "C"]]), - ], -) -def test_na_values_na_filter_override(all_parsers, na_filter, row_data): - data = """\ -A,B -1,A -nan,B -3,C -""" - parser = all_parsers - result = parser.read_csv(StringIO(data), na_values=["B"], na_filter=na_filter) - - expected = DataFrame(row_data, columns=["A", "B"]) - tm.assert_frame_equal(result, expected) - - -# TODO: Arrow parse error -@skip_pyarrow -def test_na_trailing_columns(all_parsers): - parser = all_parsers - data = """Date,Currency,Symbol,Type,Units,UnitPrice,Cost,Tax -2012-03-14,USD,AAPL,BUY,1000 -2012-05-12,USD,SBUX,SELL,500""" - - # Trailing columns should be all NaN. 
- result = parser.read_csv(StringIO(data))
- expected = DataFrame(
- [
- ["2012-03-14", "USD", "AAPL", "BUY", 1000, np.nan, np.nan, np.nan],
- ["2012-05-12", "USD", "SBUX", "SELL", 500, np.nan, np.nan, np.nan],
- ],
- columns=[
- "Date",
- "Currency",
- "Symbol",
- "Type",
- "Units",
- "UnitPrice",
- "Cost",
- "Tax",
- ],
- )
- tm.assert_frame_equal(result, expected)
-
-
-# TODO: xfail the na_values dict case
-@skip_pyarrow
-@pytest.mark.parametrize(
- "na_values,row_data",
- [
- (1, [[np.nan, 2.0], [2.0, np.nan]]),
- ({"a": 2, "b": 1}, [[1.0, 2.0], [np.nan, np.nan]]),
- ],
-)
-def test_na_values_scalar(all_parsers, na_values, row_data):
- # see gh-12224
- parser = all_parsers
- names = ["a", "b"]
- data = "1,2\n2,1"
-
- result = parser.read_csv(StringIO(data), names=names, na_values=na_values)
- expected = DataFrame(row_data, columns=names)
- tm.assert_frame_equal(result, expected)
-
-
-@skip_pyarrow
-def test_na_values_dict_aliasing(all_parsers):
- parser = all_parsers
- na_values = {"a": 2, "b": 1}
- na_values_copy = na_values.copy()
-
- names = ["a", "b"]
- data = "1,2\n2,1"
-
- expected = DataFrame([[1.0, 2.0], [np.nan, np.nan]], columns=names)
- result = parser.read_csv(StringIO(data), names=names, na_values=na_values)
-
- tm.assert_frame_equal(result, expected)
- tm.assert_dict_equal(na_values, na_values_copy)
-
-
-@skip_pyarrow
-def test_na_values_dict_col_index(all_parsers):
- # see gh-14203
- data = "a\nfoo\n1"
- parser = all_parsers
- na_values = {0: "foo"}
-
- result = parser.read_csv(StringIO(data), na_values=na_values)
- expected = DataFrame({"a": [np.nan, 1]})
- tm.assert_frame_equal(result, expected)
-
-
-@skip_pyarrow
-@pytest.mark.parametrize(
- "data,kwargs,expected",
- [
- (
- str(2**63) + "\n" + str(2**63 + 1),
- {"na_values": [2**63]},
- DataFrame([str(2**63), str(2**63 + 1)]),
- ),
- (str(2**63) + ",1" + "\n,2", {}, DataFrame([[str(2**63), 1], ["", 2]])),
- (str(2**63) + "\n1", {"na_values": [2**63]}, DataFrame([np.nan, 1])),
- ],
-)
-def test_na_values_uint64(all_parsers, data, kwargs, expected):
- # see gh-14983
- parser = all_parsers
- result = parser.read_csv(StringIO(data), header=None, **kwargs)
- tm.assert_frame_equal(result, expected)
-
-
-def test_empty_na_values_no_default_with_index(all_parsers):
- # see gh-15835
- data = "a,1\nb,2"
- parser = all_parsers
- expected = DataFrame({"1": [2]}, index=Index(["b"], name="a"))
-
- result = parser.read_csv(StringIO(data), index_col=0, keep_default_na=False)
- tm.assert_frame_equal(result, expected)
-
-
-# TODO: Missing support for na_filter keyword
-@skip_pyarrow
-@pytest.mark.parametrize(
- "na_filter,index_data", [(False, ["", "5"]), (True, [np.nan, 5.0])]
-)
-def test_no_na_filter_on_index(all_parsers, na_filter, index_data):
- # see gh-5239
- #
- # Don't parse NA-values in index unless na_filter=True
- parser = all_parsers
- data = "a,b,c\n1,,3\n4,5,6"
-
- expected = DataFrame({"a": [1, 4], "c": [3, 6]}, index=Index(index_data, name="b"))
- result = parser.read_csv(StringIO(data), index_col=[1], na_filter=na_filter)
- tm.assert_frame_equal(result, expected)
-
-
-def test_inf_na_values_with_int_index(all_parsers):
- # see gh-17128
- parser = all_parsers
- data = "idx,col1,col2\n1,3,4\n2,inf,-inf"
-
- # Don't fail with OverflowError with inf's and integer index column.
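- # Both "inf" and "-inf" are listed in na_values, so col1 and col2 should
- # parse as float columns with NaN in the second row.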
- out = parser.read_csv(StringIO(data), index_col=[0], na_values=["inf", "-inf"])
- expected = DataFrame(
- {"col1": [3, np.nan], "col2": [4, np.nan]}, index=Index([1, 2], name="idx")
- )
- tm.assert_frame_equal(out, expected)
-
-
-@skip_pyarrow
-@pytest.mark.parametrize("na_filter", [True, False])
-def test_na_values_with_dtype_str_and_na_filter(all_parsers, na_filter):
- # see gh-20377
- parser = all_parsers
- data = "a,b,c\n1,,3\n4,5,6"
-
- # na_filter=True --> missing value becomes NaN.
- # na_filter=False --> missing value remains empty string.
- empty = np.nan if na_filter else ""
- expected = DataFrame({"a": ["1", "4"], "b": [empty, "5"], "c": ["3", "6"]})
-
- result = parser.read_csv(StringIO(data), na_filter=na_filter, dtype=str)
- tm.assert_frame_equal(result, expected)
-
-
-@skip_pyarrow
-@pytest.mark.parametrize(
- "data, na_values",
- [
- ("false,1\n,1\ntrue", None),
- ("false,1\nnull,1\ntrue", None),
- ("false,1\nnan,1\ntrue", None),
- ("false,1\nfoo,1\ntrue", "foo"),
- ("false,1\nfoo,1\ntrue", ["foo"]),
- ("false,1\nfoo,1\ntrue", {"a": "foo"}),
- ],
-)
-def test_cast_NA_to_bool_raises_error(all_parsers, data, na_values):
- parser = all_parsers
- msg = (
- "(Bool column has NA values in column [0a])|"
- "(cannot safely convert passed user dtype of "
- "bool for object dtyped data in column 0)"
- )
- with pytest.raises(ValueError, match=msg):
- parser.read_csv(
- StringIO(data),
- header=None,
- names=["a", "b"],
- dtype={"a": "bool"},
- na_values=na_values,
- )
-
-
-@skip_pyarrow
-def test_str_nan_dropped(all_parsers):
- # see gh-21131
- parser = all_parsers
-
- data = """File: small.csv,,
-10010010233,0123,654
-foo,,bar
-01001000155,4530,898"""
-
- result = parser.read_csv(
- StringIO(data),
- header=None,
- names=["col1", "col2", "col3"],
- dtype={"col1": str, "col2": str, "col3": str},
- ).dropna()
-
- expected = DataFrame(
- {
- "col1": ["10010010233", "01001000155"],
- "col2": ["0123", "4530"],
- "col3": ["654", "898"],
- },
- index=[1, 3],
- )
-
- tm.assert_frame_equal(result, expected)
-
-
-@skip_pyarrow
-def test_nan_multi_index(all_parsers):
- # GH 42446
- parser = all_parsers
- data = "A,B,B\nX,Y,Z\n1,2,inf"
-
- result = parser.read_csv(
- StringIO(data), header=list(range(2)), na_values={("B", "Z"): "inf"}
- )
-
- expected = DataFrame(
- {
- ("A", "X"): [1],
- ("B", "Y"): [2],
- ("B", "Z"): [np.nan],
- }
- )
-
- tm.assert_frame_equal(result, expected)
-
-
-@xfail_pyarrow
-def test_bool_and_nan_to_bool(all_parsers):
- # GH#42808
- parser = all_parsers
- data = """0
-NaN
-True
-False
-"""
- with pytest.raises(ValueError, match="NA values"):
- parser.read_csv(StringIO(data), dtype="bool")
-
-
-def test_bool_and_nan_to_int(all_parsers):
- # GH#42808
- parser = all_parsers
- data = """0
-NaN
-True
-False
-"""
- with pytest.raises(ValueError, match="convert|NoneType"):
- parser.read_csv(StringIO(data), dtype="int")
-
-
-def test_bool_and_nan_to_float(all_parsers):
- # GH#42808
- parser = all_parsers
- data = """0
-NaN
-True
-False
-"""
- result = parser.read_csv(StringIO(data), dtype="float")
- expected = DataFrame.from_dict({"0": [np.nan, 1.0, 0.0]})
- tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/io/parser/test_network.py b/pandas/tests/io/parser/test_network.py
deleted file mode 100644
index a0d9c6ae99dcf..0000000000000
--- a/pandas/tests/io/parser/test_network.py
+++ /dev/null
@@ -1,318 +0,0 @@
-"""
-Tests parsers' ability to read and parse non-local files,
-which hence require a network connection to be read.
-""" -from io import ( - BytesIO, - StringIO, -) -import logging - -import numpy as np -import pytest - -from pandas.compat import is_ci_environment -import pandas.util._test_decorators as td - -from pandas import DataFrame -import pandas._testing as tm -from pandas.tests.io.test_compression import _compression_to_extension - -from pandas.io.feather_format import read_feather -from pandas.io.parsers import read_csv - - -@pytest.mark.network -@tm.network( - url=( - "https://github.com/pandas-dev/pandas/raw/main/" - "pandas/tests/io/parser/data/salaries.csv" - ), - check_before_test=True, -) -@pytest.mark.parametrize("mode", ["explicit", "infer"]) -@pytest.mark.parametrize("engine", ["python", "c"]) -def test_compressed_urls(salaries_table, mode, engine, compression_only): - # test reading compressed urls with various engines and - # extension inference - extension = _compression_to_extension[compression_only] - base_url = ( - "https://github.com/pandas-dev/pandas/raw/main/" - "pandas/tests/io/parser/data/salaries.csv" - ) - - url = base_url + extension - - if mode != "explicit": - compression_only = mode - - url_table = read_csv(url, sep="\t", compression=compression_only, engine=engine) - tm.assert_frame_equal(url_table, salaries_table) - - -@pytest.mark.network -@tm.network( - url=( - "https://raw.githubusercontent.com/pandas-dev/pandas/main/" - "pandas/tests/io/parser/data/unicode_series.csv" - ), - check_before_test=True, -) -def test_url_encoding_csv(): - """ - read_csv should honor the requested encoding for URLs. - - GH 10424 - """ - path = ( - "https://raw.githubusercontent.com/pandas-dev/pandas/main/" - "pandas/tests/io/parser/data/unicode_series.csv" - ) - df = read_csv(path, encoding="latin-1", header=None) - assert df.loc[15, 1] == "Á köldum klaka (Cold Fever) (1994)" - - -@pytest.fixture -def tips_df(datapath): - """DataFrame with the tips dataset.""" - return read_csv(datapath("io", "data", "csv", "tips.csv")) - - -@pytest.mark.single_cpu -@pytest.mark.usefixtures("s3_resource") -@pytest.mark.xfail( - reason="CI race condition GH 45433, GH 44584", - raises=FileNotFoundError, - strict=False, -) -@td.skip_if_not_us_locale() -class TestS3: - @td.skip_if_no("s3fs") - def test_parse_public_s3_bucket(self, tips_df, s3so): - # more of an integration test due to the not-public contents portion - # can probably mock this though. 
- for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]: - df = read_csv( - "s3://pandas-test/tips.csv" + ext, - compression=comp, - storage_options=s3so, - ) - assert isinstance(df, DataFrame) - assert not df.empty - tm.assert_frame_equal(df, tips_df) - - # Read public file from bucket with not-public contents - df = read_csv("s3://cant_get_it/tips.csv", storage_options=s3so) - assert isinstance(df, DataFrame) - assert not df.empty - tm.assert_frame_equal(df, tips_df) - - def test_parse_public_s3n_bucket(self, tips_df, s3so): - # Read from AWS s3 as "s3n" URL - df = read_csv("s3n://pandas-test/tips.csv", nrows=10, storage_options=s3so) - assert isinstance(df, DataFrame) - assert not df.empty - tm.assert_frame_equal(tips_df.iloc[:10], df) - - def test_parse_public_s3a_bucket(self, tips_df, s3so): - # Read from AWS s3 as "s3a" URL - df = read_csv("s3a://pandas-test/tips.csv", nrows=10, storage_options=s3so) - assert isinstance(df, DataFrame) - assert not df.empty - tm.assert_frame_equal(tips_df.iloc[:10], df) - - def test_parse_public_s3_bucket_nrows(self, tips_df, s3so): - for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]: - df = read_csv( - "s3://pandas-test/tips.csv" + ext, - nrows=10, - compression=comp, - storage_options=s3so, - ) - assert isinstance(df, DataFrame) - assert not df.empty - tm.assert_frame_equal(tips_df.iloc[:10], df) - - def test_parse_public_s3_bucket_chunked(self, tips_df, s3so): - # Read with a chunksize - chunksize = 5 - for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]: - with read_csv( - "s3://pandas-test/tips.csv" + ext, - chunksize=chunksize, - compression=comp, - storage_options=s3so, - ) as df_reader: - assert df_reader.chunksize == chunksize - for i_chunk in [0, 1, 2]: - # Read a couple of chunks and make sure we see them - # properly. - df = df_reader.get_chunk() - assert isinstance(df, DataFrame) - assert not df.empty - true_df = tips_df.iloc[ - chunksize * i_chunk : chunksize * (i_chunk + 1) - ] - tm.assert_frame_equal(true_df, df) - - def test_parse_public_s3_bucket_chunked_python(self, tips_df, s3so): - # Read with a chunksize using the Python parser - chunksize = 5 - for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]: - with read_csv( - "s3://pandas-test/tips.csv" + ext, - chunksize=chunksize, - compression=comp, - engine="python", - storage_options=s3so, - ) as df_reader: - assert df_reader.chunksize == chunksize - for i_chunk in [0, 1, 2]: - # Read a couple of chunks and make sure we see them properly. 
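- # get_chunk() with no argument should advance the reader by
- # `chunksize` rows per call.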
- df = df_reader.get_chunk() - assert isinstance(df, DataFrame) - assert not df.empty - true_df = tips_df.iloc[ - chunksize * i_chunk : chunksize * (i_chunk + 1) - ] - tm.assert_frame_equal(true_df, df) - - def test_parse_public_s3_bucket_python(self, tips_df, s3so): - for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]: - df = read_csv( - "s3://pandas-test/tips.csv" + ext, - engine="python", - compression=comp, - storage_options=s3so, - ) - assert isinstance(df, DataFrame) - assert not df.empty - tm.assert_frame_equal(df, tips_df) - - def test_infer_s3_compression(self, tips_df, s3so): - for ext in ["", ".gz", ".bz2"]: - df = read_csv( - "s3://pandas-test/tips.csv" + ext, - engine="python", - compression="infer", - storage_options=s3so, - ) - assert isinstance(df, DataFrame) - assert not df.empty - tm.assert_frame_equal(df, tips_df) - - def test_parse_public_s3_bucket_nrows_python(self, tips_df, s3so): - for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]: - df = read_csv( - "s3://pandas-test/tips.csv" + ext, - engine="python", - nrows=10, - compression=comp, - storage_options=s3so, - ) - assert isinstance(df, DataFrame) - assert not df.empty - tm.assert_frame_equal(tips_df.iloc[:10], df) - - def test_read_s3_fails(self, s3so): - msg = "The specified bucket does not exist" - with pytest.raises(OSError, match=msg): - read_csv("s3://nyqpug/asdf.csv", storage_options=s3so) - - # Receive a permission error when trying to read a private bucket. - # It's irrelevant here that this isn't actually a table. - with pytest.raises(OSError, match=msg): - read_csv("s3://cant_get_it/file.csv") - - @pytest.mark.xfail(reason="GH#39155 s3fs upgrade", strict=False) - def test_write_s3_csv_fails(self, tips_df, s3so): - # GH 32486 - # Attempting to write to an invalid S3 path should raise - import botocore - - # GH 34087 - # https://boto3.amazonaws.com/v1/documentation/api/latest/guide/error-handling.html - # Catch a ClientError since AWS Service Errors are defined dynamically - error = (FileNotFoundError, botocore.exceptions.ClientError) - - with pytest.raises(error, match="The specified bucket does not exist"): - tips_df.to_csv( - "s3://an_s3_bucket_data_doesnt_exit/not_real.csv", storage_options=s3so - ) - - @pytest.mark.xfail(reason="GH#39155 s3fs upgrade", strict=False) - @td.skip_if_no("pyarrow") - def test_write_s3_parquet_fails(self, tips_df, s3so): - # GH 27679 - # Attempting to write to an invalid S3 path should raise - import botocore - - # GH 34087 - # https://boto3.amazonaws.com/v1/documentation/api/latest/guide/error-handling.html - # Catch a ClientError since AWS Service Errors are defined dynamically - error = (FileNotFoundError, botocore.exceptions.ClientError) - - with pytest.raises(error, match="The specified bucket does not exist"): - tips_df.to_parquet( - "s3://an_s3_bucket_data_doesnt_exit/not_real.parquet", - storage_options=s3so, - ) - - @pytest.mark.single_cpu - def test_read_csv_handles_boto_s3_object(self, s3_resource, tips_file): - # see gh-16135 - - s3_object = s3_resource.meta.client.get_object( - Bucket="pandas-test", Key="tips.csv" - ) - - with BytesIO(s3_object["Body"].read()) as buffer: - result = read_csv(buffer, encoding="utf8") - assert isinstance(result, DataFrame) - assert not result.empty - - expected = read_csv(tips_file) - tm.assert_frame_equal(result, expected) - - @pytest.mark.single_cpu - @pytest.mark.skipif( - is_ci_environment(), - reason="GH: 45651: This test can hang in our CI min_versions build", - ) - def 
test_read_csv_chunked_download(self, s3_resource, caplog, s3so): - # 8 MB, S3FS uses 5MB chunks - import s3fs - - df = DataFrame(np.random.randn(100000, 4), columns=list("abcd")) - str_buf = StringIO() - - df.to_csv(str_buf) - - buf = BytesIO(str_buf.getvalue().encode("utf-8")) - - s3_resource.Bucket("pandas-test").put_object(Key="large-file.csv", Body=buf) - - # Possibly some state leaking in between tests. - # If we don't clear this cache, we saw `GetObject operation: Forbidden`. - # Presumably the s3fs instance is being cached, with the directory listing - # from *before* we add the large-file.csv in the pandas-test bucket. - s3fs.S3FileSystem.clear_instance_cache() - - with caplog.at_level(logging.DEBUG, logger="s3fs"): - read_csv("s3://pandas-test/large-file.csv", nrows=5, storage_options=s3so) - # log of fetch_range (start, stop) - assert (0, 5505024) in (x.args[-2:] for x in caplog.records) - - def test_read_s3_with_hash_in_key(self, tips_df, s3so): - # GH 25945 - result = read_csv("s3://pandas-test/tips#1.csv", storage_options=s3so) - tm.assert_frame_equal(tips_df, result) - - @td.skip_if_no("pyarrow") - def test_read_feather_s3_file_path(self, feather_file, s3so): - # GH 29055 - expected = read_feather(feather_file) - res = read_feather( - "s3://pandas-test/simple_dataset.feather", storage_options=s3so - ) - tm.assert_frame_equal(expected, res) diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py deleted file mode 100644 index 571e09bb5e9dd..0000000000000 --- a/pandas/tests/io/parser/test_parse_dates.py +++ /dev/null @@ -1,2239 +0,0 @@ -""" -Tests date parsing functionality for all of the -parsers defined in parsers.py -""" - -from datetime import ( - date, - datetime, - timedelta, - timezone, -) -from io import StringIO - -from dateutil.parser import parse as du_parse -from hypothesis import given -import numpy as np -import pytest -import pytz - -from pandas._libs.tslibs import parsing -from pandas._libs.tslibs.parsing import py_parse_datetime_string - -import pandas as pd -from pandas import ( - DataFrame, - DatetimeIndex, - Index, - MultiIndex, - Series, - Timestamp, -) -import pandas._testing as tm -from pandas._testing._hypothesis import DATETIME_NO_TZ -from pandas.core.indexes.datetimes import date_range - -from pandas.io.parsers import read_csv - -xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") - -# GH#43650: Some expected failures with the pyarrow engine can occasionally -# cause a deadlock instead, so we skip these instead of xfailing -skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") - - -@xfail_pyarrow -def test_read_csv_with_custom_date_parser(all_parsers): - # GH36111 - def __custom_date_parser(time): - time = time.astype(np.float_) - time = time.astype(np.int_) # convert float seconds to int type - return pd.to_timedelta(time, unit="s") - - testdata = StringIO( - """time e n h - 41047.00 -98573.7297 871458.0640 389.0089 - 41048.00 -98573.7299 871458.0640 389.0089 - 41049.00 -98573.7300 871458.0642 389.0088 - 41050.00 -98573.7299 871458.0643 389.0088 - 41051.00 -98573.7302 871458.0640 389.0086 - """ - ) - result = all_parsers.read_csv_check_warnings( - FutureWarning, - "Please use 'date_format' instead", - testdata, - delim_whitespace=True, - parse_dates=True, - date_parser=__custom_date_parser, - index_col="time", - ) - time = [41047, 41048, 41049, 41050, 41051] - time = pd.TimedeltaIndex([pd.to_timedelta(i, unit="s") for i in time], name="time") - expected = DataFrame( - { - "e": [-98573.7297, 
-98573.7299, -98573.7300, -98573.7299, -98573.7302], - "n": [871458.0640, 871458.0640, 871458.0642, 871458.0643, 871458.0640], - "h": [389.0089, 389.0089, 389.0088, 389.0088, 389.0086], - }, - index=time, - ) - - tm.assert_frame_equal(result, expected) - - -@xfail_pyarrow -def test_read_csv_with_custom_date_parser_parse_dates_false(all_parsers): - # GH44366 - def __custom_date_parser(time): - time = time.astype(np.float_) - time = time.astype(np.int_) # convert float seconds to int type - return pd.to_timedelta(time, unit="s") - - testdata = StringIO( - """time e - 41047.00 -93.77 - 41048.00 -95.79 - 41049.00 -98.73 - 41050.00 -93.99 - 41051.00 -97.72 - """ - ) - result = all_parsers.read_csv_check_warnings( - FutureWarning, - "Please use 'date_format' instead", - testdata, - delim_whitespace=True, - parse_dates=False, - date_parser=__custom_date_parser, - index_col="time", - ) - time = Series([41047.00, 41048.00, 41049.00, 41050.00, 41051.00], name="time") - expected = DataFrame( - {"e": [-93.77, -95.79, -98.73, -93.99, -97.72]}, - index=time, - ) - - tm.assert_frame_equal(result, expected) - - -@xfail_pyarrow -def test_separator_date_conflict(all_parsers): - # Regression test for gh-4678 - # - # Make sure thousands separator and - # date parsing do not conflict. - parser = all_parsers - data = "06-02-2013;13:00;1-000.215" - expected = DataFrame( - [[datetime(2013, 6, 2, 13, 0, 0), 1000.215]], columns=["Date", 2] - ) - - df = parser.read_csv( - StringIO(data), - sep=";", - thousands="-", - parse_dates={"Date": [0, 1]}, - header=None, - ) - tm.assert_frame_equal(df, expected) - - -@pytest.mark.parametrize("keep_date_col", [True, False]) -def test_multiple_date_col_custom(all_parsers, keep_date_col, request): - data = """\ -KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 -KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 -KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 -KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 -KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 -KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000 -""" - parser = all_parsers - - if keep_date_col and parser.engine == "pyarrow": - # For this to pass, we need to disable auto-inference on the date columns - # in parse_dates. We have no way of doing this though - mark = pytest.mark.xfail( - reason="pyarrow doesn't support disabling auto-inference on column numbers." - ) - request.node.add_marker(mark) - - def date_parser(*date_cols): - """ - Test date parser. - - Parameters - ---------- - date_cols : args - The list of data columns to parse. 
- - Returns - ------- - parsed : Series - """ - return parsing.try_parse_dates( - parsing.concat_date_cols(date_cols), parser=du_parse - ) - - kwds = { - "header": None, - "date_parser": date_parser, - "parse_dates": {"actual": [1, 2], "nominal": [1, 3]}, - "keep_date_col": keep_date_col, - "names": ["X0", "X1", "X2", "X3", "X4", "X5", "X6", "X7", "X8"], - } - result = parser.read_csv_check_warnings( - FutureWarning, - "use 'date_format' instead", - StringIO(data), - **kwds, - ) - - expected = DataFrame( - [ - [ - datetime(1999, 1, 27, 19, 0), - datetime(1999, 1, 27, 18, 56), - "KORD", - "19990127", - " 19:00:00", - " 18:56:00", - 0.81, - 2.81, - 7.2, - 0.0, - 280.0, - ], - [ - datetime(1999, 1, 27, 20, 0), - datetime(1999, 1, 27, 19, 56), - "KORD", - "19990127", - " 20:00:00", - " 19:56:00", - 0.01, - 2.21, - 7.2, - 0.0, - 260.0, - ], - [ - datetime(1999, 1, 27, 21, 0), - datetime(1999, 1, 27, 20, 56), - "KORD", - "19990127", - " 21:00:00", - " 20:56:00", - -0.59, - 2.21, - 5.7, - 0.0, - 280.0, - ], - [ - datetime(1999, 1, 27, 21, 0), - datetime(1999, 1, 27, 21, 18), - "KORD", - "19990127", - " 21:00:00", - " 21:18:00", - -0.99, - 2.01, - 3.6, - 0.0, - 270.0, - ], - [ - datetime(1999, 1, 27, 22, 0), - datetime(1999, 1, 27, 21, 56), - "KORD", - "19990127", - " 22:00:00", - " 21:56:00", - -0.59, - 1.71, - 5.1, - 0.0, - 290.0, - ], - [ - datetime(1999, 1, 27, 23, 0), - datetime(1999, 1, 27, 22, 56), - "KORD", - "19990127", - " 23:00:00", - " 22:56:00", - -0.59, - 1.71, - 4.6, - 0.0, - 280.0, - ], - ], - columns=[ - "actual", - "nominal", - "X0", - "X1", - "X2", - "X3", - "X4", - "X5", - "X6", - "X7", - "X8", - ], - ) - - if not keep_date_col: - expected = expected.drop(["X1", "X2", "X3"], axis=1) - - # Python can sometimes be flaky about how - # the aggregated columns are entered, so - # this standardizes the order. - result = result[expected.columns] - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("container", [list, tuple, Index, Series]) -@pytest.mark.parametrize("dim", [1, 2]) -def test_concat_date_col_fail(container, dim): - msg = "not all elements from date_cols are numpy arrays" - value = "19990127" - - date_cols = tuple(container([value]) for _ in range(dim)) - - with pytest.raises(ValueError, match=msg): - parsing.concat_date_cols(date_cols) - - -@pytest.mark.parametrize("keep_date_col", [True, False]) -def test_multiple_date_col(all_parsers, keep_date_col, request): - data = """\ -KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 -KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 -KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 -KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 -KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 -KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000 -""" - parser = all_parsers - - if keep_date_col and parser.engine == "pyarrow": - # For this to pass, we need to disable auto-inference on the date columns - # in parse_dates. We have no way of doing this though - mark = pytest.mark.xfail( - reason="pyarrow doesn't support disabling auto-inference on column numbers." 
- ) - request.node.add_marker(mark) - - kwds = { - "header": None, - "parse_dates": [[1, 2], [1, 3]], - "keep_date_col": keep_date_col, - "names": ["X0", "X1", "X2", "X3", "X4", "X5", "X6", "X7", "X8"], - } - result = parser.read_csv(StringIO(data), **kwds) - - expected = DataFrame( - [ - [ - datetime(1999, 1, 27, 19, 0), - datetime(1999, 1, 27, 18, 56), - "KORD", - "19990127", - " 19:00:00", - " 18:56:00", - 0.81, - 2.81, - 7.2, - 0.0, - 280.0, - ], - [ - datetime(1999, 1, 27, 20, 0), - datetime(1999, 1, 27, 19, 56), - "KORD", - "19990127", - " 20:00:00", - " 19:56:00", - 0.01, - 2.21, - 7.2, - 0.0, - 260.0, - ], - [ - datetime(1999, 1, 27, 21, 0), - datetime(1999, 1, 27, 20, 56), - "KORD", - "19990127", - " 21:00:00", - " 20:56:00", - -0.59, - 2.21, - 5.7, - 0.0, - 280.0, - ], - [ - datetime(1999, 1, 27, 21, 0), - datetime(1999, 1, 27, 21, 18), - "KORD", - "19990127", - " 21:00:00", - " 21:18:00", - -0.99, - 2.01, - 3.6, - 0.0, - 270.0, - ], - [ - datetime(1999, 1, 27, 22, 0), - datetime(1999, 1, 27, 21, 56), - "KORD", - "19990127", - " 22:00:00", - " 21:56:00", - -0.59, - 1.71, - 5.1, - 0.0, - 290.0, - ], - [ - datetime(1999, 1, 27, 23, 0), - datetime(1999, 1, 27, 22, 56), - "KORD", - "19990127", - " 23:00:00", - " 22:56:00", - -0.59, - 1.71, - 4.6, - 0.0, - 280.0, - ], - ], - columns=[ - "X1_X2", - "X1_X3", - "X0", - "X1", - "X2", - "X3", - "X4", - "X5", - "X6", - "X7", - "X8", - ], - ) - - if not keep_date_col: - expected = expected.drop(["X1", "X2", "X3"], axis=1) - - tm.assert_frame_equal(result, expected) - - -def test_date_col_as_index_col(all_parsers): - data = """\ -KORD,19990127 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 -KORD,19990127 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 -KORD,19990127 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 -KORD,19990127 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 -KORD,19990127 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 -""" - parser = all_parsers - kwds = { - "header": None, - "parse_dates": [1], - "index_col": 1, - "names": ["X0", "X1", "X2", "X3", "X4", "X5", "X6", "X7"], - } - result = parser.read_csv(StringIO(data), **kwds) - - index = Index( - [ - datetime(1999, 1, 27, 19, 0), - datetime(1999, 1, 27, 20, 0), - datetime(1999, 1, 27, 21, 0), - datetime(1999, 1, 27, 21, 0), - datetime(1999, 1, 27, 22, 0), - ], - name="X1", - ) - expected = DataFrame( - [ - ["KORD", " 18:56:00", 0.81, 2.81, 7.2, 0.0, 280.0], - ["KORD", " 19:56:00", 0.01, 2.21, 7.2, 0.0, 260.0], - ["KORD", " 20:56:00", -0.59, 2.21, 5.7, 0.0, 280.0], - ["KORD", " 21:18:00", -0.99, 2.01, 3.6, 0.0, 270.0], - ["KORD", " 21:56:00", -0.59, 1.71, 5.1, 0.0, 290.0], - ], - columns=["X0", "X2", "X3", "X4", "X5", "X6", "X7"], - index=index, - ) - if parser.engine == "pyarrow": - # https://github.com/pandas-dev/pandas/issues/44231 - # pyarrow 6.0 starts to infer time type - expected["X2"] = pd.to_datetime("1970-01-01" + expected["X2"]).dt.time - - tm.assert_frame_equal(result, expected) - - -def test_multiple_date_cols_int_cast(all_parsers): - data = ( - "KORD,19990127, 19:00:00, 18:56:00, 0.8100\n" - "KORD,19990127, 20:00:00, 19:56:00, 0.0100\n" - "KORD,19990127, 21:00:00, 20:56:00, -0.5900\n" - "KORD,19990127, 21:00:00, 21:18:00, -0.9900\n" - "KORD,19990127, 22:00:00, 21:56:00, -0.5900\n" - "KORD,19990127, 23:00:00, 22:56:00, -0.5900" - ) - parse_dates = {"actual": [1, 2], "nominal": [1, 3]} - parser = all_parsers - - kwds = { - "header": None, - "parse_dates": parse_dates, - "date_parser": 
pd.to_datetime, - } - result = parser.read_csv_check_warnings( - FutureWarning, "use 'date_format' instead", StringIO(data), **kwds - ) - - expected = DataFrame( - [ - [datetime(1999, 1, 27, 19, 0), datetime(1999, 1, 27, 18, 56), "KORD", 0.81], - [datetime(1999, 1, 27, 20, 0), datetime(1999, 1, 27, 19, 56), "KORD", 0.01], - [ - datetime(1999, 1, 27, 21, 0), - datetime(1999, 1, 27, 20, 56), - "KORD", - -0.59, - ], - [ - datetime(1999, 1, 27, 21, 0), - datetime(1999, 1, 27, 21, 18), - "KORD", - -0.99, - ], - [ - datetime(1999, 1, 27, 22, 0), - datetime(1999, 1, 27, 21, 56), - "KORD", - -0.59, - ], - [ - datetime(1999, 1, 27, 23, 0), - datetime(1999, 1, 27, 22, 56), - "KORD", - -0.59, - ], - ], - columns=["actual", "nominal", 0, 4], - ) - - # Python can sometimes be flaky about how - # the aggregated columns are entered, so - # this standardizes the order. - result = result[expected.columns] - tm.assert_frame_equal(result, expected) - - -def test_multiple_date_col_timestamp_parse(all_parsers): - parser = all_parsers - data = """05/31/2012,15:30:00.029,1306.25,1,E,0,,1306.25 -05/31/2012,15:30:00.029,1306.25,8,E,0,,1306.25""" - - result = parser.read_csv_check_warnings( - FutureWarning, - "use 'date_format' instead", - StringIO(data), - parse_dates=[[0, 1]], - header=None, - date_parser=Timestamp, - ) - expected = DataFrame( - [ - [ - Timestamp("05/31/2012, 15:30:00.029"), - 1306.25, - 1, - "E", - 0, - np.nan, - 1306.25, - ], - [ - Timestamp("05/31/2012, 15:30:00.029"), - 1306.25, - 8, - "E", - 0, - np.nan, - 1306.25, - ], - ], - columns=["0_1", 2, 3, 4, 5, 6, 7], - ) - tm.assert_frame_equal(result, expected) - - -@xfail_pyarrow -def test_multiple_date_cols_with_header(all_parsers): - parser = all_parsers - data = """\ -ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir -KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 -KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 -KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 -KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 -KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 -KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000""" - - result = parser.read_csv(StringIO(data), parse_dates={"nominal": [1, 2]}) - expected = DataFrame( - [ - [ - datetime(1999, 1, 27, 19, 0), - "KORD", - " 18:56:00", - 0.81, - 2.81, - 7.2, - 0.0, - 280.0, - ], - [ - datetime(1999, 1, 27, 20, 0), - "KORD", - " 19:56:00", - 0.01, - 2.21, - 7.2, - 0.0, - 260.0, - ], - [ - datetime(1999, 1, 27, 21, 0), - "KORD", - " 20:56:00", - -0.59, - 2.21, - 5.7, - 0.0, - 280.0, - ], - [ - datetime(1999, 1, 27, 21, 0), - "KORD", - " 21:18:00", - -0.99, - 2.01, - 3.6, - 0.0, - 270.0, - ], - [ - datetime(1999, 1, 27, 22, 0), - "KORD", - " 21:56:00", - -0.59, - 1.71, - 5.1, - 0.0, - 290.0, - ], - [ - datetime(1999, 1, 27, 23, 0), - "KORD", - " 22:56:00", - -0.59, - 1.71, - 4.6, - 0.0, - 280.0, - ], - ], - columns=[ - "nominal", - "ID", - "ActualTime", - "TDew", - "TAir", - "Windspeed", - "Precip", - "WindDir", - ], - ) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - "data,parse_dates,msg", - [ - ( - """\ -date_NominalTime,date,NominalTime -KORD1,19990127, 19:00:00 -KORD2,19990127, 20:00:00""", - [[1, 2]], - ("New date column already in dict date_NominalTime"), - ), - ( - """\ -ID,date,nominalTime -KORD,19990127, 19:00:00 -KORD,19990127, 20:00:00""", - {"ID": [1, 2]}, - "Date column ID already in 
dict", - ), - ], -) -def test_multiple_date_col_name_collision(all_parsers, data, parse_dates, msg): - parser = all_parsers - - with pytest.raises(ValueError, match=msg): - parser.read_csv(StringIO(data), parse_dates=parse_dates) - - -def test_date_parser_int_bug(all_parsers): - # see gh-3071 - parser = all_parsers - data = ( - "posix_timestamp,elapsed,sys,user,queries,query_time,rows," - "accountid,userid,contactid,level,silo,method\n" - "1343103150,0.062353,0,4,6,0.01690,3," - "12345,1,-1,3,invoice_InvoiceResource,search\n" - ) - - result = parser.read_csv_check_warnings( - FutureWarning, - "use 'date_format' instead", - StringIO(data), - index_col=0, - parse_dates=[0], - date_parser=lambda x: datetime.utcfromtimestamp(int(x)), - ) - expected = DataFrame( - [ - [ - 0.062353, - 0, - 4, - 6, - 0.01690, - 3, - 12345, - 1, - -1, - 3, - "invoice_InvoiceResource", - "search", - ] - ], - columns=[ - "elapsed", - "sys", - "user", - "queries", - "query_time", - "rows", - "accountid", - "userid", - "contactid", - "level", - "silo", - "method", - ], - index=Index([Timestamp("2012-07-24 04:12:30")], name="posix_timestamp"), - ) - tm.assert_frame_equal(result, expected) - - -@xfail_pyarrow -def test_nat_parse(all_parsers): - # see gh-3062 - parser = all_parsers - df = DataFrame( - { - "A": np.arange(10, dtype="float64"), - "B": Timestamp("20010101").as_unit("ns"), - } - ) - df.iloc[3:6, :] = np.nan - - with tm.ensure_clean("__nat_parse_.csv") as path: - df.to_csv(path) - - result = parser.read_csv(path, index_col=0, parse_dates=["B"]) - tm.assert_frame_equal(result, df) - - -@xfail_pyarrow -def test_csv_custom_parser(all_parsers): - data = """A,B,C -20090101,a,1,2 -20090102,b,3,4 -20090103,c,4,5 -""" - parser = all_parsers - result = parser.read_csv_check_warnings( - FutureWarning, - "use 'date_format' instead", - StringIO(data), - date_parser=lambda x: datetime.strptime(x, "%Y%m%d"), - ) - expected = parser.read_csv(StringIO(data), parse_dates=True) - tm.assert_frame_equal(result, expected) - result = parser.read_csv(StringIO(data), date_format="%Y%m%d") - tm.assert_frame_equal(result, expected) - - -@xfail_pyarrow -def test_parse_dates_implicit_first_col(all_parsers): - data = """A,B,C -20090101,a,1,2 -20090102,b,3,4 -20090103,c,4,5 -""" - parser = all_parsers - result = parser.read_csv(StringIO(data), parse_dates=True) - - expected = parser.read_csv(StringIO(data), index_col=0, parse_dates=True) - tm.assert_frame_equal(result, expected) - - -@xfail_pyarrow -def test_parse_dates_string(all_parsers): - data = """date,A,B,C -20090101,a,1,2 -20090102,b,3,4 -20090103,c,4,5 -""" - parser = all_parsers - result = parser.read_csv(StringIO(data), index_col="date", parse_dates=["date"]) - # freq doesn't round-trip - index = DatetimeIndex( - list(date_range("1/1/2009", periods=3)), name="date", freq=None - ) - - expected = DataFrame( - {"A": ["a", "b", "c"], "B": [1, 3, 4], "C": [2, 4, 5]}, index=index - ) - tm.assert_frame_equal(result, expected) - - -# Bug in https://github.com/dateutil/dateutil/issues/217 -# has been addressed, but we just don't pass in the `yearfirst` -@pytest.mark.xfail(reason="yearfirst is not surfaced in read_*") -@pytest.mark.parametrize("parse_dates", [[["date", "time"]], [[0, 1]]]) -def test_yy_format_with_year_first(all_parsers, parse_dates): - data = """date,time,B,C -090131,0010,1,2 -090228,1020,3,4 -090331,0830,5,6 -""" - parser = all_parsers - result = parser.read_csv_check_warnings( - UserWarning, - "Could not infer format", - StringIO(data), - index_col=0, - 
parse_dates=parse_dates, - ) - index = DatetimeIndex( - [ - datetime(2009, 1, 31, 0, 10, 0), - datetime(2009, 2, 28, 10, 20, 0), - datetime(2009, 3, 31, 8, 30, 0), - ], - dtype=object, - name="date_time", - ) - expected = DataFrame({"B": [1, 3, 5], "C": [2, 4, 6]}, index=index) - tm.assert_frame_equal(result, expected) - - -@xfail_pyarrow -@pytest.mark.parametrize("parse_dates", [[0, 2], ["a", "c"]]) -def test_parse_dates_column_list(all_parsers, parse_dates): - data = "a,b,c\n01/01/2010,1,15/02/2010" - parser = all_parsers - - expected = DataFrame( - {"a": [datetime(2010, 1, 1)], "b": [1], "c": [datetime(2010, 2, 15)]} - ) - expected = expected.set_index(["a", "b"]) - - result = parser.read_csv( - StringIO(data), index_col=[0, 1], parse_dates=parse_dates, dayfirst=True - ) - tm.assert_frame_equal(result, expected) - - -@xfail_pyarrow -@pytest.mark.parametrize("index_col", [[0, 1], [1, 0]]) -def test_multi_index_parse_dates(all_parsers, index_col): - data = """index1,index2,A,B,C -20090101,one,a,1,2 -20090101,two,b,3,4 -20090101,three,c,4,5 -20090102,one,a,1,2 -20090102,two,b,3,4 -20090102,three,c,4,5 -20090103,one,a,1,2 -20090103,two,b,3,4 -20090103,three,c,4,5 -""" - parser = all_parsers - index = MultiIndex.from_product( - [ - (datetime(2009, 1, 1), datetime(2009, 1, 2), datetime(2009, 1, 3)), - ("one", "two", "three"), - ], - names=["index1", "index2"], - ) - - # Out of order. - if index_col == [1, 0]: - index = index.swaplevel(0, 1) - - expected = DataFrame( - [ - ["a", 1, 2], - ["b", 3, 4], - ["c", 4, 5], - ["a", 1, 2], - ["b", 3, 4], - ["c", 4, 5], - ["a", 1, 2], - ["b", 3, 4], - ["c", 4, 5], - ], - columns=["A", "B", "C"], - index=index, - ) - result = parser.read_csv_check_warnings( - UserWarning, - "Could not infer format", - StringIO(data), - index_col=index_col, - parse_dates=True, - ) - tm.assert_frame_equal(result, expected) - - -@xfail_pyarrow -@pytest.mark.parametrize("kwargs", [{"dayfirst": True}, {"day_first": True}]) -def test_parse_dates_custom_euro_format(all_parsers, kwargs): - parser = all_parsers - data = """foo,bar,baz -31/01/2010,1,2 -01/02/2010,1,NA -02/02/2010,1,2 -""" - if "dayfirst" in kwargs: - df = parser.read_csv_check_warnings( - FutureWarning, - "use 'date_format' instead", - StringIO(data), - names=["time", "Q", "NTU"], - date_parser=lambda d: du_parse(d, **kwargs), - header=0, - index_col=0, - parse_dates=True, - na_values=["NA"], - ) - exp_index = Index( - [datetime(2010, 1, 31), datetime(2010, 2, 1), datetime(2010, 2, 2)], - name="time", - ) - expected = DataFrame( - {"Q": [1, 1, 1], "NTU": [2, np.nan, 2]}, - index=exp_index, - columns=["Q", "NTU"], - ) - tm.assert_frame_equal(df, expected) - else: - msg = "got an unexpected keyword argument 'day_first'" - with pytest.raises(TypeError, match=msg): - parser.read_csv_check_warnings( - FutureWarning, - "use 'date_format' instead", - StringIO(data), - names=["time", "Q", "NTU"], - date_parser=lambda d: du_parse(d, **kwargs), - skiprows=[0], - index_col=0, - parse_dates=True, - na_values=["NA"], - ) - - -def test_parse_tz_aware(all_parsers, request): - # See gh-1693 - parser = all_parsers - data = "Date,x\n2012-06-13T01:39:00Z,0.5" - - result = parser.read_csv(StringIO(data), index_col=0, parse_dates=True) - expected = DataFrame( - {"x": [0.5]}, index=Index([Timestamp("2012-06-13 01:39:00+00:00")], name="Date") - ) - tm.assert_frame_equal(result, expected) - if parser.engine == "pyarrow": - expected_tz = pytz.utc - else: - expected_tz = timezone.utc - assert result.index.tz is expected_tz - - 
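The tz-aware and custom-format tests above all funnel through the same read_csv surface: parse_dates selects the columns to convert, date_format supplies an explicit format, and the deprecated date_parser callable is what the asserted FutureWarning ("use 'date_format' instead") steers users away from. A minimal standalone sketch of that surface — the data and column names here are illustrative, not the fixtures used in the deleted tests:

    from io import StringIO

    import pandas as pd

    # ISO8601 strings with an explicit UTC offset parse to a tz-aware
    # DatetimeIndex, which is what test_parse_tz_aware asserts above.
    df = pd.read_csv(
        StringIO("Date,x\n2012-06-13T01:39:00Z,0.5"),
        index_col=0,
        parse_dates=True,
    )
    assert str(df.index.tz) == "UTC"

    # date_format is the replacement the FutureWarning points to:
    # equivalent to the deprecated
    # date_parser=lambda x: pd.to_datetime(x, format="%Y%m%d"),
    # but without a per-column Python callable.
    df2 = pd.read_csv(
        StringIO("d,a\n20090101,1\n20090102,2"),
        parse_dates=["d"],
        date_format="%Y%m%d",
    )
    assert df2["d"].dtype == "datetime64[ns]"
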
-@xfail_pyarrow -@pytest.mark.parametrize( - "parse_dates,index_col", - [({"nominal": [1, 2]}, "nominal"), ({"nominal": [1, 2]}, 0), ([[1, 2]], 0)], -) -def test_multiple_date_cols_index(all_parsers, parse_dates, index_col): - parser = all_parsers - data = """ -ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir -KORD1,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 -KORD2,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 -KORD3,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 -KORD4,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 -KORD5,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 -KORD6,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000 -""" - expected = DataFrame( - [ - [ - datetime(1999, 1, 27, 19, 0), - "KORD1", - " 18:56:00", - 0.81, - 2.81, - 7.2, - 0.0, - 280.0, - ], - [ - datetime(1999, 1, 27, 20, 0), - "KORD2", - " 19:56:00", - 0.01, - 2.21, - 7.2, - 0.0, - 260.0, - ], - [ - datetime(1999, 1, 27, 21, 0), - "KORD3", - " 20:56:00", - -0.59, - 2.21, - 5.7, - 0.0, - 280.0, - ], - [ - datetime(1999, 1, 27, 21, 0), - "KORD4", - " 21:18:00", - -0.99, - 2.01, - 3.6, - 0.0, - 270.0, - ], - [ - datetime(1999, 1, 27, 22, 0), - "KORD5", - " 21:56:00", - -0.59, - 1.71, - 5.1, - 0.0, - 290.0, - ], - [ - datetime(1999, 1, 27, 23, 0), - "KORD6", - " 22:56:00", - -0.59, - 1.71, - 4.6, - 0.0, - 280.0, - ], - ], - columns=[ - "nominal", - "ID", - "ActualTime", - "TDew", - "TAir", - "Windspeed", - "Precip", - "WindDir", - ], - ) - expected = expected.set_index("nominal") - - if not isinstance(parse_dates, dict): - expected.index.name = "date_NominalTime" - - result = parser.read_csv( - StringIO(data), parse_dates=parse_dates, index_col=index_col - ) - tm.assert_frame_equal(result, expected) - - -@xfail_pyarrow -def test_multiple_date_cols_chunked(all_parsers): - parser = all_parsers - data = """\ -ID,date,nominalTime,actualTime,A,B,C,D,E -KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 -KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 -KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 -KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 -KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 -KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000 -""" - - expected = DataFrame( - [ - [ - datetime(1999, 1, 27, 19, 0), - "KORD", - " 18:56:00", - 0.81, - 2.81, - 7.2, - 0.0, - 280.0, - ], - [ - datetime(1999, 1, 27, 20, 0), - "KORD", - " 19:56:00", - 0.01, - 2.21, - 7.2, - 0.0, - 260.0, - ], - [ - datetime(1999, 1, 27, 21, 0), - "KORD", - " 20:56:00", - -0.59, - 2.21, - 5.7, - 0.0, - 280.0, - ], - [ - datetime(1999, 1, 27, 21, 0), - "KORD", - " 21:18:00", - -0.99, - 2.01, - 3.6, - 0.0, - 270.0, - ], - [ - datetime(1999, 1, 27, 22, 0), - "KORD", - " 21:56:00", - -0.59, - 1.71, - 5.1, - 0.0, - 290.0, - ], - [ - datetime(1999, 1, 27, 23, 0), - "KORD", - " 22:56:00", - -0.59, - 1.71, - 4.6, - 0.0, - 280.0, - ], - ], - columns=["nominal", "ID", "actualTime", "A", "B", "C", "D", "E"], - ) - expected = expected.set_index("nominal") - - with parser.read_csv( - StringIO(data), - parse_dates={"nominal": [1, 2]}, - index_col="nominal", - chunksize=2, - ) as reader: - chunks = list(reader) - - tm.assert_frame_equal(chunks[0], expected[:2]) - tm.assert_frame_equal(chunks[1], expected[2:4]) - 
tm.assert_frame_equal(chunks[2], expected[4:]) - - -def test_multiple_date_col_named_index_compat(all_parsers): - parser = all_parsers - data = """\ -ID,date,nominalTime,actualTime,A,B,C,D,E -KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 -KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 -KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 -KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 -KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 -KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000 -""" - - with_indices = parser.read_csv( - StringIO(data), parse_dates={"nominal": [1, 2]}, index_col="nominal" - ) - with_names = parser.read_csv( - StringIO(data), - index_col="nominal", - parse_dates={"nominal": ["date", "nominalTime"]}, - ) - tm.assert_frame_equal(with_indices, with_names) - - -def test_multiple_date_col_multiple_index_compat(all_parsers): - parser = all_parsers - data = """\ -ID,date,nominalTime,actualTime,A,B,C,D,E -KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 -KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 -KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 -KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 -KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 -KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000 -""" - result = parser.read_csv( - StringIO(data), index_col=["nominal", "ID"], parse_dates={"nominal": [1, 2]} - ) - expected = parser.read_csv(StringIO(data), parse_dates={"nominal": [1, 2]}) - - expected = expected.set_index(["nominal", "ID"]) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("kwargs", [{}, {"index_col": "C"}]) -def test_read_with_parse_dates_scalar_non_bool(all_parsers, kwargs): - # see gh-5636 - parser = all_parsers - msg = ( - "Only booleans, lists, and dictionaries " - "are accepted for the 'parse_dates' parameter" - ) - data = """A,B,C - 1,2,2003-11-1""" - - with pytest.raises(TypeError, match=msg): - parser.read_csv(StringIO(data), parse_dates="C", **kwargs) - - -@pytest.mark.parametrize("parse_dates", [(1,), np.array([4, 5]), {1, 3}]) -def test_read_with_parse_dates_invalid_type(all_parsers, parse_dates): - parser = all_parsers - msg = ( - "Only booleans, lists, and dictionaries " - "are accepted for the 'parse_dates' parameter" - ) - data = """A,B,C - 1,2,2003-11-1""" - - with pytest.raises(TypeError, match=msg): - parser.read_csv(StringIO(data), parse_dates=(1,)) - - -@pytest.mark.parametrize("cache_dates", [True, False]) -@pytest.mark.parametrize("value", ["nan", ""]) -def test_bad_date_parse(all_parsers, cache_dates, value): - # if we have an invalid date make sure that we handle this with - # and w/o the cache properly - parser = all_parsers - s = StringIO((f"{value},\n") * 50000) - - parser.read_csv( - s, - header=None, - names=["foo", "bar"], - parse_dates=["foo"], - cache_dates=cache_dates, - ) - - -@pytest.mark.parametrize("cache_dates", [True, False]) -@pytest.mark.parametrize("value", ["0"]) -def test_bad_date_parse_with_warning(all_parsers, cache_dates, value): - # if we have an invalid date make sure that we handle this with - # and w/o the cache properly. 
- parser = all_parsers - s = StringIO((f"{value},\n") * 50000) - - if parser.engine == "pyarrow": - # pyarrow reads "0" as 0 (of type int64), and so - # pandas doesn't try to guess the datetime format - # TODO: parse dates directly in pyarrow, see - # https://github.com/pandas-dev/pandas/issues/48017 - warn = None - elif cache_dates: - # Note: warning is not raised if 'cache_dates', because here there is only a - # single unique date and hence no risk of inconsistent parsing. - warn = None - else: - warn = UserWarning - parser.read_csv_check_warnings( - warn, - "Could not infer format", - s, - header=None, - names=["foo", "bar"], - parse_dates=["foo"], - cache_dates=cache_dates, - ) - - -@xfail_pyarrow -def test_parse_dates_empty_string(all_parsers): - # see gh-2263 - parser = all_parsers - data = "Date,test\n2012-01-01,1\n,2" - result = parser.read_csv(StringIO(data), parse_dates=["Date"], na_filter=False) - - expected = DataFrame( - [[datetime(2012, 1, 1), 1], [pd.NaT, 2]], columns=["Date", "test"] - ) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - "reader", ["read_csv_check_warnings", "read_table_check_warnings"] -) -def test_parse_dates_infer_datetime_format_warning(all_parsers, reader): - # GH 49024, 51017 - parser = all_parsers - data = "Date,test\n2012-01-01,1\n,2" - - getattr(parser, reader)( - FutureWarning, - "The argument 'infer_datetime_format' is deprecated", - StringIO(data), - parse_dates=["Date"], - infer_datetime_format=True, - sep=",", - ) - - -@pytest.mark.parametrize( - "reader", ["read_csv_check_warnings", "read_table_check_warnings"] -) -def test_parse_dates_date_parser_and_date_format(all_parsers, reader): - # GH 50601 - parser = all_parsers - data = "Date,test\n2012-01-01,1\n,2" - msg = "Cannot use both 'date_parser' and 'date_format'" - with pytest.raises(TypeError, match=msg): - getattr(parser, reader)( - FutureWarning, - "use 'date_format' instead", - StringIO(data), - parse_dates=["Date"], - date_parser=pd.to_datetime, - date_format="ISO8601", - sep=",", - ) - - -@xfail_pyarrow -@pytest.mark.parametrize( - "data,kwargs,expected", - [ - ( - "a\n04.15.2016", - {"parse_dates": ["a"]}, - DataFrame([datetime(2016, 4, 15)], columns=["a"]), - ), - ( - "a\n04.15.2016", - {"parse_dates": True, "index_col": 0}, - DataFrame(index=DatetimeIndex(["2016-04-15"], name="a"), columns=[]), - ), - ( - "a,b\n04.15.2016,09.16.2013", - {"parse_dates": ["a", "b"]}, - DataFrame( - [[datetime(2016, 4, 15), datetime(2013, 9, 16)]], columns=["a", "b"] - ), - ), - ( - "a,b\n04.15.2016,09.16.2013", - {"parse_dates": True, "index_col": [0, 1]}, - DataFrame( - index=MultiIndex.from_tuples( - [(datetime(2016, 4, 15), datetime(2013, 9, 16))], names=["a", "b"] - ), - columns=[], - ), - ), - ], -) -def test_parse_dates_no_convert_thousands(all_parsers, data, kwargs, expected): - # see gh-14066 - parser = all_parsers - - result = parser.read_csv(StringIO(data), thousands=".", **kwargs) - tm.assert_frame_equal(result, expected) - - -@xfail_pyarrow -def test_parse_date_time_multi_level_column_name(all_parsers): - data = """\ -D,T,A,B -date, time,a,b -2001-01-05, 09:00:00, 0.0, 10. -2001-01-06, 00:00:00, 1.0, 11. 
-""" - parser = all_parsers - result = parser.read_csv_check_warnings( - FutureWarning, - "use 'date_format' instead", - StringIO(data), - header=[0, 1], - parse_dates={"date_time": [0, 1]}, - date_parser=pd.to_datetime, - ) - - expected_data = [ - [datetime(2001, 1, 5, 9, 0, 0), 0.0, 10.0], - [datetime(2001, 1, 6, 0, 0, 0), 1.0, 11.0], - ] - expected = DataFrame(expected_data, columns=["date_time", ("A", "a"), ("B", "b")]) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - "data,kwargs,expected", - [ - ( - """\ -date,time,a,b -2001-01-05, 10:00:00, 0.0, 10. -2001-01-05, 00:00:00, 1., 11. -""", - {"header": 0, "parse_dates": {"date_time": [0, 1]}}, - DataFrame( - [ - [datetime(2001, 1, 5, 10, 0, 0), 0.0, 10], - [datetime(2001, 1, 5, 0, 0, 0), 1.0, 11.0], - ], - columns=["date_time", "a", "b"], - ), - ), - ( - ( - "KORD,19990127, 19:00:00, 18:56:00, 0.8100\n" - "KORD,19990127, 20:00:00, 19:56:00, 0.0100\n" - "KORD,19990127, 21:00:00, 20:56:00, -0.5900\n" - "KORD,19990127, 21:00:00, 21:18:00, -0.9900\n" - "KORD,19990127, 22:00:00, 21:56:00, -0.5900\n" - "KORD,19990127, 23:00:00, 22:56:00, -0.5900" - ), - {"header": None, "parse_dates": {"actual": [1, 2], "nominal": [1, 3]}}, - DataFrame( - [ - [ - datetime(1999, 1, 27, 19, 0), - datetime(1999, 1, 27, 18, 56), - "KORD", - 0.81, - ], - [ - datetime(1999, 1, 27, 20, 0), - datetime(1999, 1, 27, 19, 56), - "KORD", - 0.01, - ], - [ - datetime(1999, 1, 27, 21, 0), - datetime(1999, 1, 27, 20, 56), - "KORD", - -0.59, - ], - [ - datetime(1999, 1, 27, 21, 0), - datetime(1999, 1, 27, 21, 18), - "KORD", - -0.99, - ], - [ - datetime(1999, 1, 27, 22, 0), - datetime(1999, 1, 27, 21, 56), - "KORD", - -0.59, - ], - [ - datetime(1999, 1, 27, 23, 0), - datetime(1999, 1, 27, 22, 56), - "KORD", - -0.59, - ], - ], - columns=["actual", "nominal", 0, 4], - ), - ), - ], -) -def test_parse_date_time(all_parsers, data, kwargs, expected): - parser = all_parsers - result = parser.read_csv_check_warnings( - FutureWarning, - "use 'date_format' instead", - StringIO(data), - date_parser=pd.to_datetime, - **kwargs, - ) - - # Python can sometimes be flaky about how - # the aggregated columns are entered, so - # this standardizes the order. - result = result[expected.columns] - tm.assert_frame_equal(result, expected) - - -def test_parse_date_fields(all_parsers): - parser = all_parsers - data = "year,month,day,a\n2001,01,10,10.\n2001,02,1,11." - result = parser.read_csv_check_warnings( - FutureWarning, - "use 'date_format' instead", - StringIO(data), - header=0, - parse_dates={"ymd": [0, 1, 2]}, - date_parser=lambda x: x, - ) - - expected = DataFrame( - [[datetime(2001, 1, 10), 10.0], [datetime(2001, 2, 1), 11.0]], - columns=["ymd", "a"], - ) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - ("key", "value", "warn"), - [ - ( - "date_parser", - lambda x: pd.to_datetime(x, format="%Y %m %d %H %M %S"), - FutureWarning, - ), - ("date_format", "%Y %m %d %H %M %S", None), - ], -) -def test_parse_date_all_fields(all_parsers, key, value, warn): - parser = all_parsers - data = """\ -year,month,day,hour,minute,second,a,b -2001,01,05,10,00,0,0.0,10. -2001,01,5,10,0,00,1.,11. 
-""" - result = parser.read_csv_check_warnings( - warn, - "use 'date_format' instead", - StringIO(data), - header=0, - parse_dates={"ymdHMS": [0, 1, 2, 3, 4, 5]}, - **{key: value}, - ) - expected = DataFrame( - [ - [datetime(2001, 1, 5, 10, 0, 0), 0.0, 10.0], - [datetime(2001, 1, 5, 10, 0, 0), 1.0, 11.0], - ], - columns=["ymdHMS", "a", "b"], - ) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - ("key", "value", "warn"), - [ - ( - "date_parser", - lambda x: pd.to_datetime(x, format="%Y %m %d %H %M %S.%f"), - FutureWarning, - ), - ("date_format", "%Y %m %d %H %M %S.%f", None), - ], -) -def test_datetime_fractional_seconds(all_parsers, key, value, warn): - parser = all_parsers - data = """\ -year,month,day,hour,minute,second,a,b -2001,01,05,10,00,0.123456,0.0,10. -2001,01,5,10,0,0.500000,1.,11. -""" - result = parser.read_csv_check_warnings( - warn, - "use 'date_format' instead", - StringIO(data), - header=0, - parse_dates={"ymdHMS": [0, 1, 2, 3, 4, 5]}, - **{key: value}, - ) - expected = DataFrame( - [ - [datetime(2001, 1, 5, 10, 0, 0, microsecond=123456), 0.0, 10.0], - [datetime(2001, 1, 5, 10, 0, 0, microsecond=500000), 1.0, 11.0], - ], - columns=["ymdHMS", "a", "b"], - ) - tm.assert_frame_equal(result, expected) - - -def test_generic(all_parsers): - parser = all_parsers - data = "year,month,day,a\n2001,01,10,10.\n2001,02,1,11." - - def parse_function(yy, mm): - return [date(year=int(y), month=int(m), day=1) for y, m in zip(yy, mm)] - - result = parser.read_csv_check_warnings( - FutureWarning, - "use 'date_format' instead", - StringIO(data), - header=0, - parse_dates={"ym": [0, 1]}, - date_parser=parse_function, - ) - expected = DataFrame( - [[date(2001, 1, 1), 10, 10.0], [date(2001, 2, 1), 1, 11.0]], - columns=["ym", "day", "a"], - ) - expected["ym"] = expected["ym"].astype("datetime64[ns]") - tm.assert_frame_equal(result, expected) - - -@xfail_pyarrow -def test_date_parser_resolution_if_not_ns(all_parsers): - # see gh-10245 - parser = all_parsers - data = """\ -date,time,prn,rxstatus -2013-11-03,19:00:00,126,00E80000 -2013-11-03,19:00:00,23,00E80000 -2013-11-03,19:00:00,13,00E80000 -""" - - def date_parser(dt, time): - try: - arr = dt + "T" + time - except TypeError: - # dt & time are date/time objects - arr = [datetime.combine(d, t) for d, t in zip(dt, time)] - return np.array(arr, dtype="datetime64[s]") - - result = parser.read_csv_check_warnings( - FutureWarning, - "use 'date_format' instead", - StringIO(data), - date_parser=date_parser, - parse_dates={"datetime": ["date", "time"]}, - index_col=["datetime", "prn"], - ) - - datetimes = np.array(["2013-11-03T19:00:00"] * 3, dtype="datetime64[s]") - expected = DataFrame( - data={"rxstatus": ["00E80000"] * 3}, - index=MultiIndex.from_arrays( - [datetimes, [126, 23, 13]], - names=["datetime", "prn"], - ), - ) - tm.assert_frame_equal(result, expected) - - -def test_parse_date_column_with_empty_string(all_parsers): - # see gh-6428 - parser = all_parsers - data = "case,opdate\n7,10/18/2006\n7,10/18/2008\n621, " - result = parser.read_csv(StringIO(data), parse_dates=["opdate"]) - - expected_data = [[7, "10/18/2006"], [7, "10/18/2008"], [621, " "]] - expected = DataFrame(expected_data, columns=["case", "opdate"]) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - "data,expected", - [ - ( - "a\n135217135789158401\n1352171357E+5", - DataFrame({"a": [135217135789158401, 135217135700000]}, dtype="float64"), - ), - ( - "a\n99999999999\n123456789012345\n1234E+0", - DataFrame({"a": [99999999999, 
123456789012345, 1234]}, dtype="float64"), - ), - ], -) -@pytest.mark.parametrize("parse_dates", [True, False]) -def test_parse_date_float(all_parsers, data, expected, parse_dates): - # see gh-2697 - # - # Date parsing should fail, so we leave the data untouched - # (i.e. float precision should remain unchanged). - parser = all_parsers - - result = parser.read_csv(StringIO(data), parse_dates=parse_dates) - tm.assert_frame_equal(result, expected) - - -def test_parse_timezone(all_parsers): - # see gh-22256 - parser = all_parsers - data = """dt,val - 2018-01-04 09:01:00+09:00,23350 - 2018-01-04 09:02:00+09:00,23400 - 2018-01-04 09:03:00+09:00,23400 - 2018-01-04 09:04:00+09:00,23400 - 2018-01-04 09:05:00+09:00,23400""" - result = parser.read_csv(StringIO(data), parse_dates=["dt"]) - - dti = DatetimeIndex( - list( - date_range( - start="2018-01-04 09:01:00", - end="2018-01-04 09:05:00", - freq="1min", - tz=timezone(timedelta(minutes=540)), - ) - ), - freq=None, - ) - expected_data = {"dt": dti, "val": [23350, 23400, 23400, 23400, 23400]} - - expected = DataFrame(expected_data) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -@pytest.mark.parametrize( - "date_string", - ["32/32/2019", "02/30/2019", "13/13/2019", "13/2019", "a3/11/2018", "10/11/2o17"], -) -def test_invalid_parse_delimited_date(all_parsers, date_string): - parser = all_parsers - expected = DataFrame({0: [date_string]}, dtype="object") - result = parser.read_csv( - StringIO(date_string), - header=None, - parse_dates=[0], - ) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -@pytest.mark.parametrize( - "date_string,dayfirst,expected", - [ - # %d/%m/%Y; month > 12 thus replacement - ("13/02/2019", True, datetime(2019, 2, 13)), - # %m/%d/%Y; day > 12 thus there will be no replacement - ("02/13/2019", False, datetime(2019, 2, 13)), - # %d/%m/%Y; dayfirst==True thus replacement - ("04/02/2019", True, datetime(2019, 2, 4)), - ], -) -def test_parse_delimited_date_swap_no_warning( - all_parsers, date_string, dayfirst, expected -): - parser = all_parsers - expected = DataFrame({0: [expected]}, dtype="datetime64[ns]") - result = parser.read_csv( - StringIO(date_string), header=None, dayfirst=dayfirst, parse_dates=[0] - ) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -@pytest.mark.parametrize( - "date_string,dayfirst,expected", - [ - # %d/%m/%Y; month > 12 - ("13/02/2019", False, datetime(2019, 2, 13)), - # %m/%d/%Y; day > 12 - ("02/13/2019", True, datetime(2019, 2, 13)), - ], -) -def test_parse_delimited_date_swap_with_warning( - all_parsers, date_string, dayfirst, expected -): - parser = all_parsers - expected = DataFrame({0: [expected]}, dtype="datetime64[ns]") - warning_msg = ( - "Parsing dates in .* format when dayfirst=.* was specified. " - "Pass `dayfirst=.*` or specify a format to silence this warning." - ) - result = parser.read_csv_check_warnings( - UserWarning, - warning_msg, - StringIO(date_string), - header=None, - dayfirst=dayfirst, - parse_dates=[0], - ) - tm.assert_frame_equal(result, expected) - - -def test_parse_multiple_delimited_dates_with_swap_warnings(): - # GH46210 - with pytest.raises( - ValueError, - match=( - r'^time data "31/05/2000" doesn\'t match format "%m/%d/%Y", ' - r"at position 1. 
You might want to try:" - ), - ): - pd.to_datetime(["01/01/2000", "31/05/2000", "31/05/2001", "01/02/2000"]) - - -def _helper_hypothesis_delimited_date(call, date_string, **kwargs): - msg, result = None, None - try: - result = call(date_string, **kwargs) - except ValueError as er: - msg = str(er) - return msg, result - - -@skip_pyarrow -@given(DATETIME_NO_TZ) -@pytest.mark.parametrize("delimiter", list(" -./")) -@pytest.mark.parametrize("dayfirst", [True, False]) -@pytest.mark.parametrize( - "date_format", - ["%d %m %Y", "%m %d %Y", "%m %Y", "%Y %m %d", "%y %m %d", "%Y%m%d", "%y%m%d"], -) -def test_hypothesis_delimited_date( - request, date_format, dayfirst, delimiter, test_datetime -): - if date_format == "%m %Y" and delimiter == ".": - request.node.add_marker( - pytest.mark.xfail( - reason="parse_datetime_string cannot reliably tell whether " - "e.g. %m.%Y is a float or a date" - ) - ) - date_string = test_datetime.strftime(date_format.replace(" ", delimiter)) - - except_out_dateutil, result = _helper_hypothesis_delimited_date( - py_parse_datetime_string, date_string, dayfirst=dayfirst - ) - except_in_dateutil, expected = _helper_hypothesis_delimited_date( - du_parse, - date_string, - default=datetime(1, 1, 1), - dayfirst=dayfirst, - yearfirst=False, - ) - - assert except_out_dateutil == except_in_dateutil - assert result == expected - - -@skip_pyarrow -@pytest.mark.parametrize( - "names, usecols, parse_dates, missing_cols", - [ - (None, ["val"], ["date", "time"], "date, time"), - (None, ["val"], [0, "time"], "time"), - (None, ["val"], [["date", "time"]], "date, time"), - (None, ["val"], [[0, "time"]], "time"), - (None, ["val"], {"date": [0, "time"]}, "time"), - (None, ["val"], {"date": ["date", "time"]}, "date, time"), - (None, ["val"], [["date", "time"], "date"], "date, time"), - (["date1", "time1", "temperature"], None, ["date", "time"], "date, time"), - ( - ["date1", "time1", "temperature"], - ["date1", "temperature"], - ["date1", "time"], - "time", - ), - ], -) -def test_missing_parse_dates_column_raises( - all_parsers, names, usecols, parse_dates, missing_cols -): - # gh-31251 column names provided in parse_dates could be missing. 
- parser = all_parsers - content = StringIO("date,time,val\n2020-01-31,04:20:32,32\n") - msg = f"Missing column provided to 'parse_dates': '{missing_cols}'" - with pytest.raises(ValueError, match=msg): - parser.read_csv( - content, sep=",", names=names, usecols=usecols, parse_dates=parse_dates - ) - - -@skip_pyarrow -def test_date_parser_and_names(all_parsers): - # GH#33699 - parser = all_parsers - data = StringIO("""x,y\n1,2""") - result = parser.read_csv_check_warnings( - UserWarning, - "Could not infer format", - data, - parse_dates=["B"], - names=["B"], - ) - expected = DataFrame({"B": ["y", "2"]}, index=["x", "1"]) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -def test_date_parser_multiindex_columns(all_parsers): - parser = all_parsers - data = """a,b -1,2 -2019-12-31,6""" - result = parser.read_csv(StringIO(data), parse_dates=[("a", "1")], header=[0, 1]) - expected = DataFrame( - {("a", "1"): Timestamp("2019-12-31").as_unit("ns"), ("b", "2"): [6]} - ) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -@pytest.mark.parametrize( - "parse_spec, col_name", - [ - ([[("a", "1"), ("b", "2")]], ("a_b", "1_2")), - ({("foo", "1"): [("a", "1"), ("b", "2")]}, ("foo", "1")), - ], -) -def test_date_parser_multiindex_columns_combine_cols(all_parsers, parse_spec, col_name): - parser = all_parsers - data = """a,b,c -1,2,3 -2019-12,-31,6""" - result = parser.read_csv( - StringIO(data), - parse_dates=parse_spec, - header=[0, 1], - ) - expected = DataFrame( - {col_name: Timestamp("2019-12-31").as_unit("ns"), ("c", "3"): [6]} - ) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -def test_date_parser_usecols_thousands(all_parsers): - # GH#39365 - data = """A,B,C - 1,3,20-09-01-01 - 2,4,20-09-01-01 - """ - - parser = all_parsers - result = parser.read_csv_check_warnings( - UserWarning, - "Could not infer format", - StringIO(data), - parse_dates=[1], - usecols=[1, 2], - thousands="-", - ) - expected = DataFrame({"B": [3, 4], "C": [Timestamp("20-09-2001 01:00:00")] * 2}) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -def test_parse_dates_and_keep_orgin_column(all_parsers): - # GH#13378 - parser = all_parsers - data = """A -20150908 -20150909 -""" - result = parser.read_csv( - StringIO(data), parse_dates={"date": ["A"]}, keep_date_col=True - ) - expected_data = [Timestamp("2015-09-08"), Timestamp("2015-09-09")] - expected = DataFrame({"date": expected_data, "A": expected_data}) - tm.assert_frame_equal(result, expected) - - -def test_dayfirst_warnings(): - # GH 12585 - - # CASE 1: valid input - input = "date\n31/12/2014\n10/03/2011" - expected = DatetimeIndex( - ["2014-12-31", "2011-03-10"], dtype="datetime64[ns]", freq=None, name="date" - ) - warning_msg = ( - "Parsing dates in .* format when dayfirst=.* was specified. " - "Pass `dayfirst=.*` or specify a format to silence this warning." - ) - - # A. dayfirst arg correct, no warning - res1 = read_csv( - StringIO(input), parse_dates=["date"], dayfirst=True, index_col="date" - ).index - tm.assert_index_equal(expected, res1) - - # B. 
dayfirst arg incorrect, warning - with tm.assert_produces_warning(UserWarning, match=warning_msg): - res2 = read_csv( - StringIO(input), parse_dates=["date"], dayfirst=False, index_col="date" - ).index - tm.assert_index_equal(expected, res2) - - # CASE 2: invalid input - # cannot consistently process with single format - # return to user unaltered - - # first in DD/MM/YYYY, second in MM/DD/YYYY - input = "date\n31/12/2014\n03/30/2011" - expected = Index(["31/12/2014", "03/30/2011"], dtype="object", name="date") - - # A. use dayfirst=True - res5 = read_csv( - StringIO(input), parse_dates=["date"], dayfirst=True, index_col="date" - ).index - tm.assert_index_equal(expected, res5) - - # B. use dayfirst=False - with tm.assert_produces_warning(UserWarning, match=warning_msg): - res6 = read_csv( - StringIO(input), parse_dates=["date"], dayfirst=False, index_col="date" - ).index - tm.assert_index_equal(expected, res6) - - -@pytest.mark.parametrize( - "date_string, dayfirst", - [ - pytest.param( - "31/1/2014", - False, - id="second date is single-digit", - ), - pytest.param( - "1/31/2014", - True, - id="first date is single-digit", - ), - ], -) -def test_dayfirst_warnings_no_leading_zero(date_string, dayfirst): - # GH47880 - initial_value = f"date\n{date_string}" - expected = DatetimeIndex( - ["2014-01-31"], dtype="datetime64[ns]", freq=None, name="date" - ) - warning_msg = ( - "Parsing dates in .* format when dayfirst=.* was specified. " - "Pass `dayfirst=.*` or specify a format to silence this warning." - ) - with tm.assert_produces_warning(UserWarning, match=warning_msg): - res = read_csv( - StringIO(initial_value), - parse_dates=["date"], - index_col="date", - dayfirst=dayfirst, - ).index - tm.assert_index_equal(expected, res) - - -@skip_pyarrow -def test_infer_first_column_as_index(all_parsers): - # GH#11019 - parser = all_parsers - data = "a,b,c\n1970-01-01,2,3,4" - result = parser.read_csv( - StringIO(data), - parse_dates=["a"], - ) - expected = DataFrame({"a": "2", "b": 3, "c": 4}, index=["1970-01-01"]) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -@pytest.mark.parametrize( - ("key", "value", "warn"), - [ - ("date_parser", lambda x: pd.to_datetime(x, format="%Y-%m-%d"), FutureWarning), - ("date_format", "%Y-%m-%d", None), - ], -) -def test_replace_nans_before_parsing_dates(all_parsers, key, value, warn): - # GH#26203 - parser = all_parsers - data = """Test -2012-10-01 -0 -2015-05-15 -# -2017-09-09 -""" - result = parser.read_csv_check_warnings( - warn, - "use 'date_format' instead", - StringIO(data), - na_values={"Test": ["#", "0"]}, - parse_dates=["Test"], - **{key: value}, - ) - expected = DataFrame( - { - "Test": [ - Timestamp("2012-10-01"), - pd.NaT, - Timestamp("2015-05-15"), - pd.NaT, - Timestamp("2017-09-09"), - ] - } - ) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -def test_parse_dates_and_string_dtype(all_parsers): - # GH#34066 - parser = all_parsers - data = """a,b -1,2019-12-31 -""" - result = parser.read_csv(StringIO(data), dtype="string", parse_dates=["b"]) - expected = DataFrame({"a": ["1"], "b": [Timestamp("2019-12-31")]}) - expected["a"] = expected["a"].astype("string") - tm.assert_frame_equal(result, expected) - - -def test_parse_dot_separated_dates(all_parsers): - # https://github.com/pandas-dev/pandas/issues/2586 - parser = all_parsers - data = """a,b -27.03.2003 14:55:00.000,1 -03.08.2003 15:20:00.000,2""" - if parser.engine == "pyarrow": - expected_index = Index( - ["27.03.2003 14:55:00.000", "03.08.2003 15:20:00.000"], - dtype="object", 
- name="a", - ) - warn = None - else: - expected_index = DatetimeIndex( - ["2003-03-27 14:55:00", "2003-08-03 15:20:00"], - dtype="datetime64[ns]", - name="a", - ) - warn = UserWarning - msg = r"when dayfirst=False \(the default\) was specified" - result = parser.read_csv_check_warnings( - warn, msg, StringIO(data), parse_dates=True, index_col=0 - ) - expected = DataFrame({"b": [1, 2]}, index=expected_index) - tm.assert_frame_equal(result, expected) - - -def test_parse_dates_dict_format(all_parsers): - # GH#51240 - parser = all_parsers - data = """a,b -2019-12-31,31-12-2019 -2020-12-31,31-12-2020""" - - result = parser.read_csv( - StringIO(data), - date_format={"a": "%Y-%m-%d", "b": "%d-%m-%Y"}, - parse_dates=["a", "b"], - ) - expected = DataFrame( - { - "a": [Timestamp("2019-12-31"), Timestamp("2020-12-31")], - "b": [Timestamp("2019-12-31"), Timestamp("2020-12-31")], - } - ) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -@pytest.mark.parametrize( - "key, parse_dates", [("a_b", [[0, 1]]), ("foo", {"foo": [0, 1]})] -) -def test_parse_dates_dict_format_two_columns(all_parsers, key, parse_dates): - # GH#51240 - parser = all_parsers - data = """a,b -31-,12-2019 -31-,12-2020""" - - with tm.assert_produces_warning(None): - result = parser.read_csv( - StringIO(data), date_format={key: "%d- %m-%Y"}, parse_dates=parse_dates - ) - expected = DataFrame( - { - key: [Timestamp("2019-12-31"), Timestamp("2020-12-31")], - } - ) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -def test_parse_dates_dict_format_index(all_parsers): - # GH#51240 - parser = all_parsers - data = """a,b -2019-12-31,31-12-2019 -2020-12-31,31-12-2020""" - - result = parser.read_csv( - StringIO(data), date_format={"a": "%Y-%m-%d"}, parse_dates=True, index_col=0 - ) - expected = DataFrame( - { - "b": ["31-12-2019", "31-12-2020"], - }, - index=Index([Timestamp("2019-12-31"), Timestamp("2020-12-31")], name="a"), - ) - tm.assert_frame_equal(result, expected) - - -def test_parse_dates_arrow_engine(all_parsers): - # GH#53295 - parser = all_parsers - data = """a,b -2000-01-01 00:00:00,1 -2000-01-01 00:00:01,1""" - - result = parser.read_csv(StringIO(data), parse_dates=["a"]) - expected = DataFrame( - { - "a": [ - Timestamp("2000-01-01 00:00:00"), - Timestamp("2000-01-01 00:00:01"), - ], - "b": 1, - } - ) - tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py deleted file mode 100644 index b22953fedd6af..0000000000000 --- a/pandas/tests/io/parser/test_python_parser_only.py +++ /dev/null @@ -1,559 +0,0 @@ -""" -Tests that apply specifically to the Python parser. Unless specifically -stated as a Python-specific issue, the goal is to eventually move as many of -these tests out of this module as soon as the C parser can accept further -arguments when parsing. -""" -from __future__ import annotations - -import csv -from io import ( - BytesIO, - StringIO, - TextIOWrapper, -) -from typing import Iterator - -import numpy as np -import pytest - -from pandas.errors import ( - ParserError, - ParserWarning, -) - -from pandas import ( - DataFrame, - Index, - MultiIndex, -) -import pandas._testing as tm - - -def test_default_separator(python_parser_only): - # see gh-17333 - # - # csv.Sniffer in Python treats "o" as separator. 
- data = "aob\n1o2\n3o4" - parser = python_parser_only - expected = DataFrame({"a": [1, 3], "b": [2, 4]}) - - result = parser.read_csv(StringIO(data), sep=None) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("skipfooter", ["foo", 1.5, True]) -def test_invalid_skipfooter_non_int(python_parser_only, skipfooter): - # see gh-15925 (comment) - data = "a\n1\n2" - parser = python_parser_only - msg = "skipfooter must be an integer" - - with pytest.raises(ValueError, match=msg): - parser.read_csv(StringIO(data), skipfooter=skipfooter) - - -def test_invalid_skipfooter_negative(python_parser_only): - # see gh-15925 (comment) - data = "a\n1\n2" - parser = python_parser_only - msg = "skipfooter cannot be negative" - - with pytest.raises(ValueError, match=msg): - parser.read_csv(StringIO(data), skipfooter=-1) - - -@pytest.mark.parametrize("kwargs", [{"sep": None}, {"delimiter": "|"}]) -def test_sniff_delimiter(python_parser_only, kwargs): - data = """index|A|B|C -foo|1|2|3 -bar|4|5|6 -baz|7|8|9 -""" - parser = python_parser_only - result = parser.read_csv(StringIO(data), index_col=0, **kwargs) - expected = DataFrame( - [[1, 2, 3], [4, 5, 6], [7, 8, 9]], - columns=["A", "B", "C"], - index=Index(["foo", "bar", "baz"], name="index"), - ) - tm.assert_frame_equal(result, expected) - - -def test_sniff_delimiter_comment(python_parser_only): - data = """# comment line -index|A|B|C -# comment line -foo|1|2|3 # ignore | this -bar|4|5|6 -baz|7|8|9 -""" - parser = python_parser_only - result = parser.read_csv(StringIO(data), index_col=0, sep=None, comment="#") - expected = DataFrame( - [[1, 2, 3], [4, 5, 6], [7, 8, 9]], - columns=["A", "B", "C"], - index=Index(["foo", "bar", "baz"], name="index"), - ) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("encoding", [None, "utf-8"]) -def test_sniff_delimiter_encoding(python_parser_only, encoding): - parser = python_parser_only - data = """ignore this -ignore this too -index|A|B|C -foo|1|2|3 -bar|4|5|6 -baz|7|8|9 -""" - - if encoding is not None: - data = data.encode(encoding) - data = BytesIO(data) - data = TextIOWrapper(data, encoding=encoding) - else: - data = StringIO(data) - - result = parser.read_csv(data, index_col=0, sep=None, skiprows=2, encoding=encoding) - expected = DataFrame( - [[1, 2, 3], [4, 5, 6], [7, 8, 9]], - columns=["A", "B", "C"], - index=Index(["foo", "bar", "baz"], name="index"), - ) - tm.assert_frame_equal(result, expected) - - -def test_single_line(python_parser_only): - # see gh-6607: sniff separator - parser = python_parser_only - result = parser.read_csv(StringIO("1,2"), names=["a", "b"], header=None, sep=None) - - expected = DataFrame({"a": [1], "b": [2]}) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("kwargs", [{"skipfooter": 2}, {"nrows": 3}]) -def test_skipfooter(python_parser_only, kwargs): - # see gh-6607 - data = """A,B,C -1,2,3 -4,5,6 -7,8,9 -want to skip this -also also skip this -""" - parser = python_parser_only - result = parser.read_csv(StringIO(data), **kwargs) - - expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["A", "B", "C"]) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - "compression,klass", [("gzip", "GzipFile"), ("bz2", "BZ2File")] -) -def test_decompression_regex_sep(python_parser_only, csv1, compression, klass): - # see gh-6607 - parser = python_parser_only - - with open(csv1, "rb") as f: - data = f.read() - - data = data.replace(b",", b"::") - expected = parser.read_csv(csv1) - - module = 
pytest.importorskip(compression) - klass = getattr(module, klass) - - with tm.ensure_clean() as path: - with klass(path, mode="wb") as tmp: - tmp.write(data) - - result = parser.read_csv(path, sep="::", compression=compression) - tm.assert_frame_equal(result, expected) - - -def test_read_csv_buglet_4x_multi_index(python_parser_only): - # see gh-6607 - data = """ A B C D E -one two three four -a b 10.0032 5 -0.5109 -2.3358 -0.4645 0.05076 0.3640 -a q 20 4 0.4473 1.4152 0.2834 1.00661 0.1744 -x q 30 3 -0.6662 -0.5243 -0.3580 0.89145 2.5838""" - parser = python_parser_only - - expected = DataFrame( - [ - [-0.5109, -2.3358, -0.4645, 0.05076, 0.3640], - [0.4473, 1.4152, 0.2834, 1.00661, 0.1744], - [-0.6662, -0.5243, -0.3580, 0.89145, 2.5838], - ], - columns=["A", "B", "C", "D", "E"], - index=MultiIndex.from_tuples( - [("a", "b", 10.0032, 5), ("a", "q", 20, 4), ("x", "q", 30, 3)], - names=["one", "two", "three", "four"], - ), - ) - result = parser.read_csv(StringIO(data), sep=r"\s+") - tm.assert_frame_equal(result, expected) - - -def test_read_csv_buglet_4x_multi_index2(python_parser_only): - # see gh-6893 - data = " A B C\na b c\n1 3 7 0 3 6\n3 1 4 1 5 9" - parser = python_parser_only - - expected = DataFrame.from_records( - [(1, 3, 7, 0, 3, 6), (3, 1, 4, 1, 5, 9)], - columns=list("abcABC"), - index=list("abc"), - ) - result = parser.read_csv(StringIO(data), sep=r"\s+") - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("add_footer", [True, False]) -def test_skipfooter_with_decimal(python_parser_only, add_footer): - # see gh-6971 - data = "1#2\n3#4" - parser = python_parser_only - expected = DataFrame({"a": [1.2, 3.4]}) - - if add_footer: - # The stray footer line should not mess with the - # casting of the first two lines if we skip it. - kwargs = {"skipfooter": 1} - data += "\nFooter" - else: - kwargs = {} - - result = parser.read_csv(StringIO(data), names=["a"], decimal="#", **kwargs) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - "sep", ["::", "#####", "!!!", "123", "#1!c5", "%!c!d", "@@#4:2", "_!pd#_"] -) -@pytest.mark.parametrize( - "encoding", ["utf-16", "utf-16-be", "utf-16-le", "utf-32", "cp037"] -) -def test_encoding_non_utf8_multichar_sep(python_parser_only, sep, encoding): - # see gh-3404 - expected = DataFrame({"a": [1], "b": [2]}) - parser = python_parser_only - - data = "1" + sep + "2" - encoded_data = data.encode(encoding) - - result = parser.read_csv( - BytesIO(encoded_data), sep=sep, names=["a", "b"], encoding=encoding - ) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("quoting", [csv.QUOTE_MINIMAL, csv.QUOTE_NONE]) -def test_multi_char_sep_quotes(python_parser_only, quoting): - # see gh-13374 - kwargs = {"sep": ",,"} - parser = python_parser_only - - data = 'a,,b\n1,,a\n2,,"2,,b"' - - if quoting == csv.QUOTE_NONE: - msg = "Expected 2 fields in line 3, saw 3" - with pytest.raises(ParserError, match=msg): - parser.read_csv(StringIO(data), quoting=quoting, **kwargs) - else: - msg = "ignored when a multi-char delimiter is used" - with pytest.raises(ParserError, match=msg): - parser.read_csv(StringIO(data), quoting=quoting, **kwargs) - - -def test_none_delimiter(python_parser_only, capsys): - # see gh-13374 and gh-17465 - parser = python_parser_only - data = "a,b,c\n0,1,2\n3,4,5,6\n7,8,9" - expected = DataFrame({"a": [0, 7], "b": [1, 8], "c": [2, 9]}) - - # We expect the third line in the data to be - # skipped because it is malformed, but we do - # not expect any errors to occur. 
- result = parser.read_csv(StringIO(data), header=0, sep=None, on_bad_lines="warn") - tm.assert_frame_equal(result, expected) - - captured = capsys.readouterr() - assert "Skipping line 3" in captured.err - - -@pytest.mark.parametrize("data", ['a\n1\n"b"a', 'a,b,c\ncat,foo,bar\ndog,foo,"baz']) -@pytest.mark.parametrize("skipfooter", [0, 1]) -def test_skipfooter_bad_row(python_parser_only, data, skipfooter): - # see gh-13879 and gh-15910 - parser = python_parser_only - if skipfooter: - msg = "parsing errors in the skipped footer rows" - with pytest.raises(ParserError, match=msg): - parser.read_csv(StringIO(data), skipfooter=skipfooter) - else: - msg = "unexpected end of data|expected after" - with pytest.raises(ParserError, match=msg): - parser.read_csv(StringIO(data), skipfooter=skipfooter) - - -def test_malformed_skipfooter(python_parser_only): - parser = python_parser_only - data = """ignore -A,B,C -1,2,3 # comment -1,2,3,4,5 -2,3,4 -footer -""" - msg = "Expected 3 fields in line 4, saw 5" - with pytest.raises(ParserError, match=msg): - parser.read_csv(StringIO(data), header=1, comment="#", skipfooter=1) - - -def test_python_engine_file_no_next(python_parser_only): - parser = python_parser_only - - class NoNextBuffer: - def __init__(self, csv_data) -> None: - self.data = csv_data - - def __iter__(self) -> Iterator: - return self.data.__iter__() - - def read(self): - return self.data - - def readline(self): - return self.data - - parser.read_csv(NoNextBuffer("a\n1")) - - -@pytest.mark.parametrize("bad_line_func", [lambda x: ["2", "3"], lambda x: x[:2]]) -def test_on_bad_lines_callable(python_parser_only, bad_line_func): - # GH 5686 - parser = python_parser_only - data = """a,b -1,2 -2,3,4,5,6 -3,4 -""" - bad_sio = StringIO(data) - result = parser.read_csv(bad_sio, on_bad_lines=bad_line_func) - expected = DataFrame({"a": [1, 2, 3], "b": [2, 3, 4]}) - tm.assert_frame_equal(result, expected) - - -def test_on_bad_lines_callable_write_to_external_list(python_parser_only): - # GH 5686 - parser = python_parser_only - data = """a,b -1,2 -2,3,4,5,6 -3,4 -""" - bad_sio = StringIO(data) - lst = [] - - def bad_line_func(bad_line: list[str]) -> list[str]: - lst.append(bad_line) - return ["2", "3"] - - result = parser.read_csv(bad_sio, on_bad_lines=bad_line_func) - expected = DataFrame({"a": [1, 2, 3], "b": [2, 3, 4]}) - tm.assert_frame_equal(result, expected) - assert lst == [["2", "3", "4", "5", "6"]] - - -@pytest.mark.parametrize("bad_line_func", [lambda x: ["foo", "bar"], lambda x: x[:2]]) -@pytest.mark.parametrize("sep", [",", "111"]) -def test_on_bad_lines_callable_iterator_true(python_parser_only, bad_line_func, sep): - # GH 5686 - # iterator=True has a separate code path than iterator=False - parser = python_parser_only - data = f""" -0{sep}1 -hi{sep}there -foo{sep}bar{sep}baz -good{sep}bye -""" - bad_sio = StringIO(data) - result_iter = parser.read_csv( - bad_sio, on_bad_lines=bad_line_func, chunksize=1, iterator=True, sep=sep - ) - expecteds = [ - {"0": "hi", "1": "there"}, - {"0": "foo", "1": "bar"}, - {"0": "good", "1": "bye"}, - ] - for i, (result, expected) in enumerate(zip(result_iter, expecteds)): - expected = DataFrame(expected, index=range(i, i + 1)) - tm.assert_frame_equal(result, expected) - - -def test_on_bad_lines_callable_dont_swallow_errors(python_parser_only): - # GH 5686 - parser = python_parser_only - data = """a,b -1,2 -2,3,4,5,6 -3,4 -""" - bad_sio = StringIO(data) - msg = "This function is buggy." 
- - def bad_line_func(bad_line): - raise ValueError(msg) - - with pytest.raises(ValueError, match=msg): - parser.read_csv(bad_sio, on_bad_lines=bad_line_func) - - -def test_on_bad_lines_callable_not_expected_length(python_parser_only): - # GH 5686 - parser = python_parser_only - data = """a,b -1,2 -2,3,4,5,6 -3,4 -""" - bad_sio = StringIO(data) - - result = parser.read_csv_check_warnings( - ParserWarning, "Length of header or names", bad_sio, on_bad_lines=lambda x: x - ) - expected = DataFrame({"a": [1, 2, 3], "b": [2, 3, 4]}) - tm.assert_frame_equal(result, expected) - - -def test_on_bad_lines_callable_returns_none(python_parser_only): - # GH 5686 - parser = python_parser_only - data = """a,b -1,2 -2,3,4,5,6 -3,4 -""" - bad_sio = StringIO(data) - - result = parser.read_csv(bad_sio, on_bad_lines=lambda x: None) - expected = DataFrame({"a": [1, 3], "b": [2, 4]}) - tm.assert_frame_equal(result, expected) - - -def test_on_bad_lines_index_col_inferred(python_parser_only): - # GH 5686 - parser = python_parser_only - data = """a,b -1,2,3 -4,5,6 -""" - bad_sio = StringIO(data) - - result = parser.read_csv(bad_sio, on_bad_lines=lambda x: ["99", "99"]) - expected = DataFrame({"a": [2, 5], "b": [3, 6]}, index=[1, 4]) - tm.assert_frame_equal(result, expected) - - -def test_index_col_false_and_header_none(python_parser_only): - # GH#46955 - parser = python_parser_only - data = """ -0.5,0.03 -0.1,0.2,0.3,2 -""" - result = parser.read_csv_check_warnings( - ParserWarning, - "Length of header", - StringIO(data), - sep=",", - header=None, - index_col=False, - ) - expected = DataFrame({0: [0.5, 0.1], 1: [0.03, 0.2]}) - tm.assert_frame_equal(result, expected) - - -def test_header_int_do_not_infer_multiindex_names_on_different_line(python_parser_only): - # GH#46569 - parser = python_parser_only - data = StringIO("a\na,b\nc,d,e\nf,g,h") - result = parser.read_csv_check_warnings( - ParserWarning, "Length of header", data, engine="python", index_col=False - ) - expected = DataFrame({"a": ["a", "c", "f"]}) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - "dtype", [{"a": object}, {"a": str, "b": np.int64, "c": np.int64}] -) -def test_no_thousand_convert_with_dot_for_non_numeric_cols(python_parser_only, dtype): - # GH#50270 - parser = python_parser_only - data = """\ -a;b;c -0000.7995;16.000;0 -3.03.001.00514;0;4.000 -4923.600.041;23.000;131""" - result = parser.read_csv( - StringIO(data), - sep=";", - dtype=dtype, - thousands=".", - ) - expected = DataFrame( - { - "a": ["0000.7995", "3.03.001.00514", "4923.600.041"], - "b": [16000, 0, 23000], - "c": [0, 4000, 131], - } - ) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - "dtype,expected", - [ - ( - {"a": str, "b": np.float64, "c": np.int64}, - DataFrame( - { - "b": [16000.1, 0, 23000], - "c": [0, 4001, 131], - } - ), - ), - ( - str, - DataFrame( - { - "b": ["16,000.1", "0", "23,000"], - "c": ["0", "4,001", "131"], - } - ), - ), - ], -) -def test_no_thousand_convert_for_non_numeric_cols(python_parser_only, dtype, expected): - # GH#50270 - parser = python_parser_only - data = """a;b;c -0000,7995;16,000.1;0 -3,03,001,00514;0;4,001 -4923,600,041;23,000;131 -""" - result = parser.read_csv( - StringIO(data), - sep=";", - dtype=dtype, - thousands=",", - ) - expected.insert(0, "a", ["0000,7995", "3,03,001,00514", "4923,600,041"]) - tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_quoting.py b/pandas/tests/io/parser/test_quoting.py deleted file mode 100644 index 025a612dc47d2..0000000000000 
--- a/pandas/tests/io/parser/test_quoting.py +++ /dev/null @@ -1,167 +0,0 @@ -""" -Tests that quoting specifications are properly handled -during parsing for all of the parsers defined in parsers.py -""" - -import csv -from io import StringIO - -import pytest - -from pandas.compat import PY311 -from pandas.errors import ParserError - -from pandas import DataFrame -import pandas._testing as tm - -pytestmark = pytest.mark.usefixtures("pyarrow_skip") - - -@pytest.mark.parametrize( - "kwargs,msg", - [ - ({"quotechar": "foo"}, '"quotechar" must be a(n)? 1-character string'), - ( - {"quotechar": None, "quoting": csv.QUOTE_MINIMAL}, - "quotechar must be set if quoting enabled", - ), - ({"quotechar": 2}, '"quotechar" must be string( or None)?, not int'), - ], -) -def test_bad_quote_char(all_parsers, kwargs, msg): - data = "1,2,3" - parser = all_parsers - - with pytest.raises(TypeError, match=msg): - parser.read_csv(StringIO(data), **kwargs) - - -@pytest.mark.parametrize( - "quoting,msg", - [ - ("foo", '"quoting" must be an integer|Argument'), - (5, 'bad "quoting" value'), # quoting must be in the range [0, 3] - ], -) -def test_bad_quoting(all_parsers, quoting, msg): - data = "1,2,3" - parser = all_parsers - - with pytest.raises(TypeError, match=msg): - parser.read_csv(StringIO(data), quoting=quoting) - - -def test_quote_char_basic(all_parsers): - parser = all_parsers - data = 'a,b,c\n1,2,"cat"' - expected = DataFrame([[1, 2, "cat"]], columns=["a", "b", "c"]) - - result = parser.read_csv(StringIO(data), quotechar='"') - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("quote_char", ["~", "*", "%", "$", "@", "P"]) -def test_quote_char_various(all_parsers, quote_char): - parser = all_parsers - expected = DataFrame([[1, 2, "cat"]], columns=["a", "b", "c"]) - - data = 'a,b,c\n1,2,"cat"' - new_data = data.replace('"', quote_char) - - result = parser.read_csv(StringIO(new_data), quotechar=quote_char) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("quoting", [csv.QUOTE_MINIMAL, csv.QUOTE_NONE]) -@pytest.mark.parametrize("quote_char", ["", None]) -def test_null_quote_char(all_parsers, quoting, quote_char): - kwargs = {"quotechar": quote_char, "quoting": quoting} - data = "a,b,c\n1,2,3" - parser = all_parsers - - if quoting != csv.QUOTE_NONE: - # Sanity checking. - msg = ( - '"quotechar" must be a 1-character string' - if PY311 and all_parsers.engine == "python" and quote_char == "" - else "quotechar must be set if quoting enabled" - ) - - with pytest.raises(TypeError, match=msg): - parser.read_csv(StringIO(data), **kwargs) - elif not (PY311 and all_parsers.engine == "python"): - # Python 3.11+ doesn't support null/blank quote chars in their csv parsers - expected = DataFrame([[1, 2, 3]], columns=["a", "b", "c"]) - result = parser.read_csv(StringIO(data), **kwargs) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - "kwargs,exp_data", - [ - ({}, [[1, 2, "foo"]]), # Test default. - # QUOTE_MINIMAL only applies to CSV writing, so no effect on reading. - ({"quotechar": '"', "quoting": csv.QUOTE_MINIMAL}, [[1, 2, "foo"]]), - # QUOTE_MINIMAL only applies to CSV writing, so no effect on reading. - ({"quotechar": '"', "quoting": csv.QUOTE_ALL}, [[1, 2, "foo"]]), - # QUOTE_NONE tells the reader to do no special handling - # of quote characters and leave them alone. 
- ({"quotechar": '"', "quoting": csv.QUOTE_NONE}, [[1, 2, '"foo"']]), - # QUOTE_NONNUMERIC tells the reader to cast - # all non-quoted fields to float - ({"quotechar": '"', "quoting": csv.QUOTE_NONNUMERIC}, [[1.0, 2.0, "foo"]]), - ], -) -def test_quoting_various(all_parsers, kwargs, exp_data): - data = '1,2,"foo"' - parser = all_parsers - columns = ["a", "b", "c"] - - result = parser.read_csv(StringIO(data), names=columns, **kwargs) - expected = DataFrame(exp_data, columns=columns) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - "doublequote,exp_data", [(True, [[3, '4 " 5']]), (False, [[3, '4 " 5"']])] -) -def test_double_quote(all_parsers, doublequote, exp_data): - parser = all_parsers - data = 'a,b\n3,"4 "" 5"' - - result = parser.read_csv(StringIO(data), quotechar='"', doublequote=doublequote) - expected = DataFrame(exp_data, columns=["a", "b"]) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("quotechar", ['"', "\u0001"]) -def test_quotechar_unicode(all_parsers, quotechar): - # see gh-14477 - data = "a\n1" - parser = all_parsers - expected = DataFrame({"a": [1]}) - - result = parser.read_csv(StringIO(data), quotechar=quotechar) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("balanced", [True, False]) -def test_unbalanced_quoting(all_parsers, balanced): - # see gh-22789. - parser = all_parsers - data = 'a,b,c\n1,2,"3' - - if balanced: - # Re-balance the quoting and read in without errors. - expected = DataFrame([[1, 2, 3]], columns=["a", "b", "c"]) - result = parser.read_csv(StringIO(data + '"')) - tm.assert_frame_equal(result, expected) - else: - msg = ( - "EOF inside string starting at row 1" - if parser.engine == "c" - else "unexpected end of data" - ) - - with pytest.raises(ParserError, match=msg): - parser.read_csv(StringIO(data)) diff --git a/pandas/tests/io/parser/test_read_fwf.py b/pandas/tests/io/parser/test_read_fwf.py deleted file mode 100644 index 030650ad0031d..0000000000000 --- a/pandas/tests/io/parser/test_read_fwf.py +++ /dev/null @@ -1,1060 +0,0 @@ -""" -Tests the 'read_fwf' function in parsers.py. This -test suite is independent of the others because the -engine is set to 'python-fwf' internally. 
-""" - -from datetime import datetime -from io import ( - BytesIO, - StringIO, -) -from pathlib import Path - -import numpy as np -import pytest - -from pandas.errors import EmptyDataError - -import pandas as pd -from pandas import ( - DataFrame, - DatetimeIndex, -) -import pandas._testing as tm -from pandas.core.arrays import ( - ArrowStringArray, - StringArray, -) -from pandas.tests.io.test_compression import _compression_to_extension - -from pandas.io.common import urlopen -from pandas.io.parsers import ( - read_csv, - read_fwf, -) - - -def test_basic(): - data = """\ -A B C D -201158 360.242940 149.910199 11950.7 -201159 444.953632 166.985655 11788.4 -201160 364.136849 183.628767 11806.2 -201161 413.836124 184.375703 11916.8 -201162 502.953953 173.237159 12468.3 -""" - result = read_fwf(StringIO(data)) - expected = DataFrame( - [ - [201158, 360.242940, 149.910199, 11950.7], - [201159, 444.953632, 166.985655, 11788.4], - [201160, 364.136849, 183.628767, 11806.2], - [201161, 413.836124, 184.375703, 11916.8], - [201162, 502.953953, 173.237159, 12468.3], - ], - columns=["A", "B", "C", "D"], - ) - tm.assert_frame_equal(result, expected) - - -def test_colspecs(): - data = """\ -A B C D E -201158 360.242940 149.910199 11950.7 -201159 444.953632 166.985655 11788.4 -201160 364.136849 183.628767 11806.2 -201161 413.836124 184.375703 11916.8 -201162 502.953953 173.237159 12468.3 -""" - colspecs = [(0, 4), (4, 8), (8, 20), (21, 33), (34, 43)] - result = read_fwf(StringIO(data), colspecs=colspecs) - - expected = DataFrame( - [ - [2011, 58, 360.242940, 149.910199, 11950.7], - [2011, 59, 444.953632, 166.985655, 11788.4], - [2011, 60, 364.136849, 183.628767, 11806.2], - [2011, 61, 413.836124, 184.375703, 11916.8], - [2011, 62, 502.953953, 173.237159, 12468.3], - ], - columns=["A", "B", "C", "D", "E"], - ) - tm.assert_frame_equal(result, expected) - - -def test_widths(): - data = """\ -A B C D E -2011 58 360.242940 149.910199 11950.7 -2011 59 444.953632 166.985655 11788.4 -2011 60 364.136849 183.628767 11806.2 -2011 61 413.836124 184.375703 11916.8 -2011 62 502.953953 173.237159 12468.3 -""" - result = read_fwf(StringIO(data), widths=[5, 5, 13, 13, 7]) - - expected = DataFrame( - [ - [2011, 58, 360.242940, 149.910199, 11950.7], - [2011, 59, 444.953632, 166.985655, 11788.4], - [2011, 60, 364.136849, 183.628767, 11806.2], - [2011, 61, 413.836124, 184.375703, 11916.8], - [2011, 62, 502.953953, 173.237159, 12468.3], - ], - columns=["A", "B", "C", "D", "E"], - ) - tm.assert_frame_equal(result, expected) - - -def test_non_space_filler(): - # From Thomas Kluyver: - # - # Apparently, some non-space filler characters can be seen, this is - # supported by specifying the 'delimiter' character: - # - # http://publib.boulder.ibm.com/infocenter/dmndhelp/v6r1mx/index.jsp?topic=/com.ibm.wbit.612.help.config.doc/topics/rfixwidth.html - data = """\ -A~~~~B~~~~C~~~~~~~~~~~~D~~~~~~~~~~~~E -201158~~~~360.242940~~~149.910199~~~11950.7 -201159~~~~444.953632~~~166.985655~~~11788.4 -201160~~~~364.136849~~~183.628767~~~11806.2 -201161~~~~413.836124~~~184.375703~~~11916.8 -201162~~~~502.953953~~~173.237159~~~12468.3 -""" - colspecs = [(0, 4), (4, 8), (8, 20), (21, 33), (34, 43)] - result = read_fwf(StringIO(data), colspecs=colspecs, delimiter="~") - - expected = DataFrame( - [ - [2011, 58, 360.242940, 149.910199, 11950.7], - [2011, 59, 444.953632, 166.985655, 11788.4], - [2011, 60, 364.136849, 183.628767, 11806.2], - [2011, 61, 413.836124, 184.375703, 11916.8], - [2011, 62, 502.953953, 173.237159, 12468.3], - ], - columns=["A", 
"B", "C", "D", "E"], - ) - tm.assert_frame_equal(result, expected) - - -def test_over_specified(): - data = """\ -A B C D E -201158 360.242940 149.910199 11950.7 -201159 444.953632 166.985655 11788.4 -201160 364.136849 183.628767 11806.2 -201161 413.836124 184.375703 11916.8 -201162 502.953953 173.237159 12468.3 -""" - colspecs = [(0, 4), (4, 8), (8, 20), (21, 33), (34, 43)] - - with pytest.raises(ValueError, match="must specify only one of"): - read_fwf(StringIO(data), colspecs=colspecs, widths=[6, 10, 10, 7]) - - -def test_under_specified(): - data = """\ -A B C D E -201158 360.242940 149.910199 11950.7 -201159 444.953632 166.985655 11788.4 -201160 364.136849 183.628767 11806.2 -201161 413.836124 184.375703 11916.8 -201162 502.953953 173.237159 12468.3 -""" - with pytest.raises(ValueError, match="Must specify either"): - read_fwf(StringIO(data), colspecs=None, widths=None) - - -def test_read_csv_compat(): - csv_data = """\ -A,B,C,D,E -2011,58,360.242940,149.910199,11950.7 -2011,59,444.953632,166.985655,11788.4 -2011,60,364.136849,183.628767,11806.2 -2011,61,413.836124,184.375703,11916.8 -2011,62,502.953953,173.237159,12468.3 -""" - expected = read_csv(StringIO(csv_data), engine="python") - - fwf_data = """\ -A B C D E -201158 360.242940 149.910199 11950.7 -201159 444.953632 166.985655 11788.4 -201160 364.136849 183.628767 11806.2 -201161 413.836124 184.375703 11916.8 -201162 502.953953 173.237159 12468.3 -""" - colspecs = [(0, 4), (4, 8), (8, 20), (21, 33), (34, 43)] - result = read_fwf(StringIO(fwf_data), colspecs=colspecs) - tm.assert_frame_equal(result, expected) - - -def test_bytes_io_input(): - result = read_fwf(BytesIO("שלום\nשלום".encode()), widths=[2, 2], encoding="utf8") - expected = DataFrame([["של", "ום"]], columns=["של", "ום"]) - tm.assert_frame_equal(result, expected) - - -def test_fwf_colspecs_is_list_or_tuple(): - data = """index,A,B,C,D -foo,2,3,4,5 -bar,7,8,9,10 -baz,12,13,14,15 -qux,12,13,14,15 -foo2,12,13,14,15 -bar2,12,13,14,15 -""" - - msg = "column specifications must be a list or tuple.+" - - with pytest.raises(TypeError, match=msg): - read_fwf(StringIO(data), colspecs={"a": 1}, delimiter=",") - - -def test_fwf_colspecs_is_list_or_tuple_of_two_element_tuples(): - data = """index,A,B,C,D -foo,2,3,4,5 -bar,7,8,9,10 -baz,12,13,14,15 -qux,12,13,14,15 -foo2,12,13,14,15 -bar2,12,13,14,15 -""" - - msg = "Each column specification must be.+" - - with pytest.raises(TypeError, match=msg): - read_fwf(StringIO(data), colspecs=[("a", 1)]) - - -@pytest.mark.parametrize( - "colspecs,exp_data", - [ - ([(0, 3), (3, None)], [[123, 456], [456, 789]]), - ([(None, 3), (3, 6)], [[123, 456], [456, 789]]), - ([(0, None), (3, None)], [[123456, 456], [456789, 789]]), - ([(None, None), (3, 6)], [[123456, 456], [456789, 789]]), - ], -) -def test_fwf_colspecs_none(colspecs, exp_data): - # see gh-7079 - data = """\ -123456 -456789 -""" - expected = DataFrame(exp_data) - - result = read_fwf(StringIO(data), colspecs=colspecs, header=None) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - "infer_nrows,exp_data", - [ - # infer_nrows --> colspec == [(2, 3), (5, 6)] - (1, [[1, 2], [3, 8]]), - # infer_nrows > number of rows - (10, [[1, 2], [123, 98]]), - ], -) -def test_fwf_colspecs_infer_nrows(infer_nrows, exp_data): - # see gh-15138 - data = """\ - 1 2 -123 98 -""" - expected = DataFrame(exp_data) - - result = read_fwf(StringIO(data), infer_nrows=infer_nrows, header=None) - tm.assert_frame_equal(result, expected) - - -def test_fwf_regression(): - # see gh-3594 - # - # Turns 
out "T060" is parsable as a datetime slice! - tz_list = [1, 10, 20, 30, 60, 80, 100] - widths = [16] + [8] * len(tz_list) - names = ["SST"] + [f"T{z:03d}" for z in tz_list[1:]] - - data = """ 2009164202000 9.5403 9.4105 8.6571 7.8372 6.0612 5.8843 5.5192 -2009164203000 9.5435 9.2010 8.6167 7.8176 6.0804 5.8728 5.4869 -2009164204000 9.5873 9.1326 8.4694 7.5889 6.0422 5.8526 5.4657 -2009164205000 9.5810 9.0896 8.4009 7.4652 6.0322 5.8189 5.4379 -2009164210000 9.6034 9.0897 8.3822 7.4905 6.0908 5.7904 5.4039 -""" - - with tm.assert_produces_warning(FutureWarning, match="use 'date_format' instead"): - result = read_fwf( - StringIO(data), - index_col=0, - header=None, - names=names, - widths=widths, - parse_dates=True, - date_parser=lambda s: datetime.strptime(s, "%Y%j%H%M%S"), - ) - expected = DataFrame( - [ - [9.5403, 9.4105, 8.6571, 7.8372, 6.0612, 5.8843, 5.5192], - [9.5435, 9.2010, 8.6167, 7.8176, 6.0804, 5.8728, 5.4869], - [9.5873, 9.1326, 8.4694, 7.5889, 6.0422, 5.8526, 5.4657], - [9.5810, 9.0896, 8.4009, 7.4652, 6.0322, 5.8189, 5.4379], - [9.6034, 9.0897, 8.3822, 7.4905, 6.0908, 5.7904, 5.4039], - ], - index=DatetimeIndex( - [ - "2009-06-13 20:20:00", - "2009-06-13 20:30:00", - "2009-06-13 20:40:00", - "2009-06-13 20:50:00", - "2009-06-13 21:00:00", - ] - ), - columns=["SST", "T010", "T020", "T030", "T060", "T080", "T100"], - ) - tm.assert_frame_equal(result, expected) - result = read_fwf( - StringIO(data), - index_col=0, - header=None, - names=names, - widths=widths, - parse_dates=True, - date_format="%Y%j%H%M%S", - ) - tm.assert_frame_equal(result, expected) - - -def test_fwf_for_uint8(): - data = """1421302965.213420 PRI=3 PGN=0xef00 DST=0x17 SRC=0x28 04 154 00 00 00 00 00 127 -1421302964.226776 PRI=6 PGN=0xf002 SRC=0x47 243 00 00 255 247 00 00 71""" # noqa: E501 - df = read_fwf( - StringIO(data), - colspecs=[(0, 17), (25, 26), (33, 37), (49, 51), (58, 62), (63, 1000)], - names=["time", "pri", "pgn", "dst", "src", "data"], - converters={ - "pgn": lambda x: int(x, 16), - "src": lambda x: int(x, 16), - "dst": lambda x: int(x, 16), - "data": lambda x: len(x.split(" ")), - }, - ) - - expected = DataFrame( - [ - [1421302965.213420, 3, 61184, 23, 40, 8], - [1421302964.226776, 6, 61442, None, 71, 8], - ], - columns=["time", "pri", "pgn", "dst", "src", "data"], - ) - expected["dst"] = expected["dst"].astype(object) - tm.assert_frame_equal(df, expected) - - -@pytest.mark.parametrize("comment", ["#", "~", "!"]) -def test_fwf_comment(comment): - data = """\ - 1 2. 
4 #hello world - 5 NaN 10.0 -""" - data = data.replace("#", comment) - - colspecs = [(0, 3), (4, 9), (9, 25)] - expected = DataFrame([[1, 2.0, 4], [5, np.nan, 10.0]]) - - result = read_fwf(StringIO(data), colspecs=colspecs, header=None, comment=comment) - tm.assert_almost_equal(result, expected) - - -def test_fwf_skip_blank_lines(): - data = """ - -A B C D - -201158 360.242940 149.910199 11950.7 -201159 444.953632 166.985655 11788.4 - - -201162 502.953953 173.237159 12468.3 - -""" - result = read_fwf(StringIO(data), skip_blank_lines=True) - expected = DataFrame( - [ - [201158, 360.242940, 149.910199, 11950.7], - [201159, 444.953632, 166.985655, 11788.4], - [201162, 502.953953, 173.237159, 12468.3], - ], - columns=["A", "B", "C", "D"], - ) - tm.assert_frame_equal(result, expected) - - data = """\ -A B C D -201158 360.242940 149.910199 11950.7 -201159 444.953632 166.985655 11788.4 - - -201162 502.953953 173.237159 12468.3 -""" - result = read_fwf(StringIO(data), skip_blank_lines=False) - expected = DataFrame( - [ - [201158, 360.242940, 149.910199, 11950.7], - [201159, 444.953632, 166.985655, 11788.4], - [np.nan, np.nan, np.nan, np.nan], - [np.nan, np.nan, np.nan, np.nan], - [201162, 502.953953, 173.237159, 12468.3], - ], - columns=["A", "B", "C", "D"], - ) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("thousands", [",", "#", "~"]) -def test_fwf_thousands(thousands): - data = """\ - 1 2,334.0 5 -10 13 10. -""" - data = data.replace(",", thousands) - - colspecs = [(0, 3), (3, 11), (12, 16)] - expected = DataFrame([[1, 2334.0, 5], [10, 13, 10.0]]) - - result = read_fwf( - StringIO(data), header=None, colspecs=colspecs, thousands=thousands - ) - tm.assert_almost_equal(result, expected) - - -@pytest.mark.parametrize("header", [True, False]) -def test_bool_header_arg(header): - # see gh-6114 - data = """\ -MyColumn - a - b - a - b""" - - msg = "Passing a bool to header is invalid" - with pytest.raises(TypeError, match=msg): - read_fwf(StringIO(data), header=header) - - -def test_full_file(): - # File with all values. - test = """index A B C -2000-01-03T00:00:00 0.980268513777 3 foo -2000-01-04T00:00:00 1.04791624281 -4 bar -2000-01-05T00:00:00 0.498580885705 73 baz -2000-01-06T00:00:00 1.12020151869 1 foo -2000-01-07T00:00:00 0.487094399463 0 bar -2000-01-10T00:00:00 0.836648671666 2 baz -2000-01-11T00:00:00 0.157160753327 34 foo""" - colspecs = ((0, 19), (21, 35), (38, 40), (42, 45)) - expected = read_fwf(StringIO(test), colspecs=colspecs) - - result = read_fwf(StringIO(test)) - tm.assert_frame_equal(result, expected) - - -def test_full_file_with_missing(): - # File with missing values. - test = """index A B C -2000-01-03T00:00:00 0.980268513777 3 foo -2000-01-04T00:00:00 1.04791624281 -4 bar - 0.498580885705 73 baz -2000-01-06T00:00:00 1.12020151869 1 foo -2000-01-07T00:00:00 0 bar -2000-01-10T00:00:00 0.836648671666 2 baz - 34""" - colspecs = ((0, 19), (21, 35), (38, 40), (42, 45)) - expected = read_fwf(StringIO(test), colspecs=colspecs) - - result = read_fwf(StringIO(test)) - tm.assert_frame_equal(result, expected) - - -def test_full_file_with_spaces(): - # File with spaces in columns. 
- test = """ -Account Name Balance CreditLimit AccountCreated -101 Keanu Reeves 9315.45 10000.00 1/17/1998 -312 Gerard Butler 90.00 1000.00 8/6/2003 -868 Jennifer Love Hewitt 0 17000.00 5/25/1985 -761 Jada Pinkett-Smith 49654.87 100000.00 12/5/2006 -317 Bill Murray 789.65 5000.00 2/5/2007 -""".strip( - "\r\n" - ) - colspecs = ((0, 7), (8, 28), (30, 38), (42, 53), (56, 70)) - expected = read_fwf(StringIO(test), colspecs=colspecs) - - result = read_fwf(StringIO(test)) - tm.assert_frame_equal(result, expected) - - -def test_full_file_with_spaces_and_missing(): - # File with spaces and missing values in columns. - test = """ -Account Name Balance CreditLimit AccountCreated -101 10000.00 1/17/1998 -312 Gerard Butler 90.00 1000.00 8/6/2003 -868 5/25/1985 -761 Jada Pinkett-Smith 49654.87 100000.00 12/5/2006 -317 Bill Murray 789.65 -""".strip( - "\r\n" - ) - colspecs = ((0, 7), (8, 28), (30, 38), (42, 53), (56, 70)) - expected = read_fwf(StringIO(test), colspecs=colspecs) - - result = read_fwf(StringIO(test)) - tm.assert_frame_equal(result, expected) - - -def test_messed_up_data(): - # Completely messed up file. - test = """ - Account Name Balance Credit Limit Account Created - 101 10000.00 1/17/1998 - 312 Gerard Butler 90.00 1000.00 - - 761 Jada Pinkett-Smith 49654.87 100000.00 12/5/2006 - 317 Bill Murray 789.65 -""".strip( - "\r\n" - ) - colspecs = ((2, 10), (15, 33), (37, 45), (49, 61), (64, 79)) - expected = read_fwf(StringIO(test), colspecs=colspecs) - - result = read_fwf(StringIO(test)) - tm.assert_frame_equal(result, expected) - - -def test_multiple_delimiters(): - test = r""" -col1~~~~~col2 col3++++++++++++++++++col4 -~~22.....11.0+++foo~~~~~~~~~~Keanu Reeves - 33+++122.33\\\bar.........Gerard Butler -++44~~~~12.01 baz~~Jennifer Love Hewitt -~~55 11+++foo++++Jada Pinkett-Smith -..66++++++.03~~~bar Bill Murray -""".strip( - "\r\n" - ) - delimiter = " +~.\\" - colspecs = ((0, 4), (7, 13), (15, 19), (21, 41)) - expected = read_fwf(StringIO(test), colspecs=colspecs, delimiter=delimiter) - - result = read_fwf(StringIO(test), delimiter=delimiter) - tm.assert_frame_equal(result, expected) - - -def test_variable_width_unicode(): - data = """ -שלום שלום -ום שלל -של ום -""".strip( - "\r\n" - ) - encoding = "utf8" - kwargs = {"header": None, "encoding": encoding} - - expected = read_fwf( - BytesIO(data.encode(encoding)), colspecs=[(0, 4), (5, 9)], **kwargs - ) - result = read_fwf(BytesIO(data.encode(encoding)), **kwargs) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("dtype", [{}, {"a": "float64", "b": str, "c": "int32"}]) -def test_dtype(dtype): - data = """ a b c -1 2 3.2 -3 4 5.2 -""" - colspecs = [(0, 5), (5, 10), (10, None)] - result = read_fwf(StringIO(data), colspecs=colspecs, dtype=dtype) - - expected = DataFrame( - {"a": [1, 3], "b": [2, 4], "c": [3.2, 5.2]}, columns=["a", "b", "c"] - ) - - for col, dt in dtype.items(): - expected[col] = expected[col].astype(dt) - - tm.assert_frame_equal(result, expected) - - -def test_skiprows_inference(): - # see gh-11256 - data = """ -Text contained in the file header - -DataCol1 DataCol2 - 0.0 1.0 - 101.6 956.1 -""".strip() - skiprows = 2 - expected = read_csv(StringIO(data), skiprows=skiprows, delim_whitespace=True) - - result = read_fwf(StringIO(data), skiprows=skiprows) - tm.assert_frame_equal(result, expected) - - -def test_skiprows_by_index_inference(): - data = """ -To be skipped -Not To Be Skipped -Once more to be skipped -123 34 8 123 -456 78 9 456 -""".strip() - skiprows = [0, 2] - expected = read_csv(StringIO(data), 
skiprows=skiprows, delim_whitespace=True) - - result = read_fwf(StringIO(data), skiprows=skiprows) - tm.assert_frame_equal(result, expected) - - -def test_skiprows_inference_empty(): - data = """ -AA BBB C -12 345 6 -78 901 2 -""".strip() - - msg = "No rows from which to infer column width" - with pytest.raises(EmptyDataError, match=msg): - read_fwf(StringIO(data), skiprows=3) - - -def test_whitespace_preservation(): - # see gh-16772 - header = None - csv_data = """ - a ,bbb - cc,dd """ - - fwf_data = """ - a bbb - ccdd """ - result = read_fwf( - StringIO(fwf_data), widths=[3, 3], header=header, skiprows=[0], delimiter="\n\t" - ) - expected = read_csv(StringIO(csv_data), header=header) - tm.assert_frame_equal(result, expected) - - -def test_default_delimiter(): - header = None - csv_data = """ -a,bbb -cc,dd""" - - fwf_data = """ -a \tbbb -cc\tdd """ - result = read_fwf(StringIO(fwf_data), widths=[3, 3], header=header, skiprows=[0]) - expected = read_csv(StringIO(csv_data), header=header) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("infer", [True, False]) -def test_fwf_compression(compression_only, infer): - data = """1111111111 - 2222222222 - 3333333333""".strip() - - compression = compression_only - extension = _compression_to_extension[compression] - - kwargs = {"widths": [5, 5], "names": ["one", "two"]} - expected = read_fwf(StringIO(data), **kwargs) - - data = bytes(data, encoding="utf-8") - - with tm.ensure_clean(filename="tmp." + extension) as path: - tm.write_to_compressed(compression, path, data) - - if infer is not None: - kwargs["compression"] = "infer" if infer else compression - - result = read_fwf(path, **kwargs) - tm.assert_frame_equal(result, expected) - - -def test_binary_mode(): - """ - read_fwf supports opening files in binary mode. - - GH 18035. - """ - data = """aas aas aas -bba bab b a""" - df_reference = DataFrame( - [["bba", "bab", "b a"]], columns=["aas", "aas.1", "aas.2"], index=[0] - ) - with tm.ensure_clean() as path: - Path(path).write_text(data) - with open(path, "rb") as file: - df = read_fwf(file) - file.seek(0) - tm.assert_frame_equal(df, df_reference) - - -@pytest.mark.parametrize("memory_map", [True, False]) -def test_encoding_mmap(memory_map): - """ - encoding should be working, even when using a memory-mapped file. - - GH 23254. 
- """ - encoding = "iso8859_1" - with tm.ensure_clean() as path: - Path(path).write_bytes(" 1 A Ä 2\n".encode(encoding)) - df = read_fwf( - path, - header=None, - widths=[2, 2, 2, 2], - encoding=encoding, - memory_map=memory_map, - ) - df_reference = DataFrame([[1, "A", "Ä", 2]]) - tm.assert_frame_equal(df, df_reference) - - -@pytest.mark.parametrize( - "colspecs, names, widths, index_col", - [ - ( - [(0, 6), (6, 12), (12, 18), (18, None)], - list("abcde"), - None, - None, - ), - ( - None, - list("abcde"), - [6] * 4, - None, - ), - ( - [(0, 6), (6, 12), (12, 18), (18, None)], - list("abcde"), - None, - True, - ), - ( - None, - list("abcde"), - [6] * 4, - False, - ), - ( - None, - list("abcde"), - [6] * 4, - True, - ), - ( - [(0, 6), (6, 12), (12, 18), (18, None)], - list("abcde"), - None, - False, - ), - ], -) -def test_len_colspecs_len_names(colspecs, names, widths, index_col): - # GH#40830 - data = """col1 col2 col3 col4 - bab ba 2""" - msg = "Length of colspecs must match length of names" - with pytest.raises(ValueError, match=msg): - read_fwf( - StringIO(data), - colspecs=colspecs, - names=names, - widths=widths, - index_col=index_col, - ) - - -@pytest.mark.parametrize( - "colspecs, names, widths, index_col, expected", - [ - ( - [(0, 6), (6, 12), (12, 18), (18, None)], - list("abc"), - None, - 0, - DataFrame( - index=["col1", "ba"], - columns=["a", "b", "c"], - data=[["col2", "col3", "col4"], ["b ba", "2", np.nan]], - ), - ), - ( - [(0, 6), (6, 12), (12, 18), (18, None)], - list("ab"), - None, - [0, 1], - DataFrame( - index=[["col1", "ba"], ["col2", "b ba"]], - columns=["a", "b"], - data=[["col3", "col4"], ["2", np.nan]], - ), - ), - ( - [(0, 6), (6, 12), (12, 18), (18, None)], - list("a"), - None, - [0, 1, 2], - DataFrame( - index=[["col1", "ba"], ["col2", "b ba"], ["col3", "2"]], - columns=["a"], - data=[["col4"], [np.nan]], - ), - ), - ( - None, - list("abc"), - [6] * 4, - 0, - DataFrame( - index=["col1", "ba"], - columns=["a", "b", "c"], - data=[["col2", "col3", "col4"], ["b ba", "2", np.nan]], - ), - ), - ( - None, - list("ab"), - [6] * 4, - [0, 1], - DataFrame( - index=[["col1", "ba"], ["col2", "b ba"]], - columns=["a", "b"], - data=[["col3", "col4"], ["2", np.nan]], - ), - ), - ( - None, - list("a"), - [6] * 4, - [0, 1, 2], - DataFrame( - index=[["col1", "ba"], ["col2", "b ba"], ["col3", "2"]], - columns=["a"], - data=[["col4"], [np.nan]], - ), - ), - ], -) -def test_len_colspecs_len_names_with_index_col( - colspecs, names, widths, index_col, expected -): - # GH#40830 - data = """col1 col2 col3 col4 - bab ba 2""" - result = read_fwf( - StringIO(data), - colspecs=colspecs, - names=names, - widths=widths, - index_col=index_col, - ) - tm.assert_frame_equal(result, expected) - - -def test_colspecs_with_comment(): - # GH 14135 - result = read_fwf( - StringIO("#\nA1K\n"), colspecs=[(1, 2), (2, 3)], comment="#", header=None - ) - expected = DataFrame([[1, "K"]], columns=[0, 1]) - tm.assert_frame_equal(result, expected) - - -def test_skip_rows_and_n_rows(): - # GH#44021 - data = """a\tb -1\t a -2\t b -3\t c -4\t d -5\t e -6\t f - """ - result = read_fwf(StringIO(data), nrows=4, skiprows=[2, 4]) - expected = DataFrame({"a": [1, 3, 5, 6], "b": ["a", "c", "e", "f"]}) - tm.assert_frame_equal(result, expected) - - -def test_skiprows_with_iterator(): - # GH#10261 - data = """0 -1 -2 -3 -4 -5 -6 -7 -8 -9 - """ - df_iter = read_fwf( - StringIO(data), - colspecs=[(0, 2)], - names=["a"], - iterator=True, - chunksize=2, - skiprows=[0, 1, 2, 6, 9], - ) - expected_frames = [ - DataFrame({"a": [3, 
4]}), - DataFrame({"a": [5, 7, 8]}, index=[2, 3, 4]), - DataFrame({"a": []}, dtype="object"), - ] - for i, result in enumerate(df_iter): - tm.assert_frame_equal(result, expected_frames[i]) - - -def test_names_and_infer_colspecs(): - # GH#45337 - data = """X Y Z - 959.0 345 22.2 - """ - result = read_fwf(StringIO(data), skiprows=1, usecols=[0, 2], names=["a", "b"]) - expected = DataFrame({"a": [959.0], "b": 22.2}) - tm.assert_frame_equal(result, expected) - - -def test_widths_and_usecols(): - # GH#46580 - data = """0 1 n -0.4100.1 -0 2 p 0.2 90.1 -0 3 n -0.3140.4""" - result = read_fwf( - StringIO(data), - header=None, - usecols=(0, 1, 3), - widths=(3, 5, 1, 5, 5), - index_col=False, - names=("c0", "c1", "c3"), - ) - expected = DataFrame( - { - "c0": 0, - "c1": [1, 2, 3], - "c3": [-0.4, 0.2, -0.3], - } - ) - tm.assert_frame_equal(result, expected) - - -def test_dtype_backend(string_storage, dtype_backend): - # GH#50289 - if string_storage == "python": - arr = StringArray(np.array(["a", "b"], dtype=np.object_)) - arr_na = StringArray(np.array([pd.NA, "a"], dtype=np.object_)) - else: - pa = pytest.importorskip("pyarrow") - arr = ArrowStringArray(pa.array(["a", "b"])) - arr_na = ArrowStringArray(pa.array([None, "a"])) - - data = """a b c d e f g h i -1 2.5 True a -3 4.5 False b True 6 7.5 a""" - with pd.option_context("mode.string_storage", string_storage): - result = read_fwf(StringIO(data), dtype_backend=dtype_backend) - - expected = DataFrame( - { - "a": pd.Series([1, 3], dtype="Int64"), - "b": pd.Series([2.5, 4.5], dtype="Float64"), - "c": pd.Series([True, False], dtype="boolean"), - "d": arr, - "e": pd.Series([pd.NA, True], dtype="boolean"), - "f": pd.Series([pd.NA, 6], dtype="Int64"), - "g": pd.Series([pd.NA, 7.5], dtype="Float64"), - "h": arr_na, - "i": pd.Series([pd.NA, pd.NA], dtype="Int64"), - } - ) - if dtype_backend == "pyarrow": - pa = pytest.importorskip("pyarrow") - from pandas.arrays import ArrowExtensionArray - - expected = DataFrame( - { - col: ArrowExtensionArray(pa.array(expected[col], from_pandas=True)) - for col in expected.columns - } - ) - expected["i"] = ArrowExtensionArray(pa.array([None, None])) - - tm.assert_frame_equal(result, expected) - - -def test_invalid_dtype_backend(): - msg = ( - "dtype_backend numpy is invalid, only 'numpy_nullable' and " - "'pyarrow' are allowed." 
- ) - with pytest.raises(ValueError, match=msg): - read_fwf("test", dtype_backend="numpy") - - -@pytest.mark.network -@tm.network( - url="ftp://ftp.ncdc.noaa.gov/pub/data/igra/igra2-station-list.txt", - check_before_test=True, -) -def test_url_urlopen(): - expected = pd.Index( - [ - "CC", - "Network", - "Code", - "StationId", - "Latitude", - "Longitude", - "Elev", - "dummy", - "StationName", - "From", - "To", - "Nrec", - ], - dtype="object", - ) - url = "ftp://ftp.ncdc.noaa.gov/pub/data/igra/igra2-station-list.txt" - with urlopen(url) as f: - result = read_fwf( - f, - widths=(2, 1, 3, 5, 9, 10, 7, 4, 30, 5, 5, 7), - names=( - "CC", - "Network", - "Code", - "StationId", - "Latitude", - "Longitude", - "Elev", - "dummy", - "StationName", - "From", - "To", - "Nrec", - ), - ).columns - - tm.assert_index_equal(result, expected) diff --git a/pandas/tests/io/parser/test_skiprows.py b/pandas/tests/io/parser/test_skiprows.py deleted file mode 100644 index c58e27aacfa00..0000000000000 --- a/pandas/tests/io/parser/test_skiprows.py +++ /dev/null @@ -1,288 +0,0 @@ -""" -Tests that skipped rows are properly handled during -parsing for all of the parsers defined in parsers.py -""" - -from datetime import datetime -from io import StringIO - -import numpy as np -import pytest - -from pandas.errors import EmptyDataError - -from pandas import ( - DataFrame, - Index, -) -import pandas._testing as tm - -# XFAIL ME PLS once hanging tests issues identified -pytestmark = pytest.mark.usefixtures("pyarrow_skip") - - -@pytest.mark.parametrize("skiprows", [list(range(6)), 6]) -def test_skip_rows_bug(all_parsers, skiprows): - # see gh-505 - parser = all_parsers - text = """#foo,a,b,c -#foo,a,b,c -#foo,a,b,c -#foo,a,b,c -#foo,a,b,c -#foo,a,b,c -1/1/2000,1.,2.,3. -1/2/2000,4,5,6 -1/3/2000,7,8,9 -""" - result = parser.read_csv( - StringIO(text), skiprows=skiprows, header=None, index_col=0, parse_dates=True - ) - index = Index( - [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], name=0 - ) - - expected = DataFrame( - np.arange(1.0, 10.0).reshape((3, 3)), columns=[1, 2, 3], index=index - ) - tm.assert_frame_equal(result, expected) - - -def test_deep_skip_rows(all_parsers): - # see gh-4382 - parser = all_parsers - data = "a,b,c\n" + "\n".join( - [",".join([str(i), str(i + 1), str(i + 2)]) for i in range(10)] - ) - condensed_data = "a,b,c\n" + "\n".join( - [",".join([str(i), str(i + 1), str(i + 2)]) for i in [0, 1, 2, 3, 4, 6, 8, 9]] - ) - - result = parser.read_csv(StringIO(data), skiprows=[6, 8]) - condensed_result = parser.read_csv(StringIO(condensed_data)) - tm.assert_frame_equal(result, condensed_result) - - -def test_skip_rows_blank(all_parsers): - # see gh-9832 - parser = all_parsers - text = """#foo,a,b,c -#foo,a,b,c - -#foo,a,b,c -#foo,a,b,c - -1/1/2000,1.,2.,3. 
-1/2/2000,4,5,6 -1/3/2000,7,8,9 -""" - data = parser.read_csv( - StringIO(text), skiprows=6, header=None, index_col=0, parse_dates=True - ) - index = Index( - [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], name=0 - ) - - expected = DataFrame( - np.arange(1.0, 10.0).reshape((3, 3)), columns=[1, 2, 3], index=index - ) - tm.assert_frame_equal(data, expected) - - -@pytest.mark.parametrize( - "data,kwargs,expected", - [ - ( - """id,text,num_lines -1,"line 11 -line 12",2 -2,"line 21 -line 22",2 -3,"line 31",1""", - {"skiprows": [1]}, - DataFrame( - [[2, "line 21\nline 22", 2], [3, "line 31", 1]], - columns=["id", "text", "num_lines"], - ), - ), - ( - "a,b,c\n~a\n b~,~e\n d~,~f\n f~\n1,2,~12\n 13\n 14~", - {"quotechar": "~", "skiprows": [2]}, - DataFrame([["a\n b", "e\n d", "f\n f"]], columns=["a", "b", "c"]), - ), - ( - ( - "Text,url\n~example\n " - "sentence\n one~,url1\n~" - "example\n sentence\n two~,url2\n~" - "example\n sentence\n three~,url3" - ), - {"quotechar": "~", "skiprows": [1, 3]}, - DataFrame([["example\n sentence\n two", "url2"]], columns=["Text", "url"]), - ), - ], -) -def test_skip_row_with_newline(all_parsers, data, kwargs, expected): - # see gh-12775 and gh-10911 - parser = all_parsers - result = parser.read_csv(StringIO(data), **kwargs) - tm.assert_frame_equal(result, expected) - - -def test_skip_row_with_quote(all_parsers): - # see gh-12775 and gh-10911 - parser = all_parsers - data = """id,text,num_lines -1,"line '11' line 12",2 -2,"line '21' line 22",2 -3,"line '31' line 32",1""" - - exp_data = [[2, "line '21' line 22", 2], [3, "line '31' line 32", 1]] - expected = DataFrame(exp_data, columns=["id", "text", "num_lines"]) - - result = parser.read_csv(StringIO(data), skiprows=[1]) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - "data,exp_data", - [ - ( - """id,text,num_lines -1,"line \n'11' line 12",2 -2,"line \n'21' line 22",2 -3,"line \n'31' line 32",1""", - [[2, "line \n'21' line 22", 2], [3, "line \n'31' line 32", 1]], - ), - ( - """id,text,num_lines -1,"line '11\n' line 12",2 -2,"line '21\n' line 22",2 -3,"line '31\n' line 32",1""", - [[2, "line '21\n' line 22", 2], [3, "line '31\n' line 32", 1]], - ), - ( - """id,text,num_lines -1,"line '11\n' \r\tline 12",2 -2,"line '21\n' \r\tline 22",2 -3,"line '31\n' \r\tline 32",1""", - [[2, "line '21\n' \r\tline 22", 2], [3, "line '31\n' \r\tline 32", 1]], - ), - ], -) -def test_skip_row_with_newline_and_quote(all_parsers, data, exp_data): - # see gh-12775 and gh-10911 - parser = all_parsers - result = parser.read_csv(StringIO(data), skiprows=[1]) - - expected = DataFrame(exp_data, columns=["id", "text", "num_lines"]) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - "lineterminator", ["\n", "\r\n", "\r"] # "LF" # "CRLF" # "CR" -) -def test_skiprows_lineterminator(all_parsers, lineterminator, request): - # see gh-9079 - parser = all_parsers - data = "\n".join( - [ - "SMOSMANIA ThetaProbe-ML2X ", - "2007/01/01 01:00 0.2140 U M ", - "2007/01/01 02:00 0.2141 M O ", - "2007/01/01 04:00 0.2142 D M ", - ] - ) - expected = DataFrame( - [ - ["2007/01/01", "01:00", 0.2140, "U", "M"], - ["2007/01/01", "02:00", 0.2141, "M", "O"], - ["2007/01/01", "04:00", 0.2142, "D", "M"], - ], - columns=["date", "time", "var", "flag", "oflag"], - ) - - if parser.engine == "python" and lineterminator == "\r": - mark = pytest.mark.xfail(reason="'CR' not respect with the Python parser yet") - request.node.add_marker(mark) - - data = data.replace("\n", lineterminator) - result = 
parser.read_csv( - StringIO(data), - skiprows=1, - delim_whitespace=True, - names=["date", "time", "var", "flag", "oflag"], - ) - tm.assert_frame_equal(result, expected) - - -def test_skiprows_infield_quote(all_parsers): - # see gh-14459 - parser = all_parsers - data = 'a"\nb"\na\n1' - expected = DataFrame({"a": [1]}) - - result = parser.read_csv(StringIO(data), skiprows=2) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - "kwargs,expected", - [ - ({}, DataFrame({"1": [3, 5]})), - ({"header": 0, "names": ["foo"]}, DataFrame({"foo": [3, 5]})), - ], -) -def test_skip_rows_callable(all_parsers, kwargs, expected): - parser = all_parsers - data = "a\n1\n2\n3\n4\n5" - - result = parser.read_csv(StringIO(data), skiprows=lambda x: x % 2 == 0, **kwargs) - tm.assert_frame_equal(result, expected) - - -def test_skip_rows_callable_not_in(all_parsers): - parser = all_parsers - data = "0,a\n1,b\n2,c\n3,d\n4,e" - expected = DataFrame([[1, "b"], [3, "d"]]) - - result = parser.read_csv( - StringIO(data), header=None, skiprows=lambda x: x not in [1, 3] - ) - tm.assert_frame_equal(result, expected) - - -def test_skip_rows_skip_all(all_parsers): - parser = all_parsers - data = "a\n1\n2\n3\n4\n5" - msg = "No columns to parse from file" - - with pytest.raises(EmptyDataError, match=msg): - parser.read_csv(StringIO(data), skiprows=lambda x: True) - - -def test_skip_rows_bad_callable(all_parsers): - msg = "by zero" - parser = all_parsers - data = "a\n1\n2\n3\n4\n5" - - with pytest.raises(ZeroDivisionError, match=msg): - parser.read_csv(StringIO(data), skiprows=lambda x: 1 / 0) - - -def test_skip_rows_and_n_rows(all_parsers): - # GH#44021 - data = """a,b -1,a -2,b -3,c -4,d -5,e -6,f -7,g -8,h -""" - parser = all_parsers - result = parser.read_csv(StringIO(data), nrows=5, skiprows=[2, 4, 6]) - expected = DataFrame({"a": [1, 3, 5, 7, 8], "b": ["a", "c", "e", "g", "h"]}) - tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_textreader.py b/pandas/tests/io/parser/test_textreader.py deleted file mode 100644 index f150ed3903443..0000000000000 --- a/pandas/tests/io/parser/test_textreader.py +++ /dev/null @@ -1,343 +0,0 @@ -""" -Tests the TextReader class in parsers.pyx, which -is integral to the C engine in parsers.py -""" -from io import ( - BytesIO, - StringIO, -) - -import numpy as np -import pytest - -import pandas._libs.parsers as parser -from pandas._libs.parsers import TextReader - -from pandas import DataFrame -import pandas._testing as tm - -from pandas.io.parsers import ( - TextFileReader, - read_csv, -) -from pandas.io.parsers.c_parser_wrapper import ensure_dtype_objs - - -class TestTextReader: - @pytest.fixture - def csv_path(self, datapath): - return datapath("io", "data", "csv", "test1.csv") - - def test_file_handle(self, csv_path): - with open(csv_path, "rb") as f: - reader = TextReader(f) - reader.read() - - def test_file_handle_mmap(self, csv_path): - # this was never using memory_map=True - with open(csv_path, "rb") as f: - reader = TextReader(f, header=None) - reader.read() - - def test_StringIO(self, csv_path): - with open(csv_path, "rb") as f: - text = f.read() - src = BytesIO(text) - reader = TextReader(src, header=None) - reader.read() - - def test_string_factorize(self): - # should this be optional? 
- data = "a\nb\na\nb\na" - reader = TextReader(StringIO(data), header=None) - result = reader.read() - assert len(set(map(id, result[0]))) == 2 - - def test_skipinitialspace(self): - data = "a, b\na, b\na, b\na, b" - - reader = TextReader(StringIO(data), skipinitialspace=True, header=None) - result = reader.read() - - tm.assert_numpy_array_equal( - result[0], np.array(["a", "a", "a", "a"], dtype=np.object_) - ) - tm.assert_numpy_array_equal( - result[1], np.array(["b", "b", "b", "b"], dtype=np.object_) - ) - - def test_parse_booleans(self): - data = "True\nFalse\nTrue\nTrue" - - reader = TextReader(StringIO(data), header=None) - result = reader.read() - - assert result[0].dtype == np.bool_ - - def test_delimit_whitespace(self): - data = 'a b\na\t\t "b"\n"a"\t \t b' - - reader = TextReader(StringIO(data), delim_whitespace=True, header=None) - result = reader.read() - - tm.assert_numpy_array_equal( - result[0], np.array(["a", "a", "a"], dtype=np.object_) - ) - tm.assert_numpy_array_equal( - result[1], np.array(["b", "b", "b"], dtype=np.object_) - ) - - def test_embedded_newline(self): - data = 'a\n"hello\nthere"\nthis' - - reader = TextReader(StringIO(data), header=None) - result = reader.read() - - expected = np.array(["a", "hello\nthere", "this"], dtype=np.object_) - tm.assert_numpy_array_equal(result[0], expected) - - def test_euro_decimal(self): - data = "12345,67\n345,678" - - reader = TextReader(StringIO(data), delimiter=":", decimal=",", header=None) - result = reader.read() - - expected = np.array([12345.67, 345.678]) - tm.assert_almost_equal(result[0], expected) - - def test_integer_thousands(self): - data = "123,456\n12,500" - - reader = TextReader(StringIO(data), delimiter=":", thousands=",", header=None) - result = reader.read() - - expected = np.array([123456, 12500], dtype=np.int64) - tm.assert_almost_equal(result[0], expected) - - def test_integer_thousands_alt(self): - data = "123.456\n12.500" - - reader = TextFileReader( - StringIO(data), delimiter=":", thousands=".", header=None - ) - result = reader.read() - - expected = DataFrame([123456, 12500]) - tm.assert_frame_equal(result, expected) - - def test_skip_bad_lines(self, capsys): - # too many lines, see #2430 for why - data = "a:b:c\nd:e:f\ng:h:i\nj:k:l:m\nl:m:n\no:p:q:r" - - reader = TextReader(StringIO(data), delimiter=":", header=None) - msg = r"Error tokenizing data\. 
C error: Expected 3 fields in line 4, saw 4" - with pytest.raises(parser.ParserError, match=msg): - reader.read() - - reader = TextReader( - StringIO(data), delimiter=":", header=None, on_bad_lines=2 # Skip - ) - result = reader.read() - expected = { - 0: np.array(["a", "d", "g", "l"], dtype=object), - 1: np.array(["b", "e", "h", "m"], dtype=object), - 2: np.array(["c", "f", "i", "n"], dtype=object), - } - assert_array_dicts_equal(result, expected) - - reader = TextReader( - StringIO(data), delimiter=":", header=None, on_bad_lines=1 # Warn - ) - reader.read() - captured = capsys.readouterr() - - assert "Skipping line 4" in captured.err - assert "Skipping line 6" in captured.err - - def test_header_not_enough_lines(self): - data = "skip this\nskip this\na,b,c\n1,2,3\n4,5,6" - - reader = TextReader(StringIO(data), delimiter=",", header=2) - header = reader.header - expected = [["a", "b", "c"]] - assert header == expected - - recs = reader.read() - expected = { - 0: np.array([1, 4], dtype=np.int64), - 1: np.array([2, 5], dtype=np.int64), - 2: np.array([3, 6], dtype=np.int64), - } - assert_array_dicts_equal(recs, expected) - - def test_escapechar(self): - data = '\\"hello world"\n\\"hello world"\n\\"hello world"' - - reader = TextReader(StringIO(data), delimiter=",", header=None, escapechar="\\") - result = reader.read() - expected = {0: np.array(['"hello world"'] * 3, dtype=object)} - assert_array_dicts_equal(result, expected) - - def test_eof_has_eol(self): - # handling of new line at EOF - pass - - def test_na_substitution(self): - pass - - def test_numpy_string_dtype(self): - data = """\ -a,1 -aa,2 -aaa,3 -aaaa,4 -aaaaa,5""" - - def _make_reader(**kwds): - if "dtype" in kwds: - kwds["dtype"] = ensure_dtype_objs(kwds["dtype"]) - return TextReader(StringIO(data), delimiter=",", header=None, **kwds) - - reader = _make_reader(dtype="S5,i4") - result = reader.read() - - assert result[0].dtype == "S5" - - ex_values = np.array(["a", "aa", "aaa", "aaaa", "aaaaa"], dtype="S5") - assert (result[0] == ex_values).all() - assert result[1].dtype == "i4" - - reader = _make_reader(dtype="S4") - result = reader.read() - assert result[0].dtype == "S4" - ex_values = np.array(["a", "aa", "aaa", "aaaa", "aaaa"], dtype="S4") - assert (result[0] == ex_values).all() - assert result[1].dtype == "S4" - - def test_pass_dtype(self): - data = """\ -one,two -1,a -2,b -3,c -4,d""" - - def _make_reader(**kwds): - if "dtype" in kwds: - kwds["dtype"] = ensure_dtype_objs(kwds["dtype"]) - return TextReader(StringIO(data), delimiter=",", **kwds) - - reader = _make_reader(dtype={"one": "u1", 1: "S1"}) - result = reader.read() - assert result[0].dtype == "u1" - assert result[1].dtype == "S1" - - reader = _make_reader(dtype={"one": np.uint8, 1: object}) - result = reader.read() - assert result[0].dtype == "u1" - assert result[1].dtype == "O" - - reader = _make_reader(dtype={"one": np.dtype("u1"), 1: np.dtype("O")}) - result = reader.read() - assert result[0].dtype == "u1" - assert result[1].dtype == "O" - - def test_usecols(self): - data = """\ -a,b,c -1,2,3 -4,5,6 -7,8,9 -10,11,12""" - - def _make_reader(**kwds): - return TextReader(StringIO(data), delimiter=",", **kwds) - - reader = _make_reader(usecols=(1, 2)) - result = reader.read() - - exp = _make_reader().read() - assert len(result) == 2 - assert (result[1] == exp[1]).all() - assert (result[2] == exp[2]).all() - - @pytest.mark.parametrize( - "text, kwargs", - [ - ("a,b,c\r1,2,3\r4,5,6\r7,8,9\r10,11,12", {"delimiter": ","}), - ( - "a b c\r1 2 3\r4 5 6\r7 8 9\r10 11 12", - 
{"delim_whitespace": True}, - ), - ("a,b,c\r1,2,3\r4,5,6\r,88,9\r10,11,12", {"delimiter": ","}), - ( - ( - "A,B,C,D,E,F,G,H,I,J,K,L,M,N,O\r" - "AAAAA,BBBBB,0,0,0,0,0,0,0,0,0,0,0,0,0\r" - ",BBBBB,0,0,0,0,0,0,0,0,0,0,0,0,0" - ), - {"delimiter": ","}, - ), - ("A B C\r 2 3\r4 5 6", {"delim_whitespace": True}), - ("A B C\r2 3\r4 5 6", {"delim_whitespace": True}), - ], - ) - def test_cr_delimited(self, text, kwargs): - nice_text = text.replace("\r", "\r\n") - result = TextReader(StringIO(text), **kwargs).read() - expected = TextReader(StringIO(nice_text), **kwargs).read() - assert_array_dicts_equal(result, expected) - - def test_empty_field_eof(self): - data = "a,b,c\n1,2,3\n4,," - - result = TextReader(StringIO(data), delimiter=",").read() - - expected = { - 0: np.array([1, 4], dtype=np.int64), - 1: np.array(["2", ""], dtype=object), - 2: np.array(["3", ""], dtype=object), - } - assert_array_dicts_equal(result, expected) - - # GH5664 - a = DataFrame([["b"], [np.nan]], columns=["a"], index=["a", "c"]) - b = DataFrame([[1, 1, 1, 0], [1, 1, 1, 0]], columns=list("abcd"), index=[1, 1]) - c = DataFrame( - [ - [1, 2, 3, 4], - [6, np.nan, np.nan, np.nan], - [8, 9, 10, 11], - [13, 14, np.nan, np.nan], - ], - columns=list("abcd"), - index=[0, 5, 7, 12], - ) - - for _ in range(100): - df = read_csv(StringIO("a,b\nc\n"), skiprows=0, names=["a"], engine="c") - tm.assert_frame_equal(df, a) - - df = read_csv( - StringIO("1,1,1,1,0\n" * 2 + "\n" * 2), names=list("abcd"), engine="c" - ) - tm.assert_frame_equal(df, b) - - df = read_csv( - StringIO("0,1,2,3,4\n5,6\n7,8,9,10,11\n12,13,14"), - names=list("abcd"), - engine="c", - ) - tm.assert_frame_equal(df, c) - - def test_empty_csv_input(self): - # GH14867 - with read_csv( - StringIO(), chunksize=20, header=None, names=["a", "b", "c"] - ) as df: - assert isinstance(df, TextFileReader) - - -def assert_array_dicts_equal(left, right): - for k, v in left.items(): - tm.assert_numpy_array_equal(np.asarray(v), np.asarray(right[k])) diff --git a/pandas/tests/io/parser/test_unsupported.py b/pandas/tests/io/parser/test_unsupported.py deleted file mode 100644 index 1a9d99b0b5c1f..0000000000000 --- a/pandas/tests/io/parser/test_unsupported.py +++ /dev/null @@ -1,212 +0,0 @@ -""" -Tests that features that are currently unsupported in -either the Python or C parser are actually enforced -and are clearly communicated to the user. - -Ultimately, the goal is to remove test cases from this -test suite as new feature support is added to the parsers. 
-""" -from io import StringIO -import os -from pathlib import Path - -import pytest - -from pandas.compat import ( - is_ci_environment, - is_platform_mac, - is_platform_windows, -) -from pandas.errors import ParserError - -import pandas._testing as tm - -from pandas.io.parsers import read_csv -import pandas.io.parsers.readers as parsers - - -@pytest.fixture(params=["python", "python-fwf"], ids=lambda val: val) -def python_engine(request): - return request.param - - -class TestUnsupportedFeatures: - def test_mangle_dupe_cols_false(self): - # see gh-12935 - data = "a b c\n1 2 3" - - for engine in ("c", "python"): - with pytest.raises(TypeError, match="unexpected keyword"): - read_csv(StringIO(data), engine=engine, mangle_dupe_cols=True) - - def test_c_engine(self): - # see gh-6607 - data = "a b c\n1 2 3" - msg = "does not support" - - # specify C engine with unsupported options (raise) - with pytest.raises(ValueError, match=msg): - read_csv(StringIO(data), engine="c", sep=None, delim_whitespace=False) - with pytest.raises(ValueError, match=msg): - read_csv(StringIO(data), engine="c", sep=r"\s") - with pytest.raises(ValueError, match=msg): - read_csv(StringIO(data), engine="c", sep="\t", quotechar=chr(128)) - with pytest.raises(ValueError, match=msg): - read_csv(StringIO(data), engine="c", skipfooter=1) - - # specify C-unsupported options without python-unsupported options - with tm.assert_produces_warning(parsers.ParserWarning): - read_csv(StringIO(data), sep=None, delim_whitespace=False) - with tm.assert_produces_warning(parsers.ParserWarning): - read_csv(StringIO(data), sep=r"\s") - with tm.assert_produces_warning(parsers.ParserWarning): - read_csv(StringIO(data), sep="\t", quotechar=chr(128)) - with tm.assert_produces_warning(parsers.ParserWarning): - read_csv(StringIO(data), skipfooter=1) - - text = """ A B C D E -one two three four -a b 10.0032 5 -0.5109 -2.3358 -0.4645 0.05076 0.3640 -a q 20 4 0.4473 1.4152 0.2834 1.00661 0.1744 -x q 30 3 -0.6662 -0.5243 -0.3580 0.89145 2.5838""" - msg = "Error tokenizing data" - - with pytest.raises(ParserError, match=msg): - read_csv(StringIO(text), sep="\\s+") - with pytest.raises(ParserError, match=msg): - read_csv(StringIO(text), engine="c", sep="\\s+") - - msg = "Only length-1 thousands markers supported" - data = """A|B|C -1|2,334|5 -10|13|10. 
-""" - with pytest.raises(ValueError, match=msg): - read_csv(StringIO(data), thousands=",,") - with pytest.raises(ValueError, match=msg): - read_csv(StringIO(data), thousands="") - - msg = "Only length-1 line terminators supported" - data = "a,b,c~~1,2,3~~4,5,6" - with pytest.raises(ValueError, match=msg): - read_csv(StringIO(data), lineterminator="~~") - - def test_python_engine(self, python_engine): - from pandas.io.parsers.readers import _python_unsupported as py_unsupported - - data = """1,2,3,, -1,2,3,4, -1,2,3,4,5 -1,2,,, -1,2,3,4,""" - - for default in py_unsupported: - msg = ( - f"The {repr(default)} option is not " - f"supported with the {repr(python_engine)} engine" - ) - - kwargs = {default: object()} - with pytest.raises(ValueError, match=msg): - read_csv(StringIO(data), engine=python_engine, **kwargs) - - def test_python_engine_file_no_iter(self, python_engine): - # see gh-16530 - class NoNextBuffer: - def __init__(self, csv_data) -> None: - self.data = csv_data - - def __next__(self): - return self.data.__next__() - - def read(self): - return self.data - - def readline(self): - return self.data - - data = "a\n1" - msg = "'NoNextBuffer' object is not iterable|argument 1 must be an iterator" - - with pytest.raises(TypeError, match=msg): - read_csv(NoNextBuffer(data), engine=python_engine) - - def test_pyarrow_engine(self): - from pandas.io.parsers.readers import _pyarrow_unsupported as pa_unsupported - - data = """1,2,3,, - 1,2,3,4, - 1,2,3,4,5 - 1,2,,, - 1,2,3,4,""" - - for default in pa_unsupported: - msg = ( - f"The {repr(default)} option is not " - f"supported with the 'pyarrow' engine" - ) - kwargs = {default: object()} - default_needs_bool = {"warn_bad_lines", "error_bad_lines"} - if default == "dialect": - kwargs[default] = "excel" # test a random dialect - elif default in default_needs_bool: - kwargs[default] = True - elif default == "on_bad_lines": - kwargs[default] = "warn" - with pytest.raises(ValueError, match=msg): - read_csv(StringIO(data), engine="pyarrow", **kwargs) - - def test_on_bad_lines_callable_python_only(self, all_parsers): - # GH 5686 - sio = StringIO("a,b\n1,2") - bad_lines_func = lambda x: x - parser = all_parsers - if all_parsers.engine != "python": - msg = "on_bad_line can only be a callable function if engine='python'" - with pytest.raises(ValueError, match=msg): - parser.read_csv(sio, on_bad_lines=bad_lines_func) - else: - parser.read_csv(sio, on_bad_lines=bad_lines_func) - - -def test_close_file_handle_on_invalid_usecols(all_parsers): - # GH 45384 - parser = all_parsers - - error = ValueError - if parser.engine == "pyarrow": - pyarrow = pytest.importorskip("pyarrow") - error = pyarrow.lib.ArrowKeyError - if is_ci_environment() and (is_platform_windows() or is_platform_mac()): - # GH#45547 causes timeouts on windows/mac builds - pytest.skip("GH#45547 causing timeouts on windows/mac builds 2022-01-22") - - with tm.ensure_clean("test.csv") as fname: - Path(fname).write_text("col1,col2\na,b\n1,2") - with tm.assert_produces_warning(False): - with pytest.raises(error, match="col3"): - parser.read_csv(fname, usecols=["col1", "col2", "col3"]) - # unlink fails on windows if file handles still point to it - os.unlink(fname) - - -def test_invalid_file_inputs(request, all_parsers): - # GH#45957 - parser = all_parsers - if parser.engine == "python": - request.node.add_marker( - pytest.mark.xfail(reason=f"{parser.engine} engine supports lists.") - ) - - with pytest.raises(ValueError, match="Invalid"): - parser.read_csv([]) - - -def 
-    parser = all_parsers
-    msg = (
-        "dtype_backend numpy is invalid, only 'numpy_nullable' and "
-        "'pyarrow' are allowed."
-    )
-    with pytest.raises(ValueError, match=msg):
-        parser.read_csv("test", dtype_backend="numpy")
diff --git a/pandas/tests/io/parser/test_upcast.py b/pandas/tests/io/parser/test_upcast.py
deleted file mode 100644
index 558822b84620a..0000000000000
--- a/pandas/tests/io/parser/test_upcast.py
+++ /dev/null
@@ -1,107 +0,0 @@
-import numpy as np
-import pytest
-
-from pandas._libs.parsers import (
-    _maybe_upcast,
-    na_values,
-)
-import pandas.util._test_decorators as td
-
-import pandas as pd
-from pandas import NA
-import pandas._testing as tm
-from pandas.core.arrays import (
-    ArrowStringArray,
-    BooleanArray,
-    FloatingArray,
-    IntegerArray,
-    StringArray,
-)
-
-
-def test_maybe_upcast(any_real_numpy_dtype):
-    # GH#36712
-
-    dtype = np.dtype(any_real_numpy_dtype)
-    na_value = na_values[dtype]
-    arr = np.array([1, 2, na_value], dtype=dtype)
-    result = _maybe_upcast(arr, use_dtype_backend=True)
-
-    expected_mask = np.array([False, False, True])
-    if issubclass(dtype.type, np.integer):
-        expected = IntegerArray(arr, mask=expected_mask)
-    else:
-        expected = FloatingArray(arr, mask=expected_mask)
-
-    tm.assert_extension_array_equal(result, expected)
-
-
-def test_maybe_upcast_no_na(any_real_numpy_dtype):
-    # GH#36712
-    if any_real_numpy_dtype == "float32":
-        pytest.skip()
-
-    arr = np.array([1, 2, 3], dtype=any_real_numpy_dtype)
-    result = _maybe_upcast(arr, use_dtype_backend=True)
-
-    expected_mask = np.array([False, False, False])
-    if issubclass(np.dtype(any_real_numpy_dtype).type, np.integer):
-        expected = IntegerArray(arr, mask=expected_mask)
-    else:
-        expected = FloatingArray(arr, mask=expected_mask)
-
-    tm.assert_extension_array_equal(result, expected)
-
-
-def test_maybe_upcaste_bool():
-    # GH#36712
-    dtype = np.bool_
-    na_value = na_values[dtype]
-    arr = np.array([True, False, na_value], dtype="uint8").view(dtype)
-    result = _maybe_upcast(arr, use_dtype_backend=True)
-
-    expected_mask = np.array([False, False, True])
-    expected = BooleanArray(arr, mask=expected_mask)
-    tm.assert_extension_array_equal(result, expected)
-
-
-def test_maybe_upcaste_bool_no_nan():
-    # GH#36712
-    dtype = np.bool_
-    arr = np.array([True, False, False], dtype="uint8").view(dtype)
-    result = _maybe_upcast(arr, use_dtype_backend=True)
-
-    expected_mask = np.array([False, False, False])
-    expected = BooleanArray(arr, mask=expected_mask)
-    tm.assert_extension_array_equal(result, expected)
-
-
-def test_maybe_upcaste_all_nan():
-    # GH#36712
-    dtype = np.int64
-    na_value = na_values[dtype]
-    arr = np.array([na_value, na_value], dtype=dtype)
-    result = _maybe_upcast(arr, use_dtype_backend=True)
-
-    expected_mask = np.array([True, True])
-    expected = IntegerArray(arr, mask=expected_mask)
-    tm.assert_extension_array_equal(result, expected)
-
-
-@td.skip_if_no("pyarrow")
-@pytest.mark.parametrize("val", [na_values[np.object_], "c"])
-def test_maybe_upcast_object(val, string_storage):
-    # GH#36712
-    import pyarrow as pa
-
-    with pd.option_context("mode.string_storage", string_storage):
-        arr = np.array(["a", "b", val], dtype=np.object_)
-        result = _maybe_upcast(arr, use_dtype_backend=True)
-
-        if string_storage == "python":
-            exp_val = "c" if val == "c" else NA
-            expected = StringArray(np.array(["a", "b", exp_val], dtype=np.object_))
-        else:
-            exp_val = "c" if val == "c" else None
-            expected = ArrowStringArray(pa.array(["a", "b", exp_val]))
-
-        tm.assert_extension_array_equal(result, expected)
diff --git a/pandas/tests/io/parser/usecols/__init__.py b/pandas/tests/io/parser/usecols/__init__.py
deleted file mode 100644
index e69de29bb2d1d..0000000000000
diff --git a/pandas/tests/io/parser/usecols/test_parse_dates.py b/pandas/tests/io/parser/usecols/test_parse_dates.py
deleted file mode 100644
index 32231cbbdda64..0000000000000
--- a/pandas/tests/io/parser/usecols/test_parse_dates.py
+++ /dev/null
@@ -1,159 +0,0 @@
-"""
-Tests the usecols functionality during parsing
-for all of the parsers defined in parsers.py
-"""
-from io import StringIO
-
-import pytest
-
-from pandas import (
-    DataFrame,
-    Index,
-    Timestamp,
-)
-import pandas._testing as tm
-
-_msg_validate_usecols_arg = (
-    "'usecols' must either be list-like "
-    "of all strings, all unicode, all "
-    "integers or a callable."
-)
-_msg_validate_usecols_names = (
-    "Usecols do not match columns, columns expected but not found: {0}"
-)
-
-# TODO(1.4): Change these to xfails whenever parse_dates support (which was
-# intentionally disabled to keep small PR sizes) is added back
-pytestmark = pytest.mark.usefixtures("pyarrow_skip")
-
-
-@pytest.mark.parametrize("usecols", [[0, 2, 3], [3, 0, 2]])
-def test_usecols_with_parse_dates(all_parsers, usecols):
-    # see gh-9755
-    data = """a,b,c,d,e
-0,1,2014-01-01,09:00,4
-0,1,2014-01-02,10:00,4"""
-    parser = all_parsers
-    parse_dates = [[1, 2]]
-
-    cols = {
-        "a": [0, 0],
-        "c_d": [Timestamp("2014-01-01 09:00:00"), Timestamp("2014-01-02 10:00:00")],
-    }
-    expected = DataFrame(cols, columns=["c_d", "a"])
-    result = parser.read_csv(StringIO(data), usecols=usecols, parse_dates=parse_dates)
-    tm.assert_frame_equal(result, expected)
-
-
-def test_usecols_with_parse_dates2(all_parsers):
-    # see gh-13604
-    parser = all_parsers
-    data = """2008-02-07 09:40,1032.43
-2008-02-07 09:50,1042.54
-2008-02-07 10:00,1051.65"""
-
-    names = ["date", "values"]
-    usecols = names[:]
-    parse_dates = [0]
-
-    index = Index(
-        [
-            Timestamp("2008-02-07 09:40"),
-            Timestamp("2008-02-07 09:50"),
-            Timestamp("2008-02-07 10:00"),
-        ],
-        name="date",
-    )
-    cols = {"values": [1032.43, 1042.54, 1051.65]}
-    expected = DataFrame(cols, index=index)
-
-    result = parser.read_csv(
-        StringIO(data),
-        parse_dates=parse_dates,
-        index_col=0,
-        usecols=usecols,
-        header=None,
-        names=names,
-    )
-    tm.assert_frame_equal(result, expected)
-
-
-def test_usecols_with_parse_dates3(all_parsers):
-    # see gh-14792
-    parser = all_parsers
-    data = """a,b,c,d,e,f,g,h,i,j
-2016/09/21,1,1,2,3,4,5,6,7,8"""
-
-    usecols = list("abcdefghij")
-    parse_dates = [0]
-
-    cols = {
-        "a": Timestamp("2016-09-21").as_unit("ns"),
-        "b": [1],
-        "c": [1],
-        "d": [2],
-        "e": [3],
-        "f": [4],
-        "g": [5],
-        "h": [6],
-        "i": [7],
-        "j": [8],
-    }
-    expected = DataFrame(cols, columns=usecols)
-
-    result = parser.read_csv(StringIO(data), usecols=usecols, parse_dates=parse_dates)
-    tm.assert_frame_equal(result, expected)
-
-
-def test_usecols_with_parse_dates4(all_parsers):
-    data = "a,b,c,d,e,f,g,h,i,j\n2016/09/21,1,1,2,3,4,5,6,7,8"
-    usecols = list("abcdefghij")
-    parse_dates = [[0, 1]]
-    parser = all_parsers
-
-    cols = {
-        "a_b": "2016/09/21 1",
-        "c": [1],
-        "d": [2],
-        "e": [3],
-        "f": [4],
-        "g": [5],
-        "h": [6],
-        "i": [7],
-        "j": [8],
-    }
-    expected = DataFrame(cols, columns=["a_b"] + list("cdefghij"))
-
-    result = parser.read_csv(
-        StringIO(data),
-        usecols=usecols,
-        parse_dates=parse_dates,
-    )
-    tm.assert_frame_equal(result, expected)
-
-
-@pytest.mark.parametrize("usecols", [[0, 2, 3], [3, 0, 2]])
-@pytest.mark.parametrize(
-    "names",
-    [
-        list("abcde"),  # Names span all columns in original data.
-        list("acd"),  # Names span only the selected columns.
-    ],
-)
-def test_usecols_with_parse_dates_and_names(all_parsers, usecols, names):
-    # see gh-9755
-    s = """0,1,2014-01-01,09:00,4
-0,1,2014-01-02,10:00,4"""
-    parse_dates = [[1, 2]]
-    parser = all_parsers
-
-    cols = {
-        "a": [0, 0],
-        "c_d": [Timestamp("2014-01-01 09:00:00"), Timestamp("2014-01-02 10:00:00")],
-    }
-    expected = DataFrame(cols, columns=["c_d", "a"])
-
-    result = parser.read_csv(
-        StringIO(s), names=names, parse_dates=parse_dates, usecols=usecols
-    )
-    tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/io/parser/usecols/test_strings.py b/pandas/tests/io/parser/usecols/test_strings.py
deleted file mode 100644
index 8cecf1fc981ee..0000000000000
--- a/pandas/tests/io/parser/usecols/test_strings.py
+++ /dev/null
@@ -1,97 +0,0 @@
-"""
-Tests the usecols functionality during parsing
-for all of the parsers defined in parsers.py
-"""
-from io import StringIO
-
-import pytest
-
-from pandas import DataFrame
-import pandas._testing as tm
-
-_msg_validate_usecols_arg = (
-    "'usecols' must either be list-like "
-    "of all strings, all unicode, all "
-    "integers or a callable."
-)
-_msg_validate_usecols_names = (
-    "Usecols do not match columns, columns expected but not found: {0}"
-)
-
-
-def test_usecols_with_unicode_strings(all_parsers):
-    # see gh-13219
-    data = """AAA,BBB,CCC,DDD
-0.056674973,8,True,a
-2.613230982,2,False,b
-3.568935038,7,False,a"""
-    parser = all_parsers
-
-    exp_data = {
-        "AAA": {
-            0: 0.056674972999999997,
-            1: 2.6132309819999997,
-            2: 3.5689350380000002,
-        },
-        "BBB": {0: 8, 1: 2, 2: 7},
-    }
-    expected = DataFrame(exp_data)
-
-    result = parser.read_csv(StringIO(data), usecols=["AAA", "BBB"])
-    tm.assert_frame_equal(result, expected)
-
-
-def test_usecols_with_single_byte_unicode_strings(all_parsers):
-    # see gh-13219
-    data = """A,B,C,D
-0.056674973,8,True,a
-2.613230982,2,False,b
-3.568935038,7,False,a"""
-    parser = all_parsers
-
-    exp_data = {
-        "A": {
-            0: 0.056674972999999997,
-            1: 2.6132309819999997,
-            2: 3.5689350380000002,
-        },
-        "B": {0: 8, 1: 2, 2: 7},
-    }
-    expected = DataFrame(exp_data)
-
-    result = parser.read_csv(StringIO(data), usecols=["A", "B"])
-    tm.assert_frame_equal(result, expected)
-
-
-@pytest.mark.parametrize("usecols", [["AAA", b"BBB"], [b"AAA", "BBB"]])
-def test_usecols_with_mixed_encoding_strings(all_parsers, usecols):
-    data = """AAA,BBB,CCC,DDD
-0.056674973,8,True,a
-2.613230982,2,False,b
-3.568935038,7,False,a"""
-    parser = all_parsers
-
-    with pytest.raises(ValueError, match=_msg_validate_usecols_arg):
-        parser.read_csv(StringIO(data), usecols=usecols)
-
-
-@pytest.mark.parametrize("usecols", [["あああ", "いい"], ["あああ", "いい"]])
-def test_usecols_with_multi_byte_characters(all_parsers, usecols):
-    data = """あああ,いい,ううう,ええええ
-0.056674973,8,True,a
-2.613230982,2,False,b
-3.568935038,7,False,a"""
-    parser = all_parsers
-
-    exp_data = {
-        "あああ": {
-            0: 0.056674972999999997,
-            1: 2.6132309819999997,
-            2: 3.5689350380000002,
-        },
-        "いい": {0: 8, 1: 2, 2: 7},
-    }
-    expected = DataFrame(exp_data)
-
-    result = parser.read_csv(StringIO(data), usecols=usecols)
-    tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/io/parser/usecols/test_usecols_basic.py b/pandas/tests/io/parser/usecols/test_usecols_basic.py
deleted file mode 100644
index 032cb961103df..0000000000000
--- a/pandas/tests/io/parser/usecols/test_usecols_basic.py
+++ /dev/null
@@ -1,432 +0,0 @@
-"""
-Tests the usecols functionality during parsing
-for all of the parsers defined in parsers.py
-"""
-from io import StringIO
-
-import numpy as np
-import pytest
-
-from pandas.errors import ParserError
-
-from pandas import (
-    DataFrame,
-    Index,
-)
-import pandas._testing as tm
-
-_msg_validate_usecols_arg = (
-    "'usecols' must either be list-like "
-    "of all strings, all unicode, all "
-    "integers or a callable."
-)
-_msg_validate_usecols_names = (
-    "Usecols do not match columns, columns expected but not found: {0}"
-)
-
-# TODO(1.4): Change to xfails at release time
-pytestmark = pytest.mark.usefixtures("pyarrow_skip")
-
-
-def test_raise_on_mixed_dtype_usecols(all_parsers):
-    # See gh-12678
-    data = """a,b,c
- 1000,2000,3000
- 4000,5000,6000
- """
-    usecols = [0, "b", 2]
-    parser = all_parsers
-
-    with pytest.raises(ValueError, match=_msg_validate_usecols_arg):
-        parser.read_csv(StringIO(data), usecols=usecols)
-
-
-@pytest.mark.parametrize("usecols", [(1, 2), ("b", "c")])
-def test_usecols(all_parsers, usecols):
-    data = """\
-a,b,c
-1,2,3
-4,5,6
-7,8,9
-10,11,12"""
-    parser = all_parsers
-    result = parser.read_csv(StringIO(data), usecols=usecols)
-
-    expected = DataFrame([[2, 3], [5, 6], [8, 9], [11, 12]], columns=["b", "c"])
-    tm.assert_frame_equal(result, expected)
-
-
-def test_usecols_with_names(all_parsers):
-    data = """\
-a,b,c
-1,2,3
-4,5,6
-7,8,9
-10,11,12"""
-    parser = all_parsers
-    names = ["foo", "bar"]
-    result = parser.read_csv(StringIO(data), names=names, usecols=[1, 2], header=0)
-
-    expected = DataFrame([[2, 3], [5, 6], [8, 9], [11, 12]], columns=names)
-    tm.assert_frame_equal(result, expected)
-
-
-@pytest.mark.parametrize(
-    "names,usecols", [(["b", "c"], [1, 2]), (["a", "b", "c"], ["b", "c"])]
-)
-def test_usecols_relative_to_names(all_parsers, names, usecols):
-    data = """\
-1,2,3
-4,5,6
-7,8,9
-10,11,12"""
-    parser = all_parsers
-    result = parser.read_csv(StringIO(data), names=names, header=None, usecols=usecols)
-
-    expected = DataFrame([[2, 3], [5, 6], [8, 9], [11, 12]], columns=["b", "c"])
-    tm.assert_frame_equal(result, expected)
-
-
-def test_usecols_relative_to_names2(all_parsers):
-    # see gh-5766
-    data = """\
-1,2,3
-4,5,6
-7,8,9
-10,11,12"""
-    parser = all_parsers
-    result = parser.read_csv(
-        StringIO(data), names=["a", "b"], header=None, usecols=[0, 1]
-    )
-
-    expected = DataFrame([[1, 2], [4, 5], [7, 8], [10, 11]], columns=["a", "b"])
-    tm.assert_frame_equal(result, expected)
-
-
-def test_usecols_name_length_conflict(all_parsers):
-    data = """\
-1,2,3
-4,5,6
-7,8,9
-10,11,12"""
-    parser = all_parsers
-    msg = "Number of passed names did not match number of header fields in the file"
-
-    with pytest.raises(ValueError, match=msg):
-        parser.read_csv(StringIO(data), names=["a", "b"], header=None, usecols=[1])
-
-
-def test_usecols_single_string(all_parsers):
-    # see gh-20558
-    parser = all_parsers
-    data = """foo, bar, baz
-1000, 2000, 3000
-4000, 5000, 6000"""
-
-    with pytest.raises(ValueError, match=_msg_validate_usecols_arg):
-        parser.read_csv(StringIO(data), usecols="foo")
-
-
-@pytest.mark.parametrize(
-    "data", ["a,b,c,d\n1,2,3,4\n5,6,7,8", "a,b,c,d\n1,2,3,4,\n5,6,7,8,"]
-)
-def test_usecols_index_col_false(all_parsers, data):
-    # see gh-9082
-    parser = all_parsers
-    usecols = ["a", "c", "d"]
-    expected = DataFrame({"a": [1, 5], "c": [3, 7], "d": [4, 8]})
-
-    result = parser.read_csv(StringIO(data), usecols=usecols, index_col=False)
-    tm.assert_frame_equal(result, expected)
-
-
-@pytest.mark.parametrize("index_col", ["b", 0])
-@pytest.mark.parametrize("usecols", [["b", "c"], [1, 2]]) -def test_usecols_index_col_conflict(all_parsers, usecols, index_col): - # see gh-4201: test that index_col as integer reflects usecols - parser = all_parsers - data = "a,b,c,d\nA,a,1,one\nB,b,2,two" - expected = DataFrame({"c": [1, 2]}, index=Index(["a", "b"], name="b")) - - result = parser.read_csv(StringIO(data), usecols=usecols, index_col=index_col) - tm.assert_frame_equal(result, expected) - - -def test_usecols_index_col_conflict2(all_parsers): - # see gh-4201: test that index_col as integer reflects usecols - parser = all_parsers - data = "a,b,c,d\nA,a,1,one\nB,b,2,two" - - expected = DataFrame({"b": ["a", "b"], "c": [1, 2], "d": ("one", "two")}) - expected = expected.set_index(["b", "c"]) - - result = parser.read_csv( - StringIO(data), usecols=["b", "c", "d"], index_col=["b", "c"] - ) - tm.assert_frame_equal(result, expected) - - -def test_usecols_implicit_index_col(all_parsers): - # see gh-2654 - parser = all_parsers - data = "a,b,c\n4,apple,bat,5.7\n8,orange,cow,10" - - result = parser.read_csv(StringIO(data), usecols=["a", "b"]) - expected = DataFrame({"a": ["apple", "orange"], "b": ["bat", "cow"]}, index=[4, 8]) - tm.assert_frame_equal(result, expected) - - -def test_usecols_index_col_middle(all_parsers): - # GH#9098 - parser = all_parsers - data = """a,b,c,d -1,2,3,4 -""" - result = parser.read_csv(StringIO(data), usecols=["b", "c", "d"], index_col="c") - expected = DataFrame({"b": [2], "d": [4]}, index=Index([3], name="c")) - tm.assert_frame_equal(result, expected) - - -def test_usecols_index_col_end(all_parsers): - # GH#9098 - parser = all_parsers - data = """a,b,c,d -1,2,3,4 -""" - result = parser.read_csv(StringIO(data), usecols=["b", "c", "d"], index_col="d") - expected = DataFrame({"b": [2], "c": [3]}, index=Index([4], name="d")) - tm.assert_frame_equal(result, expected) - - -def test_usecols_regex_sep(all_parsers): - # see gh-2733 - parser = all_parsers - data = "a b c\n4 apple bat 5.7\n8 orange cow 10" - result = parser.read_csv(StringIO(data), sep=r"\s+", usecols=("a", "b")) - - expected = DataFrame({"a": ["apple", "orange"], "b": ["bat", "cow"]}, index=[4, 8]) - tm.assert_frame_equal(result, expected) - - -def test_usecols_with_whitespace(all_parsers): - parser = all_parsers - data = "a b c\n4 apple bat 5.7\n8 orange cow 10" - - result = parser.read_csv(StringIO(data), delim_whitespace=True, usecols=("a", "b")) - expected = DataFrame({"a": ["apple", "orange"], "b": ["bat", "cow"]}, index=[4, 8]) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - "usecols,expected", - [ - # Column selection by index. - ([0, 1], DataFrame(data=[[1000, 2000], [4000, 5000]], columns=["2", "0"])), - # Column selection by name. 
- ( - ["0", "1"], - DataFrame(data=[[2000, 3000], [5000, 6000]], columns=["0", "1"]), - ), - ], -) -def test_usecols_with_integer_like_header(all_parsers, usecols, expected): - parser = all_parsers - data = """2,0,1 -1000,2000,3000 -4000,5000,6000""" - - result = parser.read_csv(StringIO(data), usecols=usecols) - tm.assert_frame_equal(result, expected) - - -def test_empty_usecols(all_parsers): - data = "a,b,c\n1,2,3\n4,5,6" - expected = DataFrame(columns=Index([])) - parser = all_parsers - - result = parser.read_csv(StringIO(data), usecols=set()) - tm.assert_frame_equal(result, expected) - - -def test_np_array_usecols(all_parsers): - # see gh-12546 - parser = all_parsers - data = "a,b,c\n1,2,3" - usecols = np.array(["a", "b"]) - - expected = DataFrame([[1, 2]], columns=usecols) - result = parser.read_csv(StringIO(data), usecols=usecols) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - "usecols,expected", - [ - ( - lambda x: x.upper() in ["AAA", "BBB", "DDD"], - DataFrame( - { - "AaA": { - 0: 0.056674972999999997, - 1: 2.6132309819999997, - 2: 3.5689350380000002, - }, - "bBb": {0: 8, 1: 2, 2: 7}, - "ddd": {0: "a", 1: "b", 2: "a"}, - } - ), - ), - (lambda x: False, DataFrame(columns=Index([]))), - ], -) -def test_callable_usecols(all_parsers, usecols, expected): - # see gh-14154 - data = """AaA,bBb,CCC,ddd -0.056674973,8,True,a -2.613230982,2,False,b -3.568935038,7,False,a""" - parser = all_parsers - - result = parser.read_csv(StringIO(data), usecols=usecols) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("usecols", [["a", "c"], lambda x: x in ["a", "c"]]) -def test_incomplete_first_row(all_parsers, usecols): - # see gh-6710 - data = "1,2\n1,2,3" - parser = all_parsers - names = ["a", "b", "c"] - expected = DataFrame({"a": [1, 1], "c": [np.nan, 3]}) - - result = parser.read_csv(StringIO(data), names=names, usecols=usecols) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - "data,usecols,kwargs,expected", - [ - # see gh-8985 - ( - "19,29,39\n" * 2 + "10,20,30,40", - [0, 1, 2], - {"header": None}, - DataFrame([[19, 29, 39], [19, 29, 39], [10, 20, 30]]), - ), - # see gh-9549 - ( - ("A,B,C\n1,2,3\n3,4,5\n1,2,4,5,1,6\n1,2,3,,,1,\n1,2,3\n5,6,7"), - ["A", "B", "C"], - {}, - DataFrame( - { - "A": [1, 3, 1, 1, 1, 5], - "B": [2, 4, 2, 2, 2, 6], - "C": [3, 5, 4, 3, 3, 7], - } - ), - ), - ], -) -def test_uneven_length_cols(all_parsers, data, usecols, kwargs, expected): - # see gh-8985 - parser = all_parsers - result = parser.read_csv(StringIO(data), usecols=usecols, **kwargs) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - "usecols,kwargs,expected,msg", - [ - ( - ["a", "b", "c", "d"], - {}, - DataFrame({"a": [1, 5], "b": [2, 6], "c": [3, 7], "d": [4, 8]}), - None, - ), - ( - ["a", "b", "c", "f"], - {}, - None, - _msg_validate_usecols_names.format(r"\['f'\]"), - ), - (["a", "b", "f"], {}, None, _msg_validate_usecols_names.format(r"\['f'\]")), - ( - ["a", "b", "f", "g"], - {}, - None, - _msg_validate_usecols_names.format(r"\[('f', 'g'|'g', 'f')\]"), - ), - # see gh-14671 - ( - None, - {"header": 0, "names": ["A", "B", "C", "D"]}, - DataFrame({"A": [1, 5], "B": [2, 6], "C": [3, 7], "D": [4, 8]}), - None, - ), - ( - ["A", "B", "C", "f"], - {"header": 0, "names": ["A", "B", "C", "D"]}, - None, - _msg_validate_usecols_names.format(r"\['f'\]"), - ), - ( - ["A", "B", "f"], - {"names": ["A", "B", "C", "D"]}, - None, - _msg_validate_usecols_names.format(r"\['f'\]"), - ), - ], -) -def 
-    data = "a,b,c,d\n1,2,3,4\n5,6,7,8"
-    kwargs.update(usecols=usecols)
-    parser = all_parsers
-
-    if expected is None:
-        with pytest.raises(ValueError, match=msg):
-            parser.read_csv(StringIO(data), **kwargs)
-    else:
-        result = parser.read_csv(StringIO(data), **kwargs)
-        tm.assert_frame_equal(result, expected)
-
-
-@pytest.mark.parametrize("usecols", [["A", "C"], [0, 2]])
-def test_usecols_subset_names_mismatch_orig_columns(all_parsers, usecols):
-    data = "a,b,c,d\n1,2,3,4\n5,6,7,8"
-    names = ["A", "B", "C", "D"]
-    parser = all_parsers
-
-    result = parser.read_csv(StringIO(data), header=0, names=names, usecols=usecols)
-    expected = DataFrame({"A": [1, 5], "C": [3, 7]})
-    tm.assert_frame_equal(result, expected)
-
-
-@pytest.mark.parametrize("names", [None, ["a", "b"]])
-def test_usecols_indices_out_of_bounds(all_parsers, names):
-    # GH#25623 & GH 41130; enforced in 2.0
-    parser = all_parsers
-    data = """
-a,b
-1,2
- """
-    with pytest.raises(ParserError, match="Defining usecols without of bounds"):
-        parser.read_csv(StringIO(data), usecols=[0, 2], names=names, header=0)
-
-
-def test_usecols_additional_columns(all_parsers):
-    # GH#46997
-    parser = all_parsers
-    usecols = lambda header: header.strip() in ["a", "b", "c"]
-    result = parser.read_csv(StringIO("a,b\nx,y,z"), index_col=False, usecols=usecols)
-    expected = DataFrame({"a": ["x"], "b": "y"})
-    tm.assert_frame_equal(result, expected)
-
-
-def test_usecols_additional_columns_integer_columns(all_parsers):
-    # GH#46997
-    parser = all_parsers
-    usecols = lambda header: header.strip() in ["0", "1"]
-    result = parser.read_csv(StringIO("0,1\nx,y,z"), index_col=False, usecols=usecols)
-    expected = DataFrame({"0": ["x"], "1": "y"})
-    tm.assert_frame_equal(result, expected)