Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions doc/source/user_guide/io.rst
Original file line number Diff line number Diff line change
Expand Up @@ -294,9 +294,9 @@ date_parser : function, default ``None``
.. deprecated:: 2.0.0
Use ``date_format`` instead, or read in as ``object`` and then apply
:func:`to_datetime` as-needed.
date_format : str, default ``None``
date_format : str or dict of column -> format, default ``None``
If used in conjunction with ``parse_dates``, will parse dates according to this
format. For anything more complex (e.g. different formats for different columns),
format. For anything more complex,
please read in as ``object`` and then apply :func:`to_datetime` as-needed.

.. versionadded:: 2.0.0
Expand Down Expand Up @@ -912,7 +912,7 @@ Finally, the parser allows you to specify a custom ``date_format``.
Performance-wise, you should try these methods of parsing dates in order:

1. If you know the format, use ``date_format``, e.g.:
``date_format="%d/%m/%Y"``.
``date_format="%d/%m/%Y"`` or ``date_format={column_name: "%d/%m/%Y"}``.

2. If you have different formats for different columns, or want to pass any extra options (such
as ``utc``) to ``to_datetime``, then you should read in your data as ``object`` dtype, and
Expand Down
14 changes: 7 additions & 7 deletions pandas/io/excel/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -254,9 +254,9 @@
.. deprecated:: 2.0.0
Use ``date_format`` instead, or read in as ``object`` and then apply
:func:`to_datetime` as-needed.
date_format : str, default ``None``
date_format : str or dict of column -> format, default ``None``
If used in conjunction with ``parse_dates``, will parse dates according to this
format. For anything more complex (e.g. different formats for different columns),
format. For anything more complex,
please read in as ``object`` and then apply :func:`to_datetime` as-needed.

.. versionadded:: 2.0.0
Expand Down Expand Up @@ -397,7 +397,7 @@ def read_excel(
verbose: bool = ...,
parse_dates: list | dict | bool = ...,
date_parser: Callable | lib.NoDefault = ...,
date_format: str | None = ...,
date_format: dict[Hashable, str] | str | None = ...,
thousands: str | None = ...,
decimal: str = ...,
comment: str | None = ...,
Expand Down Expand Up @@ -437,7 +437,7 @@ def read_excel(
verbose: bool = ...,
parse_dates: list | dict | bool = ...,
date_parser: Callable | lib.NoDefault = ...,
date_format: str | None = ...,
date_format: dict[Hashable, str] | str | None = ...,
thousands: str | None = ...,
decimal: str = ...,
comment: str | None = ...,
Expand Down Expand Up @@ -477,7 +477,7 @@ def read_excel(
verbose: bool = False,
parse_dates: list | dict | bool = False,
date_parser: Callable | lib.NoDefault = lib.no_default,
date_format: str | None = None,
date_format: dict[Hashable, str] | str | None = None,
thousands: str | None = None,
decimal: str = ".",
comment: str | None = None,
Expand Down Expand Up @@ -726,7 +726,7 @@ def parse(
verbose: bool = False,
parse_dates: list | dict | bool = False,
date_parser: Callable | lib.NoDefault = lib.no_default,
date_format: str | None = None,
date_format: dict[Hashable, str] | str | None = None,
thousands: str | None = None,
decimal: str = ".",
comment: str | None = None,
Expand Down Expand Up @@ -1554,7 +1554,7 @@ def parse(
na_values=None,
parse_dates: list | dict | bool = False,
date_parser: Callable | lib.NoDefault = lib.no_default,
date_format: str | None = None,
date_format: str | dict[Hashable, str] | None = None,
thousands: str | None = None,
comment: str | None = None,
skipfooter: int = 0,
Expand Down
20 changes: 14 additions & 6 deletions pandas/io/parsers/base_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -455,7 +455,10 @@ def _agg_index(self, index, try_parse_dates: bool = True) -> Index:

for i, arr in enumerate(index):
if try_parse_dates and self._should_parse_dates(i):
arr = self._date_conv(arr)
arr = self._date_conv(
arr,
col=self.index_names[i] if self.index_names is not None else None,
)

if self.na_filter:
col_na_values = self.na_values
Expand Down Expand Up @@ -1094,7 +1097,7 @@ def _make_date_converter(
date_parser=lib.no_default,
dayfirst: bool = False,
cache_dates: bool = True,
date_format: str | None = None,
date_format: dict[Hashable, str] | str | None = None,
):
if date_parser is not lib.no_default:
warnings.warn(
Expand All @@ -1108,13 +1111,16 @@ def _make_date_converter(
if date_parser is not lib.no_default and date_format is not None:
raise TypeError("Cannot use both 'date_parser' and 'date_format'")

def converter(*date_cols):
def converter(*date_cols, col: Hashable):
if date_parser is lib.no_default:
strs = parsing.concat_date_cols(date_cols)
date_fmt = (
date_format.get(col) if isinstance(date_format, dict) else date_format
)

return tools.to_datetime(
ensure_object(strs),
format=date_format,
format=date_fmt,
utc=False,
dayfirst=dayfirst,
errors="ignore",
Expand Down Expand Up @@ -1218,7 +1224,9 @@ def _isindex(colspec):
continue
# The pyarrow engine returns a Series, which we need to convert to a
# numpy array before calling the converter; it's a no-op for other parsers
data_dict[colspec] = converter(np.asarray(data_dict[colspec]))
data_dict[colspec] = converter(
np.asarray(data_dict[colspec]), col=colspec
)
else:
new_name, col, old_names = _try_convert_dates(
converter, colspec, data_dict, orig_names
Expand Down Expand Up @@ -1279,7 +1287,7 @@ def _try_convert_dates(parser: Callable, colspec, data_dict, columns):
new_name = "_".join([str(x) for x in colnames])
to_parse = [np.asarray(data_dict[c]) for c in colnames if c in data_dict]

new_col = parser(*to_parse)
new_col = parser(*to_parse, col=new_name)
return new_name, new_col, colnames


Expand Down
5 changes: 4 additions & 1 deletion pandas/io/parsers/c_parser_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -355,7 +355,10 @@ def _get_index_names(self):

def _maybe_parse_dates(self, values, index: int, try_parse_dates: bool = True):
if try_parse_dates and self._should_parse_dates(index):
values = self._date_conv(values)
values = self._date_conv(
values,
col=self.index_names[index] if self.index_names is not None else None,
)
return values


Expand Down
6 changes: 3 additions & 3 deletions pandas/io/parsers/readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -265,9 +265,9 @@
.. deprecated:: 2.0.0
Use ``date_format`` instead, or read in as ``object`` and then apply
:func:`to_datetime` as-needed.
date_format : str, default ``None``
date_format : str or dict of column -> format, default ``None``
If used in conjunction with ``parse_dates``, will parse dates according to this
format. For anything more complex (e.g. different formats for different columns),
format. For anything more complex,
please read in as ``object`` and then apply :func:`to_datetime` as-needed.

.. versionadded:: 2.0.0
Expand Down Expand Up @@ -1794,7 +1794,7 @@ def TextParser(*args, **kwds) -> TextFileReader:
date_parser : function, optional

.. deprecated:: 2.0.0
date_format : str, default ``None``
date_format : str or dict of column -> format, default ``None``

.. versionadded:: 2.0.0
skiprows : list of integers
Expand Down
63 changes: 63 additions & 0 deletions pandas/tests/io/parser/test_parse_dates.py
Original file line number Diff line number Diff line change
Expand Up @@ -2155,3 +2155,66 @@ def test_parse_dot_separated_dates(all_parsers):
)
expected = DataFrame({"b": [1, 2]}, index=expected_index)
tm.assert_frame_equal(result, expected)


def test_parse_dates_dict_format(all_parsers):
    """Parse two date columns using a per-column ``date_format`` mapping.

    Column "a" is written year-first and column "b" day-first; both rows
    are expected to come back as the same pair of timestamps.
    """
    # GH# -- TODO: issue number was left blank in the original
    csv_text = """a,b
2019-12-31,31-12-2019
2020-12-31,31-12-2020"""
    per_column_formats = {"a": "%Y-%m-%d", "b": "%d-%m-%Y"}

    result = all_parsers.read_csv(
        StringIO(csv_text),
        date_format=per_column_formats,
        parse_dates=["a", "b"],
    )

    expected = DataFrame(
        {
            name: [Timestamp("2019-12-31"), Timestamp("2020-12-31")]
            for name in ("a", "b")
        }
    )
    tm.assert_frame_equal(result, expected)


@skip_pyarrow
@pytest.mark.parametrize(
    "key, parse_dates", [("a_b", [[0, 1]]), ("foo", {"foo": [0, 1]})]
)
def test_parse_dates_dict_format_two_columns(all_parsers, key, parse_dates):
    """Parse a date built from two source columns with a dict ``date_format``.

    The format is keyed by the name of the combined column: ``"a_b"`` when
    ``parse_dates`` is a nested list, ``"foo"`` when it is a dict.
    """
    # GH# -- TODO: issue number was left blank in the original
    csv_text = """a,b
31-,12-2019
31-,12-2020"""
    buffer = StringIO(csv_text)

    result = all_parsers.read_csv(
        buffer,
        date_format={key: "%d- %m-%Y"},
        parse_dates=parse_dates,
    )

    combined_dates = [Timestamp("2019-12-31"), Timestamp("2020-12-31")]
    expected = DataFrame({key: combined_dates})
    tm.assert_frame_equal(result, expected)


@skip_pyarrow
def test_parse_dates_dict_format_index(all_parsers):
    """Parse the index column with its entry from a dict ``date_format``.

    Column "a" becomes a datetime index; column "b" has no format entry and
    is expected to stay as the raw strings.
    """
    # GH# -- TODO: issue number was left blank in the original
    csv_text = """a,b
2019-12-31,31-12-2019
2020-12-31,31-12-2020"""

    result = all_parsers.read_csv(
        StringIO(csv_text),
        date_format={"a": "%Y-%m-%d"},
        parse_dates=True,
        index_col=0,
    )

    expected_index = Index(
        [Timestamp("2019-12-31"), Timestamp("2020-12-31")], name="a"
    )
    expected = DataFrame({"b": ["31-12-2019", "31-12-2020"]}, index=expected_index)
    tm.assert_frame_equal(result, expected)