diff --git a/cpp/src/arrow/csv/converter.cc b/cpp/src/arrow/csv/converter.cc index c00bc20b7c7..83deb064222 100644 --- a/cpp/src/arrow/csv/converter.cc +++ b/cpp/src/arrow/csv/converter.cc @@ -550,27 +550,23 @@ Result> Converter::Make(const std::shared_ptr>)) + CONVERTER_CASE(Type::NA, NullConverter) - CONVERTER_CASE(Type::INT8, - (PrimitiveConverter>)) - CONVERTER_CASE(Type::INT16, - (PrimitiveConverter>)) - CONVERTER_CASE(Type::INT32, - (PrimitiveConverter>)) - CONVERTER_CASE(Type::INT64, - (PrimitiveConverter>)) - CONVERTER_CASE(Type::UINT8, - (PrimitiveConverter>)) - CONVERTER_CASE(Type::UINT16, - (PrimitiveConverter>)) - CONVERTER_CASE(Type::UINT32, - (PrimitiveConverter>)) - CONVERTER_CASE(Type::UINT64, - (PrimitiveConverter>)) - CONVERTER_CASE(Type::FLOAT, - (PrimitiveConverter>)) - CONVERTER_CASE(Type::DOUBLE, - (PrimitiveConverter>)) + NUMERIC_CONVERTER_CASE(Type::INT8, Int8Type) + NUMERIC_CONVERTER_CASE(Type::INT16, Int16Type) + NUMERIC_CONVERTER_CASE(Type::INT32, Int32Type) + NUMERIC_CONVERTER_CASE(Type::INT64, Int64Type) + NUMERIC_CONVERTER_CASE(Type::UINT8, UInt8Type) + NUMERIC_CONVERTER_CASE(Type::UINT16, UInt16Type) + NUMERIC_CONVERTER_CASE(Type::UINT32, UInt32Type) + NUMERIC_CONVERTER_CASE(Type::UINT64, UInt64Type) + NUMERIC_CONVERTER_CASE(Type::FLOAT, FloatType) + NUMERIC_CONVERTER_CASE(Type::DOUBLE, DoubleType) + NUMERIC_CONVERTER_CASE(Type::DATE32, Date32Type) + NUMERIC_CONVERTER_CASE(Type::DATE64, Date64Type) CONVERTER_CASE(Type::BOOL, (PrimitiveConverter)) CONVERTER_CASE(Type::BINARY, (PrimitiveConverter>)) @@ -624,6 +620,7 @@ Result> Converter::Make(const std::shared_ptrInitialize()); return ptr; diff --git a/cpp/src/arrow/csv/converter_test.cc b/cpp/src/arrow/csv/converter_test.cc index 33fb49e28e8..be51911914a 100644 --- a/cpp/src/arrow/csv/converter_test.cc +++ b/cpp/src/arrow/csv/converter_test.cc @@ -354,6 +354,26 @@ TEST(BooleanConversion, CustomNulls) { {{true, false}, {false, true}}, options); } +TEST(Date32Conversion, Basics) { + AssertConversion(date32(), {"1945-05-08\n", "2020-03-15\n"}, + {{-9004, 18336}}); +} + +TEST(Date32Conversion, Nulls) { + AssertConversion(date32(), {"N/A\n", "2020-03-15\n"}, {{0, 18336}}, + {{false, true}}); +} + +TEST(Date64Conversion, Basics) { + AssertConversion(date64(), {"1945-05-08\n", "2020-03-15\n"}, + {{-777945600000LL, 1584230400000LL}}); +} + +TEST(Date64Conversion, Nulls) { + AssertConversion(date64(), {"N/A\n", "2020-03-15\n"}, + {{0, 1584230400000LL}}, {{false, true}}); +} + TEST(TimestampConversion, Basics) { auto type = timestamp(TimeUnit::SECOND); diff --git a/cpp/src/arrow/util/value_parsing_test.cc b/cpp/src/arrow/util/value_parsing_test.cc index 1682e1d7f1d..547f6f51533 100644 --- a/cpp/src/arrow/util/value_parsing_test.cc +++ b/cpp/src/arrow/util/value_parsing_test.cc @@ -239,6 +239,31 @@ TEST(StringConversion, ToUInt64) { AssertConversionFails("e"); } +TEST(StringConversion, ToDate32) { + AssertConversion("1970-01-01", 0); + AssertConversion("1970-01-02", 1); + AssertConversion("2020-03-15", 18336); + AssertConversion("1945-05-08", -9004); + AssertConversion("4707-11-28", 999999); + AssertConversion("0001-01-01", -719162); + + // Invalid format + AssertConversionFails(""); + AssertConversionFails("1970"); + AssertConversionFails("1970-01"); + AssertConversionFails("1970-01-01 00:00:00"); + AssertConversionFails("1970/01/01"); +} + +TEST(StringConversion, ToDate64) { + AssertConversion("1970-01-01", 0); + AssertConversion("1970-01-02", 86400000); + AssertConversion("2020-03-15", 1584230400000LL); + AssertConversion("1945-05-08", -777945600000LL); + AssertConversion("4707-11-28", 86399913600000LL); + AssertConversion("0001-01-01", -62135596800000LL); +} + TEST(StringConversion, ToTimestampDate_ISO8601) { { TimestampType type{TimeUnit::SECOND}; diff --git a/docs/source/cpp/csv.rst b/docs/source/cpp/csv.rst index 50a5cdb8956..4bb608fc570 100644 --- a/docs/source/cpp/csv.rst +++ b/docs/source/cpp/csv.rst @@ -126,10 +126,14 @@ can be chosen from the following list: * Float32 and Float64 * Decimal128 * Boolean +* Date32 and Date64 * Timestamp * Binary and Large Binary * String and Large String (with optional UTF8 input validation) * Fixed-Size Binary +* Dictionary with index type Int32 and value type one of the following: + Binary, String, LargeBinary, LargeString, Int32, UInt32, Int64, UInt64, + Float32, Float64, Decimal128 Other data types do not support conversion from CSV values and will error out. diff --git a/python/pyarrow/tests/test_csv.py b/python/pyarrow/tests/test_csv.py index a3f16b8ae0d..e95a9fff167 100644 --- a/python/pyarrow/tests/test_csv.py +++ b/python/pyarrow/tests/test_csv.py @@ -16,7 +16,7 @@ # under the License. import bz2 -from datetime import datetime +from datetime import date, datetime from decimal import Decimal import gc import gzip @@ -534,6 +534,30 @@ def test_timestamp_parsers(self): 'b': [datetime(1980, 1, 1), datetime(1980, 1, 2)], } + def test_dates(self): + # Dates are inferred as timestamps by default + rows = b"a,b\n1970-01-01,1970-01-02\n1971-01-01,1971-01-02\n" + table = self.read_bytes(rows) + schema = pa.schema([('a', pa.timestamp('s')), + ('b', pa.timestamp('s'))]) + assert table.schema == schema + assert table.to_pydict() == { + 'a': [datetime(1970, 1, 1), datetime(1971, 1, 1)], + 'b': [datetime(1970, 1, 2), datetime(1971, 1, 2)], + } + + # Can ask for date types explicitly + opts = ConvertOptions() + opts.column_types = {'a': pa.date32(), 'b': pa.date64()} + table = self.read_bytes(rows, convert_options=opts) + schema = pa.schema([('a', pa.date32()), + ('b', pa.date64())]) + assert table.schema == schema + assert table.to_pydict() == { + 'a': [date(1970, 1, 1), date(1971, 1, 1)], + 'b': [date(1970, 1, 2), date(1971, 1, 2)], + } + def test_auto_dict_encode(self): opts = ConvertOptions(auto_dict_encode=True) rows = "a,b\nab,1\ncdé,2\ncdé,3\nab,4".encode()