Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 17 additions & 20 deletions cpp/src/arrow/csv/converter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -550,27 +550,23 @@ Result<std::shared_ptr<Converter>> Converter::Make(const std::shared_ptr<DataTyp
ptr.reset(new CONVERTER_TYPE(type, options, pool)); \
break;

#define NUMERIC_CONVERTER_CASE(TYPE_ID, TYPE_CLASS) \
CONVERTER_CASE(TYPE_ID, \
(PrimitiveConverter<TYPE_CLASS, NumericValueDecoder<TYPE_CLASS>>))

CONVERTER_CASE(Type::NA, NullConverter)
CONVERTER_CASE(Type::INT8,
(PrimitiveConverter<Int8Type, NumericValueDecoder<Int8Type>>))
CONVERTER_CASE(Type::INT16,
(PrimitiveConverter<Int16Type, NumericValueDecoder<Int16Type>>))
CONVERTER_CASE(Type::INT32,
(PrimitiveConverter<Int32Type, NumericValueDecoder<Int32Type>>))
CONVERTER_CASE(Type::INT64,
(PrimitiveConverter<Int64Type, NumericValueDecoder<Int64Type>>))
CONVERTER_CASE(Type::UINT8,
(PrimitiveConverter<UInt8Type, NumericValueDecoder<UInt8Type>>))
CONVERTER_CASE(Type::UINT16,
(PrimitiveConverter<UInt16Type, NumericValueDecoder<UInt16Type>>))
CONVERTER_CASE(Type::UINT32,
(PrimitiveConverter<UInt32Type, NumericValueDecoder<UInt32Type>>))
CONVERTER_CASE(Type::UINT64,
(PrimitiveConverter<UInt64Type, NumericValueDecoder<UInt64Type>>))
CONVERTER_CASE(Type::FLOAT,
(PrimitiveConverter<FloatType, NumericValueDecoder<FloatType>>))
CONVERTER_CASE(Type::DOUBLE,
(PrimitiveConverter<DoubleType, NumericValueDecoder<DoubleType>>))
NUMERIC_CONVERTER_CASE(Type::INT8, Int8Type)
NUMERIC_CONVERTER_CASE(Type::INT16, Int16Type)
NUMERIC_CONVERTER_CASE(Type::INT32, Int32Type)
NUMERIC_CONVERTER_CASE(Type::INT64, Int64Type)
NUMERIC_CONVERTER_CASE(Type::UINT8, UInt8Type)
NUMERIC_CONVERTER_CASE(Type::UINT16, UInt16Type)
NUMERIC_CONVERTER_CASE(Type::UINT32, UInt32Type)
NUMERIC_CONVERTER_CASE(Type::UINT64, UInt64Type)
NUMERIC_CONVERTER_CASE(Type::FLOAT, FloatType)
NUMERIC_CONVERTER_CASE(Type::DOUBLE, DoubleType)
NUMERIC_CONVERTER_CASE(Type::DATE32, Date32Type)
NUMERIC_CONVERTER_CASE(Type::DATE64, Date64Type)
CONVERTER_CASE(Type::BOOL, (PrimitiveConverter<BooleanType, BooleanValueDecoder>))
CONVERTER_CASE(Type::BINARY,
(PrimitiveConverter<BinaryType, BinaryValueDecoder<false>>))
Expand Down Expand Up @@ -624,6 +620,7 @@ Result<std::shared_ptr<Converter>> Converter::Make(const std::shared_ptr<DataTyp
}

#undef CONVERTER_CASE
#undef NUMERIC_CONVERTER_CASE
}
RETURN_NOT_OK(ptr->Initialize());
return ptr;
Expand Down
20 changes: 20 additions & 0 deletions cpp/src/arrow/csv/converter_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -354,6 +354,26 @@ TEST(BooleanConversion, CustomNulls) {
{{true, false}, {false, true}}, options);
}

// Converts CSV cells holding "YYYY-MM-DD" dates into a date32 column.
// The expected values are signed day offsets from 1970-01-01: the 1945 date
// is negative, the 2020 date positive.
// NOTE(review): each input string looks like one single-cell CSV row and the
// nested braces like one expected column -- confirm against AssertConversion.
TEST(Date32Conversion, Basics) {
AssertConversion<Date32Type, int32_t>(date32(), {"1945-05-08\n", "2020-03-15\n"},
{{-9004, 18336}});
}

// Mixes an "N/A" cell with a real date in a date32 column.
// NOTE(review): the trailing {{false, true}} presumably lists per-value
// validity, making the "N/A" cell a null whose stored 0 is only a
// placeholder -- confirm against the AssertConversion overload used here.
TEST(Date32Conversion, Nulls) {
AssertConversion<Date32Type, int32_t>(date32(), {"N/A\n", "2020-03-15\n"}, {{0, 18336}},
{{false, true}});
}

// Converts CSV cells holding "YYYY-MM-DD" dates into a date64 column.
// The expected values are signed millisecond offsets from 1970-01-01,
// i.e. the date32 day counts (-9004 and 18336) times 86400000.
TEST(Date64Conversion, Basics) {
AssertConversion<Date64Type, int64_t>(date64(), {"1945-05-08\n", "2020-03-15\n"},
{{-777945600000LL, 1584230400000LL}});
}

// Mixes an "N/A" cell with a real date in a date64 column.
// NOTE(review): the trailing {{false, true}} presumably lists per-value
// validity, making the "N/A" cell a null whose stored 0 is only a
// placeholder -- confirm against the AssertConversion overload used here.
TEST(Date64Conversion, Nulls) {
AssertConversion<Date64Type, int64_t>(date64(), {"N/A\n", "2020-03-15\n"},
{{0, 1584230400000LL}}, {{false, true}});
}

TEST(TimestampConversion, Basics) {
auto type = timestamp(TimeUnit::SECOND);

Expand Down
25 changes: 25 additions & 0 deletions cpp/src/arrow/util/value_parsing_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,31 @@ TEST(StringConversion, ToUInt64) {
AssertConversionFails<UInt64Type>("e");
}

// Checks string-to-date32 parsing: a "YYYY-MM-DD" string maps to the number
// of days since 1970-01-01.
TEST(StringConversion, ToDate32) {
// Epoch and the following day pin down the unit (one day == 1).
AssertConversion<Date32Type>("1970-01-01", 0);
AssertConversion<Date32Type>("1970-01-02", 1);
// A recent date and a pre-epoch date (negative offset).
AssertConversion<Date32Type>("2020-03-15", 18336);
AssertConversion<Date32Type>("1945-05-08", -9004);
// Far-future and year-0001 dates exercise large positive and negative offsets.
AssertConversion<Date32Type>("4707-11-28", 999999);
AssertConversion<Date32Type>("0001-01-01", -719162);

// Invalid format: empty input, truncated dates, a trailing time component,
// and '/' separators must all be rejected.
AssertConversionFails<Date32Type>("");
AssertConversionFails<Date32Type>("1970");
AssertConversionFails<Date32Type>("1970-01");
AssertConversionFails<Date32Type>("1970-01-01 00:00:00");
AssertConversionFails<Date32Type>("1970/01/01");
}

// Checks string-to-date64 parsing: a "YYYY-MM-DD" string maps to the number
// of milliseconds since 1970-01-01 (one day == 86400000 ms).
TEST(StringConversion, ToDate64) {
  // Epoch and the following day pin down the unit.
  AssertConversion<Date64Type>("1970-01-01", 0);
  AssertConversion<Date64Type>("1970-01-02", 86400000);
  // A recent date and a pre-epoch date (negative offset).
  AssertConversion<Date64Type>("2020-03-15", 1584230400000LL);
  AssertConversion<Date64Type>("1945-05-08", -777945600000LL);
  // Far-future and year-0001 dates exercise large positive and negative offsets.
  AssertConversion<Date64Type>("4707-11-28", 86399913600000LL);
  AssertConversion<Date64Type>("0001-01-01", -62135596800000LL);

  // Invalid format: the same malformed inputs the ToDate32 test rejects, so
  // both date types have matching negative coverage.
  AssertConversionFails<Date64Type>("");
  AssertConversionFails<Date64Type>("1970");
  AssertConversionFails<Date64Type>("1970-01");
  AssertConversionFails<Date64Type>("1970-01-01 00:00:00");
  AssertConversionFails<Date64Type>("1970/01/01");
}

TEST(StringConversion, ToTimestampDate_ISO8601) {
{
TimestampType type{TimeUnit::SECOND};
Expand Down
4 changes: 4 additions & 0 deletions docs/source/cpp/csv.rst
Original file line number Diff line number Diff line change
Expand Up @@ -126,10 +126,14 @@ can be chosen from the following list:
* Float32 and Float64
* Decimal128
* Boolean
* Date32 and Date64
* Timestamp
* Binary and Large Binary
* String and Large String (with optional UTF8 input validation)
* Fixed-Size Binary
* Dictionary with index type Int32 and value type one of the following:
Binary, String, LargeBinary, LargeString, Int32, UInt32, Int64, UInt64,
Float32, Float64, Decimal128

Other data types do not support conversion from CSV values and will error out.

Expand Down
26 changes: 25 additions & 1 deletion python/pyarrow/tests/test_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
# under the License.

import bz2
from datetime import datetime
from datetime import date, datetime
from decimal import Decimal
import gc
import gzip
Expand Down Expand Up @@ -534,6 +534,30 @@ def test_timestamp_parsers(self):
'b': [datetime(1980, 1, 1), datetime(1980, 1, 2)],
}

# Checks how date-only CSV cells are typed: first with default type
# inference, then with explicit date32/date64 column types.
def test_dates(self):
# Dates are inferred as timestamps by default
# (two columns 'a' and 'b', two data rows of YYYY-MM-DD values)
rows = b"a,b\n1970-01-01,1970-01-02\n1971-01-01,1971-01-02\n"
table = self.read_bytes(rows)
schema = pa.schema([('a', pa.timestamp('s')),
('b', pa.timestamp('s'))])
assert table.schema == schema
# With inference, the values come back as datetime objects.
assert table.to_pydict() == {
'a': [datetime(1970, 1, 1), datetime(1971, 1, 1)],
'b': [datetime(1970, 1, 2), datetime(1971, 1, 2)],
}

# Can ask for date types explicitly
# (the same bytes are re-read, with 'a' forced to date32 and 'b' to date64)
opts = ConvertOptions()
opts.column_types = {'a': pa.date32(), 'b': pa.date64()}
table = self.read_bytes(rows, convert_options=opts)
schema = pa.schema([('a', pa.date32()),
('b', pa.date64())])
assert table.schema == schema
# Both date32 and date64 columns are expected to yield date objects.
assert table.to_pydict() == {
'a': [date(1970, 1, 1), date(1971, 1, 1)],
'b': [date(1970, 1, 2), date(1971, 1, 2)],
}

def test_auto_dict_encode(self):
opts = ConvertOptions(auto_dict_encode=True)
rows = "a,b\nab,1\ncdé,2\ncdé,3\nab,4".encode()
Expand Down