Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
8e61432
ARROW-9223: [Python] Propagate timezone information in pandas conversion
emkornfield Jul 11, 2020
69511b5
Honor tzinfo when converting from datetime
emkornfield Jul 20, 2020
5700909
better timezone support
kszucs Jul 21, 2020
3c4f071
test time conversions
kszucs Jul 21, 2020
793e68e
Expost TzinfoToString on the C++ side
kszucs Jul 22, 2020
2f131d3
Add more tests and reorganize implementation
kszucs Jul 22, 2020
e007cb1
Add timezone inference
kszucs Jul 22, 2020
25f9b25
Enable tests for nanosecond resolution
kszucs Jul 22, 2020
33b978f
Fix array from scalar test case
kszucs Jul 22, 2020
973259d
Convert from mixed builtin and pandas datetimes
kszucs Jul 22, 2020
29527d0
Skip test_tzinfo_to_string_errors for python versions <=3.6
kszucs Jul 22, 2020
2be7a9e
Fix skip condition
kszucs Jul 22, 2020
d5c890f
fix typos
emkornfield Aug 1, 2020
6a5d9f8
Add fallback parameter to ignore timezone in C++ python code
emkornfield Aug 1, 2020
263517d
add ARROW_NO_TZ env variable to revert back to old behavior
emkornfield Aug 2, 2020
9680e1c
use env variable in psark script
emkornfield Aug 3, 2020
c8c22f9
ARROW_NO_TZ -> PYARROW_IGNORE_TZ
emkornfield Aug 9, 2020
463a0c4
address review comments
kszucs Aug 10, 2020
d4cc519
properly test nested struct roundtrip
kszucs Aug 10, 2020
97e4237
address review comments
kszucs Aug 10, 2020
498d743
use OwnedRef to cleanup tzobject
kszucs Aug 10, 2020
2f5dbf6
fix test_tzinfo_to_string_errors test case
kszucs Aug 10, 2020
f5d44e7
clang format
kszucs Aug 10, 2020
880f344
TZ->TIMEZONE
emkornfield Aug 15, 2020
7a658cd
fix doc comments
emkornfield Aug 15, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions ci/scripts/integration_spark.sh
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,9 @@ source_dir=${1}
spark_dir=${2}
spark_version=${SPARK_VERSION:-master}

# Use old behavior that always dropped timezones.
export PYARROW_IGNORE_TIMEZONE=1

if [ "${SPARK_VERSION:0:2}" == "2." ]; then
# https://github.com/apache/spark/blob/master/docs/sql-pyspark-pandas-with-arrow.md#compatibility-setting-for-pyarrow--0150-and-spark-23x-24x
export ARROW_PRE_0_15_IPC_FORMAT=1
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/arrow/compute/kernels/scalar_string.cc
Original file line number Diff line number Diff line change
Expand Up @@ -861,10 +861,10 @@ void AddBinaryLength(FunctionRegistry* registry) {
applicator::ScalarUnaryNotNull<Int32Type, StringType, BinaryLength>::Exec;
ArrayKernelExec exec_offset_64 =
applicator::ScalarUnaryNotNull<Int64Type, LargeStringType, BinaryLength>::Exec;
for (const auto& input_type : {binary(), utf8()}) {
for (const auto input_type : {binary(), utf8()}) {
DCHECK_OK(func->AddKernel({input_type}, int32(), exec_offset_32));
}
for (const auto& input_type : {large_binary(), large_utf8()}) {
for (const auto input_type : {large_binary(), large_utf8()}) {
DCHECK_OK(func->AddKernel({input_type}, int64(), exec_offset_64));
}
DCHECK_OK(registry->AddFunction(std::move(func)));
Expand Down
53 changes: 41 additions & 12 deletions cpp/src/arrow/python/arrow_to_pandas.cc
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,8 @@

// Functions for pandas conversion via NumPy

#include "arrow/python/numpy_interop.h" // IWYU pragma: expand

#include "arrow/python/arrow_to_pandas.h"
#include "arrow/python/numpy_interop.h" // IWYU pragma: expand

#include <cmath>
#include <cstdint>
Expand Down Expand Up @@ -642,24 +641,28 @@ inline Status ConvertStruct(const PandasOptions& options, const ChunkedArray& da
std::vector<OwnedRef> fields_data(num_fields);
OwnedRef dict_item;

// XXX(wesm): In ARROW-7723, we found as a result of ARROW-3789 that second
// In ARROW-7723, we found as a result of ARROW-3789 that second
// through microsecond resolution tz-aware timestamps were being promoted to
// use the DATETIME_NANO_TZ conversion path, yielding a datetime64[ns] NumPy
// array in this function. PyArray_GETITEM returns datetime.datetime for
// units second through microsecond but PyLong for nanosecond (because
// datetime.datetime does not support nanoseconds). We inserted this hack to
// preserve the <= 0.15.1 behavior until a better solution can be devised
// datetime.datetime does not support nanoseconds).
// We force the object conversion to preserve the value of the timezone.
// Nanoseconds are returned as integers inside of structs.
PandasOptions modified_options = options;
modified_options.ignore_timezone = true;
modified_options.coerce_temporal_nanoseconds = false;

for (int c = 0; c < data.num_chunks(); c++) {
auto arr = checked_cast<const StructArray*>(data.chunk(c).get());
// Convert the struct arrays first
for (int32_t i = 0; i < num_fields; i++) {
PyObject* numpy_array;
RETURN_NOT_OK(ConvertArrayToPandas(
modified_options, arr->field(static_cast<int>(i)), nullptr, &numpy_array));
std::shared_ptr<Array> field = arr->field(static_cast<int>(i));
// See notes above about timestamp conversion. Don't blindly convert because
// timestamps in lists are handled differently.
modified_options.timestamp_as_object =
field->type()->id() == Type::TIMESTAMP ? true : options.timestamp_as_object;
RETURN_NOT_OK(ConvertArrayToPandas(modified_options, field, nullptr, &numpy_array));
fields_data[i].reset(numpy_array);
}

Expand Down Expand Up @@ -951,12 +954,39 @@ struct ObjectWriterVisitor {
template <typename Type>
enable_if_timestamp<Type, Status> Visit(const Type& type) {
const TimeUnit::type unit = type.unit();
auto WrapValue = [unit](typename Type::c_type value, PyObject** out) {
OwnedRef tzinfo;

auto ConvertTimezoneNaive = [&](typename Type::c_type value, PyObject** out) {
RETURN_NOT_OK(internal::PyDateTime_from_int(value, unit, out));
RETURN_IF_PYERROR();
return Status::OK();
};
return ConvertAsPyObjects<Type>(options, data, WrapValue, out_values);
auto ConvertTimezoneAware = [&](typename Type::c_type value, PyObject** out) {
PyObject* naive_datetime;
RETURN_NOT_OK(ConvertTimezoneNaive(value, &naive_datetime));
// convert the timezone naive datetime object to timezone aware
*out = PyObject_CallMethod(tzinfo.obj(), "fromutc", "O", naive_datetime);
// the timezone naive object is no longer required
Py_DECREF(naive_datetime);
RETURN_IF_PYERROR();
return Status::OK();
};

if (!type.timezone().empty() && !options.ignore_timezone) {
// convert timezone aware
PyObject* tzobj;
ARROW_ASSIGN_OR_RAISE(tzobj, internal::StringToTzinfo(type.timezone()));
tzinfo.reset(tzobj);
RETURN_IF_PYERROR();
RETURN_NOT_OK(
ConvertAsPyObjects<Type>(options, data, ConvertTimezoneAware, out_values));
} else {
// convert timezone naive
RETURN_NOT_OK(
ConvertAsPyObjects<Type>(options, data, ConvertTimezoneNaive, out_values));
}

return Status::OK();
}

Status Visit(const Decimal128Type& type) {
Expand Down Expand Up @@ -1727,8 +1757,7 @@ static Status GetPandasWriterType(const ChunkedArray& data, const PandasOptions&
// Nanoseconds are never out of bounds for pandas, so in that case
// we don't convert to object
*output_type = PandasWriter::OBJECT;
} else if (ts_type.timezone() != "" && !options.ignore_timezone) {
// XXX: ignore_timezone: hack here for ARROW-7723
} else if (!ts_type.timezone().empty()) {
*output_type = PandasWriter::DATETIME_NANO_TZ;
} else if (options.coerce_temporal_nanoseconds) {
*output_type = PandasWriter::DATETIME_NANO;
Expand Down
5 changes: 3 additions & 2 deletions cpp/src/arrow/python/arrow_to_pandas.h
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,9 @@ struct PandasOptions {
/// Coerce all date and timestamp to datetime64[ns]
bool coerce_temporal_nanoseconds = false;

/// XXX(wesm): Hack for ARROW-7723 to opt out of DATETIME_NANO_TZ conversion
/// path
/// Used to maintain backwards compatibility for
/// timezone bugs (see ARROW-9528). Should be removed
/// after Arrow 2.0 release.
bool ignore_timezone = false;

/// \brief If true, do not create duplicate PyObject versions of equal
Expand Down
172 changes: 171 additions & 1 deletion cpp/src/arrow/python/datetime.cc
Original file line number Diff line number Diff line change
Expand Up @@ -14,22 +14,66 @@
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include "arrow/python/datetime.h"

#include <algorithm>
#include <chrono>
#include <iomanip>
#include <iostream>

#include "arrow/python/common.h"
#include "arrow/python/datetime.h"
#include "arrow/python/helpers.h"
#include "arrow/python/platform.h"
#include "arrow/status.h"
#include "arrow/type.h"
#include "arrow/util/logging.h"
#include "arrow/util/value_parsing.h"

namespace arrow {
namespace py {
namespace internal {

namespace {

// Same as Regex '([+-])(0[0-9]|1[0-9]|2[0-3]):([0-5][0-9])$'.
// GCC 4.9 doesn't support regex, so handcode until support for it
// is dropped.
//
// Returns true only when `tz` is exactly "<sign>HH:MM" with a mandatory
// '+' or '-' sign, HH in [00, 23] and MM in [00, 59]. On success `sign`,
// `hour` and `minute` are views into `tz`'s buffer (1, 2 and 2 characters
// respectively), so they are only valid while `tz` is alive and unmodified.
// On failure the outputs may have been partially written and must be ignored.
bool MatchFixedOffset(const std::string& tz, util::string_view* sign,
                      util::string_view* hour, util::string_view* minute) {
  // The only accepted shape is sign + "HH" + ':' + "MM": six characters.
  if (tz.size() != 6) {
    return false;
  }
  const char* iter = tz.data();

  // The sign is mandatory, as in the regex above. Accepting an unsigned
  // "HH:MM" would leave *sign empty, which callers cannot tell apart from '-'.
  if (*iter != '+' && *iter != '-') {
    return false;
  }
  *sign = util::string_view(iter, 1);
  iter++;

  // Hours: 00-19 or 20-23.
  const bool hour_00_to_19 =
      (*iter == '0' || *iter == '1') && *(iter + 1) >= '0' && *(iter + 1) <= '9';
  const bool hour_20_to_23 = *iter == '2' && *(iter + 1) >= '0' && *(iter + 1) <= '3';
  if (!hour_00_to_19 && !hour_20_to_23) {
    return false;
  }
  *hour = util::string_view(iter, 2);
  iter += 2;

  if (*iter != ':') {
    return false;
  }
  iter++;

  // Minutes: 00-59.
  const bool minute_00_to_59 =
      *iter >= '0' && *iter <= '5' && *(iter + 1) >= '0' && *(iter + 1) <= '9';
  if (!minute_00_to_59) {
    return false;
  }
  *minute = util::string_view(iter, 2);

  // The length check above guarantees the whole string has been consumed.
  return true;
}

} // namespace

PyDateTime_CAPI* datetime_api = nullptr;

void InitDatetime() {
Expand Down Expand Up @@ -262,6 +306,132 @@ int64_t PyDate_to_days(PyDateTime_Date* pydate) {
PyDateTime_GET_DAY(pydate));
}

Result<int64_t> PyDateTime_utcoffset_s(PyObject* obj) {
  // Returns the offset from UTC, in seconds, reported by obj.utcoffset().
  // Supports only PyDateTime_DateTime and PyDateTime_Time objects.
  // A missing or None offset is reported as 0.
  OwnedRef pyoffset(PyObject_CallMethod(obj, "utcoffset", nullptr));
  RETURN_IF_PYERROR();

  PyObject* offset = pyoffset.obj();
  if (offset == nullptr || offset == Py_None) {
    return 0;
  }
  return internal::PyDelta_to_s(reinterpret_cast<PyDateTime_Delta*>(offset));
}

Result<std::string> PyTZInfo_utcoffset_hhmm(PyObject* pytzinfo) {
  // Attempt to convert a timezone offset object to the "+/-{hh}:{mm}" format
  // by calling pytzinfo.utcoffset(None).
  //
  // Returns Status::Invalid when utcoffset(None) does not return a
  // datetime.timedelta, or when the offset is not a whole number of minutes.
  // A Python exception raised by the call is propagated as an error status.
  OwnedRef pydelta_object(PyObject_CallMethod(pytzinfo, "utcoffset", "O", Py_None));
  RETURN_IF_PYERROR();

  if (!PyDelta_Check(pydelta_object.obj())) {
    return Status::Invalid(
        "Object returned by tzinfo.utcoffset(None) is not an instance of "
        "datetime.timedelta");
  }
  auto pydelta = reinterpret_cast<PyDateTime_Delta*>(pydelta_object.obj());

  // retrieve the offset as seconds
  auto total_seconds = internal::PyDelta_to_s(pydelta);

  // determine whether the offset is positive or negative
  const char* sign = (total_seconds < 0) ? "-" : "+";
  // Negate explicitly instead of calling unqualified abs(): depending on the
  // headers in scope that name can resolve to C's int abs(int) and silently
  // narrow the value.
  if (total_seconds < 0) {
    total_seconds = -total_seconds;
  }

  // calculate offset components
  int64_t hours, minutes, seconds;
  seconds = split_time(total_seconds, 60, &minutes);
  minutes = split_time(minutes, 60, &hours);
  if (seconds > 0) {
    // check there are no remaining seconds
    return Status::Invalid("Offset must represent whole number of minutes");
  }

  // construct the timezone string; the fill character is sticky, the width
  // has to be set again for each field
  std::stringstream stream;
  stream << sign << std::setfill('0') << std::setw(2) << hours << ":" << std::setw(2)
         << minutes;
  return stream.str();
}

// Converted from python. See https://github.com/apache/arrow/pull/7604
// for details.
//
// Builds a pytz timezone object from `tz`:
//  * strings accepted by MatchFixedOffset become pytz.FixedOffset(minutes),
//    where minutes is the signed total offset;
//  * any other string is passed to pytz.timezone(tz).
// The returned pointer is the result of the Python call (a new reference
// owned by the caller). Import failures and Python exceptions are returned
// as an error status.
Result<PyObject*> StringToTzinfo(const std::string& tz) {
  util::string_view sign_str, hour_str, minute_str;
  OwnedRef pytz;
  RETURN_NOT_OK(internal::ImportModule("pytz", &pytz));

  if (MatchFixedOffset(tz, &sign_str, &hour_str, &minute_str)) {
    int sign = -1;
    if (sign_str == "+") {
      sign = 1;
    }
    OwnedRef fixed_offset;
    RETURN_NOT_OK(internal::ImportFromModule(pytz.obj(), "FixedOffset", &fixed_offset));
    uint32_t minutes, hours;
    if (!::arrow::internal::ParseUnsigned(hour_str.data(), hour_str.size(), &hours) ||
        !::arrow::internal::ParseUnsigned(minute_str.data(), minute_str.size(),
                                          &minutes)) {
      return Status::Invalid("Invalid timezone: ", tz);
    }
    OwnedRef total_minutes(PyLong_FromLong(
        sign * ((static_cast<int>(hours) * 60) + static_cast<int>(minutes))));
    RETURN_IF_PYERROR();
    auto tzinfo =
        PyObject_CallFunctionObjArgs(fixed_offset.obj(), total_minutes.obj(), NULL);
    RETURN_IF_PYERROR();
    return tzinfo;
  }

  OwnedRef timezone;
  RETURN_NOT_OK(internal::ImportFromModule(pytz.obj(), "timezone", &timezone));
  OwnedRef py_tz_string(
      PyUnicode_FromStringAndSize(tz.c_str(), static_cast<Py_ssize_t>(tz.size())));
  // The conversion fails (NULL + pending exception) if `tz` is not valid
  // UTF-8. Stop here: a NULL first argument would be read as the end of the
  // argument list by PyObject_CallFunctionObjArgs below.
  RETURN_IF_PYERROR();
  auto tzinfo = PyObject_CallFunctionObjArgs(timezone.obj(), py_tz_string.obj(), NULL);
  RETURN_IF_PYERROR();
  return tzinfo;
}

// Convert a Python tzinfo object to a string.
//
// Instances of datetime.timezone or pytz._FixedOffset are rendered as a
// "+HH:MM" / "-HH:MM" offset. For any other tzinfo, the result of
// tzinfo.tzname(None) is returned when it is a str; otherwise the offset
// string derived from tzinfo.utcoffset(None) is used as a fallback.
// Returns TypeError if `tzinfo` is not a datetime.tzinfo instance; import
// failures and Python exceptions are returned as an error status.
Result<std::string> TzinfoToString(PyObject* tzinfo) {
  OwnedRef module_pytz;        // import pytz
  OwnedRef module_datetime;    // import datetime
  OwnedRef class_timezone;     // from datetime import timezone
  OwnedRef class_fixedoffset;  // from pytz import _FixedOffset

  // import necessary modules
  RETURN_NOT_OK(internal::ImportModule("pytz", &module_pytz));
  RETURN_NOT_OK(internal::ImportModule("datetime", &module_datetime));
  // import necessary classes
  RETURN_NOT_OK(
      internal::ImportFromModule(module_pytz.obj(), "_FixedOffset", &class_fixedoffset));
  RETURN_NOT_OK(
      internal::ImportFromModule(module_datetime.obj(), "timezone", &class_timezone));

  // check that it's a valid tzinfo object
  if (!PyTZInfo_Check(tzinfo)) {
    return Status::TypeError("Not an instance of datetime.tzinfo");
  }

  // if tzinfo is an instance of pytz._FixedOffset or datetime.timezone return the
  // HH:MM offset string representation.
  // PyObject_IsInstance returns -1 (with an exception set) on failure, so each
  // call is checked instead of treating any non-zero result as a match.
  int is_fixed_offset = PyObject_IsInstance(tzinfo, class_timezone.obj());
  RETURN_IF_PYERROR();
  if (is_fixed_offset == 0) {
    is_fixed_offset = PyObject_IsInstance(tzinfo, class_fixedoffset.obj());
    RETURN_IF_PYERROR();
  }
  if (is_fixed_offset > 0) {
    return PyTZInfo_utcoffset_hhmm(tzinfo);
  }

  // attempt to call tzinfo.tzname(None)
  OwnedRef tzname_object(PyObject_CallMethod(tzinfo, "tzname", "O", Py_None));
  RETURN_IF_PYERROR();
  if (PyUnicode_Check(tzname_object.obj())) {
    std::string result;
    RETURN_NOT_OK(internal::PyUnicode_AsStdString(tzname_object.obj(), &result));
    return result;
  }

  // fall back to HH:MM offset string representation based on tzinfo.utcoffset(None)
  return PyTZInfo_utcoffset_hhmm(tzinfo);
}

} // namespace internal
} // namespace py
} // namespace arrow
26 changes: 26 additions & 0 deletions cpp/src/arrow/python/datetime.h
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,32 @@ inline int64_t PyDelta_to_ns(PyDateTime_Delta* pytimedelta) {
return PyDelta_to_us(pytimedelta) * 1000;
}

ARROW_PYTHON_EXPORT
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do these need to be exported?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Well, the other utility functions are exported as well.

Result<int64_t> PyDateTime_utcoffset_s(PyObject* pydatetime);

/// \brief Convert a time zone name into a time zone object.
///
/// Supported input strings are:
/// * As used in the Olson time zone database (the "tz database" or
/// "tzdata"), such as "America/New_York"
/// * An absolute time zone offset of the form +XX:XX or -XX:XX, such as +07:30
///
/// GIL must be held when calling this method.
ARROW_PYTHON_EXPORT
Result<PyObject*> StringToTzinfo(const std::string& tz);

/// \brief Convert a time zone object to a string representation.
///
/// The output strings are:
/// * An absolute time zone offset of the form +XX:XX or -XX:XX, such as +07:30
/// if the input object is either an instance of pytz._FixedOffset or
/// datetime.timezone
/// * The timezone's name if the input object's tzname() method returns with a
/// non-empty timezone name such as "UTC" or "America/New_York"
///
/// GIL must be held when calling this method.
ARROW_PYTHON_EXPORT
Result<std::string> TzinfoToString(PyObject* pytzinfo);

} // namespace internal
} // namespace py
} // namespace arrow
Loading