diff --git a/.github/workflows/cpp.yml b/.github/workflows/cpp.yml index fdc29fff5e4..7401fc489c9 100644 --- a/.github/workflows/cpp.yml +++ b/.github/workflows/cpp.yml @@ -239,6 +239,9 @@ jobs: with: fetch-depth: 0 submodules: recursive + - name: Download Timezone Database + shell: bash + run: ci/scripts/download_tz_database.sh - name: Build shell: bash run: ci/scripts/cpp_build.sh $(pwd) $(pwd)/build @@ -319,6 +322,9 @@ jobs: run: | export CMAKE_BUILD_PARALLEL_LEVEL=$NUMBER_OF_PROCESSORS ci/scripts/cpp_build.sh "$(pwd)" "$(pwd)/build" + - name: Download Timezone Database + shell: bash + run: ci/scripts/download_tz_database.sh - name: Download MinIO shell: msys2 {0} run: | diff --git a/ci/appveyor-cpp-setup.bat b/ci/appveyor-cpp-setup.bat index 8db35831673..936306296f6 100644 --- a/ci/appveyor-cpp-setup.bat +++ b/ci/appveyor-cpp-setup.bat @@ -115,3 +115,17 @@ powershell.exe -Command "Start-Process clcache-server" || exit /B if "%ARROW_S3%" == "ON" ( appveyor DownloadFile https://dl.min.io/server/minio/release/windows-amd64/minio.exe -FileName C:\Windows\Minio.exe || exit /B ) + + +@rem +@rem Download IANA Timezone Database for unit tests +@rem +@rem (Doc section: Download timezone database) +curl https://data.iana.org/time-zones/releases/tzdata2021e.tar.gz --output tzdata.tar.gz +mkdir tzdata +tar --extract --file tzdata.tar.gz --directory tzdata +move tzdata %USERPROFILE%\Downloads\tzdata +@rem Also need Windows timezone mapping +curl https://raw.githubusercontent.com/unicode-org/cldr/master/common/supplemental/windowsZones.xml ^ + --output %USERPROFILE%\Downloads\tzdata\windowsZones.xml +@rem (Doc section: Download timezone database) \ No newline at end of file diff --git a/ci/scripts/download_tz_database.sh b/ci/scripts/download_tz_database.sh new file mode 100644 index 00000000000..c4095ef9c2d --- /dev/null +++ b/ci/scripts/download_tz_database.sh @@ -0,0 +1,30 @@ +#!/usr/bin/env bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more 
contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +set -ex + +# Download database +curl https://data.iana.org/time-zones/releases/tzdata2021e.tar.gz --output ~/Downloads/tzdata2021e.tar.gz + +# Extract +mkdir -p ~/Downloads/tzdata +tar --extract --file ~/Downloads/tzdata2021e.tar.gz --directory ~/Downloads/tzdata + +# Download Windows timezone mapping +curl https://raw.githubusercontent.com/unicode-org/cldr/master/common/supplemental/windowsZones.xml --output ~/Downloads/tzdata/windowsZones.xml diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_string.cc b/cpp/src/arrow/compute/kernels/scalar_cast_string.cc index efc41622004..8ac2123d69e 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_string.cc @@ -150,12 +150,6 @@ struct TemporalToStringCastFunctor { return Status::OK(); })); } else { -#ifdef _WIN32 - // TODO(ARROW-13168): - return Status::NotImplemented( - "Casting a timestamp with time zone to string is not yet supported on " - "Windows."); -#else switch (ty.unit()) { case TimeUnit::SECOND: RETURN_NOT_OK(ConvertZoned(input, timezone, &builder)); @@ -176,7 +170,6 @@ struct TemporalToStringCastFunctor { DCHECK(false); return Status::NotImplemented("Unimplemented time unit"); } -#endif } std::shared_ptr output_array; 
RETURN_NOT_OK(builder.Finish(&output_array)); diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc index f13b05ccd07..222b5ee88a4 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc @@ -34,6 +34,7 @@ #include "arrow/testing/extension_type.h" #include "arrow/testing/gtest_util.h" #include "arrow/testing/random.h" +#include "arrow/testing/util.h" #include "arrow/type.h" #include "arrow/type_fwd.h" #include "arrow/type_traits.h" @@ -1146,6 +1147,16 @@ constexpr char kTimestampSecondsJson[] = constexpr char kTimestampExtremeJson[] = R"(["1677-09-20T00:00:59.123456", "2262-04-13T23:23:23.999999"])"; +class CastTimezone : public ::testing::Test { + protected: + void SetUp() override { +#ifdef _WIN32 + // Initialize timezone database on Windows + ASSERT_OK(InitTestTimezoneDatabase()); +#endif + } +}; + TEST(Cast, TimestampToDate) { // See scalar_temporal_test.cc auto timestamps = ArrayFromJSON(timestamp(TimeUnit::NANO), kTimestampJson); @@ -1181,12 +1192,7 @@ TEST(Cast, TimestampToDate) { } } -TEST(Cast, ZonedTimestampToDate) { -#ifdef _WIN32 - // TODO(ARROW-13168): we lack tzdb on Windows - GTEST_SKIP() << "ARROW-13168: no access to timezone database on Windows"; -#endif - +TEST_F(CastTimezone, ZonedTimestampToDate) { { // See TestZoned in scalar_temporal_test.cc auto timestamps = @@ -1377,12 +1383,7 @@ TEST(Cast, TimestampToTime) { } } -TEST(Cast, ZonedTimestampToTime) { -#ifdef _WIN32 - // TODO(ARROW-13168): we lack tzdb on Windows - GTEST_SKIP() << "ARROW-13168: no access to timezone database on Windows"; -#endif - +TEST_F(CastTimezone, ZonedTimestampToTime) { CheckCast(ArrayFromJSON(timestamp(TimeUnit::NANO, "Pacific/Marquesas"), kTimestampJson), ArrayFromJSON(time64(TimeUnit::NANO), R"([ 52259123456789, 50003999999999, 56480001001001, 65000000000000, @@ -1573,8 +1574,7 @@ TEST(Cast, TimestampToString) { } } -#ifndef _WIN32 -TEST(Cast, 
TimestampWithZoneToString) { +TEST_F(CastTimezone, TimestampWithZoneToString) { for (auto string_type : {utf8(), large_utf8()}) { CheckCast( ArrayFromJSON(timestamp(TimeUnit::SECOND, "UTC"), "[-30610224000, -5364662400]"), @@ -1608,21 +1608,6 @@ TEST(Cast, TimestampWithZoneToString) { R"(["1968-11-30 13:30:44.123456789-0700", "2016-02-29 10:42:23.456789246-0700"])")); } } -#else -// TODO(ARROW-13168): we lack tzdb on Windows -TEST(Cast, TimestampWithZoneToString) { - for (auto string_type : {utf8(), large_utf8()}) { - ASSERT_RAISES(NotImplemented, Cast(ArrayFromJSON(timestamp(TimeUnit::SECOND, "UTC"), - "[-34226955, 1456767743]"), - CastOptions::Safe(string_type))); - - ASSERT_RAISES(NotImplemented, - Cast(ArrayFromJSON(timestamp(TimeUnit::SECOND, "America/Phoenix"), - "[-34226955, 1456767743]"), - CastOptions::Safe(string_type))); - } -} -#endif TEST(Cast, DateToDate) { auto day_32 = ArrayFromJSON(date32(), "[0, null, 100, 1, 10]"); diff --git a/cpp/src/arrow/compute/kernels/scalar_temporal_test.cc b/cpp/src/arrow/compute/kernels/scalar_temporal_test.cc index 38b810902ba..5d9690d58ff 100644 --- a/cpp/src/arrow/compute/kernels/scalar_temporal_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_temporal_test.cc @@ -24,6 +24,7 @@ #include "arrow/compute/kernels/test_util.h" #include "arrow/testing/gtest_util.h" #include "arrow/testing/matchers.h" +#include "arrow/testing/util.h" #include "arrow/type.h" #include "arrow/util/checked_cast.h" #include "arrow/util/formatting.h" @@ -407,6 +408,14 @@ class ScalarTemporalTest : public ::testing::Test { RoundTemporalOptions round_to_15_quarters = RoundTemporalOptions(15, CalendarUnit::QUARTER); RoundTemporalOptions round_to_15_years = RoundTemporalOptions(15, CalendarUnit::YEAR); + + protected: + void SetUp() override { +#ifdef _WIN32 + // Initialize timezone database on Windows + ASSERT_OK(InitTestTimezoneDatabase()); +#endif + } }; TEST_F(ScalarTemporalTest, TestTemporalComponentExtractionAllTemporalTypes) { @@ -564,8 +573,6 
@@ TEST_F(ScalarTemporalTest, TestOutsideNanosecondRange) { CheckScalarUnary("subsecond", unit, times, float64(), subsecond); } -#ifndef _WIN32 -// TODO: We should test on windows once ARROW-13168 is resolved. TEST_F(ScalarTemporalTest, TestIsLeapYear) { auto is_leap_year_marquesas = "[false, true, false, false, false, false, false, false, false, false, false, " @@ -792,7 +799,6 @@ TEST_F(ScalarTemporalTest, TestNonexistentTimezone) { ASSERT_RAISES(Invalid, Subsecond(timestamp_array)); } } -#endif TEST_F(ScalarTemporalTest, Week) { auto unit = timestamp(TimeUnit::NANO); @@ -1607,8 +1613,6 @@ TEST_F(ScalarTemporalTest, TestTemporalDifferenceErrors) { CallFunction("weeks_between", {arr1, arr1}, &options)); } -// TODO: We should test on windows once ARROW-13168 is resolved. -#ifndef _WIN32 TEST_F(ScalarTemporalTest, TestAssumeTimezone) { std::string timezone_utc = "UTC"; std::string timezone_kolkata = "Asia/Kolkata"; @@ -1875,6 +1879,9 @@ TEST_F(ScalarTemporalTest, StrftimeCLocale) { } TEST_F(ScalarTemporalTest, StrftimeOtherLocale) { +#ifdef _WIN32 + GTEST_SKIP() << "There is a known bug in strftime for locales on Windows (ARROW-15922)"; +#else if (!LocaleExists("fr_FR.UTF-8")) { GTEST_SKIP() << "locale 'fr_FR.UTF-8' doesn't exist on this system"; } @@ -1886,6 +1893,7 @@ TEST_F(ScalarTemporalTest, StrftimeOtherLocale) { ["01 janvier 1970 00:00:59,123", "18 août 2021 15:11:50,456", null])"; CheckScalarUnary("strftime", timestamp(TimeUnit::MILLI, "UTC"), milliseconds, utf8(), expected, &options); +#endif } TEST_F(ScalarTemporalTest, StrftimeInvalidLocale) { @@ -2579,7 +2587,6 @@ TEST_F(ScalarTemporalTest, TestCeilFloorRoundTemporalKolkata) { CheckScalarUnary("round_temporal", unit, times, unit, round_1_hours, &round_to_1_hours); CheckScalarUnary("round_temporal", unit, times, unit, round_2_hours, &round_to_2_hours); } -#endif // !_WIN32 } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/scalar_temporal_unary.cc 
b/cpp/src/arrow/compute/kernels/scalar_temporal_unary.cc index ed08c367664..51b0552429f 100644 --- a/cpp/src/arrow/compute/kernels/scalar_temporal_unary.cc +++ b/cpp/src/arrow/compute/kernels/scalar_temporal_unary.cc @@ -1046,7 +1046,6 @@ struct RoundTemporal { // ---------------------------------------------------------------------- // Convert timestamps to a string representation with an arbitrary format -#ifndef _WIN32 Result GetLocale(const std::string& locale) { try { return std::locale(locale.c_str()); @@ -1130,18 +1129,6 @@ struct Strftime { return Status::OK(); } }; -#else -// TODO(ARROW-13168) -template -struct Strftime { - static Status Call(KernelContext* ctx, const Scalar& in, Scalar* out) { - return Status::NotImplemented("Strftime not yet implemented on windows."); - } - static Status Call(KernelContext* ctx, const ArrayData& in, ArrayData* out) { - return Status::NotImplemented("Strftime not yet implemented on windows."); - } -}; -#endif // ---------------------------------------------------------------------- // Convert timestamps from local timestamp without a timezone to timestamps with a diff --git a/cpp/src/arrow/config.cc b/cpp/src/arrow/config.cc index 93df10b097c..a93a8feae1d 100644 --- a/cpp/src/arrow/config.cc +++ b/cpp/src/arrow/config.cc @@ -21,6 +21,7 @@ #include "arrow/util/config.h" #include "arrow/util/cpu_info.h" +#include "arrow/vendored/datetime.h" namespace arrow { @@ -62,6 +63,8 @@ std::string MakeSimdLevelString(QueryFlagFunction&& query_flag) { } } +util::optional timezone_db_path; + }; // namespace const BuildInfo& GetBuildInfo() { return kBuildInfo; } @@ -73,7 +76,32 @@ RuntimeInfo GetRuntimeInfo() { MakeSimdLevelString([&](int64_t flags) { return cpu_info->IsSupported(flags); }); info.detected_simd_level = MakeSimdLevelString([&](int64_t flags) { return cpu_info->IsDetected(flags); }); + info.using_os_timezone_db = USE_OS_TZDB; +#if !USE_OS_TZDB + info.timezone_db_path = timezone_db_path; +#else + info.timezone_db_path = 
util::optional(); +#endif return info; } +Status Initialize(const GlobalOptions& options) noexcept { + if (options.timezone_db_path.has_value()) { +#if !USE_OS_TZDB + try { + arrow_vendored::date::set_install(options.timezone_db_path.value()); + arrow_vendored::date::reload_tzdb(); + } catch (const std::runtime_error& e) { + return Status::IOError(e.what()); + } + timezone_db_path = options.timezone_db_path.value(); +#else + return Status::Invalid( + "Arrow was set to use OS timezone database at compile time, " + "so a downloaded database cannot be provided at runtime."); +#endif // !USE_OS_TZDB + } + return Status::OK(); +} + } // namespace arrow diff --git a/cpp/src/arrow/config.h b/cpp/src/arrow/config.h index a485b91a4a5..87e31cc456a 100644 --- a/cpp/src/arrow/config.h +++ b/cpp/src/arrow/config.h @@ -19,7 +19,9 @@ #include +#include "arrow/status.h" #include "arrow/util/config.h" // IWYU pragma: export +#include "arrow/util/optional.h" #include "arrow/util/visibility.h" namespace arrow { @@ -62,6 +64,13 @@ struct RuntimeInfo { /// The SIMD level available on the OS and CPU std::string detected_simd_level; + + /// Whether using the OS-based timezone database + /// This is set at compile-time. + bool using_os_timezone_db; + + /// The path to the timezone database; by default None. + util::optional timezone_db_path; }; /// \brief Get runtime build info. @@ -77,4 +86,13 @@ const BuildInfo& GetBuildInfo(); ARROW_EXPORT RuntimeInfo GetRuntimeInfo(); +struct GlobalOptions { + /// Path to text timezone database. This is only configurable on Windows, + /// which does not have a compatible OS timezone database. 
+ util::optional timezone_db_path; +}; + +ARROW_EXPORT +Status Initialize(const GlobalOptions& options) noexcept; + } // namespace arrow diff --git a/cpp/src/arrow/public_api_test.cc b/cpp/src/arrow/public_api_test.cc index f6f78295499..45f3313c67f 100644 --- a/cpp/src/arrow/public_api_test.cc +++ b/cpp/src/arrow/public_api_test.cc @@ -85,6 +85,8 @@ #include #include +#include "arrow/testing/gtest_util.h" +#include "arrow/testing/util.h" namespace arrow { @@ -103,4 +105,46 @@ TEST(Misc, BuildInfo) { ASSERT_THAT(info.full_so_version, ::testing::HasSubstr(info.so_version)); } +TEST(Misc, SetTimezoneConfig) { +#ifndef _WIN32 + GTEST_SKIP() << "Can only set the Timezone database on Windows"; +#elif !defined(ARROW_FILESYSTEM) + GTEST_SKIP() << "Need filesystem support to test timezone config."; +#else + auto fs = std::make_shared(); + + util::optional tzdata_result = GetTestTimezoneDatabaseRoot(); + std::string tzdata_dir; + if (tzdata_result.has_value()) { + tzdata_dir = tzdata_result.value(); + } else { + auto home_raw = std::getenv("USERPROFILE"); + std::string home = home_raw == nullptr ? 
"~" : std::string(home_raw); + ASSERT_OK_AND_ASSIGN(tzdata_dir, fs->NormalizePath(home + "\\Downloads\\tzdata")); + } + ASSERT_OK_AND_ASSIGN(tzdata_dir, fs->NormalizePath(tzdata_dir)); + ASSERT_OK_AND_ASSIGN(auto tzdata_path, + arrow::internal::PlatformFilename::FromString(tzdata_dir)); + + if (!arrow::internal::FileExists(tzdata_path).ValueOr(false)) { + GTEST_SKIP() << "Couldn't find timezone database in expected dir: " << tzdata_dir; + } + // Create a tmp directory + ASSERT_OK_AND_ASSIGN(auto tempdir, arrow::internal::TemporaryDir::Make("tzdata")); + + // Validate that setting tzdb to that dir fails + arrow::GlobalOptions options = {util::make_optional(tempdir->path().ToString())}; + ASSERT_NOT_OK(arrow::Initialize(options)); + + // Copy tzdb data from ~/Downloads + auto selector = arrow::fs::FileSelector(); + selector.base_dir = tzdata_dir; + selector.recursive = true; + ASSERT_OK(arrow::fs::CopyFiles(fs, selector, fs, tempdir->path().ToString())); + + // Validate that tzdb is working + ASSERT_OK(arrow::Initialize(options)); +#endif +} + } // namespace arrow diff --git a/cpp/src/arrow/testing/util.cc b/cpp/src/arrow/testing/util.cc index 19917185130..16da96452e0 100644 --- a/cpp/src/arrow/testing/util.cc +++ b/cpp/src/arrow/testing/util.cc @@ -37,6 +37,7 @@ #include // IWYU pragma: keep #endif +#include "arrow/config.h" #include "arrow/table.h" #include "arrow/testing/random.h" #include "arrow/type.h" @@ -108,6 +109,25 @@ Status GetTestResourceRoot(std::string* out) { return Status::OK(); } +util::optional GetTestTimezoneDatabaseRoot() { + const char* c_root = std::getenv("ARROW_TIMEZONE_DATABASE"); + if (!c_root) { + return util::optional(); + } + return util::make_optional(std::string(c_root)); +} + +Status InitTestTimezoneDatabase() { + auto maybe_tzdata = GetTestTimezoneDatabaseRoot(); + // If missing, timezone database will default to %USERPROFILE%\Downloads\tzdata + if (!maybe_tzdata.has_value()) return Status::OK(); + + auto tzdata_path = 
std::string(maybe_tzdata.value()); + arrow::GlobalOptions options = {util::make_optional(tzdata_path)}; + ARROW_RETURN_NOT_OK(arrow::Initialize(options)); + return Status::OK(); +} + int GetListenPort() { // Get a new available port number by binding a socket to an ephemeral port // and then closing it. Since ephemeral port allocation tends to avoid diff --git a/cpp/src/arrow/testing/util.h b/cpp/src/arrow/testing/util.h index 786ab3814aa..b993d86ed63 100644 --- a/cpp/src/arrow/testing/util.h +++ b/cpp/src/arrow/testing/util.h @@ -34,6 +34,7 @@ #include "arrow/testing/visibility.h" #include "arrow/type_fwd.h" #include "arrow/util/macros.h" +#include "arrow/util/optional.h" namespace arrow { @@ -110,6 +111,13 @@ UnionTypeFactories() { // Status ARROW_TESTING_EXPORT Status GetTestResourceRoot(std::string*); +// Return the value of the ARROW_TIMEZONE_DATABASE environment variable +ARROW_TESTING_EXPORT util::optional GetTestTimezoneDatabaseRoot(); + +// Set the Timezone database based on the ARROW_TIMEZONE_DATABASE env variable +// This is only relevant on Windows, since other OSs have compatible databases built-in +ARROW_TESTING_EXPORT Status InitTestTimezoneDatabase(); + // Get a TCP port number to listen on. This is a different number every time, // as reusing the same port across tests can produce spurious bind errors on // Windows. diff --git a/docs/source/cpp/api/support.rst b/docs/source/cpp/api/support.rst index 85374d27fb6..ac1c6f6d9c3 100644 --- a/docs/source/cpp/api/support.rst +++ b/docs/source/cpp/api/support.rst @@ -73,6 +73,16 @@ introduced API). ``"7.0.1"``. +Runtime Configuration +===================== + +.. doxygenstruct:: arrow::GlobalOptions + :members: + + +.. 
doxygenfunction:: arrow::Initialize + + Error return and reporting ========================== diff --git a/docs/source/cpp/build_system.rst b/docs/source/cpp/build_system.rst index fbdd2eafbd2..d91d070e092 100644 --- a/docs/source/cpp/build_system.rst +++ b/docs/source/cpp/build_system.rst @@ -163,3 +163,26 @@ can control the source of each dependency and whether it is statically or dynamically linked. See :doc:`/developers/cpp/building` for instructions. Or alternatively, use Arrow from a package manager such as Conda or vcpkg which will manage consistent versions of Arrow and its dependencies. + + +.. _download-timezone-database: + +Runtime Dependencies +==================== + +While Arrow uses the OS-provided timezone database on Linux and macOS, it +requires a user-provided database on Windows. You must download and extract the +text version of the IANA timezone database and add the Windows timezone mapping +XML. To download, you can use the following batch script: + +.. literalinclude:: ../../../ci/appveyor-cpp-setup.bat + :language: cmd + :start-after: @rem (Doc section: Download timezone database) + :end-before: @rem (Doc section: Download timezone database) + +By default, the timezone database will be detected at ``%USERPROFILE%\Downloads\tzdata``, +but you can set a custom path at runtime in :struct:`arrow::GlobalOptions`:: + + arrow::GlobalOptions options; + options.timezone_db_path = "path/to/tzdata"; + ARROW_RETURN_NOT_OK(arrow::Initialize(options)); diff --git a/docs/source/developers/cpp/windows.rst b/docs/source/developers/cpp/windows.rst index b2c0c238ff2..ad3f47749bd 100644 --- a/docs/source/developers/cpp/windows.rst +++ b/docs/source/developers/cpp/windows.rst @@ -362,6 +362,15 @@ suppress dllimport/dllexport marking of symbols. Projects that statically link against Arrow on Windows additionally need this definition. The Unix builds do not use the macro. 
+Downloading the Timezone Database +================================= + +To run some of the compute unit tests on Windows, the IANA timezone database +and the Windows timezone mapping need to be downloaded first. See +:ref:`download-timezone-database` for download instructions. To set a non-default +path for the timezone database while running the unit tests, set the +``ARROW_TIMEZONE_DATABASE`` environment variable. + Replicating Appveyor Builds =========================== diff --git a/r/DESCRIPTION b/r/DESCRIPTION index 1f721468765..36a55c05b26 100644 --- a/r/DESCRIPTION +++ b/r/DESCRIPTION @@ -58,6 +58,7 @@ Suggests: stringr, testthat (>= 3.1.0), tibble, + tzdb, withr LinkingTo: cpp11 (>= 0.4.2) Collate: diff --git a/r/R/arrow-package.R b/r/R/arrow-package.R index 9424fc9228d..5ab82d88a88 100644 --- a/r/R/arrow-package.R +++ b/r/R/arrow-package.R @@ -65,6 +65,14 @@ # Disable multithreading on Windows # See https://issues.apache.org/jira/browse/ARROW-8379 options(arrow.use_threads = FALSE) + + # Try to set timezone database + if (requireNamespace("tzdb", quietly = TRUE)) { + tzdb::tzdb_initialize() + set_timezone_database(tzdb::tzdb_path("text")) + } else { + warning("The tzdb package is not installed. Timezones will not be available.") + } } invisible() diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R index 6774ef39d4f..55886b2ea03 100644 --- a/r/R/arrowExports.R +++ b/r/R/arrowExports.R @@ -468,6 +468,10 @@ runtime_info <- function() { .Call(`_arrow_runtime_info`) } +set_timezone_database <- function(path) { + invisible(.Call(`_arrow_set_timezone_database`, path)) +} + csv___WriteOptions__initialize <- function(options) { .Call(`_arrow_csv___WriteOptions__initialize`, options) } diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index 8f5a7689c07..88e772884ac 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -15,6 +15,15 @@ # specific language governing permissions and limitations # under the License. 
+check_time_locale <- function(locale = Sys.getlocale("LC_TIME")) { + if (tolower(Sys.info()[["sysname"]]) == "windows" & locale != "C") { + # MingW C++ std::locale only supports "C" and "POSIX" + stop(paste0("On Windows, time locales other than 'C' are not supported in Arrow. ", + "Consider setting `Sys.setlocale('LC_TIME', 'C')`")) + } + locale +} + register_bindings_datetime <- function() { register_binding("strptime", function(x, format = "%Y-%m-%d %H:%M:%S", tz = NULL, unit = "ms") { @@ -48,7 +57,7 @@ register_bindings_datetime <- function() { } else { ts <- x } - Expression$create("strftime", ts, options = list(format = format, locale = Sys.getlocale("LC_TIME"))) + Expression$create("strftime", ts, options = list(format = format, locale = check_time_locale())) }) register_binding("format_ISO8601", function(x, usetz = FALSE, precision = NULL, ...) { @@ -95,7 +104,7 @@ register_bindings_datetime <- function() { } else { format <- "%A" } - return(Expression$create("strftime", x, options = list(format = format, locale = locale))) + return(Expression$create("strftime", x, options = list(format = format, locale = check_time_locale(locale)))) } Expression$create("day_of_week", x, options = list(count_from_zero = FALSE, week_start = week_start)) @@ -133,7 +142,7 @@ register_bindings_datetime <- function() { } else { format <- "%B" } - return(build_expr("strftime", x, options = list(format = format, locale = locale))) + return(build_expr("strftime", x, options = list(format = format, locale = check_time_locale(locale)))) } build_expr("month", x) diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index 5774a793813..c875206dada 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -1840,6 +1840,22 @@ extern "C" SEXP _arrow_runtime_info(){ } #endif +// config.cpp +#if defined(ARROW_R_WITH_ARROW) +void set_timezone_database(cpp11::strings path); +extern "C" SEXP _arrow_set_timezone_database(SEXP path_sexp){ +BEGIN_CPP11 + arrow::r::Input::type 
path(path_sexp); + set_timezone_database(path); + return R_NilValue; +END_CPP11 +} +#else +extern "C" SEXP _arrow_set_timezone_database(SEXP path_sexp){ + Rf_error("Cannot call set_timezone_database(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. "); +} +#endif + // csv.cpp #if defined(ARROW_R_WITH_ARROW) std::shared_ptr csv___WriteOptions__initialize(cpp11::list options); @@ -7619,11 +7635,11 @@ return Rf_ScalarLogical( ); } static const R_CallMethodDef CallEntries[] = { -{ "_arrow_available", (DL_FUNC)& _arrow_available, 0 }, -{ "_dataset_available", (DL_FUNC)& _dataset_available, 0 }, -{ "_parquet_available", (DL_FUNC)& _parquet_available, 0 }, -{ "_s3_available", (DL_FUNC)& _s3_available, 0 }, -{ "_json_available", (DL_FUNC)& _json_available, 0 }, + { "_arrow_available", (DL_FUNC)& _arrow_available, 0 }, + { "_dataset_available", (DL_FUNC)& _dataset_available, 0 }, + { "_parquet_available", (DL_FUNC)& _parquet_available, 0 }, + { "_s3_available", (DL_FUNC)& _s3_available, 0 }, + { "_json_available", (DL_FUNC)& _json_available, 0 }, { "_arrow_test_SET_STRING_ELT", (DL_FUNC) &_arrow_test_SET_STRING_ELT, 1}, { "_arrow_is_arrow_altrep", (DL_FUNC) &_arrow_is_arrow_altrep, 1}, { "_arrow_Array__Slice1", (DL_FUNC) &_arrow_Array__Slice1, 2}, @@ -7741,6 +7757,7 @@ static const R_CallMethodDef CallEntries[] = { { "_arrow_compute__GetFunctionNames", (DL_FUNC) &_arrow_compute__GetFunctionNames, 0}, { "_arrow_build_info", (DL_FUNC) &_arrow_build_info, 0}, { "_arrow_runtime_info", (DL_FUNC) &_arrow_runtime_info, 0}, + { "_arrow_set_timezone_database", (DL_FUNC) &_arrow_set_timezone_database, 1}, { "_arrow_csv___WriteOptions__initialize", (DL_FUNC) &_arrow_csv___WriteOptions__initialize, 1}, { "_arrow_csv___ReadOptions__initialize", (DL_FUNC) &_arrow_csv___ReadOptions__initialize, 1}, { "_arrow_csv___ParseOptions__initialize", (DL_FUNC) &_arrow_csv___ParseOptions__initialize, 1}, diff --git a/r/src/config.cpp 
b/r/src/config.cpp index 497843573bb..763ddee2cc1 100644 --- a/r/src/config.cpp +++ b/r/src/config.cpp @@ -20,6 +20,7 @@ #if defined(ARROW_R_WITH_ARROW) #include +#include // [[arrow::export]] std::vector build_info() { @@ -34,4 +35,16 @@ std::vector runtime_info() { return {info.simd_level, info.detected_simd_level}; } +// [[arrow::export]] +void set_timezone_database(cpp11::strings path) { + auto paths = cpp11::as_cpp>(path); + if (path.size() != 1) { + cpp11::stop("Must provide a single path to the timezone database."); + } + + arrow::GlobalOptions options; + options.timezone_db_path = arrow::util::make_optional(paths[0]); + arrow::StopIfNotOk(arrow::Initialize(options)); +} + #endif diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index d0afda8912d..1e3ad628be4 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -26,13 +26,12 @@ library(dplyr, warn.conflicts = FALSE) # TODO: consider reevaluating this workaround after ARROW-12980 withr::local_timezone("UTC") -# TODO: We should test on windows once ARROW-13168 is resolved. 
if (tolower(Sys.info()[["sysname"]]) == "windows") { - test_date <- as.POSIXct("2017-01-01 00:00:11.3456789", tz = "") -} else { - test_date <- as.POSIXct("2017-01-01 00:00:11.3456789", tz = "Pacific/Marquesas") + withr::local_locale(LC_TIME = "C") } +test_date <- as.POSIXct("2017-01-01 00:00:11.3456789", tz = "Pacific/Marquesas") + test_df <- tibble::tibble( # test_date + 1 turns the tzone = "" to NULL, which is functionally equivalent @@ -120,8 +119,6 @@ test_that("errors in strptime", { }) test_that("strftime", { - skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168 - times <- tibble( datetime = c(lubridate::ymd_hms("2018-10-07 19:04:05", tz = "Etc/GMT+6"), NA), date = c(as.Date("2021-01-01"), NA) @@ -184,13 +181,15 @@ test_that("strftime", { # This check is due to differences in the way %c currently works in Arrow and R's strftime. # We can revisit after https://github.com/HowardHinnant/date/issues/704 is resolved. - expect_error( - times %>% - Table$create() %>% - mutate(x = strftime(datetime, format = "%c")) %>% - collect(), - "%c flag is not supported in non-C locales." - ) + if (Sys.getlocale("LC_TIME") != "C") { + expect_error( + times %>% + Table$create() %>% + mutate(x = strftime(datetime, format = "%c")) %>% + collect(), + "%c flag is not supported in non-C locales." + ) + } # Output precision of %S depends on the input timestamp precision. 
# Timestamps with second precision are represented as integers while @@ -209,8 +208,6 @@ test_that("strftime", { test_that("format_ISO8601", { # https://issues.apache.org/jira/projects/ARROW/issues/ARROW-15266 skip_if_not_available("re2") - # https://issues.apache.org/jira/browse/ARROW-13168 - skip_on_os("windows") times <- tibble(x = c(lubridate::ymd_hms("2018-10-07 19:04:05", tz = "Etc/GMT+6"), NA)) compare_dplyr_binding( @@ -356,8 +353,6 @@ test_that("extract month from timestamp", { test_df ) - skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168 - compare_dplyr_binding( .input %>% # R returns ordered factor whereas Arrow returns character @@ -434,8 +429,6 @@ test_that("extract wday from timestamp", { test_df ) - skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168 - compare_dplyr_binding( .input %>% mutate(x = wday(date, label = TRUE)) %>% @@ -538,15 +531,6 @@ test_that("extract quarter from date", { ) }) -test_that("extract month from date", { - compare_dplyr_binding( - .input %>% - mutate(x = month(date)) %>% - collect(), - test_df - ) -}) - test_that("extract isoweek from date", { compare_dplyr_binding( .input %>% @@ -582,8 +566,6 @@ test_that("extract month from date", { test_df ) - skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168 - compare_dplyr_binding( .input %>% # R returns ordered factor whereas Arrow returns character @@ -602,7 +584,6 @@ test_that("extract month from date", { ) }) - test_that("extract day from date", { compare_dplyr_binding( .input %>% @@ -634,8 +615,6 @@ test_that("extract wday from date", { test_df ) - skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168 - compare_dplyr_binding( .input %>% mutate(x = wday(date, label = TRUE, abbr = TRUE)) %>% @@ -704,10 +683,6 @@ test_that("leap_year mirror lubridate", { }) test_that("am/pm mirror lubridate", { - - # https://issues.apache.org/jira/browse/ARROW-13168 - skip_on_os("windows") - 
compare_dplyr_binding( .input %>% mutate( @@ -805,8 +780,6 @@ test_that("dst extracts daylight savings time correctly", { test_df <- tibble( dates = as.POSIXct(c("2021-02-20", "2021-07-31", "2021-10-31", "2021-01-31"), tz = "Europe/London") ) - # https://issues.apache.org/jira/browse/ARROW-13168 - skip_on_os("windows") compare_dplyr_binding( .input %>%