From 433f064d0b005ea064e3cfbd5ed4984b249e5d92 Mon Sep 17 00:00:00 2001 From: Torsten Kilias Date: Thu, 1 Jun 2023 11:27:18 +0200 Subject: [PATCH] #793: Added support for Pandas 2 pyarrow dtype columns for emitting data from Python UDFs (#357) * Update pandas to 2.0.2 in compatible template flavors * #796: Fixed silent data corruption when emitting dataframes with float16 dtype columns from Python UDFs * Replace asscalar with item in test dataframe.py, because asscalar was removed * Added handleEmitPyFloat to also support float pyarrow dtype columns with NAN and object-dtype columns with float * Refactored and split Pandas Tests * Added tests for more dtypes to Pandas Tests --- .../python/python3/python_ext_dataframe.cc | 275 +++++++++---- .../flavor_base/testconfig | 2 +- .../packages/python3_pip_packages | 5 +- .../flavor_base/testconfig | 2 +- .../language_deps/packages/conda_packages | 4 +- .../flavor_base/testconfig | 2 +- .../language_deps/packages/conda_packages | 4 +- .../flavor_base/testconfig | 2 +- .../test/{python3 => pandas}/all/dataframe.py | 44 +-- .../tests/test/pandas/all/emit_dtypes.py | 369 ++++++++++++++++++ .../tests/test/pandas/pandas2/pandas.py | 361 +++++++++++++++++ 11 files changed, 957 insertions(+), 113 deletions(-) rename test_container/tests/test/{python3 => pandas}/all/dataframe.py (96%) create mode 100755 test_container/tests/test/pandas/all/emit_dtypes.py create mode 100755 test_container/tests/test/pandas/pandas2/pandas.py diff --git a/exaudfclient/base/python/python3/python_ext_dataframe.cc b/exaudfclient/base/python/python3/python_ext_dataframe.cc index 4cd94e378..36f0fffdb 100644 --- a/exaudfclient/base/python/python3/python_ext_dataframe.cc +++ b/exaudfclient/base/python/python3/python_ext_dataframe.cc @@ -1,4 +1,5 @@ #include "exaudflib/swig/swig_common.h" +#include "debug_message.h" #include @@ -16,6 +17,7 @@ #include #include #include +#include extern "C" { @@ -25,9 +27,11 @@ extern "C" { #define PY_DATE (NPY_USERDEF+4) 
#define PY_NONETYPE (NPY_USERDEF+5) #define PY_BOOL (NPY_USERDEF+6) +#define PY_FLOAT (NPY_USERDEF+7) +#define PY_TIMESTAMP (NPY_USERDEF+8) + +std::map pandasDTypeStrToNumpyCTypeMap { -std::map typeMap { - {"bool", NPY_BOOL}, {"int", NPY_INT32}, {"intc", NPY_INT32}, {"intp", NPY_INT64}, @@ -35,22 +39,71 @@ std::map typeMap { {"int16", NPY_INT16}, {"int32", NPY_INT32}, {"int64", NPY_INT64}, + {"int8[pyarrow]", NPY_OBJECT}, + {"int16[pyarrow]", NPY_OBJECT}, + {"int32[pyarrow]", NPY_OBJECT}, + {"int64[pyarrow]", NPY_OBJECT}, + {"uint8", NPY_UINT8}, {"uint16", NPY_UINT16}, {"uint32", NPY_UINT32}, {"uint64", NPY_UINT64}, - {"float", NPY_FLOAT64}, - {"float16", NPY_FLOAT16}, + {"uint8[pyarrow]", NPY_OBJECT}, + {"uint16[pyarrow]", NPY_OBJECT}, + {"uint32[pyarrow]", NPY_OBJECT}, + {"uint64[pyarrow]", NPY_OBJECT}, + {"float32", NPY_FLOAT32}, {"float64", NPY_FLOAT64}, + {"float", NPY_FLOAT32}, + {"double", NPY_FLOAT64}, + {"float32[pyarrow]", NPY_OBJECT}, + {"float64[pyarrow]", NPY_OBJECT}, + {"float[pyarrow]", NPY_OBJECT}, + {"double[pyarrow]", NPY_OBJECT}, + // We let numpy convert float16 to float (32 bit) and then use the C conversion from float to double, because a proper conversion from float16 to double in C is very complicated. 
+ {"float16", NPY_FLOAT32}, + {"halffloat", NPY_FLOAT32}, + {"float16[pyarrow]", NPY_OBJECT}, + {"halffloat[pyarrow]", NPY_OBJECT}, + + {"string[pyarrow]", NPY_OBJECT}, + {"string[python]", NPY_OBJECT}, + {"string", NPY_OBJECT}, + + {"bool[pyarrow]", NPY_OBJECT}, + {"boolean", NPY_OBJECT}, + {"bool", NPY_BOOL}, + + {"datetime64[ns]", NPY_DATETIME}, + {"timestamp[ns, tz=UTC][pyarrow]", NPY_OBJECT}, + + {"object", NPY_OBJECT}, + + {"py_NAType", PY_NONETYPE}, + {"py_NoneType", PY_NONETYPE}, + {"py_bool", PY_BOOL}, {"py_int", PY_INT}, + {"py_float", PY_FLOAT}, {"py_decimal.Decimal", PY_DECIMAL}, {"py_str", PY_STR}, {"py_datetime.date", PY_DATE}, - {"datetime64[ns]", NPY_DATETIME}, - {"object", NPY_OBJECT}, - {"py_NoneType", PY_NONETYPE}, - {"py_bool", PY_BOOL} + {"py_Timestamp", PY_TIMESTAMP} +}; + +std::map numpyCTypeToNumpyDTypeStrMap { + {NPY_BOOL, "bool"}, + {NPY_INT8, "int8"}, + {NPY_INT16, "int16"}, + {NPY_INT32, "int32"}, + {NPY_INT64, "int64"}, + {NPY_UINT8, "uint8"}, + {NPY_UINT16, "uint16"}, + {NPY_UINT32, "uint32"}, + {NPY_UINT64, "uint64"}, + // We don't list NPY_FLOAT16 here, because we let numpy convert float16 to float (32 bit) and then use the C conversion from float to double, because a proper conversion from float16 to double in C is very complicated. 
+ {NPY_FLOAT32, "float32"}, + {NPY_FLOAT64, "float64"}, }; std::map emitTypeMap { @@ -390,17 +443,39 @@ inline void getColumnSetMethods(std::vector& colInfo, std::vector>& colTypes){ PyPtr numpyTypeIter(PyObject_GetIter(numpyTypes)); for (PyPtr numpyType(PyIter_Next(numpyTypeIter.get())); numpyType.get(); numpyType.reset(PyIter_Next(numpyTypeIter.get()))) { const char *typeName = PyUnicode_AsUTF8(numpyType.get()); - std::map::iterator it = typeMap.find(typeName); - if (it != typeMap.end()) { + std::map::iterator it = pandasDTypeStrToNumpyCTypeMap.find(typeName); + if (it != pandasDTypeStrToNumpyCTypeMap.end()) { colTypes.push_back(*it); + } else if(isArrowDecimal128(typeName)){ + colTypes.push_back({typeName, NPY_OBJECT}); } else if(isNumpyDatetime64(typeName)){ std::stringstream ss; ss << "F-UDF-CL-SL-PYTHON-1138: emit: unsupported datetime type: " << typeName << @@ -417,15 +492,23 @@ inline void getColumnTypeInfo(PyObject *numpyTypes, std::vectorob_type; const char* p = type->tp_name; PyObject* objectsRepresentation = PyObject_Repr(obj); const char* s = PyUnicode_AsUTF8(objectsRepresentation); - throw std::runtime_error(error_code+": "+std::string(s)+" "+std::string(p)); + DBG_STREAM_MSG(std::cerr, error_code << ": " << std::string(s) << " " << std::string(p)); +} + +inline const PyPtr& getPandasNA(){ + static const PyPtr pdNA(PyObject_GetAttrString(pandasModule.get(), "NA")); + return pdNA; +} + +inline bool isNoneOrNA(PyObject* pyVal){ + const PyPtr& pdNA = getPandasNA(); + return pyVal == Py_None || pyVal == pdNA.get(); } -#endif inline void getColumnArrays(PyObject *colArray, int numCols, int numRows, std::vector>& colTypes, std::vector& columnArrays){ @@ -448,14 +531,16 @@ inline void getColumnArrays(PyObject *colArray, int numCols, int numRows, throw std::runtime_error(ss.str().c_str()); } - // Get type of first non-None item in list + // Get type of first non-None, non-NA item in list PyObject *pyVal = PyList_GetItem(pyList.get(), 0); 
checkPyObjectIsNull(pyVal,"F-UDF-CL-SL-PYTHON-1126"); std::string pyTypeName(std::string("py_") + Py_TYPE(pyVal)->tp_name); - for (int r = 1; r < numRows && pyVal == Py_None; r++) { + bool pyValIsNoneOrNA = isNoneOrNA(pyVal); + for (int r = 1; r < numRows && pyValIsNoneOrNA; r++) { pyVal = PyList_GetItem(pyList.get(), r); + pyValIsNoneOrNA = isNoneOrNA(pyVal); checkPyObjectIsNull(pyVal,"F-UDF-CL-SL-PYTHON-1127"); - if (pyVal != Py_None) { + if (!pyValIsNoneOrNA) { pyTypeName = std::string("py_") + Py_TYPE(pyVal)->tp_name; break; } @@ -463,8 +548,8 @@ inline void getColumnArrays(PyObject *colArray, int numCols, int numRows, // Update type in column type info std::map::iterator userDefIt; - userDefIt = typeMap.find(pyTypeName); - if (userDefIt != typeMap.end()) { + userDefIt = pandasDTypeStrToNumpyCTypeMap.find(pyTypeName); + if (userDefIt != pandasDTypeStrToNumpyCTypeMap.end()) { colTypes[c] = *userDefIt; } else { // TODO accept pandas.Timestamp values @@ -493,9 +578,9 @@ inline void getColumnArrays(PyObject *colArray, int numCols, int numRows, PyPtr asType (PyObject_GetAttrString(array.get(), "astype")); PyPtr keywordArgs(PyDict_New()); PyDict_SetItemString(keywordArgs.get(), "copy", Py_False); - PyPtr funcArgs(Py_BuildValue("(s)", colTypes[c].first.c_str())); + const std::string numpyDTypeStr = numpyCTypeToNumpyDTypeStrMap.at(colTypes[c].second); + PyPtr funcArgs(Py_BuildValue("(s)", numpyDTypeStr.c_str())); PyPtr scalarArr(PyObject_Call(asType.get(), funcArgs.get(), keywordArgs.get())); - columnArrays.push_back(std::move(scalarArr)); } } @@ -724,43 +809,6 @@ inline void handleEmitNpyFloat32( pyResult.reset(PyObject_CallMethodObjArgs(resultHandler, pyColSetMethods[c].second.get(), pyColSetMethods[c].first.get(), pyValue.get(), NULL)); } -inline void handleEmitNpyFloat16( - int c, int r, - std::vector& columnArrays, - std::vector>& pyColSetMethods, - std::vector& colInfo, - std::vector>& colTypes, - PyObject *resultHandler, - PyPtr& pyValue, - PyPtr& pyResult, - 
PyPtr& pySetNullMethodName){ - double value = static_cast(*((uint16_t*)(PyArray_GETPTR1((PyArrayObject*)(columnArrays[c].get()), r)))); - if (npy_isnan(value)) { - pyResult.reset(PyObject_CallMethodObjArgs(resultHandler, pySetNullMethodName.get(), pyColSetMethods[c].first.get(), NULL)); - return; - } - switch (colInfo[c].type) { - case SWIGVMContainers::INT64: - case SWIGVMContainers::INT32: - pyValue.reset(PyLong_FromLong(static_cast(value))); - break; - case SWIGVMContainers::NUMERIC: - pyValue.reset(PyUnicode_FromString(std::to_string(value).c_str())); - break; - case SWIGVMContainers::DOUBLE: - pyValue.reset(PyFloat_FromDouble(value)); - break; - default: - { - std::stringstream ss; - ss << "F-UDF-CL-SL-PYTHON-1064: emit column " << c << " of type " << emitTypeMap.at(colInfo[c].type) << " but data given have type " << colTypes[c].first; - throw std::runtime_error(ss.str().c_str()); - } - } - checkPyPtrIsNull(pyValue); - pyResult.reset(PyObject_CallMethodObjArgs(resultHandler, pyColSetMethods[c].second.get(), pyColSetMethods[c].first.get(), pyValue.get(), NULL)); -} - inline void handleEmitNpyBool( int c, int r, std::vector& columnArrays, @@ -810,7 +858,7 @@ inline void handleEmitPyBool( PyPtr& pySetNullMethodName){ PyPtr pyBool(PyList_GetItem(columnArrays[c].get(), r)); checkPyPtrIsNull(pyBool); - if (pyBool.get() == Py_None) { + if (isNoneOrNA(pyBool.get())) { pyResult.reset(PyObject_CallMethodObjArgs(resultHandler, pySetNullMethodName.get(), pyColSetMethods[c].first.get(), NULL)); return; } @@ -851,7 +899,7 @@ inline void handleEmitPyInt( PyPtr& pySetNullMethodName){ PyPtr pyInt(PyList_GetItem(columnArrays[c].get(), r)); checkPyPtrIsNull(pyInt); - if (pyInt.get() == Py_None) { + if (isNoneOrNA(pyInt.get())) { pyResult.reset(PyObject_CallMethodObjArgs(resultHandler, pySetNullMethodName.get(), pyColSetMethods[c].first.get(), NULL)); return; } @@ -882,6 +930,54 @@ inline void handleEmitPyInt( pyResult.reset(PyObject_CallMethodObjArgs(resultHandler, 
pyColSetMethods[c].second.get(), pyColSetMethods[c].first.get(), pyValue.get(), NULL)); } +inline void handleEmitPyFloat( + int c, int r, + std::vector& columnArrays, + std::vector>& pyColSetMethods, + std::vector& colInfo, + std::vector>& colTypes, + PyObject *resultHandler, + PyPtr& pyValue, + PyPtr& pyResult, + PyPtr& pySetNullMethodName){ + PyPtr pyFloat(PyList_GetItem(columnArrays[c].get(), r)); + checkPyPtrIsNull(pyFloat); + if (isNoneOrNA(pyFloat.get())) { + pyResult.reset(PyObject_CallMethodObjArgs(resultHandler, pySetNullMethodName.get(), pyColSetMethods[c].first.get(), NULL)); + return; + } + + switch (colInfo[c].type) { + case SWIGVMContainers::INT64: + case SWIGVMContainers::INT32: + { + double value = PyFloat_AsDouble(pyFloat.get()); + if (value < 0 && PyErr_Occurred()) + throw std::runtime_error("F-UDF-CL-SL-PYTHON-1139: emit() PY_FLOAT: PyFloat_AsDouble error"); + if (npy_isnan(value)) { + pyResult.reset( + PyObject_CallMethodObjArgs(resultHandler, pySetNullMethodName.get(), pyColSetMethods[c].first.get(), NULL)); + return; + } + pyValue.reset(PyLong_FromLong(static_cast(value))); + break; + } + case SWIGVMContainers::NUMERIC: + pyValue.reset(PyObject_Str(pyFloat.get())); + break; + case SWIGVMContainers::DOUBLE: + pyValue.reset(pyFloat.release()); + break; + default: + { + std::stringstream ss; + ss << "F-UDF-CL-SL-PYTHON-1140: emit column " << c << " of type " << emitTypeMap.at(colInfo[c].type) << " but data given have type " << colTypes[c].first; + throw std::runtime_error(ss.str().c_str()); + } + } + pyResult.reset(PyObject_CallMethodObjArgs(resultHandler, pyColSetMethods[c].second.get(), pyColSetMethods[c].first.get(), pyValue.get(), NULL)); +} + inline void handleEmitPyDecimal( int c, int r, std::vector& columnArrays, @@ -896,7 +992,7 @@ inline void handleEmitPyDecimal( PyPtr& pyFloatMethodName ){ PyPtr pyDecimal(PyList_GetItem(columnArrays[c].get(), r)); - if (pyDecimal.get() == Py_None) { + if (isNoneOrNA(pyDecimal.get())) { 
pyResult.reset(PyObject_CallMethodObjArgs(resultHandler, pySetNullMethodName.get(), pyColSetMethods[c].first.get(), NULL)); return; } @@ -940,7 +1036,7 @@ inline void handleEmitPyStr( PyPtr& pySetNullMethodName){ PyPtr pyString(PyList_GetItem(columnArrays[c].get(), r)); - if (pyString.get() == Py_None) { + if (isNoneOrNA(pyString.get())) { pyResult.reset(PyObject_CallMethodObjArgs(resultHandler, pySetNullMethodName.get(), pyColSetMethods[c].first.get(), NULL)); return; } @@ -980,7 +1076,7 @@ inline void handleEmitPyDate( PyPtr& pySetNullMethodName, PyPtr& pyIsoformatMethodName){ PyPtr pyDate(PyList_GetItem(columnArrays[c].get(), r)); - if (pyDate.get() == Py_None) { + if (isNoneOrNA(pyDate.get())) { pyResult.reset(PyObject_CallMethodObjArgs(resultHandler, pySetNullMethodName.get(), pyColSetMethods[c].first.get(), NULL)); return; } @@ -1000,6 +1096,42 @@ inline void handleEmitPyDate( } } } +inline void handleEmitPyTimestamp( + int c, int r, + std::vector& columnArrays, + std::vector>& pyColSetMethods, + std::vector& colInfo, + std::vector>& colTypes, + PyObject *resultHandler, + PyPtr& pyValue, + PyPtr& pyResult, + PyPtr& pySetNullMethodName){ + PyPtr pyTimestamp(PyList_GetItem(columnArrays[c].get(), r)); + if (isNoneOrNA(pyTimestamp.get())) { + pyResult.reset(PyObject_CallMethodObjArgs(resultHandler, pySetNullMethodName.get(), pyColSetMethods[c].first.get(), NULL)); + return; + } + + switch (colInfo[c].type) { + case SWIGVMContainers::TIMESTAMP: + { + // We call here pandas.Timestamp.tz_localize(None), because we need to remove the timezone from the timestamp. + // Exasol doesn't support timezones, and if we don't remove the timezone, pandas.Timestamp.isoformat will add + // it to the generated string. 
+ pyTimestamp.reset(PyObject_CallMethod(pyTimestamp.get(), "tz_localize", "z", NULL)); + PyPtr pyIsoDatetime(PyObject_CallMethod(pyTimestamp.get(), "isoformat", "s", " ")); + pyResult.reset(PyObject_CallMethodObjArgs( + resultHandler, pyColSetMethods[c].second.get(), pyColSetMethods[c].first.get(), pyIsoDatetime.get(), NULL)); + break; + } + default: + { + std::stringstream ss; + ss << "F-UDF-CL-SL-PYTHON-1141: emit column " << c << " of type " << emitTypeMap.at(colInfo[c].type) << " but data given have type " << colTypes[c].first; + throw std::runtime_error(ss.str().c_str()); + } + } +} inline void handleEmitNpyDateTime( @@ -1146,12 +1278,6 @@ void emit(PyObject *resultHandler, std::vector& colInfo, PyObject *d handleEmitNpyFloat32(c, r, columnArrays, pyColSetMethods, colInfo, colTypes, resultHandler, pyValue, pyResult, pySetNullMethodName); break; } - case NPY_FLOAT16: - { - handleEmitNpyFloat16(c, r, columnArrays, pyColSetMethods, colInfo, colTypes, resultHandler, pyValue, pyResult, pySetNullMethodName); - break; - } - case NPY_BOOL: { handleEmitNpyBool(c, r, columnArrays, pyColSetMethods, colInfo, colTypes, resultHandler, pyValue, pyResult, pySetNullMethodName); @@ -1167,6 +1293,11 @@ void emit(PyObject *resultHandler, std::vector& colInfo, PyObject *d handleEmitPyInt(c, r, columnArrays, pyColSetMethods, colInfo, colTypes, resultHandler, pyValue, pyResult, pySetNullMethodName); break; } + case PY_FLOAT: + { + handleEmitPyFloat(c, r, columnArrays, pyColSetMethods, colInfo, colTypes, resultHandler, pyValue, pyResult, pySetNullMethodName); + break; + } case PY_DECIMAL: { handleEmitPyDecimal(c, r, columnArrays, pyColSetMethods, colInfo, colTypes, resultHandler, pyValue, pyResult, @@ -1184,6 +1315,12 @@ void emit(PyObject *resultHandler, std::vector& colInfo, PyObject *d pySetNullMethodName, pyIsoformatMethodName); break; } + case PY_TIMESTAMP: + { + handleEmitPyTimestamp(c, r, columnArrays, pyColSetMethods, colInfo, colTypes, resultHandler, pyValue, pyResult, + 
pySetNullMethodName); + break; + } case NPY_DATETIME: { handleEmitNpyDateTime(c, r, columnArrays, pyColSetMethods, colInfo, colTypes, resultHandler, pyValue, pyResult, diff --git a/flavors/python-3.7-minimal-EXASOL-6.2.0/flavor_base/testconfig b/flavors/python-3.7-minimal-EXASOL-6.2.0/flavor_base/testconfig index 7c6eefa41..8688dd718 100644 --- a/flavors/python-3.7-minimal-EXASOL-6.2.0/flavor_base/testconfig +++ b/flavors/python-3.7-minimal-EXASOL-6.2.0/flavor_base/testconfig @@ -1,2 +1,2 @@ generic_language_tests=python3 -test_folders=python3/all +test_folders=python3/all pandas/all diff --git a/flavors/python-3.8-minimal-EXASOL-6.2.0/flavor_base/language_deps/packages/python3_pip_packages b/flavors/python-3.8-minimal-EXASOL-6.2.0/flavor_base/language_deps/packages/python3_pip_packages index f0269d53c..827abf566 100644 --- a/flavors/python-3.8-minimal-EXASOL-6.2.0/flavor_base/language_deps/packages/python3_pip_packages +++ b/flavors/python-3.8-minimal-EXASOL-6.2.0/flavor_base/language_deps/packages/python3_pip_packages @@ -1,2 +1,3 @@ -pandas|1.3.4 -numpy|1.21.3 +pandas|2.0.2 +numpy|1.24.3 +pyarrow|12.0.0 diff --git a/flavors/python-3.8-minimal-EXASOL-6.2.0/flavor_base/testconfig b/flavors/python-3.8-minimal-EXASOL-6.2.0/flavor_base/testconfig index 7c6eefa41..d1bdb8113 100644 --- a/flavors/python-3.8-minimal-EXASOL-6.2.0/flavor_base/testconfig +++ b/flavors/python-3.8-minimal-EXASOL-6.2.0/flavor_base/testconfig @@ -1,2 +1,2 @@ generic_language_tests=python3 -test_folders=python3/all +test_folders=python3/all pandas/all pandas/pandas2 diff --git a/flavors/template-Exasol-all-python-3.8-conda/flavor_base/language_deps/packages/conda_packages b/flavors/template-Exasol-all-python-3.8-conda/flavor_base/language_deps/packages/conda_packages index 4ba668b64..465ae1c0a 100644 --- a/flavors/template-Exasol-all-python-3.8-conda/flavor_base/language_deps/packages/conda_packages +++ 
b/flavors/template-Exasol-all-python-3.8-conda/flavor_base/language_deps/packages/conda_packages @@ -1,6 +1,6 @@ python|3.8.13 -numpy|1.22.3 -pandas|1.4.2 +numpy|1.24.3 +pandas|2.0.2 libblas|3.9.0=15_linux64_mkl mamba|1.3.1 ld_impl_linux-64|2.36.1 diff --git a/flavors/template-Exasol-all-python-3.8-conda/flavor_base/testconfig b/flavors/template-Exasol-all-python-3.8-conda/flavor_base/testconfig index 7c6eefa41..d1bdb8113 100644 --- a/flavors/template-Exasol-all-python-3.8-conda/flavor_base/testconfig +++ b/flavors/template-Exasol-all-python-3.8-conda/flavor_base/testconfig @@ -1,2 +1,2 @@ generic_language_tests=python3 -test_folders=python3/all +test_folders=python3/all pandas/all pandas/pandas2 diff --git a/flavors/template-Exasol-all-python-3.8-cuda-conda/flavor_base/language_deps/packages/conda_packages b/flavors/template-Exasol-all-python-3.8-cuda-conda/flavor_base/language_deps/packages/conda_packages index 4ba668b64..465ae1c0a 100644 --- a/flavors/template-Exasol-all-python-3.8-cuda-conda/flavor_base/language_deps/packages/conda_packages +++ b/flavors/template-Exasol-all-python-3.8-cuda-conda/flavor_base/language_deps/packages/conda_packages @@ -1,6 +1,6 @@ python|3.8.13 -numpy|1.22.3 -pandas|1.4.2 +numpy|1.24.3 +pandas|2.0.2 libblas|3.9.0=15_linux64_mkl mamba|1.3.1 ld_impl_linux-64|2.36.1 diff --git a/flavors/template-Exasol-all-python-3.8-cuda-conda/flavor_base/testconfig b/flavors/template-Exasol-all-python-3.8-cuda-conda/flavor_base/testconfig index 7c6eefa41..d1bdb8113 100644 --- a/flavors/template-Exasol-all-python-3.8-cuda-conda/flavor_base/testconfig +++ b/flavors/template-Exasol-all-python-3.8-cuda-conda/flavor_base/testconfig @@ -1,2 +1,2 @@ generic_language_tests=python3 -test_folders=python3/all +test_folders=python3/all pandas/all pandas/pandas2 diff --git a/test_container/tests/test/python3/all/dataframe.py b/test_container/tests/test/pandas/all/dataframe.py similarity index 96% rename from test_container/tests/test/python3/all/dataframe.py 
rename to test_container/tests/test/pandas/all/dataframe.py index bba8d2cf2..58c9fd738 100755 --- a/test_container/tests/test/python3/all/dataframe.py +++ b/test_container/tests/test/pandas/all/dataframe.py @@ -5,10 +5,14 @@ from datetime import datetime from exasol_python_test_framework import udf +from exasol_python_test_framework.exatest.testcase import useData +from exasol_python_test_framework.udf.udf_debug import UdfDebugger +from typing import List, Tuple, Union class PandasDataFrame(udf.TestCase): def setUp(self): + self.maxDiff=None self.query('CREATE SCHEMA FN2', ignore_errors=True) self.query('OPEN SCHEMA FN2', ignore_errors=True) @@ -159,7 +163,7 @@ def test_dataframe_scalar_returns(self): def run(ctx): df = ctx.get_dataframe() - return np.asscalar(df.iloc[0, 0] + df.iloc[0, 1]) + return (df.iloc[0, 0] + df.iloc[0, 1]).item() / ''' % (self.col_defs_str)) self.query(udf_sql) @@ -217,7 +221,7 @@ def test_dataframe_scalar_emits_unique(self): def run(ctx): df = ctx.get_dataframe() - ctx.emit(np.asscalar(df.C0)) + ctx.emit(df.C0.item()) / ''') print(udf_sql) @@ -236,7 +240,7 @@ def test_dataframe_scalar_emits_all_unique(self): def run(ctx): df = ctx.get_dataframe(num_rows="all") - ctx.emit(np.asscalar(df.C0)) + ctx.emit(df.C0.item()) / ''') print(udf_sql) @@ -331,7 +335,7 @@ def test_dataframe_set_returns(self): def run(ctx): df = ctx.get_dataframe(num_rows="all") - return np.asscalar(df.iloc[:, 0].sum()) + return df.iloc[:, 0].sum().item() / ''' % (self.col_defs_str)) print(udf_sql) @@ -477,7 +481,7 @@ def run(ctx): df = ctx.get_dataframe(num_rows=1) if df is None: break - ctx.emit(np.asscalar(df.C0)) + ctx.emit(df.C0.item()) / ''') print(udf_sql) @@ -500,7 +504,7 @@ def run(ctx): if df is None: break for i in range(df.shape[0]): - ctx.emit(np.asscalar(df.iloc[i, 0])) + ctx.emit(df.iloc[i, 0].item()) / ''') print(udf_sql) @@ -901,33 +905,6 @@ def run(ctx): (234,) ], rows) - def test_dataframe_set_emits_double_pyfloat_only_todo(self): - import datetime - 
udf_sql = udf.fixindent(''' - CREATE OR REPLACE PYTHON3 SET SCRIPT foo(sec int) EMITS (ts double) AS - - def run(ctx): - import pandas as pd - import numpy as np - import datetime - - c1=np.empty(shape=(2),dtype=np.object_) - - c1[:]=234.5 - - df=pd.DataFrame({0:c1}) - - ctx.emit(df) - / - ''') - print(udf_sql) - self.query(udf_sql) - select_sql = 'SELECT foo(1)' - print(select_sql) - #TODO implement support - with self.assertRaisesRegex(Exception, 'F-UDF-CL-SL-PYTHON-1056'): - rows = self.query(select_sql) - def test_dataframe_set_emits_double_npfloat32_only(self): import datetime udf_sql = udf.fixindent(''' @@ -1015,7 +992,6 @@ def run(ctx): print(select_sql) rows = self.query(select_sql) - if __name__ == '__main__': udf.main() diff --git a/test_container/tests/test/pandas/all/emit_dtypes.py b/test_container/tests/test/pandas/all/emit_dtypes.py new file mode 100755 index 000000000..774e0bc57 --- /dev/null +++ b/test_container/tests/test/pandas/all/emit_dtypes.py @@ -0,0 +1,369 @@ +#!/usr/bin/env python3 + +from decimal import Decimal +from datetime import date +from datetime import datetime + +from exasol_python_test_framework import udf +from exasol_python_test_framework.exatest.testcase import useData +from exasol_python_test_framework.udf.udf_debug import UdfDebugger +from typing import List, Tuple, Union + + +class PandasDataFrameEmitDTypes(udf.TestCase): + def setUp(self): + self.maxDiff=None + + self.query(f'CREATE SCHEMA {self.__class__.__name__}', ignore_errors=True) + self.query(f'OPEN SCHEMA {self.__class__.__name__}', ignore_errors=True) + + int_dataframe_value_str = "[[1, 2],[3, 4]]" + int_expected_rows = [(1, 2, None),(3, 4, None)] + int_to_float_expected_rows = [(1.0, 2.0, None),(3.0, 4.0, None)] + + float16_dataframe_value_str = 'np.array([[1.1, 2.1],[3.1, 4.1]], dtype="float16")' + float_dataframe_value_str = "[[1.1, 2.1],[3.1, 4.1]]" + float_expected_rows = [(1.1, 2.1, None),(3.1, 4.1, None)] + + str_dataframe_value_str = "[['a','b'],['c','d']]" 
+ str_expected_rows = [('a','b',None),('c','d',None)] + + bool_dataframe_value_str = "[[True,False],[True,False]]" + bool_expected_rows = [(True,False,None),(True,False,None)] + + decimal_dataframe_value_str = "[[Decimal('1.1'),Decimal('2.1')],[Decimal('3.1'),Decimal('4.1')]]" + decimal_expected_rows = [(Decimal('1.1'),Decimal('2.1'),None),(Decimal('3.1'),Decimal('4.1'),None)] + int_to_decimal_expected_rows = [(Decimal('1'),Decimal('2'),None),(Decimal('3'),Decimal('4'),None)] + + timestamp_dataframe_value_str = '[[pd.Timestamp(datetime(2020, 7, 27, 14, 22, 33, 673251)),' \ + +'pd.Timestamp(datetime(2020, 7, 27, 14, 22, 33, 673251))],' \ + +'[pd.Timestamp(datetime(2020, 7, 27, 14, 22, 33, 673251)),' \ + +'pd.Timestamp(datetime(2020, 7, 27, 14, 22, 33, 673251))]]' + datetime_dataframe_value_str = '[[datetime(2020, 7, 27, 14, 22, 33, 673251),' \ + +'datetime(2020, 7, 27, 14, 22, 33, 673251)],' \ + +'[datetime(2020, 7, 27, 14, 22, 33, 673251),' \ + +'datetime(2020, 7, 27, 14, 22, 33, 673251)]]' + datetime_expected_rows = [(datetime(2020, 7, 27, 14, 22, 33, 673000),datetime(2020, 7, 27, 14, 22, 33, 673000),None), + (datetime(2020, 7, 27, 14, 22, 33, 673000),datetime(2020, 7, 27, 14, 22, 33, 673000),None)] + date_dataframe_value_str = '[[date(2020, 7, 27),' \ + +'date(2020, 7, 27)],' \ + +'[date(2020, 7, 27),' \ + +'date(2020, 7, 27)]]' + date_expected_rows = [(date(2020, 7, 27),date(2020, 7, 27),None), + (date(2020, 7, 27),date(2020, 7, 27),None)] + + mixed_int_dataframe_value_str = "[[1, None],[None, 4]]" + mixed_int_expected_rows = [(1, None, None),(None, 4, None)] + mixed_int_to_float_expected_rows = [(1.0, None, None),(None, 4.0, None)] + + mixed_float16_dataframe_value_str = 'np.array([[1.1, None],[None, 4.1]], dtype="float16")' + mixed_float_dataframe_value_str = "[[1.1, None],[None, 4.1]]" + mixed_float_expected_rows = [(1.1, None, None),(None, 4.1, None)] + + mixed_str_dataframe_value_str = "[['a',None],[None,'d']]" + mixed_str_expected_rows = 
[('a',None,None),(None,'d',None)] + + mixed_bool_dataframe_value_str = "[[True,None],[None,False]]" + mixed_bool_expected_rows = [(True,None,None),(None,False,None)] + mixed_bool_expected_rows_bool_ = [(True, False, None),(False, False, None)] + + mixed_decimal_dataframe_value_str = "[[Decimal('1.1'),None],[None,Decimal('4.1')]]" + mixed_decimal_expected_rows = [(Decimal('1.1'),None,None),(None,Decimal('4.1'),None)] + mixed_int_to_decimal_expected_rows = [(Decimal('1'),None,None),(None,Decimal('4'),None)] + + mixed_timestamp_dataframe_value_str = '[[pd.Timestamp(datetime(2020, 7, 27, 14, 22, 33, 673251)),None],' \ + +'[None,pd.Timestamp(datetime(2020, 7, 27, 14, 22, 33, 673251))]]' + mixed_datetime_dataframe_value_str = '[[datetime(2020, 7, 27, 14, 22, 33, 673251),None],' \ + +'[None,datetime(2020, 7, 27, 14, 22, 33, 673251)]]' + mixed_datetime_expected_rows = [(datetime(2020, 7, 27, 14, 22, 33, 673000),None,None), + (None,datetime(2020, 7, 27, 14, 22, 33, 673000),None)] + mixed_date_dataframe_value_str = '[[date(2020, 7, 27),None],' \ + +'[None,date(2020, 7, 27)]]' + mixed_date_expected_rows = [(date(2020, 7, 27),None,None), + (None,date(2020, 7, 27),None)] + + none_dataframe_value_str = "[[None, None],[None, None]]" + none_expected_rows = [(None, None, None),(None, None, None)] + none_expected_rows_bool_ = [(False, False, None),(False, False, None)] + + nan_dataframe_value_str = "[[np.nan, np.nan],[np.nan, np.nan]]" + nan_expected_rows = [(None, None, None),(None, None, None)] + nan_expected_rows_bool_ = [(True, True, None),(True, True, None)] + + + + types = [ + # Full columns without None or NaN / Int + + ("uint8", "integer", int_dataframe_value_str, int_expected_rows, False), + ("uint16", "integer", int_dataframe_value_str, int_expected_rows, False), + ("uint32", "integer", int_dataframe_value_str, int_expected_rows, False), + ("uint64", "integer", int_dataframe_value_str, int_expected_rows, False), + ("int8", "integer", int_dataframe_value_str, 
int_expected_rows, False), + ("int16", "integer", int_dataframe_value_str, int_expected_rows, False), + ("int32", "integer", int_dataframe_value_str, int_expected_rows, False), + ("int64", "integer", int_dataframe_value_str, int_expected_rows, False), + ("object", "integer", int_dataframe_value_str, int_expected_rows, False), + + # Full columns without None or NaN / Float + + ("float16", "double", float16_dataframe_value_str, float_expected_rows, True), + ("float32", "double", float_dataframe_value_str, float_expected_rows, True), + ("float64", "double", float_dataframe_value_str, float_expected_rows, False), + ("float", "double", float_dataframe_value_str, float_expected_rows, False), + ("double", "double", float_dataframe_value_str, float_expected_rows, False), + ("object", "double", float_dataframe_value_str, float_expected_rows, False), + + # Full columns without None or NaN / Int to Float + + ("uint8", "double", int_dataframe_value_str, int_to_float_expected_rows, False), + ("uint16", "double", int_dataframe_value_str, int_to_float_expected_rows, False), + ("uint32", "double", int_dataframe_value_str, int_to_float_expected_rows, False), + ("uint64", "double", int_dataframe_value_str, int_to_float_expected_rows, False), + ("int8", "double", int_dataframe_value_str, int_to_float_expected_rows, False), + ("int16", "double", int_dataframe_value_str, int_to_float_expected_rows, False), + ("int32", "double", int_dataframe_value_str, int_to_float_expected_rows, False), + ("int64", "double", int_dataframe_value_str, int_to_float_expected_rows, False), + ("object", "double", int_dataframe_value_str, int_to_float_expected_rows, False), + + # Full columns without None or NaN / Float to Int + + ("float16", "integer", float16_dataframe_value_str, int_expected_rows, False), + ("float32", "integer", float_dataframe_value_str, int_expected_rows, False), + ("float64", "integer", float_dataframe_value_str, int_expected_rows, False), + ("float", "integer", 
float_dataframe_value_str, int_expected_rows, False), + ("double", "integer", float_dataframe_value_str, int_expected_rows, False), + ("object", "integer", float_dataframe_value_str, int_expected_rows, False), + + # Full columns without None or NaN / Int to Decimal + + ("uint8", "DECIMAL(10,5)", int_dataframe_value_str, int_to_decimal_expected_rows, False), + ("uint16", "DECIMAL(10,5)", int_dataframe_value_str, int_to_decimal_expected_rows, False), + ("uint32", "DECIMAL(10,5)", int_dataframe_value_str, int_to_decimal_expected_rows, False), + ("uint64", "DECIMAL(10,5)", int_dataframe_value_str, int_to_decimal_expected_rows, False), + ("int8", "DECIMAL(10,5)", int_dataframe_value_str, int_to_decimal_expected_rows, False), + ("int16", "DECIMAL(10,5)", int_dataframe_value_str, int_to_decimal_expected_rows, False), + ("int32", "DECIMAL(10,5)", int_dataframe_value_str, int_to_decimal_expected_rows, False), + ("int64", "DECIMAL(10,5)", int_dataframe_value_str, int_to_decimal_expected_rows, False), + ("object", "DECIMAL(10,5)", int_dataframe_value_str, int_to_decimal_expected_rows, False), + + # Full columns without None or NaN / Float to Decimal + + ("float16", "DECIMAL(10,5)", float16_dataframe_value_str, decimal_expected_rows, True), + ("float32", "DECIMAL(10,5)", float_dataframe_value_str, decimal_expected_rows, True), + ("float64", "DECIMAL(10,5)", float_dataframe_value_str, decimal_expected_rows, True), + ("float", "DECIMAL(10,5)", float_dataframe_value_str, decimal_expected_rows, True), + ("double", "DECIMAL(10,5)", float_dataframe_value_str, decimal_expected_rows, True), + ("object", "DECIMAL(10,5)", float_dataframe_value_str, decimal_expected_rows, True), + + # Full columns without None or NaN / Decimal + + ("object", "DECIMAL(10,5)", decimal_dataframe_value_str, decimal_expected_rows, False), + + # Full columns without None or NaN / String + + ("string", "VARCHAR(2000000)", str_dataframe_value_str, str_expected_rows, False), + ("object", "VARCHAR(2000000)", 
str_dataframe_value_str, str_expected_rows, False), + + # Full columns without None or NaN / Boolean + + ("bool_", "boolean", bool_dataframe_value_str, bool_expected_rows, False), + ("boolean", "boolean", bool_dataframe_value_str, bool_expected_rows, False), + ("object", "boolean", bool_dataframe_value_str, bool_expected_rows, False), + + # Full columns without None or NaN / Date and Time + + ("datetime64[ns]", "timestamp", timestamp_dataframe_value_str, datetime_expected_rows, False), + ("object", "timestamp", timestamp_dataframe_value_str, datetime_expected_rows, False), + ("object", "timestamp", datetime_dataframe_value_str, ".*F-UDF-CL-SL-PYTHON-1056.*unexpected python type: py_datetime.datetime.*", False), + ("object", "timestamp", date_dataframe_value_str, ".*F-UDF-CL-SL-PYTHON-1071: emit column 0 of type TIMESTAMP but data given have type py_datetime.date.*", False), + ("object", "DATE", date_dataframe_value_str, date_expected_rows, False), + + # Mixed columns with values and None / Int + + #(u)int-dtypes don't support None or np.nan + + ("object", "integer", mixed_int_dataframe_value_str, mixed_int_expected_rows, False), + + # Mixed columns with values and None / Float + + ("float16", "double", mixed_float16_dataframe_value_str, mixed_float_expected_rows, True), + ("float32", "double", mixed_float_dataframe_value_str, mixed_float_expected_rows, True), + ("float64", "double", mixed_float_dataframe_value_str, mixed_float_expected_rows, False), + ("float", "double", mixed_float_dataframe_value_str, mixed_float_expected_rows, False), + ("double", "double", mixed_float_dataframe_value_str, mixed_float_expected_rows, False), + ("object", "double", mixed_float_dataframe_value_str, mixed_float_expected_rows, False), + + # Mixed columns with values and None / Float to Int + ("float16", "integer", mixed_float16_dataframe_value_str, mixed_int_expected_rows, False), + ("float32", "integer", mixed_float_dataframe_value_str, mixed_int_expected_rows, False), + ("float64", 
"integer", mixed_float_dataframe_value_str, mixed_int_expected_rows, False), + ("float", "integer", mixed_float_dataframe_value_str, mixed_int_expected_rows, False), + ("double", "integer", mixed_float_dataframe_value_str, mixed_int_expected_rows, False), + ("object", "integer", mixed_float_dataframe_value_str, mixed_int_expected_rows, False), + + # Mixed columns with values and None / Int to Decimal + + ("object", "DECIMAL(10,5)", mixed_int_dataframe_value_str, mixed_int_to_decimal_expected_rows, False), + + # Mixed columns with values and None / Float to Decimal + + ("float16", "DECIMAL(10,5)", mixed_float16_dataframe_value_str, mixed_decimal_expected_rows, True), + ("float32", "DECIMAL(10,5)", mixed_float_dataframe_value_str, mixed_decimal_expected_rows, True), + ("float64", "DECIMAL(10,5)", mixed_float_dataframe_value_str, mixed_decimal_expected_rows, True), + ("float", "DECIMAL(10,5)", mixed_float_dataframe_value_str, mixed_decimal_expected_rows, True), + ("double", "DECIMAL(10,5)", mixed_float_dataframe_value_str, mixed_decimal_expected_rows, True), + ("object", "DECIMAL(10,5)", mixed_float_dataframe_value_str, mixed_decimal_expected_rows, True), + + # Mixed columns with values and None / Decimal + + ("object", "DECIMAL(10,5)", mixed_decimal_dataframe_value_str, mixed_decimal_expected_rows, False), + + # Mixed columns with values and None / String + + ("string", "VARCHAR(2000000)", mixed_str_dataframe_value_str, mixed_str_expected_rows, False), + ("object", "VARCHAR(2000000)", mixed_str_dataframe_value_str, mixed_str_expected_rows, False), + + # Mixed columns with values and None / Boolean + + ("bool_", "boolean", mixed_bool_dataframe_value_str, mixed_bool_expected_rows_bool_, False), + ("boolean", "boolean", mixed_bool_dataframe_value_str, mixed_bool_expected_rows, False), + ("object", "boolean", mixed_bool_dataframe_value_str, mixed_bool_expected_rows, False), + + # Mixed columns with values and None / Data and time + + ("datetime64[ns]", "timestamp", 
mixed_timestamp_dataframe_value_str, mixed_datetime_expected_rows, False), + ("object", "timestamp", mixed_timestamp_dataframe_value_str, mixed_datetime_expected_rows, False), + ("object", "DATE", mixed_date_dataframe_value_str, mixed_date_expected_rows, False), + + # None + + ("object", "integer", none_dataframe_value_str, none_expected_rows, False), + + ("float16", "double", none_dataframe_value_str, none_expected_rows, False), + ("float32", "double", none_dataframe_value_str, none_expected_rows, False), + ("float64", "double", none_dataframe_value_str, none_expected_rows, False), + ("float", "double", none_dataframe_value_str, none_expected_rows, False), + ("double", "double", none_dataframe_value_str, none_expected_rows, False), + ("object", "double", none_dataframe_value_str, none_expected_rows, False), + + ("float16", "integer", none_dataframe_value_str, none_expected_rows, False), + ("float32", "integer", none_dataframe_value_str, none_expected_rows, False), + ("float64", "integer", none_dataframe_value_str, none_expected_rows, False), + ("float", "integer", none_dataframe_value_str, none_expected_rows, False), + ("double", "integer", none_dataframe_value_str, none_expected_rows, False), + + ("float16", "DECIMAL(10,5)", none_dataframe_value_str, none_expected_rows, False), + ("float32", "DECIMAL(10,5)", none_dataframe_value_str, none_expected_rows, False), + ("float64", "DECIMAL(10,5)", none_dataframe_value_str, none_expected_rows, False), + ("float", "DECIMAL(10,5)", none_dataframe_value_str, none_expected_rows, False), + ("double", "DECIMAL(10,5)", none_dataframe_value_str, none_expected_rows, False), + + ("object", "DECIMAL(10,5)", none_dataframe_value_str, none_expected_rows, False), + + ("string", "VARCHAR(2000000)", none_dataframe_value_str, none_expected_rows, False), + ("object", "VARCHAR(2000000)", none_dataframe_value_str, none_expected_rows, False), + + ("bool_", "boolean", none_dataframe_value_str, none_expected_rows_bool_, False), + ("boolean", 
"boolean", none_dataframe_value_str, none_expected_rows, False), + ("object", "boolean", none_dataframe_value_str, none_expected_rows, False), + + ("datetime64[ns]", "timestamp", none_dataframe_value_str, none_expected_rows, False), + ("object", "timestamp", none_dataframe_value_str, none_expected_rows, False), + ("object", "DATE", none_dataframe_value_str, none_expected_rows, False), + + # NaN + + ("object", "integer", nan_dataframe_value_str, nan_expected_rows, False), + + ("float16", "double", nan_dataframe_value_str, nan_expected_rows, False), + ("float32", "double", nan_dataframe_value_str, nan_expected_rows, False), + ("float64", "double", nan_dataframe_value_str, nan_expected_rows, False), + ("float", "double", nan_dataframe_value_str, nan_expected_rows, False), + ("double", "double", nan_dataframe_value_str, nan_expected_rows, False), + ("object", "double", nan_dataframe_value_str, nan_expected_rows, False), + + ("float16", "integer", nan_dataframe_value_str, nan_expected_rows, False), + ("float32", "integer", nan_dataframe_value_str, nan_expected_rows, False), + ("float64", "integer", nan_dataframe_value_str, nan_expected_rows, False), + ("float", "integer", nan_dataframe_value_str, nan_expected_rows, False), + ("double", "integer", nan_dataframe_value_str, nan_expected_rows, False), + + ("float16", "DECIMAL(10,5)", nan_dataframe_value_str, nan_expected_rows, False), + ("float32", "DECIMAL(10,5)", nan_dataframe_value_str, nan_expected_rows, False), + ("float64", "DECIMAL(10,5)", nan_dataframe_value_str, nan_expected_rows, False), + ("float", "DECIMAL(10,5)", nan_dataframe_value_str, nan_expected_rows, False), + ("double", "DECIMAL(10,5)", nan_dataframe_value_str, nan_expected_rows, False), + + #("object", "DECIMAL(10,5)", nan_dataframe_value_str, None, False), # Fails with VM error: [22018] invalid character value for cast; Value: 'nan' + + ("string", "VARCHAR(2000000)", nan_dataframe_value_str, nan_expected_rows, False), + ("object", "VARCHAR(2000000)", 
nan_dataframe_value_str, ".*PYTHON-1068: emit column 0 of type STRING but data given have type py_float.*", False), + + ("bool_", "boolean", nan_dataframe_value_str, nan_expected_rows_bool_, False), + ("boolean", "boolean", nan_dataframe_value_str, nan_expected_rows, False), + ("object", "boolean", nan_dataframe_value_str, ".*F-UDF-CL-SL-PYTHON-1068: emit column 0 of type BOOLEAN but data given have type py_float.*", False), + + ("datetime64[ns]", "timestamp", nan_dataframe_value_str, nan_expected_rows, False), + ("object", "timestamp", nan_dataframe_value_str, ".*F-UDF-CL-SL-PYTHON-1068: emit column 0 of type TIMESTAMP but data given have type py_float.*", False), + ("object", "DATE", nan_dataframe_value_str, ".*F-UDF-CL-SL-PYTHON-1068: emit column 0 of type DATE but data given have type py_float.*", False), + + ] + + @useData(types) + def test_dtype_emit(self, dtype:str, sql_type:str, dataframe_value_str:str, expected_result:Union[str,List[Tuple]], use_almost_equal:bool): + sql=udf.fixindent(f''' + CREATE OR REPLACE PYTHON3 SET SCRIPT test_dtype_emit(i integer) + EMITS (o1 {sql_type}, o2 {sql_type}, traceback varchar(2000000)) AS + + def run(ctx): + try: + from decimal import Decimal + import pandas as pd + import numpy as np + from datetime import datetime, date + df = pd.DataFrame({dataframe_value_str}, dtype="{dtype}") + df["traceback"]=None + ctx.emit(df) + except: + import traceback + ctx.emit(None,None,traceback.format_exc()) + / + ''') + print(sql) + self.query(sql) + with UdfDebugger(test_case=self): + rows = self.query('''SELECT test_dtype_emit(0)''') + if isinstance(expected_result,str): + self.assertRegex(rows[0][2], expected_result) + else: + if use_almost_equal: + self.assertRowsAlmostEqual(expected_result, rows, places=1) + else: + self.assertRowsEqual(expected_result, rows) + + def isValueAlmostEqual(self, left, right, places): + if isinstance(left, (float, Decimal)) and isinstance(right, (float, Decimal)): + return round(left, places) == 
round(right, places) + else: + return left == right + + def isRowAlmostEqual(self, left, right, places): + if len(left) != len(right): + return False + all_values_almost_equal = all(self.isValueAlmostEqual(lvalue, rvalue, places) + for lvalue, rvalue in zip(left, right)) + return all_values_almost_equal + + def assertRowsAlmostEqual(self, left, right, places): + lrows = [tuple(x) for x in left] + rrows = [tuple(x) for x in right] + if len(lrows) != len(rrows): + raise AssertionError(f'{lrows} and {rrows} have different number of rows.') + all_rows_almost_equal = all(self.isRowAlmostEqual(lrow, rrow, places) for lrow, rrow in zip(lrows, rrows)) + if not all_rows_almost_equal: + raise AssertionError(f'{lrows} and {rrows} are not almost equal.') + +if __name__ == '__main__': + udf.main() + diff --git a/test_container/tests/test/pandas/pandas2/pandas.py b/test_container/tests/test/pandas/pandas2/pandas.py new file mode 100755 index 000000000..802edd422 --- /dev/null +++ b/test_container/tests/test/pandas/pandas2/pandas.py @@ -0,0 +1,361 @@ +#!/usr/bin/env python3 + +from decimal import Decimal +from datetime import date +from datetime import datetime + +from exasol_python_test_framework import udf +from exasol_python_test_framework.exatest.testcase import useData +from exasol_python_test_framework.udf.udf_debug import UdfDebugger +from typing import List, Tuple, Union + +class Pandas2Test(udf.TestCase): + def setUp(self): + self.query('create schema pandas2test', ignore_errors=True) + self.maxDiff=None + + def test_pandas2_version(self): + sql=udf.fixindent(''' + CREATE OR REPLACE PYTHON3 SET SCRIPT pandas2test.test_pandas2_version(i integer) EMITS (o VARCHAR(100)) AS + + def run(ctx): + import pandas as pd + ctx.emit(pd.__version__) + / + ''') + print(sql) + self.query(sql) + rows = self.query('''SELECT pandas2test.test_pandas2_version(0)''') + version_parts = rows[0][0].split(".") + self.assertEqual("2",version_parts[0]) + + + int_dataframe_value_str = "[[1, 2],[3, 
4]]" + int_expected_rows = [(1, 2, None),(3, 4, None)] + int_to_float_expected_rows = [(1.0, 2.0, None),(3.0, 4.0, None)] + + float16_dataframe_value_str = 'np.array([[1.1, 2.1],[3.1, 4.1]], dtype="float16")' + float_dataframe_value_str = "[[1.1, 2.1],[3.1, 4.1]]" + float_expected_rows = [(1.1, 2.1, None),(3.1, 4.1, None)] + + str_dataframe_value_str = "[['a','b'],['c','d']]" + str_expected_rows = [('a','b',None),('c','d',None)] + + bool_dataframe_value_str = "[[True,False],[True,False]]" + bool_expected_rows = [(True,False,None),(True,False,None)] + + decimal_dataframe_value_str = "[[Decimal('1.1'),Decimal('2.1')],[Decimal('3.1'),Decimal('4.1')]]" + decimal_expected_rows = [(Decimal('1.1'),Decimal('2.1'),None),(Decimal('3.1'),Decimal('4.1'),None)] + int_to_decimal_expected_rows = [(Decimal('1'),Decimal('2'),None),(Decimal('3'),Decimal('4'),None)] + + timestamp_dataframe_value_str = '[[pd.Timestamp(datetime(2020, 7, 27, 14, 22, 33, 673251)),' \ + +'pd.Timestamp(datetime(2020, 7, 27, 14, 22, 33, 673251))],' \ + +'[pd.Timestamp(datetime(2020, 7, 27, 14, 22, 33, 673251)),' \ + +'pd.Timestamp(datetime(2020, 7, 27, 14, 22, 33, 673251))]]' + datetime_dataframe_value_str = '[[datetime(2020, 7, 27, 14, 22, 33, 673251),' \ + +'datetime(2020, 7, 27, 14, 22, 33, 673251)],' \ + +'[datetime(2020, 7, 27, 14, 22, 33, 673251),' \ + +'datetime(2020, 7, 27, 14, 22, 33, 673251)]]' + datetime_expected_rows = [(datetime(2020, 7, 27, 14, 22, 33, 673000),datetime(2020, 7, 27, 14, 22, 33, 673000),None), + (datetime(2020, 7, 27, 14, 22, 33, 673000),datetime(2020, 7, 27, 14, 22, 33, 673000),None)] + date_dataframe_value_str = '[[date(2020, 7, 27),' \ + +'date(2020, 7, 27)],' \ + +'[date(2020, 7, 27),' \ + +'date(2020, 7, 27)]]' + date_expected_rows = [(date(2020, 7, 27),date(2020, 7, 27),None), + (date(2020, 7, 27),date(2020, 7, 27),None)] + + mixed_int_dataframe_value_str = "[[1, None],[None, 4]]" + mixed_int_expected_rows = [(1, None, None),(None, 4, None)] + 
mixed_int_to_float_expected_rows = [(1.0, None, None),(None, 4.0, None)] + + mixed_float16_dataframe_value_str = 'np.array([[1.1, None],[None, 4.1]], dtype="float16")' + mixed_float_dataframe_value_str = "[[1.1, None],[None, 4.1]]" + mixed_float_expected_rows = [(1.1, None, None),(None, 4.1, None)] + + mixed_str_dataframe_value_str = "[['a',None],[None,'d']]" + mixed_str_expected_rows = [('a',None,None),(None,'d',None)] + + mixed_bool_dataframe_value_str = "[[True,None],[None,False]]" + mixed_bool_expected_rows = [(True,None,None),(None,False,None)] + + mixed_decimal_dataframe_value_str = "[[Decimal('1.1'),None],[None,Decimal('4.1')]]" + mixed_decimal_expected_rows = [(Decimal('1.1'),None,None),(None,Decimal('4.1'),None)] + mixed_int_to_decimal_expected_rows = [(Decimal('1'),None,None),(None,Decimal('4'),None)] + + mixed_timestamp_dataframe_value_str = '[[pd.Timestamp(datetime(2020, 7, 27, 14, 22, 33, 673251)),None],' \ + +'[None,pd.Timestamp(datetime(2020, 7, 27, 14, 22, 33, 673251))]]' + mixed_datetime_dataframe_value_str = '[[datetime(2020, 7, 27, 14, 22, 33, 673251),None],' \ + +'[None,datetime(2020, 7, 27, 14, 22, 33, 673251)]]' + mixed_datetime_expected_rows = [(datetime(2020, 7, 27, 14, 22, 33, 673000),None,None), + (None,datetime(2020, 7, 27, 14, 22, 33, 673000),None)] + mixed_date_dataframe_value_str = '[[date(2020, 7, 27),None],' \ + +'[None,date(2020, 7, 27)]]' + mixed_date_expected_rows = [(date(2020, 7, 27),None,None), + (None,date(2020, 7, 27),None)] + + none_dataframe_value_str = "[[None, None],[None, None]]" + none_expected_rows = [(None, None, None),(None, None, None)] + + nan_dataframe_value_str = "[[np.nan, np.nan],[np.nan, np.nan]]" + nan_expected_rows = [(None, None, None),(None, None, None)] + + types = [ + # Full columns without None or NaN / Int + + ("dtype='uint8[pyarrow]'", "integer", int_dataframe_value_str, int_expected_rows, False), + ("dtype='uint16[pyarrow]'", "integer", int_dataframe_value_str, int_expected_rows, False), + 
("dtype='uint32[pyarrow]'", "integer", int_dataframe_value_str, int_expected_rows, False), + ("dtype='uint64[pyarrow]'", "integer", int_dataframe_value_str, int_expected_rows, False), + ("dtype='int8[pyarrow]'", "integer", int_dataframe_value_str, int_expected_rows, False), + ("dtype='int16[pyarrow]'", "integer", int_dataframe_value_str, int_expected_rows, False), + ("dtype='int32[pyarrow]'", "integer", int_dataframe_value_str, int_expected_rows, False), + ("dtype='int64[pyarrow]'", "integer", int_dataframe_value_str, int_expected_rows, False), + + # Full columns without None or NaN / Float + + ("dtype='float16[pyarrow]'", "double", float16_dataframe_value_str, float_expected_rows, True), + ("dtype='float32[pyarrow]'", "double", float_dataframe_value_str, float_expected_rows, True), + ("dtype='float64[pyarrow]'", "double", float_dataframe_value_str, float_expected_rows, False), + ("dtype='halffloat[pyarrow]'", "double", float16_dataframe_value_str, float_expected_rows, True), + ("dtype='float[pyarrow]'", "double", float_dataframe_value_str, float_expected_rows, True), + ("dtype='double[pyarrow]'", "double", float_dataframe_value_str, float_expected_rows, False), + + # Full columns without None or NaN / Decimal + + ("dtype=pd.ArrowDtype(pa.decimal128(3, scale=2))", "DECIMAL(10,5)", decimal_dataframe_value_str, decimal_expected_rows, False), + # Full columns without None or NaN / Int to Decimal + + ("dtype='uint8[pyarrow]'", "DECIMAL(10,5)", int_dataframe_value_str, int_to_decimal_expected_rows, False), + ("dtype='uint16[pyarrow]'", "DECIMAL(10,5)", int_dataframe_value_str, int_to_decimal_expected_rows, False), + ("dtype='uint32[pyarrow]'", "DECIMAL(10,5)", int_dataframe_value_str, int_to_decimal_expected_rows, False), + ("dtype='uint64[pyarrow]'", "DECIMAL(10,5)", int_dataframe_value_str, int_to_decimal_expected_rows, False), + ("dtype='int8[pyarrow]'", "DECIMAL(10,5)", int_dataframe_value_str, int_to_decimal_expected_rows, False), + ("dtype='int16[pyarrow]'", 
"DECIMAL(10,5)", int_dataframe_value_str, int_to_decimal_expected_rows, False), + ("dtype='int32[pyarrow]'", "DECIMAL(10,5)", int_dataframe_value_str, int_to_decimal_expected_rows, False), + ("dtype='int64[pyarrow]'", "DECIMAL(10,5)", int_dataframe_value_str, int_to_decimal_expected_rows, False), + + # Full columns without None or NaN / Float to Decimal + + ("dtype='float16[pyarrow]'", "DECIMAL(10,5)", float16_dataframe_value_str, decimal_expected_rows, True), + ("dtype='float32[pyarrow]'", "DECIMAL(10,5)", float_dataframe_value_str, decimal_expected_rows, False), + ("dtype='float64[pyarrow]'", "DECIMAL(10,5)", float_dataframe_value_str, decimal_expected_rows, False), + ("dtype='halffloat[pyarrow]'", "DECIMAL(10,5)", float16_dataframe_value_str, decimal_expected_rows, True), + ("dtype='float[pyarrow]'", "DECIMAL(10,5)", float_dataframe_value_str, decimal_expected_rows, False), + ("dtype='double[pyarrow]'", "DECIMAL(10,5)", float_dataframe_value_str, decimal_expected_rows, False), + + # Full columns without None or NaN / Int To Double + + ("dtype='uint8[pyarrow]'", "double", int_dataframe_value_str, int_to_float_expected_rows, False), + ("dtype='uint16[pyarrow]'", "double", int_dataframe_value_str, int_to_float_expected_rows, False), + ("dtype='uint32[pyarrow]'", "double", int_dataframe_value_str, int_to_float_expected_rows, False), + ("dtype='uint64[pyarrow]'", "double", int_dataframe_value_str, int_to_float_expected_rows, False), + ("dtype='int8[pyarrow]'", "double", int_dataframe_value_str, int_to_float_expected_rows, False), + ("dtype='int16[pyarrow]'", "double", int_dataframe_value_str, int_to_float_expected_rows, False), + ("dtype='int32[pyarrow]'", "double", int_dataframe_value_str, int_to_float_expected_rows, False), + ("dtype='int64[pyarrow]'", "double", int_dataframe_value_str, int_to_float_expected_rows, False), + + # Full columns without None or NaN / Float to Int + + ("dtype='float16[pyarrow]'", "integer", float16_dataframe_value_str, int_expected_rows, 
False), + ("dtype='float32[pyarrow]'", "integer", float_dataframe_value_str, int_expected_rows, False), + ("dtype='float64[pyarrow]'", "integer", float_dataframe_value_str, int_expected_rows, False), + ("dtype='halffloat[pyarrow]'", "integer", float16_dataframe_value_str, int_expected_rows, False), + ("dtype='float[pyarrow]'", "integer", float_dataframe_value_str, int_expected_rows, False), + ("dtype='double[pyarrow]'", "integer", float_dataframe_value_str, int_expected_rows, False), + + # Full columns without None or NaN / String + + ("dtype='string[pyarrow]'", "VARCHAR(2000000)", str_dataframe_value_str, str_expected_rows, False), + + # Full columns without None or NaN / Boolean + + ("dtype='bool[pyarrow]'", "boolean", bool_dataframe_value_str, bool_expected_rows, False), + + # Full columns without None or NaN / Date and time + + ("dtype=pd.ArrowDtype(pa.timestamp('ns','UTC'))", "timestamp", datetime_dataframe_value_str, datetime_expected_rows, False), + #df = pd.DataFrame([[datetime.date(2012,1,1),None],[None,None]], dtype=pd.ArrowDtype(pa.date32())) can't be created at the moment, because it fails with "AttributeError: 'ArrowDtype' object has no attribute 'tz'" and pa.date32() doesn't accept a timezone + #df = pd.DataFrame([[datetime.date(2012,1,1),None],[None,None]], dtype=pd.ArrowDtype(pa.date64())) can't be created at the moment, because it fails with "AttributeError: 'ArrowDtype' object has no attribute 'tz'" and pa.date64() doesn't accept a timezone + + # Mixed columns with values and None / Int + + ("dtype='uint8[pyarrow]'", "integer", mixed_int_dataframe_value_str, mixed_int_expected_rows, False), + ("dtype='uint16[pyarrow]'", "integer", mixed_int_dataframe_value_str, mixed_int_expected_rows, False), + ("dtype='uint32[pyarrow]'", "integer", mixed_int_dataframe_value_str, mixed_int_expected_rows, False), + ("dtype='uint64[pyarrow]'", "integer", mixed_int_dataframe_value_str, mixed_int_expected_rows, False), + ("dtype='int8[pyarrow]'", "integer",
mixed_int_dataframe_value_str, mixed_int_expected_rows, False), + ("dtype='int16[pyarrow]'", "integer", mixed_int_dataframe_value_str, mixed_int_expected_rows, False), + ("dtype='int32[pyarrow]'", "integer", mixed_int_dataframe_value_str, mixed_int_expected_rows, False), + ("dtype='int64[pyarrow]'", "integer", mixed_int_dataframe_value_str, mixed_int_expected_rows, False), + + # Mixed columns with values and None / Float + + ("dtype='float16[pyarrow]'", "double", mixed_float16_dataframe_value_str, mixed_float_expected_rows, True), + ("dtype='float32[pyarrow]'", "double", mixed_float_dataframe_value_str, mixed_float_expected_rows, True), + ("dtype='float64[pyarrow]'", "double", mixed_float_dataframe_value_str, mixed_float_expected_rows, False), + ("dtype='halffloat[pyarrow]'", "double", mixed_float16_dataframe_value_str, mixed_float_expected_rows, True), + ("dtype='float[pyarrow]'", "double", mixed_float_dataframe_value_str, mixed_float_expected_rows, True), + ("dtype='double[pyarrow]'", "double", mixed_float_dataframe_value_str, mixed_float_expected_rows, False), + + # Mixed columns with values and None / Decimal + + ("dtype=pd.ArrowDtype(pa.decimal128(3, scale=2))", "DECIMAL(10,5)", mixed_decimal_dataframe_value_str, mixed_decimal_expected_rows, False), + # Mixed columns with values and None / Int to Decimal + + ("dtype='uint8[pyarrow]'", "DECIMAL(10,5)", mixed_int_dataframe_value_str, mixed_int_to_decimal_expected_rows, False), + ("dtype='uint16[pyarrow]'", "DECIMAL(10,5)", mixed_int_dataframe_value_str, mixed_int_to_decimal_expected_rows, False), + ("dtype='uint32[pyarrow]'", "DECIMAL(10,5)", mixed_int_dataframe_value_str, mixed_int_to_decimal_expected_rows, False), + ("dtype='uint64[pyarrow]'", "DECIMAL(10,5)", mixed_int_dataframe_value_str, mixed_int_to_decimal_expected_rows, False), + ("dtype='int8[pyarrow]'", "DECIMAL(10,5)", mixed_int_dataframe_value_str, mixed_int_to_decimal_expected_rows, False), + ("dtype='int16[pyarrow]'", "DECIMAL(10,5)", 
mixed_int_dataframe_value_str, mixed_int_to_decimal_expected_rows, False), + ("dtype='int32[pyarrow]'", "DECIMAL(10,5)", mixed_int_dataframe_value_str, mixed_int_to_decimal_expected_rows, False), + ("dtype='int64[pyarrow]'", "DECIMAL(10,5)", mixed_int_dataframe_value_str, mixed_int_to_decimal_expected_rows, False), + + # Mixed columns with values and None / Float to Decimal + + ("dtype='float16[pyarrow]'", "DECIMAL(10,5)", mixed_float16_dataframe_value_str, mixed_decimal_expected_rows, True), + ("dtype='float32[pyarrow]'", "DECIMAL(10,5)", mixed_float_dataframe_value_str, mixed_decimal_expected_rows, False), + ("dtype='float64[pyarrow]'", "DECIMAL(10,5)", mixed_float_dataframe_value_str, mixed_decimal_expected_rows, False), + ("dtype='halffloat[pyarrow]'", "DECIMAL(10,5)", mixed_float16_dataframe_value_str, mixed_decimal_expected_rows, True), + ("dtype='float[pyarrow]'", "DECIMAL(10,5)", mixed_float_dataframe_value_str, mixed_decimal_expected_rows, False), + ("dtype='double[pyarrow]'", "DECIMAL(10,5)", mixed_float_dataframe_value_str, mixed_decimal_expected_rows, False), + + # Mixed columns with values and None / Int To Double + + ("dtype='uint8[pyarrow]'", "double", mixed_int_dataframe_value_str, mixed_int_to_float_expected_rows, False), + ("dtype='uint16[pyarrow]'", "double", mixed_int_dataframe_value_str, mixed_int_to_float_expected_rows, False), + ("dtype='uint32[pyarrow]'", "double", mixed_int_dataframe_value_str, mixed_int_to_float_expected_rows, False), + ("dtype='uint64[pyarrow]'", "double", mixed_int_dataframe_value_str, mixed_int_to_float_expected_rows, False), + ("dtype='int8[pyarrow]'", "double", mixed_int_dataframe_value_str, mixed_int_to_float_expected_rows, False), + ("dtype='int16[pyarrow]'", "double", mixed_int_dataframe_value_str, mixed_int_to_float_expected_rows, False), + ("dtype='int32[pyarrow]'", "double", mixed_int_dataframe_value_str, mixed_int_to_float_expected_rows, False), + ("dtype='int64[pyarrow]'", "double", 
mixed_int_dataframe_value_str, mixed_int_to_float_expected_rows, False), + + # Mixed columns with values and None / Float to Int + + ("dtype='float16[pyarrow]'", "integer", mixed_float16_dataframe_value_str, mixed_int_expected_rows, False), + ("dtype='float32[pyarrow]'", "integer", mixed_float_dataframe_value_str, mixed_int_expected_rows, False), + ("dtype='float64[pyarrow]'", "integer", mixed_float_dataframe_value_str, mixed_int_expected_rows, False), + ("dtype='halffloat[pyarrow]'", "integer", mixed_float16_dataframe_value_str, mixed_int_expected_rows, False), + ("dtype='float[pyarrow]'", "integer", mixed_float_dataframe_value_str, mixed_int_expected_rows, False), + ("dtype='double[pyarrow]'", "integer", mixed_float_dataframe_value_str, mixed_int_expected_rows, False), + + # Mixed columns with values and None / String + + ("dtype='string[pyarrow]'", "VARCHAR(2000000)", mixed_str_dataframe_value_str, mixed_str_expected_rows, False), + + # Mixed columns with values and None / Boolean + + ("dtype='bool[pyarrow]'", "boolean", mixed_bool_dataframe_value_str, mixed_bool_expected_rows, False), + + # Mixed columns with values and None / Date and time + + ("dtype=pd.ArrowDtype(pa.timestamp('ns','UTC'))", "timestamp", mixed_datetime_dataframe_value_str, mixed_datetime_expected_rows, False), + + # None + + ("dtype='uint8[pyarrow]'", "integer", none_dataframe_value_str, none_expected_rows, False), + ("dtype='uint16[pyarrow]'", "integer", none_dataframe_value_str, none_expected_rows, False), + ("dtype='uint32[pyarrow]'", "integer", none_dataframe_value_str, none_expected_rows, False), + ("dtype='uint64[pyarrow]'", "integer", none_dataframe_value_str, none_expected_rows, False), + ("dtype='int8[pyarrow]'", "integer", none_dataframe_value_str, none_expected_rows, False), + ("dtype='int16[pyarrow]'", "integer", none_dataframe_value_str, none_expected_rows, False), + ("dtype='int32[pyarrow]'", "integer", none_dataframe_value_str, none_expected_rows, False), + 
("dtype='int64[pyarrow]'", "integer", none_dataframe_value_str, none_expected_rows, False), + + ("dtype='float16[pyarrow]'", "float", none_dataframe_value_str, none_expected_rows, False), + ("dtype='float32[pyarrow]'", "float", none_dataframe_value_str, none_expected_rows, False), + ("dtype='float64[pyarrow]'", "float", none_dataframe_value_str, none_expected_rows, False), + ("dtype='halffloat[pyarrow]'", "float", none_dataframe_value_str, none_expected_rows, False), + ("dtype='float[pyarrow]'", "float", none_dataframe_value_str, none_expected_rows, False), + ("dtype='double[pyarrow]'", "float", none_dataframe_value_str, none_expected_rows, False), + + ("dtype='string[pyarrow]'", "VARCHAR(2000000)", none_dataframe_value_str, none_expected_rows, False), + + ("dtype='bool[pyarrow]'", "boolean", none_dataframe_value_str, none_expected_rows, False), + + ("dtype=pd.ArrowDtype(pa.timestamp('ns','UTC'))", "timestamp", none_dataframe_value_str, none_expected_rows, False), + ("dtype=pd.ArrowDtype(pa.decimal128(3, scale=2))", "DECIMAL(10,5)", none_dataframe_value_str, none_expected_rows, False), + + # NaN + + ("dtype='uint8[pyarrow]'", "integer", nan_dataframe_value_str, nan_expected_rows, False), + ("dtype='uint16[pyarrow]'", "integer", nan_dataframe_value_str, nan_expected_rows, False), + ("dtype='uint32[pyarrow]'", "integer", nan_dataframe_value_str, nan_expected_rows, False), + ("dtype='uint64[pyarrow]'", "integer", nan_dataframe_value_str, nan_expected_rows, False), + ("dtype='int8[pyarrow]'", "integer", nan_dataframe_value_str, nan_expected_rows, False), + ("dtype='int16[pyarrow]'", "integer", nan_dataframe_value_str, nan_expected_rows, False), + ("dtype='int32[pyarrow]'", "integer", nan_dataframe_value_str, nan_expected_rows, False), + ("dtype='int64[pyarrow]'", "integer", nan_dataframe_value_str, nan_expected_rows, False), + + ("dtype='float16[pyarrow]'", "float", nan_dataframe_value_str, ".*pyarrow.lib.ArrowNotImplementedError: Unsupported cast from double to 
halffloat using function cast_half_float.*", False), + ("dtype='float32[pyarrow]'", "float", nan_dataframe_value_str, nan_expected_rows, False), + ("dtype='float64[pyarrow]'", "float", nan_dataframe_value_str, nan_expected_rows, False), + ("dtype='halffloat[pyarrow]'", "float", nan_dataframe_value_str, ".*pyarrow.lib.ArrowNotImplementedError: Unsupported cast from double to halffloat using function cast_half_float.*", False), + ("dtype='float[pyarrow]'", "float", nan_dataframe_value_str, nan_expected_rows, False), + ("dtype='double[pyarrow]'", "float", nan_dataframe_value_str, nan_expected_rows, False), + + ("dtype='string[pyarrow]'", "VARCHAR(2000000)", nan_dataframe_value_str, nan_expected_rows, False), + + ("dtype='bool[pyarrow]'", "boolean", nan_dataframe_value_str, nan_expected_rows, False), + + #("dtype=pd.ArrowDtype(pa.timestamp('ns','UTC'))", "timestamp", nan_dataframe_value_str, nan_expected_rows, False), # DataFrame creation fails with: pyarrow.lib.ArrowNotImplementedError: Unsupported cast from double to timestamp using function cast_timestamp + ("dtype=pd.ArrowDtype(pa.decimal128(3, scale=2))", "DECIMAL(10,5)", nan_dataframe_value_str, nan_expected_rows, False), + ] + + @useData(types) + def test_dtype_emit(self, dtype_definition:str, sql_type:str, dataframe_value_str:str, expected_result:Union[str,List[Tuple]], use_almost_equal:bool): + sql=udf.fixindent(f''' + CREATE OR REPLACE PYTHON3 SET SCRIPT test_dtype_emit(i integer) + EMITS (o1 {sql_type}, o2 {sql_type}, traceback varchar(2000000)) AS + + def run(ctx): + try: + from decimal import Decimal + import pandas as pd + import numpy as np + import pyarrow as pa + from datetime import datetime, date + {dtype_definition} + df = pd.DataFrame({dataframe_value_str}, dtype=dtype) + df["traceback"]=None + ctx.emit(df) + except: + import traceback + ctx.emit(None,None,traceback.format_exc()) + / + ''') + print(sql) + self.query(sql) + with UdfDebugger(test_case=self): + rows = self.query('''SELECT
test_dtype_emit(0)''') + if isinstance(expected_result,str): + self.assertRegex(rows[0][2], expected_result) + else: + if use_almost_equal: + self.assertRowsAlmostEqual(expected_result, rows, places=1) + else: + self.assertRowsEqual(expected_result, rows) + + def isValueAlmostEqual(self, left, right, places): + if isinstance(left, (float, Decimal)) and isinstance(right, (float, Decimal)): + return round(left, places) == round(right, places) + else: + return left == right + + def isRowAlmostEqual(self, left, right, places): + if len(left) != len(right): + return False + all_values_almost_equal = all(self.isValueAlmostEqual(lvalue, rvalue, places) + for lvalue, rvalue in zip(left, right)) + return all_values_almost_equal + + def assertRowsAlmostEqual(self, left, right, places): + lrows = [tuple(x) for x in left] + rrows = [tuple(x) for x in right] + if len(lrows) != len(rrows): + raise AssertionError(f'{lrows} and {rrows} have different number of rows.') + all_rows_almost_equal = all(self.isRowAlmostEqual(lrow, rrow, places) for lrow, rrow in zip(lrows, rrows)) + if not all_rows_almost_equal: + raise AssertionError(f'{lrows} and {rrows} are not almost equal.') + +if __name__ == '__main__': + udf.main()