From e1cec26703b906fdabfdb5638f0a647307440a49 Mon Sep 17 00:00:00 2001 From: Torsten Kilias Date: Thu, 4 May 2023 09:14:55 +0200 Subject: [PATCH 01/11] Update pandas to 2.0.1 in compatible template flavors and replace asscalar with item in test dataframe.py, because asscalar was removed --- .../language_deps/packages/python3_pip_packages | 4 ++-- .../language_deps/packages/conda_packages | 4 ++-- .../language_deps/packages/conda_packages | 4 ++-- test_container/tests/test/python3/all/dataframe.py | 12 ++++++------ 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/flavors/python-3.8-minimal-EXASOL-6.2.0/flavor_base/language_deps/packages/python3_pip_packages b/flavors/python-3.8-minimal-EXASOL-6.2.0/flavor_base/language_deps/packages/python3_pip_packages index f0269d53c..460b92d24 100644 --- a/flavors/python-3.8-minimal-EXASOL-6.2.0/flavor_base/language_deps/packages/python3_pip_packages +++ b/flavors/python-3.8-minimal-EXASOL-6.2.0/flavor_base/language_deps/packages/python3_pip_packages @@ -1,2 +1,2 @@ -pandas|1.3.4 -numpy|1.21.3 +pandas|2.0.1 +numpy|1.24.3 diff --git a/flavors/template-Exasol-all-python-3.8-conda/flavor_base/language_deps/packages/conda_packages b/flavors/template-Exasol-all-python-3.8-conda/flavor_base/language_deps/packages/conda_packages index 4ba668b64..0a755b9f8 100644 --- a/flavors/template-Exasol-all-python-3.8-conda/flavor_base/language_deps/packages/conda_packages +++ b/flavors/template-Exasol-all-python-3.8-conda/flavor_base/language_deps/packages/conda_packages @@ -1,6 +1,6 @@ python|3.8.13 -numpy|1.22.3 -pandas|1.4.2 +numpy|1.24.3 +pandas|2.0.1 libblas|3.9.0=15_linux64_mkl mamba|1.3.1 ld_impl_linux-64|2.36.1 diff --git a/flavors/template-Exasol-all-python-3.8-cuda-conda/flavor_base/language_deps/packages/conda_packages b/flavors/template-Exasol-all-python-3.8-cuda-conda/flavor_base/language_deps/packages/conda_packages index 4ba668b64..0a755b9f8 100644 --- 
a/flavors/template-Exasol-all-python-3.8-cuda-conda/flavor_base/language_deps/packages/conda_packages +++ b/flavors/template-Exasol-all-python-3.8-cuda-conda/flavor_base/language_deps/packages/conda_packages @@ -1,6 +1,6 @@ python|3.8.13 -numpy|1.22.3 -pandas|1.4.2 +numpy|1.24.3 +pandas|2.0.1 libblas|3.9.0=15_linux64_mkl mamba|1.3.1 ld_impl_linux-64|2.36.1 diff --git a/test_container/tests/test/python3/all/dataframe.py b/test_container/tests/test/python3/all/dataframe.py index bba8d2cf2..6dd42d2bf 100755 --- a/test_container/tests/test/python3/all/dataframe.py +++ b/test_container/tests/test/python3/all/dataframe.py @@ -159,7 +159,7 @@ def test_dataframe_scalar_returns(self): def run(ctx): df = ctx.get_dataframe() - return np.asscalar(df.iloc[0, 0] + df.iloc[0, 1]) + return (df.iloc[0, 0] + df.iloc[0, 1]).item() / ''' % (self.col_defs_str)) self.query(udf_sql) @@ -217,7 +217,7 @@ def test_dataframe_scalar_emits_unique(self): def run(ctx): df = ctx.get_dataframe() - ctx.emit(np.asscalar(df.C0)) + ctx.emit(df.C0.item()) / ''') print(udf_sql) @@ -236,7 +236,7 @@ def test_dataframe_scalar_emits_all_unique(self): def run(ctx): df = ctx.get_dataframe(num_rows="all") - ctx.emit(np.asscalar(df.C0)) + ctx.emit(df.C0.item()) / ''') print(udf_sql) @@ -331,7 +331,7 @@ def test_dataframe_set_returns(self): def run(ctx): df = ctx.get_dataframe(num_rows="all") - return np.asscalar(df.iloc[:, 0].sum()) + return df.iloc[:, 0].sum().item() / ''' % (self.col_defs_str)) print(udf_sql) @@ -477,7 +477,7 @@ def run(ctx): df = ctx.get_dataframe(num_rows=1) if df is None: break - ctx.emit(np.asscalar(df.C0)) + ctx.emit(df.C0.item()) / ''') print(udf_sql) @@ -500,7 +500,7 @@ def run(ctx): if df is None: break for i in range(df.shape[0]): - ctx.emit(np.asscalar(df.iloc[i, 0])) + ctx.emit(df.iloc[i, 0].item()) / ''') print(udf_sql) From b44c2747a872064cb11634558a10e40e4502779e Mon Sep 17 00:00:00 2001 From: Torsten Kilias Date: Tue, 16 May 2023 17:56:42 +0200 Subject: [PATCH 02/11] Add 
pandas2 with pyarrow support and fix float16 emit bug --- .../python/python3/python_ext_dataframe.cc | 85 +++++++++++++++---- .../packages/python3_pip_packages | 1 + test_container/tests/test/pandas2/pandas.py | 79 +++++++++++++++++ .../tests/test/python3/all/dataframe.py | 50 +++++++++++ 4 files changed, 200 insertions(+), 15 deletions(-) create mode 100755 test_container/tests/test/pandas2/pandas.py diff --git a/exaudfclient/base/python/python3/python_ext_dataframe.cc b/exaudfclient/base/python/python3/python_ext_dataframe.cc index 4cd94e378..7564b505a 100644 --- a/exaudfclient/base/python/python3/python_ext_dataframe.cc +++ b/exaudfclient/base/python/python3/python_ext_dataframe.cc @@ -1,4 +1,5 @@ #include "exaudflib/swig/swig_common.h" +#include "debug_message.h" #include @@ -16,6 +17,7 @@ #include #include #include +#include extern "C" { @@ -26,8 +28,9 @@ extern "C" { #define PY_NONETYPE (NPY_USERDEF+5) #define PY_BOOL (NPY_USERDEF+6) -std::map typeMap { +std::map pandasDTypeStrToNumpyCTypeMap { {"bool", NPY_BOOL}, + {"int", NPY_INT32}, {"intc", NPY_INT32}, {"intp", NPY_INT64}, @@ -35,14 +38,34 @@ std::map typeMap { {"int16", NPY_INT16}, {"int32", NPY_INT32}, {"int64", NPY_INT64}, + {"int8[pyarrow]", NPY_INT8}, + {"int16[pyarrow]", NPY_INT16}, + {"int32[pyarrow]", NPY_INT32}, + {"int64[pyarrow]", NPY_INT64}, + {"uint8", NPY_UINT8}, {"uint16", NPY_UINT16}, {"uint32", NPY_UINT32}, {"uint64", NPY_UINT64}, - {"float", NPY_FLOAT64}, - {"float16", NPY_FLOAT16}, + {"uint8[pyarrow]", NPY_UINT8}, + {"uint16[pyarrow]", NPY_UINT16}, + {"uint32[pyarrow]", NPY_UINT32}, + {"uint64[pyarrow]", NPY_UINT64}, + {"float32", NPY_FLOAT32}, {"float64", NPY_FLOAT64}, + {"float", NPY_FLOAT32}, + {"double", NPY_FLOAT64}, + {"float32[pyarrow]", NPY_FLOAT32}, + {"float64[pyarrow]", NPY_FLOAT64}, + {"float[pyarrow]", NPY_FLOAT32}, + {"double[pyarrow]", NPY_FLOAT64}, + // We let numpy convert float16 to float (32 bit) and then use the C conversion from float to double, because a proper 
conversion from float16 to double in C is very complicated. + {"float16", NPY_FLOAT32}, + {"halffloat", NPY_FLOAT32}, + {"float16[pyarrow]", NPY_FLOAT32}, + {"halffloat[pyarrow]", NPY_FLOAT32}, + {"py_int", PY_INT}, {"py_decimal.Decimal", PY_DECIMAL}, {"py_str", PY_STR}, @@ -53,6 +76,26 @@ std::map typeMap { {"py_bool", PY_BOOL} }; +std::map numpyCTypeToNumpyDTypeStrMap { + {NPY_BOOL, "bool"}, + {NPY_INT8, "int8"}, + {NPY_INT16, "int16"}, + {NPY_INT32, "int32"}, + {NPY_INT64, "int64"}, + {NPY_UINT8, "uint8"}, + {NPY_UINT16, "uint16"}, + {NPY_UINT32, "uint32"}, + {NPY_UINT64, "uint64"}, + {NPY_FLOAT32, "float32"}, + {NPY_FLOAT64, "float64"}, + {PY_INT, "py_int"}, + {PY_DECIMAL, "py_decimal.Decimal"}, + {PY_STR, "py_str"}, + {PY_DATE, "py_datetime.date"}, + {PY_NONETYPE, "py_NoneType"}, + {PY_BOOL, "py_bool"} +}; + std::map emitTypeMap { {SWIGVMContainers::UNSUPPORTED, "UNSUPPORTED"}, {SWIGVMContainers::DOUBLE, "DOUBLE"}, @@ -398,8 +441,8 @@ inline void getColumnTypeInfo(PyObject *numpyTypes, std::vector::iterator it = typeMap.find(typeName); - if (it != typeMap.end()) { + std::map::iterator it = pandasDTypeStrToNumpyCTypeMap.find(typeName); + if (it != pandasDTypeStrToNumpyCTypeMap.end()) { colTypes.push_back(*it); } else if(isNumpyDatetime64(typeName)){ std::stringstream ss; @@ -430,6 +473,7 @@ inline void printPyObject(PyObject* obj, const std::string& error_code){ inline void getColumnArrays(PyObject *colArray, int numCols, int numRows, std::vector>& colTypes, std::vector& columnArrays){ for (int c = 0; c < numCols; c++) { + DBG_STREAM_MSG(std::cerr, "Start Column " << c); PyPtr pyStart(PyLong_FromLong(c)); PyPtr pyStop(PyLong_FromLong(c + 1)); PyPtr slice(PySlice_New(pyStart.get(), pyStop.get(), Py_None)); @@ -437,9 +481,11 @@ inline void getColumnArrays(PyObject *colArray, int numCols, int numRows, PyPtr pyZero(PyLong_FromLong(0L)); PyPtr array(PyObject_GetItem(arraySlice.get(), pyZero.get())); + DBG_STREAM_MSG(std::cerr, "Got array for column " << c); if 
(colTypes[c].second == NPY_OBJECT) { + DBG_STREAM_MSG(std::cerr, "Column is NPY_OBJECT " << c); // Convert numpy array to python list PyPtr pyList(PyObject_CallMethod(array.get(), "tolist", NULL)); if (!PyList_Check(pyList.get())) { @@ -463,8 +509,8 @@ inline void getColumnArrays(PyObject *colArray, int numCols, int numRows, // Update type in column type info std::map::iterator userDefIt; - userDefIt = typeMap.find(pyTypeName); - if (userDefIt != typeMap.end()) { + userDefIt = pandasDTypeStrToNumpyCTypeMap.find(pyTypeName); + if (userDefIt != pandasDTypeStrToNumpyCTypeMap.end()) { colTypes[c] = *userDefIt; } else { // TODO accept pandas.Timestamp values @@ -477,6 +523,7 @@ inline void getColumnArrays(PyObject *colArray, int numCols, int numRows, } else if (colTypes[c].second == NPY_DATETIME) { + DBG_STREAM_MSG(std::cerr, "Column is NPY_DATETIME " << c); // Convert numpy array to python list PyPtr pyList(PyObject_CallMethod(array.get(), "tolist", NULL)); @@ -490,13 +537,24 @@ inline void getColumnArrays(PyObject *colArray, int numCols, int numRows, columnArrays.push_back(std::move(pyList)); } else { + DBG_STREAM_MSG(std::cerr, "Column is something else " << c); PyPtr asType (PyObject_GetAttrString(array.get(), "astype")); + DBG_STREAM_MSG(std::cerr, "Step 1 successfull" << c); PyPtr keywordArgs(PyDict_New()); + DBG_STREAM_MSG(std::cerr, "Step 2 successfull" << c); PyDict_SetItemString(keywordArgs.get(), "copy", Py_False); - PyPtr funcArgs(Py_BuildValue("(s)", colTypes[c].first.c_str())); + DBG_STREAM_MSG(std::cerr, "Step 3 successfull" << c); + DBG_STREAM_MSG(std::cerr, "colTypes[c].first" << colTypes[c].first); + std::string numpyDTypeStr = numpyCTypeToNumpyDTypeStrMap.at(colTypes[c].second); + DBG_STREAM_MSG(std::cerr, "numpyDTypeStr" << numpyDTypeStr); + PyPtr funcArgs(Py_BuildValue("(s)", numpyDTypeStr.c_str())); + DBG_STREAM_MSG(std::cerr, "Step 4 successfull" << c); PyPtr scalarArr(PyObject_Call(asType.get(), funcArgs.get(), keywordArgs.get())); + 
DBG_STREAM_MSG(std::cerr, "Step 5 successfull" << c); columnArrays.push_back(std::move(scalarArr)); + DBG_STREAM_MSG(std::cerr, "Step 6 successfull" << c); + } } @@ -734,7 +792,10 @@ inline void handleEmitNpyFloat16( PyPtr& pyValue, PyPtr& pyResult, PyPtr& pySetNullMethodName){ - double value = static_cast(*((uint16_t*)(PyArray_GETPTR1((PyArrayObject*)(columnArrays[c].get()), r)))); + uint16_t float16_value = *((uint16_t*)(PyArray_GETPTR1((PyArrayObject*)(columnArrays[c].get()), r))); + DBG_STREAM_MSG(std::cerr, "float16_value " << float16_value); + double value = static_cast(float16_value); + DBG_STREAM_MSG(std::cerr, "value " << value); if (npy_isnan(value)) { pyResult.reset(PyObject_CallMethodObjArgs(resultHandler, pySetNullMethodName.get(), pyColSetMethods[c].first.get(), NULL)); return; @@ -1146,12 +1207,6 @@ void emit(PyObject *resultHandler, std::vector& colInfo, PyObject *d handleEmitNpyFloat32(c, r, columnArrays, pyColSetMethods, colInfo, colTypes, resultHandler, pyValue, pyResult, pySetNullMethodName); break; } - case NPY_FLOAT16: - { - handleEmitNpyFloat16(c, r, columnArrays, pyColSetMethods, colInfo, colTypes, resultHandler, pyValue, pyResult, pySetNullMethodName); - break; - } - case NPY_BOOL: { handleEmitNpyBool(c, r, columnArrays, pyColSetMethods, colInfo, colTypes, resultHandler, pyValue, pyResult, pySetNullMethodName); diff --git a/flavors/python-3.8-minimal-EXASOL-6.2.0/flavor_base/language_deps/packages/python3_pip_packages b/flavors/python-3.8-minimal-EXASOL-6.2.0/flavor_base/language_deps/packages/python3_pip_packages index 460b92d24..3d6464a6e 100644 --- a/flavors/python-3.8-minimal-EXASOL-6.2.0/flavor_base/language_deps/packages/python3_pip_packages +++ b/flavors/python-3.8-minimal-EXASOL-6.2.0/flavor_base/language_deps/packages/python3_pip_packages @@ -1,2 +1,3 @@ pandas|2.0.1 numpy|1.24.3 +pyarrow|12.0.0 diff --git a/test_container/tests/test/pandas2/pandas.py b/test_container/tests/test/pandas2/pandas.py new file mode 100755 index 
000000000..fa1fc1c9a --- /dev/null +++ b/test_container/tests/test/pandas2/pandas.py @@ -0,0 +1,79 @@ +#!/usr/bin/env python3 + + +from exasol_python_test_framework import udf +from exasol_python_test_framework.exatest.testcase import useData +from exasol_python_test_framework.udf.udf_debug import UdfDebugger +from typing import List, Tuple + +class Pandas2Test(udf.TestCase): + def setUp(self): + self.query('create schema pandas2test', ignore_errors=True) + self.maxDiff=None + + def test_pandas2_version(self): + sql=udf.fixindent(''' + CREATE OR REPLACE PYTHON3 SET SCRIPT pandas2test.test_pandas2_version(i integer) EMITS (o VARCHAR(100)) AS + + def run(ctx): + import pandas as pd + ctx.emit(pd.__version__) + / + ''') + print(sql) + self.query(sql) + rows = self.query('''SELECT pandas2test.test_pandas2_version(0)''') + version_parts = rows[0][0].split(".") + self.assertEqual("2",version_parts[0]) + + int_dataframe_value_str = "[[1,2],[3,4]]" + int_expected_rows = [(1,2, None),(3,4, None)] + float16_dataframe_value_str = 'np.array([[1.0,1.0],[1.0,1.0]], dtype="float16")' + float_dataframe_value_str = "[[1.0,1.0],[1.0,1.0]]" + float_expected_rows = [(1.0,1.0, None),(1.0,1.0, None)] + + + types = [ + ("uint8[pyarrow]", "integer", int_dataframe_value_str, int_expected_rows), + ("uint16[pyarrow]", "integer", int_dataframe_value_str, int_expected_rows), + ("uint32[pyarrow]", "integer", int_dataframe_value_str, int_expected_rows), + ("uint64[pyarrow]", "integer", int_dataframe_value_str, int_expected_rows), + ("int8[pyarrow]", "integer", int_dataframe_value_str, int_expected_rows), + ("int16[pyarrow]", "integer", int_dataframe_value_str, int_expected_rows), + ("int32[pyarrow]", "integer", int_dataframe_value_str, int_expected_rows), + ("int64[pyarrow]", "integer", int_dataframe_value_str, int_expected_rows), + ("float16[pyarrow]", "float", float16_dataframe_value_str, float_expected_rows), + ("float32[pyarrow]", "float", float_dataframe_value_str, float_expected_rows), + 
("float64[pyarrow]", "float", float_dataframe_value_str, float_expected_rows), + ("halffloat[pyarrow]", "float", float16_dataframe_value_str, float_expected_rows), + ("float[pyarrow]", "float", float_dataframe_value_str, float_expected_rows), + ("double[pyarrow]", "float", float_dataframe_value_str, float_expected_rows), + ] + + @useData(types) + def test_pandas2_int_pyarrow_dtype_emit(self, dtype:str, sql_type:str, dataframe_value_str:str, expected_rows:List[Tuple]): + sql=udf.fixindent(f''' + CREATE OR REPLACE PYTHON3 SET SCRIPT pandas2test.test_pandas2_pyarrow_dtype_emit(i integer) + EMITS (o1 {sql_type}, o2 {sql_type}, traceback varchar(2000000)) AS + + def run(ctx): + try: + import pandas as pd + import numpy as np + df = pd.DataFrame({dataframe_value_str}, dtype="{dtype}") + df["traceback"]=None + ctx.emit(df) + except: + import traceback + ctx.emit(None,None,traceback.format_exc()) + / + ''') + print(sql) + self.query(sql) + with UdfDebugger(test_case=self): + rows = self.query('''SELECT pandas2test.test_pandas2_pyarrow_dtype_emit(0)''') + self.assertRowsEqual(expected_rows, rows) + + +if __name__ == '__main__': + udf.main() diff --git a/test_container/tests/test/python3/all/dataframe.py b/test_container/tests/test/python3/all/dataframe.py index 6dd42d2bf..d46cf8581 100755 --- a/test_container/tests/test/python3/all/dataframe.py +++ b/test_container/tests/test/python3/all/dataframe.py @@ -5,10 +5,14 @@ from datetime import datetime from exasol_python_test_framework import udf +from exasol_python_test_framework.exatest.testcase import useData +from exasol_python_test_framework.udf.udf_debug import UdfDebugger +from typing import List, Tuple class PandasDataFrame(udf.TestCase): def setUp(self): + self.maxDiff=None self.query('CREATE SCHEMA FN2', ignore_errors=True) self.query('OPEN SCHEMA FN2', ignore_errors=True) @@ -1015,6 +1019,52 @@ def run(ctx): print(select_sql) rows = self.query(select_sql) + int_dataframe_value_str = "[[1,2],[3,4]]" + int_expected_rows 
= [(1,2, None),(3,4, None)] + float16_dataframe_value_str = 'np.array([[1.0,1.0],[1.0,np.nan]], dtype="float16")' + float_dataframe_value_str = "[[1.0,1.0],[1.0,np.nan]]" + float_expected_rows = [(1.0,1.0, None),(1.0, None, None)] + + + types = [ + ("uint8", "integer", int_dataframe_value_str, int_expected_rows), + ("uint16", "integer", int_dataframe_value_str, int_expected_rows), + ("uint32", "integer", int_dataframe_value_str, int_expected_rows), + ("uint64", "integer", int_dataframe_value_str, int_expected_rows), + ("int8", "integer", int_dataframe_value_str, int_expected_rows), + ("int16", "integer", int_dataframe_value_str, int_expected_rows), + ("int32", "integer", int_dataframe_value_str, int_expected_rows), + ("int64", "integer", int_dataframe_value_str, int_expected_rows), + ("float16", "float", float16_dataframe_value_str, float_expected_rows), + ("float32", "float", float_dataframe_value_str, float_expected_rows), + ("float64", "float", float_dataframe_value_str, float_expected_rows), + ("float", "float", float_dataframe_value_str, float_expected_rows), + ("double", "float", float_dataframe_value_str, float_expected_rows), + ] + + @useData(types) + def test_dtype_emit(self, dtype:str, sql_type:str, dataframe_value_str:str, expected_rows:List[Tuple]): + sql=udf.fixindent(f''' + CREATE OR REPLACE PYTHON3 SET SCRIPT test_dtype_emit(i integer) + EMITS (o1 {sql_type}, o2 {sql_type}, traceback varchar(2000000)) AS + + def run(ctx): + try: + import pandas as pd + import numpy as np + df = pd.DataFrame({dataframe_value_str}, dtype="{dtype}") + df["traceback"]=None + ctx.emit(df) + except: + import traceback + ctx.emit(None,None,traceback.format_exc()) + / + ''') + print(sql) + self.query(sql) + with UdfDebugger(test_case=self): + rows = self.query('''SELECT test_dtype_emit(0)''') + self.assertRowsEqual(expected_rows, rows) if __name__ == '__main__': udf.main() From 0ad7c51758bf3d0ff982e31777c528bec2d88c03 Mon Sep 17 00:00:00 2001 From: Torsten Kilias Date: Wed, 
17 May 2023 08:52:55 +0200 Subject: [PATCH 03/11] Removed handleEmitNpyFloat16 and debug messages --- .../python/python3/python_ext_dataframe.cc | 55 ------------------- 1 file changed, 55 deletions(-) diff --git a/exaudfclient/base/python/python3/python_ext_dataframe.cc b/exaudfclient/base/python/python3/python_ext_dataframe.cc index 7564b505a..252a9d2da 100644 --- a/exaudfclient/base/python/python3/python_ext_dataframe.cc +++ b/exaudfclient/base/python/python3/python_ext_dataframe.cc @@ -473,7 +473,6 @@ inline void printPyObject(PyObject* obj, const std::string& error_code){ inline void getColumnArrays(PyObject *colArray, int numCols, int numRows, std::vector>& colTypes, std::vector& columnArrays){ for (int c = 0; c < numCols; c++) { - DBG_STREAM_MSG(std::cerr, "Start Column " << c); PyPtr pyStart(PyLong_FromLong(c)); PyPtr pyStop(PyLong_FromLong(c + 1)); PyPtr slice(PySlice_New(pyStart.get(), pyStop.get(), Py_None)); @@ -481,11 +480,9 @@ inline void getColumnArrays(PyObject *colArray, int numCols, int numRows, PyPtr pyZero(PyLong_FromLong(0L)); PyPtr array(PyObject_GetItem(arraySlice.get(), pyZero.get())); - DBG_STREAM_MSG(std::cerr, "Got array for column " << c); if (colTypes[c].second == NPY_OBJECT) { - DBG_STREAM_MSG(std::cerr, "Column is NPY_OBJECT " << c); // Convert numpy array to python list PyPtr pyList(PyObject_CallMethod(array.get(), "tolist", NULL)); if (!PyList_Check(pyList.get())) { @@ -523,7 +520,6 @@ inline void getColumnArrays(PyObject *colArray, int numCols, int numRows, } else if (colTypes[c].second == NPY_DATETIME) { - DBG_STREAM_MSG(std::cerr, "Column is NPY_DATETIME " << c); // Convert numpy array to python list PyPtr pyList(PyObject_CallMethod(array.get(), "tolist", NULL)); @@ -537,24 +533,13 @@ inline void getColumnArrays(PyObject *colArray, int numCols, int numRows, columnArrays.push_back(std::move(pyList)); } else { - DBG_STREAM_MSG(std::cerr, "Column is something else " << c); PyPtr asType (PyObject_GetAttrString(array.get(), 
"astype")); - DBG_STREAM_MSG(std::cerr, "Step 1 successfull" << c); PyPtr keywordArgs(PyDict_New()); - DBG_STREAM_MSG(std::cerr, "Step 2 successfull" << c); PyDict_SetItemString(keywordArgs.get(), "copy", Py_False); - DBG_STREAM_MSG(std::cerr, "Step 3 successfull" << c); - DBG_STREAM_MSG(std::cerr, "colTypes[c].first" << colTypes[c].first); std::string numpyDTypeStr = numpyCTypeToNumpyDTypeStrMap.at(colTypes[c].second); - DBG_STREAM_MSG(std::cerr, "numpyDTypeStr" << numpyDTypeStr); PyPtr funcArgs(Py_BuildValue("(s)", numpyDTypeStr.c_str())); - DBG_STREAM_MSG(std::cerr, "Step 4 successfull" << c); PyPtr scalarArr(PyObject_Call(asType.get(), funcArgs.get(), keywordArgs.get())); - DBG_STREAM_MSG(std::cerr, "Step 5 successfull" << c); - columnArrays.push_back(std::move(scalarArr)); - DBG_STREAM_MSG(std::cerr, "Step 6 successfull" << c); - } } @@ -782,46 +767,6 @@ inline void handleEmitNpyFloat32( pyResult.reset(PyObject_CallMethodObjArgs(resultHandler, pyColSetMethods[c].second.get(), pyColSetMethods[c].first.get(), pyValue.get(), NULL)); } -inline void handleEmitNpyFloat16( - int c, int r, - std::vector& columnArrays, - std::vector>& pyColSetMethods, - std::vector& colInfo, - std::vector>& colTypes, - PyObject *resultHandler, - PyPtr& pyValue, - PyPtr& pyResult, - PyPtr& pySetNullMethodName){ - uint16_t float16_value = *((uint16_t*)(PyArray_GETPTR1((PyArrayObject*)(columnArrays[c].get()), r))); - DBG_STREAM_MSG(std::cerr, "float16_value " << float16_value); - double value = static_cast(float16_value); - DBG_STREAM_MSG(std::cerr, "value " << value); - if (npy_isnan(value)) { - pyResult.reset(PyObject_CallMethodObjArgs(resultHandler, pySetNullMethodName.get(), pyColSetMethods[c].first.get(), NULL)); - return; - } - switch (colInfo[c].type) { - case SWIGVMContainers::INT64: - case SWIGVMContainers::INT32: - pyValue.reset(PyLong_FromLong(static_cast(value))); - break; - case SWIGVMContainers::NUMERIC: - pyValue.reset(PyUnicode_FromString(std::to_string(value).c_str())); - 
break; - case SWIGVMContainers::DOUBLE: - pyValue.reset(PyFloat_FromDouble(value)); - break; - default: - { - std::stringstream ss; - ss << "F-UDF-CL-SL-PYTHON-1064: emit column " << c << " of type " << emitTypeMap.at(colInfo[c].type) << " but data given have type " << colTypes[c].first; - throw std::runtime_error(ss.str().c_str()); - } - } - checkPyPtrIsNull(pyValue); - pyResult.reset(PyObject_CallMethodObjArgs(resultHandler, pyColSetMethods[c].second.get(), pyColSetMethods[c].first.get(), pyValue.get(), NULL)); -} - inline void handleEmitNpyBool( int c, int r, std::vector& columnArrays, From b7815a52b2ee400249402752a5de5c24dd12eca5 Mon Sep 17 00:00:00 2001 From: Torsten Kilias Date: Tue, 23 May 2023 08:52:02 +0200 Subject: [PATCH 04/11] Add handleEmitPyFloat to also support float pyarrow dtype columns with NAN. - Refactor and split Pandas Tests - Add tests for more dtypes to Pandas Tests --- .../python/python3/python_ext_dataframe.cc | 133 ++++++--- .../flavor_base/testconfig | 2 +- .../flavor_base/testconfig | 2 +- .../flavor_base/testconfig | 2 +- .../flavor_base/testconfig | 2 +- .../test/{python3 => pandas}/all/dataframe.py | 76 +---- .../tests/test/pandas/all/emit_dtypes.py | 263 ++++++++++++++++++ .../tests/test/pandas/pandas2/pandas.py | 151 ++++++++++ test_container/tests/test/pandas2/pandas.py | 79 ------ 9 files changed, 520 insertions(+), 190 deletions(-) rename test_container/tests/test/{python3 => pandas}/all/dataframe.py (91%) create mode 100755 test_container/tests/test/pandas/all/emit_dtypes.py create mode 100755 test_container/tests/test/pandas/pandas2/pandas.py delete mode 100755 test_container/tests/test/pandas2/pandas.py diff --git a/exaudfclient/base/python/python3/python_ext_dataframe.cc b/exaudfclient/base/python/python3/python_ext_dataframe.cc index 252a9d2da..228a9f54c 100644 --- a/exaudfclient/base/python/python3/python_ext_dataframe.cc +++ b/exaudfclient/base/python/python3/python_ext_dataframe.cc @@ -27,9 +27,9 @@ extern "C" { #define 
PY_DATE (NPY_USERDEF+4) #define PY_NONETYPE (NPY_USERDEF+5) #define PY_BOOL (NPY_USERDEF+6) +#define PY_FLOAT (NPY_USERDEF+7) std::map pandasDTypeStrToNumpyCTypeMap { - {"bool", NPY_BOOL}, {"int", NPY_INT32}, {"intc", NPY_INT32}, @@ -38,42 +38,53 @@ std::map pandasDTypeStrToNumpyCTypeMap { {"int16", NPY_INT16}, {"int32", NPY_INT32}, {"int64", NPY_INT64}, - {"int8[pyarrow]", NPY_INT8}, - {"int16[pyarrow]", NPY_INT16}, - {"int32[pyarrow]", NPY_INT32}, - {"int64[pyarrow]", NPY_INT64}, + {"int8[pyarrow]", NPY_OBJECT}, + {"int16[pyarrow]", NPY_OBJECT}, + {"int32[pyarrow]", NPY_OBJECT}, + {"int64[pyarrow]", NPY_OBJECT}, {"uint8", NPY_UINT8}, {"uint16", NPY_UINT16}, {"uint32", NPY_UINT32}, {"uint64", NPY_UINT64}, - {"uint8[pyarrow]", NPY_UINT8}, - {"uint16[pyarrow]", NPY_UINT16}, - {"uint32[pyarrow]", NPY_UINT32}, - {"uint64[pyarrow]", NPY_UINT64}, + {"uint8[pyarrow]", NPY_OBJECT}, + {"uint16[pyarrow]", NPY_OBJECT}, + {"uint32[pyarrow]", NPY_OBJECT}, + {"uint64[pyarrow]", NPY_OBJECT}, {"float32", NPY_FLOAT32}, {"float64", NPY_FLOAT64}, {"float", NPY_FLOAT32}, {"double", NPY_FLOAT64}, - {"float32[pyarrow]", NPY_FLOAT32}, - {"float64[pyarrow]", NPY_FLOAT64}, - {"float[pyarrow]", NPY_FLOAT32}, - {"double[pyarrow]", NPY_FLOAT64}, + {"float32[pyarrow]", NPY_OBJECT}, + {"float64[pyarrow]", NPY_OBJECT}, + {"float[pyarrow]", NPY_OBJECT}, + {"double[pyarrow]", NPY_OBJECT}, // We let numpy convert float16 to float (32 bit) and then use the C conversion from float to double, because a proper conversion from float16 to double in C is very complicated. 
{"float16", NPY_FLOAT32}, {"halffloat", NPY_FLOAT32}, - {"float16[pyarrow]", NPY_FLOAT32}, - {"halffloat[pyarrow]", NPY_FLOAT32}, + {"float16[pyarrow]", NPY_OBJECT}, + {"halffloat[pyarrow]", NPY_OBJECT}, + + {"string[pyarrow]", NPY_OBJECT}, + {"string[python]", NPY_OBJECT}, + {"string", NPY_OBJECT}, + + {"bool[pyarrow]", NPY_OBJECT}, + {"boolean", NPY_OBJECT}, + {"bool", NPY_BOOL}, - {"py_int", PY_INT}, - {"py_decimal.Decimal", PY_DECIMAL}, - {"py_str", PY_STR}, - {"py_datetime.date", PY_DATE}, {"datetime64[ns]", NPY_DATETIME}, {"object", NPY_OBJECT}, + + {"py_NAType", PY_NONETYPE}, {"py_NoneType", PY_NONETYPE}, - {"py_bool", PY_BOOL} + {"py_bool", PY_BOOL}, + {"py_int", PY_INT}, + {"py_float", PY_FLOAT}, + {"py_decimal.Decimal", PY_DECIMAL}, + {"py_str", PY_STR}, + {"py_datetime.date", PY_DATE} }; std::map numpyCTypeToNumpyDTypeStrMap { @@ -89,6 +100,7 @@ std::map numpyCTypeToNumpyDTypeStrMap { {NPY_FLOAT32, "float32"}, {NPY_FLOAT64, "float64"}, {PY_INT, "py_int"}, + {PY_FLOAT, "py_float"}, {PY_DECIMAL, "py_decimal.Decimal"}, {PY_STR, "py_str"}, {PY_DATE, "py_datetime.date"}, @@ -460,15 +472,17 @@ inline void getColumnTypeInfo(PyObject *numpyTypes, std::vectorob_type; const char* p = type->tp_name; PyObject* objectsRepresentation = PyObject_Repr(obj); const char* s = PyUnicode_AsUTF8(objectsRepresentation); - throw std::runtime_error(error_code+": "+std::string(s)+" "+std::string(p)); + DBG_STREAM_MSG(std::cerr, error_code << ": " << std::string(s) << " " << std::string(p)); +} + +inline bool isNoneOrNA(PyObject* pyVal){ + return pyVal == Py_None || std::string(Py_TYPE(pyVal)->tp_name) == "NAType"; } -#endif inline void getColumnArrays(PyObject *colArray, int numCols, int numRows, std::vector>& colTypes, std::vector& columnArrays){ @@ -491,14 +505,16 @@ inline void getColumnArrays(PyObject *colArray, int numCols, int numRows, throw std::runtime_error(ss.str().c_str()); } - // Get type of first non-None item in list + // Get type of first non-None, non-NA item in 
list PyObject *pyVal = PyList_GetItem(pyList.get(), 0); checkPyObjectIsNull(pyVal,"F-UDF-CL-SL-PYTHON-1126"); std::string pyTypeName(std::string("py_") + Py_TYPE(pyVal)->tp_name); - for (int r = 1; r < numRows && pyVal == Py_None; r++) { + bool pyValIsNoneOrNA = isNoneOrNA(pyVal); + for (int r = 1; r < numRows && pyValIsNoneOrNA; r++) { pyVal = PyList_GetItem(pyList.get(), r); + pyValIsNoneOrNA = isNoneOrNA(pyVal); checkPyObjectIsNull(pyVal,"F-UDF-CL-SL-PYTHON-1127"); - if (pyVal != Py_None) { + if (!pyValIsNoneOrNA) { pyTypeName = std::string("py_") + Py_TYPE(pyVal)->tp_name; break; } @@ -536,7 +552,7 @@ inline void getColumnArrays(PyObject *colArray, int numCols, int numRows, PyPtr asType (PyObject_GetAttrString(array.get(), "astype")); PyPtr keywordArgs(PyDict_New()); PyDict_SetItemString(keywordArgs.get(), "copy", Py_False); - std::string numpyDTypeStr = numpyCTypeToNumpyDTypeStrMap.at(colTypes[c].second); + std::string numpyDTypeStr = numpyCTypeToNumpyDTypeStrMap.at(colTypes[c].second); PyPtr funcArgs(Py_BuildValue("(s)", numpyDTypeStr.c_str())); PyPtr scalarArr(PyObject_Call(asType.get(), funcArgs.get(), keywordArgs.get())); columnArrays.push_back(std::move(scalarArr)); @@ -816,7 +832,7 @@ inline void handleEmitPyBool( PyPtr& pySetNullMethodName){ PyPtr pyBool(PyList_GetItem(columnArrays[c].get(), r)); checkPyPtrIsNull(pyBool); - if (pyBool.get() == Py_None) { + if (isNoneOrNA(pyBool.get())) { pyResult.reset(PyObject_CallMethodObjArgs(resultHandler, pySetNullMethodName.get(), pyColSetMethods[c].first.get(), NULL)); return; } @@ -857,7 +873,7 @@ inline void handleEmitPyInt( PyPtr& pySetNullMethodName){ PyPtr pyInt(PyList_GetItem(columnArrays[c].get(), r)); checkPyPtrIsNull(pyInt); - if (pyInt.get() == Py_None) { + if (isNoneOrNA(pyInt.get())) { pyResult.reset(PyObject_CallMethodObjArgs(resultHandler, pySetNullMethodName.get(), pyColSetMethods[c].first.get(), NULL)); return; } @@ -888,6 +904,54 @@ inline void handleEmitPyInt( 
pyResult.reset(PyObject_CallMethodObjArgs(resultHandler, pyColSetMethods[c].second.get(), pyColSetMethods[c].first.get(), pyValue.get(), NULL)); } +inline void handleEmitPyFloat( + int c, int r, + std::vector& columnArrays, + std::vector>& pyColSetMethods, + std::vector& colInfo, + std::vector>& colTypes, + PyObject *resultHandler, + PyPtr& pyValue, + PyPtr& pyResult, + PyPtr& pySetNullMethodName){ + PyPtr pyFloat(PyList_GetItem(columnArrays[c].get(), r)); + checkPyPtrIsNull(pyFloat); + if (isNoneOrNA(pyFloat.get())) { + pyResult.reset(PyObject_CallMethodObjArgs(resultHandler, pySetNullMethodName.get(), pyColSetMethods[c].first.get(), NULL)); + return; + } + + switch (colInfo[c].type) { + case SWIGVMContainers::INT64: + case SWIGVMContainers::INT32: + { + double value = PyFloat_AsDouble(pyFloat.get()); + if (value < 0 && PyErr_Occurred()) + throw std::runtime_error("F-UDF-CL-SL-PYTHON-1067: emit() PY_FLOAT: PyFloat_AsDouble error"); + if (npy_isnan(value)) { + pyResult.reset( + PyObject_CallMethodObjArgs(resultHandler, pySetNullMethodName.get(), pyColSetMethods[c].first.get(), NULL)); + return; + } + pyValue.reset(PyLong_FromLong(static_cast(value))); + break; + } + case SWIGVMContainers::NUMERIC: + pyValue.reset(PyObject_Str(pyFloat.get())); + break; + case SWIGVMContainers::DOUBLE: + pyValue.reset(pyFloat.release()); + break; + default: + { + std::stringstream ss; + ss << "F-UDF-CL-SL-PYTHON-1068: emit column " << c << " of type " << emitTypeMap.at(colInfo[c].type) << " but data given have type " << colTypes[c].first; + throw std::runtime_error(ss.str().c_str()); + } + } + pyResult.reset(PyObject_CallMethodObjArgs(resultHandler, pyColSetMethods[c].second.get(), pyColSetMethods[c].first.get(), pyValue.get(), NULL)); +} + inline void handleEmitPyDecimal( int c, int r, std::vector& columnArrays, @@ -902,7 +966,7 @@ inline void handleEmitPyDecimal( PyPtr& pyFloatMethodName ){ PyPtr pyDecimal(PyList_GetItem(columnArrays[c].get(), r)); - if (pyDecimal.get() == Py_None) 
{ + if (isNoneOrNA(pyDecimal.get())) { pyResult.reset(PyObject_CallMethodObjArgs(resultHandler, pySetNullMethodName.get(), pyColSetMethods[c].first.get(), NULL)); return; } @@ -946,7 +1010,7 @@ inline void handleEmitPyStr( PyPtr& pySetNullMethodName){ PyPtr pyString(PyList_GetItem(columnArrays[c].get(), r)); - if (pyString.get() == Py_None) { + if (isNoneOrNA(pyString.get())) { pyResult.reset(PyObject_CallMethodObjArgs(resultHandler, pySetNullMethodName.get(), pyColSetMethods[c].first.get(), NULL)); return; } @@ -986,7 +1050,7 @@ inline void handleEmitPyDate( PyPtr& pySetNullMethodName, PyPtr& pyIsoformatMethodName){ PyPtr pyDate(PyList_GetItem(columnArrays[c].get(), r)); - if (pyDate.get() == Py_None) { + if (isNoneOrNA(pyDate.get())) { pyResult.reset(PyObject_CallMethodObjArgs(resultHandler, pySetNullMethodName.get(), pyColSetMethods[c].first.get(), NULL)); return; } @@ -1167,6 +1231,11 @@ void emit(PyObject *resultHandler, std::vector& colInfo, PyObject *d handleEmitPyInt(c, r, columnArrays, pyColSetMethods, colInfo, colTypes, resultHandler, pyValue, pyResult, pySetNullMethodName); break; } + case PY_FLOAT: + { + handleEmitPyFloat(c, r, columnArrays, pyColSetMethods, colInfo, colTypes, resultHandler, pyValue, pyResult, pySetNullMethodName); + break; + } case PY_DECIMAL: { handleEmitPyDecimal(c, r, columnArrays, pyColSetMethods, colInfo, colTypes, resultHandler, pyValue, pyResult, diff --git a/flavors/python-3.7-minimal-EXASOL-6.2.0/flavor_base/testconfig b/flavors/python-3.7-minimal-EXASOL-6.2.0/flavor_base/testconfig index 7c6eefa41..8688dd718 100644 --- a/flavors/python-3.7-minimal-EXASOL-6.2.0/flavor_base/testconfig +++ b/flavors/python-3.7-minimal-EXASOL-6.2.0/flavor_base/testconfig @@ -1,2 +1,2 @@ generic_language_tests=python3 -test_folders=python3/all +test_folders=python3/all pandas/all diff --git a/flavors/python-3.8-minimal-EXASOL-6.2.0/flavor_base/testconfig b/flavors/python-3.8-minimal-EXASOL-6.2.0/flavor_base/testconfig index 7c6eefa41..8688dd718 
100644 --- a/flavors/python-3.8-minimal-EXASOL-6.2.0/flavor_base/testconfig +++ b/flavors/python-3.8-minimal-EXASOL-6.2.0/flavor_base/testconfig @@ -1,2 +1,2 @@ generic_language_tests=python3 -test_folders=python3/all +test_folders=python3/all pandas/all diff --git a/flavors/template-Exasol-all-python-3.8-conda/flavor_base/testconfig b/flavors/template-Exasol-all-python-3.8-conda/flavor_base/testconfig index 7c6eefa41..8688dd718 100644 --- a/flavors/template-Exasol-all-python-3.8-conda/flavor_base/testconfig +++ b/flavors/template-Exasol-all-python-3.8-conda/flavor_base/testconfig @@ -1,2 +1,2 @@ generic_language_tests=python3 -test_folders=python3/all +test_folders=python3/all pandas/all diff --git a/flavors/template-Exasol-all-python-3.8-cuda-conda/flavor_base/testconfig b/flavors/template-Exasol-all-python-3.8-cuda-conda/flavor_base/testconfig index 7c6eefa41..8688dd718 100644 --- a/flavors/template-Exasol-all-python-3.8-cuda-conda/flavor_base/testconfig +++ b/flavors/template-Exasol-all-python-3.8-cuda-conda/flavor_base/testconfig @@ -1,2 +1,2 @@ generic_language_tests=python3 -test_folders=python3/all +test_folders=python3/all pandas/all diff --git a/test_container/tests/test/python3/all/dataframe.py b/test_container/tests/test/pandas/all/dataframe.py similarity index 91% rename from test_container/tests/test/python3/all/dataframe.py rename to test_container/tests/test/pandas/all/dataframe.py index d46cf8581..58c9fd738 100755 --- a/test_container/tests/test/python3/all/dataframe.py +++ b/test_container/tests/test/pandas/all/dataframe.py @@ -7,7 +7,7 @@ from exasol_python_test_framework import udf from exasol_python_test_framework.exatest.testcase import useData from exasol_python_test_framework.udf.udf_debug import UdfDebugger -from typing import List, Tuple +from typing import List, Tuple, Union class PandasDataFrame(udf.TestCase): @@ -905,33 +905,6 @@ def run(ctx): (234,) ], rows) - def test_dataframe_set_emits_double_pyfloat_only_todo(self): - import 
datetime - udf_sql = udf.fixindent(''' - CREATE OR REPLACE PYTHON3 SET SCRIPT foo(sec int) EMITS (ts double) AS - - def run(ctx): - import pandas as pd - import numpy as np - import datetime - - c1=np.empty(shape=(2),dtype=np.object_) - - c1[:]=234.5 - - df=pd.DataFrame({0:c1}) - - ctx.emit(df) - / - ''') - print(udf_sql) - self.query(udf_sql) - select_sql = 'SELECT foo(1)' - print(select_sql) - #TODO implement support - with self.assertRaisesRegex(Exception, 'F-UDF-CL-SL-PYTHON-1056'): - rows = self.query(select_sql) - def test_dataframe_set_emits_double_npfloat32_only(self): import datetime udf_sql = udf.fixindent(''' @@ -1019,53 +992,6 @@ def run(ctx): print(select_sql) rows = self.query(select_sql) - int_dataframe_value_str = "[[1,2],[3,4]]" - int_expected_rows = [(1,2, None),(3,4, None)] - float16_dataframe_value_str = 'np.array([[1.0,1.0],[1.0,np.nan]], dtype="float16")' - float_dataframe_value_str = "[[1.0,1.0],[1.0,np.nan]]" - float_expected_rows = [(1.0,1.0, None),(1.0, None, None)] - - - types = [ - ("uint8", "integer", int_dataframe_value_str, int_expected_rows), - ("uint16", "integer", int_dataframe_value_str, int_expected_rows), - ("uint32", "integer", int_dataframe_value_str, int_expected_rows), - ("uint64", "integer", int_dataframe_value_str, int_expected_rows), - ("int8", "integer", int_dataframe_value_str, int_expected_rows), - ("int16", "integer", int_dataframe_value_str, int_expected_rows), - ("int32", "integer", int_dataframe_value_str, int_expected_rows), - ("int64", "integer", int_dataframe_value_str, int_expected_rows), - ("float16", "float", float16_dataframe_value_str, float_expected_rows), - ("float32", "float", float_dataframe_value_str, float_expected_rows), - ("float64", "float", float_dataframe_value_str, float_expected_rows), - ("float", "float", float_dataframe_value_str, float_expected_rows), - ("double", "float", float_dataframe_value_str, float_expected_rows), - ] - - @useData(types) - def test_dtype_emit(self, dtype:str, 
sql_type:str, dataframe_value_str:str, expected_rows:List[Tuple]): - sql=udf.fixindent(f''' - CREATE OR REPLACE PYTHON3 SET SCRIPT test_dtype_emit(i integer) - EMITS (o1 {sql_type}, o2 {sql_type}, traceback varchar(2000000)) AS - - def run(ctx): - try: - import pandas as pd - import numpy as np - df = pd.DataFrame({dataframe_value_str}, dtype="{dtype}") - df["traceback"]=None - ctx.emit(df) - except: - import traceback - ctx.emit(None,None,traceback.format_exc()) - / - ''') - print(sql) - self.query(sql) - with UdfDebugger(test_case=self): - rows = self.query('''SELECT test_dtype_emit(0)''') - self.assertRowsEqual(expected_rows, rows) - if __name__ == '__main__': udf.main() diff --git a/test_container/tests/test/pandas/all/emit_dtypes.py b/test_container/tests/test/pandas/all/emit_dtypes.py new file mode 100755 index 000000000..ef65ce444 --- /dev/null +++ b/test_container/tests/test/pandas/all/emit_dtypes.py @@ -0,0 +1,263 @@ +#!/usr/bin/env python3 + +from decimal import Decimal +from datetime import date +from datetime import datetime + +from exasol_python_test_framework import udf +from exasol_python_test_framework.exatest.testcase import useData +from exasol_python_test_framework.udf.udf_debug import UdfDebugger +from typing import List, Tuple, Union + + +class PandasDataFrameEmitDTypes(udf.TestCase): + def setUp(self): + self.maxDiff=None + + self.query(f'CREATE SCHEMA {self.__class__.__name__}', ignore_errors=True) + self.query(f'OPEN SCHEMA {self.__class__.__name__}', ignore_errors=True) + + int_dataframe_value_str = "[[1, 2],[3, 4]]" + int_expected_rows = [(1, 2, None),(3, 4, None)] + int_to_float_expected_rows = [(1.0, 2.0, None),(3.0, 4.0, None)] + + float16_dataframe_value_str = 'np.array([[1.1, 2.1],[3.1, 4.1]], dtype="float16")' + float_dataframe_value_str = "[[1.1, 2.1],[3.1, 4.1]]" + float_expected_rows = [(1.1, 2.1, None),(3.1, 4.1, None)] + + str_dataframe_value_str = "[['a','b'],['c','d']]" + str_expected_rows = [('a','b',None),('c','d',None)] + + 
bool_dataframe_value_str = "[[True,False],[True,False]]" + bool_expected_rows = [(True,False,None),(True,False,None)] + + decimal_dataframe_value_str = "[[Decimal('1.1'),Decimal('2.1')],[Decimal('3.1'),Decimal('4.1')]]" + decimal_expected_rows = [(Decimal('1.1'),Decimal('2.1'),None),(Decimal('3.1'),Decimal('4.1'),None)] + int_to_decimal_expected_rows = [(Decimal('1'),Decimal('2'),None),(Decimal('3'),Decimal('4'),None)] + + timestamp_dataframe_value_str = '[[pd.Timestamp(datetime(2020, 7, 27, 14, 22, 33, 673251)),' \ + +'pd.Timestamp(datetime(2020, 7, 27, 14, 22, 33, 673251))],' \ + +'[pd.Timestamp(datetime(2020, 7, 27, 14, 22, 33, 673251)),' \ + +'pd.Timestamp(datetime(2020, 7, 27, 14, 22, 33, 673251))]]' + datetime_dataframe_value_str = '[[datetime(2020, 7, 27, 14, 22, 33, 673251),' \ + +'datetime(2020, 7, 27, 14, 22, 33, 673251)],' \ + +'[datetime(2020, 7, 27, 14, 22, 33, 673251),' \ + +'datetime(2020, 7, 27, 14, 22, 33, 673251)]]' + datetime_expected_rows = [(datetime(2020, 7, 27, 14, 22, 33, 673000),datetime(2020, 7, 27, 14, 22, 33, 673000),None), + (datetime(2020, 7, 27, 14, 22, 33, 673000),datetime(2020, 7, 27, 14, 22, 33, 673000),None)] + date_dataframe_value_str = '[[date(2020, 7, 27),' \ + +'date(2020, 7, 27)],' \ + +'[date(2020, 7, 27),' \ + +'date(2020, 7, 27)]]' + date_expected_rows = [(date(2020, 7, 27),date(2020, 7, 27),None), + (date(2020, 7, 27),date(2020, 7, 27),None)] + + none_dataframe_value_str = "[[None, None],[None, None]]" + none_expected_rows = [(None, None, None),(None, None, None)] + none_expected_rows_bool_ = [(False, False, None),(False, False, None)] + + nan_dataframe_value_str = "[[np.nan, np.nan],[np.nan, np.nan]]" + nan_expected_rows = [(None, None, None),(None, None, None)] + nan_expected_rows_bool_ = [(True, True, None),(True, True, None)] + + + + types = [ + ("uint8", "integer", int_dataframe_value_str, int_expected_rows, False), + ("uint16", "integer", int_dataframe_value_str, int_expected_rows, False), + ("uint32", "integer", 
int_dataframe_value_str, int_expected_rows, False), + ("uint64", "integer", int_dataframe_value_str, int_expected_rows, False), + ("int8", "integer", int_dataframe_value_str, int_expected_rows, False), + ("int16", "integer", int_dataframe_value_str, int_expected_rows, False), + ("int32", "integer", int_dataframe_value_str, int_expected_rows, False), + ("int64", "integer", int_dataframe_value_str, int_expected_rows, False), + ("object", "integer", int_dataframe_value_str, int_expected_rows, False), + + ("float16", "double", float16_dataframe_value_str, float_expected_rows, True), + ("float32", "double", float_dataframe_value_str, float_expected_rows, True), + ("float64", "double", float_dataframe_value_str, float_expected_rows, False), + ("float", "double", float_dataframe_value_str, float_expected_rows, False), + ("double", "double", float_dataframe_value_str, float_expected_rows, False), + ("object", "double", float_dataframe_value_str, float_expected_rows, False), + + ("uint8", "double", int_dataframe_value_str, int_to_float_expected_rows, False), + ("uint16", "double", int_dataframe_value_str, int_to_float_expected_rows, False), + ("uint32", "double", int_dataframe_value_str, int_to_float_expected_rows, False), + ("uint64", "double", int_dataframe_value_str, int_to_float_expected_rows, False), + ("int8", "double", int_dataframe_value_str, int_to_float_expected_rows, False), + ("int16", "double", int_dataframe_value_str, int_to_float_expected_rows, False), + ("int32", "double", int_dataframe_value_str, int_to_float_expected_rows, False), + ("int64", "double", int_dataframe_value_str, int_to_float_expected_rows, False), + ("object", "double", int_dataframe_value_str, int_to_float_expected_rows, False), + + ("float16", "integer", float16_dataframe_value_str, int_expected_rows, False), + ("float32", "integer", float_dataframe_value_str, int_expected_rows, False), + ("float64", "integer", float_dataframe_value_str, int_expected_rows, False), + ("float", "integer", 
float_dataframe_value_str, int_expected_rows, False), + ("double", "integer", float_dataframe_value_str, int_expected_rows, False), + ("object", "integer", float_dataframe_value_str, int_expected_rows, False), + + ("uint8", "DECIMAL(10,5)", int_dataframe_value_str, int_to_decimal_expected_rows, False), + ("uint16", "DECIMAL(10,5)", int_dataframe_value_str, int_to_decimal_expected_rows, False), + ("uint32", "DECIMAL(10,5)", int_dataframe_value_str, int_to_decimal_expected_rows, False), + ("uint64", "DECIMAL(10,5)", int_dataframe_value_str, int_to_decimal_expected_rows, False), + ("int8", "DECIMAL(10,5)", int_dataframe_value_str, int_to_decimal_expected_rows, False), + ("int16", "DECIMAL(10,5)", int_dataframe_value_str, int_to_decimal_expected_rows, False), + ("int32", "DECIMAL(10,5)", int_dataframe_value_str, int_to_decimal_expected_rows, False), + ("int64", "DECIMAL(10,5)", int_dataframe_value_str, int_to_decimal_expected_rows, False), + ("object", "DECIMAL(10,5)", int_dataframe_value_str, int_to_decimal_expected_rows, False), + + ("float16", "DECIMAL(10,5)", float16_dataframe_value_str, decimal_expected_rows, True), + ("float32", "DECIMAL(10,5)", float_dataframe_value_str, decimal_expected_rows, True), + ("float64", "DECIMAL(10,5)", float_dataframe_value_str, decimal_expected_rows, True), + ("float", "DECIMAL(10,5)", float_dataframe_value_str, decimal_expected_rows, True), + ("double", "DECIMAL(10,5)", float_dataframe_value_str, decimal_expected_rows, True), + ("object", "DECIMAL(10,5)", float_dataframe_value_str, decimal_expected_rows, True), + + ("object", "DECIMAL(10,5)", decimal_dataframe_value_str, decimal_expected_rows, False), + + ("string", "VARCHAR(2000000)", str_dataframe_value_str, str_expected_rows, False), + ("object", "VARCHAR(2000000)", str_dataframe_value_str, str_expected_rows, False), + + ("bool_", "boolean", bool_dataframe_value_str, bool_expected_rows, False), + ("boolean", "boolean", bool_dataframe_value_str, bool_expected_rows, False), + 
("object", "boolean", bool_dataframe_value_str, bool_expected_rows, False), + + ("datetime64[ns]", "timestamp", timestamp_dataframe_value_str, datetime_expected_rows, False), + ("object", "timestamp", timestamp_dataframe_value_str, ".*F-UDF-CL-SL-PYTHON-1056.*unexpected python type: py_Timestamp.*", False), + ("object", "timestamp", datetime_dataframe_value_str, ".*F-UDF-CL-SL-PYTHON-1056.*unexpected python type: py_datetime.datetime.*", False), + ("object", "timestamp", date_dataframe_value_str, ".*F-UDF-CL-SL-PYTHON-1071: emit column 0 of type TIMESTAMP but data given have type py_datetime.date.*", False), + ("object", "DATE", date_dataframe_value_str, date_expected_rows, False), + + #(u)int-dtypes don't support None or np.nan + + # None + + ("object", "integer", none_dataframe_value_str, none_expected_rows, False), + + ("float16", "double", none_dataframe_value_str, none_expected_rows, False), + ("float32", "double", none_dataframe_value_str, none_expected_rows, False), + ("float64", "double", none_dataframe_value_str, none_expected_rows, False), + ("float", "double", none_dataframe_value_str, none_expected_rows, False), + ("double", "double", none_dataframe_value_str, none_expected_rows, False), + ("object", "double", none_dataframe_value_str, none_expected_rows, False), + + ("float16", "integer", none_dataframe_value_str, none_expected_rows, False), + ("float32", "integer", none_dataframe_value_str, none_expected_rows, False), + ("float64", "integer", none_dataframe_value_str, none_expected_rows, False), + ("float", "integer", none_dataframe_value_str, none_expected_rows, False), + ("double", "integer", none_dataframe_value_str, none_expected_rows, False), + + ("float16", "DECIMAL(10,5)", none_dataframe_value_str, none_expected_rows, False), + ("float32", "DECIMAL(10,5)", none_dataframe_value_str, none_expected_rows, False), + ("float64", "DECIMAL(10,5)", none_dataframe_value_str, none_expected_rows, False), + ("float", "DECIMAL(10,5)", 
none_dataframe_value_str, none_expected_rows, False), + ("double", "DECIMAL(10,5)", none_dataframe_value_str, none_expected_rows, False), + + ("object", "DECIMAL(10,5)", none_dataframe_value_str, none_expected_rows, False), + + ("string", "VARCHAR(2000000)", none_dataframe_value_str, none_expected_rows, False), + ("object", "VARCHAR(2000000)", none_dataframe_value_str, none_expected_rows, False), + + ("bool_", "boolean", none_dataframe_value_str, none_expected_rows_bool_, False), + ("boolean", "boolean", none_dataframe_value_str, none_expected_rows, False), + ("object", "boolean", none_dataframe_value_str, none_expected_rows, False), + + ("datetime64[ns]", "timestamp", none_dataframe_value_str, none_expected_rows, False), + ("object", "DATE", none_dataframe_value_str, none_expected_rows, False), + + # NaN + + ("object", "integer", nan_dataframe_value_str, nan_expected_rows, False), + + ("float16", "double", nan_dataframe_value_str, nan_expected_rows, False), + ("float32", "double", nan_dataframe_value_str, nan_expected_rows, False), + ("float64", "double", nan_dataframe_value_str, nan_expected_rows, False), + ("float", "double", nan_dataframe_value_str, nan_expected_rows, False), + ("double", "double", nan_dataframe_value_str, nan_expected_rows, False), + ("object", "double", nan_dataframe_value_str, nan_expected_rows, False), + + ("float16", "integer", nan_dataframe_value_str, nan_expected_rows, False), + ("float32", "integer", nan_dataframe_value_str, nan_expected_rows, False), + ("float64", "integer", nan_dataframe_value_str, nan_expected_rows, False), + ("float", "integer", nan_dataframe_value_str, nan_expected_rows, False), + ("double", "integer", nan_dataframe_value_str, nan_expected_rows, False), + + ("float16", "DECIMAL(10,5)", nan_dataframe_value_str, nan_expected_rows, False), + ("float32", "DECIMAL(10,5)", nan_dataframe_value_str, nan_expected_rows, False), + ("float64", "DECIMAL(10,5)", nan_dataframe_value_str, nan_expected_rows, False), + ("float", 
"DECIMAL(10,5)", nan_dataframe_value_str, nan_expected_rows, False), + ("double", "DECIMAL(10,5)", nan_dataframe_value_str, nan_expected_rows, False), + + #("object", "DECIMAL(10,5)", nan_dataframe_value_str, None, False), # Fails with VM error: [22018] invalid character value for cast; Value: 'nan' + + ("string", "VARCHAR(2000000)", nan_dataframe_value_str, nan_expected_rows, False), + ("object", "VARCHAR(2000000)", nan_dataframe_value_str, ".*PYTHON-1068: emit column 0 of type STRING but data given have type py_float.*", False), + + ("bool_", "boolean", nan_dataframe_value_str, nan_expected_rows_bool_, False), + ("boolean", "boolean", nan_dataframe_value_str, nan_expected_rows, False), + ("object", "boolean", nan_dataframe_value_str, ".*F-UDF-CL-SL-PYTHON-1068: emit column 0 of type BOOLEAN but data given have type py_float.*", False), + + ("datetime64[ns]", "timestamp", nan_dataframe_value_str, nan_expected_rows, False), + ("object", "DATE", nan_dataframe_value_str, ".*F-UDF-CL-SL-PYTHON-1068: emit column 0 of type DATE but data given have type py_float.*", False), + + # TODO mixed nan/none with values + ] + + @useData(types) + def test_dtype_emit(self, dtype:str, sql_type:str, dataframe_value_str:str, expected_result:Union[str,List[Tuple]], use_almost_equal:bool): + sql=udf.fixindent(f''' + CREATE OR REPLACE PYTHON3 SET SCRIPT test_dtype_emit(i integer) + EMITS (o1 {sql_type}, o2 {sql_type}, traceback varchar(2000000)) AS + + def run(ctx): + try: + from decimal import Decimal + import pandas as pd + import numpy as np + from datetime import datetime, date + df = pd.DataFrame({dataframe_value_str}, dtype="{dtype}") + df["traceback"]=None + ctx.emit(df) + except: + import traceback + ctx.emit(None,None,traceback.format_exc()) + / + ''') + print(sql) + self.query(sql) + with UdfDebugger(test_case=self): + rows = self.query('''SELECT test_dtype_emit(0)''') + if isinstance(expected_result,str): + self.assertRegex(rows[0][2], expected_result) + else: + if 
use_almost_equal: + self.assertRowsAlmostEqual(expected_result, rows, places=1) + else: + self.assertRowsEqual(expected_result, rows) + + def isValueAlmostEqual(self, left, right, places): + if isinstance(left, (float, Decimal)) and isinstance(right, (float, Decimal)): + return round(left, places) == round(right, places) + else: + return left == right + + def isRowAlmostEqual(self, left, right, places): + if len(left) != len(right): + return False + all_values_almost_equal = all(self.isValueAlmostEqual(lvalue, rvalue, places) + for lvalue, rvalue in zip(left, right)) + return all_values_almost_equal + + def assertRowsAlmostEqual(self, left, right, places): + lrows = [tuple(x) for x in left] + rrows = [tuple(x) for x in right] + if len(lrows) != len(rrows): + raise AssertionError(f'{lrows} and {rrows} have different number of rows.') + all_rows_almost_equal = all(self.isRowAlmostEqual(lrow, rrow, places) for lrow, rrow in zip(lrows, rrows)) + if not all_rows_almost_equal: + raise AssertionError(f'{lrows} and {rrows} are not almost equal.') + +if __name__ == '__main__': + udf.main() + diff --git a/test_container/tests/test/pandas/pandas2/pandas.py b/test_container/tests/test/pandas/pandas2/pandas.py new file mode 100755 index 000000000..72ebcf713 --- /dev/null +++ b/test_container/tests/test/pandas/pandas2/pandas.py @@ -0,0 +1,151 @@ +#!/usr/bin/env python3 + +from decimal import Decimal +from datetime import date +from datetime import datetime + +from exasol_python_test_framework import udf +from exasol_python_test_framework.exatest.testcase import useData +from exasol_python_test_framework.udf.udf_debug import UdfDebugger +from typing import List, Tuple, Union + +class Pandas2Test(udf.TestCase): + def setUp(self): + self.query('create schema pandas2test', ignore_errors=True) + self.maxDiff=None + + def test_pandas2_version(self): + sql=udf.fixindent(''' + CREATE OR REPLACE PYTHON3 SET SCRIPT pandas2test.test_pandas2_version(i integer) EMITS (o VARCHAR(100)) AS + + 
def run(ctx): + import pandas as pd + ctx.emit(pd.__version__) + / + ''') + print(sql) + self.query(sql) + rows = self.query('''SELECT pandas2test.test_pandas2_version(0)''') + version_parts = rows[0][0].split(".") + self.assertEqual("2",version_parts[0]) + + + int_dataframe_value_str = "[[1, 2],[3, 4]]" + int_expected_rows = [(1, 2, None),(3, 4, None)] + + float16_dataframe_value_str = 'np.array([[1.0, 2.0],[3.0, 4.0]], dtype="float16")' + float_dataframe_value_str = "[[1.0, 2.0],[3.0, 4.0]]" + float_expected_rows = [(1.0, 2.0, None),(3.0, 4.0, None)] + + str_dataframe_value_str = "[['a','b'],['c','d']]" + str_expected_rows = [('a','b',None),('c','d',None)] + + bool_dataframe_value_str = "[[True,False],[True,False]]" + bool_expected_rows = [(True,False,None),(True,False,None)] + + decimal_dataframe_value_str = "[[Decimal('1.0'),Decimal('2.0')],[Decimal('3.0'),Decimal('4.0')]]" + decimal_expected_rows = [(Decimal('1.0'),Decimal('2.0'),None),(Decimal('3.0'),Decimal('4.0'),None)] + + timestamp_dataframe_value_str = '[[pd.Timestamp(datetime(2020, 7, 27, 14, 22, 33, 673251)),' \ + +'pd.Timestamp(datetime(2020, 7, 27, 14, 22, 33, 673251))],' \ + +'[pd.Timestamp(datetime(2020, 7, 27, 14, 22, 33, 673251)),' \ + +'pd.Timestamp(datetime(2020, 7, 27, 14, 22, 33, 673251))]]' + datetime_dataframe_value_str = '[[datetime(2020, 7, 27, 14, 22, 33, 673251),' \ + +'datetime(2020, 7, 27, 14, 22, 33, 673251)],' \ + +'[datetime(2020, 7, 27, 14, 22, 33, 673251),' \ + +'datetime(2020, 7, 27, 14, 22, 33, 673251)]]' + datetime_expected_row = [(datetime(2020, 7, 27, 14, 22, 33, 673000),datetime(2020, 7, 27, 14, 22, 33, 673000),None), + (datetime(2020, 7, 27, 14, 22, 33, 673000),datetime(2020, 7, 27, 14, 22, 33, 673000),None)] + + none_dataframe_value_str = "[[None, None],[None, None]]" + none_expected_rows = [(None, None, None),(None, None, None)] + none_expected_rows_bool_ = [(False, False, None),(False, False, None)] + + nan_dataframe_value_str = "[[np.nan, np.nan],[np.nan, np.nan]]" + 
nan_float16_dataframe_value_str = "np.array([[np.nan, np.nan],[np.nan, np.nan]], dtype='float16')" + nan_expected_rows = [(None, None, None),(None, None, None)] + + types = [ + ("uint8[pyarrow]", "integer", int_dataframe_value_str, int_expected_rows), + ("uint16[pyarrow]", "integer", int_dataframe_value_str, int_expected_rows), + ("uint32[pyarrow]", "integer", int_dataframe_value_str, int_expected_rows), + ("uint64[pyarrow]", "integer", int_dataframe_value_str, int_expected_rows), + ("int8[pyarrow]", "integer", int_dataframe_value_str, int_expected_rows), + ("int16[pyarrow]", "integer", int_dataframe_value_str, int_expected_rows), + ("int32[pyarrow]", "integer", int_dataframe_value_str, int_expected_rows), + ("int64[pyarrow]", "integer", int_dataframe_value_str, int_expected_rows), + ("float16[pyarrow]", "float", float16_dataframe_value_str, float_expected_rows), + ("float32[pyarrow]", "float", float_dataframe_value_str, float_expected_rows), + ("float64[pyarrow]", "float", float_dataframe_value_str, float_expected_rows), + ("halffloat[pyarrow]", "float", float16_dataframe_value_str, float_expected_rows), + ("float[pyarrow]", "float", float_dataframe_value_str, float_expected_rows), + ("double[pyarrow]", "float", float_dataframe_value_str, float_expected_rows), + ("string[pyarrow]", "VARCHAR(2000000)", str_dataframe_value_str, str_expected_rows), + ("bool[pyarrow]", "boolean", bool_dataframe_value_str, bool_expected_rows), + + ("uint8[pyarrow]", "integer", none_dataframe_value_str, none_expected_rows), + ("uint16[pyarrow]", "integer", none_dataframe_value_str, none_expected_rows), + ("uint32[pyarrow]", "integer", none_dataframe_value_str, none_expected_rows), + ("uint64[pyarrow]", "integer", none_dataframe_value_str, none_expected_rows), + ("int8[pyarrow]", "integer", none_dataframe_value_str, none_expected_rows), + ("int16[pyarrow]", "integer", none_dataframe_value_str, none_expected_rows), + ("int32[pyarrow]", "integer", none_dataframe_value_str, 
none_expected_rows), + ("int64[pyarrow]", "integer", none_dataframe_value_str, none_expected_rows), + ("float16[pyarrow]", "float", none_dataframe_value_str, none_expected_rows), + ("float32[pyarrow]", "float", none_dataframe_value_str, none_expected_rows), + ("float64[pyarrow]", "float", none_dataframe_value_str, none_expected_rows), + ("halffloat[pyarrow]", "float", none_dataframe_value_str, none_expected_rows), + ("float[pyarrow]", "float", none_dataframe_value_str, none_expected_rows), + ("double[pyarrow]", "float", none_dataframe_value_str, none_expected_rows), + ("string[pyarrow]", "VARCHAR(2000000)", none_dataframe_value_str, none_expected_rows), + ("bool[pyarrow]", "boolean", none_dataframe_value_str, none_expected_rows), + + ("uint8[pyarrow]", "integer", nan_dataframe_value_str, nan_expected_rows), + ("uint16[pyarrow]", "integer", nan_dataframe_value_str, nan_expected_rows), + ("uint32[pyarrow]", "integer", nan_dataframe_value_str, nan_expected_rows), + ("uint64[pyarrow]", "integer", nan_dataframe_value_str, nan_expected_rows), + ("int8[pyarrow]", "integer", nan_dataframe_value_str, nan_expected_rows), + ("int16[pyarrow]", "integer", nan_dataframe_value_str, nan_expected_rows), + ("int32[pyarrow]", "integer", nan_dataframe_value_str, nan_expected_rows), + ("int64[pyarrow]", "integer", nan_dataframe_value_str, nan_expected_rows), + ("float16[pyarrow]", "float", nan_dataframe_value_str, ".*pyarrow.lib.ArrowNotImplementedError: Unsupported cast from double to halffloat using function cast_half_float.*"), + ("float32[pyarrow]", "float", nan_dataframe_value_str, nan_expected_rows), + ("float64[pyarrow]", "float", nan_dataframe_value_str, nan_expected_rows), + ("halffloat[pyarrow]", "float", nan_dataframe_value_str, ".*pyarrow.lib.ArrowNotImplementedError: Unsupported cast from double to halffloat using function cast_half_float.*"), + ("float[pyarrow]", "float", nan_dataframe_value_str, nan_expected_rows), + ("double[pyarrow]", "float", nan_dataframe_value_str, 
nan_expected_rows), + ("string[pyarrow]", "VARCHAR(2000000)", nan_dataframe_value_str, nan_expected_rows), + ("bool[pyarrow]", "boolean", nan_dataframe_value_str, nan_expected_rows), + ] + + @useData(types) + def test_pyarrow_dtype_emit(self, dtype:str, sql_type:str, dataframe_value_str:str, expected_result:Union[str,List[Tuple]]): + sql=udf.fixindent(f''' + CREATE OR REPLACE PYTHON3 SET SCRIPT test_dtype_emit(i integer) + EMITS (o1 {sql_type}, o2 {sql_type}, traceback varchar(2000000)) AS + + def run(ctx): + try: + from decimal import Decimal + import pandas as pd + import numpy as np + from datetime import datetime + df = pd.DataFrame({dataframe_value_str}, dtype="{dtype}") + df["traceback"]=None + ctx.emit(df) + except: + import traceback + ctx.emit(None,None,traceback.format_exc()) + / + ''') + print(sql) + self.query(sql) + with UdfDebugger(test_case=self): + rows = self.query('''SELECT test_dtype_emit(0)''') + if isinstance(expected_result,str): + self.assertRegex(rows[0][2], expected_result) + else: + self.assertRowsEqual(expected_result, rows) + +if __name__ == '__main__': + udf.main() diff --git a/test_container/tests/test/pandas2/pandas.py b/test_container/tests/test/pandas2/pandas.py deleted file mode 100755 index fa1fc1c9a..000000000 --- a/test_container/tests/test/pandas2/pandas.py +++ /dev/null @@ -1,79 +0,0 @@ -#!/usr/bin/env python3 - - -from exasol_python_test_framework import udf -from exasol_python_test_framework.exatest.testcase import useData -from exasol_python_test_framework.udf.udf_debug import UdfDebugger -from typing import List, Tuple - -class Pandas2Test(udf.TestCase): - def setUp(self): - self.query('create schema pandas2test', ignore_errors=True) - self.maxDiff=None - - def test_pandas2_version(self): - sql=udf.fixindent(''' - CREATE OR REPLACE PYTHON3 SET SCRIPT pandas2test.test_pandas2_version(i integer) EMITS (o VARCHAR(100)) AS - - def run(ctx): - import pandas as pd - ctx.emit(pd.__version__) - / - ''') - print(sql) - 
self.query(sql) - rows = self.query('''SELECT pandas2test.test_pandas2_version(0)''') - version_parts = rows[0][0].split(".") - self.assertEqual("2",version_parts[0]) - - int_dataframe_value_str = "[[1,2],[3,4]]" - int_expected_rows = [(1,2, None),(3,4, None)] - float16_dataframe_value_str = 'np.array([[1.0,1.0],[1.0,1.0]], dtype="float16")' - float_dataframe_value_str = "[[1.0,1.0],[1.0,1.0]]" - float_expected_rows = [(1.0,1.0, None),(1.0,1.0, None)] - - - types = [ - ("uint8[pyarrow]", "integer", int_dataframe_value_str, int_expected_rows), - ("uint16[pyarrow]", "integer", int_dataframe_value_str, int_expected_rows), - ("uint32[pyarrow]", "integer", int_dataframe_value_str, int_expected_rows), - ("uint64[pyarrow]", "integer", int_dataframe_value_str, int_expected_rows), - ("int8[pyarrow]", "integer", int_dataframe_value_str, int_expected_rows), - ("int16[pyarrow]", "integer", int_dataframe_value_str, int_expected_rows), - ("int32[pyarrow]", "integer", int_dataframe_value_str, int_expected_rows), - ("int64[pyarrow]", "integer", int_dataframe_value_str, int_expected_rows), - ("float16[pyarrow]", "float", float16_dataframe_value_str, float_expected_rows), - ("float32[pyarrow]", "float", float_dataframe_value_str, float_expected_rows), - ("float64[pyarrow]", "float", float_dataframe_value_str, float_expected_rows), - ("halffloat[pyarrow]", "float", float16_dataframe_value_str, float_expected_rows), - ("float[pyarrow]", "float", float_dataframe_value_str, float_expected_rows), - ("double[pyarrow]", "float", float_dataframe_value_str, float_expected_rows), - ] - - @useData(types) - def test_pandas2_int_pyarrow_dtype_emit(self, dtype:str, sql_type:str, dataframe_value_str:str, expected_rows:List[Tuple]): - sql=udf.fixindent(f''' - CREATE OR REPLACE PYTHON3 SET SCRIPT pandas2test.test_pandas2_pyarrow_dtype_emit(i integer) - EMITS (o1 {sql_type}, o2 {sql_type}, traceback varchar(2000000)) AS - - def run(ctx): - try: - import pandas as pd - import numpy as np - df = 
pd.DataFrame({dataframe_value_str}, dtype="{dtype}") - df["traceback"]=None - ctx.emit(df) - except: - import traceback - ctx.emit(None,None,traceback.format_exc()) - / - ''') - print(sql) - self.query(sql) - with UdfDebugger(test_case=self): - rows = self.query('''SELECT pandas2test.test_pandas2_pyarrow_dtype_emit(0)''') - self.assertRowsEqual(expected_rows, rows) - - -if __name__ == '__main__': - udf.main() From ff8c9be8e4cd5cf638c1a54f6df0c68bcc177b20 Mon Sep 17 00:00:00 2001 From: Torsten Kilias Date: Tue, 23 May 2023 08:58:24 +0200 Subject: [PATCH 05/11] Add pandas2 test folder to testconfig --- flavors/python-3.8-minimal-EXASOL-6.2.0/flavor_base/testconfig | 2 +- .../template-Exasol-all-python-3.8-conda/flavor_base/testconfig | 2 +- .../flavor_base/testconfig | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/flavors/python-3.8-minimal-EXASOL-6.2.0/flavor_base/testconfig b/flavors/python-3.8-minimal-EXASOL-6.2.0/flavor_base/testconfig index 8688dd718..d1bdb8113 100644 --- a/flavors/python-3.8-minimal-EXASOL-6.2.0/flavor_base/testconfig +++ b/flavors/python-3.8-minimal-EXASOL-6.2.0/flavor_base/testconfig @@ -1,2 +1,2 @@ generic_language_tests=python3 -test_folders=python3/all pandas/all +test_folders=python3/all pandas/all pandas/pandas2 diff --git a/flavors/template-Exasol-all-python-3.8-conda/flavor_base/testconfig b/flavors/template-Exasol-all-python-3.8-conda/flavor_base/testconfig index 8688dd718..d1bdb8113 100644 --- a/flavors/template-Exasol-all-python-3.8-conda/flavor_base/testconfig +++ b/flavors/template-Exasol-all-python-3.8-conda/flavor_base/testconfig @@ -1,2 +1,2 @@ generic_language_tests=python3 -test_folders=python3/all pandas/all +test_folders=python3/all pandas/all pandas/pandas2 diff --git a/flavors/template-Exasol-all-python-3.8-cuda-conda/flavor_base/testconfig b/flavors/template-Exasol-all-python-3.8-cuda-conda/flavor_base/testconfig index 8688dd718..d1bdb8113 100644 --- 
a/flavors/template-Exasol-all-python-3.8-cuda-conda/flavor_base/testconfig +++ b/flavors/template-Exasol-all-python-3.8-cuda-conda/flavor_base/testconfig @@ -1,2 +1,2 @@ generic_language_tests=python3 -test_folders=python3/all pandas/all +test_folders=python3/all pandas/all pandas/pandas2 From 71e8aed88ea0c0b7f4bb6d3f7f55939340c2a036 Mon Sep 17 00:00:00 2001 From: Torsten Kilias Date: Tue, 23 May 2023 18:34:01 +0200 Subject: [PATCH 06/11] Add support for pyarrow timestamp and decimal dtypes --- .../python/python3/python_ext_dataframe.cc | 59 +++++- .../tests/test/pandas/all/emit_dtypes.py | 4 +- .../tests/test/pandas/pandas2/pandas.py | 197 ++++++++++++------ 3 files changed, 190 insertions(+), 70 deletions(-) diff --git a/exaudfclient/base/python/python3/python_ext_dataframe.cc b/exaudfclient/base/python/python3/python_ext_dataframe.cc index 228a9f54c..5afb964a4 100644 --- a/exaudfclient/base/python/python3/python_ext_dataframe.cc +++ b/exaudfclient/base/python/python3/python_ext_dataframe.cc @@ -28,6 +28,7 @@ extern "C" { #define PY_NONETYPE (NPY_USERDEF+5) #define PY_BOOL (NPY_USERDEF+6) #define PY_FLOAT (NPY_USERDEF+7) +#define PY_TIMESTAMP (NPY_USERDEF+8) std::map pandasDTypeStrToNumpyCTypeMap { @@ -75,6 +76,8 @@ std::map pandasDTypeStrToNumpyCTypeMap { {"bool", NPY_BOOL}, {"datetime64[ns]", NPY_DATETIME}, + {"timestamp[ns, tz=UTC][pyarrow]", NPY_OBJECT}, + {"object", NPY_OBJECT}, {"py_NAType", PY_NONETYPE}, @@ -84,7 +87,8 @@ std::map pandasDTypeStrToNumpyCTypeMap { {"py_float", PY_FLOAT}, {"py_decimal.Decimal", PY_DECIMAL}, {"py_str", PY_STR}, - {"py_datetime.date", PY_DATE} + {"py_datetime.date", PY_DATE}, + {"py_Timestamp", PY_TIMESTAMP} }; std::map numpyCTypeToNumpyDTypeStrMap { @@ -99,13 +103,6 @@ std::map numpyCTypeToNumpyDTypeStrMap { {NPY_UINT64, "uint64"}, {NPY_FLOAT32, "float32"}, {NPY_FLOAT64, "float64"}, - {PY_INT, "py_int"}, - {PY_FLOAT, "py_float"}, - {PY_DECIMAL, "py_decimal.Decimal"}, - {PY_STR, "py_str"}, - {PY_DATE, "py_datetime.date"}, - 
{PY_NONETYPE, "py_NoneType"}, - {PY_BOOL, "py_bool"} }; std::map emitTypeMap { @@ -449,6 +446,11 @@ inline bool isNumpyDatetime64(const char* typeName){ return std::string(typeName).find("datetime64[")==0; } +inline bool isArrowDecimal128(const char* typeName){ + // example decimal128(3, 2)[pyarrow] + return std::string(typeName).find("decimal128(")==0 && std::string(typeName).find("[pyarrow]")!=std::string::npos; +} + inline void getColumnTypeInfo(PyObject *numpyTypes, std::vector>& colTypes){ PyPtr numpyTypeIter(PyObject_GetIter(numpyTypes)); for (PyPtr numpyType(PyIter_Next(numpyTypeIter.get())); numpyType.get(); numpyType.reset(PyIter_Next(numpyTypeIter.get()))) { @@ -456,6 +458,8 @@ inline void getColumnTypeInfo(PyObject *numpyTypes, std::vector::iterator it = pandasDTypeStrToNumpyCTypeMap.find(typeName); if (it != pandasDTypeStrToNumpyCTypeMap.end()) { colTypes.push_back(*it); + } else if(isArrowDecimal128(typeName)){ + colTypes.push_back({typeName, NPY_OBJECT}); } else if(isNumpyDatetime64(typeName)){ std::stringstream ss; ss << "F-UDF-CL-SL-PYTHON-1138: emit: unsupported datetime type: " << typeName << @@ -1070,6 +1074,39 @@ inline void handleEmitPyDate( } } } +inline void handleEmitPyTimestamp( + int c, int r, + std::vector& columnArrays, + std::vector>& pyColSetMethods, + std::vector& colInfo, + std::vector>& colTypes, + PyObject *resultHandler, + PyPtr& pyValue, + PyPtr& pyResult, + PyPtr& pySetNullMethodName){ + PyPtr pyTimestamp(PyList_GetItem(columnArrays[c].get(), r)); + if (isNoneOrNA(pyTimestamp.get())) { + pyResult.reset(PyObject_CallMethodObjArgs(resultHandler, pySetNullMethodName.get(), pyColSetMethods[c].first.get(), NULL)); + return; + } + + switch (colInfo[c].type) { + case SWIGVMContainers::TIMESTAMP: + { + pyTimestamp.reset(PyObject_CallMethod(pyTimestamp.get(), "astimezone", "z", NULL)); + PyPtr pyIsoDatetime(PyObject_CallMethod(pyTimestamp.get(), "isoformat", "s", " ")); + pyResult.reset(PyObject_CallMethodObjArgs( + resultHandler, 
pyColSetMethods[c].second.get(), pyColSetMethods[c].first.get(), pyIsoDatetime.get(), NULL)); + break; + } + default: + { + std::stringstream ss; + ss << "F-UDF-CL-SL-PYTHON-1071: emit column " << c << " of type " << emitTypeMap.at(colInfo[c].type) << " but data given have type " << colTypes[c].first; + throw std::runtime_error(ss.str().c_str()); + } + } +} inline void handleEmitNpyDateTime( @@ -1253,6 +1290,12 @@ void emit(PyObject *resultHandler, std::vector& colInfo, PyObject *d pySetNullMethodName, pyIsoformatMethodName); break; } + case PY_TIMESTAMP: + { + handleEmitPyTimestamp(c, r, columnArrays, pyColSetMethods, colInfo, colTypes, resultHandler, pyValue, pyResult, + pySetNullMethodName); + break; + } case NPY_DATETIME: { handleEmitNpyDateTime(c, r, columnArrays, pyColSetMethods, colInfo, colTypes, resultHandler, pyValue, pyResult, diff --git a/test_container/tests/test/pandas/all/emit_dtypes.py b/test_container/tests/test/pandas/all/emit_dtypes.py index ef65ce444..39ba55b70 100755 --- a/test_container/tests/test/pandas/all/emit_dtypes.py +++ b/test_container/tests/test/pandas/all/emit_dtypes.py @@ -124,7 +124,7 @@ def setUp(self): ("object", "boolean", bool_dataframe_value_str, bool_expected_rows, False), ("datetime64[ns]", "timestamp", timestamp_dataframe_value_str, datetime_expected_rows, False), - ("object", "timestamp", timestamp_dataframe_value_str, ".*F-UDF-CL-SL-PYTHON-1056.*unexpected python type: py_Timestamp.*", False), + ("object", "timestamp", timestamp_dataframe_value_str, datetime_expected_rows, False), ("object", "timestamp", datetime_dataframe_value_str, ".*F-UDF-CL-SL-PYTHON-1056.*unexpected python type: py_datetime.datetime.*", False), ("object", "timestamp", date_dataframe_value_str, ".*F-UDF-CL-SL-PYTHON-1071: emit column 0 of type TIMESTAMP but data given have type py_datetime.date.*", False), ("object", "DATE", date_dataframe_value_str, date_expected_rows, False), @@ -164,6 +164,7 @@ def setUp(self): ("object", "boolean", 
none_dataframe_value_str, none_expected_rows, False), ("datetime64[ns]", "timestamp", none_dataframe_value_str, none_expected_rows, False), + ("object", "timestamp", none_dataframe_value_str, none_expected_rows, False), ("object", "DATE", none_dataframe_value_str, none_expected_rows, False), # NaN @@ -199,6 +200,7 @@ def setUp(self): ("object", "boolean", nan_dataframe_value_str, ".*F-UDF-CL-SL-PYTHON-1068: emit column 0 of type BOOLEAN but data given have type py_float.*", False), ("datetime64[ns]", "timestamp", nan_dataframe_value_str, nan_expected_rows, False), + ("object", "timestamp", nan_dataframe_value_str, ".*F-UDF-CL-SL-PYTHON-1068: emit column 0 of type DATE but data given have type py_float.*", False), ("object", "DATE", nan_dataframe_value_str, ".*F-UDF-CL-SL-PYTHON-1068: emit column 0 of type DATE but data given have type py_float.*", False), # TODO mixed nan/none with values diff --git a/test_container/tests/test/pandas/pandas2/pandas.py b/test_container/tests/test/pandas/pandas2/pandas.py index 72ebcf713..479d4d9c9 100755 --- a/test_container/tests/test/pandas/pandas2/pandas.py +++ b/test_container/tests/test/pandas/pandas2/pandas.py @@ -32,10 +32,11 @@ def run(ctx): int_dataframe_value_str = "[[1, 2],[3, 4]]" int_expected_rows = [(1, 2, None),(3, 4, None)] + int_to_float_expected_rows = [(1.0, 2.0, None),(3.0, 4.0, None)] - float16_dataframe_value_str = 'np.array([[1.0, 2.0],[3.0, 4.0]], dtype="float16")' - float_dataframe_value_str = "[[1.0, 2.0],[3.0, 4.0]]" - float_expected_rows = [(1.0, 2.0, None),(3.0, 4.0, None)] + float16_dataframe_value_str = 'np.array([[1.1, 2.1],[3.1, 4.1]], dtype="float16")' + float_dataframe_value_str = "[[1.1, 2.1],[3.1, 4.1]]" + float_expected_rows = [(1.1, 2.1, None),(3.1, 4.1, None)] str_dataframe_value_str = "[['a','b'],['c','d']]" str_expected_rows = [('a','b',None),('c','d',None)] @@ -43,8 +44,9 @@ def run(ctx): bool_dataframe_value_str = "[[True,False],[True,False]]" bool_expected_rows = 
[(True,False,None),(True,False,None)] - decimal_dataframe_value_str = "[[Decimal('1.0'),Decimal('2.0')],[Decimal('3.0'),Decimal('4.0')]]" - decimal_expected_rows = [(Decimal('1.0'),Decimal('2.0'),None),(Decimal('3.0'),Decimal('4.0'),None)] + decimal_dataframe_value_str = "[[Decimal('1.1'),Decimal('2.1')],[Decimal('3.1'),Decimal('4.1')]]" + decimal_expected_rows = [(Decimal('1.1'),Decimal('2.1'),None),(Decimal('3.1'),Decimal('4.1'),None)] + int_to_decimal_expected_rows = [(Decimal('1'),Decimal('2'),None),(Decimal('3'),Decimal('4'),None)] timestamp_dataframe_value_str = '[[pd.Timestamp(datetime(2020, 7, 27, 14, 22, 33, 673251)),' \ +'pd.Timestamp(datetime(2020, 7, 27, 14, 22, 33, 673251))],' \ @@ -54,72 +56,121 @@ def run(ctx): +'datetime(2020, 7, 27, 14, 22, 33, 673251)],' \ +'[datetime(2020, 7, 27, 14, 22, 33, 673251),' \ +'datetime(2020, 7, 27, 14, 22, 33, 673251)]]' - datetime_expected_row = [(datetime(2020, 7, 27, 14, 22, 33, 673000),datetime(2020, 7, 27, 14, 22, 33, 673000),None), + datetime_expected_rows = [(datetime(2020, 7, 27, 14, 22, 33, 673000),datetime(2020, 7, 27, 14, 22, 33, 673000),None), (datetime(2020, 7, 27, 14, 22, 33, 673000),datetime(2020, 7, 27, 14, 22, 33, 673000),None)] + date_dataframe_value_str = '[[date(2020, 7, 27),' \ + +'date(2020, 7, 27)],' \ + +'[date(2020, 7, 27),' \ + +'date(2020, 7, 27)]]' + date_expected_rows = [(date(2020, 7, 27),date(2020, 7, 27),None), + (date(2020, 7, 27),date(2020, 7, 27),None)] none_dataframe_value_str = "[[None, None],[None, None]]" none_expected_rows = [(None, None, None),(None, None, None)] - none_expected_rows_bool_ = [(False, False, None),(False, False, None)] nan_dataframe_value_str = "[[np.nan, np.nan],[np.nan, np.nan]]" - nan_float16_dataframe_value_str = "np.array([[np.nan, np.nan],[np.nan, np.nan]], dtype='float16')" nan_expected_rows = [(None, None, None),(None, None, None)] types = [ - ("uint8[pyarrow]", "integer", int_dataframe_value_str, int_expected_rows), - ("uint16[pyarrow]", "integer", 
int_dataframe_value_str, int_expected_rows), - ("uint32[pyarrow]", "integer", int_dataframe_value_str, int_expected_rows), - ("uint64[pyarrow]", "integer", int_dataframe_value_str, int_expected_rows), - ("int8[pyarrow]", "integer", int_dataframe_value_str, int_expected_rows), - ("int16[pyarrow]", "integer", int_dataframe_value_str, int_expected_rows), - ("int32[pyarrow]", "integer", int_dataframe_value_str, int_expected_rows), - ("int64[pyarrow]", "integer", int_dataframe_value_str, int_expected_rows), - ("float16[pyarrow]", "float", float16_dataframe_value_str, float_expected_rows), - ("float32[pyarrow]", "float", float_dataframe_value_str, float_expected_rows), - ("float64[pyarrow]", "float", float_dataframe_value_str, float_expected_rows), - ("halffloat[pyarrow]", "float", float16_dataframe_value_str, float_expected_rows), - ("float[pyarrow]", "float", float_dataframe_value_str, float_expected_rows), - ("double[pyarrow]", "float", float_dataframe_value_str, float_expected_rows), - ("string[pyarrow]", "VARCHAR(2000000)", str_dataframe_value_str, str_expected_rows), - ("bool[pyarrow]", "boolean", bool_dataframe_value_str, bool_expected_rows), + ("dtype='uint8[pyarrow]'", "integer", int_dataframe_value_str, int_expected_rows, False), + ("dtype='uint16[pyarrow]'", "integer", int_dataframe_value_str, int_expected_rows, False), + ("dtype='uint32[pyarrow]'", "integer", int_dataframe_value_str, int_expected_rows, False), + ("dtype='uint64[pyarrow]'", "integer", int_dataframe_value_str, int_expected_rows, False), + ("dtype='int8[pyarrow]'", "integer", int_dataframe_value_str, int_expected_rows, False), + ("dtype='int16[pyarrow]'", "integer", int_dataframe_value_str, int_expected_rows, False), + ("dtype='int32[pyarrow]'", "integer", int_dataframe_value_str, int_expected_rows, False), + ("dtype='int64[pyarrow]'", "integer", int_dataframe_value_str, int_expected_rows, False), + + ("dtype='float16[pyarrow]'", "double", float16_dataframe_value_str, float_expected_rows, True), 
+ ("dtype='float32[pyarrow]'", "double", float_dataframe_value_str, float_expected_rows, True), + ("dtype='float64[pyarrow]'", "double", float_dataframe_value_str, float_expected_rows, False), + ("dtype='halffloat[pyarrow]'", "double", float16_dataframe_value_str, float_expected_rows, True), + ("dtype='float[pyarrow]'", "double", float_dataframe_value_str, float_expected_rows, True), + ("dtype='double[pyarrow]'", "double", float_dataframe_value_str, float_expected_rows, False), + + ("dtype='string[pyarrow]'", "VARCHAR(2000000)", str_dataframe_value_str, str_expected_rows, False), + + ("dtype='bool[pyarrow]'", "boolean", bool_dataframe_value_str, bool_expected_rows, False), + + ("dtype=pd.ArrowDtype(pa.timestamp('ns','UTC'))", "timestamp", datetime_dataframe_value_str, datetime_expected_rows, False), + ("dtype=pd.ArrowDtype(pa.decimal128(3, scale=2))", "DECIMAL(10,5)", decimal_dataframe_value_str, decimal_expected_rows, False), + #df = pd.DataFrame([[datetime.date(2012,1,1),None],[None,None]], dtype=pd.ArrowDtype(pa.date32())) can't be created at the moment, because it fails with "AttributeError: 'ArrowDtype' object has no attribute 'tz'" and pa.date32() doesn't accept a timezone + #df = pd.DataFrame([[datetime.date(2012,1,1),None],[None,None]], dtype=pd.ArrowDtype(pa.date64())) can't be created at the moment, because it fails with "AttributeError: 'ArrowDtype' object has no attribute 'tz'" and pa.date32() doesn't accept a timezone + - ("uint8[pyarrow]", "integer", none_dataframe_value_str, none_expected_rows), - ("uint16[pyarrow]", "integer", none_dataframe_value_str, none_expected_rows), - ("uint32[pyarrow]", "integer", none_dataframe_value_str, none_expected_rows), - ("uint64[pyarrow]", "integer", none_dataframe_value_str, none_expected_rows), - ("int8[pyarrow]", "integer", none_dataframe_value_str, none_expected_rows), - ("int16[pyarrow]", "integer", none_dataframe_value_str, none_expected_rows), - ("int32[pyarrow]", "integer", none_dataframe_value_str, 
none_expected_rows), - ("int64[pyarrow]", "integer", none_dataframe_value_str, none_expected_rows), - ("float16[pyarrow]", "float", none_dataframe_value_str, none_expected_rows), - ("float32[pyarrow]", "float", none_dataframe_value_str, none_expected_rows), - ("float64[pyarrow]", "float", none_dataframe_value_str, none_expected_rows), - ("halffloat[pyarrow]", "float", none_dataframe_value_str, none_expected_rows), - ("float[pyarrow]", "float", none_dataframe_value_str, none_expected_rows), - ("double[pyarrow]", "float", none_dataframe_value_str, none_expected_rows), - ("string[pyarrow]", "VARCHAR(2000000)", none_dataframe_value_str, none_expected_rows), - ("bool[pyarrow]", "boolean", none_dataframe_value_str, none_expected_rows), - - ("uint8[pyarrow]", "integer", nan_dataframe_value_str, nan_expected_rows), - ("uint16[pyarrow]", "integer", nan_dataframe_value_str, nan_expected_rows), - ("uint32[pyarrow]", "integer", nan_dataframe_value_str, nan_expected_rows), - ("uint64[pyarrow]", "integer", nan_dataframe_value_str, nan_expected_rows), - ("int8[pyarrow]", "integer", nan_dataframe_value_str, nan_expected_rows), - ("int16[pyarrow]", "integer", nan_dataframe_value_str, nan_expected_rows), - ("int32[pyarrow]", "integer", nan_dataframe_value_str, nan_expected_rows), - ("int64[pyarrow]", "integer", nan_dataframe_value_str, nan_expected_rows), - ("float16[pyarrow]", "float", nan_dataframe_value_str, ".*pyarrow.lib.ArrowNotImplementedError: Unsupported cast from double to halffloat using function cast_half_float.*"), - ("float32[pyarrow]", "float", nan_dataframe_value_str, nan_expected_rows), - ("float64[pyarrow]", "float", nan_dataframe_value_str, nan_expected_rows), - ("halffloat[pyarrow]", "float", nan_dataframe_value_str, ".*pyarrow.lib.ArrowNotImplementedError: Unsupported cast from double to halffloat using function cast_half_float.*"), - ("float[pyarrow]", "float", nan_dataframe_value_str, nan_expected_rows), - ("double[pyarrow]", "float", nan_dataframe_value_str, 
nan_expected_rows), - ("string[pyarrow]", "VARCHAR(2000000)", nan_dataframe_value_str, nan_expected_rows), - ("bool[pyarrow]", "boolean", nan_dataframe_value_str, nan_expected_rows), + # Int To Double + + ("dtype='uint8[pyarrow]'", "double", int_dataframe_value_str, int_to_float_expected_rows, False), + ("dtype='uint16[pyarrow]'", "double", int_dataframe_value_str, int_to_float_expected_rows, False), + ("dtype='uint32[pyarrow]'", "double", int_dataframe_value_str, int_to_float_expected_rows, False), + ("dtype='uint64[pyarrow]'", "double", int_dataframe_value_str, int_to_float_expected_rows, False), + ("dtype='int8[pyarrow]'", "double", int_dataframe_value_str, int_to_float_expected_rows, False), + ("dtype='int16[pyarrow]'", "double", int_dataframe_value_str, int_to_float_expected_rows, False), + ("dtype='int32[pyarrow]'", "double", int_dataframe_value_str, int_to_float_expected_rows, False), + ("dtype='int64[pyarrow]'", "double", int_dataframe_value_str, int_to_float_expected_rows, False), + + # Float to Int + + ("dtype='float16[pyarrow]'", "integer", float16_dataframe_value_str, int_expected_rows, False), + ("dtype='float32[pyarrow]'", "integer", float_dataframe_value_str, int_expected_rows, False), + ("dtype='float64[pyarrow]'", "integer", float_dataframe_value_str, int_expected_rows, False), + ("dtype='halffloat[pyarrow]'", "integer", float16_dataframe_value_str, int_expected_rows, False), + ("dtype='float[pyarrow]'", "integer", float_dataframe_value_str, int_expected_rows, False), + ("dtype='double[pyarrow]'", "integer", float_dataframe_value_str, int_expected_rows, False), + + # None + + ("dtype='uint8[pyarrow]'", "integer", none_dataframe_value_str, none_expected_rows, False), + ("dtype='uint16[pyarrow]'", "integer", none_dataframe_value_str, none_expected_rows, False), + ("dtype='uint32[pyarrow]'", "integer", none_dataframe_value_str, none_expected_rows, False), + ("dtype='uint64[pyarrow]'", "integer", none_dataframe_value_str, none_expected_rows, False), + 
("dtype='int8[pyarrow]'", "integer", none_dataframe_value_str, none_expected_rows, False), + ("dtype='int16[pyarrow]'", "integer", none_dataframe_value_str, none_expected_rows, False), + ("dtype='int32[pyarrow]'", "integer", none_dataframe_value_str, none_expected_rows, False), + ("dtype='int64[pyarrow]'", "integer", none_dataframe_value_str, none_expected_rows, False), + + ("dtype='float16[pyarrow]'", "float", none_dataframe_value_str, none_expected_rows, False), + ("dtype='float32[pyarrow]'", "float", none_dataframe_value_str, none_expected_rows, False), + ("dtype='float64[pyarrow]'", "float", none_dataframe_value_str, none_expected_rows, False), + ("dtype='halffloat[pyarrow]'", "float", none_dataframe_value_str, none_expected_rows, False), + ("dtype='float[pyarrow]'", "float", none_dataframe_value_str, none_expected_rows, False), + ("dtype='double[pyarrow]'", "float", none_dataframe_value_str, none_expected_rows, False), + + ("dtype='string[pyarrow]'", "VARCHAR(2000000)", none_dataframe_value_str, none_expected_rows, False), + + ("dtype='bool[pyarrow]'", "boolean", none_dataframe_value_str, none_expected_rows, False), + + ("dtype=pd.ArrowDtype(pa.timestamp('ns','UTC'))", "timestamp", none_dataframe_value_str, none_expected_rows, False), + ("dtype=pd.ArrowDtype(pa.decimal128(3, scale=2))", "DECIMAL(10,5)", none_dataframe_value_str, none_expected_rows, False), + + # NaN + + ("dtype='uint8[pyarrow]'", "integer", nan_dataframe_value_str, nan_expected_rows, False), + ("dtype='uint16[pyarrow]'", "integer", nan_dataframe_value_str, nan_expected_rows, False), + ("dtype='uint32[pyarrow]'", "integer", nan_dataframe_value_str, nan_expected_rows, False), + ("dtype='uint64[pyarrow]'", "integer", nan_dataframe_value_str, nan_expected_rows, False), + ("dtype='int8[pyarrow]'", "integer", nan_dataframe_value_str, nan_expected_rows, False), + ("dtype='int16[pyarrow]'", "integer", nan_dataframe_value_str, nan_expected_rows, False), + ("dtype='int32[pyarrow]'", "integer", 
nan_dataframe_value_str, nan_expected_rows, False), + ("dtype='int64[pyarrow]'", "integer", nan_dataframe_value_str, nan_expected_rows, False), + + ("dtype='float16[pyarrow]'", "float", nan_dataframe_value_str, ".*pyarrow.lib.ArrowNotImplementedError: Unsupported cast from double to halffloat using function cast_half_float.*", False), + ("dtype='float32[pyarrow]'", "float", nan_dataframe_value_str, nan_expected_rows, False), + ("dtype='float64[pyarrow]'", "float", nan_dataframe_value_str, nan_expected_rows, False), + ("dtype='halffloat[pyarrow]'", "float", nan_dataframe_value_str, ".*pyarrow.lib.ArrowNotImplementedError: Unsupported cast from double to halffloat using function cast_half_float.*", False), + ("dtype='float[pyarrow]'", "float", nan_dataframe_value_str, nan_expected_rows, False), + ("dtype='double[pyarrow]'", "float", nan_dataframe_value_str, nan_expected_rows, False), + + ("dtype='string[pyarrow]'", "VARCHAR(2000000)", nan_dataframe_value_str, nan_expected_rows, False), + + ("dtype='bool[pyarrow]'", "boolean", nan_dataframe_value_str, nan_expected_rows, False), + + #("dtype=pd.ArrowDtype(pa.timestamp('ns','UTC'))", "timestamp", nan_dataframe_value_str, nan_expected_rows, False), # Dateframe creation fails with: pyarrow.lib.ArrowNotImplementedError: Unsupported cast from double to timestamp using function cast_timestamp + ("dtype=pd.ArrowDtype(pa.decimal128(3, scale=2))", "DECIMAL(10,5)", nan_dataframe_value_str, nan_expected_rows, False), ] @useData(types) - def test_pyarrow_dtype_emit(self, dtype:str, sql_type:str, dataframe_value_str:str, expected_result:Union[str,List[Tuple]]): + def test_dtype_emit(self, dtype_definition:str, sql_type:str, dataframe_value_str:str, expected_result:Union[str,List[Tuple]], use_almost_equal:bool): sql=udf.fixindent(f''' CREATE OR REPLACE PYTHON3 SET SCRIPT test_dtype_emit(i integer) EMITS (o1 {sql_type}, o2 {sql_type}, traceback varchar(2000000)) AS @@ -129,8 +180,10 @@ def run(ctx): from decimal import Decimal import 
pandas as pd import numpy as np - from datetime import datetime - df = pd.DataFrame({dataframe_value_str}, dtype="{dtype}") + import pyarrow as pa + from datetime import datetime, date + {dtype_definition} + df = pd.DataFrame({dataframe_value_str}, dtype=dtype) df["traceback"]=None ctx.emit(df) except: @@ -145,7 +198,29 @@ def run(ctx): if isinstance(expected_result,str): self.assertRegex(rows[0][2], expected_result) else: - self.assertRowsEqual(expected_result, rows) + if use_almost_equal: + self.assertRowsAlmostEqual(expected_result, rows, places=1) + else: + self.assertRowsEqual(expected_result, rows) + + def isValueAlmostEqual(self, left, right, places): + if isinstance(left, (float, Decimal)) and isinstance(right, (float, Decimal)): + return round(left, places) == round(right, places) + else: + return left == right + + def isRowAlmostEqual(self, left, right, places): + if len(left) != len(right): + return False + all_values_almost_equal = all(self.isValueAlmostEqual(lvalue, rvalue, places) + for lvalue, rvalue in zip(left, right)) + return all_values_almost_equal + + def assertRowsAlmostEqual(self, left, right, places): + lrows = [tuple(x) for x in left] + rrows = [tuple(x) for x in right] + if len(lrows) != len(rrows): + raise AssertionError(f'{lrows} and {rrows} have different number of rows.') if __name__ == '__main__': udf.main() From 1309c76663eb6389a061105d75fe9f8957c76659 Mon Sep 17 00:00:00 2001 From: Torsten Kilias Date: Wed, 24 May 2023 11:03:44 +0200 Subject: [PATCH 07/11] Change astimezone to tz_localize to also handle timestamps without timezone and started to add mixed value/none tests --- .../python/python3/python_ext_dataframe.cc | 2 +- .../tests/test/pandas/all/emit_dtypes.py | 101 +++++++++++++++--- 2 files changed, 87 insertions(+), 16 deletions(-) diff --git a/exaudfclient/base/python/python3/python_ext_dataframe.cc b/exaudfclient/base/python/python3/python_ext_dataframe.cc index 5afb964a4..613fbc2eb 100644 --- 
a/exaudfclient/base/python/python3/python_ext_dataframe.cc +++ b/exaudfclient/base/python/python3/python_ext_dataframe.cc @@ -1093,7 +1093,7 @@ inline void handleEmitPyTimestamp( switch (colInfo[c].type) { case SWIGVMContainers::TIMESTAMP: { - pyTimestamp.reset(PyObject_CallMethod(pyTimestamp.get(), "astimezone", "z", NULL)); + pyTimestamp.reset(PyObject_CallMethod(pyTimestamp.get(), "tz_localize", "z", NULL)); PyPtr pyIsoDatetime(PyObject_CallMethod(pyTimestamp.get(), "isoformat", "s", " ")); pyResult.reset(PyObject_CallMethodObjArgs( resultHandler, pyColSetMethods[c].second.get(), pyColSetMethods[c].first.get(), pyIsoDatetime.get(), NULL)); diff --git a/test_container/tests/test/pandas/all/emit_dtypes.py b/test_container/tests/test/pandas/all/emit_dtypes.py index 39ba55b70..8e2c90712 100755 --- a/test_container/tests/test/pandas/all/emit_dtypes.py +++ b/test_container/tests/test/pandas/all/emit_dtypes.py @@ -52,6 +52,36 @@ def setUp(self): date_expected_rows = [(date(2020, 7, 27),date(2020, 7, 27),None), (date(2020, 7, 27),date(2020, 7, 27),None)] + mixed_int_dataframe_value_str = "[[1, None],[None, 4]]" + mixed_int_expected_rows = [(1, None, None),(None, 4, None)] + mixed_int_to_float_expected_rows = [(1.0, None, None),(None, 4.0, None)] + + mixed_float16_dataframe_value_str = 'np.array([[1.1, None],[None, 4.1]], dtype="float16")' + mixed_float_dataframe_value_str = "[[1.1, None],[None, 4.1]]" + mixed_float_expected_rows = [(1.1, None, None),(None, 4.1, None)] + + mixed_str_dataframe_value_str = "[['a',None],[None,'d']]" + mixed_str_expected_rows = [('a',None,None),(None,'d',None)] + + mixed_bool_dataframe_value_str = "[[True,None],[None,False]]" + mixed_bool_expected_rows = [(True,None,None),(None,False,None)] + mixed_bool_expected_rows_bool_ = [(True, False, None),(False, False, None)] + + mixed_decimal_dataframe_value_str = "[[Decimal('1.1'),None],[None,Decimal('4.1')]]" + mixed_decimal_expected_rows = [(Decimal('1.1'),None,None),(None,Decimal('4.1'),None)] + 
mixed_int_to_decimal_expected_rows = [(Decimal('1'),None,None),(None,Decimal('4'),None)] + + mixed_timestamp_dataframe_value_str = '[[pd.Timestamp(datetime(2020, 7, 27, 14, 22, 33, 673251)),None],' \ + +'[None,pd.Timestamp(datetime(2020, 7, 27, 14, 22, 33, 673251))]]' + mixed_datetime_dataframe_value_str = '[[datetime(2020, 7, 27, 14, 22, 33, 673251),None],' \ + +'[None,datetime(2020, 7, 27, 14, 22, 33, 673251)]]' + mixed_datetime_expected_rows = [(datetime(2020, 7, 27, 14, 22, 33, 673000),None,None), + (None,datetime(2020, 7, 27, 14, 22, 33, 673000),None)] + mixed_date_dataframe_value_str = '[[date(2020, 7, 27),None],' \ + +'[None,date(2020, 7, 27)]]' + mixed_date_expected_rows = [(date(2020, 7, 27),None,None), + (None,date(2020, 7, 27),None)] + none_dataframe_value_str = "[[None, None],[None, None]]" none_expected_rows = [(None, None, None),(None, None, None)] none_expected_rows_bool_ = [(False, False, None),(False, False, None)] @@ -63,6 +93,8 @@ def setUp(self): types = [ + # Full columns without None or NaN + ("uint8", "integer", int_dataframe_value_str, int_expected_rows, False), ("uint16", "integer", int_dataframe_value_str, int_expected_rows, False), ("uint32", "integer", int_dataframe_value_str, int_expected_rows, False), @@ -72,7 +104,7 @@ def setUp(self): ("int32", "integer", int_dataframe_value_str, int_expected_rows, False), ("int64", "integer", int_dataframe_value_str, int_expected_rows, False), ("object", "integer", int_dataframe_value_str, int_expected_rows, False), - + ("float16", "double", float16_dataframe_value_str, float_expected_rows, True), ("float32", "double", float_dataframe_value_str, float_expected_rows, True), ("float64", "double", float_dataframe_value_str, float_expected_rows, False), @@ -106,19 +138,19 @@ def setUp(self): ("int32", "DECIMAL(10,5)", int_dataframe_value_str, int_to_decimal_expected_rows, False), ("int64", "DECIMAL(10,5)", int_dataframe_value_str, int_to_decimal_expected_rows, False), ("object", "DECIMAL(10,5)", 
int_dataframe_value_str, int_to_decimal_expected_rows, False), - + ("float16", "DECIMAL(10,5)", float16_dataframe_value_str, decimal_expected_rows, True), ("float32", "DECIMAL(10,5)", float_dataframe_value_str, decimal_expected_rows, True), ("float64", "DECIMAL(10,5)", float_dataframe_value_str, decimal_expected_rows, True), ("float", "DECIMAL(10,5)", float_dataframe_value_str, decimal_expected_rows, True), ("double", "DECIMAL(10,5)", float_dataframe_value_str, decimal_expected_rows, True), ("object", "DECIMAL(10,5)", float_dataframe_value_str, decimal_expected_rows, True), - + ("object", "DECIMAL(10,5)", decimal_dataframe_value_str, decimal_expected_rows, False), - + ("string", "VARCHAR(2000000)", str_dataframe_value_str, str_expected_rows, False), ("object", "VARCHAR(2000000)", str_dataframe_value_str, str_expected_rows, False), - + ("bool_", "boolean", bool_dataframe_value_str, bool_expected_rows, False), ("boolean", "boolean", bool_dataframe_value_str, bool_expected_rows, False), ("object", "boolean", bool_dataframe_value_str, bool_expected_rows, False), @@ -129,12 +161,52 @@ def setUp(self): ("object", "timestamp", date_dataframe_value_str, ".*F-UDF-CL-SL-PYTHON-1071: emit column 0 of type TIMESTAMP but data given have type py_datetime.date.*", False), ("object", "DATE", date_dataframe_value_str, date_expected_rows, False), + # Mixed columns with values and None + + ("object", "integer", mixed_int_dataframe_value_str, mixed_int_expected_rows, False), + + ("float16", "double", mixed_float16_dataframe_value_str, mixed_float_expected_rows, True), + ("float32", "double", mixed_float_dataframe_value_str, mixed_float_expected_rows, True), + ("float64", "double", mixed_float_dataframe_value_str, mixed_float_expected_rows, False), + ("float", "double", mixed_float_dataframe_value_str, mixed_float_expected_rows, False), + ("double", "double", mixed_float_dataframe_value_str, mixed_float_expected_rows, False), + ("object", "double", mixed_float_dataframe_value_str, 
mixed_float_expected_rows, False), + + ("float16", "integer", mixed_float16_dataframe_value_str, mixed_int_expected_rows, False), + ("float32", "integer", mixed_float_dataframe_value_str, mixed_int_expected_rows, False), + ("float64", "integer", mixed_float_dataframe_value_str, mixed_int_expected_rows, False), + ("float", "integer", mixed_float_dataframe_value_str, mixed_int_expected_rows, False), + ("double", "integer", mixed_float_dataframe_value_str, mixed_int_expected_rows, False), + ("object", "integer", mixed_float_dataframe_value_str, mixed_int_expected_rows, False), + + ("object", "DECIMAL(10,5)", mixed_int_dataframe_value_str, mixed_int_to_decimal_expected_rows, False), + + ("float16", "DECIMAL(10,5)", mixed_float16_dataframe_value_str, mixed_decimal_expected_rows, True), + ("float32", "DECIMAL(10,5)", mixed_float_dataframe_value_str, mixed_decimal_expected_rows, True), + ("float64", "DECIMAL(10,5)", mixed_float_dataframe_value_str, mixed_decimal_expected_rows, True), + ("float", "DECIMAL(10,5)", mixed_float_dataframe_value_str, mixed_decimal_expected_rows, True), + ("double", "DECIMAL(10,5)", mixed_float_dataframe_value_str, mixed_decimal_expected_rows, True), + ("object", "DECIMAL(10,5)", mixed_float_dataframe_value_str, mixed_decimal_expected_rows, True), + + ("object", "DECIMAL(10,5)", mixed_decimal_dataframe_value_str, mixed_decimal_expected_rows, False), + + ("string", "VARCHAR(2000000)", mixed_str_dataframe_value_str, mixed_str_expected_rows, False), + ("object", "VARCHAR(2000000)", mixed_str_dataframe_value_str, mixed_str_expected_rows, False), + + ("bool_", "boolean", mixed_bool_dataframe_value_str, mixed_bool_expected_rows_bool_, False), + ("boolean", "boolean", mixed_bool_dataframe_value_str, mixed_bool_expected_rows, False), + ("object", "boolean", mixed_bool_dataframe_value_str, mixed_bool_expected_rows, False), + + ("datetime64[ns]", "timestamp", mixed_timestamp_dataframe_value_str, mixed_datetime_expected_rows, False), + ("object", 
"timestamp", mixed_timestamp_dataframe_value_str, mixed_datetime_expected_rows, False), + ("object", "DATE", mixed_date_dataframe_value_str, mixed_date_expected_rows, False), + #(u)int-dtypes don't support None or np.nan # None ("object", "integer", none_dataframe_value_str, none_expected_rows, False), - + ("float16", "double", none_dataframe_value_str, none_expected_rows, False), ("float32", "double", none_dataframe_value_str, none_expected_rows, False), ("float64", "double", none_dataframe_value_str, none_expected_rows, False), @@ -147,7 +219,7 @@ def setUp(self): ("float64", "integer", none_dataframe_value_str, none_expected_rows, False), ("float", "integer", none_dataframe_value_str, none_expected_rows, False), ("double", "integer", none_dataframe_value_str, none_expected_rows, False), - + ("float16", "DECIMAL(10,5)", none_dataframe_value_str, none_expected_rows, False), ("float32", "DECIMAL(10,5)", none_dataframe_value_str, none_expected_rows, False), ("float64", "DECIMAL(10,5)", none_dataframe_value_str, none_expected_rows, False), @@ -158,19 +230,19 @@ def setUp(self): ("string", "VARCHAR(2000000)", none_dataframe_value_str, none_expected_rows, False), ("object", "VARCHAR(2000000)", none_dataframe_value_str, none_expected_rows, False), - + ("bool_", "boolean", none_dataframe_value_str, none_expected_rows_bool_, False), ("boolean", "boolean", none_dataframe_value_str, none_expected_rows, False), ("object", "boolean", none_dataframe_value_str, none_expected_rows, False), - + ("datetime64[ns]", "timestamp", none_dataframe_value_str, none_expected_rows, False), ("object", "timestamp", none_dataframe_value_str, none_expected_rows, False), ("object", "DATE", none_dataframe_value_str, none_expected_rows, False), # NaN - + ("object", "integer", nan_dataframe_value_str, nan_expected_rows, False), - + ("float16", "double", nan_dataframe_value_str, nan_expected_rows, False), ("float32", "double", nan_dataframe_value_str, nan_expected_rows, False), ("float64", "double", 
nan_dataframe_value_str, nan_expected_rows, False), @@ -183,7 +255,7 @@ def setUp(self): ("float64", "integer", nan_dataframe_value_str, nan_expected_rows, False), ("float", "integer", nan_dataframe_value_str, nan_expected_rows, False), ("double", "integer", nan_dataframe_value_str, nan_expected_rows, False), - + ("float16", "DECIMAL(10,5)", nan_dataframe_value_str, nan_expected_rows, False), ("float32", "DECIMAL(10,5)", nan_dataframe_value_str, nan_expected_rows, False), ("float64", "DECIMAL(10,5)", nan_dataframe_value_str, nan_expected_rows, False), @@ -194,16 +266,15 @@ def setUp(self): ("string", "VARCHAR(2000000)", nan_dataframe_value_str, nan_expected_rows, False), ("object", "VARCHAR(2000000)", nan_dataframe_value_str, ".*PYTHON-1068: emit column 0 of type STRING but data given have type py_float.*", False), - + ("bool_", "boolean", nan_dataframe_value_str, nan_expected_rows_bool_, False), ("boolean", "boolean", nan_dataframe_value_str, nan_expected_rows, False), ("object", "boolean", nan_dataframe_value_str, ".*F-UDF-CL-SL-PYTHON-1068: emit column 0 of type BOOLEAN but data given have type py_float.*", False), ("datetime64[ns]", "timestamp", nan_dataframe_value_str, nan_expected_rows, False), - ("object", "timestamp", nan_dataframe_value_str, ".*F-UDF-CL-SL-PYTHON-1068: emit column 0 of type DATE but data given have type py_float.*", False), + ("object", "timestamp", nan_dataframe_value_str, ".*F-UDF-CL-SL-PYTHON-1068: emit column 0 of type TIMESTAMP but data given have type py_float.*", False), ("object", "DATE", nan_dataframe_value_str, ".*F-UDF-CL-SL-PYTHON-1068: emit column 0 of type DATE but data given have type py_float.*", False), - # TODO mixed nan/none with values ] @useData(types) From 08451329558a2682d34ef0445905ecb9c10d76a7 Mon Sep 17 00:00:00 2001 From: Torsten Kilias Date: Wed, 24 May 2023 14:04:52 +0200 Subject: [PATCH 08/11] Add remaining mixed value/none tests --- .../tests/test/pandas/all/emit_dtypes.py | 41 ++++- 
.../tests/test/pandas/pandas2/pandas.py | 157 ++++++++++++++++-- 2 files changed, 183 insertions(+), 15 deletions(-) diff --git a/test_container/tests/test/pandas/all/emit_dtypes.py b/test_container/tests/test/pandas/all/emit_dtypes.py index 8e2c90712..774e0bc57 100755 --- a/test_container/tests/test/pandas/all/emit_dtypes.py +++ b/test_container/tests/test/pandas/all/emit_dtypes.py @@ -93,7 +93,7 @@ def setUp(self): types = [ - # Full columns without None or NaN + # Full columns without None or NaN / Int ("uint8", "integer", int_dataframe_value_str, int_expected_rows, False), ("uint16", "integer", int_dataframe_value_str, int_expected_rows, False), @@ -105,6 +105,8 @@ def setUp(self): ("int64", "integer", int_dataframe_value_str, int_expected_rows, False), ("object", "integer", int_dataframe_value_str, int_expected_rows, False), + # Full columns without None or NaN / Float + ("float16", "double", float16_dataframe_value_str, float_expected_rows, True), ("float32", "double", float_dataframe_value_str, float_expected_rows, True), ("float64", "double", float_dataframe_value_str, float_expected_rows, False), @@ -112,6 +114,8 @@ def setUp(self): ("double", "double", float_dataframe_value_str, float_expected_rows, False), ("object", "double", float_dataframe_value_str, float_expected_rows, False), + # Full columns without None or NaN / Int to Float + ("uint8", "double", int_dataframe_value_str, int_to_float_expected_rows, False), ("uint16", "double", int_dataframe_value_str, int_to_float_expected_rows, False), ("uint32", "double", int_dataframe_value_str, int_to_float_expected_rows, False), @@ -122,6 +126,8 @@ def setUp(self): ("int64", "double", int_dataframe_value_str, int_to_float_expected_rows, False), ("object", "double", int_dataframe_value_str, int_to_float_expected_rows, False), + # Full columns without None or NaN / Float to Int + ("float16", "integer", float16_dataframe_value_str, int_expected_rows, False), ("float32", "integer", float_dataframe_value_str, 
int_expected_rows, False), ("float64", "integer", float_dataframe_value_str, int_expected_rows, False), @@ -129,6 +135,8 @@ def setUp(self): ("double", "integer", float_dataframe_value_str, int_expected_rows, False), ("object", "integer", float_dataframe_value_str, int_expected_rows, False), + # Full columns without None or NaN / Int to Decimal + ("uint8", "DECIMAL(10,5)", int_dataframe_value_str, int_to_decimal_expected_rows, False), ("uint16", "DECIMAL(10,5)", int_dataframe_value_str, int_to_decimal_expected_rows, False), ("uint32", "DECIMAL(10,5)", int_dataframe_value_str, int_to_decimal_expected_rows, False), @@ -139,32 +147,46 @@ def setUp(self): ("int64", "DECIMAL(10,5)", int_dataframe_value_str, int_to_decimal_expected_rows, False), ("object", "DECIMAL(10,5)", int_dataframe_value_str, int_to_decimal_expected_rows, False), + # Full columns without None or NaN / Float to Decimal + ("float16", "DECIMAL(10,5)", float16_dataframe_value_str, decimal_expected_rows, True), ("float32", "DECIMAL(10,5)", float_dataframe_value_str, decimal_expected_rows, True), ("float64", "DECIMAL(10,5)", float_dataframe_value_str, decimal_expected_rows, True), ("float", "DECIMAL(10,5)", float_dataframe_value_str, decimal_expected_rows, True), ("double", "DECIMAL(10,5)", float_dataframe_value_str, decimal_expected_rows, True), ("object", "DECIMAL(10,5)", float_dataframe_value_str, decimal_expected_rows, True), + + # Full columns without None or NaN / Decimal ("object", "DECIMAL(10,5)", decimal_dataframe_value_str, decimal_expected_rows, False), + # Full columns without None or NaN / String + ("string", "VARCHAR(2000000)", str_dataframe_value_str, str_expected_rows, False), ("object", "VARCHAR(2000000)", str_dataframe_value_str, str_expected_rows, False), + # Full columns without None or NaN / Boolean + ("bool_", "boolean", bool_dataframe_value_str, bool_expected_rows, False), ("boolean", "boolean", bool_dataframe_value_str, bool_expected_rows, False), ("object", "boolean", 
bool_dataframe_value_str, bool_expected_rows, False), + # Full columns without None or NaN / Date and Time + ("datetime64[ns]", "timestamp", timestamp_dataframe_value_str, datetime_expected_rows, False), ("object", "timestamp", timestamp_dataframe_value_str, datetime_expected_rows, False), ("object", "timestamp", datetime_dataframe_value_str, ".*F-UDF-CL-SL-PYTHON-1056.*unexpected python type: py_datetime.datetime.*", False), ("object", "timestamp", date_dataframe_value_str, ".*F-UDF-CL-SL-PYTHON-1071: emit column 0 of type TIMESTAMP but data given have type py_datetime.date.*", False), ("object", "DATE", date_dataframe_value_str, date_expected_rows, False), - # Mixed columns with values and None + # Mixed columns with values and None / Int + + #(u)int-dtypes don't support None or np.nan ("object", "integer", mixed_int_dataframe_value_str, mixed_int_expected_rows, False), + # Mixed columns with values and None / Float + ("float16", "double", mixed_float16_dataframe_value_str, mixed_float_expected_rows, True), ("float32", "double", mixed_float_dataframe_value_str, mixed_float_expected_rows, True), ("float64", "double", mixed_float_dataframe_value_str, mixed_float_expected_rows, False), @@ -172,6 +194,7 @@ def setUp(self): ("double", "double", mixed_float_dataframe_value_str, mixed_float_expected_rows, False), ("object", "double", mixed_float_dataframe_value_str, mixed_float_expected_rows, False), + # Mixed columns with values and None / Float to Int ("float16", "integer", mixed_float16_dataframe_value_str, mixed_int_expected_rows, False), ("float32", "integer", mixed_float_dataframe_value_str, mixed_int_expected_rows, False), ("float64", "integer", mixed_float_dataframe_value_str, mixed_int_expected_rows, False), @@ -179,8 +202,12 @@ def setUp(self): ("double", "integer", mixed_float_dataframe_value_str, mixed_int_expected_rows, False), ("object", "integer", mixed_float_dataframe_value_str, mixed_int_expected_rows, False), + # Mixed columns with values and None / 
Int to Decimal + ("object", "DECIMAL(10,5)", mixed_int_dataframe_value_str, mixed_int_to_decimal_expected_rows, False), + # Mixed columns with values and None / Float to Decimal + ("float16", "DECIMAL(10,5)", mixed_float16_dataframe_value_str, mixed_decimal_expected_rows, True), ("float32", "DECIMAL(10,5)", mixed_float_dataframe_value_str, mixed_decimal_expected_rows, True), ("float64", "DECIMAL(10,5)", mixed_float_dataframe_value_str, mixed_decimal_expected_rows, True), @@ -188,21 +215,27 @@ def setUp(self): ("double", "DECIMAL(10,5)", mixed_float_dataframe_value_str, mixed_decimal_expected_rows, True), ("object", "DECIMAL(10,5)", mixed_float_dataframe_value_str, mixed_decimal_expected_rows, True), + # Mixed columns with values and None / Decimal + ("object", "DECIMAL(10,5)", mixed_decimal_dataframe_value_str, mixed_decimal_expected_rows, False), + # Mixed columns with values and None / String + ("string", "VARCHAR(2000000)", mixed_str_dataframe_value_str, mixed_str_expected_rows, False), ("object", "VARCHAR(2000000)", mixed_str_dataframe_value_str, mixed_str_expected_rows, False), + # Mixed columns with values and None / Boolean + ("bool_", "boolean", mixed_bool_dataframe_value_str, mixed_bool_expected_rows_bool_, False), ("boolean", "boolean", mixed_bool_dataframe_value_str, mixed_bool_expected_rows, False), ("object", "boolean", mixed_bool_dataframe_value_str, mixed_bool_expected_rows, False), + # Mixed columns with values and None / Date and time + ("datetime64[ns]", "timestamp", mixed_timestamp_dataframe_value_str, mixed_datetime_expected_rows, False), ("object", "timestamp", mixed_timestamp_dataframe_value_str, mixed_datetime_expected_rows, False), ("object", "DATE", mixed_date_dataframe_value_str, mixed_date_expected_rows, False), - #(u)int-dtypes don't support None or np.nan - # None ("object", "integer", none_dataframe_value_str, none_expected_rows, False), diff --git a/test_container/tests/test/pandas/pandas2/pandas.py
b/test_container/tests/test/pandas/pandas2/pandas.py index 479d4d9c9..802edd422 100755 --- a/test_container/tests/test/pandas/pandas2/pandas.py +++ b/test_container/tests/test/pandas/pandas2/pandas.py @@ -65,6 +65,35 @@ def run(ctx): date_expected_rows = [(date(2020, 7, 27),date(2020, 7, 27),None), (date(2020, 7, 27),date(2020, 7, 27),None)] + mixed_int_dataframe_value_str = "[[1, None],[None, 4]]" + mixed_int_expected_rows = [(1, None, None),(None, 4, None)] + mixed_int_to_float_expected_rows = [(1.0, None, None),(None, 4.0, None)] + + mixed_float16_dataframe_value_str = 'np.array([[1.1, None],[None, 4.1]], dtype="float16")' + mixed_float_dataframe_value_str = "[[1.1, None],[None, 4.1]]" + mixed_float_expected_rows = [(1.1, None, None),(None, 4.1, None)] + + mixed_str_dataframe_value_str = "[['a',None],[None,'d']]" + mixed_str_expected_rows = [('a',None,None),(None,'d',None)] + + mixed_bool_dataframe_value_str = "[[True,None],[None,False]]" + mixed_bool_expected_rows = [(True,None,None),(None,False,None)] + + mixed_decimal_dataframe_value_str = "[[Decimal('1.1'),None],[None,Decimal('4.1')]]" + mixed_decimal_expected_rows = [(Decimal('1.1'),None,None),(None,Decimal('4.1'),None)] + mixed_int_to_decimal_expected_rows = [(Decimal('1'),None,None),(None,Decimal('4'),None)] + + mixed_timestamp_dataframe_value_str = '[[pd.Timestamp(datetime(2020, 7, 27, 14, 22, 33, 673251)),None],' \ + +'[None,pd.Timestamp(datetime(2020, 7, 27, 14, 22, 33, 673251))]]' + mixed_datetime_dataframe_value_str = '[[datetime(2020, 7, 27, 14, 22, 33, 673251),None],' \ + +'[None,datetime(2020, 7, 27, 14, 22, 33, 673251)]]' + mixed_datetime_expected_rows = [(datetime(2020, 7, 27, 14, 22, 33, 673000),None,None), + (None,datetime(2020, 7, 27, 14, 22, 33, 673000),None)] + mixed_date_dataframe_value_str = '[[date(2020, 7, 27),None],' \ + +'[None,date(2020, 7, 27)]]' + mixed_date_expected_rows = [(date(2020, 7, 27),None,None), + (None,date(2020, 7, 27),None)] + none_dataframe_value_str = "[[None, 
None],[None, None]]" none_expected_rows = [(None, None, None),(None, None, None)] @@ -72,6 +101,8 @@ def run(ctx): nan_expected_rows = [(None, None, None),(None, None, None)] types = [ + # Full columns without None or NaN / Int + ("dtype='uint8[pyarrow]'", "integer", int_dataframe_value_str, int_expected_rows, False), ("dtype='uint16[pyarrow]'", "integer", int_dataframe_value_str, int_expected_rows, False), ("dtype='uint32[pyarrow]'", "integer", int_dataframe_value_str, int_expected_rows, False), @@ -81,6 +112,8 @@ def run(ctx): ("dtype='int32[pyarrow]'", "integer", int_dataframe_value_str, int_expected_rows, False), ("dtype='int64[pyarrow]'", "integer", int_dataframe_value_str, int_expected_rows, False), + # Full columns without None or NaN / Float + ("dtype='float16[pyarrow]'", "double", float16_dataframe_value_str, float_expected_rows, True), ("dtype='float32[pyarrow]'", "double", float_dataframe_value_str, float_expected_rows, True), ("dtype='float64[pyarrow]'", "double", float_dataframe_value_str, float_expected_rows, False), @@ -88,17 +121,30 @@ def run(ctx): ("dtype='float[pyarrow]'", "double", float_dataframe_value_str, float_expected_rows, True), ("dtype='double[pyarrow]'", "double", float_dataframe_value_str, float_expected_rows, False), - ("dtype='string[pyarrow]'", "VARCHAR(2000000)", str_dataframe_value_str, str_expected_rows, False), + # Full columns without None or NaN / Decimal - ("dtype='bool[pyarrow]'", "boolean", bool_dataframe_value_str, bool_expected_rows, False), - - ("dtype=pd.ArrowDtype(pa.timestamp('ns','UTC'))", "timestamp", datetime_dataframe_value_str, datetime_expected_rows, False), ("dtype=pd.ArrowDtype(pa.decimal128(3, scale=2))", "DECIMAL(10,5)", decimal_dataframe_value_str, decimal_expected_rows, False), - #df = pd.DataFrame([[datetime.date(2012,1,1),None],[None,None]], dtype=pd.ArrowDtype(pa.date32())) can't be created at the moment, because it fails with "AttributeError: 'ArrowDtype' object has no attribute 'tz'" and pa.date32() 
doesn't accept a timezone - #df = pd.DataFrame([[datetime.date(2012,1,1),None],[None,None]], dtype=pd.ArrowDtype(pa.date64())) can't be created at the moment, because it fails with "AttributeError: 'ArrowDtype' object has no attribute 'tz'" and pa.date32() doesn't accept a timezone - - - # Int To Double + # Full columns without None or NaN / Int to Decimal + + ("dtype='uint8[pyarrow]'", "DECIMAL(10,5)", int_dataframe_value_str, int_to_decimal_expected_rows, False), + ("dtype='uint16[pyarrow]'", "DECIMAL(10,5)", int_dataframe_value_str, int_to_decimal_expected_rows, False), + ("dtype='uint32[pyarrow]'", "DECIMAL(10,5)", int_dataframe_value_str, int_to_decimal_expected_rows, False), + ("dtype='uint64[pyarrow]'", "DECIMAL(10,5)", int_dataframe_value_str, int_to_decimal_expected_rows, False), + ("dtype='int8[pyarrow]'", "DECIMAL(10,5)", int_dataframe_value_str, int_to_decimal_expected_rows, False), + ("dtype='int16[pyarrow]'", "DECIMAL(10,5)", int_dataframe_value_str, int_to_decimal_expected_rows, False), + ("dtype='int32[pyarrow]'", "DECIMAL(10,5)", int_dataframe_value_str, int_to_decimal_expected_rows, False), + ("dtype='int64[pyarrow]'", "DECIMAL(10,5)", int_dataframe_value_str, int_to_decimal_expected_rows, False), + + # Full columns without None or NaN / Float to Decimal + + ("dtype='float16[pyarrow]'", "DECIMAL(10,5)", float16_dataframe_value_str, decimal_expected_rows, True), + ("dtype='float32[pyarrow]'", "DECIMAL(10,5)", float_dataframe_value_str, decimal_expected_rows, False), + ("dtype='float64[pyarrow]'", "DECIMAL(10,5)", float_dataframe_value_str, decimal_expected_rows, False), + ("dtype='halffloat[pyarrow]'", "DECIMAL(10,5)", float16_dataframe_value_str, decimal_expected_rows, True), + ("dtype='float[pyarrow]'", "DECIMAL(10,5)", float_dataframe_value_str, decimal_expected_rows, False), + ("dtype='double[pyarrow]'", "DECIMAL(10,5)", float_dataframe_value_str, decimal_expected_rows, False), + + # Full columns without None or NaN / Int To Double 
("dtype='uint8[pyarrow]'", "double", int_dataframe_value_str, int_to_float_expected_rows, False), ("dtype='uint16[pyarrow]'", "double", int_dataframe_value_str, int_to_float_expected_rows, False), @@ -109,7 +155,7 @@ def run(ctx): ("dtype='int32[pyarrow]'", "double", int_dataframe_value_str, int_to_float_expected_rows, False), ("dtype='int64[pyarrow]'", "double", int_dataframe_value_str, int_to_float_expected_rows, False), - # Float to Int + # Full columns without None or NaN / Float to Int ("dtype='float16[pyarrow]'", "integer", float16_dataframe_value_str, int_expected_rows, False), ("dtype='float32[pyarrow]'", "integer", float_dataframe_value_str, int_expected_rows, False), @@ -117,9 +163,98 @@ def run(ctx): ("dtype='halffloat[pyarrow]'", "integer", float16_dataframe_value_str, int_expected_rows, False), ("dtype='float[pyarrow]'", "integer", float_dataframe_value_str, int_expected_rows, False), ("dtype='double[pyarrow]'", "integer", float_dataframe_value_str, int_expected_rows, False), + + # Full columns without None or NaN / String + + ("dtype='string[pyarrow]'", "VARCHAR(2000000)", str_dataframe_value_str, str_expected_rows, False), + + # Full columns without None or NaN / Boolean + + ("dtype='bool[pyarrow]'", "boolean", bool_dataframe_value_str, bool_expected_rows, False), + + # Full columns without None or NaN / Date and time + + ("dtype=pd.ArrowDtype(pa.timestamp('ns','UTC'))", "timestamp", datetime_dataframe_value_str, datetime_expected_rows, False), + #df = pd.DataFrame([[datetime.date(2012,1,1),None],[None,None]], dtype=pd.ArrowDtype(pa.date32())) can't be created at the moment, because it fails with "AttributeError: 'ArrowDtype' object has no attribute 'tz'" and pa.date32() doesn't accept a timezone + #df = pd.DataFrame([[datetime.date(2012,1,1),None],[None,None]], dtype=pd.ArrowDtype(pa.date64())) can't be created at the moment, because it fails with "AttributeError: 'ArrowDtype' object has no attribute 'tz'" and pa.date32() doesn't accept a timezone + 
+ # Mixed columns with values and None / Int + + ("dtype='uint8[pyarrow]'", "integer", mixed_int_dataframe_value_str, mixed_int_expected_rows, False), + ("dtype='uint16[pyarrow]'", "integer", mixed_int_dataframe_value_str, mixed_int_expected_rows, False), + ("dtype='uint32[pyarrow]'", "integer", mixed_int_dataframe_value_str, mixed_int_expected_rows, False), + ("dtype='uint64[pyarrow]'", "integer", mixed_int_dataframe_value_str, mixed_int_expected_rows, False), + ("dtype='int8[pyarrow]'", "integer", mixed_int_dataframe_value_str, mixed_int_expected_rows, False), + ("dtype='int16[pyarrow]'", "integer", mixed_int_dataframe_value_str, mixed_int_expected_rows, False), + ("dtype='int32[pyarrow]'", "integer", mixed_int_dataframe_value_str, mixed_int_expected_rows, False), + ("dtype='int64[pyarrow]'", "integer", mixed_int_dataframe_value_str, mixed_int_expected_rows, False), + + # Mixed columns with values and None / Float + + ("dtype='float16[pyarrow]'", "double", mixed_float16_dataframe_value_str, mixed_float_expected_rows, True), + ("dtype='float32[pyarrow]'", "double", mixed_float_dataframe_value_str, mixed_float_expected_rows, True), + ("dtype='float64[pyarrow]'", "double", mixed_float_dataframe_value_str, mixed_float_expected_rows, False), + ("dtype='halffloat[pyarrow]'", "double", mixed_float16_dataframe_value_str, mixed_float_expected_rows, True), + ("dtype='float[pyarrow]'", "double", mixed_float_dataframe_value_str, mixed_float_expected_rows, True), + ("dtype='double[pyarrow]'", "double", mixed_float_dataframe_value_str, mixed_float_expected_rows, False), + + # Mixed columns with values and None / Decimal + + ("dtype=pd.ArrowDtype(pa.decimal128(3, scale=2))", "DECIMAL(10,5)", mixed_decimal_dataframe_value_str, mixed_decimal_expected_rows, False), + # Mixed columns with values and None / Int to Decimal + + ("dtype='uint8[pyarrow]'", "DECIMAL(10,5)", mixed_int_dataframe_value_str, mixed_int_to_decimal_expected_rows, False), + ("dtype='uint16[pyarrow]'", 
"DECIMAL(10,5)", mixed_int_dataframe_value_str, mixed_int_to_decimal_expected_rows, False), + ("dtype='uint32[pyarrow]'", "DECIMAL(10,5)", mixed_int_dataframe_value_str, mixed_int_to_decimal_expected_rows, False), + ("dtype='uint64[pyarrow]'", "DECIMAL(10,5)", mixed_int_dataframe_value_str, mixed_int_to_decimal_expected_rows, False), + ("dtype='int8[pyarrow]'", "DECIMAL(10,5)", mixed_int_dataframe_value_str, mixed_int_to_decimal_expected_rows, False), + ("dtype='int16[pyarrow]'", "DECIMAL(10,5)", mixed_int_dataframe_value_str, mixed_int_to_decimal_expected_rows, False), + ("dtype='int32[pyarrow]'", "DECIMAL(10,5)", mixed_int_dataframe_value_str, mixed_int_to_decimal_expected_rows, False), + ("dtype='int64[pyarrow]'", "DECIMAL(10,5)", mixed_int_dataframe_value_str, mixed_int_to_decimal_expected_rows, False), + + # Mixed columns with values and None / Float to Decimal + + ("dtype='float16[pyarrow]'", "DECIMAL(10,5)", mixed_float16_dataframe_value_str, mixed_decimal_expected_rows, True), + ("dtype='float32[pyarrow]'", "DECIMAL(10,5)", mixed_float_dataframe_value_str, mixed_decimal_expected_rows, False), + ("dtype='float64[pyarrow]'", "DECIMAL(10,5)", mixed_float_dataframe_value_str, mixed_decimal_expected_rows, False), + ("dtype='halffloat[pyarrow]'", "DECIMAL(10,5)", mixed_float16_dataframe_value_str, mixed_decimal_expected_rows, True), + ("dtype='float[pyarrow]'", "DECIMAL(10,5)", mixed_float_dataframe_value_str, mixed_decimal_expected_rows, False), + ("dtype='double[pyarrow]'", "DECIMAL(10,5)", mixed_float_dataframe_value_str, mixed_decimal_expected_rows, False), + # Mixed columns with values and None / Int To Double + + ("dtype='uint8[pyarrow]'", "double", mixed_int_dataframe_value_str, mixed_int_to_float_expected_rows, False), + ("dtype='uint16[pyarrow]'", "double", mixed_int_dataframe_value_str, mixed_int_to_float_expected_rows, False), + ("dtype='uint32[pyarrow]'", "double", mixed_int_dataframe_value_str, mixed_int_to_float_expected_rows, False), + 
("dtype='uint64[pyarrow]'", "double", mixed_int_dataframe_value_str, mixed_int_to_float_expected_rows, False), + ("dtype='int8[pyarrow]'", "double", mixed_int_dataframe_value_str, mixed_int_to_float_expected_rows, False), + ("dtype='int16[pyarrow]'", "double", mixed_int_dataframe_value_str, mixed_int_to_float_expected_rows, False), + ("dtype='int32[pyarrow]'", "double", mixed_int_dataframe_value_str, mixed_int_to_float_expected_rows, False), + ("dtype='int64[pyarrow]'", "double", mixed_int_dataframe_value_str, mixed_int_to_float_expected_rows, False), + + # Mixed columns with values and None / Float to Int + + ("dtype='float16[pyarrow]'", "integer", mixed_float16_dataframe_value_str, mixed_int_expected_rows, False), + ("dtype='float32[pyarrow]'", "integer", mixed_float_dataframe_value_str, mixed_int_expected_rows, False), + ("dtype='float64[pyarrow]'", "integer", mixed_float_dataframe_value_str, mixed_int_expected_rows, False), + ("dtype='halffloat[pyarrow]'", "integer", mixed_float16_dataframe_value_str, mixed_int_expected_rows, False), + ("dtype='float[pyarrow]'", "integer", mixed_float_dataframe_value_str, mixed_int_expected_rows, False), + ("dtype='double[pyarrow]'", "integer", mixed_float_dataframe_value_str, mixed_int_expected_rows, False), + + # Mixed columns with values and None / String + + ("dtype='string[pyarrow]'", "VARCHAR(2000000)", mixed_str_dataframe_value_str, mixed_str_expected_rows, False), + + # Mixed columns with values and None / Boolean + + ("dtype='bool[pyarrow]'", "boolean", mixed_bool_dataframe_value_str, mixed_bool_expected_rows, False), + + # Mixed columns with values and None / Date and time + + ("dtype=pd.ArrowDtype(pa.timestamp('ns','UTC'))", "timestamp", mixed_datetime_dataframe_value_str, mixed_datetime_expected_rows, False), + # None - + ("dtype='uint8[pyarrow]'", "integer", none_dataframe_value_str, none_expected_rows, False), ("dtype='uint16[pyarrow]'", "integer", none_dataframe_value_str, none_expected_rows, False), 
("dtype='uint32[pyarrow]'", "integer", none_dataframe_value_str, none_expected_rows, False), From 444e8e245bdc277c4df5f8a465ff5a59fd34fde8 Mon Sep 17 00:00:00 2001 From: Torsten Kilias Date: Thu, 25 May 2023 10:18:47 +0200 Subject: [PATCH 09/11] Fix duplicate error codes --- exaudfclient/base/python/python3/python_ext_dataframe.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/exaudfclient/base/python/python3/python_ext_dataframe.cc b/exaudfclient/base/python/python3/python_ext_dataframe.cc index 613fbc2eb..7d889cc8e 100644 --- a/exaudfclient/base/python/python3/python_ext_dataframe.cc +++ b/exaudfclient/base/python/python3/python_ext_dataframe.cc @@ -931,7 +931,7 @@ inline void handleEmitPyFloat( { double value = PyFloat_AsDouble(pyFloat.get()); if (value < 0 && PyErr_Occurred()) - throw std::runtime_error("F-UDF-CL-SL-PYTHON-1067: emit() PY_FLOAT: PyFloat_AsDouble error"); + throw std::runtime_error("F-UDF-CL-SL-PYTHON-1139: emit() PY_FLOAT: PyFloat_AsDouble error"); if (npy_isnan(value)) { pyResult.reset( PyObject_CallMethodObjArgs(resultHandler, pySetNullMethodName.get(), pyColSetMethods[c].first.get(), NULL)); @@ -949,7 +949,7 @@ inline void handleEmitPyFloat( default: { std::stringstream ss; - ss << "F-UDF-CL-SL-PYTHON-1068: emit column " << c << " of type " << emitTypeMap.at(colInfo[c].type) << " but data given have type " << colTypes[c].first; + ss << "F-UDF-CL-SL-PYTHON-1140: emit column " << c << " of type " << emitTypeMap.at(colInfo[c].type) << " but data given have type " << colTypes[c].first; throw std::runtime_error(ss.str().c_str()); } } @@ -1102,7 +1102,7 @@ inline void handleEmitPyTimestamp( default: { std::stringstream ss; - ss << "F-UDF-CL-SL-PYTHON-1071: emit column " << c << " of type " << emitTypeMap.at(colInfo[c].type) << " but data given have type " << colTypes[c].first; + ss << "F-UDF-CL-SL-PYTHON-1141: emit column " << c << " of type " << emitTypeMap.at(colInfo[c].type) << " but data given have type " << 
colTypes[c].first; throw std::runtime_error(ss.str().c_str()); } } From 324f1695b479dac1280a0268ff730332ccd59024 Mon Sep 17 00:00:00 2001 From: Torsten Kilias Date: Wed, 31 May 2023 09:08:41 +0200 Subject: [PATCH 10/11] Apply review suggestions --- .../python/python3/python_ext_dataframe.cc | 33 ++++++++++++++++--- 1 file changed, 29 insertions(+), 4 deletions(-) diff --git a/exaudfclient/base/python/python3/python_ext_dataframe.cc b/exaudfclient/base/python/python3/python_ext_dataframe.cc index 7d889cc8e..36f0fffdb 100644 --- a/exaudfclient/base/python/python3/python_ext_dataframe.cc +++ b/exaudfclient/base/python/python3/python_ext_dataframe.cc @@ -101,6 +101,7 @@ std::map numpyCTypeToNumpyDTypeStrMap { {NPY_UINT16, "uint16"}, {NPY_UINT32, "uint32"}, {NPY_UINT64, "uint64"}, + // We don't list NPY_FLOAT16 here, because we let numpy convert float16 to float (32 bit) and then use the C conversion from float to double, because a proper conversion from float16 to double in C is very complicated. 
{NPY_FLOAT32, "float32"}, {NPY_FLOAT64, "float64"}, }; @@ -442,13 +443,28 @@ inline void getColumnSetMethods(std::vector& colInfo, std::vector>& colTypes){ @@ -484,8 +500,14 @@ inline void printPyObject(PyObject* obj, const std::string& error_code){ DBG_STREAM_MSG(std::cerr, error_code << ": " << std::string(s) << " " << std::string(p)); } +inline const PyPtr& getPandasNA(){ + static const PyPtr pdNA(PyObject_GetAttrString(pandasModule.get(), "NA")); + return pdNA; +} + inline bool isNoneOrNA(PyObject* pyVal){ - return pyVal == Py_None || std::string(Py_TYPE(pyVal)->tp_name) == "NAType"; + const PyPtr& pdNA = getPandasNA(); + return pyVal == Py_None || pyVal == pdNA.get(); } inline void getColumnArrays(PyObject *colArray, int numCols, int numRows, @@ -556,7 +578,7 @@ inline void getColumnArrays(PyObject *colArray, int numCols, int numRows, PyPtr asType (PyObject_GetAttrString(array.get(), "astype")); PyPtr keywordArgs(PyDict_New()); PyDict_SetItemString(keywordArgs.get(), "copy", Py_False); - std::string numpyDTypeStr = numpyCTypeToNumpyDTypeStrMap.at(colTypes[c].second); + const std::string numpyDTypeStr = numpyCTypeToNumpyDTypeStrMap.at(colTypes[c].second); PyPtr funcArgs(Py_BuildValue("(s)", numpyDTypeStr.c_str())); PyPtr scalarArr(PyObject_Call(asType.get(), funcArgs.get(), keywordArgs.get())); columnArrays.push_back(std::move(scalarArr)); @@ -1093,6 +1115,9 @@ inline void handleEmitPyTimestamp( switch (colInfo[c].type) { case SWIGVMContainers::TIMESTAMP: { + // We call here pandas.Timestamp.tz_localize(None), because we need to remove the timezone from the timestamp. + // Exasol doesn't support timezones, and if we don't remove the timezone, pandas.Timestamp.isoformat will add + // it to the generated string. 
pyTimestamp.reset(PyObject_CallMethod(pyTimestamp.get(), "tz_localize", "z", NULL)); PyPtr pyIsoDatetime(PyObject_CallMethod(pyTimestamp.get(), "isoformat", "s", " ")); pyResult.reset(PyObject_CallMethodObjArgs( From 1770e00440200f54016ddde697f38961f042a742 Mon Sep 17 00:00:00 2001 From: Torsten Kilias Date: Thu, 1 Jun 2023 09:35:10 +0200 Subject: [PATCH 11/11] Update to pandas 2.0.2 --- .../flavor_base/language_deps/packages/python3_pip_packages | 2 +- .../flavor_base/language_deps/packages/conda_packages | 2 +- .../flavor_base/language_deps/packages/conda_packages | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/flavors/python-3.8-minimal-EXASOL-6.2.0/flavor_base/language_deps/packages/python3_pip_packages b/flavors/python-3.8-minimal-EXASOL-6.2.0/flavor_base/language_deps/packages/python3_pip_packages index 3d6464a6e..827abf566 100644 --- a/flavors/python-3.8-minimal-EXASOL-6.2.0/flavor_base/language_deps/packages/python3_pip_packages +++ b/flavors/python-3.8-minimal-EXASOL-6.2.0/flavor_base/language_deps/packages/python3_pip_packages @@ -1,3 +1,3 @@ -pandas|2.0.1 +pandas|2.0.2 numpy|1.24.3 pyarrow|12.0.0 diff --git a/flavors/template-Exasol-all-python-3.8-conda/flavor_base/language_deps/packages/conda_packages b/flavors/template-Exasol-all-python-3.8-conda/flavor_base/language_deps/packages/conda_packages index 0a755b9f8..465ae1c0a 100644 --- a/flavors/template-Exasol-all-python-3.8-conda/flavor_base/language_deps/packages/conda_packages +++ b/flavors/template-Exasol-all-python-3.8-conda/flavor_base/language_deps/packages/conda_packages @@ -1,6 +1,6 @@ python|3.8.13 numpy|1.24.3 -pandas|2.0.1 +pandas|2.0.2 libblas|3.9.0=15_linux64_mkl mamba|1.3.1 ld_impl_linux-64|2.36.1 diff --git a/flavors/template-Exasol-all-python-3.8-cuda-conda/flavor_base/language_deps/packages/conda_packages b/flavors/template-Exasol-all-python-3.8-cuda-conda/flavor_base/language_deps/packages/conda_packages index 0a755b9f8..465ae1c0a 100644 --- 
a/flavors/template-Exasol-all-python-3.8-cuda-conda/flavor_base/language_deps/packages/conda_packages +++ b/flavors/template-Exasol-all-python-3.8-cuda-conda/flavor_base/language_deps/packages/conda_packages @@ -1,6 +1,6 @@ python|3.8.13 numpy|1.24.3 -pandas|2.0.1 +pandas|2.0.2 libblas|3.9.0=15_linux64_mkl mamba|1.3.1 ld_impl_linux-64|2.36.1