vaexio · maartenbreddels · Jan 26, 2021 · Jan 25, 2021 · Jan 25, 2021 · Jan 26, 2021
diff --git a/ci/conda-env-nightlies.yml b/ci/conda-env-nightlies.yml
@@ -1 +1 @@
-# pyarrow
+pyarrow
diff --git a/packages/vaex-core/setup.py b/packages/vaex-core/setup.py
@@ -23,7 +23,7 @@
 install_requires_core = ["numpy>=1.16", "astropy>=2", "aplus", "tabulate>=0.8.3",
                          "future>=0.15.2", "pyyaml", "progressbar2", "psutil>=1.2.1",
                          "requests", "six", "cloudpickle", "pandas", "dask[array]",
-                         "nest-asyncio>=1.3.3", "pyarrow>=1.0", "frozendict",
+                         "nest-asyncio>=1.3.3", "pyarrow>=3.0", "frozendict",
                          "blake3"]
 if sys.version_info[0] == 2:
     install_requires_core.append("futures>=2.2.0")

diff --git a/packages/vaex-core/src/strings.cpp b/packages/vaex-core/src/strings.cpp
@@ -1913,6 +1913,46 @@ StringSequenceBase* StringListList::join(std::string sep) {
     return sl;
 }
 
+template<class T>
+T* join(std::string sep, py::array_t<typename T::index_type, py::array::c_style> offsets_list, T* input, int64_t offset=0) {
+    py::gil_scoped_release release;
+    int64_t list_length = offsets_list.size() - 1;
+    auto offsets = offsets_list.template mutable_unchecked<1>();
+    T* sl = new T(1, list_length);
+    char* target = sl->bytes;
+    size_t byte_offset;
+    for(int64_t i = 0; i < list_length; i++) {
+        byte_offset = target - sl->bytes;
+        sl->indices[i] = byte_offset;
+        int64_t i1 = offsets[i] - offset;
+        int64_t i2 = offsets[i+1] - offset;
+        size_t count = i2 - i1;
+        for(size_t j = 0; j < count; j++) {
+            auto str = input->get(i1 + j);
+            // make sure the buffer is large enough
+            while((byte_offset + str.length()) > sl->byte_length) {
+                sl->grow();
+                target = sl->bytes + byte_offset;
+            }
+            copy(str, target);
+            byte_offset = target - sl->bytes;
+            // copy separator
+            if(j < (count - 1)) {
+
+                while((byte_offset + sep.length()) > sl->byte_length) {
+                    sl->grow();
+                    target = sl->bytes + byte_offset;
+                }
+                copy(sep, target);
+                byte_offset = target - sl->bytes;
+            }
+        }
+    }
+    byte_offset = target - sl->bytes;
+    sl->indices[list_length] = byte_offset;
+    return sl;
+}
+
 template<class StringList, class Base, class Module>
 void add_string_list(Module m, Base& base, const char* class_name) {
 
@@ -2248,4 +2288,6 @@ PYBIND11_MODULE(superstrings, m) {
     m.def("format", &format<uint8_t>);
     m.def("format", &format<bool>);
     m.def("format", &format_string);
+    m.def("join", &join<StringList32>);
+    m.def("join", &join<StringList64>);
 }
diff --git a/packages/vaex-core/vaex/__init__.py b/packages/vaex-core/vaex/__init__.py
@@ -767,7 +767,7 @@ def dtype_of(ar):
     '''Creates a Vaex DataType from a NumPy or Arrow array'''
     if vaex.array_types.is_arrow_array(ar):
         return dtype(ar.type)
-    elif vaex.array_types.is_numpy_array(ar):
+    elif vaex.array_types.is_numpy_array(ar) or isinstance(ar, vaex.column.supported_column_types):
         return dtype(ar.dtype)
     else:
         raise TypeError(f'{ar} is not a an Arrow or NumPy array')
diff --git a/packages/vaex-core/vaex/arrow/convert.py b/packages/vaex-core/vaex/arrow/convert.py
@@ -289,3 +289,14 @@ def align(a, b):
     else:
         return a, b
     return
+
+
+def same_type(*arrays):
+    types = [ar.type for ar in arrays]
+    if any(types[0] != type for type in types):
+        if vaex.dtype(types[0]) == str:
+            # we have mixed large and normal string
+            return [large_string_to_string(ar) if ar.type == pa.large_string() else ar for ar in arrays]
+        else:
+            raise NotImplementedError
+    return arrays
diff --git a/packages/vaex-core/vaex/arrow/utils.py b/packages/vaex-core/vaex/arrow/utils.py
@@ -24,6 +24,7 @@ def wrapper(new_values):
                     buffers = buffers[:2]
                     buffers = trim_offsets(offset, length, *buffers)
                     offset = 0
+                    new_values = vaex.array_types.to_arrow(new_values)
                     type = pa.list_(new_values.type)
                     ar = pa.ListArray.from_buffers(type, length, [buffers[0], buffers[1]], null_count, offset, children=[new_values])
                 else:

diff --git a/packages/vaex-core/vaex/column.py b/packages/vaex-core/vaex/column.py
@@ -27,6 +27,9 @@ def to_arrow(self, type=None):
 
 supported_column_types = (Column, ) + supported_array_types
 
+def is_column_like(col):
+    return isinstance(col, supported_column_types)
+
 
 class ColumnVirtualRange(Column):
     def __init__(self, start, stop, step=1, dtype=None):
@@ -631,13 +634,16 @@ def trim(self, i1, i2):
                 references=references)
 
     @classmethod
-    def from_string_sequence(cls, string_sequence):
+    def from_string_sequence(cls, string_sequence, copy=False):
         s = string_sequence
         null_bitmap = s.null_bitmap
         if s.null_offset != 0:
             mask = ~s.mask()
             null_bitmap = np.frombuffer(pa.array(mask, pa.bool_()).buffers()[1], dtype='b')
-        return cls(s.indices, s.bytes, s.length, s.offset, string_sequence=s, null_bitmap=null_bitmap)
+        if copy:
+            return cls(s.indices.copy(), s.bytes.copy(), s.length, s.offset, string_sequence=s, null_bitmap=null_bitmap)
+        else:
+            return cls(s.indices, s.bytes, s.length, s.offset, string_sequence=s, null_bitmap=null_bitmap)
 
     @classmethod
     def from_arrow(cls, ar):

diff --git a/packages/vaex-core/vaex/dataframe.py b/packages/vaex-core/vaex/dataframe.py
@@ -478,7 +478,7 @@ def unique(self, expression, return_inverse=False, dropna=False, dropnan=False,
         :param dropmissing: do not count missing values
         :param dropnan: do not count nan values
         :param dropna: short for any of the above, (see :func:`Expression.isna`)
-        :param bool axis: Axis over which to determine the unique elements (None will flatten arrays or lists)
+        :param int axis: Axis over which to determine the unique elements (None will flatten arrays or lists)
         :param str array_type: {array_type}
         """
         if dropna: