Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion ci/conda-env-nightlies.yml
Original file line number Diff line number Diff line change
@@ -1 +1 @@
# pyarrow
pyarrow
2 changes: 1 addition & 1 deletion packages/vaex-core/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
install_requires_core = ["numpy>=1.16", "astropy>=2", "aplus", "tabulate>=0.8.3",
"future>=0.15.2", "pyyaml", "progressbar2", "psutil>=1.2.1",
"requests", "six", "cloudpickle", "pandas", "dask[array]",
"nest-asyncio>=1.3.3", "pyarrow>=1.0", "frozendict",
"nest-asyncio>=1.3.3", "pyarrow>=3.0", "frozendict",
"blake3"]
if sys.version_info[0] == 2:
install_requires_core.append("futures>=2.2.0")
Expand Down
42 changes: 42 additions & 0 deletions packages/vaex-core/src/strings.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1913,6 +1913,46 @@ StringSequenceBase* StringListList::join(std::string sep) {
return sl;
}

template<class T>
T* join(std::string sep, py::array_t<typename T::index_type, py::array::c_style> offsets_list, T* input, int64_t offset=0) {
py::gil_scoped_release release;
int64_t list_length = offsets_list.size() - 1;
auto offsets = offsets_list.template mutable_unchecked<1>();
T* sl = new T(1, list_length);
char* target = sl->bytes;
size_t byte_offset;
for(int64_t i = 0; i < list_length; i++) {
byte_offset = target - sl->bytes;
sl->indices[i] = byte_offset;
int64_t i1 = offsets[i] - offset;
int64_t i2 = offsets[i+1] - offset;
size_t count = i2 - i1;
for(size_t j = 0; j < count; j++) {
auto str = input->get(i1 + j);
// make sure the buffer is large enough
while((byte_offset + str.length()) > sl->byte_length) {
sl->grow();
target = sl->bytes + byte_offset;
}
copy(str, target);
byte_offset = target - sl->bytes;
// copy separator
if(j < (count - 1)) {

while((byte_offset + sep.length()) > sl->byte_length) {
sl->grow();
target = sl->bytes + byte_offset;
}
copy(sep, target);
byte_offset = target - sl->bytes;
}
}
}
byte_offset = target - sl->bytes;
sl->indices[list_length] = byte_offset;
return sl;
}

template<class StringList, class Base, class Module>
void add_string_list(Module m, Base& base, const char* class_name) {

Expand Down Expand Up @@ -2248,4 +2288,6 @@ PYBIND11_MODULE(superstrings, m) {
m.def("format", &format<uint8_t>);
m.def("format", &format<bool>);
m.def("format", &format_string);
m.def("join", &join<StringList32>);
m.def("join", &join<StringList64>);
}
2 changes: 1 addition & 1 deletion packages/vaex-core/vaex/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -767,7 +767,7 @@ def dtype_of(ar):
'''Creates a Vaex DataType from a NumPy or Arrow array'''
if vaex.array_types.is_arrow_array(ar):
return dtype(ar.type)
elif vaex.array_types.is_numpy_array(ar):
elif vaex.array_types.is_numpy_array(ar) or isinstance(ar, vaex.column.supported_column_types):
return dtype(ar.dtype)
else:
raise TypeError(f'{ar} is not a an Arrow or NumPy array')
11 changes: 11 additions & 0 deletions packages/vaex-core/vaex/arrow/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -289,3 +289,14 @@ def align(a, b):
else:
return a, b
return


def same_type(*arrays):
types = [ar.type for ar in arrays]
if any(types[0] != type for type in types):
if vaex.dtype(types[0]) == str:
# we have mixed large and normal string
return [large_string_to_string(ar) if ar.type == pa.large_string() else ar for ar in arrays]
else:
raise NotImplementedError
return arrays
1 change: 1 addition & 0 deletions packages/vaex-core/vaex/arrow/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ def wrapper(new_values):
buffers = buffers[:2]
buffers = trim_offsets(offset, length, *buffers)
offset = 0
new_values = vaex.array_types.to_arrow(new_values)
type = pa.list_(new_values.type)
ar = pa.ListArray.from_buffers(type, length, [buffers[0], buffers[1]], null_count, offset, children=[new_values])
else:
Expand Down
10 changes: 8 additions & 2 deletions packages/vaex-core/vaex/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,9 @@ def to_arrow(self, type=None):

supported_column_types = (Column, ) + supported_array_types

def is_column_like(col):
return isinstance(col, supported_column_types)


class ColumnVirtualRange(Column):
def __init__(self, start, stop, step=1, dtype=None):
Expand Down Expand Up @@ -631,13 +634,16 @@ def trim(self, i1, i2):
references=references)

@classmethod
def from_string_sequence(cls, string_sequence):
def from_string_sequence(cls, string_sequence, copy=False):
s = string_sequence
null_bitmap = s.null_bitmap
if s.null_offset != 0:
mask = ~s.mask()
null_bitmap = np.frombuffer(pa.array(mask, pa.bool_()).buffers()[1], dtype='b')
return cls(s.indices, s.bytes, s.length, s.offset, string_sequence=s, null_bitmap=null_bitmap)
if copy:
return cls(s.indices.copy(), s.bytes.copy(), s.length, s.offset, string_sequence=s, null_bitmap=null_bitmap)
else:
return cls(s.indices, s.bytes, s.length, s.offset, string_sequence=s, null_bitmap=null_bitmap)

@classmethod
def from_arrow(cls, ar):
Expand Down
2 changes: 1 addition & 1 deletion packages/vaex-core/vaex/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -478,7 +478,7 @@ def unique(self, expression, return_inverse=False, dropna=False, dropnan=False,
:param dropmissing: do not count missing values
:param dropnan: do not count nan values
:param dropna: short for any of the above, (see :func:`Expression.isna`)
:param bool axis: Axis over which to determine the unique elements (None will flatten arrays or lists)
:param int axis: Axis over which to determine the unique elements (None will flatten arrays or lists)
:param str array_type: {array_type}
"""
if dropna:
Expand Down
Loading