8 changes: 4 additions & 4 deletions CONTRIBUTING.md
@@ -1,5 +1,9 @@
# Contributing to duckdb-python

+## Setting up a development environment
+
+See the [instructions on duckdb.org](https://duckdb.org/docs/stable/dev/building/python).
+
## General Guidelines

### **Did you find a bug?**
@@ -39,7 +43,3 @@
### Testing cross-platform and cross-Python

* On your fork you can [run](https://docs.github.com/en/actions/using-workflows/manually-running-a-workflow#running-a-workflow) the Packaging workflow manually for any branch. You can choose whether to build for all platforms or a subset, and to either run the full testsuite, the fast tests only, or no tests at all.
-
-## Setting up a development environment
-
-See the [instructions on duckdb.org](https://duckdb.org/docs/stable/dev/building/python).
2 changes: 2 additions & 0 deletions _duckdb-stubs/__init__.pyi
@@ -721,6 +721,7 @@ class DuckDBPyRelation:
        write_partition_columns: bool | None = None,
        append: bool | None = None,
        filename_pattern: str | None = None,
+        file_size_bytes: str | int | None = None,
    ) -> None: ...
    def to_table(self, table_name: str) -> None: ...
    def to_view(self, view_name: str, replace: bool = True) -> DuckDBPyRelation: ...
@@ -774,6 +775,7 @@ class DuckDBPyRelation:
        write_partition_columns: bool | None = None,
        append: bool | None = None,
        filename_pattern: str | None = None,
+        file_size_bytes: str | int | None = None,
    ) -> None: ...
    @property
    def alias(self) -> str: ...
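
The stub changes above add an optional `file_size_bytes: str | int | None` keyword to the two updated signatures (presumably `to_parquet` and its `write_parquet` alias). A minimal usage sketch, assuming only what the stubs and the new tests below show; the output paths here are illustrative:

```python
import duckdb

rel = duckdb.sql("SELECT i AS col_a, i AS col_b FROM range(0, 10000) tbl(i)")

# An integer is a byte budget per output file; once a file would exceed it,
# the writer rolls over to a new .parquet file under the target path,
# as the new tests below verify.
rel.to_parquet("out_by_int", file_size_bytes=1000, row_group_size=2000)

# Human-readable size strings are accepted as well.
rel.to_parquet("out_by_str", file_size_bytes="256MB")
```
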
3 changes: 2 additions & 1 deletion src/duckdb_py/include/duckdb_python/pyrelation.hpp
@@ -214,7 +214,8 @@ struct DuckDBPyRelation {
const py::object &row_group_size = py::none(), const py::object &overwrite = py::none(),
const py::object &per_thread_output = py::none(), const py::object &use_tmp_file = py::none(),
const py::object &partition_by = py::none(), const py::object &write_partition_columns = py::none(),
-const py::object &append = py::none(), const py::object &filename_pattern = py::none());
+const py::object &append = py::none(), const py::object &filename_pattern = py::none(),
+const py::object &file_size_bytes = py::none());

void ToCSV(const string &filename, const py::object &sep = py::none(), const py::object &na_rep = py::none(),
const py::object &header = py::none(), const py::object &quotechar = py::none(),
13 changes: 12 additions & 1 deletion src/duckdb_py/pyrelation.cpp
@@ -1214,7 +1214,7 @@ void DuckDBPyRelation::ToParquet(const string &filename, const py::object &compr
const py::object &overwrite, const py::object &per_thread_output,
const py::object &use_tmp_file, const py::object &partition_by,
const py::object &write_partition_columns, const py::object &append,
-const py::object &filename_pattern) {
+const py::object &filename_pattern, const py::object &file_size_bytes) {
case_insensitive_map_t<vector<Value>> options;

if (!py::none().is(compression)) {
@@ -1312,6 +1312,17 @@
options["filename_pattern"] = {Value(py::str(filename_pattern))};
}

if (!py::none().is(file_size_bytes)) {
if (py::isinstance<py::int_>(file_size_bytes)) {
int64_t file_size_bytes_int = py::int_(file_size_bytes);
options["file_size_bytes"] = {Value(file_size_bytes_int)};
} else if (py::isinstance<py::str>(file_size_bytes)) {
options["file_size_bytes"] = {Value(py::str(file_size_bytes))};
} else {
throw InvalidInputException("to_parquet only accepts 'file_size_bytes' as an integer or string");
}
}

auto write_parquet = rel->WriteParquetRel(filename, std::move(options));
PyExecuteRelation(write_parquet);
}
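
The new branch converts a Python `int` to a `BIGINT` option value and passes a `str` through verbatim for the writer to parse (e.g. `'1k'`, `'256MB'`); anything else raises `InvalidInputException`. A sketch of the resulting Python-side behavior, assuming the exception surfaces as `duckdb.InvalidInputException` (file names are illustrative):

```python
import duckdb

rel = duckdb.sql("SELECT 1 AS x")

rel.to_parquet("by_int", file_size_bytes=2_000_000)  # int -> BIGINT option value
rel.to_parquet("by_str", file_size_bytes="2MB")      # str -> parsed by the writer

try:
    rel.to_parquet("bad", file_size_bytes=2.5)  # neither int nor str
except duckdb.InvalidInputException as exc:
    # Message from the new else branch:
    # "to_parquet only accepts 'file_size_bytes' as an integer or string"
    print(exc)
```
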
2 changes: 1 addition & 1 deletion src/duckdb_py/pyrelation/initialize.cpp
@@ -37,7 +37,7 @@ static void InitializeConsumers(py::class_<DuckDBPyRelation> &m) {
py::arg("overwrite") = py::none(), py::arg("per_thread_output") = py::none(),
py::arg("use_tmp_file") = py::none(), py::arg("partition_by") = py::none(),
py::arg("write_partition_columns") = py::none(), py::arg("append") = py::none(),
py::arg("filename_pattern") = py::none());
py::arg("filename_pattern") = py::none(), py::arg("file_size_bytes") = py::none());

DefineMethod(
{"to_csv", "write_csv"}, m, &DuckDBPyRelation::ToCSV, "Write the relation object to a CSV file in 'file_name'",
34 changes: 34 additions & 0 deletions tests/fast/api/test_to_parquet.py
@@ -225,3 +225,37 @@ def test_filename_pattern_with_uuid(self, pd):
result = duckdb.sql(f"FROM read_parquet('{temp_file_name}/*/*.parquet', hive_partitioning=TRUE)")
expected = [("rei", 321.0, "a"), ("shinji", 123.0, "a"), ("asuka", 23.0, "b"), ("kaworu", 340.0, "c")]
assert result.execute().fetchall() == expected

@pytest.mark.parametrize("file_size_bytes", [1000, "1k"])
def test_file_size_bytes_basic(self, file_size_bytes):
temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names())) # noqa: PTH118

# use same test data as external/duckdb/test/sql/copy/file_size_bytes.test
rel = duckdb.from_query("SELECT i AS col_a, i AS col_b FROM range(0,10000) tbl(i);")
rel.to_parquet(temp_file_name, file_size_bytes=file_size_bytes, row_group_size=2000)

# Check that multiple files were created
files = list(pathlib.Path(temp_file_name).iterdir())
assert len(files) > 1, f"Expected multiple files, got {len(files)}"

# Verify data integrity
result = duckdb.read_parquet(f"{temp_file_name}/*.parquet")
assert len(result.execute().fetchall()) == 10000

@pytest.mark.parametrize("pd", [NumpyPandas(), ArrowPandas()])
@pytest.mark.parametrize("file_size_bytes", ["256MB", "1G"])
def test_file_size_bytes_human_readable(self, pd, file_size_bytes):
temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names())) # noqa: PTH118
df = pd.DataFrame(
{
"name": ["rei", "shinji", "asuka", "kaworu"],
"float": [321.0, 123.0, 23.0, 340.0],
"category": ["a", "a", "b", "c"],
}
)
rel = duckdb.from_df(df)
rel.to_parquet(temp_file_name, file_size_bytes=file_size_bytes)

# With large file size limits, should create just one file
parquet_rel = duckdb.read_parquet(temp_file_name)
assert rel.execute().fetchall() == parquet_rel.execute().fetchall()
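
The two new tests mirror the COPY-level coverage in external/duckdb/test/sql/copy/file_size_bytes.test: small limits must split the output into several files while preserving all 10000 rows, and generous limits must round-trip the DataFrame unchanged. They can be run in isolation with pytest's keyword filter, for example `pytest tests/fast/api/test_to_parquet.py -k file_size_bytes`.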