diff --git a/.github/workflows/build_linux_arm64_wheels-gh.yml b/.github/workflows/build_linux_arm64_wheels-gh.yml
index ae75b70281b..5fd5c76fd83 100644
--- a/.github/workflows/build_linux_arm64_wheels-gh.yml
+++ b/.github/workflows/build_linux_arm64_wheels-gh.yml
@@ -117,7 +117,7 @@ jobs:
           echo "Installing dependencies for Python $version"
           pyenv shell $version
           python -m pip install --upgrade pip
-          python -m pip install setuptools tox pandas pyarrow twine psutil deltalake wheel
+          python -m pip install setuptools tox pandas pyarrow twine psutil deltalake wheel jupyter nbconvert
           pyenv shell --unset
         done
     - name: Upgrade Rust toolchain
@@ -281,6 +281,15 @@ jobs:
           pyenv shell --unset
         done
       continue-on-error: false
+    - name: Run notebook tests
+      run: |
+        export PATH="$HOME/.pyenv/bin:$PATH"
+        eval "$(pyenv init -)"
+        pyenv shell 3.8
+        python -m pip install dist/*.whl --force-reinstall
+        jupyter nbconvert --to notebook --execute tests/test_data_insertion.ipynb --output test_data_insertion_output.ipynb
+        pyenv shell --unset
+      continue-on-error: false
     - name: Check and upload core files if present
       if: always()
       run: |
diff --git a/.github/workflows/build_linux_x86_wheels.yml b/.github/workflows/build_linux_x86_wheels.yml
index 846ead19186..ccb0180fbcc 100644
--- a/.github/workflows/build_linux_x86_wheels.yml
+++ b/.github/workflows/build_linux_x86_wheels.yml
@@ -117,7 +117,7 @@ jobs:
           echo "Installing dependencies for Python $version"
           pyenv shell $version
           python -m pip install --upgrade pip
-          python -m pip install setuptools tox pandas pyarrow twine psutil deltalake wheel
+          python -m pip install setuptools tox pandas pyarrow twine psutil deltalake wheel jupyter nbconvert
           pyenv shell --unset
         done
     - name: Upgrade Rust toolchain
@@ -280,6 +280,15 @@ jobs:
           pyenv shell --unset
         done
       continue-on-error: false
+    - name: Run notebook tests
+      run: |
+        export PATH="$HOME/.pyenv/bin:$PATH"
+        eval "$(pyenv init -)"
+        pyenv shell 3.8
+        python -m pip install dist/*.whl --force-reinstall
+        jupyter nbconvert --to notebook --execute tests/test_data_insertion.ipynb --output test_data_insertion_output.ipynb
+        pyenv shell --unset
+      continue-on-error: false
     - name: Check and upload core files if present
       if: always()
       run: |
diff --git a/.github/workflows/build_macos_arm64_wheels.yml b/.github/workflows/build_macos_arm64_wheels.yml
index e5f20e6302d..4c7b24f1ac2 100644
--- a/.github/workflows/build_macos_arm64_wheels.yml
+++ b/.github/workflows/build_macos_arm64_wheels.yml
@@ -102,7 +102,7 @@ jobs:
           echo "Installing dependencies for Python $version"
           pyenv shell $version
           python -m pip install --upgrade pip
-          python -m pip install setuptools wheel tox pandas pyarrow twine psutil deltalake wheel>=0.40.0
+          python -m pip install setuptools wheel tox pandas pyarrow twine psutil deltalake wheel>=0.40.0 jupyter nbconvert
           pyenv shell --unset
         done
     - name: Remove /usr/local/bin/python3
@@ -276,6 +276,15 @@ jobs:
           pyenv shell --unset
         done
       continue-on-error: false
+    - name: Run notebook tests
+      run: |
+        export PATH="$HOME/.pyenv/bin:$PATH"
+        eval "$(pyenv init -)"
+        pyenv shell 3.8
+        python -m pip install dist/*.whl --force-reinstall
+        jupyter nbconvert --to notebook --execute tests/test_data_insertion.ipynb --output test_data_insertion_output.ipynb
+        pyenv shell --unset
+      continue-on-error: false
     - name: Check and upload core files if present
       if: always()
       run: |
diff --git a/.github/workflows/build_macos_x86_wheels.yml b/.github/workflows/build_macos_x86_wheels.yml
index 3017b44c1fd..e8ba7149307 100644
--- a/.github/workflows/build_macos_x86_wheels.yml
+++ b/.github/workflows/build_macos_x86_wheels.yml
@@ -91,7 +91,7 @@ jobs:
           echo "Installing dependencies for Python $version"
           pyenv shell $version
           python -m pip install --upgrade pip
-          python -m pip install setuptools tox pandas pyarrow twine psutil deltalake wheel>=0.40.0
+          python -m pip install setuptools tox pandas pyarrow twine psutil deltalake wheel>=0.40.0 jupyter nbconvert
           pyenv shell --unset
         done
     - name: Remove /usr/local/bin/python3
@@ -277,6 +277,15 @@ jobs:
           pyenv shell --unset
         done
       continue-on-error: false
+    - name: Run notebook tests
+      run: |
+        export PATH="$HOME/.pyenv/bin:$PATH"
+        eval "$(pyenv init -)"
+        pyenv shell 3.8
+        python -m pip install dist/*.whl --force-reinstall
+        jupyter nbconvert --to notebook --execute tests/test_data_insertion.ipynb --output test_data_insertion_output.ipynb
+        pyenv shell --unset
+      continue-on-error: false
     - name: Check and upload core files if present
       if: always()
       run: |
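
All four workflow changes follow the same pattern: jupyter and nbconvert join the per-version test dependencies, and a new "Run notebook tests" step installs the freshly built wheel under Python 3.8, then executes the test notebook headlessly. For reference, this is roughly what the `jupyter nbconvert --to notebook --execute` invocation does via nbconvert's programmatic API; a minimal sketch, where the timeout and working directory are assumed values, not taken from the workflows:

# Programmatic equivalent of the CI step's nbconvert CLI call (sketch only).
# Assumed values: the 600 s timeout and the tests/ working directory.
import nbformat
from nbconvert.preprocessors import ExecutePreprocessor

nb = nbformat.read("tests/test_data_insertion.ipynb", as_version=4)
ep = ExecutePreprocessor(timeout=600, kernel_name="python3")
ep.preprocess(nb, {"metadata": {"path": "tests/"}})  # raises CellExecutionError if any cell fails

with open("test_data_insertion_output.ipynb", "w", encoding="utf-8") as f:
    nbformat.write(nb, f)
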
diff --git a/src/Client/ClientBase.cpp b/src/Client/ClientBase.cpp
index ef6d6df68a0..4980f8d610f 100644
--- a/src/Client/ClientBase.cpp
+++ b/src/Client/ClientBase.cpp
@@ -1777,6 +1777,21 @@ bool isStdinNotEmptyAndValid(ReadBuffer & std_in)
 {
     try
     {
+        // Use non-blocking check for stdin to avoid hanging
+        if (auto * fd_buffer = typeid_cast<ReadBufferFromFileDescriptor *>(&std_in))
+        {
+            int fd = fd_buffer->getFD();
+            if (fd == STDIN_FILENO)
+            {
+                int flags = fcntl(fd, F_GETFL);
+                if (flags != -1)
+                {
+                    fcntl(fd, F_SETFL, flags | O_NONBLOCK);
+                    SCOPE_EXIT({ fcntl(fd, F_SETFL, flags); });
+                    return !std_in.eof();
+                }
+            }
+        }
         return !std_in.eof();
     }
     catch (const Exception & e)
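
The ClientBase.cpp change probes stdin without risking a hang: when the buffer wraps the real stdin descriptor, it flips the descriptor to O_NONBLOCK for the duration of the eof() check and restores the original flags on scope exit. The same fcntl dance, sketched in Python for illustration; unlike the C++ code, this probe consumes the byte it reads, so it demonstrates the flag handling rather than serving as a drop-in check:

# Python sketch of the non-blocking stdin probe used above (illustration only).
import fcntl
import os
import sys

def stdin_has_data() -> bool:
    fd = sys.stdin.fileno()
    flags = fcntl.fcntl(fd, fcntl.F_GETFL)                  # save current flags
    fcntl.fcntl(fd, fcntl.F_SETFL, flags | os.O_NONBLOCK)   # temporarily non-blocking
    try:
        return bool(os.read(fd, 1))  # b'' on EOF; any data means stdin is non-empty
    except BlockingIOError:
        return False                 # nothing pending: the read would have blocked
    finally:
        fcntl.fcntl(fd, fcntl.F_SETFL, flags)  # restore flags, like SCOPE_EXIT
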
diff --git a/tests/test_data_insertion.ipynb b/tests/test_data_insertion.ipynb
new file mode 100644
index 00000000000..20bbb87bae2
--- /dev/null
+++ b/tests/test_data_insertion.ipynb
@@ -0,0 +1,165 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "jupyter": {
+     "is_executing": true
+    }
+   },
+   "outputs": [],
+   "source": [
+    "from chdb import session\n",
+    "import time\n",
+    "import tempfile\n",
+    "import os\n",
+    "\n",
+    "print(\"Connecting to chdb session...\")\n",
+    "chs = session.Session()\n",
+    "\n",
+    "temp_csv = tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False)\n",
+    "temp_csv.write(\"movieId,embedding\\n\")  # Header\n",
+    "\n",
+    "# Generate 10,000 rows of test data\n",
+    "for i in range(1, 10001):\n",
+    "    embedding = [float(i + j * 0.1) for j in range(10)]\n",
+    "    embedding_str = '[' + ','.join(map(str, embedding)) + ']'\n",
+    "    temp_csv.write(f'{i},\"{embedding_str}\"\\n')\n",
+    "\n",
+    "temp_csv.close()\n",
+    "csv_path = temp_csv.name\n",
+    "\n",
+    "# Setup database and table\n",
+    "print(\"\\n=== Setup Phase ===\")\n",
+    "chs.query(\"CREATE DATABASE IF NOT EXISTS test ENGINE = Atomic\")\n",
+    "chs.query(\"USE test\")\n",
+    "chs.query('DROP TABLE IF EXISTS embeddings')\n",
+    "\n",
+    "chs.query(\"\"\"CREATE TABLE embeddings (\n",
+    "    movieId UInt32 NOT NULL,\n",
+    "    embedding Array(Float32) NOT NULL\n",
+    "    ) ENGINE = MergeTree()\n",
+    "    ORDER BY movieId\"\"\")\n",
+    "\n",
+    "# Test 1: INFILE insertion (10k rows)\n",
+    "print(\"\\n=== Test 1: INFILE Insertion (10k rows) ===\")\n",
+    "start_time = time.time()\n",
+    "try:\n",
+    "    result = chs.query(f\"INSERT INTO embeddings FROM INFILE '{csv_path}' FORMAT CSV\")\n",
+    "    infile_time = time.time() - start_time\n",
+    "    print(f\"✓ INFILE insertion successful! Time: {infile_time:.3f}s\")\n",
+    "    \n",
+    "    count = chs.query('SELECT COUNT(*) as count FROM embeddings')\n",
+    "    print(f\"Records inserted via INFILE: {count}\")\n",
+    "    \n",
+    "    if str(count).strip() != '0':\n",
+    "        print(\"Sample data from INFILE:\")\n",
+    "        sample = chs.query('SELECT movieId, embedding FROM embeddings ORDER BY movieId LIMIT 3')\n",
+    "        print(sample)\n",
+    "    \n",
+    "except Exception as e:\n",
+    "    print(f\"✗ INFILE insertion failed: {e}\")\n",
+    "    infile_time = 0\n",
+    "\n",
+    "# Test 2: Regular insertion (10 additional rows)\n",
+    "print(\"\\n=== Test 2: Regular VALUES Insertion (10 rows) ===\")\n",
+    "start_time = time.time()\n",
+    "try:\n",
+    "    # Insert 10 additional rows with movieId starting from 20001\n",
+    "    for i in range(20001, 20011):\n",
+    "        embedding = [float(i + j * 0.1) for j in range(10)]\n",
+    "        embedding_str = '[' + ','.join(map(str, embedding)) + ']'\n",
+    "        chs.query(f\"INSERT INTO embeddings VALUES ({i}, {embedding_str})\")\n",
+    "    \n",
+    "    values_time = time.time() - start_time\n",
+    "    print(f\"✓ VALUES insertion successful! Time: {values_time:.3f}s\")\n",
+    "    \n",
+    "except Exception as e:\n",
+    "    print(f\"✗ VALUES insertion failed: {e}\")\n",
+    "    values_time = 0\n",
+    "\n",
+    "# Test 3: Verify total count\n",
+    "print(\"\\n=== Test 3: Count Verification ===\")\n",
+    "try:\n",
+    "    total_count = chs.query('SELECT COUNT(*) as total FROM embeddings')\n",
+    "    print(f\"Total records in embeddings table: {total_count}\")\n",
+    "    \n",
+    "    # Count by range\n",
+    "    infile_count = chs.query('SELECT COUNT(*) as infile_count FROM embeddings WHERE movieId <= 10000')\n",
+    "    values_count = chs.query('SELECT COUNT(*) as values_count FROM embeddings WHERE movieId >= 20001')\n",
+    "    \n",
+    "    print(f\"Records from INFILE (movieId <= 10000): {infile_count}\")\n",
+    "    print(f\"Records from VALUES (movieId >= 20001): {values_count}\")\n",
+    "    \n",
+    "    # Sample from both ranges\n",
+    "    print(\"\\nSample from INFILE data:\")\n",
+    "    print(chs.query('SELECT movieId, embedding FROM embeddings WHERE movieId <= 10000 ORDER BY movieId LIMIT 2'))\n",
+    "    \n",
+    "    print(\"Sample from VALUES data:\")\n",
+    "    print(chs.query('SELECT movieId, embedding FROM embeddings WHERE movieId >= 20001 ORDER BY movieId LIMIT 2'))\n",
+    "    \n",
+    "except Exception as e:\n",
+    "    print(f\"Count verification error: {e}\")\n",
+    "\n",
+    "# Test 4: Direct CSV engine reading\n",
+    "print(\"\\n=== Test 4: CSV Engine Direct Reading ===\")\n",
+    "try:\n",
+    "    print(\"Reading generated CSV file directly using CSV engine:\")\n",
+    "    \n",
+    "    # Method 1: Count rows using the file() function\n",
+    "    csv_count1 = chs.query(f\"SELECT COUNT(*) as csv_count FROM file('{csv_path}', 'CSV', 'movieId UInt32, embedding String')\")\n",
+    "    print(f\"CSV file rows (via file() function): {csv_count1}\")\n",
+    "    \n",
+    "    # Method 2: Sample rows via the file() function\n",
+    "    print(\"Sample rows from CSV file:\")\n",
+    "    csv_sample = chs.query(f\"SELECT movieId, embedding FROM file('{csv_path}', 'CSV', 'movieId UInt32, embedding String') ORDER BY movieId LIMIT 3\")\n",
+    "    print(csv_sample)\n",
+    "    \n",
+    "    print(\"Last few rows from CSV file:\")\n",
+    "    csv_tail = chs.query(f\"SELECT movieId, embedding FROM file('{csv_path}', 'CSV', 'movieId UInt32, embedding String') ORDER BY movieId DESC LIMIT 3\")\n",
+    "    print(csv_tail)\n",
+    "    \n",
+    "except Exception as e:\n",
+    "    print(f\"CSV engine reading error: {e}\")\n",
+    "\n",
+    "# Cleanup\n",
+    "print(\"\\n=== Cleanup ===\")\n",
+    "try:\n",
+    "    os.unlink(csv_path)\n",
+    "    print(\"✓ Temporary CSV file cleaned up\")\n",
+    "except Exception as e:\n",
+    "    print(f\"Warning: Could not clean up temporary file: {e}\")\n",
+    "\n",
+    "print(\"\\n=== Performance Summary ===\")\n",
+    "if infile_time > 0:\n",
+    "    print(f\"INFILE insertion (10k rows): {infile_time:.3f}s\")\n",
+    "if values_time > 0:\n",
+    "    print(f\"VALUES insertion (10 rows): {values_time:.3f}s\")\n",
+    "    if infile_time > 0:\n",
+    "        print(f\"INFILE is {values_time/infile_time*1000:.1f}x faster per row\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.13.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}