Fix(jupyter):use non-blocking stdin check to prevent hanging (#380)

kafka1991 · web-flow · commit a46d8510cc29 · 2025-11-03T16:16:43.000+08:00
* implement non-blocking stdin check to prevent hanging

* Add Jupyter notebook tests and run it in CI

* add more juper notebook tests

* generate temporary CSV for testing
diff --git a/.github/workflows/build_linux_arm64_wheels-gh.yml b/.github/workflows/build_linux_arm64_wheels-gh.yml
@@ -117,7 +117,7 @@ jobs:
             echo "Installing dependencies for Python $version"
             pyenv shell $version
             python -m pip install --upgrade pip
-            python -m pip install setuptools tox pandas pyarrow twine psutil deltalake wheel
+            python -m pip install setuptools tox pandas pyarrow twine psutil deltalake wheel jupyter nbconvert
             pyenv shell --unset
           done
       - name: Upgrade Rust toolchain
@@ -281,6 +281,15 @@ jobs:
             pyenv shell --unset
           done
         continue-on-error: false
+      - name: Run notebook tests
+        run: |
+          export PATH="$HOME/.pyenv/bin:$PATH"
+          eval "$(pyenv init -)"
+          pyenv shell 3.8
+          python -m pip install dist/*.whl --force-reinstall
+          jupyter nbconvert --to notebook --execute tests/test_data_insertion.ipynb --output test_data_insertion_output.ipynb
+          pyenv shell --unset
+        continue-on-error: false
       - name: Check and upload core files if present
         if: always()
         run: |
diff --git a/.github/workflows/build_linux_x86_wheels.yml b/.github/workflows/build_linux_x86_wheels.yml
@@ -117,7 +117,7 @@ jobs:
             echo "Installing dependencies for Python $version"
             pyenv shell $version
             python -m pip install --upgrade pip
-            python -m pip install setuptools tox pandas pyarrow twine psutil deltalake wheel
+            python -m pip install setuptools tox pandas pyarrow twine psutil deltalake wheel jupyter nbconvert
             pyenv shell --unset
           done
       - name: Upgrade Rust toolchain
@@ -280,6 +280,15 @@ jobs:
             pyenv shell --unset
           done
         continue-on-error: false
+      - name: Run notebook tests
+        run: |
+          export PATH="$HOME/.pyenv/bin:$PATH"
+          eval "$(pyenv init -)"
+          pyenv shell 3.8
+          python -m pip install dist/*.whl --force-reinstall
+          jupyter nbconvert --to notebook --execute tests/test_data_insertion.ipynb --output test_data_insertion_output.ipynb
+          pyenv shell --unset
+        continue-on-error: false
       - name: Check and upload core files if present
         if: always()
         run: |
diff --git a/.github/workflows/build_macos_arm64_wheels.yml b/.github/workflows/build_macos_arm64_wheels.yml
@@ -102,7 +102,7 @@ jobs:
             echo "Installing dependencies for Python $version"
             pyenv shell $version
             python -m pip install --upgrade pip
-            python -m pip install setuptools wheel tox pandas pyarrow twine psutil deltalake wheel>=0.40.0
+            python -m pip install setuptools wheel tox pandas pyarrow twine psutil deltalake wheel>=0.40.0 jupyter nbconvert
             pyenv shell --unset
           done
       - name: Remove /usr/local/bin/python3
@@ -276,6 +276,15 @@ jobs:
             pyenv shell --unset
           done
         continue-on-error: false
+      - name: Run notebook tests
+        run: |
+          export PATH="$HOME/.pyenv/bin:$PATH"
+          eval "$(pyenv init -)"
+          pyenv shell 3.8
+          python -m pip install dist/*.whl --force-reinstall
+          jupyter nbconvert --to notebook --execute tests/test_data_insertion.ipynb --output test_data_insertion_output.ipynb
+          pyenv shell --unset
+        continue-on-error: false
       - name: Check and upload core files if present
         if: always()
         run: |
diff --git a/.github/workflows/build_macos_x86_wheels.yml b/.github/workflows/build_macos_x86_wheels.yml
@@ -91,7 +91,7 @@ jobs:
             echo "Installing dependencies for Python $version"
             pyenv shell $version
             python -m pip install --upgrade pip
-            python -m pip install setuptools tox pandas pyarrow twine psutil deltalake wheel>=0.40.0
+            python -m pip install setuptools tox pandas pyarrow twine psutil deltalake wheel>=0.40.0 jupyter nbconvert
             pyenv shell --unset
           done
       - name: Remove /usr/local/bin/python3
@@ -277,6 +277,15 @@ jobs:
             pyenv shell --unset
           done
         continue-on-error: false
+      - name: Run notebook tests
+        run: |
+          export PATH="$HOME/.pyenv/bin:$PATH"
+          eval "$(pyenv init -)"
+          pyenv shell 3.8
+          python -m pip install dist/*.whl --force-reinstall
+          jupyter nbconvert --to notebook --execute tests/test_data_insertion.ipynb --output test_data_insertion_output.ipynb
+          pyenv shell --unset
+        continue-on-error: false
       - name: Check and upload core files if present
         if: always()
         run: |
diff --git a/src/Client/ClientBase.cpp b/src/Client/ClientBase.cpp
@@ -1777,6 +1777,21 @@ bool isStdinNotEmptyAndValid(ReadBuffer & std_in)
 {
     try
     {
+        // Use non-blocking check for stdin to avoid hanging
+        if (auto * fd_buffer = typeid_cast<ReadBufferFromFileDescriptor *>(&std_in))
+        {
+            int fd = fd_buffer->getFD();
+            if (fd == STDIN_FILENO)
+            {
+                int flags = fcntl(fd, F_GETFL);
+                if (flags != -1)
+                {
+                    fcntl(fd, F_SETFL, flags | O_NONBLOCK);
+                    SCOPE_EXIT({ fcntl(fd, F_SETFL, flags); });
+                    return !std_in.eof();
+                }
+            }
+        }
         return !std_in.eof();
     }
     catch (const Exception & e)
diff --git a/tests/test_data_insertion.ipynb b/tests/test_data_insertion.ipynb
@@ -0,0 +1,165 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "jupyter": {
+     "is_executing": true
+    }
+   },
+   "outputs": [],
+   "source": [
+    "from chdb import session\n",
+    "import time\n",
+    "import tempfile\n",
+    "import os\n",
+    "\n",
+    "print(\"Connecting to chdb session...\")\n",
+    "chs = session.Session()\n",
+    "\n",
+    "temp_csv = tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False)\n",
+    "temp_csv.write(\"movieId,embedding\\n\")  # Header\n",
+    "\n",
+    "# Generate 10,000 rows of test data\n",
+    "for i in range(1, 10001):\n",
+    "    embedding = [float(i + j * 0.1) for j in range(10)]\n",
+    "    embedding_str = '[' + ','.join(map(str, embedding)) + ']'\n",
+    "    temp_csv.write(f'{i},\"{embedding_str}\"\\n')\n",
+    "\n",
+    "temp_csv.close()\n",
+    "csv_path = temp_csv.name\n",
+    "\n",
+    "# Setup database and table\n",
+    "print(\"\\n=== Setup Phase ===\")\n",
+    "chs.query(\"CREATE DATABASE IF NOT EXISTS test ENGINE = Atomic\")\n",
+    "chs.query(\"USE test\")\n",
+    "chs.query('DROP TABLE IF EXISTS embeddings')\n",
+    "\n",
+    "chs.query(\"\"\"CREATE TABLE embeddings (\n",
+    "      movieId UInt32 NOT NULL,\n",
+    "      embedding Array(Float32) NOT NULL\n",
+    "  ) ENGINE = MergeTree()\n",
+    "  ORDER BY movieId\"\"\")\n",
+    "\n",
+    "# Test 1: INFILE insertion (10k rows)\n",
+    "print(\"\\n=== Test 1: INFILE Insertion (10k rows) ===\")\n",
+    "start_time = time.time()\n",
+    "try:\n",
+    "    result = chs.query(f\"INSERT INTO embeddings FROM INFILE '{csv_path}' FORMAT CSV\")\n",
+    "    infile_time = time.time() - start_time\n",
+    "    print(f\"✓ INFILE insertion successful! Time: {infile_time:.3f}s\")\n",
+    "    \n",
+    "    count = chs.query('SELECT COUNT(*) as count FROM embeddings')\n",
+    "    print(f\"Records inserted via INFILE: {count}\")\n",
+    "    \n",
+    "    if count != '0':\n",
+    "        print(\"Sample data from INFILE:\")\n",
+    "        sample = chs.query('SELECT movieId, embedding FROM embeddings ORDER BY movieId LIMIT 3')\n",
+    "        print(sample)\n",
+    "        \n",
+    "except Exception as e:\n",
+    "    print(f\"✗ INFILE insertion failed: {e}\")\n",
+    "    infile_time = 0\n",
+    "\n",
+    "# Test 2: Regular insertion (10 additional rows)\n",
+    "print(\"\\n=== Test 2: Regular VALUES Insertion (10 rows) ===\")\n",
+    "start_time = time.time()\n",
+    "try:\n",
+    "    # Insert 10 additional rows with movieId starting from 20001\n",
+    "    for i in range(20001, 20011):\n",
+    "        embedding = [float(i + j * 0.1) for j in range(10)]\n",
+    "        embedding_str = '[' + ','.join(map(str, embedding)) + ']'\n",
+    "        chs.query(f\"INSERT INTO embeddings VALUES ({i}, {embedding_str})\")\n",
+    "    \n",
+    "    values_time = time.time() - start_time\n",
+    "    print(f\"✓ VALUES insertion successful! Time: {values_time:.3f}s\")\n",
+    "    \n",
+    "except Exception as e:\n",
+    "    print(f\"✗ VALUES insertion failed: {e}\")\n",
+    "    values_time = 0\n",
+    "\n",
+    "# Test 3: Verify total count\n",
+    "print(\"\\n=== Test 3: Count Verification ===\")\n",
+    "try:\n",
+    "    total_count = chs.query('SELECT COUNT(*) as total FROM embeddings')\n",
+    "    print(f\"Total records in embeddings table: {total_count}\")\n",
+    "    \n",
+    "    # Count by range\n",
+    "    infile_count = chs.query('SELECT COUNT(*) as infile_count FROM embeddings WHERE movieId <= 10000')\n",
+    "    values_count = chs.query('SELECT COUNT(*) as values_count FROM embeddings WHERE movieId >= 20001')\n",
+    "    \n",
+    "    print(f\"Records from INFILE (movieId <= 10000): {infile_count}\")\n",
+    "    print(f\"Records from VALUES (movieId >= 20001): {values_count}\")\n",
+    "    \n",
+    "    # Sample from both ranges\n",
+    "    print(\"\\nSample from INFILE data:\")\n",
+    "    print(chs.query('SELECT movieId, embedding FROM embeddings WHERE movieId <= 10000 ORDER BY movieId LIMIT 2'))\n",
+    "    \n",
+    "    print(\"Sample from VALUES data:\")\n",
+    "    print(chs.query('SELECT movieId, embedding FROM embeddings WHERE movieId >= 20001 ORDER BY movieId LIMIT 2'))\n",
+    "    \n",
+    "except Exception as e:\n",
+    "    print(f\"Count verification error: {e}\")\n",
+    "\n",
+    "# Test 4: Direct CSV engine reading\n",
+    "print(\"\\n=== Test 4: CSV Engine Direct Reading ===\")\n",
+    "try:\n",
+    "    print(\"Reading generated CSV file directly using CSV engine:\")\n",
+    "    \n",
+    "    # Method 1: Using file() function\n",
+    "    csv_count1 = chs.query(f\"SELECT COUNT(*) as csv_count FROM file('{csv_path}', 'CSV', 'movieId UInt32, embedding String')\")\n",
+    "    print(f\"CSV file rows (via file() function): {csv_count1}\")\n",
+    "    \n",
+    "    # Method 2: Using CSV table engine directly\n",
+    "    print(\"Sample rows from CSV file:\")\n",
+    "    csv_sample = chs.query(f\"SELECT movieId, embedding FROM file('{csv_path}', 'CSV', 'movieId UInt32, embedding String') ORDER BY movieId LIMIT 3\")\n",
+    "    print(csv_sample)\n",
+    "    \n",
+    "    print(\"Last few rows from CSV file:\")\n",
+    "    csv_tail = chs.query(f\"SELECT movieId, embedding FROM file('{csv_path}', 'CSV', 'movieId UInt32, embedding String') ORDER BY movieId DESC LIMIT 3\")\n",
+    "    print(csv_tail)\n",
+    "    \n",
+    "except Exception as e:\n",
+    "    print(f\"CSV engine reading error: {e}\")\n",
+    "\n",
+    "# Cleanup\n",
+    "print(\"\\n=== Cleanup ===\")\n",
+    "try:\n",
+    "    os.unlink(csv_path)\n",
+    "    print(\"✓ Temporary CSV file cleaned up\")\n",
+    "except Exception as e:\n",
+    "    print(f\"Warning: Could not clean up temporary file: {e}\")\n",
+    "\n",
+    "print(f\"\\n=== Performance Summary ===\")\n",
+    "if infile_time > 0:\n",
+    "    print(f\"INFILE insertion (10k rows): {infile_time:.3f}s\")\n",
+    "if values_time > 0:\n",
+    "    print(f\"VALUES insertion (10 rows): {values_time:.3f}s\")\n",
+    "    if infile_time > 0:\n",
+    "        print(f\"INFILE is {values_time/infile_time*1000:.1f}x faster per 1000 rows\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.13.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}