|
| 1 | +{ |
| 2 | + "cells": [ |
| 3 | + { |
| 4 | + "cell_type": "code", |
| 5 | + "execution_count": null, |
| 6 | + "metadata": { |
| 7 | + "jupyter": { |
| 8 | + "is_executing": true |
| 9 | + } |
| 10 | + }, |
| 11 | + "outputs": [], |
| 12 | + "source": [ |
| 13 | + "from chdb import session\n", |
| 14 | + "import time\n", |
| 15 | + "import tempfile\n", |
| 16 | + "import os\n", |
| 17 | + "\n", |
| 18 | + "print(\"Connecting to chdb session...\")\n", |
| 19 | + "chs = session.Session()\n", |
| 20 | + "\n", |
| 21 | + "temp_csv = tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False)\n", |
| 22 | + "temp_csv.write(\"movieId,embedding\\n\") # Header\n", |
| 23 | + "\n", |
| 24 | + "# Generate 10,000 rows of test data\n", |
| 25 | + "for i in range(1, 10001):\n", |
| 26 | + " embedding = [float(i + j * 0.1) for j in range(10)]\n", |
| 27 | + " embedding_str = '[' + ','.join(map(str, embedding)) + ']'\n", |
| 28 | + " temp_csv.write(f'{i},\"{embedding_str}\"\\n')\n", |
| 29 | + "\n", |
| 30 | + "temp_csv.close()\n", |
| 31 | + "csv_path = temp_csv.name\n", |
| 32 | + "\n", |
| 33 | + "# Setup database and table\n", |
| 34 | + "print(\"\\n=== Setup Phase ===\")\n", |
| 35 | + "chs.query(\"CREATE DATABASE IF NOT EXISTS test ENGINE = Atomic\")\n", |
| 36 | + "chs.query(\"USE test\")\n", |
| 37 | + "chs.query('DROP TABLE IF EXISTS embeddings')\n", |
| 38 | + "\n", |
| 39 | + "chs.query(\"\"\"CREATE TABLE embeddings (\n", |
| 40 | + " movieId UInt32 NOT NULL,\n", |
| 41 | + " embedding Array(Float32) NOT NULL\n", |
| 42 | + " ) ENGINE = MergeTree()\n", |
| 43 | + " ORDER BY movieId\"\"\")\n", |
| 44 | + "\n", |
| 45 | + "# Test 1: INFILE insertion (10k rows)\n", |
| 46 | + "print(\"\\n=== Test 1: INFILE Insertion (10k rows) ===\")\n", |
| 47 | + "start_time = time.time()\n", |
| 48 | + "try:\n", |
| 49 | + " result = chs.query(f\"INSERT INTO embeddings FROM INFILE '{csv_path}' FORMAT CSV\")\n", |
| 50 | + " infile_time = time.time() - start_time\n", |
| 51 | + " print(f\"✓ INFILE insertion successful! Time: {infile_time:.3f}s\")\n", |
| 52 | + " \n", |
| 53 | + " count = chs.query('SELECT COUNT(*) as count FROM embeddings')\n", |
| 54 | + " print(f\"Records inserted via INFILE: {count}\")\n", |
| 55 | + " \n", |
| 56 | + " if count != '0':\n", |
| 57 | + " print(\"Sample data from INFILE:\")\n", |
| 58 | + " sample = chs.query('SELECT movieId, embedding FROM embeddings ORDER BY movieId LIMIT 3')\n", |
| 59 | + " print(sample)\n", |
| 60 | + " \n", |
| 61 | + "except Exception as e:\n", |
| 62 | + " print(f\"✗ INFILE insertion failed: {e}\")\n", |
| 63 | + " infile_time = 0\n", |
| 64 | + "\n", |
| 65 | + "# Test 2: Regular insertion (10 additional rows)\n", |
| 66 | + "print(\"\\n=== Test 2: Regular VALUES Insertion (10 rows) ===\")\n", |
| 67 | + "start_time = time.time()\n", |
| 68 | + "try:\n", |
| 69 | + " # Insert 10 additional rows with movieId starting from 20001\n", |
| 70 | + " for i in range(20001, 20011):\n", |
| 71 | + " embedding = [float(i + j * 0.1) for j in range(10)]\n", |
| 72 | + " embedding_str = '[' + ','.join(map(str, embedding)) + ']'\n", |
| 73 | + " chs.query(f\"INSERT INTO embeddings VALUES ({i}, {embedding_str})\")\n", |
| 74 | + " \n", |
| 75 | + " values_time = time.time() - start_time\n", |
| 76 | + " print(f\"✓ VALUES insertion successful! Time: {values_time:.3f}s\")\n", |
| 77 | + " \n", |
| 78 | + "except Exception as e:\n", |
| 79 | + " print(f\"✗ VALUES insertion failed: {e}\")\n", |
| 80 | + " values_time = 0\n", |
| 81 | + "\n", |
| 82 | + "# Test 3: Verify total count\n", |
| 83 | + "print(\"\\n=== Test 3: Count Verification ===\")\n", |
| 84 | + "try:\n", |
| 85 | + " total_count = chs.query('SELECT COUNT(*) as total FROM embeddings')\n", |
| 86 | + " print(f\"Total records in embeddings table: {total_count}\")\n", |
| 87 | + " \n", |
| 88 | + " # Count by range\n", |
| 89 | + " infile_count = chs.query('SELECT COUNT(*) as infile_count FROM embeddings WHERE movieId <= 10000')\n", |
| 90 | + " values_count = chs.query('SELECT COUNT(*) as values_count FROM embeddings WHERE movieId >= 20001')\n", |
| 91 | + " \n", |
| 92 | + " print(f\"Records from INFILE (movieId <= 10000): {infile_count}\")\n", |
| 93 | + " print(f\"Records from VALUES (movieId >= 20001): {values_count}\")\n", |
| 94 | + " \n", |
| 95 | + " # Sample from both ranges\n", |
| 96 | + " print(\"\\nSample from INFILE data:\")\n", |
| 97 | + " print(chs.query('SELECT movieId, embedding FROM embeddings WHERE movieId <= 10000 ORDER BY movieId LIMIT 2'))\n", |
| 98 | + " \n", |
| 99 | + " print(\"Sample from VALUES data:\")\n", |
| 100 | + " print(chs.query('SELECT movieId, embedding FROM embeddings WHERE movieId >= 20001 ORDER BY movieId LIMIT 2'))\n", |
| 101 | + " \n", |
| 102 | + "except Exception as e:\n", |
| 103 | + " print(f\"Count verification error: {e}\")\n", |
| 104 | + "\n", |
| 105 | + "# Test 4: Direct CSV engine reading\n", |
| 106 | + "print(\"\\n=== Test 4: CSV Engine Direct Reading ===\")\n", |
| 107 | + "try:\n", |
| 108 | + " print(\"Reading generated CSV file directly using CSV engine:\")\n", |
| 109 | + " \n", |
| 110 | + " # Method 1: Using file() function\n", |
| 111 | + " csv_count1 = chs.query(f\"SELECT COUNT(*) as csv_count FROM file('{csv_path}', 'CSV', 'movieId UInt32, embedding String')\")\n", |
| 112 | + " print(f\"CSV file rows (via file() function): {csv_count1}\")\n", |
| 113 | + " \n", |
| 114 | + " # Method 2: Using CSV table engine directly\n", |
| 115 | + " print(\"Sample rows from CSV file:\")\n", |
| 116 | + " csv_sample = chs.query(f\"SELECT movieId, embedding FROM file('{csv_path}', 'CSV', 'movieId UInt32, embedding String') ORDER BY movieId LIMIT 3\")\n", |
| 117 | + " print(csv_sample)\n", |
| 118 | + " \n", |
| 119 | + " print(\"Last few rows from CSV file:\")\n", |
| 120 | + " csv_tail = chs.query(f\"SELECT movieId, embedding FROM file('{csv_path}', 'CSV', 'movieId UInt32, embedding String') ORDER BY movieId DESC LIMIT 3\")\n", |
| 121 | + " print(csv_tail)\n", |
| 122 | + " \n", |
| 123 | + "except Exception as e:\n", |
| 124 | + " print(f\"CSV engine reading error: {e}\")\n", |
| 125 | + "\n", |
| 126 | + "# Cleanup\n", |
| 127 | + "print(\"\\n=== Cleanup ===\")\n", |
| 128 | + "try:\n", |
| 129 | + " os.unlink(csv_path)\n", |
| 130 | + " print(\"✓ Temporary CSV file cleaned up\")\n", |
| 131 | + "except Exception as e:\n", |
| 132 | + " print(f\"Warning: Could not clean up temporary file: {e}\")\n", |
| 133 | + "\n", |
| 134 | + "print(f\"\\n=== Performance Summary ===\")\n", |
| 135 | + "if infile_time > 0:\n", |
| 136 | + " print(f\"INFILE insertion (10k rows): {infile_time:.3f}s\")\n", |
| 137 | + "if values_time > 0:\n", |
| 138 | + " print(f\"VALUES insertion (10 rows): {values_time:.3f}s\")\n", |
| 139 | + " if infile_time > 0:\n", |
| 140 | + " print(f\"INFILE is {values_time/infile_time*1000:.1f}x faster per 1000 rows\")" |
| 141 | + ] |
| 142 | + } |
| 143 | + ], |
| 144 | + "metadata": { |
| 145 | + "kernelspec": { |
| 146 | + "display_name": ".venv", |
| 147 | + "language": "python", |
| 148 | + "name": "python3" |
| 149 | + }, |
| 150 | + "language_info": { |
| 151 | + "codemirror_mode": { |
| 152 | + "name": "ipython", |
| 153 | + "version": 3 |
| 154 | + }, |
| 155 | + "file_extension": ".py", |
| 156 | + "mimetype": "text/x-python", |
| 157 | + "name": "python", |
| 158 | + "nbconvert_exporter": "python", |
| 159 | + "pygments_lexer": "ipython3", |
| 160 | + "version": "3.13.7" |
| 161 | + } |
| 162 | + }, |
| 163 | + "nbformat": 4, |
| 164 | + "nbformat_minor": 4 |
| 165 | +} |
0 commit comments