diff --git a/docs/use-cases/AI_ML/data-exploration/_category_.json b/docs/use-cases/AI_ML/data-exploration/_category_.json
index e4a25326544..8d924deefb1 100644
--- a/docs/use-cases/AI_ML/data-exploration/_category_.json
+++ b/docs/use-cases/AI_ML/data-exploration/_category_.json
@@ -1,5 +1,5 @@
{
- "label": "Data exploration",
+ "label": "Data exploration & ML",
"collapsible": true,
"collapsed": true,
}
\ No newline at end of file
diff --git a/docs/use-cases/AI_ML/data-exploration/scikit-learn.md b/docs/use-cases/AI_ML/data-exploration/scikit-learn.md
new file mode 100644
index 00000000000..7e43e8c9460
--- /dev/null
+++ b/docs/use-cases/AI_ML/data-exploration/scikit-learn.md
@@ -0,0 +1,1292 @@
+---
+slug: /use-cases/AI/scikit-learn
+sidebar_label: 'Building a binary classifier with scikit-learn'
+title: 'Classifying UK property types with chDB and scikit-learn'
+description: 'This guide explains how to build a binary classifier to distinguish flats from detached houses using chDB, ClickHouse Cloud, and scikit-learn'
+keywords: ['ML', 'Scikit-learn', 'chDB', 'binary classification']
+doc_type: 'guide'
+---
+
+import Image from '@theme/IdealImage';
+import image_1 from '@site/static/images/use-cases/AI_ML/jupyter/1.png';
+import image_2 from '@site/static/images/use-cases/AI_ML/jupyter/2.png';
+import image_3 from '@site/static/images/use-cases/AI_ML/jupyter/3.png';
+import image_4 from '@site/static/images/use-cases/AI_ML/Marimo/4.png';
+import property_price_distributions from '@site/static/images/use-cases/AI_ML/Scikit/property_price_distributions.png';
+import property_type_composition from '@site/static/images/use-cases/AI_ML/Scikit/property_type_composition.png';
+import confusion_matrix from '@site/static/images/use-cases/AI_ML/Scikit/confusion_matrix.png';
+import VerticalStepper from '@theme/VerticalStepper';
+
+# Classifying UK property types with chDB and scikit-learn
+
+Machine learning prototyping often involves juggling datasets, preprocessing steps, and performance constraints, which can make the process both complex and time-consuming.
+**scikit-learn** is one of Python's most popular and powerful libraries for machine learning, offering a large set of utilities for building and evaluating models.
+In this guide, we will explore how **chDB** complements scikit-learn in the model development life cycle by building a binary classifier that predicts whether a property is a flat or a detached house, based on features like price, location, and date.
+
+**Prerequisites:**
+
+Before starting, ensure you have:
+- Python 3.8 or higher installed
+- A virtual environment activated (recommended)
+- Basic understanding of SQL and machine learning concepts
+- A ClickHouse Cloud service or trial account
+
+## What we'll be building {#what-we-will-build}
+
+The UK property market features diverse property types, each with distinct characteristics. Two of the most common types are **flats** and **detached houses**.
+Flats are typically self-contained housing units within larger buildings, often found in urban areas with higher density. They generally cost less and have smaller floor areas.
+Detached houses are standalone properties with no shared walls, usually found in suburban or rural areas. They typically command higher prices and offer more privacy and space.
+
+Our goal in this guide is to build a binary classifier that can predict `is_flat = 1` (flat) or `is_flat = 0` (detached house) from property transaction features like price, location, and date.
+By focusing only on these two property types, we can build a model that learns to distinguish between these fundamentally different housing options.
+
+We'll be using the UK Property Price dataset, which is available on ClickHouse Cloud as one of the starter datasets that you can import in a few easy clicks.
+It contains pricing data for properties sold in the United Kingdom from 1995 to 2024.
+
+## Setup {#setup}
+
+### Add the dataset to your Cloud service {#add-dataset}
+
+To add this dataset to an existing ClickHouse Cloud service, log in to [console.clickhouse.cloud](https://console.clickhouse.cloud/) with your account details.
+
+In the left-hand menu, click on `Data sources`. Then click `Add sample data`:
+
+<Image img={image_1} size="md" alt="Add sample data"/>
+
+Select `Get started` in the UK property price paid data (4GB) card:
+
+<Image img={image_2} size="md" alt="UK property price paid dataset card"/>
+
+Then click `Import dataset`:
+
+<Image img={image_3} size="md" alt="Import dataset"/>
+
+ClickHouse will automatically create the `pp_complete` table in the `default` database and fill the table with 28.92 million rows of price point data.
+
+### Set up your credentials {#setting-up-credentials}
+
+To reduce the likelihood of exposing your credentials, we recommend you add your Cloud hostname, username, and password as environment variables on your local machine.
+From a terminal, run the following commands:
+
+```bash
+export CLICKHOUSE_CLOUD_HOSTNAME=your_hostname
+export CLICKHOUSE_CLOUD_USER=default
+export CLICKHOUSE_CLOUD_PASSWORD=your_actual_password
+```
+
+:::note
+The environment variables above persist only as long as your terminal session.
+To set them permanently, add them to your shell configuration file.
+
+**Why environment variables?** Storing credentials in environment variables rather than hardcoding them in your notebook:
+- Prevents accidentally committing passwords to version control
+- Makes it easy to switch between development and production environments
+- Follows security best practices
+:::
+
+### Set up a Marimo notebook {#marimo}
+
+Activate your Python virtual environment and install the following packages:
+
+```bash
+pip install marimo chdb scikit-learn pandas matplotlib numpy imbalanced-learn seaborn
+```
+
+We will be using Marimo as a notebook to explore our data and develop our ML model.
+Run the following command to start a new notebook:
+
+```bash
+marimo edit property_type_classifier.py
+```
+
+A new browser window should open with the Marimo interface on localhost:2718:
+
+<Image img={image_4} size="md" alt="Marimo notebook interface"/>
+
+Marimo notebooks are stored as pure Python files, making them easy to version control and share with others.
+
+In your first cell, import the necessary libraries:
+
+```python
+import marimo as mo
+import chdb
+import pandas as pd
+import numpy as np
+import pickle
+import os
+from sklearn.model_selection import train_test_split
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
+from imblearn.over_sampling import SMOTE
+import matplotlib.pyplot as plt
+import seaborn as sns
+```
+
+In this guide we'll be making use of the following libraries:
+- `chdb`: an embedded ClickHouse SQL engine for data processing
+- `pandas` and `numpy`: familiar data-science tools for data manipulation and numerical operations
+- `scikit-learn`: machine-learning tools for building and evaluating models
+- `matplotlib` and `seaborn`: popular libraries for visualization
+- `marimo`: reactive notebook components
+- `imbalanced-learn`: provides tools for handling imbalanced datasets
+
+### Import data from Cloud {#import-data}
+
+With our environment variables set, our data imported into our Cloud service, and our notebook up and running, we'll next load the data into our notebook for processing.
+
+ClickHouse's [`remoteSecure`](/sql-reference/table-functions/remote) function allows you to easily retrieve the data from ClickHouse Cloud.
+You can instruct chDB to return this data in-process as a Pandas DataFrame, a convenient and familiar way of working with data.
+
+Run the following query in a new cell to fetch the UK price paid data from your ClickHouse Cloud service and turn it into a `pandas.DataFrame`:
+
+```python
+# Query UK property data - only flats and detached houses
+query = """
+SELECT
+ date,
+ price,
+ type::String AS type,
+ town,
+ district,
+ locality,
+ CASE WHEN type = 'flat' THEN 1 ELSE 0 END AS is_flat
+FROM remoteSecure(
+ '{host}',
+ 'default.pp_complete',
+ '{user}',
+ '{password}'
+)
+WHERE price > 0
+ AND town IS NOT NULL
+ AND district IS NOT NULL
+ AND locality IS NOT NULL
+ AND type IN ('detached', 'flat')
+FORMAT Arrow
+""".format(
+ host=os.getenv('CLICKHOUSE_CLOUD_HOSTNAME'),
+ user=os.getenv('CLICKHOUSE_CLOUD_USER'),
+ password=os.getenv('CLICKHOUSE_CLOUD_PASSWORD')
+)
+
+# Load as Arrow table for efficiency
+arrow_data = chdb.query(query, "ArrowTable")
+df_raw = arrow_data.to_pandas()
+
+mo.md(f"Loaded {len(df_raw):,} property records (flats and detached houses only) from ClickHouse Cloud")
+```
+
+The query above will load 11.8 million records from Cloud.
+
+:::tip
+Make sure your Cloud service is not in an idle state when you run the cell above.
+
+You can run `df_raw.head()` in a new cell to take a look at the first few rows of the data.
+:::
+
+In the code snippet above, we defined our SQL query as a templated string, filling in `host`, `user`, and `password` from our environment variables.
+We used the **`remoteSecure()`** table function to connect to ClickHouse Cloud over a secure (TLS-encrypted) connection and selected the columns we are interested in.
+The **`WHERE`** clause filters data server side, *before* transfer: it removes invalid prices and null locations and keeps only flats (`type = 'flat'`) and detached houses (`type = 'detached'`). This server-side filtering is much faster than transferring everything and filtering locally.
+We also created a target variable using `CASE WHEN type = 'flat' THEN 1 ELSE 0 END AS is_flat`, which produces a binary label (1 for flat, 0 for detached house).
+
+**`FORMAT Arrow`** tells chDB to return data in the Apache Arrow format.
+We chose Apache Arrow because it is a columnar data format that typically transfers and parses considerably faster than row-based text formats like CSV.
+ClickHouse offers the ability to work with over a hundred different file [formats](/interfaces/formats#formats-overview) including familiar ones like CSV, TSV and JSON.
+
+The **`chdb.query(query, "ArrowTable")`** call tells chDB to execute the query defined in the `query` string and to return the result as a PyArrow Table object.
+We then use the **`to_pandas()`** function to convert the data to a Pandas DataFrame.
+You'll see later in this guide how you can seamlessly transition between processing data with Pandas and with chDB, depending on the task at hand.
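+
+chDB isn't limited to Arrow: the same query can be returned in other formats by changing the second argument. As a quick illustration (reusing the `chdb` import from our first cell, with simple literal data rather than the Cloud query):
+
+```python
+# The same engine can emit many output formats; two quick examples
+print(chdb.query("SELECT 1 AS x, 'flat' AS type", "CSV"))
+print(chdb.query("SELECT 1 AS x, 'flat' AS type", "JSONEachRow"))
+```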
+
+## Data exploration {#data-exploration}
+
+Before building our model, let's use chDB to understand how flats differ from detached houses.
+
+### Compare price distributions {#compare-price-distributions}
+
+Run the code below in a new cell to generate price distribution statistics for flats vs detached houses.
+
+```python
+def compare_price_stats():
+ # Compare price statistics between flats and detached houses
+ query_stats = """
+ SELECT
+ CASE WHEN is_flat = 1 THEN 'Flat' ELSE 'Detached' END as property_type,
+ formatReadableQuantity(COUNT(*)) AS count,
+ formatReadableQuantity(AVG(price)) AS avg_price,
+ formatReadableQuantity(quantile(0.5)(price)) AS median_price,
+ formatReadableQuantity(quantile(0.25)(price)) AS q25_price,
+ formatReadableQuantity(quantile(0.75)(price)) AS q75_price,
+ formatReadableQuantity(MIN(price)) AS min_price,
+ formatReadableQuantity(MAX(price)) AS max_price
+    FROM Python(df_raw)
+ GROUP BY is_flat
+ ORDER BY is_flat DESC
+ """
+
+ price_comparison = chdb.query(query_stats, "DataFrame")
+ return mo.ui.table(price_comparison, label="Price distribution: Flats vs Detached houses")
+
+compare_price_stats()
+```
+
+| Property Type | Count | Average Price | Median Price | Q25 Price | Q75 Price | Min Price | Max Price |
+|---------------|-----------|---------------|--------------|-----------|-----------|-----------|--------------|
+| Flat | 5,204,810 | £204,500 | £144,720 | £85,250 | £232,500 | £1 | £160,000,000 |
+| Detached | 6,666,708 | £296,000 | £234,950 | £145,000 | £360,000 | £1 | £53,490,000 |
+
+In the `query_stats` string above we made use of the `Python` table engine.
+This table engine allows chDB to read data from the `df_raw` DataFrame that we defined in the previous cell.
+We then used a handful of ClickHouse's many built-in aggregate functions to summarize the data: `AVG` for the average, `quantile` for the 25th percentile, median, and 75th percentile, and `MIN` and `MAX` for the extremes.
+We used **`GROUP BY is_flat`** to split the data into two groups for comparison.
+
+We're looking at these particular statistics because if the median price of a detached house is significantly different from the median price of a flat, then it's likely that we'll be able to train our model easily.
+The 25th and 75th quartiles show us the "typical range" of the price distribution—if these don't overlap much between property types, then price alone is a strong signal.
+**Min/Max** show us outliers in our data.
+
+### Visualizing the price difference between flats and detached houses {#visualizing-flats-v-detached}
+
+Run the following code in a new cell to visualize the distribution:
+
+```python
+def visualize_price_difference():
+ # 1. Calculate the 99th percentile threshold for the entire dataset
+ # we do this for visualization purposes due to extremely large outliers
+ price_99th_percentile = df_raw['price'].quantile(0.99)
+
+ # 2. Filter the DataFrame to include only prices up to the 99th percentile
+ df_filtered = df_raw[df_raw['price'] <= price_99th_percentile]
+
+ # Create side-by-side histograms using the filtered data
+ fig, axes = plt.subplots(1, 2, figsize=(14, 6))
+
+ # Flats (Filtered)
+ flat_prices = df_filtered[df_filtered['is_flat'] == 1]['price']
+ axes[0].hist(flat_prices, bins=50, alpha=0.7, color='#e74c3c', edgecolor='white')
+ axes[0].axvline(flat_prices.median(), color='darkred', linestyle='--', linewidth=2,
+ label=f'Median: £{flat_prices.median():,.0f}')
+ axes[0].set_xlabel('Price (£)')
+ axes[0].set_ylabel('Count')
+ axes[0].set_title('Flats (Outliers Excluded)')
+ axes[0].legend()
+ axes[0].grid(True, alpha=0.3)
+ axes[0].ticklabel_format(style='plain', axis='x')
+
+ # Detached houses (Filtered)
+ detached_prices = df_filtered[df_filtered['is_flat'] == 0]['price']
+ axes[1].hist(detached_prices, bins=50, alpha=0.7, color='#3498db', edgecolor='white')
+ axes[1].axvline(detached_prices.median(), color='darkblue', linestyle='--', linewidth=2,
+ label=f'Median: £{detached_prices.median():,.0f}')
+ axes[1].set_xlabel('Price (£)')
+ axes[1].set_ylabel('Count')
+ axes[1].set_title('Detached Houses (Outliers Excluded)')
+ axes[1].legend()
+ axes[1].grid(True, alpha=0.3)
+ axes[1].ticklabel_format(style='plain', axis='x')
+
+ plt.suptitle(f"Price Distribution up to 99th Percentile (Max Price: £{price_99th_percentile:,.0f})")
+ plt.tight_layout(rect=[0, 0.03, 1, 0.95])
+
+ plt.show()
+
+visualize_price_difference()
+```
+
+<Image img={property_price_distributions} size="lg" alt="Price distributions of flats and detached houses"/>
+
+The distributions reveal three key differences that suggest we can build an effective classifier.
+
+First, there is a significant shift in central tendency between the two property types.
+Detached houses typically have a higher median price compared to flats, reflecting their larger size, privacy, and standalone nature.
+This difference in typical property price provides a simple yet powerful signal that classifiers can leverage.
+
+Second, the distributions show different shapes and peak frequencies.
+Flats tend to be concentrated at the lower end of the price spectrum, with their peak occurring in the more affordable price ranges.
+In contrast, detached houses have a distribution that peaks at higher price points and extends further into premium price territory.
+This separation means that price alone can be a strong indicator of property type.
+
+Lastly, detached houses exhibit a much more substantial high-price tail in their distribution.
+While both property types have outliers, detached houses maintain a noticeably thicker tail extending to much higher prices.
+Properties in these higher price brackets are more likely to be detached houses, which offer more space, land, and privacy—features that command premium prices in the market.
+
+
+### Geographic distribution {#geographic-distribution}
+
+We'll again use chDB to explore how flats and detached houses are distributed across different localities.
+
+In the query below we make use of the `Python` table engine to read our DataFrame, transform the data using ClickHouse, and return the result back as a DataFrame.
+
+```python
+def create_flat_vs_detached_chart(df):
+ fig, ax = plt.subplots(figsize=(14, 10))
+
+ # Define colors
+ colors = {
+ 'detached': '#E74C3C', # Red
+ 'flat': '#3498DB', # Blue
+ }
+
+ # Prepare data
+ localities = df['locality'].tolist()
+ y_pos = np.arange(len(localities))
+
+ # Create stacked bars - detached first, then flat
+ ax.barh(y_pos, df['pct_detached'].values,
+ color=colors['detached'],
+ edgecolor='white', linewidth=1.5, label='Detached')
+
+ ax.barh(y_pos, df['pct_flats'].values,
+ left=df['pct_detached'].values,
+ color=colors['flat'],
+ edgecolor='white', linewidth=1.5, label='Flat')
+
+ # Customize the plot
+ ax.set_yticks(y_pos)
+ ax.set_yticklabels(localities)
+ ax.set_xlabel('Percentage of Properties', fontsize=12)
+ ax.set_ylabel('City/Town', fontsize=12)
+ ax.set_title('Property Type Composition by Major UK Cities (Flat vs Detached)',
+ fontsize=14, fontweight='bold', pad=20)
+ ax.set_xlim(0, 100)
+
+ # Add grid
+ ax.grid(axis='x', alpha=0.3, linestyle='--', linewidth=0.5)
+ ax.set_axisbelow(True)
+
+ # Create legend
+ handles, labels = ax.get_legend_handles_labels()
+ legend = ax.legend(handles, labels, title='Property Type',
+ loc='center left', bbox_to_anchor=(1, 0.5),
+ frameon=True, fancybox=False, shadow=False)
+ legend.get_title().set_fontweight('bold')
+
+ # Style adjustments
+ ax.spines['top'].set_visible(False)
+ ax.spines['right'].set_visible(False)
+ ax.spines['left'].set_linewidth(0.5)
+ ax.spines['bottom'].set_linewidth(0.5)
+
+ plt.tight_layout()
+
+ return fig
+
+def compare_type_by_locality():
+ # Compare property type composition by locality
+ query_locality = """
+ SELECT
+ locality,
+ COUNT(*) as total_properties,
+ SUM(CASE WHEN is_flat = 1 THEN 1 ELSE 0 END) as flat_count,
+ SUM(CASE WHEN is_flat = 0 THEN 1 ELSE 0 END) as detached_count,
+ (SUM(CASE WHEN is_flat = 1 THEN 1 ELSE 0 END) * 100.0 / COUNT(*)) as pct_flats,
+ (SUM(CASE WHEN is_flat = 0 THEN 1 ELSE 0 END) * 100.0 / COUNT(*)) as pct_detached
+ FROM Python(df_raw)
+ GROUP BY locality
+ HAVING COUNT(*) > 10000 -- Only show localities with significant data
+ ORDER BY pct_flats DESC
+ LIMIT 10
+ """
+
+ locality_comparison = chdb.query(query_locality, "DataFrame")
+ fig = create_flat_vs_detached_chart(locality_comparison)
+ return fig
+
+compare_type_by_locality()
+```
+
+This query reveals how the composition of property types varies by locality.
+Urban areas like London tend to have a much higher percentage of flats, while more rural or suburban localities have a higher proportion of detached houses.
+This geographic variation is important because location becomes another useful signal for our classifier—the locality where a property is located can help predict whether it's more likely to be a flat or a detached house.
+
+<Image img={property_type_composition} size="lg" alt="Property type composition by major UK localities"/>
+
+## Feature engineering with chDB {#feature-engineering}
+
+Now we'll create features for our machine learning model.
+Feature engineering is where we transform raw data into inputs the model can learn from effectively.
+
+### Creating time-based features {#creating-time-based-features}
+
+```python
+# Create features for classification
+features_query = """
+SELECT
+ is_flat,
+ price,
+ toYear(date::Date) AS year,
+ toMonth(date::Date) AS month,
+ toQuarter(date::Date) AS quarter,
+ toDayOfWeek(date::Date) AS day_of_week,
+ town,
+ district,
+ locality,
+ toYear(date::Date) - 2010 AS years_since_2010,
+ log(price) AS log_price,
+ row_number() OVER () AS record_id
+FROM Python(df_raw)
+WHERE price > 0
+"""
+
+df_features = chdb.query(features_query, "DataFrame")
+mo.md(f"**Feature dataset:** {df_features.shape[0]:,} rows × {df_features.shape[1]} columns")
+```
+
+The query above creates a feature dataset by extracting and transforming multiple variables from the raw property data.
+Starting with the original columns like `is_flat`, `price`, and location fields, the query adds several time-based and derived features to help the classification model identify patterns.
+
+Time-based features are extracted from the transaction date to capture temporal patterns in the property market.
+- The `year` extraction is important because property prices fluctuate over time due to changing market conditions.
+- The `month` feature captures seasonal patterns, as the property market sees more activity during spring and summer.
+- The `quarter` identifies broader economic cycles, while the day of the week (1 for Monday, 7 for Sunday) reflects that most transactions occur on weekdays.
+- The `years_since_2010` variable provides a continuous time representation that machine learning models can more easily incorporate into their predictions.
+
+The query also engineers a logarithmic transformation of the price variable, which offers several advantages for modeling.
+It reduces the influence of extreme outliers, makes the price distribution more normal, and treats proportional changes equally, so the jump from £100,000 to £200,000 is modeled similarly to the jump from £500,000 to £1,000,000. Finally, a unique record ID is assigned to each row, making it easy to trace predictions back to their original records.
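+
+To make the "proportional changes" point concrete, here is a quick check (reusing the `numpy` import from our first cell) showing that a doubling in price always adds the same amount on the log scale:
+
+```python
+# A doubling in price always adds ln(2) ≈ 0.693 on the log scale
+print(np.log(200_000) - np.log(100_000))    # 0.6931...
+print(np.log(1_000_000) - np.log(500_000))  # 0.6931...
+```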
+
+One of chDB's key advantages is that all these transformations happen together in a single SQL statement, which typically executes faster than performing each transformation sequentially in pandas.
+This efficiency is especially valuable when working with large datasets like ours.
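+
+For comparison, a rough pandas equivalent of the feature query above might look like the sketch below (not used in the rest of this guide; note that we have to match ClickHouse's `toDayOfWeek` convention of Monday=1 by hand):
+
+```python
+# Rough pandas equivalent of the chDB feature query (a sketch)
+dates = pd.to_datetime(df_raw['date'])
+df_pandas_features = df_raw.assign(
+    year=dates.dt.year,
+    month=dates.dt.month,
+    quarter=dates.dt.quarter,
+    day_of_week=dates.dt.dayofweek + 1,  # pandas uses Monday=0, so add 1
+    years_since_2010=dates.dt.year - 2010,
+    log_price=np.log(df_raw['price']),
+)
+```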
+
+### Encoding categorical variables {#encoding-categorical-variables}
+
+Machine learning models need numerical inputs, but we have text values like town names ("LONDON", "BIRMINGHAM", "MANCHESTER"), district names, and locality names.
+We'll need to convert these to numbers first.
+While we could use a function like `LabelEncoder` from scikit-learn, we'll use an approach called **label encoding** implemented through **reference tables** (also called dimension tables in data warehousing).
+
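+For reference, the scikit-learn route would look something like the sketch below. We use the SQL approach instead because it keeps the mapping inspectable as a plain table:
+
+```python
+# The scikit-learn alternative (not used in this guide): LabelEncoder
+from sklearn.preprocessing import LabelEncoder
+
+le = LabelEncoder()
+town_ids = le.fit_transform(df_features['town'])  # numeric ID per town, assigned alphabetically
+```
+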
+<VerticalStepper headerLevel="h4">
+
+#### Create reference tables {#create-reference-tables}
+
+In a new cell, run the code shown below:
+
+```python
+def create_reference_table(df, column_name):
+ """
+ Create a reference table that maps category values to numeric IDs.
+
+ This is like creating a lookup table or dictionary:
+ LONDON -> 0
+ BIRMINGHAM -> 1
+ MANCHESTER -> 2
+ etc.
+ """
+ query = f"""
+ SELECT
+ {column_name},
+ row_number() OVER (ORDER BY {column_name}) - 1 as {column_name}_id
+ FROM (
+ SELECT DISTINCT {column_name}
+ FROM Python(df)
+ WHERE {column_name} IS NOT NULL
+ )
+ ORDER BY {column_name}
+ """
+ return chdb.query(query, "DataFrame")
+```
+
+Let's break this query down step by step, starting with the inner select statement:
+
+```sql
+SELECT DISTINCT {column_name} FROM Python(df) WHERE {column_name} IS NOT NULL
+```
+
+Here we use **`DISTINCT`** to get only the unique values of the column (removing duplicates) and **`WHERE ... IS NOT NULL`** to exclude missing values.
+For the `town` column, for example, this returns a list of unique towns such as "LONDON", "BIRMINGHAM", "MANCHESTER", etc.
+
+```sql
+SELECT {column_name}, row_number() OVER (ORDER BY {column_name}) - 1 as {column_name}_id
+```
+
+The outer query, shown above, then takes that list and assigns a sequential number to each row (each unique town) using **`row_number() OVER (ORDER BY {column_name})`**, where `row_number` is a window function that numbers rows 1, 2, 3, 4, and so on.
+We use `ORDER BY {column_name}` to order alphabetically.
+For the `town` column, this results in `BIRMINGHAM=1`, `LEEDS=2`, `LONDON=3`, `MANCHESTER=4`, etc.
+Finally, we subtract one so that IDs start from 0, which is a common ML convention.
+
+This produces a mapping table where each unique categorical value is assigned a unique numeric ID.
+An example is shown below:
+
+| town | town_id |
+| ---------- | ------- |
+| BIRMINGHAM | 0 |
+| BRISTOL | 1 |
+| LEEDS | 2 |
+| LIVERPOOL | 3 |
+| LONDON | 4 |
+| MANCHESTER | 5 |
+
+#### Create reference tables for all categorical features {#create-reference-tables-for-all-categorical-features}
+
+We can now use the `create_reference_table` function to generate reference tables for the features of interest.
+Note that we don't encode the property type since it's already captured in our binary target variable `is_flat`.
+
+```python
+# Create reference tables for categorical columns
+town_ref = create_reference_table(df_features, 'town')
+district_ref = create_reference_table(df_features, 'district')
+locality_ref = create_reference_table(df_features, 'locality')
+
+# Display the mappings
+mo.hstack([
+ mo.ui.table(town_ref, label="Town Encoding"),
+ mo.ui.table(district_ref, label="District Encoding"),
+ mo.ui.table(locality_ref, label="Locality Encoding")
+])
+```
+
+This produces a unique numerical mapping for each of the categorical variables.
+
+#### Apply the mappings {#apply-the-mappings}
+
+Now we replace the text values with numeric IDs.
+First we build lookup dictionaries from the reference tables, using `zip` to pair each value with its ID and `dict` to turn those pairs into a dictionary:
+
+```python
+# Create dictionaries for fast lookups
+town_dict = dict(zip(town_ref['town'], town_ref['town_id']))
+district_dict = dict(zip(district_ref['district'], district_ref['district_id']))
+locality_dict = dict(zip(locality_ref['locality'], locality_ref['locality_id']))
+```
+
+We can then apply the mappings:
+
+```python
+# Replace text with numeric IDs
+df_features['town_id'] = df_features['town'].map(town_dict)
+df_features['district_id'] = df_features['district'].map(district_dict)
+df_features['locality_id'] = df_features['locality'].map(locality_dict)
+```
+
+The table below shows an example of how our features looked before applying the encoding:
+
+| record_id | town | district | locality | price | is_flat |
+|------------|----------------|------------------|-------------|--------|---------|
+| 1 | ABBOTS LANGLEY | ABERCONWY | ABBERTON | 450000 | 1 |
+| 2 | ABERAERON | ADUR | AB KETTLEBY | 180000 | 1 |
+| 3 | ABERDARE | ALLERDALE | ABBERD | 520000 | 0 |
+| 4 | ABERDOVEY | ALNWICK | ABBERLEY | 320000 | 0 |
+| 5 | ABERGAVENNY | ALYN AND DEESIDE | ABBERTON | 275000 | 1 |
+
+After encoding, they look like this:
+
+| record_id | town | town_id | district | district_id | locality | locality_id | price | is_flat |
+|------------|----------------|----------|------------------|--------------|-------------|--------------|--------|---------|
+| 1 | ABBOTS LANGLEY | 0 | ABERCONWY | 0 | ABBERTON | 4 | 450000 | 1 |
+| 2 | ABERAERON | 1 | ADUR | 1 | AB KETTLEBY | 1 | 180000 | 1 |
+| 3 | ABERDARE | 2 | ALLERDALE | 2 | ABBERD | 2 | 520000 | 0 |
+| 4 | ABERDOVEY | 3 | ALNWICK | 3 | ABBERLEY | 3 | 320000 | 0 |
+| 5 | ABERGAVENNY | 4 | ALYN AND DEESIDE | 4 | ABBERTON | 4 | 275000 | 1 |
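+
+Because each encoding is just a Python dictionary, it is easy to invert later when you want to translate IDs back into names (for example, when inspecting individual predictions):
+
+```python
+# Invert the town lookup so numeric IDs can be decoded back to names
+id_to_town = {town_id: town for town, town_id in town_dict.items()}
+print(id_to_town[4])  # e.g. 'LONDON' under the example mapping shown earlier
+```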
+
+#### Clean the data {#clean-the-data}
+
+Most ML models can't handle `NaN` (missing) values, so we need to either fill them or remove them.
+For this demonstration, we will remove them using the `dropna` function:
+
+```python
+# Remove any rows with missing values
+df_clean = df_features.dropna()
+
+mo.ui.table(df_clean.head(10))
+```
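+
+If you would rather keep those rows, a common alternative (sketched below with a sentinel value of `-1`) is to fill the missing IDs instead, so that "unknown location" becomes its own category:
+
+```python
+# Alternative to dropping rows: fill missing category IDs with a sentinel value
+df_filled = df_features.fillna({'town_id': -1, 'district_id': -1, 'locality_id': -1})
+```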
+
+</VerticalStepper>
+
+## Model training {#model-training}
+
+Now that we have numerical features, it's time to build our classifier.
+
+### Select feature columns {#select-feature-columns}
+
+```python
+# Select feature columns for the model
+feature_columns = [
+ 'price',
+ 'log_price',
+ 'year',
+ 'month',
+ 'quarter',
+ 'day_of_week',
+ 'town_id',
+ 'district_id',
+ 'locality_id',
+ 'years_since_2010'
+]
+
+# Separate features (X) from target (y)
+X = df_clean[feature_columns]
+y = df_clean['is_flat']
+
+mo.md(f"""
+**Training data:**
+- Features (X): {X.shape} - {X.shape[1]} features for {X.shape[0]:,} properties
+- Target (y): {y.shape} - Binary (0 or 1)
+- Positive class (Flat): {y.sum():,} properties ({y.mean():.1%})
+- Negative class (Detached): {(~y.astype(bool)).sum():,} properties ({(~y.astype(bool)).mean():.1%})
+""")
+```
+
+For this classification task, we include location features (`town_id`, `district_id`, `locality_id`) because geographic location is a legitimate signal for predicting property type.
+Urban areas tend to have more flats, while suburban and rural areas have more detached houses.
+These location features don't directly leak the answer; they provide useful context about an area's characteristics.
+
+### Split the data {#split-the-data}
+
+Machine learning models need to be tested on data they've never seen before.
+We split our data into two sets:
+
+```python
+# Split into training and testing sets
+X_train, X_test, y_train, y_test = train_test_split(
+ X, y,
+ test_size=0.20, # 20% for testing, 80% for training
+ random_state=42, # Seed for reproducibility
+ stratify=y # Maintain class balance in both sets
+)
+
+mo.md(f"""
+**Data split:**
+- Training: {X_train.shape[0]:,} samples ({X_train.shape[0]/len(X)*100:.0f}%)
+- Testing: {X_test.shape[0]:,} samples ({X_test.shape[0]/len(X)*100:.0f}%)
+- Class distribution preserved: ✓
+""")
+```
+
+In the code above we set `test_size=0.20` to hold out 20% of the data for testing.
+Our model will learn patterns from the remaining 80% of the data, the training set.
+
+We set `random_state=42` as the reproducibility seed.
+With this set, we get the same split every time the code is run.
+
+We set `stratify=y` to maintain class proportions.
+If the overall data is roughly 44% flats and 56% detached houses, as ours is, then both the training and test sets will maintain that same 44/56 split.
+We do this to prevent a situation in which the training or test sets are selected in such a way that they are no longer representative of the original data.
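+
+You can verify that stratification worked by comparing the class proportions across the splits; all three numbers should be nearly identical:
+
+```python
+# The flat share should match across the full, training, and test sets
+print(f"overall: {y.mean():.3f}, train: {y_train.mean():.3f}, test: {y_test.mean():.3f}")
+```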
+
+### Correct for class imbalance {#correct-for-class-imbalance}
+
+If there is a significant imbalance between flats and detached houses in our dataset (for example, if one type is much more common than the other), we might want to address this imbalance.
+An imbalanced dataset can cause the model to be biased toward the majority class.
+
+The simplest and most direct way to handle class imbalance is to use a technique like SMOTE (Synthetic Minority Over-sampling Technique) on the training data to create synthetic examples of the minority class, providing the model with more balanced examples to learn from.
+
+Run the following code in a new cell to use SMOTE:
+
+```python
+# Initialize SMOTE
+# random_state is set for reproducibility
+smote = SMOTE(sampling_strategy='minority', random_state=42)
+
+mo.md(f"""
+**Applying SMOTE to the training data...**
+- Training set size BEFORE SMOTE: {X_train.shape[0]:,} samples
+- Minority class count BEFORE SMOTE: {min(y_train.sum(), len(y_train) - y_train.sum()):,}
+- Majority class count BEFORE SMOTE: {max(y_train.sum(), len(y_train) - y_train.sum()):,}
+""")
+
+# Apply SMOTE to only the training data
+X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
+
+mo.md(f"""
+- Training set size AFTER SMOTE: {X_train_resampled.shape[0]:,} samples
+- Flat class AFTER SMOTE: {y_train_resampled.sum():,}
+- Detached class AFTER SMOTE: {(~y_train_resampled.astype(bool)).sum():,}
+""")
+```
+
+#### What SMOTE does {#what-smote-does}
+
+1. `smote.fit_resample(X_train, y_train)` analyzes the training data in feature space.
+2. It identifies the minority class (whichever property type is less common).
+3. For each minority-class sample, it finds that sample's nearest minority-class neighbours and creates new, synthetic property records along the lines between them, until both classes have equal representation, thus balancing the training set.
+
+By training on balanced data, the Random Forest model will no longer have a bias toward the majority class, which should significantly improve the model's ability to correctly identify both flats and detached houses, leading to better overall performance metrics.
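+
+Conceptually, each synthetic sample is an interpolation between a real minority-class sample and one of its nearest minority-class neighbours. A minimal sketch with hypothetical feature vectors:
+
+```python
+# SMOTE in one line: synthetic = x + lam * (neighbour - x), with lam in [0, 1]
+rng = np.random.default_rng(42)
+x_sample = np.array([144_720.0, 11.88])     # hypothetical minority sample (price, log_price)
+x_neighbour = np.array([150_000.0, 11.92])  # one of its nearest minority neighbours
+lam = rng.random()
+synthetic = x_sample + lam * (x_neighbour - x_sample)
+```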
+
+### Training a random forest classifier {#training-a-random-forest-classifier}
+
+For our model, we'll use a **Random Forest** classifier, which is an ensemble of decision trees.
+Run the code below to train the model:
+
+```python
+# Initialize and train the model
+model = RandomForestClassifier(
+ n_estimators=100, # Build 100 decision trees
+ max_depth=15, # Each tree max 15 levels deep
+ min_samples_split=20, # Need ≥20 samples to split a node
+ min_samples_leaf=10, # Each leaf must have ≥10 samples
+ random_state=42, # Reproducibility
+ n_jobs=-1, # Use all CPU cores
+)
+
+model.fit(X_train_resampled, y_train_resampled)
+
+# Save model for later use
+os.makedirs("models", exist_ok=True)
+pickle.dump(model, open("models/property_type_classifier.pkl", "wb"))
+
+mo.md("**Model trained successfully!**")
+```
+
+:::tip Time for a coffee break
+This step can take some time.
+We recommend grabbing a warm beverage and reading something interesting on our [blog](https://clickhouse.com/blog) while you wait. ⏱️☕
+:::
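+
+Once training finishes, a quick way to sanity-check what the model learned is to inspect the forest's feature importances (reusing the `feature_columns` list from earlier):
+
+```python
+# Which features does the forest lean on most?
+importances = pd.Series(model.feature_importances_, index=feature_columns)
+print(importances.sort_values(ascending=False))
+```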
+
+## Model evaluation {#model-evaluation}
+
+Now let's test how well our model performs on data it has never seen:
+
+```python
+# Make predictions on test set
+y_pred = model.predict(X_test)
+
+# Calculate various performance metrics
+accuracy = accuracy_score(y_test, y_pred)
+precision = precision_score(y_test, y_pred)
+recall = recall_score(y_test, y_pred)
+f1 = f1_score(y_test, y_pred)
+
+mo.md(f"""
+## Model Performance Metrics
+
+- **Accuracy: {accuracy:.2%}** - Overall correctness of predictions
+- **Precision: {precision:.2%}** - Of all properties predicted as flats, how many were actually flats
+- **Recall: {recall:.2%}** - Of all actual flats, how many did we correctly identify
+- **F1-Score: {f1:.2%}** - Harmonic mean of precision and recall
+""")
+```
+
+The model achieves strong performance with around **87% accuracy**. Let's understand what each metric means:
+
+- **Accuracy** tells us the overall percentage of correct predictions across both classes
+- **Precision** measures how reliable our "flat" predictions are (few false positives means high precision)
+- **Recall** measures how well we catch all actual flats (few false negatives means high recall)
+- **F1-Score** balances precision and recall, giving us a single metric that considers both
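+
+All four numbers can be derived by hand from the confusion matrix counts, which is a useful way to demystify them:
+
+```python
+# Recompute precision, recall, and F1 from raw confusion-matrix counts
+tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
+manual_precision = tp / (tp + fp)  # how reliable the "flat" predictions are
+manual_recall = tp / (tp + fn)     # how many actual flats were caught
+manual_f1 = 2 * manual_precision * manual_recall / (manual_precision + manual_recall)
+print(manual_precision, manual_recall, manual_f1)
+```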
+
+### Classification report {#classification-report}
+
+For a more detailed breakdown, we can generate a full classification report:
+
+```python
+# Generate classification report
+report = classification_report(y_test, y_pred, target_names=['Detached', 'Flat'])
+print(report)
+```
+
+```text
+ precision recall f1-score support
+
+Detached 0.89 0.88 0.89 1333708
+Flat 0.85 0.86 0.85 1040810
+
+accuracy 0.87 2374518
+macro avg 0.87 0.87 0.87 2374518
+weighted avg 0.87 0.87 0.87 2374518
+```
+
+This report shows precision, recall, and F1-score for both classes (flats and detached houses), giving us insights into how well the model performs for each property type.
+
+### Confusion matrix {#confusion-matrix}
+
+A confusion matrix helps us visualize exactly where our model makes mistakes:
+
+```python
+def plot_confusion_matrix():
+ # Calculate confusion matrix
+ cm = confusion_matrix(y_test, y_pred)
+
+ # Create figure
+ fig, ax = plt.subplots(figsize=(8, 6))
+
+ # Plot confusion matrix using seaborn
+ sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
+ xticklabels=['Detached', 'Flat'],
+ yticklabels=['Detached', 'Flat'],
+ ax=ax, cbar_kws={'label': 'Count'})
+
+ # Add labels and title
+ ax.set_xlabel('Predicted Label', fontsize=12, fontweight='bold')
+ ax.set_ylabel('True Label', fontsize=12, fontweight='bold')
+ ax.set_title('Confusion Matrix: Flat vs Detached House Classification',
+ fontsize=14, fontweight='bold', pad=20)
+
+ # Add percentage annotations
+ total = cm.sum()
+ for i in range(2):
+ for j in range(2):
+ text = ax.text(j + 0.5, i + 0.7, f'({cm[i,j]/total*100:.1f}%)',
+ ha="center", va="center", color="gray", fontsize=10)
+
+ plt.tight_layout()
+ plt.show()
+
+ # Print interpretation
+ tn, fp, fn, tp = cm.ravel()
+ mo.md(f"""
+ ### Confusion Matrix Interpretation:
+
+ - **True Negatives (TN):** {tn:,} - Detached houses correctly identified as detached
+ - **False Positives (FP):** {fp:,} - Detached houses incorrectly classified as flats
+ - **False Negatives (FN):** {fn:,} - Flats incorrectly classified as detached
+ - **True Positives (TP):** {tp:,} - Flats correctly identified as flats
+
+ The diagonal values (TN and TP) represent correct predictions, while off-diagonal values represent errors.
+ """)
+
+plot_confusion_matrix()
+```
+
+<Image img={confusion_matrix} size="lg" alt="Confusion matrix for the flat vs detached house classifier"/>
+
+The confusion matrix provides valuable insights:
+- The **darker cells on the diagonal** show correct predictions
+- **Off-diagonal cells** reveal where the model gets confused
+- By examining which type of error is more common (false positives vs false negatives), we can understand the model's biases
+
+### Understanding model performance {#understanding-model-performance}
+
+The table below gives context to what different levels of accuracy mean for classification tasks:
+
+| Accuracy Range | Interpretation |
+|----------------|----------------------------------------------------------------------------------------|
+| 50-60% | Barely better than guessing. The model didn't learn much. |
+| 60-75% | There is some signal, but it is weak. |
+| 75-85% | The model has good performance, and useful patterns were learned. |
+| 85-95% | The model shows strong predictive patterns. |
+| 95%+ | The model is suspiciously performant. This might indicate data leakage or overfitting. |
+
+For our property type classification task, we would expect between **85-95% accuracy** because:
+- Detached houses typically have higher prices than flats
+- Geographic location correlates strongly with property type (urban areas have more flats)
+- Clear separating patterns exist between the two property types
+
+Our model's performance of ~87% accuracy with balanced precision and recall indicates that it has successfully learned to distinguish between flats and detached houses using the features we provided.
+
+## Inference with chDB {#inference-with-chdb}
+
+Now that we have a trained model, let's explore different approaches to make predictions on new data.
+This is where chDB really shines—showing multiple patterns for integrating ML with SQL workflows.
+
+### Loading the trained model {#loading-trained-model}
+
+First, let's load our saved model that we trained earlier:
+
+```python
+# Load the trained model
+with open("models/property_type_classifier.pkl", "rb") as f:
+ loaded_model = pickle.load(f)
+
+# Also load the reference tables for encoding new data
+town_dict = dict(zip(town_ref['town'], town_ref['town_id']))
+district_dict = dict(zip(district_ref['district'], district_ref['district_id']))
+locality_dict = dict(zip(locality_ref['locality'], locality_ref['locality_id']))
+```
+
+### Single property prediction {#single-property-prediction}
+
+Let's create a helper function that can predict the property type for a single property:
+
+```python
+def predict_property_type(price, date, town, district, locality):
+ """
+ Predict if a property is a flat (1) or detached house (0).
+
+ Args:
+ price: Property price in GBP
+ date: Transaction date (YYYY-MM-DD)
+ town: Town name
+ district: District name
+ locality: Locality name
+
+ Returns:
+ Dictionary with prediction and probability
+ """
+ # Convert date to pandas datetime
+ date = pd.to_datetime(date)
+
+ # Extract time features
+ year = date.year
+ month = date.month
+ quarter = date.quarter
+ day_of_week = date.dayofweek + 1 # Monday=1, Sunday=7
+ years_since_2010 = year - 2010
+ log_price = np.log(price)
+
+ # Encode categorical variables
+ town_id = town_dict.get(town, -1)
+ district_id = district_dict.get(district, -1)
+ locality_id = locality_dict.get(locality, -1)
+
+ # Handle unknown locations
+ if town_id == -1 or district_id == -1 or locality_id == -1:
+ return {
+ 'prediction': None,
+ 'property_type': 'Unknown',
+ 'confidence': 0.0,
+ 'error': 'Unknown location - not in training data'
+ }
+
+ # Create feature array
+ features = np.array([[
+ price, log_price, year, month, quarter, day_of_week,
+ town_id, district_id, locality_id, years_since_2010
+ ]])
+
+ # Make prediction
+ prediction = loaded_model.predict(features)[0]
+ probability = loaded_model.predict_proba(features)[0]
+
+ return {
+ 'prediction': int(prediction),
+ 'property_type': 'Flat' if prediction == 1 else 'Detached House',
+ 'confidence': float(probability[prediction]),
+ 'flat_probability': float(probability[1]),
+ 'detached_probability': float(probability[0])
+ }
+
+# Example: Predict for a £250,000 property in London
+result = predict_property_type(
+ price=250000,
+ date='2024-01-15',
+ town='LONDON',
+ district='CAMDEN',
+ locality='CAMDEN'
+)
+
+mo.md(f"""
+### Prediction Result:
+- **Property Type:** {result['property_type']}
+- **Confidence:** {result['confidence']:.1%}
+- **Flat Probability:** {result['flat_probability']:.1%}
+- **Detached Probability:** {result['detached_probability']:.1%}
+""")
+```
+
+### Batch predictions with chDB {#batch-predictions-chdb}
+
+For multiple properties, we can use chDB to handle feature engineering and then use our model for batch predictions:
+
+```python
+# Sample new properties to predict
+new_properties = pd.DataFrame({
+ 'price': [250000, 450000, 180000, 650000, 320000],
+ 'date': ['2024-01-15', '2024-02-20', '2024-03-10', '2024-04-05', '2024-05-12'],
+ 'town': ['LONDON', 'MANCHESTER', 'BIRMINGHAM', 'OXFORD', 'LEEDS'],
+ 'district': ['CAMDEN', 'MANCHESTER', 'BIRMINGHAM', 'OXFORD', 'LEEDS'],
+ 'locality': ['CAMDEN', 'MANCHESTER', 'BIRMINGHAM', 'OXFORD', 'LEEDS']
+})
+
+# Use chDB for feature engineering
+feature_query = """
+SELECT
+ price,
+ log(price) AS log_price,
+ toYear(date::Date) AS year,
+ toMonth(date::Date) AS month,
+ toQuarter(date::Date) AS quarter,
+ toDayOfWeek(date::Date) AS day_of_week,
+ toYear(date::Date) - 2010 AS years_since_2010,
+ town,
+ district,
+ locality
+FROM Python(new_properties)
+"""
+
+features_df = chdb.query(feature_query, "DataFrame")
+
+# Encode categorical variables
+features_df['town_id'] = features_df['town'].map(town_dict)
+features_df['district_id'] = features_df['district'].map(district_dict)
+features_df['locality_id'] = features_df['locality'].map(locality_dict)
+
+# Prepare features for prediction
+X_new = features_df[[
+ 'price', 'log_price', 'year', 'month', 'quarter',
+ 'day_of_week', 'town_id', 'district_id', 'locality_id',
+ 'years_since_2010'
+]]
+
+# Make predictions
+predictions = loaded_model.predict(X_new)
+probabilities = loaded_model.predict_proba(X_new)
+
+# Add predictions back to original data
+new_properties['predicted_type'] = ['Flat' if p == 1 else 'Detached' for p in predictions]
+new_properties['flat_probability'] = probabilities[:, 1]
+new_properties['detached_probability'] = probabilities[:, 0]
+new_properties['confidence'] = [probabilities[i, predictions[i]] for i in range(len(predictions))]
+
+mo.ui.table(new_properties, label="Batch Predictions")
+```
+
+## Inference with ClickHouse user-defined functions (UDFs) {#inference-with-clickhouse-udfs}
+
+ClickHouse supports user-defined functions (UDFs) that let you invoke machine learning models directly at insert or query time.
+
+### Implementation steps {#implementation-steps}
+
+<VerticalStepper headerLevel="h4">
+
+#### Create a Python UDF Script {#create-python-udf}
+
+Create a Python script that reads data from stdin, loads your model, and outputs predictions to stdout.
+Save this in ClickHouse's `user_scripts` directory (typically `/var/lib/clickhouse/user_scripts/`):
+
+:::tip
+Make sure that the file is executable
+:::
+
+```python title="/var/lib/clickhouse/user_scripts/predict_property_type.py"
+#!/usr/bin/env python3
+import sys
+import pickle
+import os
+import numpy as np
+
+# Use absolute path to model - place your model file here
+MODEL_PATH = "/var/lib/clickhouse/user_scripts/property_type_classifier.pkl"
+
+# Load the model once (for executable_pool type)
+try:
+ with open(MODEL_PATH, "rb") as f:
+ model = pickle.load(f)
+except Exception as e:
+ # Log error to stderr (will appear in ClickHouse logs)
+ sys.stderr.write(f"Error loading model from {MODEL_PATH}: {e}\n")
+ sys.stderr.flush()
+ sys.exit(1)
+
+def main():
+ try:
+ # Read chunk length for batching
+ for number in sys.stdin:
+ length = int(number.strip())
+
+ # Collect rows in batch
+ features_batch = []
+ for _ in range(length):
+ line = sys.stdin.readline().strip()
+ if not line:
+ continue
+
+ # Parse TabSeparated input: 10 features expected
+ values = line.split('\t')
+
+ if len(values) != 10:
+ sys.stderr.write(f"Expected 10 features, got {len(values)}: {values}\n")
+ sys.stderr.flush()
+ continue
+
+ # Convert to appropriate types
+ features = [
+ float(values[0]), # price
+ float(values[1]), # log_price
+ int(values[2]), # year
+ int(values[3]), # month
+ int(values[4]), # quarter
+ int(values[5]), # day_of_week
+ int(values[6]), # town_id
+ int(values[7]), # district_id
+ int(values[8]), # locality_id
+ int(values[9]) # years_since_2010
+ ]
+ features_batch.append(features)
+
+ if not features_batch:
+ continue
+
+ # Batch prediction
+ predictions = model.predict(np.array(features_batch))
+
+ # Output predictions (one per line)
+ for pred in predictions:
+ print(int(pred))
+
+ sys.stdout.flush()
+
+ except Exception as e:
+ sys.stderr.write(f"Error during prediction: {e}\n")
+ sys.stderr.flush()
+ sys.exit(1)
+
+if __name__ == "__main__":
+ main()
+```
+
+**Setup Steps:**
+
+1. **Install required Python dependencies** for the ClickHouse user:
+
+ :::important
+ This is the most common cause of `CHILD_WAS_NOT_EXITED_NORMALLY` and `ModuleNotFoundError` errors!
+ :::
+
+ ```bash
+ # Install for the clickhouse user
+ sudo -u clickhouse python3 -m pip install numpy scikit-learn
+
+ # Or install system-wide if the above doesn't work
+ sudo python3 -m pip install numpy scikit-learn
+ ```
+
+ Verify the installation:
+ ```bash
+ sudo -u clickhouse python3 -c "import sklearn; import numpy; print('Dependencies OK')"
+ ```
+
+2. **Copy the model file** to the ClickHouse user_scripts directory:
+
+ ```bash
+ sudo cp models/property_type_classifier.pkl /var/lib/clickhouse/user_scripts/
+ ```
+
+3. **Make the script executable**:
+
+ ```bash
+ sudo chmod +x /var/lib/clickhouse/user_scripts/predict_property_type.py
+ ```
+
+4. **Set proper ownership**:
+
+ ```bash
+ sudo chown clickhouse:clickhouse /var/lib/clickhouse/user_scripts/predict_property_type.py
+ sudo chown clickhouse:clickhouse /var/lib/clickhouse/user_scripts/property_type_classifier.pkl
+ ```
+
+5. **Test the script manually** to verify it works:
+
+```bash
+echo -e "1\n250000\t12.429\t2023\t5\t2\t3\t100\t50\t25\t13" | \
+/var/lib/clickhouse/user_scripts/predict_property_type.py
+```
+
+This should output `0` or `1` (the prediction).
+
+<details>
+
+
+<summary>Common issues</summary>
+
+If you get `CHILD_WAS_NOT_EXITED_NORMALLY` error:
+
+- **Missing dependencies**: Make sure numpy and scikit-learn are installed for the user running ClickHouse:
+
+```bash
+sudo -u clickhouse python3 -m pip install numpy scikit-learn
+```
+
+- **Model not found**: Verify the model file exists and that the path is correct:
+
+```bash
+ls -la /var/lib/clickhouse/user_scripts/property_type_classifier.pkl
+```
+
+- **Permission issues**: Check the script is executable and owned by clickhouse user
+
+- **Check ClickHouse logs** for stderr output from the script:
+
+```bash
+sudo tail -f /var/log/clickhouse-server/clickhouse-server.err.log
+```
+
+</details>
+
+#### Configure the UDF {#configure-the-udf}
+
+Create an XML configuration file in `/etc/clickhouse-server/` (ending with `_function.xml`)
+
+```xml title="predict_property_type_function.xml"
+<functions>
+    <function>
+        <type>executable_pool</type>
+        <name>predict_is_flat</name>
+        <return_type>UInt8</return_type>
+        <argument><type>Float64</type></argument> <!-- price -->
+        <argument><type>Float64</type></argument> <!-- log_price -->
+        <argument><type>Int32</type></argument>   <!-- year -->
+        <argument><type>Int32</type></argument>   <!-- month -->
+        <argument><type>Int32</type></argument>   <!-- quarter -->
+        <argument><type>Int32</type></argument>   <!-- day_of_week -->
+        <argument><type>Int32</type></argument>   <!-- town_id -->
+        <argument><type>Int32</type></argument>   <!-- district_id -->
+        <argument><type>Int32</type></argument>   <!-- locality_id -->
+        <argument><type>Int32</type></argument>   <!-- years_since_2010 -->
+        <format>TabSeparated</format>
+        <command>predict_property_type.py</command>
+        <send_chunk_header>1</send_chunk_header>
+        <pool_size>10</pool_size>
+        <command_read_timeout>10000</command_read_timeout>
+        <command_termination_timeout>10</command_termination_timeout>
+    </function>
+</functions>
+```
+
+**After creating the XML file:**
+
+1. **Restart ClickHouse** to load the new function:
+
+```bash
+sudo systemctl restart clickhouse-server
+```
+
+2. **Verify the function is registered**:
+
+```sql
+SELECT name, origin FROM system.functions WHERE name = 'predict_is_flat';
+```
+
+You should see:
+
+```response
+┌─name────────────┬─origin────────────────┐
+│ predict_is_flat │ ExecutableUserDefined │
+└─────────────────┴───────────────────────┘
+```
+
+#### Use the UDF in SQL Queries {#use-udfs-in-SQL-queries}
+
+With the setup complete, let's see how our newly defined `predict_is_flat` UDF performs on one of the existing rows from our dataset:
+
+```response
+┌─price─┬─type─┬───────date─┬─town─────┬─district───┬─locality─┐
+│ 39950 │ flat │ 1996-08-13 │ NEW QUAY │ CEREDIGION │ NEW QUAY │
+└───────┴──────┴────────────┴──────────┴────────────┴──────────┘
+```
+
+For this row, our encoding for `NEW QUAY` (town) is 710, `CEREDIGION` is 74, and `NEW QUAY` (locality) is 13899:
+
+```sql
+-- Single property prediction
+WITH
+    39950 AS price,
+    '1996-08-13'::Date AS sale_date
+SELECT
+    predict_is_flat(
+        price,
+        log(price),
+        toYear(sale_date),
+        toMonth(sale_date),
+        toQuarter(sale_date),
+        toDayOfWeek(sale_date),
+        710,
+        74,
+        13899,
+        toYear(sale_date) - 2010
+    ) AS predicted_is_flat
+```
+
+The function returns the prediction:
+
+```response
+┌─predicted_is_flat─┐
+│ 1 │
+└───────────────────┘
+```
+
+</VerticalStepper>
+
+## Inference with ClickHouse user-defined functions (UDFs) in Cloud (coming soon) {#cloud-udfs}
+
+:::tip
+UDFs in ClickHouse Cloud are currently in private preview.
+This guide will be updated when executable UDFs in Cloud become generally available.
+:::
diff --git a/scripts/aspell-ignore/en/aspell-dict.txt b/scripts/aspell-ignore/en/aspell-dict.txt
index 357113ad3e6..b2b6efa0342 100644
--- a/scripts/aspell-ignore/en/aspell-dict.txt
+++ b/scripts/aspell-ignore/en/aspell-dict.txt
@@ -1,8 +1,20 @@
-personal_ws-1.1 en 3816
+personal_ws-1.1 en 3836
AArch
+ABBERD
+ABBERLEY
+ABBERTON
+ABERAERON
+ABERCONWY
+ABERDARE
+ABERDOVEY
+ABERGAVENNY
ACLs
+ADUR
AICPA
+ALLERDALE
+ALNWICK
ALTERs
+ALYN
AMPLab
AMQP
ANNIndex
@@ -285,6 +297,7 @@ DDLWorker
DDLWorkerThreads
DDLWorkerThreadsActive
DDLs
+DEESIDE
DECRYPT
DELETEs
DESC
@@ -676,6 +689,7 @@ KeeperAliveConnections
KeeperMap
KeeperOutstandingRequests
Kerberos
+KETTLEBY
Khanna
Kibana
Kinesis
@@ -1243,6 +1257,7 @@ Sapchuk
Scalable
Scatterplot
Schaefer
Schedulable
SchemaInferenceMode
Schemas
+Scikit
@@ -2366,6 +2381,7 @@ hadoop
halfMD
halfday
hardcoded
+hardcoding
hardlink
hardlinked
hardlinks
@@ -2841,6 +2857,7 @@ nullables
nullptr
num
numerics
+numpy
nypd
obfuscator
observability
@@ -3060,6 +3077,7 @@ quantiletdigestweighted
quantiletiming
quantiletimingweighted
quartile
+quartiles
queryID
queryString
queryStringAndFragment
@@ -3226,6 +3244,7 @@ scala
scalable
sccache
schemaless
schemas
+scikit
seekable
seektable
@@ -3459,6 +3478,7 @@ tcp
tcpPort
tcpnodelay
templateignorespaces
+templated
textAlign
tgamma
tgz
diff --git a/static/images/use-cases/AI_ML/Scikit/confusion_matrix.png b/static/images/use-cases/AI_ML/Scikit/confusion_matrix.png
new file mode 100644
index 00000000000..d09aa1ed6f3
Binary files /dev/null and b/static/images/use-cases/AI_ML/Scikit/confusion_matrix.png differ
diff --git a/static/images/use-cases/AI_ML/Scikit/property_price_distributions.png b/static/images/use-cases/AI_ML/Scikit/property_price_distributions.png
new file mode 100644
index 00000000000..3fe1a832ab8
Binary files /dev/null and b/static/images/use-cases/AI_ML/Scikit/property_price_distributions.png differ
diff --git a/static/images/use-cases/AI_ML/Scikit/property_type_composition.png b/static/images/use-cases/AI_ML/Scikit/property_type_composition.png
new file mode 100644
index 00000000000..84287a1a2a0
Binary files /dev/null and b/static/images/use-cases/AI_ML/Scikit/property_type_composition.png differ
diff --git a/static/images/use-cases/AI_ML/Scikit/statistics.png b/static/images/use-cases/AI_ML/Scikit/statistics.png
new file mode 100644
index 00000000000..7f11e7778a0
Binary files /dev/null and b/static/images/use-cases/AI_ML/Scikit/statistics.png differ