Skip to content

Commit 205f3f6

Browse files
Merge branch 'tensorzero:main' into main
2 parents 47d28d9 + 57b024e commit 205f3f6

File tree

273 files changed

+15402
-1695
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

273 files changed

+15402
-1695
lines changed

.config/nextest.toml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,10 @@ slow-timeout = { period = "10s", terminate-after = 3 }
1111
# On CI, we use the 'ci' profile, which runs all tests.
1212
default-filter = "not test(no_aws_credentials)"
1313

14+
[[profile.default.overrides]]
15+
filter = 'test(evaluations)'
16+
slow-timeout = { period = "20s", terminate-after = 3 }
17+
1418
# Profiles config
1519
# We use these profiles to define our major test groups.
1620
# By using `default-filter` to specify our tests, we can further restrict the tests
@@ -36,6 +40,8 @@ default-filter = 'binary(optimization-live)'
3640
[profile.optimization-mock]
3741
default-filter = 'binary(optimization-mock)'
3842

43+
[profile.ci-unit.junit]
44+
path = "junit.xml"
3945

4046
[[profile.optimization.overrides]]
4147
# Settings for running optimization tests

.github/workflows/batch-test.yml

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,19 @@ jobs:
4141
steps:
4242
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
4343

44-
- uses: dtolnay/rust-toolchain@stable
44+
- name: Install Rust toolchain
45+
run: |
46+
for attempt in 1 2 3; do
47+
if rustup toolchain install stable && rustup default stable; then
48+
break
49+
fi
50+
if [ $attempt -eq 3 ]; then
51+
echo "Failed to install Rust toolchain after 3 attempts"
52+
exit 1
53+
fi
54+
sleep $((10 * attempt))
55+
done
56+
shell: bash
4557

4658
- name: Login to DockerHub
4759
uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772

.github/workflows/build-gateway-container.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ on:
55

66
jobs:
77
build-gateway-container:
8-
runs-on: namespace-profile-tensorzero-8x16;overrides.cache-tag=build-gateway-cache
8+
runs-on: namespace-profile-tensorzero-2x8;overrides.cache-tag=build-gateway-cache
99

1010
steps:
1111
# TODO - investigate why using the Namespace checkout action causes
@@ -36,4 +36,4 @@ jobs:
3636
path: gateway-container.tar
3737
retention-days: 1
3838
if-no-files-found: error
39-
overwrite: false
39+
overwrite: false

.github/workflows/build-ui-container.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ on:
55

66
jobs:
77
build-ui-container:
8-
runs-on: namespace-profile-tensorzero-8x16;overrides.cache-tag=build-ui-cache
8+
runs-on: namespace-profile-tensorzero-2x8;overrides.cache-tag=build-ui-cache
99

1010
steps:
1111
- uses: namespacelabs/nscloud-checkout-action@953fed31a6113cc2347ca69c9d823743c65bc84b
@@ -33,4 +33,4 @@ jobs:
3333
path: ui-container.tar
3434
retention-days: 1
3535
if-no-files-found: error
36-
overwrite: false
36+
overwrite: false
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
name: Buildkite Test Suite
2+
run-name: "Buildkite Test Suite: ${{ github.event.pull_request.title || github.ref }}"
3+
4+
on:
5+
merge_group:
6+
pull_request:
7+
branches: ["main"]
8+
9+
jobs:
10+
unit-tests:
11+
# Run for non-PR events, or PRs where the head repo is the same as this repo (not a fork).
12+
if: ${{ github.event_name != 'pull_request' || (github.event.pull_request.head.repo.full_name == github.repository && github.actor != 'dependabot[bot]')}}
13+
runs-on: ubuntu-latest
14+
steps:
15+
- name: Trigger a Buildkite Build on Push using v2.0.0
16+
uses: buildkite/trigger-pipeline-action@v2.0.0
17+
with:
18+
buildkite_api_access_token: ${{ secrets.TRIGGER_BK_BUILD_TOKEN }}
19+
pipeline: "tensorzero/pr-tests"

.github/workflows/caching-hack.yml

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,19 @@ jobs:
2525
runs-on: ${{ matrix.runner }}
2626
steps:
2727
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
28-
- uses: dtolnay/rust-toolchain@stable
28+
- name: Install Rust toolchain
29+
run: |
30+
for attempt in 1 2 3; do
31+
if rustup toolchain install stable && rustup default stable; then
32+
break
33+
fi
34+
if [ $attempt -eq 3 ]; then
35+
echo "Failed to install Rust toolchain after 3 attempts"
36+
exit 1
37+
fi
38+
sleep $((10 * attempt))
39+
done
40+
shell: bash
2941
- uses: Swatinem/rust-cache@9d47c6ad4b02e050fd481d890b2ea34778fd09d6
3042
with:
3143
cache-provider: "buildjet"
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
name: Cancel stale merge queue workflows
2+
on:
3+
merge_group:
4+
types:
5+
- destroyed
6+
7+
permissions:
8+
actions: write
9+
contents: read
10+
11+
jobs:
12+
cancel-workflows:
13+
name: Cancel Workflow Runs
14+
runs-on: ubuntu-latest
15+
if: github.event.reason != 'merged'
16+
steps:
17+
- name: Get Merge Queue Commit SHA
18+
id: get-sha
19+
run: |
20+
echo "Repository: ${{ github.repository }}"
21+
echo "Head SHA: ${{ github.sha }}"
22+
echo "Merge Group Reason: ${{ github.event.reason }}"
23+
24+
- name: Cancel Workflow Runs by SHA
25+
run: |
26+
# Get all workflow runs for the specific SHA
27+
workflow_runs=$(curl -s -H "Authorization: Bearer ${{ github.token }}" \
28+
"https://api.github.com/repos/${{ github.repository }}/actions/runs?head_sha=${{ github.sha }}")
29+
30+
# Extract run IDs and cancel them (except current workflow)
31+
current_run_id="${{ github.run_id }}"
32+
echo "Current Run ID: $current_run_id"
33+
echo "$workflow_runs" | jq -r '.workflow_runs[] | select(.status != "completed") | .id' | while read -r run_id; do
34+
echo "Run ID: $run_id"
35+
if [ -n "$run_id" ] && [ "$run_id" != "$current_run_id" ]; then
36+
echo "Cancelling workflow run $run_id"
37+
curl -X POST -H "Authorization: Bearer ${{ github.token }}" \
38+
"https://api.github.com/repos/${{ github.repository }}/actions/runs/$run_id/cancel"
39+
fi
40+
done

.github/workflows/general.yml

Lines changed: 89 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@ run-name: "General Checks for: ${{ github.event.pull_request.title || github.ref
33

44
on:
55
merge_group:
6+
paths-ignore:
7+
- "docs/**"
68
pull_request:
79
branches: ["main"]
810

@@ -109,7 +111,18 @@ jobs:
109111
node-version: "22.9.0"
110112

111113
- name: Install pnpm
112-
uses: pnpm/action-setup@a7487c7e89a18df4991f7f222e4898a00d66ddda
114+
run: |
115+
for attempt in 1 2 3; do
116+
if npm install -g pnpm@latest; then
117+
break
118+
fi
119+
if [ $attempt -eq 3 ]; then
120+
echo "Failed to install pnpm after 3 attempts"
121+
exit 1
122+
fi
123+
sleep $((10 * attempt))
124+
done
125+
shell: bash
113126

114127
- name: Install dependencies
115128
working-directory: internal/tensorzero-node
@@ -138,7 +151,19 @@ jobs:
138151
runs-on: windows-latest
139152
steps:
140153
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
141-
- uses: dtolnay/rust-toolchain@stable
154+
- name: Install Rust toolchain
155+
run: |
156+
for attempt in 1 2 3; do
157+
if rustup toolchain install stable && rustup default stable; then
158+
break
159+
fi
160+
if [ $attempt -eq 3 ]; then
161+
echo "Failed to install Rust toolchain after 3 attempts"
162+
exit 1
163+
fi
164+
sleep $((10 * attempt))
165+
done
166+
shell: bash
142167
- uses: Swatinem/rust-cache@9d47c6ad4b02e050fd481d890b2ea34778fd09d6
143168
with:
144169
cache-provider: "buildjet"
@@ -147,29 +172,27 @@ jobs:
147172
run: cargo build --workspace
148173

149174
validate:
150-
runs-on: namespace-profile-tensorzero-8x16
175+
runs-on: namespace-profile-tensorzero-32x64
151176

152177
timeout-minutes: 30
153178

154179
steps:
155180
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
156181

157-
# We allow the namespace builder setup to fail on Dependabot PRs and PRs from forks
158-
# (where the oidc token is not available)
159-
160-
- name: Install Namespace CLI
161-
uses: namespacelabs/nscloud-setup@d1c625762f7c926a54bd39252efff0705fd11c64
162-
continue-on-error: ${{ github.event.pull_request.head.repo.full_name != github.repository || github.actor == 'dependabot[bot]' }}
163-
164-
- name: Configure Namespace-powered Buildx
165-
uses: namespacelabs/nscloud-setup-buildx-action@84ca8c58fdf372d6a4750476cd09b7b96ee778ca
166-
continue-on-error: ${{ github.event.pull_request.head.repo.full_name != github.repository || github.actor == 'dependabot[bot]' }}
167-
168182
# We deliberately install our MSRV here (rather than 'stable') to ensure that everything compiles with that version
169183
- name: Install Rust 1.86.0
170184
run: |
171-
rustup install 1.86.0 --component clippy,rustfmt
172-
rustup default 1.86.0
185+
for attempt in 1 2 3; do
186+
if rustup install 1.86.0 --component clippy,rustfmt && rustup default 1.86.0; then
187+
break
188+
fi
189+
if [ $attempt -eq 3 ]; then
190+
echo "Failed to install Rust 1.86.0 after 3 attempts"
191+
exit 1
192+
fi
193+
sleep $((10 * attempt))
194+
done
195+
shell: bash
173196

174197
- name: Print Rust version
175198
run: rustc --version
@@ -180,10 +203,21 @@ jobs:
180203
node-version: "22.9.0"
181204

182205
- name: Install pnpm
183-
uses: pnpm/action-setup@a7487c7e89a18df4991f7f222e4898a00d66ddda
206+
run: |
207+
for attempt in 1 2 3; do
208+
if npm install -g pnpm@latest; then
209+
break
210+
fi
211+
if [ $attempt -eq 3 ]; then
212+
echo "Failed to install pnpm after 3 attempts"
213+
exit 1
214+
fi
215+
sleep $((10 * attempt))
216+
done
217+
shell: bash
184218

185219
- name: Install uv
186-
run: curl -LsSf https://astral.sh/uv/0.6.17/install.sh | sh
220+
run: curl -LsSf --retry 2 --retry-delay 10 --retry-max-time 60 https://astral.sh/uv/0.6.17/install.sh | sh
187221

188222
- name: Configure Namespace cache for Rust, Python (pip), and pnpm
189223
uses: namespacelabs/nscloud-cache-action@2f50e7d0f70475e6f59a55ba0f05eec9108e77cc
@@ -359,23 +393,29 @@ jobs:
359393
# We don't run many tests here, so use a normal runner with Github Actions caching
360394
# to avoid unnecessarily using Namespace credits (it should still always finish before
361395
# the main 'validate' job)
362-
runs-on: ${{ matrix.replicated && 'namespace-profile-tensorzero-16x32' || 'ubuntu-latest' }}
396+
runs-on: ${{ matrix.replicated && 'namespace-profile-tensorzero-32x64' || 'namespace-profile-tensorzero-16x32' }}
363397
continue-on-error: ${{ matrix.clickhouse_version.allow_failure }}
364398
strategy:
365399
matrix:
366400
# Only include replicated: true when running in merge queue
367401
replicated: ${{ github.event_name == 'merge_group' && fromJSON('[true, false]') || fromJSON('[false]') }}
368-
clickhouse_version:
369-
- tag: "24.12-alpine"
370-
prefix: "24.12"
371-
allow_failure: false
372-
- tag: "latest-alpine"
373-
prefix: ""
374-
allow_failure: false
402+
clickhouse_version: ${{ github.event_name == 'merge_group' && fromJSON('[{"tag":"24.12-alpine","prefix":"24.12","allow_failure":false},{"tag":"25.7-alpine","prefix":"25.7","allow_failure":false}]') || fromJSON('[{"tag":"25.7-alpine","prefix":"25.7","allow_failure":false}]') }}
375403

376404
steps:
377405
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
378-
- uses: dtolnay/rust-toolchain@stable
406+
- name: Install Rust toolchain
407+
run: |
408+
for attempt in 1 2 3; do
409+
if rustup toolchain install stable && rustup default stable; then
410+
break
411+
fi
412+
if [ $attempt -eq 3 ]; then
413+
echo "Failed to install Rust toolchain after 3 attempts"
414+
exit 1
415+
fi
416+
sleep $((10 * attempt))
417+
done
418+
shell: bash
379419
- uses: Swatinem/rust-cache@9d47c6ad4b02e050fd481d890b2ea34778fd09d6
380420
with:
381421
cache-provider: "buildjet"
@@ -397,7 +437,7 @@ jobs:
397437
run: echo "TENSORZERO_CLICKHOUSE_CLUSTER_NAME=tensorzero_e2e_tests_cluster" >> $GITHUB_ENV
398438

399439
- name: Install uv
400-
run: curl -LsSf https://astral.sh/uv/0.6.17/install.sh | sh
440+
run: curl -LsSf --retry 2 --retry-delay 10 --retry-max-time 60 https://astral.sh/uv/0.6.17/install.sh | sh
401441

402442
- name: Download ClickHouse fixtures
403443
run: uv run ./ui/fixtures/download-fixtures.py
@@ -431,7 +471,7 @@ jobs:
431471
run: |
432472
cargo run-e2e > e2e_logs.txt 2>&1 &
433473
count=0
434-
max_attempts=10
474+
max_attempts=20
435475
while ! curl -s -f http://localhost:3000/health >/dev/null 2>&1; do
436476
echo "Waiting for gateway to be healthy..."
437477
sleep 1
@@ -449,7 +489,7 @@ jobs:
449489
cargo run-e2e --run-migrations-only &&
450490
cargo run-e2e > e2e_logs.txt 2>&1 &
451491
count=0
452-
max_attempts=10
492+
max_attempts=40
453493
while ! curl -s -f http://localhost:3000/health >/dev/null 2>&1; do
454494
echo "Waiting for gateway to be healthy..."
455495
sleep 1
@@ -523,7 +563,19 @@ jobs:
523563

524564
steps:
525565
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
526-
- uses: dtolnay/rust-toolchain@stable
566+
- name: Install Rust toolchain
567+
run: |
568+
for attempt in 1 2 3; do
569+
if rustup toolchain install stable && rustup default stable; then
570+
break
571+
fi
572+
if [ $attempt -eq 3 ]; then
573+
echo "Failed to install Rust toolchain after 3 attempts"
574+
exit 1
575+
fi
576+
sleep $((10 * attempt))
577+
done
578+
shell: bash
527579
- uses: Swatinem/rust-cache@9d47c6ad4b02e050fd481d890b2ea34778fd09d6
528580
with:
529581
cache-provider: "buildjet"
@@ -535,7 +587,7 @@ jobs:
535587
tool: cargo-nextest
536588

537589
- name: Install uv
538-
run: curl -LsSf https://astral.sh/uv/0.6.17/install.sh | sh
590+
run: curl -LsSf --retry 2 --retry-delay 10 --retry-max-time 60 https://astral.sh/uv/0.6.17/install.sh | sh
539591

540592
- name: Download ClickHouse fixtures
541593
run: uv run ./ui/fixtures/download-fixtures.py
@@ -594,6 +646,8 @@ jobs:
594646
check-python-client-build,
595647
check-node-bindings,
596648
build-windows,
649+
build-ui-container,
650+
build-gateway-container,
597651
validate,
598652
clickhouse-tests,
599653
ui-tests,
@@ -602,5 +656,7 @@ jobs:
602656
]
603657
runs-on: ubuntu-latest
604658
steps:
605-
- if: ${{ contains(needs.*.result, 'failure') || contains(needs.*.result, 'cancelled') }}
659+
# When running in the merge queue, jobs should never be skipped.
660+
# In PR CI, some jobs may be intentionally skipped (e.g. due to running from a fork, or to save money)
661+
# Fail on any failed or cancelled dependency; in the merge queue, additionally fail on skipped ones.
# Each `contains(...)` call is closed before the next `||` so the 'skipped' clause is actually evaluated.
- if: ${{ contains(needs.*.result, 'failure') || contains(needs.*.result, 'cancelled') || (github.event_name == 'merge_group' && contains(needs.*.result, 'skipped')) }}
606662
run: exit 1

0 commit comments

Comments
 (0)