@@ -3,6 +3,8 @@ run-name: "General Checks for: ${{ github.event.pull_request.title || github.ref
33
44on :
55 merge_group :
6+ paths-ignore :  # NOTE(review): path filters are documented for push/pull_request events — confirm merge_group honors this
7+ - " docs/**"
68 pull_request :
79 branches : ["main"]
810
@@ -109,7 +111,18 @@ jobs:
109111 node-version : " 22.9.0"
110112
111113 - name : Install pnpm  # NOTE(review): installs unpinned `pnpm@latest`, whereas the replaced action was SHA-pinned — consider pinning a version
112- uses : pnpm/action-setup@a7487c7e89a18df4991f7f222e4898a00d66ddda
114+ run : |  # global npm install of pnpm, up to 3 attempts; waits 10s then 20s between tries, exits 1 after the third failure
115+ for attempt in 1 2 3; do
116+ if npm install -g pnpm@latest; then
117+ break
118+ fi
119+ if [ $attempt -eq 3 ]; then
120+ echo "Failed to install pnpm after 3 attempts"
121+ exit 1
122+ fi
123+ sleep $((10 * attempt))
124+ done
125+ shell : bash
113126
114127 - name : Install dependencies
115128 working-directory : internal/tensorzero-node
@@ -138,7 +151,19 @@ jobs:
138151 runs-on : windows-latest
139152 steps :
140153 - uses : actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
141- - uses : dtolnay/rust-toolchain@stable
154+ - name : Install Rust toolchain  # installs the stable toolchain with rustup and makes it the default
155+ run : |  # up to 3 attempts; waits 10s then 20s between tries, exits 1 after the third failure
156+ for attempt in 1 2 3; do
157+ if rustup toolchain install stable && rustup default stable; then
158+ break
159+ fi
160+ if [ $attempt -eq 3 ]; then
161+ echo "Failed to install Rust toolchain after 3 attempts"
162+ exit 1
163+ fi
164+ sleep $((10 * attempt))
165+ done
166+ shell : bash  # the loop is POSIX shell syntax, so bash is requested explicitly on this windows-latest job
142167 - uses : Swatinem/rust-cache@9d47c6ad4b02e050fd481d890b2ea34778fd09d6
143168 with :
144169 cache-provider : " buildjet"
@@ -147,29 +172,27 @@ jobs:
147172 run : cargo build --workspace
148173
149174 validate :
150- runs-on : namespace-profile-tensorzero-8x16
175+ runs-on : namespace-profile-tensorzero-32x64
151176
152177 timeout-minutes : 30
153178
154179 steps :
155180 - uses : actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
156181
157- # We allow the namespace builder setup to fail on Dependabot PRs and PRs from forks
158- # (where the oidc token is not available)
159-
160- - name : Install Namespace CLI
161- uses : namespacelabs/nscloud-setup@d1c625762f7c926a54bd39252efff0705fd11c64
162- continue-on-error : ${{ github.event.pull_request.head.repo.full_name != github.repository || github.actor == 'dependabot[bot]' }}
163-
164- - name : Configure Namespace-powered Buildx
165- uses : namespacelabs/nscloud-setup-buildx-action@84ca8c58fdf372d6a4750476cd09b7b96ee778ca
166- continue-on-error : ${{ github.event.pull_request.head.repo.full_name != github.repository || github.actor == 'dependabot[bot]' }}
167-
168182 # We deliberately install our MSRV here (rather than 'stable') to ensure that everything compiles with that version
169183 - name : Install Rust 1.86.0  # installs 1.86.0 with the clippy and rustfmt components and makes it the default
170184 run : |  # the whole install+default pair is retried: up to 3 attempts, waiting 10s then 20s, exit 1 after the third failure
171- rustup install 1.86.0 --component clippy,rustfmt
172- rustup default 1.86.0
185+ for attempt in 1 2 3; do
186+ if rustup install 1.86.0 --component clippy,rustfmt && rustup default 1.86.0; then
187+ break
188+ fi
189+ if [ $attempt -eq 3 ]; then
190+ echo "Failed to install Rust 1.86.0 after 3 attempts"
191+ exit 1
192+ fi
193+ sleep $((10 * attempt))
194+ done
195+ shell : bash
173196
174197 - name : Print Rust version
175198 run : rustc --version
@@ -180,10 +203,21 @@ jobs:
180203 node-version : " 22.9.0"
181204
182205 - name : Install pnpm  # NOTE(review): installs unpinned `pnpm@latest`, whereas the replaced action was SHA-pinned — consider pinning a version
183- uses : pnpm/action-setup@a7487c7e89a18df4991f7f222e4898a00d66ddda
206+ run : |  # global npm install of pnpm, up to 3 attempts; waits 10s then 20s between tries, exits 1 after the third failure
207+ for attempt in 1 2 3; do
208+ if npm install -g pnpm@latest; then
209+ break
210+ fi
211+ if [ $attempt -eq 3 ]; then
212+ echo "Failed to install pnpm after 3 attempts"
213+ exit 1
214+ fi
215+ sleep $((10 * attempt))
216+ done
217+ shell : bash
184218
185219 - name : Install uv  # downloads the uv 0.6.17 installer script (curl retries the download) and pipes it to sh
186- run : curl -LsSf https://astral.sh/uv/0.6.17/install.sh | sh
220+ run : set -o pipefail; curl -LsSf --retry 2 --retry-delay 10 --retry-max-time 60 https://astral.sh/uv/0.6.17/install.sh | sh  # pipefail: a failed download now fails this step instead of being masked by `sh` exiting 0
187221
188222 - name : Configure Namespace cache for Rust, Python (pip), and pnpm
189223 uses : namespacelabs/nscloud-cache-action@2f50e7d0f70475e6f59a55ba0f05eec9108e77cc
@@ -359,23 +393,29 @@ jobs:
359393 # Both variants of this job run on Namespace runners: the replicated variant
360394 # (merge queue only) uses the 32x64 profile and the non-replicated one uses 16x32
361395 # (it should still always finish before the main 'validate' job)
362- runs-on : ${{ matrix.replicated && 'namespace-profile-tensorzero-16x32 ' || 'ubuntu-latest ' }}
396+ runs-on : ${{ matrix.replicated && 'namespace-profile-tensorzero-32x64 ' || 'namespace-profile-tensorzero-16x32 ' }}
363397 continue-on-error : ${{ matrix.clickhouse_version.allow_failure }}
364398 strategy :
365399 matrix :
366400 # Only include replicated: true when running in merge queue
367401 replicated : ${{ github.event_name == 'merge_group' && fromJSON('[true, false]') || fromJSON('[false]') }}
368- clickhouse_version :
369- - tag : " 24.12-alpine"
370- prefix : " 24.12"
371- allow_failure : false
372- - tag : " latest-alpine"
373- prefix : " "
374- allow_failure : false
402+ clickhouse_version : ${{ github.event_name == 'merge_group' && fromJSON('[{"tag":"24.12-alpine","prefix":"24.12","allow_failure":false},{"tag":"25.7-alpine","prefix":"25.7","allow_failure":false}]') || fromJSON('[{"tag":"25.7-alpine","prefix":"25.7","allow_failure":false}]') }}
375403
376404 steps :
377405 - uses : actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
378- - uses : dtolnay/rust-toolchain@stable
406+ - name : Install Rust toolchain  # installs the stable toolchain with rustup and makes it the default
407+ run : |  # up to 3 attempts; waits 10s then 20s between tries, exits 1 after the third failure
408+ for attempt in 1 2 3; do
409+ if rustup toolchain install stable && rustup default stable; then
410+ break
411+ fi
412+ if [ $attempt -eq 3 ]; then
413+ echo "Failed to install Rust toolchain after 3 attempts"
414+ exit 1
415+ fi
416+ sleep $((10 * attempt))
417+ done
418+ shell : bash
379419 - uses : Swatinem/rust-cache@9d47c6ad4b02e050fd481d890b2ea34778fd09d6
380420 with :
381421 cache-provider : " buildjet"
@@ -397,7 +437,7 @@ jobs:
397437 run : echo "TENSORZERO_CLICKHOUSE_CLUSTER_NAME=tensorzero_e2e_tests_cluster" >> $GITHUB_ENV
398438
399439 - name : Install uv  # downloads the uv 0.6.17 installer script (curl retries the download) and pipes it to sh
400- run : curl -LsSf https://astral.sh/uv/0.6.17/install.sh | sh
440+ run : set -o pipefail; curl -LsSf --retry 2 --retry-delay 10 --retry-max-time 60 https://astral.sh/uv/0.6.17/install.sh | sh  # pipefail: a failed download now fails this step instead of being masked by `sh` exiting 0
401441
402442 - name : Download ClickHouse fixtures
403443 run : uv run ./ui/fixtures/download-fixtures.py
@@ -431,7 +471,7 @@ jobs:
431471 run : |
432472 cargo run-e2e > e2e_logs.txt 2>&1 &
433473 count=0
434- max_attempts=10
474+ max_attempts=20
435475 while ! curl -s -f http://localhost:3000/health >/dev/null 2>&1; do
436476 echo "Waiting for gateway to be healthy..."
437477 sleep 1
@@ -449,7 +489,7 @@ jobs:
449489 cargo run-e2e --run-migrations-only &&
450490 cargo run-e2e > e2e_logs.txt 2>&1 &
451491 count=0
452- max_attempts=10
492+ max_attempts=40
453493 while ! curl -s -f http://localhost:3000/health >/dev/null 2>&1; do
454494 echo "Waiting for gateway to be healthy..."
455495 sleep 1
@@ -523,7 +563,19 @@ jobs:
523563
524564 steps :
525565 - uses : actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
526- - uses : dtolnay/rust-toolchain@stable
566+ - name : Install Rust toolchain  # installs the stable toolchain with rustup and makes it the default
567+ run : |  # up to 3 attempts; waits 10s then 20s between tries, exits 1 after the third failure
568+ for attempt in 1 2 3; do
569+ if rustup toolchain install stable && rustup default stable; then
570+ break
571+ fi
572+ if [ $attempt -eq 3 ]; then
573+ echo "Failed to install Rust toolchain after 3 attempts"
574+ exit 1
575+ fi
576+ sleep $((10 * attempt))
577+ done
578+ shell : bash
527579 - uses : Swatinem/rust-cache@9d47c6ad4b02e050fd481d890b2ea34778fd09d6
528580 with :
529581 cache-provider : " buildjet"
@@ -535,7 +587,7 @@ jobs:
535587 tool : cargo-nextest
536588
537589 - name : Install uv  # downloads the uv 0.6.17 installer script (curl retries the download) and pipes it to sh
538- run : curl -LsSf https://astral.sh/uv/0.6.17/install.sh | sh
590+ run : curl -LsSf --retry 2 --retry-delay 10 --retry-max-time 60 https://astral.sh/uv/0.6.17/install.sh | sh  # NOTE(review): no explicit shell/pipefail here, so a curl failure may be masked by `sh` exiting 0 — confirm the step's shell
539591
540592 - name : Download ClickHouse fixtures
541593 run : uv run ./ui/fixtures/download-fixtures.py
@@ -594,6 +646,8 @@ jobs:
594646 check-python-client-build,
595647 check-node-bindings,
596648 build-windows,
649+ build-ui-container,
650+ build-gateway-container,
597651 validate,
598652 clickhouse-tests,
599653 ui-tests,
@@ -602,5 +656,7 @@ jobs:
602656 ]
603657 runs-on : ubuntu-latest
604658 steps :
605- - if : ${{ contains(needs.*.result, 'failure') || contains(needs.*.result, 'cancelled') }}
659+ # When running in the merge queue, jobs should never be skipped, so a 'skipped' result also fails this gate there.
660+ # In PR CI, some jobs may be intentionally skipped (e.g. due to running from a fork, or to save money)
661+ - if : ${{ contains(needs.*.result, 'failure') || contains(needs.*.result, 'cancelled') || (github.event_name == 'merge_group' && contains(needs.*.result, 'skipped')) }}
606662 run : exit 1
0 commit comments