From 474d31b001389115cbbfeb717706da3589f43227 Mon Sep 17 00:00:00 2001 From: Gabriel De Andrade Date: Wed, 15 Oct 2025 18:21:16 -0400 Subject: [PATCH 1/5] feat: (evals) add axiom reporting step --- examples/example-evals-nextjs/axiom.config.ts | 3 + .../evaluations/ticket-classification.eval.ts | 2 +- packages/ai/src/config/index.ts | 8 ++ packages/ai/src/config/resolver.ts | 1 + packages/ai/src/evals/eval.service.ts | 54 ++++++++++++ packages/ai/src/evals/eval.ts | 61 ++++++++++++- packages/ai/src/utils/fetcher.ts | 16 ++++ pnpm-lock.yaml | 86 +++++++++---------- 8 files changed, 185 insertions(+), 46 deletions(-) create mode 100644 packages/ai/src/utils/fetcher.ts diff --git a/examples/example-evals-nextjs/axiom.config.ts b/examples/example-evals-nextjs/axiom.config.ts index bb536150..25388f27 100644 --- a/examples/example-evals-nextjs/axiom.config.ts +++ b/examples/example-evals-nextjs/axiom.config.ts @@ -7,6 +7,9 @@ export default defineConfig({ token: process.env.AXIOM_TOKEN, dataset: process.env.AXIOM_DATASET, + // TODO: REMOVE THIS @gabrielelpidio + apiUrl: process.env.AXIOM_API_URL, + include: ['**/*.eval.{ts,js,mts,mjs,cts,cjs}'], exclude: [], diff --git a/examples/example-evals-nextjs/src/lib/capabilities/classify-ticket/evaluations/ticket-classification.eval.ts b/examples/example-evals-nextjs/src/lib/capabilities/classify-ticket/evaluations/ticket-classification.eval.ts index 0b807f20..93d37de8 100644 --- a/examples/example-evals-nextjs/src/lib/capabilities/classify-ticket/evaluations/ticket-classification.eval.ts +++ b/examples/example-evals-nextjs/src/lib/capabilities/classify-ticket/evaluations/ticket-classification.eval.ts @@ -3,7 +3,7 @@ import { jaccardResponseScorer, spamClassificationScorer } from '../../../scorer import { classifyTicketStep } from '../../../capabilities/classify-ticket/prompts'; import { pickFlags } from '@/lib/app-scope'; -Eval('Spam classification', { +Eval('spam-classification', { configFlags: pickFlags('ticketClassification'), data: () => [ { diff --git a/packages/ai/src/config/index.ts b/packages/ai/src/config/index.ts index f36bac94..e770d32f 100644 --- a/packages/ai/src/config/index.ts +++ b/packages/ai/src/config/index.ts @@ -27,6 +27,13 @@ export interface AxiomConnectionConfig { */ url?: string; + /** + * TODO: REMOVE THIS @gabrielelpidio + * + * temp apiUrl + */ + apiUrl?: string; + /** * Axiom API token (can be undefined if not set) * @example process.env.AXIOM_TOKEN @@ -48,6 +55,7 @@ export interface AxiomConnectionConfig { */ export interface AxiomEvalInstrumentationOptions { url: string; + apiUrl: string; token: string; dataset: string; } diff --git a/packages/ai/src/config/resolver.ts b/packages/ai/src/config/resolver.ts index c56f278e..7a19204c 100644 --- a/packages/ai/src/config/resolver.ts +++ b/packages/ai/src/config/resolver.ts @@ -14,6 +14,7 @@ export function resolveAxiomConnection( ): AxiomEvalInstrumentationOptions { return { url: config.eval.url, + apiUrl: config.eval.apiUrl, token: config.eval.token, dataset: config.eval.dataset, }; diff --git a/packages/ai/src/evals/eval.service.ts b/packages/ai/src/evals/eval.service.ts index 60be54ea..5961f487 100644 --- a/packages/ai/src/evals/eval.service.ts +++ b/packages/ai/src/evals/eval.service.ts @@ -1,7 +1,61 @@ import type { Case, Chat, Evaluation, Task } from './eval.types'; +import { createFetcher, type Fetcher } from '../utils/fetcher'; import type { ResolvedAxiomConfig } from '../config/index'; import { resolveAxiomConnection } from '../config/resolver'; +export interface EvaluationApiConfig { + readonly dataset?: string; + readonly region?: string; + readonly baseUrl?: string; + readonly apiUrl?: string; + readonly token?: string; +} + +export type EvaluationStatus = 'running' | 'completed' | 'errored' | 'cancelled'; + +export interface EvaluationApiPayloadBase { + readonly id: string; + readonly name: string; + readonly dataset: string; + readonly region: string; + readonly baselineId?: string; + readonly totalCases?: number; + readonly scorers?: readonly string[]; + readonly config?: Readonly>; + readonly status: EvaluationStatus; + readonly successCases?: number; + readonly erroredCases?: number; + readonly durationMs?: number; + readonly scorerAvgs?: readonly number[]; +} + +export class EvaluationApiClient { + private readonly fetcher: Fetcher; + constructor(config: ResolvedAxiomConfig) { + const { apiUrl, token } = resolveAxiomConnection(config); + + this.fetcher = createFetcher(apiUrl, token ?? ''); + } + + async createEvaluation(evaluation: EvaluationApiPayloadBase) { + const resp = await this.fetcher(`/api/evaluations/v3`, { + method: 'POST', + body: JSON.stringify(evaluation), + }); + + return resp.json(); + } + + async updateEvaluation(evaluation: Partial) { + const resp = await this.fetcher(`/api/evaluations/v3/${evaluation.id}`, { + method: 'PATCH', + body: JSON.stringify(evaluation), + }); + + return resp.json(); + } +} + /** Query axiom to find a baseline for an Eval */ export const findBaseline = async (evalName: string, config: ResolvedAxiomConfig) => { const { dataset, url, token } = resolveAxiomConnection(config); diff --git a/packages/ai/src/evals/eval.ts b/packages/ai/src/evals/eval.ts index c5f7945d..2a68f824 100644 --- a/packages/ai/src/evals/eval.ts +++ b/packages/ai/src/evals/eval.ts @@ -19,7 +19,12 @@ import type { OutOfScopeFlag, } from './eval.types'; import type { Score, Scorer } from './scorers'; -import { findBaseline, findEvaluationCases } from './eval.service'; +import { + EvaluationApiClient, + findBaseline, + findEvaluationCases, + type EvaluationStatus, +} from './eval.service'; import { getGlobalFlagOverrides, setGlobalFlagOverrides } from './context/global-flags'; import { deepEqual } from '../util/deep-equal'; import { dotNotationToNested } from '../util/dot-path'; @@ -171,9 +176,12 @@ async function registerEval< ? await findEvaluationCases(baselineId, axiomConfig) : await findBaseline(evalName, axiomConfig); + const evaluationApiClient = new EvaluationApiClient(axiomConfig); + // create a version code const evalVersion = nanoid(); let evalId = ''; // get traceId + let suiteStart: number; let suiteSpan: ReturnType | undefined; let suiteContext: Context | undefined; @@ -189,6 +197,14 @@ async function registerEval< | undefined; beforeAll(async (suite) => { + suite.meta.evaluation = { + id: evalId, + name: evalName, + version: evalVersion, + baseline: baseline ?? undefined, + configFlags: opts.configFlags, + }; + try { await instrumentationReady; } catch (error) { @@ -217,9 +233,32 @@ async function registerEval< }, }); evalId = suiteSpan.spanContext().traceId; + suite.meta.evaluation.id = evalId; suiteSpan.setAttribute(Attr.Eval.ID, evalId); suiteContext = trace.setSpan(context.active(), suiteSpan); + // Report evaluation creation to API + const res = await evaluationApiClient.createEvaluation({ + id: evalId, + name: evalName, + dataset: axiomConfig.eval.dataset, + // TODO: add region to axiomConfig? + region: 'US', + baselineId: baseline?.id ?? undefined, + totalCases: dataset.length, + scorers: opts.scorers?.map((s) => s.name) ?? [], + config: { + flags: opts.configFlags ?? [], + }, + status: 'running', + }); + + if (!res) { + // TODO: Remove from release @gabrielelpidio + console.error('Error creating evaluation, skipping'); + return; + } + // Ensure worker process knows CLI overrides if (injectedOverrides && Object.keys(injectedOverrides).length > 0) { try { @@ -239,6 +278,7 @@ async function registerEval< suite.meta.evaluation.flagConfig = flagConfig; const flagConfigJson = JSON.stringify(flagConfig); suiteSpan.setAttribute('eval.config.flags', flagConfigJson); + suiteStart = performance.now(); }); afterAll(async (suite) => { @@ -287,6 +327,25 @@ async function registerEval< }; } + const status: EvaluationStatus = suite ? 'errored' : 'completed'; + const durationMs = Math.round(performance.now() - suiteStart); + + const successCases = suite.tasks.filter( + (task) => task.meta.case.status === 'success', + ).length; + const erroredCases = suite.tasks.filter( + (task) => task.meta.case.status === 'fail' || task.meta.case.status === 'pending', + ).length; + + await evaluationApiClient.updateEvaluation({ + id: evalId, + status, + totalCases: dataset.length, + successCases, + erroredCases, + durationMs, + }); + // end root span suiteSpan?.setStatus({ code: SpanStatusCode.OK }); suiteSpan?.end(); diff --git a/packages/ai/src/utils/fetcher.ts b/packages/ai/src/utils/fetcher.ts new file mode 100644 index 00000000..ea31c20a --- /dev/null +++ b/packages/ai/src/utils/fetcher.ts @@ -0,0 +1,16 @@ +export interface Fetcher { + (path: string, options: RequestInit): Promise; +} + +export const createFetcher = (baseUrl: string, token: string): Fetcher => { + return (path: string, options: RequestInit) => + fetch(new URL(path, baseUrl).toString(), { + ...options, + headers: { + ...options.headers, + 'content-type': 'application/json', + authorization: `Bearer ${token}`, + 'x-axiom-check': 'good', + }, + }); +}; diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 2a345c75..5173b5a5 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -73,7 +73,7 @@ importers: version: link:../../packages/ai next: specifier: latest - version: 15.5.4(@opentelemetry/api@1.9.0)(react-dom@18.3.1(react@18.3.1))(react@18.3.1) + version: 15.5.3(@opentelemetry/api@1.9.0)(react-dom@18.3.1(react@18.3.1))(react@18.3.1) react: specifier: ^18.2.0 version: 18.3.1 @@ -204,7 +204,7 @@ importers: version: link:../../packages/ai next: specifier: latest - version: 15.5.4(@opentelemetry/api@1.9.0)(react-dom@18.3.1(react@18.3.1))(react@18.3.1) + version: 15.5.3(@opentelemetry/api@1.9.0)(react-dom@18.3.1(react@18.3.1))(react@18.3.1) react: specifier: ^18.2.0 version: 18.3.1 @@ -268,7 +268,7 @@ importers: version: link:../../packages/ai next: specifier: latest - version: 15.5.4(@opentelemetry/api@1.9.0)(react-dom@18.3.1(react@18.3.1))(react@18.3.1) + version: 15.5.3(@opentelemetry/api@1.9.0)(react-dom@18.3.1(react@18.3.1))(react@18.3.1) react: specifier: ^18.2.0 version: 18.3.1 @@ -1151,57 +1151,57 @@ packages: '@next/env@15.4.6': resolution: {integrity: sha512-yHDKVTcHrZy/8TWhj0B23ylKv5ypocuCwey9ZqPyv4rPdUdRzpGCkSi03t04KBPyU96kxVtUqx6O3nE1kpxASQ==} - '@next/env@15.5.4': - resolution: {integrity: sha512-27SQhYp5QryzIT5uO8hq99C69eLQ7qkzkDPsk3N+GuS2XgOgoYEeOav7Pf8Tn4drECOVDsDg8oj+/DVy8qQL2A==} + '@next/env@15.5.3': + resolution: {integrity: sha512-RSEDTRqyihYXygx/OJXwvVupfr9m04+0vH8vyy0HfZ7keRto6VX9BbEk0J2PUk0VGy6YhklJUSrgForov5F9pw==} - '@next/swc-darwin-arm64@15.5.4': - resolution: {integrity: sha512-nopqz+Ov6uvorej8ndRX6HlxCYWCO3AHLfKK2TYvxoSB2scETOcfm/HSS3piPqc3A+MUgyHoqE6je4wnkjfrOA==} + '@next/swc-darwin-arm64@15.5.3': + resolution: {integrity: sha512-nzbHQo69+au9wJkGKTU9lP7PXv0d1J5ljFpvb+LnEomLtSbJkbZyEs6sbF3plQmiOB2l9OBtN2tNSvCH1nQ9Jg==} engines: {node: '>= 10'} cpu: [arm64] os: [darwin] - '@next/swc-darwin-x64@15.5.4': - resolution: {integrity: sha512-QOTCFq8b09ghfjRJKfb68kU9k2K+2wsC4A67psOiMn849K9ZXgCSRQr0oVHfmKnoqCbEmQWG1f2h1T2vtJJ9mA==} + '@next/swc-darwin-x64@15.5.3': + resolution: {integrity: sha512-w83w4SkOOhekJOcA5HBvHyGzgV1W/XvOfpkrxIse4uPWhYTTRwtGEM4v/jiXwNSJvfRvah0H8/uTLBKRXlef8g==} engines: {node: '>= 10'} cpu: [x64] os: [darwin] - '@next/swc-linux-arm64-gnu@15.5.4': - resolution: {integrity: sha512-eRD5zkts6jS3VfE/J0Kt1VxdFqTnMc3QgO5lFE5GKN3KDI/uUpSyK3CjQHmfEkYR4wCOl0R0XrsjpxfWEA++XA==} + '@next/swc-linux-arm64-gnu@15.5.3': + resolution: {integrity: sha512-+m7pfIs0/yvgVu26ieaKrifV8C8yiLe7jVp9SpcIzg7XmyyNE7toC1fy5IOQozmr6kWl/JONC51osih2RyoXRw==} engines: {node: '>= 10'} cpu: [arm64] os: [linux] libc: [glibc] - '@next/swc-linux-arm64-musl@15.5.4': - resolution: {integrity: sha512-TOK7iTxmXFc45UrtKqWdZ1shfxuL4tnVAOuuJK4S88rX3oyVV4ZkLjtMT85wQkfBrOOvU55aLty+MV8xmcJR8A==} + '@next/swc-linux-arm64-musl@15.5.3': + resolution: {integrity: sha512-u3PEIzuguSenoZviZJahNLgCexGFhso5mxWCrrIMdvpZn6lkME5vc/ADZG8UUk5K1uWRy4hqSFECrON6UKQBbQ==} engines: {node: '>= 10'} cpu: [arm64] os: [linux] libc: [musl] - '@next/swc-linux-x64-gnu@15.5.4': - resolution: {integrity: sha512-7HKolaj+481FSW/5lL0BcTkA4Ueam9SPYWyN/ib/WGAFZf0DGAN8frNpNZYFHtM4ZstrHZS3LY3vrwlIQfsiMA==} + '@next/swc-linux-x64-gnu@15.5.3': + resolution: {integrity: sha512-lDtOOScYDZxI2BENN9m0pfVPJDSuUkAD1YXSvlJF0DKwZt0WlA7T7o3wrcEr4Q+iHYGzEaVuZcsIbCps4K27sA==} engines: {node: '>= 10'} cpu: [x64] os: [linux] libc: [glibc] - '@next/swc-linux-x64-musl@15.5.4': - resolution: {integrity: sha512-nlQQ6nfgN0nCO/KuyEUwwOdwQIGjOs4WNMjEUtpIQJPR2NUfmGpW2wkJln1d4nJ7oUzd1g4GivH5GoEPBgfsdw==} + '@next/swc-linux-x64-musl@15.5.3': + resolution: {integrity: sha512-9vWVUnsx9PrY2NwdVRJ4dUURAQ8Su0sLRPqcCCxtX5zIQUBES12eRVHq6b70bbfaVaxIDGJN2afHui0eDm+cLg==} engines: {node: '>= 10'} cpu: [x64] os: [linux] libc: [musl] - '@next/swc-win32-arm64-msvc@15.5.4': - resolution: {integrity: sha512-PcR2bN7FlM32XM6eumklmyWLLbu2vs+D7nJX8OAIoWy69Kef8mfiN4e8TUv2KohprwifdpFKPzIP1njuCjD0YA==} + '@next/swc-win32-arm64-msvc@15.5.3': + resolution: {integrity: sha512-1CU20FZzY9LFQigRi6jM45oJMU3KziA5/sSG+dXeVaTm661snQP6xu3ykGxxwU5sLG3sh14teO/IOEPVsQMRfA==} engines: {node: '>= 10'} cpu: [arm64] os: [win32] - '@next/swc-win32-x64-msvc@15.5.4': - resolution: {integrity: sha512-1ur2tSHZj8Px/KMAthmuI9FMp/YFusMMGoRNJaRZMOlSkgvLjzosSdQI0cJAKogdHl3qXUQKL9MGaYvKwA7DXg==} + '@next/swc-win32-x64-msvc@15.5.3': + resolution: {integrity: sha512-JMoLAq3n3y5tKXPQwCK5c+6tmwkuFDa2XAxz8Wm4+IVthdBZdZGh+lmiLUHg9f9IDwIQpUjp+ysd6OkYTyZRZw==} engines: {node: '>= 10'} cpu: [x64] os: [win32] @@ -1280,7 +1280,6 @@ packages: '@opentelemetry/exporter-jaeger@2.0.1': resolution: {integrity: sha512-FeHtOp2XMhYxzYhC8sXhsc3gMeoDzjI+CGuPX+vRSyUdHZHDKTMoY9jRfk8ObmZsZDTWmd63Yqcf4X472YtHeA==} engines: {node: ^18.19.0 || >=20.6.0} - deprecated: Package no longer supported. Contact Support at https://www.npmjs.com/support for more info. peerDependencies: '@opentelemetry/api': ^1.0.0 @@ -1660,7 +1659,6 @@ packages: '@opentelemetry/otlp-proto-exporter-base@0.41.2': resolution: {integrity: sha512-BxmEMiP6tHiFroe5/dTt9BsxCci7BTLtF7A6d4DKHLiLweWWZxQ9l7hON7qt/IhpKrQcAFD1OzZ1Gq2ZkNzhCw==} engines: {node: '>=14'} - deprecated: Package no longer supported. Contact Support at https://www.npmjs.com/support for more info. peerDependencies: '@opentelemetry/api': ^1.0.0 @@ -3208,8 +3206,8 @@ packages: neo-async@2.6.2: resolution: {integrity: sha512-Yd3UES5mWCSqR+qNT93S3UoYUkqAZ9lLg8a7g9rimsWmYGK8cVToA4/sF3RrshdyV3sAGMXVUmpMYOw+dLpOuw==} - next@15.5.4: - resolution: {integrity: sha512-xH4Yjhb82sFYQfY3vbkJfgSDgXvBB6a8xPs9i35k6oZJRoQRihZH+4s9Yo2qsWpzBmZ3lPXaJ2KPXLfkvW4LnA==} + next@15.5.3: + resolution: {integrity: sha512-r/liNAx16SQj4D+XH/oI1dlpv9tdKJ6cONYPwwcCC46f2NjpaRWY+EKCzULfgQYV6YKXjHBchff2IZBSlZmJNw==} engines: {node: ^18.18.0 || ^19.8.0 || >= 20.0.0} hasBin: true peerDependencies: @@ -4518,30 +4516,30 @@ snapshots: '@next/env@15.4.6': {} - '@next/env@15.5.4': {} + '@next/env@15.5.3': {} - '@next/swc-darwin-arm64@15.5.4': + '@next/swc-darwin-arm64@15.5.3': optional: true - '@next/swc-darwin-x64@15.5.4': + '@next/swc-darwin-x64@15.5.3': optional: true - '@next/swc-linux-arm64-gnu@15.5.4': + '@next/swc-linux-arm64-gnu@15.5.3': optional: true - '@next/swc-linux-arm64-musl@15.5.4': + '@next/swc-linux-arm64-musl@15.5.3': optional: true - '@next/swc-linux-x64-gnu@15.5.4': + '@next/swc-linux-x64-gnu@15.5.3': optional: true - '@next/swc-linux-x64-musl@15.5.4': + '@next/swc-linux-x64-musl@15.5.3': optional: true - '@next/swc-win32-arm64-msvc@15.5.4': + '@next/swc-win32-arm64-msvc@15.5.3': optional: true - '@next/swc-win32-x64-msvc@15.5.4': + '@next/swc-win32-x64-msvc@15.5.3': optional: true '@nodelib/fs.scandir@2.1.5': @@ -6890,9 +6888,9 @@ snapshots: neo-async@2.6.2: {} - next@15.5.4(@opentelemetry/api@1.9.0)(react-dom@18.3.1(react@18.3.1))(react@18.3.1): + next@15.5.3(@opentelemetry/api@1.9.0)(react-dom@18.3.1(react@18.3.1))(react@18.3.1): dependencies: - '@next/env': 15.5.4 + '@next/env': 15.5.3 '@swc/helpers': 0.5.15 caniuse-lite: 1.0.30001735 postcss: 8.4.31 @@ -6900,14 +6898,14 @@ snapshots: react-dom: 18.3.1(react@18.3.1) styled-jsx: 5.1.6(react@18.3.1) optionalDependencies: - '@next/swc-darwin-arm64': 15.5.4 - '@next/swc-darwin-x64': 15.5.4 - '@next/swc-linux-arm64-gnu': 15.5.4 - '@next/swc-linux-arm64-musl': 15.5.4 - '@next/swc-linux-x64-gnu': 15.5.4 - '@next/swc-linux-x64-musl': 15.5.4 - '@next/swc-win32-arm64-msvc': 15.5.4 - '@next/swc-win32-x64-msvc': 15.5.4 + '@next/swc-darwin-arm64': 15.5.3 + '@next/swc-darwin-x64': 15.5.3 + '@next/swc-linux-arm64-gnu': 15.5.3 + '@next/swc-linux-arm64-musl': 15.5.3 + '@next/swc-linux-x64-gnu': 15.5.3 + '@next/swc-linux-x64-musl': 15.5.3 + '@next/swc-win32-arm64-msvc': 15.5.3 + '@next/swc-win32-x64-msvc': 15.5.3 '@opentelemetry/api': 1.9.0 sharp: 0.34.3 transitivePeerDependencies: From da2a52566475ee477bbb8da9a70343b2203d1ae7 Mon Sep 17 00:00:00 2001 From: Gabriel De Andrade Date: Wed, 15 Oct 2025 19:15:41 -0400 Subject: [PATCH 2/5] feat:(evals): add resources url builder and override --- examples/example-evals-nextjs/axiom.config.ts | 3 --- packages/ai/src/config/index.ts | 15 ++++++--------- packages/ai/src/config/resolver.ts | 19 ++++++++++++++++++- packages/ai/src/evals/eval.service.ts | 4 ++-- packages/ai/src/evals/instrument.ts | 2 +- 5 files changed, 27 insertions(+), 16 deletions(-) diff --git a/examples/example-evals-nextjs/axiom.config.ts b/examples/example-evals-nextjs/axiom.config.ts index 25388f27..bb536150 100644 --- a/examples/example-evals-nextjs/axiom.config.ts +++ b/examples/example-evals-nextjs/axiom.config.ts @@ -7,9 +7,6 @@ export default defineConfig({ token: process.env.AXIOM_TOKEN, dataset: process.env.AXIOM_DATASET, - // TODO: REMOVE THIS @gabrielelpidio - apiUrl: process.env.AXIOM_API_URL, - include: ['**/*.eval.{ts,js,mts,mjs,cts,cjs}'], exclude: [], diff --git a/packages/ai/src/config/index.ts b/packages/ai/src/config/index.ts index e770d32f..e0cd46f6 100644 --- a/packages/ai/src/config/index.ts +++ b/packages/ai/src/config/index.ts @@ -27,13 +27,6 @@ export interface AxiomConnectionConfig { */ url?: string; - /** - * TODO: REMOVE THIS @gabrielelpidio - * - * temp apiUrl - */ - apiUrl?: string; - /** * Axiom API token (can be undefined if not set) * @example process.env.AXIOM_TOKEN @@ -51,11 +44,15 @@ export interface AxiomConnectionConfig { * Options passed to the instrumentation hook * - url: string * - token: string + * - resourcesUrl: string * - dataset: string */ export interface AxiomEvalInstrumentationOptions { url: string; - apiUrl: string; + /** + * Axiom URL for resources like evaluations, prompts, etc. + */ + resourcesUrl: string; token: string; dataset: string; } @@ -86,7 +83,7 @@ export interface AxiomEvalInstrumentationResult { * ``` */ export type AxiomEvalInstrumentationHook = ( - options: AxiomEvalInstrumentationOptions, + options: Omit, ) => AxiomEvalInstrumentationResult | Promise; /** diff --git a/packages/ai/src/config/resolver.ts b/packages/ai/src/config/resolver.ts index 7a19204c..0910d1cc 100644 --- a/packages/ai/src/config/resolver.ts +++ b/packages/ai/src/config/resolver.ts @@ -1,5 +1,16 @@ import type { AxiomEvalInstrumentationOptions, ResolvedAxiomConfig } from './index'; +/** + * Builds a resources URL under the assumption that the API URL is in the format of https://api.axiom.co by replacing the subdomain with app. + * @param urlString - The API URL + * @returns The resources URL + */ +const buildResourcesUrl = (urlString: string) => { + const url = new URL(urlString); + + return `${url.protocol}//app.${url.host.split('api.').at(-1)}`; +}; + /** * Resolve Axiom connection settings from resolved config. * @@ -12,9 +23,15 @@ import type { AxiomEvalInstrumentationOptions, ResolvedAxiomConfig } from './ind export function resolveAxiomConnection( config: ResolvedAxiomConfig, ): AxiomEvalInstrumentationOptions { + let resourcesUrl = buildResourcesUrl(config.eval.url); + + if ('__overrideResourcesUrl' in config.eval) { + resourcesUrl = config.eval.__overrideResourcesUrl as string; + } + return { url: config.eval.url, - apiUrl: config.eval.apiUrl, + resourcesUrl, token: config.eval.token, dataset: config.eval.dataset, }; diff --git a/packages/ai/src/evals/eval.service.ts b/packages/ai/src/evals/eval.service.ts index 5961f487..c8419c4c 100644 --- a/packages/ai/src/evals/eval.service.ts +++ b/packages/ai/src/evals/eval.service.ts @@ -32,9 +32,9 @@ export interface EvaluationApiPayloadBase { export class EvaluationApiClient { private readonly fetcher: Fetcher; constructor(config: ResolvedAxiomConfig) { - const { apiUrl, token } = resolveAxiomConnection(config); + const { resourcesUrl, token } = resolveAxiomConnection(config); - this.fetcher = createFetcher(apiUrl, token ?? ''); + this.fetcher = createFetcher(resourcesUrl, token ?? ''); } async createEvaluation(evaluation: EvaluationApiPayloadBase) { diff --git a/packages/ai/src/evals/instrument.ts b/packages/ai/src/evals/instrument.ts index c247cc22..df791697 100644 --- a/packages/ai/src/evals/instrument.ts +++ b/packages/ai/src/evals/instrument.ts @@ -28,7 +28,7 @@ let initialized = false; async function runInstrumentationHook( hook: AxiomEvalInstrumentationHook, - options: AxiomEvalInstrumentationOptions, + options: Omit, ): Promise { try { return await hook(options); From 29480f677961cc03d8a356babe0b24786c5dee76 Mon Sep 17 00:00:00 2001 From: Gabriel De Andrade Date: Wed, 15 Oct 2025 19:25:27 -0400 Subject: [PATCH 3/5] feat(evals): send version to api reporting --- packages/ai/src/evals/eval.service.ts | 1 + packages/ai/src/evals/eval.ts | 15 ++++----------- 2 files changed, 5 insertions(+), 11 deletions(-) diff --git a/packages/ai/src/evals/eval.service.ts b/packages/ai/src/evals/eval.service.ts index c8419c4c..baddd82a 100644 --- a/packages/ai/src/evals/eval.service.ts +++ b/packages/ai/src/evals/eval.service.ts @@ -27,6 +27,7 @@ export interface EvaluationApiPayloadBase { readonly erroredCases?: number; readonly durationMs?: number; readonly scorerAvgs?: readonly number[]; + readonly version: string; } export class EvaluationApiClient { diff --git a/packages/ai/src/evals/eval.ts b/packages/ai/src/evals/eval.ts index 2a68f824..a0124cd5 100644 --- a/packages/ai/src/evals/eval.ts +++ b/packages/ai/src/evals/eval.ts @@ -46,7 +46,7 @@ declare module 'vitest' { } } -const nanoid = customAlphabet('1234567890abcdefghijklmnopqrstuvwxyz', 10); +const createVersionId = customAlphabet('0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ', 10); /** * Creates and registers an evaluation suite with the given name and parameters. @@ -179,7 +179,7 @@ async function registerEval< const evaluationApiClient = new EvaluationApiClient(axiomConfig); // create a version code - const evalVersion = nanoid(); + const evalVersion = createVersionId(); let evalId = ''; // get traceId let suiteStart: number; @@ -237,12 +237,11 @@ async function registerEval< suiteSpan.setAttribute(Attr.Eval.ID, evalId); suiteContext = trace.setSpan(context.active(), suiteSpan); - // Report evaluation creation to API - const res = await evaluationApiClient.createEvaluation({ + await evaluationApiClient.createEvaluation({ id: evalId, name: evalName, dataset: axiomConfig.eval.dataset, - // TODO: add region to axiomConfig? + version: evalVersion, region: 'US', baselineId: baseline?.id ?? undefined, totalCases: dataset.length, @@ -253,12 +252,6 @@ async function registerEval< status: 'running', }); - if (!res) { - // TODO: Remove from release @gabrielelpidio - console.error('Error creating evaluation, skipping'); - return; - } - // Ensure worker process knows CLI overrides if (injectedOverrides && Object.keys(injectedOverrides).length > 0) { try { From 00ecfb483f0aa003f2114a805d821879a98e405f Mon Sep 17 00:00:00 2001 From: Gabriel De Andrade Date: Wed, 15 Oct 2025 22:13:42 -0400 Subject: [PATCH 4/5] fix(evals): correct evaluation status logic in registerEval function --- packages/ai/src/evals/eval.ts | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/packages/ai/src/evals/eval.ts b/packages/ai/src/evals/eval.ts index a0124cd5..0a26572a 100644 --- a/packages/ai/src/evals/eval.ts +++ b/packages/ai/src/evals/eval.ts @@ -320,7 +320,6 @@ async function registerEval< }; } - const status: EvaluationStatus = suite ? 'errored' : 'completed'; const durationMs = Math.round(performance.now() - suiteStart); const successCases = suite.tasks.filter( @@ -330,6 +329,9 @@ async function registerEval< (task) => task.meta.case.status === 'fail' || task.meta.case.status === 'pending', ).length; + // TODO: Is this right? @gabrielelpidio + const status: EvaluationStatus = successCases > erroredCases ? 'completed' : 'errored'; + await evaluationApiClient.updateEvaluation({ id: evalId, status, From 5ac56b55be533311fb88467b33a975b1b16c7590 Mon Sep 17 00:00:00 2001 From: Christopher Ehrlich Date: Fri, 17 Oct 2025 19:12:57 +0900 Subject: [PATCH 5/5] fix eval name --- examples/example-evals-nextjs/test/feature.eval.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/example-evals-nextjs/test/feature.eval.ts b/examples/example-evals-nextjs/test/feature.eval.ts index 624de95a..e46c1d40 100644 --- a/examples/example-evals-nextjs/test/feature.eval.ts +++ b/examples/example-evals-nextjs/test/feature.eval.ts @@ -20,7 +20,7 @@ const exactMatchScorer = ({ output, expected }: { output: string; expected?: str }; }; -Eval('Basic demo', { +Eval('Basic-demo', { configFlags: pickFlags('behavior'), data: () => [ {