diff --git a/.changeset/shy-rats-lick.md b/.changeset/shy-rats-lick.md new file mode 100644 index 000000000..9ef1a3780 --- /dev/null +++ b/.changeset/shy-rats-lick.md @@ -0,0 +1,5 @@ +--- +"braintrust": patch +--- + +Fix eval summaries to compare against the experiment’s explicit base experiment ID. diff --git a/js/src/framework.test.ts b/js/src/framework.test.ts index 7c77dc4e2..12bcbb8c1 100644 --- a/js/src/framework.test.ts +++ b/js/src/framework.test.ts @@ -759,6 +759,91 @@ test("Eval with returnResults: true collects all results", async () => { expect(result.summary.scores.exact_match.score).toBe(1); }); +test("runEvaluator forwards baseExperimentId to summary", async () => { + await _exportsForTestingOnly.simulateLoginForTests(); + const experiment = _exportsForTestingOnly.initTestExperiment( + "js-base-experiment-id", + "proj", + ); + const expectedSummary = { + projectName: "proj", + experimentName: "js-base-experiment-id", + projectId: "proj", + experimentId: "js-base-experiment-id", + scores: {}, + metrics: {}, + }; + const summarize = vi + .spyOn(experiment, "summarize") + .mockResolvedValue(expectedSummary); + + const result = await runEvaluator( + experiment, + { + projectName: "proj", + evalName: "js-base-experiment-id", + data: [{ input: "hello", expected: "hello" }], + task: (input) => input, + scores: [], + baseExperimentId: "base-exp-id", + }, + new NoopProgressReporter(), + [], + undefined, + undefined, + true, + ); + + expect(result.summary).toBe(expectedSummary); + expect(summarize).toHaveBeenCalledWith({ + summarizeScores: undefined, + comparisonExperimentId: "base-exp-id", + }); +}); + +test("runEvaluator forwards persisted baseExperimentName id to summary", async () => { + await _exportsForTestingOnly.simulateLoginForTests(); + const experiment = _exportsForTestingOnly.initTestExperiment( + "js-base-experiment-name", + "proj", + { base_exp_id: "resolved-base-exp-id" }, + ); + const expectedSummary = { + projectName: "proj", + experimentName: "js-base-experiment-name", + projectId: "proj", + experimentId: "js-base-experiment-name", + scores: {}, + metrics: {}, + }; + const summarize = vi + .spyOn(experiment, "summarize") + .mockResolvedValue(expectedSummary); + + const result = await runEvaluator( + experiment, + { + projectName: "proj", + evalName: "js-base-experiment-name", + data: [{ input: "hello", expected: "hello" }], + task: (input) => input, + scores: [], + baseExperimentName: "base-exp", + }, + new NoopProgressReporter(), + [], + undefined, + undefined, + true, + ); + + expect(result.summary).toBe(expectedSummary); + expect(summarize).toHaveBeenCalledWith({ + summarizeScores: undefined, + comparisonExperimentId: "resolved-base-exp-id", + }); +}); + test("tags can be appended and logged to root span", async () => { await _exportsForTestingOnly.simulateLoginForTests(); const memoryLogger = _exportsForTestingOnly.useTestBackgroundLogger(); diff --git a/js/src/framework.ts b/js/src/framework.ts index 73629e7ac..3183a575a 100644 --- a/js/src/framework.ts +++ b/js/src/framework.ts @@ -410,6 +410,16 @@ export type { ReporterDef, } from "./reporters/types"; +async function getPersistedBaseExperimentId( + experiment: Experiment, +): Promise { + try { + return await experiment._getBaseExperimentId(); + } catch { + return undefined; + } +} + function makeEvalName(projectName: string, experimentName?: string) { let out = projectName; if (experimentName) { @@ -1636,9 +1646,17 @@ async function runEvaluatorInternal( } } + const comparisonExperimentId = experiment + ? (evaluator.baseExperimentId ?? + (await getPersistedBaseExperimentId(experiment))) + : undefined; + const summary = experiment ? await experiment.summarize({ summarizeScores: evaluator.summarizeScores, + ...(comparisonExperimentId !== undefined + ? { comparisonExperimentId } + : {}), }) : buildLocalSummary( evaluator, diff --git a/js/src/logger.test.ts b/js/src/logger.test.ts index 352b71f52..7a5e12b70 100644 --- a/js/src/logger.test.ts +++ b/js/src/logger.test.ts @@ -825,6 +825,53 @@ test("dataset.toEvalData preserves dataset_snapshot_name", async () => { vi.restoreAllMocks(); }); +test("experiment.summarize resolves explicit comparison experiment name", async () => { + const state = await _exportsForTestingOnly.simulateLoginForTests(); + const memoryLogger = _exportsForTestingOnly.useTestBackgroundLogger(); + const experiment = _exportsForTestingOnly.initTestExperiment( + "test-evaluator", + "test-project", + ); + + try { + const getJson = vi + .spyOn(state.apiConn(), "get_json") + .mockImplementation(async (path, args) => { + if (path === "v1/experiment/base-exp-id") { + return { name: "base-exp" }; + } + if (path === "/experiment-comparison2") { + expect(args).toEqual({ + experiment_id: "test-evaluator", + base_experiment_id: "base-exp-id", + }); + return { scores: {}, metrics: {} }; + } + throw new Error(`Unexpected get_json call: ${path}`); + }); + + const summary = await experiment.summarize({ + comparisonExperimentId: "base-exp-id", + }); + + expect(summary.comparisonExperimentName).toBe("base-exp"); + expect(getJson).toHaveBeenCalledWith("v1/experiment/base-exp-id"); + expect(getJson).toHaveBeenCalledWith( + "/experiment-comparison2", + { + experiment_id: "test-evaluator", + base_experiment_id: "base-exp-id", + }, + 3, + ); + } finally { + await memoryLogger.flush(); + _exportsForTestingOnly.clearTestBackgroundLogger(); + _exportsForTestingOnly.simulateLogoutForTests(); + vi.restoreAllMocks(); + } +}); + test("dataset.version preserves pinned-version fast path", async () => { const state = await _exportsForTestingOnly.simulateLoginForTests(); const login = vi.spyOn(state, "login").mockResolvedValue(state); diff --git a/js/src/logger.ts b/js/src/logger.ts index ccbe22ef6..f81200f66 100644 --- a/js/src/logger.ts +++ b/js/src/logger.ts @@ -1063,6 +1063,7 @@ function clearTestBackgroundLogger() { function initTestExperiment( experimentName: string, projectName?: string, + experimentFullInfo: Record = {}, ): Experiment { setInitialTestState(); const state = _internalGetGlobalState(); @@ -1071,7 +1072,11 @@ function initTestExperiment( const lazyMetadata: LazyValue = new LazyValue( async () => ({ project: { id: project, name: project, fullInfo: {} }, - experiment: { id: experimentName, name: experimentName, fullInfo: {} }, + experiment: { + id: experimentName, + name: experimentName, + fullInfo: experimentFullInfo, + }, }), ); @@ -6317,6 +6322,14 @@ export class Experiment })(); } + public async _getBaseExperimentId(): Promise { + const baseExperimentId = (await this.lazyMetadata.get()).experiment + .fullInfo["base_exp_id"]; + return typeof baseExperimentId === "string" && baseExperimentId + ? baseExperimentId + : undefined; + } + private parentObjectType() { return SpanObjectTypeV3.EXPERIMENT; } @@ -6484,6 +6497,17 @@ export class Experiment comparisonExperimentId = baseExperiment.id; comparisonExperimentName = baseExperiment.name; } + } else { + try { + const comparisonExperiment = await state + .apiConn() + .get_json(`v1/experiment/${comparisonExperimentId}`); + if (typeof comparisonExperiment["name"] === "string") { + comparisonExperimentName = comparisonExperiment["name"]; + } + } catch { + // If the explicit comparison name lookup fails, still summarize by ID. + } } try {