Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .changeset/shy-rats-lick.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
"braintrust": patch
---

Fix eval summaries to compare against the experiment’s explicit base experiment ID.
85 changes: 85 additions & 0 deletions js/src/framework.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -759,6 +759,91 @@ test("Eval with returnResults: true collects all results", async () => {
expect(result.summary.scores.exact_match.score).toBe(1);
});

// An explicit `baseExperimentId` on the evaluator must reach
// `experiment.summarize` as `comparisonExperimentId`.
test("runEvaluator forwards baseExperimentId to summary", async () => {
  const projectName = "proj";
  const experimentName = "js-base-experiment-id";

  await _exportsForTestingOnly.simulateLoginForTests();
  const experiment = _exportsForTestingOnly.initTestExperiment(
    experimentName,
    projectName,
  );

  // Stub `summarize` so the test only observes the arguments it is given
  // and can check that its resolved value is passed straight through.
  const stubbedSummary = {
    projectName,
    experimentName,
    projectId: projectName,
    experimentId: experimentName,
    scores: {},
    metrics: {},
  };
  const summarizeSpy = vi.spyOn(experiment, "summarize");
  summarizeSpy.mockResolvedValue(stubbedSummary);

  const { summary } = await runEvaluator(
    experiment,
    {
      projectName,
      evalName: experimentName,
      data: [{ input: "hello", expected: "hello" }],
      task: (input) => input,
      scores: [],
      baseExperimentId: "base-exp-id",
    },
    new NoopProgressReporter(),
    [],
    undefined,
    undefined,
    true,
  );

  expect(summary).toBe(stubbedSummary);
  expect(summarizeSpy).toHaveBeenCalledWith({
    summarizeScores: undefined,
    comparisonExperimentId: "base-exp-id",
  });
});

// When only `baseExperimentName` is given, the `base_exp_id` stored in the
// test experiment's info must be used as `comparisonExperimentId`.
test("runEvaluator forwards persisted baseExperimentName id to summary", async () => {
  const projectName = "proj";
  const experimentName = "js-base-experiment-name";
  const persistedBaseId = "resolved-base-exp-id";

  await _exportsForTestingOnly.simulateLoginForTests();
  const experiment = _exportsForTestingOnly.initTestExperiment(
    experimentName,
    projectName,
    { base_exp_id: persistedBaseId },
  );

  // Stub `summarize` so the test only observes the arguments it is given
  // and can check that its resolved value is passed straight through.
  const stubbedSummary = {
    projectName,
    experimentName,
    projectId: projectName,
    experimentId: experimentName,
    scores: {},
    metrics: {},
  };
  const summarizeSpy = vi.spyOn(experiment, "summarize");
  summarizeSpy.mockResolvedValue(stubbedSummary);

  const { summary } = await runEvaluator(
    experiment,
    {
      projectName,
      evalName: experimentName,
      data: [{ input: "hello", expected: "hello" }],
      task: (input) => input,
      scores: [],
      baseExperimentName: "base-exp",
    },
    new NoopProgressReporter(),
    [],
    undefined,
    undefined,
    true,
  );

  expect(summary).toBe(stubbedSummary);
  expect(summarizeSpy).toHaveBeenCalledWith({
    summarizeScores: undefined,
    comparisonExperimentId: persistedBaseId,
  });
});

test("tags can be appended and logged to root span", async () => {
await _exportsForTestingOnly.simulateLoginForTests();
const memoryLogger = _exportsForTestingOnly.useTestBackgroundLogger();
Expand Down
18 changes: 18 additions & 0 deletions js/src/framework.ts
Original file line number Diff line number Diff line change
Expand Up @@ -410,6 +410,16 @@ export type {
ReporterDef,
} from "./reporters/types";

/**
 * Best-effort lookup of the base experiment ID via
 * `experiment._getBaseExperimentId()`.
 *
 * @param experiment - The experiment to query.
 * @returns The value resolved by `_getBaseExperimentId()`, or `undefined`
 *   when that call throws or rejects.
 */
async function getPersistedBaseExperimentId(
  experiment: Experiment,
): Promise<string | undefined> {
  let persistedId: string | undefined;
  try {
    persistedId = await experiment._getBaseExperimentId();
  } catch {
    // Deliberately swallow the failure: a missing base ID is reported as
    // `undefined` rather than as an error.
    persistedId = undefined;
  }
  return persistedId;
}

function makeEvalName(projectName: string, experimentName?: string) {
let out = projectName;
if (experimentName) {
Expand Down Expand Up @@ -1636,9 +1646,17 @@ async function runEvaluatorInternal(
}
}

const comparisonExperimentId = experiment
? (evaluator.baseExperimentId ??
(await getPersistedBaseExperimentId(experiment)))
: undefined;

const summary = experiment
? await experiment.summarize({
summarizeScores: evaluator.summarizeScores,
...(comparisonExperimentId !== undefined
? { comparisonExperimentId }
: {}),
})
: buildLocalSummary(
evaluator,
Expand Down
47 changes: 47 additions & 0 deletions js/src/logger.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -825,6 +825,53 @@ test("dataset.toEvalData preserves dataset_snapshot_name", async () => {
vi.restoreAllMocks();
});

// With an explicit `comparisonExperimentId`, `summarize` should look up that
// experiment's name and still request the comparison by ID.
test("experiment.summarize resolves explicit comparison experiment name", async () => {
  const state = await _exportsForTestingOnly.simulateLoginForTests();
  const memoryLogger = _exportsForTestingOnly.useTestBackgroundLogger();
  const experiment = _exportsForTestingOnly.initTestExperiment(
    "test-evaluator",
    "test-project",
  );

  try {
    const comparisonArgs = {
      experiment_id: "test-evaluator",
      base_experiment_id: "base-exp-id",
    };

    // Fake the two API reads this test expects; any other path is an error.
    const getJsonSpy = vi.spyOn(state.apiConn(), "get_json");
    getJsonSpy.mockImplementation(async (path, args) => {
      switch (path) {
        case "v1/experiment/base-exp-id":
          return { name: "base-exp" };
        case "/experiment-comparison2":
          expect(args).toEqual(comparisonArgs);
          return { scores: {}, metrics: {} };
        default:
          throw new Error(`Unexpected get_json call: ${path}`);
      }
    });

    const { comparisonExperimentName } = await experiment.summarize({
      comparisonExperimentId: "base-exp-id",
    });

    expect(comparisonExperimentName).toBe("base-exp");
    expect(getJsonSpy).toHaveBeenCalledWith("v1/experiment/base-exp-id");
    expect(getJsonSpy).toHaveBeenCalledWith(
      "/experiment-comparison2",
      comparisonArgs,
      3,
    );
  } finally {
    // Reset the logger, login state and mocks even if an assertion failed.
    await memoryLogger.flush();
    _exportsForTestingOnly.clearTestBackgroundLogger();
    _exportsForTestingOnly.simulateLogoutForTests();
    vi.restoreAllMocks();
  }
});

test("dataset.version preserves pinned-version fast path", async () => {
const state = await _exportsForTestingOnly.simulateLoginForTests();
const login = vi.spyOn(state, "login").mockResolvedValue(state);
Expand Down
26 changes: 25 additions & 1 deletion js/src/logger.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1063,6 +1063,7 @@ function clearTestBackgroundLogger() {
function initTestExperiment(
experimentName: string,
projectName?: string,
experimentFullInfo: Record<string, unknown> = {},
): Experiment {
setInitialTestState();
const state = _internalGetGlobalState();
Expand All @@ -1071,7 +1072,11 @@ function initTestExperiment(
const lazyMetadata: LazyValue<ProjectExperimentMetadata> = new LazyValue(
async () => ({
project: { id: project, name: project, fullInfo: {} },
experiment: { id: experimentName, name: experimentName, fullInfo: {} },
experiment: {
id: experimentName,
name: experimentName,
fullInfo: experimentFullInfo,
},
}),
);

Expand Down Expand Up @@ -6317,6 +6322,14 @@ export class Experiment
})();
}

/**
 * Reads `base_exp_id` from this experiment's lazily loaded `fullInfo`.
 *
 * @returns The value when it is a non-empty string; otherwise `undefined`
 *   (missing key, non-string value, or empty string).
 */
public async _getBaseExperimentId(): Promise<string | undefined> {
  const { experiment } = await this.lazyMetadata.get();
  const candidate = experiment.fullInfo["base_exp_id"];
  if (typeof candidate !== "string" || candidate === "") {
    return undefined;
  }
  return candidate;
}

/**
 * Returns the constant `SpanObjectTypeV3.EXPERIMENT`.
 *
 * NOTE(review): presumably the object type recorded when this experiment is
 * the parent of a span — confirm against the callers.
 */
private parentObjectType() {
return SpanObjectTypeV3.EXPERIMENT;
}
Expand Down Expand Up @@ -6484,6 +6497,17 @@ export class Experiment
comparisonExperimentId = baseExperiment.id;
comparisonExperimentName = baseExperiment.name;
}
} else {
try {
const comparisonExperiment = await state
.apiConn()
.get_json(`v1/experiment/${comparisonExperimentId}`);
if (typeof comparisonExperiment["name"] === "string") {
comparisonExperimentName = comparisonExperiment["name"];
}
} catch {
// If the explicit comparison name lookup fails, still summarize by ID.
}
}

try {
Expand Down
Loading