Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions benchmarks/review_analysis/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
__pycache__/
.cache/
plots/
validation/

# Generated analysis outputs (regeneratable from the scripts in this dir).
cluster_*.json
per_paper_*.json

# Local symlink into ../conference_study/results/ used by analysis_three_systems.py
# and analysis_with_humans.py; recreate locally with `ln -s`.
frontier_subset_progressive
515 changes: 229 additions & 286 deletions benchmarks/review_analysis/analysis.py

Large diffs are not rendered by default.

118 changes: 118 additions & 0 deletions benchmarks/review_analysis/analysis_claude_gpt_efficient.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
"""3-way paragraph overlap on the perturbation benchmark, under OpenAIReview
(progressive): Claude Opus 4.7 vs GPT-5.5 vs the UNION of the efficient models.

Mirrors analysis_gpt_claude.py (same perturbation tree and per-cell granularity:
one cell per domain x paper x error_type), but adds a third set that pools all
efficient backbones. Output: plots/venn_claude_gpt_efficient.{png,pdf}.
"""

from collections import defaultdict
from pathlib import Path

import matplotlib.pyplot as plt

from utils import (
COLOR_BLUE, COLOR_RED, COLOR_GREEN,
load, para_set, regions_3, draw_venn3, save_fig,
)

PERTURB_ROOT = Path(__file__).resolve().parent.parent / "perturbation" / "results"

CLAUDE = "claude-opus-4.7"
GPT = "gpt-5.5"
# Efficient backbones with full progressive coverage on the perturbation tree
# (glm-4.7-flash is excluded: only ~13 cells available).
EFFICIENT = ["deepseek-v4-flash", "gemini-3.1-flash-lite-preview", "grok-4.1-fast", "qwen3.6-35b-a3b"]
MODELS = [CLAUDE, GPT] + EFFICIENT


def method_key(model):
return f"progressive__{model}"


def perturb_cells(models, root=PERTURB_ROOT):
"""Return {cell_id: {model: set(paragraph_index)}} from the perturbation tree.

cell_id = "<domain>__<paper>__<error_type>"; only the progressive method is read.
"""
cells = defaultdict(dict)
for domain_dir in sorted(root.glob("*")):
if not domain_dir.is_dir() or domain_dir.name.startswith("_"):
continue
for model in models:
mdir = domain_dir / model
if not mdir.is_dir():
continue
for etype_dir in sorted(p for p in mdir.glob("*") if p.is_dir()):
prog = etype_dir / "progressive"
if not prog.is_dir():
continue
for paper_dir in sorted(prog.glob("paper_*")):
review_jsons = sorted((paper_dir / "review").glob("*.json"))
if not review_jsons:
continue
d = load(review_jsons[0])
cell_id = f"{domain_dir.name}__{paper_dir.name}__{etype_dir.name}"
cells[cell_id][model] = para_set(d, method_key(model))
return cells


def overlap_three():
cells = perturb_cells(MODELS)
# Match the Claude-vs-GPT figure: cells where both frontier models are present.
paired = sorted(cid for cid, m in cells.items() if CLAUDE in m and GPT in m)
print(f"Cells with both {CLAUDE} and {GPT}: {len(paired)}")

totals = defaultdict(float)
jaccard_sum, jaccard_n = 0.0, 0
eff_models_seen = set()

for cid in paired:
m = cells[cid]
a = m[CLAUDE]
b = m[GPT]
c = set()
for em in EFFICIENT:
if em in m:
c |= m[em]
eff_models_seen.add(em)
r = regions_3(a, b, c)
for k in ("only_a", "only_b", "only_c", "a_b", "a_c", "b_c", "all"):
totals[k] += r[k]
if r["total"]:
jaccard_sum += r["jaccard"]
jaccard_n += 1

n = len(paired)
avg = {k: v / n for k, v in totals.items()}
jaccard_avg = jaccard_sum / jaccard_n if jaccard_n else 0.0

print(f"Efficient models pooled: {sorted(eff_models_seen)}")
print(f"\n{'Region':<26} {'Avg/cell':>10}")
print("-" * 38)
print(f"{'Only Claude':<26} {avg['only_a']:>10.2f}")
print(f"{'Only GPT':<26} {avg['only_b']:>10.2f}")
print(f"{'Only Efficient':<26} {avg['only_c']:>10.2f}")
print(f"{'Claude & GPT':<26} {avg['a_b']:>10.2f}")
print(f"{'Claude & Efficient':<26} {avg['a_c']:>10.2f}")
print(f"{'GPT & Efficient':<26} {avg['b_c']:>10.2f}")
print(f"{'All three':<26} {avg['all']:>10.2f}")
print(f"{'Jaccard (3-way)':<26} {jaccard_avg:>10.3f}")

plot(avg, jaccard_avg)


def plot(avg, jaccard_avg):
sizes = tuple(round(avg[k], 2) for k in ("only_a", "only_b", "a_b", "only_c", "a_c", "b_c", "all"))
names = ("Opus 4.7", "GPT-5.5", "Efficient models\n(union)")

fig, ax = plt.subplots(figsize=(8, 7), dpi=400)
draw_venn3(ax, sizes, names, colors=(COLOR_BLUE, COLOR_RED, COLOR_GREEN),
region_fontsize=26, set_fontsize=30)
plt.tight_layout()
paths = save_fig("venn_claude_gpt_efficient", dpi=400)
print(f"\nWrote {', '.join(str(p) for p in paths)}")


if __name__ == "__main__":
overlap_three()
106 changes: 106 additions & 0 deletions benchmarks/review_analysis/analysis_claude_gpt_efficient_outcomes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
"""Quality-proxy counterpart of analysis_claude_gpt_efficient.py.

3-way paragraph overlap on the quality-proxy (conference) papers, under
OpenAIReview (progressive): Claude Opus 4.7 vs GPT-5.5 vs the UNION of the
efficient models. Same efficient-model pool as the main-text figure.

Frontier models are read from frontier_subset_progressive; efficient models
from scaleup_v2_progressive (grok lives in its own scaleup_v2_grok_progressive
folder). Per-paper granularity, restricted to papers with both frontier
models. Output: plots/venn_claude_gpt_efficient_outcomes.{png,pdf}.
"""

from collections import defaultdict
from pathlib import Path

import matplotlib.pyplot as plt

from utils import (
COLOR_BLUE, COLOR_RED, COLOR_GREEN,
load, para_set, stems, regions_3, draw_venn3, save_fig,
)

RESULTS = Path(__file__).resolve().parent.parent / "conference_study" / "results"

FRONTIER_DIR = RESULTS / "frontier_subset_progressive"
CLAUDE = "claude-opus-4.7"
GPT = "gpt-5.5"
# Same efficient pool as analysis_claude_gpt_efficient.py (main-text figure).
EFFICIENT = {
"deepseek-v4-flash": RESULTS / "scaleup_v2_progressive",
"gemini-3.1-flash-lite-preview": RESULTS / "scaleup_v2_progressive",
"grok-4.1-fast": RESULTS / "scaleup_v2_grok_progressive",
"qwen3.6-35b-a3b": RESULTS / "scaleup_v2_progressive",
}


def method_key(model):
return f"progressive__{model}"


def overlap_three():
papers = sorted(stems(FRONTIER_DIR))

totals = defaultdict(float)
jaccard_sum, jaccard_n = 0.0, 0
n_used = 0
eff_models_seen = set()

for stem in papers:
d = load(FRONTIER_DIR / f"{stem}.json")
a = para_set(d, method_key(CLAUDE))
b = para_set(d, method_key(GPT))
if not a or not b:
continue
c = set()
for em, folder in EFFICIENT.items():
p = folder / f"{stem}.json"
if not p.exists():
continue
s = para_set(load(p), method_key(em))
if s:
c |= s
eff_models_seen.add(em)
r = regions_3(a, b, c)
for k in ("only_a", "only_b", "only_c", "a_b", "a_c", "b_c", "all"):
totals[k] += r[k]
if r["total"]:
jaccard_sum += r["jaccard"]
jaccard_n += 1
n_used += 1

print(f"Papers with both {CLAUDE} and {GPT}: {n_used}")
avg = {k: v / n_used for k, v in totals.items()}
jaccard_avg = jaccard_sum / jaccard_n if jaccard_n else 0.0

print(f"Efficient models pooled: {sorted(eff_models_seen)}")
print(f"\n{'Region':<26} {'Avg/paper':>10}")
print("-" * 38)
print(f"{'Only Claude':<26} {avg['only_a']:>10.2f}")
print(f"{'Only GPT':<26} {avg['only_b']:>10.2f}")
print(f"{'Only Efficient':<26} {avg['only_c']:>10.2f}")
print(f"{'Claude & GPT':<26} {avg['a_b']:>10.2f}")
print(f"{'Claude & Efficient':<26} {avg['a_c']:>10.2f}")
print(f"{'GPT & Efficient':<26} {avg['b_c']:>10.2f}")
print(f"{'All three':<26} {avg['all']:>10.2f}")
print(f"{'Jaccard (3-way)':<26} {jaccard_avg:>10.3f}")

plot(avg, jaccard_avg)


def plot(avg, jaccard_avg):
sizes = tuple(round(avg[k], 2) for k in ("only_a", "only_b", "a_b", "only_c", "a_c", "b_c", "all"))
names = ("Opus 4.7", "GPT-5.5", "Efficient models\n(union)")

fig, ax = plt.subplots(figsize=(8, 7), dpi=400)
# Smaller fonts than the main-text figure: this one renders at 0.85\columnwidth
# like the other appendix venns (analysis_three_systems.py), not in a half-width subfigure.
draw_venn3(ax, sizes, names, colors=(COLOR_BLUE, COLOR_RED, COLOR_GREEN),
region_fontsize=18, set_fontsize=20)
plt.tight_layout()
paths = save_fig("venn_claude_gpt_efficient_outcomes", dpi=400)
print(f"\nWrote {', '.join(str(p) for p in paths)}")


if __name__ == "__main__":
overlap_three()
Loading