diff --git a/benchmarks/review_analysis/.gitignore b/benchmarks/review_analysis/.gitignore new file mode 100644 index 0000000..1612b85 --- /dev/null +++ b/benchmarks/review_analysis/.gitignore @@ -0,0 +1,12 @@ +__pycache__/ +.cache/ +plots/ +validation/ + +# Generated analysis outputs (regeneratable from the scripts in this dir). +cluster_*.json +per_paper_*.json + +# Local symlink into ../conference_study/results/ used by analysis_three_systems.py +# and analysis_with_humans.py; recreate locally with `ln -s`. +frontier_subset_progressive diff --git a/benchmarks/review_analysis/analysis.py b/benchmarks/review_analysis/analysis.py index 61da37a..d7c37e7 100644 --- a/benchmarks/review_analysis/analysis.py +++ b/benchmarks/review_analysis/analysis.py @@ -1,63 +1,80 @@ -import json +"""Multi-method × multi-model comparison on the v2 scaleup cohort. + +Compares three OpenAIReview methods — `coarse`, `progressive`, `zero_shot` — across +four shared backbones (DeepSeek-V4-Flash, Gemini-3.1-Flash-Lite, GLM-4.7-Flash, +Qwen3.6-35B-A3B), all run on the same set of papers. Per (method, model) it loads +the per-paper result JSONs from `./coarse_v2/`, `./scaleup_v2_progressive/`, and +`./scaleup_v2_zero_shot/`, then reports: + + * `volume_dicts` : average #comments per paper per (method, model), plus + which method "wins" most often per model. + * `overlap_cp` : 2-way paragraph-index overlap of coarse vs progressive, + per model, with a 2×2 panel of venn2 plots → venn_cp.{png,pdf}. + * `overlap_all` : 3-way overlap across coarse/progressive/zero_shot, per + model, with a 2×2 panel of venn3 plots → venn_all.{png,pdf}. + * `cluster_cp` / + `cluster_all` : SentenceTransformer + KMeans (10 clusters) over comment + titles+explanations, with TF-IDF top keywords and 5 + representative comments per cluster. + +All shared helpers (`load`, `para_set`, region math, venn styling, `save_fig`) +live in `utils.py`. Plots are written to `./plots/` in both PNG and PDF. 
+ +Run: `python analysis.py` from this directory. +""" + +from collections import defaultdict, Counter from pathlib import Path -from collections import defaultdict -from rapidfuzz import fuzz -import numpy as np + import matplotlib.pyplot as plt -from matplotlib_venn import venn2, venn2_circles, venn3, venn3_circles +import numpy as np +from rapidfuzz import fuzz +from sentence_transformers import SentenceTransformer +from sklearn.cluster import KMeans +from sklearn.feature_extraction.text import TfidfVectorizer +from sklearn.manifold import TSNE + +from utils import ( + COLOR_BLUE, COLOR_RED, COLOR_GREEN, + load, para_set, regions_2, regions_3, draw_venn2, draw_venn3, save_fig, +) -from sentence_transformers import SentenceTransformer -from sklearn.feature_extraction.text import TfidfVectorizer -from sklearn.cluster import KMeans -from sklearn.manifold import TSNE -from collections import Counter model_dict = { - 'deepseek-v4-flash': 'DeepSeek-V4-Flash', - 'gemini-3.1-flash-lite-preview': 'Gemini-3.1-Flash-Lite', - 'glm-4.7-flash': 'GLM-4.7-Flash', - 'qwen3.6-35b-a3b': 'Qwen3.6-35B-A3B' + 'deepseek-v4-flash': 'DeepSeek-V4-Flash', + 'gemini-3.1-flash-lite-preview': 'Gemini-3.1-Flash-Lite', + 'glm-4.7-flash': 'GLM-4.7-Flash', + 'qwen3.6-35b-a3b': 'Qwen3.6-35B-A3B', } - -def load(path): - return json.loads(Path(path).read_text()) - -def method_key(folder_name, model): - prefix = {"coarse": "coarse", "progressive": "progressive", "zero_shot": "zero_shot"} - return f"{prefix[folder_name]}__{model}" -''' -folders = {method: folder path} -models = [models] -total_papers = # of papers -Ex: -FOLDERS = { - "coarse": "./coarse_v2/", - "progressive": "./scaleup_v2_progressive/", - "zero_shot": "./scaleup_v2_zero_shot/" -} - -MODELS = ["deepseek-v4-flash", "gemini-3.1-flash-lite-preview", "glm-4.7-flash", "qwen3.6-35b-a3b"] +def method_key(folder_name, model): + prefix = {"coarse": "coarse", "progressive": "progressive", "zero_shot": "zero_shot"} + return 
f"{prefix[folder_name]}__{model}" -TOTAL_PAPERS = len(list(Path(FOLDERS["coarse"]).glob("*.json"))) -''' +def get_papers(folders): + first_folder = next(iter(folders.values())) + if not first_folder: + return None + return [p.stem for p in Path(first_folder).glob("*.json")] -# VOLUME +# --------------------------------------------------------------------------- +# Volume +# --------------------------------------------------------------------------- def volume_dicts(folders, models, total_papers): - volume = {} # { slug -> { "coarse/deepseek": 5, "progressive/deepseek": 3, ... } } - - for folder_name, folder_path in folders.items(): - for p in Path(folder_path).glob("*.json"): - slug = p.stem - d = load(p) + volume = {} # { slug -> { model -> { folder -> n_comments } } } + + for folder_name, folder_path in folders.items(): + for p in Path(folder_path).glob("*.json"): + slug = p.stem + d = load(p) if slug not in volume: volume[slug] = defaultdict(dict) for model in models: - key = method_key(folder_name, model) + key = method_key(folder_name, model) comments = d.get("methods", {}).get(key, {}).get("comments", []) volume[slug][model][folder_name] = len(comments) @@ -70,43 +87,33 @@ def volume_dicts(folders, models, total_papers): if highest not in highest_volume[model]: highest_volume[model][highest] = 0 highest_volume[model][highest] += 1 - + for method, number in counts.items(): if method not in average_volume[model]: average_volume[model][method] = 0 average_volume[model][method] += number / total_papers - - print(f"Average number of comments per paper:\n") - print(f"{'Model':<40} {'Coarse':>10} {'Progressive':>12} {'Zero Shot':>11} {'Winner':>12}") - print("-" * 90) - for model, counts in average_volume.items(): - coarse = counts.get('coarse', 0) - prog = counts.get('progressive', 0) + + print("Average number of comments per paper:\n") + print(f"{'Model':<40} {'Coarse':>10} {'Progressive':>12} {'Zero Shot':>11} {'Winner':>12}") + print("-" * 90) + for model, 
counts in average_volume.items(): + coarse = counts.get('coarse', 0) + prog = counts.get('progressive', 0) zero_shot = counts.get('zero_shot', 0) - winner = max(counts, key=counts.get) + winner = max(counts, key=counts.get) print(f"{model:<40} {coarse:>10.2f} {prog:>12.2f} {zero_shot:>11.2f} {winner:>12}") return volume, highest_volume, average_volume - -# Comment Overlap (Coarse, Progressive) - -def get_papers(folders): - first_folder = next(iter(folders.values())) - if not first_folder: - return None - - papers = [] - for p in list(Path(first_folder).glob("*.json")): - papers.append(p.stem) - - return papers +# --------------------------------------------------------------------------- +# 2-way overlap: coarse vs progressive +# --------------------------------------------------------------------------- def overlap_cp(folders, models, total_papers): overlap_ind = defaultdict(lambda: defaultdict(dict)) - overlap_total = defaultdict(lambda: {"both_total": 0, "only_c_total": 0, "only_p_total": 0}) - overlap_avg = defaultdict(lambda: {"both_avg": 0, "only_c_avg": 0, "only_p_avg": 0, "jaccard_sim_avg": 0}) + overlap_total = defaultdict(lambda: {"both_total": 0, "only_c_total": 0, "only_p_total": 0}) + overlap_avg = defaultdict(lambda: {"both_avg": 0, "only_c_avg": 0, "only_p_avg": 0, "jaccard_sim_avg": 0}) papers = get_papers(folders) if not papers: @@ -114,113 +121,81 @@ def overlap_cp(folders, models, total_papers): temp_jaccard_sim = defaultdict(int) temp_count = defaultdict(int) - for stem in papers: - coarse_data = load(Path(folders["coarse"]) / (stem + ".json")) + + for stem in papers: + coarse_data = load(Path(folders["coarse"]) / (stem + ".json")) prog_data = load(Path(folders["progressive"]) / (stem + ".json")) - - def para_set(d, method_key): - comments = d.get("methods", {}).get(method_key, {}).get("comments", []) - return {c["paragraph_index"] for c in comments if "paragraph_index" in c} - - for model in models: - coarse_paras = para_set(coarse_data, 
method_key("coarse", model)) - prog_paras = para_set(prog_data, method_key("progressive", model)) - - both_idx = coarse_paras & prog_paras - only_c_idx = coarse_paras - prog_paras - only_p_idx = prog_paras - coarse_paras - - both_num = len(coarse_paras & prog_paras) - only_c_num = len(coarse_paras - prog_paras) - only_p_num = len(prog_paras - coarse_paras) - total_num = both_num + only_c_num + only_p_num - - overlap_ind[model][stem]["both_idx"] = both_idx - overlap_ind[model][stem]["only_c_idx"] = only_c_idx - overlap_ind[model][stem]["only_p_idx"] = only_p_idx - overlap_ind[model][stem]["both_num"] = both_num - overlap_ind[model][stem]["only_c_num"] = only_c_num - overlap_ind[model][stem]["only_p_num"] = only_p_num - overlap_ind[model][stem]["both_pct"] = both_num / total_num if total_num != 0 else None - overlap_ind[model][stem]["only_c_pct"] = only_c_num / total_num if total_num != 0 else None - overlap_ind[model][stem]["only_p_pct"] = only_p_num / total_num if total_num != 0 else None - overlap_ind[model][stem]["jaccard_sim"] = both_num / total_num if total_num != 0 else None - - overlap_total[model]["both_total"] += both_num + + for model in models: + coarse_paras = para_set(coarse_data, method_key("coarse", model)) + prog_paras = para_set(prog_data, method_key("progressive", model)) + + r = regions_2(coarse_paras, prog_paras) + both_idx = coarse_paras & prog_paras + only_c_idx = coarse_paras - prog_paras + only_p_idx = prog_paras - coarse_paras + both_num, only_c_num, only_p_num, total_num = r["both"], r["only_a"], r["only_b"], r["total"] + + overlap_ind[model][stem]["both_idx"] = both_idx + overlap_ind[model][stem]["only_c_idx"] = only_c_idx + overlap_ind[model][stem]["only_p_idx"] = only_p_idx + overlap_ind[model][stem]["both_num"] = both_num + overlap_ind[model][stem]["only_c_num"] = only_c_num + overlap_ind[model][stem]["only_p_num"] = only_p_num + overlap_ind[model][stem]["both_pct"] = both_num / total_num if total_num != 0 else None + 
overlap_ind[model][stem]["only_c_pct"] = only_c_num / total_num if total_num != 0 else None + overlap_ind[model][stem]["only_p_pct"] = only_p_num / total_num if total_num != 0 else None + overlap_ind[model][stem]["jaccard_sim"] = r["jaccard"] if total_num != 0 else None + + overlap_total[model]["both_total"] += both_num overlap_total[model]["only_c_total"] += only_c_num overlap_total[model]["only_p_total"] += only_p_num - - temp_jaccard_sim[model] += both_num / total_num if total_num != 0 else 0 + + temp_jaccard_sim[model] += r["jaccard"] temp_count[model] += 1 if total_num != 0 else 0 for model in models: - overlap_avg[model]["both_avg"] = overlap_total[model]["both_total"] / total_papers - overlap_avg[model]["only_c_avg"] = overlap_total[model]["only_c_total"] / total_papers - overlap_avg[model]["only_p_avg"] = overlap_total[model]["only_p_total"] / total_papers - overlap_avg[model]["jaccard_sim_avg"] = temp_jaccard_sim[model] / temp_count[model] - - print(f"Average overlap per paper:\n") + overlap_avg[model]["both_avg"] = overlap_total[model]["both_total"] / total_papers + overlap_avg[model]["only_c_avg"] = overlap_total[model]["only_c_total"] / total_papers + overlap_avg[model]["only_p_avg"] = overlap_total[model]["only_p_total"] / total_papers + overlap_avg[model]["jaccard_sim_avg"] = temp_jaccard_sim[model] / temp_count[model] if temp_count[model] else 0 + + print("Average overlap per paper:\n") print(f"{'Model':<35} {'Both':>8} {'Only C':>8} {'Only P':>8} {'Jaccard':>8}") print("-" * 71) - for model, counts in overlap_avg.items(): - print(f"{model:<35} {counts['both_avg']:>8.2f} {counts['only_c_avg']:>8.2f} {counts['only_p_avg']:>8.2f} {counts['jaccard_sim_avg']:>8.3f}") - + for model, counts in overlap_avg.items(): + print(f"{model:<35} {counts['both_avg']:>8.2f} {counts['only_c_avg']:>8.2f} " + f"{counts['only_p_avg']:>8.2f} {counts['jaccard_sim_avg']:>8.3f}") + plot_overlap_cp(overlap_avg) return overlap_ind, overlap_total, overlap_avg -def 
plot_overlap_cp(overlap_avg): - COLORS = ["#2196F3", "#E53935"] # blue, red - - fig, axes = plt.subplots(2, 2, figsize=(14, 11), dpi=400) + +def plot_overlap_cp(overlap_avg): + # Style matches the single-panel main-paper venns (analysis_three_systems.py): + # clean per-panel model-name titles, larger fonts, and no in-figure Jaccard + # text (Jaccard is reported in the table columns instead). + fig, axes = plt.subplots(2, 2, figsize=(12, 10), dpi=400) axes = axes.flatten() - + for i, (model, counts) in enumerate(overlap_avg.items()): - only_c = round(counts["only_c_avg"], 2) - only_p = round(counts["only_p_avg"], 2) - both = round(counts["both_avg"], 2) - - v = venn2( - subsets=(only_c, only_p, both), - set_labels=("Coarse", "OpenAIReview"), - ax=axes[i], - set_colors=COLORS, - alpha=0.15, - ) - - c = venn2_circles( - subsets=(only_c, only_p, both), - ax=axes[i], - linewidth=2.0, - ) + sizes = (round(counts["only_c_avg"], 2), + round(counts["only_p_avg"], 2), + round(counts["both_avg"], 2)) + draw_venn2(axes[i], sizes, set_labels=("coarse", "OpenAIReview"), + colors=(COLOR_BLUE, COLOR_RED), + region_fontsize=24, set_fontsize=21) + axes[i].set_title(model_dict.get(model, model), fontsize=21, pad=12) + + plt.tight_layout() + plt.subplots_adjust(hspace=0.16, wspace=0.06) + save_fig("venn_cp", dpi=400) - for circle, color in zip(c, COLORS): - circle.set_edgecolor(color) - circle.set_linewidth(2.0) - - for label_id in ["10", "01", "11"]: - lbl = v.get_label_by_id(label_id) - if lbl: - lbl.set_fontsize(15) - lbl.set_color("black") - lbl.set_fontweight("normal") - lbl.set_ha("center") - - for set_label in v.set_labels: - if set_label: - set_label.set_fontsize(15) - set_label.set_color("black") - - axes[i].set_title(f"{model_dict.get(model, model)}\nJaccard Similarity: {counts['jaccard_sim_avg']:.3f}", - fontsize=15, fontweight="bold", pad=10) - - plt.tight_layout() - plt.subplots_adjust(hspace=0.2, wspace=0.1) - plt.savefig("./venn_cp.png", dpi=400, bbox_inches="tight") - 
- - -# Comment Overlap (Coarse, Progressive, Zero Shot) + +# --------------------------------------------------------------------------- +# 3-way overlap: coarse, progressive, zero_shot +# --------------------------------------------------------------------------- def overlap_all(folders, models, total_papers): overlap_ind = defaultdict(lambda: defaultdict(dict)) @@ -237,10 +212,6 @@ def overlap_all(folders, models, total_papers): if not papers: return None - def para_set(d, mk): - comments = d.get("methods", {}).get(mk, {}).get("comments", []) - return {c["paragraph_index"] for c in comments if "paragraph_index" in c} - temp_jaccard_sim = defaultdict(int) temp_count = defaultdict(int) @@ -254,22 +225,23 @@ def para_set(d, mk): prog_paras = para_set(prog_data, method_key("progressive", model)) zero_paras = para_set(zero_data, method_key("zero_shot", model)) - all_idx = coarse_paras & prog_paras & zero_paras - only_c_idx = coarse_paras - prog_paras - zero_paras - only_p_idx = prog_paras - coarse_paras - zero_paras - only_z_idx = zero_paras - coarse_paras - prog_paras - only_c_p_idx = (coarse_paras & prog_paras) - zero_paras - only_c_z_idx = (coarse_paras & zero_paras) - prog_paras - only_p_z_idx = (prog_paras & zero_paras) - coarse_paras - - all_num = len(all_idx) - only_c_num = len(only_c_idx) - only_p_num = len(only_p_idx) - only_z_num = len(only_z_idx) - only_c_p_num = len(only_c_p_idx) - only_c_z_num = len(only_c_z_idx) - only_p_z_num = len(only_p_z_idx) - total_num = all_num + only_c_num + only_p_num + only_z_num + only_c_p_num + only_c_z_num + only_p_z_num + all_idx = coarse_paras & prog_paras & zero_paras + only_c_idx = coarse_paras - prog_paras - zero_paras + only_p_idx = prog_paras - coarse_paras - zero_paras + only_z_idx = zero_paras - coarse_paras - prog_paras + only_c_p_idx = (coarse_paras & prog_paras) - zero_paras + only_c_z_idx = (coarse_paras & zero_paras) - prog_paras + only_p_z_idx = (prog_paras & zero_paras) - coarse_paras + + r = 
regions_3(coarse_paras, prog_paras, zero_paras) + all_num = r["all"] + only_c_num = r["only_a"] + only_p_num = r["only_b"] + only_z_num = r["only_c"] + only_c_p_num = r["a_b"] + only_c_z_num = r["a_c"] + only_p_z_num = r["b_c"] + total_num = r["total"] overlap_ind[model][stem]["all_idx"] = all_idx overlap_ind[model][stem]["only_c_idx"] = only_c_idx @@ -285,7 +257,7 @@ def para_set(d, mk): overlap_ind[model][stem]["only_c_p_num"] = only_c_p_num overlap_ind[model][stem]["only_c_z_num"] = only_c_z_num overlap_ind[model][stem]["only_p_z_num"] = only_p_z_num - overlap_ind[model][stem]["jaccard_sim"] = all_num / total_num if total_num != 0 else None + overlap_ind[model][stem]["jaccard_sim"] = r["jaccard"] if total_num != 0 else None overlap_total[model]["all_total"] += all_num overlap_total[model]["only_c_total"] += only_c_num @@ -295,21 +267,22 @@ def para_set(d, mk): overlap_total[model]["only_c_z_total"] += only_c_z_num overlap_total[model]["only_p_z_total"] += only_p_z_num - temp_jaccard_sim[model] += all_num / total_num if total_num != 0 else 0 + temp_jaccard_sim[model] += r["jaccard"] temp_count[model] += 1 if total_num != 0 else 0 for model in models: - overlap_avg[model]["all_avg"] = overlap_total[model]["all_total"] / total_papers - overlap_avg[model]["only_c_avg"] = overlap_total[model]["only_c_total"] / total_papers - overlap_avg[model]["only_p_avg"] = overlap_total[model]["only_p_total"] / total_papers - overlap_avg[model]["only_z_avg"] = overlap_total[model]["only_z_total"] / total_papers - overlap_avg[model]["only_c_p_avg"] = overlap_total[model]["only_c_p_total"] / total_papers - overlap_avg[model]["only_c_z_avg"] = overlap_total[model]["only_c_z_total"] / total_papers - overlap_avg[model]["only_p_z_avg"] = overlap_total[model]["only_p_z_total"] / total_papers + overlap_avg[model]["all_avg"] = overlap_total[model]["all_total"] / total_papers + overlap_avg[model]["only_c_avg"] = overlap_total[model]["only_c_total"] / total_papers + 
overlap_avg[model]["only_p_avg"] = overlap_total[model]["only_p_total"] / total_papers + overlap_avg[model]["only_z_avg"] = overlap_total[model]["only_z_total"] / total_papers + overlap_avg[model]["only_c_p_avg"] = overlap_total[model]["only_c_p_total"] / total_papers + overlap_avg[model]["only_c_z_avg"] = overlap_total[model]["only_c_z_total"] / total_papers + overlap_avg[model]["only_p_z_avg"] = overlap_total[model]["only_p_z_total"] / total_papers overlap_avg[model]["jaccard_sim_avg"] = temp_jaccard_sim[model] / temp_count[model] if temp_count[model] else 0 - print(f"Average 3-way overlap per paper:\n") - print(f"{'Model':<35} {'All':>6} {'Only C':>8} {'Only P':>8} {'Only Z':>8} {'C∩P':>6} {'C∩Z':>6} {'P∩Z':>6} {'Jaccard':>8}") + print("Average 3-way overlap per paper:\n") + print(f"{'Model':<35} {'All':>6} {'Only C':>8} {'Only P':>8} {'Only Z':>8} " + f"{'C∩P':>6} {'C∩Z':>6} {'P∩Z':>6} {'Jaccard':>8}") print("-" * 100) for model, counts in overlap_avg.items(): print(f"{model:<35} " @@ -328,67 +301,39 @@ def para_set(d, mk): def plot_overlap_all(overlap_avg): - COLORS = ["#2196F3", "#E53935", "#43A047"] # blue, red, green - - fig, axes = plt.subplots(2, 2, figsize=(14, 11), dpi=400) + # Same clean style as plot_overlap_cp: model-name-only titles, no in-figure + # Jaccard, consistent palette/edges via draw_venn3. 
+ fig, axes = plt.subplots(2, 2, figsize=(12, 11), dpi=400) axes = axes.flatten() for i, (model, counts) in enumerate(overlap_avg.items()): - only_c = round(counts["only_c_avg"], 2) - only_p = round(counts["only_p_avg"], 2) - only_z = round(counts["only_z_avg"], 2) - only_cp = round(counts["only_c_p_avg"], 2) - only_cz = round(counts["only_c_z_avg"], 2) - only_pz = round(counts["only_p_z_avg"], 2) - all_three = round(counts["all_avg"], 2) - - v = venn3( - subsets=(only_c, only_p, only_cp, only_z, only_cz, only_pz, all_three), - set_labels=("'coarse", "OpenAIReview", "zero-shot"), - ax=axes[i], - set_colors=COLORS, - alpha=0.15, - ) - - c = venn3_circles( - subsets=(only_c, only_p, only_cp, only_z, only_cz, only_pz, all_three), - ax=axes[i], - linewidth=2.0, + sizes = ( + round(counts["only_c_avg"], 2), + round(counts["only_p_avg"], 2), + round(counts["only_c_p_avg"], 2), + round(counts["only_z_avg"], 2), + round(counts["only_c_z_avg"], 2), + round(counts["only_p_z_avg"], 2), + round(counts["all_avg"], 2), ) - - for circle, color in zip(c, COLORS): - circle.set_edgecolor(color) - circle.set_linewidth(2.0) - - for label_id in ["100", "010", "110", "001", "101", "011", "111"]: - lbl = v.get_label_by_id(label_id) - if lbl: - lbl.set_fontsize(13) - lbl.set_color("black") - lbl.set_fontweight("normal") - lbl.set_ha("center") - - for set_label in v.set_labels: - if set_label: - set_label.set_fontsize(13) - set_label.set_color("black") - - axes[i].set_title(f"{model_dict.get(model, model)}\nJaccard Similarity: {counts['jaccard_sim_avg']:.3f}", - fontsize=15, fontweight="bold", pad=10) + draw_venn3(axes[i], sizes, set_labels=("coarse", "OpenAIReview", "zero-shot"), + colors=(COLOR_BLUE, COLOR_RED, COLOR_GREEN), + region_fontsize=17, set_fontsize=18) + axes[i].set_title(model_dict.get(model, model), fontsize=21, pad=12) plt.tight_layout() - plt.subplots_adjust(hspace=0.2, wspace=0.1) - plt.savefig("./venn_all.png", dpi=400, bbox_inches="tight") - + 
plt.subplots_adjust(hspace=0.16, wspace=0.06) + save_fig("venn_all", dpi=400) -# Cluster Analysis +# --------------------------------------------------------------------------- +# Cluster analysis +# --------------------------------------------------------------------------- def cluster_cp(folders, models): - # 1. Collect comments - texts = [] - labels = [] # "coarse" or "progressive" - models_tag = [] + texts = [] + labels = [] # "coarse" or "progressive" + models_tag = [] papers = get_papers(folders) if not papers: @@ -396,56 +341,52 @@ def cluster_cp(folders, models): for model in models: for stem in papers: - coarse_data = load(Path(folders["coarse"]) / (stem + ".json")) - prog_data = load(Path(folders["progressive"]) / (stem + ".json")) - - for c in coarse_data.get("methods", {}).get(method_key("coarse", model), {}).get("comments", []): - texts.append(c.get("title", "") + " " + c.get("explanation", "")) - labels.append("coarse") + coarse_data = load(Path(folders["coarse"]) / (stem + ".json")) + prog_data = load(Path(folders["progressive"]) / (stem + ".json")) + + for c in coarse_data.get("methods", {}).get(method_key("coarse", model), {}).get("comments", []): + texts.append(c.get("title", "") + " " + c.get("explanation", "")) + labels.append("coarse") models_tag.append(model) - for p in prog_data.get("methods", {}).get(method_key("progressive", model), {}).get("comments", []): + for p in prog_data.get("methods", {}).get(method_key("progressive", model), {}).get("comments", []): texts.append(p.get("title", "") + " " + p.get("explanation", "")) - labels.append("progressive") + labels.append("progressive") models_tag.append(model) - - print(f"Total comments: {len(texts)} (coarse: {labels.count('coarse')}, progressive: {labels.count('progressive')})") - - # 2. 
Embed and cluster + + print(f"Total comments: {len(texts)} " + f"(coarse: {labels.count('coarse')}, progressive: {labels.count('progressive')})") + model_emb = SentenceTransformer("all-MiniLM-L6-v2") - X = model_emb.encode(texts, show_progress_bar=True) - - N_CLUSTERS = 10 - km = KMeans(n_clusters=N_CLUSTERS, random_state=42) - cluster_ids = km.fit_predict(X) + X = model_emb.encode(texts, show_progress_bar=True) - # 3. Results + N_CLUSTERS = 10 + km = KMeans(n_clusters=N_CLUSTERS, random_state=42) + cluster_ids = km.fit_predict(X) - # fit TF-IDF on clustered comments for keywords + # TF-IDF for cluster keywords tfidf = TfidfVectorizer(max_features=10000, stop_words="english") - X_tfidf = tfidf.fit_transform(texts) + X_tfidf = tfidf.fit_transform(texts) terms = tfidf.get_feature_names_out() - for cluster_id in range(N_CLUSTERS): - indices = np.where(cluster_ids == cluster_id)[0] - method_counts = Counter(labels[i] for i in indices) - total = len(indices) - - # find 5 comments closest to the cluster centroid + for cluster_id in range(N_CLUSTERS): + indices = np.where(cluster_ids == cluster_id)[0] + method_counts = Counter(labels[i] for i in indices) + total = len(indices) + + # 5 comments closest to the cluster centroid centroid = km.cluster_centers_[cluster_id] - distances = np.linalg.norm(X[indices] - centroid, axis=1) - closest = indices[np.argsort(distances)[:5]] + distances = np.linalg.norm(X[indices] - centroid, axis=1) + closest = indices[np.argsort(distances)[:5]] - # average TF-IDF score across all docs in this cluster - cluster_tfidf = X_tfidf[indices].mean(axis=0) - cluster_tfidf = np.asarray(cluster_tfidf).flatten() - top_keywords = [terms[j] for j in cluster_tfidf.argsort()[-15:][::-1]] + cluster_tfidf = np.asarray(X_tfidf[indices].mean(axis=0)).flatten() + top_keywords = [terms[j] for j in cluster_tfidf.argsort()[-15:][::-1]] - print(f"\nCluster {cluster_id} ({total} comments)") + print(f"\nCluster {cluster_id} ({total} comments)") print(f" coarse: 
{method_counts['coarse']} ({method_counts['coarse']/total*100:.0f}%) " - f"progressive: {method_counts['progressive']} ({method_counts['progressive']/total*100:.0f}%)") - print(f" Keywords: {', '.join(top_keywords)}") - print(f" Most representative comments:") + f"progressive: {method_counts['progressive']} ({method_counts['progressive']/total*100:.0f}%)") + print(f" Keywords: {', '.join(top_keywords)}") + print(f" Most representative comments:") for i in closest: print(f" [{labels[i]:12s}] {texts[i][:100]}") @@ -480,7 +421,8 @@ def cluster_all(folders, models): labels.append("zero_shot") models_tag.append(model) - print(f"Total comments: {len(texts)} (coarse: {labels.count('coarse')}, progressive: {labels.count('progressive')}, zero_shot: {labels.count('zero_shot')})") + print(f"Total comments: {len(texts)} (coarse: {labels.count('coarse')}, " + f"progressive: {labels.count('progressive')}, zero_shot: {labels.count('zero_shot')})") model_emb = SentenceTransformer("all-MiniLM-L6-v2") X = model_emb.encode(texts, show_progress_bar=True) @@ -498,12 +440,12 @@ def cluster_all(folders, models): method_counts = Counter(labels[i] for i in indices) total = len(indices) - centroid = km.cluster_centers_[cluster_id] + centroid = km.cluster_centers_[cluster_id] distances = np.linalg.norm(X[indices] - centroid, axis=1) - closest = indices[np.argsort(distances)[:5]] + closest = indices[np.argsort(distances)[:5]] cluster_tfidf = np.asarray(X_tfidf[indices].mean(axis=0)).flatten() - top_keywords = [terms[j] for j in cluster_tfidf.argsort()[-15:][::-1]] + top_keywords = [terms[j] for j in cluster_tfidf.argsort()[-15:][::-1]] print(f"\nCluster {cluster_id} ({total} comments)") print(f" coarse: {method_counts['coarse']} ({method_counts['coarse']/total*100:.0f}%)") @@ -521,11 +463,12 @@ def cluster_all(folders, models): "progressive": "./scaleup_v2_progressive/", "zero_shot": "./scaleup_v2_zero_shot/", } - MODELS = ["deepseek-v4-flash", - "gemini-3.1-flash-lite-preview", - 
"glm-4.7-flash", - "qwen3.6-35b-a3b" - ] + MODELS = [ + "deepseek-v4-flash", + "gemini-3.1-flash-lite-preview", + "glm-4.7-flash", + "qwen3.6-35b-a3b", + ] TOTAL_PAPERS = len(list(Path(FOLDERS["coarse"]).glob("*.json"))) print("=" * 90) diff --git a/benchmarks/review_analysis/analysis_claude_gpt_efficient.py b/benchmarks/review_analysis/analysis_claude_gpt_efficient.py new file mode 100644 index 0000000..a649eb0 --- /dev/null +++ b/benchmarks/review_analysis/analysis_claude_gpt_efficient.py @@ -0,0 +1,118 @@ +"""3-way paragraph overlap on the perturbation benchmark, under OpenAIReview +(progressive): Claude Opus 4.7 vs GPT-5.5 vs the UNION of the efficient models. + +Mirrors analysis_gpt_claude.py (same perturbation tree and per-cell granularity: +one cell per domain x paper x error_type), but adds a third set that pools all +efficient backbones. Output: plots/venn_claude_gpt_efficient.{png,pdf}. +""" + +from collections import defaultdict +from pathlib import Path + +import matplotlib.pyplot as plt + +from utils import ( + COLOR_BLUE, COLOR_RED, COLOR_GREEN, + load, para_set, regions_3, draw_venn3, save_fig, +) + +PERTURB_ROOT = Path(__file__).resolve().parent.parent / "perturbation" / "results" + +CLAUDE = "claude-opus-4.7" +GPT = "gpt-5.5" +# Efficient backbones with full progressive coverage on the perturbation tree +# (glm-4.7-flash is excluded: only ~13 cells available). +EFFICIENT = ["deepseek-v4-flash", "gemini-3.1-flash-lite-preview", "grok-4.1-fast", "qwen3.6-35b-a3b"] +MODELS = [CLAUDE, GPT] + EFFICIENT + + +def method_key(model): + return f"progressive__{model}" + + +def perturb_cells(models, root=PERTURB_ROOT): + """Return {cell_id: {model: set(paragraph_index)}} from the perturbation tree. + + cell_id = "{domain}__{paper}__{error_type}"; only the progressive method is read.
+ """ + cells = defaultdict(dict) + for domain_dir in sorted(root.glob("*")): + if not domain_dir.is_dir() or domain_dir.name.startswith("_"): + continue + for model in models: + mdir = domain_dir / model + if not mdir.is_dir(): + continue + for etype_dir in sorted(p for p in mdir.glob("*") if p.is_dir()): + prog = etype_dir / "progressive" + if not prog.is_dir(): + continue + for paper_dir in sorted(prog.glob("paper_*")): + review_jsons = sorted((paper_dir / "review").glob("*.json")) + if not review_jsons: + continue + d = load(review_jsons[0]) + cell_id = f"{domain_dir.name}__{paper_dir.name}__{etype_dir.name}" + cells[cell_id][model] = para_set(d, method_key(model)) + return cells + + +def overlap_three(): + cells = perturb_cells(MODELS) + # Match the Claude-vs-GPT figure: cells where both frontier models are present. + paired = sorted(cid for cid, m in cells.items() if CLAUDE in m and GPT in m) + print(f"Cells with both {CLAUDE} and {GPT}: {len(paired)}") + + totals = defaultdict(float) + jaccard_sum, jaccard_n = 0.0, 0 + eff_models_seen = set() + + for cid in paired: + m = cells[cid] + a = m[CLAUDE] + b = m[GPT] + c = set() + for em in EFFICIENT: + if em in m: + c |= m[em] + eff_models_seen.add(em) + r = regions_3(a, b, c) + for k in ("only_a", "only_b", "only_c", "a_b", "a_c", "b_c", "all"): + totals[k] += r[k] + if r["total"]: + jaccard_sum += r["jaccard"] + jaccard_n += 1 + + n = len(paired) + avg = {k: v / n for k, v in totals.items()} + jaccard_avg = jaccard_sum / jaccard_n if jaccard_n else 0.0 + + print(f"Efficient models pooled: {sorted(eff_models_seen)}") + print(f"\n{'Region':<26} {'Avg/cell':>10}") + print("-" * 38) + print(f"{'Only Claude':<26} {avg['only_a']:>10.2f}") + print(f"{'Only GPT':<26} {avg['only_b']:>10.2f}") + print(f"{'Only Efficient':<26} {avg['only_c']:>10.2f}") + print(f"{'Claude & GPT':<26} {avg['a_b']:>10.2f}") + print(f"{'Claude & Efficient':<26} {avg['a_c']:>10.2f}") + print(f"{'GPT & Efficient':<26} {avg['b_c']:>10.2f}") + 
print(f"{'All three':<26} {avg['all']:>10.2f}") + print(f"{'Jaccard (3-way)':<26} {jaccard_avg:>10.3f}") + + plot(avg, jaccard_avg) + + +def plot(avg, jaccard_avg): + sizes = tuple(round(avg[k], 2) for k in ("only_a", "only_b", "a_b", "only_c", "a_c", "b_c", "all")) + names = ("Opus 4.7", "GPT-5.5", "Efficient models\n(union)") + + fig, ax = plt.subplots(figsize=(8, 7), dpi=400) + draw_venn3(ax, sizes, names, colors=(COLOR_BLUE, COLOR_RED, COLOR_GREEN), + region_fontsize=26, set_fontsize=30) + plt.tight_layout() + paths = save_fig("venn_claude_gpt_efficient", dpi=400) + print(f"\nWrote {', '.join(str(p) for p in paths)}") + + +if __name__ == "__main__": + overlap_three() diff --git a/benchmarks/review_analysis/analysis_claude_gpt_efficient_outcomes.py b/benchmarks/review_analysis/analysis_claude_gpt_efficient_outcomes.py new file mode 100644 index 0000000..b229b10 --- /dev/null +++ b/benchmarks/review_analysis/analysis_claude_gpt_efficient_outcomes.py @@ -0,0 +1,106 @@ +"""Quality-proxy counterpart of analysis_claude_gpt_efficient.py. + +3-way paragraph overlap on the quality-proxy (conference) papers, under +OpenAIReview (progressive): Claude Opus 4.7 vs GPT-5.5 vs the UNION of the +efficient models. Same efficient-model pool as the main-text figure. + +Frontier models are read from frontier_subset_progressive; efficient models +from scaleup_v2_progressive (grok lives in its own scaleup_v2_grok_progressive +folder). Per-paper granularity, restricted to papers with both frontier +models. Output: plots/venn_claude_gpt_efficient_outcomes.{png,pdf}. 
def overlap_three():
    """Print and plot the average 3-way paragraph overlap per conference paper.

    For every paper stem found in ``FRONTIER_DIR`` the Claude and GPT paragraph
    sets are read from that paper's JSON; papers where either set is empty are
    skipped. The third set is the union, over the ``EFFICIENT`` models, of the
    paragraph sets read from each model's own results folder (missing files and
    empty sets are ignored). Region sizes are averaged over the papers used;
    the Jaccard value is averaged only over papers whose ``regions_3`` total is
    non-zero.

    Returns:
        None. Results are printed and handed to ``plot``. When no paper has
        both frontier models, a notice is printed and nothing is plotted.
    """
    region_keys = ("only_a", "only_b", "only_c", "a_b", "a_c", "b_c", "all")
    papers = sorted(stems(FRONTIER_DIR))

    totals = dict.fromkeys(region_keys, 0.0)
    jaccard_sum, jaccard_n = 0.0, 0
    n_used = 0
    eff_models_seen = set()

    for stem in papers:
        d = load(FRONTIER_DIR / f"{stem}.json")
        a = para_set(d, method_key(CLAUDE))
        b = para_set(d, method_key(GPT))
        if not a or not b:
            continue
        # Pool every efficient model that has a non-empty result for this paper.
        c = set()
        for em, folder in EFFICIENT.items():
            p = folder / f"{stem}.json"
            if not p.exists():
                continue
            s = para_set(load(p), method_key(em))
            if s:
                c |= s
                eff_models_seen.add(em)
        r = regions_3(a, b, c)
        for k in region_keys:
            totals[k] += r[k]
        if r["total"]:
            jaccard_sum += r["jaccard"]
            jaccard_n += 1
        n_used += 1

    print(f"Papers with both {CLAUDE} and {GPT}: {n_used}")
    if not n_used:
        # With zero usable papers the averages used to be an empty dict and the
        # table print failed with KeyError('only_a'); bail out with a message.
        print("No papers with both frontier models; skipping overlap table and plot.")
        return

    avg = {k: totals[k] / n_used for k in region_keys}
    jaccard_avg = jaccard_sum / jaccard_n if jaccard_n else 0.0

    print(f"Efficient models pooled: {sorted(eff_models_seen)}")
    print(f"\n{'Region':<26} {'Avg/paper':>10}")
    print("-" * 38)
    print(f"{'Only Claude':<26} {avg['only_a']:>10.2f}")
    print(f"{'Only GPT':<26} {avg['only_b']:>10.2f}")
    print(f"{'Only Efficient':<26} {avg['only_c']:>10.2f}")
    print(f"{'Claude & GPT':<26} {avg['a_b']:>10.2f}")
    print(f"{'Claude & Efficient':<26} {avg['a_c']:>10.2f}")
    print(f"{'GPT & Efficient':<26} {avg['b_c']:>10.2f}")
    print(f"{'All three':<26} {avg['all']:>10.2f}")
    print(f"{'Jaccard (3-way)':<26} {jaccard_avg:>10.3f}")

    plot(avg, jaccard_avg)
[p.stem for p in Path(folder).glob("*.json")] + return sorted(stems(folder)) # VOLUME @@ -64,10 +61,6 @@ def overlap(folder, models, total_papers): temp_jaccard_sim = defaultdict(int) temp_count = defaultdict(int) - def para_set(d, mk): - comments = d.get("methods", {}).get(mk, {}).get("comments", []) - return {c["paragraph_index"] for c in comments if "paragraph_index" in c} - claude, gpt = models[0], models[1] for stem in papers: @@ -76,10 +69,8 @@ def para_set(d, mk): claude_paras = para_set(d, method_key(claude)) gpt_paras = para_set(d, method_key(gpt)) - both_num = len(claude_paras & gpt_paras) - only_c_num = len(claude_paras - gpt_paras) - only_p_num = len(gpt_paras - claude_paras) - total_num = both_num + only_c_num + only_p_num + r = regions_2(claude_paras, gpt_paras) + both_num, only_c_num, only_p_num, total_num = r["both"], r["only_a"], r["only_b"], r["total"] overlap_ind[stem]["both_idx"] = claude_paras & gpt_paras overlap_ind[stem]["only_c_idx"] = claude_paras - gpt_paras @@ -87,13 +78,13 @@ def para_set(d, mk): overlap_ind[stem]["both_num"] = both_num overlap_ind[stem]["only_c_num"] = only_c_num overlap_ind[stem]["only_p_num"] = only_p_num - overlap_ind[stem]["jaccard_sim"] = both_num / total_num if total_num else None + overlap_ind[stem]["jaccard_sim"] = r["jaccard"] if total_num else None overlap_total["both_total"] += both_num overlap_total["only_c_total"] += only_c_num overlap_total["only_p_total"] += only_p_num - temp_jaccard_sim["all"] += both_num / total_num if total_num else 0 + temp_jaccard_sim["all"] += r["jaccard"] temp_count["all"] += 1 if total_num else 0 overlap_avg["both_avg"] = overlap_total["both_total"] / total_papers @@ -112,45 +103,19 @@ def para_set(d, mk): def plot_overlap(overlap_avg, models): - COLORS = ["#2196F3", "#E53935"] # blue, red + sizes = (round(overlap_avg["only_c_avg"], 2), + round(overlap_avg["only_p_avg"], 2), + round(overlap_avg["both_avg"], 2)) fig, ax = plt.subplots(1, 1, figsize=(7, 6), dpi=400) - - only_c = 
round(overlap_avg["only_c_avg"], 2) - only_p = round(overlap_avg["only_p_avg"], 2) - both = round(overlap_avg["both_avg"], 2) - - v = venn2( - subsets=(only_c, only_p, both), + draw_venn2( + ax, sizes, set_labels=(model_dict.get(models[0], models[0]), model_dict.get(models[1], models[1])), - ax=ax, - set_colors=COLORS, - alpha=0.15, + colors=(COLOR_BLUE, COLOR_RED), + region_fontsize=22, set_fontsize=20, ) - - c = venn2_circles(subsets=(only_c, only_p, both), ax=ax, linewidth=2.0) - for circle, color in zip(c, COLORS): - circle.set_edgecolor(color) - circle.set_linewidth(2.0) - - for label_id in ["10", "01", "11"]: - lbl = v.get_label_by_id(label_id) - if lbl: - lbl.set_fontsize(15) - lbl.set_color("black") - lbl.set_fontweight("normal") - lbl.set_ha("center") - - for set_label in v.set_labels: - if set_label: - set_label.set_fontsize(15) - set_label.set_color("black") - - ax.set_title(f"Claude Opus 4.7 vs. GPT-5.5\nJaccard Similarity: {overlap_avg['jaccard_sim_avg']:.3f}", - fontsize=15, fontweight="bold", pad=10) - plt.tight_layout() - plt.savefig("./venn_gpt_claude.png", dpi=400, bbox_inches="tight") + save_fig("venn_gpt_claude", dpi=400) # CLUSTERING @@ -206,22 +171,89 @@ def cluster(folder, models): print(f" [{labels[i]:20s}] {texts[i][:100]}") -if __name__ == "__main__": - FOLDER = "./frontier_subset_progressive/" - MODELS = ["claude-opus-4.7", "gpt-5.5"] - TOTAL_PAPERS = len(list(Path(FOLDER).glob("*.json"))) +# PERTURBATION SOURCE +# +# On the perturbation benchmark the OpenAIReview (progressive) reviews live in a tree +# perturbation/results/<domain>/<model>/<error_type>/progressive/paper_xxx/review/*.json +# rather than a flat folder of per-paper JSONs. Each leaf JSON carries +# methods["progressive__<model>"]. We treat each (domain, paper, error_type) as one cell +# (matching the _fused_for_venn granularity) and average the 2-way Venn over cells in +# which both models are present. 
def overlap_perturb(models, root=PERTURB_ROOT):
    """Average the 2-way paragraph overlap of two models over perturbation cells.

    Args:
        models: Sequence whose first two entries are the model names to
            compare (first is reported as "Claude", second as "GPT").
        root: Root of the perturbation results tree passed to ``perturb_cells``.

    Returns:
        Dict with ``both_avg``, ``only_c_avg``, ``only_p_avg`` (mean region
        sizes per paired cell) and ``jaccard_sim_avg`` (mean Jaccard over the
        paired cells whose ``regions_2`` total is non-zero). All values are 0
        when no cell contains both models.
    """
    claude, gpt = models[0], models[1]
    cells = perturb_cells(models, root)
    paired = sorted(cid for cid, m in cells.items() if claude in m and gpt in m)
    print(f"Cells with both {claude} and {gpt}: {len(paired)}")

    totals = {"both": 0, "only_c": 0, "only_p": 0}
    jaccard_sum, jaccard_n = 0.0, 0
    vol = {claude: 0, gpt: 0}
    for cid in paired:
        ca, gp = cells[cid][claude], cells[cid][gpt]
        r = regions_2(ca, gp)
        totals["both"] += r["both"]
        totals["only_c"] += r["only_a"]
        totals["only_p"] += r["only_b"]
        vol[claude] += len(ca)
        vol[gpt] += len(gp)
        if r["total"]:
            jaccard_sum += r["jaccard"]
            jaccard_n += 1

    n = len(paired)
    overlap_avg = {
        "both_avg": totals["both"] / n if n else 0,
        "only_c_avg": totals["only_c"] / n if n else 0,
        "only_p_avg": totals["only_p"] / n if n else 0,
        "jaccard_sim_avg": jaccard_sum / jaccard_n if jaccard_n else 0,
    }
    # The per-cell volume averages need the same zero guard as the overlap
    # averages above; unguarded, an empty pairing raised ZeroDivisionError here.
    vol_claude_avg = vol[claude] / n if n else 0
    vol_gpt_avg = vol[gpt] / n if n else 0

    print(f"\nAverage comments per cell: {claude} {vol_claude_avg:.2f}, {gpt} {vol_gpt_avg:.2f}")
    print("\nAverage overlap per cell:\n")
    print(f"{'Both':>8} {'Only Claude':>12} {'Only GPT':>10} {'Jaccard':>8}")
    print("-" * 45)
    print(f"{overlap_avg['both_avg']:>8.2f} {overlap_avg['only_c_avg']:>12.2f} {overlap_avg['only_p_avg']:>10.2f} {overlap_avg['jaccard_sim_avg']:>8.3f}")

    if n:
        plot_overlap(overlap_avg, models)
    else:
        # An all-zero diagram carries no information; skip drawing it.
        print("No paired cells found; skipping Venn plot.")
    return overlap_avg
def overlap_three(systems):
    """Print and plot the average 3-way paragraph overlap of three review systems.

    Args:
        systems: Mapping ``display name -> (results folder, method key)`` with
            exactly three entries, in the order they should appear as sets
            A, B and C.

    Only paper stems present in all three folders are compared. Region sizes
    are averaged over those papers; the Jaccard value is averaged only over
    papers whose ``regions_3`` total is non-zero. When the three folders share
    no paper, a notice is printed and nothing is plotted.
    """
    region_keys = ("only_a", "only_b", "only_c", "a_b", "a_c", "b_c", "all")

    names = list(systems.keys())
    folders = {n: Path(systems[n][0]) for n in names}
    keys = {n: systems[n][1] for n in names}

    # Restrict to papers present in all three folders.
    papers = sorted(set.intersection(*(stems(folders[n]) for n in names)))
    print(f"Papers in all 3 systems: {len(papers)}")
    if not papers:
        # The folders are relative paths, so running from another directory
        # yields an empty intersection; that used to end in KeyError('only_a').
        print("No common papers found; skipping overlap table and plot.")
        return

    totals = dict.fromkeys(region_keys, 0)
    jaccard_sum = 0.0
    jaccard_n = 0

    a, b, c = names

    for stem in papers:
        sa = para_set(load(folders[a] / f"{stem}.json"), keys[a])
        sb = para_set(load(folders[b] / f"{stem}.json"), keys[b])
        sc = para_set(load(folders[c] / f"{stem}.json"), keys[c])

        r = regions_3(sa, sb, sc)
        for k in region_keys:
            totals[k] += r[k]
        if r["total"]:
            jaccard_sum += r["jaccard"]
            jaccard_n += 1

    n_papers = len(papers)
    avg = {k: totals[k] / n_papers for k in region_keys}
    jaccard_avg = jaccard_sum / jaccard_n if jaccard_n else 0.0

    print(f"\n{'Region':<28} {'Avg/paper':>10}")
    print("-" * 40)
    print(f"{'Only ' + a:<28} {avg['only_a']:>10.2f}")
    print(f"{'Only ' + b:<28} {avg['only_b']:>10.2f}")
    print(f"{'Only ' + c:<28} {avg['only_c']:>10.2f}")
    print(f"{a + ' & ' + b:<28} {avg['a_b']:>10.2f}")
    print(f"{a + ' & ' + c:<28} {avg['a_c']:>10.2f}")
    print(f"{b + ' & ' + c:<28} {avg['b_c']:>10.2f}")
    print(f"{'All three':<28} {avg['all']:>10.2f}")
    print(f"{'Jaccard (3-way)':<28} {jaccard_avg:>10.3f}")

    plot(names, avg, jaccard_avg, n_papers)
7), dpi=400) + draw_venn3(ax, sizes, tuple(names), colors=(COLOR_BLUE, COLOR_RED, COLOR_GREEN), + region_fontsize=18, set_fontsize=20) + plt.tight_layout() + paths = save_fig("venn_three_systems", dpi=400) + print(f"\nWrote {', '.join(str(p) for p in paths)}") + + +if __name__ == "__main__": + overlap_three(SYSTEMS) diff --git a/benchmarks/review_analysis/analysis_union_models.py b/benchmarks/review_analysis/analysis_union_models.py new file mode 100644 index 0000000..cf4d09c --- /dev/null +++ b/benchmarks/review_analysis/analysis_union_models.py @@ -0,0 +1,95 @@ +"""Three-system paragraph overlap on the quality-proxy (conference) papers, where each +system's paragraph set is the UNION over all of its backbone models. + +This is the aggregate-over-models counterpart to analysis_three_systems.py (which pins +one best model per system). coarse unions over all coarse__ runs, OpenAIReview +unions over all progressive__ runs, and Reviewer3 has no model selector so it is +used as-is. Output: plots/venn_union_models.{png,pdf}. +""" + +from collections import defaultdict +from pathlib import Path + +import matplotlib.pyplot as plt + +from utils import ( + COLOR_BLUE, COLOR_RED, COLOR_GREEN, + load, stems, regions_3, draw_venn3, save_fig, +) + + +# Each system: (results folder, method-key prefix to union over). +# Prefix "progressive__" excludes the "progressive_original__" (pre-consolidation) keys. 
def para_set_union(d: dict, prefix: str) -> set:
    """Return the union of ``paragraph_index`` over every method whose key starts with `prefix`.

    Args:
        d: A per-paper result document; its ``"methods"`` entry maps method
            keys to dicts that may carry a ``"comments"`` list.
        prefix: Method-key prefix selecting which methods to pool.

    Returns:
        Set of all non-``None`` ``paragraph_index`` values found in the
        selected methods' comments. Empty when nothing matches.

    A missing or ``null`` ``"methods"`` / ``"comments"`` entry is treated as
    empty rather than raising, so a half-written result file does not abort
    the whole analysis.
    """
    union = set()
    for key, m in (d.get("methods") or {}).items():
        if not key.startswith(prefix):
            continue
        # `or []` also covers an explicit JSON null, which .get's default does not.
        comments = (m or {}).get("comments") or []
        union.update(
            c["paragraph_index"] for c in comments
            if c.get("paragraph_index") is not None
        )
    return union
def plot(names, avg, jaccard_avg):
    """Draw the three-system Venn diagram and save it as ``venn_union_models``.

    Args:
        names: The three system display names, used as the set labels.
        avg: Mapping from region key to average region size.
        jaccard_avg: Accepted for call compatibility; not used in the figure.
    """
    # Order in which the rounded region sizes are handed to draw_venn3.
    region_order = ("only_a", "only_b", "a_b", "only_c", "a_c", "b_c", "all")
    sizes = tuple(round(avg[region], 2) for region in region_order)
    palette = (COLOR_BLUE, COLOR_RED, COLOR_GREEN)

    fig, ax = plt.subplots(figsize=(8, 7), dpi=400)
    draw_venn3(ax, sizes, tuple(names), colors=palette,
               region_fontsize=18, set_fontsize=20)
    plt.tight_layout()

    written = save_fig("venn_union_models", dpi=400)
    listing = ", ".join(str(path) for path in written)
    print(f"\nWrote {listing}")
+""" + +from __future__ import annotations + +import argparse +import json +import re +import sys +import time +from collections import defaultdict +from pathlib import Path +from typing import Any + +import numpy as np +import matplotlib.pyplot as plt + +# Local repo imports +REPO_ROOT = Path(__file__).resolve().parents[2] +sys.path.insert(0, str(REPO_ROOT / "src")) +from reviewer import client as llm_client # noqa: E402 + +from utils import COLOR_BLUE, COLOR_RED, load, para_set, stems, regions_2, draw_venn2, save_fig # noqa: E402 + + +# --------------------------------------------------------------------------- +# Config +# --------------------------------------------------------------------------- + +DEFAULT_RESULTS_DIR = Path("../conference_study/results") +DEFAULT_MANIFEST = Path("../conference_study/manifests/v2_frontier/combined.json") +DEFAULT_CACHE_DIR = Path(".cache") + +SYSTEMS = { + "coarse / DeepSeek": ("coarse_v2", "coarse__deepseek-v4-flash"), + "OpenAIReview / GPT-5.5": ("frontier_subset_progressive","progressive__gpt-5.5"), + "Reviewer 3": ("reviewer3_v2", "reviewer3__reviewer3"), +} + +# Where to write per-paper human results in the same format as the AI systems. +HUMAN_RESULTS_SUBDIR = "human_v1" +HUMAN_METHOD_KEY = "human__openreview" +HUMAN_METHOD_LABEL = "Human (OpenReview)" + +# Review-body fields likely to contain concerns/criticisms. +# Bare summaries (`summary`, `summary_of_the_paper`) are excluded. 
def load_manifest(manifest_path: Path) -> dict[str, str]:
    """Return ``{slug: forum_id}`` read from a manifest JSON file.

    The manifest is either a JSON array of paper objects or a JSON object
    whose ``"papers"`` entry is such an array. Entries that are not objects,
    or that lack a truthy ``slug`` or ``forum_id``, are skipped.

    Args:
        manifest_path: Path to the manifest JSON file.

    Returns:
        Mapping from paper slug to OpenReview forum id.

    Raises:
        ValueError: If the manifest is a JSON object without a ``"papers"``
            key, or is not an array/object at all.
    """
    # JSON is UTF-8 by specification; do not depend on the locale's default.
    m = json.loads(manifest_path.read_text(encoding="utf-8"))
    if isinstance(m, dict):
        if "papers" not in m:
            raise ValueError(f"{manifest_path}: manifest object has no 'papers' key")
        papers = m["papers"]
    else:
        papers = m
    if not isinstance(papers, list):
        raise ValueError(f"{manifest_path}: expected a list of papers, got {type(papers).__name__}")
    return {
        p["slug"]: p["forum_id"]
        for p in papers
        # Guard against stray non-object entries, which have no .get().
        if isinstance(p, dict) and p.get("slug") and p.get("forum_id")
    }
{reviewer_id, text} for Official_Review notes. Cached.""" + cache_file = cache_dir / "openreview_reviews" / f"{forum_id}.json" + if cache_file.exists(): + return json.loads(cache_file.read_text()) + + cache_file.parent.mkdir(parents=True, exist_ok=True) + client = _or_client() + notes = client.get_notes(forum=forum_id) + reviews = [] + for n in notes: + inv = getattr(n, "invitation", "") or "" + if "Official_Review" not in inv: + continue + # Reviewer id from signatures, e.g. ".../AnonReviewer3" + sigs = getattr(n, "signatures", []) or [] + reviewer_id = sigs[0].split("/")[-1] if sigs else f"reviewer_{len(reviews)}" + content = getattr(n, "content", {}) or {} + parts = [] + for k in CRITIQUE_FIELDS: + v = content.get(k) + if isinstance(v, str) and v.strip(): + parts.append(f"## {k}\n{v.strip()}") + if not parts: + continue + reviews.append({ + "reviewer_id": reviewer_id, + "rating": content.get("rating") or content.get("recommendation"), + "confidence": content.get("confidence"), + "text": "\n\n".join(parts), + }) + + cache_file.write_text(json.dumps(reviews, indent=2)) + time.sleep(1.0) # be polite + return reviews + + +# --------------------------------------------------------------------------- +# LLM pass 1: verbatim atomic extraction +# --------------------------------------------------------------------------- + +_EXTRACT_PROMPT = """You are given one peer review of a research paper. Split it into atomic concerns — each one a distinct criticism, question, or weakness the reviewer raises. + +For each concern, copy the reviewer's own sentence(s) verbatim. DO NOT paraphrase, summarize, or rewrite. If a sentence packs multiple concerns, emit multiple items that each quote the same span. If no quote exists for the concern, skip it. Skip pure praise, generic summary statements, and procedural remarks. 
def llm_extract(review_text: str, model: str) -> tuple[list[dict], dict]:
    """Return (list of {title, verbatim, comment_type}, usage).

    Sends one review to the LLM with ``_EXTRACT_PROMPT`` and parses the reply
    as a JSON array. If the reply is not valid JSON as a whole, the first
    ``[...]`` span inside it is tried instead. Items that are not objects, or
    whose ``verbatim`` is missing, empty or not a string, are dropped; titles
    are trimmed to 120 characters.

    Args:
        review_text: Raw text of a single peer review.
        model: Model identifier passed to ``llm_client.chat``.

    Returns:
        The cleaned items and the usage value returned by the client. The item
        list is empty when the reply cannot be parsed as a JSON array.
    """
    messages = [{"role": "user", "content": _EXTRACT_PROMPT.format(review_text=review_text)}]
    raw, usage = llm_client.chat(messages=messages, model=model, temperature=0.0, max_tokens=4096)
    raw = _strip_code_fence(raw)
    try:
        items = json.loads(raw)
    except json.JSONDecodeError:
        # Try to find a JSON array inside
        m = re.search(r"\[.*\]", raw, re.DOTALL)
        if not m:
            print(f"  WARN: extraction returned non-JSON; got {raw[:200]!r}", file=sys.stderr)
            return [], usage
        try:
            items = json.loads(m.group(0))
        except json.JSONDecodeError:
            print(f"  WARN: still not parseable: {raw[:200]!r}", file=sys.stderr)
            return [], usage

    # The reply is model output: valid JSON that is not an array (null, a
    # number, an object) must not reach the loop below, where it would raise
    # TypeError or silently iterate over dict keys.
    if not isinstance(items, list):
        print(f"  WARN: extraction returned {type(items).__name__}, expected a JSON array",
              file=sys.stderr)
        return [], usage

    cleaned = []
    for it in items:
        if not isinstance(it, dict):
            continue
        verbatim = it.get("verbatim")
        # Non-string values (lists, numbers) have no .strip(); treat as missing.
        if not isinstance(verbatim, str) or not verbatim.strip():
            continue
        title = it.get("title")
        cleaned.append({
            "title": title.strip()[:120] if isinstance(title, str) else "",
            "verbatim": verbatim.strip(),
            "comment_type": it.get("comment_type", "logical"),
        })
    return cleaned, usage
def paragraph_embeddings(slug: str, paragraphs: list[dict], cache_dir: Path) -> np.ndarray:
    """Return one embedding row per paragraph, using an on-disk ``.npy`` cache.

    The cache lives at ``<cache_dir>/paragraph_embeddings/<slug>.npy`` and is
    reused only when its row count equals ``len(paragraphs)``; otherwise the
    embeddings are recomputed with ``_embedder()`` and the file is overwritten.

    Args:
        slug: Paper identifier used as the cache file name.
        paragraphs: Paragraph dicts; the ``"text"`` entry of each is embedded
            (missing text is embedded as the empty string).
        cache_dir: Root directory of the cache.

    Returns:
        The embeddings, either loaded from the cache or freshly encoded with
        ``normalize_embeddings=True``.
    """
    npy_path = cache_dir / "paragraph_embeddings" / f"{slug}.npy"

    if npy_path.exists():
        cached = np.load(npy_path)
        # Row-count mismatch means the cache is for a different paragraph list.
        if len(paragraphs) == cached.shape[0]:
            return cached

    npy_path.parent.mkdir(parents=True, exist_ok=True)
    vectors = _embedder().encode(
        [para.get("text", "") for para in paragraphs],
        show_progress_bar=False,
        normalize_embeddings=True,
    )
    np.save(npy_path, vectors)
    return vectors
def llm_ground(concern: str, candidates: list[tuple[int, str]], model: str) -> tuple[int | None, dict]:
    """Ask the LLM which candidate paragraph a reviewer concern refers to.

    Args:
        concern: The reviewer's concern text.
        candidates: ``(paragraph_index, paragraph_text)`` pairs; each text is
            cut to 1200 characters in the prompt.
        model: Model identifier passed to ``llm_client.chat``.

    Returns:
        ``(index, usage)`` where ``index`` is the first integer in the reply if
        it is one of the candidate indices, and ``None`` if the reply says
        "none", contains no integer, or names a non-candidate index.
    """
    blocks = [f"[index {idx}]\n{text[:1200]}" for idx, text in candidates]
    prompt = _GROUND_PROMPT.format(concern=concern, candidates="\n\n".join(blocks))
    raw, usage = llm_client.chat(
        messages=[{"role": "user", "content": prompt}],
        model=model, temperature=0.0, max_tokens=64,
    )
    reply = raw.strip().lower()

    # "none" wins when it opens the reply, or when it appears anywhere and the
    # reply carries no digit at all.
    if reply.startswith("none"):
        return None, usage
    if "none" in reply and re.search(r"\d", reply) is None:
        return None, usage

    number = re.search(r"-?\d+", reply)
    if number is None:
        return None, usage

    picked = int(number.group(0))
    # Validate it's one of the candidates
    allowed = {idx for idx, _ in candidates}
    return (picked if picked in allowed else None), usage
def save_human_results(results_dir: Path, slug: str, title: str,
                       paragraphs: list[dict], grounded_comments: list[dict],
                       reviews: list[dict]) -> Path:
    """Write ``<results_dir>/human_v1/<slug>.json`` in the AI-result schema.

    Only comments with a non-``None`` ``paragraph_index`` are written. For each
    of them ``quote`` holds the text of the referenced paper paragraph (empty
    if the index is out of range) and ``explanation`` holds the reviewer's
    verbatim words; ``reviewer_id`` is kept as an extra field. Comment ids keep
    the comment's position in ``grounded_comments``, so they can have gaps.

    Returns:
        Path of the JSON file that was written.
    """
    target_dir = results_dir / HUMAN_RESULTS_SUBDIR
    target_dir.mkdir(parents=True, exist_ok=True)
    target = target_dir / f"{slug}.json"

    def _paragraph_text(index):
        # Out-of-range indices yield an empty quote instead of an IndexError.
        if 0 <= index < len(paragraphs):
            return paragraphs[index].get("text", "")
        return ""

    emitted = [
        {
            "id": f"{HUMAN_METHOD_KEY}_{position}",
            "title": entry.get("title", ""),
            "quote": _paragraph_text(entry["paragraph_index"]),
            "explanation": entry["verbatim"],
            "comment_type": entry.get("comment_type", "logical"),
            "paragraph_index": entry["paragraph_index"],
            "reviewer_id": entry.get("reviewer_id", ""),
        }
        for position, entry in enumerate(grounded_comments)
        # only emit grounded comments — they're the ones that count for overlap
        if entry.get("paragraph_index") is not None
    ]

    # One section per reviewer, concatenated into the overall feedback field.
    feedback_sections = [
        f"### {r['reviewer_id']} (rating={r.get('rating')}, confidence={r.get('confidence')})\n{r['text']}"
        for r in reviews
    ]

    human_method = {
        "label": HUMAN_METHOD_LABEL,
        "model": "openreview-human-reviewers",
        "overall_feedback": "\n\n".join(feedback_sections),
        "comments": emitted,
        "cost_usd": 0.0,
        "cost_method": "n/a",
        "prompt_tokens": None,
        "completion_tokens": None,
        "n_reviewers": len(reviews),
        "n_comments_total": len(grounded_comments),
        "n_comments_grounded": len(emitted),
    }
    # Standard top-level shell, with paragraphs reused for visualization.
    document = {
        "slug": slug,
        "title": title,
        "paragraphs": paragraphs,
        "methods": {HUMAN_METHOD_KEY: human_method},
    }
    target.write_text(json.dumps(document, indent=2))
    return target
def plot_venn(avg: dict, jaccard_avg: float, n_papers: int, base_name: str) -> None:
    """Draw the Human-vs-AI 2-way Venn from per-paper averages and save it.

    Args:
        avg: per-paper average region sizes; the keys ``"h_only"``, ``"a_only"``
            and ``"both"`` are read and rounded to two decimals for display.
        jaccard_avg: accepted for call compatibility; not drawn on the figure.
        n_papers: accepted for call compatibility; not drawn on the figure.
        base_name: file base name (no extension) handed to ``save_fig``.

    The figure is closed once it has been written so that repeated calls do not
    accumulate open matplotlib figures (matching how the appendix plotters in
    this directory close theirs).
    """
    sizes = (round(avg["h_only"], 2), round(avg["a_only"], 2), round(avg["both"], 2))
    fig, ax = plt.subplots(figsize=(7, 6), dpi=300)
    try:
        draw_venn2(
            ax, sizes,
            set_labels=("Human", "AI"),
            colors=(COLOR_RED, COLOR_BLUE),
            alpha=0.15, region_fontsize=32, set_fontsize=30,
        )
        plt.tight_layout()
        # save_fig writes the *current* figure, so it must run before close().
        paths = save_fig(base_name, dpi=300)
    finally:
        plt.close(fig)
    print(f"\nWrote {', '.join(str(p) for p in paths)}")
def main() -> None:
    """CLI entry point: compute human-vs-AI paragraph overlap and plot it.

    Parses the command line, intersects the manifest with the papers that have
    AI results, runs ``compute_overlap`` on the candidates, prints a summary
    table, writes the full summary (including per-paper rows) to ``--rows-out``
    as JSON, and finally draws the Venn figure named by ``--out``.

    When no paper could be analysed the averages dict is empty; the table then
    shows zeros and the plot is skipped instead of failing with a KeyError.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--results-dir", type=Path, default=DEFAULT_RESULTS_DIR)
    ap.add_argument("--manifest", type=Path, default=DEFAULT_MANIFEST)
    ap.add_argument("--cache-dir", type=Path, default=DEFAULT_CACHE_DIR)
    ap.add_argument("--model", type=str, default="google/gemini-3-flash-preview")
    ap.add_argument("--limit", type=int, default=0, help="0 = all")
    ap.add_argument("--out", type=str, default="venn_human_vs_ai",
                    help="Base name (no extension) for the venn figure under plots/")
    ap.add_argument("--rows-out", type=Path, default=Path("per_paper_human_vs_ai.json"))
    args = ap.parse_args()

    slug_to_forum = load_manifest(args.manifest)
    ai_intersection = ai_stems(args.results_dir)
    candidates = sorted(set(slug_to_forum) & ai_intersection)
    print(f"Manifest papers: {len(slug_to_forum)}")
    print(f"AI 3-way intersection: {len(ai_intersection)}")
    print(f"Candidates (manifest ∩ AI): {len(candidates)}")
    if args.limit > 0:
        print(f"--limit {args.limit} → using first {min(args.limit, len(candidates))} candidates")
    print("Note: scaleup has ~209 PDFs but the bottleneck is frontier_subset_progressive (74).\n")

    summary = compute_overlap(candidates, args.results_dir, slug_to_forum,
                              args.model, args.cache_dir, args.limit)

    n = summary["n_papers"]
    a = summary["avg"]
    print(f"\n{'='*60}")
    print(f"Papers analyzed: {n}")
    print(f"Skipped (no reviews): {summary['skipped_no_reviews']}")
    print(f"Total human comments: {summary['total_human_comments']}")
    print(f" ungrounded (dropped): {summary['total_ungrounded']} "
          f"({summary['total_ungrounded']/max(summary['total_human_comments'],1):.1%})")
    print(f"LLM cost this run: ${summary['cumulative_cost_usd']:.4f}")
    print()
    print(f"{'Region':<32} {'Avg/paper':>10}")
    print("-" * 44)
    # The row labels live in plain strings: a backslash inside an f-string
    # replacement field is a SyntaxError on Python < 3.12. `.get` covers the
    # zero-paper case, where `avg` has no region keys at all.
    region_rows = (
        ("Human only (H \\ A)", a.get("h_only", 0.0)),
        ("AI only (A \\ H)", a.get("a_only", 0.0)),
        ("Both (H ∩ A)", a.get("both", 0.0)),
    )
    for label, value in region_rows:
        print(f"{label:<32} {value:>10.2f}")
    print(f"{'Jaccard (avg)':<32} {summary['jaccard_avg']:>10.3f}")

    args.rows_out.write_text(json.dumps(summary, indent=2))
    print(f"\nPer-paper rows → {args.rows_out}")

    if n == 0:
        # Nothing to draw: every region average is undefined.
        print("No papers analyzed; skipping venn plot.")
        return
    plot_venn(a, summary["jaccard_avg"], n, args.out)
def main_three_systems() -> None:
    """Cluster the comments of the three AI systems on their shared papers.

    Takes the papers present in every folder listed in ``SYSTEMS``, collects
    each system's comments via ``load_comments`` (labelled with the system
    name), runs ``cluster_and_report`` and writes the returned cluster
    summaries to ``cluster_three_systems.json`` in the working directory.
    """
    print("=" * 90)
    print("3-WAY CLUSTERING: coarse vs OpenAIReview vs Reviewer 3")
    print("=" * 90)
    # Sort the shared slugs: iterating a raw set of strings follows the
    # per-process hash seed, which would reorder the texts (and so change the
    # KMeans result and the written JSON) from one run to the next.
    slugs = sorted(set.intersection(*(stems(folder) for _, folder, _ in SYSTEMS)))
    print(f"Papers in all 3 systems: {len(slugs)}")

    texts, labels = [], []
    for name, folder, mk in SYSTEMS:
        cs = load_comments(folder, mk, slugs)
        texts.extend(cs)
        labels.extend([name] * len(cs))

    info = cluster_and_report(texts, labels, [n for n, _, _ in SYSTEMS])
    Path("cluster_three_systems.json").write_text(json.dumps(info, indent=2))
+main-paper style. + +The raw result JSONs are not always co-located with this script, so this driver +plots directly from the per-model region averages already reported in the EMNLP +appendix tables (tab:overlap_cp and tab:overlap_all). The styling mirrors the +single-panel main-paper venns (analysis_three_systems.py / analysis_gpt_claude.py): +clean per-panel model-name titles, consistent palette and edge styling, larger +fonts, and no in-figure Jaccard text (Jaccard lives in the table columns). + +Run with an env that has matplotlib + matplotlib_venn: + python regen_appendix_venns.py +Outputs are written straight into the paper's plots/ directory. +""" + +import sys +from pathlib import Path + +import matplotlib +matplotlib.use("Agg") +import matplotlib.pyplot as plt + +sys.path.insert(0, str(Path(__file__).resolve().parent)) +from utils import COLOR_BLUE, COLOR_RED, COLOR_GREEN, draw_venn2, draw_venn3 + +# Write into the EMNLP paper's plots dir. +PAPER_PLOTS = Path("/data/dangnguyen/openaireview_project/openaireview_emnlp/plots") + +MODEL_ORDER = ["DeepSeek-V4-Flash", "Qwen3.6-35B-A3B", "Gemini-3.1-Flash-Lite", "GLM-4.7-Flash"] + +# tab:overlap_cp -> (coarse_only, openaireview_only, both) +CP = { + "DeepSeek-V4-Flash": (10.54, 5.34, 2.41), + "Qwen3.6-35B-A3B": (8.87, 5.66, 2.88), + "Gemini-3.1-Flash-Lite": (4.19, 5.15, 1.05), + "GLM-4.7-Flash": (2.69, 4.87, 0.38), +} + +# tab:overlap_all -> region averages keyed C=coarse, O=openaireview, Z=zero-shot +# columns: C only, O only, Z only, C&O only, C&Z only, O&Z only, C&O&Z +ALL = { + "DeepSeek-V4-Flash": dict(c=9.85, o=4.61, z=1.30, co=1.83, cz=0.70, oz=0.73, all=0.58), + "Qwen3.6-35B-A3B": dict(c=8.38, o=4.93, z=1.10, co=2.36, cz=0.49, oz=0.73, all=0.52), + "Gemini-3.1-Flash-Lite": dict(c=4.01, o=4.47, z=1.06, co=0.84, cz=0.18, oz=0.69, all=0.21), + "GLM-4.7-Flash": dict(c=2.66, o=4.63, z=0.72, co=0.36, cz=0.03, oz=0.23, all=0.02), +} + + +def save(base): + PAPER_PLOTS.mkdir(parents=True, exist_ok=True) + for fmt 
def plot_all():
    """Render the 2x2 grid of three-way Venns (one panel per model) as venn_all.

    Each panel shows the coarse / OpenAIReview / zero-shot region averages for
    one entry of ``MODEL_ORDER``, taken from the ``ALL`` table.
    """
    # Keys of an ALL[model] entry arranged in the order draw_venn3 expects:
    # (Abc, aBc, ABc, abC, AbC, aBC, ABC).
    region_keys = ("c", "o", "co", "z", "cz", "oz", "all")

    fig, grid = plt.subplots(2, 2, figsize=(12, 11), dpi=400)
    for panel, model_name in zip(grid.flatten(), MODEL_ORDER):
        averages = ALL[model_name]
        draw_venn3(
            panel,
            tuple(averages[key] for key in region_keys),
            set_labels=("coarse", "OpenAIReview", "zero-shot"),
            colors=(COLOR_BLUE, COLOR_RED, COLOR_GREEN),
            region_fontsize=17,
            set_fontsize=18,
        )
        panel.set_title(model_name, fontsize=21, pad=12)

    plt.tight_layout()
    plt.subplots_adjust(hspace=0.16, wspace=0.06)
    save("venn_all")
    plt.close(fig)
def regions_2(a: set, b: set) -> dict:
    """Count the regions of a two-set Venn diagram.

    Returns a dict with ``only_a``, ``only_b`` and ``both`` (element counts),
    ``total`` (size of the union) and ``jaccard`` (``both / total``, or 0.0
    when both sets are empty).
    """
    counts = {
        "only_a": len(a - b),
        "only_b": len(b - a),
        "both": len(a & b),
    }
    union_size = sum(counts.values())
    if union_size:
        similarity = counts["both"] / union_size
    else:
        similarity = 0.0
    return {**counts, "total": union_size, "jaccard": similarity}
def style_venn2(v, circles, colors, *, region_fontsize=30, set_fontsize=18):
    """Give a venn2 diagram the shared look: coloured outlines, black text.

    ``circles`` are paired with ``colors`` to set each outline's edge colour
    (line width 2.0). Region labels "10", "01" and "11" that exist get
    ``region_fontsize``, black text and centred alignment. Set labels get
    ``set_fontsize`` and black text, and are moved above the circles.
    """
    for outline, edge in zip(circles, colors):
        outline.set_edgecolor(edge)
        outline.set_linewidth(2.0)

    region_texts = (v.get_label_by_id(region) for region in ("10", "01", "11"))
    for region_text in region_texts:
        if not region_text:
            continue
        region_text.set_fontsize(region_fontsize)
        region_text.set_color("black")
        region_text.set_ha("center")

    for name_text in v.set_labels:
        if not name_text:
            continue
        name_text.set_fontsize(set_fontsize)
        name_text.set_color("black")
        # matplotlib_venn can place venn2 set labels below the circles; mirror
        # the y coordinate so the names sit on top, as in the venn3 layout.
        x_pos, y_pos = name_text.get_position()
        name_text.set_position((x_pos, abs(y_pos)))
        name_text.set_va("bottom")
def draw_venn3(ax, sizes, set_labels, colors=(COLOR_BLUE, COLOR_RED, COLOR_GREEN), *,
               alpha=0.15, region_fontsize=26, set_fontsize=16):
    """Draw a three-set Venn on `ax` and apply the shared styling.

    `sizes` lists the seven region sizes in matplotlib_venn order
    (Abc, aBc, ABc, abC, AbC, aBC, ABC). Returns the pair
    ``(venn diagram, circle outlines)``.
    """
    diagram = venn3(
        subsets=sizes,
        set_labels=set_labels,
        ax=ax,
        set_colors=colors,
        alpha=alpha,
    )
    outlines = venn3_circles(subsets=sizes, ax=ax, linewidth=2.0)
    font_sizes = {"region_fontsize": region_fontsize, "set_fontsize": set_fontsize}
    style_venn3(diagram, outlines, colors, **font_sizes)
    return diagram, outlines