diff --git a/experiments/bot_detection/data/open_pr_activity.parquet b/experiments/bot_detection/data/open_pr_activity.parquet new file mode 100644 index 0000000..5545276 Binary files /dev/null and b/experiments/bot_detection/data/open_pr_activity.parquet differ diff --git a/experiments/bot_detection/data/results/pocket_veto_analysis.json b/experiments/bot_detection/data/results/pocket_veto_analysis.json new file mode 100644 index 0000000..4138096 --- /dev/null +++ b/experiments/bot_detection/data/results/pocket_veto_analysis.json @@ -0,0 +1,447 @@ +{ + "universal_threshold_days": 90, + "characterization": { + "state_totals": { + "CLOSED": 34637, + "MERGED": 146033, + "OPEN": 19502 + }, + "outcome_totals": { + "merged": 146033, + "pocket_veto": 30894, + "rejected": 23245 + }, + "state_outcome_crosstab": [ + { + "state": "CLOSED", + "outcome": "pocket_veto", + "n": 11392 + }, + { + "state": "CLOSED", + "outcome": "rejected", + "n": 23245 + }, + { + "state": "MERGED", + "outcome": "merged", + "n": 146033 + }, + { + "state": "OPEN", + "outcome": "pocket_veto", + "n": 19502 + } + ], + "stale_threshold_distribution": [ + { + "threshold_days": 30.0, + "n": 179748 + }, + { + "threshold_days": 51.44804763793945, + "n": 7325 + }, + { + "threshold_days": 62.094017028808594, + "n": 4728 + }, + { + "threshold_days": 32.073368072509766, + "n": 4034 + }, + { + "threshold_days": 30.195825576782227, + "n": 1035 + }, + { + "threshold_days": 73.39346313476562, + "n": 790 + }, + { + "threshold_days": 30.832239151000977, + "n": 426 + }, + { + "threshold_days": 40.00093460083008, + "n": 398 + }, + { + "threshold_days": 33.224021911621094, + "n": 296 + }, + { + "threshold_days": 104.15167236328125, + "n": 197 + }, + { + "threshold_days": 34.333587646484375, + "n": 176 + }, + { + "threshold_days": 30.21098518371582, + "n": 150 + }, + { + "threshold_days": 33.63052749633789, + "n": 150 + }, + { + "threshold_days": 41.0978889465332, + "n": 137 + }, + { + "threshold_days": 
30.72226905822754, + "n": 130 + }, + { + "threshold_days": 39.4652099609375, + "n": 127 + }, + { + "threshold_days": 58.54113006591797, + "n": 122 + }, + { + "threshold_days": 92.92449951171875, + "n": 118 + }, + { + "threshold_days": 64.95616912841797, + "n": 85 + } + ], + "repos_total": 96, + "repos_using_default_30d": 78, + "repos_calibrated": 18, + "per_repo_calibration_check": { + "mean_delta_vs_2x_median_ttc": 31.311005377063044, + "median_delta_vs_2x_median_ttc": 28.69724537037037, + "n_repos_with_closed_prs": 96 + }, + "open_pr_age_quantiles_days": { + "p10": 72.75832986111112, + "p25": 138.96756076388888, + "p50": 245.59493055555555, + "p75": 481.70465856481485, + "p90": 923.2952430555556, + "p95": 1096.9740763888894 + } + }, + "distributions": { + "merge_rate_v3": { + "mean": 0.5567027952888478, + "median": 0.75, + "p10": 0.0, + "p25": 0.0, + "p75": 1.0, + "p90": 1.0 + }, + "merge_rate_universal": { + "mean": 0.5368757212868577, + "median": 0.6666666666666666, + "p10": 0.0, + "p25": 0.0, + "p75": 1.0, + "p90": 1.0 + }, + "merge_rate_per_repo": { + "mean": 0.5332918174688098, + "median": 0.6182486417385746, + "p10": 0.0, + "p25": 0.0, + "p75": 1.0, + "p90": 1.0 + }, + "merge_rate_idle_universal": { + "mean": 0.5514260290930824, + "median": 0.7, + "p10": 0.0, + "p25": 0.0, + "p75": 1.0, + "p90": 1.0 + }, + "merge_rate_idle_per_repo": { + "mean": 0.5505710864785152, + "median": 0.6729415904292751, + "p10": 0.0, + "p25": 0.0, + "p75": 1.0, + "p90": 1.0 + }, + "merge_rate_universal_30d": { + "mean": 0.5332379062475792, + "median": 0.6153846153846154, + "p10": 0.0, + "p25": 0.0, + "p75": 1.0, + "p90": 1.0 + }, + "merge_rate_universal_60d": { + "mean": 0.5355565622443484, + "median": 0.6363636363636364, + "p10": 0.0, + "p25": 0.0, + "p75": 1.0, + "p90": 1.0 + }, + "merge_rate_universal_90d": { + "mean": 0.5368757212868577, + "median": 0.6666666666666666, + "p10": 0.0, + "p25": 0.0, + "p75": 1.0, + "p90": 1.0 + }, + "merge_rate_universal_180d": { + "mean": 
0.5409542251449829, + "median": 0.6666666666666666, + "p10": 0.0, + "p25": 0.0, + "p75": 1.0, + "p90": 1.0 + } + }, + "shift_analysis": { + "n_authors": 31296, + "merge_rate_universal": { + "mean_delta": -0.019827074001990234, + "median_delta": 0.0, + "n_dropped_gt_0.05": 2370, + "n_dropped_gt_0.10": 1836, + "n_dropped_gt_0.25": 895, + "n_unchanged": 28493 + }, + "merge_rate_per_repo": { + "mean_delta": -0.023410977820038193, + "median_delta": 0.0, + "n_dropped_gt_0.05": 2663, + "n_dropped_gt_0.10": 2111, + "n_dropped_gt_0.25": 1094, + "n_unchanged": 28179 + }, + "merge_rate_idle_universal": { + "mean_delta": -0.005276766195765481, + "median_delta": 0.0, + "n_dropped_gt_0.05": 607, + "n_dropped_gt_0.10": 459, + "n_dropped_gt_0.25": 241, + "n_unchanged": 30434 + }, + "merge_rate_idle_per_repo": { + "mean_delta": -0.006131708810332761, + "median_delta": 0.0, + "n_dropped_gt_0.05": 705, + "n_dropped_gt_0.10": 529, + "n_dropped_gt_0.25": 279, + "n_unchanged": 30305 + } + }, + "signal_evaluation": { + "n_labeled": 31293, + "n_suspended": 739, + "n_active": 30554, + "merge_rate_v3": { + "cv_auc": 0.5493659354900655, + "fold_auc_std": 0.01149119403429272, + "mean_merge_rate_suspended": 0.5136005337296283, + "mean_merge_rate_active": 0.557734499146874, + "cohens_d_active_vs_suspended": 0.09623777997840265, + "n_suspended": 739, + "n_active": 30554 + }, + "merge_rate_universal": { + "cv_auc": 0.548848295654899, + "fold_auc_std": 0.011339429192468128, + "mean_merge_rate_suspended": 0.5011117987116154, + "mean_merge_rate_active": 0.5377443527572696, + "cohens_d_active_vs_suspended": 0.08114560040346083, + "n_suspended": 739, + "n_active": 30554 + }, + "merge_rate_per_repo": { + "cv_auc": 0.5486495526055911, + "fold_auc_std": 0.011542410042848126, + "mean_merge_rate_suspended": 0.5010423916595617, + "mean_merge_rate_active": 0.534091457487316, + "cohens_d_active_vs_suspended": 0.0734021937818801, + "n_suspended": 739, + "n_active": 30554 + }, + "merge_rate_idle_universal": { + 
"cv_auc": 0.5488917644689147, + "fold_auc_std": 0.011813844138263785, + "mean_merge_rate_suspended": 0.5128096256371848, + "mean_merge_rate_active": 0.552348716801441, + "cohens_d_active_vs_suspended": 0.08659711664734189, + "n_suspended": 739, + "n_active": 30554 + }, + "merge_rate_idle_per_repo": { + "cv_auc": 0.5487741351566113, + "fold_auc_std": 0.011853501921961123, + "mean_merge_rate_suspended": 0.5128096256371848, + "mean_merge_rate_active": 0.5514730120143265, + "cohens_d_active_vs_suspended": 0.08473831668514915, + "n_suspended": 739, + "n_active": 30554 + } + }, + "example_authors": [ + { + "login": "wuhang2014", + "account_status": "active", + "total_prs": 9, + "merged": 1.0, + "closed": 0.0, + "open_total": 8.0, + "open_stale_per_repo": 8.0, + "open_stale_universal": 8.0, + "open_stale_idle_universal": 0.0, + "open_stale_idle_per_repo": 0.0, + "merge_rate_v3": 1.0, + "merge_rate_universal": 0.1111111111111111, + "merge_rate_per_repo": 0.1111111111111111, + "merge_rate_idle_universal": 1.0, + "merge_rate_idle_per_repo": 1.0 + }, + { + "login": "simondanielsson", + "account_status": "active", + "total_prs": 7, + "merged": 1.0, + "closed": 0.0, + "open_total": 6.0, + "open_stale_per_repo": 6.0, + "open_stale_universal": 6.0, + "open_stale_idle_universal": 1.0, + "open_stale_idle_per_repo": 1.0, + "merge_rate_v3": 1.0, + "merge_rate_universal": 0.14285714285714285, + "merge_rate_per_repo": 0.14285714285714285, + "merge_rate_idle_universal": 0.5, + "merge_rate_idle_per_repo": 0.5 + }, + { + "login": "sahelib25", + "account_status": "active", + "total_prs": 6, + "merged": 1.0, + "closed": 0.0, + "open_total": 5.0, + "open_stale_per_repo": 5.0, + "open_stale_universal": 5.0, + "open_stale_idle_universal": 0.0, + "open_stale_idle_per_repo": 0.0, + "merge_rate_v3": 1.0, + "merge_rate_universal": 0.16666666666666666, + "merge_rate_per_repo": 0.16666666666666666, + "merge_rate_idle_universal": 1.0, + "merge_rate_idle_per_repo": 1.0 + }, + { + "login": "Copilot", + 
"account_status": "suspended", + "total_prs": 438, + "merged": 177.0, + "closed": 38.0, + "open_total": 223.0, + "open_stale_per_repo": 223.0, + "open_stale_universal": 204.0, + "open_stale_idle_universal": 4.0, + "open_stale_idle_per_repo": 4.0, + "merge_rate_v3": 0.8232558139534883, + "merge_rate_universal": 0.4224343675417661, + "merge_rate_per_repo": 0.4041095890410959, + "merge_rate_idle_universal": 0.8082191780821918, + "merge_rate_idle_per_repo": 0.8082191780821918 + }, + { + "login": "iycheng", + "account_status": "suspended", + "total_prs": 423, + "merged": 295.0, + "closed": 110.0, + "open_total": 18.0, + "open_stale_per_repo": 18.0, + "open_stale_universal": 18.0, + "open_stale_idle_universal": 0.0, + "open_stale_idle_per_repo": 0.0, + "merge_rate_v3": 0.7283950617283951, + "merge_rate_universal": 0.6973995271867612, + "merge_rate_per_repo": 0.6973995271867612, + "merge_rate_idle_universal": 0.7283950617283951, + "merge_rate_idle_per_repo": 0.7283950617283951 + }, + { + "login": "amd-jmacaran", + "account_status": "suspended", + "total_prs": 105, + "merged": 105.0, + "closed": 0.0, + "open_total": 0.0, + "open_stale_per_repo": 0.0, + "open_stale_universal": 0.0, + "open_stale_idle_universal": 0.0, + "open_stale_idle_per_repo": 0.0, + "merge_rate_v3": 1.0, + "merge_rate_universal": 1.0, + "merge_rate_per_repo": 1.0, + "merge_rate_idle_universal": 1.0, + "merge_rate_idle_per_repo": 1.0 + }, + { + "login": "harupy", + "account_status": "active", + "total_prs": 2771, + "merged": 2151.0, + "closed": 327.0, + "open_total": 293.0, + "open_stale_per_repo": 293.0, + "open_stale_universal": 281.0, + "open_stale_idle_universal": 0.0, + "open_stale_idle_per_repo": 0.0, + "merge_rate_v3": 0.8680387409200968, + "merge_rate_universal": 0.7796303008336354, + "merge_rate_per_repo": 0.776254059906171, + "merge_rate_idle_universal": 0.8680387409200968, + "merge_rate_idle_per_repo": 0.8680387409200968 + }, + { + "login": "baskaryan", + "account_status": "active", + 
"total_prs": 1494, + "merged": 1295.0, + "closed": 153.0, + "open_total": 46.0, + "open_stale_per_repo": 46.0, + "open_stale_universal": 46.0, + "open_stale_idle_universal": 0.0, + "open_stale_idle_per_repo": 0.0, + "merge_rate_v3": 0.8943370165745856, + "merge_rate_universal": 0.8668005354752343, + "merge_rate_per_repo": 0.8668005354752343, + "merge_rate_idle_universal": 0.8943370165745856, + "merge_rate_idle_per_repo": 0.8943370165745856 + } + ], + "recommendation": { + "decision": "Keep v3 as-is", + "rationale": "No variant beats v3 CV AUC 0.5494 by >0.005 (aucs={'merge_rate_v3': 0.5494, 'merge_rate_universal': 0.5488, 'merge_rate_per_repo': 0.5486, 'merge_rate_idle_universal': 0.5489, 'merge_rate_idle_per_repo': 0.5488}). Cohen's d also fails to improve (base=0.096, best_alt=0.087).", + "cv_aucs": { + "merge_rate_v3": 0.5493659354900655, + "merge_rate_universal": 0.548848295654899, + "merge_rate_per_repo": 0.5486495526055911, + "merge_rate_idle_universal": 0.5488917644689147, + "merge_rate_idle_per_repo": 0.5487741351566113 + }, + "cohens_d": { + "merge_rate_v3": 0.09623777997840265, + "merge_rate_universal": 0.08114560040346083, + "merge_rate_per_repo": 0.0734021937818801, + "merge_rate_idle_universal": 0.08659711664734189, + "merge_rate_idle_per_repo": 0.08473831668514915 + }, + "universal_threshold_days": 90 + } +} \ No newline at end of file diff --git a/experiments/bot_detection/pocket_veto_findings.md b/experiments/bot_detection/pocket_veto_findings.md new file mode 100644 index 0000000..813af4a --- /dev/null +++ b/experiments/bot_detection/pocket_veto_findings.md @@ -0,0 +1,86 @@ +# Pocket Veto Investigation — Findings + +Investigation for issue #51. Does counting stale open PRs as implicit +rejections meaningfully change merge-rate distributions and improve the +signal's ability to separate suspended from active accounts? 
+ +## Dataset + +- 200172 PRs across 96 repos +- State totals: {'CLOSED': 34637, 'MERGED': 146033, 'OPEN': 19502} +- Outcome totals: {'merged': 146033, 'pocket_veto': 30894, 'rejected': 23245} +- Labeled authors: 31293 (739 suspended, 30554 active) + +## Staleness definitions compared + +- **v3 (baseline)**: `merged / (merged + closed)` — current scorer.py. +- **age_universal**: open PR is stale if age > 90d since `created_at`. +- **age_per_repo**: open PR is stale if age > that repo's + `stale_threshold_days` (populated in the DuckDB; default 30d). +- **idle_universal**: open PR is stale if it is still open AND idle > 90d (`fetch_now - updated_at`). +- **idle_per_repo**: same, with the per-repo threshold substituted. + +The `idle_*` variants use a live re-fetch of every DB-OPEN PR's +`updatedAt` (see `fetch_open_pr_activity.py`). PRs that were OPEN at +the snapshot but have since been closed or merged are treated as +non-stale — the close/merge event itself is activity. + +## Calibration sanity check + +- Repos using the default 30d threshold: 78 / 96 +- Repos with a calibrated threshold: 18 +- Per-repo calibrated thresholds vs 2x median time-to-close: + mean delta = 31.3110, median delta = 28.6972 (days). 
+ +## Distribution shift + +Mean merge rate across all authors: + +| Definition | mean | median | p10 | p90 | +|---|---|---|---|---| +| v3 baseline | 0.5567 | 0.7500 | 0.0000 | 1.0000 | +| age_universal (90d) | 0.5369 | 0.6667 | 0.0000 | 1.0000 | +| age_per_repo | 0.5333 | 0.6182 | 0.0000 | 1.0000 | +| idle_universal (90d) | 0.5514 | 0.7000 | 0.0000 | 1.0000 | +| idle_per_repo | 0.5506 | 0.6729 | 0.0000 | 1.0000 | + +Per-author drop from the v3 baseline (n authors, >0.10 / >0.25): + +- **age_universal**: 1836 / 895 +- **age_per_repo**: 2111 / 1094 +- **idle_universal**: 459 / 241 +- **idle_per_repo**: 529 / 279 + +## Signal quality vs ground truth + +2-feature logistic regression (merge_rate + log1p(median_additions)), +5-fold CV on 31293 labeled authors: + +| Definition | CV AUC | Active mean | Suspended mean | Cohen's d | +|---|---|---|---|---| +| v3 baseline | 0.5494 | 0.5577 | 0.5136 | 0.0962 | +| age_universal | 0.5488 | 0.5377 | 0.5011 | 0.0811 | +| age_per_repo | 0.5486 | 0.5341 | 0.5010 | 0.0734 | +| idle_universal | 0.5489 | 0.5523 | 0.5128 | 0.0866 | +| idle_per_repo | 0.5488 | 0.5515 | 0.5128 | 0.0847 | + +## Recommendation + +See the `recommendation` field in `data/results/pocket_veto_analysis.json` for the machine-readable +decision logic. Text summary and follow-up branch sketch below. + +**Keep v3 as-is** — No variant beats v3 CV AUC 0.5494 by >0.005 (aucs={'merge_rate_v3': 0.5494, 'merge_rate_universal': 0.5488, 'merge_rate_per_repo': 0.5486, 'merge_rate_idle_universal': 0.5489, 'merge_rate_idle_per_repo': 0.5488}). Cohen's d also fails to improve (base=0.096, best_alt=0.087). + +### Follow-up branch sketch (if adopted) + +- `src/good_egg/github_client.py`: extend `_COMBINED_QUERY` with an + `openPullRequests` selection that pulls `createdAt`/`updatedAt` for + each OPEN PR on the scored user (or `totalCount` if we can push the + staleness filter into the query). 
+- `src/good_egg/models.py`: add `open_stale_pr_count: int` (or similar) + to `UserContributionData`. +- `src/good_egg/scorer.py:256-261`: change the `_score_v3` merge-rate + formula to `merged / (merged + closed + open_stale)`. +- `src/good_egg/config.py`: add the staleness threshold as a tunable + config value. +- Tests: parallel coverage in `tests/test_scorer.py`. diff --git a/experiments/bot_detection/scripts/fetch_open_pr_activity.py b/experiments/bot_detection/scripts/fetch_open_pr_activity.py new file mode 100644 index 0000000..2fa964f --- /dev/null +++ b/experiments/bot_detection/scripts/fetch_open_pr_activity.py @@ -0,0 +1,182 @@ +"""Fetch updatedAt for every OPEN PR in the bot_detection DuckDB. + +Used by pocket_veto_analysis.py to compute idle-time-based staleness (a +better proxy than age-since-created, which the DuckDB schema forces). + +For each repo that has OPEN PRs in the DB, paginate +repository.pullRequests(states: OPEN) and collect (number, updatedAt). PRs +that were OPEN in the DB snapshot but have since been closed or merged are +by definition non-stale (the close/merge event itself is activity), so we +don't need to look them up — they just won't appear in the fetched set and +the analysis treats them as non-stale. 
+ +Output: experiments/bot_detection/data/open_pr_activity.parquet + columns: repo, number, updated_at, fetch_now +""" + +from __future__ import annotations + +import os +import subprocess +import sys +import time +from datetime import UTC, datetime +from pathlib import Path + +import duckdb +import httpx +import pandas as pd + +BASE = Path(__file__).resolve().parents[1] +DB_PATH = BASE / "data" / "bot_detection.duckdb" +OUT_PATH = BASE / "data" / "open_pr_activity.parquet" + +GRAPHQL_URL = "https://api.github.com/graphql" +PAGE_SIZE = 100 + +QUERY = """ +query($owner: String!, $name: String!, $cursor: String) { + repository(owner: $owner, name: $name) { + pullRequests(states: OPEN, first: 100, after: $cursor, + orderBy: {field: CREATED_AT, direction: ASC}) { + pageInfo { hasNextPage endCursor } + nodes { number updatedAt } + } + } + rateLimit { remaining resetAt } +} +""" + + +def get_token() -> str: + token = os.environ.get("GITHUB_TOKEN") or os.environ.get("GH_TOKEN") + if token: + return token + result = subprocess.run( + ["gh", "auth", "token"], check=True, capture_output=True, text=True, + ) + return result.stdout.strip() + + +def fetch_repo( + client: httpx.Client, owner: str, name: str, +) -> list[tuple[int, str]]: + results: list[tuple[int, str]] = [] + cursor: str | None = None + while True: + resp = client.post( + GRAPHQL_URL, + json={ + "query": QUERY, + "variables": {"owner": owner, "name": name, "cursor": cursor}, + }, + ) + resp.raise_for_status() + payload = resp.json() + if "errors" in payload: + print(f" GraphQL errors: {payload['errors']}", file=sys.stderr) + break + repo_data = payload["data"]["repository"] + if repo_data is None: + print(f" repo not found: {owner}/{name}", file=sys.stderr) + break + prs = repo_data["pullRequests"] + for node in prs["nodes"]: + results.append((node["number"], node["updatedAt"])) + remaining = payload["data"]["rateLimit"]["remaining"] + if remaining < 100: + reset_at = payload["data"]["rateLimit"]["resetAt"] + 
print( + f" rate limit low ({remaining}), sleeping until {reset_at}", + file=sys.stderr, + ) + time.sleep(60) + if not prs["pageInfo"]["hasNextPage"]: + break + cursor = prs["pageInfo"]["endCursor"] + return results + + +def main() -> None: + con = duckdb.connect(str(DB_PATH), read_only=True) + repo_counts = con.execute(""" + SELECT repo, COUNT(*) AS n + FROM prs WHERE state='OPEN' + GROUP BY repo ORDER BY n DESC + """).fetchall() + con.close() + + db_open_keys: dict[str, set[int]] = {} + con = duckdb.connect(str(DB_PATH), read_only=True) + for (repo,) in con.execute( + "SELECT DISTINCT repo FROM prs WHERE state='OPEN'" + ).fetchall(): + numbers = con.execute( + "SELECT number FROM prs WHERE state='OPEN' AND repo=?", [repo] + ).fetchall() + db_open_keys[repo] = {n[0] for n in numbers} + con.close() + + token = get_token() + headers = { + "Authorization": f"bearer {token}", + "Accept": "application/vnd.github+json", + } + fetch_now = datetime.now(UTC).isoformat() + rows: list[dict[str, object]] = [] + + with httpx.Client(headers=headers, timeout=60.0) as client: + for idx, (repo, n_db_open) in enumerate(repo_counts, start=1): + owner, name = repo.split("/", 1) + print( + f"[{idx}/{len(repo_counts)}] {repo} " + f"(db_open={n_db_open})...", + flush=True, + ) + try: + fetched = fetch_repo(client, owner, name) + except httpx.HTTPStatusError as exc: + print( + f" HTTP error for {repo}: {exc}", file=sys.stderr, + ) + continue + relevant = [ + (num, ts) for (num, ts) in fetched + if num in db_open_keys.get(repo, set()) + ] + print( + f" fetched {len(fetched)} currently-open, " + f"{len(relevant)} match db-open set", + flush=True, + ) + for num, ts in relevant: + rows.append( + { + "repo": repo, + "number": num, + "updated_at": ts, + "fetch_now": fetch_now, + } + ) + + df = pd.DataFrame(rows) + df["updated_at"] = pd.to_datetime(df["updated_at"]) + df["fetch_now"] = pd.to_datetime(df["fetch_now"]) + OUT_PATH.parent.mkdir(parents=True, exist_ok=True) + df.to_parquet(OUT_PATH, 
index=False) + + total_db_open = sum(len(v) for v in db_open_keys.values()) + print() + print(f"Wrote {len(df)} rows to {OUT_PATH}") + print( + f"Coverage: {len(df)} / {total_db_open} db-OPEN PRs still currently " + f"open ({100 * len(df) / total_db_open:.1f}%)" + ) + print( + f"Missing: {total_db_open - len(df)} PRs (closed/merged since snapshot" + f" — treated as non-stale)" + ) + + +if __name__ == "__main__": + main() diff --git a/experiments/bot_detection/scripts/pocket_veto_analysis.py b/experiments/bot_detection/scripts/pocket_veto_analysis.py new file mode 100644 index 0000000..512c414 --- /dev/null +++ b/experiments/bot_detection/scripts/pocket_veto_analysis.py @@ -0,0 +1,637 @@ +"""Pocket-veto investigation for issue #51. + +Analyze whether counting stale open PRs as implicit rejections meaningfully +shifts merge-rate distributions and improves the signal's ability to separate +suspended from active GitHub accounts. + +Operates entirely on the existing bot_detection DuckDB. Does not fetch from +GitHub. Does not modify src/good_egg/. 
+ +Outputs: + - experiments/bot_detection/data/results/pocket_veto_analysis.json + - experiments/bot_detection/pocket_veto_findings.md +""" + +from __future__ import annotations + +import json +from pathlib import Path +from typing import Any + +import duckdb +import numpy as np +import pandas as pd +from sklearn.linear_model import LogisticRegression +from sklearn.metrics import roc_auc_score +from sklearn.model_selection import StratifiedKFold +from sklearn.preprocessing import StandardScaler + +BASE = Path(__file__).resolve().parents[1] +DB_PATH = BASE / "data" / "bot_detection.duckdb" +ACTIVITY_PATH = BASE / "data" / "open_pr_activity.parquet" +RESULTS_PATH = BASE / "data" / "results" / "pocket_veto_analysis.json" +FINDINGS_PATH = BASE / "pocket_veto_findings.md" + +UNIVERSAL_THRESHOLD_DAYS = 90 +SENSITIVITY_THRESHOLDS = (30, 60, 90, 180) +SEED = 42 + +MERGE_RATE_COLS = ( + "merge_rate_v3", + "merge_rate_universal", + "merge_rate_per_repo", + "merge_rate_idle_universal", + "merge_rate_idle_per_repo", +) + + +def characterize(con: duckdb.DuckDBPyConnection) -> dict[str, Any]: + """Phase 1: sanity-check the existing outcome/stale_threshold_days columns.""" + totals = con.execute(""" + SELECT state, COUNT(*) FROM prs GROUP BY state ORDER BY state + """).fetchall() + outcomes = con.execute(""" + SELECT outcome, COUNT(*) FROM prs GROUP BY outcome ORDER BY outcome + """).fetchall() + state_outcome = con.execute(""" + SELECT state, outcome, COUNT(*) FROM prs + GROUP BY state, outcome ORDER BY state, outcome + """).fetchall() + + thresh_dist = con.execute(""" + SELECT stale_threshold_days, COUNT(*) AS n + FROM prs WHERE stale_threshold_days IS NOT NULL + GROUP BY stale_threshold_days ORDER BY n DESC + """).fetchall() + + # Per-repo: how does stored stale_threshold_days compare to 2x median + # time-to-close (the hypothesis in the issue)? 
+ repo_stats = con.execute(""" + WITH closed AS ( + SELECT repo, + EXTRACT(EPOCH FROM (closed_at - created_at))/86400.0 AS ttc_days + FROM prs + WHERE state IN ('MERGED', 'CLOSED') + AND closed_at IS NOT NULL + AND created_at IS NOT NULL + AND closed_at > created_at + ), + repo_ttc AS ( + SELECT repo, MEDIAN(ttc_days) AS median_ttc_days, COUNT(*) AS n_closed + FROM closed GROUP BY repo + ), + repo_thresh AS ( + SELECT repo, + ANY_VALUE(stale_threshold_days) AS stale_threshold_days, + COUNT(DISTINCT stale_threshold_days) AS distinct_thresh + FROM prs + WHERE stale_threshold_days IS NOT NULL + GROUP BY repo + ) + SELECT t.repo, t.stale_threshold_days, t.distinct_thresh, + r.median_ttc_days, r.n_closed + FROM repo_thresh t LEFT JOIN repo_ttc r USING (repo) + ORDER BY t.repo + """).fetchdf() + + repo_stats["two_x_median"] = 2.0 * repo_stats["median_ttc_days"] + repo_stats["delta_vs_2x"] = ( + repo_stats["stale_threshold_days"] - repo_stats["two_x_median"] + ) + + # Open PR age distribution (relative to the DB's max created_at as "now") + age_stats = con.execute(""" + WITH ref AS (SELECT MAX(created_at) AS now FROM prs) + SELECT + quantile_cont( + EXTRACT(EPOCH FROM (ref.now - p.created_at))/86400.0, + [0.1, 0.25, 0.5, 0.75, 0.9, 0.95] + ) AS quantiles + FROM prs p, ref WHERE p.state = 'OPEN' + """).fetchone() + + return { + "state_totals": dict(totals), + "outcome_totals": dict(outcomes), + "state_outcome_crosstab": [ + {"state": s, "outcome": o, "n": n} for s, o, n in state_outcome + ], + "stale_threshold_distribution": [ + {"threshold_days": float(t), "n": int(n)} for t, n in thresh_dist + ], + "repos_total": int(len(repo_stats)), + "repos_using_default_30d": int( + (repo_stats["stale_threshold_days"] == 30.0).sum() + ), + "repos_calibrated": int( + (repo_stats["stale_threshold_days"] != 30.0).sum() + ), + "per_repo_calibration_check": { + "mean_delta_vs_2x_median_ttc": float( + repo_stats["delta_vs_2x"].dropna().mean() + ), + "median_delta_vs_2x_median_ttc": float( 
+ repo_stats["delta_vs_2x"].dropna().median() + ), + "n_repos_with_closed_prs": int( + repo_stats["median_ttc_days"].notna().sum() + ), + }, + "open_pr_age_quantiles_days": { + label: float(v) for label, v in zip( + ["p10", "p25", "p50", "p75", "p90", "p95"], + age_stats[0], + strict=True, + ) + }, + } + + +def build_author_features(con: duckdb.DuckDBPyConnection) -> pd.DataFrame: + """Phase 2: per-author counts and all five merge-rate definitions. + + Age-based variants use (now - created_at) with 'now' = max(created_at) + in the DB. Idle-based variants use (fetch_now - updated_at) from the + open_pr_activity.parquet sidecar; PRs that were OPEN in the DB snapshot + but are no longer currently open are treated as non-stale (the close or + merge event since the snapshot is itself evidence of activity). + """ + has_activity = ACTIVITY_PATH.exists() + sensitivity_cols = ",\n ".join( + f"SUM(CASE WHEN state='OPEN' AND age_days > {d} " + f"THEN 1 ELSE 0 END) AS open_stale_{d}d" + for d in SENSITIVITY_THRESHOLDS + ) + activity_join = "" + idle_cols = ( + "0 AS open_stale_idle_universal, 0 AS open_stale_idle_per_repo," + ) + if has_activity: + activity_join = f""" + LEFT JOIN read_parquet('{ACTIVITY_PATH}') oa + ON oa.repo = aged.repo AND oa.number = aged.number + """ + idle_cols = f""" + SUM(CASE + WHEN state='OPEN' AND oa.updated_at IS NOT NULL + AND EXTRACT(EPOCH FROM (oa.fetch_now - oa.updated_at)) + /86400.0 > {UNIVERSAL_THRESHOLD_DAYS} + THEN 1 ELSE 0 + END) AS open_stale_idle_universal, + SUM(CASE + WHEN state='OPEN' AND oa.updated_at IS NOT NULL + AND EXTRACT(EPOCH FROM (oa.fetch_now - oa.updated_at)) + /86400.0 > COALESCE(stale_threshold_days, 30) + THEN 1 ELSE 0 + END) AS open_stale_idle_per_repo, + """ + query = f""" + WITH ref AS (SELECT MAX(created_at) AS now FROM prs), + aged AS ( + SELECT p.*, + EXTRACT(EPOCH FROM (ref.now - p.created_at))/86400.0 AS age_days + FROM prs p, ref + ) + SELECT + author AS login, + COUNT(*) AS total_prs, + SUM(CASE WHEN 
state='MERGED' THEN 1 ELSE 0 END) AS merged, + SUM(CASE WHEN state='CLOSED' THEN 1 ELSE 0 END) AS closed, + SUM(CASE WHEN state='OPEN' THEN 1 ELSE 0 END) AS open_total, + SUM(CASE WHEN state='OPEN' + AND age_days > COALESCE(stale_threshold_days, 30) + THEN 1 ELSE 0 END) AS open_stale_per_repo, + SUM(CASE WHEN state='OPEN' AND age_days > {UNIVERSAL_THRESHOLD_DAYS} + THEN 1 ELSE 0 END) AS open_stale_universal, + {idle_cols} + {sensitivity_cols}, + MEDIAN(additions) AS median_additions + FROM aged + {activity_join} + GROUP BY author + """ + df = con.execute(query).fetchdf() + + def compute(col: str, stale_col: str) -> None: + denom = df["merged"] + df["closed"] + df[stale_col] + df[col] = np.where(denom > 0, df["merged"] / denom, 0.0) + + df["merge_rate_v3"] = np.where( + (df["merged"] + df["closed"]) > 0, + df["merged"] / (df["merged"] + df["closed"]), + 0.0, + ) + compute("merge_rate_universal", "open_stale_universal") + compute("merge_rate_per_repo", "open_stale_per_repo") + compute("merge_rate_idle_universal", "open_stale_idle_universal") + compute("merge_rate_idle_per_repo", "open_stale_idle_per_repo") + for d in SENSITIVITY_THRESHOLDS: + compute(f"merge_rate_universal_{d}d", f"open_stale_{d}d") + return df + + +def distribution_summary(df: pd.DataFrame) -> dict[str, Any]: + """Phase 2 deliverable: summary stats for each merge-rate definition.""" + + def summarize(col: str) -> dict[str, float]: + s = df[col] + return { + "mean": float(s.mean()), + "median": float(s.median()), + "p10": float(s.quantile(0.10)), + "p25": float(s.quantile(0.25)), + "p75": float(s.quantile(0.75)), + "p90": float(s.quantile(0.90)), + } + + cols = [ + *MERGE_RATE_COLS, + *[f"merge_rate_universal_{d}d" for d in SENSITIVITY_THRESHOLDS], + ] + return {col: summarize(col) for col in cols} + + +def shift_analysis(df: pd.DataFrame) -> dict[str, Any]: + """Phase 3: per-author shift from v3 baseline to each alternative.""" + out: dict[str, Any] = {"n_authors": int(len(df))} + for alt in [c for 
c in MERGE_RATE_COLS if c != "merge_rate_v3"]:
        # Per-author delta of the alternative definition vs the v3 baseline.
        # Negative delta = the author's merge rate DROPS under the new rule.
        delta = df[alt] - df["merge_rate_v3"]
        out[alt] = {
            "mean_delta": float(delta.mean()),
            "median_delta": float(delta.median()),
            # "dropped_gt_X" counts authors whose rate fell by more than X.
            "n_dropped_gt_0.05": int((delta < -0.05).sum()),
            "n_dropped_gt_0.10": int((delta < -0.10).sum()),
            "n_dropped_gt_0.25": int((delta < -0.25).sum()),
            "n_unchanged": int((delta == 0).sum()),
        }
    return out


def cohens_d(group_a: np.ndarray, group_b: np.ndarray) -> float:
    """Cohen's d effect size (a minus b) using the pooled sample SD.

    Returns NaN when either group has fewer than 2 elements or the pooled
    standard deviation is zero, since d is undefined in both cases.
    """
    if len(group_a) < 2 or len(group_b) < 2:
        return float("nan")
    # Pooled SD with Bessel's correction (ddof=1) in each group.
    pooled = np.sqrt(
        ((len(group_a) - 1) * group_a.var(ddof=1)
         + (len(group_b) - 1) * group_b.var(ddof=1))
        / (len(group_a) + len(group_b) - 2)
    )
    if pooled == 0:
        return float("nan")
    return float((group_a.mean() - group_b.mean()) / pooled)


def cv_auc(
    df: pd.DataFrame,
    merge_rate_col: str,
    n_folds: int = 5,
) -> dict[str, float]:
    """Phase 4: minimal 2-feature LR CV — merge_rate variant + median_additions.

    Mirrors the 2-feature baseline in scripts/refit_bad_egg.py but swaps the
    merge-rate column so all three definitions are evaluated on identical
    labeled-author splits.
    """
    # Binary target: 1 = suspended account, 0 = anything else in the frame.
    y = (df["account_status"] == "suspended").astype(int).values
    mr = df[merge_rate_col].fillna(0).to_numpy(dtype=float)
    ma = df["median_additions"].fillna(0).to_numpy(dtype=float)
    # Signed log1p transform: compresses magnitude while keeping the sign.
    ma = np.log1p(np.abs(ma)) * np.sign(ma)
    x = np.column_stack([mr, ma])

    # Stratified folds with a fixed SEED so all merge-rate variants are
    # scored on identical splits.
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=SEED)
    # Out-of-fold probabilities; every index is filled exactly once below.
    oof = np.full(len(y), np.nan)
    fold_aucs: list[float] = []
    for train_idx, test_idx in skf.split(x, y):
        # Scaler is fit on the train fold only to avoid leakage.
        scaler = StandardScaler()
        x_train = scaler.fit_transform(x[train_idx])
        x_test = scaler.transform(x[test_idx])
        model = LogisticRegression(
            class_weight="balanced", max_iter=1000, random_state=SEED,
        )
        model.fit(x_train, y[train_idx])
        probs = model.predict_proba(x_test)[:, 1]
        oof[test_idx] = probs
        fold_aucs.append(roc_auc_score(y[test_idx], probs))

    # Univariate summary of the merge-rate column itself, split by label.
    mr_susp = mr[y == 1]
    mr_act = mr[y == 0]
    return {
        # AUC over pooled out-of-fold predictions (single number, all authors).
        "cv_auc": float(roc_auc_score(y, oof)),
        "fold_auc_std": float(np.std(fold_aucs)),
        "mean_merge_rate_suspended": float(mr_susp.mean()),
        "mean_merge_rate_active": float(mr_act.mean()),
        "cohens_d_active_vs_suspended": cohens_d(mr_act, mr_susp),
        "n_suspended": int(y.sum()),
        "n_active": int((1 - y).sum()),
    }


def signal_evaluation(
    df: pd.DataFrame, con: duckdb.DuckDBPyConnection,
) -> dict[str, Any]:
    """Phase 4 deliverable: run CV for each merge-rate definition."""
    # Join author labels onto the per-author feature frame; inner join drops
    # authors with no label row, then we restrict to the two usable labels.
    authors = con.execute(
        "SELECT login, account_status FROM authors"
    ).fetchdf()
    labeled = df.merge(authors, on="login", how="inner")
    labeled = labeled[labeled["account_status"].isin(["active", "suspended"])]
    labeled = labeled.copy()

    results: dict[str, Any] = {
        "n_labeled": int(len(labeled)),
        "n_suspended": int((labeled["account_status"] == "suspended").sum()),
        "n_active": int((labeled["account_status"] == "active").sum()),
    }
    # One CV run per merge-rate definition, all on the same labeled frame.
    for col in MERGE_RATE_COLS:
        results[col] = cv_auc(labeled, col)
    return results


def pick_example_authors(
    df: pd.DataFrame, con: duckdb.DuckDBPyConnection,
) -> list[dict[str, Any]]:
    """Phase 3 deliverable: handful of authors where definitions disagree."""
    authors = con.execute(
        "SELECT login, account_status FROM authors"
    ).fetchdf()
    # Left join: keep unlabeled authors too (account_status becomes NaN).
    merged = df.merge(authors, on="login", how="left")
    # shift_universal < 0 means the universal definition lowered the rate.
    merged["shift_universal"] = (
        merged["merge_rate_universal"] - merged["merge_rate_v3"]
    )
    # 3 big-shift authors (most affected) + 3 suspended authors + 2 active
    # high-PR authors.
    big_shift = merged.nsmallest(3, "shift_universal")
    susp = merged[merged["account_status"] == "suspended"].nlargest(
        3, "total_prs",
    )
    act = merged[merged["account_status"] == "active"].nlargest(
        2, "total_prs",
    )
    # An author may qualify under several picks; keep the first occurrence.
    picks = pd.concat([big_shift, susp, act]).drop_duplicates(subset=["login"])
    cols = [
        "login", "account_status", "total_prs", "merged", "closed",
        "open_total", "open_stale_per_repo", "open_stale_universal",
        "open_stale_idle_universal", "open_stale_idle_per_repo",
        *MERGE_RATE_COLS,
    ]
    return picks[cols].to_dict("records")


def write_findings(results: dict[str, Any]) -> None:
    """Render the human-readable findings markdown to FINDINGS_PATH.

    Consumes the same ``results`` dict that is serialized to
    ``pocket_veto_analysis.json``; expects the characterization,
    distributions, shift_analysis, signal_evaluation and recommendation
    sections to be present.
    """
    c = results["characterization"]
    d = results["distributions"]
    s = results["shift_analysis"]
    e = results["signal_evaluation"]

    # 4-decimal fixed-point formatting used for every numeric cell below.
    def fmt(x: float) -> str:
        return f"{x:.4f}"

    # The document is built as a flat list of markdown lines and joined once
    # at the end; adjacent f-strings below rely on implicit concatenation.
    lines = [
        "# Pocket Veto Investigation — Findings",
        "",
        "Investigation for issue #51. Does counting stale open PRs as implicit",
        "rejections meaningfully change merge-rate distributions and improve the",
        "signal's ability to separate suspended from active accounts?",
        "",
        "## Dataset",
        "",
        f"- {sum(c['state_totals'].values())} PRs across "
        f"{c['repos_total']} repos",
        f"- State totals: {c['state_totals']}",
        f"- Outcome totals: {c['outcome_totals']}",
        f"- Labeled authors: {e['n_labeled']} "
        f"({e['n_suspended']} suspended, {e['n_active']} active)",
        "",
        "## Staleness definitions compared",
        "",
        "- **v3 (baseline)**: `merged / (merged + closed)` — current scorer.py.",
        f"- **age_universal**: open PR is stale if age > "
        f"{UNIVERSAL_THRESHOLD_DAYS}d since `created_at`.",
        "- **age_per_repo**: open PR is stale if age > that repo's",
        "  `stale_threshold_days` (populated in the DuckDB; default 30d).",
        f"- **idle_universal**: open PR is stale if it is still open AND idle "
        f"> {UNIVERSAL_THRESHOLD_DAYS}d (`fetch_now - updated_at`).",
        "- **idle_per_repo**: same, with the per-repo threshold substituted.",
        "",
        "The `idle_*` variants use a live re-fetch of every DB-OPEN PR's",
        "`updatedAt` (see `fetch_open_pr_activity.py`). PRs that were OPEN at",
        "the snapshot but have since been closed or merged are treated as",
        "non-stale — the close/merge event itself is activity.",
        "",
        "## Calibration sanity check",
        "",
        f"- Repos using the default 30d threshold: {c['repos_using_default_30d']}"
        f" / {c['repos_total']}",
        f"- Repos with a calibrated threshold: {c['repos_calibrated']}",
        "- Per-repo calibrated thresholds vs 2x median time-to-close:",
        f"  mean delta = "
        f"{fmt(c['per_repo_calibration_check']['mean_delta_vs_2x_median_ttc'])}"
        f", median delta = "
        f"{fmt(c['per_repo_calibration_check']['median_delta_vs_2x_median_ttc'])}"
        " (days).",
        "",
        "## Distribution shift",
        "",
        "Mean merge rate across all authors:",
        "",
        "| Definition | mean | median | p10 | p90 |",
        "|---|---|---|---|---|",
        # One table row per merge-rate definition, spliced into the document.
        *[
            f"| {label} | "
            f"{fmt(d[col]['mean'])} | "
            f"{fmt(d[col]['median'])} | "
            f"{fmt(d[col]['p10'])} | "
            f"{fmt(d[col]['p90'])} |"
            for label, col in [
                ("v3 baseline", "merge_rate_v3"),
                (f"age_universal ({UNIVERSAL_THRESHOLD_DAYS}d)",
                 "merge_rate_universal"),
                ("age_per_repo", "merge_rate_per_repo"),
                (f"idle_universal ({UNIVERSAL_THRESHOLD_DAYS}d)",
                 "merge_rate_idle_universal"),
                ("idle_per_repo", "merge_rate_idle_per_repo"),
            ]
        ],
        "",
        "Per-author drop from the v3 baseline (n authors, >0.10 / >0.25):",
        "",
        *[
            f"- **{label}**: "
            f"{s[col]['n_dropped_gt_0.10']} / {s[col]['n_dropped_gt_0.25']}"
            for label, col in [
                ("age_universal", "merge_rate_universal"),
                ("age_per_repo", "merge_rate_per_repo"),
                ("idle_universal", "merge_rate_idle_universal"),
                ("idle_per_repo", "merge_rate_idle_per_repo"),
            ]
        ],
        "",
        "## Signal quality vs ground truth",
        "",
        "2-feature logistic regression (merge_rate + log1p(median_additions)),",
        f"5-fold CV on {e['n_labeled']} labeled authors:",
        "",
        "| Definition | CV AUC | Active mean | Suspended mean | Cohen's d |",
        "|---|---|---|---|---|",
        *[
            f"| {label} | "
            f"{fmt(e[col]['cv_auc'])} | "
            f"{fmt(e[col]['mean_merge_rate_active'])} | "
            f"{fmt(e[col]['mean_merge_rate_suspended'])} | "
            f"{fmt(e[col]['cohens_d_active_vs_suspended'])} |"
            for label, col in [
                ("v3 baseline", "merge_rate_v3"),
                ("age_universal", "merge_rate_universal"),
                ("age_per_repo", "merge_rate_per_repo"),
                ("idle_universal", "merge_rate_idle_universal"),
                ("idle_per_repo", "merge_rate_idle_per_repo"),
            ]
        ],
        "",
        "## Recommendation",
        "",
        "See the `recommendation` field in "
        "`data/results/pocket_veto_analysis.json` for the machine-readable",
        "decision logic. Text summary and follow-up branch sketch below.",
        "",
        f"**{results['recommendation']['decision']}** — "
        f"{results['recommendation']['rationale']}",
        "",
        "### Follow-up branch sketch (if adopted)",
        "",
        "- `src/good_egg/github_client.py`: extend `_COMBINED_QUERY` with an",
        "  `openPullRequests` selection that pulls `createdAt`/`updatedAt` for",
        "  each OPEN PR on the scored user (or `totalCount` if we can push the",
        "  staleness filter into the query).",
        "- `src/good_egg/models.py`: add `open_stale_pr_count: int` (or similar)",
        "  to `UserContributionData`.",
        "- `src/good_egg/scorer.py:256-261`: change the `_score_v3` merge-rate",
        "  formula to `merged / (merged + closed + open_stale)`.",
        "- `src/good_egg/config.py`: add the staleness threshold as a tunable",
        "  config value.",
        "- Tests: parallel coverage in `tests/test_scorer.py`.",
        "",
    ]
    FINDINGS_PATH.write_text("\n".join(lines))


def decide(
    e: dict[str, Any], s: dict[str, Any], d: dict[str, Any],
) -> dict[str, Any]:
    """Produce a simple quantitative recommendation.

    Parameters: ``e`` = signal_evaluation output, ``s`` = shift_analysis
    output, ``d`` = distribution summary (accepted for interface symmetry;
    not read by the current decision rule). A variant is adopted only if its
    CV AUC beats the current best by more than 0.005.
    """
    base_auc = e["merge_rate_v3"]["cv_auc"]
    aucs = {col: e[col]["cv_auc"] for col in MERGE_RATE_COLS}

    # Greedy scan: upgrade best_name only on a clear (>0.005 AUC) win over
    # the incumbent, so ties and noise-level differences keep v3.
    best_name = "merge_rate_v3"
    for col, auc in aucs.items():
        if col == "merge_rate_v3":
            continue
        if auc > aucs[best_name] + 0.005:
            best_name = col

    cohens = {
        col: e[col]["cohens_d_active_vs_suspended"] for col in MERGE_RATE_COLS
    }

    if best_name == "merge_rate_v3":
        decision = "Keep v3 as-is"
        # NOTE(review): the rationale text asserts Cohen's d "fails to
        # improve", but only the AUC margin is actually tested above.
        rationale = (
            f"No variant beats v3 CV AUC {base_auc:.4f} by >0.005 "
            f"(aucs={ {k: round(v, 4) for k, v in aucs.items()} }). "
            f"Cohen's d also fails to improve "
            f"(base={cohens['merge_rate_v3']:.3f}, "
            f"best_alt={max(v for k, v in cohens.items() if k != 'merge_rate_v3'):.3f})."
        )
    else:
        affected = s[best_name]["n_dropped_gt_0.10"]
        decision = f"Adopt {best_name}"
        rationale = (
            f"{best_name} CV AUC {aucs[best_name]:.4f} beats v3 baseline "
            f"{base_auc:.4f} by >0.005. Cohen's d "
            f"{cohens[best_name]:.3f} vs v3 {cohens['merge_rate_v3']:.3f}. "
            f"{affected} authors shift by >0.10 in merge rate."
        )
    return {
        "decision": decision,
        "rationale": rationale,
        "cv_aucs": aucs,
        "cohens_d": cohens,
        "universal_threshold_days": UNIVERSAL_THRESHOLD_DAYS,
    }


def main() -> None:
    """Run all four phases, print progress, and write JSON + markdown outputs."""
    print(f"Loading {DB_PATH}")
    con = duckdb.connect(str(DB_PATH), read_only=True)

    print("Phase 1: characterization")
    characterization = characterize(con)
    print(f"  state totals: {characterization['state_totals']}")
    print(f"  outcome totals: {characterization['outcome_totals']}")
    print(
        f"  repos calibrated: {characterization['repos_calibrated']} / "
        f"{characterization['repos_total']}"
    )

    print("Phase 2: per-author features + merge-rate variants")
    # The idle-time sidecar parquet is optional; without it the idle_*
    # variants carry no staleness information (message below).
    if ACTIVITY_PATH.exists():
        print(f"  using idle-time sidecar: {ACTIVITY_PATH.name}")
    else:
        print("  (no idle-time sidecar; idle_* variants will be all-zero)")
    df = build_author_features(con)
    print(f"  {len(df)} authors")
    distributions = distribution_summary(df)
    for col in MERGE_RATE_COLS:
        v = distributions[col]
        print(f"  {col}: mean={v['mean']:.4f} median={v['median']:.4f}")

    print("Phase 3: shift analysis")
    shifts = shift_analysis(df)
    for alt in [c for c in MERGE_RATE_COLS if c != "merge_rate_v3"]:
        print(
            f"  {alt}: mean_delta={shifts[alt]['mean_delta']:+.4f} "
            f"n_dropped>0.10={shifts[alt]['n_dropped_gt_0.10']}"
        )

    print("Phase 4: signal evaluation")
    signal = signal_evaluation(df, con)
    for col in MERGE_RATE_COLS:
        print(
            f"  {col}: CV AUC={signal[col]['cv_auc']:.4f} "
            f"cohens_d={signal[col]['cohens_d_active_vs_suspended']:.4f}"
        )

    examples = pick_example_authors(df, con)
    print("Phase 3: example authors (most-shifted + labeled high-volume)")
    for row in examples:
        print(
            f"  {row['login']:20s} [{row['account_status']}] "
            f"total={row['total_prs']} v3={row['merge_rate_v3']:.3f} "
            f"uni={row['merge_rate_universal']:.3f} "
            f"per_repo={row['merge_rate_per_repo']:.3f}"
        )

    recommendation = decide(signal, shifts, distributions)
    print(f"\nRecommendation: {recommendation['decision']}")
    print(f"  {recommendation['rationale']}")

    # Machine-readable output; default=str covers any non-JSON-native values
    # (e.g. numpy scalars) that survive the float()/int() conversions above.
    results = {
        "universal_threshold_days": UNIVERSAL_THRESHOLD_DAYS,
        "characterization": characterization,
        "distributions": distributions,
        "shift_analysis": shifts,
        "signal_evaluation": signal,
        "example_authors": examples,
        "recommendation": recommendation,
    }
    RESULTS_PATH.parent.mkdir(parents=True, exist_ok=True)
    RESULTS_PATH.write_text(json.dumps(results, indent=2, default=str))
    print(f"\nWrote {RESULTS_PATH}")

    write_findings(results)
    print(f"Wrote {FINDINGS_PATH}")

    con.close()


if __name__ == "__main__":
    main()