Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
267 changes: 267 additions & 0 deletions generate_release_notes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,267 @@
#!/usr/bin/env python3
"""Generate weekly release notes for HydraDB by scanning merged PRs across all repos.

Usage:
python generate_release_notes.py --days 7
python generate_release_notes.py --days 7 --dry-run # skip AI summarization

Requires:
GITHUB_TOKEN - GitHub personal access token with repo read access
OPENAI_API_KEY - OpenAI API key (optional if --dry-run)
"""

from __future__ import annotations

import argparse
import json
import os
import subprocess
import sys
from collections import defaultdict
from datetime import datetime, timedelta, timezone
from pathlib import Path

# ---------------------------------------------------------------------------
# Config
# ---------------------------------------------------------------------------

# Repositories scanned for merged PRs.  "display" is the human-readable
# section heading used in the generated markdown.
REPOS: list[dict[str, str]] = [
    {"owner": "usecortex", "name": "cortex-application", "display": "Core API"},
    {"owner": "usecortex", "name": "cortex-ingestion", "display": "Ingestion Pipeline"},
    {"owner": "usecortex", "name": "cortex-dashboard", "display": "Dashboard"},
    {"owner": "usecortex", "name": "hydradb-on-prem-infra", "display": "On-Prem Infrastructure"},
    {"owner": "usecortex", "name": "hydradb-cli", "display": "CLI"},
    {"owner": "usecortex", "name": "hydradb-mcp", "display": "MCP Server"},
    {"owner": "usecortex", "name": "hydradb-claude-code", "display": "Claude Code Integration"},
    {"owner": "usecortex", "name": "hydradb-bench", "display": "Benchmarks"},
    {"owner": "usecortex", "name": "python-sdk", "display": "Python SDK"},
    {"owner": "usecortex", "name": "ts-sdk", "display": "TypeScript SDK"},
    {"owner": "usecortex", "name": "mintlify-docs", "display": "Documentation"},
    {"owner": "usecortex", "name": "docs", "display": "Docs (legacy)"},
    {"owner": "usecortex", "name": "openclaw-hydradb", "display": "OpenClaw"},
]

# Category -> keyword substrings used by categorize_pr().  Order matters:
# categories are tried top-to-bottom and the FIRST category with a keyword
# found in the PR title wins (e.g. a title containing both "add" and "fix"
# lands in Features).  Keep broader buckets like "Chores" last.
CATEGORY_KEYWORDS: dict[str, list[str]] = {
    "Features": ["feat", "feature", "add", "new"],
    "Bug Fixes": ["fix", "bug", "hotfix", "patch", "resolve"],
    "Performance": ["perf", "optim", "speed", "latency", "cache"],
    "Security": ["security", "auth", "encrypt", "vulnerability", "cve"],
    "Infrastructure": ["infra", "deploy", "ci", "cd", "docker", "helm", "k8s", "argo"],
    "Documentation": ["doc", "readme", "guide", "cookbook"],
    "Chores": ["chore", "bump", "refactor", "cleanup", "lint", "format", "revert"],
}


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------


def fetch_merged_prs(owner: str, name: str, since: datetime) -> list[dict]:
    """Fetch PRs merged in ``owner/name`` on or after *since* via the ``gh`` CLI.

    Returns a list of PR dicts (``number``, ``title``, ``author``,
    ``mergedAt``, ``url``), each annotated with a parsed ``_merged_at``
    timezone-aware datetime.  On any failure (missing token, gh error,
    timeout, bad JSON) a warning is printed to stderr and an empty list is
    returned, so one bad repo never aborts the whole run.
    """
    token = os.environ.get("GITHUB_TOKEN", "")
    if not token:
        print(f" WARNING: GITHUB_TOKEN not set, skipping {owner}/{name}", file=sys.stderr)
        return []

    cmd = [
        "gh", "pr", "list",
        "--repo", f"{owner}/{name}",
        "--state", "merged",
        # Request only the fields this script actually reads.
        "--json", "number,title,author,mergedAt,url",
        # Server-side merge-date filter: without it, gh returns only the most
        # recently merged PRs up to --limit and repos that merged more than
        # that inside the window would silently lose the oldest ones.  The
        # higher limit is a safety net on top of the pre-filter.
        "--search", f"merged:>={since.strftime('%Y-%m-%d')}",
        "--limit", "500",
    ]
    env = {**os.environ, "GH_TOKEN": token}
    try:
        result = subprocess.run(cmd, capture_output=True, text=True, timeout=30, env=env)
        if result.returncode != 0:
            print(f" WARNING: gh failed for {owner}/{name}: {result.stderr.strip()}", file=sys.stderr)
            return []
        prs = json.loads(result.stdout)
    except (subprocess.TimeoutExpired, json.JSONDecodeError) as exc:
        print(f" WARNING: error fetching {owner}/{name}: {exc}", file=sys.stderr)
        return []

    # Client-side pass keeps exact (sub-day) cutoff precision; the --search
    # qualifier above only filters at whole-day granularity.
    recent = []
    for pr in prs:
        merged_at = datetime.fromisoformat(pr["mergedAt"].replace("Z", "+00:00"))
        if merged_at >= since:
            pr["_merged_at"] = merged_at
            recent.append(pr)
    return recent


def categorize_pr(pr: dict) -> str:
    """Map a PR to a release-notes category by keyword-matching its title.

    Categories are tried in CATEGORY_KEYWORDS insertion order; the first
    category with any keyword appearing as a substring of the lowercased
    title wins.  Falls back to "Other" when nothing matches.
    """
    title = pr["title"].lower()
    return next(
        (
            category
            for category, keywords in CATEGORY_KEYWORDS.items()
            if any(keyword in title for keyword in keywords)
        ),
        "Other",
    )


def generate_ai_summary(categorized: dict, dry_run: bool = False) -> str | None:
    """Produce a polished executive summary of the categorized PRs via OpenAI.

    Returns None whenever summarization is skipped: dry-run mode, missing
    OPENAI_API_KEY, or the openai package not being installed (the latter two
    emit a warning on stderr).
    """
    if dry_run:
        return None

    api_key = os.environ.get("OPENAI_API_KEY", "")
    if not api_key:
        print(" WARNING: OPENAI_API_KEY not set, skipping AI summary", file=sys.stderr)
        return None

    try:
        from openai import OpenAI
    except ImportError:
        print(" WARNING: openai package not installed, skipping AI summary", file=sys.stderr)
        return None

    # Flatten the nested category -> repo -> PR mapping into bullet lines.
    fragments: list[str] = []
    for category, repos in categorized.items():
        fragments.append(f"\n## {category}\n")
        for repo_display, prs in repos.items():
            fragments.extend(
                f"- [{repo_display}] {pr['title']} (#{pr['number']})\n" for pr in prs
            )
    pr_list_text = "".join(fragments)

    prompt = f"""You are a technical writer for HydraDB, a vector database product.
Write a concise executive summary (3-5 paragraphs) of this week's release highlights.
Focus on user-facing impact. Group related changes together. Use professional tone.
Do NOT list every PR -- synthesize the key themes and improvements.

PRs merged this week:
{pr_list_text}
"""

    client = OpenAI(api_key=api_key)
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.4,
        max_tokens=1000,
    )
    return response.choices[0].message.content


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------


def _is_bot(pr: dict) -> bool:
    """Return True when the PR author is a GitHub App / automation account.

    The ``gh pr list --json author`` payload contains no ``is_bot`` field, so
    bots are detected by GitHub's "[bot]" login-suffix convention used by all
    Apps (e.g. ``renovate[bot]``, ``dependabot[bot]``, ``vorflux[bot]``).
    """
    return pr["author"].get("login", "").endswith("[bot]")


def main() -> None:
    """Entry point: fetch merged PRs, categorize them, optionally summarize
    with AI, and write a markdown release-notes file under --output-dir."""
    parser = argparse.ArgumentParser(description="Generate HydraDB weekly release notes")
    parser.add_argument("--days", type=int, default=7, help="Look-back window in days (default: 7)")
    parser.add_argument("--dry-run", action="store_true", help="Skip AI summarization")
    parser.add_argument("--output-dir", default="reports", help="Output directory (default: reports)")
    args = parser.parse_args()

    since = datetime.now(timezone.utc) - timedelta(days=args.days)
    today = datetime.now(timezone.utc).strftime("%Y-%m-%d")
    print(f"Generating release notes for {args.days}-day window ending {today}")
    print(f"Cutoff: {since.isoformat()}")
    print()

    # Fetch PRs from all repos; a failure in one repo yields [] and is skipped.
    all_prs: list[tuple[dict, dict]] = []  # (repo_config, pr)
    for repo in REPOS:
        slug = f"{repo['owner']}/{repo['name']}"
        print(f"Fetching {slug}...")
        prs = fetch_merged_prs(repo["owner"], repo["name"], since)
        if prs:
            print(f" Found {len(prs)} merged PRs")
            for pr in prs:
                all_prs.append((repo, pr))
        else:
            print(" No merged PRs in window")

    if not all_prs:
        print("\nNo merged PRs found in the time window. Nothing to report.")
        sys.exit(0)

    print(f"\nTotal: {len(all_prs)} merged PRs across {len({r['name'] for r, _ in all_prs})} repos")

    # Categorize: category -> repo display name -> list of PRs.
    categorized: dict[str, dict[str, list[dict]]] = defaultdict(lambda: defaultdict(list))
    for repo, pr in all_prs:
        category = categorize_pr(pr)
        categorized[category][repo["display"]].append(pr)

    # Generate AI summary (returns None in dry-run or when unavailable).
    ai_summary = generate_ai_summary(categorized, dry_run=args.dry_run)

    # Build markdown
    lines: list[str] = []
    lines.append(f"# HydraDB Release Notes -- Week of {today}")
    lines.append("")
    lines.append(f"**Period:** {since.strftime('%B %d')} -- {datetime.now(timezone.utc).strftime('%B %d, %Y')}")
    lines.append(f"**Total PRs Merged:** {len(all_prs)}")
    active_repos = sorted({r["display"] for r, _ in all_prs})
    lines.append(f"**Active Repositories:** {', '.join(active_repos)}")
    lines.append("")

    if ai_summary:
        lines.append("## Executive Summary")
        lines.append("")
        lines.append(ai_summary)
        lines.append("")

    lines.append("---")
    lines.append("")

    # Fixed section order; categories with no PRs are skipped.
    category_order = [
        "Features", "Bug Fixes", "Performance", "Security",
        "Infrastructure", "Documentation", "Chores", "Other",
    ]
    for category in category_order:
        if category not in categorized:
            continue
        repos = categorized[category]
        total = sum(len(prs) for prs in repos.values())
        lines.append(f"## {category} ({total})")
        lines.append("")
        for repo_display in sorted(repos.keys()):
            prs = repos[repo_display]
            lines.append(f"### {repo_display}")
            lines.append("")
            # Newest PR number first within each repo section.
            for pr in sorted(prs, key=lambda p: p["number"], reverse=True):
                author = pr["author"].get("login", "unknown")
                url = pr["url"]
                lines.append(f"- **[#{pr['number']}]({url})** {pr['title']} _(by @{author})_")
            lines.append("")

    # Stats: human contributors (bots excluded), then an aggregate bot line.
    lines.append("---")
    lines.append("")
    lines.append("## Contributors")
    lines.append("")
    contributors: dict[str, int] = defaultdict(int)
    for _, pr in all_prs:
        author = pr["author"].get("login", "unknown")
        if not _is_bot(pr):
            contributors[author] += 1
    for author, count in sorted(contributors.items(), key=lambda x: -x[1]):
        lines.append(f"- @{author} ({count} PRs)")
    lines.append("")

    # Bot contributions
    bot_count = sum(1 for _, pr in all_prs if _is_bot(pr))
    if bot_count:
        lines.append(f"- Automated (Vorflux bot): {bot_count} PRs")
        lines.append("")

    content = "\n".join(lines)

    # Write output
    out_dir = Path(args.output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    out_path = out_dir / f"release-notes-{today}.md"
    out_path.write_text(content, encoding="utf-8")
    print(f"\nRelease notes written to: {out_path}")
    print(f"Length: {len(content)} chars, {len(lines)} lines")


# Script entry point: run only when executed directly, not when imported.
if __name__ == "__main__":
    main()
Loading