From 21a08722290654ddc4defc80d66b14d90f73402f Mon Sep 17 00:00:00 2001 From: Sudhanshu Sali Date: Wed, 6 May 2026 16:47:42 -0400 Subject: [PATCH 1/2] fix: reduce PR review false positives, increase context budget --- scripts/issue_bot/config.py | 4 ++-- scripts/issue_bot/github_client.py | 1 + scripts/issue_bot/main.py | 15 ++++++++++++++- 3 files changed, 17 insertions(+), 3 deletions(-) diff --git a/scripts/issue_bot/config.py b/scripts/issue_bot/config.py index b6fdb0c..b090477 100644 --- a/scripts/issue_bot/config.py +++ b/scripts/issue_bot/config.py @@ -33,8 +33,8 @@ def __init__(self): self.upstream_repo = os.getenv("UPSTREAM_REPO", "awslabs/python-deequ") - self.bedrock_timeout = 120 - self.max_context_chars = 200000 + self.bedrock_timeout = 240 + self.max_context_chars = 800000 self.max_github_search_results = 8 self.github_api_timeout = 10 self.allowed_labels = { diff --git a/scripts/issue_bot/github_client.py b/scripts/issue_bot/github_client.py index 82d0ed2..d195bb5 100644 --- a/scripts/issue_bot/github_client.py +++ b/scripts/issue_bot/github_client.py @@ -150,6 +150,7 @@ def post_pr_review(self, number, summary, inline_comments): if resp.status_code in (200, 201): return True logger.error(f"PR review API failed: {resp.status_code}, falling back to comment") + logger.error(f"Response: {resp.text[:500]}") except Exception as e: logger.error(f"PR review API failed: {e}, falling back to comment") diff --git a/scripts/issue_bot/main.py b/scripts/issue_bot/main.py index 77f5ef7..eae12e1 100644 --- a/scripts/issue_bot/main.py +++ b/scripts/issue_bot/main.py @@ -149,17 +149,30 @@ def analyze(): diff = gh.get_pr_diff(number) review_comments = gh.get_pr_review_comments(number) existing_feedback = _format_pr_feedback(comments_data, review_comments) + # Fetch full source files modified in the PR for complete context + pr_files = gh.get_pr_files(number) + full_sources = "" + for pf in pr_files: + fname = pf.get("filename", "") + content = gh.get_file_content(fname) + if content: + entry = f"\n### `{fname}`\n```\n{content}\n```\n" + if len(full_sources) + len(entry) > 3_000_000: + full_sources += f"\n### `{fname}` — SKIPPED (context budget)\n" + break + full_sources += entry # System prompt: instructions + all trusted context (not scanned by guardrail) system_prompt = _render(tmpl, current_date=datetime.date.today().isoformat()) + ( f"\n\n\n{context}\n\n" f"\n{codebase_map}\n\n" + f"\n{full_sources}\n\n" f"\n{diff}\n\n" f"\n{existing_feedback}\n" ) # User prompt: only user-authored content (scanned by guardrail) user_prompt = f"\nTitle: {title}\nBody: {body}\n" raw = bedrock.invoke(system_prompt, user_prompt, - max_tokens=4000, json_schema=PR_REVIEW_SCHEMA) + max_tokens=8000, json_schema=PR_REVIEW_SCHEMA) if raw is None: _write_artifact({ "action": "ESCALATE", "reason": "bedrock_unavailable", "title": title, From 7118e84d003067466a721fa9d3ce6e7a315db787 Mon Sep 17 00:00:00 2001 From: Sudhanshu Sali Date: Wed, 13 May 2026 15:05:52 -0400 Subject: [PATCH 2/2] fix: incremental PR review, auto-approve, and bot operational improvements --- .github/workflows/auto-approve.yml | 101 +++ .github/workflows/issue-bot.yml | 12 +- .github/workflows/stale.yml | 36 + scripts/issue_bot/config.py | 6 +- scripts/issue_bot/github_client.py | 116 ++- scripts/issue_bot/main.py | 94 ++- scripts/issue_bot/prompts.py | 40 +- .../issue_bot/schemas/pr_review_response.json | 13 +- tests/test_bot.py | 788 ++++++++++++++++++ 9 files changed, 1158 insertions(+), 48 deletions(-) create mode 100644 .github/workflows/auto-approve.yml create mode 100644 .github/workflows/stale.yml diff --git a/.github/workflows/auto-approve.yml b/.github/workflows/auto-approve.yml new file mode 100644 index 0000000..f24c211 --- /dev/null +++ b/.github/workflows/auto-approve.yml @@ -0,0 +1,101 @@ +name: Auto-Approve Clean PRs + +on: + workflow_run: + workflows: [".github/workflows/base.yml", "PyDeequ Bot"] + types: [completed] + +permissions: + pull-requests: write + actions: read + +jobs: + approve: + runs-on: ubuntu-latest + if: github.event.workflow_run.event == 'pull_request' || github.event.workflow_run.event == 'pull_request_target' + timeout-minutes: 2 + + steps: + - name: Find PR and check both conditions + uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 + with: + script: | + const sha = context.payload.workflow_run.head_sha; + const owner = context.repo.owner; + const repo = context.repo.repo; + + // Find the PR for this SHA + let prNumber = null; + const prs = context.payload.workflow_run.pull_requests; + if (prs && prs.length > 0) { + prNumber = prs[0].number; + } else { + const {data: searchResult} = await github.rest.pulls.list({ + owner, repo, state: 'open', sort: 'updated', direction: 'desc', per_page: 30 + }); + const match = searchResult.find(pr => pr.head.sha === sha); + if (match) { + prNumber = match.number; + } + } + + if (!prNumber) { + core.info(`No open PR found for SHA ${sha}, skipping`); + return; + } + + core.info(`Found PR #${prNumber} for SHA ${sha}`); + + // Verify the PR head SHA still matches (no new push since trigger) + const {data: pr} = await github.rest.pulls.get({ + owner, repo, pull_number: prNumber + }); + if (pr.head.sha !== sha) { + core.info(`PR head ${pr.head.sha} differs from trigger SHA ${sha} — new push arrived, skipping`); + return; + } + + // Condition 1: CI must have passed for this SHA + const {data: workflowRuns} = await github.rest.actions.listWorkflowRunsForRepo({ + owner, repo, head_sha: sha, status: 'completed' + }); + const ciRun = workflowRuns.workflow_runs.find(r => + r.name === '.github/workflows/base.yml' && r.conclusion === 'success' + ); + if (!ciRun) { + core.info(`CI has not passed for SHA ${sha}, skipping`); + return; + } + + // Condition 2: Bot must have posted a clean review for this SHA + const {data: reviews} = await github.rest.pulls.listReviews({ + owner, repo, pull_number: prNumber + }); + + const CLEAN_MARKER = ''; + + const latestBot = reviews + .filter(r => r.user.login === 'github-actions[bot]') + .sort((a, b) => new Date(b.submitted_at) - new Date(a.submitted_at))[0]; + + if (!latestBot || !latestBot.body.includes(CLEAN_MARKER) || latestBot.commit_id !== sha) { + core.info('Bot has not posted a clean review for this SHA, skipping'); + return; + } + + // Both conditions met — check for existing approval to prevent doubles + const botApprovals = reviews.filter(r => + r.user.login === 'github-actions[bot]' && r.state === 'APPROVED' + ); + if (botApprovals.length > 0) { + core.info('Bot already approved this PR, skipping'); + return; + } + + // Approve + core.info(`Approving PR #${prNumber}: bot review clean + CI passed for SHA ${sha}`); + await github.rest.pulls.createReview({ + owner, repo, pull_number: prNumber, + event: 'APPROVE', + body: `No issues found and CI is passing. Auto-approved.\n\n---\n*Generated by AI — human merge required.*` + }); diff --git a/.github/workflows/issue-bot.yml b/.github/workflows/issue-bot.yml index b24a577..96a2056 100644 --- a/.github/workflows/issue-bot.yml +++ b/.github/workflows/issue-bot.yml @@ -61,16 +61,20 @@ jobs: ISSUE_NUMBER: ${{ github.event.issue.number || github.event.pull_request.number || inputs.issue_number }} EVENT_TYPE: ${{ github.event_name }} EVENT_ACTION: ${{ github.event.action }} + EVENT_BEFORE: ${{ github.event.before }} + EVENT_AFTER: ${{ github.event.pull_request.head.sha || github.event.after }} GITHUB_ACTOR: ${{ github.actor }} KB_S3_BUCKET: ${{ secrets.KB_S3_BUCKET }} KB_S3_KEY: ${{ secrets.KB_S3_KEY }} BEDROCK_MODEL_ID: ${{ secrets.BEDROCK_MODEL_ID }} GUARDRAIL_ID: ${{ secrets.GUARDRAIL_ID }} GUARDRAIL_VERSION: ${{ secrets.GUARDRAIL_VERSION }} - ISSUE_CLASSIFY_PROMPT: ${{ secrets.ISSUE_CLASSIFY_PROMPT }} - ISSUE_RESPOND_PROMPT: ${{ secrets.ISSUE_RESPOND_PROMPT }} - PR_FILE_REVIEW_PROMPT: ${{ secrets.PR_FILE_REVIEW_PROMPT }} - FOLLOWUP_PROMPT: ${{ secrets.FOLLOWUP_PROMPT }} + SM_ISSUE_CLASSIFY_PROMPT: pydeequ-bot/issue-classify-prompt + SM_ISSUE_RESPOND_PROMPT: pydeequ-bot/issue-respond-prompt + SM_PR_FILE_REVIEW_PROMPT: pydeequ-bot/pr-file-review-prompt + SM_FOLLOWUP_PROMPT: pydeequ-bot/followup-prompt + CODEBASE_SRC_DIR: pydeequ + CODEBASE_FILE_EXT: .py DRY_RUN: ${{ inputs.dry_run || 'false' }} ARTIFACT_PATH: ${{ runner.temp }}/bot_result.json run: python -m issue_bot.main analyze diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml new file mode 100644 index 0000000..94ae395 --- /dev/null +++ b/.github/workflows/stale.yml @@ -0,0 +1,36 @@ +name: Manage Stale Issues and PRs + +on: + schedule: + - cron: '0 9 * * MON' + workflow_dispatch: + +permissions: + issues: write + pull-requests: write + +jobs: + stale: + runs-on: ubuntu-latest + steps: + - uses: actions/stale@28ca1036281a5e5922ead5184a1bbf96e5fc984e # v9.0.0 + with: + days-before-stale: 60 + days-before-close: 14 + stale-issue-label: 'stale' + stale-pr-label: 'stale' + stale-issue-message: > + This issue has been inactive for 60 days. It will be closed in 14 days + if there is no further activity. If this is still relevant, please comment + to keep it open. + stale-pr-message: > + This PR has been inactive for 60 days. It will be closed in 14 days + if there is no further activity. If you are still working on this, + please push an update or comment to keep it open. + close-issue-message: > + Closed due to inactivity. Feel free to reopen if this is still relevant. + close-pr-message: > + Closed due to inactivity. Feel free to reopen if you'd like to continue this work. + exempt-issue-labels: 'bug,enhancement,help-wanted' + exempt-pr-labels: 'help-wanted' + operations-per-run: 50 diff --git a/scripts/issue_bot/config.py b/scripts/issue_bot/config.py index b090477..5fff510 100644 --- a/scripts/issue_bot/config.py +++ b/scripts/issue_bot/config.py @@ -17,6 +17,8 @@ def __init__(self): sys.exit(1) self.repo = _require("GITHUB_REPOSITORY") self.actor = os.getenv("GITHUB_ACTOR", "") + self.event_before = os.getenv("EVENT_BEFORE", "") + self.event_after = os.getenv("EVENT_AFTER", "") self.bedrock_model_id = os.getenv("BEDROCK_MODEL_ID", "us.anthropic.claude-opus-4-6-v1") @@ -32,6 +34,8 @@ def __init__(self): self.enable_repo_search = os.getenv("ENABLE_REPO_SEARCH", "true").lower() == "true" self.upstream_repo = os.getenv("UPSTREAM_REPO", "awslabs/python-deequ") + self.codebase_src_dir = os.getenv("CODEBASE_SRC_DIR", "pydeequ") + self.codebase_file_ext = os.getenv("CODEBASE_FILE_EXT", ".py") self.bedrock_timeout = 240 self.max_context_chars = 800000 @@ -39,7 +43,7 @@ def __init__(self): self.github_api_timeout = 10 self.allowed_labels = { "bug", "enhancement", "question", "documentation", - "help-wanted", "analyzer", "check", "spark-compatibility", "installation", + "help wanted", "python", } diff --git a/scripts/issue_bot/github_client.py b/scripts/issue_bot/github_client.py index d195bb5..5c2d046 100644 --- a/scripts/issue_bot/github_client.py +++ b/scripts/issue_bot/github_client.py @@ -11,6 +11,7 @@ def __init__(self, cfg): self._repo = cfg.repo self._timeout = cfg.github_api_timeout self._dry_run = cfg.dry_run + self._cfg = cfg self._repo_root = os.getenv("GITHUB_WORKSPACE", os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) self._headers = { "Authorization": f"token {self._token}", @@ -48,6 +49,59 @@ def get_pr_diff(self, number): logger.error(f"PR diff fetch failed: {e}") return "" + def get_compare_diff(self, base_sha, head_sha): + """Fetch the diff between two commits using the Compare API. + Returns the diff text, or empty string on failure (e.g. force-push + where base_sha no longer exists).""" + headers = {**self._headers, "Accept": "application/vnd.github.v3.diff"} + try: + resp = requests.get( + f"https://api.github.com/repos/{self._repo}/compare/{base_sha}...{head_sha}", + headers=headers, timeout=self._timeout, + ) + if resp.status_code == 200: + return resp.text + logger.warning(f"Compare API {base_sha[:7]}...{head_sha[:7]}: {resp.status_code}") + return "" + except Exception as e: + logger.error(f"Compare diff failed: {e}") + return "" + + def get_ci_status(self, sha): + """Check commit statuses and check runs. Returns (passed, summary). + passed: True (all green), False (something failed), None (pending/unknown).""" + status = self._get(f"/repos/{self._repo}/commits/{sha}/status") + if status is None: + return None, "CI status unavailable" + combined_state = status.get("state", "pending") + + check_data = self._get(f"/repos/{self._repo}/commits/{sha}/check-runs") + runs = check_data.get("check_runs", []) if check_data else [] + + def _is_own_check(name): + lower = name.lower() + return "bot" in lower and ("analyze" in lower or "/ act" in lower) + + external_runs = [r for r in runs if not _is_own_check(r.get("name", ""))] + + failed = [] + pending = [] + for r in external_runs: + if r.get("status") != "completed": + pending.append(r["name"]) + elif r.get("conclusion") not in ("success", "neutral", "skipped"): + failed.append(r["name"]) + + if failed: + return False, f"CI failing: {', '.join(failed)}" + if pending: + return None, f"CI pending: {', '.join(pending)}" + if combined_state == "failure": + return False, "CI failing (status checks)" + if combined_state == "pending": + return None, "CI pending (status checks)" + return True, "CI passed" + def get_pr_files(self, number): return self._get(f"/repos/{self._repo}/pulls/{number}/files") or [] @@ -64,16 +118,18 @@ def get_pr_review_comments(self, number, max_pages=10): page += 1 return comments - def get_codebase_map(self, src_dir="pydeequ"): - """List all Python source files (excluding tests) as relative paths.""" + def get_codebase_map(self): + """List source files (excluding tests) as relative paths.""" + src_dir = self._cfg.codebase_src_dir + file_ext = self._cfg.codebase_file_ext full_dir = os.path.join(self._repo_root, src_dir) prefix = self._repo_root.rstrip("/") + "/" try: paths = [] for root, dirs, files in os.walk(full_dir): - dirs[:] = [d for d in dirs if d not in ("tests", "__pycache__", ".git")] + dirs[:] = [d for d in dirs if d not in ("examples", "__pycache__", ".git", "tests", "test")] for f in files: - if f.endswith(".py"): + if f.endswith(file_ext): full = os.path.join(root, f) rel = full[len(prefix):] if full.startswith(prefix) else full paths.append(rel) @@ -116,9 +172,9 @@ def post_comment(self, number, body): return True return self._post(f"/repos/{self._repo}/issues/{number}/comments", {"body": body}) - def post_pr_review(self, number, summary, inline_comments): + def post_pr_review(self, number, summary, inline_comments, event="COMMENT"): if self._dry_run: - logger.info(f"[DRY RUN] PR review on #{number}: {len(inline_comments)} inline comments") + logger.info(f"[DRY RUN] PR review on #{number}: {len(inline_comments)} inline comments, event={event}") return True # Get valid diff lines per file from the PR @@ -134,32 +190,34 @@ def post_pr_review(self, number, summary, inline_comments): else: invalid_comments.append(ic) + body = summary + if invalid_comments: + body += "\n\n**Additional feedback:**\n" + for ic in invalid_comments: + line_ref = f":{ic['line']}" if ic.get('line') else "" + body += f"\n`{ic['file']}{line_ref}` — {ic['comment']}\n" + + payload = {"body": body, "event": event} if valid_comments: - body = summary - if invalid_comments: - body += "\n\n**Additional feedback:**\n" - for ic in invalid_comments: - line_ref = f":{ic['line']}" if ic.get('line') else "" - body += f"\n`{ic['file']}{line_ref}` — {ic['comment']}\n" - payload = {"body": body, "event": "REQUEST_CHANGES", "comments": valid_comments} - try: - resp = requests.post( - f"https://api.github.com/repos/{self._repo}/pulls/{number}/reviews", - headers=self._headers, json=payload, timeout=self._timeout, - ) - if resp.status_code in (200, 201): - return True - logger.error(f"PR review API failed: {resp.status_code}, falling back to comment") - logger.error(f"Response: {resp.text[:500]}") - except Exception as e: - logger.error(f"PR review API failed: {e}, falling back to comment") - - # Fallback: post all as regular comment - all_comments = inline_comments + payload["comments"] = valid_comments + + try: + resp = requests.post( + f"https://api.github.com/repos/{self._repo}/pulls/{number}/reviews", + headers=self._headers, json=payload, timeout=self._timeout, + ) + if resp.status_code in (200, 201): + return True + logger.error(f"PR review API failed: {resp.status_code}, falling back to comment") + logger.error(f"Response: {resp.text[:500]}") + except Exception as e: + logger.error(f"PR review API failed: {e}, falling back to comment") + + # Fallback: post as regular comment if review API fails body = summary - if all_comments: + if inline_comments: body += "\n\n**Inline feedback:**\n" - for ic in all_comments: + for ic in inline_comments: line_ref = f":{ic['line']}" if ic.get('line') else "" body += f"\n`{ic['file']}{line_ref}` — {ic['comment']}\n" return self._post(f"/repos/{self._repo}/issues/{number}/comments", {"body": body}) diff --git a/scripts/issue_bot/main.py b/scripts/issue_bot/main.py index eae12e1..593eaa8 100644 --- a/scripts/issue_bot/main.py +++ b/scripts/issue_bot/main.py @@ -6,6 +6,7 @@ """ import json +import re import sys import os import datetime @@ -149,25 +150,52 @@ def analyze(): diff = gh.get_pr_diff(number) review_comments = gh.get_pr_review_comments(number) existing_feedback = _format_pr_feedback(comments_data, review_comments) - # Fetch full source files modified in the PR for complete context + + # Incremental review: on synchronize, compute what changed since last push + incremental_diff = "" + incremental_files = set() + if is_pr_update and cfg.event_before and cfg.event_after: + incremental_diff = gh.get_compare_diff(cfg.event_before, cfg.event_after) + if incremental_diff: + incremental_files = _extract_diff_files(incremental_diff) + + # Fetch full source files at the SHA the diff is anchored to + head_sha = cfg.event_after or item.get("head", {}).get("sha", "") pr_files = gh.get_pr_files(number) full_sources = "" for pf in pr_files: fname = pf.get("filename", "") - content = gh.get_file_content(fname) + content = gh.get_file_content(fname, ref=head_sha) if head_sha else gh.get_file_content(fname) if content: entry = f"\n### `{fname}`\n```\n{content}\n```\n" if len(full_sources) + len(entry) > 3_000_000: full_sources += f"\n### `{fname}` — SKIPPED (context budget)\n" break full_sources += entry + + # Build incremental review instructions + incremental_section = "" + if incremental_diff: + incremental_section = ( + "\n\n" + "This is a RE-REVIEW after the author pushed new commits. " + "The below shows ONLY what changed since the last push. " + "You MUST limit your comments to lines/files in the incremental diff. " + "Do NOT re-raise issues on unchanged code — the author already saw prior feedback. " + "Do NOT comment on lines that are not part of the incremental diff. " + "If the incremental diff only fixes issues from prior feedback, respond with zero comments." + "\n\n" + f"\n{incremental_diff}\n\n" + ) + # System prompt: instructions + all trusted context (not scanned by guardrail) system_prompt = _render(tmpl, current_date=datetime.date.today().isoformat()) + ( f"\n\n\n{context}\n\n" f"\n{codebase_map}\n\n" f"\n{full_sources}\n\n" f"\n{diff}\n\n" - f"\n{existing_feedback}\n" + f"\n{existing_feedback}\n\n" + f"{incremental_section}" ) # User prompt: only user-authored content (scanned by guardrail) user_prompt = f"\nTitle: {title}\nBody: {body}\n" @@ -185,14 +213,50 @@ def analyze(): inline_comments = pr_result.get("comments", []) except json.JSONDecodeError: inline_comments = _parse_file_review_multi(raw) + + # Hard filter: on incremental review, drop comments on files not in the incremental diff + if incremental_files and inline_comments: + inline_comments = [ + c for c in inline_comments + if c.get("file", "") in incremental_files + ] + + # Hard filter: drop NITs on re-reviews (code-enforced, not prompt-dependent) + if is_pr_update and inline_comments: + inline_comments = [ + c for c in inline_comments + if c.get("severity", "").upper() != "NIT" + ] + + # Format comments: prepend severity, append evidence as context + for c in inline_comments: + severity = c.get("severity", "") + evidence = c.get("evidence", "") + prefix = f"**{severity}**: " if severity else "" + suffix = "\n\n> " + evidence.replace("\n", "\n> ") if evidence else "" + c["comment"] = prefix + c.get("comment", "") + suffix + + # Check CI status to give accurate signal to human reviewers + ci_passed, ci_summary = gh.get_ci_status(head_sha) if head_sha else (None, "") + + if not inline_comments: + if ci_passed is True: + response = "No issues found. CI is passing.\n" + elif ci_passed is False: + response = f"No code issues found, but {ci_summary}." + else: + response = "No issues found.\n" + else: + response = "" + _write_artifact({ "action": "RESPOND", - "labels": [], "response": "No issues found." if not inline_comments else "", + "labels": [], "response": response, "inline_comments": inline_comments, "title": title, "html_url": html_url, "number": number, - "is_pr": True, "prompt_id": prompts.prompt_version(tmpl), + "is_pr": True, "is_incremental": bool(incremental_diff), + "prompt_id": prompts.prompt_version(tmpl), "model_id": cfg.bedrock_model_id, - "reason": "no_issues_found" if not inline_comments else "", }) return @@ -321,7 +385,11 @@ def act(): sanitized_comments.append({**ic, "comment": safe_comment}) inline_comments = sanitized_comments if is_pr and inline_comments: - gh.post_pr_review(number, response + footer, inline_comments) + gh.post_pr_review(number, response + footer, inline_comments, event="COMMENT") + elif is_pr and response and not inline_comments: + gh.post_pr_review(number, response + footer, [], event="COMMENT") + elif not response and not inline_comments: + logger.info(f"Skip #{number}: nothing to post after sanitization") else: gh.post_comment(number, response + footer) gh.add_labels(number, labels) @@ -367,7 +435,7 @@ def act(): elif action == "CLOSE" and not is_pr: msg = ( - "This issue may not be related to the PyDeequ data quality library. " + "This issue may not be related to the PyDeequ library. " "The maintainer team has been notified and will review." + footer ) gh.post_comment(number, msg) @@ -597,6 +665,16 @@ def _format_pr_feedback(issue_comments, review_comments): return "\n".join(parts) if parts else "(no existing feedback)" +def _extract_diff_files(diff_text): + """Extract the set of file paths touched in a unified diff.""" + files = set() + for line in diff_text.split("\n"): + m = re.match(r'^diff --git a/.+ b/(.+)$', line) + if m: + files.add(m.group(1)) + return files + + def _read_requested_files(gh, file_paths, cfg): snippets = [] for path in file_paths[:cfg.max_github_search_results]: diff --git a/scripts/issue_bot/prompts.py b/scripts/issue_bot/prompts.py index e2ff385..9fc97ea 100644 --- a/scripts/issue_bot/prompts.py +++ b/scripts/issue_bot/prompts.py @@ -1,21 +1,53 @@ import hashlib +import logging import os +import boto3 + +logger = logging.getLogger("issue_bot") + +_sm_client = None + + +def _get_sm_client(): + global _sm_client + if _sm_client is None: + _sm_client = boto3.client("secretsmanager") + return _sm_client + + +def _read_from_sm(secret_id): + if not secret_id: + return "" + try: + resp = _get_sm_client().get_secret_value(SecretId=secret_id) + return resp["SecretString"] + except Exception as e: + logger.error("Failed to read prompt from Secrets Manager: %s", type(e).__name__) + return "" + + +def _get_prompt(env_var, sm_env_var): + val = os.getenv(env_var, "") + if val: + return val + return _read_from_sm(os.getenv(sm_env_var, "")) + def get_issue_prompt(): - return os.getenv("ISSUE_CLASSIFY_PROMPT", "") + return _get_prompt("ISSUE_CLASSIFY_PROMPT", "SM_ISSUE_CLASSIFY_PROMPT") def get_issue_respond_prompt(): - return os.getenv("ISSUE_RESPOND_PROMPT", "") + return _get_prompt("ISSUE_RESPOND_PROMPT", "SM_ISSUE_RESPOND_PROMPT") def get_pr_file_review_prompt(): - return os.getenv("PR_FILE_REVIEW_PROMPT", "") + return _get_prompt("PR_FILE_REVIEW_PROMPT", "SM_PR_FILE_REVIEW_PROMPT") def get_followup_prompt(): - return os.getenv("FOLLOWUP_PROMPT", "") + return _get_prompt("FOLLOWUP_PROMPT", "SM_FOLLOWUP_PROMPT") def prompt_version(template): diff --git a/scripts/issue_bot/schemas/pr_review_response.json b/scripts/issue_bot/schemas/pr_review_response.json index ef22e07..1901a95 100644 --- a/scripts/issue_bot/schemas/pr_review_response.json +++ b/scripts/issue_bot/schemas/pr_review_response.json @@ -12,14 +12,23 @@ "line": { "type": "integer" }, + "severity": { + "type": "string", + "enum": ["BUG", "EDGE_CASE", "MISSING_TEST", "DESIGN", "NIT"] + }, "comment": { "type": "string" + }, + "evidence": { + "type": "string" } }, "required": [ "file", "line", - "comment" + "severity", + "comment", + "evidence" ], "additionalProperties": false } @@ -29,4 +38,4 @@ "comments" ], "additionalProperties": false -} \ No newline at end of file +} diff --git a/tests/test_bot.py b/tests/test_bot.py index 77992dc..90f0f66 100644 --- a/tests/test_bot.py +++ b/tests/test_bot.py @@ -17,6 +17,7 @@ _user_dissatisfied, _clean_response, _render, + _extract_diff_files, ) from issue_bot.sanitizer import sanitize, _fix_accidental_issue_refs @@ -203,6 +204,208 @@ def test_preserves_normal_text(self): assert _clean_response(text) == text +class TestGetCiStatus: + """Tests for GitHubClient.get_ci_status method.""" + + def _make_client(self): + import unittest.mock as mock + with mock.patch.dict(os.environ, { + "GITHUB_TOKEN": "fake", "GITHUB_REPOSITORY": "awslabs/test", + "ISSUE_NUMBER": "1", "EVENT_TYPE": "issues", "EVENT_ACTION": "opened", + "GITHUB_WORKFLOW": "PyDeequ Bot", + }): + from issue_bot.config import Config + from issue_bot.github_client import GitHubClient + cfg = Config() + client = GitHubClient(cfg) + return client + + def test_all_checks_passed(self): + import unittest.mock as mock + client = self._make_client() + client._get = mock.MagicMock(side_effect=[ + {"state": "success"}, # commit status + {"check_runs": [ + {"name": "Java CI", "status": "completed", "conclusion": "success"}, + {"name": "CodeQL", "status": "completed", "conclusion": "success"}, + ]}, + ]) + passed, summary = client.get_ci_status("abc123") + assert passed is True + assert "passed" in summary.lower() + + def test_check_run_failed(self): + import unittest.mock as mock + client = self._make_client() + client._get = mock.MagicMock(side_effect=[ + {"state": "success"}, + {"check_runs": [ + {"name": "Java CI", "status": "completed", "conclusion": "failure"}, + {"name": "CodeQL", "status": "completed", "conclusion": "success"}, + ]}, + ]) + passed, summary = client.get_ci_status("abc123") + assert passed is False + assert "Java CI" in summary + + def test_check_run_pending(self): + import unittest.mock as mock + client = self._make_client() + client._get = mock.MagicMock(side_effect=[ + {"state": "pending"}, + {"check_runs": [ + {"name": "Java CI", "status": "in_progress", "conclusion": None}, + ]}, + ]) + passed, summary = client.get_ci_status("abc123") + assert passed is None + assert "pending" in summary.lower() + + def test_bot_check_filtered_out(self): + import unittest.mock as mock + client = self._make_client() + client._get = mock.MagicMock(side_effect=[ + {"state": "success"}, + {"check_runs": [ + {"name": "Java CI", "status": "completed", "conclusion": "success"}, + {"name": "PyDeequ Bot / analyze", "status": "completed", "conclusion": "success"}, + {"name": "PyDeequ Bot / act", "status": "completed", "conclusion": "success"}, + ]}, + ]) + passed, _ = client.get_ci_status("abc123") + assert passed is True + + def test_non_bot_check_with_bot_in_name_not_filtered(self): + import unittest.mock as mock + client = self._make_client() + client._get = mock.MagicMock(side_effect=[ + {"state": "success"}, + {"check_runs": [ + {"name": "robot-tests", "status": "completed", "conclusion": "failure"}, + ]}, + ]) + passed, _ = client.get_ci_status("abc123") + assert passed is False + + def test_skipped_and_neutral_count_as_passed(self): + import unittest.mock as mock + client = self._make_client() + client._get = mock.MagicMock(side_effect=[ + {"state": "success"}, + {"check_runs": [ + {"name": "Optional Check", "status": "completed", "conclusion": "skipped"}, + {"name": "Info Check", "status": "completed", "conclusion": "neutral"}, + ]}, + ]) + passed, _ = client.get_ci_status("abc123") + assert passed is True + + def test_api_failure_returns_unknown(self): + import unittest.mock as mock + client = self._make_client() + client._get = mock.MagicMock(return_value=None) + passed, summary = client.get_ci_status("abc123") + assert passed is None + + +class TestAutoApproveSignal: + """Tests that bot posts the correct signal for the auto-approve workflow to act on.""" + + def _make_artifact(self, tmp_path, response, inline_comments=None): + artifact = { + "action": "RESPOND", + "labels": [], + "response": response, + "inline_comments": inline_comments or [], + "title": "Fix", "html_url": "https://github.com/x", + "number": 42, "is_pr": True, "is_incremental": False, + "prompt_id": "abc123", "model_id": "test", + } + path = str(tmp_path / "result.json") + with open(path, "w") as f: + json.dump(artifact, f) + return path + + def test_no_issues_posts_pr_review_with_signal(self, tmp_path, monkeypatch): + """Bot posts 'No issues found' as a PR review — auto-approve.yml looks for this in listReviews.""" + import unittest.mock as mock + path = self._make_artifact(tmp_path, response="No issues found. CI is passing.") + monkeypatch.setenv("GITHUB_TOKEN", "fake") + monkeypatch.setenv("GITHUB_REPOSITORY", "awslabs/test") + monkeypatch.setenv("ISSUE_NUMBER", "42") + monkeypatch.setenv("EVENT_TYPE", "pull_request_target") + monkeypatch.setenv("EVENT_ACTION", "opened") + import issue_bot.main as bot_main + monkeypatch.setattr(bot_main, "ARTIFACT_PATH", path) + + with mock.patch("issue_bot.github_client.GitHubClient.post_pr_review") as mock_review, \ + mock.patch("issue_bot.github_client.GitHubClient.post_comment") as mock_comment, \ + mock.patch("issue_bot.github_client.GitHubClient.add_labels"), \ + mock.patch("issue_bot.slack_client.SlackClient.send_escalation"): + mock_review.return_value = True + bot_main.act() + mock_review.assert_called_once() + mock_comment.assert_not_called() + body = mock_review.call_args[0][1] + assert "No issues found" in body + + def test_with_issues_posts_review_not_comment(self, tmp_path, monkeypatch): + """Bot posts inline review when there are issues — no approve signal.""" + import unittest.mock as mock + path = self._make_artifact(tmp_path, response="", + inline_comments=[{"file": "a.py", "line": 1, "comment": "BUG: issue"}]) + monkeypatch.setenv("GITHUB_TOKEN", "fake") + monkeypatch.setenv("GITHUB_REPOSITORY", "awslabs/test") + monkeypatch.setenv("ISSUE_NUMBER", "42") + monkeypatch.setenv("EVENT_TYPE", "pull_request_target") + monkeypatch.setenv("EVENT_ACTION", "opened") + import issue_bot.main as bot_main + monkeypatch.setattr(bot_main, "ARTIFACT_PATH", path) + + with mock.patch("issue_bot.github_client.GitHubClient.post_pr_review") as mock_review, \ + mock.patch("issue_bot.github_client.GitHubClient.post_comment") as mock_comment, \ + mock.patch("issue_bot.github_client.GitHubClient.add_labels"), \ + mock.patch("issue_bot.slack_client.SlackClient.send_escalation"): + mock_review.return_value = True + bot_main.act() + mock_review.assert_called_once() + mock_comment.assert_not_called() + + +class TestPrompts: + def test_env_var_takes_precedence(self, monkeypatch): + monkeypatch.setenv("PR_FILE_REVIEW_PROMPT", "from env") + monkeypatch.setenv("SM_PR_FILE_REVIEW_PROMPT", "deequ-bot/pr-file-review-prompt") + from issue_bot.prompts import get_pr_file_review_prompt + assert get_pr_file_review_prompt() == "from env" + + def test_empty_env_var_falls_through_to_sm(self, monkeypatch): + import unittest.mock as mock + monkeypatch.setenv("PR_FILE_REVIEW_PROMPT", "") + monkeypatch.setenv("SM_PR_FILE_REVIEW_PROMPT", "deequ-bot/pr-file-review-prompt") + with mock.patch("issue_bot.prompts._read_from_sm", return_value="from sm") as m: + from issue_bot.prompts import get_pr_file_review_prompt + result = get_pr_file_review_prompt() + assert result == "from sm" + m.assert_called_once_with("deequ-bot/pr-file-review-prompt") + + def test_no_sm_env_var_returns_empty(self, monkeypatch): + monkeypatch.setenv("PR_FILE_REVIEW_PROMPT", "") + monkeypatch.setenv("SM_PR_FILE_REVIEW_PROMPT", "") + from issue_bot.prompts import get_pr_file_review_prompt + # No env var, no SM secret name → empty string + assert get_pr_file_review_prompt() == "" + + def test_sm_failure_returns_empty(self, monkeypatch): + import unittest.mock as mock + monkeypatch.setenv("FOLLOWUP_PROMPT", "") + monkeypatch.setenv("SM_FOLLOWUP_PROMPT", "deequ-bot/followup-prompt") + with mock.patch("issue_bot.prompts._get_sm_client") as mock_client: + mock_client.return_value.get_secret_value.side_effect = Exception("timeout") + from issue_bot.prompts import get_followup_prompt + assert get_followup_prompt() == "" + + class TestSmoke: def test_main_module_imports(self): from issue_bot import main @@ -320,3 +523,588 @@ def test_no_guardrail_no_config(self): client.invoke("system", "user") kwargs = client._client.converse.call_args[1] assert "guardrailConfig" not in kwargs + + +class TestExtractDiffFiles: + def test_single_file(self): + diff = ( + "diff --git a/src/foo.py b/src/foo.py\n" + "index abc1234..def5678 100644\n" + "--- a/src/foo.py\n" + "+++ b/src/foo.py\n" + "@@ -1,3 +1,4 @@\n" + "+new line\n" + ) + assert _extract_diff_files(diff) == {"src/foo.py"} + + def test_multiple_files(self): + diff = ( + "diff --git a/a.py b/a.py\n" + "--- a/a.py\n" + "+++ b/a.py\n" + "@@ -1 +1 @@\n" + "-old\n" + "+new\n" + "diff --git a/b.py b/b.py\n" + "--- a/b.py\n" + "+++ b/b.py\n" + "@@ -1 +1 @@\n" + "-old\n" + "+new\n" + ) + assert _extract_diff_files(diff) == {"a.py", "b.py"} + + def test_empty_diff(self): + assert _extract_diff_files("") == set() + + def test_renamed_file(self): + diff = "diff --git a/old_name.py b/new_name.py\n" + assert _extract_diff_files(diff) == {"new_name.py"} + + def test_path_with_spaces(self): + diff = "diff --git a/path with spaces/file.py b/path with spaces/file.py\n" + assert _extract_diff_files(diff) == {"path with spaces/file.py"} + + +class TestIncrementalFiltering: + """Test that the incremental file filter drops comments on unrelated files.""" + + def test_comments_filtered_to_incremental_files(self): + incremental_files = {"src/changed.py"} + inline_comments = [ + {"file": "src/changed.py", "line": 10, "comment": "new issue"}, + {"file": "src/untouched.py", "line": 5, "comment": "old issue re-raised"}, + ] + filtered = [c for c in inline_comments if c.get("file", "") in incremental_files] + assert len(filtered) == 1 + assert filtered[0]["file"] == "src/changed.py" + + def test_empty_incremental_files_passes_all(self): + incremental_files = set() + inline_comments = [ + {"file": "src/any.py", "line": 1, "comment": "comment"}, + ] + # When incremental_files is empty (fallback to full review), no filtering + if incremental_files: + filtered = [c for c in inline_comments if c.get("file", "") in incremental_files] + else: + filtered = inline_comments + assert len(filtered) == 1 + + def test_all_comments_filtered_yields_empty(self): + incremental_files = {"src/only_this.py"} + inline_comments = [ + {"file": "src/other.py", "line": 1, "comment": "stale"}, + {"file": "src/another.py", "line": 2, "comment": "stale too"}, + ] + filtered = [c for c in inline_comments if c.get("file", "") in incremental_files] + assert filtered == [] + + +class TestNitFilterAndFormatting: + """Tests for hard NIT filter on re-reviews and evidence formatting.""" + + def test_nits_dropped_on_re_review(self, tmp_path, monkeypatch): + import unittest.mock as mock + monkeypatch.setenv("GITHUB_TOKEN", "fake") + monkeypatch.setenv("GITHUB_REPOSITORY", "awslabs/test") + monkeypatch.setenv("ISSUE_NUMBER", "10") + monkeypatch.setenv("EVENT_TYPE", "pull_request_target") + monkeypatch.setenv("EVENT_ACTION", "synchronize") + monkeypatch.setenv("EVENT_BEFORE", "aaa") + monkeypatch.setenv("EVENT_AFTER", "bbb") + monkeypatch.setenv("KB_S3_BUCKET", "") + monkeypatch.setenv("KB_S3_KEY", "") + monkeypatch.setenv("PR_FILE_REVIEW_PROMPT", "Review. Date: {current_date}") + import issue_bot.main as bot_main + monkeypatch.setattr(bot_main, "ARTIFACT_PATH", str(tmp_path / "result.json")) + + incremental = "diff --git a/f.py b/f.py\n--- a/f.py\n+++ b/f.py\n@@ -1 +1 @@\n-x\n+y\n" + + with mock.patch("issue_bot.github_client.GitHubClient.get_pr") as mock_pr, \ + mock.patch("issue_bot.github_client.GitHubClient.get_comments") as mock_comments, \ + mock.patch("issue_bot.github_client.GitHubClient.get_pr_diff"), \ + mock.patch("issue_bot.github_client.GitHubClient.get_pr_review_comments") as mock_rc, \ + mock.patch("issue_bot.github_client.GitHubClient.get_compare_diff") as mock_compare, \ + mock.patch("issue_bot.github_client.GitHubClient.get_pr_files") as mock_files, \ + mock.patch("issue_bot.github_client.GitHubClient.get_file_content") as mock_content, \ + mock.patch("issue_bot.github_client.GitHubClient.get_codebase_map"), \ + mock.patch("issue_bot.github_client.GitHubClient.get_ci_status") as mock_ci, \ + mock.patch("issue_bot.knowledge_base.KnowledgeBase.load"), \ + mock.patch("issue_bot.knowledge_base.KnowledgeBase.build_context", return_value=""), \ + mock.patch("issue_bot.bedrock_client.BedrockClient.__init__", return_value=None), \ + mock.patch("issue_bot.bedrock_client.BedrockClient.invoke") as mock_bedrock: + + mock_pr.return_value = { + "user": {"login": "dev"}, "title": "Fix", "body": "", + "state": "open", "html_url": "https://github.com/x", + "head": {"sha": "bbb"}, + } + mock_comments.return_value = [{"user": {"login": "github-actions[bot]"}, "body": "prior"}] + mock_rc.return_value = [] + mock_compare.return_value = incremental + mock_files.return_value = [{"filename": "f.py"}] + mock_content.return_value = "content" + mock_ci.return_value = (True, "CI passed") + mock_bedrock.return_value = json.dumps({"comments": [ + {"file": "f.py", "line": 1, "severity": "BUG", "comment": "real bug", + "evidence": "line 1 divides by zero"}, + {"file": "f.py", "line": 1, "severity": "NIT", "comment": "rename var", + "evidence": "x is not descriptive"}, + ]}) + + bot_main.analyze() + + with open(str(tmp_path / "result.json")) as f: + result = json.load(f) + + # NIT should be filtered, BUG should remain + assert result["action"] == "RESPOND" + assert len(result["inline_comments"]) == 1 + assert "real bug" in result["inline_comments"][0]["comment"] + + def test_nits_kept_on_first_review(self, tmp_path, monkeypatch): + import unittest.mock as mock + monkeypatch.setenv("GITHUB_TOKEN", "fake") + monkeypatch.setenv("GITHUB_REPOSITORY", "awslabs/test") + monkeypatch.setenv("ISSUE_NUMBER", "10") + monkeypatch.setenv("EVENT_TYPE", "pull_request_target") + monkeypatch.setenv("EVENT_ACTION", "opened") + monkeypatch.setenv("EVENT_BEFORE", "") + monkeypatch.setenv("EVENT_AFTER", "") + monkeypatch.setenv("KB_S3_BUCKET", "") + monkeypatch.setenv("KB_S3_KEY", "") + monkeypatch.setenv("PR_FILE_REVIEW_PROMPT", "Review. Date: {current_date}") + import issue_bot.main as bot_main + monkeypatch.setattr(bot_main, "ARTIFACT_PATH", str(tmp_path / "result.json")) + + with mock.patch("issue_bot.github_client.GitHubClient.get_pr") as mock_pr, \ + mock.patch("issue_bot.github_client.GitHubClient.get_comments") as mock_comments, \ + mock.patch("issue_bot.github_client.GitHubClient.get_pr_diff"), \ + mock.patch("issue_bot.github_client.GitHubClient.get_pr_review_comments") as mock_rc, \ + mock.patch("issue_bot.github_client.GitHubClient.get_pr_files") as mock_files, \ + mock.patch("issue_bot.github_client.GitHubClient.get_file_content") as mock_content, \ + mock.patch("issue_bot.github_client.GitHubClient.get_codebase_map"), \ + mock.patch("issue_bot.github_client.GitHubClient.get_ci_status") as mock_ci, \ + mock.patch("issue_bot.knowledge_base.KnowledgeBase.load"), \ + mock.patch("issue_bot.knowledge_base.KnowledgeBase.build_context", return_value=""), \ + mock.patch("issue_bot.bedrock_client.BedrockClient.__init__", return_value=None), \ + mock.patch("issue_bot.bedrock_client.BedrockClient.invoke") as mock_bedrock: + + mock_pr.return_value = { + "user": {"login": "dev"}, "title": "Fix", "body": "", + "state": "open", "html_url": "https://github.com/x", + "head": {"sha": "abc123"}, + } + mock_comments.return_value = [] + mock_rc.return_value = [] + mock_files.return_value = [{"filename": "f.py"}] + mock_content.return_value = "content" + mock_ci.return_value = (True, "CI passed") + mock_bedrock.return_value = json.dumps({"comments": [ + {"file": "f.py", "line": 1, "severity": "BUG", "comment": "bug", + "evidence": "evidence1"}, + {"file": "f.py", "line": 2, "severity": "NIT", "comment": "nit", + "evidence": "evidence2"}, + ]}) + + bot_main.analyze() + + with open(str(tmp_path / "result.json")) as f: + result = json.load(f) + + # Both BUG and NIT should be present on first review + assert len(result["inline_comments"]) == 2 + + def test_evidence_formatted_in_comment(self, tmp_path, monkeypatch): + import unittest.mock as mock + monkeypatch.setenv("GITHUB_TOKEN", "fake") + monkeypatch.setenv("GITHUB_REPOSITORY", "awslabs/test") + monkeypatch.setenv("ISSUE_NUMBER", "10") + monkeypatch.setenv("EVENT_TYPE", "pull_request_target") + monkeypatch.setenv("EVENT_ACTION", "opened") + monkeypatch.setenv("EVENT_BEFORE", "") + monkeypatch.setenv("EVENT_AFTER", "") + monkeypatch.setenv("KB_S3_BUCKET", "") + monkeypatch.setenv("KB_S3_KEY", "") + monkeypatch.setenv("PR_FILE_REVIEW_PROMPT", "Review. Date: {current_date}") + import issue_bot.main as bot_main + monkeypatch.setattr(bot_main, "ARTIFACT_PATH", str(tmp_path / "result.json")) + + with mock.patch("issue_bot.github_client.GitHubClient.get_pr") as mock_pr, \ + mock.patch("issue_bot.github_client.GitHubClient.get_comments") as mock_comments, \ + mock.patch("issue_bot.github_client.GitHubClient.get_pr_diff"), \ + mock.patch("issue_bot.github_client.GitHubClient.get_pr_review_comments") as mock_rc, \ + mock.patch("issue_bot.github_client.GitHubClient.get_pr_files") as mock_files, \ + mock.patch("issue_bot.github_client.GitHubClient.get_file_content") as mock_content, \ + mock.patch("issue_bot.github_client.GitHubClient.get_codebase_map"), \ + mock.patch("issue_bot.github_client.GitHubClient.get_ci_status") as mock_ci, \ + mock.patch("issue_bot.knowledge_base.KnowledgeBase.load"), \ + mock.patch("issue_bot.knowledge_base.KnowledgeBase.build_context", return_value=""), \ + mock.patch("issue_bot.bedrock_client.BedrockClient.__init__", return_value=None), \ + mock.patch("issue_bot.bedrock_client.BedrockClient.invoke") as mock_bedrock: + + mock_pr.return_value = { + "user": {"login": "dev"}, "title": "Fix", "body": "", + "state": "open", "html_url": "https://github.com/x", + "head": {"sha": "abc123"}, + } + mock_comments.return_value = [] + mock_rc.return_value = [] + mock_files.return_value = [{"filename": "f.py"}] + mock_content.return_value = "content" + mock_ci.return_value = (True, "CI passed") + mock_bedrock.return_value = json.dumps({"comments": [ + {"file": "f.py", "line": 5, "severity": "BUG", + "comment": "division by zero", + "evidence": "line 3 sets count=0, line 5 divides by count"}, + ]}) + + bot_main.analyze() + + with open(str(tmp_path / "result.json")) as f: + result = json.load(f) + + comment_text = result["inline_comments"][0]["comment"] + assert comment_text.startswith("**BUG**: ") + assert "division by zero" in comment_text + assert "line 3 sets count=0" in comment_text + + +class TestIncrementalReviewIntegration: + """End-to-end tests for the incremental review path through analyze().""" + + def _setup_env(self, tmp_path, monkeypatch, event_before="abc123", event_after="def456"): + monkeypatch.setenv("GITHUB_TOKEN", "fake") + monkeypatch.setenv("GITHUB_REPOSITORY", "awslabs/test") + monkeypatch.setenv("ISSUE_NUMBER", "99") + monkeypatch.setenv("EVENT_TYPE", "pull_request_target") + monkeypatch.setenv("EVENT_ACTION", "synchronize") + monkeypatch.setenv("EVENT_BEFORE", event_before) + monkeypatch.setenv("EVENT_AFTER", event_after) + monkeypatch.setenv("GITHUB_ACTOR", "contributor") + monkeypatch.setenv("KB_S3_BUCKET", "") + monkeypatch.setenv("KB_S3_KEY", "") + monkeypatch.setenv("PR_FILE_REVIEW_PROMPT", "Review this PR. Date: {current_date}") + import issue_bot.main as bot_main + monkeypatch.setattr(bot_main, "ARTIFACT_PATH", str(tmp_path / "result.json")) + + def test_incremental_review_filters_stale_comments(self, tmp_path, monkeypatch): + import unittest.mock as mock + self._setup_env(tmp_path, monkeypatch) + + incremental_diff = ( + "diff --git a/src/fixed.py b/src/fixed.py\n" + "--- a/src/fixed.py\n+++ b/src/fixed.py\n" + "@@ -1 +1 @@\n-old\n+new\n" + ) + + with mock.patch("issue_bot.github_client.GitHubClient.get_pr") as mock_pr, \ + mock.patch("issue_bot.github_client.GitHubClient.get_comments") as mock_comments, \ + mock.patch("issue_bot.github_client.GitHubClient.get_pr_diff") as mock_diff, \ + mock.patch("issue_bot.github_client.GitHubClient.get_pr_review_comments") as mock_rc, \ + mock.patch("issue_bot.github_client.GitHubClient.get_compare_diff") as mock_compare, \ + mock.patch("issue_bot.github_client.GitHubClient.get_pr_files") as mock_files, \ + mock.patch("issue_bot.github_client.GitHubClient.get_file_content") as mock_content, \ + mock.patch("issue_bot.github_client.GitHubClient.get_codebase_map") as mock_map, \ + mock.patch("issue_bot.knowledge_base.KnowledgeBase.load"), \ + mock.patch("issue_bot.knowledge_base.KnowledgeBase.build_context") as mock_kb, \ + mock.patch("issue_bot.bedrock_client.BedrockClient.__init__", return_value=None), \ + mock.patch("issue_bot.bedrock_client.BedrockClient.invoke") as mock_bedrock: + + mock_pr.return_value = {"user": {"login": "contributor"}, "title": "Fix bug", + "body": "Fixes the thing", "state": "open", "html_url": "https://github.com/x"} + mock_comments.return_value = [{"user": {"login": "github-actions[bot]"}, "body": "prior review"}] + mock_diff.return_value = "full diff here" + mock_rc.return_value = [] + mock_compare.return_value = incremental_diff + mock_files.return_value = [{"filename": "src/fixed.py"}, {"filename": "src/untouched.py"}] + mock_content.return_value = "file content" + mock_map.return_value = "" + mock_kb.return_value = "" + mock_bedrock.return_value = json.dumps({ + "comments": [ + {"file": "src/fixed.py", "line": 1, "comment": "new issue in changed file"}, + {"file": "src/untouched.py", "line": 5, "comment": "stale comment on unchanged file"}, + ] + }) + + from issue_bot.main import analyze + analyze() + + with open(str(tmp_path / "result.json")) as f: + result = json.load(f) + + assert result["action"] == "RESPOND" + assert result["is_incremental"] is True + assert len(result["inline_comments"]) == 1 + assert result["inline_comments"][0]["file"] == "src/fixed.py" + + def test_force_push_falls_back_to_full_review(self, tmp_path, monkeypatch): + import unittest.mock as mock + self._setup_env(tmp_path, monkeypatch) + + with mock.patch("issue_bot.github_client.GitHubClient.get_pr") as mock_pr, \ + mock.patch("issue_bot.github_client.GitHubClient.get_comments") as mock_comments, \ + mock.patch("issue_bot.github_client.GitHubClient.get_pr_diff") as mock_diff, \ + mock.patch("issue_bot.github_client.GitHubClient.get_pr_review_comments") as mock_rc, \ + mock.patch("issue_bot.github_client.GitHubClient.get_compare_diff") as mock_compare, \ + mock.patch("issue_bot.github_client.GitHubClient.get_pr_files") as mock_files, \ + mock.patch("issue_bot.github_client.GitHubClient.get_file_content") as mock_content, \ + mock.patch("issue_bot.github_client.GitHubClient.get_codebase_map") as mock_map, \ + mock.patch("issue_bot.knowledge_base.KnowledgeBase.load"), \ + mock.patch("issue_bot.knowledge_base.KnowledgeBase.build_context") as mock_kb, \ + mock.patch("issue_bot.bedrock_client.BedrockClient.__init__", return_value=None), \ + mock.patch("issue_bot.bedrock_client.BedrockClient.invoke") as mock_bedrock: + + mock_pr.return_value = {"user": {"login": "contributor"}, "title": "Fix bug", + "body": "Fixes the thing", "state": "open", "html_url": "https://github.com/x"} + mock_comments.return_value = [{"user": {"login": "github-actions[bot]"}, "body": "prior review"}] + mock_diff.return_value = "full diff here" + mock_rc.return_value = [] + mock_compare.return_value = "" # Force push — compare fails + mock_files.return_value = [{"filename": "src/a.py"}] + mock_content.return_value = "content" + mock_map.return_value = "" + mock_kb.return_value = "" + mock_bedrock.return_value = json.dumps({ + "comments": [ + {"file": "src/a.py", "line": 1, "comment": "issue found"}, + ] + }) + + from issue_bot.main import analyze + analyze() + + with open(str(tmp_path / "result.json")) as f: + result = json.load(f) + + # Falls back to full review — no filtering, not marked incremental + assert result["action"] == "RESPOND" + assert result["is_incremental"] is False + assert len(result["inline_comments"]) == 1 + + def test_no_before_sha_skips_incremental(self, tmp_path, monkeypatch): + import unittest.mock as mock + self._setup_env(tmp_path, monkeypatch, event_before="", event_after="def456") + + with mock.patch("issue_bot.github_client.GitHubClient.get_pr") as mock_pr, \ + mock.patch("issue_bot.github_client.GitHubClient.get_comments") as mock_comments, \ + mock.patch("issue_bot.github_client.GitHubClient.get_pr_diff") as mock_diff, \ + mock.patch("issue_bot.github_client.GitHubClient.get_pr_review_comments") as mock_rc, \ + mock.patch("issue_bot.github_client.GitHubClient.get_compare_diff") as mock_compare, \ + mock.patch("issue_bot.github_client.GitHubClient.get_pr_files") as mock_files, \ + mock.patch("issue_bot.github_client.GitHubClient.get_file_content") as mock_content, \ + mock.patch("issue_bot.github_client.GitHubClient.get_codebase_map") as mock_map, \ + mock.patch("issue_bot.knowledge_base.KnowledgeBase.load"), \ + mock.patch("issue_bot.knowledge_base.KnowledgeBase.build_context") as mock_kb, \ + mock.patch("issue_bot.bedrock_client.BedrockClient.__init__", return_value=None), \ + mock.patch("issue_bot.bedrock_client.BedrockClient.invoke") as mock_bedrock: + + mock_pr.return_value = {"user": {"login": "contributor"}, "title": "Fix", + "body": "Fix", "state": "open", "html_url": "https://github.com/x"} + mock_comments.return_value = [{"user": {"login": "github-actions[bot]"}, "body": "review"}] + mock_diff.return_value = "full diff" + mock_rc.return_value = [] + mock_compare.return_value = "should not be called" + mock_files.return_value = [{"filename": "src/a.py"}] + mock_content.return_value = "content" + mock_map.return_value = "" + mock_kb.return_value = "" + mock_bedrock.return_value = json.dumps({"comments": [ + {"file": "src/a.py", "line": 1, "comment": "issue"}, + ]}) + + from issue_bot.main import analyze + analyze() + + # Should NOT have called compare because event_before is empty + mock_compare.assert_not_called() + + with open(str(tmp_path / "result.json")) as f: + result = json.load(f) + assert result["is_incremental"] is False + + +class TestFileContentUsesHeadSha: + """Verify get_file_content is called with PR head SHA, not default branch.""" + + def _setup_env(self, tmp_path, monkeypatch): + monkeypatch.setenv("GITHUB_TOKEN", "fake") + monkeypatch.setenv("GITHUB_REPOSITORY", "awslabs/test") + monkeypatch.setenv("ISSUE_NUMBER", "42") + monkeypatch.setenv("EVENT_TYPE", "pull_request_target") + monkeypatch.setenv("EVENT_ACTION", "opened") + monkeypatch.setenv("EVENT_BEFORE", "") + monkeypatch.setenv("EVENT_AFTER", "") + monkeypatch.setenv("GITHUB_ACTOR", "contributor") + monkeypatch.setenv("KB_S3_BUCKET", "") + monkeypatch.setenv("KB_S3_KEY", "") + monkeypatch.setenv("PR_FILE_REVIEW_PROMPT", "Review this PR. Date: {current_date}") + import issue_bot.main as bot_main + monkeypatch.setattr(bot_main, "ARTIFACT_PATH", str(tmp_path / "result.json")) + + def test_file_content_fetched_with_head_sha(self, tmp_path, monkeypatch): + import unittest.mock as mock + self._setup_env(tmp_path, monkeypatch) + + with mock.patch("issue_bot.github_client.GitHubClient.get_pr") as mock_pr, \ + mock.patch("issue_bot.github_client.GitHubClient.get_comments") as mock_comments, \ + mock.patch("issue_bot.github_client.GitHubClient.get_pr_diff") as mock_diff, \ + mock.patch("issue_bot.github_client.GitHubClient.get_pr_review_comments") as mock_rc, \ + mock.patch("issue_bot.github_client.GitHubClient.get_pr_files") as mock_files, \ + mock.patch("issue_bot.github_client.GitHubClient.get_file_content") as mock_content, \ + mock.patch("issue_bot.github_client.GitHubClient.get_codebase_map") as mock_map, \ + mock.patch("issue_bot.knowledge_base.KnowledgeBase.load"), \ + mock.patch("issue_bot.knowledge_base.KnowledgeBase.build_context") as mock_kb, \ + mock.patch("issue_bot.bedrock_client.BedrockClient.__init__", return_value=None), \ + mock.patch("issue_bot.bedrock_client.BedrockClient.invoke") as mock_bedrock: + + mock_pr.return_value = { + "user": {"login": "contributor"}, "title": "Add feature", + "body": "New file", "state": "open", + "html_url": "https://github.com/x", + "head": {"sha": "abc123deadbeef"}, + } + mock_comments.return_value = [] + mock_diff.return_value = "diff content" + mock_rc.return_value = [] + mock_files.return_value = [ + {"filename": "src/new_file.py"}, + {"filename": "src/existing.py"}, + ] + mock_content.return_value = "file content" + mock_map.return_value = "" + mock_kb.return_value = "" + mock_bedrock.return_value = json.dumps({"comments": []}) + + from issue_bot.main import analyze + analyze() + + # Every get_file_content call must include ref=head_sha + for call in mock_content.call_args_list: + args, kwargs = call + assert kwargs.get("ref") == "abc123deadbeef" or \ + (len(args) > 1 and args[1] == "abc123deadbeef"), \ + f"get_file_content called without head SHA: {call}" + + def test_missing_head_sha_falls_back_gracefully(self, tmp_path, monkeypatch): + import unittest.mock as mock + self._setup_env(tmp_path, monkeypatch) + + with mock.patch("issue_bot.github_client.GitHubClient.get_pr") as mock_pr, \ + mock.patch("issue_bot.github_client.GitHubClient.get_comments") as mock_comments, \ + mock.patch("issue_bot.github_client.GitHubClient.get_pr_diff") as mock_diff, \ + mock.patch("issue_bot.github_client.GitHubClient.get_pr_review_comments") as mock_rc, \ + mock.patch("issue_bot.github_client.GitHubClient.get_pr_files") as mock_files, \ + mock.patch("issue_bot.github_client.GitHubClient.get_file_content") as mock_content, \ + mock.patch("issue_bot.github_client.GitHubClient.get_codebase_map") as mock_map, \ + mock.patch("issue_bot.knowledge_base.KnowledgeBase.load"), \ + mock.patch("issue_bot.knowledge_base.KnowledgeBase.build_context") as mock_kb, \ + mock.patch("issue_bot.bedrock_client.BedrockClient.__init__", return_value=None), \ + mock.patch("issue_bot.bedrock_client.BedrockClient.invoke") as mock_bedrock: + + # PR object without head.sha (shouldn't happen, but defensive) + mock_pr.return_value = { + "user": {"login": "contributor"}, "title": "Fix", + "body": "Fix", "state": "open", + "html_url": "https://github.com/x", + } + mock_comments.return_value = [] + mock_diff.return_value = "diff" + mock_rc.return_value = [] + mock_files.return_value = [{"filename": "src/a.py"}] + mock_content.return_value = "content" + mock_map.return_value = "" + mock_kb.return_value = "" + mock_bedrock.return_value = json.dumps({"comments": []}) + + from issue_bot.main import analyze + analyze() + + # Should still work — falls back to no ref (default branch) + with open(str(tmp_path / "result.json")) as f: + result = json.load(f) + # First review with 0 comments → RESPOND with CI-aware message + assert result["action"] == "RESPOND" + assert "No issues found" in result["response"] + + +class TestReviewEventType: + """Verify bot always uses COMMENT event type, never REQUEST_CHANGES.""" + + def _setup_env(self, tmp_path, monkeypatch, event_action="opened"): + monkeypatch.setenv("GITHUB_TOKEN", "fake") + monkeypatch.setenv("GITHUB_REPOSITORY", "awslabs/test") + monkeypatch.setenv("ISSUE_NUMBER", "50") + monkeypatch.setenv("EVENT_TYPE", "pull_request_target") + monkeypatch.setenv("EVENT_ACTION", event_action) + monkeypatch.setenv("EVENT_BEFORE", "aaa111" if event_action == "synchronize" else "") + monkeypatch.setenv("EVENT_AFTER", "bbb222" if event_action == "synchronize" else "") + monkeypatch.setenv("GITHUB_ACTOR", "contributor") + monkeypatch.setenv("KB_S3_BUCKET", "") + monkeypatch.setenv("KB_S3_KEY", "") + monkeypatch.setenv("PR_FILE_REVIEW_PROMPT", "Review. Date: {current_date}") + import issue_bot.main as bot_main + monkeypatch.setattr(bot_main, "ARTIFACT_PATH", str(tmp_path / "result.json")) + + def _run_and_get_artifact(self, tmp_path, monkeypatch, mock, event_action="opened"): + self._setup_env(tmp_path, monkeypatch, event_action=event_action) + + with mock.patch("issue_bot.github_client.GitHubClient.get_pr") as mock_pr, \ + mock.patch("issue_bot.github_client.GitHubClient.get_comments") as mock_comments, \ + mock.patch("issue_bot.github_client.GitHubClient.get_pr_diff") as mock_diff, \ + mock.patch("issue_bot.github_client.GitHubClient.get_pr_review_comments") as mock_rc, \ + mock.patch("issue_bot.github_client.GitHubClient.get_compare_diff") as mock_compare, \ + mock.patch("issue_bot.github_client.GitHubClient.get_pr_files") as mock_files, \ + mock.patch("issue_bot.github_client.GitHubClient.get_file_content") as mock_content, \ + mock.patch("issue_bot.github_client.GitHubClient.get_codebase_map") as mock_map, \ + mock.patch("issue_bot.knowledge_base.KnowledgeBase.load"), \ + mock.patch("issue_bot.knowledge_base.KnowledgeBase.build_context") as mock_kb, \ + mock.patch("issue_bot.bedrock_client.BedrockClient.__init__", return_value=None), \ + mock.patch("issue_bot.bedrock_client.BedrockClient.invoke") as mock_bedrock, \ + mock.patch("issue_bot.github_client.GitHubClient.post_pr_review") as mock_post: + + mock_pr.return_value = { + "user": {"login": "contributor"}, "title": "Fix", + "body": "Fix", "state": "open", + "html_url": "https://github.com/x", + "head": {"sha": "abc123"}, + } + mock_comments.return_value = ( + [{"user": {"login": "github-actions[bot]"}, "body": "prior"}] + if event_action == "synchronize" else [] + ) + mock_diff.return_value = "diff" + mock_rc.return_value = [] + mock_compare.return_value = ( + "diff --git a/f.py b/f.py\n--- a/f.py\n+++ b/f.py\n@@ -1 +1 @@\n-x\n+y\n" + if event_action == "synchronize" else "" + ) + mock_files.return_value = [{"filename": "f.py"}] + mock_content.return_value = "content" + mock_map.return_value = "" + mock_kb.return_value = "" + mock_bedrock.return_value = json.dumps({ + "comments": [{"file": "f.py", "line": 1, "comment": "issue"}] + }) + mock_post.return_value = True + + from issue_bot.main import analyze, act + analyze() + + with open(str(tmp_path / "result.json")) as f: + return json.load(f), mock_post + + def test_first_review_uses_comment_event(self, tmp_path, monkeypatch): + import unittest.mock as mock + result, _ = self._run_and_get_artifact(tmp_path, monkeypatch, mock, "opened") + assert result["action"] == "RESPOND" + assert result.get("is_incremental") is False + assert len(result["inline_comments"]) > 0 + + def test_incremental_review_uses_comment_event(self, tmp_path, monkeypatch): + import unittest.mock as mock + result, _ = self._run_and_get_artifact(tmp_path, monkeypatch, mock, "synchronize") + assert result["action"] == "RESPOND" + assert result.get("is_incremental") is True