diff --git a/.github/workflows/auto-approve.yml b/.github/workflows/auto-approve.yml
new file mode 100644
index 0000000..f24c211
--- /dev/null
+++ b/.github/workflows/auto-approve.yml
@@ -0,0 +1,101 @@
+name: Auto-Approve Clean PRs
+
+on:
+ workflow_run:
+ workflows: [".github/workflows/base.yml", "PyDeequ Bot"]
+ types: [completed]
+
+permissions:
+ pull-requests: write
+ actions: read
+
+jobs:
+ approve:
+ runs-on: ubuntu-latest
+ if: github.event.workflow_run.event == 'pull_request' || github.event.workflow_run.event == 'pull_request_target'
+ timeout-minutes: 2
+
+ steps:
+ - name: Find PR and check both conditions
+ uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
+ with:
+ script: |
+ const sha = context.payload.workflow_run.head_sha;
+ const owner = context.repo.owner;
+ const repo = context.repo.repo;
+
+ // Find the PR for this SHA
+ let prNumber = null;
+ const prs = context.payload.workflow_run.pull_requests;
+ if (prs && prs.length > 0) {
+ prNumber = prs[0].number;
+ } else {
+ const {data: searchResult} = await github.rest.pulls.list({
+ owner, repo, state: 'open', sort: 'updated', direction: 'desc', per_page: 30
+ });
+ const match = searchResult.find(pr => pr.head.sha === sha);
+ if (match) {
+ prNumber = match.number;
+ }
+ }
+
+ if (!prNumber) {
+ core.info(`No open PR found for SHA ${sha}, skipping`);
+ return;
+ }
+
+ core.info(`Found PR #${prNumber} for SHA ${sha}`);
+
+ // Verify the PR head SHA still matches (no new push since trigger)
+ const {data: pr} = await github.rest.pulls.get({
+ owner, repo, pull_number: prNumber
+ });
+ if (pr.head.sha !== sha) {
+ core.info(`PR head ${pr.head.sha} differs from trigger SHA ${sha} — new push arrived, skipping`);
+ return;
+ }
+
+ // Condition 1: CI must have passed for this SHA
+ const {data: workflowRuns} = await github.rest.actions.listWorkflowRunsForRepo({
+ owner, repo, head_sha: sha, status: 'completed'
+ });
+ const ciRun = workflowRuns.workflow_runs.find(r =>
+ r.name === '.github/workflows/base.yml' && r.conclusion === 'success'
+ );
+ if (!ciRun) {
+ core.info(`CI has not passed for SHA ${sha}, skipping`);
+ return;
+ }
+
+ // Condition 2: Bot must have posted a clean review for this SHA
+ const {data: reviews} = await github.rest.pulls.listReviews({
+ owner, repo, pull_number: prNumber
+ });
+
+ const CLEAN_MARKER = 'No issues found';
+
+ const latestBot = reviews
+ .filter(r => r.user && r.user.login === 'github-actions[bot]')
+ .sort((a, b) => new Date(b.submitted_at) - new Date(a.submitted_at))[0];
+
+ if (!latestBot || !(latestBot.body || '').includes(CLEAN_MARKER) || latestBot.commit_id !== sha) {
+ core.info('Bot has not posted a clean review for this SHA, skipping');
+ return;
+ }
+
+ // Both conditions met — check for existing approval to prevent doubles
+ const botApprovals = reviews.filter(r =>
+ r.user && r.user.login === 'github-actions[bot]' && r.state === 'APPROVED' && r.commit_id === sha
+ );
+ if (botApprovals.length > 0) {
+ core.info('Bot already approved this PR, skipping');
+ return;
+ }
+
+ // Approve
+ core.info(`Approving PR #${prNumber}: bot review clean + CI passed for SHA ${sha}`);
+ await github.rest.pulls.createReview({
+ owner, repo, pull_number: prNumber,
+ event: 'APPROVE',
+ body: `No issues found and CI is passing. Auto-approved.\n\n---\n*Generated by AI — human merge required.*`
+ });
diff --git a/.github/workflows/issue-bot.yml b/.github/workflows/issue-bot.yml
index b24a577..96a2056 100644
--- a/.github/workflows/issue-bot.yml
+++ b/.github/workflows/issue-bot.yml
@@ -61,16 +61,20 @@ jobs:
ISSUE_NUMBER: ${{ github.event.issue.number || github.event.pull_request.number || inputs.issue_number }}
EVENT_TYPE: ${{ github.event_name }}
EVENT_ACTION: ${{ github.event.action }}
+ EVENT_BEFORE: ${{ github.event.before }}
+ EVENT_AFTER: ${{ github.event.pull_request.head.sha || github.event.after }}
GITHUB_ACTOR: ${{ github.actor }}
KB_S3_BUCKET: ${{ secrets.KB_S3_BUCKET }}
KB_S3_KEY: ${{ secrets.KB_S3_KEY }}
BEDROCK_MODEL_ID: ${{ secrets.BEDROCK_MODEL_ID }}
GUARDRAIL_ID: ${{ secrets.GUARDRAIL_ID }}
GUARDRAIL_VERSION: ${{ secrets.GUARDRAIL_VERSION }}
- ISSUE_CLASSIFY_PROMPT: ${{ secrets.ISSUE_CLASSIFY_PROMPT }}
- ISSUE_RESPOND_PROMPT: ${{ secrets.ISSUE_RESPOND_PROMPT }}
- PR_FILE_REVIEW_PROMPT: ${{ secrets.PR_FILE_REVIEW_PROMPT }}
- FOLLOWUP_PROMPT: ${{ secrets.FOLLOWUP_PROMPT }}
+ SM_ISSUE_CLASSIFY_PROMPT: pydeequ-bot/issue-classify-prompt
+ SM_ISSUE_RESPOND_PROMPT: pydeequ-bot/issue-respond-prompt
+ SM_PR_FILE_REVIEW_PROMPT: pydeequ-bot/pr-file-review-prompt
+ SM_FOLLOWUP_PROMPT: pydeequ-bot/followup-prompt
+ CODEBASE_SRC_DIR: pydeequ
+ CODEBASE_FILE_EXT: .py
DRY_RUN: ${{ inputs.dry_run || 'false' }}
ARTIFACT_PATH: ${{ runner.temp }}/bot_result.json
run: python -m issue_bot.main analyze
diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml
new file mode 100644
index 0000000..94ae395
--- /dev/null
+++ b/.github/workflows/stale.yml
@@ -0,0 +1,36 @@
+name: Manage Stale Issues and PRs
+
+on:
+ schedule:
+ - cron: '0 9 * * MON'
+ workflow_dispatch:
+
+permissions:
+ issues: write
+ pull-requests: write
+
+jobs:
+ stale:
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/stale@28ca1036281a5e5922ead5184a1bbf96e5fc984e # v9.0.0
+ with:
+ days-before-stale: 60
+ days-before-close: 14
+ stale-issue-label: 'stale'
+ stale-pr-label: 'stale'
+ stale-issue-message: >
+ This issue has been inactive for 60 days. It will be closed in 14 days
+ if there is no further activity. If this is still relevant, please comment
+ to keep it open.
+ stale-pr-message: >
+ This PR has been inactive for 60 days. It will be closed in 14 days
+ if there is no further activity. If you are still working on this,
+ please push an update or comment to keep it open.
+ close-issue-message: >
+ Closed due to inactivity. Feel free to reopen if this is still relevant.
+ close-pr-message: >
+ Closed due to inactivity. Feel free to reopen if you'd like to continue this work.
+ exempt-issue-labels: 'bug,enhancement,help wanted'
+ exempt-pr-labels: 'help wanted'
+ operations-per-run: 50
diff --git a/scripts/issue_bot/config.py b/scripts/issue_bot/config.py
index b6fdb0c..5fff510 100644
--- a/scripts/issue_bot/config.py
+++ b/scripts/issue_bot/config.py
@@ -17,6 +17,8 @@ def __init__(self):
sys.exit(1)
self.repo = _require("GITHUB_REPOSITORY")
self.actor = os.getenv("GITHUB_ACTOR", "")
+ self.event_before = os.getenv("EVENT_BEFORE", "")
+ self.event_after = os.getenv("EVENT_AFTER", "")
self.bedrock_model_id = os.getenv("BEDROCK_MODEL_ID", "us.anthropic.claude-opus-4-6-v1")
@@ -32,14 +34,16 @@ def __init__(self):
self.enable_repo_search = os.getenv("ENABLE_REPO_SEARCH", "true").lower() == "true"
self.upstream_repo = os.getenv("UPSTREAM_REPO", "awslabs/python-deequ")
+ self.codebase_src_dir = os.getenv("CODEBASE_SRC_DIR", "pydeequ")
+ self.codebase_file_ext = os.getenv("CODEBASE_FILE_EXT", ".py")
- self.bedrock_timeout = 120
- self.max_context_chars = 200000
+ self.bedrock_timeout = 240
+ self.max_context_chars = 800000
self.max_github_search_results = 8
self.github_api_timeout = 10
self.allowed_labels = {
"bug", "enhancement", "question", "documentation",
- "help-wanted", "analyzer", "check", "spark-compatibility", "installation",
+ "help wanted", "python",
}
diff --git a/scripts/issue_bot/github_client.py b/scripts/issue_bot/github_client.py
index 82d0ed2..5c2d046 100644
--- a/scripts/issue_bot/github_client.py
+++ b/scripts/issue_bot/github_client.py
@@ -11,6 +11,7 @@ def __init__(self, cfg):
self._repo = cfg.repo
self._timeout = cfg.github_api_timeout
self._dry_run = cfg.dry_run
+ self._cfg = cfg
self._repo_root = os.getenv("GITHUB_WORKSPACE", os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
self._headers = {
"Authorization": f"token {self._token}",
@@ -48,6 +49,59 @@ def get_pr_diff(self, number):
logger.error(f"PR diff fetch failed: {e}")
return ""
+ def get_compare_diff(self, base_sha, head_sha):
+ """Fetch the diff between two commits using the Compare API.
+ Returns the diff text, or empty string on failure (e.g. force-push
+ where base_sha no longer exists)."""
+ headers = {**self._headers, "Accept": "application/vnd.github.v3.diff"}
+ try:
+ resp = requests.get(
+ f"https://api.github.com/repos/{self._repo}/compare/{base_sha}...{head_sha}",
+ headers=headers, timeout=self._timeout,
+ )
+ if resp.status_code == 200:
+ return resp.text
+ logger.warning(f"Compare API {base_sha[:7]}...{head_sha[:7]}: {resp.status_code}")
+ return ""
+ except Exception as e:
+ logger.error(f"Compare diff failed: {e}")
+ return ""
+
+ def get_ci_status(self, sha):
+ """Check commit statuses and check runs. Returns (passed, summary).
+ passed: True (all green), False (something failed), None (pending/unknown)."""
+ status = self._get(f"/repos/{self._repo}/commits/{sha}/status")
+ if status is None:
+ return None, "CI status unavailable"
+ combined_state = status.get("state", "pending")
+
+ check_data = self._get(f"/repos/{self._repo}/commits/{sha}/check-runs")
+ runs = check_data.get("check_runs", []) if check_data else []
+
+ def _is_own_check(name):
+ lower = name.lower()
+ return "bot" in lower and ("analyze" in lower or "/ act" in lower)
+
+ external_runs = [r for r in runs if not _is_own_check(r.get("name", ""))]
+
+ failed = []
+ pending = []
+ for r in external_runs:
+ if r.get("status") != "completed":
+ pending.append(r["name"])
+ elif r.get("conclusion") not in ("success", "neutral", "skipped"):
+ failed.append(r["name"])
+
+ if failed:
+ return False, f"CI failing: {', '.join(failed)}"
+ if pending:
+ return None, f"CI pending: {', '.join(pending)}"
+ if combined_state == "failure":
+ return False, "CI failing (status checks)"
+ if combined_state == "pending":
+ return None, "CI pending (status checks)"
+ return True, "CI passed"
+
def get_pr_files(self, number):
return self._get(f"/repos/{self._repo}/pulls/{number}/files") or []
@@ -64,16 +118,18 @@ def get_pr_review_comments(self, number, max_pages=10):
page += 1
return comments
- def get_codebase_map(self, src_dir="pydeequ"):
- """List all Python source files (excluding tests) as relative paths."""
+ def get_codebase_map(self):
+ """List source files (excluding tests) as relative paths."""
+ src_dir = self._cfg.codebase_src_dir
+ file_ext = self._cfg.codebase_file_ext
full_dir = os.path.join(self._repo_root, src_dir)
prefix = self._repo_root.rstrip("/") + "/"
try:
paths = []
for root, dirs, files in os.walk(full_dir):
- dirs[:] = [d for d in dirs if d not in ("tests", "__pycache__", ".git")]
+ dirs[:] = [d for d in dirs if d not in ("examples", "__pycache__", ".git", "tests", "test")]
for f in files:
- if f.endswith(".py"):
+ if f.endswith(file_ext):
full = os.path.join(root, f)
rel = full[len(prefix):] if full.startswith(prefix) else full
paths.append(rel)
@@ -116,9 +172,9 @@ def post_comment(self, number, body):
return True
return self._post(f"/repos/{self._repo}/issues/{number}/comments", {"body": body})
- def post_pr_review(self, number, summary, inline_comments):
+ def post_pr_review(self, number, summary, inline_comments, event="COMMENT"):
if self._dry_run:
- logger.info(f"[DRY RUN] PR review on #{number}: {len(inline_comments)} inline comments")
+ logger.info(f"[DRY RUN] PR review on #{number}: {len(inline_comments)} inline comments, event={event}")
return True
# Get valid diff lines per file from the PR
@@ -134,31 +190,34 @@ def post_pr_review(self, number, summary, inline_comments):
else:
invalid_comments.append(ic)
+ body = summary
+ if invalid_comments:
+ body += "\n\n**Additional feedback:**\n"
+ for ic in invalid_comments:
+ line_ref = f":{ic['line']}" if ic.get('line') else ""
+ body += f"\n`{ic['file']}{line_ref}` — {ic['comment']}\n"
+
+ payload = {"body": body, "event": event}
if valid_comments:
- body = summary
- if invalid_comments:
- body += "\n\n**Additional feedback:**\n"
- for ic in invalid_comments:
- line_ref = f":{ic['line']}" if ic.get('line') else ""
- body += f"\n`{ic['file']}{line_ref}` — {ic['comment']}\n"
- payload = {"body": body, "event": "REQUEST_CHANGES", "comments": valid_comments}
- try:
- resp = requests.post(
- f"https://api.github.com/repos/{self._repo}/pulls/{number}/reviews",
- headers=self._headers, json=payload, timeout=self._timeout,
- )
- if resp.status_code in (200, 201):
- return True
- logger.error(f"PR review API failed: {resp.status_code}, falling back to comment")
- except Exception as e:
- logger.error(f"PR review API failed: {e}, falling back to comment")
-
- # Fallback: post all as regular comment
- all_comments = inline_comments
+ payload["comments"] = valid_comments
+
+ try:
+ resp = requests.post(
+ f"https://api.github.com/repos/{self._repo}/pulls/{number}/reviews",
+ headers=self._headers, json=payload, timeout=self._timeout,
+ )
+ if resp.status_code in (200, 201):
+ return True
+ logger.error(f"PR review API failed: {resp.status_code}, falling back to comment")
+ logger.error(f"Response: {resp.text[:500]}")
+ except Exception as e:
+ logger.error(f"PR review API failed: {e}, falling back to comment")
+
+ # Fallback: post as regular comment if review API fails
body = summary
- if all_comments:
+ if inline_comments:
body += "\n\n**Inline feedback:**\n"
- for ic in all_comments:
+ for ic in inline_comments:
line_ref = f":{ic['line']}" if ic.get('line') else ""
body += f"\n`{ic['file']}{line_ref}` — {ic['comment']}\n"
return self._post(f"/repos/{self._repo}/issues/{number}/comments", {"body": body})
diff --git a/scripts/issue_bot/main.py b/scripts/issue_bot/main.py
index 77f5ef7..593eaa8 100644
--- a/scripts/issue_bot/main.py
+++ b/scripts/issue_bot/main.py
@@ -6,6 +6,7 @@
"""
import json
+import re
import sys
import os
import datetime
@@ -149,17 +150,57 @@ def analyze():
diff = gh.get_pr_diff(number)
review_comments = gh.get_pr_review_comments(number)
existing_feedback = _format_pr_feedback(comments_data, review_comments)
+
+ # Incremental review: on synchronize, compute what changed since last push
+ incremental_diff = ""
+ incremental_files = set()
+ if is_pr_update and cfg.event_before and cfg.event_after:
+ incremental_diff = gh.get_compare_diff(cfg.event_before, cfg.event_after)
+ if incremental_diff:
+ incremental_files = _extract_diff_files(incremental_diff)
+
+ # Fetch full source files at the SHA the diff is anchored to
+ head_sha = cfg.event_after or item.get("head", {}).get("sha", "")
+ pr_files = gh.get_pr_files(number)
+ full_sources = ""
+ for pf in pr_files:
+ fname = pf.get("filename", "")
+ content = gh.get_file_content(fname, ref=head_sha) if head_sha else gh.get_file_content(fname)
+ if content:
+ entry = f"\n### `{fname}`\n```\n{content}\n```\n"
+ if len(full_sources) + len(entry) > 3_000_000:
+ full_sources += f"\n### `{fname}` — SKIPPED (context budget)\n"
+ break
+ full_sources += entry
+
+ # Build incremental review instructions
+ incremental_section = ""
+ if incremental_diff:
+ incremental_section = (
+ "\n\n"
+ "This is a RE-REVIEW after the author pushed new commits. "
+ "The below shows ONLY what changed since the last push. "
+ "You MUST limit your comments to lines/files in the incremental diff. "
+ "Do NOT re-raise issues on unchanged code — the author already saw prior feedback. "
+ "Do NOT comment on lines that are not part of the incremental diff. "
+ "If the incremental diff only fixes issues from prior feedback, respond with zero comments."
+ "\n\n"
+ f"\n{incremental_diff}\n\n"
+ )
+
# System prompt: instructions + all trusted context (not scanned by guardrail)
system_prompt = _render(tmpl, current_date=datetime.date.today().isoformat()) + (
f"\n\n\n{context}\n\n"
f"\n{codebase_map}\n\n"
+ f"\n{full_sources}\n\n"
f"\n{diff}\n\n"
- f"\n{existing_feedback}\n"
+ f"\n{existing_feedback}\n\n"
+ f"{incremental_section}"
)
# User prompt: only user-authored content (scanned by guardrail)
user_prompt = f"\nTitle: {title}\nBody: {body}\n"
raw = bedrock.invoke(system_prompt, user_prompt,
- max_tokens=4000, json_schema=PR_REVIEW_SCHEMA)
+ max_tokens=8000, json_schema=PR_REVIEW_SCHEMA)
if raw is None:
_write_artifact({
"action": "ESCALATE", "reason": "bedrock_unavailable", "title": title,
@@ -172,14 +213,50 @@ def analyze():
inline_comments = pr_result.get("comments", [])
except json.JSONDecodeError:
inline_comments = _parse_file_review_multi(raw)
+
+ # Hard filter: on incremental review, drop comments on files not in the incremental diff
+ if incremental_files and inline_comments:
+ inline_comments = [
+ c for c in inline_comments
+ if c.get("file", "") in incremental_files
+ ]
+
+ # Hard filter: drop NITs on re-reviews (code-enforced, not prompt-dependent)
+ if is_pr_update and inline_comments:
+ inline_comments = [
+ c for c in inline_comments
+ if c.get("severity", "").upper() != "NIT"
+ ]
+
+ # Format comments: prepend severity, append evidence as context
+ for c in inline_comments:
+ severity = c.get("severity", "")
+ evidence = c.get("evidence", "")
+ prefix = f"**{severity}**: " if severity else ""
+ suffix = "\n\n> " + evidence.replace("\n", "\n> ") if evidence else ""
+ c["comment"] = prefix + c.get("comment", "") + suffix
+
+ # Check CI status to give accurate signal to human reviewers
+ ci_passed, ci_summary = gh.get_ci_status(head_sha) if head_sha else (None, "")
+
+ if not inline_comments:
+ if ci_passed is True:
+ response = "No issues found. CI is passing.\n"
+ elif ci_passed is False:
+ response = f"No code issues found, but {ci_summary}."
+ else:
+ response = "No issues found.\n"
+ else:
+ response = ""
+
_write_artifact({
"action": "RESPOND",
- "labels": [], "response": "No issues found." if not inline_comments else "",
+ "labels": [], "response": response,
"inline_comments": inline_comments,
"title": title, "html_url": html_url, "number": number,
- "is_pr": True, "prompt_id": prompts.prompt_version(tmpl),
+ "is_pr": True, "is_incremental": bool(incremental_diff),
+ "prompt_id": prompts.prompt_version(tmpl),
"model_id": cfg.bedrock_model_id,
- "reason": "no_issues_found" if not inline_comments else "",
})
return
@@ -308,7 +385,11 @@ def act():
sanitized_comments.append({**ic, "comment": safe_comment})
inline_comments = sanitized_comments
if is_pr and inline_comments:
- gh.post_pr_review(number, response + footer, inline_comments)
+ gh.post_pr_review(number, response + footer, inline_comments, event="COMMENT")
+ elif is_pr and response and not inline_comments:
+ gh.post_pr_review(number, response + footer, [], event="COMMENT")
+ elif not response and not inline_comments:
+ logger.info(f"Skip #{number}: nothing to post after sanitization")
else:
gh.post_comment(number, response + footer)
gh.add_labels(number, labels)
@@ -354,7 +435,7 @@ def act():
elif action == "CLOSE" and not is_pr:
msg = (
- "This issue may not be related to the PyDeequ data quality library. "
+ "This issue may not be related to the PyDeequ library. "
"The maintainer team has been notified and will review." + footer
)
gh.post_comment(number, msg)
@@ -584,6 +665,16 @@ def _format_pr_feedback(issue_comments, review_comments):
return "\n".join(parts) if parts else "(no existing feedback)"
+def _extract_diff_files(diff_text):
+ """Extract the set of file paths touched in a unified diff."""
+ files = set()
+ for line in diff_text.split("\n"):
+ m = re.match(r'^diff --git a/.+ b/(.+)$', line)
+ if m:
+ files.add(m.group(1))
+ return files
+
+
def _read_requested_files(gh, file_paths, cfg):
snippets = []
for path in file_paths[:cfg.max_github_search_results]:
diff --git a/scripts/issue_bot/prompts.py b/scripts/issue_bot/prompts.py
index e2ff385..9fc97ea 100644
--- a/scripts/issue_bot/prompts.py
+++ b/scripts/issue_bot/prompts.py
@@ -1,21 +1,53 @@
import hashlib
+import logging
import os
+import boto3
+
+logger = logging.getLogger("issue_bot")
+
+_sm_client = None
+
+
+def _get_sm_client():
+ global _sm_client
+ if _sm_client is None:
+ _sm_client = boto3.client("secretsmanager")
+ return _sm_client
+
+
+def _read_from_sm(secret_id):
+ if not secret_id:
+ return ""
+ try:
+ resp = _get_sm_client().get_secret_value(SecretId=secret_id)
+ return resp["SecretString"]
+ except Exception as e:
+ logger.error("Failed to read prompt from Secrets Manager: %s", type(e).__name__)
+ return ""
+
+
+def _get_prompt(env_var, sm_env_var):
+ val = os.getenv(env_var, "")
+ if val:
+ return val
+ return _read_from_sm(os.getenv(sm_env_var, ""))
+
def get_issue_prompt():
- return os.getenv("ISSUE_CLASSIFY_PROMPT", "")
+ return _get_prompt("ISSUE_CLASSIFY_PROMPT", "SM_ISSUE_CLASSIFY_PROMPT")
def get_issue_respond_prompt():
- return os.getenv("ISSUE_RESPOND_PROMPT", "")
+ return _get_prompt("ISSUE_RESPOND_PROMPT", "SM_ISSUE_RESPOND_PROMPT")
def get_pr_file_review_prompt():
- return os.getenv("PR_FILE_REVIEW_PROMPT", "")
+ return _get_prompt("PR_FILE_REVIEW_PROMPT", "SM_PR_FILE_REVIEW_PROMPT")
def get_followup_prompt():
- return os.getenv("FOLLOWUP_PROMPT", "")
+ return _get_prompt("FOLLOWUP_PROMPT", "SM_FOLLOWUP_PROMPT")
def prompt_version(template):
diff --git a/scripts/issue_bot/schemas/pr_review_response.json b/scripts/issue_bot/schemas/pr_review_response.json
index ef22e07..1901a95 100644
--- a/scripts/issue_bot/schemas/pr_review_response.json
+++ b/scripts/issue_bot/schemas/pr_review_response.json
@@ -12,14 +12,23 @@
"line": {
"type": "integer"
},
+ "severity": {
+ "type": "string",
+ "enum": ["BUG", "EDGE_CASE", "MISSING_TEST", "DESIGN", "NIT"]
+ },
"comment": {
"type": "string"
+ },
+ "evidence": {
+ "type": "string"
}
},
"required": [
"file",
"line",
- "comment"
+ "severity",
+ "comment",
+ "evidence"
],
"additionalProperties": false
}
@@ -29,4 +38,4 @@
"comments"
],
"additionalProperties": false
-}
\ No newline at end of file
+}
diff --git a/tests/test_bot.py b/tests/test_bot.py
index 77992dc..90f0f66 100644
--- a/tests/test_bot.py
+++ b/tests/test_bot.py
@@ -17,6 +17,7 @@
_user_dissatisfied,
_clean_response,
_render,
+ _extract_diff_files,
)
from issue_bot.sanitizer import sanitize, _fix_accidental_issue_refs
@@ -203,6 +204,208 @@ def test_preserves_normal_text(self):
assert _clean_response(text) == text
+class TestGetCiStatus:
+ """Tests for GitHubClient.get_ci_status method."""
+
+ def _make_client(self):
+ import unittest.mock as mock
+ with mock.patch.dict(os.environ, {
+ "GITHUB_TOKEN": "fake", "GITHUB_REPOSITORY": "awslabs/test",
+ "ISSUE_NUMBER": "1", "EVENT_TYPE": "issues", "EVENT_ACTION": "opened",
+ "GITHUB_WORKFLOW": "PyDeequ Bot",
+ }):
+ from issue_bot.config import Config
+ from issue_bot.github_client import GitHubClient
+ cfg = Config()
+ client = GitHubClient(cfg)
+ return client
+
+ def test_all_checks_passed(self):
+ import unittest.mock as mock
+ client = self._make_client()
+ client._get = mock.MagicMock(side_effect=[
+ {"state": "success"}, # commit status
+ {"check_runs": [
+ {"name": "Java CI", "status": "completed", "conclusion": "success"},
+ {"name": "CodeQL", "status": "completed", "conclusion": "success"},
+ ]},
+ ])
+ passed, summary = client.get_ci_status("abc123")
+ assert passed is True
+ assert "passed" in summary.lower()
+
+ def test_check_run_failed(self):
+ import unittest.mock as mock
+ client = self._make_client()
+ client._get = mock.MagicMock(side_effect=[
+ {"state": "success"},
+ {"check_runs": [
+ {"name": "Java CI", "status": "completed", "conclusion": "failure"},
+ {"name": "CodeQL", "status": "completed", "conclusion": "success"},
+ ]},
+ ])
+ passed, summary = client.get_ci_status("abc123")
+ assert passed is False
+ assert "Java CI" in summary
+
+ def test_check_run_pending(self):
+ import unittest.mock as mock
+ client = self._make_client()
+ client._get = mock.MagicMock(side_effect=[
+ {"state": "pending"},
+ {"check_runs": [
+ {"name": "Java CI", "status": "in_progress", "conclusion": None},
+ ]},
+ ])
+ passed, summary = client.get_ci_status("abc123")
+ assert passed is None
+ assert "pending" in summary.lower()
+
+ def test_bot_check_filtered_out(self):
+ import unittest.mock as mock
+ client = self._make_client()
+ client._get = mock.MagicMock(side_effect=[
+ {"state": "success"},
+ {"check_runs": [
+ {"name": "Java CI", "status": "completed", "conclusion": "success"},
+ {"name": "PyDeequ Bot / analyze", "status": "completed", "conclusion": "success"},
+ {"name": "PyDeequ Bot / act", "status": "completed", "conclusion": "success"},
+ ]},
+ ])
+ passed, _ = client.get_ci_status("abc123")
+ assert passed is True
+
+ def test_non_bot_check_with_bot_in_name_not_filtered(self):
+ import unittest.mock as mock
+ client = self._make_client()
+ client._get = mock.MagicMock(side_effect=[
+ {"state": "success"},
+ {"check_runs": [
+ {"name": "robot-tests", "status": "completed", "conclusion": "failure"},
+ ]},
+ ])
+ passed, _ = client.get_ci_status("abc123")
+ assert passed is False
+
+ def test_skipped_and_neutral_count_as_passed(self):
+ import unittest.mock as mock
+ client = self._make_client()
+ client._get = mock.MagicMock(side_effect=[
+ {"state": "success"},
+ {"check_runs": [
+ {"name": "Optional Check", "status": "completed", "conclusion": "skipped"},
+ {"name": "Info Check", "status": "completed", "conclusion": "neutral"},
+ ]},
+ ])
+ passed, _ = client.get_ci_status("abc123")
+ assert passed is True
+
+ def test_api_failure_returns_unknown(self):
+ import unittest.mock as mock
+ client = self._make_client()
+ client._get = mock.MagicMock(return_value=None)
+ passed, summary = client.get_ci_status("abc123")
+ assert passed is None
+
+
+class TestAutoApproveSignal:
+ """Tests that bot posts the correct signal for the auto-approve workflow to act on."""
+
+ def _make_artifact(self, tmp_path, response, inline_comments=None):
+ artifact = {
+ "action": "RESPOND",
+ "labels": [],
+ "response": response,
+ "inline_comments": inline_comments or [],
+ "title": "Fix", "html_url": "https://github.com/x",
+ "number": 42, "is_pr": True, "is_incremental": False,
+ "prompt_id": "abc123", "model_id": "test",
+ }
+ path = str(tmp_path / "result.json")
+ with open(path, "w") as f:
+ json.dump(artifact, f)
+ return path
+
+ def test_no_issues_posts_pr_review_with_signal(self, tmp_path, monkeypatch):
+ """Bot posts 'No issues found' as a PR review — auto-approve.yml looks for this in listReviews."""
+ import unittest.mock as mock
+ path = self._make_artifact(tmp_path, response="No issues found. CI is passing.")
+ monkeypatch.setenv("GITHUB_TOKEN", "fake")
+ monkeypatch.setenv("GITHUB_REPOSITORY", "awslabs/test")
+ monkeypatch.setenv("ISSUE_NUMBER", "42")
+ monkeypatch.setenv("EVENT_TYPE", "pull_request_target")
+ monkeypatch.setenv("EVENT_ACTION", "opened")
+ import issue_bot.main as bot_main
+ monkeypatch.setattr(bot_main, "ARTIFACT_PATH", path)
+
+ with mock.patch("issue_bot.github_client.GitHubClient.post_pr_review") as mock_review, \
+ mock.patch("issue_bot.github_client.GitHubClient.post_comment") as mock_comment, \
+ mock.patch("issue_bot.github_client.GitHubClient.add_labels"), \
+ mock.patch("issue_bot.slack_client.SlackClient.send_escalation"):
+ mock_review.return_value = True
+ bot_main.act()
+ mock_review.assert_called_once()
+ mock_comment.assert_not_called()
+ body = mock_review.call_args[0][1]
+ assert "No issues found" in body
+
+ def test_with_issues_posts_review_not_comment(self, tmp_path, monkeypatch):
+ """Bot posts inline review when there are issues — no approve signal."""
+ import unittest.mock as mock
+ path = self._make_artifact(tmp_path, response="",
+ inline_comments=[{"file": "a.py", "line": 1, "comment": "BUG: issue"}])
+ monkeypatch.setenv("GITHUB_TOKEN", "fake")
+ monkeypatch.setenv("GITHUB_REPOSITORY", "awslabs/test")
+ monkeypatch.setenv("ISSUE_NUMBER", "42")
+ monkeypatch.setenv("EVENT_TYPE", "pull_request_target")
+ monkeypatch.setenv("EVENT_ACTION", "opened")
+ import issue_bot.main as bot_main
+ monkeypatch.setattr(bot_main, "ARTIFACT_PATH", path)
+
+ with mock.patch("issue_bot.github_client.GitHubClient.post_pr_review") as mock_review, \
+ mock.patch("issue_bot.github_client.GitHubClient.post_comment") as mock_comment, \
+ mock.patch("issue_bot.github_client.GitHubClient.add_labels"), \
+ mock.patch("issue_bot.slack_client.SlackClient.send_escalation"):
+ mock_review.return_value = True
+ bot_main.act()
+ mock_review.assert_called_once()
+ mock_comment.assert_not_called()
+
+
+class TestPrompts:
+ def test_env_var_takes_precedence(self, monkeypatch):
+ monkeypatch.setenv("PR_FILE_REVIEW_PROMPT", "from env")
+ monkeypatch.setenv("SM_PR_FILE_REVIEW_PROMPT", "deequ-bot/pr-file-review-prompt")
+ from issue_bot.prompts import get_pr_file_review_prompt
+ assert get_pr_file_review_prompt() == "from env"
+
+ def test_empty_env_var_falls_through_to_sm(self, monkeypatch):
+ import unittest.mock as mock
+ monkeypatch.setenv("PR_FILE_REVIEW_PROMPT", "")
+ monkeypatch.setenv("SM_PR_FILE_REVIEW_PROMPT", "deequ-bot/pr-file-review-prompt")
+ with mock.patch("issue_bot.prompts._read_from_sm", return_value="from sm") as m:
+ from issue_bot.prompts import get_pr_file_review_prompt
+ result = get_pr_file_review_prompt()
+ assert result == "from sm"
+ m.assert_called_once_with("deequ-bot/pr-file-review-prompt")
+
+ def test_no_sm_env_var_returns_empty(self, monkeypatch):
+ monkeypatch.setenv("PR_FILE_REVIEW_PROMPT", "")
+ monkeypatch.setenv("SM_PR_FILE_REVIEW_PROMPT", "")
+ from issue_bot.prompts import get_pr_file_review_prompt
+ # No env var, no SM secret name → empty string
+ assert get_pr_file_review_prompt() == ""
+
+ def test_sm_failure_returns_empty(self, monkeypatch):
+ import unittest.mock as mock
+ monkeypatch.setenv("FOLLOWUP_PROMPT", "")
+ monkeypatch.setenv("SM_FOLLOWUP_PROMPT", "deequ-bot/followup-prompt")
+ with mock.patch("issue_bot.prompts._get_sm_client") as mock_client:
+ mock_client.return_value.get_secret_value.side_effect = Exception("timeout")
+ from issue_bot.prompts import get_followup_prompt
+ assert get_followup_prompt() == ""
+
+
class TestSmoke:
def test_main_module_imports(self):
from issue_bot import main
@@ -320,3 +523,588 @@ def test_no_guardrail_no_config(self):
client.invoke("system", "user")
kwargs = client._client.converse.call_args[1]
assert "guardrailConfig" not in kwargs
+
+
class TestExtractDiffFiles:
    """Unit tests for _extract_diff_files: pulling changed paths out of a unified diff."""

    @staticmethod
    def _diff(*lines):
        # Join diff lines with trailing newlines, mirroring raw git output.
        return "".join(line + "\n" for line in lines)

    def test_single_file(self):
        text = self._diff(
            "diff --git a/src/foo.py b/src/foo.py",
            "index abc1234..def5678 100644",
            "--- a/src/foo.py",
            "+++ b/src/foo.py",
            "@@ -1,3 +1,4 @@",
            "+new line",
        )
        assert _extract_diff_files(text) == {"src/foo.py"}

    def test_multiple_files(self):
        text = self._diff(
            "diff --git a/a.py b/a.py",
            "--- a/a.py",
            "+++ b/a.py",
            "@@ -1 +1 @@",
            "-old",
            "+new",
            "diff --git a/b.py b/b.py",
            "--- a/b.py",
            "+++ b/b.py",
            "@@ -1 +1 @@",
            "-old",
            "+new",
        )
        assert _extract_diff_files(text) == {"a.py", "b.py"}

    def test_empty_diff(self):
        # No headers at all -> no files.
        assert _extract_diff_files("") == set()

    def test_renamed_file(self):
        # On a rename, the post-rename ("b/") path is the one reported.
        text = self._diff("diff --git a/old_name.py b/new_name.py")
        assert _extract_diff_files(text) == {"new_name.py"}

    def test_path_with_spaces(self):
        text = self._diff("diff --git a/path with spaces/file.py b/path with spaces/file.py")
        assert _extract_diff_files(text) == {"path with spaces/file.py"}
+
+
class TestIncrementalFiltering:
    """Test that the incremental file filter drops comments on unrelated files."""

    @staticmethod
    def _apply_filter(comments, changed_files):
        # Mirrors the production rule: an empty set means "full review" and
        # disables filtering; otherwise keep only comments on changed files.
        if not changed_files:
            return comments
        return [c for c in comments if c.get("file", "") in changed_files]

    def test_comments_filtered_to_incremental_files(self):
        comments = [
            {"file": "src/changed.py", "line": 10, "comment": "new issue"},
            {"file": "src/untouched.py", "line": 5, "comment": "old issue re-raised"},
        ]
        kept = self._apply_filter(comments, {"src/changed.py"})
        assert len(kept) == 1
        assert kept[0]["file"] == "src/changed.py"

    def test_empty_incremental_files_passes_all(self):
        comments = [
            {"file": "src/any.py", "line": 1, "comment": "comment"},
        ]
        # Fallback to full review: nothing is filtered out.
        kept = self._apply_filter(comments, set())
        assert len(kept) == 1

    def test_all_comments_filtered_yields_empty(self):
        comments = [
            {"file": "src/other.py", "line": 1, "comment": "stale"},
            {"file": "src/another.py", "line": 2, "comment": "stale too"},
        ]
        assert self._apply_filter(comments, {"src/only_this.py"}) == []
+
+
class TestNitFilterAndFormatting:
    """Tests for hard NIT filter on re-reviews and evidence formatting."""

    def test_nits_dropped_on_re_review(self, tmp_path, monkeypatch):
        # Re-review setup: "synchronize" event with before/after SHAs plus an
        # existing bot comment. Expectation: NIT findings are suppressed while
        # BUG findings survive into the artifact.
        import unittest.mock as mock
        monkeypatch.setenv("GITHUB_TOKEN", "fake")
        monkeypatch.setenv("GITHUB_REPOSITORY", "awslabs/test")
        monkeypatch.setenv("ISSUE_NUMBER", "10")
        monkeypatch.setenv("EVENT_TYPE", "pull_request_target")
        monkeypatch.setenv("EVENT_ACTION", "synchronize")
        monkeypatch.setenv("EVENT_BEFORE", "aaa")
        monkeypatch.setenv("EVENT_AFTER", "bbb")
        monkeypatch.setenv("KB_S3_BUCKET", "")
        monkeypatch.setenv("KB_S3_KEY", "")
        monkeypatch.setenv("PR_FILE_REVIEW_PROMPT", "Review. Date: {current_date}")
        # Import only after env setup — presumably the module reads these
        # variables at import/run time (TODO confirm); artifact goes to tmp.
        import issue_bot.main as bot_main
        monkeypatch.setattr(bot_main, "ARTIFACT_PATH", str(tmp_path / "result.json"))

        # Minimal one-hunk diff representing the incremental push.
        incremental = "diff --git a/f.py b/f.py\n--- a/f.py\n+++ b/f.py\n@@ -1 +1 @@\n-x\n+y\n"

        with mock.patch("issue_bot.github_client.GitHubClient.get_pr") as mock_pr, \
             mock.patch("issue_bot.github_client.GitHubClient.get_comments") as mock_comments, \
             mock.patch("issue_bot.github_client.GitHubClient.get_pr_diff"), \
             mock.patch("issue_bot.github_client.GitHubClient.get_pr_review_comments") as mock_rc, \
             mock.patch("issue_bot.github_client.GitHubClient.get_compare_diff") as mock_compare, \
             mock.patch("issue_bot.github_client.GitHubClient.get_pr_files") as mock_files, \
             mock.patch("issue_bot.github_client.GitHubClient.get_file_content") as mock_content, \
             mock.patch("issue_bot.github_client.GitHubClient.get_codebase_map"), \
             mock.patch("issue_bot.github_client.GitHubClient.get_ci_status") as mock_ci, \
             mock.patch("issue_bot.knowledge_base.KnowledgeBase.load"), \
             mock.patch("issue_bot.knowledge_base.KnowledgeBase.build_context", return_value=""), \
             mock.patch("issue_bot.bedrock_client.BedrockClient.__init__", return_value=None), \
             mock.patch("issue_bot.bedrock_client.BedrockClient.invoke") as mock_bedrock:

            mock_pr.return_value = {
                "user": {"login": "dev"}, "title": "Fix", "body": "",
                "state": "open", "html_url": "https://github.com/x",
                "head": {"sha": "bbb"},
            }
            # The prior bot comment is what marks this run as a re-review.
            mock_comments.return_value = [{"user": {"login": "github-actions[bot]"}, "body": "prior"}]
            mock_rc.return_value = []
            mock_compare.return_value = incremental
            mock_files.return_value = [{"filename": "f.py"}]
            mock_content.return_value = "content"
            mock_ci.return_value = (True, "CI passed")
            # Model output: one BUG and one NIT on the same changed file.
            mock_bedrock.return_value = json.dumps({"comments": [
                {"file": "f.py", "line": 1, "severity": "BUG", "comment": "real bug",
                 "evidence": "line 1 divides by zero"},
                {"file": "f.py", "line": 1, "severity": "NIT", "comment": "rename var",
                 "evidence": "x is not descriptive"},
            ]})

            bot_main.analyze()

        with open(str(tmp_path / "result.json")) as f:
            result = json.load(f)

        # NIT should be filtered, BUG should remain
        assert result["action"] == "RESPOND"
        assert len(result["inline_comments"]) == 1
        assert "real bug" in result["inline_comments"][0]["comment"]

    def test_nits_kept_on_first_review(self, tmp_path, monkeypatch):
        # First review ("opened", no prior bot comments): NITs are NOT filtered.
        import unittest.mock as mock
        monkeypatch.setenv("GITHUB_TOKEN", "fake")
        monkeypatch.setenv("GITHUB_REPOSITORY", "awslabs/test")
        monkeypatch.setenv("ISSUE_NUMBER", "10")
        monkeypatch.setenv("EVENT_TYPE", "pull_request_target")
        monkeypatch.setenv("EVENT_ACTION", "opened")
        monkeypatch.setenv("EVENT_BEFORE", "")
        monkeypatch.setenv("EVENT_AFTER", "")
        monkeypatch.setenv("KB_S3_BUCKET", "")
        monkeypatch.setenv("KB_S3_KEY", "")
        monkeypatch.setenv("PR_FILE_REVIEW_PROMPT", "Review. Date: {current_date}")
        import issue_bot.main as bot_main
        monkeypatch.setattr(bot_main, "ARTIFACT_PATH", str(tmp_path / "result.json"))

        with mock.patch("issue_bot.github_client.GitHubClient.get_pr") as mock_pr, \
             mock.patch("issue_bot.github_client.GitHubClient.get_comments") as mock_comments, \
             mock.patch("issue_bot.github_client.GitHubClient.get_pr_diff"), \
             mock.patch("issue_bot.github_client.GitHubClient.get_pr_review_comments") as mock_rc, \
             mock.patch("issue_bot.github_client.GitHubClient.get_pr_files") as mock_files, \
             mock.patch("issue_bot.github_client.GitHubClient.get_file_content") as mock_content, \
             mock.patch("issue_bot.github_client.GitHubClient.get_codebase_map"), \
             mock.patch("issue_bot.github_client.GitHubClient.get_ci_status") as mock_ci, \
             mock.patch("issue_bot.knowledge_base.KnowledgeBase.load"), \
             mock.patch("issue_bot.knowledge_base.KnowledgeBase.build_context", return_value=""), \
             mock.patch("issue_bot.bedrock_client.BedrockClient.__init__", return_value=None), \
             mock.patch("issue_bot.bedrock_client.BedrockClient.invoke") as mock_bedrock:

            mock_pr.return_value = {
                "user": {"login": "dev"}, "title": "Fix", "body": "",
                "state": "open", "html_url": "https://github.com/x",
                "head": {"sha": "abc123"},
            }
            mock_comments.return_value = []
            mock_rc.return_value = []
            mock_files.return_value = [{"filename": "f.py"}]
            mock_content.return_value = "content"
            mock_ci.return_value = (True, "CI passed")
            mock_bedrock.return_value = json.dumps({"comments": [
                {"file": "f.py", "line": 1, "severity": "BUG", "comment": "bug",
                 "evidence": "evidence1"},
                {"file": "f.py", "line": 2, "severity": "NIT", "comment": "nit",
                 "evidence": "evidence2"},
            ]})

            bot_main.analyze()

        with open(str(tmp_path / "result.json")) as f:
            result = json.load(f)

        # Both BUG and NIT should be present on first review
        assert len(result["inline_comments"]) == 2

    def test_evidence_formatted_in_comment(self, tmp_path, monkeypatch):
        # The posted comment text must carry the severity prefix, the comment
        # body, and the model-supplied evidence string.
        import unittest.mock as mock
        monkeypatch.setenv("GITHUB_TOKEN", "fake")
        monkeypatch.setenv("GITHUB_REPOSITORY", "awslabs/test")
        monkeypatch.setenv("ISSUE_NUMBER", "10")
        monkeypatch.setenv("EVENT_TYPE", "pull_request_target")
        monkeypatch.setenv("EVENT_ACTION", "opened")
        monkeypatch.setenv("EVENT_BEFORE", "")
        monkeypatch.setenv("EVENT_AFTER", "")
        monkeypatch.setenv("KB_S3_BUCKET", "")
        monkeypatch.setenv("KB_S3_KEY", "")
        monkeypatch.setenv("PR_FILE_REVIEW_PROMPT", "Review. Date: {current_date}")
        import issue_bot.main as bot_main
        monkeypatch.setattr(bot_main, "ARTIFACT_PATH", str(tmp_path / "result.json"))

        with mock.patch("issue_bot.github_client.GitHubClient.get_pr") as mock_pr, \
             mock.patch("issue_bot.github_client.GitHubClient.get_comments") as mock_comments, \
             mock.patch("issue_bot.github_client.GitHubClient.get_pr_diff"), \
             mock.patch("issue_bot.github_client.GitHubClient.get_pr_review_comments") as mock_rc, \
             mock.patch("issue_bot.github_client.GitHubClient.get_pr_files") as mock_files, \
             mock.patch("issue_bot.github_client.GitHubClient.get_file_content") as mock_content, \
             mock.patch("issue_bot.github_client.GitHubClient.get_codebase_map"), \
             mock.patch("issue_bot.github_client.GitHubClient.get_ci_status") as mock_ci, \
             mock.patch("issue_bot.knowledge_base.KnowledgeBase.load"), \
             mock.patch("issue_bot.knowledge_base.KnowledgeBase.build_context", return_value=""), \
             mock.patch("issue_bot.bedrock_client.BedrockClient.__init__", return_value=None), \
             mock.patch("issue_bot.bedrock_client.BedrockClient.invoke") as mock_bedrock:

            mock_pr.return_value = {
                "user": {"login": "dev"}, "title": "Fix", "body": "",
                "state": "open", "html_url": "https://github.com/x",
                "head": {"sha": "abc123"},
            }
            mock_comments.return_value = []
            mock_rc.return_value = []
            mock_files.return_value = [{"filename": "f.py"}]
            mock_content.return_value = "content"
            mock_ci.return_value = (True, "CI passed")
            mock_bedrock.return_value = json.dumps({"comments": [
                {"file": "f.py", "line": 5, "severity": "BUG",
                 "comment": "division by zero",
                 "evidence": "line 3 sets count=0, line 5 divides by count"},
            ]})

            bot_main.analyze()

        with open(str(tmp_path / "result.json")) as f:
            result = json.load(f)

        # Severity prefix + comment body + evidence all folded into one string.
        comment_text = result["inline_comments"][0]["comment"]
        assert comment_text.startswith("**BUG**: ")
        assert "division by zero" in comment_text
        assert "line 3 sets count=0" in comment_text
+
+
class TestIncrementalReviewIntegration:
    """End-to-end tests for the incremental review path through analyze()."""

    def _setup_env(self, tmp_path, monkeypatch, event_before="abc123", event_after="def456"):
        # Shared environment for a "synchronize" (push to an open PR) event.
        # Callers override the before/after SHAs to exercise edge cases.
        monkeypatch.setenv("GITHUB_TOKEN", "fake")
        monkeypatch.setenv("GITHUB_REPOSITORY", "awslabs/test")
        monkeypatch.setenv("ISSUE_NUMBER", "99")
        monkeypatch.setenv("EVENT_TYPE", "pull_request_target")
        monkeypatch.setenv("EVENT_ACTION", "synchronize")
        monkeypatch.setenv("EVENT_BEFORE", event_before)
        monkeypatch.setenv("EVENT_AFTER", event_after)
        monkeypatch.setenv("GITHUB_ACTOR", "contributor")
        monkeypatch.setenv("KB_S3_BUCKET", "")
        monkeypatch.setenv("KB_S3_KEY", "")
        monkeypatch.setenv("PR_FILE_REVIEW_PROMPT", "Review this PR. Date: {current_date}")
        # Import after env setup (presumably env is read at import/run time —
        # TODO confirm); redirect the artifact to a per-test temp file.
        import issue_bot.main as bot_main
        monkeypatch.setattr(bot_main, "ARTIFACT_PATH", str(tmp_path / "result.json"))

    def test_incremental_review_filters_stale_comments(self, tmp_path, monkeypatch):
        # Happy path: compare diff only touches src/fixed.py, so the model's
        # comment on src/untouched.py must be dropped from the artifact.
        import unittest.mock as mock
        self._setup_env(tmp_path, monkeypatch)

        incremental_diff = (
            "diff --git a/src/fixed.py b/src/fixed.py\n"
            "--- a/src/fixed.py\n+++ b/src/fixed.py\n"
            "@@ -1 +1 @@\n-old\n+new\n"
        )

        with mock.patch("issue_bot.github_client.GitHubClient.get_pr") as mock_pr, \
             mock.patch("issue_bot.github_client.GitHubClient.get_comments") as mock_comments, \
             mock.patch("issue_bot.github_client.GitHubClient.get_pr_diff") as mock_diff, \
             mock.patch("issue_bot.github_client.GitHubClient.get_pr_review_comments") as mock_rc, \
             mock.patch("issue_bot.github_client.GitHubClient.get_compare_diff") as mock_compare, \
             mock.patch("issue_bot.github_client.GitHubClient.get_pr_files") as mock_files, \
             mock.patch("issue_bot.github_client.GitHubClient.get_file_content") as mock_content, \
             mock.patch("issue_bot.github_client.GitHubClient.get_codebase_map") as mock_map, \
             mock.patch("issue_bot.knowledge_base.KnowledgeBase.load"), \
             mock.patch("issue_bot.knowledge_base.KnowledgeBase.build_context") as mock_kb, \
             mock.patch("issue_bot.bedrock_client.BedrockClient.__init__", return_value=None), \
             mock.patch("issue_bot.bedrock_client.BedrockClient.invoke") as mock_bedrock:

            mock_pr.return_value = {"user": {"login": "contributor"}, "title": "Fix bug",
                                    "body": "Fixes the thing", "state": "open", "html_url": "https://github.com/x"}
            # Prior bot comment -> this is a re-review, not a first pass.
            mock_comments.return_value = [{"user": {"login": "github-actions[bot]"}, "body": "prior review"}]
            mock_diff.return_value = "full diff here"
            mock_rc.return_value = []
            mock_compare.return_value = incremental_diff
            mock_files.return_value = [{"filename": "src/fixed.py"}, {"filename": "src/untouched.py"}]
            mock_content.return_value = "file content"
            mock_map.return_value = ""
            mock_kb.return_value = ""
            # Model raises one issue per file; only the changed file's survives.
            mock_bedrock.return_value = json.dumps({
                "comments": [
                    {"file": "src/fixed.py", "line": 1, "comment": "new issue in changed file"},
                    {"file": "src/untouched.py", "line": 5, "comment": "stale comment on unchanged file"},
                ]
            })

            from issue_bot.main import analyze
            analyze()

        with open(str(tmp_path / "result.json")) as f:
            result = json.load(f)

        assert result["action"] == "RESPOND"
        assert result["is_incremental"] is True
        assert len(result["inline_comments"]) == 1
        assert result["inline_comments"][0]["file"] == "src/fixed.py"

    def test_force_push_falls_back_to_full_review(self, tmp_path, monkeypatch):
        # An empty compare diff (e.g. after a force push) disables the
        # incremental path: full review, no filtering.
        import unittest.mock as mock
        self._setup_env(tmp_path, monkeypatch)

        with mock.patch("issue_bot.github_client.GitHubClient.get_pr") as mock_pr, \
             mock.patch("issue_bot.github_client.GitHubClient.get_comments") as mock_comments, \
             mock.patch("issue_bot.github_client.GitHubClient.get_pr_diff") as mock_diff, \
             mock.patch("issue_bot.github_client.GitHubClient.get_pr_review_comments") as mock_rc, \
             mock.patch("issue_bot.github_client.GitHubClient.get_compare_diff") as mock_compare, \
             mock.patch("issue_bot.github_client.GitHubClient.get_pr_files") as mock_files, \
             mock.patch("issue_bot.github_client.GitHubClient.get_file_content") as mock_content, \
             mock.patch("issue_bot.github_client.GitHubClient.get_codebase_map") as mock_map, \
             mock.patch("issue_bot.knowledge_base.KnowledgeBase.load"), \
             mock.patch("issue_bot.knowledge_base.KnowledgeBase.build_context") as mock_kb, \
             mock.patch("issue_bot.bedrock_client.BedrockClient.__init__", return_value=None), \
             mock.patch("issue_bot.bedrock_client.BedrockClient.invoke") as mock_bedrock:

            mock_pr.return_value = {"user": {"login": "contributor"}, "title": "Fix bug",
                                    "body": "Fixes the thing", "state": "open", "html_url": "https://github.com/x"}
            mock_comments.return_value = [{"user": {"login": "github-actions[bot]"}, "body": "prior review"}]
            mock_diff.return_value = "full diff here"
            mock_rc.return_value = []
            mock_compare.return_value = ""  # Force push — compare fails
            mock_files.return_value = [{"filename": "src/a.py"}]
            mock_content.return_value = "content"
            mock_map.return_value = ""
            mock_kb.return_value = ""
            mock_bedrock.return_value = json.dumps({
                "comments": [
                    {"file": "src/a.py", "line": 1, "comment": "issue found"},
                ]
            })

            from issue_bot.main import analyze
            analyze()

        with open(str(tmp_path / "result.json")) as f:
            result = json.load(f)

        # Falls back to full review — no filtering, not marked incremental
        assert result["action"] == "RESPOND"
        assert result["is_incremental"] is False
        assert len(result["inline_comments"]) == 1

    def test_no_before_sha_skips_incremental(self, tmp_path, monkeypatch):
        # Without EVENT_BEFORE there is nothing to compare against, so the
        # compare endpoint must never be hit at all.
        import unittest.mock as mock
        self._setup_env(tmp_path, monkeypatch, event_before="", event_after="def456")

        with mock.patch("issue_bot.github_client.GitHubClient.get_pr") as mock_pr, \
             mock.patch("issue_bot.github_client.GitHubClient.get_comments") as mock_comments, \
             mock.patch("issue_bot.github_client.GitHubClient.get_pr_diff") as mock_diff, \
             mock.patch("issue_bot.github_client.GitHubClient.get_pr_review_comments") as mock_rc, \
             mock.patch("issue_bot.github_client.GitHubClient.get_compare_diff") as mock_compare, \
             mock.patch("issue_bot.github_client.GitHubClient.get_pr_files") as mock_files, \
             mock.patch("issue_bot.github_client.GitHubClient.get_file_content") as mock_content, \
             mock.patch("issue_bot.github_client.GitHubClient.get_codebase_map") as mock_map, \
             mock.patch("issue_bot.knowledge_base.KnowledgeBase.load"), \
             mock.patch("issue_bot.knowledge_base.KnowledgeBase.build_context") as mock_kb, \
             mock.patch("issue_bot.bedrock_client.BedrockClient.__init__", return_value=None), \
             mock.patch("issue_bot.bedrock_client.BedrockClient.invoke") as mock_bedrock:

            mock_pr.return_value = {"user": {"login": "contributor"}, "title": "Fix",
                                    "body": "Fix", "state": "open", "html_url": "https://github.com/x"}
            mock_comments.return_value = [{"user": {"login": "github-actions[bot]"}, "body": "review"}]
            mock_diff.return_value = "full diff"
            mock_rc.return_value = []
            mock_compare.return_value = "should not be called"
            mock_files.return_value = [{"filename": "src/a.py"}]
            mock_content.return_value = "content"
            mock_map.return_value = ""
            mock_kb.return_value = ""
            mock_bedrock.return_value = json.dumps({"comments": [
                {"file": "src/a.py", "line": 1, "comment": "issue"},
            ]})

            from issue_bot.main import analyze
            analyze()

            # Should NOT have called compare because event_before is empty
            mock_compare.assert_not_called()

        with open(str(tmp_path / "result.json")) as f:
            result = json.load(f)
        assert result["is_incremental"] is False
+
+
class TestFileContentUsesHeadSha:
    """Verify get_file_content is called with PR head SHA, not default branch."""

    def _setup_env(self, tmp_path, monkeypatch):
        # First-review environment ("opened", no before/after SHAs).
        monkeypatch.setenv("GITHUB_TOKEN", "fake")
        monkeypatch.setenv("GITHUB_REPOSITORY", "awslabs/test")
        monkeypatch.setenv("ISSUE_NUMBER", "42")
        monkeypatch.setenv("EVENT_TYPE", "pull_request_target")
        monkeypatch.setenv("EVENT_ACTION", "opened")
        monkeypatch.setenv("EVENT_BEFORE", "")
        monkeypatch.setenv("EVENT_AFTER", "")
        monkeypatch.setenv("GITHUB_ACTOR", "contributor")
        monkeypatch.setenv("KB_S3_BUCKET", "")
        monkeypatch.setenv("KB_S3_KEY", "")
        monkeypatch.setenv("PR_FILE_REVIEW_PROMPT", "Review this PR. Date: {current_date}")
        # Import after env setup; artifact redirected to a temp file.
        import issue_bot.main as bot_main
        monkeypatch.setattr(bot_main, "ARTIFACT_PATH", str(tmp_path / "result.json"))

    def test_file_content_fetched_with_head_sha(self, tmp_path, monkeypatch):
        # File contents must come from the PR head commit, not the default
        # branch, or the bot reviews stale code.
        import unittest.mock as mock
        self._setup_env(tmp_path, monkeypatch)

        with mock.patch("issue_bot.github_client.GitHubClient.get_pr") as mock_pr, \
             mock.patch("issue_bot.github_client.GitHubClient.get_comments") as mock_comments, \
             mock.patch("issue_bot.github_client.GitHubClient.get_pr_diff") as mock_diff, \
             mock.patch("issue_bot.github_client.GitHubClient.get_pr_review_comments") as mock_rc, \
             mock.patch("issue_bot.github_client.GitHubClient.get_pr_files") as mock_files, \
             mock.patch("issue_bot.github_client.GitHubClient.get_file_content") as mock_content, \
             mock.patch("issue_bot.github_client.GitHubClient.get_codebase_map") as mock_map, \
             mock.patch("issue_bot.knowledge_base.KnowledgeBase.load"), \
             mock.patch("issue_bot.knowledge_base.KnowledgeBase.build_context") as mock_kb, \
             mock.patch("issue_bot.bedrock_client.BedrockClient.__init__", return_value=None), \
             mock.patch("issue_bot.bedrock_client.BedrockClient.invoke") as mock_bedrock:

            mock_pr.return_value = {
                "user": {"login": "contributor"}, "title": "Add feature",
                "body": "New file", "state": "open",
                "html_url": "https://github.com/x",
                "head": {"sha": "abc123deadbeef"},
            }
            mock_comments.return_value = []
            mock_diff.return_value = "diff content"
            mock_rc.return_value = []
            mock_files.return_value = [
                {"filename": "src/new_file.py"},
                {"filename": "src/existing.py"},
            ]
            mock_content.return_value = "file content"
            mock_map.return_value = ""
            mock_kb.return_value = ""
            mock_bedrock.return_value = json.dumps({"comments": []})

            from issue_bot.main import analyze
            analyze()

            # Every get_file_content call must include ref=head_sha
            # (accept the SHA either as the ref= kwarg or as the second
            # positional argument).
            for call in mock_content.call_args_list:
                args, kwargs = call
                assert kwargs.get("ref") == "abc123deadbeef" or \
                    (len(args) > 1 and args[1] == "abc123deadbeef"), \
                    f"get_file_content called without head SHA: {call}"

    def test_missing_head_sha_falls_back_gracefully(self, tmp_path, monkeypatch):
        # Defensive case: a PR payload with no head.sha must not crash analyze().
        import unittest.mock as mock
        self._setup_env(tmp_path, monkeypatch)

        with mock.patch("issue_bot.github_client.GitHubClient.get_pr") as mock_pr, \
             mock.patch("issue_bot.github_client.GitHubClient.get_comments") as mock_comments, \
             mock.patch("issue_bot.github_client.GitHubClient.get_pr_diff") as mock_diff, \
             mock.patch("issue_bot.github_client.GitHubClient.get_pr_review_comments") as mock_rc, \
             mock.patch("issue_bot.github_client.GitHubClient.get_pr_files") as mock_files, \
             mock.patch("issue_bot.github_client.GitHubClient.get_file_content") as mock_content, \
             mock.patch("issue_bot.github_client.GitHubClient.get_codebase_map") as mock_map, \
             mock.patch("issue_bot.knowledge_base.KnowledgeBase.load"), \
             mock.patch("issue_bot.knowledge_base.KnowledgeBase.build_context") as mock_kb, \
             mock.patch("issue_bot.bedrock_client.BedrockClient.__init__", return_value=None), \
             mock.patch("issue_bot.bedrock_client.BedrockClient.invoke") as mock_bedrock:

            # PR object without head.sha (shouldn't happen, but defensive)
            mock_pr.return_value = {
                "user": {"login": "contributor"}, "title": "Fix",
                "body": "Fix", "state": "open",
                "html_url": "https://github.com/x",
            }
            mock_comments.return_value = []
            mock_diff.return_value = "diff"
            mock_rc.return_value = []
            mock_files.return_value = [{"filename": "src/a.py"}]
            mock_content.return_value = "content"
            mock_map.return_value = ""
            mock_kb.return_value = ""
            mock_bedrock.return_value = json.dumps({"comments": []})

            from issue_bot.main import analyze
            analyze()

        # Should still work — falls back to no ref (default branch)
        with open(str(tmp_path / "result.json")) as f:
            result = json.load(f)
        # First review with 0 comments → RESPOND with CI-aware message
        assert result["action"] == "RESPOND"
        assert "No issues found" in result["response"]
+
+
class TestReviewEventType:
    """Artifact-level checks for the first-review and incremental-review runs.

    NOTE(review): the class name promises that the bot always posts reviews
    with the COMMENT event type (never REQUEST_CHANGES), but analyze() only
    writes the artifact — nothing here inspects the event type given to
    post_pr_review. To actually cover that, run act() and assert on the
    patched post_pr_review call; confirm its signature before adding that.
    """

    def _setup_env(self, tmp_path, monkeypatch, event_action="opened"):
        # Point the bot at fake PR #50; before/after SHAs only exist for a
        # "synchronize" event (a push to an already-open PR).
        monkeypatch.setenv("GITHUB_TOKEN", "fake")
        monkeypatch.setenv("GITHUB_REPOSITORY", "awslabs/test")
        monkeypatch.setenv("ISSUE_NUMBER", "50")
        monkeypatch.setenv("EVENT_TYPE", "pull_request_target")
        monkeypatch.setenv("EVENT_ACTION", event_action)
        monkeypatch.setenv("EVENT_BEFORE", "aaa111" if event_action == "synchronize" else "")
        monkeypatch.setenv("EVENT_AFTER", "bbb222" if event_action == "synchronize" else "")
        monkeypatch.setenv("GITHUB_ACTOR", "contributor")
        monkeypatch.setenv("KB_S3_BUCKET", "")
        monkeypatch.setenv("KB_S3_KEY", "")
        monkeypatch.setenv("PR_FILE_REVIEW_PROMPT", "Review. Date: {current_date}")
        # Import after env setup; artifact redirected to a temp file.
        import issue_bot.main as bot_main
        monkeypatch.setattr(bot_main, "ARTIFACT_PATH", str(tmp_path / "result.json"))

    def _run_and_get_artifact(self, tmp_path, monkeypatch, event_action="opened"):
        """Run analyze() under full mocks; return (artifact dict, post mock).

        ``mock`` is imported here instead of being threaded through every
        caller as a parameter (the previous helper took the module as an
        argument, which was redundant). Only ``analyze`` is imported — the
        earlier ``act`` import was never used.
        """
        import unittest.mock as mock
        self._setup_env(tmp_path, monkeypatch, event_action=event_action)

        with mock.patch("issue_bot.github_client.GitHubClient.get_pr") as mock_pr, \
             mock.patch("issue_bot.github_client.GitHubClient.get_comments") as mock_comments, \
             mock.patch("issue_bot.github_client.GitHubClient.get_pr_diff") as mock_diff, \
             mock.patch("issue_bot.github_client.GitHubClient.get_pr_review_comments") as mock_rc, \
             mock.patch("issue_bot.github_client.GitHubClient.get_compare_diff") as mock_compare, \
             mock.patch("issue_bot.github_client.GitHubClient.get_pr_files") as mock_files, \
             mock.patch("issue_bot.github_client.GitHubClient.get_file_content") as mock_content, \
             mock.patch("issue_bot.github_client.GitHubClient.get_codebase_map") as mock_map, \
             mock.patch("issue_bot.knowledge_base.KnowledgeBase.load"), \
             mock.patch("issue_bot.knowledge_base.KnowledgeBase.build_context") as mock_kb, \
             mock.patch("issue_bot.bedrock_client.BedrockClient.__init__", return_value=None), \
             mock.patch("issue_bot.bedrock_client.BedrockClient.invoke") as mock_bedrock, \
             mock.patch("issue_bot.github_client.GitHubClient.post_pr_review") as mock_post:

            mock_pr.return_value = {
                "user": {"login": "contributor"}, "title": "Fix",
                "body": "Fix", "state": "open",
                "html_url": "https://github.com/x",
                "head": {"sha": "abc123"},
            }
            # Re-review runs get a prior bot comment and an incremental diff;
            # first reviews get neither.
            mock_comments.return_value = (
                [{"user": {"login": "github-actions[bot]"}, "body": "prior"}]
                if event_action == "synchronize" else []
            )
            mock_diff.return_value = "diff"
            mock_rc.return_value = []
            mock_compare.return_value = (
                "diff --git a/f.py b/f.py\n--- a/f.py\n+++ b/f.py\n@@ -1 +1 @@\n-x\n+y\n"
                if event_action == "synchronize" else ""
            )
            mock_files.return_value = [{"filename": "f.py"}]
            mock_content.return_value = "content"
            mock_map.return_value = ""
            mock_kb.return_value = ""
            mock_bedrock.return_value = json.dumps({
                "comments": [{"file": "f.py", "line": 1, "comment": "issue"}]
            })
            mock_post.return_value = True

            from issue_bot.main import analyze
            analyze()

        with open(str(tmp_path / "result.json")) as f:
            return json.load(f), mock_post

    def test_first_review_uses_comment_event(self, tmp_path, monkeypatch):
        # "opened" → full-review path: not incremental, comments present.
        result, _ = self._run_and_get_artifact(tmp_path, monkeypatch, "opened")
        assert result["action"] == "RESPOND"
        assert result.get("is_incremental") is False
        assert len(result["inline_comments"]) > 0

    def test_incremental_review_uses_comment_event(self, tmp_path, monkeypatch):
        # "synchronize" with before/after SHAs → incremental-review path.
        result, _ = self._run_and_get_artifact(tmp_path, monkeypatch, "synchronize")
        assert result["action"] == "RESPOND"
        assert result.get("is_incremental") is True