awslabs · sudsali · May 6, 2026 · May 13, 2026 · github-actions · May 13, 2026
diff --git a/.github/workflows/auto-approve.yml b/.github/workflows/auto-approve.yml
@@ -0,0 +1,101 @@
+name: Auto-Approve Clean PRs
+
+on:
+  workflow_run:
+    workflows: [".github/workflows/base.yml", "PyDeequ Bot"]
+    types: [completed]
+
+permissions:
+  pull-requests: write
+  actions: read
+
+jobs:
+  approve:
+    runs-on: ubuntu-latest
+    if: github.event.workflow_run.event == 'pull_request' || github.event.workflow_run.event == 'pull_request_target'
+    timeout-minutes: 2
+
+    steps:
+      - name: Find PR and check both conditions
+        uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
+        with:
+          script: |
+            const sha = context.payload.workflow_run.head_sha;
+            const owner = context.repo.owner;
+            const repo = context.repo.repo;
+
+            // Find the PR for this SHA
+            let prNumber = null;
+            const prs = context.payload.workflow_run.pull_requests;
+            if (prs && prs.length > 0) {
+              prNumber = prs[0].number;
+            } else {
+              const {data: searchResult} = await github.rest.pulls.list({
+                owner, repo, state: 'open', sort: 'updated', direction: 'desc', per_page: 30
+              });
+              const match = searchResult.find(pr => pr.head.sha === sha);
+              if (match) {
+                prNumber = match.number;
+              }
+            }
+
+            if (!prNumber) {
+              core.info(`No open PR found for SHA ${sha}, skipping`);
+              return;
+            }
+
+            core.info(`Found PR #${prNumber} for SHA ${sha}`);
+
+            // Verify the PR head SHA still matches (no new push since trigger)
+            const {data: pr} = await github.rest.pulls.get({
+              owner, repo, pull_number: prNumber
+            });
+            if (pr.head.sha !== sha) {
+              core.info(`PR head ${pr.head.sha} differs from trigger SHA ${sha} — new push arrived, skipping`);
+              return;
+            }
+
+            // Condition 1: CI must have passed for this SHA
+            const {data: workflowRuns} = await github.rest.actions.listWorkflowRunsForRepo({
+              owner, repo, head_sha: sha, status: 'completed'
+            });
+            const ciRun = workflowRuns.workflow_runs.find(r =>
+              r.name === '.github/workflows/base.yml' && r.conclusion === 'success'
+            );
+            if (!ciRun) {
+              core.info(`CI has not passed for SHA ${sha}, skipping`);
+              return;
+            }
+
+            // Condition 2: Bot must have posted a clean review for this SHA
+            const {data: reviews} = await github.rest.pulls.listReviews({
+              owner, repo, pull_number: prNumber
+            });
+
+            const CLEAN_MARKER = '<!-- deequ-bot:clean -->';
+
+            const latestBot = reviews
+              .filter(r => r.user.login === 'github-actions[bot]')
+              .sort((a, b) => new Date(b.submitted_at) - new Date(a.submitted_at))[0];
+
+            if (!latestBot || !latestBot.body.includes(CLEAN_MARKER) || latestBot.commit_id !== sha) {
+              core.info('Bot has not posted a clean review for this SHA, skipping');
+              return;
+            }
+
+            // Both conditions met — check for existing approval to prevent doubles
+            const botApprovals = reviews.filter(r =>
+              r.user.login === 'github-actions[bot]' && r.state === 'APPROVED'
+            );
+            if (botApprovals.length > 0) {
+              core.info('Bot already approved this PR, skipping');
+              return;
+            }
+
+            // Approve
+            core.info(`Approving PR #${prNumber}: bot review clean + CI passed for SHA ${sha}`);
+            await github.rest.pulls.createReview({
+              owner, repo, pull_number: prNumber,
+              event: 'APPROVE',
+              body: `No issues found and CI is passing. Auto-approved.\n\n---\n*Generated by AI — human merge required.*`
+            });
diff --git a/.github/workflows/issue-bot.yml b/.github/workflows/issue-bot.yml
@@ -61,16 +61,20 @@ jobs:
           ISSUE_NUMBER: ${{ github.event.issue.number || github.event.pull_request.number || inputs.issue_number }}
           EVENT_TYPE: ${{ github.event_name }}
           EVENT_ACTION: ${{ github.event.action }}
+          EVENT_BEFORE: ${{ github.event.before }}
+          EVENT_AFTER: ${{ github.event.pull_request.head.sha || github.event.after }}
           GITHUB_ACTOR: ${{ github.actor }}
           KB_S3_BUCKET: ${{ secrets.KB_S3_BUCKET }}
           KB_S3_KEY: ${{ secrets.KB_S3_KEY }}
           BEDROCK_MODEL_ID: ${{ secrets.BEDROCK_MODEL_ID }}
           GUARDRAIL_ID: ${{ secrets.GUARDRAIL_ID }}
           GUARDRAIL_VERSION: ${{ secrets.GUARDRAIL_VERSION }}
-          ISSUE_CLASSIFY_PROMPT: ${{ secrets.ISSUE_CLASSIFY_PROMPT }}
-          ISSUE_RESPOND_PROMPT: ${{ secrets.ISSUE_RESPOND_PROMPT }}
-          PR_FILE_REVIEW_PROMPT: ${{ secrets.PR_FILE_REVIEW_PROMPT }}
-          FOLLOWUP_PROMPT: ${{ secrets.FOLLOWUP_PROMPT }}
+          SM_ISSUE_CLASSIFY_PROMPT: pydeequ-bot/issue-classify-prompt
+          SM_ISSUE_RESPOND_PROMPT: pydeequ-bot/issue-respond-prompt
+          SM_PR_FILE_REVIEW_PROMPT: pydeequ-bot/pr-file-review-prompt
+          SM_FOLLOWUP_PROMPT: pydeequ-bot/followup-prompt
+          CODEBASE_SRC_DIR: pydeequ
+          CODEBASE_FILE_EXT: .py
           DRY_RUN: ${{ inputs.dry_run || 'false' }}
           ARTIFACT_PATH: ${{ runner.temp }}/bot_result.json
         run: python -m issue_bot.main analyze

diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml
@@ -0,0 +1,36 @@
+name: Manage Stale Issues and PRs
+
+on:
+  schedule:
+    - cron: '0 9 * * MON'
+  workflow_dispatch:
+
+permissions:
+  issues: write
+  pull-requests: write
+
+jobs:
+  stale:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/stale@28ca1036281a5e5922ead5184a1bbf96e5fc984e # v9.0.0
+        with:
+          days-before-stale: 60
+          days-before-close: 14
+          stale-issue-label: 'stale'
+          stale-pr-label: 'stale'
+          stale-issue-message: >
+            This issue has been inactive for 60 days. It will be closed in 14 days
+            if there is no further activity. If this is still relevant, please comment
+            to keep it open.
+          stale-pr-message: >
+            This PR has been inactive for 60 days. It will be closed in 14 days
+            if there is no further activity. If you are still working on this,
+            please push an update or comment to keep it open.
+          close-issue-message: >
+            Closed due to inactivity. Feel free to reopen if this is still relevant.
+          close-pr-message: >
+            Closed due to inactivity. Feel free to reopen if you'd like to continue this work.
+          exempt-issue-labels: 'bug,enhancement,help-wanted'
+          exempt-pr-labels: 'help-wanted'
+          operations-per-run: 50
diff --git a/scripts/issue_bot/config.py b/scripts/issue_bot/config.py
@@ -17,6 +17,8 @@ def __init__(self):
             sys.exit(1)
         self.repo = _require("GITHUB_REPOSITORY")
         self.actor = os.getenv("GITHUB_ACTOR", "")
+        self.event_before = os.getenv("EVENT_BEFORE", "")
+        self.event_after = os.getenv("EVENT_AFTER", "")
 
         self.bedrock_model_id = os.getenv("BEDROCK_MODEL_ID", "us.anthropic.claude-opus-4-6-v1")
 
@@ -32,14 +34,16 @@ def __init__(self):
         self.enable_repo_search = os.getenv("ENABLE_REPO_SEARCH", "true").lower() == "true"
 
         self.upstream_repo = os.getenv("UPSTREAM_REPO", "awslabs/python-deequ")
+        self.codebase_src_dir = os.getenv("CODEBASE_SRC_DIR", "pydeequ")
+        self.codebase_file_ext = os.getenv("CODEBASE_FILE_EXT", ".py")
 
-        self.bedrock_timeout = 120
-        self.max_context_chars = 200000
+        self.bedrock_timeout = 240
+        self.max_context_chars = 800000
         self.max_github_search_results = 8
         self.github_api_timeout = 10
         self.allowed_labels = {
             "bug", "enhancement", "question", "documentation",
-            "help-wanted", "analyzer", "check", "spark-compatibility", "installation",
+            "help wanted", "python",
         }
 
 

diff --git a/scripts/issue_bot/github_client.py b/scripts/issue_bot/github_client.py
@@ -11,6 +11,7 @@ def __init__(self, cfg):
         self._repo = cfg.repo
         self._timeout = cfg.github_api_timeout
         self._dry_run = cfg.dry_run
+        self._cfg = cfg
         self._repo_root = os.getenv("GITHUB_WORKSPACE", os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
         self._headers = {
             "Authorization": f"token {self._token}",
@@ -48,6 +49,59 @@ def get_pr_diff(self, number):
             logger.error(f"PR diff fetch failed: {e}")
             return ""
 
+    def get_compare_diff(self, base_sha, head_sha):
+        """Fetch the diff between two commits using the Compare API.
+        Returns the diff text, or empty string on failure (e.g. force-push
+        where base_sha no longer exists)."""
+        headers = {**self._headers, "Accept": "application/vnd.github.v3.diff"}
+        try:
+            resp = requests.get(
+                f"https://api.github.com/repos/{self._repo}/compare/{base_sha}...{head_sha}",
+                headers=headers, timeout=self._timeout,
+            )
+            if resp.status_code == 200:
+                return resp.text
+            logger.warning(f"Compare API {base_sha[:7]}...{head_sha[:7]}: {resp.status_code}")
+            return ""
+        except Exception as e:
+            logger.error(f"Compare diff failed: {e}")
+            return ""
+
+    def get_ci_status(self, sha):
+        """Check commit statuses and check runs. Returns (passed, summary).
+        passed: True (all green), False (something failed), None (pending/unknown)."""
+        status = self._get(f"/repos/{self._repo}/commits/{sha}/status")
+        if status is None:
+            return None, "CI status unavailable"
+        combined_state = status.get("state", "pending")
+
+        check_data = self._get(f"/repos/{self._repo}/commits/{sha}/check-runs")
+        runs = check_data.get("check_runs", []) if check_data else []
+
+        def _is_own_check(name):
+            lower = name.lower()
+            return "bot" in lower and ("analyze" in lower or "/ act" in lower)
+
+        external_runs = [r for r in runs if not _is_own_check(r.get("name", ""))]
+
+        failed = []
+        pending = []
+        for r in external_runs:
+            if r.get("status") != "completed":
+                pending.append(r["name"])
+            elif r.get("conclusion") not in ("success", "neutral", "skipped"):
+                failed.append(r["name"])
+
+        if failed:
+            return False, f"CI failing: {', '.join(failed)}"
+        if pending:
+            return None, f"CI pending: {', '.join(pending)}"
+        if combined_state == "failure":
+            return False, "CI failing (status checks)"
+        if combined_state == "pending":
+            return None, "CI pending (status checks)"
+        return True, "CI passed"
+
     def get_pr_files(self, number):
         return self._get(f"/repos/{self._repo}/pulls/{number}/files") or []
 
@@ -64,16 +118,18 @@ def get_pr_review_comments(self, number, max_pages=10):
             page += 1
         return comments
 
-    def get_codebase_map(self, src_dir="pydeequ"):
-        """List all Python source files (excluding tests) as relative paths."""
+    def get_codebase_map(self):
+        """List source files (excluding tests) as relative paths."""
+        src_dir = self._cfg.codebase_src_dir
+        file_ext = self._cfg.codebase_file_ext
         full_dir = os.path.join(self._repo_root, src_dir)
         prefix = self._repo_root.rstrip("/") + "/"
         try:
             paths = []
             for root, dirs, files in os.walk(full_dir):
-                dirs[:] = [d for d in dirs if d not in ("tests", "__pycache__", ".git")]
+                dirs[:] = [d for d in dirs if d not in ("examples", "__pycache__", ".git", "tests", "test")]
                 for f in files:
-                    if f.endswith(".py"):
+                    if f.endswith(file_ext):
                         full = os.path.join(root, f)
                         rel = full[len(prefix):] if full.startswith(prefix) else full
                         paths.append(rel)
@@ -116,9 +172,9 @@ def post_comment(self, number, body):
             return True
         return self._post(f"/repos/{self._repo}/issues/{number}/comments", {"body": body})
 
-    def post_pr_review(self, number, summary, inline_comments):
+    def post_pr_review(self, number, summary, inline_comments, event="COMMENT"):
         if self._dry_run:
-            logger.info(f"[DRY RUN] PR review on #{number}: {len(inline_comments)} inline comments")
+            logger.info(f"[DRY RUN] PR review on #{number}: {len(inline_comments)} inline comments, event={event}")
             return True
 
         # Get valid diff lines per file from the PR
@@ -134,31 +190,34 @@ def post_pr_review(self, number, summary, inline_comments):
             else:
                 invalid_comments.append(ic)
 
+        body = summary
+        if invalid_comments:
+            body += "\n\n**Additional feedback:**\n"
+            for ic in invalid_comments:
+                line_ref = f":{ic['line']}" if ic.get('line') else ""
+                body += f"\n`{ic['file']}{line_ref}` — {ic['comment']}\n"
+
+        payload = {"body": body, "event": event}
         if valid_comments:
-            body = summary
-            if invalid_comments:
-                body += "\n\n**Additional feedback:**\n"
-                for ic in invalid_comments:
-                    line_ref = f":{ic['line']}" if ic.get('line') else ""
-                    body += f"\n`{ic['file']}{line_ref}` — {ic['comment']}\n"
-            payload = {"body": body, "event": "REQUEST_CHANGES", "comments": valid_comments}
-            try:
-                resp = requests.post(
-                    f"https://api.github.com/repos/{self._repo}/pulls/{number}/reviews",
-                    headers=self._headers, json=payload, timeout=self._timeout,
-                )
-                if resp.status_code in (200, 201):
-                    return True
-                logger.error(f"PR review API failed: {resp.status_code}, falling back to comment")
-            except Exception as e:
-                logger.error(f"PR review API failed: {e}, falling back to comment")
-
-        # Fallback: post all as regular comment
-        all_comments = inline_comments
+            payload["comments"] = valid_comments
+
+        try:
+            resp = requests.post(
+                f"https://api.github.com/repos/{self._repo}/pulls/{number}/reviews",
+                headers=self._headers, json=payload, timeout=self._timeout,
+            )
+            if resp.status_code in (200, 201):
+                return True
+            logger.error(f"PR review API failed: {resp.status_code}, falling back to comment")
+            logger.error(f"Response: {resp.text[:500]}")
+        except Exception as e:
+            logger.error(f"PR review API failed: {e}, falling back to comment")
+
+        # Fallback: post as regular comment if review API fails
         body = summary
-        if all_comments:
+        if inline_comments:
             body += "\n\n**Inline feedback:**\n"
-            for ic in all_comments:
+            for ic in inline_comments:
                 line_ref = f":{ic['line']}" if ic.get('line') else ""
                 body += f"\n`{ic['file']}{line_ref}` — {ic['comment']}\n"
         return self._post(f"/repos/{self._repo}/issues/{number}/comments", {"body": body})