Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
101 changes: 101 additions & 0 deletions .github/workflows/auto-approve.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
name: Auto-Approve Clean PRs

on:
workflow_run:
workflows: [".github/workflows/base.yml", "PyDeequ Bot"]
types: [completed]

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

BUG: The workflows field in workflow_run trigger must match the workflow name (the name: key in the YAML), not the file path. ".github/workflows/base.yml" is a file path, not a workflow name. This workflow will never trigger unless base.yml has name: .github/workflows/base.yml. Change this to the actual name: value from base.yml (e.g., "CI" or whatever it's named).

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

BUG: The workflows field in workflow_run trigger must match the workflow name (the name: key in the target YAML file), not the file path. ".github/workflows/base.yml" is a file path. GitHub Actions will never match this unless base.yml literally has name: .github/workflows/base.yml. Change this to the actual name: value from base.yml.

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

BUG: The workflows field in workflow_run trigger must match the workflow name (the name: key in the target YAML), not the file path. ".github/workflows/base.yml" is a file path — GitHub Actions will never match this unless base.yml literally has name: .github/workflows/base.yml. Change this to the actual name: value from base.yml (e.g., "CI" or whatever it declares).

permissions:
pull-requests: write
actions: read

jobs:
approve:
runs-on: ubuntu-latest
if: github.event.workflow_run.event == 'pull_request' || github.event.workflow_run.event == 'pull_request_target'
timeout-minutes: 2

steps:
- name: Find PR and check both conditions
uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
with:
script: |
const sha = context.payload.workflow_run.head_sha;
const owner = context.repo.owner;
const repo = context.repo.repo;

// Find the PR for this SHA
let prNumber = null;
const prs = context.payload.workflow_run.pull_requests;
if (prs && prs.length > 0) {
prNumber = prs[0].number;
} else {
const {data: searchResult} = await github.rest.pulls.list({
owner, repo, state: 'open', sort: 'updated', direction: 'desc', per_page: 30
});
const match = searchResult.find(pr => pr.head.sha === sha);
if (match) {
prNumber = match.number;
}
}

if (!prNumber) {
core.info(`No open PR found for SHA ${sha}, skipping`);
return;
}

core.info(`Found PR #${prNumber} for SHA ${sha}`);

// Verify the PR head SHA still matches (no new push since trigger)
const {data: pr} = await github.rest.pulls.get({
owner, repo, pull_number: prNumber
});
if (pr.head.sha !== sha) {
core.info(`PR head ${pr.head.sha} differs from trigger SHA ${sha} — new push arrived, skipping`);
return;
}

// Condition 1: CI must have passed for this SHA
const {data: workflowRuns} = await github.rest.actions.listWorkflowRunsForRepo({
owner, repo, head_sha: sha, status: 'completed'
});
const ciRun = workflowRuns.workflow_runs.find(r =>
r.name === '.github/workflows/base.yml' && r.conclusion === 'success'
);
if (!ciRun) {
core.info(`CI has not passed for SHA ${sha}, skipping`);
return;
}

// Condition 2: Bot must have posted a clean review for this SHA
const {data: reviews} = await github.rest.pulls.listReviews({
owner, repo, pull_number: prNumber
});

const CLEAN_MARKER = '<!-- deequ-bot:clean -->';

const latestBot = reviews
.filter(r => r.user.login === 'github-actions[bot]')
.sort((a, b) => new Date(b.submitted_at) - new Date(a.submitted_at))[0];

if (!latestBot || !latestBot.body.includes(CLEAN_MARKER) || latestBot.commit_id !== sha) {
core.info('Bot has not posted a clean review for this SHA, skipping');
return;
}

// Both conditions met — check for existing approval to prevent doubles
const botApprovals = reviews.filter(r =>
r.user.login === 'github-actions[bot]' && r.state === 'APPROVED'
);
if (botApprovals.length > 0) {
core.info('Bot already approved this PR, skipping');
return;
}

// Approve
core.info(`Approving PR #${prNumber}: bot review clean + CI passed for SHA ${sha}`);
await github.rest.pulls.createReview({
owner, repo, pull_number: prNumber,
event: 'APPROVE',
body: `No issues found and CI is passing. Auto-approved.\n\n---\n*Generated by AI — human merge required.*`
});
12 changes: 8 additions & 4 deletions .github/workflows/issue-bot.yml
Original file line number Diff line number Diff line change
Expand Up @@ -61,16 +61,20 @@ jobs:
ISSUE_NUMBER: ${{ github.event.issue.number || github.event.pull_request.number || inputs.issue_number }}
EVENT_TYPE: ${{ github.event_name }}
EVENT_ACTION: ${{ github.event.action }}
EVENT_BEFORE: ${{ github.event.before }}
EVENT_AFTER: ${{ github.event.pull_request.head.sha || github.event.after }}
GITHUB_ACTOR: ${{ github.actor }}
KB_S3_BUCKET: ${{ secrets.KB_S3_BUCKET }}
KB_S3_KEY: ${{ secrets.KB_S3_KEY }}
BEDROCK_MODEL_ID: ${{ secrets.BEDROCK_MODEL_ID }}
GUARDRAIL_ID: ${{ secrets.GUARDRAIL_ID }}
GUARDRAIL_VERSION: ${{ secrets.GUARDRAIL_VERSION }}
ISSUE_CLASSIFY_PROMPT: ${{ secrets.ISSUE_CLASSIFY_PROMPT }}
ISSUE_RESPOND_PROMPT: ${{ secrets.ISSUE_RESPOND_PROMPT }}
PR_FILE_REVIEW_PROMPT: ${{ secrets.PR_FILE_REVIEW_PROMPT }}
FOLLOWUP_PROMPT: ${{ secrets.FOLLOWUP_PROMPT }}
SM_ISSUE_CLASSIFY_PROMPT: pydeequ-bot/issue-classify-prompt
SM_ISSUE_RESPOND_PROMPT: pydeequ-bot/issue-respond-prompt
SM_PR_FILE_REVIEW_PROMPT: pydeequ-bot/pr-file-review-prompt
SM_FOLLOWUP_PROMPT: pydeequ-bot/followup-prompt
CODEBASE_SRC_DIR: pydeequ
CODEBASE_FILE_EXT: .py
DRY_RUN: ${{ inputs.dry_run || 'false' }}
ARTIFACT_PATH: ${{ runner.temp }}/bot_result.json
run: python -m issue_bot.main analyze
Expand Down
36 changes: 36 additions & 0 deletions .github/workflows/stale.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
name: Manage Stale Issues and PRs

on:
schedule:
- cron: '0 9 * * MON'
workflow_dispatch:

permissions:
issues: write
pull-requests: write

jobs:
stale:
runs-on: ubuntu-latest
steps:
- uses: actions/stale@28ca1036281a5e5922ead5184a1bbf96e5fc984e # v9.0.0
with:
days-before-stale: 60
days-before-close: 14
stale-issue-label: 'stale'
stale-pr-label: 'stale'
stale-issue-message: >
This issue has been inactive for 60 days. It will be closed in 14 days
if there is no further activity. If this is still relevant, please comment
to keep it open.
stale-pr-message: >
This PR has been inactive for 60 days. It will be closed in 14 days
if there is no further activity. If you are still working on this,
please push an update or comment to keep it open.
close-issue-message: >
Closed due to inactivity. Feel free to reopen if this is still relevant.
close-pr-message: >
Closed due to inactivity. Feel free to reopen if you'd like to continue this work.
exempt-issue-labels: 'bug,enhancement,help-wanted'
exempt-pr-labels: 'help-wanted'
operations-per-run: 50
10 changes: 7 additions & 3 deletions scripts/issue_bot/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ def __init__(self):
sys.exit(1)
self.repo = _require("GITHUB_REPOSITORY")
self.actor = os.getenv("GITHUB_ACTOR", "")
self.event_before = os.getenv("EVENT_BEFORE", "")
self.event_after = os.getenv("EVENT_AFTER", "")

self.bedrock_model_id = os.getenv("BEDROCK_MODEL_ID", "us.anthropic.claude-opus-4-6-v1")

Expand All @@ -32,14 +34,16 @@ def __init__(self):
self.enable_repo_search = os.getenv("ENABLE_REPO_SEARCH", "true").lower() == "true"

self.upstream_repo = os.getenv("UPSTREAM_REPO", "awslabs/python-deequ")
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

DESIGN: The upstream_repo default was changed from "awslabs/python-deequ" to... actually looking at the full diff, I don't see this change in the diff. Disregard. However, max_context_chars was increased from 200,000 to 800,000 while the full_sources budget in main.py is 3,000,000 characters. The max_context_chars limit doesn't appear to be enforced against the system prompt that includes full_sources. If a PR touches many large files, the system prompt could far exceed 800K chars (and potentially exceed model context limits). Consider enforcing the budget: truncate full_sources so that len(system_prompt) <= cfg.max_context_chars.

self.codebase_src_dir = os.getenv("CODEBASE_SRC_DIR", "pydeequ")
self.codebase_file_ext = os.getenv("CODEBASE_FILE_EXT", ".py")
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

DESIGN: max_context_chars was increased from 200,000 to 800,000. Combined with the new full_sources section (up to 3MB budget on line 175 of main.py), the system prompt could easily exceed Bedrock/Claude's context window limits or cause excessive token costs. The 800K char limit (~200K tokens) may exceed model context for some Bedrock model IDs. Consider validating that len(system_prompt) <= max_context_chars before invoking Bedrock, or at least truncating full_sources to fit within the budget.


self.bedrock_timeout = 120
self.max_context_chars = 200000
self.bedrock_timeout = 240
self.max_context_chars = 800000
self.max_github_search_results = 8
self.github_api_timeout = 10
self.allowed_labels = {
"bug", "enhancement", "question", "documentation",
"help-wanted", "analyzer", "check", "spark-compatibility", "installation",
"help wanted", "python",
}


Expand Down
115 changes: 87 additions & 28 deletions scripts/issue_bot/github_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ def __init__(self, cfg):
self._repo = cfg.repo
self._timeout = cfg.github_api_timeout
self._dry_run = cfg.dry_run
self._cfg = cfg
self._repo_root = os.getenv("GITHUB_WORKSPACE", os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
self._headers = {
"Authorization": f"token {self._token}",
Expand Down Expand Up @@ -48,6 +49,59 @@ def get_pr_diff(self, number):
logger.error(f"PR diff fetch failed: {e}")
return ""

def get_compare_diff(self, base_sha, head_sha):
"""Fetch the diff between two commits using the Compare API.
Returns the diff text, or empty string on failure (e.g. force-push
where base_sha no longer exists)."""
headers = {**self._headers, "Accept": "application/vnd.github.v3.diff"}
try:
resp = requests.get(
f"https://api.github.com/repos/{self._repo}/compare/{base_sha}...{head_sha}",
headers=headers, timeout=self._timeout,
)
if resp.status_code == 200:
return resp.text
logger.warning(f"Compare API {base_sha[:7]}...{head_sha[:7]}: {resp.status_code}")
return ""
except Exception as e:
logger.error(f"Compare diff failed: {e}")
return ""

def get_ci_status(self, sha):
"""Check commit statuses and check runs. Returns (passed, summary).
passed: True (all green), False (something failed), None (pending/unknown)."""
status = self._get(f"/repos/{self._repo}/commits/{sha}/status")
if status is None:
return None, "CI status unavailable"
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

EDGE_CASE: get_ci_status calls self._get(...) for check-runs on line after the status call. If the first _get (commit status) succeeds but the second _get (check-runs) returns None, check_data.get("check_runs", []) will raise AttributeError because None.get(...) fails. The code has check_data.get(...) if check_data else [] which handles this, but the variable runs assignment uses a ternary that's easy to miss. This is actually correct — just noting it's handled.

combined_state = status.get("state", "pending")

check_data = self._get(f"/repos/{self._repo}/commits/{sha}/check-runs")
runs = check_data.get("check_runs", []) if check_data else []

def _is_own_check(name):
lower = name.lower()
return "bot" in lower and ("analyze" in lower or "/ act" in lower)
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

DESIGN: The _is_own_check heuristic filters checks where the name contains both 'bot' AND ('analyze' OR '/ act'). However, the check "robot-tests" in the test test_non_bot_check_with_bot_in_name_not_filtered passes because it contains 'bot' but not 'analyze' or '/ act'. But a check named e.g. "my-bot-analyzer" would be incorrectly filtered out. The heuristic is fragile — consider matching against the exact workflow names used by this repo.


external_runs = [r for r in runs if not _is_own_check(r.get("name", ""))]

failed = []
pending = []
for r in external_runs:
if r.get("status") != "completed":
pending.append(r["name"])
elif r.get("conclusion") not in ("success", "neutral", "skipped"):
failed.append(r["name"])

if failed:
return False, f"CI failing: {', '.join(failed)}"
if pending:
return None, f"CI pending: {', '.join(pending)}"
if combined_state == "failure":
return False, "CI failing (status checks)"
if combined_state == "pending":
return None, "CI pending (status checks)"
return True, "CI passed"

def get_pr_files(self, number):
return self._get(f"/repos/{self._repo}/pulls/{number}/files") or []

Expand All @@ -64,16 +118,18 @@ def get_pr_review_comments(self, number, max_pages=10):
page += 1
return comments

def get_codebase_map(self, src_dir="pydeequ"):
"""List all Python source files (excluding tests) as relative paths."""
def get_codebase_map(self):
"""List source files (excluding tests) as relative paths."""
src_dir = self._cfg.codebase_src_dir
file_ext = self._cfg.codebase_file_ext
full_dir = os.path.join(self._repo_root, src_dir)
prefix = self._repo_root.rstrip("/") + "/"
try:
paths = []
for root, dirs, files in os.walk(full_dir):
dirs[:] = [d for d in dirs if d not in ("tests", "__pycache__", ".git")]
dirs[:] = [d for d in dirs if d not in ("examples", "__pycache__", ".git", "tests", "test")]
for f in files:
if f.endswith(".py"):
if f.endswith(file_ext):
full = os.path.join(root, f)
rel = full[len(prefix):] if full.startswith(prefix) else full
paths.append(rel)
Expand Down Expand Up @@ -116,9 +172,9 @@ def post_comment(self, number, body):
return True
return self._post(f"/repos/{self._repo}/issues/{number}/comments", {"body": body})

def post_pr_review(self, number, summary, inline_comments):
def post_pr_review(self, number, summary, inline_comments, event="COMMENT"):
if self._dry_run:
logger.info(f"[DRY RUN] PR review on #{number}: {len(inline_comments)} inline comments")
logger.info(f"[DRY RUN] PR review on #{number}: {len(inline_comments)} inline comments, event={event}")
return True

# Get valid diff lines per file from the PR
Expand All @@ -134,31 +190,34 @@ def post_pr_review(self, number, summary, inline_comments):
else:
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

BUG: When valid_comments is empty but invalid_comments is non-empty, the payload has "event": event (e.g., "COMMENT") but no "comments" key. The GitHub Reviews API requires that if you submit a review with event=COMMENT and no inline comments, the body must be non-empty. This works because body will contain the summary + invalid comments text. However, if summary is empty string and there are no invalid_comments either (all comments valid but empty list edge), the body could be empty, which GitHub rejects with 422. Consider adding a guard: if body is empty and no comments, skip the API call.

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

EDGE_CASE: When valid_comments is empty and summary (the response field from the artifact) is also empty string (which happens when there are inline comments — see line 240 of main.py), the body variable will only contain the invalid_comments section. But if inline_comments was non-empty yet all were invalid (none matched diff positions), body could end up being just "\n\n**Additional feedback:**\n..." with no summary prefix. More critically, if both summary and invalid_comments are empty and valid_comments is also empty, body is "" and the GitHub Reviews API will reject with 422. Consider adding a guard: if body is empty and there are no valid_comments, skip the review API call or set a default body.

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

EDGE_CASE: If summary is empty string (which happens when there are inline comments — see main.py line 240 where response = "") and all inline comments are invalid (none match diff positions), then valid_comments is empty, invalid_comments is non-empty, and body starts with "\n\n**Additional feedback:**\n...". But if somehow both summary is empty AND invalid_comments is empty AND valid_comments is empty (e.g., all comments were filtered out between analyze and act), body is "" and the GitHub Reviews API will reject with 422. Add a guard: if body is empty and valid_comments is empty, skip the API call or set a fallback body.

invalid_comments.append(ic)

body = summary
if invalid_comments:
body += "\n\n**Additional feedback:**\n"
for ic in invalid_comments:
line_ref = f":{ic['line']}" if ic.get('line') else ""
body += f"\n`{ic['file']}{line_ref}` — {ic['comment']}\n"

payload = {"body": body, "event": event}
if valid_comments:
body = summary
if invalid_comments:
body += "\n\n**Additional feedback:**\n"
for ic in invalid_comments:
line_ref = f":{ic['line']}" if ic.get('line') else ""
body += f"\n`{ic['file']}{line_ref}` — {ic['comment']}\n"
payload = {"body": body, "event": "REQUEST_CHANGES", "comments": valid_comments}
try:
resp = requests.post(
f"https://api.github.com/repos/{self._repo}/pulls/{number}/reviews",
headers=self._headers, json=payload, timeout=self._timeout,
)
if resp.status_code in (200, 201):
return True
logger.error(f"PR review API failed: {resp.status_code}, falling back to comment")
except Exception as e:
logger.error(f"PR review API failed: {e}, falling back to comment")

# Fallback: post all as regular comment
all_comments = inline_comments
payload["comments"] = valid_comments

try:
resp = requests.post(
f"https://api.github.com/repos/{self._repo}/pulls/{number}/reviews",
headers=self._headers, json=payload, timeout=self._timeout,
)
if resp.status_code in (200, 201):
return True
logger.error(f"PR review API failed: {resp.status_code}, falling back to comment")
logger.error(f"Response: {resp.text[:500]}")
except Exception as e:
logger.error(f"PR review API failed: {e}, falling back to comment")

# Fallback: post as regular comment if review API fails
body = summary
if all_comments:
if inline_comments:
body += "\n\n**Inline feedback:**\n"
for ic in all_comments:
for ic in inline_comments:
line_ref = f":{ic['line']}" if ic.get('line') else ""
body += f"\n`{ic['file']}{line_ref}` — {ic['comment']}\n"
return self._post(f"/repos/{self._repo}/issues/{number}/comments", {"body": body})
Expand Down
Loading
Loading