diff --git a/pyproject.toml b/pyproject.toml index e24371d..4306d2d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "sentinelguard" -version = "0.0.5" +version = "0.0.6" description = "A comprehensive, production-ready LLM security and guardrails framework" readme = "README.md" license = {text = "MIT"} @@ -32,6 +32,8 @@ dependencies = [ "pyyaml>=6.0", "tiktoken>=0.5.0", "regex>=2023.0", + # Secrets detection + "detect-secrets>=1.4.0", # PII detection (Presidio) "presidio-analyzer>=2.2.0", "presidio-anonymizer>=2.2.0", diff --git a/sentinelguard/__init__.py b/sentinelguard/__init__.py index 4895bd7..b8a8890 100644 --- a/sentinelguard/__init__.py +++ b/sentinelguard/__init__.py @@ -36,7 +36,7 @@ guard = SentinelGuard(config=config) """ -__version__ = "0.0.5" +__version__ = "0.0.6" __author__ = "SentinelGuard Contributors" from sentinelguard.core.guard import SentinelGuard diff --git a/sentinelguard/scanners/prompt/secrets.py b/sentinelguard/scanners/prompt/secrets.py index 8e061b4..23386be 100644 --- a/sentinelguard/scanners/prompt/secrets.py +++ b/sentinelguard/scanners/prompt/secrets.py @@ -1,20 +1,25 @@ """Secrets detection scanner. -Detects API keys, tokens, passwords, and other credentials in text -using three detection methods: -1. Vendor-specific patterns (AWS, GitHub, Stripe, etc.) -2. Generic keyword + value patterns (any password=, key=, token=, etc.) -3. High-entropy string detection (catches unknown/custom secrets) +Detects API keys, tokens, passwords, and other credentials in text. + +Detection methods (in order): +1. detect-secrets library (Yelp) — industry-standard secrets detection with + 20+ built-in plugins (AWS, GitHub, Stripe, high-entropy, keyword, etc.) +2. Vendor-specific regex patterns (fallback if detect-secrets unavailable) +3. Generic keyword + value patterns (password=, key=, token=, etc.) """ from __future__ import annotations +import logging import math import re from typing import Any, ClassVar, Dict, List, Optional from sentinelguard.core.scanner import PromptScanner, RiskLevel, ScanResult, register_scanner +logger = logging.getLogger(__name__) + # ── Vendor-specific patterns ── VENDOR_PATTERNS = { "aws_access_key": re.compile(r"(? float: class SecretsScanner(PromptScanner): """Detects API keys, tokens, passwords, and credentials. - Uses three detection methods: - 1. Vendor-specific patterns (AWS, GitHub, Stripe, etc.) - 2. Generic keyword patterns (password=, key=, token=, etc.) - 3. High-entropy string detection (catches unknown secrets) + Primary detection via detect-secrets (Yelp) with 20+ plugins. + Falls back to built-in regex patterns if detect-secrets is unavailable. Args: threshold: Score threshold (0.0-1.0). Default 0.5. @@ -145,47 +145,142 @@ def __init__( super().__init__(threshold=threshold, **kwargs) self.secret_types = secret_types self.detect_entropy = detect_entropy + self._detect_secrets_available = None + + def _try_detect_secrets(self, text: str) -> tuple[Dict[str, int], List[Dict[str, Any]]]: + """Use detect-secrets library for primary detection.""" + if self._detect_secrets_available is False: + return {}, [] + + try: + from detect_secrets.core.secrets_collection import SecretsCollection + from detect_secrets.settings import transient_settings + import tempfile + import os + except ImportError: + self._detect_secrets_available = False + logger.debug("detect-secrets not installed, using built-in patterns") + return {}, [] + + self._detect_secrets_available = True + found: Dict[str, int] = {} + matches: List[Dict[str, Any]] = [] + + # Write text to temp file for full-context scanning + temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".txt") + temp_file.write(text.encode("utf-8")) + temp_file.close() + + try: + secrets = SecretsCollection() + with transient_settings({"plugins_used": [ + {"name": "ArtifactoryDetector"}, + {"name": "AWSKeyDetector"}, + {"name": "AzureStorageKeyDetector"}, + {"name": "BasicAuthDetector"}, + {"name": "CloudantDetector"}, + {"name": "DiscordBotTokenDetector"}, + {"name": "GitHubTokenDetector"}, + {"name": "IbmCloudIamDetector"}, + {"name": "IbmCosHmacDetector"}, + {"name": "JwtTokenDetector"}, + {"name": "KeywordDetector"}, + {"name": "MailchimpDetector"}, + {"name": "NpmDetector"}, + {"name": "PrivateKeyDetector"}, + {"name": "SendGridDetector"}, + {"name": "SlackDetector"}, + {"name": "SoftlayerDetector"}, + {"name": "SquareOAuthDetector"}, + {"name": "StripeDetector"}, + {"name": "TwilioKeyDetector"}, + {"name": "Base64HighEntropyString", "limit": 4.5}, + {"name": "HexHighEntropyString", "limit": 3.0}, + ]}): + secrets.scan_file(str(temp_file.name)) + + for file in secrets.files: + for secret in secrets[file]: + if secret.secret_value is None: + continue + secret_val = secret.secret_value + if len(secret_val) < 6: + continue + + secret_type = secret.type + found[secret_type] = found.get(secret_type, 0) + 1 + + # Find position in original text + try: + val_start = text.index(secret_val) + except ValueError: + continue + matches.append({ + "type": secret_type, + "start": val_start, + "end": val_start + len(secret_val), + "text": secret_val, + }) + finally: + os.remove(temp_file.name) + + return found, matches def scan(self, text: str, **kwargs: Any) -> ScanResult: found_secrets: Dict[str, int] = {} - secret_matches: List[Dict[str, Any]] = [] # position + value info for sanitization + secret_matches: List[Dict[str, Any]] = [] + + # Method 1: detect-secrets library (primary) + ds_found, ds_matches = self._try_detect_secrets(text) + found_secrets.update(ds_found) + secret_matches.extend(ds_matches) - # Method 1: Vendor-specific patterns + # Method 2: Built-in vendor-specific patterns (supplement detect-secrets) for secret_type, pattern in VENDOR_PATTERNS.items(): if self.secret_types and secret_type not in self.secret_types: continue for match in pattern.finditer(text): - found_secrets[secret_type] = found_secrets.get(secret_type, 0) + 1 - secret_matches.append({ - "type": secret_type, - "start": match.start(), - "end": match.end(), - "text": match.group(0), - }) - - # Method 2: Generic keyword patterns + # Avoid duplicates with detect-secrets findings + match_text = match.group(0) + already_found = any( + m["text"] == match_text or + (m["start"] <= match.start() < m["end"]) + for m in secret_matches + ) + if not already_found: + found_secrets[secret_type] = found_secrets.get(secret_type, 0) + 1 + secret_matches.append({ + "type": secret_type, + "start": match.start(), + "end": match.end(), + "text": match.group(0), + }) + + # Method 3: Generic keyword patterns (supplement) for secret_type, pattern in KEYWORD_PATTERNS.items(): if self.secret_types and secret_type not in self.secret_types: continue for match in pattern.finditer(text): - found_secrets[secret_type] = found_secrets.get(secret_type, 0) + 1 - # For keyword patterns, the value is in group(1) or group(2) try: value = match.group(2) if match.lastindex and match.lastindex >= 2 else match.group(1) val_start = match.start() + match.group(0).index(value) + except (IndexError, ValueError): + value = match.group(0) + val_start = match.start() + # Avoid duplicates + already_found = any( + m["text"] == value or + (m["start"] <= val_start < m["end"]) + for m in secret_matches + ) + if not already_found: + found_secrets[secret_type] = found_secrets.get(secret_type, 0) + 1 secret_matches.append({ "type": secret_type, "start": val_start, "end": val_start + len(value), "text": value, }) - except (IndexError, ValueError): - secret_matches.append({ - "type": secret_type, - "start": match.start(), - "end": match.end(), - "text": match.group(0), - }) # Method 3: High-entropy string detection if self.detect_entropy: