Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "sentinelguard"
version = "0.0.5"
version = "0.0.6"
description = "A comprehensive, production-ready LLM security and guardrails framework"
readme = "README.md"
license = {text = "MIT"}
Expand Down Expand Up @@ -32,6 +32,8 @@ dependencies = [
"pyyaml>=6.0",
"tiktoken>=0.5.0",
"regex>=2023.0",
# Secrets detection
"detect-secrets>=1.4.0",
# PII detection (Presidio)
"presidio-analyzer>=2.2.0",
"presidio-anonymizer>=2.2.0",
Expand Down
2 changes: 1 addition & 1 deletion sentinelguard/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
guard = SentinelGuard(config=config)
"""

__version__ = "0.0.5"
__version__ = "0.0.6"
__author__ = "SentinelGuard Contributors"

from sentinelguard.core.guard import SentinelGuard
Expand Down
163 changes: 129 additions & 34 deletions sentinelguard/scanners/prompt/secrets.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,25 @@
"""Secrets detection scanner.

Detects API keys, tokens, passwords, and other credentials in text
using three detection methods:
1. Vendor-specific patterns (AWS, GitHub, Stripe, etc.)
2. Generic keyword + value patterns (any password=, key=, token=, etc.)
3. High-entropy string detection (catches unknown/custom secrets)
Detects API keys, tokens, passwords, and other credentials in text.

Detection methods (in order):
1. detect-secrets library (Yelp) — industry-standard secrets detection with
20+ built-in plugins (AWS, GitHub, Stripe, high-entropy, keyword, etc.)
2. Vendor-specific regex patterns (always run; supplement detect-secrets findings)
3. Generic keyword + value patterns (password=, key=, token=, etc.)
"""

from __future__ import annotations

import logging
import math
import re
from typing import Any, ClassVar, Dict, List, Optional

from sentinelguard.core.scanner import PromptScanner, RiskLevel, ScanResult, register_scanner

logger = logging.getLogger(__name__)

# ── Vendor-specific patterns ──
VENDOR_PATTERNS = {
"aws_access_key": re.compile(r"(?<![A-Za-z0-9/+=])AKIA[0-9A-Z]{16}(?![A-Za-z0-9/+=])"),
Expand Down Expand Up @@ -43,7 +48,7 @@
# These match: keyword = value, keyword: value, keyword="value", etc.
KEYWORD_PATTERNS = {
"generic_password": re.compile(
r"(?i)(?:password|passwd|pwd|pass)\s*[:=]\s*['\"]?([^\s'\"]{4,})['\"]?"
r"(?i)(?:\w*password|\w*passwd|\w*pwd|\w*pass)\s*[:=]\s*['\"]?([^\s'\"]{4,})['\"]?"
),
"generic_api_key": re.compile(
r"(?i)(?:api[_-]?key|apikey|api[_-]?secret|api[_-]?token|access[_-]?key)\s*[:=]\s*['\"]?([A-Za-z0-9_\-./+=]{8,})['\"]?"
Expand All @@ -63,9 +68,6 @@
"bearer_header": re.compile(
r"(?i)(?:bearer|authorization)\s*[:=]?\s*['\"]?([A-Za-z0-9_\-./+=]{20,})['\"]?"
),
"database_password": re.compile(
r"(?i)(?:db[_-]?pass|db[_-]?password|database[_-]?password|mysql[_-]?pwd|pg[_-]?password)\s*[:=]\s*['\"]?([^\s'\"]{4,})['\"]?"
),
"encryption_key": re.compile(
r"(?i)(?:encrypt[_-]?key|encryption[_-]?key|aes[_-]?key|signing[_-]?key|hmac[_-]?key)\s*[:=]\s*['\"]?([A-Za-z0-9_\-./+=]{8,})['\"]?"
),
Expand All @@ -75,7 +77,7 @@
HIGH_SENSITIVITY = {
"aws_access_key", "aws_secret_key", "private_key",
"connection_string", "stripe_key", "azure_key",
"generic_password", "generic_secret", "database_password",
"generic_password", "generic_secret",
"encryption_key",
}
MEDIUM_SENSITIVITY = {
Expand Down Expand Up @@ -122,10 +124,8 @@ def _shannon_entropy(data: str) -> float:
class SecretsScanner(PromptScanner):
"""Detects API keys, tokens, passwords, and credentials.

Uses three detection methods:
1. Vendor-specific patterns (AWS, GitHub, Stripe, etc.)
2. Generic keyword patterns (password=, key=, token=, etc.)
3. High-entropy string detection (catches unknown secrets)
Primary detection via detect-secrets (Yelp) with 20+ plugins.
Built-in regex patterns always run as a supplement, so detection still works when detect-secrets is unavailable.

Args:
threshold: Score threshold (0.0-1.0). Default 0.5.
Expand All @@ -145,47 +145,142 @@ def __init__(
super().__init__(threshold=threshold, **kwargs)
self.secret_types = secret_types
self.detect_entropy = detect_entropy
self._detect_secrets_available = None

def _try_detect_secrets(self, text: str) -> tuple[Dict[str, int], List[Dict[str, Any]]]:
    """Scan ``text`` with the detect-secrets library, if it is installed.

    The text is written to a temporary file because the scan is done through
    ``SecretsCollection.scan_file``. The file is always removed afterwards,
    including when writing or scanning fails.

    Args:
        text: The prompt text to scan.

    Returns:
        A ``(found, matches)`` tuple. ``found`` maps each detect-secrets
        secret type to the number of reported secrets of that type.
        ``matches`` lists one dict per located occurrence with the keys
        ``"type"``, ``"start"``, ``"end"`` and ``"text"``, where the offsets
        index into ``text``. Both are empty when detect-secrets is not
        installed or ``text`` is empty.
    """
    # A previous call already established that the library is missing.
    if self._detect_secrets_available is False:
        return {}, []

    import os
    import tempfile

    try:
        from detect_secrets.core.secrets_collection import SecretsCollection
        from detect_secrets.settings import transient_settings
    except ImportError:
        self._detect_secrets_available = False
        logger.debug("detect-secrets not installed, using built-in patterns")
        return {}, []

    self._detect_secrets_available = True
    found: Dict[str, int] = {}
    matches: List[Dict[str, Any]] = []

    # Nothing to scan; skip the temp-file round trip.
    if not text:
        return found, matches

    plugins = [
        {"name": "ArtifactoryDetector"},
        {"name": "AWSKeyDetector"},
        {"name": "AzureStorageKeyDetector"},
        {"name": "BasicAuthDetector"},
        {"name": "CloudantDetector"},
        {"name": "DiscordBotTokenDetector"},
        {"name": "GitHubTokenDetector"},
        {"name": "IbmCloudIamDetector"},
        {"name": "IbmCosHmacDetector"},
        {"name": "JwtTokenDetector"},
        {"name": "KeywordDetector"},
        {"name": "MailchimpDetector"},
        {"name": "NpmDetector"},
        {"name": "PrivateKeyDetector"},
        {"name": "SendGridDetector"},
        {"name": "SlackDetector"},
        {"name": "SoftlayerDetector"},
        {"name": "SquareOAuthDetector"},
        {"name": "StripeDetector"},
        {"name": "TwilioKeyDetector"},
        {"name": "Base64HighEntropyString", "limit": 4.5},
        {"name": "HexHighEntropyString", "limit": 3.0},
    ]

    temp_path: Optional[str] = None
    try:
        # Create and write the file inside the try block so that a failed
        # write/encode cannot leave a file containing the prompt on disk.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".txt") as temp_file:
            temp_path = temp_file.name
            temp_file.write(text.encode("utf-8"))

        secrets = SecretsCollection()
        with transient_settings({"plugins_used": plugins}):
            secrets.scan_file(temp_path)

        # (type, start) pairs already recorded, so the same span is not
        # emitted twice for one secret type.
        seen: set = set()
        for file in secrets.files:
            for secret in secrets[file]:
                secret_val = secret.secret_value
                # Skip findings without a value and very short values.
                if secret_val is None or len(secret_val) < 6:
                    continue

                secret_type = secret.type
                found[secret_type] = found.get(secret_type, 0) + 1

                # Map the value back onto the original text. Record every
                # non-overlapping occurrence, not just the first, so that
                # repeated secrets also get a position.
                val_start = text.find(secret_val)
                while val_start != -1:
                    key = (secret_type, val_start)
                    if key not in seen:
                        seen.add(key)
                        matches.append({
                            "type": secret_type,
                            "start": val_start,
                            "end": val_start + len(secret_val),
                            "text": secret_val,
                        })
                    val_start = text.find(secret_val, val_start + len(secret_val))
    finally:
        if temp_path is not None:
            os.remove(temp_path)

    return found, matches

def scan(self, text: str, **kwargs: Any) -> ScanResult:
found_secrets: Dict[str, int] = {}
secret_matches: List[Dict[str, Any]] = [] # position + value info for sanitization
secret_matches: List[Dict[str, Any]] = []

# Method 1: detect-secrets library (primary)
ds_found, ds_matches = self._try_detect_secrets(text)
found_secrets.update(ds_found)
secret_matches.extend(ds_matches)

# Method 1: Vendor-specific patterns
# Method 2: Built-in vendor-specific patterns (supplement detect-secrets)
for secret_type, pattern in VENDOR_PATTERNS.items():
if self.secret_types and secret_type not in self.secret_types:
continue
for match in pattern.finditer(text):
found_secrets[secret_type] = found_secrets.get(secret_type, 0) + 1
secret_matches.append({
"type": secret_type,
"start": match.start(),
"end": match.end(),
"text": match.group(0),
})

# Method 2: Generic keyword patterns
# Avoid duplicates with detect-secrets findings
match_text = match.group(0)
already_found = any(
m["text"] == match_text or
(m["start"] <= match.start() < m["end"])
for m in secret_matches
)
if not already_found:
found_secrets[secret_type] = found_secrets.get(secret_type, 0) + 1
secret_matches.append({
"type": secret_type,
"start": match.start(),
"end": match.end(),
"text": match.group(0),
})

# Method 3: Generic keyword patterns (supplement)
for secret_type, pattern in KEYWORD_PATTERNS.items():
if self.secret_types and secret_type not in self.secret_types:
continue
for match in pattern.finditer(text):
found_secrets[secret_type] = found_secrets.get(secret_type, 0) + 1
# For keyword patterns, the value is in group(1) or group(2)
try:
value = match.group(2) if match.lastindex and match.lastindex >= 2 else match.group(1)
val_start = match.start() + match.group(0).index(value)
except (IndexError, ValueError):
value = match.group(0)
val_start = match.start()
# Avoid duplicates
already_found = any(
m["text"] == value or
(m["start"] <= val_start < m["end"])
for m in secret_matches
)
if not already_found:
found_secrets[secret_type] = found_secrets.get(secret_type, 0) + 1
secret_matches.append({
"type": secret_type,
"start": val_start,
"end": val_start + len(value),
"text": value,
})
except (IndexError, ValueError):
secret_matches.append({
"type": secret_type,
"start": match.start(),
"end": match.end(),
"text": match.group(0),
})

# Method 3: High-entropy string detection
if self.detect_entropy:
Expand Down
Loading