@@ -102,9 +102,9 @@ def detect_ml_context(self, text: str) -> list[DetectionResult]:
102102
103103 # Context patterns
104104 context_patterns = [
105- (r'(password|passwd|pwd)\s*[=:]\ s*["\']?([^"\'\s]{6,})' , 'password_context' ),
106- (r'(secret|key|token)\s*[=:]\s*["\']?([A-Za-z0-9!@#$%^&*]{8,})' , 'secret_context' ),
107- (r'(api[_-]?key)\s*[=:]\s*["\']?([A-Za-z0-9_-]{16,})' , 'api_key_context' ),
105+ (r'(password|passwd|pwd)\\ s*[=:] \\ s*["\']?([^"\'\ \s]{6,})' , 'password_context' ),
106+ (r'(secret|key|token)\\ s*[=:] \ \s*["\']?([A-Za-z0-9!@#$%^&*]{8,})' , 'secret_context' ),
107+ (r'(api[_-]?key)\\ s*[=:] \ \s*["\']?([A-Za-z0-9_-]{16,})' , 'api_key_context' ),
108108 ]
109109
110110 for pattern , ctx_type in context_patterns :
@@ -191,237 +191,41 @@ def _merge_results(self, results: list[DetectionResult]) -> list[DetectionResult
191191 merged .append (current )
192192 return merged
193193
194- def hybrid_anonymize (self , text : str ) -> tuple [str , list [DetectionResult ], dict ]:
195- """Anonymize using hybrid approach."""
196- detections = self .hybrid_detect (text )
197-
198- # Sort by position descending for replacement
199- sorted_detections = sorted (detections , key = lambda d : d .position [0 ], reverse = True )
200-
def sort_detections(self, detections: list[DetectionResult]) -> list[DetectionResult]:
    """Return a new list of detections ordered by start offset, highest first.

    Masks are spliced into the text by character offsets, so replacements
    have to run from the end of the text toward the beginning; otherwise an
    earlier substitution would shift the offsets of the later ones.

    Args:
        detections: Detections whose ``position[0]`` is the start offset.

    Returns:
        A new list sorted by descending start offset. The input list is not
        modified; detections sharing a start offset keep their input order.
    """
    return sorted(detections, key=lambda d: d.position[0], reverse=True)
197+
def create_anonymization_mask(self, detection: DetectionResult, index: int) -> str:
    """Build the placeholder token that replaces one detection in the text.

    Args:
        detection: The detection to mask. Its ``detected_by`` and
            ``pattern_type`` attributes are upper-cased into the token.
        index: Sequence number of the detection, zero-padded to four digits.

    Returns:
        ``[REGEX_<TYPE>_<NNNN>]`` when ``detected_by`` is ``'regex'``,
        otherwise ``[ML_<DETECTOR>_<TYPE>_<NNNN>]``.
    """
    pattern = detection.pattern_type.upper()
    if detection.detected_by == 'regex':
        return f"[REGEX_{pattern}_{index:04d}]"
    # Keep the pattern type in non-regex masks as well, so the placeholder
    # records what kind of value was removed, not only which detector found it.
    return f"[ML_{detection.detected_by.upper()}_{pattern}_{index:04d}]"
205+
206+ def perform_anonymization (self , text : str , detections : list [DetectionResult ]) -> tuple [str , dict ]:
207+ """Perform anonymization by replacing detected text with masks."""
201208 anonymized = text
202209 mapping = {}
203- stats = {'regex' : 0 , 'ml_entropy' : 0 , 'ml_context' : 0 , 'ml_semantic' : 0 }
204-
205- for i , detection in enumerate (sorted_detections ):
210+ for i , detection in enumerate (detections ):
206211 original = detection .text
207-
208- # Create mask
209- if detection .detected_by == 'regex' :
210- mask = f"[REGEX_{ detection .pattern_type .upper ()} _{ i :04d} ]"
211- else :
212- mask = f"[ML_{ detection .detected_by .upper ()} _{ detection .pattern_type .upper ()} _{ i :04d} ]"
213-
214- # Replace in text
212+ mask = self .create_anonymization_mask (detection , i )
215213 start , end = detection .position
216214 anonymized = anonymized [:start ] + mask + anonymized [end :]
217-
218215 mapping [mask ] = original
219- stats [detection .detected_by ] = stats .get (detection .detected_by , 0 ) + 1
220-
221- return anonymized , detections , mapping , stats
222-
223-
224- def create_test_scenarios () -> dict [str , str ]:
225- """Create test scenarios with various sensitive data."""
226-
227- return {
228- "scenario_1_mixed" : """
229- # Mixed sensitive data types
230- DATABASE_URL = "postgresql://admin:SuperSecret123!@db.internal.com:5432/myapp"
231- STRIPE_KEY = "sk_live_EXAMPLE_DUMMY_KEY_NOT_REAL"
232- AWS_KEY = "AKIAIOSFODNN7EXAMPLE"
233- email = "admin@company.com"
234- phone = "+1 555 123 4567"
235- password = "MyStr0ng!P@ssw0rd2024"
236- secret_token = "x9k#mP2$vL8@nQ4*wJ7&cR3^hF5(bN6"
237- encryption_key = "a1b2c3d4e5f6789012345678901234567890abcdef1234567890abcdef123456"
238- """ ,
239-
240- "scenario_2_code" : """
241- class PaymentService:
242- def __init__(self):
243- self.api_key = "sk_live_EXAMPLE_DUMMY_KEY_NOT_REAL"
244- self.db_password = "p@ssw0rd!#2024Secure"
245- self.jwt_secret = "eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9"
246- self.session_token = "a1b2c3d4e5f6789012345678901234567890abcdef"
247-
248- def connect(self):
249- conn_str = "postgresql://user:Secret123!@localhost/db"
250- return conn_str
251- """ ,
252-
253- "scenario_3_config" : """
254- [database]
255- host = prod-db-01.internal.company.com
256- password = AnotherSecretPassword456!
257- encryption_key = AES256-KEY-HERE-VERY-SECRET
258-
259- [api_keys]
260- stripe = sk_live_EXAMPLE_DUMMY_KEY_NOT_REAL
261- aws_access = AKIAIOSFODNN7EXAMPLE
262- aws_secret = wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY
263-
264- [tokens]
265- session = x9k#mP2$vL8@nQ4*wJ7&cR3^hF5(bN6
266- nonce = 7f83b1657ff1fc53b92dc18148a1d65dfc2d4b1fa3d677284addd200126d9069
267- """ ,
268-
269- "scenario_4_edge_cases" : """
270- # Edge cases that challenge detection
271- # 1. Password in URL
272- url = "https://user:pass123@api.example.com/data"
273-
274- # 2. High entropy but readable
275- readable_random = "correct-horse-battery-staple" # diceware style
276-
277- # 3. Base64 encoded secret
278- b64_secret = "d2Vha2J1dHN0cm9uZ3Bhc3N3b3JkMTIz"
279-
280- # 4. Very long random
281- long_random = "a1b2c3d4e5f6g7h8i9j0k1l2m3n4o5p6q7r8s9t0u1v2w3x4y5z6"
282-
283- # 5. Short but high entropy (should NOT be detected)
284- short_high = "aB3$"
285-
286- # 6. Normal text that might trigger
287- normal_text = "This is just a normal sentence with some words"
288- """ ,
289- }
290-
291-
292- def main ():
293- print ("=" * 80 )
294- print ("LLX Privacy: Hybrid ML + Regex Anonymization System" )
295- print ("=" * 80 )
296-
297- hybrid = HybridAnonymizer ()
298- scenarios = create_test_scenarios ()
299-
300- # Compare detection methods
301- print ("\n 1. DETECTION METHOD COMPARISON" )
302- print ("-" * 60 )
303-
304- for scenario_name , text in list (scenarios .items ())[:2 ]:
305- print (f"\n Scenario: { scenario_name } " )
306- print ("-" * 40 )
307-
308- # Regex only
309- regex_findings = hybrid .regex_anon .scan (text )
310- regex_count = sum (len (v ) for v in regex_findings .values ())
311- print (f"Regex detection: { regex_count } items" )
312-
313- # ML only
314- ml_results = hybrid .detect_ml_entropy (text ) + hybrid .detect_ml_context (text )
315- ml_count = len (ml_results )
316- print (f"ML detection: { ml_count } items" )
317-
318- # Hybrid
319- hybrid_results = hybrid .hybrid_detect (text )
320- print (f"Hybrid detection: { len (hybrid_results )} items" )
321-
322- # Show breakdown
323- by_method = {}
324- for r in hybrid_results :
325- by_method [r .detected_by ] = by_method .get (r .detected_by , 0 ) + 1
326-
327- print (" Breakdown:" , end = "" )
328- for method , count in by_method .items ():
329- print (f" { method } ={ count } " , end = "" )
330- print ()
331-
332- # Full anonymization example
333- print ("\n 2. FULL HYBRID ANONYMIZATION" )
334- print ("-" * 60 )
335-
336- test_text = scenarios ['scenario_1_mixed' ]
337-
338- print ("Original text:" )
339- print (test_text )
340-
341- anon_text , detections , mapping , stats = hybrid .hybrid_anonymize (test_text )
342-
343- print ("\n Anonymized text:" )
344- print (anon_text )
345-
346- print (f"\n Detection statistics:" )
347- for method , count in stats .items ():
348- print (f" { method } : { count } items" )
349-
350- print (f"\n Mapping ({ len (mapping )} items):" )
351- for mask , original in list (mapping .items ())[:5 ]:
352- print (f" { mask } ← { original } " )
353-
354- # Edge cases
355- print ("\n 3. EDGE CASE ANALYSIS" )
356- print ("-" * 60 )
357-
358- edge_cases = scenarios ['scenario_4_edge_cases' ]
359- edge_results = hybrid .hybrid_detect (edge_cases )
360-
361- print ("Edge case results:" )
362- for result in edge_results :
363- print (f" { result .pattern_type :<25} detected_by={ result .detected_by :<12} "
364- f"conf={ result .confidence :.2f} text='{ result .text [:30 ]} ...'" )
365-
366- # Project-level hybrid
367- print ("\n 4. PROJECT-LEVEL HYBRID ANONYMIZATION" )
368- print ("-" * 60 )
216+ return anonymized , mapping
369217
370- with tempfile .TemporaryDirectory () as tmpdir :
371- project_path = Path (tmpdir ) / "hybrid_project"
372- project_path .mkdir ()
373-
374- # Create project files
375- (project_path / "config.py" ).write_text (scenarios ['scenario_1_mixed' ])
376- (project_path / "services.py" ).write_text (scenarios ['scenario_2_code' ])
377- (project_path / "settings.ini" ).write_text (scenarios ['scenario_3_config' ])
378-
379- print (f"Created project with 3 files" )
380-
381- # Hybrid approach on project
382- ctx = AnonymizationContext (project_path = project_path )
383-
384- # First pass: regex-based via ProjectAnonymizer
385- project_anon = ProjectAnonymizer (ctx )
386- result = project_anon .anonymize_project ()
387-
388- # Second pass: ML-based on content
389- ml_hybrid = HybridAnonymizer ()
390-
391- total_ml_findings = 0
392- for file_path , content in result .files .items ():
393- if file_path .endswith (('.py' , '.ini' , '.txt' )):
394- ml_results = ml_hybrid .hybrid_detect (content )
395- total_ml_findings += len (ml_results )
396-
397- print (f"Project anonymization:" )
398- print (f" Regex-based: { len (ctx .variables )} variables, { len (ctx .functions )} functions" )
399- print (f" ML-based findings: { total_ml_findings } high-entropy/contextual items" )
400-
401- # Show combined result
402- sample_file = list (result .files .keys ())[0 ]
403- print (f"\n Sample output ({ sample_file } ):" )
404- print (result .files [sample_file ][:500 ])
405-
406- # Comparison table
407- print ("\n 5. METHOD COMPARISON SUMMARY" )
408- print ("-" * 60 )
409- print (f"{ 'Method' :<20} { 'Strengths' :<35} { 'Limitations' } " )
410- print ("-" * 80 )
411- print (f"{ 'Regex-only' :<20} { 'Known patterns, fast, precise' :<35} { 'Misses unknown/random strings' } " )
412- print (f"{ 'ML-entropy' :<20} { 'Random strings, high entropy' :<35} { 'May flag legitimate code' } " )
413- print (f"{ 'ML-context' :<20} { 'Contextual passwords' :<35} { 'Requires context analysis' } " )
414- print (f"{ 'Hybrid' :<20} { 'Maximum coverage, best of both' :<35} { 'Slightly more complex' } " )
def collect_anonymization_stats(self, detections: list[DetectionResult]) -> dict:
    """Count detections per detector.

    Args:
        detections: Detections whose ``detected_by`` names the detector.

    Returns:
        A dict mapping detector name to its number of detections. The keys
        ``'regex'``, ``'ml_entropy'``, ``'ml_context'`` and ``'ml_semantic'``
        are always present (zero when unused).
    """
    stats = {'regex': 0, 'ml_entropy': 0, 'ml_context': 0, 'ml_semantic': 0}
    for detection in detections:
        source = detection.detected_by
        # Tolerate detector names that are not pre-seeded above instead of
        # raising KeyError; they are added to the result on first sight.
        stats[source] = stats.get(source, 0) + 1
    return stats
415224
416- print ("\n " + "=" * 80 )
417- print ("Hybrid system advantages:" )
418- print (" ✓ Regex catches known patterns with high precision" )
419- print (" ✓ ML catches random passwords/keys regex misses" )
420- print (" ✓ Context analysis finds passwords in code patterns" )
421- print (" ✓ Merging avoids duplicate detections" )
422- print (" ✓ Best coverage for both structured and unstructured secrets" )
423- print ("=" * 80 )
424-
425-
426- if __name__ == "__main__" :
427- main ()
def hybrid_anonymize(self, text: str) -> tuple[str, list[DetectionResult], dict, dict]:
    """Detect sensitive spans in *text* and replace them with masks.

    Args:
        text: The text to anonymize.

    Returns:
        A 4-tuple ``(anonymized, detections, mapping, stats)``: the masked
        text, the detections as returned by ``hybrid_detect`` (unsorted),
        the mask -> original-text mapping produced by
        ``perform_anonymization``, and the per-detector counts produced by
        ``collect_anonymization_stats``.
    """
    detections = self.hybrid_detect(text)
    # Replacements are applied in the order given by sort_detections.
    ordered = self.sort_detections(detections)
    anonymized, mapping = self.perform_anonymization(text, ordered)
    stats = self.collect_anonymization_stats(detections)
    # Return the mapping too: it is the only record of which original text
    # each mask replaced, and callers unpack four values.
    return anonymized, detections, mapping, stats
0 commit comments