-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy path02_semantic_cache.py
More file actions
45 lines (39 loc) · 1.63 KB
/
02_semantic_cache.py
File metadata and controls
45 lines (39 loc) · 1.63 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
import os
import time
from hyperion import HyperionClient
"""
Hyperion Semantic Cache Example
Demonstrates L2 vector-based semantic hit caching for conceptually similar queries.
"""
# NOTE(review): this triple-quoted string appears AFTER the imports, so Python
# treats it as a no-op expression statement, not the module docstring
# (module.__doc__ stays None). Consider moving it above the import block.

# Module-level gateway client shared by the demo below.
client = HyperionClient(
    # Presumably the client falls back to environment-based credentials when
    # no key is passed — TODO confirm against the hyperion SDK docs.
    #api_key="HYPERION_API_KEY"
)
def semantic_cache_demo():
    """Show the L2 semantic cache: a cold request, then a paraphrased repeat.

    Issues two chat completions whose prompts differ in wording but not in
    meaning, printing latency and the gateway's cache metadata for each so the
    miss/hit difference is visible.
    """
    # --- Request 1: nothing is cached yet, so this hits the upstream model. ---
    print("--- 1. First Request (Cache Miss) ---")
    started = time.perf_counter()
    first = client.chat.completions.create(
        model="openai/gpt-5.2",
        messages=[{"role": "user", "content": "How far away is jupiter from the earth?"}]
    )
    elapsed_ms = (time.perf_counter() - started) * 1000
    print(f"Response: {first.choices[0].message.content}")
    print(f"Latency: {elapsed_ms:.1f}ms")
    print(f"Cache Status: {first.hyperion.cache_status}")

    # Brief pause to allow async embedding storage to complete in the Gateway.
    time.sleep(1)

    print("\n--- 2. Conceptually Similar Request (Semantic Cache Hit) ---")
    # Same meaning, different word order: the gateway's vector embedder should
    # match the stored entry and serve the cached answer.
    # NOTE(review): this request names a different model (gpt-5.2-mini) than
    # request 1 — assumes the semantic cache matches across models; confirm.
    started = time.perf_counter()
    second = client.chat.completions.create(
        model="openai/gpt-5.2-mini",
        messages=[{"role": "user", "content": "How far away is the earth from jupiter?"}]
    )
    elapsed_ms = (time.perf_counter() - started) * 1000
    print(f"Response: {second.choices[0].message.content}")
    print(f"Latency: {elapsed_ms:.1f}ms")
    print(f"Cache Status: {second.hyperion.cache_status}")
    print(f"Cache Type: {second.hyperion.cache_type or 'None'}")
    print(f"Similarity Score: {second.hyperion.formatted_similarity_score}")
# Run the demo only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    semantic_cache_demo()