-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy path02_semantic_cache.py
More file actions
45 lines (39 loc) · 1.63 KB
/
02_semantic_cache.py
File metadata and controls
45 lines (39 loc) · 1.63 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
import os
import time
from hyperion import HyperionClient
"""
Hyperion Semantic Cache Example
Demonstrates L2 vector-based semantic hit caching for conceptually similar queries.
"""
# NOTE(review): this triple-quoted string appears AFTER the imports, so Python
# treats it as a no-op expression statement, not the module docstring
# (module.__doc__ stays None). Consider moving it above the import block.

# Module-level gateway client shared by the demo below.
client = HyperionClient(
    # Presumably the client falls back to environment-based credentials when
    # no key is passed — TODO confirm against the hyperion SDK docs.
    #api_key="HYPERION_API_KEY"
)
def semantic_cache_demo():
    """Show the L2 semantic cache: a cold request, then a paraphrased repeat.

    Issues two chat completions whose prompts differ in wording but not in
    meaning, printing latency and the gateway's cache metadata for each so the
    miss/hit difference is visible.
    """
    # --- Request 1: nothing is cached yet, so this hits the upstream model. ---
    print("--- 1. First Request (Cache Miss) ---")
    started = time.perf_counter()
    first = client.chat.completions.create(
        model="openai/gpt-5.2",
        messages=[{"role": "user", "content": "How far away is jupiter from the earth?"}]
    )
    elapsed_ms = (time.perf_counter() - started) * 1000
    print(f"Response: {first.choices[0].message.content}")
    print(f"Latency: {elapsed_ms:.1f}ms")
    print(f"Cache Status: {first.hyperion.cache_status}")

    # Brief pause to allow async embedding storage to complete in the Gateway.
    time.sleep(1)

    print("\n--- 2. Conceptually Similar Request (Semantic Cache Hit) ---")
    # Same meaning, different word order: the gateway's vector embedder should
    # match the stored entry and serve the cached answer.
    # NOTE(review): this request names a different model (gpt-5.2-mini) than
    # request 1 — assumes the semantic cache matches across models; confirm.
    started = time.perf_counter()
    second = client.chat.completions.create(
        model="openai/gpt-5.2-mini",
        messages=[{"role": "user", "content": "How far away is the earth from jupiter?"}]
    )
    elapsed_ms = (time.perf_counter() - started) * 1000
    print(f"Response: {second.choices[0].message.content}")
    print(f"Latency: {elapsed_ms:.1f}ms")
    print(f"Cache Status: {second.hyperion.cache_status}")
    print(f"Cache Type: {second.hyperion.cache_type or 'None'}")
    print(f"Similarity Score: {second.hyperion.formatted_similarity_score}")
# Run the demo only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    semantic_cache_demo()