gpu.jsonc
{
  "$schema": "https://gpu-cli.sh/schema/v1/gpu.json",
  // Ollama Models Example
  // Run Open Source LLMs on a remote GPU with a Web UI and API endpoints
  //
  // Endpoints exposed:
  // - Port 11434: Ollama API (OpenAI-compatible + Anthropic Messages API)
  // - Port 8080: Lightweight Web UI for chatting with models
  //
  // To configure which models to pre-pull, edit models.json
  "project_id": "ollama-models",
  "provider": "runpod",
  "template": {
    "name": "Ollama Models",
    "description": "Run Open Source LLMs on cloud GPUs with Ollama + Web UI",
    "author": "gpu-cli"
  },
  // GPU selection - LLMs benefit from high VRAM (see the sizing note after this list)
  // Priority order: the first available type is tried first, then each fallback in turn
  "gpu_types": [
    { "type": "NVIDIA GeForce RTX 4090" }, // 24GB - great for 7B-13B models
    { "type": "NVIDIA A40" },              // 48GB - good for 30B models
    { "type": "NVIDIA L40S" },             // 48GB - newer, good availability
    { "type": "NVIDIA A100 80GB PCIe" }    // 80GB - for 70B+ models
  ],
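  // Rough sizing rule of thumb (an approximation, not part of the schema): a 4-bit-quantized
  // model needs on the order of half its parameter count in GB of VRAM (e.g. ~4-5 GB for 7B,
  // ~40 GB for 70B), plus headroom for the context/KV cache.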
"min_vram": 16,
"max_price": 1.50, // Max $/hr - enables auto-fallback to available GPUs within budget
// Disk space for models (glm-4.7-flash ~18GB + system overhead)
"workspace_size_gb": 50,
// Dual port forwarding - both ports accessible on localhost after gpu run
"ports": [11434, 8080],
  // Keep-alive time after last activity (pod stops after this idle period)
  // Set higher to allow model loading time (large models can take 5-15 minutes)
  "keep_alive_minutes": 20,
  // Startup launches Ollama server + Web UI
  "startup": "bash ./startup.sh",
  // Sync conversation database back to local machine
  // WAL checkpoint happens on shutdown for clean sync
  "outputs": ["data/conversations.db"],
  // Readiness hook - wait for Ollama API to be ready before marking pod as ready
  "hooks": {
    "readiness": {
      "type": "command",
      "name": "ollama-ready",
      "run": ["curl", "-sf", "http://localhost:11434/api/tags"],
      "retry_count": 30,
      "retry_delay_secs": 2,
      "timeout_secs": 10
    }
  },
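  // With these values the check polls /api/tags up to 30 times, 2s apart, with each
  // attempt capped at 10s, so the pod can spend a minute or more waiting on the
  // Ollama server before the hook gives up.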
"environment": {
"base_image": "runpod/pytorch:2.4.0-py3.11-cuda12.4.1-devel-ubuntu22.04",
"system": {
"apt": [
{ "name": "curl" },
{ "name": "jq" },
{ "name": "zstd" }
]
},
"shell": {
"steps": [
// Install Ollama (runs once during image build)
{
"run": "curl -fsSL https://ollama.com/install.sh | sh"
}
]
}
}
}