gpu.jsonc
{
  "$schema": "https://gpu-cli.sh/schema/v1/gpu.json",
  // Ollama Models Example
  // Run Open Source LLMs on a remote GPU with a Web UI and API endpoints
  //
  // Endpoints exposed:
  // - Port 11434: Ollama API (OpenAI-compatible + Anthropic Messages API)
  // - Port 8080: Lightweight Web UI for chatting with models
  //
  // To configure which models to pre-pull, edit models.json
  "project_id": "ollama-models",
  "provider": "runpod",
  "template": {
    "name": "Ollama Models",
    "description": "Run Open Source LLMs on cloud GPUs with Ollama + Web UI",
    "author": "gpu-cli"
  },
  // GPU selection - LLMs benefit from high VRAM (see the sizing note after this list)
  // Priority order: the first available type is tried first, then each fallback in turn
  "gpu_types": [
    { "type": "NVIDIA GeForce RTX 4090" }, // 24GB - great for 7B-13B models
    { "type": "NVIDIA A40" },              // 48GB - good for 30B models
    { "type": "NVIDIA L40S" },             // 48GB - newer, good availability
    { "type": "NVIDIA A100 80GB PCIe" }    // 80GB - for 70B+ models
  ],
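  // Rough sizing rule of thumb (an approximation, not part of the schema): a 4-bit-quantized
  // model needs on the order of half its parameter count in GB of VRAM (e.g. ~4-5 GB for 7B,
  // ~40 GB for 70B), plus headroom for the context/KV cache.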
"min_vram": 16,
"max_price": 1.50, // Max $/hr - enables auto-fallback to available GPUs within budget
// Disk space for models (glm-4.7-flash ~18GB + system overhead)
"workspace_size_gb": 50,
// Dual port forwarding - both ports accessible on localhost after gpu run
"ports": [11434, 8080],
  // Keep-alive time after last activity (pod stops after this idle period)
  // Set higher to allow model loading time (large models can take 5-15 minutes)
  "keep_alive_minutes": 20,
  // Startup launches Ollama server + Web UI
  "startup": "bash ./startup.sh",
  // Sync conversation database back to local machine
  // WAL checkpoint happens on shutdown for clean sync
  "outputs": ["data/conversations.db"],
  // Readiness hook - wait for Ollama API to be ready before marking pod as ready
  "hooks": {
    "readiness": {
      "type": "command",
      "name": "ollama-ready",
      "run": ["curl", "-sf", "http://localhost:11434/api/tags"],
      "retry_count": 30,
      "retry_delay_secs": 2,
      "timeout_secs": 10
    }
  },
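  // With these values the check polls /api/tags up to 30 times, 2s apart, with each
  // attempt capped at 10s, so the pod can spend a minute or more waiting on the
  // Ollama server before the hook gives up.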
"environment": {
"base_image": "runpod/pytorch:2.4.0-py3.11-cuda12.4.1-devel-ubuntu22.04",
"system": {
"apt": [
{ "name": "curl" },
{ "name": "jq" },
{ "name": "zstd" }
]
},
"shell": {
"steps": [
// Install Ollama (runs once during image build)
{
"run": "curl -fsSL https://ollama.com/install.sh | sh"
}
]
}
}
}