-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathgpu_memory_optimization.cpp
More file actions
98 lines (85 loc) · 4.48 KB
/
Copy pathgpu_memory_optimization.cpp
File metadata and controls
98 lines (85 loc) · 4.48 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
#include "model.h"
#include "logger.h"
#ifdef HAS_CUDA
#include <cuda_runtime.h>
#include "cuda_kernels.h"
void TinyLlamaModel::allocate_persistent_batch_buffers() {
if (d_persistent_batch_input_ != nullptr) {
Logger::info("[GPU_OPT] Persistent batch buffers already allocated");
return;
}
int hs = config_.hidden_size;
int is = config_.intermediate_size;
int n_kv_heads = config_.num_key_value_heads;
int head_dim = hs / config_.num_attention_heads;
size_t input_size = MAX_BATCH_TOKENS * hs * sizeof(float);
size_t norm_size = MAX_BATCH_TOKENS * hs * sizeof(float);
size_t residual_size = MAX_BATCH_TOKENS * hs * 2 * sizeof(float);
size_t q_size = MAX_BATCH_TOKENS * hs * sizeof(float);
size_t k_size = MAX_BATCH_TOKENS * n_kv_heads * head_dim * sizeof(float);
size_t v_size = MAX_BATCH_TOKENS * n_kv_heads * head_dim * sizeof(float);
size_t attn_out_size = MAX_BATCH_TOKENS * hs * sizeof(float);
size_t attn_proj_size = MAX_BATCH_TOKENS * hs * sizeof(float);
size_t gate_size = MAX_BATCH_TOKENS * is * sizeof(float);
size_t up_size = MAX_BATCH_TOKENS * is * sizeof(float);
size_t swiglu_size = MAX_BATCH_TOKENS * is * sizeof(float);
size_t mlp_down_size = MAX_BATCH_TOKENS * hs * sizeof(float);
size_t total_size = input_size + norm_size + residual_size + q_size + k_size + v_size +
attn_out_size + attn_proj_size + gate_size + up_size + swiglu_size + mlp_down_size;
Logger::info("[GPU_OPT] Allocating persistent batch buffers: " +
std::to_string(total_size / (1024*1024)) + " MB total");
gpuErrchk(cudaMalloc(&d_persistent_batch_input_, input_size));
gpuErrchk(cudaMalloc(&d_persistent_batch_norm_out_, norm_size));
gpuErrchk(cudaMalloc(&d_persistent_batch_residual_, residual_size));
gpuErrchk(cudaMalloc(&d_persistent_q_batch_, q_size));
gpuErrchk(cudaMalloc(&d_persistent_k_batch_, k_size));
gpuErrchk(cudaMalloc(&d_persistent_v_batch_, v_size));
gpuErrchk(cudaMalloc(&d_persistent_attn_output_, attn_out_size));
gpuErrchk(cudaMalloc(&d_persistent_attn_proj_out_, attn_proj_size));
gpuErrchk(cudaMalloc(&d_persistent_gate_proj_out_, gate_size));
gpuErrchk(cudaMalloc(&d_persistent_up_proj_out_, up_size));
gpuErrchk(cudaMalloc(&d_persistent_swiglu_out_, swiglu_size));
gpuErrchk(cudaMalloc(&d_persistent_mlp_down_out_, mlp_down_size));
Logger::info("[GPU_OPT] Successfully allocated persistent batch buffers");
}
void TinyLlamaModel::free_persistent_batch_buffers() {
if (d_persistent_batch_input_) {
Logger::info("[GPU_OPT] Freeing persistent batch buffers");
gpuErrchk(cudaFree(d_persistent_batch_input_));
gpuErrchk(cudaFree(d_persistent_batch_norm_out_));
gpuErrchk(cudaFree(d_persistent_batch_residual_));
gpuErrchk(cudaFree(d_persistent_q_batch_));
gpuErrchk(cudaFree(d_persistent_k_batch_));
gpuErrchk(cudaFree(d_persistent_v_batch_));
gpuErrchk(cudaFree(d_persistent_attn_output_));
gpuErrchk(cudaFree(d_persistent_attn_proj_out_));
gpuErrchk(cudaFree(d_persistent_gate_proj_out_));
gpuErrchk(cudaFree(d_persistent_up_proj_out_));
gpuErrchk(cudaFree(d_persistent_swiglu_out_));
gpuErrchk(cudaFree(d_persistent_mlp_down_out_));
d_persistent_batch_input_ = nullptr;
d_persistent_batch_norm_out_ = nullptr;
d_persistent_batch_residual_ = nullptr;
d_persistent_q_batch_ = nullptr;
d_persistent_k_batch_ = nullptr;
d_persistent_v_batch_ = nullptr;
d_persistent_attn_output_ = nullptr;
d_persistent_attn_proj_out_ = nullptr;
d_persistent_gate_proj_out_ = nullptr;
d_persistent_up_proj_out_ = nullptr;
d_persistent_swiglu_out_ = nullptr;
d_persistent_mlp_down_out_ = nullptr;
}
}
void TinyLlamaModel::resize_persistent_batch_buffers_if_needed(int required_batch_size) {
if (required_batch_size <= MAX_BATCH_TOKENS) {
if (d_persistent_batch_input_ == nullptr) {
allocate_persistent_batch_buffers();
}
return;
}
Logger::warning("[GPU_OPT] Required batch size " + std::to_string(required_batch_size) +
" exceeds MAX_BATCH_TOKENS " + std::to_string(MAX_BATCH_TOKENS) +
". This will cause memory reallocation per forward pass.");
}
#endif // HAS_CUDA