diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 11d3e8a8167..96cac71c68a 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -688,7 +688,9 @@ extern "C" { void * extra; // extra things e.g. for ggml-cuda.cu - char padding[8]; + char padding[16]; + // add a struct ggml_tensor * named org_src, initialized to NULL, for keeping track of original source tensors in case of in-place operations + struct ggml_tensor * org_src; }; static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor); diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index 22c656996cc..7c01bf0b6bf 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -1124,8 +1124,11 @@ void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgra struct ggml_tensor * node = graph->nodes[i]; int * cur_backend_id = &tensor_backend_id(node); if (node->view_src != NULL && *cur_backend_id == -1) { - *cur_backend_id = tensor_backend_id(node->view_src); - SET_CAUSE(node, "4.vsrc"); + auto view_src_backend = tensor_backend_id(node->view_src); + if (view_src_backend != -1 && ggml_backend_supports_op(sched->backends[view_src_backend], node)) { + *cur_backend_id = tensor_backend_id(node->view_src); + SET_CAUSE(node, "4.vsrc"); + } } for (int j = 0; j < GGML_MAX_SRC; j++) { struct ggml_tensor * src = node->src[j]; @@ -1151,6 +1154,14 @@ void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgra GGML_ASSERT(*cur_backend_id != -1); } + // add the node id to the name for easier debugging + for (int i = 0; i < graph->n_nodes; i++) { + struct ggml_tensor * node = graph->nodes[i]; + char new_name[128]; + snprintf(new_name, sizeof(new_name), "%s#%d", node->name, i); + ggml_format_name(node, "%s", new_name); + } + // pass 5: split graph, find tensors that need to be copied { int i_split = 0; @@ -1171,7 +1182,9 @@ void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgra struct ggml_tensor * node = graph->nodes[i]; if (ggml_is_view_op(node->op)) { - continue; + if ((tensor_backend_id(node) != cur_backend_id) && (ggml_backend_supports_op(sched->backends[cur_backend_id], node))) { + tensor_backend_id(node) = cur_backend_id; + } } const int node_backend_id = tensor_backend_id(node); @@ -1269,6 +1282,7 @@ void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgra ggml_set_input(tensor_copy); ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor } + tensor_copy->org_src = src; tensor_id_copy(src_id, cur_backend_id, c) = tensor_copy; SET_CAUSE(tensor_copy, "4.cpy"); } diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 0938d2273e9..9753272038c 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -1,25 +1,19 @@ #include "ggml-decoder.h" -#include "ggml-backend-impl.h" -#include "ggml-backend.h" +#include "ggml-impl.h" #include "ggml-openvino-extra.h" #include "ggml-openvino.h" #include "ggml-quants.h" -#include -#include - #include #include #include #include #include -#include #include #include #include #include -#include #include #include #include @@ -31,12 +25,10 @@ #include #include #include -#include #include #include #include #include -#include #include GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph, @@ -45,6 +37,7 @@ GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph, std::map> & model_weights, bool is_static, bool is_stateful, + bool model_is_splitted, bool is_prefill, int prefill_chunk_size) : m_is_static(is_static), @@ -52,6 +45,7 @@ GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph, m_is_prefill(is_prefill), m_naive(false), m_prefill_chunk_size(prefill_chunk_size), + m_model_is_splitted(model_is_splitted), m_cgraph(cgraph), m_model_weights(model_weights), m_model_params(model_params), @@ -68,6 +62,7 @@ GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph, validate_cgraph(); set_input_output(); + compute_node_dynamic_dims(); compute_model_inputs(); compute_model_outputs(); @@ -157,23 +152,13 @@ int GgmlOvDecoder::compute_op_case(const ggml_tensor * node) const { if (src->ne[2] * src->ne[3] == node->ne[1]) { op_case = 5; } - } else if (src->ne[0] * src->ne[1] == node->ne[1]) { + } else if (src->ne[0] * src->ne[1] * src->ne[2] == node->ne[1]) { op_case = 3; } else if (src->ne[1] * src->ne[2] == node->ne[1]) { op_case = 6; } break; } - case GGML_OP_CONT: { - if (node->src[0]->op == GGML_OP_PERMUTE) { - op_case = 1; - } else if (node->src[0]->op == GGML_OP_TRANSPOSE) { - op_case = 2; - } else if (node->src[0]->op == GGML_OP_VIEW) { - op_case = 3; - } - break; - } case GGML_OP_PERMUTE: { if (node->src[0]->op != GGML_OP_VIEW) { op_case = 1; @@ -181,22 +166,40 @@ int GgmlOvDecoder::compute_op_case(const ggml_tensor * node) const { // kv cache tensor std::string src_name(node->view_src->name); int layer = extract_layer_from_name(src_name); - if (!is_swa_layer(layer)) { - op_case = 2; + if (ggml_is_contiguous(node->src[0])) { + // - 19: [ 64, 8, 256, 1] VIEW cache_k_l0 (view) [ 2, 128, 1024, 1048576] + // [ 512, 1024, 1, 1] 0: NONE cache_k_l0 [ 2, 1024, 1048576, 1048576] + // - 20: [ 64, 256, 8, 1] PERMUTE cache_k_l0 (view) (permuted) [ 2, 1024, 128, 1048576] + // [ 64, 8, 256, 1] 0: VIEW cache_k_l0 (view) [ 2, 128, 1024, 1048576] + if (!is_swa_layer(layer)) { + op_case = 3; + } else { + op_case = 4; + } } else { - op_case = 3; + // special case of cache v when `-fa off` + // - 17: [ 256, 8, 64, 1] VIEW cache_v_l0 (view) [ 2, 131072, 2048, 1048576] + // [ 512, 1024, 1, 1] 0: NONE cache_v_l0 [ 2, 1024, 1048576, 1048576] + // - 18: [ 256, 64, 8, 1] PERMUTE cache_v_l0 (view) (permuted) [ 2, 2048, 131072, 1048576] + // [ 256, 8, 64, 1] 0: VIEW cache_v_l0 (view) [ 2, 131072, 2048, 1048576] + if (!is_swa_layer(layer)) { + op_case = 5; + } else { + op_case = 6; + } } } else { // rope'ed query tensor - op_case = 4; + op_case = 2; } break; } case GGML_OP_MUL_MAT: { - if (node->src[0]->op == GGML_OP_CONT && node->src[0]->src[0]->op == GGML_OP_TRANSPOSE) { - op_case = 2; - } else if (node->src[0]->op == GGML_OP_VIEW && node->src[1]->op == GGML_OP_VIEW) { + if (node->src[0]->op == GGML_OP_VIEW && node->src[1]->op == GGML_OP_VIEW) { op_case = 3; + } else if (node->src[1]->op == GGML_OP_SOFT_MAX) { + // In the case of `-fa off`, softmax is used, v_trans=true, the dynamic dim is ne[0] for cache_v + op_case = 2; } break; } @@ -219,18 +222,39 @@ int GgmlOvDecoder::compute_op_case(const ggml_tensor * node) const { throw std::runtime_error("Unsupported VIEW case"); } op_case = 2; + if (m_model_is_splitted && m_model_inputs.find(std::string(src->name)) != m_model_inputs.end()) { + op_case = 0; + } } { auto * src = node->src[0]; - if ((ggml_nelements(node) != ggml_nelements(src)) && m_naive) { - // Compare each dimension of node and src, if only one dimension differs then op_case=3 + if (ggml_nelements(node) != ggml_nelements(src)) { + // Case 4: select one slice on src dim1 (via view offset), keep src dim2 as output dim1. + // Typical pattern: + // src: ne=[N, M, K, 1], nb=[b0, b1, b2, b3] + // dst: ne=[N, K, 1, 1], nb=[b0, b2, b3, b3] + if (node->ne[0] == src->ne[0] && + node->ne[1] == src->ne[2] && + node->ne[2] == 1 && + node->nb[0] == src->nb[0] && + node->nb[1] == src->nb[2] && + src->ne[1] > 1) { + op_case = 4; + break; + } + + // General case 3: shape differs from source (one or more dims) and is handled as VIEW slicing. int diff_count = 0; for (int i = 0; i < GGML_MAX_DIMS; i++) { if (node->ne[i] != src->ne[i]) { diff_count++; } + // if node ne[i] > src ne[i], case = 0 + if (node->ne[i] > src->ne[i]) { + return 0; + } } - if (diff_count == 1) { + if (diff_count >= 1) { op_case = 3; } } @@ -262,13 +286,20 @@ std::pair GgmlOvDecoder::compute_llm_params(ggml_cgr for (int i = 0; i < cgraph->n_nodes; i++) { auto * node = cgraph->nodes[i]; std::string name = std::string(node->name); - if (node->op == GGML_OP_FLASH_ATTN_EXT) { - model_params.n_heads = node->src[0]->ne[2]; - model_params.n_heads_kv = node->src[1]->ne[2]; - model_params.head_size = node->src[0]->ne[0]; + if (node->op == GGML_OP_FLASH_ATTN_EXT || (node->op == GGML_OP_SOFT_MAX && node->src[1] != nullptr)) { compute_params.input_len = node->src[0]->ne[1]; + auto * q_perm = node->src[0]; auto * cache_k_perm = node->src[1]; + if (node->op == GGML_OP_SOFT_MAX) { + q_perm = node->src[0]->src[1]; + cache_k_perm = node->src[0]->src[0]; + } + model_params.head_size = cache_k_perm->ne[0]; + model_params.n_heads_kv = cache_k_perm->ne[2]; + model_params.n_heads = q_perm->ne[2]; + compute_params.token_len_per_seq = q_perm->ne[1]; + if (cache_k_perm->op == GGML_OP_CPY) { cache_k_perm = cache_k_perm->src[0]; } @@ -278,7 +309,11 @@ std::pair GgmlOvDecoder::compute_llm_params(ggml_cgr auto * cache_k = cache_k_view->src[0]; int layer = extract_layer_from_name(cache_k->name); + auto * mask = node->src[3]; + if (node->op == GGML_OP_SOFT_MAX) { + mask = node->src[1]; + } std::string mask_name(mask->name); model_params.kv_buffer_ctx_id = ggml_backend_openvino_buffer_get_ctx_id(cache_k->buffer); @@ -295,7 +330,6 @@ std::pair GgmlOvDecoder::compute_llm_params(ggml_cgr size_t offset; memcpy(&offset, cache_k_view->op_params, sizeof(size_t)); compute_params.seq_active_start = offset / seq_size; - compute_params.token_len_per_seq = node->ne[2]; if (mask_name.find("swa") != std::string::npos) { compute_params.attention_size_swa = mask->ne[0]; @@ -307,7 +341,14 @@ std::pair GgmlOvDecoder::compute_llm_params(ggml_cgr compute_params.attention_size_swa = model_params.ctx_per_seq_swa; compute_params.token_len_per_seq = 1; } - break; + } + // if the node op is TRANSPOSE and its input is PERMUTE and the source of the PERMUTE is VIEW, then get the attention size with the TRANSPOSE node ne[0] (in case no GGML_OP_FLASH_ATTN_EXT) + if (node->op == GGML_OP_TRANSPOSE && node->src[0]->op == GGML_OP_PERMUTE && + node->src[0]->src[0]->op == GGML_OP_VIEW) { + compute_params.attention_size = node->ne[0]; + if (is_static) { + compute_params.attention_size = model_params.ctx_per_seq; + } } if (node->op == GGML_OP_ROPE) { memcpy(model_params.rope_params, node->op_params, sizeof(int32_t) * 15); @@ -330,7 +371,7 @@ void GgmlOvDecoder::validate_cgraph() const { } } -ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * op, const ggml_tensor * input) const { +ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * op, const ggml_tensor * input, int dynamic_dim_index) const { if (m_naive) { return input!= nullptr ? ov::PartialShape{get_shape(input)} : ov::PartialShape{get_shape(op)}; } @@ -381,6 +422,14 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * op, co } else { input_shape = ov::PartialShape{get_shape(input)}; } + if (dynamic_dim_index != -1 && m_model_is_splitted) { + input_shape[3 - dynamic_dim_index] = -1; + } + if (op->op == GGML_OP_SOFT_MAX && op->src[1] != nullptr && op->src[1]->op == GGML_OP_NONE && op->src[1]->flags & GGML_TENSOR_FLAG_INPUT && op->src[1] == input) { + // for softmax input mask, the shape is [1, 1, seq_active, seq_active], where seq_active is determined by the input active sequence length instead of the kv cache sequence length + input_shape[2] = -1; + input_shape[3] = -1; + } return input_shape; } @@ -443,7 +492,7 @@ void GgmlOvDecoder::compute_model_inputs() { if (m_model_weights.find(node_name) == m_model_weights.end()) { m_inputs[node_name] = node; auto param_node = - std::make_shared(get_ov_type(node), get_graph_input_shape(node, nullptr)); + std::make_shared(get_ov_type(node), get_graph_input_shape(node, nullptr, m_node_dynamic_dims[node])); param_node->set_friendly_name(node_name); param_node->output(0).get_tensor().set_names({node_name}); m_model_inputs[node_name] = param_node; @@ -487,7 +536,7 @@ void GgmlOvDecoder::compute_model_inputs() { m_model_params.kv_names.push_back(src_name); } } - ov::PartialShape param_shape = get_graph_input_shape(node, src); + ov::PartialShape param_shape = get_graph_input_shape(node, src, m_node_dynamic_dims[src]); auto param_node = std::make_shared(get_ov_type(src), param_shape); param_node->set_friendly_name(src_name); param_node->output(0).get_tensor().set_names({src_name}); @@ -573,9 +622,6 @@ std::map GgmlOvDecoder::get_kv_param_res_names() const } std::map> GgmlOvDecoder::create_weight_nodes(ggml_cgraph * cgraph, bool naive) { - static std::mutex weights_mutex; - std::lock_guard lock(weights_mutex); - std::map> model_weights; auto * nodes = cgraph->nodes; auto n_nodes = cgraph->n_nodes; @@ -875,6 +921,11 @@ ov::element::Type GgmlOvDecoder::get_output_type(const int node_idx) const { return get_ov_type(m_node_info_list[node_idx].node); } +std::vector GgmlOvDecoder::get_output_stride(int node_idx) const { + auto * ggml_tensor = m_node_info_list[node_idx].node; + return get_stride(ggml_tensor); +} + std::vector GgmlOvDecoder::get_output_names(int node_idx) const { return {m_node_info_list[node_idx].node_output_name}; } @@ -884,6 +935,14 @@ const std::string & GgmlOvDecoder::get_op_name() const { return unknown_name; } +int32_t GgmlOvDecoder::get_op_dynamic_dim(int node_idx) const { + auto it = m_node_dynamic_dims.find(m_node_info_list[node_idx].node); + if (it == m_node_dynamic_dims.end()) { + return -1; + } + return it->second; +} + const std::string & GgmlOvDecoder::get_op_name(int node_idx) const { return m_node_info_list[node_idx].node_name; } @@ -896,6 +955,10 @@ int32_t * GgmlOvDecoder::get_output_op_params(int node_idx) const { return m_node_info_list[node_idx].node->op_params; } +size_t GgmlOvDecoder::get_output_op_offset(int node_idx) const { + return m_node_info_list[node_idx].node->view_offs; +} + void GgmlOvDecoder::visit_subgraph(std::function, int node_idx)> node_visitor) const { for (int node_idx = 0; node_idx < m_cgraph->n_nodes; node_idx++) { if (m_cgraph->nodes[node_idx]->op == GGML_OP_NONE) { @@ -920,6 +983,7 @@ std::string GgmlOvDecoder::compute_op_type(const ggml_tensor * node) { {GGML_OP_PERMUTE, "GGML_OP_PERMUTE" }, {GGML_OP_RESHAPE, "GGML_OP_RESHAPE" }, {GGML_OP_RMS_NORM, "GGML_OP_RMS_NORM" }, + {GGML_OP_NORM, "GGML_OP_NORM" }, {GGML_OP_ROPE, "GGML_OP_ROPE" }, {GGML_OP_SCALE, "GGML_OP_SCALE" }, {GGML_OP_SOFT_MAX, "GGML_OP_SOFT_MAX" }, @@ -973,3 +1037,275 @@ const std::string & GgmlOvDecoder::get_op_type() const { static const std::string unknown_op = "UNKNOWN_GGML_OP"; return unknown_op; } + +void GgmlOvDecoder::compute_node_dynamic_dims() { + auto visit_node = [&](auto && self, ggml_tensor * node) -> void { + if (!node) { + return; + } + + if (node->op == GGML_OP_CPY) { + m_node_dynamic_dims[node] = -1; + } + + if (m_node_dynamic_dims.count(node)) { + return; + } + for (int i = 0; i < GGML_MAX_SRC; i++) { + ggml_tensor * src = node->src[i]; + if (src == nullptr) { + continue; + } + struct ggml_tensor *root_src = nullptr; + if (src->org_src) { + root_src = src->org_src; + } + if (root_src) { + if (is_inp_tok(root_src, node) || is_inp_pos(root_src, node) || + is_output_idx(root_src, node)) { + m_node_dynamic_dims[root_src] = 0; + m_node_dynamic_dims[src] = m_node_dynamic_dims[root_src]; + continue; + } + self(self, root_src); + m_node_dynamic_dims[src] = m_node_dynamic_dims[root_src]; + } else { + if (is_inp_tok(src, node) || is_inp_pos(src, node) || is_output_idx(src, node)) { + m_node_dynamic_dims[src] = 0; + continue; + } + if ( node->op == GGML_OP_VIEW && src->op == GGML_OP_NONE && !is_stateful() && !m_model_is_splitted) { + m_node_dynamic_dims[src] = 1; + continue; + } + self(self, src); + } + } + switch (node->op) { + case GGML_OP_NONE: + m_node_dynamic_dims[node] = -1; + break; + case GGML_OP_GET_ROWS: + m_node_dynamic_dims[node] = -1; + if (m_node_dynamic_dims[node->src[1]] != -1) { + auto dynamic_dim_idx = m_node_dynamic_dims[node->src[1]]; + auto dynamic_dim_value = node->src[1]->ne[dynamic_dim_idx]; + if (dynamic_dim_idx == 0) { + m_node_dynamic_dims[node] = 1; + } else { + auto dynamic_dim_stride = node->src[1]->nb[dynamic_dim_idx] / ggml_type_size(node->src[1]->type) * + ggml_type_size(node->src[0]->type); + for (int i = 0; i < GGML_MAX_DIMS; i++) { + if (dynamic_dim_stride == node->src[0]->nb[i]) { + m_node_dynamic_dims[node] = i; + break; + } + } + } + OPENVINO_ASSERT(dynamic_dim_value == node->ne[m_node_dynamic_dims[node]], + "Dynamic dim value mismatch for node: " + std::string(node->name) + + " and its src[1]: " + std::string(node->src[1]->name)); + } + break; + case GGML_OP_MUL: + case GGML_OP_MUL_MAT: + m_node_dynamic_dims[node] = -1; + if (m_node_dynamic_dims[node->src[0]] != -1) { + m_node_dynamic_dims[node] = m_node_dynamic_dims[node->src[0]]; + } + if (m_node_dynamic_dims[node->src[1]] != -1) { + m_node_dynamic_dims[node] = m_node_dynamic_dims[node->src[1]]; + } + break; + case GGML_OP_PERMUTE: + m_node_dynamic_dims[node] = -1; + if (m_node_dynamic_dims[node->src[0]] != -1) { + auto dynamic_dim_idx = m_node_dynamic_dims[node->src[0]]; + auto dynamic_dim_value = node->src[0]->ne[dynamic_dim_idx]; + for (int i = 0; i < GGML_MAX_DIMS; i++) { + if (node->op_params[i] == dynamic_dim_idx) { + m_node_dynamic_dims[node] = i; + break; + } + } + OPENVINO_ASSERT(dynamic_dim_value == node->ne[m_node_dynamic_dims[node]], + "Dynamic dim value mismatch for node: " + std::string(node->name) + + " and its src[0]: " + std::string(node->src[0]->name)); + } + break; + case GGML_OP_VIEW: { + // Use stride-based matching: the stride of a VIEW dimension directly + // encodes which source dimension it indexes into, so it uniquely + // identifies the dynamic dim even when two dims share the same size. + m_node_dynamic_dims[node] = -1; + if (m_node_dynamic_dims[node->src[0]] != -1) { + if (node->src[0]->op == GGML_OP_NONE && node->src[0]->org_src == nullptr) { + m_node_dynamic_dims[node] = m_node_dynamic_dims[node->src[0]]; + break; + } + auto dynamic_dim_idx = m_node_dynamic_dims[node->src[0]]; + auto dynamic_dim_value = node->src[0]->ne[dynamic_dim_idx]; + auto dynamic_dim_stride = + node->src[0]->nb[dynamic_dim_idx] / ggml_type_size(node->src[0]->type) * + ggml_type_size(node->type); + for (int i = 0; i < GGML_MAX_DIMS; i++) { + if (node->nb[i] == dynamic_dim_stride) { + m_node_dynamic_dims[node] = i; + break; + } + } + OPENVINO_ASSERT(m_node_dynamic_dims[node] != -1 && + dynamic_dim_value == node->ne[m_node_dynamic_dims[node]], + "Dynamic dim value mismatch for node: " + std::string(node->name) + + " and its src[0]: " + std::string(node->src[0]->name)); + } + break; + } + case GGML_OP_TRANSPOSE: + case GGML_OP_RESHAPE: { + // RESHAPE requires src[0] to be contiguous, so both src and result + // have standard compact strides: nb[i] = type_size * prod(ne[0..i-1]). + // Match src->nb[dynamic_dim] against result->nb[i] to find the output + // dimension whose flat-memory boundary aligns with the source dynamic + // boundary. This is unambiguous (result strides are strictly monotone) + // and handles merged-lower-dim cases that ne-value matching misses. + m_node_dynamic_dims[node] = -1; + if (m_node_dynamic_dims[node->src[0]] != -1) { + auto dynamic_dim_idx = m_node_dynamic_dims[node->src[0]]; + auto dynamic_dim_stride = node->src[0]->nb[dynamic_dim_idx]; + for (int i = 0; i < GGML_MAX_DIMS; i++) { + if (node->nb[i] == dynamic_dim_stride && node->ne[i] == node->src[0]->ne[dynamic_dim_idx]) { + m_node_dynamic_dims[node] = i; + break; + } + } + if (m_node_dynamic_dims[node] == -1) { + std::cout << "Cannot determine dynamic dim for RESHAPE node: " << node->name << std::endl; + } + } + break; + } + case GGML_OP_FLASH_ATTN_EXT: { + // Output shape is hard-coded in ggml_flash_attn_ext as: + // ne = { v->ne[0], q->ne[2], q->ne[1], q->ne[3] } + // i.e. output dim 0 <- v dim 0 (head_size, static) + // output dim 1 <- q dim 2 (n_heads, static) + // output dim 2 <- q dim 1 (n_tokens, potentially dynamic) + // output dim 3 <- q dim 3 (batch, static) + // Using the fixed q-dim -> output-dim mapping table. + // q is src[0]; the mapping from q's dynamic dim to the output dim is: + // q dim 1 -> output dim 2 + // q dim 2 -> output dim 1 + // q dim 3 -> output dim 3 + // q dim 0 -> output dim 0 (head_size axis, unlikely to be dynamic) + constexpr int q_to_out[GGML_MAX_DIMS] = { 0, 2, 1, 3 }; + m_node_dynamic_dims[node] = -1; + if (m_node_dynamic_dims[node->src[0]] != -1) { + auto q_dynamic_dim = m_node_dynamic_dims[node->src[0]]; + m_node_dynamic_dims[node] = q_to_out[q_dynamic_dim]; + } + break; + } + case GGML_OP_CONT: + m_node_dynamic_dims[node] = -1; + if (m_node_dynamic_dims[node->src[0]] != -1) { + auto dynamic_dim_idx = m_node_dynamic_dims[node->src[0]]; + if (ggml_are_same_shape(node, node->src[0])) { + m_node_dynamic_dims[node] = dynamic_dim_idx; + } else { + size_t src_logical_nb[GGML_MAX_DIMS]; + src_logical_nb[0] = ggml_type_size(node->src[0]->type); + src_logical_nb[1] = src_logical_nb[0] * + (node->src[0]->ne[0] / ggml_blck_size(node->src[0]->type)); + for (int i = 2; i < GGML_MAX_DIMS; i++) { + src_logical_nb[i] = src_logical_nb[i - 1] * node->src[0]->ne[i - 1]; + } + + auto dynamic_dim_stride = src_logical_nb[dynamic_dim_idx] / + ggml_type_size(node->src[0]->type) * + ggml_type_size(node->type); + int matched_dim_count = 0; + for (int i = 0; i < GGML_MAX_DIMS; i++) { + if (node->nb[i] == dynamic_dim_stride && node->ne[i] == node->src[0]->ne[dynamic_dim_idx]) { + m_node_dynamic_dims[node] = i; + matched_dim_count++; + } + } + + OPENVINO_ASSERT(matched_dim_count == 1, + "Cannot determine dynamic dim for CONT node: " + std::string(node->name)); + } + } + break; + case GGML_OP_RMS_NORM: + case GGML_OP_NORM: + case GGML_OP_ADD: + case GGML_OP_GLU: + case GGML_OP_ROPE: + case GGML_OP_SCALE: + case GGML_OP_SOFT_MAX: + case GGML_OP_ARGSORT: + case GGML_OP_ADD_ID: + case GGML_OP_UNARY: + m_node_dynamic_dims[node] = m_node_dynamic_dims[node->src[0]]; + break; + case GGML_OP_MUL_MAT_ID: + m_node_dynamic_dims[node] = m_node_dynamic_dims[node->src[1]]; + break; + case GGML_OP_CPY: + case GGML_OP_SET_ROWS: + m_node_dynamic_dims[node] = -1; + break; + default: + std::cout << "Doesn't handle node name: " << node->name << " op: " << ggml_op_name(node->op) << std::endl; + break; + } + }; + + for (int i = 0; i < m_cgraph->n_nodes; i++) { + ggml_tensor * node = m_cgraph->nodes[i]; + visit_node(visit_node, node); + } + + // print the nodes in m_cgraph name & shape with the dynamic dim (the dynamic dim is the dimension with -1 in m_node_dynamic_dims) for debugging + if (0) { + for (int i = 0; i < m_cgraph->n_nodes; i++) { + ggml_tensor * node = m_cgraph->nodes[i]; + int dynamic_dim = m_node_dynamic_dims[node]; + std::cout << "[" << i << "] " << "node_name: " << node->name << " op: " << ggml_op_name(node->op) + << " shape: ["; + for (int j = 0; j < 4; j++) { + if (j == dynamic_dim) { + std::cout << "*"; + } else { + std::cout << node->ne[j]; + } + if (j < 3) { + std::cout << ", "; + } + } + std::cout << "]" << std::endl; + // print the src name & shape with the dynamic dim for debugging + for (int j = 0; j < GGML_MAX_SRC; j++) { + ggml_tensor * src = node->src[j]; + if (src == nullptr) { + continue; + } + int src_dynamic_dim = m_node_dynamic_dims[src]; + std::cout << " [" << j << "] src_name: " << src->name << " ["; + for (int k = 0; k < 4; k++) { + if (k == src_dynamic_dim) { + std::cout << "*"; + } else { + std::cout << src->ne[k]; + } + if (k < 3) { + std::cout << ", "; + } + } + std::cout << "]" << std::endl; + } + std::cout << std::endl; + } + } +} diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index 3ae25ddda32..93e6973fb38 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -1,6 +1,7 @@ #pragma once -#include "ggml-quants.h" +#include "ggml-backend-impl.h" +#include "ggml-backend.h" #include "ggml.h" #include "openvino/decoder.h" @@ -9,7 +10,6 @@ #include #include #include -#include #include struct ModelParams { @@ -69,6 +69,7 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { std::map> & model_weights, bool is_static, bool is_stateful = false, + bool model_is_splitted = false, bool is_prefill = false, int prefill_chunk_size = 256); @@ -106,10 +107,14 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { virtual ov::element::Type get_output_type(int node_idx) const override; + virtual std::vector get_output_stride(int node_idx) const override; + virtual int32_t * get_input_op_params(int node_idx, const std::string & name) const override; virtual int32_t * get_output_op_params(int node_idx) const override; + virtual size_t get_output_op_offset(int node_idx) const override; + virtual std::vector get_output_names(int node_idx) const override; virtual const std::string & get_op_type() const override; @@ -120,6 +125,8 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { virtual const std::string & get_op_name(int node_idx) const override; + virtual int32_t get_op_dynamic_dim(int node_idx) const override; + virtual void visit_subgraph(std::function, int node_idx)> node_visitor) const override; ggml_tensor * get_input_ggml_tensor(const std::string & name) const { return m_inputs.at(name); } @@ -175,7 +182,11 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { virtual bool is_stateful() const override { return m_is_stateful; } - ov::PartialShape get_graph_input_shape(const ggml_tensor * op, const ggml_tensor * input) const; + virtual bool is_splited_model() const override { + return m_model_is_splitted; + } + + ov::PartialShape get_graph_input_shape(const ggml_tensor * op, const ggml_tensor * input, int dynamic_dim_index=-1) const; static void dump_cgraph(const ggml_cgraph * cgraph, std::string & filename); @@ -205,6 +216,7 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { bool m_is_prefill = false; bool m_naive = false; int m_prefill_chunk_size = 0; + bool m_model_is_splitted = false; // label the cgraph is splited or not static ov::Shape get_shape(const ggml_tensor * tensor); static std::vector get_stride(const ggml_tensor * tensor); @@ -215,7 +227,7 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { void update_io(ggml_cgraph * cgraph); inline static bool is_inp_tok(const ggml_tensor * tensor, const ggml_tensor * op) { - return op->op == GGML_OP_GET_ROWS && tensor == op->src[1] && op->src[0]->op == GGML_OP_NONE; + return op->op == GGML_OP_GET_ROWS && tensor == op->src[1] && op->src[0]->op == GGML_OP_NONE && op->src[0]->org_src == nullptr; } inline static bool is_inp_pos(const ggml_tensor * tensor, const ggml_tensor * op) { @@ -227,7 +239,8 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { } inline static bool is_inp_mask(const ggml_tensor * tensor, const ggml_tensor * op) { - return op->op == GGML_OP_CPY || (op->op == GGML_OP_FLASH_ATTN_EXT && tensor == op->src[3]); + return op->op == GGML_OP_CPY || (op->op == GGML_OP_FLASH_ATTN_EXT && tensor == op->src[3]) || + (op->op == GGML_OP_SOFT_MAX && tensor == op->src[1]); } inline static bool is_rope_freqs_weight(const ggml_tensor * tensor, const ggml_tensor * op) { @@ -235,7 +248,8 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { } inline static bool is_kvcache(const ggml_tensor * tensor, const ggml_tensor * op) { - return op->op == GGML_OP_SET_ROWS && op->src[2] == tensor; + return (op->op == GGML_OP_SET_ROWS && op->src[2] == tensor) || + tensor->buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY; } inline static bool is_kv_idx(const ggml_tensor * tensor, const ggml_tensor * op) { @@ -256,9 +270,6 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { if (is_inp_emb(tensor, op)) { return "embd"; } - if (is_output_idx(tensor, op)) { - return "inp_out_ids"; - } if (is_inp_mask(tensor, op)) { return std::string(tensor->name).find("swa") == std::string::npos ? "self_kq_mask" : "self_kq_mask_swa"; } @@ -272,6 +283,9 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { void compute_model_inputs(); void compute_model_outputs(); + // Infer and propagate dynamic-dimension indices for all tensors in the GGML graph. + void compute_node_dynamic_dims(); + void validate_cgraph() const; ggml_cgraph * m_cgraph = nullptr; @@ -284,6 +298,7 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { std::map m_model_outputs; std::vector m_model_output_names; std::vector m_node_info_list; + std::map m_node_dynamic_dims; ModelParams m_model_params; ComputeParams m_compute_params; diff --git a/ggml/src/ggml-openvino/ggml-openvino-extra.cpp b/ggml/src/ggml-openvino/ggml-openvino-extra.cpp index cc3cb4583cd..4140136aca2 100644 --- a/ggml/src/ggml-openvino/ggml-openvino-extra.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino-extra.cpp @@ -6,6 +6,7 @@ #include #include #include +#include #include ov::Core & ov_singleton_core() { @@ -42,11 +43,13 @@ void ggml_openvino_device_config::init() { {"NPUW_DQ", "YES" }, {"NPUW_DQ_FULL", "NO" }, }; - if (cache_dir) { + if (cache_dir && strlen(cache_dir) > 0) { compile_config["NPUW_CACHE_DIR"] = cache_dir; + compile_config.insert(ov::cache_mode(ov::CacheMode::OPTIMIZE_SIZE)); } - } else if (cache_dir) { - ov_singleton_core().set_property(ov::cache_dir(cache_dir)); + } else if (cache_dir && strlen(cache_dir) > 0) { + compile_config.insert(ov::cache_dir(cache_dir)); + compile_config.insert(ov::cache_mode(ov::CacheMode::OPTIMIZE_SIZE)); } // Initialize remote context with queue sharing for GPU @@ -259,10 +262,12 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_ten layout.weights_size = layout.is_u4 ? (n_elements / 2) : n_elements; int64_t n_blocks = n_elements / layout.weights_per_block; layout.scales_size = n_blocks * sizeof(uint16_t); - // For symmetric quantization, we only need one zp value (not one per block) - // Zero points are stored in U4 or U8 format matching the weight type - size_t n_zp_elements = layout.is_symmetric ? 1 : n_blocks; - layout.zp_size = layout.is_u4 ? ((n_zp_elements + 1) / 2) : n_zp_elements; + // For symmetric quantization, no zp needed (weights stored as signed) + if (layout.is_symmetric) { + layout.zp_size = 0; + } else { + layout.zp_size = layout.is_u4 ? ((n_blocks + 1) / 2) : n_blocks; + } layout.weights_offset = 0; layout.scales_offset = ((layout.weights_size + alignment - 1) / alignment) * alignment; @@ -313,10 +318,12 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_ten // Scales: F16 per block int64_t n_blocks = n_elements / layout.weights_per_block; layout.scales_size = n_blocks * sizeof(uint16_t); // F16 = 2 bytes - // Zero points: U4 or U8 matching weight type - // For symmetric quantization, we only need one zp value (not one per block) - size_t n_zp_elements = layout.is_symmetric ? 1 : n_blocks; - layout.zp_size = layout.is_u4 ? ((n_zp_elements + 1) / 2) : n_zp_elements; + // For symmetric quantization, no zp needed (weights stored as signed) + if (layout.is_symmetric) { + layout.zp_size = 0; + } else { + layout.zp_size = layout.is_u4 ? ((n_blocks + 1) / 2) : n_blocks; + } // Layout in buffer: [weights | scales | zp] with alignment layout.weights_offset = 0; diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index b3058b4af73..e9ff724042d 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -145,13 +145,18 @@ static void * ggml_backend_openvino_buffer_get_base(ggml_backend_buffer_t buffer return ctx->data; } +static bool is_stateful_enabled() { + static const auto * stateful = getenv("GGML_OPENVINO_STATEFUL_EXECUTION"); + return stateful && strcmp(stateful, "1") == 0; +} + static enum ggml_status ggml_backend_openvino_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { // GGML_LOG_DEBUG("%s: buffer usage=%d, tensor name=%s\n", __func__, buffer->usage, tensor->name); ggml_backend_openvino_buffer_context * ctx = (ggml_backend_openvino_buffer_context *) buffer->context; // Put kvcache on device memory for GPU (NPU memory is too small even for kvcache) if (strncmp(tensor->name, "cache_", 6) == 0 && !ctx->is_remote && ggml_openvino_get_device_name() == "GPU" && - !getenv("GGML_OPENVINO_STATEFUL_EXECUTION")) { + !is_stateful_enabled()) { GGML_ASSERT(ctx->tensor_extras.empty()); auto device = ctx->device; auto size = ctx->size; @@ -666,7 +671,7 @@ GGML_BACKEND_API ggml_backend_t ggml_backend_openvino_init(int device) { std::shared_ptr r_ctx = std::static_pointer_cast(ctx->runtime_context); r_ctx->device = ggml_openvino_get_device_name(); - r_ctx->stateful = getenv("GGML_OPENVINO_STATEFUL_EXECUTION") && !ggml_openvino_is_npu(); + r_ctx->stateful = is_stateful_enabled() && !ggml_openvino_is_npu(); ggml_backend_t openvino_backend = new ggml_backend{ /* .guid = */ ggml_backend_openvino_guid(), @@ -802,15 +807,6 @@ static bool is_op_unsupported_case(const ggml_tensor * op) { // GGML_LOG_WARN("OpenVINO backend does not support SOFT_MAX with sinks\n"); return true; } - float scale = 1.0f; - float max_bias = 0.0f; - const auto * op_params = op->op_params; - memcpy(&scale, (const float *) op_params + 0, sizeof(float)); - memcpy(&max_bias, (const float *) op_params + 1, sizeof(float)); - if (max_bias > 0) { - // GGML_LOG_WARN("OpenVINO backend does not support SOFT_MAX with max_bias > 0\n"); - return true; - } break; } case GGML_OP_FLASH_ATTN_EXT: { @@ -862,9 +858,6 @@ static bool is_op_unsupported_case(const ggml_tensor * op) { if (op->src[0]->ne[3] != op->src[1]->ne[3] && op->src[0]->ne[3] != 1 && op->src[1]->ne[3] != 1) { return true; } - if (op->src[0]->op == GGML_OP_PERMUTE || op->src[1]->op == GGML_OP_PERMUTE) { - return true; - } if (ggml_is_quantized(op->src[0]->type) && op->src[0]->ne[1] == 1) { // MUL_MAT(type_a=q4_0,type_b=f32,m=1,n=2048,k=8192,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1) // triggers a bug in ov matmul_shape_inference.hpp @@ -879,7 +872,7 @@ static bool is_op_unsupported_case(const ggml_tensor * op) { const int32_t * op_params = op->op_params; const int n_dims = op_params[1]; const int mode = op_params[2]; - if (mode != GGML_ROPE_TYPE_NORMAL && mode != GGML_ROPE_TYPE_NEOX) { + if (mode != GGML_ROPE_TYPE_NORMAL && mode != GGML_ROPE_TYPE_NEOX && mode != GGML_ROPE_TYPE_IMROPE) { // GGML_LOG_WARN("OpenVINO backend does not support ROPE with mode %d\n", mode); return true; } @@ -888,18 +881,10 @@ static bool is_op_unsupported_case(const ggml_tensor * op) { // op->src[0]->ne[0]); return true; } - if (op->type != GGML_TYPE_F32) { + if (op->type != GGML_TYPE_F32 && op->type != GGML_TYPE_F16) { // GGML_LOG_WARN("OpenVINO backend does not support ROPE with type %s\n", ggml_type_name(op->type)); return true; } - float freq_scale; - float ext_factor; - memcpy(&freq_scale, op_params + 6, sizeof(float)); - memcpy(&ext_factor, op_params + 7, sizeof(float)); - if (ext_factor != 0.0f) { - // GGML_LOG_WARN("OpenVINO backend does not support ROPE with ext_factor %f != 0.0f\n", ext_factor); - return true; - } if (op->src[0]->op == GGML_OP_VIEW) { if (op->src[0]->view_src->ne[1] != op->src[0]->ne[2]) { // GGML_LOG_WARN( @@ -909,6 +894,20 @@ static bool is_op_unsupported_case(const ggml_tensor * op) { return true; } } + if (mode == GGML_ROPE_TYPE_IMROPE && + (op->src[2] != 0 || ((const float *) op_params)[6] != 1 || ((const float *) op_params)[7] != 0 || + ((const float *) op_params)[8] != 1)) { + // GGML_LOG_WARN("OpenVINO backend does not support IMROPE with freq_factors, freq_scale, ext_factor, and attn_factor\n"); + return true; + } + break; + } + case GGML_OP_TRANSPOSE: { + // if the type is bf16, will return true + if (op->type == GGML_TYPE_BF16) { + // GGML_LOG_WARN("OpenVINO backend does not support CONT with BF16 type\n"); + return true; + } break; } default: @@ -932,13 +931,14 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con GGML_TYPE_Q5_K, GGML_TYPE_Q8_0, GGML_TYPE_Q6_K}; static const std::set supported_ops{GGML_OP_NONE, GGML_OP_ADD, GGML_OP_MUL, GGML_OP_MUL_MAT, GGML_OP_VIEW, - /*GGML_OP_CONT,*/ GGML_OP_RESHAPE, GGML_OP_PERMUTE, GGML_OP_TRANSPOSE, - GGML_OP_GET_ROWS, GGML_OP_ROPE, GGML_OP_RMS_NORM, GGML_OP_SCALE, - // softmax is not updated due to replaced by flash_attn_ext - // GGML_OP_SOFT_MAX, + GGML_OP_CONT, GGML_OP_RESHAPE, GGML_OP_PERMUTE, GGML_OP_TRANSPOSE, + GGML_OP_GET_ROWS, GGML_OP_ROPE, GGML_OP_RMS_NORM, GGML_OP_SCALE, GGML_OP_NORM, + GGML_OP_SOFT_MAX, GGML_OP_SET_ROWS, GGML_OP_FLASH_ATTN_EXT, GGML_OP_CPY}; static const std::set supported_unary_ops{ + GGML_UNARY_OP_GELU, GGML_UNARY_OP_SILU, + GGML_UNARY_OP_TANH, }; static const std::set supported_glu_ops{ GGML_GLU_OP_SWIGLU, diff --git a/ggml/src/ggml-openvino/ggml-quants.cpp b/ggml/src/ggml-openvino/ggml-quants.cpp index dbf38646ddd..57d66df4f01 100644 --- a/ggml/src/ggml-openvino/ggml-quants.cpp +++ b/ggml/src/ggml-openvino/ggml-quants.cpp @@ -46,6 +46,7 @@ void unpack_32_4(const uint8_t * data, uint8_t * dst) { // Extracts (weight, scales, zp) from Q4_0 tensors. // Data layout is: |16 bit scale|32 x 4bit weights|. +// When zp_arr is empty (symmetric), weights are stored as signed i4 (value - 8). void extract_q4_0_data(const ggml_tensor * tensor, ov::Tensor & weights_arr, ov::Tensor & scales_arr, @@ -55,28 +56,32 @@ void extract_q4_0_data(const ggml_tensor * tensor, auto * data = static_cast(tensor->data); auto * weights = static_cast(weights_arr.data()); auto * scales = scales_arr.data::value_type>(); - auto * zp = static_cast(zp_arr.data()); - - bool is_scalar_zp = (zp_arr.get_size() == 1); // Symmetric quantization - // For Q4_0, zero point is always 8 - if (is_scalar_zp) { - zp[0] = 8 | (8 << 4); // Pack two 4-bit values - } + bool is_symmetric = (weights_arr.get_element_type() == ov::element::i4); // Signed i4 path - ov::parallel_for(scales_arr.get_size(), [&](size_t i) { - scales[i] = ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block))); - // For asymmetric quantization, compute per-block zero points - if (!is_scalar_zp) { + if (!is_symmetric) { + auto * zp = static_cast(zp_arr.data()); + ov::parallel_for(scales_arr.get_size(), [&](size_t i) { + scales[i] = ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block))); // Pack two 4-bit zero points per byte if (i % 2 == 0) { zp[i / 2] = 8; // Lower nibble } else { zp[i / 2] |= (8 << 4); // Upper nibble } - } - unpack_32_4(data + i * bytes_per_block + 2, weights + i * 16); - }); + unpack_32_4(data + i * bytes_per_block + 2, weights + i * 16); + }); + } else { + // Symmetric: unpack as u4 then convert to i4 by subtracting 8 (XOR each nibble) + ov::parallel_for(scales_arr.get_size(), [&](size_t i) { + scales[i] = ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block))); + unpack_32_4(data + i * bytes_per_block + 2, weights + i * 16); + // Convert u4 to i4: subtract 8 from each nibble. XOR 0x88 flips each nibble by 8. + for (int j = 0; j < 16; ++j) { + weights[i * 16 + j] ^= 0x88; + } + }); + } } // Extracts (weight, scales, zp) from Q4_1 tensors. @@ -123,6 +128,7 @@ void extract_q4_1_data(const ggml_tensor * tensor, // Extracts (weight, scales, zp) from Q8_0 tensors. // Data layout is: |16 bit scale|32 x 8bit weights|. +// When zp_arr is empty (symmetric), weights are stored as signed i8 directly. void extract_q8_0_data(const ggml_tensor * tensor, ov::Tensor & weights_arr, ov::Tensor & scales_arr, @@ -133,29 +139,30 @@ void extract_q8_0_data(const ggml_tensor * tensor, auto * data = static_cast(tensor->data); auto * weights = static_cast(weights_arr.data()); auto * scales = scales_arr.data::value_type>(); - auto * zp = static_cast(zp_arr.data()); - - bool is_scalar_zp = (zp_arr.get_size() == 1); // Symmetric quantization - // For Q8_0, zero point is always 128 - if (is_scalar_zp) { - zp[0] = 128; - } + bool is_symmetric = (weights_arr.get_element_type() == ov::element::i8); // Signed i8 path - ov::parallel_for(scales_arr.get_size(), [&](size_t i) { - uint8_t * block_data = data + i * bytes_per_block; - scales[i] = ov::float16::from_bits(*(uint16_t *) block_data); - // For asymmetric quantization, store per-block zero points - if (!is_scalar_zp) { + if (!is_symmetric) { + auto * zp = static_cast(zp_arr.data()); + ov::parallel_for(scales_arr.get_size(), [&](size_t i) { + uint8_t * block_data = data + i * bytes_per_block; + scales[i] = ov::float16::from_bits(*(uint16_t *) block_data); zp[i] = 128; - } - for (size_t j = 0; j < weights_per_block; ++j) { - uint8_t x = block_data[j + 2]; // j+2 to skip the scale bytes. - // Original data is in int8_t, so we add a bias of -128 and invert the first bit. - x ^= 1 << 7; - weights[i * weights_per_block + j] = x; - } - }); + for (size_t j = 0; j < weights_per_block; ++j) { + uint8_t x = block_data[j + 2]; + x ^= 1 << 7; // Convert int8 to uint8 by flipping sign bit + weights[i * weights_per_block + j] = x; + } + }); + } else { + // Symmetric: store original int8 values directly (no unsigned bias) + ov::parallel_for(scales_arr.get_size(), [&](size_t i) { + uint8_t * block_data = data + i * bytes_per_block; + scales[i] = ov::float16::from_bits(*(uint16_t *) block_data); + // Copy int8 weights as-is (the tensor element type is i8) + memcpy(weights + i * weights_per_block, block_data + 2, weights_per_block); + }); + } } void unpack_256_4(const uint8_t * data, uint8_t * dst) { @@ -256,44 +263,62 @@ void extract_q6_k_data(const ggml_tensor * tensor, auto * data = static_cast(tensor->data); auto * weights = static_cast(weights_arr.data()); auto * scales = scales_arr.data::value_type>(); - auto * zp = static_cast(zp_arr.data()); - - bool is_scalar_zp = (zp_arr.get_size() == 1); // Symmetric quantization - - // For Q6_K, zero point is always 32 - if (is_scalar_zp) { - zp[0] = 32; - } - - ov::parallel_for(n_super_block, [&](size_t i) { - uint8_t * block_data = data + i * bytes_per_block; - float scale_factor = - static_cast(ov::float16::from_bits(*((uint16_t *) block_data + 104))); // (128+64+16)/2 + bool is_symmetric = (weights_arr.get_element_type() == ov::element::i8); // Signed i8 path - for (size_t j = 0; j < 16; j++) { - scales[j + i * 16] = - ov::float16(scale_factor * static_cast(*((int8_t *) (block_data + 128 + 64 + j)))); - // For asymmetric quantization, store per-block zero points - if (!is_scalar_zp) { + if (!is_symmetric) { + auto * zp = static_cast(zp_arr.data()); + ov::parallel_for(n_super_block, [&](size_t i) { + uint8_t * block_data = data + i * bytes_per_block; + float scale_factor = static_cast(ov::float16::from_bits(*((uint16_t *) block_data + 104))); + for (size_t j = 0; j < 16; j++) { + scales[j + i * 16] = + ov::float16(scale_factor * static_cast(*((int8_t *) (block_data + 128 + 64 + j)))); zp[j + i * 16] = 32; } - } - - uint8_t * ql = block_data; - uint8_t * qh = block_data + 128; - - for (int64_t j = 0; j < 32; ++j) { - weights[i * 256 + j] = (ql[j] & 0xF) | (((qh[j] >> 0) & 3) << 4); - weights[i * 256 + j + 32] = (ql[32 + j] & 0xF) | (((qh[j] >> 2) & 3) << 4); - weights[i * 256 + j + 64] = (ql[j] >> 4) | (((qh[j] >> 4) & 3) << 4); - weights[i * 256 + j + 96] = (ql[32 + j] >> 4) | (((qh[j] >> 6) & 3) << 4); - weights[i * 256 + j + 128] = (ql[64 + j] & 0xF) | (((qh[32 + j] >> 0) & 3) << 4); - weights[i * 256 + j + 160] = (ql[96 + j] & 0xF) | (((qh[32 + j] >> 2) & 3) << 4); - weights[i * 256 + j + 192] = (ql[64 + j] >> 4) | (((qh[32 + j] >> 4) & 3) << 4); - weights[i * 256 + j + 224] = (ql[96 + j] >> 4) | (((qh[32 + j] >> 6) & 3) << 4); - } - }); + uint8_t * ql = block_data; + uint8_t * qh = block_data + 128; + for (int64_t j = 0; j < 32; ++j) { + weights[i * 256 + j] = (ql[j] & 0xF) | (((qh[j] >> 0) & 3) << 4); + weights[i * 256 + j + 32] = (ql[32 + j] & 0xF) | (((qh[j] >> 2) & 3) << 4); + weights[i * 256 + j + 64] = (ql[j] >> 4) | (((qh[j] >> 4) & 3) << 4); + weights[i * 256 + j + 96] = (ql[32 + j] >> 4) | (((qh[j] >> 6) & 3) << 4); + weights[i * 256 + j + 128] = (ql[64 + j] & 0xF) | (((qh[32 + j] >> 0) & 3) << 4); + weights[i * 256 + j + 160] = (ql[96 + j] & 0xF) | (((qh[32 + j] >> 2) & 3) << 4); + weights[i * 256 + j + 192] = (ql[64 + j] >> 4) | (((qh[32 + j] >> 4) & 3) << 4); + weights[i * 256 + j + 224] = (ql[96 + j] >> 4) | (((qh[32 + j] >> 6) & 3) << 4); + } + }); + } else { + // Symmetric: subtract 32 from each weight to store as signed i8 + ov::parallel_for(n_super_block, [&](size_t i) { + uint8_t * block_data = data + i * bytes_per_block; + float scale_factor = static_cast(ov::float16::from_bits(*((uint16_t *) block_data + 104))); + for (size_t j = 0; j < 16; j++) { + scales[j + i * 16] = + ov::float16(scale_factor * static_cast(*((int8_t *) (block_data + 128 + 64 + j)))); + } + uint8_t * ql = block_data; + uint8_t * qh = block_data + 128; + auto * signed_weights = reinterpret_cast(weights); + for (int64_t j = 0; j < 32; ++j) { + signed_weights[i * 256 + j] = static_cast((ql[j] & 0xF) | (((qh[j] >> 0) & 3) << 4)) - 32; + signed_weights[i * 256 + j + 32] = + static_cast((ql[32 + j] & 0xF) | (((qh[j] >> 2) & 3) << 4)) - 32; + signed_weights[i * 256 + j + 64] = static_cast((ql[j] >> 4) | (((qh[j] >> 4) & 3) << 4)) - 32; + signed_weights[i * 256 + j + 96] = + static_cast((ql[32 + j] >> 4) | (((qh[j] >> 6) & 3) << 4)) - 32; + signed_weights[i * 256 + j + 128] = + static_cast((ql[64 + j] & 0xF) | (((qh[32 + j] >> 0) & 3) << 4)) - 32; + signed_weights[i * 256 + j + 160] = + static_cast((ql[96 + j] & 0xF) | (((qh[32 + j] >> 2) & 3) << 4)) - 32; + signed_weights[i * 256 + j + 192] = + static_cast((ql[64 + j] >> 4) | (((qh[32 + j] >> 4) & 3) << 4)) - 32; + signed_weights[i * 256 + j + 224] = + static_cast((ql[96 + j] >> 4) | (((qh[32 + j] >> 6) & 3) << 4)) - 32; + } + }); + } } static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t * d, uint8_t * m) { @@ -389,11 +414,10 @@ ov::Output make_int8_weights(ov::Tensor & weight, size_t group_size, bool use_bias) { ov::Shape orig_shape = weight.get_shape(); + bool is_signed = (weight.get_element_type() == ov::element::i8); // Symmetric: signed weights, no ZP // Expand dimensions for scales and zp/bias auto scale_shape = scales.get_shape(); - auto zp_shape = zp.get_shape(); - bool is_scalar_zp = zp_shape.empty(); // Symmetric quantization ov::Shape packed_shape = {orig_shape[0], orig_shape[1] / group_size, group_size}; @@ -403,37 +427,48 @@ ov::Output make_int8_weights(ov::Tensor & weight, } else { scale_shape.push_back(1); scales.set_shape(scale_shape); - // For symmetric quantization, zp remains scalar (don't resize) - if (!is_scalar_zp) { + if (!is_signed && zp.get_size() > 0) { + auto zp_shape = zp.get_shape(); zp_shape.push_back(1); zp.set_shape(zp_shape); } } - // Create graph nodes - auto weights_node = std::make_shared(ov::element::u8, packed_shape, - static_cast(weight.data()), nullptr); - weights_node->get_rt_info()["__gguf_tensor_holder"] = weight; auto scales_f16 = std::make_shared(scales); - auto weights_f16 = std::make_shared(weights_node, ov::element::f16); ov::Output result; - if (use_bias && !is_scalar_zp) { - // Bias path: w * s + b (zp tensor holds f16 bias values) - auto bias_f16 = std::make_shared(zp); - auto w_s = std::make_shared(weights_f16, scales_f16, ov::op::AutoBroadcastType::NUMPY); - result = std::make_shared(w_s, bias_f16, ov::op::AutoBroadcastType::NUMPY); + if (is_signed) { + // Signed path: q * s (no zero point subtraction needed) + auto weights_node = std::make_shared(ov::element::i8, packed_shape, + static_cast(weight.data()), nullptr); + weights_node->get_rt_info()["__gguf_tensor_holder"] = weight; + auto weights_f16 = std::make_shared(weights_node, ov::element::f16); + result = std::make_shared(weights_f16, scales_f16, ov::op::AutoBroadcastType::NUMPY); } else { - // Zero point path: (w - zp) * s - auto zero_point = std::make_shared(zp); - float zp_value; - if (ov::op::util::get_single_value(zero_point, zp_value)) { - zero_point = ov::op::v0::Constant::create(zero_point->get_element_type(), {}, {zp_value}); + // Unsigned path + auto weights_node = std::make_shared(ov::element::u8, packed_shape, + static_cast(weight.data()), nullptr); + weights_node->get_rt_info()["__gguf_tensor_holder"] = weight; + auto weights_f16 = std::make_shared(weights_node, ov::element::f16); + + if (use_bias && zp.get_size() > 0) { + // Bias path: w * s + b (zp tensor holds f16 bias values) + auto bias_f16 = std::make_shared(zp); + auto w_s = + std::make_shared(weights_f16, scales_f16, ov::op::AutoBroadcastType::NUMPY); + result = std::make_shared(w_s, bias_f16, ov::op::AutoBroadcastType::NUMPY); + } else { + // Zero point path: (w - zp) * s + auto zero_point = std::make_shared(zp); + float zp_value; + if (ov::op::util::get_single_value(zero_point, zp_value)) { + zero_point = ov::op::v0::Constant::create(zero_point->get_element_type(), {}, {zp_value}); + } + auto zero_point_f16 = std::make_shared(zero_point, ov::element::f16); + auto w_zp = + std::make_shared(weights_f16, zero_point_f16, ov::op::AutoBroadcastType::NUMPY); + result = std::make_shared(w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY); } - auto zero_point_f16 = std::make_shared(zero_point, ov::element::f16); - auto w_zp = - std::make_shared(weights_f16, zero_point_f16, ov::op::AutoBroadcastType::NUMPY); - result = std::make_shared(w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY); } if (packed_shape.size() != 2) { @@ -452,11 +487,10 @@ ov::Output make_int4_weights(ov::Tensor & weight, size_t group_size, bool use_bias) { ov::Shape orig_weight_shape = weight.get_shape(); + bool is_signed = (weight.get_element_type() == ov::element::i4); // Symmetric: signed weights, no ZP // Expand dimensions for scales and zp/bias ov::Shape scale_shape = scales.get_shape(); - auto zp_shape = zp.get_shape(); - bool is_scalar_zp = zp_shape.empty(); // Symmetric quantization // Create INT4 weight tensor ov::Shape packed_shape = {orig_weight_shape[0], orig_weight_shape[1] / group_size, group_size}; @@ -467,36 +501,48 @@ ov::Output make_int4_weights(ov::Tensor & weight, } else { scale_shape.push_back(1); scales.set_shape(scale_shape); - // For symmetric quantization, zp remains scalar (don't resize) - if (!is_scalar_zp) { + if (!is_signed && zp.get_size() > 0) { + auto zp_shape = zp.get_shape(); zp_shape.push_back(1); zp.set_shape(zp_shape); } } - auto weights_node = std::make_shared(ov::element::u4, packed_shape, - static_cast(weight.data()), nullptr); - weights_node->get_rt_info()["__gguf_tensor_holder"] = weight; - auto weights_f16 = std::make_shared(weights_node, ov::element::f16); auto scales_f16 = std::make_shared(scales); ov::Output result; - if (use_bias && !is_scalar_zp) { - // Bias path: w * s + b (zp tensor holds f16 bias values) - auto bias_f16 = std::make_shared(zp); - auto w_s = std::make_shared(weights_f16, scales_f16, ov::op::AutoBroadcastType::NUMPY); - result = std::make_shared(w_s, bias_f16, ov::op::AutoBroadcastType::NUMPY); + if (is_signed) { + // Signed path: q * s (no zero point subtraction needed) + auto weights_node = std::make_shared(ov::element::i4, packed_shape, + static_cast(weight.data()), nullptr); + weights_node->get_rt_info()["__gguf_tensor_holder"] = weight; + auto weights_f16 = std::make_shared(weights_node, ov::element::f16); + result = std::make_shared(weights_f16, scales_f16, ov::op::AutoBroadcastType::NUMPY); } else { - // Zero point path: (w - zp) * s - auto zero_points_node = std::make_shared(zp); - float zp_value; - if (ov::op::util::get_single_value(zero_points_node, zp_value)) { - zero_points_node = ov::op::v0::Constant::create(zero_points_node->get_element_type(), {}, {zp_value}); + // Unsigned path + auto weights_node = std::make_shared(ov::element::u4, packed_shape, + static_cast(weight.data()), nullptr); + weights_node->get_rt_info()["__gguf_tensor_holder"] = weight; + auto weights_f16 = std::make_shared(weights_node, ov::element::f16); + + if (use_bias && zp.get_size() > 0) { + // Bias path: w * s + b (zp tensor holds f16 bias values) + auto bias_f16 = std::make_shared(zp); + auto w_s = + std::make_shared(weights_f16, scales_f16, ov::op::AutoBroadcastType::NUMPY); + result = std::make_shared(w_s, bias_f16, ov::op::AutoBroadcastType::NUMPY); + } else { + // Zero point path: (w - zp) * s + auto zero_points_node = std::make_shared(zp); + float zp_value; + if (ov::op::util::get_single_value(zero_points_node, zp_value)) { + zero_points_node = ov::op::v0::Constant::create(zero_points_node->get_element_type(), {}, {zp_value}); + } + auto zero_points_f16 = std::make_shared(zero_points_node, ov::element::f16); + auto w_zp = + std::make_shared(weights_f16, zero_points_f16, ov::op::AutoBroadcastType::NUMPY); + result = std::make_shared(w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY); } - auto zero_points_f16 = std::make_shared(zero_points_node, ov::element::f16); - auto w_zp = - std::make_shared(weights_f16, zero_points_f16, ov::op::AutoBroadcastType::NUMPY); - result = std::make_shared(w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY); } if (packed_shape.size() != 2) { @@ -699,24 +745,32 @@ OvWeight process_weight_tensor(const ggml_tensor * tensor, const void * data, vo // Quantized path (normal extraction or quantized requant) // Create weight/scale/zp tensors - shared between both paths - ov::element::Type weight_type = layout.is_u4 ? ov::element::u4 : ov::element::u8; + // For symmetric quantization, use signed types (i4/i8) and no ZP tensor + ov::element::Type weight_type = layout.is_symmetric ? (layout.is_u4 ? ov::element::i4 : ov::element::i8) : + (layout.is_u4 ? ov::element::u4 : ov::element::u8); ov::Shape scale_shape = {node_shape[0], node_shape[1] / layout.weights_per_block}; - ov::Shape zp_shape = layout.is_symmetric ? ov::Shape{} : scale_shape; if (output_base_ptr) { uint8_t * buf_base = static_cast(output_base_ptr); result.weights = ov::Tensor(weight_type, node_shape, buf_base + layout.weights_offset); result.scales = ov::Tensor(ov::element::f16, scale_shape, buf_base + layout.scales_offset); - result.zp = ov::Tensor(weight_type, zp_shape, buf_base + layout.zp_offset); + if (!layout.is_symmetric) { + ov::element::Type zp_type = layout.is_u4 ? ov::element::u4 : ov::element::u8; + result.zp = ov::Tensor(zp_type, scale_shape, buf_base + layout.zp_offset); + } + // else: result.zp remains default-constructed (empty) for symmetric } else { result.weights = ov::Tensor(weight_type, node_shape); result.scales = ov::Tensor(ov::element::f16, scale_shape); - if (use_bias && !layout.is_symmetric) { - // bias only has effect for asymmetric quant - result.zp = ov::Tensor(ov::element::f16, zp_shape); - } else { - result.zp = ov::Tensor(weight_type, zp_shape); + if (!layout.is_symmetric) { + if (use_bias) { + result.zp = ov::Tensor(ov::element::f16, scale_shape); + } else { + ov::element::Type zp_type = layout.is_u4 ? ov::element::u4 : ov::element::u8; + result.zp = ov::Tensor(zp_type, scale_shape); + } } + // else: result.zp remains default-constructed (empty) for symmetric } if (layout.is_requant && layout.requant_type.has_value()) { @@ -741,59 +795,75 @@ void quantize_q4_0(const float * x, auto * weights = static_cast(weights_arr.data()); auto * scales = scales_arr.data::value_type>(); - auto * zp = static_cast(zp_arr.data()); - bool is_scalar_zp = (zp_arr.get_size() == 1); // Symmetric quantization - - // For Q4_0, zero point is always 8 - if (is_scalar_zp) { - zp[0] = 8 | (8 << 4); // Pack two 4-bit values - } + bool is_symmetric = (weights_arr.get_element_type() == ov::element::i4); // Signed i4 path - for (int i = 0; i < nb; i++) { - float amax = 0.0f; // absolute max - float max = 0.0f; - - for (int j = 0; j < qk; j++) { - const float v = x[i * qk + j]; - if (amax < fabsf(v)) { - amax = fabsf(v); - max = v; + if (!is_symmetric) { + auto * zp = static_cast(zp_arr.data()); + for (int i = 0; i < nb; i++) { + float amax = 0.0f; + float max = 0.0f; + for (int j = 0; j < qk; j++) { + const float v = x[i * qk + j]; + if (amax < fabsf(v)) { + amax = fabsf(v); + max = v; + } } - } - - const float d = max / -8; - - if (d == 0) { - scales[i] = ov::float16(1.0f); - // zp is already set to 8 for symmetric, or set per-block for asymmetric - if (!is_scalar_zp) { + const float d = max / -8; + if (d == 0) { + scales[i] = ov::float16(1.0f); if (i % 2 == 0) { zp[i / 2] = 8; } else { zp[i / 2] |= (8 << 4); } + memset(weights + i * qk / 2, 8 | (8 << 4), qk / 2); + continue; } - memset(weights + i * qk / 2, 8 | (8 << 4), qk / 2); - continue; - } - - const float id = 1.0f / d; - scales[i] = ov::float16(d); - // For asymmetric quantization, store per-block zero points - if (!is_scalar_zp) { + const float id = 1.0f / d; + scales[i] = ov::float16(d); if (i % 2 == 0) { zp[i / 2] = 8; } else { zp[i / 2] |= (8 << 4); } + for (int j = 0; j < qk / 2; ++j) { + const float x0 = x[i * qk + 2 * j] * id; + const float x1 = x[i * qk + 2 * j + 1] * id; + const uint8_t xi0 = MIN(15, (int8_t) (x0 + 8.5f)); + const uint8_t xi1 = MIN(15, (int8_t) (x1 + 8.5f)); + weights[i * qk / 2 + j] = xi0 | (xi1 << 4); + } } - - for (int j = 0; j < qk / 2; ++j) { - const float x0 = x[i * qk + 2 * j] * id; - const float x1 = x[i * qk + 2 * j + 1] * id; - const uint8_t xi0 = MIN(15, (int8_t) (x0 + 8.5f)); - const uint8_t xi1 = MIN(15, (int8_t) (x1 + 8.5f)); - weights[i * qk / 2 + j] = xi0 | (xi1 << 4); + } else { + // Symmetric: produce signed i4 values in [-8, 7] + for (int i = 0; i < nb; i++) { + float amax = 0.0f; + float max = 0.0f; + for (int j = 0; j < qk; j++) { + const float v = x[i * qk + j]; + if (amax < fabsf(v)) { + amax = fabsf(v); + max = v; + } + } + const float d = max / -8; + if (d == 0) { + scales[i] = ov::float16(1.0f); + // i4 value 0 packed: 0x00 + memset(weights + i * qk / 2, 0, qk / 2); + continue; + } + const float id = 1.0f / d; + scales[i] = ov::float16(d); + for (int j = 0; j < qk / 2; ++j) { + const float x0 = x[i * qk + 2 * j] * id; + const float x1 = x[i * qk + 2 * j + 1] * id; + // Signed i4: range [-8, 7]. Quantize as round(x*id), then pack as 4-bit two's complement. + int8_t si0 = (int8_t) std::max(-8, std::min(7, (int) roundf(x0))); + int8_t si1 = (int8_t) std::max(-8, std::min(7, (int) roundf(x1))); + weights[i * qk / 2 + j] = (si0 & 0x0F) | ((si1 & 0x0F) << 4); + } } } } @@ -809,36 +879,42 @@ void quantize_q8_0(const float * x, auto * weights = static_cast(weights_arr.data()); auto * scales = scales_arr.data::value_type>(); - auto * zp = static_cast(zp_arr.data()); - bool is_scalar_zp = (zp_arr.get_size() == 1); // Symmetric quantization - - // For Q8_0, zero point is always 128 - if (is_scalar_zp) { - zp[0] = 128; - } - - for (int i = 0; i < nb; i++) { - float amax = 0.0f; // absolute max + bool is_symmetric = (weights_arr.get_element_type() == ov::element::i8); // Signed i8 path - for (int j = 0; j < qk; j++) { - const float v = x[i * qk + j]; - if (amax < fabsf(v)) { - amax = fabsf(v); + if (!is_symmetric) { + auto * zp = static_cast(zp_arr.data()); + for (int i = 0; i < nb; i++) { + float amax = 0.0f; + for (int j = 0; j < qk; j++) { + const float v = x[i * qk + j]; + amax = std::max(amax, fabsf(v)); } - } - - const float d = amax / 127.0f; - const float id = d ? 1.0f / d : 0.0f; - scales[i] = ov::float16(d); - // For asymmetric quantization, store per-block zero points - if (!is_scalar_zp) { + const float d = amax / 127.0f; + const float id = d ? 1.0f / d : 0.0f; + scales[i] = ov::float16(d); zp[i] = 128; + for (int j = 0; j < qk; ++j) { + const float x0 = x[i * qk + j] * id; + const int8_t xi0 = roundf(x0); + weights[i * qk + j] = (uint8_t) (xi0 + 128); + } } - - for (int j = 0; j < qk; ++j) { - const float x0 = x[i * qk + j] * id; - const int8_t xi0 = roundf(x0); - weights[i * qk + j] = (uint8_t) (xi0 + 128); + } else { + // Symmetric: store signed int8 values directly + auto * signed_weights = reinterpret_cast(weights); + for (int i = 0; i < nb; i++) { + float amax = 0.0f; + for (int j = 0; j < qk; j++) { + const float v = x[i * qk + j]; + amax = std::max(amax, fabsf(v)); + } + const float d = amax / 127.0f; + const float id = d ? 1.0f / d : 0.0f; + scales[i] = ov::float16(d); + for (int j = 0; j < qk; ++j) { + const float x0 = x[i * qk + j] * id; + signed_weights[i * qk + j] = (int8_t) roundf(x0); + } } } } @@ -861,12 +937,8 @@ void quantize_q8_1(const float * x, for (int j = 0; j < qk; j++) { const float v = x[i * qk + j]; - if (v < min) { - min = v; - } - if (v > max) { - max = v; - } + min = std::min(v, min); + max = std::max(v, max); } const float d = (max - min) / ((1 << 8) - 1); diff --git a/ggml/src/ggml-openvino/openvino/decoder.h b/ggml/src/ggml-openvino/openvino/decoder.h index 3b8da2be5d2..b487afd720d 100644 --- a/ggml/src/ggml-openvino/openvino/decoder.h +++ b/ggml/src/ggml-openvino/openvino/decoder.h @@ -35,10 +35,14 @@ class GgmlDecoder : public DecoderBase { virtual element::Type get_output_type(const int node_idx) const = 0; + virtual std::vector get_output_stride(int node_idx) const = 0; + virtual int32_t* get_input_op_params(int node_idx, const std::string& name) const = 0; virtual int32_t * get_output_op_params(int node_idx) const = 0; + virtual size_t get_output_op_offset(int node_idx) const = 0; + virtual std::vector get_output_names(int node_idx) const = 0; virtual const std::string& get_op_type() const = 0; @@ -66,7 +70,11 @@ class GgmlDecoder : public DecoderBase { virtual bool is_stateful() const = 0; + virtual bool is_splited_model() const = 0; + virtual int is_swa_layer(int layer) const = 0; + + virtual int32_t get_op_dynamic_dim(int node_idx) const = 0; }; } // namespace ggml diff --git a/ggml/src/ggml-openvino/openvino/node_context.h b/ggml/src/ggml-openvino/openvino/node_context.h index aa484128a95..26498566134 100644 --- a/ggml/src/ggml-openvino/openvino/node_context.h +++ b/ggml/src/ggml-openvino/openvino/node_context.h @@ -59,12 +59,22 @@ class NodeContext : public frontend::NodeContext { return m_decoder->get_input_op_params(m_node_idx, m_input_names[index]); } + int32_t get_op_dynamic_dim() const { + return m_decoder->get_op_dynamic_dim(m_node_idx); + } + int32_t * get_output_op_params() const { return m_decoder->get_output_op_params(m_node_idx); } + size_t get_output_op_offset() const { return m_decoder->get_output_op_offset(m_node_idx); } + ov::element::Type get_output_type() const { return m_decoder->get_output_type(m_node_idx); } + std::vector get_output_stride() const { + return m_decoder->get_output_stride(m_node_idx); + } + Output get_input(int idx) const override { return m_tensor_map->at(m_input_names[idx]); } diff --git a/ggml/src/ggml-openvino/openvino/op/cont.cpp b/ggml/src/ggml-openvino/openvino/op/cont.cpp index 6160dd74444..243e236f166 100644 --- a/ggml/src/ggml-openvino/openvino/op/cont.cpp +++ b/ggml/src/ggml-openvino/openvino/op/cont.cpp @@ -18,27 +18,17 @@ namespace op { OutputVector translate_cont(const NodeContext & context) { num_inputs_check(context, 1, 1); - int op_case = context.get_op_case(); - FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2 || op_case == 3, "Unsupported CONT case"); - auto src_shape = context.get_input_shape(0).to_shape(); auto dst_shape = context.get_output_shape().to_shape(); - ov::Output res; - if (op_case == 1) { - // The input comes from a PERMUTE - throw std::runtime_error("Code of this case might be outdated"); - dst_shape[1] = -1; - res = std::make_shared( - context.get_input(0), ov::op::v0::Constant::create(ov::element::i64, {dst_shape.size()}, dst_shape), false); - } else if (op_case == 2) { - // The input comes from a TRANSPOSE - return {context.get_input(0)}; - } else { - // The input comes from a VIEW - res = process_view_input(context, 0); + if (context.get_op_dynamic_dim() != -1) { + dst_shape[3 - context.get_op_dynamic_dim()] = -1; } + ov::Output res; + res = std::make_shared( + context.get_input(0), ov::op::v0::Constant::create(ov::element::i64, {dst_shape.size()}, dst_shape), false); + return rename_outputs_with_suffix({res}, context.get_name()); } diff --git a/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp b/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp index 42602a730a4..059556107ef 100644 --- a/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp +++ b/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp @@ -34,23 +34,19 @@ OutputVector translate_flash_attn_ext(const NodeContext & context) { auto q = std::make_shared(q_f32, ov::element::f16); auto scale_node = std::make_shared(ov::element::f16, ov::Shape{}, std::vector{scale}); - ov::Output mask_sliced, res; + ov::Output res; + + // For stateful std::string mask_name = "KQ_mask_sliced"; if (context.get_input_names()[3].find("swa") != std::string::npos) { mask_name = "KQ_mask_swa_sliced"; } if (context.has_input(mask_name)) { - mask_sliced = context.get_input(mask_name); - } else { - auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); - auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); - auto two = ov::op::v0::Constant::create(ov::element::i64, {1}, {2}); - auto token_len = get_dimensions(q, {2}); - mask_sliced = std::make_shared(mask, zero, token_len, one, two); + mask = context.get_input(mask_name); } - if (mask_sliced.get_element_type() != ov::element::f16) { - mask_sliced = std::make_shared(mask_sliced, ov::element::f16); + if (mask.get_element_type() != ov::element::f16) { + mask = std::make_shared(mask, ov::element::f16); } auto tile_kv = [&](int64_t num_heads, int64_t num_heads_kv, int64_t head_size, ov::Output kv) { @@ -77,7 +73,7 @@ OutputVector translate_flash_attn_ext(const NodeContext & context) { k = tile_kv(q_shape[1], k_shape[1], q_shape[3], k); v = tile_kv(q_shape[1], k_shape[1], q_shape[3], v); - auto sdpa = std::make_shared(q, k, v, mask_sliced, scale_node, false); + auto sdpa = std::make_shared(q, k, v, mask, scale_node, false); res = std::make_shared(sdpa, ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 2, 1, 3})); res = std::make_shared(res, ov::element::f32); diff --git a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp index 38edec85ddf..71cf1fd17aa 100644 --- a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp +++ b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp @@ -34,10 +34,7 @@ OutputVector translate_mulmat(const NodeContext & context) { ov::Output A = context.get_input(1); bool transpose_b = true; - if (op_case == 2) { - B = B.get_node_shared_ptr()->input_value(0); - transpose_b = false; - } else if (op_case == 3) { + if (op_case == 3) { B = process_view_input(context, 0); A = process_view_input(context, 1); } @@ -55,6 +52,7 @@ OutputVector translate_mulmat(const NodeContext & context) { auto batch_small = A_batch_larger ? B_batch : A_batch; Output Z = A_batch_larger ? B : A; + auto Z_shape = A_batch_larger ? B_shape : A_shape; int64_t factor = batch_large / batch_small; if (factor > 1 && batch_small > 1) { auto batch_large_node = ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector{batch_large}); @@ -67,7 +65,11 @@ OutputVector translate_mulmat(const NodeContext & context) { auto broadcast_shape = ov::op::v0::Constant::create( ov::element::i64, {5}, {(int64_t) 1, (int64_t) 1, factor, (int64_t) 1, (int64_t) 1}); auto new_Z_shape = ov::op::v0::Constant::create(ov::element::i64, {4}, - {(int64_t) 0, batch_large, (int64_t) -1, (int64_t) A_shape[3]}); + {(int64_t) 0, batch_large, (int64_t) -1, (int64_t) Z_shape[3]}); + if (op_case == 2) { + new_Z_shape = ov::op::v0::Constant::create(ov::element::i64, {4}, + {(int64_t) 0, batch_large, (int64_t) Z_shape[2], (int64_t) -1}); + } auto Z_broadcasted = std::make_shared(Z_unsqueezed, broadcast_shape, ov::op::BroadcastType::BIDIRECTIONAL); diff --git a/ggml/src/ggml-openvino/openvino/op/norm.cpp b/ggml/src/ggml-openvino/openvino/op/norm.cpp new file mode 100644 index 00000000000..b6e54914e1f --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/op/norm.cpp @@ -0,0 +1,58 @@ +#include "../node_context.h" +#include "../op_table.h" +#include "../utils.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace ov { +namespace frontend { +namespace ggml { +namespace op { + +OutputVector translate_norm(const NodeContext & context) { + num_inputs_check(context, 1, 1); + + auto input_node = context.get_input(0); + + // Step 1: Calculate mean along the last dimension + // mean = reduce_mean(input, axis=-1, keepdims=true) + auto mean = std::make_shared( + input_node, ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {-1}), true); + + // Step 2: Calculate (input - mean) + auto centered = std::make_shared(input_node, mean); + + // Step 3: Calculate squared differences (input - mean)^2 + auto squared = std::make_shared( + centered, ov::op::v0::Constant::create(ov::element::f32, ov::Shape{1}, {2.0f})); + + // Step 4: Calculate variance = mean((input - mean)^2) + auto variance = std::make_shared( + squared, ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {-1}), true); + + // Step 5: Get epsilon from op_params + float eps; + memcpy(&eps, context.get_output_op_params(), sizeof(float)); + + // Step 6: Calculate std = sqrt(variance + eps) + auto std_dev = std::make_shared( + std::make_shared(variance, ov::op::v0::Constant::create(ov::element::f32, ov::Shape{1}, {eps}))); + + // Step 7: Normalize: output = (input - mean) / std + auto res = std::make_shared(centered, std_dev); + + return rename_outputs_with_suffix({res}, context.get_name()); +} + +} // namespace op +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/op/permute.cpp b/ggml/src/ggml-openvino/openvino/op/permute.cpp index 4c800f9ee4f..a9a3800e663 100644 --- a/ggml/src/ggml-openvino/openvino/op/permute.cpp +++ b/ggml/src/ggml-openvino/openvino/op/permute.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -22,16 +23,26 @@ OutputVector translate_permute(const NodeContext & context) { num_inputs_check(context, 1, 1); int op_case = context.get_op_case(); - FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2 || op_case == 3 || op_case == 4, - "Unsupported PERMUTE case"); + FRONT_END_CHECK_IMPLEMENTED(op_case != 0, "Unsupported PERMUTE case"); + // op_case 1 is trivial permute + // op_case 2 is to permute Q. It has a preceding VIEW that reshapes Q to restore the sequqence dimension + // op_case 3 4 it to permute KV cache in the default layout + // op_case 5 6 is to permute V cache when `-fa off`, where v_trans=true ov::Output res; auto src = context.get_input(0); - auto perm = ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 2, 1, 3}); + std::vector perm_values{0, 2, 1, 3}; + const int32_t* op_params = context.get_output_op_params(); + if (op_params != nullptr) { + for (size_t i = 0; i < perm_values.size(); ++i) { + perm_values[i] = static_cast(perm_values.size() - 1 - op_params[perm_values.size() - 1 - i]); + } + } + auto perm = ov::op::v0::Constant::create(ov::element::i64, {4}, perm_values); if (op_case == 1 || context.is_stateful()) { res = std::make_shared(src, perm); - } else if (op_case == 4) { + } else if (op_case == 2) { auto output_shape = context.get_output_shape().to_shape(); auto n_heads = ov::op::v0::Constant::create(ov::element::i64, {1}, {output_shape[1]}); auto head_size = ov::op::v0::Constant::create(ov::element::i64, {1}, {output_shape[3]}); @@ -54,13 +65,17 @@ OutputVector translate_permute(const NodeContext & context) { auto output_shape = context.get_output_shape().to_shape(); int64_t head_size = output_shape[3]; int64_t n_heads = output_shape[1]; + if (op_case == 5 || op_case == 6) { + head_size = output_shape[2]; + n_heads = output_shape[1]; + } int64_t ctx_per_seq = cache_shape[2].is_static() ? cache_shape[2].get_length() : -1; int64_t n_seq = cache_shape[1].get_length(); Output attention_size; if (!context.has_input("attention_size")) { attention_size = ov::op::v0::Constant::create(ov::element::i64, {1}, {output_shape[2]}); - } else if (op_case == 2) { + } else if (op_case == 3 || op_case == 5) { attention_size = context.get_input("attention_size"); } else { attention_size = context.get_input("attention_size_swa"); @@ -80,18 +95,31 @@ OutputVector translate_permute(const NodeContext & context) { seq_active_end = ov::op::v0::Constant::create(ov::element::i64, {1}, {seq_active_end_val}); } - // 1. reshape to [n_seq, ctx_per_seq, n_heads, head_size] + // 1. reshape to [n_seq, ctx_per_seq, n_heads, head_size] (for `-fa off` [n_seq, n_heads, head_size, ctx_per_seq]) // 2. slice out the active sequences // 3. slice out the attention part in each sequence - // 4. permute + // 4. permute (skip for `-fa off`) auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); - auto src_reshaped = std::make_shared( - src, ov::op::v0::Constant::create(ov::element::i64, {4}, {n_seq, ctx_per_seq, n_heads, head_size}), false); - auto slice1 = std::make_shared(src_reshaped, seq_active_start, seq_active_end, one, zero); - auto slice2 = std::make_shared(slice1, zero, attention_size, one, one); - res = std::make_shared(slice2, perm); + if (op_case == 3 || op_case == 4) { + auto src_reshaped = std::make_shared( + src, ov::op::v0::Constant::create(ov::element::i64, {4}, {n_seq, ctx_per_seq, n_heads, head_size}), + false); + auto slice1 = + std::make_shared(src_reshaped, seq_active_start, seq_active_end, one, zero); + auto slice2 = std::make_shared(slice1, zero, attention_size, one, one); + res = std::make_shared(slice2, perm); + } else { + auto three = ov::op::v0::Constant::create(ov::element::i64, {1}, {3}); + auto src_reshaped = std::make_shared( + src, ov::op::v0::Constant::create(ov::element::i64, {4}, {n_seq, n_heads, head_size, ctx_per_seq}), + false); + auto slice1 = + std::make_shared(src_reshaped, seq_active_start, seq_active_end, one, zero); + auto slice2 = std::make_shared(slice1, zero, attention_size, one, three); + res = slice2; + } } return rename_outputs_with_suffix({res}, context.get_name()); } diff --git a/ggml/src/ggml-openvino/openvino/op/reshape.cpp b/ggml/src/ggml-openvino/openvino/op/reshape.cpp index efd9a5a860a..2a1a082d863 100644 --- a/ggml/src/ggml-openvino/openvino/op/reshape.cpp +++ b/ggml/src/ggml-openvino/openvino/op/reshape.cpp @@ -10,7 +10,6 @@ #include #include #include -#include #include namespace ov { @@ -47,7 +46,14 @@ OutputVector translate_reshape(const NodeContext & context) { std::vector{(int64_t) output_shape[0], (int64_t) output_shape[1], -1, (int64_t) output_shape[3]}); } else if (op_case == 3) { - throw std::runtime_error("might be outdated RESHAPE case"); + // - 14: [ 1, 1024, 1, 1] RESHAPE Vcur-0 (reshaped) (reshaped) + // [ 512, 2, 1, 1] 0: RESHAPE Vcur-0 (reshaped) + // - 15: [ 1, 524288, 1, 1] RESHAPE cache_v_l0 (reshaped) + // [ 512, 1024, 1, 1] 0: NONE cache_v_l0 + // - 16: [ 1, 524288, 1, 1] SET_ROWS cache_v_l0 (reshaped) (view) + // [ 1, 1024, 1, 1] 0: RESHAPE Vcur-0 (reshaped) (reshaped) + // [ 1024, 1, 1, 1] 1: NONE leaf_11 + // [ 1, 524288, 1, 1] 2: RESHAPE cache_v_l0 (reshaped) new_shape_node = ov::op::v0::Constant::create( ov::element::i64, {4}, std::vector{(int64_t) output_shape[0], (int64_t) output_shape[1], -1, 1}); diff --git a/ggml/src/ggml-openvino/openvino/op/rope.cpp b/ggml/src/ggml-openvino/openvino/op/rope.cpp index 26dc2d24f82..1954154835c 100644 --- a/ggml/src/ggml-openvino/openvino/op/rope.cpp +++ b/ggml/src/ggml-openvino/openvino/op/rope.cpp @@ -9,12 +9,17 @@ #include #include #include +#include +#include +#include #include #include #include +#include #include #include #include +#include #include #include @@ -33,6 +38,11 @@ OutputVector translate_rope(const NodeContext & context) { auto data_node = context.get_input(0).get_node_shared_ptr(); auto output_shape = context.get_output_shape().to_shape(); int32_t * op_params = context.get_output_op_params(); + const int mode = op_params[2]; + + constexpr int ROPE_TYPE_NORMAL = 0; + constexpr int ROPE_TYPE_NEOX = 2; + constexpr int ROPE_TYPE_IMROPE = 40; Output cos_theta_node; Output sin_theta_node; @@ -45,7 +55,7 @@ OutputVector translate_rope(const NodeContext & context) { if (context.get_input_size() == 3) { rope_freqs_weight = context.get_input(2).get_node_shared_ptr(); } - auto sin_cos = make_sin_cos(op_params, inp_pos, rope_freqs_weight); + auto sin_cos = make_sin_cos(op_params, inp_pos, rope_freqs_weight, mode == ROPE_TYPE_IMROPE); sin_theta_node = sin_cos.first; cos_theta_node = sin_cos.second; } @@ -65,9 +75,10 @@ OutputVector translate_rope(const NodeContext & context) { } } - const int mode = op_params[2]; - constexpr int ROPE_TYPE_NORMAL = 0; - constexpr int ROPE_TYPE_NEOX = 2; + auto output_type = context.get_output_type(); + if (data_node->get_element_type() != ov::element::f32) { + data_node = std::make_shared(data_node, ov::element::f32); + } if (mode == ROPE_TYPE_NORMAL) { auto neg_one = ov::op::v0::Constant::create(ov::element::i64, {1}, {-1}); @@ -112,6 +123,29 @@ OutputVector translate_rope(const NodeContext & context) { std::make_shared(slice_data_node_1, cos_theta_node)); res = std::make_shared(ov::OutputVector{first_half_node, second_half_node}, -1); + } else if (mode == ROPE_TYPE_IMROPE) { + int64_t n_dims = data_node->get_shape()[3]; + auto cos_sin_shape = std::make_shared(ov::element::i64, ov::Shape{4}, std::vector{1,-1,1,(n_dims >> 1)}); + auto cos_reshaped = std::make_shared(cos_theta_node, cos_sin_shape, true); + auto sin_reshaped = std::make_shared(sin_theta_node, cos_sin_shape, true); + + auto split_axis = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {3}); + auto split_a = std::make_shared(data_node, split_axis, 2); + auto x0 = split_a->output(0); + auto x1 = split_a->output(1); + auto mul_a = std::make_shared(x0, cos_reshaped); + auto mul_b = std::make_shared(x1, sin_reshaped); + auto sub = std::make_shared(mul_a, mul_b); + + auto mul_c = std::make_shared(x0, sin_reshaped); + auto mul_d = std::make_shared(x1, cos_reshaped); + auto add = std::make_shared(mul_c, mul_d); + + res = std::make_shared(ov::OutputVector{sub, add}, 3); + } + + if (res.get_element_type() != output_type) { + res = std::make_shared(res, output_type); } return rename_outputs_with_suffix({res}, context.get_name()); diff --git a/ggml/src/ggml-openvino/openvino/op/set_rows.cpp b/ggml/src/ggml-openvino/openvino/op/set_rows.cpp index 136e4265b42..9f2b841b19c 100644 --- a/ggml/src/ggml-openvino/openvino/op/set_rows.cpp +++ b/ggml/src/ggml-openvino/openvino/op/set_rows.cpp @@ -34,14 +34,14 @@ OutputVector translate_set_rows(const NodeContext & context) { data = std::make_shared(data, context.get_output_type()); - auto dst_shape = context.get_output_shape().to_shape(); + auto row_size = context.get_input_shape(2)[3].get_length(); auto ind_squeezed = std::make_shared(indices, ov::op::v0::Constant::create(ov::element::i64, {3}, {0, 1, 2})); auto data_reshaped = std::make_shared( data, ov::op::v0::Constant::create(ov::element::i64, {4}, - {(int64_t) 1, (int64_t) 1, (int64_t) -1, (int64_t) dst_shape[3]}), + {(int64_t) 1, (int64_t) 1, (int64_t) -1, (int64_t) row_size}), false); auto axes = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {2}); diff --git a/ggml/src/ggml-openvino/openvino/op/softmax.cpp b/ggml/src/ggml-openvino/openvino/op/softmax.cpp index 9f6330862be..3f3dd5e548d 100644 --- a/ggml/src/ggml-openvino/openvino/op/softmax.cpp +++ b/ggml/src/ggml-openvino/openvino/op/softmax.cpp @@ -2,18 +2,16 @@ #include "../op_table.h" #include "../utils.h" -#include +#include #include +#include #include -#include -#include +#include #include -#include #include #include -#include #include -#include +#include #include #include @@ -22,63 +20,82 @@ namespace frontend { namespace ggml { namespace op { +// Reimplementation of GGML_OP_SOFT_MAX semantics for OpenVINO backend: +// 1) logits = src0 * scale +// 2) logits += mask (if provided) +// 3) softmax over the last dimension OutputVector translate_soft_max(const NodeContext & context) { - // TODO code is outdated num_inputs_check(context, 1, 2); - auto input_node = context.get_input(0).get_node_shared_ptr(); - ov::Output res; - float scale = 1.0f; float max_bias = 0.0f; - auto * op_params = context.get_output_op_params(); - memcpy(&scale, (float *) op_params + 0, sizeof(float)); - memcpy(&max_bias, (float *) op_params + 1, sizeof(float)); - auto src0_shape = context.get_input_shape(0).get_shape(); - const uint32_t h = src0_shape[2]; - const uint32_t n_head = src0_shape[0]; - const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head)); - - const float m0 = powf(2.0f, -(max_bias) / n_head_log2); - const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); - const float slope = - (max_bias > 0.0f) ? h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2 * (h - n_head_log2) + 1) : 1.0f; - - auto scale_node = std::make_shared(ov::element::f32, ov::Shape{}, std::vector{scale}); - auto scaled_input = std::make_shared(input_node, scale_node); - - if (context.get_input_size() < 2) { - res = std::make_shared(scaled_input, 2); - return rename_outputs_with_suffix({res}, context.get_name()); - } + memcpy(&scale, (float *) context.get_output_op_params() + 0, sizeof(float)); + memcpy(&max_bias, (float *) context.get_output_op_params() + 1, sizeof(float)); - ov::Output mask_node_sliced; - if (context.has_input("KQ_mask_sliced")) { - mask_node_sliced = context.get_input("KQ_mask_sliced"); - } else { - auto token_len = get_dimensions(input_node, {1}); - auto mask_node = context.get_input(1); - auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); - auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); - mask_node_sliced = std::make_shared(mask_node, zero, token_len, one, one); - } + ov::Output logits = context.get_input(0); - if (mask_node_sliced.get_element_type() != context.get_output_type()) { - mask_node_sliced = std::make_shared(mask_node_sliced, context.get_output_type()); + // Apply scale first: logits = src0 * scale + if (scale != 1.0f) { + auto scale_const = std::make_shared(ov::element::f32, ov::Shape{}, std::vector{scale}); + logits = std::make_shared(logits, scale_const); } - Output slope_mask; - if (slope != 1.0f) { - auto slope_node = - std::make_shared(ov::element::f32, ov::Shape{}, std::vector{slope}); - slope_mask = std::make_shared(mask_node_sliced, slope_node); - throw std::runtime_error("Slope != 1.0f in softmax has not been tested, verify it before use."); - } - slope_mask = mask_node_sliced; + FRONT_END_CHECK_IMPLEMENTED(!(max_bias > 0.0f && context.get_input_size() < 2), + "OpenVINO softmax ALiBi path requires mask input"); + + // Optional mask add: logits += mask + // For max_bias > 0 (ALiBi), apply per-head slope to mask before adding. + if (context.get_input_size() > 1) { + ov::Output mask = context.get_input(1); + + // For stateful + std::string mask_name = "KQ_mask_sliced"; + if (context.get_input_names()[1].find("swa") != std::string::npos) { + mask_name = "KQ_mask_swa_sliced"; + } + if (context.has_input(mask_name)) { + mask = context.get_input(mask_name); + } + + if (mask.get_element_type() != logits.get_element_type()) { + mask = std::make_shared(mask, logits.get_element_type()); + } - auto input_slope_mask_node = std::make_shared(scaled_input, slope_mask); + if (max_bias > 0.0f) { + auto out_shape = context.get_output_shape().to_shape(); + FRONT_END_CHECK_IMPLEMENTED(out_shape.size() == 4, + "OpenVINO softmax ALiBi path expects rank-4 tensor"); + + const uint32_t n_head = static_cast(out_shape[1]); + FRONT_END_CHECK_IMPLEMENTED(n_head > 0, "OpenVINO softmax ALiBi path expects n_head > 0"); + + const uint32_t n_head_log2 = 1u << static_cast(std::floor(std::log2(static_cast(n_head)))); + const float m0 = std::pow(2.0f, -(max_bias) / static_cast(n_head_log2)); + const float m1 = std::pow(2.0f, -(max_bias / 2.0f) / static_cast(n_head_log2)); + + std::vector slopes(n_head); + for (uint32_t h = 0; h < n_head; ++h) { + slopes[h] = h < n_head_log2 ? std::pow(m0, static_cast(h + 1)) + : std::pow(m1, static_cast(2 * (h - n_head_log2) + 1)); + } + + ov::Output slope_node = + std::make_shared(ov::element::f32, ov::Shape{n_head}, slopes); + if (slope_node.get_element_type() != mask.get_element_type()) { + slope_node = std::make_shared(slope_node, mask.get_element_type()); + } + + auto slope_shape = std::make_shared(ov::element::i64, ov::Shape{4}, + std::vector{1, static_cast(n_head), 1, 1}); + auto slope_4d = std::make_shared(slope_node, slope_shape, false); + mask = std::make_shared(mask, slope_4d); + } + + logits = std::make_shared(logits, mask); + } - res = std::make_shared(input_slope_mask_node, 2); + // Softmax along last dimension (equivalent to ggml softmax over ne[0]). + auto res = std::make_shared(logits, -1); return rename_outputs_with_suffix({res}, context.get_name()); } diff --git a/ggml/src/ggml-openvino/openvino/op/transpose.cpp b/ggml/src/ggml-openvino/openvino/op/transpose.cpp index 8e62e83c0d7..b3b4614e440 100644 --- a/ggml/src/ggml-openvino/openvino/op/transpose.cpp +++ b/ggml/src/ggml-openvino/openvino/op/transpose.cpp @@ -12,8 +12,37 @@ namespace op { OutputVector translate_transpose(const NodeContext & context) { num_inputs_check(context, 1, 1); + // Compute permute order from input/output shape and stride information + // so it adapts to different input and output layouts. + auto input_shape = context.get_input_shape(0).to_shape(); + auto input_stride = context.get_input_stride(0); + auto output_shape = context.get_output_shape().to_shape(); + auto output_stride = context.get_output_stride(); + + // Compute permute order by matching output and input stride rankings. + // Build pairs. + std::vector> output_stride_dims; + std::vector> input_stride_dims; + + for (int i = 0; i < 4; ++i) { + output_stride_dims.push_back({output_stride[i], i}); + input_stride_dims.push_back({input_stride[i], i}); + } + + // Sort by stride in descending order. + std::sort(output_stride_dims.rbegin(), output_stride_dims.rend()); + std::sort(input_stride_dims.rbegin(), input_stride_dims.rend()); + + // Build permute order. + std::vector permute_order(4); + for (int i = 0; i < 4; ++i) { + int output_dim = output_stride_dims[i].second; + int input_dim = input_stride_dims[i].second; + permute_order[output_dim] = input_dim; + } + auto res = std::make_shared( - context.get_input(0), ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 1, 3, 2})); + context.get_input(0), ov::op::v0::Constant::create(ov::element::i64, {4}, permute_order)); return rename_outputs_with_suffix({res}, context.get_name()); } diff --git a/ggml/src/ggml-openvino/openvino/op/unary_gelu.cpp b/ggml/src/ggml-openvino/openvino/op/unary_gelu.cpp new file mode 100644 index 00000000000..d1e9efc33a5 --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/op/unary_gelu.cpp @@ -0,0 +1,25 @@ +#include "../node_context.h" +#include "../op_table.h" +#include "../utils.h" + +#include +#include + +namespace ov { +namespace frontend { +namespace ggml { +namespace op { + +OutputVector translate_unary_gelu(const NodeContext & context) { + num_inputs_check(context, 1, 1); + + auto input = context.get_input(0); + auto res = std::make_shared(input); + + return rename_outputs_with_suffix({res}, context.get_name()); +} + +} // namespace op +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/op/unary_tanh.cpp b/ggml/src/ggml-openvino/openvino/op/unary_tanh.cpp new file mode 100644 index 00000000000..5e6744b2290 --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/op/unary_tanh.cpp @@ -0,0 +1,25 @@ +#include "../node_context.h" +#include "../op_table.h" +#include "../utils.h" + +#include +#include + +namespace ov { +namespace frontend { +namespace ggml { +namespace op { + +OutputVector translate_unary_tanh(const NodeContext & context) { + num_inputs_check(context, 1, 1); + + auto input = context.get_input(0); + auto res = std::make_shared(input); + + return rename_outputs_with_suffix({res}, context.get_name()); +} + +} // namespace op +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/op/view.cpp b/ggml/src/ggml-openvino/openvino/op/view.cpp index 8528d252336..93831af9b4d 100644 --- a/ggml/src/ggml-openvino/openvino/op/view.cpp +++ b/ggml/src/ggml-openvino/openvino/op/view.cpp @@ -1,6 +1,7 @@ #include "../op_table.h" #include "../utils.h" #include +#include namespace ov { namespace frontend { namespace ggml { @@ -28,6 +29,49 @@ OutputVector translate_view(const NodeContext & context) { auto dst_shape = context.get_output_shape().to_shape(); + std::vector diff_dims; + for (size_t i = 0; i < dst_shape.size(); ++i) { + if (dst_shape[i] != input_llama_shape[i]) { + diff_dims.push_back(i); + } + } + + FRONT_END_CHECK_IMPLEMENTED(!diff_dims.empty(), "VIEW op_case 3 failed to infer changed dims"); + + const size_t offset = context.get_output_op_offset(); + const auto input_stride = context.get_input_stride(0); + FRONT_END_CHECK_IMPLEMENTED(input_stride.size() == dst_shape.size(), + "VIEW op_case 3 shape/stride rank mismatch"); + + // Multi-dim change: infer begin/end for each axis from shape/stride/offset directly. + if (diff_dims.size() > 1) { + std::vector begin(dst_shape.size(), 0); + std::vector end(dst_shape.size(), 0); + std::vector step(dst_shape.size(), 1); + std::vector axes(dst_shape.size(), 0); + + size_t rem_offset = offset; + for (size_t i = 0; i < dst_shape.size(); ++i) { + FRONT_END_CHECK_IMPLEMENTED(input_stride[i] > 0, "VIEW op_case 3 invalid stride"); + begin[i] = static_cast(rem_offset / input_stride[i]); + rem_offset %= input_stride[i]; + end[i] = begin[i] + static_cast(dst_shape[i]); + axes[i] = static_cast(i); + + FRONT_END_CHECK_IMPLEMENTED(begin[i] >= 0 && + end[i] <= static_cast(input_llama_shape[i]), + "VIEW op_case 3 multi-dim inferred slice out of bounds"); + } + + auto sliced = std::make_shared( + input, + ov::op::v0::Constant::create(ov::element::i64, {begin.size()}, begin), + ov::op::v0::Constant::create(ov::element::i64, {end.size()}, end), + ov::op::v0::Constant::create(ov::element::i64, {step.size()}, step), + ov::op::v0::Constant::create(ov::element::i64, {axes.size()}, axes)); + return {sliced}; + } + // find the index of dst_shape that is different from input shape, and use that index to slice the input int slice_dim = -1; for (size_t i = 0; i < dst_shape.size(); ++i) { @@ -37,12 +81,124 @@ OutputVector translate_view(const NodeContext & context) { } } - auto begin = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); - auto end = ov::op::v0::Constant::create(ov::element::i64, {1}, {dst_shape[slice_dim]}); + FRONT_END_CHECK_IMPLEMENTED(slice_dim >= 0, "VIEW op_case 3 failed to infer slice dim"); + + FRONT_END_CHECK_IMPLEMENTED(input_stride[slice_dim] > 0, "VIEW op_case 3 invalid stride"); + + const int64_t dim_size = static_cast(input_llama_shape[slice_dim]); + + if (offset % input_stride[slice_dim] == 0) { + const int64_t begin_val = static_cast((offset / input_stride[slice_dim]) % static_cast(dim_size)); + const int64_t end_val = begin_val + static_cast(dst_shape[slice_dim]); + + FRONT_END_CHECK_IMPLEMENTED(begin_val >= 0 && + end_val <= dim_size, + "VIEW op_case 3 inferred slice out of bounds"); + + auto begin = ov::op::v0::Constant::create(ov::element::i64, {1}, {begin_val}); + auto end = ov::op::v0::Constant::create(ov::element::i64, {1}, {end_val}); + auto stride = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); + auto axes = ov::op::v0::Constant::create(ov::element::i64, {1}, {slice_dim}); + auto sliced = std::make_shared(input, begin, end, stride, axes); + return {sliced}; + } + + // Fallback for offsets that cross lower dimensions: flatten tail dims, slice 1D range, then reshape. + FRONT_END_CHECK_IMPLEMENTED(slice_dim + 1 < static_cast(dst_shape.size()), + "VIEW op_case 3 fallback requires lower dimensions"); + + int64_t tail_src_elems = 1; + int64_t tail_dst_elems = 1; + for (size_t i = static_cast(slice_dim); i < input_llama_shape.size(); ++i) { + tail_src_elems *= static_cast(input_llama_shape[i]); + tail_dst_elems *= static_cast(dst_shape[i]); + } + + const auto elem_stride = input_stride.back(); + FRONT_END_CHECK_IMPLEMENTED(elem_stride > 0 && offset % elem_stride == 0, + "VIEW op_case 3 fallback invalid element stride/alignment"); + + const int64_t tail_begin = static_cast((offset / elem_stride) % static_cast(tail_src_elems)); + const int64_t tail_end = tail_begin + tail_dst_elems; + FRONT_END_CHECK_IMPLEMENTED(tail_begin >= 0 && tail_end <= tail_src_elems, + "VIEW op_case 3 fallback slice out of bounds"); + + std::vector flat_shape; + for (int i = 0; i < slice_dim; ++i) { + flat_shape.push_back(static_cast(input_llama_shape[i])); + } + flat_shape.push_back(tail_src_elems); + + auto flat = std::make_shared( + input, + ov::op::v0::Constant::create(ov::element::i64, {flat_shape.size()}, flat_shape), + false); + + auto begin = ov::op::v0::Constant::create(ov::element::i64, {1}, {tail_begin}); + auto end = ov::op::v0::Constant::create(ov::element::i64, {1}, {tail_end}); auto stride = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); auto axes = ov::op::v0::Constant::create(ov::element::i64, {1}, {slice_dim}); + auto sliced = std::make_shared(flat, begin, end, stride, axes); + + auto reshaped = std::make_shared( + sliced, + ov::op::v0::Constant::create(ov::element::i64, {dst_shape.size()}, dst_shape), + false); + return {reshaped}; + } + + // op_case 4: view offset selects one index from a middle dimension, then output keeps another source dim. + // Example: src [N,M,K,1] -> dst [N,K,1,1] with offsets 0, nb1, 2*nb1, ... + if (context.get_op_case() == 4) { + auto input = context.get_input(0); + auto src_shape = context.get_input_shape(0).to_shape(); + auto dst_shape = context.get_output_shape().to_shape(); + auto src_stride = context.get_input_stride(0); + auto dst_stride = context.get_output_stride(); + + FRONT_END_CHECK_IMPLEMENTED(src_shape.size() == dst_shape.size() && + src_shape.size() == src_stride.size() && + src_shape.size() == dst_stride.size(), + "VIEW op_case 4 shape/stride rank mismatch"); + + std::set used_dst_strides; + for (size_t i = 0; i < dst_shape.size(); ++i) { + if (dst_shape[i] > 1) { + used_dst_strides.insert(dst_stride[i]); + } + } + + int64_t slice_axis = -1; + for (size_t i = 0; i < src_shape.size(); ++i) { + if (src_shape[i] > 1 && used_dst_strides.find(src_stride[i]) == used_dst_strides.end()) { + slice_axis = static_cast(i); + break; + } + } + FRONT_END_CHECK_IMPLEMENTED(slice_axis >= 0, "VIEW op_case 4 failed to infer slice axis"); + + const size_t offset = context.get_output_op_offset(); + const size_t axis_stride = src_stride[static_cast(slice_axis)]; + FRONT_END_CHECK_IMPLEMENTED(axis_stride > 0, "VIEW op_case 4 invalid axis stride"); + + const int64_t axis_size = static_cast(src_shape[static_cast(slice_axis)]); + const int64_t slice_index = static_cast((offset / axis_stride) % static_cast(axis_size)); + + auto begin = ov::op::v0::Constant::create(ov::element::i64, {1}, {slice_index}); + auto end = ov::op::v0::Constant::create(ov::element::i64, {1}, {slice_index + 1}); + auto stride = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); + auto axes = ov::op::v0::Constant::create(ov::element::i64, {1}, {slice_axis}); auto sliced = std::make_shared(input, begin, end, stride, axes); - return {sliced}; + + if (context.get_op_dynamic_dim() != -1) { + dst_shape[3 - context.get_op_dynamic_dim()] = -1; + } + + auto reshaped = std::make_shared( + sliced, + ov::op::v0::Constant::create(ov::element::i64, {dst_shape.size()}, dst_shape), + false); + return rename_outputs_with_suffix({reshaped}, context.get_name()); } return {context.get_input(0)}; } diff --git a/ggml/src/ggml-openvino/openvino/op_table.cpp b/ggml/src/ggml-openvino/openvino/op_table.cpp index beadafe8103..723ade12c54 100644 --- a/ggml/src/ggml-openvino/openvino/op_table.cpp +++ b/ggml/src/ggml-openvino/openvino/op_table.cpp @@ -26,12 +26,15 @@ std::unordered_map get_supported_ops() { {"GGML_OP_PERMUTE", op::translate_permute }, {"GGML_OP_RESHAPE", op::translate_reshape }, {"GGML_OP_RMS_NORM", op::translate_rms_norm }, + {"GGML_OP_NORM", op::translate_norm }, {"GGML_OP_ROPE", op::translate_rope }, {"GGML_OP_SCALE", op::translate_scale }, {"GGML_OP_SOFT_MAX", op::translate_soft_max }, {"GGML_OP_SUB", op::translate_1to1_match_2_inputs}, {"GGML_OP_TRANSPOSE", op::translate_transpose }, + {"GGML_UNARY_OP_GELU", op::translate_unary_gelu }, {"GGML_UNARY_OP_SILU", op::translate_unary_silu }, + {"GGML_UNARY_OP_TANH", op::translate_unary_tanh }, {"GGML_OP_VIEW", op::translate_view }, {"GGML_GLU_OP_SWIGLU", op::translate_glu_swiglu }, {"GGML_GLU_OP_GEGLU", op::translate_glu_geglu }, diff --git a/ggml/src/ggml-openvino/openvino/op_table.h b/ggml/src/ggml-openvino/openvino/op_table.h index 37f763117aa..a2614ae5762 100644 --- a/ggml/src/ggml-openvino/openvino/op_table.h +++ b/ggml/src/ggml-openvino/openvino/op_table.h @@ -18,9 +18,12 @@ GGML_OP_CONVERTER(translate_mulmat); GGML_OP_CONVERTER(translate_permute); GGML_OP_CONVERTER(translate_reshape); GGML_OP_CONVERTER(translate_rms_norm); +GGML_OP_CONVERTER(translate_norm); GGML_OP_CONVERTER(translate_rope); GGML_OP_CONVERTER(translate_scale); GGML_OP_CONVERTER(translate_unary_silu); +GGML_OP_CONVERTER(translate_unary_gelu); +GGML_OP_CONVERTER(translate_unary_tanh); GGML_OP_CONVERTER(translate_soft_max); GGML_OP_CONVERTER(translate_transpose); GGML_OP_CONVERTER(translate_view); diff --git a/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.cpp b/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.cpp deleted file mode 100644 index ed2a3ab6d1b..00000000000 --- a/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.cpp +++ /dev/null @@ -1,123 +0,0 @@ -#include "eliminate_zp.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace ov { -namespace frontend { -namespace ggml { -namespace pass { - -EliminateZeroPoints::EliminateZeroPoints() { - // Find pattern: - // (Multiply Any(scale) - // (Subtract (Convert Constant(data))) - // (Convert Constant(zero_point))) - // where zero_point is a scalar - // If data is u4 and zp value is 8 (q4_0), Replace the Subtract with an i4 Constant whose value is data - zp_val - // If data is u8 and zp value is 128 (q8_0) or 32 (q6_k), Replace the Subtract with an i8 Constant - - auto m_data_constant = ov::pass::pattern::wrap_type(); - auto m_data_convert = ov::pass::pattern::wrap_type({m_data_constant}); - - auto m_zp_constant = ov::pass::pattern::wrap_type(); - auto m_zp_convert = ov::pass::pattern::wrap_type({m_zp_constant}); - - auto m_subtract = ov::pass::pattern::wrap_type({m_data_convert, m_zp_convert}); - auto m_scale = ov::pass::pattern::any_input(); - auto m_multiply = ov::pass::pattern::wrap_type({m_scale, m_subtract}); - - const auto callback = [=](ov::pass::pattern::Matcher & m) { - const auto & pattern_map = m.get_pattern_value_map(); - - auto multiply_node = - std::dynamic_pointer_cast(pattern_map.at(m_multiply).get_node_shared_ptr()); - auto subtract_node = - std::dynamic_pointer_cast(pattern_map.at(m_subtract).get_node_shared_ptr()); - auto data_constant = - std::dynamic_pointer_cast(pattern_map.at(m_data_constant).get_node_shared_ptr()); - auto zp_constant = - std::dynamic_pointer_cast(pattern_map.at(m_zp_constant).get_node_shared_ptr()); - - if (!multiply_node || !subtract_node || !data_constant || !zp_constant) { - return false; - } - - if (ov::shape_size(zp_constant->get_shape()) != 1) { - return false; - } - - auto data_type = data_constant->get_element_type(); - auto zp_data = zp_constant->cast_vector(); - - if (zp_data.empty()) { - return false; - } - - int zp_value = zp_data[0]; - - bool should_eliminate = false; - ov::element::Type target_type; - - if (data_type == ov::element::u4 && zp_value == 8) { - should_eliminate = true; - target_type = ov::element::i4; - } else if (data_type == ov::element::u8 && (zp_value == 128 || zp_value == 32)) { - should_eliminate = true; - target_type = ov::element::i8; - } - - if (!should_eliminate) { - return false; - } - - auto data_shape = data_constant->get_shape(); - size_t total_elements = ov::shape_size(data_shape); - - std::shared_ptr new_constant; - - // TODO improve performance - if (data_type == ov::element::u4) { - auto data_values = data_constant->cast_vector(); - std::vector adjusted_values(total_elements); - - ov::parallel_for(total_elements, [&](size_t i) { - adjusted_values[i] = static_cast(static_cast(data_values[i]) - 8); - }); - - new_constant = std::make_shared(target_type, data_shape, adjusted_values); - } else if (data_type == ov::element::u8) { - auto data_values = data_constant->cast_vector(); - std::vector adjusted_values(total_elements); - - ov::parallel_for(total_elements, [&, zp_value](size_t i) { - adjusted_values[i] = static_cast(static_cast(data_values[i]) - zp_value); - }); - - new_constant = std::make_shared(target_type, data_shape, adjusted_values); - } - - auto new_convert = - std::make_shared(new_constant, subtract_node->get_output_element_type(0)); - ov::replace_node(subtract_node, new_convert); - - return true; - }; - - register_matcher( - std::make_shared(m_multiply, "ov::frontend::ggml::pass::EliminateZeroPoints"), - callback); -} - -} // namespace pass -} // namespace ggml -} // namespace frontend -} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.h b/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.h deleted file mode 100644 index edd3cd718d9..00000000000 --- a/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.h +++ /dev/null @@ -1,17 +0,0 @@ -#include "openvino/pass/matcher_pass.hpp" - -namespace ov { -namespace frontend { -namespace ggml { -namespace pass { - -class EliminateZeroPoints : public ov::pass::MatcherPass { -public: - OPENVINO_MATCHER_PASS_RTTI("ov::frontend::ggml::pass::EliminateZeroPoints") - EliminateZeroPoints(); -}; - -} // namespace pass -} // namespace ggml -} // namespace frontend -} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/rt_info/weightless_caching_attributes.hpp b/ggml/src/ggml-openvino/openvino/rt_info/weightless_caching_attributes.hpp new file mode 100644 index 00000000000..f051891c481 --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/rt_info/weightless_caching_attributes.hpp @@ -0,0 +1,41 @@ +// Copyright (C) 2018-2026 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include + +namespace ov { + +/** + * @brief Holds weightless caching attributes of a single constant. + * + * WeightlessCacheAttribute class represents runtime info attribute that holds + * the values of original size of the constant in bytes and the binary offset of the + * constant's data in the weights file used by the weightless caching mechanism. It's + * not copyable in case the data was changed (the original node was replaced by a new + * one produced during the tranformation pipeline) - in that case weightless caching + * can't be used for that constant. + */ +class OPENVINO_API WeightlessCacheAttribute : public RuntimeAttribute { +public: + OPENVINO_RTTI("WeightlessCacheAttribute", "0", RuntimeAttribute) + + WeightlessCacheAttribute() = delete; + + WeightlessCacheAttribute(size_t original_size, size_t bin_offset, ov::element::Type original_dtype) + : original_size(original_size), + bin_offset(bin_offset), + original_dtype(original_dtype) {} + + bool is_copyable() const override; + + size_t original_size; + size_t bin_offset; + ov::element::Type original_dtype; +}; + +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp index 23a1dea2496..8283777cdd0 100644 --- a/ggml/src/ggml-openvino/openvino/translate_session.cpp +++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp @@ -3,15 +3,17 @@ #include "ggml-openvino/openvino/node_context.h" #include "ggml-openvino/openvino/utils.h" #include "input_model.h" -#include "pass/eliminate_zp.h" #include "pass/mark_decompression_convert_constant_folding.h" #include "pass/squeeze_matmul.h" +#include "rt_info/weightless_caching_attributes.hpp" #include #include #include #include #include +#include +#include #include #include #include @@ -33,7 +35,6 @@ #include #include #include -#include namespace ov { namespace frontend { @@ -88,19 +89,22 @@ void add_sliced_mask(TensorMap & tensor_map, GgmlDecoder & ggml_model_decoder) { if (is_static) { mask_sliced = mask; } else if (ggml_model_decoder.is_stateful()) { - auto zero_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {0,0}); - auto one_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {1,1}); - auto zero_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); - auto three_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {3}); - auto neg_one_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {-1}); - auto axes = ov::op::v0::Constant::create(ov::element::i64, {2}, {-2,-1}); + auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); + auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); + auto three = ov::op::v0::Constant::create(ov::element::i64, {1}, {3}); + auto neg_one = ov::op::v0::Constant::create(ov::element::i64, {1}, {-1}); + + auto step = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); + auto axes = ov::op::v0::Constant::create(ov::element::i64, {1}, {-1}); + auto inp_pos = tensor_map.at("inp_pos").get_node_shared_ptr(); - auto gather_inp_pos = std::make_shared(inp_pos, neg_one_1d, three_1d); - auto reshaped_inp_pos = std::make_shared(gather_inp_pos, ov::op::v0::Constant::create(ov::element::i64, {1}, {1}), false); - auto inp_pos_incremented = std::make_shared(reshaped_inp_pos, ov::op::v0::Constant::create(ov::element::i32, ov::Shape{1}, {1})); - auto stop = std::make_shared(ov::OutputVector{token_len_per_seq, std::make_shared(inp_pos_incremented, token_len_per_seq)}, 0); - mask_sliced = - std::make_shared(mask, zero_2d, stop, one_2d, axes); + auto last_inp_pos = std::make_shared(inp_pos, neg_one, three); + auto last_inp_pos_1d = std::make_shared( + last_inp_pos, ov::op::v0::Constant::create(ov::element::i64, {1}, {1}), false); + auto last_inp_pos_cvt = std::make_shared(last_inp_pos_1d, ov::element::i64); + auto last_inp_pos_inc = std::make_shared(last_inp_pos_cvt, one); + + mask_sliced = std::make_shared(mask, zero, last_inp_pos_inc, step, axes); mask_sliced = std::make_shared(mask_sliced, ov::element::f16); mask_sliced->set_friendly_name(sliced_name); } else { @@ -240,6 +244,31 @@ std::shared_ptr TranslateSession::translate_graph(const frontend::InputMo resulting_model = std::make_shared(results, used_params); apply_transformations(resulting_model); + + // Set WeightlessCacheAttribute on large constants to avoid unnecessary memory copies + // in the NPUW plugin. Without this attribute, NPUW's LazyTensor constructor + // (lazy_tensor.cpp, op::Const::Const) will memcpy every constant "in case export + // occurs", doubling memory usage per compile_model call. + // + // The bin_offset field serves as a unique key (not a real file offset) — this is + // the same convention the GPU plugin uses for non-IR models (see + // Plugin::set_weightless_cache_attributes in intel_gpu/src/plugin/plugin.cpp). + // Each constant must have a distinct bin_offset, otherwise GPU's weightless cache + // import will map multiple constants to the same data. + // + // Small constants (< 16 elements) are excluded since they may be introduced by + // optimization patterns and the overhead is negligible. + size_t offset = 0; + for (auto & node : resulting_model->get_ordered_ops()) { + if (auto cnst = ov::as_type_ptr(node); + cnst && cnst->get_byte_size() / cnst->get_element_type().size() >= 16) { + auto & rt_info = cnst->get_rt_info(); + if (rt_info.find(ov::WeightlessCacheAttribute::get_type_info_static()) == rt_info.end()) { + rt_info[ov::WeightlessCacheAttribute::get_type_info_static()] = + ov::WeightlessCacheAttribute(cnst->get_byte_size(), offset++, cnst->get_element_type()); + } + } + } return resulting_model; } @@ -257,7 +286,6 @@ std::shared_ptr TranslateSession::apply_transformations(std::shared_ptris_static()) { - manager.register_pass(); manager.register_pass(); } manager.run_passes(model); diff --git a/ggml/src/ggml-openvino/openvino/utils.cpp b/ggml/src/ggml-openvino/openvino/utils.cpp index 65356a51b51..0baaf88e17a 100644 --- a/ggml/src/ggml-openvino/openvino/utils.cpp +++ b/ggml/src/ggml-openvino/openvino/utils.cpp @@ -2,6 +2,7 @@ #include "ggml-impl.h" +#include #include #include #include @@ -13,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -87,8 +89,11 @@ ov::Output rope_yarn_ramp_mix(int n_dims, const float corr_dims[2], fl auto ramp_y = std::make_shared(std::make_shared(dim_ids, corr_low), denom); auto ramp_clamped = std::make_shared(ramp_y, 0.0f, 1.0f); + // rope_yarn_ramp returns (1 - clamp(y)), so invert before scaling + auto one = ov::op::v0::Constant::create(ov::element::f32, Shape{1, 1, 1, 1}, {1.0f}); + auto ramp_inverted = std::make_shared(one, ramp_clamped); auto ext_factor_node = ov::op::v0::Constant::create(ov::element::f32, Shape{}, {ext_factor}); - auto ramp_mix = std::make_shared(ramp_clamped, ext_factor_node); + auto ramp_mix = std::make_shared(ramp_inverted, ext_factor_node); return ramp_mix; } @@ -115,6 +120,7 @@ void ggml_rope_yarn_corr_dims(int n_dims, std::pair, ov::Output> make_sin_cos(int32_t * rope_params, std::shared_ptr inp_pos, std::shared_ptr rope_freqs_weight, + bool imrope, bool stateful) { if (stateful) { inp_pos = std::make_shared(inp_pos, ov::op::v0::Constant::create(ov::element::i64, {1}, {0})); @@ -122,6 +128,13 @@ std::pair, ov::Output> make_sin_cos(int32_t * rope_params auto pos_perm = std::make_shared(ov::element::i64, ov::Shape{3}, std::vector{2, 1, 0}); inp_pos = std::make_shared(inp_pos, pos_perm); + } else if (imrope) { + inp_pos = std::make_shared(inp_pos, ov::element::f32); + auto pos_shape = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{5}, {0, 0, 0, 4, -1}); + inp_pos = std::make_shared(inp_pos, pos_shape, true); + auto pos_transpose_shape = + std::make_shared(ov::element::i64, ov::Shape{5}, std::vector{0, 1, 2, 4, 3}); + inp_pos = std::make_shared(inp_pos, pos_transpose_shape); } else { inp_pos = std::make_shared(inp_pos, ov::element::f32); auto pos_perm = @@ -136,6 +149,7 @@ std::pair, ov::Output> make_sin_cos(int32_t * rope_params float beta_fast; float beta_slow; const int n_dims = rope_params[1]; + const size_t n_dims_half = n_dims >> 1; const int n_ctx_orig = rope_params[4]; memcpy(&freq_base, rope_params + 5, sizeof(float)); memcpy(&freq_scale, rope_params + 6, sizeof(float)); @@ -146,57 +160,74 @@ std::pair, ov::Output> make_sin_cos(int32_t * rope_params const float theta_scale = powf(freq_base, -2.0f / n_dims); - float corr_dims[2]; - ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims); - - std::vector factor(n_dims / 2); - factor[0] = 1.0f; - for (size_t i = 1; i < factor.size(); i++) { - factor[i] = theta_scale * factor[i - 1]; - } + std::vector factor(n_dims_half); Output freq_factors; - if (stateful) { - freq_factors = - std::make_shared(ov::element::f32, ov::Shape{1, 1, factor.size()}, factor); - } else { - freq_factors = - std::make_shared(ov::element::f32, ov::Shape{1, 1, 1, factor.size()}, factor); - } - if (rope_freqs_weight) { - freq_factors = std::make_shared(freq_factors, rope_freqs_weight); - } - - auto theta_extrap = std::make_shared(freq_factors, inp_pos); - auto theta_interp = std::make_shared( - theta_extrap, ov::op::v0::Constant::create(ov::element::f32, {1}, {freq_scale})); Output theta; float mscale = attn_factor; - if (ext_factor == 0.0f) { - theta = theta_interp; + if (imrope) { + std::vector gather_indices(n_dims_half); + for (size_t j = 0; j < n_dims_half; j++) { + gather_indices[j] = j % 3; + factor[j] = std::pow(theta_scale, j); + } + auto gather_indices_const = + std::make_shared(ov::element::i64, ov::Shape{n_dims_half}, gather_indices); + auto gather_axis = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{}, {4}); + inp_pos = std::make_shared(inp_pos, gather_indices_const, gather_axis); + auto factor_const = std::make_shared(ov::element::f32, ov::Shape{n_dims_half}, factor); + theta = std::make_shared(inp_pos, factor_const); } else { - auto ramp_mix = rope_yarn_ramp_mix(n_dims, corr_dims, ext_factor); - Output one; + float corr_dims[2]; + ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims); + factor[0] = 1.0f; + for (size_t i = 1; i < factor.size(); i++) { + factor[i] = theta_scale * factor[i - 1]; + } if (stateful) { - one = ov::op::v0::Constant::create(ov::element::f32, Shape{1, 1, 1}, {1.0f}); + freq_factors = + std::make_shared(ov::element::f32, ov::Shape{1, 1, factor.size()}, factor); } else { - one = ov::op::v0::Constant::create(ov::element::f32, Shape{1, 1, 1, 1}, {1.0f}); + freq_factors = + std::make_shared(ov::element::f32, ov::Shape{1, 1, 1, factor.size()}, factor); + } + if (rope_freqs_weight) { + freq_factors = std::make_shared(freq_factors, rope_freqs_weight); } - auto one_minus_ramp = std::make_shared(one, ramp_mix); - theta = std::make_shared(std::make_shared(theta_interp, one_minus_ramp), - std::make_shared(theta_extrap, ramp_mix)); - mscale *= (1.0f + 0.1f * std::log(1.0f / freq_scale)); + auto theta_extrap = std::make_shared(freq_factors, inp_pos); + auto theta_interp = std::make_shared( + theta_extrap, ov::op::v0::Constant::create(ov::element::f32, {1}, {freq_scale})); + + if (ext_factor == 0.0f) { + theta = theta_interp; + } else { + auto ramp_mix = rope_yarn_ramp_mix(n_dims, corr_dims, ext_factor); + Output one; + if (stateful) { + one = ov::op::v0::Constant::create(ov::element::f32, Shape{1, 1, 1}, {1.0f}); + } else { + one = ov::op::v0::Constant::create(ov::element::f32, Shape{1, 1, 1, 1}, {1.0f}); + } + auto one_minus_ramp = std::make_shared(one, ramp_mix); + + theta = std::make_shared(std::make_shared(theta_interp, one_minus_ramp), + std::make_shared(theta_extrap, ramp_mix)); + mscale *= (1.0f + 0.1f * std::log(1.0f / freq_scale)); + } } Output cos_theta = std::make_shared(theta); Output sin_theta = std::make_shared(theta); - auto mscale_node = ov::op::v0::Constant::create(ov::element::f32, Shape{}, {mscale}); + if (!imrope) { + auto mscale_node = ov::op::v0::Constant::create(ov::element::f32, Shape{}, {mscale}); + + cos_theta = std::make_shared(cos_theta, mscale_node); + sin_theta = std::make_shared(sin_theta, mscale_node); + } - cos_theta = std::make_shared(cos_theta, mscale_node); - sin_theta = std::make_shared(sin_theta, mscale_node); return std::make_pair(sin_theta, cos_theta); } diff --git a/ggml/src/ggml-openvino/openvino/utils.h b/ggml/src/ggml-openvino/openvino/utils.h index 88dcad4c906..767dd4c53ea 100644 --- a/ggml/src/ggml-openvino/openvino/utils.h +++ b/ggml/src/ggml-openvino/openvino/utils.h @@ -67,6 +67,7 @@ OutputVector rename_outputs_with_suffix(const OutputVector& outputs, const std:: std::pair, ov::Output> make_sin_cos(int32_t* rope_params, std::shared_ptr inp_pos, std::shared_ptr rope_freqs_weight = nullptr, + bool imrope = false, bool stateful = false); ov::Output process_view_input(const NodeContext& context, int input_index, int slice_len = 0); diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 1b553a0de00..b034dc79469 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -86,7 +87,9 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr< static auto is_static = false; if (is_naive(cgraph)) { - return naive_compute(cgraph, core, device, config); + if (!is_model_splitted(cgraph)) { + return naive_compute(cgraph, core, device, config); + } } auto start_time = ggml_time_us(); @@ -106,17 +109,25 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr< int64_t infer_end_time; { - std::lock_guard lock(r_ctx->ov_compute_mutex); + std::shared_ptr mutex; auto it = r_ctx->decoder_cache.find(key); cache_hit = it != r_ctx->decoder_cache.end(); ModelParams old_m_params; if (cache_hit) { - ggml_decoder = it->second; + mutex = it->second->mutex; + std::lock_guard lock(*(mutex)); + ggml_decoder = it->second->ptr; old_m_params = ggml_decoder->get_model_params(); - cache_hit = old_m_params.can_reuse_dynamically(m_params); + if (!ggml_decoder->is_splited_model()) { + cache_hit = old_m_params.can_reuse_dynamically(m_params); + } + } else { + mutex = std::make_shared(); + r_ctx->decoder_cache[key] = std::make_shared(mutex); } + std::lock_guard lock(*(mutex)); if (cache_hit) { std::map> model_weights; @@ -171,11 +182,12 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr< compile_end_time = decoder_end_time; } else { r_ctx->infer_request_cache.erase(key); + bool model_is_splitted = is_model_splitted(cgraph); std::shared_ptr model; auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph); - ggml_decoder = std::make_shared(cgraph, m_params, c_params, model_weights, is_static, stateful); + ggml_decoder = std::make_shared(cgraph, m_params, c_params, model_weights, is_static, stateful, model_is_splitted); decoder_end_time = ggml_time_us(); auto input_model = std::make_shared(ggml_decoder); @@ -200,7 +212,7 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr< compile_end_time = ggml_time_us(); infer_request = std::make_shared(compiled_model.create_infer_request()); r_ctx->infer_request_cache[key] = infer_request; - r_ctx->decoder_cache[key] = ggml_decoder; + r_ctx->decoder_cache.at(key)->ptr = ggml_decoder; std::vector ov_input_names; std::vector ov_output_names; @@ -271,17 +283,23 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptr 0) { - return atoi(chunk_size_str); + static int chunk_size = -1; + if (chunk_size == -1) { + const char * chunk_size_str = getenv("GGML_OPENVINO_PREFILL_CHUNK_SIZE"); + if (chunk_size_str && atoi(chunk_size_str) > 0) { + chunk_size = atoi(chunk_size_str); + } else { + chunk_size = 256; + } } - return 256; + return chunk_size; }; static std::string device = "NPU"; static auto is_static = true; static auto stateful = false; - static auto prefill_chunk_size = get_prefill_chunk_size(); + + auto prefill_chunk_size = get_prefill_chunk_size(); const auto & config = ggml_openvino_get_compile_config(); if (is_naive(cgraph)) { @@ -306,15 +324,23 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptr mutex; + auto it = r_ctx->decoder_cache.find(key); cache_hit = it != r_ctx->decoder_cache.end(); ModelParams old_m_params; if (cache_hit) { - ggml_decoder = it->second; + mutex = it->second->mutex; + std::lock_guard lock(*(mutex)); + ggml_decoder = it->second->ptr; old_m_params = ggml_decoder->get_model_params(); cache_hit = old_m_params.can_reuse_statically(m_params); + } else { + mutex = std::make_shared(); + r_ctx->decoder_cache[key] = std::make_shared(mutex); } + std::lock_guard lock(*(mutex)); if (cache_hit) { std::map> model_weights; @@ -337,10 +363,14 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptr model; auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph); + if (m_params.n_heads == -1) { + // graph is not a LLM, e.g. context-shift graph + prefill_chunk_size = inp_pos->ne[0]; + } auto ggml_decoder_prefill = std::make_shared(cgraph, m_params, c_params, model_weights, - is_static, stateful, true, prefill_chunk_size); + is_static, stateful, false, true, prefill_chunk_size); auto ggml_decoder_decode = std::make_shared(cgraph, m_params, c_params, model_weights, is_static, - stateful, false, prefill_chunk_size); + stateful, false, false, prefill_chunk_size); decoder_end_time = ggml_time_us(); auto input_model_prefill = std::make_shared(ggml_decoder_prefill); @@ -381,7 +411,7 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptrinfer_request_cache_prefill[key] : r_ctx->infer_request_cache[key]; - r_ctx->decoder_cache[key] = ggml_decoder; + r_ctx->decoder_cache.at(key)->ptr = ggml_decoder; std::vector ov_input_names; std::vector ov_output_names; @@ -470,6 +500,55 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptrsrc. +// Step 2 verifies that node inputs come from model nodes/weights/leafs; external sources imply split. +bool is_model_splitted(ggml_cgraph * cgraph) { + // check the nodes of the model are used by the following nodes, through compare the node's use count and the count of nodes that use it as input. If does not match, return true, else return false. + for (int i = 0; i < cgraph->n_nodes; i++) { + ggml_tensor * node = cgraph->nodes[i]; + int use_count = cgraph->use_counts[ggml_hash_find(&cgraph->visited_hash_set, node)]; + // TODO: this is a workround for the tests case from llama.cpp, fix should from the root cause in the future. + if ((cgraph->n_nodes <= 1 && use_count==0) || (cgraph->n_nodes <= 1 && node->op == GGML_OP_VIEW && use_count == 1 && node->src[0] != nullptr && node->src[0]->op == GGML_OP_NONE)) { + return false; + } + if (cgraph->n_nodes == 1 && (cgraph->nodes[0]->op == GGML_OP_TRANSPOSE || cgraph->nodes[0]->op == GGML_OP_PERMUTE)) { + return false; + } + int input_use_count = 0; + for (int j = 0; j < cgraph->n_nodes; j++) { + ggml_tensor * other_node = cgraph->nodes[j]; + for (int k = 0; k < GGML_MAX_SRC; k++) { + if (other_node->src[k] == node) { + input_use_count++; + } + } + } + if (use_count != input_use_count && node->op != GGML_OP_NONE) { + return true; + } + } + // if all nodes's src node's src is not come from the nodes in the model, we think the model is splitted. This is a complementary check for the above check, because for some special case like the output node is not used by any node, the use count and input use count are both 0, we can not determine whether the model is splitted or not just based on the first check. + auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph, true); + std::set model_nodes(cgraph->nodes, cgraph->nodes + cgraph->n_nodes); + // leaf nodes + std::set model_leafs(cgraph->leafs, cgraph->leafs + cgraph->n_leafs); + for (int i = 0; i < cgraph->n_nodes; i++) { + ggml_tensor * node = cgraph->nodes[i]; + for (int j = 0; j < GGML_MAX_SRC; j++) { + ggml_tensor * src = node->src[j]; + // the src is also not the model weights, we think the model is splitted. + // the src is also not in model leafs, we think the model is splitted. + if (src != nullptr && model_nodes.find(src) == model_nodes.end() && + model_weights.find(std::string(src->name)) == model_weights.end() && !model_leafs.empty() == false && + model_leafs.find(src) == model_leafs.end()) { + return true; + } + } + } + return false; +} + bool is_naive(ggml_cgraph * cgraph) { constexpr int naive_graph_size_threshold = 20; int count = 0; @@ -485,7 +564,7 @@ enum ggml_status naive_compute(ggml_cgraph * cgraph, ov::Core & core, const std::string & device, const ov::AnyMap & config) { - if (cgraph->n_nodes == 1 && (cgraph->nodes[0]->op == GGML_OP_NONE || cgraph->nodes[0]->op == GGML_OP_VIEW)) { + if (cgraph->n_nodes == 1 && (cgraph->nodes[0]->op == GGML_OP_NONE)) { return GGML_STATUS_SUCCESS; } @@ -521,14 +600,17 @@ enum ggml_status naive_compute(ggml_cgraph * cgraph, infer_request->set_input_tensor(i, input_tensor); } + // Use get_output_tensor + memcpy instead of set_output_tensor to avoid memory overwritten + // when i/o buffer overlaps, e.g. the cgraph is a single PERMUTE + + infer_request->infer(); + auto ov_results = model->get_results(); for (size_t i = 0; i < ov_results.size(); i++) { + auto output_tensor = infer_request->get_output_tensor(i); auto * ggml_tensor = decoder->get_model_outputs().at(ov_results[i]->get_friendly_name()); - auto output_tensor = create_ov_output_tensor(decoder, infer_request, i, ggml_tensor); - infer_request->set_output_tensor(i, output_tensor); + std::memcpy(ggml_tensor->data, output_tensor.data(), output_tensor.get_byte_size()); } - - infer_request->infer(); return GGML_STATUS_SUCCESS; } @@ -536,7 +618,7 @@ namespace { ov::Tensor convert_ggml_input_to_ov(std::shared_ptr ggml_decoder, const std::string & name) { const auto * ggml_tensor = ggml_decoder->get_input_ggml_tensor(name); - if (ggml_tensor->extra != nullptr) { + if (ggml_tensor->extra != nullptr && !ggml_decoder->is_splited_model()) { // GGML_LOG_DEBUG("Using ggml_tensor->extra as ov::Tensor for input: %s\n", name.c_str()); auto * extra_base = static_cast(ggml_tensor->extra); if (extra_base->type != ggml_openvino_extra_base::Type::TENSOR) { @@ -549,12 +631,43 @@ ov::Tensor convert_ggml_input_to_ov(std::shared_ptr ggml_decoder, // GGML_LOG_DEBUG("Converting ggml tensor to ov::Tensor for input: %s\n", name.c_str()); auto * input_data = ggml_tensor->data; ov::Shape input_shape; - if (ggml_tensor->op == GGML_OP_VIEW) { + if (ggml_tensor->op == GGML_OP_VIEW && !ggml_decoder->is_splited_model()) { // This case is added to make test-backend-ops work input_shape = ggml_decoder->get_shape(ggml_tensor->view_src); } else { input_shape = ggml_decoder->get_shape(ggml_tensor); } + + // Add explicit strided-copy reconstruction for PERMUTE and VIEW tensors in split + // models: iterate over all 4 dimensions using `nb[]` strides and `view_offs` to + // copy non-contiguous source data into a contiguous `ov::Tensor` buffer + if ((ggml_tensor->op == GGML_OP_PERMUTE || ggml_tensor->op == GGML_OP_VIEW) && ggml_decoder->is_splited_model()) { + // Create OpenVINO input tensor, the data need to reconstructed based on the view tensor shape & stride + ov::Tensor input_tensor(ggml_decoder->get_ov_type(ggml_tensor), input_shape); + const auto * src_tensor = ggml_tensor->view_src; + std::vector data; + auto n_bytes = ggml_nbytes(src_tensor); + data.resize(n_bytes); + ggml_backend_tensor_get(src_tensor, data.data(), 0, n_bytes); + + size_t des_index = 0; + for (size_t i0 = 0; i0 < static_cast(ggml_tensor->ne[3]); i0++) { + for (size_t i1 = 0; i1 < static_cast(ggml_tensor->ne[2]); i1++) { + for (size_t i2 = 0; i2 < static_cast(ggml_tensor->ne[1]); i2++) { + for (size_t i3 = 0; i3 < static_cast(ggml_tensor->ne[0]); i3++) { + size_t src_index = ggml_tensor->view_offs + i0 * ggml_tensor->nb[3] + i1 * ggml_tensor->nb[2] + + i2 * ggml_tensor->nb[1] + i3 * ggml_tensor->nb[0]; + + memcpy(static_cast(input_tensor.data()) + des_index, + reinterpret_cast(data.data()) + src_index, ggml_tensor->nb[0]); + des_index += ggml_tensor->nb[0]; + } + } + } + } + return input_tensor; + } + auto input_tensor = ov::Tensor(ggml_decoder->get_ov_type(ggml_tensor), input_shape, input_data); return input_tensor; } @@ -696,6 +809,68 @@ size_t checksum(const void * data, size_t size) { return sum; } +bool save_ggml_tensor_data_to_txt(const ggml_tensor * tensor, const std::string & file_path) { + if (tensor == nullptr || tensor->data == nullptr) { + return false; + } + + std::ofstream out(file_path); + if (!out.is_open()) { + return false; + } + + const size_t n = ggml_nelements(tensor); + out << "name: " << tensor->name + << ", type: " << ggml_type_name(tensor->type) + << ", shape: [" << tensor->ne[0] << ", " << tensor->ne[1] << ", " << tensor->ne[2] << ", " << tensor->ne[3] + << "]" + << ", elements: " << n + << ", data:" << '\n'; + + switch (tensor->type) { + case GGML_TYPE_F32: { + const auto * data = static_cast(tensor->data); + for (size_t i = 0; i < n; ++i) { + out << data[i] << '\n'; + } + break; + } + case GGML_TYPE_F16: { + const auto * data = static_cast(tensor->data); + for (size_t i = 0; i < n; ++i) { + out << ggml_fp16_to_fp32(data[i]) << '\n'; + } + break; + } + case GGML_TYPE_BF16: { + const auto * data = static_cast(tensor->data); + for (size_t i = 0; i < n; ++i) { + out << ggml_bf16_to_fp32(data[i]) << '\n'; + } + break; + } + case GGML_TYPE_I32: { + const auto * data = static_cast(tensor->data); + for (size_t i = 0; i < n; ++i) { + out << data[i] << '\n'; + } + break; + } + case GGML_TYPE_I64: { + const auto * data = static_cast(tensor->data); + for (size_t i = 0; i < n; ++i) { + out << data[i] << '\n'; + } + break; + } + default: + out << "unsupported tensor type for text dump" << '\n'; + return false; + } + + return true; +} + void print_input_tensor_info(const std::string & name, const ov::Tensor & tensor) { std::cout << "Input name: " << name << ", Input shape: " << tensor.get_shape() << ", Address: " << tensor.data() << std::endl; diff --git a/ggml/src/ggml-openvino/utils.h b/ggml/src/ggml-openvino/utils.h index 656573d1389..dd92dc374f4 100644 --- a/ggml/src/ggml-openvino/utils.h +++ b/ggml/src/ggml-openvino/utils.h @@ -40,11 +40,17 @@ struct graph_key_hash { } }; +struct decoder_runtime_ctx { + decoder_runtime_ctx(std::shared_ptr mutex) : + mutex(mutex) {} + std::shared_ptr mutex; + std::shared_ptr ptr; +}; + struct ov_runtime_context { - std::mutex ov_compute_mutex; std::string device; bool stateful; - std::unordered_map, graph_key_hash> decoder_cache; + std::unordered_map, graph_key_hash> decoder_cache; std::unordered_map, graph_key_hash> infer_request_cache; std::unordered_map, graph_key_hash> infer_request_cache_prefill; std::unordered_map, graph_key_hash> ov_input_names_cache; @@ -67,6 +73,8 @@ enum ggml_status ov_graph_compute_static(struct ggml_cgraph * cgraph, std::share size_t checksum(const void * data, size_t size); +bool save_ggml_tensor_data_to_txt(const ggml_tensor * tensor, const std::string & file_path); + void print_input_tensor_info(const std::string & name, const ov::Tensor & tensor); void print_output_tensor_info(const std::string & name, const ov::Tensor & tensor, const void * output_dst); @@ -117,6 +125,13 @@ ov::Tensor create_ov_output_tensor(std::shared_ptr ggml_decoder, bool is_naive(struct ggml_cgraph * cgraph); +/** + * @brief Heuristically checks whether the given computation graph is a split-model fragment. + * @param cgraph Pointer to the GGML computation graph to analyze. + * @return true if the graph is identified as split; otherwise false. + */ +bool is_model_splitted(struct ggml_cgraph * cgraph); + enum ggml_status naive_compute(struct ggml_cgraph * cgraph, ov::Core & core, const std::string & device, diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 0142498d967..41f3541da65 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -1767,6 +1767,7 @@ static struct ggml_tensor * ggml_new_tensor_impl( /*.name =*/ { 0 }, /*.extra =*/ NULL, /*.padding =*/ { 0 }, + /*.org_src =*/ NULL, }; // TODO: this should not be needed as long as we don't rely on aligned SIMD loads