From 2339edc73b30477b1d5121b3b6a4b53661d41890 Mon Sep 17 00:00:00 2001
From: Mustafa Cavus <mustafa.cavus@intel.com>
Date: Tue, 17 Mar 2026 15:55:29 -0700
Subject: [PATCH 01/24] Thread safety per request only

---
 ggml/src/ggml-openvino/ggml-decoder.cpp |  4 ----
 ggml/src/ggml-openvino/utils.cpp        | 24 +++++++++++++++++++-----
 ggml/src/ggml-openvino/utils.h          | 10 ++++++++--
 3 files changed, 27 insertions(+), 11 deletions(-)
diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp
index 0938d2273e9..1b8c865c331 100644
--- a/ggml/src/ggml-openvino/ggml-decoder.cpp
+++ b/ggml/src/ggml-openvino/ggml-decoder.cpp
@@ -19,7 +19,6 @@
 #include <iomanip>
 #include <map>
 #include <memory>
-#include <mutex>
 #include <openvino/core/dimension.hpp>
 #include <openvino/core/except.hpp>
 #include <openvino/core/node.hpp>
@@ -573,9 +572,6 @@ std::map<std::string, std::string> GgmlOvDecoder::get_kv_param_res_names() const
 }
 
 std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_nodes(ggml_cgraph * cgraph, bool naive) {
-    static std::mutex weights_mutex;
-    std::lock_guard<std::mutex> lock(weights_mutex);
-
     std::map<std::string, std::shared_ptr<ov::Node>> model_weights;
     auto * nodes = cgraph->nodes;
     auto n_nodes = cgraph->n_nodes;
diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp
index 1b553a0de00..6f6ecde65ea 100644
--- a/ggml/src/ggml-openvino/utils.cpp
+++ b/ggml/src/ggml-openvino/utils.cpp
@@ -106,17 +106,23 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr<
     int64_t infer_end_time;
 
     {
-        std::lock_guard<std::mutex> lock(r_ctx->ov_compute_mutex);
+        std::shared_ptr<std::mutex> mutex;
 
         auto it = r_ctx->decoder_cache.find(key);
 
         cache_hit = it != r_ctx->decoder_cache.end();
         ModelParams old_m_params;
         if (cache_hit) {
-            ggml_decoder = it->second;
+            mutex = it->second->mutex;
+            std::lock_guard<std::mutex> lock(*(mutex));
+            ggml_decoder = it->second->ptr;
             old_m_params = ggml_decoder->get_model_params();
             cache_hit = old_m_params.can_reuse_dynamically(m_params);
+        } else {
+            mutex = std::make_shared<std::mutex>();
+            r_ctx->decoder_cache[key] = std::make_shared<decoder_runtime_ctx>(mutex);
         }
+        std::lock_guard<std::mutex> lock(*(mutex));
 
         if (cache_hit) {
             std::map<std::string, std::shared_ptr<ov::Node>> model_weights;
@@ -200,7 +206,7 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr<
             compile_end_time = ggml_time_us();
             infer_request = std::make_shared<ov::InferRequest>(compiled_model.create_infer_request());
             r_ctx->infer_request_cache[key] = infer_request;
-            r_ctx->decoder_cache[key] = ggml_decoder;
+            r_ctx->decoder_cache.at(key)->ptr = ggml_decoder;
 
             std::vector<std::string> ov_input_names;
             std::vector<std::string> ov_output_names;
@@ -306,15 +312,23 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptr<o
     int64_t compile_end_time;
     int64_t infer_end_time;
 
+    std::shared_ptr<std::mutex> mutex;
+
     auto it = r_ctx->decoder_cache.find(key);
 
     cache_hit = it != r_ctx->decoder_cache.end();
     ModelParams old_m_params;
     if (cache_hit) {
-        ggml_decoder = it->second;
+        mutex = it->second->mutex;
+        std::lock_guard<std::mutex> lock(*(mutex));
+        ggml_decoder = it->second->ptr;
         old_m_params = ggml_decoder->get_model_params();
         cache_hit = old_m_params.can_reuse_statically(m_params);
+    } else {
+        mutex = std::make_shared<std::mutex>();
+        r_ctx->decoder_cache[key] = std::make_shared<decoder_runtime_ctx>(mutex);
     }
+    std::lock_guard<std::mutex> lock(*(mutex));
 
     if (cache_hit) {
         std::map<std::string, std::shared_ptr<ov::Node>> model_weights;
@@ -381,7 +395,7 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptr<o
         model = is_prefill ? model_prefill : model_decode;
         ggml_decoder = is_prefill ? ggml_decoder_prefill : ggml_decoder_decode;
         infer_request = is_prefill ? r_ctx->infer_request_cache_prefill[key] : r_ctx->infer_request_cache[key];
-        r_ctx->decoder_cache[key] = ggml_decoder;
+        r_ctx->decoder_cache.at(key)->ptr = ggml_decoder;
 
         std::vector<std::string> ov_input_names;
         std::vector<std::string> ov_output_names;
diff --git a/ggml/src/ggml-openvino/utils.h b/ggml/src/ggml-openvino/utils.h
index 656573d1389..89b1b209342 100644
--- a/ggml/src/ggml-openvino/utils.h
+++ b/ggml/src/ggml-openvino/utils.h
@@ -40,11 +40,17 @@ struct graph_key_hash {
     }
 };
 
+struct decoder_runtime_ctx {
+    decoder_runtime_ctx(std::shared_ptr<std::mutex> mutex) :
+        mutex(mutex) {}
+    std::shared_ptr<std::mutex> mutex;
+    std::shared_ptr<GgmlOvDecoder> ptr;
+};
+
 struct ov_runtime_context {
-    std::mutex ov_compute_mutex;
     std::string device;
     bool stateful;
-    std::unordered_map<graph_key, std::shared_ptr<GgmlOvDecoder>, graph_key_hash> decoder_cache;
+    std::unordered_map<graph_key, std::shared_ptr<decoder_runtime_ctx>, graph_key_hash> decoder_cache;
     std::unordered_map<graph_key, std::shared_ptr<ov::InferRequest>, graph_key_hash> infer_request_cache;
     std::unordered_map<graph_key, std::shared_ptr<ov::InferRequest>, graph_key_hash> infer_request_cache_prefill;
     std::unordered_map<graph_key, std::vector<std::string>, graph_key_hash> ov_input_names_cache;

From 94df04cd7c32a0d5d52d60ecbca04da7d157ed80 Mon Sep 17 00:00:00 2001
From: "Yu, Zijun" <zijun.yu@intel.com>
Date: Tue, 24 Mar 2026 14:01:43 +0530
Subject: [PATCH 02/24] Fix ROPE yarn case

---
 ggml/src/ggml-openvino/ggml-openvino.cpp  | 8 --------
 ggml/src/ggml-openvino/openvino/utils.cpp | 5 ++++-
 2 files changed, 4 insertions(+), 9 deletions(-)

diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp
index b3058b4af73..9b89c9db05f 100644
--- a/ggml/src/ggml-openvino/ggml-openvino.cpp
+++ b/ggml/src/ggml-openvino/ggml-openvino.cpp
@@ -892,14 +892,6 @@ static bool is_op_unsupported_case(const ggml_tensor * op) {
             // GGML_LOG_WARN("OpenVINO backend does not support ROPE with type %s\n", ggml_type_name(op->type));
             return true;
         }
-        float freq_scale;
-        float ext_factor;
-        memcpy(&freq_scale, op_params + 6, sizeof(float));
-        memcpy(&ext_factor, op_params + 7, sizeof(float));
-        if (ext_factor != 0.0f) {
-            // GGML_LOG_WARN("OpenVINO backend does not support ROPE with ext_factor %f != 0.0f\n", ext_factor);
-            return true;
-        }
         if (op->src[0]->op == GGML_OP_VIEW) {
             if (op->src[0]->view_src->ne[1] != op->src[0]->ne[2]) {
                 // GGML_LOG_WARN(
diff --git a/ggml/src/ggml-openvino/openvino/utils.cpp b/ggml/src/ggml-openvino/openvino/utils.cpp
index 65356a51b51..d9a28887643 100644
--- a/ggml/src/ggml-openvino/openvino/utils.cpp
+++ b/ggml/src/ggml-openvino/openvino/utils.cpp
@@ -87,8 +87,11 @@ ov::Output<ov::Node> rope_yarn_ramp_mix(int n_dims, const float corr_dims[2], fl
     auto ramp_y =
         std::make_shared<ov::op::v1::Divide>(std::make_shared<ov::op::v1::Subtract>(dim_ids, corr_low), denom);
     auto ramp_clamped = std::make_shared<ov::op::v0::Clamp>(ramp_y, 0.0f, 1.0f);
+    // rope_yarn_ramp returns (1 - clamp(y)), so invert before scaling
+    auto one = ov::op::v0::Constant::create(ov::element::f32, Shape{1, 1, 1, 1}, {1.0f});
+    auto ramp_inverted = std::make_shared<ov::op::v1::Subtract>(one, ramp_clamped);
     auto ext_factor_node = ov::op::v0::Constant::create(ov::element::f32, Shape{}, {ext_factor});
-    auto ramp_mix = std::make_shared<ov::op::v1::Multiply>(ramp_clamped, ext_factor_node);
+    auto ramp_mix = std::make_shared<ov::op::v1::Multiply>(ramp_inverted, ext_factor_node);
     return ramp_mix;
 }
 

From 2dcbe2f534d48269cb05166b7750f433fc5e2b97 Mon Sep 17 00:00:00 2001
From: "Yu, Zijun" <zijun.yu@intel.com>
Date: Thu, 19 Mar 2026 13:19:20 +0530
Subject: [PATCH 03/24] Fix sticky stateful config

---
 ggml/src/ggml-openvino/ggml-openvino.cpp | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp
index 9b89c9db05f..0af294d85c8 100644
--- a/ggml/src/ggml-openvino/ggml-openvino.cpp
+++ b/ggml/src/ggml-openvino/ggml-openvino.cpp
@@ -145,13 +145,18 @@ static void * ggml_backend_openvino_buffer_get_base(ggml_backend_buffer_t buffer
     return ctx->data;
 }
 
+static bool is_stateful_enabled() {
+    static const auto * stateful = getenv("GGML_OPENVINO_STATEFUL_EXECUTION");
+    return stateful && strcmp(stateful, "1") == 0;
+}
+
 static enum ggml_status ggml_backend_openvino_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
     // GGML_LOG_DEBUG("%s: buffer usage=%d, tensor name=%s\n", __func__, buffer->usage, tensor->name);
     ggml_backend_openvino_buffer_context * ctx = (ggml_backend_openvino_buffer_context *) buffer->context;
 
     // Put kvcache on device memory for GPU (NPU memory is too small even for kvcache)
     if (strncmp(tensor->name, "cache_", 6) == 0 && !ctx->is_remote && ggml_openvino_get_device_name() == "GPU" &&
-        !getenv("GGML_OPENVINO_STATEFUL_EXECUTION")) {
+        !is_stateful_enabled()) {
         GGML_ASSERT(ctx->tensor_extras.empty());
         auto device = ctx->device;
         auto size = ctx->size;
@@ -666,7 +671,7 @@ GGML_BACKEND_API ggml_backend_t ggml_backend_openvino_init(int device) {
 
     std::shared_ptr<ov_runtime_context> r_ctx = std::static_pointer_cast<ov_runtime_context>(ctx->runtime_context);
     r_ctx->device = ggml_openvino_get_device_name();
-    r_ctx->stateful = getenv("GGML_OPENVINO_STATEFUL_EXECUTION") && !ggml_openvino_is_npu();
+    r_ctx->stateful = is_stateful_enabled() && !ggml_openvino_is_npu();
 
     ggml_backend_t openvino_backend = new ggml_backend{
         /* .guid      = */ ggml_backend_openvino_guid(),

From c21ed578a4746a2e6413b04ff5d5e3530a4c3db5 Mon Sep 17 00:00:00 2001
From: "Yu, Zijun" <zijun.yu@intel.com>
Date: Thu, 19 Mar 2026 09:34:57 +0530
Subject: [PATCH 04/24] Use i4/i8 directly for symmetric quant

---
 .../src/ggml-openvino/ggml-openvino-extra.cpp |  20 +-
 ggml/src/ggml-openvino/ggml-quants.cpp        | 456 ++++++++++--------
 .../openvino/pass/eliminate_zp.cpp            | 123 -----
 .../openvino/pass/eliminate_zp.h              |  17 -
 .../openvino/translate_session.cpp            |   4 +-
 5 files changed, 277 insertions(+), 343 deletions(-)
 delete mode 100644 ggml/src/ggml-openvino/openvino/pass/eliminate_zp.cpp
 delete mode 100644 ggml/src/ggml-openvino/openvino/pass/eliminate_zp.h

diff --git a/ggml/src/ggml-openvino/ggml-openvino-extra.cpp b/ggml/src/ggml-openvino/ggml-openvino-extra.cpp
index cc3cb4583cd..317b34e9053 100644
--- a/ggml/src/ggml-openvino/ggml-openvino-extra.cpp
+++ b/ggml/src/ggml-openvino/ggml-openvino-extra.cpp
@@ -259,10 +259,12 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_ten
             layout.weights_size = layout.is_u4 ? (n_elements / 2) : n_elements;
             int64_t n_blocks = n_elements / layout.weights_per_block;
             layout.scales_size = n_blocks * sizeof(uint16_t);
-            // For symmetric quantization, we only need one zp value (not one per block)
-            // Zero points are stored in U4 or U8 format matching the weight type
-            size_t n_zp_elements = layout.is_symmetric ? 1 : n_blocks;
-            layout.zp_size = layout.is_u4 ? ((n_zp_elements + 1) / 2) : n_zp_elements;
+            // For symmetric quantization, no zp needed (weights stored as signed)
+            if (layout.is_symmetric) {
+                layout.zp_size = 0;
+            } else {
+                layout.zp_size = layout.is_u4 ? ((n_blocks + 1) / 2) : n_blocks;
+            }
 
             layout.weights_offset = 0;
             layout.scales_offset = ((layout.weights_size + alignment - 1) / alignment) * alignment;
@@ -313,10 +315,12 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_ten
     // Scales: F16 per block
     int64_t n_blocks = n_elements / layout.weights_per_block;
     layout.scales_size = n_blocks * sizeof(uint16_t);  // F16 = 2 bytes
-    // Zero points: U4 or U8 matching weight type
-    // For symmetric quantization, we only need one zp value (not one per block)
-    size_t n_zp_elements = layout.is_symmetric ? 1 : n_blocks;
-    layout.zp_size = layout.is_u4 ? ((n_zp_elements + 1) / 2) : n_zp_elements;
+    // For symmetric quantization, no zp needed (weights stored as signed)
+    if (layout.is_symmetric) {
+        layout.zp_size = 0;
+    } else {
+        layout.zp_size = layout.is_u4 ? ((n_blocks + 1) / 2) : n_blocks;
+    }
 
     // Layout in buffer: [weights | scales | zp] with alignment
     layout.weights_offset = 0;
diff --git a/ggml/src/ggml-openvino/ggml-quants.cpp b/ggml/src/ggml-openvino/ggml-quants.cpp
index dbf38646ddd..57d66df4f01 100644
--- a/ggml/src/ggml-openvino/ggml-quants.cpp
+++ b/ggml/src/ggml-openvino/ggml-quants.cpp
@@ -46,6 +46,7 @@ void unpack_32_4(const uint8_t * data, uint8_t * dst) {
 
 // Extracts (weight, scales, zp) from Q4_0 tensors.
 // Data layout is: |16 bit scale|32 x 4bit weights|.
+// When weights_arr is i4 (symmetric), zp_arr is not touched and weights are stored as signed i4 (value - 8).
 void extract_q4_0_data(const ggml_tensor * tensor,
                        ov::Tensor & weights_arr,
                        ov::Tensor & scales_arr,
@@ -55,28 +56,32 @@ void extract_q4_0_data(const ggml_tensor * tensor,
     auto * data = static_cast<uint8_t *>(tensor->data);
     auto * weights = static_cast<uint8_t *>(weights_arr.data());
     auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
-    auto * zp = static_cast<uint8_t *>(zp_arr.data());
-
-    bool is_scalar_zp = (zp_arr.get_size() == 1);  // Symmetric quantization
 
-    // For Q4_0, zero point is always 8
-    if (is_scalar_zp) {
-        zp[0] = 8 | (8 << 4);  // Pack two 4-bit values
-    }
+    bool is_symmetric = (weights_arr.get_element_type() == ov::element::i4);  // Signed i4 path
 
-    ov::parallel_for(scales_arr.get_size(), [&](size_t i) {
-        scales[i] = ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block)));
-        // For asymmetric quantization, compute per-block zero points
-        if (!is_scalar_zp) {
+    if (!is_symmetric) {
+        auto * zp = static_cast<uint8_t *>(zp_arr.data());
+        ov::parallel_for(scales_arr.get_size(), [&](size_t i) {
+            scales[i] = ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block)));
             // Pack two 4-bit zero points per byte
             if (i % 2 == 0) {
                 zp[i / 2] = 8;          // Lower nibble
             } else {
                 zp[i / 2] |= (8 << 4);  // Upper nibble
             }
-        }
-        unpack_32_4(data + i * bytes_per_block + 2, weights + i * 16);
-    });
+            unpack_32_4(data + i * bytes_per_block + 2, weights + i * 16);
+        });
+    } else {
+        // Symmetric: unpack as u4 then convert to i4 by subtracting 8 (XOR each nibble)
+        ov::parallel_for(scales_arr.get_size(), [&](size_t i) {
+            scales[i] = ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block)));
+            unpack_32_4(data + i * bytes_per_block + 2, weights + i * 16);
+            // Convert u4 to i4: XOR 0x88 toggles bit 3 of both nibbles, i.e. (v - 8) in 4-bit two's complement.
+            for (int j = 0; j < 16; ++j) {
+                weights[i * 16 + j] ^= 0x88;
+            }
+        });
+    }
 }
 
 // Extracts (weight, scales, zp) from Q4_1 tensors.
@@ -123,6 +128,7 @@ void extract_q4_1_data(const ggml_tensor * tensor,
 
 // Extracts (weight, scales, zp) from Q8_0 tensors.
 // Data layout is: |16 bit scale|32 x 8bit weights|.
+// When weights_arr is i8 (symmetric), zp_arr is not touched and the int8 weights are copied as-is.
 void extract_q8_0_data(const ggml_tensor * tensor,
                        ov::Tensor & weights_arr,
                        ov::Tensor & scales_arr,
@@ -133,29 +139,30 @@ void extract_q8_0_data(const ggml_tensor * tensor,
     auto * data = static_cast<uint8_t *>(tensor->data);
     auto * weights = static_cast<uint8_t *>(weights_arr.data());
     auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
-    auto * zp = static_cast<uint8_t *>(zp_arr.data());
-
-    bool is_scalar_zp = (zp_arr.get_size() == 1);  // Symmetric quantization
 
-    // For Q8_0, zero point is always 128
-    if (is_scalar_zp) {
-        zp[0] = 128;
-    }
+    bool is_symmetric = (weights_arr.get_element_type() == ov::element::i8);  // Signed i8 path
 
-    ov::parallel_for(scales_arr.get_size(), [&](size_t i) {
-        uint8_t * block_data = data + i * bytes_per_block;
-        scales[i] = ov::float16::from_bits(*(uint16_t *) block_data);
-        // For asymmetric quantization, store per-block zero points
-        if (!is_scalar_zp) {
+    if (!is_symmetric) {
+        auto * zp = static_cast<uint8_t *>(zp_arr.data());
+        ov::parallel_for(scales_arr.get_size(), [&](size_t i) {
+            uint8_t * block_data = data + i * bytes_per_block;
+            scales[i] = ov::float16::from_bits(*(uint16_t *) block_data);
             zp[i] = 128;
-        }
-        for (size_t j = 0; j < weights_per_block; ++j) {
-            uint8_t x = block_data[j + 2];  // j+2 to skip the scale bytes.
-            // Original data is in int8_t, so we add a bias of -128 and invert the first bit.
-            x ^= 1 << 7;
-            weights[i * weights_per_block + j] = x;
-        }
-    });
+            for (size_t j = 0; j < weights_per_block; ++j) {
+                uint8_t x = block_data[j + 2];
+                x ^= 1 << 7;  // Convert int8 to uint8 by flipping sign bit
+                weights[i * weights_per_block + j] = x;
+            }
+        });
+    } else {
+        // Symmetric: store original int8 values directly (no unsigned bias)
+        ov::parallel_for(scales_arr.get_size(), [&](size_t i) {
+            uint8_t * block_data = data + i * bytes_per_block;
+            scales[i] = ov::float16::from_bits(*(uint16_t *) block_data);
+            // Copy int8 weights as-is (the tensor element type is i8)
+            memcpy(weights + i * weights_per_block, block_data + 2, weights_per_block);
+        });
+    }
 }
 
 void unpack_256_4(const uint8_t * data, uint8_t * dst) {
@@ -256,44 +263,62 @@ void extract_q6_k_data(const ggml_tensor * tensor,
     auto * data = static_cast<uint8_t *>(tensor->data);
     auto * weights = static_cast<uint8_t *>(weights_arr.data());
     auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
-    auto * zp = static_cast<uint8_t *>(zp_arr.data());
-
-    bool is_scalar_zp = (zp_arr.get_size() == 1);  // Symmetric quantization
-
-    // For Q6_K, zero point is always 32
-    if (is_scalar_zp) {
-        zp[0] = 32;
-    }
-
-    ov::parallel_for(n_super_block, [&](size_t i) {
-        uint8_t * block_data = data + i * bytes_per_block;
 
-        float scale_factor =
-            static_cast<float>(ov::float16::from_bits(*((uint16_t *) block_data + 104)));  // (128+64+16)/2
+    bool is_symmetric = (weights_arr.get_element_type() == ov::element::i8);  // Signed i8 path
 
-        for (size_t j = 0; j < 16; j++) {
-            scales[j + i * 16] =
-                ov::float16(scale_factor * static_cast<float>(*((int8_t *) (block_data + 128 + 64 + j))));
-            // For asymmetric quantization, store per-block zero points
-            if (!is_scalar_zp) {
+    if (!is_symmetric) {
+        auto * zp = static_cast<uint8_t *>(zp_arr.data());
+        ov::parallel_for(n_super_block, [&](size_t i) {
+            uint8_t * block_data = data + i * bytes_per_block;
+            float scale_factor = static_cast<float>(ov::float16::from_bits(*((uint16_t *) block_data + 104)));
+            for (size_t j = 0; j < 16; j++) {
+                scales[j + i * 16] =
+                    ov::float16(scale_factor * static_cast<float>(*((int8_t *) (block_data + 128 + 64 + j))));
                 zp[j + i * 16] = 32;
             }
-        }
-
-        uint8_t * ql = block_data;
-        uint8_t * qh = block_data + 128;
-
-        for (int64_t j = 0; j < 32; ++j) {
-            weights[i * 256 + j] = (ql[j] & 0xF) | (((qh[j] >> 0) & 3) << 4);
-            weights[i * 256 + j + 32] = (ql[32 + j] & 0xF) | (((qh[j] >> 2) & 3) << 4);
-            weights[i * 256 + j + 64] = (ql[j] >> 4) | (((qh[j] >> 4) & 3) << 4);
-            weights[i * 256 + j + 96] = (ql[32 + j] >> 4) | (((qh[j] >> 6) & 3) << 4);
-            weights[i * 256 + j + 128] = (ql[64 + j] & 0xF) | (((qh[32 + j] >> 0) & 3) << 4);
-            weights[i * 256 + j + 160] = (ql[96 + j] & 0xF) | (((qh[32 + j] >> 2) & 3) << 4);
-            weights[i * 256 + j + 192] = (ql[64 + j] >> 4) | (((qh[32 + j] >> 4) & 3) << 4);
-            weights[i * 256 + j + 224] = (ql[96 + j] >> 4) | (((qh[32 + j] >> 6) & 3) << 4);
-        }
-    });
+            uint8_t * ql = block_data;
+            uint8_t * qh = block_data + 128;
+            for (int64_t j = 0; j < 32; ++j) {
+                weights[i * 256 + j] = (ql[j] & 0xF) | (((qh[j] >> 0) & 3) << 4);
+                weights[i * 256 + j + 32] = (ql[32 + j] & 0xF) | (((qh[j] >> 2) & 3) << 4);
+                weights[i * 256 + j + 64] = (ql[j] >> 4) | (((qh[j] >> 4) & 3) << 4);
+                weights[i * 256 + j + 96] = (ql[32 + j] >> 4) | (((qh[j] >> 6) & 3) << 4);
+                weights[i * 256 + j + 128] = (ql[64 + j] & 0xF) | (((qh[32 + j] >> 0) & 3) << 4);
+                weights[i * 256 + j + 160] = (ql[96 + j] & 0xF) | (((qh[32 + j] >> 2) & 3) << 4);
+                weights[i * 256 + j + 192] = (ql[64 + j] >> 4) | (((qh[32 + j] >> 4) & 3) << 4);
+                weights[i * 256 + j + 224] = (ql[96 + j] >> 4) | (((qh[32 + j] >> 6) & 3) << 4);
+            }
+        });
+    } else {
+        // Symmetric: subtract 32 from each weight to store as signed i8
+        ov::parallel_for(n_super_block, [&](size_t i) {
+            uint8_t * block_data = data + i * bytes_per_block;
+            float scale_factor = static_cast<float>(ov::float16::from_bits(*((uint16_t *) block_data + 104)));
+            for (size_t j = 0; j < 16; j++) {
+                scales[j + i * 16] =
+                    ov::float16(scale_factor * static_cast<float>(*((int8_t *) (block_data + 128 + 64 + j))));
+            }
+            uint8_t * ql = block_data;
+            uint8_t * qh = block_data + 128;
+            auto * signed_weights = reinterpret_cast<int8_t *>(weights);
+            for (int64_t j = 0; j < 32; ++j) {
+                signed_weights[i * 256 + j] = static_cast<int8_t>((ql[j] & 0xF) | (((qh[j] >> 0) & 3) << 4)) - 32;
+                signed_weights[i * 256 + j + 32] =
+                    static_cast<int8_t>((ql[32 + j] & 0xF) | (((qh[j] >> 2) & 3) << 4)) - 32;
+                signed_weights[i * 256 + j + 64] = static_cast<int8_t>((ql[j] >> 4) | (((qh[j] >> 4) & 3) << 4)) - 32;
+                signed_weights[i * 256 + j + 96] =
+                    static_cast<int8_t>((ql[32 + j] >> 4) | (((qh[j] >> 6) & 3) << 4)) - 32;
+                signed_weights[i * 256 + j + 128] =
+                    static_cast<int8_t>((ql[64 + j] & 0xF) | (((qh[32 + j] >> 0) & 3) << 4)) - 32;
+                signed_weights[i * 256 + j + 160] =
+                    static_cast<int8_t>((ql[96 + j] & 0xF) | (((qh[32 + j] >> 2) & 3) << 4)) - 32;
+                signed_weights[i * 256 + j + 192] =
+                    static_cast<int8_t>((ql[64 + j] >> 4) | (((qh[32 + j] >> 4) & 3) << 4)) - 32;
+                signed_weights[i * 256 + j + 224] =
+                    static_cast<int8_t>((ql[96 + j] >> 4) | (((qh[32 + j] >> 6) & 3) << 4)) - 32;
+            }
+        });
+    }
 }
 
 static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t * d, uint8_t * m) {
@@ -389,11 +414,10 @@ ov::Output<ov::Node> make_int8_weights(ov::Tensor & weight,
                                        size_t group_size,
                                        bool use_bias) {
     ov::Shape orig_shape = weight.get_shape();
+    bool is_signed = (weight.get_element_type() == ov::element::i8);  // Symmetric: signed weights, no ZP
 
     // Expand dimensions for scales and zp/bias
     auto scale_shape = scales.get_shape();
-    auto zp_shape = zp.get_shape();
-    bool is_scalar_zp = zp_shape.empty();  // Symmetric quantization
 
     ov::Shape packed_shape = {orig_shape[0], orig_shape[1] / group_size, group_size};
 
@@ -403,37 +427,48 @@ ov::Output<ov::Node> make_int8_weights(ov::Tensor & weight,
     } else {
         scale_shape.push_back(1);
         scales.set_shape(scale_shape);
-        // For symmetric quantization, zp remains scalar (don't resize)
-        if (!is_scalar_zp) {
+        if (!is_signed && zp.get_size() > 0) {
+            auto zp_shape = zp.get_shape();
             zp_shape.push_back(1);
             zp.set_shape(zp_shape);
         }
     }
 
-    // Create graph nodes
-    auto weights_node = std::make_shared<ov::op::v0::Constant>(ov::element::u8, packed_shape,
-                                                               static_cast<uint8_t *>(weight.data()), nullptr);
-    weights_node->get_rt_info()["__gguf_tensor_holder"] = weight;
     auto scales_f16 = std::make_shared<ov::op::v0::Constant>(scales);
-    auto weights_f16 = std::make_shared<ov::op::v0::Convert>(weights_node, ov::element::f16);
 
     ov::Output<ov::Node> result;
-    if (use_bias && !is_scalar_zp) {
-        // Bias path: w * s + b (zp tensor holds f16 bias values)
-        auto bias_f16 = std::make_shared<ov::op::v0::Constant>(zp);
-        auto w_s = std::make_shared<ov::op::v1::Multiply>(weights_f16, scales_f16, ov::op::AutoBroadcastType::NUMPY);
-        result = std::make_shared<ov::op::v1::Add>(w_s, bias_f16, ov::op::AutoBroadcastType::NUMPY);
+    if (is_signed) {
+        // Signed path: q * s (no zero point subtraction needed)
+        auto weights_node = std::make_shared<ov::op::v0::Constant>(ov::element::i8, packed_shape,
+                                                                   static_cast<uint8_t *>(weight.data()), nullptr);
+        weights_node->get_rt_info()["__gguf_tensor_holder"] = weight;
+        auto weights_f16 = std::make_shared<ov::op::v0::Convert>(weights_node, ov::element::f16);
+        result = std::make_shared<ov::op::v1::Multiply>(weights_f16, scales_f16, ov::op::AutoBroadcastType::NUMPY);
     } else {
-        // Zero point path: (w - zp) * s
-        auto zero_point = std::make_shared<ov::op::v0::Constant>(zp);
-        float zp_value;
-        if (ov::op::util::get_single_value(zero_point, zp_value)) {
-            zero_point = ov::op::v0::Constant::create(zero_point->get_element_type(), {}, {zp_value});
+        // Unsigned path
+        auto weights_node = std::make_shared<ov::op::v0::Constant>(ov::element::u8, packed_shape,
+                                                                   static_cast<uint8_t *>(weight.data()), nullptr);
+        weights_node->get_rt_info()["__gguf_tensor_holder"] = weight;
+        auto weights_f16 = std::make_shared<ov::op::v0::Convert>(weights_node, ov::element::f16);
+
+        if (use_bias && zp.get_size() > 0) {
+            // Bias path: w * s + b (zp tensor holds f16 bias values)
+            auto bias_f16 = std::make_shared<ov::op::v0::Constant>(zp);
+            auto w_s =
+                std::make_shared<ov::op::v1::Multiply>(weights_f16, scales_f16, ov::op::AutoBroadcastType::NUMPY);
+            result = std::make_shared<ov::op::v1::Add>(w_s, bias_f16, ov::op::AutoBroadcastType::NUMPY);
+        } else {
+            // Zero point path: (w - zp) * s
+            auto zero_point = std::make_shared<ov::op::v0::Constant>(zp);
+            float zp_value;
+            if (ov::op::util::get_single_value(zero_point, zp_value)) {
+                zero_point = ov::op::v0::Constant::create(zero_point->get_element_type(), {}, {zp_value});
+            }
+            auto zero_point_f16 = std::make_shared<ov::op::v0::Convert>(zero_point, ov::element::f16);
+            auto w_zp =
+                std::make_shared<ov::op::v1::Subtract>(weights_f16, zero_point_f16, ov::op::AutoBroadcastType::NUMPY);
+            result = std::make_shared<ov::op::v1::Multiply>(w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY);
         }
-        auto zero_point_f16 = std::make_shared<ov::op::v0::Convert>(zero_point, ov::element::f16);
-        auto w_zp =
-            std::make_shared<ov::op::v1::Subtract>(weights_f16, zero_point_f16, ov::op::AutoBroadcastType::NUMPY);
-        result = std::make_shared<ov::op::v1::Multiply>(w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY);
     }
 
     if (packed_shape.size() != 2) {
@@ -452,11 +487,10 @@ ov::Output<ov::Node> make_int4_weights(ov::Tensor & weight,
                                        size_t group_size,
                                        bool use_bias) {
     ov::Shape orig_weight_shape = weight.get_shape();
+    bool is_signed = (weight.get_element_type() == ov::element::i4);  // Symmetric: signed weights, no ZP
 
     // Expand dimensions for scales and zp/bias
     ov::Shape scale_shape = scales.get_shape();
-    auto zp_shape = zp.get_shape();
-    bool is_scalar_zp = zp_shape.empty();  // Symmetric quantization
 
     // Create INT4 weight tensor
     ov::Shape packed_shape = {orig_weight_shape[0], orig_weight_shape[1] / group_size, group_size};
@@ -467,36 +501,48 @@ ov::Output<ov::Node> make_int4_weights(ov::Tensor & weight,
     } else {
         scale_shape.push_back(1);
         scales.set_shape(scale_shape);
-        // For symmetric quantization, zp remains scalar (don't resize)
-        if (!is_scalar_zp) {
+        if (!is_signed && zp.get_size() > 0) {
+            auto zp_shape = zp.get_shape();
             zp_shape.push_back(1);
             zp.set_shape(zp_shape);
         }
     }
 
-    auto weights_node = std::make_shared<ov::op::v0::Constant>(ov::element::u4, packed_shape,
-                                                               static_cast<uint8_t *>(weight.data()), nullptr);
-    weights_node->get_rt_info()["__gguf_tensor_holder"] = weight;
-    auto weights_f16 = std::make_shared<ov::op::v0::Convert>(weights_node, ov::element::f16);
     auto scales_f16 = std::make_shared<ov::op::v0::Constant>(scales);
 
     ov::Output<ov::Node> result;
-    if (use_bias && !is_scalar_zp) {
-        // Bias path: w * s + b (zp tensor holds f16 bias values)
-        auto bias_f16 = std::make_shared<ov::op::v0::Constant>(zp);
-        auto w_s = std::make_shared<ov::op::v1::Multiply>(weights_f16, scales_f16, ov::op::AutoBroadcastType::NUMPY);
-        result = std::make_shared<ov::op::v1::Add>(w_s, bias_f16, ov::op::AutoBroadcastType::NUMPY);
+    if (is_signed) {
+        // Signed path: q * s (no zero point subtraction needed)
+        auto weights_node = std::make_shared<ov::op::v0::Constant>(ov::element::i4, packed_shape,
+                                                                   static_cast<uint8_t *>(weight.data()), nullptr);
+        weights_node->get_rt_info()["__gguf_tensor_holder"] = weight;
+        auto weights_f16 = std::make_shared<ov::op::v0::Convert>(weights_node, ov::element::f16);
+        result = std::make_shared<ov::op::v1::Multiply>(weights_f16, scales_f16, ov::op::AutoBroadcastType::NUMPY);
     } else {
-        // Zero point path: (w - zp) * s
-        auto zero_points_node = std::make_shared<ov::op::v0::Constant>(zp);
-        float zp_value;
-        if (ov::op::util::get_single_value(zero_points_node, zp_value)) {
-            zero_points_node = ov::op::v0::Constant::create(zero_points_node->get_element_type(), {}, {zp_value});
+        // Unsigned path
+        auto weights_node = std::make_shared<ov::op::v0::Constant>(ov::element::u4, packed_shape,
+                                                                   static_cast<uint8_t *>(weight.data()), nullptr);
+        weights_node->get_rt_info()["__gguf_tensor_holder"] = weight;
+        auto weights_f16 = std::make_shared<ov::op::v0::Convert>(weights_node, ov::element::f16);
+
+        if (use_bias && zp.get_size() > 0) {
+            // Bias path: w * s + b (zp tensor holds f16 bias values)
+            auto bias_f16 = std::make_shared<ov::op::v0::Constant>(zp);
+            auto w_s =
+                std::make_shared<ov::op::v1::Multiply>(weights_f16, scales_f16, ov::op::AutoBroadcastType::NUMPY);
+            result = std::make_shared<ov::op::v1::Add>(w_s, bias_f16, ov::op::AutoBroadcastType::NUMPY);
+        } else {
+            // Zero point path: (w - zp) * s
+            auto zero_points_node = std::make_shared<ov::op::v0::Constant>(zp);
+            float zp_value;
+            if (ov::op::util::get_single_value(zero_points_node, zp_value)) {
+                zero_points_node = ov::op::v0::Constant::create(zero_points_node->get_element_type(), {}, {zp_value});
+            }
+            auto zero_points_f16 = std::make_shared<ov::op::v0::Convert>(zero_points_node, ov::element::f16);
+            auto w_zp =
+                std::make_shared<ov::op::v1::Subtract>(weights_f16, zero_points_f16, ov::op::AutoBroadcastType::NUMPY);
+            result = std::make_shared<ov::op::v1::Multiply>(w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY);
         }
-        auto zero_points_f16 = std::make_shared<ov::op::v0::Convert>(zero_points_node, ov::element::f16);
-        auto w_zp =
-            std::make_shared<ov::op::v1::Subtract>(weights_f16, zero_points_f16, ov::op::AutoBroadcastType::NUMPY);
-        result = std::make_shared<ov::op::v1::Multiply>(w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY);
     }
 
     if (packed_shape.size() != 2) {
@@ -699,24 +745,32 @@ OvWeight process_weight_tensor(const ggml_tensor * tensor, const void * data, vo
 
     // Quantized path (normal extraction or quantized requant)
     // Create weight/scale/zp tensors - shared between both paths
-    ov::element::Type weight_type = layout.is_u4 ? ov::element::u4 : ov::element::u8;
+    // For symmetric quantization, use signed types (i4/i8) and no ZP tensor
+    ov::element::Type weight_type = layout.is_symmetric ? (layout.is_u4 ? ov::element::i4 : ov::element::i8) :
+                                                          (layout.is_u4 ? ov::element::u4 : ov::element::u8);
     ov::Shape scale_shape = {node_shape[0], node_shape[1] / layout.weights_per_block};
-    ov::Shape zp_shape = layout.is_symmetric ? ov::Shape{} : scale_shape;
 
     if (output_base_ptr) {
         uint8_t * buf_base = static_cast<uint8_t *>(output_base_ptr);
         result.weights = ov::Tensor(weight_type, node_shape, buf_base + layout.weights_offset);
         result.scales = ov::Tensor(ov::element::f16, scale_shape, buf_base + layout.scales_offset);
-        result.zp = ov::Tensor(weight_type, zp_shape, buf_base + layout.zp_offset);
+        if (!layout.is_symmetric) {
+            ov::element::Type zp_type = layout.is_u4 ? ov::element::u4 : ov::element::u8;
+            result.zp = ov::Tensor(zp_type, scale_shape, buf_base + layout.zp_offset);
+        }
+        // else: result.zp remains default-constructed (empty) for symmetric
     } else {
         result.weights = ov::Tensor(weight_type, node_shape);
         result.scales = ov::Tensor(ov::element::f16, scale_shape);
-        if (use_bias && !layout.is_symmetric) {
-            // bias only has effect for asymmetric quant
-            result.zp = ov::Tensor(ov::element::f16, zp_shape);
-        } else {
-            result.zp = ov::Tensor(weight_type, zp_shape);
+        if (!layout.is_symmetric) {
+            if (use_bias) {
+                result.zp = ov::Tensor(ov::element::f16, scale_shape);
+            } else {
+                ov::element::Type zp_type = layout.is_u4 ? ov::element::u4 : ov::element::u8;
+                result.zp = ov::Tensor(zp_type, scale_shape);
+            }
         }
+        // else: result.zp remains default-constructed (empty) for symmetric
     }
 
     if (layout.is_requant && layout.requant_type.has_value()) {
@@ -741,59 +795,75 @@ void quantize_q4_0(const float * x,
 
     auto * weights = static_cast<uint8_t *>(weights_arr.data());
     auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
-    auto * zp = static_cast<uint8_t *>(zp_arr.data());
-    bool is_scalar_zp = (zp_arr.get_size() == 1);  // Symmetric quantization
-
-    // For Q4_0, zero point is always 8
-    if (is_scalar_zp) {
-        zp[0] = 8 | (8 << 4);  // Pack two 4-bit values
-    }
+    bool is_symmetric = (weights_arr.get_element_type() == ov::element::i4);  // Signed i4 path
 
-    for (int i = 0; i < nb; i++) {
-        float amax = 0.0f;  // absolute max
-        float max = 0.0f;
-
-        for (int j = 0; j < qk; j++) {
-            const float v = x[i * qk + j];
-            if (amax < fabsf(v)) {
-                amax = fabsf(v);
-                max = v;
+    if (!is_symmetric) {
+        auto * zp = static_cast<uint8_t *>(zp_arr.data());
+        for (int i = 0; i < nb; i++) {
+            float amax = 0.0f;
+            float max = 0.0f;
+            for (int j = 0; j < qk; j++) {
+                const float v = x[i * qk + j];
+                if (amax < fabsf(v)) {
+                    amax = fabsf(v);
+                    max = v;
+                }
             }
-        }
-
-        const float d = max / -8;
-
-        if (d == 0) {
-            scales[i] = ov::float16(1.0f);
-            // zp is already set to 8 for symmetric, or set per-block for asymmetric
-            if (!is_scalar_zp) {
+            const float d = max / -8;
+            if (d == 0) {
+                scales[i] = ov::float16(1.0f);
                 if (i % 2 == 0) {
                     zp[i / 2] = 8;
                 } else {
                     zp[i / 2] |= (8 << 4);
                 }
+                memset(weights + i * qk / 2, 8 | (8 << 4), qk / 2);
+                continue;
             }
-            memset(weights + i * qk / 2, 8 | (8 << 4), qk / 2);
-            continue;
-        }
-
-        const float id = 1.0f / d;
-        scales[i] = ov::float16(d);
-        // For asymmetric quantization, store per-block zero points
-        if (!is_scalar_zp) {
+            const float id = 1.0f / d;
+            scales[i] = ov::float16(d);
             if (i % 2 == 0) {
                 zp[i / 2] = 8;
             } else {
                 zp[i / 2] |= (8 << 4);
             }
+            for (int j = 0; j < qk / 2; ++j) {
+                const float x0 = x[i * qk + 2 * j] * id;
+                const float x1 = x[i * qk + 2 * j + 1] * id;
+                const uint8_t xi0 = MIN(15, (int8_t) (x0 + 8.5f));
+                const uint8_t xi1 = MIN(15, (int8_t) (x1 + 8.5f));
+                weights[i * qk / 2 + j] = xi0 | (xi1 << 4);
+            }
         }
-
-        for (int j = 0; j < qk / 2; ++j) {
-            const float x0 = x[i * qk + 2 * j] * id;
-            const float x1 = x[i * qk + 2 * j + 1] * id;
-            const uint8_t xi0 = MIN(15, (int8_t) (x0 + 8.5f));
-            const uint8_t xi1 = MIN(15, (int8_t) (x1 + 8.5f));
-            weights[i * qk / 2 + j] = xi0 | (xi1 << 4);
+    } else {
+        // Symmetric: produce signed i4 values in [-8, 7]
+        for (int i = 0; i < nb; i++) {
+            float amax = 0.0f;
+            float max = 0.0f;
+            for (int j = 0; j < qk; j++) {
+                const float v = x[i * qk + j];
+                if (amax < fabsf(v)) {
+                    amax = fabsf(v);
+                    max = v;
+                }
+            }
+            const float d = max / -8;
+            if (d == 0) {
+                scales[i] = ov::float16(1.0f);
+                // i4 value 0 packed: 0x00
+                memset(weights + i * qk / 2, 0, qk / 2);
+                continue;
+            }
+            const float id = 1.0f / d;
+            scales[i] = ov::float16(d);
+            for (int j = 0; j < qk / 2; ++j) {
+                const float x0 = x[i * qk + 2 * j] * id;
+                const float x1 = x[i * qk + 2 * j + 1] * id;
+                // Signed i4: range [-8, 7]. Quantize as round(x*id), then pack as 4-bit two's complement.
+                int8_t si0 = (int8_t) std::max(-8, std::min(7, (int) roundf(x0)));
+                int8_t si1 = (int8_t) std::max(-8, std::min(7, (int) roundf(x1)));
+                weights[i * qk / 2 + j] = (si0 & 0x0F) | ((si1 & 0x0F) << 4);
+            }
         }
     }
 }
@@ -809,36 +879,42 @@ void quantize_q8_0(const float * x,
 
     auto * weights = static_cast<uint8_t *>(weights_arr.data());
     auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
-    auto * zp = static_cast<uint8_t *>(zp_arr.data());
-    bool is_scalar_zp = (zp_arr.get_size() == 1);  // Symmetric quantization
-
-    // For Q8_0, zero point is always 128
-    if (is_scalar_zp) {
-        zp[0] = 128;
-    }
-
-    for (int i = 0; i < nb; i++) {
-        float amax = 0.0f;  // absolute max
+    bool is_symmetric = (weights_arr.get_element_type() == ov::element::i8);  // Signed i8 path
 
-        for (int j = 0; j < qk; j++) {
-            const float v = x[i * qk + j];
-            if (amax < fabsf(v)) {
-                amax = fabsf(v);
+    if (!is_symmetric) {
+        auto * zp = static_cast<uint8_t *>(zp_arr.data());
+        for (int i = 0; i < nb; i++) {
+            float amax = 0.0f;
+            for (int j = 0; j < qk; j++) {
+                const float v = x[i * qk + j];
+                amax = std::max(amax, fabsf(v));
             }
-        }
-
-        const float d = amax / 127.0f;
-        const float id = d ? 1.0f / d : 0.0f;
-        scales[i] = ov::float16(d);
-        // For asymmetric quantization, store per-block zero points
-        if (!is_scalar_zp) {
+            const float d = amax / 127.0f;
+            const float id = d ? 1.0f / d : 0.0f;
+            scales[i] = ov::float16(d);
             zp[i] = 128;
+            for (int j = 0; j < qk; ++j) {
+                const float x0 = x[i * qk + j] * id;
+                const int8_t xi0 = roundf(x0);
+                weights[i * qk + j] = (uint8_t) (xi0 + 128);
+            }
         }
-
-        for (int j = 0; j < qk; ++j) {
-            const float x0 = x[i * qk + j] * id;
-            const int8_t xi0 = roundf(x0);
-            weights[i * qk + j] = (uint8_t) (xi0 + 128);
+    } else {
+        // Symmetric: store signed int8 values directly
+        auto * signed_weights = reinterpret_cast<int8_t *>(weights);
+        for (int i = 0; i < nb; i++) {
+            float amax = 0.0f;
+            for (int j = 0; j < qk; j++) {
+                const float v = x[i * qk + j];
+                amax = std::max(amax, fabsf(v));
+            }
+            const float d = amax / 127.0f;
+            const float id = d ? 1.0f / d : 0.0f;
+            scales[i] = ov::float16(d);
+            for (int j = 0; j < qk; ++j) {
+                const float x0 = x[i * qk + j] * id;
+                signed_weights[i * qk + j] = (int8_t) roundf(x0);
+            }
         }
     }
 }
@@ -861,12 +937,8 @@ void quantize_q8_1(const float * x,
 
         for (int j = 0; j < qk; j++) {
             const float v = x[i * qk + j];
-            if (v < min) {
-                min = v;
-            }
-            if (v > max) {
-                max = v;
-            }
+            min = std::min(v, min);
+            max = std::max(v, max);
         }
 
         const float d = (max - min) / ((1 << 8) - 1);
diff --git a/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.cpp b/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.cpp
deleted file mode 100644
index ed2a3ab6d1b..00000000000
--- a/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.cpp
+++ /dev/null
@@ -1,123 +0,0 @@
-#include "eliminate_zp.h"
-
-#include <openvino/core/graph_util.hpp>
-#include <openvino/core/parallel.hpp>
-#include <openvino/core/rt_info.hpp>
-#include <openvino/op/constant.hpp>
-#include <openvino/op/convert.hpp>
-#include <openvino/op/multiply.hpp>
-#include <openvino/op/subtract.hpp>
-#include <openvino/pass/pattern/op/label.hpp>
-#include <openvino/pass/pattern/op/pattern.hpp>
-#include <openvino/pass/pattern/op/wrap_type.hpp>
-
-namespace ov {
-namespace frontend {
-namespace ggml {
-namespace pass {
-
-EliminateZeroPoints::EliminateZeroPoints() {
-    // Find pattern:
-    // (Multiply Any(scale)
-    //           (Subtract (Convert Constant(data)))
-    //                     (Convert Constant(zero_point)))
-    // where zero_point is a scalar
-    // If data is u4 and zp value is 8 (q4_0), Replace the Subtract with an i4 Constant whose value is data - zp_val
-    // If data is u8 and zp value is 128 (q8_0) or 32 (q6_k), Replace the Subtract with an i8 Constant
-
-    auto m_data_constant = ov::pass::pattern::wrap_type<ov::op::v0::Constant>();
-    auto m_data_convert = ov::pass::pattern::wrap_type<ov::op::v0::Convert>({m_data_constant});
-
-    auto m_zp_constant = ov::pass::pattern::wrap_type<ov::op::v0::Constant>();
-    auto m_zp_convert = ov::pass::pattern::wrap_type<ov::op::v0::Convert>({m_zp_constant});
-
-    auto m_subtract = ov::pass::pattern::wrap_type<ov::op::v1::Subtract>({m_data_convert, m_zp_convert});
-    auto m_scale = ov::pass::pattern::any_input();
-    auto m_multiply = ov::pass::pattern::wrap_type<ov::op::v1::Multiply>({m_scale, m_subtract});
-
-    const auto callback = [=](ov::pass::pattern::Matcher & m) {
-        const auto & pattern_map = m.get_pattern_value_map();
-
-        auto multiply_node =
-            std::dynamic_pointer_cast<ov::op::v1::Multiply>(pattern_map.at(m_multiply).get_node_shared_ptr());
-        auto subtract_node =
-            std::dynamic_pointer_cast<ov::op::v1::Subtract>(pattern_map.at(m_subtract).get_node_shared_ptr());
-        auto data_constant =
-            std::dynamic_pointer_cast<ov::op::v0::Constant>(pattern_map.at(m_data_constant).get_node_shared_ptr());
-        auto zp_constant =
-            std::dynamic_pointer_cast<ov::op::v0::Constant>(pattern_map.at(m_zp_constant).get_node_shared_ptr());
-
-        if (!multiply_node || !subtract_node || !data_constant || !zp_constant) {
-            return false;
-        }
-
-        if (ov::shape_size(zp_constant->get_shape()) != 1) {
-            return false;
-        }
-
-        auto data_type = data_constant->get_element_type();
-        auto zp_data = zp_constant->cast_vector<int>();
-
-        if (zp_data.empty()) {
-            return false;
-        }
-
-        int zp_value = zp_data[0];
-
-        bool should_eliminate = false;
-        ov::element::Type target_type;
-
-        if (data_type == ov::element::u4 && zp_value == 8) {
-            should_eliminate = true;
-            target_type = ov::element::i4;
-        } else if (data_type == ov::element::u8 && (zp_value == 128 || zp_value == 32)) {
-            should_eliminate = true;
-            target_type = ov::element::i8;
-        }
-
-        if (!should_eliminate) {
-            return false;
-        }
-
-        auto data_shape = data_constant->get_shape();
-        size_t total_elements = ov::shape_size(data_shape);
-
-        std::shared_ptr<ov::op::v0::Constant> new_constant;
-
-        // TODO improve performance
-        if (data_type == ov::element::u4) {
-            auto data_values = data_constant->cast_vector<uint8_t>();
-            std::vector<int8_t> adjusted_values(total_elements);
-
-            ov::parallel_for(total_elements, [&](size_t i) {
-                adjusted_values[i] = static_cast<int8_t>(static_cast<int>(data_values[i]) - 8);
-            });
-
-            new_constant = std::make_shared<ov::op::v0::Constant>(target_type, data_shape, adjusted_values);
-        } else if (data_type == ov::element::u8) {
-            auto data_values = data_constant->cast_vector<uint8_t>();
-            std::vector<int8_t> adjusted_values(total_elements);
-
-            ov::parallel_for(total_elements, [&, zp_value](size_t i) {
-                adjusted_values[i] = static_cast<int8_t>(static_cast<int>(data_values[i]) - zp_value);
-            });
-
-            new_constant = std::make_shared<ov::op::v0::Constant>(target_type, data_shape, adjusted_values);
-        }
-
-        auto new_convert =
-            std::make_shared<ov::op::v0::Convert>(new_constant, subtract_node->get_output_element_type(0));
-        ov::replace_node(subtract_node, new_convert);
-
-        return true;
-    };
-
-    register_matcher(
-        std::make_shared<ov::pass::pattern::Matcher>(m_multiply, "ov::frontend::ggml::pass::EliminateZeroPoints"),
-        callback);
-}
-
-}  // namespace pass
-}  // namespace ggml
-}  // namespace frontend
-}  // namespace ov
diff --git a/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.h b/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.h
deleted file mode 100644
index edd3cd718d9..00000000000
--- a/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.h
+++ /dev/null
@@ -1,17 +0,0 @@
-#include "openvino/pass/matcher_pass.hpp"
-
-namespace ov {
-namespace frontend {
-namespace ggml {
-namespace pass {
-
-class EliminateZeroPoints : public ov::pass::MatcherPass {
-public:
-    OPENVINO_MATCHER_PASS_RTTI("ov::frontend::ggml::pass::EliminateZeroPoints")
-    EliminateZeroPoints();
-};
-
-}  // namespace pass
-}  // namespace ggml
-}  // namespace frontend
-}  // namespace ov
diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp
index 23a1dea2496..2d7b52a69f3 100644
--- a/ggml/src/ggml-openvino/openvino/translate_session.cpp
+++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp
@@ -3,7 +3,6 @@
 #include "ggml-openvino/openvino/node_context.h"
 #include "ggml-openvino/openvino/utils.h"
 #include "input_model.h"
-#include "pass/eliminate_zp.h"
 #include "pass/mark_decompression_convert_constant_folding.h"
 #include "pass/squeeze_matmul.h"
 
@@ -12,6 +11,7 @@
 #include <map>
 #include <memory>
 #include <openvino/core/node.hpp>
+#include <openvino/core/preprocess/pre_post_process.hpp>
 #include <openvino/op/add.hpp>
 #include <openvino/op/broadcast.hpp>
 #include <openvino/op/concat.hpp>
@@ -33,7 +33,6 @@
 #include <openvino/op/unsqueeze.hpp>
 #include <openvino/pass/constant_folding.hpp>
 #include <openvino/pass/make_stateful.hpp>
-#include <openvino/core/preprocess/pre_post_process.hpp>
 
 namespace ov {
 namespace frontend {
@@ -257,7 +256,6 @@ std::shared_ptr<Model> TranslateSession::apply_transformations(std::shared_ptr<M
         }
 
         if (ggml_model_decoder->is_static()) {
-            manager.register_pass<pass::EliminateZeroPoints>();
             manager.register_pass<pass::SqueezeMatmul>();
         }
         manager.run_passes(model);

From e8045942f6ca329d1a9d8ed9b7a90a21fa1d4166 Mon Sep 17 00:00:00 2001
From: "Yu, Zijun" <zijun.yu@intel.com>
Date: Thu, 19 Mar 2026 09:35:22 +0530
Subject: [PATCH 05/24] Use weightless caching

---
 ggml/src/ggml-openvino/ggml-openvino-extra.cpp | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/ggml/src/ggml-openvino/ggml-openvino-extra.cpp b/ggml/src/ggml-openvino/ggml-openvino-extra.cpp
index 317b34e9053..4140136aca2 100644
--- a/ggml/src/ggml-openvino/ggml-openvino-extra.cpp
+++ b/ggml/src/ggml-openvino/ggml-openvino-extra.cpp
@@ -6,6 +6,7 @@
 #include <cstring>
 #include <openvino/runtime/intel_gpu/ocl/ocl.hpp>
 #include <openvino/runtime/intel_npu/level_zero/level_zero.hpp>
+#include <openvino/runtime/properties.hpp>
 #include <optional>
 
 ov::Core & ov_singleton_core() {
@@ -42,11 +43,13 @@ void ggml_openvino_device_config::init() {
             {"NPUW_DQ",                           "YES"   },
             {"NPUW_DQ_FULL",                      "NO"    },
         };
-        if (cache_dir) {
+        if (cache_dir && strlen(cache_dir) > 0) {
             compile_config["NPUW_CACHE_DIR"] = cache_dir;
+            compile_config.insert(ov::cache_mode(ov::CacheMode::OPTIMIZE_SIZE));
         }
-    } else if (cache_dir) {
-        ov_singleton_core().set_property(ov::cache_dir(cache_dir));
+    } else if (cache_dir && strlen(cache_dir) > 0) {
+        compile_config.insert(ov::cache_dir(cache_dir));
+        compile_config.insert(ov::cache_mode(ov::CacheMode::OPTIMIZE_SIZE));
     }
 
     // Initialize remote context with queue sharing for GPU

From 7b8c6f5cdf50c83e25223ab9e3d478206e141891 Mon Sep 17 00:00:00 2001
From: "Yu, Zijun" <zijun.yu@intel.com>
Date: Thu, 19 Mar 2026 09:35:57 +0530
Subject: [PATCH 06/24] Add WeightlessCacheAttribute to reduce NPU memory usage

---
 .../rt_info/weightless_caching_attributes.hpp | 41 +++++++++++++++++++
 .../openvino/translate_session.cpp            | 26 ++++++++++++
 2 files changed, 67 insertions(+)
 create mode 100644 ggml/src/ggml-openvino/openvino/rt_info/weightless_caching_attributes.hpp

diff --git a/ggml/src/ggml-openvino/openvino/rt_info/weightless_caching_attributes.hpp b/ggml/src/ggml-openvino/openvino/rt_info/weightless_caching_attributes.hpp
new file mode 100644
index 00000000000..f051891c481
--- /dev/null
+++ b/ggml/src/ggml-openvino/openvino/rt_info/weightless_caching_attributes.hpp
@@ -0,0 +1,41 @@
+// Copyright (C) 2018-2026 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <openvino/core/core_visibility.hpp>
+#include <openvino/core/node.hpp>
+#include <openvino/core/runtime_attribute.hpp>
+
+namespace ov {
+
+/**
+ * @brief Holds weightless caching attributes of a single constant.
+ *
+ * WeightlessCacheAttribute class represents runtime info attribute that holds
+ * the values of original size of the constant in bytes and the binary offset of the
+ * constant's data in the weights file used by the weightless caching mechanism. It's
+ * not copyable in case the data was changed (the original node was replaced by a new
+ * one produced during the tranformation pipeline) - in that case weightless caching
+ * can't be used for that constant.
+ */
+class OPENVINO_API WeightlessCacheAttribute : public RuntimeAttribute {
+public:
+    OPENVINO_RTTI("WeightlessCacheAttribute", "0", RuntimeAttribute)
+
+    WeightlessCacheAttribute() = delete;
+
+    WeightlessCacheAttribute(size_t original_size, size_t bin_offset, ov::element::Type original_dtype)
+        : original_size(original_size),
+          bin_offset(bin_offset),
+          original_dtype(original_dtype) {}
+
+    bool is_copyable() const override;
+
+    size_t original_size;
+    size_t bin_offset;
+    ov::element::Type original_dtype;
+};
+
+}  // namespace ov
diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp
index 2d7b52a69f3..0f68a1f5062 100644
--- a/ggml/src/ggml-openvino/openvino/translate_session.cpp
+++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp
@@ -5,6 +5,7 @@
 #include "input_model.h"
 #include "pass/mark_decompression_convert_constant_folding.h"
 #include "pass/squeeze_matmul.h"
+#include "rt_info/weightless_caching_attributes.hpp"
 
 #include <cstdint>
 #include <cstdlib>
@@ -239,6 +240,31 @@ std::shared_ptr<Model> TranslateSession::translate_graph(const frontend::InputMo
     resulting_model = std::make_shared<Model>(results, used_params);
 
     apply_transformations(resulting_model);
+
+    // Set WeightlessCacheAttribute on large constants to avoid unnecessary memory copies
+    // in the NPUW plugin. Without this attribute, NPUW's LazyTensor constructor
+    // (lazy_tensor.cpp, op::Const::Const) will memcpy every constant "in case export
+    // occurs", doubling memory usage per compile_model call.
+    //
+    // The bin_offset field serves as a unique key (not a real file offset) — this is
+    // the same convention the GPU plugin uses for non-IR models (see
+    // Plugin::set_weightless_cache_attributes in intel_gpu/src/plugin/plugin.cpp).
+    // Each constant must have a distinct bin_offset, otherwise GPU's weightless cache
+    // import will map multiple constants to the same data.
+    //
+    // Small constants (< 16 elements) are excluded since they may be introduced by
+    // optimization patterns and the overhead is negligible.
+    size_t offset = 0;
+    for (auto & node : resulting_model->get_ordered_ops()) {
+        if (auto cnst = ov::as_type_ptr<ov::op::v0::Constant>(node);
+            cnst && cnst->get_byte_size() / cnst->get_element_type().size() >= 16) {
+            auto & rt_info = cnst->get_rt_info();
+            if (rt_info.find(ov::WeightlessCacheAttribute::get_type_info_static()) == rt_info.end()) {
+                rt_info[ov::WeightlessCacheAttribute::get_type_info_static()] =
+                    ov::WeightlessCacheAttribute(cnst->get_byte_size(), offset++, cnst->get_element_type());
+            }
+        }
+    }
     return resulting_model;
 }
 

From 77bd3541d823903667612ffea0631cad1bb6cc33 Mon Sep 17 00:00:00 2001
From: Mustafa Cavus <mustafa.cavus@intel.com>
Date: Wed, 8 Apr 2026 18:37:18 -0700
Subject: [PATCH 07/24] Gelu tanh support (#125)

---
 ggml/src/ggml-openvino/ggml-openvino.cpp      |  1 +
 .../ggml-openvino/openvino/op/unary_gelu.cpp  | 25 +++++++++++++++++++
 ggml/src/ggml-openvino/openvino/op_table.cpp  |  1 +
 ggml/src/ggml-openvino/openvino/op_table.h    |  1 +
 4 files changed, 28 insertions(+)
 create mode 100644 ggml/src/ggml-openvino/openvino/op/unary_gelu.cpp

diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp
index 0af294d85c8..c41116e6f94 100644
--- a/ggml/src/ggml-openvino/ggml-openvino.cpp
+++ b/ggml/src/ggml-openvino/ggml-openvino.cpp
@@ -935,6 +935,7 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con
                                                  // GGML_OP_SOFT_MAX,
                                                  GGML_OP_SET_ROWS, GGML_OP_FLASH_ATTN_EXT, GGML_OP_CPY};
     static const std::set<ggml_unary_op> supported_unary_ops{
+        GGML_UNARY_OP_GELU,
         GGML_UNARY_OP_SILU,
     };
     static const std::set<ggml_glu_op> supported_glu_ops{
diff --git a/ggml/src/ggml-openvino/openvino/op/unary_gelu.cpp b/ggml/src/ggml-openvino/openvino/op/unary_gelu.cpp
new file mode 100644
index 00000000000..d1e9efc33a5
--- /dev/null
+++ b/ggml/src/ggml-openvino/openvino/op/unary_gelu.cpp
@@ -0,0 +1,25 @@
+#include "../node_context.h"
+#include "../op_table.h"
+#include "../utils.h"
+
+#include <openvino/core/node_output.hpp>
+#include <openvino/op/gelu.hpp>
+
+namespace ov {
+namespace frontend {
+namespace ggml {
+namespace op {
+
+OutputVector translate_unary_gelu(const NodeContext & context) {
+    num_inputs_check(context, 1, 1);
+
+    // ggml's GGML_UNARY_OP_GELU is the tanh approximation; v7::Gelu defaults to ERF, so pass TANH explicitly.
+    auto res = std::make_shared<ov::op::v7::Gelu>(context.get_input(0), ov::op::GeluApproximationMode::TANH);
+
+    return rename_outputs_with_suffix({res}, context.get_name());
+}
+
+}  // namespace op
+}  // namespace ggml
+}  // namespace frontend
+}  // namespace ov
diff --git a/ggml/src/ggml-openvino/openvino/op_table.cpp b/ggml/src/ggml-openvino/openvino/op_table.cpp
index beadafe8103..1385539279c 100644
--- a/ggml/src/ggml-openvino/openvino/op_table.cpp
+++ b/ggml/src/ggml-openvino/openvino/op_table.cpp
@@ -31,6 +31,7 @@ std::unordered_map<std::string, CreatorFunction> get_supported_ops() {
         {"GGML_OP_SOFT_MAX",       op::translate_soft_max                         },
         {"GGML_OP_SUB",            op::translate_1to1_match_2_inputs<v1::Subtract>},
         {"GGML_OP_TRANSPOSE",      op::translate_transpose                        },
+        {"GGML_UNARY_OP_GELU",     op::translate_unary_gelu                       },
         {"GGML_UNARY_OP_SILU",     op::translate_unary_silu                       },
         {"GGML_OP_VIEW",           op::translate_view                             },
         {"GGML_GLU_OP_SWIGLU",     op::translate_glu_swiglu                       },
diff --git a/ggml/src/ggml-openvino/openvino/op_table.h b/ggml/src/ggml-openvino/openvino/op_table.h
index 37f763117aa..f546796d2ee 100644
--- a/ggml/src/ggml-openvino/openvino/op_table.h
+++ b/ggml/src/ggml-openvino/openvino/op_table.h
@@ -21,6 +21,7 @@ GGML_OP_CONVERTER(translate_rms_norm);
 GGML_OP_CONVERTER(translate_rope);
 GGML_OP_CONVERTER(translate_scale);
 GGML_OP_CONVERTER(translate_unary_silu);
+GGML_OP_CONVERTER(translate_unary_gelu);
 GGML_OP_CONVERTER(translate_soft_max);
 GGML_OP_CONVERTER(translate_transpose);
 GGML_OP_CONVERTER(translate_view);

From 5957f77f5e796baee7f95b5d1c242b439b484ec0 Mon Sep 17 00:00:00 2001
From: Mustafa Cavus <mustafa.cavus@intel.com>
Date: Wed, 8 Apr 2026 18:50:10 -0700
Subject: [PATCH 08/24] Imrope support (#126)

---
 ggml/src/ggml-openvino/ggml-openvino.cpp    |  8 +-
 ggml/src/ggml-openvino/openvino/op/rope.cpp | 35 ++++++--
 ggml/src/ggml-openvino/openvino/utils.cpp   | 98 +++++++++++++--------
 ggml/src/ggml-openvino/openvino/utils.h     |  1 +
 4 files changed, 101 insertions(+), 41 deletions(-)

diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp
index c41116e6f94..84a9e45a146 100644
--- a/ggml/src/ggml-openvino/ggml-openvino.cpp
+++ b/ggml/src/ggml-openvino/ggml-openvino.cpp
@@ -884,7 +884,7 @@ static bool is_op_unsupported_case(const ggml_tensor * op) {
         const int32_t * op_params = op->op_params;
         const int n_dims = op_params[1];
         const int mode = op_params[2];
-        if (mode != GGML_ROPE_TYPE_NORMAL && mode != GGML_ROPE_TYPE_NEOX) {
+        if (mode != GGML_ROPE_TYPE_NORMAL && mode != GGML_ROPE_TYPE_NEOX && mode != GGML_ROPE_TYPE_IMROPE) {
             // GGML_LOG_WARN("OpenVINO backend does not support ROPE with mode %d\n", mode);
             return true;
         }
@@ -906,6 +906,12 @@ static bool is_op_unsupported_case(const ggml_tensor * op) {
                 return true;
             }
         }
+        if (mode == GGML_ROPE_TYPE_IMROPE &&
+            (op->src[2] != 0 || ((const float *) op_params)[6] != 1 || ((const float *) op_params)[7] != 0 ||
+             ((const float *) op_params)[8] != 1)) {
+            // GGML_LOG_WARN("OpenVINO backend does not support IMROPE with freq_factors, freq_scale, ext_factor, and attn_factor\n");
+            return true;
+        }
         break;
     }
     default:
diff --git a/ggml/src/ggml-openvino/openvino/op/rope.cpp b/ggml/src/ggml-openvino/openvino/op/rope.cpp
index 26dc2d24f82..71fd90fae36 100644
--- a/ggml/src/ggml-openvino/openvino/op/rope.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/rope.cpp
@@ -9,12 +9,17 @@
 #include <openvino/op/add.hpp>
 #include <openvino/op/concat.hpp>
 #include <openvino/op/constant.hpp>
+#include <openvino/op/convert.hpp>
+#include <openvino/op/cos.hpp>
+#include <openvino/op/gather.hpp>
 #include <openvino/op/multiply.hpp>
 #include <openvino/op/reshape.hpp>
 #include <openvino/op/shape_of.hpp>
+#include <openvino/op/sin.hpp>
 #include <openvino/op/slice.hpp>
 #include <openvino/op/split.hpp>
 #include <openvino/op/subtract.hpp>
+#include <openvino/op/transpose.hpp>
 #include <openvino/op/unsqueeze.hpp>
 #include <vector>
 
@@ -33,6 +38,11 @@ OutputVector translate_rope(const NodeContext & context) {
     auto data_node = context.get_input(0).get_node_shared_ptr();
     auto output_shape = context.get_output_shape().to_shape();
     int32_t * op_params = context.get_output_op_params();
+    const int mode = op_params[2];
+
+    constexpr int ROPE_TYPE_NORMAL = 0;
+    constexpr int ROPE_TYPE_NEOX = 2;
+    constexpr int ROPE_TYPE_IMROPE = 40;
 
     Output<Node> cos_theta_node;
     Output<Node> sin_theta_node;
@@ -45,7 +55,7 @@ OutputVector translate_rope(const NodeContext & context) {
         if (context.get_input_size() == 3) {
             rope_freqs_weight = context.get_input(2).get_node_shared_ptr();
         }
-        auto sin_cos = make_sin_cos(op_params, inp_pos, rope_freqs_weight);
+        auto sin_cos = make_sin_cos(op_params, inp_pos, rope_freqs_weight, mode == ROPE_TYPE_IMROPE);
         sin_theta_node = sin_cos.first;
         cos_theta_node = sin_cos.second;
     }
@@ -65,10 +75,6 @@ OutputVector translate_rope(const NodeContext & context) {
         }
     }
 
-    const int mode = op_params[2];
-    constexpr int ROPE_TYPE_NORMAL = 0;
-    constexpr int ROPE_TYPE_NEOX = 2;
-
     if (mode == ROPE_TYPE_NORMAL) {
         auto neg_one = ov::op::v0::Constant::create(ov::element::i64, {1}, {-1});
         auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
@@ -112,6 +118,25 @@ OutputVector translate_rope(const NodeContext & context) {
             std::make_shared<ov::op::v1::Multiply>(slice_data_node_1, cos_theta_node));
 
         res = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{first_half_node, second_half_node}, -1);
+    } else if (mode == ROPE_TYPE_IMROPE) {
+        int64_t n_dims = data_node->get_shape()[3];
+        auto cos_sin_shape = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{4}, std::vector<int64_t>{1,-1,1,(n_dims >> 1)});
+        auto cos_reshaped = std::make_shared<ov::op::v1::Reshape>(cos_theta_node, cos_sin_shape, true);
+        auto sin_reshaped = std::make_shared<ov::op::v1::Reshape>(sin_theta_node, cos_sin_shape, true);
+
+        auto split_axis = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {3});
+        auto split_a = std::make_shared<ov::op::v1::Split>(data_node, split_axis, 2);
+        auto x0 = split_a->output(0);
+        auto x1 = split_a->output(1);
+        auto mul_a = std::make_shared<ov::op::v1::Multiply>(x0, cos_reshaped);
+        auto mul_b = std::make_shared<ov::op::v1::Multiply>(x1, sin_reshaped);
+        auto sub = std::make_shared<ov::op::v1::Subtract>(mul_a, mul_b);
+
+        auto mul_c = std::make_shared<ov::op::v1::Multiply>(x0, sin_reshaped);
+        auto mul_d = std::make_shared<ov::op::v1::Multiply>(x1, cos_reshaped);
+        auto add = std::make_shared<ov::op::v1::Add>(mul_c, mul_d);
+
+        res = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{sub, add}, 3);
     }
 
     return rename_outputs_with_suffix({res}, context.get_name());
diff --git a/ggml/src/ggml-openvino/openvino/utils.cpp b/ggml/src/ggml-openvino/openvino/utils.cpp
index d9a28887643..0baaf88e17a 100644
--- a/ggml/src/ggml-openvino/openvino/utils.cpp
+++ b/ggml/src/ggml-openvino/openvino/utils.cpp
@@ -2,6 +2,7 @@
 
 #include "ggml-impl.h"
 
+#include <cmath>
 #include <cstddef>
 #include <ctime>
 #include <memory>
@@ -13,6 +14,7 @@
 #include <openvino/op/gather.hpp>
 #include <openvino/op/maximum.hpp>
 #include <openvino/op/multiply.hpp>
+#include <openvino/op/reshape.hpp>
 #include <openvino/op/shape_of.hpp>
 #include <openvino/op/sin.hpp>
 #include <openvino/op/squeeze.hpp>
@@ -118,6 +120,7 @@ void ggml_rope_yarn_corr_dims(int n_dims,
 std::pair<ov::Output<Node>, ov::Output<Node>> make_sin_cos(int32_t * rope_params,
                                                            std::shared_ptr<ov::Node> inp_pos,
                                                            std::shared_ptr<ov::Node> rope_freqs_weight,
+                                                           bool imrope,
                                                            bool stateful) {
     if (stateful) {
         inp_pos = std::make_shared<ov::op::v0::Squeeze>(inp_pos, ov::op::v0::Constant::create(ov::element::i64, {1}, {0}));
@@ -125,6 +128,13 @@ std::pair<ov::Output<Node>, ov::Output<Node>> make_sin_cos(int32_t * rope_params
         auto pos_perm =
             std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{3}, std::vector<int64_t>{2, 1, 0});
         inp_pos = std::make_shared<ov::op::v1::Transpose>(inp_pos, pos_perm);
+    } else if (imrope) {
+        inp_pos = std::make_shared<ov::op::v0::Convert>(inp_pos, ov::element::f32);
+        auto pos_shape = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{5}, {0, 0, 0, 4, -1});
+        inp_pos = std::make_shared<ov::op::v1::Reshape>(inp_pos, pos_shape, true);
+        auto pos_transpose_shape =
+            std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{5}, std::vector<int64_t>{0, 1, 2, 4, 3});
+        inp_pos = std::make_shared<ov::op::v1::Transpose>(inp_pos, pos_transpose_shape);
     } else {
         inp_pos = std::make_shared<ov::op::v0::Convert>(inp_pos, ov::element::f32);
         auto pos_perm =
@@ -139,6 +149,7 @@ std::pair<ov::Output<Node>, ov::Output<Node>> make_sin_cos(int32_t * rope_params
     float beta_fast;
     float beta_slow;
     const int n_dims = rope_params[1];
+    const size_t n_dims_half = n_dims >> 1;
     const int n_ctx_orig = rope_params[4];
     memcpy(&freq_base, rope_params + 5, sizeof(float));
     memcpy(&freq_scale, rope_params + 6, sizeof(float));
@@ -149,57 +160,74 @@ std::pair<ov::Output<Node>, ov::Output<Node>> make_sin_cos(int32_t * rope_params
 
     const float theta_scale = powf(freq_base, -2.0f / n_dims);
 
-    float corr_dims[2];
-    ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
-
-    std::vector<float> factor(n_dims / 2);
-    factor[0] = 1.0f;
-    for (size_t i = 1; i < factor.size(); i++) {
-        factor[i] = theta_scale * factor[i - 1];
-    }
+    std::vector<float> factor(n_dims_half);
 
     Output<Node> freq_factors;
-    if (stateful) {
-        freq_factors =
-            std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{1, 1, factor.size()}, factor);
-    } else {
-        freq_factors =
-            std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{1, 1, 1, factor.size()}, factor);
-    }
-    if (rope_freqs_weight) {
-        freq_factors = std::make_shared<ov::op::v1::Divide>(freq_factors, rope_freqs_weight);
-    }
-
-    auto theta_extrap = std::make_shared<ov::op::v1::Multiply>(freq_factors, inp_pos);
-    auto theta_interp = std::make_shared<ov::op::v1::Multiply>(
-        theta_extrap, ov::op::v0::Constant::create(ov::element::f32, {1}, {freq_scale}));
 
     Output<Node> theta;
     float mscale = attn_factor;
-    if (ext_factor == 0.0f) {
-        theta = theta_interp;
+    if (imrope) {
+        std::vector<int64_t> gather_indices(n_dims_half);
+        for (size_t j = 0; j < n_dims_half; j++) {
+            gather_indices[j] = j % 3;
+            factor[j] = std::pow(theta_scale, j);
+        }
+        auto gather_indices_const =
+            std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{n_dims_half}, gather_indices);
+        auto gather_axis = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{}, {4});
+        inp_pos = std::make_shared<ov::op::v8::Gather>(inp_pos, gather_indices_const, gather_axis);
+        auto factor_const = std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{n_dims_half}, factor);
+        theta = std::make_shared<ov::op::v1::Multiply>(inp_pos, factor_const);
     } else {
-        auto ramp_mix = rope_yarn_ramp_mix(n_dims, corr_dims, ext_factor);
-        Output<Node> one;
+        float corr_dims[2];
+        ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
+        factor[0] = 1.0f;
+        for (size_t i = 1; i < factor.size(); i++) {
+            factor[i] = theta_scale * factor[i - 1];
+        }
         if (stateful) {
-            one = ov::op::v0::Constant::create(ov::element::f32, Shape{1, 1, 1}, {1.0f});
+            freq_factors =
+                std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{1, 1, factor.size()}, factor);
         } else {
-            one = ov::op::v0::Constant::create(ov::element::f32, Shape{1, 1, 1, 1}, {1.0f});
+            freq_factors =
+                std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{1, 1, 1, factor.size()}, factor);
+        }
+        if (rope_freqs_weight) {
+            freq_factors = std::make_shared<ov::op::v1::Divide>(freq_factors, rope_freqs_weight);
         }
-        auto one_minus_ramp = std::make_shared<ov::op::v1::Subtract>(one, ramp_mix);
 
-        theta = std::make_shared<ov::op::v1::Add>(std::make_shared<ov::op::v1::Multiply>(theta_interp, one_minus_ramp),
-                                                  std::make_shared<ov::op::v1::Multiply>(theta_extrap, ramp_mix));
-        mscale *= (1.0f + 0.1f * std::log(1.0f / freq_scale));
+        auto theta_extrap = std::make_shared<ov::op::v1::Multiply>(freq_factors, inp_pos);
+        auto theta_interp = std::make_shared<ov::op::v1::Multiply>(
+            theta_extrap, ov::op::v0::Constant::create(ov::element::f32, {1}, {freq_scale}));
+
+        if (ext_factor == 0.0f) {
+            theta = theta_interp;
+        } else {
+            auto ramp_mix = rope_yarn_ramp_mix(n_dims, corr_dims, ext_factor);
+            Output<Node> one;
+            if (stateful) {
+                one = ov::op::v0::Constant::create(ov::element::f32, Shape{1, 1, 1}, {1.0f});
+            } else {
+                one = ov::op::v0::Constant::create(ov::element::f32, Shape{1, 1, 1, 1}, {1.0f});
+            }
+            auto one_minus_ramp = std::make_shared<ov::op::v1::Subtract>(one, ramp_mix);
+
+            theta = std::make_shared<ov::op::v1::Add>(std::make_shared<ov::op::v1::Multiply>(theta_interp, one_minus_ramp),
+                                                      std::make_shared<ov::op::v1::Multiply>(theta_extrap, ramp_mix));
+            mscale *= (1.0f + 0.1f * std::log(1.0f / freq_scale));
+        }
     }
 
     Output<Node> cos_theta = std::make_shared<ov::op::v0::Cos>(theta);
     Output<Node> sin_theta = std::make_shared<ov::op::v0::Sin>(theta);
 
-    auto mscale_node = ov::op::v0::Constant::create(ov::element::f32, Shape{}, {mscale});
+    if (!imrope) {
+        auto mscale_node = ov::op::v0::Constant::create(ov::element::f32, Shape{}, {mscale});
+
+        cos_theta = std::make_shared<ov::op::v1::Multiply>(cos_theta, mscale_node);
+        sin_theta = std::make_shared<ov::op::v1::Multiply>(sin_theta, mscale_node);
+    }
 
-    cos_theta = std::make_shared<ov::op::v1::Multiply>(cos_theta, mscale_node);
-    sin_theta = std::make_shared<ov::op::v1::Multiply>(sin_theta, mscale_node);
     return std::make_pair(sin_theta, cos_theta);
 }
 
diff --git a/ggml/src/ggml-openvino/openvino/utils.h b/ggml/src/ggml-openvino/openvino/utils.h
index 88dcad4c906..767dd4c53ea 100644
--- a/ggml/src/ggml-openvino/openvino/utils.h
+++ b/ggml/src/ggml-openvino/openvino/utils.h
@@ -67,6 +67,7 @@ OutputVector rename_outputs_with_suffix(const OutputVector& outputs, const std::
 std::pair<ov::Output<Node>, ov::Output<Node>> make_sin_cos(int32_t* rope_params,
                                                            std::shared_ptr<ov::Node> inp_pos,
                                                            std::shared_ptr<ov::Node> rope_freqs_weight = nullptr,
+                                                           bool imrope = false,
                                                            bool stateful = false);
 
 ov::Output<ov::Node> process_view_input(const NodeContext& context, int input_index, int slice_len = 0);

From 185bfcc2c6c69455b6674ad9fcf8460688f836c5 Mon Sep 17 00:00:00 2001
From: Xuejun Zhai <Xuejun.Zhai@intel.com>
Date: Thu, 5 Mar 2026 17:37:21 -0800
Subject: [PATCH 09/24] Add interface is_model_splitted() to check whether the
 c-graph is split or not

---
 ggml/src/ggml-openvino/ggml-decoder.cpp   |  4 +-
 ggml/src/ggml-openvino/ggml-decoder.h     |  6 +++
 ggml/src/ggml-openvino/openvino/decoder.h |  2 +
 ggml/src/ggml-openvino/utils.cpp          | 56 +++++++++++++++++++++--
 ggml/src/ggml-openvino/utils.h            |  7 +++
 5 files changed, 70 insertions(+), 5 deletions(-)

diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp
index 1b8c865c331..8b2c1631d0d 100644
--- a/ggml/src/ggml-openvino/ggml-decoder.cpp
+++ b/ggml/src/ggml-openvino/ggml-decoder.cpp
@@ -44,6 +44,7 @@ GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph,
                              std::map<std::string, std::shared_ptr<ov::Node>> & model_weights,
                              bool is_static,
                              bool is_stateful,
+                             bool model_is_splitted,
                              bool is_prefill,
                              int prefill_chunk_size) :
     m_is_static(is_static),
@@ -51,6 +52,7 @@ GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph,
     m_is_prefill(is_prefill),
     m_naive(false),
     m_prefill_chunk_size(prefill_chunk_size),
+    m_model_is_splitted(model_is_splitted),
     m_cgraph(cgraph),
     m_model_weights(model_weights),
     m_model_params(model_params),
@@ -968,4 +970,4 @@ const std::string & GgmlOvDecoder::get_op_type(int node_idx) const {
 const std::string & GgmlOvDecoder::get_op_type() const {
     static const std::string unknown_op = "UNKNOWN_GGML_OP";
     return unknown_op;
-}
+}
\ No newline at end of file
diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h
index 3ae25ddda32..9ed52c894d4 100644
--- a/ggml/src/ggml-openvino/ggml-decoder.h
+++ b/ggml/src/ggml-openvino/ggml-decoder.h
@@ -69,6 +69,7 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
                   std::map<std::string, std::shared_ptr<ov::Node>> & model_weights,
                   bool is_static,
                   bool is_stateful = false,
+                  bool model_is_splitted = false,
                   bool is_prefill = false,
                   int prefill_chunk_size = 256);
 
@@ -175,6 +176,10 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
 
     virtual bool is_stateful() const override { return m_is_stateful; }
 
+    virtual bool is_splited_model() const override {
+        return m_model_is_splitted;
+    }
+
     ov::PartialShape get_graph_input_shape(const ggml_tensor * op, const ggml_tensor * input) const;
 
     static void dump_cgraph(const ggml_cgraph * cgraph, std::string & filename);
@@ -205,6 +210,7 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
     bool m_is_prefill = false;
     bool m_naive = false;
     int m_prefill_chunk_size = 0;
+    bool m_model_is_splitted = false; // whether the cgraph is a split sub-graph
 
     static ov::Shape get_shape(const ggml_tensor * tensor);
     static std::vector<size_t> get_stride(const ggml_tensor * tensor);
diff --git a/ggml/src/ggml-openvino/openvino/decoder.h b/ggml/src/ggml-openvino/openvino/decoder.h
index 3b8da2be5d2..ed6ff7c0aba 100644
--- a/ggml/src/ggml-openvino/openvino/decoder.h
+++ b/ggml/src/ggml-openvino/openvino/decoder.h
@@ -66,6 +66,8 @@ class GgmlDecoder : public DecoderBase {
 
     virtual bool is_stateful() const = 0;
 
+    virtual bool is_splited_model() const = 0;
+
     virtual int is_swa_layer(int layer) const = 0;
 };
 
diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp
index 6f6ecde65ea..639843198e4 100644
--- a/ggml/src/ggml-openvino/utils.cpp
+++ b/ggml/src/ggml-openvino/utils.cpp
@@ -85,7 +85,9 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr<
     bool stateful = r_ctx->stateful;
     static auto is_static = false;
 
-    if (is_naive(cgraph)) {
+    bool model_is_splitted = is_model_splitted(cgraph);
+
+    if (is_naive(cgraph) && !model_is_splitted) {
         return naive_compute(cgraph, core, device, config);
     }
 
@@ -181,7 +183,7 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr<
             std::shared_ptr<ov::Model> model;
             auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph);
 
-            ggml_decoder = std::make_shared<GgmlOvDecoder>(cgraph, m_params, c_params, model_weights, is_static, stateful);
+            ggml_decoder = std::make_shared<GgmlOvDecoder>(cgraph, m_params, c_params, model_weights, is_static, stateful, model_is_splitted);
             decoder_end_time = ggml_time_us();
 
             auto input_model = std::make_shared<ov::frontend::ggml::InputModel>(ggml_decoder);
@@ -352,9 +354,9 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptr<o
         auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph);
 
         auto ggml_decoder_prefill = std::make_shared<GgmlOvDecoder>(cgraph, m_params, c_params, model_weights,
-                                                                    is_static, stateful, true, prefill_chunk_size);
+                                                                    is_static, stateful, false, true, prefill_chunk_size);
         auto ggml_decoder_decode = std::make_shared<GgmlOvDecoder>(cgraph, m_params, c_params, model_weights, is_static,
-                                                                   stateful, false, prefill_chunk_size);
+                                                                   stateful, false, false, prefill_chunk_size);
         decoder_end_time = ggml_time_us();
 
         auto input_model_prefill = std::make_shared<ov::frontend::ggml::InputModel>(ggml_decoder_prefill);
@@ -484,6 +486,52 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptr<o
     return GGML_STATUS_SUCCESS;
 }
 
+// Detect whether a cgraph is a split subgraph or not.
+// Step 1 compares each node's recorded use_count with actual fan-out references in node->src.
+// Step 2 verifies that node inputs come from model nodes/weights/leafs; external sources imply split.
+bool is_model_splitted(ggml_cgraph * cgraph) {
+    // Step 1: for each node, compare its recorded use count with the number of times it appears as a src of the nodes in this graph. A mismatch on any node whose op is not GGML_OP_NONE means the graph is split; otherwise fall through to step 2.
+    for (int i = 0; i < cgraph->n_nodes; i++) {
+        ggml_tensor * node = cgraph->nodes[i];
+        int use_count = cgraph->use_counts[ggml_hash_find(&cgraph->visited_hash_set, node)];
+        // TODO: this is a workaround for the test cases from llama.cpp; the root cause should be fixed in the future.
+        if ((cgraph->n_nodes <= 1 && use_count==0) || (cgraph->n_nodes <= 1 && node->op == GGML_OP_VIEW && use_count == 1 && node->src[0] != nullptr && node->src[0]->op == GGML_OP_NONE)) {
+            return false;
+        }
+        int input_use_count = 0;
+        for (int j = 0; j < cgraph->n_nodes; j++) {
+            ggml_tensor * other_node = cgraph->nodes[j];
+            for (int k = 0; k < GGML_MAX_SRC; k++) {
+                if (other_node->src[k] == node) {
+                    input_use_count++;
+                }
+            }
+        }
+        if (use_count != input_use_count && node->op != GGML_OP_NONE) {
+            return true;
+        }
+    }
+    // Step 2 (complements step 1): look for a src that is not a node of this graph, not a model weight and not a leaf; such an external source means the graph is split. This is needed because a node that nothing consumes (e.g. the output node) has a use count of 0 and an in-graph reference count of 0, so step 1 alone cannot tell whether the graph is split.
+    auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph, true);
+    std::set<ggml_tensor *> model_nodes(cgraph->nodes, cgraph->nodes + cgraph->n_nodes);
+    // leaf nodes
+    std::set<ggml_tensor *> model_leafs(cgraph->leafs, cgraph->leafs + cgraph->n_leafs);
+    for (int i = 0; i < cgraph->n_nodes; i++) {
+        ggml_tensor * node = cgraph->nodes[i];
+        for (int j = 0; j < GGML_MAX_SRC; j++) {
+            ggml_tensor * src = node->src[j];
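+            // A non-null src that is neither a graph node, nor a model weight, nor a leaf marks the graph as split.
+            // NOTE(review): `!model_leafs.empty() == false` parses as `model_leafs.empty()`, so this only fires for graphs with no leafs - confirm intent.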
+            if (src != nullptr && model_nodes.find(src) == model_nodes.end() &&
+                model_weights.find(std::string(src->name)) == model_weights.end() && !model_leafs.empty() == false &&
+                model_leafs.find(src) == model_leafs.end()) {
+                return true;
+            }
+        }
+    }
+    return false;
+}
+
 bool is_naive(ggml_cgraph * cgraph) {
     constexpr int naive_graph_size_threshold = 20;
     int count = 0;
diff --git a/ggml/src/ggml-openvino/utils.h b/ggml/src/ggml-openvino/utils.h
index 89b1b209342..b58a11282b4 100644
--- a/ggml/src/ggml-openvino/utils.h
+++ b/ggml/src/ggml-openvino/utils.h
@@ -123,6 +123,13 @@ ov::Tensor create_ov_output_tensor(std::shared_ptr<GgmlOvDecoder> ggml_decoder,
 
 bool is_naive(struct ggml_cgraph * cgraph);
 
+/**
+ * @brief Heuristically checks whether the given computation graph is a split-model fragment.
+ * @param cgraph Pointer to the GGML computation graph to analyze.
+ * @return true if the graph is identified as split; otherwise false.
+ */
+bool is_model_splitted(struct ggml_cgraph * cgraph);
+
 enum ggml_status naive_compute(struct ggml_cgraph * cgraph,
                                ov::Core & core,
                                const std::string & device,

From 1f25490f537d06ccd5d6dd02392a23f7bc88dbf5 Mon Sep 17 00:00:00 2001
From: Xuejun Zhai <Xuejun.Zhai@intel.com>
Date: Tue, 17 Mar 2026 15:15:54 +0800
Subject: [PATCH 10/24] Infer and propagate dynamic-dimension indices for all
 tensors in the GGML graph in api compute_model_outputs()

---
 ggml/src/ggml-openvino/ggml-decoder.cpp | 274 +++++++++++++++++++++++-
 ggml/src/ggml-openvino/ggml-decoder.h   |   6 +-
 2 files changed, 275 insertions(+), 5 deletions(-)

diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp
index 8b2c1631d0d..7a42564760b 100644
--- a/ggml/src/ggml-openvino/ggml-decoder.cpp
+++ b/ggml/src/ggml-openvino/ggml-decoder.cpp
@@ -69,6 +69,7 @@ GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph,
     validate_cgraph();
 
     set_input_output();
+    compute_node_dynamic_dims();
     compute_model_inputs();
     compute_model_outputs();
 
@@ -331,7 +332,7 @@ void GgmlOvDecoder::validate_cgraph() const {
     }
 }
 
-ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * op, const ggml_tensor * input) const {
+ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * op, const ggml_tensor * input, int dynamic_dim_index) const {
     if (m_naive) {
         return input!= nullptr ? ov::PartialShape{get_shape(input)} : ov::PartialShape{get_shape(op)};
     }
@@ -382,6 +383,9 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * op, co
     } else {
         input_shape = ov::PartialShape{get_shape(input)};
     }
+    if (dynamic_dim_index != -1) {
+        input_shape[3 - dynamic_dim_index] = -1;
+    }
     return input_shape;
 }
 
@@ -444,7 +448,7 @@ void GgmlOvDecoder::compute_model_inputs() {
             if (m_model_weights.find(node_name) == m_model_weights.end()) {
                 m_inputs[node_name] = node;
                 auto param_node =
-                    std::make_shared<ov::op::v0::Parameter>(get_ov_type(node), get_graph_input_shape(node, nullptr));
+                    std::make_shared<ov::op::v0::Parameter>(get_ov_type(node), get_graph_input_shape(node, nullptr, m_node_dynamic_dims[node]));
                 param_node->set_friendly_name(node_name);
                 param_node->output(0).get_tensor().set_names({node_name});
                 m_model_inputs[node_name] = param_node;
@@ -488,7 +492,7 @@ void GgmlOvDecoder::compute_model_inputs() {
                     m_model_params.kv_names.push_back(src_name);
                 }
             }
-            ov::PartialShape param_shape = get_graph_input_shape(node, src);
+            ov::PartialShape param_shape = get_graph_input_shape(node, src, m_node_dynamic_dims[src]);
             auto param_node = std::make_shared<ov::op::v0::Parameter>(get_ov_type(src), param_shape);
             param_node->set_friendly_name(src_name);
             param_node->output(0).get_tensor().set_names({src_name});
@@ -970,4 +974,266 @@ const std::string & GgmlOvDecoder::get_op_type(int node_idx) const {
 const std::string & GgmlOvDecoder::get_op_type() const {
     static const std::string unknown_op = "UNKNOWN_GGML_OP";
     return unknown_op;
-}
\ No newline at end of file
+}
+
+void GgmlOvDecoder::compute_node_dynamic_dims() {
+    auto visit_node = [&](auto && self, ggml_tensor * node) -> void {
+        if (!node) {
+            return;
+        }
+
+        if (node->op == GGML_OP_CPY) {
+            m_node_dynamic_dims[node] = -1;
+        }
+
+        if (m_node_dynamic_dims.count(node)) {
+            return;
+        }
+        for (int i = 0; i < GGML_MAX_SRC; i++) {
+            ggml_tensor * src = node->src[i];
+            if (src == nullptr) {
+                continue;
+            }
+            struct ggml_tensor *root_src = nullptr;
+            // if (src->org_src) {
+            //     root_src = src->org_src;
+            // }
+            if (root_src) {
+                if (is_inp_tok(root_src, node) || is_inp_pos(root_src, node) ||
+                    is_output_idx(root_src, node)) {
+                    m_node_dynamic_dims[root_src] = 0;
+                    m_node_dynamic_dims[src] = m_node_dynamic_dims[root_src];
+                    continue;
+                }
+                self(self, root_src);
+                m_node_dynamic_dims[src] = m_node_dynamic_dims[root_src];
+            } else {
+                if (is_inp_tok(src, node) || is_inp_pos(src, node) || is_output_idx(src, node)) {
+                    m_node_dynamic_dims[src] = 0;
+                    continue;
+                }
+                self(self, src);
+            }
+        }
+        switch (node->op) {
+        case GGML_OP_NONE:
+            m_node_dynamic_dims[node] = -1;
+            break;
+        case GGML_OP_GET_ROWS:
+            m_node_dynamic_dims[node] = -1;
+            if (m_node_dynamic_dims[node->src[1]] != -1) {
+                auto dynamic_dim_idx = m_node_dynamic_dims[node->src[1]];
+                auto dynamic_dim_value = node->src[1]->ne[dynamic_dim_idx];
+                if (dynamic_dim_idx == 0) {
+                    m_node_dynamic_dims[node] = 1;
+                } else {
+                    auto dynamic_dim_stride = node->src[1]->nb[dynamic_dim_idx] / ggml_type_size(node->src[1]->type) *
+                                              ggml_type_size(node->src[0]->type);
+                    for (int i = 0; i < GGML_MAX_DIMS; i++) {
+                        if (dynamic_dim_stride == node->src[0]->nb[i]) {
+                            m_node_dynamic_dims[node] = i;
+                            break;
+                        }
+                    }
+                }
+                OPENVINO_ASSERT(dynamic_dim_value == node->ne[m_node_dynamic_dims[node]],
+                                "Dynamic dim value mismatch for node: " + std::string(node->name) +
+                                    " and its src[1]: " + std::string(node->src[1]->name));
+            }
+            break;
+        case GGML_OP_MUL:
+        case GGML_OP_MUL_MAT:
+            m_node_dynamic_dims[node] = -1;
+            if (m_node_dynamic_dims[node->src[0]] != -1) {
+                m_node_dynamic_dims[node] = m_node_dynamic_dims[node->src[0]];
+            }
+            if (m_node_dynamic_dims[node->src[1]] != -1) {
+                m_node_dynamic_dims[node] = m_node_dynamic_dims[node->src[1]];
+            }
+            break;
+        case GGML_OP_PERMUTE:
+            m_node_dynamic_dims[node] = -1;
+            if (m_node_dynamic_dims[node->src[0]] != -1) {
+                auto dynamic_dim_idx = m_node_dynamic_dims[node->src[0]];
+                auto dynamic_dim_value = node->src[0]->ne[dynamic_dim_idx];
+                for (int i = 0; i < GGML_MAX_DIMS; i++) {
+                    if (node->op_params[i] == dynamic_dim_idx) {
+                        m_node_dynamic_dims[node] = i;
+                        break;
+                    }
+                }
+                OPENVINO_ASSERT(dynamic_dim_value == node->ne[m_node_dynamic_dims[node]],
+                                "Dynamic dim value mismatch for node: " + std::string(node->name) +
+                                    " and its src[0]: " + std::string(node->src[0]->name));
+            }
+            break;
+        case GGML_OP_VIEW: {
+            // Use stride-based matching: the stride of a VIEW dimension directly
+            // encodes which source dimension it indexes into, so it uniquely
+            // identifies the dynamic dim even when two dims share the same size.
+            m_node_dynamic_dims[node] = -1;
+            if (m_node_dynamic_dims[node->src[0]] != -1) {
+                auto dynamic_dim_idx   = m_node_dynamic_dims[node->src[0]];
+                auto dynamic_dim_value = node->src[0]->ne[dynamic_dim_idx];
+                auto dynamic_dim_stride =
+                    node->src[0]->nb[dynamic_dim_idx] / ggml_type_size(node->src[0]->type) *
+                    ggml_type_size(node->type);
+                for (int i = 0; i < GGML_MAX_DIMS; i++) {
+                    if (node->nb[i] == dynamic_dim_stride) {
+                        m_node_dynamic_dims[node] = i;
+                        break;
+                    }
+                }
+                OPENVINO_ASSERT(m_node_dynamic_dims[node] != -1 &&
+                                dynamic_dim_value == node->ne[m_node_dynamic_dims[node]],
+                                "Dynamic dim value mismatch for node: " + std::string(node->name) +
+                                    " and its src[0]: " + std::string(node->src[0]->name));
+            }
+            break;
+        }
+        case GGML_OP_RESHAPE: {
+            // RESHAPE requires src[0] to be contiguous, so both src and result
+            // have standard compact strides: nb[i] = type_size * prod(ne[0..i-1]).
+            // Match src->nb[dynamic_dim] against result->nb[i] to find the output
+            // dimension whose flat-memory boundary aligns with the source dynamic
+            // boundary. This is unambiguous (result strides are strictly monotone)
+            // and handles merged-lower-dim cases that ne-value matching misses.
+            m_node_dynamic_dims[node] = -1;
+            if (m_node_dynamic_dims[node->src[0]] != -1) {
+                auto dynamic_dim_idx    = m_node_dynamic_dims[node->src[0]];
+                auto dynamic_dim_stride = node->src[0]->nb[dynamic_dim_idx];
+                for (int i = 0; i < GGML_MAX_DIMS; i++) {
+                    if (node->nb[i] == dynamic_dim_stride && node->ne[i] == node->src[0]->ne[dynamic_dim_idx]) {
+                        m_node_dynamic_dims[node] = i;
+                        break;
+                    }
+                }
+                if (m_node_dynamic_dims[node] == -1) {
+                    std::cout << "Cannot determine dynamic dim for RESHAPE node: " << node->name << std::endl;
+                }
+            }
+            break;
+        }
+        case GGML_OP_FLASH_ATTN_EXT: {
+            // Output shape is hard-coded in ggml_flash_attn_ext as:
+            //   ne = { v->ne[0], q->ne[2], q->ne[1], q->ne[3] }
+            // i.e. output dim 0 <- v dim 0 (head_size, static)
+            //      output dim 1 <- q dim 2 (n_heads,   static)
+            //      output dim 2 <- q dim 1 (n_tokens,  potentially dynamic)
+            //      output dim 3 <- q dim 3 (batch,     static)
+            // Using the fixed q-dim -> output-dim mapping table.
+            // q is src[0]; the mapping from q's dynamic dim to the output dim is:
+            //   q dim 1 -> output dim 2
+            //   q dim 2 -> output dim 1
+            //   q dim 3 -> output dim 3
+            //   q dim 0 -> output dim 0  (head_size axis, unlikely to be dynamic)
+            constexpr int q_to_out[GGML_MAX_DIMS] = { 0, 2, 1, 3 };
+            m_node_dynamic_dims[node] = -1;
+            if (m_node_dynamic_dims[node->src[0]] != -1) {
+                auto q_dynamic_dim = m_node_dynamic_dims[node->src[0]];
+                m_node_dynamic_dims[node] = q_to_out[q_dynamic_dim];
+            }
+            break;
+        }
+        case GGML_OP_CONT:
+            m_node_dynamic_dims[node] = -1;
+            if (m_node_dynamic_dims[node->src[0]] != -1) {
+                auto dynamic_dim_idx = m_node_dynamic_dims[node->src[0]];
+                if (ggml_are_same_shape(node, node->src[0])) {
+                    m_node_dynamic_dims[node] = dynamic_dim_idx;
+                } else {
+                    size_t src_logical_nb[GGML_MAX_DIMS];
+                    src_logical_nb[0] = ggml_type_size(node->src[0]->type);
+                    src_logical_nb[1] = src_logical_nb[0] *
+                                        (node->src[0]->ne[0] / ggml_blck_size(node->src[0]->type));
+                    for (int i = 2; i < GGML_MAX_DIMS; i++) {
+                        src_logical_nb[i] = src_logical_nb[i - 1] * node->src[0]->ne[i - 1];
+                    }
+
+                    auto dynamic_dim_stride = src_logical_nb[dynamic_dim_idx] /
+                                              ggml_type_size(node->src[0]->type) *
+                                              ggml_type_size(node->type);
+                    int matched_dim_count = 0;
+                    for (int i = 0; i < GGML_MAX_DIMS; i++) {
+                        if (node->nb[i] == dynamic_dim_stride && node->ne[i] == node->src[0]->ne[dynamic_dim_idx]) {
+                            m_node_dynamic_dims[node] = i;
+                            matched_dim_count++;
+                        }
+                    }
+
+                    OPENVINO_ASSERT(matched_dim_count == 1,
+                                    "Cannot determine dynamic dim for CONT node: " + std::string(node->name));
+                }
+            }
+            break;
+        case GGML_OP_RMS_NORM:
+        case GGML_OP_ADD:
+        case GGML_OP_GLU:
+        case GGML_OP_ROPE:
+        case GGML_OP_SCALE:
+        case GGML_OP_TRANSPOSE:
+        case GGML_OP_SOFT_MAX:
+        case GGML_OP_ARGSORT:
+        case GGML_OP_ADD_ID:
+            m_node_dynamic_dims[node] = m_node_dynamic_dims[node->src[0]];
+            break;
+        case GGML_OP_MUL_MAT_ID:
+            m_node_dynamic_dims[node] = m_node_dynamic_dims[node->src[1]];
+            break;
+        case GGML_OP_CPY:
+        case GGML_OP_SET_ROWS:
+            m_node_dynamic_dims[node] = -1;
+            break;
+        default:
+            std::cout << "Doesn't handle node name: " << node->name << " op: " << ggml_op_name(node->op) << std::endl;
+            break;
+        }
+    };
+
+    for (int i = 0; i < m_cgraph->n_nodes; i++) {
+        ggml_tensor * node = m_cgraph->nodes[i];
+        visit_node(visit_node, node);
+    }
+
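+    // Debug dump (disabled by `if (0)`): print the name, op and shape of every node in m_cgraph, plus the name and shape of each of its srcs; the dimension whose index is stored in m_node_dynamic_dims is printed as "*" (-1 means the tensor has no dynamic dim).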
+    if (0) {
+        for (int i = 0; i < m_cgraph->n_nodes; i++) {
+            ggml_tensor * node = m_cgraph->nodes[i];
+            int dynamic_dim = m_node_dynamic_dims[node];
+            std::cout << "[" << i << "] " << "node_name: " << node->name << " op: " << ggml_op_name(node->op)
+                      << " shape: [";
+            for (int j = 0; j < 4; j++) {
+                if (j == dynamic_dim) {
+                    std::cout << "*";
+                } else {
+                    std::cout << node->ne[j];
+                }
+                if (j < 3) {
+                    std::cout << ", ";
+                }
+            }
+            std::cout << "]" << std::endl;
+            // print the src name & shape with the dynamic dim for debugging
+            for (int j = 0; j < GGML_MAX_SRC; j++) {
+                ggml_tensor * src = node->src[j];
+                if (src == nullptr) {
+                    continue;
+                }
+                int src_dynamic_dim = m_node_dynamic_dims[src];
+                std::cout << "    [" << j << "] src_name: " << src->name << " [";
+                for (int k = 0; k < 4; k++) {
+                    if (k == src_dynamic_dim) {
+                        std::cout << "*";
+                    } else {
+                        std::cout << src->ne[k];
+                    }
+                    if (k < 3) {
+                        std::cout << ", ";
+                    }
+                }
+                std::cout << "]" << std::endl;
+            }
+            std::cout << std::endl;
+        }
+    }
+}
diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h
index 9ed52c894d4..c793c3d6ae7 100644
--- a/ggml/src/ggml-openvino/ggml-decoder.h
+++ b/ggml/src/ggml-openvino/ggml-decoder.h
@@ -180,7 +180,7 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
         return m_model_is_splitted;
     }
 
-    ov::PartialShape get_graph_input_shape(const ggml_tensor * op, const ggml_tensor * input) const;
+    ov::PartialShape get_graph_input_shape(const ggml_tensor * op, const ggml_tensor * input, int dynamic_dim_index=-1) const;
 
     static void dump_cgraph(const ggml_cgraph * cgraph, std::string & filename);
 
@@ -278,6 +278,9 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
     void compute_model_inputs();
     void compute_model_outputs();
 
+    // Infer and propagate dynamic-dimension indices for all tensors in the GGML graph.
+    void compute_node_dynamic_dims();
+
     void validate_cgraph() const;
 
     ggml_cgraph * m_cgraph = nullptr;
@@ -290,6 +293,7 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
     std::map<std::string, ggml_tensor *> m_model_outputs;
     std::vector<std::string> m_model_output_names;
     std::vector<NodeInfo> m_node_info_list;
+    std::map<ggml_tensor *, int> m_node_dynamic_dims;
 
     ModelParams m_model_params;
     ComputeParams m_compute_params;

From 6ae864f68c8ddbd6823f710c935a3cc9450567b7 Mon Sep 17 00:00:00 2001
From: Xuejun <Xuejun.Zhai@intel.com>
Date: Wed, 18 Mar 2026 18:23:15 -0700
Subject: [PATCH 11/24] Only mark the dynamic input dim for fallback sub graph

---
 ggml/src/ggml-openvino/ggml-decoder.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp
index 7a42564760b..080df1891e6 100644
--- a/ggml/src/ggml-openvino/ggml-decoder.cpp
+++ b/ggml/src/ggml-openvino/ggml-decoder.cpp
@@ -383,7 +383,7 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * op, co
     } else {
         input_shape = ov::PartialShape{get_shape(input)};
     }
-    if (dynamic_dim_index != -1) {
+    if (dynamic_dim_index != -1 && m_model_is_splitted) {
         input_shape[3 - dynamic_dim_index] = -1;
     }
     return input_shape;

From 3562f7cc75430fca040387d0a9e0898eee14c2dc Mon Sep 17 00:00:00 2001
From: Xuejun <Xuejun.Zhai@intel.com>
Date: Sun, 22 Mar 2026 18:01:24 -0700
Subject: [PATCH 12/24] Move dynamic dims compute in graph mismatch

---
 ggml/src/ggml-openvino/utils.cpp | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp
index 639843198e4..5d4ddd0fec7 100644
--- a/ggml/src/ggml-openvino/utils.cpp
+++ b/ggml/src/ggml-openvino/utils.cpp
@@ -85,10 +85,10 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr<
     bool stateful = r_ctx->stateful;
     static auto is_static = false;
 
-    bool model_is_splitted = is_model_splitted(cgraph);
-
-    if (is_naive(cgraph) && !model_is_splitted) {
-        return naive_compute(cgraph, core, device, config);
+    if (is_naive(cgraph)) {
+        if (!is_model_splitted(cgraph)) {
+            return naive_compute(cgraph, core, device, config);
+        }
     }
 
     auto start_time = ggml_time_us();
@@ -179,6 +179,7 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr<
             compile_end_time = decoder_end_time;
         } else {
             r_ctx->infer_request_cache.erase(key);
+            bool model_is_splitted = is_model_splitted(cgraph);
 
             std::shared_ptr<ov::Model> model;
             auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph);

From fe01725aa74b79459bcf68527b16fb5c95bb8229 Mon Sep 17 00:00:00 2001
From: Xuejun <Xuejun.Zhai@intel.com>
Date: Wed, 18 Mar 2026 20:11:05 -0700
Subject: [PATCH 13/24] ggml-openvino: fix tensor data handling for
 PERMUTE/VIEW ops in split models

---
 ggml/src/ggml-openvino/utils.cpp | 37 +++++++++++++++++++++++++++++---
 1 file changed, 34 insertions(+), 3 deletions(-)

diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp
index 5d4ddd0fec7..0082ffd0796 100644
--- a/ggml/src/ggml-openvino/utils.cpp
+++ b/ggml/src/ggml-openvino/utils.cpp
@@ -119,7 +119,9 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr<
             std::lock_guard<std::mutex> lock(*(mutex));
             ggml_decoder = it->second->ptr;
             old_m_params = ggml_decoder->get_model_params();
-            cache_hit = old_m_params.can_reuse_dynamically(m_params);
+            if (!ggml_decoder->is_splited_model()) {
+                cache_hit = old_m_params.can_reuse_dynamically(m_params);
+            }
         } else {
             mutex = std::make_shared<std::mutex>();
             r_ctx->decoder_cache[key] = std::make_shared<decoder_runtime_ctx>(mutex);
@@ -599,7 +601,7 @@ namespace {
 ov::Tensor convert_ggml_input_to_ov(std::shared_ptr<GgmlOvDecoder> ggml_decoder, const std::string & name) {
     const auto * ggml_tensor = ggml_decoder->get_input_ggml_tensor(name);
 
-    if (ggml_tensor->extra != nullptr) {
+    if (ggml_tensor->extra != nullptr && !ggml_decoder->is_splited_model()) {
         // GGML_LOG_DEBUG("Using ggml_tensor->extra as ov::Tensor for input: %s\n", name.c_str());
         auto * extra_base = static_cast<ggml_openvino_extra_base *>(ggml_tensor->extra);
         if (extra_base->type != ggml_openvino_extra_base::Type::TENSOR) {
@@ -612,12 +614,41 @@ ov::Tensor convert_ggml_input_to_ov(std::shared_ptr<GgmlOvDecoder> ggml_decoder,
     // GGML_LOG_DEBUG("Converting ggml tensor to ov::Tensor for input: %s\n", name.c_str());
     auto * input_data = ggml_tensor->data;
     ov::Shape input_shape;
-    if (ggml_tensor->op == GGML_OP_VIEW) {
+    if (ggml_tensor->op == GGML_OP_VIEW && !ggml_decoder->is_splited_model()) {
         // This case is added to make test-backend-ops work
         input_shape = ggml_decoder->get_shape(ggml_tensor->view_src);
     } else {
         input_shape = ggml_decoder->get_shape(ggml_tensor);
     }
+
+    // If the tensor is a result of PERMUTE operation and the model is not fully supported, we need to reconstruct the data based on the view tensor shape & stride
+    if ((ggml_tensor->op == GGML_OP_PERMUTE || ggml_tensor->op == GGML_OP_VIEW) && ggml_decoder->is_splited_model()) {
+        // Create OpenVINO input tensor, the data need to reconstructed based on the view tensor shape & stride
+        ov::Tensor input_tensor(ggml_decoder->get_ov_type(ggml_tensor), input_shape);
+        const auto * src_tensor = ggml_tensor->view_src;
+        std::vector<uint8_t>    data;
+        auto n_bytes = ggml_nbytes(src_tensor);
+        data.resize(n_bytes);
+        ggml_backend_tensor_get(src_tensor, data.data(), 0, n_bytes);
+
+        size_t des_index = 0;
+        for (size_t i0 = 0; i0 < static_cast<size_t>(ggml_tensor->ne[3]); i0++) {
+            for (size_t i1 = 0; i1 < static_cast<size_t>(ggml_tensor->ne[2]); i1++) {
+                for (size_t i2 = 0; i2 < static_cast<size_t>(ggml_tensor->ne[1]); i2++) {
+                    for (size_t i3 = 0; i3 < static_cast<size_t>(ggml_tensor->ne[0]); i3++) {
+                        size_t src_index = ggml_tensor->view_offs + i0 * ggml_tensor->nb[3] + i1 * ggml_tensor->nb[2] +
+                                           i2 * ggml_tensor->nb[1] + i3 * ggml_tensor->nb[0];
+
+                        memcpy(static_cast<char *>(input_tensor.data()) + des_index,
+                               reinterpret_cast<const char *>(data.data()) + src_index, ggml_tensor->nb[0]);
+                        des_index += ggml_tensor->nb[0];
+                    }
+                }
+            }
+        }
+        return input_tensor;
+    }
+    
     auto input_tensor = ov::Tensor(ggml_decoder->get_ov_type(ggml_tensor), input_shape, input_data);
     return input_tensor;
 }

From ce0e4e9cb66b97b692553e6023f38794ca6e6ea8 Mon Sep 17 00:00:00 2001
From: Xuejun <Xuejun.Zhai@intel.com>
Date: Wed, 18 Mar 2026 20:12:06 -0700
Subject: [PATCH 14/24] ggml-openvino: add comments

---
 ggml/src/ggml-openvino/utils.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp
index 0082ffd0796..08e51ca9c8b 100644
--- a/ggml/src/ggml-openvino/utils.cpp
+++ b/ggml/src/ggml-openvino/utils.cpp
@@ -621,7 +621,9 @@ ov::Tensor convert_ggml_input_to_ov(std::shared_ptr<GgmlOvDecoder> ggml_decoder,
         input_shape = ggml_decoder->get_shape(ggml_tensor);
     }
 
-    // If the tensor is a result of PERMUTE operation and the model is not fully supported, we need to reconstruct the data based on the view tensor shape & stride
+    // For PERMUTE and VIEW inputs in split models the source data may be non-contiguous,
+    // so it is reconstructed with an explicit strided copy: iterate over all 4 dimensions
+    // using the `nb[]` strides and `view_offs` to fill a contiguous `ov::Tensor` buffer.
     if ((ggml_tensor->op == GGML_OP_PERMUTE || ggml_tensor->op == GGML_OP_VIEW) && ggml_decoder->is_splited_model()) {
         // Create OpenVINO input tensor, the data need to reconstructed based on the view tensor shape & stride
         ov::Tensor input_tensor(ggml_decoder->get_ov_type(ggml_tensor), input_shape);
@@ -648,7 +650,7 @@ ov::Tensor convert_ggml_input_to_ov(std::shared_ptr<GgmlOvDecoder> ggml_decoder,
         }
         return input_tensor;
     }
-    
+
     auto input_tensor = ov::Tensor(ggml_decoder->get_ov_type(ggml_tensor), input_shape, input_data);
     return input_tensor;
 }

From 91a0edae370ee871c5a1fc5b680fbded53ceb729 Mon Sep 17 00:00:00 2001
From: Xuejun <Xuejun.Zhai@intel.com>
Date: Wed, 18 Mar 2026 20:21:34 -0700
Subject: [PATCH 15/24] ggml-openvino: override VIEW op_case to 0 for split
 model inputs

---
 ggml/src/ggml-openvino/ggml-decoder.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp
index 080df1891e6..7751ce21003 100644
--- a/ggml/src/ggml-openvino/ggml-decoder.cpp
+++ b/ggml/src/ggml-openvino/ggml-decoder.cpp
@@ -221,6 +221,9 @@ int GgmlOvDecoder::compute_op_case(const ggml_tensor * node) const {
                 throw std::runtime_error("Unsupported VIEW case");
             }
             op_case = 2;
+            if (m_model_is_splitted && m_model_inputs.find(std::string(src->name)) != m_model_inputs.end()) {
+                op_case = 0;
+            }
         }
         {
             auto * src = node->src[0];

From c670633eec853c6b2ee80d95ddfb1cb715d6665f Mon Sep 17 00:00:00 2001
From: Xuejun <Xuejun.Zhai@intel.com>
Date: Thu, 19 Mar 2026 01:07:18 -0700
Subject: [PATCH 16/24] openvino backend: Handle unsupported VIEW
 shape-mismatch in OpenVINO backend

---
 ggml/src/ggml-openvino/ggml-openvino.cpp | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp
index 84a9e45a146..46672c8d36d 100644
--- a/ggml/src/ggml-openvino/ggml-openvino.cpp
+++ b/ggml/src/ggml-openvino/ggml-openvino.cpp
@@ -914,6 +914,15 @@ static bool is_op_unsupported_case(const ggml_tensor * op) {
         }
         break;
     }
+    case GGML_OP_VIEW: {
+        if (ggml_nelements(op) != ggml_nelements(op->src[0])) {
+            std::cout << __func__ << ": OpenVINO backend does not support VIEW with different number of elements: "
+                      << op->name << " " << ggml_nelements(op)
+                      << " vs " << ggml_nelements(op->src[0]) << std::endl;
+            return true;
+        }
+        break;
+    }
     default:
         break;
     }

From 8e17919f7ed40ed28b59ac130bd2796ca41eca95 Mon Sep 17 00:00:00 2001
From: Xuejun Zhai <xuejun.zhai@intel.com>
Date: Mon, 23 Mar 2026 09:46:11 +0800
Subject: [PATCH 17/24] Enable additional mul_mat tests and add tensor data
 saving function (#81)

---
 ggml/src/ggml-openvino/ggml-openvino.cpp      |  3 -
 .../src/ggml-openvino/openvino/op/permute.cpp | 10 ++-
 ggml/src/ggml-openvino/utils.cpp              | 74 ++++++++++++++++++-
 ggml/src/ggml-openvino/utils.h                |  2 +
 4 files changed, 81 insertions(+), 8 deletions(-)

diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp
index 46672c8d36d..50ed4f37c34 100644
--- a/ggml/src/ggml-openvino/ggml-openvino.cpp
+++ b/ggml/src/ggml-openvino/ggml-openvino.cpp
@@ -867,9 +867,6 @@ static bool is_op_unsupported_case(const ggml_tensor * op) {
         if (op->src[0]->ne[3] != op->src[1]->ne[3] && op->src[0]->ne[3] != 1 && op->src[1]->ne[3] != 1) {
             return true;
         }
-        if (op->src[0]->op == GGML_OP_PERMUTE || op->src[1]->op == GGML_OP_PERMUTE) {
-            return true;
-        }
         if (ggml_is_quantized(op->src[0]->type) && op->src[0]->ne[1] == 1) {
             // MUL_MAT(type_a=q4_0,type_b=f32,m=1,n=2048,k=8192,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1)
             // triggers a bug in ov matmul_shape_inference.hpp
diff --git a/ggml/src/ggml-openvino/openvino/op/permute.cpp b/ggml/src/ggml-openvino/openvino/op/permute.cpp
index 4c800f9ee4f..269fd99f36f 100644
--- a/ggml/src/ggml-openvino/openvino/op/permute.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/permute.cpp
@@ -5,6 +5,7 @@
 #include <climits>
 #include <cstdint>
 #include <memory>
+#include <vector>
 #include <openvino/core/node.hpp>
 #include <openvino/op/add.hpp>
 #include <openvino/op/concat.hpp>
@@ -27,7 +28,14 @@ OutputVector translate_permute(const NodeContext & context) {
 
     ov::Output<Node> res;
     auto src = context.get_input(0);
-    auto perm = ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 2, 1, 3});
+    std::vector<int64_t> perm_values{0, 2, 1, 3};
+    const int32_t* op_params = context.get_output_op_params();
+    if (op_params != nullptr) {
+        for (size_t i = 0; i < perm_values.size(); ++i) {
+            perm_values[i] = static_cast<int64_t>(perm_values.size() - 1 - op_params[perm_values.size() - 1 - i]);
+        }
+    }
+    auto perm = ov::op::v0::Constant::create(ov::element::i64, {4}, perm_values);
 
     if (op_case == 1 || context.is_stateful()) {
         res = std::make_shared<ov::op::v1::Transpose>(src, perm);
diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp
index 08e51ca9c8b..dff8894a27f 100644
--- a/ggml/src/ggml-openvino/utils.cpp
+++ b/ggml/src/ggml-openvino/utils.cpp
@@ -14,6 +14,7 @@
 #include <cstdint>
 #include <cstdlib>
 #include <cstring>
+#include <fstream>
 #include <iomanip>
 #include <iostream>
 #include <memory>
@@ -586,14 +587,17 @@ enum ggml_status naive_compute(ggml_cgraph * cgraph,
         infer_request->set_input_tensor(i, input_tensor);
     }
 
+    // Use get_output_tensor + memcpy instead of set_output_tensor to avoid memory being overwritten
+    // when the input and output buffers overlap, e.g. when the cgraph is a single PERMUTE
+
+    infer_request->infer();
+
     auto ov_results = model->get_results();
     for (size_t i = 0; i < ov_results.size(); i++) {
+        auto output_tensor = infer_request->get_output_tensor(i);
         auto * ggml_tensor = decoder->get_model_outputs().at(ov_results[i]->get_friendly_name());
-        auto output_tensor = create_ov_output_tensor(decoder, infer_request, i, ggml_tensor);
-        infer_request->set_output_tensor(i, output_tensor);
+        std::memcpy(ggml_tensor->data, output_tensor.data(), output_tensor.get_byte_size());
     }
-
-    infer_request->infer();
     return GGML_STATUS_SUCCESS;
 }
 
@@ -792,6 +796,68 @@ size_t checksum(const void * data, size_t size) {
     return sum;
 }
 
+bool save_ggml_tensor_data_to_txt(const ggml_tensor * tensor, const std::string & file_path) {
+    if (tensor == nullptr || tensor->data == nullptr) {
+        return false;
+    }
+
+    std::ofstream out(file_path);
+    if (!out.is_open()) {
+        return false;
+    }
+
+    const size_t n = ggml_nelements(tensor);
+    out << "name: " << tensor->name
+        << ", type: " << ggml_type_name(tensor->type)
+        << ", shape: [" << tensor->ne[0] << ", " << tensor->ne[1] << ", " << tensor->ne[2] << ", " << tensor->ne[3]
+        << "]"
+        << ", elements: " << n
+        << ", data:" << '\n';
+
+    switch (tensor->type) {
+    case GGML_TYPE_F32: {
+        const auto * data = static_cast<const float *>(tensor->data);
+        for (size_t i = 0; i < n; ++i) {
+            out << data[i] << '\n';
+        }
+        break;
+    }
+    case GGML_TYPE_F16: {
+        const auto * data = static_cast<const ggml_fp16_t *>(tensor->data);
+        for (size_t i = 0; i < n; ++i) {
+            out << ggml_fp16_to_fp32(data[i]) << '\n';
+        }
+        break;
+    }
+    case GGML_TYPE_BF16: {
+        const auto * data = static_cast<const ggml_bf16_t *>(tensor->data);
+        for (size_t i = 0; i < n; ++i) {
+            out << ggml_bf16_to_fp32(data[i]) << '\n';
+        }
+        break;
+    }
+    case GGML_TYPE_I32: {
+        const auto * data = static_cast<const int32_t *>(tensor->data);
+        for (size_t i = 0; i < n; ++i) {
+            out << data[i] << '\n';
+        }
+        break;
+    }
+    case GGML_TYPE_I64: {
+        const auto * data = static_cast<const int64_t *>(tensor->data);
+        for (size_t i = 0; i < n; ++i) {
+            out << data[i] << '\n';
+        }
+        break;
+    }
+    default:
+        out << "unsupported tensor type for text dump" << '\n';
+        return false;
+    }
+
+    return true;
+}
+
 void print_input_tensor_info(const std::string & name, const ov::Tensor & tensor) {
     std::cout << "Input name: " << name << ", Input shape: " << tensor.get_shape() << ", Address: " << tensor.data()
               << std::endl;
diff --git a/ggml/src/ggml-openvino/utils.h b/ggml/src/ggml-openvino/utils.h
index b58a11282b4..dd92dc374f4 100644
--- a/ggml/src/ggml-openvino/utils.h
+++ b/ggml/src/ggml-openvino/utils.h
@@ -73,6 +73,8 @@ enum ggml_status ov_graph_compute_static(struct ggml_cgraph * cgraph, std::share
 
 size_t checksum(const void * data, size_t size);
 
+bool save_ggml_tensor_data_to_txt(const ggml_tensor * tensor, const std::string & file_path);
+
 void print_input_tensor_info(const std::string & name, const ov::Tensor & tensor);
 
 void print_output_tensor_info(const std::string & name, const ov::Tensor & tensor, const void * output_dst);

From 8c5ca6040bae2d6e00d68d026fd20ce481860175 Mon Sep 17 00:00:00 2001
From: Xuejun <Xuejun.Zhai@intel.com>
Date: Wed, 25 Mar 2026 20:21:22 -0700
Subject: [PATCH 18/24] ggml-openvino: fix CONT/TRANSPOSE mapping and improve
 dynamic-dimension handling

---
 ggml/src/ggml-openvino/ggml-decoder.cpp       | 35 ++++++++++++-------
 ggml/src/ggml-openvino/ggml-decoder.h         |  4 +++
 ggml/src/ggml-openvino/ggml-openvino.cpp      | 10 +++++-
 ggml/src/ggml-openvino/openvino/decoder.h     |  4 +++
 .../src/ggml-openvino/openvino/node_context.h |  8 +++++
 ggml/src/ggml-openvino/openvino/op/cont.cpp   | 22 ++++--------
 .../ggml-openvino/openvino/op/transpose.cpp   | 31 +++++++++++++++-
 ggml/src/ggml-openvino/utils.cpp              |  3 ++
 8 files changed, 86 insertions(+), 31 deletions(-)

diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp
index 7751ce21003..a8e14a4d29f 100644
--- a/ggml/src/ggml-openvino/ggml-decoder.cpp
+++ b/ggml/src/ggml-openvino/ggml-decoder.cpp
@@ -166,16 +166,6 @@ int GgmlOvDecoder::compute_op_case(const ggml_tensor * node) const {
         }
         break;
     }
-    case GGML_OP_CONT: {
-        if (node->src[0]->op == GGML_OP_PERMUTE) {
-            op_case = 1;
-        } else if (node->src[0]->op == GGML_OP_TRANSPOSE) {
-            op_case = 2;
-        } else if (node->src[0]->op == GGML_OP_VIEW) {
-            op_case = 3;
-        }
-        break;
-    }
     case GGML_OP_PERMUTE: {
         if (node->src[0]->op != GGML_OP_VIEW) {
             op_case = 1;
@@ -195,9 +185,7 @@ int GgmlOvDecoder::compute_op_case(const ggml_tensor * node) const {
         break;
     }
     case GGML_OP_MUL_MAT: {
-        if (node->src[0]->op == GGML_OP_CONT && node->src[0]->src[0]->op == GGML_OP_TRANSPOSE) {
-            op_case = 2;
-        } else if (node->src[0]->op == GGML_OP_VIEW && node->src[1]->op == GGML_OP_VIEW) {
+        if (node->src[0]->op == GGML_OP_VIEW && node->src[1]->op == GGML_OP_VIEW) {
             op_case = 3;
         }
         break;
@@ -314,6 +302,14 @@ std::pair<ModelParams, ComputeParams> GgmlOvDecoder::compute_llm_params(ggml_cgr
             }
             break;
         }
+        // For graphs without GGML_OP_FLASH_ATTN_EXT: when a TRANSPOSE's input is a PERMUTE whose source is a VIEW, take the attention size from ne[0] of the TRANSPOSE node (or from ctx_per_seq in the static case)
+        if (node->op == GGML_OP_TRANSPOSE && node->src[0]->op == GGML_OP_PERMUTE &&
+            node->src[0]->src[0]->op == GGML_OP_VIEW) {
+            compute_params.attention_size = node->ne[0];
+            if (is_static) {
+                compute_params.attention_size = model_params.ctx_per_seq;
+            }
+        }
         if (node->op == GGML_OP_ROPE) {
             memcpy(model_params.rope_params, node->op_params, sizeof(int32_t) * 15);
         }
@@ -880,6 +876,11 @@ ov::element::Type GgmlOvDecoder::get_output_type(const int node_idx) const {
     return get_ov_type(m_node_info_list[node_idx].node);
 }
 
+std::vector<size_t> GgmlOvDecoder::get_output_stride(int node_idx) const {
+    auto * ggml_tensor = m_node_info_list[node_idx].node;
+    return get_stride(ggml_tensor);
+}
+
 std::vector<std::string> GgmlOvDecoder::get_output_names(int node_idx) const {
     return {m_node_info_list[node_idx].node_output_name};
 }
@@ -889,6 +890,14 @@ const std::string & GgmlOvDecoder::get_op_name() const {
     return unknown_name;
 }
 
+int32_t GgmlOvDecoder::get_op_dynamic_dim(int node_idx) const {
+    auto it = m_node_dynamic_dims.find(m_node_info_list[node_idx].node);
+    if (it == m_node_dynamic_dims.end()) {
+        return -1;
+    }
+    return it->second;
+}
+
 const std::string & GgmlOvDecoder::get_op_name(int node_idx) const {
     return m_node_info_list[node_idx].node_name;
 }
diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h
index c793c3d6ae7..ef185dbd324 100644
--- a/ggml/src/ggml-openvino/ggml-decoder.h
+++ b/ggml/src/ggml-openvino/ggml-decoder.h
@@ -107,6 +107,8 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
 
     virtual ov::element::Type get_output_type(int node_idx) const override;
 
+    virtual std::vector<size_t> get_output_stride(int node_idx) const override;
+
     virtual int32_t * get_input_op_params(int node_idx, const std::string & name) const override;
 
     virtual int32_t * get_output_op_params(int node_idx) const override;
@@ -121,6 +123,8 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
 
     virtual const std::string & get_op_name(int node_idx) const override;
 
+    virtual int32_t get_op_dynamic_dim(int node_idx) const override;
+
     virtual void visit_subgraph(std::function<void(std::shared_ptr<GgmlDecoder>, int node_idx)> node_visitor) const override;
 
     ggml_tensor * get_input_ggml_tensor(const std::string & name) const { return m_inputs.at(name); }
diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp
index 50ed4f37c34..94530b82152 100644
--- a/ggml/src/ggml-openvino/ggml-openvino.cpp
+++ b/ggml/src/ggml-openvino/ggml-openvino.cpp
@@ -920,6 +920,14 @@ static bool is_op_unsupported_case(const ggml_tensor * op) {
         }
         break;
     }
+    case GGML_OP_TRANSPOSE: {
+        // TRANSPOSE of BF16 tensors is reported as unsupported
+        if (op->type == GGML_TYPE_BF16) {
+            // GGML_LOG_WARN("OpenVINO backend does not support TRANSPOSE with BF16 type\n");
+            return true;
+        }
+        break;
+    }
     default:
         break;
     }
@@ -941,7 +949,7 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con
                                                GGML_TYPE_Q5_K, GGML_TYPE_Q8_0, GGML_TYPE_Q6_K};
 
     static const std::set<ggml_op> supported_ops{GGML_OP_NONE, GGML_OP_ADD, GGML_OP_MUL, GGML_OP_MUL_MAT, GGML_OP_VIEW,
-                                                 /*GGML_OP_CONT,*/ GGML_OP_RESHAPE, GGML_OP_PERMUTE, GGML_OP_TRANSPOSE,
+                                                 GGML_OP_CONT, GGML_OP_RESHAPE, GGML_OP_PERMUTE, GGML_OP_TRANSPOSE,
                                                  GGML_OP_GET_ROWS, GGML_OP_ROPE, GGML_OP_RMS_NORM, GGML_OP_SCALE,
                                                  // softmax is not updated due to replaced by flash_attn_ext
                                                  // GGML_OP_SOFT_MAX,
diff --git a/ggml/src/ggml-openvino/openvino/decoder.h b/ggml/src/ggml-openvino/openvino/decoder.h
index ed6ff7c0aba..764a269ec7a 100644
--- a/ggml/src/ggml-openvino/openvino/decoder.h
+++ b/ggml/src/ggml-openvino/openvino/decoder.h
@@ -35,6 +35,8 @@ class GgmlDecoder : public DecoderBase {
 
     virtual element::Type get_output_type(const int node_idx) const = 0;
 
+    virtual std::vector<size_t> get_output_stride(int node_idx) const = 0;
+
     virtual int32_t* get_input_op_params(int node_idx, const std::string& name) const = 0;
 
     virtual int32_t * get_output_op_params(int node_idx) const = 0;
@@ -69,6 +71,8 @@ class GgmlDecoder : public DecoderBase {
     virtual bool is_splited_model() const = 0;
 
     virtual int is_swa_layer(int layer) const = 0;
+
+    virtual int32_t get_op_dynamic_dim(int node_idx) const = 0;
 };
 
 }  // namespace ggml
diff --git a/ggml/src/ggml-openvino/openvino/node_context.h b/ggml/src/ggml-openvino/openvino/node_context.h
index aa484128a95..70d6c02e8e1 100644
--- a/ggml/src/ggml-openvino/openvino/node_context.h
+++ b/ggml/src/ggml-openvino/openvino/node_context.h
@@ -59,12 +59,20 @@ class NodeContext : public frontend::NodeContext {
         return m_decoder->get_input_op_params(m_node_idx, m_input_names[index]);
     }
 
+    int32_t get_op_dynamic_dim() const {
+        return m_decoder->get_op_dynamic_dim(m_node_idx);
+    }
+
     int32_t * get_output_op_params() const { return m_decoder->get_output_op_params(m_node_idx); }
 
     ov::element::Type get_output_type() const {
         return m_decoder->get_output_type(m_node_idx);
     }
 
+    std::vector<size_t> get_output_stride() const {
+        return m_decoder->get_output_stride(m_node_idx);
+    }
+
     Output<Node> get_input(int idx) const override {
         return m_tensor_map->at(m_input_names[idx]);
     }
diff --git a/ggml/src/ggml-openvino/openvino/op/cont.cpp b/ggml/src/ggml-openvino/openvino/op/cont.cpp
index 6160dd74444..243e236f166 100644
--- a/ggml/src/ggml-openvino/openvino/op/cont.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/cont.cpp
@@ -18,27 +18,17 @@ namespace op {
 OutputVector translate_cont(const NodeContext & context) {
     num_inputs_check(context, 1, 1);
 
-    int op_case = context.get_op_case();
-    FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2 || op_case == 3, "Unsupported CONT case");
-
     auto src_shape = context.get_input_shape(0).to_shape();
     auto dst_shape = context.get_output_shape().to_shape();
-    ov::Output<Node> res;
 
-    if (op_case == 1) {
-        // The input comes from a PERMUTE
-        throw std::runtime_error("Code of this case might be outdated");
-        dst_shape[1] = -1;
-        res = std::make_shared<ov::op::v1::Reshape>(
-            context.get_input(0), ov::op::v0::Constant::create(ov::element::i64, {dst_shape.size()}, dst_shape), false);
-    } else if (op_case == 2) {
-        // The input comes from a TRANSPOSE
-        return {context.get_input(0)};
-    } else {
-        // The input comes from a VIEW
-        res = process_view_input(context, 0);
+    if (context.get_op_dynamic_dim() != -1) {
+        dst_shape[3 - context.get_op_dynamic_dim()] = -1;
     }
 
+    ov::Output<Node> res;
+    res = std::make_shared<ov::op::v1::Reshape>(
+        context.get_input(0), ov::op::v0::Constant::create(ov::element::i64, {dst_shape.size()}, dst_shape), false);
+
     return rename_outputs_with_suffix({res}, context.get_name());
 }
 
diff --git a/ggml/src/ggml-openvino/openvino/op/transpose.cpp b/ggml/src/ggml-openvino/openvino/op/transpose.cpp
index 8e62e83c0d7..b3b4614e440 100644
--- a/ggml/src/ggml-openvino/openvino/op/transpose.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/transpose.cpp
@@ -12,8 +12,37 @@ namespace op {
 OutputVector translate_transpose(const NodeContext & context) {
     num_inputs_check(context, 1, 1);
 
+    // Compute permute order from input/output shape and stride information
+    // so it adapts to different input and output layouts.
+    auto input_shape = context.get_input_shape(0).to_shape();
+    auto input_stride = context.get_input_stride(0);
+    auto output_shape = context.get_output_shape().to_shape();
+    auto output_stride = context.get_output_stride();
+
+    // Compute permute order by matching output and input stride rankings.
+    // Build <stride, dim_index> pairs.
+    std::vector<std::pair<size_t, int>> output_stride_dims;
+    std::vector<std::pair<size_t, int>> input_stride_dims;
+
+    for (int i = 0; i < 4; ++i) {
+        output_stride_dims.push_back({output_stride[i], i});
+        input_stride_dims.push_back({input_stride[i], i});
+    }
+
+    // Sort by stride in descending order.
+    std::sort(output_stride_dims.rbegin(), output_stride_dims.rend());
+    std::sort(input_stride_dims.rbegin(), input_stride_dims.rend());
+
+    // Build permute order.
+    std::vector<int64_t> permute_order(4);
+    for (int i = 0; i < 4; ++i) {
+        int output_dim = output_stride_dims[i].second;
+        int input_dim = input_stride_dims[i].second;
+        permute_order[output_dim] = input_dim;
+    }
+
     auto res = std::make_shared<ov::op::v1::Transpose>(
-        context.get_input(0), ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 1, 3, 2}));
+        context.get_input(0), ov::op::v0::Constant::create(ov::element::i64, {4}, permute_order));
     return rename_outputs_with_suffix({res}, context.get_name());
 }
 
diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp
index dff8894a27f..2b62e969702 100644
--- a/ggml/src/ggml-openvino/utils.cpp
+++ b/ggml/src/ggml-openvino/utils.cpp
@@ -502,6 +502,9 @@ bool is_model_splitted(ggml_cgraph * cgraph) {
         if ((cgraph->n_nodes <= 1 && use_count==0) || (cgraph->n_nodes <= 1 && node->op == GGML_OP_VIEW && use_count == 1 && node->src[0] != nullptr && node->src[0]->op == GGML_OP_NONE)) {
             return false;
         }
+        if (cgraph->n_nodes == 1 && (cgraph->nodes[0]->op == GGML_OP_TRANSPOSE || cgraph->nodes[0]->op == GGML_OP_PERMUTE)) {
+            return false;
+        }
         int input_use_count = 0;
         for (int j = 0; j < cgraph->n_nodes; j++) {
             ggml_tensor * other_node = cgraph->nodes[j];

From b0d66ec60f3f9ce54a84879587a48af030c84187 Mon Sep 17 00:00:00 2001
From: Xuejun <Xuejun.Zhai@intel.com>
Date: Fri, 27 Mar 2026 18:44:05 -0700
Subject: [PATCH 19/24] OpenVINO: add NORM/TANH support and rework SOFT_MAX
 translation

---
 ggml/src/ggml-openvino/ggml-decoder.cpp       |  20 +++-
 ggml/src/ggml-openvino/ggml-decoder.h         |   3 -
 ggml/src/ggml-openvino/ggml-openvino.cpp      |  15 +--
 ggml/src/ggml-openvino/openvino/op/norm.cpp   |  58 +++++++++
 .../src/ggml-openvino/openvino/op/softmax.cpp | 111 ++++++++++--------
 .../ggml-openvino/openvino/op/unary_tanh.cpp  |  25 ++++
 ggml/src/ggml-openvino/openvino/op_table.cpp  |   2 +
 ggml/src/ggml-openvino/openvino/op_table.h    |   2 +
 8 files changed, 167 insertions(+), 69 deletions(-)
 create mode 100644 ggml/src/ggml-openvino/openvino/op/norm.cpp
 create mode 100644 ggml/src/ggml-openvino/openvino/op/unary_tanh.cpp

diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp
index a8e14a4d29f..001aed63d10 100644
--- a/ggml/src/ggml-openvino/ggml-decoder.cpp
+++ b/ggml/src/ggml-openvino/ggml-decoder.cpp
@@ -215,7 +215,7 @@ int GgmlOvDecoder::compute_op_case(const ggml_tensor * node) const {
         }
         {
             auto * src = node->src[0];
-            if ((ggml_nelements(node) != ggml_nelements(src)) && m_naive) {
+            if (ggml_nelements(node) != ggml_nelements(src)) {
                 // Compare each dimension of node and src, if only one dimension differs then op_case=3
                 int diff_count = 0;
                 for (int i = 0; i < GGML_MAX_DIMS; i++) {
@@ -385,6 +385,11 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * op, co
     if (dynamic_dim_index != -1 && m_model_is_splitted) {
         input_shape[3 - dynamic_dim_index] = -1;
     }
+    if (op->op == GGML_OP_SOFT_MAX && op->src[1] != nullptr && op->src[1]->op == GGML_OP_NONE && op->src[1]->flags & GGML_TENSOR_FLAG_INPUT && op->src[1] == input) {
+        // for softmax input mask, the shape is [1, 1, seq_active, seq_active], where seq_active is determined by the input active sequence length instead of the kv cache sequence length
+        input_shape[2] = -1;
+        input_shape[3] = -1;
+    }
     return input_shape;
 }
 
@@ -934,6 +939,7 @@ std::string GgmlOvDecoder::compute_op_type(const ggml_tensor * node) {
         {GGML_OP_PERMUTE,        "GGML_OP_PERMUTE"       },
         {GGML_OP_RESHAPE,        "GGML_OP_RESHAPE"       },
         {GGML_OP_RMS_NORM,       "GGML_OP_RMS_NORM"      },
+        {GGML_OP_NORM,           "GGML_OP_NORM"          },
         {GGML_OP_ROPE,           "GGML_OP_ROPE"          },
         {GGML_OP_SCALE,          "GGML_OP_SCALE"         },
         {GGML_OP_SOFT_MAX,       "GGML_OP_SOFT_MAX"      },
@@ -1024,6 +1030,10 @@ void GgmlOvDecoder::compute_node_dynamic_dims() {
                     m_node_dynamic_dims[src] = 0;
                     continue;
                 }
+                if ( node->op == GGML_OP_VIEW && src->op == GGML_OP_NONE && !is_stateful()) {
+                    m_node_dynamic_dims[src] = 1;
+                    continue;
+                }
                 self(self, src);
             }
         }
@@ -1085,6 +1095,10 @@ void GgmlOvDecoder::compute_node_dynamic_dims() {
             // identifies the dynamic dim even when two dims share the same size.
             m_node_dynamic_dims[node] = -1;
             if (m_node_dynamic_dims[node->src[0]] != -1) {
+                if (node->src[0]->op == GGML_OP_NONE) {
+                    m_node_dynamic_dims[node] = m_node_dynamic_dims[node->src[0]];
+                    break;
+                }
                 auto dynamic_dim_idx   = m_node_dynamic_dims[node->src[0]];
                 auto dynamic_dim_value = node->src[0]->ne[dynamic_dim_idx];
                 auto dynamic_dim_stride =
@@ -1103,6 +1117,7 @@ void GgmlOvDecoder::compute_node_dynamic_dims() {
             }
             break;
         }
+        case GGML_OP_TRANSPOSE:
         case GGML_OP_RESHAPE: {
             // RESHAPE requires src[0] to be contiguous, so both src and result
             // have standard compact strides: nb[i] = type_size * prod(ne[0..i-1]).
@@ -1179,14 +1194,15 @@ void GgmlOvDecoder::compute_node_dynamic_dims() {
             }
             break;
         case GGML_OP_RMS_NORM:
+        case GGML_OP_NORM:
         case GGML_OP_ADD:
         case GGML_OP_GLU:
         case GGML_OP_ROPE:
         case GGML_OP_SCALE:
-        case GGML_OP_TRANSPOSE:
         case GGML_OP_SOFT_MAX:
         case GGML_OP_ARGSORT:
         case GGML_OP_ADD_ID:
+        case GGML_OP_UNARY:
             m_node_dynamic_dims[node] = m_node_dynamic_dims[node->src[0]];
             break;
         case GGML_OP_MUL_MAT_ID:
diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h
index ef185dbd324..c19be52712c 100644
--- a/ggml/src/ggml-openvino/ggml-decoder.h
+++ b/ggml/src/ggml-openvino/ggml-decoder.h
@@ -266,9 +266,6 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
         if (is_inp_emb(tensor, op)) {
             return "embd";
         }
-        if (is_output_idx(tensor, op)) {
-            return "inp_out_ids";
-        }
         if (is_inp_mask(tensor, op)) {
             return std::string(tensor->name).find("swa") == std::string::npos ? "self_kq_mask" : "self_kq_mask_swa";
         }
diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp
index 94530b82152..9785038bce2 100644
--- a/ggml/src/ggml-openvino/ggml-openvino.cpp
+++ b/ggml/src/ggml-openvino/ggml-openvino.cpp
@@ -807,15 +807,6 @@ static bool is_op_unsupported_case(const ggml_tensor * op) {
             // GGML_LOG_WARN("OpenVINO backend does not support SOFT_MAX with sinks\n");
             return true;
         }
-        float scale = 1.0f;
-        float max_bias = 0.0f;
-        const auto * op_params = op->op_params;
-        memcpy(&scale, (const float *) op_params + 0, sizeof(float));
-        memcpy(&max_bias, (const float *) op_params + 1, sizeof(float));
-        if (max_bias > 0) {
-            // GGML_LOG_WARN("OpenVINO backend does not support SOFT_MAX with max_bias > 0\n");
-            return true;
-        }
         break;
     }
     case GGML_OP_FLASH_ATTN_EXT: {
@@ -950,13 +941,13 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con
 
     static const std::set<ggml_op> supported_ops{GGML_OP_NONE, GGML_OP_ADD, GGML_OP_MUL, GGML_OP_MUL_MAT, GGML_OP_VIEW,
                                                  GGML_OP_CONT, GGML_OP_RESHAPE, GGML_OP_PERMUTE, GGML_OP_TRANSPOSE,
-                                                 GGML_OP_GET_ROWS, GGML_OP_ROPE, GGML_OP_RMS_NORM, GGML_OP_SCALE,
-                                                 // softmax is not updated due to replaced by flash_attn_ext
-                                                 // GGML_OP_SOFT_MAX,
+                                                 GGML_OP_GET_ROWS, GGML_OP_ROPE, GGML_OP_RMS_NORM, GGML_OP_SCALE, GGML_OP_NORM,
+                                                 GGML_OP_SOFT_MAX,
                                                  GGML_OP_SET_ROWS, GGML_OP_FLASH_ATTN_EXT, GGML_OP_CPY};
     static const std::set<ggml_unary_op> supported_unary_ops{
         GGML_UNARY_OP_GELU,
         GGML_UNARY_OP_SILU,
+        GGML_UNARY_OP_TANH,
     };
     static const std::set<ggml_glu_op> supported_glu_ops{
         GGML_GLU_OP_SWIGLU,
diff --git a/ggml/src/ggml-openvino/openvino/op/norm.cpp b/ggml/src/ggml-openvino/openvino/op/norm.cpp
new file mode 100644
index 00000000000..b6e54914e1f
--- /dev/null
+++ b/ggml/src/ggml-openvino/openvino/op/norm.cpp
@@ -0,0 +1,58 @@
+#include "../node_context.h"
+#include "../op_table.h"
+#include "../utils.h"
+
+#include <memory>
+#include <openvino/op/add.hpp>
+#include <openvino/op/constant.hpp>
+#include <openvino/op/divide.hpp>
+#include <openvino/op/multiply.hpp>
+#include <openvino/op/power.hpp>
+#include <openvino/op/reduce_mean.hpp>
+#include <openvino/op/sqrt.hpp>
+#include <openvino/op/subtract.hpp>
+
+namespace ov {
+namespace frontend {
+namespace ggml {
+namespace op {
+
+OutputVector translate_norm(const NodeContext & context) {
+    // GGML_OP_NORM: out = (x - mean(x)) / sqrt(var(x) + eps), reduced over the last axis.
+    num_inputs_check(context, 1, 1);
+    const auto x = context.get_input(0);
+
+    // eps is read from the first float of the op params.
+    float eps;
+    memcpy(&eps, context.get_output_op_params(), sizeof(float));
+
+    // mean(x); keep_dims=true leaves a size-1 last axis so it broadcasts against x.
+    const auto mean_axis = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {-1});
+    const auto mu = std::make_shared<ov::op::v1::ReduceMean>(x, mean_axis, true);
+
+    // x - mean(x)
+    const auto diff = std::make_shared<ov::op::v1::Subtract>(x, mu);
+
+    // (x - mean(x))^2
+    const auto two = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{1}, {2.0f});
+    const auto diff_sq = std::make_shared<ov::op::v1::Power>(diff, two);
+
+    // var(x) = mean((x - mean(x))^2), again over the last axis
+    const auto var_axis = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {-1});
+    const auto var = std::make_shared<ov::op::v1::ReduceMean>(diff_sq, var_axis, true);
+
+    // sqrt(var(x) + eps)
+    const auto eps_const = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{1}, {eps});
+    const auto var_eps = std::make_shared<ov::op::v1::Add>(var, eps_const);
+    const auto denom = std::make_shared<ov::op::v0::Sqrt>(var_eps);
+
+    // (x - mean(x)) / sqrt(var(x) + eps)
+    const auto normalized = std::make_shared<ov::op::v1::Divide>(diff, denom);
+
+    return rename_outputs_with_suffix({normalized}, context.get_name());
+}
+
+}  // namespace op
+}  // namespace ggml
+}  // namespace frontend
+}  // namespace ov
diff --git a/ggml/src/ggml-openvino/openvino/op/softmax.cpp b/ggml/src/ggml-openvino/openvino/op/softmax.cpp
index 9f6330862be..6b3a679c6db 100644
--- a/ggml/src/ggml-openvino/openvino/op/softmax.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/softmax.cpp
@@ -2,18 +2,16 @@
 #include "../op_table.h"
 #include "../utils.h"
 
-#include <climits>
+#include <cstring>
 #include <cstdint>
+#include <cmath>
 #include <memory>
-#include <openvino/core/node.hpp>
-#include <openvino/core/node_output.hpp>
+#include <openvino/frontend/exception.hpp>
 #include <openvino/op/add.hpp>
-#include <openvino/op/concat.hpp>
 #include <openvino/op/constant.hpp>
 #include <openvino/op/convert.hpp>
-#include <openvino/op/matmul.hpp>
 #include <openvino/op/multiply.hpp>
-#include <openvino/op/slice.hpp>
+#include <openvino/op/reshape.hpp>
 #include <openvino/op/softmax.hpp>
 #include <vector>
 
@@ -22,63 +20,72 @@ namespace frontend {
 namespace ggml {
 namespace op {
 
+// Reimplementation of GGML_OP_SOFT_MAX semantics for the OpenVINO backend:
+// 1) logits = src0 * scale
+// 2) logits += mask (if provided; when max_bias > 0 the mask is first scaled by per-head ALiBi slopes)
+// 3) softmax over the last dimension
 OutputVector translate_soft_max(const NodeContext & context) {
-    // TODO code is outdated
     num_inputs_check(context, 1, 2);
 
-    auto input_node = context.get_input(0).get_node_shared_ptr();
-    ov::Output<Node> res;
-
     float scale = 1.0f;
     float max_bias = 0.0f;
-    auto * op_params = context.get_output_op_params();
-    memcpy(&scale, (float *) op_params + 0, sizeof(float));
-    memcpy(&max_bias, (float *) op_params + 1, sizeof(float));
-    auto src0_shape = context.get_input_shape(0).get_shape();
-    const uint32_t h = src0_shape[2];
-    const uint32_t n_head = src0_shape[0];
-    const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head));
-
-    const float m0 = powf(2.0f, -(max_bias) / n_head_log2);
-    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
-    const float slope =
-        (max_bias > 0.0f) ? h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2 * (h - n_head_log2) + 1) : 1.0f;
-
-    auto scale_node = std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{}, std::vector<float>{scale});
-    auto scaled_input = std::make_shared<ov::op::v1::Multiply>(input_node, scale_node);
-
-    if (context.get_input_size() < 2) {
-        res = std::make_shared<ov::op::v8::Softmax>(scaled_input, 2);
-        return rename_outputs_with_suffix({res}, context.get_name());
-    }
+    memcpy(&scale, (float *) context.get_output_op_params() + 0, sizeof(float));
+    memcpy(&max_bias, (float *) context.get_output_op_params() + 1, sizeof(float));
 
-    ov::Output<ov::Node> mask_node_sliced;
-    if (context.has_input("KQ_mask_sliced")) {
-        mask_node_sliced = context.get_input("KQ_mask_sliced");
-    } else {
-        auto token_len = get_dimensions(input_node, {1});
-        auto mask_node = context.get_input(1);
-        auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
-        auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
-        mask_node_sliced = std::make_shared<ov::op::v8::Slice>(mask_node, zero, token_len, one, one);
-    }
+    ov::Output<ov::Node> logits = context.get_input(0);
 
-    if (mask_node_sliced.get_element_type() != context.get_output_type()) {
-        mask_node_sliced = std::make_shared<ov::op::v0::Convert>(mask_node_sliced, context.get_output_type());
+    // Apply scale first: logits = src0 * scale
+    if (scale != 1.0f) {
+        auto scale_const = std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{}, std::vector<float>{scale});
+        logits = std::make_shared<ov::op::v1::Multiply>(logits, scale_const);
     }
 
-    Output<Node> slope_mask;
-    if (slope != 1.0f) {
-        auto slope_node =
-            std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{}, std::vector<float>{slope});
-        slope_mask = std::make_shared<ov::op::v1::Multiply>(mask_node_sliced, slope_node);
-        throw std::runtime_error("Slope != 1.0f in softmax has not been tested, verify it before use.");
-    }
-    slope_mask = mask_node_sliced;
+    FRONT_END_CHECK_IMPLEMENTED(!(max_bias > 0.0f && context.get_input_size() < 2),
+                                "OpenVINO softmax ALiBi path requires mask input");
+
+    // Optional mask add: logits += mask
+    // For max_bias > 0 (ALiBi), apply per-head slope to mask before adding.
+    if (context.get_input_size() > 1) {
+        ov::Output<ov::Node> mask = context.get_input(1);
+        if (mask.get_element_type() != logits.get_element_type()) {
+            mask = std::make_shared<ov::op::v0::Convert>(mask, logits.get_element_type());
+        }
+
+        if (max_bias > 0.0f) {
+            auto out_shape = context.get_output_shape().to_shape();
+            FRONT_END_CHECK_IMPLEMENTED(out_shape.size() == 4,
+                                        "OpenVINO softmax ALiBi path expects rank-4 tensor");
 
-    auto input_slope_mask_node = std::make_shared<ov::op::v1::Add>(scaled_input, slope_mask);
+            const uint32_t n_head = static_cast<uint32_t>(out_shape[1]);
+            FRONT_END_CHECK_IMPLEMENTED(n_head > 0, "OpenVINO softmax ALiBi path expects n_head > 0");
+
+            const uint32_t n_head_log2 = 1u << static_cast<uint32_t>(std::floor(std::log2(static_cast<float>(n_head))));
+            const float m0 = std::pow(2.0f, -(max_bias) / static_cast<float>(n_head_log2));
+            const float m1 = std::pow(2.0f, -(max_bias / 2.0f) / static_cast<float>(n_head_log2));
+
+            std::vector<float> slopes(n_head);
+            for (uint32_t h = 0; h < n_head; ++h) {
+                slopes[h] = h < n_head_log2 ? std::pow(m0, static_cast<float>(h + 1))
+                                             : std::pow(m1, static_cast<float>(2 * (h - n_head_log2) + 1));
+            }
+
+            ov::Output<ov::Node> slope_node =
+                std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{n_head}, slopes);
+            if (slope_node.get_element_type() != mask.get_element_type()) {
+                slope_node = std::make_shared<ov::op::v0::Convert>(slope_node, mask.get_element_type());
+            }
+
+            auto slope_shape = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{4},
+                                                                       std::vector<int64_t>{1, static_cast<int64_t>(n_head), 1, 1});
+            auto slope_4d = std::make_shared<ov::op::v1::Reshape>(slope_node, slope_shape, false);
+            mask = std::make_shared<ov::op::v1::Multiply>(mask, slope_4d);
+        }
+
+        logits = std::make_shared<ov::op::v1::Add>(logits, mask);
+    }
 
-    res = std::make_shared<ov::op::v8::Softmax>(input_slope_mask_node, 2);
+    // Softmax along last dimension (equivalent to ggml softmax over ne[0]).
+    auto res = std::make_shared<ov::op::v8::Softmax>(logits, -1);
 
     return rename_outputs_with_suffix({res}, context.get_name());
 }
diff --git a/ggml/src/ggml-openvino/openvino/op/unary_tanh.cpp b/ggml/src/ggml-openvino/openvino/op/unary_tanh.cpp
new file mode 100644
index 00000000000..5e6744b2290
--- /dev/null
+++ b/ggml/src/ggml-openvino/openvino/op/unary_tanh.cpp
@@ -0,0 +1,25 @@
+#include "../node_context.h"
+#include "../op_table.h"
+#include "../utils.h"
+
+#include <openvino/core/node_output.hpp>
+#include <openvino/op/tanh.hpp>
+
+namespace ov {
+namespace frontend {
+namespace ggml {
+namespace op {
+
+OutputVector translate_unary_tanh(const NodeContext & context) {
+    // Exactly one input is expected; it is fed straight into an OpenVINO Tanh node.
+    num_inputs_check(context, 1, 1);
+
+    const auto tanh_node = std::make_shared<ov::op::v0::Tanh>(context.get_input(0));
+
+    return rename_outputs_with_suffix({tanh_node}, context.get_name());
+}
+
+}  // namespace op
+}  // namespace ggml
+}  // namespace frontend
+}  // namespace ov
diff --git a/ggml/src/ggml-openvino/openvino/op_table.cpp b/ggml/src/ggml-openvino/openvino/op_table.cpp
index 1385539279c..723ade12c54 100644
--- a/ggml/src/ggml-openvino/openvino/op_table.cpp
+++ b/ggml/src/ggml-openvino/openvino/op_table.cpp
@@ -26,6 +26,7 @@ std::unordered_map<std::string, CreatorFunction> get_supported_ops() {
         {"GGML_OP_PERMUTE",        op::translate_permute                          },
         {"GGML_OP_RESHAPE",        op::translate_reshape                          },
         {"GGML_OP_RMS_NORM",       op::translate_rms_norm                         },
+        {"GGML_OP_NORM",           op::translate_norm                             },
         {"GGML_OP_ROPE",           op::translate_rope                             },
         {"GGML_OP_SCALE",          op::translate_scale                            },
         {"GGML_OP_SOFT_MAX",       op::translate_soft_max                         },
@@ -33,6 +34,7 @@ std::unordered_map<std::string, CreatorFunction> get_supported_ops() {
         {"GGML_OP_TRANSPOSE",      op::translate_transpose                        },
         {"GGML_UNARY_OP_GELU",     op::translate_unary_gelu                       },
         {"GGML_UNARY_OP_SILU",     op::translate_unary_silu                       },
+        {"GGML_UNARY_OP_TANH",     op::translate_unary_tanh                       },
         {"GGML_OP_VIEW",           op::translate_view                             },
         {"GGML_GLU_OP_SWIGLU",     op::translate_glu_swiglu                       },
         {"GGML_GLU_OP_GEGLU",      op::translate_glu_geglu                        },
diff --git a/ggml/src/ggml-openvino/openvino/op_table.h b/ggml/src/ggml-openvino/openvino/op_table.h
index f546796d2ee..a2614ae5762 100644
--- a/ggml/src/ggml-openvino/openvino/op_table.h
+++ b/ggml/src/ggml-openvino/openvino/op_table.h
@@ -18,10 +18,12 @@ GGML_OP_CONVERTER(translate_mulmat);
 GGML_OP_CONVERTER(translate_permute);
 GGML_OP_CONVERTER(translate_reshape);
 GGML_OP_CONVERTER(translate_rms_norm);
+GGML_OP_CONVERTER(translate_norm);
 GGML_OP_CONVERTER(translate_rope);
 GGML_OP_CONVERTER(translate_scale);
 GGML_OP_CONVERTER(translate_unary_silu);
 GGML_OP_CONVERTER(translate_unary_gelu);
+GGML_OP_CONVERTER(translate_unary_tanh);
 GGML_OP_CONVERTER(translate_soft_max);
 GGML_OP_CONVERTER(translate_transpose);
 GGML_OP_CONVERTER(translate_view);

From 41192585f5d76b77c445dd5573078912f34adbd0 Mon Sep 17 00:00:00 2001
From: Xuejun <Xuejun.Zhai@intel.com>
Date: Mon, 30 Mar 2026 01:42:04 -0700
Subject: [PATCH 20/24] ggml-openvino: extend VIEW handling

---
 ggml/src/ggml-openvino/ggml-decoder.cpp       |  28 ++-
 ggml/src/ggml-openvino/ggml-decoder.h         |   2 +
 ggml/src/ggml-openvino/ggml-openvino.cpp      |   9 -
 ggml/src/ggml-openvino/openvino/decoder.h     |   2 +
 .../src/ggml-openvino/openvino/node_context.h |   2 +
 ggml/src/ggml-openvino/openvino/op/view.cpp   | 162 +++++++++++++++++-
 ggml/src/ggml-openvino/utils.cpp              |   2 +-
 7 files changed, 191 insertions(+), 16 deletions(-)

diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp
index 001aed63d10..8787a51cfc4 100644
--- a/ggml/src/ggml-openvino/ggml-decoder.cpp
+++ b/ggml/src/ggml-openvino/ggml-decoder.cpp
@@ -216,14 +216,32 @@ int GgmlOvDecoder::compute_op_case(const ggml_tensor * node) const {
         {
             auto * src = node->src[0];
             if (ggml_nelements(node) != ggml_nelements(src)) {
-                // Compare each dimension of node and src, if only one dimension differs then op_case=3
+                // Case 4: select one slice on src dim1 (via view offset), keep src dim2 as output dim1.
+                // Typical pattern:
+                //   src: ne=[N, M, K, 1], nb=[b0, b1, b2, b3]
+                //   dst: ne=[N, K, 1, 1], nb=[b0, b2, b3, b3]
+                if (node->ne[0] == src->ne[0] &&
+                    node->ne[1] == src->ne[2] &&
+                    node->ne[2] == 1 &&
+                    node->nb[0] == src->nb[0] &&
+                    node->nb[1] == src->nb[2] &&
+                    src->ne[1] > 1) {
+                    op_case = 4;
+                    break;
+                }
+
+                // General case 3: shape differs from source (one or more dims) and is handled as VIEW slicing.
                 int diff_count = 0;
                 for (int i = 0; i < GGML_MAX_DIMS; i++) {
                     if (node->ne[i] != src->ne[i]) {
                         diff_count++;
                     }
+                    // if node ne[i] > src ne[i], case = 0
+                    if (node->ne[i] > src->ne[i]) {
+                        return 0;
+                    }
                 }
-                if (diff_count == 1) {
+                if (diff_count >= 1) {
                     op_case = 3;
                 }
             }
@@ -915,6 +933,10 @@ int32_t * GgmlOvDecoder::get_output_op_params(int node_idx) const {
     return m_node_info_list[node_idx].node->op_params;
 }
 
+size_t GgmlOvDecoder::get_output_op_offset(int node_idx) const {
+    return m_node_info_list[node_idx].node->view_offs;  // ggml view_offs of the node (presumably a byte offset, like nb[] -- confirm)
+}
+
 void GgmlOvDecoder::visit_subgraph(std::function<void(std::shared_ptr<GgmlDecoder>, int node_idx)> node_visitor) const {
     for (int node_idx = 0; node_idx < m_cgraph->n_nodes; node_idx++) {
         if (m_cgraph->nodes[node_idx]->op == GGML_OP_NONE) {
@@ -1030,7 +1052,7 @@ void GgmlOvDecoder::compute_node_dynamic_dims() {
                     m_node_dynamic_dims[src] = 0;
                     continue;
                 }
-                if ( node->op == GGML_OP_VIEW && src->op == GGML_OP_NONE && !is_stateful()) {
+                if ( node->op == GGML_OP_VIEW && src->op == GGML_OP_NONE && !is_stateful() && !m_model_is_splitted) {
                     m_node_dynamic_dims[src] = 1;
                     continue;
                 }
diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h
index c19be52712c..1a7849c5251 100644
--- a/ggml/src/ggml-openvino/ggml-decoder.h
+++ b/ggml/src/ggml-openvino/ggml-decoder.h
@@ -113,6 +113,8 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
 
     virtual int32_t * get_output_op_params(int node_idx) const override;
 
+    virtual size_t get_output_op_offset(int node_idx) const override;
+
     virtual std::vector<std::string> get_output_names(int node_idx) const override;
 
     virtual const std::string & get_op_type() const override;
diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp
index 9785038bce2..49e2172ad3b 100644
--- a/ggml/src/ggml-openvino/ggml-openvino.cpp
+++ b/ggml/src/ggml-openvino/ggml-openvino.cpp
@@ -902,15 +902,6 @@ static bool is_op_unsupported_case(const ggml_tensor * op) {
         }
         break;
     }
-    case GGML_OP_VIEW: {
-        if (ggml_nelements(op) != ggml_nelements(op->src[0])) {
-            std::cout << __func__ << ": OpenVINO backend does not support VIEW with different number of elements: "
-                      << op->name << " " << ggml_nelements(op)
-                      << " vs " << ggml_nelements(op->src[0]) << std::endl;
-            return true;
-        }
-        break;
-    }
     case GGML_OP_TRANSPOSE: {
         // if the type is bf16, will return true
         if (op->type == GGML_TYPE_BF16) {
diff --git a/ggml/src/ggml-openvino/openvino/decoder.h b/ggml/src/ggml-openvino/openvino/decoder.h
index 764a269ec7a..b487afd720d 100644
--- a/ggml/src/ggml-openvino/openvino/decoder.h
+++ b/ggml/src/ggml-openvino/openvino/decoder.h
@@ -41,6 +41,8 @@ class GgmlDecoder : public DecoderBase {
 
     virtual int32_t * get_output_op_params(int node_idx) const = 0;
 
+    virtual size_t get_output_op_offset(int node_idx) const = 0;
+
     virtual std::vector<std::string> get_output_names(int node_idx) const = 0;
 
     virtual const std::string& get_op_type() const = 0;
diff --git a/ggml/src/ggml-openvino/openvino/node_context.h b/ggml/src/ggml-openvino/openvino/node_context.h
index 70d6c02e8e1..26498566134 100644
--- a/ggml/src/ggml-openvino/openvino/node_context.h
+++ b/ggml/src/ggml-openvino/openvino/node_context.h
@@ -65,6 +65,8 @@ class NodeContext : public frontend::NodeContext {
 
     int32_t * get_output_op_params() const { return m_decoder->get_output_op_params(m_node_idx); }
 
+    size_t get_output_op_offset() const { return m_decoder->get_output_op_offset(m_node_idx); }
+
     ov::element::Type get_output_type() const {
         return m_decoder->get_output_type(m_node_idx);
     }
diff --git a/ggml/src/ggml-openvino/openvino/op/view.cpp b/ggml/src/ggml-openvino/openvino/op/view.cpp
index 8528d252336..93831af9b4d 100644
--- a/ggml/src/ggml-openvino/openvino/op/view.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/view.cpp
@@ -1,6 +1,7 @@
 #include "../op_table.h"
 #include "../utils.h"
 #include <openvino/op/reshape.hpp>
+#include <set>
 namespace ov {
 namespace frontend {
 namespace ggml {
@@ -28,6 +29,49 @@ OutputVector translate_view(const NodeContext & context) {
 
         auto dst_shape = context.get_output_shape().to_shape();
 
+        std::vector<size_t> diff_dims;
+        for (size_t i = 0; i < dst_shape.size(); ++i) {
+            if (dst_shape[i] != input_llama_shape[i]) {
+                diff_dims.push_back(i);
+            }
+        }
+
+        FRONT_END_CHECK_IMPLEMENTED(!diff_dims.empty(), "VIEW op_case 3 failed to infer changed dims");
+
+        const size_t offset = context.get_output_op_offset();
+        const auto input_stride = context.get_input_stride(0);
+        FRONT_END_CHECK_IMPLEMENTED(input_stride.size() == dst_shape.size(),
+                                    "VIEW op_case 3 shape/stride rank mismatch");
+
+        // Multi-dim change: infer begin/end for each axis from shape/stride/offset directly.
+        if (diff_dims.size() > 1) {
+            std::vector<int64_t> begin(dst_shape.size(), 0);
+            std::vector<int64_t> end(dst_shape.size(), 0);
+            std::vector<int64_t> step(dst_shape.size(), 1);
+            std::vector<int64_t> axes(dst_shape.size(), 0);
+
+            size_t rem_offset = offset;
+            for (size_t i = 0; i < dst_shape.size(); ++i) {
+                FRONT_END_CHECK_IMPLEMENTED(input_stride[i] > 0, "VIEW op_case 3 invalid stride");
+                begin[i] = static_cast<int64_t>(rem_offset / input_stride[i]);
+                rem_offset %= input_stride[i];
+                end[i] = begin[i] + static_cast<int64_t>(dst_shape[i]);
+                axes[i] = static_cast<int64_t>(i);
+
+                FRONT_END_CHECK_IMPLEMENTED(begin[i] >= 0 &&
+                                                end[i] <= static_cast<int64_t>(input_llama_shape[i]),
+                                            "VIEW op_case 3 multi-dim inferred slice out of bounds");
+            }
+
+            auto sliced = std::make_shared<ov::op::v8::Slice>(
+                input,
+                ov::op::v0::Constant::create(ov::element::i64, {begin.size()}, begin),
+                ov::op::v0::Constant::create(ov::element::i64, {end.size()}, end),
+                ov::op::v0::Constant::create(ov::element::i64, {step.size()}, step),
+                ov::op::v0::Constant::create(ov::element::i64, {axes.size()}, axes));
+            return {sliced};
+        }
+
         // find the index of dst_shape that is different from input shape, and use that index to slice the input
         int slice_dim = -1;
         for (size_t i = 0; i < dst_shape.size(); ++i) {
@@ -37,12 +81,124 @@ OutputVector translate_view(const NodeContext & context) {
             }
         }
 
-        auto begin = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
-        auto end = ov::op::v0::Constant::create(ov::element::i64, {1}, {dst_shape[slice_dim]});
+        FRONT_END_CHECK_IMPLEMENTED(slice_dim >= 0, "VIEW op_case 3 failed to infer slice dim");
+
+        FRONT_END_CHECK_IMPLEMENTED(input_stride[slice_dim] > 0, "VIEW op_case 3 invalid stride");
+
+        const int64_t dim_size = static_cast<int64_t>(input_llama_shape[slice_dim]);
+
+        if (offset % input_stride[slice_dim] == 0) {
+            const int64_t begin_val = static_cast<int64_t>((offset / input_stride[slice_dim]) % static_cast<size_t>(dim_size));
+            const int64_t end_val = begin_val + static_cast<int64_t>(dst_shape[slice_dim]);
+
+            FRONT_END_CHECK_IMPLEMENTED(begin_val >= 0 &&
+                                            end_val <= dim_size,
+                                        "VIEW op_case 3 inferred slice out of bounds");
+
+            auto begin = ov::op::v0::Constant::create(ov::element::i64, {1}, {begin_val});
+            auto end = ov::op::v0::Constant::create(ov::element::i64, {1}, {end_val});
+            auto stride = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
+            auto axes = ov::op::v0::Constant::create(ov::element::i64, {1}, {slice_dim});
+            auto sliced = std::make_shared<ov::op::v8::Slice>(input, begin, end, stride, axes);
+            return {sliced};
+        }
+
+        // Fallback for offsets that cross lower dimensions: flatten tail dims, slice 1D range, then reshape.
+        FRONT_END_CHECK_IMPLEMENTED(slice_dim + 1 < static_cast<int>(dst_shape.size()),
+                                    "VIEW op_case 3 fallback requires lower dimensions");
+
+        int64_t tail_src_elems = 1;
+        int64_t tail_dst_elems = 1;
+        for (size_t i = static_cast<size_t>(slice_dim); i < input_llama_shape.size(); ++i) {
+            tail_src_elems *= static_cast<int64_t>(input_llama_shape[i]);
+            tail_dst_elems *= static_cast<int64_t>(dst_shape[i]);
+        }
+
+        const auto elem_stride = input_stride.back();
+        FRONT_END_CHECK_IMPLEMENTED(elem_stride > 0 && offset % elem_stride == 0,
+                                    "VIEW op_case 3 fallback invalid element stride/alignment");
+
+        const int64_t tail_begin = static_cast<int64_t>((offset / elem_stride) % static_cast<size_t>(tail_src_elems));
+        const int64_t tail_end = tail_begin + tail_dst_elems;
+        FRONT_END_CHECK_IMPLEMENTED(tail_begin >= 0 && tail_end <= tail_src_elems,
+                                    "VIEW op_case 3 fallback slice out of bounds");
+
+        std::vector<int64_t> flat_shape;
+        for (int i = 0; i < slice_dim; ++i) {
+            flat_shape.push_back(static_cast<int64_t>(input_llama_shape[i]));
+        }
+        flat_shape.push_back(tail_src_elems);
+
+        auto flat = std::make_shared<ov::op::v1::Reshape>(
+            input,
+            ov::op::v0::Constant::create(ov::element::i64, {flat_shape.size()}, flat_shape),
+            false);
+
+        auto begin = ov::op::v0::Constant::create(ov::element::i64, {1}, {tail_begin});
+        auto end = ov::op::v0::Constant::create(ov::element::i64, {1}, {tail_end});
         auto stride = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
         auto axes = ov::op::v0::Constant::create(ov::element::i64, {1}, {slice_dim});
+        auto sliced = std::make_shared<ov::op::v8::Slice>(flat, begin, end, stride, axes);
+
+        auto reshaped = std::make_shared<ov::op::v1::Reshape>(
+            sliced,
+            ov::op::v0::Constant::create(ov::element::i64, {dst_shape.size()}, dst_shape),
+            false);
+        return {reshaped};
+    }
+
+    // op_case 4: view offset selects one index from a middle dimension, then output keeps another source dim.
+    // Example: src [N,M,K,1] -> dst [N,K,1,1] with offsets 0, nb1, 2*nb1, ...
+    if (context.get_op_case() == 4) {
+        auto input = context.get_input(0);
+        auto src_shape = context.get_input_shape(0).to_shape();
+        auto dst_shape = context.get_output_shape().to_shape();
+        auto src_stride = context.get_input_stride(0);
+        auto dst_stride = context.get_output_stride();
+
+        FRONT_END_CHECK_IMPLEMENTED(src_shape.size() == dst_shape.size() &&
+                                        src_shape.size() == src_stride.size() &&
+                                        src_shape.size() == dst_stride.size(),
+                                    "VIEW op_case 4 shape/stride rank mismatch");
+
+        std::set<size_t> used_dst_strides;
+        for (size_t i = 0; i < dst_shape.size(); ++i) {
+            if (dst_shape[i] > 1) {
+                used_dst_strides.insert(dst_stride[i]);
+            }
+        }
+
+        int64_t slice_axis = -1;
+        for (size_t i = 0; i < src_shape.size(); ++i) {
+            if (src_shape[i] > 1 && used_dst_strides.find(src_stride[i]) == used_dst_strides.end()) {
+                slice_axis = static_cast<int64_t>(i);
+                break;
+            }
+        }
+        FRONT_END_CHECK_IMPLEMENTED(slice_axis >= 0, "VIEW op_case 4 failed to infer slice axis");
+
+        const size_t offset = context.get_output_op_offset();
+        const size_t axis_stride = src_stride[static_cast<size_t>(slice_axis)];
+        FRONT_END_CHECK_IMPLEMENTED(axis_stride > 0, "VIEW op_case 4 invalid axis stride");
+
+        const int64_t axis_size = static_cast<int64_t>(src_shape[static_cast<size_t>(slice_axis)]);
+        const int64_t slice_index = static_cast<int64_t>((offset / axis_stride) % static_cast<size_t>(axis_size));
+
+        auto begin = ov::op::v0::Constant::create(ov::element::i64, {1}, {slice_index});
+        auto end = ov::op::v0::Constant::create(ov::element::i64, {1}, {slice_index + 1});
+        auto stride = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
+        auto axes = ov::op::v0::Constant::create(ov::element::i64, {1}, {slice_axis});
         auto sliced = std::make_shared<ov::op::v8::Slice>(input, begin, end, stride, axes);
-        return {sliced};
+
+        if (context.get_op_dynamic_dim() != -1) {
+            dst_shape[3 - context.get_op_dynamic_dim()] = -1;
+        }
+
+        auto reshaped = std::make_shared<ov::op::v1::Reshape>(
+            sliced,
+            ov::op::v0::Constant::create(ov::element::i64, {dst_shape.size()}, dst_shape),
+            false);
+        return rename_outputs_with_suffix({reshaped}, context.get_name());
     }
     return {context.get_input(0)};
 }
diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp
index 2b62e969702..24384fcf674 100644
--- a/ggml/src/ggml-openvino/utils.cpp
+++ b/ggml/src/ggml-openvino/utils.cpp
@@ -554,7 +554,7 @@ enum ggml_status naive_compute(ggml_cgraph * cgraph,
                                ov::Core & core,
                                const std::string & device,
                                const ov::AnyMap & config) {
-    if (cgraph->n_nodes == 1 && (cgraph->nodes[0]->op == GGML_OP_NONE || cgraph->nodes[0]->op == GGML_OP_VIEW)) {
+    if (cgraph->n_nodes == 1 && (cgraph->nodes[0]->op == GGML_OP_NONE)) {
         return GGML_STATUS_SUCCESS;
     }
 

From 1f1d90006c8efc4ec005be3fb48903a1a6ee19d9 Mon Sep 17 00:00:00 2001
From: Zijun Yu <zijun.yu@intel.com>
Date: Thu, 2 Apr 2026 13:54:37 +0800
Subject: [PATCH 21/24] Enable -fa off (#118)

---
 ggml/src/ggml-openvino/ggml-decoder.cpp       | 60 +++++++++++++------
 ggml/src/ggml-openvino/ggml-decoder.h         | 10 ++--
 .../openvino/op/flash_attn_ext.cpp            | 18 +++---
 ggml/src/ggml-openvino/openvino/op/mulmat.cpp | 12 ++--
 .../src/ggml-openvino/openvino/op/permute.cpp | 42 +++++++++----
 .../src/ggml-openvino/openvino/op/reshape.cpp | 10 +++-
 .../ggml-openvino/openvino/op/set_rows.cpp    |  4 +-
 .../src/ggml-openvino/openvino/op/softmax.cpp | 10 ++++
 .../openvino/translate_session.cpp            | 28 +++++----
 9 files changed, 128 insertions(+), 66 deletions(-)

diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp
index 8787a51cfc4..9d2cf60cf60 100644
--- a/ggml/src/ggml-openvino/ggml-decoder.cpp
+++ b/ggml/src/ggml-openvino/ggml-decoder.cpp
@@ -1,20 +1,15 @@
 #include "ggml-decoder.h"
 
-#include "ggml-backend-impl.h"
-#include "ggml-backend.h"
+#include "ggml-impl.h"
 #include "ggml-openvino-extra.h"
 #include "ggml-openvino.h"
 #include "ggml-quants.h"
 
-#include <ggml-impl.h>
-#include <ggml.h>
-
 #include <algorithm>
 #include <cassert>
 #include <cstddef>
 #include <cstdint>
 #include <cstdlib>
-#include <execution>
 #include <fstream>
 #include <iomanip>
 #include <map>
@@ -30,12 +25,10 @@
 #include <openvino/op/convert.hpp>
 #include <openvino/op/parameter.hpp>
 #include <openvino/runtime/tensor.hpp>
-#include <optional>
 #include <ostream>
 #include <set>
 #include <stdexcept>
 #include <string>
-#include <unordered_map>
 #include <vector>
 
 GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph,
@@ -159,7 +152,7 @@ int GgmlOvDecoder::compute_op_case(const ggml_tensor * node) const {
             if (src->ne[2] * src->ne[3] == node->ne[1]) {
                 op_case = 5;
             }
-        } else if (src->ne[0] * src->ne[1] == node->ne[1]) {
+        } else if (src->ne[0] * src->ne[1] * src->ne[2] == node->ne[1]) {
             op_case = 3;
         } else if (src->ne[1] * src->ne[2] == node->ne[1]) {
             op_case = 6;
@@ -173,20 +166,40 @@ int GgmlOvDecoder::compute_op_case(const ggml_tensor * node) const {
             // kv cache tensor
             std::string src_name(node->view_src->name);
             int layer = extract_layer_from_name(src_name);
-            if (!is_swa_layer(layer)) {
-                op_case = 2;
+            if (ggml_is_contiguous(node->src[0])) {
+                // -  19: [    64,     8,   256,     1] VIEW            cache_k_l0 (view)             [ 2,   128,  1024, 1048576]
+                //         [   512,  1024,     1,     1]      0: NONE     cache_k_l0                    [ 2,  1024, 1048576, 1048576]
+                // -  20: [    64,   256,     8,     1] PERMUTE         cache_k_l0 (view) (permuted)  [ 2,  1024,   128, 1048576]
+                //         [    64,     8,   256,     1]      0: VIEW     cache_k_l0 (view)             [ 2,   128,  1024, 1048576]
+                if (!is_swa_layer(layer)) {
+                    op_case = 3;
+                } else {
+                    op_case = 4;
+                }
             } else {
-                op_case = 3;
+                // special case of cache v when `-fa off`
+                // -  17: [   256,     8,    64,     1] VIEW            cache_v_l0 (view)             [ 2, 131072,  2048, 1048576]
+                //         [   512,  1024,     1,     1]      0: NONE     cache_v_l0                   [ 2,  1024, 1048576, 1048576]
+                // -  18: [   256,    64,     8,     1] PERMUTE         cache_v_l0 (view) (permuted)  [ 2,  2048, 131072, 1048576]
+                //         [   256,     8,    64,     1]      0: VIEW     cache_v_l0 (view)            [ 2, 131072,  2048, 1048576]
+                if (!is_swa_layer(layer)) {
+                    op_case = 5;
+                } else {
+                    op_case = 6;
+                }
             }
         } else {
             // rope'ed query tensor
-            op_case = 4;
+            op_case = 2;
         }
         break;
     }
     case GGML_OP_MUL_MAT: {
         if (node->src[0]->op == GGML_OP_VIEW && node->src[1]->op == GGML_OP_VIEW) {
             op_case = 3;
+        } else if (node->src[1]->op == GGML_OP_SOFT_MAX) {
+            // In the case of `-fa off`, softmax is used, v_trans=true, the dynamic dim is ne[0] for cache_v
+            op_case = 2;
         }
         break;
     }
@@ -273,13 +286,20 @@ std::pair<ModelParams, ComputeParams> GgmlOvDecoder::compute_llm_params(ggml_cgr
     for (int i = 0; i < cgraph->n_nodes; i++) {
         auto * node = cgraph->nodes[i];
         std::string name = std::string(node->name);
-        if (node->op == GGML_OP_FLASH_ATTN_EXT) {
-            model_params.n_heads = node->src[0]->ne[2];
-            model_params.n_heads_kv = node->src[1]->ne[2];
-            model_params.head_size = node->src[0]->ne[0];
+        if (node->op == GGML_OP_FLASH_ATTN_EXT || node->op == GGML_OP_SOFT_MAX) {
             compute_params.input_len = node->src[0]->ne[1];
 
+            auto * q_perm = node->src[0];
             auto * cache_k_perm = node->src[1];
+            if (node->op == GGML_OP_SOFT_MAX) {
+                q_perm = node->src[0]->src[1];
+                cache_k_perm = node->src[0]->src[0];
+            }
+            model_params.head_size = cache_k_perm->ne[0];
+            model_params.n_heads_kv = cache_k_perm->ne[2];
+            model_params.n_heads = q_perm->ne[2];
+            compute_params.token_len_per_seq = q_perm->ne[1];
+
             if (cache_k_perm->op == GGML_OP_CPY) {
                 cache_k_perm = cache_k_perm->src[0];
             }
@@ -289,7 +309,11 @@ std::pair<ModelParams, ComputeParams> GgmlOvDecoder::compute_llm_params(ggml_cgr
 
             auto * cache_k = cache_k_view->src[0];
             int layer = extract_layer_from_name(cache_k->name);
+
             auto * mask = node->src[3];
+            if (node->op == GGML_OP_SOFT_MAX) {
+                mask = node->src[1];
+            }
             std::string mask_name(mask->name);
 
             model_params.kv_buffer_ctx_id = ggml_backend_openvino_buffer_get_ctx_id(cache_k->buffer);
@@ -306,7 +330,6 @@ std::pair<ModelParams, ComputeParams> GgmlOvDecoder::compute_llm_params(ggml_cgr
             size_t offset;
             memcpy(&offset, cache_k_view->op_params, sizeof(size_t));
             compute_params.seq_active_start = offset / seq_size;
-            compute_params.token_len_per_seq = node->ne[2];
 
             if (mask_name.find("swa") != std::string::npos) {
                 compute_params.attention_size_swa = mask->ne[0];
@@ -318,7 +341,6 @@ std::pair<ModelParams, ComputeParams> GgmlOvDecoder::compute_llm_params(ggml_cgr
                 compute_params.attention_size_swa = model_params.ctx_per_seq_swa;
                 compute_params.token_len_per_seq = 1;
             }
-            break;
         }
         // if the node op is TRANSPOSE and its input is PERMUTE and the source of the PERMUTE is VIEW, then get the attention size with the TRANSPOSE node ne[0] (in case no GGML_OP_FLASH_ATTN_EXT)
         if (node->op == GGML_OP_TRANSPOSE && node->src[0]->op == GGML_OP_PERMUTE &&
diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h
index 1a7849c5251..ff8f81e8ae6 100644
--- a/ggml/src/ggml-openvino/ggml-decoder.h
+++ b/ggml/src/ggml-openvino/ggml-decoder.h
@@ -1,6 +1,7 @@
 #pragma once
 
-#include "ggml-quants.h"
+#include "ggml-backend-impl.h"
+#include "ggml-backend.h"
 #include "ggml.h"
 #include "openvino/decoder.h"
 
@@ -9,7 +10,6 @@
 #include <map>
 #include <memory>
 #include <openvino/core/partial_shape.hpp>
-#include <optional>
 #include <vector>
 
 struct ModelParams {
@@ -239,7 +239,8 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
     }
 
     inline static bool is_inp_mask(const ggml_tensor * tensor, const ggml_tensor * op) {
-        return op->op == GGML_OP_CPY || (op->op == GGML_OP_FLASH_ATTN_EXT && tensor == op->src[3]);
+        return op->op == GGML_OP_CPY || (op->op == GGML_OP_FLASH_ATTN_EXT && tensor == op->src[3]) ||
+               (op->op == GGML_OP_SOFT_MAX && tensor == op->src[1]);
     }
 
     inline static bool is_rope_freqs_weight(const ggml_tensor * tensor, const ggml_tensor * op) {
@@ -247,7 +248,8 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
     }
 
     inline static bool is_kvcache(const ggml_tensor * tensor, const ggml_tensor * op) {
-        return op->op == GGML_OP_SET_ROWS && op->src[2] == tensor;
+        return (op->op == GGML_OP_SET_ROWS && op->src[2] == tensor) ||
+               tensor->buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY;
     }
 
     inline static bool is_kv_idx(const ggml_tensor * tensor, const ggml_tensor * op) {
diff --git a/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp b/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp
index 42602a730a4..059556107ef 100644
--- a/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp
@@ -34,23 +34,19 @@ OutputVector translate_flash_attn_ext(const NodeContext & context) {
     auto q = std::make_shared<ov::op::v0::Convert>(q_f32, ov::element::f16);
     auto scale_node = std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{}, std::vector<float>{scale});
 
-    ov::Output<ov::Node> mask_sliced, res;
+    ov::Output<ov::Node> res;
+
+    // For stateful
     std::string mask_name = "KQ_mask_sliced";
     if (context.get_input_names()[3].find("swa") != std::string::npos) {
         mask_name = "KQ_mask_swa_sliced";
     }
     if (context.has_input(mask_name)) {
-        mask_sliced = context.get_input(mask_name);
-    } else {
-        auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
-        auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
-        auto two = ov::op::v0::Constant::create(ov::element::i64, {1}, {2});
-        auto token_len = get_dimensions(q, {2});
-        mask_sliced = std::make_shared<ov::op::v8::Slice>(mask, zero, token_len, one, two);
+        mask = context.get_input(mask_name);
     }
 
-    if (mask_sliced.get_element_type() != ov::element::f16) {
-        mask_sliced = std::make_shared<ov::op::v0::Convert>(mask_sliced, ov::element::f16);
+    if (mask.get_element_type() != ov::element::f16) {
+        mask = std::make_shared<ov::op::v0::Convert>(mask, ov::element::f16);
     }
 
     auto tile_kv = [&](int64_t num_heads, int64_t num_heads_kv, int64_t head_size, ov::Output<Node> kv) {
@@ -77,7 +73,7 @@ OutputVector translate_flash_attn_ext(const NodeContext & context) {
     k = tile_kv(q_shape[1], k_shape[1], q_shape[3], k);
     v = tile_kv(q_shape[1], k_shape[1], q_shape[3], v);
 
-    auto sdpa = std::make_shared<ov::op::v13::ScaledDotProductAttention>(q, k, v, mask_sliced, scale_node, false);
+    auto sdpa = std::make_shared<ov::op::v13::ScaledDotProductAttention>(q, k, v, mask, scale_node, false);
     res = std::make_shared<ov::op::v1::Transpose>(sdpa,
                                                   ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 2, 1, 3}));
     res = std::make_shared<ov::op::v0::Convert>(res, ov::element::f32);
diff --git a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp
index 38edec85ddf..71cf1fd17aa 100644
--- a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp
@@ -34,10 +34,7 @@ OutputVector translate_mulmat(const NodeContext & context) {
     ov::Output<ov::Node> A = context.get_input(1);
 
     bool transpose_b = true;
-    if (op_case == 2) {
-        B = B.get_node_shared_ptr()->input_value(0);
-        transpose_b = false;
-    } else if (op_case == 3) {
+    if (op_case == 3) {
         B = process_view_input(context, 0);
         A = process_view_input(context, 1);
     }
@@ -55,6 +52,7 @@ OutputVector translate_mulmat(const NodeContext & context) {
     auto batch_small = A_batch_larger ? B_batch : A_batch;
 
     Output<Node> Z = A_batch_larger ? B : A;
+    auto Z_shape = A_batch_larger ? B_shape : A_shape;
     int64_t factor = batch_large / batch_small;
     if (factor > 1 && batch_small > 1) {
         auto batch_large_node = ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector<int64_t>{batch_large});
@@ -67,7 +65,11 @@ OutputVector translate_mulmat(const NodeContext & context) {
         auto broadcast_shape = ov::op::v0::Constant::create(
             ov::element::i64, {5}, {(int64_t) 1, (int64_t) 1, factor, (int64_t) 1, (int64_t) 1});
         auto new_Z_shape = ov::op::v0::Constant::create(ov::element::i64, {4},
-                                                        {(int64_t) 0, batch_large, (int64_t) -1, (int64_t) A_shape[3]});
+                                                        {(int64_t) 0, batch_large, (int64_t) -1, (int64_t) Z_shape[3]});
+        if (op_case == 2) {
+            new_Z_shape = ov::op::v0::Constant::create(ov::element::i64, {4},
+                                                       {(int64_t) 0, batch_large, (int64_t) Z_shape[2], (int64_t) -1});
+        }
 
         auto Z_broadcasted = std::make_shared<ov::op::v3::Broadcast>(Z_unsqueezed, broadcast_shape,
                                                                      ov::op::BroadcastType::BIDIRECTIONAL);
diff --git a/ggml/src/ggml-openvino/openvino/op/permute.cpp b/ggml/src/ggml-openvino/openvino/op/permute.cpp
index 269fd99f36f..a9a3800e663 100644
--- a/ggml/src/ggml-openvino/openvino/op/permute.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/permute.cpp
@@ -23,8 +23,11 @@ OutputVector translate_permute(const NodeContext & context) {
     num_inputs_check(context, 1, 1);
 
     int op_case = context.get_op_case();
-    FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2 || op_case == 3 || op_case == 4,
-                                "Unsupported PERMUTE case");
+    FRONT_END_CHECK_IMPLEMENTED(op_case != 0, "Unsupported PERMUTE case");
+    // op_case 1 is trivial permute
+    // op_case 2 is to permute Q. It has a preceding VIEW that reshapes Q to restore the sequence dimension
+    // op_case 3 and 4 are to permute the KV cache in the default layout
+    // op_case 5 and 6 are to permute the V cache when `-fa off`, where v_trans=true
 
     ov::Output<Node> res;
     auto src = context.get_input(0);
@@ -39,7 +42,7 @@ OutputVector translate_permute(const NodeContext & context) {
 
     if (op_case == 1 || context.is_stateful()) {
         res = std::make_shared<ov::op::v1::Transpose>(src, perm);
-    } else if (op_case == 4) {
+    } else if (op_case == 2) {
         auto output_shape = context.get_output_shape().to_shape();
         auto n_heads = ov::op::v0::Constant::create(ov::element::i64, {1}, {output_shape[1]});
         auto head_size = ov::op::v0::Constant::create(ov::element::i64, {1}, {output_shape[3]});
@@ -62,13 +65,17 @@ OutputVector translate_permute(const NodeContext & context) {
         auto output_shape = context.get_output_shape().to_shape();
         int64_t head_size = output_shape[3];
         int64_t n_heads = output_shape[1];
+        if (op_case == 5 || op_case == 6) {
+            head_size = output_shape[2];
+            n_heads = output_shape[1];
+        }
         int64_t ctx_per_seq = cache_shape[2].is_static() ? cache_shape[2].get_length() : -1;
         int64_t n_seq = cache_shape[1].get_length();
 
         Output<Node> attention_size;
         if (!context.has_input("attention_size")) {
             attention_size = ov::op::v0::Constant::create(ov::element::i64, {1}, {output_shape[2]});
-        } else if (op_case == 2) {
+        } else if (op_case == 3 || op_case == 5) {
             attention_size = context.get_input("attention_size");
         } else {
             attention_size = context.get_input("attention_size_swa");
@@ -88,18 +95,31 @@ OutputVector translate_permute(const NodeContext & context) {
             seq_active_end = ov::op::v0::Constant::create(ov::element::i64, {1}, {seq_active_end_val});
         }
 
-        // 1. reshape to [n_seq, ctx_per_seq, n_heads, head_size]
+        // 1. reshape to [n_seq, ctx_per_seq, n_heads, head_size] (for `-fa off` [n_seq, n_heads, head_size, ctx_per_seq])
         // 2. slice out the active sequences
         // 3. slice out the attention part in each sequence
-        // 4. permute
+        // 4. permute (skip for `-fa off`)
         auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
         auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
 
-        auto src_reshaped = std::make_shared<ov::op::v1::Reshape>(
-            src, ov::op::v0::Constant::create(ov::element::i64, {4}, {n_seq, ctx_per_seq, n_heads, head_size}), false);
-        auto slice1 = std::make_shared<ov::op::v8::Slice>(src_reshaped, seq_active_start, seq_active_end, one, zero);
-        auto slice2 = std::make_shared<ov::op::v8::Slice>(slice1, zero, attention_size, one, one);
-        res = std::make_shared<ov::op::v1::Transpose>(slice2, perm);
+        if (op_case == 3 || op_case == 4) {
+            auto src_reshaped = std::make_shared<ov::op::v1::Reshape>(
+                src, ov::op::v0::Constant::create(ov::element::i64, {4}, {n_seq, ctx_per_seq, n_heads, head_size}),
+                false);
+            auto slice1 =
+                std::make_shared<ov::op::v8::Slice>(src_reshaped, seq_active_start, seq_active_end, one, zero);
+            auto slice2 = std::make_shared<ov::op::v8::Slice>(slice1, zero, attention_size, one, one);
+            res = std::make_shared<ov::op::v1::Transpose>(slice2, perm);
+        } else {
+            auto three = ov::op::v0::Constant::create(ov::element::i64, {1}, {3});
+            auto src_reshaped = std::make_shared<ov::op::v1::Reshape>(
+                src, ov::op::v0::Constant::create(ov::element::i64, {4}, {n_seq, n_heads, head_size, ctx_per_seq}),
+                false);
+            auto slice1 =
+                std::make_shared<ov::op::v8::Slice>(src_reshaped, seq_active_start, seq_active_end, one, zero);
+            auto slice2 = std::make_shared<ov::op::v8::Slice>(slice1, zero, attention_size, one, three);
+            res = slice2;
+        }
     }
     return rename_outputs_with_suffix({res}, context.get_name());
 }
diff --git a/ggml/src/ggml-openvino/openvino/op/reshape.cpp b/ggml/src/ggml-openvino/openvino/op/reshape.cpp
index efd9a5a860a..2a1a082d863 100644
--- a/ggml/src/ggml-openvino/openvino/op/reshape.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/reshape.cpp
@@ -10,7 +10,6 @@
 #include <openvino/op/concat.hpp>
 #include <openvino/op/constant.hpp>
 #include <openvino/op/reshape.hpp>
-#include <stdexcept>
 #include <vector>
 
 namespace ov {
@@ -47,7 +46,14 @@ OutputVector translate_reshape(const NodeContext & context) {
             std::vector<int64_t>{(int64_t) output_shape[0], (int64_t) output_shape[1], -1, (int64_t) output_shape[3]});
 
     } else if (op_case == 3) {
-        throw std::runtime_error("might be outdated RESHAPE case");
+        //  -  14: [     1,  1024,     1,     1] RESHAPE              Vcur-0 (reshaped) (reshaped)
+        //         [   512,     2,     1,     1]            0: RESHAPE     Vcur-0 (reshaped)
+        //  -  15: [     1, 524288,     1,     1] RESHAPE              cache_v_l0 (reshaped)
+        //         [   512,  1024,     1,     1]            0: NONE        cache_v_l0
+        //  -  16: [     1, 524288,     1,     1] SET_ROWS             cache_v_l0 (reshaped) (view)
+        //         [     1,  1024,     1,     1]            0: RESHAPE     Vcur-0 (reshaped) (reshaped)
+        //         [  1024,     1,     1,     1]            1: NONE        leaf_11
+        //         [     1, 524288,     1,     1]            2: RESHAPE     cache_v_l0 (reshaped)
         new_shape_node = ov::op::v0::Constant::create(
             ov::element::i64, {4}, std::vector<int64_t>{(int64_t) output_shape[0], (int64_t) output_shape[1], -1, 1});
 
diff --git a/ggml/src/ggml-openvino/openvino/op/set_rows.cpp b/ggml/src/ggml-openvino/openvino/op/set_rows.cpp
index 136e4265b42..9f2b841b19c 100644
--- a/ggml/src/ggml-openvino/openvino/op/set_rows.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/set_rows.cpp
@@ -34,14 +34,14 @@ OutputVector translate_set_rows(const NodeContext & context) {
 
     data = std::make_shared<ov::op::v0::Convert>(data, context.get_output_type());
 
-    auto dst_shape = context.get_output_shape().to_shape();
+    auto row_size = context.get_input_shape(2)[3].get_length();
 
     auto ind_squeezed =
         std::make_shared<ov::op::v0::Squeeze>(indices, ov::op::v0::Constant::create(ov::element::i64, {3}, {0, 1, 2}));
     auto data_reshaped = std::make_shared<ov::op::v1::Reshape>(
         data,
         ov::op::v0::Constant::create(ov::element::i64, {4},
-                                     {(int64_t) 1, (int64_t) 1, (int64_t) -1, (int64_t) dst_shape[3]}),
+                                     {(int64_t) 1, (int64_t) 1, (int64_t) -1, (int64_t) row_size}),
         false);
     auto axes = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {2});
 
diff --git a/ggml/src/ggml-openvino/openvino/op/softmax.cpp b/ggml/src/ggml-openvino/openvino/op/softmax.cpp
index 6b3a679c6db..3f3dd5e548d 100644
--- a/ggml/src/ggml-openvino/openvino/op/softmax.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/softmax.cpp
@@ -47,6 +47,16 @@ OutputVector translate_soft_max(const NodeContext & context) {
     // For max_bias > 0 (ALiBi), apply per-head slope to mask before adding.
     if (context.get_input_size() > 1) {
         ov::Output<ov::Node> mask = context.get_input(1);
+
+        // For stateful
+        std::string mask_name = "KQ_mask_sliced";
+        if (context.get_input_names()[1].find("swa") != std::string::npos) {
+            mask_name = "KQ_mask_swa_sliced";
+        }
+        if (context.has_input(mask_name)) {
+            mask = context.get_input(mask_name);
+        }
+
         if (mask.get_element_type() != logits.get_element_type()) {
             mask = std::make_shared<ov::op::v0::Convert>(mask, logits.get_element_type());
         }
diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp
index 0f68a1f5062..8283777cdd0 100644
--- a/ggml/src/ggml-openvino/openvino/translate_session.cpp
+++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp
@@ -13,6 +13,7 @@
 #include <memory>
 #include <openvino/core/node.hpp>
 #include <openvino/core/preprocess/pre_post_process.hpp>
+#include <openvino/core/type/element_type.hpp>
 #include <openvino/op/add.hpp>
 #include <openvino/op/broadcast.hpp>
 #include <openvino/op/concat.hpp>
@@ -88,19 +89,22 @@ void add_sliced_mask(TensorMap & tensor_map, GgmlDecoder & ggml_model_decoder) {
             if (is_static) {
                 mask_sliced = mask;
             } else if (ggml_model_decoder.is_stateful()) {
-                auto zero_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {0,0});
-                auto one_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {1,1});
-                auto zero_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
-                auto three_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {3});
-                auto neg_one_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {-1});
-                auto axes = ov::op::v0::Constant::create(ov::element::i64, {2}, {-2,-1});
+                auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
+                auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
+                auto three = ov::op::v0::Constant::create(ov::element::i64, {1}, {3});
+                auto neg_one = ov::op::v0::Constant::create(ov::element::i64, {1}, {-1});
+
+                auto step = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
+                auto axes = ov::op::v0::Constant::create(ov::element::i64, {1}, {-1});
+
                 auto inp_pos = tensor_map.at("inp_pos").get_node_shared_ptr();
-                auto gather_inp_pos = std::make_shared<ov::op::v8::Gather>(inp_pos, neg_one_1d, three_1d);
-                auto reshaped_inp_pos = std::make_shared<ov::op::v1::Reshape>(gather_inp_pos, ov::op::v0::Constant::create(ov::element::i64, {1}, {1}), false);
-                auto inp_pos_incremented = std::make_shared<ov::op::v1::Add>(reshaped_inp_pos, ov::op::v0::Constant::create(ov::element::i32, ov::Shape{1}, {1}));
-                auto stop = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{token_len_per_seq, std::make_shared<v1::ConvertLike>(inp_pos_incremented, token_len_per_seq)}, 0);
-                mask_sliced =
-                    std::make_shared<ov::op::v8::Slice>(mask, zero_2d, stop, one_2d, axes);
+                auto last_inp_pos = std::make_shared<ov::op::v8::Gather>(inp_pos, neg_one, three);
+                auto last_inp_pos_1d = std::make_shared<ov::op::v1::Reshape>(
+                    last_inp_pos, ov::op::v0::Constant::create(ov::element::i64, {1}, {1}), false);
+                auto last_inp_pos_cvt = std::make_shared<ov::op::v0::Convert>(last_inp_pos_1d, ov::element::i64);
+                auto last_inp_pos_inc = std::make_shared<ov::op::v1::Add>(last_inp_pos_cvt, one);
+
+                mask_sliced = std::make_shared<ov::op::v8::Slice>(mask, zero, last_inp_pos_inc, step, axes);
                 mask_sliced = std::make_shared<ov::op::v0::Convert>(mask_sliced, ov::element::f16);
                 mask_sliced->set_friendly_name(sliced_name);
             } else {

From 8c3ff16385f030fa86000bfe68142130eb011128 Mon Sep 17 00:00:00 2001
From: "Yu, Zijun" <zijun.yu@intel.com>
Date: Fri, 10 Apr 2026 12:48:10 +0530
Subject: [PATCH 22/24] Enable --context-shift

---
 ggml/src/ggml-openvino/ggml-openvino.cpp    |  2 +-
 ggml/src/ggml-openvino/openvino/op/rope.cpp |  9 +++++++++
 ggml/src/ggml-openvino/utils.cpp            | 20 +++++++++++++++-----
 3 files changed, 25 insertions(+), 6 deletions(-)

diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp
index 49e2172ad3b..e9ff724042d 100644
--- a/ggml/src/ggml-openvino/ggml-openvino.cpp
+++ b/ggml/src/ggml-openvino/ggml-openvino.cpp
@@ -881,7 +881,7 @@ static bool is_op_unsupported_case(const ggml_tensor * op) {
             //               op->src[0]->ne[0]);
             return true;
         }
-        if (op->type != GGML_TYPE_F32) {
+        if (op->type != GGML_TYPE_F32 && op->type != GGML_TYPE_F16) {
             // GGML_LOG_WARN("OpenVINO backend does not support ROPE with type %s\n", ggml_type_name(op->type));
             return true;
         }
diff --git a/ggml/src/ggml-openvino/openvino/op/rope.cpp b/ggml/src/ggml-openvino/openvino/op/rope.cpp
index 71fd90fae36..1954154835c 100644
--- a/ggml/src/ggml-openvino/openvino/op/rope.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/rope.cpp
@@ -75,6 +75,11 @@ OutputVector translate_rope(const NodeContext & context) {
         }
     }
 
+    auto output_type = context.get_output_type();
+    if (data_node->get_element_type() != ov::element::f32) {
+        data_node = std::make_shared<ov::op::v0::Convert>(data_node, ov::element::f32);
+    }
+
     if (mode == ROPE_TYPE_NORMAL) {
         auto neg_one = ov::op::v0::Constant::create(ov::element::i64, {1}, {-1});
         auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
@@ -139,6 +144,10 @@ OutputVector translate_rope(const NodeContext & context) {
         res = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{sub, add}, 3);
     }
 
+    if (res.get_element_type() != output_type) {
+        res = std::make_shared<ov::op::v0::Convert>(res, output_type);
+    }
+
     return rename_outputs_with_suffix({res}, context.get_name());
 }
 
diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp
index 24384fcf674..b034dc79469 100644
--- a/ggml/src/ggml-openvino/utils.cpp
+++ b/ggml/src/ggml-openvino/utils.cpp
@@ -283,17 +283,23 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptr<o
     auto & core = ov_singleton_core();
 
     auto get_prefill_chunk_size = [] {
-        const char * chunk_size_str = getenv("GGML_OPENVINO_PREFILL_CHUNK_SIZE");
-        if (chunk_size_str && atoi(chunk_size_str) > 0) {
-            return atoi(chunk_size_str);
+        static int chunk_size = -1;
+        if (chunk_size == -1) {
+            const char * chunk_size_str = getenv("GGML_OPENVINO_PREFILL_CHUNK_SIZE");
+            if (chunk_size_str && atoi(chunk_size_str) > 0) {
+                chunk_size = atoi(chunk_size_str);
+            } else {
+                chunk_size = 256;
+            }
         }
-        return 256;
+        return chunk_size;
     };
 
     static std::string device = "NPU";
     static auto is_static = true;
     static auto stateful = false;
-    static auto prefill_chunk_size = get_prefill_chunk_size();
+
+    auto prefill_chunk_size = get_prefill_chunk_size();
     const auto & config = ggml_openvino_get_compile_config();
 
     if (is_naive(cgraph)) {
@@ -357,6 +363,10 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptr<o
         std::shared_ptr<ov::Model> model;
         auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph);
 
+        if (m_params.n_heads == -1) {
+            // graph is not an LLM, e.g. context-shift graph
+            prefill_chunk_size = inp_pos->ne[0];
+        }
         auto ggml_decoder_prefill = std::make_shared<GgmlOvDecoder>(cgraph, m_params, c_params, model_weights,
                                                                     is_static, stateful, false, true, prefill_chunk_size);
         auto ggml_decoder_decode = std::make_shared<GgmlOvDecoder>(cgraph, m_params, c_params, model_weights, is_static,

From 3e67742d1c5815f196ce06f15860955708e5d4c3 Mon Sep 17 00:00:00 2001
From: Xuejun <Xuejun.Zhai@intel.com>
Date: Sun, 12 Apr 2026 22:18:36 -0700
Subject: [PATCH 23/24] Fix llm param compute error for normal softmax not the
 softmax in attention

---
 ggml/src/ggml-openvino/ggml-decoder.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp
index 9d2cf60cf60..d75915cd00d 100644
--- a/ggml/src/ggml-openvino/ggml-decoder.cpp
+++ b/ggml/src/ggml-openvino/ggml-decoder.cpp
@@ -286,7 +286,7 @@ std::pair<ModelParams, ComputeParams> GgmlOvDecoder::compute_llm_params(ggml_cgr
     for (int i = 0; i < cgraph->n_nodes; i++) {
         auto * node = cgraph->nodes[i];
         std::string name = std::string(node->name);
-        if (node->op == GGML_OP_FLASH_ATTN_EXT || node->op == GGML_OP_SOFT_MAX) {
+        if (node->op == GGML_OP_FLASH_ATTN_EXT || (node->op == GGML_OP_SOFT_MAX && node->src[1] != nullptr)) {
             compute_params.input_len = node->src[0]->ne[1];
 
             auto * q_perm = node->src[0];

From ea4d4b68780f4388b2693e1298c1f82e4dd062db Mon Sep 17 00:00:00 2001
From: Xuejun <Xuejun.Zhai@intel.com>
Date: Mon, 30 Mar 2026 22:23:11 -0700
Subject: [PATCH 24/24] openvino backend: enable OpenVINO backend fallback to
 CPU backend

---
 ggml/include/ggml.h                     |  4 +++-
 ggml/src/ggml-backend.cpp               | 20 +++++++++++++++++---
 ggml/src/ggml-openvino/ggml-decoder.cpp |  8 ++++----
 ggml/src/ggml-openvino/ggml-decoder.h   |  2 +-
 ggml/src/ggml.c                         |  1 +
 5 files changed, 26 insertions(+), 9 deletions(-)

diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 11d3e8a8167..96cac71c68a 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -688,7 +688,9 @@ extern "C" {
 
         void * extra; // extra things e.g. for ggml-cuda.cu
 
-        char padding[8];
+        char padding[16];
+        // original source tensor (NULL by default); the backend scheduler sets it on tensor copies so the original source can be tracked, e.g. for in-place operations
+        struct ggml_tensor * org_src;
     };
 
     static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index 22c656996cc..7c01bf0b6bf 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -1124,8 +1124,11 @@ void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgra
         struct ggml_tensor * node = graph->nodes[i];
         int * cur_backend_id = &tensor_backend_id(node);
         if (node->view_src != NULL && *cur_backend_id == -1) {
-            *cur_backend_id = tensor_backend_id(node->view_src);
-            SET_CAUSE(node, "4.vsrc");
+            auto view_src_backend = tensor_backend_id(node->view_src);
+            if (view_src_backend != -1 && ggml_backend_supports_op(sched->backends[view_src_backend], node)) {
+                *cur_backend_id = tensor_backend_id(node->view_src);
+                SET_CAUSE(node, "4.vsrc");
+            }
         }
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * src = node->src[j];
@@ -1151,6 +1154,14 @@ void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgra
         GGML_ASSERT(*cur_backend_id != -1);
     }
 
+    // add the node id to the name for easier debugging
+    for (int i = 0; i < graph->n_nodes; i++) {
+        struct ggml_tensor * node = graph->nodes[i];
+        char                 new_name[128];
+        snprintf(new_name, sizeof(new_name), "%s#%d", node->name, i);
+        ggml_format_name(node, "%s", new_name);
+    }
+
     // pass 5: split graph, find tensors that need to be copied
     {
         int i_split = 0;
@@ -1171,7 +1182,9 @@ void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgra
             struct ggml_tensor * node = graph->nodes[i];
 
             if (ggml_is_view_op(node->op)) {
-                continue;
+                if ((tensor_backend_id(node) != cur_backend_id) && (ggml_backend_supports_op(sched->backends[cur_backend_id], node))) {
+                    tensor_backend_id(node) = cur_backend_id;
+                }
             }
 
             const int node_backend_id = tensor_backend_id(node);
@@ -1269,6 +1282,7 @@ void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgra
                                 ggml_set_input(tensor_copy);
                                 ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
                             }
+                            tensor_copy->org_src = src;
                             tensor_id_copy(src_id, cur_backend_id, c) = tensor_copy;
                             SET_CAUSE(tensor_copy, "4.cpy");
                         }
diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp
index d75915cd00d..9753272038c 100644
--- a/ggml/src/ggml-openvino/ggml-decoder.cpp
+++ b/ggml/src/ggml-openvino/ggml-decoder.cpp
@@ -1057,9 +1057,9 @@ void GgmlOvDecoder::compute_node_dynamic_dims() {
                 continue;
             }
             struct ggml_tensor *root_src = nullptr;
-            // if (src->org_src) {
-            //     root_src = src->org_src;
-            // }
+            if (src->org_src) {
+                root_src = src->org_src;
+            }
             if (root_src) {
                 if (is_inp_tok(root_src, node) || is_inp_pos(root_src, node) ||
                     is_output_idx(root_src, node)) {
@@ -1139,7 +1139,7 @@ void GgmlOvDecoder::compute_node_dynamic_dims() {
             // identifies the dynamic dim even when two dims share the same size.
             m_node_dynamic_dims[node] = -1;
             if (m_node_dynamic_dims[node->src[0]] != -1) {
-                if (node->src[0]->op == GGML_OP_NONE) {
+                if (node->src[0]->op == GGML_OP_NONE && node->src[0]->org_src == nullptr) {
                     m_node_dynamic_dims[node] = m_node_dynamic_dims[node->src[0]];
                     break;
                 }
diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h
index ff8f81e8ae6..93e6973fb38 100644
--- a/ggml/src/ggml-openvino/ggml-decoder.h
+++ b/ggml/src/ggml-openvino/ggml-decoder.h
@@ -227,7 +227,7 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
     void update_io(ggml_cgraph * cgraph);
 
     inline static bool is_inp_tok(const ggml_tensor * tensor, const ggml_tensor * op) {
-        return op->op == GGML_OP_GET_ROWS && tensor == op->src[1] && op->src[0]->op == GGML_OP_NONE;
+        return op->op == GGML_OP_GET_ROWS && tensor == op->src[1] && op->src[0]->op == GGML_OP_NONE && op->src[0]->org_src == nullptr;
     }
 
     inline static bool is_inp_pos(const ggml_tensor * tensor, const ggml_tensor * op) {
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 0142498d967..41f3541da65 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -1767,6 +1767,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
         /*.name         =*/ { 0 },
         /*.extra        =*/ NULL,
         /*.padding      =*/ { 0 },
+        /*.org_src       =*/ NULL,
     };
 
     // TODO: this should not be needed as long as we don't rely on aligned SIMD loads