31 commits
d0f0fe0
Rename KVCache to Cache
apaniukov Feb 16, 2026
033016d
Update CacheTypes
apaniukov Feb 19, 2026
11e6915
Merge branch 'master' into lfm2-stateful-model
apaniukov Feb 20, 2026
3fd3196
WiP
apaniukov Feb 20, 2026
ee016d9
Update cache reset
apaniukov Feb 24, 2026
ccf1a81
Fix KVCache detection
apaniukov Feb 26, 2026
63dcbdc
Merge branch 'master' into lfm2-stateful-model
apaniukov Feb 26, 2026
5e46f5d
Add CPP Tests to CI
apaniukov Mar 2, 2026
29b8bca
Remove /proc/self/exe
apaniukov Mar 2, 2026
1d38a64
Pass Model By Reference
apaniukov Mar 3, 2026
e5fa1b5
Add Python Tests
apaniukov Mar 3, 2026
7145765
Del debug prints
apaniukov Mar 3, 2026
fb65aa6
Address review comments
apaniukov Mar 3, 2026
ee499ae
Lint fixes
apaniukov Mar 3, 2026
d8349de
Fix start_chat for linear attention
apaniukov Mar 9, 2026
f62d4f3
Merge branch 'master' into lfm2-stateful-model
apaniukov Mar 9, 2026
f383a74
Add warning for explicit PA backend
apaniukov Mar 9, 2026
6ec3ed8
Fix merge conflicts
apaniukov Mar 9, 2026
ab0861a
Extend tests
apaniukov Mar 9, 2026
5fbad0e
Rename var
apaniukov Mar 9, 2026
051c3d3
Extend Tests
apaniukov Mar 9, 2026
8ce6b85
Fix VLM attention mask size
apaniukov Mar 9, 2026
da0fb3b
Ruff format LLM Tests
apaniukov Mar 9, 2026
459ee97
Update model list
apaniukov Mar 10, 2026
8ad6d27
Update requirements
apaniukov Mar 10, 2026
66a50ec
Apply suggestions from code review
apaniukov Mar 10, 2026
e62f1a4
Merge branch 'master' into lfm2-stateful-model
apaniukov Mar 10, 2026
1e9147d
Address review
apaniukov Mar 10, 2026
6a5a4d6
Address Review
apaniukov Mar 10, 2026
2924ff5
Update Tests Requirements
apaniukov Mar 10, 2026
6199375
Fix Tests
apaniukov Mar 10, 2026
15 changes: 15 additions & 0 deletions .github/workflows/linux.yml
Collaborator: Extend other OS

Contributor Author: I don't think it is necessary; we only need to check this once, and it is platform-independent.

Collaborator: All the other tests are duplicated across platforms; it may be unexpected for this one to run only on Linux.

Contributor Author: I don't think this justifies expanding the testing to all platforms; it would be a waste of resources.

Collaborator: The time spent investigating why the other platforms don't run these tests would also be a waste.

@@ -868,12 +868,27 @@ jobs:
requirements_files: "${{ env.SRC_DIR }}/samples/requirements.txt;${{ env.SRC_DIR }}/tools/llm_bench/requirements.txt"
local_wheel_dir: ${{ env.INSTALL_DIR }}/wheels

- name: Convert models for cache types gtests
if: ${{ fromJSON(needs.smart_ci.outputs.affected_components).continuous_batching }}
run: |
while IFS=',' read -r model_id _rest; do
[[ -z "${model_id}" || "${model_id}" =~ ^[[:space:]]*# ]] && continue
model_name="${model_id##*/}"
model_dir="${{ env.HF_HOME }}/ov_test_models/${model_name}"
if [[ ! -f "${model_dir}/openvino_model.xml" ]]; then
optimum-cli export openvino -m "${model_id}" --task text-generation-with-past "${model_dir}"
fi
done < "${{ env.INSTALL_DIR }}/tests/data/cache_types_models.csv"

- name: gtests unit tests
if: ${{ fromJSON(needs.smart_ci.outputs.affected_components).continuous_batching }}
run: |
source ${{ env.INSTALL_DIR }}/setupvars.sh
chmod +x ${{ env.INSTALL_DIR }}/tests/tests_continuous_batching
${{ env.INSTALL_DIR }}/tests/tests_continuous_batching --gtest_filter="-AddSecondInputTest.*"
env:
TEST_MODELS_BASE_DIR: "${{ env.HF_HOME }}/ov_test_models"
CACHE_TYPES_CSV: "${{ env.INSTALL_DIR }}/tests/data/cache_types_models.csv"

- name: Test Continuous Batching Tools
if: ${{ fromJSON(needs.smart_ci.outputs.affected_components).continuous_batching }}
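The new gtest step passes model locations to the test binary through the `TEST_MODELS_BASE_DIR` and `CACHE_TYPES_CSV` environment variables, and the conversion loop above derives each model's directory from the part of the model id after the last `/`. A rough sketch of how a test might resolve a model directory the same way (the helper name and the fallback default are assumptions for illustration, not code from this PR):

```cpp
#include <cstdlib>
#include <filesystem>
#include <string>

// Hypothetical helper: resolve the on-disk directory for a model id such as
// "org/model-name", mirroring the CI layout ${HF_HOME}/ov_test_models/<name>.
// The "ov_test_models" fallback is an assumption, not taken from the PR.
std::filesystem::path resolve_test_model_dir(const std::string& model_id) {
    const char* base = std::getenv("TEST_MODELS_BASE_DIR");
    std::filesystem::path base_dir = base ? base : "ov_test_models";
    // Like the bash loop's ${model_id##*/}: keep only the last path segment.
    auto pos = model_id.find_last_of('/');
    std::string model_name =
        (pos == std::string::npos) ? model_id : model_id.substr(pos + 1);
    return base_dir / model_name;
}
```

A test would then check for `openvino_model.xml` inside the returned directory before loading, matching the existence check in the workflow step.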
2 changes: 1 addition & 1 deletion samples/export-requirements.txt
@@ -1,7 +1,7 @@
--extra-index-url https://download.pytorch.org/whl/cpu
--extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
openvino-tokenizers[transformers]~=2026.1.0.0.dev
https://github.com/huggingface/optimum-intel/archive/2c48d6430c265ac259c1b264f3e2c4025cdd7b76.tar.gz#egg=optimum-intel
https://github.com/huggingface/optimum-intel/archive/0566b76f094d4c3084e06d29a248b39a1bff3fa4.tar.gz#egg=optimum-intel
numpy==1.26.4; platform_system == "Darwin" and platform_machine == "x86_64"
safetensors==0.6.2; platform_system == "Darwin" and platform_machine == "x86_64"
einops==0.8.2 # For Qwen
54 changes: 48 additions & 6 deletions src/cpp/src/llm/pipeline.cpp
@@ -205,8 +205,22 @@ ov::genai::LLMPipeline::LLMPipeline(
bool is_npu_requested = ov::genai::utils::is_npu_requested(device, user_properties);
auto [properties, attention_backend] = utils::extract_attention_backend(user_properties, is_npu_requested);

const auto model = utils::read_model(models_path, properties);

// PA backend does not support linear attention states (conv/SSM caches).
if (attention_backend == PA_BACKEND && !is_npu_requested
&& utils::has_linear_attention_states(model)) {
if (utils::explicitly_requires_paged_attention(user_properties)
|| user_properties.find("ATTENTION_BACKEND") != user_properties.end()) {
GENAI_WARN("PA backend does not support models with linear attention states. The model may work incorrectly.");
} else {
attention_backend = SDPA_BACKEND;
}
}

const auto generation_config = utils::from_config_json_if_exists(models_path);
if (is_npu_requested) {
m_pimpl = StatefulPipeline::create(models_path, tokenizer, device, properties);
m_pimpl = StatefulPipeline::create(model, tokenizer, device, properties, generation_config, models_path);
} else if (utils::explicitly_requires_paged_attention(user_properties)) {
// If CB is invoked explicitly, create CB adapter as is and re-throw in case if internal issues
auto [device_properties, scheduler_config] = utils::extract_scheduler_config(properties, utils::get_latency_oriented_scheduler_config());
@@ -227,7 +241,7 @@ ov::genai::LLMPipeline::LLMPipeline(
if (m_pimpl == nullptr) {
// FIXME: Switch to StatefulPipeline::create after resolving issues
// with GPU and CPU for StatefulSpeculativeLLMPipeline
m_pimpl = std::make_unique<StatefulLLMPipeline>(models_path, tokenizer, device, properties);
m_pimpl = std::make_unique<StatefulLLMPipeline>(model, tokenizer, device, properties, generation_config);
}

m_pimpl->save_load_time(start_time);
@@ -243,8 +257,24 @@ ov::genai::LLMPipeline::LLMPipeline(
bool is_npu_requested = ov::genai::utils::is_npu_requested(device, user_properties);
auto [properties, attention_backend] = utils::extract_attention_backend(user_properties, is_npu_requested);

// Read model and create tokenizer once to avoid double I/O during pipeline construction.
const auto model = utils::read_model(models_path, properties);
const Tokenizer tokenizer(models_path, properties);

// PA backend does not support linear attention states (conv/SSM caches).
if (attention_backend == PA_BACKEND && !is_npu_requested
&& utils::has_linear_attention_states(model)) {
if (utils::explicitly_requires_paged_attention(user_properties)
|| user_properties.find("ATTENTION_BACKEND") != user_properties.end()) {
GENAI_WARN("PA backend does not support models with linear attention states. The model may work incorrectly.");
} else {
attention_backend = SDPA_BACKEND;
}
}

const auto generation_config = utils::from_config_json_if_exists(models_path);
if (is_npu_requested) {
m_pimpl = StatefulPipeline::create(models_path, device, properties);
m_pimpl = StatefulPipeline::create(model, tokenizer, device, properties, generation_config, models_path);
} else if (utils::explicitly_requires_paged_attention(user_properties)) {
// If CB is invoked explicitly, create CB adapter as is and re-throw in case if internal issues
auto [device_properties, scheduler_config] = utils::extract_scheduler_config(properties, utils::get_latency_oriented_scheduler_config());
@@ -265,7 +295,7 @@ ov::genai::LLMPipeline::LLMPipeline(
if (m_pimpl == nullptr) {
// FIXME: Switch to StatefulPipeline::create after resolving issues
// with GPU and CPU for StatefulSpeculativeLLMPipeline
m_pimpl = std::make_unique<StatefulLLMPipeline>(models_path, device, properties);
m_pimpl = std::make_unique<StatefulLLMPipeline>(model, tokenizer, device, properties, generation_config);
}

m_pimpl->save_load_time(start_time);
@@ -284,9 +314,21 @@ ov::genai::LLMPipeline::LLMPipeline(
bool is_npu_requested = ov::genai::utils::is_npu_requested(device, user_properties);
auto [properties, attention_backend] = utils::extract_attention_backend(user_properties, is_npu_requested);

// PA backend does not support linear attention states (conv/SSM caches).
const auto model = utils::singleton_core().read_model(model_str, weights_tensor);
if (attention_backend == PA_BACKEND && !is_npu_requested
&& utils::has_linear_attention_states(model)) {
if (utils::explicitly_requires_paged_attention(user_properties)
|| user_properties.find("ATTENTION_BACKEND") != user_properties.end()) {
GENAI_WARN("PA backend does not support models with linear attention states. The model may work incorrectly.");
} else {
attention_backend = SDPA_BACKEND;
}
}

if (is_npu_requested) {
m_pimpl = StatefulPipeline::create(
utils::singleton_core().read_model(model_str, weights_tensor),
model,
tokenizer,
device,
properties,
@@ -314,7 +356,7 @@ ov::genai::LLMPipeline::LLMPipeline(
// FIXME: Switch to StatefulPipeline::create after resolving issues
// with GPU and CPU for StatefulSpeculativeLLMPipeline
m_pimpl = std::make_unique<StatefulLLMPipeline>(
utils::singleton_core().read_model(model_str, weights_tensor),
model,
tokenizer,
device,
properties,
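Each of the three constructors above now applies the same fallback rule: PA silently gives way to SDPA for models with linear-attention states, unless the user pinned the backend explicitly, in which case PA is kept and only a warning is printed. A minimal decision-table sketch of that rule (simplified names and types; the real code operates on `ov::AnyMap` properties and an `ov::Model`):

```cpp
enum class Backend { PA, SDPA };

// Sketch of the fallback rule added in this PR.  PA stays selected only when
// the user asked for it explicitly (via a CB/PA property or ATTENTION_BACKEND);
// otherwise a model with conv/SSM caches is routed to the stateful SDPA path.
Backend choose_backend(Backend requested,
                       bool npu_requested,
                       bool model_has_linear_attention_states,
                       bool user_explicitly_chose_backend) {
    if (requested == Backend::PA && !npu_requested &&
        model_has_linear_attention_states && !user_explicitly_chose_backend) {
        return Backend::SDPA;  // silent fallback: PA cannot hold linear-attention states
    }
    return requested;  // explicit choice (or no conflict) is honored, possibly with a warning
}
```

Keeping this as one pure function would also avoid repeating the condition in every constructor, which is essentially what the review comments about the duplicated blocks suggest.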
64 changes: 32 additions & 32 deletions src/cpp/src/llm/pipeline_stateful.cpp
@@ -48,7 +48,7 @@ StatefulLLMPipeline::StatefulLLMPipeline(
const std::string& device,
const ov::AnyMap& properties,
const ov::genai::GenerationConfig& generation_config)
: LLMPipelineImplBase(tokenizer, generation_config), m_sampler(m_tokenizer) {
: LLMPipelineImplBase(tokenizer, generation_config), m_sampler(m_tokenizer), m_cache_state(model) {
if (device.find("NPU") != std::string::npos) {
m_is_npu = true;
m_use_full_chat_history = true;
@@ -63,7 +63,7 @@ StatefulLLMPipeline::StatefulLLMPipeline(
auto kv_pos = ov::genai::utils::get_kv_axes_pos(model);

if (!m_use_full_chat_history)
m_kv_cache_state.seq_length_axis = kv_pos.seq_len;
m_cache_state.seq_length_axis = kv_pos.seq_len;

auto [filtered_properties_without_gguf, enable_save_ov_model] = utils::extract_gguf_properties(properties);
auto filtered_properties = extract_adapters_from_properties(filtered_properties_without_gguf, &m_generation_config.adapters);
@@ -165,8 +165,8 @@ DecodedResults StatefulLLMPipeline::generate(
if (m_use_full_chat_history) {
encoded_input = new_chat_tokens;
} else {
ov::genai::align_kv_cache_and_history(new_chat_tokens.input_ids, m_kv_cache_state);
encoded_input = get_chat_encoded_input(new_chat_tokens.input_ids, m_kv_cache_state);
ov::genai::align_kv_cache_and_history(new_chat_tokens.input_ids, m_cache_state);
encoded_input = get_chat_encoded_input(new_chat_tokens.input_ids, m_cache_state);
}
} else if (config.apply_chat_template && !m_tokenizer.get_chat_template().empty()) {
std::vector<std::string> templated_input_vector;
@@ -193,8 +193,8 @@ DecodedResults StatefulLLMPipeline::generate(
if (m_use_full_chat_history) {
encoded_input = new_chat_tokens;
} else {
ov::genai::align_kv_cache_and_history(new_chat_tokens.input_ids, m_kv_cache_state);
encoded_input = get_chat_encoded_input(new_chat_tokens.input_ids, m_kv_cache_state);
ov::genai::align_kv_cache_and_history(new_chat_tokens.input_ids, m_cache_state);
encoded_input = get_chat_encoded_input(new_chat_tokens.input_ids, m_cache_state);
}
// TODO: Forbid LoRA config change if we are in the chat mode, because it requires regenerating the history with LoRA applied
} else {
@@ -260,9 +260,9 @@ DecodedResults StatefulLLMPipeline::generate(
}

if (!is_history_continuation) {
reset_kv_state();
reset_state();
m_model_runner.get_tensor("attention_mask").set_shape({1, 0});
m_kv_cache_state.reset_state();
m_cache_state.reset_state();
}

m_history = history;
@@ -275,8 +275,8 @@ DecodedResults StatefulLLMPipeline::generate(
if (m_use_full_chat_history) {
encoded_input = new_chat_tokens;
} else {
ov::genai::align_kv_cache_and_history(new_chat_tokens.input_ids, m_kv_cache_state);
encoded_input = get_chat_encoded_input(new_chat_tokens.input_ids, m_kv_cache_state);
ov::genai::align_kv_cache_and_history(new_chat_tokens.input_ids, m_cache_state);
encoded_input = get_chat_encoded_input(new_chat_tokens.input_ids, m_cache_state);
}
return get_decoded_results(encoded_input, config, streamer, start_time);
}
@@ -294,9 +294,9 @@ EncodedResults StatefulLLMPipeline::generate(
"Chat doesn't support switching between input types. Please, continue using StringInputs or restart the chat.");

if (!is_chat_conversation) {
reset_kv_state();
reset_state();
m_model_runner.get_tensor("attention_mask").set_shape({1, 0});
m_kv_cache_state.reset_state();
m_cache_state.reset_state();
}

auto start_time = std::chrono::steady_clock::now();
@@ -338,14 +338,14 @@ EncodedResults StatefulLLMPipeline::generate(
size_t real_input_ids_size = input_ids.get_shape().at(1);

if (is_chat_conversation && m_use_full_chat_history)
m_kv_cache_state.reset_state();
m_cache_state.reset_state();

// Tail of previous output in chat mode is missing in KV cache.
if (is_chat_conversation && m_chat_input_type == ov::genai::utils::GenerationChatInputsType::ENCODED_INPUTS) {
ov::Tensor new_chat_tokens = ov::Tensor{ov::element::i64, {1, m_tokenized_chat_history.size()}, m_tokenized_chat_history.data()};
ov::genai::align_kv_cache_and_history(new_chat_tokens, m_kv_cache_state);
ov::genai::align_kv_cache_and_history(new_chat_tokens, m_cache_state);

auto encoded_input = get_chat_encoded_input(new_chat_tokens, m_kv_cache_state);
auto encoded_input = get_chat_encoded_input(new_chat_tokens, m_cache_state);
input_ids = encoded_input.input_ids;
attention_mask = encoded_input.attention_mask;
}
@@ -379,29 +379,29 @@

if (is_chat_conversation) {
if (m_use_full_chat_history)
reset_kv_state();
reset_state();
else
ov::genai::utils::trim_kv_cache(m_model_runner, m_kv_cache_state, m_adapter_controller);
ov::genai::utils::trim_kv_cache(m_model_runner, m_cache_state, m_adapter_controller);
}

size_t kv_cache_len = 0;
size_t cache_len = 0;
ov::Tensor concatenated_attention_mask;
if (is_chat_conversation && !m_kv_cache_state.get_state().empty() && !m_use_full_chat_history) {
if (is_chat_conversation && !m_cache_state.get_state().empty() && !m_use_full_chat_history) {
OPENVINO_ASSERT(batch_size == 1, "continuation of generation is possible only for batch 1");
// If history is saved in KV cache, concatenate new attention_mask with the already existing.
// Between subsequent runs attention_mask should not be modified.
auto atten_mask_history = m_model_runner.get_tensor("attention_mask");
auto prompt_len = attention_mask.get_shape()[1];

kv_cache_len = m_kv_cache_state.get_state().size();
cache_len = m_cache_state.get_state().size();

ov::Tensor new_atten_mask = ov::Tensor{ov::element::i64, {batch_size, kv_cache_len + prompt_len}};
ov::Tensor new_atten_mask = ov::Tensor{ov::element::i64, {batch_size, cache_len + prompt_len}};
auto start_atten_hst = atten_mask_history.data<int64_t>();

std::copy(start_atten_hst, start_atten_hst + kv_cache_len,
std::copy(start_atten_hst, start_atten_hst + cache_len,
new_atten_mask.data<int64_t>());
std::copy(attention_mask.data<int64_t>(), attention_mask.data<int64_t>() + prompt_len,
new_atten_mask.data<int64_t>() + kv_cache_len);
new_atten_mask.data<int64_t>() + cache_len);
concatenated_attention_mask = new_atten_mask;
} else {
concatenated_attention_mask = attention_mask;
@@ -413,7 +413,7 @@ EncodedResults StatefulLLMPipeline::generate(
std::optional<ov::Tensor> position_ids = std::nullopt;
if (position_ids_available) {
position_ids = ov::Tensor{ov::element::i64, input_ids.get_shape()};
utils::initialize_position_ids(*position_ids, attention_mask, kv_cache_len);
utils::initialize_position_ids(*position_ids, attention_mask, cache_len);
}

if(m_adapter_controller) {
Expand All @@ -426,7 +426,7 @@ EncodedResults StatefulLLMPipeline::generate(
for (size_t request_id = 0; request_id < batch_size; request_id++) {
SequenceGroup::Ptr sequence_group;
if (is_chat_conversation) {
std::vector<int64_t>& state = m_kv_cache_state.get_state();
std::vector<int64_t>& state = m_cache_state.get_state();
std::vector<int64_t> tokenized_chat_hist;
tokenized_chat_hist.reserve(state.size() + input_ids.get_size());
std::copy(state.begin(), state.end(), std::back_inserter(tokenized_chat_hist));
@@ -449,12 +449,12 @@ EncodedResults StatefulLLMPipeline::generate(
}

ov::genai::utils::GenerationFinishInfo finish_info = get_lm_encoded_results(m_model_runner, input_ids, concatenated_attention_mask, streamer_ptr, m_sampler,
requests, position_ids, std::nullopt, m_kv_cache_state, nullptr, std::nullopt, m_max_kv_cache_size);
requests, position_ids, std::nullopt, m_cache_state, nullptr, std::nullopt, m_max_kv_cache_size);
ov::genai::EncodedResults& result = finish_info.results;
m_chat_generation_finish_status = finish_info.streaming_finish_status;

if (is_chat_conversation) {
m_kv_cache_state.num_tokens_to_trim = 0;
m_cache_state.num_tokens_to_trim = 0;

if (m_chat_input_type == ov::genai::utils::GenerationChatInputsType::ENCODED_INPUTS) {
if (m_chat_generation_finish_status == ov::genai::GenerationStatus::CANCEL) {
Expand All @@ -464,7 +464,7 @@ EncodedResults StatefulLLMPipeline::generate(
}
}
if (config.is_beam_search()) {
m_kv_cache_state.num_tokens_to_trim = m_model_runner.get_tensor("attention_mask").get_shape()[1] - prev_attn_mask_size;
m_cache_state.num_tokens_to_trim = m_model_runner.get_tensor("attention_mask").get_shape()[1] - prev_attn_mask_size;
}
}

@@ -489,7 +489,7 @@ void StatefulLLMPipeline::start_chat(const std::string& system_message) {
m_history.push_back({{"role", "system"}, {"content", system_message}});
}

void StatefulLLMPipeline::reset_kv_state() {
void StatefulLLMPipeline::reset_state() {
if(m_adapter_controller) {
for(auto& state: m_model_runner.query_state()) {
if(!m_adapter_controller->has_state_name(state.get_name())) {
@@ -505,12 +505,12 @@ void StatefulLLMPipeline::finish_chat() {
is_chat_conversation = false;
m_chat_input_type = ov::genai::utils::GenerationChatInputsType::UNDEF;
bool have_state = 0 != m_model_runner.get_tensor("attention_mask").get_size();
if (!m_kv_cache_state.get_state().empty() || have_state) {
reset_kv_state();
if (!m_cache_state.get_state().empty() || have_state) {
reset_state();
m_model_runner.get_tensor("attention_mask").set_shape({1, 0});
m_history.clear();
m_tokenized_chat_history.clear();
m_kv_cache_state.reset_state();
m_cache_state.reset_state();
}
}

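One part of the renamed flow worth isolating is the chat-continuation path in `generate()`, where the attention mask kept from previous turns (`cache_len` positions) is prepended to the mask of the new prompt. That concatenation, lifted out of `ov::Tensor` into plain vectors as a standalone sketch, looks roughly like:

```cpp
#include <algorithm>
#include <cstdint>
#include <vector>

// Standalone sketch of the mask concatenation in StatefulLLMPipeline::generate():
// the history mask (one entry per token already in the cache) is copied first,
// followed by the mask of the newly tokenized prompt.  Plain vectors stand in
// for ov::Tensor here; the real code also keeps batch_size as a leading dim.
std::vector<int64_t> concat_attention_mask(const std::vector<int64_t>& history_mask,
                                           const std::vector<int64_t>& prompt_mask) {
    std::vector<int64_t> out(history_mask.size() + prompt_mask.size());
    std::copy(history_mask.begin(), history_mask.end(), out.begin());
    std::copy(prompt_mask.begin(), prompt_mask.end(),
              out.begin() + history_mask.size());
    return out;
}
```

The resulting length (`cache_len + prompt_len`) is the same quantity the pipeline later uses to initialize position ids for the new tokens.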
4 changes: 2 additions & 2 deletions src/cpp/src/llm/pipeline_stateful.hpp
@@ -28,9 +28,9 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
size_t m_max_kv_cache_size = std::numeric_limits<size_t>::max();
bool m_is_npu = false;
// include reflection of tokens contained in the kv cache and amount of tokens, which are needed to trim from kv cache on the next step of chat
utils::KVCacheState m_kv_cache_state;
utils::CacheState m_cache_state;

void reset_kv_state();
void reset_state();
public:

StatefulLLMPipeline(