@@ -141,14 +141,23 @@ Status T5DecoderSubgraph::Validate(const std::vector<const NodeArg*>& subgraph_i
 }

 // Create inputs for decoder from the following data sources:
-// encoder feeds: encoder_input_ids, encoder_attention_mask, decoder_input_ids (with start tokens)
-// encoder fetches: logits,
-//                  encoder_hidden_states,
-//                  present_key_self_0, present_value_self_0, ..., present_key_cross_0, present_value_cross_0, ...
-// decoder_feeds: input_ids,
-//                encoder_attention_mask,
-//                encoder_hidden_states,
-//                present_key_self_0, present_value_self_0, ..., present_key_cross_0, present_value_cross_0, ...
+// New format:
+//   encoder feeds: encoder_input_ids, encoder_attention_mask
+//   encoder fetches: present_key_cross_0, present_value_cross_0, ...
+//   decoder_feeds: input_ids, encoder_attention_mask,
+//                  present_key_self_0, present_value_self_0, ...,
+//                  present_key_cross_0, present_value_cross_0, ...
+
+// Old format:
+//   encoder feeds: encoder_input_ids, encoder_attention_mask, decoder_input_ids (with start tokens)
+//   encoder fetches: logits, encoder_hidden_states,
+//                    present_key_self_0, present_value_self_0, ...,
+//                    present_key_cross_0, present_value_cross_0, ...
+//   decoder_feeds: input_ids, encoder_input_ids (optional), encoder_attention_mask, encoder_hidden_states,
+//                  present_key_self_0, present_value_self_0, ...,
+//                  present_key_cross_0, present_value_cross_0, ...
+//                  past_seq_len (optional), num_beams (optional), cache_indirection (optional)
+
 Status T5DecoderSubgraph::CreateInitialFeeds(
     AllocatorPtr cpu_allocator,
     gsl::span<const int32_t> beam_next_tokens,
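For orientation, here is a minimal standalone sketch of how the new-format decoder feed ordering documented in the comment block above maps to flat feed indices (the optional past_seq_len, num_beams and cache_indirection inputs would follow the cross entries). The helper names and the layer count are illustrative assumptions, not ONNX Runtime symbols.

// Illustrative index layout of decoder_feeds in the new format:
//   input_ids (0), encoder_attention_mask (1),
//   then interleaved self key/value per layer, then interleaved cross key/value per layer.
#include <cstdio>

int SelfKeyFeedIndex(int layer) { return 2 + 2 * layer; }
int SelfValueFeedIndex(int layer) { return 3 + 2 * layer; }
int CrossKeyFeedIndex(int num_layers, int layer) { return 2 + 2 * num_layers + 2 * layer; }
int CrossValueFeedIndex(int num_layers, int layer) { return 3 + 2 * num_layers + 2 * layer; }

int main() {
  const int num_layers = 6;  // hypothetical decoder layer count
  std::printf("self key, layer 0    -> feed %d\n", SelfKeyFeedIndex(0));                 // 2
  std::printf("cross value, layer 5 -> feed %d\n", CrossValueFeedIndex(num_layers, 5));  // 25
  return 0;
}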
@@ -173,33 +182,30 @@ Status T5DecoderSubgraph::CreateInitialFeeds(
   // Allocate subgraph inputs from same device as inputs of encoder subgraph.
   AllocatorPtr allocator = session_state_->GetAllocator(encoder_feeds[0].Get<Tensor>().Location());

+  int batch_beam_size = static_cast<int>(encoder_fetches[0].Get<Tensor>().Shape()[0]) * num_beam;
+
   // Copy beam next tokens in CPU to input_ids in provider device (CPU for CPU EP, or GPU for CUDA EP).
-  int batch_beam_size = static_cast<int>(beam_next_tokens.size());
   int sequence_length = !use_sequence_as_input_ids ? 1 : cur_len;
   int64_t dims[] = {batch_beam_size, sequence_length};
   TensorShape input_ids_shape(&dims[0], 2);
   OrtValue input_ids;
   Tensor::InitOrtValue(DataTypeImpl::GetType<int32_t>(), input_ids_shape, allocator, input_ids);
-  int32_t* input_ids_data = input_ids.GetMutable<Tensor>()->MutableData<int32_t>();
-  AllocatorPtr buffer_allocator = std::make_shared<onnxruntime::CPUAllocator>();
-  size_t total_size = static_cast<size_t>(cur_len) * static_cast<size_t>(batch_beam_size);
-  size_t total_size_bytes = total_size * sizeof(int);
-  auto seq_copy = IAllocator::MakeUniquePtr<int>(buffer_allocator, total_size_bytes, false, stream);
-  int* seq_copy_ptr = seq_copy.get();
-
-  if (!use_sequence_as_input_ids_) {
+
+  // Prepare data for input_ids.
+  if (!use_sequence_as_input_ids_) {  // use next tokens for input_ids. This is for the Whisper model.
     ORT_RETURN_IF_ERROR(device_copy_int32_func(
         input_ids.GetMutable<Tensor>()->MutableDataAsSpan<int32_t>(),
         beam_next_tokens,
         stream,
         DeviceCopyDirection::hostToDevice));
   } else {
+    int32_t* input_ids_data = input_ids.GetMutable<Tensor>()->MutableData<int32_t>();
     if (use_cuda) {
       auto sequences_buffer = sequences.GetCurrentDeviceSequences();
       for (int i = 0; i < batch_beam_size; i++) {
-        size_t batch_beam_stride = static_cast<size_t>(i) * static_cast<size_t>(sequences.GetMaxLength());
+        size_t offset = static_cast<size_t>(i) * static_cast<size_t>(sequences.GetMaxLength());
         int seq_size = sequences.GetSequenceLength();
-        gsl::span<const int32_t> sequence = sequences_buffer.subspan(batch_beam_stride, seq_size);
+        gsl::span<const int32_t> sequence = sequences_buffer.subspan(offset, seq_size);
         gsl::span<int> temp_input(input_ids_data + static_cast<ptrdiff_t>(i) * seq_size, seq_size);
         ORT_RETURN_IF_ERROR(device_copy_int32_func(
             temp_input,
@@ -208,6 +214,13 @@ Status T5DecoderSubgraph::CreateInitialFeeds(
             DeviceCopyDirection::deviceToDevice));
       }
     } else {
+      size_t total_size = static_cast<size_t>(cur_len) * static_cast<size_t>(batch_beam_size);
+      size_t total_size_bytes = total_size * sizeof(int);
+      AllocatorPtr buffer_allocator = std::make_shared<onnxruntime::CPUAllocator>();
+      // TODO: No need for an extra buffer here. Copy directly to input_ids_data instead, like the use_cuda branch above.
+      auto seq_copy = IAllocator::MakeUniquePtr<int>(buffer_allocator, total_size_bytes, false, stream);
+      int* seq_copy_ptr = seq_copy.get();
+
       const size_t cur_len_bytes = cur_len * sizeof(int);
       for (int i = 0; i < batch_beam_size; i++) {
         gsl::span<const int32_t> sequence = sequences.GetSequence(i);
@@ -227,9 +240,11 @@ Status T5DecoderSubgraph::CreateInitialFeeds(

   // The ordering is the same as used in Setup.
   decoder_feeds.reserve(static_cast<size_t>(num_subgraph_inputs) + static_cast<size_t>(num_implicit_inputs));
+
+  // input 0: input_ids
   decoder_feeds.push_back(input_ids);

-  if (has_encoder_input_ids_) {
+  if (has_encoder_input_ids_) {  // encoder_input_ids is optional
     // The encoder_input_ids is copied from the first input of encoder.
     OrtValue expanded_encoder_input_ids;
     ORT_RETURN_IF_ERROR(expand_buffer_int32_func(stream,
@@ -251,70 +266,65 @@ Status T5DecoderSubgraph::CreateInitialFeeds(
                                                expanded_decoder_attention_masks,
                                                false,
                                                0 /*max_sequence_length*/));
-
   decoder_feeds.push_back(expanded_decoder_attention_masks);

   if (!past_present_share_buffer_) {
     past_present_share_buffer_max_seq_len = 0;
   }

-  // When first_past_input_index_ == 3, the encoder_hidden_states and past states are copied from the second output
-  // of encoder.
-  // When first_past_input_index_ == 2, the past states are copied from the second output of encoder.
-  // TODO - probably more robust to introduce a encoder_out/decoder_in mapping instead of relying on positions.
-  // What happens if encoder_hidden_states is present in the encoder_fetches but not in the decoder_feeds?
-  for (size_t j = static_cast<size_t>(2) - has_hidden_state_; j < encoder_fetches.size(); j++) {
-    if (j == 1) {
-      ORT_RETURN_IF(has_hidden_state_ == false, "Invalid hidden_states expension: has_hidden_state_ == false");
-      OrtValue expanded_hidden_states;
-      if (is_output_float16_) {
-        ORT_RETURN_IF_ERROR(expand_buffer_float16_func(stream,
-                                                       encoder_fetches[j],
-                                                       num_beam,
-                                                       allocator,
-                                                       expanded_hidden_states,
-                                                       false,
-                                                       0 /*max_sequence_length*/));
-      } else {
-        ORT_RETURN_IF_ERROR(expand_buffer_float_func(stream,
-                                                     encoder_fetches[j],
-                                                     num_beam,
-                                                     allocator,
-                                                     expanded_hidden_states,
-                                                     false,
-                                                     0 /*max_sequence_length*/));
-      }
-      decoder_feeds.push_back(expanded_hidden_states);
-    } else {
+  // macro to expand encoder outputs and append to decoder feeds.
+#define ADD_DECODER_FEED(encoder_output, is_dynamic_kv_cache) \
+  OrtValue expanded; \
+  if (is_output_float16_) { \
+    ORT_RETURN_IF_ERROR(expand_buffer_float16_func(stream, encoder_output, num_beam, allocator, expanded, false, \
+                                                   is_dynamic_kv_cache ? past_present_share_buffer_max_seq_len : 0)); \
+  } else { \
+    ORT_RETURN_IF_ERROR(expand_buffer_float_func(stream, encoder_output, num_beam, allocator, expanded, false, \
+                                                 is_dynamic_kv_cache ? past_present_share_buffer_max_seq_len : 0)); \
+  } \
+  decoder_feeds.push_back(expanded);
+
+  // The encoder_hidden_states is copied from the second output of encoder.
+  if (has_hidden_state_) {
+    ADD_DECODER_FEED(encoder_fetches[1], false);
+  }
+
+  // New format of encoder has only cross outputs.
+  bool is_new_format = (static_cast<int>(encoder_fetches.size()) == 2 * num_layers);
+  if (is_new_format) {
+    for (int i = 0; i < 2 * num_layers; i++) {
+      // cross shape is (batch_size, num_heads, encode_sequence_length, head_size)
+      const TensorShape& cross_shape = encoder_fetches[0].Get<Tensor>().Shape();
+      ORT_ENFORCE(cross_shape.NumDimensions() == 4);
+
+      // Shape for kv cache: (batch_size * num_beam, num_heads, max_seq_len, head_size)
+      int64_t cache_dims[4] = {0};
+      cross_shape.CopyDims(cache_dims, cross_shape.NumDimensions());
+      cache_dims[0] *= num_beam;
+      cache_dims[2] = past_present_share_buffer_max_seq_len;
+      TensorShape expanded_shape(&cache_dims[0], cross_shape.NumDimensions());
+
+      MLDataType element_type = encoder_fetches[0].Get<Tensor>().DataType();
+      OrtValue past;
+      Tensor::InitOrtValue(element_type, expanded_shape, allocator, past);
+      decoder_feeds.push_back(past);
+    }
+
+    // Add cross inputs from encoder output.
+    for (size_t j = 0; j < encoder_fetches.size(); j++) {
+      ADD_DECODER_FEED(encoder_fetches[j], false);
+    }
+  } else {
+    for (size_t j = 1 + has_hidden_state_; j < encoder_fetches.size(); j++) {
       // past key/value for cross attention does not need to be initialized with max_seq_len since they are static.
-      bool use_max_seq_len = (j - first_past_input_index_) < 2 * static_cast<size_t>(num_layers);
-
-      OrtValue expanded_cache;
-      if (is_output_float16_) {
-        ORT_RETURN_IF_ERROR(expand_buffer_float16_func(stream,
-                                                       encoder_fetches[j],
-                                                       num_beam,
-                                                       allocator,
-                                                       expanded_cache,
-                                                       false,
-                                                       use_max_seq_len ? past_present_share_buffer_max_seq_len : 0));
-      } else {
-        ORT_RETURN_IF_ERROR(expand_buffer_float_func(stream,
-                                                     encoder_fetches[j],
-                                                     num_beam,
-                                                     allocator,
-                                                     expanded_cache,
-                                                     false,
-                                                     use_max_seq_len ? past_present_share_buffer_max_seq_len : 0));
-      }
-      decoder_feeds.push_back(expanded_cache);
+      bool is_dynamic_kv_cache = (j - first_past_input_index_) < 2 * static_cast<size_t>(num_layers);
+      ADD_DECODER_FEED(encoder_fetches[j], is_dynamic_kv_cache);
     }
   }

-  // TODO: This part shares the similar logic with CreateInitialFeeds() in subgraph_gpt.cc. We should refactor it.
   if (past_present_share_buffer_) {
-    // Past sequence length feed
-    ORT_RETURN_IF_ERROR(AppendPastSequenceLength(decoder_feeds, cpu_allocator, 1));
+    // Past sequence length is 0 for the new format; otherwise it is 1.
+    ORT_RETURN_IF_ERROR(AppendPastSequenceLength(decoder_feeds, cpu_allocator, is_new_format ? 0 : 1));
     // Add beam search specific inputs
     if (need_cache_indir) {
       const int64_t batch_size = static_cast<int64_t>(batch_beam_size / num_beam);
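As a companion to the new-format branch above, here is a minimal standalone sketch (not ONNX Runtime code) of the shape arithmetic used when pre-allocating the self-attention KV cache from the cross-attention output shape; the function name and the example numbers are assumptions for illustration.

// Illustrative only: derive the pre-allocated self-attention KV cache shape
// (batch_size * num_beam, num_heads, max_seq_len, head_size) from the
// cross-attention output shape (batch_size, num_heads, encode_sequence_length, head_size).
#include <array>
#include <cstdio>

std::array<long long, 4> SelfCacheShape(const std::array<long long, 4>& cross_shape,
                                        long long num_beam, long long max_seq_len) {
  std::array<long long, 4> cache = cross_shape;
  cache[0] *= num_beam;    // batch_size -> batch_size * num_beam
  cache[2] = max_seq_len;  // encode_sequence_length -> shared max sequence length
  return cache;
}

int main() {
  // Hypothetical values: batch 2, 8 heads, encoder length 64, head size 64; 4 beams; max length 256.
  auto cache = SelfCacheShape({2, 8, 64, 64}, 4, 256);
  std::printf("(%lld, %lld, %lld, %lld)\n", cache[0], cache[1], cache[2], cache[3]);  // (8, 8, 256, 64)
  return 0;
}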