Allow EPContext embed mode to be disabled when both the input and output models are stored in buffers

microsoft · adrianlizarraga · Apr 18, 2025 · Apr 18, 2025 · Apr 18, 2025 · Apr 18, 2025
commit 6224f727debd173c144be62febd5d0b3f0887d12
diff --git a/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.cc b/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.cc
@@ -247,6 +247,13 @@ Status CreateEPContextNodes(Model* model,
         } else {
           context_bin_path = context_model_path;
         }
+
+        if (context_bin_path.empty()) {
+          // Context bin path is empty, so just use the graph name (e.g., "QNNExecutionProvider_QNN_13728744673520368385_2_0").
+          // This happens if both the input model and output model are stored in buffers (i.e., there are no paths).
+          context_bin_path = ToPathString(graph_name);
+        }
+
         context_bin_path = context_bin_path + ToPathString("_qnn.bin");
         context_cache_name = std::filesystem::path(context_bin_path).filename().string();
 

diff --git a/onnxruntime/core/session/model_compilation_options.cc b/onnxruntime/core/session/model_compilation_options.cc
@@ -137,11 +137,7 @@ Status ModelCompilationOptions::ResetOutputModelSettings() {
   return session_options_.value.config_options.AddConfigEntry(kOrtSessionOptionEpContextFilePath, "");
 }
 
-Status ModelCompilationOptions::Check() const {
-  ORT_ENFORCE(session_options_.value.ep_context_gen_options.enable);
-  const EpContextModelGenerationOptions& ep_context_gen_options = session_options_.value.ep_context_gen_options;
-  const bool explicit_writes_to_file = !ep_context_gen_options.output_model_file_path.empty();
-  const bool writes_to_buffer = ep_context_gen_options.output_model_buffer_ptr != nullptr;
+Status ModelCompilationOptions::CheckInputModelSettings() const {
   const bool comes_from_file = !input_model_path_.empty();
   const bool comes_from_memory = input_model_data_ != nullptr;
 
@@ -164,15 +160,14 @@ Status ModelCompilationOptions::Check() const {
     return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Buffer for input model data has size 0");
   }
 
-  if (comes_from_memory && writes_to_buffer && !ep_context_gen_options.embed_ep_context_in_model) {
-    // TODO(adrianlizarraga): We may want to support this in the future. That is, both input/output models
-    // are in buffers but the context cache binary is dumped to a file. Would need to allow user to specify
-    // a custom path for the context cache binary.
-    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
-                           "EPContext embed mode must be true (enabled) when both the "
-                           "input and output models are stored in buffers. "
-                           "Please call ModelCompilationOptions_SetEpContextEmbedMode(true).");
-  }
+  return Status::OK();
+}
+
+Status ModelCompilationOptions::CheckOutputModelSettings() const {
+  const EpContextModelGenerationOptions& ep_context_gen_options = session_options_.value.ep_context_gen_options;
+
+  const bool explicit_writes_to_file = !ep_context_gen_options.output_model_file_path.empty();
+  const bool writes_to_buffer = ep_context_gen_options.output_model_buffer_ptr != nullptr;
 
   if (!explicit_writes_to_file && !writes_to_buffer) {
     // User did not specify an output file or an output buffer. We default to generating an output file
@@ -197,5 +192,12 @@ Status ModelCompilationOptions::Check() const {
 
   return Status::OK();
 }
+
+Status ModelCompilationOptions::Check() const {
+  ORT_ENFORCE(session_options_.value.ep_context_gen_options.enable);
+  ORT_RETURN_IF_ERROR(CheckInputModelSettings());
+  ORT_RETURN_IF_ERROR(CheckOutputModelSettings());
+  return Status::OK();
+}
 }  // namespace onnxruntime
 #endif  // !defined(ORT_MINIMAL_BUILD)
diff --git a/onnxruntime/core/session/model_compilation_options.h b/onnxruntime/core/session/model_compilation_options.h
@@ -119,6 +119,8 @@ class ModelCompilationOptions {
  private:
   void ResetInputModelSettings();
   Status ResetOutputModelSettings();
+  Status CheckInputModelSettings() const;
+  Status CheckOutputModelSettings() const;
 
   const OrtEnv& env_;
   OrtSessionOptions session_options_;

diff --git a/onnxruntime/test/providers/qnn/qnn_ep_context_test.cc b/onnxruntime/test/providers/qnn/qnn_ep_context_test.cc
@@ -385,40 +385,67 @@ TEST_F(QnnHTPBackendTests, CompileApi_FromSessionOptions_OutputModelBuffer) {
 // Test using the CompileModel() API with settings:
 //   - input model from buffer
 //   - save output model to buffer
-//   - EPContext nodes in output model use embedded binary blobs.
-TEST_F(QnnHTPBackendTests, CompileApi_FromSessionOptions_InputAndOutputModelsInBuffers_Embedded) {
+//   - test enabling AND disabling embed mode for context binary in EPContext node attributes
+TEST_F(QnnHTPBackendTests, CompileApi_FromSessionOptions_InputAndOutputModelsInBuffers) {
   // Create a test model and serialize it to a buffer.
   TestModel test_model;
   CreateTestModel(BuildGraphWithQAndNonQ(false), 21, logging::Severity::kERROR, test_model);
   std::string model_data = test_model.Serialize();
 
   // Initialize session options with QNN EP
-  Ort::SessionOptions so;
+  Ort::SessionOptions session_options;
   ProviderOptions provider_options;
   provider_options["backend_type"] = "htp";
   provider_options["offload_graph_io_quantization"] = "0";
-  so.AppendExecutionProvider("QNN", provider_options);
+  session_options.AppendExecutionProvider("QNN", provider_options);
 
   Ort::AllocatorWithDefaultOptions allocator;
-  void* output_model_buffer = nullptr;
-  size_t output_model_buffer_size = 0;
 
-  // Create model compilation options from the session options.
-  Ort::ModelCompilationOptions compile_options(*ort_env, so);
-  compile_options.SetInputModelFromBuffer(reinterpret_cast<const void*>(model_data.data()), model_data.size());
-  compile_options.SetOutputModelBuffer(allocator, &output_model_buffer, &output_model_buffer_size);
-  compile_options.SetEpContextEmbedMode(true);
+  // Test embed mode enabled.
+  {
+    void* output_model_buffer = nullptr;
+    size_t output_model_buffer_size = 0;
 
-  // Compile the model.
-  Ort::Status status = Ort::CompileModel(*ort_env, compile_options);
-  ASSERT_TRUE(status.IsOK()) << status.GetErrorMessage();
+    // Create model compilation options from the session options.
+    Ort::ModelCompilationOptions compile_options(*ort_env, session_options);
+    compile_options.SetInputModelFromBuffer(reinterpret_cast<const void*>(model_data.data()), model_data.size());
+    compile_options.SetOutputModelBuffer(allocator, &output_model_buffer, &output_model_buffer_size);
+    compile_options.SetEpContextEmbedMode(true);
 
-  // Make sure the compiled model was saved to the buffer.
-  ASSERT_TRUE(output_model_buffer != nullptr);
-  ASSERT_TRUE(output_model_buffer_size > 0);
+    // Compile the model.
+    Ort::Status status = Ort::CompileModel(*ort_env, compile_options);
+    ASSERT_TRUE(status.IsOK()) << status.GetErrorMessage();
 
-  // Check that the compiled model has the expected number of EPContext nodes.
-  CheckEpContextNodeCounts(output_model_buffer, output_model_buffer_size, 2, 2);
+    // Make sure the compiled model was saved to the buffer.
+    ASSERT_TRUE(output_model_buffer != nullptr);
+    ASSERT_TRUE(output_model_buffer_size > 0);
+
+    // Check that the compiled model has the expected number of EPContext nodes.
+    CheckEpContextNodeCounts(output_model_buffer, output_model_buffer_size, 2, 2);
+  }
+
+  // Test embed mode disabled.
+  {
+    void* output_model_buffer = nullptr;
+    size_t output_model_buffer_size = 0;
+
+    // Create model compilation options from the session options.
+    Ort::ModelCompilationOptions compile_options(*ort_env, session_options);
+    compile_options.SetInputModelFromBuffer(reinterpret_cast<const void*>(model_data.data()), model_data.size());
+    compile_options.SetOutputModelBuffer(allocator, &output_model_buffer, &output_model_buffer_size);
+    compile_options.SetEpContextEmbedMode(false);
+
+    // Compile the model.
+    Ort::Status status = Ort::CompileModel(*ort_env, compile_options);
+    ASSERT_TRUE(status.IsOK()) << status.GetErrorMessage();
+
+    // Make sure the compiled model was saved to the buffer.
+    ASSERT_TRUE(output_model_buffer != nullptr);
+    ASSERT_TRUE(output_model_buffer_size > 0);
+
+    // Check that the compiled model has the expected number of EPContext nodes.
+    CheckEpContextNodeCounts(output_model_buffer, output_model_buffer_size, 2, 2);
+  }
 }
 
 // Test using the CompileModel() API with settings: