Only enable CUDA language if needed #24256


Status: Draft · wants to merge 4 commits into main
2 changes: 1 addition & 1 deletion cmake/CMakeLists.txt
@@ -857,7 +857,7 @@ set(ONNXRUNTIME_PROVIDER_NAMES cpu)
set(ORT_PROVIDER_FLAGS)
set(ORT_EXTRA_INTERFACE_FLAGS)

-if (onnxruntime_USE_CUDA)
+if (onnxruntime_USE_CUDA AND NOT onnxruntime_CUDA_MINIMAL)
enable_language(CUDA)
message( STATUS "CMAKE_CUDA_COMPILER_VERSION: ${CMAKE_CUDA_COMPILER_VERSION}")
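For orientation, a minimal sketch of the pattern this change moves toward, assuming `onnxruntime_CUDA_MINIMAL` is a cache option defined elsewhere in the build: the CUDA language, and with it nvcc, is only enabled for full CUDA builds, while header-only needs can be met by `find_package(CUDAToolkit)`, which does not require a CUDA compiler.

if (onnxruntime_USE_CUDA AND NOT onnxruntime_CUDA_MINIMAL)
  enable_language(CUDA)               # requires nvcc on the build host
endif()
if (onnxruntime_USE_CUDA)
  find_package(CUDAToolkit REQUIRED)  # locates headers/libs; no CUDA compiler needed
endif()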

4 changes: 0 additions & 4 deletions cmake/onnxruntime_providers_cuda.cmake
@@ -38,10 +38,6 @@
"${ONNXRUNTIME_ROOT}/core/providers/cuda/*.cu"
"${ONNXRUNTIME_ROOT}/core/providers/cuda/*.cuh"
)
-else()
-  set(onnxruntime_providers_cuda_cu_srcs
-    "${ONNXRUNTIME_ROOT}/core/providers/cuda/math/unary_elementwise_ops_impl.cu"
-  )
endif()
source_group(TREE ${ONNXRUNTIME_ROOT}/core FILES ${onnxruntime_providers_cuda_cc_srcs} ${onnxruntime_providers_cuda_shared_srcs} ${onnxruntime_providers_cuda_cu_srcs})
set(onnxruntime_providers_cuda_src ${onnxruntime_providers_cuda_cc_srcs} ${onnxruntime_providers_cuda_shared_srcs} ${onnxruntime_providers_cuda_cu_srcs})
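Net effect of this deletion: a CUDA-minimal build no longer compiles even the single unary-elementwise `.cu` file, so the provider target contains no CUDA translation units at all. A sketch of the surviving logic (the exact guard condition is an assumption, not the verbatim file):

if (NOT onnxruntime_CUDA_MINIMAL)
  file(GLOB_RECURSE onnxruntime_providers_cuda_cu_srcs CONFIGURE_DEPENDS
    "${ONNXRUNTIME_ROOT}/core/providers/cuda/*.cu"
    "${ONNXRUNTIME_ROOT}/core/providers/cuda/*.cuh"
  )
endif()
# In a minimal build the variable stays empty, so nothing added to
# onnxruntime_providers_cuda_src requires the CUDA language.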
10 changes: 5 additions & 5 deletions cmake/onnxruntime_unittests.cmake
@@ -75,7 +75,7 @@ function(AddTest)
onnxruntime_add_include_to_target(${_UT_TARGET} date::date flatbuffers::flatbuffers)
target_include_directories(${_UT_TARGET} PRIVATE ${TEST_INC_DIR})
if (onnxruntime_USE_CUDA)
-target_include_directories(${_UT_TARGET} PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} ${CUDNN_INCLUDE_DIR})
+target_include_directories(${_UT_TARGET} PRIVATE ${CUDAToolkit_INCLUDE_DIRS} ${CUDNN_INCLUDE_DIR})
if (onnxruntime_USE_NCCL)
target_include_directories(${_UT_TARGET} PRIVATE ${NCCL_INCLUDE_DIRS})
endif()
@@ -1369,8 +1369,8 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP)

target_include_directories(onnxruntime_shared_lib_test PRIVATE ${ONNXRUNTIME_ROOT})

-if (onnxruntime_USE_CUDA)
-target_include_directories(onnxruntime_shared_lib_test PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
+if (onnxruntime_USE_CUDA AND NOT onnxruntime_CUDA_MINIMAL)
+target_include_directories(onnxruntime_shared_lib_test PRIVATE ${CUDAToolkit_INCLUDE_DIRS})
target_sources(onnxruntime_shared_lib_test PRIVATE ${ONNXRUNTIME_SHARED_LIB_TEST_SRC_DIR}/cuda_ops.cu)
endif()

@@ -1573,11 +1573,11 @@ if (NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten")
set(custom_op_lib_option)
set(custom_op_lib_link ${GSL_TARGET})

-if (onnxruntime_USE_CUDA)
+if (onnxruntime_USE_CUDA AND NOT onnxruntime_CUDA_MINIMAL)
list(APPEND custom_op_src_patterns
"${ONNXRUNTIME_SHARED_LIB_TEST_SRC_DIR}/cuda_ops.cu"
"${TEST_SRC_DIR}/testdata/custom_op_library/cuda/cuda_ops.*")
-list(APPEND custom_op_lib_include ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} ${CUDNN_INCLUDE_DIR})
+list(APPEND custom_op_lib_include ${CUDAToolkit_INCLUDE_DIRS} ${CUDNN_INCLUDE_DIR})
if (HAS_QSPECTRE)
list(APPEND custom_op_lib_option "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:--compiler-options /Qspectre>")
endif()
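Why the variable swap matters: `CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES` is only populated after `enable_language(CUDA)` has run, whereas `CUDAToolkit_INCLUDE_DIRS` is set by CMake's FindCUDAToolkit module (CMake 3.17+), which works without a CUDA compiler and therefore also in the minimal build. A usage sketch; the `my_test` target and source file are hypothetical:

find_package(CUDAToolkit REQUIRED)    # compiler-independent toolkit discovery
add_executable(my_test test_main.cc)
target_include_directories(my_test PRIVATE ${CUDAToolkit_INCLUDE_DIRS})
target_link_libraries(my_test PRIVATE CUDA::cudart)  # imported target from the same module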
5 changes: 4 additions & 1 deletion onnxruntime/core/providers/cuda/tensor/cast_op.cc
@@ -124,7 +124,7 @@ Status Cast<SrcT>::ComputeInternal(OpKernelContext* context) const {
Tensor* Y = context->Output(0, shape);
const auto* x_data = reinterpret_cast<const CudaSrcT*>(X->Data<SrcT>());
size_t count = shape.Size();

+#ifndef USE_CUDA_MINIMAL
switch (to_) {
CASE(TensorProto_DataType_FLOAT16, MLFloat16)
CASE(TensorProto_DataType_BFLOAT16, BFloat16)
@@ -151,6 +151,9 @@ Status Cast<SrcT>::ComputeInternal(OpKernelContext* context) const {
default:
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Unexpected 'to' argument value: ", to_);
}
+#else
+return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "CUDA cast ops are not supported in this build for dtype: ", to_);
+#endif
return Status::OK();
}
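The `#ifndef` above relies on the build injecting the `USE_CUDA_MINIMAL` preprocessor symbol; a sketch of how that is typically wired up on the CMake side (the exact location in onnxruntime's build files is an assumption here):

if (onnxruntime_CUDA_MINIMAL)
  # Compiles out the full switch in cast_op.cc, leaving only the error path.
  target_compile_definitions(onnxruntime_providers_cuda PRIVATE USE_CUDA_MINIMAL)
endif()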

@@ -36,9 +36,11 @@ OrtStatus* ORT_API_CALL RegisterCustomOps(OrtSessionOptions* options, const OrtA
Ort::CustomOpDomain domain_v2{"v2"};
Cpu::RegisterOps(domain_v2);

+#ifdef USE_CUDA_MINIMAL
Cuda::RegisterOps(domain);
Cuda::RegisterOps(domain_v2);

+#endif
Comment on lines +39 to +42
Contributor Author:
If a memcpy were used as the custom op, that should work without needing nvcc at compile time. Let me know if that would be an accepted change.

@chilo-ms (Contributor), Apr 3, 2025:

It's not clear to me why we no longer test Cuda::RegisterOps() in the non-USE_CUDA_MINIMAL case, i.e. a normal CUDA build?


Rocm::RegisterOps(domain);
Rocm::RegisterOps(domain_v2);