NVIDIA · abhilash1910 · Nov 5, 2025 · Nov 25, 2025 · Nov 25, 2025 · Nov 25, 2025
diff --git a/cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx b/cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx
@@ -11,7 +11,6 @@ from .utils import FunctionNotFoundError, NotSupportedError
 
 from cuda.pathfinder import load_nvidia_dynamic_lib
 
-
 ###############################################################################
 # Extern
 ###############################################################################

diff --git a/cuda_bindings/cuda/bindings/_internal/nvvm_linux.pyx b/cuda_bindings/cuda/bindings/_internal/nvvm_linux.pyx
@@ -11,7 +11,6 @@ from .utils import FunctionNotFoundError, NotSupportedError
 
 from cuda.pathfinder import load_nvidia_dynamic_lib
 
-
 ###############################################################################
 # Extern
 ###############################################################################

diff --git a/cuda_core/cuda/core/_program.pxd b/cuda_core/cuda/core/_program.pxd
@@ -13,3 +13,5 @@ cdef class Program:
         object _linker  # Linker
         object _options  # ProgramOptions
         object __weakref__
+        bint _use_libdevice      # Flag for libdevice loading
+        int _module_count
diff --git a/cuda_core/cuda/core/_program.pyx b/cuda_core/cuda/core/_program.pyx
@@ -356,6 +356,8 @@ class ProgramOptions:
     pch_verbose: bool | None = None
     pch_messages: bool | None = None
     instantiate_templates_in_pch: bool | None = None
+    extra_sources: list[tuple[str, str | bytes | bytearray]] | tuple[tuple[str, str | bytes | bytearray], ...] | None = None
+    use_libdevice: bool | None = None  # For libdevice execution
     numba_debug: bool | None = None  # Custom option for Numba debugging
 
     def __post_init__(self):
@@ -458,6 +460,11 @@ def _get_nvvm_module():
         _nvvm_module = None
         raise e
 
+def _find_libdevice_path():
+    """Find libdevice*.bc for NVVM compilation using cuda.pathfinder."""
+    from cuda.pathfinder import get_libdevice_path
+    return get_libdevice_path()
+
 
 cdef inline bint _process_define_macro_inner(list options, object macro) except? -1:
     """Process a single define macro, returning True if successful."""
@@ -520,12 +527,20 @@ cdef inline int Program_init(Program self, object code, str code_type, object op
     cdef const char* code_ptr
     cdef const char* name_ptr
     cdef size_t code_len
+    cdef bytes module_bytes
+    cdef const char* module_ptr
+    cdef size_t module_len
 
     self._options = options = check_or_create_options(ProgramOptions, options, "Program options")
     code_type = code_type.lower()
+    self._module_count = 0
+    self._use_libdevice = False
 
     if code_type == "c++":
         assert_type(code, str)
+        if options.extra_sources is not None:
+            raise ValueError("extra_sources is not supported by the NVRTC backend (C++ code_type)")
+
         # TODO: support pre-loaded headers & include names
         code_bytes = code.encode()
         code_ptr = <const char*>code_bytes
@@ -540,6 +555,8 @@ cdef inline int Program_init(Program self, object code, str code_type, object op
 
     elif code_type == "ptx":
         assert_type(code, str)
+        if options.extra_sources is not None:
+            raise ValueError("extra_sources is not supported by the PTX backend.")
         self._linker = Linker(
             ObjectCode._init(code.encode(), code_type), options=_translate_program_options(options)
         )
@@ -561,6 +578,54 @@ cdef inline int Program_init(Program self, object code, str code_type, object op
         self._h_nvvm = create_nvvm_program_handle(nvvm_prog)  # RAII from here
         with nogil:
             HANDLE_RETURN_NVVM(nvvm_prog, cynvvm.nvvmAddModuleToProgram(nvvm_prog, code_ptr, code_len, name_ptr))
+        self._module_count = 1
+
+        # Add extra modules if provided
+        if options.extra_sources is not None:
+            if not is_sequence(options.extra_sources):
+                raise TypeError(
+                    "extra_sources must be a sequence of 2-tuples: ((name1, source1), (name2, source2), ...)"
+                )
+            for i, module in enumerate(options.extra_sources):
+                if not isinstance(module, tuple) or len(module) != 2:
+                    raise TypeError(
+                        f"Each extra module must be a 2-tuple (name, source)"
+                        f", got {type(module).__name__} at index {i}"
+                    )
+
+                module_name, module_source = module
+
+                if not isinstance(module_name, str):
+                    raise TypeError(f"Module name at index {i} must be a string, got {type(module_name).__name__}")
+
+                if isinstance(module_source, str):
+                    # Textual LLVM IR - encode to UTF-8 bytes
+                    module_source = module_source.encode("utf-8")
+                elif not isinstance(module_source, (bytes, bytearray)):
+                    raise TypeError(
+                        f"Module source at index {i} must be str (textual LLVM IR), bytes (textual LLVM IR or bitcode), "
+                        f"or bytearray, got {type(module_source).__name__}"
+                    )
+
+                if len(module_source) == 0:
+                    raise ValueError(f"Module source for '{module_name}' (index {i}) cannot be empty")
+
+                # Add the module using NVVM API
+                module_bytes = module_source if isinstance(module_source, bytes) else bytes(module_source)
+                module_ptr = <const char*>module_bytes
+                module_len = len(module_bytes)
+                module_name_bytes = module_name.encode()
+                module_name_ptr = <const char*>module_name_bytes
+
+                with nogil:
+                    HANDLE_RETURN_NVVM(nvvm_prog, cynvvm.nvvmAddModuleToProgram(
+                        nvvm_prog, module_ptr, module_len, module_name_ptr))
+                self._module_count += 1
+
+        # Store use_libdevice flag
+        if options.use_libdevice:
+            self._use_libdevice = True
+
         self._backend = "NVVM"
         self._linker = None
 
@@ -649,19 +714,38 @@ cdef object Program_compile_nvvm(Program self, str target_type, object logs):
     cdef size_t logsize = 0
     cdef vector[const char*] options_vec
     cdef char* data_ptr = NULL
-
+    cdef bytes libdevice_bytes
+    cdef const char* libdevice_ptr
+    cdef size_t libdevice_len
     # Build options array
     options_list = self._options.as_bytes("nvvm", target_type)
     options_vec.resize(len(options_list))
     for i in range(len(options_list)):
         options_vec[i] = <const char*>(<bytes>options_list[i])
 
-    # Compile
     with nogil:
         HANDLE_RETURN_NVVM(prog, cynvvm.nvvmVerifyProgram(prog, <int>options_vec.size(), options_vec.data()))
+
+    # Load libdevice if requested  - following numba-cuda
+    if self._use_libdevice:
+        libdevice_path = _find_libdevice_path()
+        if libdevice_path is None:
+            raise RuntimeError(
+                "use_libdevice=True but could not find libdevice.10.bc. "
+                "Ensure CUDA toolkit is installed."
+            )
+        with open(libdevice_path, "rb") as f:
+            libdevice_bytes = f.read()
+        libdevice_ptr = <const char*>libdevice_bytes
+        libdevice_len = len(libdevice_bytes)
+        # Use lazy_add_module
+        with nogil:
+            HANDLE_RETURN_NVVM(prog, cynvvm.nvvmLazyAddModuleToProgram(
+                prog, libdevice_ptr, libdevice_len, NULL))
+
+    with nogil:
         HANDLE_RETURN_NVVM(prog, cynvvm.nvvmCompileProgram(prog, <int>options_vec.size(), options_vec.data()))
 
-    # Get compiled result
     HANDLE_RETURN_NVVM(prog, cynvvm.nvvmGetCompiledResultSize(prog, &output_size))
     data = bytearray(output_size)
     data_ptr = <char*>(<bytearray>data)

diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py
@@ -11,6 +11,7 @@
 from cuda.core._module import Kernel, ObjectCode
 from cuda.core._program import Program, ProgramOptions
 from cuda.core._utils.cuda_utils import CUDAError, driver, handle_return
+from cuda_python_test_helpers.nvvm_bitcode import minimal_nvvmir  # noqa: F401, F811
 
 cuda_driver_version = handle_return(driver.cuDriverGetVersion())
 is_culink_backend = _linker._decide_nvjitlink_or_driver()
@@ -444,11 +445,23 @@ def test_nvvm_compile_invalid_target(nvvm_ir):
 
 
 @nvvm_available
+@pytest.mark.parametrize("target_type", ["ptx", "ltoir"])
 @pytest.mark.parametrize(
     "options",
     [
         ProgramOptions(name="test1", arch="sm_90", device_code_optimize=False),
         ProgramOptions(name="test2", arch="sm_100", device_code_optimize=False),
+        ProgramOptions(name="test3", arch="sm_100", link_time_optimization=True),
+        ProgramOptions(
+            name="test4",
+            arch="sm_90",
+            ftz=True,
+            prec_sqrt=False,
+            prec_div=False,
+            fma=True,
+            device_code_optimize=True,
+            link_time_optimization=True,
+        ),
         pytest.param(
             ProgramOptions(name="test_sm110_1", arch="sm_110", device_code_optimize=False),
             marks=pytest.mark.skipif(
@@ -480,22 +493,180 @@ def test_nvvm_compile_invalid_target(nvvm_ir):
         ),
     ],
 )
-def test_nvvm_program_options(init_cuda, nvvm_ir, options):
-    """Test NVVM programs with different options"""
+def test_nvvm_program_options(init_cuda, nvvm_ir, options, target_type):
+    """Test NVVM programs with different options and target types (ptx/ltoir)"""
     program = Program(nvvm_ir, "nvvm", options)
     assert program.backend == "NVVM"
 
+    result = program.compile(target_type)
+    assert isinstance(result, ObjectCode)
+    assert result.name == options.name
+
+    code_content = result.code
+    assert len(code_content) > 0
+
+    if target_type == "ptx":
+        ptx_text = code_content.decode() if isinstance(code_content, bytes) else str(code_content)
+        assert ".visible .entry simple(" in ptx_text
+
+    program.close()
+
+
+@nvvm_available
+def test_nvvm_program_with_single_extra_source(nvvm_ir):
+    """Test NVVM program with a single extra source"""
+    from cuda.core._program import _get_nvvm_module
+
+    nvvm = _get_nvvm_module()
+    major, minor, debug_major, debug_minor = nvvm.ir_version()
+    # helper nvvm ir for multiple module loading
+    helper_nvvmir = f"""target triple = "nvptx64-unknown-cuda"
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+
+define i32 @helper_add(i32 %x) {{
+entry:
+  %result = add i32 %x, 1
+  ret i32 %result
+}}
+
+!nvvmir.version = !{{!0}}
+!0 = !{{i32 {major}, i32 {minor}, i32 {debug_major}, i32 {debug_minor}}}
+"""  # noqa: E501
+
+    options = ProgramOptions(
+        name="multi_module_test",
+        extra_sources=[
+            ("helper", helper_nvvmir),
+        ],
+    )
+    program = Program(nvvm_ir, "nvvm", options)
+
+    assert program.backend == "NVVM"
+
+    ptx_code = program.compile("ptx")
+    assert isinstance(ptx_code, ObjectCode)
+    assert ptx_code.name == "multi_module_test"
+
+    program.close()
+
+
+@nvvm_available
+def test_nvvm_program_with_multiple_extra_sources():
+    """Test NVVM program with multiple extra sources"""
+    from cuda.core._program import _get_nvvm_module
+
+    nvvm = _get_nvvm_module()
+    major, minor, debug_major, debug_minor = nvvm.ir_version()
+
+    main_nvvm_ir = f"""target triple = "nvptx64-unknown-cuda"
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+
+declare i32 @helper_add(i32) nounwind readnone
+declare i32 @helper_mul(i32) nounwind readnone
+
+define void @main_kernel(i32* %data) {{
+entry:
+  %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  %ptr = getelementptr inbounds i32, i32* %data, i32 %tid
+  %val = load i32, i32* %ptr, align 4
+
+  %val1 = call i32 @helper_add(i32 %val)
+  %val2 = call i32 @helper_mul(i32 %val1)
+
+  store i32 %val2, i32* %ptr, align 4
+  ret void
+}}
+
+declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() nounwind readnone
+
+!nvvm.annotations = !{{!0}}
+!0 = !{{void (i32*)* @main_kernel, !"kernel", i32 1}}
+
+!nvvmir.version = !{{!1}}
+!1 = !{{i32 {major}, i32 {minor}, i32 {debug_major}, i32 {debug_minor}}}
+"""  # noqa: E501
+
+    helper1_ir = f"""target triple = "nvptx64-unknown-cuda"
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+
+define i32 @helper_add(i32 %x) nounwind readnone {{
+entry:
+  %result = add i32 %x, 1
+  ret i32 %result
+}}
+
+!nvvmir.version = !{{!0}}
+!0 = !{{i32 {major}, i32 {minor}, i32 {debug_major}, i32 {debug_minor}}}
+"""  # noqa: E501
+
+    helper2_ir = f"""target triple = "nvptx64-unknown-cuda"
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+
+define i32 @helper_mul(i32 %x) nounwind readnone {{
+entry:
+  %result = mul i32 %x, 2
+  ret i32 %result
+}}
+
+!nvvmir.version = !{{!0}}
+!0 = !{{i32 {major}, i32 {minor}, i32 {debug_major}, i32 {debug_minor}}}
+"""  # noqa: E501
+
+    options = ProgramOptions(
+        name="nvvm_multi_helper_test",
+        extra_sources=[
+            ("helper1", helper1_ir),
+            ("helper2", helper2_ir),
+        ],
+    )
+    program = Program(main_nvvm_ir, "nvvm", options)
+
+    assert program.backend == "NVVM"
     ptx_code = program.compile("ptx")
     assert isinstance(ptx_code, ObjectCode)
-    assert ptx_code.name == options.name
+    assert ptx_code.name == "nvvm_multi_helper_test"
 
-    code_content = ptx_code.code
-    ptx_text = code_content.decode() if isinstance(code_content, bytes) else str(code_content)
-    assert ".visible .entry simple(" in ptx_text
+    ltoir_code = program.compile("ltoir")
+    assert isinstance(ltoir_code, ObjectCode)
+    assert ltoir_code.name == "nvvm_multi_helper_test"
 
     program.close()
 
 
+@nvvm_available
+def test_bitcode_format(minimal_nvvmir):  # noqa: F811
+    if len(minimal_nvvmir) < 4:
+        pytest.skip("Bitcode file is not valid or empty")
+
+    options = ProgramOptions(name="minimal_nvvmir_bitcode_test", arch="sm_90")
+    program = Program(minimal_nvvmir, "nvvm", options)
+
+    assert program.backend == "NVVM"
+    ptx_result = program.compile("ptx")
+    assert isinstance(ptx_result, ObjectCode)
+    assert ptx_result.name == "minimal_nvvmir_bitcode_test"
+    assert len(ptx_result.code) > 0
+    program_lto = Program(minimal_nvvmir, "nvvm", options)
+    try:
+        ltoir_result = program_lto.compile("ltoir")
+        assert isinstance(ltoir_result, ObjectCode)
+        assert len(ltoir_result.code) > 0
+        print(f"LTOIR size: {len(ltoir_result.code)} bytes")
+    except Exception as e:
+        print(f"LTOIR compilation failed : {e}")
+    finally:
+        program.close()
+
+
+def test_cpp_program_with_extra_sources():
+    # negative test with NVRTC with multiple sources
+    code = 'extern "C" __global__ void my_kernel(){}'
+    helper = 'extern "C" __global__ void helper(){}'
+    options = ProgramOptions(extra_sources=helper)
+    with pytest.raises(ValueError, match="extra_sources is not supported by the NVRTC backend"):
+        Program(code, "c++", options)
+
+
 def test_program_options_as_bytes_nvrtc():
     """Test ProgramOptions.as_bytes() for NVRTC backend"""
     options = ProgramOptions(arch="sm_80", debug=True, lineinfo=True, ftz=True)