From a4db20b970d334f43fac86d1c388d07ab90ba110 Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Wed, 5 Nov 2025 02:14:29 +0000
Subject: [PATCH 01/50] add ltoir test support

---
 cuda_core/tests/test_program.py | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py
index adc778973c..5c966296fe 100644
--- a/cuda_core/tests/test_program.py
+++ b/cuda_core/tests/test_program.py
@@ -387,3 +387,33 @@ def test_nvvm_program_options(init_cuda, nvvm_ir, options):
     assert ".visible .entry simple(" in ptx_text
 
     program.close()
+
+@nvvm_available
+@pytest.mark.parametrize(
+    "options",
+    [
+       ProgramOptions(name="ltoir_test1", arch="sm_90", device_code_optimize=False),
+       ProgramOptions(name="ltoir_test2", arch="sm_100", link_time_optimization=True),
+       ProgramOptions(
+            name="ltoir_test3",
+            arch="sm_90",
+            ftz=True,
+            prec_sqrt=False,
+            prec_div=False,
+            fma=True,
+            device_code_optimize=True,
+            link_time_optimization=True,
+       ),
+    ],
+)
+def test_nvvm_program_options_ltoir(init_cuda, nvvm_ir, options):
+    """Test NVVM programs for LTOIR with different options"""
+    program = Program(nvvm_ir, "nvvm", options)
+    assert program.backend == "NVVM"
+
+    ltoir_code = program.compile("ltoir")
+    assert isinstance(ltoir_code, ObjectCode)
+    assert ltoir_code.name == options.name
+    code_content = ltoir_code.code
+    assert len(ltoir_code.code) > 0
+    program.close()

From fb6cfb3902b8a22aba92624ea417616033018852 Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Tue, 25 Nov 2025 04:10:17 +0000
Subject: [PATCH 02/50] add options for multi-modules

---
 cuda_core/cuda/core/experimental/_program.py | 30 ++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py
index 1db453fed1..2e0115be48 100644
--- a/cuda_core/cuda/core/experimental/_program.py
+++ b/cuda_core/cuda/core/experimental/_program.py
@@ -298,6 +298,7 @@ class ProgramOptions:
     split_compile: int | None = None
     fdevice_syntax_only: bool | None = None
     minimal: bool | None = None
+    extra_sources: Union[str, bytes, list[Union[str, bytes]], tuple[Union[str, bytes]]] | None = None
 
     def __post_init__(self):
         self._name = self.name.encode()
@@ -479,6 +480,9 @@ def __init__(self, code, code_type, options: ProgramOptions = None):
             assert_type(code, str)
             # TODO: support pre-loaded headers & include names
             # TODO: allow tuples once NVIDIA/cuda-python#72 is resolved
+            
+            if options.extra_sources is not None:
+                raise ValueError("extra_sources is not supported by the NVRTC backend (C++ code_type)")
 
             self._mnff.handle = handle_return(nvrtc.nvrtcCreateProgram(code.encode(), options._name, 0, [], []))
             self._mnff.backend = "NVRTC"
@@ -487,6 +491,9 @@ def __init__(self, code, code_type, options: ProgramOptions = None):
 
         elif code_type == "ptx":
             assert_type(code, str)
+            if options.extra_sources is not None:
+                raise ValueError("extra_sources is not supported by the PTX backend.")
+            
             self._linker = Linker(
                 ObjectCode._init(code.encode(), code_type), options=self._translate_program_options(options)
             )
@@ -502,6 +509,29 @@ def __init__(self, code, code_type, options: ProgramOptions = None):
             self._mnff.handle = nvvm.create_program()
             self._mnff.backend = "NVVM"
             nvvm.add_module_to_program(self._mnff.handle, code, len(code), options._name.decode())
+            if options.extra_sources is not None:
+                if isinstance(options.extra_sources, (str, bytes)):
+                    extra_sources = [options.extra_sources]
+                elif isinstance(options.extra_sources, (list, tuple)):
+                    extra_sources = options.extra_sources
+                else:
+                    raise TypeError("extra_sources must be str, bytes, list, or tuple")
+                
+                if len(extra_sources) == 0:
+                    raise ValueError("extra_sources cannot be empty if provided")
+                
+                for i, extra_source in enumerate(extra_sources):
+                    if isinstance(extra_source, str):
+                        extra_source = extra_source.encode("utf-8")
+                    elif not isinstance(extra_source, (bytes, bytearray)):
+                        raise TypeError(f"Extra source {i} must be provided as str, bytes, or bytearray")
+                    
+                    if len(extra_source) == 0:
+                        raise ValueError(f"Extra source {i} cannot be empty")
+                    
+                    extra_name = f"{options.name}_extra_{i}"
+                    nvvm.add_module_to_program(self._mnff.handle, extra_source, len(extra_source), extra_name)
+            
             self._backend = "NVVM"
             self._linker = None
 

From 7aaed4e4f36b890b73098430ae39384217e3d4d1 Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Tue, 25 Nov 2025 11:05:56 +0000
Subject: [PATCH 03/50] add tests

---
 cuda_core/tests/test_program.py | 119 ++++++++++++++++++++++++++++++++
 1 file changed, 119 insertions(+)

diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py
index 5c966296fe..70b131556d 100644
--- a/cuda_core/tests/test_program.py
+++ b/cuda_core/tests/test_program.py
@@ -417,3 +417,122 @@ def test_nvvm_program_options_ltoir(init_cuda, nvvm_ir, options):
     code_content = ltoir_code.code
     assert len(ltoir_code.code) > 0
     program.close()
+
+
+@nvvm_available
+def test_nvvm_program_with_single_extra_source(nvvm_ir):
+    """Test NVVM program with a single extra source"""
+    from cuda.core.experimental._program import _get_nvvm_module
+    nvvm = _get_nvvm_module()
+    major, minor, debug_major, debug_minor = nvvm.ir_version()
+    #helper nvvm ir for multiple module loading
+    helper_nvvm_ir = f"""target triple = "nvptx64-unknown-cuda"
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+
+define i32 @helper_add(i32 %x) {{
+entry:
+  %result = add i32 %x, 1
+  ret i32 %result
+}}
+
+!nvvmir.version = !{{!0}}
+!0 = !{{i32 {major}, i32 {minor}, i32 {debug_major}, i32 {debug_minor}}}
+"""
+
+    options = ProgramOptions(name="multi_module_test", extra_sources=helper_nvvm_ir)
+    program = Program(nvvm_ir, "nvvm", options)
+    
+    assert program.backend == "NVVM"
+    
+    ptx_code = program.compile("ptx")
+    assert isinstance(ptx_code, ObjectCode)
+    assert ptx_code.name == "multi_module_test"
+    
+    program.close()
+
+@nvvm_available
+def test_nvvm_program_with_multiple_extra_sources():
+    """Test NVVM program with multiple extra sources"""
+    from cuda.core.experimental._program import _get_nvvm_module
+    nvvm = _get_nvvm_module()
+    major, minor, debug_major, debug_minor = nvvm.ir_version()
+    
+    main_nvvm_ir = f"""target triple = "nvptx64-unknown-cuda"
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+
+declare i32 @helper_add(i32) nounwind readnone
+declare i32 @helper_mul(i32) nounwind readnone
+
+define void @main_kernel(i32* %data) {{
+entry:
+  %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  %ptr = getelementptr inbounds i32, i32* %data, i32 %tid
+  %val = load i32, i32* %ptr, align 4
+  
+  %val1 = call i32 @helper_add(i32 %val)
+  %val2 = call i32 @helper_mul(i32 %val1)
+  
+  store i32 %val2, i32* %ptr, align 4
+  ret void
+}}
+
+declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() nounwind readnone
+
+!nvvm.annotations = !{{!0}}
+!0 = !{{void (i32*)* @main_kernel, !"kernel", i32 1}}
+
+!nvvmir.version = !{{!1}}
+!1 = !{{i32 {major}, i32 {minor}, i32 {debug_major}, i32 {debug_minor}}}
+"""
+
+    helper1_ir = f"""target triple = "nvptx64-unknown-cuda"
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+
+define i32 @helper_add(i32 %x) nounwind readnone {{
+entry:
+  %result = add i32 %x, 1
+  ret i32 %result
+}}
+
+!nvvmir.version = !{{!0}}
+!0 = !{{i32 {major}, i32 {minor}, i32 {debug_major}, i32 {debug_minor}}}
+"""
+
+    helper2_ir = f"""target triple = "nvptx64-unknown-cuda"
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+
+define i32 @helper_mul(i32 %x) nounwind readnone {{
+entry:
+  %result = mul i32 %x, 2
+  ret i32 %result
+}}
+
+!nvvmir.version = !{{!0}}
+!0 = !{{i32 {major}, i32 {minor}, i32 {debug_major}, i32 {debug_minor}}}
+"""
+
+    options = ProgramOptions(
+        name="nvvm_multi_helper_test", 
+        extra_sources=[helper1_ir, helper2_ir]
+    )
+    program = Program(main_nvvm_ir, "nvvm", options)
+    
+    assert program.backend == "NVVM"
+    ptx_code = program.compile("ptx")
+    assert isinstance(ptx_code, ObjectCode)
+    assert ptx_code.name == "nvvm_multi_helper_test"
+    
+    ltoir_code = program.compile("ltoir")
+    assert isinstance(ltoir_code, ObjectCode)
+    assert ltoir_code.name == "nvvm_multi_helper_test"
+    
+    program.close()
+       
+def test_cpp_program_with_extra_sources():
+    #negative test with NVRTC with multiple sources
+    code = 'extern "C" __global__ void my_kernel(){}'
+    helper = 'extern "C" __global__ void helper(){}'
+    options = ProgramOptions(extra_sources = helper)
+    with pytest.raises(ValueError, match="extra_sources is not supported by the NVRTC backend"):
+        Program(code, "c++", options)
+    
\ No newline at end of file

From 64c7f7d3d7acc4785bd475530786777e5babe2ab Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Tue, 25 Nov 2025 14:48:43 +0000
Subject: [PATCH 04/50] add bitcode test

---
 cuda_core/tests/test_program.py | 45 +++++++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)

diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py
index 70b131556d..24ae767220 100644
--- a/cuda_core/tests/test_program.py
+++ b/cuda_core/tests/test_program.py
@@ -527,6 +527,51 @@ def test_nvvm_program_with_multiple_extra_sources():
     assert ltoir_code.name == "nvvm_multi_helper_test"
     
     program.close()
+
+@nvvm_available
+def test_bitcode_format():
+    import os
+    from pathlib import Path
+    bitcode_path = os.environ.get("BITCODE_NVVM_PATH")
+    if not bitcode_path:
+        pytest.skip(f"BITCODE_NVVM_PATH environment variable is not set."
+                    "Disabling the test.")
+    bitcode_file = Path(bitcode_path)
+    if not bitcode_file.exists():
+        pytest.skip(f"Bitcode file not found: {bitcode_path}")
+    
+    if not bitcode_file.suffix == '.bc':
+        pytest.skip(f"Expected .bc file, got: {bitcode_file.suffix}")
+    
+    try:
+        with open(bitcode_file, 'rb') as f:
+            bitcode_data = f.read()
+        
+        if len(bitcode_data) < 4:
+            pytest.skip("Bitcode file appears to be empty or invalid")
+        
+        options = ProgramOptions(
+            name=f"existing_bitcode_{bitcode_file.stem}",
+            arch="sm_90"
+        )
+        program = Program(bitcode_data, "nvvm", options)
+        
+        assert program.backend == "NVVM"
+        ptx_result = program.compile("ptx")
+        assert isinstance(ptx_result, ObjectCode)
+        assert ptx_result.name.startswith("existing_bitcode_")
+        assert len(ptx_result.code) > 0
+        try:
+            ltoir_result = program.compile("ltoir")
+            assert isinstance(ltoir_result, ObjectCode)
+            assert len(ltoir_result.code) > 0
+            print(f"LTOIR size: {len(ltoir_result.code)} bytes")
+        except Exception as e:
+            print(f"LTOIR compilation failed : {e}")
+        program.close()
+    except Exception as e:
+        pytest.fail(f"Failed to compile existing bitcode file {bitcode_path}: {str(e)}")
+
        
 def test_cpp_program_with_extra_sources():
     #negative test with NVRTC with multiple sources

From 42ba301078bc17c796cddbe99499c957633eb5d8 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 25 Nov 2025 14:51:09 +0000
Subject: [PATCH 05/50] [pre-commit.ci] auto code formatting

---
 cuda_core/cuda/core/experimental/_program.py | 14 ++--
 cuda_core/tests/test_program.py              | 68 ++++++++++----------
 2 files changed, 40 insertions(+), 42 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py
index 2e0115be48..eadd37ca95 100644
--- a/cuda_core/cuda/core/experimental/_program.py
+++ b/cuda_core/cuda/core/experimental/_program.py
@@ -480,7 +480,7 @@ def __init__(self, code, code_type, options: ProgramOptions = None):
             assert_type(code, str)
             # TODO: support pre-loaded headers & include names
             # TODO: allow tuples once NVIDIA/cuda-python#72 is resolved
-            
+
             if options.extra_sources is not None:
                 raise ValueError("extra_sources is not supported by the NVRTC backend (C++ code_type)")
 
@@ -493,7 +493,7 @@ def __init__(self, code, code_type, options: ProgramOptions = None):
             assert_type(code, str)
             if options.extra_sources is not None:
                 raise ValueError("extra_sources is not supported by the PTX backend.")
-            
+
             self._linker = Linker(
                 ObjectCode._init(code.encode(), code_type), options=self._translate_program_options(options)
             )
@@ -516,22 +516,22 @@ def __init__(self, code, code_type, options: ProgramOptions = None):
                     extra_sources = options.extra_sources
                 else:
                     raise TypeError("extra_sources must be str, bytes, list, or tuple")
-                
+
                 if len(extra_sources) == 0:
                     raise ValueError("extra_sources cannot be empty if provided")
-                
+
                 for i, extra_source in enumerate(extra_sources):
                     if isinstance(extra_source, str):
                         extra_source = extra_source.encode("utf-8")
                     elif not isinstance(extra_source, (bytes, bytearray)):
                         raise TypeError(f"Extra source {i} must be provided as str, bytes, or bytearray")
-                    
+
                     if len(extra_source) == 0:
                         raise ValueError(f"Extra source {i} cannot be empty")
-                    
+
                     extra_name = f"{options.name}_extra_{i}"
                     nvvm.add_module_to_program(self._mnff.handle, extra_source, len(extra_source), extra_name)
-            
+
             self._backend = "NVVM"
             self._linker = None
 
diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py
index 24ae767220..9aca1878d4 100644
--- a/cuda_core/tests/test_program.py
+++ b/cuda_core/tests/test_program.py
@@ -388,13 +388,14 @@ def test_nvvm_program_options(init_cuda, nvvm_ir, options):
 
     program.close()
 
+
 @nvvm_available
 @pytest.mark.parametrize(
     "options",
     [
-       ProgramOptions(name="ltoir_test1", arch="sm_90", device_code_optimize=False),
-       ProgramOptions(name="ltoir_test2", arch="sm_100", link_time_optimization=True),
-       ProgramOptions(
+        ProgramOptions(name="ltoir_test1", arch="sm_90", device_code_optimize=False),
+        ProgramOptions(name="ltoir_test2", arch="sm_100", link_time_optimization=True),
+        ProgramOptions(
             name="ltoir_test3",
             arch="sm_90",
             ftz=True,
@@ -403,7 +404,7 @@ def test_nvvm_program_options(init_cuda, nvvm_ir, options):
             fma=True,
             device_code_optimize=True,
             link_time_optimization=True,
-       ),
+        ),
     ],
 )
 def test_nvvm_program_options_ltoir(init_cuda, nvvm_ir, options):
@@ -423,9 +424,10 @@ def test_nvvm_program_options_ltoir(init_cuda, nvvm_ir, options):
 def test_nvvm_program_with_single_extra_source(nvvm_ir):
     """Test NVVM program with a single extra source"""
     from cuda.core.experimental._program import _get_nvvm_module
+
     nvvm = _get_nvvm_module()
     major, minor, debug_major, debug_minor = nvvm.ir_version()
-    #helper nvvm ir for multiple module loading
+    # helper nvvm ir for multiple module loading
     helper_nvvm_ir = f"""target triple = "nvptx64-unknown-cuda"
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
 
@@ -441,22 +443,24 @@ def test_nvvm_program_with_single_extra_source(nvvm_ir):
 
     options = ProgramOptions(name="multi_module_test", extra_sources=helper_nvvm_ir)
     program = Program(nvvm_ir, "nvvm", options)
-    
+
     assert program.backend == "NVVM"
-    
+
     ptx_code = program.compile("ptx")
     assert isinstance(ptx_code, ObjectCode)
     assert ptx_code.name == "multi_module_test"
-    
+
     program.close()
 
+
 @nvvm_available
 def test_nvvm_program_with_multiple_extra_sources():
     """Test NVVM program with multiple extra sources"""
     from cuda.core.experimental._program import _get_nvvm_module
+
     nvvm = _get_nvvm_module()
     major, minor, debug_major, debug_minor = nvvm.ir_version()
-    
+
     main_nvvm_ir = f"""target triple = "nvptx64-unknown-cuda"
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
 
@@ -468,10 +472,10 @@ def test_nvvm_program_with_multiple_extra_sources():
   %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
   %ptr = getelementptr inbounds i32, i32* %data, i32 %tid
   %val = load i32, i32* %ptr, align 4
-  
+
   %val1 = call i32 @helper_add(i32 %val)
   %val2 = call i32 @helper_mul(i32 %val1)
-  
+
   store i32 %val2, i32* %ptr, align 4
   ret void
 }}
@@ -511,51 +515,46 @@ def test_nvvm_program_with_multiple_extra_sources():
 !0 = !{{i32 {major}, i32 {minor}, i32 {debug_major}, i32 {debug_minor}}}
 """
 
-    options = ProgramOptions(
-        name="nvvm_multi_helper_test", 
-        extra_sources=[helper1_ir, helper2_ir]
-    )
+    options = ProgramOptions(name="nvvm_multi_helper_test", extra_sources=[helper1_ir, helper2_ir])
     program = Program(main_nvvm_ir, "nvvm", options)
-    
+
     assert program.backend == "NVVM"
     ptx_code = program.compile("ptx")
     assert isinstance(ptx_code, ObjectCode)
     assert ptx_code.name == "nvvm_multi_helper_test"
-    
+
     ltoir_code = program.compile("ltoir")
     assert isinstance(ltoir_code, ObjectCode)
     assert ltoir_code.name == "nvvm_multi_helper_test"
-    
+
     program.close()
 
+
 @nvvm_available
 def test_bitcode_format():
     import os
     from pathlib import Path
+
     bitcode_path = os.environ.get("BITCODE_NVVM_PATH")
     if not bitcode_path:
-        pytest.skip(f"BITCODE_NVVM_PATH environment variable is not set."
-                    "Disabling the test.")
+        pytest.skip("BITCODE_NVVM_PATH environment variable is not set.Disabling the test.")
     bitcode_file = Path(bitcode_path)
     if not bitcode_file.exists():
         pytest.skip(f"Bitcode file not found: {bitcode_path}")
-    
-    if not bitcode_file.suffix == '.bc':
+
+    if not bitcode_file.suffix == ".bc":
         pytest.skip(f"Expected .bc file, got: {bitcode_file.suffix}")
-    
+
     try:
-        with open(bitcode_file, 'rb') as f:
+        with open(bitcode_file, "rb") as f:
             bitcode_data = f.read()
-        
+
         if len(bitcode_data) < 4:
             pytest.skip("Bitcode file appears to be empty or invalid")
-        
-        options = ProgramOptions(
-            name=f"existing_bitcode_{bitcode_file.stem}",
-            arch="sm_90"
-        )
+
+        options = ProgramOptions(name=f"existing_bitcode_{bitcode_file.stem}", arch="sm_90")
         program = Program(bitcode_data, "nvvm", options)
-        
+
         assert program.backend == "NVVM"
         ptx_result = program.compile("ptx")
         assert isinstance(ptx_result, ObjectCode)
@@ -572,12 +571,11 @@ def test_bitcode_format():
     except Exception as e:
         pytest.fail(f"Failed to compile existing bitcode file {bitcode_path}: {str(e)}")
 
-       
+
 def test_cpp_program_with_extra_sources():
-    #negative test with NVRTC with multiple sources
+    # negative test with NVRTC with multiple sources
     code = 'extern "C" __global__ void my_kernel(){}'
     helper = 'extern "C" __global__ void helper(){}'
-    options = ProgramOptions(extra_sources = helper)
+    options = ProgramOptions(extra_sources=helper)
     with pytest.raises(ValueError, match="extra_sources is not supported by the NVRTC backend"):
         Program(code, "c++", options)
-    
\ No newline at end of file

From 7ca68992514e63cffbd51642a59a9f277e79540f Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Tue, 25 Nov 2025 18:14:22 +0000
Subject: [PATCH 06/50] fix format

---
 cuda_core/tests/test_program.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py
index 9aca1878d4..7d3e1c8709 100644
--- a/cuda_core/tests/test_program.py
+++ b/cuda_core/tests/test_program.py
@@ -416,7 +416,7 @@ def test_nvvm_program_options_ltoir(init_cuda, nvvm_ir, options):
     assert isinstance(ltoir_code, ObjectCode)
     assert ltoir_code.name == options.name
     code_content = ltoir_code.code
-    assert len(ltoir_code.code) > 0
+    assert len(code_content) > 0
     program.close()
 
 
@@ -439,7 +439,7 @@ def test_nvvm_program_with_single_extra_source(nvvm_ir):
 
 !nvvmir.version = !{{!0}}
 !0 = !{{i32 {major}, i32 {minor}, i32 {debug_major}, i32 {debug_minor}}}
-"""
+""" # noqa: E501
 
     options = ProgramOptions(name="multi_module_test", extra_sources=helper_nvvm_ir)
     program = Program(nvvm_ir, "nvvm", options)
@@ -487,7 +487,7 @@ def test_nvvm_program_with_multiple_extra_sources():
 
 !nvvmir.version = !{{!1}}
 !1 = !{{i32 {major}, i32 {minor}, i32 {debug_major}, i32 {debug_minor}}}
-"""
+""" # noqa: E501
 
     helper1_ir = f"""target triple = "nvptx64-unknown-cuda"
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
@@ -500,7 +500,7 @@ def test_nvvm_program_with_multiple_extra_sources():
 
 !nvvmir.version = !{{!0}}
 !0 = !{{i32 {major}, i32 {minor}, i32 {debug_major}, i32 {debug_minor}}}
-"""
+""" # noqa: E501
 
     helper2_ir = f"""target triple = "nvptx64-unknown-cuda"
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
@@ -513,7 +513,7 @@ def test_nvvm_program_with_multiple_extra_sources():
 
 !nvvmir.version = !{{!0}}
 !0 = !{{i32 {major}, i32 {minor}, i32 {debug_major}, i32 {debug_minor}}}
-"""
+""" # noqa: E501
 
     options = ProgramOptions(name="nvvm_multi_helper_test", extra_sources=[helper1_ir, helper2_ir])
     program = Program(main_nvvm_ir, "nvvm", options)
@@ -542,7 +542,7 @@ def test_bitcode_format():
     if not bitcode_file.exists():
         pytest.skip(f"Bitcode file not found: {bitcode_path}")
 
-    if not bitcode_file.suffix == ".bc":
+    if bitcode_file.suffix != ".bc":
         pytest.skip(f"Expected .bc file, got: {bitcode_file.suffix}")
 
     try:

From 0674ea17dda35f886279bdef2f33a4aa56071562 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 25 Nov 2025 18:15:33 +0000
Subject: [PATCH 07/50] [pre-commit.ci] auto code formatting

---
 cuda_core/tests/test_program.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py
index 7d3e1c8709..1b96a5f094 100644
--- a/cuda_core/tests/test_program.py
+++ b/cuda_core/tests/test_program.py
@@ -439,7 +439,7 @@ def test_nvvm_program_with_single_extra_source(nvvm_ir):
 
 !nvvmir.version = !{{!0}}
 !0 = !{{i32 {major}, i32 {minor}, i32 {debug_major}, i32 {debug_minor}}}
-""" # noqa: E501
+"""  # noqa: E501
 
     options = ProgramOptions(name="multi_module_test", extra_sources=helper_nvvm_ir)
     program = Program(nvvm_ir, "nvvm", options)
@@ -487,7 +487,7 @@ def test_nvvm_program_with_multiple_extra_sources():
 
 !nvvmir.version = !{{!1}}
 !1 = !{{i32 {major}, i32 {minor}, i32 {debug_major}, i32 {debug_minor}}}
-""" # noqa: E501
+"""  # noqa: E501
 
     helper1_ir = f"""target triple = "nvptx64-unknown-cuda"
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
@@ -500,7 +500,7 @@ def test_nvvm_program_with_multiple_extra_sources():
 
 !nvvmir.version = !{{!0}}
 !0 = !{{i32 {major}, i32 {minor}, i32 {debug_major}, i32 {debug_minor}}}
-""" # noqa: E501
+"""  # noqa: E501
 
     helper2_ir = f"""target triple = "nvptx64-unknown-cuda"
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
@@ -513,7 +513,7 @@ def test_nvvm_program_with_multiple_extra_sources():
 
 !nvvmir.version = !{{!0}}
 !0 = !{{i32 {major}, i32 {minor}, i32 {debug_major}, i32 {debug_minor}}}
-""" # noqa: E501
+"""  # noqa: E501
 
     options = ProgramOptions(name="nvvm_multi_helper_test", extra_sources=[helper1_ir, helper2_ir])
     program = Program(main_nvvm_ir, "nvvm", options)

From 03b122466a0eb10b4adb1b692b4a36b017f7241e Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Wed, 26 Nov 2025 10:26:42 +0000
Subject: [PATCH 08/50] refresh

---
 cuda_core/cuda/core/experimental/_program.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py
index eadd37ca95..7d3ea15e46 100644
--- a/cuda_core/cuda/core/experimental/_program.py
+++ b/cuda_core/cuda/core/experimental/_program.py
@@ -479,11 +479,11 @@ def __init__(self, code, code_type, options: ProgramOptions = None):
         if code_type == "c++":
             assert_type(code, str)
             # TODO: support pre-loaded headers & include names
-            # TODO: allow tuples once NVIDIA/cuda-python#72 is resolved
-
+            
             if options.extra_sources is not None:
                 raise ValueError("extra_sources is not supported by the NVRTC backend (C++ code_type)")
 
+            # TODO: allow tuples once NVIDIA/cuda-python#72 is resolved
             self._mnff.handle = handle_return(nvrtc.nvrtcCreateProgram(code.encode(), options._name, 0, [], []))
             self._mnff.backend = "NVRTC"
             self._backend = "NVRTC"

From 033f11cacf560de6ccf1bf9605070cc912081ddf Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Mon, 1 Dec 2025 11:45:49 +0000
Subject: [PATCH 09/50] apply bitcode file from cupy_test helpers

---
 cuda_core/tests/test_program.py               |  61 ++++----
 .../cuda_python_test_helpers/nvvm_bitcode.py  | 136 ++++++++++++++++++
 2 files changed, 163 insertions(+), 34 deletions(-)
 create mode 100644 cuda_python_test_helpers/cuda_python_test_helpers/nvvm_bitcode.py

diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py
index 1b96a5f094..c62e1712f6 100644
--- a/cuda_core/tests/test_program.py
+++ b/cuda_core/tests/test_program.py
@@ -14,6 +14,13 @@
 cuda_driver_version = handle_return(driver.cuDriverGetVersion())
 is_culink_backend = _linker._decide_nvjitlink_or_driver()
 
+try:
+    from cuda_python_test_helpers.nvvm_bitcode import (
+        minimal_nvvmir
+    )
+    _test_helpers_available = True
+except ImportError:
+    _test_helpers_available = False
 
 def _is_nvvm_available():
     """Check if NVVM is available."""
@@ -531,46 +538,32 @@ def test_nvvm_program_with_multiple_extra_sources():
 
 
 @nvvm_available
-def test_bitcode_format():
+@pytest.mark.skipif(not _test_helpers_available, reason = "cuda_python_test_helpers not accessible")
+def test_bitcode_format(minimal_nvvmir):
     import os
     from pathlib import Path
+    
+    if len(minimal_nvvmir) < 4:
+        pytest.skip("Bitcode file is not valid or empty")
 
-    bitcode_path = os.environ.get("BITCODE_NVVM_PATH")
-    if not bitcode_path:
-        pytest.skip("BITCODE_NVVM_PATH environment variable is not set.Disabling the test.")
-    bitcode_file = Path(bitcode_path)
-    if not bitcode_file.exists():
-        pytest.skip(f"Bitcode file not found: {bitcode_path}")
-
-    if bitcode_file.suffix != ".bc":
-        pytest.skip(f"Expected .bc file, got: {bitcode_file.suffix}")
+    options = ProgramOptions(name=f"minimal_nvvmir_bitcode_test", arch="sm_90")
+    program = Program(minimal_nvvmir, "nvvm", options)
 
+    assert program.backend == "NVVM"
+    ptx_result = program.compile("ptx")
+    assert isinstance(ptx_result, ObjectCode)
+    assert ptx_result.name == "minimal_nvvmir_bitcode_test"
+    assert len(ptx_result.code) > 0
+    prgram_lto = Program(minimal_nvvmir, "nvvm", options)
     try:
-        with open(bitcode_file, "rb") as f:
-            bitcode_data = f.read()
-
-        if len(bitcode_data) < 4:
-            pytest.skip("Bitcode file appears to be empty or invalid")
-
-        options = ProgramOptions(name=f"existing_bitcode_{bitcode_file.stem}", arch="sm_90")
-        program = Program(bitcode_data, "nvvm", options)
-
-        assert program.backend == "NVVM"
-        ptx_result = program.compile("ptx")
-        assert isinstance(ptx_result, ObjectCode)
-        assert ptx_result.name.startswith("existing_bitcode_")
-        assert len(ptx_result.code) > 0
-        try:
-            ltoir_result = program.compile("ltoir")
-            assert isinstance(ltoir_result, ObjectCode)
-            assert len(ltoir_result.code) > 0
-            print(f"LTOIR size: {len(ltoir_result.code)} bytes")
-        except Exception as e:
-            print(f"LTOIR compilation failed : {e}")
-        program.close()
+        ltoir_result = program_lto.compile("ltoir")
+        assert isinstance(ltoir_result, ObjectCode)
+        assert len(ltoir_result.code) > 0
+        print(f"LTOIR size: {len(ltoir_result.code)} bytes")
     except Exception as e:
-        pytest.fail(f"Failed to compile existing bitcode file {bitcode_path}: {str(e)}")
-
+        print(f"LTOIR compilation failed : {e}")
+    finally:
+        program.close()
 
 def test_cpp_program_with_extra_sources():
     # negative test with NVRTC with multiple sources
diff --git a/cuda_python_test_helpers/cuda_python_test_helpers/nvvm_bitcode.py b/cuda_python_test_helpers/cuda_python_test_helpers/nvvm_bitcode.py
new file mode 100644
index 0000000000..cba22d5425
--- /dev/null
+++ b/cuda_python_test_helpers/cuda_python_test_helpers/nvvm_bitcode.py
@@ -0,0 +1,136 @@
+import binascii
+import re
+from contextlib import contextmanager
+
+import pytest
+from cuda.bindings import nvvm
+
+MINIMAL_NVVMIR_TXT_TEMPLATE = b"""\
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+
+target triple = "nvptx64-nvidia-cuda"
+
+define void @kernel() {
+entry:
+  ret void
+}
+
+!nvvm.annotations = !{!0}
+!0 = !{void ()* @kernel, !"kernel", i32 1}
+
+!nvvmir.version = !{!1}
+!1 = !{i32 %d, i32 0, i32 %d, i32 0}
+"""  # noqa: E501
+
+MINIMAL_NVVMIR_BITCODE_STATIC = {
+    (1, 3):  # (major, debug_major)
+    "4243c0de3514000005000000620c30244a59be669dfbb4bf0b51804c01000000210c00007f010000"
+    "0b02210002000000160000000781239141c80449061032399201840c250508191e048b62800c4502"
+    "42920b42641032143808184b0a3232884870c421234412878c1041920264c808b1142043468820c9"
+    "01323284182a282a90317cb05c9120c3c8000000892000000b0000003222c80820624600212b2498"
+    "0c212524980c19270c85a4906032645c20246382a01801300128030173046000132677b00778a007"
+    "7cb0033a680377b0877420877408873618877a208770d8e012e5d006f0a0077640077a600774a007"
+    "7640076d900e71a00778a00778d006e980077a80077a80076d900e7160077a100776a0077160076d"
+    "900e7320077a300772a0077320076d900e7640077a600774a0077640076d900e71200778a0077120"
+    "0778a00771200778d006e6300772a0077320077a300772d006e6600774a0077640077a600774d006"
+    "f6100776a0077160077a100776d006f6300772a0077320077a300772d006f6600774a0077640077a"
+    "600774d006f610077280077a10077280077a10077280076de00e7160077a300772a0077640071a21"
+    "4c0e11de9c2e4fbbcfbe211560040000000000000000000000000620b141a0e86000004016080000"
+    "06000000321e980c19114c908c092647c6044362098c009401000000b1180000ac0000003308801c"
+    "c4e11c6614013d88433884c38c4280077978077398710ce6000fed100ef4800e330c421ec2c11dce"
+    "a11c6630053d88433884831bcc033dc8433d8c033dcc788c7470077b08077948877070077a700376"
+    "788770208719cc110eec900ee1300f6e300fe3f00ef0500e3310c41dde211cd8211dc2611e663089"
+    "3bbc833bd04339b4033cbc833c84033bccf0147660077b6807376887726807378087709087706007"
+    "76280776f8057678877780875f08877118877298877998812ceef00eeee00ef5c00eec300362c8a1"
+    "1ce4a11ccca11ce4a11cdc611cca211cc4811dca6106d6904339c84339984339c84339b8c3389443"
+    "3888033b94c32fbc833cfc823bd4033bb0c30cc7698770588772708374680778608774188774a087"
+    "19ce530fee000ff2500ee4900ee3400fe1200eec500e3320281ddcc11ec2411ed2211cdc811edce0"
+    "1ce4e11dea011e66185138b0433a9c833bcc50247660077b68073760877778077898514cf4900ff0"
+    "500e331e6a1eca611ce8211ddec11d7e011ee4a11ccc211df0610654858338ccc33bb0433dd04339"
+    "fcc23ce4433b88c33bb0c38cc50a877998877718877408077a28077298815ce3100eecc00ee5500e"
+    "f33023c1d2411ee4e117d8e11dde011e6648193bb0833db4831b84c3388c4339ccc33cb8c139c8c3"
+    "3bd4033ccc48b471080776600771088771588719dbc60eec600fede006f0200fe5300fe5200ff650"
+    "0e6e100ee3300ee5300ff3e006e9e00ee4500ef83023e2ec611cc2811dd8e117ec211de6211dc421"
+    "1dd8211de8211f66209d3bbc433db80339948339cc58bc7070077778077a08077a488777708719cb"
+    "e70eef300fe1e00ee9400fe9a00fe530c3010373a8077718875f988770708774a08774d087729881"
+    "844139e0c338b0433d904339cc40c4a01dcaa11de0411edec11c662463300ee1c00eec300fe9400f"
+    "e5000000792000001d000000721e482043880c19097232482023818c9191d144a01028643c313242"
+    "8e9021a318100a00060000006b65726e656c0000230802308240042308843082400c330c4230cc40"
+    "0c4441c84860821272b3b36b730973737ba30ba34b7b739b1b2528d271b3b36b4b9373b12b939b4b"
+    "7b731b2530000000a9180000250000000b0a7228877780077a587098433db8c338b04339d0c382e6"
+    "1cc6a10de8411ec2c11de6211de8211ddec11d1634e3600ee7500fe1200fe4400fe1200fe7500ef4"
+    "b08081077928877060077678877108077a28077258709cc338b4013ba4833d94c3026b1cd8211cdc"
+    "e11cdc201ce4611cdc201ce8811ec2611cd0a11cc8611cc2811dd861c1010ff4200fe1500ff4800e"
+    "00000000d11000000600000007cc3ca4833b9c033b94033da0833c94433890c30100000061200000"
+    "06000000130481860301000002000000075010cd14610000000000007120000003000000320e1022"
+    "8400fb020000000000000000650c00001f000000120394f000000000030000000600000006000000"
+    "4c000000010000005800000000000000580000000100000070000000000000000c00000013000000"
+    "1f000000080000000600000000000000700000000000000000000000010000000000000000000000"
+    "060000000000000006000000ffffffff00240000000000005d0c00000d0000001203946700000000"
+    "6b65726e656c31352e302e376e7670747836342d6e76696469612d637564613c737472696e673e00"
+    "00000000",
+    (2, 3):  # (major, debug_major)
+    "4243c0de3514000005000000620c30244a59be669dfbb4bf0b51804c01000000210c000080010000"
+    "0b02210002000000160000000781239141c80449061032399201840c250508191e048b62800c4502"
+    "42920b42641032143808184b0a3232884870c421234412878c1041920264c808b1142043468820c9"
+    "01323284182a282a90317cb05c9120c3c8000000892000000b0000003222c80820624600212b2498"
+    "0c212524980c19270c85a4906032645c20246382a01801300128030173046000132677b00778a007"
+    "7cb0033a680377b0877420877408873618877a208770d8e012e5d006f0a0077640077a600774a007"
+    "7640076d900e71a00778a00778d006e980077a80077a80076d900e7160077a100776a0077160076d"
+    "900e7320077a300772a0077320076d900e7640077a600774a0077640076d900e71200778a0077120"
+    "0778a00771200778d006e6300772a0077320077a300772d006e6600774a0077640077a600774d006"
+    "f6100776a0077160077a100776d006f6300772a0077320077a300772d006f6600774a0077640077a"
+    "600774d006f610077280077a10077280077a10077280076de00e7160077a300772a0077640071a21"
+    "4c0e11de9c2e4fbbcfbe211560040000000000000000000000000620b141a0286100004016080000"
+    "06000000321e980c19114c908c092647c60443620914c10840190000b1180000ac0000003308801c"
+    "c4e11c6614013d88433884c38c4280077978077398710ce6000fed100ef4800e330c421ec2c11dce"
+    "a11c6630053d88433884831bcc033dc8433d8c033dcc788c7470077b08077948877070077a700376"
+    "788770208719cc110eec900ee1300f6e300fe3f00ef0500e3310c41dde211cd8211dc2611e663089"
+    "3bbc833bd04339b4033cbc833c84033bccf0147660077b6807376887726807378087709087706007"
+    "76280776f8057678877780875f08877118877298877998812ceef00eeee00ef5c00eec300362c8a1"
+    "1ce4a11ccca11ce4a11cdc611cca211cc4811dca6106d6904339c84339984339c84339b8c3389443"
+    "3888033b94c32fbc833cfc823bd4033bb0c30cc7698770588772708374680778608774188774a087"
+    "19ce530fee000ff2500ee4900ee3400fe1200eec500e3320281ddcc11ec2411ed2211cdc811edce0"
+    "1ce4e11dea011e66185138b0433a9c833bcc50247660077b68073760877778077898514cf4900ff0"
+    "500e331e6a1eca611ce8211ddec11d7e011ee4a11ccc211df0610654858338ccc33bb0433dd04339"
+    "fcc23ce4433b88c33bb0c38cc50a877998877718877408077a28077298815ce3100eecc00ee5500e"
+    "f33023c1d2411ee4e117d8e11dde011e6648193bb0833db4831b84c3388c4339ccc33cb8c139c8c3"
+    "3bd4033ccc48b471080776600771088771588719dbc60eec600fede006f0200fe5300fe5200ff650"
+    "0e6e100ee3300ee5300ff3e006e9e00ee4500ef83023e2ec611cc2811dd8e117ec211de6211dc421"
+    "1dd8211de8211f66209d3bbc433db80339948339cc58bc7070077778077a08077a488777708719cb"
+    "e70eef300fe1e00ee9400fe9a00fe530c3010373a8077718875f988770708774a08774d087729881"
+    "844139e0c338b0433d904339cc40c4a01dcaa11de0411edec11c662463300ee1c00eec300fe9400f"
+    "e5000000792000001e000000721e482043880c19097232482023818c9191d144a01028643c313242"
+    "8e9021a318100a00060000006b65726e656c0000230802308240042308843082400c23080431c320"
+    "04c30c045118858c04262821373bbb36973037b737ba30bab437b7b95102231d373bbbb6343917bb"
+    "32b9b9b437b7518203000000a9180000250000000b0a7228877780077a587098433db8c338b04339"
+    "d0c382e61cc6a10de8411ec2c11de6211de8211ddec11d1634e3600ee7500fe1200fe4400fe1200f"
+    "e7500ef4b08081077928877060077678877108077a28077258709cc338b4013ba4833d94c3026b1c"
+    "d8211cdce11cdc201ce4611cdc201ce8811ec2611cd0a11cc8611cc2811dd861c1010ff4200fe150"
+    "0ff4800e00000000d11000000600000007cc3ca4833b9c033b94033da0833c94433890c301000000"
+    "6120000006000000130481860301000002000000075010cd14610000000000007120000003000000"
+    "320e10228400fc020000000000000000650c00001f000000120394f0000000000300000006000000"
+    "060000004c000000010000005800000000000000580000000100000070000000000000000c000000"
+    "130000001f0000000800000006000000000000007000000000000000000000000100000000000000"
+    "00000000060000000000000006000000ffffffff00240000000000005d0c00000d00000012039467"
+    "000000006b65726e656c31352e302e376e7670747836342d6e76696469612d637564613c73747269"
+    "6e673e0000000000",
+}
+
+@pytest.fixture(params=("txt", "bitcode_static"))
+def minimal_nvvmir(request):
+    major, minor, debug_major, debug_minor = nvvm.ir_version()
+
+    if request.param == "txt":
+        return MINIMAL_NVVMIR_TXT_TEMPLATE % (major, debug_major)
+
+    bitcode_static_binascii = MINIMAL_NVVMIR_BITCODE_STATIC.get((major, debug_major))
+    if bitcode_static_binascii:
+        return binascii.unhexlify(bitcode_static_binascii)
+    raise RuntimeError(
+        "Static bitcode for NVVM IR version "
+        f"{major}.{debug_major} is not available in this test.\n"
+        "Maintainers: Please run the helper script to generate it and add the "
+        "output to the MINIMAL_NVVMIR_BITCODE_STATIC dict:\n"
+        "  ../../toolshed/build_static_bitcode_input.py"
+    )
\ No newline at end of file

From 6e411ee8619868ec7ac6d4b4304a2e950edf490c Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Mon, 1 Dec 2025 17:44:23 +0000
Subject: [PATCH 10/50] use 2 tuples

---
 cuda_core/cuda/core/experimental/_program.py | 58 ++++++++++++--------
 cuda_core/tests/test_program.py              | 11 +++-
 2 files changed, 43 insertions(+), 26 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py
index 7d3ea15e46..ba6c4cf0af 100644
--- a/cuda_core/cuda/core/experimental/_program.py
+++ b/cuda_core/cuda/core/experimental/_program.py
@@ -298,7 +298,10 @@ class ProgramOptions:
     split_compile: int | None = None
     fdevice_syntax_only: bool | None = None
     minimal: bool | None = None
-    extra_sources: Union[str, bytes, list[Union[str, bytes]], tuple[Union[str, bytes]]] | None = None
+    # Creating as 2 tuples ((names, source), (names,source))
+    extra_sources: (
+        Union[List[Tuple[str, Union[str, bytes, bytearray]]], Tuple[Tuple[str, Union[str, bytes, bytearray]]]] | None
+    ) = None
 
     def __post_init__(self):
         self._name = self.name.encode()
@@ -468,13 +471,14 @@ def close(self):
                     nvvm.destroy_program(self.handle)
                 self.handle = None
 
-    __slots__ = ("__weakref__", "_mnff", "_backend", "_linker", "_options")
+    __slots__ = ("__weakref__", "_mnff", "_backend", "_linker", "_options", "_module_count")
 
     def __init__(self, code, code_type, options: ProgramOptions = None):
         self._mnff = Program._MembersNeededForFinalize(self, None, None)
 
         self._options = options = check_or_create_options(ProgramOptions, options, "Program options")
         code_type = code_type.lower()
+        self._module_count = 0
 
         if code_type == "c++":
             assert_type(code, str)
@@ -509,28 +513,36 @@ def __init__(self, code, code_type, options: ProgramOptions = None):
             self._mnff.handle = nvvm.create_program()
             self._mnff.backend = "NVVM"
             nvvm.add_module_to_program(self._mnff.handle, code, len(code), options._name.decode())
+            self._module_count = 1
+             # Add extra modules if provided
             if options.extra_sources is not None:
-                if isinstance(options.extra_sources, (str, bytes)):
-                    extra_sources = [options.extra_sources]
-                elif isinstance(options.extra_sources, (list, tuple)):
-                    extra_sources = options.extra_sources
-                else:
-                    raise TypeError("extra_sources must be str, bytes, list, or tuple")
-
-                if len(extra_sources) == 0:
-                    raise ValueError("extra_sources cannot be empty if provided")
-
-                for i, extra_source in enumerate(extra_sources):
-                    if isinstance(extra_source, str):
-                        extra_source = extra_source.encode("utf-8")
-                    elif not isinstance(extra_source, (bytes, bytearray)):
-                        raise TypeError(f"Extra source {i} must be provided as str, bytes, or bytearray")
-
-                    if len(extra_source) == 0:
-                        raise ValueError(f"Extra source {i} cannot be empty")
-
-                    extra_name = f"{options.name}_extra_{i}"
-                    nvvm.add_module_to_program(self._mnff.handle, extra_source, len(extra_source), extra_name)
+                if not is_sequence(options.extra_sources):
+                    raise TypeError("extra_modules must be a sequence of 2-tuples: ((name1, source1), (name2, source2), ...)")
+                for i, module in enumerate(options.extra_sources):
+                    if not isinstance(module, tuple) or len(module) != 2:
+                        raise TypeError(
+                            f"Each extra module must be a 2-tuple (name, source), got {type(module).__name__} at index {i}"
+                        )
+                    
+                    module_name, module_source = module
+                    
+                    if not isinstance(module_name, str):
+                        raise TypeError(f"Module name at index {i} must be a string, got {type(module_name).__name__}")
+                    
+                    if isinstance(module_source, str):
+                        # Textual LLVM IR - encode to UTF-8 bytes
+                        module_source = module_source.encode("utf-8")
+                    elif not isinstance(module_source, (bytes, bytearray)):
+                        raise TypeError(
+                            f"Module source at index {i} must be str (textual LLVM IR), bytes (textual LLVM IR or bitcode), "
+                            f"or bytearray, got {type(module_source).__name__}"
+                        )
+                    
+                    if len(module_source) == 0:
+                        raise ValueError(f"Module source for '{module_name}' (index {i}) cannot be empty")
+                    
+                    nvvm.add_module_to_program(self._mnff.handle, module_source, len(module_source), module_name)
+                    self._module_count += 1
 
             self._backend = "NVVM"
             self._linker = None
diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py
index c62e1712f6..aec975e84e 100644
--- a/cuda_core/tests/test_program.py
+++ b/cuda_core/tests/test_program.py
@@ -435,7 +435,7 @@ def test_nvvm_program_with_single_extra_source(nvvm_ir):
     nvvm = _get_nvvm_module()
     major, minor, debug_major, debug_minor = nvvm.ir_version()
     # helper nvvm ir for multiple module loading
-    helper_nvvm_ir = f"""target triple = "nvptx64-unknown-cuda"
+    helper_nvvmir = f"""target triple = "nvptx64-unknown-cuda"
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
 
 define i32 @helper_add(i32 %x) {{
@@ -448,7 +448,12 @@ def test_nvvm_program_with_single_extra_source(nvvm_ir):
 !0 = !{{i32 {major}, i32 {minor}, i32 {debug_major}, i32 {debug_minor}}}
 """  # noqa: E501
 
-    options = ProgramOptions(name="multi_module_test", extra_sources=helper_nvvm_ir)
+    options = ProgramOptions(
+            name="multi_module_test",
+            extra_sources=[
+                ("helper", helper_nvvmir),
+            ]
+        )   
     program = Program(nvvm_ir, "nvvm", options)
 
     assert program.backend == "NVVM"
@@ -522,7 +527,7 @@ def test_nvvm_program_with_multiple_extra_sources():
 !0 = !{{i32 {major}, i32 {minor}, i32 {debug_major}, i32 {debug_minor}}}
 """  # noqa: E501
 
-    options = ProgramOptions(name="nvvm_multi_helper_test", extra_sources=[helper1_ir, helper2_ir])
+    options = ProgramOptions(name="nvvm_multi_helper_test", extra_sources=[("helper1", helper1_ir), ("helper2", helper2_ir),])
     program = Program(main_nvvm_ir, "nvvm", options)
 
     assert program.backend == "NVVM"

From aeb26aa1f8dcdaf8a81f6715faa47c40703ced4e Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Wed, 3 Dec 2025 04:16:32 +0000
Subject: [PATCH 11/50] refresh

---
 cuda_core/cuda/core/experimental/_program.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py
index 9f74c3b10a..3aab5de3ae 100644
--- a/cuda_core/cuda/core/experimental/_program.py
+++ b/cuda_core/cuda/core/experimental/_program.py
@@ -303,7 +303,7 @@ class ProgramOptions:
         Union[List[Tuple[str, Union[str, bytes, bytearray]]], Tuple[Tuple[str, Union[str, bytes, bytearray]]]] | None
     ) = None
     numba_debug: bool | None = None  # Custom option for Numba debugging
-
+    
     def __post_init__(self):
         self._name = self.name.encode()
 
@@ -423,8 +423,6 @@ def __post_init__(self):
             self._formatted_options.append("--fdevice-syntax-only")
         if self.minimal is not None and self.minimal:
             self._formatted_options.append("--minimal")
-        if self.numba_debug:
-            self._formatted_options.append("--numba-debug")
 
     def _as_bytes(self):
         # TODO: allow tuples once NVIDIA/cuda-python#72 is resolved

From b3d6d960faa8f9ff130cd558e91b56213d1b448f Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Wed, 3 Dec 2025 07:54:25 +0000
Subject: [PATCH 12/50] format

---
 cuda_core/cuda/core/experimental/_program.py | 27 +++++-----
 cuda_core/tests/test_program.py              | 57 +++++++-------------
 2 files changed, 35 insertions(+), 49 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py
index 3aab5de3ae..efdb39807a 100644
--- a/cuda_core/cuda/core/experimental/_program.py
+++ b/cuda_core/cuda/core/experimental/_program.py
@@ -7,7 +7,7 @@
 import weakref
 from contextlib import contextmanager
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Union
+from typing import TYPE_CHECKING, List, Tuple, Union
 from warnings import warn
 
 if TYPE_CHECKING:
@@ -303,7 +303,7 @@ class ProgramOptions:
         Union[List[Tuple[str, Union[str, bytes, bytearray]]], Tuple[Tuple[str, Union[str, bytes, bytearray]]]] | None
     ) = None
     numba_debug: bool | None = None  # Custom option for Numba debugging
-    
+
     def __post_init__(self):
         self._name = self.name.encode()
 
@@ -484,7 +484,7 @@ def __init__(self, code, code_type, options: ProgramOptions = None):
         if code_type == "c++":
             assert_type(code, str)
             # TODO: support pre-loaded headers & include names
-            
+
             if options.extra_sources is not None:
                 raise ValueError("extra_sources is not supported by the NVRTC backend (C++ code_type)")
 
@@ -515,21 +515,24 @@ def __init__(self, code, code_type, options: ProgramOptions = None):
             self._mnff.backend = "NVVM"
             nvvm.add_module_to_program(self._mnff.handle, code, len(code), options._name.decode())
             self._module_count = 1
-             # Add extra modules if provided
+            # Add extra modules if provided
             if options.extra_sources is not None:
                 if not is_sequence(options.extra_sources):
-                    raise TypeError("extra_modules must be a sequence of 2-tuples: ((name1, source1), (name2, source2), ...)")
+                    raise TypeError(
+                        "extra_modules must be a sequence of 2-tuples:((name1, source1), (name2, source2), ...)"
+                    )
                 for i, module in enumerate(options.extra_sources):
                     if not isinstance(module, tuple) or len(module) != 2:
                         raise TypeError(
-                            f"Each extra module must be a 2-tuple (name, source), got {type(module).__name__} at index {i}"
+                            f"Each extra module must be a 2-tuple (name, source)"
+                            f", got {type(module).__name__} at index {i}"
                         )
-                    
+
                     module_name, module_source = module
-                    
+
                     if not isinstance(module_name, str):
-                        raise TypeError(f"Module name at index {i} must be a string, got {type(module_name).__name__}")
-                    
+                        raise TypeError(f"Module name at index {i} must be a string,got {type(module_name).__name__}")
+
                     if isinstance(module_source, str):
                         # Textual LLVM IR - encode to UTF-8 bytes
                         module_source = module_source.encode("utf-8")
@@ -538,10 +541,10 @@ def __init__(self, code, code_type, options: ProgramOptions = None):
                             f"Module source at index {i} must be str (textual LLVM IR), bytes (textual LLVM IR or bitcode), "
                             f"or bytearray, got {type(module_source).__name__}"
                         )
-                    
+
                     if len(module_source) == 0:
                         raise ValueError(f"Module source for '{module_name}' (index {i}) cannot be empty")
-                    
+
                     nvvm.add_module_to_program(self._mnff.handle, module_source, len(module_source), module_name)
                     self._module_count += 1
 
diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py
index e0b9a61023..f9cb8dc5f2 100644
--- a/cuda_core/tests/test_program.py
+++ b/cuda_core/tests/test_program.py
@@ -15,13 +15,13 @@
 is_culink_backend = _linker._decide_nvjitlink_or_driver()
 
 try:
-    from cuda_python_test_helpers.nvvm_bitcode import (
-        minimal_nvvmir
-    )
+    from cuda_python_test_helpers.nvvm_bitcode import minimal_nvvmir
+
     _test_helpers_available = True
 except ImportError:
     _test_helpers_available = False
 
+
 def _is_nvvm_available():
     """Check if NVVM is available."""
     try:
@@ -38,29 +38,12 @@ def _is_nvvm_available():
 )
 
 try:
-    from cuda.core.experimental._utils.cuda_utils import driver, handle_return, nvrtc
+    from cuda.core.experimental._utils.cuda_utils import driver, handle_return
 
     _cuda_driver_version = handle_return(driver.cuDriverGetVersion())
 except Exception:
     _cuda_driver_version = 0
 
-
-def _get_nvrtc_version_for_tests():
-    """
-    Get NVRTC version.
-
-    Returns:
-        int: Version in format major * 1000 + minor * 100 (e.g., 13200 for CUDA 13.2)
-        None: If NVRTC is not available
-    """
-    try:
-        nvrtc_major, nvrtc_minor = handle_return(nvrtc.nvrtcVersion())
-        version = nvrtc_major * 1000 + nvrtc_minor * 100
-        return version
-    except Exception:
-        return None
-
-
 _libnvvm_version = None
 _libnvvm_version_attempted = False
 
@@ -200,13 +183,6 @@ def ptx_code_object():
     [
         ProgramOptions(name="abc"),
         ProgramOptions(device_code_optimize=True, debug=True),
-        pytest.param(
-            ProgramOptions(debug=True, numba_debug=True),
-            marks=pytest.mark.skipif(
-                (_get_nvrtc_version_for_tests() or 0) < 13200,
-                reason="numba_debug requires NVRTC >= 13.2",
-            ),
-        ),
         ProgramOptions(relocatable_device_code=True, max_register_count=32),
         ProgramOptions(ftz=True, prec_sqrt=False, prec_div=False),
         ProgramOptions(fma=False, use_fast_math=True),
@@ -473,11 +449,11 @@ def test_nvvm_program_with_single_extra_source(nvvm_ir):
 """  # noqa: E501
 
     options = ProgramOptions(
-            name="multi_module_test",
-            extra_sources=[
-                ("helper", helper_nvvmir),
-            ]
-        )   
+        name="multi_module_test",
+        extra_sources=[
+            ("helper", helper_nvvmir),
+        ],
+    )
     program = Program(nvvm_ir, "nvvm", options)
 
     assert program.backend == "NVVM"
@@ -551,7 +527,13 @@ def test_nvvm_program_with_multiple_extra_sources():
 !0 = !{{i32 {major}, i32 {minor}, i32 {debug_major}, i32 {debug_minor}}}
 """  # noqa: E501
 
-    options = ProgramOptions(name="nvvm_multi_helper_test", extra_sources=[("helper1", helper1_ir), ("helper2", helper2_ir),])
+    options = ProgramOptions(
+        name="nvvm_multi_helper_test",
+        extra_sources=[
+            ("helper1", helper1_ir),
+            ("helper2", helper2_ir),
+        ],
+    )
     program = Program(main_nvvm_ir, "nvvm", options)
 
     assert program.backend == "NVVM"
@@ -567,11 +549,11 @@ def test_nvvm_program_with_multiple_extra_sources():
 
 
 @nvvm_available
-@pytest.mark.skipif(not _test_helpers_available, reason = "cuda_python_test_helpers not accessible")
+@pytest.mark.skipif(not _test_helpers_available, reason="cuda_python_test_helpers not accessible")
 def test_bitcode_format(minimal_nvvmir):
     import os
     from pathlib import Path
-    
+
     if len(minimal_nvvmir) < 4:
         pytest.skip("Bitcode file is not valid or empty")
 
@@ -583,7 +565,7 @@ def test_bitcode_format(minimal_nvvmir):
     assert isinstance(ptx_result, ObjectCode)
     assert ptx_result.name == "minimal_nvvmir_bitcode_test"
     assert len(ptx_result.code) > 0
-    prgram_lto = Program(minimal_nvvmir, "nvvm", options)
+    program_lto = Program(minimal_nvvmir, "nvvm", options)
     try:
         ltoir_result = program_lto.compile("ltoir")
         assert isinstance(ltoir_result, ObjectCode)
@@ -594,6 +576,7 @@ def test_bitcode_format(minimal_nvvmir):
     finally:
         program.close()
 
+
 def test_cpp_program_with_extra_sources():
     # negative test with NVRTC with multiple sources
     code = 'extern "C" __global__ void my_kernel(){}'

From edd6401cc5300c03adb583c021da60b6da18bfe6 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 3 Dec 2025 07:55:54 +0000
Subject: [PATCH 13/50] [pre-commit.ci] auto code formatting

---
 cuda_core/tests/test_program.py                              | 5 +----
 .../cuda_python_test_helpers/nvvm_bitcode.py                 | 5 ++---
 2 files changed, 3 insertions(+), 7 deletions(-)

diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py
index f9cb8dc5f2..d0de56a41c 100644
--- a/cuda_core/tests/test_program.py
+++ b/cuda_core/tests/test_program.py
@@ -551,13 +551,10 @@ def test_nvvm_program_with_multiple_extra_sources():
 @nvvm_available
 @pytest.mark.skipif(not _test_helpers_available, reason="cuda_python_test_helpers not accessible")
 def test_bitcode_format(minimal_nvvmir):
-    import os
-    from pathlib import Path
-
     if len(minimal_nvvmir) < 4:
         pytest.skip("Bitcode file is not valid or empty")
 
-    options = ProgramOptions(name=f"minimal_nvvmir_bitcode_test", arch="sm_90")
+    options = ProgramOptions(name="minimal_nvvmir_bitcode_test", arch="sm_90")
     program = Program(minimal_nvvmir, "nvvm", options)
 
     assert program.backend == "NVVM"
diff --git a/cuda_python_test_helpers/cuda_python_test_helpers/nvvm_bitcode.py b/cuda_python_test_helpers/cuda_python_test_helpers/nvvm_bitcode.py
index cba22d5425..e42dbff085 100644
--- a/cuda_python_test_helpers/cuda_python_test_helpers/nvvm_bitcode.py
+++ b/cuda_python_test_helpers/cuda_python_test_helpers/nvvm_bitcode.py
@@ -1,6 +1,4 @@
 import binascii
-import re
-from contextlib import contextmanager
 
 import pytest
 from cuda.bindings import nvvm
@@ -117,6 +115,7 @@
     "6e673e0000000000",
 }
 
+
 @pytest.fixture(params=("txt", "bitcode_static"))
 def minimal_nvvmir(request):
     major, minor, debug_major, debug_minor = nvvm.ir_version()
@@ -133,4 +132,4 @@ def minimal_nvvmir(request):
         "Maintainers: Please run the helper script to generate it and add the "
         "output to the MINIMAL_NVVMIR_BITCODE_STATIC dict:\n"
         "  ../../toolshed/build_static_bitcode_input.py"
-    )
\ No newline at end of file
+    )

From 8dbbafece0fc088fa8fbb7d042347f6ce905f9b5 Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Mon, 15 Dec 2025 17:59:25 +0000
Subject: [PATCH 14/50] fix from upstream

---
 cuda_core/cuda/core/experimental/_program.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py
index efdb39807a..2610bc1232 100644
--- a/cuda_core/cuda/core/experimental/_program.py
+++ b/cuda_core/cuda/core/experimental/_program.py
@@ -423,6 +423,8 @@ def __post_init__(self):
             self._formatted_options.append("--fdevice-syntax-only")
         if self.minimal is not None and self.minimal:
             self._formatted_options.append("--minimal")
+        if self.numba_debug:
+            self._formatted_options.append("--numba-debug")
 
     def _as_bytes(self):
         # TODO: allow tuples once NVIDIA/cuda-python#72 is resolved

From b78f0c3e820bce5b6f70903d3d83e10821b02fba Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Wed, 17 Dec 2025 13:22:24 +0000
Subject: [PATCH 15/50] refresh from upstream

---
 cuda_core/tests/test_program.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py
index 91567fdced..02b9899eea 100644
--- a/cuda_core/tests/test_program.py
+++ b/cuda_core/tests/test_program.py
@@ -184,6 +184,13 @@ def ptx_code_object():
     [
         ProgramOptions(name="abc"),
         ProgramOptions(device_code_optimize=True, debug=True),
+        pytest.param(
+            ProgramOptions(debug=True, numba_debug=True),
+            marks=pytest.mark.skipif(
+                (_get_nvrtc_version_for_tests() or 0) < 13200,
+                reason="numba_debug requires NVRTC >= 13.2",
+            ),
+        ),
         ProgramOptions(relocatable_device_code=True, max_register_count=32),
         ProgramOptions(ftz=True, prec_sqrt=False, prec_div=False),
         ProgramOptions(fma=False, use_fast_math=True),
@@ -714,4 +721,4 @@ def test_program_options_as_bytes_nvvm_unsupported_option():
     """Test that unsupported options raise CUDAError for NVVM backend"""
     options = ProgramOptions(arch="sm_80", lineinfo=True)
     with pytest.raises(CUDAError, match="not supported by NVVM backend"):
-        options.as_bytes("nvvm")
+        options.as_bytes("nvvm")
\ No newline at end of file

From 99a5593d0d82d5de399228eadf4fc6b6ed825ff6 Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Wed, 17 Dec 2025 13:27:55 +0000
Subject: [PATCH 16/50] fix tests

---
 cuda_core/tests/test_program.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py
index 02b9899eea..00d0709870 100644
--- a/cuda_core/tests/test_program.py
+++ b/cuda_core/tests/test_program.py
@@ -45,6 +45,21 @@ def _is_nvvm_available():
 except Exception:
     _cuda_driver_version = 0
 
+
+def _get_nvrtc_version_for_tests():
+    """
+    Get NVRTC version.
+    Returns:
+        int: Version in format major * 1000 + minor * 100 (e.g., 13200 for CUDA 13.2)
+        None: If NVRTC is not available
+    """
+    try:
+        nvrtc_major, nvrtc_minor = handle_return(nvrtc.nvrtcVersion())
+        version = nvrtc_major * 1000 + nvrtc_minor * 100
+        return version
+    except Exception:
+        return None
+
 _libnvvm_version = None
 _libnvvm_version_attempted = False
 

From 783f6e5a419b74757c7620824a7a23ec847321fd Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Wed, 17 Dec 2025 13:29:35 +0000
Subject: [PATCH 17/50] take path_finder from PR 447

---
 .../bindings/_internal/nvjitlink_linux.pyx    |   1 +
 .../cuda/bindings/_path_finder/cuda_paths.py  | 400 ++++++++++++++++++
 .../find_nvidia_dynamic_library.py            | 139 ++++++
 .../cuda/bindings/_path_finder/findlib.py     |  69 +++
 .../load_nvidia_dynamic_library.py            |  92 ++++
 .../_path_finder/sys_path_find_sub_dirs.py    |  40 ++
 cuda_bindings/cuda/bindings/path_finder.py    |  37 ++
 7 files changed, 778 insertions(+)
 create mode 100644 cuda_bindings/cuda/bindings/_path_finder/cuda_paths.py
 create mode 100644 cuda_bindings/cuda/bindings/_path_finder/find_nvidia_dynamic_library.py
 create mode 100644 cuda_bindings/cuda/bindings/_path_finder/findlib.py
 create mode 100644 cuda_bindings/cuda/bindings/_path_finder/load_nvidia_dynamic_library.py
 create mode 100644 cuda_bindings/cuda/bindings/_path_finder/sys_path_find_sub_dirs.py
 create mode 100644 cuda_bindings/cuda/bindings/path_finder.py

diff --git a/cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx b/cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx
index 8c96b6d640..d095b3f524 100644
--- a/cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx
+++ b/cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx
@@ -12,6 +12,7 @@ from .utils import FunctionNotFoundError, NotSupportedError
 from cuda.pathfinder import load_nvidia_dynamic_lib
 
 
+
 ###############################################################################
 # Extern
 ###############################################################################
diff --git a/cuda_bindings/cuda/bindings/_path_finder/cuda_paths.py b/cuda_bindings/cuda/bindings/_path_finder/cuda_paths.py
new file mode 100644
index 0000000000..c827699b3a
--- /dev/null
+++ b/cuda_bindings/cuda/bindings/_path_finder/cuda_paths.py
@@ -0,0 +1,400 @@
+import os
+import platform
+import re
+import site
+import sys
+import traceback
+import warnings
+from collections import namedtuple
+from pathlib import Path
+
+from .findlib import find_file, find_lib
+
+IS_WIN32 = sys.platform.startswith("win32")
+
+_env_path_tuple = namedtuple("_env_path_tuple", ["by", "info"])
+
+
+def _get_numba_CUDA_INCLUDE_PATH():
+    # From numba/numba/core/config.py
+
+    def _readenv(name, ctor, default):
+        value = os.environ.get(name)
+        if value is None:
+            return default() if callable(default) else default
+        try:
+            return ctor(value)
+        except Exception:
+            warnings.warn(  # noqa: B028
+                f"Environment variable '{name}' is defined but "
+                f"its associated value '{value}' could not be "
+                "parsed.\nThe parse failed with exception:\n"
+                f"{traceback.format_exc()}",
+                RuntimeWarning,
+            )
+            return default
+
+    if IS_WIN32:
+        cuda_path = os.environ.get("CUDA_PATH")
+        if cuda_path:  # noqa: SIM108
+            default_cuda_include_path = os.path.join(cuda_path, "include")
+        else:
+            default_cuda_include_path = "cuda_include_not_found"
+    else:
+        default_cuda_include_path = os.path.join(os.sep, "usr", "local", "cuda", "include")
+    CUDA_INCLUDE_PATH = _readenv("NUMBA_CUDA_INCLUDE_PATH", str, default_cuda_include_path)
+    return CUDA_INCLUDE_PATH
+
+
+config_CUDA_INCLUDE_PATH = _get_numba_CUDA_INCLUDE_PATH()
+
+
+def _find_valid_path(options):
+    """Find valid path from *options*, which is a list of 2-tuple of
+    (name, path).  Return first pair where *path* is not None.
+    If no valid path is found, return ('<unknown>', None)
+    """
+    for by, data in options:
+        if data is not None:
+            return by, data
+    else:
+        return "<unknown>", None
+
+
+def _get_libdevice_path_decision():
+    options = [
+        ("Conda environment", get_conda_ctk()),
+        ("Conda environment (NVIDIA package)", get_nvidia_libdevice_ctk()),
+        ("CUDA_HOME", get_cuda_home("nvvm", "libdevice")),
+        ("Debian package", get_debian_pkg_libdevice()),
+        ("NVIDIA NVCC Wheel", get_libdevice_wheel()),
+    ]
+    libdevice_ctk_dir = get_system_ctk("nvvm", "libdevice")
+    if libdevice_ctk_dir and os.path.exists(libdevice_ctk_dir):
+        options.append(("System", libdevice_ctk_dir))
+
+    by, libdir = _find_valid_path(options)
+    return by, libdir
+
+
+def _nvvm_lib_dir():
+    if IS_WIN32:
+        return "nvvm", "bin"
+    else:
+        return "nvvm", "lib64"
+
+
+def _get_nvvm_path_decision():
+    options = [
+        ("Conda environment", get_conda_ctk()),
+        ("Conda environment (NVIDIA package)", get_nvidia_nvvm_ctk()),
+        ("CUDA_HOME", get_cuda_home(*_nvvm_lib_dir())),
+        ("NVIDIA NVCC Wheel", _get_nvvm_wheel()),
+    ]
+    # need to ensure nvvm dir actually exists
+    nvvm_ctk_dir = get_system_ctk(*_nvvm_lib_dir())
+    if nvvm_ctk_dir and os.path.exists(nvvm_ctk_dir):
+        options.append(("System", nvvm_ctk_dir))
+
+    by, path = _find_valid_path(options)
+    return by, path
+
+
+def _get_nvvm_wheel():
+    site_paths = [site.getusersitepackages()] + site.getsitepackages() + ["conda", None]
+    for sp in site_paths:
+        # The SONAME is taken based on public CTK 12.x releases
+        if sys.platform.startswith("linux"):
+            dso_dir = "lib64"
+            # Hack: libnvvm from Linux wheel
+            # does not have any soname (CUDAINST-3183)
+            dso_path = "libnvvm.so"
+        elif sys.platform.startswith("win32"):
+            dso_dir = "bin"
+            dso_path = "nvvm64_40_0.dll"
+        else:
+            raise AssertionError()
+
+        if sp is not None:
+            dso_dir = os.path.join(sp, "nvidia", "cuda_nvcc", "nvvm", dso_dir)
+            dso_path = os.path.join(dso_dir, dso_path)
+            if os.path.exists(dso_path):
+                return str(Path(dso_path).parent)
+
+
+def _get_libdevice_paths():
+    by, libdir = _get_libdevice_path_decision()
+    if by == "NVIDIA NVCC Wheel":
+        # The NVVM path is a directory, not a file
+        out = os.path.join(libdir, "libdevice.10.bc")
+    else:
+        # Search for pattern
+        pat = r"libdevice(\.\d+)*\.bc$"
+        candidates = find_file(re.compile(pat), libdir)
+        # Keep only the max (most recent version) of the bitcode files.
+        out = max(candidates, default=None)
+    return _env_path_tuple(by, out)
+
+
+def _cudalib_path():
+    if IS_WIN32:
+        return "bin"
+    else:
+        return "lib64"
+
+
+def _cuda_home_static_cudalib_path():
+    if IS_WIN32:
+        return ("lib", "x64")
+    else:
+        return ("lib64",)
+
+
+def _get_cudalib_dir_path_decision():
+    options = [
+        ("Conda environment", get_conda_ctk()),
+        ("Conda environment (NVIDIA package)", get_nvidia_cudalib_ctk()),
+        ("CUDA_HOME", get_cuda_home(_cudalib_path())),
+        ("System", get_system_ctk(_cudalib_path())),
+    ]
+    by, libdir = _find_valid_path(options)
+    return by, libdir
+
+
+def _get_static_cudalib_dir_path_decision():
+    options = [
+        ("Conda environment", get_conda_ctk()),
+        ("Conda environment (NVIDIA package)", get_nvidia_static_cudalib_ctk()),
+        ("CUDA_HOME", get_cuda_home(*_cuda_home_static_cudalib_path())),
+        ("System", get_system_ctk(_cudalib_path())),
+    ]
+    by, libdir = _find_valid_path(options)
+    return by, libdir
+
+
+def _get_cudalib_dir():
+    by, libdir = _get_cudalib_dir_path_decision()
+    return _env_path_tuple(by, libdir)
+
+
+def _get_static_cudalib_dir():
+    by, libdir = _get_static_cudalib_dir_path_decision()
+    return _env_path_tuple(by, libdir)
+
+
+def get_system_ctk(*subdirs):
+    """Return path to system-wide cudatoolkit; or, None if it doesn't exist."""
+    # Linux?
+    if sys.platform.startswith("linux"):
+        # Is cuda alias to /usr/local/cuda?
+        # We are intentionally not getting versioned cuda installation.
+        base = "/usr/local/cuda"
+        if os.path.exists(base):
+            return os.path.join(base, *subdirs)
+
+
+def get_conda_ctk():
+    """Return path to directory containing the shared libraries of cudatoolkit."""
+    is_conda_env = os.path.exists(os.path.join(sys.prefix, "conda-meta"))
+    if not is_conda_env:
+        return
+    # Assume the existence of NVVM to imply cudatoolkit installed
+    paths = find_lib("nvvm")
+    if not paths:
+        return
+    # Use the directory name of the max path
+    return os.path.dirname(max(paths))
+
+
+def get_nvidia_nvvm_ctk():
+    """Return path to directory containing the NVVM shared library."""
+    is_conda_env = os.path.exists(os.path.join(sys.prefix, "conda-meta"))
+    if not is_conda_env:
+        return
+
+    # Assume the existence of NVVM in the conda env implies that a CUDA toolkit
+    # conda package is installed.
+
+    # First, try the location used on Linux and the Windows 11.x packages
+    libdir = os.path.join(sys.prefix, "nvvm", _cudalib_path())
+    if not os.path.exists(libdir) or not os.path.isdir(libdir):
+        # If that fails, try the location used for Windows 12.x packages
+        libdir = os.path.join(sys.prefix, "Library", "nvvm", _cudalib_path())
+        if not os.path.exists(libdir) or not os.path.isdir(libdir):
+            # If that doesn't exist either, assume we don't have the NVIDIA
+            # conda package
+            return
+
+    paths = find_lib("nvvm", libdir=libdir)
+    if not paths:
+        return
+    # Use the directory name of the max path
+    return os.path.dirname(max(paths))
+
+
+def get_nvidia_libdevice_ctk():
+    """Return path to directory containing the libdevice library."""
+    nvvm_ctk = get_nvidia_nvvm_ctk()
+    if not nvvm_ctk:
+        return
+    nvvm_dir = os.path.dirname(nvvm_ctk)
+    return os.path.join(nvvm_dir, "libdevice")
+
+
+def get_nvidia_cudalib_ctk():
+    """Return path to directory containing the shared libraries of cudatoolkit."""
+    nvvm_ctk = get_nvidia_nvvm_ctk()
+    if not nvvm_ctk:
+        return
+    env_dir = os.path.dirname(os.path.dirname(nvvm_ctk))
+    subdir = "bin" if IS_WIN32 else "lib"
+    return os.path.join(env_dir, subdir)
+
+
+def get_nvidia_static_cudalib_ctk():
+    """Return path to directory containing the static libraries of cudatoolkit."""
+    nvvm_ctk = get_nvidia_nvvm_ctk()
+    if not nvvm_ctk:
+        return
+
+    if IS_WIN32 and ("Library" not in nvvm_ctk):  # noqa: SIM108
+        # Location specific to CUDA 11.x packages on Windows
+        dirs = ("Lib", "x64")
+    else:
+        # Linux, or Windows with CUDA 12.x packages
+        dirs = ("lib",)
+
+    env_dir = os.path.dirname(os.path.dirname(nvvm_ctk))
+    return os.path.join(env_dir, *dirs)
+
+
+def get_cuda_home(*subdirs):
+    """Get paths of CUDA_HOME.
+    If *subdirs* are the subdirectory name to be appended in the resulting
+    path.
+    """
+    cuda_home = os.environ.get("CUDA_HOME")
+    if cuda_home is None:
+        # Try Windows CUDA installation without Anaconda
+        cuda_home = os.environ.get("CUDA_PATH")
+    if cuda_home is not None:
+        return os.path.join(cuda_home, *subdirs)
+
+
+def _get_nvvm_path():
+    by, path = _get_nvvm_path_decision()
+    if by == "NVIDIA NVCC Wheel":
+        # The NVVM path is a directory, not a file
+        path = os.path.join(path, "libnvvm.so")
+    else:
+        candidates = find_lib("nvvm", path)
+        path = max(candidates) if candidates else None
+    return _env_path_tuple(by, path)
+
+
+def get_cuda_paths():
+    """Returns a dictionary mapping component names to a 2-tuple
+    of (source_variable, info).
+    The returned dictionary will have the following keys and infos:
+    - "nvvm": file_path
+    - "libdevice": List[Tuple[arch, file_path]]
+    - "cudalib_dir": directory_path
+    Note: The result of the function is cached.
+    """
+    # Check cache
+    if hasattr(get_cuda_paths, "_cached_result"):
+        return get_cuda_paths._cached_result
+    else:
+        # Not in cache
+        d = {
+            "nvvm": _get_nvvm_path(),
+            "libdevice": _get_libdevice_paths(),
+            "cudalib_dir": _get_cudalib_dir(),
+            "static_cudalib_dir": _get_static_cudalib_dir(),
+            "include_dir": _get_include_dir(),
+        }
+        # Cache result
+        get_cuda_paths._cached_result = d
+        return d
+
+
+def get_debian_pkg_libdevice():
+    """
+    Return the Debian NVIDIA Maintainers-packaged libdevice location, if it
+    exists.
+    """
+    pkg_libdevice_location = "/usr/lib/nvidia-cuda-toolkit/libdevice"
+    if not os.path.exists(pkg_libdevice_location):
+        return None
+    return pkg_libdevice_location
+
+
+def get_libdevice_wheel():
+    nvvm_path = _get_nvvm_wheel()
+    if nvvm_path is None:
+        return None
+    nvvm_path = Path(nvvm_path)
+    libdevice_path = nvvm_path.parent / "libdevice"
+
+    return str(libdevice_path)
+
+
+def get_current_cuda_target_name():
+    """Determine conda's CTK target folder based on system and machine arch.
+    CTK's conda package delivers headers based on its architecture type. For example,
+    `x86_64` machine places header under `$CONDA_PREFIX/targets/x86_64-linux`, and
+    `aarch64` places under `$CONDA_PREFIX/targets/sbsa-linux`. Read more about the
+    nuances at cudart's conda feedstock:
+    https://github.com/conda-forge/cuda-cudart-feedstock/blob/main/recipe/meta.yaml#L8-L11  # noqa: E501
+    """
+    system = platform.system()
+    machine = platform.machine()
+
+    if system == "Linux":
+        arch_to_targets = {"x86_64": "x86_64-linux", "aarch64": "sbsa-linux"}
+    elif system == "Windows":
+        arch_to_targets = {
+            "AMD64": "x64",
+        }
+    else:
+        arch_to_targets = {}
+
+    return arch_to_targets.get(machine, None)
+
+
+def get_conda_include_dir():
+    """
+    Return the include directory in the current conda environment, if one
+    is active and it exists.
+    """
+    is_conda_env = os.path.exists(os.path.join(sys.prefix, "conda-meta"))
+    if not is_conda_env:
+        return
+
+    if platform.system() == "Windows":
+        include_dir = os.path.join(sys.prefix, "Library", "include")
+    elif target_name := get_current_cuda_target_name():
+        include_dir = os.path.join(sys.prefix, "targets", target_name, "include")
+    else:
+        # A fallback when target cannot determined
+        # though usually it shouldn't.
+        include_dir = os.path.join(sys.prefix, "include")
+
+    if (
+        os.path.exists(include_dir)
+        and os.path.isdir(include_dir)
+        and os.path.exists(os.path.join(include_dir, "cuda_device_runtime_api.h"))
+    ):
+        return include_dir
+    return
+
+
+def _get_include_dir():
+    """Find the root include directory."""
+    options = [
+        ("Conda environment (NVIDIA package)", get_conda_include_dir()),
+        ("CUDA_INCLUDE_PATH Config Entry", config_CUDA_INCLUDE_PATH),
+        # TODO: add others
+    ]
+    by, include_dir = _find_valid_path(options)
+    return _env_path_tuple(by, include_dir)
\ No newline at end of file
diff --git a/cuda_bindings/cuda/bindings/_path_finder/find_nvidia_dynamic_library.py b/cuda_bindings/cuda/bindings/_path_finder/find_nvidia_dynamic_library.py
new file mode 100644
index 0000000000..d8413282e6
--- /dev/null
+++ b/cuda_bindings/cuda/bindings/_path_finder/find_nvidia_dynamic_library.py
@@ -0,0 +1,139 @@
+# Copyright 2024-2025 NVIDIA Corporation.  All rights reserved.
+#
+# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+
+import functools
+import glob
+import os
+
+from .cuda_paths import IS_WIN32, get_cuda_paths
+from .sys_path_find_sub_dirs import sys_path_find_sub_dirs
+
+
+def _no_such_file_in_sub_dirs(sub_dirs, file_wild, error_messages, attachments):
+    error_messages.append(f"No such file: {file_wild}")
+    for sub_dir in sys_path_find_sub_dirs(sub_dirs):
+        attachments.append(f'  listdir("{sub_dir}"):')
+        for node in sorted(os.listdir(sub_dir)):
+            attachments.append(f"    {node}")
+
+
+def _find_so_using_nvidia_lib_dirs(libname, so_basename, error_messages, attachments):
+    if libname == "nvvm":  # noqa: SIM108
+        nvidia_sub_dirs = ("nvidia", "*", "nvvm", "lib64")
+    else:
+        nvidia_sub_dirs = ("nvidia", "*", "lib")
+    file_wild = so_basename + "*"
+    for lib_dir in sys_path_find_sub_dirs(nvidia_sub_dirs):
+        # First look for an exact match
+        so_name = os.path.join(lib_dir, so_basename)
+        if os.path.isfile(so_name):
+            return so_name
+        # Look for a versioned library
+        # Using sort here mainly to make the result deterministic.
+        for node in sorted(glob.glob(os.path.join(lib_dir, file_wild))):
+            so_name = os.path.join(lib_dir, node)
+            if os.path.isfile(so_name):
+                return so_name
+    _no_such_file_in_sub_dirs(nvidia_sub_dirs, file_wild, error_messages, attachments)
+    return None
+
+
+def _find_dll_using_nvidia_bin_dirs(libname, error_messages, attachments):
+    if libname == "nvvm":  # noqa: SIM108
+        nvidia_sub_dirs = ("nvidia", "*", "nvvm", "bin")
+    else:
+        nvidia_sub_dirs = ("nvidia", "*", "bin")
+    file_wild = libname + "*.dll"
+    for bin_dir in sys_path_find_sub_dirs(nvidia_sub_dirs):
+        for node in sorted(glob.glob(os.path.join(bin_dir, file_wild))):
+            dll_name = os.path.join(bin_dir, node)
+            if os.path.isfile(dll_name):
+                return dll_name
+    _no_such_file_in_sub_dirs(nvidia_sub_dirs, file_wild, error_messages, attachments)
+    return None
+
+
+def _get_cuda_paths_info(key, error_messages):
+    env_path_tuple = get_cuda_paths()[key]
+    if not env_path_tuple:
+        error_messages.append(f'Failure obtaining get_cuda_paths()["{key}"]')
+        return None
+    if not env_path_tuple.info:
+        error_messages.append(f'Failure obtaining get_cuda_paths()["{key}"].info')
+        return None
+    return env_path_tuple.info
+
+
+def _find_so_using_cudalib_dir(so_basename, error_messages, attachments):
+    cudalib_dir = _get_cuda_paths_info("cudalib_dir", error_messages)
+    if cudalib_dir is None:
+        return None
+    primary_so_dir = cudalib_dir + "/"
+    candidate_so_dirs = [primary_so_dir]
+    libs = ["/lib/", "/lib64/"]
+    for _ in range(2):
+        alt_dir = libs[0].join(primary_so_dir.rsplit(libs[1], 1))
+        if alt_dir not in candidate_so_dirs:
+            candidate_so_dirs.append(alt_dir)
+        libs.reverse()
+    candidate_so_names = [so_dirname + so_basename for so_dirname in candidate_so_dirs]
+    error_messages = []
+    for so_name in candidate_so_names:
+        if os.path.isfile(so_name):
+            return so_name
+        error_messages.append(f"No such file: {so_name}")
+    for so_dirname in candidate_so_dirs:
+        attachments.append(f'  listdir("{so_dirname}"):')
+        if not os.path.isdir(so_dirname):
+            attachments.append("    DIRECTORY DOES NOT EXIST")
+        else:
+            for node in sorted(os.listdir(so_dirname)):
+                attachments.append(f"    {node}")
+    return None
+
+
+def _find_dll_using_cudalib_dir(libname, error_messages, attachments):
+    cudalib_dir = _get_cuda_paths_info("cudalib_dir", error_messages)
+    if cudalib_dir is None:
+        return None
+    file_wild = libname + "*.dll"
+    for node in sorted(glob.glob(os.path.join(cudalib_dir, file_wild))):
+        dll_name = os.path.join(cudalib_dir, node)
+        if os.path.isfile(dll_name):
+            return dll_name
+    error_messages.append(f"No such file: {file_wild}")
+    attachments.append(f'  listdir("{cudalib_dir}"):')
+    for node in sorted(os.listdir(cudalib_dir)):
+        attachments.append(f"    {node}")
+    return None
+
+
+@functools.cache
+def find_nvidia_dynamic_library(name: str) -> str:
+    error_messages = []
+    attachments = []
+
+    if IS_WIN32:
+        dll_name = _find_dll_using_nvidia_bin_dirs(name, error_messages, attachments)
+        if dll_name is None:
+            if name == "nvvm":
+                dll_name = _get_cuda_paths_info("nvvm", error_messages)
+            else:
+                dll_name = _find_dll_using_cudalib_dir(name, error_messages, attachments)
+        if dll_name is None:
+            attachments = "\n".join(attachments)
+            raise RuntimeError(f"Failure finding {name}*.dll: {', '.join(error_messages)}\n{attachments}")
+        return dll_name
+
+    so_basename = f"lib{name}.so"
+    so_name = _find_so_using_nvidia_lib_dirs(name, so_basename, error_messages, attachments)
+    if so_name is None:
+        if name == "nvvm":
+            so_name = _get_cuda_paths_info("nvvm", error_messages)
+        else:
+            so_name = _find_so_using_cudalib_dir(so_basename, error_messages, attachments)
+    if so_name is None:
+        attachments = "\n".join(attachments)
+        raise RuntimeError(f"Failure finding {so_basename}: {', '.join(error_messages)}\n{attachments}")
+    return so_name
\ No newline at end of file
diff --git a/cuda_bindings/cuda/bindings/_path_finder/findlib.py b/cuda_bindings/cuda/bindings/_path_finder/findlib.py
new file mode 100644
index 0000000000..c64c3c9577
--- /dev/null
+++ b/cuda_bindings/cuda/bindings/_path_finder/findlib.py
@@ -0,0 +1,69 @@
+# Forked from:
+# https://github.com/numba/numba/blob/f0d24824fcd6a454827e3c108882395d00befc04/numba/misc/findlib.py
+
+import os
+import re
+import sys
+
+
+def get_lib_dirs():
+    """
+    Anaconda specific
+    """
+    if sys.platform == "win32":
+        # on windows, historically `DLLs` has been used for CUDA libraries,
+        # since approximately CUDA 9.2, `Library\bin` has been used.
+        dirnames = ["DLLs", os.path.join("Library", "bin")]
+    else:
+        dirnames = [
+            "lib",
+        ]
+    libdirs = [os.path.join(sys.prefix, x) for x in dirnames]
+    return libdirs
+
+
+DLLNAMEMAP = {
+    "linux": r"lib%(name)s\.so\.%(ver)s$",
+    "linux2": r"lib%(name)s\.so\.%(ver)s$",
+    "linux-static": r"lib%(name)s\.a$",
+    "darwin": r"lib%(name)s\.%(ver)s\.dylib$",
+    "win32": r"%(name)s%(ver)s\.dll$",
+    "win32-static": r"%(name)s\.lib$",
+    "bsd": r"lib%(name)s\.so\.%(ver)s$",
+}
+
+RE_VER = r"[0-9]*([_\.][0-9]+)*"
+
+
+def find_lib(libname, libdir=None, platform=None, static=False):
+    platform = platform or sys.platform
+    platform = "bsd" if "bsd" in platform else platform
+    if static:
+        platform = f"{platform}-static"
+    if platform not in DLLNAMEMAP:
+        # Return empty list if platform name is undefined.
+        # Not all platforms define their static library paths.
+        return []
+    pat = DLLNAMEMAP[platform] % {"name": libname, "ver": RE_VER}
+    regex = re.compile(pat)
+    return find_file(regex, libdir)
+
+
+def find_file(pat, libdir=None):
+    if libdir is None:
+        libdirs = get_lib_dirs()
+    elif isinstance(libdir, str):
+        libdirs = [
+            libdir,
+        ]
+    else:
+        libdirs = list(libdir)
+    files = []
+    for ldir in libdirs:
+        try:
+            entries = os.listdir(ldir)
+        except FileNotFoundError:
+            continue
+        candidates = [os.path.join(ldir, ent) for ent in entries if pat.match(ent)]
+        files.extend([c for c in candidates if os.path.isfile(c)])
+    return files
\ No newline at end of file
diff --git a/cuda_bindings/cuda/bindings/_path_finder/load_nvidia_dynamic_library.py b/cuda_bindings/cuda/bindings/_path_finder/load_nvidia_dynamic_library.py
new file mode 100644
index 0000000000..69aadabcbf
--- /dev/null
+++ b/cuda_bindings/cuda/bindings/_path_finder/load_nvidia_dynamic_library.py
@@ -0,0 +1,92 @@
+import functools
+import sys
+
+if sys.platform == "win32":
+    import ctypes.wintypes
+
+    import pywintypes
+    import win32api
+
+    # Mirrors WinBase.h (unfortunately not defined already elsewhere)
+    _WINBASE_LOAD_LIBRARY_SEARCH_SYSTEM32 = 0x00000800
+
+else:
+    import ctypes
+    import os
+
+    _LINUX_CDLL_MODE = os.RTLD_NOW | os.RTLD_GLOBAL
+
+from .find_nvidia_dynamic_library import find_nvidia_dynamic_library
+
+
+@functools.cache
+def _windows_cuDriverGetVersion() -> int:
+    handle = win32api.LoadLibrary("nvcuda.dll")
+
+    kernel32 = ctypes.WinDLL("kernel32", use_last_error=True)
+    GetProcAddress = kernel32.GetProcAddress
+    GetProcAddress.argtypes = [ctypes.wintypes.HMODULE, ctypes.wintypes.LPCSTR]
+    GetProcAddress.restype = ctypes.c_void_p
+    cuDriverGetVersion = GetProcAddress(handle, b"cuDriverGetVersion")
+    assert cuDriverGetVersion
+
+    FUNC_TYPE = ctypes.CFUNCTYPE(ctypes.c_int, ctypes.POINTER(ctypes.c_int))
+    cuDriverGetVersion_fn = FUNC_TYPE(cuDriverGetVersion)
+    driver_ver = ctypes.c_int()
+    err = cuDriverGetVersion_fn(ctypes.byref(driver_ver))
+    assert err == 0
+    return driver_ver.value
+
+
+@functools.cache
+def _windows_load_with_dll_basename(name: str) -> int:
+    driver_ver = _windows_cuDriverGetVersion()
+    del driver_ver  # Keeping this here because it will probably be needed in the future.
+
+    if name == "nvJitLink":
+        dll_name = "nvJitLink_120_0.dll"
+    elif name == "nvrtc":
+        dll_name = "nvrtc64_120_0.dll"
+    elif name == "nvvm":
+        dll_name = "nvvm64_40_0.dll"
+
+    try:
+        return win32api.LoadLibrary(dll_name)
+    except pywintypes.error:
+        pass
+
+    return None
+
+
+@functools.cache
+def load_nvidia_dynamic_library(name: str) -> int:
+    # First try using the platform-specific dynamic loader search mechanisms
+    if sys.platform == "win32":
+        handle = _windows_load_with_dll_basename(name)
+        if handle:
+            return handle
+    else:
+        dl_path = f"lib{name}.so"  # Version intentionally no specified.
+        try:
+            handle = ctypes.CDLL(dl_path, _LINUX_CDLL_MODE)
+        except OSError:
+            pass
+        else:
+            # Use `cdef void* ptr = <void*><uintptr_t>` in cython to convert back to void*
+            return handle._handle  # C unsigned int
+
+    dl_path = find_nvidia_dynamic_library(name)
+    if sys.platform == "win32":
+        try:
+            handle = win32api.LoadLibrary(dl_path)
+        except pywintypes.error as e:
+            raise RuntimeError(f"Failed to load DLL at {dl_path}: {e}") from e
+        # Use `cdef void* ptr = <void*><intptr_t>` in cython to convert back to void*
+        return handle  # C signed int, matches win32api.GetProcAddress
+    else:
+        try:
+            handle = ctypes.CDLL(dl_path, _LINUX_CDLL_MODE)
+        except OSError as e:
+            raise RuntimeError(f"Failed to dlopen {dl_path}: {e}") from e
+        # Use `cdef void* ptr = <void*><uintptr_t>` in cython to convert back to void*
+        return handle._handle  # C unsigned int
\ No newline at end of file
diff --git a/cuda_bindings/cuda/bindings/_path_finder/sys_path_find_sub_dirs.py b/cuda_bindings/cuda/bindings/_path_finder/sys_path_find_sub_dirs.py
new file mode 100644
index 0000000000..324cdeec30
--- /dev/null
+++ b/cuda_bindings/cuda/bindings/_path_finder/sys_path_find_sub_dirs.py
@@ -0,0 +1,40 @@
+# Copyright 2024-2025 NVIDIA Corporation.  All rights reserved.
+#
+# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+
+import functools
+import os
+import sys
+
+
+@functools.cache
+def _impl(sys_path, sub_dirs):
+    results = []
+    for base in sys_path:
+        stack = [(base, 0)]  # (current_path, index into sub_dirs)
+        while stack:
+            current_path, idx = stack.pop()
+            if idx == len(sub_dirs):
+                if os.path.isdir(current_path):
+                    results.append(current_path)
+                continue
+
+            sub = sub_dirs[idx]
+            if sub == "*":
+                try:
+                    entries = sorted(os.listdir(current_path))
+                except OSError:
+                    continue
+                for entry in entries:
+                    entry_path = os.path.join(current_path, entry)
+                    if os.path.isdir(entry_path):
+                        stack.append((entry_path, idx + 1))
+            else:
+                next_path = os.path.join(current_path, sub)
+                if os.path.isdir(next_path):
+                    stack.append((next_path, idx + 1))
+    return results
+
+
+def sys_path_find_sub_dirs(sub_dirs):
+    return _impl(tuple(sys.path), tuple(sub_dirs))
\ No newline at end of file
diff --git a/cuda_bindings/cuda/bindings/path_finder.py b/cuda_bindings/cuda/bindings/path_finder.py
new file mode 100644
index 0000000000..12c02b930b
--- /dev/null
+++ b/cuda_bindings/cuda/bindings/path_finder.py
@@ -0,0 +1,37 @@
+# Copyright 2024-2025 NVIDIA Corporation.  All rights reserved.
+#
+# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+
+from cuda.bindings._path_finder.cuda_paths import (
+    get_conda_ctk,
+    get_conda_include_dir,
+    get_cuda_home,
+    get_cuda_paths,
+    get_current_cuda_target_name,
+    get_debian_pkg_libdevice,
+    get_libdevice_wheel,
+    get_nvidia_cudalib_ctk,
+    get_nvidia_libdevice_ctk,
+    get_nvidia_nvvm_ctk,
+    get_nvidia_static_cudalib_ctk,
+    get_system_ctk,
+)
+from cuda.bindings._path_finder.find_nvidia_dynamic_library import find_nvidia_dynamic_library
+from cuda.bindings._path_finder.load_nvidia_dynamic_library import load_nvidia_dynamic_library
+
+__all__ = [
+    "find_nvidia_dynamic_library",
+    "load_nvidia_dynamic_library",
+    "get_conda_ctk",
+    "get_conda_include_dir",
+    "get_cuda_home",
+    "get_cuda_paths",
+    "get_current_cuda_target_name",
+    "get_debian_pkg_libdevice",
+    "get_libdevice_wheel",
+    "get_nvidia_cudalib_ctk",
+    "get_nvidia_libdevice_ctk",
+    "get_nvidia_nvvm_ctk",
+    "get_nvidia_static_cudalib_ctk",
+    "get_system_ctk",
+]
\ No newline at end of file

From 5dbfb2d0061d6ca9a656e151d922f4fd9ec255b5 Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Wed, 17 Dec 2025 13:54:03 +0000
Subject: [PATCH 18/50] add builder files

---
 cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx | 4 ++--
 cuda_bindings/cuda/bindings/_internal/nvvm_linux.pyx      | 3 ++-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx b/cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx
index d095b3f524..785f587951 100644
--- a/cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx
+++ b/cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx
@@ -9,7 +9,7 @@ from libc.stdint cimport intptr_t, uintptr_t
 import threading
 from .utils import FunctionNotFoundError, NotSupportedError
 
-from cuda.pathfinder import load_nvidia_dynamic_lib
+from cuda.bindings import path_finder
 
 
 
@@ -77,7 +77,7 @@ cdef void* __nvJitLinkVersion = NULL
 
 
 cdef void* load_library() except* with gil:
-    cdef uintptr_t handle = load_nvidia_dynamic_lib("nvJitLink")._handle_uint
+    cdef uintptr_t handle = path_finder.load_nvidia_dynamic_library("nvJitLink")
     return <void*>handle
 
 
diff --git a/cuda_bindings/cuda/bindings/_internal/nvvm_linux.pyx b/cuda_bindings/cuda/bindings/_internal/nvvm_linux.pyx
index 408a2cb592..15845e3228 100644
--- a/cuda_bindings/cuda/bindings/_internal/nvvm_linux.pyx
+++ b/cuda_bindings/cuda/bindings/_internal/nvvm_linux.pyx
@@ -10,6 +10,7 @@ import threading
 from .utils import FunctionNotFoundError, NotSupportedError
 
 from cuda.pathfinder import load_nvidia_dynamic_lib
+from cuda.bindings import path_finder
 
 
 ###############################################################################
@@ -75,7 +76,7 @@ cdef void* __nvvmGetProgramLog = NULL
 
 
 cdef void* load_library() except* with gil:
-    cdef uintptr_t handle = load_nvidia_dynamic_lib("nvvm")._handle_uint
+    cdef uintptr_t handle = path_finder.load_nvidia_dynamic_library("nvvm")
     return <void*>handle
 
 

From 0a9eea9967e017bb20eea4801fdbba16e078ce50 Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Wed, 17 Dec 2025 14:06:11 +0000
Subject: [PATCH 19/50] use python lists/tuples

---
 cuda_core/cuda/core/experimental/_program.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py
index 4da8191f26..1422ceda5f 100644
--- a/cuda_core/cuda/core/experimental/_program.py
+++ b/cuda_core/cuda/core/experimental/_program.py
@@ -7,7 +7,7 @@
 import weakref
 from contextlib import contextmanager
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, List, Tuple, Union
+from typing import TYPE_CHECKING, Union
 from warnings import warn
 
 if TYPE_CHECKING:

From 79138c0950aaee94426e5858df48010bb6db4530 Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Thu, 18 Dec 2025 20:01:25 +0000
Subject: [PATCH 20/50] libdevice integration

---
 cuda_core/cuda/core/experimental/_program.py | 58 +++++++++++++++++++-
 1 file changed, 57 insertions(+), 1 deletion(-)

diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py
index 1422ceda5f..a5ad2a3be2 100644
--- a/cuda_core/cuda/core/experimental/_program.py
+++ b/cuda_core/cuda/core/experimental/_program.py
@@ -102,6 +102,47 @@ def _get_nvvm_module():
         _nvvm_module = None
         raise e
 
+def _find_libdevice_path():
+    """
+    Find libdevice.10.bc using cuda.bindings.path_finder.
+    
+    Returns:
+        str: Path to libdevice.10.bc, or None if not found
+    """
+    try:
+        from cuda.bindings.path_finder import (
+            get_nvidia_libdevice_ctk,
+            get_libdevice_wheel,
+            get_debian_pkg_libdevice,
+        )
+        
+        for getter in [get_nvidia_libdevice_ctk, get_libdevice_wheel, get_debian_pkg_libdevice]:
+            try:
+                result = getter()
+                if result is not None and result.info is not None:
+                    return result.info
+            except Exception:
+                continue
+        
+        return None
+    except ImportError:
+        import os
+        
+        # CUDA_HOME
+        cuda_home = os.environ.get("CUDA_HOME") or os.environ.get("CUDA_PATH")
+        if cuda_home:
+            libdevice_path = os.path.join(cuda_home, "nvvm", "libdevice", "libdevice.10.bc")
+            if os.path.isfile(libdevice_path):
+                return libdevice_path
+        
+        # Linux paths
+        for base in ["/usr/local/cuda", "/opt/cuda"]:
+            libdevice_path = os.path.join(base, "nvvm", "libdevice", "libdevice.10.bc")
+            if os.path.isfile(libdevice_path):
+                return libdevice_path
+        
+        return None
+
 
 def _process_define_macro_inner(formatted_options, macro):
     if isinstance(macro, str):
@@ -352,6 +393,7 @@ class ProgramOptions:
     pch_messages: bool | None = None
     instantiate_templates_in_pch: bool | None = None
     numba_debug: bool | None = None  # Custom option for Numba debugging
+    use_libdevice: bool | None = None # Use libdevice
 
     def __post_init__(self):
         self._name = self.name.encode()
@@ -748,7 +790,8 @@ def __init__(self, code, code_type, options: ProgramOptions = None):
 
                     nvvm.add_module_to_program(self._mnff.handle, module_source, len(module_source), module_name)
                     self._module_count += 1
-
+                    
+            self._use_libdevice = options.use_libdevice
             self._backend = "NVVM"
             self._linker = None
 
@@ -866,6 +909,19 @@ def compile(self, target_type, name_expressions=(), logs=None):
             nvvm = _get_nvvm_module()
             with _nvvm_exception_manager(self):
                 nvvm.verify_program(self._mnff.handle, len(nvvm_options), nvvm_options)
+                # Invoke libdevice
+                if getattr(self, '_use_libdevice', False):
+                    libdevice_path = _find_libdevice_path()
+                    if libdevice_path is None:
+                        raise RuntimeError(
+                            "use_libdevice=True but could not find libdevice.10.bc. "
+                            "Ensure CUDA toolkit is installed."
+                        )
+                    with open(libdevice_path, "rb") as f:
+                        libdevice_bc = f.read()
+                    # Use lazy_add_module for libdevice bitcode only following numba-cuda
+                    nvvm.lazy_add_module_to_program(self._mnff.handle, libdevice_bc, len(libdevice_bc), None)
+                
                 nvvm.compile_program(self._mnff.handle, len(nvvm_options), nvvm_options)
 
             size = nvvm.get_compiled_result_size(self._mnff.handle)

From 25d336c1d7644b801d3730fce702a231175bde47 Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Fri, 19 Dec 2025 18:43:25 +0000
Subject: [PATCH 21/50] refresh

---
 cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx | 6 ++----
 cuda_bindings/cuda/bindings/_internal/nvvm_linux.pyx      | 4 +---
 2 files changed, 3 insertions(+), 7 deletions(-)

diff --git a/cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx b/cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx
index 785f587951..8025189b34 100644
--- a/cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx
+++ b/cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx
@@ -9,9 +9,7 @@ from libc.stdint cimport intptr_t, uintptr_t
 import threading
 from .utils import FunctionNotFoundError, NotSupportedError
 
-from cuda.bindings import path_finder
-
-
+from cuda.pathfinder import load_nvidia_dynamic_lib
 
 ###############################################################################
 # Extern
@@ -77,7 +75,7 @@ cdef void* __nvJitLinkVersion = NULL
 
 
 cdef void* load_library() except* with gil:
-    cdef uintptr_t handle = path_finder.load_nvidia_dynamic_library("nvJitLink")
+    cdef uintptr_t handle = load_nvidia_dynamic_lib("nvJitLink")._handle_uint
     return <void*>handle
 
 
diff --git a/cuda_bindings/cuda/bindings/_internal/nvvm_linux.pyx b/cuda_bindings/cuda/bindings/_internal/nvvm_linux.pyx
index 15845e3228..5411e157c7 100644
--- a/cuda_bindings/cuda/bindings/_internal/nvvm_linux.pyx
+++ b/cuda_bindings/cuda/bindings/_internal/nvvm_linux.pyx
@@ -10,8 +10,6 @@ import threading
 from .utils import FunctionNotFoundError, NotSupportedError
 
 from cuda.pathfinder import load_nvidia_dynamic_lib
-from cuda.bindings import path_finder
-
 
 ###############################################################################
 # Extern
@@ -76,7 +74,7 @@ cdef void* __nvvmGetProgramLog = NULL
 
 
 cdef void* load_library() except* with gil:
-    cdef uintptr_t handle = path_finder.load_nvidia_dynamic_library("nvvm")
+    cdef uintptr_t handle = load_nvidia_dynamic_lib("nvvm")._handle_uint
     return <void*>handle
 
 

From 32c1913af18f49e848bdeaf7e0abb20813005892 Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Fri, 19 Dec 2025 18:44:08 +0000
Subject: [PATCH 22/50] refresh

---
 cuda_bindings/cuda/bindings/path_finder.py | 37 ----------------------
 1 file changed, 37 deletions(-)
 delete mode 100644 cuda_bindings/cuda/bindings/path_finder.py

diff --git a/cuda_bindings/cuda/bindings/path_finder.py b/cuda_bindings/cuda/bindings/path_finder.py
deleted file mode 100644
index 12c02b930b..0000000000
--- a/cuda_bindings/cuda/bindings/path_finder.py
+++ /dev/null
@@ -1,37 +0,0 @@
-# Copyright 2024-2025 NVIDIA Corporation.  All rights reserved.
-#
-# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
-
-from cuda.bindings._path_finder.cuda_paths import (
-    get_conda_ctk,
-    get_conda_include_dir,
-    get_cuda_home,
-    get_cuda_paths,
-    get_current_cuda_target_name,
-    get_debian_pkg_libdevice,
-    get_libdevice_wheel,
-    get_nvidia_cudalib_ctk,
-    get_nvidia_libdevice_ctk,
-    get_nvidia_nvvm_ctk,
-    get_nvidia_static_cudalib_ctk,
-    get_system_ctk,
-)
-from cuda.bindings._path_finder.find_nvidia_dynamic_library import find_nvidia_dynamic_library
-from cuda.bindings._path_finder.load_nvidia_dynamic_library import load_nvidia_dynamic_library
-
-__all__ = [
-    "find_nvidia_dynamic_library",
-    "load_nvidia_dynamic_library",
-    "get_conda_ctk",
-    "get_conda_include_dir",
-    "get_cuda_home",
-    "get_cuda_paths",
-    "get_current_cuda_target_name",
-    "get_debian_pkg_libdevice",
-    "get_libdevice_wheel",
-    "get_nvidia_cudalib_ctk",
-    "get_nvidia_libdevice_ctk",
-    "get_nvidia_nvvm_ctk",
-    "get_nvidia_static_cudalib_ctk",
-    "get_system_ctk",
-]
\ No newline at end of file

From 01f03e552806f65ae27defb8e5ce32a1aa79e7cc Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Fri, 19 Dec 2025 18:45:37 +0000
Subject: [PATCH 23/50] refresh

---
 .../cuda/bindings/_path_finder/cuda_paths.py  | 400 ------------------
 .../find_nvidia_dynamic_library.py            | 139 ------
 .../cuda/bindings/_path_finder/findlib.py     |  69 ---
 .../load_nvidia_dynamic_library.py            |  92 ----
 .../_path_finder/sys_path_find_sub_dirs.py    |  40 --
 5 files changed, 740 deletions(-)
 delete mode 100644 cuda_bindings/cuda/bindings/_path_finder/cuda_paths.py
 delete mode 100644 cuda_bindings/cuda/bindings/_path_finder/find_nvidia_dynamic_library.py
 delete mode 100644 cuda_bindings/cuda/bindings/_path_finder/findlib.py
 delete mode 100644 cuda_bindings/cuda/bindings/_path_finder/load_nvidia_dynamic_library.py
 delete mode 100644 cuda_bindings/cuda/bindings/_path_finder/sys_path_find_sub_dirs.py

diff --git a/cuda_bindings/cuda/bindings/_path_finder/cuda_paths.py b/cuda_bindings/cuda/bindings/_path_finder/cuda_paths.py
deleted file mode 100644
index c827699b3a..0000000000
--- a/cuda_bindings/cuda/bindings/_path_finder/cuda_paths.py
+++ /dev/null
@@ -1,400 +0,0 @@
-import os
-import platform
-import re
-import site
-import sys
-import traceback
-import warnings
-from collections import namedtuple
-from pathlib import Path
-
-from .findlib import find_file, find_lib
-
-IS_WIN32 = sys.platform.startswith("win32")
-
-_env_path_tuple = namedtuple("_env_path_tuple", ["by", "info"])
-
-
-def _get_numba_CUDA_INCLUDE_PATH():
-    # From numba/numba/core/config.py
-
-    def _readenv(name, ctor, default):
-        value = os.environ.get(name)
-        if value is None:
-            return default() if callable(default) else default
-        try:
-            return ctor(value)
-        except Exception:
-            warnings.warn(  # noqa: B028
-                f"Environment variable '{name}' is defined but "
-                f"its associated value '{value}' could not be "
-                "parsed.\nThe parse failed with exception:\n"
-                f"{traceback.format_exc()}",
-                RuntimeWarning,
-            )
-            return default
-
-    if IS_WIN32:
-        cuda_path = os.environ.get("CUDA_PATH")
-        if cuda_path:  # noqa: SIM108
-            default_cuda_include_path = os.path.join(cuda_path, "include")
-        else:
-            default_cuda_include_path = "cuda_include_not_found"
-    else:
-        default_cuda_include_path = os.path.join(os.sep, "usr", "local", "cuda", "include")
-    CUDA_INCLUDE_PATH = _readenv("NUMBA_CUDA_INCLUDE_PATH", str, default_cuda_include_path)
-    return CUDA_INCLUDE_PATH
-
-
-config_CUDA_INCLUDE_PATH = _get_numba_CUDA_INCLUDE_PATH()
-
-
-def _find_valid_path(options):
-    """Find valid path from *options*, which is a list of 2-tuple of
-    (name, path).  Return first pair where *path* is not None.
-    If no valid path is found, return ('<unknown>', None)
-    """
-    for by, data in options:
-        if data is not None:
-            return by, data
-    else:
-        return "<unknown>", None
-
-
-def _get_libdevice_path_decision():
-    options = [
-        ("Conda environment", get_conda_ctk()),
-        ("Conda environment (NVIDIA package)", get_nvidia_libdevice_ctk()),
-        ("CUDA_HOME", get_cuda_home("nvvm", "libdevice")),
-        ("Debian package", get_debian_pkg_libdevice()),
-        ("NVIDIA NVCC Wheel", get_libdevice_wheel()),
-    ]
-    libdevice_ctk_dir = get_system_ctk("nvvm", "libdevice")
-    if libdevice_ctk_dir and os.path.exists(libdevice_ctk_dir):
-        options.append(("System", libdevice_ctk_dir))
-
-    by, libdir = _find_valid_path(options)
-    return by, libdir
-
-
-def _nvvm_lib_dir():
-    if IS_WIN32:
-        return "nvvm", "bin"
-    else:
-        return "nvvm", "lib64"
-
-
-def _get_nvvm_path_decision():
-    options = [
-        ("Conda environment", get_conda_ctk()),
-        ("Conda environment (NVIDIA package)", get_nvidia_nvvm_ctk()),
-        ("CUDA_HOME", get_cuda_home(*_nvvm_lib_dir())),
-        ("NVIDIA NVCC Wheel", _get_nvvm_wheel()),
-    ]
-    # need to ensure nvvm dir actually exists
-    nvvm_ctk_dir = get_system_ctk(*_nvvm_lib_dir())
-    if nvvm_ctk_dir and os.path.exists(nvvm_ctk_dir):
-        options.append(("System", nvvm_ctk_dir))
-
-    by, path = _find_valid_path(options)
-    return by, path
-
-
-def _get_nvvm_wheel():
-    site_paths = [site.getusersitepackages()] + site.getsitepackages() + ["conda", None]
-    for sp in site_paths:
-        # The SONAME is taken based on public CTK 12.x releases
-        if sys.platform.startswith("linux"):
-            dso_dir = "lib64"
-            # Hack: libnvvm from Linux wheel
-            # does not have any soname (CUDAINST-3183)
-            dso_path = "libnvvm.so"
-        elif sys.platform.startswith("win32"):
-            dso_dir = "bin"
-            dso_path = "nvvm64_40_0.dll"
-        else:
-            raise AssertionError()
-
-        if sp is not None:
-            dso_dir = os.path.join(sp, "nvidia", "cuda_nvcc", "nvvm", dso_dir)
-            dso_path = os.path.join(dso_dir, dso_path)
-            if os.path.exists(dso_path):
-                return str(Path(dso_path).parent)
-
-
-def _get_libdevice_paths():
-    by, libdir = _get_libdevice_path_decision()
-    if by == "NVIDIA NVCC Wheel":
-        # The NVVM path is a directory, not a file
-        out = os.path.join(libdir, "libdevice.10.bc")
-    else:
-        # Search for pattern
-        pat = r"libdevice(\.\d+)*\.bc$"
-        candidates = find_file(re.compile(pat), libdir)
-        # Keep only the max (most recent version) of the bitcode files.
-        out = max(candidates, default=None)
-    return _env_path_tuple(by, out)
-
-
-def _cudalib_path():
-    if IS_WIN32:
-        return "bin"
-    else:
-        return "lib64"
-
-
-def _cuda_home_static_cudalib_path():
-    if IS_WIN32:
-        return ("lib", "x64")
-    else:
-        return ("lib64",)
-
-
-def _get_cudalib_dir_path_decision():
-    options = [
-        ("Conda environment", get_conda_ctk()),
-        ("Conda environment (NVIDIA package)", get_nvidia_cudalib_ctk()),
-        ("CUDA_HOME", get_cuda_home(_cudalib_path())),
-        ("System", get_system_ctk(_cudalib_path())),
-    ]
-    by, libdir = _find_valid_path(options)
-    return by, libdir
-
-
-def _get_static_cudalib_dir_path_decision():
-    options = [
-        ("Conda environment", get_conda_ctk()),
-        ("Conda environment (NVIDIA package)", get_nvidia_static_cudalib_ctk()),
-        ("CUDA_HOME", get_cuda_home(*_cuda_home_static_cudalib_path())),
-        ("System", get_system_ctk(_cudalib_path())),
-    ]
-    by, libdir = _find_valid_path(options)
-    return by, libdir
-
-
-def _get_cudalib_dir():
-    by, libdir = _get_cudalib_dir_path_decision()
-    return _env_path_tuple(by, libdir)
-
-
-def _get_static_cudalib_dir():
-    by, libdir = _get_static_cudalib_dir_path_decision()
-    return _env_path_tuple(by, libdir)
-
-
-def get_system_ctk(*subdirs):
-    """Return path to system-wide cudatoolkit; or, None if it doesn't exist."""
-    # Linux?
-    if sys.platform.startswith("linux"):
-        # Is cuda alias to /usr/local/cuda?
-        # We are intentionally not getting versioned cuda installation.
-        base = "/usr/local/cuda"
-        if os.path.exists(base):
-            return os.path.join(base, *subdirs)
-
-
-def get_conda_ctk():
-    """Return path to directory containing the shared libraries of cudatoolkit."""
-    is_conda_env = os.path.exists(os.path.join(sys.prefix, "conda-meta"))
-    if not is_conda_env:
-        return
-    # Assume the existence of NVVM to imply cudatoolkit installed
-    paths = find_lib("nvvm")
-    if not paths:
-        return
-    # Use the directory name of the max path
-    return os.path.dirname(max(paths))
-
-
-def get_nvidia_nvvm_ctk():
-    """Return path to directory containing the NVVM shared library."""
-    is_conda_env = os.path.exists(os.path.join(sys.prefix, "conda-meta"))
-    if not is_conda_env:
-        return
-
-    # Assume the existence of NVVM in the conda env implies that a CUDA toolkit
-    # conda package is installed.
-
-    # First, try the location used on Linux and the Windows 11.x packages
-    libdir = os.path.join(sys.prefix, "nvvm", _cudalib_path())
-    if not os.path.exists(libdir) or not os.path.isdir(libdir):
-        # If that fails, try the location used for Windows 12.x packages
-        libdir = os.path.join(sys.prefix, "Library", "nvvm", _cudalib_path())
-        if not os.path.exists(libdir) or not os.path.isdir(libdir):
-            # If that doesn't exist either, assume we don't have the NVIDIA
-            # conda package
-            return
-
-    paths = find_lib("nvvm", libdir=libdir)
-    if not paths:
-        return
-    # Use the directory name of the max path
-    return os.path.dirname(max(paths))
-
-
-def get_nvidia_libdevice_ctk():
-    """Return path to directory containing the libdevice library."""
-    nvvm_ctk = get_nvidia_nvvm_ctk()
-    if not nvvm_ctk:
-        return
-    nvvm_dir = os.path.dirname(nvvm_ctk)
-    return os.path.join(nvvm_dir, "libdevice")
-
-
-def get_nvidia_cudalib_ctk():
-    """Return path to directory containing the shared libraries of cudatoolkit."""
-    nvvm_ctk = get_nvidia_nvvm_ctk()
-    if not nvvm_ctk:
-        return
-    env_dir = os.path.dirname(os.path.dirname(nvvm_ctk))
-    subdir = "bin" if IS_WIN32 else "lib"
-    return os.path.join(env_dir, subdir)
-
-
-def get_nvidia_static_cudalib_ctk():
-    """Return path to directory containing the static libraries of cudatoolkit."""
-    nvvm_ctk = get_nvidia_nvvm_ctk()
-    if not nvvm_ctk:
-        return
-
-    if IS_WIN32 and ("Library" not in nvvm_ctk):  # noqa: SIM108
-        # Location specific to CUDA 11.x packages on Windows
-        dirs = ("Lib", "x64")
-    else:
-        # Linux, or Windows with CUDA 12.x packages
-        dirs = ("lib",)
-
-    env_dir = os.path.dirname(os.path.dirname(nvvm_ctk))
-    return os.path.join(env_dir, *dirs)
-
-
-def get_cuda_home(*subdirs):
-    """Get paths of CUDA_HOME.
-    If *subdirs* are the subdirectory name to be appended in the resulting
-    path.
-    """
-    cuda_home = os.environ.get("CUDA_HOME")
-    if cuda_home is None:
-        # Try Windows CUDA installation without Anaconda
-        cuda_home = os.environ.get("CUDA_PATH")
-    if cuda_home is not None:
-        return os.path.join(cuda_home, *subdirs)
-
-
-def _get_nvvm_path():
-    by, path = _get_nvvm_path_decision()
-    if by == "NVIDIA NVCC Wheel":
-        # The NVVM path is a directory, not a file
-        path = os.path.join(path, "libnvvm.so")
-    else:
-        candidates = find_lib("nvvm", path)
-        path = max(candidates) if candidates else None
-    return _env_path_tuple(by, path)
-
-
-def get_cuda_paths():
-    """Returns a dictionary mapping component names to a 2-tuple
-    of (source_variable, info).
-    The returned dictionary will have the following keys and infos:
-    - "nvvm": file_path
-    - "libdevice": List[Tuple[arch, file_path]]
-    - "cudalib_dir": directory_path
-    Note: The result of the function is cached.
-    """
-    # Check cache
-    if hasattr(get_cuda_paths, "_cached_result"):
-        return get_cuda_paths._cached_result
-    else:
-        # Not in cache
-        d = {
-            "nvvm": _get_nvvm_path(),
-            "libdevice": _get_libdevice_paths(),
-            "cudalib_dir": _get_cudalib_dir(),
-            "static_cudalib_dir": _get_static_cudalib_dir(),
-            "include_dir": _get_include_dir(),
-        }
-        # Cache result
-        get_cuda_paths._cached_result = d
-        return d
-
-
-def get_debian_pkg_libdevice():
-    """
-    Return the Debian NVIDIA Maintainers-packaged libdevice location, if it
-    exists.
-    """
-    pkg_libdevice_location = "/usr/lib/nvidia-cuda-toolkit/libdevice"
-    if not os.path.exists(pkg_libdevice_location):
-        return None
-    return pkg_libdevice_location
-
-
-def get_libdevice_wheel():
-    nvvm_path = _get_nvvm_wheel()
-    if nvvm_path is None:
-        return None
-    nvvm_path = Path(nvvm_path)
-    libdevice_path = nvvm_path.parent / "libdevice"
-
-    return str(libdevice_path)
-
-
-def get_current_cuda_target_name():
-    """Determine conda's CTK target folder based on system and machine arch.
-    CTK's conda package delivers headers based on its architecture type. For example,
-    `x86_64` machine places header under `$CONDA_PREFIX/targets/x86_64-linux`, and
-    `aarch64` places under `$CONDA_PREFIX/targets/sbsa-linux`. Read more about the
-    nuances at cudart's conda feedstock:
-    https://github.com/conda-forge/cuda-cudart-feedstock/blob/main/recipe/meta.yaml#L8-L11  # noqa: E501
-    """
-    system = platform.system()
-    machine = platform.machine()
-
-    if system == "Linux":
-        arch_to_targets = {"x86_64": "x86_64-linux", "aarch64": "sbsa-linux"}
-    elif system == "Windows":
-        arch_to_targets = {
-            "AMD64": "x64",
-        }
-    else:
-        arch_to_targets = {}
-
-    return arch_to_targets.get(machine, None)
-
-
-def get_conda_include_dir():
-    """
-    Return the include directory in the current conda environment, if one
-    is active and it exists.
-    """
-    is_conda_env = os.path.exists(os.path.join(sys.prefix, "conda-meta"))
-    if not is_conda_env:
-        return
-
-    if platform.system() == "Windows":
-        include_dir = os.path.join(sys.prefix, "Library", "include")
-    elif target_name := get_current_cuda_target_name():
-        include_dir = os.path.join(sys.prefix, "targets", target_name, "include")
-    else:
-        # A fallback when target cannot determined
-        # though usually it shouldn't.
-        include_dir = os.path.join(sys.prefix, "include")
-
-    if (
-        os.path.exists(include_dir)
-        and os.path.isdir(include_dir)
-        and os.path.exists(os.path.join(include_dir, "cuda_device_runtime_api.h"))
-    ):
-        return include_dir
-    return
-
-
-def _get_include_dir():
-    """Find the root include directory."""
-    options = [
-        ("Conda environment (NVIDIA package)", get_conda_include_dir()),
-        ("CUDA_INCLUDE_PATH Config Entry", config_CUDA_INCLUDE_PATH),
-        # TODO: add others
-    ]
-    by, include_dir = _find_valid_path(options)
-    return _env_path_tuple(by, include_dir)
\ No newline at end of file
diff --git a/cuda_bindings/cuda/bindings/_path_finder/find_nvidia_dynamic_library.py b/cuda_bindings/cuda/bindings/_path_finder/find_nvidia_dynamic_library.py
deleted file mode 100644
index d8413282e6..0000000000
--- a/cuda_bindings/cuda/bindings/_path_finder/find_nvidia_dynamic_library.py
+++ /dev/null
@@ -1,139 +0,0 @@
-# Copyright 2024-2025 NVIDIA Corporation.  All rights reserved.
-#
-# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
-
-import functools
-import glob
-import os
-
-from .cuda_paths import IS_WIN32, get_cuda_paths
-from .sys_path_find_sub_dirs import sys_path_find_sub_dirs
-
-
-def _no_such_file_in_sub_dirs(sub_dirs, file_wild, error_messages, attachments):
-    error_messages.append(f"No such file: {file_wild}")
-    for sub_dir in sys_path_find_sub_dirs(sub_dirs):
-        attachments.append(f'  listdir("{sub_dir}"):')
-        for node in sorted(os.listdir(sub_dir)):
-            attachments.append(f"    {node}")
-
-
-def _find_so_using_nvidia_lib_dirs(libname, so_basename, error_messages, attachments):
-    if libname == "nvvm":  # noqa: SIM108
-        nvidia_sub_dirs = ("nvidia", "*", "nvvm", "lib64")
-    else:
-        nvidia_sub_dirs = ("nvidia", "*", "lib")
-    file_wild = so_basename + "*"
-    for lib_dir in sys_path_find_sub_dirs(nvidia_sub_dirs):
-        # First look for an exact match
-        so_name = os.path.join(lib_dir, so_basename)
-        if os.path.isfile(so_name):
-            return so_name
-        # Look for a versioned library
-        # Using sort here mainly to make the result deterministic.
-        for node in sorted(glob.glob(os.path.join(lib_dir, file_wild))):
-            so_name = os.path.join(lib_dir, node)
-            if os.path.isfile(so_name):
-                return so_name
-    _no_such_file_in_sub_dirs(nvidia_sub_dirs, file_wild, error_messages, attachments)
-    return None
-
-
-def _find_dll_using_nvidia_bin_dirs(libname, error_messages, attachments):
-    if libname == "nvvm":  # noqa: SIM108
-        nvidia_sub_dirs = ("nvidia", "*", "nvvm", "bin")
-    else:
-        nvidia_sub_dirs = ("nvidia", "*", "bin")
-    file_wild = libname + "*.dll"
-    for bin_dir in sys_path_find_sub_dirs(nvidia_sub_dirs):
-        for node in sorted(glob.glob(os.path.join(bin_dir, file_wild))):
-            dll_name = os.path.join(bin_dir, node)
-            if os.path.isfile(dll_name):
-                return dll_name
-    _no_such_file_in_sub_dirs(nvidia_sub_dirs, file_wild, error_messages, attachments)
-    return None
-
-
-def _get_cuda_paths_info(key, error_messages):
-    env_path_tuple = get_cuda_paths()[key]
-    if not env_path_tuple:
-        error_messages.append(f'Failure obtaining get_cuda_paths()["{key}"]')
-        return None
-    if not env_path_tuple.info:
-        error_messages.append(f'Failure obtaining get_cuda_paths()["{key}"].info')
-        return None
-    return env_path_tuple.info
-
-
-def _find_so_using_cudalib_dir(so_basename, error_messages, attachments):
-    cudalib_dir = _get_cuda_paths_info("cudalib_dir", error_messages)
-    if cudalib_dir is None:
-        return None
-    primary_so_dir = cudalib_dir + "/"
-    candidate_so_dirs = [primary_so_dir]
-    libs = ["/lib/", "/lib64/"]
-    for _ in range(2):
-        alt_dir = libs[0].join(primary_so_dir.rsplit(libs[1], 1))
-        if alt_dir not in candidate_so_dirs:
-            candidate_so_dirs.append(alt_dir)
-        libs.reverse()
-    candidate_so_names = [so_dirname + so_basename for so_dirname in candidate_so_dirs]
-    error_messages = []
-    for so_name in candidate_so_names:
-        if os.path.isfile(so_name):
-            return so_name
-        error_messages.append(f"No such file: {so_name}")
-    for so_dirname in candidate_so_dirs:
-        attachments.append(f'  listdir("{so_dirname}"):')
-        if not os.path.isdir(so_dirname):
-            attachments.append("    DIRECTORY DOES NOT EXIST")
-        else:
-            for node in sorted(os.listdir(so_dirname)):
-                attachments.append(f"    {node}")
-    return None
-
-
-def _find_dll_using_cudalib_dir(libname, error_messages, attachments):
-    cudalib_dir = _get_cuda_paths_info("cudalib_dir", error_messages)
-    if cudalib_dir is None:
-        return None
-    file_wild = libname + "*.dll"
-    for node in sorted(glob.glob(os.path.join(cudalib_dir, file_wild))):
-        dll_name = os.path.join(cudalib_dir, node)
-        if os.path.isfile(dll_name):
-            return dll_name
-    error_messages.append(f"No such file: {file_wild}")
-    attachments.append(f'  listdir("{cudalib_dir}"):')
-    for node in sorted(os.listdir(cudalib_dir)):
-        attachments.append(f"    {node}")
-    return None
-
-
-@functools.cache
-def find_nvidia_dynamic_library(name: str) -> str:
-    error_messages = []
-    attachments = []
-
-    if IS_WIN32:
-        dll_name = _find_dll_using_nvidia_bin_dirs(name, error_messages, attachments)
-        if dll_name is None:
-            if name == "nvvm":
-                dll_name = _get_cuda_paths_info("nvvm", error_messages)
-            else:
-                dll_name = _find_dll_using_cudalib_dir(name, error_messages, attachments)
-        if dll_name is None:
-            attachments = "\n".join(attachments)
-            raise RuntimeError(f"Failure finding {name}*.dll: {', '.join(error_messages)}\n{attachments}")
-        return dll_name
-
-    so_basename = f"lib{name}.so"
-    so_name = _find_so_using_nvidia_lib_dirs(name, so_basename, error_messages, attachments)
-    if so_name is None:
-        if name == "nvvm":
-            so_name = _get_cuda_paths_info("nvvm", error_messages)
-        else:
-            so_name = _find_so_using_cudalib_dir(so_basename, error_messages, attachments)
-    if so_name is None:
-        attachments = "\n".join(attachments)
-        raise RuntimeError(f"Failure finding {so_basename}: {', '.join(error_messages)}\n{attachments}")
-    return so_name
\ No newline at end of file
diff --git a/cuda_bindings/cuda/bindings/_path_finder/findlib.py b/cuda_bindings/cuda/bindings/_path_finder/findlib.py
deleted file mode 100644
index c64c3c9577..0000000000
--- a/cuda_bindings/cuda/bindings/_path_finder/findlib.py
+++ /dev/null
@@ -1,69 +0,0 @@
-# Forked from:
-# https://github.com/numba/numba/blob/f0d24824fcd6a454827e3c108882395d00befc04/numba/misc/findlib.py
-
-import os
-import re
-import sys
-
-
-def get_lib_dirs():
-    """
-    Anaconda specific
-    """
-    if sys.platform == "win32":
-        # on windows, historically `DLLs` has been used for CUDA libraries,
-        # since approximately CUDA 9.2, `Library\bin` has been used.
-        dirnames = ["DLLs", os.path.join("Library", "bin")]
-    else:
-        dirnames = [
-            "lib",
-        ]
-    libdirs = [os.path.join(sys.prefix, x) for x in dirnames]
-    return libdirs
-
-
-DLLNAMEMAP = {
-    "linux": r"lib%(name)s\.so\.%(ver)s$",
-    "linux2": r"lib%(name)s\.so\.%(ver)s$",
-    "linux-static": r"lib%(name)s\.a$",
-    "darwin": r"lib%(name)s\.%(ver)s\.dylib$",
-    "win32": r"%(name)s%(ver)s\.dll$",
-    "win32-static": r"%(name)s\.lib$",
-    "bsd": r"lib%(name)s\.so\.%(ver)s$",
-}
-
-RE_VER = r"[0-9]*([_\.][0-9]+)*"
-
-
-def find_lib(libname, libdir=None, platform=None, static=False):
-    platform = platform or sys.platform
-    platform = "bsd" if "bsd" in platform else platform
-    if static:
-        platform = f"{platform}-static"
-    if platform not in DLLNAMEMAP:
-        # Return empty list if platform name is undefined.
-        # Not all platforms define their static library paths.
-        return []
-    pat = DLLNAMEMAP[platform] % {"name": libname, "ver": RE_VER}
-    regex = re.compile(pat)
-    return find_file(regex, libdir)
-
-
-def find_file(pat, libdir=None):
-    if libdir is None:
-        libdirs = get_lib_dirs()
-    elif isinstance(libdir, str):
-        libdirs = [
-            libdir,
-        ]
-    else:
-        libdirs = list(libdir)
-    files = []
-    for ldir in libdirs:
-        try:
-            entries = os.listdir(ldir)
-        except FileNotFoundError:
-            continue
-        candidates = [os.path.join(ldir, ent) for ent in entries if pat.match(ent)]
-        files.extend([c for c in candidates if os.path.isfile(c)])
-    return files
\ No newline at end of file
diff --git a/cuda_bindings/cuda/bindings/_path_finder/load_nvidia_dynamic_library.py b/cuda_bindings/cuda/bindings/_path_finder/load_nvidia_dynamic_library.py
deleted file mode 100644
index 69aadabcbf..0000000000
--- a/cuda_bindings/cuda/bindings/_path_finder/load_nvidia_dynamic_library.py
+++ /dev/null
@@ -1,92 +0,0 @@
-import functools
-import sys
-
-if sys.platform == "win32":
-    import ctypes.wintypes
-
-    import pywintypes
-    import win32api
-
-    # Mirrors WinBase.h (unfortunately not defined already elsewhere)
-    _WINBASE_LOAD_LIBRARY_SEARCH_SYSTEM32 = 0x00000800
-
-else:
-    import ctypes
-    import os
-
-    _LINUX_CDLL_MODE = os.RTLD_NOW | os.RTLD_GLOBAL
-
-from .find_nvidia_dynamic_library import find_nvidia_dynamic_library
-
-
-@functools.cache
-def _windows_cuDriverGetVersion() -> int:
-    handle = win32api.LoadLibrary("nvcuda.dll")
-
-    kernel32 = ctypes.WinDLL("kernel32", use_last_error=True)
-    GetProcAddress = kernel32.GetProcAddress
-    GetProcAddress.argtypes = [ctypes.wintypes.HMODULE, ctypes.wintypes.LPCSTR]
-    GetProcAddress.restype = ctypes.c_void_p
-    cuDriverGetVersion = GetProcAddress(handle, b"cuDriverGetVersion")
-    assert cuDriverGetVersion
-
-    FUNC_TYPE = ctypes.CFUNCTYPE(ctypes.c_int, ctypes.POINTER(ctypes.c_int))
-    cuDriverGetVersion_fn = FUNC_TYPE(cuDriverGetVersion)
-    driver_ver = ctypes.c_int()
-    err = cuDriverGetVersion_fn(ctypes.byref(driver_ver))
-    assert err == 0
-    return driver_ver.value
-
-
-@functools.cache
-def _windows_load_with_dll_basename(name: str) -> int:
-    driver_ver = _windows_cuDriverGetVersion()
-    del driver_ver  # Keeping this here because it will probably be needed in the future.
-
-    if name == "nvJitLink":
-        dll_name = "nvJitLink_120_0.dll"
-    elif name == "nvrtc":
-        dll_name = "nvrtc64_120_0.dll"
-    elif name == "nvvm":
-        dll_name = "nvvm64_40_0.dll"
-
-    try:
-        return win32api.LoadLibrary(dll_name)
-    except pywintypes.error:
-        pass
-
-    return None
-
-
-@functools.cache
-def load_nvidia_dynamic_library(name: str) -> int:
-    # First try using the platform-specific dynamic loader search mechanisms
-    if sys.platform == "win32":
-        handle = _windows_load_with_dll_basename(name)
-        if handle:
-            return handle
-    else:
-        dl_path = f"lib{name}.so"  # Version intentionally no specified.
-        try:
-            handle = ctypes.CDLL(dl_path, _LINUX_CDLL_MODE)
-        except OSError:
-            pass
-        else:
-            # Use `cdef void* ptr = <void*><uintptr_t>` in cython to convert back to void*
-            return handle._handle  # C unsigned int
-
-    dl_path = find_nvidia_dynamic_library(name)
-    if sys.platform == "win32":
-        try:
-            handle = win32api.LoadLibrary(dl_path)
-        except pywintypes.error as e:
-            raise RuntimeError(f"Failed to load DLL at {dl_path}: {e}") from e
-        # Use `cdef void* ptr = <void*><intptr_t>` in cython to convert back to void*
-        return handle  # C signed int, matches win32api.GetProcAddress
-    else:
-        try:
-            handle = ctypes.CDLL(dl_path, _LINUX_CDLL_MODE)
-        except OSError as e:
-            raise RuntimeError(f"Failed to dlopen {dl_path}: {e}") from e
-        # Use `cdef void* ptr = <void*><uintptr_t>` in cython to convert back to void*
-        return handle._handle  # C unsigned int
\ No newline at end of file
diff --git a/cuda_bindings/cuda/bindings/_path_finder/sys_path_find_sub_dirs.py b/cuda_bindings/cuda/bindings/_path_finder/sys_path_find_sub_dirs.py
deleted file mode 100644
index 324cdeec30..0000000000
--- a/cuda_bindings/cuda/bindings/_path_finder/sys_path_find_sub_dirs.py
+++ /dev/null
@@ -1,40 +0,0 @@
-# Copyright 2024-2025 NVIDIA Corporation.  All rights reserved.
-#
-# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
-
-import functools
-import os
-import sys
-
-
-@functools.cache
-def _impl(sys_path, sub_dirs):
-    results = []
-    for base in sys_path:
-        stack = [(base, 0)]  # (current_path, index into sub_dirs)
-        while stack:
-            current_path, idx = stack.pop()
-            if idx == len(sub_dirs):
-                if os.path.isdir(current_path):
-                    results.append(current_path)
-                continue
-
-            sub = sub_dirs[idx]
-            if sub == "*":
-                try:
-                    entries = sorted(os.listdir(current_path))
-                except OSError:
-                    continue
-                for entry in entries:
-                    entry_path = os.path.join(current_path, entry)
-                    if os.path.isdir(entry_path):
-                        stack.append((entry_path, idx + 1))
-            else:
-                next_path = os.path.join(current_path, sub)
-                if os.path.isdir(next_path):
-                    stack.append((next_path, idx + 1))
-    return results
-
-
-def sys_path_find_sub_dirs(sub_dirs):
-    return _impl(tuple(sys.path), tuple(sub_dirs))
\ No newline at end of file

From 9a5d5fea346410987266e9d85f32c84046306ad1 Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Fri, 19 Dec 2025 19:43:20 +0000
Subject: [PATCH 24/50] use cuda_pathfinder module for libdevice

---
 cuda_core/cuda/core/experimental/_program.py  |  43 +-----
 cuda_pathfinder/cuda/pathfinder/__init__.py   |   5 +
 .../_dynamic_libs/find_libdevice.py           | 146 ++++++++++++++++++
 3 files changed, 154 insertions(+), 40 deletions(-)
 create mode 100644 cuda_pathfinder/cuda/pathfinder/_dynamic_libs/find_libdevice.py

diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py
index a5ad2a3be2..fedafbc4fb 100644
--- a/cuda_core/cuda/core/experimental/_program.py
+++ b/cuda_core/cuda/core/experimental/_program.py
@@ -103,46 +103,9 @@ def _get_nvvm_module():
         raise e
 
 def _find_libdevice_path():
-    """
-    Find libdevice.10.bc using cuda.bindings.path_finder.
-    
-    Returns:
-        str: Path to libdevice.10.bc, or None if not found
-    """
-    try:
-        from cuda.bindings.path_finder import (
-            get_nvidia_libdevice_ctk,
-            get_libdevice_wheel,
-            get_debian_pkg_libdevice,
-        )
-        
-        for getter in [get_nvidia_libdevice_ctk, get_libdevice_wheel, get_debian_pkg_libdevice]:
-            try:
-                result = getter()
-                if result is not None and result.info is not None:
-                    return result.info
-            except Exception:
-                continue
-        
-        return None
-    except ImportError:
-        import os
-        
-        # CUDA_HOME
-        cuda_home = os.environ.get("CUDA_HOME") or os.environ.get("CUDA_PATH")
-        if cuda_home:
-            libdevice_path = os.path.join(cuda_home, "nvvm", "libdevice", "libdevice.10.bc")
-            if os.path.isfile(libdevice_path):
-                return libdevice_path
-        
-        # Linux paths
-        for base in ["/usr/local/cuda", "/opt/cuda"]:
-            libdevice_path = os.path.join(base, "nvvm", "libdevice", "libdevice.10.bc")
-            if os.path.isfile(libdevice_path):
-                return libdevice_path
-        
-        return None
-
+    """Find libdevice.10.bc for NVVM compilation using cuda.pathfinder."""
+    from cuda.pathfinder import get_libdevice_path
+    return get_libdevice_path()
 
 def _process_define_macro_inner(formatted_options, macro):
     if isinstance(macro, str):
diff --git a/cuda_pathfinder/cuda/pathfinder/__init__.py b/cuda_pathfinder/cuda/pathfinder/__init__.py
index 143c4b45cc..8820917318 100644
--- a/cuda_pathfinder/cuda/pathfinder/__init__.py
+++ b/cuda_pathfinder/cuda/pathfinder/__init__.py
@@ -6,6 +6,11 @@
 from cuda.pathfinder._dynamic_libs.load_dl_common import DynamicLibNotFoundError as DynamicLibNotFoundError
 from cuda.pathfinder._dynamic_libs.load_dl_common import LoadedDL as LoadedDL
 from cuda.pathfinder._dynamic_libs.load_nvidia_dynamic_lib import load_nvidia_dynamic_lib as load_nvidia_dynamic_lib
+from cuda.pathfinder._dynamic_libs.find_libdevice import (
+    LibdeviceNotFoundError as LibdeviceNotFoundError,
+    find_libdevice as find_libdevice,
+    get_libdevice_path as get_libdevice_path,
+)
 from cuda.pathfinder._dynamic_libs.supported_nvidia_libs import (
     SUPPORTED_LIBNAMES as SUPPORTED_NVIDIA_LIBNAMES,  # noqa: F401
 )
diff --git a/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/find_libdevice.py b/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/find_libdevice.py
new file mode 100644
index 0000000000..f1c0763765
--- /dev/null
+++ b/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/find_libdevice.py
@@ -0,0 +1,146 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+import functools
+import glob
+import os
+
+from cuda.pathfinder._dynamic_libs.supported_nvidia_libs import IS_WINDOWS
+from cuda.pathfinder._utils.find_sub_dirs import find_sub_dirs_all_sitepackages
+
+# Site-package paths for libdevice (following SITE_PACKAGES_LIBDIRS pattern)
+SITE_PACKAGES_LIBDEVICE_DIRS = (
+    "nvidia/cuda_nvvm/nvvm/libdevice",   # CTK 13+
+    "nvidia/cuda_nvcc/nvvm/libdevice",   # CTK <13
+)
+
+
+class LibdeviceNotFoundError(Exception):
+    pass
+
+
+def _get_cuda_home_or_path() -> str | None:
+    return os.environ.get("CUDA_HOME") or os.environ.get("CUDA_PATH")
+
+
+def _no_such_file_in_dir(
+    dir_path: str, filename: str, error_messages: list[str], attachments: list[str]
+) -> None:
+    error_messages.append(f"No such file: {os.path.join(dir_path, filename)}")
+    if os.path.isdir(dir_path):
+        attachments.append(f'  listdir("{dir_path}"):')
+        for node in sorted(os.listdir(dir_path)):
+            attachments.append(f"    {node}")
+    else:
+        attachments.append(f'  Directory does not exist: "{dir_path}"')
+
+
+class _FindLibdevice:
+    FILENAME = "libdevice.10.bc"
+    REL_PATH = os.path.join("nvvm", "libdevice")
+    
+    def __init__(self):
+        self.error_messages: list[str] = []
+        self.attachments: list[str] = []
+        self.abs_path: str | None = None
+    
+    def try_site_packages(self) -> str | None:
+        for rel_dir in SITE_PACKAGES_LIBDEVICE_DIRS:
+            sub_dir = tuple(rel_dir.split("/"))
+            for abs_dir in find_sub_dirs_all_sitepackages(sub_dir):
+                file_path = os.path.join(abs_dir, self.FILENAME)
+                if os.path.isfile(file_path):
+                    return file_path
+        return None
+    
+    def try_with_conda_prefix(self) -> str | None:
+        conda_prefix = os.environ.get("CONDA_PREFIX")
+        if not conda_prefix:
+            return None
+        
+        anchor = os.path.join(conda_prefix, "Library") if IS_WINDOWS else conda_prefix
+        file_path = os.path.join(anchor, self.REL_PATH, self.FILENAME)
+        if os.path.isfile(file_path):
+            return file_path
+        return None
+    
+    def try_with_cuda_home(self) -> str | None:
+        cuda_home = _get_cuda_home_or_path()
+        if cuda_home is None:
+            self.error_messages.append("CUDA_HOME/CUDA_PATH not set")
+            return None
+        
+        file_path = os.path.join(cuda_home, self.REL_PATH, self.FILENAME)
+        if os.path.isfile(file_path):
+            return file_path
+        
+        _no_such_file_in_dir(
+            os.path.join(cuda_home, self.REL_PATH),
+            self.FILENAME,
+            self.error_messages,
+            self.attachments,
+        )
+        return None
+    
+    def try_common_paths(self) -> str | None:
+        if IS_WINDOWS:
+            bases = [r"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA", r"C:\CUDA"]
+        else:
+            bases = ["/usr/local/cuda", "/opt/cuda"]
+        
+        for base in bases:
+            # Direct path
+            file_path = os.path.join(base, self.REL_PATH, self.FILENAME)
+            if os.path.isfile(file_path):
+                return file_path
+            # Versioned paths (e.g., /usr/local/cuda-13.0)
+            for versioned in sorted(glob.glob(base + "*"), reverse=True):
+                if os.path.isdir(versioned):
+                    file_path = os.path.join(versioned, self.REL_PATH, self.FILENAME)
+                    if os.path.isfile(file_path):
+                        return file_path
+        return None
+    
+    def raise_not_found_error(self) -> None:
+        err = ", ".join(self.error_messages) if self.error_messages else "No search paths available"
+        att = "\n".join(self.attachments) if self.attachments else ""
+        raise LibdeviceNotFoundError(
+            f'Failure finding "{self.FILENAME}": {err}\n{att}'
+        )
+
+
+@functools.cache
+def find_libdevice() -> str:
+    """Find the path to libdevice.10.bc.
+    Raises:
+        LibdeviceNotFoundError: If libdevice.10.bc cannot be found
+    """
+    finder = _FindLibdevice()
+    
+    abs_path = finder.try_site_packages()
+    if abs_path is None:
+        abs_path = finder.try_with_conda_prefix()
+    if abs_path is None:
+        abs_path = finder.try_with_cuda_home()
+    if abs_path is None:
+        abs_path = finder.try_common_paths()
+    
+    if abs_path is None:
+        finder.raise_not_found_error()
+    
+    return abs_path
+
+
+def get_libdevice_path() -> str | None:
+    """Get the path to libdevice.10.bc, or None if not found."""
+    finder = _FindLibdevice()
+    
+    abs_path = finder.try_site_packages()
+    if abs_path is None:
+        abs_path = finder.try_with_conda_prefix()
+    if abs_path is None:
+        abs_path = finder.try_with_cuda_home()
+    if abs_path is None:
+        abs_path = finder.try_common_paths()
+    
+    return abs_path
+

From 07c619958cbdbdda7360837ac6c982cbefb3b2f6 Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Thu, 5 Feb 2026 17:37:45 +0000
Subject: [PATCH 25/50] rebase

---
 cuda_core/cuda/core/experimental/_program.py | 46 +++++++++-----------
 cuda_core/tests/test_program.py              | 32 +++++++-------
 2 files changed, 37 insertions(+), 41 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py
index fedafbc4fb..47b78fd831 100644
--- a/cuda_core/cuda/core/experimental/_program.py
+++ b/cuda_core/cuda/core/experimental/_program.py
@@ -13,11 +13,11 @@
 if TYPE_CHECKING:
     import cuda.bindings
 
-from cuda.core.experimental._device import Device
-from cuda.core.experimental._linker import Linker, LinkerHandleT, LinkerOptions
-from cuda.core.experimental._module import ObjectCode
-from cuda.core.experimental._utils.clear_error_support import assert_type
-from cuda.core.experimental._utils.cuda_utils import (
+from cuda.core._device import Device
+from cuda.core._linker import Linker, LinkerHandleT, LinkerOptions
+from cuda.core._module import ObjectCode
+from cuda.core._utils.clear_error_support import assert_type
+from cuda.core._utils.cuda_utils import (
     CUDAError,
     _handle_boolean_option,
     check_or_create_options,
@@ -102,10 +102,6 @@ def _get_nvvm_module():
         _nvvm_module = None
         raise e
 
-def _find_libdevice_path():
-    """Find libdevice.10.bc for NVVM compilation using cuda.pathfinder."""
-    from cuda.pathfinder import get_libdevice_path
-    return get_libdevice_path()
 
 def _process_define_macro_inner(formatted_options, macro):
     if isinstance(macro, str):
@@ -118,7 +114,11 @@ def _process_define_macro_inner(formatted_options, macro):
         return True
     return False
 
-
+def _find_libdevice_path():
+    """Find libdevice.10.bc for NVVM compilation using cuda.pathfinder."""
+    from cuda.pathfinder import get_libdevice_path
+    return get_libdevice_path()
+    
 def _process_define_macro(formatted_options, macro):
     union_type = "Union[str, tuple[str, str]]"
     if _process_define_macro_inner(formatted_options, macro):
@@ -236,13 +236,9 @@ class ProgramOptions:
     no_display_error_number : bool, optional
         Disable the display of a diagnostic number for warning messages.
         Default: False
-    diag_error : Union[int, list[int]], optional
-        Emit error for a specified diagnostic message number or comma separated list of numbers.
-        Default: None
-    diag_suppress : Union[int, list[int]], optional
-        Suppress a specified diagnostic message number or comma separated list of numbers.
-        Default: None
-    diag_warn : Union[int, list[int]], optional
+    diag_error: Union[int, list[int], tuple[int]] | None = None
+    diag_suppress: Union[int, list[int], tuple[int]] | None = None
+    diag_warn: Union[int, list[int], tuple[int]] | None = None
         Emit warning for a specified diagnostic message number or comma separated lis of numbers.
         Default: None
     brief_diagnostics : bool, optional
@@ -339,7 +335,6 @@ class ProgramOptions:
     split_compile: int | None = None
     fdevice_syntax_only: bool | None = None
     minimal: bool | None = None
-    # Creating as 2 tuples ((names, source), (names,source))
     extra_sources: (
         Union[List[Tuple[str, Union[str, bytes, bytearray]]], Tuple[Tuple[str, Union[str, bytes, bytearray]]]] | None
     ) = None
@@ -356,7 +351,7 @@ class ProgramOptions:
     pch_messages: bool | None = None
     instantiate_templates_in_pch: bool | None = None
     numba_debug: bool | None = None  # Custom option for Numba debugging
-    use_libdevice: bool | None = None # Use libdevice
+    use_libdevice: bool | None = None # For libdevice execution 
 
     def __post_init__(self):
         self._name = self.name.encode()
@@ -501,7 +496,7 @@ def _prepare_nvrtc_options(self) -> list[bytes]:
             options.append("--numba-debug")
         return [o.encode() for o in options]
 
-    def _prepare_nvvm_options(self, as_bytes: bool = True) -> Union[list[bytes], list[str]]:
+     def _prepare_nvvm_options(self, as_bytes: bool = True) -> Union[list[bytes], list[str]]:
         options = []
 
         # Options supported by NVVM
@@ -690,11 +685,11 @@ def __init__(self, code, code_type, options: ProgramOptions = None):
         if code_type == "c++":
             assert_type(code, str)
             # TODO: support pre-loaded headers & include names
-
+            # TODO: allow tuples once NVIDIA/cuda-python#72 is resolved
             if options.extra_sources is not None:
                 raise ValueError("extra_sources is not supported by the NVRTC backend (C++ code_type)")
 
-            # TODO: allow tuples once NVIDIA/cuda-python#72 is resolved
+
             self._mnff.handle = handle_return(nvrtc.nvrtcCreateProgram(code.encode(), options._name, 0, [], []))
             self._mnff.backend = "NVRTC"
             self._backend = "NVRTC"
@@ -704,7 +699,6 @@ def __init__(self, code, code_type, options: ProgramOptions = None):
             assert_type(code, str)
             if options.extra_sources is not None:
                 raise ValueError("extra_sources is not supported by the PTX backend.")
-
             self._linker = Linker(
                 ObjectCode._init(code.encode(), code_type), options=self._translate_program_options(options)
             )
@@ -872,7 +866,7 @@ def compile(self, target_type, name_expressions=(), logs=None):
             nvvm = _get_nvvm_module()
             with _nvvm_exception_manager(self):
                 nvvm.verify_program(self._mnff.handle, len(nvvm_options), nvvm_options)
-                # Invoke libdevice
+                # libdevice compilation 
                 if getattr(self, '_use_libdevice', False):
                     libdevice_path = _find_libdevice_path()
                     if libdevice_path is None:
@@ -882,7 +876,7 @@ def compile(self, target_type, name_expressions=(), logs=None):
                         )
                     with open(libdevice_path, "rb") as f:
                         libdevice_bc = f.read()
-                    # Use lazy_add_module for libdevice bitcode only following numba-cuda
+                    # libdevice for numba-cuda
                     nvvm.lazy_add_module_to_program(self._mnff.handle, libdevice_bc, len(libdevice_bc), None)
                 
                 nvvm.compile_program(self._mnff.handle, len(nvvm_options), nvvm_options)
@@ -923,4 +917,4 @@ def handle(self) -> ProgramHandleT:
             This handle is a Python object. To get the memory address of the underlying C
             handle, call ``int(Program.handle)``.
         """
-        return self._mnff.handle
+        return self._mnff.handle
\ No newline at end of file
diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py
index 00d0709870..c8effbb825 100644
--- a/cuda_core/tests/test_program.py
+++ b/cuda_core/tests/test_program.py
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
@@ -6,11 +6,11 @@
 import warnings
 
 import pytest
-from cuda.core.experimental import _linker
-from cuda.core.experimental._device import Device
-from cuda.core.experimental._module import Kernel, ObjectCode
-from cuda.core.experimental._program import Program, ProgramOptions
-from cuda.core.experimental._utils.cuda_utils import CUDAError, driver, handle_return
+from cuda.core import _linker
+from cuda.core._device import Device
+from cuda.core._module import Kernel, ObjectCode
+from cuda.core._program import Program, ProgramOptions
+from cuda.core._utils.cuda_utils import CUDAError, driver, handle_return
 
 cuda_driver_version = handle_return(driver.cuDriverGetVersion())
 is_culink_backend = _linker._decide_nvjitlink_or_driver()
@@ -21,12 +21,11 @@
     _test_helpers_available = True
 except ImportError:
     _test_helpers_available = False
-
-
+    
 def _is_nvvm_available():
     """Check if NVVM is available."""
     try:
-        from cuda.core.experimental._program import _get_nvvm_module
+        from cuda.core._program import _get_nvvm_module
 
         _get_nvvm_module()
         return True
@@ -39,7 +38,7 @@ def _is_nvvm_available():
 )
 
 try:
-    from cuda.core.experimental._utils.cuda_utils import driver, handle_return
+    from cuda.core._utils.cuda_utils import driver, handle_return, nvrtc
 
     _cuda_driver_version = handle_return(driver.cuDriverGetVersion())
 except Exception:
@@ -49,6 +48,7 @@ def _is_nvvm_available():
 def _get_nvrtc_version_for_tests():
     """
     Get NVRTC version.
+
     Returns:
         int: Version in format major * 1000 + minor * 100 (e.g., 13200 for CUDA 13.2)
         None: If NVRTC is not available
@@ -60,6 +60,7 @@ def _get_nvrtc_version_for_tests():
     except Exception:
         return None
 
+
 _libnvvm_version = None
 _libnvvm_version_attempted = False
 
@@ -97,7 +98,7 @@ def _get_libnvvm_version_for_tests():
     _libnvvm_version_attempted = True
 
     try:
-        from cuda.core.experimental._program import _get_nvvm_module
+        from cuda.core._program import _get_nvvm_module
 
         nvvm = _get_nvvm_module()
 
@@ -145,7 +146,7 @@ def nvvm_ir():
     fallback assumes no version metadata will be present in
     the input nvvm ir
     """
-    from cuda.core.experimental._program import _get_nvvm_module
+    from cuda.core._program import _get_nvvm_module
 
     nvvm = _get_nvvm_module()
     major, minor, debug_major, debug_minor = nvvm.ir_version()
@@ -340,7 +341,7 @@ def test_cpp_program_with_pch_options(init_cuda, tmp_path):
 
 @pytest.mark.parametrize("options", options)
 def test_ptx_program_with_various_options(init_cuda, ptx_code_object, options):
-    program = Program(ptx_code_object._module.decode(), "ptx", options=options)
+    program = Program(ptx_code_object.code.decode(), "ptx", options=options)
     assert program.backend == ("driver" if is_culink_backend else "nvJitLink")
     program.compile("cubin")
     program.close()
@@ -383,7 +384,7 @@ def test_program_compile_valid_target_type(init_cuda):
         ptx_kernel = ptx_object_code.get_kernel("my_kernel")
         assert isinstance(ptx_kernel, Kernel)
 
-    program = Program(ptx_object_code._module.decode(), "ptx", options={"name": "24"})
+    program = Program(ptx_object_code.code.decode(), "ptx", options={"name": "24"})
     cubin_object_code = program.compile("cubin")
     assert isinstance(cubin_object_code, ObjectCode)
     assert cubin_object_code.name == "24"
@@ -420,7 +421,7 @@ def test_program_close():
 @nvvm_available
 def test_nvvm_deferred_import():
     """Test that our deferred NVVM import works correctly"""
-    from cuda.core.experimental._program import _get_nvvm_module
+    from cuda.core._program import _get_nvvm_module
 
     nvvm = _get_nvvm_module()
     assert nvvm is not None
@@ -689,6 +690,7 @@ def test_cpp_program_with_extra_sources():
     options = ProgramOptions(extra_sources=helper)
     with pytest.raises(ValueError, match="extra_sources is not supported by the NVRTC backend"):
         Program(code, "c++", options)
+        
 def test_program_options_as_bytes_nvrtc():
     """Test ProgramOptions.as_bytes() for NVRTC backend"""
     options = ProgramOptions(arch="sm_80", debug=True, lineinfo=True, ftz=True)

From e1b19cc8f51e754a7b997c6a5efc69f1514a4c83 Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Thu, 5 Feb 2026 17:42:05 +0000
Subject: [PATCH 26/50] rebase

---
 cuda_core/cuda/core/{experimental => }/_program.py | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename cuda_core/cuda/core/{experimental => }/_program.py (100%)

diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/_program.py
similarity index 100%
rename from cuda_core/cuda/core/experimental/_program.py
rename to cuda_core/cuda/core/_program.py

From dcdd1009bfdf4f998010f065a892746f32444fd5 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Fri, 6 Feb 2026 12:22:31 -0800
Subject: [PATCH 27/50] tests

---
 cuda_pathfinder/tests/test_find_libdevice.py | 94 ++++++++++++++++++++
 1 file changed, 94 insertions(+)
 create mode 100644 cuda_pathfinder/tests/test_find_libdevice.py

diff --git a/cuda_pathfinder/tests/test_find_libdevice.py b/cuda_pathfinder/tests/test_find_libdevice.py
new file mode 100644
index 0000000000..2d24f397fe
--- /dev/null
+++ b/cuda_pathfinder/tests/test_find_libdevice.py
@@ -0,0 +1,94 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+import os
+
+import pytest
+
+from cuda.pathfinder import find_libdevice
+from cuda.pathfinder._dynamic_libs import find_libdevice as find_libdevice_module
+
+FILENAME = "libdevice.10.bc"
+
+SITE_PACKAGES_REL_DIR_CUDA12 = "nvidia/cuda_nvcc/nvvm/libdevice"
+SITE_PACKAGES_REL_DIR_CUDA13 = "nvidia/cuda_nvvm/nvvm/libdevice"
+
+
+@pytest.fixture
+def clear_find_libdevice_cache():
+    find_libdevice.cache_clear()
+    yield
+    find_libdevice.cache_clear()
+
+
+def _make_libdevice_file(dir_path: str) -> str:
+    os.makedirs(dir_path, exist_ok=True)
+    file_path = os.path.join(dir_path, FILENAME)
+    with open(file_path, "wb"):
+        pass
+    return file_path
+
+
+@pytest.mark.parametrize("rel_dir", [SITE_PACKAGES_REL_DIR_CUDA12, SITE_PACKAGES_REL_DIR_CUDA13])
+@pytest.mark.usefixtures("clear_find_libdevice_cache")
+def test_find_libdevice_via_site_packages(monkeypatch, mocker, tmp_path, rel_dir):
+    libdevice_dir = tmp_path.joinpath(*rel_dir.split("/"))
+    expected_path = str(_make_libdevice_file(str(libdevice_dir)))
+
+    mocker.patch.object(
+        find_libdevice_module,
+        "find_sub_dirs_all_sitepackages",
+        return_value=[str(libdevice_dir)],
+    )
+    monkeypatch.delenv("CONDA_PREFIX", raising=False)
+    monkeypatch.delenv("CUDA_HOME", raising=False)
+    monkeypatch.delenv("CUDA_PATH", raising=False)
+
+    result = find_libdevice()
+
+    assert result == expected_path
+    assert os.path.isfile(result)
+
+
+# same for cu12/cu13
+@pytest.mark.usefixtures("clear_find_libdevice_cache")
+def test_find_libdevice_via_conda(monkeypatch, mocker, tmp_path):
+    rel_path = os.path.join("nvvm", "libdevice")
+    libdevice_dir = tmp_path / rel_path
+    expected_path = str(_make_libdevice_file(str(libdevice_dir)))
+
+    mocker.patch.object(find_libdevice_module, "IS_WINDOWS", False)
+    mocker.patch.object(
+        find_libdevice_module,
+        "find_sub_dirs_all_sitepackages",
+        return_value=[],
+    )
+    monkeypatch.setenv("CONDA_PREFIX", str(tmp_path))
+    monkeypatch.delenv("CUDA_HOME", raising=False)
+    monkeypatch.delenv("CUDA_PATH", raising=False)
+
+    result = find_libdevice()
+
+    assert result == expected_path
+    assert os.path.isfile(result)
+
+
+@pytest.mark.usefixtures("clear_find_libdevice_cache")
+def test_find_libdevice_via_cuda_home(monkeypatch, mocker, tmp_path):
+    rel_path = os.path.join("nvvm", "libdevice")
+    libdevice_dir = tmp_path / rel_path
+    expected_path = str(_make_libdevice_file(str(libdevice_dir)))
+
+    mocker.patch.object(
+        find_libdevice_module,
+        "find_sub_dirs_all_sitepackages",
+        return_value=[],
+    )
+    monkeypatch.delenv("CONDA_PREFIX", raising=False)
+    monkeypatch.setenv("CUDA_HOME", str(tmp_path))
+    monkeypatch.delenv("CUDA_PATH", raising=False)
+
+    result = find_libdevice()
+
+    assert result == expected_path
+    assert os.path.isfile(result)

From aca2e361a4d503b7e289dd31db86842a8fff1384 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Wed, 11 Feb 2026 07:42:44 -0800
Subject: [PATCH 28/50] Address reviews

---
 cuda_core/tests/test_program.py               | 12 +--
 .../_dynamic_libs/find_libdevice.py           | 97 ++++++++-----------
 2 files changed, 43 insertions(+), 66 deletions(-)

diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py
index c8effbb825..bf099c860a 100644
--- a/cuda_core/tests/test_program.py
+++ b/cuda_core/tests/test_program.py
@@ -15,13 +15,7 @@
 cuda_driver_version = handle_return(driver.cuDriverGetVersion())
 is_culink_backend = _linker._decide_nvjitlink_or_driver()
 
-try:
-    from cuda_python_test_helpers.nvvm_bitcode import minimal_nvvmir
 
-    _test_helpers_available = True
-except ImportError:
-    _test_helpers_available = False
-    
 def _is_nvvm_available():
     """Check if NVVM is available."""
     try:
@@ -658,7 +652,6 @@ def test_nvvm_program_with_multiple_extra_sources():
 
 
 @nvvm_available
-@pytest.mark.skipif(not _test_helpers_available, reason="cuda_python_test_helpers not accessible")
 def test_bitcode_format(minimal_nvvmir):
     if len(minimal_nvvmir) < 4:
         pytest.skip("Bitcode file is not valid or empty")
@@ -690,7 +683,8 @@ def test_cpp_program_with_extra_sources():
     options = ProgramOptions(extra_sources=helper)
     with pytest.raises(ValueError, match="extra_sources is not supported by the NVRTC backend"):
         Program(code, "c++", options)
-        
+
+
 def test_program_options_as_bytes_nvrtc():
     """Test ProgramOptions.as_bytes() for NVRTC backend"""
     options = ProgramOptions(arch="sm_80", debug=True, lineinfo=True, ftz=True)
@@ -738,4 +732,4 @@ def test_program_options_as_bytes_nvvm_unsupported_option():
     """Test that unsupported options raise CUDAError for NVVM backend"""
     options = ProgramOptions(arch="sm_80", lineinfo=True)
     with pytest.raises(CUDAError, match="not supported by NVVM backend"):
-        options.as_bytes("nvvm")
\ No newline at end of file
+        options.as_bytes("nvvm")
diff --git a/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/find_libdevice.py b/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/find_libdevice.py
index f1c0763765..29296cfbde 100644
--- a/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/find_libdevice.py
+++ b/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/find_libdevice.py
@@ -1,30 +1,27 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 import functools
 import glob
 import os
 
 from cuda.pathfinder._dynamic_libs.supported_nvidia_libs import IS_WINDOWS
+from cuda.pathfinder._utils.env_vars import get_cuda_home_or_path
 from cuda.pathfinder._utils.find_sub_dirs import find_sub_dirs_all_sitepackages
 
 # Site-package paths for libdevice (following SITE_PACKAGES_LIBDIRS pattern)
 SITE_PACKAGES_LIBDEVICE_DIRS = (
-    "nvidia/cuda_nvvm/nvvm/libdevice",   # CTK 13+
-    "nvidia/cuda_nvcc/nvvm/libdevice",   # CTK <13
+    "nvidia/cuda_nvvm/nvvm/libdevice",  # CTK 13+
+    "nvidia/cuda_nvcc/nvvm/libdevice",  # CTK <13
 )
 
-
-class LibdeviceNotFoundError(Exception):
-    pass
+FILENAME = "libdevice.10.bc"
 
 
-def _get_cuda_home_or_path() -> str | None:
-    return os.environ.get("CUDA_HOME") or os.environ.get("CUDA_PATH")
+class LibdeviceNotFoundError(RuntimeError):
+    pass
 
 
-def _no_such_file_in_dir(
-    dir_path: str, filename: str, error_messages: list[str], attachments: list[str]
-) -> None:
+def _no_such_file_in_dir(dir_path: str, filename: str, error_messages: list[str], attachments: list[str]) -> None:
     error_messages.append(f"No such file: {os.path.join(dir_path, filename)}")
     if os.path.isdir(dir_path):
         attachments.append(f'  listdir("{dir_path}"):')
@@ -35,105 +32,80 @@ def _no_such_file_in_dir(
 
 
 class _FindLibdevice:
-    FILENAME = "libdevice.10.bc"
     REL_PATH = os.path.join("nvvm", "libdevice")
-    
-    def __init__(self):
+
+    def __init__(self) -> None:
         self.error_messages: list[str] = []
         self.attachments: list[str] = []
         self.abs_path: str | None = None
-    
+
     def try_site_packages(self) -> str | None:
         for rel_dir in SITE_PACKAGES_LIBDEVICE_DIRS:
             sub_dir = tuple(rel_dir.split("/"))
             for abs_dir in find_sub_dirs_all_sitepackages(sub_dir):
-                file_path = os.path.join(abs_dir, self.FILENAME)
+                file_path = os.path.join(abs_dir, FILENAME)
                 if os.path.isfile(file_path):
                     return file_path
         return None
-    
+
     def try_with_conda_prefix(self) -> str | None:
         conda_prefix = os.environ.get("CONDA_PREFIX")
         if not conda_prefix:
             return None
-        
+
         anchor = os.path.join(conda_prefix, "Library") if IS_WINDOWS else conda_prefix
-        file_path = os.path.join(anchor, self.REL_PATH, self.FILENAME)
+        file_path = os.path.join(anchor, self.REL_PATH, FILENAME)
         if os.path.isfile(file_path):
             return file_path
         return None
-    
+
     def try_with_cuda_home(self) -> str | None:
-        cuda_home = _get_cuda_home_or_path()
+        cuda_home = get_cuda_home_or_path()
         if cuda_home is None:
             self.error_messages.append("CUDA_HOME/CUDA_PATH not set")
             return None
-        
-        file_path = os.path.join(cuda_home, self.REL_PATH, self.FILENAME)
+
+        file_path = os.path.join(cuda_home, self.REL_PATH, FILENAME)
         if os.path.isfile(file_path):
             return file_path
-        
+
         _no_such_file_in_dir(
             os.path.join(cuda_home, self.REL_PATH),
-            self.FILENAME,
+            FILENAME,
             self.error_messages,
             self.attachments,
         )
         return None
-    
+
     def try_common_paths(self) -> str | None:
         if IS_WINDOWS:
             bases = [r"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA", r"C:\CUDA"]
         else:
             bases = ["/usr/local/cuda", "/opt/cuda"]
-        
+
         for base in bases:
             # Direct path
-            file_path = os.path.join(base, self.REL_PATH, self.FILENAME)
+            file_path = os.path.join(base, self.REL_PATH, FILENAME)
             if os.path.isfile(file_path):
                 return file_path
             # Versioned paths (e.g., /usr/local/cuda-13.0)
             for versioned in sorted(glob.glob(base + "*"), reverse=True):
                 if os.path.isdir(versioned):
-                    file_path = os.path.join(versioned, self.REL_PATH, self.FILENAME)
+                    file_path = os.path.join(versioned, self.REL_PATH, FILENAME)
                     if os.path.isfile(file_path):
                         return file_path
         return None
-    
+
     def raise_not_found_error(self) -> None:
         err = ", ".join(self.error_messages) if self.error_messages else "No search paths available"
         att = "\n".join(self.attachments) if self.attachments else ""
-        raise LibdeviceNotFoundError(
-            f'Failure finding "{self.FILENAME}": {err}\n{att}'
-        )
-
-
-@functools.cache
-def find_libdevice() -> str:
-    """Find the path to libdevice.10.bc.
-    Raises:
-        LibdeviceNotFoundError: If libdevice.10.bc cannot be found
-    """
-    finder = _FindLibdevice()
-    
-    abs_path = finder.try_site_packages()
-    if abs_path is None:
-        abs_path = finder.try_with_conda_prefix()
-    if abs_path is None:
-        abs_path = finder.try_with_cuda_home()
-    if abs_path is None:
-        abs_path = finder.try_common_paths()
-    
-    if abs_path is None:
-        finder.raise_not_found_error()
-    
-    return abs_path
+        raise LibdeviceNotFoundError(f'Failure finding "{FILENAME}": {err}\n{att}')
 
 
 def get_libdevice_path() -> str | None:
     """Get the path to libdevice.10.bc, or None if not found."""
     finder = _FindLibdevice()
-    
+
     abs_path = finder.try_site_packages()
     if abs_path is None:
         abs_path = finder.try_with_conda_prefix()
@@ -141,6 +113,17 @@ def get_libdevice_path() -> str | None:
         abs_path = finder.try_with_cuda_home()
     if abs_path is None:
         abs_path = finder.try_common_paths()
-    
+
     return abs_path
 
+
+@functools.cache
+def find_libdevice() -> str:
+    """Find the path to libdevice*.bc.
+    Raises:
+        LibdeviceNotFoundError: If libdevice.10.bc cannot be found
+    """
+    path_or_none = get_libdevice_path()
+    if path_or_none is None:
+        raise LibdeviceNotFoundError(f"{FILENAME} not found")
+    return path_or_none

From af6e70ac8e3e92c8be073ce2b362763fa7f5a087 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Wed, 11 Feb 2026 07:50:35 -0800
Subject: [PATCH 29/50] put libdevice stuff under _static_libs

---
 cuda_pathfinder/cuda/pathfinder/__init__.py      | 16 ++++++++++------
 .../find_libdevice.py                            |  0
 cuda_pathfinder/tests/test_find_libdevice.py     |  2 +-
 3 files changed, 11 insertions(+), 7 deletions(-)
 rename cuda_pathfinder/cuda/pathfinder/{_dynamic_libs => _static_libs}/find_libdevice.py (100%)

diff --git a/cuda_pathfinder/cuda/pathfinder/__init__.py b/cuda_pathfinder/cuda/pathfinder/__init__.py
index 3b2d44eb5d..ece060d81a 100644
--- a/cuda_pathfinder/cuda/pathfinder/__init__.py
+++ b/cuda_pathfinder/cuda/pathfinder/__init__.py
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
 """cuda.pathfinder public APIs"""
@@ -10,11 +10,6 @@
 from cuda.pathfinder._dynamic_libs.load_dl_common import DynamicLibNotFoundError as DynamicLibNotFoundError
 from cuda.pathfinder._dynamic_libs.load_dl_common import LoadedDL as LoadedDL
 from cuda.pathfinder._dynamic_libs.load_nvidia_dynamic_lib import load_nvidia_dynamic_lib as load_nvidia_dynamic_lib
-from cuda.pathfinder._dynamic_libs.find_libdevice import (
-    LibdeviceNotFoundError as LibdeviceNotFoundError,
-    find_libdevice as find_libdevice,
-    get_libdevice_path as get_libdevice_path,
-)
 from cuda.pathfinder._dynamic_libs.supported_nvidia_libs import (
     SUPPORTED_LIBNAMES as SUPPORTED_NVIDIA_LIBNAMES,  # noqa: F401
 )
@@ -24,6 +19,15 @@
     locate_nvidia_header_directory as locate_nvidia_header_directory,
 )
 from cuda.pathfinder._headers.supported_nvidia_headers import SUPPORTED_HEADERS_CTK as _SUPPORTED_HEADERS_CTK
+from cuda.pathfinder._static_libs.find_libdevice import (
+    LibdeviceNotFoundError as LibdeviceNotFoundError,
+)
+from cuda.pathfinder._static_libs.find_libdevice import (
+    find_libdevice as find_libdevice,
+)
+from cuda.pathfinder._static_libs.find_libdevice import (
+    get_libdevice_path as get_libdevice_path,
+)
 
 from cuda.pathfinder._version import __version__  # isort: skip  # noqa: F401
 
diff --git a/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/find_libdevice.py b/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py
similarity index 100%
rename from cuda_pathfinder/cuda/pathfinder/_dynamic_libs/find_libdevice.py
rename to cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py
diff --git a/cuda_pathfinder/tests/test_find_libdevice.py b/cuda_pathfinder/tests/test_find_libdevice.py
index 2d24f397fe..a1dfc0b64f 100644
--- a/cuda_pathfinder/tests/test_find_libdevice.py
+++ b/cuda_pathfinder/tests/test_find_libdevice.py
@@ -6,7 +6,7 @@
 import pytest
 
 from cuda.pathfinder import find_libdevice
-from cuda.pathfinder._dynamic_libs import find_libdevice as find_libdevice_module
+from cuda.pathfinder._static_libs import find_libdevice as find_libdevice_module
 
 FILENAME = "libdevice.10.bc"
 

From b1d423f591426283f7948237684fa6f1aff6c560 Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Wed, 11 Feb 2026 16:32:14 +0000
Subject: [PATCH 30/50] refresh reviews

---
 .../pathfinder/_static_libs/find_libdevice.py | 22 ++++++++-----------
 1 file changed, 9 insertions(+), 13 deletions(-)

diff --git a/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py b/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py
index 29296cfbde..0cfcd4e493 100644
--- a/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py
+++ b/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py
@@ -7,6 +7,7 @@
 from cuda.pathfinder._dynamic_libs.supported_nvidia_libs import IS_WINDOWS
 from cuda.pathfinder._utils.env_vars import get_cuda_home_or_path
 from cuda.pathfinder._utils.find_sub_dirs import find_sub_dirs_all_sitepackages
+from cuda.pathfinder._dynamic_libs.load_dl_common import DynamicLibNotFoundError as DynamicLibNotFoundError
 
 # Site-package paths for libdevice (following SITE_PACKAGES_LIBDIRS pattern)
 SITE_PACKAGES_LIBDEVICE_DIRS = (
@@ -15,11 +16,10 @@
 )
 
 FILENAME = "libdevice.10.bc"
-
-
-class LibdeviceNotFoundError(RuntimeError):
-    pass
-
+if IS_WINDOWS:
+    bases = [r"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA", r"C:\CUDA"]
+else:
+    bases = ["/usr/local/cuda", "/opt/cuda"]
 
 def _no_such_file_in_dir(dir_path: str, filename: str, error_messages: list[str], attachments: list[str]) -> None:
     error_messages.append(f"No such file: {os.path.join(dir_path, filename)}")
@@ -78,10 +78,6 @@ def try_with_cuda_home(self) -> str | None:
         return None
 
     def try_common_paths(self) -> str | None:
-        if IS_WINDOWS:
-            bases = [r"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA", r"C:\CUDA"]
-        else:
-            bases = ["/usr/local/cuda", "/opt/cuda"]
 
         for base in bases:
             # Direct path
@@ -99,11 +95,11 @@ def try_common_paths(self) -> str | None:
     def raise_not_found_error(self) -> None:
         err = ", ".join(self.error_messages) if self.error_messages else "No search paths available"
         att = "\n".join(self.attachments) if self.attachments else ""
-        raise LibdeviceNotFoundError(f'Failure finding "{FILENAME}": {err}\n{att}')
+        raise DynamicLibNotFoundError(f'Failure finding "{FILENAME}": {err}\n{att}')
 
 
 def get_libdevice_path() -> str | None:
-    """Get the path to libdevice.10.bc, or None if not found."""
+    """Get the path to libdevice*.bc, or None if not found."""
     finder = _FindLibdevice()
 
     abs_path = finder.try_site_packages()
@@ -121,9 +117,9 @@ def get_libdevice_path() -> str | None:
 def find_libdevice() -> str:
     """Find the path to libdevice*.bc.
     Raises:
-        LibdeviceNotFoundError: If libdevice.10.bc cannot be found
+        DynamicLibNotFoundError: If libdevice.10.bc cannot be found
     """
     path_or_none = get_libdevice_path()
     if path_or_none is None:
-        raise LibdeviceNotFoundError(f"{FILENAME} not found")
+        raise DynamicLibNotFoundError(f"{FILENAME} not found")
     return path_or_none

From 4cedbb7c2bb6790c6f8e4f7a60319c28663d412c Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Wed, 11 Feb 2026 17:37:12 +0000
Subject: [PATCH 31/50] change program to cython per PR 1565

---
 cuda_core/cuda/core/_program.pxd |   17 +
 cuda_core/cuda/core/_program.py  |  925 --------------------------
 cuda_core/cuda/core/_program.pyx | 1038 ++++++++++++++++++++++++++++++
 3 files changed, 1055 insertions(+), 925 deletions(-)
 create mode 100644 cuda_core/cuda/core/_program.pxd
 delete mode 100644 cuda_core/cuda/core/_program.py
 create mode 100644 cuda_core/cuda/core/_program.pyx

diff --git a/cuda_core/cuda/core/_program.pxd b/cuda_core/cuda/core/_program.pxd
new file mode 100644
index 0000000000..d4abe85ff8
--- /dev/null
+++ b/cuda_core/cuda/core/_program.pxd
@@ -0,0 +1,17 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from ._resource_handles cimport NvrtcProgramHandle, NvvmProgramHandle
+
+
+cdef class Program:
+    cdef:
+        NvrtcProgramHandle _h_nvrtc
+        NvvmProgramHandle _h_nvvm
+        str _backend
+        object _linker  # Linker
+        object _options  # ProgramOptions
+        object __weakref__
+        bint _use_libdevice      # Flag for libdevice loading
+        int _module_count  
diff --git a/cuda_core/cuda/core/_program.py b/cuda_core/cuda/core/_program.py
deleted file mode 100644
index 34eac95270..0000000000
--- a/cuda_core/cuda/core/_program.py
+++ /dev/null
@@ -1,925 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-#
-# SPDX-License-Identifier: Apache-2.0
-
-from __future__ import annotations
-
-import weakref
-from contextlib import contextmanager
-from dataclasses import dataclass
-from typing import TYPE_CHECKING, Union
-from warnings import warn
-
-if TYPE_CHECKING:
-    import cuda.bindings
-
-from cuda.core._device import Device
-from cuda.core._linker import Linker, LinkerHandleT, LinkerOptions
-from cuda.core._module import ObjectCode
-from cuda.core._utils.clear_error_support import assert_type
-from cuda.core._utils.cuda_utils import (
-    CUDAError,
-    _handle_boolean_option,
-    check_or_create_options,
-    driver,
-    get_binding_version,
-    handle_return,
-    is_nested_sequence,
-    is_sequence,
-    nvrtc,
-)
-
-
-@contextmanager
-def _nvvm_exception_manager(self):
-    """
-    Taken from _linker.py
-    """
-    try:
-        yield
-    except Exception as e:
-        error_log = ""
-        if hasattr(self, "_mnff"):
-            try:
-                nvvm = _get_nvvm_module()
-                logsize = nvvm.get_program_log_size(self._mnff.handle)
-                if logsize > 1:
-                    log = bytearray(logsize)
-                    nvvm.get_program_log(self._mnff.handle, log)
-                    error_log = log.decode("utf-8", errors="backslashreplace")
-            except Exception:
-                error_log = ""
-        # Starting Python 3.11 we could also use Exception.add_note() for the same purpose, but
-        # unfortunately we are still supporting Python 3.10...
-        e.args = (e.args[0] + (f"\nNVVM program log: {error_log}" if error_log else ""), *e.args[1:])
-        raise e
-
-
-_nvvm_module = None
-_nvvm_import_attempted = False
-
-
-def _get_nvvm_module():
-    """
-    Handles the import of NVVM module with version and availability checks.
-    NVVM bindings were added in cuda-bindings 12.9.0, so we need to handle cases where:
-    1. cuda.bindings is not new enough (< 12.9.0)
-    2. libnvvm is not found in the Python environment
-
-    Returns:
-        The nvvm module if available and working
-
-    Raises:
-        RuntimeError: If NVVM is not available due to version or library issues
-    """
-    global _nvvm_module, _nvvm_import_attempted
-
-    if _nvvm_import_attempted:
-        if _nvvm_module is None:
-            raise RuntimeError("NVVM module is not available (previous import attempt failed)")
-        return _nvvm_module
-
-    _nvvm_import_attempted = True
-
-    try:
-        version = get_binding_version()
-        if version < (12, 9):
-            raise RuntimeError(
-                f"NVVM bindings require cuda-bindings >= 12.9.0, but found {version[0]}.{version[1]}.x. "
-                "Please update cuda-bindings to use NVVM features."
-            )
-
-        from cuda.bindings import nvvm
-        from cuda.bindings._internal.nvvm import _inspect_function_pointer
-
-        if _inspect_function_pointer("__nvvmCreateProgram") == 0:
-            raise RuntimeError("NVVM library (libnvvm) is not available in this Python environment. ")
-
-        _nvvm_module = nvvm
-        return _nvvm_module
-
-    except RuntimeError as e:
-        _nvvm_module = None
-        raise e
-
-
-def _process_define_macro_inner(formatted_options, macro):
-    if isinstance(macro, str):
-        formatted_options.append(f"--define-macro={macro}")
-        return True
-    if isinstance(macro, tuple):
-        if len(macro) != 2 or any(not isinstance(val, str) for val in macro):
-            raise RuntimeError(f"Expected define_macro tuple[str, str], got {macro}")
-        formatted_options.append(f"--define-macro={macro[0]}={macro[1]}")
-        return True
-    return False
-
-def _find_libdevice_path():
-    """Find libdevice.10.bc for NVVM compilation using cuda.pathfinder."""
-    from cuda.pathfinder import get_libdevice_path
-    return get_libdevice_path()
-    
-def _process_define_macro(formatted_options, macro):
-    union_type = "Union[str, tuple[str, str]]"
-    if _process_define_macro_inner(formatted_options, macro):
-        return
-    if is_nested_sequence(macro):
-        for seq_macro in macro:
-            if not _process_define_macro_inner(formatted_options, seq_macro):
-                raise RuntimeError(f"Expected define_macro {union_type}, got {seq_macro}")
-        return
-    raise RuntimeError(f"Expected define_macro {union_type}, list[{union_type}], got {macro}")
-
-
-@dataclass
-class ProgramOptions:
-    """Customizable options for configuring `Program`.
-
-    Attributes
-    ----------
-    name : str, optional
-        Name of the program. If the compilation succeeds, the name is passed down to the generated `ObjectCode`.
-    arch : str, optional
-        Pass the SM architecture value, such as ``sm_<CC>`` (for generating CUBIN) or
-        ``compute_<CC>`` (for generating PTX). If not provided, the current device's architecture
-        will be used.
-    relocatable_device_code : bool, optional
-        Enable (disable) the generation of relocatable device code.
-        Default: False
-    extensible_whole_program : bool, optional
-        Do extensible whole program compilation of device code.
-        Default: False
-    debug : bool, optional
-        Generate debug information. If --dopt is not specified, then turns off all optimizations.
-        Default: False
-    lineinfo: bool, optional
-        Generate line-number information.
-        Default: False
-    device_code_optimize : bool, optional
-        Enable device code optimization. When specified along with ‘-G’, enables limited debug information generation
-        for optimized device code.
-        Default: None
-    ptxas_options : Union[str, list[str]], optional
-        Specify one or more options directly to ptxas, the PTX optimizing assembler. Options should be strings.
-        For example ["-v", "-O2"].
-        Default: None
-    max_register_count : int, optional
-        Specify the maximum amount of registers that GPU functions can use.
-        Default: None
-    ftz : bool, optional
-        When performing single-precision floating-point operations, flush denormal values to zero or preserve denormal
-        values.
-        Default: False
-    prec_sqrt : bool, optional
-        For single-precision floating-point square root, use IEEE round-to-nearest mode or use a faster approximation.
-        Default: True
-    prec_div : bool, optional
-        For single-precision floating-point division and reciprocals, use IEEE round-to-nearest mode or use a faster
-        approximation.
-        Default: True
-    fma : bool, optional
-        Enables (disables) the contraction of floating-point multiplies and adds/subtracts into floating-point
-        multiply-add operations.
-        Default: True
-    use_fast_math : bool, optional
-        Make use of fast math operations.
-        Default: False
-    extra_device_vectorization : bool, optional
-        Enables more aggressive device code vectorization in the NVVM optimizer.
-        Default: False
-    link_time_optimization : bool, optional
-        Generate intermediate code for later link-time optimization.
-        Default: False
-    gen_opt_lto : bool, optional
-        Run the optimizer passes before generating the LTO IR.
-        Default: False
-    define_macro : Union[str, tuple[str, str], list[Union[str, tuple[str, str]]]], optional
-        Predefine a macro. Can be either a string, in which case that macro will be set to 1, a 2 element tuple of
-        strings, in which case the first element is defined as the second, or a list of strings or tuples.
-        Default: None
-    undefine_macro : Union[str, list[str]], optional
-        Cancel any previous definition of a macro, or list of macros.
-        Default: None
-    include_path : Union[str, list[str]], optional
-        Add the directory or directories to the list of directories to be searched for headers.
-        Default: None
-    pre_include : Union[str, list[str]], optional
-        Preinclude one or more headers during preprocessing. Can be either a string or a list of strings.
-        Default: None
-    no_source_include : bool, optional
-        Disable the default behavior of adding the directory of each input source to the include path.
-        Default: False
-    std : str, optional
-        Set language dialect to C++03, C++11, C++14, C++17 or C++20.
-        Default: c++17
-    builtin_move_forward : bool, optional
-        Provide builtin definitions of std::move and std::forward.
-        Default: True
-    builtin_initializer_list : bool, optional
-        Provide builtin definitions of std::initializer_list class and member functions.
-        Default: True
-    disable_warnings : bool, optional
-        Inhibit all warning messages.
-        Default: False
-    restrict : bool, optional
-        Programmer assertion that all kernel pointer parameters are restrict pointers.
-        Default: False
-    device_as_default_execution_space : bool, optional
-        Treat entities with no execution space annotation as __device__ entities.
-        Default: False
-    device_int128 : bool, optional
-        Allow the __int128 type in device code.
-        Default: False
-    optimization_info : str, optional
-        Provide optimization reports for the specified kind of optimization.
-        Default: None
-    no_display_error_number : bool, optional
-        Disable the display of a diagnostic number for warning messages.
-        Default: False
-    diag_error : Union[int, list[int]], optional
-        Emit error for a specified diagnostic message number or comma separated list of numbers.
-        Default: None
-    diag_suppress : Union[int, list[int]], optional
-        Suppress a specified diagnostic message number or comma separated list of numbers.
-        Default: None
-    diag_warn : Union[int, list[int]], optional
-        Emit warning for a specified diagnostic message number or comma separated lis of numbers.
-        Default: None
-    brief_diagnostics : bool, optional
-        Disable or enable showing source line and column info in a diagnostic.
-        Default: False
-    time : str, optional
-        Generate a CSV table with the time taken by each compilation phase.
-        Default: None
-    split_compile : int, optional
-        Perform compiler optimizations in parallel.
-        Default: 1
-    fdevice_syntax_only : bool, optional
-        Ends device compilation after front-end syntax checking.
-        Default: False
-    minimal : bool, optional
-        Omit certain language features to reduce compile time for small programs.
-        Default: False
-    no_cache : bool, optional
-        Disable compiler caching.
-        Default: False
-    fdevice_time_trace : str, optional
-        Generate time trace JSON for profiling compilation (NVRTC only).
-        Default: None
-    device_float128 : bool, optional
-        Allow __float128 type in device code (NVRTC only).
-        Default: False
-    frandom_seed : str, optional
-        Set random seed for randomized optimizations (NVRTC only).
-        Default: None
-    ofast_compile : str, optional
-        Fast compilation mode: "0", "min", "mid", or "max" (NVRTC only).
-        Default: None
-    pch : bool, optional
-        Use default precompiled header (NVRTC only, CUDA 12.8+).
-        Default: False
-    create_pch : str, optional
-        Create precompiled header file (NVRTC only, CUDA 12.8+).
-        Default: None
-    use_pch : str, optional
-        Use specific precompiled header file (NVRTC only, CUDA 12.8+).
-        Default: None
-    pch_dir : str, optional
-        PCH directory location (NVRTC only, CUDA 12.8+).
-        Default: None
-    pch_verbose : bool, optional
-        Verbose PCH output (NVRTC only, CUDA 12.8+).
-        Default: False
-    pch_messages : bool, optional
-        Control PCH diagnostic messages (NVRTC only, CUDA 12.8+).
-        Default: False
-    instantiate_templates_in_pch : bool, optional
-        Control template instantiation in PCH (NVRTC only, CUDA 12.8+).
-        Default: False
-    """
-
-    name: str | None = "default_program"
-    arch: str | None = None
-    relocatable_device_code: bool | None = None
-    extensible_whole_program: bool | None = None
-    debug: bool | None = None
-    lineinfo: bool | None = None
-    device_code_optimize: bool | None = None
-    ptxas_options: Union[str, list[str], tuple[str]] | None = None
-    max_register_count: int | None = None
-    ftz: bool | None = None
-    prec_sqrt: bool | None = None
-    prec_div: bool | None = None
-    fma: bool | None = None
-    use_fast_math: bool | None = None
-    extra_device_vectorization: bool | None = None
-    link_time_optimization: bool | None = None
-    gen_opt_lto: bool | None = None
-    define_macro: (
-        Union[str, tuple[str, str], list[Union[str, tuple[str, str]]], tuple[Union[str, tuple[str, str]]]] | None
-    ) = None
-    undefine_macro: Union[str, list[str], tuple[str]] | None = None
-    include_path: Union[str, list[str], tuple[str]] | None = None
-    pre_include: Union[str, list[str], tuple[str]] | None = None
-    no_source_include: bool | None = None
-    std: str | None = None
-    builtin_move_forward: bool | None = None
-    builtin_initializer_list: bool | None = None
-    disable_warnings: bool | None = None
-    restrict: bool | None = None
-    device_as_default_execution_space: bool | None = None
-    device_int128: bool | None = None
-    optimization_info: str | None = None
-    no_display_error_number: bool | None = None
-    diag_error: Union[int, list[int], tuple[int]] | None = None
-    diag_suppress: Union[int, list[int], tuple[int]] | None = None
-    diag_warn: Union[int, list[int], tuple[int]] | None = None
-    brief_diagnostics: bool | None = None
-    time: str | None = None
-    split_compile: int | None = None
-    fdevice_syntax_only: bool | None = None
-    minimal: bool | None = None
-    extra_sources: (
-        Union[List[Tuple[str, Union[str, bytes, bytearray]]], Tuple[Tuple[str, Union[str, bytes, bytearray]]]] | None
-    ) = None
-    no_cache: bool | None = None
-    fdevice_time_trace: str | None = None
-    device_float128: bool | None = None
-    frandom_seed: str | None = None
-    ofast_compile: str | None = None
-    pch: bool | None = None
-    create_pch: str | None = None
-    use_pch: str | None = None
-    pch_dir: str | None = None
-    pch_verbose: bool | None = None
-    pch_messages: bool | None = None
-    instantiate_templates_in_pch: bool | None = None
-    numba_debug: bool | None = None  # Custom option for Numba debugging
-    use_libdevice: bool | None = None # For libdevice execution 
-
-    def __post_init__(self):
-        self._name = self.name.encode()
-        # Set arch to default if not provided
-        if self.arch is None:
-            self.arch = f"sm_{Device().arch}"
-
-    def _prepare_nvrtc_options(self) -> list[bytes]:
-        # Build NVRTC-specific options
-        options = [f"-arch={self.arch}"]
-        if self.relocatable_device_code is not None:
-            options.append(f"--relocatable-device-code={_handle_boolean_option(self.relocatable_device_code)}")
-        if self.extensible_whole_program is not None and self.extensible_whole_program:
-            options.append("--extensible-whole-program")
-        if self.debug is not None and self.debug:
-            options.append("--device-debug")
-        if self.lineinfo is not None and self.lineinfo:
-            options.append("--generate-line-info")
-        if self.device_code_optimize is not None and self.device_code_optimize:
-            options.append("--dopt=on")
-        if self.ptxas_options is not None:
-            opt_name = "--ptxas-options"
-            if isinstance(self.ptxas_options, str):
-                options.append(f"{opt_name}={self.ptxas_options}")
-            elif is_sequence(self.ptxas_options):
-                for opt_value in self.ptxas_options:
-                    options.append(f"{opt_name}={opt_value}")
-        if self.max_register_count is not None:
-            options.append(f"--maxrregcount={self.max_register_count}")
-        if self.ftz is not None:
-            options.append(f"--ftz={_handle_boolean_option(self.ftz)}")
-        if self.prec_sqrt is not None:
-            options.append(f"--prec-sqrt={_handle_boolean_option(self.prec_sqrt)}")
-        if self.prec_div is not None:
-            options.append(f"--prec-div={_handle_boolean_option(self.prec_div)}")
-        if self.fma is not None:
-            options.append(f"--fmad={_handle_boolean_option(self.fma)}")
-        if self.use_fast_math is not None and self.use_fast_math:
-            options.append("--use_fast_math")
-        if self.extra_device_vectorization is not None and self.extra_device_vectorization:
-            options.append("--extra-device-vectorization")
-        if self.link_time_optimization is not None and self.link_time_optimization:
-            options.append("--dlink-time-opt")
-        if self.gen_opt_lto is not None and self.gen_opt_lto:
-            options.append("--gen-opt-lto")
-        if self.define_macro is not None:
-            _process_define_macro(options, self.define_macro)
-        if self.undefine_macro is not None:
-            if isinstance(self.undefine_macro, str):
-                options.append(f"--undefine-macro={self.undefine_macro}")
-            elif is_sequence(self.undefine_macro):
-                for macro in self.undefine_macro:
-                    options.append(f"--undefine-macro={macro}")
-        if self.include_path is not None:
-            if isinstance(self.include_path, str):
-                options.append(f"--include-path={self.include_path}")
-            elif is_sequence(self.include_path):
-                for path in self.include_path:
-                    options.append(f"--include-path={path}")
-        if self.pre_include is not None:
-            if isinstance(self.pre_include, str):
-                options.append(f"--pre-include={self.pre_include}")
-            elif is_sequence(self.pre_include):
-                for header in self.pre_include:
-                    options.append(f"--pre-include={header}")
-        if self.no_source_include is not None and self.no_source_include:
-            options.append("--no-source-include")
-        if self.std is not None:
-            options.append(f"--std={self.std}")
-        if self.builtin_move_forward is not None:
-            options.append(f"--builtin-move-forward={_handle_boolean_option(self.builtin_move_forward)}")
-        if self.builtin_initializer_list is not None:
-            options.append(f"--builtin-initializer-list={_handle_boolean_option(self.builtin_initializer_list)}")
-        if self.disable_warnings is not None and self.disable_warnings:
-            options.append("--disable-warnings")
-        if self.restrict is not None and self.restrict:
-            options.append("--restrict")
-        if self.device_as_default_execution_space is not None and self.device_as_default_execution_space:
-            options.append("--device-as-default-execution-space")
-        if self.device_int128 is not None and self.device_int128:
-            options.append("--device-int128")
-        if self.device_float128 is not None and self.device_float128:
-            options.append("--device-float128")
-        if self.optimization_info is not None:
-            options.append(f"--optimization-info={self.optimization_info}")
-        if self.no_display_error_number is not None and self.no_display_error_number:
-            options.append("--no-display-error-number")
-        if self.diag_error is not None:
-            if isinstance(self.diag_error, int):
-                options.append(f"--diag-error={self.diag_error}")
-            elif is_sequence(self.diag_error):
-                for error in self.diag_error:
-                    options.append(f"--diag-error={error}")
-        if self.diag_suppress is not None:
-            if isinstance(self.diag_suppress, int):
-                options.append(f"--diag-suppress={self.diag_suppress}")
-            elif is_sequence(self.diag_suppress):
-                for suppress in self.diag_suppress:
-                    options.append(f"--diag-suppress={suppress}")
-        if self.diag_warn is not None:
-            if isinstance(self.diag_warn, int):
-                options.append(f"--diag-warn={self.diag_warn}")
-            elif is_sequence(self.diag_warn):
-                for warn in self.diag_warn:
-                    options.append(f"--diag-warn={warn}")
-        if self.brief_diagnostics is not None:
-            options.append(f"--brief-diagnostics={_handle_boolean_option(self.brief_diagnostics)}")
-        if self.time is not None:
-            options.append(f"--time={self.time}")
-        if self.split_compile is not None:
-            options.append(f"--split-compile={self.split_compile}")
-        if self.fdevice_syntax_only is not None and self.fdevice_syntax_only:
-            options.append("--fdevice-syntax-only")
-        if self.minimal is not None and self.minimal:
-            options.append("--minimal")
-        if self.no_cache is not None and self.no_cache:
-            options.append("--no-cache")
-        if self.fdevice_time_trace is not None:
-            options.append(f"--fdevice-time-trace={self.fdevice_time_trace}")
-        if self.frandom_seed is not None:
-            options.append(f"--frandom-seed={self.frandom_seed}")
-        if self.ofast_compile is not None:
-            options.append(f"--Ofast-compile={self.ofast_compile}")
-        # PCH options (CUDA 12.8+)
-        if self.pch is not None and self.pch:
-            options.append("--pch")
-        if self.create_pch is not None:
-            options.append(f"--create-pch={self.create_pch}")
-        if self.use_pch is not None:
-            options.append(f"--use-pch={self.use_pch}")
-        if self.pch_dir is not None:
-            options.append(f"--pch-dir={self.pch_dir}")
-        if self.pch_verbose is not None:
-            options.append(f"--pch-verbose={_handle_boolean_option(self.pch_verbose)}")
-        if self.pch_messages is not None:
-            options.append(f"--pch-messages={_handle_boolean_option(self.pch_messages)}")
-        if self.instantiate_templates_in_pch is not None:
-            options.append(
-                f"--instantiate-templates-in-pch={_handle_boolean_option(self.instantiate_templates_in_pch)}"
-            )
-        if self.numba_debug:
-            options.append("--numba-debug")
-        return [o.encode() for o in options]
-
-    def _prepare_nvvm_options(self, as_bytes: bool = True) -> Union[list[bytes], list[str]]:
-
-        options = []
-
-        # Options supported by NVVM
-        assert self.arch is not None
-        arch = self.arch
-        if arch.startswith("sm_"):
-            arch = f"compute_{arch[3:]}"
-        options.append(f"-arch={arch}")
-        if self.debug is not None and self.debug:
-            options.append("-g")
-        if self.device_code_optimize is False:
-            options.append("-opt=0")
-        elif self.device_code_optimize is True:
-            options.append("-opt=3")
-        # NVVM uses 0/1 instead of true/false for boolean options
-        if self.ftz is not None:
-            options.append(f"-ftz={'1' if self.ftz else '0'}")
-        if self.prec_sqrt is not None:
-            options.append(f"-prec-sqrt={'1' if self.prec_sqrt else '0'}")
-        if self.prec_div is not None:
-            options.append(f"-prec-div={'1' if self.prec_div else '0'}")
-        if self.fma is not None:
-            options.append(f"-fma={'1' if self.fma else '0'}")
-
-        # Check for unsupported options and raise error if they are set
-        unsupported = []
-        if self.relocatable_device_code is not None:
-            unsupported.append("relocatable_device_code")
-        if self.extensible_whole_program is not None and self.extensible_whole_program:
-            unsupported.append("extensible_whole_program")
-        if self.lineinfo is not None and self.lineinfo:
-            unsupported.append("lineinfo")
-        if self.ptxas_options is not None:
-            unsupported.append("ptxas_options")
-        if self.max_register_count is not None:
-            unsupported.append("max_register_count")
-        if self.use_fast_math is not None and self.use_fast_math:
-            unsupported.append("use_fast_math")
-        if self.extra_device_vectorization is not None and self.extra_device_vectorization:
-            unsupported.append("extra_device_vectorization")
-        if self.gen_opt_lto is not None and self.gen_opt_lto:
-            unsupported.append("gen_opt_lto")
-        if self.define_macro is not None:
-            unsupported.append("define_macro")
-        if self.undefine_macro is not None:
-            unsupported.append("undefine_macro")
-        if self.include_path is not None:
-            unsupported.append("include_path")
-        if self.pre_include is not None:
-            unsupported.append("pre_include")
-        if self.no_source_include is not None and self.no_source_include:
-            unsupported.append("no_source_include")
-        if self.std is not None:
-            unsupported.append("std")
-        if self.builtin_move_forward is not None:
-            unsupported.append("builtin_move_forward")
-        if self.builtin_initializer_list is not None:
-            unsupported.append("builtin_initializer_list")
-        if self.disable_warnings is not None and self.disable_warnings:
-            unsupported.append("disable_warnings")
-        if self.restrict is not None and self.restrict:
-            unsupported.append("restrict")
-        if self.device_as_default_execution_space is not None and self.device_as_default_execution_space:
-            unsupported.append("device_as_default_execution_space")
-        if self.device_int128 is not None and self.device_int128:
-            unsupported.append("device_int128")
-        if self.optimization_info is not None:
-            unsupported.append("optimization_info")
-        if self.no_display_error_number is not None and self.no_display_error_number:
-            unsupported.append("no_display_error_number")
-        if self.diag_error is not None:
-            unsupported.append("diag_error")
-        if self.diag_suppress is not None:
-            unsupported.append("diag_suppress")
-        if self.diag_warn is not None:
-            unsupported.append("diag_warn")
-        if self.brief_diagnostics is not None:
-            unsupported.append("brief_diagnostics")
-        if self.time is not None:
-            unsupported.append("time")
-        if self.split_compile is not None:
-            unsupported.append("split_compile")
-        if self.fdevice_syntax_only is not None and self.fdevice_syntax_only:
-            unsupported.append("fdevice_syntax_only")
-        if self.minimal is not None and self.minimal:
-            unsupported.append("minimal")
-        if self.numba_debug is not None and self.numba_debug:
-            unsupported.append("numba_debug")
-        if unsupported:
-            raise CUDAError(f"The following options are not supported by NVVM backend: {', '.join(unsupported)}")
-
-        if as_bytes:
-            return [o.encode() for o in options]
-        else:
-            return options
-
-    def as_bytes(self, backend: str) -> list[bytes]:
-        """Convert program options to bytes format for the specified backend.
-
-        This method transforms the program options into a format suitable for the
-        specified compiler backend. Different backends may use different option names
-        and formats even for the same conceptual options.
-
-        Parameters
-        ----------
-        backend : str
-            The compiler backend to prepare options for. Must be either "nvrtc" or "nvvm".
-
-        Returns
-        -------
-        list[bytes]
-            List of option strings encoded as bytes.
-
-        Raises
-        ------
-        ValueError
-            If an unknown backend is specified.
-        CUDAError
-            If an option incompatible with the specified backend is set.
-
-        Examples
-        --------
-        >>> options = ProgramOptions(arch="sm_80", debug=True)
-        >>> nvrtc_options = options.as_bytes("nvrtc")
-        """
-        backend = backend.lower()
-        if backend == "nvrtc":
-            return self._prepare_nvrtc_options()
-        elif backend == "nvvm":
-            return self._prepare_nvvm_options(as_bytes=True)
-        else:
-            raise ValueError(f"Unknown backend '{backend}'. Must be one of: 'nvrtc', 'nvvm'")
-
-    def __repr__(self):
-        return f"ProgramOptions(name={self.name!r}, arch={self.arch!r})"
-
-
-ProgramHandleT = Union["cuda.bindings.nvrtc.nvrtcProgram", LinkerHandleT]
-
-
-class Program:
-    """Represent a compilation machinery to process programs into
-    :obj:`~_module.ObjectCode`.
-
-    This object provides a unified interface to multiple underlying
-    compiler libraries. Compilation support is enabled for a wide
-    range of code types and compilation types.
-
-    Parameters
-    ----------
-    code : Any
-        String of the CUDA Runtime Compilation program.
-    code_type : Any
-        String of the code type. Currently ``"ptx"``, ``"c++"``, and ``"nvvm"`` are supported.
-    options : ProgramOptions, optional
-        A ProgramOptions object to customize the compilation process.
-        See :obj:`ProgramOptions` for more information.
-    """
-
-    class _MembersNeededForFinalize:
-        __slots__ = "handle", "backend"
-
-        def __init__(self, program_obj, handle, backend):
-            self.handle = handle
-            self.backend = backend
-            weakref.finalize(program_obj, self.close)
-
-        def close(self):
-            if self.handle is not None:
-                if self.backend == "NVRTC":
-                    handle_return(nvrtc.nvrtcDestroyProgram(self.handle))
-                elif self.backend == "NVVM":
-                    nvvm = _get_nvvm_module()
-                    nvvm.destroy_program(self.handle)
-                self.handle = None
-
-    __slots__ = ("__weakref__", "_mnff", "_backend", "_linker", "_options", "_module_count")
-
-    def __init__(self, code, code_type, options: ProgramOptions = None):
-        self._mnff = Program._MembersNeededForFinalize(self, None, None)
-
-        self._options = options = check_or_create_options(ProgramOptions, options, "Program options")
-        code_type = code_type.lower()
-        self._module_count = 0
-
-        if code_type == "c++":
-            assert_type(code, str)
-            # TODO: support pre-loaded headers & include names
-            # TODO: allow tuples once NVIDIA/cuda-python#72 is resolved
-            if options.extra_sources is not None:
-                raise ValueError("extra_sources is not supported by the NVRTC backend (C++ code_type)")
-
-
-            self._mnff.handle = handle_return(nvrtc.nvrtcCreateProgram(code.encode(), options._name, 0, [], []))
-            self._mnff.backend = "NVRTC"
-            self._backend = "NVRTC"
-            self._linker = None
-
-        elif code_type == "ptx":
-            assert_type(code, str)
-            if options.extra_sources is not None:
-                raise ValueError("extra_sources is not supported by the PTX backend.")
-            self._linker = Linker(
-                ObjectCode._init(code.encode(), code_type), options=self._translate_program_options(options)
-            )
-            self._backend = self._linker.backend
-
-        elif code_type == "nvvm":
-            if isinstance(code, str):
-                code = code.encode("utf-8")
-            elif not isinstance(code, (bytes, bytearray)):
-                raise TypeError("NVVM IR code must be provided as str, bytes, or bytearray")
-
-            nvvm = _get_nvvm_module()
-            self._mnff.handle = nvvm.create_program()
-            self._mnff.backend = "NVVM"
-            nvvm.add_module_to_program(self._mnff.handle, code, len(code), options._name.decode())
-            self._module_count = 1
-            # Add extra modules if provided
-            if options.extra_sources is not None:
-                if not is_sequence(options.extra_sources):
-                    raise TypeError(
-                        "extra_modules must be a sequence of 2-tuples:((name1, source1), (name2, source2), ...)"
-                    )
-                for i, module in enumerate(options.extra_sources):
-                    if not isinstance(module, tuple) or len(module) != 2:
-                        raise TypeError(
-                            f"Each extra module must be a 2-tuple (name, source)"
-                            f", got {type(module).__name__} at index {i}"
-                        )
-
-                    module_name, module_source = module
-
-                    if not isinstance(module_name, str):
-                        raise TypeError(f"Module name at index {i} must be a string,got {type(module_name).__name__}")
-
-                    if isinstance(module_source, str):
-                        # Textual LLVM IR - encode to UTF-8 bytes
-                        module_source = module_source.encode("utf-8")
-                    elif not isinstance(module_source, (bytes, bytearray)):
-                        raise TypeError(
-                            f"Module source at index {i} must be str (textual LLVM IR), bytes (textual LLVM IR or bitcode), "
-                            f"or bytearray, got {type(module_source).__name__}"
-                        )
-
-                    if len(module_source) == 0:
-                        raise ValueError(f"Module source for '{module_name}' (index {i}) cannot be empty")
-
-                    nvvm.add_module_to_program(self._mnff.handle, module_source, len(module_source), module_name)
-                    self._module_count += 1
-                    
-            self._use_libdevice = options.use_libdevice
-            self._backend = "NVVM"
-            self._linker = None
-
-        else:
-            supported_code_types = ("c++", "ptx", "nvvm")
-            assert code_type not in supported_code_types, f"{code_type=}"
-            raise RuntimeError(f"Unsupported {code_type=} ({supported_code_types=})")
-
-    def _translate_program_options(self, options: ProgramOptions) -> LinkerOptions:
-        return LinkerOptions(
-            name=options.name,
-            arch=options.arch,
-            max_register_count=options.max_register_count,
-            time=options.time,
-            link_time_optimization=options.link_time_optimization,
-            debug=options.debug,
-            lineinfo=options.lineinfo,
-            ftz=options.ftz,
-            prec_div=options.prec_div,
-            prec_sqrt=options.prec_sqrt,
-            fma=options.fma,
-            split_compile=options.split_compile,
-            ptxas_options=options.ptxas_options,
-            no_cache=options.no_cache,
-        )
-
-    def close(self):
-        """Destroy this program."""
-        if self._linker:
-            self._linker.close()
-        self._mnff.close()
-
-    @staticmethod
-    def _can_load_generated_ptx():
-        driver_ver = handle_return(driver.cuDriverGetVersion())
-        nvrtc_major, nvrtc_minor = handle_return(nvrtc.nvrtcVersion())
-        return nvrtc_major * 1000 + nvrtc_minor * 10 <= driver_ver
-
-    def compile(self, target_type, name_expressions=(), logs=None):
-        """Compile the program with a specific compilation type.
-
-        Parameters
-        ----------
-        target_type : Any
-            String of the targeted compilation type.
-            Supported options are "ptx", "cubin" and "ltoir".
-        name_expressions : Union[list, tuple], optional
-            List of explicit name expressions to become accessible.
-            (Default to no expressions)
-        logs : Any, optional
-            Object with a write method to receive the logs generated
-            from compilation.
-            (Default to no logs)
-
-        Returns
-        -------
-        :obj:`~_module.ObjectCode`
-            Newly created code object.
-
-        """
-        supported_target_types = ("ptx", "cubin", "ltoir")
-        if target_type not in supported_target_types:
-            raise ValueError(f'Unsupported target_type="{target_type}" ({supported_target_types=})')
-
-        if self._backend == "NVRTC":
-            if target_type == "ptx" and not self._can_load_generated_ptx():
-                warn(
-                    "The CUDA driver version is older than the backend version. "
-                    "The generated ptx will not be loadable by the current driver.",
-                    stacklevel=1,
-                    category=RuntimeWarning,
-                )
-            if name_expressions:
-                for n in name_expressions:
-                    handle_return(
-                        nvrtc.nvrtcAddNameExpression(self._mnff.handle, n.encode()),
-                        handle=self._mnff.handle,
-                    )
-            options = self._options.as_bytes("nvrtc")
-            handle_return(
-                nvrtc.nvrtcCompileProgram(self._mnff.handle, len(options), options),
-                handle=self._mnff.handle,
-            )
-
-            size_func = getattr(nvrtc, f"nvrtcGet{target_type.upper()}Size")
-            comp_func = getattr(nvrtc, f"nvrtcGet{target_type.upper()}")
-            size = handle_return(size_func(self._mnff.handle), handle=self._mnff.handle)
-            data = b" " * size
-            handle_return(comp_func(self._mnff.handle, data), handle=self._mnff.handle)
-
-            symbol_mapping = {}
-            if name_expressions:
-                for n in name_expressions:
-                    symbol_mapping[n] = handle_return(
-                        nvrtc.nvrtcGetLoweredName(self._mnff.handle, n.encode()), handle=self._mnff.handle
-                    )
-
-            if logs is not None:
-                logsize = handle_return(nvrtc.nvrtcGetProgramLogSize(self._mnff.handle), handle=self._mnff.handle)
-                if logsize > 1:
-                    log = b" " * logsize
-                    handle_return(nvrtc.nvrtcGetProgramLog(self._mnff.handle, log), handle=self._mnff.handle)
-                    logs.write(log.decode("utf-8", errors="backslashreplace"))
-
-            return ObjectCode._init(data, target_type, symbol_mapping=symbol_mapping, name=self._options.name)
-
-        elif self._backend == "NVVM":
-            if target_type not in ("ptx", "ltoir"):
-                raise ValueError(f'NVVM backend only supports target_type="ptx", "ltoir", got "{target_type}"')
-
-            # TODO: flip to True when NVIDIA/cuda-python#1354 is resolved and CUDA 12 is dropped
-            nvvm_options = self._options._prepare_nvvm_options(as_bytes=False)
-            if target_type == "ltoir" and "-gen-lto" not in nvvm_options:
-                nvvm_options.append("-gen-lto")
-            nvvm = _get_nvvm_module()
-            with _nvvm_exception_manager(self):
-                nvvm.verify_program(self._mnff.handle, len(nvvm_options), nvvm_options)
-                # libdevice compilation 
-                if getattr(self, '_use_libdevice', False):
-                    libdevice_path = _find_libdevice_path()
-                    if libdevice_path is None:
-                        raise RuntimeError(
-                            "use_libdevice=True but could not find libdevice.10.bc. "
-                            "Ensure CUDA toolkit is installed."
-                        )
-                    with open(libdevice_path, "rb") as f:
-                        libdevice_bc = f.read()
-                    # libdevice for numba-cuda
-                    nvvm.lazy_add_module_to_program(self._mnff.handle, libdevice_bc, len(libdevice_bc), None)
-                
-                nvvm.compile_program(self._mnff.handle, len(nvvm_options), nvvm_options)
-
-            size = nvvm.get_compiled_result_size(self._mnff.handle)
-            data = bytearray(size)
-            nvvm.get_compiled_result(self._mnff.handle, data)
-
-            if logs is not None:
-                logsize = nvvm.get_program_log_size(self._mnff.handle)
-                if logsize > 1:
-                    log = bytearray(logsize)
-                    nvvm.get_program_log(self._mnff.handle, log)
-                    logs.write(log.decode("utf-8", errors="backslashreplace"))
-
-            return ObjectCode._init(data, target_type, name=self._options.name)
-
-        supported_backends = ("nvJitLink", "driver")
-        if self._backend not in supported_backends:
-            raise ValueError(f'Unsupported backend="{self._backend}" ({supported_backends=})')
-        return self._linker.link(target_type)
-
-    @property
-    def backend(self) -> str:
-        """Return this Program instance's underlying backend."""
-        return self._backend
-
-    @property
-    def handle(self) -> ProgramHandleT:
-        """Return the underlying handle object.
-
-        .. note::
-
-           The type of the returned object depends on the backend.
-
-        .. caution::
-
-            This handle is a Python object. To get the memory address of the underlying C
-            handle, call ``int(Program.handle)``.
-        """
-        return self._mnff.handle
diff --git a/cuda_core/cuda/core/_program.pyx b/cuda_core/cuda/core/_program.pyx
new file mode 100644
index 0000000000..7f8f83d06b
--- /dev/null
+++ b/cuda_core/cuda/core/_program.pyx
@@ -0,0 +1,1038 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+"""Compilation machinery for CUDA programs.
+
+This module provides :class:`Program` for compiling source code into
+:class:`~cuda.core.ObjectCode`, with :class:`ProgramOptions` for configuration.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from warnings import warn
+
+from cuda.bindings import driver, nvrtc
+
+from libcpp.vector cimport vector
+
+from ._resource_handles cimport (
+    as_cu,
+    as_py,
+    create_nvrtc_program_handle,
+    create_nvvm_program_handle,
+)
+from cuda.bindings cimport cynvrtc, cynvvm
+from cuda.core._utils.cuda_utils cimport HANDLE_RETURN_NVRTC, HANDLE_RETURN_NVVM
+from cuda.core._device import Device
+from cuda.core._linker import Linker, LinkerHandleT, LinkerOptions
+from cuda.core._module import ObjectCode
+from cuda.core._utils.clear_error_support import assert_type
+from cuda.core._utils.cuda_utils import (
+    CUDAError,
+    _handle_boolean_option,
+    check_or_create_options,
+    get_binding_version,
+    handle_return,
+    is_nested_sequence,
+    is_sequence,
+)
+
+__all__ = ["Program", "ProgramOptions"]
+
+ProgramHandleT = nvrtc.nvrtcProgram | int | LinkerHandleT
+"""Type alias for program handle types across different backends.
+
+The ``int`` type covers NVVM handles, which don't have a wrapper class.
+"""
+
+
+# =============================================================================
+# Principal Class
+# =============================================================================
+
+
+cdef class Program:
+    """Represent a compilation machinery to process programs into
+    :class:`~cuda.core.ObjectCode`.
+
+    This object provides a unified interface to multiple underlying
+    compiler libraries. Compilation support is enabled for a wide
+    range of code types and compilation types.
+
+    Parameters
+    ----------
+    code : str | bytes | bytearray
+        The source code to compile. For C++ and PTX, must be a string.
+        For NVVM IR, can be str, bytes, or bytearray.
+    code_type : str
+        The type of source code. Must be one of ``"c++"``, ``"ptx"``, or ``"nvvm"``.
+    options : :class:`ProgramOptions`, optional
+        Options to customize the compilation process.
+    """
+
+    def __init__(self, code: str | bytes | bytearray, code_type: str, options: ProgramOptions | None = None):
+        Program_init(self, code, code_type, options)
+
+    def close(self):
+        """Destroy this program."""
+        if self._linker:
+            self._linker.close()
+        # Reset handles - the C++ shared_ptr destructor handles cleanup
+        self._h_nvrtc.reset()
+        self._h_nvvm.reset()
+
+    def compile(
+        self, target_type: str, name_expressions: tuple | list = (), logs = None
+    ) -> ObjectCode:
+        """Compile the program to the specified target type.
+
+        Parameters
+        ----------
+        target_type : str
+            The compilation target. Must be one of ``"ptx"``, ``"cubin"``, or ``"ltoir"``.
+        name_expressions : tuple | list, optional
+            Sequence of name expressions to make accessible in the compiled code.
+            Used for template instantiation and similar cases.
+        logs : object, optional
+            Object with a ``write`` method to receive compilation logs.
+
+        Returns
+        -------
+        :class:`~cuda.core.ObjectCode`
+            The compiled object code.
+        """
+        return Program_compile(self, target_type, name_expressions, logs)
+
+    @property
+    def backend(self) -> str:
+        """Return this Program instance's underlying backend."""
+        return self._backend
+
+    @property
+    def handle(self) -> ProgramHandleT:
+        """Return the underlying handle object.
+
+        .. note::
+
+           The type of the returned object depends on the backend.
+
+        .. caution::
+
+            This handle is a Python object. To get the memory address of the underlying C
+            handle, call ``int(Program.handle)``.
+        """
+        if self._backend == "NVRTC":
+            return as_py(self._h_nvrtc)
+        elif self._backend == "NVVM":
+            return as_py(self._h_nvvm)  # returns int (NVVM uses raw integers)
+        else:
+            return self._linker.handle
+
+    def __repr__(self) -> str:
+        return f"<Program backend='{self._backend}'>"
+
+
+# =============================================================================
+# Other Public Classes
+# =============================================================================
+
+
+@dataclass
+class ProgramOptions:
+    """Customizable options for configuring :class:`Program`.
+
+    Attributes
+    ----------
+    name : str, optional
+        Name of the program. If the compilation succeeds, the name is passed down to the generated `ObjectCode`.
+    arch : str, optional
+        Pass the SM architecture value, such as ``sm_<CC>`` (for generating CUBIN) or
+        ``compute_<CC>`` (for generating PTX). If not provided, the current device's architecture
+        will be used.
+    relocatable_device_code : bool, optional
+        Enable (disable) the generation of relocatable device code.
+        Default: False
+    extensible_whole_program : bool, optional
+        Do extensible whole program compilation of device code.
+        Default: False
+    debug : bool, optional
+        Generate debug information. If --dopt is not specified, then turns off all optimizations.
+        Default: False
+    lineinfo: bool, optional
+        Generate line-number information.
+        Default: False
+    device_code_optimize : bool, optional
+        Enable device code optimization. When specified along with '-G', enables limited debug information generation
+        for optimized device code.
+        Default: None
+    ptxas_options : Union[str, list[str]], optional
+        Specify one or more options directly to ptxas, the PTX optimizing assembler. Options should be strings.
+        For example ["-v", "-O2"].
+        Default: None
+    max_register_count : int, optional
+        Specify the maximum amount of registers that GPU functions can use.
+        Default: None
+    ftz : bool, optional
+        When performing single-precision floating-point operations, flush denormal values to zero or preserve denormal
+        values.
+        Default: False
+    prec_sqrt : bool, optional
+        For single-precision floating-point square root, use IEEE round-to-nearest mode or use a faster approximation.
+        Default: True
+    prec_div : bool, optional
+        For single-precision floating-point division and reciprocals, use IEEE round-to-nearest mode or use a faster
+        approximation.
+        Default: True
+    fma : bool, optional
+        Enables (disables) the contraction of floating-point multiplies and adds/subtracts into floating-point
+        multiply-add operations.
+        Default: True
+    use_fast_math : bool, optional
+        Make use of fast math operations.
+        Default: False
+    extra_device_vectorization : bool, optional
+        Enables more aggressive device code vectorization in the NVVM optimizer.
+        Default: False
+    link_time_optimization : bool, optional
+        Generate intermediate code for later link-time optimization.
+        Default: False
+    gen_opt_lto : bool, optional
+        Run the optimizer passes before generating the LTO IR.
+        Default: False
+    define_macro : Union[str, tuple[str, str], list[Union[str, tuple[str, str]]]], optional
+        Predefine a macro. Can be either a string, in which case that macro will be set to 1, a 2 element tuple of
+        strings, in which case the first element is defined as the second, or a list of strings or tuples.
+        Default: None
+    undefine_macro : Union[str, list[str]], optional
+        Cancel any previous definition of a macro, or list of macros.
+        Default: None
+    include_path : Union[str, list[str]], optional
+        Add the directory or directories to the list of directories to be searched for headers.
+        Default: None
+    pre_include : Union[str, list[str]], optional
+        Preinclude one or more headers during preprocessing. Can be either a string or a list of strings.
+        Default: None
+    no_source_include : bool, optional
+        Disable the default behavior of adding the directory of each input source to the include path.
+        Default: False
+    std : str, optional
+        Set language dialect to C++03, C++11, C++14, C++17 or C++20.
+        Default: c++17
+    builtin_move_forward : bool, optional
+        Provide builtin definitions of std::move and std::forward.
+        Default: True
+    builtin_initializer_list : bool, optional
+        Provide builtin definitions of std::initializer_list class and member functions.
+        Default: True
+    disable_warnings : bool, optional
+        Inhibit all warning messages.
+        Default: False
+    restrict : bool, optional
+        Programmer assertion that all kernel pointer parameters are restrict pointers.
+        Default: False
+    device_as_default_execution_space : bool, optional
+        Treat entities with no execution space annotation as __device__ entities.
+        Default: False
+    device_int128 : bool, optional
+        Allow the __int128 type in device code.
+        Default: False
+    optimization_info : str, optional
+        Provide optimization reports for the specified kind of optimization.
+        Default: None
+    no_display_error_number : bool, optional
+        Disable the display of a diagnostic number for warning messages.
+        Default: False
+    diag_error : Union[int, list[int]], optional
+        Emit error for a specified diagnostic message number or comma separated list of numbers.
+        Default: None
+    diag_suppress : Union[int, list[int]], optional
+        Suppress a specified diagnostic message number or comma separated list of numbers.
+        Default: None
+    diag_warn : Union[int, list[int]], optional
+        Emit warning for a specified diagnostic message number or comma separated lis of numbers.
+        Default: None
+    brief_diagnostics : bool, optional
+        Disable or enable showing source line and column info in a diagnostic.
+        Default: False
+    time : str, optional
+        Generate a CSV table with the time taken by each compilation phase.
+        Default: None
+    split_compile : int, optional
+        Perform compiler optimizations in parallel.
+        Default: 1
+    fdevice_syntax_only : bool, optional
+        Ends device compilation after front-end syntax checking.
+        Default: False
+    minimal : bool, optional
+        Omit certain language features to reduce compile time for small programs.
+        Default: False
+    no_cache : bool, optional
+        Disable compiler caching.
+        Default: False
+    fdevice_time_trace : str, optional
+        Generate time trace JSON for profiling compilation (NVRTC only).
+        Default: None
+    device_float128 : bool, optional
+        Allow __float128 type in device code (NVRTC only).
+        Default: False
+    frandom_seed : str, optional
+        Set random seed for randomized optimizations (NVRTC only).
+        Default: None
+    ofast_compile : str, optional
+        Fast compilation mode: "0", "min", "mid", or "max" (NVRTC only).
+        Default: None
+    pch : bool, optional
+        Use default precompiled header (NVRTC only, CUDA 12.8+).
+        Default: False
+    create_pch : str, optional
+        Create precompiled header file (NVRTC only, CUDA 12.8+).
+        Default: None
+    use_pch : str, optional
+        Use specific precompiled header file (NVRTC only, CUDA 12.8+).
+        Default: None
+    pch_dir : str, optional
+        PCH directory location (NVRTC only, CUDA 12.8+).
+        Default: None
+    pch_verbose : bool, optional
+        Verbose PCH output (NVRTC only, CUDA 12.8+).
+        Default: False
+    pch_messages : bool, optional
+        Control PCH diagnostic messages (NVRTC only, CUDA 12.8+).
+        Default: False
+    instantiate_templates_in_pch : bool, optional
+        Control template instantiation in PCH (NVRTC only, CUDA 12.8+).
+        Default: False
+    """
+
+    name: str | None = "default_program"
+    arch: str | None = None
+    relocatable_device_code: bool | None = None
+    extensible_whole_program: bool | None = None
+    debug: bool | None = None
+    lineinfo: bool | None = None
+    device_code_optimize: bool | None = None
+    ptxas_options: str | list[str] | tuple[str] | None = None
+    max_register_count: int | None = None
+    ftz: bool | None = None
+    prec_sqrt: bool | None = None
+    prec_div: bool | None = None
+    fma: bool | None = None
+    use_fast_math: bool | None = None
+    extra_device_vectorization: bool | None = None
+    link_time_optimization: bool | None = None
+    gen_opt_lto: bool | None = None
+    define_macro: str | tuple[str, str] | list[str | tuple[str, str]] | tuple[str | tuple[str, str], ...] | None = None
+    undefine_macro: str | list[str] | tuple[str] | None = None
+    include_path: str | list[str] | tuple[str] | None = None
+    pre_include: str | list[str] | tuple[str] | None = None
+    no_source_include: bool | None = None
+    std: str | None = None
+    builtin_move_forward: bool | None = None
+    builtin_initializer_list: bool | None = None
+    disable_warnings: bool | None = None
+    restrict: bool | None = None
+    device_as_default_execution_space: bool | None = None
+    device_int128: bool | None = None
+    optimization_info: str | None = None
+    no_display_error_number: bool | None = None
+    diag_error: int | list[int] | tuple[int] | None = None
+    diag_suppress: int | list[int] | tuple[int] | None = None
+    diag_warn: int | list[int] | tuple[int] | None = None
+    brief_diagnostics: bool | None = None
+    time: str | None = None
+    split_compile: int | None = None
+    fdevice_syntax_only: bool | None = None
+    minimal: bool | None = None
+    no_cache: bool | None = None
+    fdevice_time_trace: str | None = None
+    device_float128: bool | None = None
+    frandom_seed: str | None = None
+    ofast_compile: str | None = None
+    pch: bool | None = None
+    create_pch: str | None = None
+    use_pch: str | None = None
+    pch_dir: str | None = None
+    pch_verbose: bool | None = None
+    pch_messages: bool | None = None
+    instantiate_templates_in_pch: bool | None = None
+    extra_sources: list[tuple[str, str | bytes | bytearray]] | tuple[tuple[str, str | bytes | bytearray], ...] | None = None
+    use_libdevice: bool | None = None  # For libdevice execution
+    numba_debug: bool | None = None  # Custom option for Numba debugging
+
+    def __post_init__(self):
+        self._name = self.name.encode()
+        # Set arch to default if not provided
+        if self.arch is None:
+            self.arch = f"sm_{Device().arch}"
+
+    def _prepare_nvrtc_options(self) -> list[bytes]:
+        return _prepare_nvrtc_options_impl(self)
+
+    def _prepare_nvvm_options(self, as_bytes: bool = True) -> list[bytes] | list[str]:
+        return _prepare_nvvm_options_impl(self, as_bytes)
+
+    def as_bytes(self, backend: str, target_type: str | None = None) -> list[bytes]:
+        """Convert program options to bytes format for the specified backend.
+
+        This method transforms the program options into a format suitable for the
+        specified compiler backend. Different backends may use different option names
+        and formats even for the same conceptual options.
+
+        Parameters
+        ----------
+        backend : str
+            The compiler backend to prepare options for. Must be either "nvrtc" or "nvvm".
+        target_type : str, optional
+            The compilation target type (e.g., "ptx", "cubin", "ltoir"). Some backends
+            require additional options based on the target type.
+
+        Returns
+        -------
+        list[bytes]
+            List of option strings encoded as bytes.
+
+        Raises
+        ------
+        ValueError
+            If an unknown backend is specified.
+        CUDAError
+            If an option incompatible with the specified backend is set.
+
+        Examples
+        --------
+        >>> options = ProgramOptions(arch="sm_80", debug=True)
+        >>> nvrtc_options = options.as_bytes("nvrtc")
+        """
+        backend = backend.lower()
+        if backend == "nvrtc":
+            return self._prepare_nvrtc_options()
+        elif backend == "nvvm":
+            options = self._prepare_nvvm_options(as_bytes=True)
+            if target_type == "ltoir" and b"-gen-lto" not in options:
+                options.append(b"-gen-lto")
+            return options
+        else:
+            raise ValueError(f"Unknown backend '{backend}'. Must be one of: 'nvrtc', 'nvvm'")
+
+    def __repr__(self):
+        return f"ProgramOptions(name={self.name!r}, arch={self.arch!r})"
+
+
+# =============================================================================
+# Private Classes and Helper Functions
+# =============================================================================
+
+# Module-level state for NVVM lazy loading
+cdef object_nvvm_module = None
+cdef bint _nvvm_import_attempted = False
+
+
+def _get_nvvm_module():
+    """Get the NVVM module, importing it lazily with availability checks."""
+    global _nvvm_module, _nvvm_import_attempted
+
+    if _nvvm_import_attempted:
+        if _nvvm_module is None:
+            raise RuntimeError("NVVM module is not available (previous import attempt failed)")
+        return _nvvm_module
+
+    _nvvm_import_attempted = True
+
+    try:
+        version = get_binding_version()
+        if version < (12, 9):
+            raise RuntimeError(
+                f"NVVM bindings require cuda-bindings >= 12.9.0, but found {version[0]}.{version[1]}.x. "
+                "Please update cuda-bindings to use NVVM features."
+            )
+
+        from cuda.bindings import nvvm
+        from cuda.bindings._internal.nvvm import _inspect_function_pointer
+
+        if _inspect_function_pointer("__nvvmCreateProgram") == 0:
+            raise RuntimeError("NVVM library (libnvvm) is not available in this Python environment. ")
+
+        _nvvm_module = nvvm
+        return _nvvm_module
+
+    except RuntimeError as e:
+        _nvvm_module = None
+        raise e
+
+def _find_libdevice_path():
+    """Find libdevice*.bc for NVVM compilation using cuda.pathfinder."""
+    from cuda.pathfinder import get_libdevice_path
+    return get_libdevice_path()
+
+
+cdef inline bint _process_define_macro_inner(list options, object macro) except? -1:
+    """Process a single define macro, returning True if successful."""
+    if isinstance(macro, str):
+        options.append(f"--define-macro={macro}")
+        return True
+    if isinstance(macro, tuple):
+        if len(macro) != 2 or any(not isinstance(val, str) for val in macro):
+            raise RuntimeError(f"Expected define_macro tuple[str, str], got {macro}")
+        options.append(f"--define-macro={macro[0]}={macro[1]}")
+        return True
+    return False
+
+
+cdef inline void _process_define_macro(list options, object macro) except *:
+    """Process define_macro option which can be str, tuple, or list thereof."""
+    union_type = "Union[str, tuple[str, str]]"
+    if _process_define_macro_inner(options, macro):
+        return
+    if is_nested_sequence(macro):
+        for seq_macro in macro:
+            if not _process_define_macro_inner(options, seq_macro):
+                raise RuntimeError(f"Expected define_macro {union_type}, got {seq_macro}")
+        return
+    raise RuntimeError(f"Expected define_macro {union_type}, list[{union_type}], got {macro}")
+
+
+cpdef bint _can_load_generated_ptx() except? -1:
+    """Check if the driver can load PTX generated by the current NVRTC version."""
+    driver_ver = handle_return(driver.cuDriverGetVersion())
+    nvrtc_major, nvrtc_minor = handle_return(nvrtc.nvrtcVersion())
+    return nvrtc_major * 1000 + nvrtc_minor * 10 <= driver_ver
+
+
+cdef inline object _translate_program_options(object options):
+    """Translate ProgramOptions to LinkerOptions for PTX compilation."""
+    return LinkerOptions(
+        name=options.name,
+        arch=options.arch,
+        max_register_count=options.max_register_count,
+        time=options.time,
+        link_time_optimization=options.link_time_optimization,
+        debug=options.debug,
+        lineinfo=options.lineinfo,
+        ftz=options.ftz,
+        prec_div=options.prec_div,
+        prec_sqrt=options.prec_sqrt,
+        fma=options.fma,
+        split_compile=options.split_compile,
+        ptxas_options=options.ptxas_options,
+        no_cache=options.no_cache,
+    )
+
+
+cdef inline int Program_init(Program self, object code, str code_type, object options) except -1:
+    """Initialize a Program instance."""
+    cdef cynvrtc.nvrtcProgram nvrtc_prog
+    cdef cynvvm.nvvmProgram nvvm_prog
+    cdef bytes code_bytes
+    cdef const char* code_ptr
+    cdef const char* name_ptr
+    cdef size_t code_len
+    cdef bytes module_bytes
+    cdef const char* module_ptr
+    cdef size_t module_len
+
+    self._options = options = check_or_create_options(ProgramOptions, options, "Program options")
+    code_type = code_type.lower()
+    self._module_count = 0
+    self._use_libdevice = False
+
+    if code_type == "c++":
+        assert_type(code, str)
+        if options.extra_sources is not None:
+            raise ValueError("extra_sources is not supported by the NVRTC backend (C++ code_type)")
+        # TODO: support pre-loaded headers & include names
+        code_bytes = code.encode()
+        code_ptr = <const char*>code_bytes
+        name_ptr = <const char*>options._name
+
+        with nogil:
+            HANDLE_RETURN_NVRTC(NULL, cynvrtc.nvrtcCreateProgram(
+                &nvrtc_prog, code_ptr, name_ptr, 0, NULL, NULL))
+        self._h_nvrtc = create_nvrtc_program_handle(nvrtc_prog)
+        self._backend = "NVRTC"
+        self._linker = None
+
+    elif code_type == "ptx":
+        assert_type(code, str)
+        if options.extra_sources is not None:
+            raise ValueError("extra_sources is not supported by the PTX backend.")
+        self._linker = Linker(
+            ObjectCode._init(code.encode(), code_type), options=_translate_program_options(options)
+        )
+        self._backend = self._linker.backend
+
+    elif code_type == "nvvm":
+        _get_nvvm_module()  # Validate NVVM availability
+        if isinstance(code, str):
+            code = code.encode("utf-8")
+        elif not isinstance(code, (bytes, bytearray)):
+            raise TypeError("NVVM IR code must be provided as str, bytes, or bytearray")
+
+        code_ptr = <const char*>(<bytes>code)
+        name_ptr = <const char*>options._name
+        code_len = len(code)
+
+        with nogil:
+            HANDLE_RETURN_NVVM(NULL, cynvvm.nvvmCreateProgram(&nvvm_prog))
+        self._h_nvvm = create_nvvm_program_handle(nvvm_prog)  # RAII from here
+        with nogil:
+            HANDLE_RETURN_NVVM(nvvm_prog, cynvvm.nvvmAddModuleToProgram(nvvm_prog, code_ptr, code_len, name_ptr))
+        self._module_count = 1
+
+        # Add extra modules if provided
+        if options.extra_sources is not None:
+            if not is_sequence(options.extra_sources):
+                raise TypeError(
+                    "extra_sources must be a sequence of 2-tuples: ((name1, source1), (name2, source2), ...)"
+                )
+            for i, module in enumerate(options.extra_sources):
+                if not isinstance(module, tuple) or len(module) != 2:
+                    raise TypeError(
+                        f"Each extra module must be a 2-tuple (name, source)"
+                        f", got {type(module).__name__} at index {i}"
+                    )
+
+                module_name, module_source = module
+
+                if not isinstance(module_name, str):
+                    raise TypeError(f"Module name at index {i} must be a string, got {type(module_name).__name__}")
+
+                if isinstance(module_source, str):
+                    # Textual LLVM IR - encode to UTF-8 bytes
+                    module_source = module_source.encode("utf-8")
+                elif not isinstance(module_source, (bytes, bytearray)):
+                    raise TypeError(
+                        f"Module source at index {i} must be str (textual LLVM IR), bytes (textual LLVM IR or bitcode), "
+                        f"or bytearray, got {type(module_source).__name__}"
+                    )
+
+                if len(module_source) == 0:
+                    raise ValueError(f"Module source for '{module_name}' (index {i}) cannot be empty")
+
+                # Add the module using NVVM API
+                module_bytes = module_source if isinstance(module_source, bytes) else bytes(module_source)
+                module_ptr = <const char*>module_bytes
+                module_len = len(module_bytes)
+                module_name_bytes = module_name.encode()
+                module_name_ptr = <const char*>module_name_bytes
+
+                with nogil:
+                    HANDLE_RETURN_NVVM(nvvm_prog, cynvvm.nvvmAddModuleToProgram(
+                        nvvm_prog, module_ptr, module_len, module_name_ptr))
+                self._module_count += 1
+
+        # Store use_libdevice flag
+        if options.use_libdevice:
+            self._use_libdevice = True
+        
+        self._backend = "NVVM"
+        self._linker = None
+
+    else:
+        supported_code_types = ("c++", "ptx", "nvvm")
+        assert code_type not in supported_code_types, f"{code_type=}"
+        raise RuntimeError(f"Unsupported {code_type=} ({supported_code_types=})")
+
+    return 0
+
+
+
+cdef object Program_compile_nvrtc(Program self, str target_type, object name_expressions, object logs):
+    """Compile using NVRTC backend and return ObjectCode."""
+    cdef cynvrtc.nvrtcProgram prog = as_cu(self._h_nvrtc)
+    cdef size_t output_size = 0
+    cdef size_t logsize = 0
+    cdef vector[const char*] options_vec
+    cdef char* data_ptr = NULL
+    cdef bytes name_bytes
+    cdef const char* name_ptr = NULL
+    cdef const char* lowered_name = NULL
+    cdef dict symbol_mapping = {}
+
+    # Add name expressions before compilation
+    if name_expressions:
+        for n in name_expressions:
+            name_bytes = n.encode() if isinstance(n, str) else n
+            name_ptr = <const char*>name_bytes
+            HANDLE_RETURN_NVRTC(prog, cynvrtc.nvrtcAddNameExpression(prog, name_ptr))
+
+    # Build options array
+    options_list = self._options.as_bytes("nvrtc", target_type)
+    options_vec.resize(len(options_list))
+    for i in range(len(options_list)):
+        options_vec[i] = <const char*>(<bytes>options_list[i])
+
+    # Compile
+    with nogil:
+        HANDLE_RETURN_NVRTC(prog, cynvrtc.nvrtcCompileProgram(prog, <int>options_vec.size(), options_vec.data()))
+
+    # Get compiled output based on target type
+    if target_type == "ptx":
+        HANDLE_RETURN_NVRTC(prog, cynvrtc.nvrtcGetPTXSize(prog, &output_size))
+        data = bytearray(output_size)
+        data_ptr = <char*>(<bytearray>data)
+        with nogil:
+            HANDLE_RETURN_NVRTC(prog, cynvrtc.nvrtcGetPTX(prog, data_ptr))
+    elif target_type == "cubin":
+        HANDLE_RETURN_NVRTC(prog, cynvrtc.nvrtcGetCUBINSize(prog, &output_size))
+        data = bytearray(output_size)
+        data_ptr = <char*>(<bytearray>data)
+        with nogil:
+            HANDLE_RETURN_NVRTC(prog, cynvrtc.nvrtcGetCUBIN(prog, data_ptr))
+    else:  # ltoir
+        HANDLE_RETURN_NVRTC(prog, cynvrtc.nvrtcGetLTOIRSize(prog, &output_size))
+        data = bytearray(output_size)
+        data_ptr = <char*>(<bytearray>data)
+        with nogil:
+            HANDLE_RETURN_NVRTC(prog, cynvrtc.nvrtcGetLTOIR(prog, data_ptr))
+
+    # Get lowered names after compilation
+    if name_expressions:
+        for n in name_expressions:
+            name_bytes = n.encode() if isinstance(n, str) else n
+            name_ptr = <const char*>name_bytes
+            HANDLE_RETURN_NVRTC(prog, cynvrtc.nvrtcGetLoweredName(prog, name_ptr, &lowered_name))
+            symbol_mapping[n] = lowered_name if lowered_name != NULL else None
+
+    # Get compilation log if requested
+    if logs is not None:
+        HANDLE_RETURN_NVRTC(prog, cynvrtc.nvrtcGetProgramLogSize(prog, &logsize))
+        if logsize > 1:
+            log = bytearray(logsize)
+            data_ptr = <char*>(<bytearray>log)
+            with nogil:
+                HANDLE_RETURN_NVRTC(prog, cynvrtc.nvrtcGetProgramLog(prog, data_ptr))
+            logs.write(log.decode("utf-8", errors="backslashreplace"))
+
+    return ObjectCode._init(bytes(data), target_type, symbol_mapping=symbol_mapping, name=self._options.name)
+
+
+cdef object Program_compile_nvvm(Program self, str target_type, object logs):
+    """Compile using NVVM backend and return ObjectCode."""
+    cdef cynvvm.nvvmProgram prog = as_cu(self._h_nvvm)
+    cdef size_t output_size = 0
+    cdef size_t logsize = 0
+    cdef vector[const char*] options_vec
+    cdef char* data_ptr = NULL
+    cdef bytes libdevice_bytes
+    cdef const char* libdevice_ptr
+    cdef size_t libdevice_len
+
+    options_list = self._options.as_bytes("nvvm", target_type)
+    options_vec.resize(len(options_list))
+    for i in range(len(options_list)):
+        options_vec[i] = <const char*>(<bytes>options_list[i])
+
+    with nogil:
+        HANDLE_RETURN_NVVM(prog, cynvvm.nvvmVerifyProgram(prog, <int>options_vec.size(), options_vec.data()))
+
+    # Load libdevice if requested  - following numba-cuda
+    if self._use_libdevice:
+        libdevice_path = _find_libdevice_path()
+        if libdevice_path is None:
+            raise RuntimeError(
+                "use_libdevice=True but could not find libdevice.10.bc. "
+                "Ensure CUDA toolkit is installed."
+            )
+        with open(libdevice_path, "rb") as f:
+            libdevice_bytes = f.read()
+        libdevice_ptr = <const char*>libdevice_bytes
+        libdevice_len = len(libdevice_bytes)
+        # Use lazy_add_module
+        with nogil:
+            HANDLE_RETURN_NVVM(prog, cynvvm.nvvmLazyAddModuleToProgram(
+                prog, libdevice_ptr, libdevice_len, NULL))
+
+    with nogil:
+        HANDLE_RETURN_NVVM(prog, cynvvm.nvvmCompileProgram(prog, <int>options_vec.size(), options_vec.data()))
+
+    HANDLE_RETURN_NVVM(prog, cynvvm.nvvmGetCompiledResultSize(prog, &output_size))
+    data = bytearray(output_size)
+    data_ptr = <char*>(<bytearray>data)
+    with nogil:
+        HANDLE_RETURN_NVVM(prog, cynvvm.nvvmGetCompiledResult(prog, data_ptr))
+
+    if logs is not None:
+        HANDLE_RETURN_NVVM(prog, cynvvm.nvvmGetProgramLogSize(prog, &logsize))
+        if logsize > 1:
+            log = bytearray(logsize)
+            data_ptr = <char*>(<bytearray>log)
+            with nogil:
+                HANDLE_RETURN_NVVM(prog, cynvvm.nvvmGetProgramLog(prog, data_ptr))
+            logs.write(log.decode("utf-8", errors="backslashreplace"))
+
+    return ObjectCode._init(bytes(data), target_type, name=self._options.name)
+
+# Supported target types per backend
+cdef dict SUPPORTED_TARGETS = {
+    "NVRTC": ("ptx", "cubin", "ltoir"),
+    "NVVM": ("ptx", "ltoir"),
+    "nvJitLink": ("cubin", "ptx"),
+    "driver": ("cubin", "ptx"),
+}
+
+
+cdef object Program_compile(Program self, str target_type, object name_expressions, object logs):
+    """Compile the program to the specified target type."""
+    # Validate target_type for this backend
+    supported = SUPPORTED_TARGETS.get(self._backend)
+    if supported is None:
+        raise ValueError(f'Unknown backend="{self._backend}"')
+    if target_type not in supported:
+        raise ValueError(
+            f'Unsupported target_type="{target_type}" for {self._backend} '
+            f'(supported: {", ".join(repr(t) for t in supported)})'
+        )
+
+    if self._backend == "NVRTC":
+        if target_type == "ptx" and not _can_load_generated_ptx():
+            warn(
+                "The CUDA driver version is older than the backend version. "
+                "The generated ptx will not be loadable by the current driver.",
+                stacklevel=2,
+                category=RuntimeWarning,
+            )
+        return Program_compile_nvrtc(self, target_type, name_expressions, logs)
+
+    elif self._backend == "NVVM":
+        return Program_compile_nvvm(self, target_type, logs)
+
+    else:
+        return self._linker.link(target_type)
+
+
+cdef inline list _prepare_nvrtc_options_impl(object opts):
+    """Build NVRTC-specific compiler options."""
+    options = [f"-arch={opts.arch}"]
+    if opts.relocatable_device_code is not None:
+        options.append(f"--relocatable-device-code={_handle_boolean_option(opts.relocatable_device_code)}")
+    if opts.extensible_whole_program is not None and opts.extensible_whole_program:
+        options.append("--extensible-whole-program")
+    if opts.debug is not None and opts.debug:
+        options.append("--device-debug")
+    if opts.lineinfo is not None and opts.lineinfo:
+        options.append("--generate-line-info")
+    if opts.device_code_optimize is not None and opts.device_code_optimize:
+        options.append("--dopt=on")
+    if opts.ptxas_options is not None:
+        opt_name = "--ptxas-options"
+        if isinstance(opts.ptxas_options, str):
+            options.append(f"{opt_name}={opts.ptxas_options}")
+        elif is_sequence(opts.ptxas_options):
+            for opt_value in opts.ptxas_options:
+                options.append(f"{opt_name}={opt_value}")
+    if opts.max_register_count is not None:
+        options.append(f"--maxrregcount={opts.max_register_count}")
+    if opts.ftz is not None:
+        options.append(f"--ftz={_handle_boolean_option(opts.ftz)}")
+    if opts.prec_sqrt is not None:
+        options.append(f"--prec-sqrt={_handle_boolean_option(opts.prec_sqrt)}")
+    if opts.prec_div is not None:
+        options.append(f"--prec-div={_handle_boolean_option(opts.prec_div)}")
+    if opts.fma is not None:
+        options.append(f"--fmad={_handle_boolean_option(opts.fma)}")
+    if opts.use_fast_math is not None and opts.use_fast_math:
+        options.append("--use_fast_math")
+    if opts.extra_device_vectorization is not None and opts.extra_device_vectorization:
+        options.append("--extra-device-vectorization")
+    if opts.link_time_optimization is not None and opts.link_time_optimization:
+        options.append("--dlink-time-opt")
+    if opts.gen_opt_lto is not None and opts.gen_opt_lto:
+        options.append("--gen-opt-lto")
+    if opts.define_macro is not None:
+        _process_define_macro(options, opts.define_macro)
+    if opts.undefine_macro is not None:
+        if isinstance(opts.undefine_macro, str):
+            options.append(f"--undefine-macro={opts.undefine_macro}")
+        elif is_sequence(opts.undefine_macro):
+            for macro in opts.undefine_macro:
+                options.append(f"--undefine-macro={macro}")
+    if opts.include_path is not None:
+        if isinstance(opts.include_path, str):
+            options.append(f"--include-path={opts.include_path}")
+        elif is_sequence(opts.include_path):
+            for path in opts.include_path:
+                options.append(f"--include-path={path}")
+    if opts.pre_include is not None:
+        if isinstance(opts.pre_include, str):
+            options.append(f"--pre-include={opts.pre_include}")
+        elif is_sequence(opts.pre_include):
+            for header in opts.pre_include:
+                options.append(f"--pre-include={header}")
+    if opts.no_source_include is not None and opts.no_source_include:
+        options.append("--no-source-include")
+    if opts.std is not None:
+        options.append(f"--std={opts.std}")
+    if opts.builtin_move_forward is not None:
+        options.append(f"--builtin-move-forward={_handle_boolean_option(opts.builtin_move_forward)}")
+    if opts.builtin_initializer_list is not None:
+        options.append(f"--builtin-initializer-list={_handle_boolean_option(opts.builtin_initializer_list)}")
+    if opts.disable_warnings is not None and opts.disable_warnings:
+        options.append("--disable-warnings")
+    if opts.restrict is not None and opts.restrict:
+        options.append("--restrict")
+    if opts.device_as_default_execution_space is not None and opts.device_as_default_execution_space:
+        options.append("--device-as-default-execution-space")
+    if opts.device_int128 is not None and opts.device_int128:
+        options.append("--device-int128")
+    if opts.device_float128 is not None and opts.device_float128:
+        options.append("--device-float128")
+    if opts.optimization_info is not None:
+        options.append(f"--optimization-info={opts.optimization_info}")
+    if opts.no_display_error_number is not None and opts.no_display_error_number:
+        options.append("--no-display-error-number")
+    if opts.diag_error is not None:
+        if isinstance(opts.diag_error, int):
+            options.append(f"--diag-error={opts.diag_error}")
+        elif is_sequence(opts.diag_error):
+            for error in opts.diag_error:
+                options.append(f"--diag-error={error}")
+    if opts.diag_suppress is not None:
+        if isinstance(opts.diag_suppress, int):
+            options.append(f"--diag-suppress={opts.diag_suppress}")
+        elif is_sequence(opts.diag_suppress):
+            for suppress in opts.diag_suppress:
+                options.append(f"--diag-suppress={suppress}")
+    if opts.diag_warn is not None:
+        if isinstance(opts.diag_warn, int):
+            options.append(f"--diag-warn={opts.diag_warn}")
+        elif is_sequence(opts.diag_warn):
+            for w in opts.diag_warn:
+                options.append(f"--diag-warn={w}")
+    if opts.brief_diagnostics is not None:
+        options.append(f"--brief-diagnostics={_handle_boolean_option(opts.brief_diagnostics)}")
+    if opts.time is not None:
+        options.append(f"--time={opts.time}")
+    if opts.split_compile is not None:
+        options.append(f"--split-compile={opts.split_compile}")
+    if opts.fdevice_syntax_only is not None and opts.fdevice_syntax_only:
+        options.append("--fdevice-syntax-only")
+    if opts.minimal is not None and opts.minimal:
+        options.append("--minimal")
+    if opts.no_cache is not None and opts.no_cache:
+        options.append("--no-cache")
+    if opts.fdevice_time_trace is not None:
+        options.append(f"--fdevice-time-trace={opts.fdevice_time_trace}")
+    if opts.frandom_seed is not None:
+        options.append(f"--frandom-seed={opts.frandom_seed}")
+    if opts.ofast_compile is not None:
+        options.append(f"--Ofast-compile={opts.ofast_compile}")
+    # PCH options (CUDA 12.8+)
+    if opts.pch is not None and opts.pch:
+        options.append("--pch")
+    if opts.create_pch is not None:
+        options.append(f"--create-pch={opts.create_pch}")
+    if opts.use_pch is not None:
+        options.append(f"--use-pch={opts.use_pch}")
+    if opts.pch_dir is not None:
+        options.append(f"--pch-dir={opts.pch_dir}")
+    if opts.pch_verbose is not None:
+        options.append(f"--pch-verbose={_handle_boolean_option(opts.pch_verbose)}")
+    if opts.pch_messages is not None:
+        options.append(f"--pch-messages={_handle_boolean_option(opts.pch_messages)}")
+    if opts.instantiate_templates_in_pch is not None:
+        options.append(
+            f"--instantiate-templates-in-pch={_handle_boolean_option(opts.instantiate_templates_in_pch)}"
+        )
+    if opts.numba_debug:
+        options.append("--numba-debug")
+    return [o.encode() for o in options]
+
+
+cdef inline object _prepare_nvvm_options_impl(object opts, bint as_bytes):
+    """Build NVVM-specific compiler options."""
+    options = []
+
+    # Options supported by NVVM
+    assert opts.arch is not None
+    arch = opts.arch
+    if arch.startswith("sm_"):
+        arch = f"compute_{arch[3:]}"
+    options.append(f"-arch={arch}")
+    if opts.debug is not None and opts.debug:
+        options.append("-g")
+    if opts.device_code_optimize is False:
+        options.append("-opt=0")
+    elif opts.device_code_optimize is True:
+        options.append("-opt=3")
+    # NVVM uses 0/1 instead of true/false for boolean options
+    if opts.ftz is not None:
+        options.append(f"-ftz={'1' if opts.ftz else '0'}")
+    if opts.prec_sqrt is not None:
+        options.append(f"-prec-sqrt={'1' if opts.prec_sqrt else '0'}")
+    if opts.prec_div is not None:
+        options.append(f"-prec-div={'1' if opts.prec_div else '0'}")
+    if opts.fma is not None:
+        options.append(f"-fma={'1' if opts.fma else '0'}")
+
+    # Check for unsupported options and raise error if they are set
+    unsupported = []
+    if opts.relocatable_device_code is not None:
+        unsupported.append("relocatable_device_code")
+    if opts.extensible_whole_program is not None and opts.extensible_whole_program:
+        unsupported.append("extensible_whole_program")
+    if opts.lineinfo is not None and opts.lineinfo:
+        unsupported.append("lineinfo")
+    if opts.ptxas_options is not None:
+        unsupported.append("ptxas_options")
+    if opts.max_register_count is not None:
+        unsupported.append("max_register_count")
+    if opts.use_fast_math is not None and opts.use_fast_math:
+        unsupported.append("use_fast_math")
+    if opts.extra_device_vectorization is not None and opts.extra_device_vectorization:
+        unsupported.append("extra_device_vectorization")
+    if opts.gen_opt_lto is not None and opts.gen_opt_lto:
+        unsupported.append("gen_opt_lto")
+    if opts.define_macro is not None:
+        unsupported.append("define_macro")
+    if opts.undefine_macro is not None:
+        unsupported.append("undefine_macro")
+    if opts.include_path is not None:
+        unsupported.append("include_path")
+    if opts.pre_include is not None:
+        unsupported.append("pre_include")
+    if opts.no_source_include is not None and opts.no_source_include:
+        unsupported.append("no_source_include")
+    if opts.std is not None:
+        unsupported.append("std")
+    if opts.builtin_move_forward is not None:
+        unsupported.append("builtin_move_forward")
+    if opts.builtin_initializer_list is not None:
+        unsupported.append("builtin_initializer_list")
+    if opts.disable_warnings is not None and opts.disable_warnings:
+        unsupported.append("disable_warnings")
+    if opts.restrict is not None and opts.restrict:
+        unsupported.append("restrict")
+    if opts.device_as_default_execution_space is not None and opts.device_as_default_execution_space:
+        unsupported.append("device_as_default_execution_space")
+    if opts.device_int128 is not None and opts.device_int128:
+        unsupported.append("device_int128")
+    if opts.optimization_info is not None:
+        unsupported.append("optimization_info")
+    if opts.no_display_error_number is not None and opts.no_display_error_number:
+        unsupported.append("no_display_error_number")
+    if opts.diag_error is not None:
+        unsupported.append("diag_error")
+    if opts.diag_suppress is not None:
+        unsupported.append("diag_suppress")
+    if opts.diag_warn is not None:
+        unsupported.append("diag_warn")
+    if opts.brief_diagnostics is not None:
+        unsupported.append("brief_diagnostics")
+    if opts.time is not None:
+        unsupported.append("time")
+    if opts.split_compile is not None:
+        unsupported.append("split_compile")
+    if opts.fdevice_syntax_only is not None and opts.fdevice_syntax_only:
+        unsupported.append("fdevice_syntax_only")
+    if opts.minimal is not None and opts.minimal:
+        unsupported.append("minimal")
+    if opts.numba_debug is not None and opts.numba_debug:
+        unsupported.append("numba_debug")
+    if unsupported:
+        raise CUDAError(f"The following options are not supported by NVVM backend: {', '.join(unsupported)}")
+
+    if as_bytes:
+        return [o.encode() for o in options]
+    else:
+        return options
\ No newline at end of file

From ca32d2b39dd7a3363dae40362ae9c3933c477e3f Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Wed, 11 Feb 2026 19:54:54 +0000
Subject: [PATCH 32/50] fix import

---
 cuda_core/tests/test_program.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py
index 44d69a9bd4..929e38fd7a 100644
--- a/cuda_core/tests/test_program.py
+++ b/cuda_core/tests/test_program.py
@@ -530,7 +530,7 @@ def test_nvvm_program_options_ltoir(init_cuda, nvvm_ir, options):
 @nvvm_available
 def test_nvvm_program_with_single_extra_source(nvvm_ir):
     """Test NVVM program with a single extra source"""
-    from cuda.core.experimental._program import _get_nvvm_module
+    from cuda.core._program import _get_nvvm_module
 
     nvvm = _get_nvvm_module()
     major, minor, debug_major, debug_minor = nvvm.ir_version()
@@ -568,7 +568,7 @@ def test_nvvm_program_with_single_extra_source(nvvm_ir):
 @nvvm_available
 def test_nvvm_program_with_multiple_extra_sources():
     """Test NVVM program with multiple extra sources"""
-    from cuda.core.experimental._program import _get_nvvm_module
+    from cuda.core._program import _get_nvvm_module
 
     nvvm = _get_nvvm_module()
     major, minor, debug_major, debug_minor = nvvm.ir_version()

From fac190726a88835239b3097aeca7879f8bac1fce Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Thu, 12 Feb 2026 04:16:39 +0000
Subject: [PATCH 33/50] fix tests

---
 cuda_core/tests/test_program.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py
index 929e38fd7a..e439353aa9 100644
--- a/cuda_core/tests/test_program.py
+++ b/cuda_core/tests/test_program.py
@@ -11,6 +11,7 @@
 from cuda.core._module import Kernel, ObjectCode
 from cuda.core._program import Program, ProgramOptions
 from cuda.core._utils.cuda_utils import CUDAError, driver, handle_return
+from cuda_python_test_helpers.nvvm_bitcode import minimal_nvvmir  # noqa: F401
 
 cuda_driver_version = handle_return(driver.cuDriverGetVersion())
 is_culink_backend = _linker._decide_nvjitlink_or_driver()

From 4a01e068b79f3834a72934eb149c80025edd9341 Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Thu, 12 Feb 2026 04:39:22 +0000
Subject: [PATCH 34/50] fix ruff check

---
 cuda_core/tests/test_program.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py
index e439353aa9..daef53da4e 100644
--- a/cuda_core/tests/test_program.py
+++ b/cuda_core/tests/test_program.py
@@ -11,7 +11,7 @@
 from cuda.core._module import Kernel, ObjectCode
 from cuda.core._program import Program, ProgramOptions
 from cuda.core._utils.cuda_utils import CUDAError, driver, handle_return
-from cuda_python_test_helpers.nvvm_bitcode import minimal_nvvmir  # noqa: F401
+from cuda_python_test_helpers.nvvm_bitcode import minimal_nvvmir  # noqa: F401, F811
 
 cuda_driver_version = handle_return(driver.cuDriverGetVersion())
 is_culink_backend = _linker._decide_nvjitlink_or_driver()
@@ -650,7 +650,7 @@ def test_nvvm_program_with_multiple_extra_sources():
 
 
 @nvvm_available
-def test_bitcode_format(minimal_nvvmir):
+def test_bitcode_format(minimal_nvvmir):  # noqa: F811
     if len(minimal_nvvmir) < 4:
         pytest.skip("Bitcode file is not valid or empty")
 

From 2d5252fb49bf6bb1db129b030b2f48ab400a6537 Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Thu, 12 Feb 2026 04:44:28 +0000
Subject: [PATCH 35/50] ruff fix find_libdevice

---
 cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py b/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py
index 0cfcd4e493..e56fd17114 100644
--- a/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py
+++ b/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py
@@ -4,10 +4,10 @@
 import glob
 import os
 
+from cuda.pathfinder._dynamic_libs.load_dl_common import DynamicLibNotFoundError as DynamicLibNotFoundError
 from cuda.pathfinder._dynamic_libs.supported_nvidia_libs import IS_WINDOWS
 from cuda.pathfinder._utils.env_vars import get_cuda_home_or_path
 from cuda.pathfinder._utils.find_sub_dirs import find_sub_dirs_all_sitepackages
-from cuda.pathfinder._dynamic_libs.load_dl_common import DynamicLibNotFoundError as DynamicLibNotFoundError
 
 # Site-package paths for libdevice (following SITE_PACKAGES_LIBDIRS pattern)
 SITE_PACKAGES_LIBDEVICE_DIRS = (
@@ -21,6 +21,7 @@
 else:
     bases = ["/usr/local/cuda", "/opt/cuda"]
 
+
 def _no_such_file_in_dir(dir_path: str, filename: str, error_messages: list[str], attachments: list[str]) -> None:
     error_messages.append(f"No such file: {os.path.join(dir_path, filename)}")
     if os.path.isdir(dir_path):

From 2976c24cad77e54091af010d12a8eae32d2fbcd2 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 12 Feb 2026 04:46:35 +0000
Subject: [PATCH 36/50] [pre-commit.ci] auto code formatting

---
 cuda_core/cuda/core/_program.pxd | 2 +-
 cuda_core/cuda/core/_program.pyx | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/cuda_core/cuda/core/_program.pxd b/cuda_core/cuda/core/_program.pxd
index d4abe85ff8..d2ddc71513 100644
--- a/cuda_core/cuda/core/_program.pxd
+++ b/cuda_core/cuda/core/_program.pxd
@@ -14,4 +14,4 @@ cdef class Program:
         object _options  # ProgramOptions
         object __weakref__
         bint _use_libdevice      # Flag for libdevice loading
-        int _module_count  
+        int _module_count
diff --git a/cuda_core/cuda/core/_program.pyx b/cuda_core/cuda/core/_program.pyx
index 35bf55e545..4f0e0c8b78 100644
--- a/cuda_core/cuda/core/_program.pyx
+++ b/cuda_core/cuda/core/_program.pyx
@@ -625,7 +625,7 @@ cdef inline int Program_init(Program self, object code, str code_type, object op
         # Store use_libdevice flag
         if options.use_libdevice:
             self._use_libdevice = True
-        
+
         self._backend = "NVVM"
         self._linker = None
 

From 61c1e00f5d43b654305304ece7e38026e5d80bb7 Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Thu, 12 Feb 2026 04:48:32 +0000
Subject: [PATCH 37/50] add spdx and copyright

---
 .../cuda_python_test_helpers/nvvm_bitcode.py                  | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/cuda_python_test_helpers/cuda_python_test_helpers/nvvm_bitcode.py b/cuda_python_test_helpers/cuda_python_test_helpers/nvvm_bitcode.py
index e42dbff085..5264b947d0 100644
--- a/cuda_python_test_helpers/cuda_python_test_helpers/nvvm_bitcode.py
+++ b/cuda_python_test_helpers/cuda_python_test_helpers/nvvm_bitcode.py
@@ -1,3 +1,7 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+
 import binascii
 
 import pytest

From c6bea0c38d0adce012fc153f96093b36e3a89ec7 Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Thu, 12 Feb 2026 11:44:16 +0000
Subject: [PATCH 38/50] rm redundant include and fix test

---
 cuda_pathfinder/cuda/pathfinder/__init__.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/cuda_pathfinder/cuda/pathfinder/__init__.py b/cuda_pathfinder/cuda/pathfinder/__init__.py
index ece060d81a..bdcbdea8f1 100644
--- a/cuda_pathfinder/cuda/pathfinder/__init__.py
+++ b/cuda_pathfinder/cuda/pathfinder/__init__.py
@@ -19,9 +19,7 @@
     locate_nvidia_header_directory as locate_nvidia_header_directory,
 )
 from cuda.pathfinder._headers.supported_nvidia_headers import SUPPORTED_HEADERS_CTK as _SUPPORTED_HEADERS_CTK
-from cuda.pathfinder._static_libs.find_libdevice import (
-    LibdeviceNotFoundError as LibdeviceNotFoundError,
-)
+
 from cuda.pathfinder._static_libs.find_libdevice import (
     find_libdevice as find_libdevice,
 )

From b7866cf5e38d4e5b4ecd1c3a26cbc744a6ac9e0c Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 12 Feb 2026 11:46:05 +0000
Subject: [PATCH 39/50] [pre-commit.ci] auto code formatting

---
 cuda_pathfinder/cuda/pathfinder/__init__.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/cuda_pathfinder/cuda/pathfinder/__init__.py b/cuda_pathfinder/cuda/pathfinder/__init__.py
index bdcbdea8f1..63781965f3 100644
--- a/cuda_pathfinder/cuda/pathfinder/__init__.py
+++ b/cuda_pathfinder/cuda/pathfinder/__init__.py
@@ -19,7 +19,6 @@
     locate_nvidia_header_directory as locate_nvidia_header_directory,
 )
 from cuda.pathfinder._headers.supported_nvidia_headers import SUPPORTED_HEADERS_CTK as _SUPPORTED_HEADERS_CTK
-
 from cuda.pathfinder._static_libs.find_libdevice import (
     find_libdevice as find_libdevice,
 )

From 42832306f638a116c0b0c9182e6cf7898fdc969e Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Thu, 12 Feb 2026 18:58:45 +0000
Subject: [PATCH 40/50] refresh tests

---
 cuda_core/tests/test_program.py | 60 ++++++++++++---------------------
 1 file changed, 22 insertions(+), 38 deletions(-)

diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py
index daef53da4e..30cdad6b9e 100644
--- a/cuda_core/tests/test_program.py
+++ b/cuda_core/tests/test_program.py
@@ -445,11 +445,23 @@ def test_nvvm_compile_invalid_target(nvvm_ir):
 
 
 @nvvm_available
+@pytest.mark.parametrize("target_type", ["ptx", "ltoir"])
 @pytest.mark.parametrize(
     "options",
     [
         ProgramOptions(name="test1", arch="sm_90", device_code_optimize=False),
         ProgramOptions(name="test2", arch="sm_100", device_code_optimize=False),
+        ProgramOptions(name="test3", arch="sm_100", link_time_optimization=True),
+        ProgramOptions(
+            name="test4",
+            arch="sm_90",
+            ftz=True,
+            prec_sqrt=False,
+            prec_div=False,
+            fma=True,
+            device_code_optimize=True,
+            link_time_optimization=True,
+        ),
         pytest.param(
             ProgramOptions(name="test_sm110_1", arch="sm_110", device_code_optimize=False),
             marks=pytest.mark.skipif(
@@ -481,50 +493,22 @@ def test_nvvm_compile_invalid_target(nvvm_ir):
         ),
     ],
 )
-def test_nvvm_program_options(init_cuda, nvvm_ir, options):
-    """Test NVVM programs with different options"""
+def test_nvvm_program_options(init_cuda, nvvm_ir, options, target_type):
+    """Test NVVM programs with different options and target types (ptx/ltoir)"""
     program = Program(nvvm_ir, "nvvm", options)
     assert program.backend == "NVVM"
 
-    ptx_code = program.compile("ptx")
-    assert isinstance(ptx_code, ObjectCode)
-    assert ptx_code.name == options.name
-
-    code_content = ptx_code.code
-    ptx_text = code_content.decode() if isinstance(code_content, bytes) else str(code_content)
-    assert ".visible .entry simple(" in ptx_text
+    result = program.compile(target_type)
+    assert isinstance(result, ObjectCode)
+    assert result.name == options.name
 
-    program.close()
+    code_content = result.code
+    assert len(code_content) > 0
 
+    if target_type == "ptx":
+        ptx_text = code_content.decode() if isinstance(code_content, bytes) else str(code_content)
+        assert ".visible .entry simple(" in ptx_text
 
-@nvvm_available
-@pytest.mark.parametrize(
-    "options",
-    [
-        ProgramOptions(name="ltoir_test1", arch="sm_90", device_code_optimize=False),
-        ProgramOptions(name="ltoir_test2", arch="sm_100", link_time_optimization=True),
-        ProgramOptions(
-            name="ltoir_test3",
-            arch="sm_90",
-            ftz=True,
-            prec_sqrt=False,
-            prec_div=False,
-            fma=True,
-            device_code_optimize=True,
-            link_time_optimization=True,
-        ),
-    ],
-)
-def test_nvvm_program_options_ltoir(init_cuda, nvvm_ir, options):
-    """Test NVVM programs for LTOIR with different options"""
-    program = Program(nvvm_ir, "nvvm", options)
-    assert program.backend == "NVVM"
-
-    ltoir_code = program.compile("ltoir")
-    assert isinstance(ltoir_code, ObjectCode)
-    assert ltoir_code.name == options.name
-    code_content = ltoir_code.code
-    assert len(code_content) > 0
     program.close()
 
 

From 78f43283e2d795d498b5aa54d93c15d3eadab753 Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Thu, 12 Feb 2026 19:05:29 +0000
Subject: [PATCH 41/50] add correct libdevice for CTK> 13

---
 cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py b/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py
index e56fd17114..c7e3b8cf18 100644
--- a/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py
+++ b/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py
@@ -11,7 +11,7 @@
 
 # Site-package paths for libdevice (following SITE_PACKAGES_LIBDIRS pattern)
 SITE_PACKAGES_LIBDEVICE_DIRS = (
-    "nvidia/cuda_nvvm/nvvm/libdevice",  # CTK 13+
+    "nvidia/cu13/nvvm/libdevice",       # CTK 13+
     "nvidia/cuda_nvcc/nvvm/libdevice",  # CTK <13
 )
 

From ddf48396621e10506a0cf128bb0348aed1caf73a Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 12 Feb 2026 19:14:03 +0000
Subject: [PATCH 42/50] [pre-commit.ci] auto code formatting

---
 cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py b/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py
index c7e3b8cf18..0a37b31736 100644
--- a/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py
+++ b/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py
@@ -11,7 +11,7 @@
 
 # Site-package paths for libdevice (following SITE_PACKAGES_LIBDIRS pattern)
 SITE_PACKAGES_LIBDEVICE_DIRS = (
-    "nvidia/cu13/nvvm/libdevice",       # CTK 13+
+    "nvidia/cu13/nvvm/libdevice",  # CTK 13+
     "nvidia/cuda_nvcc/nvvm/libdevice",  # CTK <13
 )
 

From cd3644e68345c75efde93f1335333739e2886a7e Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Fri, 13 Feb 2026 09:24:53 +0000
Subject: [PATCH 43/50] revamp design of pathfinder as LocatedHeaderDir

---
 cuda_pathfinder/cuda/pathfinder/__init__.py   |   8 +-
 .../pathfinder/_static_libs/find_libdevice.py | 111 +++++++++++-------
 cuda_pathfinder/tests/test_find_libdevice.py  |  55 ++++++---
 3 files changed, 116 insertions(+), 58 deletions(-)

diff --git a/cuda_pathfinder/cuda/pathfinder/__init__.py b/cuda_pathfinder/cuda/pathfinder/__init__.py
index 63781965f3..cc1eda8d2d 100644
--- a/cuda_pathfinder/cuda/pathfinder/__init__.py
+++ b/cuda_pathfinder/cuda/pathfinder/__init__.py
@@ -20,10 +20,10 @@
 )
 from cuda.pathfinder._headers.supported_nvidia_headers import SUPPORTED_HEADERS_CTK as _SUPPORTED_HEADERS_CTK
 from cuda.pathfinder._static_libs.find_libdevice import (
-    find_libdevice as find_libdevice,
-)
-from cuda.pathfinder._static_libs.find_libdevice import (
-    get_libdevice_path as get_libdevice_path,
+    BitcodeLibNotFoundError as BitcodeLibNotFoundError,
+    LocatedBitcodeLib as LocatedBitcodeLib,
+    find_bitcode_lib as find_bitcode_lib,
+    locate_bitcode_lib as locate_bitcode_lib,
 )
 
 from cuda.pathfinder._version import __version__  # isort: skip  # noqa: F401
diff --git a/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py b/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py
index 0a37b31736..f4422a5df1 100644
--- a/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py
+++ b/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py
@@ -1,25 +1,46 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
+
 import functools
 import glob
 import os
+from dataclasses import dataclass
 
-from cuda.pathfinder._dynamic_libs.load_dl_common import DynamicLibNotFoundError as DynamicLibNotFoundError
+from cuda.pathfinder._dynamic_libs.load_dl_common import DynamicLibNotFoundError
 from cuda.pathfinder._dynamic_libs.supported_nvidia_libs import IS_WINDOWS
 from cuda.pathfinder._utils.env_vars import get_cuda_home_or_path
 from cuda.pathfinder._utils.find_sub_dirs import find_sub_dirs_all_sitepackages
 
-# Site-package paths for libdevice (following SITE_PACKAGES_LIBDIRS pattern)
-SITE_PACKAGES_LIBDEVICE_DIRS = (
-    "nvidia/cu13/nvvm/libdevice",  # CTK 13+
-    "nvidia/cuda_nvcc/nvvm/libdevice",  # CTK <13
-)
 
-FILENAME = "libdevice.10.bc"
+class BitcodeLibNotFoundError(DynamicLibNotFoundError):
+    """Raised when a bitcode library cannot be found."""
+    pass
+
+
+@dataclass(frozen=True)
+class LocatedBitcodeLib:
+    """Information about a located bitcode library.
+    """
+    name: str
+    abs_path: str
+    filename: str
+
+
+SUPPORTED_BITCODE_LIBS = {
+    "device": {
+        "filename": "libdevice.10.bc",
+        "rel_path": os.path.join("nvvm", "libdevice"),
+        "site_packages_dirs": (
+            "nvidia/cu13/nvvm/libdevice",       # CTK 13+
+            "nvidia/cuda_nvcc/nvvm/libdevice",  # CTK <13
+        ),
+    },
+}
+
 if IS_WINDOWS:
-    bases = [r"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA", r"C:\CUDA"]
+    _COMMON_BASES = [r"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA", r"C:\CUDA"]
 else:
-    bases = ["/usr/local/cuda", "/opt/cuda"]
+    _COMMON_BASES = ["/usr/local/cuda", "/opt/cuda"]
 
 
 def _no_such_file_in_dir(dir_path: str, filename: str, error_messages: list[str], attachments: list[str]) -> None:
@@ -32,19 +53,26 @@ def _no_such_file_in_dir(dir_path: str, filename: str, error_messages: list[str]
         attachments.append(f'  Directory does not exist: "{dir_path}"')
 
 
-class _FindLibdevice:
-    REL_PATH = os.path.join("nvvm", "libdevice")
-
-    def __init__(self) -> None:
+class _FindBitcodeLib:
+    def __init__(self, name: str) -> None:
+        if name not in SUPPORTED_BITCODE_LIBS:
+            raise ValueError(
+                f"Unknown bitcode library: '{name}'. "
+                f"Supported: {', '.join(sorted(SUPPORTED_BITCODE_LIBS.keys()))}"
+            )
+        self.name = name
+        self.config = SUPPORTED_BITCODE_LIBS[name]
+        self.filename = self.config["filename"]
+        self.rel_path = self.config["rel_path"]
+        self.site_packages_dirs = self.config["site_packages_dirs"]
         self.error_messages: list[str] = []
         self.attachments: list[str] = []
-        self.abs_path: str | None = None
 
     def try_site_packages(self) -> str | None:
-        for rel_dir in SITE_PACKAGES_LIBDEVICE_DIRS:
+        for rel_dir in self.site_packages_dirs:
             sub_dir = tuple(rel_dir.split("/"))
             for abs_dir in find_sub_dirs_all_sitepackages(sub_dir):
-                file_path = os.path.join(abs_dir, FILENAME)
+                file_path = os.path.join(abs_dir, self.filename)
                 if os.path.isfile(file_path):
                     return file_path
         return None
@@ -55,7 +83,7 @@ def try_with_conda_prefix(self) -> str | None:
             return None
 
         anchor = os.path.join(conda_prefix, "Library") if IS_WINDOWS else conda_prefix
-        file_path = os.path.join(anchor, self.REL_PATH, FILENAME)
+        file_path = os.path.join(anchor, self.rel_path, self.filename)
         if os.path.isfile(file_path):
             return file_path
         return None
@@ -66,29 +94,26 @@ def try_with_cuda_home(self) -> str | None:
             self.error_messages.append("CUDA_HOME/CUDA_PATH not set")
             return None
 
-        file_path = os.path.join(cuda_home, self.REL_PATH, FILENAME)
+        file_path = os.path.join(cuda_home, self.rel_path, self.filename)
         if os.path.isfile(file_path):
             return file_path
 
         _no_such_file_in_dir(
-            os.path.join(cuda_home, self.REL_PATH),
-            FILENAME,
+            os.path.join(cuda_home, self.rel_path),
+            self.filename,
             self.error_messages,
             self.attachments,
         )
         return None
 
     def try_common_paths(self) -> str | None:
-
-        for base in bases:
-            # Direct path
-            file_path = os.path.join(base, self.REL_PATH, FILENAME)
+        for base in _COMMON_BASES:
+            file_path = os.path.join(base, self.rel_path, self.filename)
             if os.path.isfile(file_path):
                 return file_path
-            # Versioned paths (e.g., /usr/local/cuda-13.0)
             for versioned in sorted(glob.glob(base + "*"), reverse=True):
                 if os.path.isdir(versioned):
-                    file_path = os.path.join(versioned, self.REL_PATH, FILENAME)
+                    file_path = os.path.join(versioned, self.rel_path, self.filename)
                     if os.path.isfile(file_path):
                         return file_path
         return None
@@ -96,12 +121,12 @@ def try_common_paths(self) -> str | None:
     def raise_not_found_error(self) -> None:
         err = ", ".join(self.error_messages) if self.error_messages else "No search paths available"
         att = "\n".join(self.attachments) if self.attachments else ""
-        raise DynamicLibNotFoundError(f'Failure finding "{FILENAME}": {err}\n{att}')
+        raise BitcodeLibNotFoundError(f'Failure finding "{self.filename}": {err}\n{att}')
 
 
-def get_libdevice_path() -> str | None:
-    """Get the path to libdevice*.bc, or None if not found."""
-    finder = _FindLibdevice()
+def locate_bitcode_lib(name: str) -> LocatedBitcodeLib | None:
+    """Locate a bitcode library by name."""
+    finder = _FindBitcodeLib(name)
 
     abs_path = finder.try_site_packages()
     if abs_path is None:
@@ -111,16 +136,22 @@ def get_libdevice_path() -> str | None:
     if abs_path is None:
         abs_path = finder.try_common_paths()
 
-    return abs_path
+    if abs_path is None:
+        return None
+
+    return LocatedBitcodeLib(
+        name=name,
+        abs_path=abs_path,
+        filename=finder.filename,
+    )
 
 
 @functools.cache
-def find_libdevice() -> str:
-    """Find the path to libdevice*.bc.
-    Raises:
-        DynamicLibNotFoundError: If libdevice.10.bc cannot be found
-    """
-    path_or_none = get_libdevice_path()
-    if path_or_none is None:
-        raise DynamicLibNotFoundError(f"{FILENAME} not found")
-    return path_or_none
+def find_bitcode_lib(name: str) -> str:
+    """Find the absolute path to a bitcode library."""
+    result = locate_bitcode_lib(name)
+    if result is None:
+        config = SUPPORTED_BITCODE_LIBS.get(name, {})
+        filename = config.get("filename", name)
+        raise BitcodeLibNotFoundError(f"{filename} not found")
+    return result.abs_path
diff --git a/cuda_pathfinder/tests/test_find_libdevice.py b/cuda_pathfinder/tests/test_find_libdevice.py
index a1dfc0b64f..d93929991f 100644
--- a/cuda_pathfinder/tests/test_find_libdevice.py
+++ b/cuda_pathfinder/tests/test_find_libdevice.py
@@ -5,20 +5,18 @@
 
 import pytest
 
-from cuda.pathfinder import find_libdevice
-from cuda.pathfinder._static_libs import find_libdevice as find_libdevice_module
+import cuda.pathfinder._static_libs.find_libdevice as find_libdevice_module
 
 FILENAME = "libdevice.10.bc"
 
 SITE_PACKAGES_REL_DIR_CUDA12 = "nvidia/cuda_nvcc/nvvm/libdevice"
 SITE_PACKAGES_REL_DIR_CUDA13 = "nvidia/cuda_nvvm/nvvm/libdevice"
 
-
 @pytest.fixture
 def clear_find_libdevice_cache():
-    find_libdevice.cache_clear()
+    find_libdevice_module.find_bitcode_lib.cache_clear()
     yield
-    find_libdevice.cache_clear()
+    find_libdevice_module.find_bitcode_lib.cache_clear()
 
 
 def _make_libdevice_file(dir_path: str) -> str:
@@ -44,10 +42,13 @@ def test_find_libdevice_via_site_packages(monkeypatch, mocker, tmp_path, rel_dir
     monkeypatch.delenv("CUDA_HOME", raising=False)
     monkeypatch.delenv("CUDA_PATH", raising=False)
 
-    result = find_libdevice()
+    result = find_libdevice_module.locate_bitcode_lib("device")
 
-    assert result == expected_path
-    assert os.path.isfile(result)
+    assert result is not None
+    assert result.abs_path == expected_path
+    assert result.name == "device"
+    assert result.filename == FILENAME
+    assert os.path.isfile(result.abs_path)
 
 
 # same for cu12/cu13
@@ -67,11 +68,11 @@ def test_find_libdevice_via_conda(monkeypatch, mocker, tmp_path):
     monkeypatch.delenv("CUDA_HOME", raising=False)
     monkeypatch.delenv("CUDA_PATH", raising=False)
 
-    result = find_libdevice()
-
-    assert result == expected_path
-    assert os.path.isfile(result)
+    result = find_libdevice_module.locate_bitcode_lib("device")
 
+    assert result is not None
+    assert result.abs_path == expected_path
+    assert os.path.isfile(result.abs_path)
 
 @pytest.mark.usefixtures("clear_find_libdevice_cache")
 def test_find_libdevice_via_cuda_home(monkeypatch, mocker, tmp_path):
@@ -88,7 +89,33 @@ def test_find_libdevice_via_cuda_home(monkeypatch, mocker, tmp_path):
     monkeypatch.setenv("CUDA_HOME", str(tmp_path))
     monkeypatch.delenv("CUDA_PATH", raising=False)
 
-    result = find_libdevice()
+    result = find_libdevice_module.locate_bitcode_lib("device")
+
+    assert result is not None
+    assert result.abs_path == expected_path
+    assert os.path.isfile(result.abs_path)
+    
+@pytest.mark.usefixtures("clear_find_libdevice_cache")
+def test_find_bitcode_lib_returns_path(monkeypatch, mocker, tmp_path):
+    rel_path = os.path.join("nvvm", "libdevice")
+    libdevice_dir = tmp_path / rel_path
+    expected_path = str(_make_libdevice_file(str(libdevice_dir)))
+
+    mocker.patch.object(
+        find_libdevice_module,
+        "find_sub_dirs_all_sitepackages",
+        return_value=[str(libdevice_dir)],
+    )
+    monkeypatch.delenv("CONDA_PREFIX", raising=False)
+    monkeypatch.delenv("CUDA_HOME", raising=False)
+    monkeypatch.delenv("CUDA_PATH", raising=False)
+
+    result = find_libdevice_module.find_bitcode_lib("device")
 
     assert result == expected_path
-    assert os.path.isfile(result)
+    assert isinstance(result, str)
+
+
+def test_find_bitcode_lib_invalid_name():
+    with pytest.raises(ValueError, match="Unknown bitcode library"):
+        find_libdevice_module.locate_bitcode_lib("invalid")
\ No newline at end of file

From 68b33a219c22ce24de971c89c7fa1babea6bf6d4 Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Fri, 13 Feb 2026 09:27:07 +0000
Subject: [PATCH 44/50] refresh

---
 .../cuda/pathfinder/_static_libs/find_libdevice.py     | 10 +++++-----
 cuda_pathfinder/tests/test_find_libdevice.py           |  7 +++++--
 2 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py b/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py
index f4422a5df1..727ab1577f 100644
--- a/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py
+++ b/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py
@@ -14,13 +14,14 @@
 
 class BitcodeLibNotFoundError(DynamicLibNotFoundError):
     """Raised when a bitcode library cannot be found."""
+
     pass
 
 
 @dataclass(frozen=True)
 class LocatedBitcodeLib:
-    """Information about a located bitcode library.
-    """
+    """Information about a located bitcode library."""
+
     name: str
     abs_path: str
     filename: str
@@ -31,7 +32,7 @@ class LocatedBitcodeLib:
         "filename": "libdevice.10.bc",
         "rel_path": os.path.join("nvvm", "libdevice"),
         "site_packages_dirs": (
-            "nvidia/cu13/nvvm/libdevice",       # CTK 13+
+            "nvidia/cu13/nvvm/libdevice",  # CTK 13+
             "nvidia/cuda_nvcc/nvvm/libdevice",  # CTK <13
         ),
     },
@@ -57,8 +58,7 @@ class _FindBitcodeLib:
     def __init__(self, name: str) -> None:
         if name not in SUPPORTED_BITCODE_LIBS:
             raise ValueError(
-                f"Unknown bitcode library: '{name}'. "
-                f"Supported: {', '.join(sorted(SUPPORTED_BITCODE_LIBS.keys()))}"
+                f"Unknown bitcode library: '{name}'. Supported: {', '.join(sorted(SUPPORTED_BITCODE_LIBS.keys()))}"
             )
         self.name = name
         self.config = SUPPORTED_BITCODE_LIBS[name]
diff --git a/cuda_pathfinder/tests/test_find_libdevice.py b/cuda_pathfinder/tests/test_find_libdevice.py
index d93929991f..78ed1dbe21 100644
--- a/cuda_pathfinder/tests/test_find_libdevice.py
+++ b/cuda_pathfinder/tests/test_find_libdevice.py
@@ -12,6 +12,7 @@
 SITE_PACKAGES_REL_DIR_CUDA12 = "nvidia/cuda_nvcc/nvvm/libdevice"
 SITE_PACKAGES_REL_DIR_CUDA13 = "nvidia/cuda_nvvm/nvvm/libdevice"
 
+
 @pytest.fixture
 def clear_find_libdevice_cache():
     find_libdevice_module.find_bitcode_lib.cache_clear()
@@ -74,6 +75,7 @@ def test_find_libdevice_via_conda(monkeypatch, mocker, tmp_path):
     assert result.abs_path == expected_path
     assert os.path.isfile(result.abs_path)
 
+
 @pytest.mark.usefixtures("clear_find_libdevice_cache")
 def test_find_libdevice_via_cuda_home(monkeypatch, mocker, tmp_path):
     rel_path = os.path.join("nvvm", "libdevice")
@@ -94,7 +96,8 @@ def test_find_libdevice_via_cuda_home(monkeypatch, mocker, tmp_path):
     assert result is not None
     assert result.abs_path == expected_path
     assert os.path.isfile(result.abs_path)
-    
+
+
 @pytest.mark.usefixtures("clear_find_libdevice_cache")
 def test_find_bitcode_lib_returns_path(monkeypatch, mocker, tmp_path):
     rel_path = os.path.join("nvvm", "libdevice")
@@ -118,4 +121,4 @@ def test_find_bitcode_lib_returns_path(monkeypatch, mocker, tmp_path):
 
 def test_find_bitcode_lib_invalid_name():
     with pytest.raises(ValueError, match="Unknown bitcode library"):
-        find_libdevice_module.locate_bitcode_lib("invalid")
\ No newline at end of file
+        find_libdevice_module.locate_bitcode_lib("invalid")

From 631a1138f5a13a58b1f6024b5e9d8ed4019ee136 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 13 Feb 2026 09:29:01 +0000
Subject: [PATCH 45/50] [pre-commit.ci] auto code formatting

---
 cuda_pathfinder/cuda/pathfinder/__init__.py                 | 6 ++++++
 .../cuda/pathfinder/_static_libs/find_libdevice.py          | 2 --
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/cuda_pathfinder/cuda/pathfinder/__init__.py b/cuda_pathfinder/cuda/pathfinder/__init__.py
index cc1eda8d2d..80dff6d486 100644
--- a/cuda_pathfinder/cuda/pathfinder/__init__.py
+++ b/cuda_pathfinder/cuda/pathfinder/__init__.py
@@ -21,8 +21,14 @@
 from cuda.pathfinder._headers.supported_nvidia_headers import SUPPORTED_HEADERS_CTK as _SUPPORTED_HEADERS_CTK
 from cuda.pathfinder._static_libs.find_libdevice import (
     BitcodeLibNotFoundError as BitcodeLibNotFoundError,
+)
+from cuda.pathfinder._static_libs.find_libdevice import (
     LocatedBitcodeLib as LocatedBitcodeLib,
+)
+from cuda.pathfinder._static_libs.find_libdevice import (
     find_bitcode_lib as find_bitcode_lib,
+)
+from cuda.pathfinder._static_libs.find_libdevice import (
     locate_bitcode_lib as locate_bitcode_lib,
 )
 
diff --git a/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py b/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py
index 727ab1577f..a45f03a19b 100644
--- a/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py
+++ b/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py
@@ -15,8 +15,6 @@
 class BitcodeLibNotFoundError(DynamicLibNotFoundError):
     """Raised when a bitcode library cannot be found."""
 
-    pass
-
 
 @dataclass(frozen=True)
 class LocatedBitcodeLib:

From 7a02aee8c974070e0513fc098cc31480f9217903 Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Fri, 13 Feb 2026 09:36:23 +0000
Subject: [PATCH 46/50] fix mypy errirs

---
 .../pathfinder/_static_libs/find_libdevice.py | 31 ++++++++++++-------
 1 file changed, 19 insertions(+), 12 deletions(-)

diff --git a/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py b/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py
index a45f03a19b..7e3c1a7805 100644
--- a/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py
+++ b/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py
@@ -5,6 +5,7 @@
 import glob
 import os
 from dataclasses import dataclass
+from typing import TypedDict
 
 from cuda.pathfinder._dynamic_libs.load_dl_common import DynamicLibNotFoundError
 from cuda.pathfinder._dynamic_libs.supported_nvidia_libs import IS_WINDOWS
@@ -25,21 +26,27 @@ class LocatedBitcodeLib:
     filename: str
 
 
-SUPPORTED_BITCODE_LIBS = {
+class _BitcodeLibConfig(TypedDict):
+    filename: str
+    rel_path: str
+    site_packages_dirs: tuple[str, ...]
+
+
+SUPPORTED_BITCODE_LIBS: dict[str, _BitcodeLibConfig] = {
     "device": {
         "filename": "libdevice.10.bc",
         "rel_path": os.path.join("nvvm", "libdevice"),
         "site_packages_dirs": (
-            "nvidia/cu13/nvvm/libdevice",  # CTK 13+
-            "nvidia/cuda_nvcc/nvvm/libdevice",  # CTK <13
+            "nvidia/cu13/nvvm/libdevice",
+            "nvidia/cuda_nvcc/nvvm/libdevice",
         ),
     },
 }
 
 if IS_WINDOWS:
-    _COMMON_BASES = [r"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA", r"C:\CUDA"]
+    _COMMON_BASES: list[str] = [r"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA", r"C:\CUDA"]
 else:
-    _COMMON_BASES = ["/usr/local/cuda", "/opt/cuda"]
+    _COMMON_BASES: list[str] = ["/usr/local/cuda", "/opt/cuda"]
 
 
 def _no_such_file_in_dir(dir_path: str, filename: str, error_messages: list[str], attachments: list[str]) -> None:
@@ -58,11 +65,11 @@ def __init__(self, name: str) -> None:
             raise ValueError(
                 f"Unknown bitcode library: '{name}'. Supported: {', '.join(sorted(SUPPORTED_BITCODE_LIBS.keys()))}"
             )
-        self.name = name
-        self.config = SUPPORTED_BITCODE_LIBS[name]
-        self.filename = self.config["filename"]
-        self.rel_path = self.config["rel_path"]
-        self.site_packages_dirs = self.config["site_packages_dirs"]
+        self.name: str = name
+        self.config: _BitcodeLibConfig = SUPPORTED_BITCODE_LIBS[name]
+        self.filename: str = self.config["filename"]
+        self.rel_path: str = self.config["rel_path"]
+        self.site_packages_dirs: tuple[str, ...] = self.config["site_packages_dirs"]
         self.error_messages: list[str] = []
         self.attachments: list[str] = []
 
@@ -149,7 +156,7 @@ def find_bitcode_lib(name: str) -> str:
     """Find the absolute path to a bitcode library."""
     result = locate_bitcode_lib(name)
     if result is None:
-        config = SUPPORTED_BITCODE_LIBS.get(name, {})
-        filename = config.get("filename", name)
+        config = SUPPORTED_BITCODE_LIBS.get(name)
+        filename = config["filename"] if config else name
         raise BitcodeLibNotFoundError(f"{filename} not found")
     return result.abs_path

From b8f2eb076e2b703bae7883af6c13842c7c218ea2 Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Fri, 13 Feb 2026 09:42:26 +0000
Subject: [PATCH 47/50] fix base var declaration

---
 .../cuda/pathfinder/_static_libs/find_libdevice.py       | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py b/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py
index 7e3c1a7805..5d51f733d8 100644
--- a/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py
+++ b/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py
@@ -43,10 +43,11 @@ class _BitcodeLibConfig(TypedDict):
     },
 }
 
-if IS_WINDOWS:
-    _COMMON_BASES: list[str] = [r"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA", r"C:\CUDA"]
-else:
-    _COMMON_BASES: list[str] = ["/usr/local/cuda", "/opt/cuda"]
+_COMMON_BASES: list[str] = (
+    [r"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA", r"C:\CUDA"]
+    if IS_WINDOWS
+    else ["/usr/local/cuda", "/opt/cuda"]
+)
 
 
 def _no_such_file_in_dir(dir_path: str, filename: str, error_messages: list[str], attachments: list[str]) -> None:

From 434b3c904c84cdbbd511fe64d6b2ed38be5d34d5 Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Fri, 13 Feb 2026 09:49:58 +0000
Subject: [PATCH 48/50] format changes

---
 .../cuda/pathfinder/_static_libs/find_libdevice.py  | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py b/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py
index 5d51f733d8..102b3b9f77 100644
--- a/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py
+++ b/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py
@@ -13,6 +13,13 @@
 from cuda.pathfinder._utils.find_sub_dirs import find_sub_dirs_all_sitepackages
 
 
+_COMMON_BASES: list[str] = (
+    [r"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA", r"C:\CUDA"]
+    if IS_WINDOWS
+    else ["/usr/local/cuda", "/opt/cuda"]
+)
+
+
 class BitcodeLibNotFoundError(DynamicLibNotFoundError):
     """Raised when a bitcode library cannot be found."""
 
@@ -43,12 +50,6 @@ class _BitcodeLibConfig(TypedDict):
     },
 }
 
-_COMMON_BASES: list[str] = (
-    [r"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA", r"C:\CUDA"]
-    if IS_WINDOWS
-    else ["/usr/local/cuda", "/opt/cuda"]
-)
-
 
 def _no_such_file_in_dir(dir_path: str, filename: str, error_messages: list[str], attachments: list[str]) -> None:
     error_messages.append(f"No such file: {os.path.join(dir_path, filename)}")

From 9f0a3199978b6c042c0c35f0dd6121621a24344a Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Fri, 13 Feb 2026 09:51:56 +0000
Subject: [PATCH 49/50] format changes

---
 .../cuda/pathfinder/_static_libs/find_libdevice.py  | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py b/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py
index 102b3b9f77..5d51f733d8 100644
--- a/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py
+++ b/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py
@@ -13,13 +13,6 @@
 from cuda.pathfinder._utils.find_sub_dirs import find_sub_dirs_all_sitepackages
 
 
-_COMMON_BASES: list[str] = (
-    [r"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA", r"C:\CUDA"]
-    if IS_WINDOWS
-    else ["/usr/local/cuda", "/opt/cuda"]
-)
-
-
 class BitcodeLibNotFoundError(DynamicLibNotFoundError):
     """Raised when a bitcode library cannot be found."""
 
@@ -50,6 +43,12 @@ class _BitcodeLibConfig(TypedDict):
     },
 }
 
+_COMMON_BASES: list[str] = (
+    [r"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA", r"C:\CUDA"]
+    if IS_WINDOWS
+    else ["/usr/local/cuda", "/opt/cuda"]
+)
+
 
 def _no_such_file_in_dir(dir_path: str, filename: str, error_messages: list[str], attachments: list[str]) -> None:
     error_messages.append(f"No such file: {os.path.join(dir_path, filename)}")

From f499487a0ea6a2f63553d0d31802dd8759837020 Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Mon, 16 Feb 2026 06:27:50 +0000
Subject: [PATCH 50/50] rename to bitcodelib

---
 cuda_pathfinder/cuda/pathfinder/__init__.py   |  8 +--
 ...{find_libdevice.py => find_bitcode_lib.py} |  0
 ..._libdevice.py => test_find_bitcode_lib.py} | 64 +++++++++----------
 3 files changed, 36 insertions(+), 36 deletions(-)
 rename cuda_pathfinder/cuda/pathfinder/_static_libs/{find_libdevice.py => find_bitcode_lib.py} (100%)
 rename cuda_pathfinder/tests/{test_find_libdevice.py => test_find_bitcode_lib.py} (57%)

diff --git a/cuda_pathfinder/cuda/pathfinder/__init__.py b/cuda_pathfinder/cuda/pathfinder/__init__.py
index 80dff6d486..29a3ff4979 100644
--- a/cuda_pathfinder/cuda/pathfinder/__init__.py
+++ b/cuda_pathfinder/cuda/pathfinder/__init__.py
@@ -19,16 +19,16 @@
     locate_nvidia_header_directory as locate_nvidia_header_directory,
 )
 from cuda.pathfinder._headers.supported_nvidia_headers import SUPPORTED_HEADERS_CTK as _SUPPORTED_HEADERS_CTK
-from cuda.pathfinder._static_libs.find_libdevice import (
+from cuda.pathfinder._static_libs.find_bitcode_lib import (
     BitcodeLibNotFoundError as BitcodeLibNotFoundError,
 )
-from cuda.pathfinder._static_libs.find_libdevice import (
+from cuda.pathfinder._static_libs.find_bitcode_lib import (
     LocatedBitcodeLib as LocatedBitcodeLib,
 )
-from cuda.pathfinder._static_libs.find_libdevice import (
+from cuda.pathfinder._static_libs.find_bitcode_lib import (
     find_bitcode_lib as find_bitcode_lib,
 )
-from cuda.pathfinder._static_libs.find_libdevice import (
+from cuda.pathfinder._static_libs.find_bitcode_lib import (
     locate_bitcode_lib as locate_bitcode_lib,
 )
 
diff --git a/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py b/cuda_pathfinder/cuda/pathfinder/_static_libs/find_bitcode_lib.py
similarity index 100%
rename from cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py
rename to cuda_pathfinder/cuda/pathfinder/_static_libs/find_bitcode_lib.py
diff --git a/cuda_pathfinder/tests/test_find_libdevice.py b/cuda_pathfinder/tests/test_find_bitcode_lib.py
similarity index 57%
rename from cuda_pathfinder/tests/test_find_libdevice.py
rename to cuda_pathfinder/tests/test_find_bitcode_lib.py
index 78ed1dbe21..b9ab71529f 100644
--- a/cuda_pathfinder/tests/test_find_libdevice.py
+++ b/cuda_pathfinder/tests/test_find_bitcode_lib.py
@@ -5,7 +5,7 @@
 
 import pytest
 
-import cuda.pathfinder._static_libs.find_libdevice as find_libdevice_module
+import cuda.pathfinder._static_libs.find_bitcode_lib as find_bitcode_lib_module
 
 FILENAME = "libdevice.10.bc"
 
@@ -14,13 +14,13 @@
 
 
 @pytest.fixture
-def clear_find_libdevice_cache():
-    find_libdevice_module.find_bitcode_lib.cache_clear()
+def clear_find_bitcode_lib_cache():
+    find_bitcode_lib_module.find_bitcode_lib.cache_clear()
     yield
-    find_libdevice_module.find_bitcode_lib.cache_clear()
+    find_bitcode_lib_module.find_bitcode_lib.cache_clear()
 
 
-def _make_libdevice_file(dir_path: str) -> str:
+def _make_bitcode_lib_file(dir_path: str) -> str:
     os.makedirs(dir_path, exist_ok=True)
     file_path = os.path.join(dir_path, FILENAME)
     with open(file_path, "wb"):
@@ -29,21 +29,21 @@ def _make_libdevice_file(dir_path: str) -> str:
 
 
 @pytest.mark.parametrize("rel_dir", [SITE_PACKAGES_REL_DIR_CUDA12, SITE_PACKAGES_REL_DIR_CUDA13])
-@pytest.mark.usefixtures("clear_find_libdevice_cache")
-def test_find_libdevice_via_site_packages(monkeypatch, mocker, tmp_path, rel_dir):
-    libdevice_dir = tmp_path.joinpath(*rel_dir.split("/"))
-    expected_path = str(_make_libdevice_file(str(libdevice_dir)))
+@pytest.mark.usefixtures("clear_find_bitcode_lib_cache")
+def test_find_bitcode_lib_via_site_packages(monkeypatch, mocker, tmp_path, rel_dir):
+    bitcode_lib_dir = tmp_path.joinpath(*rel_dir.split("/"))
+    expected_path = str(_make_bitcode_lib_file(str(bitcode_lib_dir)))
 
     mocker.patch.object(
-        find_libdevice_module,
+        find_bitcode_lib_module,
         "find_sub_dirs_all_sitepackages",
-        return_value=[str(libdevice_dir)],
+        return_value=[str(bitcode_lib_dir)],
     )
     monkeypatch.delenv("CONDA_PREFIX", raising=False)
     monkeypatch.delenv("CUDA_HOME", raising=False)
     monkeypatch.delenv("CUDA_PATH", raising=False)
 
-    result = find_libdevice_module.locate_bitcode_lib("device")
+    result = find_bitcode_lib_module.locate_bitcode_lib("device")
 
     assert result is not None
     assert result.abs_path == expected_path
@@ -53,15 +53,15 @@ def test_find_libdevice_via_site_packages(monkeypatch, mocker, tmp_path, rel_dir
 
 
 # same for cu12/cu13
-@pytest.mark.usefixtures("clear_find_libdevice_cache")
-def test_find_libdevice_via_conda(monkeypatch, mocker, tmp_path):
+@pytest.mark.usefixtures("clear_find_bitcode_lib_cache")
+def test_find_bitcode_lib_via_conda(monkeypatch, mocker, tmp_path):
     rel_path = os.path.join("nvvm", "libdevice")
-    libdevice_dir = tmp_path / rel_path
-    expected_path = str(_make_libdevice_file(str(libdevice_dir)))
+    bitcode_lib_dir = tmp_path / rel_path
+    expected_path = str(_make_bitcode_lib_file(str(bitcode_lib_dir)))
 
-    mocker.patch.object(find_libdevice_module, "IS_WINDOWS", False)
+    mocker.patch.object(find_bitcode_lib_module, "IS_WINDOWS", False)
     mocker.patch.object(
-        find_libdevice_module,
+        find_bitcode_lib_module,
         "find_sub_dirs_all_sitepackages",
         return_value=[],
     )
@@ -69,21 +69,21 @@ def test_find_libdevice_via_conda(monkeypatch, mocker, tmp_path):
     monkeypatch.delenv("CUDA_HOME", raising=False)
     monkeypatch.delenv("CUDA_PATH", raising=False)
 
-    result = find_libdevice_module.locate_bitcode_lib("device")
+    result = find_bitcode_lib_module.locate_bitcode_lib("device")
 
     assert result is not None
     assert result.abs_path == expected_path
     assert os.path.isfile(result.abs_path)
 
 
-@pytest.mark.usefixtures("clear_find_libdevice_cache")
-def test_find_libdevice_via_cuda_home(monkeypatch, mocker, tmp_path):
+@pytest.mark.usefixtures("clear_find_bitcode_lib_cache")
+def test_find_bitcode_lib_via_cuda_home(monkeypatch, mocker, tmp_path):
     rel_path = os.path.join("nvvm", "libdevice")
-    libdevice_dir = tmp_path / rel_path
-    expected_path = str(_make_libdevice_file(str(libdevice_dir)))
+    bitcode_lib_dir = tmp_path / rel_path
+    expected_path = str(_make_bitcode_lib_file(str(bitcode_lib_dir)))
 
     mocker.patch.object(
-        find_libdevice_module,
+        find_bitcode_lib_module,
         "find_sub_dirs_all_sitepackages",
         return_value=[],
     )
@@ -91,29 +91,29 @@ def test_find_libdevice_via_cuda_home(monkeypatch, mocker, tmp_path):
     monkeypatch.setenv("CUDA_HOME", str(tmp_path))
     monkeypatch.delenv("CUDA_PATH", raising=False)
 
-    result = find_libdevice_module.locate_bitcode_lib("device")
+    result = find_bitcode_lib_module.locate_bitcode_lib("device")
 
     assert result is not None
     assert result.abs_path == expected_path
     assert os.path.isfile(result.abs_path)
 
 
-@pytest.mark.usefixtures("clear_find_libdevice_cache")
+@pytest.mark.usefixtures("clear_find_bitcode_lib_cache")
 def test_find_bitcode_lib_returns_path(monkeypatch, mocker, tmp_path):
     rel_path = os.path.join("nvvm", "libdevice")
-    libdevice_dir = tmp_path / rel_path
-    expected_path = str(_make_libdevice_file(str(libdevice_dir)))
+    bitcode_lib_dir = tmp_path / rel_path
+    expected_path = str(_make_bitcode_lib_file(str(bitcode_lib_dir)))
 
     mocker.patch.object(
-        find_libdevice_module,
+        find_bitcode_lib_module,
         "find_sub_dirs_all_sitepackages",
-        return_value=[str(libdevice_dir)],
+        return_value=[str(bitcode_lib_dir)],
     )
     monkeypatch.delenv("CONDA_PREFIX", raising=False)
     monkeypatch.delenv("CUDA_HOME", raising=False)
     monkeypatch.delenv("CUDA_PATH", raising=False)
 
-    result = find_libdevice_module.find_bitcode_lib("device")
+    result = find_bitcode_lib_module.find_bitcode_lib("device")
 
     assert result == expected_path
     assert isinstance(result, str)
@@ -121,4 +121,4 @@ def test_find_bitcode_lib_returns_path(monkeypatch, mocker, tmp_path):
 
 def test_find_bitcode_lib_invalid_name():
     with pytest.raises(ValueError, match="Unknown bitcode library"):
-        find_libdevice_module.locate_bitcode_lib("invalid")
+        find_bitcode_lib_module.locate_bitcode_lib("invalid")