From a4db20b970d334f43fac86d1c388d07ab90ba110 Mon Sep 17 00:00:00 2001 From: abhilash1910 Date: Wed, 5 Nov 2025 02:14:29 +0000 Subject: [PATCH 01/50] add ltoir test support --- cuda_core/tests/test_program.py | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py index adc778973c..5c966296fe 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -387,3 +387,33 @@ def test_nvvm_program_options(init_cuda, nvvm_ir, options): assert ".visible .entry simple(" in ptx_text program.close() + +@nvvm_available +@pytest.mark.parametrize( + "options", + [ + ProgramOptions(name="ltoir_test1", arch="sm_90", device_code_optimize=False), + ProgramOptions(name="ltoir_test2", arch="sm_100", link_time_optimization=True), + ProgramOptions( + name="ltoir_test3", + arch="sm_90", + ftz=True, + prec_sqrt=False, + prec_div=False, + fma=True, + device_code_optimize=True, + link_time_optimization=True, + ), + ], +) +def test_nvvm_program_options_ltoir(init_cuda, nvvm_ir, options): + """Test NVVM programs for LTOIR with different options""" + program = Program(nvvm_ir, "nvvm", options) + assert program.backend == "NVVM" + + ltoir_code = program.compile("ltoir") + assert isinstance(ltoir_code, ObjectCode) + assert ltoir_code.name == options.name + code_content = ltoir_code.code + assert len(ltoir_code.code) > 0 + program.close() From fb6cfb3902b8a22aba92624ea417616033018852 Mon Sep 17 00:00:00 2001 From: abhilash1910 Date: Tue, 25 Nov 2025 04:10:17 +0000 Subject: [PATCH 02/50] add options for multi-modules --- cuda_core/cuda/core/experimental/_program.py | 30 ++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py index 1db453fed1..2e0115be48 100644 --- a/cuda_core/cuda/core/experimental/_program.py +++ b/cuda_core/cuda/core/experimental/_program.py @@ -298,6 +298,7 @@ class ProgramOptions: split_compile: int | None = None fdevice_syntax_only: bool | None = None minimal: bool | None = None + extra_sources: Union[str, bytes, list[Union[str, bytes]], tuple[Union[str, bytes]]] | None = None def __post_init__(self): self._name = self.name.encode() @@ -479,6 +480,9 @@ def __init__(self, code, code_type, options: ProgramOptions = None): assert_type(code, str) # TODO: support pre-loaded headers & include names # TODO: allow tuples once NVIDIA/cuda-python#72 is resolved + + if options.extra_sources is not None: + raise ValueError("extra_sources is not supported by the NVRTC backend (C++ code_type)") self._mnff.handle = handle_return(nvrtc.nvrtcCreateProgram(code.encode(), options._name, 0, [], [])) self._mnff.backend = "NVRTC" @@ -487,6 +491,9 @@ def __init__(self, code, code_type, options: ProgramOptions = None): elif code_type == "ptx": assert_type(code, str) + if options.extra_sources is not None: + raise ValueError("extra_sources is not supported by the PTX backend.") + self._linker = Linker( ObjectCode._init(code.encode(), code_type), options=self._translate_program_options(options) ) @@ -502,6 +509,29 @@ def __init__(self, code, code_type, options: ProgramOptions = None): self._mnff.handle = nvvm.create_program() self._mnff.backend = "NVVM" nvvm.add_module_to_program(self._mnff.handle, code, len(code), options._name.decode()) + if options.extra_sources is not None: + if isinstance(options.extra_sources, (str, bytes)): + extra_sources = [options.extra_sources] + elif isinstance(options.extra_sources, (list, tuple)): + extra_sources = options.extra_sources + else: + raise TypeError("extra_sources must be str, bytes, list, or tuple") + + if len(extra_sources) == 0: + raise ValueError("extra_sources cannot be empty if provided") + + for i, extra_source in enumerate(extra_sources): + if isinstance(extra_source, str): + extra_source = extra_source.encode("utf-8") + elif not isinstance(extra_source, (bytes, bytearray)): + raise TypeError(f"Extra source {i} must be provided as str, bytes, or bytearray") + + if len(extra_source) == 0: + raise ValueError(f"Extra source {i} cannot be empty") + + extra_name = f"{options.name}_extra_{i}" + nvvm.add_module_to_program(self._mnff.handle, extra_source, len(extra_source), extra_name) + self._backend = "NVVM" self._linker = None From 7aaed4e4f36b890b73098430ae39384217e3d4d1 Mon Sep 17 00:00:00 2001 From: abhilash1910 Date: Tue, 25 Nov 2025 11:05:56 +0000 Subject: [PATCH 03/50] add tests --- cuda_core/tests/test_program.py | 119 ++++++++++++++++++++++++++++++++ 1 file changed, 119 insertions(+) diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py index 5c966296fe..70b131556d 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -417,3 +417,122 @@ def test_nvvm_program_options_ltoir(init_cuda, nvvm_ir, options): code_content = ltoir_code.code assert len(ltoir_code.code) > 0 program.close() + + +@nvvm_available +def test_nvvm_program_with_single_extra_source(nvvm_ir): + """Test NVVM program with a single extra source""" + from cuda.core.experimental._program import _get_nvvm_module + nvvm = _get_nvvm_module() + major, minor, debug_major, debug_minor = nvvm.ir_version() + #helper nvvm ir for multiple module loading + helper_nvvm_ir = f"""target triple = "nvptx64-unknown-cuda" +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" + +define i32 @helper_add(i32 %x) {{ +entry: + %result = add i32 %x, 1 + ret i32 %result +}} + +!nvvmir.version = !{{!0}} +!0 = !{{i32 {major}, i32 {minor}, i32 {debug_major}, i32 {debug_minor}}} +""" + + options = ProgramOptions(name="multi_module_test", extra_sources=helper_nvvm_ir) + program = Program(nvvm_ir, "nvvm", options) + + assert program.backend == "NVVM" + + ptx_code = program.compile("ptx") + assert isinstance(ptx_code, ObjectCode) + assert ptx_code.name == "multi_module_test" + + program.close() + +@nvvm_available +def test_nvvm_program_with_multiple_extra_sources(): + """Test NVVM program with multiple extra sources""" + from cuda.core.experimental._program import _get_nvvm_module + nvvm = _get_nvvm_module() + major, minor, debug_major, debug_minor = nvvm.ir_version() + + main_nvvm_ir = f"""target triple = "nvptx64-unknown-cuda" +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" + +declare i32 @helper_add(i32) nounwind readnone +declare i32 @helper_mul(i32) nounwind readnone + +define void @main_kernel(i32* %data) {{ +entry: + %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + %ptr = getelementptr inbounds i32, i32* %data, i32 %tid + %val = load i32, i32* %ptr, align 4 + + %val1 = call i32 @helper_add(i32 %val) + %val2 = call i32 @helper_mul(i32 %val1) + + store i32 %val2, i32* %ptr, align 4 + ret void +}} + +declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() nounwind readnone + +!nvvm.annotations = !{{!0}} +!0 = !{{void (i32*)* @main_kernel, !"kernel", i32 1}} + +!nvvmir.version = !{{!1}} +!1 = !{{i32 {major}, i32 {minor}, i32 {debug_major}, i32 {debug_minor}}} +""" + + helper1_ir = f"""target triple = "nvptx64-unknown-cuda" +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" + +define i32 @helper_add(i32 %x) nounwind readnone {{ +entry: + %result = add i32 %x, 1 + ret i32 %result +}} + +!nvvmir.version = !{{!0}} +!0 = !{{i32 {major}, i32 {minor}, i32 {debug_major}, i32 {debug_minor}}} +""" + + helper2_ir = f"""target triple = "nvptx64-unknown-cuda" +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" + +define i32 @helper_mul(i32 %x) nounwind readnone {{ +entry: + %result = mul i32 %x, 2 + ret i32 %result +}} + +!nvvmir.version = !{{!0}} +!0 = !{{i32 {major}, i32 {minor}, i32 {debug_major}, i32 {debug_minor}}} +""" + + options = ProgramOptions( + name="nvvm_multi_helper_test", + extra_sources=[helper1_ir, helper2_ir] + ) + program = Program(main_nvvm_ir, "nvvm", options) + + assert program.backend == "NVVM" + ptx_code = program.compile("ptx") + assert isinstance(ptx_code, ObjectCode) + assert ptx_code.name == "nvvm_multi_helper_test" + + ltoir_code = program.compile("ltoir") + assert isinstance(ltoir_code, ObjectCode) + assert ltoir_code.name == "nvvm_multi_helper_test" + + program.close() + +def test_cpp_program_with_extra_sources(): + #negative test with NVRTC with multiple sources + code = 'extern "C" __global__ void my_kernel(){}' + helper = 'extern "C" __global__ void helper(){}' + options = ProgramOptions(extra_sources = helper) + with pytest.raises(ValueError, match="extra_sources is not supported by the NVRTC backend"): + Program(code, "c++", options) + \ No newline at end of file From 64c7f7d3d7acc4785bd475530786777e5babe2ab Mon Sep 17 00:00:00 2001 From: abhilash1910 Date: Tue, 25 Nov 2025 14:48:43 +0000 Subject: [PATCH 04/50] add bitcode test --- cuda_core/tests/test_program.py | 45 +++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py index 70b131556d..24ae767220 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -527,6 +527,51 @@ def test_nvvm_program_with_multiple_extra_sources(): assert ltoir_code.name == "nvvm_multi_helper_test" program.close() + +@nvvm_available +def test_bitcode_format(): + import os + from pathlib import Path + bitcode_path = os.environ.get("BITCODE_NVVM_PATH") + if not bitcode_path: + pytest.skip(f"BITCODE_NVVM_PATH environment variable is not set." + "Disabling the test.") + bitcode_file = Path(bitcode_path) + if not bitcode_file.exists(): + pytest.skip(f"Bitcode file not found: {bitcode_path}") + + if not bitcode_file.suffix == '.bc': + pytest.skip(f"Expected .bc file, got: {bitcode_file.suffix}") + + try: + with open(bitcode_file, 'rb') as f: + bitcode_data = f.read() + + if len(bitcode_data) < 4: + pytest.skip("Bitcode file appears to be empty or invalid") + + options = ProgramOptions( + name=f"existing_bitcode_{bitcode_file.stem}", + arch="sm_90" + ) + program = Program(bitcode_data, "nvvm", options) + + assert program.backend == "NVVM" + ptx_result = program.compile("ptx") + assert isinstance(ptx_result, ObjectCode) + assert ptx_result.name.startswith("existing_bitcode_") + assert len(ptx_result.code) > 0 + try: + ltoir_result = program.compile("ltoir") + assert isinstance(ltoir_result, ObjectCode) + assert len(ltoir_result.code) > 0 + print(f"LTOIR size: {len(ltoir_result.code)} bytes") + except Exception as e: + print(f"LTOIR compilation failed : {e}") + program.close() + except Exception as e: + pytest.fail(f"Failed to compile existing bitcode file {bitcode_path}: {str(e)}") + def test_cpp_program_with_extra_sources(): #negative test with NVRTC with multiple sources From 42ba301078bc17c796cddbe99499c957633eb5d8 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 25 Nov 2025 14:51:09 +0000 Subject: [PATCH 05/50] [pre-commit.ci] auto code formatting --- cuda_core/cuda/core/experimental/_program.py | 14 ++-- cuda_core/tests/test_program.py | 68 ++++++++++---------- 2 files changed, 40 insertions(+), 42 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py index 2e0115be48..eadd37ca95 100644 --- a/cuda_core/cuda/core/experimental/_program.py +++ b/cuda_core/cuda/core/experimental/_program.py @@ -480,7 +480,7 @@ def __init__(self, code, code_type, options: ProgramOptions = None): assert_type(code, str) # TODO: support pre-loaded headers & include names # TODO: allow tuples once NVIDIA/cuda-python#72 is resolved - + if options.extra_sources is not None: raise ValueError("extra_sources is not supported by the NVRTC backend (C++ code_type)") @@ -493,7 +493,7 @@ def __init__(self, code, code_type, options: ProgramOptions = None): assert_type(code, str) if options.extra_sources is not None: raise ValueError("extra_sources is not supported by the PTX backend.") - + self._linker = Linker( ObjectCode._init(code.encode(), code_type), options=self._translate_program_options(options) ) @@ -516,22 +516,22 @@ def __init__(self, code, code_type, options: ProgramOptions = None): extra_sources = options.extra_sources else: raise TypeError("extra_sources must be str, bytes, list, or tuple") - + if len(extra_sources) == 0: raise ValueError("extra_sources cannot be empty if provided") - + for i, extra_source in enumerate(extra_sources): if isinstance(extra_source, str): extra_source = extra_source.encode("utf-8") elif not isinstance(extra_source, (bytes, bytearray)): raise TypeError(f"Extra source {i} must be provided as str, bytes, or bytearray") - + if len(extra_source) == 0: raise ValueError(f"Extra source {i} cannot be empty") - + extra_name = f"{options.name}_extra_{i}" nvvm.add_module_to_program(self._mnff.handle, extra_source, len(extra_source), extra_name) - + self._backend = "NVVM" self._linker = None diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py index 24ae767220..9aca1878d4 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -388,13 +388,14 @@ def test_nvvm_program_options(init_cuda, nvvm_ir, options): program.close() + @nvvm_available @pytest.mark.parametrize( "options", [ - ProgramOptions(name="ltoir_test1", arch="sm_90", device_code_optimize=False), - ProgramOptions(name="ltoir_test2", arch="sm_100", link_time_optimization=True), - ProgramOptions( + ProgramOptions(name="ltoir_test1", arch="sm_90", device_code_optimize=False), + ProgramOptions(name="ltoir_test2", arch="sm_100", link_time_optimization=True), + ProgramOptions( name="ltoir_test3", arch="sm_90", ftz=True, @@ -403,7 +404,7 @@ def test_nvvm_program_options(init_cuda, nvvm_ir, options): fma=True, device_code_optimize=True, link_time_optimization=True, - ), + ), ], ) def test_nvvm_program_options_ltoir(init_cuda, nvvm_ir, options): @@ -423,9 +424,10 @@ def test_nvvm_program_options_ltoir(init_cuda, nvvm_ir, options): def test_nvvm_program_with_single_extra_source(nvvm_ir): """Test NVVM program with a single extra source""" from cuda.core.experimental._program import _get_nvvm_module + nvvm = _get_nvvm_module() major, minor, debug_major, debug_minor = nvvm.ir_version() - #helper nvvm ir for multiple module loading + # helper nvvm ir for multiple module loading helper_nvvm_ir = f"""target triple = "nvptx64-unknown-cuda" target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" @@ -441,22 +443,24 @@ def test_nvvm_program_with_single_extra_source(nvvm_ir): options = ProgramOptions(name="multi_module_test", extra_sources=helper_nvvm_ir) program = Program(nvvm_ir, "nvvm", options) - + assert program.backend == "NVVM" - + ptx_code = program.compile("ptx") assert isinstance(ptx_code, ObjectCode) assert ptx_code.name == "multi_module_test" - + program.close() + @nvvm_available def test_nvvm_program_with_multiple_extra_sources(): """Test NVVM program with multiple extra sources""" from cuda.core.experimental._program import _get_nvvm_module + nvvm = _get_nvvm_module() major, minor, debug_major, debug_minor = nvvm.ir_version() - + main_nvvm_ir = f"""target triple = "nvptx64-unknown-cuda" target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" @@ -468,10 +472,10 @@ def test_nvvm_program_with_multiple_extra_sources(): %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() %ptr = getelementptr inbounds i32, i32* %data, i32 %tid %val = load i32, i32* %ptr, align 4 - + %val1 = call i32 @helper_add(i32 %val) %val2 = call i32 @helper_mul(i32 %val1) - + store i32 %val2, i32* %ptr, align 4 ret void }} @@ -511,51 +515,46 @@ def test_nvvm_program_with_multiple_extra_sources(): !0 = !{{i32 {major}, i32 {minor}, i32 {debug_major}, i32 {debug_minor}}} """ - options = ProgramOptions( - name="nvvm_multi_helper_test", - extra_sources=[helper1_ir, helper2_ir] - ) + options = ProgramOptions(name="nvvm_multi_helper_test", extra_sources=[helper1_ir, helper2_ir]) program = Program(main_nvvm_ir, "nvvm", options) - + assert program.backend == "NVVM" ptx_code = program.compile("ptx") assert isinstance(ptx_code, ObjectCode) assert ptx_code.name == "nvvm_multi_helper_test" - + ltoir_code = program.compile("ltoir") assert isinstance(ltoir_code, ObjectCode) assert ltoir_code.name == "nvvm_multi_helper_test" - + program.close() + @nvvm_available def test_bitcode_format(): import os from pathlib import Path + bitcode_path = os.environ.get("BITCODE_NVVM_PATH") if not bitcode_path: - pytest.skip(f"BITCODE_NVVM_PATH environment variable is not set." - "Disabling the test.") + pytest.skip("BITCODE_NVVM_PATH environment variable is not set.Disabling the test.") bitcode_file = Path(bitcode_path) if not bitcode_file.exists(): pytest.skip(f"Bitcode file not found: {bitcode_path}") - - if not bitcode_file.suffix == '.bc': + + if not bitcode_file.suffix == ".bc": pytest.skip(f"Expected .bc file, got: {bitcode_file.suffix}") - + try: - with open(bitcode_file, 'rb') as f: + with open(bitcode_file, "rb") as f: bitcode_data = f.read() - + if len(bitcode_data) < 4: pytest.skip("Bitcode file appears to be empty or invalid") - - options = ProgramOptions( - name=f"existing_bitcode_{bitcode_file.stem}", - arch="sm_90" - ) + + options = ProgramOptions(name=f"existing_bitcode_{bitcode_file.stem}", arch="sm_90") program = Program(bitcode_data, "nvvm", options) - + assert program.backend == "NVVM" ptx_result = program.compile("ptx") assert isinstance(ptx_result, ObjectCode) @@ -572,12 +571,11 @@ def test_bitcode_format(): except Exception as e: pytest.fail(f"Failed to compile existing bitcode file {bitcode_path}: {str(e)}") - + def test_cpp_program_with_extra_sources(): - #negative test with NVRTC with multiple sources + # negative test with NVRTC with multiple sources code = 'extern "C" __global__ void my_kernel(){}' helper = 'extern "C" __global__ void helper(){}' - options = ProgramOptions(extra_sources = helper) + options = ProgramOptions(extra_sources=helper) with pytest.raises(ValueError, match="extra_sources is not supported by the NVRTC backend"): Program(code, "c++", options) - \ No newline at end of file From 7ca68992514e63cffbd51642a59a9f277e79540f Mon Sep 17 00:00:00 2001 From: abhilash1910 Date: Tue, 25 Nov 2025 18:14:22 +0000 Subject: [PATCH 06/50] fix format --- cuda_core/tests/test_program.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py index 9aca1878d4..7d3e1c8709 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -416,7 +416,7 @@ def test_nvvm_program_options_ltoir(init_cuda, nvvm_ir, options): assert isinstance(ltoir_code, ObjectCode) assert ltoir_code.name == options.name code_content = ltoir_code.code - assert len(ltoir_code.code) > 0 + assert len(code_content) > 0 program.close() @@ -439,7 +439,7 @@ def test_nvvm_program_with_single_extra_source(nvvm_ir): !nvvmir.version = !{{!0}} !0 = !{{i32 {major}, i32 {minor}, i32 {debug_major}, i32 {debug_minor}}} -""" +""" # noqa: E501 options = ProgramOptions(name="multi_module_test", extra_sources=helper_nvvm_ir) program = Program(nvvm_ir, "nvvm", options) @@ -487,7 +487,7 @@ def test_nvvm_program_with_multiple_extra_sources(): !nvvmir.version = !{{!1}} !1 = !{{i32 {major}, i32 {minor}, i32 {debug_major}, i32 {debug_minor}}} -""" +""" # noqa: E501 helper1_ir = f"""target triple = "nvptx64-unknown-cuda" target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" @@ -500,7 +500,7 @@ def test_nvvm_program_with_multiple_extra_sources(): !nvvmir.version = !{{!0}} !0 = !{{i32 {major}, i32 {minor}, i32 {debug_major}, i32 {debug_minor}}} -""" +""" # noqa: E501 helper2_ir = f"""target triple = "nvptx64-unknown-cuda" target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" @@ -513,7 +513,7 @@ def test_nvvm_program_with_multiple_extra_sources(): !nvvmir.version = !{{!0}} !0 = !{{i32 {major}, i32 {minor}, i32 {debug_major}, i32 {debug_minor}}} -""" +""" # noqa: E501 options = ProgramOptions(name="nvvm_multi_helper_test", extra_sources=[helper1_ir, helper2_ir]) program = Program(main_nvvm_ir, "nvvm", options) @@ -542,7 +542,7 @@ def test_bitcode_format(): if not bitcode_file.exists(): pytest.skip(f"Bitcode file not found: {bitcode_path}") - if not bitcode_file.suffix == ".bc": + if bitcode_file.suffix != ".bc": pytest.skip(f"Expected .bc file, got: {bitcode_file.suffix}") try: From 0674ea17dda35f886279bdef2f33a4aa56071562 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 25 Nov 2025 18:15:33 +0000 Subject: [PATCH 07/50] [pre-commit.ci] auto code formatting --- cuda_core/tests/test_program.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py index 7d3e1c8709..1b96a5f094 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -439,7 +439,7 @@ def test_nvvm_program_with_single_extra_source(nvvm_ir): !nvvmir.version = !{{!0}} !0 = !{{i32 {major}, i32 {minor}, i32 {debug_major}, i32 {debug_minor}}} -""" # noqa: E501 +""" # noqa: E501 options = ProgramOptions(name="multi_module_test", extra_sources=helper_nvvm_ir) program = Program(nvvm_ir, "nvvm", options) @@ -487,7 +487,7 @@ def test_nvvm_program_with_multiple_extra_sources(): !nvvmir.version = !{{!1}} !1 = !{{i32 {major}, i32 {minor}, i32 {debug_major}, i32 {debug_minor}}} -""" # noqa: E501 +""" # noqa: E501 helper1_ir = f"""target triple = "nvptx64-unknown-cuda" target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" @@ -500,7 +500,7 @@ def test_nvvm_program_with_multiple_extra_sources(): !nvvmir.version = !{{!0}} !0 = !{{i32 {major}, i32 {minor}, i32 {debug_major}, i32 {debug_minor}}} -""" # noqa: E501 +""" # noqa: E501 helper2_ir = f"""target triple = "nvptx64-unknown-cuda" target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" @@ -513,7 +513,7 @@ def test_nvvm_program_with_multiple_extra_sources(): !nvvmir.version = !{{!0}} !0 = !{{i32 {major}, i32 {minor}, i32 {debug_major}, i32 {debug_minor}}} -""" # noqa: E501 +""" # noqa: E501 options = ProgramOptions(name="nvvm_multi_helper_test", extra_sources=[helper1_ir, helper2_ir]) program = Program(main_nvvm_ir, "nvvm", options) From 03b122466a0eb10b4adb1b692b4a36b017f7241e Mon Sep 17 00:00:00 2001 From: abhilash1910 Date: Wed, 26 Nov 2025 10:26:42 +0000 Subject: [PATCH 08/50] refresh --- cuda_core/cuda/core/experimental/_program.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py index eadd37ca95..7d3ea15e46 100644 --- a/cuda_core/cuda/core/experimental/_program.py +++ b/cuda_core/cuda/core/experimental/_program.py @@ -479,11 +479,11 @@ def __init__(self, code, code_type, options: ProgramOptions = None): if code_type == "c++": assert_type(code, str) # TODO: support pre-loaded headers & include names - # TODO: allow tuples once NVIDIA/cuda-python#72 is resolved - + if options.extra_sources is not None: raise ValueError("extra_sources is not supported by the NVRTC backend (C++ code_type)") + # TODO: allow tuples once NVIDIA/cuda-python#72 is resolved self._mnff.handle = handle_return(nvrtc.nvrtcCreateProgram(code.encode(), options._name, 0, [], [])) self._mnff.backend = "NVRTC" self._backend = "NVRTC" From 033f11cacf560de6ccf1bf9605070cc912081ddf Mon Sep 17 00:00:00 2001 From: abhilash1910 Date: Mon, 1 Dec 2025 11:45:49 +0000 Subject: [PATCH 09/50] apply bitcode file from cupy_test helpers --- cuda_core/tests/test_program.py | 61 ++++---- .../cuda_python_test_helpers/nvvm_bitcode.py | 136 ++++++++++++++++++ 2 files changed, 163 insertions(+), 34 deletions(-) create mode 100644 cuda_python_test_helpers/cuda_python_test_helpers/nvvm_bitcode.py diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py index 1b96a5f094..c62e1712f6 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -14,6 +14,13 @@ cuda_driver_version = handle_return(driver.cuDriverGetVersion()) is_culink_backend = _linker._decide_nvjitlink_or_driver() +try: + from cuda_python_test_helpers.nvvm_bitcode import ( + minimal_nvvmir + ) + _test_helpers_available = True +except ImportError: + _test_helpers_available = False def _is_nvvm_available(): """Check if NVVM is available.""" @@ -531,46 +538,32 @@ def test_nvvm_program_with_multiple_extra_sources(): @nvvm_available -def test_bitcode_format(): +@pytest.mark.skipif(not _test_helpers_available, reason = "cuda_python_test_helpers not accessible") +def test_bitcode_format(minimal_nvvmir): import os from pathlib import Path + + if len(minimal_nvvmir) < 4: + pytest.skip("Bitcode file is not valid or empty") - bitcode_path = os.environ.get("BITCODE_NVVM_PATH") - if not bitcode_path: - pytest.skip("BITCODE_NVVM_PATH environment variable is not set.Disabling the test.") - bitcode_file = Path(bitcode_path) - if not bitcode_file.exists(): - pytest.skip(f"Bitcode file not found: {bitcode_path}") - - if bitcode_file.suffix != ".bc": - pytest.skip(f"Expected .bc file, got: {bitcode_file.suffix}") + options = ProgramOptions(name=f"minimal_nvvmir_bitcode_test", arch="sm_90") + program = Program(minimal_nvvmir, "nvvm", options) + assert program.backend == "NVVM" + ptx_result = program.compile("ptx") + assert isinstance(ptx_result, ObjectCode) + assert ptx_result.name == "minimal_nvvmir_bitcode_test" + assert len(ptx_result.code) > 0 + prgram_lto = Program(minimal_nvvmir, "nvvm", options) try: - with open(bitcode_file, "rb") as f: - bitcode_data = f.read() - - if len(bitcode_data) < 4: - pytest.skip("Bitcode file appears to be empty or invalid") - - options = ProgramOptions(name=f"existing_bitcode_{bitcode_file.stem}", arch="sm_90") - program = Program(bitcode_data, "nvvm", options) - - assert program.backend == "NVVM" - ptx_result = program.compile("ptx") - assert isinstance(ptx_result, ObjectCode) - assert ptx_result.name.startswith("existing_bitcode_") - assert len(ptx_result.code) > 0 - try: - ltoir_result = program.compile("ltoir") - assert isinstance(ltoir_result, ObjectCode) - assert len(ltoir_result.code) > 0 - print(f"LTOIR size: {len(ltoir_result.code)} bytes") - except Exception as e: - print(f"LTOIR compilation failed : {e}") - program.close() + ltoir_result = program_lto.compile("ltoir") + assert isinstance(ltoir_result, ObjectCode) + assert len(ltoir_result.code) > 0 + print(f"LTOIR size: {len(ltoir_result.code)} bytes") except Exception as e: - pytest.fail(f"Failed to compile existing bitcode file {bitcode_path}: {str(e)}") - + print(f"LTOIR compilation failed : {e}") + finally: + program.close() def test_cpp_program_with_extra_sources(): # negative test with NVRTC with multiple sources diff --git a/cuda_python_test_helpers/cuda_python_test_helpers/nvvm_bitcode.py b/cuda_python_test_helpers/cuda_python_test_helpers/nvvm_bitcode.py new file mode 100644 index 0000000000..cba22d5425 --- /dev/null +++ b/cuda_python_test_helpers/cuda_python_test_helpers/nvvm_bitcode.py @@ -0,0 +1,136 @@ +import binascii +import re +from contextlib import contextmanager + +import pytest +from cuda.bindings import nvvm + +MINIMAL_NVVMIR_TXT_TEMPLATE = b"""\ +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" + +target triple = "nvptx64-nvidia-cuda" + +define void @kernel() { +entry: + ret void +} + +!nvvm.annotations = !{!0} +!0 = !{void ()* @kernel, !"kernel", i32 1} + +!nvvmir.version = !{!1} +!1 = !{i32 %d, i32 0, i32 %d, i32 0} +""" # noqa: E501 + +MINIMAL_NVVMIR_BITCODE_STATIC = { + (1, 3): # (major, debug_major) + "4243c0de3514000005000000620c30244a59be669dfbb4bf0b51804c01000000210c00007f010000" + "0b02210002000000160000000781239141c80449061032399201840c250508191e048b62800c4502" + "42920b42641032143808184b0a3232884870c421234412878c1041920264c808b1142043468820c9" + "01323284182a282a90317cb05c9120c3c8000000892000000b0000003222c80820624600212b2498" + "0c212524980c19270c85a4906032645c20246382a01801300128030173046000132677b00778a007" + "7cb0033a680377b0877420877408873618877a208770d8e012e5d006f0a0077640077a600774a007" + "7640076d900e71a00778a00778d006e980077a80077a80076d900e7160077a100776a0077160076d" + "900e7320077a300772a0077320076d900e7640077a600774a0077640076d900e71200778a0077120" + "0778a00771200778d006e6300772a0077320077a300772d006e6600774a0077640077a600774d006" + "f6100776a0077160077a100776d006f6300772a0077320077a300772d006f6600774a0077640077a" + "600774d006f610077280077a10077280077a10077280076de00e7160077a300772a0077640071a21" + "4c0e11de9c2e4fbbcfbe211560040000000000000000000000000620b141a0e86000004016080000" + "06000000321e980c19114c908c092647c6044362098c009401000000b1180000ac0000003308801c" + "c4e11c6614013d88433884c38c4280077978077398710ce6000fed100ef4800e330c421ec2c11dce" + "a11c6630053d88433884831bcc033dc8433d8c033dcc788c7470077b08077948877070077a700376" + "788770208719cc110eec900ee1300f6e300fe3f00ef0500e3310c41dde211cd8211dc2611e663089" + "3bbc833bd04339b4033cbc833c84033bccf0147660077b6807376887726807378087709087706007" + "76280776f8057678877780875f08877118877298877998812ceef00eeee00ef5c00eec300362c8a1" + "1ce4a11ccca11ce4a11cdc611cca211cc4811dca6106d6904339c84339984339c84339b8c3389443" + "3888033b94c32fbc833cfc823bd4033bb0c30cc7698770588772708374680778608774188774a087" + "19ce530fee000ff2500ee4900ee3400fe1200eec500e3320281ddcc11ec2411ed2211cdc811edce0" + "1ce4e11dea011e66185138b0433a9c833bcc50247660077b68073760877778077898514cf4900ff0" + "500e331e6a1eca611ce8211ddec11d7e011ee4a11ccc211df0610654858338ccc33bb0433dd04339" + "fcc23ce4433b88c33bb0c38cc50a877998877718877408077a28077298815ce3100eecc00ee5500e" + "f33023c1d2411ee4e117d8e11dde011e6648193bb0833db4831b84c3388c4339ccc33cb8c139c8c3" + "3bd4033ccc48b471080776600771088771588719dbc60eec600fede006f0200fe5300fe5200ff650" + "0e6e100ee3300ee5300ff3e006e9e00ee4500ef83023e2ec611cc2811dd8e117ec211de6211dc421" + "1dd8211de8211f66209d3bbc433db80339948339cc58bc7070077778077a08077a488777708719cb" + "e70eef300fe1e00ee9400fe9a00fe530c3010373a8077718875f988770708774a08774d087729881" + "844139e0c338b0433d904339cc40c4a01dcaa11de0411edec11c662463300ee1c00eec300fe9400f" + "e5000000792000001d000000721e482043880c19097232482023818c9191d144a01028643c313242" + "8e9021a318100a00060000006b65726e656c0000230802308240042308843082400c330c4230cc40" + "0c4441c84860821272b3b36b730973737ba30ba34b7b739b1b2528d271b3b36b4b9373b12b939b4b" + "7b731b2530000000a9180000250000000b0a7228877780077a587098433db8c338b04339d0c382e6" + "1cc6a10de8411ec2c11de6211de8211ddec11d1634e3600ee7500fe1200fe4400fe1200fe7500ef4" + "b08081077928877060077678877108077a28077258709cc338b4013ba4833d94c3026b1cd8211cdc" + "e11cdc201ce4611cdc201ce8811ec2611cd0a11cc8611cc2811dd861c1010ff4200fe1500ff4800e" + "00000000d11000000600000007cc3ca4833b9c033b94033da0833c94433890c30100000061200000" + "06000000130481860301000002000000075010cd14610000000000007120000003000000320e1022" + "8400fb020000000000000000650c00001f000000120394f000000000030000000600000006000000" + "4c000000010000005800000000000000580000000100000070000000000000000c00000013000000" + "1f000000080000000600000000000000700000000000000000000000010000000000000000000000" + "060000000000000006000000ffffffff00240000000000005d0c00000d0000001203946700000000" + "6b65726e656c31352e302e376e7670747836342d6e76696469612d637564613c737472696e673e00" + "00000000", + (2, 3): # (major, debug_major) + "4243c0de3514000005000000620c30244a59be669dfbb4bf0b51804c01000000210c000080010000" + "0b02210002000000160000000781239141c80449061032399201840c250508191e048b62800c4502" + "42920b42641032143808184b0a3232884870c421234412878c1041920264c808b1142043468820c9" + "01323284182a282a90317cb05c9120c3c8000000892000000b0000003222c80820624600212b2498" + "0c212524980c19270c85a4906032645c20246382a01801300128030173046000132677b00778a007" + "7cb0033a680377b0877420877408873618877a208770d8e012e5d006f0a0077640077a600774a007" + "7640076d900e71a00778a00778d006e980077a80077a80076d900e7160077a100776a0077160076d" + "900e7320077a300772a0077320076d900e7640077a600774a0077640076d900e71200778a0077120" + "0778a00771200778d006e6300772a0077320077a300772d006e6600774a0077640077a600774d006" + "f6100776a0077160077a100776d006f6300772a0077320077a300772d006f6600774a0077640077a" + "600774d006f610077280077a10077280077a10077280076de00e7160077a300772a0077640071a21" + "4c0e11de9c2e4fbbcfbe211560040000000000000000000000000620b141a0286100004016080000" + "06000000321e980c19114c908c092647c60443620914c10840190000b1180000ac0000003308801c" + "c4e11c6614013d88433884c38c4280077978077398710ce6000fed100ef4800e330c421ec2c11dce" + "a11c6630053d88433884831bcc033dc8433d8c033dcc788c7470077b08077948877070077a700376" + "788770208719cc110eec900ee1300f6e300fe3f00ef0500e3310c41dde211cd8211dc2611e663089" + "3bbc833bd04339b4033cbc833c84033bccf0147660077b6807376887726807378087709087706007" + "76280776f8057678877780875f08877118877298877998812ceef00eeee00ef5c00eec300362c8a1" + "1ce4a11ccca11ce4a11cdc611cca211cc4811dca6106d6904339c84339984339c84339b8c3389443" + "3888033b94c32fbc833cfc823bd4033bb0c30cc7698770588772708374680778608774188774a087" + "19ce530fee000ff2500ee4900ee3400fe1200eec500e3320281ddcc11ec2411ed2211cdc811edce0" + "1ce4e11dea011e66185138b0433a9c833bcc50247660077b68073760877778077898514cf4900ff0" + "500e331e6a1eca611ce8211ddec11d7e011ee4a11ccc211df0610654858338ccc33bb0433dd04339" + "fcc23ce4433b88c33bb0c38cc50a877998877718877408077a28077298815ce3100eecc00ee5500e" + "f33023c1d2411ee4e117d8e11dde011e6648193bb0833db4831b84c3388c4339ccc33cb8c139c8c3" + "3bd4033ccc48b471080776600771088771588719dbc60eec600fede006f0200fe5300fe5200ff650" + "0e6e100ee3300ee5300ff3e006e9e00ee4500ef83023e2ec611cc2811dd8e117ec211de6211dc421" + "1dd8211de8211f66209d3bbc433db80339948339cc58bc7070077778077a08077a488777708719cb" + "e70eef300fe1e00ee9400fe9a00fe530c3010373a8077718875f988770708774a08774d087729881" + "844139e0c338b0433d904339cc40c4a01dcaa11de0411edec11c662463300ee1c00eec300fe9400f" + "e5000000792000001e000000721e482043880c19097232482023818c9191d144a01028643c313242" + "8e9021a318100a00060000006b65726e656c0000230802308240042308843082400c23080431c320" + "04c30c045118858c04262821373bbb36973037b737ba30bab437b7b95102231d373bbbb6343917bb" + "32b9b9b437b7518203000000a9180000250000000b0a7228877780077a587098433db8c338b04339" + "d0c382e61cc6a10de8411ec2c11de6211de8211ddec11d1634e3600ee7500fe1200fe4400fe1200f" + "e7500ef4b08081077928877060077678877108077a28077258709cc338b4013ba4833d94c3026b1c" + "d8211cdce11cdc201ce4611cdc201ce8811ec2611cd0a11cc8611cc2811dd861c1010ff4200fe150" + "0ff4800e00000000d11000000600000007cc3ca4833b9c033b94033da0833c94433890c301000000" + "6120000006000000130481860301000002000000075010cd14610000000000007120000003000000" + "320e10228400fc020000000000000000650c00001f000000120394f0000000000300000006000000" + "060000004c000000010000005800000000000000580000000100000070000000000000000c000000" + "130000001f0000000800000006000000000000007000000000000000000000000100000000000000" + "00000000060000000000000006000000ffffffff00240000000000005d0c00000d00000012039467" + "000000006b65726e656c31352e302e376e7670747836342d6e76696469612d637564613c73747269" + "6e673e0000000000", +} + +@pytest.fixture(params=("txt", "bitcode_static")) +def minimal_nvvmir(request): + major, minor, debug_major, debug_minor = nvvm.ir_version() + + if request.param == "txt": + return MINIMAL_NVVMIR_TXT_TEMPLATE % (major, debug_major) + + bitcode_static_binascii = MINIMAL_NVVMIR_BITCODE_STATIC.get((major, debug_major)) + if bitcode_static_binascii: + return binascii.unhexlify(bitcode_static_binascii) + raise RuntimeError( + "Static bitcode for NVVM IR version " + f"{major}.{debug_major} is not available in this test.\n" + "Maintainers: Please run the helper script to generate it and add the " + "output to the MINIMAL_NVVMIR_BITCODE_STATIC dict:\n" + " ../../toolshed/build_static_bitcode_input.py" + ) \ No newline at end of file From 6e411ee8619868ec7ac6d4b4304a2e950edf490c Mon Sep 17 00:00:00 2001 From: abhilash1910 Date: Mon, 1 Dec 2025 17:44:23 +0000 Subject: [PATCH 10/50] use 2 tuples --- cuda_core/cuda/core/experimental/_program.py | 58 ++++++++++++-------- cuda_core/tests/test_program.py | 11 +++- 2 files changed, 43 insertions(+), 26 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py index 7d3ea15e46..ba6c4cf0af 100644 --- a/cuda_core/cuda/core/experimental/_program.py +++ b/cuda_core/cuda/core/experimental/_program.py @@ -298,7 +298,10 @@ class ProgramOptions: split_compile: int | None = None fdevice_syntax_only: bool | None = None minimal: bool | None = None - extra_sources: Union[str, bytes, list[Union[str, bytes]], tuple[Union[str, bytes]]] | None = None + # Creating as 2 tuples ((names, source), (names,source)) + extra_sources: ( + Union[List[Tuple[str, Union[str, bytes, bytearray]]], Tuple[Tuple[str, Union[str, bytes, bytearray]]]] | None + ) = None def __post_init__(self): self._name = self.name.encode() @@ -468,13 +471,14 @@ def close(self): nvvm.destroy_program(self.handle) self.handle = None - __slots__ = ("__weakref__", "_mnff", "_backend", "_linker", "_options") + __slots__ = ("__weakref__", "_mnff", "_backend", "_linker", "_options", "_module_count") def __init__(self, code, code_type, options: ProgramOptions = None): self._mnff = Program._MembersNeededForFinalize(self, None, None) self._options = options = check_or_create_options(ProgramOptions, options, "Program options") code_type = code_type.lower() + self._module_count = 0 if code_type == "c++": assert_type(code, str) @@ -509,28 +513,36 @@ def __init__(self, code, code_type, options: ProgramOptions = None): self._mnff.handle = nvvm.create_program() self._mnff.backend = "NVVM" nvvm.add_module_to_program(self._mnff.handle, code, len(code), options._name.decode()) + self._module_count = 1 + # Add extra modules if provided if options.extra_sources is not None: - if isinstance(options.extra_sources, (str, bytes)): - extra_sources = [options.extra_sources] - elif isinstance(options.extra_sources, (list, tuple)): - extra_sources = options.extra_sources - else: - raise TypeError("extra_sources must be str, bytes, list, or tuple") - - if len(extra_sources) == 0: - raise ValueError("extra_sources cannot be empty if provided") - - for i, extra_source in enumerate(extra_sources): - if isinstance(extra_source, str): - extra_source = extra_source.encode("utf-8") - elif not isinstance(extra_source, (bytes, bytearray)): - raise TypeError(f"Extra source {i} must be provided as str, bytes, or bytearray") - - if len(extra_source) == 0: - raise ValueError(f"Extra source {i} cannot be empty") - - extra_name = f"{options.name}_extra_{i}" - nvvm.add_module_to_program(self._mnff.handle, extra_source, len(extra_source), extra_name) + if not is_sequence(options.extra_sources): + raise TypeError("extra_modules must be a sequence of 2-tuples: ((name1, source1), (name2, source2), ...)") + for i, module in enumerate(options.extra_sources): + if not isinstance(module, tuple) or len(module) != 2: + raise TypeError( + f"Each extra module must be a 2-tuple (name, source), got {type(module).__name__} at index {i}" + ) + + module_name, module_source = module + + if not isinstance(module_name, str): + raise TypeError(f"Module name at index {i} must be a string, got {type(module_name).__name__}") + + if isinstance(module_source, str): + # Textual LLVM IR - encode to UTF-8 bytes + module_source = module_source.encode("utf-8") + elif not isinstance(module_source, (bytes, bytearray)): + raise TypeError( + f"Module source at index {i} must be str (textual LLVM IR), bytes (textual LLVM IR or bitcode), " + f"or bytearray, got {type(module_source).__name__}" + ) + + if len(module_source) == 0: + raise ValueError(f"Module source for '{module_name}' (index {i}) cannot be empty") + + nvvm.add_module_to_program(self._mnff.handle, module_source, len(module_source), module_name) + self._module_count += 1 self._backend = "NVVM" self._linker = None diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py index c62e1712f6..aec975e84e 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -435,7 +435,7 @@ def test_nvvm_program_with_single_extra_source(nvvm_ir): nvvm = _get_nvvm_module() major, minor, debug_major, debug_minor = nvvm.ir_version() # helper nvvm ir for multiple module loading - helper_nvvm_ir = f"""target triple = "nvptx64-unknown-cuda" + helper_nvvmir = f"""target triple = "nvptx64-unknown-cuda" target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" define i32 @helper_add(i32 %x) {{ @@ -448,7 +448,12 @@ def test_nvvm_program_with_single_extra_source(nvvm_ir): !0 = !{{i32 {major}, i32 {minor}, i32 {debug_major}, i32 {debug_minor}}} """ # noqa: E501 - options = ProgramOptions(name="multi_module_test", extra_sources=helper_nvvm_ir) + options = ProgramOptions( + name="multi_module_test", + extra_sources=[ + ("helper", helper_nvvmir), + ] + ) program = Program(nvvm_ir, "nvvm", options) assert program.backend == "NVVM" @@ -522,7 +527,7 @@ def test_nvvm_program_with_multiple_extra_sources(): !0 = !{{i32 {major}, i32 {minor}, i32 {debug_major}, i32 {debug_minor}}} """ # noqa: E501 - options = ProgramOptions(name="nvvm_multi_helper_test", extra_sources=[helper1_ir, helper2_ir]) + options = ProgramOptions(name="nvvm_multi_helper_test", extra_sources=[("helper1", helper1_ir), ("helper2", helper2_ir),]) program = Program(main_nvvm_ir, "nvvm", options) assert program.backend == "NVVM" From aeb26aa1f8dcdaf8a81f6715faa47c40703ced4e Mon Sep 17 00:00:00 2001 From: abhilash1910 Date: Wed, 3 Dec 2025 04:16:32 +0000 Subject: [PATCH 11/50] refresh --- cuda_core/cuda/core/experimental/_program.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py index 9f74c3b10a..3aab5de3ae 100644 --- a/cuda_core/cuda/core/experimental/_program.py +++ b/cuda_core/cuda/core/experimental/_program.py @@ -303,7 +303,7 @@ class ProgramOptions: Union[List[Tuple[str, Union[str, bytes, bytearray]]], Tuple[Tuple[str, Union[str, bytes, bytearray]]]] | None ) = None numba_debug: bool | None = None # Custom option for Numba debugging - + def __post_init__(self): self._name = self.name.encode() @@ -423,8 +423,6 @@ def __post_init__(self): self._formatted_options.append("--fdevice-syntax-only") if self.minimal is not None and self.minimal: self._formatted_options.append("--minimal") - if self.numba_debug: - self._formatted_options.append("--numba-debug") def _as_bytes(self): # TODO: allow tuples once NVIDIA/cuda-python#72 is resolved From b3d6d960faa8f9ff130cd558e91b56213d1b448f Mon Sep 17 00:00:00 2001 From: abhilash1910 Date: Wed, 3 Dec 2025 07:54:25 +0000 Subject: [PATCH 12/50] format --- cuda_core/cuda/core/experimental/_program.py | 27 +++++----- cuda_core/tests/test_program.py | 57 +++++++------------- 2 files changed, 35 insertions(+), 49 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py index 3aab5de3ae..efdb39807a 100644 --- a/cuda_core/cuda/core/experimental/_program.py +++ b/cuda_core/cuda/core/experimental/_program.py @@ -7,7 +7,7 @@ import weakref from contextlib import contextmanager from dataclasses import dataclass -from typing import TYPE_CHECKING, Union +from typing import TYPE_CHECKING, List, Tuple, Union from warnings import warn if TYPE_CHECKING: @@ -303,7 +303,7 @@ class ProgramOptions: Union[List[Tuple[str, Union[str, bytes, bytearray]]], Tuple[Tuple[str, Union[str, bytes, bytearray]]]] | None ) = None numba_debug: bool | None = None # Custom option for Numba debugging - + def __post_init__(self): self._name = self.name.encode() @@ -484,7 +484,7 @@ def __init__(self, code, code_type, options: ProgramOptions = None): if code_type == "c++": assert_type(code, str) # TODO: support pre-loaded headers & include names - + if options.extra_sources is not None: raise ValueError("extra_sources is not supported by the NVRTC backend (C++ code_type)") @@ -515,21 +515,24 @@ def __init__(self, code, code_type, options: ProgramOptions = None): self._mnff.backend = "NVVM" nvvm.add_module_to_program(self._mnff.handle, code, len(code), options._name.decode()) self._module_count = 1 - # Add extra modules if provided + # Add extra modules if provided if options.extra_sources is not None: if not is_sequence(options.extra_sources): - raise TypeError("extra_modules must be a sequence of 2-tuples: ((name1, source1), (name2, source2), ...)") + raise TypeError( + "extra_modules must be a sequence of 2-tuples:((name1, source1), (name2, source2), ...)" + ) for i, module in enumerate(options.extra_sources): if not isinstance(module, tuple) or len(module) != 2: raise TypeError( - f"Each extra module must be a 2-tuple (name, source), got {type(module).__name__} at index {i}" + f"Each extra module must be a 2-tuple (name, source)" + f", got {type(module).__name__} at index {i}" ) - + module_name, module_source = module - + if not isinstance(module_name, str): - raise TypeError(f"Module name at index {i} must be a string, got {type(module_name).__name__}") - + raise TypeError(f"Module name at index {i} must be a string,got {type(module_name).__name__}") + if isinstance(module_source, str): # Textual LLVM IR - encode to UTF-8 bytes module_source = module_source.encode("utf-8") @@ -538,10 +541,10 @@ def __init__(self, code, code_type, options: ProgramOptions = None): f"Module source at index {i} must be str (textual LLVM IR), bytes (textual LLVM IR or bitcode), " f"or bytearray, got {type(module_source).__name__}" ) - + if len(module_source) == 0: raise ValueError(f"Module source for '{module_name}' (index {i}) cannot be empty") - + nvvm.add_module_to_program(self._mnff.handle, module_source, len(module_source), module_name) self._module_count += 1 diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py index e0b9a61023..f9cb8dc5f2 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -15,13 +15,13 @@ is_culink_backend = _linker._decide_nvjitlink_or_driver() try: - from cuda_python_test_helpers.nvvm_bitcode import ( - minimal_nvvmir - ) + from cuda_python_test_helpers.nvvm_bitcode import minimal_nvvmir + _test_helpers_available = True except ImportError: _test_helpers_available = False + def _is_nvvm_available(): """Check if NVVM is available.""" try: @@ -38,29 +38,12 @@ def _is_nvvm_available(): ) try: - from cuda.core.experimental._utils.cuda_utils import driver, handle_return, nvrtc + from cuda.core.experimental._utils.cuda_utils import driver, handle_return _cuda_driver_version = handle_return(driver.cuDriverGetVersion()) except Exception: _cuda_driver_version = 0 - -def _get_nvrtc_version_for_tests(): - """ - Get NVRTC version. - - Returns: - int: Version in format major * 1000 + minor * 100 (e.g., 13200 for CUDA 13.2) - None: If NVRTC is not available - """ - try: - nvrtc_major, nvrtc_minor = handle_return(nvrtc.nvrtcVersion()) - version = nvrtc_major * 1000 + nvrtc_minor * 100 - return version - except Exception: - return None - - _libnvvm_version = None _libnvvm_version_attempted = False @@ -200,13 +183,6 @@ def ptx_code_object(): [ ProgramOptions(name="abc"), ProgramOptions(device_code_optimize=True, debug=True), - pytest.param( - ProgramOptions(debug=True, numba_debug=True), - marks=pytest.mark.skipif( - (_get_nvrtc_version_for_tests() or 0) < 13200, - reason="numba_debug requires NVRTC >= 13.2", - ), - ), ProgramOptions(relocatable_device_code=True, max_register_count=32), ProgramOptions(ftz=True, prec_sqrt=False, prec_div=False), ProgramOptions(fma=False, use_fast_math=True), @@ -473,11 +449,11 @@ def test_nvvm_program_with_single_extra_source(nvvm_ir): """ # noqa: E501 options = ProgramOptions( - name="multi_module_test", - extra_sources=[ - ("helper", helper_nvvmir), - ] - ) + name="multi_module_test", + extra_sources=[ + ("helper", helper_nvvmir), + ], + ) program = Program(nvvm_ir, "nvvm", options) assert program.backend == "NVVM" @@ -551,7 +527,13 @@ def test_nvvm_program_with_multiple_extra_sources(): !0 = !{{i32 {major}, i32 {minor}, i32 {debug_major}, i32 {debug_minor}}} """ # noqa: E501 - options = ProgramOptions(name="nvvm_multi_helper_test", extra_sources=[("helper1", helper1_ir), ("helper2", helper2_ir),]) + options = ProgramOptions( + name="nvvm_multi_helper_test", + extra_sources=[ + ("helper1", helper1_ir), + ("helper2", helper2_ir), + ], + ) program = Program(main_nvvm_ir, "nvvm", options) assert program.backend == "NVVM" @@ -567,11 +549,11 @@ def test_nvvm_program_with_multiple_extra_sources(): @nvvm_available -@pytest.mark.skipif(not _test_helpers_available, reason = "cuda_python_test_helpers not accessible") +@pytest.mark.skipif(not _test_helpers_available, reason="cuda_python_test_helpers not accessible") def test_bitcode_format(minimal_nvvmir): import os from pathlib import Path - + if len(minimal_nvvmir) < 4: pytest.skip("Bitcode file is not valid or empty") @@ -583,7 +565,7 @@ def test_bitcode_format(minimal_nvvmir): assert isinstance(ptx_result, ObjectCode) assert ptx_result.name == "minimal_nvvmir_bitcode_test" assert len(ptx_result.code) > 0 - prgram_lto = Program(minimal_nvvmir, "nvvm", options) + program_lto = Program(minimal_nvvmir, "nvvm", options) try: ltoir_result = program_lto.compile("ltoir") assert isinstance(ltoir_result, ObjectCode) @@ -594,6 +576,7 @@ def test_bitcode_format(minimal_nvvmir): finally: program.close() + def test_cpp_program_with_extra_sources(): # negative test with NVRTC with multiple sources code = 'extern "C" __global__ void my_kernel(){}' From edd6401cc5300c03adb583c021da60b6da18bfe6 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 3 Dec 2025 07:55:54 +0000 Subject: [PATCH 13/50] [pre-commit.ci] auto code formatting --- cuda_core/tests/test_program.py | 5 +---- .../cuda_python_test_helpers/nvvm_bitcode.py | 5 ++--- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py index f9cb8dc5f2..d0de56a41c 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -551,13 +551,10 @@ def test_nvvm_program_with_multiple_extra_sources(): @nvvm_available @pytest.mark.skipif(not _test_helpers_available, reason="cuda_python_test_helpers not accessible") def test_bitcode_format(minimal_nvvmir): - import os - from pathlib import Path - if len(minimal_nvvmir) < 4: pytest.skip("Bitcode file is not valid or empty") - options = ProgramOptions(name=f"minimal_nvvmir_bitcode_test", arch="sm_90") + options = ProgramOptions(name="minimal_nvvmir_bitcode_test", arch="sm_90") program = Program(minimal_nvvmir, "nvvm", options) assert program.backend == "NVVM" diff --git a/cuda_python_test_helpers/cuda_python_test_helpers/nvvm_bitcode.py b/cuda_python_test_helpers/cuda_python_test_helpers/nvvm_bitcode.py index cba22d5425..e42dbff085 100644 --- a/cuda_python_test_helpers/cuda_python_test_helpers/nvvm_bitcode.py +++ b/cuda_python_test_helpers/cuda_python_test_helpers/nvvm_bitcode.py @@ -1,6 +1,4 @@ import binascii -import re -from contextlib import contextmanager import pytest from cuda.bindings import nvvm @@ -117,6 +115,7 @@ "6e673e0000000000", } + @pytest.fixture(params=("txt", "bitcode_static")) def minimal_nvvmir(request): major, minor, debug_major, debug_minor = nvvm.ir_version() @@ -133,4 +132,4 @@ def minimal_nvvmir(request): "Maintainers: Please run the helper script to generate it and add the " "output to the MINIMAL_NVVMIR_BITCODE_STATIC dict:\n" " ../../toolshed/build_static_bitcode_input.py" - ) \ No newline at end of file + ) From 8dbbafece0fc088fa8fbb7d042347f6ce905f9b5 Mon Sep 17 00:00:00 2001 From: abhilash1910 Date: Mon, 15 Dec 2025 17:59:25 +0000 Subject: [PATCH 14/50] fix from upstream --- cuda_core/cuda/core/experimental/_program.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py index efdb39807a..2610bc1232 100644 --- a/cuda_core/cuda/core/experimental/_program.py +++ b/cuda_core/cuda/core/experimental/_program.py @@ -423,6 +423,8 @@ def __post_init__(self): self._formatted_options.append("--fdevice-syntax-only") if self.minimal is not None and self.minimal: self._formatted_options.append("--minimal") + if self.numba_debug: + self._formatted_options.append("--numba-debug") def _as_bytes(self): # TODO: allow tuples once NVIDIA/cuda-python#72 is resolved From b78f0c3e820bce5b6f70903d3d83e10821b02fba Mon Sep 17 00:00:00 2001 From: abhilash1910 Date: Wed, 17 Dec 2025 13:22:24 +0000 Subject: [PATCH 15/50] refresh from upstream --- cuda_core/tests/test_program.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py index 91567fdced..02b9899eea 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -184,6 +184,13 @@ def ptx_code_object(): [ ProgramOptions(name="abc"), ProgramOptions(device_code_optimize=True, debug=True), + pytest.param( + ProgramOptions(debug=True, numba_debug=True), + marks=pytest.mark.skipif( + (_get_nvrtc_version_for_tests() or 0) < 13200, + reason="numba_debug requires NVRTC >= 13.2", + ), + ), ProgramOptions(relocatable_device_code=True, max_register_count=32), ProgramOptions(ftz=True, prec_sqrt=False, prec_div=False), ProgramOptions(fma=False, use_fast_math=True), @@ -714,4 +721,4 @@ def test_program_options_as_bytes_nvvm_unsupported_option(): """Test that unsupported options raise CUDAError for NVVM backend""" options = ProgramOptions(arch="sm_80", lineinfo=True) with pytest.raises(CUDAError, match="not supported by NVVM backend"): - options.as_bytes("nvvm") + options.as_bytes("nvvm") \ No newline at end of file From 99a5593d0d82d5de399228eadf4fc6b6ed825ff6 Mon Sep 17 00:00:00 2001 From: abhilash1910 Date: Wed, 17 Dec 2025 13:27:55 +0000 Subject: [PATCH 16/50] fix tests --- cuda_core/tests/test_program.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py index 02b9899eea..00d0709870 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -45,6 +45,21 @@ def _is_nvvm_available(): except Exception: _cuda_driver_version = 0 + +def _get_nvrtc_version_for_tests(): + """ + Get NVRTC version. + Returns: + int: Version in format major * 1000 + minor * 100 (e.g., 13200 for CUDA 13.2) + None: If NVRTC is not available + """ + try: + nvrtc_major, nvrtc_minor = handle_return(nvrtc.nvrtcVersion()) + version = nvrtc_major * 1000 + nvrtc_minor * 100 + return version + except Exception: + return None + _libnvvm_version = None _libnvvm_version_attempted = False From 783f6e5a419b74757c7620824a7a23ec847321fd Mon Sep 17 00:00:00 2001 From: abhilash1910 Date: Wed, 17 Dec 2025 13:29:35 +0000 Subject: [PATCH 17/50] take path_finder from PR 447 --- .../bindings/_internal/nvjitlink_linux.pyx | 1 + .../cuda/bindings/_path_finder/cuda_paths.py | 400 ++++++++++++++++++ .../find_nvidia_dynamic_library.py | 139 ++++++ .../cuda/bindings/_path_finder/findlib.py | 69 +++ .../load_nvidia_dynamic_library.py | 92 ++++ .../_path_finder/sys_path_find_sub_dirs.py | 40 ++ cuda_bindings/cuda/bindings/path_finder.py | 37 ++ 7 files changed, 778 insertions(+) create mode 100644 cuda_bindings/cuda/bindings/_path_finder/cuda_paths.py create mode 100644 cuda_bindings/cuda/bindings/_path_finder/find_nvidia_dynamic_library.py create mode 100644 cuda_bindings/cuda/bindings/_path_finder/findlib.py create mode 100644 cuda_bindings/cuda/bindings/_path_finder/load_nvidia_dynamic_library.py create mode 100644 cuda_bindings/cuda/bindings/_path_finder/sys_path_find_sub_dirs.py create mode 100644 cuda_bindings/cuda/bindings/path_finder.py diff --git a/cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx b/cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx index 8c96b6d640..d095b3f524 100644 --- a/cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx +++ b/cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx @@ -12,6 +12,7 @@ from .utils import FunctionNotFoundError, NotSupportedError from cuda.pathfinder import load_nvidia_dynamic_lib + ############################################################################### # Extern ############################################################################### diff --git a/cuda_bindings/cuda/bindings/_path_finder/cuda_paths.py b/cuda_bindings/cuda/bindings/_path_finder/cuda_paths.py new file mode 100644 index 0000000000..c827699b3a --- /dev/null +++ b/cuda_bindings/cuda/bindings/_path_finder/cuda_paths.py @@ -0,0 +1,400 @@ +import os +import platform +import re +import site +import sys +import traceback +import warnings +from collections import namedtuple +from pathlib import Path + +from .findlib import find_file, find_lib + +IS_WIN32 = sys.platform.startswith("win32") + +_env_path_tuple = namedtuple("_env_path_tuple", ["by", "info"]) + + +def _get_numba_CUDA_INCLUDE_PATH(): + # From numba/numba/core/config.py + + def _readenv(name, ctor, default): + value = os.environ.get(name) + if value is None: + return default() if callable(default) else default + try: + return ctor(value) + except Exception: + warnings.warn( # noqa: B028 + f"Environment variable '{name}' is defined but " + f"its associated value '{value}' could not be " + "parsed.\nThe parse failed with exception:\n" + f"{traceback.format_exc()}", + RuntimeWarning, + ) + return default + + if IS_WIN32: + cuda_path = os.environ.get("CUDA_PATH") + if cuda_path: # noqa: SIM108 + default_cuda_include_path = os.path.join(cuda_path, "include") + else: + default_cuda_include_path = "cuda_include_not_found" + else: + default_cuda_include_path = os.path.join(os.sep, "usr", "local", "cuda", "include") + CUDA_INCLUDE_PATH = _readenv("NUMBA_CUDA_INCLUDE_PATH", str, default_cuda_include_path) + return CUDA_INCLUDE_PATH + + +config_CUDA_INCLUDE_PATH = _get_numba_CUDA_INCLUDE_PATH() + + +def _find_valid_path(options): + """Find valid path from *options*, which is a list of 2-tuple of + (name, path). Return first pair where *path* is not None. + If no valid path is found, return ('', None) + """ + for by, data in options: + if data is not None: + return by, data + else: + return "", None + + +def _get_libdevice_path_decision(): + options = [ + ("Conda environment", get_conda_ctk()), + ("Conda environment (NVIDIA package)", get_nvidia_libdevice_ctk()), + ("CUDA_HOME", get_cuda_home("nvvm", "libdevice")), + ("Debian package", get_debian_pkg_libdevice()), + ("NVIDIA NVCC Wheel", get_libdevice_wheel()), + ] + libdevice_ctk_dir = get_system_ctk("nvvm", "libdevice") + if libdevice_ctk_dir and os.path.exists(libdevice_ctk_dir): + options.append(("System", libdevice_ctk_dir)) + + by, libdir = _find_valid_path(options) + return by, libdir + + +def _nvvm_lib_dir(): + if IS_WIN32: + return "nvvm", "bin" + else: + return "nvvm", "lib64" + + +def _get_nvvm_path_decision(): + options = [ + ("Conda environment", get_conda_ctk()), + ("Conda environment (NVIDIA package)", get_nvidia_nvvm_ctk()), + ("CUDA_HOME", get_cuda_home(*_nvvm_lib_dir())), + ("NVIDIA NVCC Wheel", _get_nvvm_wheel()), + ] + # need to ensure nvvm dir actually exists + nvvm_ctk_dir = get_system_ctk(*_nvvm_lib_dir()) + if nvvm_ctk_dir and os.path.exists(nvvm_ctk_dir): + options.append(("System", nvvm_ctk_dir)) + + by, path = _find_valid_path(options) + return by, path + + +def _get_nvvm_wheel(): + site_paths = [site.getusersitepackages()] + site.getsitepackages() + ["conda", None] + for sp in site_paths: + # The SONAME is taken based on public CTK 12.x releases + if sys.platform.startswith("linux"): + dso_dir = "lib64" + # Hack: libnvvm from Linux wheel + # does not have any soname (CUDAINST-3183) + dso_path = "libnvvm.so" + elif sys.platform.startswith("win32"): + dso_dir = "bin" + dso_path = "nvvm64_40_0.dll" + else: + raise AssertionError() + + if sp is not None: + dso_dir = os.path.join(sp, "nvidia", "cuda_nvcc", "nvvm", dso_dir) + dso_path = os.path.join(dso_dir, dso_path) + if os.path.exists(dso_path): + return str(Path(dso_path).parent) + + +def _get_libdevice_paths(): + by, libdir = _get_libdevice_path_decision() + if by == "NVIDIA NVCC Wheel": + # The NVVM path is a directory, not a file + out = os.path.join(libdir, "libdevice.10.bc") + else: + # Search for pattern + pat = r"libdevice(\.\d+)*\.bc$" + candidates = find_file(re.compile(pat), libdir) + # Keep only the max (most recent version) of the bitcode files. + out = max(candidates, default=None) + return _env_path_tuple(by, out) + + +def _cudalib_path(): + if IS_WIN32: + return "bin" + else: + return "lib64" + + +def _cuda_home_static_cudalib_path(): + if IS_WIN32: + return ("lib", "x64") + else: + return ("lib64",) + + +def _get_cudalib_dir_path_decision(): + options = [ + ("Conda environment", get_conda_ctk()), + ("Conda environment (NVIDIA package)", get_nvidia_cudalib_ctk()), + ("CUDA_HOME", get_cuda_home(_cudalib_path())), + ("System", get_system_ctk(_cudalib_path())), + ] + by, libdir = _find_valid_path(options) + return by, libdir + + +def _get_static_cudalib_dir_path_decision(): + options = [ + ("Conda environment", get_conda_ctk()), + ("Conda environment (NVIDIA package)", get_nvidia_static_cudalib_ctk()), + ("CUDA_HOME", get_cuda_home(*_cuda_home_static_cudalib_path())), + ("System", get_system_ctk(_cudalib_path())), + ] + by, libdir = _find_valid_path(options) + return by, libdir + + +def _get_cudalib_dir(): + by, libdir = _get_cudalib_dir_path_decision() + return _env_path_tuple(by, libdir) + + +def _get_static_cudalib_dir(): + by, libdir = _get_static_cudalib_dir_path_decision() + return _env_path_tuple(by, libdir) + + +def get_system_ctk(*subdirs): + """Return path to system-wide cudatoolkit; or, None if it doesn't exist.""" + # Linux? + if sys.platform.startswith("linux"): + # Is cuda alias to /usr/local/cuda? + # We are intentionally not getting versioned cuda installation. + base = "/usr/local/cuda" + if os.path.exists(base): + return os.path.join(base, *subdirs) + + +def get_conda_ctk(): + """Return path to directory containing the shared libraries of cudatoolkit.""" + is_conda_env = os.path.exists(os.path.join(sys.prefix, "conda-meta")) + if not is_conda_env: + return + # Assume the existence of NVVM to imply cudatoolkit installed + paths = find_lib("nvvm") + if not paths: + return + # Use the directory name of the max path + return os.path.dirname(max(paths)) + + +def get_nvidia_nvvm_ctk(): + """Return path to directory containing the NVVM shared library.""" + is_conda_env = os.path.exists(os.path.join(sys.prefix, "conda-meta")) + if not is_conda_env: + return + + # Assume the existence of NVVM in the conda env implies that a CUDA toolkit + # conda package is installed. + + # First, try the location used on Linux and the Windows 11.x packages + libdir = os.path.join(sys.prefix, "nvvm", _cudalib_path()) + if not os.path.exists(libdir) or not os.path.isdir(libdir): + # If that fails, try the location used for Windows 12.x packages + libdir = os.path.join(sys.prefix, "Library", "nvvm", _cudalib_path()) + if not os.path.exists(libdir) or not os.path.isdir(libdir): + # If that doesn't exist either, assume we don't have the NVIDIA + # conda package + return + + paths = find_lib("nvvm", libdir=libdir) + if not paths: + return + # Use the directory name of the max path + return os.path.dirname(max(paths)) + + +def get_nvidia_libdevice_ctk(): + """Return path to directory containing the libdevice library.""" + nvvm_ctk = get_nvidia_nvvm_ctk() + if not nvvm_ctk: + return + nvvm_dir = os.path.dirname(nvvm_ctk) + return os.path.join(nvvm_dir, "libdevice") + + +def get_nvidia_cudalib_ctk(): + """Return path to directory containing the shared libraries of cudatoolkit.""" + nvvm_ctk = get_nvidia_nvvm_ctk() + if not nvvm_ctk: + return + env_dir = os.path.dirname(os.path.dirname(nvvm_ctk)) + subdir = "bin" if IS_WIN32 else "lib" + return os.path.join(env_dir, subdir) + + +def get_nvidia_static_cudalib_ctk(): + """Return path to directory containing the static libraries of cudatoolkit.""" + nvvm_ctk = get_nvidia_nvvm_ctk() + if not nvvm_ctk: + return + + if IS_WIN32 and ("Library" not in nvvm_ctk): # noqa: SIM108 + # Location specific to CUDA 11.x packages on Windows + dirs = ("Lib", "x64") + else: + # Linux, or Windows with CUDA 12.x packages + dirs = ("lib",) + + env_dir = os.path.dirname(os.path.dirname(nvvm_ctk)) + return os.path.join(env_dir, *dirs) + + +def get_cuda_home(*subdirs): + """Get paths of CUDA_HOME. + If *subdirs* are the subdirectory name to be appended in the resulting + path. + """ + cuda_home = os.environ.get("CUDA_HOME") + if cuda_home is None: + # Try Windows CUDA installation without Anaconda + cuda_home = os.environ.get("CUDA_PATH") + if cuda_home is not None: + return os.path.join(cuda_home, *subdirs) + + +def _get_nvvm_path(): + by, path = _get_nvvm_path_decision() + if by == "NVIDIA NVCC Wheel": + # The NVVM path is a directory, not a file + path = os.path.join(path, "libnvvm.so") + else: + candidates = find_lib("nvvm", path) + path = max(candidates) if candidates else None + return _env_path_tuple(by, path) + + +def get_cuda_paths(): + """Returns a dictionary mapping component names to a 2-tuple + of (source_variable, info). + The returned dictionary will have the following keys and infos: + - "nvvm": file_path + - "libdevice": List[Tuple[arch, file_path]] + - "cudalib_dir": directory_path + Note: The result of the function is cached. + """ + # Check cache + if hasattr(get_cuda_paths, "_cached_result"): + return get_cuda_paths._cached_result + else: + # Not in cache + d = { + "nvvm": _get_nvvm_path(), + "libdevice": _get_libdevice_paths(), + "cudalib_dir": _get_cudalib_dir(), + "static_cudalib_dir": _get_static_cudalib_dir(), + "include_dir": _get_include_dir(), + } + # Cache result + get_cuda_paths._cached_result = d + return d + + +def get_debian_pkg_libdevice(): + """ + Return the Debian NVIDIA Maintainers-packaged libdevice location, if it + exists. + """ + pkg_libdevice_location = "/usr/lib/nvidia-cuda-toolkit/libdevice" + if not os.path.exists(pkg_libdevice_location): + return None + return pkg_libdevice_location + + +def get_libdevice_wheel(): + nvvm_path = _get_nvvm_wheel() + if nvvm_path is None: + return None + nvvm_path = Path(nvvm_path) + libdevice_path = nvvm_path.parent / "libdevice" + + return str(libdevice_path) + + +def get_current_cuda_target_name(): + """Determine conda's CTK target folder based on system and machine arch. + CTK's conda package delivers headers based on its architecture type. For example, + `x86_64` machine places header under `$CONDA_PREFIX/targets/x86_64-linux`, and + `aarch64` places under `$CONDA_PREFIX/targets/sbsa-linux`. Read more about the + nuances at cudart's conda feedstock: + https://github.com/conda-forge/cuda-cudart-feedstock/blob/main/recipe/meta.yaml#L8-L11 # noqa: E501 + """ + system = platform.system() + machine = platform.machine() + + if system == "Linux": + arch_to_targets = {"x86_64": "x86_64-linux", "aarch64": "sbsa-linux"} + elif system == "Windows": + arch_to_targets = { + "AMD64": "x64", + } + else: + arch_to_targets = {} + + return arch_to_targets.get(machine, None) + + +def get_conda_include_dir(): + """ + Return the include directory in the current conda environment, if one + is active and it exists. + """ + is_conda_env = os.path.exists(os.path.join(sys.prefix, "conda-meta")) + if not is_conda_env: + return + + if platform.system() == "Windows": + include_dir = os.path.join(sys.prefix, "Library", "include") + elif target_name := get_current_cuda_target_name(): + include_dir = os.path.join(sys.prefix, "targets", target_name, "include") + else: + # A fallback when target cannot determined + # though usually it shouldn't. + include_dir = os.path.join(sys.prefix, "include") + + if ( + os.path.exists(include_dir) + and os.path.isdir(include_dir) + and os.path.exists(os.path.join(include_dir, "cuda_device_runtime_api.h")) + ): + return include_dir + return + + +def _get_include_dir(): + """Find the root include directory.""" + options = [ + ("Conda environment (NVIDIA package)", get_conda_include_dir()), + ("CUDA_INCLUDE_PATH Config Entry", config_CUDA_INCLUDE_PATH), + # TODO: add others + ] + by, include_dir = _find_valid_path(options) + return _env_path_tuple(by, include_dir) \ No newline at end of file diff --git a/cuda_bindings/cuda/bindings/_path_finder/find_nvidia_dynamic_library.py b/cuda_bindings/cuda/bindings/_path_finder/find_nvidia_dynamic_library.py new file mode 100644 index 0000000000..d8413282e6 --- /dev/null +++ b/cuda_bindings/cuda/bindings/_path_finder/find_nvidia_dynamic_library.py @@ -0,0 +1,139 @@ +# Copyright 2024-2025 NVIDIA Corporation. All rights reserved. +# +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + +import functools +import glob +import os + +from .cuda_paths import IS_WIN32, get_cuda_paths +from .sys_path_find_sub_dirs import sys_path_find_sub_dirs + + +def _no_such_file_in_sub_dirs(sub_dirs, file_wild, error_messages, attachments): + error_messages.append(f"No such file: {file_wild}") + for sub_dir in sys_path_find_sub_dirs(sub_dirs): + attachments.append(f' listdir("{sub_dir}"):') + for node in sorted(os.listdir(sub_dir)): + attachments.append(f" {node}") + + +def _find_so_using_nvidia_lib_dirs(libname, so_basename, error_messages, attachments): + if libname == "nvvm": # noqa: SIM108 + nvidia_sub_dirs = ("nvidia", "*", "nvvm", "lib64") + else: + nvidia_sub_dirs = ("nvidia", "*", "lib") + file_wild = so_basename + "*" + for lib_dir in sys_path_find_sub_dirs(nvidia_sub_dirs): + # First look for an exact match + so_name = os.path.join(lib_dir, so_basename) + if os.path.isfile(so_name): + return so_name + # Look for a versioned library + # Using sort here mainly to make the result deterministic. + for node in sorted(glob.glob(os.path.join(lib_dir, file_wild))): + so_name = os.path.join(lib_dir, node) + if os.path.isfile(so_name): + return so_name + _no_such_file_in_sub_dirs(nvidia_sub_dirs, file_wild, error_messages, attachments) + return None + + +def _find_dll_using_nvidia_bin_dirs(libname, error_messages, attachments): + if libname == "nvvm": # noqa: SIM108 + nvidia_sub_dirs = ("nvidia", "*", "nvvm", "bin") + else: + nvidia_sub_dirs = ("nvidia", "*", "bin") + file_wild = libname + "*.dll" + for bin_dir in sys_path_find_sub_dirs(nvidia_sub_dirs): + for node in sorted(glob.glob(os.path.join(bin_dir, file_wild))): + dll_name = os.path.join(bin_dir, node) + if os.path.isfile(dll_name): + return dll_name + _no_such_file_in_sub_dirs(nvidia_sub_dirs, file_wild, error_messages, attachments) + return None + + +def _get_cuda_paths_info(key, error_messages): + env_path_tuple = get_cuda_paths()[key] + if not env_path_tuple: + error_messages.append(f'Failure obtaining get_cuda_paths()["{key}"]') + return None + if not env_path_tuple.info: + error_messages.append(f'Failure obtaining get_cuda_paths()["{key}"].info') + return None + return env_path_tuple.info + + +def _find_so_using_cudalib_dir(so_basename, error_messages, attachments): + cudalib_dir = _get_cuda_paths_info("cudalib_dir", error_messages) + if cudalib_dir is None: + return None + primary_so_dir = cudalib_dir + "/" + candidate_so_dirs = [primary_so_dir] + libs = ["/lib/", "/lib64/"] + for _ in range(2): + alt_dir = libs[0].join(primary_so_dir.rsplit(libs[1], 1)) + if alt_dir not in candidate_so_dirs: + candidate_so_dirs.append(alt_dir) + libs.reverse() + candidate_so_names = [so_dirname + so_basename for so_dirname in candidate_so_dirs] + error_messages = [] + for so_name in candidate_so_names: + if os.path.isfile(so_name): + return so_name + error_messages.append(f"No such file: {so_name}") + for so_dirname in candidate_so_dirs: + attachments.append(f' listdir("{so_dirname}"):') + if not os.path.isdir(so_dirname): + attachments.append(" DIRECTORY DOES NOT EXIST") + else: + for node in sorted(os.listdir(so_dirname)): + attachments.append(f" {node}") + return None + + +def _find_dll_using_cudalib_dir(libname, error_messages, attachments): + cudalib_dir = _get_cuda_paths_info("cudalib_dir", error_messages) + if cudalib_dir is None: + return None + file_wild = libname + "*.dll" + for node in sorted(glob.glob(os.path.join(cudalib_dir, file_wild))): + dll_name = os.path.join(cudalib_dir, node) + if os.path.isfile(dll_name): + return dll_name + error_messages.append(f"No such file: {file_wild}") + attachments.append(f' listdir("{cudalib_dir}"):') + for node in sorted(os.listdir(cudalib_dir)): + attachments.append(f" {node}") + return None + + +@functools.cache +def find_nvidia_dynamic_library(name: str) -> str: + error_messages = [] + attachments = [] + + if IS_WIN32: + dll_name = _find_dll_using_nvidia_bin_dirs(name, error_messages, attachments) + if dll_name is None: + if name == "nvvm": + dll_name = _get_cuda_paths_info("nvvm", error_messages) + else: + dll_name = _find_dll_using_cudalib_dir(name, error_messages, attachments) + if dll_name is None: + attachments = "\n".join(attachments) + raise RuntimeError(f"Failure finding {name}*.dll: {', '.join(error_messages)}\n{attachments}") + return dll_name + + so_basename = f"lib{name}.so" + so_name = _find_so_using_nvidia_lib_dirs(name, so_basename, error_messages, attachments) + if so_name is None: + if name == "nvvm": + so_name = _get_cuda_paths_info("nvvm", error_messages) + else: + so_name = _find_so_using_cudalib_dir(so_basename, error_messages, attachments) + if so_name is None: + attachments = "\n".join(attachments) + raise RuntimeError(f"Failure finding {so_basename}: {', '.join(error_messages)}\n{attachments}") + return so_name \ No newline at end of file diff --git a/cuda_bindings/cuda/bindings/_path_finder/findlib.py b/cuda_bindings/cuda/bindings/_path_finder/findlib.py new file mode 100644 index 0000000000..c64c3c9577 --- /dev/null +++ b/cuda_bindings/cuda/bindings/_path_finder/findlib.py @@ -0,0 +1,69 @@ +# Forked from: +# https://github.com/numba/numba/blob/f0d24824fcd6a454827e3c108882395d00befc04/numba/misc/findlib.py + +import os +import re +import sys + + +def get_lib_dirs(): + """ + Anaconda specific + """ + if sys.platform == "win32": + # on windows, historically `DLLs` has been used for CUDA libraries, + # since approximately CUDA 9.2, `Library\bin` has been used. + dirnames = ["DLLs", os.path.join("Library", "bin")] + else: + dirnames = [ + "lib", + ] + libdirs = [os.path.join(sys.prefix, x) for x in dirnames] + return libdirs + + +DLLNAMEMAP = { + "linux": r"lib%(name)s\.so\.%(ver)s$", + "linux2": r"lib%(name)s\.so\.%(ver)s$", + "linux-static": r"lib%(name)s\.a$", + "darwin": r"lib%(name)s\.%(ver)s\.dylib$", + "win32": r"%(name)s%(ver)s\.dll$", + "win32-static": r"%(name)s\.lib$", + "bsd": r"lib%(name)s\.so\.%(ver)s$", +} + +RE_VER = r"[0-9]*([_\.][0-9]+)*" + + +def find_lib(libname, libdir=None, platform=None, static=False): + platform = platform or sys.platform + platform = "bsd" if "bsd" in platform else platform + if static: + platform = f"{platform}-static" + if platform not in DLLNAMEMAP: + # Return empty list if platform name is undefined. + # Not all platforms define their static library paths. + return [] + pat = DLLNAMEMAP[platform] % {"name": libname, "ver": RE_VER} + regex = re.compile(pat) + return find_file(regex, libdir) + + +def find_file(pat, libdir=None): + if libdir is None: + libdirs = get_lib_dirs() + elif isinstance(libdir, str): + libdirs = [ + libdir, + ] + else: + libdirs = list(libdir) + files = [] + for ldir in libdirs: + try: + entries = os.listdir(ldir) + except FileNotFoundError: + continue + candidates = [os.path.join(ldir, ent) for ent in entries if pat.match(ent)] + files.extend([c for c in candidates if os.path.isfile(c)]) + return files \ No newline at end of file diff --git a/cuda_bindings/cuda/bindings/_path_finder/load_nvidia_dynamic_library.py b/cuda_bindings/cuda/bindings/_path_finder/load_nvidia_dynamic_library.py new file mode 100644 index 0000000000..69aadabcbf --- /dev/null +++ b/cuda_bindings/cuda/bindings/_path_finder/load_nvidia_dynamic_library.py @@ -0,0 +1,92 @@ +import functools +import sys + +if sys.platform == "win32": + import ctypes.wintypes + + import pywintypes + import win32api + + # Mirrors WinBase.h (unfortunately not defined already elsewhere) + _WINBASE_LOAD_LIBRARY_SEARCH_SYSTEM32 = 0x00000800 + +else: + import ctypes + import os + + _LINUX_CDLL_MODE = os.RTLD_NOW | os.RTLD_GLOBAL + +from .find_nvidia_dynamic_library import find_nvidia_dynamic_library + + +@functools.cache +def _windows_cuDriverGetVersion() -> int: + handle = win32api.LoadLibrary("nvcuda.dll") + + kernel32 = ctypes.WinDLL("kernel32", use_last_error=True) + GetProcAddress = kernel32.GetProcAddress + GetProcAddress.argtypes = [ctypes.wintypes.HMODULE, ctypes.wintypes.LPCSTR] + GetProcAddress.restype = ctypes.c_void_p + cuDriverGetVersion = GetProcAddress(handle, b"cuDriverGetVersion") + assert cuDriverGetVersion + + FUNC_TYPE = ctypes.CFUNCTYPE(ctypes.c_int, ctypes.POINTER(ctypes.c_int)) + cuDriverGetVersion_fn = FUNC_TYPE(cuDriverGetVersion) + driver_ver = ctypes.c_int() + err = cuDriverGetVersion_fn(ctypes.byref(driver_ver)) + assert err == 0 + return driver_ver.value + + +@functools.cache +def _windows_load_with_dll_basename(name: str) -> int: + driver_ver = _windows_cuDriverGetVersion() + del driver_ver # Keeping this here because it will probably be needed in the future. + + if name == "nvJitLink": + dll_name = "nvJitLink_120_0.dll" + elif name == "nvrtc": + dll_name = "nvrtc64_120_0.dll" + elif name == "nvvm": + dll_name = "nvvm64_40_0.dll" + + try: + return win32api.LoadLibrary(dll_name) + except pywintypes.error: + pass + + return None + + +@functools.cache +def load_nvidia_dynamic_library(name: str) -> int: + # First try using the platform-specific dynamic loader search mechanisms + if sys.platform == "win32": + handle = _windows_load_with_dll_basename(name) + if handle: + return handle + else: + dl_path = f"lib{name}.so" # Version intentionally no specified. + try: + handle = ctypes.CDLL(dl_path, _LINUX_CDLL_MODE) + except OSError: + pass + else: + # Use `cdef void* ptr = ` in cython to convert back to void* + return handle._handle # C unsigned int + + dl_path = find_nvidia_dynamic_library(name) + if sys.platform == "win32": + try: + handle = win32api.LoadLibrary(dl_path) + except pywintypes.error as e: + raise RuntimeError(f"Failed to load DLL at {dl_path}: {e}") from e + # Use `cdef void* ptr = ` in cython to convert back to void* + return handle # C signed int, matches win32api.GetProcAddress + else: + try: + handle = ctypes.CDLL(dl_path, _LINUX_CDLL_MODE) + except OSError as e: + raise RuntimeError(f"Failed to dlopen {dl_path}: {e}") from e + # Use `cdef void* ptr = ` in cython to convert back to void* + return handle._handle # C unsigned int \ No newline at end of file diff --git a/cuda_bindings/cuda/bindings/_path_finder/sys_path_find_sub_dirs.py b/cuda_bindings/cuda/bindings/_path_finder/sys_path_find_sub_dirs.py new file mode 100644 index 0000000000..324cdeec30 --- /dev/null +++ b/cuda_bindings/cuda/bindings/_path_finder/sys_path_find_sub_dirs.py @@ -0,0 +1,40 @@ +# Copyright 2024-2025 NVIDIA Corporation. All rights reserved. +# +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + +import functools +import os +import sys + + +@functools.cache +def _impl(sys_path, sub_dirs): + results = [] + for base in sys_path: + stack = [(base, 0)] # (current_path, index into sub_dirs) + while stack: + current_path, idx = stack.pop() + if idx == len(sub_dirs): + if os.path.isdir(current_path): + results.append(current_path) + continue + + sub = sub_dirs[idx] + if sub == "*": + try: + entries = sorted(os.listdir(current_path)) + except OSError: + continue + for entry in entries: + entry_path = os.path.join(current_path, entry) + if os.path.isdir(entry_path): + stack.append((entry_path, idx + 1)) + else: + next_path = os.path.join(current_path, sub) + if os.path.isdir(next_path): + stack.append((next_path, idx + 1)) + return results + + +def sys_path_find_sub_dirs(sub_dirs): + return _impl(tuple(sys.path), tuple(sub_dirs)) \ No newline at end of file diff --git a/cuda_bindings/cuda/bindings/path_finder.py b/cuda_bindings/cuda/bindings/path_finder.py new file mode 100644 index 0000000000..12c02b930b --- /dev/null +++ b/cuda_bindings/cuda/bindings/path_finder.py @@ -0,0 +1,37 @@ +# Copyright 2024-2025 NVIDIA Corporation. All rights reserved. +# +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + +from cuda.bindings._path_finder.cuda_paths import ( + get_conda_ctk, + get_conda_include_dir, + get_cuda_home, + get_cuda_paths, + get_current_cuda_target_name, + get_debian_pkg_libdevice, + get_libdevice_wheel, + get_nvidia_cudalib_ctk, + get_nvidia_libdevice_ctk, + get_nvidia_nvvm_ctk, + get_nvidia_static_cudalib_ctk, + get_system_ctk, +) +from cuda.bindings._path_finder.find_nvidia_dynamic_library import find_nvidia_dynamic_library +from cuda.bindings._path_finder.load_nvidia_dynamic_library import load_nvidia_dynamic_library + +__all__ = [ + "find_nvidia_dynamic_library", + "load_nvidia_dynamic_library", + "get_conda_ctk", + "get_conda_include_dir", + "get_cuda_home", + "get_cuda_paths", + "get_current_cuda_target_name", + "get_debian_pkg_libdevice", + "get_libdevice_wheel", + "get_nvidia_cudalib_ctk", + "get_nvidia_libdevice_ctk", + "get_nvidia_nvvm_ctk", + "get_nvidia_static_cudalib_ctk", + "get_system_ctk", +] \ No newline at end of file From 5dbfb2d0061d6ca9a656e151d922f4fd9ec255b5 Mon Sep 17 00:00:00 2001 From: abhilash1910 Date: Wed, 17 Dec 2025 13:54:03 +0000 Subject: [PATCH 18/50] add builder files --- cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx | 4 ++-- cuda_bindings/cuda/bindings/_internal/nvvm_linux.pyx | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx b/cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx index d095b3f524..785f587951 100644 --- a/cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx +++ b/cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx @@ -9,7 +9,7 @@ from libc.stdint cimport intptr_t, uintptr_t import threading from .utils import FunctionNotFoundError, NotSupportedError -from cuda.pathfinder import load_nvidia_dynamic_lib +from cuda.bindings import path_finder @@ -77,7 +77,7 @@ cdef void* __nvJitLinkVersion = NULL cdef void* load_library() except* with gil: - cdef uintptr_t handle = load_nvidia_dynamic_lib("nvJitLink")._handle_uint + cdef uintptr_t handle = path_finder.load_nvidia_dynamic_library("nvJitLink") return handle diff --git a/cuda_bindings/cuda/bindings/_internal/nvvm_linux.pyx b/cuda_bindings/cuda/bindings/_internal/nvvm_linux.pyx index 408a2cb592..15845e3228 100644 --- a/cuda_bindings/cuda/bindings/_internal/nvvm_linux.pyx +++ b/cuda_bindings/cuda/bindings/_internal/nvvm_linux.pyx @@ -10,6 +10,7 @@ import threading from .utils import FunctionNotFoundError, NotSupportedError from cuda.pathfinder import load_nvidia_dynamic_lib +from cuda.bindings import path_finder ############################################################################### @@ -75,7 +76,7 @@ cdef void* __nvvmGetProgramLog = NULL cdef void* load_library() except* with gil: - cdef uintptr_t handle = load_nvidia_dynamic_lib("nvvm")._handle_uint + cdef uintptr_t handle = path_finder.load_nvidia_dynamic_library("nvvm") return handle From 0a9eea9967e017bb20eea4801fdbba16e078ce50 Mon Sep 17 00:00:00 2001 From: abhilash1910 Date: Wed, 17 Dec 2025 14:06:11 +0000 Subject: [PATCH 19/50] use python lists/tuples --- cuda_core/cuda/core/experimental/_program.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py index 4da8191f26..1422ceda5f 100644 --- a/cuda_core/cuda/core/experimental/_program.py +++ b/cuda_core/cuda/core/experimental/_program.py @@ -7,7 +7,7 @@ import weakref from contextlib import contextmanager from dataclasses import dataclass -from typing import TYPE_CHECKING, List, Tuple, Union +from typing import TYPE_CHECKING, Union from warnings import warn if TYPE_CHECKING: From 79138c0950aaee94426e5858df48010bb6db4530 Mon Sep 17 00:00:00 2001 From: abhilash1910 Date: Thu, 18 Dec 2025 20:01:25 +0000 Subject: [PATCH 20/50] libdevice integration --- cuda_core/cuda/core/experimental/_program.py | 58 +++++++++++++++++++- 1 file changed, 57 insertions(+), 1 deletion(-) diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py index 1422ceda5f..a5ad2a3be2 100644 --- a/cuda_core/cuda/core/experimental/_program.py +++ b/cuda_core/cuda/core/experimental/_program.py @@ -102,6 +102,47 @@ def _get_nvvm_module(): _nvvm_module = None raise e +def _find_libdevice_path(): + """ + Find libdevice.10.bc using cuda.bindings.path_finder. + + Returns: + str: Path to libdevice.10.bc, or None if not found + """ + try: + from cuda.bindings.path_finder import ( + get_nvidia_libdevice_ctk, + get_libdevice_wheel, + get_debian_pkg_libdevice, + ) + + for getter in [get_nvidia_libdevice_ctk, get_libdevice_wheel, get_debian_pkg_libdevice]: + try: + result = getter() + if result is not None and result.info is not None: + return result.info + except Exception: + continue + + return None + except ImportError: + import os + + # CUDA_HOME + cuda_home = os.environ.get("CUDA_HOME") or os.environ.get("CUDA_PATH") + if cuda_home: + libdevice_path = os.path.join(cuda_home, "nvvm", "libdevice", "libdevice.10.bc") + if os.path.isfile(libdevice_path): + return libdevice_path + + # Linux paths + for base in ["/usr/local/cuda", "/opt/cuda"]: + libdevice_path = os.path.join(base, "nvvm", "libdevice", "libdevice.10.bc") + if os.path.isfile(libdevice_path): + return libdevice_path + + return None + def _process_define_macro_inner(formatted_options, macro): if isinstance(macro, str): @@ -352,6 +393,7 @@ class ProgramOptions: pch_messages: bool | None = None instantiate_templates_in_pch: bool | None = None numba_debug: bool | None = None # Custom option for Numba debugging + use_libdevice: bool | None = None # Use libdevice def __post_init__(self): self._name = self.name.encode() @@ -748,7 +790,8 @@ def __init__(self, code, code_type, options: ProgramOptions = None): nvvm.add_module_to_program(self._mnff.handle, module_source, len(module_source), module_name) self._module_count += 1 - + + self._use_libdevice = options.use_libdevice self._backend = "NVVM" self._linker = None @@ -866,6 +909,19 @@ def compile(self, target_type, name_expressions=(), logs=None): nvvm = _get_nvvm_module() with _nvvm_exception_manager(self): nvvm.verify_program(self._mnff.handle, len(nvvm_options), nvvm_options) + # Invoke libdevice + if getattr(self, '_use_libdevice', False): + libdevice_path = _find_libdevice_path() + if libdevice_path is None: + raise RuntimeError( + "use_libdevice=True but could not find libdevice.10.bc. " + "Ensure CUDA toolkit is installed." + ) + with open(libdevice_path, "rb") as f: + libdevice_bc = f.read() + # Use lazy_add_module for libdevice bitcode only following numba-cuda + nvvm.lazy_add_module_to_program(self._mnff.handle, libdevice_bc, len(libdevice_bc), None) + nvvm.compile_program(self._mnff.handle, len(nvvm_options), nvvm_options) size = nvvm.get_compiled_result_size(self._mnff.handle) From 25d336c1d7644b801d3730fce702a231175bde47 Mon Sep 17 00:00:00 2001 From: abhilash1910 Date: Fri, 19 Dec 2025 18:43:25 +0000 Subject: [PATCH 21/50] refresh --- cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx | 6 ++---- cuda_bindings/cuda/bindings/_internal/nvvm_linux.pyx | 4 +--- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx b/cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx index 785f587951..8025189b34 100644 --- a/cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx +++ b/cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx @@ -9,9 +9,7 @@ from libc.stdint cimport intptr_t, uintptr_t import threading from .utils import FunctionNotFoundError, NotSupportedError -from cuda.bindings import path_finder - - +from cuda.pathfinder import load_nvidia_dynamic_lib ############################################################################### # Extern @@ -77,7 +75,7 @@ cdef void* __nvJitLinkVersion = NULL cdef void* load_library() except* with gil: - cdef uintptr_t handle = path_finder.load_nvidia_dynamic_library("nvJitLink") + cdef uintptr_t handle = load_nvidia_dynamic_lib("nvJitLink")._handle_uint return handle diff --git a/cuda_bindings/cuda/bindings/_internal/nvvm_linux.pyx b/cuda_bindings/cuda/bindings/_internal/nvvm_linux.pyx index 15845e3228..5411e157c7 100644 --- a/cuda_bindings/cuda/bindings/_internal/nvvm_linux.pyx +++ b/cuda_bindings/cuda/bindings/_internal/nvvm_linux.pyx @@ -10,8 +10,6 @@ import threading from .utils import FunctionNotFoundError, NotSupportedError from cuda.pathfinder import load_nvidia_dynamic_lib -from cuda.bindings import path_finder - ############################################################################### # Extern @@ -76,7 +74,7 @@ cdef void* __nvvmGetProgramLog = NULL cdef void* load_library() except* with gil: - cdef uintptr_t handle = path_finder.load_nvidia_dynamic_library("nvvm") + cdef uintptr_t handle = load_nvidia_dynamic_lib("nvvm")._handle_uint return handle From 32c1913af18f49e848bdeaf7e0abb20813005892 Mon Sep 17 00:00:00 2001 From: abhilash1910 Date: Fri, 19 Dec 2025 18:44:08 +0000 Subject: [PATCH 22/50] refresh --- cuda_bindings/cuda/bindings/path_finder.py | 37 ---------------------- 1 file changed, 37 deletions(-) delete mode 100644 cuda_bindings/cuda/bindings/path_finder.py diff --git a/cuda_bindings/cuda/bindings/path_finder.py b/cuda_bindings/cuda/bindings/path_finder.py deleted file mode 100644 index 12c02b930b..0000000000 --- a/cuda_bindings/cuda/bindings/path_finder.py +++ /dev/null @@ -1,37 +0,0 @@ -# Copyright 2024-2025 NVIDIA Corporation. All rights reserved. -# -# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE - -from cuda.bindings._path_finder.cuda_paths import ( - get_conda_ctk, - get_conda_include_dir, - get_cuda_home, - get_cuda_paths, - get_current_cuda_target_name, - get_debian_pkg_libdevice, - get_libdevice_wheel, - get_nvidia_cudalib_ctk, - get_nvidia_libdevice_ctk, - get_nvidia_nvvm_ctk, - get_nvidia_static_cudalib_ctk, - get_system_ctk, -) -from cuda.bindings._path_finder.find_nvidia_dynamic_library import find_nvidia_dynamic_library -from cuda.bindings._path_finder.load_nvidia_dynamic_library import load_nvidia_dynamic_library - -__all__ = [ - "find_nvidia_dynamic_library", - "load_nvidia_dynamic_library", - "get_conda_ctk", - "get_conda_include_dir", - "get_cuda_home", - "get_cuda_paths", - "get_current_cuda_target_name", - "get_debian_pkg_libdevice", - "get_libdevice_wheel", - "get_nvidia_cudalib_ctk", - "get_nvidia_libdevice_ctk", - "get_nvidia_nvvm_ctk", - "get_nvidia_static_cudalib_ctk", - "get_system_ctk", -] \ No newline at end of file From 01f03e552806f65ae27defb8e5ce32a1aa79e7cc Mon Sep 17 00:00:00 2001 From: abhilash1910 Date: Fri, 19 Dec 2025 18:45:37 +0000 Subject: [PATCH 23/50] refresh --- .../cuda/bindings/_path_finder/cuda_paths.py | 400 ------------------ .../find_nvidia_dynamic_library.py | 139 ------ .../cuda/bindings/_path_finder/findlib.py | 69 --- .../load_nvidia_dynamic_library.py | 92 ---- .../_path_finder/sys_path_find_sub_dirs.py | 40 -- 5 files changed, 740 deletions(-) delete mode 100644 cuda_bindings/cuda/bindings/_path_finder/cuda_paths.py delete mode 100644 cuda_bindings/cuda/bindings/_path_finder/find_nvidia_dynamic_library.py delete mode 100644 cuda_bindings/cuda/bindings/_path_finder/findlib.py delete mode 100644 cuda_bindings/cuda/bindings/_path_finder/load_nvidia_dynamic_library.py delete mode 100644 cuda_bindings/cuda/bindings/_path_finder/sys_path_find_sub_dirs.py diff --git a/cuda_bindings/cuda/bindings/_path_finder/cuda_paths.py b/cuda_bindings/cuda/bindings/_path_finder/cuda_paths.py deleted file mode 100644 index c827699b3a..0000000000 --- a/cuda_bindings/cuda/bindings/_path_finder/cuda_paths.py +++ /dev/null @@ -1,400 +0,0 @@ -import os -import platform -import re -import site -import sys -import traceback -import warnings -from collections import namedtuple -from pathlib import Path - -from .findlib import find_file, find_lib - -IS_WIN32 = sys.platform.startswith("win32") - -_env_path_tuple = namedtuple("_env_path_tuple", ["by", "info"]) - - -def _get_numba_CUDA_INCLUDE_PATH(): - # From numba/numba/core/config.py - - def _readenv(name, ctor, default): - value = os.environ.get(name) - if value is None: - return default() if callable(default) else default - try: - return ctor(value) - except Exception: - warnings.warn( # noqa: B028 - f"Environment variable '{name}' is defined but " - f"its associated value '{value}' could not be " - "parsed.\nThe parse failed with exception:\n" - f"{traceback.format_exc()}", - RuntimeWarning, - ) - return default - - if IS_WIN32: - cuda_path = os.environ.get("CUDA_PATH") - if cuda_path: # noqa: SIM108 - default_cuda_include_path = os.path.join(cuda_path, "include") - else: - default_cuda_include_path = "cuda_include_not_found" - else: - default_cuda_include_path = os.path.join(os.sep, "usr", "local", "cuda", "include") - CUDA_INCLUDE_PATH = _readenv("NUMBA_CUDA_INCLUDE_PATH", str, default_cuda_include_path) - return CUDA_INCLUDE_PATH - - -config_CUDA_INCLUDE_PATH = _get_numba_CUDA_INCLUDE_PATH() - - -def _find_valid_path(options): - """Find valid path from *options*, which is a list of 2-tuple of - (name, path). Return first pair where *path* is not None. - If no valid path is found, return ('', None) - """ - for by, data in options: - if data is not None: - return by, data - else: - return "", None - - -def _get_libdevice_path_decision(): - options = [ - ("Conda environment", get_conda_ctk()), - ("Conda environment (NVIDIA package)", get_nvidia_libdevice_ctk()), - ("CUDA_HOME", get_cuda_home("nvvm", "libdevice")), - ("Debian package", get_debian_pkg_libdevice()), - ("NVIDIA NVCC Wheel", get_libdevice_wheel()), - ] - libdevice_ctk_dir = get_system_ctk("nvvm", "libdevice") - if libdevice_ctk_dir and os.path.exists(libdevice_ctk_dir): - options.append(("System", libdevice_ctk_dir)) - - by, libdir = _find_valid_path(options) - return by, libdir - - -def _nvvm_lib_dir(): - if IS_WIN32: - return "nvvm", "bin" - else: - return "nvvm", "lib64" - - -def _get_nvvm_path_decision(): - options = [ - ("Conda environment", get_conda_ctk()), - ("Conda environment (NVIDIA package)", get_nvidia_nvvm_ctk()), - ("CUDA_HOME", get_cuda_home(*_nvvm_lib_dir())), - ("NVIDIA NVCC Wheel", _get_nvvm_wheel()), - ] - # need to ensure nvvm dir actually exists - nvvm_ctk_dir = get_system_ctk(*_nvvm_lib_dir()) - if nvvm_ctk_dir and os.path.exists(nvvm_ctk_dir): - options.append(("System", nvvm_ctk_dir)) - - by, path = _find_valid_path(options) - return by, path - - -def _get_nvvm_wheel(): - site_paths = [site.getusersitepackages()] + site.getsitepackages() + ["conda", None] - for sp in site_paths: - # The SONAME is taken based on public CTK 12.x releases - if sys.platform.startswith("linux"): - dso_dir = "lib64" - # Hack: libnvvm from Linux wheel - # does not have any soname (CUDAINST-3183) - dso_path = "libnvvm.so" - elif sys.platform.startswith("win32"): - dso_dir = "bin" - dso_path = "nvvm64_40_0.dll" - else: - raise AssertionError() - - if sp is not None: - dso_dir = os.path.join(sp, "nvidia", "cuda_nvcc", "nvvm", dso_dir) - dso_path = os.path.join(dso_dir, dso_path) - if os.path.exists(dso_path): - return str(Path(dso_path).parent) - - -def _get_libdevice_paths(): - by, libdir = _get_libdevice_path_decision() - if by == "NVIDIA NVCC Wheel": - # The NVVM path is a directory, not a file - out = os.path.join(libdir, "libdevice.10.bc") - else: - # Search for pattern - pat = r"libdevice(\.\d+)*\.bc$" - candidates = find_file(re.compile(pat), libdir) - # Keep only the max (most recent version) of the bitcode files. - out = max(candidates, default=None) - return _env_path_tuple(by, out) - - -def _cudalib_path(): - if IS_WIN32: - return "bin" - else: - return "lib64" - - -def _cuda_home_static_cudalib_path(): - if IS_WIN32: - return ("lib", "x64") - else: - return ("lib64",) - - -def _get_cudalib_dir_path_decision(): - options = [ - ("Conda environment", get_conda_ctk()), - ("Conda environment (NVIDIA package)", get_nvidia_cudalib_ctk()), - ("CUDA_HOME", get_cuda_home(_cudalib_path())), - ("System", get_system_ctk(_cudalib_path())), - ] - by, libdir = _find_valid_path(options) - return by, libdir - - -def _get_static_cudalib_dir_path_decision(): - options = [ - ("Conda environment", get_conda_ctk()), - ("Conda environment (NVIDIA package)", get_nvidia_static_cudalib_ctk()), - ("CUDA_HOME", get_cuda_home(*_cuda_home_static_cudalib_path())), - ("System", get_system_ctk(_cudalib_path())), - ] - by, libdir = _find_valid_path(options) - return by, libdir - - -def _get_cudalib_dir(): - by, libdir = _get_cudalib_dir_path_decision() - return _env_path_tuple(by, libdir) - - -def _get_static_cudalib_dir(): - by, libdir = _get_static_cudalib_dir_path_decision() - return _env_path_tuple(by, libdir) - - -def get_system_ctk(*subdirs): - """Return path to system-wide cudatoolkit; or, None if it doesn't exist.""" - # Linux? - if sys.platform.startswith("linux"): - # Is cuda alias to /usr/local/cuda? - # We are intentionally not getting versioned cuda installation. - base = "/usr/local/cuda" - if os.path.exists(base): - return os.path.join(base, *subdirs) - - -def get_conda_ctk(): - """Return path to directory containing the shared libraries of cudatoolkit.""" - is_conda_env = os.path.exists(os.path.join(sys.prefix, "conda-meta")) - if not is_conda_env: - return - # Assume the existence of NVVM to imply cudatoolkit installed - paths = find_lib("nvvm") - if not paths: - return - # Use the directory name of the max path - return os.path.dirname(max(paths)) - - -def get_nvidia_nvvm_ctk(): - """Return path to directory containing the NVVM shared library.""" - is_conda_env = os.path.exists(os.path.join(sys.prefix, "conda-meta")) - if not is_conda_env: - return - - # Assume the existence of NVVM in the conda env implies that a CUDA toolkit - # conda package is installed. - - # First, try the location used on Linux and the Windows 11.x packages - libdir = os.path.join(sys.prefix, "nvvm", _cudalib_path()) - if not os.path.exists(libdir) or not os.path.isdir(libdir): - # If that fails, try the location used for Windows 12.x packages - libdir = os.path.join(sys.prefix, "Library", "nvvm", _cudalib_path()) - if not os.path.exists(libdir) or not os.path.isdir(libdir): - # If that doesn't exist either, assume we don't have the NVIDIA - # conda package - return - - paths = find_lib("nvvm", libdir=libdir) - if not paths: - return - # Use the directory name of the max path - return os.path.dirname(max(paths)) - - -def get_nvidia_libdevice_ctk(): - """Return path to directory containing the libdevice library.""" - nvvm_ctk = get_nvidia_nvvm_ctk() - if not nvvm_ctk: - return - nvvm_dir = os.path.dirname(nvvm_ctk) - return os.path.join(nvvm_dir, "libdevice") - - -def get_nvidia_cudalib_ctk(): - """Return path to directory containing the shared libraries of cudatoolkit.""" - nvvm_ctk = get_nvidia_nvvm_ctk() - if not nvvm_ctk: - return - env_dir = os.path.dirname(os.path.dirname(nvvm_ctk)) - subdir = "bin" if IS_WIN32 else "lib" - return os.path.join(env_dir, subdir) - - -def get_nvidia_static_cudalib_ctk(): - """Return path to directory containing the static libraries of cudatoolkit.""" - nvvm_ctk = get_nvidia_nvvm_ctk() - if not nvvm_ctk: - return - - if IS_WIN32 and ("Library" not in nvvm_ctk): # noqa: SIM108 - # Location specific to CUDA 11.x packages on Windows - dirs = ("Lib", "x64") - else: - # Linux, or Windows with CUDA 12.x packages - dirs = ("lib",) - - env_dir = os.path.dirname(os.path.dirname(nvvm_ctk)) - return os.path.join(env_dir, *dirs) - - -def get_cuda_home(*subdirs): - """Get paths of CUDA_HOME. - If *subdirs* are the subdirectory name to be appended in the resulting - path. - """ - cuda_home = os.environ.get("CUDA_HOME") - if cuda_home is None: - # Try Windows CUDA installation without Anaconda - cuda_home = os.environ.get("CUDA_PATH") - if cuda_home is not None: - return os.path.join(cuda_home, *subdirs) - - -def _get_nvvm_path(): - by, path = _get_nvvm_path_decision() - if by == "NVIDIA NVCC Wheel": - # The NVVM path is a directory, not a file - path = os.path.join(path, "libnvvm.so") - else: - candidates = find_lib("nvvm", path) - path = max(candidates) if candidates else None - return _env_path_tuple(by, path) - - -def get_cuda_paths(): - """Returns a dictionary mapping component names to a 2-tuple - of (source_variable, info). - The returned dictionary will have the following keys and infos: - - "nvvm": file_path - - "libdevice": List[Tuple[arch, file_path]] - - "cudalib_dir": directory_path - Note: The result of the function is cached. - """ - # Check cache - if hasattr(get_cuda_paths, "_cached_result"): - return get_cuda_paths._cached_result - else: - # Not in cache - d = { - "nvvm": _get_nvvm_path(), - "libdevice": _get_libdevice_paths(), - "cudalib_dir": _get_cudalib_dir(), - "static_cudalib_dir": _get_static_cudalib_dir(), - "include_dir": _get_include_dir(), - } - # Cache result - get_cuda_paths._cached_result = d - return d - - -def get_debian_pkg_libdevice(): - """ - Return the Debian NVIDIA Maintainers-packaged libdevice location, if it - exists. - """ - pkg_libdevice_location = "/usr/lib/nvidia-cuda-toolkit/libdevice" - if not os.path.exists(pkg_libdevice_location): - return None - return pkg_libdevice_location - - -def get_libdevice_wheel(): - nvvm_path = _get_nvvm_wheel() - if nvvm_path is None: - return None - nvvm_path = Path(nvvm_path) - libdevice_path = nvvm_path.parent / "libdevice" - - return str(libdevice_path) - - -def get_current_cuda_target_name(): - """Determine conda's CTK target folder based on system and machine arch. - CTK's conda package delivers headers based on its architecture type. For example, - `x86_64` machine places header under `$CONDA_PREFIX/targets/x86_64-linux`, and - `aarch64` places under `$CONDA_PREFIX/targets/sbsa-linux`. Read more about the - nuances at cudart's conda feedstock: - https://github.com/conda-forge/cuda-cudart-feedstock/blob/main/recipe/meta.yaml#L8-L11 # noqa: E501 - """ - system = platform.system() - machine = platform.machine() - - if system == "Linux": - arch_to_targets = {"x86_64": "x86_64-linux", "aarch64": "sbsa-linux"} - elif system == "Windows": - arch_to_targets = { - "AMD64": "x64", - } - else: - arch_to_targets = {} - - return arch_to_targets.get(machine, None) - - -def get_conda_include_dir(): - """ - Return the include directory in the current conda environment, if one - is active and it exists. - """ - is_conda_env = os.path.exists(os.path.join(sys.prefix, "conda-meta")) - if not is_conda_env: - return - - if platform.system() == "Windows": - include_dir = os.path.join(sys.prefix, "Library", "include") - elif target_name := get_current_cuda_target_name(): - include_dir = os.path.join(sys.prefix, "targets", target_name, "include") - else: - # A fallback when target cannot determined - # though usually it shouldn't. - include_dir = os.path.join(sys.prefix, "include") - - if ( - os.path.exists(include_dir) - and os.path.isdir(include_dir) - and os.path.exists(os.path.join(include_dir, "cuda_device_runtime_api.h")) - ): - return include_dir - return - - -def _get_include_dir(): - """Find the root include directory.""" - options = [ - ("Conda environment (NVIDIA package)", get_conda_include_dir()), - ("CUDA_INCLUDE_PATH Config Entry", config_CUDA_INCLUDE_PATH), - # TODO: add others - ] - by, include_dir = _find_valid_path(options) - return _env_path_tuple(by, include_dir) \ No newline at end of file diff --git a/cuda_bindings/cuda/bindings/_path_finder/find_nvidia_dynamic_library.py b/cuda_bindings/cuda/bindings/_path_finder/find_nvidia_dynamic_library.py deleted file mode 100644 index d8413282e6..0000000000 --- a/cuda_bindings/cuda/bindings/_path_finder/find_nvidia_dynamic_library.py +++ /dev/null @@ -1,139 +0,0 @@ -# Copyright 2024-2025 NVIDIA Corporation. All rights reserved. -# -# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE - -import functools -import glob -import os - -from .cuda_paths import IS_WIN32, get_cuda_paths -from .sys_path_find_sub_dirs import sys_path_find_sub_dirs - - -def _no_such_file_in_sub_dirs(sub_dirs, file_wild, error_messages, attachments): - error_messages.append(f"No such file: {file_wild}") - for sub_dir in sys_path_find_sub_dirs(sub_dirs): - attachments.append(f' listdir("{sub_dir}"):') - for node in sorted(os.listdir(sub_dir)): - attachments.append(f" {node}") - - -def _find_so_using_nvidia_lib_dirs(libname, so_basename, error_messages, attachments): - if libname == "nvvm": # noqa: SIM108 - nvidia_sub_dirs = ("nvidia", "*", "nvvm", "lib64") - else: - nvidia_sub_dirs = ("nvidia", "*", "lib") - file_wild = so_basename + "*" - for lib_dir in sys_path_find_sub_dirs(nvidia_sub_dirs): - # First look for an exact match - so_name = os.path.join(lib_dir, so_basename) - if os.path.isfile(so_name): - return so_name - # Look for a versioned library - # Using sort here mainly to make the result deterministic. - for node in sorted(glob.glob(os.path.join(lib_dir, file_wild))): - so_name = os.path.join(lib_dir, node) - if os.path.isfile(so_name): - return so_name - _no_such_file_in_sub_dirs(nvidia_sub_dirs, file_wild, error_messages, attachments) - return None - - -def _find_dll_using_nvidia_bin_dirs(libname, error_messages, attachments): - if libname == "nvvm": # noqa: SIM108 - nvidia_sub_dirs = ("nvidia", "*", "nvvm", "bin") - else: - nvidia_sub_dirs = ("nvidia", "*", "bin") - file_wild = libname + "*.dll" - for bin_dir in sys_path_find_sub_dirs(nvidia_sub_dirs): - for node in sorted(glob.glob(os.path.join(bin_dir, file_wild))): - dll_name = os.path.join(bin_dir, node) - if os.path.isfile(dll_name): - return dll_name - _no_such_file_in_sub_dirs(nvidia_sub_dirs, file_wild, error_messages, attachments) - return None - - -def _get_cuda_paths_info(key, error_messages): - env_path_tuple = get_cuda_paths()[key] - if not env_path_tuple: - error_messages.append(f'Failure obtaining get_cuda_paths()["{key}"]') - return None - if not env_path_tuple.info: - error_messages.append(f'Failure obtaining get_cuda_paths()["{key}"].info') - return None - return env_path_tuple.info - - -def _find_so_using_cudalib_dir(so_basename, error_messages, attachments): - cudalib_dir = _get_cuda_paths_info("cudalib_dir", error_messages) - if cudalib_dir is None: - return None - primary_so_dir = cudalib_dir + "/" - candidate_so_dirs = [primary_so_dir] - libs = ["/lib/", "/lib64/"] - for _ in range(2): - alt_dir = libs[0].join(primary_so_dir.rsplit(libs[1], 1)) - if alt_dir not in candidate_so_dirs: - candidate_so_dirs.append(alt_dir) - libs.reverse() - candidate_so_names = [so_dirname + so_basename for so_dirname in candidate_so_dirs] - error_messages = [] - for so_name in candidate_so_names: - if os.path.isfile(so_name): - return so_name - error_messages.append(f"No such file: {so_name}") - for so_dirname in candidate_so_dirs: - attachments.append(f' listdir("{so_dirname}"):') - if not os.path.isdir(so_dirname): - attachments.append(" DIRECTORY DOES NOT EXIST") - else: - for node in sorted(os.listdir(so_dirname)): - attachments.append(f" {node}") - return None - - -def _find_dll_using_cudalib_dir(libname, error_messages, attachments): - cudalib_dir = _get_cuda_paths_info("cudalib_dir", error_messages) - if cudalib_dir is None: - return None - file_wild = libname + "*.dll" - for node in sorted(glob.glob(os.path.join(cudalib_dir, file_wild))): - dll_name = os.path.join(cudalib_dir, node) - if os.path.isfile(dll_name): - return dll_name - error_messages.append(f"No such file: {file_wild}") - attachments.append(f' listdir("{cudalib_dir}"):') - for node in sorted(os.listdir(cudalib_dir)): - attachments.append(f" {node}") - return None - - -@functools.cache -def find_nvidia_dynamic_library(name: str) -> str: - error_messages = [] - attachments = [] - - if IS_WIN32: - dll_name = _find_dll_using_nvidia_bin_dirs(name, error_messages, attachments) - if dll_name is None: - if name == "nvvm": - dll_name = _get_cuda_paths_info("nvvm", error_messages) - else: - dll_name = _find_dll_using_cudalib_dir(name, error_messages, attachments) - if dll_name is None: - attachments = "\n".join(attachments) - raise RuntimeError(f"Failure finding {name}*.dll: {', '.join(error_messages)}\n{attachments}") - return dll_name - - so_basename = f"lib{name}.so" - so_name = _find_so_using_nvidia_lib_dirs(name, so_basename, error_messages, attachments) - if so_name is None: - if name == "nvvm": - so_name = _get_cuda_paths_info("nvvm", error_messages) - else: - so_name = _find_so_using_cudalib_dir(so_basename, error_messages, attachments) - if so_name is None: - attachments = "\n".join(attachments) - raise RuntimeError(f"Failure finding {so_basename}: {', '.join(error_messages)}\n{attachments}") - return so_name \ No newline at end of file diff --git a/cuda_bindings/cuda/bindings/_path_finder/findlib.py b/cuda_bindings/cuda/bindings/_path_finder/findlib.py deleted file mode 100644 index c64c3c9577..0000000000 --- a/cuda_bindings/cuda/bindings/_path_finder/findlib.py +++ /dev/null @@ -1,69 +0,0 @@ -# Forked from: -# https://github.com/numba/numba/blob/f0d24824fcd6a454827e3c108882395d00befc04/numba/misc/findlib.py - -import os -import re -import sys - - -def get_lib_dirs(): - """ - Anaconda specific - """ - if sys.platform == "win32": - # on windows, historically `DLLs` has been used for CUDA libraries, - # since approximately CUDA 9.2, `Library\bin` has been used. - dirnames = ["DLLs", os.path.join("Library", "bin")] - else: - dirnames = [ - "lib", - ] - libdirs = [os.path.join(sys.prefix, x) for x in dirnames] - return libdirs - - -DLLNAMEMAP = { - "linux": r"lib%(name)s\.so\.%(ver)s$", - "linux2": r"lib%(name)s\.so\.%(ver)s$", - "linux-static": r"lib%(name)s\.a$", - "darwin": r"lib%(name)s\.%(ver)s\.dylib$", - "win32": r"%(name)s%(ver)s\.dll$", - "win32-static": r"%(name)s\.lib$", - "bsd": r"lib%(name)s\.so\.%(ver)s$", -} - -RE_VER = r"[0-9]*([_\.][0-9]+)*" - - -def find_lib(libname, libdir=None, platform=None, static=False): - platform = platform or sys.platform - platform = "bsd" if "bsd" in platform else platform - if static: - platform = f"{platform}-static" - if platform not in DLLNAMEMAP: - # Return empty list if platform name is undefined. - # Not all platforms define their static library paths. - return [] - pat = DLLNAMEMAP[platform] % {"name": libname, "ver": RE_VER} - regex = re.compile(pat) - return find_file(regex, libdir) - - -def find_file(pat, libdir=None): - if libdir is None: - libdirs = get_lib_dirs() - elif isinstance(libdir, str): - libdirs = [ - libdir, - ] - else: - libdirs = list(libdir) - files = [] - for ldir in libdirs: - try: - entries = os.listdir(ldir) - except FileNotFoundError: - continue - candidates = [os.path.join(ldir, ent) for ent in entries if pat.match(ent)] - files.extend([c for c in candidates if os.path.isfile(c)]) - return files \ No newline at end of file diff --git a/cuda_bindings/cuda/bindings/_path_finder/load_nvidia_dynamic_library.py b/cuda_bindings/cuda/bindings/_path_finder/load_nvidia_dynamic_library.py deleted file mode 100644 index 69aadabcbf..0000000000 --- a/cuda_bindings/cuda/bindings/_path_finder/load_nvidia_dynamic_library.py +++ /dev/null @@ -1,92 +0,0 @@ -import functools -import sys - -if sys.platform == "win32": - import ctypes.wintypes - - import pywintypes - import win32api - - # Mirrors WinBase.h (unfortunately not defined already elsewhere) - _WINBASE_LOAD_LIBRARY_SEARCH_SYSTEM32 = 0x00000800 - -else: - import ctypes - import os - - _LINUX_CDLL_MODE = os.RTLD_NOW | os.RTLD_GLOBAL - -from .find_nvidia_dynamic_library import find_nvidia_dynamic_library - - -@functools.cache -def _windows_cuDriverGetVersion() -> int: - handle = win32api.LoadLibrary("nvcuda.dll") - - kernel32 = ctypes.WinDLL("kernel32", use_last_error=True) - GetProcAddress = kernel32.GetProcAddress - GetProcAddress.argtypes = [ctypes.wintypes.HMODULE, ctypes.wintypes.LPCSTR] - GetProcAddress.restype = ctypes.c_void_p - cuDriverGetVersion = GetProcAddress(handle, b"cuDriverGetVersion") - assert cuDriverGetVersion - - FUNC_TYPE = ctypes.CFUNCTYPE(ctypes.c_int, ctypes.POINTER(ctypes.c_int)) - cuDriverGetVersion_fn = FUNC_TYPE(cuDriverGetVersion) - driver_ver = ctypes.c_int() - err = cuDriverGetVersion_fn(ctypes.byref(driver_ver)) - assert err == 0 - return driver_ver.value - - -@functools.cache -def _windows_load_with_dll_basename(name: str) -> int: - driver_ver = _windows_cuDriverGetVersion() - del driver_ver # Keeping this here because it will probably be needed in the future. - - if name == "nvJitLink": - dll_name = "nvJitLink_120_0.dll" - elif name == "nvrtc": - dll_name = "nvrtc64_120_0.dll" - elif name == "nvvm": - dll_name = "nvvm64_40_0.dll" - - try: - return win32api.LoadLibrary(dll_name) - except pywintypes.error: - pass - - return None - - -@functools.cache -def load_nvidia_dynamic_library(name: str) -> int: - # First try using the platform-specific dynamic loader search mechanisms - if sys.platform == "win32": - handle = _windows_load_with_dll_basename(name) - if handle: - return handle - else: - dl_path = f"lib{name}.so" # Version intentionally no specified. - try: - handle = ctypes.CDLL(dl_path, _LINUX_CDLL_MODE) - except OSError: - pass - else: - # Use `cdef void* ptr = ` in cython to convert back to void* - return handle._handle # C unsigned int - - dl_path = find_nvidia_dynamic_library(name) - if sys.platform == "win32": - try: - handle = win32api.LoadLibrary(dl_path) - except pywintypes.error as e: - raise RuntimeError(f"Failed to load DLL at {dl_path}: {e}") from e - # Use `cdef void* ptr = ` in cython to convert back to void* - return handle # C signed int, matches win32api.GetProcAddress - else: - try: - handle = ctypes.CDLL(dl_path, _LINUX_CDLL_MODE) - except OSError as e: - raise RuntimeError(f"Failed to dlopen {dl_path}: {e}") from e - # Use `cdef void* ptr = ` in cython to convert back to void* - return handle._handle # C unsigned int \ No newline at end of file diff --git a/cuda_bindings/cuda/bindings/_path_finder/sys_path_find_sub_dirs.py b/cuda_bindings/cuda/bindings/_path_finder/sys_path_find_sub_dirs.py deleted file mode 100644 index 324cdeec30..0000000000 --- a/cuda_bindings/cuda/bindings/_path_finder/sys_path_find_sub_dirs.py +++ /dev/null @@ -1,40 +0,0 @@ -# Copyright 2024-2025 NVIDIA Corporation. All rights reserved. -# -# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE - -import functools -import os -import sys - - -@functools.cache -def _impl(sys_path, sub_dirs): - results = [] - for base in sys_path: - stack = [(base, 0)] # (current_path, index into sub_dirs) - while stack: - current_path, idx = stack.pop() - if idx == len(sub_dirs): - if os.path.isdir(current_path): - results.append(current_path) - continue - - sub = sub_dirs[idx] - if sub == "*": - try: - entries = sorted(os.listdir(current_path)) - except OSError: - continue - for entry in entries: - entry_path = os.path.join(current_path, entry) - if os.path.isdir(entry_path): - stack.append((entry_path, idx + 1)) - else: - next_path = os.path.join(current_path, sub) - if os.path.isdir(next_path): - stack.append((next_path, idx + 1)) - return results - - -def sys_path_find_sub_dirs(sub_dirs): - return _impl(tuple(sys.path), tuple(sub_dirs)) \ No newline at end of file From 9a5d5fea346410987266e9d85f32c84046306ad1 Mon Sep 17 00:00:00 2001 From: abhilash1910 Date: Fri, 19 Dec 2025 19:43:20 +0000 Subject: [PATCH 24/50] use cuda_pathfinder module for libdevice --- cuda_core/cuda/core/experimental/_program.py | 43 +----- cuda_pathfinder/cuda/pathfinder/__init__.py | 5 + .../_dynamic_libs/find_libdevice.py | 146 ++++++++++++++++++ 3 files changed, 154 insertions(+), 40 deletions(-) create mode 100644 cuda_pathfinder/cuda/pathfinder/_dynamic_libs/find_libdevice.py diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py index a5ad2a3be2..fedafbc4fb 100644 --- a/cuda_core/cuda/core/experimental/_program.py +++ b/cuda_core/cuda/core/experimental/_program.py @@ -103,46 +103,9 @@ def _get_nvvm_module(): raise e def _find_libdevice_path(): - """ - Find libdevice.10.bc using cuda.bindings.path_finder. - - Returns: - str: Path to libdevice.10.bc, or None if not found - """ - try: - from cuda.bindings.path_finder import ( - get_nvidia_libdevice_ctk, - get_libdevice_wheel, - get_debian_pkg_libdevice, - ) - - for getter in [get_nvidia_libdevice_ctk, get_libdevice_wheel, get_debian_pkg_libdevice]: - try: - result = getter() - if result is not None and result.info is not None: - return result.info - except Exception: - continue - - return None - except ImportError: - import os - - # CUDA_HOME - cuda_home = os.environ.get("CUDA_HOME") or os.environ.get("CUDA_PATH") - if cuda_home: - libdevice_path = os.path.join(cuda_home, "nvvm", "libdevice", "libdevice.10.bc") - if os.path.isfile(libdevice_path): - return libdevice_path - - # Linux paths - for base in ["/usr/local/cuda", "/opt/cuda"]: - libdevice_path = os.path.join(base, "nvvm", "libdevice", "libdevice.10.bc") - if os.path.isfile(libdevice_path): - return libdevice_path - - return None - + """Find libdevice.10.bc for NVVM compilation using cuda.pathfinder.""" + from cuda.pathfinder import get_libdevice_path + return get_libdevice_path() def _process_define_macro_inner(formatted_options, macro): if isinstance(macro, str): diff --git a/cuda_pathfinder/cuda/pathfinder/__init__.py b/cuda_pathfinder/cuda/pathfinder/__init__.py index 143c4b45cc..8820917318 100644 --- a/cuda_pathfinder/cuda/pathfinder/__init__.py +++ b/cuda_pathfinder/cuda/pathfinder/__init__.py @@ -6,6 +6,11 @@ from cuda.pathfinder._dynamic_libs.load_dl_common import DynamicLibNotFoundError as DynamicLibNotFoundError from cuda.pathfinder._dynamic_libs.load_dl_common import LoadedDL as LoadedDL from cuda.pathfinder._dynamic_libs.load_nvidia_dynamic_lib import load_nvidia_dynamic_lib as load_nvidia_dynamic_lib +from cuda.pathfinder._dynamic_libs.find_libdevice import ( + LibdeviceNotFoundError as LibdeviceNotFoundError, + find_libdevice as find_libdevice, + get_libdevice_path as get_libdevice_path, +) from cuda.pathfinder._dynamic_libs.supported_nvidia_libs import ( SUPPORTED_LIBNAMES as SUPPORTED_NVIDIA_LIBNAMES, # noqa: F401 ) diff --git a/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/find_libdevice.py b/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/find_libdevice.py new file mode 100644 index 0000000000..f1c0763765 --- /dev/null +++ b/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/find_libdevice.py @@ -0,0 +1,146 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +import functools +import glob +import os + +from cuda.pathfinder._dynamic_libs.supported_nvidia_libs import IS_WINDOWS +from cuda.pathfinder._utils.find_sub_dirs import find_sub_dirs_all_sitepackages + +# Site-package paths for libdevice (following SITE_PACKAGES_LIBDIRS pattern) +SITE_PACKAGES_LIBDEVICE_DIRS = ( + "nvidia/cuda_nvvm/nvvm/libdevice", # CTK 13+ + "nvidia/cuda_nvcc/nvvm/libdevice", # CTK <13 +) + + +class LibdeviceNotFoundError(Exception): + pass + + +def _get_cuda_home_or_path() -> str | None: + return os.environ.get("CUDA_HOME") or os.environ.get("CUDA_PATH") + + +def _no_such_file_in_dir( + dir_path: str, filename: str, error_messages: list[str], attachments: list[str] +) -> None: + error_messages.append(f"No such file: {os.path.join(dir_path, filename)}") + if os.path.isdir(dir_path): + attachments.append(f' listdir("{dir_path}"):') + for node in sorted(os.listdir(dir_path)): + attachments.append(f" {node}") + else: + attachments.append(f' Directory does not exist: "{dir_path}"') + + +class _FindLibdevice: + FILENAME = "libdevice.10.bc" + REL_PATH = os.path.join("nvvm", "libdevice") + + def __init__(self): + self.error_messages: list[str] = [] + self.attachments: list[str] = [] + self.abs_path: str | None = None + + def try_site_packages(self) -> str | None: + for rel_dir in SITE_PACKAGES_LIBDEVICE_DIRS: + sub_dir = tuple(rel_dir.split("/")) + for abs_dir in find_sub_dirs_all_sitepackages(sub_dir): + file_path = os.path.join(abs_dir, self.FILENAME) + if os.path.isfile(file_path): + return file_path + return None + + def try_with_conda_prefix(self) -> str | None: + conda_prefix = os.environ.get("CONDA_PREFIX") + if not conda_prefix: + return None + + anchor = os.path.join(conda_prefix, "Library") if IS_WINDOWS else conda_prefix + file_path = os.path.join(anchor, self.REL_PATH, self.FILENAME) + if os.path.isfile(file_path): + return file_path + return None + + def try_with_cuda_home(self) -> str | None: + cuda_home = _get_cuda_home_or_path() + if cuda_home is None: + self.error_messages.append("CUDA_HOME/CUDA_PATH not set") + return None + + file_path = os.path.join(cuda_home, self.REL_PATH, self.FILENAME) + if os.path.isfile(file_path): + return file_path + + _no_such_file_in_dir( + os.path.join(cuda_home, self.REL_PATH), + self.FILENAME, + self.error_messages, + self.attachments, + ) + return None + + def try_common_paths(self) -> str | None: + if IS_WINDOWS: + bases = [r"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA", r"C:\CUDA"] + else: + bases = ["/usr/local/cuda", "/opt/cuda"] + + for base in bases: + # Direct path + file_path = os.path.join(base, self.REL_PATH, self.FILENAME) + if os.path.isfile(file_path): + return file_path + # Versioned paths (e.g., /usr/local/cuda-13.0) + for versioned in sorted(glob.glob(base + "*"), reverse=True): + if os.path.isdir(versioned): + file_path = os.path.join(versioned, self.REL_PATH, self.FILENAME) + if os.path.isfile(file_path): + return file_path + return None + + def raise_not_found_error(self) -> None: + err = ", ".join(self.error_messages) if self.error_messages else "No search paths available" + att = "\n".join(self.attachments) if self.attachments else "" + raise LibdeviceNotFoundError( + f'Failure finding "{self.FILENAME}": {err}\n{att}' + ) + + +@functools.cache +def find_libdevice() -> str: + """Find the path to libdevice.10.bc. + Raises: + LibdeviceNotFoundError: If libdevice.10.bc cannot be found + """ + finder = _FindLibdevice() + + abs_path = finder.try_site_packages() + if abs_path is None: + abs_path = finder.try_with_conda_prefix() + if abs_path is None: + abs_path = finder.try_with_cuda_home() + if abs_path is None: + abs_path = finder.try_common_paths() + + if abs_path is None: + finder.raise_not_found_error() + + return abs_path + + +def get_libdevice_path() -> str | None: + """Get the path to libdevice.10.bc, or None if not found.""" + finder = _FindLibdevice() + + abs_path = finder.try_site_packages() + if abs_path is None: + abs_path = finder.try_with_conda_prefix() + if abs_path is None: + abs_path = finder.try_with_cuda_home() + if abs_path is None: + abs_path = finder.try_common_paths() + + return abs_path + From 07c619958cbdbdda7360837ac6c982cbefb3b2f6 Mon Sep 17 00:00:00 2001 From: abhilash1910 Date: Thu, 5 Feb 2026 17:37:45 +0000 Subject: [PATCH 25/50] rebase --- cuda_core/cuda/core/experimental/_program.py | 46 +++++++++----------- cuda_core/tests/test_program.py | 32 +++++++------- 2 files changed, 37 insertions(+), 41 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py index fedafbc4fb..47b78fd831 100644 --- a/cuda_core/cuda/core/experimental/_program.py +++ b/cuda_core/cuda/core/experimental/_program.py @@ -13,11 +13,11 @@ if TYPE_CHECKING: import cuda.bindings -from cuda.core.experimental._device import Device -from cuda.core.experimental._linker import Linker, LinkerHandleT, LinkerOptions -from cuda.core.experimental._module import ObjectCode -from cuda.core.experimental._utils.clear_error_support import assert_type -from cuda.core.experimental._utils.cuda_utils import ( +from cuda.core._device import Device +from cuda.core._linker import Linker, LinkerHandleT, LinkerOptions +from cuda.core._module import ObjectCode +from cuda.core._utils.clear_error_support import assert_type +from cuda.core._utils.cuda_utils import ( CUDAError, _handle_boolean_option, check_or_create_options, @@ -102,10 +102,6 @@ def _get_nvvm_module(): _nvvm_module = None raise e -def _find_libdevice_path(): - """Find libdevice.10.bc for NVVM compilation using cuda.pathfinder.""" - from cuda.pathfinder import get_libdevice_path - return get_libdevice_path() def _process_define_macro_inner(formatted_options, macro): if isinstance(macro, str): @@ -118,7 +114,11 @@ def _process_define_macro_inner(formatted_options, macro): return True return False - +def _find_libdevice_path(): + """Find libdevice.10.bc for NVVM compilation using cuda.pathfinder.""" + from cuda.pathfinder import get_libdevice_path + return get_libdevice_path() + def _process_define_macro(formatted_options, macro): union_type = "Union[str, tuple[str, str]]" if _process_define_macro_inner(formatted_options, macro): @@ -236,13 +236,9 @@ class ProgramOptions: no_display_error_number : bool, optional Disable the display of a diagnostic number for warning messages. Default: False - diag_error : Union[int, list[int]], optional - Emit error for a specified diagnostic message number or comma separated list of numbers. - Default: None - diag_suppress : Union[int, list[int]], optional - Suppress a specified diagnostic message number or comma separated list of numbers. - Default: None - diag_warn : Union[int, list[int]], optional + diag_error: Union[int, list[int], tuple[int]] | None = None + diag_suppress: Union[int, list[int], tuple[int]] | None = None + diag_warn: Union[int, list[int], tuple[int]] | None = None Emit warning for a specified diagnostic message number or comma separated lis of numbers. Default: None brief_diagnostics : bool, optional @@ -339,7 +335,6 @@ class ProgramOptions: split_compile: int | None = None fdevice_syntax_only: bool | None = None minimal: bool | None = None - # Creating as 2 tuples ((names, source), (names,source)) extra_sources: ( Union[List[Tuple[str, Union[str, bytes, bytearray]]], Tuple[Tuple[str, Union[str, bytes, bytearray]]]] | None ) = None @@ -356,7 +351,7 @@ class ProgramOptions: pch_messages: bool | None = None instantiate_templates_in_pch: bool | None = None numba_debug: bool | None = None # Custom option for Numba debugging - use_libdevice: bool | None = None # Use libdevice + use_libdevice: bool | None = None # For libdevice execution def __post_init__(self): self._name = self.name.encode() @@ -501,7 +496,7 @@ def _prepare_nvrtc_options(self) -> list[bytes]: options.append("--numba-debug") return [o.encode() for o in options] - def _prepare_nvvm_options(self, as_bytes: bool = True) -> Union[list[bytes], list[str]]: + def _prepare_nvvm_options(self, as_bytes: bool = True) -> Union[list[bytes], list[str]]: options = [] # Options supported by NVVM @@ -690,11 +685,11 @@ def __init__(self, code, code_type, options: ProgramOptions = None): if code_type == "c++": assert_type(code, str) # TODO: support pre-loaded headers & include names - + # TODO: allow tuples once NVIDIA/cuda-python#72 is resolved if options.extra_sources is not None: raise ValueError("extra_sources is not supported by the NVRTC backend (C++ code_type)") - # TODO: allow tuples once NVIDIA/cuda-python#72 is resolved + self._mnff.handle = handle_return(nvrtc.nvrtcCreateProgram(code.encode(), options._name, 0, [], [])) self._mnff.backend = "NVRTC" self._backend = "NVRTC" @@ -704,7 +699,6 @@ def __init__(self, code, code_type, options: ProgramOptions = None): assert_type(code, str) if options.extra_sources is not None: raise ValueError("extra_sources is not supported by the PTX backend.") - self._linker = Linker( ObjectCode._init(code.encode(), code_type), options=self._translate_program_options(options) ) @@ -872,7 +866,7 @@ def compile(self, target_type, name_expressions=(), logs=None): nvvm = _get_nvvm_module() with _nvvm_exception_manager(self): nvvm.verify_program(self._mnff.handle, len(nvvm_options), nvvm_options) - # Invoke libdevice + # libdevice compilation if getattr(self, '_use_libdevice', False): libdevice_path = _find_libdevice_path() if libdevice_path is None: @@ -882,7 +876,7 @@ def compile(self, target_type, name_expressions=(), logs=None): ) with open(libdevice_path, "rb") as f: libdevice_bc = f.read() - # Use lazy_add_module for libdevice bitcode only following numba-cuda + # libdevice for numba-cuda nvvm.lazy_add_module_to_program(self._mnff.handle, libdevice_bc, len(libdevice_bc), None) nvvm.compile_program(self._mnff.handle, len(nvvm_options), nvvm_options) @@ -923,4 +917,4 @@ def handle(self) -> ProgramHandleT: This handle is a Python object. To get the memory address of the underlying C handle, call ``int(Program.handle)``. """ - return self._mnff.handle + return self._mnff.handle \ No newline at end of file diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py index 00d0709870..c8effbb825 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE @@ -6,11 +6,11 @@ import warnings import pytest -from cuda.core.experimental import _linker -from cuda.core.experimental._device import Device -from cuda.core.experimental._module import Kernel, ObjectCode -from cuda.core.experimental._program import Program, ProgramOptions -from cuda.core.experimental._utils.cuda_utils import CUDAError, driver, handle_return +from cuda.core import _linker +from cuda.core._device import Device +from cuda.core._module import Kernel, ObjectCode +from cuda.core._program import Program, ProgramOptions +from cuda.core._utils.cuda_utils import CUDAError, driver, handle_return cuda_driver_version = handle_return(driver.cuDriverGetVersion()) is_culink_backend = _linker._decide_nvjitlink_or_driver() @@ -21,12 +21,11 @@ _test_helpers_available = True except ImportError: _test_helpers_available = False - - + def _is_nvvm_available(): """Check if NVVM is available.""" try: - from cuda.core.experimental._program import _get_nvvm_module + from cuda.core._program import _get_nvvm_module _get_nvvm_module() return True @@ -39,7 +38,7 @@ def _is_nvvm_available(): ) try: - from cuda.core.experimental._utils.cuda_utils import driver, handle_return + from cuda.core._utils.cuda_utils import driver, handle_return, nvrtc _cuda_driver_version = handle_return(driver.cuDriverGetVersion()) except Exception: @@ -49,6 +48,7 @@ def _is_nvvm_available(): def _get_nvrtc_version_for_tests(): """ Get NVRTC version. + Returns: int: Version in format major * 1000 + minor * 100 (e.g., 13200 for CUDA 13.2) None: If NVRTC is not available @@ -60,6 +60,7 @@ def _get_nvrtc_version_for_tests(): except Exception: return None + _libnvvm_version = None _libnvvm_version_attempted = False @@ -97,7 +98,7 @@ def _get_libnvvm_version_for_tests(): _libnvvm_version_attempted = True try: - from cuda.core.experimental._program import _get_nvvm_module + from cuda.core._program import _get_nvvm_module nvvm = _get_nvvm_module() @@ -145,7 +146,7 @@ def nvvm_ir(): fallback assumes no version metadata will be present in the input nvvm ir """ - from cuda.core.experimental._program import _get_nvvm_module + from cuda.core._program import _get_nvvm_module nvvm = _get_nvvm_module() major, minor, debug_major, debug_minor = nvvm.ir_version() @@ -340,7 +341,7 @@ def test_cpp_program_with_pch_options(init_cuda, tmp_path): @pytest.mark.parametrize("options", options) def test_ptx_program_with_various_options(init_cuda, ptx_code_object, options): - program = Program(ptx_code_object._module.decode(), "ptx", options=options) + program = Program(ptx_code_object.code.decode(), "ptx", options=options) assert program.backend == ("driver" if is_culink_backend else "nvJitLink") program.compile("cubin") program.close() @@ -383,7 +384,7 @@ def test_program_compile_valid_target_type(init_cuda): ptx_kernel = ptx_object_code.get_kernel("my_kernel") assert isinstance(ptx_kernel, Kernel) - program = Program(ptx_object_code._module.decode(), "ptx", options={"name": "24"}) + program = Program(ptx_object_code.code.decode(), "ptx", options={"name": "24"}) cubin_object_code = program.compile("cubin") assert isinstance(cubin_object_code, ObjectCode) assert cubin_object_code.name == "24" @@ -420,7 +421,7 @@ def test_program_close(): @nvvm_available def test_nvvm_deferred_import(): """Test that our deferred NVVM import works correctly""" - from cuda.core.experimental._program import _get_nvvm_module + from cuda.core._program import _get_nvvm_module nvvm = _get_nvvm_module() assert nvvm is not None @@ -689,6 +690,7 @@ def test_cpp_program_with_extra_sources(): options = ProgramOptions(extra_sources=helper) with pytest.raises(ValueError, match="extra_sources is not supported by the NVRTC backend"): Program(code, "c++", options) + def test_program_options_as_bytes_nvrtc(): """Test ProgramOptions.as_bytes() for NVRTC backend""" options = ProgramOptions(arch="sm_80", debug=True, lineinfo=True, ftz=True) From e1b19cc8f51e754a7b997c6a5efc69f1514a4c83 Mon Sep 17 00:00:00 2001 From: abhilash1910 Date: Thu, 5 Feb 2026 17:42:05 +0000 Subject: [PATCH 26/50] rebase --- cuda_core/cuda/core/{experimental => }/_program.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename cuda_core/cuda/core/{experimental => }/_program.py (100%) diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/_program.py similarity index 100% rename from cuda_core/cuda/core/experimental/_program.py rename to cuda_core/cuda/core/_program.py From dcdd1009bfdf4f998010f065a892746f32444fd5 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Fri, 6 Feb 2026 12:22:31 -0800 Subject: [PATCH 27/50] tests --- cuda_pathfinder/tests/test_find_libdevice.py | 94 ++++++++++++++++++++ 1 file changed, 94 insertions(+) create mode 100644 cuda_pathfinder/tests/test_find_libdevice.py diff --git a/cuda_pathfinder/tests/test_find_libdevice.py b/cuda_pathfinder/tests/test_find_libdevice.py new file mode 100644 index 0000000000..2d24f397fe --- /dev/null +++ b/cuda_pathfinder/tests/test_find_libdevice.py @@ -0,0 +1,94 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +import os + +import pytest + +from cuda.pathfinder import find_libdevice +from cuda.pathfinder._dynamic_libs import find_libdevice as find_libdevice_module + +FILENAME = "libdevice.10.bc" + +SITE_PACKAGES_REL_DIR_CUDA12 = "nvidia/cuda_nvcc/nvvm/libdevice" +SITE_PACKAGES_REL_DIR_CUDA13 = "nvidia/cuda_nvvm/nvvm/libdevice" + + +@pytest.fixture +def clear_find_libdevice_cache(): + find_libdevice.cache_clear() + yield + find_libdevice.cache_clear() + + +def _make_libdevice_file(dir_path: str) -> str: + os.makedirs(dir_path, exist_ok=True) + file_path = os.path.join(dir_path, FILENAME) + with open(file_path, "wb"): + pass + return file_path + + +@pytest.mark.parametrize("rel_dir", [SITE_PACKAGES_REL_DIR_CUDA12, SITE_PACKAGES_REL_DIR_CUDA13]) +@pytest.mark.usefixtures("clear_find_libdevice_cache") +def test_find_libdevice_via_site_packages(monkeypatch, mocker, tmp_path, rel_dir): + libdevice_dir = tmp_path.joinpath(*rel_dir.split("/")) + expected_path = str(_make_libdevice_file(str(libdevice_dir))) + + mocker.patch.object( + find_libdevice_module, + "find_sub_dirs_all_sitepackages", + return_value=[str(libdevice_dir)], + ) + monkeypatch.delenv("CONDA_PREFIX", raising=False) + monkeypatch.delenv("CUDA_HOME", raising=False) + monkeypatch.delenv("CUDA_PATH", raising=False) + + result = find_libdevice() + + assert result == expected_path + assert os.path.isfile(result) + + +# same for cu12/cu13 +@pytest.mark.usefixtures("clear_find_libdevice_cache") +def test_find_libdevice_via_conda(monkeypatch, mocker, tmp_path): + rel_path = os.path.join("nvvm", "libdevice") + libdevice_dir = tmp_path / rel_path + expected_path = str(_make_libdevice_file(str(libdevice_dir))) + + mocker.patch.object(find_libdevice_module, "IS_WINDOWS", False) + mocker.patch.object( + find_libdevice_module, + "find_sub_dirs_all_sitepackages", + return_value=[], + ) + monkeypatch.setenv("CONDA_PREFIX", str(tmp_path)) + monkeypatch.delenv("CUDA_HOME", raising=False) + monkeypatch.delenv("CUDA_PATH", raising=False) + + result = find_libdevice() + + assert result == expected_path + assert os.path.isfile(result) + + +@pytest.mark.usefixtures("clear_find_libdevice_cache") +def test_find_libdevice_via_cuda_home(monkeypatch, mocker, tmp_path): + rel_path = os.path.join("nvvm", "libdevice") + libdevice_dir = tmp_path / rel_path + expected_path = str(_make_libdevice_file(str(libdevice_dir))) + + mocker.patch.object( + find_libdevice_module, + "find_sub_dirs_all_sitepackages", + return_value=[], + ) + monkeypatch.delenv("CONDA_PREFIX", raising=False) + monkeypatch.setenv("CUDA_HOME", str(tmp_path)) + monkeypatch.delenv("CUDA_PATH", raising=False) + + result = find_libdevice() + + assert result == expected_path + assert os.path.isfile(result) From aca2e361a4d503b7e289dd31db86842a8fff1384 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 11 Feb 2026 07:42:44 -0800 Subject: [PATCH 28/50] Address reviews --- cuda_core/tests/test_program.py | 12 +-- .../_dynamic_libs/find_libdevice.py | 97 ++++++++----------- 2 files changed, 43 insertions(+), 66 deletions(-) diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py index c8effbb825..bf099c860a 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -15,13 +15,7 @@ cuda_driver_version = handle_return(driver.cuDriverGetVersion()) is_culink_backend = _linker._decide_nvjitlink_or_driver() -try: - from cuda_python_test_helpers.nvvm_bitcode import minimal_nvvmir - _test_helpers_available = True -except ImportError: - _test_helpers_available = False - def _is_nvvm_available(): """Check if NVVM is available.""" try: @@ -658,7 +652,6 @@ def test_nvvm_program_with_multiple_extra_sources(): @nvvm_available -@pytest.mark.skipif(not _test_helpers_available, reason="cuda_python_test_helpers not accessible") def test_bitcode_format(minimal_nvvmir): if len(minimal_nvvmir) < 4: pytest.skip("Bitcode file is not valid or empty") @@ -690,7 +683,8 @@ def test_cpp_program_with_extra_sources(): options = ProgramOptions(extra_sources=helper) with pytest.raises(ValueError, match="extra_sources is not supported by the NVRTC backend"): Program(code, "c++", options) - + + def test_program_options_as_bytes_nvrtc(): """Test ProgramOptions.as_bytes() for NVRTC backend""" options = ProgramOptions(arch="sm_80", debug=True, lineinfo=True, ftz=True) @@ -738,4 +732,4 @@ def test_program_options_as_bytes_nvvm_unsupported_option(): """Test that unsupported options raise CUDAError for NVVM backend""" options = ProgramOptions(arch="sm_80", lineinfo=True) with pytest.raises(CUDAError, match="not supported by NVVM backend"): - options.as_bytes("nvvm") \ No newline at end of file + options.as_bytes("nvvm") diff --git a/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/find_libdevice.py b/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/find_libdevice.py index f1c0763765..29296cfbde 100644 --- a/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/find_libdevice.py +++ b/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/find_libdevice.py @@ -1,30 +1,27 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 import functools import glob import os from cuda.pathfinder._dynamic_libs.supported_nvidia_libs import IS_WINDOWS +from cuda.pathfinder._utils.env_vars import get_cuda_home_or_path from cuda.pathfinder._utils.find_sub_dirs import find_sub_dirs_all_sitepackages # Site-package paths for libdevice (following SITE_PACKAGES_LIBDIRS pattern) SITE_PACKAGES_LIBDEVICE_DIRS = ( - "nvidia/cuda_nvvm/nvvm/libdevice", # CTK 13+ - "nvidia/cuda_nvcc/nvvm/libdevice", # CTK <13 + "nvidia/cuda_nvvm/nvvm/libdevice", # CTK 13+ + "nvidia/cuda_nvcc/nvvm/libdevice", # CTK <13 ) - -class LibdeviceNotFoundError(Exception): - pass +FILENAME = "libdevice.10.bc" -def _get_cuda_home_or_path() -> str | None: - return os.environ.get("CUDA_HOME") or os.environ.get("CUDA_PATH") +class LibdeviceNotFoundError(RuntimeError): + pass -def _no_such_file_in_dir( - dir_path: str, filename: str, error_messages: list[str], attachments: list[str] -) -> None: +def _no_such_file_in_dir(dir_path: str, filename: str, error_messages: list[str], attachments: list[str]) -> None: error_messages.append(f"No such file: {os.path.join(dir_path, filename)}") if os.path.isdir(dir_path): attachments.append(f' listdir("{dir_path}"):') @@ -35,105 +32,80 @@ def _no_such_file_in_dir( class _FindLibdevice: - FILENAME = "libdevice.10.bc" REL_PATH = os.path.join("nvvm", "libdevice") - - def __init__(self): + + def __init__(self) -> None: self.error_messages: list[str] = [] self.attachments: list[str] = [] self.abs_path: str | None = None - + def try_site_packages(self) -> str | None: for rel_dir in SITE_PACKAGES_LIBDEVICE_DIRS: sub_dir = tuple(rel_dir.split("/")) for abs_dir in find_sub_dirs_all_sitepackages(sub_dir): - file_path = os.path.join(abs_dir, self.FILENAME) + file_path = os.path.join(abs_dir, FILENAME) if os.path.isfile(file_path): return file_path return None - + def try_with_conda_prefix(self) -> str | None: conda_prefix = os.environ.get("CONDA_PREFIX") if not conda_prefix: return None - + anchor = os.path.join(conda_prefix, "Library") if IS_WINDOWS else conda_prefix - file_path = os.path.join(anchor, self.REL_PATH, self.FILENAME) + file_path = os.path.join(anchor, self.REL_PATH, FILENAME) if os.path.isfile(file_path): return file_path return None - + def try_with_cuda_home(self) -> str | None: - cuda_home = _get_cuda_home_or_path() + cuda_home = get_cuda_home_or_path() if cuda_home is None: self.error_messages.append("CUDA_HOME/CUDA_PATH not set") return None - - file_path = os.path.join(cuda_home, self.REL_PATH, self.FILENAME) + + file_path = os.path.join(cuda_home, self.REL_PATH, FILENAME) if os.path.isfile(file_path): return file_path - + _no_such_file_in_dir( os.path.join(cuda_home, self.REL_PATH), - self.FILENAME, + FILENAME, self.error_messages, self.attachments, ) return None - + def try_common_paths(self) -> str | None: if IS_WINDOWS: bases = [r"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA", r"C:\CUDA"] else: bases = ["/usr/local/cuda", "/opt/cuda"] - + for base in bases: # Direct path - file_path = os.path.join(base, self.REL_PATH, self.FILENAME) + file_path = os.path.join(base, self.REL_PATH, FILENAME) if os.path.isfile(file_path): return file_path # Versioned paths (e.g., /usr/local/cuda-13.0) for versioned in sorted(glob.glob(base + "*"), reverse=True): if os.path.isdir(versioned): - file_path = os.path.join(versioned, self.REL_PATH, self.FILENAME) + file_path = os.path.join(versioned, self.REL_PATH, FILENAME) if os.path.isfile(file_path): return file_path return None - + def raise_not_found_error(self) -> None: err = ", ".join(self.error_messages) if self.error_messages else "No search paths available" att = "\n".join(self.attachments) if self.attachments else "" - raise LibdeviceNotFoundError( - f'Failure finding "{self.FILENAME}": {err}\n{att}' - ) - - -@functools.cache -def find_libdevice() -> str: - """Find the path to libdevice.10.bc. - Raises: - LibdeviceNotFoundError: If libdevice.10.bc cannot be found - """ - finder = _FindLibdevice() - - abs_path = finder.try_site_packages() - if abs_path is None: - abs_path = finder.try_with_conda_prefix() - if abs_path is None: - abs_path = finder.try_with_cuda_home() - if abs_path is None: - abs_path = finder.try_common_paths() - - if abs_path is None: - finder.raise_not_found_error() - - return abs_path + raise LibdeviceNotFoundError(f'Failure finding "{FILENAME}": {err}\n{att}') def get_libdevice_path() -> str | None: """Get the path to libdevice.10.bc, or None if not found.""" finder = _FindLibdevice() - + abs_path = finder.try_site_packages() if abs_path is None: abs_path = finder.try_with_conda_prefix() @@ -141,6 +113,17 @@ def get_libdevice_path() -> str | None: abs_path = finder.try_with_cuda_home() if abs_path is None: abs_path = finder.try_common_paths() - + return abs_path + +@functools.cache +def find_libdevice() -> str: + """Find the path to libdevice*.bc. + Raises: + LibdeviceNotFoundError: If libdevice.10.bc cannot be found + """ + path_or_none = get_libdevice_path() + if path_or_none is None: + raise LibdeviceNotFoundError(f"{FILENAME} not found") + return path_or_none From af6e70ac8e3e92c8be073ce2b362763fa7f5a087 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 11 Feb 2026 07:50:35 -0800 Subject: [PATCH 29/50] put libdevice stuff under _static_libs --- cuda_pathfinder/cuda/pathfinder/__init__.py | 16 ++++++++++------ .../find_libdevice.py | 0 cuda_pathfinder/tests/test_find_libdevice.py | 2 +- 3 files changed, 11 insertions(+), 7 deletions(-) rename cuda_pathfinder/cuda/pathfinder/{_dynamic_libs => _static_libs}/find_libdevice.py (100%) diff --git a/cuda_pathfinder/cuda/pathfinder/__init__.py b/cuda_pathfinder/cuda/pathfinder/__init__.py index 3b2d44eb5d..ece060d81a 100644 --- a/cuda_pathfinder/cuda/pathfinder/__init__.py +++ b/cuda_pathfinder/cuda/pathfinder/__init__.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 """cuda.pathfinder public APIs""" @@ -10,11 +10,6 @@ from cuda.pathfinder._dynamic_libs.load_dl_common import DynamicLibNotFoundError as DynamicLibNotFoundError from cuda.pathfinder._dynamic_libs.load_dl_common import LoadedDL as LoadedDL from cuda.pathfinder._dynamic_libs.load_nvidia_dynamic_lib import load_nvidia_dynamic_lib as load_nvidia_dynamic_lib -from cuda.pathfinder._dynamic_libs.find_libdevice import ( - LibdeviceNotFoundError as LibdeviceNotFoundError, - find_libdevice as find_libdevice, - get_libdevice_path as get_libdevice_path, -) from cuda.pathfinder._dynamic_libs.supported_nvidia_libs import ( SUPPORTED_LIBNAMES as SUPPORTED_NVIDIA_LIBNAMES, # noqa: F401 ) @@ -24,6 +19,15 @@ locate_nvidia_header_directory as locate_nvidia_header_directory, ) from cuda.pathfinder._headers.supported_nvidia_headers import SUPPORTED_HEADERS_CTK as _SUPPORTED_HEADERS_CTK +from cuda.pathfinder._static_libs.find_libdevice import ( + LibdeviceNotFoundError as LibdeviceNotFoundError, +) +from cuda.pathfinder._static_libs.find_libdevice import ( + find_libdevice as find_libdevice, +) +from cuda.pathfinder._static_libs.find_libdevice import ( + get_libdevice_path as get_libdevice_path, +) from cuda.pathfinder._version import __version__ # isort: skip # noqa: F401 diff --git a/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/find_libdevice.py b/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py similarity index 100% rename from cuda_pathfinder/cuda/pathfinder/_dynamic_libs/find_libdevice.py rename to cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py diff --git a/cuda_pathfinder/tests/test_find_libdevice.py b/cuda_pathfinder/tests/test_find_libdevice.py index 2d24f397fe..a1dfc0b64f 100644 --- a/cuda_pathfinder/tests/test_find_libdevice.py +++ b/cuda_pathfinder/tests/test_find_libdevice.py @@ -6,7 +6,7 @@ import pytest from cuda.pathfinder import find_libdevice -from cuda.pathfinder._dynamic_libs import find_libdevice as find_libdevice_module +from cuda.pathfinder._static_libs import find_libdevice as find_libdevice_module FILENAME = "libdevice.10.bc" From b1d423f591426283f7948237684fa6f1aff6c560 Mon Sep 17 00:00:00 2001 From: abhilash1910 Date: Wed, 11 Feb 2026 16:32:14 +0000 Subject: [PATCH 30/50] refresh reviews --- .../pathfinder/_static_libs/find_libdevice.py | 22 ++++++++----------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py b/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py index 29296cfbde..0cfcd4e493 100644 --- a/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py +++ b/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py @@ -7,6 +7,7 @@ from cuda.pathfinder._dynamic_libs.supported_nvidia_libs import IS_WINDOWS from cuda.pathfinder._utils.env_vars import get_cuda_home_or_path from cuda.pathfinder._utils.find_sub_dirs import find_sub_dirs_all_sitepackages +from cuda.pathfinder._dynamic_libs.load_dl_common import DynamicLibNotFoundError as DynamicLibNotFoundError # Site-package paths for libdevice (following SITE_PACKAGES_LIBDIRS pattern) SITE_PACKAGES_LIBDEVICE_DIRS = ( @@ -15,11 +16,10 @@ ) FILENAME = "libdevice.10.bc" - - -class LibdeviceNotFoundError(RuntimeError): - pass - +if IS_WINDOWS: + bases = [r"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA", r"C:\CUDA"] +else: + bases = ["/usr/local/cuda", "/opt/cuda"] def _no_such_file_in_dir(dir_path: str, filename: str, error_messages: list[str], attachments: list[str]) -> None: error_messages.append(f"No such file: {os.path.join(dir_path, filename)}") @@ -78,10 +78,6 @@ def try_with_cuda_home(self) -> str | None: return None def try_common_paths(self) -> str | None: - if IS_WINDOWS: - bases = [r"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA", r"C:\CUDA"] - else: - bases = ["/usr/local/cuda", "/opt/cuda"] for base in bases: # Direct path @@ -99,11 +95,11 @@ def try_common_paths(self) -> str | None: def raise_not_found_error(self) -> None: err = ", ".join(self.error_messages) if self.error_messages else "No search paths available" att = "\n".join(self.attachments) if self.attachments else "" - raise LibdeviceNotFoundError(f'Failure finding "{FILENAME}": {err}\n{att}') + raise DynamicLibNotFoundError(f'Failure finding "{FILENAME}": {err}\n{att}') def get_libdevice_path() -> str | None: - """Get the path to libdevice.10.bc, or None if not found.""" + """Get the path to libdevice*.bc, or None if not found.""" finder = _FindLibdevice() abs_path = finder.try_site_packages() @@ -121,9 +117,9 @@ def get_libdevice_path() -> str | None: def find_libdevice() -> str: """Find the path to libdevice*.bc. Raises: - LibdeviceNotFoundError: If libdevice.10.bc cannot be found + DynamicLibNotFoundError: If libdevice.10.bc cannot be found """ path_or_none = get_libdevice_path() if path_or_none is None: - raise LibdeviceNotFoundError(f"{FILENAME} not found") + raise DynamicLibNotFoundError(f"{FILENAME} not found") return path_or_none From 4cedbb7c2bb6790c6f8e4f7a60319c28663d412c Mon Sep 17 00:00:00 2001 From: abhilash1910 Date: Wed, 11 Feb 2026 17:37:12 +0000 Subject: [PATCH 31/50] change program to cython per PR 1565 --- cuda_core/cuda/core/_program.pxd | 17 + cuda_core/cuda/core/_program.py | 925 -------------------------- cuda_core/cuda/core/_program.pyx | 1038 ++++++++++++++++++++++++++++++ 3 files changed, 1055 insertions(+), 925 deletions(-) create mode 100644 cuda_core/cuda/core/_program.pxd delete mode 100644 cuda_core/cuda/core/_program.py create mode 100644 cuda_core/cuda/core/_program.pyx diff --git a/cuda_core/cuda/core/_program.pxd b/cuda_core/cuda/core/_program.pxd new file mode 100644 index 0000000000..d4abe85ff8 --- /dev/null +++ b/cuda_core/cuda/core/_program.pxd @@ -0,0 +1,17 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +from ._resource_handles cimport NvrtcProgramHandle, NvvmProgramHandle + + +cdef class Program: + cdef: + NvrtcProgramHandle _h_nvrtc + NvvmProgramHandle _h_nvvm + str _backend + object _linker # Linker + object _options # ProgramOptions + object __weakref__ + bint _use_libdevice # Flag for libdevice loading + int _module_count diff --git a/cuda_core/cuda/core/_program.py b/cuda_core/cuda/core/_program.py deleted file mode 100644 index 34eac95270..0000000000 --- a/cuda_core/cuda/core/_program.py +++ /dev/null @@ -1,925 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# SPDX-License-Identifier: Apache-2.0 - -from __future__ import annotations - -import weakref -from contextlib import contextmanager -from dataclasses import dataclass -from typing import TYPE_CHECKING, Union -from warnings import warn - -if TYPE_CHECKING: - import cuda.bindings - -from cuda.core._device import Device -from cuda.core._linker import Linker, LinkerHandleT, LinkerOptions -from cuda.core._module import ObjectCode -from cuda.core._utils.clear_error_support import assert_type -from cuda.core._utils.cuda_utils import ( - CUDAError, - _handle_boolean_option, - check_or_create_options, - driver, - get_binding_version, - handle_return, - is_nested_sequence, - is_sequence, - nvrtc, -) - - -@contextmanager -def _nvvm_exception_manager(self): - """ - Taken from _linker.py - """ - try: - yield - except Exception as e: - error_log = "" - if hasattr(self, "_mnff"): - try: - nvvm = _get_nvvm_module() - logsize = nvvm.get_program_log_size(self._mnff.handle) - if logsize > 1: - log = bytearray(logsize) - nvvm.get_program_log(self._mnff.handle, log) - error_log = log.decode("utf-8", errors="backslashreplace") - except Exception: - error_log = "" - # Starting Python 3.11 we could also use Exception.add_note() for the same purpose, but - # unfortunately we are still supporting Python 3.10... - e.args = (e.args[0] + (f"\nNVVM program log: {error_log}" if error_log else ""), *e.args[1:]) - raise e - - -_nvvm_module = None -_nvvm_import_attempted = False - - -def _get_nvvm_module(): - """ - Handles the import of NVVM module with version and availability checks. - NVVM bindings were added in cuda-bindings 12.9.0, so we need to handle cases where: - 1. cuda.bindings is not new enough (< 12.9.0) - 2. libnvvm is not found in the Python environment - - Returns: - The nvvm module if available and working - - Raises: - RuntimeError: If NVVM is not available due to version or library issues - """ - global _nvvm_module, _nvvm_import_attempted - - if _nvvm_import_attempted: - if _nvvm_module is None: - raise RuntimeError("NVVM module is not available (previous import attempt failed)") - return _nvvm_module - - _nvvm_import_attempted = True - - try: - version = get_binding_version() - if version < (12, 9): - raise RuntimeError( - f"NVVM bindings require cuda-bindings >= 12.9.0, but found {version[0]}.{version[1]}.x. " - "Please update cuda-bindings to use NVVM features." - ) - - from cuda.bindings import nvvm - from cuda.bindings._internal.nvvm import _inspect_function_pointer - - if _inspect_function_pointer("__nvvmCreateProgram") == 0: - raise RuntimeError("NVVM library (libnvvm) is not available in this Python environment. ") - - _nvvm_module = nvvm - return _nvvm_module - - except RuntimeError as e: - _nvvm_module = None - raise e - - -def _process_define_macro_inner(formatted_options, macro): - if isinstance(macro, str): - formatted_options.append(f"--define-macro={macro}") - return True - if isinstance(macro, tuple): - if len(macro) != 2 or any(not isinstance(val, str) for val in macro): - raise RuntimeError(f"Expected define_macro tuple[str, str], got {macro}") - formatted_options.append(f"--define-macro={macro[0]}={macro[1]}") - return True - return False - -def _find_libdevice_path(): - """Find libdevice.10.bc for NVVM compilation using cuda.pathfinder.""" - from cuda.pathfinder import get_libdevice_path - return get_libdevice_path() - -def _process_define_macro(formatted_options, macro): - union_type = "Union[str, tuple[str, str]]" - if _process_define_macro_inner(formatted_options, macro): - return - if is_nested_sequence(macro): - for seq_macro in macro: - if not _process_define_macro_inner(formatted_options, seq_macro): - raise RuntimeError(f"Expected define_macro {union_type}, got {seq_macro}") - return - raise RuntimeError(f"Expected define_macro {union_type}, list[{union_type}], got {macro}") - - -@dataclass -class ProgramOptions: - """Customizable options for configuring `Program`. - - Attributes - ---------- - name : str, optional - Name of the program. If the compilation succeeds, the name is passed down to the generated `ObjectCode`. - arch : str, optional - Pass the SM architecture value, such as ``sm_`` (for generating CUBIN) or - ``compute_`` (for generating PTX). If not provided, the current device's architecture - will be used. - relocatable_device_code : bool, optional - Enable (disable) the generation of relocatable device code. - Default: False - extensible_whole_program : bool, optional - Do extensible whole program compilation of device code. - Default: False - debug : bool, optional - Generate debug information. If --dopt is not specified, then turns off all optimizations. - Default: False - lineinfo: bool, optional - Generate line-number information. - Default: False - device_code_optimize : bool, optional - Enable device code optimization. When specified along with ‘-G’, enables limited debug information generation - for optimized device code. - Default: None - ptxas_options : Union[str, list[str]], optional - Specify one or more options directly to ptxas, the PTX optimizing assembler. Options should be strings. - For example ["-v", "-O2"]. - Default: None - max_register_count : int, optional - Specify the maximum amount of registers that GPU functions can use. - Default: None - ftz : bool, optional - When performing single-precision floating-point operations, flush denormal values to zero or preserve denormal - values. - Default: False - prec_sqrt : bool, optional - For single-precision floating-point square root, use IEEE round-to-nearest mode or use a faster approximation. - Default: True - prec_div : bool, optional - For single-precision floating-point division and reciprocals, use IEEE round-to-nearest mode or use a faster - approximation. - Default: True - fma : bool, optional - Enables (disables) the contraction of floating-point multiplies and adds/subtracts into floating-point - multiply-add operations. - Default: True - use_fast_math : bool, optional - Make use of fast math operations. - Default: False - extra_device_vectorization : bool, optional - Enables more aggressive device code vectorization in the NVVM optimizer. - Default: False - link_time_optimization : bool, optional - Generate intermediate code for later link-time optimization. - Default: False - gen_opt_lto : bool, optional - Run the optimizer passes before generating the LTO IR. - Default: False - define_macro : Union[str, tuple[str, str], list[Union[str, tuple[str, str]]]], optional - Predefine a macro. Can be either a string, in which case that macro will be set to 1, a 2 element tuple of - strings, in which case the first element is defined as the second, or a list of strings or tuples. - Default: None - undefine_macro : Union[str, list[str]], optional - Cancel any previous definition of a macro, or list of macros. - Default: None - include_path : Union[str, list[str]], optional - Add the directory or directories to the list of directories to be searched for headers. - Default: None - pre_include : Union[str, list[str]], optional - Preinclude one or more headers during preprocessing. Can be either a string or a list of strings. - Default: None - no_source_include : bool, optional - Disable the default behavior of adding the directory of each input source to the include path. - Default: False - std : str, optional - Set language dialect to C++03, C++11, C++14, C++17 or C++20. - Default: c++17 - builtin_move_forward : bool, optional - Provide builtin definitions of std::move and std::forward. - Default: True - builtin_initializer_list : bool, optional - Provide builtin definitions of std::initializer_list class and member functions. - Default: True - disable_warnings : bool, optional - Inhibit all warning messages. - Default: False - restrict : bool, optional - Programmer assertion that all kernel pointer parameters are restrict pointers. - Default: False - device_as_default_execution_space : bool, optional - Treat entities with no execution space annotation as __device__ entities. - Default: False - device_int128 : bool, optional - Allow the __int128 type in device code. - Default: False - optimization_info : str, optional - Provide optimization reports for the specified kind of optimization. - Default: None - no_display_error_number : bool, optional - Disable the display of a diagnostic number for warning messages. - Default: False - diag_error : Union[int, list[int]], optional - Emit error for a specified diagnostic message number or comma separated list of numbers. - Default: None - diag_suppress : Union[int, list[int]], optional - Suppress a specified diagnostic message number or comma separated list of numbers. - Default: None - diag_warn : Union[int, list[int]], optional - Emit warning for a specified diagnostic message number or comma separated lis of numbers. - Default: None - brief_diagnostics : bool, optional - Disable or enable showing source line and column info in a diagnostic. - Default: False - time : str, optional - Generate a CSV table with the time taken by each compilation phase. - Default: None - split_compile : int, optional - Perform compiler optimizations in parallel. - Default: 1 - fdevice_syntax_only : bool, optional - Ends device compilation after front-end syntax checking. - Default: False - minimal : bool, optional - Omit certain language features to reduce compile time for small programs. - Default: False - no_cache : bool, optional - Disable compiler caching. - Default: False - fdevice_time_trace : str, optional - Generate time trace JSON for profiling compilation (NVRTC only). - Default: None - device_float128 : bool, optional - Allow __float128 type in device code (NVRTC only). - Default: False - frandom_seed : str, optional - Set random seed for randomized optimizations (NVRTC only). - Default: None - ofast_compile : str, optional - Fast compilation mode: "0", "min", "mid", or "max" (NVRTC only). - Default: None - pch : bool, optional - Use default precompiled header (NVRTC only, CUDA 12.8+). - Default: False - create_pch : str, optional - Create precompiled header file (NVRTC only, CUDA 12.8+). - Default: None - use_pch : str, optional - Use specific precompiled header file (NVRTC only, CUDA 12.8+). - Default: None - pch_dir : str, optional - PCH directory location (NVRTC only, CUDA 12.8+). - Default: None - pch_verbose : bool, optional - Verbose PCH output (NVRTC only, CUDA 12.8+). - Default: False - pch_messages : bool, optional - Control PCH diagnostic messages (NVRTC only, CUDA 12.8+). - Default: False - instantiate_templates_in_pch : bool, optional - Control template instantiation in PCH (NVRTC only, CUDA 12.8+). - Default: False - """ - - name: str | None = "default_program" - arch: str | None = None - relocatable_device_code: bool | None = None - extensible_whole_program: bool | None = None - debug: bool | None = None - lineinfo: bool | None = None - device_code_optimize: bool | None = None - ptxas_options: Union[str, list[str], tuple[str]] | None = None - max_register_count: int | None = None - ftz: bool | None = None - prec_sqrt: bool | None = None - prec_div: bool | None = None - fma: bool | None = None - use_fast_math: bool | None = None - extra_device_vectorization: bool | None = None - link_time_optimization: bool | None = None - gen_opt_lto: bool | None = None - define_macro: ( - Union[str, tuple[str, str], list[Union[str, tuple[str, str]]], tuple[Union[str, tuple[str, str]]]] | None - ) = None - undefine_macro: Union[str, list[str], tuple[str]] | None = None - include_path: Union[str, list[str], tuple[str]] | None = None - pre_include: Union[str, list[str], tuple[str]] | None = None - no_source_include: bool | None = None - std: str | None = None - builtin_move_forward: bool | None = None - builtin_initializer_list: bool | None = None - disable_warnings: bool | None = None - restrict: bool | None = None - device_as_default_execution_space: bool | None = None - device_int128: bool | None = None - optimization_info: str | None = None - no_display_error_number: bool | None = None - diag_error: Union[int, list[int], tuple[int]] | None = None - diag_suppress: Union[int, list[int], tuple[int]] | None = None - diag_warn: Union[int, list[int], tuple[int]] | None = None - brief_diagnostics: bool | None = None - time: str | None = None - split_compile: int | None = None - fdevice_syntax_only: bool | None = None - minimal: bool | None = None - extra_sources: ( - Union[List[Tuple[str, Union[str, bytes, bytearray]]], Tuple[Tuple[str, Union[str, bytes, bytearray]]]] | None - ) = None - no_cache: bool | None = None - fdevice_time_trace: str | None = None - device_float128: bool | None = None - frandom_seed: str | None = None - ofast_compile: str | None = None - pch: bool | None = None - create_pch: str | None = None - use_pch: str | None = None - pch_dir: str | None = None - pch_verbose: bool | None = None - pch_messages: bool | None = None - instantiate_templates_in_pch: bool | None = None - numba_debug: bool | None = None # Custom option for Numba debugging - use_libdevice: bool | None = None # For libdevice execution - - def __post_init__(self): - self._name = self.name.encode() - # Set arch to default if not provided - if self.arch is None: - self.arch = f"sm_{Device().arch}" - - def _prepare_nvrtc_options(self) -> list[bytes]: - # Build NVRTC-specific options - options = [f"-arch={self.arch}"] - if self.relocatable_device_code is not None: - options.append(f"--relocatable-device-code={_handle_boolean_option(self.relocatable_device_code)}") - if self.extensible_whole_program is not None and self.extensible_whole_program: - options.append("--extensible-whole-program") - if self.debug is not None and self.debug: - options.append("--device-debug") - if self.lineinfo is not None and self.lineinfo: - options.append("--generate-line-info") - if self.device_code_optimize is not None and self.device_code_optimize: - options.append("--dopt=on") - if self.ptxas_options is not None: - opt_name = "--ptxas-options" - if isinstance(self.ptxas_options, str): - options.append(f"{opt_name}={self.ptxas_options}") - elif is_sequence(self.ptxas_options): - for opt_value in self.ptxas_options: - options.append(f"{opt_name}={opt_value}") - if self.max_register_count is not None: - options.append(f"--maxrregcount={self.max_register_count}") - if self.ftz is not None: - options.append(f"--ftz={_handle_boolean_option(self.ftz)}") - if self.prec_sqrt is not None: - options.append(f"--prec-sqrt={_handle_boolean_option(self.prec_sqrt)}") - if self.prec_div is not None: - options.append(f"--prec-div={_handle_boolean_option(self.prec_div)}") - if self.fma is not None: - options.append(f"--fmad={_handle_boolean_option(self.fma)}") - if self.use_fast_math is not None and self.use_fast_math: - options.append("--use_fast_math") - if self.extra_device_vectorization is not None and self.extra_device_vectorization: - options.append("--extra-device-vectorization") - if self.link_time_optimization is not None and self.link_time_optimization: - options.append("--dlink-time-opt") - if self.gen_opt_lto is not None and self.gen_opt_lto: - options.append("--gen-opt-lto") - if self.define_macro is not None: - _process_define_macro(options, self.define_macro) - if self.undefine_macro is not None: - if isinstance(self.undefine_macro, str): - options.append(f"--undefine-macro={self.undefine_macro}") - elif is_sequence(self.undefine_macro): - for macro in self.undefine_macro: - options.append(f"--undefine-macro={macro}") - if self.include_path is not None: - if isinstance(self.include_path, str): - options.append(f"--include-path={self.include_path}") - elif is_sequence(self.include_path): - for path in self.include_path: - options.append(f"--include-path={path}") - if self.pre_include is not None: - if isinstance(self.pre_include, str): - options.append(f"--pre-include={self.pre_include}") - elif is_sequence(self.pre_include): - for header in self.pre_include: - options.append(f"--pre-include={header}") - if self.no_source_include is not None and self.no_source_include: - options.append("--no-source-include") - if self.std is not None: - options.append(f"--std={self.std}") - if self.builtin_move_forward is not None: - options.append(f"--builtin-move-forward={_handle_boolean_option(self.builtin_move_forward)}") - if self.builtin_initializer_list is not None: - options.append(f"--builtin-initializer-list={_handle_boolean_option(self.builtin_initializer_list)}") - if self.disable_warnings is not None and self.disable_warnings: - options.append("--disable-warnings") - if self.restrict is not None and self.restrict: - options.append("--restrict") - if self.device_as_default_execution_space is not None and self.device_as_default_execution_space: - options.append("--device-as-default-execution-space") - if self.device_int128 is not None and self.device_int128: - options.append("--device-int128") - if self.device_float128 is not None and self.device_float128: - options.append("--device-float128") - if self.optimization_info is not None: - options.append(f"--optimization-info={self.optimization_info}") - if self.no_display_error_number is not None and self.no_display_error_number: - options.append("--no-display-error-number") - if self.diag_error is not None: - if isinstance(self.diag_error, int): - options.append(f"--diag-error={self.diag_error}") - elif is_sequence(self.diag_error): - for error in self.diag_error: - options.append(f"--diag-error={error}") - if self.diag_suppress is not None: - if isinstance(self.diag_suppress, int): - options.append(f"--diag-suppress={self.diag_suppress}") - elif is_sequence(self.diag_suppress): - for suppress in self.diag_suppress: - options.append(f"--diag-suppress={suppress}") - if self.diag_warn is not None: - if isinstance(self.diag_warn, int): - options.append(f"--diag-warn={self.diag_warn}") - elif is_sequence(self.diag_warn): - for warn in self.diag_warn: - options.append(f"--diag-warn={warn}") - if self.brief_diagnostics is not None: - options.append(f"--brief-diagnostics={_handle_boolean_option(self.brief_diagnostics)}") - if self.time is not None: - options.append(f"--time={self.time}") - if self.split_compile is not None: - options.append(f"--split-compile={self.split_compile}") - if self.fdevice_syntax_only is not None and self.fdevice_syntax_only: - options.append("--fdevice-syntax-only") - if self.minimal is not None and self.minimal: - options.append("--minimal") - if self.no_cache is not None and self.no_cache: - options.append("--no-cache") - if self.fdevice_time_trace is not None: - options.append(f"--fdevice-time-trace={self.fdevice_time_trace}") - if self.frandom_seed is not None: - options.append(f"--frandom-seed={self.frandom_seed}") - if self.ofast_compile is not None: - options.append(f"--Ofast-compile={self.ofast_compile}") - # PCH options (CUDA 12.8+) - if self.pch is not None and self.pch: - options.append("--pch") - if self.create_pch is not None: - options.append(f"--create-pch={self.create_pch}") - if self.use_pch is not None: - options.append(f"--use-pch={self.use_pch}") - if self.pch_dir is not None: - options.append(f"--pch-dir={self.pch_dir}") - if self.pch_verbose is not None: - options.append(f"--pch-verbose={_handle_boolean_option(self.pch_verbose)}") - if self.pch_messages is not None: - options.append(f"--pch-messages={_handle_boolean_option(self.pch_messages)}") - if self.instantiate_templates_in_pch is not None: - options.append( - f"--instantiate-templates-in-pch={_handle_boolean_option(self.instantiate_templates_in_pch)}" - ) - if self.numba_debug: - options.append("--numba-debug") - return [o.encode() for o in options] - - def _prepare_nvvm_options(self, as_bytes: bool = True) -> Union[list[bytes], list[str]]: - - options = [] - - # Options supported by NVVM - assert self.arch is not None - arch = self.arch - if arch.startswith("sm_"): - arch = f"compute_{arch[3:]}" - options.append(f"-arch={arch}") - if self.debug is not None and self.debug: - options.append("-g") - if self.device_code_optimize is False: - options.append("-opt=0") - elif self.device_code_optimize is True: - options.append("-opt=3") - # NVVM uses 0/1 instead of true/false for boolean options - if self.ftz is not None: - options.append(f"-ftz={'1' if self.ftz else '0'}") - if self.prec_sqrt is not None: - options.append(f"-prec-sqrt={'1' if self.prec_sqrt else '0'}") - if self.prec_div is not None: - options.append(f"-prec-div={'1' if self.prec_div else '0'}") - if self.fma is not None: - options.append(f"-fma={'1' if self.fma else '0'}") - - # Check for unsupported options and raise error if they are set - unsupported = [] - if self.relocatable_device_code is not None: - unsupported.append("relocatable_device_code") - if self.extensible_whole_program is not None and self.extensible_whole_program: - unsupported.append("extensible_whole_program") - if self.lineinfo is not None and self.lineinfo: - unsupported.append("lineinfo") - if self.ptxas_options is not None: - unsupported.append("ptxas_options") - if self.max_register_count is not None: - unsupported.append("max_register_count") - if self.use_fast_math is not None and self.use_fast_math: - unsupported.append("use_fast_math") - if self.extra_device_vectorization is not None and self.extra_device_vectorization: - unsupported.append("extra_device_vectorization") - if self.gen_opt_lto is not None and self.gen_opt_lto: - unsupported.append("gen_opt_lto") - if self.define_macro is not None: - unsupported.append("define_macro") - if self.undefine_macro is not None: - unsupported.append("undefine_macro") - if self.include_path is not None: - unsupported.append("include_path") - if self.pre_include is not None: - unsupported.append("pre_include") - if self.no_source_include is not None and self.no_source_include: - unsupported.append("no_source_include") - if self.std is not None: - unsupported.append("std") - if self.builtin_move_forward is not None: - unsupported.append("builtin_move_forward") - if self.builtin_initializer_list is not None: - unsupported.append("builtin_initializer_list") - if self.disable_warnings is not None and self.disable_warnings: - unsupported.append("disable_warnings") - if self.restrict is not None and self.restrict: - unsupported.append("restrict") - if self.device_as_default_execution_space is not None and self.device_as_default_execution_space: - unsupported.append("device_as_default_execution_space") - if self.device_int128 is not None and self.device_int128: - unsupported.append("device_int128") - if self.optimization_info is not None: - unsupported.append("optimization_info") - if self.no_display_error_number is not None and self.no_display_error_number: - unsupported.append("no_display_error_number") - if self.diag_error is not None: - unsupported.append("diag_error") - if self.diag_suppress is not None: - unsupported.append("diag_suppress") - if self.diag_warn is not None: - unsupported.append("diag_warn") - if self.brief_diagnostics is not None: - unsupported.append("brief_diagnostics") - if self.time is not None: - unsupported.append("time") - if self.split_compile is not None: - unsupported.append("split_compile") - if self.fdevice_syntax_only is not None and self.fdevice_syntax_only: - unsupported.append("fdevice_syntax_only") - if self.minimal is not None and self.minimal: - unsupported.append("minimal") - if self.numba_debug is not None and self.numba_debug: - unsupported.append("numba_debug") - if unsupported: - raise CUDAError(f"The following options are not supported by NVVM backend: {', '.join(unsupported)}") - - if as_bytes: - return [o.encode() for o in options] - else: - return options - - def as_bytes(self, backend: str) -> list[bytes]: - """Convert program options to bytes format for the specified backend. - - This method transforms the program options into a format suitable for the - specified compiler backend. Different backends may use different option names - and formats even for the same conceptual options. - - Parameters - ---------- - backend : str - The compiler backend to prepare options for. Must be either "nvrtc" or "nvvm". - - Returns - ------- - list[bytes] - List of option strings encoded as bytes. - - Raises - ------ - ValueError - If an unknown backend is specified. - CUDAError - If an option incompatible with the specified backend is set. - - Examples - -------- - >>> options = ProgramOptions(arch="sm_80", debug=True) - >>> nvrtc_options = options.as_bytes("nvrtc") - """ - backend = backend.lower() - if backend == "nvrtc": - return self._prepare_nvrtc_options() - elif backend == "nvvm": - return self._prepare_nvvm_options(as_bytes=True) - else: - raise ValueError(f"Unknown backend '{backend}'. Must be one of: 'nvrtc', 'nvvm'") - - def __repr__(self): - return f"ProgramOptions(name={self.name!r}, arch={self.arch!r})" - - -ProgramHandleT = Union["cuda.bindings.nvrtc.nvrtcProgram", LinkerHandleT] - - -class Program: - """Represent a compilation machinery to process programs into - :obj:`~_module.ObjectCode`. - - This object provides a unified interface to multiple underlying - compiler libraries. Compilation support is enabled for a wide - range of code types and compilation types. - - Parameters - ---------- - code : Any - String of the CUDA Runtime Compilation program. - code_type : Any - String of the code type. Currently ``"ptx"``, ``"c++"``, and ``"nvvm"`` are supported. - options : ProgramOptions, optional - A ProgramOptions object to customize the compilation process. - See :obj:`ProgramOptions` for more information. - """ - - class _MembersNeededForFinalize: - __slots__ = "handle", "backend" - - def __init__(self, program_obj, handle, backend): - self.handle = handle - self.backend = backend - weakref.finalize(program_obj, self.close) - - def close(self): - if self.handle is not None: - if self.backend == "NVRTC": - handle_return(nvrtc.nvrtcDestroyProgram(self.handle)) - elif self.backend == "NVVM": - nvvm = _get_nvvm_module() - nvvm.destroy_program(self.handle) - self.handle = None - - __slots__ = ("__weakref__", "_mnff", "_backend", "_linker", "_options", "_module_count") - - def __init__(self, code, code_type, options: ProgramOptions = None): - self._mnff = Program._MembersNeededForFinalize(self, None, None) - - self._options = options = check_or_create_options(ProgramOptions, options, "Program options") - code_type = code_type.lower() - self._module_count = 0 - - if code_type == "c++": - assert_type(code, str) - # TODO: support pre-loaded headers & include names - # TODO: allow tuples once NVIDIA/cuda-python#72 is resolved - if options.extra_sources is not None: - raise ValueError("extra_sources is not supported by the NVRTC backend (C++ code_type)") - - - self._mnff.handle = handle_return(nvrtc.nvrtcCreateProgram(code.encode(), options._name, 0, [], [])) - self._mnff.backend = "NVRTC" - self._backend = "NVRTC" - self._linker = None - - elif code_type == "ptx": - assert_type(code, str) - if options.extra_sources is not None: - raise ValueError("extra_sources is not supported by the PTX backend.") - self._linker = Linker( - ObjectCode._init(code.encode(), code_type), options=self._translate_program_options(options) - ) - self._backend = self._linker.backend - - elif code_type == "nvvm": - if isinstance(code, str): - code = code.encode("utf-8") - elif not isinstance(code, (bytes, bytearray)): - raise TypeError("NVVM IR code must be provided as str, bytes, or bytearray") - - nvvm = _get_nvvm_module() - self._mnff.handle = nvvm.create_program() - self._mnff.backend = "NVVM" - nvvm.add_module_to_program(self._mnff.handle, code, len(code), options._name.decode()) - self._module_count = 1 - # Add extra modules if provided - if options.extra_sources is not None: - if not is_sequence(options.extra_sources): - raise TypeError( - "extra_modules must be a sequence of 2-tuples:((name1, source1), (name2, source2), ...)" - ) - for i, module in enumerate(options.extra_sources): - if not isinstance(module, tuple) or len(module) != 2: - raise TypeError( - f"Each extra module must be a 2-tuple (name, source)" - f", got {type(module).__name__} at index {i}" - ) - - module_name, module_source = module - - if not isinstance(module_name, str): - raise TypeError(f"Module name at index {i} must be a string,got {type(module_name).__name__}") - - if isinstance(module_source, str): - # Textual LLVM IR - encode to UTF-8 bytes - module_source = module_source.encode("utf-8") - elif not isinstance(module_source, (bytes, bytearray)): - raise TypeError( - f"Module source at index {i} must be str (textual LLVM IR), bytes (textual LLVM IR or bitcode), " - f"or bytearray, got {type(module_source).__name__}" - ) - - if len(module_source) == 0: - raise ValueError(f"Module source for '{module_name}' (index {i}) cannot be empty") - - nvvm.add_module_to_program(self._mnff.handle, module_source, len(module_source), module_name) - self._module_count += 1 - - self._use_libdevice = options.use_libdevice - self._backend = "NVVM" - self._linker = None - - else: - supported_code_types = ("c++", "ptx", "nvvm") - assert code_type not in supported_code_types, f"{code_type=}" - raise RuntimeError(f"Unsupported {code_type=} ({supported_code_types=})") - - def _translate_program_options(self, options: ProgramOptions) -> LinkerOptions: - return LinkerOptions( - name=options.name, - arch=options.arch, - max_register_count=options.max_register_count, - time=options.time, - link_time_optimization=options.link_time_optimization, - debug=options.debug, - lineinfo=options.lineinfo, - ftz=options.ftz, - prec_div=options.prec_div, - prec_sqrt=options.prec_sqrt, - fma=options.fma, - split_compile=options.split_compile, - ptxas_options=options.ptxas_options, - no_cache=options.no_cache, - ) - - def close(self): - """Destroy this program.""" - if self._linker: - self._linker.close() - self._mnff.close() - - @staticmethod - def _can_load_generated_ptx(): - driver_ver = handle_return(driver.cuDriverGetVersion()) - nvrtc_major, nvrtc_minor = handle_return(nvrtc.nvrtcVersion()) - return nvrtc_major * 1000 + nvrtc_minor * 10 <= driver_ver - - def compile(self, target_type, name_expressions=(), logs=None): - """Compile the program with a specific compilation type. - - Parameters - ---------- - target_type : Any - String of the targeted compilation type. - Supported options are "ptx", "cubin" and "ltoir". - name_expressions : Union[list, tuple], optional - List of explicit name expressions to become accessible. - (Default to no expressions) - logs : Any, optional - Object with a write method to receive the logs generated - from compilation. - (Default to no logs) - - Returns - ------- - :obj:`~_module.ObjectCode` - Newly created code object. - - """ - supported_target_types = ("ptx", "cubin", "ltoir") - if target_type not in supported_target_types: - raise ValueError(f'Unsupported target_type="{target_type}" ({supported_target_types=})') - - if self._backend == "NVRTC": - if target_type == "ptx" and not self._can_load_generated_ptx(): - warn( - "The CUDA driver version is older than the backend version. " - "The generated ptx will not be loadable by the current driver.", - stacklevel=1, - category=RuntimeWarning, - ) - if name_expressions: - for n in name_expressions: - handle_return( - nvrtc.nvrtcAddNameExpression(self._mnff.handle, n.encode()), - handle=self._mnff.handle, - ) - options = self._options.as_bytes("nvrtc") - handle_return( - nvrtc.nvrtcCompileProgram(self._mnff.handle, len(options), options), - handle=self._mnff.handle, - ) - - size_func = getattr(nvrtc, f"nvrtcGet{target_type.upper()}Size") - comp_func = getattr(nvrtc, f"nvrtcGet{target_type.upper()}") - size = handle_return(size_func(self._mnff.handle), handle=self._mnff.handle) - data = b" " * size - handle_return(comp_func(self._mnff.handle, data), handle=self._mnff.handle) - - symbol_mapping = {} - if name_expressions: - for n in name_expressions: - symbol_mapping[n] = handle_return( - nvrtc.nvrtcGetLoweredName(self._mnff.handle, n.encode()), handle=self._mnff.handle - ) - - if logs is not None: - logsize = handle_return(nvrtc.nvrtcGetProgramLogSize(self._mnff.handle), handle=self._mnff.handle) - if logsize > 1: - log = b" " * logsize - handle_return(nvrtc.nvrtcGetProgramLog(self._mnff.handle, log), handle=self._mnff.handle) - logs.write(log.decode("utf-8", errors="backslashreplace")) - - return ObjectCode._init(data, target_type, symbol_mapping=symbol_mapping, name=self._options.name) - - elif self._backend == "NVVM": - if target_type not in ("ptx", "ltoir"): - raise ValueError(f'NVVM backend only supports target_type="ptx", "ltoir", got "{target_type}"') - - # TODO: flip to True when NVIDIA/cuda-python#1354 is resolved and CUDA 12 is dropped - nvvm_options = self._options._prepare_nvvm_options(as_bytes=False) - if target_type == "ltoir" and "-gen-lto" not in nvvm_options: - nvvm_options.append("-gen-lto") - nvvm = _get_nvvm_module() - with _nvvm_exception_manager(self): - nvvm.verify_program(self._mnff.handle, len(nvvm_options), nvvm_options) - # libdevice compilation - if getattr(self, '_use_libdevice', False): - libdevice_path = _find_libdevice_path() - if libdevice_path is None: - raise RuntimeError( - "use_libdevice=True but could not find libdevice.10.bc. " - "Ensure CUDA toolkit is installed." - ) - with open(libdevice_path, "rb") as f: - libdevice_bc = f.read() - # libdevice for numba-cuda - nvvm.lazy_add_module_to_program(self._mnff.handle, libdevice_bc, len(libdevice_bc), None) - - nvvm.compile_program(self._mnff.handle, len(nvvm_options), nvvm_options) - - size = nvvm.get_compiled_result_size(self._mnff.handle) - data = bytearray(size) - nvvm.get_compiled_result(self._mnff.handle, data) - - if logs is not None: - logsize = nvvm.get_program_log_size(self._mnff.handle) - if logsize > 1: - log = bytearray(logsize) - nvvm.get_program_log(self._mnff.handle, log) - logs.write(log.decode("utf-8", errors="backslashreplace")) - - return ObjectCode._init(data, target_type, name=self._options.name) - - supported_backends = ("nvJitLink", "driver") - if self._backend not in supported_backends: - raise ValueError(f'Unsupported backend="{self._backend}" ({supported_backends=})') - return self._linker.link(target_type) - - @property - def backend(self) -> str: - """Return this Program instance's underlying backend.""" - return self._backend - - @property - def handle(self) -> ProgramHandleT: - """Return the underlying handle object. - - .. note:: - - The type of the returned object depends on the backend. - - .. caution:: - - This handle is a Python object. To get the memory address of the underlying C - handle, call ``int(Program.handle)``. - """ - return self._mnff.handle diff --git a/cuda_core/cuda/core/_program.pyx b/cuda_core/cuda/core/_program.pyx new file mode 100644 index 0000000000..7f8f83d06b --- /dev/null +++ b/cuda_core/cuda/core/_program.pyx @@ -0,0 +1,1038 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 +"""Compilation machinery for CUDA programs. + +This module provides :class:`Program` for compiling source code into +:class:`~cuda.core.ObjectCode`, with :class:`ProgramOptions` for configuration. +""" + +from __future__ import annotations + +from dataclasses import dataclass +from warnings import warn + +from cuda.bindings import driver, nvrtc + +from libcpp.vector cimport vector + +from ._resource_handles cimport ( + as_cu, + as_py, + create_nvrtc_program_handle, + create_nvvm_program_handle, +) +from cuda.bindings cimport cynvrtc, cynvvm +from cuda.core._utils.cuda_utils cimport HANDLE_RETURN_NVRTC, HANDLE_RETURN_NVVM +from cuda.core._device import Device +from cuda.core._linker import Linker, LinkerHandleT, LinkerOptions +from cuda.core._module import ObjectCode +from cuda.core._utils.clear_error_support import assert_type +from cuda.core._utils.cuda_utils import ( + CUDAError, + _handle_boolean_option, + check_or_create_options, + get_binding_version, + handle_return, + is_nested_sequence, + is_sequence, +) + +__all__ = ["Program", "ProgramOptions"] + +ProgramHandleT = nvrtc.nvrtcProgram | int | LinkerHandleT +"""Type alias for program handle types across different backends. + +The ``int`` type covers NVVM handles, which don't have a wrapper class. +""" + + +# ============================================================================= +# Principal Class +# ============================================================================= + + +cdef class Program: + """Represent a compilation machinery to process programs into + :class:`~cuda.core.ObjectCode`. + + This object provides a unified interface to multiple underlying + compiler libraries. Compilation support is enabled for a wide + range of code types and compilation types. + + Parameters + ---------- + code : str | bytes | bytearray + The source code to compile. For C++ and PTX, must be a string. + For NVVM IR, can be str, bytes, or bytearray. + code_type : str + The type of source code. Must be one of ``"c++"``, ``"ptx"``, or ``"nvvm"``. + options : :class:`ProgramOptions`, optional + Options to customize the compilation process. + """ + + def __init__(self, code: str | bytes | bytearray, code_type: str, options: ProgramOptions | None = None): + Program_init(self, code, code_type, options) + + def close(self): + """Destroy this program.""" + if self._linker: + self._linker.close() + # Reset handles - the C++ shared_ptr destructor handles cleanup + self._h_nvrtc.reset() + self._h_nvvm.reset() + + def compile( + self, target_type: str, name_expressions: tuple | list = (), logs = None + ) -> ObjectCode: + """Compile the program to the specified target type. + + Parameters + ---------- + target_type : str + The compilation target. Must be one of ``"ptx"``, ``"cubin"``, or ``"ltoir"``. + name_expressions : tuple | list, optional + Sequence of name expressions to make accessible in the compiled code. + Used for template instantiation and similar cases. + logs : object, optional + Object with a ``write`` method to receive compilation logs. + + Returns + ------- + :class:`~cuda.core.ObjectCode` + The compiled object code. + """ + return Program_compile(self, target_type, name_expressions, logs) + + @property + def backend(self) -> str: + """Return this Program instance's underlying backend.""" + return self._backend + + @property + def handle(self) -> ProgramHandleT: + """Return the underlying handle object. + + .. note:: + + The type of the returned object depends on the backend. + + .. caution:: + + This handle is a Python object. To get the memory address of the underlying C + handle, call ``int(Program.handle)``. + """ + if self._backend == "NVRTC": + return as_py(self._h_nvrtc) + elif self._backend == "NVVM": + return as_py(self._h_nvvm) # returns int (NVVM uses raw integers) + else: + return self._linker.handle + + def __repr__(self) -> str: + return f"" + + +# ============================================================================= +# Other Public Classes +# ============================================================================= + + +@dataclass +class ProgramOptions: + """Customizable options for configuring :class:`Program`. + + Attributes + ---------- + name : str, optional + Name of the program. If the compilation succeeds, the name is passed down to the generated `ObjectCode`. + arch : str, optional + Pass the SM architecture value, such as ``sm_`` (for generating CUBIN) or + ``compute_`` (for generating PTX). If not provided, the current device's architecture + will be used. + relocatable_device_code : bool, optional + Enable (disable) the generation of relocatable device code. + Default: False + extensible_whole_program : bool, optional + Do extensible whole program compilation of device code. + Default: False + debug : bool, optional + Generate debug information. If --dopt is not specified, then turns off all optimizations. + Default: False + lineinfo: bool, optional + Generate line-number information. + Default: False + device_code_optimize : bool, optional + Enable device code optimization. When specified along with '-G', enables limited debug information generation + for optimized device code. + Default: None + ptxas_options : Union[str, list[str]], optional + Specify one or more options directly to ptxas, the PTX optimizing assembler. Options should be strings. + For example ["-v", "-O2"]. + Default: None + max_register_count : int, optional + Specify the maximum amount of registers that GPU functions can use. + Default: None + ftz : bool, optional + When performing single-precision floating-point operations, flush denormal values to zero or preserve denormal + values. + Default: False + prec_sqrt : bool, optional + For single-precision floating-point square root, use IEEE round-to-nearest mode or use a faster approximation. + Default: True + prec_div : bool, optional + For single-precision floating-point division and reciprocals, use IEEE round-to-nearest mode or use a faster + approximation. + Default: True + fma : bool, optional + Enables (disables) the contraction of floating-point multiplies and adds/subtracts into floating-point + multiply-add operations. + Default: True + use_fast_math : bool, optional + Make use of fast math operations. + Default: False + extra_device_vectorization : bool, optional + Enables more aggressive device code vectorization in the NVVM optimizer. + Default: False + link_time_optimization : bool, optional + Generate intermediate code for later link-time optimization. + Default: False + gen_opt_lto : bool, optional + Run the optimizer passes before generating the LTO IR. + Default: False + define_macro : Union[str, tuple[str, str], list[Union[str, tuple[str, str]]]], optional + Predefine a macro. Can be either a string, in which case that macro will be set to 1, a 2 element tuple of + strings, in which case the first element is defined as the second, or a list of strings or tuples. + Default: None + undefine_macro : Union[str, list[str]], optional + Cancel any previous definition of a macro, or list of macros. + Default: None + include_path : Union[str, list[str]], optional + Add the directory or directories to the list of directories to be searched for headers. + Default: None + pre_include : Union[str, list[str]], optional + Preinclude one or more headers during preprocessing. Can be either a string or a list of strings. + Default: None + no_source_include : bool, optional + Disable the default behavior of adding the directory of each input source to the include path. + Default: False + std : str, optional + Set language dialect to C++03, C++11, C++14, C++17 or C++20. + Default: c++17 + builtin_move_forward : bool, optional + Provide builtin definitions of std::move and std::forward. + Default: True + builtin_initializer_list : bool, optional + Provide builtin definitions of std::initializer_list class and member functions. + Default: True + disable_warnings : bool, optional + Inhibit all warning messages. + Default: False + restrict : bool, optional + Programmer assertion that all kernel pointer parameters are restrict pointers. + Default: False + device_as_default_execution_space : bool, optional + Treat entities with no execution space annotation as __device__ entities. + Default: False + device_int128 : bool, optional + Allow the __int128 type in device code. + Default: False + optimization_info : str, optional + Provide optimization reports for the specified kind of optimization. + Default: None + no_display_error_number : bool, optional + Disable the display of a diagnostic number for warning messages. + Default: False + diag_error : Union[int, list[int]], optional + Emit error for a specified diagnostic message number or comma separated list of numbers. + Default: None + diag_suppress : Union[int, list[int]], optional + Suppress a specified diagnostic message number or comma separated list of numbers. + Default: None + diag_warn : Union[int, list[int]], optional + Emit warning for a specified diagnostic message number or comma separated lis of numbers. + Default: None + brief_diagnostics : bool, optional + Disable or enable showing source line and column info in a diagnostic. + Default: False + time : str, optional + Generate a CSV table with the time taken by each compilation phase. + Default: None + split_compile : int, optional + Perform compiler optimizations in parallel. + Default: 1 + fdevice_syntax_only : bool, optional + Ends device compilation after front-end syntax checking. + Default: False + minimal : bool, optional + Omit certain language features to reduce compile time for small programs. + Default: False + no_cache : bool, optional + Disable compiler caching. + Default: False + fdevice_time_trace : str, optional + Generate time trace JSON for profiling compilation (NVRTC only). + Default: None + device_float128 : bool, optional + Allow __float128 type in device code (NVRTC only). + Default: False + frandom_seed : str, optional + Set random seed for randomized optimizations (NVRTC only). + Default: None + ofast_compile : str, optional + Fast compilation mode: "0", "min", "mid", or "max" (NVRTC only). + Default: None + pch : bool, optional + Use default precompiled header (NVRTC only, CUDA 12.8+). + Default: False + create_pch : str, optional + Create precompiled header file (NVRTC only, CUDA 12.8+). + Default: None + use_pch : str, optional + Use specific precompiled header file (NVRTC only, CUDA 12.8+). + Default: None + pch_dir : str, optional + PCH directory location (NVRTC only, CUDA 12.8+). + Default: None + pch_verbose : bool, optional + Verbose PCH output (NVRTC only, CUDA 12.8+). + Default: False + pch_messages : bool, optional + Control PCH diagnostic messages (NVRTC only, CUDA 12.8+). + Default: False + instantiate_templates_in_pch : bool, optional + Control template instantiation in PCH (NVRTC only, CUDA 12.8+). + Default: False + """ + + name: str | None = "default_program" + arch: str | None = None + relocatable_device_code: bool | None = None + extensible_whole_program: bool | None = None + debug: bool | None = None + lineinfo: bool | None = None + device_code_optimize: bool | None = None + ptxas_options: str | list[str] | tuple[str] | None = None + max_register_count: int | None = None + ftz: bool | None = None + prec_sqrt: bool | None = None + prec_div: bool | None = None + fma: bool | None = None + use_fast_math: bool | None = None + extra_device_vectorization: bool | None = None + link_time_optimization: bool | None = None + gen_opt_lto: bool | None = None + define_macro: str | tuple[str, str] | list[str | tuple[str, str]] | tuple[str | tuple[str, str], ...] | None = None + undefine_macro: str | list[str] | tuple[str] | None = None + include_path: str | list[str] | tuple[str] | None = None + pre_include: str | list[str] | tuple[str] | None = None + no_source_include: bool | None = None + std: str | None = None + builtin_move_forward: bool | None = None + builtin_initializer_list: bool | None = None + disable_warnings: bool | None = None + restrict: bool | None = None + device_as_default_execution_space: bool | None = None + device_int128: bool | None = None + optimization_info: str | None = None + no_display_error_number: bool | None = None + diag_error: int | list[int] | tuple[int] | None = None + diag_suppress: int | list[int] | tuple[int] | None = None + diag_warn: int | list[int] | tuple[int] | None = None + brief_diagnostics: bool | None = None + time: str | None = None + split_compile: int | None = None + fdevice_syntax_only: bool | None = None + minimal: bool | None = None + no_cache: bool | None = None + fdevice_time_trace: str | None = None + device_float128: bool | None = None + frandom_seed: str | None = None + ofast_compile: str | None = None + pch: bool | None = None + create_pch: str | None = None + use_pch: str | None = None + pch_dir: str | None = None + pch_verbose: bool | None = None + pch_messages: bool | None = None + instantiate_templates_in_pch: bool | None = None + extra_sources: list[tuple[str, str | bytes | bytearray]] | tuple[tuple[str, str | bytes | bytearray], ...] | None = None + use_libdevice: bool | None = None # For libdevice execution + numba_debug: bool | None = None # Custom option for Numba debugging + + def __post_init__(self): + self._name = self.name.encode() + # Set arch to default if not provided + if self.arch is None: + self.arch = f"sm_{Device().arch}" + + def _prepare_nvrtc_options(self) -> list[bytes]: + return _prepare_nvrtc_options_impl(self) + + def _prepare_nvvm_options(self, as_bytes: bool = True) -> list[bytes] | list[str]: + return _prepare_nvvm_options_impl(self, as_bytes) + + def as_bytes(self, backend: str, target_type: str | None = None) -> list[bytes]: + """Convert program options to bytes format for the specified backend. + + This method transforms the program options into a format suitable for the + specified compiler backend. Different backends may use different option names + and formats even for the same conceptual options. + + Parameters + ---------- + backend : str + The compiler backend to prepare options for. Must be either "nvrtc" or "nvvm". + target_type : str, optional + The compilation target type (e.g., "ptx", "cubin", "ltoir"). Some backends + require additional options based on the target type. + + Returns + ------- + list[bytes] + List of option strings encoded as bytes. + + Raises + ------ + ValueError + If an unknown backend is specified. + CUDAError + If an option incompatible with the specified backend is set. + + Examples + -------- + >>> options = ProgramOptions(arch="sm_80", debug=True) + >>> nvrtc_options = options.as_bytes("nvrtc") + """ + backend = backend.lower() + if backend == "nvrtc": + return self._prepare_nvrtc_options() + elif backend == "nvvm": + options = self._prepare_nvvm_options(as_bytes=True) + if target_type == "ltoir" and b"-gen-lto" not in options: + options.append(b"-gen-lto") + return options + else: + raise ValueError(f"Unknown backend '{backend}'. Must be one of: 'nvrtc', 'nvvm'") + + def __repr__(self): + return f"ProgramOptions(name={self.name!r}, arch={self.arch!r})" + + +# ============================================================================= +# Private Classes and Helper Functions +# ============================================================================= + +# Module-level state for NVVM lazy loading +cdef object_nvvm_module = None +cdef bint _nvvm_import_attempted = False + + +def _get_nvvm_module(): + """Get the NVVM module, importing it lazily with availability checks.""" + global _nvvm_module, _nvvm_import_attempted + + if _nvvm_import_attempted: + if _nvvm_module is None: + raise RuntimeError("NVVM module is not available (previous import attempt failed)") + return _nvvm_module + + _nvvm_import_attempted = True + + try: + version = get_binding_version() + if version < (12, 9): + raise RuntimeError( + f"NVVM bindings require cuda-bindings >= 12.9.0, but found {version[0]}.{version[1]}.x. " + "Please update cuda-bindings to use NVVM features." + ) + + from cuda.bindings import nvvm + from cuda.bindings._internal.nvvm import _inspect_function_pointer + + if _inspect_function_pointer("__nvvmCreateProgram") == 0: + raise RuntimeError("NVVM library (libnvvm) is not available in this Python environment. ") + + _nvvm_module = nvvm + return _nvvm_module + + except RuntimeError as e: + _nvvm_module = None + raise e + +def _find_libdevice_path(): + """Find libdevice*.bc for NVVM compilation using cuda.pathfinder.""" + from cuda.pathfinder import get_libdevice_path + return get_libdevice_path() + + +cdef inline bint _process_define_macro_inner(list options, object macro) except? -1: + """Process a single define macro, returning True if successful.""" + if isinstance(macro, str): + options.append(f"--define-macro={macro}") + return True + if isinstance(macro, tuple): + if len(macro) != 2 or any(not isinstance(val, str) for val in macro): + raise RuntimeError(f"Expected define_macro tuple[str, str], got {macro}") + options.append(f"--define-macro={macro[0]}={macro[1]}") + return True + return False + + +cdef inline void _process_define_macro(list options, object macro) except *: + """Process define_macro option which can be str, tuple, or list thereof.""" + union_type = "Union[str, tuple[str, str]]" + if _process_define_macro_inner(options, macro): + return + if is_nested_sequence(macro): + for seq_macro in macro: + if not _process_define_macro_inner(options, seq_macro): + raise RuntimeError(f"Expected define_macro {union_type}, got {seq_macro}") + return + raise RuntimeError(f"Expected define_macro {union_type}, list[{union_type}], got {macro}") + + +cpdef bint _can_load_generated_ptx() except? -1: + """Check if the driver can load PTX generated by the current NVRTC version.""" + driver_ver = handle_return(driver.cuDriverGetVersion()) + nvrtc_major, nvrtc_minor = handle_return(nvrtc.nvrtcVersion()) + return nvrtc_major * 1000 + nvrtc_minor * 10 <= driver_ver + + +cdef inline object _translate_program_options(object options): + """Translate ProgramOptions to LinkerOptions for PTX compilation.""" + return LinkerOptions( + name=options.name, + arch=options.arch, + max_register_count=options.max_register_count, + time=options.time, + link_time_optimization=options.link_time_optimization, + debug=options.debug, + lineinfo=options.lineinfo, + ftz=options.ftz, + prec_div=options.prec_div, + prec_sqrt=options.prec_sqrt, + fma=options.fma, + split_compile=options.split_compile, + ptxas_options=options.ptxas_options, + no_cache=options.no_cache, + ) + + +cdef inline int Program_init(Program self, object code, str code_type, object options) except -1: + """Initialize a Program instance.""" + cdef cynvrtc.nvrtcProgram nvrtc_prog + cdef cynvvm.nvvmProgram nvvm_prog + cdef bytes code_bytes + cdef const char* code_ptr + cdef const char* name_ptr + cdef size_t code_len + cdef bytes module_bytes + cdef const char* module_ptr + cdef size_t module_len + + self._options = options = check_or_create_options(ProgramOptions, options, "Program options") + code_type = code_type.lower() + self._module_count = 0 + self._use_libdevice = False + + if code_type == "c++": + assert_type(code, str) + if options.extra_sources is not None: + raise ValueError("extra_sources is not supported by the NVRTC backend (C++ code_type)") + # TODO: support pre-loaded headers & include names + code_bytes = code.encode() + code_ptr = code_bytes + name_ptr = options._name + + with nogil: + HANDLE_RETURN_NVRTC(NULL, cynvrtc.nvrtcCreateProgram( + &nvrtc_prog, code_ptr, name_ptr, 0, NULL, NULL)) + self._h_nvrtc = create_nvrtc_program_handle(nvrtc_prog) + self._backend = "NVRTC" + self._linker = None + + elif code_type == "ptx": + assert_type(code, str) + if options.extra_sources is not None: + raise ValueError("extra_sources is not supported by the PTX backend.") + self._linker = Linker( + ObjectCode._init(code.encode(), code_type), options=_translate_program_options(options) + ) + self._backend = self._linker.backend + + elif code_type == "nvvm": + _get_nvvm_module() # Validate NVVM availability + if isinstance(code, str): + code = code.encode("utf-8") + elif not isinstance(code, (bytes, bytearray)): + raise TypeError("NVVM IR code must be provided as str, bytes, or bytearray") + + code_ptr = (code) + name_ptr = options._name + code_len = len(code) + + with nogil: + HANDLE_RETURN_NVVM(NULL, cynvvm.nvvmCreateProgram(&nvvm_prog)) + self._h_nvvm = create_nvvm_program_handle(nvvm_prog) # RAII from here + with nogil: + HANDLE_RETURN_NVVM(nvvm_prog, cynvvm.nvvmAddModuleToProgram(nvvm_prog, code_ptr, code_len, name_ptr)) + self._module_count = 1 + + # Add extra modules if provided + if options.extra_sources is not None: + if not is_sequence(options.extra_sources): + raise TypeError( + "extra_sources must be a sequence of 2-tuples: ((name1, source1), (name2, source2), ...)" + ) + for i, module in enumerate(options.extra_sources): + if not isinstance(module, tuple) or len(module) != 2: + raise TypeError( + f"Each extra module must be a 2-tuple (name, source)" + f", got {type(module).__name__} at index {i}" + ) + + module_name, module_source = module + + if not isinstance(module_name, str): + raise TypeError(f"Module name at index {i} must be a string, got {type(module_name).__name__}") + + if isinstance(module_source, str): + # Textual LLVM IR - encode to UTF-8 bytes + module_source = module_source.encode("utf-8") + elif not isinstance(module_source, (bytes, bytearray)): + raise TypeError( + f"Module source at index {i} must be str (textual LLVM IR), bytes (textual LLVM IR or bitcode), " + f"or bytearray, got {type(module_source).__name__}" + ) + + if len(module_source) == 0: + raise ValueError(f"Module source for '{module_name}' (index {i}) cannot be empty") + + # Add the module using NVVM API + module_bytes = module_source if isinstance(module_source, bytes) else bytes(module_source) + module_ptr = module_bytes + module_len = len(module_bytes) + module_name_bytes = module_name.encode() + module_name_ptr = module_name_bytes + + with nogil: + HANDLE_RETURN_NVVM(nvvm_prog, cynvvm.nvvmAddModuleToProgram( + nvvm_prog, module_ptr, module_len, module_name_ptr)) + self._module_count += 1 + + # Store use_libdevice flag + if options.use_libdevice: + self._use_libdevice = True + + self._backend = "NVVM" + self._linker = None + + else: + supported_code_types = ("c++", "ptx", "nvvm") + assert code_type not in supported_code_types, f"{code_type=}" + raise RuntimeError(f"Unsupported {code_type=} ({supported_code_types=})") + + return 0 + + + +cdef object Program_compile_nvrtc(Program self, str target_type, object name_expressions, object logs): + """Compile using NVRTC backend and return ObjectCode.""" + cdef cynvrtc.nvrtcProgram prog = as_cu(self._h_nvrtc) + cdef size_t output_size = 0 + cdef size_t logsize = 0 + cdef vector[const char*] options_vec + cdef char* data_ptr = NULL + cdef bytes name_bytes + cdef const char* name_ptr = NULL + cdef const char* lowered_name = NULL + cdef dict symbol_mapping = {} + + # Add name expressions before compilation + if name_expressions: + for n in name_expressions: + name_bytes = n.encode() if isinstance(n, str) else n + name_ptr = name_bytes + HANDLE_RETURN_NVRTC(prog, cynvrtc.nvrtcAddNameExpression(prog, name_ptr)) + + # Build options array + options_list = self._options.as_bytes("nvrtc", target_type) + options_vec.resize(len(options_list)) + for i in range(len(options_list)): + options_vec[i] = (options_list[i]) + + # Compile + with nogil: + HANDLE_RETURN_NVRTC(prog, cynvrtc.nvrtcCompileProgram(prog, options_vec.size(), options_vec.data())) + + # Get compiled output based on target type + if target_type == "ptx": + HANDLE_RETURN_NVRTC(prog, cynvrtc.nvrtcGetPTXSize(prog, &output_size)) + data = bytearray(output_size) + data_ptr = (data) + with nogil: + HANDLE_RETURN_NVRTC(prog, cynvrtc.nvrtcGetPTX(prog, data_ptr)) + elif target_type == "cubin": + HANDLE_RETURN_NVRTC(prog, cynvrtc.nvrtcGetCUBINSize(prog, &output_size)) + data = bytearray(output_size) + data_ptr = (data) + with nogil: + HANDLE_RETURN_NVRTC(prog, cynvrtc.nvrtcGetCUBIN(prog, data_ptr)) + else: # ltoir + HANDLE_RETURN_NVRTC(prog, cynvrtc.nvrtcGetLTOIRSize(prog, &output_size)) + data = bytearray(output_size) + data_ptr = (data) + with nogil: + HANDLE_RETURN_NVRTC(prog, cynvrtc.nvrtcGetLTOIR(prog, data_ptr)) + + # Get lowered names after compilation + if name_expressions: + for n in name_expressions: + name_bytes = n.encode() if isinstance(n, str) else n + name_ptr = name_bytes + HANDLE_RETURN_NVRTC(prog, cynvrtc.nvrtcGetLoweredName(prog, name_ptr, &lowered_name)) + symbol_mapping[n] = lowered_name if lowered_name != NULL else None + + # Get compilation log if requested + if logs is not None: + HANDLE_RETURN_NVRTC(prog, cynvrtc.nvrtcGetProgramLogSize(prog, &logsize)) + if logsize > 1: + log = bytearray(logsize) + data_ptr = (log) + with nogil: + HANDLE_RETURN_NVRTC(prog, cynvrtc.nvrtcGetProgramLog(prog, data_ptr)) + logs.write(log.decode("utf-8", errors="backslashreplace")) + + return ObjectCode._init(bytes(data), target_type, symbol_mapping=symbol_mapping, name=self._options.name) + + +cdef object Program_compile_nvvm(Program self, str target_type, object logs): + """Compile using NVVM backend and return ObjectCode.""" + cdef cynvvm.nvvmProgram prog = as_cu(self._h_nvvm) + cdef size_t output_size = 0 + cdef size_t logsize = 0 + cdef vector[const char*] options_vec + cdef char* data_ptr = NULL + cdef bytes libdevice_bytes + cdef const char* libdevice_ptr + cdef size_t libdevice_len + + options_list = self._options.as_bytes("nvvm", target_type) + options_vec.resize(len(options_list)) + for i in range(len(options_list)): + options_vec[i] = (options_list[i]) + + with nogil: + HANDLE_RETURN_NVVM(prog, cynvvm.nvvmVerifyProgram(prog, options_vec.size(), options_vec.data())) + + # Load libdevice if requested - following numba-cuda + if self._use_libdevice: + libdevice_path = _find_libdevice_path() + if libdevice_path is None: + raise RuntimeError( + "use_libdevice=True but could not find libdevice.10.bc. " + "Ensure CUDA toolkit is installed." + ) + with open(libdevice_path, "rb") as f: + libdevice_bytes = f.read() + libdevice_ptr = libdevice_bytes + libdevice_len = len(libdevice_bytes) + # Use lazy_add_module + with nogil: + HANDLE_RETURN_NVVM(prog, cynvvm.nvvmLazyAddModuleToProgram( + prog, libdevice_ptr, libdevice_len, NULL)) + + with nogil: + HANDLE_RETURN_NVVM(prog, cynvvm.nvvmCompileProgram(prog, options_vec.size(), options_vec.data())) + + HANDLE_RETURN_NVVM(prog, cynvvm.nvvmGetCompiledResultSize(prog, &output_size)) + data = bytearray(output_size) + data_ptr = (data) + with nogil: + HANDLE_RETURN_NVVM(prog, cynvvm.nvvmGetCompiledResult(prog, data_ptr)) + + if logs is not None: + HANDLE_RETURN_NVVM(prog, cynvvm.nvvmGetProgramLogSize(prog, &logsize)) + if logsize > 1: + log = bytearray(logsize) + data_ptr = (log) + with nogil: + HANDLE_RETURN_NVVM(prog, cynvvm.nvvmGetProgramLog(prog, data_ptr)) + logs.write(log.decode("utf-8", errors="backslashreplace")) + + return ObjectCode._init(bytes(data), target_type, name=self._options.name) + +# Supported target types per backend +cdef dict SUPPORTED_TARGETS = { + "NVRTC": ("ptx", "cubin", "ltoir"), + "NVVM": ("ptx", "ltoir"), + "nvJitLink": ("cubin", "ptx"), + "driver": ("cubin", "ptx"), +} + + +cdef object Program_compile(Program self, str target_type, object name_expressions, object logs): + """Compile the program to the specified target type.""" + # Validate target_type for this backend + supported = SUPPORTED_TARGETS.get(self._backend) + if supported is None: + raise ValueError(f'Unknown backend="{self._backend}"') + if target_type not in supported: + raise ValueError( + f'Unsupported target_type="{target_type}" for {self._backend} ' + f'(supported: {", ".join(repr(t) for t in supported)})' + ) + + if self._backend == "NVRTC": + if target_type == "ptx" and not _can_load_generated_ptx(): + warn( + "The CUDA driver version is older than the backend version. " + "The generated ptx will not be loadable by the current driver.", + stacklevel=2, + category=RuntimeWarning, + ) + return Program_compile_nvrtc(self, target_type, name_expressions, logs) + + elif self._backend == "NVVM": + return Program_compile_nvvm(self, target_type, logs) + + else: + return self._linker.link(target_type) + + +cdef inline list _prepare_nvrtc_options_impl(object opts): + """Build NVRTC-specific compiler options.""" + options = [f"-arch={opts.arch}"] + if opts.relocatable_device_code is not None: + options.append(f"--relocatable-device-code={_handle_boolean_option(opts.relocatable_device_code)}") + if opts.extensible_whole_program is not None and opts.extensible_whole_program: + options.append("--extensible-whole-program") + if opts.debug is not None and opts.debug: + options.append("--device-debug") + if opts.lineinfo is not None and opts.lineinfo: + options.append("--generate-line-info") + if opts.device_code_optimize is not None and opts.device_code_optimize: + options.append("--dopt=on") + if opts.ptxas_options is not None: + opt_name = "--ptxas-options" + if isinstance(opts.ptxas_options, str): + options.append(f"{opt_name}={opts.ptxas_options}") + elif is_sequence(opts.ptxas_options): + for opt_value in opts.ptxas_options: + options.append(f"{opt_name}={opt_value}") + if opts.max_register_count is not None: + options.append(f"--maxrregcount={opts.max_register_count}") + if opts.ftz is not None: + options.append(f"--ftz={_handle_boolean_option(opts.ftz)}") + if opts.prec_sqrt is not None: + options.append(f"--prec-sqrt={_handle_boolean_option(opts.prec_sqrt)}") + if opts.prec_div is not None: + options.append(f"--prec-div={_handle_boolean_option(opts.prec_div)}") + if opts.fma is not None: + options.append(f"--fmad={_handle_boolean_option(opts.fma)}") + if opts.use_fast_math is not None and opts.use_fast_math: + options.append("--use_fast_math") + if opts.extra_device_vectorization is not None and opts.extra_device_vectorization: + options.append("--extra-device-vectorization") + if opts.link_time_optimization is not None and opts.link_time_optimization: + options.append("--dlink-time-opt") + if opts.gen_opt_lto is not None and opts.gen_opt_lto: + options.append("--gen-opt-lto") + if opts.define_macro is not None: + _process_define_macro(options, opts.define_macro) + if opts.undefine_macro is not None: + if isinstance(opts.undefine_macro, str): + options.append(f"--undefine-macro={opts.undefine_macro}") + elif is_sequence(opts.undefine_macro): + for macro in opts.undefine_macro: + options.append(f"--undefine-macro={macro}") + if opts.include_path is not None: + if isinstance(opts.include_path, str): + options.append(f"--include-path={opts.include_path}") + elif is_sequence(opts.include_path): + for path in opts.include_path: + options.append(f"--include-path={path}") + if opts.pre_include is not None: + if isinstance(opts.pre_include, str): + options.append(f"--pre-include={opts.pre_include}") + elif is_sequence(opts.pre_include): + for header in opts.pre_include: + options.append(f"--pre-include={header}") + if opts.no_source_include is not None and opts.no_source_include: + options.append("--no-source-include") + if opts.std is not None: + options.append(f"--std={opts.std}") + if opts.builtin_move_forward is not None: + options.append(f"--builtin-move-forward={_handle_boolean_option(opts.builtin_move_forward)}") + if opts.builtin_initializer_list is not None: + options.append(f"--builtin-initializer-list={_handle_boolean_option(opts.builtin_initializer_list)}") + if opts.disable_warnings is not None and opts.disable_warnings: + options.append("--disable-warnings") + if opts.restrict is not None and opts.restrict: + options.append("--restrict") + if opts.device_as_default_execution_space is not None and opts.device_as_default_execution_space: + options.append("--device-as-default-execution-space") + if opts.device_int128 is not None and opts.device_int128: + options.append("--device-int128") + if opts.device_float128 is not None and opts.device_float128: + options.append("--device-float128") + if opts.optimization_info is not None: + options.append(f"--optimization-info={opts.optimization_info}") + if opts.no_display_error_number is not None and opts.no_display_error_number: + options.append("--no-display-error-number") + if opts.diag_error is not None: + if isinstance(opts.diag_error, int): + options.append(f"--diag-error={opts.diag_error}") + elif is_sequence(opts.diag_error): + for error in opts.diag_error: + options.append(f"--diag-error={error}") + if opts.diag_suppress is not None: + if isinstance(opts.diag_suppress, int): + options.append(f"--diag-suppress={opts.diag_suppress}") + elif is_sequence(opts.diag_suppress): + for suppress in opts.diag_suppress: + options.append(f"--diag-suppress={suppress}") + if opts.diag_warn is not None: + if isinstance(opts.diag_warn, int): + options.append(f"--diag-warn={opts.diag_warn}") + elif is_sequence(opts.diag_warn): + for w in opts.diag_warn: + options.append(f"--diag-warn={w}") + if opts.brief_diagnostics is not None: + options.append(f"--brief-diagnostics={_handle_boolean_option(opts.brief_diagnostics)}") + if opts.time is not None: + options.append(f"--time={opts.time}") + if opts.split_compile is not None: + options.append(f"--split-compile={opts.split_compile}") + if opts.fdevice_syntax_only is not None and opts.fdevice_syntax_only: + options.append("--fdevice-syntax-only") + if opts.minimal is not None and opts.minimal: + options.append("--minimal") + if opts.no_cache is not None and opts.no_cache: + options.append("--no-cache") + if opts.fdevice_time_trace is not None: + options.append(f"--fdevice-time-trace={opts.fdevice_time_trace}") + if opts.frandom_seed is not None: + options.append(f"--frandom-seed={opts.frandom_seed}") + if opts.ofast_compile is not None: + options.append(f"--Ofast-compile={opts.ofast_compile}") + # PCH options (CUDA 12.8+) + if opts.pch is not None and opts.pch: + options.append("--pch") + if opts.create_pch is not None: + options.append(f"--create-pch={opts.create_pch}") + if opts.use_pch is not None: + options.append(f"--use-pch={opts.use_pch}") + if opts.pch_dir is not None: + options.append(f"--pch-dir={opts.pch_dir}") + if opts.pch_verbose is not None: + options.append(f"--pch-verbose={_handle_boolean_option(opts.pch_verbose)}") + if opts.pch_messages is not None: + options.append(f"--pch-messages={_handle_boolean_option(opts.pch_messages)}") + if opts.instantiate_templates_in_pch is not None: + options.append( + f"--instantiate-templates-in-pch={_handle_boolean_option(opts.instantiate_templates_in_pch)}" + ) + if opts.numba_debug: + options.append("--numba-debug") + return [o.encode() for o in options] + + +cdef inline object _prepare_nvvm_options_impl(object opts, bint as_bytes): + """Build NVVM-specific compiler options.""" + options = [] + + # Options supported by NVVM + assert opts.arch is not None + arch = opts.arch + if arch.startswith("sm_"): + arch = f"compute_{arch[3:]}" + options.append(f"-arch={arch}") + if opts.debug is not None and opts.debug: + options.append("-g") + if opts.device_code_optimize is False: + options.append("-opt=0") + elif opts.device_code_optimize is True: + options.append("-opt=3") + # NVVM uses 0/1 instead of true/false for boolean options + if opts.ftz is not None: + options.append(f"-ftz={'1' if opts.ftz else '0'}") + if opts.prec_sqrt is not None: + options.append(f"-prec-sqrt={'1' if opts.prec_sqrt else '0'}") + if opts.prec_div is not None: + options.append(f"-prec-div={'1' if opts.prec_div else '0'}") + if opts.fma is not None: + options.append(f"-fma={'1' if opts.fma else '0'}") + + # Check for unsupported options and raise error if they are set + unsupported = [] + if opts.relocatable_device_code is not None: + unsupported.append("relocatable_device_code") + if opts.extensible_whole_program is not None and opts.extensible_whole_program: + unsupported.append("extensible_whole_program") + if opts.lineinfo is not None and opts.lineinfo: + unsupported.append("lineinfo") + if opts.ptxas_options is not None: + unsupported.append("ptxas_options") + if opts.max_register_count is not None: + unsupported.append("max_register_count") + if opts.use_fast_math is not None and opts.use_fast_math: + unsupported.append("use_fast_math") + if opts.extra_device_vectorization is not None and opts.extra_device_vectorization: + unsupported.append("extra_device_vectorization") + if opts.gen_opt_lto is not None and opts.gen_opt_lto: + unsupported.append("gen_opt_lto") + if opts.define_macro is not None: + unsupported.append("define_macro") + if opts.undefine_macro is not None: + unsupported.append("undefine_macro") + if opts.include_path is not None: + unsupported.append("include_path") + if opts.pre_include is not None: + unsupported.append("pre_include") + if opts.no_source_include is not None and opts.no_source_include: + unsupported.append("no_source_include") + if opts.std is not None: + unsupported.append("std") + if opts.builtin_move_forward is not None: + unsupported.append("builtin_move_forward") + if opts.builtin_initializer_list is not None: + unsupported.append("builtin_initializer_list") + if opts.disable_warnings is not None and opts.disable_warnings: + unsupported.append("disable_warnings") + if opts.restrict is not None and opts.restrict: + unsupported.append("restrict") + if opts.device_as_default_execution_space is not None and opts.device_as_default_execution_space: + unsupported.append("device_as_default_execution_space") + if opts.device_int128 is not None and opts.device_int128: + unsupported.append("device_int128") + if opts.optimization_info is not None: + unsupported.append("optimization_info") + if opts.no_display_error_number is not None and opts.no_display_error_number: + unsupported.append("no_display_error_number") + if opts.diag_error is not None: + unsupported.append("diag_error") + if opts.diag_suppress is not None: + unsupported.append("diag_suppress") + if opts.diag_warn is not None: + unsupported.append("diag_warn") + if opts.brief_diagnostics is not None: + unsupported.append("brief_diagnostics") + if opts.time is not None: + unsupported.append("time") + if opts.split_compile is not None: + unsupported.append("split_compile") + if opts.fdevice_syntax_only is not None and opts.fdevice_syntax_only: + unsupported.append("fdevice_syntax_only") + if opts.minimal is not None and opts.minimal: + unsupported.append("minimal") + if opts.numba_debug is not None and opts.numba_debug: + unsupported.append("numba_debug") + if unsupported: + raise CUDAError(f"The following options are not supported by NVVM backend: {', '.join(unsupported)}") + + if as_bytes: + return [o.encode() for o in options] + else: + return options \ No newline at end of file From ca32d2b39dd7a3363dae40362ae9c3933c477e3f Mon Sep 17 00:00:00 2001 From: abhilash1910 Date: Wed, 11 Feb 2026 19:54:54 +0000 Subject: [PATCH 32/50] fix import --- cuda_core/tests/test_program.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py index 44d69a9bd4..929e38fd7a 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -530,7 +530,7 @@ def test_nvvm_program_options_ltoir(init_cuda, nvvm_ir, options): @nvvm_available def test_nvvm_program_with_single_extra_source(nvvm_ir): """Test NVVM program with a single extra source""" - from cuda.core.experimental._program import _get_nvvm_module + from cuda.core._program import _get_nvvm_module nvvm = _get_nvvm_module() major, minor, debug_major, debug_minor = nvvm.ir_version() @@ -568,7 +568,7 @@ def test_nvvm_program_with_single_extra_source(nvvm_ir): @nvvm_available def test_nvvm_program_with_multiple_extra_sources(): """Test NVVM program with multiple extra sources""" - from cuda.core.experimental._program import _get_nvvm_module + from cuda.core._program import _get_nvvm_module nvvm = _get_nvvm_module() major, minor, debug_major, debug_minor = nvvm.ir_version() From fac190726a88835239b3097aeca7879f8bac1fce Mon Sep 17 00:00:00 2001 From: abhilash1910 Date: Thu, 12 Feb 2026 04:16:39 +0000 Subject: [PATCH 33/50] fix tests --- cuda_core/tests/test_program.py | 1 + 1 file changed, 1 insertion(+) diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py index 929e38fd7a..e439353aa9 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -11,6 +11,7 @@ from cuda.core._module import Kernel, ObjectCode from cuda.core._program import Program, ProgramOptions from cuda.core._utils.cuda_utils import CUDAError, driver, handle_return +from cuda_python_test_helpers.nvvm_bitcode import minimal_nvvmir # noqa: F401 cuda_driver_version = handle_return(driver.cuDriverGetVersion()) is_culink_backend = _linker._decide_nvjitlink_or_driver() From 4a01e068b79f3834a72934eb149c80025edd9341 Mon Sep 17 00:00:00 2001 From: abhilash1910 Date: Thu, 12 Feb 2026 04:39:22 +0000 Subject: [PATCH 34/50] fix ruff check --- cuda_core/tests/test_program.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py index e439353aa9..daef53da4e 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -11,7 +11,7 @@ from cuda.core._module import Kernel, ObjectCode from cuda.core._program import Program, ProgramOptions from cuda.core._utils.cuda_utils import CUDAError, driver, handle_return -from cuda_python_test_helpers.nvvm_bitcode import minimal_nvvmir # noqa: F401 +from cuda_python_test_helpers.nvvm_bitcode import minimal_nvvmir # noqa: F401, F811 cuda_driver_version = handle_return(driver.cuDriverGetVersion()) is_culink_backend = _linker._decide_nvjitlink_or_driver() @@ -650,7 +650,7 @@ def test_nvvm_program_with_multiple_extra_sources(): @nvvm_available -def test_bitcode_format(minimal_nvvmir): +def test_bitcode_format(minimal_nvvmir): # noqa: F811 if len(minimal_nvvmir) < 4: pytest.skip("Bitcode file is not valid or empty") From 2d5252fb49bf6bb1db129b030b2f48ab400a6537 Mon Sep 17 00:00:00 2001 From: abhilash1910 Date: Thu, 12 Feb 2026 04:44:28 +0000 Subject: [PATCH 35/50] ruff fix find_libdevice --- cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py b/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py index 0cfcd4e493..e56fd17114 100644 --- a/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py +++ b/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py @@ -4,10 +4,10 @@ import glob import os +from cuda.pathfinder._dynamic_libs.load_dl_common import DynamicLibNotFoundError as DynamicLibNotFoundError from cuda.pathfinder._dynamic_libs.supported_nvidia_libs import IS_WINDOWS from cuda.pathfinder._utils.env_vars import get_cuda_home_or_path from cuda.pathfinder._utils.find_sub_dirs import find_sub_dirs_all_sitepackages -from cuda.pathfinder._dynamic_libs.load_dl_common import DynamicLibNotFoundError as DynamicLibNotFoundError # Site-package paths for libdevice (following SITE_PACKAGES_LIBDIRS pattern) SITE_PACKAGES_LIBDEVICE_DIRS = ( @@ -21,6 +21,7 @@ else: bases = ["/usr/local/cuda", "/opt/cuda"] + def _no_such_file_in_dir(dir_path: str, filename: str, error_messages: list[str], attachments: list[str]) -> None: error_messages.append(f"No such file: {os.path.join(dir_path, filename)}") if os.path.isdir(dir_path): From 2976c24cad77e54091af010d12a8eae32d2fbcd2 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 12 Feb 2026 04:46:35 +0000 Subject: [PATCH 36/50] [pre-commit.ci] auto code formatting --- cuda_core/cuda/core/_program.pxd | 2 +- cuda_core/cuda/core/_program.pyx | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cuda_core/cuda/core/_program.pxd b/cuda_core/cuda/core/_program.pxd index d4abe85ff8..d2ddc71513 100644 --- a/cuda_core/cuda/core/_program.pxd +++ b/cuda_core/cuda/core/_program.pxd @@ -14,4 +14,4 @@ cdef class Program: object _options # ProgramOptions object __weakref__ bint _use_libdevice # Flag for libdevice loading - int _module_count + int _module_count diff --git a/cuda_core/cuda/core/_program.pyx b/cuda_core/cuda/core/_program.pyx index 35bf55e545..4f0e0c8b78 100644 --- a/cuda_core/cuda/core/_program.pyx +++ b/cuda_core/cuda/core/_program.pyx @@ -625,7 +625,7 @@ cdef inline int Program_init(Program self, object code, str code_type, object op # Store use_libdevice flag if options.use_libdevice: self._use_libdevice = True - + self._backend = "NVVM" self._linker = None From 61c1e00f5d43b654305304ece7e38026e5d80bb7 Mon Sep 17 00:00:00 2001 From: abhilash1910 Date: Thu, 12 Feb 2026 04:48:32 +0000 Subject: [PATCH 37/50] add spdx and copyright --- .../cuda_python_test_helpers/nvvm_bitcode.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cuda_python_test_helpers/cuda_python_test_helpers/nvvm_bitcode.py b/cuda_python_test_helpers/cuda_python_test_helpers/nvvm_bitcode.py index e42dbff085..5264b947d0 100644 --- a/cuda_python_test_helpers/cuda_python_test_helpers/nvvm_bitcode.py +++ b/cuda_python_test_helpers/cuda_python_test_helpers/nvvm_bitcode.py @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + import binascii import pytest From c6bea0c38d0adce012fc153f96093b36e3a89ec7 Mon Sep 17 00:00:00 2001 From: abhilash1910 Date: Thu, 12 Feb 2026 11:44:16 +0000 Subject: [PATCH 38/50] rm redundant include and fix test --- cuda_pathfinder/cuda/pathfinder/__init__.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/cuda_pathfinder/cuda/pathfinder/__init__.py b/cuda_pathfinder/cuda/pathfinder/__init__.py index ece060d81a..bdcbdea8f1 100644 --- a/cuda_pathfinder/cuda/pathfinder/__init__.py +++ b/cuda_pathfinder/cuda/pathfinder/__init__.py @@ -19,9 +19,7 @@ locate_nvidia_header_directory as locate_nvidia_header_directory, ) from cuda.pathfinder._headers.supported_nvidia_headers import SUPPORTED_HEADERS_CTK as _SUPPORTED_HEADERS_CTK -from cuda.pathfinder._static_libs.find_libdevice import ( - LibdeviceNotFoundError as LibdeviceNotFoundError, -) + from cuda.pathfinder._static_libs.find_libdevice import ( find_libdevice as find_libdevice, ) From b7866cf5e38d4e5b4ecd1c3a26cbc744a6ac9e0c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 12 Feb 2026 11:46:05 +0000 Subject: [PATCH 39/50] [pre-commit.ci] auto code formatting --- cuda_pathfinder/cuda/pathfinder/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/cuda_pathfinder/cuda/pathfinder/__init__.py b/cuda_pathfinder/cuda/pathfinder/__init__.py index bdcbdea8f1..63781965f3 100644 --- a/cuda_pathfinder/cuda/pathfinder/__init__.py +++ b/cuda_pathfinder/cuda/pathfinder/__init__.py @@ -19,7 +19,6 @@ locate_nvidia_header_directory as locate_nvidia_header_directory, ) from cuda.pathfinder._headers.supported_nvidia_headers import SUPPORTED_HEADERS_CTK as _SUPPORTED_HEADERS_CTK - from cuda.pathfinder._static_libs.find_libdevice import ( find_libdevice as find_libdevice, ) From 42832306f638a116c0b0c9182e6cf7898fdc969e Mon Sep 17 00:00:00 2001 From: abhilash1910 Date: Thu, 12 Feb 2026 18:58:45 +0000 Subject: [PATCH 40/50] refresh tests --- cuda_core/tests/test_program.py | 60 ++++++++++++--------------------- 1 file changed, 22 insertions(+), 38 deletions(-) diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py index daef53da4e..30cdad6b9e 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -445,11 +445,23 @@ def test_nvvm_compile_invalid_target(nvvm_ir): @nvvm_available +@pytest.mark.parametrize("target_type", ["ptx", "ltoir"]) @pytest.mark.parametrize( "options", [ ProgramOptions(name="test1", arch="sm_90", device_code_optimize=False), ProgramOptions(name="test2", arch="sm_100", device_code_optimize=False), + ProgramOptions(name="test3", arch="sm_100", link_time_optimization=True), + ProgramOptions( + name="test4", + arch="sm_90", + ftz=True, + prec_sqrt=False, + prec_div=False, + fma=True, + device_code_optimize=True, + link_time_optimization=True, + ), pytest.param( ProgramOptions(name="test_sm110_1", arch="sm_110", device_code_optimize=False), marks=pytest.mark.skipif( @@ -481,50 +493,22 @@ def test_nvvm_compile_invalid_target(nvvm_ir): ), ], ) -def test_nvvm_program_options(init_cuda, nvvm_ir, options): - """Test NVVM programs with different options""" +def test_nvvm_program_options(init_cuda, nvvm_ir, options, target_type): + """Test NVVM programs with different options and target types (ptx/ltoir)""" program = Program(nvvm_ir, "nvvm", options) assert program.backend == "NVVM" - ptx_code = program.compile("ptx") - assert isinstance(ptx_code, ObjectCode) - assert ptx_code.name == options.name - - code_content = ptx_code.code - ptx_text = code_content.decode() if isinstance(code_content, bytes) else str(code_content) - assert ".visible .entry simple(" in ptx_text + result = program.compile(target_type) + assert isinstance(result, ObjectCode) + assert result.name == options.name - program.close() + code_content = result.code + assert len(code_content) > 0 + if target_type == "ptx": + ptx_text = code_content.decode() if isinstance(code_content, bytes) else str(code_content) + assert ".visible .entry simple(" in ptx_text -@nvvm_available -@pytest.mark.parametrize( - "options", - [ - ProgramOptions(name="ltoir_test1", arch="sm_90", device_code_optimize=False), - ProgramOptions(name="ltoir_test2", arch="sm_100", link_time_optimization=True), - ProgramOptions( - name="ltoir_test3", - arch="sm_90", - ftz=True, - prec_sqrt=False, - prec_div=False, - fma=True, - device_code_optimize=True, - link_time_optimization=True, - ), - ], -) -def test_nvvm_program_options_ltoir(init_cuda, nvvm_ir, options): - """Test NVVM programs for LTOIR with different options""" - program = Program(nvvm_ir, "nvvm", options) - assert program.backend == "NVVM" - - ltoir_code = program.compile("ltoir") - assert isinstance(ltoir_code, ObjectCode) - assert ltoir_code.name == options.name - code_content = ltoir_code.code - assert len(code_content) > 0 program.close() From 78f43283e2d795d498b5aa54d93c15d3eadab753 Mon Sep 17 00:00:00 2001 From: abhilash1910 Date: Thu, 12 Feb 2026 19:05:29 +0000 Subject: [PATCH 41/50] add correct libdevice for CTK> 13 --- cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py b/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py index e56fd17114..c7e3b8cf18 100644 --- a/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py +++ b/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py @@ -11,7 +11,7 @@ # Site-package paths for libdevice (following SITE_PACKAGES_LIBDIRS pattern) SITE_PACKAGES_LIBDEVICE_DIRS = ( - "nvidia/cuda_nvvm/nvvm/libdevice", # CTK 13+ + "nvidia/cu13/nvvm/libdevice", # CTK 13+ "nvidia/cuda_nvcc/nvvm/libdevice", # CTK <13 ) From ddf48396621e10506a0cf128bb0348aed1caf73a Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 12 Feb 2026 19:14:03 +0000 Subject: [PATCH 42/50] [pre-commit.ci] auto code formatting --- cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py b/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py index c7e3b8cf18..0a37b31736 100644 --- a/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py +++ b/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py @@ -11,7 +11,7 @@ # Site-package paths for libdevice (following SITE_PACKAGES_LIBDIRS pattern) SITE_PACKAGES_LIBDEVICE_DIRS = ( - "nvidia/cu13/nvvm/libdevice", # CTK 13+ + "nvidia/cu13/nvvm/libdevice", # CTK 13+ "nvidia/cuda_nvcc/nvvm/libdevice", # CTK <13 ) From cd3644e68345c75efde93f1335333739e2886a7e Mon Sep 17 00:00:00 2001 From: abhilash1910 Date: Fri, 13 Feb 2026 09:24:53 +0000 Subject: [PATCH 43/50] revamp design of pathfinder as LocatedHeaderDir --- cuda_pathfinder/cuda/pathfinder/__init__.py | 8 +- .../pathfinder/_static_libs/find_libdevice.py | 111 +++++++++++------- cuda_pathfinder/tests/test_find_libdevice.py | 55 ++++++--- 3 files changed, 116 insertions(+), 58 deletions(-) diff --git a/cuda_pathfinder/cuda/pathfinder/__init__.py b/cuda_pathfinder/cuda/pathfinder/__init__.py index 63781965f3..cc1eda8d2d 100644 --- a/cuda_pathfinder/cuda/pathfinder/__init__.py +++ b/cuda_pathfinder/cuda/pathfinder/__init__.py @@ -20,10 +20,10 @@ ) from cuda.pathfinder._headers.supported_nvidia_headers import SUPPORTED_HEADERS_CTK as _SUPPORTED_HEADERS_CTK from cuda.pathfinder._static_libs.find_libdevice import ( - find_libdevice as find_libdevice, -) -from cuda.pathfinder._static_libs.find_libdevice import ( - get_libdevice_path as get_libdevice_path, + BitcodeLibNotFoundError as BitcodeLibNotFoundError, + LocatedBitcodeLib as LocatedBitcodeLib, + find_bitcode_lib as find_bitcode_lib, + locate_bitcode_lib as locate_bitcode_lib, ) from cuda.pathfinder._version import __version__ # isort: skip # noqa: F401 diff --git a/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py b/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py index 0a37b31736..f4422a5df1 100644 --- a/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py +++ b/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py @@ -1,25 +1,46 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 + import functools import glob import os +from dataclasses import dataclass -from cuda.pathfinder._dynamic_libs.load_dl_common import DynamicLibNotFoundError as DynamicLibNotFoundError +from cuda.pathfinder._dynamic_libs.load_dl_common import DynamicLibNotFoundError from cuda.pathfinder._dynamic_libs.supported_nvidia_libs import IS_WINDOWS from cuda.pathfinder._utils.env_vars import get_cuda_home_or_path from cuda.pathfinder._utils.find_sub_dirs import find_sub_dirs_all_sitepackages -# Site-package paths for libdevice (following SITE_PACKAGES_LIBDIRS pattern) -SITE_PACKAGES_LIBDEVICE_DIRS = ( - "nvidia/cu13/nvvm/libdevice", # CTK 13+ - "nvidia/cuda_nvcc/nvvm/libdevice", # CTK <13 -) -FILENAME = "libdevice.10.bc" +class BitcodeLibNotFoundError(DynamicLibNotFoundError): + """Raised when a bitcode library cannot be found.""" + pass + + +@dataclass(frozen=True) +class LocatedBitcodeLib: + """Information about a located bitcode library. + """ + name: str + abs_path: str + filename: str + + +SUPPORTED_BITCODE_LIBS = { + "device": { + "filename": "libdevice.10.bc", + "rel_path": os.path.join("nvvm", "libdevice"), + "site_packages_dirs": ( + "nvidia/cu13/nvvm/libdevice", # CTK 13+ + "nvidia/cuda_nvcc/nvvm/libdevice", # CTK <13 + ), + }, +} + if IS_WINDOWS: - bases = [r"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA", r"C:\CUDA"] + _COMMON_BASES = [r"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA", r"C:\CUDA"] else: - bases = ["/usr/local/cuda", "/opt/cuda"] + _COMMON_BASES = ["/usr/local/cuda", "/opt/cuda"] def _no_such_file_in_dir(dir_path: str, filename: str, error_messages: list[str], attachments: list[str]) -> None: @@ -32,19 +53,26 @@ def _no_such_file_in_dir(dir_path: str, filename: str, error_messages: list[str] attachments.append(f' Directory does not exist: "{dir_path}"') -class _FindLibdevice: - REL_PATH = os.path.join("nvvm", "libdevice") - - def __init__(self) -> None: +class _FindBitcodeLib: + def __init__(self, name: str) -> None: + if name not in SUPPORTED_BITCODE_LIBS: + raise ValueError( + f"Unknown bitcode library: '{name}'. " + f"Supported: {', '.join(sorted(SUPPORTED_BITCODE_LIBS.keys()))}" + ) + self.name = name + self.config = SUPPORTED_BITCODE_LIBS[name] + self.filename = self.config["filename"] + self.rel_path = self.config["rel_path"] + self.site_packages_dirs = self.config["site_packages_dirs"] self.error_messages: list[str] = [] self.attachments: list[str] = [] - self.abs_path: str | None = None def try_site_packages(self) -> str | None: - for rel_dir in SITE_PACKAGES_LIBDEVICE_DIRS: + for rel_dir in self.site_packages_dirs: sub_dir = tuple(rel_dir.split("/")) for abs_dir in find_sub_dirs_all_sitepackages(sub_dir): - file_path = os.path.join(abs_dir, FILENAME) + file_path = os.path.join(abs_dir, self.filename) if os.path.isfile(file_path): return file_path return None @@ -55,7 +83,7 @@ def try_with_conda_prefix(self) -> str | None: return None anchor = os.path.join(conda_prefix, "Library") if IS_WINDOWS else conda_prefix - file_path = os.path.join(anchor, self.REL_PATH, FILENAME) + file_path = os.path.join(anchor, self.rel_path, self.filename) if os.path.isfile(file_path): return file_path return None @@ -66,29 +94,26 @@ def try_with_cuda_home(self) -> str | None: self.error_messages.append("CUDA_HOME/CUDA_PATH not set") return None - file_path = os.path.join(cuda_home, self.REL_PATH, FILENAME) + file_path = os.path.join(cuda_home, self.rel_path, self.filename) if os.path.isfile(file_path): return file_path _no_such_file_in_dir( - os.path.join(cuda_home, self.REL_PATH), - FILENAME, + os.path.join(cuda_home, self.rel_path), + self.filename, self.error_messages, self.attachments, ) return None def try_common_paths(self) -> str | None: - - for base in bases: - # Direct path - file_path = os.path.join(base, self.REL_PATH, FILENAME) + for base in _COMMON_BASES: + file_path = os.path.join(base, self.rel_path, self.filename) if os.path.isfile(file_path): return file_path - # Versioned paths (e.g., /usr/local/cuda-13.0) for versioned in sorted(glob.glob(base + "*"), reverse=True): if os.path.isdir(versioned): - file_path = os.path.join(versioned, self.REL_PATH, FILENAME) + file_path = os.path.join(versioned, self.rel_path, self.filename) if os.path.isfile(file_path): return file_path return None @@ -96,12 +121,12 @@ def try_common_paths(self) -> str | None: def raise_not_found_error(self) -> None: err = ", ".join(self.error_messages) if self.error_messages else "No search paths available" att = "\n".join(self.attachments) if self.attachments else "" - raise DynamicLibNotFoundError(f'Failure finding "{FILENAME}": {err}\n{att}') + raise BitcodeLibNotFoundError(f'Failure finding "{self.filename}": {err}\n{att}') -def get_libdevice_path() -> str | None: - """Get the path to libdevice*.bc, or None if not found.""" - finder = _FindLibdevice() +def locate_bitcode_lib(name: str) -> LocatedBitcodeLib | None: + """Locate a bitcode library by name.""" + finder = _FindBitcodeLib(name) abs_path = finder.try_site_packages() if abs_path is None: @@ -111,16 +136,22 @@ def get_libdevice_path() -> str | None: if abs_path is None: abs_path = finder.try_common_paths() - return abs_path + if abs_path is None: + return None + + return LocatedBitcodeLib( + name=name, + abs_path=abs_path, + filename=finder.filename, + ) @functools.cache -def find_libdevice() -> str: - """Find the path to libdevice*.bc. - Raises: - DynamicLibNotFoundError: If libdevice.10.bc cannot be found - """ - path_or_none = get_libdevice_path() - if path_or_none is None: - raise DynamicLibNotFoundError(f"{FILENAME} not found") - return path_or_none +def find_bitcode_lib(name: str) -> str: + """Find the absolute path to a bitcode library.""" + result = locate_bitcode_lib(name) + if result is None: + config = SUPPORTED_BITCODE_LIBS.get(name, {}) + filename = config.get("filename", name) + raise BitcodeLibNotFoundError(f"{filename} not found") + return result.abs_path diff --git a/cuda_pathfinder/tests/test_find_libdevice.py b/cuda_pathfinder/tests/test_find_libdevice.py index a1dfc0b64f..d93929991f 100644 --- a/cuda_pathfinder/tests/test_find_libdevice.py +++ b/cuda_pathfinder/tests/test_find_libdevice.py @@ -5,20 +5,18 @@ import pytest -from cuda.pathfinder import find_libdevice -from cuda.pathfinder._static_libs import find_libdevice as find_libdevice_module +import cuda.pathfinder._static_libs.find_libdevice as find_libdevice_module FILENAME = "libdevice.10.bc" SITE_PACKAGES_REL_DIR_CUDA12 = "nvidia/cuda_nvcc/nvvm/libdevice" SITE_PACKAGES_REL_DIR_CUDA13 = "nvidia/cuda_nvvm/nvvm/libdevice" - @pytest.fixture def clear_find_libdevice_cache(): - find_libdevice.cache_clear() + find_libdevice_module.find_bitcode_lib.cache_clear() yield - find_libdevice.cache_clear() + find_libdevice_module.find_bitcode_lib.cache_clear() def _make_libdevice_file(dir_path: str) -> str: @@ -44,10 +42,13 @@ def test_find_libdevice_via_site_packages(monkeypatch, mocker, tmp_path, rel_dir monkeypatch.delenv("CUDA_HOME", raising=False) monkeypatch.delenv("CUDA_PATH", raising=False) - result = find_libdevice() + result = find_libdevice_module.locate_bitcode_lib("device") - assert result == expected_path - assert os.path.isfile(result) + assert result is not None + assert result.abs_path == expected_path + assert result.name == "device" + assert result.filename == FILENAME + assert os.path.isfile(result.abs_path) # same for cu12/cu13 @@ -67,11 +68,11 @@ def test_find_libdevice_via_conda(monkeypatch, mocker, tmp_path): monkeypatch.delenv("CUDA_HOME", raising=False) monkeypatch.delenv("CUDA_PATH", raising=False) - result = find_libdevice() - - assert result == expected_path - assert os.path.isfile(result) + result = find_libdevice_module.locate_bitcode_lib("device") + assert result is not None + assert result.abs_path == expected_path + assert os.path.isfile(result.abs_path) @pytest.mark.usefixtures("clear_find_libdevice_cache") def test_find_libdevice_via_cuda_home(monkeypatch, mocker, tmp_path): @@ -88,7 +89,33 @@ def test_find_libdevice_via_cuda_home(monkeypatch, mocker, tmp_path): monkeypatch.setenv("CUDA_HOME", str(tmp_path)) monkeypatch.delenv("CUDA_PATH", raising=False) - result = find_libdevice() + result = find_libdevice_module.locate_bitcode_lib("device") + + assert result is not None + assert result.abs_path == expected_path + assert os.path.isfile(result.abs_path) + +@pytest.mark.usefixtures("clear_find_libdevice_cache") +def test_find_bitcode_lib_returns_path(monkeypatch, mocker, tmp_path): + rel_path = os.path.join("nvvm", "libdevice") + libdevice_dir = tmp_path / rel_path + expected_path = str(_make_libdevice_file(str(libdevice_dir))) + + mocker.patch.object( + find_libdevice_module, + "find_sub_dirs_all_sitepackages", + return_value=[str(libdevice_dir)], + ) + monkeypatch.delenv("CONDA_PREFIX", raising=False) + monkeypatch.delenv("CUDA_HOME", raising=False) + monkeypatch.delenv("CUDA_PATH", raising=False) + + result = find_libdevice_module.find_bitcode_lib("device") assert result == expected_path - assert os.path.isfile(result) + assert isinstance(result, str) + + +def test_find_bitcode_lib_invalid_name(): + with pytest.raises(ValueError, match="Unknown bitcode library"): + find_libdevice_module.locate_bitcode_lib("invalid") \ No newline at end of file From 68b33a219c22ce24de971c89c7fa1babea6bf6d4 Mon Sep 17 00:00:00 2001 From: abhilash1910 Date: Fri, 13 Feb 2026 09:27:07 +0000 Subject: [PATCH 44/50] refresh --- .../cuda/pathfinder/_static_libs/find_libdevice.py | 10 +++++----- cuda_pathfinder/tests/test_find_libdevice.py | 7 +++++-- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py b/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py index f4422a5df1..727ab1577f 100644 --- a/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py +++ b/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py @@ -14,13 +14,14 @@ class BitcodeLibNotFoundError(DynamicLibNotFoundError): """Raised when a bitcode library cannot be found.""" + pass @dataclass(frozen=True) class LocatedBitcodeLib: - """Information about a located bitcode library. - """ + """Information about a located bitcode library.""" + name: str abs_path: str filename: str @@ -31,7 +32,7 @@ class LocatedBitcodeLib: "filename": "libdevice.10.bc", "rel_path": os.path.join("nvvm", "libdevice"), "site_packages_dirs": ( - "nvidia/cu13/nvvm/libdevice", # CTK 13+ + "nvidia/cu13/nvvm/libdevice", # CTK 13+ "nvidia/cuda_nvcc/nvvm/libdevice", # CTK <13 ), }, @@ -57,8 +58,7 @@ class _FindBitcodeLib: def __init__(self, name: str) -> None: if name not in SUPPORTED_BITCODE_LIBS: raise ValueError( - f"Unknown bitcode library: '{name}'. " - f"Supported: {', '.join(sorted(SUPPORTED_BITCODE_LIBS.keys()))}" + f"Unknown bitcode library: '{name}'. Supported: {', '.join(sorted(SUPPORTED_BITCODE_LIBS.keys()))}" ) self.name = name self.config = SUPPORTED_BITCODE_LIBS[name] diff --git a/cuda_pathfinder/tests/test_find_libdevice.py b/cuda_pathfinder/tests/test_find_libdevice.py index d93929991f..78ed1dbe21 100644 --- a/cuda_pathfinder/tests/test_find_libdevice.py +++ b/cuda_pathfinder/tests/test_find_libdevice.py @@ -12,6 +12,7 @@ SITE_PACKAGES_REL_DIR_CUDA12 = "nvidia/cuda_nvcc/nvvm/libdevice" SITE_PACKAGES_REL_DIR_CUDA13 = "nvidia/cuda_nvvm/nvvm/libdevice" + @pytest.fixture def clear_find_libdevice_cache(): find_libdevice_module.find_bitcode_lib.cache_clear() @@ -74,6 +75,7 @@ def test_find_libdevice_via_conda(monkeypatch, mocker, tmp_path): assert result.abs_path == expected_path assert os.path.isfile(result.abs_path) + @pytest.mark.usefixtures("clear_find_libdevice_cache") def test_find_libdevice_via_cuda_home(monkeypatch, mocker, tmp_path): rel_path = os.path.join("nvvm", "libdevice") @@ -94,7 +96,8 @@ def test_find_libdevice_via_cuda_home(monkeypatch, mocker, tmp_path): assert result is not None assert result.abs_path == expected_path assert os.path.isfile(result.abs_path) - + + @pytest.mark.usefixtures("clear_find_libdevice_cache") def test_find_bitcode_lib_returns_path(monkeypatch, mocker, tmp_path): rel_path = os.path.join("nvvm", "libdevice") @@ -118,4 +121,4 @@ def test_find_bitcode_lib_returns_path(monkeypatch, mocker, tmp_path): def test_find_bitcode_lib_invalid_name(): with pytest.raises(ValueError, match="Unknown bitcode library"): - find_libdevice_module.locate_bitcode_lib("invalid") \ No newline at end of file + find_libdevice_module.locate_bitcode_lib("invalid") From 631a1138f5a13a58b1f6024b5e9d8ed4019ee136 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 13 Feb 2026 09:29:01 +0000 Subject: [PATCH 45/50] [pre-commit.ci] auto code formatting --- cuda_pathfinder/cuda/pathfinder/__init__.py | 6 ++++++ .../cuda/pathfinder/_static_libs/find_libdevice.py | 2 -- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/cuda_pathfinder/cuda/pathfinder/__init__.py b/cuda_pathfinder/cuda/pathfinder/__init__.py index cc1eda8d2d..80dff6d486 100644 --- a/cuda_pathfinder/cuda/pathfinder/__init__.py +++ b/cuda_pathfinder/cuda/pathfinder/__init__.py @@ -21,8 +21,14 @@ from cuda.pathfinder._headers.supported_nvidia_headers import SUPPORTED_HEADERS_CTK as _SUPPORTED_HEADERS_CTK from cuda.pathfinder._static_libs.find_libdevice import ( BitcodeLibNotFoundError as BitcodeLibNotFoundError, +) +from cuda.pathfinder._static_libs.find_libdevice import ( LocatedBitcodeLib as LocatedBitcodeLib, +) +from cuda.pathfinder._static_libs.find_libdevice import ( find_bitcode_lib as find_bitcode_lib, +) +from cuda.pathfinder._static_libs.find_libdevice import ( locate_bitcode_lib as locate_bitcode_lib, ) diff --git a/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py b/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py index 727ab1577f..a45f03a19b 100644 --- a/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py +++ b/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py @@ -15,8 +15,6 @@ class BitcodeLibNotFoundError(DynamicLibNotFoundError): """Raised when a bitcode library cannot be found.""" - pass - @dataclass(frozen=True) class LocatedBitcodeLib: From 7a02aee8c974070e0513fc098cc31480f9217903 Mon Sep 17 00:00:00 2001 From: abhilash1910 Date: Fri, 13 Feb 2026 09:36:23 +0000 Subject: [PATCH 46/50] fix mypy errirs --- .../pathfinder/_static_libs/find_libdevice.py | 31 ++++++++++++------- 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py b/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py index a45f03a19b..7e3c1a7805 100644 --- a/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py +++ b/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py @@ -5,6 +5,7 @@ import glob import os from dataclasses import dataclass +from typing import TypedDict from cuda.pathfinder._dynamic_libs.load_dl_common import DynamicLibNotFoundError from cuda.pathfinder._dynamic_libs.supported_nvidia_libs import IS_WINDOWS @@ -25,21 +26,27 @@ class LocatedBitcodeLib: filename: str -SUPPORTED_BITCODE_LIBS = { +class _BitcodeLibConfig(TypedDict): + filename: str + rel_path: str + site_packages_dirs: tuple[str, ...] + + +SUPPORTED_BITCODE_LIBS: dict[str, _BitcodeLibConfig] = { "device": { "filename": "libdevice.10.bc", "rel_path": os.path.join("nvvm", "libdevice"), "site_packages_dirs": ( - "nvidia/cu13/nvvm/libdevice", # CTK 13+ - "nvidia/cuda_nvcc/nvvm/libdevice", # CTK <13 + "nvidia/cu13/nvvm/libdevice", + "nvidia/cuda_nvcc/nvvm/libdevice", ), }, } if IS_WINDOWS: - _COMMON_BASES = [r"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA", r"C:\CUDA"] + _COMMON_BASES: list[str] = [r"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA", r"C:\CUDA"] else: - _COMMON_BASES = ["/usr/local/cuda", "/opt/cuda"] + _COMMON_BASES: list[str] = ["/usr/local/cuda", "/opt/cuda"] def _no_such_file_in_dir(dir_path: str, filename: str, error_messages: list[str], attachments: list[str]) -> None: @@ -58,11 +65,11 @@ def __init__(self, name: str) -> None: raise ValueError( f"Unknown bitcode library: '{name}'. Supported: {', '.join(sorted(SUPPORTED_BITCODE_LIBS.keys()))}" ) - self.name = name - self.config = SUPPORTED_BITCODE_LIBS[name] - self.filename = self.config["filename"] - self.rel_path = self.config["rel_path"] - self.site_packages_dirs = self.config["site_packages_dirs"] + self.name: str = name + self.config: _BitcodeLibConfig = SUPPORTED_BITCODE_LIBS[name] + self.filename: str = self.config["filename"] + self.rel_path: str = self.config["rel_path"] + self.site_packages_dirs: tuple[str, ...] = self.config["site_packages_dirs"] self.error_messages: list[str] = [] self.attachments: list[str] = [] @@ -149,7 +156,7 @@ def find_bitcode_lib(name: str) -> str: """Find the absolute path to a bitcode library.""" result = locate_bitcode_lib(name) if result is None: - config = SUPPORTED_BITCODE_LIBS.get(name, {}) - filename = config.get("filename", name) + config = SUPPORTED_BITCODE_LIBS.get(name) + filename = config["filename"] if config else name raise BitcodeLibNotFoundError(f"{filename} not found") return result.abs_path From b8f2eb076e2b703bae7883af6c13842c7c218ea2 Mon Sep 17 00:00:00 2001 From: abhilash1910 Date: Fri, 13 Feb 2026 09:42:26 +0000 Subject: [PATCH 47/50] fix base var declaration --- .../cuda/pathfinder/_static_libs/find_libdevice.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py b/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py index 7e3c1a7805..5d51f733d8 100644 --- a/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py +++ b/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py @@ -43,10 +43,11 @@ class _BitcodeLibConfig(TypedDict): }, } -if IS_WINDOWS: - _COMMON_BASES: list[str] = [r"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA", r"C:\CUDA"] -else: - _COMMON_BASES: list[str] = ["/usr/local/cuda", "/opt/cuda"] +_COMMON_BASES: list[str] = ( + [r"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA", r"C:\CUDA"] + if IS_WINDOWS + else ["/usr/local/cuda", "/opt/cuda"] +) def _no_such_file_in_dir(dir_path: str, filename: str, error_messages: list[str], attachments: list[str]) -> None: From 434b3c904c84cdbbd511fe64d6b2ed38be5d34d5 Mon Sep 17 00:00:00 2001 From: abhilash1910 Date: Fri, 13 Feb 2026 09:49:58 +0000 Subject: [PATCH 48/50] format changes --- .../cuda/pathfinder/_static_libs/find_libdevice.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py b/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py index 5d51f733d8..102b3b9f77 100644 --- a/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py +++ b/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py @@ -13,6 +13,13 @@ from cuda.pathfinder._utils.find_sub_dirs import find_sub_dirs_all_sitepackages +_COMMON_BASES: list[str] = ( + [r"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA", r"C:\CUDA"] + if IS_WINDOWS + else ["/usr/local/cuda", "/opt/cuda"] +) + + class BitcodeLibNotFoundError(DynamicLibNotFoundError): """Raised when a bitcode library cannot be found.""" @@ -43,12 +50,6 @@ class _BitcodeLibConfig(TypedDict): }, } -_COMMON_BASES: list[str] = ( - [r"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA", r"C:\CUDA"] - if IS_WINDOWS - else ["/usr/local/cuda", "/opt/cuda"] -) - def _no_such_file_in_dir(dir_path: str, filename: str, error_messages: list[str], attachments: list[str]) -> None: error_messages.append(f"No such file: {os.path.join(dir_path, filename)}") From 9f0a3199978b6c042c0c35f0dd6121621a24344a Mon Sep 17 00:00:00 2001 From: abhilash1910 Date: Fri, 13 Feb 2026 09:51:56 +0000 Subject: [PATCH 49/50] format changes --- .../cuda/pathfinder/_static_libs/find_libdevice.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py b/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py index 102b3b9f77..5d51f733d8 100644 --- a/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py +++ b/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py @@ -13,13 +13,6 @@ from cuda.pathfinder._utils.find_sub_dirs import find_sub_dirs_all_sitepackages -_COMMON_BASES: list[str] = ( - [r"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA", r"C:\CUDA"] - if IS_WINDOWS - else ["/usr/local/cuda", "/opt/cuda"] -) - - class BitcodeLibNotFoundError(DynamicLibNotFoundError): """Raised when a bitcode library cannot be found.""" @@ -50,6 +43,12 @@ class _BitcodeLibConfig(TypedDict): }, } +_COMMON_BASES: list[str] = ( + [r"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA", r"C:\CUDA"] + if IS_WINDOWS + else ["/usr/local/cuda", "/opt/cuda"] +) + def _no_such_file_in_dir(dir_path: str, filename: str, error_messages: list[str], attachments: list[str]) -> None: error_messages.append(f"No such file: {os.path.join(dir_path, filename)}") From f499487a0ea6a2f63553d0d31802dd8759837020 Mon Sep 17 00:00:00 2001 From: abhilash1910 Date: Mon, 16 Feb 2026 06:27:50 +0000 Subject: [PATCH 50/50] rename to bitcodelib --- cuda_pathfinder/cuda/pathfinder/__init__.py | 8 +-- ...{find_libdevice.py => find_bitcode_lib.py} | 0 ..._libdevice.py => test_find_bitcode_lib.py} | 64 +++++++++---------- 3 files changed, 36 insertions(+), 36 deletions(-) rename cuda_pathfinder/cuda/pathfinder/_static_libs/{find_libdevice.py => find_bitcode_lib.py} (100%) rename cuda_pathfinder/tests/{test_find_libdevice.py => test_find_bitcode_lib.py} (57%) diff --git a/cuda_pathfinder/cuda/pathfinder/__init__.py b/cuda_pathfinder/cuda/pathfinder/__init__.py index 80dff6d486..29a3ff4979 100644 --- a/cuda_pathfinder/cuda/pathfinder/__init__.py +++ b/cuda_pathfinder/cuda/pathfinder/__init__.py @@ -19,16 +19,16 @@ locate_nvidia_header_directory as locate_nvidia_header_directory, ) from cuda.pathfinder._headers.supported_nvidia_headers import SUPPORTED_HEADERS_CTK as _SUPPORTED_HEADERS_CTK -from cuda.pathfinder._static_libs.find_libdevice import ( +from cuda.pathfinder._static_libs.find_bitcode_lib import ( BitcodeLibNotFoundError as BitcodeLibNotFoundError, ) -from cuda.pathfinder._static_libs.find_libdevice import ( +from cuda.pathfinder._static_libs.find_bitcode_lib import ( LocatedBitcodeLib as LocatedBitcodeLib, ) -from cuda.pathfinder._static_libs.find_libdevice import ( +from cuda.pathfinder._static_libs.find_bitcode_lib import ( find_bitcode_lib as find_bitcode_lib, ) -from cuda.pathfinder._static_libs.find_libdevice import ( +from cuda.pathfinder._static_libs.find_bitcode_lib import ( locate_bitcode_lib as locate_bitcode_lib, ) diff --git a/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py b/cuda_pathfinder/cuda/pathfinder/_static_libs/find_bitcode_lib.py similarity index 100% rename from cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py rename to cuda_pathfinder/cuda/pathfinder/_static_libs/find_bitcode_lib.py diff --git a/cuda_pathfinder/tests/test_find_libdevice.py b/cuda_pathfinder/tests/test_find_bitcode_lib.py similarity index 57% rename from cuda_pathfinder/tests/test_find_libdevice.py rename to cuda_pathfinder/tests/test_find_bitcode_lib.py index 78ed1dbe21..b9ab71529f 100644 --- a/cuda_pathfinder/tests/test_find_libdevice.py +++ b/cuda_pathfinder/tests/test_find_bitcode_lib.py @@ -5,7 +5,7 @@ import pytest -import cuda.pathfinder._static_libs.find_libdevice as find_libdevice_module +import cuda.pathfinder._static_libs.find_bitcode_lib as find_bitcode_lib_module FILENAME = "libdevice.10.bc" @@ -14,13 +14,13 @@ @pytest.fixture -def clear_find_libdevice_cache(): - find_libdevice_module.find_bitcode_lib.cache_clear() +def clear_find_bitcode_lib_cache(): + find_bitcode_lib_module.find_bitcode_lib.cache_clear() yield - find_libdevice_module.find_bitcode_lib.cache_clear() + find_bitcode_lib_module.find_bitcode_lib.cache_clear() -def _make_libdevice_file(dir_path: str) -> str: +def _make_bitcode_lib_file(dir_path: str) -> str: os.makedirs(dir_path, exist_ok=True) file_path = os.path.join(dir_path, FILENAME) with open(file_path, "wb"): @@ -29,21 +29,21 @@ def _make_libdevice_file(dir_path: str) -> str: @pytest.mark.parametrize("rel_dir", [SITE_PACKAGES_REL_DIR_CUDA12, SITE_PACKAGES_REL_DIR_CUDA13]) -@pytest.mark.usefixtures("clear_find_libdevice_cache") -def test_find_libdevice_via_site_packages(monkeypatch, mocker, tmp_path, rel_dir): - libdevice_dir = tmp_path.joinpath(*rel_dir.split("/")) - expected_path = str(_make_libdevice_file(str(libdevice_dir))) +@pytest.mark.usefixtures("clear_find_bitcode_lib_cache") +def test_find_bitcode_lib_via_site_packages(monkeypatch, mocker, tmp_path, rel_dir): + bitcode_lib_dir = tmp_path.joinpath(*rel_dir.split("/")) + expected_path = str(_make_bitcode_lib_file(str(bitcode_lib_dir))) mocker.patch.object( - find_libdevice_module, + find_bitcode_lib_module, "find_sub_dirs_all_sitepackages", - return_value=[str(libdevice_dir)], + return_value=[str(bitcode_lib_dir)], ) monkeypatch.delenv("CONDA_PREFIX", raising=False) monkeypatch.delenv("CUDA_HOME", raising=False) monkeypatch.delenv("CUDA_PATH", raising=False) - result = find_libdevice_module.locate_bitcode_lib("device") + result = find_bitcode_lib_module.locate_bitcode_lib("device") assert result is not None assert result.abs_path == expected_path @@ -53,15 +53,15 @@ def test_find_libdevice_via_site_packages(monkeypatch, mocker, tmp_path, rel_dir # same for cu12/cu13 -@pytest.mark.usefixtures("clear_find_libdevice_cache") -def test_find_libdevice_via_conda(monkeypatch, mocker, tmp_path): +@pytest.mark.usefixtures("clear_find_bitcode_lib_cache") +def test_find_bitcode_lib_via_conda(monkeypatch, mocker, tmp_path): rel_path = os.path.join("nvvm", "libdevice") - libdevice_dir = tmp_path / rel_path - expected_path = str(_make_libdevice_file(str(libdevice_dir))) + bitcode_lib_dir = tmp_path / rel_path + expected_path = str(_make_bitcode_lib_file(str(bitcode_lib_dir))) - mocker.patch.object(find_libdevice_module, "IS_WINDOWS", False) + mocker.patch.object(find_bitcode_lib_module, "IS_WINDOWS", False) mocker.patch.object( - find_libdevice_module, + find_bitcode_lib_module, "find_sub_dirs_all_sitepackages", return_value=[], ) @@ -69,21 +69,21 @@ def test_find_libdevice_via_conda(monkeypatch, mocker, tmp_path): monkeypatch.delenv("CUDA_HOME", raising=False) monkeypatch.delenv("CUDA_PATH", raising=False) - result = find_libdevice_module.locate_bitcode_lib("device") + result = find_bitcode_lib_module.locate_bitcode_lib("device") assert result is not None assert result.abs_path == expected_path assert os.path.isfile(result.abs_path) -@pytest.mark.usefixtures("clear_find_libdevice_cache") -def test_find_libdevice_via_cuda_home(monkeypatch, mocker, tmp_path): +@pytest.mark.usefixtures("clear_find_bitcode_lib_cache") +def test_find_bitcode_lib_via_cuda_home(monkeypatch, mocker, tmp_path): rel_path = os.path.join("nvvm", "libdevice") - libdevice_dir = tmp_path / rel_path - expected_path = str(_make_libdevice_file(str(libdevice_dir))) + bitcode_lib_dir = tmp_path / rel_path + expected_path = str(_make_bitcode_lib_file(str(bitcode_lib_dir))) mocker.patch.object( - find_libdevice_module, + find_bitcode_lib_module, "find_sub_dirs_all_sitepackages", return_value=[], ) @@ -91,29 +91,29 @@ def test_find_libdevice_via_cuda_home(monkeypatch, mocker, tmp_path): monkeypatch.setenv("CUDA_HOME", str(tmp_path)) monkeypatch.delenv("CUDA_PATH", raising=False) - result = find_libdevice_module.locate_bitcode_lib("device") + result = find_bitcode_lib_module.locate_bitcode_lib("device") assert result is not None assert result.abs_path == expected_path assert os.path.isfile(result.abs_path) -@pytest.mark.usefixtures("clear_find_libdevice_cache") +@pytest.mark.usefixtures("clear_find_bitcode_lib_cache") def test_find_bitcode_lib_returns_path(monkeypatch, mocker, tmp_path): rel_path = os.path.join("nvvm", "libdevice") - libdevice_dir = tmp_path / rel_path - expected_path = str(_make_libdevice_file(str(libdevice_dir))) + bitcode_lib_dir = tmp_path / rel_path + expected_path = str(_make_bitcode_lib_file(str(bitcode_lib_dir))) mocker.patch.object( - find_libdevice_module, + find_bitcode_lib_module, "find_sub_dirs_all_sitepackages", - return_value=[str(libdevice_dir)], + return_value=[str(bitcode_lib_dir)], ) monkeypatch.delenv("CONDA_PREFIX", raising=False) monkeypatch.delenv("CUDA_HOME", raising=False) monkeypatch.delenv("CUDA_PATH", raising=False) - result = find_libdevice_module.find_bitcode_lib("device") + result = find_bitcode_lib_module.find_bitcode_lib("device") assert result == expected_path assert isinstance(result, str) @@ -121,4 +121,4 @@ def test_find_bitcode_lib_returns_path(monkeypatch, mocker, tmp_path): def test_find_bitcode_lib_invalid_name(): with pytest.raises(ValueError, match="Unknown bitcode library"): - find_libdevice_module.locate_bitcode_lib("invalid") + find_bitcode_lib_module.locate_bitcode_lib("invalid")