# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
"""Compilation machinery for CUDA programs.
This module provides :class:`Program` for compiling source code into
:class:`~cuda.core.ObjectCode`, with :class:`ProgramOptions` for configuration.
"""
from __future__ import annotations
from dataclasses import dataclass
from warnings import warn
from cuda.bindings import driver, nvrtc
from libcpp.vector cimport vector
from ._resource_handles cimport (
as_cu,
as_py,
create_nvrtc_program_handle,
create_nvvm_program_handle,
)
from cuda.bindings cimport cynvrtc, cynvvm
from cuda.core._utils.cuda_utils cimport HANDLE_RETURN_NVRTC, HANDLE_RETURN_NVVM
from cuda.core._device import Device
from cuda.core._linker import Linker, LinkerHandleT, LinkerOptions
from cuda.core._module import ObjectCode
from cuda.core._utils.clear_error_support import assert_type
from cuda.core._utils.cuda_utils import (
CUDAError,
_handle_boolean_option,
check_or_create_options,
get_binding_version,
handle_return,
is_nested_sequence,
is_sequence,
)
__all__ = ["Program", "ProgramOptions"]
ProgramHandleT = nvrtc.nvrtcProgram | int | LinkerHandleT
"""Type alias for program handle types across different backends.
The ``int`` type covers NVVM handles, which don't have a wrapper class.
"""
# =============================================================================
# Principal Class
# =============================================================================
cdef class Program:
"""Represent a compilation machinery to process programs into
:class:`~cuda.core.ObjectCode`.
    This object provides a unified interface to multiple underlying compiler
    libraries, supporting several source code types (C++, PTX, NVVM IR) and
    compilation targets (PTX, cubin, LTO IR).
Parameters
----------
code : str | bytes | bytearray
The source code to compile. For C++ and PTX, must be a string.
For NVVM IR, can be str, bytes, or bytearray.
code_type : str
The type of source code. Must be one of ``"c++"``, ``"ptx"``, or ``"nvvm"``.
options : :class:`ProgramOptions`, optional
Options to customize the compilation process.
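
    Examples
    --------
    A minimal sketch compiling a trivial C++ kernel (the kernel name and the
    ``sm_80`` architecture are illustrative; pick values matching your setup):

    >>> code = 'extern "C" __global__ void ker() {}'
    >>> prog = Program(code, code_type="c++", options=ProgramOptions(arch="sm_80"))
    >>> mod = prog.compile("cubin")
    >>> prog.close()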
"""
def __init__(self, code: str | bytes | bytearray, code_type: str, options: ProgramOptions | None = None):
Program_init(self, code, code_type, options)
def close(self):
"""Destroy this program."""
if self._linker:
self._linker.close()
# Reset handles - the C++ shared_ptr destructor handles cleanup
self._h_nvrtc.reset()
self._h_nvvm.reset()
def compile(
self, target_type: str, name_expressions: tuple | list = (), logs = None
) -> ObjectCode:
"""Compile the program to the specified target type.
Parameters
----------
target_type : str
The compilation target. Must be one of ``"ptx"``, ``"cubin"``, or ``"ltoir"``.
name_expressions : tuple | list, optional
Sequence of name expressions to make accessible in the compiled code.
Used for template instantiation and similar cases.
logs : object, optional
Object with a ``write`` method to receive compilation logs.
Returns
-------
:class:`~cuda.core.ObjectCode`
The compiled object code.
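
        Examples
        --------
        A sketch that also captures the compilation log (the source string and
        ``sm_80`` architecture are illustrative):

        >>> import io
        >>> prog = Program('extern "C" __global__ void k() {}', "c++",
        ...                options=ProgramOptions(arch="sm_80"))
        >>> log_buffer = io.StringIO()
        >>> mod = prog.compile("ptx", logs=log_buffer)
        >>> diagnostics = log_buffer.getvalue()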
"""
return Program_compile(self, target_type, name_expressions, logs)
@property
def backend(self) -> str:
"""Return this Program instance's underlying backend."""
return self._backend
@property
def handle(self) -> ProgramHandleT:
"""Return the underlying handle object.
.. note::
The type of the returned object depends on the backend.
.. caution::
This handle is a Python object. To get the memory address of the underlying C
handle, call ``int(Program.handle)``.
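
        Examples
        --------
        A sketch, assuming ``prog`` is an NVRTC-backed :class:`Program`:

        >>> raw_address = int(prog.handle)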
"""
if self._backend == "NVRTC":
return as_py(self._h_nvrtc)
elif self._backend == "NVVM":
return as_py(self._h_nvvm) # returns int (NVVM uses raw integers)
else:
return self._linker.handle
def __repr__(self) -> str:
return f"<Program backend='{self._backend}'>"
# =============================================================================
# Other Public Classes
# =============================================================================
@dataclass
class ProgramOptions:
"""Customizable options for configuring :class:`Program`.
Attributes
----------
name : str, optional
Name of the program. If the compilation succeeds, the name is passed down to the generated `ObjectCode`.
arch : str, optional
Pass the SM architecture value, such as ``sm_<CC>`` (for generating CUBIN) or
``compute_<CC>`` (for generating PTX). If not provided, the current device's architecture
will be used.
relocatable_device_code : bool, optional
Enable (disable) the generation of relocatable device code.
Default: False
extensible_whole_program : bool, optional
Do extensible whole program compilation of device code.
Default: False
debug : bool, optional
        Generate debug information. If --dopt is not specified, debug mode turns off all optimizations.
Default: False
lineinfo: bool, optional
Generate line-number information.
Default: False
device_code_optimize : bool, optional
Enable device code optimization. When specified along with '-G', enables limited debug information generation
for optimized device code.
Default: None
ptxas_options : Union[str, list[str]], optional
Specify one or more options directly to ptxas, the PTX optimizing assembler. Options should be strings.
For example ["-v", "-O2"].
Default: None
max_register_count : int, optional
Specify the maximum amount of registers that GPU functions can use.
Default: None
ftz : bool, optional
When performing single-precision floating-point operations, flush denormal values to zero or preserve denormal
values.
Default: False
prec_sqrt : bool, optional
For single-precision floating-point square root, use IEEE round-to-nearest mode or use a faster approximation.
Default: True
prec_div : bool, optional
For single-precision floating-point division and reciprocals, use IEEE round-to-nearest mode or use a faster
approximation.
Default: True
fma : bool, optional
Enables (disables) the contraction of floating-point multiplies and adds/subtracts into floating-point
multiply-add operations.
Default: True
use_fast_math : bool, optional
Make use of fast math operations.
Default: False
extra_device_vectorization : bool, optional
Enables more aggressive device code vectorization in the NVVM optimizer.
Default: False
link_time_optimization : bool, optional
Generate intermediate code for later link-time optimization.
Default: False
gen_opt_lto : bool, optional
Run the optimizer passes before generating the LTO IR.
Default: False
define_macro : Union[str, tuple[str, str], list[Union[str, tuple[str, str]]]], optional
        Predefine a macro. Can be a string (the macro is defined as 1), a 2-element tuple of strings (the first
        element is defined as the second, e.g. ``("TILE", "32")`` defines ``TILE`` as ``32``), or a list of such
        strings and tuples.
Default: None
undefine_macro : Union[str, list[str]], optional
Cancel any previous definition of a macro, or list of macros.
Default: None
include_path : Union[str, list[str]], optional
Add the directory or directories to the list of directories to be searched for headers.
Default: None
pre_include : Union[str, list[str]], optional
Preinclude one or more headers during preprocessing. Can be either a string or a list of strings.
Default: None
no_source_include : bool, optional
Disable the default behavior of adding the directory of each input source to the include path.
Default: False
std : str, optional
Set language dialect to C++03, C++11, C++14, C++17 or C++20.
Default: c++17
builtin_move_forward : bool, optional
Provide builtin definitions of std::move and std::forward.
Default: True
builtin_initializer_list : bool, optional
Provide builtin definitions of std::initializer_list class and member functions.
Default: True
disable_warnings : bool, optional
Inhibit all warning messages.
Default: False
restrict : bool, optional
Programmer assertion that all kernel pointer parameters are restrict pointers.
Default: False
device_as_default_execution_space : bool, optional
Treat entities with no execution space annotation as __device__ entities.
Default: False
device_int128 : bool, optional
Allow the __int128 type in device code.
Default: False
optimization_info : str, optional
Provide optimization reports for the specified kind of optimization.
Default: None
no_display_error_number : bool, optional
Disable the display of a diagnostic number for warning messages.
Default: False
diag_error : Union[int, list[int]], optional
Emit error for a specified diagnostic message number or comma separated list of numbers.
Default: None
diag_suppress : Union[int, list[int]], optional
Suppress a specified diagnostic message number or comma separated list of numbers.
Default: None
diag_warn : Union[int, list[int]], optional
        Emit warning for a specified diagnostic message number or comma separated list of numbers.
Default: None
brief_diagnostics : bool, optional
Disable or enable showing source line and column info in a diagnostic.
Default: False
time : str, optional
Generate a CSV table with the time taken by each compilation phase.
Default: None
split_compile : int, optional
Perform compiler optimizations in parallel.
Default: 1
fdevice_syntax_only : bool, optional
Ends device compilation after front-end syntax checking.
Default: False
minimal : bool, optional
Omit certain language features to reduce compile time for small programs.
Default: False
no_cache : bool, optional
Disable compiler caching.
Default: False
fdevice_time_trace : str, optional
Generate time trace JSON for profiling compilation (NVRTC only).
Default: None
device_float128 : bool, optional
Allow __float128 type in device code (NVRTC only).
Default: False
frandom_seed : str, optional
Set random seed for randomized optimizations (NVRTC only).
Default: None
ofast_compile : str, optional
Fast compilation mode: "0", "min", "mid", or "max" (NVRTC only).
Default: None
pch : bool, optional
Use default precompiled header (NVRTC only, CUDA 12.8+).
Default: False
create_pch : str, optional
Create precompiled header file (NVRTC only, CUDA 12.8+).
Default: None
use_pch : str, optional
Use specific precompiled header file (NVRTC only, CUDA 12.8+).
Default: None
pch_dir : str, optional
PCH directory location (NVRTC only, CUDA 12.8+).
Default: None
pch_verbose : bool, optional
Verbose PCH output (NVRTC only, CUDA 12.8+).
Default: False
pch_messages : bool, optional
Control PCH diagnostic messages (NVRTC only, CUDA 12.8+).
Default: False
instantiate_templates_in_pch : bool, optional
Control template instantiation in PCH (NVRTC only, CUDA 12.8+).
Default: False
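
    Examples
    --------
    An illustrative configuration (the architecture, macro, and include path
    are assumptions; substitute values appropriate for your project):

    >>> opts = ProgramOptions(
    ...     arch="sm_90",
    ...     std="c++17",
    ...     define_macro=("TILE", "32"),
    ...     include_path="/usr/local/cuda/include",
    ... )
    >>> nvrtc_flags = opts.as_bytes("nvrtc")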
"""
name: str | None = "default_program"
arch: str | None = None
relocatable_device_code: bool | None = None
extensible_whole_program: bool | None = None
debug: bool | None = None
lineinfo: bool | None = None
device_code_optimize: bool | None = None
ptxas_options: str | list[str] | tuple[str] | None = None
max_register_count: int | None = None
ftz: bool | None = None
prec_sqrt: bool | None = None
prec_div: bool | None = None
fma: bool | None = None
use_fast_math: bool | None = None
extra_device_vectorization: bool | None = None
link_time_optimization: bool | None = None
gen_opt_lto: bool | None = None
define_macro: str | tuple[str, str] | list[str | tuple[str, str]] | tuple[str | tuple[str, str], ...] | None = None
undefine_macro: str | list[str] | tuple[str] | None = None
include_path: str | list[str] | tuple[str] | None = None
pre_include: str | list[str] | tuple[str] | None = None
no_source_include: bool | None = None
std: str | None = None
builtin_move_forward: bool | None = None
builtin_initializer_list: bool | None = None
disable_warnings: bool | None = None
restrict: bool | None = None
device_as_default_execution_space: bool | None = None
device_int128: bool | None = None
optimization_info: str | None = None
no_display_error_number: bool | None = None
diag_error: int | list[int] | tuple[int] | None = None
diag_suppress: int | list[int] | tuple[int] | None = None
diag_warn: int | list[int] | tuple[int] | None = None
brief_diagnostics: bool | None = None
time: str | None = None
split_compile: int | None = None
fdevice_syntax_only: bool | None = None
minimal: bool | None = None
no_cache: bool | None = None
fdevice_time_trace: str | None = None
device_float128: bool | None = None
frandom_seed: str | None = None
ofast_compile: str | None = None
pch: bool | None = None
create_pch: str | None = None
use_pch: str | None = None
pch_dir: str | None = None
pch_verbose: bool | None = None
pch_messages: bool | None = None
instantiate_templates_in_pch: bool | None = None
extra_sources: list[tuple[str, str | bytes | bytearray]] | tuple[tuple[str, str | bytes | bytearray], ...] | None = None
use_libdevice: bool | None = None # For libdevice execution
numba_debug: bool | None = None # Custom option for Numba debugging
def __post_init__(self):
self._name = self.name.encode()
# Set arch to default if not provided
if self.arch is None:
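            # e.g. an arch value of "90" (compute capability 9.0) becomes "sm_90"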
self.arch = f"sm_{Device().arch}"
def _prepare_nvrtc_options(self) -> list[bytes]:
return _prepare_nvrtc_options_impl(self)
def _prepare_nvvm_options(self, as_bytes: bool = True) -> list[bytes] | list[str]:
return _prepare_nvvm_options_impl(self, as_bytes)
def as_bytes(self, backend: str, target_type: str | None = None) -> list[bytes]:
"""Convert program options to bytes format for the specified backend.
This method transforms the program options into a format suitable for the
specified compiler backend. Different backends may use different option names
and formats even for the same conceptual options.
Parameters
----------
backend : str
The compiler backend to prepare options for. Must be either "nvrtc" or "nvvm".
target_type : str, optional
The compilation target type (e.g., "ptx", "cubin", "ltoir"). Some backends
require additional options based on the target type.
Returns
-------
list[bytes]
List of option strings encoded as bytes.
Raises
------
ValueError
If an unknown backend is specified.
CUDAError
If an option incompatible with the specified backend is set.
Examples
--------
>>> options = ProgramOptions(arch="sm_80", debug=True)
>>> nvrtc_options = options.as_bytes("nvrtc")
"""
backend = backend.lower()
if backend == "nvrtc":
return self._prepare_nvrtc_options()
elif backend == "nvvm":
options = self._prepare_nvvm_options(as_bytes=True)
if target_type == "ltoir" and b"-gen-lto" not in options:
options.append(b"-gen-lto")
return options
else:
raise ValueError(f"Unknown backend '{backend}'. Must be one of: 'nvrtc', 'nvvm'")
def __repr__(self):
return f"ProgramOptions(name={self.name!r}, arch={self.arch!r})"
# =============================================================================
# Private Classes and Helper Functions
# =============================================================================
# Module-level state for NVVM lazy loading
cdef object _nvvm_module = None
cdef bint _nvvm_import_attempted = False
def _get_nvvm_module():
"""Get the NVVM module, importing it lazily with availability checks."""
global _nvvm_module, _nvvm_import_attempted
if _nvvm_import_attempted:
if _nvvm_module is None:
raise RuntimeError("NVVM module is not available (previous import attempt failed)")
return _nvvm_module
_nvvm_import_attempted = True
try:
version = get_binding_version()
if version < (12, 9):
raise RuntimeError(
f"NVVM bindings require cuda-bindings >= 12.9.0, but found {version[0]}.{version[1]}.x. "
"Please update cuda-bindings to use NVVM features."
)
from cuda.bindings import nvvm
from cuda.bindings._internal.nvvm import _inspect_function_pointer
if _inspect_function_pointer("__nvvmCreateProgram") == 0:
raise RuntimeError("NVVM library (libnvvm) is not available in this Python environment. ")
_nvvm_module = nvvm
return _nvvm_module
except RuntimeError as e:
_nvvm_module = None
raise e
def _find_libdevice_path():
"""Find libdevice*.bc for NVVM compilation using cuda.pathfinder."""
from cuda.pathfinder import get_libdevice_path
return get_libdevice_path()
cdef inline bint _process_define_macro_inner(list options, object macro) except? -1:
"""Process a single define macro, returning True if successful."""
if isinstance(macro, str):
options.append(f"--define-macro={macro}")
return True
if isinstance(macro, tuple):
if len(macro) != 2 or any(not isinstance(val, str) for val in macro):
raise RuntimeError(f"Expected define_macro tuple[str, str], got {macro}")
options.append(f"--define-macro={macro[0]}={macro[1]}")
return True
return False
cdef inline void _process_define_macro(list options, object macro) except *:
"""Process define_macro option which can be str, tuple, or list thereof."""
union_type = "Union[str, tuple[str, str]]"
if _process_define_macro_inner(options, macro):
return
if is_nested_sequence(macro):
for seq_macro in macro:
if not _process_define_macro_inner(options, seq_macro):
raise RuntimeError(f"Expected define_macro {union_type}, got {seq_macro}")
return
raise RuntimeError(f"Expected define_macro {union_type}, list[{union_type}], got {macro}")
cpdef bint _can_load_generated_ptx() except? -1:
"""Check if the driver can load PTX generated by the current NVRTC version."""
driver_ver = handle_return(driver.cuDriverGetVersion())
nvrtc_major, nvrtc_minor = handle_return(nvrtc.nvrtcVersion())
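    # Both values use the CUDA encoding major * 1000 + minor * 10 (e.g. 12.4 -> 12040),
    # so an older driver reports a smaller number than the NVRTC that emitted the PTX.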
return nvrtc_major * 1000 + nvrtc_minor * 10 <= driver_ver
cdef inline object _translate_program_options(object options):
"""Translate ProgramOptions to LinkerOptions for PTX compilation."""
return LinkerOptions(
name=options.name,
arch=options.arch,
max_register_count=options.max_register_count,
time=options.time,
link_time_optimization=options.link_time_optimization,
debug=options.debug,
lineinfo=options.lineinfo,
ftz=options.ftz,
prec_div=options.prec_div,
prec_sqrt=options.prec_sqrt,
fma=options.fma,
split_compile=options.split_compile,
ptxas_options=options.ptxas_options,
no_cache=options.no_cache,
)
cdef inline int Program_init(Program self, object code, str code_type, object options) except -1:
"""Initialize a Program instance."""
cdef cynvrtc.nvrtcProgram nvrtc_prog
cdef cynvvm.nvvmProgram nvvm_prog
cdef bytes code_bytes
cdef const char* code_ptr
cdef const char* name_ptr
cdef size_t code_len
cdef bytes module_bytes
cdef const char* module_ptr
cdef size_t module_len
self._options = options = check_or_create_options(ProgramOptions, options, "Program options")
code_type = code_type.lower()
self._module_count = 0
self._use_libdevice = False
if code_type == "c++":
assert_type(code, str)
if options.extra_sources is not None:
raise ValueError("extra_sources is not supported by the NVRTC backend (C++ code_type)")
# TODO: support pre-loaded headers & include names
code_bytes = code.encode()
code_ptr = <const char*>code_bytes
name_ptr = <const char*>options._name
with nogil:
HANDLE_RETURN_NVRTC(NULL, cynvrtc.nvrtcCreateProgram(
&nvrtc_prog, code_ptr, name_ptr, 0, NULL, NULL))
self._h_nvrtc = create_nvrtc_program_handle(nvrtc_prog)
self._backend = "NVRTC"
self._linker = None
elif code_type == "ptx":
assert_type(code, str)
if options.extra_sources is not None:
raise ValueError("extra_sources is not supported by the PTX backend.")
self._linker = Linker(
ObjectCode._init(code.encode(), code_type), options=_translate_program_options(options)
)
self._backend = self._linker.backend
elif code_type == "nvvm":
_get_nvvm_module() # Validate NVVM availability
if isinstance(code, str):
code = code.encode("utf-8")
elif not isinstance(code, (bytes, bytearray)):
raise TypeError("NVVM IR code must be provided as str, bytes, or bytearray")
code_ptr = <const char*>(<bytes>code)
name_ptr = <const char*>options._name
code_len = len(code)
with nogil:
HANDLE_RETURN_NVVM(NULL, cynvvm.nvvmCreateProgram(&nvvm_prog))
self._h_nvvm = create_nvvm_program_handle(nvvm_prog) # RAII from here
with nogil:
HANDLE_RETURN_NVVM(nvvm_prog, cynvvm.nvvmAddModuleToProgram(nvvm_prog, code_ptr, code_len, name_ptr))
self._module_count = 1
# Add extra modules if provided
if options.extra_sources is not None:
if not is_sequence(options.extra_sources):
raise TypeError(
"extra_sources must be a sequence of 2-tuples: ((name1, source1), (name2, source2), ...)"
)
for i, module in enumerate(options.extra_sources):
if not isinstance(module, tuple) or len(module) != 2:
raise TypeError(
f"Each extra module must be a 2-tuple (name, source)"
f", got {type(module).__name__} at index {i}"
)
module_name, module_source = module
if not isinstance(module_name, str):
raise TypeError(f"Module name at index {i} must be a string, got {type(module_name).__name__}")
if isinstance(module_source, str):
# Textual LLVM IR - encode to UTF-8 bytes
module_source = module_source.encode("utf-8")
elif not isinstance(module_source, (bytes, bytearray)):
raise TypeError(
f"Module source at index {i} must be str (textual LLVM IR), bytes (textual LLVM IR or bitcode), "
f"or bytearray, got {type(module_source).__name__}"
)
if len(module_source) == 0:
raise ValueError(f"Module source for '{module_name}' (index {i}) cannot be empty")
# Add the module using NVVM API
module_bytes = module_source if isinstance(module_source, bytes) else bytes(module_source)
module_ptr = <const char*>module_bytes
module_len = len(module_bytes)
module_name_bytes = module_name.encode()
module_name_ptr = <const char*>module_name_bytes
with nogil:
HANDLE_RETURN_NVVM(nvvm_prog, cynvvm.nvvmAddModuleToProgram(
nvvm_prog, module_ptr, module_len, module_name_ptr))
self._module_count += 1
# Store use_libdevice flag
if options.use_libdevice:
self._use_libdevice = True
self._backend = "NVVM"
self._linker = None
else:
supported_code_types = ("c++", "ptx", "nvvm")
assert code_type not in supported_code_types, f"{code_type=}"
raise RuntimeError(f"Unsupported {code_type=} ({supported_code_types=})")
return 0
cdef object Program_compile_nvrtc(Program self, str target_type, object name_expressions, object logs):
"""Compile using NVRTC backend and return ObjectCode."""
cdef cynvrtc.nvrtcProgram prog = as_cu(self._h_nvrtc)
cdef size_t output_size = 0
cdef size_t logsize = 0
cdef vector[const char*] options_vec
cdef char* data_ptr = NULL
cdef bytes name_bytes
cdef const char* name_ptr = NULL
cdef const char* lowered_name = NULL
cdef dict symbol_mapping = {}
# Add name expressions before compilation
if name_expressions:
for n in name_expressions:
name_bytes = n.encode() if isinstance(n, str) else n
name_ptr = <const char*>name_bytes
HANDLE_RETURN_NVRTC(prog, cynvrtc.nvrtcAddNameExpression(prog, name_ptr))
# Build options array
options_list = self._options.as_bytes("nvrtc", target_type)
options_vec.resize(len(options_list))
for i in range(len(options_list)):
options_vec[i] = <const char*>(<bytes>options_list[i])
# Compile
with nogil:
HANDLE_RETURN_NVRTC(prog, cynvrtc.nvrtcCompileProgram(prog, <int>options_vec.size(), options_vec.data()))
# Get compiled output based on target type
if target_type == "ptx":
HANDLE_RETURN_NVRTC(prog, cynvrtc.nvrtcGetPTXSize(prog, &output_size))
data = bytearray(output_size)
data_ptr = <char*>(<bytearray>data)
with nogil:
HANDLE_RETURN_NVRTC(prog, cynvrtc.nvrtcGetPTX(prog, data_ptr))
elif target_type == "cubin":
HANDLE_RETURN_NVRTC(prog, cynvrtc.nvrtcGetCUBINSize(prog, &output_size))
data = bytearray(output_size)
data_ptr = <char*>(<bytearray>data)
with nogil:
HANDLE_RETURN_NVRTC(prog, cynvrtc.nvrtcGetCUBIN(prog, data_ptr))
else: # ltoir
HANDLE_RETURN_NVRTC(prog, cynvrtc.nvrtcGetLTOIRSize(prog, &output_size))
data = bytearray(output_size)
data_ptr = <char*>(<bytearray>data)
with nogil:
HANDLE_RETURN_NVRTC(prog, cynvrtc.nvrtcGetLTOIR(prog, data_ptr))
# Get lowered names after compilation
if name_expressions:
for n in name_expressions:
name_bytes = n.encode() if isinstance(n, str) else n
name_ptr = <const char*>name_bytes
HANDLE_RETURN_NVRTC(prog, cynvrtc.nvrtcGetLoweredName(prog, name_ptr, &lowered_name))
symbol_mapping[n] = lowered_name if lowered_name != NULL else None
# Get compilation log if requested
if logs is not None:
HANDLE_RETURN_NVRTC(prog, cynvrtc.nvrtcGetProgramLogSize(prog, &logsize))
if logsize > 1:
log = bytearray(logsize)
data_ptr = <char*>(<bytearray>log)
with nogil:
HANDLE_RETURN_NVRTC(prog, cynvrtc.nvrtcGetProgramLog(prog, data_ptr))
logs.write(log.decode("utf-8", errors="backslashreplace"))
return ObjectCode._init(bytes(data), target_type, symbol_mapping=symbol_mapping, name=self._options.name)
cdef object Program_compile_nvvm(Program self, str target_type, object logs):
"""Compile using NVVM backend and return ObjectCode."""
cdef cynvvm.nvvmProgram prog = as_cu(self._h_nvvm)
cdef size_t output_size = 0
cdef size_t logsize = 0
cdef vector[const char*] options_vec
cdef char* data_ptr = NULL
cdef bytes libdevice_bytes
cdef const char* libdevice_ptr
cdef size_t libdevice_len
# Build options array
options_list = self._options.as_bytes("nvvm", target_type)
options_vec.resize(len(options_list))
for i in range(len(options_list)):
options_vec[i] = <const char*>(<bytes>options_list[i])
with nogil:
HANDLE_RETURN_NVVM(prog, cynvvm.nvvmVerifyProgram(prog, <int>options_vec.size(), options_vec.data()))
# Load libdevice if requested - following numba-cuda
if self._use_libdevice:
libdevice_path = _find_libdevice_path()
if libdevice_path is None:
raise RuntimeError(
"use_libdevice=True but could not find libdevice.10.bc. "
"Ensure CUDA toolkit is installed."
)
with open(libdevice_path, "rb") as f:
libdevice_bytes = f.read()
libdevice_ptr = <const char*>libdevice_bytes
libdevice_len = len(libdevice_bytes)
# Use lazy_add_module
with nogil:
HANDLE_RETURN_NVVM(prog, cynvvm.nvvmLazyAddModuleToProgram(
prog, libdevice_ptr, libdevice_len, NULL))
with nogil:
HANDLE_RETURN_NVVM(prog, cynvvm.nvvmCompileProgram(prog, <int>options_vec.size(), options_vec.data()))
HANDLE_RETURN_NVVM(prog, cynvvm.nvvmGetCompiledResultSize(prog, &output_size))
data = bytearray(output_size)
data_ptr = <char*>(<bytearray>data)
with nogil:
HANDLE_RETURN_NVVM(prog, cynvvm.nvvmGetCompiledResult(prog, data_ptr))
# Get compilation log if requested
if logs is not None:
HANDLE_RETURN_NVVM(prog, cynvvm.nvvmGetProgramLogSize(prog, &logsize))
if logsize > 1:
log = bytearray(logsize)
data_ptr = <char*>(<bytearray>log)
with nogil:
HANDLE_RETURN_NVVM(prog, cynvvm.nvvmGetProgramLog(prog, data_ptr))
logs.write(log.decode("utf-8", errors="backslashreplace"))
return ObjectCode._init(bytes(data), target_type, name=self._options.name)
# Supported target types per backend
cdef dict SUPPORTED_TARGETS = {
"NVRTC": ("ptx", "cubin", "ltoir"),
"NVVM": ("ptx", "ltoir"),
"nvJitLink": ("cubin", "ptx"),
"driver": ("cubin", "ptx"),
}
cdef object Program_compile(Program self, str target_type, object name_expressions, object logs):
"""Compile the program to the specified target type."""
# Validate target_type for this backend
supported = SUPPORTED_TARGETS.get(self._backend)
if supported is None:
raise ValueError(f'Unknown backend="{self._backend}"')
if target_type not in supported:
raise ValueError(
f'Unsupported target_type="{target_type}" for {self._backend} '
f'(supported: {", ".join(repr(t) for t in supported)})'
)
if self._backend == "NVRTC":
if target_type == "ptx" and not _can_load_generated_ptx():
warn(
"The CUDA driver version is older than the backend version. "
"The generated ptx will not be loadable by the current driver.",
stacklevel=2,
category=RuntimeWarning,
)
return Program_compile_nvrtc(self, target_type, name_expressions, logs)
elif self._backend == "NVVM":
return Program_compile_nvvm(self, target_type, logs)
else:
return self._linker.link(target_type)
cdef inline list _prepare_nvrtc_options_impl(object opts):
"""Build NVRTC-specific compiler options."""
options = [f"-arch={opts.arch}"]
if opts.relocatable_device_code is not None:
options.append(f"--relocatable-device-code={_handle_boolean_option(opts.relocatable_device_code)}")
if opts.extensible_whole_program is not None and opts.extensible_whole_program:
options.append("--extensible-whole-program")
if opts.debug is not None and opts.debug:
options.append("--device-debug")
if opts.lineinfo is not None and opts.lineinfo:
options.append("--generate-line-info")
if opts.device_code_optimize is not None and opts.device_code_optimize:
options.append("--dopt=on")
if opts.ptxas_options is not None:
opt_name = "--ptxas-options"
if isinstance(opts.ptxas_options, str):
options.append(f"{opt_name}={opts.ptxas_options}")
elif is_sequence(opts.ptxas_options):
for opt_value in opts.ptxas_options:
options.append(f"{opt_name}={opt_value}")
if opts.max_register_count is not None:
options.append(f"--maxrregcount={opts.max_register_count}")
if opts.ftz is not None:
options.append(f"--ftz={_handle_boolean_option(opts.ftz)}")
if opts.prec_sqrt is not None:
options.append(f"--prec-sqrt={_handle_boolean_option(opts.prec_sqrt)}")
if opts.prec_div is not None:
options.append(f"--prec-div={_handle_boolean_option(opts.prec_div)}")
if opts.fma is not None:
options.append(f"--fmad={_handle_boolean_option(opts.fma)}")
if opts.use_fast_math is not None and opts.use_fast_math:
options.append("--use_fast_math")
if opts.extra_device_vectorization is not None and opts.extra_device_vectorization:
options.append("--extra-device-vectorization")
if opts.link_time_optimization is not None and opts.link_time_optimization:
options.append("--dlink-time-opt")
if opts.gen_opt_lto is not None and opts.gen_opt_lto:
options.append("--gen-opt-lto")
if opts.define_macro is not None:
_process_define_macro(options, opts.define_macro)
if opts.undefine_macro is not None:
if isinstance(opts.undefine_macro, str):
options.append(f"--undefine-macro={opts.undefine_macro}")
elif is_sequence(opts.undefine_macro):
for macro in opts.undefine_macro:
options.append(f"--undefine-macro={macro}")
if opts.include_path is not None:
if isinstance(opts.include_path, str):
options.append(f"--include-path={opts.include_path}")
elif is_sequence(opts.include_path):
for path in opts.include_path:
options.append(f"--include-path={path}")
if opts.pre_include is not None:
if isinstance(opts.pre_include, str):
options.append(f"--pre-include={opts.pre_include}")
elif is_sequence(opts.pre_include):
for header in opts.pre_include:
options.append(f"--pre-include={header}")
if opts.no_source_include is not None and opts.no_source_include:
options.append("--no-source-include")
if opts.std is not None:
options.append(f"--std={opts.std}")
if opts.builtin_move_forward is not None:
options.append(f"--builtin-move-forward={_handle_boolean_option(opts.builtin_move_forward)}")
if opts.builtin_initializer_list is not None:
options.append(f"--builtin-initializer-list={_handle_boolean_option(opts.builtin_initializer_list)}")
if opts.disable_warnings is not None and opts.disable_warnings:
options.append("--disable-warnings")
if opts.restrict is not None and opts.restrict:
options.append("--restrict")
if opts.device_as_default_execution_space is not None and opts.device_as_default_execution_space:
options.append("--device-as-default-execution-space")
if opts.device_int128 is not None and opts.device_int128:
options.append("--device-int128")
if opts.device_float128 is not None and opts.device_float128:
options.append("--device-float128")
if opts.optimization_info is not None:
options.append(f"--optimization-info={opts.optimization_info}")
if opts.no_display_error_number is not None and opts.no_display_error_number:
options.append("--no-display-error-number")
if opts.diag_error is not None:
if isinstance(opts.diag_error, int):
options.append(f"--diag-error={opts.diag_error}")
elif is_sequence(opts.diag_error):
for error in opts.diag_error:
options.append(f"--diag-error={error}")
if opts.diag_suppress is not None:
if isinstance(opts.diag_suppress, int):
options.append(f"--diag-suppress={opts.diag_suppress}")
elif is_sequence(opts.diag_suppress):
for suppress in opts.diag_suppress:
options.append(f"--diag-suppress={suppress}")
if opts.diag_warn is not None:
if isinstance(opts.diag_warn, int):
options.append(f"--diag-warn={opts.diag_warn}")
elif is_sequence(opts.diag_warn):
for w in opts.diag_warn:
options.append(f"--diag-warn={w}")
if opts.brief_diagnostics is not None:
options.append(f"--brief-diagnostics={_handle_boolean_option(opts.brief_diagnostics)}")
if opts.time is not None:
options.append(f"--time={opts.time}")
if opts.split_compile is not None:
options.append(f"--split-compile={opts.split_compile}")
if opts.fdevice_syntax_only is not None and opts.fdevice_syntax_only:
options.append("--fdevice-syntax-only")
if opts.minimal is not None and opts.minimal:
options.append("--minimal")
if opts.no_cache is not None and opts.no_cache:
options.append("--no-cache")
if opts.fdevice_time_trace is not None:
options.append(f"--fdevice-time-trace={opts.fdevice_time_trace}")
if opts.frandom_seed is not None:
options.append(f"--frandom-seed={opts.frandom_seed}")
if opts.ofast_compile is not None:
options.append(f"--Ofast-compile={opts.ofast_compile}")
# PCH options (CUDA 12.8+)
if opts.pch is not None and opts.pch:
options.append("--pch")
if opts.create_pch is not None:
options.append(f"--create-pch={opts.create_pch}")
if opts.use_pch is not None:
options.append(f"--use-pch={opts.use_pch}")
if opts.pch_dir is not None:
options.append(f"--pch-dir={opts.pch_dir}")
if opts.pch_verbose is not None:
options.append(f"--pch-verbose={_handle_boolean_option(opts.pch_verbose)}")
if opts.pch_messages is not None:
options.append(f"--pch-messages={_handle_boolean_option(opts.pch_messages)}")
if opts.instantiate_templates_in_pch is not None:
options.append(
f"--instantiate-templates-in-pch={_handle_boolean_option(opts.instantiate_templates_in_pch)}"
)
if opts.numba_debug:
options.append("--numba-debug")
return [o.encode() for o in options]
cdef inline object _prepare_nvvm_options_impl(object opts, bint as_bytes):
"""Build NVVM-specific compiler options."""
options = []
# Options supported by NVVM
assert opts.arch is not None
arch = opts.arch
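    # libNVVM expects virtual architectures ("compute_XY"), so a real architecture
    # such as "sm_90" is translated to "compute_90" before being passed on.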
if arch.startswith("sm_"):
arch = f"compute_{arch[3:]}"
options.append(f"-arch={arch}")
if opts.debug is not None and opts.debug:
options.append("-g")
if opts.device_code_optimize is False:
options.append("-opt=0")
elif opts.device_code_optimize is True:
options.append("-opt=3")
# NVVM uses 0/1 instead of true/false for boolean options
if opts.ftz is not None:
options.append(f"-ftz={'1' if opts.ftz else '0'}")
if opts.prec_sqrt is not None:
options.append(f"-prec-sqrt={'1' if opts.prec_sqrt else '0'}")
if opts.prec_div is not None:
options.append(f"-prec-div={'1' if opts.prec_div else '0'}")
if opts.fma is not None:
options.append(f"-fma={'1' if opts.fma else '0'}")
# Check for unsupported options and raise error if they are set
unsupported = []
if opts.relocatable_device_code is not None:
unsupported.append("relocatable_device_code")
if opts.extensible_whole_program is not None and opts.extensible_whole_program:
unsupported.append("extensible_whole_program")
if opts.lineinfo is not None and opts.lineinfo:
unsupported.append("lineinfo")
if opts.ptxas_options is not None:
unsupported.append("ptxas_options")
if opts.max_register_count is not None:
unsupported.append("max_register_count")
if opts.use_fast_math is not None and opts.use_fast_math:
unsupported.append("use_fast_math")
if opts.extra_device_vectorization is not None and opts.extra_device_vectorization:
unsupported.append("extra_device_vectorization")
if opts.gen_opt_lto is not None and opts.gen_opt_lto:
unsupported.append("gen_opt_lto")
if opts.define_macro is not None:
unsupported.append("define_macro")
if opts.undefine_macro is not None:
unsupported.append("undefine_macro")
if opts.include_path is not None:
unsupported.append("include_path")
if opts.pre_include is not None:
unsupported.append("pre_include")
if opts.no_source_include is not None and opts.no_source_include:
unsupported.append("no_source_include")
if opts.std is not None:
unsupported.append("std")
if opts.builtin_move_forward is not None:
unsupported.append("builtin_move_forward")