diff --git a/gpustack_runtime/detector/amd.py b/gpustack_runtime/detector/amd.py index a00d9bb..bc19590 100644 --- a/gpustack_runtime/detector/amd.py +++ b/gpustack_runtime/detector/amd.py @@ -179,6 +179,17 @@ def detect(self) -> Devices | None: dev_gpu_vram_usage = pyamdsmi.amdsmi_get_gpu_vram_usage(dev) dev_mem = dev_gpu_vram_usage.get("vram_total") dev_mem_used = dev_gpu_vram_usage.get("vram_used") + # On APUs with unified memory (e.g., AMD Strix Halo), VRAM + # reports only the BIOS carveout (~512 MiB); VIS_VRAM reports + # the full usable system memory. Use VIS_VRAM when larger. + with contextlib.suppress(pyrocmsmi.ROCMSMIError): + dev_mem_vis_vram = byte_to_mebibyte( + pyrocmsmi.rsmi_dev_memory_total_get( + dev_idx, + pyrocmsmi.RSMI_MEM_TYPE_VIS_VRAM, + ), + ) + dev_mem = max(dev_mem, dev_mem_vis_vram) dev_ecc_count = pyamdsmi.amdsmi_get_gpu_ecc_count( dev, pyamdsmi.AmdSmiGpuBlock.UMC, @@ -189,6 +200,17 @@ def detect(self) -> Devices | None: dev_mem = byte_to_mebibyte( # byte to MiB pyrocmsmi.rsmi_dev_memory_total_get(dev_idx), ) + # On APUs with unified memory (e.g., AMD Strix Halo), VRAM + # reports only the BIOS carveout (~512 MiB); VIS_VRAM reports + # the full usable system memory. Use VIS_VRAM when larger. + with contextlib.suppress(pyrocmsmi.ROCMSMIError): + dev_mem_vis_vram = byte_to_mebibyte( + pyrocmsmi.rsmi_dev_memory_total_get( + dev_idx, + pyrocmsmi.RSMI_MEM_TYPE_VIS_VRAM, + ), + ) + dev_mem = max(dev_mem, dev_mem_vis_vram) dev_mem_used = byte_to_mebibyte( # byte to MiB pyrocmsmi.rsmi_dev_memory_usage_get(dev_idx), ) diff --git a/gpustack_runtime/detector/pyrocmsmi/__init__.py b/gpustack_runtime/detector/pyrocmsmi/__init__.py index b8437f1..088b33e 100644 --- a/gpustack_runtime/detector/pyrocmsmi/__init__.py +++ b/gpustack_runtime/detector/pyrocmsmi/__init__.py @@ -62,6 +62,11 @@ ROCMSMI_IOLINK_TYPE_XGMI = 2 ROCMSMI_IOLINK_TYPE_NUMIOLINKTYPES = 3 +## Memory Types ## +RSMI_MEM_TYPE_VRAM = 0 +RSMI_MEM_TYPE_GTT = 1 +RSMI_MEM_TYPE_VIS_VRAM = 2 + ## Error Codes ## ROCMSMI_ERROR_UNINITIALIZED = -99997 ROCMSMI_ERROR_FUNCTION_NOT_FOUND = -99998