38 changes: 38 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -3,6 +3,44 @@
Documentation for TransferBench is available at
[https://rocm.docs.amd.com/projects/TransferBench](https://rocm.docs.amd.com/projects/TransferBench).

## v1.67.00
### Added
- Initial support for pod communication. Requires a compatible hardware / ROCm version and is subject to further testing
  - This potentially enables GFX/DMA executors to access SRC/DST memory locations on GPUs within the same pod
  - Pod membership detection requires amd-smi; however, it can be skipped by setting TB_FORCE_SINGLE_POD=1
- Added support for dumping executed Transfers to a config file specified by TB_DUMP_CFG_FILE
  - This writes the Transfers that are executed (for example, via a preset) to a config file that can then be re-executed
- Added reporting of the number of iterations run when running in timed mode (NUM_ITERATIONS < 0)
- Added NIC_CQ_POLL_BATCH to control the CQ poll batch size for NIC transfers
- Added new "hbm" preset which sweeps and tests local HBM read performance
- Added new TB_WALLCLOCK_RATE environment variable that overrides the GPU GFX wallclock rate if the queried rate returns 0 (debug)
- Added new batched-DMA executor "B", which utilizes the hipMemcpyBatchAsync API introduced in HIP 7.1 / CUDA 12.8
- Added new "bmasweep" preset that compares DMA to batched-DMA execution for parallel transfers to other GPUs
- Added new "wallclock" preset that compares wallclock counters across XCCs within a GPU
- Added new "smoketest" preset that runs a variety of DMA/GFX tests for simple correctness testing
- Added new "help" preset to show config file examples
- Added new "presets" preset to show available presets and their descriptions
- Added new "rings" preset that runs parallel rings of transfers (pod-capable)
- Added new "envvars" preset to show environment variables that can change TransferBench behavior
- Added information on how to run multi-rank with TransferBench, printed when run with no args
- Added new "nica2a" preset (NIC all-to-all over GPUs via NIC executors, multi-node)
- Added new GFX_KERNEL to allow experimenting with a copy-only GFX kernel. Currently this is opt-in only
- Added `SHOW_PERCENTILES` (e.g. `50,75,90,95,99`) to show empirical percentiles of per-iteration duration
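The `SHOW_PERCENTILES` values are standard empirical percentiles over the recorded per-iteration durations. As an illustration only (not TransferBench's implementation; the timing values are made up), a nearest-rank percentile over a list of timings can be computed in shell:

```shell
# Illustrative only: nearest-rank percentiles over per-iteration timings (ms),
# for the same percentile list as SHOW_PERCENTILES=50,75,90,95,99.
timings="12.1
10.4
11.0
10.9
13.5
10.2
11.8
12.7
10.7
11.3"
sorted=$(printf '%s\n' "$timings" | sort -n)
n=$(printf '%s\n' "$sorted" | wc -l)
for p in 50 75 90 95 99; do
  # nearest-rank method: index = ceil(p/100 * n), computed with integer math
  idx=$(( (p * n + 99) / 100 ))
  val=$(printf '%s\n' "$sorted" | sed -n "${idx}p")
  echo "p$p: $val"
done
# prints p50: 11.0, p75: 12.1, p90: 12.7, p95: 13.5, p99: 13.5
```

With 10 samples, p50 lands on the 5th sorted value and p95/p99 both land on the maximum, which is why the top percentiles coincide for small iteration counts.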

### Modified
- DMA-BUF support enablement in CMake changed to ENABLE_DMA_BUF to be more consistent with other compile-time options
- Added extra information to the CMake and make build methods to indicate enabled / disabled features
- a2asweep preset changed from USE_FINE_GRAIN to MEM_TYPE to reflect the various memory types
- a2asweep preset changed from NUM_CUS to NUM_SUB_EXECS to match the a2a preset naming convention
- scaling preset changed from USE_FINE_GRAIN to CPU_MEM_TYPE and GPU_MEM_TYPE
- NIC_FILTER renamed to TB_NIC_FILTER for consistency
- DUMP_LINES renamed to TB_DUMP_LINES for consistency
- CQs for NIC transfers are now dynamically sized in the high-QP-count case
- Switched to hipMemcpyDeviceToDeviceNoCU instead of hipMemcpyDefault for the DMA executor if available (requires HIP >= 6.0)
- Allowed multiple destination memory locations for DMA/batched-DMA Transfers
- Removed env var printing and preset printing when running TransferBench with no args
- Simplified socket comm usage - only the first rank needs to set TB_NUM_RANKS=X to see connection info
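The simplified socket-comm flow means only rank 0 carries the rank-count setting. A hypothetical two-rank launch (preset name and rank count here are assumptions, not from the changelog):

```shell
# Hypothetical launch sketch: only the first rank sets TB_NUM_RANKS; it then
# prints connection info that the remaining ranks use to join, e.g.:
#
#   rank 0:  TB_NUM_RANKS=2 ./TransferBench a2a   # prints connection info
#   rank 1:  ./TransferBench a2a                  # joins using that info
#
# The snippet below only illustrates the env-var gate, not the real binary:
if [ "${TB_NUM_RANKS:-1}" -gt 1 ]; then
  echo "rank 0: expecting $((TB_NUM_RANKS - 1)) peer rank(s)"
else
  echo "single-rank run"
fi
```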

## v1.66.02
### Added
- Adding DMA-BUF support
444 changes: 330 additions & 114 deletions CMakeLists.txt

Large diffs are not rendered by default.

212 changes: 180 additions & 32 deletions Makefile
@@ -6,14 +6,18 @@
ROCM_PATH ?= /opt/rocm
CUDA_PATH ?= /usr/local/cuda
MPI_PATH ?= /usr/local/openmpi
HIPCC ?= $(ROCM_PATH)/bin/amdclang++
NVCC ?= $(CUDA_PATH)/bin/nvcc
DEBUG ?= 0

# Optional features (set to 0 to disable, 1 to enable)
# DISABLE_NIC_EXEC: Disable RDMA/NIC executor support (default: 0)
# DISABLE_MPI_COMM: Disable MPI communicator support (default: 0)
# DISABLE_DMABUF: Disable DMA-BUF support for GPU Direct RDMA (default: 1)

HIPCC ?= $(ROCM_PATH)/bin/amdclang++
NVCC ?= $(CUDA_PATH)/bin/nvcc
# DISABLE_NIC_EXEC: Disable RDMA/NIC executor support (default: 0)
# DISABLE_MPI_COMM: Disable MPI communicator support (default: 0)
# DISABLE_DMA_BUF: Disable DMA-BUF support for GPU Direct RDMA (default: 1)
# DISABLE_AMD_SMI: Disable AMD-SMI pod membership checking support (default: 0)
# DISABLE_NVML: Disable NVML pod membership detection for CUDA builds (default: 0)
# DISABLE_POD_COMM: Disable pod communication support (default: 0)
# DISABLE_CUMEM: Disable CUDA driver API (also disables pod on CUDA) (default: 0)

# ROCm device libraries can live in different locations depending on packaging.
# hipcc/clang needs to find the amdgcn bitcode directory at link time.
@@ -32,11 +36,11 @@ SINGLE_KERNEL ?= 0
GPU_TARGETS ?= native

EXE=TransferBench
DEBUG ?= 0

# Only perform this check if 'make clean' is not the target
ifeq ($(filter clean,$(MAKECMDGOALS)),)
ifeq ($(MAKECMDGOALS),TransferBenchCuda)
$(info Building TransferBenchCuda)
# Check for nvcc
ifneq ($(shell test -e $(NVCC) && echo found), found)
$(error "Could not find $(NVCC). Please set CUDA_PATH appropriately")
@@ -48,15 +52,21 @@ ifeq ($(filter clean,$(MAKECMDGOALS)),)
# Check for HIP compiler
ifeq ("$(shell test -e $(HIPCC) && echo found)", "found")
CXX=$(HIPCC)
else ifeq ("$(shell test -e $(ROCM_PATH)/bin/hipcc && echo found)", "found")
CXX=$(ROCM_PATH)/bin/hipcc
$(warning "Could not find $(HIPCC). Using fallback to $(CXX)")
else
$(error "Could not find $(HIPCC) or $(ROCM_PATH)/bin/hipcc. Check if the path is correct if you want to build $(EXE)")
ifeq ("$(shell test -e $(ROCM_PATH)/llvm/bin/amdclang++ && echo found)", "found")
CXX=$(ROCM_PATH)/llvm/bin/amdclang++
else ifeq ("$(shell test -e $(ROCM_PATH)/llvm/bin/clang++ && echo found)", "found")
CXX=$(ROCM_PATH)/llvm/bin/clang++
else ifeq ("$(shell test -e $(ROCM_PATH)/bin/hipcc && echo found)", "found")
CXX=$(ROCM_PATH)/bin/hipcc
else
$(error "Could not find a HIP compiler. Tried: $(HIPCC), $(ROCM_PATH)/llvm/bin/amdclang++, $(ROCM_PATH)/llvm/bin/clang++, $(ROCM_PATH)/bin/hipcc. Check if ROCM_PATH is correct")
endif
$(info "Could not find $(HIPCC). Using fallback to $(CXX)")
endif
GPU_TARGETS_FLAGS = $(foreach target,$(GPU_TARGETS),"--offload-arch=$(target)")

CXXFLAGS = -I$(ROCM_PATH)/include -I$(ROCM_PATH)/include/hip -I$(ROCM_PATH)/include/hsa
$(info Compiling for $(GPU_TARGETS) architecture(s). Can modify this by setting GPU_TARGETS)
CXXFLAGS = -I. -I$(ROCM_PATH)/include -I$(ROCM_PATH)/include/hip -I$(ROCM_PATH)/include/hsa
HIPLDFLAGS= -lnuma -L$(ROCM_PATH)/lib -lhsa-runtime64 -lamdhip64
HIPFLAGS = -Wall -x hip -D__HIP_PLATFORM_AMD__ -D__HIPCC__ $(GPU_TARGETS_FLAGS)
ifneq ($(strip $(ROCM_DEVICE_LIB_PATH)),)
@@ -84,18 +94,19 @@ ifeq ($(filter clean,$(MAKECMDGOALS)),)
# 3) infiniband/verbs.h is found in the default include path
DISABLE_NIC_EXEC ?= 0
ifneq ($(DISABLE_NIC_EXEC),1)
$(info Attempting to build with NIC executor support)
ifeq ("$(shell ldconfig -p | grep -c ibverbs)", "0")
$(info lib IBVerbs not found)
$(info - ibverbs library not found)
else ifeq ("$(shell echo '#include <infiniband/verbs.h>' | $(CXX) -E - 2>/dev/null | grep -c 'infiniband/verbs.h')", "0")
$(info infiniband/verbs.h not found)
$(info - infiniband/verbs.h not found)
else
COMMON_FLAGS += -DNIC_EXEC_ENABLED
LDFLAGS += -libverbs
NIC_ENABLED = 1

# Disable DMA-BUF support by default (set DISABLE_DMABUF=0 to enable)
DISABLE_DMABUF ?= 1
ifeq ($(DISABLE_DMABUF), 0)
# Disable DMA-BUF support by default (set DISABLE_DMA_BUF=0 to enable)
DISABLE_DMA_BUF ?= 1
ifeq ($(DISABLE_DMA_BUF), 0)
# Check for both ibv_reg_dmabuf_mr and ROCm DMA-BUF export support
HAVE_IBV_DMABUF := $(shell echo '#include <infiniband/verbs.h>' | $(CXX) -E - 2>/dev/null | grep -c 'ibv_reg_dmabuf_mr')
HAVE_ROCM_DMABUF := $(shell echo '#include <hsa/hsa_ext_amd.h>' | $(CXX) -I$(ROCM_PATH)/include -E - 2>/dev/null | grep -c 'hsa_amd_portable_export_dmabuf')
@@ -111,14 +122,14 @@ ifeq ($(filter clean,$(MAKECMDGOALS)),)
$(info Building with DMA-BUF support)
endif
else
$(info Building with DMA-BUF support disabled (DISABLE_DMABUF=1))
$(info Building with DMA-BUF support disabled (DISABLE_DMA_BUF=1))
endif
endif
ifeq ($(NIC_ENABLED), 0)
$(info Building without NIC executor support)
$(info To use the TransferBench RDMA executor, check if your system has NICs, the NIC drivers are installed, and libibverbs-dev is installed)
$(info - Building without NIC executor support)
$(info - To use the TransferBench RDMA executor, check if your system has NICs, the NIC drivers are installed, and libibverbs-dev is installed)
else
$(info Building with NIC executor support. Can set DISABLE_NIC_EXEC=1 to disable)
$(info - Building with NIC executor support. Can set DISABLE_NIC_EXEC=1 to disable)
endif
endif

@@ -128,30 +139,167 @@ ifeq ($(filter clean,$(MAKECMDGOALS)),)
# 2) mpi.h is found in the MPI_PATH
DISABLE_MPI_COMM ?= 0
ifneq ($(DISABLE_MPI_COMM), 1)
$(info Attempting to build with MPI communicator support)
ifeq ($(wildcard $(MPI_PATH)/include/mpi.h),)
$(info Unable to find mpi.h at $(MPI_PATH)/include. Please specify appropriate MPI_PATH)
$(info - Unable to find mpi.h at $(MPI_PATH)/include. Please specify appropriate MPI_PATH)
else
MPI_ENABLED = 1
COMMON_FLAGS += -DMPI_COMM_ENABLED -I$(MPI_PATH)/include
LDFLAGS += -L/$(MPI_PATH)/lib -lmpi
ifeq ($(DEBUG), 1)
LDFLAGS += -lmpi_cxx
endif
LDFLAGS += -L$(MPI_PATH)/lib -L$(MPI_PATH)/lib64 -lmpi
endif

ifeq ($(MPI_ENABLED), 0)
$(info Building without MPI communicator support)
$(info To use TransferBench with MPI support, install MPI libraries and specify appropriate MPI_PATH)
$(info - Building without MPI communicator support)
$(info - To use TransferBench with MPI support, install MPI libraries and specify appropriate MPI_PATH)
else
$(info Building with MPI communicator support. Can set DISABLE_MPI_COMM=1 to disable)
$(info - Building with MPI communicator support. Can set DISABLE_MPI_COMM=1 to disable)
endif
endif
endif

NVML_ENABLED = 0
# Enable NVML support for pod membership detection on NVIDIA platforms
# Compile with NVML support if
# 1) DISABLE_NVML is not set to 1
# 2) Building TransferBenchCuda
# 3) nvml.h is found under CUDA_PATH
DISABLE_NVML ?= 0
ifneq ($(DISABLE_NVML), 1)
ifeq ($(MAKECMDGOALS),TransferBenchCuda)
$(info Attempting to build with NVML support)
ifneq ($(wildcard $(CUDA_PATH)/include/nvml.h),)
COMMON_FLAGS += -DNVML_ENABLED
LDFLAGS += -lnvidia-ml
NVML_ENABLED = 1
$(info - Building with NVML support for pod membership detection)
else
$(info - nvml.h not found at $(CUDA_PATH)/include. Building without NVML support)
$(info - Pod membership may be forced by setting TB_FORCE_SINGLE_POD=1)
endif
endif
endif

# TransferBenchCuda: CUDA driver API (libcuda). Independent of POD, but POD on CUDA requires CUMEM.
DISABLE_CUMEM ?= 0
ifeq ($(MAKECMDGOALS),TransferBenchCuda)
ifneq ($(DISABLE_CUMEM),1)
$(info - Building with CUMEM_ENABLED (CUDA driver API, -lcuda))
COMMON_FLAGS += -DCUMEM_ENABLED
LDFLAGS += -lcuda
else
$(info - CUDA driver API disabled (DISABLE_CUMEM=1); POD comm unavailable on CUDA)
endif
endif

POD_ENABLED = 0
AMD_SMI_ENABLED = 0
# Compile with pod support if
# 1) DISABLE_POD_COMM is not set to 1
# 2) For HIP: a small probe program that uses hipMemFabricHandle_t,
# hipMemExportToShareableHandle, and hipMemImportFromShareableHandle
# compiles and links successfully against amdhip64
# For CUDA: CUDA Version >= 12.2
DISABLE_POD_COMM ?= 0
DISABLE_AMD_SMI ?= 0
ifneq ($(DISABLE_POD_COMM), 1)
$(info Attempting to build with pod communication support)
ifeq ($(MAKECMDGOALS),TransferBenchCuda)
# Check for appropriate CUDA support for MNNVL
CUDA_MIN_MAJOR := 12
CUDA_MIN_MINOR := 2

CUDA_VERSION_STR := $(shell $(NVCC) --version | grep release | sed -E 's/.*release ([0-9]+)\.([0-9]+).*/\1 \2/')
CUDA_MAJOR := $(word 1,$(CUDA_VERSION_STR))
CUDA_MINOR := $(word 2,$(CUDA_VERSION_STR))

CUDA_VERSION_OK := $(shell \
if [ $(CUDA_MAJOR) -gt $(CUDA_MIN_MAJOR) ] || \
[ $(CUDA_MAJOR) -eq $(CUDA_MIN_MAJOR) -a $(CUDA_MINOR) -ge $(CUDA_MIN_MINOR) ]; then \
echo yes; \
else \
echo no; \
fi)

ifeq ($(CUDA_VERSION_OK),yes)
$(info - Detected CUDA version $(CUDA_MAJOR).$(CUDA_MINOR) which has MNNVL support)
ifeq ($(DISABLE_CUMEM),1)
$(info - Pod communication skipped on CUDA: requires CUMEM_ENABLED (DISABLE_CUMEM=1))
else
COMMON_FLAGS += -DPOD_COMM_ENABLED
POD_ENABLED = 1
endif
else
$(info - Detected CUDA version $(CUDA_MAJOR).$(CUDA_MINOR) which does not have MNNVL support)
$(info - Pod support will require CUDA version of at least $(CUDA_MIN_MAJOR).$(CUDA_MIN_MINOR))
endif
else
# Check for the HIP fabric API functions used by TransferBench at runtime.
HIP_HAS_FABRIC := $(shell \
printf '%s\n' \
'#include <hip/hip_runtime_api.h>' \
'int main() {' \
' hipMemFabricHandle_t fabricHandle = {};' \
' hipMemGenericAllocationHandle_t allocationHandle = {};' \
' hipMemExportToShareableHandle(&fabricHandle, allocationHandle, hipMemHandleTypeFabric, 0);' \
' hipMemImportFromShareableHandle(&allocationHandle, &fabricHandle, hipMemHandleTypeFabric);' \
' return 0;' \
'}' | \
$(CXX) -I$(ROCM_PATH)/include -D__HIP_PLATFORM_AMD__ -x c++ - \
-L$(ROCM_PATH)/lib -L$(ROCM_PATH)/lib64 -lamdhip64 -o /dev/null 2>/dev/null && echo yes || echo no)

ifeq ($(HIP_HAS_FABRIC),yes)
$(info - HIP fabric API found; enabling pod communication support)
COMMON_FLAGS += -DPOD_COMM_ENABLED
POD_ENABLED = 1
ifeq ($(DISABLE_AMD_SMI), 1)
$(info - AMD-SMI disabled via DISABLE_AMD_SMI=1; set TB_FORCE_SINGLE_POD=1 at runtime to override pod membership)
else
# Prefer AMD-SMI for pod membership queries; fall back to TB_FORCE_SINGLE_POD=1 at runtime.
AMD_SMI_HEADER := $(ROCM_PATH)/include/amd_smi/amdsmi.h
AMD_SMI_LIB := $(firstword $(wildcard $(ROCM_PATH)/lib/libamd_smi.so $(ROCM_PATH)/lib64/libamd_smi.so))
ifneq ($(wildcard $(AMD_SMI_HEADER)),)
ifneq ($(AMD_SMI_LIB),)
# Check for the AMD-SMI functions used by TransferBench at runtime.
AMDSMI_HAS_FABRIC := $(shell \
printf '%s\n' \
'#include <amd_smi/amdsmi.h>' \
'int main() {' \
' amdsmi_bdf_t bdf = {};' \
' amdsmi_processor_handle h;' \
' amdsmi_get_processor_handle_from_bdf(bdf, &h);' \
' amdsmi_fabric_info_t fi;' \
' amdsmi_get_gpu_fabric_info(h, &fi);' \
' (void)fi.fabric_info.fabric_version.v1.ppod_id;' \
' (void)fi.fabric_info.fabric_version.v1.vpod_id;' \
' return 0;' \
'}' | \
$(CXX) -I$(ROCM_PATH)/include -x c++ - \
-L$(dir $(AMD_SMI_LIB)) -lamd_smi -o /dev/null 2>/dev/null && echo yes || echo no)

ifeq ($(AMDSMI_HAS_FABRIC),yes)
$(info - AMD-SMI fabric API found; using AMD-SMI for pod membership queries)
COMMON_FLAGS += -DAMD_SMI_ENABLED
LDFLAGS += -L$(dir $(AMD_SMI_LIB)) -lamd_smi
AMD_SMI_ENABLED = 1
else
$(info - AMD-SMI fabric API not found; set TB_FORCE_SINGLE_POD=1 at runtime to override pod membership)
endif
else
$(info - libamd_smi not found under $(ROCM_PATH)/lib or $(ROCM_PATH)/lib64; set TB_FORCE_SINGLE_POD=1 at runtime to override pod membership)
endif
else
$(info - amd_smi/amdsmi.h not found under $(ROCM_PATH)/include; set TB_FORCE_SINGLE_POD=1 at runtime to override pod membership)
endif
endif
else
$(info - HIP fabric API not found; disabling pod communication support)
endif
endif
endif
endif

.PHONY : all clean

all: $(EXE)
all: TransferBench

TransferBench: ./src/client/Client.cpp $(shell find -regex ".*\.hpp")
$(CXX) $(CXXFLAGS) $(HIPFLAGS) $(COMMON_FLAGS) $< -o $@ $(HIPLDFLAGS) $(LDFLAGS)
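The CUDA version gate earlier in this Makefile (CUDA_MIN_MAJOR/CUDA_MIN_MINOR) reduces to a plain major/minor comparison. A standalone sketch of the same logic, using `{ ...; }` grouping instead of the obsolescent `[ ... -a ... ]` form the Makefile uses:

```shell
# Standalone equivalent of the Makefile's "CUDA >= 12.2" check:
# ok when major > min_major, or major == min_major and minor >= min_minor.
version_ok() {
  # $1=major $2=minor $3=min_major $4=min_minor
  if [ "$1" -gt "$3" ] || { [ "$1" -eq "$3" ] && [ "$2" -ge "$4" ]; }; then
    echo yes
  else
    echo no
  fi
}

version_ok 12 8 12 2   # prints yes (12.8 >= 12.2)
version_ok 12 1 12 2   # prints no  (12.1 <  12.2)
version_ok 13 0 12 2   # prints yes (13.0 >= 12.2)
```

The `-a` operator inside `test` is marked obsolescent by POSIX; chaining separate `[ ]` invocations with `&&`/`||` behaves identically and is more portable, which is why this sketch avoids it.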
6 changes: 4 additions & 2 deletions examples/example.cfg
@@ -8,12 +8,13 @@
# SRC 1 -> Executor -> DST 1
# SRC X DST Y

# Three Executors are supported by TransferBench
# Five Executors are supported by TransferBench
# Executor: SubExecutor:
# 1) CPU CPU thread
# 2) GPU GPU threadblock/Compute Unit (CU)
# 3) DMA N/A. (May only be used for copies (single SRC/DST)
# 3) DMA N/A. (Must have single SRC, at least one DST)
# 4) NIC Queue Pair
# 5) Batched-DMA Batch item (Must have single SRC, at least one DST)

# Each single line in the configuration file defines a set of Transfers (a Test) to run in parallel

@@ -38,6 +39,7 @@
# - C: CPU-executed (Indexed from 0 to # NUMA nodes - 1)
# - G: GPU-executed (Indexed from 0 to # GPUs - 1)
# - D: DMA-executor (Indexed from 0 to # GPUs - 1)
# - B: Batched-DMA-executor (Indexed from 0 to # GPUs - 1)
# - I#.#: NIC executor (Indexed from 0 to # NICs - 1)
# - N#.#: Nearest NIC executor (Indexed from 0 to # GPUs - 1)
# dstMemL : Destination memory locations (Where the data is to be written to)