From a108f3c415c5f06d88fba008c1da2f6a5eab8cc2 Mon Sep 17 00:00:00 2001 From: Emilio Gallicchio Date: Sat, 5 Aug 2023 23:10:56 -0400 Subject: [PATCH 01/14] Add ATMForce --- platforms/hip/include/HipKernels.h | 12 ++++++++++++ platforms/hip/src/HipKernelFactory.cpp | 2 ++ platforms/hip/src/HipPlatform.cpp | 1 + 3 files changed, 15 insertions(+) diff --git a/platforms/hip/include/HipKernels.h b/platforms/hip/include/HipKernels.h index 0c317b9..c1adf84 100644 --- a/platforms/hip/include/HipKernels.h +++ b/platforms/hip/include/HipKernels.h @@ -360,6 +360,18 @@ class HipCalcCustomCVForceKernel : public CommonCalcCustomCVForceKernel { } }; +/** + * This kernel is invoked by ATMForce to calculate the forces acting on the system and the energy of the system. + */ +class HipCalcATMForceKernel : public CommonCalcATMForceKernel { +public: + HipCalcATMForceKernel(std::string name, const Platform& platform, ComputeContext& cc) : CommonCalcATMForceKernel(name, platform, cc) { + } + ComputeContext& getInnerComputeContext(ContextImpl& innerContext) { + return *reinterpret_cast(innerContext.getPlatformData())->contexts[0]; + } +}; + } // namespace OpenMM #endif /*OPENMM_HIPKERNELS_H_*/ diff --git a/platforms/hip/src/HipKernelFactory.cpp b/platforms/hip/src/HipKernelFactory.cpp index 9ef58a8..cc71294 100644 --- a/platforms/hip/src/HipKernelFactory.cpp +++ b/platforms/hip/src/HipKernelFactory.cpp @@ -141,5 +141,7 @@ KernelImpl* HipKernelFactory::createKernelImpl(std::string name, const Platform& return new CommonApplyMonteCarloBarostatKernel(name, platform, cu); if (name == RemoveCMMotionKernel::Name()) return new CommonRemoveCMMotionKernel(name, platform, cu); + if (name == CalcATMForceKernel::Name() ) + return new HipCalcATMForceKernel(name, platform, cu); throw OpenMMException((std::string("Tried to create kernel with illegal kernel name '")+name+"'").c_str()); } diff --git a/platforms/hip/src/HipPlatform.cpp b/platforms/hip/src/HipPlatform.cpp index 4ac15a9..c72264f 100644 --- a/platforms/hip/src/HipPlatform.cpp +++ b/platforms/hip/src/HipPlatform.cpp @@ -108,6 +108,7 @@ HipPlatform::HipPlatform() { registerKernelFactory(ApplyAndersenThermostatKernel::Name(), factory); registerKernelFactory(ApplyMonteCarloBarostatKernel::Name(), factory); registerKernelFactory(RemoveCMMotionKernel::Name(), factory); + registerKernelFactory(CalcATMForceKernel::Name(), factory); platformProperties.push_back(HipDeviceIndex()); platformProperties.push_back(HipDeviceName()); platformProperties.push_back(HipUseBlockingSync()); From 5e07154af796b5cd64b0ff6d7c34f9d26e6a8d21 Mon Sep 17 00:00:00 2001 From: bdenhollander <44237618+bdenhollander@users.noreply.github.com> Date: Mon, 7 Aug 2023 13:35:06 -0400 Subject: [PATCH 02/14] Fix incomplete PTX file written to disk Bytes written is sometimes less than original ptx.size() and hipModuleLoad throws an a string too long exception. Setting binary output writes all the bytes. --- platforms/hip/src/HipContext.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/platforms/hip/src/HipContext.cpp b/platforms/hip/src/HipContext.cpp index 771496b..1a1a507 100644 --- a/platforms/hip/src/HipContext.cpp +++ b/platforms/hip/src/HipContext.cpp @@ -607,7 +607,7 @@ hipModule_t HipContext::createModule(const string source, const map Date: Mon, 4 Sep 2023 13:47:59 -0400 Subject: [PATCH 03/14] Port skip neighbor list for very small systems - Port optimization from https://github.com/openmm/openmm/pull/4070 to HIP for compatibility with upcoming OpenMM 8.1 release - It may be possible to revert some of the changes in https://github.com/amd/openmm-hip/commit/08c967d1b1b4b1fd980d6811166c397f3aaf4023, which was optimizing for small systems as well --- platforms/hip/include/HipNonbondedUtilities.h | 10 +++++--- platforms/hip/src/HipKernels.cpp | 2 +- platforms/hip/src/HipNonbondedUtilities.cpp | 23 +++++++++++-------- platforms/hip/src/kernels/nonbonded.hip | 8 +++---- 4 files changed, 25 insertions(+), 18 deletions(-) diff --git a/platforms/hip/include/HipNonbondedUtilities.h b/platforms/hip/include/HipNonbondedUtilities.h index d9fc3b9..319621e 100644 --- a/platforms/hip/include/HipNonbondedUtilities.h +++ b/platforms/hip/include/HipNonbondedUtilities.h @@ -83,8 +83,10 @@ class OPENMM_EXPORT_COMMON HipNonbondedUtilities : public NonbondedUtilities { * @param exclusionList for each atom, specifies the list of other atoms whose interactions should be excluded * @param kernel the code to evaluate the interaction * @param forceGroup the force group in which the interaction should be calculated + * @param usesNeighborList specifies whether a neighbor list should be used to optimize this interaction. This should + * be viewed as only a suggestion. Even when it is false, a neighbor list may be used anyway. */ - void addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const std::vector >& exclusionList, const std::string& kernel, int forceGroup); + void addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const std::vector >& exclusionList, const std::string& kernel, int forceGroup, bool usesNeighborList = true); /** * Add a nonbonded interaction to be evaluated by the default interaction kernel. * @@ -95,9 +97,11 @@ class OPENMM_EXPORT_COMMON HipNonbondedUtilities : public NonbondedUtilities { * @param exclusionList for each atom, specifies the list of other atoms whose interactions should be excluded * @param kernel the code to evaluate the interaction * @param forceGroup the force group in which the interaction should be calculated + * @param usesNeighborList specifies whether a neighbor list should be used to optimize this interaction. This should + * be viewed as only a suggestion. Even when it is false, a neighbor list may be used anyway. * @param supportsPairList specifies whether this interaction can work with a neighbor list that uses a separate pair list */ - void addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const std::vector >& exclusionList, const std::string& kernel, int forceGroup, bool supportsPairList); + void addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const std::vector >& exclusionList, const std::string& kernel, int forceGroup, bool usesNeighborList, bool supportsPairList); /** * Add a per-atom parameter that the default interaction kernel may depend on. */ @@ -349,7 +353,7 @@ class OPENMM_EXPORT_COMMON HipNonbondedUtilities : public NonbondedUtilities { std::map groupCutoff; std::map groupKernelSource; double lastCutoff; - bool useCutoff, usePeriodic, anyExclusions, usePadding, forceRebuildNeighborList, canUsePairList; + bool useCutoff, usePeriodic, anyExclusions, usePadding, useNeighborList, forceRebuildNeighborList, canUsePairList; int startTileIndex, startBlockIndex, numBlocks, numTilesInBatch, maxExclusions; int numForceThreadBlocks, forceThreadBlockSize, findInteractingBlocksThreadBlockSize, numAtoms, groupFlags; unsigned int maxTiles, maxSinglePairs, tilesAfterReorder; diff --git a/platforms/hip/src/HipKernels.cpp b/platforms/hip/src/HipKernels.cpp index fc816e4..df2cf69 100644 --- a/platforms/hip/src/HipKernels.cpp +++ b/platforms/hip/src/HipKernels.cpp @@ -1001,7 +1001,7 @@ void HipCalcNonbondedForceKernel::initialize(const System& system, const Nonbond } source = cu.replaceStrings(source, replacements); if (force.getIncludeDirectSpace()) - cu.getNonbondedUtilities().addInteraction(useCutoff, usePeriodic, true, force.getCutoffDistance(), exclusionList, source, force.getForceGroup(), true); + cu.getNonbondedUtilities().addInteraction(useCutoff, usePeriodic, true, force.getCutoffDistance(), exclusionList, source, force.getForceGroup(), numParticles > 3000, true); // Initialize the exceptions. diff --git a/platforms/hip/src/HipNonbondedUtilities.cpp b/platforms/hip/src/HipNonbondedUtilities.cpp index 5dd6fe0..49824ee 100644 --- a/platforms/hip/src/HipNonbondedUtilities.cpp +++ b/platforms/hip/src/HipNonbondedUtilities.cpp @@ -65,7 +65,7 @@ class HipNonbondedUtilities::BlockSortTrait : public HipSort::SortTrait { bool useDouble; }; -HipNonbondedUtilities::HipNonbondedUtilities(HipContext& context) : context(context), useCutoff(false), usePeriodic(false), anyExclusions(false), usePadding(true), +HipNonbondedUtilities::HipNonbondedUtilities(HipContext& context) : context(context), useCutoff(false), usePeriodic(false), useNeighborList(false), anyExclusions(false), usePadding(true), blockSorter(NULL), pinnedCountBuffer(NULL), forceRebuildNeighborList(true), lastCutoff(0.0), groupFlags(0), canUsePairList(true), tilesAfterReorder(0) { // Decide how many thread blocks to use. @@ -86,11 +86,11 @@ HipNonbondedUtilities::~HipNonbondedUtilities() { hipEventDestroy(downloadCountEvent); } -void HipNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const vector >& exclusionList, const string& kernel, int forceGroup) { - addInteraction(usesCutoff, usesPeriodic, usesExclusions, cutoffDistance, exclusionList, kernel, forceGroup, false); +void HipNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const vector >& exclusionList, const string& kernel, int forceGroup, bool usesNeighborList) { + addInteraction(usesCutoff, usesPeriodic, usesExclusions, cutoffDistance, exclusionList, kernel, forceGroup, usesNeighborList, false); } -void HipNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const vector >& exclusionList, const string& kernel, int forceGroup, bool supportsPairList) { +void HipNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const vector >& exclusionList, const string& kernel, int forceGroup, bool usesNeighborList, bool supportsPairList) { if (groupCutoff.size() > 0) { if (usesCutoff != useCutoff) throw OpenMMException("All Forces must agree on whether to use a cutoff"); @@ -103,6 +103,7 @@ void HipNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic, b requestExclusions(exclusionList); useCutoff = usesCutoff; usePeriodic = usesPeriodic; + useNeighborList |= (usesNeighborList && useCutoff); groupCutoff[forceGroup] = cutoffDistance; groupFlags |= 1< 0) { + if (useNeighborList && numTiles > 0) { hipEventSynchronize(downloadCountEvent); updateNeighborListSize(); } @@ -671,6 +672,8 @@ hipFunction_t HipNonbondedUtilities::createInteractionKernel(const string& sourc defines["USE_EXCLUSIONS"] = "1"; if (isSymmetric) defines["USE_SYMMETRIC"] = "1"; + if (useNeighborList) + defines["USE_NEIGHBOR_LIST"] = "1"; defines["ENABLE_SHUFFLE"] = "1"; // Used only in hippoNonbonded.cc if (includeForces) defines["INCLUDE_FORCES"] = "1"; diff --git a/platforms/hip/src/kernels/nonbonded.hip b/platforms/hip/src/kernels/nonbonded.hip index d6e1716..9cbaae0 100644 --- a/platforms/hip/src/kernels/nonbonded.hip +++ b/platforms/hip/src/kernels/nonbonded.hip @@ -237,7 +237,7 @@ extern "C" __launch_bounds__(THREAD_BLOCK_SIZE) __global__ void computeNonbonded // Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all // of them (no cutoff). -#ifdef USE_CUTOFF +#ifdef USE_NEIGHBOR_LIST const unsigned int numTiles = interactionCount[0]; if (numTiles > maxTiles) return; // There wasn't enough memory for the neighbor list. @@ -262,7 +262,7 @@ extern "C" __launch_bounds__(THREAD_BLOCK_SIZE) __global__ void computeNonbonded // Extract the coordinates of this tile. int x, y; bool singlePeriodicCopy = false; -#ifdef USE_CUTOFF +#ifdef USE_NEIGHBOR_LIST x = tiles[pos]; real4 blockSizeX = blockSize[x]; singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= MAX_CUTOFF && @@ -297,7 +297,7 @@ extern "C" __launch_bounds__(THREAD_BLOCK_SIZE) __global__ void computeNonbonded // Load atom data for this tile. real4 posq1 = posq[atom1]; LOAD_ATOM1_PARAMETERS -#ifdef USE_CUTOFF +#ifdef USE_NEIGHBOR_LIST unsigned int j = interactingAtoms[pos*TILE_SIZE+tgx]; #else unsigned int j = y*TILE_SIZE + tgx; @@ -454,7 +454,7 @@ extern "C" __launch_bounds__(THREAD_BLOCK_SIZE) __global__ void computeNonbonded // Third loop: single pairs that aren't part of a tile. -#if USE_CUTOFF +#if USE_NEIGHBOR_LIST const unsigned int numPairs = interactionCount[1]; if (numPairs > maxSinglePairs) return; // There wasn't enough memory for the neighbor list. From 65e105fb2ef086b38e1057858d2ca3ac9cb369f2 Mon Sep 17 00:00:00 2001 From: Anton Gorenko Date: Sat, 14 Oct 2023 13:13:57 +0600 Subject: [PATCH 04/14] Add missing tests for ATMForce Related to https://github.com/amd/openmm-hip/pull/7 --- platforms/hip/tests/TestHipATMForce.cpp | 36 +++++++++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 platforms/hip/tests/TestHipATMForce.cpp diff --git a/platforms/hip/tests/TestHipATMForce.cpp b/platforms/hip/tests/TestHipATMForce.cpp new file mode 100644 index 0000000..6a594fc --- /dev/null +++ b/platforms/hip/tests/TestHipATMForce.cpp @@ -0,0 +1,36 @@ +/* -------------------------------------------------------------------------- * + * OpenMM * + * -------------------------------------------------------------------------- * + * This is part of the OpenMM molecular simulation toolkit originating from * + * Simbios, the NIH National Center for Physics-Based Simulation of * + * Biological Structures at Stanford, funded under the NIH Roadmap for * + * Medical Research, grant U54 GM072970. See https://simtk.org. * + * * + * Portions copyright (c) 2023 Stanford University and the Authors. * + * Authors: Peter Eastman * + * Contributors: * + * * + * Permission is hereby granted, free of charge, to any person obtaining a * + * copy of this software and associated documentation files (the "Software"), * + * to deal in the Software without restriction, including without limitation * + * the rights to use, copy, modify, merge, publish, distribute, sublicense, * + * and/or sell copies of the Software, and to permit persons to whom the * + * Software is furnished to do so, subject to the following conditions: * + * * + * The above copyright notice and this permission notice shall be included in * + * all copies or substantial portions of the Software. * + * * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * + * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, * + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR * + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE * + * USE OR OTHER DEALINGS IN THE SOFTWARE. * + * -------------------------------------------------------------------------- */ + +#include "HipTests.h" +#include "TestATMForce.h" + +void runPlatformTests() { +} From e8fa1d8d20afdf601d60e98baa34d0f486c8c7a0 Mon Sep 17 00:00:00 2001 From: Anton Gorenko Date: Sat, 14 Oct 2023 18:28:43 +0600 Subject: [PATCH 05/14] Fix exclusion tiles sorting on AMD CDNA (64 threads per wave) The nonbonded kernel uses USE_NEIGHBOR_LIST (useNeighborList) so host code also must check it instead of useCutoff. See also https://github.com/openmm/openmm/issues/3462 --- platforms/hip/src/HipNonbondedUtilities.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/platforms/hip/src/HipNonbondedUtilities.cpp b/platforms/hip/src/HipNonbondedUtilities.cpp index 49824ee..de8b84f 100644 --- a/platforms/hip/src/HipNonbondedUtilities.cpp +++ b/platforms/hip/src/HipNonbondedUtilities.cpp @@ -218,7 +218,7 @@ void HipNonbondedUtilities::initialize(const System& system) { vector exclusionTilesVec; for (set >::const_iterator iter = tilesWithExclusions.begin(); iter != tilesWithExclusions.end(); ++iter) exclusionTilesVec.push_back(make_int2(iter->first, iter->second)); - sort(exclusionTilesVec.begin(), exclusionTilesVec.end(), context.getSIMDWidth() <= 32 || !useCutoff ? compareInt2 : compareInt2LargeSIMD); + sort(exclusionTilesVec.begin(), exclusionTilesVec.end(), context.getSIMDWidth() <= 32 || !useNeighborList ? compareInt2 : compareInt2LargeSIMD); exclusionTiles.initialize(context, exclusionTilesVec.size(), "exclusionTiles"); exclusionTiles.upload(exclusionTilesVec); map, int> exclusionTileMap; From 52ee803356d81efe15c225ff0cdd86366e656807 Mon Sep 17 00:00:00 2001 From: Anton Gorenko Date: Sat, 21 Oct 2023 12:01:07 +0600 Subject: [PATCH 06/14] Port "CustomCPPForceImpl for writing forces in C++" https://github.com/openmm/openmm/commit/9a0db72 https://github.com/openmm/openmm/pull/4231 --- platforms/hip/src/HipKernelFactory.cpp | 4 ++- platforms/hip/src/HipPlatform.cpp | 1 + platforms/hip/tests/TestHipCustomCPPForce.cpp | 36 +++++++++++++++++++ 3 files changed, 40 insertions(+), 1 deletion(-) create mode 100644 platforms/hip/tests/TestHipCustomCPPForce.cpp diff --git a/platforms/hip/src/HipKernelFactory.cpp b/platforms/hip/src/HipKernelFactory.cpp index cc71294..f75d1fe 100644 --- a/platforms/hip/src/HipKernelFactory.cpp +++ b/platforms/hip/src/HipKernelFactory.cpp @@ -6,7 +6,7 @@ * Biological Structures at Stanford, funded under the NIH Roadmap for * * Medical Research, grant U54 GM072970. See https://simtk.org. * * * - * Portions copyright (c) 2008-2019 Stanford University and the Authors. * + * Portions copyright (c) 2008-2023 Stanford University and the Authors. * * Portions copyright (C) 2020 Advanced Micro Devices, Inc. All Rights * * Reserved. * * Authors: Peter Eastman, Nicholas Curtis * @@ -113,6 +113,8 @@ KernelImpl* HipKernelFactory::createKernelImpl(std::string name, const Platform& return new CommonCalcCustomCompoundBondForceKernel(name, platform, cu, context.getSystem()); if (name == CalcCustomCVForceKernel::Name()) return new HipCalcCustomCVForceKernel(name, platform, cu); + if (name == CalcCustomCPPForceKernel::Name()) + return new CommonCalcCustomCPPForceKernel(name, platform, context, cu); if (name == CalcRMSDForceKernel::Name()) return new CommonCalcRMSDForceKernel(name, platform, cu); if (name == CalcCustomManyParticleForceKernel::Name()) diff --git a/platforms/hip/src/HipPlatform.cpp b/platforms/hip/src/HipPlatform.cpp index c72264f..21cba5f 100644 --- a/platforms/hip/src/HipPlatform.cpp +++ b/platforms/hip/src/HipPlatform.cpp @@ -93,6 +93,7 @@ HipPlatform::HipPlatform() { registerKernelFactory(CalcCustomHbondForceKernel::Name(), factory); registerKernelFactory(CalcCustomCentroidBondForceKernel::Name(), factory); registerKernelFactory(CalcCustomCompoundBondForceKernel::Name(), factory); + registerKernelFactory(CalcCustomCPPForceKernel::Name(), factory); registerKernelFactory(CalcCustomCVForceKernel::Name(), factory); registerKernelFactory(CalcRMSDForceKernel::Name(), factory); registerKernelFactory(CalcCustomManyParticleForceKernel::Name(), factory); diff --git a/platforms/hip/tests/TestHipCustomCPPForce.cpp b/platforms/hip/tests/TestHipCustomCPPForce.cpp new file mode 100644 index 0000000..7910447 --- /dev/null +++ b/platforms/hip/tests/TestHipCustomCPPForce.cpp @@ -0,0 +1,36 @@ +/* -------------------------------------------------------------------------- * + * OpenMM * + * -------------------------------------------------------------------------- * + * This is part of the OpenMM molecular simulation toolkit originating from * + * Simbios, the NIH National Center for Physics-Based Simulation of * + * Biological Structures at Stanford, funded under the NIH Roadmap for * + * Medical Research, grant U54 GM072970. See https://simtk.org. * + * * + * Portions copyright (c) 2023 Stanford University and the Authors. * + * Authors: Peter Eastman * + * Contributors: * + * * + * Permission is hereby granted, free of charge, to any person obtaining a * + * copy of this software and associated documentation files (the "Software"), * + * to deal in the Software without restriction, including without limitation * + * the rights to use, copy, modify, merge, publish, distribute, sublicense, * + * and/or sell copies of the Software, and to permit persons to whom the * + * Software is furnished to do so, subject to the following conditions: * + * * + * The above copyright notice and this permission notice shall be included in * + * all copies or substantial portions of the Software. * + * * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * + * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, * + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR * + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE * + * USE OR OTHER DEALINGS IN THE SOFTWARE. * + * -------------------------------------------------------------------------- */ + +#include "HipTests.h" +#include "TestCustomCPPForce.h" + +void runPlatformTests() { +} From 9d9394a172bb6c7d610286baf0ade0123942e7ee Mon Sep 17 00:00:00 2001 From: Anton Gorenko Date: Mon, 27 Nov 2023 21:00:00 +0600 Subject: [PATCH 07/14] Port "Store bounding box sizes in half precision" https://github.com/openmm/openmm/commit/2ae50f9 --- .../hip/src/kernels/findInteractingBlocks.hip | 27 +++++++++++++++---- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/platforms/hip/src/kernels/findInteractingBlocks.hip b/platforms/hip/src/kernels/findInteractingBlocks.hip index 65db0f8..39258c3 100644 --- a/platforms/hip/src/kernels/findInteractingBlocks.hip +++ b/platforms/hip/src/kernels/findInteractingBlocks.hip @@ -1,3 +1,5 @@ +#include + #define BUFFER_SIZE 256 #if defined(AMD_RDNA) @@ -18,6 +20,21 @@ __device__ inline int warpPopc(warpflags x) { #endif +struct alignas(sizeof(__half) * 4) BoundingBox { + __device__ BoundingBox(real3 f) { + // Round up so we'll err on the side of making the box a little too large. + // This ensures interactions will never be missed. + v[0] = __float2half_ru((float) f.x); + v[1] = __float2half_ru((float) f.y); + v[2] = __float2half_ru((float) f.z); + } + __device__ real3 toReal3() const { + return make_real3(__half2float(v[0]), __half2float(v[1]), __half2float(v[2])); + } +private: + __half v[3]; +}; + /** * Find a bounding box for the atoms in each block. */ @@ -112,13 +129,13 @@ extern "C" __global__ void findBlockBounds(int numAtoms, real4 periodicBoxSize, */ extern "C" __global__ void sortBoxData(const real2* __restrict__ sortedBlock, const real4* __restrict__ blockCenter, const real4* __restrict__ blockBoundingBox, real4* __restrict__ sortedBlockCenter, - real4* __restrict__ sortedBlockBoundingBox, const real4* __restrict__ posq, const real4* __restrict__ oldPositions, + BoundingBox* __restrict__ sortedBlockBoundingBox, const real4* __restrict__ posq, const real4* __restrict__ oldPositions, unsigned int* __restrict__ interactionCount, int* __restrict__ rebuildNeighborList, bool forceRebuild) { int i = threadIdx.x+blockIdx.x*blockDim.x; if (i < NUM_BLOCKS) { int index = (int) sortedBlock[i].y; sortedBlockCenter[i] = blockCenter[index]; - sortedBlockBoundingBox[i] = blockBoundingBox[index]; + sortedBlockBoundingBox[i] = BoundingBox(trimTo3(blockBoundingBox[index])); } // Also check whether any atom has moved enough so that we really need to rebuild the neighbor list. @@ -242,7 +259,7 @@ extern "C" __global__ __launch_bounds__(GROUP_SIZE) void findBlocksWithInteracti unsigned int* __restrict__ interactionCount, int* __restrict__ interactingTiles, unsigned int* __restrict__ interactingAtoms, int2* __restrict__ singlePairs, const real4* __restrict__ posq, unsigned int maxTiles, unsigned int maxSinglePairs, unsigned int startBlockIndex, unsigned int numBlocks, real2* __restrict__ sortedBlocks, const real4* __restrict__ sortedBlockCenter, - const real4* __restrict__ sortedBlockBoundingBox, const unsigned int* __restrict__ exclusionIndices, const unsigned int* __restrict__ exclusionRowIndices, + const BoundingBox* __restrict__ sortedBlockBoundingBox, const unsigned int* __restrict__ exclusionIndices, const unsigned int* __restrict__ exclusionRowIndices, real4* __restrict__ oldPositions, const int* __restrict__ rebuildNeighborList) { if (rebuildNeighborList[0] == 0) @@ -276,7 +293,7 @@ extern "C" __global__ __launch_bounds__(GROUP_SIZE) void findBlocksWithInteracti real2 sortedKey = sortedBlocks[block1]; int x = (int) sortedKey.y; real4 blockCenterX = sortedBlockCenter[block1]; - real4 blockSizeX = sortedBlockBoundingBox[block1]; + real3 blockSizeX = sortedBlockBoundingBox[block1].toReal3(); int neighborsInBuffer = 0; real4 pos1 = posq[x*TILE_SIZE+indexInTile]; #ifdef USE_PERIODIC @@ -329,7 +346,7 @@ extern "C" __global__ __launch_bounds__(GROUP_SIZE) void findBlocksWithInteracti if (!lastIteration && __ballot(includeBlock2) == 0) continue; #endif - real4 blockSizeY = sortedBlockBoundingBox[block2]; + real3 blockSizeY = sortedBlockBoundingBox[block2].toReal3(); blockDelta.x = max(0.0f, fabs(blockDelta.x)-blockSizeX.x-blockSizeY.x); blockDelta.y = max(0.0f, fabs(blockDelta.y)-blockSizeX.y-blockSizeY.y); blockDelta.z = max(0.0f, fabs(blockDelta.z)-blockSizeX.z-blockSizeY.z); From d9ecea578057b6267b8ef83230ce9a8196e25ef8 Mon Sep 17 00:00:00 2001 From: Anton Gorenko Date: Sun, 3 Dec 2023 16:23:22 +0600 Subject: [PATCH 08/14] Port "Use large blocks to optimize building the neighbor list" https://github.com/openmm/openmm/commit/3955033 --- platforms/hip/include/HipNonbondedUtilities.h | 4 +- platforms/hip/src/HipNonbondedUtilities.cpp | 36 +++++++++- .../hip/src/kernels/findInteractingBlocks.hip | 70 +++++++++++++++++-- 3 files changed, 103 insertions(+), 7 deletions(-) diff --git a/platforms/hip/include/HipNonbondedUtilities.h b/platforms/hip/include/HipNonbondedUtilities.h index 319621e..8a32a3a 100644 --- a/platforms/hip/include/HipNonbondedUtilities.h +++ b/platforms/hip/include/HipNonbondedUtilities.h @@ -340,6 +340,8 @@ class OPENMM_EXPORT_COMMON HipNonbondedUtilities : public NonbondedUtilities { HipArray sortedBlocks; HipArray sortedBlockCenter; HipArray sortedBlockBoundingBox; + HipArray largeBlockCenter; + HipArray largeBlockBoundingBox; HipArray oldPositions; HipArray rebuildNeighborList; HipSort* blockSorter; @@ -353,7 +355,7 @@ class OPENMM_EXPORT_COMMON HipNonbondedUtilities : public NonbondedUtilities { std::map groupCutoff; std::map groupKernelSource; double lastCutoff; - bool useCutoff, usePeriodic, anyExclusions, usePadding, useNeighborList, forceRebuildNeighborList, canUsePairList; + bool useCutoff, usePeriodic, anyExclusions, usePadding, useNeighborList, forceRebuildNeighborList, canUsePairList, useLargeBlocks; int startTileIndex, startBlockIndex, numBlocks, numTilesInBatch, maxExclusions; int numForceThreadBlocks, forceThreadBlockSize, findInteractingBlocksThreadBlockSize, numAtoms, groupFlags; unsigned int maxTiles, maxSinglePairs, tilesAfterReorder; diff --git a/platforms/hip/src/HipNonbondedUtilities.cpp b/platforms/hip/src/HipNonbondedUtilities.cpp index de8b84f..de62709 100644 --- a/platforms/hip/src/HipNonbondedUtilities.cpp +++ b/platforms/hip/src/HipNonbondedUtilities.cpp @@ -75,6 +75,13 @@ HipNonbondedUtilities::HipNonbondedUtilities(HipContext& context) : context(cont numForceThreadBlocks = 5*4*context.getMultiprocessors(); forceThreadBlockSize = 64; findInteractingBlocksThreadBlockSize = context.getSIMDWidth(); + + // When building the neighbor list, we can optionally use large blocks (32 * warpSize atoms) to + // accelerate the process. This makes building the neighbor list faster, but it prevents + // us from sorting atom blocks by size, which leads to a slightly less efficient neighbor + // list. We guess based on system size which will be faster. + + useLargeBlocks = (context.getNumAtoms() > 90000); setKernelSource(HipKernelSources::nonbonded); } @@ -284,7 +291,14 @@ void HipNonbondedUtilities::initialize(const System& system) { maxTiles = 1; maxSinglePairs = 20*numAtoms; // HIP-TODO: This may require tuning - numTilesInBatch = numAtomBlocks < 2000 ? 4 : 1; + if (useLargeBlocks) { + // It seems beneficial to launch more thread blocks when large blocks are used. + // Hypothesis: when numTilesInBatch warps process one block1, there is a smaller chance + // to have a few warps with extremely long durations. + numTilesInBatch = 2; + } else { + numTilesInBatch = numAtomBlocks < 2000 ? 4 : 1; + } interactingTiles.initialize(context, maxTiles, "interactingTiles"); interactingAtoms.initialize(context, HipContext::TileSize*maxTiles, "interactingAtoms"); interactionCount.initialize(context, 2, "interactionCount"); @@ -295,6 +309,8 @@ void HipNonbondedUtilities::initialize(const System& system) { sortedBlocks.initialize(context, numAtomBlocks, 2*elementSize, "sortedBlocks"); sortedBlockCenter.initialize(context, numAtomBlocks+1, 4*elementSize, "sortedBlockCenter"); sortedBlockBoundingBox.initialize(context, numAtomBlocks+1, 4*elementSize, "sortedBlockBoundingBox"); + largeBlockCenter.initialize(context, numAtomBlocks, 4*elementSize, "largeBlockCenter"); + largeBlockBoundingBox.initialize(context, numAtomBlocks*4, elementSize, "largeBlockBoundingBox"); oldPositions.initialize(context, numAtoms, 4*elementSize, "oldPositions"); rebuildNeighborList.initialize(context, 1, "rebuildNeighborList"); blockSorter = new HipSort(context, new BlockSortTrait(context.getUseDoublePrecision()), numAtomBlocks, false); @@ -350,6 +366,15 @@ void HipNonbondedUtilities::initialize(const System& system) { sortBoxDataArgs.push_back(&blockBoundingBox.getDevicePointer()); sortBoxDataArgs.push_back(&sortedBlockCenter.getDevicePointer()); sortBoxDataArgs.push_back(&sortedBlockBoundingBox.getDevicePointer()); + if (useLargeBlocks) { + sortBoxDataArgs.push_back(&largeBlockCenter.getDevicePointer()); + sortBoxDataArgs.push_back(&largeBlockBoundingBox.getDevicePointer()); + sortBoxDataArgs.push_back(context.getPeriodicBoxSizePointer()); + sortBoxDataArgs.push_back(context.getInvPeriodicBoxSizePointer()); + sortBoxDataArgs.push_back(context.getPeriodicBoxVecXPointer()); + sortBoxDataArgs.push_back(context.getPeriodicBoxVecYPointer()); + sortBoxDataArgs.push_back(context.getPeriodicBoxVecZPointer()); + } sortBoxDataArgs.push_back(&context.getPosq().getDevicePointer()); sortBoxDataArgs.push_back(&oldPositions.getDevicePointer()); sortBoxDataArgs.push_back(&interactionCount.getDevicePointer()); @@ -372,6 +397,10 @@ void HipNonbondedUtilities::initialize(const System& system) { findInteractingBlocksArgs.push_back(&sortedBlocks.getDevicePointer()); findInteractingBlocksArgs.push_back(&sortedBlockCenter.getDevicePointer()); findInteractingBlocksArgs.push_back(&sortedBlockBoundingBox.getDevicePointer()); + if (useLargeBlocks) { + findInteractingBlocksArgs.push_back(&largeBlockCenter.getDevicePointer()); + findInteractingBlocksArgs.push_back(&largeBlockBoundingBox.getDevicePointer()); + } findInteractingBlocksArgs.push_back(&exclusionIndices.getDevicePointer()); findInteractingBlocksArgs.push_back(&exclusionRowIndices.getDevicePointer()); findInteractingBlocksArgs.push_back(&oldPositions.getDevicePointer()); @@ -416,7 +445,8 @@ void HipNonbondedUtilities::prepareInteractions(int forceGroups) { if (lastCutoff != kernels.cutoffDistance) forceRebuildNeighborList = true; context.executeKernelFlat(kernels.findBlockBoundsKernel, &findBlockBoundsArgs[0], context.getPaddedNumAtoms(), context.getSIMDWidth()); - blockSorter->sort(sortedBlocks); + if (!useLargeBlocks) + blockSorter->sort(sortedBlocks); context.executeKernelFlat(kernels.sortBoxDataKernel, &sortBoxDataArgs[0], context.getNumAtoms(), 64); context.executeKernelFlat(kernels.findInteractingBlocksKernel, &findInteractingBlocksArgs[0], context.getNumAtomBlocks() * context.getSIMDWidth() * numTilesInBatch, findInteractingBlocksThreadBlockSize); forceRebuildNeighborList = false; @@ -524,6 +554,8 @@ void HipNonbondedUtilities::createKernelsForGroups(int groups) { defines["USE_PERIODIC"] = "1"; if (context.getBoxIsTriclinic()) defines["TRICLINIC"] = "1"; + if (useLargeBlocks) + defines["USE_LARGE_BLOCKS"] = "1"; defines["MAX_EXCLUSIONS"] = context.intToString(maxExclusions); int maxBits = 0; if (canUsePairList) { diff --git a/platforms/hip/src/kernels/findInteractingBlocks.hip b/platforms/hip/src/kernels/findInteractingBlocks.hip index 39258c3..f146e91 100644 --- a/platforms/hip/src/kernels/findInteractingBlocks.hip +++ b/platforms/hip/src/kernels/findInteractingBlocks.hip @@ -128,14 +128,37 @@ extern "C" __global__ void findBlockBounds(int numAtoms, real4 periodicBoxSize, * Sort the data about bounding boxes so it can be accessed more efficiently in the next kernel. */ extern "C" __global__ void sortBoxData(const real2* __restrict__ sortedBlock, const real4* __restrict__ blockCenter, - const real4* __restrict__ blockBoundingBox, real4* __restrict__ sortedBlockCenter, - BoundingBox* __restrict__ sortedBlockBoundingBox, const real4* __restrict__ posq, const real4* __restrict__ oldPositions, + const real4* __restrict__ blockBoundingBox, real4* __restrict__ sortedBlockCenter, BoundingBox* __restrict__ sortedBlockBoundingBox, +#ifdef USE_LARGE_BLOCKS + real4* __restrict__ largeBlockCenter, BoundingBox* __restrict__ largeBlockBoundingBox, real4 periodicBoxSize, + real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, +#endif + const real4* __restrict__ posq, const real4* __restrict__ oldPositions, unsigned int* __restrict__ interactionCount, int* __restrict__ rebuildNeighborList, bool forceRebuild) { int i = threadIdx.x+blockIdx.x*blockDim.x; if (i < NUM_BLOCKS) { int index = (int) sortedBlock[i].y; sortedBlockCenter[i] = blockCenter[index]; sortedBlockBoundingBox[i] = BoundingBox(trimTo3(blockBoundingBox[index])); +#ifdef USE_LARGE_BLOCKS + // Compute the sizes of large blocks (composed of warpSize regular blocks) starting from each block. + + real4 minPos = blockCenter[i]-blockBoundingBox[i]; + real4 maxPos = blockCenter[i]+blockBoundingBox[i]; + int last = min(i+warpSize, NUM_BLOCKS); + for (int j = i+1; j < last; j++) { + real4 blockPos = blockCenter[j]; + real4 width = blockBoundingBox[j]; +#ifdef USE_PERIODIC + real4 center = 0.5f*(maxPos+minPos); + APPLY_PERIODIC_TO_POS_WITH_CENTER(blockPos, center) +#endif + minPos = make_real4(min(minPos.x, blockPos.x-width.x), min(minPos.y, blockPos.y-width.y), min(minPos.z, blockPos.z-width.z), 0); + maxPos = make_real4(max(maxPos.x, blockPos.x+width.x), max(maxPos.y, blockPos.y+width.y), max(maxPos.z, blockPos.z+width.z), 0); + } + largeBlockCenter[i] = 0.5f*(maxPos+minPos); + largeBlockBoundingBox[i] = BoundingBox(trimTo3(0.5f*(maxPos-minPos))); +#endif } // Also check whether any atom has moved enough so that we really need to rebuild the neighbor list. @@ -258,8 +281,12 @@ void mfma4x4(const float4& pos1, const float4& pos2, const vfloat& c, unsigned i extern "C" __global__ __launch_bounds__(GROUP_SIZE) void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int* __restrict__ interactionCount, int* __restrict__ interactingTiles, unsigned int* __restrict__ interactingAtoms, int2* __restrict__ singlePairs, const real4* __restrict__ posq, unsigned int maxTiles, unsigned int maxSinglePairs, - unsigned int startBlockIndex, unsigned int numBlocks, real2* __restrict__ sortedBlocks, const real4* __restrict__ sortedBlockCenter, - const BoundingBox* __restrict__ sortedBlockBoundingBox, const unsigned int* __restrict__ exclusionIndices, const unsigned int* __restrict__ exclusionRowIndices, + unsigned int startBlockIndex, unsigned int numBlocks, const real2* __restrict__ sortedBlocks, const real4* __restrict__ sortedBlockCenter, + const BoundingBox* __restrict__ sortedBlockBoundingBox, +#ifdef USE_LARGE_BLOCKS + const real4* __restrict__ largeBlockCenter, const BoundingBox* __restrict__ largeBlockBoundingBox, +#endif + const unsigned int* __restrict__ exclusionIndices, const unsigned int* __restrict__ exclusionRowIndices, real4* __restrict__ oldPositions, const int* __restrict__ rebuildNeighborList) { if (rebuildNeighborList[0] == 0) @@ -328,10 +355,45 @@ extern "C" __global__ __launch_bounds__(GROUP_SIZE) void findBlocksWithInteracti // units are idle at the end of the kernel (the kernel works on the upper triangle of // the NUM_BLOCKS x NUM_BLOCKS matrix). +#ifdef USE_LARGE_BLOCKS + warpflags largeBlockFlags = 0; + int loadedLargeBlocks = 0; +#endif int block2Count = 0; // Load blocks from addresses aligned by warpSize for faster loading from sortedBlockCenter and sortedBlockBoundingBox. for (int block2Base = ((block1+1)/warpSize + warpIndex%NUM_TILES_IN_BATCH)*warpSize; block2Base < NUM_BLOCKS; block2Base += warpSize*NUM_TILES_IN_BATCH) { + // Last iteration cannot be skipped (on CDNA where tilesPerWarp == 2) const bool lastIteration = block2Base + warpSize*NUM_TILES_IN_BATCH >= NUM_BLOCKS; +#ifdef USE_LARGE_BLOCKS + if (loadedLargeBlocks == 0) { + // Check the next set of large blocks. + + int largeBlockIndex = block2Base + warpSize*NUM_TILES_IN_BATCH*indexInWarp; + bool includeLargeBlock = false; + if (largeBlockIndex < NUM_BLOCKS) { + real4 largeCenter = largeBlockCenter[largeBlockIndex]; + real3 largeSize = largeBlockBoundingBox[largeBlockIndex].toReal3(); + real4 blockDelta = blockCenterX-largeCenter; +#ifdef USE_PERIODIC + APPLY_PERIODIC_TO_DELTA(blockDelta) +#endif + blockDelta.x = max(0.0f, fabs(blockDelta.x)-blockSizeX.x-largeSize.x); + blockDelta.y = max(0.0f, fabs(blockDelta.y)-blockSizeX.y-largeSize.y); + blockDelta.z = max(0.0f, fabs(blockDelta.z)-blockSizeX.z-largeSize.z); + includeLargeBlock = (blockDelta.x*blockDelta.x+blockDelta.y*blockDelta.y+blockDelta.z*blockDelta.z < PADDED_CUTOFF_SQUARED); + } + largeBlockFlags = __ballot(includeLargeBlock); + loadedLargeBlocks = warpSize; + } + loadedLargeBlocks--; + if ((largeBlockFlags&1) == 0 && !lastIteration) { + // None of the next warpSize blocks interact with block 1. + + largeBlockFlags >>= 1; + continue; + } + largeBlockFlags >>= 1; +#endif int block2 = block2Base+indexInWarp; bool includeBlock2 = (block1 < block2 && block2 < NUM_BLOCKS); block2 = includeBlock2 ? block2 : block1; From e0d23248fb0118192e665d35d651fec882126033 Mon Sep 17 00:00:00 2001 From: Anton Gorenko Date: Sun, 17 Dec 2023 12:40:24 +0600 Subject: [PATCH 09/14] Port "Improved sorting of blocks when building neighbor list" and "Fixed bug in large blocks optimization with triclinic boxes" https://github.com/openmm/openmm/commit/796ffaa https://github.com/openmm/openmm/commit/4c10732 --- platforms/hip/include/HipNonbondedUtilities.h | 6 +- platforms/hip/src/HipNonbondedUtilities.cpp | 58 +++++++++-------- .../hip/src/kernels/findInteractingBlocks.hip | 64 ++++++++++++++----- 3 files changed, 84 insertions(+), 44 deletions(-) diff --git a/platforms/hip/include/HipNonbondedUtilities.h b/platforms/hip/include/HipNonbondedUtilities.h index 8a32a3a..9bc6485 100644 --- a/platforms/hip/include/HipNonbondedUtilities.h +++ b/platforms/hip/include/HipNonbondedUtilities.h @@ -9,7 +9,7 @@ * Biological Structures at Stanford, funded under the NIH Roadmap for * * Medical Research, grant U54 GM072970. See https://simtk.org. * * * - * Portions copyright (c) 2009-2022 Stanford University and the Authors. * + * Portions copyright (c) 2009-2023 Stanford University and the Authors. * * Portions copyright (C) 2020-2023 Advanced Micro Devices, Inc. All Rights * * Reserved. * * Authors: Peter Eastman, Nicholas Curtis * @@ -340,6 +340,7 @@ class OPENMM_EXPORT_COMMON HipNonbondedUtilities : public NonbondedUtilities { HipArray sortedBlocks; HipArray sortedBlockCenter; HipArray sortedBlockBoundingBox; + HipArray blockSizeRange; HipArray largeBlockCenter; HipArray largeBlockBoundingBox; HipArray oldPositions; @@ -347,7 +348,7 @@ class OPENMM_EXPORT_COMMON HipNonbondedUtilities : public NonbondedUtilities { HipSort* blockSorter; hipEvent_t downloadCountEvent; unsigned int* pinnedCountBuffer; - std::vector forceArgs, findBlockBoundsArgs, sortBoxDataArgs, findInteractingBlocksArgs, copyInteractionCountsArgs; + std::vector forceArgs, findBlockBoundsArgs, computeSortKeysArgs, sortBoxDataArgs, findInteractingBlocksArgs, copyInteractionCountsArgs; std::vector > atomExclusions; std::vector parameters; std::vector arguments; @@ -374,6 +375,7 @@ class HipNonbondedUtilities::KernelSet { std::string source; hipFunction_t forceKernel, energyKernel, forceEnergyKernel; hipFunction_t findBlockBoundsKernel; + hipFunction_t computeSortKeysKernel; hipFunction_t sortBoxDataKernel; hipFunction_t findInteractingBlocksKernel; hipFunction_t copyInteractionCountsKernel; diff --git a/platforms/hip/src/HipNonbondedUtilities.cpp b/platforms/hip/src/HipNonbondedUtilities.cpp index de62709..4307c0d 100644 --- a/platforms/hip/src/HipNonbondedUtilities.cpp +++ b/platforms/hip/src/HipNonbondedUtilities.cpp @@ -6,7 +6,7 @@ * Biological Structures at Stanford, funded under the NIH Roadmap for * * Medical Research, grant U54 GM072970. See https://simtk.org. * * * - * Portions copyright (c) 2009-2022 Stanford University and the Authors. * + * Portions copyright (c) 2009-2023 Stanford University and the Authors. * * Portions copyright (C) 2020-2023 Advanced Micro Devices, Inc. All Rights * * Reserved. * * Authors: Peter Eastman, Nicholas Curtis * @@ -51,18 +51,15 @@ using namespace std; class HipNonbondedUtilities::BlockSortTrait : public HipSort::SortTrait { public: - BlockSortTrait(bool useDouble) : useDouble(useDouble) { - } - int getDataSize() const {return useDouble ? sizeof(double2) : sizeof(float2);} - int getKeySize() const {return useDouble ? sizeof(double) : sizeof(float);} - const char* getDataType() const {return "real2";} - const char* getKeyType() const {return "real";} - const char* getMinKey() const {return "-3.40282e+38f";} - const char* getMaxKey() const {return "3.40282e+38f";} - const char* getMaxValue() const {return "make_real2(3.40282e+38f, 3.40282e+38f)";} - const char* getSortKey() const {return "value.x";} -private: - bool useDouble; + BlockSortTrait() {} + int getDataSize() const {return sizeof(int);} + int getKeySize() const {return sizeof(int);} + const char* getDataType() const {return "unsigned int";} + const char* getKeyType() const {return "unsigned int";} + const char* getMinKey() const {return "0";} + const char* getMaxKey() const {return "0xFFFFFFFFu";} + const char* getMaxValue() const {return "0xFFFFFFFFu";} + const char* getSortKey() const {return "value";} }; HipNonbondedUtilities::HipNonbondedUtilities(HipContext& context) : context(context), useCutoff(false), usePeriodic(false), useNeighborList(false), anyExclusions(false), usePadding(true), @@ -291,14 +288,7 @@ void HipNonbondedUtilities::initialize(const System& system) { maxTiles = 1; maxSinglePairs = 20*numAtoms; // HIP-TODO: This may require tuning - if (useLargeBlocks) { - // It seems beneficial to launch more thread blocks when large blocks are used. - // Hypothesis: when numTilesInBatch warps process one block1, there is a smaller chance - // to have a few warps with extremely long durations. - numTilesInBatch = 2; - } else { - numTilesInBatch = numAtomBlocks < 2000 ? 4 : 1; - } + numTilesInBatch = numAtomBlocks < 2000 ? 4 : 1; interactingTiles.initialize(context, maxTiles, "interactingTiles"); interactingAtoms.initialize(context, HipContext::TileSize*maxTiles, "interactingAtoms"); interactionCount.initialize(context, 2, "interactionCount"); @@ -306,17 +296,23 @@ void HipNonbondedUtilities::initialize(const System& system) { int elementSize = (context.getUseDoublePrecision() ? sizeof(double) : sizeof(float)); blockCenter.initialize(context, numAtomBlocks, 4*elementSize, "blockCenter"); blockBoundingBox.initialize(context, numAtomBlocks, 4*elementSize, "blockBoundingBox"); - sortedBlocks.initialize(context, numAtomBlocks, 2*elementSize, "sortedBlocks"); + sortedBlocks.initialize(context, numAtomBlocks, "sortedBlocks"); sortedBlockCenter.initialize(context, numAtomBlocks+1, 4*elementSize, "sortedBlockCenter"); sortedBlockBoundingBox.initialize(context, numAtomBlocks+1, 4*elementSize, "sortedBlockBoundingBox"); + blockSizeRange.initialize(context, 2, elementSize, "blockSizeRange"); largeBlockCenter.initialize(context, numAtomBlocks, 4*elementSize, "largeBlockCenter"); largeBlockBoundingBox.initialize(context, numAtomBlocks*4, elementSize, "largeBlockBoundingBox"); oldPositions.initialize(context, numAtoms, 4*elementSize, "oldPositions"); rebuildNeighborList.initialize(context, 1, "rebuildNeighborList"); - blockSorter = new HipSort(context, new BlockSortTrait(context.getUseDoublePrecision()), numAtomBlocks, false); + blockSorter = new HipSort(context, new BlockSortTrait(), numAtomBlocks, false); vector count(2, 0); interactionCount.upload(count); rebuildNeighborList.upload(&count[0]); + if (context.getUseDoublePrecision()) { + blockSizeRange.upload(vector{1e38, 0}); + } else { + blockSizeRange.upload(vector{1e38, 0}); + } } // Record arguments for kernels. @@ -360,7 +356,10 @@ void HipNonbondedUtilities::initialize(const System& system) { findBlockBoundsArgs.push_back(&blockCenter.getDevicePointer()); findBlockBoundsArgs.push_back(&blockBoundingBox.getDevicePointer()); findBlockBoundsArgs.push_back(&rebuildNeighborList.getDevicePointer()); - findBlockBoundsArgs.push_back(&sortedBlocks.getDevicePointer()); + findBlockBoundsArgs.push_back(&blockSizeRange.getDevicePointer()); + computeSortKeysArgs.push_back(&blockBoundingBox.getDevicePointer()); + computeSortKeysArgs.push_back(&sortedBlocks.getDevicePointer()); + computeSortKeysArgs.push_back(&blockSizeRange.getDevicePointer()); sortBoxDataArgs.push_back(&sortedBlocks.getDevicePointer()); sortBoxDataArgs.push_back(&blockCenter.getDevicePointer()); sortBoxDataArgs.push_back(&blockBoundingBox.getDevicePointer()); @@ -380,6 +379,7 @@ void HipNonbondedUtilities::initialize(const System& system) { sortBoxDataArgs.push_back(&interactionCount.getDevicePointer()); sortBoxDataArgs.push_back(&rebuildNeighborList.getDevicePointer()); sortBoxDataArgs.push_back(&forceRebuildNeighborList); + sortBoxDataArgs.push_back(&blockSizeRange.getDevicePointer()); findInteractingBlocksArgs.push_back(context.getPeriodicBoxSizePointer()); findInteractingBlocksArgs.push_back(context.getInvPeriodicBoxSizePointer()); findInteractingBlocksArgs.push_back(context.getPeriodicBoxVecXPointer()); @@ -445,8 +445,8 @@ void HipNonbondedUtilities::prepareInteractions(int forceGroups) { if (lastCutoff != kernels.cutoffDistance) forceRebuildNeighborList = true; context.executeKernelFlat(kernels.findBlockBoundsKernel, &findBlockBoundsArgs[0], context.getPaddedNumAtoms(), context.getSIMDWidth()); - if (!useLargeBlocks) - blockSorter->sort(sortedBlocks); + context.executeKernelFlat(kernels.computeSortKeysKernel, &computeSortKeysArgs[0], context.getNumAtomBlocks()); + blockSorter->sort(sortedBlocks); context.executeKernelFlat(kernels.sortBoxDataKernel, &sortBoxDataArgs[0], context.getNumAtoms(), 64); context.executeKernelFlat(kernels.findInteractingBlocksKernel, &findInteractingBlocksArgs[0], context.getNumAtomBlocks() * context.getSIMDWidth() * numTilesInBatch, findInteractingBlocksThreadBlockSize); forceRebuildNeighborList = false; @@ -584,8 +584,14 @@ void HipNonbondedUtilities::createKernelsForGroups(int groups) { defines["MAX_BITS_FOR_PAIRS"] = context.intToString(maxBits); defines["NUM_TILES_IN_BATCH"] = context.intToString(numTilesInBatch); defines["GROUP_SIZE"] = context.intToString(findInteractingBlocksThreadBlockSize); + int binShift = 1; + while (1<x, totalSize); + atomicMax(&blockSizeRange->y, totalSize); } if (blockIdx.x == 0 && threadIdx.x == 0) rebuildNeighborList[0] = 0; } +extern "C" __global__ void computeSortKeys(const real4* __restrict__ blockBoundingBox, unsigned int* __restrict__ sortedBlocks, const real2* __restrict__ blockSizeRange/*, int numSizes*/) { + // Sort keys store the bin in the high order part and the block in the low + // order part. + + real2 sizeRange = make_real2(LOG(blockSizeRange->x), LOG(blockSizeRange->y)); + int numSizeBins = 20; + real scale = numSizeBins/(sizeRange.y-sizeRange.x); + unsigned int i = threadIdx.x+blockIdx.x*blockDim.x; + if (i < NUM_BLOCKS) { + real4 box = blockBoundingBox[i]; + real size = LOG(box.x+box.y+box.z); + int bin = (size-sizeRange.x)*scale; + bin = max(0, min(bin, numSizeBins-1)); + sortedBlocks[i] = (((unsigned int) bin)<x = 1e38; + blockSizeRange->y = 0; + } + // Also check whether any atom has moved enough so that we really need to rebuild the neighbor list. bool rebuild = forceRebuild; @@ -281,7 +307,7 @@ void mfma4x4(const float4& pos1, const float4& pos2, const vfloat& c, unsigned i extern "C" __global__ __launch_bounds__(GROUP_SIZE) void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int* __restrict__ interactionCount, int* __restrict__ interactingTiles, unsigned int* __restrict__ interactingAtoms, int2* __restrict__ singlePairs, const real4* __restrict__ posq, unsigned int maxTiles, unsigned int maxSinglePairs, - unsigned int startBlockIndex, unsigned int numBlocks, const real2* __restrict__ sortedBlocks, const real4* __restrict__ sortedBlockCenter, + unsigned int startBlockIndex, unsigned int numBlocks, const unsigned int* __restrict__ sortedBlocks, const real4* __restrict__ sortedBlockCenter, const BoundingBox* __restrict__ sortedBlockBoundingBox, #ifdef USE_LARGE_BLOCKS const real4* __restrict__ largeBlockCenter, const BoundingBox* __restrict__ largeBlockBoundingBox, @@ -317,8 +343,7 @@ extern "C" __global__ __launch_bounds__(GROUP_SIZE) void findBlocksWithInteracti if (block1 < startBlockIndex+numBlocks) { // Load data for this block. Note that all threads in a warp are processing the same block. - real2 sortedKey = sortedBlocks[block1]; - int x = (int) sortedKey.y; + int x = sortedBlocks[block1] & BLOCK_INDEX_MASK; real4 blockCenterX = sortedBlockCenter[block1]; real3 blockSizeX = sortedBlockBoundingBox[block1].toReal3(); int neighborsInBuffer = 0; @@ -381,6 +406,13 @@ extern "C" __global__ __launch_bounds__(GROUP_SIZE) void findBlocksWithInteracti blockDelta.y = max(0.0f, fabs(blockDelta.y)-blockSizeX.y-largeSize.y); blockDelta.z = max(0.0f, fabs(blockDelta.z)-blockSizeX.z-largeSize.z); includeLargeBlock = (blockDelta.x*blockDelta.x+blockDelta.y*blockDelta.y+blockDelta.z*blockDelta.z < PADDED_CUTOFF_SQUARED); +#ifdef TRICLINIC + // The calculation to find the nearest periodic copy is only guaranteed to work if the nearest copy is less than half a box width away. + // If there's any possibility we might have missed it, do a detailed check. + + if (periodicBoxSize.z/2-blockSizeX.z-largeSize.z < PADDED_CUTOFF || periodicBoxSize.y/2-blockSizeX.y-largeSize.y < PADDED_CUTOFF) + includeLargeBlock = true; +#endif } largeBlockFlags = __ballot(includeLargeBlock); loadedLargeBlocks = warpSize; @@ -440,7 +472,7 @@ extern "C" __global__ __launch_bounds__(GROUP_SIZE) void findBlocksWithInteracti const int b = block2Buffer[min(block2Index + tileInWarp, block2Count - 1)]; const bool forceInclude = b & 1; const int block2 = b >> 1; - int y = (int) sortedBlocks[block2].y; + int y = sortedBlocks[block2] & BLOCK_INDEX_MASK; #pragma unroll 1 for (int k = indexInTile; k < numExclusions; k += TILE_SIZE) From a2032fc615c00bd36b95a15eae0ae598efaae55c Mon Sep 17 00:00:00 2001 From: Anton Gorenko Date: Sun, 17 Dec 2023 15:39:06 +0600 Subject: [PATCH 10/14] Port "Optimize sorting of non-uniformly distributed data" https://github.com/openmm/openmm/commit/71d9bb1 --- platforms/hip/src/HipSort.cpp | 7 +-- platforms/hip/src/kernels/sort.hip | 84 +++++++++++++++++++++++++++++- 2 files changed, 87 insertions(+), 4 deletions(-) diff --git a/platforms/hip/src/HipSort.cpp b/platforms/hip/src/HipSort.cpp index d7bdbe1..5775fd4 100644 --- a/platforms/hip/src/HipSort.cpp +++ b/platforms/hip/src/HipSort.cpp @@ -6,7 +6,7 @@ * Biological Structures at Stanford, funded under the NIH Roadmap for * * Medical Research, grant U54 GM072970. See https://simtk.org. * * * - * Portions copyright (c) 2010-2018 Stanford University and the Authors. * + * Portions copyright (c) 2010-2021 Stanford University and the Authors. * * Portions copyright (C) 2020-2023 Advanced Micro Devices, Inc. All Rights * * Reserved. * * Authors: Peter Eastman, Nicholas Curtis * @@ -45,11 +45,12 @@ HipSort::HipSort(HipContext& context, SortTrait* trait, unsigned int length, boo replacements["MIN_KEY"] = trait->getMinKey(); replacements["MAX_KEY"] = trait->getMaxKey(); replacements["MAX_VALUE"] = trait->getMaxValue(); + replacements["UNIFORM"] = (uniform ? "1" : "0"); hipModule_t module = context.createModule(context.replaceStrings(HipKernelSources::sort, replacements)); shortListKernel = context.getKernel(module, "sortShortList"); shortList2Kernel = context.getKernel(module, "sortShortList2"); computeRangeKernel = context.getKernel(module, "computeRange"); - assignElementsKernel = context.getKernel(module, "assignElementsToBuckets"); + assignElementsKernel = context.getKernel(module, uniform ? "assignElementsToBuckets" : "assignElementsToBuckets2"); computeBucketPositionsKernel = context.getKernel(module, "computeBucketPositions"); copyToBucketsKernel = context.getKernel(module, "copyDataToBuckets"); sortBucketsKernel = context.getKernel(module, "sortBuckets"); @@ -59,7 +60,7 @@ HipSort::HipSort(HipContext& context, SortTrait* trait, unsigned int length, boo int maxSharedMem; hipDeviceGetAttribute(&maxSharedMem, hipDeviceAttributeMaxSharedMemoryPerBlock, context.getDevice()); int maxLocalBuffer = (maxSharedMem/trait->getDataSize())/2; - int maxShortList = min(3000, max(maxLocalBuffer, HipContext::ThreadBlockSize*context.getNumThreadBlocks())); + int maxShortList = min(1024, max(maxLocalBuffer, HipContext::ThreadBlockSize*context.getNumThreadBlocks())); isShortList = (length <= maxShortList); sortKernelSize = 256; rangeKernelSize = 256; diff --git a/platforms/hip/src/kernels/sort.hip b/platforms/hip/src/kernels/sort.hip index 288c629..3efe82d 100644 --- a/platforms/hip/src/kernels/sort.hip +++ b/platforms/hip/src/kernels/sort.hip @@ -95,6 +95,7 @@ inline __device__ void reduceMinMax(KEY_TYPE minimum, KEY_TYPE maximum, KEY_TYPE */ __global__ void computeRange(const DATA_TYPE* __restrict__ data, unsigned int length, volatile KEY_TYPE* __restrict__ range, unsigned int numBuckets, unsigned int* __restrict__ bucketOffset, unsigned int* __restrict__ counters) { +#if UNIFORM extern __shared__ KEY_TYPE minBuffer[]; KEY_TYPE* maxBuffer = minBuffer+blockDim.x; KEY_TYPE minimum = MAX_KEY; @@ -131,6 +132,7 @@ __global__ void computeRange(const DATA_TYPE* __restrict__ data, unsigned int le } reduceMinMax(minimum, maximum, minBuffer, maxBuffer, &range[0], &range[1]); } +#endif // Clear the bucket counters in preparation for the next kernel. @@ -139,7 +141,7 @@ __global__ void computeRange(const DATA_TYPE* __restrict__ data, unsigned int le } /** - * Assign elements to buckets. + * Assign elements to buckets. This version is optimized for uniformly distributed data. */ __global__ void assignElementsToBuckets(const DATA_TYPE* __restrict__ data, unsigned int length, unsigned int numBuckets, const KEY_TYPE* __restrict__ range, unsigned int* __restrict__ bucketOffset, unsigned int* __restrict__ bucketOfElement, unsigned int* __restrict__ offsetInBucket) { @@ -154,6 +156,86 @@ __global__ void assignElementsToBuckets(const DATA_TYPE* __restrict__ data, unsi } } + +/** + * Assign elements to buckets. This version is optimized for non-uniformly distributed data. + */ +__global__ void assignElementsToBuckets2(const DATA_TYPE* __restrict__ data, unsigned int length, unsigned int numBuckets, const KEY_TYPE* __restrict__ range, + unsigned int* __restrict__ bucketOffset, unsigned int* __restrict__ bucketOfElement, unsigned int* __restrict__ offsetInBucket) { + // Load 64 datapoints and sort them to get an estimate of the data distribution. + + __shared__ KEY_TYPE elements[64]; + if (threadIdx.x < 64) { + int index = (int) (threadIdx.x*length/64.0); + elements[threadIdx.x] = getValue(data[index]); + } + __syncthreads(); + for (unsigned int k = 2; k <= 64; k *= 2) { + for (unsigned int j = k/2; j > 0; j /= 2) { + if (threadIdx.x < 64) { + int ixj = threadIdx.x^j; + if (ixj > threadIdx.x) { + KEY_TYPE value1 = elements[threadIdx.x]; + KEY_TYPE value2 = elements[ixj]; + bool ascending = (threadIdx.x&k) == 0; + KEY_TYPE lowKey = (ascending ? value1 : value2); + KEY_TYPE highKey = (ascending ? value2 : value1); + if (lowKey > highKey) { + elements[threadIdx.x] = value2; + elements[ixj] = value1; + } + } + } + __syncthreads(); + } + } + + // Create a function composed of linear segments mapping data values to bucket indices. + + __shared__ float segmentLowerBound[9]; + __shared__ float segmentBaseIndex[9]; + __shared__ float segmentIndexScale[9]; + if (threadIdx.x == 0) { + segmentLowerBound[0] = elements[0]-0.2f*(elements[5]-elements[0]); + segmentLowerBound[1] = elements[5]; + segmentLowerBound[2] = elements[10]; + segmentLowerBound[3] = elements[20]; + segmentLowerBound[4] = elements[30]; + segmentLowerBound[5] = elements[40]; + segmentLowerBound[6] = elements[50]; + segmentLowerBound[7] = elements[60]; + segmentLowerBound[8] = elements[63]+0.2f*(elements[63]-elements[58]); + segmentBaseIndex[0] = numBuckets/16; + segmentBaseIndex[1] = 3*numBuckets/16; + segmentBaseIndex[2] = 5*numBuckets/16; + segmentBaseIndex[3] = 7*numBuckets/16; + segmentBaseIndex[4] = 9*numBuckets/16; + segmentBaseIndex[5] = 11*numBuckets/16; + segmentBaseIndex[6] = 13*numBuckets/16; + segmentBaseIndex[7] = 15*numBuckets/16; + segmentBaseIndex[8] = numBuckets; + for (int i = 0; i < 8; i++) + if (segmentLowerBound[i+1] == segmentLowerBound[i]) + segmentIndexScale[i] = 0; + else + segmentIndexScale[i] = (segmentBaseIndex[i+1]-segmentBaseIndex[i])/(segmentLowerBound[i+1]-segmentLowerBound[i]); + } + __syncthreads(); + + // Assign elements to buckets. + + for (unsigned int index = blockDim.x*blockIdx.x+threadIdx.x; index < length; index += blockDim.x*gridDim.x) { + float key = (float) getValue(data[index]); + int segment; + for (segment = 0; segment < 7 && key > segmentLowerBound[segment+1]; segment++) + ; + unsigned int bucketIndex = segmentBaseIndex[segment]+(key-segmentLowerBound[segment])*segmentIndexScale[segment]; + bucketIndex = min(max(0, bucketIndex), numBuckets-1); + offsetInBucket[index] = atomicAdd(&bucketOffset[bucketIndex], 1); + bucketOfElement[index] = bucketIndex; + } +} + /** * Sum the bucket sizes to compute the start position of each bucket. This kernel * is executed as a single work group. From 41647377d451764b1a9fcd89d5c44d77058d9c03 Mon Sep 17 00:00:00 2001 From: Anton Gorenko Date: Sun, 21 Jan 2024 13:41:15 +0600 Subject: [PATCH 11/14] Support ROCm 6.0: do not use deprecated gcnArch --- platforms/hip/src/HipContext.cpp | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/platforms/hip/src/HipContext.cpp b/platforms/hip/src/HipContext.cpp index 1a1a507..5777c34 100644 --- a/platforms/hip/src/HipContext.cpp +++ b/platforms/hip/src/HipContext.cpp @@ -904,24 +904,20 @@ vector HipContext::getDevicePrecedence() { int numDevices; hipDeviceProp_t thisDevice; string errorMessage = "Error initializing Context"; - vector, int> > devices; + vector > devices; CHECK_RESULT(hipGetDeviceCount(&numDevices)); for (int i = 0; i < numDevices; i++) { CHECK_RESULT(hipGetDeviceProperties(&thisDevice, i)); int clock, multiprocessors, speed; // AMD GPU - // gcn arch is available if needed, however... - int major = thisDevice.gcnArch; clock = thisDevice.clockRate; multiprocessors = thisDevice.multiProcessorCount; speed = clock*multiprocessors; - pair deviceProperties = std::make_pair(major, speed); - devices.push_back(std::make_pair(deviceProperties, -i)); + devices.push_back(std::make_pair(speed, -i)); } - // sort first by compute capability (higher is better), then speed - // (higher is better), and finally device index (lower is better) + // sort first by speed (higher is better), and finally device index (lower is better) std::sort(devices.begin(), devices.end()); std::reverse(devices.begin(), devices.end()); From 25afefb8d7fe5fa6327fd467698fba4678fcb253 Mon Sep 17 00:00:00 2001 From: Anton Gorenko Date: Sun, 21 Jan 2024 21:05:35 +0600 Subject: [PATCH 12/14] Support ROCm 6.0: improve module compilation and loading * hipModuleLoad sometimes fails to load modules for unknown reasons, use manual loading from the output file and hipModuleLoadDataEx; * use amdclang++ directly instead of hipcc; * use --offload-device-only instead of --genco; --- CMakeLists.txt | 6 -- README.md | 14 ++-- platforms/hip/src/HipContext.cpp | 109 +++++++++++++++--------------- platforms/hip/src/HipPlatform.cpp | 8 +-- 4 files changed, 66 insertions(+), 71 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index bbb913e..77ae2b2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -89,12 +89,6 @@ LIST(APPEND CMAKE_PREFIX_PATH $ENV{ROCM_PATH} /opt/rocm) FIND_PACKAGE(HIP CONFIG REQUIRED) FIND_PACKAGE(HIPFFT CONFIG REQUIRED) -IF(${HIP_COMPILER} STREQUAL "clang") - MESSAGE(STATUS "Using HIP-Clang compiler") -ELSE() - MESSAGE(FATAL_ERROR "HIP compiler ${HIP_COMPILER} not recognized!") -ENDIF() - ADD_SUBDIRECTORY(platforms/hip) # Amoeba plugin diff --git a/README.md b/README.md index 3bf46df..5286160 100644 --- a/README.md +++ b/README.md @@ -138,15 +138,15 @@ please try different backends: * the hipFFT/rocFFT-based implementation (`export OPENMM_FFT_BACKEND=1`); * the VkFFT-based implementation (`export OPENMM_FFT_BACKEND=2`); -### The kernel compilation: hipcc and hipRTC +### The kernel compilation: amdclang++ and hipRTC -By default, the HIP Platform builds kernels with the hipcc compiler. To run the compiler, paths -in the following order are used: +By default, the HIP Platform builds kernels with the amdclang++ compiler. To run the compiler, +paths in the following order are used: * `properties['HipCompiler']`, if it is passed to Context constructor; * `OPENMM_HIP_COMPILER` environment variable, if it is set; -* `${ROCM_PATH}/bin/hipcc`, if `ROCM_PATH` environment variable is set; -* `/opt/rocm/bin/hipcc` otherwise. +* `${ROCM_PATH}/bin/amdclang++`, if `ROCM_PATH` environment variable is set; +* `/opt/rocm/bin/amdclang++` otherwise. There is an alternative way to compile kernels: hipRTC, it is implemented by `plugins/hipcompiler`. To enable this way: @@ -154,6 +154,10 @@ There is an alternative way to compile kernels: hipRTC, it is implemented by * set `properties['HipAllowRuntimeCompiler'] = 'true'`; * set `OPENMM_USE_HIPRTC` environment variable to 1 (`export OPENMM_USE_HIPRTC=1`). +**Warning:** hipRTC from ROCm 6.0.0 has issues with ambiguous operators for vector and complex +types. It seems that they have been fixed in ROCm/clr's `develop` branch and likely OpenMM+hipRTC +will be usable with the next ROCm release. + ## License The HIP Platform uses OpenMM API under the terms of the MIT License. A copy of this license may diff --git a/platforms/hip/src/HipContext.cpp b/platforms/hip/src/HipContext.cpp index 5777c34..c31bf95 100644 --- a/platforms/hip/src/HipContext.cpp +++ b/platforms/hip/src/HipContext.cpp @@ -489,7 +489,7 @@ string HipContext::getHash(const string& src) const { string HipContext::getCacheFileName(const string& src) const { stringstream cacheFile; - cacheFile << cacheDir << getHash(src) << '_' << gpuArchitecture; + cacheFile << cacheDir << "openmm-hip-" << getHash(src + gpuArchitecture); return cacheFile.str(); } @@ -500,7 +500,7 @@ hipModule_t HipContext::createModule(const string source) { hipModule_t HipContext::createModule(const string source, const map& defines) { const char* saveTempsEnv = getenv("OPENMM_SAVE_TEMPS"); bool saveTemps = saveTempsEnv != nullptr; - string options = "-ffast-math -munsafe-fp-atomics -Wall"; + string options = "-O3 -ffast-math -munsafe-fp-atomics -Wall"; // HIP-TODO: Remove it when the compiler does a better job // Disable SLP vectorization as it may generate unoptimal packed math instructions on >=MI200 // (gfx90a): more v_mov, higher register usage etc. @@ -594,35 +594,29 @@ hipModule_t HipContext::createModule(const string source, const map ptx = compilerKernel.getAs().createModule(src.str(), options, *this); + vector code = compilerKernel.getAs().createModule(src.str(), options, *this); // If possible, write the PTX out to a temporary file so we can cache it for later use. - bool wroteCache = false; try { - ofstream out(outputFile.c_str(), ios::out | ios::binary); - out.write(&ptx[0], ptx.size()); + ofstream out(cacheFile.c_str(), ios::out | ios::binary); + out.write(&code[0], code.size()); out.close(); - if (!out.fail()) - wroteCache = true; } catch (...) { + // An error occurred. Possibly we don't have permission to write to the temp directory. // Ignore. } - if (!wroteCache) { - // An error occurred. Possibly we don't have permission to write to the temp directory. Just try to load the module directly. - - CHECK_RESULT2(hipModuleLoadDataEx(&module, &ptx[0], 0, NULL, NULL), "Error loading HIP module"); - loadedModules.push_back(module); - return module; - } + CHECK_RESULT2(hipModuleLoadDataEx(&module, &code[0], 0, NULL, NULL), "Error loading HIP module"); + loadedModules.push_back(module); + return module; } else { // Write out the source to a temporary file. @@ -630,48 +624,55 @@ hipModule_t HipContext::createModule(const string source, const map \""+logFile+"\""; + string command = compiler + " -x hip --offload-device-only --offload-arch=" + gpuArchitecture + " " + options + (saveTemps ? " -save-temps=obj" : "") +" -o \""+outputFile+"\" " + " \""+inputFile+"\" 2> \""+logFile+"\""; res = std::system(command.c_str()); - } - try { - if (res != 0) { - // Load the error log. - - stringstream error; - error << "Error launching HIP compiler: " << res; - ifstream log(logFile.c_str()); - if (log.is_open()) { - string line; - while (!log.eof()) { - getline(log, line); - error << '\n' << line; + try { + if (res != 0) { + // Load the error log. + + stringstream error; + error << "Error launching HIP compiler: " << res; + ifstream log(logFile.c_str()); + if (log.is_open()) { + string line; + while (!log.eof()) { + getline(log, line); + error << '\n' << line; + } + log.close(); } - log.close(); + throw OpenMMException(error.str()); } - throw OpenMMException(error.str()); - } - hipError_t result = hipModuleLoad(&module, outputFile.c_str()); - if (result != hipSuccess) { - std::stringstream m; - m<<"Error loading HIP module: "< code; + ifstream out(outputFile.c_str(), ios::in | ios::binary); + if (!out.is_open()) { + std::stringstream error; + error << "Error reading HIP module from `" << outputFile << "`"; + throw OpenMMException(error.str()); + } + code.insert(code.begin(), istreambuf_iterator(out), istreambuf_iterator()); + out.close(); + + if (!saveTemps) { + remove(inputFile.c_str()); + remove(logFile.c_str()); + } + if (rename(outputFile.c_str(), cacheFile.c_str()) != 0 && !saveTemps) + remove(outputFile.c_str()); + + CHECK_RESULT2(hipModuleLoadDataEx(&module, &code[0], 0, NULL, NULL), "Error loading HIP module"); + loadedModules.push_back(module); + return module; } - if (rename(outputFile.c_str(), cacheFile.c_str()) != 0 && !saveTemps) - remove(outputFile.c_str()); - loadedModules.push_back(module); - return module; - } - catch (...) { - if (!saveTemps) { - remove(inputFile.c_str()); - remove(outputFile.c_str()); - remove(logFile.c_str()); + catch (...) { + if (!saveTemps) { + remove(inputFile.c_str()); + remove(outputFile.c_str()); + remove(logFile.c_str()); + } + throw; } - throw; } } diff --git a/platforms/hip/src/HipPlatform.cpp b/platforms/hip/src/HipPlatform.cpp index 21cba5f..d3237df 100644 --- a/platforms/hip/src/HipPlatform.cpp +++ b/platforms/hip/src/HipPlatform.cpp @@ -135,16 +135,12 @@ HipPlatform::HipPlatform() { hipcc = compiler; } else if (rocmPath != NULL) { - hipcc = string(rocmPath) + "/bin/hipcc"; + hipcc = string(rocmPath) + "/bin/amdclang++"; } else { - hipcc = "/opt/rocm/bin/hipcc"; + hipcc = "/opt/rocm/bin/amdclang++"; } setPropertyDefaultValue(HipCompiler(), hipcc); - // Do not use hipRTC by default (it doesn't allow to use a workaround for DPP, see comments in - // HipContext::createModule and intrinsics.hip so performance may be a bit lower). - // HIP-TODO: Enable when the compiler issue is fixed. - // hipRTC on ROCm before 4.3 and earlier has issues so it's not supported char* useHipRtcEnv = getenv("OPENMM_USE_HIPRTC"); bool allowRuntimeCompiler = (useHipRtcEnv != NULL && string(useHipRtcEnv) == "1"); setPropertyDefaultValue(HipAllowRuntimeCompiler(), allowRuntimeCompiler ? "true" : "false"); From 6b1c5a9709cab7018c44b8a3fc65f4a7d12ed7a2 Mon Sep 17 00:00:00 2001 From: Anton Gorenko Date: Sun, 21 Jan 2024 21:50:05 +0600 Subject: [PATCH 13/14] Update README for 8.1.1 --- README.md | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 5286160..b0aac38 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,9 @@ This plugin requires hipFFT and rocFFT, install them from ROCm repositories: apt install hipfft rocfft ``` +If you see "libhipfft.so.0: cannot open shared object file: No such file or directory", run +`ldconfig`. + ```sh conda create -n openmm-env -c streamhpc -c conda-forge --strict-channel-priority openmm-hip conda activate openmm-env @@ -69,7 +72,7 @@ The plugin requires source code of OpenMM, it can be downloaded as an archive [here](https://github.com/openmm/openmm/releases) or as a Git repository: ```sh -git clone https://github.com/openmm/openmm.git -b 8.0.0 +git clone https://github.com/openmm/openmm.git -b 8.1.1 ``` To build the plugin, follow these steps: @@ -100,7 +103,7 @@ source code: ```sh mkdir build build-hip install -git clone https://github.com/openmm/openmm.git -b 8.0.0 +git clone https://github.com/openmm/openmm.git -b 8.1.1 cd build cmake ../openmm/ -D CMAKE_INSTALL_PREFIX=../install -D OPENMM_BUILD_COMMON=ON -D OPENMM_PYTHON_USER_INSTALL=ON make From 388f4115c69d4223d58e6c345b95755ca171da59 Mon Sep 17 00:00:00 2001 From: Anton Gorenko Date: Sat, 20 Apr 2024 15:42:00 +0500 Subject: [PATCH 14/14] Make hipFFT dependency optional --- CMakeLists.txt | 2 +- README.md | 19 ++++++++++--------- platforms/hip/CMakeLists.txt | 12 ++++++++++-- platforms/hip/src/HipContext.cpp | 8 ++++++++ platforms/hip/src/HipFFTImplHipFFT.cpp | 4 ++++ platforms/hip/tests/TestHipFFTImplHipFFT.cpp | 10 ++++++++++ 6 files changed, 43 insertions(+), 12 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 77ae2b2..dd8755e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -87,7 +87,7 @@ ENDFOREACH(subdir) LIST(APPEND CMAKE_PREFIX_PATH $ENV{ROCM_PATH} /opt/rocm) FIND_PACKAGE(HIP CONFIG REQUIRED) -FIND_PACKAGE(HIPFFT CONFIG REQUIRED) +FIND_PACKAGE(HIPFFT CONFIG) ADD_SUBDIRECTORY(platforms/hip) diff --git a/README.md b/README.md index b0aac38..f148c6c 100644 --- a/README.md +++ b/README.md @@ -5,15 +5,6 @@ AMD GPUs on [AMD ROCmâ„¢ open software platform](https://rocmdocs.amd.com). ## Installing with Conda -This plugin requires hipFFT and rocFFT, install them from ROCm repositories: - -```sh -apt install hipfft rocfft -``` - -If you see "libhipfft.so.0: cannot open shared object file: No such file or directory", run -`ldconfig`. - ```sh conda create -n openmm-env -c streamhpc -c conda-forge --strict-channel-priority openmm-hip conda activate openmm-env @@ -141,6 +132,16 @@ please try different backends: * the hipFFT/rocFFT-based implementation (`export OPENMM_FFT_BACKEND=1`); * the VkFFT-based implementation (`export OPENMM_FFT_BACKEND=2`); +The hipFFT/rocFFT-based implementation requires hipFFT and rocFFT libraries, otherwise it will be +disabled, install them from ROCm repositories before running cmake: + +```sh +apt install hipfft rocfft +``` + +If you see "libhipfft.so.0: cannot open shared object file: No such file or directory", run +`ldconfig`. + ### The kernel compilation: amdclang++ and hipRTC By default, the HIP Platform builds kernels with the amdclang++ compiler. To run the compiler, diff --git a/platforms/hip/CMakeLists.txt b/platforms/hip/CMakeLists.txt index 866336a..8380625 100644 --- a/platforms/hip/CMakeLists.txt +++ b/platforms/hip/CMakeLists.txt @@ -110,13 +110,17 @@ IF (OPENMM_BUILD_SHARED_LIB) ADD_LIBRARY(${SHARED_TARGET} SHARED ${SOURCE_FILES} ${SOURCE_INCLUDE_FILES} ${API_ABS_INCLUDE_FILES}) ADD_DEPENDENCIES(${SHARED_TARGET} CommonKernels HipKernels) - TARGET_LINK_LIBRARIES(${SHARED_TARGET} PUBLIC ${OPENMM_LIBRARY_NAME} ${PTHREADS_LIB} hip::host hip::hipfft) + TARGET_LINK_LIBRARIES(${SHARED_TARGET} PUBLIC ${OPENMM_LIBRARY_NAME} ${PTHREADS_LIB} hip::host) SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -DOPENMM_COMMON_BUILDING_SHARED_LIBRARY") IF (APPLE) SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_COMPILE_FLAGS} -F/Library/Frameworks -framework HIP") ELSE (APPLE) SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_LINK_FLAGS}") ENDIF (APPLE) + IF(HIPFFT_FOUND) + TARGET_LINK_LIBRARIES(${SHARED_TARGET} PUBLIC hip::hipfft) + TARGET_COMPILE_OPTIONS(${SHARED_TARGET} PUBLIC "-DOPENMM_HIP_WITH_HIPFFT") + ENDIF(HIPFFT_FOUND) INSTALL_TARGETS(/lib/plugins RUNTIME_DIRECTORY /lib/plugins ${SHARED_TARGET}) ENDIF (OPENMM_BUILD_SHARED_LIB) @@ -127,13 +131,17 @@ IF(OPENMM_BUILD_STATIC_LIB) ADD_LIBRARY(${STATIC_TARGET} STATIC ${SOURCE_FILES} ${SOURCE_INCLUDE_FILES} ${API_ABS_INCLUDE_FILES}) ADD_DEPENDENCIES(${STATIC_TARGET} CommonKernels HipKernels) - TARGET_LINK_LIBRARIES(${STATIC_TARGET} ${OPENMM_LIBRARY_NAME} ${PTHREADS_LIB_STATIC} hip::host hip::hipfft) + TARGET_LINK_LIBRARIES(${STATIC_TARGET} ${OPENMM_LIBRARY_NAME} ${PTHREADS_LIB_STATIC} hip::host) SET_TARGET_PROPERTIES(${STATIC_TARGET} PROPERTIES COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -DOPENMM_COMMON_BUILDING_STATIC_LIBRARY") IF (APPLE) SET_TARGET_PROPERTIES(${STATIC_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_COMPILE_FLAGS} -F/Library/Frameworks -framework HIP") ELSE (APPLE) SET_TARGET_PROPERTIES(${STATIC_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_LINK_FLAGS}") ENDIF (APPLE) + IF(HIPFFT_FOUND) + TARGET_LINK_LIBRARIES(${STATIC_TARGET} PUBLIC hip::hipfft) + TARGET_COMPILE_OPTIONS(${STATIC_TARGET} PUBLIC "-DOPENMM_HIP_WITH_HIPFFT") + ENDIF(HIPFFT_FOUND) INSTALL_TARGETS(/lib/plugins RUNTIME_DIRECTORY /lib/plugins ${STATIC_TARGET}) ENDIF(OPENMM_BUILD_STATIC_LIB) diff --git a/platforms/hip/src/HipContext.cpp b/platforms/hip/src/HipContext.cpp index c31bf95..3eca0e3 100644 --- a/platforms/hip/src/HipContext.cpp +++ b/platforms/hip/src/HipContext.cpp @@ -40,7 +40,9 @@ #include "HipNonbondedUtilities.h" #include "HipProgram.h" #include "HipFFTImplFFT3D.h" +#ifdef OPENMM_HIP_WITH_HIPFFT #include "HipFFTImplHipFFT.h" +#endif #include "HipFFTImplVkFFT.h" #include "openmm/common/ComputeArray.h" #include "openmm/common/ContextSelector.h" @@ -709,7 +711,11 @@ ComputeEvent HipContext::createEvent() { HipFFTBase* HipContext::createFFT(int xsize, int ysize, int zsize, bool realToComplex, hipStream_t stream, HipArray& in, HipArray& out) { if (fftBackend == 1) { +#ifdef OPENMM_HIP_WITH_HIPFFT return new HipFFTImplHipFFT(*this, xsize, ysize, zsize, realToComplex, stream, in, out); +#else + throw OpenMMException("OpenMM HIP is not built with hipFFT support"); +#endif } else if (fftBackend == 2) { return new HipFFTImplVkFFT(*this, xsize, ysize, zsize, realToComplex, stream, in, out); @@ -721,7 +727,9 @@ HipFFTBase* HipContext::createFFT(int xsize, int ysize, int zsize, bool realToCo int HipContext::findLegalFFTDimension(int minimum) { if (fftBackend == 1) { +#ifdef OPENMM_HIP_WITH_HIPFFT return HipFFTImplHipFFT::findLegalDimension(minimum); +#endif } else if (fftBackend == 2) { return HipFFTImplVkFFT::findLegalDimension(minimum); diff --git a/platforms/hip/src/HipFFTImplHipFFT.cpp b/platforms/hip/src/HipFFTImplHipFFT.cpp index 6adebdf..ecf6f16 100644 --- a/platforms/hip/src/HipFFTImplHipFFT.cpp +++ b/platforms/hip/src/HipFFTImplHipFFT.cpp @@ -26,6 +26,8 @@ * along with this program. If not, see . * * -------------------------------------------------------------------------- */ +#ifdef OPENMM_HIP_WITH_HIPFFT + #include "HipFFTImplHipFFT.h" #include "HipContext.h" @@ -121,3 +123,5 @@ int HipFFTImplHipFFT::findLegalDimension(int minimum) { minimum++; } } + +#endif // OPENMM_HIP_WITH_HIPFFT diff --git a/platforms/hip/tests/TestHipFFTImplHipFFT.cpp b/platforms/hip/tests/TestHipFFTImplHipFFT.cpp index 9753761..f404a3f 100644 --- a/platforms/hip/tests/TestHipFFTImplHipFFT.cpp +++ b/platforms/hip/tests/TestHipFFTImplHipFFT.cpp @@ -35,6 +35,8 @@ * This tests the hipFFT-based implementation of FFT. */ +#ifdef OPENMM_HIP_WITH_HIPFFT + #include "openmm/internal/AssertionUtilities.h" #include "HipArray.h" #include "HipContext.h" @@ -169,3 +171,11 @@ int main(int argc, char* argv[]) { cout << "Done" << endl; return 0; } + +#else // OPENMM_HIP_WITH_HIPFFT + +int main(int argc, char* argv[]) { + return 0; +} + +#endif // OPENMM_HIP_WITH_HIPFFT