Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
179 changes: 171 additions & 8 deletions devices/rtx/device/frame/Frame.cu
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,102 @@ __device__ bool resolveSample(uint32_t idx,
return divisor > 0;
}

// One-sided (upper) trimmed mean -- the TRIM mode. A robust per-pixel estimator:
// a trimmed mean (Tukey 1962; Huber 1981) whose outlier set is chosen by a
// Grubbs / generalized-ESD test (Grubbs 1969; Rosner 1983), accumulated online
// with Welford (Welford 1962); an a-posteriori per-pixel sample-outlier rejector
// in the DeCoro et al. 2010 lineage. See the commit message for the full mapping
// and the two deliberate deviations from textbook ESD.
//
// `sum` is the running total of all `n` samples (the colorAccumulation value,
// undivided); `topK` holds the `trim` brightest samples the pixel saw (rgb in
// xyz, luminance in w, w < 0 = empty); `lum` carries the pixel's luminance
// Welford (mean in mean.x, M2 in m2.x).
//
// A sample is dropped when its luminance exceeds the threshold mean + k*stddev,
// with the spread (stddev) estimated over the BASE samples -- the bulk with the
// tracked brightest removed. This is the ESD masking fix: one spike otherwise
// inflates its own sigma enough to exempt itself, so a moderate k never fires.
// Leaving the candidates out of the scale keeps the threshold tied to the
// well-behaved bulk so a genuine spike stands out even at large k.
//
// Two refinements keep this from darkening the image at low spp -- the one real
// drawback of the plain version, where with few samples the tracked brightest
// are a large fraction, the base mean collapses below the true level, and even
// legitimate bright samples get dropped:
// * the threshold is centred on the FULL mean, not the base mean, so it cannot
// fall below the true level when the base excludes the bright fraction;
// * the number of samples actually dropped is capped at ~n/4, so at low spp at
// most the single most extreme spike is removed (it ramps to the full trim
// as samples accumulate) -- a large trim fraction can no longer gut the
// estimate. The brightest tracked samples are dropped first.
// Clean pixels have nothing above the threshold and resolve to the exact mean;
// the dropped fraction -> 0 with spp (consistent estimator).
__device__ vec3 resolveTrimmed(
const vec4 *topK, vec3 sum, const PixelLumStats &lum, int trim, float kSigma)
{
constexpr int MAX_TRIM = 8;
if (trim > MAX_TRIM)
trim = MAX_TRIM;

const int n = int(lum.n);
if (n <= 0)
return vec3(0.f);
if (n < 3)
return sum / float(n);

// Full-distribution luminance moments, from the Welford accumulators.
const float meanFull = lum.mean.x;
const float sumL = meanFull * lum.n;
const float sumL2 = lum.m2.x + lum.n * meanFull * meanFull;

// Gather the tracked brightest, sorted by luminance descending (<= 8 elems),
// and accumulate their moments to subtract from the base spread estimate.
float topW[MAX_TRIM];
vec3 topRGB[MAX_TRIM];
float sumTop = 0.0f, sumTop2 = 0.0f;
int v = 0;
for (int i = 0; i < trim; ++i) {
if (topK[i].w < 0.0f)
continue;
sumTop += topK[i].w;
sumTop2 += topK[i].w * topK[i].w;
float w = topK[i].w;
vec3 rgb = vec3(topK[i]);
int j = v - 1;
for (; j >= 0 && topW[j] < w; --j) {
topW[j + 1] = topW[j];
topRGB[j + 1] = topRGB[j];
}
topW[j + 1] = w;
topRGB[j + 1] = rgb;
++v;
}
const int nB = n - v;
if (nB < 2) // too few base samples to estimate a spread
return sum / float(n);

const float baseSum = sumL - sumTop;
const float meanB = baseSum / float(nB);
const float baseM2 = fmaxf(sumL2 - sumTop2 - meanB * baseSum, 0.0f);
const float sigmaB = sqrtf(baseM2 / float(nB - 1));
const float threshold = meanFull + kSigma * sigmaB;

// Drop at most ~n/4 samples (>=1), brightest first, ramping the trim fraction
// in with the sample count.
const int maxDrop = min(min(trim, n - 1), max(1, n / 4));
vec3 dropSum(0.f);
int dropCount = 0;
for (int i = 0; i < v && dropCount < maxDrop; ++i) {
if (topW[i] <= threshold)
break; // sorted descending: nothing below is above the threshold either
dropSum += topRGB[i];
++dropCount;
}

return (sum - dropSum) / float(n - dropCount);
}

__global__ void prepareDenoiseInputs(const vec4 *__restrict__ accumColor,
const vec3 *__restrict__ accumAlbedo,
const vec3 *__restrict__ accumNormal,
Expand All @@ -82,7 +178,11 @@ __global__ void prepareDenoiseInputs(const vec4 *__restrict__ accumColor,
uvec2 size,
int frameID,
int checkerboardID,
bool fireflyFilter)
FireflyFilterMode fireflyFilterMode,
const vec4 *__restrict__ trimTopK,
const PixelLumStats *__restrict__ lumStats,
int trim,
float sigma)
{
const uint32_t idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= size.x * size.y)
Expand All @@ -101,8 +201,16 @@ __global__ void prepareDenoiseInputs(const vec4 *__restrict__ accumColor,

const float invDivisor = 1.0f / float(divisor);
vec4 c = accumColor[srcIdx] * invDivisor;
if (fireflyFilter)
if (fireflyFilterMode == FireflyFilterMode::TONEMAP) {
c = detail::inverseTonemap(c);
} else if (fireflyFilterMode == FireflyFilterMode::TRIM && trimTopK) {
c = vec4(resolveTrimmed(trimTopK + size_t(srcIdx) * trim,
vec3(accumColor[srcIdx]),
lumStats[srcIdx],
trim,
sigma),
c.a);
}
denoiseInput[idx] = c;

if (denoiseAlbedo)
Expand All @@ -125,7 +233,11 @@ void launchPrepareDenoiseInputs(const vec4 *accumColor,
uvec2 size,
int frameID,
int checkerboardID,
bool fireflyFilter,
FireflyFilterMode fireflyFilterMode,
const vec4 *trimTopK,
const PixelLumStats *lumStats,
int trim,
float sigma,
cudaStream_t stream)
{
const uint32_t nPixels = size.x * size.y;
Expand All @@ -140,7 +252,11 @@ void launchPrepareDenoiseInputs(const vec4 *accumColor,
size,
frameID,
checkerboardID,
fireflyFilter);
fireflyFilterMode,
trimTopK,
lumStats,
trim,
sigma);
}

__global__ void compositeBackground(vec4 *__restrict__ accumColor,
Expand All @@ -152,7 +268,9 @@ __global__ void compositeBackground(vec4 *__restrict__ accumColor,
FrameFormat format,
int frameID,
int checkerboardID,
bool isDenoised)
bool isDenoised,
const vec4 *__restrict__ trimTopK,
const PixelLumStats *__restrict__ lumStats)
{
const uint32_t idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= size.x * size.y)
Expand All @@ -176,8 +294,18 @@ __global__ void compositeBackground(vec4 *__restrict__ accumColor,
rendered.a = accumColor[sourceIdx].a / float(divisor);
} else {
rendered = accumColor[sourceIdx] / float(divisor);
if (renderer.fireflyFilter)
if (renderer.fireflyFilterMode == FireflyFilterMode::TONEMAP) {
rendered = detail::inverseTonemap(rendered);
} else if (renderer.fireflyFilterMode == FireflyFilterMode::TRIM
&& trimTopK) {
rendered = vec4(
resolveTrimmed(trimTopK + size_t(sourceIdx) * renderer.fireflyFilterTrim,
vec3(accumColor[sourceIdx]),
lumStats[sourceIdx],
renderer.fireflyFilterTrim,
renderer.fireflyFilterSigma),
rendered.a);
}
}

const vec2 uv = (vec2(px, py) + 0.5f) * invSize;
Expand Down Expand Up @@ -217,6 +345,8 @@ void launchCompositeBackground(vec4 *accumColor,
int frameID,
int checkerboardID,
bool isDenoised,
const vec4 *trimTopK,
const PixelLumStats *lumStats,
cudaStream_t stream)
{
const uint32_t nPixels = size.x * size.y;
Expand All @@ -231,7 +361,9 @@ void launchCompositeBackground(vec4 *accumColor,
format,
frameID,
checkerboardID,
isDenoised);
isDenoised,
trimTopK,
lumStats);
}

} // anonymous namespace
Expand Down Expand Up @@ -332,6 +464,7 @@ void Frame::finalize()
m_instIDBuffer.resize(channelInstID ? numPixels() : 0);

m_accumColor.reserve(numPixels() * sizeof(vec4));
m_lumStats.reserve(numPixels() * sizeof(PixelLumStats));
if (channelAlbedo)
m_accumAlbedo.reserve(numPixels() * sizeof(vec3));
else
Expand All @@ -358,6 +491,7 @@ void Frame::finalize()
}

hd.fb.buffers.colorAccumulation = m_accumColor.ptrAs<vec4>();
hd.fb.buffers.lumStats = m_lumStats.ptrAs<PixelLumStats>();

hd.fb.buffers.depth = channelDepth ? m_depthBuffer.dataDevice() : nullptr;
hd.fb.buffers.primID = channelPrimID ? m_primIDBuffer.dataDevice() : nullptr;
Expand Down Expand Up @@ -497,6 +631,18 @@ void Frame::renderFrame()
m_camera->populateFrameData(hd.camera, hd.fb.size);
hd.world = m_world->gpuData();

// The TRIM top-k buffer is `trim` times the color buffer, so allocate it only
// while that mode is active. trim is a renderer parameter, hence resolved
// here rather than in finalize(). newFrame() clears it on accumulation reset.
if (hd.renderer.fireflyFilterMode == FireflyFilterMode::TRIM) {
m_trimTopK.reserve(
numPixels() * size_t(hd.renderer.fireflyFilterTrim) * sizeof(vec4));
hd.fb.buffers.trimTopK = m_trimTopK.ptrAs<vec4>();
} else {
m_trimTopK.reset();
hd.fb.buffers.trimTopK = nullptr;
}

hd.registry.samplers = state.registry.samplers.devicePtr();
hd.registry.geometries = state.registry.geometries.devicePtr();
hd.registry.materials = state.registry.materials.devicePtr();
Expand Down Expand Up @@ -545,7 +691,11 @@ void Frame::renderFrame()
hd.fb.size,
hd.fb.frameID,
hd.fb.checkerboardID,
hd.renderer.fireflyFilter,
hd.renderer.fireflyFilterMode,
m_trimTopK.ptrAs<vec4>(),
m_lumStats.ptrAs<PixelLumStats>(),
hd.renderer.fireflyFilterTrim,
hd.renderer.fireflyFilterSigma,
state.stream);

m_denoiser.launch();
Expand All @@ -560,6 +710,8 @@ void Frame::renderFrame()
hd.fb.frameID,
hd.fb.checkerboardID,
/*isDenoised=*/true,
m_trimTopK.ptrAs<vec4>(),
m_lumStats.ptrAs<PixelLumStats>(),
state.stream);

m_denoiser.convertOutput();
Expand All @@ -577,6 +729,8 @@ void Frame::renderFrame()
hd.fb.frameID,
hd.fb.checkerboardID,
/*isDenoised=*/false,
m_trimTopK.ptrAs<vec4>(),
m_lumStats.ptrAs<PixelLumStats>(),
state.stream);
}

Expand Down Expand Up @@ -905,6 +1059,15 @@ void Frame::newFrame()
thrust::fill_n(thrust::device_pointer_cast(m_accumColor.ptrAs<vec4>()),
numPixels(),
vec4(0.0f));
thrust::fill_n(thrust::device_pointer_cast(m_lumStats.ptrAs<PixelLumStats>()),
numPixels(),
PixelLumStats{vec3(0.0f), vec3(0.0f), 0.0f});
if (hd.renderer.fireflyFilterMode == FireflyFilterMode::TRIM
&& m_trimTopK.ptrAs<vec4>()) {
thrust::fill_n(thrust::device_pointer_cast(m_trimTopK.ptrAs<vec4>()),
numPixels() * size_t(hd.renderer.fireflyFilterTrim),
vec4(0.0f, 0.0f, 0.0f, -1.0f)); // w<0 marks an empty top-k slot
}

// Conditionally initialize other buffers
if (channelDepth) {
Expand Down
2 changes: 2 additions & 0 deletions devices/rtx/device/frame/Frame.h
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,8 @@ struct Frame : public helium::BaseFrame, public DeviceObject<FrameGPUData>
DeviceBuffer m_accumColor; // vec4
DeviceBuffer m_accumAlbedo; // vec3
DeviceBuffer m_accumNormal; // vec3
DeviceBuffer m_lumStats; // PixelLumStats: per-channel Welford + count
DeviceBuffer m_trimTopK; // TRIM mode: trim*vec4 brightest samples per pixel

// Per-pixel pre-denoise estimates. Keeping these separate from pixelBuffer
// avoids the denoiser reading its own previous output on non-rendered
Expand Down
34 changes: 33 additions & 1 deletion devices/rtx/device/gpu/gpu_objects.h
Original file line number Diff line number Diff line change
Expand Up @@ -779,6 +779,15 @@ enum class BackgroundMode
IMAGE
};

// Per-sample firefly suppression strategy applied during accumulation.
enum class FireflyFilterMode
{
NONE, // accumulate raw radiance (unbiased)
TONEMAP, // reversible Reinhard round-trip (legacy; dims highlights)
CLAMP, // per-pixel Welford luminance clamp (energy-preserving)
TRIM // adaptive upper-trimmed mean (consistent; near-unbiased at high spp)
};

union RendererBackgroundGPUData
{
glm::vec4 color;
Expand All @@ -797,7 +806,10 @@ struct RendererGPUData
float occlusionDistance;
bool cullTriangleBF;
bool premultiplyBackground;
bool fireflyFilter; // enable internal tonemapping during sample accumulation
FireflyFilterMode fireflyFilterMode; // per-sample outlier suppression strategy
float fireflyFilterSigma; // CLAMP/TRIM: k in threshold = mean + k*stddev
int fireflyFilterWarmup; // CLAMP mode: samples before the Welford cap engages
int fireflyFilterTrim; // TRIM mode: count of brightest samples tracked/trimmed
glm::vec4 cutPlane; // cutting plane (nx,ny,nz,d); disabled when all zero (GPU
// default)
};
Expand All @@ -812,9 +824,29 @@ enum class FrameFormat
UNKNOWN
};

// Per-pixel running Welford statistics for firefly suppression, tracked per RGB
// channel so a single-channel (chromatic) outlier is caught even when its
// luminance is unremarkable. `n` is the shared sample count — kept here because
// checkerboarding makes frameID a poor proxy for "how many samples this pixel
// has seen". CLAMP uses all three channels; TRIM uses only the luminance Welford
// in channel x and reads n as its sample divisor.
struct PixelLumStats
{
glm::vec3 mean; // per-channel running mean
glm::vec3 m2; // per-channel sum of squared deltas
float n; // sample count (shared across channels and with TRIM)
};

struct FrameBuffers
{
glm::vec4 *colorAccumulation;
PixelLumStats *lumStats;
// TRIM mode: the `trim` brightest samples seen per pixel, laid out
// [pixel*trim + slot] as (rgb in xyz, luminance in w; w < 0 marks an empty
// slot). At resolve the trimmed mean removes the outliers among these from
// the running colorAccumulation sum, so it needs only O(trim) memory per
// pixel. The sample count lives in lumStats->n.
glm::vec4 *trimTopK;
float *depth;
uint32_t *primID;
uint32_t *objID;
Expand Down
8 changes: 5 additions & 3 deletions devices/rtx/device/gpu/gpu_tonemap.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,11 @@
* POSSIBILITY OF SUCH DAMAGE.
*/

// Tonemap helpers — safe to include from both PTX and regular CUDA sources.
// gpu_util.h includes <optix_device.h> and cannot be used from Frame.cu;
// this header provides the subset needed by the compositing kernel.
// Reversible Reinhard-on-max tonemap, used by the "tonemap" firefly filter
// mode: each sample is compressed before accumulation and the average is
// expanded back afterward. Safe to include from both PTX and regular CUDA
// sources — gpu_util.h pulls in <optix_device.h> and cannot be included from
// the Frame.cu compositing kernel, so the inverse lives here on its own.
#pragma once

#include "gpu_math.h"
Expand Down
Loading
Loading