Skip to content

Commit

Permalink
Optimise Cuda::DefFieldCompose() #92
Browse files Browse the repository at this point in the history
  • Loading branch information
onurulgen committed Jan 10, 2024
1 parent a73014e commit 92ec3ce
Show file tree
Hide file tree
Showing 6 changed files with 61 additions and 93 deletions.
2 changes: 1 addition & 1 deletion niftyreg_build_version.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
381
382
6 changes: 0 additions & 6 deletions reg-lib/cuda/BlockSize.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,6 @@ struct BlockSize {
unsigned ComputeJacGradient3d;
unsigned ApproxCorrectFolding3d;
unsigned CorrectFolding3d;
unsigned DefFieldCompose2d;
unsigned DefFieldCompose3d;
unsigned GetJacobianMatrix;
unsigned ConvertNmiGradientFromVoxelToRealSpace;
unsigned ApplyConvolutionWindowAlongX;
Expand All @@ -49,8 +47,6 @@ struct BlockSize100: public BlockSize {
ComputeJacGradient3d = 256; // 32 reg - 24 smem - 64 cmem
ApproxCorrectFolding3d = 256; // 32 reg - 24 smem - 24 cmem
CorrectFolding3d = 256; // 31 reg - 24 smem - 32 cmem
DefFieldCompose2d = 512; // 15 reg - 24 smem - 08 cmem - 16 lmem
DefFieldCompose3d = 384; // 21 reg - 24 smem - 08 cmem - 24 lmem
GetJacobianMatrix = 512; // 16 reg - 24 smem - 04 cmem
ConvertNmiGradientFromVoxelToRealSpace = 512; // 16 reg - 24 smem
ApplyConvolutionWindowAlongX = 512; // 14 reg - 28 smem - 08 cmem
Expand All @@ -74,8 +70,6 @@ struct BlockSize300: public BlockSize {
ComputeJacGradient3d = 768; // 37 reg
ApproxCorrectFolding3d = 768; // 34 reg
CorrectFolding3d = 768; // 34 reg
DefFieldCompose2d = 1024; // 23 reg
DefFieldCompose3d = 1024; // 24 reg
GetJacobianMatrix = 768; // 34 reg
ConvertNmiGradientFromVoxelToRealSpace = 1024; // 23 reg
ApplyConvolutionWindowAlongX = 1024; // 25 reg
Expand Down
3 changes: 2 additions & 1 deletion reg-lib/cuda/CudaCompute.cu
Original file line number Diff line number Diff line change
Expand Up @@ -324,6 +324,7 @@ void CudaCompute::DefFieldCompose(const nifti_image *defField) {
const size_t voxelNumber = NiftiImage::calcVoxelNumber(defField, 3);
thrust::device_vector<float4> defFieldCuda(voxelNumber);
Cuda::TransferNiftiToDevice(defFieldCuda.data().get(), defField);
Cuda::DefFieldCompose(defField, defFieldCuda.data().get(), con.GetDeformationFieldCuda());
auto defFieldCompose = defField->nz > 1 ? Cuda::DefFieldCompose<true> : Cuda::DefFieldCompose<false>;
defFieldCompose(defField, defFieldCuda.data().get(), con.GetDeformationFieldCuda());
}
/* *************************************************************** */
32 changes: 10 additions & 22 deletions reg-lib/cuda/CudaLocalTransformation.cu
Original file line number Diff line number Diff line change
Expand Up @@ -634,33 +634,20 @@ void GetFlowFieldFromVelocityGrid(nifti_image *velocityFieldGrid,
velocityFieldGrid->num_ext = oldNumExt;
}
/* *************************************************************** */
template<bool is3d>
void DefFieldCompose(const nifti_image *deformationField,
const float4 *deformationFieldCuda,
float4 *deformationFieldCudaOut) {
auto blockSize = CudaContext::GetBlockSize();
float4 *deformationFieldOutCuda) {
const size_t voxelNumber = NiftiImage::calcVoxelNumber(deformationField, 3);
const int3 referenceImageDim{ deformationField->nx, deformationField->ny, deformationField->nz };
const int3 referenceImageDims{ deformationField->nx, deformationField->ny, deformationField->nz };
const mat44& affineMatrixB = deformationField->sform_code > 0 ? deformationField->sto_ijk : deformationField->qto_ijk;
const mat44& affineMatrixC = deformationField->sform_code > 0 ? deformationField->sto_xyz : deformationField->qto_xyz;
auto deformationFieldTexture = Cuda::CreateTextureObject(deformationFieldCuda, voxelNumber, cudaChannelFormatKindFloat, 4);
auto deformationFieldTexturePtr = Cuda::CreateTextureObject(deformationFieldCuda, voxelNumber, cudaChannelFormatKindFloat, 4);
auto deformationFieldTexture = *deformationFieldTexturePtr;

if (deformationField->nz > 1) {
const unsigned blocks = blockSize->DefFieldCompose3d;
const unsigned grids = (unsigned)Ceil(sqrtf((float)voxelNumber / (float)blocks));
const dim3 gridDims(grids, grids, 1);
const dim3 blockDims(blocks, 1, 1);
DefFieldCompose3d<<<gridDims, blockDims>>>(deformationFieldCudaOut, *deformationFieldTexture, referenceImageDim,
(unsigned)voxelNumber, affineMatrixB, affineMatrixC);
NR_CUDA_CHECK_KERNEL(gridDims, blockDims);
} else {
const unsigned blocks = blockSize->DefFieldCompose2d;
const unsigned grids = (unsigned)Ceil(sqrtf((float)voxelNumber / (float)blocks));
const dim3 gridDims(grids, grids, 1);
const dim3 blockDims(blocks, 1, 1);
DefFieldCompose2d<<<gridDims, blockDims>>>(deformationFieldCudaOut, *deformationFieldTexture, referenceImageDim,
(unsigned)voxelNumber, affineMatrixB, affineMatrixC);
NR_CUDA_CHECK_KERNEL(gridDims, blockDims);
}
thrust::for_each_n(thrust::device, thrust::make_counting_iterator(0), voxelNumber, [=]__device__(const int index) {
DefFieldComposeKernel<is3d>(deformationFieldOutCuda, deformationFieldTexture, referenceImageDims, affineMatrixB, affineMatrixC, index);
});
}
/* *************************************************************** */
void GetDeformationFieldFromFlowField(nifti_image *flowField,
Expand Down Expand Up @@ -725,9 +712,10 @@ void GetDeformationFieldFromFlowField(nifti_image *flowField,
thrust::copy(thrust::device, flowFieldCuda, flowFieldCuda + voxelNumber, deformationFieldCuda);

// The deformation field is squared
auto defFieldCompose = deformationField->nz > 1 ? DefFieldCompose<true> : DefFieldCompose<false>;
for (int i = 0; i < squaringNumber; ++i) {
// The deformation field is applied to itself
DefFieldCompose(deformationField, deformationFieldCuda, flowFieldCuda);
defFieldCompose(deformationField, deformationFieldCuda, flowFieldCuda);
// The computed scaled deformation field is copied over
thrust::copy(thrust::device, flowFieldCuda, flowFieldCuda + voxelNumber, deformationFieldCuda);
NR_DEBUG("Squaring (composition) step " << i + 1 << "/" << squaringNumber);
Expand Down
1 change: 1 addition & 0 deletions reg-lib/cuda/CudaLocalTransformation.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ double CorrectFolding(const nifti_image *referenceImage,
float4 *controlPointImageCuda,
const bool approx);
/* *************************************************************** */
template<bool is3d>
void DefFieldCompose(const nifti_image *deformationField,
const float4 *deformationFieldCuda,
float4 *deformationFieldOutCuda);
Expand Down
110 changes: 47 additions & 63 deletions reg-lib/cuda/CudaLocalTransformationKernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -1057,67 +1057,22 @@ __global__ void CorrectFolding3d(float4 *controlPointGrid,
}
}
/* *************************************************************** */
__global__ void DefFieldCompose2d(float4 *deformationField,
cudaTextureObject_t deformationFieldTexture,
const int3 referenceImageDim,
const unsigned voxelNumber,
const mat44 affineMatrixB,
const mat44 affineMatrixC) {
const unsigned tid = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;
if (tid < voxelNumber) {
// Extract the original voxel position
float4 position = deformationField[tid];

// Conversion from real position to voxel coordinate
const float4 voxelPosition{
position.x * affineMatrixB.m[0][0] + position.y * affineMatrixB.m[0][1] + affineMatrixB.m[0][3],
position.x * affineMatrixB.m[1][0] + position.y * affineMatrixB.m[1][1] + affineMatrixB.m[1][3],
0.f,
0.f
};

// Linear interpolation
const int2 ante = { Floor(voxelPosition.x), Floor(voxelPosition.y) };
float relX[2], relY[2];
relX[1] = voxelPosition.x - (float)ante.x; relX[0] = 1.f - relX[1];
relY[1] = voxelPosition.y - (float)ante.y; relY[0] = 1.f - relY[1];

position = make_float4(0.f, 0.f, 0.f, 0.f);
for (short b = 0; b < 2; ++b) {
for (short a = 0; a < 2; ++a) {
float4 deformation;
if (-1 < ante.x + a && ante.x + a < referenceImageDim.x &&
-1 < ante.y + b && ante.y + b < referenceImageDim.y) {
const int index = (ante.y + b) * referenceImageDim.x + ante.x + a;
deformation = tex1Dfetch<float4>(deformationFieldTexture, index);
} else {
deformation = GetSlidedValues(ante.x + a, ante.y + b, deformationFieldTexture, referenceImageDim, affineMatrixC);
}
const float basis = relX[a] * relY[b];
position = position + basis * deformation;
}
}
deformationField[tid] = position;
}
}
/* *************************************************************** */
__global__ void DefFieldCompose3d(float4 *deformationField,
cudaTextureObject_t deformationFieldTexture,
const int3 referenceImageDim,
const unsigned voxelNumber,
const mat44 affineMatrixB,
const mat44 affineMatrixC) {
const unsigned tid = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;
if (tid < voxelNumber) {
// Extract the original voxel position
float4 position = deformationField[tid];
template<bool is3d>
__device__ void DefFieldComposeKernel(float4 *deformationField,
cudaTextureObject_t deformationFieldTexture,
const int3 referenceImageDims,
const mat44 affineMatrixB,
const mat44 affineMatrixC,
const int index) {
// Extract the original voxel position
float4 position = deformationField[index];

if constexpr (is3d) {
// Conversion from real position to voxel coordinate
const float4 voxelPosition{
const float3 voxelPosition{
position.x * affineMatrixB.m[0][0] + position.y * affineMatrixB.m[0][1] + position.z * affineMatrixB.m[0][2] + affineMatrixB.m[0][3],
position.x * affineMatrixB.m[1][0] + position.y * affineMatrixB.m[1][1] + position.z * affineMatrixB.m[1][2] + affineMatrixB.m[1][3],
position.x * affineMatrixB.m[2][0] + position.y * affineMatrixB.m[2][1] + position.z * affineMatrixB.m[2][2] + affineMatrixB.m[2][3],
0.f
position.x * affineMatrixB.m[2][0] + position.y * affineMatrixB.m[2][1] + position.z * affineMatrixB.m[2][2] + affineMatrixB.m[2][3]
};

// Linear interpolation
Expand All @@ -1132,21 +1087,50 @@ __global__ void DefFieldCompose3d(float4 *deformationField,
for (short b = 0; b < 2; ++b) {
for (short a = 0; a < 2; ++a) {
float4 deformation;
if (-1 < ante.x + a && ante.x + a < referenceImageDim.x &&
-1 < ante.y + b && ante.y + b < referenceImageDim.y &&
-1 < ante.z + c && ante.z + c < referenceImageDim.z) {
const int index = ((ante.z + c) * referenceImageDim.y + ante.y + b) * referenceImageDim.x + ante.x + a;
if (-1 < ante.x + a && ante.x + a < referenceImageDims.x &&
-1 < ante.y + b && ante.y + b < referenceImageDims.y &&
-1 < ante.z + c && ante.z + c < referenceImageDims.z) {
const int index = ((ante.z + c) * referenceImageDims.y + ante.y + b) * referenceImageDims.x + ante.x + a;
deformation = tex1Dfetch<float4>(deformationFieldTexture, index);
} else {
deformation = GetSlidedValues(ante.x + a, ante.y + b, ante.z + c, deformationFieldTexture, referenceImageDim, affineMatrixC);
deformation = GetSlidedValues(ante.x + a, ante.y + b, ante.z + c, deformationFieldTexture, referenceImageDims, affineMatrixC);
}
const float basis = relX[a] * relY[b] * relZ[c];
position = position + basis * deformation;
}
}
}
deformationField[tid] = position;
} else {
// Conversion from real position to voxel coordinate
const float2 voxelPosition{
position.x * affineMatrixB.m[0][0] + position.y * affineMatrixB.m[0][1] + affineMatrixB.m[0][3],
position.x * affineMatrixB.m[1][0] + position.y * affineMatrixB.m[1][1] + affineMatrixB.m[1][3]
};

// Linear interpolation
const int2 ante = { Floor(voxelPosition.x), Floor(voxelPosition.y) };
float relX[2], relY[2];
relX[1] = voxelPosition.x - (float)ante.x; relX[0] = 1.f - relX[1];
relY[1] = voxelPosition.y - (float)ante.y; relY[0] = 1.f - relY[1];

position = make_float4(0.f, 0.f, 0.f, 0.f);
for (short b = 0; b < 2; ++b) {
for (short a = 0; a < 2; ++a) {
float4 deformation;
if (-1 < ante.x + a && ante.x + a < referenceImageDims.x &&
-1 < ante.y + b && ante.y + b < referenceImageDims.y) {
const int index = (ante.y + b) * referenceImageDims.x + ante.x + a;
deformation = tex1Dfetch<float4>(deformationFieldTexture, index);
} else {
deformation = GetSlidedValues(ante.x + a, ante.y + b, deformationFieldTexture, referenceImageDims, affineMatrixC);
}
const float basis = relX[a] * relY[b];
position = position + basis * deformation;
}
}
}

deformationField[index] = position;
}
/* *************************************************************** */
__global__ void GetJacobianMatrix3d(float *jacobianMatrices,
Expand Down

0 comments on commit 92ec3ce

Please sign in to comment.