Skip to content

Commit

Permalink
Optimise Cuda::GetDeformationField() #92
Browse files Browse the repository at this point in the history
  • Loading branch information
onurulgen committed Dec 1, 2023
1 parent 120386a commit 6511740
Show file tree
Hide file tree
Showing 6 changed files with 90 additions and 140 deletions.
2 changes: 1 addition & 1 deletion niftyreg_build_version.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
378
379
6 changes: 0 additions & 6 deletions reg-lib/cuda/BlockSize.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,6 @@ namespace NiftyReg {
/* *************************************************************** */
struct BlockSize {
unsigned reg_affine_getDeformationField;
unsigned GetDeformationField2d;
unsigned GetDeformationField3d;
unsigned GetApproxJacobianValues2d;
unsigned GetApproxJacobianValues3d;
unsigned ApproxLinearEnergyGradient;
Expand All @@ -43,8 +41,6 @@ struct BlockSize {
struct BlockSize100: public BlockSize {
BlockSize100() {
reg_affine_getDeformationField = 512; // 16 reg - 24 smem
GetDeformationField2d = 384; // 20 reg - 6168 smem - 28 cmem
GetDeformationField3d = 192; // 37 reg - 6168 smem - 28 cmem
GetApproxJacobianValues2d = 384; // 17 reg - 104 smem - 36 cmem
GetApproxJacobianValues3d = 256; // 27 reg - 356 smem - 108 cmem
ApproxLinearEnergyGradient = 384; // 40 reg
Expand Down Expand Up @@ -73,8 +69,6 @@ struct BlockSize100: public BlockSize {
struct BlockSize300: public BlockSize {
BlockSize300() {
reg_affine_getDeformationField = 1024; // 23 reg
GetDeformationField2d = 1024; // 34 reg
GetDeformationField3d = 1024; // 34 reg
GetApproxJacobianValues2d = 768; // 34 reg
GetApproxJacobianValues3d = 640; // 46 reg
ApproxLinearEnergyGradient = 768; // 40 reg
Expand Down
21 changes: 13 additions & 8 deletions reg-lib/cuda/CudaCompute.cu
Original file line number Diff line number Diff line change
Expand Up @@ -98,15 +98,20 @@ void CudaCompute::LandmarkDistanceGradient(size_t landmarkNumber, float *landmar
}
/* *************************************************************** */
void CudaCompute::GetDeformationField(bool composition, bool bspline) {
decltype(Cuda::GetDeformationField<true, true>) *getDeformationField;
if (composition)
getDeformationField = bspline ? Cuda::GetDeformationField<true, true> :
Cuda::GetDeformationField<true, false>;
else
getDeformationField = bspline ? Cuda::GetDeformationField<false, true> :
Cuda::GetDeformationField<false, false>;
CudaF3dContent& con = dynamic_cast<CudaF3dContent&>(this->con);
Cuda::GetDeformationField(con.F3dContent::GetControlPointGrid(),
con.F3dContent::GetReference(),
con.GetControlPointGridCuda(),
con.GetDeformationFieldCuda(),
con.GetReferenceMaskCuda(),
con.GetActiveVoxelNumber(),
composition,
bspline);
getDeformationField(con.F3dContent::GetControlPointGrid(),
con.F3dContent::GetReference(),
con.GetControlPointGridCuda(),
con.GetDeformationFieldCuda(),
con.GetReferenceMaskCuda(),
con.GetActiveVoxelNumber());
}
/* *************************************************************** */
template<bool optimiseX, bool optimiseY, bool optimiseZ>
Expand Down
88 changes: 32 additions & 56 deletions reg-lib/cuda/CudaLocalTransformation.cu
Original file line number Diff line number Diff line change
Expand Up @@ -18,67 +18,47 @@
/* *************************************************************** */
namespace NiftyReg::Cuda {
/* *************************************************************** */
template<bool composition, bool bspline>
void GetDeformationField(const nifti_image *controlPointImage,
const nifti_image *referenceImage,
const float4 *controlPointImageCuda,
float4 *deformationFieldCuda,
const int *maskCuda,
const size_t activeVoxelNumber,
const bool composition,
const bool bspline) {
const size_t activeVoxelNumber) {
const size_t controlPointNumber = NiftiImage::calcVoxelNumber(controlPointImage, 3);
const int3 referenceImageDim = make_int3(referenceImage->nx, referenceImage->ny, referenceImage->nz);
const int3 controlPointImageDim = make_int3(controlPointImage->nx, controlPointImage->ny, controlPointImage->nz);
const float3 controlPointVoxelSpacing = make_float3(controlPointImage->dx / referenceImage->dx,
controlPointImage->dy / referenceImage->dy,
controlPointImage->dz / referenceImage->dz);

auto controlPointTexture = Cuda::CreateTextureObject(controlPointImageCuda, controlPointNumber, cudaChannelFormatKindFloat, 4);
auto maskTexture = Cuda::CreateTextureObject(maskCuda, activeVoxelNumber, cudaChannelFormatKindSigned, 1);
auto controlPointTexturePtr = Cuda::CreateTextureObject(controlPointImageCuda, controlPointNumber, cudaChannelFormatKindFloat, 4);
auto maskTexturePtr = Cuda::CreateTextureObject(maskCuda, activeVoxelNumber, cudaChannelFormatKindSigned, 1);
auto controlPointTexture = *controlPointTexturePtr;
auto maskTexture = *maskTexturePtr;

// Get the reference matrix if composition is required
thrust::device_vector<mat44> realToVoxel;
if (composition) {
thrust::device_vector<mat44> realToVoxelCudaVec;
if constexpr (composition) {
const mat44 *matPtr = controlPointImage->sform_code > 0 ? &controlPointImage->sto_ijk : &controlPointImage->qto_ijk;
realToVoxel = thrust::device_vector<mat44>(matPtr, matPtr + 1);
realToVoxelCudaVec = thrust::device_vector<mat44>(matPtr, matPtr + 1);
}
const auto realToVoxelCuda = composition ? realToVoxelCudaVec.data().get() : nullptr;

if (referenceImage->nz > 1) {
const unsigned blocks = CudaContext::GetBlockSize()->GetDeformationField3d;
const unsigned grids = (unsigned)Ceil(sqrtf((float)activeVoxelNumber / (float)blocks));
const dim3 gridDims(grids, grids, 1);
const dim3 blockDims(blocks, 1, 1);
// 8 floats of shared memory are allocated per thread
GetDeformationField3d<<<gridDims, blockDims, blocks * 8 * sizeof(float)>>>(deformationFieldCuda,
*controlPointTexture,
*maskTexture,
realToVoxel.data().get(),
referenceImageDim,
controlPointImageDim,
controlPointVoxelSpacing,
(unsigned)activeVoxelNumber,
composition,
bspline);
NR_CUDA_CHECK_KERNEL(gridDims, blockDims);
thrust::for_each_n(thrust::device, thrust::make_counting_iterator(0), activeVoxelNumber, [=]__device__(const int index) {
GetDeformationField3d<composition, bspline>(deformationFieldCuda, controlPointTexture, maskTexture, realToVoxelCuda,
referenceImageDim, controlPointImageDim, controlPointVoxelSpacing, index);
});
} else {
const unsigned blocks = CudaContext::GetBlockSize()->GetDeformationField2d;
const unsigned grids = (unsigned)Ceil(sqrtf((float)activeVoxelNumber / (float)blocks));
const dim3 gridDims(grids, grids, 1);
const dim3 blockDims(blocks, 1, 1);
// 4 floats of shared memory are allocated per thread
GetDeformationField2d<<<gridDims, blockDims, blocks * 4 * sizeof(float)>>>(deformationFieldCuda,
*controlPointTexture,
*maskTexture,
realToVoxel.data().get(),
referenceImageDim,
controlPointImageDim,
controlPointVoxelSpacing,
(unsigned)activeVoxelNumber,
composition,
bspline);
NR_CUDA_CHECK_KERNEL(gridDims, blockDims);
thrust::for_each_n(thrust::device, thrust::make_counting_iterator(0), activeVoxelNumber, [=]__device__(const int index) {
GetDeformationField2d<composition, bspline>(deformationFieldCuda, controlPointTexture, maskTexture, realToVoxelCuda,
referenceImageDim, controlPointImageDim, controlPointVoxelSpacing, index);
});
}
}
template void GetDeformationField<false, false>(const nifti_image*, const nifti_image*, const float4*, float4*, const int*, const size_t);
template void GetDeformationField<true, false>(const nifti_image*, const nifti_image*, const float4*, float4*, const int*, const size_t);
/* *************************************************************** */
template<bool is3d>
struct Basis2nd {
Expand Down Expand Up @@ -644,14 +624,12 @@ void GetFlowFieldFromVelocityGrid(nifti_image *velocityFieldGrid,
// Copy over the number of required squaring steps
flowField->intent_p2 = velocityFieldGrid->intent_p2;
// The initial flow field is generated using cubic B-Spline interpolation/approximation
GetDeformationField(velocityFieldGrid,
flowField,
velocityFieldGridCuda,
flowFieldCuda,
maskCuda,
activeVoxelNumber,
true, // composition
true); // bspline
GetDeformationField<true, true>(velocityFieldGrid,
flowField,
velocityFieldGridCuda,
flowFieldCuda,
maskCuda,
activeVoxelNumber);

velocityFieldGrid->num_ext = oldNumExt;
}
Expand Down Expand Up @@ -784,14 +762,12 @@ void GetDefFieldFromVelocityGrid(nifti_image *velocityFieldGrid,
// Check if the velocity field is actually a velocity field
if (velocityFieldGrid->intent_p1 == CUB_SPLINE_GRID) {
// Use the spline approximation to generate the deformation field
GetDeformationField(velocityFieldGrid,
deformationField,
velocityFieldGridCuda,
deformationFieldCuda,
maskCuda.data().get(),
voxelNumber,
false, // composition
true); // bspline
GetDeformationField<false, true>(velocityFieldGrid,
deformationField,
velocityFieldGridCuda,
deformationFieldCuda,
maskCuda.data().get(),
voxelNumber);
} else if (velocityFieldGrid->intent_p1 == SPLINE_VEL_GRID) {
// Create an image to store the flow field
NiftiImage flowField(deformationField, NiftiImage::Copy::ImageInfo);
Expand Down
5 changes: 2 additions & 3 deletions reg-lib/cuda/CudaLocalTransformation.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,14 +21,13 @@ void GetDeformationFromDisplacement(nifti_image *image, float4 *imageCuda);
/* *************************************************************** */
void GetDisplacementFromDeformation(nifti_image *image, float4 *imageCuda);
/* *************************************************************** */
template<bool composition, bool bspline>
void GetDeformationField(const nifti_image *controlPointImage,
const nifti_image *referenceImage,
const float4 *controlPointImageCuda,
float4 *deformationFieldCuda,
const int *maskCuda,
const size_t activeVoxelNumber,
const bool composition,
const bool bspline);
const size_t activeVoxelNumber);
/* *************************************************************** */
template<bool is3d>
double ApproxBendingEnergy(const nifti_image *controlPointImage,
Expand Down
Loading

0 comments on commit 6511740

Please sign in to comment.