Skip to content

Commit

Permalink
Implement multi-timepoint support for CUDA #92
Browse files Browse the repository at this point in the history
  • Loading branch information
onurulgen committed Nov 21, 2023
1 parent 1e8b36e commit 13697c3
Show file tree
Hide file tree
Showing 12 changed files with 229 additions and 285 deletions.
2 changes: 1 addition & 1 deletion niftyreg_build_version.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
366
367
2 changes: 1 addition & 1 deletion reg-lib/Content.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ Content::Content(nifti_image *referenceIn,
NR_FATAL_ERROR("referenceIn or floatingIn can't be nullptr");
AllocateWarped();
AllocateDeformationField(bytesIn);
activeVoxelNumber = reference->nvox;
activeVoxelNumber = NiftiImage::calcVoxelNumber(reference, 3);
if (!referenceMask) {
referenceMaskManaged.reset(new int[activeVoxelNumber]());
referenceMask = referenceMaskManaged.get();
Expand Down
6 changes: 0 additions & 6 deletions reg-lib/cuda/BlockSize.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,6 @@ struct BlockSize {
unsigned reg_getConjugateGradient1;
unsigned reg_getConjugateGradient2;
unsigned reg_updateControlPointPosition;
unsigned GetSsdValue;
unsigned GetSsdGradient;
unsigned reg_voxelCentricToNodeCentric;
unsigned reg_convertNmiGradientFromVoxelToRealSpace;
unsigned reg_ApplyConvolutionWindowAlongX;
Expand Down Expand Up @@ -74,8 +72,6 @@ struct BlockSize100: public BlockSize {
reg_getConjugateGradient1 = 320; // 12 reg - 24 smem
reg_getConjugateGradient2 = 384; // 10 reg - 40 smem
reg_updateControlPointPosition = 384; // 08 reg - 24 smem
GetSsdValue = 320; // 12 reg - 24 smem - 08 cmem
GetSsdGradient = 320; // 12 reg - 24 smem - 08 cmem
reg_voxelCentricToNodeCentric = 320; // 11 reg - 24 smem - 16 cmem
reg_convertNmiGradientFromVoxelToRealSpace = 512; // 16 reg - 24 smem
reg_ApplyConvolutionWindowAlongX = 512; // 14 reg - 28 smem - 08 cmem
Expand Down Expand Up @@ -114,8 +110,6 @@ struct BlockSize300: public BlockSize {
reg_getConjugateGradient1 = 1024; // 22 reg
reg_getConjugateGradient2 = 1024; // 25 reg
reg_updateControlPointPosition = 1024; // 22 reg
GetSsdValue = 768; // 34 reg
GetSsdGradient = 768; // 34 reg
reg_voxelCentricToNodeCentric = 1024; // 23 reg
reg_convertNmiGradientFromVoxelToRealSpace = 1024; // 23 reg
reg_ApplyConvolutionWindowAlongX = 1024; // 25 reg
Expand Down
35 changes: 19 additions & 16 deletions reg-lib/cuda/CudaCompute.cu
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,17 @@
/* *************************************************************** */
void CudaCompute::ResampleImage(int interpolation, float paddingValue) {
CudaContent& con = dynamic_cast<CudaContent&>(this->con);
reg_resampleImage_gpu(con.Content::GetFloating(),
con.GetWarpedCuda(),
con.GetFloatingCuda(),
con.GetDeformationFieldCuda(),
con.GetReferenceMaskCuda(),
con.GetActiveVoxelNumber(),
interpolation,
paddingValue);
const nifti_image *floating = con.Content::GetFloating();
auto resampleImage = floating->nz > 1 ? Cuda::ResampleImage<true> : Cuda::ResampleImage<false>;
resampleImage(floating,
con.GetFloatingCuda(),
con.Content::GetWarped(),
con.GetWarpedCuda(),
con.GetDeformationFieldCuda(),
con.GetReferenceMaskCuda(),
con.GetActiveVoxelNumber(),
interpolation,
paddingValue);
}
/* *************************************************************** */
double CudaCompute::GetJacobianPenaltyTerm(bool approx) {
Expand Down Expand Up @@ -124,15 +127,15 @@ void CudaCompute::UpdateControlPointPosition(float *currentDof,
}
/* *************************************************************** */
void CudaCompute::GetImageGradient(int interpolation, float paddingValue, int activeTimePoint) {
// TODO Fix reg_getImageGradient_gpu to accept activeTimePoint
CudaDefContent& con = dynamic_cast<CudaDefContent&>(this->con);
reg_getImageGradient_gpu(con.Content::GetFloating(),
con.GetFloatingCuda(),
con.GetDeformationFieldCuda(),
con.GetWarpedGradientCuda(),
con.GetActiveVoxelNumber(),
interpolation,
paddingValue);
Cuda::GetImageGradient(con.Content::GetFloating(),
con.GetFloatingCuda(),
con.GetDeformationFieldCuda(),
con.GetWarpedGradientCuda(),
con.GetActiveVoxelNumber(),
interpolation,
paddingValue,
activeTimePoint);
}
/* *************************************************************** */
double CudaCompute::GetMaximalLength(bool optimiseX, bool optimiseY, bool optimiseZ) {
Expand Down
15 changes: 7 additions & 8 deletions reg-lib/cuda/CudaContent.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -91,22 +91,21 @@ void CudaContent::SetReferenceMask(int *referenceMaskIn) {
referenceMaskCuda = nullptr;
}

activeVoxelNumber = 0;
if (!referenceMask) return;

decltype(referenceMask) targetMask;
NR_CUDA_SAFE_CALL(cudaMallocHost(&targetMask, reference->nvox * sizeof(*targetMask)));
int *targetMaskPtr = targetMask;
activeVoxelNumber = 0;
for (size_t i = 0; i < reference->nvox; i++) {
const size_t voxelNumber = NiftiImage::calcVoxelNumber(reference, 3);
thrust::host_vector<int> mask(voxelNumber);
int *maskPtr = mask.data();
for (size_t i = 0; i < voxelNumber; i++) {
if (referenceMask[i] != -1) {
*targetMaskPtr++ = i;
*maskPtr++ = static_cast<int>(i);
activeVoxelNumber++;
}
}

Cuda::Allocate(&referenceMaskCuda, activeVoxelNumber);
NR_CUDA_SAFE_CALL(cudaMemcpy(referenceMaskCuda, targetMask, activeVoxelNumber * sizeof(*targetMask), cudaMemcpyHostToDevice));
NR_CUDA_SAFE_CALL(cudaFreeHost(targetMask));
thrust::copy_n(mask.begin(), activeVoxelNumber, thrust::device_ptr<int>(referenceMaskCuda));
}
/* *************************************************************** */
void CudaContent::SetTransformationMatrix(mat44 *transformationMatrixIn) {
Expand Down
99 changes: 51 additions & 48 deletions reg-lib/cuda/CudaResampling.cu
Original file line number Diff line number Diff line change
Expand Up @@ -14,89 +14,92 @@
#include "CudaResamplingKernels.cu"

/* *************************************************************** */
void reg_resampleImage_gpu(const nifti_image *floatingImage,
float *warpedImageCuda,
const float *floatingImageCuda,
const float4 *deformationFieldCuda,
const int *maskCuda,
const size_t activeVoxelNumber,
const int interpolation,
const float paddingValue) {
namespace NiftyReg::Cuda {
/* *************************************************************** */
template<bool is3d>
void ResampleImage(const nifti_image *floatingImage,
const float *floatingImageCuda,
const nifti_image *warpedImage,
float *warpedImageCuda,
const float4 *deformationFieldCuda,
const int *maskCuda,
const size_t activeVoxelNumber,
const int interpolation,
const float paddingValue) {
if (interpolation != 1)
NR_FATAL_ERROR("Only linear interpolation is supported on the GPU");

auto blockSize = CudaContext::GetBlockSize();
const size_t voxelNumber = NiftiImage::calcVoxelNumber(floatingImage, 3);
const int3 floatingDim = make_int3(floatingImage->nx, floatingImage->ny, floatingImage->nz);

// Create the texture object for the floating image
auto floatingTexture = Cuda::CreateTextureObject(floatingImageCuda, voxelNumber, cudaChannelFormatKindFloat, 1);
// Create the texture object for the deformation field
auto deformationFieldTexture = Cuda::CreateTextureObject(deformationFieldCuda, activeVoxelNumber, cudaChannelFormatKindFloat, 4);
// Create the texture object for the mask
auto maskTexture = Cuda::CreateTextureObject(maskCuda, activeVoxelNumber, cudaChannelFormatKindSigned, 1);

// Bind the real to voxel matrix to the texture
const mat44 floatingMatrix = floatingImage->sform_code > 0 ? floatingImage->sto_ijk : floatingImage->qto_ijk;
const mat44& floatingMatrix = floatingImage->sform_code > 0 ? floatingImage->sto_ijk : floatingImage->qto_ijk;

if (floatingImage->nz > 1) {
const unsigned blocks = blockSize->reg_resampleImage3D;
const unsigned grids = (unsigned)Ceil(sqrtf((float)activeVoxelNumber / (float)blocks));
const dim3 gridDims(grids, grids, 1);
const dim3 blockDims(blocks, 1, 1);
reg_resampleImage3D_kernel<<<gridDims, blockDims>>>(warpedImageCuda, *floatingTexture, *deformationFieldTexture, *maskTexture,
floatingMatrix, floatingDim, (unsigned)activeVoxelNumber, paddingValue);
NR_CUDA_CHECK_KERNEL(gridDims, blockDims);
} else {
const unsigned blocks = blockSize->reg_resampleImage2D;
const unsigned grids = (unsigned)Ceil(sqrtf((float)activeVoxelNumber / (float)blocks));
const dim3 gridDims(grids, grids, 1);
const dim3 blockDims(blocks, 1, 1);
reg_resampleImage2D_kernel<<<gridDims, blockDims>>>(warpedImageCuda, *floatingTexture, *deformationFieldTexture, *maskTexture,
floatingMatrix, floatingDim, (unsigned)activeVoxelNumber, paddingValue);
NR_CUDA_CHECK_KERNEL(gridDims, blockDims);
for (int t = 0; t < warpedImage->nt * warpedImage->nu; t++) {
NR_DEBUG((is3d ? "3" : "2") << "D resampling of volume number " << t);
auto floatingTexture = Cuda::CreateTextureObject(floatingImageCuda + t * voxelNumber, voxelNumber, cudaChannelFormatKindFloat, 1);
if constexpr (is3d) {
const unsigned blocks = blockSize->reg_resampleImage3D;
const unsigned grids = (unsigned)Ceil(sqrtf((float)activeVoxelNumber / (float)blocks));
const dim3 gridDims(grids, grids, 1);
const dim3 blockDims(blocks, 1, 1);
ResampleImage3D<<<gridDims, blockDims>>>(warpedImageCuda + t * voxelNumber, *floatingTexture, *deformationFieldTexture, *maskTexture,
floatingMatrix, floatingDim, (unsigned)activeVoxelNumber, paddingValue);
NR_CUDA_CHECK_KERNEL(gridDims, blockDims);
} else {
const unsigned blocks = blockSize->reg_resampleImage2D;
const unsigned grids = (unsigned)Ceil(sqrtf((float)activeVoxelNumber / (float)blocks));
const dim3 gridDims(grids, grids, 1);
const dim3 blockDims(blocks, 1, 1);
ResampleImage2D<<<gridDims, blockDims>>>(warpedImageCuda + t * voxelNumber, *floatingTexture, *deformationFieldTexture, *maskTexture,
floatingMatrix, floatingDim, (unsigned)activeVoxelNumber, paddingValue);
NR_CUDA_CHECK_KERNEL(gridDims, blockDims);
}
}
}
template void ResampleImage<false>(const nifti_image*, const float*, const nifti_image*, float*, const float4*, const int*, const size_t, const int, const float);
template void ResampleImage<true>(const nifti_image*, const float*, const nifti_image*, float*, const float4*, const int*, const size_t, const int, const float);
/* *************************************************************** */
void reg_getImageGradient_gpu(const nifti_image *floatingImage,
const float *floatingImageCuda,
const float4 *deformationFieldCuda,
float4 *warpedGradientCuda,
const size_t activeVoxelNumber,
const int interpolation,
float paddingValue) {
void GetImageGradient(const nifti_image *floatingImage,
const float *floatingImageCuda,
const float4 *deformationFieldCuda,
float4 *warpedGradientCuda,
const size_t activeVoxelNumber,
const int interpolation,
float paddingValue,
const int activeTimePoint) {
if (interpolation != 1)
NR_FATAL_ERROR("Only linear interpolation is supported on the GPU");

auto blockSize = CudaContext::GetBlockSize();
const size_t voxelNumber = NiftiImage::calcVoxelNumber(floatingImage, 3);
const int3 floatingDim = make_int3(floatingImage->nx, floatingImage->ny, floatingImage->nz);
if (paddingValue != paddingValue) paddingValue = 0;

// Create the texture object for the floating image
auto floatingTexture = Cuda::CreateTextureObject(floatingImageCuda, voxelNumber, cudaChannelFormatKindFloat, 1);
// Create the texture object for the deformation field
auto floatingTexture = Cuda::CreateTextureObject(floatingImageCuda + activeTimePoint * voxelNumber, voxelNumber, cudaChannelFormatKindFloat, 1);
auto deformationFieldTexture = Cuda::CreateTextureObject(deformationFieldCuda, activeVoxelNumber, cudaChannelFormatKindFloat, 4);

// Bind the real to voxel matrix to the texture
const mat44 floatingMatrix = floatingImage->sform_code > 0 ? floatingImage->sto_ijk : floatingImage->qto_ijk;
const mat44& floatingMatrix = floatingImage->sform_code > 0 ? floatingImage->sto_ijk : floatingImage->qto_ijk;

if (floatingImage->nz > 1) {
const unsigned blocks = blockSize->reg_getImageGradient3D;
const unsigned grids = (unsigned)Ceil(sqrtf((float)activeVoxelNumber / (float)blocks));
const dim3 gridDims(grids, grids, 1);
const dim3 blockDims(blocks, 1, 1);
reg_getImageGradient3D_kernel<<<gridDims, blockDims>>>(warpedGradientCuda, *floatingTexture, *deformationFieldTexture,
floatingMatrix, floatingDim, (unsigned)activeVoxelNumber, paddingValue);
GetImageGradient3D<<<gridDims, blockDims>>>(warpedGradientCuda, *floatingTexture, *deformationFieldTexture,
floatingMatrix, floatingDim, (unsigned)activeVoxelNumber, paddingValue);
NR_CUDA_CHECK_KERNEL(gridDims, blockDims);
} else {
const unsigned blocks = blockSize->reg_getImageGradient2D;
const unsigned grids = (unsigned)Ceil(sqrtf((float)activeVoxelNumber / (float)blocks));
const dim3 gridDims(grids, grids, 1);
const dim3 blockDims(blocks, 1, 1);
reg_getImageGradient2D_kernel<<<gridDims, blockDims>>>(warpedGradientCuda, *floatingTexture, *deformationFieldTexture,
floatingMatrix, floatingDim, (unsigned)activeVoxelNumber, paddingValue);
GetImageGradient2D<<<gridDims, blockDims>>>(warpedGradientCuda, *floatingTexture, *deformationFieldTexture,
floatingMatrix, floatingDim, (unsigned)activeVoxelNumber, paddingValue);
NR_CUDA_CHECK_KERNEL(gridDims, blockDims);
}
}
/* *************************************************************** */
} // namespace NiftyReg::Cuda
/* *************************************************************** */
37 changes: 22 additions & 15 deletions reg-lib/cuda/CudaResampling.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,20 +15,27 @@
#include "CudaCommon.hpp"

/* *************************************************************** */
void reg_resampleImage_gpu(const nifti_image *floatingImage,
float *warpedImageCuda,
const float *floatingImageCuda,
const float4 *deformationFieldCuda,
const int *maskCuda,
const size_t activeVoxelNumber,
const int interpolation,
const float paddingValue);
namespace NiftyReg::Cuda {
/* *************************************************************** */
void reg_getImageGradient_gpu(const nifti_image *floatingImage,
const float *floatingImageCuda,
const float4 *deformationFieldCuda,
float4 *warpedGradientCuda,
const size_t activeVoxelNumber,
const int interpolation,
float paddingValue);
template<bool is3d>
void ResampleImage(const nifti_image *floatingImage,
const float *floatingImageCuda,
const nifti_image *warpedImage,
float *warpedImageCuda,
const float4 *deformationFieldCuda,
const int *maskCuda,
const size_t activeVoxelNumber,
const int interpolation,
const float paddingValue);
/* *************************************************************** */
void GetImageGradient(const nifti_image *floatingImage,
const float *floatingImageCuda,
const float4 *deformationFieldCuda,
float4 *warpedGradientCuda,
const size_t activeVoxelNumber,
const int interpolation,
float paddingValue,
const int activeTimePoint);
/* *************************************************************** */
} // namespace NiftyReg::Cuda
/* *************************************************************** */
Loading

0 comments on commit 13697c3

Please sign in to comment.