Implement multi-timepoint support for CUDA #92

KCL-BMEIS · Nov 21, 2023 · 13697c3 · 13697c3
1 parent 1e8b36e
commit 13697c3
Show file tree

Hide file tree

Showing 12 changed files with 229 additions and 285 deletions.
diff --git a/niftyreg_build_version.txt b/niftyreg_build_version.txt
@@ -1 +1 @@
-366
+367
diff --git a/reg-lib/Content.cpp b/reg-lib/Content.cpp
@@ -15,7 +15,7 @@ Content::Content(nifti_image *referenceIn,
         NR_FATAL_ERROR("referenceIn or floatingIn can't be nullptr");
     AllocateWarped();
     AllocateDeformationField(bytesIn);
-    activeVoxelNumber = reference->nvox;
+    activeVoxelNumber = NiftiImage::calcVoxelNumber(reference, 3);
     if (!referenceMask) {
         referenceMaskManaged.reset(new int[activeVoxelNumber]());
         referenceMask = referenceMaskManaged.get();

diff --git a/reg-lib/cuda/BlockSize.hpp b/reg-lib/cuda/BlockSize.hpp
@@ -36,8 +36,6 @@ struct BlockSize {
     unsigned reg_getConjugateGradient1;
     unsigned reg_getConjugateGradient2;
     unsigned reg_updateControlPointPosition;
-    unsigned GetSsdValue;
-    unsigned GetSsdGradient;
     unsigned reg_voxelCentricToNodeCentric;
     unsigned reg_convertNmiGradientFromVoxelToRealSpace;
     unsigned reg_ApplyConvolutionWindowAlongX;
@@ -74,8 +72,6 @@ struct BlockSize100: public BlockSize {
         reg_getConjugateGradient1 = 320; // 12 reg - 24 smem
         reg_getConjugateGradient2 = 384; // 10 reg - 40 smem
         reg_updateControlPointPosition = 384; // 08 reg - 24 smem
-        GetSsdValue = 320; // 12 reg - 24 smem - 08 cmem
-        GetSsdGradient = 320; // 12 reg - 24 smem - 08 cmem
         reg_voxelCentricToNodeCentric = 320; // 11 reg - 24 smem - 16 cmem
         reg_convertNmiGradientFromVoxelToRealSpace = 512; // 16 reg - 24 smem
         reg_ApplyConvolutionWindowAlongX = 512; // 14 reg - 28 smem - 08 cmem
@@ -114,8 +110,6 @@ struct BlockSize300: public BlockSize {
         reg_getConjugateGradient1 = 1024; // 22 reg
         reg_getConjugateGradient2 = 1024; // 25 reg
         reg_updateControlPointPosition = 1024; // 22 reg
-        GetSsdValue = 768; // 34 reg
-        GetSsdGradient = 768; // 34 reg
         reg_voxelCentricToNodeCentric = 1024; // 23 reg
         reg_convertNmiGradientFromVoxelToRealSpace = 1024; // 23 reg
         reg_ApplyConvolutionWindowAlongX = 1024; // 25 reg

diff --git a/reg-lib/cuda/CudaCompute.cu b/reg-lib/cuda/CudaCompute.cu
@@ -9,14 +9,17 @@
 /* *************************************************************** */
 void CudaCompute::ResampleImage(int interpolation, float paddingValue) {
     CudaContent& con = dynamic_cast<CudaContent&>(this->con);
-    reg_resampleImage_gpu(con.Content::GetFloating(),
-                          con.GetWarpedCuda(),
-                          con.GetFloatingCuda(),
-                          con.GetDeformationFieldCuda(),
-                          con.GetReferenceMaskCuda(),
-                          con.GetActiveVoxelNumber(),
-                          interpolation,
-                          paddingValue);
+    const nifti_image *floating = con.Content::GetFloating();
+    auto resampleImage = floating->nz > 1 ? Cuda::ResampleImage<true> : Cuda::ResampleImage<false>;
+    resampleImage(floating,
+                  con.GetFloatingCuda(),
+                  con.Content::GetWarped(),
+                  con.GetWarpedCuda(),
+                  con.GetDeformationFieldCuda(),
+                  con.GetReferenceMaskCuda(),
+                  con.GetActiveVoxelNumber(),
+                  interpolation,
+                  paddingValue);
 }
 /* *************************************************************** */
 double CudaCompute::GetJacobianPenaltyTerm(bool approx) {
@@ -124,15 +127,15 @@ void CudaCompute::UpdateControlPointPosition(float *currentDof,
 }
 /* *************************************************************** */
 void CudaCompute::GetImageGradient(int interpolation, float paddingValue, int activeTimePoint) {
-    // TODO Fix reg_getImageGradient_gpu to accept activeTimePoint
     CudaDefContent& con = dynamic_cast<CudaDefContent&>(this->con);
-    reg_getImageGradient_gpu(con.Content::GetFloating(),
-                             con.GetFloatingCuda(),
-                             con.GetDeformationFieldCuda(),
-                             con.GetWarpedGradientCuda(),
-                             con.GetActiveVoxelNumber(),
-                             interpolation,
-                             paddingValue);
+    Cuda::GetImageGradient(con.Content::GetFloating(),
+                           con.GetFloatingCuda(),
+                           con.GetDeformationFieldCuda(),
+                           con.GetWarpedGradientCuda(),
+                           con.GetActiveVoxelNumber(),
+                           interpolation,
+                           paddingValue,
+                           activeTimePoint);
 }
 /* *************************************************************** */
 double CudaCompute::GetMaximalLength(bool optimiseX, bool optimiseY, bool optimiseZ) {

diff --git a/reg-lib/cuda/CudaContent.cpp b/reg-lib/cuda/CudaContent.cpp
@@ -91,22 +91,21 @@ void CudaContent::SetReferenceMask(int *referenceMaskIn) {
         referenceMaskCuda = nullptr;
     }
 
+    activeVoxelNumber = 0;
     if (!referenceMask) return;
 
-    decltype(referenceMask) targetMask;
-    NR_CUDA_SAFE_CALL(cudaMallocHost(&targetMask, reference->nvox * sizeof(*targetMask)));
-    int *targetMaskPtr = targetMask;
-    activeVoxelNumber = 0;
-    for (size_t i = 0; i < reference->nvox; i++) {
+    const size_t voxelNumber = NiftiImage::calcVoxelNumber(reference, 3);
+    thrust::host_vector<int> mask(voxelNumber);
+    int *maskPtr = mask.data();
+    for (size_t i = 0; i < voxelNumber; i++) {
         if (referenceMask[i] != -1) {
-            *targetMaskPtr++ = i;
+            *maskPtr++ = static_cast<int>(i);
             activeVoxelNumber++;
         }
     }
 
     Cuda::Allocate(&referenceMaskCuda, activeVoxelNumber);
-    NR_CUDA_SAFE_CALL(cudaMemcpy(referenceMaskCuda, targetMask, activeVoxelNumber * sizeof(*targetMask), cudaMemcpyHostToDevice));
-    NR_CUDA_SAFE_CALL(cudaFreeHost(targetMask));
+    thrust::copy_n(mask.begin(), activeVoxelNumber, thrust::device_ptr<int>(referenceMaskCuda));
 }
 /* *************************************************************** */
 void CudaContent::SetTransformationMatrix(mat44 *transformationMatrixIn) {

diff --git a/reg-lib/cuda/CudaResampling.cu b/reg-lib/cuda/CudaResampling.cu
@@ -14,89 +14,92 @@
 #include "CudaResamplingKernels.cu"
 
 /* *************************************************************** */
-void reg_resampleImage_gpu(const nifti_image *floatingImage,
-                           float *warpedImageCuda,
-                           const float *floatingImageCuda,
-                           const float4 *deformationFieldCuda,
-                           const int *maskCuda,
-                           const size_t activeVoxelNumber,
-                           const int interpolation,
-                           const float paddingValue) {
+namespace NiftyReg::Cuda {
+/* *************************************************************** */
+template<bool is3d>
+void ResampleImage(const nifti_image *floatingImage,
+                   const float *floatingImageCuda,
+                   const nifti_image *warpedImage,
+                   float *warpedImageCuda,
+                   const float4 *deformationFieldCuda,
+                   const int *maskCuda,
+                   const size_t activeVoxelNumber,
+                   const int interpolation,
+                   const float paddingValue) {
     if (interpolation != 1)
         NR_FATAL_ERROR("Only linear interpolation is supported on the GPU");
 
     auto blockSize = CudaContext::GetBlockSize();
     const size_t voxelNumber = NiftiImage::calcVoxelNumber(floatingImage, 3);
     const int3 floatingDim = make_int3(floatingImage->nx, floatingImage->ny, floatingImage->nz);
-
-    // Create the texture object for the floating image
-    auto floatingTexture = Cuda::CreateTextureObject(floatingImageCuda, voxelNumber, cudaChannelFormatKindFloat, 1);
-    // Create the texture object for the deformation field
     auto deformationFieldTexture = Cuda::CreateTextureObject(deformationFieldCuda, activeVoxelNumber, cudaChannelFormatKindFloat, 4);
-    // Create the texture object for the mask
     auto maskTexture = Cuda::CreateTextureObject(maskCuda, activeVoxelNumber, cudaChannelFormatKindSigned, 1);
-
     // Bind the real to voxel matrix to the texture
-    const mat44 floatingMatrix = floatingImage->sform_code > 0 ? floatingImage->sto_ijk : floatingImage->qto_ijk;
+    const mat44& floatingMatrix = floatingImage->sform_code > 0 ? floatingImage->sto_ijk : floatingImage->qto_ijk;
 
-    if (floatingImage->nz > 1) {
-        const unsigned blocks = blockSize->reg_resampleImage3D;
-        const unsigned grids = (unsigned)Ceil(sqrtf((float)activeVoxelNumber / (float)blocks));
-        const dim3 gridDims(grids, grids, 1);
-        const dim3 blockDims(blocks, 1, 1);
-        reg_resampleImage3D_kernel<<<gridDims, blockDims>>>(warpedImageCuda, *floatingTexture, *deformationFieldTexture, *maskTexture,
-                                                            floatingMatrix, floatingDim, (unsigned)activeVoxelNumber, paddingValue);
-        NR_CUDA_CHECK_KERNEL(gridDims, blockDims);
-    } else {
-        const unsigned blocks = blockSize->reg_resampleImage2D;
-        const unsigned grids = (unsigned)Ceil(sqrtf((float)activeVoxelNumber / (float)blocks));
-        const dim3 gridDims(grids, grids, 1);
-        const dim3 blockDims(blocks, 1, 1);
-        reg_resampleImage2D_kernel<<<gridDims, blockDims>>>(warpedImageCuda, *floatingTexture, *deformationFieldTexture, *maskTexture,
-                                                            floatingMatrix, floatingDim, (unsigned)activeVoxelNumber, paddingValue);
-        NR_CUDA_CHECK_KERNEL(gridDims, blockDims);
+    for (int t = 0; t < warpedImage->nt * warpedImage->nu; t++) {
+        NR_DEBUG((is3d ? "3" : "2") << "D resampling of volume number " << t);
+        auto floatingTexture = Cuda::CreateTextureObject(floatingImageCuda + t * voxelNumber, voxelNumber, cudaChannelFormatKindFloat, 1);
+        if constexpr (is3d) {
+            const unsigned blocks = blockSize->reg_resampleImage3D;
+            const unsigned grids = (unsigned)Ceil(sqrtf((float)activeVoxelNumber / (float)blocks));
+            const dim3 gridDims(grids, grids, 1);
+            const dim3 blockDims(blocks, 1, 1);
+            ResampleImage3D<<<gridDims, blockDims>>>(warpedImageCuda + t * voxelNumber, *floatingTexture, *deformationFieldTexture, *maskTexture,
+                                                     floatingMatrix, floatingDim, (unsigned)activeVoxelNumber, paddingValue);
+            NR_CUDA_CHECK_KERNEL(gridDims, blockDims);
+        } else {
+            const unsigned blocks = blockSize->reg_resampleImage2D;
+            const unsigned grids = (unsigned)Ceil(sqrtf((float)activeVoxelNumber / (float)blocks));
+            const dim3 gridDims(grids, grids, 1);
+            const dim3 blockDims(blocks, 1, 1);
+            ResampleImage2D<<<gridDims, blockDims>>>(warpedImageCuda + t * voxelNumber, *floatingTexture, *deformationFieldTexture, *maskTexture,
+                                                     floatingMatrix, floatingDim, (unsigned)activeVoxelNumber, paddingValue);
+            NR_CUDA_CHECK_KERNEL(gridDims, blockDims);
+        }
     }
 }
+template void ResampleImage<false>(const nifti_image*, const float*, const nifti_image*, float*, const float4*, const int*, const size_t, const int, const float);
+template void ResampleImage<true>(const nifti_image*, const float*, const nifti_image*, float*, const float4*, const int*, const size_t, const int, const float);
 /* *************************************************************** */
-void reg_getImageGradient_gpu(const nifti_image *floatingImage,
-                              const float *floatingImageCuda,
-                              const float4 *deformationFieldCuda,
-                              float4 *warpedGradientCuda,
-                              const size_t activeVoxelNumber,
-                              const int interpolation,
-                              float paddingValue) {
+void GetImageGradient(const nifti_image *floatingImage,
+                      const float *floatingImageCuda,
+                      const float4 *deformationFieldCuda,
+                      float4 *warpedGradientCuda,
+                      const size_t activeVoxelNumber,
+                      const int interpolation,
+                      float paddingValue,
+                      const int activeTimePoint) {
     if (interpolation != 1)
         NR_FATAL_ERROR("Only linear interpolation is supported on the GPU");
 
     auto blockSize = CudaContext::GetBlockSize();
     const size_t voxelNumber = NiftiImage::calcVoxelNumber(floatingImage, 3);
     const int3 floatingDim = make_int3(floatingImage->nx, floatingImage->ny, floatingImage->nz);
     if (paddingValue != paddingValue) paddingValue = 0;
-
-    // Create the texture object for the floating image
-    auto floatingTexture = Cuda::CreateTextureObject(floatingImageCuda, voxelNumber, cudaChannelFormatKindFloat, 1);
-    // Create the texture object for the deformation field
+    auto floatingTexture = Cuda::CreateTextureObject(floatingImageCuda + activeTimePoint * voxelNumber, voxelNumber, cudaChannelFormatKindFloat, 1);
     auto deformationFieldTexture = Cuda::CreateTextureObject(deformationFieldCuda, activeVoxelNumber, cudaChannelFormatKindFloat, 4);
-
     // Bind the real to voxel matrix to the texture
-    const mat44 floatingMatrix = floatingImage->sform_code > 0 ? floatingImage->sto_ijk : floatingImage->qto_ijk;
+    const mat44& floatingMatrix = floatingImage->sform_code > 0 ? floatingImage->sto_ijk : floatingImage->qto_ijk;
 
     if (floatingImage->nz > 1) {
         const unsigned blocks = blockSize->reg_getImageGradient3D;
         const unsigned grids = (unsigned)Ceil(sqrtf((float)activeVoxelNumber / (float)blocks));
         const dim3 gridDims(grids, grids, 1);
         const dim3 blockDims(blocks, 1, 1);
-        reg_getImageGradient3D_kernel<<<gridDims, blockDims>>>(warpedGradientCuda, *floatingTexture, *deformationFieldTexture,
-                                                               floatingMatrix, floatingDim, (unsigned)activeVoxelNumber, paddingValue);
+        GetImageGradient3D<<<gridDims, blockDims>>>(warpedGradientCuda, *floatingTexture, *deformationFieldTexture,
+                                                    floatingMatrix, floatingDim, (unsigned)activeVoxelNumber, paddingValue);
         NR_CUDA_CHECK_KERNEL(gridDims, blockDims);
     } else {
         const unsigned blocks = blockSize->reg_getImageGradient2D;
         const unsigned grids = (unsigned)Ceil(sqrtf((float)activeVoxelNumber / (float)blocks));
         const dim3 gridDims(grids, grids, 1);
         const dim3 blockDims(blocks, 1, 1);
-        reg_getImageGradient2D_kernel<<<gridDims, blockDims>>>(warpedGradientCuda, *floatingTexture, *deformationFieldTexture,
-                                                               floatingMatrix, floatingDim, (unsigned)activeVoxelNumber, paddingValue);
+        GetImageGradient2D<<<gridDims, blockDims>>>(warpedGradientCuda, *floatingTexture, *deformationFieldTexture,
+                                                    floatingMatrix, floatingDim, (unsigned)activeVoxelNumber, paddingValue);
         NR_CUDA_CHECK_KERNEL(gridDims, blockDims);
     }
 }
 /* *************************************************************** */
+} // namespace NiftyReg::Cuda
+/* *************************************************************** */
diff --git a/reg-lib/cuda/CudaResampling.hpp b/reg-lib/cuda/CudaResampling.hpp
@@ -15,20 +15,27 @@
 #include "CudaCommon.hpp"
 
 /* *************************************************************** */
-void reg_resampleImage_gpu(const nifti_image *floatingImage,
-                           float *warpedImageCuda,
-                           const float *floatingImageCuda,
-                           const float4 *deformationFieldCuda,
-                           const int *maskCuda,
-                           const size_t activeVoxelNumber,
-                           const int interpolation,
-                           const float paddingValue);
+namespace NiftyReg::Cuda {
 /* *************************************************************** */
-void reg_getImageGradient_gpu(const nifti_image *floatingImage,
-                              const float *floatingImageCuda,
-                              const float4 *deformationFieldCuda,
-                              float4 *warpedGradientCuda,
-                              const size_t activeVoxelNumber,
-                              const int interpolation,
-                              float paddingValue);
+template<bool is3d>
+void ResampleImage(const nifti_image *floatingImage,
+                   const float *floatingImageCuda,
+                   const nifti_image *warpedImage,
+                   float *warpedImageCuda,
+                   const float4 *deformationFieldCuda,
+                   const int *maskCuda,
+                   const size_t activeVoxelNumber,
+                   const int interpolation,
+                   const float paddingValue);
+/* *************************************************************** */
+void GetImageGradient(const nifti_image *floatingImage,
+                      const float *floatingImageCuda,
+                      const float4 *deformationFieldCuda,
+                      float4 *warpedGradientCuda,
+                      const size_t activeVoxelNumber,
+                      const int interpolation,
+                      float paddingValue,
+                      const int activeTimePoint);
+/* *************************************************************** */
+} // namespace NiftyReg::Cuda
 /* *************************************************************** */