From 07f231a30b10dadc7f9f67d84e80c5d11d4d45af Mon Sep 17 00:00:00 2001
From: LordMathis
Date: Tue, 5 Mar 2024 21:04:11 +0100
Subject: [PATCH] Switch padding kernel to row major

---
 src/kernels/padding.cu       | 37 +++++++++++++++++++++----------------
 test/kernels/test_padding.cu | 30 ++++++++++++++++++++--------
 2 files changed, 43 insertions(+), 24 deletions(-)

diff --git a/src/kernels/padding.cu b/src/kernels/padding.cu
index f8b60c1..79f1ac1 100644
--- a/src/kernels/padding.cu
+++ b/src/kernels/padding.cu
@@ -2,7 +2,7 @@
 
 /*
 Pads matrix width x height x n_channels to width + 2 * padding x height + 2 *
-padding x n_channels Matrix is represented as a pointer to column major vector
+padding x n_channels. Matrix is represented as a pointer to a row-major vector
 
 For example:
 
@@ -22,20 +22,29 @@ Channel 1:
 
 Is represented as:
 
-0 2 4 1 3 5 6 8 10 7 9 11
+0 1 2 3 4 5 6 7 8 9 10 11
 
 Padded result (as a continuous vector):
 
-0 0 0 0 0 0 0 2 4 0
-0 1 3 5 0 0 0 0 0 0
-0 0 0 0 0 0 6 8 10 0
-0 7 9 11 0 0 0 0 0 0
+0.0f, 0.0f, 0.0f, 0.0f,
+0.0f, 0.0f, 1.0f, 0.0f,
+0.0f, 2.0f, 3.0f, 0.0f,
+0.0f, 4.0f, 5.0f, 0.0f,
+0.0f, 0.0f, 0.0f, 0.0f,
+0.0f, 0.0f, 0.0f, 0.0f,
+0.0f, 6.0f, 7.0f, 0.0f,
+0.0f, 8.0f, 9.0f, 0.0f,
+0.0f, 10.0f, 11.0f, 0.0f,
+0.0f, 0.0f, 0.0f, 0.0f
 
 Args:
   d_input: Pointer to input vector representing matrix
   d_padded: Pointer to output vector representing padded matrix (needs to be
-pre-allocated) w: Width of input matrix h: Height of input matrix n: Number of
-channels in input matrix p: Padding
+pre-allocated)
+  w: Width of input matrix
+  h: Height of input matrix
+  n: Number of channels in input matrix
+  p: Padding
 */
 __global__ void pad_matrix_kernel(
     const float* d_input,
@@ -53,21 +62,17 @@ __global__ void pad_matrix_kernel(
 
     int idx = tid;
 
-    // unravel index
-    int i_h = idx % (h + 2 * p);
-    idx /= (h + 2 * p);
-
+    // unravel index into padded matrix
+    int i_n = idx / ((w + 2 * p) * (h + 2 * p));
+    int i_h = idx % ((w + 2 * p) * (h + 2 * p)) / (w + 2 * p);
     int i_w = idx % (w + 2 * p);
-    idx /= (w + 2 * p);
-
-    int i_n = idx % n;
 
     // if i is in the padding region
     if (i_w < p || i_w >= (w + p) || i_h < p || i_h >= (h + p)) {
        d_padded[tid] = 0.0f;
     } else {
         // Get index into input vector
-        int i_orig = i_n * w * h + (i_w - p) * h + (i_h - p);
+        int i_orig = i_n * w * h + (i_h - p) * w + (i_w - p);
         d_padded[tid] = d_input[i_orig];
     }
 }
diff --git a/test/kernels/test_padding.cu b/test/kernels/test_padding.cu
index 8ab56be..8038245 100644
--- a/test/kernels/test_padding.cu
+++ b/test/kernels/test_padding.cu
@@ -35,13 +35,13 @@ TEST(PaddingTest, SimplePaddingTest) {
     8 9
     10 11
 
-    Represented as column major vector:
+    Represented as a row-major vector:
 
-    0 2 4 1 3 5 6 8 10 7 9 11
+    0 1 2 3 4 5 6 7 8 9 10 11
     */
 
-    std::vector<float> input = {0.0f, 2.0f, 4.0f, 1.0f, 3.0f, 5.0f,
-                                6.0f, 8.0f, 10.0f, 7.0f, 9.0f, 11.0f};
+    std::vector<float> input = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
+                                6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f};
 
     cudaStatus = cudaMemcpy(
         d_input, input.data(), sizeof(float) * inputSize, cudaMemcpyHostToDevice
@@ -57,12 +57,22 @@ TEST(PaddingTest, SimplePaddingTest) {
     cudaStatus = cudaDeviceSynchronize();
     EXPECT_EQ(cudaStatus, cudaSuccess);
 
+    // clang-format off
     std::vector<float> expectedOutput = {
-        0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 2.0f, 4.0f, 0.0f,
-        0.0f, 1.0f, 3.0f, 5.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
-        0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 6.0f, 8.0f, 10.0f, 0.0f,
-        0.0f, 7.0f, 9.0f, 11.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f
+        // channel 0
+        0.0f, 0.0f, 0.0f, 0.0f,
+        0.0f, 0.0f, 1.0f, 0.0f,
+        0.0f, 2.0f, 3.0f, 0.0f,
+        0.0f, 4.0f, 5.0f, 0.0f,
+        0.0f, 0.0f, 0.0f, 0.0f,
+        // channel 1
+        0.0f, 0.0f, 0.0f, 0.0f,
+        0.0f, 6.0f, 7.0f, 0.0f,
+        0.0f, 8.0f, 9.0f, 0.0f,
+        0.0f, 10.0f, 11.0f, 0.0f,
+        0.0f, 0.0f, 0.0f, 0.0f
     };
+    // clang-format on
 
     std::vector<float> output(paddedSize);
 
@@ -75,4 +85,8 @@ TEST(PaddingTest, SimplePaddingTest) {
     for (int i = 0; i < paddedSize; i++) {
         EXPECT_NEAR(expectedOutput[i], output[i], 1e-5);
     }
+
+
+    cudaFree(d_input);
+    cudaFree(d_padded);
 }
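
Reviewer note (not part of the patch): the row-major unravel introduced above can be
sanity-checked on the host with a small reference implementation. The sketch below
mirrors the kernel's index arithmetic one-to-one; the helper name pad_matrix_cpu and
its signature are illustrative only and do not exist in the repository.

#include <vector>

// CPU reference of the row-major padding performed by pad_matrix_kernel.
// Input is n channels of w x h values stored row by row; output is n channels
// of (w + 2p) x (h + 2p) values with a zero border of width p around each channel.
std::vector<float> pad_matrix_cpu(
    const std::vector<float>& input, int w, int h, int n, int p
) {
    const int pw = w + 2 * p;  // padded width
    const int ph = h + 2 * p;  // padded height
    std::vector<float> padded(pw * ph * n, 0.0f);
    for (int tid = 0; tid < pw * ph * n; tid++) {
        // Same unravel as the kernel: tid = i_n * (pw * ph) + i_h * pw + i_w
        int i_n = tid / (pw * ph);
        int i_h = tid % (pw * ph) / pw;
        int i_w = tid % pw;
        // Inside the border: gather from the row-major input
        if (i_w >= p && i_w < w + p && i_h >= p && i_h < h + p) {
            padded[tid] = input[i_n * w * h + (i_h - p) * w + (i_w - p)];
        }
    }
    return padded;
}

For w = 2, h = 3, n = 2, p = 1 and input 0..11 this reproduces the expectedOutput
vector asserted in SimplePaddingTest.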