Switch padding kernel to row major

2024-03-05 21:04:11 +01:00
parent f4257afd5a
commit 07f231a30b
2 changed files with 43 additions and 24 deletions


@@ -2,7 +2,7 @@
 /*
 Pads matrix width x height x n_channels to width + 2 * padding x height + 2 *
-padding x n_channels Matrix is represented as a pointer to column major vector
+padding x n_channels Matrix is represented as a pointer to a vector
 For example:
@@ -22,20 +22,29 @@ Channel 1:
 Is represented as:
-0 2 4 1 3 5 6 8 10 7 9 11
+0 1 2 3 4 5 6 7 8 9 10 11
 Padded result (as a continuous vector):
-0 0 0 0 0 0 0 2 4 0
-0 1 3 5 0 0 0 0 0 0
-0 0 0 0 0 0 6 8 10 0
-0 7 9 11 0 0 0 0 0 0
+0.0f, 0.0f, 0.0f, 0.0f,
+0.0f, 0.0f, 1.0f, 0.0f,
+0.0f, 2.0f, 3.0f, 0.0f,
+0.0f, 4.0f, 5.0f, 0.0f,
+0.0f, 0.0f, 0.0f, 0.0f,
+0.0f, 0.0f, 0.0f, 0.0f,
+0.0f, 6.0f, 7.0f, 0.0f,
+0.0f, 8.0f, 9.0f, 0.0f,
+0.0f, 10.0f, 11.0f, 0.0f,
+0.0f, 0.0f, 0.0f, 0.0f
 Args:
 d_input: Pointer to input vector representing matrix
 d_padded: Pointer to output vector representing padded matrix (needs to be
-pre-allocated) w: Width of input matrix h: Height of input matrix n: Number of
-channels in input matrix p: Padding
+pre-allocated)
+w: Width of input matrix
+h: Height of input matrix
+n: Number of channels in input matrix
+p: Padding
 */
 __global__ void pad_matrix_kernel(
     const float* d_input,
@@ -53,21 +62,17 @@ __global__ void pad_matrix_kernel(
     int idx = tid;
-    // unravel index
-    int i_h = idx % (h + 2 * p);
-    idx /= (h + 2 * p);
+    // unravel index into padded matrix
+    int i_n = idx / ((w + 2 * p) * (h + 2 * p));
+    int i_h = idx % ((w + 2 * p) * (h + 2 * p)) / (w + 2 * p);
     int i_w = idx % (w + 2 * p);
-    idx /= (w + 2 * p);
-    int i_n = idx % n;
     // if i is in the padding region
     if (i_w < p || i_w >= (w + p) || i_h < p || i_h >= (h + p)) {
         d_padded[tid] = 0.0f;
     } else {
         // Get index into input vector
-        int i_orig = i_n * w * h + (i_w - p) * h + (i_h - p);
+        int i_orig = i_n * w * h + (i_h - p) * w + (i_w - p);
         d_padded[tid] = d_input[i_orig];
     }
 }
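
The new unravel order (channel, then row, then column) is easiest to sanity-check off-device. Below is a host-side sketch that mirrors the kernel's row-major index math against the 2 x 3 x 2 example in the comment; the helper name pad_matrix_host is illustrative, not part of the repo.

#include <cstdio>
#include <vector>

// Host mirror of pad_matrix_kernel's index math (illustrative only).
std::vector<float> pad_matrix_host(const std::vector<float>& input,
                                   int w, int h, int n, int p) {
    const int pw = w + 2 * p;  // padded width
    const int ph = h + 2 * p;  // padded height
    std::vector<float> padded(pw * ph * n);
    for (int tid = 0; tid < pw * ph * n; tid++) {
        // unravel flat index into (channel, row, column), row major
        int i_n = tid / (pw * ph);
        int i_h = tid % (pw * ph) / pw;
        int i_w = tid % pw;
        if (i_w < p || i_w >= (w + p) || i_h < p || i_h >= (h + p)) {
            padded[tid] = 0.0f;  // element lies in the zero border
        } else {
            // row-major index into the unpadded input
            padded[tid] = input[i_n * w * h + (i_h - p) * w + (i_w - p)];
        }
    }
    return padded;
}

int main() {
    // the 2 (wide) x 3 (high) x 2 (channels) example, padding = 1
    std::vector<float> input = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
    std::vector<float> out = pad_matrix_host(input, 2, 3, 2, 1);
    for (size_t i = 0; i < out.size(); i++)
        std::printf("%g%c", out[i], i % 4 == 3 ? '\n' : ' ');
    return 0;
}

Running it prints the same ten rows of four as the comment above, zero border included.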


@@ -35,13 +35,13 @@ TEST(PaddingTest, SimplePaddingTest) {
     8 9
     10 11
-    Represented as column major vector:
-    0 2 4 1 3 5 6 8 10 7 9 11
+    Represented as a vector:
+    0 1 2 3 4 5 6 7 8 9 10 11
     */
-    std::vector<float> input = {0.0f, 2.0f, 4.0f, 1.0f, 3.0f, 5.0f,
-                                6.0f, 8.0f, 10.0f, 7.0f, 9.0f, 11.0f};
+    std::vector<float> input = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
+                                6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f};
     cudaStatus = cudaMemcpy(
         d_input, input.data(), sizeof(float) * inputSize, cudaMemcpyHostToDevice
@@ -57,12 +57,22 @@ TEST(PaddingTest, SimplePaddingTest) {
     cudaStatus = cudaDeviceSynchronize();
     EXPECT_EQ(cudaStatus, cudaSuccess);
     // clang-format off
     std::vector<float> expectedOutput = {
-        0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 2.0f, 4.0f, 0.0f,
-        0.0f, 1.0f, 3.0f, 5.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
-        0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 6.0f, 8.0f, 10.0f, 0.0f,
-        0.0f, 7.0f, 9.0f, 11.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f
+        // channel 0
+        0.0f, 0.0f, 0.0f, 0.0f,
+        0.0f, 0.0f, 1.0f, 0.0f,
+        0.0f, 2.0f, 3.0f, 0.0f,
+        0.0f, 4.0f, 5.0f, 0.0f,
+        0.0f, 0.0f, 0.0f, 0.0f,
+        // channel 1
+        0.0f, 0.0f, 0.0f, 0.0f,
+        0.0f, 6.0f, 7.0f, 0.0f,
+        0.0f, 8.0f, 9.0f, 0.0f,
+        0.0f, 10.0f, 11.0f, 0.0f,
+        0.0f, 0.0f, 0.0f, 0.0f
     };
     // clang-format on
     std::vector<float> output(paddedSize);
@@ -75,4 +85,8 @@ TEST(PaddingTest, SimplePaddingTest) {
     for (int i = 0; i < paddedSize; i++) {
         EXPECT_NEAR(expectedOutput[i], output[i], 1e-5);
     }
+    cudaFree(d_input);
+    cudaFree(d_padded);
 }
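
For completeness, a minimal launch sketch under stated assumptions: the parameter order follows the kernel's Args comment (d_input, d_padded, w, h, n, p), the kernel is assumed to guard tid against indices past the padded size, and the 256-thread block size is an arbitrary illustrative choice; none of this launch code appears in the diff itself.

#include <cuda_runtime.h>

__global__ void pad_matrix_kernel(
    const float* d_input, float* d_padded, int w, int h, int n, int p);

// Illustrative wrapper: one thread per element of the (w+2p) x (h+2p) x n output.
void pad_matrix(const float* d_input, float* d_padded,
                int w, int h, int n, int p) {
    int paddedSize = (w + 2 * p) * (h + 2 * p) * n;
    int blockSize = 256;  // assumption, not taken from the repo
    int numBlocks = (paddedSize + blockSize - 1) / blockSize;
    pad_matrix_kernel<<<numBlocks, blockSize>>>(d_input, d_padded, w, h, n, p);
}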