mirror of
https://github.com/lordmathis/CUDANet.git
synced 2025-11-05 17:34:21 +00:00
Switch padding kernel to row major
This commit is contained in:
@@ -2,7 +2,7 @@
|
|||||||
|
|
||||||
/*
|
/*
|
||||||
Pads matrix width x height x n_channels to width + 2 * padding x height + 2 *
|
Pads matrix width x height x n_channels to width + 2 * padding x height + 2 *
|
||||||
padding x n_channels Matrix is represented as a pointer to column major vector
|
padding x n_channels Matrix is represented as a pointer to a vector
|
||||||
|
|
||||||
For example:
|
For example:
|
||||||
|
|
||||||
@@ -22,20 +22,29 @@ Channel 1:
|
|||||||
|
|
||||||
Is represented as:
|
Is represented as:
|
||||||
|
|
||||||
0 2 4 1 3 5 6 8 10 7 9 11
|
0 1 2 3 4 5 6 7 8 9 10 11
|
||||||
|
|
||||||
Padded result (as a continuous vector):
|
Padded result (as a continuous vector):
|
||||||
|
|
||||||
0 0 0 0 0 0 0 2 4 0
|
0.0f, 0.0f, 0.0f, 0.0f,
|
||||||
0 1 3 5 0 0 0 0 0 0
|
0.0f, 0.0f, 1.0f, 0.0f,
|
||||||
0 0 0 0 0 0 6 8 10 0
|
0.0f, 2.0f, 3.0f, 0.0f,
|
||||||
0 7 9 11 0 0 0 0 0 0
|
0.0f, 4.0f, 5.0f, 0.0f,
|
||||||
|
0.0f, 0.0f, 0.0f, 0.0f,
|
||||||
|
0.0f, 0.0f, 0.0f, 0.0f,
|
||||||
|
0.0f, 6.0f, 7.0f, 0.0f,
|
||||||
|
0.0f, 8.0f, 9.0f, 0.0f,
|
||||||
|
0.0f, 10.0f, 11.0f, 0.0f,
|
||||||
|
0.0f, 0.0f, 0.0f, 0.0f
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
d_input: Pointer to input vector representing matrix
|
d_input: Pointer to input vector representing matrix
|
||||||
d_padded: Pointer to output vector representing padded matrix (needs to be
|
d_padded: Pointer to output vector representing padded matrix (needs to be
|
||||||
pre-allocated) w: Width of input matrix h: Height of input matrix n: Number of
|
pre-allocated)
|
||||||
channels in input matrix p: Padding
|
w: Width of input matrix
|
||||||
|
h: Height of input matrix
|
||||||
|
n: Number of channels in input matrix
|
||||||
|
p: Padding
|
||||||
*/
|
*/
|
||||||
__global__ void pad_matrix_kernel(
|
__global__ void pad_matrix_kernel(
|
||||||
const float* d_input,
|
const float* d_input,
|
||||||
@@ -53,21 +62,17 @@ __global__ void pad_matrix_kernel(
|
|||||||
|
|
||||||
int idx = tid;
|
int idx = tid;
|
||||||
|
|
||||||
// unravel index
|
// unravel index into padded matrix
|
||||||
int i_h = idx % (h + 2 * p);
|
int i_n = idx / ((w + 2 * p) * (h + 2 * p));
|
||||||
idx /= (h + 2 * p);
|
int i_h = idx % ((w + 2 * p) * (h + 2 * p)) / (w + 2 * p);
|
||||||
|
|
||||||
int i_w = idx % (w + 2 * p);
|
int i_w = idx % (w + 2 * p);
|
||||||
idx /= (w + 2 * p);
|
|
||||||
|
|
||||||
int i_n = idx % n;
|
|
||||||
|
|
||||||
// if i is in the padding region
|
// if i is in the padding region
|
||||||
if (i_w < p || i_w >= (w + p) || i_h < p || i_h >= (h + p)) {
|
if (i_w < p || i_w >= (w + p) || i_h < p || i_h >= (h + p)) {
|
||||||
d_padded[tid] = 0.0f;
|
d_padded[tid] = 0.0f;
|
||||||
} else {
|
} else {
|
||||||
// Get index into input vector
|
// Get index into input vector
|
||||||
int i_orig = i_n * w * h + (i_w - p) * h + (i_h - p);
|
int i_orig = i_n * w * h + (i_h - p) * w + (i_w - p);
|
||||||
d_padded[tid] = d_input[i_orig];
|
d_padded[tid] = d_input[i_orig];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -35,13 +35,13 @@ TEST(PaddingTest, SimplePaddingTest) {
|
|||||||
8 9
|
8 9
|
||||||
10 11
|
10 11
|
||||||
|
|
||||||
Represented as column major vector:
|
Represented as a vector:
|
||||||
|
|
||||||
0 2 4 1 3 5 6 8 10 7 9 11
|
0 1 2 3 4 5 6 7 8 9 10 11
|
||||||
*/
|
*/
|
||||||
|
|
||||||
std::vector<float> input = {0.0f, 2.0f, 4.0f, 1.0f, 3.0f, 5.0f,
|
std::vector<float> input = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
|
||||||
6.0f, 8.0f, 10.0f, 7.0f, 9.0f, 11.0f};
|
6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f};
|
||||||
|
|
||||||
cudaStatus = cudaMemcpy(
|
cudaStatus = cudaMemcpy(
|
||||||
d_input, input.data(), sizeof(float) * inputSize, cudaMemcpyHostToDevice
|
d_input, input.data(), sizeof(float) * inputSize, cudaMemcpyHostToDevice
|
||||||
@@ -57,12 +57,22 @@ TEST(PaddingTest, SimplePaddingTest) {
|
|||||||
cudaStatus = cudaDeviceSynchronize();
|
cudaStatus = cudaDeviceSynchronize();
|
||||||
EXPECT_EQ(cudaStatus, cudaSuccess);
|
EXPECT_EQ(cudaStatus, cudaSuccess);
|
||||||
|
|
||||||
|
// clang-format off
|
||||||
std::vector<float> expectedOutput = {
|
std::vector<float> expectedOutput = {
|
||||||
0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 2.0f, 4.0f, 0.0f,
|
// channel 0
|
||||||
0.0f, 1.0f, 3.0f, 5.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
|
0.0f, 0.0f, 0.0f, 0.0f,
|
||||||
0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 6.0f, 8.0f, 10.0f, 0.0f,
|
0.0f, 0.0f, 1.0f, 0.0f,
|
||||||
0.0f, 7.0f, 9.0f, 11.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f
|
0.0f, 2.0f, 3.0f, 0.0f,
|
||||||
|
0.0f, 4.0f, 5.0f, 0.0f,
|
||||||
|
0.0f, 0.0f, 0.0f, 0.0f,
|
||||||
|
// channel 1
|
||||||
|
0.0f, 0.0f, 0.0f, 0.0f,
|
||||||
|
0.0f, 6.0f, 7.0f, 0.0f,
|
||||||
|
0.0f, 8.0f, 9.0f, 0.0f,
|
||||||
|
0.0f, 10.0f, 11.0f, 0.0f,
|
||||||
|
0.0f, 0.0f, 0.0f, 0.0f
|
||||||
};
|
};
|
||||||
|
// clang-format on
|
||||||
|
|
||||||
std::vector<float> output(paddedSize);
|
std::vector<float> output(paddedSize);
|
||||||
|
|
||||||
@@ -75,4 +85,8 @@ TEST(PaddingTest, SimplePaddingTest) {
|
|||||||
for (int i = 0; i < paddedSize; i++) {
|
for (int i = 0; i < paddedSize; i++) {
|
||||||
EXPECT_NEAR(expectedOutput[i], output[i], 1e-5);
|
EXPECT_NEAR(expectedOutput[i], output[i], 1e-5);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
cudaFree(d_input);
|
||||||
|
cudaFree(d_padded);
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user