From 07f231a30b10dadc7f9f67d84e80c5d11d4d45af Mon Sep 17 00:00:00 2001
From: LordMathis
Date: Tue, 5 Mar 2024 21:04:11 +0100
Subject: [PATCH] Switch padding kernel to row major

---
 src/kernels/padding.cu       | 37 +++++++++++++++++++++----------------
 test/kernels/test_padding.cu | 30 ++++++++++++++++++++--------
 2 files changed, 43 insertions(+), 24 deletions(-)

diff --git a/src/kernels/padding.cu b/src/kernels/padding.cu
index f8b60c1..79f1ac1 100644
--- a/src/kernels/padding.cu
+++ b/src/kernels/padding.cu
@@ -2,7 +2,7 @@
 
 /*
 Pads matrix width x height x n_channels to width + 2 * padding x height + 2 *
-padding x n_channels Matrix is represented as a pointer to column major vector
+padding x n_channels. Matrix is represented as a pointer to a row-major vector
 
 For example:
 
@@ -22,20 +22,29 @@ Channel 1:
 
 Is represented as:
 
-0 2 4 1 3 5 6 8 10 7 9 11
+0 1 2 3 4 5 6 7 8 9 10 11
 
 Padded result (as a continuous vector):
 
-0 0 0 0 0 0 0 2 4 0
-0 1 3 5 0 0 0 0 0 0
-0 0 0 0 0 0 6 8 10 0
-0 7 9 11 0 0 0 0 0 0
+0.0f, 0.0f, 0.0f, 0.0f,
+0.0f, 0.0f, 1.0f, 0.0f,
+0.0f, 2.0f, 3.0f, 0.0f,
+0.0f, 4.0f, 5.0f, 0.0f,
+0.0f, 0.0f, 0.0f, 0.0f,
+0.0f, 0.0f, 0.0f, 0.0f,
+0.0f, 6.0f, 7.0f, 0.0f,
+0.0f, 8.0f, 9.0f, 0.0f,
+0.0f, 10.0f, 11.0f, 0.0f,
+0.0f, 0.0f, 0.0f, 0.0f
 
 Args:
   d_input: Pointer to input vector representing matrix
   d_padded: Pointer to output vector representing padded matrix (needs to be
-pre-allocated) w: Width of input matrix h: Height of input matrix n: Number of
-channels in input matrix p: Padding
+pre-allocated)
+  w: Width of input matrix
+  h: Height of input matrix
+  n: Number of channels in input matrix
+  p: Padding
 */
 __global__ void pad_matrix_kernel(
     const float* d_input,
@@ -53,21 +62,17 @@ __global__ void pad_matrix_kernel(
 
     int idx = tid;
 
-    // unravel index
-    int i_h = idx % (h + 2 * p);
-    idx /= (h + 2 * p);
-
+    // unravel index into padded matrix
+    int i_n = idx / ((w + 2 * p) * (h + 2 * p));
+    int i_h = idx % ((w + 2 * p) * (h + 2 * p)) / (w + 2 * p);
     int i_w = idx % (w + 2 * p);
-    idx /= (w + 2 * p);
-
-    int i_n = idx % n;
 
     // if i is in the padding region
     if (i_w < p || i_w >= (w + p) || i_h < p || i_h >= (h + p)) {
        d_padded[tid] = 0.0f;
     } else {
         // Get index into input vector
-        int i_orig = i_n * w * h + (i_w - p) * h + (i_h - p);
+        int i_orig = i_n * w * h + (i_h - p) * w + (i_w - p);
         d_padded[tid] = d_input[i_orig];
     }
 }
diff --git a/test/kernels/test_padding.cu b/test/kernels/test_padding.cu
index 8ab56be..8038245 100644
--- a/test/kernels/test_padding.cu
+++ b/test/kernels/test_padding.cu
@@ -35,13 +35,13 @@ TEST(PaddingTest, SimplePaddingTest) {
     8 9
     10 11
 
-    Represented as column major vector:
+    Represented as a row-major vector:
 
-    0 2 4 1 3 5 6 8 10 7 9 11
+    0 1 2 3 4 5 6 7 8 9 10 11
     */
 
-    std::vector<float> input = {0.0f, 2.0f, 4.0f, 1.0f, 3.0f, 5.0f,
-                                6.0f, 8.0f, 10.0f, 7.0f, 9.0f, 11.0f};
+    std::vector<float> input = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
+                                6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f};
 
     cudaStatus = cudaMemcpy(
         d_input, input.data(), sizeof(float) * inputSize, cudaMemcpyHostToDevice
@@ -57,12 +57,22 @@ TEST(PaddingTest, SimplePaddingTest) {
     cudaStatus = cudaDeviceSynchronize();
     EXPECT_EQ(cudaStatus, cudaSuccess);
 
+    // clang-format off
     std::vector<float> expectedOutput = {
-        0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 2.0f, 4.0f, 0.0f,
-        0.0f, 1.0f, 3.0f, 5.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
-        0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 6.0f, 8.0f, 10.0f, 0.0f,
-        0.0f, 7.0f, 9.0f, 11.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f
+        // channel 0
+        0.0f, 0.0f, 0.0f, 0.0f,
+        0.0f, 0.0f, 1.0f, 0.0f,
+        0.0f, 2.0f, 3.0f, 0.0f,
+        0.0f, 4.0f, 5.0f, 0.0f,
+        0.0f, 0.0f, 0.0f, 0.0f,
+        // channel 1
+        0.0f, 0.0f, 0.0f, 0.0f,
+        0.0f, 6.0f, 7.0f, 0.0f,
+        0.0f, 8.0f, 9.0f, 0.0f,
+        0.0f, 10.0f, 11.0f, 0.0f,
+        0.0f, 0.0f, 0.0f, 0.0f
     };
+    // clang-format on
 
     std::vector<float> output(paddedSize);
 
@@ -75,4 +85,8 @@ TEST(PaddingTest, SimplePaddingTest) {
     for (int i = 0; i < paddedSize; i++) {
         EXPECT_NEAR(expectedOutput[i], output[i], 1e-5);
     }
+
+
+    cudaFree(d_input);
+    cudaFree(d_padded);
 }
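
Reviewer note (not part of the patch): the row-major unravel introduced above can be
sanity-checked on the host with a small reference implementation. The sketch below
mirrors the kernel's index arithmetic one-to-one; the helper name pad_matrix_cpu and
its signature are illustrative only and do not exist in the repository.

#include <vector>

// CPU reference of the row-major padding performed by pad_matrix_kernel.
// Input is n channels of w x h values stored row by row; output is n channels
// of (w + 2p) x (h + 2p) values with a zero border of width p around each channel.
std::vector<float> pad_matrix_cpu(
    const std::vector<float>& input, int w, int h, int n, int p
) {
    const int pw = w + 2 * p;  // padded width
    const int ph = h + 2 * p;  // padded height
    std::vector<float> padded(pw * ph * n, 0.0f);
    for (int tid = 0; tid < pw * ph * n; tid++) {
        // Same unravel as the kernel: tid = i_n * (pw * ph) + i_h * pw + i_w
        int i_n = tid / (pw * ph);
        int i_h = tid % (pw * ph) / pw;
        int i_w = tid % pw;
        // Inside the border: gather from the row-major input
        if (i_w >= p && i_w < w + p && i_h >= p && i_h < h + p) {
            padded[tid] = input[i_n * w * h + (i_h - p) * w + (i_w - p)];
        }
    }
    return padded;
}

For w = 2, h = 3, n = 2, p = 1 and input 0..11 this reproduces the expectedOutput
vector asserted in SimplePaddingTest.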