Start implementing padding kernel

2025-12-22 14:24:22 +00:00 · 2024-02-29 22:21:48 +01:00
parent 045359cca2
commit 019ccc33d9
5 changed files with 158 additions and 1 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -11,6 +11,7 @@ include_directories(${CUDAToolkit_INCLUDE_DIRS})
 set(LIBRARY_SOURCES
    src/utils/cuda_helper.cu
    src/kernels/activations.cu
    src/kernels/padding.cu
    src/layers/dense.cu
 )
--- a/include/kernels/padding.cuh
+++ b/include/kernels/padding.cuh
@@ -0,0 +1,13 @@
 #ifndef PADDING_H
 #define PADDING_H
 __global__ void pad_matrix_kernel(
    const float* d_input,
    float*       d_padded,
    int          w,
    int          h,
    int          n,
    int          p
 );
 #endif  // PADDING_H
--- a/src/kernels/padding.cu
+++ b/src/kernels/padding.cu
@@ -0,0 +1,57 @@
 /*
 Pads matrix width x height x n_channels to width + 2 * padding x height + 2 *
 padding x n_channels Matrix is represented as a pointer to column major vector
 For example:
 w = 2
 h = 3
 n = 2
 p = 1
 Channel 0:
  0  1
  2  3
  4  5
 Channel 1:
  6  7
  8  9
 10 11
 Is represented as:
 0 2 4 1 3 5 6 8 10 7 9 11
 Padded result:
 0 0 0 0 0 0 0 2 4 0 0 1 3 5 0 0 0 0 0 0 0 0 0 0 0 0 6 8 10 0 0 7 9 11 0 0 0 0 0 0
 Args:
  d_input: Pointer to input vector representing matrix
  d_padded: Pointer to output vector representing padded matrix (needs to be
 pre-allocated) w: Width of input matrix h: Height of input matrix n: Number of
 channels in input matrix p: Padding
 */
 __global__ void pad_matrix_kernel(
    const float* d_input,
    float*       d_padded,
    int          w,
    int          h,
    int          n,
    int          p
 ) {
    int stride = gridDim.x * blockDim.x;
    int tid    = blockDim.x * blockIdx.x + threadIdx.x;
 	for (int i = tid; i < (w + 2 * p) * (h + 2 * p) * n; i += stride) {
 		// if i is in the padding region
 		if (i < p * (h + 2 * p) * n || i >= (w + p) * (h + 2 * p) * n) {
 			d_padded[i] = 0.0f;
 		} else {
 			// if i is in the original region
 			d_padded[i] = d_input[(i - p * (h + 2 * p) * n) / (h + 2 * p) * w + (i - p * (h + 2 * p) * n) % (h + 2 * p)];
 		}
 	}
 }
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -1,7 +1,11 @@
 find_package(GTest REQUIRED)
 include_directories(${GTEST_INCLUDE_DIRS})
-add_executable(test_main layers/test_dense.cu kernels/test_activations.cu)
+add_executable(test_main 
    layers/test_dense.cu
    kernels/test_activations.cu
    kernels/test_padding.cu
 )
 add_library(test_utils
    test_utils/test_cublas_fixture.cu
--- a/test/kernels/test_padding.cu
+++ b/test/kernels/test_padding.cu
@@ -0,0 +1,82 @@
 #include <cuda_runtime_api.h>
 #include <gtest/gtest.h>
 #include <iostream>
 #include "padding.cuh"
 #include "test_cublas_fixture.cuh"
 class PaddingTest : public CublasTestFixture {
  protected:
    cudaError_t    cudaStatus;
    cublasStatus_t cublasStatus;
 };
 TEST_F(PaddingTest, SimplePaddingTest) {
    int w = 2;
    int h = 3;
    int n = 2;
    int p = 1;
    float* d_input;
    float* d_padded;
    int inputSize  = w * h * n;
    int paddedSize = (w + 2 * p) * (h + 2 * p) * n;
    cudaStatus = cudaMalloc((void**)&d_input, sizeof(float) * inputSize);
    EXPECT_EQ(cudaStatus, cudaSuccess);
    cudaStatus = cudaMalloc(
        (void**)&d_padded, sizeof(float) * paddedSize
    );
    EXPECT_EQ(cudaStatus, cudaSuccess);
    /*
    Matrix channel 0:
    0  1
    2  3
    4  5
    Matrix channel 1:
    6  7
    8  9
    10 11
    Represented as column major vector:
    0 2 4 1 3 5 6 8 10 7 9 11
    */
    std::vector<float> input = {0.0f, 2.0f, 4.0f,  1.0f, 3.0f, 5.0f,
                                6.0f, 8.0f, 10.0f, 7.0f, 9.0f, 11.0f};
    cublasStatus =
        cublasSetVector(inputSize, sizeof(float), input.data(), 1, d_input, 1);
    EXPECT_EQ(cublasStatus, CUBLAS_STATUS_SUCCESS);
    pad_matrix_kernel<<<1, 1>>>(d_input, d_padded, w, h, n, p);
    cudaStatus = cudaDeviceSynchronize();
    EXPECT_EQ(cudaStatus, cudaSuccess);
    std::vector<float> expectedOutput = {
        0.0f, 0.0f, 0.0f, 0.0f,  0.0f, 0.0f, 0.0f, 2.0f, 4.0f,  0.0f,
        0.0f, 1.0f, 3.0f, 5.0f,  0.0f, 0.0f, 0.0f, 0.0f, 0.0f,  0.0f,
        0.0f, 0.0f, 0.0f, 0.0f,  0.0f, 0.0f, 6.0f, 8.0f, 10.0f, 0.0f,
        0.0f, 7.0f, 9.0f, 11.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,  0.0f
    };
    std::vector<float> output(paddedSize);
    cublasStatus = cublasGetVector(
        paddedSize, sizeof(float), d_padded, 1, output.data(), 1
    );
    std::cout << "Actual output: " << std::endl;
    for (int i = 0; i < paddedSize; i++) {
        std::cout << output[i] << " ";
    }
    std::cout << std::endl;
    for (int i = 0; i < paddedSize; i++) {
        EXPECT_NEAR(expectedOutput[i], output[i], 1e-5);
    }
 }