From 019ccc33d922204b24ce732078c7dde6bbd60bee Mon Sep 17 00:00:00 2001
From: LordMathis <matus@namesny.com>
Date: Thu, 29 Feb 2024 22:21:48 +0100
Subject: [PATCH] Start implementing padding kernel

---
 CMakeLists.txt               |  1 +
 include/kernels/padding.cuh  | 13 ++++++
 src/kernels/padding.cu       | 57 +++++++++++++++++++++++++
 test/CMakeLists.txt          |  6 ++-
 test/kernels/test_padding.cu | 82 ++++++++++++++++++++++++++++++++++++
 5 files changed, 158 insertions(+), 1 deletion(-)
 create mode 100644 include/kernels/padding.cuh
 create mode 100644 src/kernels/padding.cu
 create mode 100644 test/kernels/test_padding.cu
diff --git a/CMakeLists.txt b/CMakeLists.txt
index d2daefd..ed802bb 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -11,6 +11,7 @@ include_directories(${CUDAToolkit_INCLUDE_DIRS})
 set(LIBRARY_SOURCES
     src/utils/cuda_helper.cu
     src/kernels/activations.cu
+    src/kernels/padding.cu
     src/layers/dense.cu
 )
 
diff --git a/include/kernels/padding.cuh b/include/kernels/padding.cuh
new file mode 100644
index 0000000..cba01ae
--- /dev/null
+++ b/include/kernels/padding.cuh
@@ -0,0 +1,13 @@
+#ifndef PADDING_H
+#define PADDING_H
+
+__global__ void pad_matrix_kernel(
+    const float* d_input,
+    float*       d_padded,
+    int          w,
+    int          h,
+    int          n,
+    int          p
+);
+
+#endif  // PADDING_H
\ No newline at end of file
diff --git a/src/kernels/padding.cu b/src/kernels/padding.cu
new file mode 100644
index 0000000..ca78008
--- /dev/null
+++ b/src/kernels/padding.cu
@@ -0,0 +1,57 @@
+/*
+Pads matrix width x height x n_channels to width + 2 * padding x height + 2 *
+padding x n_channels Matrix is represented as a pointer to column major vector
+
+For example:
+
+w = 2
+h = 3
+n = 2
+p = 1
+
+Channel 0:
+  0  1
+  2  3
+  4  5
+Channel 1:
+  6  7
+  8  9
+ 10 11
+
+Is represented as:
+
+0 2 4 1 3 5 6 8 10 7 9 11
+
+Padded result:
+
+0 0 0 0 0 0 0 2 4 0 0 1 3 5 0 0 0 0 0 0 0 0 0 0 0 0 6 8 10 0 0 7 9 11 0 0 0 0 0 0
+
+
+Args:
+  d_input: Pointer to input vector representing matrix
+  d_padded: Pointer to output vector representing padded matrix (needs to be
+pre-allocated) w: Width of input matrix h: Height of input matrix n: Number of
+channels in input matrix p: Padding
+*/
+__global__ void pad_matrix_kernel(
+    const float* d_input,
+    float*       d_padded,
+    int          w,
+    int          h,
+    int          n,
+    int          p
+) {
+    int stride = gridDim.x * blockDim.x;
+    int tid    = blockDim.x * blockIdx.x + threadIdx.x;
+
+	for (int i = tid; i < (w + 2 * p) * (h + 2 * p) * n; i += stride) {
+		
+		// if i is in the padding region
+		if (i < p * (h + 2 * p) * n || i >= (w + p) * (h + 2 * p) * n) {
+			d_padded[i] = 0.0f;
+		} else {
+			// if i is in the original region
+			d_padded[i] = d_input[(i - p * (h + 2 * p) * n) / (h + 2 * p) * w + (i - p * (h + 2 * p) * n) % (h + 2 * p)];
+		}
+	}
+}
\ No newline at end of file
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 5813516..dff831d 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -1,7 +1,11 @@
 find_package(GTest REQUIRED)
 include_directories(${GTEST_INCLUDE_DIRS})
 
-add_executable(test_main layers/test_dense.cu kernels/test_activations.cu)
+add_executable(test_main 
+    layers/test_dense.cu
+    kernels/test_activations.cu
+    kernels/test_padding.cu
+)
 
 add_library(test_utils
     test_utils/test_cublas_fixture.cu
diff --git a/test/kernels/test_padding.cu b/test/kernels/test_padding.cu
new file mode 100644
index 0000000..c8b89af
--- /dev/null
+++ b/test/kernels/test_padding.cu
@@ -0,0 +1,82 @@
+#include <cuda_runtime_api.h>
+#include <gtest/gtest.h>
+
+#include <iostream>
+
+#include "padding.cuh"
+#include "test_cublas_fixture.cuh"
+
+class PaddingTest : public CublasTestFixture {
+  protected:
+    cudaError_t    cudaStatus;
+    cublasStatus_t cublasStatus;
+};
+
+TEST_F(PaddingTest, SimplePaddingTest) {
+    int w = 2;
+    int h = 3;
+    int n = 2;
+    int p = 1;
+
+    float* d_input;
+    float* d_padded;
+
+    int inputSize  = w * h * n;
+    int paddedSize = (w + 2 * p) * (h + 2 * p) * n;
+
+    cudaStatus = cudaMalloc((void**)&d_input, sizeof(float) * inputSize);
+    EXPECT_EQ(cudaStatus, cudaSuccess);
+
+    cudaStatus = cudaMalloc(
+        (void**)&d_padded, sizeof(float) * paddedSize
+    );
+    EXPECT_EQ(cudaStatus, cudaSuccess);
+
+    /*
+    Matrix channel 0:
+    0  1
+    2  3
+    4  5
+    Matrix channel 1:
+    6  7
+    8  9
+    10 11
+
+    Represented as column major vector:
+
+    0 2 4 1 3 5 6 8 10 7 9 11
+    */
+
+    std::vector<float> input = {0.0f, 2.0f, 4.0f,  1.0f, 3.0f, 5.0f,
+                                6.0f, 8.0f, 10.0f, 7.0f, 9.0f, 11.0f};
+
+    cublasStatus =
+        cublasSetVector(inputSize, sizeof(float), input.data(), 1, d_input, 1);
+    EXPECT_EQ(cublasStatus, CUBLAS_STATUS_SUCCESS);
+
+    pad_matrix_kernel<<<1, 1>>>(d_input, d_padded, w, h, n, p);
+    cudaStatus = cudaDeviceSynchronize();
+    EXPECT_EQ(cudaStatus, cudaSuccess);
+
+    std::vector<float> expectedOutput = {
+        0.0f, 0.0f, 0.0f, 0.0f,  0.0f, 0.0f, 0.0f, 2.0f, 4.0f,  0.0f,
+        0.0f, 1.0f, 3.0f, 5.0f,  0.0f, 0.0f, 0.0f, 0.0f, 0.0f,  0.0f,
+        0.0f, 0.0f, 0.0f, 0.0f,  0.0f, 0.0f, 6.0f, 8.0f, 10.0f, 0.0f,
+        0.0f, 7.0f, 9.0f, 11.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,  0.0f
+    };
+
+    std::vector<float> output(paddedSize);
+    cublasStatus = cublasGetVector(
+        paddedSize, sizeof(float), d_padded, 1, output.data(), 1
+    );
+
+    std::cout << "Actual output: " << std::endl;
+    for (int i = 0; i < paddedSize; i++) {
+        std::cout << output[i] << " ";
+    }
+    std::cout << std::endl;
+
+    for (int i = 0; i < paddedSize; i++) {
+        EXPECT_NEAR(expectedOutput[i], output[i], 1e-5);
+    }
+}
\ No newline at end of file