diff --git a/include/kernels/convolution.cuh b/include/kernels/convolution.cuh index 22380a2..89861f1 100644 --- a/include/kernels/convolution.cuh +++ b/include/kernels/convolution.cuh @@ -3,25 +3,6 @@ namespace CUDANet::Kernels { -/** - * @brief Kernel that pads the input matrix with zeros - * - * @param d_input Device pointer to the input matrix (as vector) - * @param d_padded Device pointer to the padded matrix (as vector) - * @param w Width of the input matrix - * @param h Height of the input matrix - * @param n Number of input channels - * @param p Padding size - */ -__global__ void padding( - const float* __restrict__ d_input, - float* __restrict__ d_padded, - const unsigned int w, - const unsigned int h, - const unsigned int n, - const unsigned int p -); - /** * @brief Convolution kernel * diff --git a/include/layers/conv2d.cuh b/include/layers/conv2d.cuh index 53a06fd..ab80e64 100644 --- a/include/layers/conv2d.cuh +++ b/include/layers/conv2d.cuh @@ -105,7 +105,6 @@ class Conv2d : public ILayer { float* d_output; float* d_weights; float* d_biases; - float* d_padded; // Kernels Layers::Activation activation; diff --git a/src/kernels/convolution.cu b/src/kernels/convolution.cu index d25adb3..7a80482 100644 --- a/src/kernels/convolution.cu +++ b/src/kernels/convolution.cu @@ -2,83 +2,6 @@ #include "convolution.cuh" -/* -Pads matrix width x height x n_channels to width + 2 * padding x height + 2 * -padding x n_channels Matrix is represented as a pointer to a vector - -For example: - -w = 2 -h = 3 -n = 2 -p = 1 - -Channel 0: - 0 1 - 2 3 - 4 5 -Channel 1: - 6 7 - 8 9 - 10 11 - -Is represented as: - -0 1 2 3 4 5 6 7 8 9 10 11 - -Padded result (as a continuous vector): - -0.0f, 0.0f, 0.0f, 0.0f, -0.0f, 0.0f, 1.0f, 0.0f, -0.0f, 2.0f, 3.0f, 0.0f, -0.0f, 4.0f, 5.0f, 0.0f, -0.0f, 0.0f, 0.0f, 0.0f, -0.0f, 0.0f, 0.0f, 0.0f, -0.0f, 6.0f, 7.0f, 0.0f, -0.0f, 8.0f, 9.0f, 0.0f, -9.0f, 10.0f, 11.0f, 0.0f, -0.0f, 0.0f, 0.0f, 0.0f - -Args: - d_input: Pointer to input vector representing matrix - d_padded: Pointer to output vector representing padded matrix (needs to be -pre-allocated) - w: Width of input matrix - h: Height of input matrix - n: Number of channels in input matrix - p: Padding -*/ -__global__ void CUDANet::Kernels::padding( - const float* __restrict__ d_input, - float* __restrict__ d_padded, - const unsigned int w, - const unsigned int h, - const unsigned int n, - const unsigned int p -) { - int tid = blockDim.x * blockIdx.x + threadIdx.x; - - if (tid >= (w + 2 * p) * (h + 2 * p) * n) { - return; - } - - int idx = tid; - - // unravel index into padded matrix - int i_n = idx / ((w + 2 * p) * (h + 2 * p)); - int i_h = idx % ((w + 2 * p) * (h + 2 * p)) / (w + 2 * p); - int i_w = idx % (w + 2 * p); - - // if i is in the padding region - if (i_w < p || i_w >= (w + p) || i_h < p || i_h >= (h + p)) { - d_padded[tid] = 0.0f; - } else { - // Get index into input vector - int i_orig = i_n * w * h + (i_h - p) * w + (i_w - p); - d_padded[tid] = d_input[i_orig]; - } -} - __global__ void CUDANet::Kernels::convolution( const float* __restrict__ d_input, const float* __restrict__ d_kernel, @@ -108,12 +31,21 @@ __global__ void CUDANet::Kernels::convolution( for (int c = 0; c < nChannels; c++) { for (int k = 0; k < kernelSize; k++) { for (int l = 0; l < kernelSize; l++) { + + // if i, j is in the padding region + if (i * stride + k < paddingSize || + i * stride + k >= (inputSize + paddingSize) || + j * stride + l < paddingSize || + j * stride + l >= (inputSize + paddingSize)) { + continue; + } + int kernelIndex = f * kernelSize * kernelSize * nChannels + c * kernelSize * kernelSize + k * kernelSize + l; int inputIndex = c * inputSize * inputSize + - (i * stride + k) * inputSize + - (j * stride + l); + (i * stride + k - paddingSize) * inputSize + + (j * stride + l - paddingSize); sum += d_kernel[kernelIndex] * d_input[inputIndex]; } diff --git a/src/layers/conv2d.cu b/src/layers/conv2d.cu index 065185f..719ec9e 100644 --- a/src/layers/conv2d.cu +++ b/src/layers/conv2d.cu @@ -23,6 +23,7 @@ Layers::Conv2d::Conv2d( kernelSize(kernelSize), stride(stride), numFilters(numFilters) { + switch (padding) { case SAME: outputSize = inputSize; @@ -64,12 +65,6 @@ Layers::Conv2d::Conv2d( (void**)&d_biases, sizeof(float) * outputSize * outputSize * numFilters )); - d_padded = nullptr; - CUDA_CHECK(cudaMalloc( - (void**)&d_padded, sizeof(float) * (inputSize + 2 * paddingSize) * - (inputSize + 2 * paddingSize) * inputChannels - )); - toCuda(); } @@ -77,7 +72,6 @@ Layers::Conv2d::~Conv2d() { cudaFree(d_output); cudaFree(d_weights); cudaFree(d_biases); - cudaFree(d_padded); } void Layers::Conv2d::initializeWeights() { @@ -113,18 +107,10 @@ void Layers::Conv2d::toCuda() { } float* Layers::Conv2d::forward(const float* d_input) { - // Pad input - int THREADS_PER_BLOCK = (inputSize + 2 * paddingSize) * - (inputSize + 2 * paddingSize) * inputChannels; - - Kernels::padding<<<1, THREADS_PER_BLOCK>>>( - d_input, d_padded, inputSize, inputSize, inputChannels, paddingSize - ); - // Convolve - THREADS_PER_BLOCK = outputSize * outputSize * numFilters; + int THREADS_PER_BLOCK = outputSize * outputSize * numFilters; Kernels::convolution<<<1, THREADS_PER_BLOCK>>>( - d_padded, d_weights, d_output, inputSize + 2 * paddingSize, inputChannels, paddingSize, + d_input, d_weights, d_output, inputSize, inputChannels, paddingSize, kernelSize, stride, numFilters, outputSize ); diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 666b5f2..27a4b6f 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -9,7 +9,6 @@ add_executable(test_main layers/test_dense.cu layers/test_input.cu kernels/test_activation_functions.cu - kernels/test_padding.cu kernels/test_matmul.cu ) diff --git a/test/kernels/test_padding.cu b/test/kernels/test_padding.cu deleted file mode 100644 index 7e04cb0..0000000 --- a/test/kernels/test_padding.cu +++ /dev/null @@ -1,92 +0,0 @@ -#include -#include - -#include - -#include "convolution.cuh" - -TEST(PaddingTest, SimplePaddingTest) { - cudaError_t cudaStatus; - - int w = 2; - int h = 3; - int n = 2; - int p = 1; - - float* d_input; - float* d_padded; - - int inputSize = w * h * n; - int paddedSize = (w + 2 * p) * (h + 2 * p) * n; - - cudaStatus = cudaMalloc((void**)&d_input, sizeof(float) * inputSize); - EXPECT_EQ(cudaStatus, cudaSuccess); - - cudaStatus = cudaMalloc((void**)&d_padded, sizeof(float) * paddedSize); - EXPECT_EQ(cudaStatus, cudaSuccess); - - /* - Matrix channel 0: - 0 1 - 2 3 - 4 5 - Matrix channel 1: - 6 7 - 8 9 - 10 11 - - Represented as a vector: - - 0 1 2 3 4 5 6 7 8 9 10 11 - */ - - std::vector input = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, - 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f}; - - cudaStatus = cudaMemcpy( - d_input, input.data(), sizeof(float) * inputSize, cudaMemcpyHostToDevice - ); - EXPECT_EQ(cudaStatus, cudaSuccess); - - int THREADS_PER_BLOCK = 64; - int BLOCKS = paddedSize / THREADS_PER_BLOCK + 1; - - CUDANet::Kernels::padding<<>>( - d_input, d_padded, w, h, n, p - ); - cudaStatus = cudaDeviceSynchronize(); - EXPECT_EQ(cudaStatus, cudaSuccess); - - // clang-format off - std::vector expectedOutput = { - // channel 0 - 0.0f, 0.0f, 0.0f, 0.0f, - 0.0f, 0.0f, 1.0f, 0.0f, - 0.0f, 2.0f, 3.0f, 0.0f, - 0.0f, 4.0f, 5.0f, 0.0f, - 0.0f, 0.0f, 0.0f, 0.0f, - // channel 1 - 0.0f, 0.0f, 0.0f, 0.0f, - 0.0f, 6.0f, 7.0f, 0.0f, - 0.0f, 8.0f, 9.0f, 0.0f, - 0.0f, 10.0f, 11.0f, 0.0f, - 0.0f, 0.0f, 0.0f, 0.0f - }; - // clang-format on - - std::vector output(paddedSize); - - cudaStatus = cudaMemcpy( - output.data(), d_padded, sizeof(float) * paddedSize, - cudaMemcpyDeviceToHost - ); - EXPECT_EQ(cudaStatus, cudaSuccess); - - for (int i = 0; i < paddedSize; i++) { - EXPECT_NEAR(expectedOutput[i], output[i], 1e-5); - } - - - cudaFree(d_input); - cudaFree(d_padded); -}