diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5c25fb7..e1382f7 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -12,8 +12,7 @@ set(LIBRARY_SOURCES
     src/utils/cuda_helper.cu
     src/kernels/activations.cu
     src/kernels/convolution.cu
-    src/kernels/padding.cu
-    src/kernels/matrix_math.cu
+    src/kernels/matmul.cu
     src/layers/dense.cu
     src/layers/conv2d.cu
 )
diff --git a/include/kernels/activations.cuh b/include/kernels/activations.cuh
index cbd91a2..34ff65d 100644
--- a/include/kernels/activations.cuh
+++ b/include/kernels/activations.cuh
@@ -1,19 +1,14 @@
 #ifndef ACTIVATIONS_H
 #define ACTIVATIONS_H

-__global__ void
-sigmoid_kernel(const float* __restrict__ src, float* __restrict__ dst, int len);
+namespace Kernels {

 __global__ void
-relu_kernel(const float* __restrict__ src, float* __restrict__ dst, int len);
+sigmoid(const float* __restrict__ src, float* __restrict__ dst, int len);

 __global__ void
-linear_kernel(const float* __restrict__ src, float* __restrict__ dst, int len);
+relu(const float* __restrict__ src, float* __restrict__ dst, int len);

-enum Activation {
-    SIGMOID,
-    RELU,
-    LINEAR
-};
+} // namespace Kernels

 #endif // ACTIVATIONS_H
\ No newline at end of file
diff --git a/include/kernels/convolution.cuh b/include/kernels/convolution.cuh
index d7d009a..0332320 100644
--- a/include/kernels/convolution.cuh
+++ b/include/kernels/convolution.cuh
@@ -1,7 +1,18 @@
 #ifndef CONVOLUTION_H
 #define CONVOLUTION_H

-__global__ void convolution_kernel(
+namespace Kernels {
+
+__global__ void padding(
+    const float* d_input,
+    float* d_padded,
+    int w,
+    int h,
+    int n,
+    int p
+);
+
+__global__ void convolution(
     const float* d_input,
     const float* d_kernel,
     float* d_output,
@@ -13,4 +24,6 @@ __global__ void convolution_kernel(
     int outputSize
 );

+} // namespace Kernels
+
 #endif // CONVOLUTION_H
\ No newline at end of file
diff --git a/include/kernels/matrix_math.cuh b/include/kernels/matmul.cuh
similarity index 59%
rename from include/kernels/matrix_math.cuh
rename to include/kernels/matmul.cuh
index 20e0956..e8dd782 100644
--- a/include/kernels/matrix_math.cuh
+++ b/include/kernels/matmul.cuh
@@ -1,7 +1,9 @@
-#ifndef MATRIX_MATH_H
-#define MATRIX_MATH_H
+#ifndef MATMUL_H
+#define MATMUL_H

-__global__ void mat_vec_mul_kernel(
+namespace Kernels {
+
+__global__ void mat_vec_mul(
     const float* d_matrix,
     const float* d_vector,
     float* d_output,
@@ -9,11 +11,13 @@ __global__ void mat_vec_mul_kernel(
     int h
 );

-__global__ void vec_vec_add_kernel(
+__global__ void vec_vec_add(
     const float* d_vector1,
     const float* d_vector2,
     float* d_output,
     int w
 );

-#endif // MATRIX_MATH_H
\ No newline at end of file
+} // namespace Kernels
+
+#endif // MATMUL_H
\ No newline at end of file
diff --git a/include/kernels/padding.cuh b/include/kernels/padding.cuh
deleted file mode 100644
index ccd7403..0000000
--- a/include/kernels/padding.cuh
+++ /dev/null
@@ -1,18 +0,0 @@
-#ifndef PADDING_H
-#define PADDING_H
-
-__global__ void pad_matrix_kernel(
-    const float* d_input,
-    float* d_padded,
-    int w,
-    int h,
-    int n,
-    int p
-);
-
-enum Padding {
-    SAME,
-    VALID
-};
-
-#endif // PADDING_H
\ No newline at end of file
diff --git a/include/layers/conv2d.cuh b/include/layers/conv2d.cuh
index 63e29ca..5650afa 100644
--- a/include/layers/conv2d.cuh
+++ b/include/layers/conv2d.cuh
@@ -5,7 +5,7 @@
 #include

 #include "activations.cuh"
-#include "padding.cuh"
+#include "convolution.cuh"
 #include "ilayer.cuh"

 namespace Layers {
@@ -13,13 +13,13 @@ namespace Layers {
 class Conv2d : public ILayer {
   public:
     Conv2d(
-        int        inputSize,
-        int        inputChannels,
-        int        kernelSize,
-        int        stride,
-        Padding    padding,
-        int        numFilters,
-        Activation activation
+        int                inputSize,
+        int                inputChannels,
+        int                kernelSize,
+        int                stride,
+        Layers::Padding    padding,
+        int                numFilters,
+        Layers::Activation activation
     );
     ~Conv2d();

@@ -52,7 +52,7 @@ class Conv2d : public ILayer {
     float* d_padded;

     // Kernels
-    Activation activation;
+    Layers::Activation activation;

     void initializeWeights();
     void initializeBiases();
diff --git a/include/layers/dense.cuh b/include/layers/dense.cuh
index 47d650f..48b6a88 100644
--- a/include/layers/dense.cuh
+++ b/include/layers/dense.cuh
@@ -14,7 +14,7 @@ class Dense : public ILayer {
     Dense(
         int inputSize,
         int outputSize,
-        Activation activation
+        Layers::Activation activation
     );
     ~Dense();

@@ -32,7 +32,7 @@ class Dense : public ILayer {
     std::vector<float> weights;
     std::vector<float> biases;

-    Activation activation;
+    Layers::Activation activation;

     void initializeWeights();
     void initializeBiases();
diff --git a/include/layers/ilayer.cuh b/include/layers/ilayer.cuh
index 7124a46..6738cb8 100644
--- a/include/layers/ilayer.cuh
+++ b/include/layers/ilayer.cuh
@@ -6,6 +6,17 @@

 namespace Layers {

+enum Activation {
+    SIGMOID,
+    RELU,
+    NONE
+};
+
+enum Padding {
+    SAME,
+    VALID
+};
+
 class ILayer {
   public:
     virtual ~ILayer() {}
@@ -29,7 +40,7 @@ class ILayer {
     std::vector<float> weights;
     std::vector<float> biases;

-    Activation activation;
+    Layers::Activation activation;
 };

 } // namespace Layers
diff --git a/src/kernels/activations.cu b/src/kernels/activations.cu
index 0ceffd7..7fbbf74 100644
--- a/src/kernels/activations.cu
+++ b/src/kernels/activations.cu
@@ -2,7 +2,7 @@

 #include "activations.cuh"

-__global__ void sigmoid_kernel(
+__global__ void Kernels::sigmoid(
     const float* __restrict__ src,
     float* __restrict__ dst,
     int len
@@ -16,7 +16,7 @@
 }

 __global__ void
-relu_kernel(const float* __restrict__ src, float* __restrict__ dst, int len) {
+Kernels::relu(const float* __restrict__ src, float* __restrict__ dst, int len) {
     int stride = gridDim.x * blockDim.x;
     int tid    = blockDim.x * blockIdx.x + threadIdx.x;

diff --git a/src/kernels/convolution.cu b/src/kernels/convolution.cu
index b3068e6..27daefc 100644
--- a/src/kernels/convolution.cu
+++ b/src/kernels/convolution.cu
@@ -1,7 +1,84 @@
 #include "convolution.cuh"
 #include

-__global__ void convolution_kernel(
+/*
+Pads a matrix of width x height x n_channels to (width + 2 * padding) x
+(height + 2 * padding) x n_channels. The matrix is represented as a pointer to a vector.
+
+For example:
+
+w = 2
+h = 3
+n = 2
+p = 1
+
+Channel 0:
+  0 1
+  2 3
+  4 5
+Channel 1:
+  6 7
+  8 9
+  10 11
+
+Is represented as:
+
+0 1 2 3 4 5 6 7 8 9 10 11
+
+Padded result (as a continuous vector):
+
+0.0f, 0.0f, 0.0f, 0.0f,
+0.0f, 0.0f, 1.0f, 0.0f,
+0.0f, 2.0f, 3.0f, 0.0f,
+0.0f, 4.0f, 5.0f, 0.0f,
+0.0f, 0.0f, 0.0f, 0.0f,
+0.0f, 0.0f, 0.0f, 0.0f,
+0.0f, 6.0f, 7.0f, 0.0f,
+0.0f, 8.0f, 9.0f, 0.0f,
+0.0f, 10.0f, 11.0f, 0.0f,
+0.0f, 0.0f, 0.0f, 0.0f
+
+Args:
+    d_input: Pointer to input vector representing the matrix
+    d_padded: Pointer to output vector representing the padded matrix (needs to be
+pre-allocated)
+    w: Width of input matrix
+    h: Height of input matrix
+    n: Number of channels in input matrix
+    p: Padding
+*/
+__global__ void Kernels::padding(
+    const float* d_input,
+    float* d_padded,
+    int w,
+    int h,
+    int n,
+    int p
+) {
+    int tid = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (tid >= (w + 2 * p) * (h + 2 * p) * n) {
+        return;
+    }
+
+    int idx = tid;
+
+    // unravel index into padded matrix
+    int i_n = idx / ((w + 2 * p) * (h + 2 * p));
+    int i_h = idx % ((w + 2 * p) * (h + 2 * p)) / (w + 2 * p);
+    int i_w = idx % (w + 2 * p);
+
+    // if i is in the padding region
+    if (i_w < p || i_w >= (w + p) || i_h < p || i_h >= (h + p)) {
+        d_padded[tid] = 0.0f;
+    } else {
+        // Get index into input vector
+        int i_orig = i_n * w * h + (i_h - p) * w + (i_w - p);
+        d_padded[tid] = d_input[i_orig];
+    }
+}
+
+__global__ void Kernels::convolution(
     const float* d_input,
     const float* d_kernel,
     float* d_output,
diff --git a/src/kernels/matrix_math.cu b/src/kernels/matmul.cu
similarity index 86%
rename from src/kernels/matrix_math.cu
rename to src/kernels/matmul.cu
index 01889e9..1359bb3 100644
--- a/src/kernels/matrix_math.cu
+++ b/src/kernels/matmul.cu
@@ -1,6 +1,6 @@
-#include "matrix_math.cuh"
+#include "matmul.cuh"

-__global__ void mat_vec_mul_kernel(
+__global__ void Kernels::mat_vec_mul(
     const float* d_matrix,
     const float* d_vector,
     float* d_output,
@@ -22,7 +22,7 @@ __global__ void mat_vec_mul_kernel(

 }

-__global__ void vec_vec_add_kernel(
+__global__ void Kernels::vec_vec_add(
     const float* d_vector1,
     const float* d_vector2,
     float* d_output,
diff --git a/src/kernels/padding.cu b/src/kernels/padding.cu
deleted file mode 100644
index 79f1ac1..0000000
--- a/src/kernels/padding.cu
+++ /dev/null
@@ -1,78 +0,0 @@
-#include
-
-/*
-Pads matrix width x height x n_channels to width + 2 * padding x height + 2 *
-padding x n_channels Matrix is represented as a pointer to a vector
-
-For example:
-
-w = 2
-h = 3
-n = 2
-p = 1
-
-Channel 0:
-  0 1
-  2 3
-  4 5
-Channel 1:
-  6 7
-  8 9
-  10 11
-
-Is represented as:
-
-0 1 2 3 4 5 6 7 8 9 10 11
-
-Padded result (as a continuous vector):
-
-0.0f, 0.0f, 0.0f, 0.0f,
-0.0f, 0.0f, 1.0f, 0.0f,
-0.0f, 2.0f, 3.0f, 0.0f,
-0.0f, 4.0f, 5.0f, 0.0f,
-0.0f, 0.0f, 0.0f, 0.0f,
-0.0f, 0.0f, 0.0f, 0.0f,
-0.0f, 6.0f, 7.0f, 0.0f,
-0.0f, 8.0f, 9.0f, 0.0f,
-9.0f, 10.0f, 11.0f, 0.0f,
-0.0f, 0.0f, 0.0f, 0.0f
-
-Args:
-    d_input: Pointer to input vector representing matrix
-    d_padded: Pointer to output vector representing padded matrix (needs to be
-pre-allocated)
-    w: Width of input matrix
-    h: Height of input matrix
-    n: Number of channels in input matrix
-    p: Padding
-*/
-__global__ void pad_matrix_kernel(
-    const float* d_input,
-    float* d_padded,
-    int w,
-    int h,
-    int n,
-    int p
-) {
-    int tid = blockDim.x * blockIdx.x + threadIdx.x;
-
-    if (tid >= (w + 2 * p) * (h + 2 * p) * n) {
-        return;
-    }
-
-    int idx = tid;
-
-    // unravel index into padded matrix
-    int i_n = idx / ((w + 2 * p) * (h + 2 * p));
-    int i_h = idx % ((w + 2 * p) * (h + 2 * p)) / (w + 2 * p);
-    int i_w = idx % (w + 2 * p);
-
-    // if i is in the padding region
-    if (i_w < p || i_w >= (w + p) || i_h < p || i_h >= (h + p)) {
-        d_padded[tid] = 0.0f;
-    } else {
-        // Get index into input vector
-        int i_orig = i_n * w * h + (i_h - p) * w + (i_w - p);
-        d_padded[tid] = d_input[i_orig];
-    }
-}
diff --git a/src/layers/conv2d.cu b/src/layers/conv2d.cu
index fdfd27e..4518c01 100644
--- a/src/layers/conv2d.cu
+++ b/src/layers/conv2d.cu
@@ -5,17 +5,16 @@
 #include "conv2d.cuh"
 #include "convolution.cuh"
 #include "cuda_helper.cuh"
-#include "matrix_math.cuh"
-#include "padding.cuh"
+#include "matmul.cuh"

 Layers::Conv2d::Conv2d(
-    int        inputSize,
-    int        inputChannels,
-    int        kernelSize,
-    int        stride,
-    Padding    padding,
-    int        numFilters,
-    Activation activation
+    int                inputSize,
+    int                inputChannels,
+    int                kernelSize,
+    int                stride,
+    Layers::Padding    padding,
+    int                numFilters,
+    Layers::Activation activation
 )
     : inputSize(inputSize),
       inputChannels(inputChannels),
@@ -23,21 +22,19 @@ Layers::Conv2d::Conv2d(
       stride(stride),
       numFilters(numFilters),
       activation(activation) {
+    switch (padding) {
+        case SAME:
+            outputSize = inputSize;
+            paddingSize = ((stride - 1) * inputSize - stride + kernelSize) / 2;
+            break;

-    switch (padding)
-    {
-    case SAME:
-        outputSize = inputSize;
-        paddingSize = ((stride - 1) * inputSize - stride + kernelSize) / 2;
-        break;
+        case VALID:
+            paddingSize = 0;
+            outputSize = (inputSize - kernelSize) / stride + 1;
+            break;

-    case VALID:
-        paddingSize = 0;
-        outputSize = (inputSize - kernelSize) / stride + 1;
-        break;
-
-    default:
-        break;
+        default:
+            break;
     }

     weights.resize(kernelSize * kernelSize * inputChannels * numFilters);
@@ -109,19 +106,19 @@ void Layers::Conv2d::forward(const float* d_input, float* d_output) {

     int THREADS_PER_BLOCK = (inputSize + 2 * paddingSize) *
                             (inputSize + 2 * paddingSize) * inputChannels;
-    pad_matrix_kernel<<<1, THREADS_PER_BLOCK>>>(
+    Kernels::padding<<<1, THREADS_PER_BLOCK>>>(
         d_input, d_padded, inputSize, inputSize, inputChannels, paddingSize
     );

     // Convolve
     THREADS_PER_BLOCK = outputSize * outputSize * numFilters;
-    convolution_kernel<<<1, THREADS_PER_BLOCK>>>(
+    Kernels::convolution<<<1, THREADS_PER_BLOCK>>>(
         d_padded, d_weights, d_output, inputSize + (2 * paddingSize),
         inputChannels, kernelSize, stride, numFilters, outputSize
     );

     // Add bias
-    vec_vec_add_kernel<<<1, biases.size()>>>(
+    Kernels::vec_vec_add<<<1, biases.size()>>>(
         d_biases, d_output, d_output, biases.size()
     );

@@ -138,8 +135,7 @@ outputSize x numFilters
 */
 void Layers::Conv2d::host_conv(const float* input, float* output) {
     // Iterate over output matrix
-    for (int tid = 0; tid < outputSize * outputSize * numFilters; tid++)
-    {
+    for (int tid = 0; tid < outputSize * outputSize * numFilters; tid++) {
         // Get output index
         int f = tid / (outputSize * outputSize);
         int i = tid % (outputSize * outputSize) / outputSize;
@@ -153,19 +149,17 @@ void Layers::Conv2d::host_conv(const float* input, float* output) {
                 for (int c = 0; c < inputChannels; c++) {
                     int kernelIndex = f * kernelSize * kernelSize * inputChannels +
-                        c * kernelSize * kernelSize + k * kernelSize +
-                        l;
+                                      c * kernelSize * kernelSize + k * kernelSize + l;

                     int inputIndex = c * inputSize * inputSize +
-                        (i * stride + k) * inputSize +
-                        (j * stride + l);
+                                     (i * stride + k) * inputSize +
+                                     (j * stride + l);

                     sum += weights[kernelIndex] * input[inputIndex];
                 }
             }
         }

-        int outputIndex =
-            f * outputSize * outputSize + i * outputSize + j;
+        int outputIndex = f * outputSize * outputSize + i * outputSize + j;

         output[outputIndex] = sum;
     }
diff --git a/src/layers/dense.cu b/src/layers/dense.cu
index c888692..4361277 100644
--- a/src/layers/dense.cu
+++ b/src/layers/dense.cu
@@ -8,9 +8,9 @@
 #include "activations.cuh"
 #include "cuda_helper.cuh"
 #include "dense.cuh"
-#include "matrix_math.cuh"
+#include "matmul.cuh"

-Layers::Dense::Dense(int inputSize, int outputSize, Activation activation)
+Layers::Dense::Dense(int inputSize, int outputSize, Layers::Activation activation)
     : inputSize(inputSize), outputSize(outputSize), activation(activation) {
     // Allocate memory for weights and biases
     weights.resize(outputSize * inputSize);
@@ -46,21 +46,21 @@ void Layers::Dense::forward(const float* d_input, float* d_output) {
-    mat_vec_mul_kernel<<<1, outputSize>>>(
+    Kernels::mat_vec_mul<<<1, outputSize>>>(
         d_weights, d_input, d_output, inputSize, outputSize
     );

-    vec_vec_add_kernel<<<1, outputSize>>>(
+    Kernels::vec_vec_add<<<1, outputSize>>>(
         d_biases, d_output, d_output, outputSize
     );

     switch (activation) {
         case SIGMOID:
-            sigmoid_kernel<<<1, outputSize>>>(d_output, d_output, outputSize);
+            Kernels::sigmoid<<<1, outputSize>>>(d_output, d_output, outputSize);
             break;

         case RELU:
-            relu_kernel<<<1, outputSize>>>(d_output, d_output, outputSize);
+            Kernels::relu<<<1, outputSize>>>(d_output, d_output, outputSize);
             break;

         default:
diff --git a/test/kernels/test_activations.cu b/test/kernels/test_activations.cu
index 0818081..d1d44ae 100644
--- a/test/kernels/test_activations.cu
+++ b/test/kernels/test_activations.cu
@@ -25,7 +25,7 @@ TEST(ActivationsTest, SigmoidSanityCheck) {
     cudaStatus = cudaMemcpy(d_input, input, sizeof(float) * 3, cudaMemcpyHostToDevice);
     EXPECT_EQ(cudaStatus, cudaSuccess);

-    sigmoid_kernel<<<1, 3>>>(d_input, d_output, 3);
+    Kernels::sigmoid<<<1, 3>>>(d_input, d_output, 3);

     cudaStatus = cudaDeviceSynchronize();
     EXPECT_EQ(cudaStatus, cudaSuccess);
diff --git a/test/kernels/test_padding.cu b/test/kernels/test_padding.cu
index 8038245..aadec4c 100644
--- a/test/kernels/test_padding.cu
+++ b/test/kernels/test_padding.cu
@@ -3,7 +3,7 @@

 #include

-#include "padding.cuh"
+#include "convolution.cuh"

 TEST(PaddingTest, SimplePaddingTest) {
     cudaError_t cudaStatus;
@@ -51,7 +51,7 @@ TEST(PaddingTest, SimplePaddingTest) {
     int THREADS_PER_BLOCK = 64;
     int BLOCKS = paddedSize / THREADS_PER_BLOCK + 1;

-    pad_matrix_kernel<<<BLOCKS, THREADS_PER_BLOCK>>>(
+    Kernels::padding<<<BLOCKS, THREADS_PER_BLOCK>>>(
         d_input, d_padded, w, h, n, p
     );
     cudaStatus = cudaDeviceSynchronize();
diff --git a/test/layers/test_conv2d.cu b/test/layers/test_conv2d.cu
index 33ab730..3cf8552 100644
--- a/test/layers/test_conv2d.cu
+++ b/test/layers/test_conv2d.cu
@@ -12,9 +12,9 @@ class Conv2dTest : public ::testing::Test {
         int inputChannels,
         int kernelSize,
         int stride,
-        Padding padding,
+        Layers::Padding padding,
         int numFilters,
-        Activation activation,
+        Layers::Activation activation,
         std::vector<float>& input,
         float* kernels,
         float*& d_input,
@@ -65,9 +65,9 @@ TEST_F(Conv2dTest, SimpleTest) {
     int inputChannels = 1;
     int kernelSize = 2;
     int stride = 1;
-    Padding padding = VALID;
+    Layers::Padding padding = Layers::Padding::VALID;
     int numFilters = 1;
-    Activation activation = LINEAR;
+    Layers::Activation activation = Layers::Activation::NONE;

     std::vector<float> input = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f,
                                 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f,
@@ -114,9 +114,9 @@ TEST_F(Conv2dTest, ComplexTest) {
     int inputChannels = 3;
     int kernelSize = 3;
     int stride = 1;
-    Padding padding = SAME;
+    Layers::Padding padding = Layers::Padding::SAME;
     int numFilters = 2;
-    Activation activation = LINEAR;
+    Layers::Activation activation = Layers::Activation::NONE;

     // clang-format off
     std::vector<float> input = {
diff --git a/test/layers/test_dense.cu b/test/layers/test_dense.cu
index e1b78d3..d5bd6d2 100644
--- a/test/layers/test_dense.cu
+++ b/test/layers/test_dense.cu
@@ -16,7 +16,7 @@ class DenseLayerTest : public ::testing::Test {
         float* biases,
         float*& d_input,
         float*& d_output,
-        Activation activation
+        Layers::Activation activation
     ) {
         // Create Dense layer
         Layers::Dense denseLayer(inputSize, outputSize, activation);
@@ -57,7 +57,9 @@ TEST_F(DenseLayerTest, Init) {
             int inputSize = i;
             int outputSize = j;

-            Layers::Dense denseLayer(inputSize, outputSize, SIGMOID);
+            Layers::Dense denseLayer(
+                inputSize, outputSize, Layers::Activation::SIGMOID
+            );
         }
     }
 }
@@ -76,7 +78,9 @@ TEST_F(DenseLayerTest, setWeights) {
     };
     // clang-format on

-    Layers::Dense denseLayer(inputSize, outputSize, SIGMOID);
+    Layers::Dense denseLayer(
+        inputSize, outputSize, Layers::Activation::SIGMOID
+    );

     denseLayer.setWeights(weights.data());
 }
@@ -102,7 +106,7 @@ TEST_F(DenseLayerTest, ForwardUnitWeightMatrixLinear) {
     Layers::Dense denseLayer = commonTestSetup(
         inputSize, outputSize, input, weights.data(), biases.data(), d_input,
-        d_output, LINEAR
+        d_output, Layers::Activation::NONE
     );

     denseLayer.forward(d_input, d_output);
@@ -142,7 +146,8 @@ TEST_F(DenseLayerTest, ForwardRandomWeightMatrixRelu) {
     float* d_output;

     Layers::Dense denseLayer = commonTestSetup(
-        inputSize, outputSize, input, weights.data(), biases.data(), d_input, d_output, RELU
+        inputSize, outputSize, input, weights.data(), biases.data(), d_input,
+        d_output, Layers::Activation::RELU
     );

     denseLayer.forward(d_input, d_output);
@@ -186,8 +191,8 @@ TEST_F(DenseLayerTest, ForwardRandomWeightMatrixSigmoid) {
     float* d_output;

     Layers::Dense denseLayer = commonTestSetup(
-        inputSize, outputSize, input, weights.data(), biases.data(), d_input, d_output,
-        SIGMOID
+        inputSize, outputSize, input, weights.data(), biases.data(), d_input,
+        d_output, Layers::Activation::SIGMOID
     );

     denseLayer.forward(d_input, d_output);
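Usage note (not part of the patch): the rename leaves every kernel signature unchanged, so call sites only swap the identifier, e.g. vec_vec_add_kernel<<<...>>> becomes Kernels::vec_vec_add<<<...>>>. The following is a minimal host-side sketch of launching the renamed kernel outside the layer classes; the main() scaffolding, buffer sizes, and printed output are illustrative assumptions rather than code from this repository.

#include <cstdio>
#include <cuda_runtime.h>

#include "matmul.cuh"  // declares Kernels::vec_vec_add after the rename

int main() {
    const int n = 4;
    const float a[] = {1.0f, 2.0f, 3.0f, 4.0f};
    const float b[] = {10.0f, 20.0f, 30.0f, 40.0f};
    float out[4];

    // Allocate device buffers and copy both input vectors over.
    float *d_a, *d_b, *d_out;
    cudaMalloc(&d_a, n * sizeof(float));
    cudaMalloc(&d_b, n * sizeof(float));
    cudaMalloc(&d_out, n * sizeof(float));
    cudaMemcpy(d_a, a, n * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, b, n * sizeof(float), cudaMemcpyHostToDevice);

    // One thread per element, mirroring the launch in Layers::Dense::forward.
    Kernels::vec_vec_add<<<1, n>>>(d_a, d_b, d_out, n);
    cudaDeviceSynchronize();

    cudaMemcpy(out, d_out, n * sizeof(float), cudaMemcpyDeviceToHost);
    for (int i = 0; i < n; i++) {
        printf("out[%d] = %.1f\n", i, out[i]);  // expected: 11, 22, 33, 44 for an element-wise add
    }

    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_out);
    return 0;
}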