diff --git a/include/kernels/convolution.cuh b/include/kernels/convolution.cuh
index f4f94a7..044b6dc 100644
--- a/include/kernels/convolution.cuh
+++ b/include/kernels/convolution.cuh
@@ -2,6 +2,7 @@
 #define CUDANET_CONVOLUTION_H
 
 #include
+#include "layer.cuh"
 
 namespace CUDANet::Kernels {
 
@@ -24,13 +25,13 @@ __global__ void convolution(
     const float* __restrict__ d_kernel,
     const float* __restrict__ d_bias,
     float* __restrict__ d_output,
-    const int inputSize,
+    const dim2d inputSize,
     const int nChannels,
-    const int paddingSize,
-    const int kernelSize,
-    const int stride,
+    const dim2d paddingSize,
+    const dim2d kernelSize,
+    const dim2d stride,
     const int nFilters,
-    const int outputSize
+    const dim2d outputSize
 );
 
 }  // namespace CUDANet::Kernels
diff --git a/include/kernels/pooling.cuh b/include/kernels/pooling.cuh
index 6f4e3b8..4c3c34c 100644
--- a/include/kernels/pooling.cuh
+++ b/include/kernels/pooling.cuh
@@ -2,27 +2,28 @@
 #define CUDANET_POOLING_H
 
 #include
+#include "layer.cuh"
 
 namespace CUDANet::Kernels {
 
 __global__ void max_pooling(
     const float* __restrict__ d_input,
     float* __restrict__ d_output,
-    const int inputSize,
-    const int outputSize,
+    const dim2d inputSize,
+    const dim2d outputSize,
     const int nChannels,
-    const int poolingSize,
-    const int stride
+    const dim2d poolingSize,
+    const dim2d stride
 );
 
 __global__ void avg_pooling(
     const float* __restrict__ d_input,
     float* __restrict__ d_output,
-    const int inputSize,
-    const int outputSize,
+    const dim2d inputSize,
+    const dim2d outputSize,
     const int nChannels,
-    const int poolingSize,
-    const int stride
+    const dim2d poolingSize,
+    const dim2d stride
 );
 
 }  // namespace CUDANet::Kernels
diff --git a/include/layers/avg_pooling.cuh b/include/layers/avg_pooling.cuh
index 11d5431..7815158 100644
--- a/include/layers/avg_pooling.cuh
+++ b/include/layers/avg_pooling.cuh
@@ -9,10 +9,10 @@ namespace CUDANet::Layers {
 class AvgPooling2D : public SequentialLayer {
   public:
     AvgPooling2D(
-        int inputSize,
+        dim2d inputSize,
         int nChannels,
-        int poolingSize,
-        int stride,
+        dim2d poolingSize,
+        dim2d stride,
         ActivationType activationType
     );
     ~AvgPooling2D();
@@ -28,18 +28,18 @@ class AvgPooling2D : public SequentialLayer {
 
     /**
      * @brief Get input size
-     * 
+     *
      * @return int input size
      */
     int getInputSize();
 
   private:
-    int inputSize;
-    int nChannels;
-    int poolingSize;
-    int stride;
+    dim2d inputSize;
+    int   nChannels;
+    dim2d poolingSize;
+    dim2d stride;
 
-    int outputSize;
+    dim2d outputSize;
 
     float* d_output;
diff --git a/include/layers/batch_norm.cuh b/include/layers/batch_norm.cuh
index 91abb2e..c423c34 100644
--- a/include/layers/batch_norm.cuh
+++ b/include/layers/batch_norm.cuh
@@ -10,7 +10,7 @@ namespace CUDANet::Layers {
 
 class BatchNorm2D : public WeightedLayer {
   public:
-    BatchNorm2D(int inputSize, int inputChannels, float epsilon, ActivationType activationType);
+    BatchNorm2D(dim2d inputSize, int inputChannels, float epsilon, ActivationType activationType);
 
     ~BatchNorm2D();
 
@@ -66,7 +66,7 @@ class BatchNorm2D : public WeightedLayer {
 
   private:
-    int inputSize;
+    dim2d inputSize;
     int inputChannels;
 
     int gridSize;
diff --git a/include/layers/conv2d.cuh b/include/layers/conv2d.cuh
index 9c8de89..19c9003 100644
--- a/include/layers/conv2d.cuh
+++ b/include/layers/conv2d.cuh
@@ -28,12 +28,12 @@ class Conv2d : public WeightedLayer {
      * 'SOFTMAX' or 'NONE')
      */
     Conv2d(
-        int inputSize,
+        dim2d inputSize,
         int inputChannels,
-        int kernelSize,
-        int stride,
+        dim2d kernelSize,
+        dim2d stride,
         int numFilters,
-        int paddingSize,
+        dim2d paddingSize,
         ActivationType activationType
     );
 
@@ -98,23 +98,23 @@ class Conv2d : public WeightedLayer {
      *
      * @return int
      */
-    int getPaddingSize() {
+    dim2d getPaddingSize() {
         return paddingSize;
     }
 
   private:
     // Inputs
-    int inputSize;
-    int inputChannels;
+    dim2d inputSize;
+    int   inputChannels;
 
     // Outputs
-    int outputSize;
+    dim2d outputSize;
 
     // Kernel
-    int kernelSize;
-    int stride;
-    int paddingSize;
-    int numFilters;
+    dim2d kernelSize;
+    dim2d stride;
+    dim2d paddingSize;
+    int   numFilters;
 
     // Kernels
     std::vector<float> weights;
diff --git a/include/layers/dense.cuh b/include/layers/dense.cuh
index 93356ba..1624fcc 100644
--- a/include/layers/dense.cuh
+++ b/include/layers/dense.cuh
@@ -81,8 +81,8 @@ class Dense : public WeightedLayer {
     int getInputSize();
 
   private:
-    unsigned int inputSize;
-    unsigned int outputSize;
+    int inputSize;
+    int outputSize;
 
     float* d_output;
 
@@ -95,8 +95,8 @@ class Dense : public WeightedLayer {
     Layers::Activation* activation;
 
     // Precompute kernel launch parameters
-    unsigned int forwardGridSize;
-    unsigned int biasGridSize;
+    int forwardGridSize;
+    int biasGridSize;
 
     /**
      * @brief Initialize the weights to zeros
diff --git a/include/layers/layer.cuh b/include/layers/layer.cuh
index 495ab95..22f14e7 100644
--- a/include/layers/layer.cuh
+++ b/include/layers/layer.cuh
@@ -7,6 +7,8 @@
 #define CUDANET_SAME_PADDING(inputSize, kernelSize, stride) \
     ((stride - 1) * inputSize - stride + kernelSize) / 2;
 
+typedef std::pair<int, int> dim2d;
+
 namespace CUDANet::Layers {
 
 /**
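The new `dim2d` alias is a plain `std::pair<int, int>`, used throughout the refactor with `.first` as the height (rows) and `.second` as the width (columns). A minimal sketch of how the alias and the per-axis SAME-padding macro combine — the values here are illustrative, not part of the diff:

```cpp
#include <utility>  // std::pair; layer.cuh presumably picks this up transitively

typedef std::pair<int, int> dim2d;

int main() {
    dim2d inputSize  = {32, 64};  // 32 rows (.first), 64 columns (.second)
    dim2d kernelSize = {3, 5};
    dim2d stride     = {1, 1};

    // CUDANET_SAME_PADDING applied once per axis, as the tests below now do:
    int padH = ((stride.first - 1) * inputSize.first - stride.first + kernelSize.first) / 2;     // 1
    int padW = ((stride.second - 1) * inputSize.second - stride.second + kernelSize.second) / 2; // 2
    return (padH == 1 && padW == 2) ? 0 : 1;
}
```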
diff --git a/include/layers/max_pooling.cuh b/include/layers/max_pooling.cuh
index 6157cae..0715a80 100644
--- a/include/layers/max_pooling.cuh
+++ b/include/layers/max_pooling.cuh
@@ -9,10 +9,10 @@ namespace CUDANet::Layers {
 class MaxPooling2D : public SequentialLayer {
   public:
     MaxPooling2D(
-        int inputSize,
+        dim2d inputSize,
         int nChannels,
-        int poolingSize,
-        int stride,
+        dim2d poolingSize,
+        dim2d stride,
         ActivationType activationType
     );
     ~MaxPooling2D();
@@ -28,18 +28,18 @@ class MaxPooling2D : public SequentialLayer {
 
     /**
      * @brief Get input size
-     * 
+     *
      * @return int input size
     */
     int getInputSize();
 
   private:
-    int inputSize;
-    int nChannels;
-    int poolingSize;
-    int stride;
+    dim2d inputSize;
+    int   nChannels;
+    dim2d poolingSize;
+    dim2d stride;
 
-    int outputSize;
+    dim2d outputSize;
 
     float* d_output;
diff --git a/include/model/model.hpp b/include/model/model.hpp
index 9535b79..9e1d393 100644
--- a/include/model/model.hpp
+++ b/include/model/model.hpp
@@ -26,7 +26,7 @@ struct TensorInfo {
 
 class Model {
   public:
-    Model(const int inputSize, const int inputChannels, const int outputSize);
+    Model(const dim2d inputSize, const int inputChannels, const int outputSize);
     Model(const Model& other);
     ~Model();
 
@@ -43,7 +43,7 @@ class Model {
     Layers::Input*  inputLayer;
     Layers::Output* outputLayer;
 
-    int inputSize;
+    dim2d inputSize;
     int inputChannels;
     int outputSize;
diff --git a/src/kernels/convolution.cu b/src/kernels/convolution.cu
index 64a227a..1fa4fb7 100644
--- a/src/kernels/convolution.cu
+++ b/src/kernels/convolution.cu
@@ -9,19 +9,19 @@ __global__ void Kernels::convolution(
     const float* __restrict__ d_kernel,
     const float* __restrict__ d_bias,
     float* __restrict__ d_output,
-    const int inputSize,
-    const int nChannels,
-    const int paddingSize,
-    const int kernelSize,
-    const int stride,
-    const int nFilters,
-    const int outputSize
+    const dim2d inputSize,
+    const int   nChannels,
+    const dim2d paddingSize,
+    const dim2d kernelSize,
+    const dim2d stride,
+    const int   nFilters,
+    const dim2d outputSize
 ) {
     int j = blockDim.x * blockIdx.x + threadIdx.x;
     int i = blockDim.y * blockIdx.y + threadIdx.y;
     int f = blockDim.z * blockIdx.z + threadIdx.z;
 
-    if (i >= outputSize || j >= outputSize || f >= nFilters) {
+    if (i >= outputSize.first || j >= outputSize.second || f >= nFilters) {
         return;
     }
 
@@ -29,28 +29,32 @@ __global__ void Kernels::convolution(
 
     // Iterate over kernel and input matrix
     for (int c = 0; c < nChannels; c++) {
-        for (int k = 0; k < kernelSize; k++) {
-            for (int l = 0; l < kernelSize; l++) {
-
+        for (int k = 0; k < kernelSize.first; k++) {
+            for (int l = 0; l < kernelSize.second; l++) {
                 // if i, j is in the padding region
-                if (i * stride + k < paddingSize ||
-                    i * stride + k >= (inputSize + paddingSize) ||
-                    j * stride + l < paddingSize ||
-                    j * stride + l >= (inputSize + paddingSize)) {
+                if (i * stride.first + k < paddingSize.first ||
+                    i * stride.first + k >=
+                        (inputSize.first + paddingSize.first) ||
+                    j * stride.second + l < paddingSize.second ||
+                    j * stride.second + l >=
+                        (inputSize.second + paddingSize.second)) {
                     continue;
                 }
 
-                int kernelIndex = f * kernelSize * kernelSize * nChannels +
-                                  c * kernelSize * kernelSize + k * kernelSize +
-                                  l;
-                int inputIndex = c * inputSize * inputSize +
-                                 (i * stride + k - paddingSize) * inputSize +
-                                 (j * stride + l - paddingSize);
+                int kernelIndex =
+                    f * kernelSize.first * kernelSize.second * nChannels +
+                    c * kernelSize.first * kernelSize.second +
+                    k * kernelSize.second + l;
+                int inputIndex = c * inputSize.first * inputSize.second +
+                                 (i * stride.first + k - paddingSize.first) *
+                                     inputSize.second +
+                                 (j * stride.second + l - paddingSize.second);
 
                 sum += d_kernel[kernelIndex] * d_input[inputIndex];
             }
         }
     }
 
-    d_output[f * outputSize * outputSize + i * outputSize + j] = sum + d_bias[f];
+    d_output[f * outputSize.first * outputSize.second + i * outputSize.second + j] =
+        sum + d_bias[f];
 }
\ No newline at end of file
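The kernel keeps the flat CHW layout; only the index arithmetic becomes rectangular. A host-side sketch of the same arithmetic, handy for unit-testing index math — the helper names are hypothetical, not part of the diff:

```cpp
#include <utility>

typedef std::pair<int, int> dim2d;

// Offset of (row i, col j) in channel/filter f, matching
// d_output[f * H * W + i * W + j] in the kernel above.
int flatIndex(dim2d size, int f, int i, int j) {
    return f * size.first * size.second + i * size.second + j;
}

// Output shape of a padded, strided convolution, computed per axis,
// mirroring the arithmetic in Conv2d's constructor further down.
dim2d convOutputSize(dim2d in, dim2d kernel, dim2d pad, dim2d stride) {
    return {
        (in.first - kernel.first + 2 * pad.first) / stride.first + 1,
        (in.second - kernel.second + 2 * pad.second) / stride.second + 1
    };
}
```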
diff --git a/src/kernels/pooling.cu b/src/kernels/pooling.cu
index 97094cf..0429ddb 100644
--- a/src/kernels/pooling.cu
+++ b/src/kernels/pooling.cu
@@ -1,4 +1,5 @@
 #include "cuda_helper.cuh"
+#include "layer.cuh"
 #include "pooling.cuh"
 
 using namespace CUDANet;
@@ -6,26 +7,27 @@ using namespace CUDANet;
 __global__ void Kernels::max_pooling(
     const float* __restrict__ d_input,
     float* __restrict__ d_output,
-    const int inputSize,
-    const int outputSize,
-    const int nChannels,
-    const int poolingSize,
-    const int stride
+    const dim2d inputSize,
+    const dim2d outputSize,
+    const int   nChannels,
+    const dim2d poolingSize,
+    const dim2d stride
 ) {
     int j = blockDim.x * blockIdx.x + threadIdx.x;
     int i = blockDim.y * blockIdx.y + threadIdx.y;
     int c = blockDim.z * blockIdx.z + threadIdx.z;
 
-    if (i >= outputSize || j >= outputSize || c >= nChannels) {
+    if (i >= outputSize.first || j >= outputSize.second || c >= nChannels) {
         return;
     }
 
     float max = 0.0f;
 
-    for (int k = 0; k < poolingSize; k++) {
-        for (int l = 0; l < poolingSize; l++) {
-            int inputIndex = c * inputSize * inputSize +
-                             (i * stride + k) * inputSize + (j * stride + l);
+    for (int k = 0; k < poolingSize.first; k++) {
+        for (int l = 0; l < poolingSize.second; l++) {
+            int inputIndex = c * inputSize.first * inputSize.second +
+                             (i * stride.first + k) * inputSize.second +
+                             (j * stride.second + l);
 
             if (d_input[inputIndex] > max) {
                 max = d_input[inputIndex];
@@ -33,37 +35,41 @@ __global__ void Kernels::max_pooling(
         }
     }
 
-    d_output[c * outputSize * outputSize + i * outputSize + j] = max;
+    d_output
+        [c * outputSize.first * outputSize.second + i * outputSize.second + j] =
+            max;
 }
 
 __global__ void Kernels::avg_pooling(
     const float* __restrict__ d_input,
     float* __restrict__ d_output,
-    const int inputSize,
-    const int outputSize,
-    const int nChannels,
-    const int poolingSize,
-    const int stride
+    const dim2d inputSize,
+    const dim2d outputSize,
+    const int   nChannels,
+    const dim2d poolingSize,
+    const dim2d stride
 ) {
     int j = blockDim.x * blockIdx.x + threadIdx.x;
     int i = blockDim.y * blockIdx.y + threadIdx.y;
     int c = blockDim.z * blockIdx.z + threadIdx.z;
 
-    if (i >= outputSize || j >= outputSize || c >= outputSize) {
+    if (i >= outputSize.first || j >= outputSize.second || c >= nChannels) {
         return;
     }
 
     float sum = 0.0f;
 
-    for (int k = 0; k < poolingSize; k++) {
-        for (int l = 0; l < poolingSize; l++) {
-            int inputIndex = c * inputSize * inputSize +
-                             (i * stride + k) * inputSize + (j * stride + l);
+    for (int k = 0; k < poolingSize.first; k++) {
+        for (int l = 0; l < poolingSize.second; l++) {
+            int inputIndex = c * inputSize.first * inputSize.second +
+                             (i * stride.first + k) * inputSize.second +
+                             (j * stride.second + l);
 
             sum += d_input[inputIndex];
         }
     }
 
-    d_output[c * outputSize * outputSize + i * outputSize + j] =
-        sum / (poolingSize * poolingSize);
+    d_output
+        [c * outputSize.first * outputSize.second + i * outputSize.second + j] =
+            sum / (poolingSize.first * poolingSize.second);
 }
\ No newline at end of file
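Both pooling kernels keep the one-thread-per-output-cell scheme and now walk a `poolingSize.first` by `poolingSize.second` window; the diff also fixes the avg-pooling bounds check, which previously compared the channel index against `outputSize`. The implied output shape, sketched on the host (illustrative helper, not part of the codebase):

```cpp
#include <utility>

typedef std::pair<int, int> dim2d;

// Unpadded pooling output per axis, mirroring the layer constructors below.
dim2d poolOutputSize(dim2d in, dim2d pool, dim2d stride) {
    return {
        (in.first - pool.first) / stride.first + 1,
        (in.second - pool.second) / stride.second + 1
    };
}
// e.g. poolOutputSize({4, 4}, {2, 2}, {2, 2}) == {2, 2}, which is why the
// 4x4x2 test inputs further down yield 2 * 2 * 2 = 8 output values.
```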
diff --git a/src/layers/avg_pooling.cu b/src/layers/avg_pooling.cu
index 23ec5d5..c5c9bba 100644
--- a/src/layers/avg_pooling.cu
+++ b/src/layers/avg_pooling.cu
@@ -5,24 +5,29 @@
 using namespace CUDANet::Layers;
 
 AvgPooling2D::AvgPooling2D(
-    int inputSize,
+    dim2d inputSize,
     int nChannels,
-    int poolingSize,
-    int stride,
+    dim2d poolingSize,
+    dim2d stride,
     ActivationType activationType
 )
     : inputSize(inputSize),
       nChannels(nChannels),
       poolingSize(poolingSize),
       stride(stride) {
-    outputSize = (inputSize - poolingSize) / stride + 1;
+    outputSize = {
+        (inputSize.first - poolingSize.first) / stride.first + 1,
+        (inputSize.second - poolingSize.second) / stride.second + 1
+    };
 
-    activation =
-        new Activation(activationType, outputSize * outputSize * nChannels);
+    activation = new Activation(
+        activationType, outputSize.first * outputSize.second * nChannels
+    );
 
     d_output = nullptr;
     CUDA_CHECK(cudaMalloc(
-        (void**)&d_output, sizeof(float) * outputSize * outputSize * nChannels
+        (void**)&d_output,
+        sizeof(float) * outputSize.first * outputSize.second * nChannels
     ));
 }
 
@@ -32,11 +37,10 @@ AvgPooling2D::~AvgPooling2D() {
 }
 
 float* AvgPooling2D::forward(const float* d_input) {
-
     dim3 block(8, 8, 8);
     dim3 grid(
-        (outputSize + block.x - 1) / block.x,
-        (outputSize + block.y - 1) / block.y,
+        (outputSize.first + block.x - 1) / block.x,
+        (outputSize.second + block.y - 1) / block.y,
         (nChannels + block.z - 1) / block.z
     );
 
@@ -52,9 +56,9 @@ float* AvgPooling2D::forward(const float* d_input) {
 }
 
 int AvgPooling2D::getOutputSize() {
-    return outputSize * outputSize * nChannels;
+    return outputSize.first * outputSize.second * nChannels;
 }
 
 int AvgPooling2D::getInputSize() {
-    return inputSize * inputSize * nChannels;
+    return inputSize.first * inputSize.second * nChannels;
 }
\ No newline at end of file
diff --git a/src/layers/batch_norm.cu b/src/layers/batch_norm.cu
index c96e6cf..bfbd7b6 100644
--- a/src/layers/batch_norm.cu
+++ b/src/layers/batch_norm.cu
@@ -10,31 +10,36 @@ using namespace CUDANet::Layers;
 
 BatchNorm2D::BatchNorm2D(
-    int inputSize,
+    dim2d inputSize,
     int inputChannels,
     float epsilon,
     ActivationType activationType
 )
     : inputSize(inputSize), inputChannels(inputChannels) {
-    activation =
-        new Activation(activationType, inputSize * inputSize * inputChannels);
+    activation = new Activation(
+        activationType, inputSize.first * inputSize.second * inputChannels
+    );
 
     d_output = nullptr;
     CUDA_CHECK(cudaMalloc(
         (void **)&d_output,
-        sizeof(float) * inputSize * inputSize * inputChannels
+        sizeof(float) * inputSize.first * inputSize.second * inputChannels
     ));
 
     d_mean = nullptr;
-    CUDA_CHECK(cudaMalloc((void **)&d_mean, sizeof(float) * inputSize * inputSize));
+    CUDA_CHECK(cudaMalloc(
+        (void **)&d_mean, sizeof(float) * inputSize.first * inputSize.second
+    ));
 
     d_mean_sub = nullptr;
-    CUDA_CHECK(
-        cudaMalloc((void **)&d_mean_sub, sizeof(float) * inputSize * inputSize)
-    );
+    CUDA_CHECK(cudaMalloc(
+        (void **)&d_mean_sub, sizeof(float) * inputSize.first * inputSize.second
+    ));
 
     d_sqrt_var = nullptr;
-    CUDA_CHECK(cudaMalloc((void **)&d_sqrt_var, sizeof(float) * inputSize * inputSize));
+    CUDA_CHECK(cudaMalloc(
+        (void **)&d_sqrt_var, sizeof(float) * inputSize.first * inputSize.second
+    ));
 
     d_weights = nullptr;
     CUDA_CHECK(cudaMalloc((void **)&d_weights, sizeof(float) * inputChannels));
@@ -42,14 +47,18 @@ BatchNorm2D::BatchNorm2D(
     d_biases = nullptr;
     CUDA_CHECK(cudaMalloc((void **)&d_biases, sizeof(float) * inputChannels));
 
-    d_length = nullptr;
-    float length = (float) inputSize * inputSize;
+    d_length     = nullptr;
+    float length = (float)inputSize.first * inputSize.second;
     CUDA_CHECK(cudaMalloc((void **)&d_length, sizeof(float)));
-    CUDA_CHECK(cudaMemcpy(d_length, &length, sizeof(float), cudaMemcpyHostToDevice));
+    CUDA_CHECK(
+        cudaMemcpy(d_length, &length, sizeof(float), cudaMemcpyHostToDevice)
+    );
 
     d_epsilon = nullptr;
     CUDA_CHECK(cudaMalloc((void **)&d_epsilon, sizeof(float)));
-    CUDA_CHECK(cudaMemcpy(d_epsilon, &epsilon, sizeof(float), cudaMemcpyHostToDevice));
+    CUDA_CHECK(
+        cudaMemcpy(d_epsilon, &epsilon, sizeof(float), cudaMemcpyHostToDevice)
+    );
 
     weights.resize(inputChannels);
     biases.resize(inputChannels);
@@ -60,7 +69,7 @@ BatchNorm2D::BatchNorm2D(
     toCuda();
 
     gridSize =
-        (inputSize * inputSize + BLOCK_SIZE - 1) / BLOCK_SIZE;
+        (inputSize.first * inputSize.second + BLOCK_SIZE - 1) / BLOCK_SIZE;
 }
 
 BatchNorm2D::~BatchNorm2D() {
@@ -112,84 +121,67 @@ void BatchNorm2D::toCuda() {
 }
 
 int BatchNorm2D::getInputSize() {
-    return inputSize * inputSize * inputChannels;
+    return inputSize.first * inputSize.second * inputChannels;
 }
 
 int BatchNorm2D::getOutputSize() {
-    return inputSize * inputSize * inputChannels;
+    return inputSize.first * inputSize.second * inputChannels;
 }
 
 float *BatchNorm2D::forward(const float *d_input) {
-
     // Compute per-channel batch normalization
     for (int i = 0; i < inputChannels; i++) {
-
         // Compute mean
         Utils::mean(
-            d_input + i * inputSize * inputSize,
-            d_mean,
-            d_length,
-            inputSize * inputSize
+            d_input + i * inputSize.first * inputSize.second, d_mean, d_length,
+            inputSize.first * inputSize.second
        );
 
        // Subtract mean from input
        Kernels::vec_scalar_sub<<<gridSize, BLOCK_SIZE>>>(
-            d_input + i * inputSize * inputSize,
-            d_mean_sub,
-            &d_mean[0],
-            inputSize * inputSize
+            d_input + i * inputSize.first * inputSize.second, d_mean_sub,
+            &d_mean[0], inputSize.first * inputSize.second
        );
        CUDA_CHECK(cudaGetLastError());
 
        // Compute variance
        Utils::var(
-            d_mean_sub,
-            d_sqrt_var,
-            d_length,
-            inputSize * inputSize
+            d_mean_sub, d_sqrt_var, d_length, inputSize.first * inputSize.second
        );
 
        // Add epsilon to variance to avoid division by zero
        Kernels::vec_scalar_add<<<gridSize, BLOCK_SIZE>>>(
-            d_sqrt_var,
-            d_sqrt_var,
-            &d_epsilon[0],
-            inputSize * inputSize
+            d_sqrt_var, d_sqrt_var, &d_epsilon[0],
+            inputSize.first * inputSize.second
        );
        CUDA_CHECK(cudaGetLastError());
 
        // Compute squared root of variance
        Kernels::vec_sqrt<<<gridSize, BLOCK_SIZE>>>(
-            d_sqrt_var,
-            d_sqrt_var,
-            inputSize * inputSize
+            d_sqrt_var, d_sqrt_var, inputSize.first * inputSize.second
        );
        CUDA_CHECK(cudaGetLastError());
 
        // Divide by squared root of variance
        Kernels::vec_scalar_div<<<gridSize, BLOCK_SIZE>>>(
-            d_mean_sub,
-            d_output + i * inputSize * inputSize,
-            &d_sqrt_var[0],
-            inputSize * inputSize
+            d_mean_sub, d_output + i * inputSize.first * inputSize.second,
+            &d_sqrt_var[0], inputSize.first * inputSize.second
        );
        CUDA_CHECK(cudaGetLastError());
 
        // Multiply by weights
        Kernels::vec_scalar_mul<<<gridSize, BLOCK_SIZE>>>(
-            d_output + i * inputSize * inputSize,
-            d_output + i * inputSize * inputSize,
-            &d_weights[i],
-            inputSize * inputSize
+            d_output + i * inputSize.first * inputSize.second,
+            d_output + i * inputSize.first * inputSize.second, &d_weights[i],
+            inputSize.first * inputSize.second
        );
        CUDA_CHECK(cudaGetLastError());
 
        // Add biases
        Kernels::vec_scalar_add<<<gridSize, BLOCK_SIZE>>>(
-            d_output + i * inputSize * inputSize,
-            d_output + i * inputSize * inputSize,
-            &d_biases[i],
-            inputSize * inputSize
+            d_output + i * inputSize.first * inputSize.second,
+            d_output + i * inputSize.first * inputSize.second, &d_biases[i],
+            inputSize.first * inputSize.second
        );
        CUDA_CHECK(cudaGetLastError());
    }
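Per channel, the forward pass above is the textbook batch-norm chain: mean, centering, variance, add epsilon, square root, divide, scale, shift. A scalar host reference of the same math, useful for cross-checking the kernels — a hypothetical helper, not part of the codebase:

```cpp
#include <cmath>
#include <vector>

// Normalizes one H*W channel in place:
//   y = weight * (x - mean) / sqrt(var + eps) + bias
void batchNormChannelRef(std::vector<float>& x, float weight, float bias, float eps) {
    float mean = 0.0f;
    for (float v : x) mean += v;
    mean /= x.size();

    float var = 0.0f;
    for (float v : x) var += (v - mean) * (v - mean);
    var /= x.size();

    const float invStd = 1.0f / std::sqrt(var + eps);
    for (float& v : x) v = weight * (v - mean) * invStd + bias;
}
```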
diff --git a/src/layers/conv2d.cu b/src/layers/conv2d.cu
index 82a3aa5..bbcc516 100644
--- a/src/layers/conv2d.cu
+++ b/src/layers/conv2d.cu
@@ -1,23 +1,23 @@
+#include
+#include
+
 #include "activation.cuh"
 #include "conv2d.cuh"
 #include "convolution.cuh"
 #include "cuda_helper.cuh"
-#include "matmul.cuh"
 #include "layer.cuh"
+#include "matmul.cuh"
 #include "vector.cuh"
 
-#include
-#include
-
 using namespace CUDANet::Layers;
 
 Conv2d::Conv2d(
-    int inputSize,
-    int inputChannels,
-    int kernelSize,
-    int stride,
-    int numFilters,
-    int paddingSize,
+    dim2d inputSize,
+    int   inputChannels,
+    dim2d kernelSize,
+    dim2d stride,
+    int   numFilters,
+    dim2d paddingSize,
     ActivationType activationType
 )
     : inputSize(inputSize),
@@ -26,34 +26,35 @@ Conv2d::Conv2d(
       stride(stride),
       numFilters(numFilters),
       paddingSize(paddingSize) {
+    outputSize = {
+        (inputSize.first - kernelSize.first + 2 * paddingSize.first) /
+                stride.first + 1,
+        (inputSize.second - kernelSize.second + 2 * paddingSize.second) /
+                stride.second + 1
+    };
 
-    outputSize = (inputSize - kernelSize + 2 * paddingSize) / stride + 1;
-
-    activation = new Activation(
-        activationType, outputSize * outputSize * numFilters
-    );
+    activation =
+        new Activation(activationType, outputSize.first * outputSize.second * numFilters);
 
     d_output = nullptr;
     CUDA_CHECK(cudaMalloc(
-        (void**)&d_output, sizeof(float) * outputSize * outputSize * numFilters
+        (void**)&d_output, sizeof(float) * outputSize.first * outputSize.second * numFilters
     ));
 
-    weights.resize(kernelSize * kernelSize * inputChannels * numFilters);
+    weights.resize(kernelSize.first * kernelSize.second * inputChannels * numFilters);
     initializeWeights();
 
     d_weights = nullptr;
     CUDA_CHECK(cudaMalloc(
         (void**)&d_weights,
-        sizeof(float) * kernelSize * kernelSize * inputChannels * numFilters
+        sizeof(float) * kernelSize.first * kernelSize.second * inputChannels * numFilters
     ));
 
     biases.resize(numFilters);
     initializeBiases();
 
     d_biases = nullptr;
-    CUDA_CHECK(cudaMalloc(
-        (void**)&d_biases, sizeof(float) * numFilters
-    ));
+    CUDA_CHECK(cudaMalloc((void**)&d_biases, sizeof(float) * numFilters));
 
     toCuda();
 }
@@ -94,35 +95,33 @@ std::vector<float> Conv2d::getBiases() {
 
 void Conv2d::toCuda() {
     CUDA_CHECK(cudaMemcpy(
         d_weights, weights.data(),
-        sizeof(float) * kernelSize * kernelSize * inputChannels * numFilters,
+        sizeof(float) * kernelSize.first * kernelSize.second * inputChannels * numFilters,
         cudaMemcpyHostToDevice
     ));
 
     CUDA_CHECK(cudaMemcpy(
-        d_biases, biases.data(),
-        sizeof(float) * numFilters,
+        d_biases, biases.data(), sizeof(float) * numFilters,
         cudaMemcpyHostToDevice
     ));
 }
 
 float* Conv2d::forward(const float* d_input) {
-    // Convolve
-    dim3 block(8,8,8);
+    dim3 block(8, 8, 8);
     dim3 grid(
-        (outputSize + block.x - 1) / block.x,
-        (outputSize + block.y - 1) / block.y,
+        (outputSize.first + block.x - 1) / block.x,
+        (outputSize.second + block.y - 1) / block.y,
         (numFilters + block.z - 1) / block.z
     );
 
-    CUDANet::Utils::clear(d_output, outputSize * outputSize * numFilters);
+    CUDANet::Utils::clear(d_output, outputSize.first * outputSize.second * numFilters);
 
     Kernels::convolution<<<grid, block>>>(
-        d_input, d_weights, d_biases, d_output, inputSize, inputChannels, paddingSize,
-        kernelSize, stride, numFilters, outputSize
+        d_input, d_weights, d_biases, d_output, inputSize, inputChannels,
+        paddingSize, kernelSize, stride, numFilters, outputSize
     );
     CUDA_CHECK(cudaGetLastError());
-    
+
     // Apply activation
     activation->activate(d_output);
 
@@ -132,9 +131,9 @@ float* Conv2d::forward(const float* d_input) {
 }
 
 int Conv2d::getOutputSize() {
-    return outputSize * outputSize * numFilters;
+    return outputSize.first * outputSize.second * numFilters;
 }
 
 int Conv2d::getInputSize() {
-    return inputSize * inputSize * inputChannels;
+    return inputSize.first * inputSize.second * inputChannels;
 }
\ No newline at end of file
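Call sites now pass each spatial argument as a pair. A usage sketch with illustrative values, showing how a 3x3 kernel with stride 1 and padding 1 preserves a rectangular 32x64 shape:

```cpp
#include "conv2d.cuh"

using namespace CUDANet::Layers;

void buildConv() {
    dim2d inputSize  = {32, 64};  // height, width
    dim2d kernelSize = {3, 3};
    dim2d stride     = {1, 1};
    dim2d padding    = {1, 1};

    Conv2d conv(
        inputSize, /*inputChannels=*/3, kernelSize, stride,
        /*numFilters=*/16, padding, ActivationType::RELU
    );
    // (32 - 3 + 2) / 1 + 1 == 32 and (64 - 3 + 2) / 1 + 1 == 64,
    // so conv.getOutputSize() == 32 * 64 * 16.
}
```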
diff --git a/src/layers/max_pooling.cu b/src/layers/max_pooling.cu
index 6288637..ee43f7d 100644
--- a/src/layers/max_pooling.cu
+++ b/src/layers/max_pooling.cu
@@ -1,45 +1,44 @@
-#include "max_pooling.cuh"
 #include "cuda_helper.cuh"
+#include "max_pooling.cuh"
 #include "pooling.cuh"
 
 using namespace CUDANet::Layers;
 
-
 MaxPooling2D::MaxPooling2D(
-    int inputSize,
-    int nChannels,
-    int poolingSize,
-    int stride,
-    ActivationType activationType
-    )
-    : inputSize(inputSize), nChannels(nChannels), poolingSize(poolingSize), stride(stride) {
+    dim2d inputSize,
+    int   nChannels,
+    dim2d poolingSize,
+    dim2d stride,
+    ActivationType activationType
+)
+    : inputSize(inputSize),
+      nChannels(nChannels),
+      poolingSize(poolingSize),
+      stride(stride) {
+    outputSize = {
+        (inputSize.first - poolingSize.first) / stride.first + 1,
+        (inputSize.second - poolingSize.second) / stride.second + 1
+    };
 
-
-    outputSize = (inputSize - poolingSize) / stride + 1;
-
-    activation = new Activation(
-        activationType, outputSize * outputSize * nChannels
-    );
+    activation =
+        new Activation(activationType, outputSize.first * outputSize.second * nChannels);
 
     d_output = nullptr;
     CUDA_CHECK(cudaMalloc(
-        (void**)&d_output, sizeof(float) * outputSize * outputSize * nChannels
+        (void**)&d_output, sizeof(float) * outputSize.first * outputSize.second * nChannels
     ));
 }
 
-
 MaxPooling2D::~MaxPooling2D() {
     cudaFree(d_output);
     delete activation;
 }
 
-
 float* MaxPooling2D::forward(const float* d_input) {
-
-    dim3 block(8,8,8);
+    dim3 block(8, 8, 8);
     dim3 grid(
-        (outputSize + block.x - 1) / block.x,
-        (outputSize + block.y - 1) / block.y,
+        (outputSize.first + block.x - 1) / block.x,
+        (outputSize.second + block.y - 1) / block.y,
         (nChannels + block.z - 1) / block.z
     );
 
@@ -55,9 +54,9 @@ float* MaxPooling2D::forward(const float* d_input) {
 }
 
 int MaxPooling2D::getOutputSize() {
-    return outputSize * outputSize * nChannels;
+    return outputSize.first * outputSize.second * nChannels;
 }
 
 int MaxPooling2D::getInputSize() {
-    return inputSize * inputSize * nChannels;
+    return inputSize.first * inputSize.second * nChannels;
 }
\ No newline at end of file
diff --git a/src/model/model.cpp b/src/model/model.cpp
index 6571662..f822646 100644
--- a/src/model/model.cpp
+++ b/src/model/model.cpp
@@ -11,13 +11,13 @@
 
 using namespace CUDANet;
 
-Model::Model(const int inputSize, const int inputChannels, const int outputSize)
+Model::Model(const dim2d inputSize, const int inputChannels, const int outputSize)
     : inputSize(inputSize),
       inputChannels(inputChannels),
       outputSize(outputSize),
       layers(std::vector<std::pair<std::string, Layers::SequentialLayer*>>()),
      layerMap(std::unordered_map<std::string, Layers::SequentialLayer*>()) {
-    inputLayer = new Layers::Input(inputSize * inputSize * inputChannels);
+    inputLayer = new Layers::Input(inputSize.first * inputSize.second * inputChannels);
     outputLayer = new Layers::Output(outputSize);
 };
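The model constructor mirrors the layer changes; only the spatial size becomes a pair. An illustrative construction (values are hypothetical):

```cpp
#include "model.hpp"

// 28x28 single-channel input, 10 output classes.
CUDANet::Model *model = new CUDANet::Model({28, 28}, 1, 10);
```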
diff --git a/test/layers/test_avg_pooling.cu b/test/layers/test_avg_pooling.cu
index 29cd5a1..0ec6b8c 100644
--- a/test/layers/test_avg_pooling.cu
+++ b/test/layers/test_avg_pooling.cu
@@ -6,10 +6,10 @@
 #include "avg_pooling.cuh"
 
 TEST(AvgPoolingLayerTest, AvgPoolForwardTest) {
-    int inputSize = 4;
-    int nChannels = 2;
-    int poolingSize = 2;
-    int stride = 2;
+    dim2d inputSize = {4, 4};
+    int   nChannels = 2;
+    dim2d poolingSize = {2, 2};
+    dim2d stride = {2, 2};
 
     cudaError_t cudaStatus;
 
@@ -36,13 +36,14 @@ TEST(AvgPoolingLayerTest, AvgPoolForwardTest) {
 
     float *d_input;
     cudaStatus = cudaMalloc(
-        (void **)&d_input, sizeof(float) * inputSize * inputSize * nChannels
+        (void **)&d_input,
+        sizeof(float) * inputSize.first * inputSize.second * nChannels
     );
     EXPECT_EQ(cudaStatus, cudaSuccess);
 
     cudaStatus = cudaMemcpy(
         d_input, input.data(),
-        sizeof(float) * inputSize * inputSize * nChannels,
+        sizeof(float) * inputSize.first * inputSize.second * nChannels,
         cudaMemcpyHostToDevice
     );
     EXPECT_EQ(cudaStatus, cudaSuccess);
@@ -53,13 +54,13 @@ TEST(AvgPoolingLayerTest, AvgPoolForwardTest) {
 
     std::vector<float> output(outputSize);
     cudaStatus = cudaMemcpy(
-        output.data(), d_output,
-        sizeof(float) * outputSize,
+        output.data(), d_output, sizeof(float) * outputSize,
         cudaMemcpyDeviceToHost
     );
     EXPECT_EQ(cudaStatus, cudaSuccess);
 
-    std::vector<float> expected = {0.43775f, 0.49475f, 0.48975f, 0.339f, 0.45675f, 0.303f, 0.56975f, 0.57025f};
+    std::vector<float> expected = {0.43775f, 0.49475f, 0.48975f, 0.339f,
+                                   0.45675f, 0.303f,   0.56975f, 0.57025f};
 
     for (int i = 0; i < output.size(); ++i) {
         EXPECT_NEAR(expected[i], output[i], 1e-4);
diff --git a/test/layers/test_batch_norm.cu b/test/layers/test_batch_norm.cu
index c3fbb4c..e04b1f2 100644
--- a/test/layers/test_batch_norm.cu
+++ b/test/layers/test_batch_norm.cu
@@ -7,8 +7,8 @@
 #include "batch_norm.cuh"
 
 TEST(BatchNormLayerTest, BatchNormSmallForwardTest) {
-    int inputSize = 4;
-    int nChannels = 2;
+    dim2d inputSize = {4, 4};
+    int   nChannels = 2;
 
     cudaError_t cudaStatus;
 
@@ -17,7 +17,7 @@ TEST(BatchNormLayerTest, BatchNormSmallForwardTest) {
     );
 
     std::vector<float> weights = {0.63508f, 0.64903f};
-    std::vector<float> biases = {0.25079f, 0.66841f};
+    std::vector<float> biases  = {0.25079f, 0.66841f};
 
     batchNorm.setWeights(weights.data());
     batchNorm.setBiases(biases.data());
@@ -47,27 +47,27 @@ TEST(BatchNormLayerTest, BatchNormSmallForwardTest) {
     EXPECT_EQ(cudaStatus, cudaSuccess);
 
     cudaStatus = cudaMemcpy(
-        d_input, input.data(), sizeof(float) * input.size(), cudaMemcpyHostToDevice
+        d_input, input.data(), sizeof(float) * input.size(),
+        cudaMemcpyHostToDevice
     );
     EXPECT_EQ(cudaStatus, cudaSuccess);
 
     float* d_output = batchNorm.forward(d_input);
 
     cudaStatus = cudaMemcpy(
-        output.data(), d_output, sizeof(float) * output.size(), cudaMemcpyDeviceToHost
+        output.data(), d_output, sizeof(float) * output.size(),
+        cudaMemcpyDeviceToHost
     );
     EXPECT_EQ(cudaStatus, cudaSuccess);
 
-    std::vector<float> expected = {
-        -0.06007f, 0.951f, 0.18157f, 1.36202f,
-        0.39244f, 0.47335f, 0.58598f, -1.00188f,
-        0.59576f, 0.79919f, -0.57001f, 0.70469f,
-        -0.62847f, -0.06578f, -0.43668f, 0.72952f,
-        0.37726f, 0.02088f, 0.35446f, 0.98092f,
-        1.39264f, 1.80686f, 1.67786f, 1.58318f,
-        -0.0269f, 0.26878f, 0.81411f, 0.09022f,
-        0.9126f, 0.71485f, -0.08184f, -0.19131f
-    };
+    std::vector<float> expected = {-0.06007f, 0.951f,    0.18157f,  1.36202f,
+                                   0.39244f,  0.47335f,  0.58598f,  -1.00188f,
+                                   0.59576f,  0.79919f,  -0.57001f, 0.70469f,
+                                   -0.62847f, -0.06578f, -0.43668f, 0.72952f,
+                                   0.37726f,  0.02088f,  0.35446f,  0.98092f,
+                                   1.39264f,  1.80686f,  1.67786f,  1.58318f,
+                                   -0.0269f,  0.26878f,  0.81411f,  0.09022f,
+                                   0.9126f,   0.71485f,  -0.08184f, -0.19131f};
 
     // std::cout << "BatchNorm2D: " << std::endl;
     for (int i = 0; i < output.size(); i++) {
@@ -76,5 +76,4 @@ TEST(BatchNormLayerTest, BatchNormSmallForwardTest) {
     }
     // std::cout << std::endl;
     cudaFree(d_input);
-
 }
\ No newline at end of file
diff --git a/test/layers/test_conv2d.cu b/test/layers/test_conv2d.cu
index 9ab986c..3f15b4b 100644
--- a/test/layers/test_conv2d.cu
+++ b/test/layers/test_conv2d.cu
@@ -8,12 +8,12 @@
 class Conv2dTest : public ::testing::Test {
   protected:
     CUDANet::Layers::Conv2d commonTestSetup(
-        int inputSize,
+        dim2d inputSize,
         int inputChannels,
-        int kernelSize,
-        int stride,
+        dim2d kernelSize,
+        dim2d stride,
         int numFilters,
-        int paddingSize,
+        dim2d paddingSize,
         CUDANet::Layers::ActivationType activationType,
         std::vector<float>& input,
         float* kernels,
@@ -30,7 +30,7 @@ class Conv2dTest : public ::testing::Test {
         // Allocate device memory
         cudaStatus = cudaMalloc(
             (void**)&d_input,
-            sizeof(float) * inputSize * inputSize * inputChannels
+            sizeof(float) * inputSize.first * inputSize.second * inputChannels
         );
         EXPECT_EQ(cudaStatus, cudaSuccess);
 
@@ -47,19 +47,18 @@ class Conv2dTest : public ::testing::Test {
     void commonTestTeardown(float* d_input) {
         // Free device memory
         cudaFree(d_input);
-
     }
 
     cudaError_t cudaStatus;
 };
 
 TEST_F(Conv2dTest, SimpleTest) {
-    int inputSize = 4;
-    int inputChannels = 1;
-    int kernelSize = 2;
-    int stride = 1;
-    int numFilters = 1;
-    int paddingSize = 0;
+    dim2d inputSize = {4, 4};
+    int   inputChannels = 1;
+    dim2d kernelSize = {2, 2};
+    dim2d stride = {1, 1};
+    int   numFilters = 1;
+    dim2d paddingSize = {0, 0};
 
     CUDANet::Layers::ActivationType activationType =
         CUDANet::Layers::ActivationType::NONE;
@@ -82,8 +81,9 @@ TEST_F(Conv2dTest, SimpleTest) {
         activationType, input, kernels.data(), d_input
     );
 
-    int outputWidth = (inputSize - kernelSize) / stride + 1;
-    int outputSize = outputWidth * outputWidth * numFilters;
+    int outputHeight = (inputSize.first - kernelSize.first) / stride.first + 1;
+    int outputWidth = (inputSize.second - kernelSize.second) / stride.second + 1;
+    int outputSize = outputHeight * outputWidth * numFilters;
     EXPECT_EQ(outputSize, conv2d.getOutputSize());
 
     d_output = conv2d.forward(d_input);
@@ -106,12 +106,16 @@ TEST_F(Conv2dTest, SimpleTest) {
 }
 
 TEST_F(Conv2dTest, PaddedTest) {
-    int inputSize = 5;
-    int inputChannels = 3;
-    int kernelSize = 3;
-    int stride = 1;
-    int numFilters = 2;
-    int paddingSize = CUDANET_SAME_PADDING(inputSize, kernelSize, stride);
+    dim2d inputSize = {5, 5};
+    int   inputChannels = 3;
+    dim2d kernelSize = {3, 3};
+    dim2d stride = {1, 1};
+    int   numFilters = 2;
+
+    int paddingFirst = CUDANET_SAME_PADDING(inputSize.first, kernelSize.first, stride.first);
+    int paddingSecond = CUDANET_SAME_PADDING(inputSize.second, kernelSize.second, stride.second);
+    dim2d paddingSize = {paddingFirst, paddingSecond};
+
     CUDANet::Layers::ActivationType activationType =
         CUDANet::Layers::ActivationType::NONE;
 
@@ -173,16 +177,14 @@ TEST_F(Conv2dTest, PaddedTest) {
         activationType, input, kernels.data(), d_input
     );
 
-    EXPECT_EQ(inputSize * inputSize * numFilters, conv2d.getOutputSize());
+    EXPECT_EQ(inputSize.first * inputSize.second * numFilters, conv2d.getOutputSize());
 
     d_output = conv2d.forward(d_input);
 
-    std::vector<float> output(
-        conv2d.getOutputSize()
-    );
+    std::vector<float> output(conv2d.getOutputSize());
     cudaMemcpy(
-        output.data(), d_output,
-        sizeof(float) * conv2d.getOutputSize(), cudaMemcpyDeviceToHost
+        output.data(), d_output, sizeof(float) * conv2d.getOutputSize(),
+        cudaMemcpyDeviceToHost
     );
 
     // Generated by tools/generate_conv2d_test.py
@@ -206,12 +208,17 @@ TEST_F(Conv2dTest, PaddedTest) {
 }
 
 TEST_F(Conv2dTest, StridedPaddedConvolution) {
-    int inputSize = 5;
+    dim2d inputSize = {5, 5};
     int inputChannels = 2;
-    int kernelSize = 3;
-    int stride = 2;
+    dim2d kernelSize = {3, 3};
+    dim2d stride = {2, 2};
     int numFilters = 2;
-    int paddingSize = CUDANET_SAME_PADDING(inputSize, kernelSize, stride);
+
+    int paddingFirst = CUDANET_SAME_PADDING(inputSize.first, kernelSize.first, stride.first);
+    int paddingSecond = CUDANET_SAME_PADDING(inputSize.second, kernelSize.second, stride.second);
+    dim2d paddingSize = {paddingFirst, paddingSecond};
+
+
     CUDANet::Layers::ActivationType activationType =
         CUDANet::Layers::ActivationType::RELU;
 
@@ -258,16 +265,13 @@ TEST_F(Conv2dTest, StridedPaddedConvolution) {
         activationType, input, kernels.data(), d_input
    );
 
-    EXPECT_EQ(inputSize * inputSize * numFilters, conv2d.getOutputSize());
+    EXPECT_EQ(inputSize.first * inputSize.second * numFilters, conv2d.getOutputSize());
 
     d_output = conv2d.forward(d_input);
 
-    std::vector<float> output(
-        conv2d.getOutputSize()
-    );
+    std::vector<float> output(conv2d.getOutputSize());
     cudaMemcpy(
-        output.data(), d_output,
-        sizeof(float) * conv2d.getOutputSize(),
+        output.data(), d_output, sizeof(float) * conv2d.getOutputSize(),
         cudaMemcpyDeviceToHost
     );
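For the strided test above the two macro calls happen to be identical because the test's input and kernel are square; writing the arithmetic out shows why the test still expects a 5x5 output per filter. A compile-time sanity check under those assumptions (hypothetical, not part of the suite):

```cpp
// SAME padding for one axis: ((stride - 1) * input - stride + kernel) / 2
constexpr int samePad(int in, int k, int s) { return ((s - 1) * in - s + k) / 2; }

static_assert(samePad(5, 3, 2) == 3, "stride-2 SAME padding on a 5-wide axis");
static_assert((5 - 3 + 2 * samePad(5, 3, 2)) / 2 + 1 == 5, "output stays 5 per axis");
```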
diff --git a/test/layers/test_max_pooling.cu b/test/layers/test_max_pooling.cu
index 09c5214..552b1dd 100644
--- a/test/layers/test_max_pooling.cu
+++ b/test/layers/test_max_pooling.cu
@@ -6,10 +6,10 @@
 #include "max_pooling.cuh"
 
 TEST(MaxPoolingLayerTest, MaxPoolForwardTest) {
-    int inputSize = 4;
-    int nChannels = 2;
-    int poolingSize = 2;
-    int stride = 2;
+    dim2d inputSize = {4, 4};
+    int   nChannels = 2;
+    dim2d poolingSize = {2, 2};
+    dim2d stride = {2, 2};
 
     cudaError_t cudaStatus;
 
@@ -36,13 +36,13 @@ TEST(MaxPoolingLayerTest, MaxPoolForwardTest) {
 
     float *d_input;
     cudaStatus = cudaMalloc(
-        (void **)&d_input, sizeof(float) * inputSize * inputSize * nChannels
+        (void **)&d_input, sizeof(float) * inputSize.first * inputSize.second * nChannels
     );
     EXPECT_EQ(cudaStatus, cudaSuccess);
 
     cudaStatus = cudaMemcpy(
         d_input, input.data(),
-        sizeof(float) * inputSize * inputSize * nChannels,
+        sizeof(float) * inputSize.first * inputSize.second * nChannels,
         cudaMemcpyHostToDevice
     );
     EXPECT_EQ(cudaStatus, cudaSuccess);
@@ -53,13 +53,13 @@ TEST(MaxPoolingLayerTest, MaxPoolForwardTest) {
 
     std::vector<float> output(outputSize);
     cudaStatus = cudaMemcpy(
-        output.data(), d_output,
-        sizeof(float) * outputSize,
+        output.data(), d_output, sizeof(float) * outputSize,
         cudaMemcpyDeviceToHost
     );
     EXPECT_EQ(cudaStatus, cudaSuccess);
 
-    std::vector<float> expected = {0.619f, 0.732f, 0.712f, 0.742f, 0.919f, 0.973f, 0.819f, 0.85f};
+    std::vector<float> expected = {0.619f, 0.732f, 0.712f, 0.742f,
+                                   0.919f, 0.973f, 0.819f, 0.85f};
 
     for (int i = 0; i < output.size(); ++i) {
         EXPECT_FLOAT_EQ(expected[i], output[i]);
diff --git a/test/model/test_model.cu b/test/model/test_model.cu
index ac0f5b8..6c8a94c 100644
--- a/test/model/test_model.cu
+++ b/test/model/test_model.cu
@@ -10,27 +10,26 @@ class ModelTest : public ::testing::Test {
     CUDANet::Model *commonTestSetup(
         bool setWeights = true,
 
-        int inputSize = 6,
-        int inputChannels = 2,
-        int outputSize = 3,
+        dim2d inputSize = {6, 6},
+        int   inputChannels = 2,
+        int   outputSize = 3,
 
-        int kernelSize = 3,
-        int stride = 1,
-        int numFilters = 2,
+        dim2d kernelSize = {3, 3},
+        dim2d stride = {1, 1},
+        int   numFilters = 2,
 
-        int poolingSize = 2,
-        int poolingStride = 2
+        dim2d poolingSize = {2, 2},
+        dim2d poolingStride = {2, 2}
     ) {
         CUDANet::Model *model =
             new CUDANet::Model(inputSize, inputChannels, outputSize);
 
-        int paddingSize = 0;
+        dim2d paddingSize = {0, 0};
 
         // Conv2d
         CUDANet::Layers::Conv2d *conv2d = new CUDANet::Layers::Conv2d(
             inputSize, inputChannels, kernelSize, stride, numFilters,
-            paddingSize,
-            CUDANet::Layers::ActivationType::NONE
+            paddingSize, CUDANet::Layers::ActivationType::NONE
         );
 
         if (setWeights) {
@@ -39,9 +38,13 @@ class ModelTest : public ::testing::Test {
         model->addLayer("conv1", conv2d);
 
         // maxpool2d
+        dim2d poolingInput = {
+            inputSize.first - kernelSize.first + 1,
+            inputSize.second - kernelSize.second + 1
+        };
         CUDANet::Layers::MaxPooling2D *maxpool2d =
             new CUDANet::Layers::MaxPooling2D(
-                inputSize - kernelSize + 1, numFilters, poolingSize,
+                poolingInput, numFilters, poolingSize,
                 poolingStride, CUDANet::Layers::ActivationType::RELU
             );
         model->addLayer("maxpool1", maxpool2d);
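The fixture's shape bookkeeping, written out with the default values above (illustrative, matching the `poolingInput` computation in the diff):

```cpp
dim2d inputSize  = {6, 6};
dim2d kernelSize = {3, 3};

// Conv with no padding, stride 1: 6 - 3 + 1 = 4 per axis.
dim2d poolingInput = {inputSize.first - kernelSize.first + 1,
                      inputSize.second - kernelSize.second + 1};  // {4, 4}

// 2x2 max pool, stride 2: (4 - 2) / 2 + 1 = 2 per axis.
dim2d poolingOutput = {(poolingInput.first - 2) / 2 + 1,
                       (poolingInput.second - 2) / 2 + 1};        // {2, 2}
```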