Add support for non-square matrices

2024-05-20 15:20:43 +02:00
parent 6f8b5f4081
commit 74098b24e3
21 changed files with 314 additions and 299 deletions

View File

@@ -2,6 +2,7 @@
 #define CUDANET_CONVOLUTION_H
 #include <cuda_runtime.h>
+#include "layer.cuh"
 namespace CUDANet::Kernels {
@@ -24,13 +25,13 @@ __global__ void convolution(
     const float* __restrict__ d_kernel,
     const float* __restrict__ d_bias,
     float* __restrict__ d_output,
-    const int inputSize,
+    const dim2d inputSize,
     const int nChannels,
-    const int paddingSize,
-    const int kernelSize,
-    const int stride,
+    const dim2d paddingSize,
+    const dim2d kernelSize,
+    const dim2d stride,
     const int nFilters,
-    const int outputSize
+    const dim2d outputSize
 );
 }  // namespace CUDANet::Kernels

View File

@@ -2,27 +2,28 @@
 #define CUDANET_POOLING_H
 #include <cuda_runtime.h>
+#include "layer.cuh"
 namespace CUDANet::Kernels {
 __global__ void max_pooling(
     const float* __restrict__ d_input,
     float* __restrict__ d_output,
-    const int inputSize,
-    const int outputSize,
+    const dim2d inputSize,
+    const dim2d outputSize,
     const int nChannels,
-    const int poolingSize,
-    const int stride
+    const dim2d poolingSize,
+    const dim2d stride
 );
 __global__ void avg_pooling(
     const float* __restrict__ d_input,
     float* __restrict__ d_output,
-    const int inputSize,
-    const int outputSize,
+    const dim2d inputSize,
+    const dim2d outputSize,
     const int nChannels,
-    const int poolingSize,
-    const int stride
+    const dim2d poolingSize,
+    const dim2d stride
 );
 }  // namespace CUDANet::Kernels

View File

@@ -9,10 +9,10 @@ namespace CUDANet::Layers {
 class AvgPooling2D : public SequentialLayer {
   public:
     AvgPooling2D(
-        int inputSize,
+        dim2d inputSize,
         int nChannels,
-        int poolingSize,
-        int stride,
+        dim2d poolingSize,
+        dim2d stride,
         ActivationType activationType
     );
     ~AvgPooling2D();
@@ -28,18 +28,18 @@ class AvgPooling2D : public SequentialLayer {
     /**
      * @brief Get input size
      *
      * @return int input size
      */
     int getInputSize();
   private:
-    int inputSize;
+    dim2d inputSize;
     int nChannels;
-    int poolingSize;
-    int stride;
-    int outputSize;
+    dim2d poolingSize;
+    dim2d stride;
+    dim2d outputSize;
     float* d_output;

View File

@@ -10,7 +10,7 @@ namespace CUDANet::Layers {
 class BatchNorm2D : public WeightedLayer {
   public:
-    BatchNorm2D(int inputSize, int inputChannels, float epsilon, ActivationType activationType);
+    BatchNorm2D(dim2d inputSize, int inputChannels, float epsilon, ActivationType activationType);
     ~BatchNorm2D();
@@ -66,7 +66,7 @@ class BatchNorm2D : public WeightedLayer {
   private:
-    int inputSize;
+    dim2d inputSize;
     int inputChannels;
     int gridSize;

View File

@@ -28,12 +28,12 @@ class Conv2d : public WeightedLayer {
      * 'SOFTMAX' or 'NONE')
      */
     Conv2d(
-        int inputSize,
+        dim2d inputSize,
         int inputChannels,
-        int kernelSize,
-        int stride,
+        dim2d kernelSize,
+        dim2d stride,
         int numFilters,
-        int paddingSize,
+        dim2d paddingSize,
         ActivationType activationType
     );
@@ -98,23 +98,23 @@ class Conv2d : public WeightedLayer {
      *
      * @return int
      */
-    int getPaddingSize() {
+    dim2d getPaddingSize() {
         return paddingSize;
     }
   private:
     // Inputs
-    int inputSize;
+    dim2d inputSize;
     int inputChannels;
     // Outputs
-    int outputSize;
+    dim2d outputSize;
     // Kernel
-    int kernelSize;
-    int stride;
-    int paddingSize;
+    dim2d kernelSize;
+    dim2d stride;
+    dim2d paddingSize;
     int numFilters;
     // Kernels
     std::vector<float> weights;

View File

@@ -81,8 +81,8 @@ class Dense : public WeightedLayer {
     int getInputSize();
   private:
-    unsigned int inputSize;
-    unsigned int outputSize;
+    int inputSize;
+    int outputSize;
     float* d_output;
@@ -95,8 +95,8 @@ class Dense : public WeightedLayer {
     Layers::Activation* activation;
     // Precompute kernel launch parameters
-    unsigned int forwardGridSize;
-    unsigned int biasGridSize;
+    int forwardGridSize;
+    int biasGridSize;
     /**
      * @brief Initialize the weights to zeros

View File

@@ -7,6 +7,8 @@
 #define CUDANET_SAME_PADDING(inputSize, kernelSize, stride) \
     ((stride - 1) * inputSize - stride + kernelSize) / 2;
+typedef std::pair<int, int> dim2d;
 namespace CUDANet::Layers {
 /**
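Note: `dim2d` is a plain `std::pair<int, int>`. The commit never states the axis order outright, but the indexing throughout treats `.first` as the height (rows) and `.second` as the width, i.e. the row pitch. A minimal sketch of that convention (hypothetical sizes, not from the repo):

    #include <utility>

    typedef std::pair<int, int> dim2d;

    int main() {
        dim2d inputSize = {32, 64};  // 32 rows (height), 64 columns (width)
        int i = 5, j = 7;            // row, column
        // Row-major flattening: the row pitch is the width, i.e. .second
        int idx = i * inputSize.second + j;
        return idx == 5 * 64 + 7 ? 0 : 1;  // exits 0
    }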

View File

@@ -9,10 +9,10 @@ namespace CUDANet::Layers {
 class MaxPooling2D : public SequentialLayer {
   public:
     MaxPooling2D(
-        int inputSize,
+        dim2d inputSize,
         int nChannels,
-        int poolingSize,
-        int stride,
+        dim2d poolingSize,
+        dim2d stride,
         ActivationType activationType
     );
     ~MaxPooling2D();
@@ -28,18 +28,18 @@ class MaxPooling2D : public SequentialLayer {
     /**
      * @brief Get input size
      *
      * @return int input size
      */
     int getInputSize();
   private:
-    int inputSize;
+    dim2d inputSize;
     int nChannels;
-    int poolingSize;
-    int stride;
-    int outputSize;
+    dim2d poolingSize;
+    dim2d stride;
+    dim2d outputSize;
     float* d_output;

View File

@@ -26,7 +26,7 @@ struct TensorInfo {
 class Model {
   public:
-    Model(const int inputSize, const int inputChannels, const int outputSize);
+    Model(const dim2d inputSize, const int inputChannels, const int outputSize);
     Model(const Model& other);
     ~Model();
@@ -43,7 +43,7 @@ class Model {
     Layers::Input* inputLayer;
     Layers::Output* outputLayer;
-    int inputSize;
+    dim2d inputSize;
     int inputChannels;
     int outputSize;

View File

@@ -9,19 +9,19 @@ __global__ void Kernels::convolution(
     const float* __restrict__ d_kernel,
     const float* __restrict__ d_bias,
     float* __restrict__ d_output,
-    const int inputSize,
+    const dim2d inputSize,
     const int nChannels,
-    const int paddingSize,
-    const int kernelSize,
-    const int stride,
+    const dim2d paddingSize,
+    const dim2d kernelSize,
+    const dim2d stride,
     const int nFilters,
-    const int outputSize
+    const dim2d outputSize
 ) {
     int j = blockDim.x * blockIdx.x + threadIdx.x;
     int i = blockDim.y * blockIdx.y + threadIdx.y;
     int f = blockDim.z * blockIdx.z + threadIdx.z;
-    if (i >= outputSize || j >= outputSize || f >= nFilters) {
+    if (i >= outputSize.first || j >= outputSize.second || f >= nFilters) {
         return;
     }
@@ -29,28 +29,32 @@ __global__ void Kernels::convolution(
     // Iterate over kernel and input matrix
     for (int c = 0; c < nChannels; c++) {
-        for (int k = 0; k < kernelSize; k++) {
-            for (int l = 0; l < kernelSize; l++) {
+        for (int k = 0; k < kernelSize.first; k++) {
+            for (int l = 0; l < kernelSize.second; l++) {
                 // if i, j is in the padding region
-                if (i * stride + k < paddingSize ||
-                    i * stride + k >= (inputSize + paddingSize) ||
-                    j * stride + l < paddingSize ||
-                    j * stride + l >= (inputSize + paddingSize)) {
+                if (i * stride.first + k < paddingSize.first ||
+                    i * stride.first + k >=
+                        (inputSize.first + paddingSize.first) ||
+                    j * stride.second + l < paddingSize.second ||
+                    j * stride.second + l >=
+                        (inputSize.second + paddingSize.second)) {
                     continue;
                 }
-                int kernelIndex = f * kernelSize * kernelSize * nChannels +
-                                  c * kernelSize * kernelSize + k * kernelSize +
-                                  l;
-                int inputIndex = c * inputSize * inputSize +
-                                 (i * stride + k - paddingSize) * inputSize +
-                                 (j * stride + l - paddingSize);
+                int kernelIndex =
+                    f * kernelSize.first * kernelSize.second * nChannels +
+                    c * kernelSize.first * kernelSize.second +
+                    k * kernelSize.second + l;
+                int inputIndex = c * inputSize.first * inputSize.second +
+                                 (i * stride.first + k - paddingSize.first) *
+                                     inputSize.second +
+                                 (j * stride.second + l - paddingSize.second);
                 sum += d_kernel[kernelIndex] * d_input[inputIndex];
             }
         }
     }
-    d_output[f * outputSize * outputSize + i * outputSize + j] = sum + d_bias[f];
+    d_output[f * outputSize.first * outputSize.second + i * outputSize.second + j] =
+        sum + d_bias[f];
 }
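The kernel now bounds-checks and strides the two axes independently, so each output extent follows the standard per-axis formula. A host-side sketch of that arithmetic (an assumption read off the index math above; no such helper exists in this diff):

    #include <utility>

    typedef std::pair<int, int> dim2d;

    // Per axis: out = (in - kernel + 2 * padding) / stride + 1
    dim2d convOutputSize(dim2d in, dim2d kernel, dim2d stride, dim2d padding) {
        return {
            (in.first - kernel.first + 2 * padding.first) / stride.first + 1,
            (in.second - kernel.second + 2 * padding.second) / stride.second + 1
        };
    }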

View File

@@ -1,4 +1,5 @@
 #include "cuda_helper.cuh"
+#include "layer.cuh"
 #include "pooling.cuh"
 using namespace CUDANet;
@@ -6,26 +7,27 @@ using namespace CUDANet;
 __global__ void Kernels::max_pooling(
     const float* __restrict__ d_input,
     float* __restrict__ d_output,
-    const int inputSize,
-    const int outputSize,
+    const dim2d inputSize,
+    const dim2d outputSize,
     const int nChannels,
-    const int poolingSize,
-    const int stride
+    const dim2d poolingSize,
+    const dim2d stride
 ) {
     int j = blockDim.x * blockIdx.x + threadIdx.x;
     int i = blockDim.y * blockIdx.y + threadIdx.y;
     int c = blockDim.z * blockIdx.z + threadIdx.z;
-    if (i >= outputSize || j >= outputSize || c >= nChannels) {
+    if (i >= outputSize.first || j >= outputSize.second || c >= nChannels) {
         return;
     }
     float max = 0.0f;
-    for (int k = 0; k < poolingSize; k++) {
-        for (int l = 0; l < poolingSize; l++) {
-            int inputIndex = c * inputSize * inputSize +
-                             (i * stride + k) * inputSize + (j * stride + l);
+    for (int k = 0; k < poolingSize.first; k++) {
+        for (int l = 0; l < poolingSize.second; l++) {
+            int inputIndex = c * inputSize.first * inputSize.second +
+                             (i * stride.first + k) * inputSize.second +
+                             (j * stride.second + l);
             if (d_input[inputIndex] > max) {
                 max = d_input[inputIndex];
@@ -33,37 +35,41 @@ __global__ void Kernels::max_pooling(
             }
         }
     }
-    d_output[c * outputSize * outputSize + i * outputSize + j] = max;
+    d_output[c * outputSize.first * outputSize.second + i * outputSize.second + j] =
+        max;
 }
 __global__ void Kernels::avg_pooling(
     const float* __restrict__ d_input,
     float* __restrict__ d_output,
-    const int inputSize,
-    const int outputSize,
+    const dim2d inputSize,
+    const dim2d outputSize,
     const int nChannels,
-    const int poolingSize,
-    const int stride
+    const dim2d poolingSize,
+    const dim2d stride
 ) {
     int j = blockDim.x * blockIdx.x + threadIdx.x;
     int i = blockDim.y * blockIdx.y + threadIdx.y;
     int c = blockDim.z * blockIdx.z + threadIdx.z;
-    if (i >= outputSize || j >= outputSize || c >= outputSize) {
+    if (i >= outputSize.first || j >= outputSize.second || c >= nChannels) {
         return;
     }
     float sum = 0.0f;
-    for (int k = 0; k < poolingSize; k++) {
-        for (int l = 0; l < poolingSize; l++) {
-            int inputIndex = c * inputSize * inputSize +
-                             (i * stride + k) * inputSize + (j * stride + l);
+    for (int k = 0; k < poolingSize.first; k++) {
+        for (int l = 0; l < poolingSize.second; l++) {
+            int inputIndex = c * inputSize.first * inputSize.second +
+                             (i * stride.first + k) * inputSize.second +
+                             (j * stride.second + l);
             sum += d_input[inputIndex];
         }
     }
-    d_output[c * outputSize * outputSize + i * outputSize + j] =
-        sum / (poolingSize * poolingSize);
+    d_output[c * outputSize.first * outputSize.second + i * outputSize.second + j] =
+        sum / (poolingSize.first * poolingSize.second);
 }
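Pooling follows the same per-axis rule without padding; this is the formula the pooling-layer constructors below inline. A sketch under the same assumptions:

    #include <utility>

    typedef std::pair<int, int> dim2d;

    // Per axis: out = (in - pool) / stride + 1 (no padding)
    dim2d poolOutputSize(dim2d in, dim2d pool, dim2d stride) {
        return {
            (in.first - pool.first) / stride.first + 1,
            (in.second - pool.second) / stride.second + 1
        };
    }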

View File

@@ -5,24 +5,29 @@
 using namespace CUDANet::Layers;
 AvgPooling2D::AvgPooling2D(
-    int inputSize,
+    dim2d inputSize,
     int nChannels,
-    int poolingSize,
-    int stride,
+    dim2d poolingSize,
+    dim2d stride,
     ActivationType activationType
 )
     : inputSize(inputSize),
       nChannels(nChannels),
       poolingSize(poolingSize),
       stride(stride) {
-    outputSize = (inputSize - poolingSize) / stride + 1;
-    activation =
-        new Activation(activationType, outputSize * outputSize * nChannels);
+    outputSize = {
+        (inputSize.first - poolingSize.first) / stride.first + 1,
+        (inputSize.second - poolingSize.second) / stride.second + 1
+    };
+    activation = new Activation(
+        activationType, outputSize.first * outputSize.second * nChannels
+    );
     d_output = nullptr;
     CUDA_CHECK(cudaMalloc(
-        (void**)&d_output, sizeof(float) * outputSize * outputSize * nChannels
+        (void**)&d_output,
+        sizeof(float) * outputSize.first * outputSize.second * nChannels
     ));
 }
@@ -32,11 +37,10 @@ AvgPooling2D::~AvgPooling2D() {
 }
 float* AvgPooling2D::forward(const float* d_input) {
     dim3 block(8, 8, 8);
     dim3 grid(
-        (outputSize + block.x - 1) / block.x,
-        (outputSize + block.y - 1) / block.y,
+        (outputSize.first + block.x - 1) / block.x,
+        (outputSize.second + block.y - 1) / block.y,
         (nChannels + block.z - 1) / block.z
     );
@@ -52,9 +56,9 @@ float* AvgPooling2D::forward(const float* d_input) {
 }
 int AvgPooling2D::getOutputSize() {
-    return outputSize * outputSize * nChannels;
+    return outputSize.first * outputSize.second * nChannels;
 }
 int AvgPooling2D::getInputSize() {
-    return inputSize * inputSize * nChannels;
+    return inputSize.first * inputSize.second * nChannels;
 }

View File

@@ -10,31 +10,36 @@
 using namespace CUDANet::Layers;
 BatchNorm2D::BatchNorm2D(
-    int inputSize,
+    dim2d inputSize,
     int inputChannels,
     float epsilon,
     ActivationType activationType
 )
     : inputSize(inputSize), inputChannels(inputChannels) {
-    activation =
-        new Activation(activationType, inputSize * inputSize * inputChannels);
+    activation = new Activation(
+        activationType, inputSize.first * inputSize.second * inputChannels
+    );
     d_output = nullptr;
     CUDA_CHECK(cudaMalloc(
         (void **)&d_output,
-        sizeof(float) * inputSize * inputSize * inputChannels
+        sizeof(float) * inputSize.first * inputSize.second * inputChannels
     ));
     d_mean = nullptr;
-    CUDA_CHECK(cudaMalloc((void **)&d_mean, sizeof(float) * inputSize * inputSize));
+    CUDA_CHECK(cudaMalloc(
+        (void **)&d_mean, sizeof(float) * inputSize.first * inputSize.second
+    ));
     d_mean_sub = nullptr;
-    CUDA_CHECK(
-        cudaMalloc((void **)&d_mean_sub, sizeof(float) * inputSize * inputSize)
-    );
+    CUDA_CHECK(cudaMalloc(
+        (void **)&d_mean_sub, sizeof(float) * inputSize.first * inputSize.second
+    ));
     d_sqrt_var = nullptr;
-    CUDA_CHECK(cudaMalloc((void **)&d_sqrt_var, sizeof(float) * inputSize * inputSize));
+    CUDA_CHECK(cudaMalloc(
+        (void **)&d_sqrt_var, sizeof(float) * inputSize.first * inputSize.second
+    ));
     d_weights = nullptr;
     CUDA_CHECK(cudaMalloc((void **)&d_weights, sizeof(float) * inputChannels));
@@ -42,14 +47,18 @@ BatchNorm2D::BatchNorm2D(
     d_biases = nullptr;
     CUDA_CHECK(cudaMalloc((void **)&d_biases, sizeof(float) * inputChannels));
     d_length = nullptr;
-    float length = (float) inputSize * inputSize;
+    float length = (float)inputSize.first * inputSize.second;
     CUDA_CHECK(cudaMalloc((void **)&d_length, sizeof(float)));
-    CUDA_CHECK(cudaMemcpy(d_length, &length, sizeof(float), cudaMemcpyHostToDevice));
+    CUDA_CHECK(
+        cudaMemcpy(d_length, &length, sizeof(float), cudaMemcpyHostToDevice)
+    );
     d_epsilon = nullptr;
     CUDA_CHECK(cudaMalloc((void **)&d_epsilon, sizeof(float)));
-    CUDA_CHECK(cudaMemcpy(d_epsilon, &epsilon, sizeof(float), cudaMemcpyHostToDevice));
+    CUDA_CHECK(
+        cudaMemcpy(d_epsilon, &epsilon, sizeof(float), cudaMemcpyHostToDevice)
+    );
     weights.resize(inputChannels);
     biases.resize(inputChannels);
@@ -60,7 +69,7 @@ BatchNorm2D::BatchNorm2D(
     toCuda();
     gridSize =
-        (inputSize * inputSize + BLOCK_SIZE - 1) / BLOCK_SIZE;
+        (inputSize.first * inputSize.second + BLOCK_SIZE - 1) / BLOCK_SIZE;
 }
 BatchNorm2D::~BatchNorm2D() {
@@ -112,84 +121,67 @@ void BatchNorm2D::toCuda() {
 }
 int BatchNorm2D::getInputSize() {
-    return inputSize * inputSize * inputChannels;
+    return inputSize.first * inputSize.second * inputChannels;
 }
 int BatchNorm2D::getOutputSize() {
-    return inputSize * inputSize * inputChannels;
+    return inputSize.first * inputSize.second * inputChannels;
 }
 float *BatchNorm2D::forward(const float *d_input) {
     // Compute per-channel batch normalization
     for (int i = 0; i < inputChannels; i++) {
         // Compute mean
-        Utils::mean(
-            d_input + i * inputSize * inputSize,
-            d_mean,
-            d_length,
-            inputSize * inputSize
-        );
+        Utils::mean(
+            d_input + i * inputSize.first * inputSize.second, d_mean, d_length,
+            inputSize.first * inputSize.second
+        );
         // Subtract mean from input
-        Kernels::vec_scalar_sub<<<gridSize, BLOCK_SIZE>>>(
-            d_input + i * inputSize * inputSize,
-            d_mean_sub,
-            &d_mean[0],
-            inputSize * inputSize
-        );
+        Kernels::vec_scalar_sub<<<gridSize, BLOCK_SIZE>>>(
+            d_input + i * inputSize.first * inputSize.second, d_mean_sub,
+            &d_mean[0], inputSize.first * inputSize.second
+        );
         CUDA_CHECK(cudaGetLastError());
         // Compute variance
-        Utils::var(
-            d_mean_sub,
-            d_sqrt_var,
-            d_length,
-            inputSize * inputSize
-        );
+        Utils::var(
+            d_mean_sub, d_sqrt_var, d_length, inputSize.first * inputSize.second
+        );
         // Add epsilon to variance to avoid division by zero
-        Kernels::vec_scalar_add<<<gridSize, BLOCK_SIZE>>>(
-            d_sqrt_var,
-            d_sqrt_var,
-            &d_epsilon[0],
-            inputSize * inputSize
-        );
+        Kernels::vec_scalar_add<<<gridSize, BLOCK_SIZE>>>(
+            d_sqrt_var, d_sqrt_var, &d_epsilon[0],
+            inputSize.first * inputSize.second
+        );
         CUDA_CHECK(cudaGetLastError());
         // Compute squared root of variance
-        Kernels::vec_sqrt<<<gridSize, BLOCK_SIZE>>>(
-            d_sqrt_var,
-            d_sqrt_var,
-            inputSize * inputSize
-        );
+        Kernels::vec_sqrt<<<gridSize, BLOCK_SIZE>>>(
+            d_sqrt_var, d_sqrt_var, inputSize.first * inputSize.second
+        );
         CUDA_CHECK(cudaGetLastError());
         // Divide by squared root of variance
-        Kernels::vec_scalar_div<<<gridSize, BLOCK_SIZE>>>(
-            d_mean_sub,
-            d_output + i * inputSize * inputSize,
-            &d_sqrt_var[0],
-            inputSize * inputSize
-        );
+        Kernels::vec_scalar_div<<<gridSize, BLOCK_SIZE>>>(
+            d_mean_sub, d_output + i * inputSize.first * inputSize.second,
+            &d_sqrt_var[0], inputSize.first * inputSize.second
+        );
         CUDA_CHECK(cudaGetLastError());
         // Multiply by weights
-        Kernels::vec_scalar_mul<<<gridSize, BLOCK_SIZE>>>(
-            d_output + i * inputSize * inputSize,
-            d_output + i * inputSize * inputSize,
-            &d_weights[i],
-            inputSize * inputSize
-        );
+        Kernels::vec_scalar_mul<<<gridSize, BLOCK_SIZE>>>(
+            d_output + i * inputSize.first * inputSize.second,
+            d_output + i * inputSize.first * inputSize.second, &d_weights[i],
+            inputSize.first * inputSize.second
+        );
         CUDA_CHECK(cudaGetLastError());
         // Add biases
-        Kernels::vec_scalar_add<<<gridSize, BLOCK_SIZE>>>(
-            d_output + i * inputSize * inputSize,
-            d_output + i * inputSize * inputSize,
-            &d_biases[i],
-            inputSize * inputSize
-        );
+        Kernels::vec_scalar_add<<<gridSize, BLOCK_SIZE>>>(
+            d_output + i * inputSize.first * inputSize.second,
+            d_output + i * inputSize.first * inputSize.second, &d_biases[i],
+            inputSize.first * inputSize.second
+        );
         CUDA_CHECK(cudaGetLastError());
     }
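Each loop iteration normalizes one H x W plane, so every reduction length changes from inputSize * inputSize to inputSize.first * inputSize.second. A CPU reference for the per-channel sequence above (hypothetical helper; assumes Utils::var computes the population variance, which matches d_length = H * W):

    #include <cmath>
    #include <vector>

    void batchNormChannel(std::vector<float>& x, float w, float b, float eps) {
        const float n = static_cast<float>(x.size());  // H * W elements
        float mu = 0.0f;
        for (float v : x) mu += v;
        mu /= n;
        float var = 0.0f;
        for (float v : x) var += (v - mu) * (v - mu);
        var /= n;  // population variance, assumed to match Utils::var
        const float sigma = std::sqrt(var + eps);
        for (float& v : x) v = ((v - mu) / sigma) * w + b;
    }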

View File

@@ -1,23 +1,23 @@
-#include <iostream>
-#include <vector>
 #include "activation.cuh"
 #include "conv2d.cuh"
 #include "convolution.cuh"
 #include "cuda_helper.cuh"
-#include "matmul.cuh"
 #include "layer.cuh"
+#include "matmul.cuh"
 #include "vector.cuh"
+#include <iostream>
+#include <vector>
 using namespace CUDANet::Layers;
 Conv2d::Conv2d(
-    int inputSize,
+    dim2d inputSize,
     int inputChannels,
-    int kernelSize,
-    int stride,
+    dim2d kernelSize,
+    dim2d stride,
     int numFilters,
-    int paddingSize,
+    dim2d paddingSize,
     ActivationType activationType
 )
     : inputSize(inputSize),
@@ -26,34 +26,35 @@ Conv2d::Conv2d(
       stride(stride),
       numFilters(numFilters),
       paddingSize(paddingSize) {
-    outputSize = (inputSize - kernelSize + 2 * paddingSize) / stride + 1;
-    activation = new Activation(
-        activationType, outputSize * outputSize * numFilters
-    );
+    outputSize = {
+        (inputSize.first - kernelSize.first + 2 * paddingSize.first) /
+                stride.first + 1,
+        (inputSize.second - kernelSize.second + 2 * paddingSize.second) /
+                stride.second + 1
+    };
+    activation = new Activation(
+        activationType, outputSize.first * outputSize.second * numFilters
+    );
     d_output = nullptr;
     CUDA_CHECK(cudaMalloc(
-        (void**)&d_output, sizeof(float) * outputSize * outputSize * numFilters
+        (void**)&d_output,
+        sizeof(float) * outputSize.first * outputSize.second * numFilters
     ));
-    weights.resize(kernelSize * kernelSize * inputChannels * numFilters);
+    weights.resize(
+        kernelSize.first * kernelSize.second * inputChannels * numFilters
+    );
     initializeWeights();
     d_weights = nullptr;
     CUDA_CHECK(cudaMalloc(
         (void**)&d_weights,
-        sizeof(float) * kernelSize * kernelSize * inputChannels * numFilters
+        sizeof(float) * kernelSize.first * kernelSize.second * inputChannels * numFilters
     ));
     biases.resize(numFilters);
     initializeBiases();
     d_biases = nullptr;
-    CUDA_CHECK(cudaMalloc(
-        (void**)&d_biases, sizeof(float) * numFilters
-    ));
+    CUDA_CHECK(cudaMalloc((void**)&d_biases, sizeof(float) * numFilters));
     toCuda();
 }
@@ -94,35 +95,33 @@ std::vector<float> Conv2d::getBiases() {
 void Conv2d::toCuda() {
     CUDA_CHECK(cudaMemcpy(
         d_weights, weights.data(),
-        sizeof(float) * kernelSize * kernelSize * inputChannels * numFilters,
+        sizeof(float) * kernelSize.first * kernelSize.second * inputChannels * numFilters,
         cudaMemcpyHostToDevice
     ));
     CUDA_CHECK(cudaMemcpy(
-        d_biases, biases.data(),
-        sizeof(float) * numFilters,
+        d_biases, biases.data(), sizeof(float) * numFilters,
         cudaMemcpyHostToDevice
     ));
 }
 float* Conv2d::forward(const float* d_input) {
     // Convolve
-    dim3 block(8,8,8);
+    dim3 block(8, 8, 8);
     dim3 grid(
-        (outputSize + block.x - 1) / block.x,
-        (outputSize + block.y - 1) / block.y,
+        (outputSize.first + block.x - 1) / block.x,
+        (outputSize.second + block.y - 1) / block.y,
         (numFilters + block.z - 1) / block.z
     );
-    CUDANet::Utils::clear(d_output, outputSize * outputSize * numFilters);
+    CUDANet::Utils::clear(
+        d_output, outputSize.first * outputSize.second * numFilters
+    );
     Kernels::convolution<<<grid, block>>>(
-        d_input, d_weights, d_biases, d_output, inputSize, inputChannels, paddingSize,
-        kernelSize, stride, numFilters, outputSize
+        d_input, d_weights, d_biases, d_output, inputSize, inputChannels,
+        paddingSize, kernelSize, stride, numFilters, outputSize
     );
     CUDA_CHECK(cudaGetLastError());
     // Apply activation
     activation->activate(d_output);
@@ -132,9 +131,9 @@ float* Conv2d::forward(const float* d_input) {
 }
 int Conv2d::getOutputSize() {
-    return outputSize * outputSize * numFilters;
+    return outputSize.first * outputSize.second * numFilters;
 }
 int Conv2d::getInputSize() {
-    return inputSize * inputSize * inputChannels;
+    return inputSize.first * inputSize.second * inputChannels;
 }
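Illustrative construction with a genuinely non-square input, using the constructor signature from this diff (all sizes hypothetical):

    #include "conv2d.cuh"

    // 32 x 64 input, 3 channels, 3x3 kernel, stride 1, 8 filters, 1-pixel padding
    CUDANet::Layers::Conv2d conv(
        {32, 64}, 3, {3, 3}, {1, 1}, 8, {1, 1},
        CUDANet::Layers::ActivationType::RELU
    );
    // (32 - 3 + 2) / 1 + 1 = 32 rows and (64 - 3 + 2) / 1 + 1 = 64 cols,
    // so conv.getOutputSize() == 32 * 64 * 8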

View File

@@ -1,45 +1,44 @@
-#include "max_pooling.cuh"
 #include "cuda_helper.cuh"
+#include "max_pooling.cuh"
 #include "pooling.cuh"
 using namespace CUDANet::Layers;
 MaxPooling2D::MaxPooling2D(
-    int inputSize,
+    dim2d inputSize,
     int nChannels,
-    int poolingSize,
-    int stride,
+    dim2d poolingSize,
+    dim2d stride,
     ActivationType activationType
 )
-    : inputSize(inputSize), nChannels(nChannels), poolingSize(poolingSize), stride(stride) {
-    outputSize = (inputSize - poolingSize) / stride + 1;
-    activation = new Activation(
-        activationType, outputSize * outputSize * nChannels
-    );
+    : inputSize(inputSize),
+      nChannels(nChannels),
+      poolingSize(poolingSize),
+      stride(stride) {
+    outputSize = {
+        (inputSize.first - poolingSize.first) / stride.first + 1,
+        (inputSize.second - poolingSize.second) / stride.second + 1
+    };
+    activation = new Activation(
+        activationType, outputSize.first * outputSize.second * nChannels
+    );
     d_output = nullptr;
     CUDA_CHECK(cudaMalloc(
-        (void**)&d_output, sizeof(float) * outputSize * outputSize * nChannels
+        (void**)&d_output,
+        sizeof(float) * outputSize.first * outputSize.second * nChannels
     ));
 }
 MaxPooling2D::~MaxPooling2D() {
     cudaFree(d_output);
     delete activation;
 }
 float* MaxPooling2D::forward(const float* d_input) {
-    dim3 block(8,8,8);
+    dim3 block(8, 8, 8);
     dim3 grid(
-        (outputSize + block.x - 1) / block.x,
-        (outputSize + block.y - 1) / block.y,
+        (outputSize.first + block.x - 1) / block.x,
+        (outputSize.second + block.y - 1) / block.y,
         (nChannels + block.z - 1) / block.z
     );
@@ -55,9 +54,9 @@ float* MaxPooling2D::forward(const float* d_input) {
 }
 int MaxPooling2D::getOutputSize() {
-    return outputSize * outputSize * nChannels;
+    return outputSize.first * outputSize.second * nChannels;
 }
 int MaxPooling2D::getInputSize() {
-    return inputSize * inputSize * nChannels;
+    return inputSize.first * inputSize.second * nChannels;
 }

View File

@@ -11,13 +11,13 @@
 using namespace CUDANet;
-Model::Model(const int inputSize, const int inputChannels, const int outputSize)
+Model::Model(const dim2d inputSize, const int inputChannels, const int outputSize)
     : inputSize(inputSize),
       inputChannels(inputChannels),
       outputSize(outputSize),
       layers(std::vector<std::pair<std::string, Layers::SequentialLayer*>>()),
       layerMap(std::unordered_map<std::string, Layers::SequentialLayer*>()) {
-    inputLayer = new Layers::Input(inputSize * inputSize * inputChannels);
+    inputLayer =
+        new Layers::Input(inputSize.first * inputSize.second * inputChannels);
     outputLayer = new Layers::Output(outputSize);
 };
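With the new signature, a model over a non-square input would be constructed like this (hypothetical example; header name assumed):

    #include "model.hpp"  // assumed header for CUDANet::Model

    // 28 x 56 single-channel input, 10 output classes
    CUDANet::Model model({28, 56}, 1, 10);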

View File

@@ -6,10 +6,10 @@
 #include "avg_pooling.cuh"
 TEST(AvgPoolingLayerTest, AvgPoolForwardTest) {
-    int inputSize = 4;
+    dim2d inputSize = {4, 4};
     int nChannels = 2;
-    int poolingSize = 2;
-    int stride = 2;
+    dim2d poolingSize = {2, 2};
+    dim2d stride = {2, 2};
     cudaError_t cudaStatus;
@@ -36,13 +36,14 @@ TEST(AvgPoolingLayerTest, AvgPoolForwardTest) {
     float *d_input;
     cudaStatus = cudaMalloc(
-        (void **)&d_input, sizeof(float) * inputSize * inputSize * nChannels
+        (void **)&d_input,
+        sizeof(float) * inputSize.first * inputSize.second * nChannels
     );
     EXPECT_EQ(cudaStatus, cudaSuccess);
     cudaStatus = cudaMemcpy(
         d_input, input.data(),
-        sizeof(float) * inputSize * inputSize * nChannels,
+        sizeof(float) * inputSize.first * inputSize.second * nChannels,
         cudaMemcpyHostToDevice
     );
     EXPECT_EQ(cudaStatus, cudaSuccess);
@@ -53,13 +54,13 @@ TEST(AvgPoolingLayerTest, AvgPoolForwardTest) {
     std::vector<float> output(outputSize);
     cudaStatus = cudaMemcpy(
-        output.data(), d_output,
-        sizeof(float) * outputSize,
+        output.data(), d_output, sizeof(float) * outputSize,
         cudaMemcpyDeviceToHost
     );
     EXPECT_EQ(cudaStatus, cudaSuccess);
-    std::vector<float> expected = {0.43775f, 0.49475f, 0.48975f, 0.339f, 0.45675f, 0.303f, 0.56975f, 0.57025f};
+    std::vector<float> expected = {0.43775f, 0.49475f, 0.48975f, 0.339f,
+                                   0.45675f, 0.303f,   0.56975f, 0.57025f};
     for (int i = 0; i < output.size(); ++i) {
         EXPECT_NEAR(expected[i], output[i], 1e-4);

View File

@@ -7,8 +7,8 @@
 #include "batch_norm.cuh"
 TEST(BatchNormLayerTest, BatchNormSmallForwardTest) {
-    int inputSize = 4;
+    dim2d inputSize = {4, 4};
     int nChannels = 2;
     cudaError_t cudaStatus;
@@ -17,7 +17,7 @@ TEST(BatchNormLayerTest, BatchNormSmallForwardTest) {
     );
     std::vector<float> weights = {0.63508f, 0.64903f};
     std::vector<float> biases = {0.25079f, 0.66841f};
     batchNorm.setWeights(weights.data());
     batchNorm.setBiases(biases.data());
@@ -47,27 +47,27 @@ TEST(BatchNormLayerTest, BatchNormSmallForwardTest) {
     EXPECT_EQ(cudaStatus, cudaSuccess);
     cudaStatus = cudaMemcpy(
-        d_input, input.data(), sizeof(float) * input.size(), cudaMemcpyHostToDevice
+        d_input, input.data(), sizeof(float) * input.size(),
+        cudaMemcpyHostToDevice
     );
     EXPECT_EQ(cudaStatus, cudaSuccess);
     float* d_output = batchNorm.forward(d_input);
     cudaStatus = cudaMemcpy(
-        output.data(), d_output, sizeof(float) * output.size(), cudaMemcpyDeviceToHost
+        output.data(), d_output, sizeof(float) * output.size(),
+        cudaMemcpyDeviceToHost
     );
     EXPECT_EQ(cudaStatus, cudaSuccess);
-    std::vector<float> expected = {
-        -0.06007f, 0.951f, 0.18157f, 1.36202f,
-        0.39244f, 0.47335f, 0.58598f, -1.00188f,
-        0.59576f, 0.79919f, -0.57001f, 0.70469f,
-        -0.62847f, -0.06578f, -0.43668f, 0.72952f,
-        0.37726f, 0.02088f, 0.35446f, 0.98092f,
-        1.39264f, 1.80686f, 1.67786f, 1.58318f,
-        -0.0269f, 0.26878f, 0.81411f, 0.09022f,
-        0.9126f, 0.71485f, -0.08184f, -0.19131f
-    };
+    std::vector<float> expected = {-0.06007f, 0.951f,    0.18157f,  1.36202f,
+                                   0.39244f,  0.47335f,  0.58598f,  -1.00188f,
+                                   0.59576f,  0.79919f,  -0.57001f, 0.70469f,
+                                   -0.62847f, -0.06578f, -0.43668f, 0.72952f,
+                                   0.37726f,  0.02088f,  0.35446f,  0.98092f,
+                                   1.39264f,  1.80686f,  1.67786f,  1.58318f,
+                                   -0.0269f,  0.26878f,  0.81411f,  0.09022f,
+                                   0.9126f,   0.71485f,  -0.08184f, -0.19131f};
     // std::cout << "BatchNorm2D: " << std::endl;
     for (int i = 0; i < output.size(); i++) {
@@ -76,5 +76,4 @@ TEST(BatchNormLayerTest, BatchNormSmallForwardTest) {
     }
     // std::cout << std::endl;
     cudaFree(d_input);
 }

View File

@@ -8,12 +8,12 @@
 class Conv2dTest : public ::testing::Test {
   protected:
     CUDANet::Layers::Conv2d commonTestSetup(
-        int inputSize,
+        dim2d inputSize,
         int inputChannels,
-        int kernelSize,
-        int stride,
+        dim2d kernelSize,
+        dim2d stride,
         int numFilters,
-        int paddingSize,
+        dim2d paddingSize,
         CUDANet::Layers::ActivationType activationType,
         std::vector<float>& input,
         float* kernels,
@@ -30,7 +30,7 @@ class Conv2dTest : public ::testing::Test {
         // Allocate device memory
         cudaStatus = cudaMalloc(
             (void**)&d_input,
-            sizeof(float) * inputSize * inputSize * inputChannels
+            sizeof(float) * inputSize.first * inputSize.second * inputChannels
         );
         EXPECT_EQ(cudaStatus, cudaSuccess);
@@ -47,19 +47,18 @@ class Conv2dTest : public ::testing::Test {
     void commonTestTeardown(float* d_input) {
         // Free device memory
         cudaFree(d_input);
     }
     cudaError_t cudaStatus;
 };
 TEST_F(Conv2dTest, SimpleTest) {
-    int inputSize = 4;
+    dim2d inputSize = {4, 4};
     int inputChannels = 1;
-    int kernelSize = 2;
-    int stride = 1;
+    dim2d kernelSize = {2, 2};
+    dim2d stride = {1, 1};
     int numFilters = 1;
-    int paddingSize = 0;
+    dim2d paddingSize = {0, 0};
     CUDANet::Layers::ActivationType activationType =
         CUDANet::Layers::ActivationType::NONE;
@@ -82,8 +81,9 @@ TEST_F(Conv2dTest, SimpleTest) {
         activationType, input, kernels.data(), d_input
     );
-    int outputWidth = (inputSize - kernelSize) / stride + 1;
-    int outputSize = outputWidth * outputWidth * numFilters;
+    int outputHeight = (inputSize.first - kernelSize.first) / stride.first + 1;
+    int outputWidth = (inputSize.second - kernelSize.second) / stride.second + 1;
+    int outputSize = outputHeight * outputWidth * numFilters;
     EXPECT_EQ(outputSize, conv2d.getOutputSize());
     d_output = conv2d.forward(d_input);
@@ -106,12 +106,16 @@ TEST_F(Conv2dTest, SimpleTest) {
 }
 TEST_F(Conv2dTest, PaddedTest) {
-    int inputSize = 5;
+    dim2d inputSize = {5, 5};
     int inputChannels = 3;
-    int kernelSize = 3;
-    int stride = 1;
+    dim2d kernelSize = {3, 3};
+    dim2d stride = {1, 1};
     int numFilters = 2;
-    int paddingSize = CUDANET_SAME_PADDING(inputSize, kernelSize, stride);
+    int paddingFirst =
+        CUDANET_SAME_PADDING(inputSize.first, kernelSize.first, stride.first);
+    int paddingSecond =
+        CUDANET_SAME_PADDING(inputSize.second, kernelSize.second, stride.second);
+    dim2d paddingSize = {paddingFirst, paddingSecond};
     CUDANet::Layers::ActivationType activationType =
         CUDANet::Layers::ActivationType::NONE;
@@ -173,16 +177,14 @@ TEST_F(Conv2dTest, PaddedTest) {
         activationType, input, kernels.data(), d_input
     );
-    EXPECT_EQ(inputSize * inputSize * numFilters, conv2d.getOutputSize());
+    EXPECT_EQ(
+        inputSize.first * inputSize.second * numFilters, conv2d.getOutputSize()
+    );
     d_output = conv2d.forward(d_input);
-    std::vector<float> output(
-        conv2d.getOutputSize()
-    );
+    std::vector<float> output(conv2d.getOutputSize());
     cudaMemcpy(
-        output.data(), d_output,
-        sizeof(float) * conv2d.getOutputSize(), cudaMemcpyDeviceToHost
+        output.data(), d_output, sizeof(float) * conv2d.getOutputSize(),
+        cudaMemcpyDeviceToHost
     );
     // Generated by tools/generate_conv2d_test.py
@@ -206,12 +208,17 @@ TEST_F(Conv2dTest, PaddedTest) {
 }
 TEST_F(Conv2dTest, StridedPaddedConvolution) {
-    int inputSize = 5;
+    dim2d inputSize = {5, 5};
     int inputChannels = 2;
-    int kernelSize = 3;
-    int stride = 2;
+    dim2d kernelSize = {3, 3};
+    dim2d stride = {2, 2};
     int numFilters = 2;
-    int paddingSize = CUDANET_SAME_PADDING(inputSize, kernelSize, stride);
+    int paddingFirst =
+        CUDANET_SAME_PADDING(inputSize.first, kernelSize.first, stride.first);
+    int paddingSecond =
+        CUDANET_SAME_PADDING(inputSize.second, kernelSize.second, stride.second);
+    dim2d paddingSize = {paddingFirst, paddingSecond};
     CUDANet::Layers::ActivationType activationType =
         CUDANet::Layers::ActivationType::RELU;
@@ -258,16 +265,13 @@ TEST_F(Conv2dTest, StridedPaddedConvolution) {
         activationType, input, kernels.data(), d_input
     );
-    EXPECT_EQ(inputSize * inputSize * numFilters, conv2d.getOutputSize());
+    EXPECT_EQ(
+        inputSize.first * inputSize.second * numFilters, conv2d.getOutputSize()
+    );
     d_output = conv2d.forward(d_input);
-    std::vector<float> output(
-        conv2d.getOutputSize()
-    );
+    std::vector<float> output(conv2d.getOutputSize());
     cudaMemcpy(
-        output.data(), d_output,
-        sizeof(float) * conv2d.getOutputSize(),
+        output.data(), d_output, sizeof(float) * conv2d.getOutputSize(),
         cudaMemcpyDeviceToHost
     );

View File

@@ -6,10 +6,10 @@
 #include "max_pooling.cuh"
 TEST(MaxPoolingLayerTest, MaxPoolForwardTest) {
-    int inputSize = 4;
+    dim2d inputSize = {4, 4};
     int nChannels = 2;
-    int poolingSize = 2;
-    int stride = 2;
+    dim2d poolingSize = {2, 2};
+    dim2d stride = {2, 2};
     cudaError_t cudaStatus;
@@ -36,13 +36,13 @@ TEST(MaxPoolingLayerTest, MaxPoolForwardTest) {
     float *d_input;
     cudaStatus = cudaMalloc(
-        (void **)&d_input, sizeof(float) * inputSize * inputSize * nChannels
+        (void **)&d_input, sizeof(float) * inputSize.first * inputSize.second * nChannels
     );
     EXPECT_EQ(cudaStatus, cudaSuccess);
     cudaStatus = cudaMemcpy(
         d_input, input.data(),
-        sizeof(float) * inputSize * inputSize * nChannels,
+        sizeof(float) * inputSize.first * inputSize.second * nChannels,
         cudaMemcpyHostToDevice
     );
     EXPECT_EQ(cudaStatus, cudaSuccess);
@@ -53,13 +53,13 @@ TEST(MaxPoolingLayerTest, MaxPoolForwardTest) {
     std::vector<float> output(outputSize);
     cudaStatus = cudaMemcpy(
-        output.data(), d_output,
-        sizeof(float) * outputSize,
+        output.data(), d_output, sizeof(float) * outputSize,
        cudaMemcpyDeviceToHost
     );
     EXPECT_EQ(cudaStatus, cudaSuccess);
-    std::vector<float> expected = {0.619f, 0.732f, 0.712f, 0.742f, 0.919f, 0.973f, 0.819f, 0.85f};
+    std::vector<float> expected = {0.619f, 0.732f, 0.712f, 0.742f,
+                                   0.919f, 0.973f, 0.819f, 0.85f};
     for (int i = 0; i < output.size(); ++i) {
         EXPECT_FLOAT_EQ(expected[i], output[i]);

View File

@@ -10,27 +10,26 @@ class ModelTest : public ::testing::Test {
     CUDANet::Model *commonTestSetup(
         bool setWeights = true,
-        int inputSize = 6,
+        dim2d inputSize = {6, 6},
         int inputChannels = 2,
         int outputSize = 3,
-        int kernelSize = 3,
-        int stride = 1,
+        dim2d kernelSize = {3, 3},
+        dim2d stride = {1, 1},
         int numFilters = 2,
-        int poolingSize = 2,
-        int poolingStride = 2
+        dim2d poolingSize = {2, 2},
+        dim2d poolingStride = {2, 2}
     ) {
         CUDANet::Model *model =
             new CUDANet::Model(inputSize, inputChannels, outputSize);
-        int paddingSize = 0;
+        dim2d paddingSize = {0, 0};
         // Conv2d
         CUDANet::Layers::Conv2d *conv2d = new CUDANet::Layers::Conv2d(
-            inputSize, inputChannels, kernelSize, stride, numFilters,
-            paddingSize,
-            CUDANet::Layers::ActivationType::NONE
+            inputSize, inputChannels, kernelSize, stride, numFilters,
+            paddingSize, CUDANet::Layers::ActivationType::NONE
         );
         if (setWeights) {
@@ -39,9 +38,13 @@ class ModelTest : public ::testing::Test {
         model->addLayer("conv1", conv2d);
         // maxpool2d
+        dim2d poolingInput = {
+            inputSize.first - kernelSize.first + 1,
+            inputSize.second - kernelSize.second + 1
+        };
         CUDANet::Layers::MaxPooling2D *maxpool2d =
             new CUDANet::Layers::MaxPooling2D(
-                inputSize - kernelSize + 1, numFilters, poolingSize,
+                poolingInput, numFilters, poolingSize,
                 poolingStride, CUDANet::Layers::ActivationType::RELU
             );
         model->addLayer("maxpool1", maxpool2d);