Add support for non-square matrices

2024-05-20 15:20:43 +02:00
parent 6f8b5f4081
commit 74098b24e3
21 changed files with 314 additions and 299 deletions
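
The gist of the change: every square-size int becomes a (height, width) pair. A minimal sketch of the new convention, assuming .first is the row/height axis and .second the column/width axis (as the kernel index math below implies). The convOutputSize helper is hypothetical, shown only to summarize the per-axis formula that the layer constructors inline:

#include <utility>

typedef std::pair<int, int> dim2d;

// Per-axis output extent of a convolution:
//   out = (in - kernel + 2 * padding) / stride + 1
inline dim2d convOutputSize(dim2d in, dim2d kernel, dim2d padding, dim2d stride) {
    return {
        (in.first - kernel.first + 2 * padding.first) / stride.first + 1,
        (in.second - kernel.second + 2 * padding.second) / stride.second + 1
    };
}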

View File

@@ -2,6 +2,7 @@
#define CUDANET_CONVOLUTION_H
#include <cuda_runtime.h>
#include "layer.cuh"
namespace CUDANet::Kernels {
@@ -24,13 +25,13 @@ __global__ void convolution(
const float* __restrict__ d_kernel,
const float* __restrict__ d_bias,
float* __restrict__ d_output,
const int inputSize,
const dim2d inputSize,
const int nChannels,
const int paddingSize,
const int kernelSize,
const int stride,
const dim2d paddingSize,
const dim2d kernelSize,
const dim2d stride,
const int nFilters,
const int outputSize
const dim2d outputSize
);
} // namespace CUDANet::Kernels

View File

@@ -2,27 +2,28 @@
#define CUDANET_POOLING_H
#include <cuda_runtime.h>
#include "layer.cuh"
namespace CUDANet::Kernels {
__global__ void max_pooling(
const float* __restrict__ d_input,
float* __restrict__ d_output,
const int inputSize,
const int outputSize,
const dim2d inputSize,
const dim2d outputSize,
const int nChannels,
const int poolingSize,
const int stride
const dim2d poolingSize,
const dim2d stride
);
__global__ void avg_pooling(
const float* __restrict__ d_input,
float* __restrict__ d_output,
const int inputSize,
const int outputSize,
const dim2d inputSize,
const dim2d outputSize,
const int nChannels,
const int poolingSize,
const int stride
const dim2d poolingSize,
const dim2d stride
);
} // namespace CUDANet::Kernels

View File

@@ -9,10 +9,10 @@ namespace CUDANet::Layers {
class AvgPooling2D : public SequentialLayer {
public:
AvgPooling2D(
int inputSize,
dim2d inputSize,
int nChannels,
int poolingSize,
int stride,
dim2d poolingSize,
dim2d stride,
ActivationType activationType
);
~AvgPooling2D();
@@ -34,12 +34,12 @@ class AvgPooling2D : public SequentialLayer {
int getInputSize();
private:
int inputSize;
int nChannels;
int poolingSize;
int stride;
dim2d inputSize;
int nChannels;
dim2d poolingSize;
dim2d stride;
int outputSize;
dim2d outputSize;
float* d_output;

View File

@@ -10,7 +10,7 @@ namespace CUDANet::Layers {
class BatchNorm2D : public WeightedLayer {
public:
BatchNorm2D(int inputSize, int inputChannels, float epsilon, ActivationType activationType);
BatchNorm2D(dim2d inputSize, int inputChannels, float epsilon, ActivationType activationType);
~BatchNorm2D();
@@ -66,7 +66,7 @@ class BatchNorm2D : public WeightedLayer {
private:
int inputSize;
dim2d inputSize;
int inputChannels;
int gridSize;

View File

@@ -28,12 +28,12 @@ class Conv2d : public WeightedLayer {
* 'SOFTMAX' or 'NONE')
*/
Conv2d(
int inputSize,
dim2d inputSize,
int inputChannels,
int kernelSize,
int stride,
dim2d kernelSize,
dim2d stride,
int numFilters,
int paddingSize,
dim2d paddingSize,
ActivationType activationType
);
@@ -98,23 +98,23 @@ class Conv2d : public WeightedLayer {
*
* @return int
*/
int getPaddingSize() {
dim2d getPaddingSize() {
return paddingSize;
}
private:
// Inputs
int inputSize;
int inputChannels;
dim2d inputSize;
int inputChannels;
// Outputs
int outputSize;
dim2d outputSize;
// Kernel
int kernelSize;
int stride;
int paddingSize;
int numFilters;
dim2d kernelSize;
dim2d stride;
dim2d paddingSize;
int numFilters;
// Kernels
std::vector<float> weights;

View File

@@ -81,8 +81,8 @@ class Dense : public WeightedLayer {
int getInputSize();
private:
unsigned int inputSize;
unsigned int outputSize;
int inputSize;
int outputSize;
float* d_output;
@@ -95,8 +95,8 @@ class Dense : public WeightedLayer {
Layers::Activation* activation;
// Precompute kernel launch parameters
unsigned int forwardGridSize;
unsigned int biasGridSize;
int forwardGridSize;
int biasGridSize;
/**
* @brief Initialize the weights to zeros

View File

@@ -7,6 +7,8 @@
#define CUDANET_SAME_PADDING(inputSize, kernelSize, stride) \
((stride - 1) * inputSize - stride + kernelSize) / 2
#include <utility>
typedef std::pair<int, int> dim2d;  // (height, width)
namespace CUDANet::Layers {
/**

View File

@@ -9,10 +9,10 @@ namespace CUDANet::Layers {
class MaxPooling2D : public SequentialLayer {
public:
MaxPooling2D(
int inputSize,
dim2d inputSize,
int nChannels,
int poolingSize,
int stride,
dim2d poolingSize,
dim2d stride,
ActivationType activationType
);
~MaxPooling2D();
@@ -34,12 +34,12 @@ class MaxPooling2D : public SequentialLayer {
int getInputSize();
private:
int inputSize;
int nChannels;
int poolingSize;
int stride;
dim2d inputSize;
int nChannels;
dim2d poolingSize;
dim2d stride;
int outputSize;
dim2d outputSize;
float* d_output;

View File

@@ -26,7 +26,7 @@ struct TensorInfo {
class Model {
public:
Model(const int inputSize, const int inputChannels, const int outputSize);
Model(const dim2d inputSize, const int inputChannels, const int outputSize);
Model(const Model& other);
~Model();
@@ -43,7 +43,7 @@ class Model {
Layers::Input* inputLayer;
Layers::Output* outputLayer;
int inputSize;
dim2d inputSize;
int inputChannels;
int outputSize;

View File

@@ -9,19 +9,19 @@ __global__ void Kernels::convolution(
const float* __restrict__ d_kernel,
const float* __restrict__ d_bias,
float* __restrict__ d_output,
const int inputSize,
const int nChannels,
const int paddingSize,
const int kernelSize,
const int stride,
const int nFilters,
const int outputSize
const dim2d inputSize,
const int nChannels,
const dim2d paddingSize,
const dim2d kernelSize,
const dim2d stride,
const int nFilters,
const dim2d outputSize
) {
int j = blockDim.x * blockIdx.x + threadIdx.x;
int i = blockDim.y * blockIdx.y + threadIdx.y;
int f = blockDim.z * blockIdx.z + threadIdx.z;
if (i >= outputSize || j >= outputSize || f >= nFilters) {
if (i >= outputSize.first || j >= outputSize.second || f >= nFilters) {
return;
}
@@ -29,28 +29,32 @@ __global__ void Kernels::convolution(
// Iterate over kernel and input matrix
for (int c = 0; c < nChannels; c++) {
for (int k = 0; k < kernelSize; k++) {
for (int l = 0; l < kernelSize; l++) {
for (int k = 0; k < kernelSize.first; k++) {
for (int l = 0; l < kernelSize.second; l++) {
// if i, j is in the padding region
if (i * stride + k < paddingSize ||
i * stride + k >= (inputSize + paddingSize) ||
j * stride + l < paddingSize ||
j * stride + l >= (inputSize + paddingSize)) {
if (i * stride.first + k < paddingSize.first ||
i * stride.first + k >=
(inputSize.first + paddingSize.first) ||
j * stride.second + l < paddingSize.second ||
j * stride.second + l >=
(inputSize.second + paddingSize.second)) {
continue;
}
int kernelIndex = f * kernelSize * kernelSize * nChannels +
c * kernelSize * kernelSize + k * kernelSize +
l;
int inputIndex = c * inputSize * inputSize +
(i * stride + k - paddingSize) * inputSize +
(j * stride + l - paddingSize);
int kernelIndex =
f * kernelSize.first * kernelSize.second * nChannels +
c * kernelSize.first * kernelSize.second +
k * kernelSize.second + l;
int inputIndex = c * inputSize.first * inputSize.second +
(i * stride.first + k - paddingSize.first) *
inputSize.second +
(j * stride.second + l - paddingSize.second);
sum += d_kernel[kernelIndex] * d_input[inputIndex];
}
}
}
d_output[f * outputSize * outputSize + i * outputSize + j] = sum + d_bias[f];
d_output[f * outputSize.first * outputSize.second + i * outputSize.second + j] =
sum + d_bias[f];
}
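
The flattened index arithmetic above treats each tensor as channel-major (CHW): the row stride is the width (inputSize.second) and the channel stride is height * width. A worked example with inputSize = {5, 4}: element (c = 1, i = 2, j = 3) sits at 1 * 5 * 4 + 2 * 4 + 3 = 31.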

View File

@@ -1,4 +1,5 @@
#include "cuda_helper.cuh"
#include "layer.cuh"
#include "pooling.cuh"
using namespace CUDANet;
@@ -6,26 +7,27 @@ using namespace CUDANet;
__global__ void Kernels::max_pooling(
const float* __restrict__ d_input,
float* __restrict__ d_output,
const int inputSize,
const int outputSize,
const int nChannels,
const int poolingSize,
const int stride
const dim2d inputSize,
const dim2d outputSize,
const int nChannels,
const dim2d poolingSize,
const dim2d stride
) {
int j = blockDim.x * blockIdx.x + threadIdx.x;
int i = blockDim.y * blockIdx.y + threadIdx.y;
int c = blockDim.z * blockIdx.z + threadIdx.z;
if (i >= outputSize || j >= outputSize || c >= nChannels) {
if (i >= outputSize.first || j >= outputSize.second || c >= nChannels) {
return;
}
float max = 0.0f;
for (int k = 0; k < poolingSize; k++) {
for (int l = 0; l < poolingSize; l++) {
int inputIndex = c * inputSize * inputSize +
(i * stride + k) * inputSize + (j * stride + l);
for (int k = 0; k < poolingSize.first; k++) {
for (int l = 0; l < poolingSize.second; l++) {
int inputIndex = c * inputSize.first * inputSize.second +
(i * stride.first + k) * inputSize.second +
(j * stride.second + l);
if (d_input[inputIndex] > max) {
max = d_input[inputIndex];
@@ -33,37 +35,41 @@ __global__ void Kernels::max_pooling(
}
}
d_output[c * outputSize * outputSize + i * outputSize + j] = max;
d_output
[c * outputSize.first * outputSize.second + i * outputSize.second + j] =
max;
}
__global__ void Kernels::avg_pooling(
const float* __restrict__ d_input,
float* __restrict__ d_output,
const int inputSize,
const int outputSize,
const int nChannels,
const int poolingSize,
const int stride
const dim2d inputSize,
const dim2d outputSize,
const int nChannels,
const dim2d poolingSize,
const dim2d stride
) {
int j = blockDim.x * blockIdx.x + threadIdx.x;
int i = blockDim.y * blockIdx.y + threadIdx.y;
int c = blockDim.z * blockIdx.z + threadIdx.z;
if (i >= outputSize || j >= outputSize || c >= outputSize) {
if (i >= outputSize.first || j >= outputSize.second || c >= nChannels) {
return;
}
float sum = 0.0f;
for (int k = 0; k < poolingSize; k++) {
for (int l = 0; l < poolingSize; l++) {
int inputIndex = c * inputSize * inputSize +
(i * stride + k) * inputSize + (j * stride + l);
for (int k = 0; k < poolingSize.first; k++) {
for (int l = 0; l < poolingSize.second; l++) {
int inputIndex = c * inputSize.first * inputSize.second +
(i * stride.first + k) * inputSize.second +
(j * stride.second + l);
sum += d_input[inputIndex];
}
}
d_output[c * outputSize * outputSize + i * outputSize + j] =
sum / (poolingSize * poolingSize);
d_output
[c * outputSize.first * outputSize.second + i * outputSize.second + j] =
sum / (poolingSize.first * poolingSize.second);
}
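
Pooling output extents follow the same per-axis rule, without padding: out = (in - pool) / stride + 1. For the configuration used in the tests below ({4, 4} input, {2, 2} pooling, {2, 2} stride), each axis gives (4 - 2) / 2 + 1 = 2, so two channels yield 2 * 2 * 2 = 8 outputs, matching the eight expected values in each pooling test.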

View File

@@ -5,24 +5,29 @@
using namespace CUDANet::Layers;
AvgPooling2D::AvgPooling2D(
int inputSize,
dim2d inputSize,
int nChannels,
int poolingSize,
int stride,
dim2d poolingSize,
dim2d stride,
ActivationType activationType
)
: inputSize(inputSize),
nChannels(nChannels),
poolingSize(poolingSize),
stride(stride) {
outputSize = (inputSize - poolingSize) / stride + 1;
outputSize = {
(inputSize.first - poolingSize.first) / stride.first + 1,
(inputSize.second - poolingSize.second) / stride.second + 1
};
activation =
new Activation(activationType, outputSize * outputSize * nChannels);
activation = new Activation(
activationType, outputSize.first * outputSize.second * nChannels
);
d_output = nullptr;
CUDA_CHECK(cudaMalloc(
(void**)&d_output, sizeof(float) * outputSize * outputSize * nChannels
(void**)&d_output,
sizeof(float) * outputSize.first * outputSize.second * nChannels
));
}
@@ -32,11 +37,10 @@ AvgPooling2D::~AvgPooling2D() {
}
float* AvgPooling2D::forward(const float* d_input) {
dim3 block(8, 8, 8);
dim3 grid(
(outputSize + block.x - 1) / block.x,
(outputSize + block.y - 1) / block.y,
(outputSize.first + block.x - 1) / block.x,
(outputSize.second + block.y - 1) / block.y,
(nChannels + block.z - 1) / block.z
);
@@ -52,9 +56,9 @@ float* AvgPooling2D::forward(const float* d_input) {
}
int AvgPooling2D::getOutputSize() {
return outputSize * outputSize * nChannels;
return outputSize.first * outputSize.second * nChannels;
}
int AvgPooling2D::getInputSize() {
return inputSize * inputSize * nChannels;
return inputSize.first * inputSize.second * nChannels;
}

View File

@@ -10,31 +10,36 @@
using namespace CUDANet::Layers;
BatchNorm2D::BatchNorm2D(
int inputSize,
dim2d inputSize,
int inputChannels,
float epsilon,
ActivationType activationType
)
: inputSize(inputSize), inputChannels(inputChannels) {
activation =
new Activation(activationType, inputSize * inputSize * inputChannels);
activation = new Activation(
activationType, inputSize.first * inputSize.second * inputChannels
);
d_output = nullptr;
CUDA_CHECK(cudaMalloc(
(void **)&d_output,
sizeof(float) * inputSize * inputSize * inputChannels
sizeof(float) * inputSize.first * inputSize.second * inputChannels
));
d_mean = nullptr;
CUDA_CHECK(cudaMalloc((void **)&d_mean, sizeof(float) * inputSize * inputSize));
CUDA_CHECK(cudaMalloc(
(void **)&d_mean, sizeof(float) * inputSize.first * inputSize.second
));
d_mean_sub = nullptr;
CUDA_CHECK(
cudaMalloc((void **)&d_mean_sub, sizeof(float) * inputSize * inputSize)
);
CUDA_CHECK(cudaMalloc(
(void **)&d_mean_sub, sizeof(float) * inputSize.first * inputSize.second
));
d_sqrt_var = nullptr;
CUDA_CHECK(cudaMalloc((void **)&d_sqrt_var, sizeof(float) * inputSize * inputSize));
CUDA_CHECK(cudaMalloc(
(void **)&d_sqrt_var, sizeof(float) * inputSize.first * inputSize.second
));
d_weights = nullptr;
CUDA_CHECK(cudaMalloc((void **)&d_weights, sizeof(float) * inputChannels));
@@ -42,14 +47,18 @@ BatchNorm2D::BatchNorm2D(
d_biases = nullptr;
CUDA_CHECK(cudaMalloc((void **)&d_biases, sizeof(float) * inputChannels));
d_length = nullptr;
float length = (float) inputSize * inputSize;
d_length = nullptr;
float length = (float)inputSize.first * inputSize.second;
CUDA_CHECK(cudaMalloc((void **)&d_length, sizeof(float)));
CUDA_CHECK(cudaMemcpy(d_length, &length, sizeof(float), cudaMemcpyHostToDevice));
CUDA_CHECK(
cudaMemcpy(d_length, &length, sizeof(float), cudaMemcpyHostToDevice)
);
d_epsilon = nullptr;
CUDA_CHECK(cudaMalloc((void **)&d_epsilon, sizeof(float)));
CUDA_CHECK(cudaMemcpy(d_epsilon, &epsilon, sizeof(float), cudaMemcpyHostToDevice));
CUDA_CHECK(
cudaMemcpy(d_epsilon, &epsilon, sizeof(float), cudaMemcpyHostToDevice)
);
weights.resize(inputChannels);
biases.resize(inputChannels);
@@ -60,7 +69,7 @@ BatchNorm2D::BatchNorm2D(
toCuda();
gridSize =
(inputSize * inputSize + BLOCK_SIZE - 1) / BLOCK_SIZE;
(inputSize.first * inputSize.second + BLOCK_SIZE - 1) / BLOCK_SIZE;
}
BatchNorm2D::~BatchNorm2D() {
@@ -112,84 +121,67 @@ void BatchNorm2D::toCuda() {
}
int BatchNorm2D::getInputSize() {
return inputSize * inputSize * inputChannels;
return inputSize.first * inputSize.second * inputChannels;
}
int BatchNorm2D::getOutputSize() {
return inputSize * inputSize * inputChannels;
return inputSize.first * inputSize.second * inputChannels;
}
float *BatchNorm2D::forward(const float *d_input) {
// Compute per-channel batch normalization
for (int i = 0; i < inputChannels; i++) {
// Compute mean
Utils::mean(
d_input + i * inputSize * inputSize,
d_mean,
d_length,
inputSize * inputSize
d_input + i * inputSize.first * inputSize.second, d_mean, d_length,
inputSize.first * inputSize.second
);
// Subtract mean from input
Kernels::vec_scalar_sub<<<gridSize, BLOCK_SIZE>>>(
d_input + i * inputSize * inputSize,
d_mean_sub,
&d_mean[0],
inputSize * inputSize
d_input + i * inputSize.first * inputSize.second, d_mean_sub,
&d_mean[0], inputSize.first * inputSize.second
);
CUDA_CHECK(cudaGetLastError());
// Compute variance
Utils::var(
d_mean_sub,
d_sqrt_var,
d_length,
inputSize * inputSize
d_mean_sub, d_sqrt_var, d_length, inputSize.first * inputSize.second
);
// Add epsilon to variance to avoid division by zero
Kernels::vec_scalar_add<<<gridSize, BLOCK_SIZE>>>(
d_sqrt_var,
d_sqrt_var,
&d_epsilon[0],
inputSize * inputSize
d_sqrt_var, d_sqrt_var, &d_epsilon[0],
inputSize.first * inputSize.second
);
CUDA_CHECK(cudaGetLastError());
// Compute squared root of variance
Kernels::vec_sqrt<<<gridSize, BLOCK_SIZE>>>(
d_sqrt_var,
d_sqrt_var,
inputSize * inputSize
d_sqrt_var, d_sqrt_var, inputSize.first * inputSize.second
);
CUDA_CHECK(cudaGetLastError());
// Divide by squared root of variance
Kernels::vec_scalar_div<<<gridSize, BLOCK_SIZE>>>(
d_mean_sub,
d_output + i * inputSize * inputSize,
&d_sqrt_var[0],
inputSize * inputSize
d_mean_sub, d_output + i * inputSize.first * inputSize.second,
&d_sqrt_var[0], inputSize.first * inputSize.second
);
CUDA_CHECK(cudaGetLastError());
// Multiply by weights
Kernels::vec_scalar_mul<<<gridSize, BLOCK_SIZE>>>(
d_output + i * inputSize * inputSize,
d_output + i * inputSize * inputSize,
&d_weights[i],
inputSize * inputSize
d_output + i * inputSize.first * inputSize.second,
d_output + i * inputSize.first * inputSize.second, &d_weights[i],
inputSize.first * inputSize.second
);
CUDA_CHECK(cudaGetLastError());
// Add biases
Kernels::vec_scalar_add<<<gridSize, BLOCK_SIZE>>>(
d_output + i * inputSize * inputSize,
d_output + i * inputSize * inputSize,
&d_biases[i],
inputSize * inputSize
d_output + i * inputSize.first * inputSize.second,
d_output + i * inputSize.first * inputSize.second, &d_biases[i],
inputSize.first * inputSize.second
);
CUDA_CHECK(cudaGetLastError());
}
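
Taken together, the kernel sequence above computes the standard batch-norm transform per channel c, with mean and variance taken over the inputSize.first * inputSize.second spatial elements of that channel:

    y = w_c * (x - mean(x)) / sqrt(var(x - mean(x)) + epsilon) + b_c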

View File

@@ -1,23 +1,23 @@
#include <iostream>
#include <vector>
#include "activation.cuh"
#include "conv2d.cuh"
#include "convolution.cuh"
#include "cuda_helper.cuh"
#include "matmul.cuh"
#include "layer.cuh"
#include "matmul.cuh"
#include "vector.cuh"
#include <iostream>
#include <vector>
using namespace CUDANet::Layers;
Conv2d::Conv2d(
int inputSize,
int inputChannels,
int kernelSize,
int stride,
int numFilters,
int paddingSize,
dim2d inputSize,
int inputChannels,
dim2d kernelSize,
dim2d stride,
int numFilters,
dim2d paddingSize,
ActivationType activationType
)
: inputSize(inputSize),
@@ -26,34 +26,35 @@ Conv2d::Conv2d(
stride(stride),
numFilters(numFilters),
paddingSize(paddingSize) {
outputSize = {
(inputSize.first - kernelSize.first + 2 * paddingSize.first) /
stride.first + 1,
(inputSize.second - kernelSize.second + 2 * paddingSize.second) /
stride.second + 1
};
outputSize = (inputSize - kernelSize + 2 * paddingSize) / stride + 1;
activation = new Activation(
activationType, outputSize * outputSize * numFilters
);
activation =
new Activation(activationType, outputSize.first * outputSize.second * numFilters);
d_output = nullptr;
CUDA_CHECK(cudaMalloc(
(void**)&d_output, sizeof(float) * outputSize * outputSize * numFilters
(void**)&d_output, sizeof(float) * outputSize.first * outputSize.second * numFilters
));
weights.resize(kernelSize * kernelSize * inputChannels * numFilters);
weights.resize(kernelSize.first * kernelSize.second * inputChannels * numFilters);
initializeWeights();
d_weights = nullptr;
CUDA_CHECK(cudaMalloc(
(void**)&d_weights,
sizeof(float) * kernelSize * kernelSize * inputChannels * numFilters
sizeof(float) * kernelSize.first * kernelSize.second * inputChannels * numFilters
));
biases.resize(numFilters);
initializeBiases();
d_biases = nullptr;
CUDA_CHECK(cudaMalloc(
(void**)&d_biases, sizeof(float) * numFilters
));
CUDA_CHECK(cudaMalloc((void**)&d_biases, sizeof(float) * numFilters));
toCuda();
}
@@ -94,32 +95,30 @@ std::vector<float> Conv2d::getBiases() {
void Conv2d::toCuda() {
CUDA_CHECK(cudaMemcpy(
d_weights, weights.data(),
sizeof(float) * kernelSize * kernelSize * inputChannels * numFilters,
sizeof(float) * kernelSize.first * kernelSize.second * inputChannels * numFilters,
cudaMemcpyHostToDevice
));
CUDA_CHECK(cudaMemcpy(
d_biases, biases.data(),
sizeof(float) * numFilters,
d_biases, biases.data(), sizeof(float) * numFilters,
cudaMemcpyHostToDevice
));
}
float* Conv2d::forward(const float* d_input) {
// Convolve
dim3 block(8,8,8);
dim3 block(8, 8, 8);
dim3 grid(
(outputSize + block.x - 1) / block.x,
(outputSize + block.y - 1) / block.y,
(outputSize.first + block.x - 1) / block.x,
(outputSize.second + block.y - 1) / block.y,
(numFilters + block.z - 1) / block.z
);
CUDANet::Utils::clear(d_output, outputSize * outputSize * numFilters);
CUDANet::Utils::clear(d_output, outputSize.first * outputSize.second * numFilters);
Kernels::convolution<<<grid, block>>>(
d_input, d_weights, d_biases, d_output, inputSize, inputChannels, paddingSize,
kernelSize, stride, numFilters, outputSize
d_input, d_weights, d_biases, d_output, inputSize, inputChannels,
paddingSize, kernelSize, stride, numFilters, outputSize
);
CUDA_CHECK(cudaGetLastError());
@@ -132,9 +131,9 @@ float* Conv2d::forward(const float* d_input) {
}
int Conv2d::getOutputSize() {
return outputSize * outputSize * numFilters;
return outputSize.first * outputSize.second * numFilters;
}
int Conv2d::getInputSize() {
return inputSize * inputSize * inputChannels;
return inputSize.first * inputSize.second * inputChannels;
}
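
A usage sketch of the updated constructor with a non-square input (all values illustrative, not taken from the commit):

CUDANet::Layers::Conv2d conv(
    {32, 64},  // inputSize: 32 rows x 64 columns
    3,         // inputChannels
    {3, 5},    // kernelSize
    {1, 1},    // stride
    16,        // numFilters
    {1, 2},    // paddingSize: per-axis "same" padding for stride 1
    CUDANet::Layers::ActivationType::RELU
);
// Output extents: (32 - 3 + 2 * 1) / 1 + 1 = 32 and (64 - 5 + 2 * 2) / 1 + 1 = 64,
// so conv.getOutputSize() == 32 * 64 * 16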

View File

@@ -1,45 +1,44 @@
#include "max_pooling.cuh"
#include "cuda_helper.cuh"
#include "max_pooling.cuh"
#include "pooling.cuh"
using namespace CUDANet::Layers;
MaxPooling2D::MaxPooling2D(
int inputSize,
int nChannels,
int poolingSize,
int stride,
ActivationType activationType
)
: inputSize(inputSize), nChannels(nChannels), poolingSize(poolingSize), stride(stride) {
dim2d inputSize,
int nChannels,
dim2d poolingSize,
dim2d stride,
ActivationType activationType
)
: inputSize(inputSize),
nChannels(nChannels),
poolingSize(poolingSize),
stride(stride) {
outputSize = {
(inputSize.first - poolingSize.first) / stride.first + 1,
(inputSize.second - poolingSize.second) / stride.second + 1
};
outputSize = (inputSize - poolingSize) / stride + 1;
activation = new Activation(
activationType, outputSize * outputSize * nChannels
);
activation =
new Activation(activationType, outputSize.first * outputSize.second * nChannels);
d_output = nullptr;
CUDA_CHECK(cudaMalloc(
(void**)&d_output, sizeof(float) * outputSize * outputSize * nChannels
(void**)&d_output, sizeof(float) * outputSize.first * outputSize.second * nChannels
));
}
MaxPooling2D::~MaxPooling2D() {
cudaFree(d_output);
delete activation;
}
float* MaxPooling2D::forward(const float* d_input) {
dim3 block(8,8,8);
dim3 block(8, 8, 8);
dim3 grid(
(outputSize + block.x - 1) / block.x,
(outputSize + block.y - 1) / block.y,
(outputSize.first + block.x - 1) / block.x,
(outputSize.second + block.y - 1) / block.y,
(nChannels + block.z - 1) / block.z
);
@@ -55,9 +54,9 @@ float* MaxPooling2D::forward(const float* d_input) {
}
int MaxPooling2D::getOutputSize() {
return outputSize * outputSize * nChannels;
return outputSize.first * outputSize.second * nChannels;
}
int MaxPooling2D::getInputSize() {
return inputSize * inputSize * nChannels;
return inputSize.first * inputSize.second * nChannels;
}

View File

@@ -11,13 +11,13 @@
using namespace CUDANet;
Model::Model(const int inputSize, const int inputChannels, const int outputSize)
Model::Model(const dim2d inputSize, const int inputChannels, const int outputSize)
: inputSize(inputSize),
inputChannels(inputChannels),
outputSize(outputSize),
layers(std::vector<std::pair<std::string, Layers::SequentialLayer*>>()),
layerMap(std::unordered_map<std::string, Layers::SequentialLayer*>()) {
inputLayer = new Layers::Input(inputSize * inputSize * inputChannels);
inputLayer = new Layers::Input(inputSize.first * inputSize.second * inputChannels);
outputLayer = new Layers::Output(outputSize);
};

View File

@@ -6,10 +6,10 @@
#include "avg_pooling.cuh"
TEST(AvgPoolingLayerTest, AvgPoolForwardTest) {
int inputSize = 4;
int nChannels = 2;
int poolingSize = 2;
int stride = 2;
dim2d inputSize = {4, 4};
int nChannels = 2;
dim2d poolingSize = {2, 2};
dim2d stride = {2, 2};
cudaError_t cudaStatus;
@@ -36,13 +36,14 @@ TEST(AvgPoolingLayerTest, AvgPoolForwardTest) {
float *d_input;
cudaStatus = cudaMalloc(
(void **)&d_input, sizeof(float) * inputSize * inputSize * nChannels
(void **)&d_input,
sizeof(float) * inputSize.first * inputSize.second * nChannels
);
EXPECT_EQ(cudaStatus, cudaSuccess);
cudaStatus = cudaMemcpy(
d_input, input.data(),
sizeof(float) * inputSize * inputSize * nChannels,
sizeof(float) * inputSize.first * inputSize.second * nChannels,
cudaMemcpyHostToDevice
);
EXPECT_EQ(cudaStatus, cudaSuccess);
@@ -53,13 +54,13 @@ TEST(AvgPoolingLayerTest, AvgPoolForwardTest) {
std::vector<float> output(outputSize);
cudaStatus = cudaMemcpy(
output.data(), d_output,
sizeof(float) * outputSize,
output.data(), d_output, sizeof(float) * outputSize,
cudaMemcpyDeviceToHost
);
EXPECT_EQ(cudaStatus, cudaSuccess);
std::vector<float> expected = {0.43775f, 0.49475f, 0.48975f, 0.339f, 0.45675f, 0.303f, 0.56975f, 0.57025f};
std::vector<float> expected = {0.43775f, 0.49475f, 0.48975f, 0.339f,
0.45675f, 0.303f, 0.56975f, 0.57025f};
for (int i = 0; i < output.size(); ++i) {
EXPECT_NEAR(expected[i], output[i], 1e-4);

View File

@@ -7,8 +7,8 @@
#include "batch_norm.cuh"
TEST(BatchNormLayerTest, BatchNormSmallForwardTest) {
int inputSize = 4;
int nChannels = 2;
dim2d inputSize = {4, 4};
int nChannels = 2;
cudaError_t cudaStatus;
@@ -17,7 +17,7 @@ TEST(BatchNormLayerTest, BatchNormSmallForwardTest) {
);
std::vector<float> weights = {0.63508f, 0.64903f};
std::vector<float> biases = {0.25079f, 0.66841f};
batchNorm.setWeights(weights.data());
batchNorm.setBiases(biases.data());
@@ -47,27 +47,27 @@ TEST(BatchNormLayerTest, BatchNormSmallForwardTest) {
EXPECT_EQ(cudaStatus, cudaSuccess);
cudaStatus = cudaMemcpy(
d_input, input.data(), sizeof(float) * input.size(), cudaMemcpyHostToDevice
d_input, input.data(), sizeof(float) * input.size(),
cudaMemcpyHostToDevice
);
EXPECT_EQ(cudaStatus, cudaSuccess);
float* d_output = batchNorm.forward(d_input);
cudaStatus = cudaMemcpy(
output.data(), d_output, sizeof(float) * output.size(), cudaMemcpyDeviceToHost
output.data(), d_output, sizeof(float) * output.size(),
cudaMemcpyDeviceToHost
);
EXPECT_EQ(cudaStatus, cudaSuccess);
std::vector<float> expected = {
-0.06007f, 0.951f, 0.18157f, 1.36202f,
0.39244f, 0.47335f, 0.58598f, -1.00188f,
0.59576f, 0.79919f, -0.57001f, 0.70469f,
-0.62847f, -0.06578f, -0.43668f, 0.72952f,
0.37726f, 0.02088f, 0.35446f, 0.98092f,
1.39264f, 1.80686f, 1.67786f, 1.58318f,
-0.0269f, 0.26878f, 0.81411f, 0.09022f,
0.9126f, 0.71485f, -0.08184f, -0.19131f
};
std::vector<float> expected = {-0.06007f, 0.951f, 0.18157f, 1.36202f,
0.39244f, 0.47335f, 0.58598f, -1.00188f,
0.59576f, 0.79919f, -0.57001f, 0.70469f,
-0.62847f, -0.06578f, -0.43668f, 0.72952f,
0.37726f, 0.02088f, 0.35446f, 0.98092f,
1.39264f, 1.80686f, 1.67786f, 1.58318f,
-0.0269f, 0.26878f, 0.81411f, 0.09022f,
0.9126f, 0.71485f, -0.08184f, -0.19131f};
// std::cout << "BatchNorm2D: " << std::endl;
for (int i = 0; i < output.size(); i++) {
@@ -76,5 +76,4 @@ TEST(BatchNormLayerTest, BatchNormSmallForwardTest) {
}
// std::cout << std::endl;
cudaFree(d_input);
}

View File

@@ -8,12 +8,12 @@
class Conv2dTest : public ::testing::Test {
protected:
CUDANet::Layers::Conv2d commonTestSetup(
int inputSize,
dim2d inputSize,
int inputChannels,
int kernelSize,
int stride,
dim2d kernelSize,
dim2d stride,
int numFilters,
int paddingSize,
dim2d paddingSize,
CUDANet::Layers::ActivationType activationType,
std::vector<float>& input,
float* kernels,
@@ -30,7 +30,7 @@ class Conv2dTest : public ::testing::Test {
// Allocate device memory
cudaStatus = cudaMalloc(
(void**)&d_input,
sizeof(float) * inputSize * inputSize * inputChannels
sizeof(float) * inputSize.first * inputSize.second * inputChannels
);
EXPECT_EQ(cudaStatus, cudaSuccess);
@@ -47,19 +47,18 @@ class Conv2dTest : public ::testing::Test {
void commonTestTeardown(float* d_input) {
// Free device memory
cudaFree(d_input);
}
cudaError_t cudaStatus;
};
TEST_F(Conv2dTest, SimpleTest) {
int inputSize = 4;
int inputChannels = 1;
int kernelSize = 2;
int stride = 1;
int numFilters = 1;
int paddingSize = 0;
dim2d inputSize = {4, 4};
int inputChannels = 1;
dim2d kernelSize = {2, 2};
dim2d stride = {1, 1};
int numFilters = 1;
dim2d paddingSize = {0, 0};
CUDANet::Layers::ActivationType activationType =
CUDANet::Layers::ActivationType::NONE;
@@ -82,8 +81,9 @@ TEST_F(Conv2dTest, SimpleTest) {
activationType, input, kernels.data(), d_input
);
int outputWidth = (inputSize - kernelSize) / stride + 1;
int outputSize = outputWidth * outputWidth * numFilters;
int outputHeight = (inputSize.first - kernelSize.first) / stride.first + 1;
int outputWidth = (inputSize.second - kernelSize.second) / stride.second + 1;
int outputSize = outputHeight * outputWidth * numFilters;
EXPECT_EQ(outputSize, conv2d.getOutputSize());
d_output = conv2d.forward(d_input);
@@ -106,12 +106,16 @@ TEST_F(Conv2dTest, SimpleTest) {
}
TEST_F(Conv2dTest, PaddedTest) {
int inputSize = 5;
int inputChannels = 3;
int kernelSize = 3;
int stride = 1;
int numFilters = 2;
int paddingSize = CUDANET_SAME_PADDING(inputSize, kernelSize, stride);
dim2d inputSize = {5, 5};
int inputChannels = 3;
dim2d kernelSize = {3, 3};
dim2d stride = {1, 1};
int numFilters = 2;
int paddingFirst = CUDANET_SAME_PADDING(inputSize.first, kernelSize.first, stride.first);
int paddingSecond = CUDANET_SAME_PADDING(inputSize.second, kernelSize.second, stride.second);
dim2d paddingSize = {paddingFirst, paddingSecond};
CUDANet::Layers::ActivationType activationType =
CUDANet::Layers::ActivationType::NONE;
@@ -173,16 +177,14 @@ TEST_F(Conv2dTest, PaddedTest) {
activationType, input, kernels.data(), d_input
);
EXPECT_EQ(inputSize * inputSize * numFilters, conv2d.getOutputSize());
EXPECT_EQ(inputSize.first * inputSize.second * numFilters, conv2d.getOutputSize());
d_output = conv2d.forward(d_input);
std::vector<float> output(
conv2d.getOutputSize()
);
std::vector<float> output(conv2d.getOutputSize());
cudaMemcpy(
output.data(), d_output,
sizeof(float) * conv2d.getOutputSize(), cudaMemcpyDeviceToHost
output.data(), d_output, sizeof(float) * conv2d.getOutputSize(),
cudaMemcpyDeviceToHost
);
// Generated by tools/generate_conv2d_test.py
@@ -206,12 +208,17 @@ TEST_F(Conv2dTest, PaddedTest) {
}
TEST_F(Conv2dTest, StridedPaddedConvolution) {
int inputSize = 5;
dim2d inputSize = {5, 5};
int inputChannels = 2;
int kernelSize = 3;
int stride = 2;
dim2d kernelSize = {3, 3};
dim2d stride = {2, 2};
int numFilters = 2;
int paddingSize = CUDANET_SAME_PADDING(inputSize, kernelSize, stride);
int paddingFirst = CUDANET_SAME_PADDING(inputSize.first, kernelSize.first, stride.first);
int paddingSecond = CUDANET_SAME_PADDING(inputSize.second, kernelSize.second, stride.second);
dim2d paddingSize = {paddingFirst, paddingSecond};
CUDANet::Layers::ActivationType activationType =
CUDANet::Layers::ActivationType::RELU;
@@ -258,16 +265,13 @@ TEST_F(Conv2dTest, StridedPaddedConvolution) {
activationType, input, kernels.data(), d_input
);
EXPECT_EQ(inputSize * inputSize * numFilters, conv2d.getOutputSize());
EXPECT_EQ(inputSize.first * inputSize.second * numFilters, conv2d.getOutputSize());
d_output = conv2d.forward(d_input);
std::vector<float> output(
conv2d.getOutputSize()
);
std::vector<float> output(conv2d.getOutputSize());
cudaMemcpy(
output.data(), d_output,
sizeof(float) * conv2d.getOutputSize(),
output.data(), d_output, sizeof(float) * conv2d.getOutputSize(),
cudaMemcpyDeviceToHost
);

View File

@@ -6,10 +6,10 @@
#include "max_pooling.cuh"
TEST(MaxPoolingLayerTest, MaxPoolForwardTest) {
int inputSize = 4;
int nChannels = 2;
int poolingSize = 2;
int stride = 2;
dim2d inputSize = {4, 4};
int nChannels = 2;
dim2d poolingSize = {2, 2};
dim2d stride = {2, 2};
cudaError_t cudaStatus;
@@ -36,13 +36,13 @@ TEST(MaxPoolingLayerTest, MaxPoolForwardTest) {
float *d_input;
cudaStatus = cudaMalloc(
(void **)&d_input, sizeof(float) * inputSize * inputSize * nChannels
(void **)&d_input, sizeof(float) * inputSize.first * inputSize.second * nChannels
);
EXPECT_EQ(cudaStatus, cudaSuccess);
cudaStatus = cudaMemcpy(
d_input, input.data(),
sizeof(float) * inputSize * inputSize * nChannels,
sizeof(float) * inputSize.first * inputSize.second * nChannels,
cudaMemcpyHostToDevice
);
EXPECT_EQ(cudaStatus, cudaSuccess);
@@ -53,13 +53,13 @@ TEST(MaxPoolingLayerTest, MaxPoolForwardTest) {
std::vector<float> output(outputSize);
cudaStatus = cudaMemcpy(
output.data(), d_output,
sizeof(float) * outputSize,
output.data(), d_output, sizeof(float) * outputSize,
cudaMemcpyDeviceToHost
);
EXPECT_EQ(cudaStatus, cudaSuccess);
std::vector<float> expected = {0.619f, 0.732f, 0.712f, 0.742f, 0.919f, 0.973f, 0.819f, 0.85f};
std::vector<float> expected = {0.619f, 0.732f, 0.712f, 0.742f,
0.919f, 0.973f, 0.819f, 0.85f};
for (int i = 0; i < output.size(); ++i) {
EXPECT_FLOAT_EQ(expected[i], output[i]);

View File

@@ -10,27 +10,26 @@ class ModelTest : public ::testing::Test {
CUDANet::Model *commonTestSetup(
bool setWeights = true,
int inputSize = 6,
int inputChannels = 2,
int outputSize = 3,
dim2d inputSize = {6, 6},
int inputChannels = 2,
int outputSize = 3,
int kernelSize = 3,
int stride = 1,
int numFilters = 2,
dim2d kernelSize = {3, 3},
dim2d stride = {1, 1},
int numFilters = 2,
int poolingSize = 2,
int poolingStride = 2
dim2d poolingSize = {2, 2},
dim2d poolingStride = {2, 2}
) {
CUDANet::Model *model =
new CUDANet::Model(inputSize, inputChannels, outputSize);
int paddingSize = 0;
dim2d paddingSize = {0, 0};
// Conv2d
CUDANet::Layers::Conv2d *conv2d = new CUDANet::Layers::Conv2d(
inputSize, inputChannels, kernelSize, stride, numFilters,
paddingSize,
CUDANet::Layers::ActivationType::NONE
paddingSize, CUDANet::Layers::ActivationType::NONE
);
if (setWeights) {
@@ -39,9 +38,13 @@ class ModelTest : public ::testing::Test {
model->addLayer("conv1", conv2d);
// maxpool2d
dim2d poolingInput = {
inputSize.first - kernelSize.first + 1,
inputSize.second - kernelSize.second + 1
};
CUDANet::Layers::MaxPooling2D *maxpool2d =
new CUDANet::Layers::MaxPooling2D(
inputSize - kernelSize + 1, numFilters, poolingSize,
poolingInput, numFilters, poolingSize,
poolingStride, CUDANet::Layers::ActivationType::RELU
);
model->addLayer("maxpool1", maxpool2d);