Add support for non-square matrices

Replace the scalar int size parameters with dim2d (height, width) pairs throughout the kernels and layers, so inputs, kernels, strides, padding, and outputs can differ per dimension.

commit 74098b24e3 (parent 6f8b5f4081)
2024-05-20 15:20:43 +02:00
21 changed files with 314 additions and 299 deletions
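Throughout the diff, scalar int sizes become a two-dimensional dim2d. The type itself is not shown in this commit; its .first/.second and brace-initializer usage suggests a pair alias along these lines (an assumption, not confirmed here):

    #include <utility>
    // Assumed shape: (height, width), i.e. .first = rows, .second = columns
    using dim2d = std::pair<int, int>;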

File: convolution kernel

@@ -9,19 +9,19 @@ __global__ void Kernels::convolution(
const float* __restrict__ d_kernel,
const float* __restrict__ d_bias,
float* __restrict__ d_output,
const int inputSize,
const int nChannels,
const int paddingSize,
const int kernelSize,
const int stride,
const int nFilters,
const int outputSize
const dim2d inputSize,
const int nChannels,
const dim2d paddingSize,
const dim2d kernelSize,
const dim2d stride,
const int nFilters,
const dim2d outputSize
) {
int j = blockDim.x * blockIdx.x + threadIdx.x;
int i = blockDim.y * blockIdx.y + threadIdx.y;
int f = blockDim.z * blockIdx.z + threadIdx.z;
if (i >= outputSize || j >= outputSize || f >= nFilters) {
if (i >= outputSize.first || j >= outputSize.second || f >= nFilters) {
return;
}
@@ -29,28 +29,32 @@ __global__ void Kernels::convolution(
// Iterate over kernel and input matrix
for (int c = 0; c < nChannels; c++) {
for (int k = 0; k < kernelSize; k++) {
for (int l = 0; l < kernelSize; l++) {
for (int k = 0; k < kernelSize.first; k++) {
for (int l = 0; l < kernelSize.second; l++) {
// Skip if (i, j) maps into the padding region
if (i * stride + k < paddingSize ||
i * stride + k >= (inputSize + paddingSize) ||
j * stride + l < paddingSize ||
j * stride + l >= (inputSize + paddingSize)) {
if (i * stride.first + k < paddingSize.first ||
i * stride.first + k >=
(inputSize.first + paddingSize.first) ||
j * stride.second + l < paddingSize.second ||
j * stride.second + l >=
(inputSize.second + paddingSize.second)) {
continue;
}
int kernelIndex = f * kernelSize * kernelSize * nChannels +
c * kernelSize * kernelSize + k * kernelSize +
l;
int inputIndex = c * inputSize * inputSize +
(i * stride + k - paddingSize) * inputSize +
(j * stride + l - paddingSize);
int kernelIndex =
f * kernelSize.first * kernelSize.second * nChannels +
c * kernelSize.first * kernelSize.second +
k * kernelSize.second + l;
int inputIndex = c * inputSize.first * inputSize.second +
(i * stride.first + k - paddingSize.first) *
inputSize.second +
(j * stride.second + l - paddingSize.second);
sum += d_kernel[kernelIndex] * d_input[inputIndex];
}
}
}
d_output[f * outputSize * outputSize + i * outputSize + j] = sum + d_bias[f];
d_output[f * outputSize.first * outputSize.second + i * outputSize.second + j] =
sum + d_bias[f];
}
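The kernel flattens channel-major (NCHW-style) buffers by hand; the same index math can be checked on the host. A minimal sketch, assuming the dim2d alias above (helper names are hypothetical):

    #include <utility>
    using dim2d = std::pair<int, int>;

    // Row-major offset into a [channels][height][width] input buffer
    int inputIndex(int c, int row, int col, dim2d inputSize) {
        return c * inputSize.first * inputSize.second + row * inputSize.second + col;
    }

    // Row-major offset into a [filters][height][width] output buffer
    int outputIndex(int f, int i, int j, dim2d outputSize) {
        return f * outputSize.first * outputSize.second + i * outputSize.second + j;
    }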

File: pooling kernels

@@ -1,4 +1,5 @@
#include "cuda_helper.cuh"
#include "layer.cuh"
#include "pooling.cuh"
using namespace CUDANet;
@@ -6,26 +7,27 @@ using namespace CUDANet;
__global__ void Kernels::max_pooling(
const float* __restrict__ d_input,
float* __restrict__ d_output,
const int inputSize,
const int outputSize,
const int nChannels,
const int poolingSize,
const int stride
const dim2d inputSize,
const dim2d outputSize,
const int nChannels,
const dim2d poolingSize,
const dim2d stride
) {
int j = blockDim.x * blockIdx.x + threadIdx.x;
int i = blockDim.y * blockIdx.y + threadIdx.y;
int c = blockDim.z * blockIdx.z + threadIdx.z;
if (i >= outputSize || j >= outputSize || c >= nChannels) {
if (i >= outputSize.first || j >= outputSize.second || c >= nChannels) {
return;
}
float max = 0.0f;
for (int k = 0; k < poolingSize; k++) {
for (int l = 0; l < poolingSize; l++) {
int inputIndex = c * inputSize * inputSize +
(i * stride + k) * inputSize + (j * stride + l);
for (int k = 0; k < poolingSize.first; k++) {
for (int l = 0; l < poolingSize.second; l++) {
int inputIndex = c * inputSize.first * inputSize.second +
(i * stride.first + k) * inputSize.second +
(j * stride.second + l);
if (d_input[inputIndex] > max) {
max = d_input[inputIndex];
@@ -33,37 +35,41 @@ __global__ void Kernels::max_pooling(
}
}
d_output[c * outputSize * outputSize + i * outputSize + j] = max;
d_output
[c * outputSize.first * outputSize.second + i * outputSize.second + j] =
max;
}
__global__ void Kernels::avg_pooling(
const float* __restrict__ d_input,
float* __restrict__ d_output,
const int inputSize,
const int outputSize,
const int nChannels,
const int poolingSize,
const int stride
const dim2d inputSize,
const dim2d outputSize,
const int nChannels,
const dim2d poolingSize,
const dim2d stride
) {
int j = blockDim.x * blockIdx.x + threadIdx.x;
int i = blockDim.y * blockIdx.y + threadIdx.y;
int c = blockDim.z * blockIdx.z + threadIdx.z;
if (i >= outputSize || j >= outputSize || c >= outputSize) {
if (i >= outputSize.first || j >= outputSize.second || c >= nChannels) {
return;
}
float sum = 0.0f;
for (int k = 0; k < poolingSize; k++) {
for (int l = 0; l < poolingSize; l++) {
int inputIndex = c * inputSize * inputSize +
(i * stride + k) * inputSize + (j * stride + l);
for (int k = 0; k < poolingSize.first; k++) {
for (int l = 0; l < poolingSize.second; l++) {
int inputIndex = c * inputSize.first * inputSize.second +
(i * stride.first + k) * inputSize.second +
(j * stride.second + l);
sum += d_input[inputIndex];
}
}
d_output[c * outputSize * outputSize + i * outputSize + j] =
sum / (poolingSize * poolingSize);
d_output
[c * outputSize.first * outputSize.second + i * outputSize.second + j] =
sum / (poolingSize.first * poolingSize.second);
}
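A CPU reference for one max-pooling window makes the indexing above easier to follow. Note the running maximum starts at 0.0f, as in the kernel, which is only correct for non-negative inputs (e.g. post-ReLU activations). A sketch assuming the dim2d alias from before (function name hypothetical):

    #include <utility>
    using dim2d = std::pair<int, int>;

    float maxPoolWindow(const float* in, int c, int i, int j,
                        dim2d inputSize, dim2d poolingSize, dim2d stride) {
        float m = 0.0f;  // mirrors the kernel; signed inputs would need -FLT_MAX
        for (int k = 0; k < poolingSize.first; k++) {
            for (int l = 0; l < poolingSize.second; l++) {
                int idx = c * inputSize.first * inputSize.second +
                          (i * stride.first + k) * inputSize.second +
                          (j * stride.second + l);
                if (in[idx] > m) m = in[idx];
            }
        }
        return m;
    }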

File: AvgPooling2D layer

@@ -5,24 +5,29 @@
using namespace CUDANet::Layers;
AvgPooling2D::AvgPooling2D(
int inputSize,
dim2d inputSize,
int nChannels,
int poolingSize,
int stride,
dim2d poolingSize,
dim2d stride,
ActivationType activationType
)
: inputSize(inputSize),
nChannels(nChannels),
poolingSize(poolingSize),
stride(stride) {
outputSize = (inputSize - poolingSize) / stride + 1;
outputSize = {
(inputSize.first - poolingSize.first) / stride.first + 1,
(inputSize.second - poolingSize.second) / stride.second + 1
};
activation =
new Activation(activationType, outputSize * outputSize * nChannels);
activation = new Activation(
activationType, outputSize.first * outputSize.second * nChannels
);
d_output = nullptr;
CUDA_CHECK(cudaMalloc(
(void**)&d_output, sizeof(float) * outputSize * outputSize * nChannels
(void**)&d_output,
sizeof(float) * outputSize.first * outputSize.second * nChannels
));
}
@@ -32,11 +37,10 @@ AvgPooling2D::~AvgPooling2D() {
}
float* AvgPooling2D::forward(const float* d_input) {
dim3 block(8, 8, 8);
dim3 grid(
(outputSize + block.x - 1) / block.x,
(outputSize + block.y - 1) / block.y,
(outputSize.first + block.x - 1) / block.x,
(outputSize.second + block.y - 1) / block.y,
(nChannels + block.z - 1) / block.z
);
@@ -52,9 +56,9 @@ float* AvgPooling2D::forward(const float* d_input) {
}
int AvgPooling2D::getOutputSize() {
return outputSize * outputSize * nChannels;
return outputSize.first * outputSize.second * nChannels;
}
int AvgPooling2D::getInputSize() {
return inputSize * inputSize * nChannels;
return inputSize.first * inputSize.second * nChannels;
}
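Both pooling layers now size each output dimension independently with the usual (in - pool) / stride + 1 rule. A quick host-side check, assuming the dim2d alias from before (helper name hypothetical):

    #include <utility>
    using dim2d = std::pair<int, int>;

    dim2d poolOutputSize(dim2d in, dim2d pool, dim2d stride) {
        return {(in.first - pool.first) / stride.first + 1,
                (in.second - pool.second) / stride.second + 1};
    }
    // e.g. a 6x4 input with 2x2 pooling and 2x2 stride yields a 3x2 output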

File: BatchNorm2D layer

@@ -10,31 +10,36 @@
using namespace CUDANet::Layers;
BatchNorm2D::BatchNorm2D(
int inputSize,
dim2d inputSize,
int inputChannels,
float epsilon,
ActivationType activationType
)
: inputSize(inputSize), inputChannels(inputChannels) {
activation =
new Activation(activationType, inputSize * inputSize * inputChannels);
activation = new Activation(
activationType, inputSize.first * inputSize.second * inputChannels
);
d_output = nullptr;
CUDA_CHECK(cudaMalloc(
(void **)&d_output,
sizeof(float) * inputSize * inputSize * inputChannels
sizeof(float) * inputSize.first * inputSize.second * inputChannels
));
d_mean = nullptr;
CUDA_CHECK(cudaMalloc((void **)&d_mean, sizeof(float) * inputSize * inputSize));
CUDA_CHECK(cudaMalloc(
(void **)&d_mean, sizeof(float) * inputSize.first * inputSize.second
));
d_mean_sub = nullptr;
CUDA_CHECK(
cudaMalloc((void **)&d_mean_sub, sizeof(float) * inputSize * inputSize)
);
CUDA_CHECK(cudaMalloc(
(void **)&d_mean_sub, sizeof(float) * inputSize.first * inputSize.second
));
d_sqrt_var = nullptr;
CUDA_CHECK(cudaMalloc((void **)&d_sqrt_var, sizeof(float) * inputSize * inputSize));
CUDA_CHECK(cudaMalloc(
(void **)&d_sqrt_var, sizeof(float) * inputSize.first * inputSize.second
));
d_weights = nullptr;
CUDA_CHECK(cudaMalloc((void **)&d_weights, sizeof(float) * inputChannels));
@@ -42,14 +47,18 @@ BatchNorm2D::BatchNorm2D(
d_biases = nullptr;
CUDA_CHECK(cudaMalloc((void **)&d_biases, sizeof(float) * inputChannels));
d_length = nullptr;
float length = (float) inputSize * inputSize;
d_length = nullptr;
float length = (float)inputSize.first * inputSize.second;
CUDA_CHECK(cudaMalloc((void **)&d_length, sizeof(float)));
CUDA_CHECK(cudaMemcpy(d_length, &length, sizeof(float), cudaMemcpyHostToDevice));
CUDA_CHECK(
cudaMemcpy(d_length, &length, sizeof(float), cudaMemcpyHostToDevice)
);
d_epsilon = nullptr;
CUDA_CHECK(cudaMalloc((void **)&d_epsilon, sizeof(float)));
CUDA_CHECK(cudaMemcpy(d_epsilon, &epsilon, sizeof(float), cudaMemcpyHostToDevice));
CUDA_CHECK(
cudaMemcpy(d_epsilon, &epsilon, sizeof(float), cudaMemcpyHostToDevice)
);
weights.resize(inputChannels);
biases.resize(inputChannels);
@@ -60,7 +69,7 @@ BatchNorm2D::BatchNorm2D(
toCuda();
gridSize =
(inputSize * inputSize + BLOCK_SIZE - 1) / BLOCK_SIZE;
(inputSize.first * inputSize.second + BLOCK_SIZE - 1) / BLOCK_SIZE;
}
BatchNorm2D::~BatchNorm2D() {
@@ -112,84 +121,67 @@ void BatchNorm2D::toCuda() {
}
int BatchNorm2D::getInputSize() {
return inputSize * inputSize * inputChannels;
return inputSize.first * inputSize.second * inputChannels;
}
int BatchNorm2D::getOutputSize() {
return inputSize * inputSize * inputChannels;
return inputSize.first * inputSize.second * inputChannels;
}
float *BatchNorm2D::forward(const float *d_input) {
// Compute per-channel batch normalization
for (int i = 0; i < inputChannels; i++) {
// Compute mean
Utils::mean(
d_input + i * inputSize * inputSize,
d_mean,
d_length,
inputSize * inputSize
d_input + i * inputSize.first * inputSize.second, d_mean, d_length,
inputSize.first * inputSize.second
);
// Subtract mean from input
Kernels::vec_scalar_sub<<<gridSize, BLOCK_SIZE>>>(
d_input + i * inputSize * inputSize,
d_mean_sub,
&d_mean[0],
inputSize * inputSize
d_input + i * inputSize.first * inputSize.second, d_mean_sub,
&d_mean[0], inputSize.first * inputSize.second
);
CUDA_CHECK(cudaGetLastError());
// Compute variance
Utils::var(
d_mean_sub,
d_sqrt_var,
d_length,
inputSize * inputSize
d_mean_sub, d_sqrt_var, d_length, inputSize.first * inputSize.second
);
// Add epsilon to variance to avoid division by zero
Kernels::vec_scalar_add<<<gridSize, BLOCK_SIZE>>>(
d_sqrt_var,
d_sqrt_var,
&d_epsilon[0],
inputSize * inputSize
d_sqrt_var, d_sqrt_var, &d_epsilon[0],
inputSize.first * inputSize.second
);
CUDA_CHECK(cudaGetLastError());
// Compute square root of variance
Kernels::vec_sqrt<<<gridSize, BLOCK_SIZE>>>(
d_sqrt_var,
d_sqrt_var,
inputSize * inputSize
d_sqrt_var, d_sqrt_var, inputSize.first * inputSize.second
);
CUDA_CHECK(cudaGetLastError());
// Divide by square root of variance
Kernels::vec_scalar_div<<<gridSize, BLOCK_SIZE>>>(
d_mean_sub,
d_output + i * inputSize * inputSize,
&d_sqrt_var[0],
inputSize * inputSize
d_mean_sub, d_output + i * inputSize.first * inputSize.second,
&d_sqrt_var[0], inputSize.first * inputSize.second
);
CUDA_CHECK(cudaGetLastError());
// Multiply by weights
Kernels::vec_scalar_mul<<<gridSize, BLOCK_SIZE>>>(
d_output + i * inputSize * inputSize,
d_output + i * inputSize * inputSize,
&d_weights[i],
inputSize * inputSize
d_output + i * inputSize.first * inputSize.second,
d_output + i * inputSize.first * inputSize.second, &d_weights[i],
inputSize.first * inputSize.second
);
CUDA_CHECK(cudaGetLastError());
// Add biases
Kernels::vec_scalar_add<<<gridSize, BLOCK_SIZE>>>(
d_output + i * inputSize * inputSize,
d_output + i * inputSize * inputSize,
&d_biases[i],
inputSize * inputSize
d_output + i * inputSize.first * inputSize.second,
d_output + i * inputSize.first * inputSize.second, &d_biases[i],
inputSize.first * inputSize.second
);
CUDA_CHECK(cudaGetLastError());
}
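The per-channel sequence above (mean, subtract, variance, add epsilon, square root, divide, scale, shift) is the standard normalization y = gamma * (x - mean) / sqrt(var + eps) + beta. A CPU reference sketch of one channel, for comparison only (names hypothetical):

    #include <cmath>

    void batchNormChannel(const float* x, float* y, int n,
                          float gamma, float beta, float eps) {
        float mean = 0.0f;
        for (int i = 0; i < n; i++) mean += x[i];
        mean /= n;
        float var = 0.0f;
        for (int i = 0; i < n; i++) var += (x[i] - mean) * (x[i] - mean);
        var /= n;
        float invStd = 1.0f / std::sqrt(var + eps);
        for (int i = 0; i < n; i++) y[i] = gamma * (x[i] - mean) * invStd + beta;
    }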

File: Conv2d layer

@@ -1,23 +1,23 @@
#include <iostream>
#include <vector>
#include "activation.cuh"
#include "conv2d.cuh"
#include "convolution.cuh"
#include "cuda_helper.cuh"
#include "matmul.cuh"
#include "layer.cuh"
#include "matmul.cuh"
#include "vector.cuh"
#include <iostream>
#include <vector>
using namespace CUDANet::Layers;
Conv2d::Conv2d(
int inputSize,
int inputChannels,
int kernelSize,
int stride,
int numFilters,
int paddingSize,
dim2d inputSize,
int inputChannels,
dim2d kernelSize,
dim2d stride,
int numFilters,
dim2d paddingSize,
ActivationType activationType
)
: inputSize(inputSize),
@@ -26,34 +26,35 @@ Conv2d::Conv2d(
stride(stride),
numFilters(numFilters),
paddingSize(paddingSize) {
outputSize = {
(inputSize.first - kernelSize.first + 2 * paddingSize.first) /
stride.first + 1,
(inputSize.second - kernelSize.second + 2 * paddingSize.second) /
stride.second + 1
};
outputSize = (inputSize - kernelSize + 2 * paddingSize) / stride + 1;
activation = new Activation(
activationType, outputSize * outputSize * numFilters
);
activation =
new Activation(activationType, outputSize.first * outputSize.second * numFilters);
d_output = nullptr;
CUDA_CHECK(cudaMalloc(
(void**)&d_output, sizeof(float) * outputSize * outputSize * numFilters
(void**)&d_output, sizeof(float) * outputSize.first * outputSize.second * numFilters
));
weights.resize(kernelSize * kernelSize * inputChannels * numFilters);
weights.resize(kernelSize.first * kernelSize.second * inputChannels * numFilters);
initializeWeights();
d_weights = nullptr;
CUDA_CHECK(cudaMalloc(
(void**)&d_weights,
sizeof(float) * kernelSize * kernelSize * inputChannels * numFilters
sizeof(float) * kernelSize.first * kernelSize.second * inputChannels * numFilters
));
biases.resize(numFilters);
initializeBiases();
d_biases = nullptr;
CUDA_CHECK(cudaMalloc(
(void**)&d_biases, sizeof(float) * numFilters
));
CUDA_CHECK(cudaMalloc((void**)&d_biases, sizeof(float) * numFilters));
toCuda();
}
@@ -94,35 +95,33 @@ std::vector<float> Conv2d::getBiases() {
void Conv2d::toCuda() {
CUDA_CHECK(cudaMemcpy(
d_weights, weights.data(),
sizeof(float) * kernelSize * kernelSize * inputChannels * numFilters,
sizeof(float) * kernelSize.first * kernelSize.second * inputChannels * numFilters,
cudaMemcpyHostToDevice
));
CUDA_CHECK(cudaMemcpy(
d_biases, biases.data(),
sizeof(float) * numFilters,
d_biases, biases.data(), sizeof(float) * numFilters,
cudaMemcpyHostToDevice
));
}
float* Conv2d::forward(const float* d_input) {
// Convolve
dim3 block(8,8,8);
dim3 block(8, 8, 8);
dim3 grid(
(outputSize + block.x - 1) / block.x,
(outputSize + block.y - 1) / block.y,
(outputSize.first + block.x - 1) / block.x,
(outputSize.second + block.y - 1) / block.y,
(numFilters + block.z - 1) / block.z
);
CUDANet::Utils::clear(d_output, outputSize * outputSize * numFilters);
CUDANet::Utils::clear(d_output, outputSize.first * outputSize.second * numFilters);
Kernels::convolution<<<grid, block>>>(
d_input, d_weights, d_biases, d_output, inputSize, inputChannels, paddingSize,
kernelSize, stride, numFilters, outputSize
d_input, d_weights, d_biases, d_output, inputSize, inputChannels,
paddingSize, kernelSize, stride, numFilters, outputSize
);
CUDA_CHECK(cudaGetLastError());
// Apply activation
activation->activate(d_output);
@@ -132,9 +131,9 @@ float* Conv2d::forward(const float* d_input) {
}
int Conv2d::getOutputSize() {
return outputSize * outputSize * numFilters;
return outputSize.first * outputSize.second * numFilters;
}
int Conv2d::getInputSize() {
return inputSize * inputSize * inputChannels;
return inputSize.first * inputSize.second * inputChannels;
}
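Conv2d sizes each output dimension independently via (in - k + 2*pad) / stride + 1. A quick check, assuming the dim2d alias from before (helper name hypothetical):

    #include <utility>
    using dim2d = std::pair<int, int>;

    dim2d convOutputSize(dim2d in, dim2d k, dim2d pad, dim2d stride) {
        return {(in.first - k.first + 2 * pad.first) / stride.first + 1,
                (in.second - k.second + 2 * pad.second) / stride.second + 1};
    }
    // e.g. a 32x64 input, 3x5 kernel, 1x2 padding, 1x1 stride keeps a 32x64 output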

File: MaxPooling2D layer

@@ -1,45 +1,44 @@
#include "max_pooling.cuh"
#include "cuda_helper.cuh"
#include "max_pooling.cuh"
#include "pooling.cuh"
using namespace CUDANet::Layers;
MaxPooling2D::MaxPooling2D(
int inputSize,
int nChannels,
int poolingSize,
int stride,
ActivationType activationType
)
: inputSize(inputSize), nChannels(nChannels), poolingSize(poolingSize), stride(stride) {
dim2d inputSize,
int nChannels,
dim2d poolingSize,
dim2d stride,
ActivationType activationType
)
: inputSize(inputSize),
nChannels(nChannels),
poolingSize(poolingSize),
stride(stride) {
outputSize = {
(inputSize.first - poolingSize.first) / stride.first + 1,
(inputSize.second - poolingSize.second) / stride.second + 1
};
outputSize = (inputSize - poolingSize) / stride + 1;
activation = new Activation(
activationType, outputSize * outputSize * nChannels
);
activation =
new Activation(activationType, outputSize.first * outputSize.second * nChannels);
d_output = nullptr;
CUDA_CHECK(cudaMalloc(
(void**)&d_output, sizeof(float) * outputSize * outputSize * nChannels
(void**)&d_output, sizeof(float) * outputSize.first * outputSize.second * nChannels
));
}
MaxPooling2D::~MaxPooling2D() {
cudaFree(d_output);
delete activation;
}
float* MaxPooling2D::forward(const float* d_input) {
dim3 block(8,8,8);
dim3 block(8, 8, 8);
dim3 grid(
(outputSize + block.x - 1) / block.x,
(outputSize + block.y - 1) / block.y,
(outputSize.first + block.x - 1) / block.x,
(outputSize.second + block.y - 1) / block.y,
(nChannels + block.z - 1) / block.z
);
@@ -55,9 +54,9 @@ float* MaxPooling2D::forward(const float* d_input) {
}
int MaxPooling2D::getOutputSize() {
return outputSize * outputSize * nChannels;
return outputSize.first * outputSize.second * nChannels;
}
int MaxPooling2D::getInputSize() {
return inputSize * inputSize * nChannels;
return inputSize.first * inputSize.second * nChannels;
}
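Call sites now pass per-dimension pairs. A hedged usage sketch (the ActivationType value is assumed, not shown in this diff):

    using namespace CUDANet::Layers;
    // 16x24 input, 3 channels, 2x2 pooling, 2x2 stride
    MaxPooling2D pool({16, 24}, 3, {2, 2}, {2, 2}, ActivationType::NONE);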

File: Model

@@ -11,13 +11,13 @@
using namespace CUDANet;
Model::Model(const int inputSize, const int inputChannels, const int outputSize)
Model::Model(const dim2d inputSize, const int inputChannels, const int outputSize)
: inputSize(inputSize),
inputChannels(inputChannels),
outputSize(outputSize),
layers(std::vector<std::pair<std::string, Layers::SequentialLayer*>>()),
layerMap(std::unordered_map<std::string, Layers::SequentialLayer*>()) {
inputLayer = new Layers::Input(inputSize * inputSize * inputChannels);
inputLayer = new Layers::Input(inputSize.first * inputSize.second * inputChannels);
outputLayer = new Layers::Output(outputSize);
};
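Model construction follows the same pattern; a hedged sketch with illustrative values:

    // 224x224 RGB input, 1000-way output
    CUDANet::Model model({224, 224}, 3, 1000);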