#include <algorithm>  // std::fill, std::copy
#include <iostream>
#include <string>

#include "activations.cuh"
#include "conv2d.cuh"
#include "convolution.cuh"
#include "cuda_helper.cuh"
#include "matmul.cuh"

Layers::Conv2d::Conv2d(
    int inputSize,
    int inputChannels,
    int kernelSize,
    int stride,
    Layers::Padding padding,
    int numFilters,
    Layers::Activation activation
)
    : inputSize(inputSize),
      inputChannels(inputChannels),
      kernelSize(kernelSize),
      stride(stride),
      numFilters(numFilters),
      activation(activation) {
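    // Derive the output size and padding width from the padding mode. For
    // SAME, the padding is chosen so that (i + 2p - k) / s + 1 == i.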
    switch (padding) {
        case SAME:
            outputSize  = inputSize;
            paddingSize = ((stride - 1) * inputSize - stride + kernelSize) / 2;
            break;

        case VALID:
            paddingSize = 0;
            outputSize  = (inputSize - kernelSize) / stride + 1;
            break;

        default:
            break;
    }

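    // Device buffer for the layer output: outputSize * outputSize * numFilters.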
    d_output = nullptr;
    CUDA_CHECK(cudaMalloc(
        (void**)&d_output,
        sizeof(float) * outputSize * outputSize * numFilters
    ));

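    // Host-side weights, zeroed here; real values arrive via setWeights().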
    weights.resize(kernelSize * kernelSize * inputChannels * numFilters);
    initializeWeights();

    d_weights = nullptr;
    CUDA_CHECK(cudaMalloc(
        (void**)&d_weights,
        sizeof(float) * kernelSize * kernelSize * inputChannels * numFilters
    ));

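    // Biases are stored per output element (outputSize^2 * numFilters), not
    // per filter, to match the element-wise vec_vec_add in forward().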
    biases.resize(outputSize * outputSize * numFilters);
    initializeBiases();

    d_biases = nullptr;
    CUDA_CHECK(cudaMalloc(
        (void**)&d_biases, sizeof(float) * outputSize * outputSize * numFilters
    ));

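    // Scratch buffer for the zero-padded input used by forward().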
    d_padded = nullptr;
    CUDA_CHECK(cudaMalloc(
        (void**)&d_padded, sizeof(float) * (inputSize + 2 * paddingSize) *
                               (inputSize + 2 * paddingSize) * inputChannels
    ));

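    // Push the freshly initialized parameters to the device.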
    toCuda();
}

Layers::Conv2d::~Conv2d() {
    cudaFree(d_output);
    cudaFree(d_weights);
    cudaFree(d_biases);
    cudaFree(d_padded);
}

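// Parameters start at zero; trained values are expected to be loaded later
// through setWeights() and setBiases().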
void Layers::Conv2d::initializeWeights() {
    std::fill(weights.begin(), weights.end(), 0.0f);
}

void Layers::Conv2d::initializeBiases() {
    std::fill(biases.begin(), biases.end(), 0.0f);
}

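// Overwrite the host-side parameters and mirror them to the device. The
// caller must supply at least weights.size() / biases.size() floats.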
void Layers::Conv2d::setWeights(const float* weights_input) {
    std::copy(weights_input, weights_input + weights.size(), weights.begin());
    toCuda();
}

void Layers::Conv2d::setBiases(const float* biases_input) {
    std::copy(biases_input, biases_input + biases.size(), biases.begin());
    toCuda();
}

void Layers::Conv2d::toCuda() {
    CUDA_CHECK(cudaMemcpy(
        d_weights, weights.data(),
        sizeof(float) * kernelSize * kernelSize * inputChannels * numFilters,
        cudaMemcpyHostToDevice
    ));

    CUDA_CHECK(cudaMemcpy(
        d_biases, biases.data(),
        sizeof(float) * outputSize * outputSize * numFilters,
        cudaMemcpyHostToDevice
    ));
}

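// Forward pass: zero-pad the input, convolve, add biases, then apply the
// activation. Returns the device pointer to the layer output.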
float* Layers::Conv2d::forward(const float* d_input) {
    // Pad the input. Note that each single-block launch in this function is
    // capped at the device's max threads per block (typically 1024).
    int THREADS_PER_BLOCK = (inputSize + 2 * paddingSize) *
                            (inputSize + 2 * paddingSize) * inputChannels;

    Kernels::padding<<<1, THREADS_PER_BLOCK>>>(
        d_input, d_padded, inputSize, inputSize, inputChannels, paddingSize
    );

    // Convolve
    THREADS_PER_BLOCK = outputSize * outputSize * numFilters;
    Kernels::convolution<<<1, THREADS_PER_BLOCK>>>(
        d_padded, d_weights, d_output, inputSize + (2 * paddingSize),
        inputChannels, kernelSize, stride, numFilters, outputSize
    );

    // Add bias
    Kernels::vec_vec_add<<<1, biases.size()>>>(
        d_biases, d_output, d_output, biases.size()
    );

    // Apply the activation element-wise over the full output tensor
    // (outputSize * outputSize * numFilters elements).
    int outputLength = outputSize * outputSize * numFilters;
    switch (activation) {
        case SIGMOID:
            Kernels::sigmoid<<<1, outputLength>>>(
                d_output, d_output, outputLength
            );
            break;

        case RELU:
            Kernels::relu<<<1, outputLength>>>(d_output, d_output, outputLength);
            break;

        default:
            break;
    }

    CUDA_CHECK(cudaDeviceSynchronize());

    return d_output;
}
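
// Example usage (a minimal sketch; the sizes, the unqualified SAME / RELU
// spellings, and the d_input setup are illustrative assumptions, chosen small
// enough that every single-block kernel launch stays under 1024 threads):
//
//   float* d_input;
//   CUDA_CHECK(cudaMalloc((void**)&d_input, sizeof(float) * 8 * 8 * 1));
//
//   Layers::Conv2d conv(
//       /*inputSize=*/8, /*inputChannels=*/1, /*kernelSize=*/3,
//       /*stride=*/1, SAME, /*numFilters=*/4, RELU
//   );
//   float* d_out = conv.forward(d_input);  // 8 * 8 * 4 floats on the device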