Change forward function to return output pointer

2024-03-12 20:50:49 +01:00
parent 2518138ef8
commit 9d91896f13
8 changed files with 69 additions and 113 deletions
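Summary: each layer now allocates its own d_output buffer in its constructor and frees it in its destructor, and forward(d_input) returns a device pointer to that buffer instead of writing into a caller-supplied one. A minimal sketch of the resulting call site, assuming the repo's Layers::Conv2d and Layers::Dense headers; the runNetwork helper, the include paths, and the layer chaining are illustrative, not part of this diff:

    #include "conv2d.cuh"  // hypothetical include paths for the Layers classes
    #include "dense.cuh"

    // Each forward() call writes into the layer's own d_output and returns a
    // device pointer to it, so the caller no longer cudaMallocs output buffers.
    float* runNetwork(Layers::Conv2d& conv, Layers::Dense& dense, const float* d_input) {
        float* d_conv_out  = conv.forward(d_input);     // device memory owned by conv
        float* d_dense_out = dense.forward(d_conv_out); // device memory owned by dense
        return d_dense_out;
    }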


@@ -37,6 +37,12 @@ Layers::Conv2d::Conv2d(
             break;
     }
 
+    d_output = nullptr;
+    CUDA_CHECK(cudaMalloc(
+        (void**)&d_output,
+        sizeof(float) * outputSize * outputSize * numFilters
+    ));
+
     weights.resize(kernelSize * kernelSize * inputChannels * numFilters);
     initializeWeights();
@@ -64,6 +70,7 @@ Layers::Conv2d::Conv2d(
 }
 
 Layers::Conv2d::~Conv2d() {
+    cudaFree(d_output);
     cudaFree(d_weights);
     cudaFree(d_biases);
     cudaFree(d_padded);
@@ -101,7 +108,7 @@ void Layers::Conv2d::toCuda() {
     ));
 }
 
-void Layers::Conv2d::forward(const float* d_input, float* d_output) {
+float* Layers::Conv2d::forward(const float* d_input) {
     // Pad input
     int THREADS_PER_BLOCK = (inputSize + 2 * paddingSize) *
                             (inputSize + 2 * paddingSize) * inputChannels;
@@ -136,44 +143,6 @@ void Layers::Conv2d::forward(const float* d_input, float* d_output) {
     }
 
     CUDA_CHECK(cudaDeviceSynchronize());
+
+    return d_output;
 }
-
-/*
-    Convolves input vector with kernel and stores result in output
-    input:  matrix (inputSize + paddingSize) x (inputSize + paddingSize) x
-            inputChannels, represented as a vector
-    output: matrix outputSize x outputSize x numFilters, represented as a vector
-*/
-void Layers::Conv2d::host_conv(const float* input, float* output) {
-    // Iterate over output matrix
-    for (int tid = 0; tid < outputSize * outputSize * numFilters; tid++) {
-        // Get output index
-        int f = tid / (outputSize * outputSize);
-        int i = tid % (outputSize * outputSize) / outputSize;
-        int j = tid % outputSize;
-
-        float sum = 0.0f;
-
-        // Iterate over kernel and input matrix
-        for (int k = 0; k < kernelSize; k++) {
-            for (int l = 0; l < kernelSize; l++) {
-                for (int c = 0; c < inputChannels; c++) {
-                    int kernelIndex =
-                        f * kernelSize * kernelSize * inputChannels +
-                        c * kernelSize * kernelSize + k * kernelSize + l;
-                    int inputIndex = c * inputSize * inputSize +
-                                     (i * stride + k) * inputSize +
-                                     (j * stride + l);
-
-                    sum += weights[kernelIndex] * input[inputIndex];
-                }
-            }
-        }
-
-        int outputIndex = f * outputSize * outputSize + i * outputSize + j;
-        output[outputIndex] = sum;
-    }
-}
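Note: the pointer returned by forward now refers to memory owned by the layer. It is allocated once in the constructor, overwritten on every call to forward, and released in the destructor, so the caller must not cudaFree it and should copy the result if it needs to outlive the next call.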


@@ -19,6 +19,10 @@ Layers::Dense::Dense(int inputSize, int outputSize, Layers::Activation activatio
     initializeWeights();
     initializeBiases();
 
+    d_output = nullptr;
+    CUDA_CHECK(cudaMalloc((void**)&d_output, sizeof(float) * outputSize));
+
     d_weights = nullptr;
     d_biases = nullptr;
@@ -33,6 +37,7 @@ Layers::Dense::Dense(int inputSize, int outputSize, Layers::Activation activatio
 
 Layers::Dense::~Dense() {
     // Free GPU memory
+    cudaFree(d_output);
    cudaFree(d_weights);
    cudaFree(d_biases);
 }
@@ -45,7 +50,7 @@ void Layers::Dense::initializeBiases() {
     std::fill(biases.begin(), biases.end(), 0.0f);
 }
 
-void Layers::Dense::forward(const float* d_input, float* d_output) {
+float* Layers::Dense::forward(const float* d_input) {
     Kernels::mat_vec_mul<<<1, outputSize>>>(
         d_weights, d_input, d_output, inputSize, outputSize
     );
@@ -68,6 +73,8 @@ void Layers::Dense::forward(const float* d_input, float* d_output) {
     }
 
     CUDA_CHECK(cudaDeviceSynchronize());
+
+    return d_output;
 }
 
 void Layers::Dense::toCuda() {