WIP Migrate Dense layer

2025-11-18 21:12:47 +01:00
parent 64eac7050b
commit 7f203b8947
14 changed files with 116 additions and 221 deletions

View File

@@ -45,4 +45,24 @@ void CUDA::softmax(Tensor &tensor, Tensor &temp_max, Tensor &temp_sum) {
);
CUDA_CHECK(cudaGetLastError());
CUDA_CHECK(cudaDeviceSynchronize());
}
CUDANet::Tensor& CUDA::dense(
    CUDANet::Tensor &weights, CUDANet::Tensor &biases, CUDANet::Tensor &input,
    CUDANet::Tensor &output, size_t input_size, size_t output_size
) {
auto forwardGridSize =
(std::max(input_size, output_size) + BLOCK_SIZE - 1) / BLOCK_SIZE;
auto biasGridSize = (output_size + BLOCK_SIZE - 1) / BLOCK_SIZE;
Kernels::mat_vec_mul<<<forwardGridSize, BLOCK_SIZE>>>(
weights.data<float>(), input.data<float>(), output.data<float>(), input_size, output_size
);
CUDA_CHECK(cudaGetLastError());
Kernels::vec_vec_add<<<biasGridSize, BLOCK_SIZE>>>(
biases.data<float>(), output.data<float>(), output.data<float>(), output_size
);
CUDA_CHECK(cudaGetLastError());
CUDA_CHECK(cudaDeviceSynchronize());
return output;
}
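Both grid sizes use the usual ceiling division so every output element gets a thread: the matrix-vector kernel is launched over max(input_size, output_size) threads, the bias add over output_size. A standalone sketch of that launch arithmetic, with an illustrative BLOCK_SIZE (the real value is defined elsewhere in the project):

#include <algorithm>
#include <cstddef>
#include <cstdio>

constexpr std::size_t BLOCK_SIZE = 256;  // illustrative; the project defines its own value

int main() {
    std::size_t input_size = 1000, output_size = 10;
    // Ceiling division: enough BLOCK_SIZE-wide blocks to cover every element.
    std::size_t forwardGridSize =
        (std::max(input_size, output_size) + BLOCK_SIZE - 1) / BLOCK_SIZE;   // 4 blocks
    std::size_t biasGridSize = (output_size + BLOCK_SIZE - 1) / BLOCK_SIZE;  // 1 block
    std::printf("forward grid: %zu, bias grid: %zu\n", forwardGridSize, biasGridSize);
    return 0;
}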

View File

@@ -1,77 +0,0 @@
#include <vector>
#include "activation.hpp"
#include "activation_functions.cuh"
#include "cuda_helper.cuh"
#include "matmul.cuh"
#include "vector.cuh"
using namespace CUDANet::Layers;
void Activation::initCUDA() {
if (activationType == SOFTMAX) {
d_softmax_sum = nullptr;
CUDA_CHECK(cudaMalloc((void**)&d_softmax_sum, sizeof(float) * length));
d_max = nullptr;
CUDA_CHECK(cudaMalloc((void**)&d_max, sizeof(float) * length));
}
gridSize = (length + BLOCK_SIZE - 1) / BLOCK_SIZE;
}
void Activation::delCUDA() {
if (activationType == SOFTMAX) {
CUDA_CHECK(cudaFree(d_softmax_sum));
CUDA_CHECK(cudaFree(d_max));
}
}
void Activation::activateCUDA(float* d_input) {
// float sum = 0.0f;
switch (activationType) {
case SIGMOID:
Kernels::sigmoid<<<gridSize, BLOCK_SIZE>>>(
d_input, d_input, length
);
CUDA_CHECK(cudaGetLastError());
break;
case RELU:
Kernels::relu<<<gridSize, BLOCK_SIZE>>>(d_input, d_input, length);
CUDA_CHECK(cudaGetLastError());
break;
case SOFTMAX:
// Find max value
Utils::max(d_input, d_max, length);
// Subtract max value to improve numerical stability
Kernels::vec_scalar_sub<<<gridSize, BLOCK_SIZE>>>(
d_input, d_input, &d_max[0], length
);
CUDA_CHECK(cudaGetLastError());
// Compute exponentials
Kernels::vec_exp<<<gridSize, BLOCK_SIZE>>>(
d_input, d_input, length
);
CUDA_CHECK(cudaGetLastError());
// Find sum
Utils::sum(d_input, d_softmax_sum, length);
Kernels::vec_scalar_div<<<gridSize, BLOCK_SIZE>>>(
d_input, d_input, &d_softmax_sum[0], length
);
CUDA_CHECK(cudaGetLastError());
break;
default:
break;
}
CUDA_CHECK(cudaDeviceSynchronize());
}
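For reference, the softmax branch of the deleted file computes the numerically stable form softmax(x)_i = exp(x_i - max(x)) / sum_j exp(x_j - max(x)); subtracting the max keeps exp() from overflowing. A plain CPU sketch of the same computation (illustrative only, not part of the library):

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

// CPU reference for the stable softmax the CUDA kernels compute in-place.
std::vector<float> softmax_ref(const std::vector<float> &x) {
    const float m = *std::max_element(x.begin(), x.end());  // max subtracted for stability
    std::vector<float> out(x.size());
    float sum = 0.0f;
    for (std::size_t i = 0; i < x.size(); ++i) {
        out[i] = std::exp(x[i] - m);
        sum += out[i];
    }
    for (float &v : out) {
        v /= sum;
    }
    return out;
}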

View File

@@ -1,69 +0,0 @@
#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>
#include <functional>
#include <iostream>
#include "vector.cuh"
#include "activation.hpp"
#include "cuda_helper.cuh"
#include "dense.hpp"
#include "matmul.cuh"
using namespace CUDANet::Layers;
void Dense::initCUDA() {
d_output = nullptr;
CUDA_CHECK(cudaMalloc((void**)&d_output, sizeof(float) * outputSize));
d_weights = nullptr;
d_biases = nullptr;
// Allocate GPU memory for weights and biases
CUDA_CHECK(
cudaMalloc((void**)&d_weights, sizeof(float) * inputSize * outputSize)
);
CUDA_CHECK(cudaMalloc((void**)&d_biases, sizeof(float) * outputSize));
toCuda();
// Calculate block and grid sizes
forwardGridSize =
(std::max(inputSize, outputSize) + BLOCK_SIZE - 1) / BLOCK_SIZE;
biasGridSize = (outputSize + BLOCK_SIZE - 1) / BLOCK_SIZE;
}
void Dense::delCUDA() {
cudaFree(d_output);
cudaFree(d_weights);
cudaFree(d_biases);
}
void Dense::toCuda() {
CUDA_CHECK(cudaMemcpy(
d_weights, weights.data(), sizeof(float) * inputSize * outputSize,
cudaMemcpyHostToDevice
));
CUDA_CHECK(cudaMemcpy(
d_biases, biases.data(), sizeof(float) * outputSize,
cudaMemcpyHostToDevice
));
}
float* Dense::forwardCUDA(const float* d_input) {
Kernels::mat_vec_mul<<<forwardGridSize, BLOCK_SIZE>>>(
d_weights, d_input, d_output, inputSize, outputSize
);
CUDA_CHECK(cudaGetLastError());
Kernels::vec_vec_add<<<biasGridSize, BLOCK_SIZE>>>(
d_biases, d_output, d_output, outputSize
);
CUDA_CHECK(cudaGetLastError());
activation->activate(d_output);
CUDA_CHECK(cudaDeviceSynchronize());
return d_output;
}
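The forward pass in the deleted file is y = W·x + b followed by the activation. A CPU reference sketch of the same arithmetic; the row-major layout assumed here (outputSize rows of inputSize columns) is a guess, the actual kernel layout may differ:

#include <cstddef>
#include <vector>

// CPU reference for y = W x + b; W layout assumed row-major, outputSize x inputSize.
std::vector<float> dense_ref(const std::vector<float> &W, const std::vector<float> &x,
                             const std::vector<float> &b, std::size_t inputSize,
                             std::size_t outputSize) {
    std::vector<float> y(outputSize, 0.0f);
    for (std::size_t i = 0; i < outputSize; ++i) {
        for (std::size_t j = 0; j < inputSize; ++j) {
            y[i] += W[i * inputSize + j] * x[j];
        }
        y[i] += b[i];
    }
    return y;
}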

View File

@@ -26,6 +26,10 @@ void CUDA::zero(CUDANet::Tensor &input) {
CUDA_CHECK(cudaMemset(input.data<float>(), 0, sizeof(float) * input.numel()));
}
void CUDA::copy_to_device(CUDANet::Tensor &tensor, void *data, size_t size) {
CUDA_CHECK(cudaMemcpy(tensor.data<float>(), data, size, cudaMemcpyHostToDevice));
}
void CUDA::sum(const CUDANet::Tensor &input, CUDANet::Tensor &sum) {
auto length = input.numel();
    const int gridSize = (length + BLOCK_SIZE - 1) / BLOCK_SIZE;

View File

@@ -57,10 +57,10 @@ size_t Activation::output_size() {
return shape[0];
}
void Activation::set_weights(CUDANet::Tensor &input) {}
void Activation::set_weights(void *input) {}
CUDANet::Tensor& Activation::get_weights() {}
void Activation::set_biases(CUDANet::Tensor &input) {}
void Activation::set_biases(void *input) {}
CUDANet::Tensor& Activation::get_biases() {}

View File

@@ -22,12 +22,16 @@ Dense::Dense(CUDANet::Backend *backend, CUDANet::Shape input_shape, CUDANet::Sha
    weights = CUDANet::Tensor(Shape(input_len * output_len), CUDANet::DType::FLOAT32, backend);
    biases = CUDANet::Tensor(Shape(output_len), CUDANet::DType::FLOAT32, backend);
    output = CUDANet::Tensor(Shape(output_len), CUDANet::DType::FLOAT32, backend);
weights.zero();
biases.zero();
}
CUDANet::Tensor& Dense::forward(CUDANet::Tensor &input);
CUDANet::Tensor& Dense::forward(CUDANet::Tensor &input) {
backend->dense(weights, biases, input, output, in_shape[0], out_shape[0]);
return output;
}
CUDANet::Shape Dense::input_shape() {
return in_shape;
@@ -45,13 +49,17 @@ size_t Dense::output_size() {
return out_shape[0];
};
void Dense::set_weights(CUDANet::Tensor &input);
void Dense::set_weights(void *input) {
weights.set_data<float>(static_cast<float*>(input));
}
CUDANet::Tensor& Dense::get_weights() {
return weights;
}
void Dense::set_biases(CUDANet::Tensor &input);
void Dense::set_biases(void *input) {
biases.set_data<float>(static_cast<float*>(input));
}
CUDANet::Tensor& Dense::get_biases() {
return biases;

View File

@@ -54,3 +54,8 @@ T* Tensor::data() {
void Tensor::zero() {
backend->zero(*this);
}
template <typename T>
void Tensor::set_data(T *data) {
    backend->copy_to_device(*this, data, total_size);
}
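Side note on set_data: it is a member template defined out of line, in what looks like a .cpp file. If set_data&lt;float&gt; isn't already instantiated in that translation unit, an explicit instantiation is needed so the linker can resolve the call from the Dense setters; a one-line sketch, assuming float is the only element type in use so far:

// Hypothetical explicit instantiation; only needed if set_data<float> is not
// otherwise instantiated in this translation unit.
template void CUDANet::Tensor::set_data<float>(float *data);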