WIP Migrate Dense layer

2025-11-18 21:12:47 +01:00
parent 64eac7050b
commit 7f203b8947
14 changed files with 116 additions and 221 deletions

View File

@@ -45,4 +45,24 @@ void CUDA::softmax(Tensor &tensor, Tensor &temp_max, Tensor &temp_sum) {
);
CUDA_CHECK(cudaGetLastError());
CUDA_CHECK(cudaDeviceSynchronize());
}
CUDANet::Tensor& CUDA::dense(
    CUDANet::Tensor &weights, CUDANet::Tensor &biases, CUDANet::Tensor &input,
    CUDANet::Tensor &output, size_t input_size, size_t output_size
) {
auto forwardGridSize =
(std::max(input_size, output_size) + BLOCK_SIZE - 1) / BLOCK_SIZE;
auto biasGridSize = (output_size + BLOCK_SIZE - 1) / BLOCK_SIZE;
Kernels::mat_vec_mul<<<forwardGridSize, BLOCK_SIZE>>>(
weights.data<float>(), input.data<float>(), output.data<float>(), input_size, output_size
);
CUDA_CHECK(cudaGetLastError());
Kernels::vec_vec_add<<<biasGridSize, BLOCK_SIZE>>>(
biases.data<float>(), output.data<float>(), output.data<float>(), output_size
);
CUDA_CHECK(cudaGetLastError());
CUDA_CHECK(cudaDeviceSynchronize());
return output;
}
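Both grid sizes use the usual ceiling division so every output element gets a thread: the matrix-vector kernel is launched over max(input_size, output_size) threads, the bias add over output_size. A standalone sketch of that launch arithmetic, with an illustrative BLOCK_SIZE (the real value is defined elsewhere in the project):

#include <algorithm>
#include <cstddef>
#include <cstdio>

constexpr std::size_t BLOCK_SIZE = 256;  // illustrative; the project defines its own value

int main() {
    std::size_t input_size = 1000, output_size = 10;
    // Ceiling division: enough BLOCK_SIZE-wide blocks to cover every element.
    std::size_t forwardGridSize =
        (std::max(input_size, output_size) + BLOCK_SIZE - 1) / BLOCK_SIZE;   // 4 blocks
    std::size_t biasGridSize = (output_size + BLOCK_SIZE - 1) / BLOCK_SIZE;  // 1 block
    std::printf("forward grid: %zu, bias grid: %zu\n", forwardGridSize, biasGridSize);
    return 0;
}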

View File

@@ -1,77 +0,0 @@
#include <vector>
#include "activation.hpp"
#include "activation_functions.cuh"
#include "cuda_helper.cuh"
#include "matmul.cuh"
#include "vector.cuh"
using namespace CUDANet::Layers;
void Activation::initCUDA() {
if (activationType == SOFTMAX) {
d_softmax_sum = nullptr;
CUDA_CHECK(cudaMalloc((void**)&d_softmax_sum, sizeof(float) * length));
d_max = nullptr;
CUDA_CHECK(cudaMalloc((void**)&d_max, sizeof(float) * length));
}
gridSize = (length + BLOCK_SIZE - 1) / BLOCK_SIZE;
}
void Activation::delCUDA() {
if (activationType == SOFTMAX) {
CUDA_CHECK(cudaFree(d_softmax_sum));
CUDA_CHECK(cudaFree(d_max));
}
}
void Activation::activateCUDA(float* d_input) {
// float sum = 0.0f;
switch (activationType) {
case SIGMOID:
Kernels::sigmoid<<<gridSize, BLOCK_SIZE>>>(
d_input, d_input, length
);
CUDA_CHECK(cudaGetLastError());
break;
case RELU:
Kernels::relu<<<gridSize, BLOCK_SIZE>>>(d_input, d_input, length);
CUDA_CHECK(cudaGetLastError());
break;
case SOFTMAX:
// Find max value
Utils::max(d_input, d_max, length);
// Subtract max value to improve numerical stability
Kernels::vec_scalar_sub<<<gridSize, BLOCK_SIZE>>>(
d_input, d_input, &d_max[0], length
);
CUDA_CHECK(cudaGetLastError());
// Compute exponentials
Kernels::vec_exp<<<gridSize, BLOCK_SIZE>>>(
d_input, d_input, length
);
CUDA_CHECK(cudaGetLastError());
// Find sum
Utils::sum(d_input, d_softmax_sum, length);
Kernels::vec_scalar_div<<<gridSize, BLOCK_SIZE>>>(
d_input, d_input, &d_softmax_sum[0], length
);
CUDA_CHECK(cudaGetLastError());
break;
default:
break;
}
CUDA_CHECK(cudaDeviceSynchronize());
}
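For reference, the softmax branch of the deleted file computes the numerically stable form softmax(x)_i = exp(x_i - max(x)) / sum_j exp(x_j - max(x)); subtracting the max keeps exp() from overflowing. A plain CPU sketch of the same computation (illustrative only, not part of the library):

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

// CPU reference for the stable softmax the CUDA kernels compute in-place.
std::vector<float> softmax_ref(const std::vector<float> &x) {
    const float m = *std::max_element(x.begin(), x.end());  // max subtracted for stability
    std::vector<float> out(x.size());
    float sum = 0.0f;
    for (std::size_t i = 0; i < x.size(); ++i) {
        out[i] = std::exp(x[i] - m);
        sum += out[i];
    }
    for (float &v : out) {
        v /= sum;
    }
    return out;
}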

View File

@@ -1,69 +0,0 @@
#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>
#include <functional>
#include <iostream>
#include "vector.cuh"
#include "activation.hpp"
#include "cuda_helper.cuh"
#include "dense.hpp"
#include "matmul.cuh"
using namespace CUDANet::Layers;
void Dense::initCUDA() {
d_output = nullptr;
CUDA_CHECK(cudaMalloc((void**)&d_output, sizeof(float) * outputSize));
d_weights = nullptr;
d_biases = nullptr;
// Allocate GPU memory for weights and biases
CUDA_CHECK(
cudaMalloc((void**)&d_weights, sizeof(float) * inputSize * outputSize)
);
CUDA_CHECK(cudaMalloc((void**)&d_biases, sizeof(float) * outputSize));
toCuda();
// Calculate block and grid sizes
forwardGridSize =
(std::max(inputSize, outputSize) + BLOCK_SIZE - 1) / BLOCK_SIZE;
biasGridSize = (outputSize + BLOCK_SIZE - 1) / BLOCK_SIZE;
}
void Dense::delCUDA() {
cudaFree(d_output);
cudaFree(d_weights);
cudaFree(d_biases);
}
void Dense::toCuda() {
CUDA_CHECK(cudaMemcpy(
d_weights, weights.data(), sizeof(float) * inputSize * outputSize,
cudaMemcpyHostToDevice
));
CUDA_CHECK(cudaMemcpy(
d_biases, biases.data(), sizeof(float) * outputSize,
cudaMemcpyHostToDevice
));
}
float* Dense::forwardCUDA(const float* d_input) {
Kernels::mat_vec_mul<<<forwardGridSize, BLOCK_SIZE>>>(
d_weights, d_input, d_output, inputSize, outputSize
);
CUDA_CHECK(cudaGetLastError());
Kernels::vec_vec_add<<<biasGridSize, BLOCK_SIZE>>>(
d_biases, d_output, d_output, outputSize
);
CUDA_CHECK(cudaGetLastError());
activation->activate(d_output);
CUDA_CHECK(cudaDeviceSynchronize());
return d_output;
}
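The forward pass in the deleted file is y = W·x + b followed by the activation. A CPU reference sketch of the same arithmetic; the row-major layout assumed here (outputSize rows of inputSize columns) is a guess, the actual kernel layout may differ:

#include <cstddef>
#include <vector>

// CPU reference for y = W x + b; W layout assumed row-major, outputSize x inputSize.
std::vector<float> dense_ref(const std::vector<float> &W, const std::vector<float> &x,
                             const std::vector<float> &b, std::size_t inputSize,
                             std::size_t outputSize) {
    std::vector<float> y(outputSize, 0.0f);
    for (std::size_t i = 0; i < outputSize; ++i) {
        for (std::size_t j = 0; j < inputSize; ++j) {
            y[i] += W[i * inputSize + j] * x[j];
        }
        y[i] += b[i];
    }
    return y;
}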

View File

@@ -26,6 +26,10 @@ void CUDA::zero(CUDANet::Tensor &input) {
CUDA_CHECK(cudaMemset(input.data<float>(), 0, sizeof(float) * input.numel()));
}
void CUDA::copy_to_device(CUDANet::Tensor &tensor, void *data, size_t size) {
CUDA_CHECK(cudaMemcpy(tensor.data<float>(), data, size, cudaMemcpyHostToDevice));
}
void CUDA::sum(const CUDANet::Tensor &input, CUDANet::Tensor &sum) {
auto length = input.numel();
    const int gridSize = (length + BLOCK_SIZE - 1) / BLOCK_SIZE;

View File

@@ -57,10 +57,10 @@ size_t Activation::output_size() {
return shape[0];
}
void Activation::set_weights(CUDANet::Tensor &input) {}
void Activation::set_weights(void *input) {}
CUDANet::Tensor& Activation::get_weights() {}
void Activation::set_biases(CUDANet::Tensor &input) {}
void Activation::set_biases(void *input) {}
CUDANet::Tensor& Activation::get_biases() {}

View File

@@ -22,12 +22,16 @@ Dense::Dense(CUDANet::Backend *backend, CUDANet::Shape input_shape, CUDANet::Sha
    weights = CUDANet::Tensor(Shape(input_len * output_len), CUDANet::DType::FLOAT32, backend);
    biases = CUDANet::Tensor(Shape(output_len), CUDANet::DType::FLOAT32, backend);
    output = CUDANet::Tensor(Shape(output_len), CUDANet::DType::FLOAT32, backend);
weights.zero();
biases.zero();
}
CUDANet::Tensor& Dense::forward(CUDANet::Tensor &input);
CUDANet::Tensor& Dense::forward(CUDANet::Tensor &input) {
backend->dense(weights, biases, input, output, in_shape[0], out_shape[0]);
return output;
}
CUDANet::Shape Dense::input_shape() {
return in_shape;
@@ -45,13 +49,17 @@ size_t Dense::output_size() {
return out_shape[0];
};
void Dense::set_weights(CUDANet::Tensor &input);
void Dense::set_weights(void *input) {
weights.set_data<float>(static_cast<float*>(input));
}
CUDANet::Tensor& Dense::get_weights() {
return weights;
}
void Dense::set_biases(CUDANet::Tensor &input);
void Dense::set_biases(void *input) {
biases.set_data<float>(static_cast<float*>(input));
}
CUDANet::Tensor& Dense::get_biases() {
return biases;

View File

@@ -54,3 +54,8 @@ T* Tensor::data() {
void Tensor::zero() {
backend->zero(*this);
}
template <typename T>
void Tensor::set_data(T *data) {
    backend->copy_to_device(*this, data, total_size);
}
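Side note on set_data: it is a member template defined out of line, in what looks like a .cpp file. If set_data&lt;float&gt; isn't already instantiated in that translation unit, an explicit instantiation is needed so the linker can resolve the call from the Dense setters; a one-line sketch, assuming float is the only element type in use so far:

// Hypothetical explicit instantiation; only needed if set_data<float> is not
// otherwise instantiated in this translation unit.
template void CUDANet::Tensor::set_data<float>(float *data);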