WIP Migrate Activation to Tensor

2025-12-22 14:24:22 +00:00 · 2025-11-16 22:17:46 +01:00
parent 64bf9197ff
commit 6133fb20af
7 changed files with 160 additions and 41 deletions
--- a/include/backend/backend.hpp
+++ b/include/backend/backend.hpp
@@ -13,9 +13,10 @@ public:
    virtual void* allocate(size_t bytes) = 0;
    virtual void deallocate(void* ptr) = 0;
-    virtual void copyToDevice(void* devicePtr, const void* hostPtr, size_t bytes) = 0;
+    // Layer operations
-    virtual void copyToHost(void* hostPtr, const void* devicePtr, size_t bytes) = 0;
+    virtual void relu(CUDANet::Backend::Tensor &tensor) = 0;
-
+    virtual void sigmoid(CUDANet::Backend::Tensor &tensor) = 0;
    virtual void softmax(CUDANet::Backend::Tensor &tensor, CUDANet::Backend::Tensor &temp_max, CUDANet::Backend::Tensor &temp_sum) = 0;
 };
 } // namespace CUDANet::Backend
--- a/include/backend/cuda_backend.cuh
+++ b/include/backend/cuda_backend.cuh
@@ -0,0 +1,23 @@
 #pragma once
 #include "backend/backend.hpp"
 #include "backend/tensor.hpp"
 namespace CUDANet::Backend {
 class CUDABackend : public IBackend {
 public:
    // Memory management
    void* allocate(size_t bytes) override;
    void deallocate(void* ptr) override;
    // Layer operations
    void relu(CUDANet::Backend::Tensor &tensor) override;
    void sigmoid(CUDANet::Backend::Tensor &tensor) override;
    void softmax(CUDANet::Backend::Tensor &tensor, CUDANet::Backend::Tensor &temp_max, CUDANet::Backend::Tensor &temp_sum) override;
 private:
    static constexpr int BLOCK_SIZE = 256;
 };
 }  // namespace CUDANet::Backend
--- a/include/backend/tensor.hpp
+++ b/include/backend/tensor.hpp
@@ -18,6 +18,8 @@ typedef std::vector<size_t> Shape;
 class Tensor
 {
 public:
    Tensor() = default;
    Tensor(Shape shape, DType dtype, IBackend* backend);
    ~Tensor();
@@ -27,6 +29,10 @@ public:
    void toDevice(const void* hostPtr);
    void toHost(void* hostPtr);
    size_t size() const;
    size_t numel() const;
    void* data() const;
 private:
    Shape       shape;
    DType       dtype;
--- a/include/layers/activation.hpp
+++ b/include/layers/activation.hpp
@@ -1,5 +1,7 @@
-#ifndef CUDANET_ACTIVATION_H
+#pragma once
-#define CUDANET_ACTIVATION_H
+
 #include "backend/tensor.hpp"
 #include "backend/backend.hpp"
 namespace CUDANet::Layers {
@@ -41,29 +43,16 @@ class Activation {
     * 
     * @param d_input Pointer to the input vector on the device
     */
-    void activate(float* d_input);
+    void activate(CUDANet::Backend::Tensor input);
  private:
    CUDANet::Backend::IBackend* backend;
    ActivationType activationType;
    int length;
-    void activateCPU(float* input);
+    CUDANet::Backend::Tensor softmax_sum;
-
+    CUDANet::Backend::Tensor tensor_max;
 #ifdef USE_CUDA
    int gridSize;
    float* d_softmax_sum;
    float* d_max;
    void activateCUDA(float* d_input);
    void initCUDA();
    void delCUDA();
 #endif
 };
 }  // namespace CUDANet::Layers
 #endif  // CUDANET_ACTIVATION_H
--- a/src/backends/cuda/cuda_backend.cu
+++ b/src/backends/cuda/cuda_backend.cu
@@ -0,0 +1,69 @@
 #include "backend/cuda_backend.cuh"
 #include "utils/cuda_helper.cuh"
 #include "kernels/activation_functions.cuh"
 #include "kernels/matmul.cuh"
 #include "utils/vector.cuh"
 using namespace CUDANet::Backend;
 void *CUDABackend::allocate(size_t bytes) {
    void* devicePtr = nullptr;
    CUDA_CHECK(cudaMalloc(&devicePtr, bytes));
    return devicePtr;
 }
 void CUDABackend::deallocate(void* ptr) {
    CUDA_CHECK(cudaFree(ptr));
 }
 // void CUDABackend::copyToDevice(void* devicePtr, const void* hostPtr, size_t bytes) {
 //     CUDA_CHECK(cudaMemcpy(devicePtr, hostPtr, bytes, cudaMemcpyHostToDevice));
 //     CUDA_CHECK(cudaDeviceSynchronize());
 // }
 // void CUDABackend::copyToHost(void* hostPtr, const void* devicePtr, size_t bytes) {
 //     CUDA_CHECK(cudaMemcpy(hostPtr, devicePtr, bytes, cudaMemcpyDeviceToHost));
 //     CUDA_CHECK(cudaDeviceSynchronize());
 // }
 void CUDABackend::relu(Tensor &tensor) {
    int gridSize = (tensor.numel() + BLOCK_SIZE - 1) / BLOCK_SIZE;
    Kernels::relu<<<gridSize, BLOCK_SIZE>>>((float*)tensor.data(), (float*)tensor.data(), tensor.numel());
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());
 }
 void CUDABackend::sigmoid(Tensor &tensor) {
    int gridSize = (tensor.numel() + BLOCK_SIZE - 1) / BLOCK_SIZE;
    Kernels::sigmoid<<<gridSize, BLOCK_SIZE>>>((float*)tensor.data(), (float*)tensor.data(), tensor.numel());
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());
 }
 void CUDABackend::softmax(Tensor &tensor, Tensor &temp_max, Tensor &temp_sum) {
    int gridSize = (tensor.numel() + BLOCK_SIZE - 1) / BLOCK_SIZE;
    // Find max value
    Utils::max(tensor, temp_max, tensor.numel());
    // Subtract max value to improve numerical stability
    Kernels::vec_scalar_sub<<<gridSize, BLOCK_SIZE>>>(
        (float*)tensor.data(), (float*)tensor.data(), (float*)temp_max.data(), tensor.numel()
    );
    CUDA_CHECK(cudaGetLastError());
    // Compute exponentials
    Kernels::vec_exp<<<gridSize, BLOCK_SIZE>>>(
        (float*)tensor.data(), (float*)tensor.data(), tensor.numel()
    );
    CUDA_CHECK(cudaGetLastError());
    // Find sum
    Utils::sum(tensor, temp_sum, tensor.numel());
    Kernels::vec_scalar_div<<<gridSize, BLOCK_SIZE>>>(
        (float*)tensor.data(), (float*)tensor.data(), (float*)temp_sum.data(), tensor.numel()
    );
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());
 }
--- a/src/backends/tensor.cpp
+++ b/src/backends/tensor.cpp
@@ -1,3 +1,5 @@
 #include <stdexcept>
 #include "backend/tensor.hpp"
 using namespace CUDANet::Backend;
@@ -9,3 +11,29 @@ Tensor::~Tensor() {
    deallocate();
 }
 size_t Tensor::numel() const {
    size_t totalElements = 1;
    for (const auto& dim : shape) {
        totalElements *= dim;
    }
    return totalElements;
 }
 size_t Tensor::size() const {
    size_t totalSize = numel();
    size_t typeSize = 0;
    switch (dtype) {
        case DType::FLOAT32:
            typeSize = 4;
            break;
        default:
            throw std::runtime_error("Unsupported data type");
    }
    return totalSize * typeSize;
 }
 void* Tensor::data() const {
    return devicePtr;
 }
--- a/src/layers/activation.cpp
+++ b/src/layers/activation.cpp
@@ -2,30 +2,33 @@
 #include <vector>
 #include "activation.hpp"
 #include "backend/tensor.hpp"
 using namespace CUDANet::Layers;
 Activation::Activation(ActivationType activation, const int length)
    : activationType(activation), length(length) {
-#ifdef USE_CUDA
+
-    initCUDA();
+
-#endif
+    if (activationType == SOFTMAX) {
      softmax_sum = CUDANet::Backend::Tensor({static_cast<size_t>(length)}, CUDANet::Backend::DType::FLOAT32, nullptr);
      tensor_max = CUDANet::Backend::Tensor({static_cast<size_t>(length)}, CUDANet::Backend::DType::FLOAT32, nullptr);
    }
 }
-Activation::~Activation() {
+void Activation::activate(CUDANet::Backend::Tensor input) {
-#ifdef USE_CUDA
+    switch (activationType)
-    delCUDA();
+    {
-#endif
+    case ActivationType::SIGMOID:
-}
+        backend->sigmoid(input);
-
+        break;
-void Activation::activateCPU(float* input) {
+    case ActivationType::RELU:
-    throw std::logic_error("Not implemented");
+        /* code */
-}
+        break;
-
+    case ActivationType::SOFTMAX:
-void Activation::activate(float* input) {
+        /* code */
-#ifdef USE_CUDA
+        break;
-    activateCUDA(input);
+    default:
-#else
+        break;
-    activateCPU(input);
+    }
 #endif
 }