WIP Migrate Activation to Tensor

Add simple Tensor class
2025-12-22 14:24:22 +00:00 · 2025-11-16 22:17:46 +01:00 · 2025-11-16 19:31:09 +01:00
7 changed files with 227 additions and 38 deletions
--- a/include/backend/backend.hpp
+++ b/include/backend/backend.hpp
@@ -0,0 +1,22 @@
+#pragma once
+
+#include <cstddef>
+
+namespace CUDANet::Backend
+{   
+
+class IBackend
+{
+public:
+
+    // Memory management
+    virtual void* allocate(size_t bytes) = 0;
+    virtual void deallocate(void* ptr) = 0;
+
+    // Layer operations
+    virtual void relu(CUDANet::Backend::Tensor &tensor) = 0;
+    virtual void sigmoid(CUDANet::Backend::Tensor &tensor) = 0;
+    virtual void softmax(CUDANet::Backend::Tensor &tensor, CUDANet::Backend::Tensor &temp_max, CUDANet::Backend::Tensor &temp_sum) = 0;
+};
+
+} // namespace CUDANet::Backend
--- a/include/backend/cuda_backend.cuh
+++ b/include/backend/cuda_backend.cuh
@@ -0,0 +1,23 @@
+#pragma once
+
+#include "backend/backend.hpp"
+#include "backend/tensor.hpp"
+
+namespace CUDANet::Backend {
+
+class CUDABackend : public IBackend {
+public:
+    // Memory management
+    void* allocate(size_t bytes) override;
+    void deallocate(void* ptr) override;
+
+    // Layer operations
+    void relu(CUDANet::Backend::Tensor &tensor) override;
+    void sigmoid(CUDANet::Backend::Tensor &tensor) override;
+    void softmax(CUDANet::Backend::Tensor &tensor, CUDANet::Backend::Tensor &temp_max, CUDANet::Backend::Tensor &temp_sum) override;
+
+private:
+    static constexpr int BLOCK_SIZE = 256;
+};
+
+}  // namespace CUDANet::Backend
--- a/include/backend/tensor.hpp
+++ b/include/backend/tensor.hpp
@@ -0,0 +1,44 @@
+#pragma once
+#include <cstddef>
+#include "backend/backend.hpp"
+#include <vector>
+
+namespace CUDANet::Backend
+{
+
+enum class DType
+{
+    FLOAT32,
+    // FLOAT16,  // Not implemented yet
+    // INT32,  // Not implemented yet
+};
+
+typedef std::vector<size_t> Shape;
+
+class Tensor
+{
+public:
+
+    Tensor() = default;
+    Tensor(Shape shape, DType dtype, IBackend* backend);
+    ~Tensor();
+
+    void* allocate();
+    void deallocate();
+
+    void toDevice(const void* hostPtr);
+    void toHost(void* hostPtr);
+
+    size_t size() const;
+    size_t numel() const;
+    void* data() const;
+
+private:
+    Shape       shape;
+    DType       dtype;
+    IBackend*   backend;
+    void*       devicePtr;
+    void*       hostPtr;
+};
+
+} // namespace CUDANet::Backend
--- a/include/layers/activation.hpp
+++ b/include/layers/activation.hpp
@@ -1,5 +1,7 @@
-#ifndef CUDANET_ACTIVATION_H
-#define CUDANET_ACTIVATION_H
+#pragma once
+
+#include "backend/tensor.hpp"
+#include "backend/backend.hpp"

 namespace CUDANet::Layers {

@@ -41,29 +43,16 @@ class Activation {
     * 
     * @param d_input Pointer to the input vector on the device
     */
-    void activate(float* d_input);
+    void activate(CUDANet::Backend::Tensor input);


  private:
+    CUDANet::Backend::IBackend* backend;
    ActivationType activationType;
    int length;

-    void activateCPU(float* input);
-
-#ifdef USE_CUDA
-    int gridSize;
-
-    float* d_softmax_sum;
-    float* d_max;
-
-    void activateCUDA(float* d_input);
-
-    void initCUDA();
-    void delCUDA();
-#endif
+    CUDANet::Backend::Tensor softmax_sum;
+    CUDANet::Backend::Tensor tensor_max;
 };

-
 }  // namespace CUDANet::Layers
-
-#endif  // CUDANET_ACTIVATION_H
--- a/src/backends/cuda/cuda_backend.cu
+++ b/src/backends/cuda/cuda_backend.cu
@@ -0,0 +1,69 @@
+#include "backend/cuda_backend.cuh"
+#include "utils/cuda_helper.cuh"
+#include "kernels/activation_functions.cuh"
+#include "kernels/matmul.cuh"
+#include "utils/vector.cuh"
+
+using namespace CUDANet::Backend;
+
+void *CUDABackend::allocate(size_t bytes) {
+    void* devicePtr = nullptr;
+    CUDA_CHECK(cudaMalloc(&devicePtr, bytes));
+    return devicePtr;
+}
+
+void CUDABackend::deallocate(void* ptr) {
+    CUDA_CHECK(cudaFree(ptr));
+}
+
+// void CUDABackend::copyToDevice(void* devicePtr, const void* hostPtr, size_t bytes) {
+//     CUDA_CHECK(cudaMemcpy(devicePtr, hostPtr, bytes, cudaMemcpyHostToDevice));
+//     CUDA_CHECK(cudaDeviceSynchronize());
+// }
+
+// void CUDABackend::copyToHost(void* hostPtr, const void* devicePtr, size_t bytes) {
+//     CUDA_CHECK(cudaMemcpy(hostPtr, devicePtr, bytes, cudaMemcpyDeviceToHost));
+//     CUDA_CHECK(cudaDeviceSynchronize());
+// }
+
+void CUDABackend::relu(Tensor &tensor) {
+    int gridSize = (tensor.numel() + BLOCK_SIZE - 1) / BLOCK_SIZE;
+    Kernels::relu<<<gridSize, BLOCK_SIZE>>>((float*)tensor.data(), (float*)tensor.data(), tensor.numel());
+    CUDA_CHECK(cudaGetLastError());
+    CUDA_CHECK(cudaDeviceSynchronize());
+}
+
+void CUDABackend::sigmoid(Tensor &tensor) {
+    int gridSize = (tensor.numel() + BLOCK_SIZE - 1) / BLOCK_SIZE;
+    Kernels::sigmoid<<<gridSize, BLOCK_SIZE>>>((float*)tensor.data(), (float*)tensor.data(), tensor.numel());
+    CUDA_CHECK(cudaGetLastError());
+    CUDA_CHECK(cudaDeviceSynchronize());
+}
+
+void CUDABackend::softmax(Tensor &tensor, Tensor &temp_max, Tensor &temp_sum) {
+    int gridSize = (tensor.numel() + BLOCK_SIZE - 1) / BLOCK_SIZE;
+
+    // Find max value
+    Utils::max(tensor, temp_max, tensor.numel());
+
+    // Subtract max value to improve numerical stability
+    Kernels::vec_scalar_sub<<<gridSize, BLOCK_SIZE>>>(
+        (float*)tensor.data(), (float*)tensor.data(), (float*)temp_max.data(), tensor.numel()
+    );
+    CUDA_CHECK(cudaGetLastError());
+
+    // Compute exponentials
+    Kernels::vec_exp<<<gridSize, BLOCK_SIZE>>>(
+        (float*)tensor.data(), (float*)tensor.data(), tensor.numel()
+    );
+    CUDA_CHECK(cudaGetLastError());
+    
+    // Find sum
+    Utils::sum(tensor, temp_sum, tensor.numel());
+
+    Kernels::vec_scalar_div<<<gridSize, BLOCK_SIZE>>>(
+        (float*)tensor.data(), (float*)tensor.data(), (float*)temp_sum.data(), tensor.numel()
+    );
+    CUDA_CHECK(cudaGetLastError());
+    CUDA_CHECK(cudaDeviceSynchronize());
+}
--- a/src/backends/tensor.cpp
+++ b/src/backends/tensor.cpp
@@ -0,0 +1,39 @@
+#include <stdexcept>
+
+#include "backend/tensor.hpp"
+
+using namespace CUDANet::Backend;
+
+Tensor::Tensor(Shape shape, DType dtype, IBackend* backend)
+    : shape(shape), dtype(dtype), backend(backend), devicePtr(nullptr), hostPtr(nullptr) {}
+
+Tensor::~Tensor() {
+    deallocate();
+}
+
+size_t Tensor::numel() const {
+    size_t totalElements = 1;
+    for (const auto& dim : shape) {
+        totalElements *= dim;
+    }
+    return totalElements;
+}
+
+size_t Tensor::size() const {
+    size_t totalSize = numel();
+
+    size_t typeSize = 0;
+    switch (dtype) {
+        case DType::FLOAT32:
+            typeSize = 4;
+            break;
+        default:
+            throw std::runtime_error("Unsupported data type");
+    }
+
+    return totalSize * typeSize;
+}
+
+void* Tensor::data() const {
+    return devicePtr;
+}
--- a/src/layers/activation.cpp
+++ b/src/layers/activation.cpp
@@ -2,30 +2,33 @@
 #include <vector>

 #include "activation.hpp"
+#include "backend/tensor.hpp"

 using namespace CUDANet::Layers;

 Activation::Activation(ActivationType activation, const int length)
    : activationType(activation), length(length) {
-#ifdef USE_CUDA
-    initCUDA();
-#endif
+
+
+    if (activationType == SOFTMAX) {
+      softmax_sum = CUDANet::Backend::Tensor({static_cast<size_t>(length)}, CUDANet::Backend::DType::FLOAT32, nullptr);
+      tensor_max = CUDANet::Backend::Tensor({static_cast<size_t>(length)}, CUDANet::Backend::DType::FLOAT32, nullptr);
+    }
 }

-Activation::~Activation() {
-#ifdef USE_CUDA
-    delCUDA();
-#endif
+void Activation::activate(CUDANet::Backend::Tensor input) {
+    switch (activationType)
+    {
+    case ActivationType::SIGMOID:
+        backend->sigmoid(input);
+        break;
+    case ActivationType::RELU:
+        /* code */
+        break;
+    case ActivationType::SOFTMAX:
+        /* code */
+        break;
+    default:
+        break;
    }
-
-void Activation::activateCPU(float* input) {
-    throw std::logic_error("Not implemented");
-}
-
-void Activation::activate(float* input) {
-#ifdef USE_CUDA
-    activateCUDA(input);
-#else
-    activateCPU(input);
-#endif
 }
Author	SHA1	Message	Date
LordMathis	6133fb20af	WIP Migrate Activation to Tensor	2025-11-16 22:17:46 +01:00
LordMathis	64bf9197ff	Add simple Tensor class	2025-11-16 19:31:09 +01:00