From 64eac7050b6af9e1711f3aa30e2fe3e317392087 Mon Sep 17 00:00:00 2001
From: LordMathis
Date: Tue, 18 Nov 2025 19:33:51 +0100
Subject: [PATCH] WIP Migrate Dense layer

---
 include/backend.hpp             |   2 +-
 include/backend/cpu.hpp         |   2 +-
 include/backend/cuda.cuh        |   2 +-
 include/layers/dense.hpp        | 133 ++++++++++----------------------
 include/tensor.hpp              |   5 ++
 src/backends/cuda/tensor_ops.cu |  10 +--
 src/backends/tensor.cpp         |   4 +
 src/layers/dense.cpp            |  90 ++++++++-------------
 8 files changed, 90 insertions(+), 158 deletions(-)

diff --git a/include/backend.hpp b/include/backend.hpp
index 8da3f2d..e8d397a 100644
--- a/include/backend.hpp
+++ b/include/backend.hpp
@@ -17,7 +17,7 @@ public:
 
     // Tensor ops
     virtual void print(const CUDANet::Tensor &input) = 0;
-    virtual void clear(CUDANet::Tensor &input) = 0;
+    virtual void zero(CUDANet::Tensor &input) = 0;
 
    virtual void sum(const CUDANet::Tensor &input, CUDANet::Tensor &sum) = 0;
    virtual void max(const CUDANet::Tensor &input, CUDANet::Tensor &max) = 0;
diff --git a/include/backend/cpu.hpp b/include/backend/cpu.hpp
index beb65b1..ad261bb 100644
--- a/include/backend/cpu.hpp
+++ b/include/backend/cpu.hpp
@@ -13,7 +13,7 @@ public:
 
     // Tensor ops
     void print(const CUDANet::Tensor &input) override;
-    void clear(CUDANet::Tensor &input) override;
+    void zero(CUDANet::Tensor &input) override;
 
    void sum(const CUDANet::Tensor &input, CUDANet::Tensor &sum) override;
    void max(const CUDANet::Tensor &input, CUDANet::Tensor &max) override;
diff --git a/include/backend/cuda.cuh b/include/backend/cuda.cuh
index e08ce34..5045e28 100644
--- a/include/backend/cuda.cuh
+++ b/include/backend/cuda.cuh
@@ -13,7 +13,7 @@ public:
 
     // Tensor ops
     void print(const CUDANet::Tensor &input) override;
-    void clear(CUDANet::Tensor &input) override;
+    void zero(CUDANet::Tensor &input) override;
 
    void sum(const CUDANet::Tensor &input, CUDANet::Tensor &sum) override;
    void max(const CUDANet::Tensor &input, CUDANet::Tensor &max) override;
diff --git a/include/layers/dense.hpp b/include/layers/dense.hpp
index 24fc2d4..d6bee21 100644
--- a/include/layers/dense.hpp
+++ b/include/layers/dense.hpp
@@ -1,9 +1,8 @@
-#ifndef CUDANET_DENSE_LAYER_H
-#define CUDANET_DENSE_LAYER_H
+#pragma once
 
 #include <vector>
 
-#include "activation.hpp"
+#include "backend.hpp"
 #include "layer.hpp"
 
 namespace CUDANet::Layers {
@@ -12,121 +11,67 @@
  * @brief Dense (fully connected) layer
  *
  */
-class Dense : public WeightedLayer {
+class Dense : public Layer {
 public:
-    /**
-     * @brief Construct a new Dense layer
-     *
-     * @param inputSize Size of the input vector
-     * @param outputSize Size of the output vector
-     * @param activationType Activation function type ('RELU', 'SIGMOID',
-     * 'SOFTMAX' or 'NONE')
-     */
-    Dense(int inputSize, int outputSize, Layers::ActivationType activationType);
-    /**
-     * @brief Destroy the Dense layer
-     *
-     */
+    Dense(CUDANet::Backend *backend, CUDANet::Shape input_shape, CUDANet::Shape output_shape);
+
     ~Dense();
 
-    /**
-     * @brief Forward pass of the dense layer
-     *
-     * @param d_input Device pointer to the input vector
-     * @return Device pointer to the output vector
-     */
-    float* forward(const float* d_input);
+    CUDANet::Tensor& forward(CUDANet::Tensor &input);
 
-    /**
-     * @brief Set the weights of the layer
-     *
-     * @param weights Pointer to vector of weights
-     */
-    void setWeights(const float* weights);
+    CUDANet::Shape input_shape();
 
-    /**
-     * @brief Get the weights of the layer
-     *
-     * @return Vector of weights
-     */
-    std::vector<float> getWeights();
+    CUDANet::Shape output_shape();
 
-    /**
-     * @brief Set the biases of the layer
-     *
-     * @param biases Pointer to vector of biases
-     */
-    void setBiases(const float* biases);
+    size_t input_size();
 
-    /**
-     * @brief Get the biases of the layer
-     *
-     * @return Vector of biases
-     */
-    std::vector<float> getBiases();
+    size_t output_size();
 
-    /**
-     * @brief Get output size
-     *
-     * @return int output size
-     */
-    int getOutputSize();
+    void set_weights(CUDANet::Tensor &input);
 
-    /**
-     * @brief Get input size
-     *
-     * @return int input size
-     */
-    int getInputSize();
+    CUDANet::Tensor& get_weights();
+
+    void set_biases(CUDANet::Tensor &input);
+
+    CUDANet::Tensor& get_biases();
 
 private:
-    int inputSize;
-    int outputSize;
+    CUDANet::Backend *backend;
 
-    std::vector<float> weights;
-    std::vector<float> biases;
+    CUDANet::Shape in_shape;
+    CUDANet::Shape out_shape;
 
-    Layers::Activation* activation;
+    CUDANet::Tensor weights;
+    CUDANet::Tensor biases;
 
-    /**
-     * @brief Initialize the weights to zeros
-     *
-     */
-    void initializeWeights();
-    /**
-     * @brief Initialize the biases to zeros
-     *
-     */
-    void initializeBiases();
+    void init_weights();
+    void init_biases();
 
-    float* forwardCPU(const float* input);
+// #ifdef USE_CUDA
+//     float* d_output;
 
-#ifdef USE_CUDA
-    float* d_output;
+//     float* d_weights;
+//     float* d_biases;
 
-    float* d_weights;
-    float* d_biases;
+//     // Precompute kernel launch parameters
+//     int forwardGridSize;
+//     int biasGridSize;
 
-    // Precompute kernel launch parameters
-    int forwardGridSize;
-    int biasGridSize;
+//     /**
+//      * @brief Copy the weights and biases to the device
+//      *
+//      */
+//     void toCuda();
 
-    /**
-     * @brief Copy the weights and biases to the device
-     *
-     */
-    void toCuda();
+//     void initCUDA();
+//     void delCUDA();
 
-    void initCUDA();
-    void delCUDA();
-
-    float* forwardCUDA(const float* d_input);
-#endif
+//     float* forwardCUDA(const float* d_input);
+// #endif
 };
 
 }  // namespace CUDANet::Layers
-
-#endif  // CUDANET_DENSE_LAYER_H
diff --git a/include/tensor.hpp b/include/tensor.hpp
index 56b6848..5e074b9 100644
--- a/include/tensor.hpp
+++ b/include/tensor.hpp
@@ -33,6 +33,11 @@ public:
     template <typename T>
     T* data();
 
+    void zero();
+
+    template <typename T>
+    void set_data(T *data);
+
 private:
     Shape shape;
     DType dtype;
diff --git a/src/backends/cuda/tensor_ops.cu b/src/backends/cuda/tensor_ops.cu
index 508d6cf..ef9e256 100644
--- a/src/backends/cuda/tensor_ops.cu
+++ b/src/backends/cuda/tensor_ops.cu
@@ -1,13 +1,13 @@
 #include <iostream>
 
-#include "backend/backend.hpp"
+#include "backend.hpp"
 #include "backend/cuda.cuh"
 #include "utils/cuda_helper.cuh"
 #include "kernels/matmul.cuh"
 
 using namespace CUDANet::Backend;
 
-void CUDA::print(const CUDANet::Backend::Tensor &input) {
+void CUDA::print(const CUDANet::Tensor &input) {
     auto length = input.numel();
 
     std::vector<float> h_vec(input.numel());
@@ -22,11 +22,11 @@ void CUDA::print(const CUDANet::Backend::Tensor &input) {
     std::cout << std::endl;
 }
 
-void CUDA::clear(CUDANet::Backend::Tensor &input) {
+void CUDA::zero(CUDANet::Tensor &input) {
     CUDA_CHECK(cudaMemset(input.data<float>(), 0, sizeof(float) * input.numel()));
 }
 
-void CUDA::sum(const CUDANet::Backend::Tensor &input, CUDANet::Backend::Tensor &sum) {
+void CUDA::sum(const CUDANet::Tensor &input, CUDANet::Tensor &sum) {
     auto length = input.numel();
 
     const int gridSize = (length + BLOCK_SIZE - 1) / BLOCK_SIZE;
@@ -45,7 +45,7 @@ void CUDA::sum(const CUDANet::Backend::Tensor &input, CUDANet::Backend::Tensor &
     }
 }
 
-void CUDA::max(const CUDANet::Backend::Tensor &input, CUDANet::Backend::Tensor &max) {
+void CUDA::max(const CUDANet::Tensor &input, CUDANet::Tensor &max) {
     auto length = input.numel();
     const int grid_size = (length + BLOCK_SIZE - 1) / BLOCK_SIZE;
diff --git a/src/backends/tensor.cpp b/src/backends/tensor.cpp
index 1026e3c..f15a7e9 100644
--- a/src/backends/tensor.cpp
+++ b/src/backends/tensor.cpp
@@ -50,3 +50,7 @@ template <typename T>
 T* Tensor::data() {
     return static_cast<T*>(d_ptr);
 }
+
+void Tensor::zero() {
+    backend->zero(*this);
+}
diff --git a/src/layers/dense.cpp b/src/layers/dense.cpp
index 61f9ab1..245281c 100644
--- a/src/layers/dense.cpp
+++ b/src/layers/dense.cpp
@@ -1,80 +1,58 @@
-#include "dense.hpp"
-
+#include <format>
 #include <stdexcept>
 
-#include "activation.hpp"
+#include "dense.hpp"
 
 using namespace CUDANet::Layers;
 
-Dense::Dense(int inputSize, int outputSize, ActivationType activationType)
-    : inputSize(inputSize), outputSize(outputSize) {
+Dense::Dense(CUDANet::Backend *backend, CUDANet::Shape input_shape, CUDANet::Shape output_shape)
+    : backend(backend), in_shape(input_shape), out_shape(output_shape) {
     // Allocate memory for weights and biases
-    weights.resize(outputSize * inputSize);
-    biases.resize(outputSize);
-
-    initializeWeights();
-    initializeBiases();
+    if (input_shape.size() != 1) {
+        throw std::runtime_error(std::format("Invalid shape. Expected [1], got {}", input_shape));
+    }
+
+    if (output_shape.size() != 1) {
+        throw std::runtime_error(std::format("Invalid shape. Expected [1], got {}", output_shape));
+    }
 
-    activation = new Activation(activationType, outputSize);
+    auto input_len = input_shape[0];
+    auto output_len = output_shape[0];
 
-#ifdef USE_CUDA
-    initCUDA();
-#endif
+    weights = CUDANet::Tensor{Shape(input_len * output_len), CUDANet::DType::FLOAT32, backend};
+    biases = CUDANet::Tensor(Shape(output_len), CUDANet::DType::FLOAT32, backend);
+
+    weights.zero();
+    biases.zero();
 }
 
-Dense::~Dense() {
-    delete activation;
-#ifdef USE_CUDA
-    delCUDA();
-#endif
+CUDANet::Tensor& Dense::forward(CUDANet::Tensor &input) { throw std::logic_error("Not implemented"); }
+
+CUDANet::Shape Dense::input_shape() {
+    return in_shape;
 }
 
-void Dense::initializeWeights() {
-    std::fill(weights.begin(), weights.end(), 0.0f);
+CUDANet::Shape Dense::output_shape() {
+    return out_shape;
 }
 
-void Dense::initializeBiases() {
-    std::fill(biases.begin(), biases.end(), 0.0f);
-}
+size_t Dense::input_size() {
+    return in_shape[0];
+}
 
-float* Dense::forwardCPU(const float* input) {
-    throw std::logic_error("Not implemented");
-}
+size_t Dense::output_size() {
+    return out_shape[0];
+}
 
-float* Dense::forward(const float* input) {
-#ifdef USE_CUDA
-    return forwardCUDA(input);
-#else
-    return forwardCPU(input);
-#endif
-}
+void Dense::set_weights(CUDANet::Tensor &input) { throw std::logic_error("Not implemented"); }
 
-void Dense::setWeights(const float* weights_input) {
-    std::copy(weights_input, weights_input + weights.size(), weights.begin());
-#ifdef USE_CUDA
-    toCuda();
-#endif
-}
-
-std::vector<float> Dense::getWeights() {
+CUDANet::Tensor& Dense::get_weights() {
     return weights;
 }
 
-void Dense::setBiases(const float* biases_input) {
-    std::copy(biases_input, biases_input + biases.size(), biases.begin());
-#ifdef USE_CUDA
-    toCuda();
-#endif
-}
+void Dense::set_biases(CUDANet::Tensor &input) { throw std::logic_error("Not implemented"); }
 
-std::vector<float> Dense::getBiases() {
+CUDANet::Tensor& Dense::get_biases() {
     return biases;
-}
-
-int Dense::getOutputSize() {
-    return outputSize;
-}
-
-int Dense::getInputSize() {
-    return inputSize;
 }
\ No newline at end of file
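Note (not part of the patch): below is a minimal usage sketch of the migrated Dense API, pieced together only from the constructors and methods this diff introduces. The concrete CPU backend class name (CUDANet::CPU), its default constructor, and the include paths are assumptions, not something defined by this patch, and forward(), set_weights() and set_biases() are still unimplemented stubs in this WIP.

// Usage sketch only -- CUDANet::CPU and the include paths are assumed, not
// defined by this patch; adjust them to the real backend header and class name.
#include "backend/cpu.hpp"
#include "layers/dense.hpp"
#include "tensor.hpp"

int main() {
    CUDANet::CPU backend;  // hypothetical concrete CPU backend

    // A rank-1 784 -> 10 fully connected layer, matching the new constructor's
    // requirement that both shapes have a single dimension
    CUDANet::Layers::Dense dense(&backend, CUDANet::Shape(784), CUDANet::Shape(10));

    // Parameters are owned by the layer as backend tensors and start zeroed
    CUDANet::Tensor &weights = dense.get_weights();
    CUDANet::Tensor &biases = dense.get_biases();
    backend.print(weights);
    backend.print(biases);

    // Inputs are created the same way the layer creates its parameters
    CUDANet::Tensor input(CUDANet::Shape(784), CUDANet::DType::FLOAT32, &backend);
    input.zero();

    // forward() is still a WIP stub; once implemented it should return a
    // reference to the layer's output tensor:
    // CUDANet::Tensor &output = dense.forward(input);

    return 0;
}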