Mirror of https://github.com/lordmathis/CUDANet.git (synced 2025-12-22 14:24:22 +00:00)

Compare commits: 25670f90c4 ... 10c84d75fc

6 commits:
10c84d75fc
4c26efe826
7f203b8947
64eac7050b
24606491a3
6340b27055

@@ -213,7 +213,7 @@ InceptionB::InceptionB(
         branch3x3->getOutputSize(), branch3x3dbl_3->getOutputSize()
     );
     concat_2 = new CUDANet::Layers::Concat(
-        concat_1->getOutputSize(), branchPool->getOutputSize()
+        concat_1->getOutputSize(), branchPool->get_output_size()
     );

     outputSize = concat_2->getOutputSize();
@@ -441,7 +441,7 @@ InceptionD::InceptionD(
         branch3x3_2->getOutputSize(), branch7x7x3_4->getOutputSize()
     );
     concat_2 = new CUDANet::Layers::Concat(
-        concat_1->getOutputSize(), branchPool->getOutputSize()
+        concat_1->getOutputSize(), branchPool->get_output_size()
     );

     outputSize = concat_2->getOutputSize();
@@ -707,7 +707,7 @@ InceptionV3::InceptionV3(
     addLayer("AveragePool", avgpool);

     fc = new CUDANet::Layers::Dense(
-        avgpool->getOutputSize(), 1000, CUDANet::Layers::ActivationType::NONE
+        avgpool->get_output_size(), 1000, CUDANet::Layers::ActivationType::NONE
     );
     addLayer("fc", fc);
 }

include/backend.hpp (new file, 45 lines)
@@ -0,0 +1,45 @@
+#pragma once
+
+#include <cstddef>
+
+namespace CUDANet {
+
+// Forward declaration
+class Tensor;
+
+class Backend {
+  public:
+    // Memory management
+    virtual void* allocate(size_t bytes) = 0;
+    virtual void deallocate(void* ptr) = 0;
+
+    // Tensor ops
+    virtual void print(const CUDANet::Tensor& input) = 0;
+    virtual void zero(CUDANet::Tensor& input) = 0;
+
+    virtual void
+    copy_to_device(CUDANet::Tensor& tensor, void* data, size_t size) = 0;
+
+    virtual void sum(const CUDANet::Tensor& input, CUDANet::Tensor& sum) = 0;
+    virtual void max(const CUDANet::Tensor& input, CUDANet::Tensor& max) = 0;
+
+    // Layer ops
+    virtual void relu(CUDANet::Tensor& tensor) = 0;
+    virtual void sigmoid(CUDANet::Tensor& tensor) = 0;
+    virtual void softmax(
+        CUDANet::Tensor& tensor,
+        CUDANet::Tensor& temp_max,
+        CUDANet::Tensor& temp_sum
+    ) = 0;
+
+    virtual CUDANet::Tensor& dense(
+        const CUDANet::Tensor& weights,
+        const CUDANet::Tensor& biases,
+        const CUDANet::Tensor& input,
+        CUDANet::Tensor& output,
+        const size_t input_size,
+        const size_t output_size
+    ) = 0;
+};
+
+} // namespace CUDANet

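
The commit itself only declares backends; as a reading aid, the following is a purely hypothetical host-memory implementation of the interface above (the class name HostBackend and all bodies are not part of this change). It is a minimal sketch assuming the new backend.hpp and tensor.hpp are on the include path, and it assumes a row-major [output][input] weight layout for dense, which is really defined by the mat_vec_mul kernel rather than by this example.

// Hypothetical host-memory backend, for illustration only.
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdlib>
#include <cstring>
#include <iostream>

#include "backend.hpp"
#include "tensor.hpp"

class HostBackend : public CUDANet::Backend {
  public:
    // Memory management: plain malloc/free instead of cudaMalloc/cudaFree.
    void* allocate(size_t bytes) override { return std::malloc(bytes); }
    void deallocate(void* ptr) override { std::free(ptr); }

    // Tensor ops
    void print(const CUDANet::Tensor& input) override {
        const float* p = input.data<float>();
        for (size_t i = 0; i < input.numel(); ++i) std::cout << p[i] << " ";
        std::cout << std::endl;
    }
    void zero(CUDANet::Tensor& input) override {
        std::memset(input.data<float>(), 0, input.size());
    }
    void copy_to_device(CUDANet::Tensor& tensor, void* data, size_t size) override {
        std::memcpy(tensor.data<float>(), data, size);  // "device" is host memory here
    }
    void sum(const CUDANet::Tensor& input, CUDANet::Tensor& out) override {
        float acc = 0.0f;
        for (size_t i = 0; i < input.numel(); ++i) acc += input.data<float>()[i];
        out.data<float>()[0] = acc;
    }
    void max(const CUDANet::Tensor& input, CUDANet::Tensor& out) override {
        out.data<float>()[0] = *std::max_element(
            input.data<float>(), input.data<float>() + input.numel());
    }

    // Layer ops
    void relu(CUDANet::Tensor& t) override {
        for (size_t i = 0; i < t.numel(); ++i)
            t.data<float>()[i] = std::max(0.0f, t.data<float>()[i]);
    }
    void sigmoid(CUDANet::Tensor& t) override {
        for (size_t i = 0; i < t.numel(); ++i)
            t.data<float>()[i] = 1.0f / (1.0f + std::exp(-t.data<float>()[i]));
    }
    void softmax(CUDANet::Tensor& t, CUDANet::Tensor& temp_max, CUDANet::Tensor& temp_sum) override {
        max(t, temp_max);  // reuse the reductions above
        for (size_t i = 0; i < t.numel(); ++i)
            t.data<float>()[i] = std::exp(t.data<float>()[i] - temp_max.data<float>()[0]);
        sum(t, temp_sum);
        for (size_t i = 0; i < t.numel(); ++i)
            t.data<float>()[i] /= temp_sum.data<float>()[0];
    }
    CUDANet::Tensor& dense(
        const CUDANet::Tensor& weights, const CUDANet::Tensor& biases,
        const CUDANet::Tensor& input, CUDANet::Tensor& output,
        const size_t input_size, const size_t output_size
    ) override {
        // Assumed weight layout: weights[o * input_size + i]
        for (size_t o = 0; o < output_size; ++o) {
            float acc = biases.data<float>()[o];
            for (size_t i = 0; i < input_size; ++i)
                acc += weights.data<float>()[o * input_size + i] * input.data<float>()[i];
            output.data<float>()[o] = acc;
        }
        return output;
    }
};
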
(deleted file)
@@ -1,29 +0,0 @@
-#pragma once
-
-#include <cstddef>
-#include "backend/tensor.hpp"
-
-namespace CUDANet::Backend
-{
-
-class IBackend
-{
-  public:
-
-    // Memory management
-    virtual void* allocate(size_t bytes) = 0;
-    virtual void deallocate(void* ptr) = 0;
-
-    // Tensor ops
-    virtual void print(const CUDANet::Backend::Tensor &input) = 0;
-    virtual void clear(CUDANet::Backend::Tensor &input) = 0;
-    virtual void sum(const CUDANet::Backend::Tensor &input, CUDANet::Backend::Tensor &sum) = 0;
-    virtual void max(const CUDANet::Backend::Tensor &input, CUDANet::Backend::Tensor &max) = 0;
-
-    // Layer ops
-    virtual void relu(CUDANet::Backend::Tensor &tensor) = 0;
-    virtual void sigmoid(CUDANet::Backend::Tensor &tensor) = 0;
-    virtual void softmax(CUDANet::Backend::Tensor &tensor, CUDANet::Backend::Tensor &temp_max, CUDANet::Backend::Tensor &temp_sum) = 0;
-};
-
-} // namespace CUDANet::Backend

include/backend/cpu.hpp (new file, 26 lines)
@@ -0,0 +1,26 @@
+#pragma once
+
+#include "backend.hpp"
+#include "tensor.hpp"
+
+namespace CUDANet::Backend {
+
+class CPU : public Backend {
+  public:
+    // Memory management
+    void* allocate(size_t bytes) override;
+    void deallocate(void* ptr) override;
+
+    // Tensor ops
+    void print(const CUDANet::Tensor &input) override;
+    void zero(CUDANet::Tensor &input) override;
+    void sum(const CUDANet::Tensor &input, CUDANet::Tensor &sum) override;
+    void max(const CUDANet::Tensor &input, CUDANet::Tensor &max) override;
+
+    // Layer ops
+    void relu(CUDANet::Tensor &tensor) override;
+    void sigmoid(CUDANet::Tensor &tensor) override;
+    void softmax(CUDANet::Tensor &tensor, CUDANet::Tensor &temp_max, CUDANet::Tensor &temp_sum) override;
+};
+
+}

@@ -1,29 +1,41 @@
 #pragma once

-#include "backend/backend.hpp"
-#include "backend/tensor.hpp"
+#include "backend.hpp"
+#include "tensor.hpp"

 namespace CUDANet::Backend {

-class CUDABackend : public IBackend {
+class CUDA : public Backend {
   public:
     // Memory management
     void* allocate(size_t bytes) override;
     void deallocate(void* ptr) override;

     // Tensor ops
-    void print(const CUDANet::Backend::Tensor &input) override;
-    void clear(CUDANet::Backend::Tensor &input) override;
-    void sum(const CUDANet::Backend::Tensor &input, CUDANet::Backend::Tensor &sum) override;
-    void max(const CUDANet::Backend::Tensor &input, CUDANet::Backend::Tensor &max) override;
+    void print(const CUDANet::Tensor& input) override;
+    void zero(CUDANet::Tensor& input) override;
+    void
+    copy_to_device(CUDANet::Tensor& tensor, void* data, size_t size) override;
+    void sum(const CUDANet::Tensor& input, CUDANet::Tensor& sum) override;
+    void max(const CUDANet::Tensor& input, CUDANet::Tensor& max) override;

     // Layer ops
-    void relu(CUDANet::Backend::Tensor &tensor) override;
-    void sigmoid(CUDANet::Backend::Tensor &tensor) override;
-    void softmax(CUDANet::Backend::Tensor &tensor, CUDANet::Backend::Tensor &temp_max, CUDANet::Backend::Tensor &temp_sum) override;
+    void relu(CUDANet::Tensor& tensor) override;
+    void sigmoid(CUDANet::Tensor& tensor) override;
+    void softmax(
+        CUDANet::Tensor& tensor,
+        CUDANet::Tensor& temp_max,
+        CUDANet::Tensor& temp_sum
+    ) override;

   private:
     static constexpr int BLOCK_SIZE = 256;
+    CUDANet::Tensor& dense(
+        const CUDANet::Tensor& weights,
+        const CUDANet::Tensor& biases,
+        const CUDANet::Tensor& input,
+        CUDANet::Tensor& output,
+        const size_t input_size,
+        const size_t output_size
+    ) override;
 };

 } // namespace CUDANet::Backend

(deleted file)
@@ -1,46 +0,0 @@
-#pragma once
-#include <cstddef>
-#include "backend/backend.hpp"
-#include <vector>
-
-namespace CUDANet::Backend
-{
-
-enum class DType
-{
-    FLOAT32,
-    // FLOAT16, // Not implemented yet
-    // INT32, // Not implemented yet
-};
-
-typedef std::vector<size_t> Shape;
-
-class Tensor
-{
-  public:
-
-    Tensor() = default;
-    Tensor(Shape shape, DType dtype, IBackend* backend);
-    ~Tensor();
-
-    size_t size() const;
-    size_t numel() const;
-
-    template <typename T>
-    const T* data() const;
-
-    template <typename T>
-    T* data();
-
-  private:
-    Shape shape;
-    DType dtype;
-
-    size_t total_elms;
-    size_t total_size;
-
-    IBackend* backend;
-    void* d_ptr;
-};
-
-} // namespace CUDANet::Backend

include/layer.hpp (new file, 42 lines)
@@ -0,0 +1,42 @@
+#pragma once
+
+#include <vector>
+
+#include "shape.hpp"
+#include "tensor.hpp"
+
+#define CUDANET_SAME_PADDING(inputSize, kernelSize, stride) \
+    ((stride - 1) * inputSize - stride + kernelSize) / 2;
+
+
+namespace CUDANet {
+
+/**
+ * @brief Basic Sequential Layer
+ *
+ */
+class Layer {
+  public:
+
+    virtual ~Layer(){};
+
+    virtual CUDANet::Tensor& forward(const CUDANet::Tensor &input) = 0;
+
+    virtual CUDANet::Shape input_shape() = 0;
+
+    virtual CUDANet::Shape output_shape() = 0;
+
+    virtual size_t input_size() = 0;
+
+    virtual size_t output_size() = 0;
+
+    virtual void set_weights(void *input) = 0;
+
+    virtual CUDANet::Tensor& get_weights() = 0;
+
+    virtual void set_biases(void *input) = 0;
+
+    virtual CUDANet::Tensor& get_biases() = 0;
+};
+
+} // namespace CUDANet::Layers

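
For orientation, here is a hypothetical minimal implementation of this interface (not part of the commit): a parameter-free identity layer. It shows which members a concrete layer has to provide and mirrors the way the refactored Activation layer stubs out the weight/bias accessors it does not need. The class name and its bodies are assumptions made for this sketch only.

#include <stdexcept>

#include "layer.hpp"
#include "shape.hpp"
#include "tensor.hpp"

// Hypothetical example: a pass-through layer with no parameters.
class Identity : public CUDANet::Layer {
  public:
    explicit Identity(CUDANet::Shape shape) : shape(shape) {}

    CUDANet::Tensor& forward(const CUDANet::Tensor& input) override {
        // No computation; hand the input straight through. The const_cast is a
        // shortcut for this sketch, since Tensor is move-only and cannot be copied.
        return const_cast<CUDANet::Tensor&>(input);
    }

    CUDANet::Shape input_shape() override { return shape; }
    CUDANet::Shape output_shape() override { return shape; }

    size_t input_size() override {
        size_t n = 1;
        for (size_t d : shape) n *= d;
        return n;
    }
    size_t output_size() override { return input_size(); }

    // No parameters: setters are no-ops, getters report misuse.
    void set_weights(void*) override {}
    CUDANet::Tensor& get_weights() override {
        throw std::logic_error("Identity has no weights");
    }
    void set_biases(void*) override {}
    CUDANet::Tensor& get_biases() override {
        throw std::logic_error("Identity has no biases");
    }

  private:
    CUDANet::Shape shape;
};
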
@@ -1,7 +1,8 @@
 #pragma once

-#include "backend/tensor.hpp"
-#include "backend/backend.hpp"
+#include "tensor.hpp"
+#include "backend.hpp"
+#include "layer.hpp"

 namespace CUDANet::Layers {

@@ -19,40 +20,41 @@ enum ActivationType { SIGMOID, RELU, SOFTMAX, NONE };
  * @brief Utility class that performs activation
  *
  */
-class Activation {
+class Activation : public Layer {
   public:

     Activation() = default;

-    /**
-     * @brief Construct a new Activation object
-     *
-     * @param activation Type of activation
-     * @param length Length of the input
-     */
-    Activation(CUDANet::Backend::IBackend* backend, ActivationType activation, const int length);
+    Activation(CUDANet::Backend* backend, ActivationType activation, const CUDANet::Shape &shape);

-    /**
-     * @brief Destroy the Activation object
-     *
-     */
-    ~Activation();
+    ~Activation() = default;

-    /**
-     * @brief Run the activation function on the input
-     *
-     * @param d_input Pointer to the input vector on the device
-     */
-    void activate(CUDANet::Backend::Tensor input);
+    CUDANet::Tensor& forward(CUDANet::Tensor &input) override;
+
+    CUDANet::Shape input_shape() override;
+
+    CUDANet::Shape output_shape() override;
+
+    size_t input_size() override;
+
+    size_t output_size() override;
+
+    void set_weights(void *input) override;
+
+    CUDANet::Tensor& get_weights() override;
+
+    void set_biases(void *input) override;
+
+    CUDANet::Tensor& get_biases() override;

   private:
-    CUDANet::Backend::IBackend* backend;
+    CUDANet::Backend* backend;
     ActivationType activationType;
-    int length;
+    CUDANet::Shape shape;

-    CUDANet::Backend::Tensor softmax_sum;
-    CUDANet::Backend::Tensor tensor_max;
+    CUDANet::Tensor softmax_sum;
+    CUDANet::Tensor tensor_max;
 };

 } // namespace CUDANet::Layers

@@ -6,7 +6,7 @@

 namespace CUDANet::Layers {

-class AvgPooling2d : public SequentialLayer, public TwoDLayer {
+class AvgPooling2d : public Layer, public TwoDLayer {
   public:
     AvgPooling2d(
         shape2d inputSize,
@@ -25,7 +25,7 @@ class AvgPooling2d : public SequentialLayer, public TwoDLayer {
      *
      * @return int output size
      */
-    int getOutputSize();
+    int get_output_size();

     /**
      * @brief Get input size

@@ -1,9 +1,8 @@
-#ifndef CUDANET_DENSE_LAYER_H
-#define CUDANET_DENSE_LAYER_H
+#pragma once

 #include <vector>

 #include "activation.hpp"
+#include "backend.hpp"
+#include "layer.hpp"

 namespace CUDANet::Layers {
@@ -12,121 +11,42 @@ namespace CUDANet::Layers {
  * @brief Dense (fully connected) layer
  *
  */
-class Dense : public WeightedLayer {
+class Dense : public Layer {
   public:
-    /**
-     * @brief Construct a new Dense layer
-     *
-     * @param inputSize Size of the input vector
-     * @param outputSize Size of the output vector
-     * @param activationType Activation function type ('RELU', 'SIGMOID',
-     * 'SOFTMAX' or 'NONE')
-     */
-    Dense(int inputSize, int outputSize, Layers::ActivationType activationType);
-
-    /**
-     * @brief Destroy the Dense layer
-     *
-     */
+    Dense(CUDANet::Backend *backend, CUDANet::Shape input_shape, CUDANet::Shape output_shape);
+
     ~Dense();

-    /**
-     * @brief Forward pass of the dense layer
-     *
-     * @param d_input Device pointer to the input vector
-     * @return Device pointer to the output vector
-     */
-    float* forward(const float* d_input);
+    CUDANet::Tensor& forward(const CUDANet::Tensor &input) override;

-    /**
-     * @brief Set the weights of the layer
-     *
-     * @param weights Pointer to vector of weights
-     */
-    void setWeights(const float* weights);
+    CUDANet::Shape input_shape() override;

-    /**
-     * @brief Get the weights of the layer
-     *
-     * @return Vector of weights
-     */
-    std::vector<float> getWeights();
+    CUDANet::Shape output_shape() override;

-    /**
-     * @brief Set the biases of the layer
-     *
-     * @param biases Pointer to vector of biases
-     */
-    void setBiases(const float* biases);
+    size_t input_size() override;

-    /**
-     * @brief Get the biases of the layer
-     *
-     * @return Vector of biases
-     */
-    std::vector<float> getBiases();
+    size_t output_size() override;

-    /**
-     * @brief Get output size
-     *
-     * @return int output size
-     */
-    int getOutputSize();
+    void set_weights(void *input) override;

-    /**
-     * @brief Get input size
-     *
-     * @return int input size
-     */
-    int getInputSize();
+    CUDANet::Tensor& get_weights() override;
+
+    void set_biases(void *input) override;
+
+    CUDANet::Tensor& get_biases() override;

   private:
-    int inputSize;
-    int outputSize;
+    CUDANet::Backend *backend;

-    std::vector<float> weights;
-    std::vector<float> biases;
+    CUDANet::Shape in_shape;
+    CUDANet::Shape out_shape;

-    Layers::Activation* activation;
-
-    /**
-     * @brief Initialize the weights to zeros
-     *
-     */
-    void initializeWeights();
-
-    /**
-     * @brief Initialize the biases to zeros
-     *
-     */
-    void initializeBiases();
-
-    float* forwardCPU(const float* input);
-
-#ifdef USE_CUDA
-    float* d_output;
-
-    float* d_weights;
-    float* d_biases;
-
-    // Precompute kernel launch parameters
-    int forwardGridSize;
-    int biasGridSize;
-
-    /**
-     * @brief Copy the weights and biases to the device
-     *
-     */
-    void toCuda();
-
-    void initCUDA();
-    void delCUDA();
-
-    float* forwardCUDA(const float* d_input);
-#endif
+    CUDANet::Tensor weights;
+    CUDANet::Tensor biases;
+
+    CUDANet::Tensor output;
 };

 } // namespace CUDANet::Layers

-#endif // CUDANET_DENSE_LAYER_H

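
For orientation, this is roughly how the reworked Dense layer is meant to be used after this change: the backend and the 1-D shapes are passed explicitly, parameters are uploaded through set_weights/set_biases, and the activation is a separate Layer rather than a constructor argument. The snippet is a sketch only; the function name and the sizes are made up, and it assumes a concrete backend instance is built elsewhere.

#include <vector>

#include "activation.hpp"
#include "dense.hpp"

// Hypothetical helper; `backend` is any concrete CUDANet::Backend implementation.
void build_classifier(CUDANet::Backend* backend) {
    // Old API: Dense(inputSize, outputSize, activationType)
    // New API: Dense(backend, input_shape, output_shape), both shapes 1-D
    CUDANet::Layers::Dense fc(backend, CUDANet::Shape{2048}, CUDANet::Shape{1000});

    std::vector<float> w(2048 * 1000, 0.01f);
    std::vector<float> b(1000, 0.0f);
    fc.set_weights(w.data());   // copied in via Backend::copy_to_device
    fc.set_biases(b.data());

    // Activation is applied by a separate layer now, not inside Dense::forward.
    CUDANet::Layers::Activation relu(backend, CUDANet::Layers::RELU, CUDANet::Shape{1000});

    // Forward pass (given a CUDANet::Tensor `input` of shape {2048}):
    //     CUDANet::Tensor& logits = relu.forward(fc.forward(input));
}
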
@@ -9,7 +9,7 @@ namespace CUDANet::Layers {
  * @brief Input layer, just copies the input to the device
  *
  */
-class Input : public SequentialLayer {
+class Input : public Layer {
   public:
     /**
      * @brief Create a new Input layer
@@ -38,7 +38,7 @@ class Input : public SequentialLayer {
      *
      * @return int output size
      */
-    int getOutputSize();
+    int get_output_size();

     /**
      * @brief Get input size

(deleted file)
@@ -1,124 +0,0 @@
-
-#ifndef CUDANET_I_LAYER_H
-#define CUDANET_I_LAYER_H
-
-#include <vector>
-
-#define CUDANET_SAME_PADDING(inputSize, kernelSize, stride) \
-    ((stride - 1) * inputSize - stride + kernelSize) / 2;
-
-typedef std::pair<int, int> shape2d;
-
-namespace CUDANet::Layers {
-
-
-class TwoDLayer {
-
-  public:
-    virtual shape2d getOutputDims() = 0;
-
-};
-
-
-/**
- * @brief Basic Sequential Layer
- *
- */
-class SequentialLayer {
-  public:
-    /**
-     * @brief Destroy the Sequential Layer
-     *
-     */
-    virtual ~SequentialLayer(){};
-
-    /**
-     * @brief Forward propagation virtual function
-     *
-     * @param input Device pointer to the input
-     * @return float* Device pointer to the output
-     */
-    virtual float* forward(const float* input) = 0;
-
-    /**
-     * @brief Get output size
-     *
-     * @return int output size
-     */
-    virtual int getOutputSize() = 0;
-
-    /**
-     * @brief Get input size
-     *
-     * @return int input size
-     */
-    virtual int getInputSize() = 0;
-};
-
-/**
- * @brief Base class for layers with weights and biases
- */
-class WeightedLayer : public SequentialLayer {
-  public:
-    /**
-     * @brief Destroy the ILayer object
-     *
-     */
-    virtual ~WeightedLayer(){};
-
-    /**
-     * @brief Virtual function for forward pass
-     *
-     * @param input (Device) Pointer to the input
-     * @return float* Device pointer to the output
-     */
-    virtual float* forward(const float* input) = 0;
-
-    /**
-     * @brief Virtual function for setting weights
-     *
-     * @param weights Pointer to the weights
-     */
-    virtual void setWeights(const float* weights) = 0;
-
-    /**
-     * @brief Virtual function for getting weights
-     *
-     */
-    virtual std::vector<float> getWeights() = 0;
-
-    /**
-     * @brief Virtual function for setting biases
-     *
-     * @param biases Pointer to the biases
-     */
-    virtual void setBiases(const float* biases) = 0;
-
-    /**
-     * @brief Virtual function for getting biases
-     *
-     */
-    virtual std::vector<float> getBiases() = 0;
-
-  private:
-    /**
-     * @brief Initialize the weights
-     */
-    virtual void initializeWeights() = 0;
-
-    /**
-     * @brief Initialize the biases
-     */
-    virtual void initializeBiases() = 0;
-
-#ifdef USE_CUDA
-    /**
-     * @brief Copy the weights and biases to the device
-     */
-    virtual void toCuda() = 0;
-#endif
-};
-
-} // namespace CUDANet::Layers
-
-#endif // CUDANET_I_LAYERH

@@ -6,7 +6,7 @@

 namespace CUDANet::Layers {

-class MaxPooling2d : public SequentialLayer, public TwoDLayer {
+class MaxPooling2d : public Layer, public TwoDLayer {
   public:
     MaxPooling2d(
         shape2d inputSize,
@@ -25,7 +25,7 @@ class MaxPooling2d : public SequentialLayer, public TwoDLayer {
      *
      * @return int output size
      */
-    int getOutputSize();
+    int get_output_size();

     /**
      * @brief Get input size

@@ -5,7 +5,7 @@

 namespace CUDANet::Layers {

-class Output : public SequentialLayer {
+class Output : public Layer {
   public:
     /**
      * @brief Create a new Output layer
@@ -34,7 +34,7 @@ class Output : public SequentialLayer {
      *
      * @return int output size
      */
-    int getOutputSize();
+    int get_output_size();

     /**
      * @brief Get input size

include/shape.hpp (new file, 9 lines)
@@ -0,0 +1,9 @@
+#pragma once
+
+#include <vector>
+
+namespace CUDANet {
+
+typedef std::vector<size_t> Shape;
+
+} // namespace CUDANet

include/tensor.hpp (new file, 64 lines)
@@ -0,0 +1,64 @@
+#pragma once
+
+#include <cstddef>
+#include <vector>
+
+#include "backend.hpp"
+#include "shape.hpp"
+
+namespace CUDANet
+{
+
+enum class DType
+{
+    FLOAT32,
+    // FLOAT16, // Not implemented yet
+    // INT32, // Not implemented yet
+};
+
+class Tensor
+{
+  public:
+
+    Tensor() = default;
+    Tensor(Shape shape, DType dtype, CUDANet::Backend* backend);
+
+    Tensor(Tensor&& other) noexcept;
+    Tensor& operator=(Tensor&& other) noexcept;
+    Tensor(const Tensor&) = delete;
+    Tensor& operator=(const Tensor&) = delete;
+
+    ~Tensor();
+
+    size_t size() const;
+    size_t numel() const;
+
+    template <typename T>
+    const T* data() const {
+        return static_cast<T*>(d_ptr);
+    }
+
+    template <typename T>
+    T* data() {
+        return static_cast<T*>(d_ptr);
+    }
+
+    void zero();
+
+    template <typename T>
+    void set_data(T *data) {
+        backend->copy_to_device(*this, data, total_size);
+    }
+
+  private:
+    Shape shape;
+    DType dtype;
+
+    size_t total_elms;
+    size_t total_size;
+
+    CUDANet::Backend* backend;
+    void* d_ptr;
+};
+
+} // namespace CUDANet

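
A brief sketch of the ownership rules this header establishes: device memory is acquired from the backend in the constructor and released in the destructor, copying is disabled, and moving transfers ownership. The function below is illustrative only (its name and sizes are made up) and assumes a concrete backend pointer.

#include <utility>
#include <vector>

#include "tensor.hpp"

void tensor_basics(CUDANet::Backend* backend) {
    // 4 x 4 FLOAT32 tensor: numel() == 16, size() == 64 bytes
    CUDANet::Tensor a({4, 4}, CUDANet::DType::FLOAT32, backend);

    std::vector<float> host(a.numel(), 1.0f);
    a.set_data<float>(host.data());     // host -> device via Backend::copy_to_device

    // CUDANet::Tensor b = a;           // ill-formed: copy constructor is deleted
    CUDANet::Tensor b = std::move(a);   // ownership moves to b; a is left empty

    b.zero();                           // dispatches to Backend::zero
}   // b's destructor returns the allocation through Backend::deallocate
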
@@ -28,12 +28,12 @@ cudaDeviceProp initializeCUDA() {

 using namespace CUDANet::Backend;

-void* CUDABackend::allocate(size_t bytes) {
+void* CUDA::allocate(size_t bytes) {
     void* d_ptr = nullptr;
     CUDA_CHECK(cudaMalloc(&d_ptr, bytes));
     return d_ptr;
 }

-void CUDABackend::deallocate(void* ptr) {
+void CUDA::deallocate(void* ptr) {
     CUDA_CHECK(cudaFree(ptr));
 }

@@ -1,25 +1,29 @@
 #include "backend/cuda.cuh"
-#include "utils/cuda_helper.cuh"
 #include "kernels/activation_functions.cuh"
 #include "kernels/matmul.cuh"
+#include "utils/cuda_helper.cuh"

 using namespace CUDANet::Backend;

-void CUDABackend::relu(Tensor &tensor) {
+void CUDA::relu(Tensor& tensor) {
     int gridSize = (tensor.numel() + BLOCK_SIZE - 1) / BLOCK_SIZE;
-    Kernels::relu<<<gridSize, BLOCK_SIZE>>>(tensor.data<float>(), tensor.data<float>(), tensor.numel());
+    Kernels::relu<<<gridSize, BLOCK_SIZE>>>(
+        tensor.data<float>(), tensor.data<float>(), tensor.numel()
+    );
     CUDA_CHECK(cudaGetLastError());
     CUDA_CHECK(cudaDeviceSynchronize());
 }

-void CUDABackend::sigmoid(Tensor &tensor) {
+void CUDA::sigmoid(Tensor& tensor) {
     int gridSize = (tensor.numel() + BLOCK_SIZE - 1) / BLOCK_SIZE;
-    Kernels::sigmoid<<<gridSize, BLOCK_SIZE>>>(tensor.data<float>(), tensor.data<float>(), tensor.numel());
+    Kernels::sigmoid<<<gridSize, BLOCK_SIZE>>>(
+        tensor.data<float>(), tensor.data<float>(), tensor.numel()
+    );
     CUDA_CHECK(cudaGetLastError());
     CUDA_CHECK(cudaDeviceSynchronize());
 }

-void CUDABackend::softmax(Tensor &tensor, Tensor &temp_max, Tensor &temp_sum) {
+void CUDA::softmax(Tensor& tensor, Tensor& temp_max, Tensor& temp_sum) {
     int gridSize = (tensor.numel() + BLOCK_SIZE - 1) / BLOCK_SIZE;

     // Find max value
@@ -27,7 +31,8 @@ void CUDABackend::softmax(Tensor &tensor, Tensor &temp_max, Tensor &temp_sum) {

     // Subtract max value to improve numerical stability
     Kernels::vec_scalar_sub<<<gridSize, BLOCK_SIZE>>>(
-        tensor.data<float>(), tensor.data<float>(), temp_max.data<float>(), tensor.numel()
+        tensor.data<float>(), tensor.data<float>(), temp_max.data<float>(),
+        tensor.numel()
     );
     CUDA_CHECK(cudaGetLastError());

@@ -41,8 +46,37 @@ void CUDABackend::softmax(Tensor &tensor, Tensor &temp_max, Tensor &temp_sum) {
     sum(tensor, temp_sum);

     Kernels::vec_scalar_div<<<gridSize, BLOCK_SIZE>>>(
-        tensor.data<float>(), tensor.data<float>(), temp_sum.data<float>(), tensor.numel()
+        tensor.data<float>(), tensor.data<float>(), temp_sum.data<float>(),
+        tensor.numel()
     );
     CUDA_CHECK(cudaGetLastError());
     CUDA_CHECK(cudaDeviceSynchronize());
 }
+
+CUDANet::Tensor& CUDA::dense(
+    const CUDANet::Tensor& weights,
+    const CUDANet::Tensor& biases,
+    const CUDANet::Tensor& input,
+    CUDANet::Tensor& output,
+    const size_t input_size,
+    const size_t output_size
+) {
+    auto forwardGridSize =
+        (std::max(input_size, output_size) + BLOCK_SIZE - 1) / BLOCK_SIZE;
+    auto biasGridSize = (output_size + BLOCK_SIZE - 1) / BLOCK_SIZE;
+
+    Kernels::mat_vec_mul<<<forwardGridSize, BLOCK_SIZE>>>(
+        weights.data<float>(), input.data<float>(), output.data<float>(),
+        input_size, output_size
+    );
+    CUDA_CHECK(cudaGetLastError());
+
+    Kernels::vec_vec_add<<<biasGridSize, BLOCK_SIZE>>>(
+        biases.data<float>(), output.data<float>(), output.data<float>(),
+        output_size
+    );
+    CUDA_CHECK(cudaGetLastError());
+    CUDA_CHECK(cudaDeviceSynchronize());
+
+    return output;
+}

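
A worked example of the launch-size arithmetic in CUDA::dense above, with BLOCK_SIZE = 256 (the sizes here are illustrative, not taken from the commit):

// input_size = 2048, output_size = 1000
// forwardGridSize = (max(2048, 1000) + 256 - 1) / 256 = 2303 / 256 = 8
// biasGridSize    = (1000 + 256 - 1) / 256            = 1255 / 256 = 4
// so mat_vec_mul is launched with 8 blocks of 256 threads and
// vec_vec_add with 4 blocks of 256 (1024 threads covering 1000 outputs).
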
(deleted file)
@@ -1,77 +0,0 @@
-#include <vector>
-
-#include "activation.hpp"
-#include "activation_functions.cuh"
-#include "cuda_helper.cuh"
-#include "matmul.cuh"
-#include "vector.cuh"
-
-using namespace CUDANet::Layers;
-
-void Activation::initCUDA() {
-    if (activationType == SOFTMAX) {
-        d_softmax_sum = nullptr;
-        CUDA_CHECK(cudaMalloc((void**)&d_softmax_sum, sizeof(float) * length));
-
-        d_max = nullptr;
-        CUDA_CHECK(cudaMalloc((void**)&d_max, sizeof(float) * length));
-    }
-
-    gridSize = (length + BLOCK_SIZE - 1) / BLOCK_SIZE;
-}
-
-void Activation::delCUDA() {
-    if (activationType == SOFTMAX) {
-        CUDA_CHECK(cudaFree(d_softmax_sum));
-        CUDA_CHECK(cudaFree(d_max));
-    }
-}
-
-void Activation::activateCUDA(float* d_input) {
-
-    // float sum = 0.0f;
-
-    switch (activationType) {
-        case SIGMOID:
-            Kernels::sigmoid<<<gridSize, BLOCK_SIZE>>>(
-                d_input, d_input, length
-            );
-            CUDA_CHECK(cudaGetLastError());
-            break;
-
-        case RELU:
-            Kernels::relu<<<gridSize, BLOCK_SIZE>>>(d_input, d_input, length);
-            CUDA_CHECK(cudaGetLastError());
-            break;
-        case SOFTMAX:
-
-            // Find max value
-            Utils::max(d_input, d_max, length);
-
-            // Subtract max value to improve numerical stability
-            Kernels::vec_scalar_sub<<<gridSize, BLOCK_SIZE>>>(
-                d_input, d_input, &d_max[0], length
-            );
-            CUDA_CHECK(cudaGetLastError());
-
-            // Compute exponentials
-            Kernels::vec_exp<<<gridSize, BLOCK_SIZE>>>(
-                d_input, d_input, length
-            );
-            CUDA_CHECK(cudaGetLastError());
-
-            // Find sum
-            Utils::sum(d_input, d_softmax_sum, length);
-
-            Kernels::vec_scalar_div<<<gridSize, BLOCK_SIZE>>>(
-                d_input, d_input, &d_softmax_sum[0], length
-            );
-            CUDA_CHECK(cudaGetLastError());
-            break;
-
-        default:
-            break;
-    }
-
-    CUDA_CHECK(cudaDeviceSynchronize());
-}

(deleted file)
@@ -1,69 +0,0 @@
-#include <cuda_runtime.h>
-
-#include <cstdio>
-#include <cstdlib>
-#include <functional>
-#include <iostream>
-
-#include "vector.cuh"
-#include "activation.hpp"
-#include "cuda_helper.cuh"
-#include "dense.hpp"
-#include "matmul.cuh"
-
-using namespace CUDANet::Layers;
-
-void Dense::initCUDA() {
-    d_output = nullptr;
-
-    CUDA_CHECK(cudaMalloc((void**)&d_output, sizeof(float) * outputSize));
-
-    d_weights = nullptr;
-    d_biases = nullptr;
-
-    // Allocate GPU memory for weights and biases
-    CUDA_CHECK(
-        cudaMalloc((void**)&d_weights, sizeof(float) * inputSize * outputSize)
-    );
-    CUDA_CHECK(cudaMalloc((void**)&d_biases, sizeof(float) * outputSize));
-    toCuda();
-
-    // Calculate block and grid sizes
-    forwardGridSize =
-        (std::max(inputSize, outputSize) + BLOCK_SIZE - 1) / BLOCK_SIZE;
-    biasGridSize = (outputSize + BLOCK_SIZE - 1) / BLOCK_SIZE;
-}
-
-void Dense::delCUDA() {
-    cudaFree(d_output);
-    cudaFree(d_weights);
-    cudaFree(d_biases);
-}
-
-void Dense::toCuda() {
-    CUDA_CHECK(cudaMemcpy(
-        d_weights, weights.data(), sizeof(float) * inputSize * outputSize,
-        cudaMemcpyHostToDevice
-    ));
-    CUDA_CHECK(cudaMemcpy(
-        d_biases, biases.data(), sizeof(float) * outputSize,
-        cudaMemcpyHostToDevice
-    ));
-}
-
-float* Dense::forwardCUDA(const float* d_input) {
-    Kernels::mat_vec_mul<<<forwardGridSize, BLOCK_SIZE>>>(
-        d_weights, d_input, d_output, inputSize, outputSize
-    );
-    CUDA_CHECK(cudaGetLastError());
-
-    Kernels::vec_vec_add<<<biasGridSize, BLOCK_SIZE>>>(
-        d_biases, d_output, d_output, outputSize
-    );
-    CUDA_CHECK(cudaGetLastError());
-
-    activation->activate(d_output);
-    CUDA_CHECK(cudaDeviceSynchronize());
-
-    return d_output;
-}

@@ -1,13 +1,13 @@
 #include <iostream>

-#include "backend/backend.hpp"
+#include "backend.hpp"
 #include "backend/cuda.cuh"
 #include "utils/cuda_helper.cuh"
 #include "kernels/matmul.cuh"

 using namespace CUDANet::Backend;

-void CUDABackend::print(const CUDANet::Backend::Tensor &input) {
+void CUDA::print(const CUDANet::Tensor &input) {
     auto length = input.numel();
     std::vector<float> h_vec(input.numel());

@@ -22,11 +22,15 @@ void CUDABackend::print(const CUDANet::Backend::Tensor &input) {
     std::cout << std::endl;
 }

-void CUDABackend::clear(CUDANet::Backend::Tensor &input) {
+void CUDA::zero(CUDANet::Tensor &input) {
     CUDA_CHECK(cudaMemset(input.data<float>(), 0, sizeof(float) * input.numel()));
 }

-void CUDABackend::sum(const CUDANet::Backend::Tensor &input, CUDANet::Backend::Tensor &sum) {
+void CUDA::copy_to_device(CUDANet::Tensor &tensor, void *data, size_t size) {
+    CUDA_CHECK(cudaMemcpy(tensor.data<float>(), data, size, cudaMemcpyHostToDevice));
+}
+
+void CUDA::sum(const CUDANet::Tensor &input, CUDANet::Tensor &sum) {
     auto length = input.numel();
     const int gridSize = (length + BLOCK_SIZE - 1) / BLOCK_SIZE;

@@ -45,7 +49,7 @@ void CUDABackend::sum(const CUDANet::Backend::Tensor &input, CUDANet::Backend::Tensor &sum) {
     }
 }

-void CUDABackend::max(const CUDANet::Backend::Tensor &input, CUDANet::Backend::Tensor &max) {
+void CUDA::max(const CUDANet::Tensor &input, CUDANet::Tensor &max) {
     auto length = input.numel();
     const int grid_size = (length + BLOCK_SIZE - 1) / BLOCK_SIZE;

(deleted file)
@@ -1,52 +0,0 @@
-#include "backend/tensor.hpp"
-
-#include <stdexcept>
-
-using namespace CUDANet::Backend;
-
-Tensor::Tensor(Shape shape, DType dtype, IBackend* backend)
-    : shape(shape), dtype(dtype), backend(backend), d_ptr(nullptr) {
-    // Count total elements
-    size_t count = 1;
-    for (const auto& dim : shape) {
-        count *= dim;
-    }
-    total_elms = count;
-
-    // Compute total size (bytes)
-    size_t type_size = 0;
-    switch (dtype) {
-        case DType::FLOAT32:
-            type_size = 4;
-            break;
-        default:
-            throw std::runtime_error("Unsupported data type");
-    }
-    total_size = total_elms * type_size;
-
-    // Allocate memory on backend
-    d_ptr = backend->allocate(total_size);
-}
-
-Tensor::~Tensor() {
-    backend->deallocate(d_ptr);
-    d_ptr = nullptr;
-}
-
-size_t Tensor::numel() const {
-    return total_elms;
-}
-
-size_t Tensor::size() const {
-    return total_size;
-}
-
-template <typename T>
-const T* Tensor::data() const {
-    return static_cast<T*>(d_ptr);
-}
-
-template <typename T>
-T* Tensor::data() {
-    return static_cast<T*>(d_ptr);
-}

@@ -1,22 +1,28 @@
+#include <format>
+#include <stdexcept>
 #include <vector>

 #include "activation.hpp"
-#include "backend/tensor.hpp"
+#include "tensor.hpp"

 using namespace CUDANet::Layers;

-Activation::Activation(CUDANet::Backend::IBackend* backend, ActivationType activation, const int length)
-    : backend(backend), activationType(activation), length(length) {
+Activation::Activation(CUDANet::Backend* backend, ActivationType activation, const CUDANet::Shape &shape)
+    : backend(backend), activationType(activation), shape(shape) {
+
+    if (shape.size() != 1) {
+        throw std::runtime_error(std::format("Invalid shape. Expected [1], got {}", shape));
+    }
+
+    auto length = shape[0];

     if (activationType == SOFTMAX) {
-        softmax_sum = CUDANet::Backend::Tensor({static_cast<size_t>(length)}, CUDANet::Backend::DType::FLOAT32, backend);
-        tensor_max = CUDANet::Backend::Tensor({static_cast<size_t>(length)}, CUDANet::Backend::DType::FLOAT32, backend);
+        softmax_sum = CUDANet::Tensor({static_cast<size_t>(length)}, CUDANet::DType::FLOAT32, backend);
+        tensor_max = CUDANet::Tensor({static_cast<size_t>(length)}, CUDANet::DType::FLOAT32, backend);
     }
 }

-void Activation::activate(CUDANet::Backend::Tensor input) {
+CUDANet::Tensor& Activation::forward(CUDANet::Tensor &input) {
     switch (activationType)
     {
     case ActivationType::SIGMOID:
@@ -31,4 +37,30 @@ void Activation::activate(CUDANet::Backend::Tensor input) {
     default:
         break;
     }
+
+    return input;
 }
+
+CUDANet::Shape Activation::input_shape() {
+    return shape;
+}
+
+CUDANet::Shape Activation::output_shape() {
+    return shape;
+}
+
+size_t Activation::input_size() {
+    return shape[0];
+}
+
+size_t Activation::output_size() {
+    return shape[0];
+}
+
+void Activation::set_weights(void *input) {}
+
+CUDANet::Tensor& Activation::get_weights() {}
+
+void Activation::set_biases(void *input) {}
+
+CUDANet::Tensor& Activation::get_biases() {}

@@ -54,7 +54,7 @@ float* AvgPooling2d::forward(const float* input) {
 #endif
 }

-int AvgPooling2d::getOutputSize() {
+int AvgPooling2d::get_output_size() {
     return outputSize.first * outputSize.second * nChannels;
 }

@@ -1,80 +1,75 @@
 #include "dense.hpp"

+#include <format>
+#include <stdexcept>
+
 #include "activation.hpp"

 using namespace CUDANet::Layers;

-Dense::Dense(int inputSize, int outputSize, ActivationType activationType)
-    : inputSize(inputSize), outputSize(outputSize) {
+Dense::Dense(CUDANet::Backend* backend, CUDANet::Shape in, CUDANet::Shape out)
+    : backend(backend),
+      in_shape(in),
+      out_shape(out),
+      weights(
+          CUDANet::Tensor(Shape{in[0] * out[0]}, CUDANet::DType::FLOAT32, backend)
+      ),
+      biases(CUDANet::Tensor(Shape{out[0]}, CUDANet::DType::FLOAT32, backend)),
+      output(CUDANet::Tensor(Shape{out[0]}, CUDANet::DType::FLOAT32, backend)) {
-    // Allocate memory for weights and biases
-    weights.resize(outputSize * inputSize);
-    biases.resize(outputSize);
-
-    initializeWeights();
-    initializeBiases();
-
-    activation = new Activation(activationType, outputSize);
-
-#ifdef USE_CUDA
-    initCUDA();
-#endif
+    if (in.size() != 1) {
+        throw std::runtime_error(
+            std::format("Invalid shape. Expected [1], got {}", in)
+        );
+    }

-Dense::~Dense() {
-    delete activation;
-#ifdef USE_CUDA
-    delCUDA();
-#endif
+    if (out.size() != 1) {
+        throw std::runtime_error(
+            std::format("Invalid shape. Expected [1], got {}", out)
+        );
+    }

-void Dense::initializeWeights() {
-    std::fill(weights.begin(), weights.end(), 0.0f);
+    auto input_len = in[0];
+    auto output_len = out[0];
+
+    weights.zero();
+    biases.zero();
 }

-void Dense::initializeBiases() {
-    std::fill(biases.begin(), biases.end(), 0.0f);
+Dense::~Dense() {}
+
+CUDANet::Tensor& Dense::forward(const CUDANet::Tensor& input) {
+    backend->dense(weights, biases, input, output, in_shape[0], out_shape[0]);
+    return output;
 }

-float* Dense::forwardCPU(const float* input) {
-    throw std::logic_error("Not implemented");
+CUDANet::Shape Dense::input_shape() {
+    return in_shape;
 }

-float* Dense::forward(const float* input) {
-#ifdef USE_CUDA
-    return forwardCUDA(input);
-#else
-    return forwardCPU(input);
-#endif
+CUDANet::Shape Dense::output_shape() {
+    return out_shape;
 }

-void Dense::setWeights(const float* weights_input) {
-    std::copy(weights_input, weights_input + weights.size(), weights.begin());
-#ifdef USE_CUDA
-    toCuda();
-#endif
+size_t Dense::input_size() {
+    return in_shape[0];
+};
+
+size_t Dense::output_size() {
+    return out_shape[0];
+};
+
+void Dense::set_weights(void* input) {
+    weights.set_data<float>(static_cast<float*>(input));
 }

-std::vector<float> Dense::getWeights() {
+CUDANet::Tensor& Dense::get_weights() {
     return weights;
 }

-void Dense::setBiases(const float* biases_input) {
-    std::copy(biases_input, biases_input + biases.size(), biases.begin());
-#ifdef USE_CUDA
-    toCuda();
-#endif
+void Dense::set_biases(void* input) {
+    biases.set_data<float>(static_cast<float*>(input));
 }

-std::vector<float> Dense::getBiases() {
+CUDANet::Tensor& Dense::get_biases() {
     return biases;
 }
-
-int Dense::getOutputSize() {
-    return outputSize;
-}
-
-int Dense::getInputSize() {
-    return inputSize;
-}

@@ -28,7 +28,7 @@ float* Input::forward(const float* input) {
 #endif
 }

-int Input::getOutputSize() {
+int Input::get_output_size() {
     return inputSize;
 }

@@ -54,7 +54,7 @@ float* MaxPooling2d::forward(const float* input) {
 }


-int MaxPooling2d::getOutputSize() {
+int MaxPooling2d::get_output_size() {
     return outputSize.first * outputSize.second * nChannels;
 }

@@ -24,7 +24,7 @@ float* Output::forward(const float* input) {
 #endif
 }

-int Output::getOutputSize() {
+int Output::get_output_size() {
     return inputSize;
 }

src/tensor.cpp (new file, 87 lines)
@@ -0,0 +1,87 @@
+#include <stdexcept>
+
+#include "tensor.hpp"
+
+using namespace CUDANet;
+
+Tensor::Tensor(Shape shape, DType dtype, Backend* backend)
+    : shape(shape), dtype(dtype), backend(backend), d_ptr(nullptr) {
+
+    if (shape.empty()) {
+        throw std::runtime_error("Tensor shape cannot be empty");
+    }
+
+    // Count total elements
+    size_t count = 1;
+    for (const auto& dim : shape) {
+        count *= dim;
+    }
+    total_elms = count;
+
+    // Compute total size (bytes)
+    size_t type_size = 0;
+    switch (dtype) {
+        case DType::FLOAT32:
+            type_size = 4;
+            break;
+        default:
+            throw std::runtime_error("Unsupported data type");
+    }
+    total_size = total_elms * type_size;
+
+    // Allocate memory on backend
+    d_ptr = backend->allocate(total_size);
+}
+
+Tensor::Tensor(Tensor&& other) noexcept
+    : shape(std::move(other.shape)),
+      dtype(other.dtype),
+      total_elms(other.total_elms),
+      total_size(other.total_size),
+      backend(other.backend),
+      d_ptr(other.d_ptr)
+{
+    other.d_ptr = nullptr;
+    other.backend = nullptr;
+}
+
+Tensor& Tensor::operator=(Tensor&& other) noexcept {
+    if (this != &other) {
+        // Clean up our current resources
+        if (d_ptr != nullptr && backend != nullptr) {
+            backend->deallocate(d_ptr);
+        }
+
+        // Steal other's resources
+        shape = std::move(other.shape);
+        dtype = other.dtype;
+        total_elms = other.total_elms;
+        total_size = other.total_size;
+        backend = other.backend;
+        d_ptr = other.d_ptr;
+
+        // Leave other in valid but empty state
+        other.d_ptr = nullptr;
+        other.backend = nullptr;
+    }
+    return *this;
+}
+
+Tensor::~Tensor() {
+    if (backend && d_ptr) {
+        backend->deallocate(d_ptr);
+        d_ptr = nullptr;
+    }
+}
+
+size_t Tensor::numel() const {
+    return total_elms;
+}
+
+size_t Tensor::size() const {
+    return total_size;
+}
+
+void Tensor::zero() {
+    backend->zero(*this);
+}

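
One practical consequence of the noexcept move operations defined above: Tensors can now live in standard containers without ever copying device memory. The function below is a hypothetical sketch (its name and sizes are made up), assuming a concrete backend pointer.

#include <vector>

#include "tensor.hpp"

std::vector<CUDANet::Tensor> make_buffers(CUDANet::Backend* backend) {
    std::vector<CUDANet::Tensor> buffers;
    for (size_t i = 1; i <= 3; ++i) {
        // The temporary is moved into the vector; no device-side copy is made.
        buffers.push_back(CUDANet::Tensor({i * 128}, CUDANet::DType::FLOAT32, backend));
    }
    return buffers;  // moved-from temporaries hold nullptr, so their destructors do nothing
}
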
@@ -54,7 +54,7 @@ class AvgPoolingLayerTest : public ::testing::Test {

         d_output = avgPoolingLayer->forward(d_input);

-        int outputSize = avgPoolingLayer->getOutputSize();
+        int outputSize = avgPoolingLayer->get_output_size();

         std::vector<float> output(outputSize);
         cudaStatus = cudaMemcpy(
@@ -229,7 +229,7 @@ class AdaptiveAvgPoolingLayerTest : public ::testing::Test {

         d_output = adaptiveAvgPoolingLayer->forward(d_input);

-        int outputSize = adaptiveAvgPoolingLayer->getOutputSize();
+        int outputSize = adaptiveAvgPoolingLayer->get_output_size();

         std::vector<float> output(outputSize);
         cudaStatus = cudaMemcpy(
@@ -52,7 +52,7 @@ class MaxPoolingLayerTest : public ::testing::Test {

         d_output = maxPoolingLayer->forward(d_input);

-        int outputSize = maxPoolingLayer->getOutputSize();
+        int outputSize = maxPoolingLayer->get_output_size();

         std::vector<float> output(outputSize);
         cudaStatus = cudaMemcpy(