Refactor Backend and Layer interfaces

2025-12-22 06:14:22 +00:00 · 2025-11-18 18:27:57 +01:00
parent 25670f90c4
commit 6340b27055
23 changed files with 154 additions and 201 deletions
--- a/examples/inception_v3/inception_modules.cpp
+++ b/examples/inception_v3/inception_modules.cpp
@@ -213,7 +213,7 @@ InceptionB::InceptionB(
        branch3x3->getOutputSize(), branch3x3dbl_3->getOutputSize()
    );
    concat_2 = new CUDANet::Layers::Concat(
-        concat_1->getOutputSize(), branchPool->getOutputSize()
+        concat_1->getOutputSize(), branchPool->get_output_size()
    );
    outputSize = concat_2->getOutputSize();
@@ -441,7 +441,7 @@ InceptionD::InceptionD(
        branch3x3_2->getOutputSize(), branch7x7x3_4->getOutputSize()
    );
    concat_2 = new CUDANet::Layers::Concat(
-        concat_1->getOutputSize(), branchPool->getOutputSize()
+        concat_1->getOutputSize(), branchPool->get_output_size()
    );
    outputSize = concat_2->getOutputSize();
@@ -707,7 +707,7 @@ InceptionV3::InceptionV3(
    addLayer("AveragePool", avgpool);
    fc = new CUDANet::Layers::Dense(
-        avgpool->getOutputSize(), 1000, CUDANet::Layers::ActivationType::NONE
+        avgpool->get_output_size(), 1000, CUDANet::Layers::ActivationType::NONE
    );
    addLayer("fc", fc);
 }
--- a/include/backend.hpp
+++ b/include/backend.hpp
@@ -0,0 +1,30 @@
 #pragma once
 #include <cstddef>
 #include "tensor.hpp"
 namespace CUDANet
 {   
 class Backend
 {
 public:
    // Memory management
    virtual void* allocate(size_t bytes) = 0;
    virtual void deallocate(void* ptr) = 0;
    // Tensor ops
    virtual void print(const CUDANet::Tensor &input) = 0;
    virtual void clear(CUDANet::Tensor &input) = 0;
    virtual void sum(const CUDANet::Tensor &input, CUDANet::Tensor &sum) = 0;
    virtual void max(const CUDANet::Tensor &input, CUDANet::Tensor &max) = 0;
    // Layer ops
    virtual void relu(CUDANet::Tensor &tensor) = 0;
    virtual void sigmoid(CUDANet::Tensor &tensor) = 0;
    virtual void softmax(CUDANet::Tensor &tensor, CUDANet::Tensor &temp_max, CUDANet::Tensor &temp_sum) = 0;
 };
 } // namespace CUDANet::Backend
--- a/include/backend/backend.hpp
+++ b/include/backend/backend.hpp
@@ -1,29 +0,0 @@
 #pragma once
 #include <cstddef>
 #include "backend/tensor.hpp"
 namespace CUDANet::Backend
 {   
 class IBackend
 {
 public:
    // Memory management
    virtual void* allocate(size_t bytes) = 0;
    virtual void deallocate(void* ptr) = 0;
    // Tensor ops
    virtual void print(const CUDANet::Backend::Tensor &input) = 0;
    virtual void clear(CUDANet::Backend::Tensor &input) = 0;
    virtual void sum(const CUDANet::Backend::Tensor &input, CUDANet::Backend::Tensor &sum) = 0;
    virtual void max(const CUDANet::Backend::Tensor &input, CUDANet::Backend::Tensor &max) = 0;
    // Layer ops
    virtual void relu(CUDANet::Backend::Tensor &tensor) = 0;
    virtual void sigmoid(CUDANet::Backend::Tensor &tensor) = 0;
    virtual void softmax(CUDANet::Backend::Tensor &tensor, CUDANet::Backend::Tensor &temp_max, CUDANet::Backend::Tensor &temp_sum) = 0;
 };
 } // namespace CUDANet::Backend
--- a/include/backend/cpu.hpp
+++ b/include/backend/cpu.hpp
@@ -0,0 +1,26 @@
 #pragma once
 #include "backend.hpp"
 #include "tensor.hpp"
 namespace CUDANet::Backend {
 class CPU : public Backend {
 public:
    // Memory management
    void* allocate(size_t bytes) override;
    void deallocate(void* ptr) override;
    // Tensor ops
    void print(const CUDANet::Tensor &input) override;
    void clear(CUDANet::Tensor &input) override;
    void sum(const CUDANet::Tensor &input, CUDANet::Tensor &sum) override;
    void max(const CUDANet::Tensor &input, CUDANet::Tensor &max) override;
    // Layer ops
    void relu(CUDANet::Tensor &tensor) override;
    void sigmoid(CUDANet::Tensor &tensor) override;
    void softmax(CUDANet::Tensor &tensor, CUDANet::Tensor &temp_max, CUDANet::Tensor &temp_sum) override;
 };
 }
--- a/include/backend/cuda.cuh
+++ b/include/backend/cuda.cuh
@@ -1,29 +1,26 @@
 #pragma once
-#include "backend/backend.hpp"
+#include "backend.hpp"
-#include "backend/tensor.hpp"
+#include "tensor.hpp"
 namespace CUDANet::Backend {
-class CUDABackend : public IBackend {
+class CUDA : public Backend {
 public:
    // Memory management
    void* allocate(size_t bytes) override;
    void deallocate(void* ptr) override;
    // Tensor ops
-    void print(const CUDANet::Backend::Tensor &input) override;
+    void print(const CUDANet::Tensor &input) override;
-    void clear(CUDANet::Backend::Tensor &input) override;
+    void clear(CUDANet::Tensor &input) override;
-    void sum(const CUDANet::Backend::Tensor &input, CUDANet::Backend::Tensor &sum) override;
+    void sum(const CUDANet::Tensor &input, CUDANet::Tensor &sum) override;
-    void max(const CUDANet::Backend::Tensor &input, CUDANet::Backend::Tensor &max) override;
+    void max(const CUDANet::Tensor &input, CUDANet::Tensor &max) override;
    // Layer ops
-    void relu(CUDANet::Backend::Tensor &tensor) override;
+    void relu(CUDANet::Tensor &tensor) override;
-    void sigmoid(CUDANet::Backend::Tensor &tensor) override;
+    void sigmoid(CUDANet::Tensor &tensor) override;
-    void softmax(CUDANet::Backend::Tensor &tensor, CUDANet::Backend::Tensor &temp_max, CUDANet::Backend::Tensor &temp_sum) override;
+    void softmax(CUDANet::Tensor &tensor, CUDANet::Tensor &temp_max, CUDANet::Tensor &temp_sum) override;
 private:
    static constexpr int BLOCK_SIZE = 256;
 };
 }  // namespace CUDANet::Backend
--- a/include/layer.hpp
+++ b/include/layer.hpp
@@ -0,0 +1,42 @@
 #pragma once
 #include <vector>
 #include "shape.hpp"
 #include "tensor.hpp"
 #define CUDANET_SAME_PADDING(inputSize, kernelSize, stride) \
    ((stride - 1) * inputSize - stride + kernelSize) / 2;
 namespace CUDANet {
 /**
 * @brief Basic Sequential Layer
 *
 */
 class Layer {
  public:
    virtual ~Layer(){};
    virtual CUDANet::Tensor& forward(CUDANet::Tensor &input) = 0;
    virtual CUDANet::Shape input_shape() = 0;
    virtual CUDANet::Shape output_shape() = 0;
    virtual int input_size() = 0;
    virtual int output_size() = 0;
    virtual void set_weights(CUDANet::Tensor &input) = 0;
    virtual CUDANet::Tensor& get_weights() = 0;
    virtual void set_biases(CUDANet::Tensor &input) = 0;
    virtual CUDANet::Tensor& get_biases() = 0;
 };
 }  // namespace CUDANet::Layers
--- a/include/layers/activation.hpp
+++ b/include/layers/activation.hpp
@@ -2,6 +2,7 @@
 #include "backend/tensor.hpp"
 #include "backend/backend.hpp"
 #include "layers/layer.hpp"
 namespace CUDANet::Layers {
@@ -19,7 +20,7 @@ enum ActivationType { SIGMOID, RELU, SOFTMAX, NONE };
 * @brief Utility class that performs activation
 * 
 */
-class Activation {
+class Activation : Layer {
  public:
    Activation() = default;
--- a/include/layers/avg_pooling.hpp
+++ b/include/layers/avg_pooling.hpp
@@ -6,7 +6,7 @@
 namespace CUDANet::Layers {
-class AvgPooling2d : public SequentialLayer, public TwoDLayer {
+class AvgPooling2d : public Layer, public TwoDLayer {
  public:
    AvgPooling2d(
        shape2d        inputSize,
@@ -25,7 +25,7 @@ class AvgPooling2d : public SequentialLayer, public TwoDLayer {
     *
     * @return int output size
     */
-    int getOutputSize();
+    int get_output_size();
    /**
     * @brief Get input size
--- a/include/layers/input.hpp
+++ b/include/layers/input.hpp
@@ -9,7 +9,7 @@ namespace CUDANet::Layers {
 * @brief Input layer, just copies the input to the device
 *
 */
-class Input : public SequentialLayer {
+class Input : public Layer {
  public:
    /**
     * @brief Create a new Input layer
@@ -38,7 +38,7 @@ class Input : public SequentialLayer {
     *
     * @return int output size
     */
-    int getOutputSize();
+    int get_output_size();
    /**
     * @brief Get input size
--- a/include/layers/layer.hpp
+++ b/include/layers/layer.hpp
@@ -1,124 +0,0 @@
 #ifndef CUDANET_I_LAYER_H
 #define CUDANET_I_LAYER_H
 #include <vector>
 #define CUDANET_SAME_PADDING(inputSize, kernelSize, stride) \
    ((stride - 1) * inputSize - stride + kernelSize) / 2;
 typedef std::pair<int, int> shape2d;
 namespace CUDANet::Layers {
 class TwoDLayer {
  public:
    virtual shape2d getOutputDims() = 0;
 };
 /**
 * @brief Basic Sequential Layer
 *
 */
 class SequentialLayer {
  public:
    /**
     * @brief Destroy the Sequential Layer
     *
     */
    virtual ~SequentialLayer(){};
    /**
     * @brief Forward propagation virtual function
     *
     * @param input Device pointer to the input
     * @return float* Device pointer to the output
     */
    virtual float* forward(const float* input) = 0;
    /**
     * @brief Get output size
     *
     * @return int output size
     */
    virtual int getOutputSize() = 0;
    /**
     * @brief Get input size
     * 
     * @return int input size
     */
    virtual int getInputSize() = 0;
 };
 /**
 * @brief Base class for layers with weights and biases
 */
 class WeightedLayer : public SequentialLayer {
  public:
    /**
     * @brief Destroy the ILayer object
     *
     */
    virtual ~WeightedLayer(){};
    /**
     * @brief Virtual function for forward pass
     *
     * @param input (Device) Pointer to the input
     * @return float* Device pointer to the output
     */
    virtual float* forward(const float* input) = 0;
    /**
     * @brief Virtual function for setting weights
     *
     * @param weights Pointer to the weights
     */
    virtual void setWeights(const float* weights) = 0;
    /**
     * @brief Virtual function for getting weights
     *
     */
    virtual std::vector<float> getWeights() = 0;
    /**
     * @brief Virtual function for setting biases
     *
     * @param biases Pointer to the biases
     */
    virtual void setBiases(const float* biases) = 0;
    /**
     * @brief Virtual function for getting biases
     *
     */
    virtual std::vector<float> getBiases() = 0;
  private:
    /**
     * @brief Initialize the weights
     */
    virtual void initializeWeights() = 0;
    /**
     * @brief Initialize the biases
     */
    virtual void initializeBiases() = 0;
 #ifdef USE_CUDA
    /**
     * @brief Copy the weights and biases to the device
     */
    virtual void toCuda() = 0;
 #endif
 };
 }  // namespace CUDANet::Layers
 #endif  // CUDANET_I_LAYERH
--- a/include/layers/max_pooling.hpp
+++ b/include/layers/max_pooling.hpp
@@ -6,7 +6,7 @@
 namespace CUDANet::Layers {
-class MaxPooling2d : public SequentialLayer, public TwoDLayer {
+class MaxPooling2d : public Layer, public TwoDLayer {
  public:
    MaxPooling2d(
        shape2d        inputSize,
@@ -25,7 +25,7 @@ class MaxPooling2d : public SequentialLayer, public TwoDLayer {
     *
     * @return int output size
     */
-    int getOutputSize();
+    int get_output_size();
    /**
     * @brief Get input size
--- a/include/layers/output.hpp
+++ b/include/layers/output.hpp
@@ -5,7 +5,7 @@
 namespace CUDANet::Layers {
-class Output : public SequentialLayer {
+class Output : public Layer {
  public:
    /**
     * @brief Create a new Output layer
@@ -34,7 +34,7 @@ class Output : public SequentialLayer {
     *
     * @return int output size
     */
-    int getOutputSize();
+    int get_output_size();
    /**
     * @brief Get input size
--- a/include/shape.hpp
+++ b/include/shape.hpp
@@ -0,0 +1,9 @@
 #pragma once
 #include <vector>
 namespace CUDANet {
 typedef std::vector<size_t> Shape;
 } // namespace CUDANet
--- a/include/backend/tensor.hpp
+++ b/include/backend/tensor.hpp
@@ -1,9 +1,12 @@
 #pragma once
 #include <cstddef>
 #include "backend/backend.hpp"
 #include <vector>
-namespace CUDANet::Backend
+#include "backend.hpp"
 #include "shape.hpp"
 namespace CUDANet
 {
 enum class DType
@@ -13,14 +16,12 @@ enum class DType
    // INT32,  // Not implemented yet
 };
 typedef std::vector<size_t> Shape;
 class Tensor
 {
 public:
    Tensor() = default;
-    Tensor(Shape shape, DType dtype, IBackend* backend);
+    Tensor(Shape shape, DType dtype, CUDANet::Backend::IBackend* backend);
    ~Tensor();
    size_t size() const;
@@ -39,8 +40,8 @@ private:
    size_t total_elms;
    size_t total_size;
-    IBackend*   backend;
+    CUDANet::Backend::IBackend*   backend;
    void*       d_ptr;
 };
-} // namespace CUDANet::Backend
+} // namespace CUDANet
--- a/src/backends/cuda/cuda_backend.cu
+++ b/src/backends/cuda/cuda_backend.cu
@@ -28,12 +28,12 @@ cudaDeviceProp initializeCUDA() {
 using namespace CUDANet::Backend;
-void* CUDABackend::allocate(size_t bytes) {
+void* CUDA::allocate(size_t bytes) {
    void* d_ptr = nullptr;
    CUDA_CHECK(cudaMalloc(&d_ptr, bytes));
    return d_ptr;
 }
-void CUDABackend::deallocate(void* ptr) {
+void CUDA::deallocate(void* ptr) {
    CUDA_CHECK(cudaFree(ptr));
 }
--- a/src/backends/cuda/layer_ops.cu
+++ b/src/backends/cuda/layer_ops.cu
@@ -5,21 +5,21 @@
 using namespace CUDANet::Backend;
-void CUDABackend::relu(Tensor &tensor) {
+void CUDA::relu(Tensor &tensor) {
    int gridSize = (tensor.numel() + BLOCK_SIZE - 1) / BLOCK_SIZE;
    Kernels::relu<<<gridSize, BLOCK_SIZE>>>(tensor.data<float>(), tensor.data<float>(), tensor.numel());
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());
 }
-void CUDABackend::sigmoid(Tensor &tensor) {
+void CUDA::sigmoid(Tensor &tensor) {
    int gridSize = (tensor.numel() + BLOCK_SIZE - 1) / BLOCK_SIZE;
    Kernels::sigmoid<<<gridSize, BLOCK_SIZE>>>(tensor.data<float>(), tensor.data<float>(), tensor.numel());
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());
 }
-void CUDABackend::softmax(Tensor &tensor, Tensor &temp_max, Tensor &temp_sum) {
+void CUDA::softmax(Tensor &tensor, Tensor &temp_max, Tensor &temp_sum) {
    int gridSize = (tensor.numel() + BLOCK_SIZE - 1) / BLOCK_SIZE;
    // Find max value
--- a/src/backends/cuda/tensor_ops.cu
+++ b/src/backends/cuda/tensor_ops.cu
@@ -7,7 +7,7 @@
 using namespace CUDANet::Backend;
-void CUDABackend::print(const CUDANet::Backend::Tensor &input) {
+void CUDA::print(const CUDANet::Backend::Tensor &input) {
    auto length = input.numel();
    std::vector<float> h_vec(input.numel());
@@ -22,11 +22,11 @@ void CUDABackend::print(const CUDANet::Backend::Tensor &input) {
    std::cout << std::endl;
 }
-void CUDABackend::clear(CUDANet::Backend::Tensor &input) {
+void CUDA::clear(CUDANet::Backend::Tensor &input) {
    CUDA_CHECK(cudaMemset(input.data<float>(), 0, sizeof(float) * input.numel()));
 }
-void CUDABackend::sum(const CUDANet::Backend::Tensor &input, CUDANet::Backend::Tensor &sum) {
+void CUDA::sum(const CUDANet::Backend::Tensor &input, CUDANet::Backend::Tensor &sum) {
    auto length = input.numel();
    const int gridSize = ( + BLOCK_SIZE - 1) / BLOCK_SIZE;
@@ -45,7 +45,7 @@ void CUDABackend::sum(const CUDANet::Backend::Tensor &input, CUDANet::Backend::T
    }
 }
-void CUDABackend::max(const CUDANet::Backend::Tensor &input, CUDANet::Backend::Tensor &max) {
+void CUDA::max(const CUDANet::Backend::Tensor &input, CUDANet::Backend::Tensor &max) {
    auto length = input.numel();
    const int grid_size = (length + BLOCK_SIZE - 1) / BLOCK_SIZE;
--- a/src/layers/avg_pooling.cpp
+++ b/src/layers/avg_pooling.cpp
@@ -54,7 +54,7 @@ float* AvgPooling2d::forward(const float* input) {
 #endif
 }
-int AvgPooling2d::getOutputSize() {
+int AvgPooling2d::get_output_size() {
    return outputSize.first * outputSize.second * nChannels;
 }
--- a/src/layers/input.cpp
+++ b/src/layers/input.cpp
@@ -28,7 +28,7 @@ float* Input::forward(const float* input) {
 #endif
 }
-int Input::getOutputSize() {
+int Input::get_output_size() {
    return inputSize;
 }
--- a/src/layers/max_pooling.cpp
+++ b/src/layers/max_pooling.cpp
@@ -54,7 +54,7 @@ float* MaxPooling2d::forward(const float* input) {
 }
-int MaxPooling2d::getOutputSize() {
+int MaxPooling2d::get_output_size() {
    return outputSize.first * outputSize.second * nChannels;
 }
--- a/src/layers/output.cu
+++ b/src/layers/output.cu
@@ -24,7 +24,7 @@ float* Output::forward(const float* input) {
 #endif
 }
-int Output::getOutputSize() {
+int Output::get_output_size() {
    return inputSize;
 }
--- a/test/cuda/layers/test_avg_pooling.cu
+++ b/test/cuda/layers/test_avg_pooling.cu
@@ -54,7 +54,7 @@ class AvgPoolingLayerTest : public ::testing::Test {
        d_output = avgPoolingLayer->forward(d_input);
-        int outputSize = avgPoolingLayer->getOutputSize();
+        int outputSize = avgPoolingLayer->get_output_size();
        std::vector<float> output(outputSize);
        cudaStatus = cudaMemcpy(
@@ -229,7 +229,7 @@ class AdaptiveAvgPoolingLayerTest : public ::testing::Test {
        d_output = adaptiveAvgPoolingLayer->forward(d_input);
-        int outputSize = adaptiveAvgPoolingLayer->getOutputSize();
+        int outputSize = adaptiveAvgPoolingLayer->get_output_size();
        std::vector<float> output(outputSize);
        cudaStatus = cudaMemcpy(
--- a/test/cuda/layers/test_max_pooling.cu
+++ b/test/cuda/layers/test_max_pooling.cu
@@ -52,7 +52,7 @@ class MaxPoolingLayerTest : public ::testing::Test {
        d_output = maxPoolingLayer->forward(d_input);
-        int outputSize = maxPoolingLayer->getOutputSize();
+        int outputSize = maxPoolingLayer->get_output_size();
        std::vector<float> output(outputSize);
        cudaStatus = cudaMemcpy(