Add toplevel CUDANet namespace

2024-03-17 16:08:53 +01:00
parent dc86cddeb7
commit 0c22fac64e
19 changed files with 183 additions and 149 deletions

View File

@@ -1,7 +1,7 @@
#ifndef CUDANET_ACTIVATIONS_H
#define CUDANET_ACTIVATIONS_H
namespace Kernels {
namespace CUDANet::Kernels {
/**
* @brief Sigmoid activation function kernel
@@ -23,6 +23,6 @@ sigmoid(const float* __restrict__ src, float* __restrict__ dst, int len);
__global__ void
relu(const float* __restrict__ src, float* __restrict__ dst, int len);
} // namespace Kernels
} // namespace CUDANet::Kernels
#endif // CUDANET_ACTIVATIONS_H
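At call sites, this change just adds a namespace qualifier in front of each kernel launch. A minimal, hypothetical sketch (the wrapper name, block size, and buffer setup are illustrative and not taken from this repository):

// Hypothetical call site; assumes this header is included and that d_input and
// d_output are device buffers of length len, allocated and filled elsewhere.
void reluForwardSketch(const float* d_input, float* d_output, int len) {
    dim3 block(256);
    dim3 grid((len + block.x - 1) / block.x);

    // Before this commit:  Kernels::relu<<<grid, block>>>(d_input, d_output, len);
    // After this commit:
    CUDANet::Kernels::relu<<<grid, block>>>(d_input, d_output, len);
}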

View File

@@ -1,11 +1,11 @@
#ifndef CUDANET_CONVOLUTION_H
#define CUDANET_CONVOLUTION_H
namespace Kernels {
namespace CUDANet::Kernels {
/**
* @brief Kernel that pads the input matrix with zeros
*
*
* @param d_input Device pointer to the input matrix (as vector)
* @param d_padded Device pointer to the padded matrix (as vector)
* @param w Width of the input matrix
@@ -14,17 +14,17 @@ namespace Kernels {
* @param p Padding size
*/
__global__ void padding(
const float* d_input,
float* d_padded,
int w,
int h,
int n,
int p
const float* __restrict__ d_input,
float* __restrict__ d_padded,
const unsigned int w,
const unsigned int h,
const unsigned int n,
const unsigned int p
);
/**
* @brief Convolution kernel
*
*
* @param d_input Device pointer to the input matrix
* @param d_kernel Device pointer to the convolution kernel
* @param d_output Device pointer to the output matrix
@@ -36,17 +36,17 @@ __global__ void padding(
* @param outputSize Width and height of the output matrix
*/
__global__ void convolution(
const float* d_input,
const float* d_kernel,
float* d_output,
int inputSize,
int nChannels,
int kernelSize,
int stride,
int nFilters,
int outputSize
const float* __restrict__ d_input,
const float* __restrict__ d_kernel,
float* __restrict__ d_output,
const unsigned int inputSize,
const unsigned int nChannels,
const unsigned int kernelSize,
const unsigned int stride,
const unsigned int nFilters,
const unsigned int outputSize
);
} // namespace Kernels
} // namespace CUDANet::Kernels
#endif // CUDANET_CONVOLUTION_H
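The two kernels above pair naturally: pad first, then convolve the padded buffer. A rough host-side sketch under that assumption (the wrapper, the grid/block choices, the size arithmetic, and reading n as the channel count are illustrative guesses, not code from this commit):

// Hypothetical host-side wrapper; assumes this header is included, a square
// w == h input, and separately allocated d_padded / d_output device buffers.
void convForwardSketch(
    const float* d_input, const float* d_kernel, float* d_padded, float* d_output,
    unsigned int inputSize, unsigned int nChannels, unsigned int kernelSize,
    unsigned int stride, unsigned int nFilters, unsigned int p
) {
    // Usual convolution size arithmetic (an assumption, not taken from the kernels).
    unsigned int paddedSize = inputSize + 2 * p;
    unsigned int outputSize = (paddedSize - kernelSize) / stride + 1;

    // One thread per padded element / per output element; block size is a guess.
    dim3 block(256);
    dim3 padGrid((paddedSize * paddedSize * nChannels + block.x - 1) / block.x);
    dim3 convGrid((outputSize * outputSize * nFilters + block.x - 1) / block.x);

    CUDANet::Kernels::padding<<<padGrid, block>>>(
        d_input, d_padded, inputSize, inputSize, nChannels, p
    );
    CUDANet::Kernels::convolution<<<convGrid, block>>>(
        d_padded, d_kernel, d_output, paddedSize, nChannels,
        kernelSize, stride, nFilters, outputSize
    );
}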

View File

@@ -1,11 +1,11 @@
#ifndef CUDANET_MATMUL_H
#define CUDANET_MATMUL_H
namespace Kernels {
namespace CUDANet::Kernels {
/**
* @brief Matrix vector multiplication kernel
*
*
* @param d_matrix Device pointer to matrix
* @param d_vector Device pointer to vector
* @param d_output Device pointer to output vector
@@ -13,28 +13,41 @@ namespace Kernels {
* @param h Height of the matrix
*/
__global__ void mat_vec_mul(
const float* d_matrix,
const float* d_vector,
float* d_output,
int w,
int h
const float* __restrict__ d_matrix,
const float* __restrict__ d_vector,
float* __restrict__ d_output,
const unsigned int w,
const unsigned int h
);
/**
* @brief Vector vector addition kernel
*
*
* @param d_vector1 Device pointer to first vector
* @param d_vector2 Device pointer to second vector
* @param d_output Device pointer to output vector
* @param w Length of the vectors
*/
__global__ void vec_vec_add(
const float* d_vector1,
const float* d_vector2,
float* d_output,
int w
const float* __restrict__ d_vector1,
const float* __restrict__ d_vector2,
float* __restrict__ d_output,
const unsigned int w
);
} // namespace Kernels
/**
 * @brief Sum reduction kernel
*
* @param d_vector Device pointer to vector
* @param d_output Device pointer to output vector
* @param w Length of the vector
*/
__global__ void reduce_sum(
const float* __restrict__ d_vector,
float* __restrict__ d_output,
const unsigned int w
);
} // namespace CUDANet::Kernels
#endif // CUDANET_MATMUL_H
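The reduce_sum declaration is new in this commit and its definition is not part of this header diff. For orientation, a shared-memory block reduction is one common shape for such a kernel; the sketch below illustrates that general pattern only and is not the implementation added here (BLOCK_SIZE and the one-partial-sum-per-block output layout are assumptions):

#define BLOCK_SIZE 256

// Illustrative block-wise sum reduction: each block writes one partial sum to
// d_output, so a second pass (or a host-side sum) is needed for the final total.
// Assumes blockDim.x == BLOCK_SIZE and that BLOCK_SIZE is a power of two.
__global__ void reduce_sum_sketch(
    const float* __restrict__ d_vector,
    float* __restrict__ d_output,
    const unsigned int w
) {
    __shared__ float partial[BLOCK_SIZE];

    unsigned int tid = threadIdx.x;
    unsigned int i   = blockIdx.x * blockDim.x + threadIdx.x;

    // Load one element per thread, padding out-of-range threads with zero.
    partial[tid] = (i < w) ? d_vector[i] : 0.0f;
    __syncthreads();

    // Tree reduction within the block.
    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (tid < s) {
            partial[tid] += partial[tid + s];
        }
        __syncthreads();
    }

    if (tid == 0) {
        d_output[blockIdx.x] = partial[0];
    }
}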

View File

@@ -8,7 +8,7 @@
#include "convolution.cuh"
#include "ilayer.cuh"
namespace Layers {
namespace CUDANet::Layers {
/**
* @brief 2D convolutional layer
@@ -125,6 +125,6 @@ class Conv2d : public ILayer {
void toCuda();
};
} // namespace Layers
} // namespace CUDANet::Layers
#endif // CUDANET_CONV_LAYER_H

View File

@@ -7,7 +7,7 @@
#include "ilayer.cuh"
namespace Layers {
namespace CUDANet::Layers {
/**
* @brief Dense (fully connected) layer
@@ -53,8 +53,8 @@ class Dense : public ILayer {
void setBiases(const float* biases);
private:
int inputSize;
int outputSize;
unsigned int inputSize;
unsigned int outputSize;
float* d_output;
@@ -67,8 +67,8 @@ class Dense : public ILayer {
Layers::Activation activation;
// Precompute kernel launch parameters
int forwardGridSize;
int biasGridSize;
unsigned int forwardGridSize;
unsigned int biasGridSize;
/**
* @brief Initialize the weights to zeros
@@ -89,6 +89,6 @@ class Dense : public ILayer {
void toCuda();
};
} // namespace Layers
} // namespace CUDANet::Layers
#endif // CUDANET_DENSE_LAYER_H
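The launch parameters that Dense precomputes (forwardGridSize, biasGridSize) typically come from a plain ceiling division over the relevant vector length. The snippet below is a generic illustration of that pattern, not the actual constructor code (the block size and the use of outputSize for both grids are assumptions):

// Hypothetical grid-size precomputation, e.g. in the Dense constructor.
constexpr unsigned int BLOCK_SIZE = 256;
forwardGridSize = (outputSize + BLOCK_SIZE - 1) / BLOCK_SIZE;  // mat_vec_mul: one thread per output element
biasGridSize    = (outputSize + BLOCK_SIZE - 1) / BLOCK_SIZE;  // vec_vec_add over the bias vector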

View File

@@ -4,7 +4,7 @@
#include <vector>
namespace Layers {
namespace CUDANet::Layers {
/**
* @brief Activation functions
@@ -88,6 +88,6 @@ class ILayer {
Layers::Activation activation;
};
} // namespace Layers
} // namespace CUDANet::Layers
#endif // CUDANET_I_LAYERH

View File

@@ -3,7 +3,7 @@
#include <ilayer.cuh>
namespace Layers {
namespace CUDANet::Layers {
/**
* @brief Input layer, just copies the input to the device
@@ -45,6 +45,6 @@ class Input : public ILayer {
float* d_output;
};
} // namespace Layers
} // namespace CUDANet::Layers
#endif // CUDANET_INPUT_LAYER_H