Add toplevel CUDANet namespace

2025-12-22 22:34:22 +00:00 · 2024-03-17 16:08:53 +01:00
parent dc86cddeb7
commit 0c22fac64e
19 changed files with 183 additions and 149 deletions
--- a/include/kernels/activations.cuh
+++ b/include/kernels/activations.cuh
@@ -1,7 +1,7 @@
 #ifndef CUDANET_ACTIVATIONS_H
 #define CUDANET_ACTIVATIONS_H

-namespace Kernels {
+namespace CUDANet::Kernels {

 /**
 * @brief Sigmoid activation function kernel
@@ -23,6 +23,6 @@ sigmoid(const float* __restrict__ src, float* __restrict__ dst, int len);
 __global__ void
 relu(const float* __restrict__ src, float* __restrict__ dst, int len);

-}  // namespace Kernels
+}  // namespace CUDANet::Kernels

 #endif  // CUDANET_ACTIVATIONS_H
--- a/include/kernels/convolution.cuh
+++ b/include/kernels/convolution.cuh
@@ -1,11 +1,11 @@
 #ifndef CUDANET_CONVOLUTION_H
 #define CUDANET_CONVOLUTION_H

-namespace Kernels {
+namespace CUDANet::Kernels {

 /**
 * @brief Kernel that pads the input matrix with zeros
- * 
+ *
 * @param d_input Device pointer to the input matrix (as vector)
 * @param d_padded Device pointer to the padded matrix (as vector)
 * @param w Width of the input matrix
@@ -14,17 +14,17 @@ namespace Kernels {
 * @param p Padding size
 */
 __global__ void padding(
-    const float* d_input,
-    float*       d_padded,
-    int          w,
-    int          h,
-    int          n,
-    int          p
+    const float* __restrict__ d_input,
+    float* __restrict__ d_padded,
+    const unsigned int w,
+    const unsigned int h,
+    const unsigned int n,
+    const unsigned int p
 );

 /**
 * @brief Convolution kernel
- * 
+ *
 * @param d_input Device pointer to the input matrix
 * @param d_kernel Device pointer to the convolution kernel
 * @param d_output Device pointer to the output matrix
@@ -36,17 +36,17 @@ __global__ void padding(
 * @param outputSize Width and height of the output matrix
 */
 __global__ void convolution(
-    const float* d_input,
-    const float* d_kernel,
-    float*       d_output,
-    int          inputSize,
-    int          nChannels,
-    int          kernelSize,
-    int          stride,
-    int          nFilters,
-    int          outputSize
+    const float* __restrict__ d_input,
+    const float* __restrict__ d_kernel,
+    float* __restrict__ d_output,
+    const unsigned int inputSize,
+    const unsigned int nChannels,
+    const unsigned int kernelSize,
+    const unsigned int stride,
+    const unsigned int nFilters,
+    const unsigned int outputSize
 );

-}  // namespace Kernels
+}  // namespace CUDANet::Kernels

 #endif  // CUDANET_CONVOLUTION_H
--- a/include/kernels/matmul.cuh
+++ b/include/kernels/matmul.cuh
@@ -1,11 +1,11 @@
 #ifndef CUDANET_MATMUL_H
 #define CUDANET_MATMUL_H

-namespace Kernels {
+namespace CUDANet::Kernels {

 /**
 * @brief Matrix vector multiplication kernel
- * 
+ *
 * @param d_matrix Device pointer to matrix
 * @param d_vector Device pointer to vector
 * @param d_output Device pointer to output vector
@@ -13,28 +13,41 @@ namespace Kernels {
 * @param h Height of the matrix
 */
 __global__ void mat_vec_mul(
-    const float* d_matrix,
-    const float* d_vector,
-    float*       d_output,
-    int          w,
-    int          h
+    const float* __restrict__ d_matrix,
+    const float* __restrict__ d_vector,
+    float* __restrict__ d_output,
+    const unsigned int w,
+    const unsigned int h
 );

 /**
 * @brief Vector vector addition kernel
- * 
+ *
 * @param d_vector1 Device pointer to first vector
 * @param d_vector2 Device pointer to second vector
 * @param d_output Device pointer to output vector
 * @param w Length of the vectors
 */
 __global__ void vec_vec_add(
-    const float* d_vector1,
-    const float* d_vector2,
-    float*       d_output,
-    int          w
+    const float* __restrict__ d_vector1,
+    const float* __restrict__ d_vector2,
+    float* __restrict__ d_output,
+    const unsigned int w
 );

-}  // namespace Kernels
+/**
+ * @brief Vector sum reduction kernel
+ *
+ * @param d_vector Device pointer to vector
+ * @param d_output Device pointer to output vector
+ * @param w Length of the vector
+ */
+__global__ void reduce_sum(
+    const float* __restrict__ d_vector,
+    float* __restrict__ d_output,
+    const unsigned int w
+);
+
+}  // namespace CUDANet::Kernels

 #endif  // CUDANET_MATMUL_H