Add toplevel CUDANet namespace

2025-11-06 09:44:28 +00:00 · 2024-03-17 16:08:53 +01:00
parent dc86cddeb7
commit 0c22fac64e
19 changed files with 183 additions and 149 deletions
--- a/src/kernels/convolution.cu
+++ b/src/kernels/convolution.cu
@@ -1,6 +1,7 @@
-#include "convolution.cuh"
 #include <iostream>

+#include "convolution.cuh"
+
 /*
 Pads matrix width x height x n_channels to width + 2 * padding x height + 2 *
 padding x n_channels. Matrix is represented as a pointer to a vector
@@ -47,13 +48,13 @@ pre-allocated)
  n: Number of channels in input matrix
  p: Padding
 */
-__global__ void Kernels::padding(
-    const float* d_input,
-    float*       d_padded,
-    int          w,
-    int          h,
-    int          n,
-    int          p
+__global__ void CUDANet::Kernels::padding(
+    const float* __restrict__ d_input,
+    float* __restrict__ d_padded,
+    const unsigned int w,
+    const unsigned int h,
+    const unsigned int n,
+    const unsigned int p
 ) {
    int tid = blockDim.x * blockIdx.x + threadIdx.x;

@@ -78,16 +79,16 @@ __global__ void Kernels::padding(
    }
 }

-__global__ void Kernels::convolution(
-    const float* d_input,
-    const float* d_kernel,
-    float*       d_output,
-    int          inputSize,
-    int          nChannels,
-    int          kernelSize,
-    int          stride,
-    int          nFilters,
-    int          outputSize
+__global__ void CUDANet::Kernels::convolution(
+    const float* __restrict__ d_input,
+    const float* __restrict__ d_kernel,
+    float* __restrict__ d_output,
+    const unsigned int inputSize,
+    const unsigned int nChannels,
+    const unsigned int kernelSize,
+    const unsigned int stride,
+    const unsigned int nFilters,
+    const unsigned int outputSize
 ) {
    int tid = blockDim.x * blockIdx.x + threadIdx.x;