Add toplevel CUDANet namespace

This commit is contained in:
2024-03-17 16:08:53 +01:00
parent dc86cddeb7
commit 0c22fac64e
19 changed files with 183 additions and 149 deletions

View File

@@ -2,7 +2,7 @@
#include "activations.cuh"
__global__ void Kernels::sigmoid(
__global__ void CUDANet::Kernels::sigmoid(
const float* __restrict__ src,
float* __restrict__ dst,
int len
@@ -15,8 +15,11 @@ __global__ void Kernels::sigmoid(
}
}
__global__ void
Kernels::relu(const float* __restrict__ src, float* __restrict__ dst, int len) {
__global__ void CUDANet::Kernels::relu(
const float* __restrict__ src,
float* __restrict__ dst,
int len
) {
int stride = gridDim.x * blockDim.x;
int tid = blockDim.x * blockIdx.x + threadIdx.x;

View File

@@ -1,6 +1,7 @@
#include "convolution.cuh"
#include <iostream>
#include "convolution.cuh"
/*
Pads matrix width x height x n_channels to width + 2 * padding x height + 2 *
padding x n_channels Matrix is represented as a pointer to a vector
@@ -47,13 +48,13 @@ pre-allocated)
n: Number of channels in input matrix
p: Padding
*/
__global__ void Kernels::padding(
const float* d_input,
float* d_padded,
int w,
int h,
int n,
int p
__global__ void CUDANet::Kernels::padding(
const float* __restrict__ d_input,
float* __restrict__ d_padded,
const unsigned int w,
const unsigned int h,
const unsigned int n,
const unsigned int p
) {
int tid = blockDim.x * blockIdx.x + threadIdx.x;
@@ -78,16 +79,16 @@ __global__ void Kernels::padding(
}
}
__global__ void Kernels::convolution(
const float* d_input,
const float* d_kernel,
float* d_output,
int inputSize,
int nChannels,
int kernelSize,
int stride,
int nFilters,
int outputSize
__global__ void CUDANet::Kernels::convolution(
const float* __restrict__ d_input,
const float* __restrict__ d_kernel,
float* __restrict__ d_output,
const unsigned int inputSize,
const unsigned int nChannels,
const unsigned int kernelSize,
const unsigned int stride,
const unsigned int nFilters,
const unsigned int outputSize
) {
int tid = blockDim.x * blockIdx.x + threadIdx.x;

View File

@@ -1,14 +1,12 @@
#include "cuda_helper.cuh"
#include "matmul.cuh"
#define SHARED_SIZE 128 * 4
__global__ void Kernels::mat_vec_mul(
__global__ void CUDANet::Kernels::mat_vec_mul(
const float* __restrict__ d_matrix,
const float* __restrict__ d_vector,
float* __restrict__ d_output,
int w,
int h
const unsigned int w,
const unsigned int h
) {
int tid = blockDim.x * blockIdx.x + threadIdx.x;
@@ -16,9 +14,8 @@ __global__ void Kernels::mat_vec_mul(
float temp = 0.0f;
#pragma unroll
for (unsigned int i = 0; i < (w + BLOCK_SIZE - 1) / BLOCK_SIZE; i++)
{
#pragma unroll
for (unsigned int i = 0; i < (w + BLOCK_SIZE - 1) / BLOCK_SIZE; i++) {
if (i * BLOCK_SIZE + threadIdx.x < w) {
shared[threadIdx.x] = d_vector[i * BLOCK_SIZE + threadIdx.x];
} else {
@@ -27,22 +24,22 @@ __global__ void Kernels::mat_vec_mul(
__syncthreads();
for (unsigned int j = 0; j < BLOCK_SIZE; j++)
{
#pragma unroll
for (unsigned int j = 0; j < BLOCK_SIZE; j++) {
temp += d_matrix[tid * w + i * BLOCK_SIZE + j] * shared[j];
}
__syncthreads();
}
d_output[tid] = temp;
}
__global__ void Kernels::vec_vec_add(
const float* d_vector1,
const float* d_vector2,
float* d_output,
int w
__global__ void CUDANet::Kernels::vec_vec_add(
const float* __restrict__ d_vector1,
const float* __restrict__ d_vector2,
float* __restrict__ d_output,
const unsigned int w
) {
int tid = blockDim.x * blockIdx.x + threadIdx.x;
if (tid >= w) {
@@ -50,3 +47,15 @@ __global__ void Kernels::vec_vec_add(
}
d_output[tid] = d_vector1[tid] + d_vector2[tid];
}
/*
Partial sum reduction over a vector.

Each block reduces its BLOCK_SIZE-wide slice of d_vector in shared memory
and writes one partial sum to d_output[blockIdx.x]; the caller is expected
to reduce the per-block partials (e.g. by relaunching this kernel over
d_output) — TODO confirm against the call site.

Preconditions (assumed, hedged): launched with blockDim.x == BLOCK_SIZE and
BLOCK_SIZE a power of two, as the halving loop below requires.

d_vector: Input vector (device)
d_output: Per-block partial sums (device, length >= gridDim.x)
w:        Number of elements in d_vector
*/
__global__ void CUDANet::Kernels::reduce_sum(
    const float* __restrict__ d_vector,
    float* __restrict__ d_output,
    const unsigned int w
) {
    unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x;

    __shared__ float shared[BLOCK_SIZE];

    // Guard the tail: the grid rarely divides w evenly, and the original
    // unconditionally read d_vector[tid] (out-of-bounds when tid >= w).
    // 0.0f is the identity for summation, so padding is harmless.
    shared[threadIdx.x] = (tid < w) ? d_vector[tid] : 0.0f;
    __syncthreads();

    // Tree reduction in shared memory (the original stopped after the
    // load + barrier and never produced a result). The barrier sits
    // outside the divergent branch so every thread reaches it.
    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (threadIdx.x < s) {
            shared[threadIdx.x] += shared[threadIdx.x + s];
        }
        __syncthreads();
    }

    // Thread 0 publishes this block's partial sum.
    if (threadIdx.x == 0) {
        d_output[blockIdx.x] = shared[0];
    }
}

View File

@@ -7,6 +7,8 @@
#include "cuda_helper.cuh"
#include "matmul.cuh"
using namespace CUDANet;
Layers::Conv2d::Conv2d(
int inputSize,
int inputChannels,

View File

@@ -10,6 +10,8 @@
#include "dense.cuh"
#include "matmul.cuh"
using namespace CUDANet;
Layers::Dense::Dense(
int inputSize,
int outputSize,

View File

@@ -1,6 +1,8 @@
#include "cuda_helper.cuh"
#include "input.cuh"
using namespace CUDANet;
Layers::Input::Input(int inputSize) : inputSize(inputSize) {
d_output = nullptr;
CUDA_CHECK(cudaMalloc((void**)&d_output, sizeof(float) * inputSize));