Restructure CUDA backend

2024-09-05 22:23:47 +02:00
parent 65727dfee8
commit f8220f0ec1
19 changed files with 69 additions and 16 deletions


@@ -0,0 +1,30 @@
#include "activation_functions.cuh"
#include "cuda_helper.cuh"
using namespace CUDANet;
__global__ void Kernels::sigmoid(
const float* __restrict__ src,
float* __restrict__ dst,
const unsigned int len
) {
int stride = gridDim.x * blockDim.x;
int tid = blockDim.x * blockIdx.x + threadIdx.x;
for (int i = tid; i < len; i += stride) {
dst[i] = 1.0 / (1.0 + exp(-src[i]));
}
}
__global__ void Kernels::relu(
const float* __restrict__ src,
float* __restrict__ dst,
const unsigned int len
) {
int stride = gridDim.x * blockDim.x;
int tid = blockDim.x * blockIdx.x + threadIdx.x;
for (int i = tid; i < len; i += stride) {
dst[i] = src[i] < 0.0 ? 0.0 : src[i];
}
}
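
Not part of this commit: a minimal host-side launch sketch for the grid-stride activation kernels above, assuming the BLOCK_SIZE constant from cuda_helper.cuh and pre-allocated device buffers; applySigmoid, d_in, d_out, and n are hypothetical names.

void applySigmoid(const float* d_in, float* d_out, unsigned int n) {
    // One thread per element, rounded up to whole blocks; the grid-stride
    // loop inside the kernel also tolerates smaller grids.
    const unsigned int grid = (n + BLOCK_SIZE - 1) / BLOCK_SIZE;
    CUDANet::Kernels::sigmoid<<<grid, BLOCK_SIZE>>>(d_in, d_out, n);
    cudaDeviceSynchronize();
}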


@@ -0,0 +1,60 @@
#include <iostream>

#include "convolution.cuh"

using namespace CUDANet;

__global__ void Kernels::convolution(
    const float* __restrict__ d_input,
    const float* __restrict__ d_kernel,
    const float* __restrict__ d_bias,
    float* __restrict__ d_output,
    const shape2d inputSize,
    const int nChannels,
    const shape2d paddingSize,
    const shape2d kernelSize,
    const shape2d stride,
    const int nFilters,
    const shape2d outputSize
) {
    int j = blockDim.x * blockIdx.x + threadIdx.x;
    int i = blockDim.y * blockIdx.y + threadIdx.y;
    int f = blockDim.z * blockIdx.z + threadIdx.z;

    if (i >= outputSize.first || j >= outputSize.second || f >= nFilters) {
        return;
    }

    float sum = 0.0f;

    // Iterate over kernel and input matrix
    for (int c = 0; c < nChannels; c++) {
        for (int k = 0; k < kernelSize.first; k++) {
            for (int l = 0; l < kernelSize.second; l++) {
                // Skip if i, j is in the padding region
                if (i * stride.first + k < paddingSize.first ||
                    i * stride.first + k >=
                        (inputSize.first + paddingSize.first) ||
                    j * stride.second + l < paddingSize.second ||
                    j * stride.second + l >=
                        (inputSize.second + paddingSize.second)) {
                    continue;
                }

                int kernelIndex =
                    f * kernelSize.first * kernelSize.second * nChannels +
                    c * kernelSize.first * kernelSize.second +
                    k * kernelSize.second + l;
                int inputIndex = c * inputSize.first * inputSize.second +
                                 (i * stride.first + k - paddingSize.first) *
                                     inputSize.second +
                                 (j * stride.second + l - paddingSize.second);

                sum += d_kernel[kernelIndex] * d_input[inputIndex];
            }
        }
    }

    d_output[f * outputSize.first * outputSize.second + i * outputSize.second + j] =
        sum + d_bias[f];
}
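
Not part of this commit: a hedged sketch of the 3D launch this kernel implies, with one thread per (row, column, filter) output element; launchConvolution and TILE_DIM are hypothetical names, and shape2d is taken from the library's headers.

void launchConvolution(
    const float* d_input, const float* d_kernel, const float* d_bias,
    float* d_output, shape2d inputSize, int nChannels, shape2d paddingSize,
    shape2d kernelSize, shape2d stride, int nFilters, shape2d outputSize
) {
    const unsigned int TILE_DIM = 8;  // assumed block edge: 8x8x1 = 64 threads
    dim3 block(TILE_DIM, TILE_DIM, 1);
    dim3 grid(
        (outputSize.second + TILE_DIM - 1) / TILE_DIM,  // output columns -> x
        (outputSize.first + TILE_DIM - 1) / TILE_DIM,   // output rows    -> y
        nFilters                                        // filters        -> z
    );
    CUDANet::Kernels::convolution<<<grid, block>>>(
        d_input, d_kernel, d_bias, d_output, inputSize, nChannels,
        paddingSize, kernelSize, stride, nFilters, outputSize
    );
}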


@@ -0,0 +1,211 @@
#include "cuda_helper.cuh"
#include "matmul.cuh"
using namespace CUDANet;
__global__ void Kernels::mat_vec_mul(
const float* __restrict__ d_matrix,
const float* __restrict__ d_vector,
float* __restrict__ d_output,
const unsigned int w,
const unsigned int h
) {
int tid = blockDim.x * blockIdx.x + threadIdx.x;
if (tid < h) {
float temp = 0.0f;
for (unsigned int j = 0; j < w; j++) {
temp += d_matrix[tid * w + j] * d_vector[j];
}
d_output[tid] = temp;
}
}
__global__ void Kernels::vec_vec_add(
const float* __restrict__ d_vector1,
const float* __restrict__ d_vector2,
float* __restrict__ d_output,
const unsigned int w
) {
int tid = blockDim.x * blockIdx.x + threadIdx.x;
if (tid >= w) {
return;
}
d_output[tid] = d_vector1[tid] + d_vector2[tid];
}
__global__ void Kernels::vec_vec_sub(
const float* __restrict__ d_vector1,
const float* __restrict__ d_vector2,
float* __restrict__ d_output,
const unsigned int w
) {
int tid = blockDim.x * blockIdx.x + threadIdx.x;
if (tid >= w) {
return;
}
d_output[tid] = d_vector1[tid] - d_vector2[tid];
}
__global__ void Kernels::vec_vec_mul(
const float* __restrict__ d_vector1,
const float* __restrict__ d_vector2,
float* __restrict__ d_output,
const unsigned int w
) {
int tid = blockDim.x * blockIdx.x + threadIdx.x;
if (tid >= w) {
return;
}
d_output[tid] = d_vector1[tid] * d_vector2[tid];
}
__global__ void Kernels::vec_scalar_sub(
const float* __restrict__ d_src,
float* __restrict__ d_out,
const float* __restrict__ d_scalar,
const unsigned int len
) {
int tid = blockDim.x * blockIdx.x + threadIdx.x;
if (tid >= len) {
return;
}
d_out[tid] = d_src[tid] - *d_scalar;
}
__global__ void Kernels::vec_scalar_add(
const float* __restrict__ d_src,
float* __restrict__ d_out,
const float* __restrict__ d_scalar,
const unsigned int len
) {
int tid = blockDim.x * blockIdx.x + threadIdx.x;
if (tid >= len) {
return;
}
d_out[tid] = d_src[tid] + *d_scalar;
}
__global__ void Kernels::vec_scalar_div(
const float* __restrict__ d_src,
float* __restrict__ d_out,
const float* __restrict__ d_scalar,
const unsigned int len
) {
int tid = blockDim.x * blockIdx.x + threadIdx.x;
if (tid >= len) {
return;
}
d_out[tid] = d_src[tid] / *d_scalar;
}
__global__ void Kernels::vec_scalar_mul(
const float* __restrict__ d_src,
float* __restrict__ d_out,
const float* __restrict__ d_scalar,
const unsigned int len
) {
int tid = blockDim.x * blockIdx.x + threadIdx.x;
if (tid >= len) {
return;
}
d_out[tid] = d_src[tid] * *d_scalar;
}
__global__ void Kernels::vec_exp(
const float* __restrict__ src,
float* __restrict__ dst,
const unsigned int len
) {
int stride = gridDim.x * blockDim.x;
int tid = blockDim.x * blockIdx.x + threadIdx.x;
for (int i = tid; i < len; i += stride) {
dst[i] = expf(src[i]);
}
}
__global__ void Kernels::vec_sqrt(
const float* __restrict__ src,
float* __restrict__ dst,
const unsigned int len
) {
int stride = gridDim.x * blockDim.x;
int tid = blockDim.x * blockIdx.x + threadIdx.x;
for (int i = tid; i < len; i += stride) {
dst[i] = sqrtf(src[i]);
}
}
__global__ void Kernels::vec_scale(
const float* __restrict__ src,
float* __restrict__ dst,
const float* __restrict__ scale,
const float* epsilon,
const unsigned int len
) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < len) {
float inv_std = rsqrtf(*scale + *epsilon);
dst[idx] = src[idx] * inv_std;
}
}
__global__ void Kernels::max_reduce(
const float* __restrict__ d_vector,
float* __restrict__ d_output,
const unsigned int len
) {
__shared__ float shared_max[BLOCK_SIZE];
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < len) {
shared_max[threadIdx.x] = d_vector[i];
} else {
shared_max[threadIdx.x] = -INFINITY;
}
__syncthreads();
for (int s = blockDim.x / 2; s > 0; s >>= 1) {
if (threadIdx.x < s) {
shared_max[threadIdx.x] = fmaxf(shared_max[threadIdx.x], shared_max[threadIdx.x + s]);
}
__syncthreads();
}
if (threadIdx.x == 0) {
d_output[blockIdx.x] = shared_max[0];
}
}
__global__ void Kernels::sum_reduce(
const float* __restrict__ d_vector,
float* __restrict__ d_output,
const unsigned int len
) {
__shared__ float partial_sum[BLOCK_SIZE];
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < len) {
partial_sum[threadIdx.x] = d_vector[i];
} else {
partial_sum[threadIdx.x] = 0.0f;
}
__syncthreads();
for (int s = blockDim.x / 2; s > 0; s >>= 1) {
if (threadIdx.x < s) {
partial_sum[threadIdx.x] += partial_sum[threadIdx.x + s];
}
__syncthreads();
}
if (threadIdx.x == 0) {
d_output[blockIdx.x] = partial_sum[0];
}
}
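
Not part of this commit: the reduction kernels above leave one partial result per block, so a full reduction needs more than one launch. A minimal driver sketch, assuming len <= BLOCK_SIZE * BLOCK_SIZE so two passes suffice; reduceMax, d_partials, and d_result are hypothetical names.

void reduceMax(
    const float* d_in, float* d_partials, float* d_result, unsigned int len
) {
    // First pass: each block writes one partial maximum to d_partials.
    unsigned int grid = (len + BLOCK_SIZE - 1) / BLOCK_SIZE;
    CUDANet::Kernels::max_reduce<<<grid, BLOCK_SIZE>>>(d_in, d_partials, len);
    // Second pass: a single block folds the per-block partials into d_result[0].
    CUDANet::Kernels::max_reduce<<<1, BLOCK_SIZE>>>(d_partials, d_result, grid);
}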


@@ -0,0 +1,85 @@
#include "cuda_helper.cuh"
#include "layer.cuh"
#include "pooling.cuh"
using namespace CUDANet;
__global__ void Kernels::max_pooling(
const float* __restrict__ d_input,
float* __restrict__ d_output,
const shape2d inputSize,
const shape2d outputSize,
const int nChannels,
const shape2d poolingSize,
const shape2d stride,
const shape2d padding
) {
int j = blockDim.x * blockIdx.x + threadIdx.x;
int i = blockDim.y * blockIdx.y + threadIdx.y;
int c = blockDim.z * blockIdx.z + threadIdx.z;
if (i >= outputSize.first || j >= outputSize.second || c >= nChannels) {
return;
}
float max = 0.0f;
for (int k = 0; k < poolingSize.first; k++) {
for (int l = 0; l < poolingSize.second; l++) {
int inputRow = i * stride.first + k - padding.first;
int inputCol = j * stride.second + l - padding.second;
if (inputRow >= 0 && inputRow < inputSize.first && inputCol >= 0 &&
inputCol < inputSize.second) {
int inputIndex = c * inputSize.first * inputSize.second +
inputRow * inputSize.second + inputCol;
if (d_input[inputIndex] > max) {
max = d_input[inputIndex];
}
}
}
}
d_output
[c * outputSize.first * outputSize.second + i * outputSize.second + j] =
max;
}
__global__ void Kernels::avg_pooling(
const float* __restrict__ d_input,
float* __restrict__ d_output,
const shape2d inputSize,
const shape2d outputSize,
const int nChannels,
const shape2d poolingSize,
const shape2d stride,
const shape2d padding
) {
int j = blockDim.x * blockIdx.x + threadIdx.x;
int i = blockDim.y * blockIdx.y + threadIdx.y;
int c = blockDim.z * blockIdx.z + threadIdx.z;
if (i >= outputSize.first || j >= outputSize.second || c >= nChannels) {
return;
}
float sum = 0.0f;
for (int k = 0; k < poolingSize.first; k++) {
for (int l = 0; l < poolingSize.second; l++) {
int inputRow = i * stride.first + k - padding.first;
int inputCol = j * stride.second + l - padding.second;
if (inputRow >= 0 && inputRow < inputSize.first && inputCol >= 0 &&
inputCol < inputSize.second) {
int inputIndex = c * inputSize.first * inputSize.second +
inputRow * inputSize.second + inputCol;
sum += d_input[inputIndex];
}
}
}
d_output
[c * outputSize.first * outputSize.second + i * outputSize.second + j] =
sum / (poolingSize.first * poolingSize.second);
}