Use shared memory in the mat_vec_mul kernel

2025-11-05 17:34:21 +00:00 · 2024-03-13 22:13:11 +01:00
parent 09480e42e5
commit 77004c16be
4 changed files with 77 additions and 7 deletions
--- a/src/kernels/matmul.cu
+++ b/src/kernels/matmul.cu
@@ -10,16 +10,22 @@ __global__ void Kernels::mat_vec_mul(
    
    int tid = blockDim.x * blockIdx.x + threadIdx.x;

-    if (tid >= w * h) {
-        return;
+    extern __shared__ float shared[];
+    
+    if (tid < w) {
+        shared[tid] = d_vector[tid];
    }

-    d_output[tid] = 0.0f;
+    __syncthreads();

-    for (int i = 0; i < w; i++) {
-        d_output[tid] += d_matrix[tid * w + i] * d_vector[i];
+    if (tid < h) {
+        d_output[tid] = 0.0f;
+
+        #pragma unroll
+        for (int i = 0; i < w; i++) {
+            d_output[tid] += d_matrix[tid * w + i] * shared[i];
+        }
    }
-
 }

 __global__ void Kernels::vec_vec_add(
--- a/src/layers/dense.cu
+++ b/src/layers/dense.cu
@@ -51,7 +51,7 @@ void Layers::Dense::initializeBiases() {
 }

 float* Layers::Dense::forward(const float* d_input) {
-    Kernels::mat_vec_mul<<<1, outputSize>>>(
+    Kernels::mat_vec_mul<<<1, std::max(inputSize, outputSize), sizeof(float) * inputSize>>>(
        d_weights, d_input, d_output, inputSize, outputSize
    );