Initial cuda conv kernel implementation

2025-12-22 14:24:22 +00:00 · 2024-03-08 23:35:54 +01:00
parent 4b6fcbc191
commit e51aabc2f2
2 changed files with 69 additions and 0 deletions
--- a/src/kernels/convolution.cu
+++ b/src/kernels/convolution.cu
@@ -0,0 +1,53 @@
+#include "convolution.cuh"
+
+/**
+ * Direct 2D convolution: each thread computes one output element.
+ *
+ * Layouts implied by the indexing below (last index contiguous):
+ *   d_input  [row][col][channel], inputSize columns per row
+ *   d_kernel [k][l][channel][filter]
+ *   d_output [i][j][filter], outputSize x outputSize x nFilters elements
+ *
+ * Launch on a 1D grid of at least outputSize * outputSize * nFilters threads;
+ * surplus threads return early. No bounds check is made on d_input: callers
+ * must ensure (outputSize - 1) * stride + kernelSize <= inputSize.
+ */
+__global__ void convolution_kernel(
+    const float* d_input,
+    const float* d_kernel,
+    float*       d_output,
+    int          inputSize,
+    int          nChannels,
+    int          kernelSize,
+    int          stride,
+    int          nFilters,
+    int          outputSize
+) {
+    // 64-bit ids so large grids / outputs cannot wrap a 32-bit int.
+    const long long tid   = (long long)blockIdx.x * blockDim.x + threadIdx.x;
+    const long long total = (long long)outputSize * outputSize * nFilters;
+    if (tid >= total) {
+        return;
+    }
+
+    // f varies fastest across threads (as in the output layout), so tid is
+    // the flat output offset and adjacent threads read adjacent weights.
+    const int f = (int)(tid % nFilters);
+    const int j = (int)((tid / nFilters) % outputSize);
+    const int i = (int)(tid / ((long long)nFilters * outputSize));
+
+    float sum = 0.0f;
+    for (int k = 0; k < kernelSize; k++) {
+        const long long inRow = (long long)(i * stride + k) * inputSize;
+        for (int l = 0; l < kernelSize; l++) {
+            const float* in = d_input + (inRow + (j * stride + l)) * nChannels;
+            const float* w = d_kernel + f +
+                (long long)(k * kernelSize + l) * nChannels * nFilters;
+            for (int c = 0; c < nChannels; c++) {
+                sum += w[(long long)c * nFilters] * in[c];
+            }
+        }
+    }
+
+    d_output[tid] = sum;
+}