From e51aabc2f240596a08d1570a2f139eeee05ee5d8 Mon Sep 17 00:00:00 2001
From: LordMathis <matus@namesny.com>
Date: Fri, 8 Mar 2024 23:35:54 +0100
Subject: [PATCH] Initial cuda conv kernel implementation

---
 include/kernels/convolution.cuh | 16 ++++++++++
 src/kernels/convolution.cu      | 53 +++++++++++++++++++++++++++++++++
 2 files changed, 69 insertions(+)
 create mode 100644 include/kernels/convolution.cuh
 create mode 100644 src/kernels/convolution.cu

diff --git a/include/kernels/convolution.cuh b/include/kernels/convolution.cuh
new file mode 100644
index 0000000..d7d009a
--- /dev/null
+++ b/include/kernels/convolution.cuh
@@ -0,0 +1,16 @@
+#ifndef CONVOLUTION_H
+#define CONVOLUTION_H
+
+__global__ void convolution_kernel(
+    const float* d_input,
+    const float* d_kernel,
+    float*       d_output,
+    int          inputSize,
+    int          nChannels,
+    int          kernelSize,
+    int          stride,
+    int          nFilters,
+    int          outputSize
+);
+
+#endif  // CONVOLUTION_H
\ No newline at end of file
diff --git a/src/kernels/convolution.cu b/src/kernels/convolution.cu
new file mode 100644
index 0000000..7e5027d
--- /dev/null
+++ b/src/kernels/convolution.cu
@@ -0,0 +1,53 @@
+#include "convolution.cuh"
+
+__global__ void convolution_kernel(
+    const float* d_input,
+    const float* d_kernel,
+    float*       d_output,
+    int          inputSize,
+    int          nChannels,
+    int          kernelSize,
+    int          stride,
+    int          nFilters,
+    int          outputSize
+) {
+    int tid = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (tid >= outputSize * outputSize * nFilters) {
+        return;
+    }
+
+    // Get output index
+    int f = tid / (outputSize * outputSize);
+    int i = (tid % (outputSize * outputSize)) / outputSize;
+    int j = (tid % (outputSize * outputSize)) % outputSize;
+
+    float sum = 0.0f;
+
+    // std::cout << "f: " << f << ", i: " << i << ", j: " << j << std::endl;
+
+    // Iterate over kernel and input matrix
+    for (int k = 0; k < kernelSize; k++) {
+        for (int l = 0; l < kernelSize; l++) {
+            for (int c = 0; c < nChannels; c++) {
+                int kernelIndex =
+                    k * (kernelSize * nChannels * nFilters) +
+                    l * (nChannels * nFilters) + c * (nFilters) + f;
+                int inputIndex =
+                    (i * stride + k) * (inputSize * nChannels) +
+                    (j * stride + l) * (nChannels) + c;
+
+                // std::cout << "kernelIndex: " << kernelIndex << ", kernel
+                // value: " << kernels[kernelIndex] << ", inputIndex: " <<
+                // inputIndex << ", input value: " << input[inputIndex] <<
+                // std::endl;
+
+                sum += d_kernel[kernelIndex] * d_input[inputIndex];
+            }
+        }
+    }
+
+    // std::cout << "sum: " << sum << std::endl;
+
+    d_output[i * (outputSize * nFilters) + j * (nFilters) + f] = sum;
+}
\ No newline at end of file