From e51aabc2f240596a08d1570a2f139eeee05ee5d8 Mon Sep 17 00:00:00 2001 From: LordMathis Date: Fri, 8 Mar 2024 23:35:54 +0100 Subject: [PATCH] Initial cuda conv kernel implementation --- include/kernels/convolution.cuh | 16 ++++++++++ src/kernels/convolution.cu | 53 +++++++++++++++++++++++++++++++++ 2 files changed, 69 insertions(+) create mode 100644 include/kernels/convolution.cuh create mode 100644 src/kernels/convolution.cu diff --git a/include/kernels/convolution.cuh b/include/kernels/convolution.cuh new file mode 100644 index 0000000..d7d009a --- /dev/null +++ b/include/kernels/convolution.cuh @@ -0,0 +1,16 @@ +#ifndef CONVOLUTION_H +#define CONVOLUTION_H + +__global__ void convolution_kernel( + const float* d_input, + const float* d_kernel, + float* d_output, + int inputSize, + int nChannels, + int kernelSize, + int stride, + int nFilters, + int outputSize +); + +#endif // CONVOLUTION_H \ No newline at end of file diff --git a/src/kernels/convolution.cu b/src/kernels/convolution.cu new file mode 100644 index 0000000..7e5027d --- /dev/null +++ b/src/kernels/convolution.cu @@ -0,0 +1,53 @@ +#include "convolution.cuh" + +__global__ void convolution_kernel( + const float* d_input, + const float* d_kernel, + float* d_output, + int inputSize, + int nChannels, + int kernelSize, + int stride, + int nFilters, + int outputSize +) { + int tid = blockDim.x * blockIdx.x + threadIdx.x; + + if (tid >= outputSize * outputSize * nFilters) { + return; + } + + // Get output index + int f = tid / (outputSize * outputSize); + int i = (tid % (outputSize * outputSize)) / outputSize; + int j = (tid % (outputSize * outputSize)) % outputSize; + + float sum = 0.0f; + + // std::cout << "f: " << f << ", i: " << i << ", j: " << j << std::endl; + + // Iterate over kernel and input matrix + for (int k = 0; k < kernelSize; k++) { + for (int l = 0; l < kernelSize; l++) { + for (int c = 0; c < nChannels; c++) { + int kernelIndex = + k * (kernelSize * nChannels * nFilters) + + l * (nChannels * nFilters) + c * (nFilters) + f; + int inputIndex = + (i * stride + k) * (inputSize * nChannels) + + (j * stride + l) * (nChannels) + c; + + // std::cout << "kernelIndex: " << kernelIndex << ", kernel + // value: " << kernels[kernelIndex] << ", inputIndex: " << + // inputIndex << ", input value: " << input[inputIndex] << + // std::endl; + + sum += d_kernel[kernelIndex] * d_input[inputIndex]; + } + } + } + + // std::cout << "sum: " << sum << std::endl; + + d_output[i * (outputSize * nFilters) + j * (nFilters) + f] = sum; +} \ No newline at end of file