mirror of
https://github.com/lordmathis/CUDANet.git
synced 2025-11-05 17:34:21 +00:00
Initial cuda conv kernel implementation
This commit is contained in:
16
include/kernels/convolution.cuh
Normal file
16
include/kernels/convolution.cuh
Normal file
@@ -0,0 +1,16 @@
|
|||||||
|
#ifndef CONVOLUTION_H
|
||||||
|
#define CONVOLUTION_H
|
||||||
|
|
||||||
|
/**
 * Direct 2D convolution; each thread produces one element of the output.
 *
 * Launch layout: one-dimensional grid and blocks (only the .x components are
 * read), with at least outputSize * outputSize * nFilters threads in total.
 * Threads past that count return without touching memory.
 *
 * Index layouts used by the implementation:
 *   d_input  : [row][col][channel]               (row length = inputSize)
 *   d_kernel : [kRow][kCol][channel][filter]
 *   d_output : [row][col][filter]                (row length = outputSize)
 *
 * No padding is applied. NOTE(review): the caller presumably has to guarantee
 * (outputSize - 1) * stride + kernelSize <= inputSize, since the kernel does
 * not bounds-check its input reads -- confirm against the launch site.
 *
 * @param d_input    input feature map (presumably a device pointer)
 * @param d_kernel   filter weights (presumably a device pointer)
 * @param d_output   destination buffer, one float written per working thread
 * @param inputSize  width of one input row, in pixels
 * @param nChannels  number of input channels
 * @param kernelSize side length of the square filter window
 * @param stride     step between neighbouring windows, in input pixels
 * @param nFilters   number of filters (output channels)
 * @param outputSize side length of the square output
 */
__global__ void convolution_kernel(
    const float* d_input, const float* d_kernel, float* d_output,
    int inputSize, int nChannels, int kernelSize,
    int stride, int nFilters, int outputSize
);
|
||||||
|
|
||||||
|
#endif // CONVOLUTION_H
|
||||||
53
src/kernels/convolution.cu
Normal file
53
src/kernels/convolution.cu
Normal file
@@ -0,0 +1,53 @@
|
|||||||
|
#include "convolution.cuh"
|
||||||
|
|
||||||
|
/**
 * Direct 2D convolution; each thread produces one element of the output.
 *
 * Launch layout: one-dimensional grid and blocks (only the .x components are
 * read), with at least outputSize * outputSize * nFilters threads in total.
 * Threads past that count return without touching memory.
 *
 * Index layouts:
 *   d_input  : [row][col][channel]               (row length = inputSize)
 *   d_kernel : [kRow][kCol][channel][filter]
 *   d_output : [row][col][filter]                (row length = outputSize)
 *
 * No padding is applied and input reads are not bounds-checked.
 * NOTE(review): the caller presumably has to guarantee
 * (outputSize - 1) * stride + kernelSize <= inputSize -- confirm against the
 * launch site.
 */
__global__ void convolution_kernel(
    const float* d_input,
    const float* d_kernel,
    float* d_output,
    int inputSize,
    int nChannels,
    int kernelSize,
    int stride,
    int nFilters,
    int outputSize
) {
    const int globalIdx = blockDim.x * blockIdx.x + threadIdx.x;

    // Surplus threads in the last block have no output element to compute.
    if (globalIdx >= outputSize * outputSize * nFilters) {
        return;
    }

    // The flat thread id is filter-major: filter, then output row, then column.
    const int outputArea = outputSize * outputSize;
    const int filter     = globalIdx / outputArea;
    const int pixel      = globalIdx % outputArea;
    const int outRow     = pixel / outputSize;
    const int outCol     = pixel % outputSize;

    // Top-left corner of this output element's window in the input.
    const int rowOrigin = outRow * stride;
    const int colOrigin = outCol * stride;

    float acc = 0.0f;

    // Walk the window row by row, column by column, channel by channel.
    for (int ky = 0; ky < kernelSize; ++ky) {
        const int inputRowBase  = (rowOrigin + ky) * (inputSize * nChannels);
        const int kernelRowBase = ky * (kernelSize * nChannels * nFilters);

        for (int kx = 0; kx < kernelSize; ++kx) {
            const int inputPixelBase = inputRowBase + (colOrigin + kx) * (nChannels);
            const int kernelTapBase  = kernelRowBase + kx * (nChannels * nFilters);

            for (int ch = 0; ch < nChannels; ++ch) {
                const int kernelIndex = kernelTapBase + ch * (nFilters) + filter;
                const int inputIndex  = inputPixelBase + ch;

                acc += d_kernel[kernelIndex] * d_input[inputIndex];
            }
        }
    }

    // Output is stored with the filter index varying fastest.
    d_output[outRow * (outputSize * nFilters) + outCol * (nFilters) + filter] = acc;
}
|
||||||
Reference in New Issue
Block a user