diff --git a/include/kernels/convolution.cuh b/include/kernels/convolution.cuh index 59a3b0e..22380a2 100644 --- a/include/kernels/convolution.cuh +++ b/include/kernels/convolution.cuh @@ -39,12 +39,13 @@ __global__ void convolution( const float* __restrict__ d_input, const float* __restrict__ d_kernel, float* __restrict__ d_output, - const unsigned int inputSize, - const unsigned int nChannels, - const unsigned int kernelSize, - const unsigned int stride, - const unsigned int nFilters, - const unsigned int outputSize + const int inputSize, + const int nChannels, + const int paddingSize, + const int kernelSize, + const int stride, + const int nFilters, + const int outputSize ); } // namespace CUDANet::Kernels diff --git a/src/kernels/convolution.cu b/src/kernels/convolution.cu index 1a5eaf4..d25adb3 100644 --- a/src/kernels/convolution.cu +++ b/src/kernels/convolution.cu @@ -83,12 +83,13 @@ __global__ void CUDANet::Kernels::convolution( const float* __restrict__ d_input, const float* __restrict__ d_kernel, float* __restrict__ d_output, - const unsigned int inputSize, - const unsigned int nChannels, - const unsigned int kernelSize, - const unsigned int stride, - const unsigned int nFilters, - const unsigned int outputSize + const int inputSize, + const int nChannels, + const int paddingSize, + const int kernelSize, + const int stride, + const int nFilters, + const int outputSize ) { int tid = blockDim.x * blockIdx.x + threadIdx.x; @@ -104,9 +105,9 @@ __global__ void CUDANet::Kernels::convolution( float sum = 0.0f; // Iterate over kernel and input matrix - for (int k = 0; k < kernelSize; k++) { - for (int l = 0; l < kernelSize; l++) { - for (int c = 0; c < nChannels; c++) { + for (int c = 0; c < nChannels; c++) { + for (int k = 0; k < kernelSize; k++) { + for (int l = 0; l < kernelSize; l++) { int kernelIndex = f * kernelSize * kernelSize * nChannels + c * kernelSize * kernelSize + k * kernelSize + l; diff --git a/src/layers/conv2d.cu b/src/layers/conv2d.cu index 2c4d7c7..065185f 100644 --- a/src/layers/conv2d.cu +++ b/src/layers/conv2d.cu @@ -124,8 +124,8 @@ float* Layers::Conv2d::forward(const float* d_input) { // Convolve THREADS_PER_BLOCK = outputSize * outputSize * numFilters; Kernels::convolution<<<1, THREADS_PER_BLOCK>>>( - d_padded, d_weights, d_output, inputSize + (2 * paddingSize), - inputChannels, kernelSize, stride, numFilters, outputSize + d_padded, d_weights, d_output, inputSize + 2 * paddingSize, inputChannels, paddingSize, + kernelSize, stride, numFilters, outputSize ); // Add bias