diff --git a/src/kernels/convolution.cu b/src/kernels/convolution.cu index 9f1efc2..9c7a6b3 100644 --- a/src/kernels/convolution.cu +++ b/src/kernels/convolution.cu @@ -16,17 +16,14 @@ __global__ void Kernels::convolution( const int nFilters, const int outputSize ) { - int tid = blockDim.x * blockIdx.x + threadIdx.x; + int j = blockDim.x * blockIdx.x + threadIdx.x; + int i = blockDim.y * blockIdx.y + threadIdx.y; + int f = blockDim.z * blockIdx.z + threadIdx.z; - if (tid >= outputSize * outputSize * nFilters) { + if (i >= outputSize || j >= outputSize || f >= nFilters) { return; } - // Get output index - int f = tid / (outputSize * outputSize); - int i = tid % (outputSize * outputSize) / outputSize; - int j = tid % outputSize; - float sum = 0.0f; // Iterate over kernel and input matrix @@ -54,5 +51,5 @@ __global__ void Kernels::convolution( } } - d_output[tid] = sum; + d_output[f * outputSize * outputSize + i * outputSize + j] = sum; } \ No newline at end of file diff --git a/src/layers/conv2d.cu b/src/layers/conv2d.cu index 3b046ec..9c269d3 100644 --- a/src/layers/conv2d.cu +++ b/src/layers/conv2d.cu @@ -108,8 +108,14 @@ void Conv2d::toCuda() { float* Conv2d::forward(const float* d_input) { // Convolve - int THREADS_PER_BLOCK = outputSize * outputSize * numFilters; - Kernels::convolution<<<1, THREADS_PER_BLOCK>>>( + dim3 block(8,8,8); + dim3 grid( + (outputSize + block.x - 1) / block.x, + (outputSize + block.y - 1) / block.y, + (numFilters + block.z - 1) / block.z + ); + + Kernels::convolution<<<grid, block>>>( d_input, d_weights, d_output, inputSize, inputChannels, paddingSize, kernelSize, stride, numFilters, outputSize );