From c062e89972387c6b237ac36ffdca27ed94f99251 Mon Sep 17 00:00:00 2001 From: LordMathis Date: Wed, 20 Mar 2024 19:17:30 +0100 Subject: [PATCH] Use 3d memory layout for pooling --- src/kernels/pooling.cu | 40 +++++++++++++++---------------------- src/layers/avg_pooling.cu | 42 ++++++++++++++++++++++----------------- src/layers/max_pooling.cu | 10 +++++++++- 3 files changed, 49 insertions(+), 43 deletions(-) diff --git a/src/kernels/pooling.cu b/src/kernels/pooling.cu index 580b552..d1a5066 100644 --- a/src/kernels/pooling.cu +++ b/src/kernels/pooling.cu @@ -1,6 +1,5 @@ -#include "pooling.cuh" - #include "cuda_helper.cuh" +#include "pooling.cuh" using namespace CUDANet; @@ -12,24 +11,20 @@ __global__ void Kernels::max_pooling( const int poolingSize, const int stride ) { - int tid = blockDim.x * blockIdx.x + threadIdx.x; - if (tid >= inputSize * inputSize * nChannels) { + int j = blockDim.x * blockIdx.x + threadIdx.x; + int i = blockDim.y * blockIdx.y + threadIdx.y; + int c = blockDim.z * blockIdx.z + threadIdx.z; + + if (i >= inputSize || j >= inputSize || c >= nChannels) { return; } - // Get output index - int c = tid / (inputSize * inputSize); - int i = tid % (inputSize * inputSize) / inputSize; - int j = tid % inputSize; - float max = 0.0f; for (int k = 0; k < poolingSize; k++) { for (int l = 0; l < poolingSize; l++) { - int inputIndex = c * inputSize * inputSize + - (i * stride + k) * inputSize + - (j * stride + l); + (i * stride + k) * inputSize + (j * stride + l); if (d_input[inputIndex] > max) { max = d_input[inputIndex]; @@ -37,7 +32,7 @@ __global__ void Kernels::max_pooling( } } - d_output[tid] = max; + d_output[c * inputSize * inputSize + i * inputSize + j] = max; } __global__ void Kernels::avg_pooling( @@ -48,28 +43,25 @@ __global__ void Kernels::avg_pooling( const int poolingSize, const int stride ) { - int tid = blockDim.x * blockIdx.x + threadIdx.x; - if (tid >= inputSize * inputSize * nChannels) { + int j = blockDim.x * blockIdx.x + threadIdx.x; + int i = blockDim.y * blockIdx.y + threadIdx.y; + int c = blockDim.z * blockIdx.z + threadIdx.z; + + if (i >= inputSize || j >= inputSize || c >= nChannels) { return; } - // Get output index - int c = tid / (inputSize * inputSize); - int i = tid % (inputSize * inputSize) / inputSize; - int j = tid % inputSize; - float sum = 0.0f; for (int k = 0; k < poolingSize; k++) { for (int l = 0; l < poolingSize; l++) { - int inputIndex = c * inputSize * inputSize + - (i * stride + k) * inputSize + - (j * stride + l); + (i * stride + k) * inputSize + (j * stride + l); sum += d_input[inputIndex]; } } - d_output[tid] = sum / (poolingSize * poolingSize); + d_output[c * inputSize * inputSize + i * inputSize + j] = + sum / (poolingSize * poolingSize); } \ No newline at end of file diff --git a/src/layers/avg_pooling.cu b/src/layers/avg_pooling.cu index b8180df..5c61566 100644 --- a/src/layers/avg_pooling.cu +++ b/src/layers/avg_pooling.cu @@ -5,38 +5,44 @@ using namespace CUDANet::Layers; AvgPooling2D::AvgPooling2D( - int inputSize, - int nChannels, - int poolingSize, - int stride, - ActivationType activationType - ) - : inputSize(inputSize), nChannels(nChannels), poolingSize(poolingSize), stride(stride) { + int inputSize, + int nChannels, + int poolingSize, + int stride, + ActivationType activationType +) + : inputSize(inputSize), + nChannels(nChannels), + poolingSize(poolingSize), + stride(stride) { + outputSize = (inputSize - poolingSize) / stride + 1; - - outputSize = (inputSize - poolingSize) / stride + 1; - - activation = Activation( - activationType, outputSize * outputSize * nChannels - ); + activation = + Activation(activationType, outputSize * outputSize * nChannels); d_output = nullptr; CUDA_CHECK(cudaMalloc( (void**)&d_output, sizeof(float) * outputSize * outputSize * nChannels )); - gridSize = (outputSize * outputSize * nChannels + BLOCK_SIZE - 1) / BLOCK_SIZE; - + gridSize = + (outputSize * outputSize * nChannels + BLOCK_SIZE - 1) / BLOCK_SIZE; } - AvgPooling2D::~AvgPooling2D() { cudaFree(d_output); } - float* AvgPooling2D::forward(const float* d_input) { - Kernels::avg_pooling<<>>( + + dim3 block(8, 8, 8); + dim3 grid( + (outputSize + block.x - 1) / block.x, + (outputSize + block.y - 1) / block.y, + (nChannels + block.z - 1) / block.z + ); + + Kernels::avg_pooling<<>>( d_input, d_output, inputSize, nChannels, poolingSize, stride ); diff --git a/src/layers/max_pooling.cu b/src/layers/max_pooling.cu index c660782..3dac272 100644 --- a/src/layers/max_pooling.cu +++ b/src/layers/max_pooling.cu @@ -37,7 +37,15 @@ MaxPooling2D::~MaxPooling2D() { float* MaxPooling2D::forward(const float* d_input) { - Kernels::max_pooling<<>>( + + dim3 block(8,8,8); + dim3 grid( + (outputSize + block.x - 1) / block.x, + (outputSize + block.y - 1) / block.y, + (nChannels + block.z - 1) / block.z + ); + + Kernels::max_pooling<<>>( d_input, d_output, inputSize, nChannels, poolingSize, stride );