diff --git a/include/layers/dense.cuh b/include/layers/dense.cuh
index cd84f4b..93ce6ab 100644
--- a/include/layers/dense.cuh
+++ b/include/layers/dense.cuh
@@ -66,6 +66,10 @@ class Dense : public ILayer {
 
     Layers::Activation activation;
 
+    // Precomputed kernel launch parameters
+    int forwardGridSize;
+    int biasGridSize;
+
     /**
      * @brief Initialize the weights to zeros
      *
diff --git a/include/utils/cuda_helper.cuh b/include/utils/cuda_helper.cuh
index 31731c5..aa5b7a3 100644
--- a/include/utils/cuda_helper.cuh
+++ b/include/utils/cuda_helper.cuh
@@ -4,6 +4,10 @@
 #include
 #include
 
+#ifndef BLOCK_SIZE
+#define BLOCK_SIZE 128
+#endif // BLOCK_SIZE
+
 /**
  * @brief CUDA error checking macro
  *
diff --git a/src/kernels/matmul.cu b/src/kernels/matmul.cu
index cb483f2..a176697 100644
--- a/src/kernels/matmul.cu
+++ b/src/kernels/matmul.cu
@@ -1,31 +1,47 @@
+#include "cuda_helper.cuh"
 #include "matmul.cuh"
 
+#define SHARED_SIZE (128 * 4)
+
 __global__ void Kernels::mat_vec_mul(
-    const float* d_matrix,
-    const float* d_vector,
-    float* d_output,
-    int w,
-    int h
+    const float* __restrict__ d_matrix,
+    const float* __restrict__ d_vector,
+    float* __restrict__ d_output,
+    int w,
+    int h
 ) {
     int tid = blockDim.x * blockIdx.x + threadIdx.x;
-    extern __shared__ float shared[];
-
-    if (tid < w) {
-        shared[tid] = d_vector[tid];
-    }
+    __shared__ float shared[BLOCK_SIZE];
 
-    __syncthreads();
+    float temp = 0.0f;
 
-    if (tid < h) {
-        d_output[tid] = 0.0f;
-
-        #pragma unroll
-        for (int i = 0; i < w; i++) {
-            d_output[tid] += d_matrix[tid * w + i] * shared[i];
+    #pragma unroll
+    for (unsigned int i = 0; i < (w + BLOCK_SIZE - 1) / BLOCK_SIZE; i++)
+    {
+        // Stage the next BLOCK_SIZE chunk of the vector in shared memory, zero-padded
+        if (i * BLOCK_SIZE + threadIdx.x < w) {
+            shared[threadIdx.x] = d_vector[i * BLOCK_SIZE + threadIdx.x];
+        } else {
+            shared[threadIdx.x] = 0.0f;
         }
+
+        __syncthreads();
+
+        if (tid < h) {
+            for (unsigned int j = 0; j < BLOCK_SIZE && i * BLOCK_SIZE + j < w; j++)
+            {
+                temp += d_matrix[tid * w + i * BLOCK_SIZE + j] * shared[j];
+            }
+        }
+
+        __syncthreads();
     }
+
+    if (tid < h) {
+        d_output[tid] = temp;
+    }
 }
 
 __global__ void Kernels::vec_vec_add(
diff --git a/src/layers/dense.cu b/src/layers/dense.cu
index 454f932..289c8a0 100644
--- a/src/layers/dense.cu
+++ b/src/layers/dense.cu
@@ -10,7 +10,11 @@
 #include "dense.cuh"
 #include "matmul.cuh"
 
-Layers::Dense::Dense(int inputSize, int outputSize, Layers::Activation activation)
+Layers::Dense::Dense(
+    int inputSize,
+    int outputSize,
+    Layers::Activation activation
+)
     : inputSize(inputSize), outputSize(outputSize), activation(activation) {
     // Allocate memory for weights and biases
     weights.resize(outputSize * inputSize);
@@ -31,8 +35,12 @@ Layers::Dense::Dense(int inputSize, int outputSize, Layers::Activation activation)
         cudaMalloc((void**)&d_weights, sizeof(float) * inputSize * outputSize)
     );
 
     CUDA_CHECK(cudaMalloc((void**)&d_biases, sizeof(float) * outputSize));
-    toCuda();
+
+    // Calculate block and grid sizes
+    forwardGridSize =
+        (std::max(inputSize, outputSize) + BLOCK_SIZE - 1) / BLOCK_SIZE;
+    biasGridSize = (outputSize + BLOCK_SIZE - 1) / BLOCK_SIZE;
 }
 
 Layers::Dense::~Dense() {
@@ -51,21 +59,25 @@ void Layers::Dense::initializeBiases() {
 }
 
 float* Layers::Dense::forward(const float* d_input) {
-    Kernels::mat_vec_mul<<<1, std::max(inputSize, outputSize), sizeof(float) * inputSize>>>(
+    Kernels::mat_vec_mul<<<forwardGridSize, BLOCK_SIZE>>>(
         d_weights, d_input, d_output, inputSize, outputSize
     );
-    Kernels::vec_vec_add<<<1, outputSize>>>(
+    Kernels::vec_vec_add<<<biasGridSize, BLOCK_SIZE>>>(
         d_biases, d_output, d_output, outputSize
     );
 
     switch (activation) {
         case SIGMOID:
-            Kernels::sigmoid<<<1, outputSize>>>(d_output, d_output, outputSize);
+            Kernels::sigmoid<<<biasGridSize, BLOCK_SIZE>>>(
+                d_output, d_output, outputSize
+            );
             break;
         case RELU:
-            Kernels::relu<<<1, outputSize>>>(d_output, d_output, outputSize);
+            Kernels::relu<<<biasGridSize, BLOCK_SIZE>>>(
+                d_output, d_output, outputSize
+            );
             break;
         default: