From b49dddf34a2167257b94144434604ad507855d98 Mon Sep 17 00:00:00 2001 From: LordMathis Date: Mon, 8 Apr 2024 23:25:46 +0200 Subject: [PATCH] Improve softmax numerical stability --- include/kernels/matmul.cuh | 27 +++++++++++++++++++++++++++ include/layers/activation.cuh | 1 + src/kernels/matmul.cu | 35 +++++++++++++++++++++++++++++++++++ src/layers/activation.cu | 22 ++++++++++++++++++++++ test/kernels/test_matmul.cu | 29 +++++++++++++++++++++++++++++ test/model/test_model.cu | 9 +++++---- 6 files changed, 119 insertions(+), 4 deletions(-) diff --git a/include/kernels/matmul.cuh b/include/kernels/matmul.cuh index 3d067b8..75100ba 100644 --- a/include/kernels/matmul.cuh +++ b/include/kernels/matmul.cuh @@ -35,6 +35,33 @@ __global__ void vec_vec_add( const unsigned int w ); +/** + * @brief Max reduction kernel + * + * @param d_vector Device pointer to vector + * @param d_output Device pointer to output vector + */ +__global__ void max_reduce( + const float* __restrict__ d_vector, + float* __restrict__ d_output +); + +/** + * @brief Add scalar to each element of the vector + * + * @param d_vector + * @param d_scalar + * @param d_output + * @param w + * @return __global__ + */ +__global__ void vec_scalar_sub( + const float* __restrict__ d_vector, + const float* __restrict__ d_scalar, + float* __restrict__ d_output, + const unsigned int w +); + } // namespace CUDANet::Kernels #endif // CUDANET_MATMUL_H \ No newline at end of file diff --git a/include/layers/activation.cuh b/include/layers/activation.cuh index 66f492f..dab7aa9 100644 --- a/include/layers/activation.cuh +++ b/include/layers/activation.cuh @@ -50,6 +50,7 @@ class Activation { unsigned int gridSize; float* d_softmax_sum; + float* d_max; }; diff --git a/src/kernels/matmul.cu b/src/kernels/matmul.cu index 24ac1b4..190b8b4 100644 --- a/src/kernels/matmul.cu +++ b/src/kernels/matmul.cu @@ -49,3 +49,38 @@ __global__ void Kernels::vec_vec_add( } d_output[tid] = d_vector1[tid] + d_vector2[tid]; } + +__global__ void Kernels::max_reduce( + const float* __restrict__ d_vector, + float* __restrict__ d_output +) { + __shared__ float shared_max[BLOCK_SIZE]; + int i = blockIdx.x * blockDim.x + threadIdx.x; + + shared_max[threadIdx.x] = d_vector[i]; + __syncthreads(); + + for (int s = blockDim.x / 2; s > 0; s >>= 1) { + if (threadIdx.x < s) { + shared_max[threadIdx.x] = fmaxf(shared_max[threadIdx.x], shared_max[threadIdx.x + s]); + } + __syncthreads(); + } + + if (threadIdx.x == 0) { + d_output[blockIdx.x] = shared_max[0]; + } +} + +__global__ void Kernels::vec_scalar_sub( + const float* __restrict__ d_vector, + const float* __restrict__ d_scalar, + float* __restrict__ d_output, + const unsigned int w +) { + int tid = blockDim.x * blockIdx.x + threadIdx.x; + if (tid >= w) { + return; + } + d_output[tid] = d_vector[tid] - d_scalar[0]; +} \ No newline at end of file diff --git a/src/layers/activation.cu b/src/layers/activation.cu index 8c002cd..15175b4 100644 --- a/src/layers/activation.cu +++ b/src/layers/activation.cu @@ -2,6 +2,10 @@ #include "cuda_helper.cuh" #include "activation_functions.cuh" +#include "matmul.cuh" + +#include +#include using namespace CUDANet::Layers; @@ -11,6 +15,9 @@ Activation::Activation(ActivationType activation, const unsigned int length) if (activationType == SOFTMAX) { d_softmax_sum = nullptr; CUDA_CHECK(cudaMalloc((void**)&d_softmax_sum, sizeof(float) * length)); + + d_max = nullptr; + CUDA_CHECK(cudaMalloc((void**)&d_max, sizeof(float) * length)); } gridSize = (length + BLOCK_SIZE - 1) / BLOCK_SIZE; @@ -37,6 +44,21 @@ void Activation::activate(float* __restrict__ d_input) { ); break; case SOFTMAX: + + // Find max value + Kernels::max_reduce<<>>( + d_input, d_max + ); + Kernels::max_reduce<<<1, BLOCK_SIZE>>>( + d_max, d_max + ); + + // Subtract max value to improve numerical stability + Kernels::vec_scalar_sub<<>>( + d_input, d_max, d_input, length + ); + + // Compute softmax Kernels::softmax_exp<<>>( d_input, d_input, length ); diff --git a/test/kernels/test_matmul.cu b/test/kernels/test_matmul.cu index e1c89af..cd8a1a6 100644 --- a/test/kernels/test_matmul.cu +++ b/test/kernels/test_matmul.cu @@ -3,6 +3,7 @@ #include +#include "cuda_helper.cuh" #include "matmul.cuh" TEST(MatMulTest, MatVecMulTest) { @@ -61,3 +62,31 @@ TEST(MatMulTest, MatVecMulTest) { } } +TEST(MatMulTest, MaxReduceTest) { + cudaError_t cudaStatus; + + std::vector input = {0.643f, 0.912f, 0.723f, 0.587f, 0.155f, 0.932f, 0.391f, 0.279f, 0.846f, 0.788f}; + + float* d_input; + float* d_output; + + cudaStatus = cudaMalloc((void**)&d_input, sizeof(float) * 10); + EXPECT_EQ(cudaStatus, cudaSuccess); + + cudaStatus = cudaMalloc((void**)&d_output, sizeof(float)); + EXPECT_EQ(cudaStatus, cudaSuccess); + + cudaStatus = cudaMemcpy(d_input, input.data(), sizeof(float) * 10, cudaMemcpyHostToDevice); + EXPECT_EQ(cudaStatus, cudaSuccess); + + const int grid_size = (10 + BLOCK_SIZE - 1) / BLOCK_SIZE; + + CUDANet::Kernels::max_reduce<<>>(d_input, d_output); + CUDANet::Kernels::max_reduce<<<1, BLOCK_SIZE>>>(d_output, d_output); + + std::vector output(10); + cudaStatus = cudaMemcpy(output.data(), d_output, sizeof(float), cudaMemcpyDeviceToHost); + EXPECT_EQ(cudaStatus, cudaSuccess); + + EXPECT_EQ(output[0], 0.932f); +} \ No newline at end of file diff --git a/test/model/test_model.cu b/test/model/test_model.cu index 52865c0..0702cf8 100644 --- a/test/model/test_model.cu +++ b/test/model/test_model.cu @@ -52,7 +52,7 @@ TEST(Model, TestModelPredict) { // dense CUDANet::Layers::Dense dense( - 18, 6, CUDANet::Layers::ActivationType::NONE + 18, 6, CUDANet::Layers::ActivationType::SOFTMAX ); // dense weights 18*6 std::vector denseWeights = { @@ -93,13 +93,14 @@ TEST(Model, TestModelPredict) { // predict const float* output = model.predict(input.data()); + float sum = 0.0f; // float sum = 0.0f; for (int i = 0; i < outputSize; ++i) { - // sum += output[i]; + sum += output[i]; std::cout << output[i] << " "; } - // EXPECT_NEAR(sum, 1.0f, 1e-5f); - std::cout << std::endl; + + EXPECT_NEAR(sum, 1.0f, 1e-5f); } \ No newline at end of file