Abstract activation and implement softmax

2024-03-17 18:37:15 +01:00
parent b1621819ca
commit 42d646750b
19 changed files with 370 additions and 205 deletions

@@ -0,0 +1,79 @@
#include <functional>
#include "activation_functions.cuh"
#include "cuda_helper.cuh"
__global__ void CUDANet::Kernels::sigmoid(
const float* __restrict__ src,
float* __restrict__ dst,
const unsigned int len
) {
int stride = gridDim.x * blockDim.x;
int tid = blockDim.x * blockIdx.x + threadIdx.x;
for (int i = tid; i < len; i += stride) {
dst[i] = 1.0 / (1.0 + exp(-src[i]));
}
}
__global__ void CUDANet::Kernels::relu(
const float* __restrict__ src,
float* __restrict__ dst,
const unsigned int len
) {
int stride = gridDim.x * blockDim.x;
int tid = blockDim.x * blockIdx.x + threadIdx.x;
for (int i = tid; i < len; i += stride) {
dst[i] = src[i] < 0.0 ? 0.0 : src[i];
}
}
__global__ void CUDANet::Kernels::softmax_exp(
const float* __restrict__ src,
float* __restrict__ dst,
const unsigned int len
) {
int stride = gridDim.x * blockDim.x;
int tid = blockDim.x * blockIdx.x + threadIdx.x;
for (int i = tid; i < len; i += stride) {
dst[i] = exp(src[i]);
}
}
__global__ void CUDANet::Kernels::softmax_sum(
const float* __restrict__ d_vector,
float* __restrict__ d_output,
const unsigned int w
) {
__shared__ float partial_sum[BLOCK_SIZE];
int i = blockIdx.x * blockDim.x * 2 + threadIdx.x;
partial_sum[threadIdx.x] = d_vector[i] + d_vector[i + blockDim.x];
__syncthreads();
for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
if (threadIdx.x < s) {
partial_sum[threadIdx.x] += partial_sum[threadIdx.x + s];
}
__syncthreads();
}
if (threadIdx.x == 0) {
d_output[blockIdx.x] = partial_sum[0];
}
}
__global__ void CUDANet::Kernels::softmax_div(
const float* __restrict__ src,
float* __restrict__ dst,
const float* __restrict__ sum,
const unsigned int len
) {
int stride = gridDim.x * blockDim.x;
int tid = blockDim.x * blockIdx.x + threadIdx.x;
for (int i = tid; i < len; i += stride) {
dst[i] = src[i] / sum[0];
}
}
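
Note: the three softmax kernels decompose softmax into an element-wise exponential (softmax_exp), a shared-memory tree reduction (softmax_sum, where each block folds 2 * blockDim.x inputs into one partial sum), and an element-wise divide by that sum (softmax_div). Below is a minimal host-side sketch of the same decomposition, useful as a reference when checking kernel output; the helper name softmax_reference is hypothetical and not part of this commit.

#include <cmath>
#include <cstddef>
#include <vector>

// Reference softmax with the same exp -> sum -> divide structure as the
// kernels above (no max-subtraction, matching the device code).
std::vector<float> softmax_reference(const std::vector<float>& src) {
    std::vector<float> dst(src.size());
    float sum = 0.0f;
    for (std::size_t i = 0; i < src.size(); ++i) {
        dst[i] = std::exp(src[i]);  // mirrors softmax_exp
        sum += dst[i];              // mirrors the softmax_sum reduction
    }
    for (std::size_t i = 0; i < dst.size(); ++i) {
        dst[i] /= sum;              // mirrors softmax_div
    }
    return dst;
}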

@@ -1,29 +0,0 @@
#include <functional>
#include "activations.cuh"
__global__ void CUDANet::Kernels::sigmoid(
const float* __restrict__ src,
float* __restrict__ dst,
int len
) {
int stride = gridDim.x * blockDim.x;
int tid = blockDim.x * blockIdx.x + threadIdx.x;
for (int i = tid; i < len; i += stride) {
dst[i] = 1.0 / (1.0 + exp(-src[i]));
}
}
__global__ void CUDANet::Kernels::relu(
const float* __restrict__ src,
float* __restrict__ dst,
int len
) {
int stride = gridDim.x * blockDim.x;
int tid = blockDim.x * blockIdx.x + threadIdx.x;
for (int i = tid; i < len; i += stride) {
dst[i] = src[i] < 0.0 ? 0.0 : src[i];
}
}

@@ -47,15 +47,3 @@ __global__ void CUDANet::Kernels::vec_vec_add(
    }
    d_output[tid] = d_vector1[tid] + d_vector2[tid];
}
-__global__ void CUDANet::Kernels::reduce_sum(
-    const float* __restrict__ d_vector,
-    float* __restrict__ d_output,
-    const unsigned int w
-) {
-    int tid = blockDim.x * blockIdx.x + threadIdx.x;
-    __shared__ float shared[BLOCK_SIZE];
-    shared[threadIdx.x] = d_vector[tid];
-    __syncthreads();
-}

src/layers/activation.cu (new file, 60 lines)

@@ -0,0 +1,60 @@
#include "activation.cuh"
#include "cuda_helper.cuh"
#include "activation_functions.cuh"
using namespace CUDANet;
Layers::Activation::Activation(ActivationType activation, const unsigned int length)
: activationType(activation), length(length) {
if (activationType == SOFTMAX) {
d_softmax_sum = nullptr;
CUDA_CHECK(cudaMalloc((void**)&d_softmax_sum, sizeof(float) * length));
}
gridSize = (length + BLOCK_SIZE - 1) / BLOCK_SIZE;
}
Layers::Activation::~Activation() {
if (activationType == SOFTMAX) {
cudaFree(d_softmax_sum);
}
}
void Layers::Activation::activate(float* __restrict__ d_input) {
switch (activationType) {
case SIGMOID:
Kernels::sigmoid<<<gridSize, BLOCK_SIZE>>>(
d_input, d_input, length
);
break;
case RELU:
Kernels::relu<<<gridSize, BLOCK_SIZE>>>(
d_input, d_input, length
);
break;
case SOFTMAX:
Kernels::softmax_exp<<<gridSize, BLOCK_SIZE>>>(
d_input, d_input, length
);
Kernels::softmax_sum<<<gridSize, BLOCK_SIZE>>>(
d_input, d_softmax_sum, length
);
Kernels::softmax_sum<<<1, BLOCK_SIZE>>>(
d_softmax_sum, d_softmax_sum, length
);
Kernels::softmax_div<<<gridSize, BLOCK_SIZE>>>(
d_input, d_input, d_softmax_sum, length
);
break;
default:
break;
}
}
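
The new Layers::Activation wraps the kernels above and runs them in place on a device buffer. Below is a minimal usage sketch, assuming the ActivationType enumerators (SIGMOID, RELU, SOFTMAX) are exposed under CUDANet::Layers, as the Conv2d and Dense signatures further down suggest; the buffer size and contents are illustrative.

#include <vector>
#include "activation.cuh"
#include "cuda_helper.cuh"

int main() {
    const unsigned int n = 1024;
    std::vector<float> h_data(n, 0.5f);

    float* d_data = nullptr;
    CUDA_CHECK(cudaMalloc((void**)&d_data, sizeof(float) * n));
    CUDA_CHECK(cudaMemcpy(
        d_data, h_data.data(), sizeof(float) * n, cudaMemcpyHostToDevice
    ));

    // Applies the sigmoid kernel in place; RELU and SOFTMAX are selected
    // the same way through the constructor argument.
    CUDANet::Layers::Activation sigmoid(CUDANet::Layers::SIGMOID, n);
    sigmoid.activate(d_data);
    CUDA_CHECK(cudaDeviceSynchronize());

    CUDA_CHECK(cudaMemcpy(
        h_data.data(), d_data, sizeof(float) * n, cudaMemcpyDeviceToHost
    ));
    cudaFree(d_data);
    return 0;
}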

@@ -1,7 +1,7 @@
#include <iostream>
#include <string>
#include "activations.cuh"
#include "activation.cuh"
#include "conv2d.cuh"
#include "convolution.cuh"
#include "cuda_helper.cuh"
@@ -10,20 +10,19 @@
using namespace CUDANet;
Layers::Conv2d::Conv2d(
-    int inputSize,
-    int inputChannels,
-    int kernelSize,
-    int stride,
-    Layers::Padding padding,
-    int numFilters,
-    Layers::Activation activation
+    int inputSize,
+    int inputChannels,
+    int kernelSize,
+    int stride,
+    int numFilters,
+    Layers::Padding padding,
+    Layers::ActivationType activationType
)
    : inputSize(inputSize),
      inputChannels(inputChannels),
      kernelSize(kernelSize),
      stride(stride),
-      numFilters(numFilters),
-      activation(activation) {
+      numFilters(numFilters) {
    switch (padding) {
        case SAME:
            outputSize = inputSize;
@@ -39,10 +38,13 @@ Layers::Conv2d::Conv2d(
            break;
    }
+    activation = Layers::Activation(
+        activationType, outputSize * outputSize * numFilters
+    );
    d_output = nullptr;
    CUDA_CHECK(cudaMalloc(
-        (void**)&d_output,
-        sizeof(float) * outputSize * outputSize * numFilters
+        (void**)&d_output, sizeof(float) * outputSize * outputSize * numFilters
    ));
    weights.resize(kernelSize * kernelSize * inputChannels * numFilters);
@@ -131,18 +133,8 @@ float* Layers::Conv2d::forward(const float* d_input) {
        d_biases, d_output, d_output, biases.size()
    );
-    switch (activation) {
-        case SIGMOID:
-            Kernels::sigmoid<<<1, outputSize>>>(d_output, d_output, outputSize);
-            break;
-        case RELU:
-            Kernels::relu<<<1, outputSize>>>(d_output, d_output, outputSize);
-            break;
-        default:
-            break;
-    }
+    // Apply activation
+    activation.activate(d_output);
CUDA_CHECK(cudaDeviceSynchronize());
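
Call sites change accordingly: numFilters now precedes padding, and the last argument is a Layers::ActivationType instead of the old Layers::Activation enum. A hedged call-site sketch follows; the sizes are made up, and the qualification of SAME and RELU under CUDANet::Layers is an assumption.

#include "conv2d.cuh"

// Hypothetical helper, not part of the commit.
void buildConv() {
    CUDANet::Layers::Conv2d conv(
        /*inputSize=*/32,
        /*inputChannels=*/3,
        /*kernelSize=*/3,
        /*stride=*/1,
        /*numFilters=*/16,          // now precedes padding
        CUDANet::Layers::SAME,      // padding
        CUDANet::Layers::RELU       // activationType
    );
    (void)conv;  // construction alone allocates d_output and the activation
}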

@@ -5,7 +5,7 @@
#include <functional>
#include <iostream>
#include "activations.cuh"
#include "activation.cuh"
#include "cuda_helper.cuh"
#include "dense.cuh"
#include "matmul.cuh"
@@ -15,13 +15,15 @@ using namespace CUDANet;
Layers::Dense::Dense(
    int inputSize,
    int outputSize,
-    Layers::Activation activation
+    Layers::ActivationType activationType
)
-    : inputSize(inputSize), outputSize(outputSize), activation(activation) {
+    : inputSize(inputSize), outputSize(outputSize) {
    // Allocate memory for weights and biases
    weights.resize(outputSize * inputSize);
    biases.resize(outputSize);
+    activation = Layers::Activation(activationType, outputSize);
    initializeWeights();
    initializeBiases();
@@ -69,22 +71,7 @@ float* Layers::Dense::forward(const float* d_input) {
        d_biases, d_output, d_output, outputSize
    );
-    switch (activation) {
-        case SIGMOID:
-            Kernels::sigmoid<<<biasGridSize, BLOCK_SIZE>>>(
-                d_output, d_output, outputSize
-            );
-            break;
-        case RELU:
-            Kernels::relu<<<biasGridSize, BLOCK_SIZE>>>(
-                d_output, d_output, outputSize
-            );
-            break;
-        default:
-            break;
-    }
+    activation.activate(d_output);
CUDA_CHECK(cudaDeviceSynchronize());
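
Dense gets the same treatment: the constructor now takes a Layers::ActivationType and forward() delegates to the shared Activation layer. A hedged call-site sketch follows; the helper name and sizes are hypothetical, and the SIGMOID enumerator is assumed to live under CUDANet::Layers.

#include "dense.cuh"

// Hypothetical helper, not part of the commit. d_input must be a device
// pointer holding inputSize (here 256) floats.
float* runDense(float* d_input) {
    CUDANet::Layers::Dense fc(
        /*inputSize=*/256,
        /*outputSize=*/10,
        CUDANet::Layers::SIGMOID
    );
    // forward() applies weights and biases, then activation.activate(),
    // as in the hunk above.
    return fc.forward(d_input);
}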