Mirror of https://github.com/lordmathis/CUDANet.git (synced 2025-11-07 02:04:26 +00:00)
Abstract activation and implement softmax
src/kernels/activation_functions.cu (new file, 79 additions)
@@ -0,0 +1,79 @@
+#include <functional>
+
+#include "activation_functions.cuh"
+#include "cuda_helper.cuh"
+
+__global__ void CUDANet::Kernels::sigmoid(
+    const float* __restrict__ src,
+    float* __restrict__ dst,
+    const unsigned int len
+) {
+    int stride = gridDim.x * blockDim.x;
+    int tid = blockDim.x * blockIdx.x + threadIdx.x;
+
+    for (int i = tid; i < len; i += stride) {
+        dst[i] = 1.0 / (1.0 + exp(-src[i]));
+    }
+}
+
+__global__ void CUDANet::Kernels::relu(
+    const float* __restrict__ src,
+    float* __restrict__ dst,
+    const unsigned int len
+) {
+    int stride = gridDim.x * blockDim.x;
+    int tid = blockDim.x * blockIdx.x + threadIdx.x;
+
+    for (int i = tid; i < len; i += stride) {
+        dst[i] = src[i] < 0.0 ? 0.0 : src[i];
+    }
+}
+
+__global__ void CUDANet::Kernels::softmax_exp(
+    const float* __restrict__ src,
+    float* __restrict__ dst,
+    const unsigned int len
+) {
+    int stride = gridDim.x * blockDim.x;
+    int tid = blockDim.x * blockIdx.x + threadIdx.x;
+
+    for (int i = tid; i < len; i += stride) {
+        dst[i] = exp(src[i]);
+    }
+}
+
+__global__ void CUDANet::Kernels::softmax_sum(
+    const float* __restrict__ d_vector,
+    float* __restrict__ d_output,
+    const unsigned int w
+) {
+    __shared__ float partial_sum[BLOCK_SIZE];
+    int i = blockIdx.x * blockDim.x * 2 + threadIdx.x;
+    partial_sum[threadIdx.x] = d_vector[i] + d_vector[i + blockDim.x];
+    __syncthreads();
+
+    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
+        if (threadIdx.x < s) {
+            partial_sum[threadIdx.x] += partial_sum[threadIdx.x + s];
+        }
+        __syncthreads();
+    }
+
+    if (threadIdx.x == 0) {
+        d_output[blockIdx.x] = partial_sum[0];
+    }
+}
+
+__global__ void CUDANet::Kernels::softmax_div(
+    const float* __restrict__ src,
+    float* __restrict__ dst,
+    const float* __restrict__ sum,
+    const unsigned int len
+) {
+    int stride = gridDim.x * blockDim.x;
+    int tid = blockDim.x * blockIdx.x + threadIdx.x;
+
+    for (int i = tid; i < len; i += stride) {
+        dst[i] = src[i] / sum[0];
+    }
+}
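Taken together, the softmax_* kernels compute softmax in three stages: exponentiate, reduce to a single sum, divide. Each softmax_sum block loads two elements per thread (i and i + blockDim.x), tree-reduces them in shared memory, and writes one partial sum per block; a second single-block launch folds the partials into element 0 (the w parameter is unused by the kernel body). A minimal host-side sketch of the composition, not part of this commit: the driver name and d_scratch buffer are illustrative, and it assumes d_vec is allocated and zero-padded to 2 * grid * BLOCK_SIZE floats and d_scratch to 2 * BLOCK_SIZE floats, since the paired loads read past the logical length.

    // Illustrative driver: in-place softmax over d_vec (len logical floats).
    // d_scratch receives one partial sum per block of the first pass.
    void softmax_inplace(float* d_vec, float* d_scratch, unsigned int len) {
        const int grid = (len + BLOCK_SIZE - 1) / BLOCK_SIZE;
        CUDANet::Kernels::softmax_exp<<<grid, BLOCK_SIZE>>>(d_vec, d_vec, len);
        // First pass: one partial sum per block lands in d_scratch.
        CUDANet::Kernels::softmax_sum<<<grid, BLOCK_SIZE>>>(d_vec, d_scratch, len);
        // Second pass: a single block folds the partials into d_scratch[0].
        CUDANet::Kernels::softmax_sum<<<1, BLOCK_SIZE>>>(d_scratch, d_scratch, grid);
        CUDANet::Kernels::softmax_div<<<grid, BLOCK_SIZE>>>(d_vec, d_vec, d_scratch, len);
    }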
src/kernels/activations.cu (deleted, 29 lines)
@@ -1,29 +0,0 @@
-#include <functional>
-
-#include "activations.cuh"
-
-__global__ void CUDANet::Kernels::sigmoid(
-    const float* __restrict__ src,
-    float* __restrict__ dst,
-    int len
-) {
-    int stride = gridDim.x * blockDim.x;
-    int tid = blockDim.x * blockIdx.x + threadIdx.x;
-
-    for (int i = tid; i < len; i += stride) {
-        dst[i] = 1.0 / (1.0 + exp(-src[i]));
-    }
-}
-
-__global__ void CUDANet::Kernels::relu(
-    const float* __restrict__ src,
-    float* __restrict__ dst,
-    int len
-) {
-    int stride = gridDim.x * blockDim.x;
-    int tid = blockDim.x * blockIdx.x + threadIdx.x;
-
-    for (int i = tid; i < len; i += stride) {
-        dst[i] = src[i] < 0.0 ? 0.0 : src[i];
-    }
-}
@@ -47,15 +47,3 @@ __global__ void CUDANet::Kernels::vec_vec_add(
     }
     d_output[tid] = d_vector1[tid] + d_vector2[tid];
 }
-
-__global__ void CUDANet::Kernels::reduce_sum(
-    const float* __restrict__ d_vector,
-    float* __restrict__ d_output,
-    const unsigned int w
-) {
-    int tid = blockDim.x * blockIdx.x + threadIdx.x;
-
-    __shared__ float shared[BLOCK_SIZE];
-    shared[threadIdx.x] = d_vector[tid];
-    __syncthreads();
-}
src/layers/activation.cu (new file, 60 additions)
@@ -0,0 +1,60 @@
+#include "activation.cuh"
+
+#include "cuda_helper.cuh"
+#include "activation_functions.cuh"
+
+using namespace CUDANet;
+
+Layers::Activation::Activation(ActivationType activation, const unsigned int length)
+    : activationType(activation), length(length) {
+
+    if (activationType == SOFTMAX) {
+        d_softmax_sum = nullptr;
+        CUDA_CHECK(cudaMalloc((void**)&d_softmax_sum, sizeof(float) * length));
+    }
+
+    gridSize = (length + BLOCK_SIZE - 1) / BLOCK_SIZE;
+}
+
+Layers::Activation::~Activation() {
+    if (activationType == SOFTMAX) {
+        cudaFree(d_softmax_sum);
+    }
+}
+
+void Layers::Activation::activate(float* __restrict__ d_input) {
+
+    switch (activationType) {
+        case SIGMOID:
+            Kernels::sigmoid<<<gridSize, BLOCK_SIZE>>>(
+                d_input, d_input, length
+            );
+            break;
+
+        case RELU:
+            Kernels::relu<<<gridSize, BLOCK_SIZE>>>(
+                d_input, d_input, length
+            );
+            break;
+        case SOFTMAX:
+            Kernels::softmax_exp<<<gridSize, BLOCK_SIZE>>>(
+                d_input, d_input, length
+            );
+
+            Kernels::softmax_sum<<<gridSize, BLOCK_SIZE>>>(
+                d_input, d_softmax_sum, length
+            );
+
+            Kernels::softmax_sum<<<1, BLOCK_SIZE>>>(
+                d_softmax_sum, d_softmax_sum, length
+            );
+
+            Kernels::softmax_div<<<gridSize, BLOCK_SIZE>>>(
+                d_input, d_input, d_softmax_sum, length
+            );
+            break;
+
+        default:
+            break;
+    }
+}
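The new layer applies its activation in place on a device buffer. A short usage sketch (the buffer name, length, and the enum qualification CUDANet::Layers::SOFTMAX are assumptions for illustration):

    // Illustrative: normalize a 10-element device buffer with softmax.
    float* d_logits = nullptr;
    CUDA_CHECK(cudaMalloc((void**)&d_logits, sizeof(float) * 10));
    // ... copy logits into d_logits ...
    CUDANet::Layers::Activation act(CUDANet::Layers::SOFTMAX, 10);
    act.activate(d_logits);  // d_logits now holds the softmax probabilities
    CUDA_CHECK(cudaDeviceSynchronize());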
src/layers/conv2d.cu
@@ -1,7 +1,7 @@
 #include <iostream>
 #include <string>

-#include "activations.cuh"
+#include "activation.cuh"
 #include "conv2d.cuh"
 #include "convolution.cuh"
 #include "cuda_helper.cuh"
@@ -10,20 +10,19 @@
 using namespace CUDANet;

 Layers::Conv2d::Conv2d(
-    int inputSize,
-    int inputChannels,
-    int kernelSize,
-    int stride,
-    Layers::Padding padding,
-    int numFilters,
-    Layers::Activation activation
+    int inputSize,
+    int inputChannels,
+    int kernelSize,
+    int stride,
+    int numFilters,
+    Layers::Padding padding,
+    Layers::ActivationType activationType
 )
     : inputSize(inputSize),
       inputChannels(inputChannels),
       kernelSize(kernelSize),
       stride(stride),
-      numFilters(numFilters),
-      activation(activation) {
+      numFilters(numFilters) {
     switch (padding) {
         case SAME:
             outputSize = inputSize;
@@ -39,10 +38,13 @@ Layers::Conv2d::Conv2d(
             break;
     }

+    activation = Layers::Activation(
+        activationType, outputSize * outputSize * numFilters
+    );
+
     d_output = nullptr;
     CUDA_CHECK(cudaMalloc(
-        (void**)&d_output,
-        sizeof(float) * outputSize * outputSize * numFilters
+        (void**)&d_output, sizeof(float) * outputSize * outputSize * numFilters
     ));

     weights.resize(kernelSize * kernelSize * inputChannels * numFilters);
@@ -131,18 +133,8 @@ float* Layers::Conv2d::forward(const float* d_input) {
         d_biases, d_output, d_output, biases.size()
     );

-    switch (activation) {
-        case SIGMOID:
-            Kernels::sigmoid<<<1, outputSize>>>(d_output, d_output, outputSize);
-            break;
-
-        case RELU:
-            Kernels::relu<<<1, outputSize>>>(d_output, d_output, outputSize);
-            break;
-
-        default:
-            break;
-    }
+    // Apply activation
+    activation.activate(d_output);

     CUDA_CHECK(cudaDeviceSynchronize());
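Beyond the parameter reordering (padding and numFilters swap places, and the Activation argument becomes an ActivationType), this changes the activation coverage in forward: the old switch launched sigmoid/relu as <<<1, outputSize>>>, touching only outputSize elements, while the new Activation member is sized over the full outputSize * outputSize * numFilters output volume. A hypothetical construction with the new signature (all argument values illustrative, and the enum qualifications are assumed):

    // Illustrative: 28x28 single-channel input, 3x3 kernel, stride 1,
    // 16 filters, SAME padding, ReLU over the whole output volume.
    CUDANet::Layers::Conv2d conv(
        28, 1, 3, 1, 16, CUDANet::Layers::SAME, CUDANet::Layers::RELU
    );
    float* d_out = conv.forward(d_input);  // d_input: 28 * 28 floats on device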
src/layers/dense.cu
@@ -5,7 +5,7 @@
 #include <functional>
 #include <iostream>

-#include "activations.cuh"
+#include "activation.cuh"
 #include "cuda_helper.cuh"
 #include "dense.cuh"
 #include "matmul.cuh"
@@ -15,13 +15,15 @@ using namespace CUDANet;
 Layers::Dense::Dense(
     int inputSize,
     int outputSize,
-    Layers::Activation activation
+    Layers::ActivationType activationType
 )
-    : inputSize(inputSize), outputSize(outputSize), activation(activation) {
+    : inputSize(inputSize), outputSize(outputSize) {
     // Allocate memory for weights and biases
     weights.resize(outputSize * inputSize);
     biases.resize(outputSize);

+    activation = Layers::Activation(activationType, outputSize);
+
     initializeWeights();
     initializeBiases();
@@ -69,22 +71,7 @@ float* Layers::Dense::forward(const float* d_input) {
         d_biases, d_output, d_output, outputSize
     );

-    switch (activation) {
-        case SIGMOID:
-            Kernels::sigmoid<<<biasGridSize, BLOCK_SIZE>>>(
-                d_output, d_output, outputSize
-            );
-            break;
-
-        case RELU:
-            Kernels::relu<<<biasGridSize, BLOCK_SIZE>>>(
-                d_output, d_output, outputSize
-            );
-            break;
-
-        default:
-            break;
-    }
+    activation.activate(d_output);

     CUDA_CHECK(cudaDeviceSynchronize());
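Dense follows the same pattern: it now takes an ActivationType, builds an Activation sized to outputSize in its constructor, and forward delegates to activation.activate. A hypothetical classifier head (sizes illustrative, enum qualification assumed):

    // Illustrative: a 784 -> 10 layer whose outputs are softmax-normalized.
    CUDANet::Layers::Dense fc(784, 10, CUDANet::Layers::SOFTMAX);
    float* d_probs = fc.forward(d_input);  // d_input: 784 floats on device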