// Source file: CUDANet/src/kernels/activation_functions.cu (file-viewer listing: 79 lines, 2.0 KiB, plaintext)

#include <functional>
#include "activation_functions.cuh"
#include "cuda_helper.cuh"
/**
 * Element-wise logistic sigmoid: dst[i] = 1 / (1 + e^(-src[i])).
 *
 * Uses a grid-stride loop, so any 1D launch configuration covers all
 * `len` elements.
 *
 * @param src  Input values (read-only), at least `len` elements.
 * @param dst  Output values, at least `len` elements.
 * @param len  Number of elements to process.
 */
__global__ void CUDANet::Kernels::sigmoid(
    const float* __restrict__ src,
    float* __restrict__ dst,
    const unsigned int len
) {
    // size_t indices avoid the signed/unsigned comparison against `len`
    // and cannot overflow when `i += stride` steps past the end.
    const size_t stride = static_cast<size_t>(gridDim.x) * blockDim.x;
    const size_t tid =
        static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;

    for (size_t i = tid; i < len; i += stride) {
        // Float literals + expf keep the whole expression in single
        // precision; bare `1.0` / `exp` would promote it to double.
        dst[i] = 1.0f / (1.0f + expf(-src[i]));
    }
}
/**
 * Element-wise ReLU: dst[i] = 0 when src[i] is negative, otherwise src[i].
 *
 * Uses a grid-stride loop, so any 1D launch configuration covers all
 * `len` elements.
 *
 * @param src  Input values (read-only), at least `len` elements.
 * @param dst  Output values, at least `len` elements.
 * @param len  Number of elements to process.
 */
__global__ void CUDANet::Kernels::relu(
    const float* __restrict__ src,
    float* __restrict__ dst,
    const unsigned int len
) {
    // size_t indices avoid the signed/unsigned comparison against `len`
    // and cannot overflow when `i += stride` steps past the end.
    const size_t stride = static_cast<size_t>(gridDim.x) * blockDim.x;
    const size_t tid =
        static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;

    for (size_t i = tid; i < len; i += stride) {
        const float value = src[i];
        // Float literals avoid promoting the compare/select to double.
        // The `<` form (rather than fmaxf) keeps NaN inputs passing through
        // unchanged, as the comparison is false for NaN.
        dst[i] = value < 0.0f ? 0.0f : value;
    }
}
/**
 * Element-wise exponential: dst[i] = e^(src[i]).
 *
 * Uses a grid-stride loop, so any 1D launch configuration covers all
 * `len` elements. No max-subtraction is applied here, so large inputs
 * can overflow to +inf.
 *
 * @param src  Input values (read-only), at least `len` elements.
 * @param dst  Output values, at least `len` elements.
 * @param len  Number of elements to process.
 */
__global__ void CUDANet::Kernels::softmax_exp(
    const float* __restrict__ src,
    float* __restrict__ dst,
    const unsigned int len
) {
    // size_t indices avoid the signed/unsigned comparison against `len`
    // and cannot overflow when `i += stride` steps past the end.
    const size_t stride = static_cast<size_t>(gridDim.x) * blockDim.x;
    const size_t tid =
        static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;

    for (size_t i = tid; i < len; i += stride) {
        // Explicit single-precision exponential.
        dst[i] = expf(src[i]);
    }
}
/**
 * Per-block sum reduction.
 *
 * Each block adds up to 2 * blockDim.x consecutive elements of `d_vector`
 * (every thread loads two elements that are blockDim.x apart), reduces them
 * through shared memory, and thread 0 writes the block's partial sum to
 * d_output[blockIdx.x].
 *
 * Launch requirements visible from the code:
 *  - blockDim.x must not exceed BLOCK_SIZE (size of the shared buffer).
 *  - blockDim.x must be a power of two, otherwise the halving loop skips
 *    some shared-memory slots.
 *
 * @param d_vector  Values to sum (read-only).
 * @param d_output  One partial sum per block; at least gridDim.x elements.
 * @param w         Upper bound on readable indices of `d_vector`; indices
 *                  >= w contribute 0 instead of being read.
 *                  NOTE(review): this treats `w` as the element count of
 *                  `d_vector` (it was previously unused, and loads were
 *                  unguarded) — confirm against the callers.
 */
__global__ void CUDANet::Kernels::softmax_sum(
    const float* __restrict__ d_vector,
    float* __restrict__ d_output,
    const unsigned int w
) {
    __shared__ float partial_sum[BLOCK_SIZE];

    // Indices of the two elements this thread folds together on load.
    const size_t first =
        static_cast<size_t>(blockIdx.x) * blockDim.x * 2 + threadIdx.x;
    const size_t second = first + blockDim.x;

    // Guard both loads so a vector shorter than 2 * blockDim.x * gridDim.x
    // is not read out of bounds; missing elements count as zero.
    float acc = 0.0f;
    if (first < w) {
        acc += d_vector[first];
    }
    if (second < w) {
        acc += d_vector[second];
    }
    partial_sum[threadIdx.x] = acc;
    __syncthreads();

    // Tree reduction in shared memory: halve the active range each step.
    // The barrier stays outside the `if` so every thread reaches it.
    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (threadIdx.x < s) {
            partial_sum[threadIdx.x] += partial_sum[threadIdx.x + s];
        }
        __syncthreads();
    }

    if (threadIdx.x == 0) {
        d_output[blockIdx.x] = partial_sum[0];
    }
}
/**
 * Element-wise division by a single scalar read from device memory:
 * dst[i] = src[i] / sum[0].
 *
 * Uses a grid-stride loop, so any 1D launch configuration covers all
 * `len` elements. No check is made for sum[0] == 0.
 *
 * @param src  Input values (read-only), at least `len` elements.
 * @param dst  Output values, at least `len` elements.
 * @param sum  Pointer to the divisor; only sum[0] is read.
 * @param len  Number of elements to process.
 */
__global__ void CUDANet::Kernels::softmax_div(
    const float* __restrict__ src,
    float* __restrict__ dst,
    const float* __restrict__ sum,
    const unsigned int len
) {
    // size_t indices avoid the signed/unsigned comparison against `len`
    // and cannot overflow when `i += stride` steps past the end.
    const size_t stride = static_cast<size_t>(gridDim.x) * blockDim.x;
    const size_t tid =
        static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;

    for (size_t i = tid; i < len; i += stride) {
        const float numerator = src[i];
        dst[i] = numerator / sum[0];
    }
}