mirror of
https://github.com/lordmathis/CUDANet.git
synced 2025-11-05 17:34:21 +00:00
79 lines
2.0 KiB
Plaintext
79 lines
2.0 KiB
Plaintext
#include <functional>
|
|
|
|
#include "activation_functions.cuh"
|
|
#include "cuda_helper.cuh"
|
|
|
|
/**
 * @brief Element-wise logistic sigmoid: dst[i] = 1 / (1 + exp(-src[i])).
 *
 * Every thread starts at its global 1D index and advances by the total number
 * of launched threads (gridDim.x * blockDim.x), so any 1D launch configuration
 * covers all `len` elements.
 *
 * @param src Input array; elements [0, len) are read
 * @param dst Output array; elements [0, len) are written
 * @param len Number of elements to process
 */
__global__ void CUDANet::Kernels::sigmoid(
    const float* __restrict__ src,
    float* __restrict__ dst,
    const unsigned int len
) {
    // Index arithmetic is done in size_t so neither the start index nor the
    // loop increment can overflow for any `unsigned int` length.
    const size_t stride = static_cast<size_t>(gridDim.x) * blockDim.x;
    const size_t tid =
        static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;

    for (size_t i = tid; i < len; i += stride) {
        // Float literals and expf keep the whole expression in single
        // precision instead of silently promoting it to double.
        dst[i] = 1.0f / (1.0f + expf(-src[i]));
    }
}
|
|
|
|
/**
 * @brief Element-wise rectified linear unit: negative inputs become 0, all
 *        other values are copied unchanged.
 *
 * Every thread starts at its global 1D index and advances by the total number
 * of launched threads (gridDim.x * blockDim.x), so any 1D launch configuration
 * covers all `len` elements.
 *
 * @param src Input array; elements [0, len) are read
 * @param dst Output array; elements [0, len) are written
 * @param len Number of elements to process
 */
__global__ void CUDANet::Kernels::relu(
    const float* __restrict__ src,
    float* __restrict__ dst,
    const unsigned int len
) {
    // Index arithmetic is done in size_t so neither the start index nor the
    // loop increment can overflow for any `unsigned int` length.
    const size_t stride = static_cast<size_t>(gridDim.x) * blockDim.x;
    const size_t tid =
        static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;

    for (size_t i = tid; i < len; i += stride) {
        const float value = src[i];
        // Float literals avoid a float -> double -> float round trip. The
        // comparison form (rather than fmaxf) keeps NaN inputs as NaN.
        dst[i] = value < 0.0f ? 0.0f : value;
    }
}
|
|
|
|
/**
 * @brief Element-wise exponential: dst[i] = exp(src[i]).
 *
 * The inputs are exponentiated as-is; no shift (e.g. by the maximum) is
 * applied inside this kernel, so large inputs overflow to infinity.
 *
 * Every thread starts at its global 1D index and advances by the total number
 * of launched threads (gridDim.x * blockDim.x), so any 1D launch configuration
 * covers all `len` elements.
 *
 * @param src Input array; elements [0, len) are read
 * @param dst Output array; elements [0, len) are written
 * @param len Number of elements to process
 */
__global__ void CUDANet::Kernels::softmax_exp(
    const float* __restrict__ src,
    float* __restrict__ dst,
    const unsigned int len
) {
    // Index arithmetic is done in size_t so neither the start index nor the
    // loop increment can overflow for any `unsigned int` length.
    const size_t stride = static_cast<size_t>(gridDim.x) * blockDim.x;
    const size_t tid =
        static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;

    for (size_t i = tid; i < len; i += stride) {
        // expf states the single-precision intent explicitly
        dst[i] = expf(src[i]);
    }
}
|
|
|
|
/**
 * @brief Per-block partial sum of a vector.
 *
 * Block `b` adds up the 2 * blockDim.x consecutive elements of `d_vector`
 * starting at index b * blockDim.x * 2 and stores the result in
 * `d_output[b]`. Indices at or beyond `w` contribute 0, so the vector length
 * does not have to be a multiple of 2 * blockDim.x.
 *
 * Preconditions:
 *  - blockDim.x must not exceed BLOCK_SIZE (the size of the shared scratch
 *    array).
 *  - `d_output` must have room for one float per launched block.
 *
 * @param d_vector Input values to sum
 * @param d_output Receives one partial sum per block, at index blockIdx.x
 * @param w        Number of valid elements in `d_vector`
 */
__global__ void CUDANet::Kernels::softmax_sum(
    const float* __restrict__ d_vector,
    float* __restrict__ d_output,
    const unsigned int w
) {
    __shared__ float partial_sum[BLOCK_SIZE];

    const unsigned int tid = threadIdx.x;

    // Each thread loads two elements one block-width apart. Indices are
    // computed in size_t to avoid overflow and are checked against `w` so
    // that the tail of the vector is never read out of bounds.
    const size_t first =
        static_cast<size_t>(blockIdx.x) * blockDim.x * 2 + tid;
    const size_t second = first + blockDim.x;

    const float a = first < w ? d_vector[first] : 0.0f;
    const float b = second < w ? d_vector[second] : 0.0f;

    partial_sum[tid] = a + b;
    __syncthreads();

    // Tree reduction in shared memory. `active` is the number of live partial
    // sums; rounding the half up keeps the middle element when `active` is
    // odd, so block sizes that are not a power of two are reduced correctly.
    // For power-of-two block sizes this is the usual halving scheme.
    for (unsigned int active = blockDim.x; active > 1;) {
        const unsigned int half = (active + 1) / 2;
        if (tid + half < active) {
            partial_sum[tid] += partial_sum[tid + half];
        }
        // Reached by every thread: the loop bounds depend only on blockDim.x
        __syncthreads();
        active = half;
    }

    if (tid == 0) {
        d_output[blockIdx.x] = partial_sum[0];
    }
}
|
|
|
|
/**
 * @brief Divides every element by a single scalar: dst[i] = src[i] / sum[0].
 *
 * Only the first element of `sum` is used as the divisor.
 *
 * Every thread starts at its global 1D index and advances by the total number
 * of launched threads (gridDim.x * blockDim.x), so any 1D launch configuration
 * covers all `len` elements.
 *
 * @param src Input array; elements [0, len) are read
 * @param dst Output array; elements [0, len) are written
 * @param sum Pointer to the divisor; only sum[0] is read
 * @param len Number of elements to process
 */
__global__ void CUDANet::Kernels::softmax_div(
    const float* __restrict__ src,
    float* __restrict__ dst,
    const float* __restrict__ sum,
    const unsigned int len
) {
    // Index arithmetic is done in size_t so neither the start index nor the
    // loop increment can overflow for any `unsigned int` length.
    const size_t stride = static_cast<size_t>(gridDim.x) * blockDim.x;
    const size_t tid =
        static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;

    // Threads with no work exit before touching `sum`, matching the original
    // behaviour of only reading the divisor when an element is processed.
    if (tid >= len) {
        return;
    }

    // The divisor is loop-invariant, so it is read once
    const float divisor = sum[0];

    for (size_t i = tid; i < len; i += stride) {
        dst[i] = src[i] / divisor;
    }
}