mirror of
https://github.com/lordmathis/CUDANet.git
synced 2025-11-06 09:44:28 +00:00
Add toplevel CUDANet namespace
This commit is contained in:
@@ -2,7 +2,7 @@
|
||||
|
||||
#include "activations.cuh"
|
||||
|
||||
__global__ void Kernels::sigmoid(
|
||||
__global__ void CUDANet::Kernels::sigmoid(
|
||||
const float* __restrict__ src,
|
||||
float* __restrict__ dst,
|
||||
int len
|
||||
@@ -15,8 +15,11 @@ __global__ void Kernels::sigmoid(
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void
|
||||
Kernels::relu(const float* __restrict__ src, float* __restrict__ dst, int len) {
|
||||
__global__ void CUDANet::Kernels::relu(
|
||||
const float* __restrict__ src,
|
||||
float* __restrict__ dst,
|
||||
int len
|
||||
) {
|
||||
int stride = gridDim.x * blockDim.x;
|
||||
int tid = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
#include "convolution.cuh"
|
||||
#include <iostream>
|
||||
|
||||
#include "convolution.cuh"
|
||||
|
||||
/*
|
||||
Pads matrix width x height x n_channels to width + 2 * padding x height + 2 *
|
||||
padding x n_channels Matrix is represented as a pointer to a vector
|
||||
@@ -47,13 +48,13 @@ pre-allocated)
|
||||
n: Number of channels in input matrix
|
||||
p: Padding
|
||||
*/
|
||||
__global__ void Kernels::padding(
|
||||
const float* d_input,
|
||||
float* d_padded,
|
||||
int w,
|
||||
int h,
|
||||
int n,
|
||||
int p
|
||||
__global__ void CUDANet::Kernels::padding(
|
||||
const float* __restrict__ d_input,
|
||||
float* __restrict__ d_padded,
|
||||
const unsigned int w,
|
||||
const unsigned int h,
|
||||
const unsigned int n,
|
||||
const unsigned int p
|
||||
) {
|
||||
int tid = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
|
||||
@@ -78,16 +79,16 @@ __global__ void Kernels::padding(
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void Kernels::convolution(
|
||||
const float* d_input,
|
||||
const float* d_kernel,
|
||||
float* d_output,
|
||||
int inputSize,
|
||||
int nChannels,
|
||||
int kernelSize,
|
||||
int stride,
|
||||
int nFilters,
|
||||
int outputSize
|
||||
__global__ void CUDANet::Kernels::convolution(
|
||||
const float* __restrict__ d_input,
|
||||
const float* __restrict__ d_kernel,
|
||||
float* __restrict__ d_output,
|
||||
const unsigned int inputSize,
|
||||
const unsigned int nChannels,
|
||||
const unsigned int kernelSize,
|
||||
const unsigned int stride,
|
||||
const unsigned int nFilters,
|
||||
const unsigned int outputSize
|
||||
) {
|
||||
int tid = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
|
||||
|
||||
@@ -1,14 +1,12 @@
|
||||
#include "cuda_helper.cuh"
|
||||
#include "matmul.cuh"
|
||||
|
||||
#define SHARED_SIZE 128 * 4
|
||||
|
||||
__global__ void Kernels::mat_vec_mul(
|
||||
__global__ void CUDANet::Kernels::mat_vec_mul(
|
||||
const float* __restrict__ d_matrix,
|
||||
const float* __restrict__ d_vector,
|
||||
float* __restrict__ d_output,
|
||||
int w,
|
||||
int h
|
||||
const unsigned int w,
|
||||
const unsigned int h
|
||||
) {
|
||||
int tid = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
|
||||
@@ -16,9 +14,8 @@ __global__ void Kernels::mat_vec_mul(
|
||||
|
||||
float temp = 0.0f;
|
||||
|
||||
#pragma unroll
|
||||
for (unsigned int i = 0; i < (w + BLOCK_SIZE - 1) / BLOCK_SIZE; i++)
|
||||
{
|
||||
#pragma unroll
|
||||
for (unsigned int i = 0; i < (w + BLOCK_SIZE - 1) / BLOCK_SIZE; i++) {
|
||||
if (i * BLOCK_SIZE + threadIdx.x < w) {
|
||||
shared[threadIdx.x] = d_vector[i * BLOCK_SIZE + threadIdx.x];
|
||||
} else {
|
||||
@@ -27,22 +24,22 @@ __global__ void Kernels::mat_vec_mul(
|
||||
|
||||
__syncthreads();
|
||||
|
||||
for (unsigned int j = 0; j < BLOCK_SIZE; j++)
|
||||
{
|
||||
#pragma unroll
|
||||
for (unsigned int j = 0; j < BLOCK_SIZE; j++) {
|
||||
temp += d_matrix[tid * w + i * BLOCK_SIZE + j] * shared[j];
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
|
||||
d_output[tid] = temp;
|
||||
}
|
||||
|
||||
__global__ void Kernels::vec_vec_add(
|
||||
const float* d_vector1,
|
||||
const float* d_vector2,
|
||||
float* d_output,
|
||||
int w
|
||||
__global__ void CUDANet::Kernels::vec_vec_add(
|
||||
const float* __restrict__ d_vector1,
|
||||
const float* __restrict__ d_vector2,
|
||||
float* __restrict__ d_output,
|
||||
const unsigned int w
|
||||
) {
|
||||
int tid = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
if (tid >= w) {
|
||||
@@ -50,3 +47,15 @@ __global__ void Kernels::vec_vec_add(
|
||||
}
|
||||
d_output[tid] = d_vector1[tid] + d_vector2[tid];
|
||||
}
|
||||
|
||||
__global__ void CUDANet::Kernels::reduce_sum(
|
||||
const float* __restrict__ d_vector,
|
||||
float* __restrict__ d_output,
|
||||
const unsigned int w
|
||||
) {
|
||||
int tid = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
|
||||
__shared__ float shared[BLOCK_SIZE];
|
||||
shared[threadIdx.x] = d_vector[tid];
|
||||
__syncthreads();
|
||||
}
|
||||
@@ -7,6 +7,8 @@
|
||||
#include "cuda_helper.cuh"
|
||||
#include "matmul.cuh"
|
||||
|
||||
using namespace CUDANet;
|
||||
|
||||
Layers::Conv2d::Conv2d(
|
||||
int inputSize,
|
||||
int inputChannels,
|
||||
|
||||
@@ -10,6 +10,8 @@
|
||||
#include "dense.cuh"
|
||||
#include "matmul.cuh"
|
||||
|
||||
using namespace CUDANet;
|
||||
|
||||
Layers::Dense::Dense(
|
||||
int inputSize,
|
||||
int outputSize,
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
#include "cuda_helper.cuh"
|
||||
#include "input.cuh"
|
||||
|
||||
using namespace CUDANet;
|
||||
|
||||
Layers::Input::Input(int inputSize) : inputSize(inputSize) {
|
||||
d_output = nullptr;
|
||||
CUDA_CHECK(cudaMalloc((void**)&d_output, sizeof(float) * inputSize));
|
||||
|
||||
Reference in New Issue
Block a user