Add toplevel CUDANet namespace

This commit is contained in:
2024-03-17 16:08:53 +01:00
parent dc86cddeb7
commit 0c22fac64e
19 changed files with 183 additions and 149 deletions

View File

@@ -2,7 +2,7 @@
#include "activations.cuh"
__global__ void Kernels::sigmoid(
__global__ void CUDANet::Kernels::sigmoid(
const float* __restrict__ src,
float* __restrict__ dst,
int len
@@ -15,8 +15,11 @@ __global__ void Kernels::sigmoid(
}
}
__global__ void
Kernels::relu(const float* __restrict__ src, float* __restrict__ dst, int len) {
__global__ void CUDANet::Kernels::relu(
const float* __restrict__ src,
float* __restrict__ dst,
int len
) {
int stride = gridDim.x * blockDim.x;
int tid = blockDim.x * blockIdx.x + threadIdx.x;

View File

@@ -1,6 +1,7 @@
#include "convolution.cuh"
#include <iostream>
#include "convolution.cuh"
/*
Pads matrix width x height x n_channels to width + 2 * padding x height + 2 *
padding x n_channels Matrix is represented as a pointer to a vector
@@ -47,13 +48,13 @@ pre-allocated)
n: Number of channels in input matrix
p: Padding
*/
__global__ void Kernels::padding(
const float* d_input,
float* d_padded,
int w,
int h,
int n,
int p
__global__ void CUDANet::Kernels::padding(
const float* __restrict__ d_input,
float* __restrict__ d_padded,
const unsigned int w,
const unsigned int h,
const unsigned int n,
const unsigned int p
) {
int tid = blockDim.x * blockIdx.x + threadIdx.x;
@@ -78,16 +79,16 @@ __global__ void Kernels::padding(
}
}
__global__ void Kernels::convolution(
const float* d_input,
const float* d_kernel,
float* d_output,
int inputSize,
int nChannels,
int kernelSize,
int stride,
int nFilters,
int outputSize
__global__ void CUDANet::Kernels::convolution(
const float* __restrict__ d_input,
const float* __restrict__ d_kernel,
float* __restrict__ d_output,
const unsigned int inputSize,
const unsigned int nChannels,
const unsigned int kernelSize,
const unsigned int stride,
const unsigned int nFilters,
const unsigned int outputSize
) {
int tid = blockDim.x * blockIdx.x + threadIdx.x;

View File

@@ -1,14 +1,12 @@
#include "cuda_helper.cuh"
#include "matmul.cuh"
#define SHARED_SIZE 128 * 4
__global__ void Kernels::mat_vec_mul(
__global__ void CUDANet::Kernels::mat_vec_mul(
const float* __restrict__ d_matrix,
const float* __restrict__ d_vector,
float* __restrict__ d_output,
int w,
int h
const unsigned int w,
const unsigned int h
) {
int tid = blockDim.x * blockIdx.x + threadIdx.x;
@@ -16,9 +14,8 @@ __global__ void Kernels::mat_vec_mul(
float temp = 0.0f;
#pragma unroll
for (unsigned int i = 0; i < (w + BLOCK_SIZE - 1) / BLOCK_SIZE; i++)
{
#pragma unroll
for (unsigned int i = 0; i < (w + BLOCK_SIZE - 1) / BLOCK_SIZE; i++) {
if (i * BLOCK_SIZE + threadIdx.x < w) {
shared[threadIdx.x] = d_vector[i * BLOCK_SIZE + threadIdx.x];
} else {
@@ -27,22 +24,22 @@ __global__ void Kernels::mat_vec_mul(
__syncthreads();
for (unsigned int j = 0; j < BLOCK_SIZE; j++)
{
#pragma unroll
for (unsigned int j = 0; j < BLOCK_SIZE; j++) {
temp += d_matrix[tid * w + i * BLOCK_SIZE + j] * shared[j];
}
__syncthreads();
}
d_output[tid] = temp;
}
__global__ void Kernels::vec_vec_add(
const float* d_vector1,
const float* d_vector2,
float* d_output,
int w
__global__ void CUDANet::Kernels::vec_vec_add(
const float* __restrict__ d_vector1,
const float* __restrict__ d_vector2,
float* __restrict__ d_output,
const unsigned int w
) {
int tid = blockDim.x * blockIdx.x + threadIdx.x;
if (tid >= w) {
@@ -50,3 +47,15 @@ __global__ void Kernels::vec_vec_add(
}
d_output[tid] = d_vector1[tid] + d_vector2[tid];
}
/*
Partial sum reduction over a vector.

Each block reduces its BLOCK_SIZE-wide slice of d_vector in shared memory
and writes one partial sum to d_output[blockIdx.x]; the caller is expected
to reduce the per-block partials (e.g. by relaunching this kernel over
d_output) — TODO confirm against the call site.

Preconditions (assumed, hedged): launched with blockDim.x == BLOCK_SIZE and
BLOCK_SIZE a power of two, as the halving loop below requires.

d_vector: Input vector (device)
d_output: Per-block partial sums (device, length >= gridDim.x)
w:        Number of elements in d_vector
*/
__global__ void CUDANet::Kernels::reduce_sum(
    const float* __restrict__ d_vector,
    float* __restrict__ d_output,
    const unsigned int w
) {
    unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x;

    __shared__ float shared[BLOCK_SIZE];

    // Guard the tail: the grid rarely divides w evenly, and the original
    // unconditionally read d_vector[tid] (out-of-bounds when tid >= w).
    // 0.0f is the identity for summation, so padding is harmless.
    shared[threadIdx.x] = (tid < w) ? d_vector[tid] : 0.0f;
    __syncthreads();

    // Tree reduction in shared memory (the original stopped after the
    // load + barrier and never produced a result). The barrier sits
    // outside the divergent branch so every thread reaches it.
    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (threadIdx.x < s) {
            shared[threadIdx.x] += shared[threadIdx.x + s];
        }
        __syncthreads();
    }

    // Thread 0 publishes this block's partial sum.
    if (threadIdx.x == 0) {
        d_output[blockIdx.x] = shared[0];
    }
}

View File

@@ -7,6 +7,8 @@
#include "cuda_helper.cuh"
#include "matmul.cuh"
using namespace CUDANet;
Layers::Conv2d::Conv2d(
int inputSize,
int inputChannels,

View File

@@ -10,6 +10,8 @@
#include "dense.cuh"
#include "matmul.cuh"
using namespace CUDANet;
Layers::Dense::Dense(
int inputSize,
int outputSize,

View File

@@ -1,6 +1,8 @@
#include "cuda_helper.cuh"
#include "input.cuh"
using namespace CUDANet;
Layers::Input::Input(int inputSize) : inputSize(inputSize) {
d_output = nullptr;
CUDA_CHECK(cudaMalloc((void**)&d_output, sizeof(float) * inputSize));