Restructure cuda backend

commit f8220f0ec1
parent 65727dfee8
2024-09-05 22:23:47 +02:00
19 changed files with 69 additions and 16 deletions


@@ -20,6 +20,7 @@ if(USE_CUDA)
 endif()
 
 file(GLOB_RECURSE CPU_SOURCES
+    src/layers/*.cpp
     src/model/*.cpp
 )
@@ -27,10 +28,11 @@ set(LIBRARY_SOURCES ${CPU_SOURCES})
 if(USE_CUDA)
     file(GLOB_RECURSE CUDA_SOURCES
-        src/*.cu
-        src/cuda/utils/*.cu
-        src/cuda/kernels/*.cu
-        src/cuda/layers/*.cu
+        src/backends/cuda/*.cu
+        src/backends/cuda/utils/*.cu
+        src/backends/cuda/kernels/*.cu
+        src/backends/cuda/layers/*.cu
+        src/layers/*.cu # To be removed
     )
     set(LIBRARY_SOURCES ${LIBRARY_SOURCES} ${CUDA_SOURCES})
 endif()


@@ -25,13 +25,23 @@ class Add {
      * @param d_inputB Device pointer to the second input
      *
      */
-    void forward(const float* d_inputA, const float* d_inputB);
+    float* forward(const float* inputA, const float* inputB);
 
   private:
     int inputSize;
+    float* output;
+
+    float* forwardCPU(const float* inputA, const float* inputB);
+
+#ifdef USE_CUDA
+    float* d_output;
     int gridSize;
-    float* d_output;
+    float* forwardCUDA(const float* d_inputA, const float* d_inputB);
+
+    void initCUDA();
+    void delCUDA();
+#endif
 };
 
 } // namespace CUDANet::Layers
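
Assembled from the hunk above, the resulting layer header plausibly reads as follows. Only part of the file appears in the diff, so the include guard, constructor/destructor declarations, and comments here are assumptions filled in from the accompanying add.cpp and add.cu changes.

// add.hpp -- sketch assembled from the diff; guards and comments are assumed.
#ifndef CUDANET_LAYERS_ADD_HPP
#define CUDANET_LAYERS_ADD_HPP

namespace CUDANet::Layers {

class Add {
  public:
    Add(int inputSize);
    ~Add();

    // Dispatches at compile time to the CPU or CUDA implementation and
    // returns a pointer to the layer-owned output buffer.
    float* forward(const float* inputA, const float* inputB);

  private:
    int inputSize;
    float* output;  // host-side output buffer

    float* forwardCPU(const float* inputA, const float* inputB);

#ifdef USE_CUDA
    float* d_output;  // device-side output buffer
    int gridSize;

    float* forwardCUDA(const float* d_inputA, const float* d_inputB);

    void initCUDA();
    void delCUDA();
#endif
};

} // namespace CUDANet::Layers

#endif // CUDANET_LAYERS_ADD_HPP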


@@ -1,26 +1,21 @@
-#include "add.cuh"
+#include "add.hpp"
 #include "matmul.cuh"
 #include "cuda_helper.cuh"
 
 using namespace CUDANet::Layers;
 
-Add::Add(int inputSize)
-    : inputSize(inputSize) {
+void Add::initCUDA() {
     d_output = nullptr;
     CUDA_CHECK(cudaMalloc((void**)&d_output, sizeof(float) * inputSize));
 
     gridSize = (inputSize + BLOCK_SIZE - 1) / BLOCK_SIZE;
 }
 
-Add::~Add() {
+void Add::delCUDA() {
     cudaFree(d_output);
 }
 
-void Add::forward(const float* d_inputA, const float* d_inputB) {
+float* Add::forwardCUDA(const float* d_inputA, const float* d_inputB) {
     Kernels::vec_vec_add<<<gridSize, BLOCK_SIZE>>>(
         d_inputA, d_inputB, d_output, inputSize
@@ -28,4 +23,6 @@ void Add::forward(const float* d_inputA, const float* d_inputB) {
     CUDA_CHECK(cudaGetLastError());
     CUDA_CHECK(cudaDeviceSynchronize());
+
+    return d_output;
 }
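
forwardCUDA launches Kernels::vec_vec_add with gridSize = ceil(inputSize / BLOCK_SIZE) blocks of BLOCK_SIZE threads; the kernel itself is pulled in via matmul.cuh and is not part of this diff. A minimal sketch of such an element-wise add kernel, purely as an assumption of what it roughly does (the kernel in the repository may differ):

// Hypothetical sketch of an element-wise add kernel with the signature used
// above; the real CUDANet::Kernels::vec_vec_add is defined elsewhere.
namespace CUDANet::Kernels {

__global__ void vec_vec_add(const float* a, const float* b, float* out, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {  // guard the partially filled last block
        out[i] = a[i] + b[i];
    }
}

} // namespace CUDANet::Kernels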

src/layers/add.cpp (new file, 44 lines)

@@ -0,0 +1,44 @@
+#include "add.hpp"
+
+#include <stddef.h>
+
+using namespace CUDANet::Layers;
+
+Add::Add(int inputSize)
+    : inputSize(inputSize) {
+    output = new float[inputSize];
+
+#ifdef USE_CUDA
+    initCUDA();
+#endif
+}
+
+Add::~Add() {
+#ifdef USE_CUDA
+    delCUDA();
+#endif
+}
+
+float* Add::forward(const float* inputA, const float* inputB) {
+#ifdef USE_CUDA
+    return forwardCUDA(inputA, inputB);
+#else
+    return forwardCPU(inputA, inputB);
+#endif
+}
+
+float* Add::forwardCPU(const float* inputA, const float* inputB) {
+    for (size_t i = 0; i < inputSize; i++)
+    {
+        output[i] = inputA[i] + inputB[i];
+    }
+
+    return output;
+}
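
The new add.cpp owns the dispatch: the constructor allocates the host output buffer and, when USE_CUDA is defined, also the device buffer via initCUDA(); forward() then routes to forwardCUDA() or forwardCPU(). A hypothetical caller-side sketch under that split; buffer names and fill values are illustrative and not taken from the repository:

// Hypothetical usage sketch: device pointers with CUDA, host pointers without.
#include "add.hpp"

#ifdef USE_CUDA
#include <cuda_runtime.h>
#endif

int main() {
    const int n = 1024;
    CUDANet::Layers::Add add(n);

#ifdef USE_CUDA
    // With CUDA enabled, forward() expects device pointers and returns the
    // layer-owned device buffer.
    float* d_a;
    float* d_b;
    cudaMalloc((void**)&d_a, n * sizeof(float));
    cudaMalloc((void**)&d_b, n * sizeof(float));
    // ... fill d_a / d_b, e.g. with cudaMemcpy from host arrays ...
    float* d_sum = add.forward(d_a, d_b);
    (void)d_sum;
    cudaFree(d_a);
    cudaFree(d_b);
#else
    // Without CUDA, forward() takes host pointers and returns the
    // layer-owned host buffer filled by forwardCPU().
    float a[n], b[n];
    for (int i = 0; i < n; ++i) { a[i] = 1.0f; b[i] = 2.0f; }
    float* sum = add.forward(a, b);
    (void)sum;
#endif
    return 0;
}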