diff --git a/include/backend.hpp b/include/backend.hpp
index dc71ded..88b2887 100644
--- a/include/backend.hpp
+++ b/include/backend.hpp
@@ -90,6 +90,12 @@ class Backend {
         CUDANet::Tensor& input_b,
         CUDANet::Tensor& output
     ) = 0;
+
+    virtual CUDANet::Tensor& add(
+        CUDANet::Tensor& input_a,
+        CUDANet::Tensor& input_b,
+        CUDANet::Tensor& output
+    ) = 0;
 };
 
 }  // namespace CUDANet
\ No newline at end of file
diff --git a/include/backend/cuda.cuh b/include/backend/cuda.cuh
index 638650b..68d9adb 100644
--- a/include/backend/cuda.cuh
+++ b/include/backend/cuda.cuh
@@ -86,6 +86,12 @@ class CUDA : public Backend {
         CUDANet::Tensor& input_b,
         CUDANet::Tensor& output
     ) override;
+
+    CUDANet::Tensor& add(
+        CUDANet::Tensor& input_a,
+        CUDANet::Tensor& input_b,
+        CUDANet::Tensor& output
+    ) override;
 };
 
 }  // namespace CUDANet::Backend
\ No newline at end of file
diff --git a/include/layers/add.hpp b/include/layers/add.hpp
index 95d1e12..72aa849 100644
--- a/include/layers/add.hpp
+++ b/include/layers/add.hpp
@@ -1,49 +1,24 @@
-#ifndef CUDANET_ADD_LAYER_H
-#define CUDANET_ADD_LAYER_H
+#pragma once
+
+#include "shape.hpp"
+#include "tensor.hpp"
 
 namespace CUDANet::Layers {
 
 class Add {
   public:
-    /**
-     * @brief Create a new Add layer
-     *
-     * @param inputSize Size of the input arrays
-     */
-    Add(int inputSize);
+    Add(CUDANet::Shape a_shape, CUDANet::Shape b_shape, CUDANet::Backend* backend);
 
-    /**
-     * @brief Destroy the Add layer
-     *
-     */
     ~Add();
 
-    /**
-     * @brief Adds first input to second input
-     *
-     * @param d_inputA Device pointer to the first input
-     * @param d_inputB Device pointer to the second input
-     *
-     */
-    float* forward(const float* inputA, const float* inputB);
+    CUDANet::Tensor&
+    forward(CUDANet::Tensor& input_a, CUDANet::Tensor& input_b);
 
   private:
-    int inputSize;
+    CUDANet::Shape out_shape;
+    CUDANet::Tensor output;
 
-    float* output;
-
-    float* forwardCPU(const float* inputA, const float* inputB);
-
-#ifdef USE_CUDA
-    float* d_output;
-    int gridSize;
-
-    float* forwardCUDA(const float* d_inputA, const float* d_inputB);
-    void initCUDA();
-    void delCUDA();
-#endif
+    CUDANet::Backend *backend;
 };
 
 }  // namespace CUDANet::Layers
-
-#endif  // CUDANET_ADD_LAYER_H
\ No newline at end of file
diff --git a/src/backends/cuda/layer_ops.cu b/src/backends/cuda/layer_ops.cu
index 11079ad..69eb374 100644
--- a/src/backends/cuda/layer_ops.cu
+++ b/src/backends/cuda/layer_ops.cu
@@ -211,6 +211,7 @@ CUDANet::Tensor& CUDA::batch_norm(
         );
         CUDA_CHECK(cudaGetLastError());
     }
+    CUDA_CHECK(cudaDeviceSynchronize());
 }
 
 CUDANet::Tensor& CUDA::concat(
@@ -228,6 +229,23 @@ CUDANet::Tensor& CUDA::concat(
         cudaMemcpyDeviceToDevice
     ));
 
+    CUDA_CHECK(cudaGetLastError());
+    CUDA_CHECK(cudaDeviceSynchronize());
+
+    return output;
+}
+
+CUDANet::Tensor& CUDA::add(
+    CUDANet::Tensor& input_a,
+    CUDANet::Tensor& input_b,
+    CUDANet::Tensor& output
+) {
+    auto gridSize = (input_a.numel() + BLOCK_SIZE - 1) / BLOCK_SIZE;
+
+    Kernels::vec_vec_add<<<gridSize, BLOCK_SIZE>>>(
+        input_a.data(), input_b.data(), output.data(), input_a.numel()
+    );
+    CUDA_CHECK(cudaGetLastError());
     CUDA_CHECK(cudaDeviceSynchronize());
 
     return output;
diff --git a/src/backends/cuda/layers/add.cu b/src/backends/cuda/layers/add.cu
deleted file mode 100644
index c6a735d..0000000
--- a/src/backends/cuda/layers/add.cu
+++ /dev/null
@@ -1,28 +0,0 @@
-#include "add.hpp"
-#include "matmul.cuh"
-#include "cuda_helper.cuh"
-
-using namespace CUDANet::Layers;
-
-void Add::initCUDA() {
-    d_output = nullptr;
-    CUDA_CHECK(cudaMalloc((void**)&d_output, sizeof(float) * inputSize));
-
-    gridSize = (inputSize + BLOCK_SIZE - 1) / BLOCK_SIZE;
-}
-
-void Add::delCUDA() {
-    cudaFree(d_output);
-}
-
-float* Add::forwardCUDA(const float* d_inputA, const float* d_inputB) {
-
-    Kernels::vec_vec_add<<<gridSize, BLOCK_SIZE>>>(
-        d_inputA, d_inputB, d_output, inputSize
-    );
-    CUDA_CHECK(cudaGetLastError());
-    CUDA_CHECK(cudaDeviceSynchronize());
-
-    return d_output;
-
-}
diff --git a/src/layers/add.cpp b/src/layers/add.cpp
index 8d101f5..2abbad4 100644
--- a/src/layers/add.cpp
+++ b/src/layers/add.cpp
@@ -1,44 +1,22 @@
 #include "add.hpp"
 
-#include <cstddef>
-
 using namespace CUDANet::Layers;
 
-Add::Add(int inputSize)
-    : inputSize(inputSize) {
-
-    output = new float[inputSize];
-
-#ifdef USE_CUDA
-    initCUDA();
-#endif
-
-}
-
-
-Add::~Add() {
-#ifdef USE_CUDA
-    delCUDA();
-#endif
-}
-
-
-float* Add::forward(const float* inputA, const float* inputB) {
-
-#ifdef USE_CUDA
-    return forwardCUDA(inputA, inputB);
-#else
-    return forwardCPU(inputA, inputB);
-#endif
-
-}
-
-float* Add::forwardCPU(const float* inputA, const float* inputB) {
-    for (size_t i = 0; i < inputSize; i++)
-    {
-        output[i] = inputA[i] + inputB[i];
+Add::Add(CUDANet::Shape a_shape, CUDANet::Shape b_shape, CUDANet::Backend* backend) : backend(backend) {
+    if (a_shape != b_shape) {
+        throw InvalidShapeException(
+            "Add requires matching dimensions", a_shape, b_shape
+        );
     }
-    return output;
-}
\ No newline at end of file
+    out_shape = a_shape;
+    output = CUDANet::Tensor(out_shape, CUDANet::DType::FLOAT32, backend);
+}
+
+Add::~Add() {}
+
+CUDANet::Tensor&
+Add::forward(CUDANet::Tensor& input_a, CUDANet::Tensor& input_b) {
+    output.zero();
+    return backend->add(input_a, input_b, output);
+}
diff --git a/src/layers/concat.cpp b/src/layers/concat.cpp
index cad4ca0..e4f365d 100644
--- a/src/layers/concat.cpp
+++ b/src/layers/concat.cpp
@@ -6,7 +6,7 @@ Concat::Concat(const CUDANet::Shape a_shape, const CUDANet::Shape b_shape, CUDAN
     : a_shape(a_shape), b_shape(b_shape), backend(backend) {
     if (a_shape[0] != b_shape[0] || a_shape[1] != b_shape[1]) {
         throw InvalidShapeException(
-            "Concat requires matching batch and height dimensions", a_shape,
+            "Concat requires matching height and width dimensions", a_shape,
             b_shape
        );
     }
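
Note on the launch configuration in CUDA::add: gridSize = (input_a.numel() + BLOCK_SIZE - 1) / BLOCK_SIZE is a ceiling division, so every element gets a thread even when numel() is not a multiple of BLOCK_SIZE. The real Kernels::vec_vec_add is defined elsewhere in the repository and is not part of this patch; the sketch below only illustrates the conventional kernel shape that this launch math assumes, including the bounds check that masks out the tail threads.

__global__ void vec_vec_add_sketch(
    const float* a, const float* b, float* out, int n
) {
    // One thread per element; gridSize * BLOCK_SIZE >= n by construction,
    // so threads past the end of the buffers must be masked out.
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        out[i] = a[i] + b[i];
    }
}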
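
For review context, a minimal end-to-end sketch of the refactored layer follows. Only the Add constructor, forward(), and the Tensor(shape, dtype, backend) construction are taken from this patch; the default-constructed CUDANet::Backend::CUDA, the brace-initialized CUDANet::Shape, and the include paths are assumptions that may differ from the actual API.

#include "add.hpp"
#include "cuda.cuh"    // assumed include path for Backend::CUDA
#include "shape.hpp"
#include "tensor.hpp"

int main() {
    // Assumed setup; the real backend/shape construction may differ.
    CUDANet::Backend::CUDA backend;
    CUDANet::Shape shape{1, 64, 56, 56};

    CUDANet::Tensor a(shape, CUDANet::DType::FLOAT32, &backend);
    CUDANet::Tensor b(shape, CUDANet::DType::FLOAT32, &backend);

    // Mismatched shapes would make the constructor throw InvalidShapeException.
    CUDANet::Layers::Add add(shape, shape, &backend);

    // Element-wise sum dispatched through Backend::add; the layer owns the
    // output tensor and returns it by reference.
    CUDANet::Tensor& sum = add.forward(a, b);
    (void)sum;
    return 0;
}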