diff --git a/include/backend/backend.hpp b/include/backend/backend.hpp index 473a7b9..6c91573 100644 --- a/include/backend/backend.hpp +++ b/include/backend/backend.hpp @@ -13,9 +13,10 @@ public: virtual void* allocate(size_t bytes) = 0; virtual void deallocate(void* ptr) = 0; - virtual void copyToDevice(void* devicePtr, const void* hostPtr, size_t bytes) = 0; - virtual void copyToHost(void* hostPtr, const void* devicePtr, size_t bytes) = 0; - + // Layer operations + virtual void relu(CUDANet::Backend::Tensor &tensor) = 0; + virtual void sigmoid(CUDANet::Backend::Tensor &tensor) = 0; + virtual void softmax(CUDANet::Backend::Tensor &tensor, CUDANet::Backend::Tensor &temp_max, CUDANet::Backend::Tensor &temp_sum) = 0; }; } // namespace CUDANet::Backend \ No newline at end of file diff --git a/include/backend/cuda_backend.cuh b/include/backend/cuda_backend.cuh new file mode 100644 index 0000000..71e0b66 --- /dev/null +++ b/include/backend/cuda_backend.cuh @@ -0,0 +1,23 @@ +#pragma once + +#include "backend/backend.hpp" +#include "backend/tensor.hpp" + +namespace CUDANet::Backend { + +class CUDABackend : public IBackend { +public: + // Memory management + void* allocate(size_t bytes) override; + void deallocate(void* ptr) override; + + // Layer operations + void relu(CUDANet::Backend::Tensor &tensor) override; + void sigmoid(CUDANet::Backend::Tensor &tensor) override; + void softmax(CUDANet::Backend::Tensor &tensor, CUDANet::Backend::Tensor &temp_max, CUDANet::Backend::Tensor &temp_sum) override; + +private: + static constexpr int BLOCK_SIZE = 256; +}; + +} // namespace CUDANet::Backend \ No newline at end of file diff --git a/include/backend/tensor.hpp b/include/backend/tensor.hpp index 87c2e04..9960a84 100644 --- a/include/backend/tensor.hpp +++ b/include/backend/tensor.hpp @@ -18,6 +18,8 @@ typedef std::vector Shape; class Tensor { public: + + Tensor() = default; Tensor(Shape shape, DType dtype, IBackend* backend); ~Tensor(); @@ -27,6 +29,10 @@ public: void toDevice(const void* hostPtr); void toHost(void* hostPtr); + size_t size() const; + size_t numel() const; + void* data() const; + private: Shape shape; DType dtype; diff --git a/include/layers/activation.hpp b/include/layers/activation.hpp index c157225..49787cf 100644 --- a/include/layers/activation.hpp +++ b/include/layers/activation.hpp @@ -1,5 +1,7 @@ -#ifndef CUDANET_ACTIVATION_H -#define CUDANET_ACTIVATION_H +#pragma once + +#include "backend/tensor.hpp" +#include "backend/backend.hpp" namespace CUDANet::Layers { @@ -41,29 +43,16 @@ class Activation { * * @param d_input Pointer to the input vector on the device */ - void activate(float* d_input); + void activate(CUDANet::Backend::Tensor input); private: + CUDANet::Backend::IBackend* backend; ActivationType activationType; int length; - void activateCPU(float* input); - -#ifdef USE_CUDA - int gridSize; - - float* d_softmax_sum; - float* d_max; - - void activateCUDA(float* d_input); - - void initCUDA(); - void delCUDA(); -#endif + CUDANet::Backend::Tensor softmax_sum; + CUDANet::Backend::Tensor tensor_max; }; - } // namespace CUDANet::Layers - -#endif // CUDANET_ACTIVATION_H \ No newline at end of file diff --git a/src/backends/cuda/cuda_backend.cu b/src/backends/cuda/cuda_backend.cu new file mode 100644 index 0000000..19a57a2 --- /dev/null +++ b/src/backends/cuda/cuda_backend.cu @@ -0,0 +1,69 @@ +#include "backend/cuda_backend.cuh" +#include "utils/cuda_helper.cuh" +#include "kernels/activation_functions.cuh" +#include "kernels/matmul.cuh" +#include "utils/vector.cuh" + +using namespace CUDANet::Backend; + +void *CUDABackend::allocate(size_t bytes) { + void* devicePtr = nullptr; + CUDA_CHECK(cudaMalloc(&devicePtr, bytes)); + return devicePtr; +} + +void CUDABackend::deallocate(void* ptr) { + CUDA_CHECK(cudaFree(ptr)); +} + +// void CUDABackend::copyToDevice(void* devicePtr, const void* hostPtr, size_t bytes) { +// CUDA_CHECK(cudaMemcpy(devicePtr, hostPtr, bytes, cudaMemcpyHostToDevice)); +// CUDA_CHECK(cudaDeviceSynchronize()); +// } + +// void CUDABackend::copyToHost(void* hostPtr, const void* devicePtr, size_t bytes) { +// CUDA_CHECK(cudaMemcpy(hostPtr, devicePtr, bytes, cudaMemcpyDeviceToHost)); +// CUDA_CHECK(cudaDeviceSynchronize()); +// } + +void CUDABackend::relu(Tensor &tensor) { + int gridSize = (tensor.numel() + BLOCK_SIZE - 1) / BLOCK_SIZE; + Kernels::relu<<>>((float*)tensor.data(), (float*)tensor.data(), tensor.numel()); + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaDeviceSynchronize()); +} + +void CUDABackend::sigmoid(Tensor &tensor) { + int gridSize = (tensor.numel() + BLOCK_SIZE - 1) / BLOCK_SIZE; + Kernels::sigmoid<<>>((float*)tensor.data(), (float*)tensor.data(), tensor.numel()); + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaDeviceSynchronize()); +} + +void CUDABackend::softmax(Tensor &tensor, Tensor &temp_max, Tensor &temp_sum) { + int gridSize = (tensor.numel() + BLOCK_SIZE - 1) / BLOCK_SIZE; + + // Find max value + Utils::max(tensor, temp_max, tensor.numel()); + + // Subtract max value to improve numerical stability + Kernels::vec_scalar_sub<<>>( + (float*)tensor.data(), (float*)tensor.data(), (float*)temp_max.data(), tensor.numel() + ); + CUDA_CHECK(cudaGetLastError()); + + // Compute exponentials + Kernels::vec_exp<<>>( + (float*)tensor.data(), (float*)tensor.data(), tensor.numel() + ); + CUDA_CHECK(cudaGetLastError()); + + // Find sum + Utils::sum(tensor, temp_sum, tensor.numel()); + + Kernels::vec_scalar_div<<>>( + (float*)tensor.data(), (float*)tensor.data(), (float*)temp_sum.data(), tensor.numel() + ); + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaDeviceSynchronize()); +} diff --git a/src/backends/tensor.cpp b/src/backends/tensor.cpp index 9413b5b..77ea2a1 100644 --- a/src/backends/tensor.cpp +++ b/src/backends/tensor.cpp @@ -1,3 +1,5 @@ +#include + #include "backend/tensor.hpp" using namespace CUDANet::Backend; @@ -9,3 +11,29 @@ Tensor::~Tensor() { deallocate(); } +size_t Tensor::numel() const { + size_t totalElements = 1; + for (const auto& dim : shape) { + totalElements *= dim; + } + return totalElements; +} + +size_t Tensor::size() const { + size_t totalSize = numel(); + + size_t typeSize = 0; + switch (dtype) { + case DType::FLOAT32: + typeSize = 4; + break; + default: + throw std::runtime_error("Unsupported data type"); + } + + return totalSize * typeSize; +} + +void* Tensor::data() const { + return devicePtr; +} \ No newline at end of file diff --git a/src/layers/activation.cpp b/src/layers/activation.cpp index 08a864d..64f7590 100644 --- a/src/layers/activation.cpp +++ b/src/layers/activation.cpp @@ -2,30 +2,33 @@ #include #include "activation.hpp" +#include "backend/tensor.hpp" using namespace CUDANet::Layers; Activation::Activation(ActivationType activation, const int length) : activationType(activation), length(length) { -#ifdef USE_CUDA - initCUDA(); -#endif + + + if (activationType == SOFTMAX) { + softmax_sum = CUDANet::Backend::Tensor({static_cast(length)}, CUDANet::Backend::DType::FLOAT32, nullptr); + tensor_max = CUDANet::Backend::Tensor({static_cast(length)}, CUDANet::Backend::DType::FLOAT32, nullptr); + } } -Activation::~Activation() { -#ifdef USE_CUDA - delCUDA(); -#endif -} - -void Activation::activateCPU(float* input) { - throw std::logic_error("Not implemented"); -} - -void Activation::activate(float* input) { -#ifdef USE_CUDA - activateCUDA(input); -#else - activateCPU(input); -#endif +void Activation::activate(CUDANet::Backend::Tensor input) { + switch (activationType) + { + case ActivationType::SIGMOID: + backend->sigmoid(input); + break; + case ActivationType::RELU: + /* code */ + break; + case ActivationType::SOFTMAX: + /* code */ + break; + default: + break; + } } \ No newline at end of file