diff --git a/include/backend/tensor.hpp b/include/backend/tensor.hpp
index f327a4f..0586f25 100644
--- a/include/backend/tensor.hpp
+++ b/include/backend/tensor.hpp
@@ -23,9 +23,6 @@ public:
     Tensor(Shape shape, DType dtype, IBackend* backend);
     ~Tensor();
 
-    void* allocate();
-    void deallocate();
-
     size_t size() const;
     size_t numel() const;
 
@@ -38,6 +35,10 @@ public:
 private:
     Shape shape;
     DType dtype;
+
+    size_t total_elms;
+    size_t total_size;
+
     IBackend* backend;
     void* d_ptr;
 };
diff --git a/src/backends/cuda/cuda_backend.cu b/src/backends/cuda/cuda_backend.cu
index 3201140..6e47316 100644
--- a/src/backends/cuda/cuda_backend.cu
+++ b/src/backends/cuda/cuda_backend.cu
@@ -1,60 +1,39 @@
-#include "backend/cuda_backend.cuh"
-#include "utils/cuda_helper.cuh"
-#include "kernels/activation_functions.cuh"
-#include "kernels/matmul.cuh"
-#include "utils/vector.cuh"
+#include <cuda_runtime.h>
+
+#include <cstddef>
+#include <cstdio>
+#include <cstdlib>
+
+#include "backend/cuda.cuh"
+
+cudaDeviceProp initializeCUDA() {
+    int deviceCount;
+    CUDA_CHECK(cudaGetDeviceCount(&deviceCount));
+
+    if (deviceCount == 0) {
+        std::fprintf(stderr, "No CUDA devices found. Exiting.\n");
+        std::exit(EXIT_FAILURE);
+    }
+
+    int device = 0;
+    CUDA_CHECK(cudaSetDevice(device));
+
+    cudaDeviceProp deviceProp;
+    CUDA_CHECK(cudaGetDeviceProperties(&deviceProp, device));
+
+    std::printf("Using CUDA device %d: %s\n", device, deviceProp.name);
+
+    return deviceProp;
+}
 
 using namespace CUDANet::Backend;
 
-void *CUDABackend::allocate(size_t bytes) {
-    void* devicePtr = nullptr;
-    CUDA_CHECK(cudaMalloc(&devicePtr, bytes));
-    return devicePtr;
+void* CUDABackend::allocate(size_t bytes) {
+    void* d_ptr = nullptr;
+    CUDA_CHECK(cudaMalloc(&d_ptr, bytes));
+    return d_ptr;
 }
 
 void CUDABackend::deallocate(void* ptr) {
     CUDA_CHECK(cudaFree(ptr));
 }
-
-
-void CUDABackend::relu(Tensor &tensor) {
-    int gridSize = (tensor.numel() + BLOCK_SIZE - 1) / BLOCK_SIZE;
-    Kernels::relu<<<gridSize, BLOCK_SIZE>>>((float*)tensor.data(), (float*)tensor.data(), tensor.numel());
-    CUDA_CHECK(cudaGetLastError());
-    CUDA_CHECK(cudaDeviceSynchronize());
-}
-
-void CUDABackend::sigmoid(Tensor &tensor) {
-    int gridSize = (tensor.numel() + BLOCK_SIZE - 1) / BLOCK_SIZE;
-    Kernels::sigmoid<<<gridSize, BLOCK_SIZE>>>((float*)tensor.data(), (float*)tensor.data(), tensor.numel());
-    CUDA_CHECK(cudaGetLastError());
-    CUDA_CHECK(cudaDeviceSynchronize());
-}
-
-void CUDABackend::softmax(Tensor &tensor, Tensor &temp_max, Tensor &temp_sum) {
-    int gridSize = (tensor.numel() + BLOCK_SIZE - 1) / BLOCK_SIZE;
-
-    // Find max value
-    Utils::max(tensor, temp_max, tensor.numel());
-
-    // Subtract max value to improve numerical stability
-    Kernels::vec_scalar_sub<<<gridSize, BLOCK_SIZE>>>(
-        (float*)tensor.data(), (float*)tensor.data(), (float*)temp_max.data(), tensor.numel()
-    );
-    CUDA_CHECK(cudaGetLastError());
-
-    // Compute exponentials
-    Kernels::vec_exp<<<gridSize, BLOCK_SIZE>>>(
-        (float*)tensor.data(), (float*)tensor.data(), tensor.numel()
-    );
-    CUDA_CHECK(cudaGetLastError());
-
-    // Find sum
-    Utils::sum(tensor, temp_sum, tensor.numel());
-
-    Kernels::vec_scalar_div<<<gridSize, BLOCK_SIZE>>>(
-        (float*)tensor.data(), (float*)tensor.data(), (float*)temp_sum.data(), tensor.numel()
-    );
-    CUDA_CHECK(cudaGetLastError());
-    CUDA_CHECK(cudaDeviceSynchronize());
-}
diff --git a/src/backends/cuda/layer_ops.cu b/src/backends/cuda/layer_ops.cu
new file mode 100644
index 0000000..9b70f4e
--- /dev/null
+++ b/src/backends/cuda/layer_ops.cu
@@ -0,0 +1,48 @@
+#include "backend/cuda.cuh"
+#include "utils/cuda_helper.cuh"
+#include "kernels/activation_functions.cuh"
+#include "kernels/matmul.cuh"
+
+using namespace CUDANet::Backend;
+
+void CUDABackend::relu(Tensor &tensor) {
+    int gridSize = (tensor.numel() + BLOCK_SIZE - 1) / BLOCK_SIZE;
+    Kernels::relu<<<gridSize, BLOCK_SIZE>>>(tensor.data<float>(), tensor.data<float>(), tensor.numel());
+    CUDA_CHECK(cudaGetLastError());
+    CUDA_CHECK(cudaDeviceSynchronize());
+}
+
+void CUDABackend::sigmoid(Tensor &tensor) {
+    int gridSize = (tensor.numel() + BLOCK_SIZE - 1) / BLOCK_SIZE;
+    Kernels::sigmoid<<<gridSize, BLOCK_SIZE>>>(tensor.data<float>(), tensor.data<float>(), tensor.numel());
+    CUDA_CHECK(cudaGetLastError());
+    CUDA_CHECK(cudaDeviceSynchronize());
+}
+
+void CUDABackend::softmax(Tensor &tensor, Tensor &temp_max, Tensor &temp_sum) {
+    int gridSize = (tensor.numel() + BLOCK_SIZE - 1) / BLOCK_SIZE;
+
+    // Find max value
+    max(tensor, temp_max);
+
+    // Subtract max value to improve numerical stability
+    Kernels::vec_scalar_sub<<<gridSize, BLOCK_SIZE>>>(
+        tensor.data<float>(), tensor.data<float>(), temp_max.data<float>(), tensor.numel()
+    );
+    CUDA_CHECK(cudaGetLastError());
+
+    // Compute exponentials
+    Kernels::vec_exp<<<gridSize, BLOCK_SIZE>>>(
+        tensor.data<float>(), tensor.data<float>(), tensor.numel()
+    );
+    CUDA_CHECK(cudaGetLastError());
+
+    // Find sum
+    sum(tensor, temp_sum);
+
+    Kernels::vec_scalar_div<<<gridSize, BLOCK_SIZE>>>(
+        tensor.data<float>(), tensor.data<float>(), temp_sum.data<float>(), tensor.numel()
+    );
+    CUDA_CHECK(cudaGetLastError());
+    CUDA_CHECK(cudaDeviceSynchronize());
+}
\ No newline at end of file
diff --git a/src/backends/cuda/tensor_ops.cu b/src/backends/cuda/tensor_ops.cu
index 00fd8b7..b5334e4 100644
--- a/src/backends/cuda/tensor_ops.cu
+++ b/src/backends/cuda/tensor_ops.cu
@@ -43,7 +43,6 @@ void CUDABackend::sum(const CUDANet::Backend::Tensor &input, CUDANet::Backend::Tensor &sum) {
         remaining = blocks_needed;
     }
-
 }
 
 void CUDABackend::max(const CUDANet::Backend::Tensor &input, CUDANet::Backend::Tensor &max) {
diff --git a/src/backends/cuda/utils/cuda_helper.cu b/src/backends/cuda/utils/cuda_helper.cu
deleted file mode 100644
index d1bd99b..0000000
--- a/src/backends/cuda/utils/cuda_helper.cu
+++ /dev/null
@@ -1,26 +0,0 @@
-#include <cuda_runtime.h>
-
-#include <cstdio>
-#include <cstdlib>
-
-#include "cuda_helper.cuh"
-
-cudaDeviceProp initializeCUDA() {
-    int deviceCount;
-    CUDA_CHECK(cudaGetDeviceCount(&deviceCount));
-
-    if (deviceCount == 0) {
-        std::fprintf(stderr, "No CUDA devices found. Exiting.\n");
-        std::exit(EXIT_FAILURE);
-    }
-
-    int device = 0;
-    CUDA_CHECK(cudaSetDevice(device));
-
-    cudaDeviceProp deviceProp;
-    CUDA_CHECK(cudaGetDeviceProperties(&deviceProp, device));
-
-    std::printf("Using CUDA device %d: %s\n", device, deviceProp.name);
-
-    return deviceProp;
-}
\ No newline at end of file
diff --git a/src/backends/tensor.cpp b/src/backends/tensor.cpp
index cd72b99..6a164d0 100644
--- a/src/backends/tensor.cpp
+++ b/src/backends/tensor.cpp
@@ -1,37 +1,44 @@
-#include <stdexcept>
-
 #include "backend/tensor.hpp"
+
+#include <stdexcept>
 
 using namespace CUDANet::Backend;
 
 Tensor::Tensor(Shape shape, DType dtype, IBackend* backend)
-    : shape(shape), dtype(dtype), backend(backend), d_ptr(nullptr) {}
-
-Tensor::~Tensor() {
-    deallocate();
-}
-
-size_t Tensor::numel() const {
-    size_t totalElements = 1;
+    : shape(shape), dtype(dtype), backend(backend), d_ptr(nullptr) {
+    // Count total elements
+    size_t count = 1;
     for (const auto& dim : shape) {
-        totalElements *= dim;
+        count *= dim;
     }
-    return totalElements;
-}
+    total_elms = count;
 
-size_t Tensor::size() const {
-    size_t totalSize = numel();
-
-    size_t typeSize = 0;
+    // Compute total size (bytes)
+    size_t type_size = 0;
     switch (dtype) {
         case DType::FLOAT32:
-            typeSize = 4;
+            type_size = 4;
             break;
         default:
             throw std::runtime_error("Unsupported data type");
     }
+    total_size = total_elms * type_size;
 
-    return totalSize * typeSize;
+    // Allocate memory on backend
+    d_ptr = backend->allocate(total_size);
+}
+
+Tensor::~Tensor() {
+    backend->deallocate(d_ptr);
+    d_ptr = nullptr;
+}
+
+size_t Tensor::numel() const {
+    return total_elms;
+}
+
+size_t Tensor::size() const {
+    return total_size;
 }
 
 template <typename T>
@@ -42,4 +49,4 @@ const T* Tensor::data() const {
     return static_cast<const T*>(d_ptr);
 }
 
 template <typename T>
 T* Tensor::data() {
     return static_cast<T*>(d_ptr);
-}
\ No newline at end of file
+}
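
Note for reviewers: with this change, Tensor owns its device buffer. The constructor computes numel/size once and calls backend->allocate(), and the destructor calls backend->deallocate(), so the manual allocate()/deallocate() pair disappears from call sites. A minimal call-site sketch follows; the exact definition of Shape and a default-constructible CUDABackend are assumptions here, since neither appears in this diff.

// Hypothetical call-site sketch (not part of the patch).
// Assumed: Shape is a brace-constructible dimension list and
// CUDABackend is default-constructible.
#include "backend/cuda.cuh"
#include "backend/tensor.hpp"

using namespace CUDANet::Backend;

int main() {
    initializeCUDA();  // selects device 0 and prints its name

    CUDABackend backend;
    Shape shape = {2, 512};

    // Construction allocates the device buffer via the backend;
    // no manual memory management at the call site.
    Tensor logits(shape, DType::FLOAT32, &backend);
    Tensor temp_max({1}, DType::FLOAT32, &backend);
    Tensor temp_sum({1}, DType::FLOAT32, &backend);

    backend.relu(logits);
    backend.softmax(logits, temp_max, temp_sum);

    return 0;
}  // device buffers freed here as the tensors go out of scope

One consequence worth flagging: the header declares no copy constructor or copy assignment, so a compiler-generated copy of a Tensor would double-free d_ptr in the destructor. The backend methods already take tensors by reference, which avoids this, but deleting the copy operations would make the ownership explicit.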