diff --git a/include/backend/backend.hpp b/include/backend/backend.hpp
index 6c91573..41a1d27 100644
--- a/include/backend/backend.hpp
+++ b/include/backend/backend.hpp
@@ -1,6 +1,7 @@
 #pragma once
 
 #include <cstddef>
+#include "backend/tensor.hpp"
 
 namespace CUDANet::Backend {
@@ -13,7 +14,13 @@ public:
     virtual void* allocate(size_t bytes) = 0;
     virtual void deallocate(void* ptr) = 0;
 
-    // Layer operations
+    // Tensor ops
+    virtual void print(const CUDANet::Backend::Tensor &input) = 0;
+    virtual void clear(CUDANet::Backend::Tensor &input) = 0;
+    virtual void sum(const CUDANet::Backend::Tensor &input, CUDANet::Backend::Tensor &sum) = 0;
+    virtual void max(const CUDANet::Backend::Tensor &input, CUDANet::Backend::Tensor &max) = 0;
+
+    // Layer ops
     virtual void relu(CUDANet::Backend::Tensor &tensor) = 0;
     virtual void sigmoid(CUDANet::Backend::Tensor &tensor) = 0;
     virtual void softmax(CUDANet::Backend::Tensor &tensor, CUDANet::Backend::Tensor &temp_max, CUDANet::Backend::Tensor &temp_sum) = 0;
diff --git a/include/backend/cuda_backend.cuh b/include/backend/cuda.cuh
similarity index 63%
rename from include/backend/cuda_backend.cuh
rename to include/backend/cuda.cuh
index 71e0b66..f2b788d 100644
--- a/include/backend/cuda_backend.cuh
+++ b/include/backend/cuda.cuh
@@ -11,7 +11,13 @@ public:
     void* allocate(size_t bytes) override;
     void deallocate(void* ptr) override;
 
-    // Layer operations
+    // Tensor ops
+    void print(const CUDANet::Backend::Tensor &input) override;
+    void clear(CUDANet::Backend::Tensor &input) override;
+    void sum(const CUDANet::Backend::Tensor &input, CUDANet::Backend::Tensor &sum) override;
+    void max(const CUDANet::Backend::Tensor &input, CUDANet::Backend::Tensor &max) override;
+
+    // Layer ops
     void relu(CUDANet::Backend::Tensor &tensor) override;
     void sigmoid(CUDANet::Backend::Tensor &tensor) override;
     void softmax(CUDANet::Backend::Tensor &tensor, CUDANet::Backend::Tensor &temp_max, CUDANet::Backend::Tensor &temp_sum) override;
diff --git a/include/backend/tensor.hpp b/include/backend/tensor.hpp
index 9960a84..f327a4f 100644
--- a/include/backend/tensor.hpp
+++ b/include/backend/tensor.hpp
@@ -26,19 +26,20 @@ public:
     void* allocate();
     void deallocate();
 
-    void toDevice(const void* hostPtr);
-    void toHost(void* hostPtr);
-
     size_t size() const;
     size_t numel() const;
-    void* data() const;
+
+    template <typename T>
+    const T* data() const;
+
+    template <typename T>
+    T* data();
 
 private:
     Shape shape;
     DType dtype;
     IBackend* backend;
-    void* devicePtr;
-    void* hostPtr;
+    void* d_ptr;
 };
 
 } // namespace CUDANet::Backend
\ No newline at end of file
diff --git a/include/utils/vector.cuh b/include/utils/vector.cuh
deleted file mode 100644
index 24e36af..0000000
--- a/include/utils/vector.cuh
+++ /dev/null
@@ -1,63 +0,0 @@
-#ifndef CUDANET_VECTOR_H
-#define CUDANET_VECTOR_H
-
-namespace CUDANet::Utils {
-
-
-/**
- * @brief Utility function that prints a vector
- *
- * @param d_vec Pointer to the vector on device
- * @param length Length of the vector
- */
-void print_vec(const float *d_vec, const unsigned int length);
-
-/**
- * @brief Utility function that clears a vector
- *
- * @param d_vector Pointer to the vector on device
- * @param len Length of the vector
- */
-void clear(float *d_vector, const unsigned int len);
-
-
-/**
- * @brief Utility function that returns the sum of a vector
- *
- * @param d_vec Pointer to the vector
- * @param length Length of the vector
- */
-void sum(const float *d_vec, float *d_sum, const unsigned int length);
-
-
-/**
- * @brief Get the max of a vector
- *
- * @param d_vec Pointer to the vector
- * @param length Length of the vector
- */
-void max(const float *d_vec, float *d_max, const unsigned int length);
-
-
-/**
- * @brief Compute the mean of the vector
- *
- * @param d_vec Device pointer to the vector
- * @param d_mean Device pointer to the mean
- * @param d_length Device pointer to the length
- * @param length Length of the vector
- */
-void mean(const float *d_vec, float *d_mean, float *d_length, int length);
-
-/**
- * @brief Compute the variance of a vector
- *
- * @param d_vec
- * @param d_var
- * @param length
- */
-void var(float *d_vec, float *d_var, float *d_length, const unsigned int length);
-
-} // namespace CUDANet::Utils
-
-#endif // CUDANET_VECTOR_H
\ No newline at end of file
diff --git a/src/backends/cuda/cuda_backend.cu b/src/backends/cuda/cuda_backend.cu
index 19a57a2..3201140 100644
--- a/src/backends/cuda/cuda_backend.cu
+++ b/src/backends/cuda/cuda_backend.cu
@@ -16,15 +16,6 @@ void CUDABackend::deallocate(void* ptr) {
     CUDA_CHECK(cudaFree(ptr));
 }
 
-// void CUDABackend::copyToDevice(void* devicePtr, const void* hostPtr, size_t bytes) {
-//     CUDA_CHECK(cudaMemcpy(devicePtr, hostPtr, bytes, cudaMemcpyHostToDevice));
-//     CUDA_CHECK(cudaDeviceSynchronize());
-// }
-
-// void CUDABackend::copyToHost(void* hostPtr, const void* devicePtr, size_t bytes) {
-//     CUDA_CHECK(cudaMemcpy(hostPtr, devicePtr, bytes, cudaMemcpyDeviceToHost));
-//     CUDA_CHECK(cudaDeviceSynchronize());
-// }
 
 void CUDABackend::relu(Tensor &tensor) {
     int gridSize = (tensor.numel() + BLOCK_SIZE - 1) / BLOCK_SIZE;
diff --git a/src/backends/cuda/tensor_ops.cu b/src/backends/cuda/tensor_ops.cu
new file mode 100644
index 0000000..00fd8b7
--- /dev/null
+++ b/src/backends/cuda/tensor_ops.cu
@@ -0,0 +1,66 @@
+#include <iostream>
+#include <vector>
+
+#include "backend/backend.hpp"
+#include "backend/cuda.cuh"
+#include "utils/cuda_helper.cuh"
+#include "kernels/matmul.cuh"
+
+using namespace CUDANet::Backend;
+
+void CUDABackend::print(const CUDANet::Backend::Tensor &input) {
+    auto length = input.numel();
+    std::vector<float> h_vec(input.numel());
+
+    CUDA_CHECK(cudaMemcpy(
+        h_vec.data(), input.data<float>(), sizeof(float) * length, cudaMemcpyDeviceToHost
+    ));
+
+    for (int i = 0; i < length; ++i) {
+        std::cout << h_vec[i] << ", ";
+    }
+
+    std::cout << std::endl;
+}
+
+void CUDABackend::clear(CUDANet::Backend::Tensor &input) {
+    CUDA_CHECK(cudaMemset(input.data<float>(), 0, sizeof(float) * input.numel()));
+}
+
+void CUDABackend::sum(const CUDANet::Backend::Tensor &input, CUDANet::Backend::Tensor &sum) {
+    auto length = input.numel();
+    const int gridSize = (length + BLOCK_SIZE - 1) / BLOCK_SIZE;
+
+    CUDANet::Kernels::sum_reduce<<<gridSize, BLOCK_SIZE>>>(
+        input.data<float>(), sum.data<float>(), length
+    );
+    CUDA_CHECK(cudaGetLastError());
+
+    int remaining = gridSize;
+    while (remaining > 1) {
+        int blocks_needed = (remaining + BLOCK_SIZE - 1) / BLOCK_SIZE;
+        CUDANet::Kernels::sum_reduce<<<blocks_needed, BLOCK_SIZE>>>(sum.data<float>(), sum.data<float>(), remaining);
+        CUDA_CHECK(cudaGetLastError());
+
+        remaining = blocks_needed;
+    }
+
+}
+
+void CUDABackend::max(const CUDANet::Backend::Tensor &input, CUDANet::Backend::Tensor &max) {
+    auto length = input.numel();
+    const int grid_size = (length + BLOCK_SIZE - 1) / BLOCK_SIZE;
+
+    Kernels::max_reduce<<<grid_size, BLOCK_SIZE>>>(input.data<float>(), max.data<float>(), length);
+    CUDA_CHECK(cudaGetLastError());
+
+    int remaining = grid_size;
+
+    while (remaining > 1) {
+        int blocks_needed = (remaining + BLOCK_SIZE - 1) / BLOCK_SIZE;
+        CUDANet::Kernels::max_reduce<<<blocks_needed, BLOCK_SIZE>>>(max.data<float>(), max.data<float>(), remaining);
+        CUDA_CHECK(cudaGetLastError());
+
+        remaining = blocks_needed;
+    }
+}
diff --git a/src/backends/cuda/utils/vector.cu b/src/backends/cuda/utils/vector.cu
deleted file mode 100644
index 9dfb950..0000000
--- a/src/backends/cuda/utils/vector.cu
+++ /dev/null
@@ -1,107 +0,0 @@
-#include <iostream>
-#include <vector>
-
-#include "vector.cuh"
-#include "matmul.cuh"
-#include "cuda_helper.cuh"
-
-using namespace CUDANet;
-
-void Utils::print_vec(const float* d_vec, const unsigned int length) {
-    std::vector<float> h_vec(length);
-    CUDA_CHECK(cudaMemcpy(
-        h_vec.data(), d_vec, sizeof(float) * length, cudaMemcpyDeviceToHost
-    ));
-
-    for (int i = 0; i < length; ++i) {
-        std::cout << h_vec[i] << ", ";
-    }
-
-    std::cout << std::endl;
-}
-
-void Utils::clear(float* d_vec, const unsigned int length) {
-    CUDA_CHECK(cudaMemset(d_vec, 0, sizeof(float) * length));
-}
-
-void Utils::max(const float* d_vec, float* d_max, const unsigned int length) {
-
-    const int grid_size = (length + BLOCK_SIZE - 1) / BLOCK_SIZE;
-    Kernels::max_reduce<<<grid_size, BLOCK_SIZE>>>(d_vec, d_max, length);
-    CUDA_CHECK(cudaGetLastError());
-
-    int remaining = grid_size;
-
-    while (remaining > 1) {
-        int blocks_needed = (remaining + BLOCK_SIZE - 1) / BLOCK_SIZE;
-        CUDANet::Kernels::max_reduce<<<blocks_needed, BLOCK_SIZE>>>(d_max, d_max, remaining);
-        CUDA_CHECK(cudaGetLastError());
-
-        remaining = blocks_needed;
-    }
-
-}
-
-void Utils::sum(const float* d_vec, float* d_sum, const unsigned int length) {
-
-    const int gridSize = (length + BLOCK_SIZE - 1) / BLOCK_SIZE;
-
-    CUDANet::Kernels::sum_reduce<<<gridSize, BLOCK_SIZE>>>(
-        d_vec, d_sum, length
-    );
-    CUDA_CHECK(cudaGetLastError());
-
-    int remaining = gridSize;
-    while (remaining > 1) {
-        int blocks_needed = (remaining + BLOCK_SIZE - 1) / BLOCK_SIZE;
-        CUDANet::Kernels::sum_reduce<<<blocks_needed, BLOCK_SIZE>>>(d_sum, d_sum, remaining);
-        CUDA_CHECK(cudaGetLastError());
-
-        remaining = blocks_needed;
-    }
-}
-
-void Utils::mean(const float* d_vec, float* d_mean, float *d_length, int length) {
-    Utils::sum(d_vec, d_mean, length);
-
-    const int gridSize = (length + BLOCK_SIZE - 1) / BLOCK_SIZE;
-    Kernels::vec_scalar_div<<<gridSize, BLOCK_SIZE>>>(
-        d_mean,
-        d_mean,
-        d_length,
-        length
-    );
-
-    CUDA_CHECK(cudaGetLastError());
-}
-
-
-void Utils::var(float* d_vec, float* d_var, float *d_length, const unsigned int length) {
-
-    const int gridSize = (length + BLOCK_SIZE - 1) / BLOCK_SIZE;
-
-    Kernels::vec_vec_mul<<<gridSize, BLOCK_SIZE>>>(
-        d_vec,
-        d_vec,
-        d_var,
-        length
-    );
-    CUDA_CHECK(cudaGetLastError());
-
-    // Sum over all differences
-    Utils::sum(
-        d_var,
-        d_var,
-        length
-    );
-
-    // Divide by difference sum / length -> variance
-    Kernels::vec_scalar_div<<<gridSize, BLOCK_SIZE>>>(
-        d_var,
-        d_var,
-        d_length,
-        length
-    );
-    CUDA_CHECK(cudaGetLastError());
-
-}
\ No newline at end of file
diff --git a/src/backends/tensor.cpp b/src/backends/tensor.cpp
index 77ea2a1..cd72b99 100644
--- a/src/backends/tensor.cpp
+++ b/src/backends/tensor.cpp
@@ -5,7 +5,7 @@
 using namespace CUDANet::Backend;
 
 Tensor::Tensor(Shape shape, DType dtype, IBackend* backend)
-    : shape(shape), dtype(dtype), backend(backend), devicePtr(nullptr), hostPtr(nullptr) {}
+    : shape(shape), dtype(dtype), backend(backend), d_ptr(nullptr) {}
 
 Tensor::~Tensor() {
     deallocate();
@@ -34,6 +34,12 @@ size_t Tensor::size() const {
     return totalSize * typeSize;
 }
 
-void* Tensor::data() const {
-    return devicePtr;
+template <typename T>
+const T* Tensor::data() const {
+    return static_cast<const T*>(d_ptr);
+}
+
+template <typename T>
+T* Tensor::data() {
+    return static_cast<T*>(d_ptr);
 }
\ No newline at end of file
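
Reviewer note (not part of the patch): Tensor::data<T>() is declared in include/backend/tensor.hpp but defined out-of-line in src/backends/tensor.cpp, so calls such as input.data<float>() in tensor_ops.cu compile against the declaration but will fail at link time unless the specializations actually used are explicitly instantiated in tensor.cpp. A minimal sketch of one fix, assuming float is the only element type the backend currently exercises (these lines are a suggestion, not part of this diff):

    // At the bottom of src/backends/tensor.cpp: explicit instantiations so the
    // out-of-line template definitions are emitted for the types that other
    // translation units request.
    template const float* CUDANet::Backend::Tensor::data<float>() const;
    template float* CUDANet::Backend::Tensor::data<float>();

Since both overloads are one-line casts, defining them inline in tensor.hpp would avoid the instantiation bookkeeping entirely.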
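For context, a rough usage sketch of the refactored API. Everything below is illustrative: the Shape initializer, the DType::FLOAT32 enumerator, and BLOCK_SIZE coming from utils/cuda_helper.cuh are assumptions based on this diff, not verified repository code. One real constraint worth noting: the first pass of sum()/max() writes one partial result per block, so the scratch tensor must hold at least gridSize elements.

    #include "backend/cuda.cuh"
    #include "backend/tensor.hpp"
    #include "utils/cuda_helper.cuh"  // assumed to define BLOCK_SIZE

    using namespace CUDANet::Backend;

    void reduce_example(CUDABackend &backend) {
        const size_t n = 1 << 20;

        // Hypothetical Shape/DType spellings; the diff does not show their definitions.
        Tensor input({n}, DType::FLOAT32, &backend);
        Tensor total({(n + BLOCK_SIZE - 1) / BLOCK_SIZE}, DType::FLOAT32, &backend);

        input.allocate();
        total.allocate();

        backend.clear(total);       // zero the device-side scratch buffer
        backend.sum(input, total);  // after the final pass, element 0 holds the sum
        backend.print(total);       // copies back to host and prints
    }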