WIP Implement Tensor constructor and destructor

2025-11-17 22:36:29 +01:00
parent 6744c8964f
commit d231e515b1
6 changed files with 110 additions and 102 deletions

View File

@@ -23,9 +23,6 @@ public:
     Tensor(Shape shape, DType dtype, IBackend* backend);
     ~Tensor();
 
-    void* allocate();
-    void deallocate();
 
     size_t size() const;
     size_t numel() const;
@@ -38,6 +35,10 @@ public:
 private:
     Shape shape;
     DType dtype;
+    size_t total_elms;
+    size_t total_size;
     IBackend* backend;
     void* d_ptr;
 };
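
With allocate() and deallocate() dropped from the public interface and the element/byte counts cached in total_elms and total_size, a Tensor now owns its device buffer for its whole lifetime. A minimal usage sketch, assuming Shape can be brace-initialized from a list of dimensions and that a CUDABackend instance is at hand (neither is shown in this diff):

// Hypothetical usage sketch -- the Shape{2, 3, 4} construction and the
// backend variable are assumptions, not part of this commit.
CUDABackend backend;
{
    Tensor t({2, 3, 4}, DType::FLOAT32, &backend);
    // t.numel() == 24 elements, t.size() == 24 * 4 = 96 bytes,
    // allocated on the device by the constructor
}   // destructor hands the buffer back to the backend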

View File

@@ -1,60 +1,39 @@
#include "backend/cuda_backend.cuh" #include <cuda_runtime.h>
#include "utils/cuda_helper.cuh"
#include "kernels/activation_functions.cuh" #include <cstdio>
#include "kernels/matmul.cuh" #include <cstdlib>
#include "utils/vector.cuh" #include <cuda_helper.cuh>
#include "backend/cuda.cuh"
cudaDeviceProp initializeCUDA() {
int deviceCount;
CUDA_CHECK(cudaGetDeviceCount(&deviceCount));
if (deviceCount == 0) {
std::fprintf(stderr, "No CUDA devices found. Exiting.\n");
std::exit(EXIT_FAILURE);
}
int device = 0;
CUDA_CHECK(cudaSetDevice(device));
cudaDeviceProp deviceProp;
CUDA_CHECK(cudaGetDeviceProperties(&deviceProp, device));
std::printf("Using CUDA device %d: %s\n", device, deviceProp.name);
return deviceProp;
}
using namespace CUDANet::Backend; using namespace CUDANet::Backend;
void *CUDABackend::allocate(size_t bytes) { void* CUDABackend::allocate(size_t bytes) {
void* devicePtr = nullptr; void* d_ptr = nullptr;
CUDA_CHECK(cudaMalloc(&devicePtr, bytes)); CUDA_CHECK(cudaMalloc(&d_ptr, bytes));
return devicePtr; return d_ptr;
} }
void CUDABackend::deallocate(void* ptr) { void CUDABackend::deallocate(void* ptr) {
CUDA_CHECK(cudaFree(ptr)); CUDA_CHECK(cudaFree(ptr));
} }
void CUDABackend::relu(Tensor &tensor) {
int gridSize = (tensor.numel() + BLOCK_SIZE - 1) / BLOCK_SIZE;
Kernels::relu<<<gridSize, BLOCK_SIZE>>>((float*)tensor.data(), (float*)tensor.data(), tensor.numel());
CUDA_CHECK(cudaGetLastError());
CUDA_CHECK(cudaDeviceSynchronize());
}
void CUDABackend::sigmoid(Tensor &tensor) {
int gridSize = (tensor.numel() + BLOCK_SIZE - 1) / BLOCK_SIZE;
Kernels::sigmoid<<<gridSize, BLOCK_SIZE>>>((float*)tensor.data(), (float*)tensor.data(), tensor.numel());
CUDA_CHECK(cudaGetLastError());
CUDA_CHECK(cudaDeviceSynchronize());
}
void CUDABackend::softmax(Tensor &tensor, Tensor &temp_max, Tensor &temp_sum) {
int gridSize = (tensor.numel() + BLOCK_SIZE - 1) / BLOCK_SIZE;
// Find max value
Utils::max(tensor, temp_max, tensor.numel());
// Subtract max value to improve numerical stability
Kernels::vec_scalar_sub<<<gridSize, BLOCK_SIZE>>>(
(float*)tensor.data(), (float*)tensor.data(), (float*)temp_max.data(), tensor.numel()
);
CUDA_CHECK(cudaGetLastError());
// Compute exponentials
Kernels::vec_exp<<<gridSize, BLOCK_SIZE>>>(
(float*)tensor.data(), (float*)tensor.data(), tensor.numel()
);
CUDA_CHECK(cudaGetLastError());
// Find sum
Utils::sum(tensor, temp_sum, tensor.numel());
Kernels::vec_scalar_div<<<gridSize, BLOCK_SIZE>>>(
(float*)tensor.data(), (float*)tensor.data(), (float*)temp_sum.data(), tensor.numel()
);
CUDA_CHECK(cudaGetLastError());
CUDA_CHECK(cudaDeviceSynchronize());
}
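
This translation unit now holds both the device setup (initializeCUDA, moved in from the deleted helper file further down) and the raw allocate/deallocate pair that the new Tensor constructor and destructor call into; the activation functions move out to the new file below. A round-trip sketch, assuming CUDABackend is default-constructible:

// Hypothetical round-trip sketch; both backend calls go through CUDA_CHECK internally.
initializeCUDA();                    // selects device 0 and prints its name
CUDABackend backend;
void* buf = backend.allocate(1024);  // cudaMalloc
backend.deallocate(buf);             // cudaFree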

View File

@@ -0,0 +1,48 @@
#include "backend/cuda.cuh"
#include "utils/cuda_helper.cuh"
#include "kernels/activation_functions.cuh"
#include "kernels/matmul.cuh"
using namespace CUDANet::Backend;
void CUDABackend::relu(Tensor &tensor) {
int gridSize = (tensor.numel() + BLOCK_SIZE - 1) / BLOCK_SIZE;
Kernels::relu<<<gridSize, BLOCK_SIZE>>>(tensor.data<float>(), tensor.data<float>(), tensor.numel());
CUDA_CHECK(cudaGetLastError());
CUDA_CHECK(cudaDeviceSynchronize());
}
void CUDABackend::sigmoid(Tensor &tensor) {
int gridSize = (tensor.numel() + BLOCK_SIZE - 1) / BLOCK_SIZE;
Kernels::sigmoid<<<gridSize, BLOCK_SIZE>>>(tensor.data<float>(), tensor.data<float>(), tensor.numel());
CUDA_CHECK(cudaGetLastError());
CUDA_CHECK(cudaDeviceSynchronize());
}
void CUDABackend::softmax(Tensor &tensor, Tensor &temp_max, Tensor &temp_sum) {
int gridSize = (tensor.numel() + BLOCK_SIZE - 1) / BLOCK_SIZE;
// Find max value
max(tensor, temp_max);
// Subtract max value to improve numerical stability
Kernels::vec_scalar_sub<<<gridSize, BLOCK_SIZE>>>(
tensor.data<float>(), tensor.data<float>(), temp_max.data<float>(), tensor.numel()
);
CUDA_CHECK(cudaGetLastError());
// Compute exponentials
Kernels::vec_exp<<<gridSize, BLOCK_SIZE>>>(
tensor.data<float>(), tensor.data<float>(), tensor.numel()
);
CUDA_CHECK(cudaGetLastError());
// Find sum
sum(tensor, temp_sum);
Kernels::vec_scalar_div<<<gridSize, BLOCK_SIZE>>>(
tensor.data<float>(), tensor.data<float>(), temp_sum.data<float>(), tensor.numel()
);
CUDA_CHECK(cudaGetLastError());
CUDA_CHECK(cudaDeviceSynchronize());
}
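
softmax is the usual numerically stable formulation, run in place as a kernel sequence: subtract the maximum, exponentiate, then divide by the sum of the exponentials, with temp_max and temp_sum holding the reduction results. A host-side reference sketch of the same arithmetic (plain C++, for comparison only, not part of the commit):

// Reference-only host implementation of the kernel sequence above; assumes x is non-empty.
#include <algorithm>
#include <cmath>
#include <vector>

std::vector<float> softmax_ref(std::vector<float> x) {
    float m = *std::max_element(x.begin(), x.end());     // max            (temp_max)
    float s = 0.0f;
    for (float& v : x) { v = std::exp(v - m); s += v; }   // exp(x - max), sum (temp_sum)
    for (float& v : x) v /= s;                             // divide by sum
    return x;
}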

View File

@@ -43,7 +43,6 @@ void CUDABackend::sum(const CUDANet::Backend::Tensor &input, CUDANet::Backend::T
         remaining = blocks_needed;
     }
 }
-
 void CUDABackend::max(const CUDANet::Backend::Tensor &input, CUDANet::Backend::Tensor &max) {

View File

@@ -1,26 +0,0 @@
-#include <cuda_runtime.h>
-#include <cstdio>
-#include <cstdlib>
-
-#include "cuda_helper.cuh"
-
-cudaDeviceProp initializeCUDA() {
-    int deviceCount;
-    CUDA_CHECK(cudaGetDeviceCount(&deviceCount));
-    if (deviceCount == 0) {
-        std::fprintf(stderr, "No CUDA devices found. Exiting.\n");
-        std::exit(EXIT_FAILURE);
-    }
-
-    int device = 0;
-    CUDA_CHECK(cudaSetDevice(device));
-
-    cudaDeviceProp deviceProp;
-    CUDA_CHECK(cudaGetDeviceProperties(&deviceProp, device));
-
-    std::printf("Using CUDA device %d: %s\n", device, deviceProp.name);
-
-    return deviceProp;
-}

View File

@@ -1,37 +1,44 @@
-#include <stdexcept>
-
 #include "backend/tensor.hpp"
+
+#include <stdexcept>
 
 using namespace CUDANet::Backend;
 
 Tensor::Tensor(Shape shape, DType dtype, IBackend* backend)
-    : shape(shape), dtype(dtype), backend(backend), d_ptr(nullptr) {}
-
-Tensor::~Tensor() {
-    deallocate();
-}
-
-size_t Tensor::numel() const {
-    size_t totalElements = 1;
+    : shape(shape), dtype(dtype), backend(backend), d_ptr(nullptr) {
+    // Count total elements
+    size_t count = 1;
     for (const auto& dim : shape) {
-        totalElements *= dim;
+        count *= dim;
     }
-    return totalElements;
-}
+    total_elms = count;
 
-size_t Tensor::size() const {
-    size_t totalSize = numel();
-    size_t typeSize = 0;
+    // Compute total size (bytes)
+    size_t type_size = 0;
     switch (dtype) {
         case DType::FLOAT32:
-            typeSize = 4;
+            type_size = 4;
             break;
         default:
             throw std::runtime_error("Unsupported data type");
     }
+    total_size = total_elms * type_size;
 
-    return totalSize * typeSize;
+    // Allocate memory on backend
+    d_ptr = backend->allocate(total_size);
+}
+
+Tensor::~Tensor() {
+    backend->deallocate(d_ptr);
+    d_ptr = nullptr;
+}
+
+size_t Tensor::numel() const {
+    return total_elms;
+}
+
+size_t Tensor::size() const {
+    return total_size;
 }
 
 template <typename T>
@@ -42,4 +49,4 @@ const T* Tensor::data() const {
 
 template <typename T>
 T* Tensor::data() {
     return static_cast<T*>(d_ptr);
 }