Compare commits

...

2 Commits

Author SHA1 Message Date
6133fb20af WIP Migrate Activation to Tensor 2025-11-16 22:17:46 +01:00
64bf9197ff Add simple Tensor class 2025-11-16 19:31:09 +01:00
7 changed files with 227 additions and 38 deletions

View File

@@ -0,0 +1,22 @@
#pragma once
#include <cstddef>
namespace CUDANet::Backend
{
class IBackend
{
public:
// Memory management
virtual void* allocate(size_t bytes) = 0;
virtual void deallocate(void* ptr) = 0;
// Layer operations
virtual void relu(CUDANet::Backend::Tensor &tensor) = 0;
virtual void sigmoid(CUDANet::Backend::Tensor &tensor) = 0;
virtual void softmax(CUDANet::Backend::Tensor &tensor, CUDANet::Backend::Tensor &temp_max, CUDANet::Backend::Tensor &temp_sum) = 0;
};
} // namespace CUDANet::Backend

View File

@@ -0,0 +1,23 @@
#pragma once
#include "backend/backend.hpp"
#include "backend/tensor.hpp"
namespace CUDANet::Backend {
class CUDABackend : public IBackend {
public:
// Memory management
void* allocate(size_t bytes) override;
void deallocate(void* ptr) override;
// Layer operations
void relu(CUDANet::Backend::Tensor &tensor) override;
void sigmoid(CUDANet::Backend::Tensor &tensor) override;
void softmax(CUDANet::Backend::Tensor &tensor, CUDANet::Backend::Tensor &temp_max, CUDANet::Backend::Tensor &temp_sum) override;
private:
static constexpr int BLOCK_SIZE = 256;
};
} // namespace CUDANet::Backend

View File

@@ -0,0 +1,44 @@
#pragma once
#include <cstddef>
#include "backend/backend.hpp"
#include <vector>
namespace CUDANet::Backend
{
enum class DType
{
FLOAT32,
// FLOAT16, // Not implemented yet
// INT32, // Not implemented yet
};
typedef std::vector<size_t> Shape;
class Tensor
{
public:
Tensor() = default;
Tensor(Shape shape, DType dtype, IBackend* backend);
~Tensor();
void* allocate();
void deallocate();
void toDevice(const void* hostPtr);
void toHost(void* hostPtr);
size_t size() const;
size_t numel() const;
void* data() const;
private:
Shape shape;
DType dtype;
IBackend* backend;
void* devicePtr;
void* hostPtr;
};
} // namespace CUDANet::Backend

View File

@@ -1,5 +1,7 @@
#ifndef CUDANET_ACTIVATION_H
#define CUDANET_ACTIVATION_H
#pragma once
#include "backend/tensor.hpp"
#include "backend/backend.hpp"
namespace CUDANet::Layers {
@@ -41,29 +43,16 @@ class Activation {
*
* @param d_input Pointer to the input vector on the device
*/
void activate(float* d_input);
void activate(CUDANet::Backend::Tensor input);
private:
CUDANet::Backend::IBackend* backend;
ActivationType activationType;
int length;
void activateCPU(float* input);
#ifdef USE_CUDA
int gridSize;
float* d_softmax_sum;
float* d_max;
void activateCUDA(float* d_input);
void initCUDA();
void delCUDA();
#endif
CUDANet::Backend::Tensor softmax_sum;
CUDANet::Backend::Tensor tensor_max;
};
} // namespace CUDANet::Layers
#endif // CUDANET_ACTIVATION_H

View File

@@ -0,0 +1,69 @@
#include "backend/cuda_backend.cuh"
#include "utils/cuda_helper.cuh"
#include "kernels/activation_functions.cuh"
#include "kernels/matmul.cuh"
#include "utils/vector.cuh"
using namespace CUDANet::Backend;
void *CUDABackend::allocate(size_t bytes) {
void* devicePtr = nullptr;
CUDA_CHECK(cudaMalloc(&devicePtr, bytes));
return devicePtr;
}
void CUDABackend::deallocate(void* ptr) {
CUDA_CHECK(cudaFree(ptr));
}
// void CUDABackend::copyToDevice(void* devicePtr, const void* hostPtr, size_t bytes) {
// CUDA_CHECK(cudaMemcpy(devicePtr, hostPtr, bytes, cudaMemcpyHostToDevice));
// CUDA_CHECK(cudaDeviceSynchronize());
// }
// void CUDABackend::copyToHost(void* hostPtr, const void* devicePtr, size_t bytes) {
// CUDA_CHECK(cudaMemcpy(hostPtr, devicePtr, bytes, cudaMemcpyDeviceToHost));
// CUDA_CHECK(cudaDeviceSynchronize());
// }
void CUDABackend::relu(Tensor &tensor) {
int gridSize = (tensor.numel() + BLOCK_SIZE - 1) / BLOCK_SIZE;
Kernels::relu<<<gridSize, BLOCK_SIZE>>>((float*)tensor.data(), (float*)tensor.data(), tensor.numel());
CUDA_CHECK(cudaGetLastError());
CUDA_CHECK(cudaDeviceSynchronize());
}
void CUDABackend::sigmoid(Tensor &tensor) {
int gridSize = (tensor.numel() + BLOCK_SIZE - 1) / BLOCK_SIZE;
Kernels::sigmoid<<<gridSize, BLOCK_SIZE>>>((float*)tensor.data(), (float*)tensor.data(), tensor.numel());
CUDA_CHECK(cudaGetLastError());
CUDA_CHECK(cudaDeviceSynchronize());
}
void CUDABackend::softmax(Tensor &tensor, Tensor &temp_max, Tensor &temp_sum) {
int gridSize = (tensor.numel() + BLOCK_SIZE - 1) / BLOCK_SIZE;
// Find max value
Utils::max(tensor, temp_max, tensor.numel());
// Subtract max value to improve numerical stability
Kernels::vec_scalar_sub<<<gridSize, BLOCK_SIZE>>>(
(float*)tensor.data(), (float*)tensor.data(), (float*)temp_max.data(), tensor.numel()
);
CUDA_CHECK(cudaGetLastError());
// Compute exponentials
Kernels::vec_exp<<<gridSize, BLOCK_SIZE>>>(
(float*)tensor.data(), (float*)tensor.data(), tensor.numel()
);
CUDA_CHECK(cudaGetLastError());
// Find sum
Utils::sum(tensor, temp_sum, tensor.numel());
Kernels::vec_scalar_div<<<gridSize, BLOCK_SIZE>>>(
(float*)tensor.data(), (float*)tensor.data(), (float*)temp_sum.data(), tensor.numel()
);
CUDA_CHECK(cudaGetLastError());
CUDA_CHECK(cudaDeviceSynchronize());
}

39
src/backends/tensor.cpp Normal file
View File

@@ -0,0 +1,39 @@
#include <stdexcept>
#include "backend/tensor.hpp"
using namespace CUDANet::Backend;
Tensor::Tensor(Shape shape, DType dtype, IBackend* backend)
: shape(shape), dtype(dtype), backend(backend), devicePtr(nullptr), hostPtr(nullptr) {}
Tensor::~Tensor() {
deallocate();
}
size_t Tensor::numel() const {
size_t totalElements = 1;
for (const auto& dim : shape) {
totalElements *= dim;
}
return totalElements;
}
size_t Tensor::size() const {
size_t totalSize = numel();
size_t typeSize = 0;
switch (dtype) {
case DType::FLOAT32:
typeSize = 4;
break;
default:
throw std::runtime_error("Unsupported data type");
}
return totalSize * typeSize;
}
void* Tensor::data() const {
return devicePtr;
}

View File

@@ -2,30 +2,33 @@
#include <vector>
#include "activation.hpp"
#include "backend/tensor.hpp"
using namespace CUDANet::Layers;
Activation::Activation(ActivationType activation, const int length)
: activationType(activation), length(length) {
#ifdef USE_CUDA
initCUDA();
#endif
if (activationType == SOFTMAX) {
softmax_sum = CUDANet::Backend::Tensor({static_cast<size_t>(length)}, CUDANet::Backend::DType::FLOAT32, nullptr);
tensor_max = CUDANet::Backend::Tensor({static_cast<size_t>(length)}, CUDANet::Backend::DType::FLOAT32, nullptr);
}
}
Activation::~Activation() {
#ifdef USE_CUDA
delCUDA();
#endif
void Activation::activate(CUDANet::Backend::Tensor input) {
switch (activationType)
{
case ActivationType::SIGMOID:
backend->sigmoid(input);
break;
case ActivationType::RELU:
/* code */
break;
case ActivationType::SOFTMAX:
/* code */
break;
default:
break;
}
void Activation::activateCPU(float* input) {
throw std::logic_error("Not implemented");
}
void Activation::activate(float* input) {
#ifdef USE_CUDA
activateCUDA(input);
#else
activateCPU(input);
#endif
}