diff --git a/include/backend.hpp b/include/backend.hpp
index 30c96a4..938be54 100644
--- a/include/backend.hpp
+++ b/include/backend.hpp
@@ -28,6 +28,7 @@ class Backend {
     std::optional<DType> default_dtype;
 
   public:
+    // Dtypes
     virtual bool supports_dtype(DType dtype) const = 0;
     virtual void set_default_dtype(DType dtype) = 0;
     virtual DType get_default_dtype() const = 0;
diff --git a/include/module.hpp b/include/module.hpp
index ea84007..dd32a41 100644
--- a/include/module.hpp
+++ b/include/module.hpp
@@ -15,10 +15,6 @@ class Module {
 
     CUDANet::Shape output_shape();
 
-    size_t input_size();
-
-    size_t output_size();
-
     void register_layer(const std::string& name, Layer* layer);
 
     void register_module(Module& module);
diff --git a/include/tensor.hpp b/include/tensor.hpp
index 40e42ea..691c2aa 100644
--- a/include/tensor.hpp
+++ b/include/tensor.hpp
@@ -16,6 +16,19 @@ enum class DType
     // INT32,  // Not implemented yet
 };
 
+inline size_t dtype_size(DType dtype) {
+    switch (dtype)
+    {
+    case DType::FLOAT32:
+        return 4;
+        break;
+
+    default:
+        throw std::runtime_error("Unknown DType");
+        break;
+    }
+}
+
 class Tensor {
   public:
 
diff --git a/src/backend_factory.cpp b/src/backend_factory.cpp
index beaea89..705468b 100644
--- a/src/backend_factory.cpp
+++ b/src/backend_factory.cpp
@@ -16,7 +16,7 @@ std::unique_ptr<Backend> BackendFactory::create(BackendType backend_type, const
 #ifdef USE_CUDA
             if (!CUDANet::Backends::CUDA::is_cuda_available()) {
-                throw std::runtime_error("No CUDA devices found")
+                throw std::runtime_error("No CUDA devices found");
             }
 
             auto cuda = std::make_unique<CUDANet::Backends::CUDA>(config);
 
@@ -31,6 +31,7 @@ std::unique_ptr<Backend> BackendFactory::create(BackendType backend_type, const
             break;
 
         default:
+            throw std::runtime_error("Invalid backend");
             break;
     }
 
diff --git a/src/backends/cuda/cuda.cu b/src/backends/cuda/cuda.cu
index c2094f2..331ed1d 100644
--- a/src/backends/cuda/cuda.cu
+++ b/src/backends/cuda/cuda.cu
@@ -65,7 +65,6 @@ CUDANet::DType CUDA::get_default_dtype() const {
     return DType::FLOAT32;
 }
 
-
 void* CUDA::allocate(size_t bytes) {
     void* d_ptr = nullptr;
     CUDA_CHECK(cudaMalloc(&d_ptr, bytes));
diff --git a/src/backends/cuda/kernels/convolution.cu b/src/backends/cuda/kernels/convolution.cu
index 85abf98..12a444e 100644
--- a/src/backends/cuda/kernels/convolution.cu
+++ b/src/backends/cuda/kernels/convolution.cu
@@ -36,7 +36,7 @@ __global__ void Kernels::convolution(
         return;
     }
 
-    T sum = static_cast<float>(0);
+    T sum = static_cast<T>(0);
 
     // Iterate over kernel and input matrix
     for (int c = 0; c < input_shape[2]; c++) {
diff --git a/src/backends/cuda/layer_ops.cu b/src/backends/cuda/layer_ops.cu
index 812b042..3d07fc9 100644
--- a/src/backends/cuda/layer_ops.cu
+++ b/src/backends/cuda/layer_ops.cu
@@ -24,7 +24,7 @@ template <typename T>
 void CUDA::relu_impl(Tensor& tensor) {
     int gridSize = (tensor.numel() + BLOCK_SIZE - 1) / BLOCK_SIZE;
     Kernels::relu<<<gridSize, BLOCK_SIZE>>>(
-        tensor.data(), tensor.data(), tensor.numel()
+        static_cast<T*>(tensor.device_ptr()), static_cast<T*>(tensor.device_ptr()), tensor.numel()
     );
     CUDA_CHECK(cudaGetLastError());
     CUDA_CHECK(cudaDeviceSynchronize());
@@ -48,7 +48,7 @@ template <typename T>
 void CUDA::sigmoid_impl(CUDANet::Tensor& tensor) {
     int gridSize = (tensor.numel() + BLOCK_SIZE - 1) / BLOCK_SIZE;
     Kernels::sigmoid<<<gridSize, BLOCK_SIZE>>>(
-        tensor.data(), tensor.data(), tensor.numel()
+        static_cast<T*>(tensor.device_ptr()), static_cast<T*>(tensor.device_ptr()), tensor.numel()
     );
     CUDA_CHECK(cudaGetLastError());
     CUDA_CHECK(cudaDeviceSynchronize());
@@ -78,13 +78,13 @@ void CUDA::softmax_impl(Tensor& tensor, Tensor& temp_max, Tensor& temp_sum) {
 
     // Subtract max value to improve numerical stability
     Kernels::vec_scalar_sub<<<gridSize, BLOCK_SIZE>>>(
-        tensor.data(), tensor.data(), temp_max.data(), tensor.numel()
+        static_cast<T*>(tensor.device_ptr()), static_cast<T*>(tensor.device_ptr()), static_cast<T*>(temp_max.device_ptr()), tensor.numel()
     );
     CUDA_CHECK(cudaGetLastError());
 
     // Compute exponentials
     Kernels::vec_exp<<<gridSize, BLOCK_SIZE>>>(
-        tensor.data(), tensor.data(), tensor.numel()
+        static_cast<T*>(tensor.device_ptr()), static_cast<T*>(tensor.device_ptr()), tensor.numel()
     );
     CUDA_CHECK(cudaGetLastError());
 
@@ -92,7 +92,7 @@ void CUDA::softmax_impl(Tensor& tensor, Tensor& temp_max, Tensor& temp_sum) {
     sum(tensor, temp_sum);
 
     Kernels::vec_scalar_div<<<gridSize, BLOCK_SIZE>>>(
-        tensor.data(), tensor.data(), temp_sum.data(), tensor.numel()
+        static_cast<T*>(tensor.device_ptr()), static_cast<T*>(tensor.device_ptr()), static_cast<T*>(temp_sum.device_ptr()), tensor.numel()
     );
     CUDA_CHECK(cudaGetLastError());
     CUDA_CHECK(cudaDeviceSynchronize());
@@ -142,13 +142,13 @@ CUDANet::Tensor& CUDA::dense_impl(
     auto biasGridSize = (output_size + BLOCK_SIZE - 1) / BLOCK_SIZE;
 
     Kernels::mat_vec_mul<<<gridSize, BLOCK_SIZE>>>(
-        weights.data(), input.data(), output.data(), input_size,
+        static_cast<T*>(weights.device_ptr()), static_cast<T*>(input.device_ptr()), static_cast<T*>(output.device_ptr()), input_size,
         output_size
     );
     CUDA_CHECK(cudaGetLastError());
 
     Kernels::vec_vec_add<<<biasGridSize, BLOCK_SIZE>>>(
-        biases.data(), output.data(), output.data(), output_size
+        static_cast<T*>(biases.device_ptr()), static_cast<T*>(output.device_ptr()), static_cast<T*>(output.device_ptr()), output_size
     );
     CUDA_CHECK(cudaGetLastError());
     CUDA_CHECK(cudaDeviceSynchronize());
@@ -213,7 +213,7 @@ CUDANet::Tensor& CUDA::conv2d_impl(
     );
 
     Kernels::convolution<<<grid, block>>>(
-        input.data(), weights.data(), biases.data(), output.data(),
+        static_cast<T*>(input.device_ptr()), static_cast<T*>(weights.device_ptr()), static_cast<T*>(biases.device_ptr()), static_cast<T*>(output.device_ptr()),
         in_shape, padding_shape, kernel_shape, stride_shape, out_shape
     );
     CUDA_CHECK(cudaGetLastError());
@@ -273,7 +273,7 @@ CUDANet::Tensor& CUDA::max_pool2d_impl(
     );
 
     Kernels::max_pool<<<grid, block>>>(
-        input.data(), output.data(), input_shape, output_shape,
+        static_cast<T*>(input.device_ptr()), static_cast<T*>(output.device_ptr()), input_shape, output_shape,
         pool_shape, stride_shape, padding_shape
     );
     CUDA_CHECK(cudaGetLastError());
@@ -333,7 +333,7 @@ CUDANet::Tensor& CUDA::avg_pool2d_impl(
     );
 
     Kernels::avg_pool<<<grid, block>>>(
-        input.data(), output.data(), input_shape, output_shape,
+        static_cast<T*>(input.device_ptr()), static_cast<T*>(output.device_ptr()), input_shape, output_shape,
         pool_shape, stride_shape, padding_shape
     );
     CUDA_CHECK(cudaGetLastError());
@@ -394,34 +394,34 @@ CUDANet::Tensor& CUDA::batch_norm_impl(
     for (int i = 0; i < input_shape[2]; i++) {
         // Subtract mean from input
         Kernels::vec_scalar_sub<<<gridSize, BLOCK_SIZE>>>(
-            input.data() + i * input_shape[0] * input_shape[1],
-            output.data() + i * input_shape[0] * input_shape[1],
-            &running_mean.data()[i], input_shape[0] * input_shape[1]
+            static_cast<T*>(input.device_ptr()) + i * input_shape[0] * input_shape[1],
+            static_cast<T*>(output.device_ptr()) + i * input_shape[0] * input_shape[1],
+            &static_cast<T*>(running_mean.device_ptr())[i], input_shape[0] * input_shape[1]
         );
         CUDA_CHECK(cudaGetLastError());
 
         // Divide by sqrt(running_var + epsilon)
         Kernels::vec_scale<<<gridSize, BLOCK_SIZE>>>(
-            output.data() + i * input_shape[0] * input_shape[1],
-            output.data() + i * input_shape[0] * input_shape[1],
-            &running_var.data()[i], epsilon.data(),
+            static_cast<T*>(output.device_ptr()) + i * input_shape[0] * input_shape[1],
+            static_cast<T*>(output.device_ptr()) + i * input_shape[0] * input_shape[1],
+            &static_cast<T*>(running_var.device_ptr())[i], static_cast<T*>(epsilon.device_ptr()),
             input_shape[0] * input_shape[1]
         );
         CUDA_CHECK(cudaGetLastError());
 
         // Multiply by weights
         Kernels::vec_scalar_mul<<<gridSize, BLOCK_SIZE>>>(
-            output.data() + i * input_shape[0] * input_shape[1],
-            output.data() + i * input_shape[0] * input_shape[1],
-            &weights.data()[i], input_shape[0] * input_shape[1]
+            static_cast<T*>(output.device_ptr()) + i * input_shape[0] * input_shape[1],
+            static_cast<T*>(output.device_ptr()) + i * input_shape[0] * input_shape[1],
+            &static_cast<T*>(weights.device_ptr())[i], input_shape[0] * input_shape[1]
         );
         CUDA_CHECK(cudaGetLastError());
 
         // Add biases
         Kernels::vec_scalar_add<<<gridSize, BLOCK_SIZE>>>(
-            output.data() + i * input_shape[0] * input_shape[1],
-            output.data() + i * input_shape[0] * input_shape[1],
-            &biases.data()[i], input_shape[0] * input_shape[1]
+            static_cast<T*>(output.device_ptr()) + i * input_shape[0] * input_shape[1],
+            static_cast<T*>(output.device_ptr()) + i * input_shape[0] * input_shape[1],
+            &static_cast<T*>(biases.device_ptr())[i], input_shape[0] * input_shape[1]
         );
         CUDA_CHECK(cudaGetLastError());
     }
@@ -460,12 +460,12 @@ CUDANet::Tensor& CUDA::concat_impl(
     CUDANet::Tensor& output
 ) {
     CUDA_CHECK(cudaMemcpy(
-        output.data(), input_a.data(), input_a.size(),
+        static_cast<T*>(output.device_ptr()), static_cast<T*>(input_a.device_ptr()), input_a.size(),
         cudaMemcpyDeviceToDevice
     ));
 
     CUDA_CHECK(cudaMemcpy(
-        output.data() + input_a.numel(), input_b.data(), input_b.size(),
+        static_cast<T*>(output.device_ptr()) + input_a.numel(), static_cast<T*>(input_b.device_ptr()), input_b.size(),
         cudaMemcpyDeviceToDevice
     ));
 
@@ -508,7 +508,7 @@ CUDANet::Tensor& CUDA::add_impl(
 
     auto gridSize = (input_a.numel() + BLOCK_SIZE - 1) / BLOCK_SIZE;
     Kernels::vec_vec_add<<<gridSize, BLOCK_SIZE>>>(
-        input_a.data(), input_b.data(), output.data(), input_a.numel()
+        static_cast<T*>(input_a.device_ptr()), static_cast<T*>(input_b.device_ptr()), static_cast<T*>(output.device_ptr()), input_a.numel()
     );
     CUDA_CHECK(cudaGetLastError());
     CUDA_CHECK(cudaDeviceSynchronize());
diff --git a/src/backends/cuda/tensor_ops.cu b/src/backends/cuda/tensor_ops.cu
index a39d3ce..eeaf77e 100644
--- a/src/backends/cuda/tensor_ops.cu
+++ b/src/backends/cuda/tensor_ops.cu
@@ -26,7 +26,7 @@ void CUDA::print_impl(const CUDANet::Tensor &input) {
     std::vector<T> h_vec(input.numel());
 
     CUDA_CHECK(cudaMemcpy(
-        h_vec.data(), input.data(), sizeof(T) * length, cudaMemcpyDeviceToHost
+        h_vec.data(), static_cast<T*>(input.device_ptr()), sizeof(T) * length, cudaMemcpyDeviceToHost
     ));
 
     for (int i = 0; i < length; ++i) {
@@ -56,7 +56,7 @@ template void CUDA::fill_impl<float>(CUDANet::Tensor &input, int value);
 
 template <typename T>
 void CUDA::fill_impl(CUDANet::Tensor &input, int value) {
-    CUDA_CHECK(cudaMemset(input.data(), value, sizeof(T) * input.numel()));
+    CUDA_CHECK(cudaMemset(static_cast<T*>(input.device_ptr()), value, sizeof(T) * input.numel()));
 }
 
 void CUDA::copy_to_device(CUDANet::Tensor &tensor, void *data, size_t size) {
@@ -75,7 +75,7 @@ template void CUDA::copy_to_device_impl<float>(CUDANet::Tensor &tensor, void *da
 
 template <typename T>
 void CUDA::copy_to_device_impl(CUDANet::Tensor &tensor, void *data, size_t size) {
-    CUDA_CHECK(cudaMemcpy(tensor.data(), data, size, cudaMemcpyHostToDevice));
+    CUDA_CHECK(cudaMemcpy(static_cast<T*>(tensor.device_ptr()), data, size, cudaMemcpyHostToDevice));
 }
 
 void CUDA::sum(const CUDANet::Tensor &input, CUDANet::Tensor &sum) {
@@ -95,17 +95,17 @@ template void CUDA::sum_impl<float>(const CUDANet::Tensor &input, CUDANet::Tenso
 template <typename T>
 void CUDA::sum_impl(const CUDANet::Tensor &input, CUDANet::Tensor &sum) {
     auto length = input.numel();
-    const int gridSize = ( + BLOCK_SIZE - 1) / BLOCK_SIZE;
+    const int gridSize = (length + BLOCK_SIZE - 1) / BLOCK_SIZE;
 
     CUDANet::Kernels::sum_reduce<<<gridSize, BLOCK_SIZE>>>(
-        input.data(), sum.data(), length
+        static_cast<T*>(input.device_ptr()), static_cast<T*>(sum.device_ptr()), length
     );
     CUDA_CHECK(cudaGetLastError());
 
     int remaining = gridSize;
     while (remaining > 1) {
         int blocks_needed = (remaining + BLOCK_SIZE - 1) / BLOCK_SIZE;
-        CUDANet::Kernels::sum_reduce<<<blocks_needed, BLOCK_SIZE>>>(sum.data(), sum.data(), remaining);
+        CUDANet::Kernels::sum_reduce<<<blocks_needed, BLOCK_SIZE>>>(static_cast<T*>(sum.device_ptr()), static_cast<T*>(sum.device_ptr()), remaining);
         CUDA_CHECK(cudaGetLastError());
 
         remaining = blocks_needed;
@@ -131,14 +131,14 @@ void CUDA::max_impl(const CUDANet::Tensor &input, CUDANet::Tensor &max) {
     auto length = input.numel();
     const int grid_size = (length + BLOCK_SIZE - 1) / BLOCK_SIZE;
 
-    Kernels::max_reduce<<<grid_size, BLOCK_SIZE>>>(input.data(), max.data(), length);
+    Kernels::max_reduce<<<grid_size, BLOCK_SIZE>>>(static_cast<T*>(input.device_ptr()), static_cast<T*>(max.device_ptr()), length);
     CUDA_CHECK(cudaGetLastError());
 
     int remaining = grid_size;
     while (remaining > 1) {
         int blocks_needed = (remaining + BLOCK_SIZE - 1) / BLOCK_SIZE;
-        CUDANet::Kernels::max_reduce<<<blocks_needed, BLOCK_SIZE>>>(max.data(), max.data(), remaining);
+        CUDANet::Kernels::max_reduce<<<blocks_needed, BLOCK_SIZE>>>(static_cast<T*>(max.device_ptr()), static_cast<T*>(max.device_ptr()), remaining);
         CUDA_CHECK(cudaGetLastError());
 
         remaining = blocks_needed;
 
diff --git a/src/layers/avg_pooling.cpp b/src/layers/avg_pooling.cpp
index 375e2c9..90719d4 100644
--- a/src/layers/avg_pooling.cpp
+++ b/src/layers/avg_pooling.cpp
@@ -84,11 +84,11 @@ CUDANet::Shape AvgPool2d::output_shape() {
 }
 
 size_t AvgPool2d::input_size() {
-    return sizeof(float) * in_shape[0] * in_shape[1] * in_shape[2];
+    return dtype_size(dtype) * in_shape[0] * in_shape[1] * in_shape[2];
 }
 
 size_t AvgPool2d::output_size() {
-    return sizeof(float) * out_shape[0] * out_shape[1] * out_shape[2];
+    return dtype_size(dtype) * out_shape[0] * out_shape[1] * out_shape[2];
 }
 
 void AvgPool2d::set_weights(void* input) {}
diff --git a/src/layers/batch_norm.cpp b/src/layers/batch_norm.cpp
index dc0a6e3..d73d40b 100644
--- a/src/layers/batch_norm.cpp
+++ b/src/layers/batch_norm.cpp
@@ -73,11 +73,11 @@ CUDANet::Shape BatchNorm2d::output_shape() {
 }
 
 size_t BatchNorm2d::input_size() {
-    return sizeof(float) * in_shape[0] * in_shape[1] * in_shape[2];
+    return dtype_size(dtype) * in_shape[0] * in_shape[1] * in_shape[2];
 }
 
 size_t BatchNorm2d::output_size() {
-    return sizeof(float) * in_shape[0] * in_shape[1] * in_shape[2];
+    return dtype_size(dtype) * in_shape[0] * in_shape[1] * in_shape[2];
 }
 
 void BatchNorm2d::set_weights(void* input) {
diff --git a/src/layers/conv2d.cpp b/src/layers/conv2d.cpp
index 111cb40..628b2b5 100644
--- a/src/layers/conv2d.cpp
+++ b/src/layers/conv2d.cpp
@@ -97,11 +97,11 @@ CUDANet::Shape Conv2d::output_shape() {
 }
 
 size_t Conv2d::input_size() {
-    return sizeof(float) * in_shape[0] * in_shape[1] * in_shape[2];
+    return dtype_size(dtype) * in_shape[0] * in_shape[1] * in_shape[2];
 }
 
 size_t Conv2d::output_size() {
-    return sizeof(float) * out_shape[0] * out_shape[1] * out_shape[2];
+    return dtype_size(dtype) * out_shape[0] * out_shape[1] * out_shape[2];
 }
 
 void Conv2d::set_weights(void* input) {
diff --git a/src/layers/max_pool.cpp b/src/layers/max_pool.cpp
index 38ba131..ddf2935 100644
--- a/src/layers/max_pool.cpp
+++ b/src/layers/max_pool.cpp
@@ -78,11 +78,11 @@ CUDANet::Shape MaxPool2d::output_shape() {
 }
 
 size_t MaxPool2d::input_size() {
-    return sizeof(float) * in_shape[0] * in_shape[1] * in_shape[2];
+    return dtype_size(dtype) * in_shape[0] * in_shape[1] * in_shape[2];
 }
 
 size_t MaxPool2d::output_size() {
-    return sizeof(float) * out_shape[0] * out_shape[1] * out_shape[2];
+    return dtype_size(dtype) * out_shape[0] * out_shape[1] * out_shape[2];
 }
 
 void MaxPool2d::set_weights(void* input) {}
diff --git a/src/module.cpp b/src/module.cpp
index 9d3acde..dda1b78 100644
--- a/src/module.cpp
+++ b/src/module.cpp
@@ -12,22 +12,6 @@ CUDANet::Shape Module::output_shape() {
     return out_shape;
 }
 
-size_t Module::input_size() {
-    size_t count = 1;
-    for (const auto& dim : in_shape) {
-        count *= dim;
-    }
-    return sizeof(float) * count;
-}
-
-size_t Module::output_size() {
-    size_t count = 1;
-    for (const auto& dim : out_shape) {
-        count *= dim;
-    }
-    return sizeof(float) * count;
-}
-
 void Module::register_layer(const std::string& name, Layer* layer) {
     layers.push_back({name, layer});
 }
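Note for reviewers: with this change the layer size accessors derive their byte counts from the layer's dtype via dtype_size() instead of a hard-coded sizeof(float). The following is a minimal, self-contained sketch of that arithmetic; the enum and helper mirror the additions to include/tensor.hpp, while the shape values and the main() harness are purely illustrative and not part of the patch.

#include <cstddef>
#include <iostream>
#include <stdexcept>

// Mirrors the DType / dtype_size pair added in include/tensor.hpp.
enum class DType { FLOAT32 };

inline size_t dtype_size(DType dtype) {
    switch (dtype) {
        case DType::FLOAT32: return 4;
        default: throw std::runtime_error("Unknown DType");
    }
}

int main() {
    // Example HWC shape {224, 224, 3}: a layer's input_size() is now
    // dtype_size(dtype) * H * W * C rather than sizeof(float) * H * W * C.
    const size_t in_shape[3] = {224, 224, 3};
    const DType dtype = DType::FLOAT32;
    const size_t bytes = dtype_size(dtype) * in_shape[0] * in_shape[1] * in_shape[2];
    std::cout << bytes << " bytes\n";  // prints 602112 for FLOAT32
    return 0;
}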