Compare commits


2 Commits

4 changed files with 27 additions and 24 deletions

View File

@@ -16,19 +16,9 @@ enum class DType
     // INT32, // Not implemented yet
 };
-size_t dtype_size(DType dtype) {
-    switch (dtype)
-    {
-        case DType::FLOAT32:
-            return 4;
-            break;
-        default:
-            throw std::runtime_error("Unknown DType");
-            break;
-    }
-}
+size_t dtype_size(DType dtype);
 // Forward declaration
 class Backend;
 class Tensor
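
Worth noting why this hunk matters: the original `dtype_size` was a non-`inline` function defined in a header, so including the header from two translation units would produce a multiple-definition linker error under the one-definition rule. Leaving only the declaration here and moving the body into a single source file (the last file in this compare) fixes that. A minimal sketch of the split, with a stand-in enum so the snippet compiles on its own:

```cpp
#include <cstddef>
#include <stdexcept>

// Stand-in for the real enum; the actual one lives in this header.
enum class DType { FLOAT32 };

// Header side: declaration only -- safe to include from any number
// of translation units.
std::size_t dtype_size(DType dtype);

// Source-file side (one translation unit only): the single definition.
std::size_t dtype_size(DType dtype) {
    switch (dtype) {
        case DType::FLOAT32: return 4;
        default: throw std::runtime_error("Unknown DType");
    }
}
```

Marking the definition `inline` (or `constexpr`) would also satisfy the one-definition rule while keeping it in the header; the declaration/definition split used here keeps the header lighter instead.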

View File

@@ -142,13 +142,13 @@ CUDANet::Tensor& CUDA::dense_impl(
     auto biasGridSize = (output_size + BLOCK_SIZE - 1) / BLOCK_SIZE;
     Kernels::mat_vec_mul<<<forwardGridSize, BLOCK_SIZE>>>(
-        static_cast<T*>(weights.device_ptr()), static_cast<T*>(input.device_ptr()), static_cast<T*>(output.device_ptr()), input_size,
+        static_cast<const T*>(weights.device_ptr()), static_cast<const T*>(input.device_ptr()), static_cast<T*>(output.device_ptr()), input_size,
         output_size
     );
     CUDA_CHECK(cudaGetLastError());
     Kernels::vec_vec_add<<<biasGridSize, BLOCK_SIZE>>>(
-        static_cast<T*>(biases.device_ptr()), static_cast<T*>(output.device_ptr()), static_cast<T*>(output.device_ptr()), output_size
+        static_cast<const T*>(biases.device_ptr()), static_cast<T*>(output.device_ptr()), static_cast<T*>(output.device_ptr()), output_size
     );
     CUDA_CHECK(cudaGetLastError());
     CUDA_CHECK(cudaDeviceSynchronize());
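
These cast changes only compile cleanly if the kernels themselves take `const T*` for their read-only operands, which a matching kernel-signature change (not shown in this compare) presumably makes. As an illustration of the convention, a naive sketch of a `mat_vec_mul`-style signature with const-qualified inputs; the body is a deliberately simple placeholder, not CUDANet's actual kernel:

```cuda
template <typename T>
__global__ void mat_vec_mul_sketch(
    const T* mat,   // read-only weights, rows x cols
    const T* vec,   // read-only input vector, cols
    T*       out,   // written output, rows
    int      cols,
    int      rows
) {
    int row = blockIdx.x * blockDim.x + threadIdx.x;
    if (row >= rows) return;  // guard threads past the end
    T acc = T(0);
    for (int col = 0; col < cols; ++col) {
        acc += mat[row * cols + col] * vec[col];
    }
    out[row] = acc;
}
```

Besides documenting intent, const-qualifying kernel inputs lets callers pass pointers derived from `const` tensors without a `const_cast`, and gives the compiler more freedom, e.g. routing loads through the read-only cache when combined with `__restrict__`.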
@@ -213,7 +213,7 @@ CUDANet::Tensor& CUDA::conv2d_impl(
     );
     Kernels::convolution<<<grid, block>>>(
-        static_cast<T*>(input.device_ptr()), static_cast<T*>(weights.device_ptr()), static_cast<T*>(biases.device_ptr()), static_cast<T*>(output.device_ptr()),
+        static_cast<const T*>(input.device_ptr()), static_cast<const T*>(weights.device_ptr()), static_cast<const T*>(biases.device_ptr()), static_cast<T*>(output.device_ptr()),
         in_shape, padding_shape, kernel_shape, stride_shape, out_shape
     );
     CUDA_CHECK(cudaGetLastError());
@@ -273,7 +273,7 @@ CUDANet::Tensor& CUDA::max_pool2d_impl(
     );
     Kernels::max_pool<<<grid, block>>>(
-        static_cast<T*>(input.device_ptr()), static_cast<T*>(output.device_ptr()), input_shape, output_shape,
+        static_cast<const T*>(input.device_ptr()), static_cast<T*>(output.device_ptr()), input_shape, output_shape,
         pool_shape, stride_shape, padding_shape
     );
     CUDA_CHECK(cudaGetLastError());
@@ -333,7 +333,7 @@ CUDANet::Tensor& CUDA::avg_pool2d_impl(
     );
     Kernels::avg_pool<<<grid, block>>>(
-        static_cast<T*>(input.device_ptr()), static_cast<T*>(output.device_ptr()), input_shape, output_shape,
+        static_cast<const T*>(input.device_ptr()), static_cast<T*>(output.device_ptr()), input_shape, output_shape,
         pool_shape, stride_shape, padding_shape
     );
     CUDA_CHECK(cudaGetLastError());
@@ -394,7 +394,7 @@ CUDANet::Tensor& CUDA::batch_norm_impl(
     for (int i = 0; i < input_shape[2]; i++) {
         // Subtract mean from input
         Kernels::vec_scalar_sub<<<gridSize, BLOCK_SIZE>>>(
-            static_cast<T*>(input.device_ptr()) + i * input_shape[0] * input_shape[1],
+            static_cast<const T*>(input.device_ptr()) + i * input_shape[0] * input_shape[1],
             static_cast<T*>(output.device_ptr()) + i * input_shape[0] * input_shape[1],
             &static_cast<T*>(running_mean.device_ptr())[i], input_shape[0] * input_shape[1]
         );
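
For readers puzzling over the pointer arithmetic: the loop processes one channel plane of `input_shape[0] * input_shape[1]` elements per iteration, and `&ptr[i]` is simply `ptr + i`, the address of the i-th per-channel running mean. A tiny worked example, assuming `input_shape` is {height, width, channels}:

```cpp
#include <cstdio>

int main() {
    // Hypothetical shape: height 4, width 4, 3 channels.
    int input_shape[3] = {4, 4, 3};
    int plane = input_shape[0] * input_shape[1];  // 16 elements per channel
    for (int i = 0; i < input_shape[2]; i++) {
        // channel i occupies elements [i * plane, (i + 1) * plane)
        // and is normalized against the single scalar running_mean[i]
        std::printf("channel %d starts at element %d\n", i, i * plane);
    }
}
```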
@@ -460,12 +460,12 @@ CUDANet::Tensor& CUDA::concat_impl(
     CUDANet::Tensor& output
 ) {
     CUDA_CHECK(cudaMemcpy(
-        static_cast<T*>(output.device_ptr()), static_cast<T*>(input_a.device_ptr()), input_a.size(),
+        static_cast<T*>(output.device_ptr()), static_cast<const T*>(input_a.device_ptr()), input_a.size(),
         cudaMemcpyDeviceToDevice
     ));
     CUDA_CHECK(cudaMemcpy(
-        static_cast<T*>(output.device_ptr()) + input_a.numel(), static_cast<T*>(input_b.device_ptr()), input_b.size(),
+        static_cast<T*>(output.device_ptr()) + input_a.numel(), static_cast<const T*>(input_b.device_ptr()), input_b.size(),
         cudaMemcpyDeviceToDevice
     ));
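
Two details keep these copies well-formed. First, `cudaMemcpy`'s source parameter is already `const void*`, so the new `const T*` casts convert implicitly. Second, the destination offset is expressed in elements while the copy length is in bytes; assuming `Tensor::size()` returns `numel() * sizeof(T)` bytes, the two views agree. A standalone sketch of that equivalence for the FLOAT32 case:

```cpp
#include <cassert>
#include <cstddef>

// Sketch only: advancing an element pointer by numel equals advancing
// a byte pointer by size, assuming size == numel * sizeof(float).
int main() {
    float buffer[8] = {};
    std::size_t numel_a = 5;                        // stand-in for input_a.numel()
    std::size_t size_a  = numel_a * sizeof(float);  // stand-in for input_a.size()
    float* dst_elems = buffer + numel_a;
    char*  dst_bytes = reinterpret_cast<char*>(buffer) + size_a;
    assert(reinterpret_cast<char*>(dst_elems) == dst_bytes);
    // So the second memcpy appends input_b directly after input_a.
}
```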
@@ -508,7 +508,7 @@ CUDANet::Tensor& CUDA::add_impl(
     auto gridSize = (input_a.numel() + BLOCK_SIZE - 1) / BLOCK_SIZE;
     Kernels::vec_vec_add<<<gridSize, BLOCK_SIZE>>>(
-        static_cast<T*>(input_a.device_ptr()), static_cast<T*>(input_b.device_ptr()), static_cast<T*>(output.device_ptr()), input_a.numel()
+        static_cast<const T*>(input_a.device_ptr()), static_cast<const T*>(input_b.device_ptr()), static_cast<T*>(output.device_ptr()), input_a.numel()
     );
     CUDA_CHECK(cudaGetLastError());
     CUDA_CHECK(cudaDeviceSynchronize());
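
The grid-size expression `(n + BLOCK_SIZE - 1) / BLOCK_SIZE` that recurs through this file is integer ceiling division: the smallest number of BLOCK_SIZE-thread blocks covering n elements. A small standalone check of the idiom (the `ceil_div` helper and the value 256 are illustrative, not taken from CUDANet):

```cpp
#include <cassert>

constexpr int ceil_div(int n, int block) {
    return (n + block - 1) / block;  // ceiling of n / block for n, block > 0
}

int main() {
    constexpr int BLOCK_SIZE = 256;           // assumed typical block size
    assert(ceil_div(1000, BLOCK_SIZE) == 4);  // 4 * 256 = 1024 threads cover 1000
    assert(ceil_div(1024, BLOCK_SIZE) == 4);  // exact multiple: no extra block
    assert(ceil_div(1025, BLOCK_SIZE) == 5);  // one element past: one more block
}
```

The kernel's own bounds check then discards the surplus threads in the last block.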

View File

@@ -26,7 +26,7 @@ void CUDA::print_impl(const CUDANet::Tensor &input) {
     std::vector<T> h_vec(input.numel());
     CUDA_CHECK(cudaMemcpy(
-        h_vec.data(), static_cast<T*>(input.device_ptr()), sizeof(T) * length, cudaMemcpyDeviceToHost
+        h_vec.data(), static_cast<const T*>(input.device_ptr()), sizeof(T) * length, cudaMemcpyDeviceToHost
     ));
     for (int i = 0; i < length; ++i) {
@@ -98,7 +98,7 @@ void CUDA::sum_impl(const CUDANet::Tensor &input, CUDANet::Tensor &sum) {
     const int gridSize = (length + BLOCK_SIZE - 1) / BLOCK_SIZE;
     CUDANet::Kernels::sum_reduce<<<gridSize, BLOCK_SIZE>>>(
-        static_cast<T*>(input.device_ptr()), static_cast<T*>(sum.device_ptr()), length
+        static_cast<const T*>(input.device_ptr()), static_cast<T*>(sum.device_ptr()), length
    );
     CUDA_CHECK(cudaGetLastError());
@@ -131,7 +131,7 @@ void CUDA::max_impl(const CUDANet::Tensor &input, CUDANet::Tensor &max) {
     auto length = input.numel();
     const int grid_size = (length + BLOCK_SIZE - 1) / BLOCK_SIZE;
-    Kernels::max_reduce<<<grid_size, BLOCK_SIZE>>>(static_cast<T*>(input.device_ptr()), static_cast<T*>(max.device_ptr()), length);
+    Kernels::max_reduce<<<grid_size, BLOCK_SIZE>>>(static_cast<const T*>(input.device_ptr()), static_cast<T*>(max.device_ptr()), length);
     CUDA_CHECK(cudaGetLastError());
     int remaining = grid_size;
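
The `int remaining = grid_size;` this hunk ends on hints at the standard multi-pass reduction shape: each launch collapses a block's worth of values into one partial result, and the kernel is relaunched over the partials until a single value is left. A hedged sketch of such a driver loop; it reuses identifiers from the surrounding file (`T`, `Kernels::max_reduce`, `BLOCK_SIZE`, `CUDA_CHECK`) and assumes the kernel writes one partial maximum per block and may read its own previous output, so it is a fragment, not CUDANet's actual code:

```cuda
// Illustrative multi-pass reduction driver (assumptions as above).
const T* src      = static_cast<const T*>(input.device_ptr());
T*       partials = static_cast<T*>(max.device_ptr());
int remaining = static_cast<int>(length);
do {
    int blocks = (remaining + BLOCK_SIZE - 1) / BLOCK_SIZE;
    Kernels::max_reduce<<<blocks, BLOCK_SIZE>>>(src, partials, remaining);
    CUDA_CHECK(cudaGetLastError());
    src = partials;     // later passes reduce the partials in place
    remaining = blocks; // one partial per block survives each pass
} while (remaining > 1);
```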

View File

@@ -4,6 +4,19 @@
 using namespace CUDANet;
+size_t dtype_size(DType dtype) {
+    switch (dtype)
+    {
+        case DType::FLOAT32:
+            return 4;
+            break;
+        default:
+            throw std::runtime_error("Unknown DType");
+            break;
+    }
+}
+
 Tensor::Tensor(Shape shape, CUDANet::Backend* backend)
     : Tensor(shape, backend->get_default_dtype(), backend) {}
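
The constructor in the trailing context is a delegating constructor: rather than duplicating initialization, it forwards to a fuller overload (presumably `Tensor(Shape, DType, Backend*)`), filling in the backend's default dtype. A minimal standalone mirror of the pattern; every name here is a simplified stand-in, not CUDANet's real API:

```cpp
#include <cstddef>

enum class DType { FLOAT32 };

struct Backend {
    DType get_default_dtype() const { return DType::FLOAT32; }
};

class Tensor {
public:
    // Full overload: the caller picks the dtype explicitly.
    Tensor(std::size_t numel, DType dtype, Backend* backend)
        : numel_(numel), dtype_(dtype), backend_(backend) {}

    // Delegating constructor, as in the diff above: supplies the
    // backend's default dtype so callers can omit it.
    Tensor(std::size_t numel, Backend* backend)
        : Tensor(numel, backend->get_default_dtype(), backend) {}

private:
    std::size_t numel_;
    DType dtype_;
    Backend* backend_;
};

int main() {
    Backend be;
    Tensor a(16, &be);                  // uses the backend's default dtype
    Tensor b(16, DType::FLOAT32, &be);  // explicit dtype
}
```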