From 6318d52f129bbd9c15828ed36821cbd408aa5737 Mon Sep 17 00:00:00 2001
From: LordMathis
Date: Fri, 28 Nov 2025 21:41:38 +0100
Subject: [PATCH] Use const T* for input tensors in layer and tensor operations

---
 src/backends/cuda/layer_ops.cu  | 18 +++++++++---------
 src/backends/cuda/tensor_ops.cu |  6 +++---
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/src/backends/cuda/layer_ops.cu b/src/backends/cuda/layer_ops.cu
index 4d927f3..f74103d 100644
--- a/src/backends/cuda/layer_ops.cu
+++ b/src/backends/cuda/layer_ops.cu
@@ -142,13 +142,13 @@ CUDANet::Tensor& CUDA::dense_impl(
     auto biasGridSize = (output_size + BLOCK_SIZE - 1) / BLOCK_SIZE;
 
     Kernels::mat_vec_mul<<<gridSize, BLOCK_SIZE>>>(
-        static_cast<T*>(weights.device_ptr()), static_cast<T*>(input.device_ptr()), static_cast<T*>(output.device_ptr()), input_size,
+        static_cast<const T*>(weights.device_ptr()), static_cast<const T*>(input.device_ptr()), static_cast<T*>(output.device_ptr()), input_size,
         output_size
     );
     CUDA_CHECK(cudaGetLastError());
 
     Kernels::vec_vec_add<<<biasGridSize, BLOCK_SIZE>>>(
-        static_cast<T*>(biases.device_ptr()), static_cast<T*>(output.device_ptr()), static_cast<T*>(output.device_ptr()), output_size
+        static_cast<const T*>(biases.device_ptr()), static_cast<const T*>(output.device_ptr()), static_cast<T*>(output.device_ptr()), output_size
     );
     CUDA_CHECK(cudaGetLastError());
     CUDA_CHECK(cudaDeviceSynchronize());
@@ -213,7 +213,7 @@ CUDANet::Tensor& CUDA::conv2d_impl(
     );
 
     Kernels::convolution<<<gridSize, BLOCK_SIZE>>>(
-        static_cast<T*>(input.device_ptr()), static_cast<T*>(weights.device_ptr()), static_cast<T*>(biases.device_ptr()), static_cast<T*>(output.device_ptr()),
+        static_cast<const T*>(input.device_ptr()), static_cast<const T*>(weights.device_ptr()), static_cast<const T*>(biases.device_ptr()), static_cast<T*>(output.device_ptr()),
         in_shape, padding_shape, kernel_shape, stride_shape, out_shape
     );
     CUDA_CHECK(cudaGetLastError());
@@ -273,7 +273,7 @@ CUDANet::Tensor& CUDA::max_pool2d_impl(
     );
 
     Kernels::max_pool<<<gridSize, BLOCK_SIZE>>>(
-        static_cast<T*>(input.device_ptr()), static_cast<T*>(output.device_ptr()), input_shape, output_shape,
+        static_cast<const T*>(input.device_ptr()), static_cast<T*>(output.device_ptr()), input_shape, output_shape,
         pool_shape, stride_shape, padding_shape
     );
     CUDA_CHECK(cudaGetLastError());
@@ -333,7 +333,7 @@ CUDANet::Tensor& CUDA::avg_pool2d_impl(
     );
 
     Kernels::avg_pool<<<gridSize, BLOCK_SIZE>>>(
-        static_cast<T*>(input.device_ptr()), static_cast<T*>(output.device_ptr()), input_shape, output_shape,
+        static_cast<const T*>(input.device_ptr()), static_cast<T*>(output.device_ptr()), input_shape, output_shape,
         pool_shape, stride_shape, padding_shape
     );
     CUDA_CHECK(cudaGetLastError());
@@ -394,7 +394,7 @@ CUDANet::Tensor& CUDA::batch_norm_impl(
     for (int i = 0; i < input_shape[2]; i++) {
         // Subtract mean from input
         Kernels::vec_scalar_sub<<<gridSize, BLOCK_SIZE>>>(
-            static_cast<T*>(input.device_ptr()) + i * input_shape[0] * input_shape[1],
+            static_cast<const T*>(input.device_ptr()) + i * input_shape[0] * input_shape[1],
             static_cast<T*>(output.device_ptr()) + i * input_shape[0] * input_shape[1],
             &static_cast<T*>(running_mean.device_ptr())[i], input_shape[0] * input_shape[1]
         );
@@ -460,12 +460,12 @@ CUDANet::Tensor& CUDA::concat_impl(
     CUDANet::Tensor& output
 ) {
     CUDA_CHECK(cudaMemcpy(
-        static_cast<T*>(output.device_ptr()), static_cast<T*>(input_a.device_ptr()), input_a.size(),
+        static_cast<T*>(output.device_ptr()), static_cast<const T*>(input_a.device_ptr()), input_a.size(),
         cudaMemcpyDeviceToDevice
     ));
 
     CUDA_CHECK(cudaMemcpy(
-        static_cast<T*>(output.device_ptr()) + input_a.numel(), static_cast<T*>(input_b.device_ptr()), input_b.size(),
+        static_cast<T*>(output.device_ptr()) + input_a.numel(), static_cast<const T*>(input_b.device_ptr()), input_b.size(),
         cudaMemcpyDeviceToDevice
     ));
 
@@ -508,7 +508,7 @@ CUDANet::Tensor& CUDA::add_impl(
     auto gridSize = (input_a.numel() + BLOCK_SIZE - 1) / BLOCK_SIZE;
 
     Kernels::vec_vec_add<<<gridSize, BLOCK_SIZE>>>(
-        static_cast<T*>(input_a.device_ptr()), static_cast<T*>(input_b.device_ptr()), static_cast<T*>(output.device_ptr()), input_a.numel()
+        static_cast<const T*>(input_a.device_ptr()), static_cast<const T*>(input_b.device_ptr()), static_cast<T*>(output.device_ptr()), input_a.numel()
     );
     CUDA_CHECK(cudaGetLastError());
     CUDA_CHECK(cudaDeviceSynchronize());
diff --git a/src/backends/cuda/tensor_ops.cu b/src/backends/cuda/tensor_ops.cu
index 901a52a..d5550f9 100644
--- a/src/backends/cuda/tensor_ops.cu
+++ b/src/backends/cuda/tensor_ops.cu
@@ -26,7 +26,7 @@ void CUDA::print_impl(const CUDANet::Tensor &input) {
     std::vector<T> h_vec(input.numel());
 
     CUDA_CHECK(cudaMemcpy(
-        h_vec.data(), static_cast<T*>(input.device_ptr()), sizeof(T) * length, cudaMemcpyDeviceToHost
+        h_vec.data(), static_cast<const T*>(input.device_ptr()), sizeof(T) * length, cudaMemcpyDeviceToHost
     ));
 
     for (int i = 0; i < length; ++i) {
@@ -98,7 +98,7 @@ void CUDA::sum_impl(const CUDANet::Tensor &input, CUDANet::Tensor &sum) {
     const int gridSize = (length + BLOCK_SIZE - 1) / BLOCK_SIZE;
 
     CUDANet::Kernels::sum_reduce<<<gridSize, BLOCK_SIZE>>>(
-        static_cast<T*>(input.device_ptr()), static_cast<T*>(sum.device_ptr()), length
+        static_cast<const T*>(input.device_ptr()), static_cast<T*>(sum.device_ptr()), length
     );
     CUDA_CHECK(cudaGetLastError());
 
@@ -131,7 +131,7 @@ void CUDA::max_impl(const CUDANet::Tensor &input, CUDANet::Tensor &max) {
     auto length = input.numel();
     const int grid_size = (length + BLOCK_SIZE - 1) / BLOCK_SIZE;
 
-    Kernels::max_reduce<<<grid_size, BLOCK_SIZE>>>(static_cast<T*>(input.device_ptr()), static_cast<T*>(max.device_ptr()), length);
+    Kernels::max_reduce<<<grid_size, BLOCK_SIZE>>>(static_cast<const T*>(input.device_ptr()), static_cast<T*>(max.device_ptr()), length);
     CUDA_CHECK(cudaGetLastError());
 
     int remaining = grid_size;
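
Note (editor's addition, not part of the patch): the sketch below illustrates the const-correctness pattern this change applies, i.e. casting a type-erased void* device pointer to const T* for read-only operands and to T* only for the destination. The vec_vec_add kernel and raw cudaMalloc buffers here are hypothetical stand-ins for CUDANet's Kernels::vec_vec_add and Tensor::device_ptr(), not the library's actual API.

    // const_inputs_sketch.cu -- minimal illustration of "const T* for inputs"
    #include <cuda_runtime.h>
    #include <cstdio>

    // Read-only operands are const T*; only the destination is mutable.
    template <typename T>
    __global__ void vec_vec_add(const T* a, const T* b, T* out, int n) {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n) out[i] = a[i] + b[i];
    }

    int main() {
        const int n = 4;
        float h_a[n] = {1, 2, 3, 4}, h_b[n] = {10, 20, 30, 40}, h_out[n];
        // void* stands in for the type-erased pointer device_ptr() returns.
        void *d_a, *d_b, *d_out;
        cudaMalloc(&d_a, n * sizeof(float));
        cudaMalloc(&d_b, n * sizeof(float));
        cudaMalloc(&d_out, n * sizeof(float));
        cudaMemcpy(d_a, h_a, n * sizeof(float), cudaMemcpyHostToDevice);
        cudaMemcpy(d_b, h_b, n * sizeof(float), cudaMemcpyHostToDevice);

        // Inputs are cast to const float*, the output to float*, matching
        // the convention the patch adopts.
        vec_vec_add<float><<<1, 32>>>(
            static_cast<const float*>(d_a), static_cast<const float*>(d_b),
            static_cast<float*>(d_out), n
        );
        cudaDeviceSynchronize();
        cudaMemcpy(h_out, d_out, n * sizeof(float), cudaMemcpyDeviceToHost);
        for (int i = 0; i < n; ++i) printf("%g ", h_out[i]);  // 11 22 33 44
        printf("\n");
        cudaFree(d_a); cudaFree(d_b); cudaFree(d_out);
        return 0;
    }

Marking inputs const documents intent and turns an accidental write to an input into a compile error inside the kernel; combined with __restrict__ it can also let the compiler route those loads through the read-only data cache.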