Use const T* for input tensors in layer and tensor operations

2025-11-28 21:41:38 +01:00
parent 71dc5a924d
commit 6318d52f12
2 changed files with 12 additions and 12 deletions
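In short: every device pointer that a kernel only reads is now passed as const T* instead of T*. A static_cast from void* may freely add const, so only the type seen by the kernel changes, and any accidental write through such a parameter becomes a compile error. A minimal standalone sketch of the idea, using a hypothetical kernel that is not part of this repo:

#include <cuda_runtime.h>

// Hypothetical kernel: 'in' is read-only, 'out' is written.
__global__ void scale(const float* in, float* out, float s, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) out[i] = s * in[i];
    // in[i] = 0.0f;  // would not compile: 'in' points to const float
}

void launch_scale(void* d_in, void* d_out, float s, int n) {
    int block = 256;
    int grid  = (n + block - 1) / block;
    // static_cast from void* to const float* is allowed; it only adds const.
    scale<<<grid, block>>>(static_cast<const float*>(d_in),
                           static_cast<float*>(d_out), s, n);
}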

View File

@@ -142,13 +142,13 @@ CUDANet::Tensor& CUDA::dense_impl(
     auto biasGridSize = (output_size + BLOCK_SIZE - 1) / BLOCK_SIZE;
     Kernels::mat_vec_mul<<<forwardGridSize, BLOCK_SIZE>>>(
-        static_cast<T*>(weights.device_ptr()), static_cast<T*>(input.device_ptr()), static_cast<T*>(output.device_ptr()), input_size,
+        static_cast<const T*>(weights.device_ptr()), static_cast<const T*>(input.device_ptr()), static_cast<T*>(output.device_ptr()), input_size,
         output_size
     );
     CUDA_CHECK(cudaGetLastError());
     Kernels::vec_vec_add<<<biasGridSize, BLOCK_SIZE>>>(
-        static_cast<T*>(biases.device_ptr()), static_cast<T*>(output.device_ptr()), static_cast<T*>(output.device_ptr()), output_size
+        static_cast<const T*>(biases.device_ptr()), static_cast<T*>(output.device_ptr()), static_cast<T*>(output.device_ptr()), output_size
     );
     CUDA_CHECK(cudaGetLastError());
     CUDA_CHECK(cudaDeviceSynchronize());
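For the dense-layer calls above to compile, the kernels must accept const T* for their read-only arguments. Roughly, declarations of the following shape are implied by the call sites; the parameter names are guesses:

namespace Kernels {

// Assumed declarations, inferred only from the launches above.
template <typename T>
__global__ void mat_vec_mul(const T* weights, const T* input, T* output,
                            int input_size, int output_size);

template <typename T>
__global__ void vec_vec_add(const T* a, const T* b, T* out, int n);

}  // namespace Kernels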
@@ -213,7 +213,7 @@ CUDANet::Tensor& CUDA::conv2d_impl(
     );
     Kernels::convolution<<<grid, block>>>(
-        static_cast<T*>(input.device_ptr()), static_cast<T*>(weights.device_ptr()), static_cast<T*>(biases.device_ptr()), static_cast<T*>(output.device_ptr()),
+        static_cast<const T*>(input.device_ptr()), static_cast<const T*>(weights.device_ptr()), static_cast<const T*>(biases.device_ptr()), static_cast<T*>(output.device_ptr()),
         in_shape, padding_shape, kernel_shape, stride_shape, out_shape
     );
     CUDA_CHECK(cudaGetLastError());
@@ -273,7 +273,7 @@ CUDANet::Tensor& CUDA::max_pool2d_impl(
     );
     Kernels::max_pool<<<grid, block>>>(
-        static_cast<T*>(input.device_ptr()), static_cast<T*>(output.device_ptr()), input_shape, output_shape,
+        static_cast<const T*>(input.device_ptr()), static_cast<T*>(output.device_ptr()), input_shape, output_shape,
         pool_shape, stride_shape, padding_shape
     );
     CUDA_CHECK(cudaGetLastError());
@@ -333,7 +333,7 @@ CUDANet::Tensor& CUDA::avg_pool2d_impl(
     );
     Kernels::avg_pool<<<grid, block>>>(
-        static_cast<T*>(input.device_ptr()), static_cast<T*>(output.device_ptr()), input_shape, output_shape,
+        static_cast<const T*>(input.device_ptr()), static_cast<T*>(output.device_ptr()), input_shape, output_shape,
         pool_shape, stride_shape, padding_shape
     );
     CUDA_CHECK(cudaGetLastError());
@@ -394,7 +394,7 @@ CUDANet::Tensor& CUDA::batch_norm_impl(
     for (int i = 0; i < input_shape[2]; i++) {
         // Subtract mean from input
         Kernels::vec_scalar_sub<<<gridSize, BLOCK_SIZE>>>(
-            static_cast<T*>(input.device_ptr()) + i * input_shape[0] * input_shape[1],
+            static_cast<const T*>(input.device_ptr()) + i * input_shape[0] * input_shape[1],
             static_cast<T*>(output.device_ptr()) + i * input_shape[0] * input_shape[1],
             &static_cast<T*>(running_mean.device_ptr())[i], input_shape[0] * input_shape[1]
         );
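Here the const input pointer is offset by i * input_shape[0] * input_shape[1] so each iteration hands one channel-sized slice to vec_scalar_sub, with the running mean passed as a device pointer. A sketch of the shape such a kernel might have, inferred from the call site (the parameter names and the const on the scalar pointer are assumptions):

// Assumed shape of vec_scalar_sub; the scalar lives in device memory.
template <typename T>
__global__ void vec_scalar_sub(const T* in, T* out, const T* scalar, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) out[i] = in[i] - *scalar;
}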
@@ -460,12 +460,12 @@ CUDANet::Tensor& CUDA::concat_impl(
     CUDANet::Tensor& output
 ) {
     CUDA_CHECK(cudaMemcpy(
-        static_cast<T*>(output.device_ptr()), static_cast<T*>(input_a.device_ptr()), input_a.size(),
+        static_cast<T*>(output.device_ptr()), static_cast<const T*>(input_a.device_ptr()), input_a.size(),
         cudaMemcpyDeviceToDevice
     ));
     CUDA_CHECK(cudaMemcpy(
-        static_cast<T*>(output.device_ptr()) + input_a.numel(), static_cast<T*>(input_b.device_ptr()), input_b.size(),
+        static_cast<T*>(output.device_ptr()) + input_a.numel(), static_cast<const T*>(input_b.device_ptr()), input_b.size(),
         cudaMemcpyDeviceToDevice
     ));
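The memcpy-based concat path needs no kernel changes at all: cudaMemcpy already takes its source as const void*, so the const T* cast only documents the read-only intent at the call site. For reference, the runtime API signature is:

cudaError_t cudaMemcpy(void* dst, const void* src, size_t count, cudaMemcpyKind kind);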
@@ -508,7 +508,7 @@ CUDANet::Tensor& CUDA::add_impl(
     auto gridSize = (input_a.numel() + BLOCK_SIZE - 1) / BLOCK_SIZE;
     Kernels::vec_vec_add<<<gridSize, BLOCK_SIZE>>>(
-        static_cast<T*>(input_a.device_ptr()), static_cast<T*>(input_b.device_ptr()), static_cast<T*>(output.device_ptr()), input_a.numel()
+        static_cast<const T*>(input_a.device_ptr()), static_cast<const T*>(input_b.device_ptr()), static_cast<T*>(output.device_ptr()), input_a.numel()
     );
     CUDA_CHECK(cudaGetLastError());
     CUDA_CHECK(cudaDeviceSynchronize());

View File

@@ -26,7 +26,7 @@ void CUDA::print_impl(const CUDANet::Tensor &input) {
     std::vector<T> h_vec(input.numel());
     CUDA_CHECK(cudaMemcpy(
-        h_vec.data(), static_cast<T*>(input.device_ptr()), sizeof(T) * length, cudaMemcpyDeviceToHost
+        h_vec.data(), static_cast<const T*>(input.device_ptr()), sizeof(T) * length, cudaMemcpyDeviceToHost
     ));
     for (int i = 0; i < length; ++i) {
@@ -98,7 +98,7 @@ void CUDA::sum_impl(const CUDANet::Tensor &input, CUDANet::Tensor &sum) {
     const int gridSize = (length + BLOCK_SIZE - 1) / BLOCK_SIZE;
     CUDANet::Kernels::sum_reduce<<<gridSize, BLOCK_SIZE>>>(
-        static_cast<T*>(input.device_ptr()), static_cast<T*>(sum.device_ptr()), length
+        static_cast<const T*>(input.device_ptr()), static_cast<T*>(sum.device_ptr()), length
     );
     CUDA_CHECK(cudaGetLastError());
@@ -131,7 +131,7 @@ void CUDA::max_impl(const CUDANet::Tensor &input, CUDANet::Tensor &max) {
     auto length = input.numel();
     const int grid_size = (length + BLOCK_SIZE - 1) / BLOCK_SIZE;
-    Kernels::max_reduce<<<grid_size, BLOCK_SIZE>>>(static_cast<T*>(input.device_ptr()), static_cast<T*>(max.device_ptr()), length);
+    Kernels::max_reduce<<<grid_size, BLOCK_SIZE>>>(static_cast<const T*>(input.device_ptr()), static_cast<T*>(max.device_ptr()), length);
     CUDA_CHECK(cudaGetLastError());
     int remaining = grid_size;
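Reduction kernels only ever read the source tensor, so const T* is the natural input type there too; only the first pass touches the user tensor, while later passes keep reducing the partial results. A generic sketch of a block-wise max reduction over a read-only input, not the library's actual kernel:

// Assumes blockDim.x is a power of two and at most 256.
template <typename T>
__global__ void max_reduce_sketch(const T* in, T* partial, int n) {
    __shared__ T tile[256];
    int tid = threadIdx.x;
    int i   = blockIdx.x * blockDim.x + tid;
    tile[tid] = (i < n) ? in[i] : in[0];  // pad out-of-range threads with a real element
    __syncthreads();
    for (int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (tid < s && tile[tid + s] > tile[tid]) tile[tid] = tile[tid + s];
        __syncthreads();
    }
    if (tid == 0) partial[blockIdx.x] = tile[0];  // one partial max per block
}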