#include <iostream>
#include <vector>

#include "backend.hpp"
#include "backend/cuda.cuh"
#include "utils/cuda_helper.cuh"
#include "kernels/matmul.cuh"

using namespace CUDANet::Backend;

// Copy a device tensor to host memory and stream its elements to stdout.
// Debug utility only: performs a blocking device-to-host cudaMemcpy.
void CUDA::print(const CUDANet::Tensor &input) {
    auto length = input.numel();

    std::vector<float> h_vec(length);
    CUDA_CHECK(cudaMemcpy(
        h_vec.data(), input.data(), sizeof(float) * length,
        cudaMemcpyDeviceToHost
    ));

    // size_t index avoids a signed/unsigned comparison if numel() is unsigned.
    for (size_t i = 0; i < length; ++i) {
        std::cout << h_vec[i] << ", ";
    }
    std::cout << std::endl;
}

// Fill the tensor's device buffer with zeros.
// cudaMemset is byte-wise, which is correct only for the value 0.
void CUDA::zero(CUDANet::Tensor &input) {
    CUDA_CHECK(cudaMemset(input.data(), 0, sizeof(float) * input.numel()));
}

// Blocking host-to-device copy of `size` bytes from `data` into the tensor.
// Caller must ensure `size` does not exceed the tensor's allocation.
void CUDA::copy_to_device(CUDANet::Tensor &tensor, void *data, size_t size) {
    CUDA_CHECK(cudaMemcpy(tensor.data(), data, size, cudaMemcpyHostToDevice));
}

// Reduce `input` to a single total in sum[0] via iterated block-level sums.
// NOTE(review): `sum` must hold at least ceil(numel / BLOCK_SIZE) elements
// for the partial results, and sum_reduce is assumed to be safe reading and
// writing the same buffer across passes — confirm against the kernel.
void CUDA::sum(const CUDANet::Tensor &input, CUDANet::Tensor &sum) {
    auto length = input.numel();

    const int gridSize = (length + BLOCK_SIZE - 1) / BLOCK_SIZE;

    // First pass: one partial sum per block, written into `sum`.
    CUDANet::Kernels::sum_reduce<<<gridSize, BLOCK_SIZE>>>(
        input.data(), sum.data(), length
    );
    CUDA_CHECK(cudaGetLastError());

    // Keep folding the partials in place until a single value remains.
    int remaining = gridSize;
    while (remaining > 1) {
        int blocks_needed = (remaining + BLOCK_SIZE - 1) / BLOCK_SIZE;
        CUDANet::Kernels::sum_reduce<<<blocks_needed, BLOCK_SIZE>>>(
            sum.data(), sum.data(), remaining
        );
        CUDA_CHECK(cudaGetLastError());
        remaining = blocks_needed;
    }
}

// Reduce `input` to its maximum in max[0]; mirrors sum() with max_reduce.
// NOTE(review): same capacity/aliasing assumptions on `max` as in sum().
void CUDA::max(const CUDANet::Tensor &input, CUDANet::Tensor &max) {
    auto length = input.numel();

    const int gridSize = (length + BLOCK_SIZE - 1) / BLOCK_SIZE;

    // First pass: one partial maximum per block, written into `max`.
    CUDANet::Kernels::max_reduce<<<gridSize, BLOCK_SIZE>>>(
        input.data(), max.data(), length
    );
    CUDA_CHECK(cudaGetLastError());

    // Keep folding the partials in place until a single value remains.
    int remaining = gridSize;
    while (remaining > 1) {
        int blocks_needed = (remaining + BLOCK_SIZE - 1) / BLOCK_SIZE;
        CUDANet::Kernels::max_reduce<<<blocks_needed, BLOCK_SIZE>>>(
            max.data(), max.data(), remaining
        );
        CUDA_CHECK(cudaGetLastError());
        remaining = blocks_needed;
    }
}