#include <algorithm>

#include "backend/cuda.cuh"
#include "kernels/activation_functions.cuh"
#include "kernels/convolution.cuh"
#include "kernels/matmul.cuh"
#include "utils/cuda_helper.cuh"

using namespace CUDANet::Backend;

/**
 * @brief Run the ReLU kernel over every element of `tensor`.
 *
 * The tensor's own buffer is passed for both pointer arguments of
 * Kernels::relu (presumably source and destination, i.e. an in-place
 * update - confirm against the kernel signature).
 *
 * @param tensor Tensor whose buffer is handed to the kernel.
 */
void CUDA::relu(Tensor& tensor) {
    // Ceil-div so a trailing partial block still covers the last elements.
    int gridSize = (tensor.numel() + BLOCK_SIZE - 1) / BLOCK_SIZE;

    Kernels::relu<<<gridSize, BLOCK_SIZE>>>(
        tensor.data(), tensor.data(), tensor.numel()
    );
    CUDA_CHECK(cudaGetLastError());       // launch-configuration errors
    CUDA_CHECK(cudaDeviceSynchronize());  // errors raised while the kernel ran
}

/**
 * @brief Run the sigmoid kernel over every element of `tensor`.
 *
 * The tensor's own buffer is passed for both pointer arguments of
 * Kernels::sigmoid (presumably source and destination, i.e. an in-place
 * update - confirm against the kernel signature).
 *
 * @param tensor Tensor whose buffer is handed to the kernel.
 */
void CUDA::sigmoid(Tensor& tensor) {
    // Ceil-div so a trailing partial block still covers the last elements.
    int gridSize = (tensor.numel() + BLOCK_SIZE - 1) / BLOCK_SIZE;

    Kernels::sigmoid<<<gridSize, BLOCK_SIZE>>>(
        tensor.data(), tensor.data(), tensor.numel()
    );
    CUDA_CHECK(cudaGetLastError());       // launch-configuration errors
    CUDA_CHECK(cudaDeviceSynchronize());  // errors raised while the kernel ran
}

/**
 * @brief Softmax over all elements of `tensor`, computed as a kernel sequence.
 *
 * Steps: max(), subtract the max, exponentiate, sum(), divide by the sum.
 * Each element-wise kernel receives the tensor's buffer for both pointer
 * arguments.
 *
 * @param tensor   Tensor to transform.
 * @param temp_max Scratch tensor passed to max(); its buffer is then used as
 *                 the scalar operand of the subtraction.
 * @param temp_sum Scratch tensor passed to sum(); its buffer is then used as
 *                 the scalar operand of the division.
 */
void CUDA::softmax(Tensor& tensor, Tensor& temp_max, Tensor& temp_sum) {
    // Ceil-div so a trailing partial block still covers the last elements.
    int gridSize = (tensor.numel() + BLOCK_SIZE - 1) / BLOCK_SIZE;

    // Find max value
    max(tensor, temp_max);

    // Subtract max value to improve numerical stability
    Kernels::vec_scalar_sub<<<gridSize, BLOCK_SIZE>>>(
        tensor.data(), tensor.data(), temp_max.data(), tensor.numel()
    );
    CUDA_CHECK(cudaGetLastError());

    // Compute exponentials
    Kernels::vec_exp<<<gridSize, BLOCK_SIZE>>>(
        tensor.data(), tensor.data(), tensor.numel()
    );
    CUDA_CHECK(cudaGetLastError());

    // Find sum
    sum(tensor, temp_sum);

    // Divide every element by the sum
    Kernels::vec_scalar_div<<<gridSize, BLOCK_SIZE>>>(
        tensor.data(), tensor.data(), temp_sum.data(), tensor.numel()
    );
    CUDA_CHECK(cudaGetLastError());

    // Single host-side wait for the whole kernel sequence.
    CUDA_CHECK(cudaDeviceSynchronize());
}

/**
 * @brief Dense (fully connected) forward pass: mat_vec_mul followed by a
 *        bias addition into `output`.
 *
 * @param weights     Weight tensor passed to Kernels::mat_vec_mul.
 * @param biases      Bias tensor added to the product by Kernels::vec_vec_add.
 * @param input       Input vector tensor.
 * @param output      Tensor that receives the result.
 * @param input_size  Input length forwarded to the kernel.
 * @param output_size Output length forwarded to both kernels.
 * @return Reference to `output`.
 */
CUDANet::Tensor& CUDA::dense(
    const CUDANet::Tensor& weights,
    const CUDANet::Tensor& biases,
    const CUDANet::Tensor& input,
    CUDANet::Tensor&       output,
    const size_t           input_size,
    const size_t           output_size
) {
    // The multiply launch is sized by the larger of the two dimensions;
    // the bias addition only needs one thread per output element.
    auto forwardGridSize =
        (std::max(input_size, output_size) + BLOCK_SIZE - 1) / BLOCK_SIZE;
    auto biasGridSize = (output_size + BLOCK_SIZE - 1) / BLOCK_SIZE;

    Kernels::mat_vec_mul<<<forwardGridSize, BLOCK_SIZE>>>(
        weights.data(), input.data(), output.data(), input_size, output_size
    );
    CUDA_CHECK(cudaGetLastError());

    // output is passed as both an operand and the destination of the add.
    Kernels::vec_vec_add<<<biasGridSize, BLOCK_SIZE>>>(
        biases.data(), output.data(), output.data(), output_size
    );
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    return output;
}

/**
 * @brief 2D convolution forward pass via Kernels::convolution.
 *
 * Launches a 3D grid of 8x8x8 thread blocks whose extents are derived from
 * `out_shape`.
 *
 * @param weights       Filter weights.
 * @param biases        Biases forwarded to the kernel.
 * @param input         Input tensor.
 * @param output        Tensor that receives the result.
 * @param in_shape      Input shape forwarded to the kernel.
 * @param padding_shape Padding forwarded to the kernel.
 * @param kernel_shape  Filter shape forwarded to the kernel.
 * @param stride_shape  Stride forwarded to the kernel.
 * @param out_shape     Output shape; also determines the grid extents.
 * @return Reference to `output`.
 */
CUDANet::Tensor& CUDA::conv2d(
    const CUDANet::Tensor& weights,
    const CUDANet::Tensor& biases,
    const CUDANet::Tensor& input,
    CUDANet::Tensor&       output,
    const CUDANet::Shape   in_shape,
    const CUDANet::Shape   padding_shape,
    const CUDANet::Shape   kernel_shape,
    const CUDANet::Shape   stride_shape,
    const CUDANet::Shape   out_shape
) {
    dim3 block(8, 8, 8);
    // NOTE(review): the z extent reads out_shape[3] while x/y read [0]/[1];
    // if Shape holds only three entries this index is out of range and [2]
    // was probably intended - confirm against the Shape layout.
    dim3 grid(
        (out_shape[0] + block.x - 1) / block.x,
        (out_shape[1] + block.y - 1) / block.y,
        (out_shape[3] + block.z - 1) / block.z
    );

    Kernels::convolution<<<grid, block>>>(
        input.data(), weights.data(), biases.data(), output.data(), in_shape,
        padding_shape, kernel_shape, stride_shape, out_shape
    );
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    return output;
}