#pragma once #include namespace CUDANet::Kernels { template __global__ void mat_vec_mul( const T* __restrict__ d_matrix, const T* __restrict__ d_vector, T* __restrict__ d_output, const unsigned int w, const unsigned int h ); template __global__ void vec_vec_add( const T* __restrict__ d_vector1, const T* __restrict__ d_vector2, T* __restrict__ d_output, const unsigned int w ); template __global__ void vec_vec_sub( const T* __restrict__ d_vector1, const T* __restrict__ d_vector2, T* __restrict__ d_output, const unsigned int w ); template __global__ void vec_vec_mul( const T* __restrict__ d_vector1, const T* __restrict__ d_vector2, T* __restrict__ d_output, const unsigned int w ); template __global__ void vec_scalar_sub( const T* __restrict__ d_src, T* __restrict__ d_out, const T* __restrict__ d_scalar, const unsigned int len ); template __global__ void vec_scalar_add( const T* __restrict__ d_src, T* __restrict__ d_out, const T* __restrict__ d_scalar, const unsigned int len ); template __global__ void vec_scalar_div( const T* __restrict__ d_src, T* __restrict__ d_out, const T* __restrict__ d_scalar, const unsigned int len ); template __global__ void vec_scalar_mul( const T* __restrict__ d_src, T* __restrict__ d_out, const T* __restrict__ d_scalar, const unsigned int len ); template __global__ void vec_exp( const T* __restrict__ src, T* __restrict__ dst, const unsigned int len ); template __global__ void vec_sqrt( const T* __restrict__ src, T* __restrict__ dst, const unsigned int len ); template __global__ void vec_scale( const T* __restrict__ src, T* __restrict__ dst, const T* __restrict__ scale, const T* epsilon, const unsigned int len ); template __global__ void max_reduce( const T* __restrict__ d_vector, T* __restrict__ d_output, const unsigned int len ); template __global__ void sum_reduce( const T* __restrict__ d_vector, T* __restrict__ d_output, const unsigned int len ); } // namespace CUDANet::Kernels