Restructure include paths

2025-12-22 14:24:22 +00:00 · 2025-11-23 20:57:08 +01:00
parent 4161caf3e1
commit 38cb0c9ac0
13 changed files with 15 additions and 15 deletions
--- a/include/backend/cuda/kernels/matmul.cuh
+++ b/include/backend/cuda/kernels/matmul.cuh
@@ -0,0 +1,192 @@
+#pragma once
+
+#include <cuda_runtime.h>
+
+namespace CUDANet::Kernels {
+
+/**
+ * @brief Matrix vector multiplication kernel
+ *
+ * @param d_matrix Device pointer to matrix
+ * @param d_vector Device pointer to vector
+ * @param d_output Device pointer to output vector
+ * @param w Width of the matrix
+ * @param h Height of the matrix
+ */
+__global__ void mat_vec_mul(
+    const float* __restrict__ d_matrix,
+    const float* __restrict__ d_vector,
+    float* __restrict__ d_output,
+    const unsigned int w,
+    const unsigned int h
+);
+
+/**
+ * @brief Vector vector addition kernel
+ *
+ * @param d_vector1 Device pointer to first vector
+ * @param d_vector2 Device pointer to second vector
+ * @param d_output Device pointer to output vector
+ * @param w Length of the vectors
+ */
+__global__ void vec_vec_add(
+    const float* __restrict__ d_vector1,
+    const float* __restrict__ d_vector2,
+    float* __restrict__ d_output,
+    const unsigned int w
+);
+
+/**
+ * @brief Vector vector subtraction kernel
+ * 
+ * @param d_vector1 
+ * @param d_vector2 
+ * @param d_output 
+ * @param w 
+ * @return __global__ 
+ */
+__global__ void vec_vec_sub(
+    const float* __restrict__ d_vector1,
+    const float* __restrict__ d_vector2,
+    float* __restrict__ d_output,
+    const unsigned int w
+);
+
+__global__ void vec_vec_mul(
+    const float* __restrict__ d_vector1,
+    const float* __restrict__ d_vector2,
+    float* __restrict__ d_output,
+    const unsigned int w
+);
+
+/**
+ * @brief Sub scalar from each element of the vector
+ * 
+ * @param d_vector 
+ * @param d_scalar 
+ * @param d_output 
+ * @param w 
+ * @return __global__ 
+ */
+__global__ void vec_scalar_sub(
+    const float* __restrict__ d_src,
+    float* __restrict__ d_out,
+    const float* __restrict__ d_scalar,
+    const unsigned int len
+);
+
+/**
+ * @brief Add scalar to each element of the vector
+ * 
+ * @param d_src 
+ * @param d_out 
+ * @param d_scalar 
+ * @param len 
+ * @return __global__ 
+ */
+__global__ void vec_scalar_add(
+    const float* __restrict__ d_src,
+    float* __restrict__ d_out,
+    const float* __restrict__ d_scalar,
+    const unsigned int len
+);
+
+/**
+ * @brief Divide each element of the vector by a scalar
+ *
+ * @param src Pointer to the source array
+ * @param dst Pointer to the destination array
+ * @param len Length of the arrays
+ */
+__global__ void vec_scalar_div(
+    const float* __restrict__ d_src,
+    float* __restrict__ d_out,
+    const float* __restrict__ d_scalar,
+    const unsigned int len
+);
+
+/**
+ * @brief Multiply each element of the vector by a scalar
+ * 
+ * @param d_src 
+ * @param d_out 
+ * @param d_scalar 
+ * @param len 
+ * @return __global__ 
+ */
+__global__ void vec_scalar_mul(
+    const float* __restrict__ d_src,
+    float* __restrict__ d_out,
+    const float* __restrict__ d_scalar,
+    const unsigned int len
+);
+
+/**
+ * @brief Exponentiate each element of the vector
+ *
+ * @param src Pointer to the source array
+ * @param dst Pointer to the destination array
+ * @param len Length of the arrays
+ */
+__global__ void vec_exp(
+    const float* __restrict__ src,
+    float* __restrict__ dst,
+    const unsigned int len
+);
+
+/**
+ * @brief Compute the square root of each element of the vector
+ * 
+ * @param src Device pointer to source vector
+ * @param dst Device pointer to destination vector
+ * @param len Length of the vector
+ */
+__global__ void vec_sqrt(
+    const float* __restrict__ src,
+    float* __restrict__ dst,
+    const unsigned int len
+);
+
+/**
+ * @brief Scales the vector by 1/sqrt(scale + epsilon)
+ * 
+ * @param src Device pointer to source vector
+ * @param dst Device pointer to destination vector
+ * @param scale Scale
+ * @param epsilon Epsilon
+ * @param len Length of the vector
+ */
+__global__ void vec_scale(
+    const float* __restrict__ src,
+    float* __restrict__ dst,
+    const float* __restrict__ scale,
+    const float* epsilon,
+    const unsigned int len
+);
+
+/**
+ * @brief Max reduction kernel
+ *
+ * @param d_vector Device pointer to vector
+ * @param d_output Device pointer to output vector
+ */
+__global__ void max_reduce(
+    const float* __restrict__ d_vector,
+    float* __restrict__ d_output,
+    const unsigned int len
+);
+
+/**
+ * @brief 
+ * 
+ * @param d_vector Device pointer to vector
+ * @param d_output Device pointer to output vector
+ * @param len Length of the vector
+ */
+__global__ void sum_reduce(
+    const float* __restrict__ d_vector,
+    float* __restrict__ d_output,
+    const unsigned int len
+);
+
+}  // namespace CUDANet::Kernels