#include <cstddef>

#include "backend/cuda.cuh"
#include "matmul.cuh"

using namespace CUDANet;

/**
 * @brief Matrix-vector product:
 *        d_output[row] = sum over j of d_matrix[row * w + j] * d_vector[j].
 *
 * One thread computes one output element. Only the x dimension of the grid
 * and block is used and there is no stride loop, so the launch has to supply
 * at least h threads along x. Threads whose index is >= h return without
 * touching memory.
 *
 * @param d_matrix  Input matrix, read as h rows of w contiguous floats.
 * @param d_vector  Input vector; elements [0, w) are read.
 * @param d_output  Output vector; elements [0, h) are written.
 * @param w         Row length (number of columns).
 * @param h         Number of rows, i.e. number of output elements.
 */
__global__ void Kernels::mat_vec_mul(
    const float* __restrict__ d_matrix,
    const float* __restrict__ d_vector,
    float* __restrict__ d_output,
    const unsigned int w,
    const unsigned int h
) {
    // Index math is done in size_t: row * w does not fit in 32 bits once the
    // matrix holds more than 2^32 elements, and a signed 32-bit row index
    // would go negative above INT_MAX.
    const size_t row = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
    if (row >= h) {
        return;
    }

    const size_t row_start = row * w;
    float        acc       = 0.0f;

    for (unsigned int j = 0; j < w; ++j) {
        acc += d_matrix[row_start + j] * d_vector[j];
    }

    d_output[row] = acc;
}

/**
 * @brief Element-wise sum: d_output[i] = d_vector1[i] + d_vector2[i].
 *
 * One thread per element along x; there is no stride loop, so the launch has
 * to supply at least w threads. Threads with index >= w do nothing.
 *
 * @param d_vector1  First operand; elements [0, w) are read.
 * @param d_vector2  Second operand; elements [0, w) are read.
 * @param d_output   Result; elements [0, w) are written.
 * @param w          Number of elements.
 */
__global__ void Kernels::vec_vec_add(
    const float* __restrict__ d_vector1,
    const float* __restrict__ d_vector2,
    float* __restrict__ d_output,
    const unsigned int w
) {
    // Unsigned 64-bit index: avoids the signed/unsigned comparison against w
    // and a negative subscript for indices above INT_MAX.
    const size_t idx = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
    if (idx < w) {
        d_output[idx] = d_vector1[idx] + d_vector2[idx];
    }
}

/**
 * @brief Element-wise difference: d_output[i] = d_vector1[i] - d_vector2[i].
 *
 * One thread per element along x; there is no stride loop, so the launch has
 * to supply at least w threads. Threads with index >= w do nothing.
 *
 * @param d_vector1  Minuend; elements [0, w) are read.
 * @param d_vector2  Subtrahend; elements [0, w) are read.
 * @param d_output   Result; elements [0, w) are written.
 * @param w          Number of elements.
 */
__global__ void Kernels::vec_vec_sub(
    const float* __restrict__ d_vector1,
    const float* __restrict__ d_vector2,
    float* __restrict__ d_output,
    const unsigned int w
) {
    // Unsigned 64-bit index: avoids the signed/unsigned comparison against w
    // and a negative subscript for indices above INT_MAX.
    const size_t idx = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
    if (idx < w) {
        d_output[idx] = d_vector1[idx] - d_vector2[idx];
    }
}

/**
 * @brief Element-wise product: d_output[i] = d_vector1[i] * d_vector2[i].
 *
 * One thread per element along x; there is no stride loop, so the launch has
 * to supply at least w threads. Threads with index >= w do nothing.
 *
 * @param d_vector1  First factor; elements [0, w) are read.
 * @param d_vector2  Second factor; elements [0, w) are read.
 * @param d_output   Result; elements [0, w) are written.
 * @param w          Number of elements.
 */
__global__ void Kernels::vec_vec_mul(
    const float* __restrict__ d_vector1,
    const float* __restrict__ d_vector2,
    float* __restrict__ d_output,
    const unsigned int w
) {
    // Unsigned 64-bit index: avoids the signed/unsigned comparison against w
    // and a negative subscript for indices above INT_MAX.
    const size_t idx = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
    if (idx < w) {
        d_output[idx] = d_vector1[idx] * d_vector2[idx];
    }
}

/**
 * @brief Subtracts a scalar from every element: d_out[i] = d_src[i] - *d_scalar.
 *
 * One thread per element along x; there is no stride loop, so the launch has
 * to supply at least len threads. Threads with index >= len do nothing.
 *
 * @param d_src     Input vector; elements [0, len) are read.
 * @param d_out     Output vector; elements [0, len) are written.
 * @param d_scalar  Pointer to the single float subtracted from each element;
 *                  it is dereferenced inside the kernel by every active thread.
 * @param len       Number of elements.
 */
__global__ void Kernels::vec_scalar_sub(
    const float* __restrict__ d_src,
    float* __restrict__ d_out,
    const float* __restrict__ d_scalar,
    const unsigned int len
) {
    // Unsigned 64-bit index: avoids the signed/unsigned comparison against
    // len and a negative subscript for indices above INT_MAX.
    const size_t idx = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
    if (idx < len) {
        d_out[idx] = d_src[idx] - *d_scalar;
    }
}

/**
 * @brief Adds a scalar to every element: d_out[i] = d_src[i] + *d_scalar.
 *
 * One thread per element along x; there is no stride loop, so the launch has
 * to supply at least len threads. Threads with index >= len do nothing.
 *
 * @param d_src     Input vector; elements [0, len) are read.
 * @param d_out     Output vector; elements [0, len) are written.
 * @param d_scalar  Pointer to the single float added to each element; it is
 *                  dereferenced inside the kernel by every active thread.
 * @param len       Number of elements.
 */
__global__ void Kernels::vec_scalar_add(
    const float* __restrict__ d_src,
    float* __restrict__ d_out,
    const float* __restrict__ d_scalar,
    const unsigned int len
) {
    // Unsigned 64-bit index: avoids the signed/unsigned comparison against
    // len and a negative subscript for indices above INT_MAX.
    const size_t idx = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
    if (idx < len) {
        d_out[idx] = d_src[idx] + *d_scalar;
    }
}

/**
 * @brief Divides every element by a scalar: d_out[i] = d_src[i] / *d_scalar.
 *
 * One thread per element along x; there is no stride loop, so the launch has
 * to supply at least len threads. Threads with index >= len do nothing.
 * No check is made for a zero divisor; the result then follows ordinary
 * floating-point division rules.
 *
 * @param d_src     Input vector; elements [0, len) are read.
 * @param d_out     Output vector; elements [0, len) are written.
 * @param d_scalar  Pointer to the single float divisor; it is dereferenced
 *                  inside the kernel by every active thread.
 * @param len       Number of elements.
 */
__global__ void Kernels::vec_scalar_div(
    const float* __restrict__ d_src,
    float* __restrict__ d_out,
    const float* __restrict__ d_scalar,
    const unsigned int len
) {
    // Unsigned 64-bit index: avoids the signed/unsigned comparison against
    // len and a negative subscript for indices above INT_MAX.
    const size_t idx = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
    if (idx < len) {
        d_out[idx] = d_src[idx] / *d_scalar;
    }
}

/**
 * @brief Multiplies every element by a scalar: d_out[i] = d_src[i] * *d_scalar.
 *
 * One thread per element along x; there is no stride loop, so the launch has
 * to supply at least len threads. Threads with index >= len do nothing.
 *
 * @param d_src     Input vector; elements [0, len) are read.
 * @param d_out     Output vector; elements [0, len) are written.
 * @param d_scalar  Pointer to the single float factor; it is dereferenced
 *                  inside the kernel by every active thread.
 * @param len       Number of elements.
 */
__global__ void Kernels::vec_scalar_mul(
    const float* __restrict__ d_src,
    float* __restrict__ d_out,
    const float* __restrict__ d_scalar,
    const unsigned int len
) {
    // Unsigned 64-bit index: avoids the signed/unsigned comparison against
    // len and a negative subscript for indices above INT_MAX.
    const size_t idx = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
    if (idx < len) {
        d_out[idx] = d_src[idx] * *d_scalar;
    }
}

/**
 * @brief Element-wise exponential: dst[i] = expf(src[i]) for i in [0, len).
 *
 * Uses a grid-stride loop over the x dimension, so any 1D launch
 * configuration covers the whole range.
 *
 * @param src  Input vector; elements [0, len) are read.
 * @param dst  Output vector; elements [0, len) are written.
 * @param len  Number of elements.
 */
__global__ void Kernels::vec_exp(
    const float* __restrict__ src,
    float* __restrict__ dst,
    const unsigned int len
) {
    // size_t index and stride: a signed 32-bit counter would overflow on
    // `i += stride` (undefined behaviour) once len exceeds INT_MAX.
    const size_t step  = static_cast<size_t>(gridDim.x) * blockDim.x;
    const size_t first = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;

    for (size_t i = first; i < len; i += step) {
        dst[i] = expf(src[i]);
    }
}

/**
 * @brief Element-wise square root: dst[i] = sqrtf(src[i]) for i in [0, len).
 *
 * Uses a grid-stride loop over the x dimension, so any 1D launch
 * configuration covers the whole range. Inputs are not validated; negative
 * values yield whatever sqrtf returns for them.
 *
 * @param src  Input vector; elements [0, len) are read.
 * @param dst  Output vector; elements [0, len) are written.
 * @param len  Number of elements.
 */
__global__ void Kernels::vec_sqrt(
    const float* __restrict__ src,
    float* __restrict__ dst,
    const unsigned int len
) {
    // size_t index and stride: a signed 32-bit counter would overflow on
    // `i += stride` (undefined behaviour) once len exceeds INT_MAX.
    const size_t step  = static_cast<size_t>(gridDim.x) * blockDim.x;
    const size_t first = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;

    for (size_t i = first; i < len; i += step) {
        dst[i] = sqrtf(src[i]);
    }
}

/**
 * @brief Scales every element by a reciprocal square root:
 *        dst[i] = src[i] * rsqrtf(*scale + *epsilon).
 *
 * One thread per element along x; there is no stride loop, so the launch has
 * to supply at least len threads. Threads with index >= len do nothing.
 *
 * NOTE(review): the factor is an inverse square root of (*scale + *epsilon),
 * which looks like a variance-plus-epsilon normalization; confirm against
 * the callers.
 *
 * @param src      Input vector; elements [0, len) are read.
 * @param dst      Output vector; elements [0, len) are written.
 * @param scale    Pointer to a single float, dereferenced inside the kernel.
 * @param epsilon  Pointer to a single float added to *scale before the
 *                 reciprocal square root; dereferenced inside the kernel.
 * @param len      Number of elements.
 */
__global__ void Kernels::vec_scale(
    const float* __restrict__ src,
    float* __restrict__ dst,
    const float* __restrict__ scale,
    const float* epsilon,
    const unsigned int len
) {
    // Unsigned 64-bit index: avoids the signed/unsigned comparison against
    // len and a negative subscript for indices above INT_MAX.
    const size_t idx = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
    if (idx >= len) {
        return;
    }

    const float factor = rsqrtf(*scale + *epsilon);
    dst[idx]           = src[idx] * factor;
}

/**
 * @brief Per-block maximum: each block writes the maximum of its slice of
 *        d_vector to d_output[blockIdx.x].
 *
 * Every thread loads one element (or -INFINITY when its global index is
 * >= len) into shared memory, then the block folds the values pairwise with
 * fmaxf. The result is one partial maximum per block, not a single global
 * maximum.
 *
 * Launch requirements:
 *  - 1D launch; only the x dimension is used.
 *  - blockDim.x must not exceed BLOCK_SIZE, the size of the shared buffer.
 *  - d_output must have room for gridDim.x floats.
 *
 * Any blockDim.x >= 1 is handled, including values that are not a power of
 * two.
 *
 * @param d_vector  Input vector; elements [0, len) are read.
 * @param d_output  One result per block, written at index blockIdx.x.
 * @param len       Number of valid input elements.
 */
__global__ void Kernels::max_reduce(
    const float* __restrict__ d_vector,
    float* __restrict__ d_output,
    const unsigned int len
) {
    __shared__ float shared_max[BLOCK_SIZE];

    const unsigned int lane = threadIdx.x;
    // 64-bit global index avoids signed wrap-around for very large inputs.
    const size_t gid = static_cast<size_t>(blockIdx.x) * blockDim.x + lane;

    // Out-of-range threads contribute the identity element of max.
    shared_max[lane] = (gid < len) ? d_vector[gid] : -INFINITY;

    __syncthreads();

    // Ceil-halving tree: each pass folds the upper part [half, active) onto
    // the lower part. When `active` is odd the middle slot is simply carried
    // to the next pass, so no element is dropped. For power-of-two block
    // sizes this is exactly the classic halving pattern.
    // The loop bounds depend only on blockDim.x, so every thread reaches
    // each __syncthreads().
    for (unsigned int active = blockDim.x; active > 1;) {
        const unsigned int half = (active + 1) / 2;
        if (lane + half < active) {
            shared_max[lane] = fmaxf(shared_max[lane], shared_max[lane + half]);
        }
        __syncthreads();
        active = half;
    }

    if (lane == 0) {
        d_output[blockIdx.x] = shared_max[0];
    }
}

/**
 * @brief Per-block sum: each block writes the sum of its slice of d_vector
 *        to d_output[blockIdx.x].
 *
 * Every thread loads one element (or 0.0f when its global index is >= len)
 * into shared memory, then the block adds the values pairwise. The result is
 * one partial sum per block, not a single global sum.
 *
 * Launch requirements:
 *  - 1D launch; only the x dimension is used.
 *  - blockDim.x must not exceed BLOCK_SIZE, the size of the shared buffer.
 *  - d_output must have room for gridDim.x floats.
 *
 * Any blockDim.x >= 1 is handled, including values that are not a power of
 * two.
 *
 * @param d_vector  Input vector; elements [0, len) are read.
 * @param d_output  One result per block, written at index blockIdx.x.
 * @param len       Number of valid input elements.
 */
__global__ void Kernels::sum_reduce(
    const float* __restrict__ d_vector,
    float* __restrict__ d_output,
    const unsigned int len
) {
    __shared__ float partial_sum[BLOCK_SIZE];

    const unsigned int lane = threadIdx.x;
    // 64-bit global index avoids signed wrap-around for very large inputs.
    const size_t gid = static_cast<size_t>(blockIdx.x) * blockDim.x + lane;

    // Out-of-range threads contribute the additive identity.
    partial_sum[lane] = (gid < len) ? d_vector[gid] : 0.0f;

    __syncthreads();

    // Ceil-halving tree: each pass adds the upper part [half, active) onto
    // the lower part. When `active` is odd the middle slot is simply carried
    // to the next pass, so no element is dropped. For power-of-two block
    // sizes this is exactly the classic halving pattern, with the same
    // summation order.
    // The loop bounds depend only on blockDim.x, so every thread reaches
    // each __syncthreads().
    for (unsigned int active = blockDim.x; active > 1;) {
        const unsigned int half = (active + 1) / 2;
        if (lane + half < active) {
            partial_sum[lane] += partial_sum[lane + half];
        }
        __syncthreads();
        active = half;
    }

    if (lane == 0) {
        d_output[blockIdx.x] = partial_sum[0];
    }
}