Use shared memory for mat vec mul kernel

This commit is contained in:
2024-03-13 22:13:11 +01:00
parent 09480e42e5
commit 77004c16be
4 changed files with 77 additions and 7 deletions

View File

@@ -10,16 +10,22 @@ __global__ void Kernels::mat_vec_mul(
int tid = blockDim.x * blockIdx.x + threadIdx.x;
if (tid >= w * h) {
return;
extern __shared__ float shared[];
if (tid < w) {
shared[tid] = d_vector[tid];
}
d_output[tid] = 0.0f;
__syncthreads();
for (int i = 0; i < w; i++) {
d_output[tid] += d_matrix[tid * w + i] * d_vector[i];
if (tid < h) {
d_output[tid] = 0.0f;
#pragma unroll
for (int i = 0; i < w; i++) {
d_output[tid] += d_matrix[tid * w + i] * shared[i];
}
}
}
__global__ void Kernels::vec_vec_add(

View File

@@ -51,7 +51,7 @@ void Layers::Dense::initializeBiases() {
}
float* Layers::Dense::forward(const float* d_input) {
Kernels::mat_vec_mul<<<1, outputSize>>>(
Kernels::mat_vec_mul<<<1, std::max(inputSize, outputSize), sizeof(float) * inputSize>>>(
d_weights, d_input, d_output, inputSize, outputSize
);