Implement device vector utils

This commit is contained in:
2024-04-11 22:22:33 +02:00
parent 710a33bdde
commit 4b9d123e94
6 changed files with 109 additions and 19 deletions

View File

@@ -4,6 +4,7 @@
#include <vector>
#include "cuda_helper.cuh"
#include "vector.cuh"
#include "matmul.cuh"
TEST(MatMulTest, MatVecMulTest) {
@@ -45,7 +46,7 @@ TEST(MatMulTest, MatVecMulTest) {
int THREADS_PER_BLOCK = std::max(w, h);
int BLOCKS = 1;
-CUDANet::Kernels::clear<<<BLOCKS, h>>>(d_output, h);
+CUDANet::Utils::clear(d_output, h);
CUDANet::Kernels::mat_vec_mul<<<BLOCKS, THREADS_PER_BLOCK, sizeof(float) * w>>>(d_matrix, d_vector, d_output, w, h);
cudaStatus = cudaDeviceSynchronize();
@@ -198,6 +199,7 @@ TEST(MatMulTest, SumReduceTest) {
remaining = blocks_needed;
}
std::vector<float> sum(n);
cudaStatus = cudaMemcpy(
sum.data(), d_sum, sizeof(float) * n, cudaMemcpyDeviceToHost