Implement device vector utils

2025-12-22 22:34:22 +00:00 · 2024-04-11 22:22:33 +02:00
parent 710a33bdde
commit 4b9d123e94
6 changed files with 109 additions and 19 deletions
--- a/test/kernels/test_matmul.cu
+++ b/test/kernels/test_matmul.cu
@@ -4,6 +4,7 @@
 #include <vector>

 #include "cuda_helper.cuh"
+#include "vector.cuh"
 #include "matmul.cuh"

 TEST(MatMulTest, MatVecMulTest) {
@@ -45,7 +46,7 @@ TEST(MatMulTest, MatVecMulTest) {
    int THREADS_PER_BLOCK = std::max(w, h);
    int BLOCKS            = 1;

-    CUDANet::Kernels::clear<<<BLOCKS, h>>>(d_output, h);
+    CUDANet::Utils::clear(d_output, h);

    CUDANet::Kernels::mat_vec_mul<<<BLOCKS, THREADS_PER_BLOCK, sizeof(float) * w>>>(d_matrix, d_vector, d_output, w, h);
    cudaStatus = cudaDeviceSynchronize();
@@ -198,6 +199,7 @@ TEST(MatMulTest, SumReduceTest) {
        remaining = blocks_needed;
    }

+
    std::vector<float> sum(n);
    cudaStatus = cudaMemcpy(
        sum.data(), d_sum, sizeof(float) * n, cudaMemcpyDeviceToHost