Implement vector variance function

2025-12-23 14:54:28 +00:00 · 2024-05-14 21:58:23 +02:00
parent 33a3c6afcb
commit 7c48ed86d2
3 changed files with 42 additions and 27 deletions
--- a/src/layers/batch_norm.cu
+++ b/src/layers/batch_norm.cu
@@ -142,30 +142,12 @@ float *BatchNorm::forward(const float *d_input) {
        CUDA_CHECK(cudaGetLastError());

        // Compute variance
-        // Square differences of input - mean
-        Kernels::vec_vec_mul<<<gridSize, BLOCK_SIZE>>>(
+        Utils::var(
            d_mean_sub,
-            d_mean_sub,
-            d_sqrt_var,
-            inputSize * inputSize
-        );
-        CUDA_CHECK(cudaGetLastError());
-
-        // Sum over all differences
-        Utils::sum(
-            d_sqrt_var,
-            d_sqrt_var,
-            inputSize * inputSize
-        );
-
-        // Divide by difference sum / length -> variance
-        Kernels::vec_scalar_div<<<gridSize, BLOCK_SIZE>>>(
-            d_sqrt_var,
            d_sqrt_var,
            d_length,
            inputSize * inputSize
        );
-        CUDA_CHECK(cudaGetLastError());

        // Add epsilon to variance to avoid division by zero
        Kernels::vec_scalar_add<<<gridSize, BLOCK_SIZE>>>(
@@ -193,6 +175,7 @@ float *BatchNorm::forward(const float *d_input) {
        );
        CUDA_CHECK(cudaGetLastError());

+        // Multiply by weights
        Kernels::vec_scalar_mul<<<gridSize, BLOCK_SIZE>>>(
            d_output + i * inputSize * inputSize,
            d_output + i * inputSize * inputSize,
@@ -201,6 +184,7 @@ float *BatchNorm::forward(const float *d_input) {
        );
        CUDA_CHECK(cudaGetLastError());

+        // Add biases
        Kernels::vec_scalar_add<<<gridSize, BLOCK_SIZE>>>(
            d_output + i * inputSize * inputSize,
            d_output + i * inputSize * inputSize,
--- a/src/utils/vector.cu
+++ b/src/utils/vector.cu
@@ -73,4 +73,35 @@ void Utils::mean(const float* d_vec, float* d_mean, float *d_length, int length)
    );

    CUDA_CHECK(cudaGetLastError());
+}
+
+// Variance of d_vec into d_var. NOTE(review): no mean is subtracted here; presumably callers pass centred data - confirm.
+void Utils::var(float* d_vec, float* d_var, float *d_length, const unsigned int length) {
+    // Round the block count up so that gridSize * BLOCK_SIZE >= length.
+    const int gridSize = (length + BLOCK_SIZE - 1) / BLOCK_SIZE;
+    // d_vec is passed as both inputs: presumably writes element-wise squares into d_var (kernel not shown here).
+    Kernels::vec_vec_mul<<<gridSize, BLOCK_SIZE>>>(
+        d_vec,
+        d_vec,
+        d_var,
+        length
+    );
+    CUDA_CHECK(cudaGetLastError());
+
+    // Reduce d_var in place (same pointer is input and output); where the total lands is defined by Utils::sum.
+    Utils::sum(
+        d_var,
+        d_var,
+        length
+    );
+
+    // Divide by the value passed via d_length -> variance. NOTE(review): launched over all `length` elements; confirm that is needed.
+    Kernels::vec_scalar_div<<<gridSize, BLOCK_SIZE>>>(
+        d_var,
+        d_var,
+        d_length,
+        length
+    );
+    CUDA_CHECK(cudaGetLastError());
+
 }