diff --git a/src/layers/batch_norm.cu b/src/layers/batch_norm.cu
index 27f8876..b9f9fa9 100644
--- a/src/layers/batch_norm.cu
+++ b/src/layers/batch_norm.cu
@@ -42,8 +42,9 @@ BatchNorm::BatchNorm(
     CUDA_CHECK(cudaMalloc((void **)&d_biases, sizeof(float) * inputChannels));
 
     d_length = nullptr;
+    float length = (float) inputSize * inputSize;
     CUDA_CHECK(cudaMalloc((void **)&d_length, sizeof(float)));
-    CUDA_CHECK(cudaMemset(d_length, inputSize * inputSize, sizeof(float)));
+    CUDA_CHECK(cudaMemcpy(d_length, &length, sizeof(float), cudaMemcpyHostToDevice));
 
     d_epsilon = nullptr;
     float epsilon = 1e-5f;
@@ -124,21 +125,12 @@ float *BatchNorm::forward(const float *d_input) {
 
     for (int i = 0; i < inputChannels; i++) {
         // Compute mean
-        // Sum over all values
-        Utils::sum(
+        Utils::mean(
             d_input + i * inputSize * inputSize,
             d_mean,
-            inputSize * inputSize
-        );
-
-        // Divide sum by length -> mean
-        Kernels::vec_scalar_div<<<...>>>(
-            d_mean,
-            d_mean,
             d_length,
             inputSize * inputSize
         );
-        CUDA_CHECK(cudaGetLastError());
 
         // Subtract mean from input
         Kernels::vec_scalar_sub<<<...>>>(
diff --git a/test/layers/test_batch_norm.cu b/test/layers/test_batch_norm.cu
index c791d33..da54619 100644
--- a/test/layers/test_batch_norm.cu
+++ b/test/layers/test_batch_norm.cu
@@ -71,7 +71,7 @@ TEST(BatchNormLayerTest, BatchNormSmallForwardTest) {
     // std::cout << "BatchNorm: " << std::endl;
     for (int i = 0; i < output.size(); i++) {
-        EXPECT_EQ(output[i], expected[i]);
+        EXPECT_NEAR(output[i], expected[i], 1e-5);
         // std::cout << output[i] << " ";
     }
     // std::cout << std::endl;
 
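
Note on the d_length fix: cudaMemset(ptr, value, count) fills count bytes, each with the low byte of value, so the removed call stored the bit pattern 0x10101010 for a 4x4 input (a garbage float around 2.8e-29) rather than 16.0f. The new code builds the float on the host and copies its bytes to the device. A minimal stand-alone repro, using illustrative values that are not project code:

// memset_vs_memcpy.cu -- illustrative repro, not project code.
#include <cstdio>
#include <cuda_runtime.h>

int main() {
    const int inputSize = 4;                 // pretend a 4x4 feature map
    float *d_length = nullptr;
    cudaMalloc((void **)&d_length, sizeof(float));

    // Old approach: cudaMemset writes the low byte of 16 (0x10) into each of
    // the four bytes, yielding the bit pattern 0x10101010, not the float 16.0f.
    cudaMemset(d_length, inputSize * inputSize, sizeof(float));
    float bad;
    cudaMemcpy(&bad, d_length, sizeof(float), cudaMemcpyDeviceToHost);

    // Fixed approach: form the float on the host, copy its bytes to the device.
    float length = (float)inputSize * inputSize;
    cudaMemcpy(d_length, &length, sizeof(float), cudaMemcpyHostToDevice);
    float good;
    cudaMemcpy(&good, d_length, sizeof(float), cudaMemcpyDeviceToHost);

    printf("cudaMemset stored %g, cudaMemcpy stored %g\n", bad, good);
    cudaFree(d_length);
    return 0;
}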
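
Note on the forward() hunk: collapsing Utils::sum plus Kernels::vec_scalar_div into a single Utils::mean call removes one kernel launch per channel. The diff shows only the call site, so the following is a rough sketch of what such a helper could look like, assuming the four-argument signature visible above; the kernel name, single-block reduction, and launch shape are guesses, not the project's actual implementation:

// mean_sketch.cu -- hypothetical sketch of a Utils::mean matching the call site.
#include <cuda_runtime.h>

namespace Kernels {
// Single-block reduction: sums n floats, then divides by *d_length.
// Assumes blockDim.x is a power of two.
__global__ void mean_kernel(const float *d_in, float *d_out,
                            const float *d_length, int n) {
    extern __shared__ float sdata[];
    float sum = 0.0f;
    // Stride loop so one block covers any n.
    for (int i = threadIdx.x; i < n; i += blockDim.x)
        sum += d_in[i];
    sdata[threadIdx.x] = sum;
    __syncthreads();
    // Tree reduction in shared memory.
    for (int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (threadIdx.x < s) sdata[threadIdx.x] += sdata[threadIdx.x + s];
        __syncthreads();
    }
    if (threadIdx.x == 0) *d_out = sdata[0] / *d_length;
}
} // namespace Kernels

namespace Utils {
void mean(const float *d_in, float *d_mean, const float *d_length, int n) {
    const int block = 256;
    Kernels::mean_kernel<<<1, block, block * sizeof(float)>>>(d_in, d_mean,
                                                              d_length, n);
}
} // namespace Utils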
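
Note on the test change: the normalized output comes out of GPU float arithmetic with an epsilon term, which will not generally match a hand-computed expected value bit-for-bit, so EXPECT_EQ on floats is brittle and EXPECT_NEAR with a 1e-5 tolerance is the appropriate check. A tiny illustration with made-up numbers:

// tolerance_demo.cc -- illustrative only; values are invented for the demo.
#include <cmath>
#include <gtest/gtest.h>

TEST(FloatCompareDemo, NearVsEq) {
    // Device-style normalization math vs. a hand-derived reference constant.
    float computed = (1.0f - 0.5f) / std::sqrt(0.25f + 1e-5f);
    float expected = 0.99998f;
    // EXPECT_EQ(computed, expected);        // may fail on a last-ULP difference
    EXPECT_NEAR(computed, expected, 1e-5);   // passes within tolerance
}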