diff --git a/include/layers/max_pooling.cuh b/include/layers/max_pooling.cuh
index e2aaa81..7aa35e7 100644
--- a/include/layers/max_pooling.cuh
+++ b/include/layers/max_pooling.cuh
@@ -9,10 +9,10 @@ namespace CUDANet::Layers {
 class MaxPooling2d : public SequentialLayer {
   public:
     MaxPooling2d(
-        dim2d inputSize,
+        dim2d          inputSize,
         int            nChannels,
-        dim2d poolingSize,
-        dim2d stride,
+        dim2d          poolingSize,
+        dim2d          stride,
         ActivationType activationType
     );
     ~MaxPooling2d();
@@ -35,7 +35,7 @@ class MaxPooling2d : public SequentialLayer {
   private:
     dim2d inputSize;
-    int nChannels;
+    int   nChannels;
     dim2d poolingSize;
     dim2d stride;
diff --git a/test/layers/test_avg_pooling.cu b/test/layers/test_avg_pooling.cu
index cd6a90b..4ccc69d 100644
--- a/test/layers/test_avg_pooling.cu
+++ b/test/layers/test_avg_pooling.cu
@@ -5,15 +5,78 @@
 
 #include "avg_pooling.cuh"
 
-TEST(AvgPoolingLayerTest, AvgPoolForwardTest) {
-    dim2d inputSize   = {4, 4};
-    int   nChannels   = 2;
-    dim2d poolingSize = {2, 2};
-    dim2d stride      = {2, 2};
+class AvgPoolingLayerTest : public ::testing::Test {
+  protected:
+    dim2d              inputSize;
+    int                nChannels;
+    dim2d              poolingSize;
+    dim2d              stride;
+    std::vector<float> input;
+    std::vector<float> expected;
 
-    cudaError_t cudaStatus;
+    float *d_input;
+    float *d_output;
+    CUDANet::Layers::AvgPooling2d *avgPoolingLayer;
 
-    std::vector<float> input = {
+    virtual void SetUp() override {
+        d_input         = nullptr;
+        d_output        = nullptr;
+        avgPoolingLayer = nullptr;
+    }
+
+    virtual void TearDown() override {
+        if (d_input) {
+            cudaFree(d_input);
+        }
+    }
+
+    void runTest() {
+        cudaError_t cudaStatus;
+
+        avgPoolingLayer = new CUDANet::Layers::AvgPooling2d(
+            inputSize, nChannels, poolingSize, stride,
+            CUDANet::Layers::ActivationType::NONE
+        );
+
+        cudaStatus = cudaMalloc(
+            (void **)&d_input,
+            sizeof(float) * inputSize.first * inputSize.second * nChannels
+        );
+        EXPECT_EQ(cudaStatus, cudaSuccess);
+
+        cudaStatus = cudaMemcpy(
+            d_input, input.data(),
+            sizeof(float) * inputSize.first * inputSize.second * nChannels,
+            cudaMemcpyHostToDevice
+        );
+        EXPECT_EQ(cudaStatus, cudaSuccess);
+
+        d_output = avgPoolingLayer->forward(d_input);
+
+        int outputSize = avgPoolingLayer->getOutputSize();
+
+        std::vector<float> output(outputSize);
+        cudaStatus = cudaMemcpy(
+            output.data(), d_output, sizeof(float) * outputSize,
+            cudaMemcpyDeviceToHost
+        );
+        EXPECT_EQ(cudaStatus, cudaSuccess);
+
+        for (int i = 0; i < output.size(); ++i) {
+            EXPECT_NEAR(expected[i], output[i], 1e-4);
+        }
+
+        delete avgPoolingLayer;
+    }
+};
+
+TEST_F(AvgPoolingLayerTest, AvgPoolForwardTest) {
+    inputSize   = {4, 4};
+    nChannels   = 2;
+    poolingSize = {2, 2};
+    stride      = {2, 2};
+
+    input = {
         // clang-format off
         // Channel 0
         0.573f, 0.619f, 0.732f, 0.055f,
@@ -28,44 +91,69 @@ TEST(AvgPoolingLayerTest, AvgPoolForwardTest) {
         // clang-format on
     };
 
-    CUDANet::Layers::AvgPooling2d avgPoolingLayer(
-        inputSize, nChannels, poolingSize, stride,
-        CUDANet::Layers::ActivationType::NONE
-    );
+    expected = {0.43775f, 0.49475f, 0.48975f, 0.339f,
+                0.45675f, 0.303f, 0.56975f, 0.57025f};
 
-    float *d_input;
-
-    cudaStatus = cudaMalloc(
-        (void **)&d_input,
-        sizeof(float) * inputSize.first * inputSize.second * nChannels
-    );
-    EXPECT_EQ(cudaStatus, cudaSuccess);
-
-    cudaStatus = cudaMemcpy(
-        d_input, input.data(),
-        sizeof(float) * inputSize.first * inputSize.second * nChannels,
-        cudaMemcpyHostToDevice
-    );
-    EXPECT_EQ(cudaStatus, cudaSuccess);
-
-    float *d_output = avgPoolingLayer.forward(d_input);
-
-    int outputSize = avgPoolingLayer.getOutputSize();
-
-    std::vector<float> output(outputSize);
-    cudaStatus = cudaMemcpy(
-        output.data(), d_output, sizeof(float) * outputSize,
-        cudaMemcpyDeviceToHost
-    );
-    EXPECT_EQ(cudaStatus, cudaSuccess);
-
-    std::vector<float> expected = {0.43775f, 0.49475f, 0.48975f, 0.339f,
-                                   0.45675f, 0.303f, 0.56975f, 0.57025f};
-
-    for (int i = 0; i < output.size(); ++i) {
-        EXPECT_NEAR(expected[i], output[i], 1e-4);
-    }
-
-    cudaStatus = cudaFree(d_input);
-    EXPECT_EQ(cudaStatus, cudaSuccess);
+    runTest();
 }
+
+TEST_F(AvgPoolingLayerTest, AvgPoolForwardNonSquareInputTest) {
+    inputSize   = {4, 6};  // Non-square input
+    nChannels   = 2;
+    poolingSize = {2, 2};
+    stride      = {2, 2};
+
+    input = {// Channel 0
+             0.573f, 0.619f, 0.732f, 0.055f, 0.123f, 0.234f, 0.243f, 0.316f,
+             0.573f, 0.619f, 0.456f, 0.789f, 0.712f, 0.055f, 0.243f, 0.316f,
+             0.654f, 0.987f, 0.573f, 0.619f, 0.742f, 0.055f, 0.321f, 0.654f,
+             // Channel 1
+             0.473f, 0.919f, 0.107f, 0.073f, 0.321f, 0.654f, 0.073f, 0.362f,
+             0.973f, 0.059f, 0.654f, 0.987f, 0.473f, 0.455f, 0.283f, 0.416f,
+             0.789f, 0.123f, 0.532f, 0.819f, 0.732f, 0.850f, 0.987f, 0.321f
+    };
+
+    expected = {0.43775f, 0.49475f, 0.4005f, 0.48975f, 0.339f, 0.654f,
+                0.45675f, 0.303f, 0.654f, 0.56975f, 0.57025f, 0.555f};
+
+    runTest();
+}
+
+TEST_F(AvgPoolingLayerTest, AvgPoolForwardNonSquarePoolingTest) {
+    inputSize   = {4, 4};
+    nChannels   = 2;
+    poolingSize = {2, 3};  // Non-square pooling
+    stride      = {2, 2};
+
+    input = {// Channel 0
+             0.573f, 0.619f, 0.732f, 0.055f, 0.243f, 0.316f, 0.573f, 0.619f,
+             0.712f, 0.055f, 0.243f, 0.316f, 0.573f, 0.619f, 0.742f, 0.055f,
+             // Channel 1
+             0.473f, 0.919f, 0.107f, 0.073f, 0.073f, 0.362f, 0.973f, 0.059f,
+             0.473f, 0.455f, 0.283f, 0.416f, 0.532f, 0.819f, 0.732f, 0.850f
+    };
+
+    expected = {0.50933f, 0.49067f, 0.4845f, 0.549f};
+
+    runTest();
+}
+
+TEST_F(AvgPoolingLayerTest, AvgPoolForwardNonSquareStrideTest) {
+    inputSize   = {4, 4};
+    nChannels   = 2;
+    poolingSize = {2, 2};
+    stride      = {1, 2};  // Non-square stride
+
+    input = {// Channel 0
+             0.573f, 0.619f, 0.732f, 0.055f, 0.243f, 0.316f, 0.573f, 0.619f,
+             0.712f, 0.055f, 0.243f, 0.316f, 0.573f, 0.619f, 0.742f, 0.055f,
+             // Channel 1
+             0.473f, 0.919f, 0.107f, 0.073f, 0.073f, 0.362f, 0.973f, 0.059f,
+             0.473f, 0.455f, 0.283f, 0.416f, 0.532f, 0.819f, 0.732f, 0.850f
+    };
+
+    expected = {0.43775f, 0.49475f, 0.3315f, 0.43775f, 0.48975f, 0.339f,
+                0.45675f, 0.303f, 0.34075f, 0.43275f, 0.56975f, 0.57025f};
+
+    runTest();
+}
\ No newline at end of file
diff --git a/test/layers/test_batch_norm.cu b/test/layers/test_batch_norm.cu
index 25db618..ee7bb60 100644
--- a/test/layers/test_batch_norm.cu
+++ b/test/layers/test_batch_norm.cu
@@ -6,27 +6,80 @@
 #include "activation.cuh"
 #include "batch_norm.cuh"
 
-TEST(BatchNormLayerTest, BatchNormSmallForwardTest) {
-    dim2d inputSize = {4, 4};
-    int   nChannels = 2;
+class BatchNormLayerTest : public ::testing::Test {
+  protected:
+    dim2d              inputSize;
+    int                nChannels;
+    std::vector<float> weights;
+    std::vector<float> biases;
+    std::vector<float> input;
+    std::vector<float> expected;
 
-    cudaError_t cudaStatus;
+    float *d_input;
+    float *d_output;
+    CUDANet::Layers::BatchNorm2d *batchNorm;
 
-    CUDANet::Layers::BatchNorm2d batchNorm(
-        inputSize, nChannels, 1e-5f, CUDANet::Layers::ActivationType::NONE
-    );
+    virtual void SetUp() override {
+        d_input   = nullptr;
+        d_output  = nullptr;
+        batchNorm = nullptr;
+    }
 
-    std::vector<float> weights = {0.63508f, 0.64903f};
-    std::vector<float> biases= {0.25079f, 0.66841f};
+    virtual void TearDown() override {
+        if (d_input) {
+            cudaFree(d_input);
+        }
+    }
 
-    batchNorm.setWeights(weights.data());
-    batchNorm.setBiases(biases.data());
+    void runTest() {
+        cudaError_t cudaStatus;
 
-    cudaStatus = cudaGetLastError();
-    EXPECT_EQ(cudaStatus, cudaSuccess);
+        batchNorm = new CUDANet::Layers::BatchNorm2d(
+            inputSize, nChannels, 1e-5f, CUDANet::Layers::ActivationType::NONE
+        );
+
+        batchNorm->setWeights(weights.data());
+        batchNorm->setBiases(biases.data());
+
+        cudaStatus = cudaGetLastError();
+        EXPECT_EQ(cudaStatus, cudaSuccess);
+
+        cudaStatus =
+            cudaMalloc((void **)&d_input, sizeof(float) * input.size());
+        EXPECT_EQ(cudaStatus, cudaSuccess);
+
+        cudaStatus = cudaMemcpy(
+            d_input, input.data(), sizeof(float) * input.size(),
+            cudaMemcpyHostToDevice
+        );
+        EXPECT_EQ(cudaStatus, cudaSuccess);
+
+        d_output = batchNorm->forward(d_input);
+
+        std::vector<float> output(input.size());
+        cudaStatus = cudaMemcpy(
+            output.data(), d_output, sizeof(float) * output.size(),
+            cudaMemcpyDeviceToHost
+        );
+        EXPECT_EQ(cudaStatus, cudaSuccess);
+
+        for (int i = 0; i < output.size(); ++i) {
+            EXPECT_NEAR(output[i], expected[i], 1e-5);
+        }
+
+        delete batchNorm;
+    }
+};
+
+TEST_F(BatchNormLayerTest, BatchNormSmallForwardTest) {
+    inputSize = {4, 4};
+    nChannels = 2;
+
+    weights = {0.63508f, 0.64903f};
+    biases  = {0.25079f, 0.66841f};
 
     // clang-format off
-    std::vector<float> input = {
+    input = {
         // Channel 0
         0.38899f, 0.80478f, 0.48836f, 0.97381f,
         0.57508f, 0.60835f, 0.65467f, 0.00168f,
@@ -40,40 +93,44 @@ TEST(BatchNormLayerTest, BatchNormSmallForwardTest) {
     };
     // clang-format on
 
-    std::vector<float> output(input.size());
+    expected = {-0.06007f, 0.951f, 0.18157f, 1.36202f, 0.39244f, 0.47335f,
+                0.58598f, -1.00188f, 0.59576f, 0.79919f, -0.57001f, 0.70469f,
+                -0.62847f, -0.06578f, -0.43668f, 0.72952f, 0.37726f, 0.02088f,
+                0.35446f, 0.98092f, 1.39264f, 1.80686f, 1.67786f, 1.58318f,
+                -0.0269f, 0.26878f, 0.81411f, 0.09022f, 0.9126f, 0.71485f,
+                -0.08184f, -0.19131f};
 
-    float* d_input;
-    cudaStatus = cudaMalloc((void**)&d_input, sizeof(float) * input.size());
-    EXPECT_EQ(cudaStatus, cudaSuccess);
+    runTest();
+}
 
-    cudaStatus = cudaMemcpy(
-        d_input, input.data(), sizeof(float) * input.size(),
-        cudaMemcpyHostToDevice
-    );
-    EXPECT_EQ(cudaStatus, cudaSuccess);
+TEST_F(BatchNormLayerTest, BatchNormNonSquareInputTest) {
+    inputSize = {4, 6};  // Non-square input
+    nChannels = 2;
+    weights = {0.63508f, 0.64903f};
+    biases  = {0.25079f, 0.66841f};
 
-    float* d_output = batchNorm.forward(d_input);
+    input = {// Channel 0
+             0.38899f, 0.80478f, 0.48836f, 0.97381f, 0.21567f, 0.92312f,
+             0.57508f, 0.60835f, 0.65467f, 0.00168f, 0.31567f, 0.71345f,
+             0.65869f, 0.74235f, 0.17928f, 0.70349f, 0.12856f, 0.95645f,
+             0.15524f, 0.38664f, 0.23411f, 0.7137f, 0.26789f, 0.83412f,
+             // Channel 1
+             0.32473f, 0.15698f, 0.314f, 0.60888f, 0.23145f, 0.78945f, 0.80268f,
+             0.99766f, 0.93694f, 0.89237f, 0.61234f, 0.92314f, 0.13449f,
+             0.27367f, 0.53036f, 0.18962f, 0.45623f, 0.14523f, 0.57672f,
+             0.48364f, 0.10863f, 0.0571f, 0.78934f, 0.67545f
+    };
 
-    cudaStatus = cudaMemcpy(
-        output.data(), d_output, sizeof(float) * output.size(),
-        cudaMemcpyDeviceToHost
-    );
-    EXPECT_EQ(cudaStatus, cudaSuccess);
+    expected = {-0.05598f, 0.87495f, 0.1665f, 1.2534f, -0.44404f,
+                1.13991f, 0.36066f, 0.43515f, 0.53886f, -0.92315f,
+                -0.22014f, 0.67047f, 0.54786f, 0.73517f, -0.52552f,
+                0.64817f, -0.63907f, 1.21453f, -0.57934f, -0.06124f,
+                -0.40275f, 0.67103f, -0.32712f, 0.94064f, 0.28344f,
+                -0.08405f, 0.25993f, 0.90592f, 0.07909f, 1.30149f,
+                1.33047f, 1.7576f, 1.62459f, 1.52695f, 0.9135f,
+                1.59436f, -0.13331f, 0.17158f, 0.73391f, -0.01254f,
+                0.57151f, -0.10979f, 0.83546f, 0.63156f, -0.18996f,
+                -0.30285f, 1.30124f, 1.05175f};
 
-    std::vector<float> expected = {-0.06007f, 0.951f, 0.18157f, 1.36202f,
-                                   0.39244f, 0.47335f, 0.58598f, -1.00188f,
-                                   0.59576f, 0.79919f, -0.57001f, 0.70469f,
-                                   -0.62847f, -0.06578f, -0.43668f, 0.72952f,
-                                   0.37726f, 0.02088f, 0.35446f, 0.98092f,
-                                   1.39264f, 1.80686f, 1.67786f, 1.58318f,
-                                   -0.0269f, 0.26878f, 0.81411f, 0.09022f,
-                                   0.9126f, 0.71485f, -0.08184f, -0.19131f};
-
-    // std::cout << "BatchNorm2d: " << std::endl;
-    for (int i = 0; i < output.size(); i++) {
-        EXPECT_NEAR(output[i], expected[i], 1e-5);
-        // std::cout << output[i] << " ";
-    }
-    // std::cout << std::endl;
-    cudaFree(d_input);
+
+    runTest();
 }
\ No newline at end of file
diff --git a/test/layers/test_conv2d.cu b/test/layers/test_conv2d.cu
index 3f15b4b..0084e3f 100644
--- a/test/layers/test_conv2d.cu
+++ b/test/layers/test_conv2d.cu
@@ -82,8 +82,9 @@ TEST_F(Conv2dTest, SimpleTest) {
     );
 
     int outputHeight = (inputSize.first - kernelSize.first) / stride.first + 1;
-    int outputWidth = (inputSize.second - kernelSize.second) / stride.second + 1;
-    int outputSize = outputHeight * outputWidth * numFilters;
+    int outputWidth =
+        (inputSize.second - kernelSize.second) / stride.second + 1;
+    int outputSize = outputHeight * outputWidth * numFilters;
     EXPECT_EQ(outputSize, conv2d.getOutputSize());
 
     d_output = conv2d.forward(d_input);
@@ -112,9 +113,12 @@ TEST_F(Conv2dTest, PaddedTest) {
     dim2d stride     = {1, 1};
     int   numFilters = 2;
 
-    int paddingFirst = CUDANET_SAME_PADDING(inputSize.first, kernelSize.first, stride.first);
-    int paddingSecond = CUDANET_SAME_PADDING(inputSize.second, kernelSize.second, stride.second);
-    dim2d paddingSize = {paddingFirst, paddingSecond};
+    int paddingFirst =
+        CUDANET_SAME_PADDING(inputSize.first, kernelSize.first, stride.first);
+    int paddingSecond = CUDANET_SAME_PADDING(
+        inputSize.second, kernelSize.second, stride.second
+    );
+    dim2d paddingSize = {paddingFirst, paddingSecond};
 
     CUDANet::Layers::ActivationType activationType =
         CUDANet::Layers::ActivationType::NONE;
@@ -177,7 +181,9 @@ TEST_F(Conv2dTest, PaddedTest) {
         activationType, input, kernels.data(), d_input
    );
 
-    EXPECT_EQ(inputSize.first * inputSize.second * numFilters, conv2d.getOutputSize());
+    EXPECT_EQ(
+        inputSize.first * inputSize.second * numFilters, conv2d.getOutputSize()
+    );
 
     d_output = conv2d.forward(d_input);
 
@@ -209,16 +215,18 @@ TEST_F(Conv2dTest, StridedPaddedConvolution) {
     dim2d inputSize  = {5, 5};
-    int inputChannels = 2;
+    int   inputChannels = 2;
     dim2d kernelSize = {3, 3};
     dim2d stride     = {2, 2};
-    int numFilters = 2;
+    int   numFilters = 2;
 
-    int paddingFirst = CUDANET_SAME_PADDING(inputSize.first, kernelSize.second, stride.first);
-    int paddingSecond = CUDANET_SAME_PADDING(inputSize.second, kernelSize.second, stride.second);
+    int paddingFirst =
+        CUDANET_SAME_PADDING(inputSize.first, kernelSize.second, stride.first);
+    int paddingSecond = CUDANET_SAME_PADDING(
+        inputSize.second, kernelSize.second, stride.second
+    );
     dim2d paddingSize = {paddingFirst, paddingSecond};
 
-
     CUDANet::Layers::ActivationType activationType =
         CUDANet::Layers::ActivationType::RELU;
@@ -265,7 +273,9 @@ TEST_F(Conv2dTest, StridedPaddedConvolution) {
         activationType, input, kernels.data(), d_input
     );
 
-    EXPECT_EQ(inputSize.first * inputSize.second * numFilters, conv2d.getOutputSize());
+    EXPECT_EQ(
+        inputSize.first * inputSize.second * numFilters, conv2d.getOutputSize()
+    );
 
     d_output = conv2d.forward(d_input);
 
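The expected vectors in the pooling tests above and below come from the PyTorch generator scripts at the end of this diff (tools/pooling_test.py). As a quick sanity check of the data layout: the top-left 2x2 window of channel 0 in the shared input is {0.573, 0.619, 0.243, 0.316}, so average pooling yields (0.573 + 0.619 + 0.243 + 0.316) / 4 = 0.43775, which is expected[0] in AvgPoolForwardTest, and max pooling yields 0.619, which is expected[0] in MaxPoolForwardTest below. A minimal standalone reproduction, assuming only stock PyTorch (not part of the patch):

import torch

# Top-left 2x2 window of channel 0 from the shared pooling test input.
window = torch.tensor([[0.573, 0.619],
                       [0.243, 0.316]]).reshape(1, 1, 2, 2)

print(torch.nn.AvgPool2d(kernel_size=2)(window).item())  # ~0.43775
print(torch.nn.MaxPool2d(kernel_size=2)(window).item())  # ~0.619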
diff --git a/test/layers/test_max_pooling.cu b/test/layers/test_max_pooling.cu
index cff5210..bc883f5 100644
--- a/test/layers/test_max_pooling.cu
+++ b/test/layers/test_max_pooling.cu
@@ -5,15 +5,74 @@
 
 #include "max_pooling.cuh"
 
-TEST(MaxPoolingLayerTest, MaxPoolForwardTest) {
-    dim2d inputSize   = {4, 4};
-    int   nChannels   = 2;
-    dim2d poolingSize = {2, 2};
-    dim2d stride      = {2, 2};
+class MaxPoolingLayerTest : public ::testing::Test {
+  protected:
+    dim2d              inputSize;
+    int                nChannels;
+    dim2d              poolingSize;
+    dim2d              stride;
+    std::vector<float> input;
+    std::vector<float> expected;
 
-    cudaError_t cudaStatus;
+    float *d_input;
+    float *d_output;
+    CUDANet::Layers::MaxPooling2d *maxPoolingLayer;
 
-    std::vector<float> input = {
+    virtual void SetUp() override {
+        d_input         = nullptr;
+        d_output        = nullptr;
+        maxPoolingLayer = nullptr;
+    }
+
+    virtual void TearDown() override {
+        if (d_input) {
+            cudaFree(d_input);
+        }
+        delete maxPoolingLayer;
+    }
+
+    void runTest() {
+        cudaError_t cudaStatus;
+
+        maxPoolingLayer = new CUDANet::Layers::MaxPooling2d(
+            inputSize, nChannels, poolingSize, stride,
+            CUDANet::Layers::ActivationType::NONE
+        );
+
+        cudaStatus =
+            cudaMalloc((void **)&d_input, sizeof(float) * input.size());
+        EXPECT_EQ(cudaStatus, cudaSuccess);
+
+        cudaStatus = cudaMemcpy(
+            d_input, input.data(), sizeof(float) * input.size(),
+            cudaMemcpyHostToDevice
+        );
+        EXPECT_EQ(cudaStatus, cudaSuccess);
+
+        d_output = maxPoolingLayer->forward(d_input);
+
+        int outputSize = maxPoolingLayer->getOutputSize();
+
+        std::vector<float> output(outputSize);
+        cudaStatus = cudaMemcpy(
+            output.data(), d_output, sizeof(float) * output.size(),
+            cudaMemcpyDeviceToHost
+        );
+        EXPECT_EQ(cudaStatus, cudaSuccess);
+
+        for (int i = 0; i < output.size(); ++i) {
+            EXPECT_FLOAT_EQ(expected[i], output[i]);
+        }
+    }
+};
+
+TEST_F(MaxPoolingLayerTest, MaxPoolForwardTest) {
+    inputSize   = {4, 4};
+    nChannels   = 2;
+    poolingSize = {2, 2};
+    stride      = {2, 2};
+
+    input = {
         // clang-format off
         // Channel 0
        0.573f, 0.619f, 0.732f, 0.055f,
@@ -28,43 +87,82 @@ TEST(MaxPoolingLayerTest, MaxPoolForwardTest) {
         // clang-format on
     };
 
-    CUDANet::Layers::MaxPooling2d maxPoolingLayer(
-        inputSize, nChannels, poolingSize, stride,
-        CUDANet::Layers::ActivationType::NONE
-    );
+    expected = {0.619f, 0.732f, 0.712f, 0.742f, 0.919f, 0.973f, 0.819f, 0.85f};
 
-    float *d_input;
-
-    cudaStatus = cudaMalloc(
-        (void **)&d_input, sizeof(float) * inputSize.first * inputSize.second * nChannels
-    );
-    EXPECT_EQ(cudaStatus, cudaSuccess);
-
-    cudaStatus = cudaMemcpy(
-        d_input, input.data(),
-        sizeof(float) * inputSize.first * inputSize.second * nChannels,
-        cudaMemcpyHostToDevice
-    );
-    EXPECT_EQ(cudaStatus, cudaSuccess);
-
-    float *d_output = maxPoolingLayer.forward(d_input);
-
-    int outputSize = maxPoolingLayer.getOutputSize();
-
-    std::vector<float> output(outputSize);
-    cudaStatus = cudaMemcpy(
-        output.data(), d_output, sizeof(float) * outputSize,
-        cudaMemcpyDeviceToHost
-    );
-    EXPECT_EQ(cudaStatus, cudaSuccess);
-
-    std::vector<float> expected = {0.619f, 0.732f, 0.712f, 0.742f,
-                                   0.919f, 0.973f, 0.819f, 0.85f};
-
-    for (int i = 0; i < output.size(); ++i) {
-        EXPECT_FLOAT_EQ(expected[i], output[i]);
-    }
-
-    cudaStatus = cudaFree(d_input);
-    EXPECT_EQ(cudaStatus, cudaSuccess);
+    runTest();
 }
+
+TEST_F(MaxPoolingLayerTest, MaxPoolForwardNonSquareInputTest) {
+    inputSize   = {4, 6};  // Non-square input
+    nChannels   = 2;
+    poolingSize = {2, 2};
+    stride      = {2, 2};
+
+    input = {// Channel 0
+             0.573f, 0.619f, 0.732f, 0.055f, 0.123f, 0.234f, 0.243f, 0.316f,
+             0.573f, 0.619f, 0.456f, 0.789f, 0.712f, 0.055f, 0.243f, 0.316f,
+             0.654f, 0.987f, 0.573f, 0.619f, 0.742f, 0.055f, 0.321f, 0.654f,
+             // Channel 1
+             0.473f, 0.919f, 0.107f, 0.073f, 0.321f, 0.654f, 0.073f, 0.362f,
+             0.973f, 0.059f, 0.654f, 0.987f, 0.473f, 0.455f, 0.283f, 0.416f,
+             0.789f, 0.123f, 0.532f, 0.819f, 0.732f, 0.850f, 0.987f, 0.321f
+    };
+
+    expected = {0.619f, 0.732f, 0.789f, 0.712f, 0.742f, 0.987f,
+                0.919f, 0.973f, 0.987f, 0.819f, 0.85f, 0.987f};
+
+    runTest();
+}
+
+TEST_F(MaxPoolingLayerTest, MaxPoolForwardNonSquarePoolSizeTest) {
+    inputSize   = {4, 4};
+    nChannels   = 2;
+    poolingSize = {2, 3};  // Non-square pooling size
+    stride      = {2, 2};
+
+    input = {
+        // clang-format off
+        // Channel 0
+        0.573f, 0.619f, 0.732f, 0.055f,
+        0.243f, 0.316f, 0.573f, 0.619f,
+        0.712f, 0.055f, 0.243f, 0.316f,
+        0.573f, 0.619f, 0.742f, 0.055f,
+        // Channel 1
+        0.473f, 0.919f, 0.107f, 0.073f,
+        0.073f, 0.362f, 0.973f, 0.059f,
+        0.473f, 0.455f, 0.283f, 0.416f,
+        0.532f, 0.819f, 0.732f, 0.850f
+        // clang-format on
+    };
+
+    expected = {0.732f, 0.742f, 0.973f, 0.819f};
+
+    runTest();
+}
+
+TEST_F(MaxPoolingLayerTest, MaxPoolForwardNonSquareStrideTest) {
+    inputSize   = {4, 4};
+    nChannels   = 2;
+    poolingSize = {2, 2};
+    stride      = {1, 2};  // Non-square stride
+
+    input = {
+        // clang-format off
+        // Channel 0
+        0.573f, 0.619f, 0.732f, 0.055f,
+        0.243f, 0.316f, 0.573f, 0.619f,
+        0.712f, 0.055f, 0.243f, 0.316f,
+        0.573f, 0.619f, 0.742f, 0.055f,
+        // Channel 1
+        0.473f, 0.919f, 0.107f, 0.073f,
+        0.073f, 0.362f, 0.973f, 0.059f,
+        0.473f, 0.455f, 0.283f, 0.416f,
+        0.532f, 0.819f, 0.732f, 0.850f
+        // clang-format on
+    };
+
+    expected = {0.619f, 0.732f, 0.712f, 0.619f, 0.712f, 0.742f,
+                0.919f, 0.973f, 0.473f, 0.973f, 0.819f, 0.85f};
+
+    runTest();
+}
\ No newline at end of file
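Both generator scripts below import print_cpp_vector from a utils module that is not part of this diff. For context, here is a minimal sketch of what such a helper plausibly looks like; the name comes from the imports below, but the formatting details are assumptions inferred from the five-decimal, f-suffixed literals in the tests above:

import torch

def print_cpp_vector(tensor: torch.Tensor) -> None:
    # Hypothetical sketch: print a flattened tensor as a C++ initializer
    # list, e.g. {0.43775f, 0.49475f, ...}, rounding to five decimals to
    # match the expected vectors used in the tests above.
    values = ", ".join(f"{round(v, 5)}f" for v in tensor.flatten().tolist())
    print("{" + values + "}")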
diff --git a/tools/batch_norm_test.py b/tools/batch_norm_test.py
index 0003b83..271949d 100644
--- a/tools/batch_norm_test.py
+++ b/tools/batch_norm_test.py
@@ -2,30 +2,51 @@ import torch
 
 from utils import print_cpp_vector
 
-batch_norm = torch.nn.BatchNorm2d(2, track_running_stats=False)
+def gen_batch_norm_test_result(input):
 
-weights = torch.Tensor([0.63508, 0.64903])
-biases= torch.Tensor([0.25079, 0.66841])
+    batch_norm = torch.nn.BatchNorm2d(2, track_running_stats=False)
 
-batch_norm.weight = torch.nn.Parameter(weights)
-batch_norm.bias = torch.nn.Parameter(biases)
+    weights = torch.Tensor([0.63508, 0.64903])
+    biases = torch.Tensor([0.25079, 0.66841])
 
-input = torch.Tensor([
-    # Channel 0
-    0.38899, 0.80478, 0.48836, 0.97381,
-    0.57508, 0.60835, 0.65467, 0.00168,
-    0.65869, 0.74235, 0.17928, 0.70349,
-    0.15524, 0.38664, 0.23411, 0.7137,
-    # Channel 1
-    0.32473, 0.15698, 0.314, 0.60888,
-    0.80268, 0.99766, 0.93694, 0.89237,
-    0.13449, 0.27367, 0.53036, 0.18962,
-    0.57672, 0.48364, 0.10863, 0.0571
-]).reshape(1, 2, 4, 4)
+    batch_norm.weight = torch.nn.Parameter(weights)
+    batch_norm.bias = torch.nn.Parameter(biases)
 
-output = batch_norm(input)
-print_cpp_vector(output.flatten())
+    output = batch_norm(input)
+    print_cpp_vector(output.flatten())
 
-print(batch_norm.running_mean)
-print(batch_norm.running_var)
+
+if __name__ == "__main__":
+
+    print("Generating test results...")
+    print("Batch norm test:")
+
+    input = torch.Tensor([
+        # Channel 0
+        0.38899, 0.80478, 0.48836, 0.97381,
+        0.57508, 0.60835, 0.65467, 0.00168,
+        0.65869, 0.74235, 0.17928, 0.70349,
+        0.15524, 0.38664, 0.23411, 0.7137,
+        # Channel 1
+        0.32473, 0.15698, 0.314, 0.60888,
+        0.80268, 0.99766, 0.93694, 0.89237,
+        0.13449, 0.27367, 0.53036, 0.18962,
+        0.57672, 0.48364, 0.10863, 0.0571
+    ]).reshape(1, 2, 4, 4)
+
+    gen_batch_norm_test_result(input)
+
+    print("Batch norm test non square input:")
+
+    input = torch.Tensor([
+        # Channel 0
+        0.38899, 0.80478, 0.48836, 0.97381, 0.21567, 0.92312,
+        0.57508, 0.60835, 0.65467, 0.00168, 0.31567, 0.71345,
+        0.65869, 0.74235, 0.17928, 0.70349, 0.12856, 0.95645,
+        0.15524, 0.38664, 0.23411, 0.7137, 0.26789, 0.83412,
+        # Channel 1
+        0.32473, 0.15698, 0.314, 0.60888, 0.23145, 0.78945,
+        0.80268, 0.99766, 0.93694, 0.89237, 0.61234, 0.92314,
+        0.13449, 0.27367, 0.53036, 0.18962, 0.45623, 0.14523,
+        0.57672, 0.48364, 0.10863, 0.0571, 0.78934, 0.67545
+    ]).reshape(1, 2, 4, 6)
+
+    gen_batch_norm_test_result(input)
diff --git a/tools/pooling_test.py b/tools/pooling_test.py
index f2af0e0..8d0caf1 100644
--- a/tools/pooling_test.py
+++ b/tools/pooling_test.py
@@ -14,6 +14,19 @@ def _get_pool_input():
         0.532, 0.819, 0.732, 0.850
     ]).reshape(1, 2, 4, 4)
 
+
+def _get_pool_input_non_square():
+    return torch.Tensor([
+        # Channel 0
+        0.573, 0.619, 0.732, 0.055, 0.123, 0.234,
+        0.243, 0.316, 0.573, 0.619, 0.456, 0.789,
+        0.712, 0.055, 0.243, 0.316, 0.654, 0.987,
+        0.573, 0.619, 0.742, 0.055, 0.321, 0.654,
+        # Channel 1
+        0.473, 0.919, 0.107, 0.073, 0.321, 0.654,
+        0.073, 0.362, 0.973, 0.059, 0.654, 0.987,
+        0.473, 0.455, 0.283, 0.416, 0.789, 0.123,
+        0.532, 0.819, 0.732, 0.850, 0.987, 0.321
+    ]).reshape(1, 2, 4, 6)
+
+
 def gen_max_pool_test_result():
     input = _get_pool_input()
 
@@ -23,6 +36,33 @@ def gen_max_pool_test_result():
 
     print_cpp_vector(output)
 
+
+def gen_max_pool_non_square_input_test_result():
+    input = _get_pool_input_non_square()
+
+    output = torch.nn.MaxPool2d(kernel_size=2, stride=2)(input)
+    output = torch.flatten(output)
+
+    print_cpp_vector(output)
+
+
+def gen_max_non_square_pool_test_result():
+    input = _get_pool_input()
+
+    output = torch.nn.MaxPool2d(kernel_size=(2, 3), stride=2)(input)
+    output = torch.flatten(output)
+
+    print_cpp_vector(output)
+
+
+def gen_max_pool_non_square_stride_test_result():
+    input = _get_pool_input()
+
+    output = torch.nn.MaxPool2d(kernel_size=2, stride=(1, 2))(input)
+    output = torch.flatten(output)
+
+    print_cpp_vector(output)
+
+
 def gen_avg_pool_test_result():
     input = _get_pool_input()
 
@@ -33,9 +73,55 @@ def gen_avg_pool_test_result():
 
     print_cpp_vector(output)
 
+
+def gen_avg_pool_non_square_input_test_result():
+    input = _get_pool_input_non_square()
+
+    output = torch.nn.AvgPool2d(kernel_size=2, stride=2)(input)
+    output = torch.flatten(output)
+
+    print_cpp_vector(output)
+
+
+def gen_avg_non_square_pool_test_result():
+    input = _get_pool_input()
+
+    output = torch.nn.AvgPool2d(kernel_size=(2, 3), stride=2)(input)
+    output = torch.flatten(output)
+
+    print_cpp_vector(output)
+
+
+def gen_avg_pool_non_square_stride_test_result():
+    input = _get_pool_input()
+
+    output = torch.nn.AvgPool2d(kernel_size=2, stride=(1, 2))(input)
+    output = torch.flatten(output)
+
+    print_cpp_vector(output)
+
+
 if __name__ == "__main__":
     print("Generating test results...")
 
     print("Max pool test:")
     gen_max_pool_test_result()
+    print("Max pool non square input test:")
+    gen_max_pool_non_square_input_test_result()
+    print("Max non square pool test:")
+    gen_max_non_square_pool_test_result()
+    print("Max pool non square stride test:")
+    gen_max_pool_non_square_stride_test_result()
+
+    print("--------------")
+
+    print("Avg pool test:")
     gen_avg_pool_test_result()
+    print("Avg pool non square input test:")
+    gen_avg_pool_non_square_input_test_result()
+    print("Avg non square pool test:")
+    gen_avg_non_square_pool_test_result()
+    print("Avg pool non square stride test:")
+    gen_avg_pool_non_square_stride_test_result()
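For the batch-norm expected values, torch.nn.BatchNorm2d with track_running_stats=False normalizes each channel with the batch mean and biased variance before applying the affine weight and bias, i.e. y = w * (x - mean) / sqrt(var + eps) + b with eps = 1e-5, matching the 1e-5f passed to CUDANet::Layers::BatchNorm2d in the tests. A small standalone check of the first channel-0 value against expected[0] = -0.06007 from BatchNormSmallForwardTest (a sketch, not part of the patch):

import torch

# Channel 0 of the 4x4 input from BatchNormSmallForwardTest.
x = torch.tensor([
    0.38899, 0.80478, 0.48836, 0.97381,
    0.57508, 0.60835, 0.65467, 0.00168,
    0.65869, 0.74235, 0.17928, 0.70349,
    0.15524, 0.38664, 0.23411, 0.7137,
])

w, b, eps = 0.63508, 0.25079, 1e-5  # per-channel weight, bias, epsilon
y = w * (x - x.mean()) / torch.sqrt(x.var(unbiased=False) + eps) + b
print(y[0].item())  # ~ -0.06007, expected[0] above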