diff --git a/include/kernels/pooling.cuh b/include/kernels/pooling.cuh
index 35d6dda..7e5a0aa 100644
--- a/include/kernels/pooling.cuh
+++ b/include/kernels/pooling.cuh
@@ -13,7 +13,8 @@ __global__ void max_pooling(
     const dim2d outputSize,
     const int nChannels,
     const dim2d poolingSize,
-    const dim2d stride
+    const dim2d stride,
+    const dim2d padding
 );
 
 __global__ void avg_pooling(
diff --git a/include/layers/max_pooling.cuh b/include/layers/max_pooling.cuh
index 020e643..7cef5f5 100644
--- a/include/layers/max_pooling.cuh
+++ b/include/layers/max_pooling.cuh
@@ -13,6 +13,7 @@ class MaxPooling2d : public SequentialLayer, public TwoDLayer {
         int nChannels,
         dim2d poolingSize,
         dim2d stride,
+        dim2d padding,
         ActivationType activationType
     );
     ~MaxPooling2d();
@@ -40,6 +41,7 @@ class MaxPooling2d : public SequentialLayer, public TwoDLayer {
     int nChannels;
     dim2d poolingSize;
     dim2d stride;
+    dim2d padding;
 
     dim2d outputSize;
diff --git a/src/kernels/pooling.cu b/src/kernels/pooling.cu
index ffc520f..5a27555 100644
--- a/src/kernels/pooling.cu
+++ b/src/kernels/pooling.cu
@@ -11,7 +11,8 @@ __global__ void Kernels::max_pooling(
     const dim2d outputSize,
     const int nChannels,
     const dim2d poolingSize,
-    const dim2d stride
+    const dim2d stride,
+    const dim2d padding
 ) {
     int j = blockDim.x * blockIdx.x + threadIdx.x;
     int i = blockDim.y * blockIdx.y + threadIdx.y;
@@ -25,12 +26,16 @@ __global__ void Kernels::max_pooling(
 
     for (int k = 0; k < poolingSize.first; k++) {
         for (int l = 0; l < poolingSize.second; l++) {
-            int inputIndex = c * inputSize.first * inputSize.second +
-                             (i * stride.first + k) * inputSize.second +
-                             (j * stride.second + l);
+            int inputRow = i * stride.first + k - padding.first;
+            int inputCol = j * stride.second + l - padding.second;
 
-            if (d_input[inputIndex] > max) {
-                max = d_input[inputIndex];
+            if (inputRow >= 0 && inputRow < inputSize.first && inputCol >= 0 &&
+                inputCol < inputSize.second) {
+                int inputIndex = c * inputSize.first * inputSize.second +
+                                 inputRow * inputSize.second + inputCol;
+                if (d_input[inputIndex] > max) {
+                    max = d_input[inputIndex];
+                }
             }
         }
     }
@@ -62,12 +67,11 @@ __global__ void Kernels::avg_pooling(
 
     for (int k = 0; k < poolingSize.first; k++) {
         for (int l = 0; l < poolingSize.second; l++) {
-
             int inputRow = i * stride.first + k - padding.first;
             int inputCol = j * stride.second + l - padding.second;
 
-            if (inputRow >= 0 && inputRow < inputSize.first &&
-                inputCol >= 0 && inputCol < inputSize.second) {
+            if (inputRow >= 0 && inputRow < inputSize.first && inputCol >= 0 &&
+                inputCol < inputSize.second) {
                 int inputIndex = c * inputSize.first * inputSize.second +
                                  inputRow * inputSize.second + inputCol;
                 sum += d_input[inputIndex];
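
Note on the kernel change: skipping window positions that fall outside the input is equivalent to padding with negative infinity for max pooling (a padded cell can never win the max), and it is the same bounds check `avg_pooling` already performs. The host-side reference below is an editor's sketch for sanity-checking the index arithmetic, not part of the patch; `max_pool_ref` and the `dim2d` alias are assumed stand-ins for the project's own definitions.

```cpp
#include <cfloat>
#include <cstdio>
#include <utility>
#include <vector>

using dim2d = std::pair<int, int>;  // assumed stand-in for the project's dim2d

// CPU mirror of the padded max_pooling kernel: every window position that
// falls into the padding border is skipped, so padding never contributes.
std::vector<float> max_pool_ref(
    const std::vector<float>& in, dim2d inputSize, int nChannels,
    dim2d poolingSize, dim2d stride, dim2d padding
) {
    dim2d out = {
        (inputSize.first + 2 * padding.first - poolingSize.first) / stride.first + 1,
        (inputSize.second + 2 * padding.second - poolingSize.second) / stride.second + 1
    };
    std::vector<float> result(nChannels * out.first * out.second);
    for (int c = 0; c < nChannels; c++)
        for (int i = 0; i < out.first; i++)
            for (int j = 0; j < out.second; j++) {
                float max = -FLT_MAX;
                for (int k = 0; k < poolingSize.first; k++)
                    for (int l = 0; l < poolingSize.second; l++) {
                        int row = i * stride.first + k - padding.first;
                        int col = j * stride.second + l - padding.second;
                        if (row >= 0 && row < inputSize.first &&
                            col >= 0 && col < inputSize.second) {
                            float v = in[(c * inputSize.first + row) * inputSize.second + col];
                            if (v > max) max = v;
                        }
                    }
                result[(c * out.first + i) * out.second + j] = max;
            }
    return result;
}

int main() {
    // Channel 0 of the new padding test below: 4x4 input, 2x2 window,
    // stride {2, 2}, padding {0, 1}; prints 0.573 0.732 0.619 0.712 0.742 0.316.
    std::vector<float> in = {
        0.573f, 0.619f, 0.732f, 0.055f,
        0.243f, 0.316f, 0.573f, 0.619f,
        0.712f, 0.055f, 0.243f, 0.316f,
        0.573f, 0.619f, 0.742f, 0.055f
    };
    for (float v : max_pool_ref(in, {4, 4}, 1, {2, 2}, {2, 2}, {0, 1}))
        std::printf("%.3f ", v);
    std::printf("\n");
    return 0;
}
```

Run against channel 0 of the new test case further down, it reproduces the first six values of that test's `expected` vector.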
diff --git a/src/layers/max_pooling.cu b/src/layers/max_pooling.cu
index f1819cd..cd50b5f 100644
--- a/src/layers/max_pooling.cu
+++ b/src/layers/max_pooling.cu
@@ -9,23 +9,31 @@ MaxPooling2d::MaxPooling2d(
     dim2d inputSize,
     int nChannels,
     dim2d poolingSize,
     dim2d stride,
+    dim2d padding,
     ActivationType activationType
 )
     : inputSize(inputSize),
       nChannels(nChannels),
       poolingSize(poolingSize),
-      stride(stride) {
+      stride(stride),
+      padding(padding) {
     outputSize = {
-        (inputSize.first - poolingSize.first) / stride.first + 1,
-        (inputSize.second - poolingSize.second) / stride.second + 1
+        (inputSize.first + 2 * padding.first - poolingSize.first) /
+                stride.first +
+            1,
+        (inputSize.second + 2 * padding.second - poolingSize.second) /
+                stride.second +
+            1
     };
 
-    activation =
-        new Activation(activationType, outputSize.first * outputSize.second * nChannels);
+    activation = new Activation(
+        activationType, outputSize.first * outputSize.second * nChannels
+    );
 
     d_output = nullptr;
     CUDA_CHECK(cudaMalloc(
-        (void**)&d_output, sizeof(float) * outputSize.first * outputSize.second * nChannels
+        (void**)&d_output,
+        sizeof(float) * outputSize.first * outputSize.second * nChannels
     ));
 }
@@ -43,7 +51,8 @@ float* MaxPooling2d::forward(const float* d_input) {
     );
 
     Kernels::max_pooling<<<grid, block>>>(
-        d_input, d_output, inputSize, outputSize, nChannels, poolingSize, stride
+        d_input, d_output, inputSize, outputSize, nChannels, poolingSize,
+        stride, padding
     );
 
     CUDA_CHECK(cudaGetLastError());
diff --git a/test/layers/test_max_pooling.cu b/test/layers/test_max_pooling.cu
index bc883f5..a25e88c 100644
--- a/test/layers/test_max_pooling.cu
+++ b/test/layers/test_max_pooling.cu
@@ -11,6 +11,7 @@ class MaxPoolingLayerTest : public ::testing::Test {
     int nChannels;
     dim2d poolingSize;
     dim2d stride;
+    dim2d padding;
 
     std::vector<float> input;
     std::vector<float> expected;
@@ -35,7 +36,7 @@ class MaxPoolingLayerTest : public ::testing::Test {
         cudaError_t cudaStatus;
 
         maxPoolingLayer = new CUDANet::Layers::MaxPooling2d(
-            inputSize, nChannels, poolingSize, stride,
+            inputSize, nChannels, poolingSize, stride, padding,
             CUDANet::Layers::ActivationType::NONE
         );
 
@@ -71,6 +72,7 @@ TEST_F(MaxPoolingLayerTest, MaxPoolForwardTest) {
     nChannels   = 2;
     poolingSize = {2, 2};
     stride      = {2, 2};
+    padding     = {0, 0};
 
     input = {
         // clang-format off
@@ -97,6 +99,7 @@ TEST_F(MaxPoolingLayerTest, MaxPoolForwardNonSquareInputTest) {
     nChannels   = 2;
     poolingSize = {2, 2};
     stride      = {2, 2};
+    padding     = {0, 0};
 
     input = {// Channel 0
              0.573f, 0.619f, 0.732f, 0.055f, 0.123f, 0.234f, 0.243f, 0.316f,
@@ -118,6 +121,7 @@ TEST_F(MaxPoolingLayerTest, MaxPoolForwardNonSquarePoolSizeTest) {
     nChannels   = 2;
     poolingSize = {2, 3};  // Non-square pooling size
     stride      = {2, 2};
+    padding     = {0, 0};
 
     input = {
         // clang-format off
@@ -145,6 +149,7 @@ TEST_F(MaxPoolingLayerTest, MaxPoolForwardNonSquareStrideTest) {
     nChannels   = 2;
    poolingSize = {2, 2};
     stride      = {1, 2};  // Non-square stride
+    padding     = {0, 0};
 
     input = {
         // clang-format off
@@ -165,4 +170,32 @@ TEST_F(MaxPoolingLayerTest, MaxPoolForwardNonSquareStrideTest) {
 
     runTest();
-}
\ No newline at end of file
+}
+
+TEST_F(MaxPoolingLayerTest, MaxPoolForwardNonSquarePaddingTest) {
+    inputSize   = {4, 4};
+    nChannels   = 2;
+    poolingSize = {2, 2};
+    stride      = {2, 2};
+    padding     = {0, 1};  // Non-square padding
+
+    input = {
+        // clang-format off
+        // Channel 0
+        0.573f, 0.619f, 0.732f, 0.055f,
+        0.243f, 0.316f, 0.573f, 0.619f,
+        0.712f, 0.055f, 0.243f, 0.316f,
+        0.573f, 0.619f, 0.742f, 0.055f,
+        // Channel 1
+        0.473f, 0.919f, 0.107f, 0.073f,
+        0.073f, 0.362f, 0.973f, 0.059f,
+        0.473f, 0.455f, 0.283f, 0.416f,
+        0.532f, 0.819f, 0.732f, 0.850f
+        // clang-format on
+    };
+
+    expected = {0.573f, 0.732f, 0.619f, 0.712f, 0.742f, 0.316f,
+                0.473f, 0.973f, 0.073f, 0.532f, 0.819f, 0.85f};
+
+    runTest();
+}
\ No newline at end of file
diff --git a/test/model/test_model.cu b/test/model/test_model.cu
index 571cfe4..c9b397c 100644
--- a/test/model/test_model.cu
+++ b/test/model/test_model.cu
@@ -45,7 +45,7 @@ class ModelTest : public ::testing::Test {
         CUDANet::Layers::MaxPooling2d *maxpool2d =
             new CUDANet::Layers::MaxPooling2d(
                 poolingInput, numFilters, poolingSize,
-                poolingStride, CUDANet::Layers::ActivationType::RELU
+                poolingStride, {0, 0}, CUDANet::Layers::ActivationType::RELU
             );
         model->addLayer("maxpool1", maxpool2d);
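
The layer constructor above switches to the standard padded output-size formula, out = (in + 2 * pad - pool) / stride + 1. A quick compile-time check of that formula follows; `pooledDim` is a hypothetical helper written only for this note, not part of the patch. It confirms the 2 x 3 per-channel output that the new test's 12-element `expected` vector relies on.

```cpp
// Sanity check for the padded output-size formula used in the constructor.
constexpr int pooledDim(int in, int pad, int pool, int stride) {
    return (in + 2 * pad - pool) / stride + 1;
}

// MaxPoolForwardNonSquarePaddingTest: 4x4 input, 2x2 window, stride {2, 2},
// padding {0, 1} -> 2x3 output per channel, 12 values across 2 channels.
static_assert(pooledDim(4, 0, 2, 2) == 2, "output rows");
static_assert(pooledDim(4, 1, 2, 2) == 3, "output cols");

int main() { return 0; }
```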
diff --git a/tools/pooling_test.py b/tools/pooling_test.py
index 3e48381..97f38ba 100644
--- a/tools/pooling_test.py
+++ b/tools/pooling_test.py
@@ -62,6 +62,14 @@ def gen_max_pool_non_square_stride_test_result():
     print_cpp_vector(output)
 
 
+def gen_max_pool_non_square_padding_test_result():
+    input = _get_pool_input()
+
+    output = torch.nn.MaxPool2d(kernel_size=2, stride=2, padding=(0, 1))(input)
+    output = torch.flatten(output)
+
+    print_cpp_vector(output)
+
 
 def gen_avg_pool_test_result():
@@ -123,6 +131,8 @@ if __name__ == "__main__":
     gen_max_non_square_pool_test_result()
     print("Max pool non square stride test:")
     gen_max_pool_non_square_stride_test_result()
+    print("Max pool non square padding test:")
+    gen_max_pool_non_square_padding_test_result()
     print("--------------")
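
Finally, a hypothetical call site with the new signature; the function name is illustrative and the include path is inferred from this patch's layout, but the argument order matches the updated tests.

```cpp
#include "layers/max_pooling.cuh"  // include path as laid out in this patch

// Mirrors MaxPoolForwardNonSquarePaddingTest: 4x4 input, 2 channels, 2x2
// window, stride {2, 2}, asymmetric padding {0, 1}. The layer allocates and
// owns its output buffer, so it is kept on the heap as in the tests.
CUDANet::Layers::MaxPooling2d* makePaddedPool() {
    return new CUDANet::Layers::MaxPooling2d(
        {4, 4}, 2, {2, 2}, {2, 2}, {0, 1},
        CUDANet::Layers::ActivationType::NONE
    );
}

// Usage:
//   auto* pool = makePaddedPool();
//   float* d_out = pool->forward(d_in);  // d_in: 2 * 4 * 4 device floats
//   ...                                  // d_out: 2 * 2 * 3 pooled values
//   delete pool;
```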