diff --git a/include/kernels/activations.cuh b/include/kernels/activations.cuh
index 34ff65d..a23402d 100644
--- a/include/kernels/activations.cuh
+++ b/include/kernels/activations.cuh
@@ -3,9 +3,23 @@
 
 namespace Kernels {
 
+/**
+ * @brief Sigmoid activation function kernel
+ *
+ * @param src Pointer to the source array
+ * @param dst Pointer to the destination array
+ * @param len Length of the arrays
+ */
 __global__ void sigmoid(const float* __restrict__ src, float* __restrict__ dst, int len);
 
+/**
+ * @brief ReLU activation function kernel
+ *
+ * @param src Pointer to the source array
+ * @param dst Pointer to the destination array
+ * @param len Length of the arrays
+ */
 __global__ void relu(const float* __restrict__ src, float* __restrict__ dst, int len);
 
diff --git a/include/kernels/convolution.cuh b/include/kernels/convolution.cuh
index 0332320..ea3fc32 100644
--- a/include/kernels/convolution.cuh
+++ b/include/kernels/convolution.cuh
@@ -3,6 +3,16 @@
 
 namespace Kernels {
 
+/**
+ * @brief Kernel that pads the input matrix with zeros
+ *
+ * @param d_input Device pointer to the input matrix (as vector)
+ * @param d_padded Device pointer to the padded matrix (as vector)
+ * @param w Width of the input matrix
+ * @param h Height of the input matrix
+ * @param n Number of input channels
+ * @param p Padding size
+ */
 __global__ void padding(
     const float* d_input,
     float* d_padded,
@@ -12,6 +22,19 @@ __global__ void padding(
     int p
 );
 
+/**
+ * @brief Convolution kernel
+ *
+ * @param d_input Device pointer to the input matrix
+ * @param d_kernel Device pointer to the convolution kernel
+ * @param d_output Device pointer to the output matrix
+ * @param inputSize Width and height of the input matrix
+ * @param nChannels Number of channels in the input matrix
+ * @param kernelSize Width and height of the convolution kernel
+ * @param stride Convolution stride
+ * @param nFilters Number of output filters
+ * @param outputSize Width and height of the output matrix
+ */
 __global__ void convolution(
     const float* d_input,
     const float* d_kernel,
diff --git a/include/kernels/matmul.cuh b/include/kernels/matmul.cuh
index e8dd782..897f8df 100644
--- a/include/kernels/matmul.cuh
+++ b/include/kernels/matmul.cuh
@@ -3,6 +3,15 @@
 
 namespace Kernels {
 
+/**
+ * @brief Matrix-vector multiplication kernel
+ *
+ * @param d_matrix Device pointer to the matrix
+ * @param d_vector Device pointer to the vector
+ * @param d_output Device pointer to the output vector
+ * @param w Width of the matrix
+ * @param h Height of the matrix
+ */
 __global__ void mat_vec_mul(
     const float* d_matrix,
     const float* d_vector,
@@ -11,6 +20,14 @@ __global__ void mat_vec_mul(
     int h
 );
 
+/**
+ * @brief Vector-vector addition kernel
+ *
+ * @param d_vector1 Device pointer to the first vector
+ * @param d_vector2 Device pointer to the second vector
+ * @param d_output Device pointer to the output vector
+ * @param w Length of the vectors
+ */
 __global__ void vec_vec_add(
     const float* d_vector1,
     const float* d_vector2,
diff --git a/include/layers/conv2d.cuh b/include/layers/conv2d.cuh
index 8da8377..8e0335c 100644
--- a/include/layers/conv2d.cuh
+++ b/include/layers/conv2d.cuh
@@ -10,8 +10,23 @@
 
 namespace Layers {
 
+/**
+ * @brief 2D convolutional layer
+ *
+ */
 class Conv2d : public ILayer {
   public:
+    /**
+     * @brief Construct a new Conv2d layer
+     *
+     * @param inputSize Width and height of the input matrix
+     * @param inputChannels Number of channels in the input matrix
+     * @param kernelSize Width and height of the convolution kernel
+     * @param stride Convolution stride
+     * @param padding Padding type ('SAME' or 'VALID')
+     * @param numFilters Number of output filters
+     * @param activation Activation function ('RELU', 'SIGMOID' or 'NONE')
+     */
     Conv2d(
         int inputSize,
         int inputChannels,
@@ -21,21 +36,57 @@ class Conv2d : public ILayer {
         int numFilters,
         Layers::Activation activation
     );
+
+    /**
+     * @brief Destroy the Conv2d layer
+     *
+     */
     ~Conv2d();
 
-    // Outputs
-    int outputSize;
-
+    /**
+     * @brief Forward pass of the convolutional layer
+     *
+     * @param d_input Device pointer to the input matrix
+     * @return Device pointer to the output matrix
+     */
     float* forward(const float* d_input);
-    void setWeights(const float* weights_input);
-    void setBiases(const float* biases_input);
-    void host_conv(const float* input, float* output);
+
+    /**
+     * @brief Set the weights of the convolutional layer
+     *
+     * @param weights_input Pointer to the weights
+     */
+    void setWeights(const float* weights_input);
+
+    /**
+     * @brief Set the biases of the convolutional layer
+     *
+     * @param biases_input Pointer to the biases
+     */
+    void setBiases(const float* biases_input);
+
+    /**
+     * @brief Get the output width (and height) of the layer
+     *
+     * @return int
+     */
+    int getOutputSize() { return outputSize; }
+
+    /**
+     * @brief Get the padding size of the layer
+     *
+     * @return int
+     */
+    int getPaddingSize() { return paddingSize; }
 
   private:
     // Inputs
     int inputSize;
     int inputChannels;
 
+    // Outputs
+    int outputSize;
+
     // Kernel
     int kernelSize;
     int stride;
@@ -55,8 +106,22 @@
     // Kernels
     Layers::Activation activation;
 
+    /**
+     * @brief Initialize weights of the convolutional layer with zeros
+     *
+     */
     void initializeWeights();
+
+    /**
+     * @brief Initialize biases of the convolutional layer with zeros
+     *
+     */
     void initializeBiases();
+
+    /**
+     * @brief Copy weights and biases to the device
+     *
+     */
     void toCuda();
 };
 
diff --git a/include/layers/dense.cuh b/include/layers/dense.cuh
index 6d17e2c..7191dd7 100644
--- a/include/layers/dense.cuh
+++ b/include/layers/dense.cuh
@@ -9,17 +9,47 @@
 
 namespace Layers {
 
+/**
+ * @brief Dense (fully connected) layer
+ *
+ */
 class Dense : public ILayer {
   public:
-    Dense(
-        int inputSize,
-        int outputSize,
-        Layers::Activation activation
-    );
+    /**
+     * @brief Construct a new Dense layer
+     *
+     * @param inputSize Size of the input vector
+     * @param outputSize Size of the output vector
+     * @param activation Activation function ('RELU', 'SIGMOID' or 'NONE')
+     */
+    Dense(int inputSize, int outputSize, Layers::Activation activation);
+
+    /**
+     * @brief Destroy the Dense layer
+     *
+     */
     ~Dense();
 
+    /**
+     * @brief Forward pass of the dense layer
+     *
+     * @param d_input Device pointer to the input vector
+     * @return Device pointer to the output vector
+     */
     float* forward(const float* d_input);
+
+    /**
+     * @brief Set the weights of the layer
+     *
+     * @param weights Pointer to vector of weights
+     */
    void setWeights(const float* weights);
+
+    /**
+     * @brief Set the biases of the layer
+     *
+     * @param biases Pointer to vector of biases
+     */
     void setBiases(const float* biases);
 
   private:
@@ -36,8 +66,22 @@
     Layers::Activation activation;
 
+    /**
+     * @brief Initialize the weights to zeros
+     *
+     */
     void initializeWeights();
+
+    /**
+     * @brief Initialize the biases to zeros
+     *
+     */
     void initializeBiases();
+
+    /**
+     * @brief Copy the weights and biases to the device
+     *
+     */
     void toCuda();
 };
 
diff --git a/include/layers/ilayer.cuh b/include/layers/ilayer.cuh
index a63fbe1..8aa42db 100644
--- a/include/layers/ilayer.cuh
+++ b/include/layers/ilayer.cuh
@@ -6,22 +6,72 @@
 
 namespace Layers {
 
+/**
+ * @brief Activation functions
+ *
+ * SIGMOID: Sigmoid
+ * RELU: Rectified Linear Unit
+ *
+ */
 enum Activation { SIGMOID, RELU, NONE };
 
+/**
+ * @brief Padding types
+ *
+ * SAME: Zero padding such that the output size is the same as the input
+ * VALID: No padding
+ *
+ */
 enum Padding { SAME, VALID };
 
+/**
+ * @brief Base class for all layers
+ */
 class ILayer {
   public:
+    /**
+     * @brief Destroy the ILayer object
+     *
+     */
    virtual ~ILayer() {}
 
+    /**
+     * @brief Virtual function for forward pass
+     *
+     * @param input Pointer to the input (host pointer for the Input layer, device pointer otherwise)
+     * @return float* Device pointer to the output
+     */
     virtual float* forward(const float* input) = 0;
+
+    /**
+     * @brief Virtual function for setting weights
+     *
+     * @param weights Pointer to the weights
+     */
     virtual void setWeights(const float* weights) = 0;
+
+    /**
+     * @brief Virtual function for setting biases
+     *
+     * @param biases Pointer to the biases
+     */
     virtual void setBiases(const float* biases) = 0;
 
   private:
+
+    /**
+     * @brief Initialize the weights
+     */
     virtual void initializeWeights() = 0;
+
+    /**
+     * @brief Initialize the biases
+     */
     virtual void initializeBiases() = 0;
 
+    /**
+     * @brief Copy the weights and biases to the device
+     */
     virtual void toCuda() = 0;
 
     int inputSize;
diff --git a/include/layers/input.cuh b/include/layers/input.cuh
index 5c66124..d4a8283 100644
--- a/include/layers/input.cuh
+++ b/include/layers/input.cuh
@@ -5,11 +5,31 @@
 
 namespace Layers {
 
+/**
+ * @brief Input layer; simply copies the input to the device
+ *
+ */
 class Input : public ILayer {
   public:
+    /**
+     * @brief Construct a new Input layer
+     *
+     * @param inputSize Size of the input vector
+     */
     Input(int inputSize);
+
+    /**
+     * @brief Destroy the Input layer
+     *
+     */
     ~Input();
 
+    /**
+     * @brief Forward pass of the input layer; copies the input to the device
+     *
+     * @param input Host pointer to the input vector
+     * @return Device pointer to the output vector
+     */
     float* forward(const float* input);
 
     void setWeights(const float* weights);
diff --git a/include/utils/cuda_helper.cuh b/include/utils/cuda_helper.cuh
index 0a80a05..799044d 100644
--- a/include/utils/cuda_helper.cuh
+++ b/include/utils/cuda_helper.cuh
@@ -4,7 +4,10 @@
 #include
 #include
 
-// CUDA error checking macro
+/**
+ * @brief CUDA error checking macro
+ *
+ */
 #define CUDA_CHECK(call) \
     do { \
         cudaError_t result = call; \
diff --git a/test/layers/test_conv2d.cu b/test/layers/test_conv2d.cu
index ac73652..993a91b 100644
--- a/test/layers/test_conv2d.cu
+++ b/test/layers/test_conv2d.cu
@@ -80,7 +80,7 @@ TEST_F(Conv2dTest, SimpleTest) {
     );
 
     int outputSize = (inputSize - kernelSize) / stride + 1;
-    EXPECT_EQ(outputSize, conv2d.outputSize);
+    EXPECT_EQ(outputSize, conv2d.getOutputSize());
 
     d_output = conv2d.forward(d_input);
 
@@ -168,16 +168,16 @@ TEST_F(Conv2dTest, PaddedTest) {
         activation, input, kernels.data(), d_input
     );
 
-    EXPECT_EQ(inputSize, conv2d.outputSize);
+    EXPECT_EQ(inputSize, conv2d.getOutputSize());
 
     d_output = conv2d.forward(d_input);
 
     std::vector<float> output(
-        conv2d.outputSize * conv2d.outputSize * numFilters
+        conv2d.getOutputSize() * conv2d.getOutputSize() * numFilters
     );
     cudaMemcpy(
         output.data(), d_output,
-        sizeof(float) * conv2d.outputSize * conv2d.outputSize * numFilters,
+        sizeof(float) * conv2d.getOutputSize() * conv2d.getOutputSize() * numFilters,
         cudaMemcpyDeviceToHost
     );
 
@@ -253,16 +253,16 @@ TEST_F(Conv2dTest, StridedPaddedConvolution) {
        activation, input, kernels.data(), d_input
     );
 
-    EXPECT_EQ(inputSize, conv2d.outputSize);
+    EXPECT_EQ(inputSize, conv2d.getOutputSize());
 
     d_output = conv2d.forward(d_input);
 
     std::vector<float> output(
-        conv2d.outputSize * conv2d.outputSize * numFilters
+        conv2d.getOutputSize() * conv2d.getOutputSize() * numFilters
    );
     cudaMemcpy(
         output.data(), d_output,
-        sizeof(float) * conv2d.outputSize * conv2d.outputSize * numFilters,
+        sizeof(float) * conv2d.getOutputSize() * conv2d.getOutputSize() * numFilters,
         cudaMemcpyDeviceToHost
     );