diff --git a/include/backend/tensor.hpp b/include/backend/tensor.hpp
index f327a4f..0586f25 100644
--- a/include/backend/tensor.hpp
+++ b/include/backend/tensor.hpp
@@ -23,9 +23,6 @@ public:
     Tensor(Shape shape, DType dtype, IBackend* backend);
     ~Tensor();
 
-    void* allocate();
-    void deallocate();
-
     size_t size() const;
     size_t numel() const;
 
@@ -38,6 +35,10 @@ public:
 private:
     Shape shape;
     DType dtype;
+
+    size_t total_elms;
+    size_t total_size;
+
     IBackend* backend;
     void* d_ptr;
 };
diff --git a/src/backends/cuda/cuda_backend.cu b/src/backends/cuda/cuda_backend.cu
index 3201140..6e47316 100644
--- a/src/backends/cuda/cuda_backend.cu
+++ b/src/backends/cuda/cuda_backend.cu
@@ -1,60 +1,39 @@
-#include "backend/cuda_backend.cuh"
-#include "utils/cuda_helper.cuh"
-#include "kernels/activation_functions.cuh"
-#include "kernels/matmul.cuh"
-#include "utils/vector.cuh"
+#include <cuda_runtime.h>
+
+#include <cstddef>
+#include <cstdio>
+#include <cstdlib>
+
+#include "backend/cuda.cuh"
+
+cudaDeviceProp initializeCUDA() {
+    int deviceCount;
+    CUDA_CHECK(cudaGetDeviceCount(&deviceCount));
+
+    if (deviceCount == 0) {
+        std::fprintf(stderr, "No CUDA devices found. Exiting.\n");
+        std::exit(EXIT_FAILURE);
+    }
+
+    int device = 0;
+    CUDA_CHECK(cudaSetDevice(device));
+
+    cudaDeviceProp deviceProp;
+    CUDA_CHECK(cudaGetDeviceProperties(&deviceProp, device));
+
+    std::printf("Using CUDA device %d: %s\n", device, deviceProp.name);
+
+    return deviceProp;
+}
 
 using namespace CUDANet::Backend;
 
-void *CUDABackend::allocate(size_t bytes) {
-    void* devicePtr = nullptr;
-    CUDA_CHECK(cudaMalloc(&devicePtr, bytes));
-    return devicePtr;
+void* CUDABackend::allocate(size_t bytes) {
+    void* d_ptr = nullptr;
+    CUDA_CHECK(cudaMalloc(&d_ptr, bytes));
+    return d_ptr;
 }
 
 void CUDABackend::deallocate(void* ptr) {
     CUDA_CHECK(cudaFree(ptr));
 }
-
-
-void CUDABackend::relu(Tensor &tensor) {
-    int gridSize = (tensor.numel() + BLOCK_SIZE - 1) / BLOCK_SIZE;
-    Kernels::relu<<<gridSize, BLOCK_SIZE>>>((float*)tensor.data(), (float*)tensor.data(), tensor.numel());
-    CUDA_CHECK(cudaGetLastError());
-    CUDA_CHECK(cudaDeviceSynchronize());
-}
-
-void CUDABackend::sigmoid(Tensor &tensor) {
-    int gridSize = (tensor.numel() + BLOCK_SIZE - 1) / BLOCK_SIZE;
-    Kernels::sigmoid<<<gridSize, BLOCK_SIZE>>>((float*)tensor.data(), (float*)tensor.data(), tensor.numel());
-    CUDA_CHECK(cudaGetLastError());
-    CUDA_CHECK(cudaDeviceSynchronize());
-}
-
-void CUDABackend::softmax(Tensor &tensor, Tensor &temp_max, Tensor &temp_sum) {
-    int gridSize = (tensor.numel() + BLOCK_SIZE - 1) / BLOCK_SIZE;
-
-    // Find max value
-    Utils::max(tensor, temp_max, tensor.numel());
-
-    // Subtract max value to improve numerical stability
-    Kernels::vec_scalar_sub<<<gridSize, BLOCK_SIZE>>>(
-        (float*)tensor.data(), (float*)tensor.data(), (float*)temp_max.data(), tensor.numel()
-    );
-    CUDA_CHECK(cudaGetLastError());
-
-    // Compute exponentials
-    Kernels::vec_exp<<<gridSize, BLOCK_SIZE>>>(
-        (float*)tensor.data(), (float*)tensor.data(), tensor.numel()
-    );
-    CUDA_CHECK(cudaGetLastError());
-
-    // Find sum
-    Utils::sum(tensor, temp_sum, tensor.numel());
-
-    Kernels::vec_scalar_div<<<gridSize, BLOCK_SIZE>>>(
-        (float*)tensor.data(), (float*)tensor.data(), (float*)temp_sum.data(), tensor.numel()
-    );
-    CUDA_CHECK(cudaGetLastError());
-    CUDA_CHECK(cudaDeviceSynchronize());
-}
diff --git a/src/backends/cuda/layer_ops.cu b/src/backends/cuda/layer_ops.cu
new file mode 100644
index 0000000..9b70f4e
--- /dev/null
+++ b/src/backends/cuda/layer_ops.cu
@@ -0,0 +1,48 @@
+#include "backend/cuda.cuh"
+#include "utils/cuda_helper.cuh"
+#include "kernels/activation_functions.cuh"
+#include "kernels/matmul.cuh"
+
+using namespace CUDANet::Backend;
+
+void CUDABackend::relu(Tensor &tensor) {
+    int gridSize = (tensor.numel() + BLOCK_SIZE - 1) / BLOCK_SIZE;
+    Kernels::relu<<<gridSize, BLOCK_SIZE>>>(tensor.data<float>(), tensor.data<float>(), tensor.numel());
+    CUDA_CHECK(cudaGetLastError());
+    CUDA_CHECK(cudaDeviceSynchronize());
+}
+
+void CUDABackend::sigmoid(Tensor &tensor) {
+    int gridSize = (tensor.numel() + BLOCK_SIZE - 1) / BLOCK_SIZE;
+    Kernels::sigmoid<<<gridSize, BLOCK_SIZE>>>(tensor.data<float>(), tensor.data<float>(), tensor.numel());
+    CUDA_CHECK(cudaGetLastError());
+    CUDA_CHECK(cudaDeviceSynchronize());
+}
+
+void CUDABackend::softmax(Tensor &tensor, Tensor &temp_max, Tensor &temp_sum) {
+    int gridSize = (tensor.numel() + BLOCK_SIZE - 1) / BLOCK_SIZE;
+
+    // Find max value
+    max(tensor, temp_max);
+
+    // Subtract max value to improve numerical stability
+    Kernels::vec_scalar_sub<<<gridSize, BLOCK_SIZE>>>(
+        tensor.data<float>(), tensor.data<float>(), temp_max.data<float>(), tensor.numel()
+    );
+    CUDA_CHECK(cudaGetLastError());
+
+    // Compute exponentials
+    Kernels::vec_exp<<<gridSize, BLOCK_SIZE>>>(
+        tensor.data<float>(), tensor.data<float>(), tensor.numel()
+    );
+    CUDA_CHECK(cudaGetLastError());
+
+    // Find sum
+    sum(tensor, temp_sum);
+
+    Kernels::vec_scalar_div<<<gridSize, BLOCK_SIZE>>>(
+        tensor.data<float>(), tensor.data<float>(), temp_sum.data<float>(), tensor.numel()
+    );
+    CUDA_CHECK(cudaGetLastError());
+    CUDA_CHECK(cudaDeviceSynchronize());
+}
\ No newline at end of file
diff --git a/src/backends/cuda/tensor_ops.cu b/src/backends/cuda/tensor_ops.cu
index 00fd8b7..b5334e4 100644
--- a/src/backends/cuda/tensor_ops.cu
+++ b/src/backends/cuda/tensor_ops.cu
@@ -43,7 +43,6 @@ void CUDABackend::sum(const CUDANet::Backend::Tensor &input, CUDANet::Backend::Tensor &sum) {
         remaining = blocks_needed;
     }
-
 }
 
 void CUDABackend::max(const CUDANet::Backend::Tensor &input, CUDANet::Backend::Tensor &max) {
diff --git a/src/backends/cuda/utils/cuda_helper.cu b/src/backends/cuda/utils/cuda_helper.cu
deleted file mode 100644
index d1bd99b..0000000
--- a/src/backends/cuda/utils/cuda_helper.cu
+++ /dev/null
@@ -1,26 +0,0 @@
-#include <cuda_runtime.h>
-
-#include <cstdio>
-#include <cstdlib>
-
-#include "cuda_helper.cuh"
-
-cudaDeviceProp initializeCUDA() {
-    int deviceCount;
-    CUDA_CHECK(cudaGetDeviceCount(&deviceCount));
-
-    if (deviceCount == 0) {
-        std::fprintf(stderr, "No CUDA devices found. Exiting.\n");
-        std::exit(EXIT_FAILURE);
-    }
-
-    int device = 0;
-    CUDA_CHECK(cudaSetDevice(device));
-
-    cudaDeviceProp deviceProp;
-    CUDA_CHECK(cudaGetDeviceProperties(&deviceProp, device));
-
-    std::printf("Using CUDA device %d: %s\n", device, deviceProp.name);
-
-    return deviceProp;
-}
\ No newline at end of file
diff --git a/src/backends/tensor.cpp b/src/backends/tensor.cpp
index cd72b99..6a164d0 100644
--- a/src/backends/tensor.cpp
+++ b/src/backends/tensor.cpp
@@ -1,37 +1,44 @@
-#include <stdexcept>
-
 #include "backend/tensor.hpp"
+
+#include <stdexcept>
 
 using namespace CUDANet::Backend;
 
 Tensor::Tensor(Shape shape, DType dtype, IBackend* backend)
-    : shape(shape), dtype(dtype), backend(backend), d_ptr(nullptr) {}
-
-Tensor::~Tensor() {
-    deallocate();
-}
-
-size_t Tensor::numel() const {
-    size_t totalElements = 1;
+    : shape(shape), dtype(dtype), backend(backend), d_ptr(nullptr) {
+    // Count total elements
+    size_t count = 1;
     for (const auto& dim : shape) {
-        totalElements *= dim;
+        count *= dim;
     }
-    return totalElements;
-}
+    total_elms = count;
 
-size_t Tensor::size() const {
-    size_t totalSize = numel();
-
-    size_t typeSize = 0;
+    // Compute total size (bytes)
+    size_t type_size = 0;
     switch (dtype) {
         case DType::FLOAT32:
-            typeSize = 4;
+            type_size = 4;
             break;
         default:
             throw std::runtime_error("Unsupported data type");
     }
+    total_size = total_elms * type_size;
 
-    return totalSize * typeSize;
+    // Allocate memory on backend
+    d_ptr = backend->allocate(total_size);
+}
+
+Tensor::~Tensor() {
+    backend->deallocate(d_ptr);
+    d_ptr = nullptr;
+}
+
+size_t Tensor::numel() const {
+    return total_elms;
+}
+
+size_t Tensor::size() const {
+    return total_size;
 }
 
 template <typename T>
@@ -42,4 +49,4 @@ const T* Tensor::data() const {
     return static_cast<const T*>(d_ptr);
 }
 
 template <typename T>
 T* Tensor::data() {
     return static_cast<T*>(d_ptr);
-}
\ No newline at end of file
+}
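
Note for reviewers: with this change, Tensor owns its device buffer. The constructor computes numel/size once and calls backend->allocate(), and the destructor calls backend->deallocate(), so the manual allocate()/deallocate() pair disappears from call sites. A minimal call-site sketch follows; the exact definition of Shape and a default-constructible CUDABackend are assumptions here, since neither appears in this diff.

// Hypothetical call-site sketch (not part of the patch).
// Assumed: Shape is a brace-constructible dimension list and
// CUDABackend is default-constructible.
#include "backend/cuda.cuh"
#include "backend/tensor.hpp"

using namespace CUDANet::Backend;

int main() {
    initializeCUDA();  // selects device 0 and prints its name

    CUDABackend backend;
    Shape shape = {2, 512};

    // Construction allocates the device buffer via the backend;
    // no manual memory management at the call site.
    Tensor logits(shape, DType::FLOAT32, &backend);
    Tensor temp_max({1}, DType::FLOAT32, &backend);
    Tensor temp_sum({1}, DType::FLOAT32, &backend);

    backend.relu(logits);
    backend.softmax(logits, temp_max, temp_sum);

    return 0;
}  // device buffers freed here as the tensors go out of scope

One consequence worth flagging: the header declares no copy constructor or copy assignment, so a compiler-generated copy of a Tensor would double-free d_ptr in the destructor. The backend methods already take tensors by reference, which avoids this, but deleting the copy operations would make the ownership explicit.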