Allocate activation on heap

2025-11-05 17:34:21 +00:00 · 2024-04-22 18:59:16 +02:00
parent 26cea9b12c
commit a32c737785
10 changed files with 17 additions and 15 deletions
--- a/src/layers/activation.cu
+++ b/src/layers/activation.cu
@@ -77,4 +77,3 @@ void Activation::activate(float* d_input) {

    CUDA_CHECK(cudaDeviceSynchronize());
 }
-
--- a/src/layers/avg_pooling.cu
+++ b/src/layers/avg_pooling.cu
@@ -18,7 +18,7 @@ AvgPooling2D::AvgPooling2D(
    outputSize = (inputSize - poolingSize) / stride + 1;

    activation =
-        Activation(activationType, outputSize * outputSize * nChannels);
+        new Activation(activationType, outputSize * outputSize * nChannels);

    d_output = nullptr;
    CUDA_CHECK(cudaMalloc(
@@ -28,6 +28,7 @@ AvgPooling2D::AvgPooling2D(

 AvgPooling2D::~AvgPooling2D() {
    cudaFree(d_output);
+    delete activation;
 }

 float* AvgPooling2D::forward(const float* d_input) {
@@ -44,7 +45,7 @@ float* AvgPooling2D::forward(const float* d_input) {
    );
    CUDA_CHECK(cudaGetLastError());

-    activation.activate(d_output);
+    activation->activate(d_output);
    CUDA_CHECK(cudaDeviceSynchronize());

    return d_output;
--- a/src/layers/conv2d.cu
+++ b/src/layers/conv2d.cu
@@ -29,7 +29,7 @@ Conv2d::Conv2d(

    outputSize = (inputSize - kernelSize + 2 * paddingSize) / stride + 1;

-    activation = Activation(
+    activation = new Activation(
        activationType, outputSize * outputSize * numFilters
    );

@@ -62,6 +62,7 @@ Conv2d::~Conv2d() {
    cudaFree(d_output);
    cudaFree(d_weights);
    cudaFree(d_biases);
+    delete activation;
 }

 void Conv2d::initializeWeights() {
@@ -123,7 +124,7 @@ float* Conv2d::forward(const float* d_input) {
    CUDA_CHECK(cudaGetLastError());
 
    // Apply activation
-    activation.activate(d_output);
+    activation->activate(d_output);

    CUDA_CHECK(cudaDeviceSynchronize());

--- a/src/layers/dense.cu
+++ b/src/layers/dense.cu
@@ -45,14 +45,14 @@ Dense::Dense(
        (std::max(inputSize, outputSize) + BLOCK_SIZE - 1) / BLOCK_SIZE;
    biasGridSize = (outputSize + BLOCK_SIZE - 1) / BLOCK_SIZE;

-    activation = Activation(activationType, outputSize);
+    activation = new Activation(activationType, outputSize);
 }

 Dense::~Dense() {
-    // Free GPU memory
    cudaFree(d_output);
    cudaFree(d_weights);
    cudaFree(d_biases);
+    delete activation;
 }

 void Dense::initializeWeights() {
@@ -75,7 +75,7 @@ float* Dense::forward(const float* d_input) {
    );
    CUDA_CHECK(cudaGetLastError());

-    activation.activate(d_output);
+    activation->activate(d_output);
    CUDA_CHECK(cudaDeviceSynchronize());

    return d_output;
--- a/src/layers/max_pooling.cu
+++ b/src/layers/max_pooling.cu
@@ -17,7 +17,7 @@ MaxPooling2D::MaxPooling2D(

    outputSize  = (inputSize - 1) / stride + 1;

-    activation = Activation(
+    activation = new Activation(
        activationType, outputSize * outputSize * nChannels
    );

@@ -30,6 +30,7 @@ MaxPooling2D::MaxPooling2D(

 MaxPooling2D::~MaxPooling2D() {
    cudaFree(d_output);
+    delete activation;
 }


@@ -47,7 +48,7 @@ float* MaxPooling2D::forward(const float* d_input) {
    );
    CUDA_CHECK(cudaGetLastError());

-    activation.activate(d_output);
+    activation->activate(d_output);
    CUDA_CHECK(cudaDeviceSynchronize());

    return d_output;
--- a/src/utils/vector.cu
+++ b/src/utils/vector.cu
@@ -31,6 +31,7 @@ void Utils::max(float* d_vec, float* d_max, const unsigned int length) {
    CUDA_CHECK(cudaGetLastError());

    int remaining = grid_size;
+
    while (remaining > 1) {
        int blocks_needed = (remaining + BLOCK_SIZE - 1) / BLOCK_SIZE;
        CUDANet::Kernels::max_reduce<<<blocks_needed, BLOCK_SIZE>>>(d_max, d_max, remaining);