mirror of
https://github.com/lordmathis/CUDANet.git
synced 2025-12-24 15:24:24 +00:00
WIP Migrate Dense layer
This commit is contained in:
@@ -45,4 +45,24 @@ void CUDA::softmax(Tensor &tensor, Tensor &temp_max, Tensor &temp_sum) {
|
||||
);
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
CUDA_CHECK(cudaDeviceSynchronize());
|
||||
}
|
||||
|
||||
/**
 * Forward step of a dense layer on the device.
 *
 * Launches Kernels::mat_vec_mul over (weights, input) writing into `output`,
 * then Kernels::vec_vec_add to add `biases` into `output` in place, and
 * finally blocks until the device has finished.
 * NOTE(review): presumably this yields output = weights * input + biases;
 * confirm against the kernel definitions.
 *
 * @param weights      Tensor whose float data is passed as the matrix operand.
 * @param biases       Tensor whose float data is added to the result.
 * @param input        Tensor whose float data is passed as the vector operand.
 * @param output       Tensor receiving the result; also returned.
 * @param input_size   Forwarded to mat_vec_mul as its first size argument.
 * @param output_size  Forwarded to mat_vec_mul as its second size argument and
 *                     used as the element count for the bias addition.
 * @return Reference to `output`.
 */
CUDANet::Tensor& CUDA::dense(CUDANet::Tensor &weights, CUDANet::Tensor &biases, CUDANet::Tensor &input, CUDANet::Tensor &output, size_t input_size, size_t output_size) {
    // Ceil-divided block counts for the two launches.
    const auto largest_dim   = std::max(input_size, output_size);
    const auto matvec_blocks = (largest_dim + BLOCK_SIZE - 1) / BLOCK_SIZE;
    const auto bias_blocks   = (output_size + BLOCK_SIZE - 1) / BLOCK_SIZE;

    auto out_ptr = output.data<float>();

    // Matrix-vector product into the output buffer.
    Kernels::mat_vec_mul<<<matvec_blocks, BLOCK_SIZE>>>(
        weights.data<float>(), input.data<float>(), out_ptr, input_size,
        output_size
    );
    CUDA_CHECK(cudaGetLastError());

    // Bias addition, accumulated in place on the output buffer.
    Kernels::vec_vec_add<<<bias_blocks, BLOCK_SIZE>>>(
        biases.data<float>(), out_ptr, out_ptr, output_size
    );
    CUDA_CHECK(cudaGetLastError());

    // Wait for both kernels so execution errors surface here.
    CUDA_CHECK(cudaDeviceSynchronize());

    return output;
}
|
||||
@@ -1,77 +0,0 @@
|
||||
#include <vector>
|
||||
|
||||
#include "activation.hpp"
|
||||
#include "activation_functions.cuh"
|
||||
#include "cuda_helper.cuh"
|
||||
#include "matmul.cuh"
|
||||
#include "vector.cuh"
|
||||
|
||||
using namespace CUDANet::Layers;
|
||||
|
||||
/**
 * Prepares device-side state for this activation.
 *
 * For SOFTMAX, allocates two device scratch buffers of `length` floats each
 * (d_softmax_sum and d_max). For every activation type, computes the
 * ceil-divided block count stored in `gridSize`.
 */
void Activation::initCUDA() {
    const bool needsSoftmaxScratch = (activationType == SOFTMAX);

    if (needsSoftmaxScratch) {
        const auto scratchBytes = sizeof(float) * length;

        // Buffer later handed to Utils::sum by activateCUDA.
        d_softmax_sum = nullptr;
        CUDA_CHECK(cudaMalloc(
            reinterpret_cast<void**>(&d_softmax_sum), scratchBytes
        ));

        // Buffer later handed to Utils::max by activateCUDA.
        d_max = nullptr;
        CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_max), scratchBytes));
    }

    // One thread per element, rounded up to whole blocks.
    gridSize = (length + BLOCK_SIZE - 1) / BLOCK_SIZE;
}
|
||||
|
||||
/**
 * Releases the device scratch buffers owned by this activation.
 *
 * Only SOFTMAX allocates buffers in initCUDA, so nothing is freed for any
 * other activation type.
 */
void Activation::delCUDA() {
    if (activationType != SOFTMAX) {
        return;
    }

    CUDA_CHECK(cudaFree(d_softmax_sum));
    CUDA_CHECK(cudaFree(d_max));
}
|
||||
|
||||
/**
 * Applies the configured activation to `d_input` in place.
 *
 * SIGMOID and RELU each launch a single kernel. SOFTMAX runs a sequence:
 * Utils::max, subtract the max, exponentiate, Utils::sum, divide by the sum.
 * Any other activation type launches nothing. The function always finishes
 * with a device-wide synchronize.
 *
 * @param d_input  Buffer of `length` floats, used as both source and
 *                 destination of every kernel. It is passed straight to
 *                 kernels, so it is expected to be a device pointer.
 */
void Activation::activateCUDA(float* d_input) {
    if (activationType == SIGMOID) {
        Kernels::sigmoid<<<gridSize, BLOCK_SIZE>>>(d_input, d_input, length);
        CUDA_CHECK(cudaGetLastError());
    } else if (activationType == RELU) {
        Kernels::relu<<<gridSize, BLOCK_SIZE>>>(d_input, d_input, length);
        CUDA_CHECK(cudaGetLastError());
    } else if (activationType == SOFTMAX) {
        // Step 1: maximum of the input, written to d_max.
        Utils::max(d_input, d_max, length);

        // Step 2: shift by the maximum for numerical stability.
        Kernels::vec_scalar_sub<<<gridSize, BLOCK_SIZE>>>(
            d_input, d_input, d_max, length
        );
        CUDA_CHECK(cudaGetLastError());

        // Step 3: element-wise exponential.
        Kernels::vec_exp<<<gridSize, BLOCK_SIZE>>>(d_input, d_input, length);
        CUDA_CHECK(cudaGetLastError());

        // Step 4: sum of the exponentials, written to d_softmax_sum.
        Utils::sum(d_input, d_softmax_sum, length);

        // Step 5: normalise by the sum.
        Kernels::vec_scalar_div<<<gridSize, BLOCK_SIZE>>>(
            d_input, d_input, d_softmax_sum, length
        );
        CUDA_CHECK(cudaGetLastError());
    }

    // Runs for every activation type, including unhandled ones.
    CUDA_CHECK(cudaDeviceSynchronize());
}
|
||||
@@ -1,69 +0,0 @@
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <functional>
|
||||
#include <iostream>
|
||||
|
||||
#include "vector.cuh"
|
||||
#include "activation.hpp"
|
||||
#include "cuda_helper.cuh"
|
||||
#include "dense.hpp"
|
||||
#include "matmul.cuh"
|
||||
|
||||
using namespace CUDANet::Layers;
|
||||
|
||||
/**
 * Allocates the layer's device buffers, uploads parameters and computes the
 * launch grid sizes.
 *
 * Buffers allocated: d_output (outputSize floats), d_weights
 * (inputSize * outputSize floats) and d_biases (outputSize floats).
 * toCuda() is then called to copy the host-side weights and biases over.
 */
void Dense::initCUDA() {
    const auto outputBytes = sizeof(float) * outputSize;
    const auto weightBytes = sizeof(float) * inputSize * outputSize;

    // Output activations.
    d_output = nullptr;
    CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_output), outputBytes));

    // Parameters: weights first, then biases.
    d_weights = nullptr;
    d_biases  = nullptr;
    CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_weights), weightBytes));
    CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_biases), outputBytes));

    // Upload the host copies into the freshly allocated buffers.
    toCuda();

    // Ceil-divided block counts used by forwardCUDA.
    const auto largestDim = std::max(inputSize, outputSize);
    forwardGridSize       = (largestDim + BLOCK_SIZE - 1) / BLOCK_SIZE;
    biasGridSize          = (outputSize + BLOCK_SIZE - 1) / BLOCK_SIZE;
}
|
||||
|
||||
/**
 * Releases the device buffers allocated by initCUDA.
 *
 * Each cudaFree result is checked with CUDA_CHECK, matching
 * Activation::delCUDA, so a failed free (or a sticky error left by an earlier
 * asynchronous fault) is reported instead of being silently dropped.
 * Pointers are reset to nullptr afterwards so that a repeated call frees
 * nothing: cudaFree(nullptr) performs no operation.
 */
void Dense::delCUDA() {
    CUDA_CHECK(cudaFree(d_output));
    d_output = nullptr;

    CUDA_CHECK(cudaFree(d_weights));
    d_weights = nullptr;

    CUDA_CHECK(cudaFree(d_biases));
    d_biases = nullptr;
}
|
||||
|
||||
/**
 * Copies the host-side parameters to their device buffers.
 *
 * Transfers inputSize * outputSize floats from `weights` into d_weights and
 * outputSize floats from `biases` into d_biases using blocking
 * host-to-device copies.
 */
void Dense::toCuda() {
    const auto weightBytes = sizeof(float) * inputSize * outputSize;
    const auto biasBytes   = sizeof(float) * outputSize;

    CUDA_CHECK(cudaMemcpy(
        d_weights, weights.data(), weightBytes, cudaMemcpyHostToDevice
    ));

    CUDA_CHECK(
        cudaMemcpy(d_biases, biases.data(), biasBytes, cudaMemcpyHostToDevice)
    );
}
|
||||
|
||||
/**
 * Runs the layer's forward pass on the device.
 *
 * Sequence: Kernels::mat_vec_mul over (d_weights, d_input) into d_output,
 * Kernels::vec_vec_add of d_biases into d_output in place, then the layer's
 * activation applied to d_output, followed by a device-wide synchronize.
 *
 * @param d_input  Input vector passed straight to the kernel, so it is
 *                 expected to be a device pointer.
 * @return d_output, the layer's own output buffer.
 */
float* Dense::forwardCUDA(const float* d_input) {
    // Matrix-vector product.
    Kernels::mat_vec_mul<<<forwardGridSize, BLOCK_SIZE>>>(
        d_weights,
        d_input,
        d_output,
        inputSize,
        outputSize
    );
    CUDA_CHECK(cudaGetLastError());

    // Bias addition, accumulated in place.
    Kernels::vec_vec_add<<<biasGridSize, BLOCK_SIZE>>>(
        d_biases,
        d_output,
        d_output,
        outputSize
    );
    CUDA_CHECK(cudaGetLastError());

    // Non-linearity applied directly on the output buffer.
    activation->activate(d_output);

    CUDA_CHECK(cudaDeviceSynchronize());
    return d_output;
}
|
||||
@@ -26,6 +26,10 @@ void CUDA::zero(CUDANet::Tensor &input) {
|
||||
CUDA_CHECK(cudaMemset(input.data<float>(), 0, sizeof(float) * input.numel()));
|
||||
}
|
||||
|
||||
/**
 * Copies `size` bytes from host memory at `data` into the tensor's buffer
 * using a blocking host-to-device cudaMemcpy.
 *
 * @param tensor  Destination; its data<float>() pointer is the copy target.
 * @param data    Source pointer, read as host memory.
 * @param size    Number of bytes to copy (passed to cudaMemcpy unchanged).
 */
void CUDA::copy_to_device(CUDANet::Tensor &tensor, void *data, size_t size) {
    auto device_dst = tensor.data<float>();

    CUDA_CHECK(cudaMemcpy(device_dst, data, size, cudaMemcpyHostToDevice));
}
|
||||
|
||||
void CUDA::sum(const CUDANet::Tensor &input, CUDANet::Tensor &sum) {
|
||||
auto length = input.numel();
|
||||
const int gridSize = ( + BLOCK_SIZE - 1) / BLOCK_SIZE;
|
||||
|
||||
Reference in New Issue
Block a user