Mirror of https://github.com/lordmathis/CUDANet.git, synced 2025-12-22 14:24:22 +00:00
Fix some dense layer issues
@@ -1,25 +1,29 @@
 #include "backend/cuda.cuh"
-#include "utils/cuda_helper.cuh"
 #include "kernels/activation_functions.cuh"
 #include "kernels/matmul.cuh"
+#include "utils/cuda_helper.cuh"
 
 using namespace CUDANet::Backend;
 
-void CUDA::relu(Tensor &tensor) {
+void CUDA::relu(Tensor& tensor) {
     int gridSize = (tensor.numel() + BLOCK_SIZE - 1) / BLOCK_SIZE;
-    Kernels::relu<<<gridSize, BLOCK_SIZE>>>(tensor.data<float>(), tensor.data<float>(), tensor.numel());
+    Kernels::relu<<<gridSize, BLOCK_SIZE>>>(
+        tensor.data<float>(), tensor.data<float>(), tensor.numel()
+    );
     CUDA_CHECK(cudaGetLastError());
     CUDA_CHECK(cudaDeviceSynchronize());
 }
 
-void CUDA::sigmoid(Tensor &tensor) {
+void CUDA::sigmoid(Tensor& tensor) {
     int gridSize = (tensor.numel() + BLOCK_SIZE - 1) / BLOCK_SIZE;
-    Kernels::sigmoid<<<gridSize, BLOCK_SIZE>>>(tensor.data<float>(), tensor.data<float>(), tensor.numel());
+    Kernels::sigmoid<<<gridSize, BLOCK_SIZE>>>(
+        tensor.data<float>(), tensor.data<float>(), tensor.numel()
+    );
     CUDA_CHECK(cudaGetLastError());
     CUDA_CHECK(cudaDeviceSynchronize());
 }
 
-void CUDA::softmax(Tensor &tensor, Tensor &temp_max, Tensor &temp_sum) {
+void CUDA::softmax(Tensor& tensor, Tensor& temp_max, Tensor& temp_sum) {
     int gridSize = (tensor.numel() + BLOCK_SIZE - 1) / BLOCK_SIZE;
 
     // Find max value
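A note on the launch configuration used throughout this file: gridSize = (n + BLOCK_SIZE - 1) / BLOCK_SIZE is integer ceiling division, so the grid always covers all n elements and only the last block runs partially idle. Below is a minimal sketch of an elementwise kernel written against this launch shape; the (input, output, length) argument order is an assumption inferred from the calls above, not CUDANet's confirmed signature.

__global__ void relu_sketch(const float* in, float* out, unsigned int n) {
    // One thread per element; the guard handles the overshoot when
    // n is not a multiple of BLOCK_SIZE.
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        out[i] = fmaxf(in[i], 0.0f);
    }
}

// Example: n = 1000, BLOCK_SIZE = 256
// gridSize = (1000 + 255) / 256 = 4 blocks = 1024 threads (24 idle).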
@@ -27,7 +31,8 @@ void CUDA::softmax(Tensor &tensor, Tensor &temp_max, Tensor &temp_sum) {
 
     // Subtract max value to improve numerical stability
     Kernels::vec_scalar_sub<<<gridSize, BLOCK_SIZE>>>(
-        tensor.data<float>(), tensor.data<float>(), temp_max.data<float>(), tensor.numel()
+        tensor.data<float>(), tensor.data<float>(), temp_max.data<float>(),
+        tensor.numel()
     );
     CUDA_CHECK(cudaGetLastError());
 
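The subtract-max step above is the standard numerical-stability trick: softmax(x)_i = exp(x_i - max(x)) / sum_j exp(x_j - max(x)) is mathematically identical to the unshifted form, since the exp(-max) factors cancel, but it keeps exp() from overflowing for large inputs. The remaining steps (exp, sum, divide) follow in the next hunk. For comparison only, a host-side C++ sketch of the same pipeline (not CUDANet API):

#include <algorithm>
#include <cmath>
#include <vector>

std::vector<float> softmax_ref(std::vector<float> x) {
    float m = *std::max_element(x.begin(), x.end());  // find max
    float s = 0.0f;
    for (float& v : x) {
        v = std::exp(v - m);  // subtract max, exponentiate
        s += v;               // accumulate sum
    }
    for (float& v : x) {
        v /= s;               // normalize by the sum
    }
    return x;
}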
@@ -36,30 +41,39 @@ void CUDA::softmax(Tensor &tensor, Tensor &temp_max, Tensor &temp_sum) {
         tensor.data<float>(), tensor.data<float>(), tensor.numel()
     );
     CUDA_CHECK(cudaGetLastError());
 
     // Find sum
     sum(tensor, temp_sum);
 
     Kernels::vec_scalar_div<<<gridSize, BLOCK_SIZE>>>(
-        tensor.data<float>(), tensor.data<float>(), temp_sum.data<float>(), tensor.numel()
+        tensor.data<float>(), tensor.data<float>(), temp_sum.data<float>(),
+        tensor.numel()
     );
     CUDA_CHECK(cudaGetLastError());
     CUDA_CHECK(cudaDeviceSynchronize());
 }
 
-CUDANet::Tensor& CUDA::dense(CUDANet::Tensor &weights, CUDANet::Tensor &biases, CUDANet::Tensor &input, CUDANet::Tensor &output, size_t input_size, size_t output_size) {
-
+CUDANet::Tensor& CUDA::dense(
+    const CUDANet::Tensor& weights,
+    const CUDANet::Tensor& biases,
+    const CUDANet::Tensor& input,
+    CUDANet::Tensor& output,
+    const size_t input_size,
+    const size_t output_size
+) {
     auto forwardGridSize =
         (std::max(input_size, output_size) + BLOCK_SIZE - 1) / BLOCK_SIZE;
     auto biasGridSize = (output_size + BLOCK_SIZE - 1) / BLOCK_SIZE;
 
     Kernels::mat_vec_mul<<<forwardGridSize, BLOCK_SIZE>>>(
-        weights.data<float>(), input.data<float>(), output.data<float>(), input_size, output_size
+        weights.data<float>(), input.data<float>(), output.data<float>(),
+        input_size, output_size
     );
     CUDA_CHECK(cudaGetLastError());
 
     Kernels::vec_vec_add<<<biasGridSize, BLOCK_SIZE>>>(
-        biases.data<float>(), output.data<float>(), output.data<float>(), output_size
+        biases.data<float>(), output.data<float>(), output.data<float>(),
+        output_size
     );
     CUDA_CHECK(cudaGetLastError());
     CUDA_CHECK(cudaDeviceSynchronize());
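The dense layer computes output = weights * input + biases as two launches: a matrix-vector product over forwardGridSize blocks, then an elementwise bias add over biasGridSize blocks. A naive one-thread-per-row reference of the two steps follows, assuming row-major weights of shape (output_size, input_size). This is only a sketch of the semantics; CUDANet's actual mat_vec_mul is launched over max(input_size, output_size) threads, which suggests a cooperative (e.g. shared-memory tiled) implementation rather than this plain loop.

__global__ void mat_vec_mul_ref(const float* W, const float* x, float* y,
                                size_t input_size, size_t output_size) {
    // One thread per output row: y[row] = dot(W[row, :], x).
    size_t row = blockIdx.x * blockDim.x + threadIdx.x;
    if (row >= output_size) return;
    float acc = 0.0f;
    for (size_t col = 0; col < input_size; ++col) {
        acc += W[row * input_size + col] * x[col];
    }
    y[row] = acc;
}

__global__ void vec_vec_add_ref(const float* a, const float* b, float* out,
                                size_t n) {
    // Elementwise add; in dense() this adds the bias in place (b == out),
    // which is safe because each thread reads and writes only index i.
    size_t i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) out[i] = a[i] + b[i];
}

Two things worth noting about the commit itself: taking weights, biases, and input as const references documents that dense() only reads them, and the CUDA_CHECK(cudaDeviceSynchronize()) after each public operation surfaces kernel failures immediately, at the cost of serializing launches; a design that synchronizes once per forward pass would trade that debuggability for throughput.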