Mirror of https://github.com/lordmathis/CUDANet.git (synced 2025-12-22 14:24:22 +00:00)
Compare commits: 7e27c87673...6318d52f12

2 commits:

- 6318d52f12
- 71dc5a924d
```diff
@@ -16,19 +16,9 @@ enum class DType
     // INT32, // Not implemented yet
 };
 
-size_t dtype_size(DType dtype) {
-    switch (dtype)
-    {
-    case DType::FLOAT32:
-        return 4;
-        break;
-
-    default:
-        throw std::runtime_error("Unknown DType");
-        break;
-    }
-}
+size_t dtype_size(DType dtype);
 
 // Forward declaration
 class Backend;
 
 class Tensor
```
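Note: this hunk drops the inline `dtype_size` definition from the header and leaves only a declaration; the definition reappears in the tensor source file in the last hunk below. A minimal single-file sketch of the same split (the switch body is taken from the diff, everything else is illustrative):

```cpp
#include <cstddef>
#include <stdexcept>

enum class DType { FLOAT32 };

// Header side: declaration only, so every translation unit that includes the
// header sees the prototype but emits no definition of its own.
size_t dtype_size(DType dtype);

// Source side: the single out-of-line definition.
size_t dtype_size(DType dtype) {
    switch (dtype) {
    case DType::FLOAT32:
        return 4;
    default:
        throw std::runtime_error("Unknown DType");
    }
}

int main() {
    return static_cast<int>(dtype_size(DType::FLOAT32));  // 4 bytes per FLOAT32 element
}
```

Keeping a non-inline function definition out of a widely included header avoids multiple-definition link errors once more than one translation unit includes it.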
```diff
@@ -142,13 +142,13 @@ CUDANet::Tensor& CUDA::dense_impl(
     auto biasGridSize = (output_size + BLOCK_SIZE - 1) / BLOCK_SIZE;
 
     Kernels::mat_vec_mul<<<forwardGridSize, BLOCK_SIZE>>>(
-        static_cast<T*>(weights.device_ptr()), static_cast<T*>(input.device_ptr()), static_cast<T*>(output.device_ptr()), input_size,
+        static_cast<const T*>(weights.device_ptr()), static_cast<const T*>(input.device_ptr()), static_cast<T*>(output.device_ptr()), input_size,
         output_size
     );
     CUDA_CHECK(cudaGetLastError());
 
     Kernels::vec_vec_add<<<biasGridSize, BLOCK_SIZE>>>(
-        static_cast<T*>(biases.device_ptr()), static_cast<T*>(output.device_ptr()), static_cast<T*>(output.device_ptr()), output_size
+        static_cast<const T*>(biases.device_ptr()), static_cast<T*>(output.device_ptr()), static_cast<T*>(output.device_ptr()), output_size
     );
     CUDA_CHECK(cudaGetLastError());
     CUDA_CHECK(cudaDeviceSynchronize());
```
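Note: this and the remaining backend hunks make the same change throughout: device pointers for read-only operands (inputs, weights, biases) are cast to `const T*`, while output pointers stay `T*`. A minimal sketch of the pattern with assumed signatures (not the actual CUDANet API), where a `const` tensor hands out a pointer-to-const that cannot silently be cast back to mutable:

```cpp
#include <cstddef>

// Assumed, simplified stand-in for a device tensor.
struct DeviceTensor {
    void*       device_ptr()       { return data; }  // mutable access
    const void* device_ptr() const { return data; }  // read-only access
    void* data = nullptr;
};

template <typename T>
void launch_add(const DeviceTensor& a, const DeviceTensor& b, DeviceTensor& out, size_t n) {
    // Read-only operands: const casts are the only ones that compile here.
    const T* pa = static_cast<const T*>(a.device_ptr());
    const T* pb = static_cast<const T*>(b.device_ptr());
    // The output buffer stays mutable.
    T* po = static_cast<T*>(out.device_ptr());
    // vec_vec_add<<<grid, BLOCK_SIZE>>>(pa, pb, po, n);  // launch elided in this sketch
    (void)pa; (void)pb; (void)po; (void)n;
}
```

The conv2d, pooling, batch-norm, concat, add, print, sum, and max hunks below apply the same const-qualification to their respective launches.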
```diff
@@ -213,7 +213,7 @@ CUDANet::Tensor& CUDA::conv2d_impl(
     );
 
     Kernels::convolution<<<grid, block>>>(
-        static_cast<T*>(input.device_ptr()), static_cast<T*>(weights.device_ptr()), static_cast<T*>(biases.device_ptr()), static_cast<T*>(output.device_ptr()),
+        static_cast<const T*>(input.device_ptr()), static_cast<const T*>(weights.device_ptr()), static_cast<const T*>(biases.device_ptr()), static_cast<T*>(output.device_ptr()),
         in_shape, padding_shape, kernel_shape, stride_shape, out_shape
     );
     CUDA_CHECK(cudaGetLastError());
```
```diff
@@ -273,7 +273,7 @@ CUDANet::Tensor& CUDA::max_pool2d_impl(
     );
 
     Kernels::max_pool<<<grid, block>>>(
-        static_cast<T*>(input.device_ptr()), static_cast<T*>(output.device_ptr()), input_shape, output_shape,
+        static_cast<const T*>(input.device_ptr()), static_cast<T*>(output.device_ptr()), input_shape, output_shape,
         pool_shape, stride_shape, padding_shape
     );
     CUDA_CHECK(cudaGetLastError());
```
```diff
@@ -333,7 +333,7 @@ CUDANet::Tensor& CUDA::avg_pool2d_impl(
     );
 
     Kernels::avg_pool<<<grid, block>>>(
-        static_cast<T*>(input.device_ptr()), static_cast<T*>(output.device_ptr()), input_shape, output_shape,
+        static_cast<const T*>(input.device_ptr()), static_cast<T*>(output.device_ptr()), input_shape, output_shape,
         pool_shape, stride_shape, padding_shape
     );
     CUDA_CHECK(cudaGetLastError());
```
```diff
@@ -394,7 +394,7 @@ CUDANet::Tensor& CUDA::batch_norm_impl(
     for (int i = 0; i < input_shape[2]; i++) {
         // Subtract mean from input
         Kernels::vec_scalar_sub<<<gridSize, BLOCK_SIZE>>>(
-            static_cast<T*>(input.device_ptr()) + i * input_shape[0] * input_shape[1],
+            static_cast<const T*>(input.device_ptr()) + i * input_shape[0] * input_shape[1],
             static_cast<T*>(output.device_ptr()) + i * input_shape[0] * input_shape[1],
             &static_cast<T*>(running_mean.device_ptr())[i], input_shape[0] * input_shape[1]
         );
```
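Note: the batch-norm loop walks channels (`input_shape[2]`) and advances both pointers by `i * input_shape[0] * input_shape[1]`, i.e. one H×W plane per channel, while `&running_mean[i]` picks the per-channel mean. A small worked sketch of that offset arithmetic with illustrative shape values (the shapes are assumptions, not taken from the repo):

```cpp
#include <cstdio>

int main() {
    // Assumed layout: input_shape = {H, W, C}, stored as one contiguous H*W plane per channel.
    int input_shape[3] = {4, 4, 3};               // H = 4, W = 4, C = 3
    int plane = input_shape[0] * input_shape[1];  // 16 elements per channel

    for (int i = 0; i < input_shape[2]; i++) {
        // Channel i starts i * plane elements into the buffer; its mean lives at index i.
        printf("channel %d -> element offset %d, mean index %d\n", i, i * plane, i);
    }
    return 0;
}
```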
```diff
@@ -460,12 +460,12 @@ CUDANet::Tensor& CUDA::concat_impl(
     CUDANet::Tensor& output
 ) {
     CUDA_CHECK(cudaMemcpy(
-        static_cast<T*>(output.device_ptr()), static_cast<T*>(input_a.device_ptr()), input_a.size(),
+        static_cast<T*>(output.device_ptr()), static_cast<const T*>(input_a.device_ptr()), input_a.size(),
         cudaMemcpyDeviceToDevice
     ));
 
     CUDA_CHECK(cudaMemcpy(
-        static_cast<T*>(output.device_ptr()) + input_a.numel(), static_cast<T*>(input_b.device_ptr()), input_b.size(),
+        static_cast<T*>(output.device_ptr()) + input_a.numel(), static_cast<const T*>(input_b.device_ptr()), input_b.size(),
         cudaMemcpyDeviceToDevice
     ));
 
```
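Note: the concat hunk copies both inputs device-to-device into one output buffer; the destination offset for the second copy is in elements (`numel()`) while the copy length (`size()`) is in bytes. A minimal sketch of the same two-copy concatenation using raw `cudaMalloc`'d buffers instead of CUDANet tensors (error checking omitted for brevity):

```cpp
#include <cuda_runtime.h>

int main() {
    const size_t n_a = 4, n_b = 2;
    float *d_a = nullptr, *d_b = nullptr, *d_out = nullptr;
    cudaMalloc(&d_a, n_a * sizeof(float));
    cudaMalloc(&d_b, n_b * sizeof(float));
    cudaMalloc(&d_out, (n_a + n_b) * sizeof(float));

    // First input lands at the start of the output buffer...
    cudaMemcpy(d_out, d_a, n_a * sizeof(float), cudaMemcpyDeviceToDevice);
    // ...the second starts n_a *elements* later, but the length argument is still in bytes.
    cudaMemcpy(d_out + n_a, d_b, n_b * sizeof(float), cudaMemcpyDeviceToDevice);

    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_out);
    return 0;
}
```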
```diff
@@ -508,7 +508,7 @@ CUDANet::Tensor& CUDA::add_impl(
     auto gridSize = (input_a.numel() + BLOCK_SIZE - 1) / BLOCK_SIZE;
 
     Kernels::vec_vec_add<<<gridSize, BLOCK_SIZE>>>(
-        static_cast<T*>(input_a.device_ptr()), static_cast<T*>(input_b.device_ptr()), static_cast<T*>(output.device_ptr()), input_a.numel()
+        static_cast<const T*>(input_a.device_ptr()), static_cast<const T*>(input_b.device_ptr()), static_cast<T*>(output.device_ptr()), input_a.numel()
     );
     CUDA_CHECK(cudaGetLastError());
     CUDA_CHECK(cudaDeviceSynchronize());
```
```diff
@@ -26,7 +26,7 @@ void CUDA::print_impl(const CUDANet::Tensor &input) {
     std::vector<T> h_vec(input.numel());
 
     CUDA_CHECK(cudaMemcpy(
-        h_vec.data(), static_cast<T*>(input.device_ptr()), sizeof(T) * length, cudaMemcpyDeviceToHost
+        h_vec.data(), static_cast<const T*>(input.device_ptr()), sizeof(T) * length, cudaMemcpyDeviceToHost
     ));
 
     for (int i = 0; i < length; ++i) {
```
```diff
@@ -98,7 +98,7 @@ void CUDA::sum_impl(const CUDANet::Tensor &input, CUDANet::Tensor &sum) {
     const int gridSize = (length + BLOCK_SIZE - 1) / BLOCK_SIZE;
 
     CUDANet::Kernels::sum_reduce<<<gridSize, BLOCK_SIZE>>>(
-        static_cast<T*>(input.device_ptr()), static_cast<T*>(sum.device_ptr()), length
+        static_cast<const T*>(input.device_ptr()), static_cast<T*>(sum.device_ptr()), length
     );
     CUDA_CHECK(cudaGetLastError());
 
```
```diff
@@ -131,7 +131,7 @@ void CUDA::max_impl(const CUDANet::Tensor &input, CUDANet::Tensor &max) {
     auto length = input.numel();
     const int grid_size = (length + BLOCK_SIZE - 1) / BLOCK_SIZE;
 
-    Kernels::max_reduce<<<grid_size, BLOCK_SIZE>>>(static_cast<T*>(input.device_ptr()), static_cast<T*>(max.device_ptr()), length);
+    Kernels::max_reduce<<<grid_size, BLOCK_SIZE>>>(static_cast<const T*>(input.device_ptr()), static_cast<T*>(max.device_ptr()), length);
     CUDA_CHECK(cudaGetLastError());
 
     int remaining = grid_size;
```
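Note: the max hunk launches `max_reduce` over `grid_size` blocks and then sets `remaining = grid_size`, which points at an iterative reduction: each pass leaves one partial maximum per block, and the host keeps relaunching until a single value is left. A hedged host-side sketch of such a driver loop, with an assumed kernel name and signature (not the actual CUDANet kernel):

```cpp
constexpr int BLOCK_SIZE = 256;

// Assumed kernel: writes one partial maximum per block of `src` into `dst`.
// template <typename T> __global__ void block_max(const T* src, T* dst, int n);

template <typename T>
void reduce_max(const T* d_in, T* d_scratch, int length) {
    int remaining = length;
    const T* src = d_in;
    while (remaining > 1) {
        int grid = (remaining + BLOCK_SIZE - 1) / BLOCK_SIZE;
        // block_max<T><<<grid, BLOCK_SIZE>>>(src, d_scratch, remaining);  // launch elided
        src = d_scratch;   // the next pass reduces the partial results
        remaining = grid;  // one partial maximum per block survived this pass
    }
    // d_scratch[0] now holds the overall maximum.
}
```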
```diff
@@ -4,6 +4,19 @@
 
 using namespace CUDANet;
 
+size_t dtype_size(DType dtype) {
+    switch (dtype)
+    {
+    case DType::FLOAT32:
+        return 4;
+        break;
+
+    default:
+        throw std::runtime_error("Unknown DType");
+        break;
+    }
+}
+
 Tensor::Tensor(Shape shape, CUDANet::Backend* backend)
     : Tensor(shape, backend->get_default_dtype(), backend) {}
 
```