diff --git a/examples/alexnet/alexnet.py b/examples/alexnet/alexnet.py
index d1fcd56..df64ebb 100644
--- a/examples/alexnet/alexnet.py
+++ b/examples/alexnet/alexnet.py
@@ -1,6 +1,9 @@
 import torchvision
+import torch
 import sys
+from torchsummary import summary
+
 sys.path.append('../../tools') # Ugly hack
 from utils import export_model_weights, print_model_parameters
@@ -9,5 +12,9 @@ if __name__ == "__main__":
     print_model_parameters(alexnet) # print layer names and number of parameters
     export_model_weights(alexnet, 'alexnet_weights.bin')
     print()
-    print(alexnet)
+
+    if torch.cuda.is_available():
+        alexnet.cuda()
+
+    summary(alexnet, (3, 227, 227))
diff --git a/src/layers/activation.cu b/src/layers/activation.cu
index 3341483..a78248a 100644
--- a/src/layers/activation.cu
+++ b/src/layers/activation.cu
@@ -17,6 +17,8 @@ Activation::Activation(ActivationType activation, const int length)
         d_softmax_sum = nullptr;
         CUDA_CHECK(cudaMalloc((void**)&d_softmax_sum, sizeof(float) * length));
+
+        std::cout << "Activation: Softmax " << length << std::endl;
     }
 
     gridSize = (length + BLOCK_SIZE - 1) / BLOCK_SIZE;
diff --git a/src/utils/vector.cu b/src/utils/vector.cu
index d84a4f2..70fbaf9 100644
--- a/src/utils/vector.cu
+++ b/src/utils/vector.cu
@@ -28,7 +28,16 @@ void Utils::max(float* d_vec, float* d_max, const unsigned int length) {
     const int grid_size = (length + BLOCK_SIZE - 1) / BLOCK_SIZE;
 
+    std::cout << "grid_size: " << grid_size << ", length: " << length << std::endl;
+    CUDA_CHECK(cudaGetLastError());
+
     Kernels::max_reduce<<<grid_size, BLOCK_SIZE>>>(d_vec, d_max, length);
+
+    std::cout << "input: " << std::endl;
+    print_vec(d_vec, length);
+    std::cout << "max: " << std::endl;
+    print_vec(d_max, length);
+
     CUDA_CHECK(cudaGetLastError());
 
     int remaining = grid_size;
@@ -46,7 +55,6 @@ void Utils::sum(float* d_vec, float* d_sum, const unsigned int length) {
     const int gridSize = (length + BLOCK_SIZE - 1) / BLOCK_SIZE;
-
     CUDANet::Kernels::sum_reduce<<<gridSize, BLOCK_SIZE>>>(
         d_vec, d_sum, length
     );
@@ -57,7 +65,7 @@ void Utils::sum(float* d_vec, float* d_sum, const unsigned int length) {
         int blocks_needed = (remaining + BLOCK_SIZE - 1) / BLOCK_SIZE;
         CUDANet::Kernels::sum_reduce<<<blocks_needed, BLOCK_SIZE>>>(d_sum, d_sum, remaining);
         CUDA_CHECK(cudaGetLastError());
-
+
         remaining = blocks_needed;
     }
 }
\ No newline at end of file
diff --git a/test/layers/test_activation.cu b/test/layers/test_activation.cu
index e629df2..7226618 100644
--- a/test/layers/test_activation.cu
+++ b/test/layers/test_activation.cu
@@ -4,26 +4,33 @@
 #include <vector>
 
 TEST(ActivationTest, SoftmaxTest1) {
+    const int inputSize = 5;
+    cudaError_t cudaStatus;
+
     CUDANet::Layers::Activation activation(
-        CUDANet::Layers::ActivationType::SOFTMAX, 5
+        CUDANet::Layers::ActivationType::SOFTMAX, inputSize
     );
 
     std::vector<float> input = {0.573f, 0.619f, 0.732f, 0.055f, 0.243f};
 
     float* d_input;
-    cudaMalloc((void**)&d_input, sizeof(float) * 5);
-    cudaMemcpy(d_input, input.data(), sizeof(float) * 5, cudaMemcpyHostToDevice);
+    cudaStatus = cudaMalloc((void**)&d_input, sizeof(float) * inputSize);
+    EXPECT_EQ(cudaStatus, cudaSuccess);
+
+    cudaStatus = cudaMemcpy(d_input, input.data(), sizeof(float) * inputSize, cudaMemcpyHostToDevice);
+    EXPECT_EQ(cudaStatus, cudaSuccess);
 
     activation.activate(d_input);
 
     std::vector<float> output(5);
-    cudaMemcpy(
-        output.data(), d_input, sizeof(float) * 5, cudaMemcpyDeviceToHost
+    cudaStatus = cudaMemcpy(
+        output.data(), d_input, sizeof(float) * inputSize, cudaMemcpyDeviceToHost
     );
+    EXPECT_EQ(cudaStatus, cudaSuccess);
 
     float sum = 0.0f;
     std::vector<float> expected = {0.22055f, 0.23094f, 0.25856f, 0.13139f, 0.15856f};
-    for (int i = 0; i < 5; ++i) {
+    for (int i = 0; i < inputSize; ++i) {
         sum += output[i];
         EXPECT_NEAR(output[i], expected[i], 1e-5f);
     }
@@ -35,32 +42,42 @@ TEST(ActivationTest, SoftmaxTest1) {
 }
 
 TEST(ActivationTest, SoftmaxTest2) {
+    const int inputSize = 6;
+    cudaError_t cudaStatus;
+
     CUDANet::Layers::Activation activation(
-        CUDANet::Layers::ActivationType::SOFTMAX, 6
+        CUDANet::Layers::ActivationType::SOFTMAX, inputSize
    );
 
+    cudaStatus = cudaGetLastError();
+    EXPECT_EQ(cudaStatus, cudaSuccess);
+
     std::vector<float> input = {22.496f, 36.9006f, 30.9904f, 28.4213f, 26.4541f, 31.7887f};
 
     float* d_input;
-    cudaMalloc((void**)&d_input, sizeof(float) * 6);
-    cudaMemcpy(d_input, input.data(), sizeof(float) * 6, cudaMemcpyHostToDevice);
+    cudaStatus = cudaMalloc((void**)&d_input, sizeof(float) * inputSize);
+    EXPECT_EQ(cudaStatus, cudaSuccess);
+
+    cudaStatus = cudaMemcpy(d_input, input.data(), sizeof(float) * inputSize, cudaMemcpyHostToDevice);
+    EXPECT_EQ(cudaStatus, cudaSuccess);
 
     activation.activate(d_input);
 
-    std::vector<float> output(6);
-    cudaMemcpy(
-        output.data(), d_input, sizeof(float) * 6, cudaMemcpyDeviceToHost
+    std::vector<float> output(inputSize);
+    cudaStatus = cudaMemcpy(
+        output.data(), d_input, sizeof(float) * inputSize, cudaMemcpyDeviceToHost
     );
+    EXPECT_EQ(cudaStatus, cudaSuccess);
 
     float sum = 0.0f;
     std::vector<float> expected = {0.0f, 0.99111f, 0.00269f, 0.00021f, 3e-05f, 0.00597f};
-    for (int i = 0; i < 5; ++i) {
+    for (int i = 0; i < inputSize; ++i) {
         sum += output[i];
         EXPECT_NEAR(output[i], expected[i], 1e-5f);
     }
+    EXPECT_NEAR(sum, 1.0f, 1e-5f);
 
-    EXPECT_NEAR(sum, 1.0f, 1e-2f);
-
+    // Cleanup
     cudaFree(d_input);
     cudaDeviceReset();
 }
\ No newline at end of file
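Note for reviewers: the vector.cu hunks sit inside CUDANet's multi-pass reduction drivers. Each sum_reduce / max_reduce launch folds up to BLOCK_SIZE elements per block into one partial result, and the host relaunches on the partials until a single value remains, which is why the loop's `remaining = blocks_needed` converges to 1. Below is a host-only sketch of that launch-size arithmetic; the BLOCK_SIZE value and input length are illustrative assumptions, and the actual device kernel launches are elided.

    #include <iostream>

    // Host-only sketch of the pass structure in Utils::sum / Utils::max:
    // each pass shrinks `remaining` to ceil(remaining / BLOCK_SIZE) partial
    // results until one value is left. Kernel launches are elided.
    int main() {
        const int BLOCK_SIZE = 256;    // assumed for illustration
        unsigned int length = 100000;  // assumed input size

        int remaining = (length + BLOCK_SIZE - 1) / BLOCK_SIZE;  // first-pass grid size
        std::cout << "pass 1: " << length << " -> " << remaining << " partials\n";

        int pass = 2;
        while (remaining > 1) {
            int blocks_needed = (remaining + BLOCK_SIZE - 1) / BLOCK_SIZE;
            std::cout << "pass " << pass++ << ": " << remaining << " -> "
                      << blocks_needed << " partials\n";
            remaining = blocks_needed;
        }
        // For these sizes: 100000 -> 391 -> 2 -> 1, i.e. three launches in total.
    }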