X-Git-Url: http://www.fmaj7b5.info/git?p=cuda.git;a=blobdiff_plain;f=mult_matrices%2Fmult_matrices.cu;fp=mult_matrices%2Fmult_matrices.cu;h=96467dbdc460c27de2a337892d6ba38497a0f132;hp=0000000000000000000000000000000000000000;hb=d2ea0089dcca0e43ec2d836077c75384d23df2d9;hpb=e4a0dfee97228c4e3af62199e692b8fe6018939d diff --git a/mult_matrices/mult_matrices.cu b/mult_matrices/mult_matrices.cu new file mode 100644 index 0000000..96467db --- /dev/null +++ b/mult_matrices/mult_matrices.cu @@ -0,0 +1,112 @@ +/* + Copyright (C) 2012, 2013 fmaj7b5.info + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#include + +#include +#include +#include "cuda\cuda_wrapper.h" + +#include "mult_matrices.h" +#include "DeviceMemory.cuh" + +using namespace FM7b5; +namespace cuda = FM7b5::cuda; + +#define DIV 16 +#define FM7b5_USE_PINNED + +__global__ void +mult_matrix(float* C, const float* A, const float* B) +{ + const size_t d(threadIdx.z); + const size_t offset(16*(blockIdx.x*DIV + d)); + const size_t index(threadIdx.x + blockDim.y*threadIdx.y); + + __shared__ float _A[DIV][4*4], _B[DIV][4*4]; + + _A[d][index] = A[offset + index]; + _B[d][index] = B[offset + index]; + + float c_elem = 0.0; + for (int k = 0; k < 4; ++k) { + c_elem += _A[d][threadIdx.x + 4*k] * _B[d][k + 4*threadIdx.y]; + } + + C[offset + index] = c_elem; +} + +void +FM7b5::mult_matrices_init_gpu() +{ + cudaError_t status; + + cudaDeviceReset(); + status = cudaSetDeviceFlags(cudaDeviceMapHost); + if (status != cudaSuccess) { + std::cerr << cudaGetErrorString(status) << std::endl; + } + + cudaFree(0); +} + +void +FM7b5::mult_matrices_gpu(float* C, const float* A, const float* B, const size_t num) +{ + const size_t num_elements(num * 4 * 4); + +#ifdef FM7b5_USE_PINNED + cudaError_t status; + + if ((status = cudaHostRegister(const_cast(A), sizeof(float) * num_elements, 0)) != cudaSuccess) { + std::cerr << cudaGetErrorString(status) << std::endl; + } + if ((status = cudaHostRegister(const_cast(B), sizeof(float) * num_elements, 0)) != cudaSuccess) { + std::cerr << cudaGetErrorString(status) << std::endl; + } + if ((status = cudaHostRegister(C, sizeof(float) * num_elements, 0)) != cudaSuccess) { + std::cerr << cudaGetErrorString(status) << std::endl; + } +#endif + + memory::Linear d_A(num_elements), d_B(num_elements), d_C(num_elements); + + d_A.copy_from(A, sizeof(float) * num_elements); + d_B.copy_from(B, sizeof(float) * num_elements); + + cuda::Event start, finish; + + start.record(); + + mult_matrix<<<(num + (DIV - 1))/ DIV, dim3(4, 4, DIV)>>>(d_C.data(), d_A.data(), d_B.data()); + + finish.record(); + finish.synchronize(); + + float ms; + cudaEventElapsedTime(&ms, start, finish); + + std::cout << "kernel: " << ms << " [ms] (" << static_cast(4*4*7*num) / ms * 1.0e-6 << " GFLOPS)" << std::endl; + + d_C.copy_to(C, sizeof(float) * num_elements); + +#ifdef FM7b5_USE_PINNED + cudaHostUnregister(C); + cudaHostUnregister(const_cast(B)); + cudaHostUnregister(const_cast(A)); +#endif +}