/* Copyright (C) 2012, 2013 fmaj7b5.info This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . */ #include #include #include #include "cuda\cuda_wrapper.h" #include "mult_matrices.h" #include "DeviceMemory.cuh" using namespace FM7b5; namespace cuda = FM7b5::cuda; #define DIV 16 #define FM7b5_USE_PINNED __global__ void mult_matrix(float* C, const float* A, const float* B) { const size_t d(threadIdx.z); const size_t offset(16*(blockIdx.x*DIV + d)); const size_t index(threadIdx.x + blockDim.y*threadIdx.y); __shared__ float _A[DIV][4*4], _B[DIV][4*4]; _A[d][index] = A[offset + index]; _B[d][index] = B[offset + index]; float c_elem = 0.0; for (int k = 0; k < 4; ++k) { c_elem += _A[d][threadIdx.x + 4*k] * _B[d][k + 4*threadIdx.y]; } C[offset + index] = c_elem; } void FM7b5::mult_matrices_init_gpu() { cudaError_t status; cudaDeviceReset(); status = cudaSetDeviceFlags(cudaDeviceMapHost); if (status != cudaSuccess) { std::cerr << cudaGetErrorString(status) << std::endl; } cudaFree(0); } void FM7b5::mult_matrices_gpu(float* C, const float* A, const float* B, const size_t num) { const size_t num_elements(num * 4 * 4); #ifdef FM7b5_USE_PINNED cudaError_t status; if ((status = cudaHostRegister(const_cast(A), sizeof(float) * num_elements, 0)) != cudaSuccess) { std::cerr << cudaGetErrorString(status) << std::endl; } if ((status = cudaHostRegister(const_cast(B), sizeof(float) * num_elements, 0)) != cudaSuccess) { std::cerr << cudaGetErrorString(status) << std::endl; } if ((status = cudaHostRegister(C, sizeof(float) * num_elements, 0)) != cudaSuccess) { std::cerr << cudaGetErrorString(status) << std::endl; } #endif memory::Linear d_A(num_elements), d_B(num_elements), d_C(num_elements); d_A.copy_from(A, sizeof(float) * num_elements); d_B.copy_from(B, sizeof(float) * num_elements); cuda::Event start, finish; start.record(); mult_matrix<<<(num + (DIV - 1))/ DIV, dim3(4, 4, DIV)>>>(d_C.data(), d_A.data(), d_B.data()); finish.record(); finish.synchronize(); float ms; cudaEventElapsedTime(&ms, start, finish); std::cout << "kernel: " << ms << " [ms] (" << static_cast(4*4*7*num) / ms * 1.0e-6 << " GFLOPS)" << std::endl; d_C.copy_to(C, sizeof(float) * num_elements); #ifdef FM7b5_USE_PINNED cudaHostUnregister(C); cudaHostUnregister(const_cast(B)); cudaHostUnregister(const_cast(A)); #endif }