/*
Copyright (C) 2012, 2013 fmaj7b5.info
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see .
*/
#include
#include
#include
#include "cuda\cuda_wrapper.h"
#include "mult_matrices.h"
#include "DeviceMemory.cuh"
using namespace FM7b5;
namespace cuda = FM7b5::cuda;
#define DIV 16
#define FM7b5_USE_PINNED
__global__ void
mult_matrix(float* C, const float* A, const float* B)
{
const size_t d(threadIdx.z);
const size_t offset(16*(blockIdx.x*DIV + d));
const size_t index(threadIdx.x + blockDim.y*threadIdx.y);
__shared__ float _A[DIV][4*4], _B[DIV][4*4];
_A[d][index] = A[offset + index];
_B[d][index] = B[offset + index];
float c_elem = 0.0;
for (int k = 0; k < 4; ++k) {
c_elem += _A[d][threadIdx.x + 4*k] * _B[d][k + 4*threadIdx.y];
}
C[offset + index] = c_elem;
}
void
FM7b5::mult_matrices_init_gpu()
{
cudaError_t status;
cudaDeviceReset();
status = cudaSetDeviceFlags(cudaDeviceMapHost);
if (status != cudaSuccess) {
std::cerr << cudaGetErrorString(status) << std::endl;
}
cudaFree(0);
}
void
FM7b5::mult_matrices_gpu(float* C, const float* A, const float* B, const size_t num)
{
const size_t num_elements(num * 4 * 4);
#ifdef FM7b5_USE_PINNED
cudaError_t status;
if ((status = cudaHostRegister(const_cast(A), sizeof(float) * num_elements, 0)) != cudaSuccess) {
std::cerr << cudaGetErrorString(status) << std::endl;
}
if ((status = cudaHostRegister(const_cast(B), sizeof(float) * num_elements, 0)) != cudaSuccess) {
std::cerr << cudaGetErrorString(status) << std::endl;
}
if ((status = cudaHostRegister(C, sizeof(float) * num_elements, 0)) != cudaSuccess) {
std::cerr << cudaGetErrorString(status) << std::endl;
}
#endif
memory::Linear d_A(num_elements), d_B(num_elements), d_C(num_elements);
d_A.copy_from(A, sizeof(float) * num_elements);
d_B.copy_from(B, sizeof(float) * num_elements);
cuda::Event start, finish;
start.record();
mult_matrix<<<(num + (DIV - 1))/ DIV, dim3(4, 4, DIV)>>>(d_C.data(), d_A.data(), d_B.data());
finish.record();
finish.synchronize();
float ms;
cudaEventElapsedTime(&ms, start, finish);
std::cout << "kernel: " << ms << " [ms] (" << static_cast(4*4*7*num) / ms * 1.0e-6 << " GFLOPS)" << std::endl;
d_C.copy_to(C, sizeof(float) * num_elements);
#ifdef FM7b5_USE_PINNED
cudaHostUnregister(C);
cudaHostUnregister(const_cast(B));
cudaHostUnregister(const_cast(A));
#endif
}