mult_matrices/mult_matrices.cu

   1 /*
   2         Copyright (C) 2012, 2013  fmaj7b5.info
   3
   4         This program is free software: you can redistribute it and/or modify
   5         it under the terms of the GNU General Public License as published by
   6         the Free Software Foundation, either version 2 of the License, or
   7         (at your option) any later version.
   8
   9         This program is distributed in the hope that it will be useful,
  10         but WITHOUT ANY WARRANTY; without even the implied warranty of
  11         MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12         GNU General Public License for more details.
  13
  14         You should have received a copy of the GNU General Public License
  15         along with this program.  If not, see <http://www.gnu.org/licenses/>.
  16 */
  17
  18 #include <iostream>
  19
  20 #include <cuda.h>
  21 #include <cuda_runtime.h>
  22 #include "cuda\cuda_wrapper.h"
  23
  24 #include "mult_matrices.h"
  25 #include "DeviceMemory.cuh"
  26
  27 using namespace FM7b5;
  28 namespace cuda = FM7b5::cuda;
  29
  30 #define DIV 16
  31 #define FM7b5_USE_PINNED
  32
  33 __global__ void
  34 mult_matrix(float* C, const float* A, const float* B)
  35 {
  36         const size_t d(threadIdx.z);
  37         const size_t offset(16*(blockIdx.x*DIV + d));
  38         const size_t index(threadIdx.x + blockDim.y*threadIdx.y);
  39
  40         __shared__ float _A[DIV][4*4], _B[DIV][4*4];
  41
  42         _A[d][index] = A[offset + index];
  43         _B[d][index] = B[offset + index];
  44
  45         float c_elem = 0.0;
  46         for (int k = 0; k < 4; ++k) {
  47                 c_elem += _A[d][threadIdx.x + 4*k] * _B[d][k + 4*threadIdx.y];
  48         }
  49
  50         C[offset + index] = c_elem;
  51 }
  52
  53 void
  54 FM7b5::mult_matrices_init_gpu()
  55 {
  56         cudaError_t status;
  57
  58         cudaDeviceReset();
  59         status = cudaSetDeviceFlags(cudaDeviceMapHost);
  60         if (status != cudaSuccess) {
  61                 std::cerr << cudaGetErrorString(status) << std::endl;
  62         }
  63
  64         cudaFree(0);
  65 }
  66
  67 void
  68 FM7b5::mult_matrices_gpu(float* C, const float* A, const float* B, const size_t num)
  69 {
  70         const size_t num_elements(num * 4 * 4);
  71
  72 #ifdef FM7b5_USE_PINNED
  73         cudaError_t status;
  74
  75         if ((status = cudaHostRegister(const_cast<float*>(A), sizeof(float) * num_elements, 0)) != cudaSuccess) {
  76                 std::cerr << cudaGetErrorString(status) << std::endl;
  77         }
  78         if ((status = cudaHostRegister(const_cast<float*>(B), sizeof(float) * num_elements, 0)) != cudaSuccess) {
  79                 std::cerr << cudaGetErrorString(status) << std::endl;
  80         }
  81         if ((status = cudaHostRegister(C, sizeof(float) * num_elements, 0)) != cudaSuccess) {
  82                 std::cerr << cudaGetErrorString(status) << std::endl;
  83         }
  84 #endif
  85
  86         memory::Linear<float> d_A(num_elements), d_B(num_elements), d_C(num_elements);
  87
  88         d_A.copy_from(A, sizeof(float) * num_elements);
  89         d_B.copy_from(B, sizeof(float) * num_elements);
  90
  91         cuda::Event start, finish;
  92
  93         start.record();
  94
  95         mult_matrix<<<(num + (DIV - 1))/ DIV, dim3(4, 4, DIV)>>>(d_C.data(), d_A.data(), d_B.data());
  96
  97         finish.record();
  98         finish.synchronize();
  99
 100         float ms;
 101         cudaEventElapsedTime(&ms, start, finish);
 102
 103         std::cout << "kernel: " << ms << " [ms] (" << static_cast<double>(4*4*7*num) / ms * 1.0e-6 << " GFLOPS)" << std::endl;
 104
 105         d_C.copy_to(C, sizeof(float) * num_elements);
 106
 107 #ifdef FM7b5_USE_PINNED
 108         cudaHostUnregister(C);
 109         cudaHostUnregister(const_cast<float*>(B));
 110         cudaHostUnregister(const_cast<float*>(A));
 111 #endif
 112 }