mult_matrices/mult_matrices.cpp

   1 /*
   2         Copyright (C) 2012, 2013  fmaj7b5.info
   3
   4         This program is free software: you can redistribute it and/or modify
   5         it under the terms of the GNU General Public License as published by
   6         the Free Software Foundation, either version 2 of the License, or
   7         (at your option) any later version.
   8
   9         This program is distributed in the hope that it will be useful,
  10         but WITHOUT ANY WARRANTY; without even the implied warranty of
  11         MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12         GNU General Public License for more details.
  13
  14         You should have received a copy of the GNU General Public License
  15         along with this program.  If not, see <http://www.gnu.org/licenses/>.
  16 */
  17
  18 // mult_matrices.cpp : \83R\83\93\83\\81[\83\8b \83A\83v\83\8a\83P\81[\83V\83\87\83\93\82Ì\83G\83\93\83g\83\8a \83|\83C\83\93\83g\82ð\92è\8b`\82µ\82Ü\82·\81B
  19 //
  20
  21 #include "stdafx.h"
  22
  23 #include "mult_matrices.h"
  24
  25 using namespace FM7b5;
  26
  27 const size_t g_num(1024 * 1024);
  28 const size_t g_num_run(10);
  29
  30 std::default_random_engine g_re;
  31
  32 static void random_matrices(float* mat44, const size_t num);
  33
  34 static void disp_matrix(const float* m44);
  35 static inline void mult_matrix(float* C, const float* A, const float* B);
  36 static void mult_matrices(float* C, const float* A, const float* B, const size_t num);
  37
  38 static double flops(const size_t num, const double elapsed_ms);
  39
  40 #define FM7b5_USE_CPU
  41 //#define FM7b5_TRANSPOSED
  42
  43 int _tmain(int argc, _TCHAR* argv[])
  44 {
  45         /* Column-major 4x4 matrices (i.e. M(i, j) = M[i + 4*j] */
  46         std::vector<float> A(g_num * 4*4), B(g_num * 4*4), C(g_num * 4*4), C_gpu(g_num * 4*4);
  47
  48         random_matrices(&A[0], g_num);
  49         random_matrices(&B[0], g_num);
  50
  51         ULONGLONG start, finish;
  52         double elapsed_ms;
  53
  54 #ifdef FM7b5_USE_CPU
  55         /* CPU */
  56         start = GetTickCount64();
  57         for (size_t nrun = 0; nrun < g_num_run; ++nrun) {
  58                 mult_matrices(&C[0], &A[0], &B[0], g_num);
  59         }
  60         finish = GetTickCount64();
  61         elapsed_ms = static_cast<double>(finish - start);
  62         printf("1 CPU: %.1f [ms] (%.1f GFLOPS)\n\n", elapsed_ms, static_cast<double>(g_num_run) * flops(g_num, elapsed_ms) / 1e9);
  63 #endif
  64
  65         /* GPU */
  66         mult_matrices_init_gpu();
  67         start = GetTickCount64();
  68         for (size_t nrun = 0; nrun < g_num_run; ++nrun) {
  69                 mult_matrices_gpu(&C_gpu[0], &A[0], &B[0], g_num);
  70         }
  71         finish = GetTickCount64();
  72         elapsed_ms = static_cast<double>(finish - start);
  73         printf("GPU: %.1f [ms] (%.1f GFLOPS)\n\n", elapsed_ms, static_cast<double>(g_num_run) * flops(g_num, elapsed_ms) / 1e9);
  74
  75 #if 0
  76         disp_matrix(&A[g_num-1]);
  77         disp_matrix(&B[g_num-1]);
  78
  79 # ifdef FM7b5_USE_CPU
  80         disp_matrix(&C[g_num-1]);
  81 # endif
  82         disp_matrix(&C_gpu[g_num-1]);
  83 #endif
  84
  85         return 0;
  86 }
  87
  88 void
  89 random_matrices(float* mat44, const size_t num)
  90 {
  91         if (mat44 == NULL || num < 1) {
  92                 return;
  93         }
  94
  95         std::uniform_real_distribution<float> rand_dist;
  96
  97         for (size_t i = 0; i < num; ++i) {
  98                 float* p(mat44 + 16*i);
  99
 100                 for (size_t j = 0; j < 16; ++j)
 101                 {
 102                         p[j] = rand_dist(g_re);
 103                 }
 104         }
 105 }
 106
 107 void
 108 disp_matrix(const float* m44)
 109 {
 110         for (size_t r = 0; r < 4; ++r) {
 111                 for (size_t c = 0; c < 4; ++c) {
 112                         printf("% 7.3f ", m44[r + 4*c]);
 113                 }
 114                 printf("\n");
 115         }
 116         printf("\n");
 117 }
 118
 119 inline void
 120 mult_matrix(float* __restrict C, const float* __restrict A, const float* __restrict B)
 121 {
 122         for (size_t i = 0; i < 16; ++i) {
 123                 C[i] = 0.0;
 124         }
 125
 126 #ifdef FM7b5_TRANSPOSED
 127         float Bt[16];
 128         for (size_t c = 0; c < 4; ++c) {
 129                 for (size_t r = 0; r < 4; ++r) {
 130                         Bt[r + 4*c] = B[c + 4*r];
 131                 }
 132         }
 133 #endif
 134
 135         for (size_t k = 0; k < 4; ++k) {
 136                 for (size_t c = 0; c < 4; ++c) {
 137                         for (size_t r = 0; r < 4; ++r) {
 138 #ifndef FM7b5_TRANSPOSED
 139                                 C[r + 4*c] += A[r + 4*k] * B[k + 4*c];
 140 #else
 141                                 C[r + 4*c] += A[r + 4*k] * Bt[c + 4*k];
 142 #endif
 143                         }
 144                 }
 145         }
 146 }
 147
 148 void
 149 mult_matrices(float* C, const float* A, const float* B, const size_t num)
 150 {
 151 #pragma omp parallel for
 152         for (int i = 0; i < static_cast<int>(num); ++i) {
 153                 mult_matrix(C + 16*i, A + 16*i, B + 16*i);
 154         }
 155 }
 156
 157 double
 158 flops(const size_t num, const double elapsed_ms)
 159 {
 160         /* num of multiplications and additions in a single matrix-matrix multiplication */
 161         const int ops(4 * 4 * (4 + 3));
 162
 163         return static_cast<double>(num * ops) * 1.0e3 / elapsed_ms;
 164 }