2 Copyright (C) 2012, 2013 fmaj7b5.info
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 2 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>.
// mult_matrices.cpp : Defines the entry point for the console application.
23 #include "mult_matrices.h"
25 using namespace FM7b5;
/* Benchmark size: number of 4x4 matrix products per run, and number of timed runs. */
const size_t g_num = 1024 * 1024;
const size_t g_num_run = 10;

/* Random engine shared by the routines that fill the input matrices. */
std::default_random_engine g_re;

/* File-local helpers: input generation and output. */
static void random_matrices(float* mat44, const size_t num);
static void disp_matrix(const float* m44);

/* File-local helpers: CPU reference implementation. */
static inline void mult_matrix(float* C, const float* A, const float* B);
static void mult_matrices(float* C, const float* A, const float* B, const size_t num);

/* File-local helper: throughput calculation. */
static double flops(const size_t num, const double elapsed_ms);

/* Uncomment to make mult_matrix read B through a transposed local copy. */
//#define FM7b5_TRANSPOSED
43 int _tmain(int argc, _TCHAR* argv[])
45 /* Column-major 4x4 matrices (i.e. M(i, j) = M[i + 4*j] */
46 std::vector<float> A(g_num * 4*4), B(g_num * 4*4), C(g_num * 4*4), C_gpu(g_num * 4*4);
48 random_matrices(&A[0], g_num);
49 random_matrices(&B[0], g_num);
51 ULONGLONG start, finish;
56 start = GetTickCount64();
57 for (size_t nrun = 0; nrun < g_num_run; ++nrun) {
58 mult_matrices(&C[0], &A[0], &B[0], g_num);
60 finish = GetTickCount64();
61 elapsed_ms = static_cast<double>(finish - start);
62 printf("1 CPU: %.1f [ms] (%.1f GFLOPS)\n\n", elapsed_ms, static_cast<double>(g_num_run) * flops(g_num, elapsed_ms) / 1e9);
66 mult_matrices_init_gpu();
67 start = GetTickCount64();
68 for (size_t nrun = 0; nrun < g_num_run; ++nrun) {
69 mult_matrices_gpu(&C_gpu[0], &A[0], &B[0], g_num);
71 finish = GetTickCount64();
72 elapsed_ms = static_cast<double>(finish - start);
73 printf("GPU: %.1f [ms] (%.1f GFLOPS)\n\n", elapsed_ms, static_cast<double>(g_num_run) * flops(g_num, elapsed_ms) / 1e9);
76 disp_matrix(&A[g_num-1]);
77 disp_matrix(&B[g_num-1]);
80 disp_matrix(&C[g_num-1]);
82 disp_matrix(&C_gpu[g_num-1]);
89 random_matrices(float* mat44, const size_t num)
91 if (mat44 == NULL || num < 1) {
95 std::uniform_real_distribution<float> rand_dist;
97 for (size_t i = 0; i < num; ++i) {
98 float* p(mat44 + 16*i);
100 for (size_t j = 0; j < 16; ++j)
102 p[j] = rand_dist(g_re);
/*
 * Prints one column-major 4x4 matrix, one matrix row per output line,
 * followed by an empty line.
 */
static void
disp_matrix(const float* m44)
{
	for (size_t row = 0; row < 4; ++row) {
		/* Element (row, col) of a column-major matrix is stored at row + 4*col. */
		for (size_t col = 0; col < 4; ++col) {
			const float value = m44[4*col + row];
			printf("% 7.3f ", value);
		}
		printf("\n");
	}
	printf("\n");
}
/*
 * Computes the 4x4 product C = A * B for column-major matrices
 * (element (r, c) is stored at index r + 4*c).
 *
 * C is overwritten. Because of the __restrict qualifiers it must not overlap
 * A or B. With FM7b5_TRANSPOSED defined, B is first copied into a transposed
 * scratch matrix and read from there; the computed product is the same.
 */
static inline void
mult_matrix(float* __restrict C, const float* __restrict A, const float* __restrict B)
{
#ifdef FM7b5_TRANSPOSED
	float Bt[16];
	for (size_t col = 0; col < 4; ++col) {
		for (size_t row = 0; row < 4; ++row) {
			Bt[row + 4*col] = B[col + 4*row];
		}
	}
#endif

	/* Start from a zero matrix; the loops below accumulate into it. */
	for (size_t idx = 0; idx < 16; ++idx) {
		C[idx] = 0.0f;
	}

	/* Rank-1 updates: for each k, add (column k of A) * (row k of B) to C. */
	for (size_t k = 0; k < 4; ++k) {
		const float* a_col = A + 4*k;
		for (size_t col = 0; col < 4; ++col) {
#ifdef FM7b5_TRANSPOSED
			const float b = Bt[col + 4*k];
#else
			const float b = B[k + 4*col];
#endif
			float* c_col = C + 4*col;
			for (size_t row = 0; row < 4; ++row) {
				c_col[row] += a_col[row] * b;
			}
		}
	}
}
149 mult_matrices(float* C, const float* A, const float* B, const size_t num)
151 #pragma omp parallel for
152 for (int i = 0; i < static_cast<int>(num); ++i) {
153 mult_matrix(C + 16*i, A + 16*i, B + 16*i);
/*
 * Returns the floating-point operations per second achieved by `num` 4x4
 * matrix-matrix multiplications that took `elapsed_ms` milliseconds.
 *
 * `elapsed_ms` is expected to be positive; the function divides by it
 * without checking.
 */
static double
flops(const size_t num, const double elapsed_ms)
{
	/* num of multiplications and additions in a single matrix-matrix multiplication */
	const int ops(4 * 4 * (4 + 3));

	/* Multiply in double: the integer product num * ops can wrap around for large num. */
	return static_cast<double>(num) * ops * 1.0e3 / elapsed_ms;
}