/* Copyright (C) 2012, 2013 fmaj7b5.info This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . */ // mult_matrices.cpp : コンソールアプリケーションのエントリポイントを定義します。 // #include "stdafx.h" #include "mult_matrices.h" using namespace FM7b5; const size_t g_num(1024 * 1024); const size_t g_num_run(10); std::default_random_engine g_re; static void random_matrices(float* mat44, const size_t num); static void disp_matrix(const float* m44); static inline void mult_matrix(float* C, const float* A, const float* B); static void mult_matrices(float* C, const float* A, const float* B, const size_t num); static double flops(const size_t num, const double elapsed_ms); #define FM7b5_USE_CPU //#define FM7b5_TRANSPOSED int _tmain(int argc, _TCHAR* argv[]) { /* Column-major 4x4 matrices (i.e. M(i, j) = M[i + 4*j] */ std::vector A(g_num * 4*4), B(g_num * 4*4), C(g_num * 4*4), C_gpu(g_num * 4*4); random_matrices(&A[0], g_num); random_matrices(&B[0], g_num); ULONGLONG start, finish; double elapsed_ms; #ifdef FM7b5_USE_CPU /* CPU */ start = GetTickCount64(); for (size_t nrun = 0; nrun < g_num_run; ++nrun) { mult_matrices(&C[0], &A[0], &B[0], g_num); } finish = GetTickCount64(); elapsed_ms = static_cast(finish - start); printf("1 CPU: %.1f [ms] (%.1f GFLOPS)\n\n", elapsed_ms, static_cast(g_num_run) * flops(g_num, elapsed_ms) / 1e9); #endif /* GPU */ mult_matrices_init_gpu(); start = GetTickCount64(); for (size_t nrun = 0; nrun < g_num_run; ++nrun) { mult_matrices_gpu(&C_gpu[0], &A[0], &B[0], g_num); } finish = GetTickCount64(); elapsed_ms = static_cast(finish - start); printf("GPU: %.1f [ms] (%.1f GFLOPS)\n\n", elapsed_ms, static_cast(g_num_run) * flops(g_num, elapsed_ms) / 1e9); #if 0 disp_matrix(&A[g_num-1]); disp_matrix(&B[g_num-1]); # ifdef FM7b5_USE_CPU disp_matrix(&C[g_num-1]); # endif disp_matrix(&C_gpu[g_num-1]); #endif return 0; } void random_matrices(float* mat44, const size_t num) { if (mat44 == NULL || num < 1) { return; } std::uniform_real_distribution rand_dist; for (size_t i = 0; i < num; ++i) { float* p(mat44 + 16*i); for (size_t j = 0; j < 16; ++j) { p[j] = rand_dist(g_re); } } } void disp_matrix(const float* m44) { for (size_t r = 0; r < 4; ++r) { for (size_t c = 0; c < 4; ++c) { printf("% 7.3f ", m44[r + 4*c]); } printf("\n"); } printf("\n"); } inline void mult_matrix(float* __restrict C, const float* __restrict A, const float* __restrict B) { for (size_t i = 0; i < 16; ++i) { C[i] = 0.0; } #ifdef FM7b5_TRANSPOSED float Bt[16]; for (size_t c = 0; c < 4; ++c) { for (size_t r = 0; r < 4; ++r) { Bt[r + 4*c] = B[c + 4*r]; } } #endif for (size_t k = 0; k < 4; ++k) { for (size_t c = 0; c < 4; ++c) { for (size_t r = 0; r < 4; ++r) { #ifndef FM7b5_TRANSPOSED C[r + 4*c] += A[r + 4*k] * B[k + 4*c]; #else C[r + 4*c] += A[r + 4*k] * Bt[c + 4*k]; #endif } } } } void mult_matrices(float* C, const float* A, const float* B, const size_t num) { #pragma omp parallel for for (int i = 0; i < static_cast(num); ++i) { mult_matrix(C + 16*i, A + 16*i, B + 16*i); } } double flops(const size_t num, const double elapsed_ms) { /* num of multiplications and additions in a single matrix-matrix multiplication */ const int ops(4 * 4 * (4 + 3)); return static_cast(num * ops) * 1.0e3 / elapsed_ms; }