/*
Copyright (C) 2012, 2013 fmaj7b5.info
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see .
*/
// mult_matrices.cpp : コンソール アプリケーションのエントリ ポイントを定義します。
//
#include "stdafx.h"
#include "mult_matrices.h"
using namespace FM7b5;
const size_t g_num(1024 * 1024);
const size_t g_num_run(10);
std::default_random_engine g_re;
static void random_matrices(float* mat44, const size_t num);
static void disp_matrix(const float* m44);
static inline void mult_matrix(float* C, const float* A, const float* B);
static void mult_matrices(float* C, const float* A, const float* B, const size_t num);
static double flops(const size_t num, const double elapsed_ms);
#define FM7b5_USE_CPU
//#define FM7b5_TRANSPOSED
int _tmain(int argc, _TCHAR* argv[])
{
/* Column-major 4x4 matrices (i.e. M(i, j) = M[i + 4*j] */
std::vector A(g_num * 4*4), B(g_num * 4*4), C(g_num * 4*4), C_gpu(g_num * 4*4);
random_matrices(&A[0], g_num);
random_matrices(&B[0], g_num);
ULONGLONG start, finish;
double elapsed_ms;
#ifdef FM7b5_USE_CPU
/* CPU */
start = GetTickCount64();
for (size_t nrun = 0; nrun < g_num_run; ++nrun) {
mult_matrices(&C[0], &A[0], &B[0], g_num);
}
finish = GetTickCount64();
elapsed_ms = static_cast(finish - start);
printf("1 CPU: %.1f [ms] (%.1f GFLOPS)\n\n", elapsed_ms, static_cast(g_num_run) * flops(g_num, elapsed_ms) / 1e9);
#endif
/* GPU */
mult_matrices_init_gpu();
start = GetTickCount64();
for (size_t nrun = 0; nrun < g_num_run; ++nrun) {
mult_matrices_gpu(&C_gpu[0], &A[0], &B[0], g_num);
}
finish = GetTickCount64();
elapsed_ms = static_cast(finish - start);
printf("GPU: %.1f [ms] (%.1f GFLOPS)\n\n", elapsed_ms, static_cast(g_num_run) * flops(g_num, elapsed_ms) / 1e9);
#if 0
disp_matrix(&A[g_num-1]);
disp_matrix(&B[g_num-1]);
# ifdef FM7b5_USE_CPU
disp_matrix(&C[g_num-1]);
# endif
disp_matrix(&C_gpu[g_num-1]);
#endif
return 0;
}
void
random_matrices(float* mat44, const size_t num)
{
if (mat44 == NULL || num < 1) {
return;
}
std::uniform_real_distribution rand_dist;
for (size_t i = 0; i < num; ++i) {
float* p(mat44 + 16*i);
for (size_t j = 0; j < 16; ++j)
{
p[j] = rand_dist(g_re);
}
}
}
void
disp_matrix(const float* m44)
{
for (size_t r = 0; r < 4; ++r) {
for (size_t c = 0; c < 4; ++c) {
printf("% 7.3f ", m44[r + 4*c]);
}
printf("\n");
}
printf("\n");
}
inline void
mult_matrix(float* __restrict C, const float* __restrict A, const float* __restrict B)
{
for (size_t i = 0; i < 16; ++i) {
C[i] = 0.0;
}
#ifdef FM7b5_TRANSPOSED
float Bt[16];
for (size_t c = 0; c < 4; ++c) {
for (size_t r = 0; r < 4; ++r) {
Bt[r + 4*c] = B[c + 4*r];
}
}
#endif
for (size_t k = 0; k < 4; ++k) {
for (size_t c = 0; c < 4; ++c) {
for (size_t r = 0; r < 4; ++r) {
#ifndef FM7b5_TRANSPOSED
C[r + 4*c] += A[r + 4*k] * B[k + 4*c];
#else
C[r + 4*c] += A[r + 4*k] * Bt[c + 4*k];
#endif
}
}
}
}
void
mult_matrices(float* C, const float* A, const float* B, const size_t num)
{
#pragma omp parallel for
for (int i = 0; i < static_cast(num); ++i) {
mult_matrix(C + 16*i, A + 16*i, B + 16*i);
}
}
double
flops(const size_t num, const double elapsed_ms)
{
/* num of multiplications and additions in a single matrix-matrix multiplication */
const int ops(4 * 4 * (4 + 3));
return static_cast(num * ops) * 1.0e3 / elapsed_ms;
}