X-Git-Url: http://www.fmaj7b5.info/git?p=cuda.git;a=blobdiff_plain;f=mult_matrices%2Fmult_matrices.cpp;fp=mult_matrices%2Fmult_matrices.cpp;h=bddb96ee82301f9596cf65ece5d6441411a9a7b5;hp=0000000000000000000000000000000000000000;hb=474dc4b3cd13aea3ccaae408eac81bbb616c056b;hpb=9c4ea7a2596620dcb156a00d12bc55c07ba03339 diff --git a/mult_matrices/mult_matrices.cpp b/mult_matrices/mult_matrices.cpp new file mode 100644 index 0000000..bddb96e --- /dev/null +++ b/mult_matrices/mult_matrices.cpp @@ -0,0 +1,164 @@ +/* + Copyright (C) 2012, 2013 fmaj7b5.info + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +// mult_matrices.cpp : ƒRƒ“ƒ\[ƒ‹ ƒAƒvƒŠƒP[ƒVƒ‡ƒ“‚̃Gƒ“ƒgƒŠ ƒ|ƒCƒ“ƒg‚ð’è‹`‚µ‚Ü‚·B +// + +#include "stdafx.h" + +#include "mult_matrices.h" + +using namespace FM7b5; + +const size_t g_num(1024 * 1024); +const size_t g_num_run(10); + +std::default_random_engine g_re; + +static void random_matrices(float* mat44, const size_t num); + +static void disp_matrix(const float* m44); +static inline void mult_matrix(float* C, const float* A, const float* B); +static void mult_matrices(float* C, const float* A, const float* B, const size_t num); + +static double flops(const size_t num, const double elapsed_ms); + +#define FM7b5_USE_CPU +//#define FM7b5_TRANSPOSED + +int _tmain(int argc, _TCHAR* argv[]) +{ + /* Column-major 4x4 matrices (i.e. M(i, j) = M[i + 4*j] */ + std::vector A(g_num * 4*4), B(g_num * 4*4), C(g_num * 4*4), C_gpu(g_num * 4*4); + + random_matrices(&A[0], g_num); + random_matrices(&B[0], g_num); + + ULONGLONG start, finish; + double elapsed_ms; + +#ifdef FM7b5_USE_CPU + /* CPU */ + start = GetTickCount64(); + for (size_t nrun = 0; nrun < g_num_run; ++nrun) { + mult_matrices(&C[0], &A[0], &B[0], g_num); + } + finish = GetTickCount64(); + elapsed_ms = static_cast(finish - start); + printf("1 CPU: %.1f [ms] (%.1f GFLOPS)\n\n", elapsed_ms, static_cast(g_num_run) * flops(g_num, elapsed_ms) / 1e9); +#endif + + /* GPU */ + mult_matrices_init_gpu(); + start = GetTickCount64(); + for (size_t nrun = 0; nrun < g_num_run; ++nrun) { + mult_matrices_gpu(&C_gpu[0], &A[0], &B[0], g_num); + } + finish = GetTickCount64(); + elapsed_ms = static_cast(finish - start); + printf("GPU: %.1f [ms] (%.1f GFLOPS)\n\n", elapsed_ms, static_cast(g_num_run) * flops(g_num, elapsed_ms) / 1e9); + +#if 0 + disp_matrix(&A[g_num-1]); + disp_matrix(&B[g_num-1]); + +# ifdef FM7b5_USE_CPU + disp_matrix(&C[g_num-1]); +# endif + disp_matrix(&C_gpu[g_num-1]); +#endif + + return 0; +} + +void +random_matrices(float* mat44, const size_t num) +{ + if (mat44 == NULL || num < 1) { + return; + } + + std::uniform_real_distribution rand_dist; + + for (size_t i = 0; i < num; ++i) { + float* p(mat44 + 16*i); + + for (size_t j = 0; j < 16; ++j) + { + p[j] = rand_dist(g_re); + } + } +} + +void +disp_matrix(const float* m44) +{ + for (size_t r = 0; r < 4; ++r) { + for (size_t c = 0; c < 4; ++c) { + printf("% 7.3f ", m44[r + 4*c]); + } + printf("\n"); + } + printf("\n"); +} + +inline void +mult_matrix(float* __restrict C, const float* __restrict A, const float* __restrict B) +{ + for (size_t i = 0; i < 16; ++i) { + C[i] = 0.0; + } + +#ifdef FM7b5_TRANSPOSED + float Bt[16]; + for (size_t c = 0; c < 4; ++c) { + for (size_t r = 0; r < 4; ++r) { + Bt[r + 4*c] = B[c + 4*r]; + } + } +#endif + + for (size_t k = 0; k < 4; ++k) { + for (size_t c = 0; c < 4; ++c) { + for (size_t r = 0; r < 4; ++r) { +#ifndef FM7b5_TRANSPOSED + C[r + 4*c] += A[r + 4*k] * B[k + 4*c]; +#else + C[r + 4*c] += A[r + 4*k] * Bt[c + 4*k]; +#endif + } + } + } +} + +void +mult_matrices(float* C, const float* A, const float* B, const size_t num) +{ +#pragma omp parallel for + for (int i = 0; i < static_cast(num); ++i) { + mult_matrix(C + 16*i, A + 16*i, B + 16*i); + } +} + +double +flops(const size_t num, const double elapsed_ms) +{ + /* num of multiplications and additions in a single matrix-matrix multiplication */ + const int ops(4 * 4 * (4 + 3)); + + return static_cast(num * ops) * 1.0e3 / elapsed_ms; +}