X-Git-Url: http://www.fmaj7b5.info/git?p=cuda.git;a=blobdiff_plain;f=mult_matrices%2Fmult_matrices.cpp;fp=mult_matrices%2Fmult_matrices.cpp;h=bddb96ee82301f9596cf65ece5d6441411a9a7b5;hp=0000000000000000000000000000000000000000;hb=474dc4b3cd13aea3ccaae408eac81bbb616c056b;hpb=9c4ea7a2596620dcb156a00d12bc55c07ba03339

diff --git a/mult_matrices/mult_matrices.cpp b/mult_matrices/mult_matrices.cpp
new file mode 100644
index 0000000..bddb96e
--- /dev/null
+++ b/mult_matrices/mult_matrices.cpp
@@ -0,0 +1,164 @@
+/*
+	Copyright (C) 2012, 2013  fmaj7b5.info
+
+	This program is free software: you can redistribute it and/or modify
+	it under the terms of the GNU General Public License as published by
+	the Free Software Foundation, either version 2 of the License, or
+	(at your option) any later version.
+
+	This program is distributed in the hope that it will be useful,
+	but WITHOUT ANY WARRANTY; without even the implied warranty of
+	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+	GNU General Public License for more details.
+
+	You should have received a copy of the GNU General Public License
+	along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+// mult_matrices.cpp : R\[ AvP[VÌGg |Cgðè`µÜ·B
+//
+
+#include "stdafx.h"
+
+#include "mult_matrices.h"
+
+using namespace FM7b5;
+
+const size_t g_num(1024 * 1024);
+const size_t g_num_run(10);
+
+std::default_random_engine g_re;
+
+static void random_matrices(float* mat44, const size_t num);
+
+static void disp_matrix(const float* m44);
+static inline void mult_matrix(float* C, const float* A, const float* B);
+static void mult_matrices(float* C, const float* A, const float* B, const size_t num);
+
+static double flops(const size_t num, const double elapsed_ms);
+
+#define FM7b5_USE_CPU
+//#define FM7b5_TRANSPOSED
+
+int _tmain(int argc, _TCHAR* argv[])
+{
+	/* Column-major 4x4 matrices (i.e. M(i, j) = M[i + 4*j] */
+	std::vector<float> A(g_num * 4*4), B(g_num * 4*4), C(g_num * 4*4), C_gpu(g_num * 4*4);
+
+	random_matrices(&A[0], g_num);
+	random_matrices(&B[0], g_num);
+	
+	ULONGLONG start, finish;
+	double elapsed_ms;
+
+#ifdef FM7b5_USE_CPU
+	/* CPU */
+	start = GetTickCount64();
+	for (size_t nrun = 0; nrun < g_num_run; ++nrun) {
+		mult_matrices(&C[0], &A[0], &B[0], g_num);
+	}
+	finish = GetTickCount64();
+	elapsed_ms = static_cast<double>(finish - start);
+	printf("1 CPU: %.1f [ms] (%.1f GFLOPS)\n\n", elapsed_ms, static_cast<double>(g_num_run) * flops(g_num, elapsed_ms) / 1e9);
+#endif
+
+	/* GPU */
+	mult_matrices_init_gpu();
+	start = GetTickCount64();
+	for (size_t nrun = 0; nrun < g_num_run; ++nrun) {
+		mult_matrices_gpu(&C_gpu[0], &A[0], &B[0], g_num);
+	}
+	finish = GetTickCount64();
+	elapsed_ms = static_cast<double>(finish - start);
+	printf("GPU: %.1f [ms] (%.1f GFLOPS)\n\n", elapsed_ms, static_cast<double>(g_num_run) * flops(g_num, elapsed_ms) / 1e9);
+
+#if 0
+	disp_matrix(&A[g_num-1]);
+	disp_matrix(&B[g_num-1]);
+
+# ifdef FM7b5_USE_CPU
+	disp_matrix(&C[g_num-1]);
+# endif
+	disp_matrix(&C_gpu[g_num-1]);
+#endif
+
+	return 0;
+}
+
+void
+random_matrices(float* mat44, const size_t num)
+{
+	if (mat44 == NULL || num < 1) {
+		return;
+	}
+
+	std::uniform_real_distribution<float> rand_dist;
+
+	for (size_t i = 0; i < num; ++i) {
+		float* p(mat44 + 16*i);
+
+		for (size_t j = 0; j < 16; ++j)
+		{
+			p[j] = rand_dist(g_re);
+		}
+	}
+}
+
+void
+disp_matrix(const float* m44)
+{
+	for (size_t r = 0; r < 4; ++r) {
+		for (size_t c = 0; c < 4; ++c) {
+			printf("% 7.3f ", m44[r + 4*c]);
+		}
+		printf("\n");
+	}
+	printf("\n");
+}
+
+inline void
+mult_matrix(float* __restrict C, const float* __restrict A, const float* __restrict B)
+{
+	for (size_t i = 0; i < 16; ++i) {
+		C[i] = 0.0;
+	}
+
+#ifdef FM7b5_TRANSPOSED
+	float Bt[16];
+	for (size_t c = 0; c < 4; ++c) {
+		for (size_t r = 0; r < 4; ++r) {
+			Bt[r + 4*c] = B[c + 4*r];
+		}
+	}
+#endif
+
+	for (size_t k = 0; k < 4; ++k) {
+		for (size_t c = 0; c < 4; ++c) {
+			for (size_t r = 0; r < 4; ++r) {
+#ifndef FM7b5_TRANSPOSED
+				C[r + 4*c] += A[r + 4*k] * B[k + 4*c];
+#else
+				C[r + 4*c] += A[r + 4*k] * Bt[c + 4*k];
+#endif
+			}
+		}
+	}
+}
+
+void
+mult_matrices(float* C, const float* A, const float* B, const size_t num)
+{
+#pragma omp parallel for
+	for (int i = 0; i < static_cast<int>(num); ++i) {
+		mult_matrix(C + 16*i, A + 16*i, B + 16*i);
+	}
+}
+
+double
+flops(const size_t num, const double elapsed_ms)
+{
+	/* num of multiplications and additions in a single matrix-matrix multiplication */
+	const int ops(4 * 4 * (4 + 3));
+
+	return static_cast<double>(num * ops) * 1.0e3 / elapsed_ms;
+}