X-Git-Url: http://www.fmaj7b5.info/git?p=cuda.git;a=blobdiff_plain;f=mult_matrices%2Fmult_matrices.cu;fp=mult_matrices%2Fmult_matrices.cu;h=96467dbdc460c27de2a337892d6ba38497a0f132;hp=0000000000000000000000000000000000000000;hb=d2ea0089dcca0e43ec2d836077c75384d23df2d9;hpb=e4a0dfee97228c4e3af62199e692b8fe6018939d

diff --git a/mult_matrices/mult_matrices.cu b/mult_matrices/mult_matrices.cu
new file mode 100644
index 0000000..96467db
--- /dev/null
+++ b/mult_matrices/mult_matrices.cu
@@ -0,0 +1,112 @@
+/*
+	Copyright (C) 2012, 2013  fmaj7b5.info
+
+	This program is free software: you can redistribute it and/or modify
+	it under the terms of the GNU General Public License as published by
+	the Free Software Foundation, either version 2 of the License, or
+	(at your option) any later version.
+
+	This program is distributed in the hope that it will be useful,
+	but WITHOUT ANY WARRANTY; without even the implied warranty of
+	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+	GNU General Public License for more details.
+
+	You should have received a copy of the GNU General Public License
+	along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <iostream>
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include "cuda\cuda_wrapper.h"
+
+#include "mult_matrices.h"
+#include "DeviceMemory.cuh"
+
+using namespace FM7b5;
+namespace cuda = FM7b5::cuda;
+
+#define DIV 16
+#define FM7b5_USE_PINNED
+
+__global__ void
+mult_matrix(float* C, const float* A, const float* B)
+{
+	const size_t d(threadIdx.z);
+	const size_t offset(16*(blockIdx.x*DIV + d));
+	const size_t index(threadIdx.x + blockDim.y*threadIdx.y);
+
+	__shared__ float _A[DIV][4*4], _B[DIV][4*4];
+
+	_A[d][index] = A[offset + index];
+	_B[d][index] = B[offset + index];
+
+	float c_elem = 0.0;
+	for (int k = 0; k < 4; ++k) {
+		c_elem += _A[d][threadIdx.x + 4*k] * _B[d][k + 4*threadIdx.y];
+	}
+
+	C[offset + index] = c_elem;
+}
+
+void
+FM7b5::mult_matrices_init_gpu()
+{
+	cudaError_t status;
+
+	cudaDeviceReset();
+	status = cudaSetDeviceFlags(cudaDeviceMapHost);
+	if (status != cudaSuccess) {
+		std::cerr << cudaGetErrorString(status) << std::endl;
+	}
+	
+	cudaFree(0);
+}
+
+void
+FM7b5::mult_matrices_gpu(float* C, const float* A, const float* B, const size_t num)
+{
+	const size_t num_elements(num * 4 * 4);
+
+#ifdef FM7b5_USE_PINNED
+	cudaError_t status;
+
+	if ((status = cudaHostRegister(const_cast<float*>(A), sizeof(float) * num_elements, 0)) != cudaSuccess) {
+		std::cerr << cudaGetErrorString(status) << std::endl;
+	}
+	if ((status = cudaHostRegister(const_cast<float*>(B), sizeof(float) * num_elements, 0)) != cudaSuccess) {
+		std::cerr << cudaGetErrorString(status) << std::endl;
+	}
+	if ((status = cudaHostRegister(C, sizeof(float) * num_elements, 0)) != cudaSuccess) {
+		std::cerr << cudaGetErrorString(status) << std::endl;
+	}
+#endif
+
+	memory::Linear<float> d_A(num_elements), d_B(num_elements), d_C(num_elements);
+
+	d_A.copy_from(A, sizeof(float) * num_elements);
+	d_B.copy_from(B, sizeof(float) * num_elements);
+
+	cuda::Event start, finish;
+
+	start.record();
+
+	mult_matrix<<<(num + (DIV - 1))/ DIV, dim3(4, 4, DIV)>>>(d_C.data(), d_A.data(), d_B.data());
+
+	finish.record();
+	finish.synchronize();
+
+	float ms;
+	cudaEventElapsedTime(&ms, start, finish);
+
+	std::cout << "kernel: " << ms << " [ms] (" << static_cast<double>(4*4*7*num) / ms * 1.0e-6 << " GFLOPS)" << std::endl;
+
+	d_C.copy_to(C, sizeof(float) * num_elements);
+
+#ifdef FM7b5_USE_PINNED
+	cudaHostUnregister(C);
+	cudaHostUnregister(const_cast<float*>(B));
+	cudaHostUnregister(const_cast<float*>(A));
+#endif
+}