作者aada (aada)
看板C_and_CPP
標題[問題] 使用CUDA Share memory執行"點乘"比global memory慢
時間Wed Mar 17 23:49:45 2010
請問一下,
我想使用Share memory來執行點乘,
但我測出來, 竟然是使用global memory比較快,
請問為什麼呢?是我的share memory寫錯了嗎
是因為矩陣點乘的重覆率不高嗎(一點對一點乘,並沒有重複)
以下是我的kernel function
#include<stdio.h>
#include<stdlib.h>
#include<time.h>
#include<cuda.h>
#include <math.h>
#include <time.h>
#define NNx 512
#define NNy 512
#define BLOCK_SIZE 16
// _____________ 矩陣點乘 _____________ //
__global__ void Matrix_Point_Multiplication_GM( float *input1, float *input2,
float *output1 )
{
int col = blockIdx.x*blockDim.x+threadIdx.x;
int row = blockIdx.y*blockDim.y+threadIdx.y;
if( col < NNx && row < NNy )
{
output1[ row*NNx+col ] = input1[ row*NNx+col ] * input2[ row*NNx+col ] ;
}
}
// _____________ 矩陣點乘( Share Memory ) _____________ //
__global__ void Matrix_Point_Multiplication_SM( float *Ma, float *Nb, float
*Pc )
{
__shared__ float Msm[ BLOCK_SIZE ][ BLOCK_SIZE ];
__shared__ float Nsm[ BLOCK_SIZE ][ BLOCK_SIZE ];
__shared__ float Psm[ BLOCK_SIZE ][ BLOCK_SIZE ];
int k, w;
int ty = threadIdx.y; //0*16,1*16,~15*16
int tx = threadIdx.x; // 0~15 *15
int row = blockIdx.y * BLOCK_SIZE ; //0*512 16*512 32*512...
int col = blockIdx.x * BLOCK_SIZE ; //(0:16:512) *512
for( k=0; k<NNx; k+=BLOCK_SIZE )
{
// 矩陣搬移Share memory
Msm[ tx ][ ty ] = Ma[ (row+tx)*NNx +(k+ty) ];
Nsm[ tx ][ ty ] = Nb[ (row+tx)*NNx +(k+ty) ];
__syncthreads();
// 矩陣運算Share memory
Pc[ (row+tx)*NNx +(k+ty) ] = Msm[ tx ][ ty ] * Nsm[ tx ][ ty
];
__syncthreads();
}
}
int main()
{
int bx = (NNx + BLOCK_SIZE - 1) / BLOCK_SIZE;
int by = (NNy + BLOCK_SIZE - 1) / BLOCK_SIZE;
dim3 blocks(bx,by);
dim3 threads(BLOCK_SIZE,BLOCK_SIZE);
FILE *fp1, *fp2, *fp3;
char filename1[] = "n_phase2.txt", filename2[] = "output.txt", filename3[]
= "output_2.txt";
int i=0;
float *input1, *input2, *output, *d_input1, *d_input2, *d_output,
*d_output_s;
clock_t start_gm, end_gm, start_sm, end_sm;
input1 = (float*) malloc( sizeof(float)*NNx*NNy );
input2 = (float*) malloc( sizeof(float)*NNx*NNy );
output = (float*) malloc( sizeof(float)*NNx*NNy );
cudaMalloc( (void**)&d_input1, sizeof(float)*NNx*NNy );
cudaMalloc( (void**)&d_input2, sizeof(float)*NNx*NNy );
cudaMalloc( (void**)&d_output, sizeof(float)*NNx*NNy );
cudaMalloc( (void**)&d_output_s, sizeof(float)*NNx*NNy );
printf("input1");
for( i=0; i<NNx*NNy; i++)
{
input1[ i ] = i;
}
printf("\n");
printf("input2");
for( i=0; i<NNx*NNy; i++)
{
input2[ i ] = i;
}
printf("\n");
cudaMemcpy( d_input1, input1, sizeof(float)*NNx*NNy, cudaMemcpyHostToDevice
);
cudaMemcpy( d_input2, input2, sizeof(float)*NNx*NNy, cudaMemcpyHostToDevice
);
// ____________ Global Memory ____________ //
Matrix_Point_Multiplication_GM<<<blocks, threads>>>( d_input1, d_input2,
d_output );
cudaMemcpy( output, d_output, sizeof(float)*NNx*NNy, cudaMemcpyDeviceToHost
);
// ____________ Share Memory ____________ //
float *output_s;
output_s = (float*) malloc( sizeof(float)*NNx*NNy );
Matrix_Point_Multiplication_SM<<<blocks, threads,
BLOCK_SIZE*BLOCK_SIZE*sieof(float)>>>( d_input1, d_input2, d_output_s );
cudaMemcpy( output_s, d_output_s, sizeof(float)*NNx*NNy,
cudaMemcpyDeviceToHost );
printf("\n");
system("PAUSE");
return 0;
}
◆ From: 140.122.192.147
※ 編輯: aada 來自: 140.122.192.147 (03/17 23:51)
※ 編輯: aada 來自: 140.122.192.147 (03/17 23:52)
推 LittleBob:長寬都是16的倍數嘛??? 03/18 00:00
→ aada:是的,我處理的矩陣式512x512y 03/18 01:00
推 lgen7604:我想先知道你怎麼呼叫kernel function 03/18 01:44
→ lgen7604:真的有用到shared memory嗎? 03/18 01:45
※ 編輯: aada 來自: 140.122.192.147 (03/18 01:56)
推 lgen7604:果然沒用到shared memory啊 03/18 02:10
→ aada:怎麼說呢,謝謝 03/18 02:18
→ aada:<<< , ,share memory size>>> 03/18 02:19
※ 編輯: aada 來自: 140.122.192.147 (03/18 02:33)