作者lgen7604 ()
看板C_and_CPP
標題Re: [問題] 使用CUDA Share memory執行"點乘"比global memory慢
時間Thu Mar 18 03:11:51 2010
我還是再提醒一次
寫CUDA的時候一定要盡可能的想清楚
每個block和每個thread該如何處理分配到的工作
我希望你注意的地方有兩個
第一個是我和你寫法不同的shared memory的點乘
你的想法和我的想法有什麼區別?
第二個是CUDA的計時方式
如果善用CUDA的Event
可以做到精確度微秒的計時
以下的作法提供你作為參考
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#define NNx 512          // matrix width  (number of columns)
#define NNy 512          // matrix height (number of rows)
#define BLOCK_SIZE 16    // thread-block edge: BLOCK_SIZE x BLOCK_SIZE threads per block
// Host-side matrices in static storage (no malloc/free needed).
float A[NNy][NNx];   // first input operand
float B[NNy][NNx];   // second input operand
float C_g[NNy][NNx]; // host copy of the result from the global-memory kernel
float C_s[NNy][NNx]; // host copy of the result from the shared-memory kernel
// ------------------- 用 share memory ------------------- //
// Element-wise ("point") product staged through shared memory.
// Grid layout: one 2D block of BLOCK_SIZE x BLOCK_SIZE threads per tile,
// one thread per matrix element. Each thread only ever touches its own
// shared-memory slot, so no __syncthreads() is required between the
// store and the load.
__global__ void Matrix_Point_Multiplication_SM( float *Ma, float *Nb,
float *Pc )
{
    __shared__ float Msm[BLOCK_SIZE][BLOCK_SIZE];
    __shared__ float Nsm[BLOCK_SIZE][BLOCK_SIZE];

    const int localY  = threadIdx.y;
    const int localX  = threadIdx.x;
    const int globalY = blockIdx.y * BLOCK_SIZE + localY;
    const int globalX = blockIdx.x * BLOCK_SIZE + localX;

    // Guard the partial tiles at the right/bottom edges of the matrix.
    if (globalY < NNy && globalX < NNx)
    {
        const int idx = globalY * NNx + globalX;
        // Stage both operands into on-chip shared memory...
        Msm[localY][localX] = Ma[idx];
        Nsm[localY][localX] = Nb[idx];
        // ...then multiply and write the product back to global memory.
        Pc[idx] = Msm[localY][localX] * Nsm[localY][localX];
    }
}
// ------------------- 用 global memory ------------------- //
// Element-wise ("point") product straight from global memory.
// One thread per matrix element; out-of-range threads exit early.
__global__ void Matrix_Point_Multiplication_GM( float *input1, float *input2,
float *output1 )
{
    const int x = blockDim.x * blockIdx.x + threadIdx.x;  // column
    const int y = blockDim.y * blockIdx.y + threadIdx.y;  // row

    if (x >= NNx || y >= NNy)
        return;  // grid tail guard: grid may overshoot the matrix edges

    const int idx = y * NNx + x;
    output1[idx] = input1[idx] * input2[idx];
}
// Benchmark driver: times the global-memory kernel against the
// shared-memory kernel with CUDA events (microsecond-class resolution)
// and verifies both produce the same element-wise product.
int main(int argc, char* argv[])
{
    float *Ad;
    float *Bd;
    float *Cd_g;          // device result of the global-memory kernel
    float *Cd_s;          // device result of the shared-memory kernel
    float elapsedTime;    // milliseconds, reported by cudaEventElapsedTime
    // Ceil-divide so partial tiles at the right/bottom edges are covered.
    int bx = (NNx + BLOCK_SIZE - 1) / BLOCK_SIZE;
    int by = (NNy + BLOCK_SIZE - 1) / BLOCK_SIZE;
    int equal = 1;
    dim3 blocks(bx, by);
    dim3 threads(BLOCK_SIZE, BLOCK_SIZE);

    // Fill the inputs with pseudo-random data. The explicit cast
    // documents (and silences) the int -> float conversion.
    for (int i = 0; i < NNy; i++) {
        for (int j = 0; j < NNx; j++) {
            A[i][j] = (float)rand();
            B[i][j] = (float)rand();
        }
    }

    cudaMalloc( (void**) &Ad,   sizeof(float)*NNx*NNy );
    cudaMalloc( (void**) &Bd,   sizeof(float)*NNx*NNy );
    cudaMalloc( (void**) &Cd_g, sizeof(float)*NNx*NNy );
    cudaMalloc( (void**) &Cd_s, sizeof(float)*NNx*NNy );
    cudaMemcpy( Ad, A, sizeof(float)*NNx*NNy, cudaMemcpyHostToDevice );
    cudaMemcpy( Bd, B, sizeof(float)*NNx*NNy, cudaMemcpyHostToDevice );

    // ------------------ global memory ------------------
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);
    Matrix_Point_Multiplication_GM<<<blocks, threads>>>( Ad, Bd, Cd_g );
    cudaDeviceSynchronize();   // cudaThreadSynchronize() is deprecated
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&elapsedTime, start, stop);
    printf("Global memory processing time: %f (ms)\n", elapsedTime);

    // ------------------ shared memory ------------------
    // BUG FIX: the original launched the *global-memory* kernel here, so
    // both timings measured the same code path and C_s never came from
    // the shared-memory kernel.  Launch the SM kernel instead.  No
    // dynamic shared-memory size is passed because the kernel declares
    // its tiles as static __shared__ arrays.
    cudaEventRecord(start, 0);
    Matrix_Point_Multiplication_SM<<<blocks, threads>>>( Ad, Bd, Cd_s );
    cudaDeviceSynchronize();
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&elapsedTime, start, stop);
    printf("Shared memory processing time: %f (ms)\n", elapsedTime);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    // ------------------ copy results back and compare ------------------
    cudaMemcpy( C_g, Cd_g, sizeof(float)*NNx*NNy, cudaMemcpyDeviceToHost );
    cudaMemcpy( C_s, Cd_s, sizeof(float)*NNx*NNy, cudaMemcpyDeviceToHost );
    for (int i = 0; i < NNy && equal == 1; i++) {
        for (int j = 0; j < NNx && equal == 1; j++) {
            // Both kernels perform the identical single float multiply,
            // so the results should agree to a tiny absolute tolerance.
            if (C_g[i][j] - C_s[i][j] > 1e-6f) equal = 0;
            if (C_s[i][j] - C_g[i][j] > 1e-6f) equal = 0;
        }
    }
    if (equal == 0)
        printf("Error\n");
    else
        printf("Done\n");

    cudaFree(Ad);
    cudaFree(Bd);
    cudaFree(Cd_g);
    cudaFree(Cd_s);
    system("pause");   // keep the console window open on Windows
    return 0;
}
--
--
※ 發信站: 批踢踢實業坊(ptt.cc)
◆ From: 122.120.44.12
推 aada:謝謝L大的提醒,我知道了,要多想想如何分配thread, 以及用正 03/18 03:32
→ aada:CUDA計時, 會比較精確 03/18 03:33