看板 C_and_CPP 關於我們 聯絡資訊
我還是再提醒一次 寫CUDA的時候一定要盡可能的想清楚 每個block和每個thread該如何處理分配到的工作 我希望你注意的地方有兩個 第一個是我和你寫法不同的shared memory的點乘 你的想法和我的想法有什麼區別? 第二個是CUDA的計時方式 如果善用CUDA的Event 可以做到精確度微秒的計時 以下的作法提供你作為參考

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

#define NNx 512
#define NNy 512
#define BLOCK_SIZE 16

// Abort with file/line context whenever a CUDA runtime call fails.
// Kernel-launch configuration errors are caught by wrapping
// cudaGetLastError() immediately after each launch.
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,   \
                    cudaGetErrorString(err_));                              \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)

float A[NNy][NNx];
float B[NNy][NNx];
float C_g[NNy][NNx];  // host copy of the global-memory kernel's result
float C_s[NNy][NNx];  // host copy of the shared-memory kernel's result

// ------------------- using shared memory ------------------- //
// Element-wise (Hadamard) product computed through a shared-memory staging
// tile. Each thread stages and consumes only its own element, so no
// __syncthreads() is needed; the tile exists purely to contrast the two
// memory paths. Expects a 2D grid of BLOCK_SIZE x BLOCK_SIZE blocks
// covering an NNy x NNx row-major matrix; out-of-range threads are guarded.
__global__ void Matrix_Point_Multiplication_SM( float *Ma, float *Nb, float *Pc )
{
    __shared__ float Msm[BLOCK_SIZE][BLOCK_SIZE];
    __shared__ float Nsm[BLOCK_SIZE][BLOCK_SIZE];

    int ty  = threadIdx.y;
    int tx  = threadIdx.x;
    int row = blockIdx.y * BLOCK_SIZE;
    int col = blockIdx.x * BLOCK_SIZE;

    if( (row+ty < NNy) && (col+tx < NNx) )
    {
        // Stage both operands into shared memory.
        Msm[ty][tx] = Ma[ (row+ty)*NNx + (col+tx) ];
        Nsm[ty][tx] = Nb[ (row+ty)*NNx + (col+tx) ];
        // Multiply out of the shared tile back into global memory.
        Pc[ (row+ty)*NNx + (col+tx) ] = Msm[ty][tx] * Nsm[ty][tx];
    }
}

// ------------------- using global memory ------------------- //
// Same element-wise product, reading the operands directly from global
// memory. Adjacent threads in x touch adjacent addresses, so the accesses
// are fully coalesced.
__global__ void Matrix_Point_Multiplication_GM( float *input1, float *input2, float *output1 )
{
    int col = blockIdx.x*blockDim.x + threadIdx.x;  // x indexes the column
    int row = blockIdx.y*blockDim.y + threadIdx.y;  // y indexes the row
    if( col < NNx && row < NNy )
    {
        output1[ row*NNx+col ] = input1[ row*NNx+col ] * input2[ row*NNx+col ];
    }
}

// Times the element-wise matrix product with CUDA events (microsecond-level
// precision), once through global memory and once through shared memory,
// then verifies that both kernels produced the same result.
int main(int argc, char* argv[])
{
    float *Ad;
    float *Bd;
    float *Cd_g;  // device output of the global-memory kernel
    float *Cd_s;  // device output of the shared-memory kernel
    float elapsedTime;
    int bx = (NNx + BLOCK_SIZE - 1) / BLOCK_SIZE;  // ceil-div so partial tiles are covered
    int by = (NNy + BLOCK_SIZE - 1) / BLOCK_SIZE;
    int equal = 1;

    dim3 blocks(bx, by);
    dim3 threads(BLOCK_SIZE, BLOCK_SIZE);

    for(int i = 0; i < NNy; i++)
    {
        for(int j = 0; j < NNx; j++)
        {
            A[i][j] = rand();
            B[i][j] = rand();
        }
    }

    CUDA_CHECK( cudaMalloc( (void**) &Ad,   sizeof(float)*NNx*NNy ) );
    CUDA_CHECK( cudaMalloc( (void**) &Bd,   sizeof(float)*NNx*NNy ) );
    CUDA_CHECK( cudaMalloc( (void**) &Cd_g, sizeof(float)*NNx*NNy ) );
    CUDA_CHECK( cudaMalloc( (void**) &Cd_s, sizeof(float)*NNx*NNy ) );

    CUDA_CHECK( cudaMemcpy( Ad, A, sizeof(float)*NNx*NNy, cudaMemcpyHostToDevice ) );
    CUDA_CHECK( cudaMemcpy( Bd, B, sizeof(float)*NNx*NNy, cudaMemcpyHostToDevice ) );

    // ------------------ global memory ------------------
    cudaEvent_t start, stop;
    CUDA_CHECK( cudaEventCreate(&start) );
    CUDA_CHECK( cudaEventCreate(&stop) );

    CUDA_CHECK( cudaEventRecord(start, 0) );
    Matrix_Point_Multiplication_GM<<<blocks, threads>>>( Ad, Bd, Cd_g );
    CUDA_CHECK( cudaGetLastError() );          // catch launch-configuration errors
    CUDA_CHECK( cudaEventRecord(stop, 0) );
    // cudaEventSynchronize on the stop event makes the deprecated
    // cudaThreadSynchronize() of the original unnecessary.
    CUDA_CHECK( cudaEventSynchronize(stop) );
    CUDA_CHECK( cudaEventElapsedTime(&elapsedTime, start, stop) );
    printf("Global memory processing time: %f (ms)\n", elapsedTime);

    // ------------------ shared memory ------------------
    // BUG FIX: the original launched Matrix_Point_Multiplication_GM here a
    // second time, so the reported "shared memory" time actually measured
    // the global-memory kernel and the later comparison was vacuous.
    // Launch the _SM kernel instead; its shared arrays are statically
    // declared, so no dynamic shared-memory size argument is needed.
    CUDA_CHECK( cudaEventRecord(start, 0) );
    Matrix_Point_Multiplication_SM<<<blocks, threads>>>( Ad, Bd, Cd_s );
    CUDA_CHECK( cudaGetLastError() );
    CUDA_CHECK( cudaEventRecord(stop, 0) );
    CUDA_CHECK( cudaEventSynchronize(stop) );
    CUDA_CHECK( cudaEventElapsedTime(&elapsedTime, start, stop) );
    printf("Shared memory processing time: %f (ms)\n", elapsedTime);

    CUDA_CHECK( cudaEventDestroy(start) );
    CUDA_CHECK( cudaEventDestroy(stop) );

    // ------------------ copy results back and compare ------------------
    CUDA_CHECK( cudaMemcpy( C_g, Cd_g, sizeof(float)*NNx*NNy, cudaMemcpyDeviceToHost ) );
    CUDA_CHECK( cudaMemcpy( C_s, Cd_s, sizeof(float)*NNx*NNy, cudaMemcpyDeviceToHost ) );

    // Both kernels perform the identical single float multiply per element,
    // so the two results should agree within the tolerance.
    for(int i = 0; i < NNy && equal == 1; i++)
    {
        for(int j = 0; j < NNx && equal == 1; j++)
        {
            if(C_g[i][j] - C_s[i][j] > 1e-6) equal = 0;
            if(C_s[i][j] - C_g[i][j] > 1e-6) equal = 0;
        }
    }
    if(equal == 0) printf("Error\n");
    else           printf("Done\n");

    cudaFree(Ad);
    cudaFree(Bd);
    cudaFree(Cd_g);
    cudaFree(Cd_s);

#ifdef _WIN32
    system("pause");   // "pause" only exists on Windows; skip it elsewhere
#endif
    return 0;
}
-- -- ※ 發信站: 批踢踢實業坊(ptt.cc) ◆ From: 122.120.44.12
aada:謝謝L大的提醒,我知道了,要多想想如何分配thread, 以及用正確的 03/18 03:32
aada:CUDA計時, 會比較精確 03/18 03:33