看板 C_and_CPP 關於我們 聯絡資訊
開發平台(Platform): (Ex: VC++, GCC, Linux, ...)
vc++ 2010; CUDA 4.1

額外使用到的函數庫(Library Used): (Ex: OpenGL, ...)

問題(Question):
小弟最近在寫一個矩陣相乘的CUDA程式,先用小型矩陣測試一下,但是跑出來的
答案不對,無奈就是找不到錯誤,有請板上高手指點一二!感激不盡! m(_ _)m

餵入的資料(Input):
A * B = C 其中

    1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
    1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
    1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
    1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
A = 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 , A是一個8*16的矩陣,如左邊所示,
    1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1   B是A的轉置矩陣。
    1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
    0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

預期的正確結果(Expected Output):

    16 16 16 16 16 16 16 0
    16 16 16 16 16 16 16 0
    16 16 16 16 16 16 16 0
C=  16 16 16 16 16 16 16 0
    16 16 16 16 16 16 16 0
    16 16 16 16 16 16 16 0
    16 16 16 16 16 16 16 0
    0 0 0 0 0 0 0 0

錯誤結果(Wrong Output):

    16 16 16 16 16 16 16 0
    16 16 16 16 16 16 16 0
    16 16 16 16 16 16 16 0
C=  16 16 16 16 16 16 16 0
    16 16 16 16 16 16 16 0
    16 16 16 16 16 16 16 0
    16 16 16 16 16 16 16 0
    16 16 16 16 16 16 16 0

程式碼(Code):(請善用置底文網頁, 記得排版)

以下的程式碼,是參考NVIDIA官方發佈的 <<CUDA C Programming Guide>>(Version 4.1)
第25~第27頁所改寫的。

typedef struct {
    int width;
    int height;
    int stride;
    float* elements;
} Matrix;

Kernel:

__global__ void mul(Matrix da, Matrix db, Matrix dc) {
    int blockRow = blockIdx.y;
    int blockCol = blockIdx.x;
    Matrix Csub = GetSubMatrix(dc, blockRow, blockCol);
    float Cvalue = 0;
    int row = threadIdx.y;
    int col = threadIdx.x;
    for (int m = 0; m < (da.width / 8); ++m) {
        Matrix Asub = GetSubMatrix(da, blockRow, m);
        Matrix Bsub = GetSubMatrix(db, m, blockCol);
        __shared__ float As[8][8];
        __shared__ float Bs[8][8];
        As[row][col] = GetElement(Asub, row, col);
        Bs[row][col] = GetElement(Bsub, row, col);
        __syncthreads();
        for (int e = 0; e < 8; ++e)
            Cvalue += As[row][e] * Bs[e][col];
        __syncthreads();
    }
    SetElement(Csub, row, col, Cvalue);
}

Kernel中所呼叫的函式:

__device__ float GetElement(const Matrix A, int row, int col) {
    return A.elements[row * A.width + col];
}

__device__ void SetElement(Matrix A, int row, int col, float value) {
    A.elements[row * A.width + col] = value;
}
__device__ Matrix GetSubMatrix(Matrix A, int row, int col) {
    Matrix Asub;
    Asub.width = 8;
    Asub.height = 8;
    Asub.stride = A.stride;
    Asub.elements = &A.elements[A.stride * 8 * row + 8 * col];
    return Asub;
}

再來附上main函式中產生A矩陣及B矩陣的程式碼:

float a[8][16]={0};
float b[16][8]={0};
float c[8][8]={0};
for(i=0;i<7;i++){
    for(j=0;j<16;j++){
        a[i][j] =1;
    }
}
for(i=0;i<16;i++){
    for(j=0;j<8;j++){
        b[i][j] =a[j][i];
    }
}

最後附上GPU的記憶體配置、Kernel呼叫及相關程式碼:

Matrix da = {16,8, 8, a[0]};
Matrix db = {8, 16, 8, b[0]};
Matrix dc = {8, 8, 8, c[0]};
size_t sizea = da.width * da.height * sizeof(float);
size_t sizeb = db.width * db.height * sizeof(float);
size_t sizec = dc.width * dc.height * sizeof(float);
cudaMalloc(&da.elements,sizea);
cudaMalloc(&db.elements,sizeb);
cudaMemcpy(da.elements, a[0], sizea, cudaMemcpyHostToDevice);
cudaMemcpy(db.elements, b[0], sizeb, cudaMemcpyHostToDevice);
cudaMalloc(&dc.elements, sizec);
dim3 dimGrid(1,1);
dim3 dimBlock(8,8);
MatMulKernel<<<dimGrid, dimBlock, sizeof(float)*64>>>(da,db,dc);
cudaMemcpy(c[0], dc.elements, sizec, cudaMemcpyDeviceToHost);
cudaFree(da.elements);
cudaFree(db.elements);
cudaFree(dc.elements);

補充說明(Supplement):
很想知道問題出在哪,以上,麻煩大家了! 謝謝!!!

--
※ 發信站: 批踢踢實業坊(ptt.cc)
◆ From: 123.240.229.170
advance760: 先把 A、B 輸出來看到底對不對啊,妳有確認這個嗎? 04/18 12:54