→ advance760:先把A B輸出來看到底對不對阿 妳有確認這個嗎 04/18 12:54
開發平台(Platform): (Ex: VC++, GCC, Linux, ...)
vc++ 2010; CUDA 4.1
額外使用到的函數庫(Library Used): (Ex: OpenGL, ...)
問題(Question):
小弟最近在寫一個矩陣相乘的CUDA程式,先用小型矩陣測試一下,但是跑出來的
答案不對,無奈就是找不到錯誤,有請板上高手指點一二!感激不盡! m(_ _)m
餵入的資料(Input):
A * B = C
其中
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
A = 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 , A是一個8*16的矩陣,如左邊所示,
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 B是A的轉置矩陣。
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
預期的正確結果(Expected Output):
16 16 16 16 16 16 16 0
16 16 16 16 16 16 16 0
16 16 16 16 16 16 16 0
C= 16 16 16 16 16 16 16 0
16 16 16 16 16 16 16 0
16 16 16 16 16 16 16 0
16 16 16 16 16 16 16 0
0 0 0 0 0 0 0 0
錯誤結果(Wrong Output):
16 16 16 16 16 16 16 0
16 16 16 16 16 16 16 0
16 16 16 16 16 16 16 0
C= 16 16 16 16 16 16 16 0
16 16 16 16 16 16 16 0
16 16 16 16 16 16 16 0
16 16 16 16 16 16 16 0
16 16 16 16 16 16 16 0
程式碼(Code):(請善用置底文網頁, 記得排版)
以下的程式碼,是參考NVIDIA官方發佈的 <<CUDA C Programming Guide>>(Version 4.1)
第25~第27頁所改寫的。
typedef struct {
int width;
int height;
int stride;
float* elements;
} Matrix;
Kernel:
__global__ void mul(Matrix da, Matrix db, Matrix dc) {
int blockRow = blockIdx.y;
int blockCol = blockIdx.x;
Matrix Csub = GetSubMatrix(dc, blockRow, blockCol);
float Cvalue = 0;
int row = threadIdx.y;
int col = threadIdx.x;
for (int m = 0; m < (da.width / 8); ++m) {
Matrix Asub = GetSubMatrix(da, blockRow, m);
Matrix Bsub = GetSubMatrix(db, m, blockCol);
__shared__ float As[8][8];
__shared__ float Bs[8][8];
As[row][col] = GetElement(Asub, row, col);
Bs[row][col] = GetElement(Bsub, row, col);
__syncthreads();
for (int e = 0; e < 8; ++e)
Cvalue += As[row][e] * Bs[e][col];
__syncthreads();
}
SetElement(Csub, row, col, Cvalue);
}
Kernel所中呼叫的函式:
__device__ float GetElement(const Matrix A, int row, int col)
{
return A.elements[row * A.width + col];
}
__device__ void SetElement(Matrix A, int row, int col, float value)
{
A.elements[row * A.width + col] = value;
}
__device__ Matrix GetSubMatrix(Matrix A, int row, int col)
{
Matrix Asub;
Asub.width = 8;
Asub.height = 8;
Asub.stride = A.stride;
Asub.elements = &A.elements[A.stride * 8 * row + 8 * col];
return Asub;
}
再來附上main函式中產生A矩陣及B矩陣的程式碼:
float a[8][16]={0};
float b[16][8]={0};
float c[8][8]={0};
for(i=0;i<7;i++){
for(j=0;j<16;j++){
a[i][j] =1;
}
}
for(i=0;i<16;i++){
for(j=0;j<8;j++){
b[i][j] =a[j][i];
}
}
最後附上GPU的記憶體配置、Kernel呼叫及相關程式碼:
Matrix da = {16,8, 8, a[0]};
Matrix db = {8, 16, 8, b[0]};
Matrix dc = {8, 8, 8, c[0]};
size_t sizea = da.width * da.height * sizeof(float);
size_t sizeb = db.width * db.height * sizeof(float);
size_t sizec = dc.width * dc.height * sizeof(float);
cudaMalloc(&da.elements,sizea);
cudaMalloc(&db.elements,sizeb);
cudaMemcpy(da.elements, a[0], sizea, cudaMemcpyHostToDevice);
cudaMemcpy(db.elements, b[0], sizeb, cudaMemcpyHostToDevice);
cudaMalloc(&dc.elements, sizec);
dim3 dimGrid(1,1);
dim3 dimBlock(8,8);
MatMulKernel<<<dimGrid, dimBlock, sizeof(float)*64>>>(da,db,dc);
cudaMemcpy(c[0], dc.elements, sizec, cudaMemcpyDeviceToHost);
cudaFree(da.elements);
cudaFree(db.elements);
cudaFree(dc.elements);
補充說明(Supplement):
很想知道問題出在哪,以上,麻煩大家了! 謝謝!!!
--
※ 發信站: 批踢踢實業坊(ptt.cc)
◆ From: 123.240.229.170