我試著把index改成你的寫法, 但是輸出結果不對, 於是我就照原來的寫法,
下面是我的程式: movaIM5是取大矩陣(ex. 4x4)的上面與下面(3x4)子矩陣, movaIM6則是取左邊與右邊的子矩陣,
最後再將結果輸出到檔案。請問一下我在kernel中這樣的寫法OK嗎? (我目前只注意到結果是否正確,
想請教是不是有比較好的寫法)
謝謝
// movaIM5: for every element of an NNx-wide, NNy-line matrix, copy the element
// that follows it on the same line (flat index + 1) into the same position of
// the output.
//
// Launch layout: 2D grid of 2D blocks; x indexes the fast dimension (0..NNx-1),
// y indexes the slow dimension (0..NNy-1). The grid may be rounded up past the
// matrix; surplus threads do nothing.
//
// input1  : NNx*NNy floats, flat index row*NNx+col.
// output1 : written with the SAME NNx stride as the input, at indices up to
//           (NNy-1)*NNx + (NNx-2). The caller's buffer must cover that range
//           (allocating NNx*NNy floats is the simplest way). Position NNx-1 of
//           every line is left untouched.
//
// NOTE(review): the original author labels the +1 shift the "lower" sub-matrix,
// which matches column-major data; for row-major data it would be the "right"
// sub-matrix. Confirm the storage order against the caller.
__global__ void movaIM5(float *input1, float *output1)
{
    int col = blockIdx.x*blockDim.x+threadIdx.x;
    int row = blockIdx.y*blockDim.y+threadIdx.y;

    // Skip threads outside the matrix, and the last position of each line:
    // its "+1" neighbour would wrap onto the next line (or, for the very last
    // element, read one float past the end of input1).
    if (row >= NNy || col >= NNx-1)
        return;

    int idx = row*NNx+col;
    output1[idx] = input1[idx+1]; // take the "lower" sub-matrix (author's label)
    //output1[idx] = input1[idx]; // take the "upper" sub-matrix (author's label)
}
// movaIM6: for every element of an NNx-wide, NNy-line matrix, copy the element
// one full line further on (flat index + NNx) into the same position of the
// output.
//
// Launch layout: 2D grid of 2D blocks; x indexes the fast dimension (0..NNx-1),
// y indexes the slow dimension (0..NNy-1). The grid may be rounded up past the
// matrix; surplus threads do nothing.
//
// input1  : NNx*NNy floats, flat index row*NNx+col.
// output1 : written with the same NNx stride, at indices 0 .. (NNy-1)*NNx - 1,
//           so the caller's buffer must hold at least (NNy-1)*NNx floats.
//
// NOTE(review): the original author labels the +NNx shift the "right"
// sub-matrix, which matches column-major data; for row-major data it would be
// the "lower" sub-matrix. Confirm the storage order against the caller.
__global__ void movaIM6(float *input1, float *output1)
{
    int col = blockIdx.x*blockDim.x+threadIdx.x;
    int row = blockIdx.y*blockDim.y+threadIdx.y;

    // Skip threads outside the matrix, and the whole last line: its "+NNx"
    // neighbour does not exist and the read would run past the end of input1.
    if (col >= NNx || row >= NNy-1)
        return;

    int idx = row*NNx+col;
    output1[idx] = input1[idx+NNx]; // take the "right" sub-matrix (author's label)
    //output1[idx] = input1[idx];   // take the "left" sub-matrix (author's label)
}
// Abort with a readable message when a CUDA runtime call fails; otherwise a
// failed allocation, copy or launch silently yields garbage in the output files.
#ifndef MOVA_CUDA_CHECK
#define MOVA_CUDA_CHECK(call)                                              \
    do {                                                                   \
        cudaError_t mova_err_ = (call);                                    \
        if (mova_err_ != cudaSuccess) {                                    \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(mova_err_));                        \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)
#endif

// Launch geometry: the kernels use x for the NNx-wide index and y for the
// NNy-long index, so each grid dimension is rounded up from its own extent
// (the y extent used to be derived from NNx, which is only right when
// NNx == NNy). Because the grid is rounded up, the kernels have to ignore
// threads whose col/row fall outside the matrix.
int bx = (NNx + BLOCK_SIZE - 1) / BLOCK_SIZE;
dim3 blocks(bx, (NNy + BLOCK_SIZE - 1) / BLOCK_SIZE);
dim3 threads(BLOCK_SIZE, BLOCK_SIZE);

// Both kernels address their output as row*NNx+col, i.e. with the full NNx
// stride, so every buffer needs the full NNx*NNy footprint. The previous
// (NNx-1)*NNy allocations were too small for that layout.
const size_t matBytes = sizeof(float)*NNx*NNy;

// _______ GPU: shift matrix IM2 _______ //
float *d_input;
MOVA_CUDA_CHECK(cudaMalloc((void**)&d_input, matBytes));
MOVA_CUDA_CHECK(cudaMemcpy(d_input, input, matBytes, cudaMemcpyHostToDevice));

float *d_IM5, *d_IM6;
MOVA_CUDA_CHECK(cudaMalloc((void**)&d_IM5, matBytes));
MOVA_CUDA_CHECK(cudaMalloc((void**)&d_IM6, matBytes));
// Zero the outputs so positions a kernel does not write are well defined.
MOVA_CUDA_CHECK(cudaMemset(d_IM5, 0, matBytes));
MOVA_CUDA_CHECK(cudaMemset(d_IM6, 0, matBytes));

movaIM5<<<blocks, threads>>>( d_input, d_IM5 );
MOVA_CUDA_CHECK(cudaGetLastError());   // reports an invalid launch configuration
movaIM6<<<blocks, threads>>>( d_input, d_IM6 );
MOVA_CUDA_CHECK(cudaGetLastError());

float *IM5, *IM6;
IM5 = (float*) malloc( matBytes );
IM6 = (float*) malloc( matBytes );
if (IM5 == NULL || IM6 == NULL)
{
    fprintf(stderr, "host allocation of %lu bytes failed\n", (unsigned long)matBytes);
    exit(EXIT_FAILURE);
}
// Blocking copies: they wait for the kernels to finish and also surface any
// error raised while the kernels were running.
MOVA_CUDA_CHECK(cudaMemcpy( IM5, d_IM5, matBytes, cudaMemcpyDeviceToHost ));
MOVA_CUDA_CHECK(cudaMemcpy( IM6, d_IM6, matBytes, cudaMemcpyDeviceToHost ));

// Write the results.
// IM5: NNy lines of NNx-1 values. Lines are NNx apart in memory (the stride the
// kernel wrote with), not NNy.
FILE *fp2;
char filename2[] = "D:\\GPU\\gpu_IM5.txt";
fp2= fopen( filename2 , "w");
if (fp2 == NULL)
{
    perror(filename2);
}
else
{
    for(int i=0; i<NNy; i++)
    {
        for (int j=0; j<NNx-1; j++)
        {
            fprintf(fp2,"%8.4f ",IM5[i*NNx+j]);
        }
        fprintf(fp2,"\n");
    }
    fclose(fp2);
}

// IM6: NNy-1 lines of NNx values, same NNx stride.
FILE *fp3;
char filename3[] = "D:\\GPU\\gpu_IM6.txt";
fp3= fopen( filename3 , "w");
if (fp3 == NULL)
{
    perror(filename3);
}
else
{
    for(int i=0; i<NNy-1; i++)
    {
        for (int j=0; j<NNx; j++)
        {
            fprintf(fp3,"%8.4f ",IM6[i*NNx+j]);
        }
        fprintf(fp3,"\n");
    }
    fclose(fp3);
}

// Release everything once the results are on disk. If later code still needs
// these buffers, move the releases after their last use.
free(IM5);
free(IM6);
MOVA_CUDA_CHECK(cudaFree(d_IM5));
MOVA_CUDA_CHECK(cudaFree(d_IM6));
MOVA_CUDA_CHECK(cudaFree(d_input));
--
※ 發信站: 批踢踢實業坊(ptt.cc)
◆ From: 140.122.192.147