看板 C_and_CPP 關於我們 聯絡資訊
// ---------------------------------------------------------------------------
// PTT C_and_CPP forum post, code reformatted. Original question (translated):
//   "I tried rewriting the index your way but the output was wrong, so I kept
//    my original indexing. movaIM5 extracts the top/bottom submatrix (e.g. a
//    3x4 block of a 4x4 matrix), movaIM6 extracts the left/right one, then I
//    write the results to files. Is there a better way to write these
//    kernels? Thanks."
// Review fixes applied below:
//   * both kernels now bounds-check: the launch grid is rounded up to a
//     multiple of BLOCK_SIZE while the output buffers hold only
//     (NNx-1)*NNy floats, so the original wrote past the allocation;
//   * CUDA API calls and kernel launches are error-checked;
//   * device/host buffers are freed and fopen() failures are handled.
// NNx, NNy and BLOCK_SIZE are compile-time constants defined elsewhere.
// ---------------------------------------------------------------------------

// Copy a one-element-shifted view of input1 into output1 ("bottom" submatrix).
// Expects the 2-D launch of BLOCK_SIZE x BLOCK_SIZE thread blocks set up in
// the host code below.
// NOTE(review): input1[idx + 1] selects the *bottom* submatrix only if the
// data is stored column-major with NNx elements per column; the poster
// reports correct results, so the layout presumably matches — confirm.
__global__ void movaIM5(float *input1, float *output1)
{
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int idx = row * NNx + col;

    if (idx < (NNx - 1) * NNy) {         // output1 holds (NNx-1)*NNy floats
        output1[idx] = input1[idx + 1];  // bottom submatrix
        // output1[idx] = input1[idx];   // top submatrix
    }
}

// Copy an NNx-element-shifted view of input1 into output1 ("right" submatrix).
// Same launch configuration and layout caveat as movaIM5.
__global__ void movaIM6(float *input1, float *output1)
{
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int idx = row * NNx + col;

    // Guard both the output write and the shifted input read: when NNy < NNx
    // the read at idx + NNx can run past the NNx*NNy input elements.
    if (idx < (NNx - 1) * NNy && idx + NNx < NNx * NNy) {
        output1[idx] = input1[idx + NNx];  // right submatrix
        // output1[idx] = input1[idx];     // left submatrix
    }
}

// Abort with a diagnostic if a CUDA runtime call failed.
#define CUDA_CHECK(call)                                                  \
    do {                                                                  \
        cudaError_t err_ = (call);                                        \
        if (err_ != cudaSuccess) {                                        \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(err_));                            \
            exit(EXIT_FAILURE);                                           \
        }                                                                 \
    } while (0)

// Launch configuration: enough BLOCK_SIZE x BLOCK_SIZE blocks to cover NNx
// in each dimension (rounded up — hence the bounds checks in the kernels).
int bx = (NNx + BLOCK_SIZE - 1) / BLOCK_SIZE;
dim3 blocks(bx, bx);
dim3 threads(BLOCK_SIZE, BLOCK_SIZE);

// _______ GPU shift/copy of matrix IM2 _______ //
float *d_input;
CUDA_CHECK(cudaMalloc((void**)&d_input, sizeof(float)*NNx*NNy));
CUDA_CHECK(cudaMemcpy(d_input, input, sizeof(float)*NNx*NNy,
                      cudaMemcpyHostToDevice));

float *d_IM5, *d_IM6;
CUDA_CHECK(cudaMalloc((void**)&d_IM5, sizeof(float)*(NNx-1)*NNy));
CUDA_CHECK(cudaMalloc((void**)&d_IM6, sizeof(float)*(NNx-1)*NNy));

movaIM5<<<blocks, threads>>>( d_input, d_IM5 );
CUDA_CHECK(cudaGetLastError());  // catches a bad launch configuration
movaIM6<<<blocks, threads>>>( d_input, d_IM6 );
CUDA_CHECK(cudaGetLastError());

float *IM5, *IM6;
IM5 = (float*) malloc( sizeof(float)*(NNx-1)*NNy );
IM6 = (float*) malloc( sizeof(float)*(NNx-1)*NNy );
// cudaMemcpy is blocking, so it also synchronizes with the kernels above and
// surfaces any asynchronous execution error.
CUDA_CHECK(cudaMemcpy(IM5, d_IM5, sizeof(float)*(NNx-1)*NNy,
                      cudaMemcpyDeviceToHost));
CUDA_CHECK(cudaMemcpy(IM6, d_IM6, sizeof(float)*(NNx-1)*NNy,
                      cudaMemcpyDeviceToHost));

// Write the results out.
// NOTE(review): the print index i*NNy+j only matches the kernels' write
// stride of NNx when NNx == NNy (apparently true in the poster's test);
// revisit the row stride before using non-square matrices.
FILE *fp2;
char filename2[] = "D:\\GPU\\gpu_IM5.txt";
fp2 = fopen( filename2 , "w");
if (fp2 != NULL) {
    for (int i=0; i<NNy; i++) {
        for (int j=0; j<NNx-1; j++) {
            fprintf(fp2,"%8.4f ",IM5[i*NNy+j]);
        }
        fprintf(fp2,"\n");
    }
    fclose(fp2);
}

FILE *fp3;
char filename3[] = "D:\\GPU\\gpu_IM6.txt";
fp3 = fopen( filename3 , "w");
if (fp3 != NULL) {
    for (int i=0; i<NNy-1; i++) {
        for (int j=0; j<NNx; j++) {
            fprintf(fp3,"%8.4f ",IM6[i*NNy+j]);
        }
        fprintf(fp3,"\n");
    }
    fclose(fp3);
}

// Release device and host buffers (missing in the original).
CUDA_CHECK(cudaFree(d_input));
CUDA_CHECK(cudaFree(d_IM5));
CUDA_CHECK(cudaFree(d_IM6));
free(IM5);
free(IM6);

// --
// ※ 發信站: 批踢踢實業坊(ptt.cc) ◆ From: 140.122.192.147