看板 C_and_CPP 關於我們 聯絡資訊
/*
 * Forum question (translated from the original post):
 *
 *   I want to paste the matrix A that I extracted (16x15) into a new
 *   matrix B (16x17). The problem is that some values are not pasted,
 *   which makes the order of everything after them wrong.
 *
 *   Example "matrix A / matrix B / matrix C (correct)" as posted; the
 *   column boundaries between the three matrices were lost when the post
 *   was flattened, one posted row per line:
 *       1 2 3 0 0 0 0 0 1 2 3 0
 *       4 5 6 0 0 0 0 0 4 5 6 0
 *       7 8 9 0 0 0 0 0 7 8 9 0
 *
 *   Thanks. The code follows.
 *
 * Review notes on the fix:
 *   - The original cu_delx wrote the pasted value at column col+1 of a
 *     17-wide row, so the intended layout is read here as
 *     [0, A-row (15 values), 0]. NOTE(review): confirm against the
 *     intended "matrix C" if the padding should be placed differently.
 *   - The original kernel had each thread zero one element and a different
 *     thread store the pasted value into that same element with no
 *     ordering between them, so the zero could overwrite the value
 *     ("some values are not pasted").
 *   - The original kernel read the 15-wide source with a row stride of
 *     NNx (16), which scrambled the order and read past the buffer end.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cuda_runtime.h>
#include <cutil.h> // kept from the original post; no cutil symbol is used below

#define NNx 16
#define NNy 16
#define BLOCK_SIZE 16
#define ALIGN 3 // field width used to align the printf output

// Abort with file/line and the CUDA error string when a runtime call fails.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

// Allocate host memory or abort; keeps main free of repeated NULL checks.
static float *hostAlloc(size_t bytes)
{
    float *p = (float*) malloc(bytes);
    if (p == NULL) {
        fprintf(stderr, "host allocation of %lu bytes failed\n",
                (unsigned long) bytes);
        exit(EXIT_FAILURE);
    }
    return p;
}

/*
 * Copy the right-hand part of the NNy x NNx input (columns 1..NNx-1) into
 * output1, which is stored as NNy rows of (NNx-1) values.
 * Expects a 2D launch; threads outside the copied region do nothing.
 */
__global__ void movaIM5(float *input1, float *output1)
{
    int col = blockIdx.x*blockDim.x+threadIdx.x;
    int row = blockIdx.y*blockDim.y+threadIdx.y;

    if(col+1 < NNx && row < NNy)
        output1[row*(NNx-1)+col] = input1[row*NNx+col+1]; // take the right part

    // Alternative from the original post: take the left part instead.
    //if(col < (NNx-1) && row < NNy)
    //    output1[row*(NNx-1)+col] = input1[row*NNx+col];
}

/*
 * Copy the lower part of the NNy x NNx input (rows 1..NNy-1) into output1,
 * which is stored as (NNy-1) rows of NNx values.
 * Expects a 2D launch; threads outside the copied region do nothing.
 */
__global__ void movaIM6(float *input1, float *output1)
{
    int col = blockIdx.x*blockDim.x+threadIdx.x;
    int row = blockIdx.y*blockDim.y+threadIdx.y;

    if(col < NNx && row+1 < NNy)
        output1[row*NNx+col] = input1[(row+1)*NNx+col]; // take the lower part

    // Alternative from the original post: take the upper part instead.
    //if(col < NNx && row < (NNy-1))
    //    output1[row*NNx+col] = input1[row*NNx+col];
}

/*
 * Paste the NNy x (NNx-1) matrix input2 into the NNy x (NNx+1) matrix
 * input1, leaving one zero column on each side.
 *
 * input1: destination, NNy rows of (NNx+1) values (written).
 * input2: source, NNy rows of (NNx-1) values (read).
 *
 * Expects a 2D launch that covers at least (NNx+1) columns and NNy rows.
 * Every destination element is written exactly once, by the thread that
 * owns it, so there is no zero-then-overwrite race between threads.
 * (The original post marked the store in this kernel with: "I think the
 * problem is here, but how should I set col,row?")
 */
__global__ void cu_delx( float *input1, float *input2 )
{
    const int srcWidth = NNx - 1; // row stride of input2
    const int dstWidth = NNx + 1; // row stride of input1

    int col = blockIdx.x*blockDim.x+threadIdx.x;
    int row = blockIdx.y*blockDim.y+threadIdx.y;

    if( col < dstWidth && row < NNy )
    {
        float value = 0.0f; // border columns stay zero
        if( col >= 1 && col <= srcWidth )
            value = input2[ row*srcWidth + (col-1) ];
        input1[ row*dstWidth + col ] = value;
    }
}

int main(int argc, char* argv[])
{
    int i;

    // Buffer sizes written out by shape so they stay correct if NNx != NNy.
    const size_t inputBytes = sizeof(float)*NNx*NNy;       // NNy x NNx
    const size_t im5Bytes   = sizeof(float)*(NNx-1)*NNy;   // NNy x (NNx-1)
    const size_t im6Bytes   = sizeof(float)*NNx*(NNy-1);   // (NNy-1) x NNx
    const size_t delxBytes  = sizeof(float)*(NNx+1)*NNy;   // NNy x (NNx+1)

    float *input = hostAlloc(inputBytes);
    float *d_input;

    printf("input\n");
    for(i = 0; i < NNx*NNy; i++)
    {
        input[i] = (float)(i+1);
    }
    printf("\n");

    CUDA_CHECK(cudaMalloc((void**)&d_input, inputBytes));
    CUDA_CHECK(cudaMemcpy( d_input, input, inputBytes, cudaMemcpyHostToDevice ));

    float *d_IM5, *d_IM6;
    CUDA_CHECK(cudaMalloc((void**)&d_IM5, im5Bytes));
    CUDA_CHECK(cudaMalloc((void**)&d_IM6, im6Bytes));

    int bx = (NNx + BLOCK_SIZE - 1) / BLOCK_SIZE;
    int by = (NNy + BLOCK_SIZE - 1) / BLOCK_SIZE;
    dim3 blocks(bx,by);
    dim3 threads(BLOCK_SIZE,BLOCK_SIZE);

    movaIM5<<<blocks, threads>>>( d_input, d_IM5 );
    CUDA_CHECK(cudaGetLastError());
    movaIM6<<<blocks, threads>>>( d_input, d_IM6 );
    CUDA_CHECK(cudaGetLastError());

    float *IM5 = hostAlloc(im5Bytes);
    float *IM6 = hostAlloc(im6Bytes);
    CUDA_CHECK(cudaMemcpy( IM5, d_IM5, im5Bytes, cudaMemcpyDeviceToHost ));
    CUDA_CHECK(cudaMemcpy( IM6, d_IM6, im6Bytes, cudaMemcpyDeviceToHost ));

    // The destination is (NNx+1) columns wide and NNy rows tall, so the
    // extra threads are needed along x. The original post enlarged the
    // block along y instead ("I set a few more threads here"), which left
    // the last column without a thread.
    int bx_del = ((NNx+1) + BLOCK_SIZE - 1) / BLOCK_SIZE;
    int by_del = (NNy + BLOCK_SIZE - 1) / BLOCK_SIZE;
    dim3 blocks_del(bx_del,by_del);
    dim3 threads_del(BLOCK_SIZE, BLOCK_SIZE);

    float *d_delx;
    CUDA_CHECK(cudaMalloc((void**)&d_delx, delxBytes));

    cu_delx<<<blocks_del, threads_del>>>( d_delx, d_IM5 );
    CUDA_CHECK(cudaGetLastError());

    float *delx = hostAlloc(delxBytes);
    CUDA_CHECK(cudaMemcpy( delx, d_delx, delxBytes, cudaMemcpyDeviceToHost ));

    printf("delx\n");
    for(i = 0; i < (NNx+1)*NNy; i++)
    {
        // Start a new output line at the beginning of each (NNx+1)-wide row.
        if(i % (NNx+1) == 0)
            printf("\n");
        else
            printf(" ");
        printf("%*.0f", ALIGN, delx[i]);
    }
    printf("\n");

    CUDA_CHECK(cudaFree(d_input));
    CUDA_CHECK(cudaFree(d_IM5));
    CUDA_CHECK(cudaFree(d_IM6));
    CUDA_CHECK(cudaFree(d_delx));

    free(input);
    free(IM5);
    free(IM6);
    free(delx);

    system("pause");
    return 0;
}

/* Post footer (translated): edited by aada, from 140.122.192.147 (02/26 01:26) */