請問L大一下, 我現在想把我取出的矩陣A(16x15),
貼到一個新的矩陣,矩陣B中(16x17),
我目前遇到的問題是有的值會少貼, 導致後面的順序就錯了
矩陣A 矩陣B 矩陣C(正確)
1 2 3 0 0 0 0 0 1 2 3 0
4 5 6 0 0 0 0 0 4 5 6 0
7 8 9 0 0 0 0 0 7 8 9 0
謝謝
以下是程式碼
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cuda_runtime.h>
#include <cutil.h>
#define NNx 16
#define NNy 16
#define BLOCK_SIZE 16
#define ALIGN 3 // 對齊printf用
__global__ void movaIM5(float *input1, float *output1)
{
int col = blockIdx.x*blockDim.x+threadIdx.x;
int row = blockIdx.y*blockDim.y+threadIdx.y;
if(col+1 < NNx && row < NNy)
output1[row*(NNx-1)+col] = input1[row*NNx+col+1]; // 取右邊矩陣
//if(col < (NNx-1) && row < NNy)
// output1[row*(NNx-1)+col] = input1[row*NNx+col]; // 取左邊矩陣
}
__global__ void movaIM6(float *input1, float *output1)
{
int col = blockIdx.x*blockDim.x+threadIdx.x;
int row = blockIdx.y*blockDim.y+threadIdx.y;
if(col < NNx && row+1 < NNy)
output1[row*NNx+col] = input1[(row+1)*NNx+col]; // 取下面矩陣
//if(col < NNx && row < (NNy-1))
// output1[row*NNx+col] = input1[row*NNx+col]; // 取上面矩陣
}
__global__ void cu_delx( float *input1, float *input2 )
{
int col = blockIdx.x*blockDim.x+threadIdx.x;
int row = blockIdx.y*blockDim.y+threadIdx.y;
input1[row*NNx+col]=0;
if( col+1 < NNx && row < NNy )
{
input1[ row*NNx+col +1+threadIdx.y] =input2[ row*NNx+col ];
^^^^^^^^^^^^^^^^^^^^^
我覺得是這邊的問題,但我要怎麼設定col,row呢?
}
}
int main(int argc, char* argv[])
{
int i;
//float input[NNx*NNy];
float *input;
input = (float*) malloc( sizeof(float)*NNx*NNy );
float *d_input;
printf("input\n");
for(i = 0; i < NNx*NNy; i++)
{
input[i] = i+1;
}
printf("\n");
cudaMalloc((void**)&d_input, sizeof(float)*NNx*NNy);
cudaMemcpy( d_input, input, sizeof(float)*NNx*NNy, cudaMemcpyHostToDevice );
float *d_IM5, *d_IM6;
cudaMalloc((void**)&d_IM5, sizeof(float)*NNx*(NNy-1));
cudaMalloc((void**)&d_IM6, sizeof(float)*(NNx-1)*NNy);
int bx = (NNx + BLOCK_SIZE - 1) / BLOCK_SIZE;
int by = (NNy + BLOCK_SIZE - 1) / BLOCK_SIZE;
dim3 blocks(bx,by);
dim3 threads(BLOCK_SIZE,BLOCK_SIZE);
movaIM5<<<blocks, threads>>>( d_input, d_IM5 );
movaIM6<<<blocks, threads>>>( d_input, d_IM6 );
float *IM5, *IM6;
IM5 = (float*) malloc( sizeof(float)*NNx*(NNy-1) );
IM6 = (float*) malloc( sizeof(float)*(NNx-1)*NNy );
cudaMemcpy( IM5, d_IM5, sizeof(float)*NNx*(NNy-1), cudaMemcpyDeviceToHost );
cudaMemcpy( IM6, d_IM6, sizeof(float)*(NNx-1)*NNy, cudaMemcpyDeviceToHost );
int bx_del = (NNx + BLOCK_SIZE - 1) / BLOCK_SIZE;
int by_del = ((NNy+1) + BLOCK_SIZE - 1) / (BLOCK_SIZE+1);
dim3 blocks_del(bx_del,by_del);
dim3 threads_del(BLOCK_SIZE, BLOCK_SIZE+1);
//我在這邊將thread多設了一些^^^^^^^^^^^^^^^^
float *d_delx;
cudaMalloc((void**)&d_delx, sizeof(float)*NNx*(NNy+1));
cu_delx<<<blocks_del, threads_del>>>( d_delx, d_IM5 );
float *delx;
delx = (float*) malloc( sizeof(float)*NNx*(NNy+1) );
cudaMemcpy( delx, d_delx, sizeof(float)*NNx*(NNy+1), cudaMemcpyDeviceToHost
);
printf("delx\n");
for(i = 0; i < NNx*(NNy+1); i++)
{
if(i % (NNy+1) == 0) printf("\n");
else printf(" ");
printf("%*.0f", ALIGN, delx[i]);
}
printf("\n");
cudaFree(d_input);
cudaFree(d_IM5);
cudaFree(d_IM6);
cudaFree(d_delx);
system("pause");
return 0;
}
※ 編輯: aada 來自: 140.122.192.147 (02/26 01:26)