作者lgen7604 ()
看板C_and_CPP
標題Re: [問題] 使用CUDA來擷取矩陣中部分值
時間Tue Feb 23 10:09:19 2010
我執行的結果正常啊
取上下左右的部份都沒問題
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cuda_runtime.h>
#include <cutil.h>
#define NNx 4
#define NNy 4
/*
 * movaIM5 - copy a vertically shifted sub-matrix.
 *
 * Treats input1 as a row-major matrix that is NNx elements wide and writes
 * an NNx-wide, (NNy-1)-row result into output1: output row r receives input
 * row r+1, i.e. the matrix without its first row ("lower" part).
 *
 * Launch layout: a 2D grid/block where x indexes columns and y indexes rows.
 * Threads that fall outside the NNx x (NNy-1) output do nothing, so the grid
 * may be rounded up (ceil-div) to any block size.
 *
 * input1  : device pointer, read at indices up to NNx*NNy - 1.
 * output1 : device pointer, written at indices up to NNx*(NNy-1) - 1.
 */
__global__ void movaIM5(float *input1, float *output1)
{
    int col = blockIdx.x*blockDim.x+threadIdx.x;
    int row = blockIdx.y*blockDim.y+threadIdx.y;

    // Guard the grid tail: without this, a launch that covers more than
    // NNx x (NNy-1) threads would read and write out of bounds.
    if (col >= NNx || row >= (NNy-1)) return;

    output1[row*NNx+col] = input1[(row+1)*NNx+col];   // take the lower sub-matrix
    //output1[row*NNx+col] = input1[row*NNx+col];     // take the upper sub-matrix
}
/*
 * movaIM6 - copy a horizontally shifted sub-matrix.
 *
 * Treats input1 as a row-major matrix that is NNx elements wide and writes
 * an (NNx-1)-wide, NNy-row result into output1: output column c receives
 * input column c+1, i.e. the matrix without its first column ("right" part).
 *
 * Launch layout: a 2D grid/block where x indexes columns and y indexes rows.
 * Threads that fall outside the (NNx-1) x NNy output do nothing, so the grid
 * may be rounded up (ceil-div) to any block size.
 *
 * input1  : device pointer, read at indices up to NNx*NNy - 1.
 * output1 : device pointer, written at indices up to (NNx-1)*NNy - 1.
 */
__global__ void movaIM6(float *input1, float *output1)
{
    int col = blockIdx.x*blockDim.x+threadIdx.x;
    int row = blockIdx.y*blockDim.y+threadIdx.y;

    // Guard the grid tail: without this, a launch that covers more than
    // (NNx-1) x NNy threads would read and write out of bounds.
    if (col >= (NNx-1) || row >= NNy) return;

    output1[row*(NNx-1)+col] = input1[row*NNx+col+1];   // take the right sub-matrix
    //output1[row*(NNx-1)+col] = input1[row*NNx+col];   // take the left sub-matrix
}
/* Abort with a readable message when a CUDA runtime call reports failure. */
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Demo driver: fills an NNx x NNy matrix with 1..NNx*NNy, prints it, runs
 * movaIM5 (drop first row) and movaIM6 (drop first column) on the GPU, and
 * prints both results.
 *
 * Every CUDA call is checked; kernel launch errors are picked up with
 * cudaGetLastError() and execution errors surface at the blocking
 * device-to-host cudaMemcpy that follows.
 */
int main(int argc, char* argv[])
{
    int i;
    float input[NNx*NNy];
    float *d_input;

    const size_t inputBytes = sizeof(float)*NNx*NNy;
    const size_t im5Bytes   = sizeof(float)*NNx*(NNy-1);
    const size_t im6Bytes   = sizeof(float)*(NNx-1)*NNy;

    // Build and print the source matrix, one row of NNx values per line.
    for(i = 0; i < NNx*NNy; i++){
        input[i] = i+1;
        if(i % NNx == 0) printf("\n");
        printf("%3.0f ", input[i]);
    }
    printf("\n");

    checkCuda(cudaMalloc((void**)&d_input, inputBytes), "cudaMalloc(d_input)");
    checkCuda(cudaMemcpy( d_input, input, inputBytes, cudaMemcpyHostToDevice ),
              "cudaMemcpy(input -> d_input)");

    float *d_IM5, *d_IM6;
    checkCuda(cudaMalloc((void**)&d_IM5, im5Bytes), "cudaMalloc(d_IM5)");
    checkCuda(cudaMalloc((void**)&d_IM6, im6Bytes), "cudaMalloc(d_IM6)");

    // One block whose thread shape equals each output matrix. This is only
    // valid while the output fits in a single block; larger matrices need a
    // multi-block grid.
    dim3 blocks(1,1);
    dim3 threadsIM5(NNx,NNy-1);
    dim3 threadsIM6(NNx-1,NNy);

    movaIM5<<<blocks, threadsIM5>>>( d_input, d_IM5 );
    checkCuda(cudaGetLastError(), "movaIM5 launch");
    movaIM6<<<blocks, threadsIM6>>>( d_input, d_IM6 );
    checkCuda(cudaGetLastError(), "movaIM6 launch");

    float *IM5, *IM6;
    IM5 = (float*) malloc( im5Bytes );
    IM6 = (float*) malloc( im6Bytes );
    if (IM5 == NULL || IM6 == NULL) {
        fprintf(stderr, "host allocation failed\n");
        exit(EXIT_FAILURE);
    }

    // Blocking copies: these also wait for the kernels to finish.
    checkCuda(cudaMemcpy( IM5, d_IM5, im5Bytes, cudaMemcpyDeviceToHost ),
              "cudaMemcpy(d_IM5 -> IM5)");
    checkCuda(cudaMemcpy( IM6, d_IM6, im6Bytes, cudaMemcpyDeviceToHost ),
              "cudaMemcpy(d_IM6 -> IM6)");

    // IM5 is NNx wide.
    for(i = 0; i < NNx*(NNy-1); i++){
        if(i % NNx == 0) printf("\n");
        printf("%3.0f ", IM5[i]);
    }
    printf("\n");

    // IM6 is NNx-1 wide.
    for(i = 0; i < (NNx-1)*NNy; i++){
        if(i % (NNx-1) == 0) printf("\n");
        printf("%3.0f ", IM6[i]);
    }
    printf("\n");

    checkCuda(cudaFree(d_input), "cudaFree(d_input)");
    checkCuda(cudaFree(d_IM5), "cudaFree(d_IM5)");
    checkCuda(cudaFree(d_IM6), "cudaFree(d_IM6)");

    // Release the host-side result buffers (previously leaked).
    free(IM5);
    free(IM6);

    system("pause");
    return 0;
}
--
※ 發信站: 批踢踢實業坊(ptt.cc)
◆ From: 122.120.40.234
→ aada:如果我今天矩陣是比較大的話,如512*512 02/23 12:08
→ aada:dim3 blocks, threadsIM5好像就不可以這樣寫了,是嗎 02/23 12:10
→ aada:int bx = (NNx + BLOCK_SIZE - 1) / BLOCK_SIZE; 02/23 12:20
→ aada:dim3 blocks(bx, bx); 02/23 12:20
→ aada:dim3 threads(BLOCK_SIZE, BLOCK_SIZE); 02/23 12:21
→ aada:BLOCK_SIZE 我設為16 02/23 12:21
推 aada:我IM5大致上試出來了 02/23 22:22
→ aada:dim3 blocksIM5( 16, 73); 02/23 22:23
→ aada:dim3 threadsIM5( 32, 7 ); 02/23 22:23
→ aada:movaIM5<<<blocksIM5, threadsIM5>>>( d_input, d_IM5 ); 02/23 22:23
→ aada:這樣剛好處理(16*32=512, 73*7=511) --> 512*511的矩陣 02/23 22:25
→ aada:我這樣寫的方式OK嗎,有什麼地方可以改進的 02/23 22:25