图像模糊
GPU处理版本
version1 对每个通道分别作平均
__global__ void blur_gpu(unsigned char *g_input_image,unsigned char *g_output_image, int width, int height,int channels){int col = threadIdx.x+blockIdx.x*blockDim.x;int row =threadIdx.y+blockIdx.y*blockDim.y;if (col<width && row < height){int pixVal =0;int pixels =0;for(int i=0;i<channels;i++){ //int rgboffset = (col+row*width)*channels+i;// dim of (r,g,b)for(int blur_row=-BLUR_SIZE;blur_row<=BLUR_SIZE;blur_row++){for(int blur_col =-BLUR_SIZE;blur_col<=BLUR_SIZE;blur_col++){int cur_col =col+blur_col;int cur_row = row + blur_row;int offset= (cur_row*width+cur_col)*channels+i;if(cur_row>-1&&cur_row<height&&cur_col>-1&&cur_col<width){pixVal+=g_input_image[offset];pixels++;}}}g_output_image[rgboffset]=(unsigned char)(pixVal/pixels);}}}
这里的问题是pixVal 和 pixels的 重置应该放在遍历的通道里面,也就是 line 7里面
version 2 按照通道遍历
__global__ void blur_gpu(unsigned char *g_input_image,unsigned char *g_output_image, int width, int height,int channels){int col = threadIdx.x+blockIdx.x*blockDim.x;int row =threadIdx.y+blockIdx.y*blockDim.y;if (col<width && row < height){for(int i=0;i<channels;i++){ //int pixVal =0;int pixels =0;int rgboffset = (col+row*width)*channels+i;// dim of (r,g,b)for(int blur_row=-BLUR_SIZE;blur_row<=BLUR_SIZE;blur_row++){for(int blur_col =-BLUR_SIZE;blur_col<=BLUR_SIZE;blur_col++){int cur_col =col+blur_col;int cur_row = row + blur_row;int offset= (cur_row*width+cur_col)*channels+i;if(cur_row>-1&&cur_row<height&&cur_col>-1&&cur_col<width){pixVal+=g_input_image[offset];pixels++;}}}g_output_image[rgboffset]=(unsigned char)(pixVal/pixels);}}}
更新后的多通道代码
__global__ void blur_gpu(unsigned char *g_input_image,unsigned char *g_output_image, int width, int height,int channels){int col = threadIdx.x+blockIdx.x*blockDim.x;int row =threadIdx.y+blockIdx.y*blockDim.y;int offset = (col+row*width)*channels;if (col<width && row < height){int pixVal [10]={0};int pixels =0;for(int blur_row=-BLUR_SIZE;blur_row<=BLUR_SIZE;blur_row++){for(int blur_col =-BLUR_SIZE;blur_col<=BLUR_SIZE;blur_col++){int cur_col =col+blur_col;int cur_row = row + blur_row;int bluroffset= (cur_row*width+cur_col)*channels;if(cur_row>-1&&cur_row<height&&cur_col>-1&&cur_col<width){for(int c=0;c<channels;c++){pixVal[c]+=g_input_image[bluroffset+c];}pixels++;}}}for(int c =0;c<channels;c++){g_output_image[offset+c]=(unsigned char)(pixVal[c]/pixels);}}}
单通道代码(灰度图)
__global__ void blur_gpu(unsigned char *g_input_image,unsigned char *g_output_image, int width, int height,int channels){int col = threadIdx.x+blockIdx.x*blockDim.x;int row =threadIdx.y+blockIdx.y*blockDim.y;int offset = (col+row*width);//if (col<width && row < height){int pixVal =0;int pixels =0;for(int blur_row=-BLUR_SIZE;blur_row<=BLUR_SIZE;blur_row++){for(int blur_col =-BLUR_SIZE;blur_col<=BLUR_SIZE;blur_col++){int cur_col =col+blur_col;int cur_row = row + blur_row;int bluroffset= (cur_row*width+cur_col);if(cur_row>-1&&cur_row<height&&cur_col>-1&&cur_col<width){pixVal+=g_input_image[bluroffset];pixels++;}}}g_output_image[offset]=(unsigned char)(pixVal/pixels);}}
CPU处理版本
遍历所有的行和列
void blur_cpu(unsigned char *input_image, unsigned char *output_image, int width, int height,int channels){for(int row=0; row<height; row++){for(int col=0; col<width; col++){int pixVal =0;int pixels =0;int blur_row=-BLUR_SIZE;int blur_col =-BLUR_SIZE;for(int i=0;i<channels;i++){int rgboffset = (col+row*width)*channels+i;for(blur_row=-BLUR_SIZE;blur_row<=BLUR_SIZE;blur_row++){for(blur_col =-BLUR_SIZE;blur_col<=BLUR_SIZE;blur_col++){int cur_col =col+blur_col;int cur_row = row + blur_row;int offset= (cur_row*width+cur_col)*channels+i;if(cur_row>-1&&cur_row<height&&cur_col>-1&&cur_col<width){pixVal+=input_image[offset];pixels++;}}}output_image[rgboffset]=(unsigned char)(pixVal/pixels);}}}}
主函数调用
传入输入图片,输出图片 和 运行参数
if(argc<4){printf("Usage: command input-image-name output-image-name option option(cpu/gpu)");return -1;}char *input_image_name = argv[1];char *output_image_name = argv[2];char *option = argv[3];
初始化参数
加载输入图片,得到图片大小(height,width,channels)
int width, height, original_no_channels;int desired_no_channels = 0; // Pass 0 to load the image as isunsigned char *stbi_img = stbi_load(input_image_name, &width, &height, &original_no_channels, desired_no_channels);if(stbi_img==NULL){ printf("Error in loading the image.\n"); exit(1);}printf("Loaded image with a width of %dpx, a height of %dpx. The original image had %d channels, the loaded image has %d channels.\n", width, height, original_no_channels, desired_no_channels);int channels = original_no_channels;int img_mem_size = width * height * channels * sizeof(char);double begin;
GPU调用
if(strcmp(option, "gpu")==0){printf("Processing with GPU!\n");// Todo: 1. Allocate memory on GPUunsigned char * g_input_img = NULL;unsigned char * g_output_img = NULL;cudaMalloc((void**)&g_input_img,img_mem_size);CHECK(cudaGetLastError());cudaMalloc((void**)&g_output_img,img_mem_size);// Todo: 2. Copy data from host memory to device memorycudaMemcpy(g_input_img,stbi_img,img_mem_size,cudaMemcpyHostToDevice);// Todo: 3. Call kernel function// 3.1 Declare block and grid sizes/* dim3 block(..., ...);dim3 grid(..., ...); */dim3 block_size(16,16,1);dim3 grid_size((width-1)/16+1,(height-1)/16+1,1);// 3.2 Record the time cost of GPU computationbegin = cpuSecond();// Todo: 3.3 Call the kernel function (Don't forget to call cudaDeviceSynchronize() before time recording)blur_gpu<<<grid_size,block_size>>>(g_input_img,g_output_img,width,height,channels);CHECK(cudaGetLastError());cudaDeviceSynchronize();printf("Time cost [GPU]:%f s\n", cpuSecond()-begin);// Todo: 4. Copy data from device to hostunsigned char *sepia_img_from_gpu = (unsigned char *)malloc(img_mem_size);cudaMemcpy(sepia_img_from_gpu,g_output_img,img_mem_size,cudaMemcpyDeviceToHost);// Todo: 5. Save results as an imagestbi_write_jpg(output_image_name, width, height, channels, sepia_img_from_gpu, 100);// Todo: 6. Release host memory and device memorycudaFree(g_input_img);cudaFree(g_output_img);free(sepia_img_from_gpu);}
Block的维度设置

一个SM 最多只能跑8个blocks,最多48个wraps就是4832=1536个线程
一个处理模块只能调用32个wraps,最大1536个线程,使用1616的block,刚好可以让SM完全利用
CPU调用
if(strcmp(option, "cpu")==0)
{
printf("Processing with CPU!\n");
unsigned char *sepia_img = (unsigned char *)malloc(img_mem_size);
if(sepia_img==NULL){ printf("Unable to allocate memory for the sepia image. \n"); exit(1); }
// Time stamp
begin = cpuSecond();
// C PU computation (for reference)
rgb_to_sepia_cpu(stbi_img, sepia_img, width, height, channels);
// Time stamp
printf("Time cost [CPU]:%f s\n", cpuSecond()-begin);
// Save to an image file
stbi_write_jpg(output_image_name, width, height, channels, sepia_img, 100);
free(sepia_img);
}
else
{
printf("Unexpected option (please use cpu/gpu) !\n");
}
stbi_image_free(stbi_img);
return 0;
}
