图像模糊

GPU处理版本

version1 对每个通道分别作平均

  // Box-blur kernel, first attempt. Each thread handles one pixel (col,row) and, for every
  // channel, averages the in-bounds neighbours that lie within BLUR_SIZE of it in both directions.
  // NOTE: pixVal and pixels are initialised once, before the channel loop, so the sum and the
  // count carry over from one channel to the next instead of starting from zero.
  1. __global__ void blur_gpu(unsigned char *g_input_image,unsigned char *g_output_image, int width, int height,int channels){
  2. int col = threadIdx.x+blockIdx.x*blockDim.x; // pixel column taken from the x dimension of the launch
  3. int row =threadIdx.y+blockIdx.y*blockDim.y; // pixel row taken from the y dimension of the launch
  4. if (col<width && row < height){ // threads that fall outside the image do nothing
  5. int pixVal =0; // running sum of neighbour values (not reset between channels)
  6. int pixels =0; // number of neighbours added to pixVal (not reset between channels)
  7. for(int i=0;i<channels;i++){ // one pass per channel
  8. int rgboffset = (col+row*width)*channels+i;// index of channel i of this pixel in the interleaved buffer
  9. for(int blur_row=-BLUR_SIZE;blur_row<=BLUR_SIZE;blur_row++){
  10. for(int blur_col =-BLUR_SIZE;blur_col<=BLUR_SIZE;blur_col++){
  11. int cur_col =col+blur_col;
  12. int cur_row = row + blur_row;
  13. int offset= (cur_row*width+cur_col)*channels+i; // index of channel i of the neighbour; only dereferenced after the bounds test below
  14. if(cur_row>-1&&cur_row<height&&cur_col>-1&&cur_col<width){ // only neighbours inside the image contribute
  15. pixVal+=g_input_image[offset];
  16. pixels++;
  17. }
  18. }
  19. }
  20. g_output_image[rgboffset]=(unsigned char)(pixVal/pixels); // integer (truncating) average
  21. }
  22. }
  23. }

这里的问题是:pixVal 和 pixels 只在通道循环之外初始化了一次,后面的通道会在前一个通道的累加结果上继续累加,得到错误的平均值。二者的重置应该放在遍历通道的循环内部,也就是 line 7 的 for 循环体里面。

version 2 按照通道遍历

  // Box blur, one thread per pixel. Each channel of the pixel is averaged on its own:
  // the sum and the neighbour count start from zero for every channel, and only
  // neighbours that lie inside the image are taken into account.
  __global__ void blur_gpu(unsigned char *g_input_image,unsigned char *g_output_image, int width, int height,int channels){
      const int x = blockIdx.x * blockDim.x + threadIdx.x;
      const int y = blockIdx.y * blockDim.y + threadIdx.y;

      // Threads outside the image have nothing to do.
      if (x >= width || y >= height) {
          return;
      }

      const int pixelBase = (y * width + x) * channels;

      for (int ch = 0; ch < channels; ++ch) {
          int sum = 0;
          int count = 0;

          for (int dy = -BLUR_SIZE; dy <= BLUR_SIZE; ++dy) {
              const int ny = y + dy;
              if (ny < 0 || ny >= height) {
                  continue;  // whole neighbour row is outside the image
              }
              for (int dx = -BLUR_SIZE; dx <= BLUR_SIZE; ++dx) {
                  const int nx = x + dx;
                  if (nx < 0 || nx >= width) {
                      continue;  // neighbour column is outside the image
                  }
                  sum += g_input_image[(ny * width + nx) * channels + ch];
                  ++count;
              }
          }

          // Integer (truncating) average of the neighbours that were inside the image.
          g_output_image[pixelBase + ch] = (unsigned char)(sum / count);
      }
  }

更新后的多通道代码

  // Box blur, one thread per pixel, visiting each neighbour once for all channels.
  //
  // Parameters:
  //   g_input_image  - source image, channels interleaved per pixel
  //   g_output_image - destination image, same layout and size as the source
  //   width, height  - image size in pixels
  //   channels       - number of interleaved channels per pixel
  //
  // The per-thread accumulator has a fixed number of slots. Channels are therefore
  // processed in batches of at most that many, so a channel count larger than the
  // accumulator no longer writes past the end of the array. For channel counts that
  // fit in one batch the result is the same as accumulating all channels together.
  __global__ void blur_gpu(unsigned char *g_input_image,unsigned char *g_output_image, int width, int height,int channels){
      const int kAccumSlots = 10;  // capacity of the per-thread accumulator

      const int col = threadIdx.x + blockIdx.x * blockDim.x;
      const int row = threadIdx.y + blockIdx.y * blockDim.y;

      // Threads outside the image have nothing to do.
      if (col >= width || row >= height) {
          return;
      }

      const int offset = (col + row * width) * channels;

      for (int first = 0; first < channels; first += kAccumSlots) {
          const int remaining = channels - first;
          const int batch = remaining < kAccumSlots ? remaining : kAccumSlots;

          int pixVal[kAccumSlots] = {0};
          int pixels = 0;

          for (int blur_row = -BLUR_SIZE; blur_row <= BLUR_SIZE; blur_row++) {
              const int cur_row = row + blur_row;
              if (cur_row < 0 || cur_row >= height) {
                  continue;
              }
              for (int blur_col = -BLUR_SIZE; blur_col <= BLUR_SIZE; blur_col++) {
                  const int cur_col = col + blur_col;
                  if (cur_col < 0 || cur_col >= width) {
                      continue;
                  }
                  const int bluroffset = (cur_row * width + cur_col) * channels + first;
                  for (int c = 0; c < batch; c++) {
                      pixVal[c] += g_input_image[bluroffset + c];
                  }
                  pixels++;
              }
          }

          // The pixel itself is always in bounds, so pixels >= 1 here.
          for (int c = 0; c < batch; c++) {
              g_output_image[offset + first + c] = (unsigned char)(pixVal[c] / pixels);
          }
      }
  }

单通道代码(灰度图)

  // Box blur for a single-channel (grayscale) image, one thread per pixel.
  // The buffers are indexed as row * width + col; the channels argument is not used.
  __global__ void blur_gpu(unsigned char *g_input_image,unsigned char *g_output_image, int width, int height,int channels){
      const int x = blockIdx.x * blockDim.x + threadIdx.x;
      const int y = blockIdx.y * blockDim.y + threadIdx.y;

      // Threads outside the image have nothing to do.
      if (x >= width || y >= height) {
          return;
      }

      int sum = 0;
      int count = 0;

      for (int dy = -BLUR_SIZE; dy <= BLUR_SIZE; ++dy) {
          const int ny = y + dy;
          if (ny < 0 || ny >= height) {
              continue;  // neighbour row is outside the image
          }
          for (int dx = -BLUR_SIZE; dx <= BLUR_SIZE; ++dx) {
              const int nx = x + dx;
              if (nx < 0 || nx >= width) {
                  continue;  // neighbour column is outside the image
              }
              sum += g_input_image[ny * width + nx];
              ++count;
          }
      }

      // Integer (truncating) average of the neighbours that were inside the image.
      g_output_image[y * width + x] = (unsigned char)(sum / count);
  }

CPU处理版本

遍历所有的行和列

  // CPU reference implementation of the box blur.
  //
  // Parameters:
  //   input_image  - source image, channels interleaved per pixel
  //   output_image - destination image, same layout and size as the source
  //   width, height - image size in pixels
  //   channels      - number of interleaved channels per pixel
  //
  // For every pixel and every channel, the output is the integer (truncating) average
  // of the neighbours within BLUR_SIZE in both directions that lie inside the image.
  //
  // Fix: the sum and the neighbour count are now reset for each channel. Previously
  // they were initialised once per pixel, so every channel after the first was
  // averaged over values accumulated from the preceding channels.
  void blur_cpu(unsigned char *input_image, unsigned char *output_image, int width, int height,int channels)
  {
      for (int row = 0; row < height; row++)
      {
          for (int col = 0; col < width; col++)
          {
              for (int i = 0; i < channels; i++)
              {
                  int pixVal = 0;  // sum of in-bounds neighbour values for channel i
                  int pixels = 0;  // number of in-bounds neighbours for channel i
                  const int rgboffset = (col + row * width) * channels + i;

                  for (int blur_row = -BLUR_SIZE; blur_row <= BLUR_SIZE; blur_row++)
                  {
                      const int cur_row = row + blur_row;
                      if (cur_row < 0 || cur_row >= height)
                      {
                          continue;
                      }
                      for (int blur_col = -BLUR_SIZE; blur_col <= BLUR_SIZE; blur_col++)
                      {
                          const int cur_col = col + blur_col;
                          if (cur_col < 0 || cur_col >= width)
                          {
                              continue;
                          }
                          pixVal += input_image[(cur_row * width + cur_col) * channels + i];
                          pixels++;
                      }
                  }

                  // The pixel itself is always in bounds, so pixels >= 1 here.
                  output_image[rgboffset] = (unsigned char)(pixVal / pixels);
              }
          }
      }
  }

主函数调用

传入输入图片,输出图片 和 运行参数

  // Three arguments are required after the program name:
  //   argv[1] input image path, argv[2] output image path, argv[3] "cpu" or "gpu".
  if (argc < 4)
  {
      // The usage text previously listed "option" twice, suggesting one argument more
      // than is actually read, and lacked a terminating newline.
      printf("Usage: command input-image-name output-image-name option(cpu/gpu)\n");
      return -1;
  }
  char *input_image_name = argv[1];
  char *output_image_name = argv[2];
  char *option = argv[3];

初始化参数

加载输入图片,得到图片大小(height,width,channels)

  // Load the input image and derive the buffer size used by both the CPU and GPU paths.
  int width, height, original_no_channels;
  int desired_no_channels = 0; // Pass 0 to load the image as is
  unsigned char *stbi_img = stbi_load(input_image_name, &width, &height, &original_no_channels, desired_no_channels);
  if(stbi_img==NULL){ printf("Error in loading the image.\n"); exit(1);}

  // With desired_no_channels == 0 the loaded buffer keeps the file's own channel count;
  // a non-zero request would make the buffer hold that many channels instead.
  int channels = desired_no_channels != 0 ? desired_no_channels : original_no_channels;

  // Report the channel count of the loaded buffer (previously the request value, 0, was printed).
  printf("Loaded image with a width of %dpx, a height of %dpx. The original image had %d channels, the loaded image has %d channels.\n", width, height, original_no_channels, channels);

  // Computed in size_t so that large images do not overflow int arithmetic.
  size_t img_mem_size = (size_t)width * (size_t)height * (size_t)channels * sizeof(unsigned char);
  double begin;

GPU调用

  // GPU path: upload the image, run blur_gpu, download the result and save it.
  // Every CUDA runtime call is wrapped in CHECK so that a failed allocation, copy or
  // kernel execution is reported where it happens instead of surfacing later.
  if(strcmp(option, "gpu")==0){
      printf("Processing with GPU!\n");

      // 1. Allocate the input and output buffers on the device.
      unsigned char *g_input_img = NULL;
      unsigned char *g_output_img = NULL;
      CHECK(cudaMalloc((void**)&g_input_img, img_mem_size));
      CHECK(cudaMalloc((void**)&g_output_img, img_mem_size));

      // 2. Copy the loaded image from host memory to device memory.
      CHECK(cudaMemcpy(g_input_img, stbi_img, img_mem_size, cudaMemcpyHostToDevice));

      // 3. Launch configuration: 16x16 threads per block, and enough blocks in each
      //    dimension to cover the whole image (the kernel bounds-checks the remainder).
      const int block_dim = 16;
      dim3 block_size(block_dim, block_dim, 1);
      dim3 grid_size((width - 1) / block_dim + 1, (height - 1) / block_dim + 1, 1);

      // Time the kernel only; the synchronize makes sure it has finished before the
      // end timestamp is taken and reports any error raised while it was running.
      begin = cpuSecond();
      blur_gpu<<<grid_size,block_size>>>(g_input_img, g_output_img, width, height, channels);
      CHECK(cudaGetLastError());
      CHECK(cudaDeviceSynchronize());
      printf("Time cost [GPU]:%f s\n", cpuSecond()-begin);

      // 4. Copy the blurred image back to the host.
      unsigned char *blurred_img_from_gpu = (unsigned char *)malloc(img_mem_size);
      if(blurred_img_from_gpu==NULL){ printf("Unable to allocate host memory for the GPU result.\n"); exit(1); }
      CHECK(cudaMemcpy(blurred_img_from_gpu, g_output_img, img_mem_size, cudaMemcpyDeviceToHost));

      // 5. Save the result as an image.
      stbi_write_jpg(output_image_name, width, height, channels, blurred_img_from_gpu, 100);

      // 6. Release device and host memory.
      CHECK(cudaFree(g_input_img));
      CHECK(cudaFree(g_output_img));
      free(blurred_img_from_gpu);
  }

Block的维度设置

image.png
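一个 SM 最多只能同时跑 8 个 block,最多 48 个 warp,也就是 48×32=1536 个线程。
每个 warp 有 32 个线程,一个 SM 最大 1536 个线程;使用 16×16 的 block 时,每个 block 有 256 个线程(8 个 warp),
一个 SM 可以同时容纳 1536/256=6 个 block(没有超过 8 个 block 的上限),刚好可以让 SM 完全利用。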

CPU调用

    // CPU path: blur the image with the reference implementation and save the result.
    if(strcmp(option, "cpu")==0)
    {
        printf("Processing with CPU!\n");
        unsigned char *blurred_img = (unsigned char *)malloc(img_mem_size);
        if(blurred_img==NULL){  printf("Unable to allocate memory for the sepia image. \n");  exit(1);  }

        // Time stamp
        begin = cpuSecond();

        // CPU computation (for reference). This now calls blur_cpu, the CPU counterpart
        // of blur_gpu, instead of the sepia routine that was called here before.
        blur_cpu(stbi_img, blurred_img, width, height, channels);

        // Time stamp
        printf("Time cost [CPU]:%f s\n", cpuSecond()-begin);

        // Save to an image file
        stbi_write_jpg(output_image_name, width, height, channels, blurred_img, 100);

        free(blurred_img);
    }

    // "gpu" is a valid option as well, so it must not be reported as unexpected
    // when this branch is reached after the GPU path has run.
    else if(strcmp(option, "gpu")!=0)
    {
        printf("Unexpected option (please use cpu/gpu) !\n");
    }   

    stbi_image_free(stbi_img);

    return 0;
}