图像模糊

GPU处理版本

version1 对每个通道分别作平均

__global__ void blur_gpu(unsigned char *g_input_image,unsigned char *g_output_image, int width, int height,int channels){
     int col = threadIdx.x+blockIdx.x*blockDim.x;
     int row =threadIdx.y+blockIdx.y*blockDim.y;
     if (col<width && row < height){
         int pixVal =0;     
         int pixels =0;
         for(int i=0;i<channels;i++){   // 
             int rgboffset = (col+row*width)*channels+i;// dim of (r,g,b)
             for(int blur_row=-BLUR_SIZE;blur_row<=BLUR_SIZE;blur_row++){
                 for(int blur_col =-BLUR_SIZE;blur_col<=BLUR_SIZE;blur_col++){
                     int cur_col =col+blur_col;
                     int cur_row = row + blur_row;
                     int offset= (cur_row*width+cur_col)*channels+i;
                     if(cur_row>-1&&cur_row<height&&cur_col>-1&&cur_col<width){
                         pixVal+=g_input_image[offset];
                         pixels++;
                     } 
                 }
             }
             g_output_image[rgboffset]=(unsigned char)(pixVal/pixels);
         }
     }
}

这里的问题是pixVal 和 pixels的重置应该放在遍历的通道里面，也就是 line 7里面

version 2 按照通道遍历

__global__ void blur_gpu(unsigned char *g_input_image,unsigned char *g_output_image, int width, int height,int channels){
     int col = threadIdx.x+blockIdx.x*blockDim.x;
     int row =threadIdx.y+blockIdx.y*blockDim.y;
     if (col<width && row < height){
         for(int i=0;i<channels;i++){   // 
             int pixVal =0;     
             int pixels =0;
             int rgboffset = (col+row*width)*channels+i;// dim of (r,g,b)
             for(int blur_row=-BLUR_SIZE;blur_row<=BLUR_SIZE;blur_row++){
                 for(int blur_col =-BLUR_SIZE;blur_col<=BLUR_SIZE;blur_col++){
                     int cur_col =col+blur_col;
                     int cur_row = row + blur_row;
                     int offset= (cur_row*width+cur_col)*channels+i;
                     if(cur_row>-1&&cur_row<height&&cur_col>-1&&cur_col<width){
                         pixVal+=g_input_image[offset];
                         pixels++;
                     } 
                 }
             }
             g_output_image[rgboffset]=(unsigned char)(pixVal/pixels);
         }
     }
}

更新后的多通道代码

__global__ void blur_gpu(unsigned char *g_input_image,unsigned char *g_output_image, int width, int height,int channels){
     int col = threadIdx.x+blockIdx.x*blockDim.x;
     int row =threadIdx.y+blockIdx.y*blockDim.y;
     int offset = (col+row*width)*channels;
     if (col<width && row < height){
        int pixVal [10]={0};
        int pixels =0;
        for(int blur_row=-BLUR_SIZE;blur_row<=BLUR_SIZE;blur_row++){
            for(int blur_col =-BLUR_SIZE;blur_col<=BLUR_SIZE;blur_col++){
                    int cur_col =col+blur_col;
                    int cur_row = row + blur_row;
                    int bluroffset= (cur_row*width+cur_col)*channels;
                    if(cur_row>-1&&cur_row<height&&cur_col>-1&&cur_col<width){
                        for(int c=0;c<channels;c++){
                            pixVal[c]+=g_input_image[bluroffset+c];
                        }
                        pixels++;
                    } 
                }
            }
        for(int c =0;c<channels;c++){
             g_output_image[offset+c]=(unsigned char)(pixVal[c]/pixels);
        }
     }
}

单通道代码（灰度图）

__global__ void blur_gpu(unsigned char *g_input_image,unsigned char *g_output_image, int width, int height,int channels){
     int col = threadIdx.x+blockIdx.x*blockDim.x;
     int row =threadIdx.y+blockIdx.y*blockDim.y;
     int offset = (col+row*width);// 
     if (col<width && row < height){
        int pixVal =0;
        int pixels =0;
        for(int blur_row=-BLUR_SIZE;blur_row<=BLUR_SIZE;blur_row++){
            for(int blur_col =-BLUR_SIZE;blur_col<=BLUR_SIZE;blur_col++){
                    int cur_col =col+blur_col;
                    int cur_row = row + blur_row;
                    int bluroffset= (cur_row*width+cur_col);
                    if(cur_row>-1&&cur_row<height&&cur_col>-1&&cur_col<width){
                        pixVal+=g_input_image[bluroffset];
                        pixels++;
                    } 
                }
            }
        g_output_image[offset]=(unsigned char)(pixVal/pixels);
     }
}

CPU处理版本

遍历所有的行和列

void blur_cpu(unsigned char *input_image, unsigned char *output_image, int width, int height,int channels)
{
    for(int row=0; row<height; row++)
    {
        for(int col=0; col<width; col++)
        {
            int pixVal =0;
            int pixels =0;
            int blur_row=-BLUR_SIZE;
            int blur_col =-BLUR_SIZE;
             for(int i=0;i<channels;i++){
                         int rgboffset = (col+row*width)*channels+i;
                         for(blur_row=-BLUR_SIZE;blur_row<=BLUR_SIZE;blur_row++){
                             for(blur_col =-BLUR_SIZE;blur_col<=BLUR_SIZE;blur_col++){
                                 int cur_col =col+blur_col;
                                 int cur_row = row + blur_row;
                                 int offset= (cur_row*width+cur_col)*channels+i;
                                 if(cur_row>-1&&cur_row<height&&cur_col>-1&&cur_col<width){
                                     pixVal+=input_image[offset];
                                     pixels++;
                                 } 
                             }
                         }
                         output_image[rgboffset]=(unsigned char)(pixVal/pixels);
                     }
        }
    }
}

主函数调用

传入输入图片，输出图片和运行参数

 if(argc<4)
    {
        printf("Usage: command    input-image-name    output-image-name option   option(cpu/gpu)");
        return -1;
    }
    char *input_image_name = argv[1];
    char *output_image_name = argv[2];
    char *option = argv[3];

初始化参数

加载输入图片，得到图片大小（height,width,channels）

    int width, height, original_no_channels;
    int desired_no_channels = 0; // Pass 0 to load the image as is
    unsigned char *stbi_img = stbi_load(input_image_name, &width, &height, &original_no_channels, desired_no_channels);
    if(stbi_img==NULL){ printf("Error in loading the image.\n"); exit(1);}
    printf("Loaded image with a width of %dpx, a height of %dpx. The original image had %d channels, the loaded image has %d channels.\n", width, height, original_no_channels, desired_no_channels);
    int channels = original_no_channels;
    int img_mem_size = width * height * channels * sizeof(char);
    double begin;

GPU调用

if(strcmp(option, "gpu")==0){
        printf("Processing with GPU!\n");
 //  Todo: 1. Allocate memory on GPU
        unsigned char * g_input_img = NULL;
        unsigned char * g_output_img = NULL;
        cudaMalloc((void**)&g_input_img,img_mem_size);
        CHECK(cudaGetLastError());
        cudaMalloc((void**)&g_output_img,img_mem_size);
  //  Todo: 2. Copy data from host memory to device memory
        cudaMemcpy(g_input_img,stbi_img,img_mem_size,cudaMemcpyHostToDevice);
  //  Todo: 3. Call kernel function
    //        3.1 Declare block and grid sizes
        /*  dim3 block(..., ...);  
            dim3 grid(..., ...);  */
        dim3 block_size(16,16,1);
        dim3 grid_size((width-1)/16+1,(height-1)/16+1,1);
        //        3.2 Record the time cost of GPU computation
        begin = cpuSecond();
        //  Todo: 3.3 Call the kernel function (Don't forget to call cudaDeviceSynchronize() before time recording)
        blur_gpu<<<grid_size,block_size>>>(g_input_img,g_output_img,width,height,channels);
        CHECK(cudaGetLastError());
        cudaDeviceSynchronize();
        printf("Time cost [GPU]:%f s\n", cpuSecond()-begin);
        //  Todo:  4. Copy data from device to host
        unsigned char *sepia_img_from_gpu = (unsigned char *)malloc(img_mem_size);
        cudaMemcpy(sepia_img_from_gpu,g_output_img,img_mem_size,cudaMemcpyDeviceToHost);
        //  Todo:  5. Save results as an image
            stbi_write_jpg(output_image_name, width, height, channels, sepia_img_from_gpu, 100);
        //  Todo:  6. Release host memory and device memory
         cudaFree(g_input_img);
         cudaFree(g_output_img);
         free(sepia_img_from_gpu);
}

Block的维度设置

一个SM 最多只能跑8个blocks，最多48个wraps就是4832=1536个线程
一个处理模块只能调用32个wraps,最大1536个线程，使用1616的block，刚好可以让SM完全利用

CPU调用

    if(strcmp(option, "cpu")==0)
    {
        printf("Processing with CPU!\n");
        unsigned char *sepia_img = (unsigned char *)malloc(img_mem_size);
        if(sepia_img==NULL){  printf("Unable to allocate memory for the sepia image. \n");  exit(1);  }

        // Time stamp
        begin = cpuSecond();

        // C PU computation (for reference)
        rgb_to_sepia_cpu(stbi_img, sepia_img, width, height, channels);

        // Time stamp
        printf("Time cost [CPU]:%f s\n", cpuSecond()-begin);

        // Save to an image file
        stbi_write_jpg(output_image_name, width, height, channels, sepia_img, 100);

        free(sepia_img);
    }

    else
    {
        printf("Unexpected option (please use cpu/gpu) !\n");
    }   

    stbi_image_free(stbi_img);

    return 0;
}

GPU编程

Image Blur