[Figures: in the tiled transpose, each row of a tile is loaded from global memory with coalesced accesses, and the block's location is flipped when the tile is written to the output.]

Conflicts in shared memory

Shared memory is set up as 32 banks:

  • If we view shared memory as an array of 4-byte words, word i lies in bank i % 32 (see the sketch below).
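
To make this mapping concrete, here is a tiny host-side sketch (illustrative only, not part of the transpose code) that prints which bank each 4-byte word falls into:

```cpp
#include <cstdio>

// The i-th 4-byte word of shared memory lives in bank i % 32,
// so the 32 banks repeat every 128 bytes.
static int bank_of_word(int i) { return i % 32; }

int main()
{
    for (int i = 0; i < 64; i++)
        printf("word %2d -> bank %2d\n", i, bank_of_word(i));  // words 32..63 reuse banks 0..31
    return 0;
}
```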


Mapping addresses in shared memory to banks

```cpp
__shared__ DTYPE tile[TILE_DIM][TILE_DIM];
```
Because shared memory has only 32 banks:

  • A bank conflict occurs when two or more threads in a warp access different 4-byte words that fall in the same bank.
  • Bank conflicts force those accesses to be serialized instead of serviced in parallel, as illustrated below.
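
This is exactly what happens on the read-out side of the tiled transpose: the 32 threads of a warp read one column of a 32 × 32 tile, and every element of that column maps to the same bank. A small host-side sketch (illustrative only) of the address arithmetic:

```cpp
#include <cstdio>

#define TILE_DIM 32

// Bank of tile[row][col] for an unpadded __shared__ float tile[TILE_DIM][TILE_DIM]:
// the element's linear 4-byte word index is row * TILE_DIM + col.
static int bank_of(int row, int col) { return (row * TILE_DIM + col) % 32; }

int main()
{
    // A warp reading column 0 (lane = threadIdx.x = row): every lane hits bank 0.
    for (int lane = 0; lane < TILE_DIM; lane++)
        printf("lane %2d reads tile[%2d][0] -> bank %2d\n", lane, lane, bank_of(lane, 0));
    return 0;
}
```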

Avoiding bank conflicts

Padding
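
Padding the tile with one extra column, as in `data[BLOCK_SIZE][BLOCK_SIZE+1]` in the kernel below, makes each row 33 words long. The word at (row, col) then has linear index row*33 + col and lands in bank (row*33 + col) % 32 = (row + col) % 32, so a column read (fixed col, row = 0..31) touches 32 distinct banks. A small host-side sketch (illustrative only) of the padded arithmetic:

```cpp
#include <cstdio>

#define TILE_DIM 32

// Bank of tile[row][col] for a padded __shared__ float tile[TILE_DIM][TILE_DIM + 1]:
// each row is TILE_DIM + 1 = 33 words long.
static int bank_of_padded(int row, int col) { return (row * (TILE_DIM + 1) + col) % 32; }

int main()
{
    // The same column read as before now touches 32 distinct banks.
    for (int lane = 0; lane < TILE_DIM; lane++)
        printf("lane %2d reads tile[%2d][0] -> bank %2d\n", lane, lane, bank_of_padded(lane, 0));
    return 0;
}
```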

Code

```cpp
__global__ void kernel_transpose_per_element_tiled_no_bank_conflicts(DTYPE *input, DTYPE *output, int num_rows, int num_cols)
{
    // One extra column of padding so that column accesses hit distinct banks.
    __shared__ DTYPE data[BLOCK_SIZE][BLOCK_SIZE + 1];

    int input_col_id = blockIdx.x * blockDim.x + threadIdx.x;
    int input_row_id = blockIdx.y * blockDim.y + threadIdx.y;
    int block_x = blockIdx.x * blockDim.x;
    int block_y = blockIdx.y * blockDim.y;

    // Coalesced read of one tile row per warp into shared memory.
    if (input_col_id < num_cols && input_row_id < num_rows && threadIdx.x < BLOCK_SIZE)
        data[threadIdx.y][threadIdx.x] = input[input_row_id * num_cols + input_col_id];
    __syncthreads();

    // The output tile sits at the flipped block location; reading data[threadIdx.x][threadIdx.y]
    // (a tile column) keeps the global write coalesced and, thanks to the padding, conflict-free.
    int output_col_id = block_y + threadIdx.x;
    int output_row_id = block_x + threadIdx.y;
    if (output_col_id < num_rows && output_row_id < num_cols)
        output[output_row_id * num_rows + output_col_id] = data[threadIdx.x][threadIdx.y];
}
```

main function

```cpp
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include "error_check.h"
#include "gpu_timer.h"

#define DTYPE float
#define DTYPE_OUTPUT_FORMAT "%f "
#define H 1024
#define W 977
#define BLOCK_SIZE 32

int main()
{
    int numBytes = H * W * sizeof(DTYPE);
    DTYPE *data_input = (DTYPE *)malloc(numBytes);
    DTYPE *data_output = (DTYPE *)malloc(numBytes);
    DTYPE *data_result = (DTYPE *)malloc(numBytes);

    init_data(data_input, H * W);
    transpose_CPU(data_input, data_result, H, W);

    DTYPE *d_in, *d_out;
    cudaMalloc((void **)&d_in, numBytes);
    cudaMalloc((void **)&d_out, numBytes);
    cudaMemcpy(d_in, data_input, numBytes, cudaMemcpyHostToDevice);

    GpuTimer timer;

    /* 1. matrix transpose serial */
    timer.Start();
    kernel_transpose_serial<<<1, 1>>>(d_in, d_out, H, W);
    timer.Stop();
    cudaMemcpy(data_output, d_out, numBytes, cudaMemcpyDeviceToHost);
    printf("\nTime cost (serial):%g ms. Verifying results...%s\n",
           timer.Elapsed(), compare_matrices(data_output, data_result, H, W) ? "Passed" : "Failed");
    memset(data_output, 0, numBytes);
    cudaMemset(d_out, 0, numBytes);

    /* 2. matrix transpose per row */
    timer.Start();
    kernel_transpose_per_row<<<1, W>>>(d_in, d_out, H, W);
    timer.Stop();
    cudaMemcpy(data_output, d_out, numBytes, cudaMemcpyDeviceToHost);
    printf("\nTime cost (per row):%g ms. Verifying results...%s\n",
           timer.Elapsed(), compare_matrices(data_output, data_result, H, W) ? "Passed" : "Failed");
    memset(data_output, 0, numBytes);
    cudaMemset(d_out, 0, numBytes);

    /* 3. matrix transpose per element */
    timer.Start();
    dim3 blocks((W - 1) / BLOCK_SIZE + 1, (H - 1) / BLOCK_SIZE + 1);
    dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
    kernel_transpose_per_element<<<blocks, threads>>>(d_in, d_out, H, W);
    timer.Stop();
    cudaMemcpy(data_output, d_out, numBytes, cudaMemcpyDeviceToHost);
    printf("\nTime cost (per element):%g ms. Verifying results...%s\n",
           timer.Elapsed(), compare_matrices(data_output, data_result, H, W) ? "Passed" : "Failed");
    memset(data_output, 0, numBytes);
    cudaMemset(d_out, 0, numBytes);

    /* 4. matrix transpose tiled with shared memory */
    timer.Start();
    kernel_transpose_per_element_tiled<<<blocks, threads>>>(d_in, d_out, H, W);
    timer.Stop();
    cudaMemcpy(data_output, d_out, numBytes, cudaMemcpyDeviceToHost);
    printf("\nTime cost (tiled with shared memory):%g ms. Verifying results...%s\n",
           timer.Elapsed(), compare_matrices(data_output, data_result, H, W) ? "Passed" : "Failed");
    memset(data_output, 0, numBytes);
    cudaMemset(d_out, 0, numBytes);

    /* 5. matrix transpose tiled without bank conflicts */
    timer.Start();
    kernel_transpose_per_element_tiled_no_bank_conflicts<<<blocks, threads>>>(d_in, d_out, H, W);
    timer.Stop();
    cudaMemcpy(data_output, d_out, numBytes, cudaMemcpyDeviceToHost);
    printf("\nTime cost (tiled without bank conflicts):%g ms. Verifying results...%s\n",
           timer.Elapsed(), compare_matrices(data_output, data_result, H, W) ? "Passed" : "Failed");

    free(data_input);
    free(data_output);
    free(data_result);
    cudaFree(d_in);
    cudaFree(d_out);
    return 0;
}
```

### matrix helper
```cpp
// Fill the matrix with 1, 2, 3, ... so that transpose errors are easy to spot.
void init_data(DTYPE *arr, int n)
{
    for(int i=0; i<n; i++){
        arr[i] = (DTYPE)(i+1);
    }
}

// Element-wise comparison against the CPU reference; returns 1 on match, 0 otherwise.
int compare_matrices(DTYPE *input, DTYPE *ref, int num_rows, int num_cols)
{
    for(int row_idx=0; row_idx<num_rows; row_idx++)
    {
        for(int col_idx=0; col_idx<num_cols; col_idx++)
        {
            // Use fabs() for the floating-point difference (abs() would truncate to int).
            if(fabs(ref[row_idx*num_cols+col_idx]-input[row_idx*num_cols+col_idx])>1e-3)
            {
                printf("Error:%f at (%d, %d)\n", fabs(ref[row_idx*num_cols+col_idx]-input[row_idx*num_cols+col_idx]), row_idx, col_idx);
                return 0;
            }
        }
    }
    return 1;
}
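
// main() also calls transpose_CPU() to build the reference result. It is not shown
// in this section; a minimal sketch of what it presumably looks like (row-major
// host-side transpose of a num_rows x num_cols matrix) is:
void transpose_CPU(DTYPE *input, DTYPE *output, int num_rows, int num_cols)
{
    for(int row_idx=0; row_idx<num_rows; row_idx++)
    {
        for(int col_idx=0; col_idx<num_cols; col_idx++)
        {
            // Element (row, col) of the input becomes element (col, row) of the output.
            output[col_idx*num_rows+row_idx] = input[row_idx*num_cols+col_idx];
        }
    }
}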