Why do we still need the CPU?

Even with a GPU available, the CPU still runs the show: it executes the operating system, handles I/O, runs the sequential and branch-heavy parts of a program, and launches and synchronizes the GPU kernels. The GPU only accelerates the data-parallel portions of the work.

GPU programming with CUDA

Practice: a sequential CPU version and a parallel GPU version

Sequential CPU:


#include <iostream>
#include <ctime>
#include <cstdlib>
#include <stack>

std::stack<clock_t> tictoc_stack;

// MATLAB-style tic()/toc() timers built on clock().
void tic()
{
    tictoc_stack.push(clock());
}

void toc()
{
    std::cout << "Time elapsed: "
              << ((double)(clock() - tictoc_stack.top())) / CLOCKS_PER_SEC
              << std::endl;
    tictoc_stack.pop();
}

inline int p5(int i) { return i * i * i * i * i; }

// Bit i of x decides whether (i + 1)^5 is added or subtracted; the result
// is the absolute difference of the two-way partition of {1^5, ..., 30^5}
// that x encodes.
inline int value(int x)
{
    int a = 0;
    for (int i = 0; i < 30; i++)
    {
        if (x & (1 << i))
        {
            a += p5(i + 1);
        }
        else
        {
            a -= p5(i + 1);
        }
    }
    return std::abs(a);
}

int main()
{
    // Every one of the 2^30 possibilities is checked here, one by one.
    constexpr int total = 1 << 30;
    int best_x = 0;
    int best_v = value(best_x);
    tic();
    for (int x = 0; x < total; x++)
    {
        int v = value(x);
        if (v < best_v)
        {
            best_x = x;
            best_v = v; // reuse v instead of recomputing value(x)
        }
    }
    std::cout << "best_x: " << best_x << " best_v: " << best_v << std::endl;
    toc();
}
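
One caveat about tic()/toc(): clock() measures CPU time, not wall-clock time, so for multi-threaded or GPU-accelerated runs it can differ badly from the real elapsed time. A wall-clock variant built on std::chrono (a sketch, not part of the original code) would look like this:

#include <chrono>
#include <iostream>
#include <stack>

std::stack<std::chrono::steady_clock::time_point> tictoc_stack;

void tic()
{
    tictoc_stack.push(std::chrono::steady_clock::now());
}

void toc()
{
    // duration<double> gives the elapsed wall-clock time in seconds.
    std::chrono::duration<double> elapsed =
        std::chrono::steady_clock::now() - tictoc_stack.top();
    std::cout << "Time elapsed: " << elapsed.count() << std::endl;
    tictoc_stack.pop();
}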

Parallel GPU:

#include <cstdio>
#include <cstdlib>
#include <vector>

constexpr int blocks = 1 << 10;     // grid size: 2^10 blocks
constexpr int threads = 1 << 6;     // 2^6 threads per block
constexpr int iterations = 1 << 14; // 2^14 candidates per thread

__host__ __device__ inline int p5(int i) { return i * i * i * i * i; }

__host__ __device__ inline int value(int x)
{
    int a = 0;
    for (int i = 0; i < 30; i++)
    {
        if (x & (1 << i))
        {
            a += p5(i + 1);
        }
        else
        {
            a -= p5(i + 1);
        }
    }
    return abs(a);
}

// Each thread scans only `iterations` candidates: the block index supplies
// the top 10 bits of x, the thread index the next 6 bits, and the loop
// counter the low 14 bits. Limiting each block to this small slice of the
// 2^30 search space is what produces the speedup.
__global__ void mykernel(int* r)
{
    int x3 = blockIdx.x;
    int x2 = threadIdx.x;
    int best_x = 0;
    int best_v = value(best_x);
    for (int x1 = 0; x1 < iterations; x1++)
    {
        int x = (x3 << 20) | (x2 << 14) | x1;
        int v = value(x);
        if (v < best_v)
        {
            best_x = x;
            best_v = v;
        }
    }
    r[(x3 << 6) | x2] = best_x; // one partial winner per thread
}

int main()
{
    int* rGPU = NULL;
    cudaMalloc((void**)&rGPU, blocks * threads * sizeof(int));
    mykernel<<<blocks, threads>>>(rGPU);
    cudaDeviceSynchronize();
    cudaGetLastError();
    std::vector<int> r(blocks * threads);
    cudaMemcpy(r.data(), rGPU, blocks * threads * sizeof(int), cudaMemcpyDeviceToHost);
    cudaFree(rGPU);
    // Reduce the per-thread winners on the CPU to get the global best.
    int best_x = r[0];
    int best_v = value(best_x);
    for (int x : r)
    {
        int v = value(x);
        if (v < best_v)
        {
            best_x = x;
            best_v = v;
        }
    }
    printf("best_x: %d best_v: %d\n", best_x, best_v);
}
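
The return codes of the CUDA calls above are ignored; cudaGetLastError() is even called without inspecting its result. A common way to make failures visible is an error-checking macro (a sketch; the CUDA_CHECK name is my own, not a library macro):

#include <cstdio>
#include <cstdlib>

// Abort with a message if a CUDA API call returns an error.
#define CUDA_CHECK(call)                                          \
    do {                                                          \
        cudaError_t err = (call);                                 \
        if (err != cudaSuccess) {                                 \
            fprintf(stderr, "CUDA error at %s:%d: %s\n",          \
                    __FILE__, __LINE__, cudaGetErrorString(err)); \
            exit(1);                                              \
        }                                                         \
    } while (0)

// Usage:
//   CUDA_CHECK(cudaMalloc((void**)&rGPU, blocks * threads * sizeof(int)));
//   mykernel<<<blocks, threads>>>(rGPU);
//   CUDA_CHECK(cudaGetLastError());       // catches kernel-launch errors
//   CUDA_CHECK(cudaDeviceSynchronize());  // catches errors during execution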

Further reading on parallel GPU programming:

Memory access patterns

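The central rule here: threads in the same warp should access consecutive memory addresses, so the hardware can coalesce the warp's 32 individual accesses into a few wide transactions. A sketch contrasting a coalesced with a strided pattern (illustrative kernels, not from the notes):

__global__ void copy_coalesced(const float* in, float* out, int n)
{
    // Consecutive threads read consecutive elements: the warp's loads
    // coalesce into a few wide memory transactions.
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
    {
        out[i] = in[i];
    }
}

__global__ void copy_strided(const float* in, float* out, int n, int stride)
{
    // Consecutive threads read addresses `stride` elements apart: each
    // load needs its own transaction, which is far slower.
    int i = (blockIdx.x * blockDim.x + threadIdx.x) * stride;
    if (i < n)
    {
        out[i] = in[i];
    }
}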