




Why do we still need the CPU?
GPU programming with CUDA

Practice: a sequential CPU version and a parallel GPU version
Sequential CPU:

#include <iostream>#include <ctime>#include <stack>#include <limits>std::stack<clock_t> tictoc_stack;void tic(){ tictoc_stack.push(clock());}void toc(){ std::cout << "Time elapsed: " << ((double)(clock() - tictoc_stack.top())) / CLOCKS_PER_SEC << std::endl; tictoc_stack.pop();}inline int p5(int i) { return i * i * i * i * i; }inline int value(int x) { int a = 0; for (int i = 0; i < 30; i++) { // ALL the possibility is verified here if (x & (1 << i)) { a += p5(i + 1); } else { a -= p5(i + 1); } } return abs(a);}int main(){ constexpr int total = 1 << 30; int best_x = 0; int best_v = value(best_x); tic(); for (int x = 0; x < total; x++) { int v = value(x); if (v < best_v) { best_x = x; best_v = value(x); } } std::cout << "best_x:" << best_x << " best_v:" << best_v << std::endl; toc();}


// Parallel GPU implementation of the same brute-force search as the sequential
// CPU version: find the 30-bit x that minimises |+/-1^5 +/-2^5 ... +/-30^5|,
// where bit i of x selects the sign of (i + 1)^5.
#include <cstdlib>
#include <ctime>
#include <limits>
#include <stack>
#include <stdio.h>
#include <vector>

// Evaluates a CUDA runtime call and aborts with file/line and the runtime's
// error string if it did not return cudaSuccess.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(err_));                                \
            abort();                                                          \
        }                                                                     \
    } while (0)

// The 30-bit candidate x is split into three bit fields:
//   [ block index | thread index | per-thread loop counter ]
constexpr int block_bits = 10;
constexpr int thread_bits = 6;
constexpr int iteration_bits = 14;

constexpr int blocks = 1 << block_bits;
constexpr int threads = 1 << thread_bits;
constexpr int iterations = 1 << iteration_bits;

static_assert(block_bits + thread_bits + iteration_bits == 30,
              "the three bit fields must cover exactly the 30 bits read by value()");

// Returns i to the fifth power. For i <= 30 (the only inputs used here) the
// result is at most 24,300,000 and fits in an int.
__host__ __device__ inline int p5(int i)
{
    return i * i * i * i * i;
}

// Returns |sum over i in [0, 30) of s_i * (i + 1)^5|, where s_i is +1 when bit
// i of x is set and -1 otherwise.
__host__ __device__ inline int value(int x)
{
    int a = 0;
    for (int i = 0; i < 30; i++) {
        if (x & (1 << i)) {
            a += p5(i + 1);
        } else {
            a -= p5(i + 1);
        }
    }
    return abs(a);
}

// Each thread scans the `iterations` candidates whose upper bits are fixed by
// its block and thread index, and stores the best candidate it saw.
//
// Launch layout: 1-D grid of `blocks` blocks with `threads` threads each.
// r must point to at least blocks * threads ints; thread (x3, x2) writes
// r[(x3 << thread_bits) | x2]. No shared memory is used.
// A thread whose slice contains nothing better than x = 0 stores 0.
__global__ void mykernel(int* r)
{
    const int x3 = blockIdx.x;
    const int x2 = threadIdx.x;

    int best_x = 0;
    int best_v = value(best_x);
    for (int x1 = 0; x1 < iterations; x1++) {
        const int x = (x3 << (thread_bits + iteration_bits)) | (x2 << iteration_bits) | x1;
        const int v = value(x);
        if (v < best_v) {
            best_x = x;
            best_v = v;
        }
    }
    r[(x3 << thread_bits) | x2] = best_x;
}

int main()
{
    const size_t result_count = static_cast<size_t>(blocks) * threads;
    const size_t result_bytes = result_count * sizeof(int);

    int* rGPU = NULL;
    // Each block (and each thread within it) only evaluates a limited slice of
    // the candidates, which is what really speeds the search up.
    CUDA_CHECK(cudaMalloc((void**)&rGPU, result_bytes));

    mykernel<<<blocks, threads>>>(rGPU);
    CUDA_CHECK(cudaGetLastError());       // launch-configuration errors
    CUDA_CHECK(cudaDeviceSynchronize());  // errors raised while the kernel ran

    std::vector<int> r(result_count);
    CUDA_CHECK(cudaMemcpy(r.data(), rGPU, result_bytes, cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaFree(rGPU));

    // Final reduction on the host: pick the best of the per-thread winners.
    // Strict '<' keeps the earliest entry among equal values.
    int best_x = 0;
    int best_v = value(best_x);
    for (size_t i = 0; i < r.size(); i++) {
        const int v = value(r[i]);
        if (v < best_v) {
            best_x = r[i];
            best_v = v;
        }
    }
    printf("best_x:%d best_v:%d\n", best_x, best_v);
    return 0;
}
Parallel GPU Reading:
Memory access pattern
