Why do we still need the CPU?

Even with a GPU available, the CPU still runs the show: it executes the operating system, handles I/O, runs the sequential and branch-heavy parts of a program, and launches and synchronizes the GPU kernels. The GPU only accelerates the data-parallel portions of the work.

GPU programming with CUDA

Practice: a sequential CPU version and a parallel GPU version

Sequential CPU:


#include <iostream>
#include <ctime>
#include <cstdlib>
#include <stack>

std::stack<clock_t> tictoc_stack;

// MATLAB-style tic()/toc() timers built on clock().
void tic()
{
    tictoc_stack.push(clock());
}

void toc()
{
    std::cout << "Time elapsed: "
              << ((double)(clock() - tictoc_stack.top())) / CLOCKS_PER_SEC
              << std::endl;
    tictoc_stack.pop();
}

inline int p5(int i) { return i * i * i * i * i; }

// Bit i of x decides whether (i + 1)^5 is added or subtracted; the result
// is the absolute difference of the two-way partition of {1^5, ..., 30^5}
// that x encodes.
inline int value(int x)
{
    int a = 0;
    for (int i = 0; i < 30; i++)
    {
        if (x & (1 << i))
        {
            a += p5(i + 1);
        }
        else
        {
            a -= p5(i + 1);
        }
    }
    return std::abs(a);
}

int main()
{
    // Every one of the 2^30 possibilities is checked here, one by one.
    constexpr int total = 1 << 30;
    int best_x = 0;
    int best_v = value(best_x);
    tic();
    for (int x = 0; x < total; x++)
    {
        int v = value(x);
        if (v < best_v)
        {
            best_x = x;
            best_v = v; // reuse v instead of recomputing value(x)
        }
    }
    std::cout << "best_x: " << best_x << " best_v: " << best_v << std::endl;
    toc();
}
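
One caveat about tic()/toc(): clock() measures CPU time, not wall-clock time, so for multi-threaded or GPU-accelerated runs it can differ badly from the real elapsed time. A wall-clock variant built on std::chrono (a sketch, not part of the original code) would look like this:

#include <chrono>
#include <iostream>
#include <stack>

std::stack<std::chrono::steady_clock::time_point> tictoc_stack;

void tic()
{
    tictoc_stack.push(std::chrono::steady_clock::now());
}

void toc()
{
    // duration<double> gives the elapsed wall-clock time in seconds.
    std::chrono::duration<double> elapsed =
        std::chrono::steady_clock::now() - tictoc_stack.top();
    std::cout << "Time elapsed: " << elapsed.count() << std::endl;
    tictoc_stack.pop();
}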

Parallel GPU:

#include <cstdio>
#include <cstdlib>
#include <vector>

constexpr int blocks = 1 << 10;     // grid size: 2^10 blocks
constexpr int threads = 1 << 6;     // 2^6 threads per block
constexpr int iterations = 1 << 14; // 2^14 candidates per thread

__host__ __device__ inline int p5(int i) { return i * i * i * i * i; }

__host__ __device__ inline int value(int x)
{
    int a = 0;
    for (int i = 0; i < 30; i++)
    {
        if (x & (1 << i))
        {
            a += p5(i + 1);
        }
        else
        {
            a -= p5(i + 1);
        }
    }
    return abs(a);
}

// Each thread scans only `iterations` candidates: the block index supplies
// the top 10 bits of x, the thread index the next 6 bits, and the loop
// counter the low 14 bits. Limiting each block to this small slice of the
// 2^30 search space is what produces the speedup.
__global__ void mykernel(int* r)
{
    int x3 = blockIdx.x;
    int x2 = threadIdx.x;
    int best_x = 0;
    int best_v = value(best_x);
    for (int x1 = 0; x1 < iterations; x1++)
    {
        int x = (x3 << 20) | (x2 << 14) | x1;
        int v = value(x);
        if (v < best_v)
        {
            best_x = x;
            best_v = v;
        }
    }
    r[(x3 << 6) | x2] = best_x; // one partial winner per thread
}

int main()
{
    int* rGPU = NULL;
    cudaMalloc((void**)&rGPU, blocks * threads * sizeof(int));
    mykernel<<<blocks, threads>>>(rGPU);
    cudaDeviceSynchronize();
    cudaGetLastError();
    std::vector<int> r(blocks * threads);
    cudaMemcpy(r.data(), rGPU, blocks * threads * sizeof(int), cudaMemcpyDeviceToHost);
    cudaFree(rGPU);
    // Reduce the per-thread winners on the CPU to get the global best.
    int best_x = r[0];
    int best_v = value(best_x);
    for (int x : r)
    {
        int v = value(x);
        if (v < best_v)
        {
            best_x = x;
            best_v = v;
        }
    }
    printf("best_x: %d best_v: %d\n", best_x, best_v);
}
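
The return codes of the CUDA calls above are ignored; cudaGetLastError() is even called without inspecting its result. A common way to make failures visible is an error-checking macro (a sketch; the CUDA_CHECK name is my own, not a library macro):

#include <cstdio>
#include <cstdlib>

// Abort with a message if a CUDA API call returns an error.
#define CUDA_CHECK(call)                                          \
    do {                                                          \
        cudaError_t err = (call);                                 \
        if (err != cudaSuccess) {                                 \
            fprintf(stderr, "CUDA error at %s:%d: %s\n",          \
                    __FILE__, __LINE__, cudaGetErrorString(err)); \
            exit(1);                                              \
        }                                                         \
    } while (0)

// Usage:
//   CUDA_CHECK(cudaMalloc((void**)&rGPU, blocks * threads * sizeof(int)));
//   mykernel<<<blocks, threads>>>(rGPU);
//   CUDA_CHECK(cudaGetLastError());       // catches kernel-launch errors
//   CUDA_CHECK(cudaDeviceSynchronize());  // catches errors during execution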

Further reading on parallel GPU programming:

Memory access patterns

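The central rule here: threads in the same warp should access consecutive memory addresses, so the hardware can coalesce the warp's 32 individual accesses into a few wide transactions. A sketch contrasting a coalesced with a strided pattern (illustrative kernels, not from the notes):

__global__ void copy_coalesced(const float* in, float* out, int n)
{
    // Consecutive threads read consecutive elements: the warp's loads
    // coalesce into a few wide memory transactions.
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
    {
        out[i] = in[i];
    }
}

__global__ void copy_strided(const float* in, float* out, int n, int stride)
{
    // Consecutive threads read addresses `stride` elements apart: each
    // load needs its own transaction, which is far slower.
    int i = (blockIdx.x * blockDim.x + threadIdx.x) * stride;
    if (i < n)
    {
        out[i] = in[i];
    }
}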