r1 = func1(tid) 不能运行:
func1 的调用位于 __global__ 修饰的核函数中,即会在GPU上执行,而设备端代码不能调用只在CPU上运行的主机函数。
又因为 func1 没有加任何修饰符,默认就是 __host__ 函数,这类函数只为主机端编译,即只能在CPU中运行,
所以 r1 = func1(tid) 不能运行(编译时就会报错)。
r2 = func2(nBlk) 不能运行:
func2 由 __device__ 修饰,只能在GPU上被调用和执行,不能在CPU中调用。
而 r2 = func2(nBlk) 的调用位于 main 函数中,即在CPU中,所以不能运行。
cudaDeviceSynchronize():
该函数会阻塞CPU端线程的执行,直到GPU端完成此前提交的所有任务,因此原本紧接着要执行的CPU代码会被推迟到GPU完成之后。
#include <stdio.h>
#include <stdlib.h>

// Evaluate a CUDA runtime call and terminate with a readable message if it
// did not return cudaSuccess.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(err_));                                \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)

// Host-only function: with no execution-space qualifier it is compiled for
// the host only. Returns 2 * x.
int func1(int x)
{
    return 2 * x;
}

// Device-only function: callable from device code (kernels and other
// __device__ functions), not from host code. Returns 2 * x.
__device__ int func2(int x)
{
    return 2 * x;
}

// Compiled for both host and device, so it can be called from main() and
// from kernels alike. Returns 2 * x.
__host__ __device__ int func3(int x)
{
    return 2 * x;
}

// Kernel: every thread prints its threadIdx.x and func3 of that index.
// Only threadIdx.x is used, so the same tid values appear once per block.
// Note: the output label says "cube" but the printed value is 2 * tid.
__global__ void cube_gpu1()
{
    int tid = threadIdx.x;
    // r1 = func1(tid);  // not allowed: func1 is a host-only function
    // r1 = func2(tid);  // allowed alternative: func2 is a __device__ function
    int r1 = func3(tid);
    printf("tid:%d, cube:%d\n", tid, r1);
}

// Kernel: same behavior as cube_gpu1 (prints threadIdx.x and func3 of it).
__global__ void cube_gpu2()
{
    int tid = threadIdx.x;
    int r1 = func3(tid);
    printf("tid:%d, cube:%d\n", tid, r1);
}

// Demonstrates which functions can be called from host and device code:
// launches both kernels, waits for them, then calls the host-callable
// functions and prints their results.
int main()
{
    printf("Host and device functions!\n");
    printf("\nResults from device:\n");

    int nBlk = 3;   // threads per block (second launch parameter)
    int nGrid = 2;  // blocks in the grid (first launch parameter)

    cube_gpu1<<<nGrid, nBlk>>>();
    CUDA_CHECK(cudaGetLastError());  // launches return no status; query it
    cube_gpu2<<<nGrid, nBlk>>>();
    CUDA_CHECK(cudaGetLastError());

    // Block the host until both kernels have finished, so the device output
    // appears before the host results printed below.
    CUDA_CHECK(cudaDeviceSynchronize());

    // func1 is callable here because it is a host function; its result is
    // overwritten by the func3 call below and never printed.
    int r2 = func1(nBlk);
    // r2 = func2(nBlk);  // not allowed: func2 is a device-only function
    r2 = func3(nBlk);
    printf("\nResults from host:%d\n", r2);

    r2 = func3(nGrid);
    printf("\nResults from host:%d\n", r2);

    CUDA_CHECK(cudaDeviceReset());
    return 0;
}
