“CUDA”版本间的差异
来自iCenter Wiki
第28行: | 第28行: | ||
Hello World from CPU! | Hello World from CPU! | ||
Hello World from GPU! | Hello World from GPU! | ||
+ | |||
+ | #include <stdio.h> | ||
+ | #include <cuda.h> | ||
+ | __global__ void my_kernel() { | ||
+ | printf("GPU block %i thread %i %i\n", blockIdx.x, threadIdx.x, threadIdx.y); | ||
+ | } | ||
+ | |||
+ | int main() | ||
+ | { | ||
+ | const dim3 threadsPerBlock(2,2); | ||
+ | const dim3 blocksPerGrid(2); | ||
+ | my_kernel<<<blocksPerGrid, threadsPerBlock>>>(); | ||
+ | cudaDeviceSynchronize(); | ||
+ | return 0; | ||
+ | } | ||
+ | |||
+ | |||
+ | $ nvcc hello_world.cu | ||
+ | $ ./a.out | ||
+ | GPU block 0 thread 0 0 | ||
+ | GPU block 0 thread 0 1 | ||
+ | GPU block 0 thread 1 0 | ||
+ | GPU block 0 thread 1 1 | ||
+ | GPU block 1 thread 0 0 | ||
+ | GPU block 1 thread 0 1 | ||
+ | GPU block 1 thread 1 0 | ||
+ | GPU block 1 thread 1 1 |
2017年4月19日 (三) 12:22的版本
CUDA(Compute Unified Device Architecture,统一计算架构)
异构平行计算系统(heterogeneous parallel computing systems)
latency devices (CPU cores)
throughput devices (GPU cores)
单程序多数据(single program, multiple data, SPMD) 模型
#include <stdio.h>
#include <cuda.h>

// Kernel executed on the GPU: each launched thread prints one greeting.
// Device-side printf writes into a buffer that the host flushes at a
// synchronization point, so the caller must synchronize to see the output.
__global__ void my_kernel() {
    printf("Hello World from GPU!\n");
}

// Prints a greeting from the host, launches a single GPU thread that prints
// its own greeting, and waits for the device to finish.
// Returns 0 on success, 1 if the device reported an error.
int main() {
    printf("Hello World from CPU!\n");

    // Launch configuration is <<<blocks in grid, threads per block>>>.
    my_kernel<<<1, 1>>>();

    // Kernel launches are asynchronous: block until the kernel has finished so
    // that its printf output is flushed before the program exits.
    cudaError_t err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
        return 1;
    }
    return 0;
}
编译输出
$ nvcc hello_world.cu $ ./a.out Hello World from CPU! Hello World from GPU!
#include <stdio.h>
#include <cuda.h>

// Kernel executed on the GPU: every thread prints the x index of its block
// followed by its own x and y indices within the block.
// Device-side printf writes into a buffer that the host flushes at a
// synchronization point, so the caller must synchronize to see the output.
__global__ void my_kernel() {
    printf("GPU block %i thread %i %i\n", blockIdx.x, threadIdx.x, threadIdx.y);
}

// Launches my_kernel on a grid of 2 blocks, each holding 2 x 2 threads
// (8 threads in total), and waits for the device to finish.
// Returns 0 on success, 1 if the launch or the execution reported an error.
int main() {
    const dim3 threadsPerBlock(2, 2);  // 2 x 2 = 4 threads in every block
    const dim3 blocksPerGrid(2);       // 2 blocks in the grid

    // The execution configuration is <<<grid dimensions, block dimensions>>>:
    // the grid size comes first, the block size second.
    my_kernel<<<blocksPerGrid, threadsPerBlock>>>();

    // A kernel launch returns no status itself; an invalid configuration is
    // reported through cudaGetLastError().
    cudaError_t err = cudaGetLastError();
    if (err == cudaSuccess) {
        // Kernel launches are asynchronous: block until the kernel has finished
        // so that its printf output is flushed before the program exits.
        err = cudaDeviceSynchronize();
    }
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
        return 1;
    }
    return 0;
}
$ nvcc hello_world.cu
$ ./a.out
GPU block 0 thread 0 0 GPU block 0 thread 0 1 GPU block 0 thread 1 0 GPU block 0 thread 1 1 GPU block 1 thread 0 0 GPU block 1 thread 0 1 GPU block 1 thread 1 0 GPU block 1 thread 1 1