Hello Cuda（二）——向量加法-526互联

#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <math.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>

typedef float FLOAT;

// Forward declarations for the helpers defined later in this file.
// get_time: returns a timestamp in seconds as a double.
double get_time();
// warm_up: host function that launches the warm-up kernel.
void warm_up();
// vec_add_host / vec_add_device: both compute z[i] = x[i] + y[i] + z[i]
// for the first N elements, on the CPU and on the GPU respectively.
void vec_add_host(FLOAT* x, FLOAT* y, FLOAT* z, int N); // CPU side
__global__ void vec_add_device(FLOAT* x, FLOAT* y, FLOAT* z, int N);  // GPU side

// Index helpers for a 2D grid of 1D blocks (device code only).
// get_bid(): flattens (blockIdx.x, blockIdx.y) into a row-major block index.
// get_tid(): flat global thread index = block index * blockDim.x + threadIdx.x.
// Both expand to expressions over the built-in unsigned index variables.
#define get_tid() ((blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x)
#define get_bid() (blockIdx.y * gridDim.x + blockIdx.x)

// Build switch for the timer: a non-zero value compiles the <windows.h>
// implementation of get_time(), 0 compiles the gettimeofday() one.
#define WINDOWS 0

#if WINDOWS
#include <windows.h>
// Returns the current value of the Windows high-resolution performance
// counter converted to seconds. Only differences between two return values
// are meaningful (the counter's origin is unspecified).
double get_time()
{
    static LARGE_INTEGER fre;   // counter ticks per second, queried once
    static int init = 0;
    LARGE_INTEGER timer;

    if (init != 1)
    {
        QueryPerformanceFrequency(&fre);
        init = 1;
    }

    // Read the tick counter itself. The previous code queried the frequency
    // a second time here, so the result was always ~1.0 and every measured
    // interval came out as zero.
    QueryPerformanceCounter(&timer);
    return timer.QuadPart * 1. / fre.QuadPart;
}

#else

#include <sys/time.h>
#include <time.h>

// Returns the wall-clock time reported by gettimeofday() as seconds,
// with the microsecond field folded in as the fractional part.
double get_time()
{
    struct timeval now;
    gettimeofday(&now, 0);

    const double whole_seconds = (double)now.tv_sec;
    const double fraction = (double)now.tv_usec * 1e-6;
    return whole_seconds + fraction;
}

#endif


// GPU warm-up kernel: takes no arguments and touches no memory. Each thread
// only performs a throw-away integer addition on local variables.
__global__ void warmup_knl(void)
{
    int lhs = 1;
    int rhs = 1;
    lhs += rhs;
}

// Host-side warm-up: launches the trivial warmup_knl kernel a few times
// (1 block of 256 threads each) ahead of the timed section.
// The launches are asynchronous; this function does not wait for them.
void warm_up()
{
    // The previous loop bound was 0, so no kernel was ever launched and the
    // warm-up was a no-op. A handful of launches is enough for this purpose.
    const int launches = 8;
    for (int i = 0; i < launches; ++i)
    {
        warmup_knl <<<1, 256>>> ();
    }
}

// CPU (host) reference implementation.
// For every k in [0, N): z[k] = x[k] + y[k] + z[k], i.e. the sum of x and y
// is accumulated into the existing contents of z.
void vec_add_host(FLOAT* x, FLOAT* y, FLOAT* z, int N)
{
    int k = 0;
    while (k < N)
    {
        z[k] = (x[k] + y[k]) + z[k];
        ++k;
    }
}

// GPU kernel: each thread handles one element, z[g] = x[g] + y[g] + z[g],
// where g is the flat thread index produced by get_tid() (2D grid of 1D
// blocks). Threads whose index is not below N do nothing.
__global__ void vec_add_device(FLOAT* x, FLOAT* y, FLOAT* z, int N)
{
    const int gid = get_tid();
    if (gid >= N)
        return;

    z[gid] = (x[gid] + y[gid]) + z[gid];
}

// Prints a diagnostic on stderr when a CUDA runtime call failed.
// Returns 1 if err is not cudaSuccess, 0 otherwise.
static int cuda_failed(cudaError_t err, const char* what)
{
    if (err == cudaSuccess)
        return 0;
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
    return 1;
}

// Releases the three host buffers with free() and the three device buffers
// with cudaFree(). NULL pointers are accepted by both.
static void release_buffers(FLOAT* hx, FLOAT* hy, FLOAT* hz,
                            FLOAT* dx, FLOAT* dy, FLOAT* dz)
{
    free(hx);
    free(hy);
    free(hz);
    cudaFree(dx);
    cudaFree(dy);
    cudaFree(dz);
}

// Benchmark driver: runs `iter` accumulating vector additions of N floats on
// the GPU and on the CPU, then prints both elapsed times and their ratio.
// Returns 0 on success and -1 if an allocation or a CUDA call fails.
int main()
{
    const int N = 20000000;
    const size_t nbytes = (size_t)N * sizeof(FLOAT);

    // 2D grid of 1D blocks: s * s blocks of bs threads cover at least N elements.
    const int bs = 256; // threads per block
    const int s = (int)ceil(sqrt((N + bs - 1.) / bs));  // grid edge length
    dim3 grid = dim3(s, s);

    FLOAT* dx = NULL, *hx = NULL;
    FLOAT* dy = NULL, *hy = NULL;
    FLOAT* dz = NULL, *hz = NULL;

    const int iter = 30;
    int i;
    int bad;
    double th, td;
    cudaError_t launch_err, sync_err;

    warm_up();

    // Allocate device memory.
    bad = cuda_failed(cudaMalloc((void**)&dx, nbytes), "cudaMalloc(dx)");
    bad |= cuda_failed(cudaMalloc((void**)&dy, nbytes), "cudaMalloc(dy)");
    bad |= cuda_failed(cudaMalloc((void**)&dz, nbytes), "cudaMalloc(dz)");

    if (bad || dx == NULL || dy == NULL || dz == NULL)
    {
        printf("Couldn't allocate GPU Memory");
        release_buffers(hx, hy, hz, dx, dy, dz);
        return -1;
    }

    // Allocate host memory.
    hx = (FLOAT*)malloc(nbytes);
    hy = (FLOAT*)malloc(nbytes);
    hz = (FLOAT*)malloc(nbytes);

    if (hx == NULL || hy == NULL || hz == NULL)
    {
        // Bail out instead of falling through to the initialisation loop,
        // which would dereference a NULL pointer.
        printf("Couldn't allocate CPU Memory");
        release_buffers(hx, hy, hz, dx, dy, dz);
        return -1;
    }

    // Initialise the host vectors.
    for (i = 0; i < N; ++i)
    {
        hx[i] = 1;
        hy[i] = 1;
        hz[i] = 1;
    }

    // Copy the inputs from host to device.
    bad = cuda_failed(cudaMemcpy(dx, hx, nbytes, cudaMemcpyHostToDevice), "cudaMemcpy(dx)");
    bad |= cuda_failed(cudaMemcpy(dy, hy, nbytes, cudaMemcpyHostToDevice), "cudaMemcpy(dy)");
    bad |= cuda_failed(cudaMemcpy(dz, hz, nbytes, cudaMemcpyHostToDevice), "cudaMemcpy(dz)");
    if (bad)
    {
        release_buffers(hx, hy, hz, dx, dy, dz);
        return -1;
    }

    warm_up();

    // Wait for all outstanding GPU work so it is not charged to the timed
    // section (cudaDeviceSynchronize replaces the deprecated
    // cudaThreadSynchronize).
    if (cuda_failed(cudaDeviceSynchronize(), "cudaDeviceSynchronize"))
    {
        release_buffers(hx, hy, hz, dx, dy, dz);
        return -1;
    }

    // Time the GPU version.
    td = get_time();
    for (i = 0; i < iter; ++i)
        vec_add_device<<<grid, bs>>> (dx, dy, dz, N);
    launch_err = cudaGetLastError();
    // Kernel launches return immediately; without this wait the timer would
    // only measure launch overhead, not the execution of the kernels.
    sync_err = cudaDeviceSynchronize();
    td = get_time() - td;

    bad = cuda_failed(launch_err, "vec_add_device launch");
    bad |= cuda_failed(sync_err, "vec_add_device execution");
    if (bad)
    {
        release_buffers(hx, hy, hz, dx, dy, dz);
        return -1;
    }
    // Note: the device result in dz is not copied back to the host here;
    // a cudaMemcpy(..., cudaMemcpyDeviceToHost) would be needed to inspect it.

    // Time the CPU version.
    th = get_time();
    for (i = 0; i < iter; ++i)
        vec_add_host(hx, hy, hz, N);
    th = get_time() - th;

    printf("GPU time: %.4f, CPU time: %.4f. SppedUp: %g \n", td, th, th/td);

    // Release resources: host buffers with free(), device buffers with
    // cudaFree() (the previous code passed the host pointers to cudaFree and
    // never released dx/dy/dz).
    release_buffers(hx, hy, hz, dx, dy, dz);

    return 0;
}

GPU time: 0.0109, CPU time: 2.6454. SppedUp: 242.811

向量

加法

libtorch vscode hello cuda