文章目录
  1. 1. 初识CUDA
    1. 1.1. 部分函数介绍
    2. 1.2. cudaDeviceProp结构
    3. 1.3. 编译
  2. 2. 使用GPU实现数组平方和
    1. 2.1. 用GPU的简单实现
    2. 2.2. 改进1:多线程
    3. 2.3. 改进2:使内存的整体读取连续
    4. 2.4. 改进3:多block多thread
    5. 2.5. 改进4:使用block的共享内存
    6. 2.6. 改进5:加法树

CUDA安装在网上的教程很多,这里就不再详述。此外,笔者也是今天才开始写cuda的代码,所以内容上只是简单总结。

初识CUDA

CUDA 目前有两种不同的 API:Runtime API 和 Driver API,两种 API 各有其适用的范围。由于 Runtime API 较容易使用,一开始我们会以 Runtime API 为主。所以代码开头加上 #include <cuda_runtime.h>。

因为CUDA语言基于C,里面涉及到很多指针的操作,C不熟悉的话可以先复习一下指针的使用。

部分函数介绍

1.cudaError_t cudaGetDeviceCount( int* count )
通过count返回可用于计算的设备数量。

2.cudaError_t cudaGetDeviceProperties( struct cudaDeviceProp* prop,int dev )
通过prop返回第dev台设备的属性,dev编号从0开始。

3.cudaError_t cudaSetDevice(int dev)
设置第dev台为执行设备。

cudaDeviceProp结构

struct cudaDeviceProp {

    char name [256];            //用于标识设备的ASCII字符串;
    size_t totalGlobalMem;        //设备上可用的全局存储器的总量,以字节为单位;
    size_t sharedMemPerBlock;    /*线程块可以使用的共享存储器的最大值,以字节为单位;
                                  多处理器上的所有线程块可以同时共享这些存储器;*/
    int regsPerBlock;            /*线程块可以使用的32位寄存器的最大值;
                                多处理器上的所有线程块可以同时共享这些寄存器;*/
    int warpSize;                //按线程计算的warp块大小;
    size_t memPitch;            /*允许通过cudaMallocPitch()为包含存储器区域的
                                存储器复制函数分配的最大间距(pitch),以字节为单位;*/
    int maxThreadsPerBlock;        //每个块中的最大线程数
    int maxThreadsDim [3];        //块各个维度的最大值:
    int maxGridSize [3];        //网格各个维度的最大值;
    size_t totalConstMem;        //设备上可用的不变存储器总量,以字节为单位;
    int major;                    //定义设备计算能力的主要修订号和次要修订号;
    int minor;                    //
    int clockRate;                //以千赫为单位的时钟频率;
    size_t textureAlignment;    /*对齐要求;与textureAlignment字节对齐的
                                纹理基址无需对纹理取样应用偏移;*/
    int deviceOverlap;            /*如果设备可在主机和设备之间并发复制存储器,
                                同时又能执行内核,则此值为 1;否则此值为 0;*/
    int multiProcessorCount;    //设备上多处理器的数量。

}

编译

nvcc 是 CUDA 的编译工具,它可以将 .cu 文件解析出在 GPU 和 host 上执行的部分.也就是说,它会帮忙把 GPU 上执行和主机上执行的代码区分开来,不需要我们手动去做了。在 GPU 执行的部分会通过 NVIDIA 提供的 编译器编译成中介码,主机执行的部分则调用 gcc 编译。

1
nvcc -o first_cuda first_cuda.cu

使用GPU实现数组平方和

用GPU的简单实现

squareSum2.cuDownload code
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
/******************************************************************** 
##### File Name: squareSum2.cu
##### File Func: calculate the sum of inputs's square
##### Author: HeZhichao
##### E-mail: hzc199307@gmail.com
##### Create Time: 2016-5-11
*********************************************************************/

# include<stdio.h>
# include <stdlib.h>
# include <cuda_runtime.h>
// ======== define area ========
# define DATA_SIZE 1048576 // 1M
// ======== global area ========
int data[DATA_SIZE];
void printDeviceProp( const cudaDeviceProp &prop);
bool InitCUDA();
void generateData( int *data, int size);
__global__ static void squaresSum( int *data, int *sum, clock_t *time);
// Entry point: fills a DATA_SIZE array with random digits, sums their squares
// on the GPU (one block, one thread), then repeats the sum on the CPU and
// prints both results with their clock() timings.
// Returns 0 on normal exit, 1 on a CUDA runtime error.
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err_));  \
            return 1;                                                       \
        }                                                                   \
    } while (0)

int main(int argc, char const *argv[]) {
    // init CUDA device
    if (!InitCUDA()) { return 0; }
    printf("CUDA initialized.\n");
    // generate rand datas
    generateData(data, DATA_SIZE);
    // malloc space for datas in GPU; each call is checked because a failed
    // cudaMalloc would otherwise make every later call fail silently
    int *gpuData, *sum;
    clock_t *time;
    CUDA_CHECK(cudaMalloc((void **)&gpuData, sizeof(int) * DATA_SIZE));
    CUDA_CHECK(cudaMalloc((void **)&sum, sizeof(int)));
    CUDA_CHECK(cudaMalloc((void **)&time, sizeof(clock_t)));
    CUDA_CHECK(cudaMemcpy(gpuData, data, sizeof(int) * DATA_SIZE, cudaMemcpyHostToDevice));
    // launch: kernel<<<block num, thread num, shared memory size>>>(params...)
    squaresSum<<<1, 1, 0>>>(gpuData, sum, time);
    CUDA_CHECK(cudaGetLastError()); // catches bad launch configurations
    // copy the result from GPU to HOST (cudaMemcpy also waits for the kernel)
    int result;
    clock_t time_used;
    CUDA_CHECK(cudaMemcpy(&result, sum, sizeof(int), cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(&time_used, time, sizeof(clock_t), cudaMemcpyDeviceToHost));
    // free GPU spaces
    cudaFree(gpuData);
    cudaFree(sum);
    cudaFree(time);
    // print result
    printf("(GPU) sum:%d time:%ld\n", result, time_used);
    // CPU calculates the same sum for comparison
    result = 0;
    clock_t start = clock();
    for (int i = 0; i < DATA_SIZE; ++i) {
        result += data[i] * data[i];
    }
    time_used = clock() - start;
    printf("(CPU) sum:%d time:%ld\n", result, time_used);
    return 0;
}
// __global__: this function runs on the GPU and cannot return a value.
// Single-thread kernel: one GPU thread walks the whole array, accumulates the
// sum of squares into *sum, and stores the elapsed GPU clock ticks in *time.
__global__ static void squaresSum(int *data, int *sum, clock_t *time) {
    clock_t start = clock();
    int acc = 0;
    for (int idx = 0; idx < DATA_SIZE; ++idx) {
        const int v = data[idx];
        acc += v * v;
    }
    *sum = acc;
    *time = clock() - start;
}
// ======== used to generate rand datas ========
// Fill data[0..size) with pseudo-random digits in the range [0, 9].
void generateData(int *data, int size) {
    int n = 0;
    while (n < size) {
        data[n] = rand() % 10;
        ++n;
    }
}
// Dump the interesting fields of a cudaDeviceProp to stdout, one
// "name : value." line per field (memory sizes are in bytes, clockRate in kHz).
void printDeviceProp(const cudaDeviceProp &prop)
{

printf("Device Name : %s.\n", prop.name);
printf("totalGlobalMem : %lu.\n", prop.totalGlobalMem);
printf("sharedMemPerBlock : %lu.\n", prop.sharedMemPerBlock);
printf("regsPerBlock : %d.\n", prop.regsPerBlock);
printf("warpSize : %d.\n", prop.warpSize);
printf("memPitch : %lu.\n", prop.memPitch);
printf("maxThreadsPerBlock : %d.\n", prop.maxThreadsPerBlock);
printf("maxThreadsDim[0 - 2] : %d %d %d.\n", prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]);
printf("maxGridSize[0 - 2] : %d %d %d.\n", prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);
printf("totalConstMem : %lu.\n", prop.totalConstMem);
printf("major.minor : %d.%d.\n", prop.major, prop.minor);
printf("clockRate : %d.\n", prop.clockRate);
printf("textureAlignment : %lu.\n", prop.textureAlignment);
printf("deviceOverlap : %d.\n", prop.deviceOverlap);
printf("multiProcessorCount : %d.\n", prop.multiProcessorCount);
}

// Probe the CUDA runtime: print every device with compute capability >= 1.x,
// select device 0, and report whether a usable device was found.
// Returns true iff a supported device exists and was selected.
bool InitCUDA()
{
    // used to count the device numbers
    int count = 0;

    // get the cuda device count; check the call itself — on a machine with no
    // driver it fails and would otherwise leave `count` uninitialized
    if (cudaGetDeviceCount(&count) != cudaSuccess || count == 0) {
        fprintf(stderr, "There is no device.\n");
        return false;
    }

    // find the device >= 1.X
    bool noDeviceSupport = true;
    int i;
    for (i = 0; i < count; ++i) {
        cudaDeviceProp prop;
        if (cudaGetDeviceProperties(&prop, i) == cudaSuccess) {
            if (prop.major >= 1) {
                noDeviceSupport = false;
                printf("****** Device No%d*********************************\n", i);
                printDeviceProp(prop);
                printf("\n");
            }
        }
    }

    // if can't find the device
    if (noDeviceSupport == true) {
        fprintf(stderr, "There is no device supporting CUDA 1.x.\n");
        return false;
    }

    // set cuda device and make sure the selection actually succeeded
    if (cudaSetDevice(0) != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed.\n");
        return false;
    }
    printf("Device No%d is selected.\n", 0);

    return true;
}

改进1:多线程

squareSum3.cuDownload code
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
/******************************************************************** 
##### File Name: squareSum3.cu
##### File Func: calculate the sum of inputs's square
##### Author: HeZhichao
##### E-mail: hzc199307@gmail.com
##### Create Time: 2016-5-11
*********************************************************************/

# include<stdio.h>
# include <stdlib.h>
# include <cuda_runtime.h>
// ======== define area ========
# define DATA_SIZE 1048576 // 1M
# define THREAD_NUM 1024//# define THREAD_NUM 1024 // thread num
// ======== global area ========
int data[DATA_SIZE];
void printDeviceProp( const cudaDeviceProp &prop);
bool InitCUDA();
void generateData( int *data, int size);
__global__ static void squaresSum( int *data, int *sum, clock_t *time);
// Entry point: sums the squares of DATA_SIZE random ints on the GPU using one
// block of THREAD_NUM threads (one partial sum per thread, added on the host),
// then repeats the sum on the CPU and prints both results with timings.
// Returns 0 on normal exit, 1 on a CUDA runtime error.
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err_));  \
            return 1;                                                       \
        }                                                                   \
    } while (0)

int main(int argc, char const *argv[]) {
    // init CUDA device
    if (!InitCUDA()) { return 0; }
    printf("CUDA initialized.\n");
    // generate rand datas
    printf("test !\n");
    generateData(data, DATA_SIZE);
    // malloc space for datas in GPU; every call is checked so a failure is
    // reported instead of silently corrupting the later copies
    int *gpuData;
    int *sum;
    clock_t *time;
    printf("cudaMalloc start !\n");
    CUDA_CHECK(cudaMalloc((void **)&gpuData, sizeof(int) * DATA_SIZE));
    printf("cudaMalloc gpuData is ok !\n");
    CUDA_CHECK(cudaMalloc((void **)&sum, sizeof(int) * THREAD_NUM));
    printf("cudaMalloc sum is ok !\n");
    CUDA_CHECK(cudaMalloc((void **)&time, sizeof(clock_t)));
    CUDA_CHECK(cudaMemcpy(gpuData, data, sizeof(int) * DATA_SIZE, cudaMemcpyHostToDevice));
    printf("cudaMemcpy data to gpuData is ok !\n");
    // launch: kernel<<<block num, thread num, shared memory size>>>(params...)
    squaresSum<<<1, THREAD_NUM, 0>>>(gpuData, sum, time);
    CUDA_CHECK(cudaGetLastError()); // catches bad launch configurations
    // copy the per-thread partial sums from GPU to HOST
    int result[THREAD_NUM];
    clock_t time_used;
    CUDA_CHECK(cudaMemcpy(result, sum, sizeof(int) * THREAD_NUM, cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(&time_used, time, sizeof(clock_t), cudaMemcpyDeviceToHost));
    // free GPU spaces
    cudaFree(gpuData);
    cudaFree(sum);
    cudaFree(time);
    // add the partial sums on the host and print the result
    int tmp_result = 0;
    for (int i = 0; i < THREAD_NUM; ++i) {
        tmp_result += result[i];
    }
    printf("(GPU) sum:%d time:%ld\n", tmp_result, time_used);
    // CPU calculates the same sum for comparison
    tmp_result = 0;
    clock_t start = clock();
    for (int i = 0; i < DATA_SIZE; ++i) {
        tmp_result += data[i] * data[i];
    }
    time_used = clock() - start;
    printf("(CPU) sum:%d time:%ld\n", tmp_result, time_used);
    return 0;
}
// __global__: this function runs on the GPU and cannot return a value.
// Each of the THREAD_NUM threads sums the squares of its own contiguous chunk
// of data[] (DATA_SIZE/THREAD_NUM elements) and writes its partial result to
// sum[tid]; the host adds the partials. Launch as <<<1, THREAD_NUM>>>.
__global__ static void squaresSum(int *data, int *sum, clock_t *time) {
    const int size = DATA_SIZE / THREAD_NUM;
    const int tid = threadIdx.x;
    int tmp_sum = 0;
    clock_t start = clock();
    for (int i = tid * size; i < (tid + 1) * size; ++i) {
        tmp_sum += data[i] * data[i];
    }
    sum[tid] = tmp_sum;
    // BUG FIX: only thread 0 reports the timing. Previously every thread
    // raced to store its own clock delta into the single clock_t, leaving an
    // arbitrary thread's value there.
    if (tid == 0) {
        *time = clock() - start;
    }
}
// ======== used to generate rand datas ========
// Fill data[0..size) with pseudo-random digits in the range [0, 9].
void generateData(int *data, int size) {
    printf("generateData !");
    int n = 0;
    while (n < size) {
        data[n] = rand() % 10;
        ++n;
    }
}
// Dump the interesting fields of a cudaDeviceProp to stdout, one
// "name : value." line per field (memory sizes are in bytes, clockRate in kHz).
void printDeviceProp(const cudaDeviceProp &prop)
{

printf("Device Name : %s.\n", prop.name);
printf("totalGlobalMem : %lu.\n", prop.totalGlobalMem);
printf("sharedMemPerBlock : %lu.\n", prop.sharedMemPerBlock);
printf("regsPerBlock : %d.\n", prop.regsPerBlock);
printf("warpSize : %d.\n", prop.warpSize);
printf("memPitch : %lu.\n", prop.memPitch);
printf("maxThreadsPerBlock : %d.\n", prop.maxThreadsPerBlock);
printf("maxThreadsDim[0 - 2] : %d %d %d.\n", prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]);
printf("maxGridSize[0 - 2] : %d %d %d.\n", prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);
printf("totalConstMem : %lu.\n", prop.totalConstMem);
printf("major.minor : %d.%d.\n", prop.major, prop.minor);
printf("clockRate : %d.\n", prop.clockRate);
printf("textureAlignment : %lu.\n", prop.textureAlignment);
printf("deviceOverlap : %d.\n", prop.deviceOverlap);
printf("multiProcessorCount : %d.\n", prop.multiProcessorCount);
}

// Probe the CUDA runtime: print every device with compute capability >= 1.x
// and select the first such device. Returns true iff one was selected.
bool InitCUDA()
{
    // used to count the device numbers
    int count = 0;

    // get the cuda device count; check the call itself — on a machine with no
    // driver it fails and would otherwise leave `count` uninitialized
    if (cudaGetDeviceCount(&count) != cudaSuccess || count == 0) {
        fprintf(stderr, "There is no device.\n");
        return false;
    }

    // find the devices >= 1.X and remember the first usable one
    int firstSupported = -1;
    for (int i = 0; i < count; ++i) {
        cudaDeviceProp prop;
        if (cudaGetDeviceProperties(&prop, i) == cudaSuccess) {
            if (prop.major >= 1) {
                if (firstSupported < 0) firstSupported = i;
                printf("****** Device No%d*********************************\n", i);
                printDeviceProp(prop);
                printf("\n");
            }
        }
    }

    // if can't find the device
    if (firstSupported < 0) {
        fprintf(stderr, "There is no device supporting CUDA 1.x.\n");
        return false;
    }

    // BUG FIX: the original hard-coded cudaSetDevice(4), which fails on any
    // machine with fewer than five GPUs; select a device we actually verified,
    // and check that the selection succeeded.
    if (cudaSetDevice(firstSupported) != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed.\n");
        return false;
    }
    printf("Device No%d is selected.\n", firstSupported);

    return true;
}

改进2:使内存的整体读取连续

squareSum4.cuDownload code
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
/******************************************************************** 
##### File Name: squareSum4.cu
##### File Func: calculate the sum of inputs's square
##### Author: HeZhichao
##### E-mail: hzc199307@gmail.com
##### Create Time: 2016-5-11
*********************************************************************/

# include<stdio.h>
# include <stdlib.h>
# include <cuda_runtime.h>
// ======== define area ========
# define DATA_SIZE 1048576 // 1M
# define THREAD_NUM 1024//# define THREAD_NUM 1024 // thread num
// ======== global area ========
int data[DATA_SIZE];
void printDeviceProp( const cudaDeviceProp &prop);
bool InitCUDA();
void generateData( int *data, int size);
__global__ static void squaresSum( int *data, int *sum, clock_t *time);
// Entry point: sums the squares of DATA_SIZE random ints on the GPU using one
// block of THREAD_NUM threads with an interleaved (coalescing-friendly) access
// pattern, then repeats the sum on the CPU and prints both results/timings.
// Returns 0 on normal exit, 1 on a CUDA runtime error.
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err_));  \
            return 1;                                                       \
        }                                                                   \
    } while (0)

int main(int argc, char const *argv[]) {
    // init CUDA device
    if (!InitCUDA()) { return 0; }
    printf("CUDA initialized.\n");
    // generate rand datas
    printf("test !\n");
    generateData(data, DATA_SIZE);
    // malloc space for datas in GPU; every call is checked so a failure is
    // reported instead of silently corrupting the later copies
    int *gpuData;
    int *sum;
    clock_t *time;
    printf("cudaMalloc start !\n");
    CUDA_CHECK(cudaMalloc((void **)&gpuData, sizeof(int) * DATA_SIZE));
    printf("cudaMalloc gpuData is ok !\n");
    CUDA_CHECK(cudaMalloc((void **)&sum, sizeof(int) * THREAD_NUM));
    printf("cudaMalloc sum is ok !\n");
    CUDA_CHECK(cudaMalloc((void **)&time, sizeof(clock_t)));
    CUDA_CHECK(cudaMemcpy(gpuData, data, sizeof(int) * DATA_SIZE, cudaMemcpyHostToDevice));
    printf("cudaMemcpy data to gpuData is ok !\n");
    // launch: kernel<<<block num, thread num, shared memory size>>>(params...)
    squaresSum<<<1, THREAD_NUM, 0>>>(gpuData, sum, time);
    CUDA_CHECK(cudaGetLastError()); // catches bad launch configurations
    // copy the per-thread partial sums from GPU to HOST
    int result[THREAD_NUM];
    clock_t time_used;
    CUDA_CHECK(cudaMemcpy(result, sum, sizeof(int) * THREAD_NUM, cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(&time_used, time, sizeof(clock_t), cudaMemcpyDeviceToHost));
    // free GPU spaces
    cudaFree(gpuData);
    cudaFree(sum);
    cudaFree(time);
    // add the partial sums on the host and print the result
    int tmp_result = 0;
    for (int i = 0; i < THREAD_NUM; ++i) {
        tmp_result += result[i];
    }
    printf("(GPU) sum:%d time:%ld\n", tmp_result, time_used);
    // CPU calculates the same sum for comparison
    tmp_result = 0;
    clock_t start = clock();
    for (int i = 0; i < DATA_SIZE; ++i) {
        tmp_result += data[i] * data[i];
    }
    time_used = clock() - start;
    printf("(CPU) sum:%d time:%ld\n", tmp_result, time_used);
    return 0;
}
// __global__: this function runs on the GPU and cannot return a value.
// Interleaved version: thread tid handles elements tid, tid+THREAD_NUM, ...
// so adjacent threads read adjacent words, which the hardware can coalesce
// into fewer global-memory transactions. Partial sums land in sum[tid];
// the host adds them up. Launch as <<<1, THREAD_NUM>>>.
__global__ static void squaresSum(int *data, int *sum, clock_t *time) {
    const int tid = threadIdx.x;
    int tmp_sum = 0;
    clock_t start = clock();
    for (int i = tid; i < DATA_SIZE; i += THREAD_NUM) {
        tmp_sum += data[i] * data[i];
    }
    sum[tid] = tmp_sum;
    // BUG FIX: only thread 0 reports the timing; previously all THREAD_NUM
    // threads raced to store into the single clock_t.
    if (tid == 0) {
        *time = clock() - start;
    }
}
// ======== used to generate rand datas ========
// Fill data[0..size) with pseudo-random digits in the range [0, 9].
void generateData(int *data, int size) {
    printf("generateData !");
    int n = 0;
    while (n < size) {
        data[n] = rand() % 10;
        ++n;
    }
}
// Dump the interesting fields of a cudaDeviceProp to stdout, one
// "name : value." line per field (memory sizes are in bytes, clockRate in kHz).
void printDeviceProp(const cudaDeviceProp &prop)
{

printf("Device Name : %s.\n", prop.name);
printf("totalGlobalMem : %lu.\n", prop.totalGlobalMem);
printf("sharedMemPerBlock : %lu.\n", prop.sharedMemPerBlock);
printf("regsPerBlock : %d.\n", prop.regsPerBlock);
printf("warpSize : %d.\n", prop.warpSize);
printf("memPitch : %lu.\n", prop.memPitch);
printf("maxThreadsPerBlock : %d.\n", prop.maxThreadsPerBlock);
printf("maxThreadsDim[0 - 2] : %d %d %d.\n", prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]);
printf("maxGridSize[0 - 2] : %d %d %d.\n", prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);
printf("totalConstMem : %lu.\n", prop.totalConstMem);
printf("major.minor : %d.%d.\n", prop.major, prop.minor);
printf("clockRate : %d.\n", prop.clockRate);
printf("textureAlignment : %lu.\n", prop.textureAlignment);
printf("deviceOverlap : %d.\n", prop.deviceOverlap);
printf("multiProcessorCount : %d.\n", prop.multiProcessorCount);
}

// Probe the CUDA runtime: print every device with compute capability >= 1.x
// and select the first such device. Returns true iff one was selected.
bool InitCUDA()
{
    // used to count the device numbers
    int count = 0;

    // get the cuda device count; check the call itself — on a machine with no
    // driver it fails and would otherwise leave `count` uninitialized
    if (cudaGetDeviceCount(&count) != cudaSuccess || count == 0) {
        fprintf(stderr, "There is no device.\n");
        return false;
    }

    // find the devices >= 1.X and remember the first usable one
    int firstSupported = -1;
    for (int i = 0; i < count; ++i) {
        cudaDeviceProp prop;
        if (cudaGetDeviceProperties(&prop, i) == cudaSuccess) {
            if (prop.major >= 1) {
                if (firstSupported < 0) firstSupported = i;
                printf("****** Device No%d*********************************\n", i);
                printDeviceProp(prop);
                printf("\n");
            }
        }
    }

    // if can't find the device
    if (firstSupported < 0) {
        fprintf(stderr, "There is no device supporting CUDA 1.x.\n");
        return false;
    }

    // BUG FIX: the original hard-coded cudaSetDevice(4), which fails on any
    // machine with fewer than five GPUs; select a device we actually verified,
    // and check that the selection succeeded.
    if (cudaSetDevice(firstSupported) != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed.\n");
        return false;
    }
    printf("Device No%d is selected.\n", firstSupported);

    return true;
}

改进3:多block多thread

squareSum5.cuDownload code
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
/******************************************************************** 
##### File Name: squareSum5.cu
##### File Func: calculate the sum of inputs's square
##### Author: HeZhichao
##### E-mail: hzc199307@gmail.com
##### Create Time: 2016-5-11
*********************************************************************/

# include<stdio.h>
# include <stdlib.h>
# include <cuda_runtime.h>
// ======== define area ========
# define DATA_SIZE 1048576 // 1M
//8*128=1024 threads
# define BLOCK_NUM 8 // block num
# define THREAD_NUM 128 // thread num per block

// ======== global area ========
int data[DATA_SIZE];
void printDeviceProp( const cudaDeviceProp &prop);
bool InitCUDA();
void generateData( int *data, int size);
__global__ static void squaresSum( int *data, int *sum, clock_t *time);
// Entry point: sums the squares of DATA_SIZE random ints on the GPU using
// BLOCK_NUM blocks of THREAD_NUM threads (one partial sum per thread, added
// on the host), then repeats the sum on the CPU and prints results/timings.
// Returns 0 on normal exit, 1 on a CUDA runtime error.
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err_));  \
            return 1;                                                       \
        }                                                                   \
    } while (0)

int main(int argc, char const *argv[]) {
    // init CUDA device
    if (!InitCUDA()) { return 0; }
    printf("CUDA initialized.\n");
    // generate rand datas
    printf("test !\n");
    generateData(data, DATA_SIZE);
    // malloc space for datas in GPU; every call is checked so a failure is
    // reported instead of silently corrupting the later copies
    int *gpuData;
    int *sum;
    clock_t *time;
    printf("cudaMalloc start !\n");
    CUDA_CHECK(cudaMalloc((void **)&gpuData, sizeof(int) * DATA_SIZE));
    printf("cudaMalloc gpuData is ok !\n");
    CUDA_CHECK(cudaMalloc((void **)&sum, sizeof(int) * THREAD_NUM * BLOCK_NUM));
    printf("cudaMalloc sum is ok !\n");
    CUDA_CHECK(cudaMalloc((void **)&time, sizeof(clock_t)));
    CUDA_CHECK(cudaMemcpy(gpuData, data, sizeof(int) * DATA_SIZE, cudaMemcpyHostToDevice));
    printf("cudaMemcpy data to gpuData is ok !\n");
    // launch: kernel<<<block num, thread num, shared memory size>>>(params...)
    squaresSum<<<BLOCK_NUM, THREAD_NUM, 0>>>(gpuData, sum, time);
    CUDA_CHECK(cudaGetLastError()); // catches bad launch configurations
    // copy the per-thread partial sums from GPU to HOST
    int result[THREAD_NUM * BLOCK_NUM];
    clock_t time_used;
    CUDA_CHECK(cudaMemcpy(result, sum, sizeof(int) * THREAD_NUM * BLOCK_NUM, cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(&time_used, time, sizeof(clock_t), cudaMemcpyDeviceToHost));
    // free GPU spaces
    cudaFree(gpuData);
    cudaFree(sum);
    cudaFree(time);
    // add the partial sums on the host and print the result
    int tmp_result = 0;
    for (int i = 0; i < THREAD_NUM * BLOCK_NUM; ++i) {
        tmp_result += result[i];
    }
    printf("(GPU) sum:%d time:%ld\n", tmp_result, time_used);
    // CPU calculates the same sum for comparison
    tmp_result = 0;
    clock_t start = clock();
    for (int i = 0; i < DATA_SIZE; ++i) {
        tmp_result += data[i] * data[i];
    }
    time_used = clock() - start;
    printf("(CPU) sum:%d time:%ld\n", tmp_result, time_used);
    return 0;
}
// __global__: this function runs on the GPU and cannot return a value.
// Multi-block version: thread (bid, tid) starts at bid*THREAD_NUM+tid and
// strides by the total thread count, so the grid covers all of data[] with
// coalescing-friendly accesses. Each thread writes its partial sum to
// sum[bid*THREAD_NUM+tid]; the host adds the partials.
// Launch as <<<BLOCK_NUM, THREAD_NUM>>>.
__global__ static void squaresSum(int *data, int *sum, clock_t *time) {
    const int tid = threadIdx.x;
    const int bid = blockIdx.x;
    int tmp_sum = 0;
    clock_t start = clock();
    for (int i = bid * THREAD_NUM + tid; i < DATA_SIZE; i += THREAD_NUM * BLOCK_NUM) {
        tmp_sum += data[i] * data[i];
    }
    sum[bid * THREAD_NUM + tid] = tmp_sum;
    // BUG FIX: a single thread reports the timing; previously every thread of
    // every block raced to store into the one clock_t the host allocated.
    if (bid == 0 && tid == 0) {
        *time = clock() - start;
    }
}
// ======== used to generate rand datas ========
// Fill data[0..size) with pseudo-random digits in the range [0, 9].
void generateData(int *data, int size) {
    printf("generateData !");
    int n = 0;
    while (n < size) {
        data[n] = rand() % 10;
        ++n;
    }
}
// Dump the interesting fields of a cudaDeviceProp to stdout, one
// "name : value." line per field (memory sizes are in bytes, clockRate in kHz).
void printDeviceProp(const cudaDeviceProp &prop)
{

printf("Device Name : %s.\n", prop.name);
printf("totalGlobalMem : %lu.\n", prop.totalGlobalMem);
printf("sharedMemPerBlock : %lu.\n", prop.sharedMemPerBlock);
printf("regsPerBlock : %d.\n", prop.regsPerBlock);
printf("warpSize : %d.\n", prop.warpSize);
printf("memPitch : %lu.\n", prop.memPitch);
printf("maxThreadsPerBlock : %d.\n", prop.maxThreadsPerBlock);
printf("maxThreadsDim[0 - 2] : %d %d %d.\n", prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]);
printf("maxGridSize[0 - 2] : %d %d %d.\n", prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);
printf("totalConstMem : %lu.\n", prop.totalConstMem);
printf("major.minor : %d.%d.\n", prop.major, prop.minor);
printf("clockRate : %d.\n", prop.clockRate);
printf("textureAlignment : %lu.\n", prop.textureAlignment);
printf("deviceOverlap : %d.\n", prop.deviceOverlap);
printf("multiProcessorCount : %d.\n", prop.multiProcessorCount);
}

// Probe the CUDA runtime: print every device with compute capability >= 1.x
// and select the first such device. Returns true iff one was selected.
bool InitCUDA()
{
    // used to count the device numbers
    int count = 0;

    // get the cuda device count; check the call itself — on a machine with no
    // driver it fails and would otherwise leave `count` uninitialized
    if (cudaGetDeviceCount(&count) != cudaSuccess || count == 0) {
        fprintf(stderr, "There is no device.\n");
        return false;
    }

    // find the devices >= 1.X and remember the first usable one
    int firstSupported = -1;
    for (int i = 0; i < count; ++i) {
        cudaDeviceProp prop;
        if (cudaGetDeviceProperties(&prop, i) == cudaSuccess) {
            if (prop.major >= 1) {
                if (firstSupported < 0) firstSupported = i;
                printf("****** Device No%d*********************************\n", i);
                printDeviceProp(prop);
                printf("\n");
            }
        }
    }

    // if can't find the device
    if (firstSupported < 0) {
        fprintf(stderr, "There is no device supporting CUDA 1.x.\n");
        return false;
    }

    // BUG FIX: the original hard-coded cudaSetDevice(4), which fails on any
    // machine with fewer than five GPUs; select a device we actually verified,
    // and check that the selection succeeded.
    if (cudaSetDevice(firstSupported) != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed.\n");
        return false;
    }
    printf("Device No%d is selected.\n", firstSupported);

    return true;
}

改进4:使用block的共享内存

可以先在每个block内对该block所有thread的部分和求和,然后在CPU中对各block的结果求和(小步提升)。

squareSum6.cuDownload code
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
/******************************************************************** 
##### File Name: squareSum6.cu
##### File Func: calculate the sum of inputs's square
##### Author: HeZhichao
##### E-mail: hzc199307@gmail.com
##### Create Time: 2016-5-11
*********************************************************************/
# include<stdio.h>
# include <stdlib.h>
# include <cuda_runtime.h>
// ======== define area ========
# define DATA_SIZE 1048576 // 1M
//8*128=1024 threads
# define BLOCK_NUM 8 // block num
# define THREAD_NUM 128 // thread num per block

// ======== global area ========
int data[DATA_SIZE];
void printDeviceProp( const cudaDeviceProp &prop);
bool InitCUDA();
void generateData( int *data, int size);
__global__ static void squaresSum( int *data, int *sum, clock_t *time);
// Entry point: sums the squares of DATA_SIZE random ints on the GPU using
// BLOCK_NUM blocks of THREAD_NUM threads. Each block reduces its threads'
// partials in shared memory, so the host only adds BLOCK_NUM values.
// Returns 0 on normal exit, 1 on a CUDA runtime error.
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err_));  \
            return 1;                                                       \
        }                                                                   \
    } while (0)

int main(int argc, char const *argv[]) {
    // init CUDA device
    if (!InitCUDA()) { return 0; }
    printf("CUDA initialized.\n");
    // generate rand datas
    printf("test !\n");
    generateData(data, DATA_SIZE);
    // malloc space for datas in GPU; every call is checked so a failure is
    // reported instead of silently corrupting the later copies
    int *gpuData;
    int *sum;
    clock_t *time;
    printf("cudaMalloc start !\n");
    CUDA_CHECK(cudaMalloc((void **)&gpuData, sizeof(int) * DATA_SIZE));
    printf("cudaMalloc gpuData is ok !\n");
    CUDA_CHECK(cudaMalloc((void **)&sum, sizeof(int) * BLOCK_NUM));
    printf("cudaMalloc sum is ok !\n");
    CUDA_CHECK(cudaMalloc((void **)&time, sizeof(clock_t)));
    CUDA_CHECK(cudaMemcpy(gpuData, data, sizeof(int) * DATA_SIZE, cudaMemcpyHostToDevice));
    printf("cudaMemcpy data to gpuData is ok !\n");
    // launch: kernel<<<block num, thread num, shared memory size>>>(params...);
    // the third argument reserves THREAD_NUM ints of shared memory per block
    squaresSum<<<BLOCK_NUM, THREAD_NUM, THREAD_NUM * sizeof(int)>>>(gpuData, sum, time);
    CUDA_CHECK(cudaGetLastError()); // catches bad launch configurations
    // copy the per-block sums from GPU to HOST
    int result[BLOCK_NUM];
    clock_t time_used;
    CUDA_CHECK(cudaMemcpy(result, sum, sizeof(int) * BLOCK_NUM, cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(&time_used, time, sizeof(clock_t), cudaMemcpyDeviceToHost));
    // free GPU spaces
    cudaFree(gpuData);
    cudaFree(sum);
    cudaFree(time);
    // add the per-block sums on the host and print the result
    int tmp_result = 0;
    for (int i = 0; i < BLOCK_NUM; ++i) {
        tmp_result += result[i];
    }
    printf("(GPU) sum:%d time:%ld\n", tmp_result, time_used);
    // CPU calculates the same sum for comparison
    tmp_result = 0;
    clock_t start = clock();
    for (int i = 0; i < DATA_SIZE; ++i) {
        tmp_result += data[i] * data[i];
    }
    time_used = clock() - start;
    printf("(CPU) sum:%d time:%ld\n", tmp_result, time_used);
    return 0;
}
// __global__: this function runs on the GPU and cannot return a value.
// Shared-memory version: each thread accumulates a strided partial sum into
// its own shared-memory slot; after a barrier, thread 0 of each block adds
// the block's THREAD_NUM partials serially and writes sum[bid].
// Launch as <<<BLOCK_NUM, THREAD_NUM, THREAD_NUM*sizeof(int)>>>.
__global__ static void squaresSum(int *data, int *sum, clock_t *time) {
    // per-block scratch area, one slot per thread (uninitialized until written)
    __shared__ int shared[THREAD_NUM];

    const int tid = threadIdx.x;
    const int bid = blockIdx.x;

    shared[tid] = 0;

    clock_t start = clock();

    for (int i = bid * THREAD_NUM + tid; i < DATA_SIZE; i += THREAD_NUM * BLOCK_NUM) {
        shared[tid] += data[i] * data[i];
    }

    // barrier: every thread in the block must finish its partial sum before
    // thread 0 reads the shared slots
    __syncthreads();

    // fold all partials into shared[0]
    if (tid == 0) { // done by a single thread, so no further sync is needed
        for (int i = 1; i < THREAD_NUM; i++) {
            shared[0] += shared[i];
        }
        sum[bid] = shared[0];
    }

    // BUG FIX: a single thread reports the timing; previously every thread of
    // every block raced to store into the one clock_t the host allocated.
    if (bid == 0 && tid == 0) {
        *time = clock() - start;
    }
}
// ======== used to generate rand datas ========
// Fill data[0..size) with pseudo-random digits in the range [0, 9].
void generateData(int *data, int size) {
    printf("generateData !");
    int n = 0;
    while (n < size) {
        data[n] = rand() % 10;
        ++n;
    }
}
// Dump the interesting fields of a cudaDeviceProp to stdout, one
// "name : value." line per field (memory sizes are in bytes, clockRate in kHz).
void printDeviceProp(const cudaDeviceProp &prop)
{
printf("Device Name : %s.\n", prop.name);
printf("totalGlobalMem : %lu.\n", prop.totalGlobalMem);
printf("sharedMemPerBlock : %lu.\n", prop.sharedMemPerBlock);
printf("regsPerBlock : %d.\n", prop.regsPerBlock);
printf("warpSize : %d.\n", prop.warpSize);
printf("memPitch : %lu.\n", prop.memPitch);
printf("maxThreadsPerBlock : %d.\n", prop.maxThreadsPerBlock);
printf("maxThreadsDim[0 - 2] : %d %d %d.\n", prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]);
printf("maxGridSize[0 - 2] : %d %d %d.\n", prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);
printf("totalConstMem : %lu.\n", prop.totalConstMem);
printf("major.minor : %d.%d.\n", prop.major, prop.minor);
printf("clockRate : %d.\n", prop.clockRate);
printf("textureAlignment : %lu.\n", prop.textureAlignment);
printf("deviceOverlap : %d.\n", prop.deviceOverlap);
printf("multiProcessorCount : %d.\n", prop.multiProcessorCount);
}

// Probe the CUDA runtime: print every device with compute capability >= 1.x
// and select the first such device. Returns true iff one was selected.
bool InitCUDA()
{
    // used to count the device numbers
    int count = 0;

    // get the cuda device count; check the call itself — on a machine with no
    // driver it fails and would otherwise leave `count` uninitialized
    if (cudaGetDeviceCount(&count) != cudaSuccess || count == 0) {
        fprintf(stderr, "There is no device.\n");
        return false;
    }

    // find the devices >= 1.X and remember the first usable one
    int firstSupported = -1;
    for (int i = 0; i < count; ++i) {
        cudaDeviceProp prop;
        if (cudaGetDeviceProperties(&prop, i) == cudaSuccess) {
            if (prop.major >= 1) {
                if (firstSupported < 0) firstSupported = i;
                printf("****** Device No%d*********************************\n", i);
                printDeviceProp(prop);
                printf("\n");
            }
        }
    }

    // if can't find the device
    if (firstSupported < 0) {
        fprintf(stderr, "There is no device supporting CUDA 1.x.\n");
        return false;
    }

    // BUG FIX: the original hard-coded cudaSetDevice(4), which fails on any
    // machine with fewer than five GPUs; select a device we actually verified,
    // and check that the selection succeeded.
    if (cudaSetDevice(firstSupported) != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed.\n");
        return false;
    }
    printf("Device No%d is selected.\n", firstSupported);

    return true;
}

改进5:加法树

在block内部求和时可以采用加法树的方法,实现并行化(小步提升)。

squareSum7.cuDownload code
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
/******************************************************************** 
##### File Name: squareSum7.cu
##### File Func: calculate the sum of inputs's square
##### Author: HeZhichao
##### E-mail: hzc199307@gmail.com
##### Create Time: 2016-5-11
*********************************************************************/
# include<stdio.h>
# include <stdlib.h>
# include <cuda_runtime.h>
// ======== define area ========
# define DATA_SIZE 1048576 // 1M
//8*128=1024 threads
# define BLOCK_NUM 8 // block num
# define THREAD_NUM 128 // thread num per block

// ======== global area ========
int data[DATA_SIZE];
void printDeviceProp( const cudaDeviceProp &prop);
bool InitCUDA();
void generateData( int *data, int size);
__global__ static void squaresSum( int *data, int *sum, clock_t *time);
// Entry point: sums the squares of DATA_SIZE random ints on the GPU using
// BLOCK_NUM blocks of THREAD_NUM threads; each block reduces its partials in
// shared memory with a parallel addition tree, so the host adds BLOCK_NUM values.
// Returns 0 on normal exit, 1 on a CUDA runtime error.
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err_));  \
            return 1;                                                       \
        }                                                                   \
    } while (0)

int main(int argc, char const *argv[]) {
    // init CUDA device
    if (!InitCUDA()) { return 0; }
    printf("CUDA initialized.\n");
    // generate rand datas
    printf("test !\n");
    generateData(data, DATA_SIZE);
    // malloc space for datas in GPU; every call is checked so a failure is
    // reported instead of silently corrupting the later copies
    int *gpuData;
    int *sum;
    clock_t *time;
    printf("cudaMalloc start !\n");
    CUDA_CHECK(cudaMalloc((void **)&gpuData, sizeof(int) * DATA_SIZE));
    printf("cudaMalloc gpuData is ok !\n");
    CUDA_CHECK(cudaMalloc((void **)&sum, sizeof(int) * BLOCK_NUM));
    printf("cudaMalloc sum is ok !\n");
    CUDA_CHECK(cudaMalloc((void **)&time, sizeof(clock_t)));
    CUDA_CHECK(cudaMemcpy(gpuData, data, sizeof(int) * DATA_SIZE, cudaMemcpyHostToDevice));
    printf("cudaMemcpy data to gpuData is ok !\n");
    // launch: kernel<<<block num, thread num, shared memory size>>>(params...);
    // the third argument reserves THREAD_NUM ints of shared memory per block
    squaresSum<<<BLOCK_NUM, THREAD_NUM, THREAD_NUM * sizeof(int)>>>(gpuData, sum, time);
    CUDA_CHECK(cudaGetLastError()); // catches bad launch configurations
    // copy the per-block sums from GPU to HOST
    int result[BLOCK_NUM];
    clock_t time_used;
    CUDA_CHECK(cudaMemcpy(result, sum, sizeof(int) * BLOCK_NUM, cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(&time_used, time, sizeof(clock_t), cudaMemcpyDeviceToHost));
    // free GPU spaces
    cudaFree(gpuData);
    cudaFree(sum);
    cudaFree(time);
    // add the per-block sums on the host and print the result
    int tmp_result = 0;
    for (int i = 0; i < BLOCK_NUM; ++i) {
        tmp_result += result[i];
    }
    printf("(GPU) sum:%d time:%ld\n", tmp_result, time_used);
    // CPU calculates the same sum for comparison
    tmp_result = 0;
    clock_t start = clock();
    for (int i = 0; i < DATA_SIZE; ++i) {
        tmp_result += data[i] * data[i];
    }
    time_used = clock() - start;
    printf("(CPU) sum:%d time:%ld\n", tmp_result, time_used);
    return 0;
}
// __global__: this function runs on the GPU and cannot return a value.
// Addition-tree version: each thread accumulates a strided partial sum into
// shared memory, then the block reduces its THREAD_NUM partials in log2 steps
// (each pass adds the upper half onto the lower half). Requires THREAD_NUM to
// be a power of two. Launch as <<<BLOCK_NUM, THREAD_NUM, THREAD_NUM*sizeof(int)>>>.
__global__ static void squaresSum(int *data, int *sum, clock_t *time) {
    // per-block scratch area, one slot per thread
    __shared__ int shared[THREAD_NUM];

    const int tid = threadIdx.x;
    const int bid = blockIdx.x;

    int offset = THREAD_NUM / 2;

    shared[tid] = 0;

    clock_t start = clock();

    for (int i = bid * THREAD_NUM + tid; i < DATA_SIZE; i += THREAD_NUM * BLOCK_NUM) {
        shared[tid] += data[i] * data[i];
    }

    // barrier: all partial sums must be in shared[] before the reduction
    __syncthreads();
    while (offset > 0) {
        if (tid < offset) { // add the upper half of the block onto the lower half
            shared[tid] += shared[tid + offset];
        }
        offset >>= 1; // halve the active range
        // barrier outside the if, so every thread of the block reaches it
        __syncthreads();
    }

    // only thread 0 of each block publishes the block's total
    if (tid == 0) sum[bid] = shared[0];

    // BUG FIX: a single thread reports the timing; previously every thread of
    // every block raced to store into the one clock_t the host allocated.
    if (bid == 0 && tid == 0) {
        *time = clock() - start;
    }
}
// ======== used to generate rand datas ========
// Fill data[0..size) with pseudo-random digits in the range [0, 9].
void generateData(int *data, int size) {
    printf("generateData !");
    int n = 0;
    while (n < size) {
        data[n] = rand() % 10;
        ++n;
    }
}
// Dump the interesting fields of a cudaDeviceProp to stdout, one
// "name : value." line per field (memory sizes are in bytes, clockRate in kHz).
void printDeviceProp(const cudaDeviceProp &prop)
{
printf("Device Name : %s.\n", prop.name);
printf("totalGlobalMem : %lu.\n", prop.totalGlobalMem);
printf("sharedMemPerBlock : %lu.\n", prop.sharedMemPerBlock);
printf("regsPerBlock : %d.\n", prop.regsPerBlock);
printf("warpSize : %d.\n", prop.warpSize);
printf("memPitch : %lu.\n", prop.memPitch);
printf("maxThreadsPerBlock : %d.\n", prop.maxThreadsPerBlock);
printf("maxThreadsDim[0 - 2] : %d %d %d.\n", prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]);
printf("maxGridSize[0 - 2] : %d %d %d.\n", prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);
printf("totalConstMem : %lu.\n", prop.totalConstMem);
printf("major.minor : %d.%d.\n", prop.major, prop.minor);
printf("clockRate : %d.\n", prop.clockRate);
printf("textureAlignment : %lu.\n", prop.textureAlignment);
printf("deviceOverlap : %d.\n", prop.deviceOverlap);
printf("multiProcessorCount : %d.\n", prop.multiProcessorCount);
}

// Probe the CUDA runtime: print every device with compute capability >= 1.x
// and select the first such device. Returns true iff one was selected.
bool InitCUDA()
{
    // used to count the device numbers
    int count = 0;

    // get the cuda device count; check the call itself — on a machine with no
    // driver it fails and would otherwise leave `count` uninitialized
    if (cudaGetDeviceCount(&count) != cudaSuccess || count == 0) {
        fprintf(stderr, "There is no device.\n");
        return false;
    }

    // find the devices >= 1.X and remember the first usable one
    int firstSupported = -1;
    for (int i = 0; i < count; ++i) {
        cudaDeviceProp prop;
        if (cudaGetDeviceProperties(&prop, i) == cudaSuccess) {
            if (prop.major >= 1) {
                if (firstSupported < 0) firstSupported = i;
                printf("****** Device No%d*********************************\n", i);
                printDeviceProp(prop);
                printf("\n");
            }
        }
    }

    // if can't find the device
    if (firstSupported < 0) {
        fprintf(stderr, "There is no device supporting CUDA 1.x.\n");
        return false;
    }

    // BUG FIX: the original hard-coded cudaSetDevice(4), which fails on any
    // machine with fewer than five GPUs; select a device we actually verified,
    // and check that the selection succeeded.
    if (cudaSetDevice(firstSupported) != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed.\n");
        return false;
    }
    printf("Device No%d is selected.\n", firstSupported);

    return true;
}

 

 


版权声明

本文首发于 贺智超的博客(http://hezhichao.cn/ 或 http://hzc199307.github.io),转载请注明出处。

本文链接:http://hezhichao.cn/tech/gpu/cuda0/
永久链接:http://hzc199307.github.io/tech/gpu/cuda0/

文章目录
  1. 1. 初识CUDA
    1. 1.1. 部分函数介绍
    2. 1.2. cudaDeviceProp结构
    3. 1.3. 编译
  2. 2. 使用GPU实现数组平方和
    1. 2.1. 用GPU的简单实现
    2. 2.2. 改进1:多线程
    3. 2.3. 改进2:使内存的整体读取连续
    4. 2.4. 改进3:多block多thread
    5. 2.5. 改进4:使用block的共享内存
    6. 2.6. 改进5:加法树