본문으로 건너뛰기

OpenCL Example

Platform information

get_info.cpp
#include <CL/cl.h>
#include <cstdlib>
#include <iostream>

using namespace std;

int main(void) {
cl_int status;
size_t str_info_size;
char * str_info;
cl_uint uint_info;

cl_uint num_platforms;
cl_platform_id *platforms;

cl_uint num_devices;
cl_device_id *devices;

status = clGetPlatformIDs(0, NULL, &num_platforms);
if(status != CL_SUCCESS) {
cerr << "Failed to get the number of platforms." << endl;
return -1;
}

if(num_platforms > 0) {
cout << "* The number of platforms: " << num_platforms << endl;
} else {
cerr << "The number of platforms is zero." << endl;
return -1;
}

platforms
= (cl_platform_id *)malloc(sizeof(cl_platform_id) * num_platforms);
clGetPlatformIDs(num_platforms, platforms, NULL);

for(int i = 0; i < num_platforms; ++i) {
cout << " -----------------------------------------------" << endl;
cout << " Platform-" << i + 1 << endl;

clGetPlatformInfo(
platforms[i], CL_PLATFORM_NAME, 0, NULL, &str_info_size);
str_info = (char *)malloc(str_info_size);
clGetPlatformInfo(
platforms[i], CL_PLATFORM_NAME, str_info_size, str_info, NULL);
cout << " * Platform: " << str_info << endl;
free(str_info);

clGetPlatformInfo(
platforms[i], CL_PLATFORM_PROFILE, 0, NULL, &str_info_size);
str_info = (char *)malloc(str_info_size);
clGetPlatformInfo(
platforms[i], CL_PLATFORM_PROFILE, str_info_size, str_info, NULL);
cout << " * Profile: " << str_info << endl;
free(str_info);

clGetPlatformInfo(
platforms[i], CL_PLATFORM_VERSION, 0, NULL, &str_info_size);
str_info = (char *)malloc(str_info_size);
clGetPlatformInfo(
platforms[i], CL_PLATFORM_VERSION, str_info_size, str_info, NULL);
cout << " * " << str_info << endl;
free(str_info);

status = clGetDeviceIDs(
platforms[i], CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices);
if(status != CL_SUCCESS) {
cerr << "Failed to get the number of devices in the platform "
<< i + 1 << '.' << endl;
continue;
}

if(num_devices > 0) {
cout << " * The number of devices: " << num_devices << endl;
} else {
cerr << "Platform-" << i + 1 << " has no devices." << endl;
continue;
}

devices = (cl_device_id *)malloc(sizeof(cl_device_id) * num_devices);
clGetDeviceIDs(
platforms[i], CL_DEVICE_TYPE_ALL, num_devices, devices, NULL);

for(int j = 0; j < num_devices; ++j) {
cout << " ================================================="
<< endl;
cout << " Device-" << j + 1 << endl;

clGetDeviceInfo(
devices[j], CL_DEVICE_NAME, 0, NULL, &str_info_size);
str_info = (char *)malloc(str_info_size);
clGetDeviceInfo(
devices[j], CL_DEVICE_NAME, str_info_size, str_info, NULL);
cout << " * name: " << str_info << endl;
free(str_info);

clGetDeviceInfo(devices[j],
CL_DEVICE_MAX_COMPUTE_UNITS,
sizeof(uint_info),
&uint_info,
NULL);
cout << " * Max Compute Units: " << uint_info << endl;

clGetDeviceInfo(devices[j],
CL_DEVICE_MAX_CLOCK_FREQUENCY,
sizeof(uint_info),
&uint_info,
NULL);
cout << " * Max Clock freq: " << uint_info << endl;

cout << " ================================================="
<< endl;
}
cout << " -----------------------------------------------" << endl;
}

free(devices);
free(platforms);
return 0;
}
g++ -o get_info get_info.cpp -lOpenCL

SAXPY, CPU vs. GPU or Mali-GPU

saxpy_cpu_vs_gpu.cpp
#include <CL/cl.h>
#include <cstdlib>
#include <cstring>
#include <ctime>
#include <iostream>

using namespace std;

// Number of cl_float elements in each vector processed by the SAXPY demo.
const int LENGTH = 1000000;

// OpenCL C source of the "saxpy" kernel, compiled at run time by main().
// Each work-item updates the element at its own global id:
// Y[gid] = a * X[gid] + Y[gid].
const char *source
= "__kernel void saxpy(const __global float *X, \n"
" __global float *Y, \n"
" const float a) { \n"
" uint gid = get_global_id(0); \n"
" Y[gid] = a * X[gid] + Y[gid]; \n"
"} \n";

int main(void) {
cout << fixed;

cl_int status;
time_t t, t2 = 0;

cl_uint num_platforms = 1;
cl_platform_id *platforms
= (cl_platform_id *)malloc(sizeof(cl_platform_id) * num_platforms);
clGetPlatformIDs(num_platforms, platforms, NULL);

cl_uint num_devices = 2;
cl_device_id *devices
= (cl_device_id *)malloc(sizeof(cl_device_id) * num_devices);
clGetDeviceIDs(
platforms[0], CL_DEVICE_TYPE_ALL, num_devices, devices, NULL);

cl_context context
= clCreateContext(NULL, 1, &devices[0], NULL, NULL, NULL);

cl_command_queue queue = clCreateCommandQueue(context, devices[0], 0, NULL);

cl_program program = clCreateProgramWithSource(
context, 1, (const char **)&source, NULL, NULL);

status = clBuildProgram(program, 1, &devices[0], NULL, NULL, NULL);
if(status != CL_SUCCESS) {
cerr << "clBuildProgram failed: " << status << endl;
char log[0x10000];
clGetProgramBuildInfo(
program, devices[0], CL_PROGRAM_BUILD_LOG, 0x10000, log, NULL);
cerr << log << endl;
return -1;
}

cl_kernel kernel = clCreateKernel(program, "saxpy", NULL);

cl_float *p_x = (cl_float *)calloc(LENGTH, sizeof(cl_float));
cl_float *p_y = (cl_float *)calloc(LENGTH, sizeof(cl_float));
cl_float *p_ret = (cl_float *)calloc(LENGTH, sizeof(cl_float));
cl_float *p_cl_ret = (cl_float *)calloc(LENGTH, sizeof(cl_float));
cl_float a = 2.f;

srand(time(NULL));
for(int i = 0; i < LENGTH; ++i) {
p_x[i] = (cl_float)rand() / RAND_MAX;
p_y[i] = (cl_float)rand() / RAND_MAX;
}

cl_mem buf_x = clCreateBuffer(
context, CL_MEM_READ_ONLY, sizeof(cl_float) * LENGTH, NULL, NULL);
cl_mem buf_y = clCreateBuffer(
context, CL_MEM_READ_WRITE, sizeof(cl_float) * LENGTH, NULL, NULL);

t = clock();

for(int i = 0; i < LENGTH; ++i) { p_ret[i] = a * p_x[i] + p_y[i]; }

t = clock() - t;
cout << "CPU" << endl;
cout << " * Time (s): "
<< (double)t / CLOCKS_PER_SEC << endl;
cout << "---------------------------------------------------" << endl;

t = clock();

clEnqueueWriteBuffer(queue,
buf_x,
CL_TRUE,
0,
sizeof(cl_float) * LENGTH,
p_x,
0,
NULL,
NULL);
clEnqueueWriteBuffer(queue,
buf_y,
CL_TRUE,
0,
sizeof(cl_float) * LENGTH,
p_y,
0,
NULL,
NULL);

clSetKernelArg(kernel, 0, sizeof(cl_mem), &buf_x);
clSetKernelArg(kernel, 1, sizeof(cl_mem), &buf_y);
clSetKernelArg(kernel, 2, sizeof(cl_float), &a);

t = clock() - t;
t2 += t;
cout << "GPU" << endl;
cout << " * Writes buffers and sets arguments (s): "
<< (double)t / CLOCKS_PER_SEC << endl;

t = clock();

clEnqueueNDRangeKernel(
queue, kernel, 1, NULL, (size_t *)&LENGTH, NULL, 0, NULL, NULL);

clFinish(queue);

t = clock() - t;
t2 += t;
cout << " * Runnig a kernel (s): "
<< (double)t / CLOCKS_PER_SEC << endl;

t = clock();

clEnqueueReadBuffer(queue,
buf_y,
CL_TRUE,
0,
sizeof(cl_float) * LENGTH,
p_cl_ret,
0,
NULL,
NULL);

t = clock() - t;
t2 += t;
cout << " * Reads buffer (s): "
<< (double)t / CLOCKS_PER_SEC << endl;
cout << endl
<< " * Total time (s): "
<< (double)t2 / CLOCKS_PER_SEC << endl;

for(int i = 0; i < LENGTH; ++i) {
if(abs(p_ret[i] - p_cl_ret[i]) > 0.01) {
cout << "Failed to SAXPY" << endl;
}
}

clReleaseMemObject(buf_y);
clReleaseMemObject(buf_x);
free(p_cl_ret);
free(p_ret);
free(p_y);
free(p_x);
clReleaseKernel(kernel);
clReleaseProgram(program);
clReleaseCommandQueue(queue);
clReleaseContext(context);
free(devices);
free(platforms);

return 0;
}
g++ -o saxpy_cpu_vs_gpu saxpy_cpu_vs_gpu.cpp -lOpenCL