Commit ec5699f8 authored by Marvin Damschen's avatar Marvin Damschen
Browse files

Device self-scheduling using device-side queueing

parent 285a847d
......@@ -84,6 +84,9 @@ static int initialize()
}
context = clCreateContext(NULL, num, devices, NULL, NULL, &err);
#ifdef SELF_SCHEDULE
initOnDeviceCommandQueues(context);
#endif
threads_per_block = 128;
......@@ -105,6 +108,9 @@ static int shutdown()
// release resources
if( commandsCPU ) clReleaseCommandQueue( commandsCPU );
if( commandsGPU ) clReleaseCommandQueue( commandsGPU );
#ifdef SELF_SCHEDULE
releaseOnDeviceCommandQueues(context);
#endif
if( context ) clReleaseContext( context );
if( devices ) delete devices;
......
......@@ -88,6 +88,9 @@ static int initialize() {
}
context = clCreateContext(NULL, num, devices, NULL, NULL, &err);
#ifdef SELF_SCHEDULE
initOnDeviceCommandQueues(context);
#endif
int iCPU, iGPU;
getCPUGPUIds(&iCPU, &iGPU, devices, num);
......@@ -104,6 +107,9 @@ static int shutdown()
// release resources
if( commandsCPU ) clReleaseCommandQueue( commandsCPU );
if( commandsGPU ) clReleaseCommandQueue( commandsGPU );
#ifdef SELF_SCHEDULE
releaseOnDeviceCommandQueues(context);
#endif
if( context ) clReleaseContext( context );
if( devices ) delete devices;
......
......@@ -84,6 +84,9 @@ static int initialize()
}
context = clCreateContext(NULL, num, devices, NULL, NULL, &err);
#ifdef SELF_SCHEDULE
initOnDeviceCommandQueues(context);
#endif
threads_per_block = 128;
......@@ -105,6 +108,9 @@ static int shutdown()
// release resources
if( commandsCPU ) clReleaseCommandQueue( commandsCPU );
if( commandsGPU ) clReleaseCommandQueue( commandsGPU );
#ifdef SELF_SCHEDULE
releaseOnDeviceCommandQueues(context);
#endif
if( context ) clReleaseContext( context );
if( devices ) delete devices;
......
......@@ -145,7 +145,7 @@ int findIndexBin(__global double * CDF, int beginIndex, int endIndex, double val
*****************************/
__kernel void find_index_kernel(__global double * arrayX, __global double * arrayY,
__global double * CDF, __global double * u, __global double * xj,
__global double * yj, __global double * weights, int Nparticles
__global double * yj, __global double * weights, int Nparticles SELF_SCHEDULE_ARGS
){
int i = get_global_id(0);
......@@ -172,9 +172,9 @@ __kernel void find_index_kernel(__global double * arrayX, __global double * arra
}
barrier(CLK_GLOBAL_MEM_FENCE);
SCHEDULE_CHILD_1D(find_index_kernel(arrayX, arrayY, CDF, u, xj, yj, weights, Nparticles SELF_SCHEDULE_CHILD_ARGS))
}
__kernel void normalize_weights_kernel(__global double * weights, int Nparticles, __global double * partial_sums, __global double * CDF, __global double * u, __global int * seed)
__kernel void normalize_weights_kernel(__global double * weights, int Nparticles, __global double * partial_sums, __global double * CDF, __global double * u, __global int * seed SELF_SCHEDULE_ARGS)
{
int i = get_global_id(0);
int local_id = get_local_id(0);
......@@ -209,11 +209,11 @@ __kernel void normalize_weights_kernel(__global double * weights, int Nparticles
u[i] = u1 + i/((double)(Nparticles));
}
SCHEDULE_CHILD_1D(normalize_weights_kernel(weights, Nparticles, partial_sums, CDF, u, seed SELF_SCHEDULE_CHILD_ARGS))
}
__kernel void sum_kernel(__global double* partial_sums, int Nparticles)
__kernel void sum_kernel(__global double* partial_sums, int Nparticles SELF_SCHEDULE_ARGS)
{
int i = get_global_id(0);
......@@ -230,7 +230,7 @@ __kernel void sum_kernel(__global double* partial_sums, int Nparticles)
partial_sums[0] = sum;
}
SCHEDULE_CHILD_1D(sum_kernel(partial_sums, Nparticles SELF_SCHEDULE_CHILD_ARGS))
}
......@@ -252,7 +252,7 @@ __kernel void sum_kernel(__global double* partial_sums, int Nparticles)
* param11: IszY
* param12: Nfr
*****************************/
__kernel void likelihood_kernel(__global double * arrayX, __global double * arrayY,__global double * xj, __global double * yj, __global double * CDF, __global int * ind, __global int * objxy, __global double * likelihood, __global unsigned char * I, __global double * u, __global double * weights, const int Nparticles, const int countOnes, const int max_size, int k, const int IszY, const int Nfr, __global int *seed, __global double * partial_sums, __local double* buffer){
__kernel void likelihood_kernel(__global double * arrayX, __global double * arrayY,__global double * xj, __global double * yj, __global double * CDF, __global int * ind, __global int * objxy, __global double * likelihood, __global unsigned char * I, __global double * u, __global double * weights, const int Nparticles, const int countOnes, const int max_size, int k, const int IszY, const int Nfr, __global int *seed, __global double * partial_sums, __local double* buffer SELF_SCHEDULE_ARGS){
int block_id = get_group_id(0);
int thread_id = get_local_id(0);
int i = get_global_id(0);
......@@ -321,7 +321,8 @@ __kernel void likelihood_kernel(__global double * arrayX, __global double * arra
partial_sums[block_id] = buffer[0];
}
SCHEDULE_CHILD_WITH_LOCAL_PTRS_1D(^(local void *buffer){likelihood_kernel(arrayX, arrayY, xj, yj, CDF, ind, objxy, likelihood, I, u, weights, Nparticles, countOnes,
max_size, k, IszY, Nfr, seed, partial_sums, buffer SELF_SCHEDULE_CHILD_ARGS);} COMMA (uint)(sizeof(double)*THREADS_PER_BLOCK))
}//*/
#endif
......@@ -59,7 +59,7 @@ int findIndexBin(double * CDF, int beginIndex, int endIndex, double value)
* param6: yj
* param7: Nparticles
*****************************/
__kernel void particle_kernel(__global double * arrayX, __global double * arrayY, __global double * CDF, __global double * u, __global double * xj, __global double * yj, int Nparticles){
__kernel void particle_kernel(__global double * arrayX, __global double * arrayY, __global double * CDF, __global double * u, __global double * xj, __global double * yj, int Nparticles SELF_SCHEDULE_ARGS){
int i = get_global_id(0);
if(i < Nparticles){
......@@ -82,5 +82,5 @@ __kernel void particle_kernel(__global double * arrayX, __global double * arrayY
}
}
SCHEDULE_CHILD_1D(particle_kernel(arrayX, arrayY, CDF, u, xj, yj, Nparticles SELF_SCHEDULE_CHILD_ARGS))
}
\ No newline at end of file
......@@ -180,7 +180,7 @@ float tex1Dfetch(__read_only image2d_t img, int index){
*****************************/
__kernel void find_index_kernel(__global float * arrayX, __global float * arrayY,
__global float * CDF, __global float * u, __global float * xj,
__global float * yj, __global float * weights, int Nparticles
__global float * yj, __global float * weights, int Nparticles SELF_SCHEDULE_ARGS
){
int i = get_global_id(0);
......@@ -207,9 +207,9 @@ __kernel void find_index_kernel(__global float * arrayX, __global float * arrayY
}
barrier(CLK_GLOBAL_MEM_FENCE);
SCHEDULE_CHILD_1D(find_index_kernel(arrayX, arrayY, CDF, u, xj, yj, weights, Nparticles SELF_SCHEDULE_CHILD_ARGS))
}
__kernel void normalize_weights_kernel(__global float * weights, int Nparticles, __global float * partial_sums, __global float * CDF, __global float * u, __global int * seed)
__kernel void normalize_weights_kernel(__global float * weights, int Nparticles, __global float * partial_sums, __global float * CDF, __global float * u, __global int * seed SELF_SCHEDULE_ARGS)
{
int i = get_global_id(0);
int local_id = get_local_id(0);
......@@ -244,11 +244,11 @@ __kernel void normalize_weights_kernel(__global float * weights, int Nparticles,
u[i] = u1 + i/((float)(Nparticles));
}
SCHEDULE_CHILD_1D(normalize_weights_kernel(weights, Nparticles, partial_sums, CDF, u, seed SELF_SCHEDULE_CHILD_ARGS))
}
__kernel void sum_kernel(__global float* partial_sums, int Nparticles)
__kernel void sum_kernel(__global float* partial_sums, int Nparticles SELF_SCHEDULE_ARGS)
{
int i = get_global_id(0);
......@@ -265,7 +265,7 @@ __kernel void sum_kernel(__global float* partial_sums, int Nparticles)
partial_sums[0] = sum;
}
SCHEDULE_CHILD_1D(sum_kernel(partial_sums, Nparticles SELF_SCHEDULE_CHILD_ARGS))
}
......@@ -287,7 +287,7 @@ __kernel void sum_kernel(__global float* partial_sums, int Nparticles)
* param11: IszY
* param12: Nfr
*****************************/
__kernel void likelihood_kernel(__global float * arrayX, __global float * arrayY,__global float * xj, __global float * yj, __global float * CDF, __global int * ind, __global int * objxy, __global float * likelihood, __global unsigned char * I, __global float * u, __global float * weights, const int Nparticles, const int countOnes, const int max_size, int k, const int IszY, const int Nfr, __global int *seed, __global float * partial_sums, __local float* buffer){
__kernel void likelihood_kernel(__global float * arrayX, __global float * arrayY,__global float * xj, __global float * yj, __global float * CDF, __global int * ind, __global int * objxy, __global float * likelihood, __global unsigned char * I, __global float * u, __global float * weights, const int Nparticles, const int countOnes, const int max_size, int k, const int IszY, const int Nfr, __global int *seed, __global float * partial_sums, __local float* buffer SELF_SCHEDULE_ARGS){
int block_id = get_group_id(0);
int thread_id = get_local_id(0);
int i = get_global_id(0);
......@@ -356,7 +356,8 @@ __kernel void likelihood_kernel(__global float * arrayX, __global float * arrayY
partial_sums[block_id] = buffer[0];
}
SCHEDULE_CHILD_WITH_LOCAL_PTRS_1D(^(local void *buffer){likelihood_kernel(arrayX, arrayY, xj, yj, CDF, ind, objxy, likelihood, I, u, weights, Nparticles, countOnes,
max_size, k, IszY, Nfr, seed, partial_sums, buffer SELF_SCHEDULE_CHILD_ARGS);} COMMA (uint)(sizeof(float)*THREADS_PER_BLOCK))
}//*/
//#endif
......@@ -13,7 +13,7 @@ __kernel void dynproc_kernel (int iteration,
int startStep,
int border,
int HALO,
__global int* outputBuffer)
__global int* outputBuffer SELF_SCHEDULE_ARGS)
{
__local int prev[LOCAL_WORK];
__local int result[LOCAL_WORK];
......@@ -23,7 +23,7 @@ __kernel void dynproc_kernel (int iteration,
int tx = get_local_id(0);
if (get_global_id(0) > rows*cols)
return;
goto self_schedule;
// Each block finally computes result for a small block
// after N iterations.
......@@ -116,6 +116,9 @@ __kernel void dynproc_kernel (int iteration,
gpuResults[xidx] = result[tx];
}
self_schedule:
SCHEDULE_CHILD_1D(dynproc_kernel (iteration, gpuWall, gpuSrc, gpuResults, cols, rows, startStep, border, HALO, outputBuffer SELF_SCHEDULE_CHILD_ARGS))
return;
}
......
......@@ -110,6 +110,9 @@ int main(int argc, char** argv)
}
context = clCreateContext(NULL, num, devices, NULL, NULL, &err);
#ifdef SELF_SCHEDULE
initOnDeviceCommandQueues(context);
#endif
init(context, argc, argv);
// Pyramid parameters.
......@@ -193,5 +196,8 @@ int main(int argc, char** argv)
// delete[] data;
// delete[] wall;
// delete[] result;
#ifdef SELF_SCHEDULE
releaseOnDeviceCommandQueues(context);
#endif
return EXIT_SUCCESS;
}
......@@ -3,7 +3,7 @@ export LC_ALL=C
runsperrun=2
date=$(date +%Y%m%d)
for percentWorkCPU in `seq -50 50 100`;
for percentWorkCPU in `seq -150 50 -150`;
do
echo $(date +%H:%M) - Cleaning...
......
......@@ -20,7 +20,7 @@
__kernel void
extract_kernel(long d_Ne,
__global fp* d_I){ // pointer to input image (DEVICE GLOBAL MEMORY)
__global fp* d_I SELF_SCHEDULE_ARGS){ // pointer to input image (DEVICE GLOBAL MEMORY)
// indexes
int bx = (get_group_id(0) + (get_global_offset(0)/get_local_size(0))); // get current horizontal block index
......@@ -34,7 +34,7 @@ extract_kernel(long d_Ne,
}
SCHEDULE_CHILD_1D(extract_kernel(d_Ne, d_I SELF_SCHEDULE_CHILD_ARGS))
}
//========================================================================================================================================================================================================200
......@@ -45,7 +45,7 @@ __kernel void
prepare_kernel( long d_Ne,
__global fp* d_I, // pointer to output image (DEVICE GLOBAL MEMORY)
__global fp* d_sums, // pointer to input image (DEVICE GLOBAL MEMORY)
__global fp* d_sums2){
__global fp* d_sums2 SELF_SCHEDULE_ARGS){
// indexes
int bx = (get_group_id(0) + (get_global_offset(0)/get_local_size(0))); // get current horizontal block index
......@@ -60,7 +60,7 @@ prepare_kernel( long d_Ne,
}
SCHEDULE_CHILD_1D(prepare_kernel(d_Ne, d_I, d_sums, d_sums2 SELF_SCHEDULE_CHILD_ARGS))
}
//========================================================================================================================================================================================================200
......@@ -73,7 +73,7 @@ reduce_kernel( long d_Ne, // number of elements in array
int d_mul, // increment
__global fp* d_sums, // pointer to partial sums variable (DEVICE GLOBAL MEMORY)
__global fp* d_sums2,
int gridDim){
int gridDim SELF_SCHEDULE_ARGS){
// indexes
int bx = (get_group_id(0) + (get_global_offset(0)/get_local_size(0))); // get current horizontal block index
......@@ -173,7 +173,7 @@ reduce_kernel( long d_Ne, // number of elements in array
}
}
SCHEDULE_CHILD_1D(reduce_kernel(d_Ne, d_no, d_mul, d_sums, d_sums2, gridDim SELF_SCHEDULE_CHILD_ARGS))
}
//========================================================================================================================================================================================================200
......@@ -197,7 +197,7 @@ srad_kernel(fp d_lambda,
__global fp* d_dW,
fp d_q0sqr,
__global fp* d_c,
__global fp* d_I){
__global fp* d_I SELF_SCHEDULE_ARGS){
// indexes
int bx = (get_group_id(0) + (get_global_offset(0)/get_local_size(0))); // get current horizontal block index
......@@ -263,7 +263,7 @@ srad_kernel(fp d_lambda,
}
SCHEDULE_CHILD_1D(srad_kernel(d_lambda, d_Nr, d_Nc, d_Ne, d_iN, d_iS, d_jE, d_jW, d_dN, d_dS, d_dE, d_dW, d_q0sqr, d_c, d_I SELF_SCHEDULE_CHILD_ARGS))
}
//========================================================================================================================================================================================================200
......@@ -286,7 +286,7 @@ srad2_kernel( fp d_lambda,
__global fp* d_dE,
__global fp* d_dW,
__global fp* d_c,
__global fp* d_I){
__global fp* d_I SELF_SCHEDULE_ARGS){
// indexes
int bx = (get_group_id(0) + (get_global_offset(0)/get_local_size(0))); // get current horizontal block index
......@@ -323,7 +323,7 @@ srad2_kernel( fp d_lambda,
}
SCHEDULE_CHILD_1D(srad2_kernel(d_lambda, d_Nr, d_Nc, d_Ne, d_iN, d_iS, d_jE, d_jW, d_dN, d_dS, d_dE, d_dW, d_c, d_I SELF_SCHEDULE_CHILD_ARGS))
}
//========================================================================================================================================================================================================200
......@@ -332,7 +332,7 @@ srad2_kernel( fp d_lambda,
__kernel void
compress_kernel(long d_Ne,
__global fp* d_I){ // pointer to output image (DEVICE GLOBAL MEMORY)
__global fp* d_I SELF_SCHEDULE_ARGS){ // pointer to output image (DEVICE GLOBAL MEMORY)
// indexes
int bx = (get_group_id(0) + (get_global_offset(0)/get_local_size(0))); // get current horizontal block index
......@@ -346,7 +346,7 @@ compress_kernel(long d_Ne,
}
SCHEDULE_CHILD_1D(compress_kernel(d_Ne, d_I SELF_SCHEDULE_CHILD_ARGS))
}
//========================================================================================================================================================================================================200
......
......@@ -68,6 +68,9 @@ main( int argc,
}
cl_context context = clCreateContext(NULL, num, devices, NULL, NULL, &error);
#ifdef SELF_SCHEDULE
initOnDeviceCommandQueues(context);
#endif
//======================================================================================================================================================150
// VARIABLES
......@@ -260,6 +263,9 @@ main( int argc,
clSVMFree(context, iS);
clSVMFree(context, jW);
clSVMFree(context, jE);
#ifdef SELF_SCHEDULE
releaseOnDeviceCommandQueues(context);
#endif
clReleaseContext(context);
time6 = get_time();
......
......@@ -22,7 +22,7 @@ __kernel void pgain_kernel(
int num,
int dim,
long x,
int K){
int K SELF_SCHEDULE_ARGS){
/* block ID and global thread ID */
const int thread_id = get_global_id(0);
const int local_id = get_local_id(0);
......@@ -60,6 +60,6 @@ __kernel void pgain_kernel(
}
}
SCHEDULE_CHILD_1D(pgain_kernel(p, coord_d, work_mem_d, center_table_d, switch_membership_d, num, dim, x, K SELF_SCHEDULE_CHILD_ARGS))
}
......@@ -924,7 +924,10 @@ int main(int argc, char **argv)
}
context = clCreateContext(NULL, num, devices, NULL, NULL, &err);
initKernel(context, dim);
#ifdef SELF_SCHEDULE
initOnDeviceCommandQueues(context);
#endif
initKernel(context, dim);
srand48(SEED);
PStream* stream;
......
......@@ -73,6 +73,9 @@ void freeDevMem(){
clReleaseProgram(program);
clReleaseCommandQueue(commandsCPU);
clReleaseCommandQueue(commandsGPU);
#ifdef SELF_SCHEDULE
releaseOnDeviceCommandQueues(context);
#endif
}
float pgain( long x, Points *points, float z, long int *numcenters, int kmax, bool *is_center, int *center_table, char *switch_membership,
......
// OpenCL 2.0 device-side self-scheduling helpers (device enqueue).
// A parent kernel is launched with a small initial NDRange and re-enqueues
// itself in chunks ("child" launches) until the shared atomic work counter
// __workLeft is exhausted. Both CPU and GPU devices pull chunks from the
// same counter, which lives in fine-grained SVM with atomics support.
// normally, offset is not applied to group_id, but this is necessary for splitting work onto multiple devices
#define get_group_id(dim) (get_group_id(dim) + (get_global_offset(dim)/get_local_size(dim)))
// Lets a comma be smuggled through as part of a single macro argument.
#define COMMA ,
#ifdef SELF_SCHEDULE
// Number of work-items per work-group that attempt to enqueue a child grid.
#define NUM_CHILDREN_PER_WG 1
// Extra trailing parameters injected into every self-scheduling kernel's
// signature: the dimension along which work is divided, the total work-item
// count in that dimension, and the SVM atomic counter of work-items that have
// not yet been scheduled. Leading comma because these append to an existing list.
#define SELF_SCHEDULE_ARGS , const int __divideWorkDim, const size_t __totalWorkItems, __global atomic_int *__workLeft
// Forwards the injected parameters unchanged to the child launch.
#define SELF_SCHEDULE_CHILD_ARGS , __divideWorkDim, __totalWorkItems, __workLeft
// different approach to dividing work: __global_size[__divideWorkDim] = x*get_local_size(__divideWorkDim);
// 1D variant. One work-item per group (local id < NUM_CHILDREN_PER_WG) claims
// up to get_global_size(0) work-items by atomically decrementing __workLeft;
// the child grid is enqueued at offset (__totalWorkItems - leftBefore) with
// size min(get_global_size(0), leftBefore) — i.e. ndrange_1D(offset, size, local).
// NOTE(review): the enqueue_kernel return value is ignored, and the child
// global size (leftBefore in the tail case) is assumed to be a multiple of
// get_local_size(0) — verify against the host-side initial_work_size setup.
#define SCHEDULE_CHILD_WITH_LOCAL_PTRS_1D(kernel_block_with_local_ptrs)\
if (get_local_id(0) < NUM_CHILDREN_PER_WG) {\
const int leftBefore = atomic_fetch_sub_explicit(__workLeft, get_global_size(0), memory_order_relaxed, memory_scope_all_svm_devices);\
if (leftBefore > 0) {\
const ndrange_t child_ndrange = ndrange_1D((size_t)(__totalWorkItems-leftBefore), ((get_global_size(0) < leftBefore) ? get_global_size(0) : leftBefore), get_local_size(0));\
enqueue_kernel(get_default_queue(), CLK_ENQUEUE_FLAGS_NO_WAIT, child_ndrange, kernel_block_with_local_ptrs);\
} else atomic_store_explicit(__workLeft, 0, memory_order_relaxed, memory_scope_all_svm_devices);\
}
// Convenience wrapper for kernels that take no __local pointer arguments:
// the kernel call is wrapped into a block for enqueue_kernel.
#define SCHEDULE_CHILD_1D(kernel_call)\
SCHEDULE_CHILD_WITH_LOCAL_PTRS_1D(^{kernel_call;})
// N-dimensional variant: only the __divideWorkDim dimension is offset and
// resized; the other dimensions keep the parent's global and local sizes.
#define SCHEDULE_CHILD_WITH_LOCAL_PTRS_ND(kernel_block_with_local_ptrs)\
if (get_local_id(__divideWorkDim) < NUM_CHILDREN_PER_WG) {\
const int leftBefore = atomic_fetch_sub_explicit(__workLeft, get_global_size(__divideWorkDim), memory_order_relaxed, memory_scope_all_svm_devices);\
if (leftBefore > 0) {\
size_t __offset[3] = {0, 0, 0};\
__offset[__divideWorkDim] = (size_t)(__totalWorkItems-leftBefore);\
size_t __global_size[3] = {get_global_size(0), get_global_size(1), get_global_size(2)};\
__global_size[__divideWorkDim] = ((__global_size[__divideWorkDim] < leftBefore) ? __global_size[__divideWorkDim] : leftBefore);\
const size_t __local_size[3] = {get_local_size(0), get_local_size(1), get_local_size(2)};\
const ndrange_t child_ndrange = ndrange_3D(__offset, __global_size, __local_size);\
enqueue_kernel(get_default_queue(), CLK_ENQUEUE_FLAGS_NO_WAIT, child_ndrange, kernel_block_with_local_ptrs);\
} else atomic_store_explicit(__workLeft, 0, memory_order_relaxed, memory_scope_all_svm_devices);\
}
#define SCHEDULE_CHILD_ND(kernel_call)\
SCHEDULE_CHILD_WITH_LOCAL_PTRS_ND(^{kernel_call;})
#else
// Self-scheduling disabled: the injected parameters and all child-launch
// sites expand to nothing, so the kernels compile to their plain form.
#define SELF_SCHEDULE_ARGS
#define SCHEDULE_CHILD_WITH_LOCAL_PTRS_1D(kernel_block_with_local_ptrs)
#define SCHEDULE_CHILD_1D(kernel_call)
#define SCHEDULE_CHILD_WITH_LOCAL_PTRS_ND(kernel_block_with_local_ptrs)
#define SCHEDULE_CHILD_ND(kernel_call)
#define SELF_SCHEDULE_CHILD_ARGS
#endif
......@@ -63,6 +63,8 @@ cl_program getBuiltProgramFromFile(cl_context context, const char *sourceFileNam
}
char clOptions[512] = "-cl-std=CL2.0";
if (commandsCPUOnDevice && commandsGPUOnDevice)
sprintf(clOptions + strlen(clOptions), " -DSELF_SCHEDULE");
if (options)
sprintf(clOptions + strlen(clOptions), " %s", options);
......@@ -94,6 +96,31 @@ cl_program getBuiltProgramFromFile(cl_context context, const char *sourceFileNam
return program;
}
/*
 * Create the on-device (device-side) default command queues used for kernel
 * self-scheduling — one per device — and allocate the shared SVM work counter.
 *
 * Side effects: sets the globals workLeft, commandsCPUOnDevice and
 * commandsGPUOnDevice. Pair with releaseOnDeviceCommandQueues().
 *
 * Returns CL_SUCCESS on success, or the first OpenCL error encountered.
 */
cl_int initOnDeviceCommandQueues(cl_context context) {
    cl_int err;
    cl_uint num;
    /* Original ignored these return values; a garbage `num` would make the
     * VLA and the devices[] query below undefined behavior. */
    err = clGetContextInfo(context, CL_CONTEXT_NUM_DEVICES, sizeof(cl_uint), &num, NULL);
    if (err != CL_SUCCESS)
        return err;
    /* devices[1] is accessed below; require at least two devices.
     * NOTE(review): assumes devices[0] is the CPU and devices[1] the GPU,
     * matching the context creation order — confirm against the callers. */
    if (num < 2)
        return CL_INVALID_DEVICE;
    cl_device_id devices[num];
    err = clGetContextInfo(context, CL_CONTEXT_DEVICES, sizeof(cl_device_id)*num, devices, NULL);
    if (err != CL_SUCCESS)
        return err;
    /* Fine-grained SVM with atomics so host and both devices can share the
     * remaining-work counter without map/unmap. */
    workLeft = (atomic_int*) clSVMAlloc(context, CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS, sizeof(atomic_int), 0);
    if (!workLeft)
        return CL_OUT_OF_HOST_MEMORY; /* clSVMAlloc returns NULL on failure */
    atomic_init(workLeft, 0);
    /* Out-of-order + on-device + on-device-default: these become each
     * device's get_default_queue() for enqueue_kernel(). */
    cl_queue_properties cmdQueuePropsDevice[] = {CL_QUEUE_PROPERTIES, (cl_command_queue_properties)CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_ON_DEVICE | CL_QUEUE_ON_DEVICE_DEFAULT, 0};
    commandsCPUOnDevice = clCreateCommandQueueWithProperties(context, devices[0], cmdQueuePropsDevice, &err);
    if (err != CL_SUCCESS)
        return err;
    commandsGPUOnDevice = clCreateCommandQueueWithProperties(context, devices[1], cmdQueuePropsDevice, &err);
    return err;
}
/*
 * Release the device-side command queues created by
 * initOnDeviceCommandQueues() and free the shared SVM work counter.
 *
 * Fixes two defects in the original: the workLeft SVM buffer allocated in
 * initOnDeviceCommandQueues() was never freed (leak — the otherwise-unused
 * `context` parameter is needed exactly for clSVMFree), and
 * clReleaseCommandQueue was called on possibly-NULL handles, which is
 * CL_INVALID_COMMAND_QUEUE per the spec.
 */
void releaseOnDeviceCommandQueues(cl_context context) {
    if (commandsCPUOnDevice)
        clReleaseCommandQueue(commandsCPUOnDevice);
    if (commandsGPUOnDevice)
        clReleaseCommandQueue(commandsGPUOnDevice);
    commandsCPUOnDevice = 0;
    commandsGPUOnDevice = 0;
    if (workLeft) {
        clSVMFree(context, workLeft);
        workLeft = NULL;
    }
}
cl_int getDevicesfromPlatformIdx(int platformIdx, cl_uint* numDevices, cl_device_id** devices) {
cl_int status;
......@@ -163,6 +190,47 @@ cl_int launchKernelFused(cl_command_queue commandsCPU, cl_command_queue commands
cl_int launchKernelFusedDivideWorkInDim(cl_command_queue commandsCPU, cl_command_queue commandsGPU, cl_kernel kernel, int workDim, int divideWorkDim, int* percentWorkCPU,
const size_t global_work_size[], const size_t local_work_size[], int launchKernelSynchronous, double* timeCPU, double* timeGPU, double* timeHost) {
totalMeasurements++;
size_t initial_work_size[3] = {global_work_size[0], workDim > 1 ? global_work_size[1] : 1, workDim > 2 ? global_work_size[2] : 1};
if (commandsCPUOnDevice && commandsGPUOnDevice) { // prepare initial work groups for device-side queueing
initial_work_size[divideWorkDim] = local_work_size[divideWorkDim]*16;
if (initial_work_size[divideWorkDim] > global_work_size[divideWorkDim])
initial_work_size[divideWorkDim] = global_work_size[divideWorkDim];
atomic_store(workLeft, (int)(global_work_size[divideWorkDim]-initial_work_size[divideWorkDim]));
#ifdef DEBUG
printf("\n### launchKernelFused -- self-scheduling ###\n");
printf("divideWorkDim: %d, global_work_size[divideWorkDim]: %d, initial_work_size[divideWorkDim]: %d\n", divideWorkDim, global_work_size[divideWorkDim], initial_work_size[divideWorkDim]);
#endif
cl_uint kernel_num_args;
clGetKernelInfo(kernel, CL_KERNEL_NUM_ARGS, sizeof(cl_uint), &kernel_num_args, NULL);
cl_int err;
err = clSetKernelArg(kernel, kernel_num_args-3, sizeof(int), &divideWorkDim);
if (err != CL_SUCCESS) {
#ifdef DEBUG
printf("\n### launchKernelFused -- failed to set divideWorkDim argument (arg #%d)\n", kernel_num_args-3);
#endif
return err;
}
err = clSetKernelArg(kernel, kernel_num_args-2, sizeof(size_t), &global_work_size[divideWorkDim]);
if (err != CL_SUCCESS) {
#ifdef DEBUG
printf("\n### launchKernelFused -- failed to set global_work_size argument (arg #%d)\n", kernel_num_args-2);
#endif
return err;
}
err = clSetKernelArgSVMPointer(kernel, kernel_num_args-1, workLeft);
if (err != CL_SUCCESS) {
#ifdef DEBUG
printf("\n### launchKernelFused -- failed to set workLeft argument (arg #%d)\n", kernel_num_args-1);
#endif
return err;
}
*percentWorkCPU = ((initial_work_size[divideWorkDim] >= 2*local_work_size[divideWorkDim] && initial_work_size[divideWorkDim] < global_work_size[divideWorkDim]) ? 50 : 100);
}
#ifdef DEBUG
else
printf("\n### launchKernelFused -- divide work on host ###\n");
......@@ -177,10 +245,10 @@ cl_int launchKernelFusedDivideWorkInDim(cl_command_queue commandsCPU, cl_command
}
cl_int err = 0;
if (*percentWorkCPU < 0) // NOTE percentWorkCPU < 0 triggers host-side profiling. Then, -percentWorkCPU of total workitems will be used for profiling
err = launchKernelFusedParallelAutoDivideWorkInDim(commandsCPU, commandsGPU, kernel, workDim, divideWorkDim, percentWorkCPU, global_work_size, local_work_size, launchKernelSynchronous, timeCPU, timeGPU, timeHost);
if (*percentWorkCPU < 0)
err = launchKernelFusedParallelAutoDivideWorkInDim(commandsCPU, commandsGPU, kernel, workDim, divideWorkDim, percentWorkCPU, ((commandsCPUOnDevice && commandsGPUOnDevice) ? initial_work_size : global_work_size), local_work_size, launchKernelSynchronous, timeCPU, timeGPU, timeHost);
else
err = launchKernelFusedParallel(commandsCPU, commandsGPU, kernel, workDim, divideWorkDim, percentWorkCPU, global_work_size, local_work_size, launchKernelSynchronous, timeCPU, timeGPU, timeHost);
err = launchKernelFusedParallel(commandsCPU, commandsGPU, kernel, workDim, divideWorkDim, percentWorkCPU, ((commandsCPUOnDevice && commandsGPUOnDevice) ? initial_work_size : global_work_size), local_work_size, launchKernelSynchronous, timeCPU, timeGPU, timeHost);
#if defined (__CL_PRINT_STATS) || defined (__CL_PRINT_DAT) || defined (__CL_MEASURE_ALL_KERNELS)
clGetKernelInfo(kernel, CL_KERNEL_FUNCTION_NAME, 92, kernelName[currentKernelRun], NULL);
......
......@@ -27,6 +27,10 @@ char* readSourceFromFileName(const char* fileName);
cl_program getBuiltProgramFromFile(cl_context context, const char *sourceFileName, char *options, cl_int *err);
cl_int initOnDeviceCommandQueues(cl_context context);
void releaseOnDeviceCommandQueues(cl_context context);
cl_int getDevicesfromPlatformIdx(int platformIdx, cl_uint* numDevices, cl_device_id** devices);
void getCPUGPUIds(int* iCPU, int* iGPU, cl_device_id* devices, cl_uint numDevices);
......
#ifdef SELF_SCHEDULE
override SELF_SCHEDULE = -DSELF_SCHEDULE
#endif
OPENCL_INC=/opt/intel/opencl/include
OPENCL_LIB=/opt/intel/opencl -lOpenCL
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment