Commit 2bae44dd authored by Marvin Damschen

Device self-scheduling using atomic counting

parent ec5699f8
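
This commit replaces the on-device command-queue / child-kernel-enqueue scheduling with a persistent work loop driven by an atomic counter held in fine-grained SVM: each work-group claims the next block of flattened work-item IDs with atomic_fetch_add and loops until the total work count is reached. The host-side entry points are renamed from initOnDeviceCommandQueues/releaseOnDeviceCommandQueues to initSelfScheduling/cleanupSelfScheduling, and the kernels are extended with a done_local counter plus a pointer to the shared global_work_state. As a rough hand-expanded sketch only (the kernel name and body below are hypothetical, and the pseudo-dimension-3 index remapping used by the real macros is simplified to dimension 0), a kernel wrapped with the new SELF_SCHEDULE_ARGS / SCHEDULE_CHILD_ND macros behaves roughly like:

// Illustrative expansion; assumes the global_work_state typedef added to the common header below.
__kernel void exampleKernel(__global int *data,
                            __global global_work_state *__globalWorkState) // added by SELF_SCHEDULE_ARGS
{
    __local unsigned int done_local;
    // One work-item per group claims a block of flat work-item IDs for the whole group.
    if (get_local_id(0) == get_local_size(0) - 1)
        done_local = atomic_fetch_add_explicit(&__globalWorkState->done,
                         (uint)get_local_size(0),
                         memory_order_relaxed, memory_scope_all_svm_devices);
    barrier(CLK_LOCAL_MEM_FENCE);
    while (done_local < __globalWorkState->totalWorkItems) {
        // Original kernel body runs here; the header macros recompute
        // get_global_id()/get_group_id() from the claimed flat ID.
        if (done_local + get_local_id(0) < __globalWorkState->totalWorkItems)
            data[done_local + get_local_id(0)] += 1;
        // SCHEDULE_CHILD_ND: claim the next block and iterate instead of
        // enqueuing a child NDRange as the previous implementation did.
        if (get_local_id(0) == get_local_size(0) - 1)
            done_local = atomic_fetch_add_explicit(&__globalWorkState->done,
                             (uint)get_local_size(0),
                             memory_order_relaxed, memory_scope_all_svm_devices);
        barrier(CLK_LOCAL_MEM_FENCE);
    }
}
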
......@@ -1870,7 +1870,7 @@ main( int argc,
cl_context context = clCreateContext(NULL, num, devices, NULL, NULL, &error);
#ifdef SELF_SCHEDULE
initOnDeviceCommandQueues(context);
initSelfScheduling(context);
#endif
......@@ -2432,7 +2432,7 @@ main( int argc,
free(mem);
#ifdef SELF_SCHEDULE
releaseOnDeviceCommandQueues(context);
cleanupSelfScheduling(context);
#endif
clReleaseContext(context);
return EXIT_SUCCESS;
......
......@@ -81,11 +81,11 @@ main( int argc, char** argv)
context = clCreateContext(NULL, num_devices, device_list, NULL, NULL, &err);
#ifdef SELF_SCHEDULE
initOnDeviceCommandQueues(context);
initSelfScheduling(context);
#endif
setup(context, argc, argv);
#ifdef SELF_SCHEDULE
releaseOnDeviceCommandQueues(context);
cleanupSelfScheduling(context);
#endif
clReleaseContext(context);
}
......
......@@ -199,7 +199,7 @@ int main(int argc, char * argv[])
cl_context context = clCreateContext(NULL, num, devices, NULL, NULL, &err);
#ifdef SELF_SCHEDULE
initOnDeviceCommandQueues(context);
initSelfScheduling(context);
#endif
int no_of_nodes;
......@@ -308,7 +308,7 @@ int main(int argc, char * argv[])
clSVMFree(context, h_graph_visited);
#ifdef SELF_SCHEDULE
releaseOnDeviceCommandQueues(context);
cleanupSelfScheduling(context);
#endif
}
......
......@@ -247,7 +247,7 @@ int main(int argc, char** argv){
exit(1);
}
#ifdef SELF_SCHEDULE
initOnDeviceCommandQueues(context);
initSelfScheduling(context);
#endif
int iCPU, iGPU;
......@@ -417,7 +417,7 @@ int main(int argc, char** argv){
std::cout << "Cleaning up..." << std::endl;
// TODO
#ifdef SELF_SCHEDULE
releaseOnDeviceCommandQueues(context);
cleanupSelfScheduling(context);
#endif
std::cout << "Done..." << std::endl;
clSVMFree(context, h_ff_variable);
......
......@@ -331,7 +331,7 @@ void horizontalStep (__local struct TransformBuffer *buffer,
const int prevOffset,
const int midOffset,
const int nextOffset,
int flag)
int flag, const uint done_local, global_work_state *__globalWorkState)
{
const int STEPS = count / buffer->SIZE_X;
const int finalCount = count % buffer->SIZE_X;
......@@ -370,14 +370,14 @@ void horizontalStep (__local struct TransformBuffer *buffer,
void forEachHorizontalOdd(__local struct TransformBuffer *buffer,
const int firstLine,
const int numLines,
int flag)
int flag, const uint done_local, global_work_state *__globalWorkState)
{
const int count = numLines * buffer->VERTICAL_STRIDE - 1 ;
const int prevOffset = firstLine * buffer->VERTICAL_STRIDE ;
const int centerOffset = prevOffset + buffer->ODD_OFFSET ;
const int nextOffset = prevOffset + 1;
horizontalStep (buffer, count, prevOffset, centerOffset, nextOffset, flag);
horizontalStep (buffer, count, prevOffset, centerOffset, nextOffset, flag, done_local, __globalWorkState);
}
......@@ -385,14 +385,14 @@ void forEachHorizontalOdd(__local struct TransformBuffer *buffer,
void forEachHorizontalEven(__local struct TransformBuffer *buffer,
const int firstLine,
const int numLines,
int flag)
int flag, const uint done_local, global_work_state *__globalWorkState)
{
const int count = numLines * buffer->VERTICAL_STRIDE - 1 ;
const int centerOffset = firstLine * buffer->VERTICAL_STRIDE + 1;
const int prevOffset = firstLine * buffer->VERTICAL_STRIDE + buffer->ODD_OFFSET;
const int nextOffset = prevOffset + 1;
horizontalStep (buffer, count, prevOffset, centerOffset, nextOffset, flag);
horizontalStep (buffer, count, prevOffset, centerOffset, nextOffset, flag, done_local, __globalWorkState);
}
......@@ -502,7 +502,7 @@ void initColumn(__local struct FDWT53 * fdwt53,
const int sizeY,
const int colIndex,
const int firstY,
struct VerticalDWTPixelIO *pIO)
struct VerticalDWTPixelIO *pIO, const uint done_local, global_work_state *__globalWorkState)
{
column->CHECKED_LOADER = CHECKED;
column->offset = getColumnOffset(colIndex, &fdwt53->buffer);
......@@ -566,7 +566,7 @@ void transform(__local struct FDWT53 *fdwt53,
__global int * out,
const int sizeX,
const int sizeY,
const int winSteps)
const int winSteps, const uint done_local, global_work_state *__globalWorkState)
{
// info about one main and one boundary columns processed by this thread
struct FDWT53Column column; column.CHECKED_LOADER = CHECK_LOADS;
......@@ -577,7 +577,7 @@ void transform(__local struct FDWT53 *fdwt53,
// Initialize all column info: initialize loaders, compute offset of
// column in shared buffer and initialize loader of column.
const int firstY = get_group_id(1) * fdwt53->WIN_SIZE_Y * winSteps;
initColumn(fdwt53, &column, CHECK_LOADS, in, sizeX, sizeY, get_local_id(0), firstY, &pIO);
initColumn(fdwt53, &column, CHECK_LOADS, in, sizeX, sizeY, get_local_id(0), firstY, &pIO, done_local, __globalWorkState);
// first 3 threads initialize boundary columns, others do not use them
......@@ -587,7 +587,7 @@ void transform(__local struct FDWT53 *fdwt53,
const int colId = get_local_id(0) + ((get_local_id(0)== 0) ? fdwt53->WIN_SIZE_X : -3);
// initialize the column
initColumn (fdwt53, &boundaryColumn, CHECK_LOADS, in, sizeX, sizeY, colId, firstY, &pIO_b);
initColumn (fdwt53, &boundaryColumn, CHECK_LOADS, in, sizeX, sizeY, colId, firstY, &pIO_b, done_local, __globalWorkState);
}
// index of column which will be written into output by this thread
......@@ -620,11 +620,11 @@ void transform(__local struct FDWT53 *fdwt53,
int flag = 0; //flag = 0 execute Forward53Predict, flag = 1 execute Forward53Update
forEachHorizontalOdd(&(fdwt53->buffer), 2, fdwt53->WIN_SIZE_Y, flag);
forEachHorizontalOdd(&(fdwt53->buffer), 2, fdwt53->WIN_SIZE_Y, flag, done_local, __globalWorkState);
barrier(CLK_LOCAL_MEM_FENCE);
flag = 1;
forEachHorizontalEven(&(fdwt53->buffer), 2, fdwt53->WIN_SIZE_Y, flag);
forEachHorizontalEven(&(fdwt53->buffer), 2, fdwt53->WIN_SIZE_Y, flag, done_local, __globalWorkState);
barrier(CLK_LOCAL_MEM_FENCE);
......@@ -695,17 +695,17 @@ __kernel void cl_fdwt53Kernel(__global const int * const in,
if(atBottomBoudary)
{
// near bottom boundary => check both writing and reading
transform(&fdwt53, true, true, in, out, sx, sy, steps);
transform(&fdwt53, true, true, in, out, sx, sy, steps, done_local, __globalWorkState);
}
else if(atRightBoudary)
{
// near right boundary only => check writing only
transform(&fdwt53, false, true, in, out, sx, sy, steps);
transform(&fdwt53, false, true, in, out, sx, sy, steps, done_local, __globalWorkState);
}
else
{
// no nearby boundary => check nothing
transform(&fdwt53, false, false, in, out, sx, sy, steps);
transform(&fdwt53, false, false, in, out, sx, sy, steps, done_local, __globalWorkState);
}
SCHEDULE_CHILD_ND(cl_fdwt53Kernel(in, out, sx, sy, steps, WIN_SIZE_X, WIN_SIZE_Y SELF_SCHEDULE_CHILD_ARGS))
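// With the atomic-counting scheme in this commit, the SCHEDULE_CHILD_ND call
// above no longer enqueues a child NDRange; per the updated macros in the
// common header, it bumps __globalWorkState->done by the work-group size,
// re-reads done_local behind a barrier, and closes the while loop opened by
// SELF_SCHEDULE_ARGS, so the group keeps claiming work until totalWorkItems
// is reached.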
......
......@@ -770,7 +770,7 @@ int main(int argc, char **argv)
context = clCreateContext(NULL, num, devices, NULL, NULL, &errNum);
#ifdef SELF_SCHEDULE
initOnDeviceCommandQueues(context);
initSelfScheduling(context);
#endif
if (errNum != CL_SUCCESS)
{
......@@ -869,7 +869,7 @@ int main(int argc, char **argv)
clReleaseKernel(c_CopySrcToComponents);
clReleaseKernel(c_CopySrcToComponent);
#ifdef SELF_SCHEDULE
releaseOnDeviceCommandQueues(context);
cleanupSelfScheduling(context);
#endif
clSVMFree(context, d->srcImg);
......
......@@ -104,7 +104,7 @@ int main(int argc, char *argv[]) {
context = clCreateContext(NULL, num, devices, NULL, NULL, &err);
#ifdef SELF_SCHEDULE
initOnDeviceCommandQueues(context);
initSelfScheduling(context);
#endif
if(size < 1)
......@@ -180,7 +180,7 @@ int main(int argc, char *argv[]) {
clSVMFree(context, b);
free(finalVec);
#ifdef SELF_SCHEDULE
releaseOnDeviceCommandQueues(context);
cleanupSelfScheduling(context);
#endif
cl_cleanup();
//OpenClGaussianElimination(context,timing);
......
......@@ -84,7 +84,7 @@ main( int argc,
cl_context context = clCreateContext(NULL, num, devices, NULL, NULL, &err);
#ifdef SELF_SCHEDULE
initOnDeviceCommandQueues(context);
initSelfScheduling(context);
#endif
......@@ -273,7 +273,7 @@ main( int argc,
//====================================================================================================100
#ifdef SELF_SCHEDULE
releaseOnDeviceCommandQueues(context);
cleanupSelfScheduling(context);
#endif
err = clReleaseContext(context);
if (err != CL_SUCCESS)
......
......@@ -159,7 +159,7 @@ int main(int argc, char** argv) {
context = clCreateContext(NULL, num, devices, NULL, NULL, &err);
#ifdef SELF_SCHEDULE
initOnDeviceCommandQueues(context);
initSelfScheduling(context);
#endif
int iCPU, iGPU;
......@@ -249,7 +249,7 @@ int main(int argc, char** argv) {
clSVMFree(context, MatrixPower);
#ifdef SELF_SCHEDULE
releaseOnDeviceCommandQueues(context);
cleanupSelfScheduling(context);
#endif
clReleaseContext(context);
......
......@@ -186,7 +186,7 @@ int main(int argc, char** argv)
context = clCreateContext(NULL, num, devices, NULL, NULL, &err);
#ifdef SELF_SCHEDULE
initOnDeviceCommandQueues(context);
initSelfScheduling(context);
#endif
int iCPU, iGPU;
......@@ -314,7 +314,7 @@ int main(int argc, char** argv)
clReleaseCommandQueue(commandsCPU);
clReleaseCommandQueue(commandsGPU);
#ifdef SELF_SCHEDULE
releaseOnDeviceCommandQueues(context);
cleanupSelfScheduling(context);
#endif
clReleaseContext(context);
......
......@@ -51,7 +51,7 @@ int main(int argc, char** argv)
cl_context context = clCreateContext(NULL, num, devices, NULL, NULL, &err);
#ifdef SELF_SCHEDULE
initOnDeviceCommandQueues(context);
initSelfScheduling(context);
#endif
// Fill our data set with random float values
......@@ -212,7 +212,7 @@ int main(int argc, char** argv)
clSVMFree(context, nullElements);
clSVMFree(context, origOffsets);
#ifdef SELF_SCHEDULE
releaseOnDeviceCommandQueues(context);
cleanupSelfScheduling(context);
#endif
clReleaseContext(context);
......
......@@ -73,7 +73,7 @@ static int initialize()
context = clCreateContext(NULL, num, devices, NULL, NULL, &err);
#ifdef SELF_SCHEDULE
initOnDeviceCommandQueues(context);
initSelfScheduling(context);
#endif
int iCPU, iGPU;
......@@ -92,7 +92,7 @@ static int shutdown()
if( commandsCPU ) clReleaseCommandQueue( commandsCPU );
if( commandsGPU ) clReleaseCommandQueue( commandsGPU );
#ifdef SELF_SCHEDULE
releaseOnDeviceCommandQueues(context);
cleanupSelfScheduling(context);
#endif
if( context ) clReleaseContext( context );
if( devices ) delete devices;
......
......@@ -118,7 +118,7 @@ main( int argc,
cl_context context = clCreateContext(NULL, num, devices, NULL, NULL, &error);
#ifdef SELF_SCHEDULE
initOnDeviceCommandQueues(context);
initSelfScheduling(context);
#endif
printf("WG size of kernel = %d \n", NUMBER_THREADS);
......@@ -370,7 +370,7 @@ main( int argc,
// RETURN
//======================================================================================================================================================150
#ifdef SELF_SCHEDULE
releaseOnDeviceCommandQueues(context);
cleanupSelfScheduling(context);
#endif
return 0.0; // always returns 0.0
......
......@@ -156,7 +156,7 @@ void select_device() {
context = clCreateContext(NULL, num, devices, NULL, NULL, &err);
#ifdef SELF_SCHEDULE
initOnDeviceCommandQueues(context);
initSelfScheduling(context);
#endif
int iCPU, iGPU;
......
......@@ -135,7 +135,7 @@ static int initialize()
context = clCreateContext(NULL, num, devices, NULL, NULL, &err);
#ifdef SELF_SCHEDULE
initOnDeviceCommandQueues(context);
initSelfScheduling(context);
#endif
int iCPU, iGPU;
......@@ -154,7 +154,7 @@ static int shutdown()
if( commandsCPU ) clReleaseCommandQueue( commandsCPU );
if( commandsGPU ) clReleaseCommandQueue( commandsGPU );
#ifdef SELF_SCHEDULE
releaseOnDeviceCommandQueues(context);
cleanupSelfScheduling(context);
#endif
if( context ) clReleaseContext( context );
if( devices ) delete devices;
......
......@@ -208,7 +208,7 @@ main( int argc,
cl_context context = clCreateContext(NULL, num, devices, NULL, NULL, &error);
#ifdef SELF_SCHEDULE
initOnDeviceCommandQueues(context);
initSelfScheduling(context);
#endif
//======================================================================================================================================================150
......@@ -349,7 +349,7 @@ main( int argc,
free(com);
#ifdef SELF_SCHEDULE
releaseOnDeviceCommandQueues(context);
cleanupSelfScheduling(context);
#endif
//====================================================================================================100
......
......@@ -42,7 +42,7 @@ int main(int argc, char *argv[]) {
context = clCreateContext(NULL, num, devices, NULL, NULL, &error);
#ifdef SELF_SCHEDULE
initOnDeviceCommandQueues(context);
initSelfScheduling(context);
#endif
int numRecords = loadData(filename,records,locations);
......@@ -72,7 +72,7 @@ int main(int argc, char *argv[]) {
clSVMFree(context, records);
clSVMFree(context, locations);
#ifdef SELF_SCHEDULE
releaseOnDeviceCommandQueues(context);
cleanupSelfScheduling(context);
#endif
return 0;
}
......
......@@ -76,7 +76,7 @@ static int initialize()
context = clCreateContext(NULL, num, devices, NULL, NULL, &err);
#ifdef SELF_SCHEDULE
initOnDeviceCommandQueues(context);
initSelfScheduling(context);
#endif
int iCPU, iGPU;
......@@ -95,7 +95,7 @@ static int shutdown()
if( commandsCPU ) clReleaseCommandQueue( commandsCPU );
if( commandsGPU ) clReleaseCommandQueue( commandsGPU );
#ifdef SELF_SCHEDULE
releaseOnDeviceCommandQueues(context);
cleanupSelfScheduling(context);
#endif
if( context ) clReleaseContext( context );
if( devices ) delete devices;
......
......@@ -85,7 +85,7 @@ static int initialize()
context = clCreateContext(NULL, num, devices, NULL, NULL, &err);
#ifdef SELF_SCHEDULE
initOnDeviceCommandQueues(context);
initSelfScheduling(context);
#endif
threads_per_block = 128;
......@@ -109,7 +109,7 @@ static int shutdown()
if( commandsCPU ) clReleaseCommandQueue( commandsCPU );
if( commandsGPU ) clReleaseCommandQueue( commandsGPU );
#ifdef SELF_SCHEDULE
releaseOnDeviceCommandQueues(context);
cleanupSelfScheduling(context);
#endif
if( context ) clReleaseContext( context );
if( devices ) delete devices;
......
......@@ -89,7 +89,7 @@ static int initialize() {
context = clCreateContext(NULL, num, devices, NULL, NULL, &err);
#ifdef SELF_SCHEDULE
initOnDeviceCommandQueues(context);
initSelfScheduling(context);
#endif
int iCPU, iGPU;
......@@ -108,7 +108,7 @@ static int shutdown()
if( commandsCPU ) clReleaseCommandQueue( commandsCPU );
if( commandsGPU ) clReleaseCommandQueue( commandsGPU );
#ifdef SELF_SCHEDULE
releaseOnDeviceCommandQueues(context);
cleanupSelfScheduling(context);
#endif
if( context ) clReleaseContext( context );
if( devices ) delete devices;
......
......@@ -85,7 +85,7 @@ static int initialize()
context = clCreateContext(NULL, num, devices, NULL, NULL, &err);
#ifdef SELF_SCHEDULE
initOnDeviceCommandQueues(context);
initSelfScheduling(context);
#endif
threads_per_block = 128;
......@@ -109,7 +109,7 @@ static int shutdown()
if( commandsCPU ) clReleaseCommandQueue( commandsCPU );
if( commandsGPU ) clReleaseCommandQueue( commandsGPU );
#ifdef SELF_SCHEDULE
releaseOnDeviceCommandQueues(context);
cleanupSelfScheduling(context);
#endif
if( context ) clReleaseContext( context );
if( devices ) delete devices;
......
......@@ -111,7 +111,7 @@ int main(int argc, char** argv)
context = clCreateContext(NULL, num, devices, NULL, NULL, &err);
#ifdef SELF_SCHEDULE
initOnDeviceCommandQueues(context);
initSelfScheduling(context);
#endif
init(context, argc, argv);
......@@ -197,7 +197,7 @@ int main(int argc, char** argv)
// delete[] wall;
// delete[] result;
#ifdef SELF_SCHEDULE
releaseOnDeviceCommandQueues(context);
cleanupSelfScheduling(context);
#endif
return EXIT_SUCCESS;
}
......@@ -925,7 +925,7 @@ int main(int argc, char **argv)
context = clCreateContext(NULL, num, devices, NULL, NULL, &err);
#ifdef SELF_SCHEDULE
initOnDeviceCommandQueues(context);
initSelfScheduling(context);
#endif
initKernel(context, dim);
......
......@@ -74,7 +74,7 @@ void freeDevMem(){
clReleaseCommandQueue(commandsCPU);
clReleaseCommandQueue(commandsGPU);
#ifdef SELF_SCHEDULE
releaseOnDeviceCommandQueues(context);
cleanupSelfScheduling(context);
#endif
}
......
// normally, offset is not applied to group_id, but this is necessary for splitting work onto multiple devices
#define get_group_id(dim) (get_group_id(dim) + (get_global_offset(dim)/get_local_size(dim)))
typedef struct global_work_state_struct {
atomic_uint done;
size_t globalOffset;
size_t totalWorkItems;
int divideWorkDim;
int percentWorkCPU;
size_t gsize[3];
size_t lsize[3];
} global_work_state;
#ifdef __OPENCL_VERSION__
int __isCPU(global_work_state *__globalWorkState) {
return get_global_offset(__globalWorkState->divideWorkDim) == __globalWorkState->globalOffset && __globalWorkState->percentWorkCPU != 0;
}
#endif
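/*
 * The global_work_state above is shared between host and devices through
 * fine-grained SVM: 'done' counts the flattened work-items that have already
 * been claimed, 'totalWorkItems' is the overall amount of work, 'globalOffset'
 * and 'percentWorkCPU' describe how the NDRange is split between CPU and GPU
 * in dimension 'divideWorkDim', and gsize/lsize cache the logical global and
 * local sizes so the get_*_id macros further down can recompute
 * multi-dimensional IDs from a flat index. __isCPU() appears to identify the
 * CPU partition by checking that the current global offset in divideWorkDim
 * equals globalOffset and that some work was assigned to the CPU at all.
 */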
#define SELF_SCHEDULE_AFTER_NUM_ITS 2
#define SELF_SCHEDULE_SAFETY_NUM_ITS 1
#define COMMA ,
#ifdef SELF_SCHEDULE
#define NUM_CHILDREN_PER_WG 1
#define SELF_SCHEDULE_ARGS , __global global_work_state *__globalWorkState){\
local unsigned int done_local;\
if (get_local_id(3) == get_local_size(3)-1)\
done_local = atomic_fetch_add_explicit(&__globalWorkState->done, (uint)get_local_size(3), memory_order_relaxed, memory_scope_all_svm_devices);\
\
barrier(CLK_LOCAL_MEM_FENCE);\
while (done_local < __globalWorkState->totalWorkItems
#define SELF_SCHEDULE_ARGS , const int __divideWorkDim, const size_t __totalWorkItems, __global atomic_int *__workLeft
#define SELF_SCHEDULE_CHILD_ARGS , __divideWorkDim, __totalWorkItems, __workLeft
#define SELF_SCHEDULE_CHILD_ARGS , __global global_work_state *__globalWorkState
// different approach to dividing work: __global_size[__divideWorkDim] = x*get_local_size(__divideWorkDim);
#define SCHEDULE_CHILD_WITH_LOCAL_PTRS_1D(kernel_block_with_local_ptrs)\
if (get_local_id(0) < NUM_CHILDREN_PER_WG) {\
const int leftBefore = atomic_fetch_sub_explicit(__workLeft, get_global_size(0), memory_order_relaxed, memory_scope_all_svm_devices);\
if (leftBefore > 0) {\
const ndrange_t child_ndrange = ndrange_1D((size_t)(__totalWorkItems-leftBefore), ((get_global_size(0) < leftBefore) ? get_global_size(0) : leftBefore), get_local_size(0));\
enqueue_kernel(get_default_queue(), CLK_ENQUEUE_FLAGS_NO_WAIT, child_ndrange, kernel_block_with_local_ptrs);\
} else atomic_store_explicit(__workLeft, 0, memory_order_relaxed, memory_scope_all_svm_devices);\
if (get_local_id(3) == get_local_size(3)-1)\
done_local = atomic_fetch_add_explicit(&__globalWorkState->done, (uint)get_local_size(3), memory_order_relaxed, memory_scope_all_svm_devices);\
barrier(CLK_LOCAL_MEM_FENCE);\
}
#define SCHEDULE_CHILD_1D(kernel_call)\
SCHEDULE_CHILD_WITH_LOCAL_PTRS_1D(^{kernel_call;})
#define SCHEDULE_CHILD_WITH_LOCAL_PTRS_ND(kernel_block_with_local_ptrs)\
if (get_local_id(__divideWorkDim) < NUM_CHILDREN_PER_WG) {\
const int leftBefore = atomic_fetch_sub_explicit(__workLeft, get_global_size(__divideWorkDim), memory_order_relaxed, memory_scope_all_svm_devices);\
if (leftBefore > 0) {\
size_t __offset[3] = {0, 0, 0};\
__offset[__divideWorkDim] = (size_t)(__totalWorkItems-leftBefore);\
size_t __global_size[3] = {get_global_size(0), get_global_size(1), get_global_size(2)};\
__global_size[__divideWorkDim] = ((__global_size[__divideWorkDim] < leftBefore) ? __global_size[__divideWorkDim] : leftBefore);\
const size_t __local_size[3] = {get_local_size(0), get_local_size(1), get_local_size(2)};\
const ndrange_t child_ndrange = ndrange_3D(__offset, __global_size, __local_size);\
enqueue_kernel(get_default_queue(), CLK_ENQUEUE_FLAGS_NO_WAIT, child_ndrange, kernel_block_with_local_ptrs);\
} else atomic_store_explicit(__workLeft, 0, memory_order_relaxed, memory_scope_all_svm_devices);\
}
if (get_local_id(3) == get_local_size(3)-1)\
done_local = atomic_fetch_add_explicit(&__globalWorkState->done, (uint)get_local_size(3), memory_order_relaxed, memory_scope_all_svm_devices);\
barrier(CLK_LOCAL_MEM_FENCE);\
}
#define SCHEDULE_CHILD_ND(kernel_call)\
SCHEDULE_CHILD_WITH_LOCAL_PTRS_ND(^{kernel_call;})
SCHEDULE_CHILD_WITH_LOCAL_PTRS_1D(^{kernel_call;})
#define get_global_size(dim) __globalWorkState->gsize[dim]
#define get_local_size(dim) ((dim == 3) ? get_local_size(0) : __globalWorkState->lsize[dim])
#define flat_id (get_local_id(3)+done_local)
#define get_global_id(dim) ((dim == 0) ? flat_id%__globalWorkState->gsize[0] : ((dim == 1) ? (flat_id/__globalWorkState->gsize[0])%__globalWorkState->gsize[1] : flat_id/(__globalWorkState->gsize[0]*__globalWorkState->gsize[1])))
#define get_group_id(dim) (get_global_id(dim)/__globalWorkState->lsize[dim])
#define get_local_id(dim) ((dim == 3) ? get_local_id(0) : (((dim == 0) ? (get_local_id(0)+done_local)%__globalWorkState->gsize[0] : ((dim == 1) ? ((get_local_id(0)+done_local)/__globalWorkState->gsize[0])%__globalWorkState->gsize[1] : (get_local_id(0)+done_local)/(__globalWorkState->gsize[0]*__globalWorkState->gsize[1])))%__globalWorkState->lsize[dim]))
#define get_global_offset(dim) ((dim == __globalWorkState->divideWorkDim) ? __globalWorkState->globalOffset : get_global_offset(dim))
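// Worked example of the flat-ID remapping above (illustrative sizes): with
// gsize = {64, 32, 1}, a work-item whose flat_id is 70 gets
// get_global_id(0) = 70 % 64 = 6 and get_global_id(1) = (70 / 64) % 32 = 1,
// so consecutive flat IDs walk the original 2D index space with dimension 0
// varying fastest.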
#else
#define SELF_SCHEDULE_ARGS
#define SCHEDULE_CHILD_WITH_LOCAL_PTRS_1D(kernel_block_with_local_ptrs)
......@@ -45,4 +67,7 @@
#define SCHEDULE_CHILD_WITH_LOCAL_PTRS_ND(kernel_block_with_local_ptrs)
#define SCHEDULE_CHILD_ND(kernel_call)
#define SELF_SCHEDULE_CHILD_ARGS
// normally, offset is not applied to group_id, but this is necessary for splitting work onto multiple devices
#define get_group_id(dim) (get_group_id(dim) + (get_global_offset(dim)/get_local_size(dim)))
#endif
#include "cl_utils.h"
#include "cl_common.h"
#include <time.h>
#include <string.h>
static cl_command_queue commandsCPUOnDevice = 0, commandsGPUOnDevice = 0;
static int selfSchedulingInitialized = 0;
static atomic_int* workLeft;
static int previousPercentWorkCPU = -1;
static int percentWorkCPUoverride = -1;
......@@ -26,6 +27,7 @@ static cl_event eventCPUdone[__CL_RECORD_STATS_MAX], eventGPUdone[__CL_RECORD_ST
char kernelName[__CL_RECORD_STATS_MAX][128];
#endif
static int previousKernelRun;
static global_work_state* globalWorkState;
char* readSourceFromFileName(const char* fileName) {
#ifdef DEBUG
......@@ -63,7 +65,7 @@ cl_program getBuiltProgramFromFile(cl_context context, const char *sourceFileNam
}
char clOptions[512] = "-cl-std=CL2.0";
if (commandsCPUOnDevice && commandsGPUOnDevice)
if (selfSchedulingInitialized)
sprintf(clOptions + strlen(clOptions), " -DSELF_SCHEDULE");
if (options)
......@@ -96,29 +98,21 @@ cl_program getBuiltProgramFromFile(cl_context context, const char *sourceFileNam
return program;
}
cl_int initOnDeviceCommandQueues(cl_context context) {
cl_int initSelfScheduling(cl_context context) {
cl_int err;
cl_uint num;
clGetContextInfo(context, CL_CONTEXT_NUM_DEVICES, sizeof(cl_uint), &num, NULL);
cl_device_id devices[num];
clGetContextInfo(context, CL_CONTEXT_DEVICES, sizeof(cl_device_id)*num, devices, NULL);
workLeft = (atomic_int*) clSVMAlloc(context, CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS, sizeof(atomic_int), 0);
atomic_init(workLeft, 0);
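/* The shared global_work_state is presumably allocated the same way as
 * workLeft (the rest of this function is not shown in the hunk): it needs
 * CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS so CPU and GPU kernels
 * can update the 'done' counter concurrently, e.g.
 *   globalWorkState = (global_work_state*) clSVMAlloc(context,
 *       CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS,
 *       sizeof(global_work_state), 0);
 *   atomic_init(&globalWorkState->done, 0);
 */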