CES / Rodinia-SVM · Commits · ec5699f8

Commit ec5699f8, authored Jul 11, 2018 by Marvin Damschen
parent 285a847d

Device self-scheduling using device-side queueing

Changes: 58 files (only a subset of them appears below)
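For orientation before the per-file diffs: the commit gives each kernel three extra arguments (the SELF_SCHEDULE_ARGS macro) and ends it with a macro that lets the device itself enqueue the next chunk of work. The following is an editorial sketch of that pattern written out by hand for a toy 1D kernel; it is not code from the commit. The names work, data and total_items are illustrative, while the OpenCL 2.x built-ins and the overall structure mirror the macros defined in common/cl_common.h further down.

// Editorial sketch: one work-item per work-group claims the next chunk of the
// remaining iteration space and enqueues the same kernel as a child grid on
// this device's default on-device queue. workLeft is a fine-grained SVM
// allocation with atomics support, initialized by the host to the number of
// work-items not covered by the initial grids.
__kernel void work(__global float *data, const int total_items,
                   __global atomic_int *workLeft)
{
	size_t i = get_global_id(0);
	if (i < (size_t)total_items)
		data[i] *= 2.0f;                                 // the actual work

	if (get_local_id(0) == 0) {                          // one claimer per work-group
		int leftBefore = atomic_fetch_sub_explicit(workLeft, (int)get_global_size(0),
		                     memory_order_relaxed, memory_scope_all_svm_devices);
		if (leftBefore > 0) {
			size_t offset = (size_t)total_items - (size_t)leftBefore;   // first unclaimed item
			size_t chunk  = (get_global_size(0) < (size_t)leftBefore)
			                  ? get_global_size(0) : (size_t)leftBefore;
			ndrange_t child = ndrange_1D(offset, chunk, get_local_size(0));
			enqueue_kernel(get_default_queue(), CLK_ENQUEUE_FLAGS_NO_WAIT, child,
			               ^{ work(data, total_items, workLeft); });
		} else {
			atomic_store_explicit(workLeft, 0, memory_order_relaxed,
			                      memory_scope_all_svm_devices);
		}
	}
}

The intent, as far as the host code below shows, is that both the CPU and the GPU start with a small initial grid and then pull further chunks off the shared workLeft counter at their own pace, so the two devices self-schedule the remaining work between them.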
Rodinia-SVM/particlefilter-svm/ex_particle_OCL_double_seq.cpp

@@ -84,6 +84,9 @@ static int initialize()
 	}
 	context = clCreateContext(NULL, num, devices, NULL, NULL, &err);
+#ifdef SELF_SCHEDULE
+	initOnDeviceCommandQueues(context);
+#endif
 	threads_per_block = 128;

@@ -105,6 +108,9 @@ static int shutdown()
 	// release resources
 	if (commandsCPU) clReleaseCommandQueue(commandsCPU);
 	if (commandsGPU) clReleaseCommandQueue(commandsGPU);
+#ifdef SELF_SCHEDULE
+	releaseOnDeviceCommandQueues(context);
+#endif
 	if (context) clReleaseContext(context);
 	if (devices) delete devices;
Rodinia-SVM/particlefilter-svm/ex_particle_OCL_naive_seq.cpp

@@ -88,6 +88,9 @@ static int initialize() {
 	}
 	context = clCreateContext(NULL, num, devices, NULL, NULL, &err);
+#ifdef SELF_SCHEDULE
+	initOnDeviceCommandQueues(context);
+#endif
 	int iCPU, iGPU;
 	getCPUGPUIds(&iCPU, &iGPU, devices, num);

@@ -104,6 +107,9 @@ static int shutdown()
 	// release resources
 	if (commandsCPU) clReleaseCommandQueue(commandsCPU);
 	if (commandsGPU) clReleaseCommandQueue(commandsGPU);
+#ifdef SELF_SCHEDULE
+	releaseOnDeviceCommandQueues(context);
+#endif
 	if (context) clReleaseContext(context);
 	if (devices) delete devices;
Rodinia-SVM/particlefilter-svm/ex_particle_OCL_single_seq.cpp

@@ -84,6 +84,9 @@ static int initialize()
 	}
 	context = clCreateContext(NULL, num, devices, NULL, NULL, &err);
+#ifdef SELF_SCHEDULE
+	initOnDeviceCommandQueues(context);
+#endif
 	threads_per_block = 128;

@@ -105,6 +108,9 @@ static int shutdown()
 	// release resources
 	if (commandsCPU) clReleaseCommandQueue(commandsCPU);
 	if (commandsGPU) clReleaseCommandQueue(commandsGPU);
+#ifdef SELF_SCHEDULE
+	releaseOnDeviceCommandQueues(context);
+#endif
 	if (context) clReleaseContext(context);
 	if (devices) delete devices;
Rodinia-SVM/particlefilter-svm/particle_double.cl

@@ -145,7 +145,7 @@ int findIndexBin(__global double * CDF, int beginIndex, int endIndex, double val
 *****************************/
 __kernel void find_index_kernel(__global double * arrayX, __global double * arrayY,
 			__global double * CDF, __global double * u, __global double * xj,
-			__global double * yj, __global double * weights, int Nparticles){
+			__global double * yj, __global double * weights, int Nparticles SELF_SCHEDULE_ARGS){
 	int i = get_global_id(0);

@@ -172,9 +172,9 @@ __kernel void find_index_kernel(__global double * arrayX, __global double * arra
 	}
 	barrier(CLK_GLOBAL_MEM_FENCE);
 	SCHEDULE_CHILD_1D(find_index_kernel(arrayX, arrayY, CDF, u, xj, yj, weights, Nparticles SELF_SCHEDULE_CHILD_ARGS))
 }
-__kernel void normalize_weights_kernel(__global double * weights, int Nparticles, __global double * partial_sums, __global double * CDF, __global double * u, __global int * seed)
+__kernel void normalize_weights_kernel(__global double * weights, int Nparticles, __global double * partial_sums, __global double * CDF, __global double * u, __global int * seed SELF_SCHEDULE_ARGS)
 {
 	int i = get_global_id(0);
 	int local_id = get_local_id(0);

@@ -209,11 +209,11 @@ __kernel void normalize_weights_kernel(__global double * weights, int Nparticles
 		u[i] = u1 + i/((double)(Nparticles));
 	}
 	SCHEDULE_CHILD_1D(normalize_weights_kernel(weights, Nparticles, partial_sums, CDF, u, seed SELF_SCHEDULE_CHILD_ARGS))
 }
-__kernel void sum_kernel(__global double* partial_sums, int Nparticles)
+__kernel void sum_kernel(__global double* partial_sums, int Nparticles SELF_SCHEDULE_ARGS)
 {
 	int i = get_global_id(0);

@@ -230,7 +230,7 @@ __kernel void sum_kernel(__global double* partial_sums, int Nparticles)
 		partial_sums[0] = sum;
 	}
 	SCHEDULE_CHILD_1D(sum_kernel(partial_sums, Nparticles SELF_SCHEDULE_CHILD_ARGS))
 }

@@ -252,7 +252,7 @@ __kernel void sum_kernel(__global double* partial_sums, int Nparticles)
 * param11: IszY
 * param12: Nfr
 *****************************/
-__kernel void likelihood_kernel(__global double * arrayX, __global double * arrayY,__global double * xj, __global double * yj, __global double * CDF, __global int * ind, __global int * objxy, __global double * likelihood, __global unsigned char * I, __global double * u, __global double * weights, const int Nparticles, const int countOnes, const int max_size, int k, const int IszY, const int Nfr, __global int *seed, __global double * partial_sums, __local double* buffer){
+__kernel void likelihood_kernel(__global double * arrayX, __global double * arrayY,__global double * xj, __global double * yj, __global double * CDF, __global int * ind, __global int * objxy, __global double * likelihood, __global unsigned char * I, __global double * u, __global double * weights, const int Nparticles, const int countOnes, const int max_size, int k, const int IszY, const int Nfr, __global int *seed, __global double * partial_sums, __local double* buffer SELF_SCHEDULE_ARGS){
 	int block_id = get_group_id(0);
 	int thread_id = get_local_id(0);
 	int i = get_global_id(0);

@@ -321,7 +321,8 @@ __kernel void likelihood_kernel(__global double * arrayX, __global double * arra
 		partial_sums[block_id] = buffer[0];
 	}
+	SCHEDULE_CHILD_WITH_LOCAL_PTRS_1D(^(local void *buffer){likelihood_kernel(arrayX, arrayY, xj, yj, CDF, ind, objxy, likelihood, I, u, weights, Nparticles, countOnes, max_size, k, IszY, Nfr, seed, partial_sums, buffer SELF_SCHEDULE_CHILD_ARGS);} COMMA (uint)(sizeof(double)*THREADS_PER_BLOCK))
 }//*/
 #endif
Rodinia-SVM/particlefilter-svm/particle_naive.cl

@@ -59,7 +59,7 @@ int findIndexBin(double * CDF, int beginIndex, int endIndex, double value)
 * param6: yj
 * param7: Nparticles
 *****************************/
-__kernel void particle_kernel(__global double * arrayX, __global double * arrayY, __global double * CDF, __global double * u, __global double * xj, __global double * yj, int Nparticles){
+__kernel void particle_kernel(__global double * arrayX, __global double * arrayY, __global double * CDF, __global double * u, __global double * xj, __global double * yj, int Nparticles SELF_SCHEDULE_ARGS){
 	int i = get_global_id(0);
 	if(i < Nparticles){

@@ -82,5 +82,5 @@ __kernel void particle_kernel(__global double * arrayX, __global double * arrayY
 		}
 	}
 	SCHEDULE_CHILD_1D(particle_kernel(arrayX, arrayY, CDF, u, xj, yj, Nparticles SELF_SCHEDULE_CHILD_ARGS))
 }
\ No newline at end of file
Rodinia-SVM/particlefilter-svm/particle_single.cl

@@ -180,7 +180,7 @@ float tex1Dfetch(__read_only image2d_t img, int index){
 *****************************/
 __kernel void find_index_kernel(__global float * arrayX, __global float * arrayY,
 			__global float * CDF, __global float * u, __global float * xj,
-			__global float * yj, __global float * weights, int Nparticles){
+			__global float * yj, __global float * weights, int Nparticles SELF_SCHEDULE_ARGS){
 	int i = get_global_id(0);

@@ -207,9 +207,9 @@ __kernel void find_index_kernel(__global float * arrayX, __global float * arrayY
 	}
 	barrier(CLK_GLOBAL_MEM_FENCE);
 	SCHEDULE_CHILD_1D(find_index_kernel(arrayX, arrayY, CDF, u, xj, yj, weights, Nparticles SELF_SCHEDULE_CHILD_ARGS))
 }
-__kernel void normalize_weights_kernel(__global float * weights, int Nparticles, __global float * partial_sums, __global float * CDF, __global float * u, __global int * seed)
+__kernel void normalize_weights_kernel(__global float * weights, int Nparticles, __global float * partial_sums, __global float * CDF, __global float * u, __global int * seed SELF_SCHEDULE_ARGS)
 {
 	int i = get_global_id(0);
 	int local_id = get_local_id(0);

@@ -244,11 +244,11 @@ __kernel void normalize_weights_kernel(__global float * weights, int Nparticles,
 		u[i] = u1 + i/((float)(Nparticles));
 	}
 	SCHEDULE_CHILD_1D(normalize_weights_kernel(weights, Nparticles, partial_sums, CDF, u, seed SELF_SCHEDULE_CHILD_ARGS))
 }
-__kernel void sum_kernel(__global float* partial_sums, int Nparticles)
+__kernel void sum_kernel(__global float* partial_sums, int Nparticles SELF_SCHEDULE_ARGS)
 {
 	int i = get_global_id(0);

@@ -265,7 +265,7 @@ __kernel void sum_kernel(__global float* partial_sums, int Nparticles)
 		partial_sums[0] = sum;
 	}
 	SCHEDULE_CHILD_1D(sum_kernel(partial_sums, Nparticles SELF_SCHEDULE_CHILD_ARGS))
 }

@@ -287,7 +287,7 @@ __kernel void sum_kernel(__global float* partial_sums, int Nparticles)
 * param11: IszY
 * param12: Nfr
 *****************************/
-__kernel void likelihood_kernel(__global float * arrayX, __global float * arrayY,__global float * xj, __global float * yj, __global float * CDF, __global int * ind, __global int * objxy, __global float * likelihood, __global unsigned char * I, __global float * u, __global float * weights, const int Nparticles, const int countOnes, const int max_size, int k, const int IszY, const int Nfr, __global int *seed, __global float * partial_sums, __local float* buffer){
+__kernel void likelihood_kernel(__global float * arrayX, __global float * arrayY,__global float * xj, __global float * yj, __global float * CDF, __global int * ind, __global int * objxy, __global float * likelihood, __global unsigned char * I, __global float * u, __global float * weights, const int Nparticles, const int countOnes, const int max_size, int k, const int IszY, const int Nfr, __global int *seed, __global float * partial_sums, __local float* buffer SELF_SCHEDULE_ARGS){
 	int block_id = get_group_id(0);
 	int thread_id = get_local_id(0);
 	int i = get_global_id(0);

@@ -356,7 +356,8 @@ __kernel void likelihood_kernel(__global float * arrayX, __global float * arrayY
 		partial_sums[block_id] = buffer[0];
 	}
+	SCHEDULE_CHILD_WITH_LOCAL_PTRS_1D(^(local void *buffer){likelihood_kernel(arrayX, arrayY, xj, yj, CDF, ind, objxy, likelihood, I, u, weights, Nparticles, countOnes, max_size, k, IszY, Nfr, seed, partial_sums, buffer SELF_SCHEDULE_CHILD_ARGS);} COMMA (uint)(sizeof(float)*THREADS_PER_BLOCK))
 }//*/
 //#endif
Rodinia-SVM/pathfinder-svm/kernels.cl

@@ -13,7 +13,7 @@ __kernel void dynproc_kernel (int iteration,
                               int startStep,
                               int border,
                               int HALO,
-                              __global int* outputBuffer)
+                              __global int* outputBuffer SELF_SCHEDULE_ARGS)
 {
 	__local int prev[LOCAL_WORK];
 	__local int result[LOCAL_WORK];

@@ -23,7 +23,7 @@ __kernel void dynproc_kernel (int iteration,
 	int tx = get_local_id(0);
-	if (get_global_id(0) > rows*cols) return;
+	if (get_global_id(0) > rows*cols) goto self_schedule;
 	// Each block finally computes result for a small block
 	// after N iterations.

@@ -116,6 +116,9 @@ __kernel void dynproc_kernel (int iteration,
 		gpuResults[xidx] = result[tx];
 	}
+	self_schedule:
+	SCHEDULE_CHILD_1D(dynproc_kernel(iteration, gpuWall, gpuSrc, gpuResults, cols, rows, startStep, border, HALO, outputBuffer SELF_SCHEDULE_CHILD_ARGS))
+	return;
 }
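One detail of the pathfinder kernel above is easy to miss: the early return for out-of-range work-items becomes goto self_schedule, presumably so that those work-items still reach the enqueue epilogue at the end of the kernel; otherwise a work-group whose claiming work-item is out of range would never schedule the next chunk. A minimal editorial illustration of the same pattern (kernel and argument names are made up, the macros are the ones from common/cl_common.h):

// Editorial sketch, not code from the commit.
__kernel void k(__global int *buf, int n SELF_SCHEDULE_ARGS)
{
	if (get_global_id(0) >= (size_t)n) goto self_schedule;   // was: return;
	buf[get_global_id(0)] += 1;
self_schedule:
	SCHEDULE_CHILD_1D(k(buf, n SELF_SCHEDULE_CHILD_ARGS))
	return;
}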
Rodinia-SVM/pathfinder-svm/main.cpp

@@ -110,6 +110,9 @@ int main(int argc, char** argv)
 	}
 	context = clCreateContext(NULL, num, devices, NULL, NULL, &err);
+#ifdef SELF_SCHEDULE
+	initOnDeviceCommandQueues(context);
+#endif
 	init(context, argc, argv);
 	// Pyramid parameters.

@@ -193,5 +196,8 @@ int main(int argc, char** argv)
 	// delete[] data;
 	// delete[] wall;
 	// delete[] result;
+#ifdef SELF_SCHEDULE
+	releaseOnDeviceCommandQueues(context);
+#endif
 	return EXIT_SUCCESS;
 }
Rodinia-SVM/runall_svm.sh

@@ -3,7 +3,7 @@ export LC_ALL=C
 runsperrun=2
 date=$(date +%Y%m%d)
-for percentWorkCPU in `seq -50 50 100`;
+for percentWorkCPU in `seq -150 50 -150`;
 do
 	echo $(date +%H:%M) - Cleaning...
Rodinia-SVM/srad-svm_DISABLED/kernel/kernel_gpu_opencl.cl

@@ -20,7 +20,7 @@
 __kernel void
 extract_kernel(	long d_Ne,
-				__global fp* d_I){						// pointer to input image (DEVICE GLOBAL MEMORY)
+				__global fp* d_I SELF_SCHEDULE_ARGS){	// pointer to input image (DEVICE GLOBAL MEMORY)
 	// indexes
 	int bx = (get_group_id(0) + (get_global_offset(0)/get_local_size(0)));	// get current horizontal block index

@@ -34,7 +34,7 @@ extract_kernel(long d_Ne,
 	}
 	SCHEDULE_CHILD_1D(extract_kernel(d_Ne, d_I SELF_SCHEDULE_CHILD_ARGS))
 }
 //========================================================================================================================================================================================================200

@@ -45,7 +45,7 @@ __kernel void
 prepare_kernel(	long d_Ne,
 				__global fp* d_I,						// pointer to output image (DEVICE GLOBAL MEMORY)
 				__global fp* d_sums,					// pointer to input image (DEVICE GLOBAL MEMORY)
-				__global fp* d_sums2){
+				__global fp* d_sums2 SELF_SCHEDULE_ARGS){
 	// indexes
 	int bx = (get_group_id(0) + (get_global_offset(0)/get_local_size(0)));	// get current horizontal block index

@@ -60,7 +60,7 @@ prepare_kernel( long d_Ne,
 	}
 	SCHEDULE_CHILD_1D(prepare_kernel(d_Ne, d_I, d_sums, d_sums2 SELF_SCHEDULE_CHILD_ARGS))
 }
 //========================================================================================================================================================================================================200

@@ -73,7 +73,7 @@ reduce_kernel( long d_Ne, // number of elements in array
 				int d_mul,								// increment
 				__global fp* d_sums,					// pointer to partial sums variable (DEVICE GLOBAL MEMORY)
 				__global fp* d_sums2,
-				int gridDim){
+				int gridDim SELF_SCHEDULE_ARGS){
 	// indexes
 	int bx = (get_group_id(0) + (get_global_offset(0)/get_local_size(0)));	// get current horizontal block index

@@ -173,7 +173,7 @@ reduce_kernel( long d_Ne, // number of elements in array
 		}
 	}
 	SCHEDULE_CHILD_1D(reduce_kernel(d_Ne, d_no, d_mul, d_sums, d_sums2, gridDim SELF_SCHEDULE_CHILD_ARGS))
 }
 //========================================================================================================================================================================================================200

@@ -197,7 +197,7 @@ srad_kernel(fp d_lambda,
 			__global fp* d_dW,
 			fp d_q0sqr,
 			__global fp* d_c,
-			__global fp* d_I){
+			__global fp* d_I SELF_SCHEDULE_ARGS){
 	// indexes
 	int bx = (get_group_id(0) + (get_global_offset(0)/get_local_size(0)));	// get current horizontal block index

@@ -263,7 +263,7 @@ srad_kernel(fp d_lambda,
 	}
 	SCHEDULE_CHILD_1D(srad_kernel(d_lambda, d_Nr, d_Nc, d_Ne, d_iN, d_iS, d_jE, d_jW, d_dN, d_dS, d_dE, d_dW, d_q0sqr, d_c, d_I SELF_SCHEDULE_CHILD_ARGS))
 }
 //========================================================================================================================================================================================================200

@@ -286,7 +286,7 @@ srad2_kernel( fp d_lambda,
 			__global fp* d_dE,
 			__global fp* d_dW,
 			__global fp* d_c,
-			__global fp* d_I){
+			__global fp* d_I SELF_SCHEDULE_ARGS){
 	// indexes
 	int bx = (get_group_id(0) + (get_global_offset(0)/get_local_size(0)));	// get current horizontal block index

@@ -323,7 +323,7 @@ srad2_kernel( fp d_lambda,
 	}
 	SCHEDULE_CHILD_1D(srad2_kernel(d_lambda, d_Nr, d_Nc, d_Ne, d_iN, d_iS, d_jE, d_jW, d_dN, d_dS, d_dE, d_dW, d_c, d_I SELF_SCHEDULE_CHILD_ARGS))
 }
 //========================================================================================================================================================================================================200

@@ -332,7 +332,7 @@ srad2_kernel( fp d_lambda,
 __kernel void
 compress_kernel(	long d_Ne,
-					__global fp* d_I){						// pointer to output image (DEVICE GLOBAL MEMORY)
+					__global fp* d_I SELF_SCHEDULE_ARGS){	// pointer to output image (DEVICE GLOBAL MEMORY)
 	// indexes
 	int bx = (get_group_id(0) + (get_global_offset(0)/get_local_size(0)));	// get current horizontal block index

@@ -346,7 +346,7 @@ compress_kernel(long d_Ne,
 	}
 	SCHEDULE_CHILD_1D(compress_kernel(d_Ne, d_I SELF_SCHEDULE_CHILD_ARGS))
 }
 //========================================================================================================================================================================================================200
Rodinia-SVM/srad-svm_DISABLED/main.c

@@ -68,6 +68,9 @@ main( int argc,
 	}
 	cl_context context = clCreateContext(NULL, num, devices, NULL, NULL, &error);
+#ifdef SELF_SCHEDULE
+	initOnDeviceCommandQueues(context);
+#endif
 	//======================================================================================================================================================150
 	// VARIABLES

@@ -260,6 +263,9 @@ main( int argc,
 	clSVMFree(context, iS);
 	clSVMFree(context, jW);
 	clSVMFree(context, jE);
+#ifdef SELF_SCHEDULE
+	releaseOnDeviceCommandQueues(context);
+#endif
 	clReleaseContext(context);
 	time6 = get_time();
Rodinia-SVM/streamcluster-svm/Kernels.cl

@@ -22,7 +22,7 @@ __kernel void pgain_kernel(
 	int num,
 	int dim,
 	long x,
-	int K){
+	int K SELF_SCHEDULE_ARGS){
 	/* block ID and global thread ID */
 	const int thread_id = get_global_id(0);
 	const int local_id = get_local_id(0);

@@ -60,6 +60,6 @@ __kernel void pgain_kernel(
 		}
 	}
 	SCHEDULE_CHILD_1D(pgain_kernel(p, coord_d, work_mem_d, center_table_d, switch_membership_d, num, dim, x, K SELF_SCHEDULE_CHILD_ARGS))
 }
Rodinia-SVM/streamcluster-svm/streamcluster.cpp

@@ -924,7 +924,10 @@ int main(int argc, char **argv)
 	}
 	context = clCreateContext(NULL, num, devices, NULL, NULL, &err);
-	initKernel(context, dim);
+#ifdef SELF_SCHEDULE
+	initOnDeviceCommandQueues(context);
+#endif
+	initKernel(context, dim);
 	srand48(SEED);
 	PStream *stream;
Rodinia-SVM/streamcluster-svm/streamcluster_cl.h

@@ -73,6 +73,9 @@ void freeDevMem(){
 	clReleaseProgram(program);
 	clReleaseCommandQueue(commandsCPU);
 	clReleaseCommandQueue(commandsGPU);
+#ifdef SELF_SCHEDULE
+	releaseOnDeviceCommandQueues(context);
+#endif
 }
 float pgain(long x, Points *points, float z, long int *numcenters, int kmax, bool *is_center, int *center_table, char *switch_membership,
common/cl_common.h

// normally, offset is not applied to group_id, but this is necessary for splitting work onto multiple devices
#define get_group_id(dim) (get_group_id(dim) + (get_global_offset(dim)/get_local_size(dim)))
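// (Editorial note, not part of the file: with an assumed child grid launched at
//  global offset 1024 and local size 128, the built-in get_group_id(0) would
//  still count groups from 0 inside that grid; the redefinition above adds
//  1024/128 = 8, so the block index matches the group's position in the overall
//  iteration space regardless of which device or child enqueue runs it.)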
#define COMMA ,
#ifdef SELF_SCHEDULE
#define NUM_CHILDREN_PER_WG 1
#define SELF_SCHEDULE_ARGS , const int __divideWorkDim, const size_t __totalWorkItems, __global atomic_int *__workLeft
#define SELF_SCHEDULE_CHILD_ARGS , __divideWorkDim, __totalWorkItems, __workLeft
// different approach to dividing work: __global_size[__divideWorkDim] = x*get_local_size(__divideWorkDim);
#define SCHEDULE_CHILD_WITH_LOCAL_PTRS_1D(kernel_block_with_local_ptrs)\
if (get_local_id(0) < NUM_CHILDREN_PER_WG) {\
const int leftBefore = atomic_fetch_sub_explicit(__workLeft, get_global_size(0), memory_order_relaxed, memory_scope_all_svm_devices);\
if (leftBefore > 0) {\
const ndrange_t child_ndrange = ndrange_1D((size_t)(__totalWorkItems-leftBefore), ((get_global_size(0) < leftBefore) ? get_global_size(0) : leftBefore), get_local_size(0));\
enqueue_kernel(get_default_queue(), CLK_ENQUEUE_FLAGS_NO_WAIT, child_ndrange, kernel_block_with_local_ptrs);\
} else atomic_store_explicit(__workLeft, 0, memory_order_relaxed, memory_scope_all_svm_devices);\
}
#define SCHEDULE_CHILD_1D(kernel_call)\
SCHEDULE_CHILD_WITH_LOCAL_PTRS_1D(^{kernel_call;})
#define SCHEDULE_CHILD_WITH_LOCAL_PTRS_ND(kernel_block_with_local_ptrs)\
if (get_local_id(__divideWorkDim) < NUM_CHILDREN_PER_WG) {\
const int leftBefore = atomic_fetch_sub_explicit(__workLeft, get_global_size(__divideWorkDim), memory_order_relaxed, memory_scope_all_svm_devices);\
if (leftBefore > 0) {\
size_t __offset[3] = {0, 0, 0};\
__offset[__divideWorkDim] = (size_t)(__totalWorkItems-leftBefore);\
size_t __global_size[3] = {get_global_size(0), get_global_size(1), get_global_size(2)};\
__global_size[__divideWorkDim] = ((__global_size[__divideWorkDim] < leftBefore) ? __global_size[__divideWorkDim] : leftBefore);\
const size_t __local_size[3] = {get_local_size(0), get_local_size(1), get_local_size(2)};\
const ndrange_t child_ndrange = ndrange_3D(__offset, __global_size, __local_size);\
enqueue_kernel(get_default_queue(), CLK_ENQUEUE_FLAGS_NO_WAIT, child_ndrange, kernel_block_with_local_ptrs);\
} else atomic_store_explicit(__workLeft, 0, memory_order_relaxed, memory_scope_all_svm_devices);\
}
#define SCHEDULE_CHILD_ND(kernel_call)\
SCHEDULE_CHILD_WITH_LOCAL_PTRS_ND(^{kernel_call;})
#else
#define SELF_SCHEDULE_ARGS
#define SCHEDULE_CHILD_WITH_LOCAL_PTRS_1D(kernel_block_with_local_ptrs)
#define SCHEDULE_CHILD_1D(kernel_call)
#define SCHEDULE_CHILD_WITH_LOCAL_PTRS_ND(kernel_block_with_local_ptrs)
#define SCHEDULE_CHILD_ND(kernel_call)
#define SELF_SCHEDULE_CHILD_ARGS
#endif
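A worked example of the arithmetic in SCHEDULE_CHILD_WITH_LOCAL_PTRS_1D may help; the numbers below are assumed for illustration and do not come from the commit. Suppose __totalWorkItems = 4096, the initial parent grid has get_global_size(0) = 1024 with get_local_size(0) = 128 (8 work-groups), and the host has set *__workLeft = 4096 - 1024 = 3072.

// Work-item 0 of each parent work-group subtracts the whole parent grid size (1024):
//   1st group to arrive: leftBefore = 3072 -> child offset 4096-3072 = 1024,
//                        child size min(1024, 3072) = 1024   -> covers items 1024..2047
//   2nd group:           leftBefore = 2048 -> offset 2048, size 1024 -> items 2048..3071
//   3rd group:           leftBefore = 1024 -> offset 3072, size 1024 -> items 3072..4095
//   4th group and later: leftBefore <= 0   -> no enqueue, __workLeft is reset to 0
// The enqueued child grids run the same macro, find no work left, and the recursion
// terminates once the whole 0..4095 range has been claimed exactly once.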
common/cl_utils.c

@@ -63,6 +63,8 @@ cl_program getBuiltProgramFromFile(cl_context context, const char *sourceFileNam
 	}
 	char clOptions[512] = "-cl-std=CL2.0";
+	if (commandsCPUOnDevice && commandsGPUOnDevice)
+		sprintf(clOptions + strlen(clOptions), " -DSELF_SCHEDULE");
 	if (options) sprintf(clOptions + strlen(clOptions), " %s", options);

@@ -94,6 +96,31 @@ cl_program getBuiltProgramFromFile(cl_context context, const char *sourceFileNam
 	return program;
 }
+
+cl_int initOnDeviceCommandQueues(cl_context context) {
+	cl_int err;
+	cl_uint num;
+	clGetContextInfo(context, CL_CONTEXT_NUM_DEVICES, sizeof(cl_uint), &num, NULL);
+	cl_device_id devices[num];
+	clGetContextInfo(context, CL_CONTEXT_DEVICES, sizeof(cl_device_id)*num, devices, NULL);
+	workLeft = (atomic_int *) clSVMAlloc(context, CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS, sizeof(atomic_int), 0);
+	atomic_init(workLeft, 0);
+	cl_queue_properties cmdQueuePropsDevice[] = { CL_QUEUE_PROPERTIES,
+		(cl_command_queue_properties) CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_ON_DEVICE | CL_QUEUE_ON_DEVICE_DEFAULT, 0 };
+	commandsCPUOnDevice = clCreateCommandQueueWithProperties(context, devices[0], cmdQueuePropsDevice, &err);
+	if (err != CL_SUCCESS) return err;
+	commandsGPUOnDevice = clCreateCommandQueueWithProperties(context, devices[1], cmdQueuePropsDevice, &err);
+	return err;
+}
+
+void releaseOnDeviceCommandQueues(cl_context context) {
+	clReleaseCommandQueue(commandsCPUOnDevice);
+	clReleaseCommandQueue(commandsGPUOnDevice);
+	commandsCPUOnDevice = 0;
+	commandsGPUOnDevice = 0;
+}
+
 cl_int getDevicesfromPlatformIdx(int platformIdx, cl_uint *numDevices, cl_device_id **devices) {
 	cl_int status;

@@ -163,6 +190,47 @@ cl_int launchKernelFused(cl_command_queue commandsCPU, cl_command_queue commands
 cl_int launchKernelFusedDivideWorkInDim(cl_command_queue commandsCPU, cl_command_queue commandsGPU, cl_kernel kernel, int workDim, int divideWorkDim, int *percentWorkCPU, const size_t global_work_size[], const size_t local_work_size[], int launchKernelSynchronous, double *timeCPU, double *timeGPU, double *timeHost) {
 	totalMeasurements++;
+	size_t initial_work_size[3] = {global_work_size[0], workDim > 1 ? global_work_size[1] : 1, workDim > 2 ? global_work_size[2] : 1};
+	if (commandsCPUOnDevice && commandsGPUOnDevice) { // prepare initial work groups for device-side queueing
+		initial_work_size[divideWorkDim] = local_work_size[divideWorkDim]*16;
+		if (initial_work_size[divideWorkDim] > global_work_size[divideWorkDim])
+			initial_work_size[divideWorkDim] = global_work_size[divideWorkDim];
+		atomic_store(workLeft, (int)(global_work_size[divideWorkDim] - initial_work_size[divideWorkDim]));
+#ifdef DEBUG
+		printf("\n### launchKernelFused -- self-scheduling ###\n");
+		printf("divideWorkDim: %d, global_work_size[divideWorkDim]: %d, initial_work_size[divideWorkDim]: %d\n", divideWorkDim, global_work_size[divideWorkDim], initial_work_size[divideWorkDim]);
+#endif
+		cl_uint kernel_num_args;
+		clGetKernelInfo(kernel, CL_KERNEL_NUM_ARGS, sizeof(cl_uint), &kernel_num_args, NULL);
+		cl_int err;
+		err = clSetKernelArg(kernel, kernel_num_args - 3, sizeof(int), &divideWorkDim);
+		if (err != CL_SUCCESS) {
+#ifdef DEBUG
+			printf("\n### launchKernelFused -- failed to set divideWorkDim argument (arg #%d)\n", kernel_num_args - 3);
+#endif
+			return err;
+		}
+		err = clSetKernelArg(kernel, kernel_num_args - 2, sizeof(size_t), &global_work_size[divideWorkDim]);
+		if (err != CL_SUCCESS) {
+#ifdef DEBUG
+			printf("\n### launchKernelFused -- failed to set global_work_size argument (arg #%d)\n", kernel_num_args - 2);
+#endif
+			return err;
+		}
+		err = clSetKernelArgSVMPointer(kernel, kernel_num_args - 1, workLeft);
+		if (err != CL_SUCCESS) {
+#ifdef DEBUG
+			printf("\n### launchKernelFused -- failed to set workLeft argument (arg #%d)\n", kernel_num_args - 1);
+#endif
+			return err;
+		}
+		*percentWorkCPU = ((initial_work_size[divideWorkDim] >= 2*local_work_size[divideWorkDim] && initial_work_size[divideWorkDim] < global_work_size[divideWorkDim]) ? 50 : 100);
+	}
+#ifdef DEBUG
+	else printf("\n### launchKernelFused -- divide work on host ###\n");
+#endif

@@ -177,10 +245,10 @@ cl_int launchKernelFusedDivideWorkInDim(cl_command_queue commandsCPU, cl_command
 	}
 	cl_int err = 0;
 	if (*percentWorkCPU < 0) // NOTE percentWorkCPU < 0 triggers host-side profiling. Then, -percentWorkCPU of total workitems will be used for profiling
-		err = launchKernelFusedParallelAutoDivideWorkInDim(commandsCPU, commandsGPU, kernel, workDim, divideWorkDim, percentWorkCPU, global_work_size, local_work_size, launchKernelSynchronous, timeCPU, timeGPU, timeHost);
+		err = launchKernelFusedParallelAutoDivideWorkInDim(commandsCPU, commandsGPU, kernel, workDim, divideWorkDim, percentWorkCPU, ((commandsCPUOnDevice && commandsGPUOnDevice) ? initial_work_size : global_work_size), local_work_size, launchKernelSynchronous, timeCPU, timeGPU, timeHost);
 	else
-		err = launchKernelFusedParallel(commandsCPU, commandsGPU, kernel, workDim, divideWorkDim, percentWorkCPU, global_work_size, local_work_size, launchKernelSynchronous, timeCPU, timeGPU, timeHost);
+		err = launchKernelFusedParallel(commandsCPU, commandsGPU, kernel, workDim, divideWorkDim, percentWorkCPU, ((commandsCPUOnDevice && commandsGPUOnDevice) ? initial_work_size : global_work_size), local_work_size, launchKernelSynchronous, timeCPU, timeGPU, timeHost);
 #if defined (__CL_PRINT_STATS) || defined (__CL_PRINT_DAT) || defined (__CL_MEASURE_ALL_KERNELS)
 	clGetKernelInfo(kernel, CL_KERNEL_FUNCTION_NAME, 92, kernelName[currentKernelRun], NULL);
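Pulling the host-side pieces together, the helpers added or extended in cl_utils.c appear to be meant to be used roughly as follows. This is an editorial sketch, not code from the repository: only the helper functions, the kernel file name and the kernel name come from the commit; queue handles, work sizes and the surrounding function are assumptions, and error handling plus the kernel's own argument setup are omitted.

/* sketch.c -- assumed usage of the self-scheduling helpers */
#include <CL/cl.h>
#include "cl_utils.h"

static void run_self_scheduled(void)
{
	cl_int err;
	cl_uint num;
	cl_device_id *devices;
	getDevicesfromPlatformIdx(0, &num, &devices);          /* CPU and GPU of one platform */

	cl_context context = clCreateContext(NULL, num, devices, NULL, NULL, &err);
#ifdef SELF_SCHEDULE
	initOnDeviceCommandQueues(context);   /* on-device default queues + SVM workLeft counter */
#endif
	/* host-side queues used for the initial launch on each device */
	cl_command_queue commandsCPU = clCreateCommandQueueWithProperties(context, devices[0], NULL, &err);
	cl_command_queue commandsGPU = clCreateCommandQueueWithProperties(context, devices[1], NULL, &err);

	/* getBuiltProgramFromFile() now appends -DSELF_SCHEDULE when the on-device
	   queues exist, so the .cl macros expand to their self-scheduling variants */
	cl_program program = getBuiltProgramFromFile(context, "particle_double.cl", NULL, &err);
	cl_kernel kernel = clCreateKernel(program, "sum_kernel", &err);
	/* ... clSetKernelArg()/clSetKernelArgSVMPointer() for the kernel's own arguments ... */

	size_t global_work_size[1] = {4096};
	size_t local_work_size[1]  = {128};
	int percentWorkCPU = 50;   /* overwritten in self-scheduling mode; negative values select the profiling path */
	double timeCPU, timeGPU, timeHost;

	/* shrinks the initial grid, seeds workLeft with the remainder and sets the
	   three trailing SELF_SCHEDULE_ARGS arguments before launching on both devices */
	launchKernelFusedDivideWorkInDim(commandsCPU, commandsGPU, kernel, 1, 0,
	                                 &percentWorkCPU, global_work_size, local_work_size,
	                                 1, &timeCPU, &timeGPU, &timeHost);

	clReleaseKernel(kernel);
	clReleaseProgram(program);
	clReleaseCommandQueue(commandsCPU);
	clReleaseCommandQueue(commandsGPU);
#ifdef SELF_SCHEDULE
	releaseOnDeviceCommandQueues(context);
#endif
	clReleaseContext(context);
}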
common/cl_utils.h

@@ -27,6 +27,10 @@ char* readSourceFromFileName(const char* fileName);
 cl_program getBuiltProgramFromFile(cl_context context, const char *sourceFileName, char *options, cl_int *err);
+cl_int initOnDeviceCommandQueues(cl_context context);
+void releaseOnDeviceCommandQueues(cl_context context);
 cl_int getDevicesfromPlatformIdx(int platformIdx, cl_uint *numDevices, cl_device_id **devices);
 void getCPUGPUIds(int *iCPU, int *iGPU, cl_device_id *devices, cl_uint numDevices);
common/make.config

#ifdef SELF_SCHEDULE
override SELF_SCHEDULE = -DSELF_SCHEDULE
#endif
OPENCL_INC =/opt/intel/opencl/include
OPENCL_LIB =/opt/intel/opencl -lOpenCL