Commit ec5699f8 authored by Marvin Damschen

Device self-scheduling using device-side queueing

parent 285a847d
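This commit threads three macro hooks through every benchmark: SELF_SCHEDULE_ARGS appends extra parameters to each kernel signature, SELF_SCHEDULE_CHILD_ARGS forwards them, and SCHEDULE_CHILD_1D/_ND re-enqueues the kernel from the device itself, so the GPU keeps scheduling work without a host round-trip. The macro definitions are not part of this diff; the following is a minimal editorial sketch of plausible expansions on top of OpenCL 2.0 device-side enqueue. The parameter names child_global_size/child_local_size and the choice of enqueuing work-item are assumptions, not the commit's actual code.

/* Editorial sketch -- not part of the commit. Plausible expansions of the
 * self-scheduling macros, assuming OpenCL 2.0 device-side enqueue. */
#ifdef SELF_SCHEDULE
/* Trailing parameters added to every self-scheduling kernel; the leading
 * comma lives inside the macro so signatures stay valid when it expands
 * to nothing. */
#define SELF_SCHEDULE_ARGS , const uint child_global_size, const uint child_local_size
/* Forwarded unchanged to the recursive child launch. */
#define SELF_SCHEDULE_CHILD_ARGS , child_global_size, child_local_size
/* A single work-item re-enqueues the kernel on the device's default
 * on-device queue; CLK_ENQUEUE_FLAGS_WAIT_KERNEL makes the child grid
 * wait until the whole parent grid has finished. */
#define SCHEDULE_CHILD_1D(call) \
    if (get_global_id(0) == 0) { \
        enqueue_kernel(get_default_queue(), \
                       CLK_ENQUEUE_FLAGS_WAIT_KERNEL, \
                       ndrange_1D(child_global_size, child_local_size), \
                       ^{ call; }); \
    }
#else
#define SELF_SCHEDULE_ARGS
#define SELF_SCHEDULE_CHILD_ARGS
#define SCHEDULE_CHILD_1D(call)
#endif

With SELF_SCHEDULE undefined, all three hooks expand to nothing and the kernels compile back to the original Rodinia code, which matches the #ifdef SELF_SCHEDULE guards on the host side below.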
......@@ -14,7 +14,7 @@ int addOffset(volatile __local uint *s_offset, uint data, uint threadTag){
}
__kernel void
bucketcount( global float *input, global int *indice, global uint *d_prefixoffsets, const int size, global float *l_pivotpoints, const size_t totalWorkItems){
bucketcount( global float *input, global int *indice, global uint *d_prefixoffsets, const int size, global float *l_pivotpoints, const size_t totalWorkItems SELF_SCHEDULE_ARGS){
__local uint s_offset[BLOCK_MEMORY];
......@@ -49,10 +49,10 @@ bucketcount( global float *input, global int *indice, global uint *d_prefixoffse
for (int i = get_local_id(0); i < BLOCK_MEMORY; i += get_local_size(0))
d_prefixoffsets[prefixBase + i] = s_offset[i]; // & ((1 << (32 - WARP_LOG_SIZE)) - 1);
SCHEDULE_CHILD_1D(bucketcount(input, indice, d_prefixoffsets, size, l_pivotpoints, totalWorkItems SELF_SCHEDULE_CHILD_ARGS))
}
__kernel void bucketprefixoffset(global uint *d_prefixoffsets, global uint *d_offsets, const int blocks){
__kernel void bucketprefixoffset(global uint *d_prefixoffsets, global uint *d_offsets, const int blocks SELF_SCHEDULE_ARGS){
int tid = get_global_id(0);
int size = blocks * BLOCK_MEMORY;
int sum = 0;
......@@ -65,12 +65,12 @@ __kernel void bucketprefixoffset(global uint *d_prefixoffsets, global uint *d_of
d_offsets[tid] = sum;
SCHEDULE_CHILD_1D(bucketprefixoffset(d_prefixoffsets, d_offsets, blocks SELF_SCHEDULE_CHILD_ARGS))
}
__kernel void
bucketsort(global float *input, global int *indice, __global float *output, const int size, global uint *d_prefixoffsets,
global uint *l_offsets, const size_t totalWorkItems){
global uint *l_offsets, const size_t totalWorkItems SELF_SCHEDULE_ARGS){
volatile __local unsigned int s_offset[BLOCK_MEMORY];
int prefixBase = get_group_id(0) * BLOCK_MEMORY;
......@@ -93,5 +93,5 @@ bucketsort(global float *input, global int *indice, __global float *output, cons
// }
}
SCHEDULE_CHILD_1D(bucketsort(input, indice, output, size, d_prefixoffsets, l_offsets, totalWorkItems SELF_SCHEDULE_CHILD_ARGS))
}
......@@ -39,7 +39,7 @@ inline void addData1024(volatile __local uint *s_WarpHist, uint data, uint tag){
float maximum,
uint dataCount,
size_t totalWorkItems
){
SELF_SCHEDULE_ARGS){
//Per-warp subhistogram storage
const int warpBase = (get_local_id(0) >> WARP_LOG_SIZE) * BIN_COUNT;
__local unsigned int s_Hist[BLOCK_MEMORY];
......@@ -70,6 +70,6 @@ inline void addData1024(volatile __local uint *s_WarpHist, uint data, uint tag){
atomic_add(d_Result+pos,sum);
}
SCHEDULE_CHILD_1D(histogram1024Kernel(d_Result, d_Data, minimum, maximum, dataCount, totalWorkItems SELF_SCHEDULE_CHILD_ARGS))
}
......@@ -50,6 +50,9 @@ int main(int argc, char** argv)
}
cl_context context = clCreateContext(NULL, num, devices, NULL, NULL, &err);
#ifdef SELF_SCHEDULE
initOnDeviceCommandQueues(context);
#endif
// Fill our data set with random float values
//
......@@ -208,6 +211,9 @@ int main(int argc, char** argv)
clSVMFree(context, d_output);
clSVMFree(context, nullElements);
clSVMFree(context, origOffsets);
#ifdef SELF_SCHEDULE
releaseOnDeviceCommandQueues(context);
#endif
clReleaseContext(context);
return 0;
......
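initOnDeviceCommandQueues/releaseOnDeviceCommandQueues are host-side helpers whose definitions are outside this diff. Device-side enqueue_kernel() drains an on-device queue that the host must create and mark as the device's default queue, so a plausible sketch is the following; the single-device context and the 16 KiB queue size are assumptions.

/* Editorial sketch -- a plausible initOnDeviceCommandQueues(); the real
 * implementation is not shown in this commit. */
#include <CL/cl.h>

void initOnDeviceCommandQueues(cl_context context)
{
    cl_device_id device;
    /* Assumes a single-device context, as in these benchmarks. */
    clGetContextInfo(context, CL_CONTEXT_DEVICES, sizeof(device), &device, NULL);

    cl_queue_properties props[] = {
        CL_QUEUE_PROPERTIES,
        /* On-device queues must also be out-of-order. */
        CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE |
        CL_QUEUE_ON_DEVICE | CL_QUEUE_ON_DEVICE_DEFAULT,
        CL_QUEUE_SIZE, 16 * 1024,  /* optional; bytes for device-enqueued kernels */
        0
    };
    cl_int err;
    cl_command_queue device_queue =
        clCreateCommandQueueWithProperties(context, device, props, &err);
    /* A real implementation would stash device_queue so that
     * releaseOnDeviceCommandQueues() can clReleaseCommandQueue() it. */
    (void)device_queue; (void)err;
}

Marking the queue CL_QUEUE_ON_DEVICE_DEFAULT is what makes get_default_queue() inside the kernels resolve to it.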
......@@ -40,7 +40,7 @@ float4 getHighest(float4 a, float4 b)
}
__kernel void mergeSortFirst(__global float4 *input,__global float4 *result, const int listsize){
__kernel void mergeSortFirst(__global float4 *input,__global float4 *result, const int listsize SELF_SCHEDULE_ARGS){
int bx = get_group_id(0);
......@@ -49,16 +49,16 @@ __kernel void mergeSortFirst(__global float4 *input,__global float4 *result, con
result[bx * get_local_size(0) + get_local_id(0)] = sortElem(r);
}
SCHEDULE_CHILD_1D(mergeSortFirst(input, result, listsize SELF_SCHEDULE_CHILD_ARGS))
}
__kernel void
mergeSortPass(__global float4 *input, __global float4 *result,const int nrElems,int threadsPerDiv, __global int *constStartAddr){
mergeSortPass(__global float4 *input, __global float4 *result,const int nrElems,int threadsPerDiv, __global int *constStartAddr SELF_SCHEDULE_ARGS){
int gid = get_global_id(0);
// The division to work on
int division = gid / threadsPerDiv;
if(division >= BIN_COUNT) return;
if(division >= BIN_COUNT) goto self_schedule;
// The block within the division
int int_gid = gid - division * threadsPerDiv;
int Astart = constStartAddr[division] + int_gid * nrElems;
......@@ -68,13 +68,13 @@ mergeSortPass(__global float4 *input, __global float4 *result,const int nrElems,
resStart= &(result[Astart]);
if(Astart >= constStartAddr[division + 1])
return;
goto self_schedule;
if(Bstart >= constStartAddr[division + 1]){
for(int i=0; i<(constStartAddr[division + 1] - Astart); i++)
{
resStart[i] = input[Astart + i];
}
return;
goto self_schedule;
}
int aidx = 0;
......@@ -127,14 +127,20 @@ mergeSortPass(__global float4 *input, __global float4 *result,const int nrElems,
}
resStart[outidx++] = b;
self_schedule:
SCHEDULE_CHILD_1D(mergeSortPass(input, result, nrElems, threadsPerDiv, constStartAddr SELF_SCHEDULE_CHILD_ARGS))
return;
}
__kernel void
mergepack(__global float *orig, __global float *result, __constant int *constStartAddr, __constant int *nullElems, __constant int *finalStartAddr){
mergepack(__global float *orig, __global float *result, __constant int *constStartAddr, __constant int *nullElems, __constant int *finalStartAddr SELF_SCHEDULE_ARGS){
int idx = get_global_id(0);
int division = get_group_id(1);
if((finalStartAddr[division] + idx) >= finalStartAddr[division + 1]) return;
if((finalStartAddr[division] + idx) >= finalStartAddr[division + 1]) goto self_schedule;
result[finalStartAddr[division] + idx] = orig[constStartAddr[division]*4 + nullElems[division] + idx];
self_schedule:
SCHEDULE_CHILD_ND(mergepack(orig, result, constStartAddr, nullElems, finalStartAddr SELF_SCHEDULE_CHILD_ARGS))
return;
}
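Note the control-flow rewrite in mergeSortPass and mergepack: every early return becomes goto self_schedule, funneling all exits through a single scheduling epilogue so the child enqueue at the kernel's tail is never skipped. The same pattern recurs in the leukocyte kernels below. In the shape of the macros sketched above, the rewrite looks like this:

/* Editorial sketch -- the generic shape of the rewrite, reusing the
 * macro expansions assumed earlier. */
__kernel void example(__global int *data, const int n SELF_SCHEDULE_ARGS)
{
    int gid = get_global_id(0);
    if (gid >= n)
        goto self_schedule;   /* was: return; */

    data[gid] += 1;           /* the kernel's actual work */

self_schedule:
    SCHEDULE_CHILD_1D(example(data, n SELF_SCHEDULE_CHILD_ARGS))
    return;
}

mergepack indexes get_group_id(1), so it uses SCHEDULE_CHILD_ND, presumably the same expansion built on an N-dimensional range (ndrange_2D/ndrange_3D) instead of ndrange_1D.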
......@@ -13,7 +13,7 @@ kmeans_kernel_c(__global float *feature,
int nfeatures,
int offset,
int size
)
SELF_SCHEDULE_ARGS)
{
unsigned int point_id = get_global_id(0);
int index = 0;
......@@ -41,7 +41,7 @@ kmeans_kernel_c(__global float *feature,
membership[point_id] = index;
}
SCHEDULE_CHILD_1D(kmeans_kernel_c(feature, clusters, membership, npoints, nclusters, nfeatures, offset, size SELF_SCHEDULE_CHILD_ARGS))
}
__kernel void
......@@ -49,7 +49,7 @@ kmeans_swap(__global float *feature,
__global float *feature_swap,
int npoints,
int nfeatures
){
SELF_SCHEDULE_ARGS){
unsigned int tid = get_global_id(0);
//for(int i = 0; i < nfeatures; i++)
......@@ -61,5 +61,5 @@ kmeans_swap(__global float *feature,
}
// end of Lingjie Zhang's modification
SCHEDULE_CHILD_1D(kmeans_swap(feature, feature_swap, npoints, nfeatures SELF_SCHEDULE_CHILD_ARGS))
}
......@@ -72,6 +72,9 @@ static int initialize()
}
context = clCreateContext(NULL, num, devices, NULL, NULL, &err);
#ifdef SELF_SCHEDULE
initOnDeviceCommandQueues(context);
#endif
int iCPU, iGPU;
getCPUGPUIds(&iCPU, &iGPU, devices, num);
......@@ -88,6 +91,9 @@ static int shutdown()
// release resources
if( commandsCPU ) clReleaseCommandQueue( commandsCPU );
if( commandsGPU ) clReleaseCommandQueue( commandsGPU );
#ifdef SELF_SCHEDULE
releaseOnDeviceCommandQueues(context);
#endif
if( context ) clReleaseContext( context );
if( devices ) delete devices;
......
......@@ -93,7 +93,7 @@ __kernel void kernel_gpu_opencl( par_str d_par_gpu,
__global FOUR_VECTOR *d_rv_gpu,
__global fp *d_qv_gpu,
__global FOUR_VECTOR *d_fv_gpu
)
SELF_SCHEDULE_ARGS)
{
//--------------------------------------------------------------------------------------------------------------------------------------------------------------------------180
......@@ -277,7 +277,7 @@ __kernel void kernel_gpu_opencl( par_str d_par_gpu,
}
SCHEDULE_CHILD_1D(kernel_gpu_opencl(d_par_gpu, d_dim_gpu, d_box_gpu, d_rv_gpu, d_qv_gpu, d_fv_gpu SELF_SCHEDULE_CHILD_ARGS))
}
//========================================================================================================================================================================================================200
......@@ -286,4 +286,4 @@ __kernel void kernel_gpu_opencl( par_str d_par_gpu,
#ifdef __cplusplus
}
#endif
#endif
\ No newline at end of file
......@@ -117,6 +117,9 @@ main( int argc,
}
cl_context context = clCreateContext(NULL, num, devices, NULL, NULL, &error);
#ifdef SELF_SCHEDULE
initOnDeviceCommandQueues(context);
#endif
printf("WG size of kernel = %d \n", NUMBER_THREADS);
......@@ -366,6 +369,9 @@ main( int argc,
//======================================================================================================================================================150
// RETURN
//======================================================================================================================================================150
#ifdef SELF_SCHEDULE
releaseOnDeviceCommandQueues(context);
#endif
return 0.0; // always returns 0.0
}
......
......@@ -17,13 +17,13 @@ __kernel void GICOV_kernel(int grad_m, image2d_t grad_x, image2d_t grad_y, __con
__constant float *c_cos_angle, __constant int *c_tX, __constant int *c_tY, __global float *gicov) {
#else
__kernel void GICOV_kernel(int grad_m, __global float *grad_x, __global float *grad_y, __constant float *c_sin_angle,
__constant float *c_cos_angle, __constant int *c_tX, __constant int *c_tY, __global float *gicov, int width, int height) {
__constant float *c_cos_angle, __constant int *c_tX, __constant int *c_tY, __global float *gicov, int width, int height SELF_SCHEDULE_ARGS) {
#endif
int i, j, k, n, x, y;
int gid = get_global_id(0);
if(gid>=width*height)
return;
goto self_schedule;
// Determine this thread's pixel
i = gid/width + MAX_RAD + 2;
......@@ -81,6 +81,9 @@ __kernel void GICOV_kernel(int grad_m, __global float *grad_x, __global float *g
// Store the maximal GICOV value
gicov[(i * grad_m) + j] = max_GICOV;
self_schedule:
SCHEDULE_CHILD_1D(GICOV_kernel(grad_m, grad_x, grad_y, c_sin_angle, c_cos_angle, c_tX, c_tY, gicov, width, height SELF_SCHEDULE_CHILD_ARGS))
return;
}
......@@ -93,7 +96,7 @@ __kernel void dilate_kernel(int img_m, int img_n, int strel_m, int strel_n, __co
image2d_t img, __global float *dilated) {
#else
__kernel void dilate_kernel(int img_m, int img_n, int strel_m, int strel_n, __constant float *c_strel,
__global float *img, __global float *dilated) {
__global float *img, __global float *dilated SELF_SCHEDULE_ARGS) {
#endif
// Find the center of the structuring element
......@@ -105,7 +108,7 @@ __kernel void dilate_kernel(int img_m, int img_n, int strel_m, int strel_n, __co
int i = thread_id % img_m;
int j = thread_id / img_m;
if(j > img_n) return;
if(j > img_n) goto self_schedule;
// Initialize the maximum GICOV score seen so far to zero
float max = 0.0f;
......@@ -151,4 +154,7 @@ __kernel void dilate_kernel(int img_m, int img_n, int strel_m, int strel_n, __co
}
// end of Lingjie Zhang's modification
self_schedule:
SCHEDULE_CHILD_1D(dilate_kernel(img_m, img_n, strel_m, strel_n, c_strel, img, dilated SELF_SCHEDULE_CHILD_ARGS))
return;
}
......@@ -155,6 +155,9 @@ void select_device() {
}
context = clCreateContext(NULL, num, devices, NULL, NULL, &err);
#ifdef SELF_SCHEDULE
initOnDeviceCommandQueues(context);
#endif
int iCPU, iGPU;
getCPUGPUIds(&iCPU, &iGPU, devices, num);
......
......@@ -25,7 +25,7 @@ float heaviside(float x) {
// Kernel to compute the Motion Gradient Vector Field (MGVF) matrix for multiple cells
__kernel void IMGVF_kernel(__global float *IMGVF_array, __global float *I_array, __constant int *I_offsets, int __constant *m_array,
__constant int *n_array, float vx, float vy, float e, int max_iterations, float cutoff) {
__constant int *n_array, float vx, float vy, float e, int max_iterations, float cutoff SELF_SCHEDULE_ARGS) {
// Shared copy of the matrix being computed
__local float IMGVF[41 * 81];
......@@ -211,5 +211,5 @@ __kernel void IMGVF_kernel(__global float *IMGVF_array, __global float *I_array,
// if (thread_id == 0) IMGVF_global[0] = (float) iterations;
SCHEDULE_CHILD_1D(IMGVF_kernel(IMGVF_array, I_array, I_offsets, m_array, n_array, vx, vy, e, max_iterations, cutoff SELF_SCHEDULE_CHILD_ARGS))
}
......@@ -34,7 +34,7 @@ FP_TYPE heaviside(FP_TYPE x) {
// Kernel to compute the Motion Gradient Vector Field (MGVF) matrix for multiple cells
__kernel void IMGVF_kernel(__global float *IMGVF_array, __global float *I_array, __constant int *I_offsets, int __constant *m_array,
__constant int *n_array, float vx, float vy, float e, int max_iterations, float cutoff) {
__constant int *n_array, float vx, float vy, float e, int max_iterations, float cutoff SELF_SCHEDULE_ARGS) {
// Shared copy of the matrix being computed
__local FP_TYPE IMGVF[41 * 81];
......@@ -217,5 +217,5 @@ __kernel void IMGVF_kernel(__global float *IMGVF_array, __global float *I_array,
IMGVF_global[offset] = IMGVF[offset];
}
SCHEDULE_CHILD_1D(IMGVF_kernel(IMGVF_array, I_array, I_offsets, m_array, n_array, vx, vy, e, max_iterations, cutoff SELF_SCHEDULE_CHILD_ARGS))
}
......@@ -134,6 +134,9 @@ static int initialize()
}
context = clCreateContext(NULL, num, devices, NULL, NULL, &err);
#ifdef SELF_SCHEDULE
initOnDeviceCommandQueues(context);
#endif
int iCPU, iGPU;
getCPUGPUIds(&iCPU, &iGPU, devices, num);
......@@ -150,6 +153,9 @@ static int shutdown()
// release resources
if( commandsCPU ) clReleaseCommandQueue( commandsCPU );
if( commandsGPU ) clReleaseCommandQueue( commandsGPU );
#ifdef SELF_SCHEDULE
releaseOnDeviceCommandQueues(context);
#endif
if( context ) clReleaseContext( context );
if( devices ) delete devices;
......
......@@ -4,7 +4,7 @@ __kernel void
lud_diagonal(__global float *m,
__local float *shadow,
int matrix_dim,
int offset)
int offset SELF_SCHEDULE_ARGS)
{
int i,j;
int tx = get_local_id(0);
......@@ -41,8 +41,8 @@ lud_diagonal(__global float *m,
array_offset += matrix_dim;
}
SCHEDULE_CHILD_WITH_LOCAL_PTRS_1D(^(local void *shadow){lud_diagonal(m, shadow, matrix_dim, offset SELF_SCHEDULE_CHILD_ARGS);}\
COMMA (uint)(sizeof(float) * BLOCK_SIZE * BLOCK_SIZE))
}
__kernel void
......@@ -51,7 +51,7 @@ lud_perimeter(__global float *m,
__local float *peri_row,
__local float *peri_col,
int matrix_dim,
int offset)
int offset SELF_SCHEDULE_ARGS)
{
int i,j, array_offset;
int idx;
......@@ -124,8 +124,8 @@ lud_perimeter(__global float *m,
}
}
SCHEDULE_CHILD_WITH_LOCAL_PTRS_1D(^(local void *dia, local void *peri_row, local void *peri_col){lud_perimeter(m, dia, peri_row, peri_col, matrix_dim, offset SELF_SCHEDULE_CHILD_ARGS);}\
COMMA (uint)(sizeof(float) * BLOCK_SIZE * BLOCK_SIZE) COMMA (uint)(sizeof(float) * BLOCK_SIZE * BLOCK_SIZE) COMMA (uint)(sizeof(float) * BLOCK_SIZE * BLOCK_SIZE))
}
__kernel void
......@@ -133,7 +133,7 @@ lud_internal(__global float *m,
__local float *peri_row,
__local float *peri_col,
int matrix_dim,
int offset)
int offset SELF_SCHEDULE_ARGS)
{
int bx = get_group_id(0);
......@@ -158,8 +158,8 @@ lud_internal(__global float *m,
sum += peri_col[ty * BLOCK_SIZE + i] * peri_row[i * BLOCK_SIZE + tx];
m[(global_row_id+ty)*matrix_dim+global_col_id+tx] -= sum;
SCHEDULE_CHILD_WITH_LOCAL_PTRS_ND(^(local void *peri_row, local void *peri_col){lud_internal(m, peri_row, peri_col, matrix_dim, offset SELF_SCHEDULE_CHILD_ARGS);}\
COMMA (uint)(sizeof(float) * BLOCK_SIZE * BLOCK_SIZE) COMMA (uint)(sizeof(float) * BLOCK_SIZE * BLOCK_SIZE))
}
......
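The lud kernels take __local buffers as arguments, which a plain re-enqueue cannot capture. OpenCL's enqueue_kernel() instead accepts a block whose parameters are local void *, followed by one uint size per local buffer, which is exactly the shape of the SCHEDULE_CHILD_WITH_LOCAL_PTRS_* call sites above. The COMMA macro exists because those sizes sit at the top level of the single macro argument, where a literal comma would split it during argument collection. A sketch under the same assumptions as earlier:

/* Editorial sketch -- assumed expansion of the local-pointer variant. */
#ifdef SELF_SCHEDULE
#define COMMA ,
#define SCHEDULE_CHILD_WITH_LOCAL_PTRS_1D(block_and_sizes) \
    if (get_global_id(0) == 0) { \
        enqueue_kernel(get_default_queue(), \
                       CLK_ENQUEUE_FLAGS_WAIT_KERNEL, \
                       ndrange_1D(child_global_size, child_local_size), \
                       /* expands to: block, size0, size1, ... */ \
                       block_and_sizes); \
    }
#else
#define COMMA
#define SCHEDULE_CHILD_WITH_LOCAL_PTRS_1D(block_and_sizes)
#endif

At launch time the runtime allocates fresh local buffers of the given sizes for the child grid and passes them to the block's local void * parameters, so the child's local memory is independent of the parent's.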
......@@ -1314,7 +1314,7 @@ kernel_gpu_opencl( int timeinst,
__global fp *d_initvalu,
__global fp *d_finavalu,
__global fp *d_params,
__global fp *d_com)
__global fp *d_com SELF_SCHEDULE_ARGS)
{
//======================================================================================================================================================150
......@@ -1436,7 +1436,7 @@ kernel_gpu_opencl( int timeinst,
// END
//======================================================================================================================================================150
SCHEDULE_CHILD_1D(kernel_gpu_opencl(timeinst, d_initvalu, d_finavalu, d_params, d_com SELF_SCHEDULE_CHILD_ARGS))
}
//========================================================================================================================================================================================================200
......
......@@ -207,6 +207,9 @@ main( int argc,
}
cl_context context = clCreateContext(NULL, num, devices, NULL, NULL, &error);
#ifdef SELF_SCHEDULE
initOnDeviceCommandQueues(context);
#endif
//======================================================================================================================================================150
// EXECUTION IF THERE IS 1 WORKLOAD, PARALLELIZE INSIDE 1 WORKLOAD
......@@ -345,6 +348,9 @@ main( int argc,
// com
free(com);
#ifdef SELF_SCHEDULE
releaseOnDeviceCommandQueues(context);
#endif
//====================================================================================================100
// END
......
......@@ -41,6 +41,9 @@ int main(int argc, char *argv[]) {
}
context = clCreateContext(NULL, num, devices, NULL, NULL, &error);
#ifdef SELF_SCHEDULE
initOnDeviceCommandQueues(context);
#endif
int numRecords = loadData(filename,records,locations);
......@@ -68,6 +71,9 @@ int main(int argc, char *argv[]) {
clSVMFree(context,recordDistances);
clSVMFree(context, records);
clSVMFree(context, locations);
#ifdef SELF_SCHEDULE
releaseOnDeviceCommandQueues(context);
#endif
return 0;
}
......
......@@ -12,7 +12,7 @@ __kernel void NearestNeighbor(__global LatLong *d_locations,
__global float *d_distances,
const int numRecords,
const float lat,
const float lng) {
const float lng SELF_SCHEDULE_ARGS) {
int globalId = get_global_id(0);
if (globalId < numRecords) {
......@@ -22,5 +22,5 @@ __kernel void NearestNeighbor(__global LatLong *d_locations,
*dist = (float)sqrt((lat-latLong->lat)*(lat-latLong->lat)+(lng-latLong->lng)*(lng-latLong->lng));
}
}
SCHEDULE_CHILD_1D(NearestNeighbor(d_locations, d_distances, numRecords, lat, lng SELF_SCHEDULE_CHILD_ARGS))
}
\ No newline at end of file
......@@ -75,6 +75,9 @@ static int initialize()
}
context = clCreateContext(NULL, num, devices, NULL, NULL, &err);
#ifdef SELF_SCHEDULE
initOnDeviceCommandQueues(context);
#endif
int iCPU, iGPU;
getCPUGPUIds(&iCPU, &iGPU, devices, num);
......@@ -91,6 +94,9 @@ static int shutdown()
// release resources
if( commandsCPU ) clReleaseCommandQueue( commandsCPU );
if( commandsGPU ) clReleaseCommandQueue( commandsGPU );
#ifdef SELF_SCHEDULE
releaseOnDeviceCommandQueues(context);
#endif
if( context ) clReleaseContext( context );
if( devices ) delete devices;
......
......@@ -30,7 +30,7 @@ nw_kernel1(__global int * reference_d,
int block_width,
int worksize,
int offset_r,
int offset_c
int offset_c SELF_SCHEDULE_ARGS
)
{
......@@ -110,8 +110,9 @@ nw_kernel1(__global int * reference_d,
for ( int ty = 0 ; ty < BLOCK_SIZE ; ty++)
itemsets_d[index + cols * ty] = SCORE((ty+1), (tx+1));
SCHEDULE_CHILD_WITH_LOCAL_PTRS_1D(^(local void *input_itemsets_l, local void *reference_l){nw_kernel1(reference_d, itemsets_d, input_itemsets_l, reference_l, cols, penalty,
blk, block_width, worksize, offset_r, offset_c SELF_SCHEDULE_CHILD_ARGS);}\
COMMA (uint)(sizeof(int)*(BLOCK_SIZE+1)*(BLOCK_SIZE+1)) COMMA (uint)(sizeof(int)*BLOCK_SIZE*BLOCK_SIZE))
}
__kernel void
......@@ -125,7 +126,7 @@ nw_kernel2(__global int * reference_d,
int block_width,
int worksize,
int offset_r,
int offset_c
int offset_c SELF_SCHEDULE_ARGS
)
{
......@@ -197,6 +198,7 @@ nw_kernel2(__global int * reference_d,
itemsets_d[index + ty * cols] = SCORE((ty+1), (tx+1));
SCHEDULE_CHILD_WITH_LOCAL_PTRS_1D(^(local void *input_itemsets_l, local void *reference_l){nw_kernel2(reference_d, itemsets_d, input_itemsets_l, reference_l, cols, penalty,
blk, block_width, worksize, offset_r, offset_c SELF_SCHEDULE_CHILD_ARGS);}\
COMMA (uint)(sizeof(int)*(BLOCK_SIZE+1)*(BLOCK_SIZE+1)) COMMA (uint)(sizeof(int)*BLOCK_SIZE*BLOCK_SIZE))
}
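One caveat for the host-side changes in this commit: device-side queueing requires an OpenCL 2.x device, and under OpenCL 3.0 the feature is optional. A guard such as the following sketch (not part of this commit) could precede initOnDeviceCommandQueues():

/* Editorial sketch -- capability check before enabling SELF_SCHEDULE paths.
 * CL_DEVICE_MAX_ON_DEVICE_QUEUES reports 0 on devices without device-side
 * enqueue support. */
#include <CL/cl.h>
#include <stdio.h>

int supportsDeviceEnqueue(cl_device_id device)
{
    cl_uint max_queues = 0;
    clGetDeviceInfo(device, CL_DEVICE_MAX_ON_DEVICE_QUEUES,
                    sizeof(max_queues), &max_queues, NULL);
    if (max_queues == 0)
        fprintf(stderr, "device-side queueing not supported on this device\n");
    return max_queues > 0;
}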