Commit ec5699f8 authored by Marvin Damschen

Device self-scheduling using device-side queueing

parent 285a847d
......@@ -62,7 +62,7 @@ findK( long height,
__global long *currKnodeD,
__global long *offsetD,
__global int *keysD,
__global record *ansD)
__global record *ansD SELF_SCHEDULE_ARGS)
{
// private thread IDs
......@@ -99,7 +99,7 @@ findK( long height,
ansD[bid].value = recordsD[knodesD[currKnodeD[bid]].indices[thid]].value;
}
SCHEDULE_CHILD_1D(findK(height, knodesD, knodes_elem, recordsD, currKnodeD, offsetD, keysD, ansD SELF_SCHEDULE_CHILD_ARGS))
}
//========================================================================================================================================================================================================200
......
......@@ -55,7 +55,7 @@ findRangeK( long height,
__global int *startD,
__global int *endD,
__global int *RecstartD,
__global int *ReclenD)
__global int *ReclenD SELF_SCHEDULE_ARGS)
{
// private thread IDs
......@@ -105,7 +105,7 @@ findRangeK( long height,
ReclenD[bid] = knodesD[lastKnodeD[bid]].indices[thid] - RecstartD[bid]+1;
}
SCHEDULE_CHILD_1D(findRangeK(height, knodesD, knodes_elem, currKnodeD, offsetD, lastKnodeD, offset_2D, startD, endD, RecstartD, ReclenD SELF_SCHEDULE_CHILD_ARGS))
}
//========================================================================================================================================================================================================200
......
......@@ -1869,6 +1869,9 @@ main( int argc,
//====================================================================================================100
cl_context context = clCreateContext(NULL, num, devices, NULL, NULL, &error);
#ifdef SELF_SCHEDULE
initOnDeviceCommandQueues(context);
#endif
printf("WG size of kernel 1 = %d WG size of kernel 2 = %d \n", DEFAULT_ORDER, DEFAULT_ORDER_2);
......@@ -2428,6 +2431,9 @@ main( int argc,
// ------------------------------------------------------------60
free(mem);
#ifdef SELF_SCHEDULE
releaseOnDeviceCommandQueues(context);
#endif
clReleaseContext(context);
return EXIT_SUCCESS;
......
......@@ -18,7 +18,7 @@ bpnn_layerforward_ocl(__global float *input_cuda,
__local float *input_node,
__local float *weight_matrix,
int in,
int hid)
int hid SELF_SCHEDULE_ARGS)
{
int by = get_group_id(1);
......@@ -61,8 +61,8 @@ bpnn_layerforward_ocl(__global float *input_cuda,
hidden_partial_sum[by * hid + ty] = weight_matrix[tx* WIDTH + ty];
}
SCHEDULE_CHILD_WITH_LOCAL_PTRS_ND(^(local void *input_node, local void *weight_matrix){bpnn_layerforward_ocl(input_cuda, output_hidden_cuda, input_hidden_cuda, hidden_partial_sum, input_node, weight_matrix, in, hid SELF_SCHEDULE_CHILD_ARGS);}\
COMMA (uint)(HEIGHT*sizeof(float)) COMMA (uint)(HEIGHT*WIDTH*sizeof(float)))
}
......@@ -71,7 +71,7 @@ __kernel void bpnn_adjust_weights_ocl( __global float * delta,
__global float * ly,
int in,
__global float * w,
__global float * oldw)
__global float * oldw SELF_SCHEDULE_ARGS)
{
int by = get_group_id(1);
......@@ -92,6 +92,6 @@ __kernel void bpnn_adjust_weights_ocl( __global float * delta,
oldw[index_x] = ((ETA * delta[index_x]) + (MOMENTUM * oldw[index_x]));
}
SCHEDULE_CHILD_ND(bpnn_adjust_weights_ocl(delta, hid, ly, in, w, oldw SELF_SCHEDULE_CHILD_ARGS))
}
#endif
......@@ -80,7 +80,13 @@ main( int argc, char** argv)
}
context = clCreateContext(NULL, num_devices, device_list, NULL, NULL, &err);
#ifdef SELF_SCHEDULE
initOnDeviceCommandQueues(context);
#endif
setup(context, argc, argv);
#ifdef SELF_SCHEDULE
releaseOnDeviceCommandQueues(context);
#endif
clReleaseContext(context);
}
......
......@@ -18,7 +18,7 @@ __kernel void BFS_1( const __global Node* g_graph_nodes,
__global char* g_updating_graph_mask,
__global char* g_graph_visited,
__global int* g_cost,
const int no_of_nodes){
const int no_of_nodes SELF_SCHEDULE_ARGS){
int tid = get_global_id(0);
if( tid<no_of_nodes && g_graph_mask[tid]){
g_graph_mask[tid]=false;
......@@ -31,7 +31,7 @@ __kernel void BFS_1( const __global Node* g_graph_nodes,
}
}
SCHEDULE_CHILD_1D(BFS_1(g_graph_nodes, g_graph_edges, g_graph_mask, g_updating_graph_mask, g_graph_visited, g_cost, no_of_nodes SELF_SCHEDULE_CHILD_ARGS))
}
//--5 parameters
......@@ -40,7 +40,7 @@ __kernel void BFS_2(__global char* g_graph_mask,
__global char* g_graph_visited,
__global char* g_over,
const int no_of_nodes
) {
SELF_SCHEDULE_ARGS) {
int tid = get_global_id(0);
if( tid<no_of_nodes && g_updating_graph_mask[tid]){
......@@ -50,7 +50,7 @@ __kernel void BFS_2(__global char* g_graph_mask,
g_updating_graph_mask[tid]=false;
}
SCHEDULE_CHILD_1D(BFS_2(g_graph_mask, g_updating_graph_mask, g_graph_visited, g_over, no_of_nodes SELF_SCHEDULE_CHILD_ARGS))
}
......@@ -198,6 +198,9 @@ int main(int argc, char * argv[])
}
cl_context context = clCreateContext(NULL, num, devices, NULL, NULL, &err);
#ifdef SELF_SCHEDULE
initOnDeviceCommandQueues(context);
#endif
int no_of_nodes;
int edge_list_size;
......@@ -304,6 +307,9 @@ int main(int argc, char * argv[])
clSVMFree(context, h_updating_graph_mask);
clSVMFree(context, h_graph_visited);
#ifdef SELF_SCHEDULE
releaseOnDeviceCommandQueues(context);
#endif
}
catch(std::string msg){
......
......@@ -38,11 +38,14 @@ typedef struct{
num_bytes: the number of bytes all together
@return: through mem_d
------------------------------------------------------------*/
__kernel void memset_kernel(__global char * mem_d, short val, int ct){
__kernel void memset_kernel(__global char * mem_d, short val, int ct SELF_SCHEDULE_ARGS){
const int thread_id = get_global_id(0);
if( thread_id >= ct) return;
if( thread_id >= ct) goto self_schedule;
mem_d[thread_id] = val;
self_schedule:
SCHEDULE_CHILD_1D(memset_kernel(mem_d, val, ct SELF_SCHEDULE_CHILD_ARGS))
return;
}
//--cambine: omit &
......@@ -83,22 +86,25 @@ inline void compute_flux_contribution(float density, FLOAT3 momentum, float dens
fc_density_energy->y = velocity.y*de_p;
fc_density_energy->z = velocity.z*de_p;
}
__kernel void initialize_variables(__global float* variables, __constant float* ff_variable, int nelr){
__kernel void initialize_variables(__global float* variables, __constant float* ff_variable, int nelr SELF_SCHEDULE_ARGS){
//const int i = (blockDim.x*blockIdx.x + threadIdx.x);
const int i = get_global_id(0);
if( i >= nelr) return;
if( i >= nelr) goto self_schedule;
for(int j = 0; j < NVAR; j++)
variables[i + j*nelr] = ff_variable[j];
self_schedule:
SCHEDULE_CHILD_1D(initialize_variables(variables, ff_variable, nelr SELF_SCHEDULE_CHILD_ARGS))
return;
}
__kernel void compute_step_factor(__global float* variables,
__global float* areas,
__global float* step_factors,
int nelr){
int nelr SELF_SCHEDULE_ARGS){
//const int i = (blockDim.x*blockIdx.x + threadIdx.x);
const int i = get_global_id(0);
if( i >= nelr) return;
if( i >= nelr) goto self_schedule;
float density = variables[i + VAR_DENSITY*nelr];
FLOAT3 momentum;
......@@ -119,6 +125,9 @@ __kernel void compute_step_factor(__global float* variables,
//step_factors[i] = (float)(0.5f) / (sqrtf(areas[i]) * (sqrtf(speed_sqd) + speed_of_sound));
step_factors[i] = (float)(0.5f) / (sqrt(areas[i]) * (sqrt(speed_sqd) + speed_of_sound));
self_schedule:
SCHEDULE_CHILD_1D(compute_step_factor(variables, areas, step_factors, nelr SELF_SCHEDULE_CHILD_ARGS))
return;
}
__kernel void compute_flux(
......@@ -131,11 +140,11 @@ __kernel void compute_flux(
__constant FLOAT3* ff_flux_contribution_momentum_x,
__constant FLOAT3* ff_flux_contribution_momentum_y,
__constant FLOAT3* ff_flux_contribution_momentum_z,
int nelr){
int nelr SELF_SCHEDULE_ARGS){
const float smoothing_coefficient = (float)(0.2f);
//const int i = (blockDim.x*blockIdx.x + threadIdx.x);
const int i = get_global_id(0);
if( i >= nelr) return;
if( i >= nelr) goto self_schedule;
int j, nb;
FLOAT3 normal; float normal_len;
float factor;
......@@ -266,7 +275,7 @@ __kernel void compute_flux(
fluxes[i + VAR_DENSITY_ENERGY*nelr] = flux_i_density_energy;
self_schedule:
SCHEDULE_CHILD_1D(compute_flux(elements_surrounding_elements, normals, variables, ff_variable, fluxes, ff_flux_contribution_density_energy, ff_flux_contribution_momentum_x, ff_flux_contribution_momentum_y, ff_flux_contribution_momentum_z, nelr SELF_SCHEDULE_CHILD_ARGS))
return;
}
......@@ -274,10 +283,10 @@ __kernel void time_step(int j, int nelr,
__global float* old_variables,
__global float* variables,
__global float* step_factors,
__global float* fluxes){
__global float* fluxes SELF_SCHEDULE_ARGS){
//const int i = (blockDim.x*blockIdx.x + threadIdx.x);
const int i = get_global_id(0);
if( i >= nelr) return;
if( i >= nelr) goto self_schedule;
float factor = step_factors[i]/(float)(RK+1-j);
......@@ -287,6 +296,9 @@ __kernel void time_step(int j, int nelr,
variables[i + (VAR_MOMENTUM+1)*nelr] = old_variables[i + (VAR_MOMENTUM+1)*nelr] + factor*fluxes[i + (VAR_MOMENTUM+1)*nelr];
variables[i + (VAR_MOMENTUM+2)*nelr] = old_variables[i + (VAR_MOMENTUM+2)*nelr] + factor*fluxes[i + (VAR_MOMENTUM+2)*nelr];
self_schedule:
SCHEDULE_CHILD_1D(time_step(j, nelr, old_variables, variables, step_factors, fluxes SELF_SCHEDULE_CHILD_ARGS))
return;
}
#endif
......@@ -246,6 +246,9 @@ int main(int argc, char** argv){
printf("Error: Failed to create context (%d)!\n", err);
exit(1);
}
#ifdef SELF_SCHEDULE
initOnDeviceCommandQueues(context);
#endif
int iCPU, iGPU;
getCPUGPUIds(&iCPU, &iGPU, devices, num);
......@@ -413,6 +416,9 @@ int main(int argc, char** argv){
std::cout << "Saved solution..." << std::endl;
std::cout << "Cleaning up..." << std::endl;
// TODO
#ifdef SELF_SCHEDULE
releaseOnDeviceCommandQueues(context);
#endif
std::cout << "Done..." << std::endl;
clSVMFree(context, h_ff_variable);
clSVMFree(context, h_ff_flux_contribution_momentum_x);
......
......@@ -57,7 +57,7 @@ __kernel void c_CopySrcToComponents (__global int *d_r,
__global int *d_g,
__global int *d_b,
__global unsigned char * cl_d_src,
int pixels)
int pixels SELF_SCHEDULE_ARGS)
{
int x = get_local_id(0);
int gX= get_local_size(0) * get_group_id(0);
......@@ -85,14 +85,14 @@ __kernel void c_CopySrcToComponents (__global int *d_r,
storeComponents(d_r, d_g, d_b, r, g, b, globalOutputPosition);
}
SCHEDULE_CHILD_1D(c_CopySrcToComponents (d_r, d_g, d_b, cl_d_src, pixels SELF_SCHEDULE_CHILD_ARGS))
}
// Copy img src data into three separated component buffers
__kernel void c_CopySrcToComponent (__global int *d_c,
__global unsigned char * cl_d_src,
int pixels)
int pixels SELF_SCHEDULE_ARGS)
{
int x = get_local_id(0);
int gX = get_local_size(0) * get_group_id(0);
......@@ -113,7 +113,7 @@ __kernel void c_CopySrcToComponent (__global int *d_c,
storeComponent(d_c, c, globalOutputPosition);
}
SCHEDULE_CHILD_1D(c_CopySrcToComponent (d_c, cl_d_src, pixels SELF_SCHEDULE_CHILD_ARGS))
}
......@@ -662,7 +662,7 @@ __kernel void cl_fdwt53Kernel(__global const int * const in,
const int sy,
const int steps,
int WIN_SIZE_X,
int WIN_SIZE_Y)
int WIN_SIZE_Y SELF_SCHEDULE_ARGS)
{
__local struct FDWT53 fdwt53;
fdwt53.WIN_SIZE_X = WIN_SIZE_X;
......@@ -708,5 +708,5 @@ __kernel void cl_fdwt53Kernel(__global const int * const in,
transform(&fdwt53, false, false, in, out, sx, sy, steps);
}
SCHEDULE_CHILD_ND(cl_fdwt53Kernel(in, out, sx, sy, steps, WIN_SIZE_X, WIN_SIZE_Y SELF_SCHEDULE_CHILD_ARGS))
}
......@@ -769,7 +769,10 @@ int main(int argc, char **argv)
}
context = clCreateContext(NULL, num, devices, NULL, NULL, &errNum);
if (errNum != CL_SUCCESS)
#ifdef SELF_SCHEDULE
initOnDeviceCommandQueues(context);
#endif
if (errNum != CL_SUCCESS)
{
std::cerr << "Failed to create OpenCL context." << std::endl;
return 1;
......@@ -865,6 +868,9 @@ int main(int argc, char **argv)
clReleaseKernel(c_CopySrcToComponents);
clReleaseKernel(c_CopySrcToComponent);
#ifdef SELF_SCHEDULE
releaseOnDeviceCommandQueues(context);
#endif
clSVMFree(context, d->srcImg);
return 0;
......
......@@ -103,6 +103,9 @@ int main(int argc, char *argv[]) {
}
context = clCreateContext(NULL, num, devices, NULL, NULL, &err);
#ifdef SELF_SCHEDULE
initOnDeviceCommandQueues(context);
#endif
if(size < 1)
{
......@@ -176,6 +179,9 @@ int main(int argc, char *argv[]) {
clSVMFree(context, a);
clSVMFree(context, b);
free(finalVec);
#ifdef SELF_SCHEDULE
releaseOnDeviceCommandQueues(context);
#endif
cl_cleanup();
//OpenClGaussianElimination(context,timing);
......
......@@ -11,14 +11,14 @@ __kernel void Fan1(__global float *m_dev,
__global float *a_dev,
__global float *b_dev,
const int size,
const int t) {
const int t SELF_SCHEDULE_ARGS) {
int globalId = get_global_id(0);
if (globalId < size-1-t) {
*(m_dev + size * (globalId + t + 1)+t) = *(a_dev + size * (globalId + t + 1) + t) / *(a_dev + size * t + t);
}
SCHEDULE_CHILD_1D(Fan1(m_dev, a_dev, b_dev, size, t SELF_SCHEDULE_CHILD_ARGS))
}
......@@ -26,7 +26,7 @@ __kernel void Fan2(__global float *m_dev,
__global float *a_dev,
__global float *b_dev,
const int size,
const int t) {
const int t SELF_SCHEDULE_ARGS) {
int globalId = get_global_id(0);
int globalIdx = get_global_id(0);
......@@ -49,5 +49,5 @@ __kernel void Fan2(__global float *m_dev,
// b_dev[globalIdx+1+t] -= m_dev[size*(globalIdx+1+t)+(globalIdy+t)] * b_dev[t];
// }
SCHEDULE_CHILD_ND(Fan2(m_dev, a_dev, b_dev, size, t SELF_SCHEDULE_CHILD_ARGS))
}
......@@ -69,7 +69,7 @@ kernel_gpu_opencl( // structures
__global fp* in_sqr_final_sum_all, // 31 OUTPUT common.allPoints
__global fp* denomT_all, // 32 OUTPUT common.allPoints
__global fp* checksum) // 33 OUTPUT 100
__global fp* checksum SELF_SCHEDULE_ARGS) // 33 OUTPUT 100
{
......@@ -2228,7 +2228,10 @@ kernel_gpu_opencl( // structures
//======================================================================================================================================================150
// End
//======================================================================================================================================================150
SCHEDULE_CHILD_1D(kernel_gpu_opencl(d_common, d_frame, d_frame_no, d_endoRow, d_endoCol, d_tEndoRowLoc, d_tEndoColLoc, d_epiRow, d_epiCol, d_tEpiRowLoc, d_tEpiColLoc, d_endoT, d_epiT, d_in2_all, d_conv_all, d_in2_pad_cumv_all, d_in2_pad_cumv_sel_all,
d_in2_sub_cumh_all, d_in2_sub_cumh_sel_all, d_in2_sub2_all, d_in2_sqr_all, d_in2_sqr_sub2_all, d_in_sqr_all, d_tMask_all, d_mask_conv_all, d_in_mod_temp_all, in_partial_sum_all, in_sqr_partial_sum_all, par_max_val_all,
par_max_coo_all, in_final_sum_all, in_sqr_final_sum_all, denomT_all, checksum SELF_SCHEDULE_CHILD_ARGS))
}
//========================================================================================================================================================================================================200
......
......@@ -83,6 +83,9 @@ main( int argc,
}
cl_context context = clCreateContext(NULL, num, devices, NULL, NULL, &err);
#ifdef SELF_SCHEDULE
initOnDeviceCommandQueues(context);
#endif
//======================================================================================================================================================150
......@@ -269,6 +272,9 @@ main( int argc,
// End
//====================================================================================================100
#ifdef SELF_SCHEDULE
releaseOnDeviceCommandQueues(context);
#endif
err = clReleaseContext(context);
if (err != CL_SUCCESS)
fatal_CL(err, __LINE__);
......
......@@ -158,6 +158,9 @@ int main(int argc, char** argv) {
}
context = clCreateContext(NULL, num, devices, NULL, NULL, &err);
#ifdef SELF_SCHEDULE
initOnDeviceCommandQueues(context);
#endif
int iCPU, iGPU;
getCPUGPUIds(&iCPU, &iGPU, devices, num);
......@@ -245,6 +248,9 @@ int main(int argc, char** argv) {
clSVMFree(context, MatrixTemp[1]);
clSVMFree(context, MatrixPower);
#ifdef SELF_SCHEDULE
releaseOnDeviceCommandQueues(context);
#endif
clReleaseContext(context);
......
......@@ -13,7 +13,7 @@ __kernel void hotspot( int iteration, //number of iteration
float Rx,
float Ry,
float Rz,
float step) {
float step SELF_SCHEDULE_ARGS) {
local float temp_on_cuda[BLOCK_SIZE][BLOCK_SIZE];
local float power_on_cuda[BLOCK_SIZE][BLOCK_SIZE];
......@@ -114,5 +114,5 @@ __kernel void hotspot( int iteration, //number of iteration
temp_dst[index]= temp_t[ty][tx];
}
SCHEDULE_CHILD_ND(hotspot(iteration, power, temp_src, temp_dst, grid_cols, grid_rows, border_cols, border_rows, Cap, Rx, Ry, Rz, step SELF_SCHEDULE_CHILD_ARGS))
}
......@@ -185,6 +185,9 @@ int main(int argc, char** argv)
}
context = clCreateContext(NULL, num, devices, NULL, NULL, &err);
#ifdef SELF_SCHEDULE
initOnDeviceCommandQueues(context);
#endif
int iCPU, iGPU;
getCPUGPUIds(&iCPU, &iGPU, devices, num);
......@@ -310,6 +313,9 @@ int main(int argc, char** argv)
clReleaseKernel(ko_vadd);
clReleaseCommandQueue(commandsCPU);
clReleaseCommandQueue(commandsGPU);
#ifdef SELF_SCHEDULE
releaseOnDeviceCommandQueues(context);
#endif
clReleaseContext(context);
return 0;
......
......@@ -5,7 +5,7 @@ __kernel void hotspotOpt1(__global float *p, __global float* tIn, __global float
float ce, float cw,
float cn, float cs,
float ct, float cb,
float cc)
float cc SELF_SCHEDULE_ARGS)
{
float amb_temp = 80.0;
......@@ -47,7 +47,7 @@ __kernel void hotspotOpt1(__global float *p, __global float* tIn, __global float
tOut[c] = cc * temp2 + cw * tIn[W] + ce * tIn[E] + cs * tIn[S]
+ cn * tIn[N] + cb * temp1 + ct * temp3 + sdc * p[c] + ct * amb_temp;
SCHEDULE_CHILD_ND(hotspotOpt1(p, tIn, tOut, sdc, nx, ny, nz, ce, cw, cn, cs, ct, cb, cc SELF_SCHEDULE_CHILD_ARGS))
}
CC = gcc
CC_FLAGS = -std=gnu11
OPENCL_INC=/opt/intel/opencl/include
OPENCL_LIB=/opt/intel/opencl
include ../../common/make.config
ifdef VERIFY
override VERIFY = -DVERIFY
......@@ -12,10 +11,6 @@ ifdef DEBUG
override DEBUG = -DDEBUG
endif
ifdef SELF_SCHEDULE
override SELF_SCHEDULE = -DSELF_SCHEDULE
endif
ifdef OUTPUT
override OUTPUT = -DOUTPUT
endif
......
......@@ -14,7 +14,7 @@ int addOffset(volatile __local uint *s_offset, uint data, uint threadTag){
}
__kernel void
bucketcount( global float *input, global int *indice, global uint *d_prefixoffsets, const int size, global float *l_pivotpoints, const size_t totalWorkItems){
bucketcount( global float *input, global int *indice, global uint *d_prefixoffsets, const int size, global float *l_pivotpoints, const size_t totalWorkItems SELF_SCHEDULE_ARGS){
__local uint s_offset[BLOCK_MEMORY];
......@@ -49,10 +49,10 @@ bucketcount( global float *input, global int *indice, global uint *d_prefixoffse
for (int i = get_local_id(0); i < BLOCK_MEMORY; i += get_local_size(0))
d_prefixoffsets[prefixBase + i] = s_offset[i]; // & ((1 << (32 - WARP_LOG_SIZE)) - 1);
SCHEDULE_CHILD_1D(bucketcount(input, indice, d_prefixoffsets, size, l_pivotpoints, totalWorkItems SELF_SCHEDULE_CHILD_ARGS))
}
__kernel void bucketprefixoffset(global uint *d_prefixoffsets, global uint *d_offsets, const int blocks){
__kernel void bucketprefixoffset(global uint *d_prefixoffsets, global uint *d_offsets, const int blocks SELF_SCHEDULE_ARGS){
int tid = get_global_id(0);
int size = blocks * BLOCK_MEMORY;
int sum = 0;
......@@ -65,12 +65,12 @@ __kernel void bucketprefixoffset(global uint *d_prefixoffsets, global uint *d_of
d_offsets[tid] = sum;
SCHEDULE_CHILD_1D(bucketprefixoffset(d_prefixoffsets, d_offsets, blocks SELF_SCHEDULE_CHILD_ARGS))
}
__kernel void
bucketsort(global float *input, global int *indice, __global float *output, const int size, global uint *d_prefixoffsets,
global uint *l_offsets, const size_t totalWorkItems){
global uint *l_offsets, const size_t totalWorkItems SELF_SCHEDULE_ARGS){
volatile __local unsigned int s_offset[BLOCK_MEMORY];
int prefixBase = get_group_id(0) * BLOCK_MEMORY;
......@@ -93,5 +93,5 @@ bucketsort(global float *input, global int *indice, __global float *output, cons
// }
}
SCHEDULE_CHILD_1D(bucketsort(input, indice, output, size, d_prefixoffsets, l_offsets, totalWorkItems SELF_SCHEDULE_CHILD_ARGS))
}
......@@ -39,7 +39,7 @@ inline void addData1024(volatile __local uint *s_WarpHist, uint data, uint tag){
float maximum,
uint dataCount,
size_t totalWorkItems
){
SELF_SCHEDULE_ARGS){
//Per-warp substorage storage
const int warpBase = (get_local_id(0) >> WARP_LOG_SIZE) * BIN_COUNT;
__local unsigned int s_Hist[BLOCK_MEMORY];
......@@ -70,6 +70,6 @@ inline void addData1024(volatile __local uint *s_WarpHist, uint data, uint tag){
atomic_add(d_Result+pos,sum);
}
SCHEDULE_CHILD_1D(histogram1024Kernel(d_Result, d_Data, minimum, maximum, dataCount, totalWorkItems SELF_SCHEDULE_CHILD_ARGS))
}
......@@ -50,6 +50,9 @@ int main(int argc, char** argv)
}
cl_context context = clCreateContext(NULL, num, devices, NULL, NULL, &err);
#ifdef SELF_SCHEDULE
initOnDeviceCommandQueues(context);
#endif
// Fill our data set with random float values
//
......@@ -208,6 +211,9 @@ int main(int argc, char** argv)
clSVMFree(context, d_output);