Commit ec5699f8 authored by Marvin Damschen
Browse files

Device self-scheduling using device-side queueing

parent 285a847d
...@@ -62,7 +62,7 @@ findK( long height, ...@@ -62,7 +62,7 @@ findK( long height,
__global long *currKnodeD, __global long *currKnodeD,
__global long *offsetD, __global long *offsetD,
__global int *keysD, __global int *keysD,
__global record *ansD) __global record *ansD SELF_SCHEDULE_ARGS)
{ {
// private thread IDs // private thread IDs
...@@ -99,7 +99,7 @@ findK( long height, ...@@ -99,7 +99,7 @@ findK( long height,
ansD[bid].value = recordsD[knodesD[currKnodeD[bid]].indices[thid]].value; ansD[bid].value = recordsD[knodesD[currKnodeD[bid]].indices[thid]].value;
} }
SCHEDULE_CHILD_1D(findK(height, knodesD, knodes_elem, recordsD, currKnodeD, offsetD, keysD, ansD SELF_SCHEDULE_CHILD_ARGS))
} }
//========================================================================================================================================================================================================200 //========================================================================================================================================================================================================200
......
...@@ -55,7 +55,7 @@ findRangeK( long height, ...@@ -55,7 +55,7 @@ findRangeK( long height,
__global int *startD, __global int *startD,
__global int *endD, __global int *endD,
__global int *RecstartD, __global int *RecstartD,
__global int *ReclenD) __global int *ReclenD SELF_SCHEDULE_ARGS)
{ {
// private thread IDs // private thread IDs
...@@ -105,7 +105,7 @@ findRangeK( long height, ...@@ -105,7 +105,7 @@ findRangeK( long height,
ReclenD[bid] = knodesD[lastKnodeD[bid]].indices[thid] - RecstartD[bid]+1; ReclenD[bid] = knodesD[lastKnodeD[bid]].indices[thid] - RecstartD[bid]+1;
} }
SCHEDULE_CHILD_1D(findRangeK(height, knodesD, knodes_elem, currKnodeD, offsetD, lastKnodeD, offset_2D, startD, endD, RecstartD, ReclenD SELF_SCHEDULE_CHILD_ARGS))
} }
//========================================================================================================================================================================================================200 //========================================================================================================================================================================================================200
......
...@@ -1869,6 +1869,9 @@ main( int argc, ...@@ -1869,6 +1869,9 @@ main( int argc,
//====================================================================================================100 //====================================================================================================100
cl_context context = clCreateContext(NULL, num, devices, NULL, NULL, &error); cl_context context = clCreateContext(NULL, num, devices, NULL, NULL, &error);
#ifdef SELF_SCHEDULE
initOnDeviceCommandQueues(context);
#endif
printf("WG size of kernel 1 = %d WG size of kernel 2 = %d \n", DEFAULT_ORDER, DEFAULT_ORDER_2); printf("WG size of kernel 1 = %d WG size of kernel 2 = %d \n", DEFAULT_ORDER, DEFAULT_ORDER_2);
...@@ -2428,6 +2431,9 @@ main( int argc, ...@@ -2428,6 +2431,9 @@ main( int argc,
// ------------------------------------------------------------60 // ------------------------------------------------------------60
free(mem); free(mem);
#ifdef SELF_SCHEDULE
releaseOnDeviceCommandQueues(context);
#endif
clReleaseContext(context); clReleaseContext(context);
return EXIT_SUCCESS; return EXIT_SUCCESS;
......
...@@ -18,7 +18,7 @@ bpnn_layerforward_ocl(__global float *input_cuda, ...@@ -18,7 +18,7 @@ bpnn_layerforward_ocl(__global float *input_cuda,
__local float *input_node, __local float *input_node,
__local float *weight_matrix, __local float *weight_matrix,
int in, int in,
int hid) int hid SELF_SCHEDULE_ARGS)
{ {
int by = get_group_id(1); int by = get_group_id(1);
...@@ -61,8 +61,8 @@ bpnn_layerforward_ocl(__global float *input_cuda, ...@@ -61,8 +61,8 @@ bpnn_layerforward_ocl(__global float *input_cuda,
hidden_partial_sum[by * hid + ty] = weight_matrix[tx* WIDTH + ty]; hidden_partial_sum[by * hid + ty] = weight_matrix[tx* WIDTH + ty];
} }
SCHEDULE_CHILD_WITH_LOCAL_PTRS_ND(^(local void *input_node, local void *weight_matrix){bpnn_layerforward_ocl(input_cuda, output_hidden_cuda, input_hidden_cuda, hidden_partial_sum, input_node, weight_matrix, in, hid SELF_SCHEDULE_CHILD_ARGS);}\
COMMA (uint)(HEIGHT*sizeof(float)) COMMA (uint)(HEIGHT*WIDTH*sizeof(float)))
} }
...@@ -71,7 +71,7 @@ __kernel void bpnn_adjust_weights_ocl( __global float * delta, ...@@ -71,7 +71,7 @@ __kernel void bpnn_adjust_weights_ocl( __global float * delta,
__global float * ly, __global float * ly,
int in, int in,
__global float * w, __global float * w,
__global float * oldw) __global float * oldw SELF_SCHEDULE_ARGS)
{ {
int by = get_group_id(1); int by = get_group_id(1);
...@@ -92,6 +92,6 @@ __kernel void bpnn_adjust_weights_ocl( __global float * delta, ...@@ -92,6 +92,6 @@ __kernel void bpnn_adjust_weights_ocl( __global float * delta,
oldw[index_x] = ((ETA * delta[index_x]) + (MOMENTUM * oldw[index_x])); oldw[index_x] = ((ETA * delta[index_x]) + (MOMENTUM * oldw[index_x]));
} }
SCHEDULE_CHILD_ND(bpnn_adjust_weights_ocl(delta, hid, ly, in, w, oldw SELF_SCHEDULE_CHILD_ARGS))
} }
#endif #endif
...@@ -80,7 +80,13 @@ main( int argc, char** argv) ...@@ -80,7 +80,13 @@ main( int argc, char** argv)
} }
context = clCreateContext(NULL, num_devices, device_list, NULL, NULL, &err); context = clCreateContext(NULL, num_devices, device_list, NULL, NULL, &err);
#ifdef SELF_SCHEDULE
initOnDeviceCommandQueues(context);
#endif
setup(context, argc, argv); setup(context, argc, argv);
#ifdef SELF_SCHEDULE
releaseOnDeviceCommandQueues(context);
#endif
clReleaseContext(context); clReleaseContext(context);
} }
......
...@@ -18,7 +18,7 @@ __kernel void BFS_1( const __global Node* g_graph_nodes, ...@@ -18,7 +18,7 @@ __kernel void BFS_1( const __global Node* g_graph_nodes,
__global char* g_updating_graph_mask, __global char* g_updating_graph_mask,
__global char* g_graph_visited, __global char* g_graph_visited,
__global int* g_cost, __global int* g_cost,
const int no_of_nodes){ const int no_of_nodes SELF_SCHEDULE_ARGS){
int tid = get_global_id(0); int tid = get_global_id(0);
if( tid<no_of_nodes && g_graph_mask[tid]){ if( tid<no_of_nodes && g_graph_mask[tid]){
g_graph_mask[tid]=false; g_graph_mask[tid]=false;
...@@ -31,7 +31,7 @@ __kernel void BFS_1( const __global Node* g_graph_nodes, ...@@ -31,7 +31,7 @@ __kernel void BFS_1( const __global Node* g_graph_nodes,
} }
} }
SCHEDULE_CHILD_1D(BFS_1(g_graph_nodes, g_graph_edges, g_graph_mask, g_updating_graph_mask, g_graph_visited, g_cost, no_of_nodes SELF_SCHEDULE_CHILD_ARGS))
} }
//--5 parameters //--5 parameters
...@@ -40,7 +40,7 @@ __kernel void BFS_2(__global char* g_graph_mask, ...@@ -40,7 +40,7 @@ __kernel void BFS_2(__global char* g_graph_mask,
__global char* g_graph_visited, __global char* g_graph_visited,
__global char* g_over, __global char* g_over,
const int no_of_nodes const int no_of_nodes
) { SELF_SCHEDULE_ARGS) {
int tid = get_global_id(0); int tid = get_global_id(0);
if( tid<no_of_nodes && g_updating_graph_mask[tid]){ if( tid<no_of_nodes && g_updating_graph_mask[tid]){
...@@ -50,7 +50,7 @@ __kernel void BFS_2(__global char* g_graph_mask, ...@@ -50,7 +50,7 @@ __kernel void BFS_2(__global char* g_graph_mask,
g_updating_graph_mask[tid]=false; g_updating_graph_mask[tid]=false;
} }
SCHEDULE_CHILD_1D(BFS_2(g_graph_mask, g_updating_graph_mask, g_graph_visited, g_over, no_of_nodes SELF_SCHEDULE_CHILD_ARGS))
} }
...@@ -198,6 +198,9 @@ int main(int argc, char * argv[]) ...@@ -198,6 +198,9 @@ int main(int argc, char * argv[])
} }
cl_context context = clCreateContext(NULL, num, devices, NULL, NULL, &err); cl_context context = clCreateContext(NULL, num, devices, NULL, NULL, &err);
#ifdef SELF_SCHEDULE
initOnDeviceCommandQueues(context);
#endif
int no_of_nodes; int no_of_nodes;
int edge_list_size; int edge_list_size;
...@@ -304,6 +307,9 @@ int main(int argc, char * argv[]) ...@@ -304,6 +307,9 @@ int main(int argc, char * argv[])
clSVMFree(context, h_updating_graph_mask); clSVMFree(context, h_updating_graph_mask);
clSVMFree(context, h_graph_visited); clSVMFree(context, h_graph_visited);
#ifdef SELF_SCHEDULE
releaseOnDeviceCommandQueues(context);
#endif
} }
catch(std::string msg){ catch(std::string msg){
......
...@@ -38,11 +38,14 @@ typedef struct{ ...@@ -38,11 +38,14 @@ typedef struct{
num_bytes: the number of bytes all together num_bytes: the number of bytes all together
@return: through mem_d @return: through mem_d
------------------------------------------------------------*/ ------------------------------------------------------------*/
__kernel void memset_kernel(__global char * mem_d, short val, int ct){ __kernel void memset_kernel(__global char * mem_d, short val, int ct SELF_SCHEDULE_ARGS){
const int thread_id = get_global_id(0); const int thread_id = get_global_id(0);
if( thread_id >= ct) return; if( thread_id >= ct) goto self_schedule;
mem_d[thread_id] = val; mem_d[thread_id] = val;
self_schedule:
SCHEDULE_CHILD_1D(memset_kernel(mem_d, val, ct SELF_SCHEDULE_CHILD_ARGS))
return;
} }
//--cambine: omit & //--cambine: omit &
...@@ -83,22 +86,25 @@ inline void compute_flux_contribution(float density, FLOAT3 momentum, float dens ...@@ -83,22 +86,25 @@ inline void compute_flux_contribution(float density, FLOAT3 momentum, float dens
fc_density_energy->y = velocity.y*de_p; fc_density_energy->y = velocity.y*de_p;
fc_density_energy->z = velocity.z*de_p; fc_density_energy->z = velocity.z*de_p;
} }
__kernel void initialize_variables(__global float* variables, __constant float* ff_variable, int nelr){ __kernel void initialize_variables(__global float* variables, __constant float* ff_variable, int nelr SELF_SCHEDULE_ARGS){
//const int i = (blockDim.x*blockIdx.x + threadIdx.x); //const int i = (blockDim.x*blockIdx.x + threadIdx.x);
const int i = get_global_id(0); const int i = get_global_id(0);
if( i >= nelr) return; if( i >= nelr) goto self_schedule;
for(int j = 0; j < NVAR; j++) for(int j = 0; j < NVAR; j++)
variables[i + j*nelr] = ff_variable[j]; variables[i + j*nelr] = ff_variable[j];
self_schedule:
SCHEDULE_CHILD_1D(initialize_variables(variables, ff_variable, nelr SELF_SCHEDULE_CHILD_ARGS))
return;
} }
__kernel void compute_step_factor(__global float* variables, __kernel void compute_step_factor(__global float* variables,
__global float* areas, __global float* areas,
__global float* step_factors, __global float* step_factors,
int nelr){ int nelr SELF_SCHEDULE_ARGS){
//const int i = (blockDim.x*blockIdx.x + threadIdx.x); //const int i = (blockDim.x*blockIdx.x + threadIdx.x);
const int i = get_global_id(0); const int i = get_global_id(0);
if( i >= nelr) return; if( i >= nelr) goto self_schedule;
float density = variables[i + VAR_DENSITY*nelr]; float density = variables[i + VAR_DENSITY*nelr];
FLOAT3 momentum; FLOAT3 momentum;
...@@ -119,6 +125,9 @@ __kernel void compute_step_factor(__global float* variables, ...@@ -119,6 +125,9 @@ __kernel void compute_step_factor(__global float* variables,
//step_factors[i] = (float)(0.5f) / (sqrtf(areas[i]) * (sqrtf(speed_sqd) + speed_of_sound)); //step_factors[i] = (float)(0.5f) / (sqrtf(areas[i]) * (sqrtf(speed_sqd) + speed_of_sound));
step_factors[i] = (float)(0.5f) / (sqrt(areas[i]) * (sqrt(speed_sqd) + speed_of_sound)); step_factors[i] = (float)(0.5f) / (sqrt(areas[i]) * (sqrt(speed_sqd) + speed_of_sound));
self_schedule:
SCHEDULE_CHILD_1D(compute_step_factor(variables, areas, step_factors, nelr SELF_SCHEDULE_CHILD_ARGS))
return;
} }
__kernel void compute_flux( __kernel void compute_flux(
...@@ -131,11 +140,11 @@ __kernel void compute_flux( ...@@ -131,11 +140,11 @@ __kernel void compute_flux(
__constant FLOAT3* ff_flux_contribution_momentum_x, __constant FLOAT3* ff_flux_contribution_momentum_x,
__constant FLOAT3* ff_flux_contribution_momentum_y, __constant FLOAT3* ff_flux_contribution_momentum_y,
__constant FLOAT3* ff_flux_contribution_momentum_z, __constant FLOAT3* ff_flux_contribution_momentum_z,
int nelr){ int nelr SELF_SCHEDULE_ARGS){
const float smoothing_coefficient = (float)(0.2f); const float smoothing_coefficient = (float)(0.2f);
//const int i = (blockDim.x*blockIdx.x + threadIdx.x); //const int i = (blockDim.x*blockIdx.x + threadIdx.x);
const int i = get_global_id(0); const int i = get_global_id(0);
if( i >= nelr) return; if( i >= nelr) goto self_schedule;
int j, nb; int j, nb;
FLOAT3 normal; float normal_len; FLOAT3 normal; float normal_len;
float factor; float factor;
...@@ -266,7 +275,7 @@ __kernel void compute_flux( ...@@ -266,7 +275,7 @@ __kernel void compute_flux(
fluxes[i + VAR_DENSITY_ENERGY*nelr] = flux_i_density_energy; fluxes[i + VAR_DENSITY_ENERGY*nelr] = flux_i_density_energy;
self_schedule: self_schedule:
SCHEDULE_CHILD_1D(compute_flux(elements_surrounding_elements, normals, variables, ff_variable, fluxes, ff_flux_contribution_density_energy, ff_flux_contribution_momentum_x, ff_flux_contribution_momentum_y, ff_flux_contribution_momentum_z, nelr SELF_SCHEDULE_CHILD_ARGS))
return; return;
} }
...@@ -274,10 +283,10 @@ __kernel void time_step(int j, int nelr, ...@@ -274,10 +283,10 @@ __kernel void time_step(int j, int nelr,
__global float* old_variables, __global float* old_variables,
__global float* variables, __global float* variables,
__global float* step_factors, __global float* step_factors,
__global float* fluxes){ __global float* fluxes SELF_SCHEDULE_ARGS){
//const int i = (blockDim.x*blockIdx.x + threadIdx.x); //const int i = (blockDim.x*blockIdx.x + threadIdx.x);
const int i = get_global_id(0); const int i = get_global_id(0);
if( i >= nelr) return; if( i >= nelr) goto self_schedule;
float factor = step_factors[i]/(float)(RK+1-j); float factor = step_factors[i]/(float)(RK+1-j);
...@@ -287,6 +296,9 @@ __kernel void time_step(int j, int nelr, ...@@ -287,6 +296,9 @@ __kernel void time_step(int j, int nelr,
variables[i + (VAR_MOMENTUM+1)*nelr] = old_variables[i + (VAR_MOMENTUM+1)*nelr] + factor*fluxes[i + (VAR_MOMENTUM+1)*nelr]; variables[i + (VAR_MOMENTUM+1)*nelr] = old_variables[i + (VAR_MOMENTUM+1)*nelr] + factor*fluxes[i + (VAR_MOMENTUM+1)*nelr];
variables[i + (VAR_MOMENTUM+2)*nelr] = old_variables[i + (VAR_MOMENTUM+2)*nelr] + factor*fluxes[i + (VAR_MOMENTUM+2)*nelr]; variables[i + (VAR_MOMENTUM+2)*nelr] = old_variables[i + (VAR_MOMENTUM+2)*nelr] + factor*fluxes[i + (VAR_MOMENTUM+2)*nelr];
self_schedule:
SCHEDULE_CHILD_1D(time_step(j, nelr, old_variables, variables, step_factors, fluxes SELF_SCHEDULE_CHILD_ARGS))
return;
} }
#endif #endif
...@@ -246,6 +246,9 @@ int main(int argc, char** argv){ ...@@ -246,6 +246,9 @@ int main(int argc, char** argv){
printf("Error: Failed to create context (%d)!\n", err); printf("Error: Failed to create context (%d)!\n", err);
exit(1); exit(1);
} }
#ifdef SELF_SCHEDULE
initOnDeviceCommandQueues(context);
#endif
int iCPU, iGPU; int iCPU, iGPU;
getCPUGPUIds(&iCPU, &iGPU, devices, num); getCPUGPUIds(&iCPU, &iGPU, devices, num);
...@@ -413,6 +416,9 @@ int main(int argc, char** argv){ ...@@ -413,6 +416,9 @@ int main(int argc, char** argv){
std::cout << "Saved solution..." << std::endl; std::cout << "Saved solution..." << std::endl;
std::cout << "Cleaning up..." << std::endl; std::cout << "Cleaning up..." << std::endl;
// TODO // TODO
#ifdef SELF_SCHEDULE
releaseOnDeviceCommandQueues(context);
#endif
std::cout << "Done..." << std::endl; std::cout << "Done..." << std::endl;
clSVMFree(context, h_ff_variable); clSVMFree(context, h_ff_variable);
clSVMFree(context, h_ff_flux_contribution_momentum_x); clSVMFree(context, h_ff_flux_contribution_momentum_x);
......
...@@ -57,7 +57,7 @@ __kernel void c_CopySrcToComponents (__global int *d_r, ...@@ -57,7 +57,7 @@ __kernel void c_CopySrcToComponents (__global int *d_r,
__global int *d_g, __global int *d_g,
__global int *d_b, __global int *d_b,
__global unsigned char * cl_d_src, __global unsigned char * cl_d_src,
int pixels) int pixels SELF_SCHEDULE_ARGS)
{ {
int x = get_local_id(0); int x = get_local_id(0);
int gX= get_local_size(0) * get_group_id(0); int gX= get_local_size(0) * get_group_id(0);
...@@ -85,14 +85,14 @@ __kernel void c_CopySrcToComponents (__global int *d_r, ...@@ -85,14 +85,14 @@ __kernel void c_CopySrcToComponents (__global int *d_r,
storeComponents(d_r, d_g, d_b, r, g, b, globalOutputPosition); storeComponents(d_r, d_g, d_b, r, g, b, globalOutputPosition);
} }
SCHEDULE_CHILD_1D(c_CopySrcToComponents (d_r, d_g, d_b, cl_d_src, pixels SELF_SCHEDULE_CHILD_ARGS))
} }
// Copy img src data into three separated component buffers // Copy img src data into three separated component buffers
__kernel void c_CopySrcToComponent (__global int *d_c, __kernel void c_CopySrcToComponent (__global int *d_c,
__global unsigned char * cl_d_src, __global unsigned char * cl_d_src,
int pixels) int pixels SELF_SCHEDULE_ARGS)
{ {
int x = get_local_id(0); int x = get_local_id(0);
int gX = get_local_size(0) * get_group_id(0); int gX = get_local_size(0) * get_group_id(0);
...@@ -113,7 +113,7 @@ __kernel void c_CopySrcToComponent (__global int *d_c, ...@@ -113,7 +113,7 @@ __kernel void c_CopySrcToComponent (__global int *d_c,
storeComponent(d_c, c, globalOutputPosition); storeComponent(d_c, c, globalOutputPosition);
} }
SCHEDULE_CHILD_1D(c_CopySrcToComponent (d_c, cl_d_src, pixels SELF_SCHEDULE_CHILD_ARGS))
} }
...@@ -662,7 +662,7 @@ __kernel void cl_fdwt53Kernel(__global const int * const in, ...@@ -662,7 +662,7 @@ __kernel void cl_fdwt53Kernel(__global const int * const in,
const int sy, const int sy,
const int steps, const int steps,
int WIN_SIZE_X, int WIN_SIZE_X,
int WIN_SIZE_Y) int WIN_SIZE_Y SELF_SCHEDULE_ARGS)
{ {
__local struct FDWT53 fdwt53; __local struct FDWT53 fdwt53;
fdwt53.WIN_SIZE_X = WIN_SIZE_X; fdwt53.WIN_SIZE_X = WIN_SIZE_X;
...@@ -708,5 +708,5 @@ __kernel void cl_fdwt53Kernel(__global const int * const in, ...@@ -708,5 +708,5 @@ __kernel void cl_fdwt53Kernel(__global const int * const in,
transform(&fdwt53, false, false, in, out, sx, sy, steps); transform(&fdwt53, false, false, in, out, sx, sy, steps);
} }
SCHEDULE_CHILD_ND(cl_fdwt53Kernel(in, out, sx, sy, steps, WIN_SIZE_X, WIN_SIZE_Y SELF_SCHEDULE_CHILD_ARGS))
} }
...@@ -769,7 +769,10 @@ int main(int argc, char **argv) ...@@ -769,7 +769,10 @@ int main(int argc, char **argv)
} }
context = clCreateContext(NULL, num, devices, NULL, NULL, &errNum); context = clCreateContext(NULL, num, devices, NULL, NULL, &errNum);
if (errNum != CL_SUCCESS) #ifdef SELF_SCHEDULE
initOnDeviceCommandQueues(context);
#endif
if (errNum != CL_SUCCESS)
{ {
std::cerr << "Failed to create OpenCL context." << std::endl; std::cerr << "Failed to create OpenCL context." << std::endl;
return 1; return 1;
...@@ -865,6 +868,9 @@ int main(int argc, char **argv) ...@@ -865,6 +868,9 @@ int main(int argc, char **argv)
clReleaseKernel(c_CopySrcToComponents); clReleaseKernel(c_CopySrcToComponents);
clReleaseKernel(c_CopySrcToComponent); clReleaseKernel(c_CopySrcToComponent);
#ifdef SELF_SCHEDULE
releaseOnDeviceCommandQueues(context);
#endif
clSVMFree(context, d->srcImg); clSVMFree(context, d->srcImg);
return 0; return 0;
......
...@@ -103,6 +103,9 @@ int main(int argc, char *argv[]) { ...@@ -103,6 +103,9 @@ int main(int argc, char *argv[]) {
} }
context = clCreateContext(NULL, num, devices, NULL, NULL, &err); context = clCreateContext(NULL, num, devices, NULL, NULL, &err);
#ifdef SELF_SCHEDULE
initOnDeviceCommandQueues(context);
#endif
if(size < 1) if(size < 1)
{ {
...@@ -176,6 +179,9 @@ int main(int argc, char *argv[]) { ...@@ -176,6 +179,9 @@ int main(int argc, char *argv[]) {
clSVMFree(context, a); clSVMFree(context, a);
clSVMFree(context, b); clSVMFree(context, b);
free(finalVec); free(finalVec);
#ifdef SELF_SCHEDULE
releaseOnDeviceCommandQueues(context);
#endif
cl_cleanup(); cl_cleanup();
//OpenClGaussianElimination(context,timing); //OpenClGaussianElimination(context,timing);
......
...@@ -11,14 +11,14 @@ __kernel void Fan1(__global float *m_dev, ...@@ -11,14 +11,14 @@ __kernel void Fan1(__global float *m_dev,
__global float *a_dev, __global float *a_dev,
__global float *b_dev, __global float *b_dev,
const int size, const int size,
const int t) { const int t SELF_SCHEDULE_ARGS) {
int globalId = get_global_id(0); int globalId = get_global_id(0);
if (globalId < size-1-t) { if (globalId < size-1-t) {
*(m_dev + size * (globalId + t + 1)+t) = *(a_dev + size * (globalId + t + 1) + t) / *(a_dev + size * t + t); *(m_dev + size * (globalId + t + 1)+t) = *(a_dev + size * (globalId + t + 1) + t) / *(a_dev + size * t + t);
} }
SCHEDULE_CHILD_1D(Fan1(m_dev, a_dev, b_dev, size, t SELF_SCHEDULE_CHILD_ARGS))
} }
...@@ -26,7 +26,7 @@ __kernel void Fan2(__global float *m_dev, ...@@ -26,7 +26,7 @@ __kernel void Fan2(__global float *m_dev,
__global float *a_dev, __global float *a_dev,
__global float *b_dev, __global float *b_dev,
const int size, const int size,
const int t) { const int t SELF_SCHEDULE_ARGS) {
int globalId = get_global_id(0);