/*
 * Kernel entry point: each work-item handles one slice task.
 *
 * Input layout (dirAndOffsetBuffer): 4 floats per task
 *   [0..2] direction vector (x, y, z), handed to setVerticalAxis()
 *   [3]    offset along Z, forwarded to detectCirclesInSlice()
 *
 * Output layout (clResultBuffer): (4 + CIRCLE_DETECTION_RESULT_SIZE) floats per task
 *   [0..3] echo of the task's direction and offset
 *   [4.. ] region handed to detectCirclesInSlice() through its start index
 *
 * Parameters:
 *   pointsBuffer       point data, forwarded untouched to detectCirclesInSlice()
 *   x0, y0, z0         origin given to setOrigin() for the per-task cartesian system
 *   sliceSize, sliceThickness, scoreThresholdPix,
 *   aboutThisRadius, radiusMaxAllowed
 *                      forwarded untouched to detectCirclesInSlice()
 *   numTasks           number of valid tasks; work-items at or beyond it do nothing
 *   nPoints            number of points, forwarded to detectCirclesInSlice()
 *   passingThruOrigin  treated as a flag: true only when equal to 1
 *
 * NOTE(review): a work-item that exits early (index out of range, or
 * setVerticalAxis() returning false) writes nothing at all to clResultBuffer,
 * not even the 4-float header. Confirm the host pre-initializes the buffer.
 */
kernel void detectCirclesInSlices(global const float4 *pointsBuffer, global const float *dirAndOffsetBuffer, global float *clResultBuffer, 
	const float x0, const float y0, const float z0, const float sliceSize, const float sliceThickness,
	const int numTasks, const int nPoints,  const float scoreThresholdPix, 
	const float aboutThisRadius, const float radiusMaxAllowed, const int passingThruOrigin) 
{
	// One task per work-item; surplus work-items (global size padding) bail out.
	const int taskIndex = get_global_id(0);
	if (taskIndex >= numTasks)
		return;

	// Fetch this task's packed input: direction (3 floats) followed by the Z offset.
	global const float *taskInput = dirAndOffsetBuffer + 4 * taskIndex;
	const float3 axisDir = (float3)(taskInput[0], taskInput[1], taskInput[2]);
	float offsetZ = taskInput[3];

	// Build the per-task cartesian system (12 floats, zero-initialized).
	// The task is abandoned when setVerticalAxis() rejects the direction.
	float cartesianSystem[12] = {0};
	if (!setVerticalAxis(axisDir, cartesianSystem, true))
		return;

	setOrigin((float3)(x0, y0, z0), cartesianSystem);

	// Host passes the flag as an int; only the exact value 1 enables it.
	bool thruOrigin = (passingThruOrigin == 1);

	// Private, zero-initialized topology grid used by the detection routine.
	uchar grid[CIRCLE_DETECTION_GRID_SIZE*CIRCLE_DETECTION_GRID_SIZE] = {0};

	// Upper bound on the number of points to process, so that an overly long
	// computation does not crash the GPU.
	const int nbMaxPointsAllowed = 1000000;

	// Write the 4-float header of this task's output record.
	const int headerStart = (4 + CIRCLE_DETECTION_RESULT_SIZE) * taskIndex;
	clResultBuffer[headerStart + 0] = axisDir.x;
	clResultBuffer[headerStart + 1] = axisDir.y;
	clResultBuffer[headerStart + 2] = axisDir.z;
	clResultBuffer[headerStart + 3] = offsetZ;

	// Detection results start right after the header.
	// NOTE(review): the result slots are not pre-filled here (a former fill
	// with -1 is disabled); presumably detectCirclesInSlice() writes them —
	// verify against its implementation.
	const int resultStart = headerStart + 4;

	detectCirclesInSlice(grid, CIRCLE_DETECTION_GRID_SIZE, CIRCLE_DETECTION_GRID_SIZE,
		pointsBuffer, nPoints, nbMaxPointsAllowed, cartesianSystem, offsetZ, sliceSize, sliceThickness,
		aboutThisRadius, radiusMaxAllowed,
		clResultBuffer, resultStart, scoreThresholdPix, thruOrigin);
}