/*
 * detectCircles - OpenCL kernel running one circle detection per work-item.
 *
 * Each work-item with a global id below numTasks:
 *   1. pre-fills its own CIRCLE_DETECTION_RESULT_SIZE floats of
 *      clResultBuffer with -1,
 *   2. builds a local 12-float cartesian system from its direction
 *      (dirsBuffer[id].xyz) and the shared origin (x0, y0, z0),
 *   3. calls detectCirclesInSlice() with a private, zero-initialised grid.
 *
 * If setVerticalAxis() reports failure, the work-item returns early and its
 * result slots keep the -1 fill.
 *
 * pointsBuffer       points handed to detectCirclesInSlice() (not read here)
 * dirsBuffer         one direction per work-item; only x, y, z are used
 * clResultBuffer     output, CIRCLE_DETECTION_RESULT_SIZE floats per work-item
 * x0, y0, z0         origin passed to setOrigin()
 * sliceSize          forwarded unchanged to detectCirclesInSlice()
 * sliceThickness     forwarded unchanged to detectCirclesInSlice()
 * numTasks           number of valid work-items; higher ids do nothing
 * nPoints            forwarded with pointsBuffer to detectCirclesInSlice()
 * scoreThresholdPix  forwarded unchanged to detectCirclesInSlice()
 */
kernel void detectCircles(global const float4 *pointsBuffer, global const float4 *dirsBuffer, global float *clResultBuffer, 
	const float x0, const float y0, const float z0, const float sliceSize, const float sliceThickness, 
	const int numTasks, const int nPoints, const float scoreThresholdPix) 
{
	// Work-item index; plays the role of the loop counter in serial code.
	const int taskId = get_global_id(0);
	if (taskId >= numTasks)
	{
		// The global work size may exceed the number of tasks.
		return;
	}

	// Mark every result slot owned by this work-item as -1 before doing any work.
	const int resultBase = CIRCLE_DETECTION_RESULT_SIZE * taskId;
	global float *resultSlots = clResultBuffer + resultBase;
	for (int k = 0; k < CIRCLE_DETECTION_RESULT_SIZE; ++k)
	{
		resultSlots[k] = -1.0f;
	}

	// Set up the cartesian system: vertical axis first, then origin.
	float frame[12] = {0};
	const float4 rawDir = dirsBuffer[taskId];
	const float3 axis = rawDir.xyz; // the w component is ignored
	if (!setVerticalAxis(axis, frame, true))
	{
		return;
	}
	const float3 center = (float3)(x0, y0, z0);
	setOrigin(center, frame);

	// Private topology grid, cleared to zero.
	uchar topoGrid[CIRCLE_DETECTION_GRID_SIZE*CIRCLE_DETECTION_GRID_SIZE] = {0};

	// Upper limit on the number of processed points, so that an overly long
	// computation does not crash the GPU.
	const int maxPoints = 1000000;

	// Passed as the last argument of detectCirclesInSlice().
	const bool throughOrigin = true;

	// Run the detection; the results go to clResultBuffer starting at resultBase.
	detectCirclesInSlice(topoGrid, CIRCLE_DETECTION_GRID_SIZE, CIRCLE_DETECTION_GRID_SIZE,
		pointsBuffer, nPoints, maxPoints, frame, 0, sliceSize, sliceThickness,
		-1, infinity, clResultBuffer, resultBase, scoreThresholdPix, throughOrigin);
}