/* Some notes about OpenCL.
 * Many GPUs (Quadro included) use little-endian byte ordering, whereas Java uses big-endian by default. Luckily, the
 * Jogamp library already converts the CLBuffers whenever necessary; however, sending custom raw byte data requires
 * paying attention to this.
 */

/* IDE-only stubs.
 * A real OpenCL compile defines __OPENCL_VERSION__, so this branch is skipped there. It is only taken when the file
 * is parsed as plain "C" (e.g. by Eclipse for syntax highlighting): the OpenCL qualifiers are defined away and the
 * OpenCL-specific types used in this file are aliased to scalar C types.
 */
#ifndef __OPENCL_VERSION__
#define __kernel
#define __global
#define kernel
#define global
#define constant
#define local
#define float2 float
#define int3 int
#define float3 float
#define float4 float
#define uchar unsigned char // used for the occupancy grid cells in detectCircles
#define uchar4 char
#endif

/* Side length, in cells, of the square occupancy grid (N*N cells) that detectCircles builds per work-item. */
#define N 101

/* Circle detection kernel: one work-item per candidate direction.
 *
 * For work-item i (i < numTasks) the kernel:
 *   1. builds a 12-float local frame from dirsBuffer[i].xyz (setVerticalAxis) and the origin (x0, y0, z0) (setOrigin);
 *   2. converts the points to that frame (getLocalX/Y/Z) and marks, in an N x N grid, the cells hit by points
 *      lying strictly inside the slice: |u| < sliceSize/2, |v| < sliceSize/2, |w| < sliceThickness/2;
 *   3. hands the grid to identifyDistinctFormsInGrid_max254Forms (presumably labels connected forms in place --
 *      confirm against that helper) and reads the form id stored at the centre cell;
 *   4. fits a circle to that form with doCircleLevenbergMarquardt and publishes it if the score is acceptable.
 *
 * Parameters:
 *   pointsBuffer    points to rasterise; only .xyz is read.
 *   dirsBuffer      one direction per task; only .xyz is read.
 *   clResultBuffer  one float4 per task. Set to (-1, -1, -1, -1) first and left that way on any early exit;
 *                   on success it holds (circle X, circle Y, circle R, score) as returned by the fit.
 *   x0, y0, z0      origin of the local frame.
 *   sliceSize       full extent of the slice along local X and Y; also sets the cell size (sliceSize / N).
 *   sliceThickness  full extent of the slice along local Z.
 *   numTasks        number of valid work-items; extra work-items return immediately.
 *   nPoints         number of entries in pointsBuffer; at most 1,000,000 of them are scanned.
 *   scoreThreshold  a fit whose score is strictly greater than this is rejected.
 */
kernel void detectCircles(global const float4 *pointsBuffer, global const float4 *dirsBuffer, global float4 *clResultBuffer, 
	const float x0, const float y0, const float z0, const float sliceSize, const float sliceThickness, 
	const int numTasks, const int nPoints, const float scoreThreshold) 
{
	const int iWorker = get_global_id(0); //The worker ID
	// Bound check (equivalent to the limit on a 'for' loop in standard/serial C code).
	if (iWorker >= numTasks)
		return;

	// Publish the "nothing found" marker up front so every early return below leaves a defined result.
	clResultBuffer[iWorker] = (float4)(-1.0f, -1.0f, -1.0f, -1.0f);

	const float3 dir = dirsBuffer[iWorker].xyz;
	const float3 origin = (float3)(x0, y0, z0);

	float m_data[12] = {0};
	if (!setVerticalAxis(dir, m_data, true))
		return;

	setOrigin(origin, m_data);

	/*	
	float3 axisX = getAxisX(m_data);
	float3 axisY = getAxisY(m_data);
	float3 axisZ = getAxisZ(m_data);
	float3 origin = getOrigin(m_data);
	*/

	// The initialiser zeroes every cell; no separate clearing pass is needed.
	uchar grid[N*N] = {0};

	// Loop invariants, kept in single precision so the kernel does not depend on double support.
	const float pixelDim = sliceSize / (float)N;
	const float halfSliceSize = 0.5f * sliceSize;
	const float halfSliceThickness = 0.5f * sliceThickness;

	// Points beyond the cap are ignored, so clamp the loop bound instead of testing the cap on every iteration.
	const int nbMaxPointsAllowed = 1000000;
	const int nbPointsToScan = (nPoints < nbMaxPointsAllowed) ? nPoints : nbMaxPointsAllowed;

	int nbCellsFilled = 0; // only consumed by the disabled debug output at the end of the kernel
	for (int i = 0; i < nbPointsToScan; i++)
	{
		const float3 point = pointsBuffer[i].xyz;
		const float u = getLocalX(point, m_data);
		const float v = getLocalY(point, m_data);
		const float w = getLocalZ(point, m_data);

		// Written as a negated conjunction on purpose: a NaN coordinate fails the '<' tests and is skipped.
		if (!((fabs(u) < halfSliceSize) && (fabs(v) < halfSliceSize) && (fabs(w) < halfSliceThickness)))
			continue;

		const int iCol = (int)round((u + halfSliceSize) / pixelDim);
		const int iRow = (int)round((v + halfSliceSize) / pixelDim);
		// round() can yield N for points close to the upper edge of the slice; such points are dropped.
		if (iCol < 0 || iCol >= N || iRow < 0 || iRow >= N)
			continue;

		const int iCell = iRow * N + iCol;
		if (grid[iCell] == 0)
		{
			grid[iCell] = 1;
			nbCellsFilled++;
		}
	}

	if (!identifyDistinctFormsInGrid_max254Forms(grid, N, N))
		return;

	// Only the form covering cell (N/2+1, N/2+1) is considered; id 0 there means no form at that cell.
	const int searchHalfSize = N/2+1;
	const uchar centricFormId = grid[searchHalfSize * N + searchHalfSize];
	if (centricFormId == 0)
		return;

	float radiusMaxAllowed = infinity;
	float parametresOptimises[circleSearchDim] = { 0 };
	const float score = doCircleLevenbergMarquardt(grid, centricFormId, N, N, radiusMaxAllowed, parametresOptimises);

	if (score > scoreThreshold)
		return;

	clResultBuffer[iWorker] = (float4)(parametresOptimises[circleParamIndexX],
		parametresOptimises[circleParamIndexY],
		parametresOptimises[circleParamIndexR],
		score);

	/*
	uchar maxFormId = 0;
	for (int k=0; k<N*N; k++)
	{
		if (grid[k] > maxFormId)
			maxFormId = grid[k];
	}
	clResultBuffer[iWorker].x = iWorker;
	clResultBuffer[iWorker].y = centricFormId;
	clResultBuffer[iWorker].z = maxFormId;
	clResultBuffer[iWorker].w = nbCellsFilled;
	*/
}
