/* Some notes about OpenCL.
 * Many GPUs (Quadro included) use little-endian byte ordering, whereas Java uses big-endian by default. Luckily, the
 * JogAmp library already converts the CLBuffers whenever necessary; however, sending custom raw byte data requires
 * paying attention to this.
 */

/*
 * Editor-only stubs. A real OpenCL compile defines __OPENCL_VERSION__, so this whole block is skipped there.
 * Outside of OpenCL (e.g. when Eclipse parses this file as plain "C"), the OpenCL qualifiers are defined away
 * and the vector type names are mapped onto scalar types purely so the editor can highlight the file.
 */
#ifndef __OPENCL_VERSION__ // Not defined => we are not being compiled as OpenCL.
// Address-space / kernel qualifiers: expand to nothing.
#define __kernel
#define __global
#define kernel
#define global
#define constant
#define local
// Vector types: placeholders only, NOT usable as real vectors.
#define float2 float
#define int3 int
#define float3 float
#define float4 float
#define uchar4 char
#endif

/*
 * Casts one camera ray per pixel against an axis-aligned box and stores the eye-to-hit distance.
 *
 * One work-item handles one pixel; work-items with a global id >= W*H do nothing.
 *
 *   zData            output buffer indexed by pixel (row * W + column); only written when the ray cast succeeds,
 *                    otherwise the existing value is left untouched
 *   W, H             image size in pixels
 *   fov              angle spanning the image width (fed to tan(), i.e. radians)
 *   eye*, dir*, up*  camera position, viewing direction and up vector (dir and up are normalised here)
 *   *min, *max       the two corners of the box passed to getRayCastedSurfaceElementAABB
 */
kernel void calculateZBufferAABB(__global float* zData, int W, int H, float fov, 
	float eyex, float eyey, float eyez, float dirx, float diry, float dirz, float upx, float upy, float upz,
	float xmin, float ymin, float zmin, float xmax, float ymax, float zmax)
{
	const int pixel = get_global_id(0);
	if (pixel >= W * H)
		return;

	// Image plane at unit distance from the eye: it is 2*tan(fov/2) wide and that width covers W pixels.
	const float halfWidth = tan(0.5f*fov);
	const float planeWidth = 2 * halfWidth;
	const float pixelsPerUnit = (float)W / planeWidth;
	const float unitsPerPixel = 1 / pixelsPerUnit;

	const float3 eye = (float3)(eyex, eyey, eyez);

	// Camera frame: w points against the viewing direction, v follows the supplied up vector, u = v x w.
	float3 w = (float3)(-dirx, -diry, -dirz);
	w /= length(w);
	float3 v = (float3)(upx, upy, upz);
	v /= length(v);
	float3 u = cross(v, w);
	u /= length(u);

	// Pixel centre expressed on the unit plane; rows grow downwards, hence the sign flip on the v coordinate.
	const int row = pixel / W;
	const int col = pixel % W;
	const float planeU =  (-0.5f*W + col + 0.5f) * unitsPerPixel;
	const float planeV = -(-0.5f*H + row + 0.5f) * unitsPerPixel;

	// Unit direction from the eye through the pixel centre.
	float3 ray = -w;
	ray += planeU * u + planeV * v;
	ray /= length(ray);

	float3 hitPoint = (float3)(0, 0, 0);
	float3 hitNormal = (float3)(0, 0, 0); // handed to the helper, not read afterwards

	const float3 boxMin = (float3)(xmin, ymin, zmin);
	const float3 boxMax = (float3)(xmax, ymax, zmax);

	if (!getRayCastedSurfaceElementAABB(eye, ray, boxMin, boxMax, &hitPoint, &hitNormal))
		return;

	// Euclidean distance between the eye and the reported point.
	const float3 d = hitPoint - eye;
	zData[pixel] = sqrt(d.x * d.x + d.y * d.y + d.z * d.z);
}
/*
 * Projects every point of a list into the W x H image of a perspective camera and records, per point,
 * the pixel it lands on and its distance from the eye.
 *
 * One work-item handles one point; work-items with a global id >= nPoints do nothing.
 *
 *   listPoints       input points; only the xyz components are read
 *   nPoints          number of entries in listPoints
 *   matDataToLocal   transform data handed to transformPoint_arrayVersion before projecting
 *   zIndices         output, per point: linear pixel index (row * W + column)
 *   zValues          output, per point: Euclidean distance from the eye to the transformed point
 *   W, H             image size in pixels
 *   fov              angle spanning the image width (fed to tan(), i.e. radians)
 *   thresholdZ       points whose depth along the viewing direction is below this value are skipped
 *   eye*, dir*, up*  camera position, viewing direction and up vector (dir and up are normalised here)
 *
 * zIndices[i] and zValues[i] are written only for points that pass every test below; for all other points
 * the existing buffer contents are left untouched.
 */
kernel void indirectZBufferPoints(global const float4 *listPoints, const int nPoints, constant float *matDataToLocal,
	global int* zIndices, global float* zValues, int W, int H, float fov, float thresholdZ,
	float eyex, float eyey, float eyez, float dirx, float diry, float dirz, float upx, float upy, float upz)
{
	const int i = get_global_id(0);
	if (i >= nPoints)
		return;

	// Image plane at unit distance from the eye: it is 2*tan(fov/2) wide and that width covers W pixels.
	float tanfov = tan(0.5f*fov);
	float Wphys = 2 * tanfov;
	float phys2pix = (float)W / Wphys;
	float pix2phys = 1 / phys2pix;

	// Camera frame: w points against the viewing direction, v follows the supplied up vector, u = v x w.
	float3 eye = (float3)(eyex, eyey, eyez);
	float3 w = (float3)(-dirx, -diry, -dirz);
	float norm = length(w);
	w.x /= norm;
	w.y /= norm;
	w.z /= norm;
	float3 v = (float3)(upx, upy, upz);
	norm = length(v);
	v.x /= norm;
	v.y /= norm;
	v.z /= norm;
	float3 u = cross(v, w);
	norm = length(u);
	u /= norm;

	const float3 coords = listPoints[i].xyz;

	float3 coordLocal = transformPoint_arrayVersion(matDataToLocal, coords);

	// Express the point in the camera frame. Because w points backwards, points in front of the camera
	// have a negative zCart; anything nearer than thresholdZ (or behind the eye) is rejected.
	float3 vLocal = (float3)(coordLocal.x-eye.x, coordLocal.y-eye.y, coordLocal.z-eye.z);
	float zCart = dot(vLocal, w);
	if (zCart > -thresholdZ)
		return;
	float xCart = dot(vLocal, u);
	float yCart = dot(vLocal, v);

	// NOTE(review): presumably a perspective projection onto the plane z = -1 with its centre at the origin;
	// a zero w component is treated as "no projection" - confirm against projectionPerspectiveSurPlan.
	float3 coordCart = (float3)(xCart, yCart, zCart);
	float4 projCart = projectionPerspectiveSurPlan(coordCart, (float3)(0,0,-1), (float3)(0,0,1), (float3)(0,0,0));
	if (projCart.w == 0)
		return;

	// Convert plane coordinates to pixel coordinates; rows grow downwards, hence the minus sign for y.
	int iCol = round(0.5f*W + projCart.x / pix2phys);
	int iRow = round(0.5f*H - projCart.y / pix2phys);
	if (iCol>=0 && iCol<W && iRow>=0 && iRow<H)
	{
		zIndices[i] = iRow * W + iCol;
		zValues[i] = (float)sqrt(xCart*xCart + yCart*yCart + zCart*zCart);
	}
}
