bullet-physics/Demos/OpenCLClothDemo/shaders.cl

#pragma OPENCL EXTENSION cl_amd_printf : enable

#define float3 float4
#define uint3  uint4

#define PARTICLE_RADIUS 0.05;

#define width 1280
#define height 1024

#define B 0
#define T height
#define L 0
#define R width

#define shiftNumber 4
#define shiftMask 0xF
#define shiftValue 16.0f
#define stride 4

#define screenWidth1 width
#define screenHeight1 height
#define halfScreenWidth1 screenWidth1/2
#define halfScreenHeight1 screenHeight1/2
#define screenWidth1SubOne (screenWidth1-1)
#define screenHeight1SubOne (screenHeight1-1)
#define stride screenWidth1
#define screenPixelNumber screenWidth1*screenHeight1
#define depthBufferSize screenPixelNumber*depthComplexity

#define WGS 1

//---------------------------------------------------------------

struct __VSSpriteOut
{
    float4 position;
    float4 particlePosition;
};

typedef struct __VSSpriteout VSSpriteOut;

struct __GSSpriteOut
{
    float4 position;
    float2 textureUV;
//	float4 viewSpacePosition;
//	float4 particlePosition;
};

typedef struct __GSSpriteout GSSpriteOut;

//------------------------------------------------------------------------------

__constant float4 g_positions[4] =
{
  (float4)(-1.0f, 1.0f, 0.0f, 0.0f),
  (float4)( 1.0f, 1.0f, 0.0f, 0.0f),
  (float4)( -1.0f, -1.0f, 0.0f, 0.0f),
  (float4)( 1.0f, -1.0f, 0.0f, 0.0f)
};

__constant float2 g_texcoords[4] =
{
	(float2)(0.0f,0.0f),
    (float2)(1.0f,0.0f),
    (float2)(0.0f,1.0f),
    (float2)(1.0f,1.0f)
};

//------------------------------------------------------------------------------

void copyMatrix(
	float matrix[16],
	__constant float matrix0[16])
{
	uint i;

	for (i = 0; i < 16; i++) {
		matrix[i] = matrix0[i];
	}
}

void matrixMulLoopBody(
	uint i,
	float matrix[16],
	__constant float matrix0[16],
	__constant float matrix1[16])
{
	matrix[i] = 0.0f;
	matrix[i] += matrix0[(i%4) + (0*4)] * matrix1[(0) + ((i/4)*4)];
	matrix[i] += matrix0[(i%4) + (1*4)] * matrix1[(1) + ((i/4)*4)];
	matrix[i] += matrix0[(i%4) + (2*4)] * matrix1[(2) + ((i/4)*4)];
	matrix[i] += matrix0[(i%4) + (3*4)] * matrix1[(3) + ((i/4)*4)];
}

void matrixMul(
	float matrix[16],
	__constant float matrix0[16],
	__constant float matrix1[16])
{
	matrixMulLoopBody(0, matrix, matrix0, matrix1);
	matrixMulLoopBody(1, matrix, matrix0, matrix1);
	matrixMulLoopBody(2, matrix, matrix0, matrix1);
	matrixMulLoopBody(3, matrix, matrix0, matrix1);
	matrixMulLoopBody(4, matrix, matrix0, matrix1);
	matrixMulLoopBody(5, matrix, matrix0, matrix1);
	matrixMulLoopBody(6, matrix, matrix0, matrix1);
	matrixMulLoopBody(7, matrix, matrix0, matrix1);
	matrixMulLoopBody(8, matrix, matrix0, matrix1);
	matrixMulLoopBody(9, matrix, matrix0, matrix1);
	matrixMulLoopBody(10, matrix, matrix0, matrix1);
	matrixMulLoopBody(11, matrix, matrix0, matrix1);
	matrixMulLoopBody(12, matrix, matrix0, matrix1);
	matrixMulLoopBody(13, matrix, matrix0, matrix1);
	matrixMulLoopBody(14, matrix, matrix0, matrix1);
	matrixMulLoopBody(15, matrix, matrix0, matrix1);
}

float4 matrixVectorMul(float matrix[16], float4 vector)
{
	float4 result;

	result.x = matrix[0]*vector.x + matrix[4+0]*vector.y + matrix[8+0]*vector.z + matrix[12+0]*vector.w;
	result.y = matrix[1]*vector.x + matrix[4+1]*vector.y + matrix[8+1]*vector.z + matrix[12+1]*vector.w;
	result.z = matrix[2]*vector.x + matrix[4+2]*vector.y + matrix[8+2]*vector.z + matrix[12+2]*vector.w;
	result.w = matrix[3]*vector.x + matrix[4+3]*vector.y + matrix[8+3]*vector.z + matrix[12+3]*vector.w;

	return result;
}

float3 matrixVector3Mul(__constant float matrix[9], float3 vector)
{
	float3 result;

	result.x = matrix[0]*vector.x + matrix[3+0]*vector.y + matrix[6+0]*vector.z;
	result.y = matrix[1]*vector.x + matrix[3+1]*vector.y + matrix[6+1]*vector.z;
	result.z = matrix[2]*vector.x + matrix[3+2]*vector.y + matrix[6+2]*vector.z;

	return result;
}

//------------------------------------------------------------------------------

//#define DEVICE_CPU 1
#if defined(DEVICE_CPU)
void printMatrix(char * name, __constant float matrix[16])
{
	printf("%s[0] = %f, %f, %f, %f\n", name, matrix[0], matrix[1], matrix[2], matrix[3]);
	printf("%s[1] = %f, %f, %f, %f\n", name, matrix[4], matrix[5], matrix[6], matrix[7]);
	printf("%s[2] = %f, %f, %f, %f\n", name, matrix[8], matrix[9], matrix[10], matrix[11]);
	printf("%s[3] = %f, %f, %f, %f\n", name, matrix[12], matrix[13], matrix[14], matrix[15]);
}
#endif

#if 1
__kernel void vertexShader(
    __constant float modelview[16],
	__constant float projection[16],
	__global float4 * inputPrimitives,
	__global float4 * outputPrimitives)
{
	float matrix[16];
	float4 gl_Vertex;
	float4 gl_Position;

	uint id = get_global_id(0);

	gl_Vertex = inputPrimitives[id];

	// gl_ProjectionMatrix * gl_ModelViewMatrix * gl_Vertex
	matrixMul(matrix, projection, modelview);

	gl_Position = matrixVectorMul(matrix, gl_Vertex);

	outputPrimitives[id] = gl_Position;
}

#else

__kernel void vertexShader(
    __constant float modelview[16],
	__constant float projection[16],
	__global float4 * inputPrimitives,
	__global float4 * outputPrimitives)
{
	uint id = get_global_id(0);

	outputPrimitives[id] = inputPrimitives[id];
}

#endif

//-----------------------------------------------------------------------------------

__kernel void
clearImage(
	__write_only image2d_t image,
	float4 color)
{

	int2 coords = (int2)(get_global_id(0), get_global_id(1));
	write_imagef(image, coords, color);
}

// OpenGL viewport transformation
// The site http://research.cs.queensu.ca/~jstewart/454/notes/pipeline/
// contains a description of this process
void
viewportTransform(float4 v, __constant int4 viewport[1], float2 * output)
{
	int4 vp = viewport[0];
	*output
		= 0.5f *
		  (float2)(v.x+1,v.y+1) *
		  (float2)((vp.s2-vp.s0) + vp.s0,
				   (vp.s3-vp.s1) + vp.s1);
}

#define PARTICLE_WIDTH  32.0f
#define PARTICLE_HEIGHT 32.0f

// Unoptimized triangle rasterizer function
// Details of the algorithm can be found here:
//		http://www.devmaster.net/forums/showthread.php?t=1884
//
void
rasterizerUnOpt(
    __global struct __GSSpriteOut * outputPrimitives,
//	 __global float4 * outputPrimitives,
	__constant int4  viewport[1],
	__write_only image2d_t screen,
	__read_only image2d_t particle,
	uint v1Offset,
	uint v2Offset,
	uint v3Offset,
	__global float4 * debugOut1)
{
	sampler_t sampler =
		CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

	uint id = get_global_id(0);

	struct __GSSpriteOut output;
	float2 v1, v2, v3;
	float2 uv1, uv2, uv3;

	output = outputPrimitives[id*4+v1Offset];
	uv1    = output.textureUV;
	viewportTransform(output.position, viewport, &v1);

	output = outputPrimitives[id*4+v2Offset];
	uv2    = output.textureUV;
	viewportTransform(output.position, viewport, &v2);

	output = outputPrimitives[id*4+v3Offset];
	uv3    = output.textureUV;
	viewportTransform(output.position, viewport, &v3);

	// Bounding rectangle
	int2 min_ = convert_int2(min(v1, min(v2, v3)));
	int2 max_ = convert_int2(max(v1, max(v2, v3)));

	// naive bi-linear interploation for texture coords, note this is
	// broken with respect to OpenGL and needs to be fixed for the
	// general case.
	float p1x = v2.x - v1.x;
	float p1y = v2.y - v1.y;

	float p2x = v3.x - v1.x;
	float p2y = v3.y - v1.y;

	// Scan through bounding rectangle
	for(int y = min_.y; y < max_.y; y++) {
		for(int x = min_.x; x < max_.x; x++) {
			// When all half-space functions positive, pixel is in triangle
			if((v1.x - v2.x) * (y - v1.y) - (v1.y - v2.y) * (x - v1.x) > 0 &&
			 (v2.x - v3.x) * (y - v2.y) - (v2.y - v3.y) * (x - v2.x) > 0 &&
			 (v3.x - v1.x) * (y - v3.y) - (v3.y - v1.y) * (x - v3.x) > 0) {

				float px = x - v1.x;
				float py = y - v1.y;

					write_imagef(
						screen,
						(int2)(x,y),
					//	texel);
						(float4)(1.0f,1.0f,1.0f,1.0f));
			}
		}
	}
}

// Optimized rasterizer function
// Details of the algorithm can be found here:
//		http://www.devmaster.net/forums/showthread.php?t=1884
//
// Currently has a bug, still work in progess
__kernel void
rasterizerXX(
    __global float4 * outputPrimitives,
	__write_only image2d_t screen,
	__global float4 * debugOut1,
	__global int2 * debugOut2)
{
	uint id = get_global_id(0);

//	printf("ras\n");

	float4 v1 = outputPrimitives[id*4+0];
	float4 v2 = outputPrimitives[id*4+1];
	float4 v3 = outputPrimitives[id*4+2];

	float y1 = 0.5f* (v1.y+1) * (T - B) + B;
	float y2 = 0.5f* (v2.y+1) * (T - B) + B;
    float y3 = 0.5f* (v3.y+1) * (T - B) + B;

    float x1 = 0.5f * (v1.x+1) * (R - L) + L;
	float x2 = 0.5f * (v2.x+1) * (R - L) + L;
	float x3 = 0.5f * (v3.x+1) * (R - L) + L;

    const int Y1 = convert_int(shiftValue * y1);
    const int Y2 = convert_int(shiftValue * y2);
    const int Y3 = convert_int(shiftValue * y3);

    const int X1 = convert_int(shiftValue * x1);
    const int X2 = convert_int(shiftValue * x2);
    const int X3 = convert_int(shiftValue * x3);

	debugOut1[id*4+0]   = v1;
	debugOut1[id*4+1]   = v2;
	debugOut1[id*4+2]   = v3;

	debugOut2[id*3+0] = (int2)(X1, Y1);
	debugOut2[id*3+1] = (int2)(X2, Y2);
	debugOut2[id*3+2] = (int2)(X3, Y3);

    // Deltas
    const int DX12 = X1 - X2;
    const int DX23 = X2 - X3;
    const int DX31 = X3 - X1;

    const int DY12 = Y1 - Y2;
    const int DY23 = Y2 - Y3;
    const int DY31 = Y3 - Y1;

    // Fixed-point deltas
    const int FDX12 = DX12 << shiftNumber;
    const int FDX23 = DX23 << shiftNumber;
    const int FDX31 = DX31 << shiftNumber;

    const int FDY12 = DY12 << shiftNumber;
    const int FDY23 = DY23 << shiftNumber;
    const int FDY31 = DY31 << shiftNumber;

    // Bounding rectangle
    int minx = (min(X1, min(X2, X3)) + shiftMask) >> shiftNumber;
	//minx = max(0,minx);

	int maxx = (max(X1, min(X2, X3)) + shiftMask) >> shiftNumber;
	//min(maxx , screenWidth1SubOne);

	int miny = (min(Y1, min(Y2, Y3)) + shiftMask) >> shiftNumber;
    //max(0,miny);

	int maxy = (max(Y1, min(Y2, Y3)) + shiftMask) >> shiftNumber;
	//min(maxy , screenHeight1SubOne);

    //(char*&)colorBuffer += miny * stride;
	int offset = miny * stride;

    // Half-edge constants
    int C1 = DY12 * X1 - DX12 * Y1;
    int C2 = DY23 * X2 - DX23 * Y2;
    int C3 = DY31 * X3 - DX31 * Y3;

    // Correct for fill convention
    if(DY12 < 0 || (DY12 == 0 && DX12 > 0)) C1++;
    if(DY23 < 0 || (DY23 == 0 && DX23 > 0)) C2++;
    if(DY31 < 0 || (DY31 == 0 && DX31 > 0)) C3++;

    int CY1 = C1 + DX12 * (miny << shiftNumber) - DY12 * (minx << shiftNumber);
    int CY2 = C2 + DX23 * (miny << shiftNumber) - DY23 * (minx << shiftNumber);
    int CY3 = C3 + DX31 * (miny << shiftNumber) - DY31 * (minx << shiftNumber);

    for(int y = miny; y < maxy; y++) {
        int CX1 = CY1;
        int CX2 = CY2;
        int CX3 = CY3;

		debugOut2[id*3+0] = (int2)(minx, maxx);

        for(int x = minx; x < maxx; x++) {
			debugOut2[id*3+0] = (int2)(CX1, CX2);

            if(CX1 > 0 && CX2 > 0 && CX3 > 0) {
				debugOut2[id*3+0] = (int2)(1, 1);
				write_imagef(
					screen,
					(int2)(x,y),
					(float4)(1.0f,1.0f,1.0f,1.0f));
           }

            CX1 -= FDY12;
            CX2 -= FDY23;
            CX3 -= FDY31;
        }

        CY1 += FDX12;
        CY2 += FDX23;
        CY3 += FDX31;

        //(char*&)colorBuffer += stride;
		offset += stride;
    }
}

//------------------------------------------------------------------------------

void geometryShader(
    __constant float modelview[16],
	__constant float projection[16],
	__constant float inverseView[9],
	__constant int4  viewport[1],
	__local struct __VSSpriteOut  * vsOutputPrimitives,
	__global struct __GSSpriteOut * outputPrimitives,
//	 __global float4 * outputPrimitives,
	__write_only image2d_t screen,
	__read_only image2d_t particle,
	__global float4 * debugOut1,
	__global int * debugOut2)
{
	float2 texcoords[4] =
	{
		(float2)(0.0f,0.0f),
		(float2)(1.0f,0.0f),
		(float2)(0.0f,1.0f),
		(float2)(1.0f,1.0f)
	};

	float matrix[16];

	uint id  = get_global_id(0);
	uint lid = get_local_id(0);

	float4 vsPosition = vsOutputPrimitives[lid].position;

	matrixMul(matrix, projection, modelview);
	//
	// Emit two new triangles
	//
	for (uint i = 0; i<4; i++) {
		float3 position = g_positions[i] * PARTICLE_RADIUS;
		position        = matrixVector3Mul(inverseView, position) + vsPosition;
		float3 particlePosition =
			matrixVector3Mul(
				inverseView,
				(float4)(0.0f,0.0f,0.0f,0.0f)) + vsPosition;	// world space

		// Compute view space position
		position.w               = 1.0f;
		position                 = matrixVectorMul(matrix, position);

		//perspective division
		position /= position.w;

		struct __GSSpriteOut output;
		output.position  = position;
		//output.textureUV = g_texcoords[i];
		output.textureUV = texcoords[i];
		outputPrimitives[id*4+i] = output;
	}

	// Render QUAD - Triangle 1
	rasterizerUnOpt(
		outputPrimitives,
		viewport,
		screen,
		particle,
		0,
		1,
		2,
		debugOut1);

	// Render QUAD - Triangle 2
	rasterizerUnOpt(
		outputPrimitives,
		viewport,
		screen,
		particle,
		2,
		1,
		3,
		debugOut1);
}

__kernel void vertexShaderSprite(
    __constant float modelview[16],
	__constant float projection[16],
	__constant float inverseView[9],
	__constant int4  viewport[1],
	__local struct __VSSpriteOut  * vsOutputPrimitives,
	__global float4               * inputPrimitives,
	__global struct __GSSpriteOut * outputPrimitives,
//	__global float4 * outputPrimitives,
	__write_only image2d_t screen,
	__read_only image2d_t particle,
	__global float4 * debugOut1,
	__global int * debugOut2)
{
	float matrix[16];

	uint id  = get_global_id(0);
	uint lid = get_local_id(0);

	// gl_ProjectionMatrix * gl_ModelViewMatrix * gl_Vertex
	matrixMul(matrix, projection, modelview);

	float4 position                          = inputPrimitives[id];
	vsOutputPrimitives[lid].position         = position;
    vsOutputPrimitives[lid].particlePosition =
		matrixVectorMul(matrix, position);

	geometryShader(
		modelview,
		projection,
		inverseView,
		viewport,
		vsOutputPrimitives,
		outputPrimitives,
		screen,
		particle,
		debugOut1,
		debugOut2);
}