535 lines
14 KiB
Common Lisp

#pragma OPENCL EXTENSION cl_amd_printf : enable
#define float3 float4
#define uint3 uint4
#define PARTICLE_RADIUS 0.05;
#define width 1280
#define height 1024
#define B 0
#define T height
#define L 0
#define R width
#define shiftNumber 4
#define shiftMask 0xF
#define shiftValue 16.0f
#define stride 4
#define screenWidth1 width
#define screenHeight1 height
#define halfScreenWidth1 screenWidth1/2
#define halfScreenHeight1 screenHeight1/2
#define screenWidth1SubOne (screenWidth1-1)
#define screenHeight1SubOne (screenHeight1-1)
#define stride screenWidth1
#define screenPixelNumber screenWidth1*screenHeight1
#define depthBufferSize screenPixelNumber*depthComplexity
#define WGS 1
//---------------------------------------------------------------
struct __VSSpriteOut
{
float4 position;
float4 particlePosition;
};
typedef struct __VSSpriteout VSSpriteOut;
struct __GSSpriteOut
{
float4 position;
float2 textureUV;
// float4 viewSpacePosition;
// float4 particlePosition;
};
typedef struct __GSSpriteout GSSpriteOut;
//------------------------------------------------------------------------------
__constant float4 g_positions[4] =
{
(float4)(-1.0f, 1.0f, 0.0f, 0.0f),
(float4)( 1.0f, 1.0f, 0.0f, 0.0f),
(float4)( -1.0f, -1.0f, 0.0f, 0.0f),
(float4)( 1.0f, -1.0f, 0.0f, 0.0f)
};
__constant float2 g_texcoords[4] =
{
(float2)(0.0f,0.0f),
(float2)(1.0f,0.0f),
(float2)(0.0f,1.0f),
(float2)(1.0f,1.0f)
};
//------------------------------------------------------------------------------
void copyMatrix(
float matrix[16],
__constant float matrix0[16])
{
uint i;
for (i = 0; i < 16; i++) {
matrix[i] = matrix0[i];
}
}
void matrixMulLoopBody(
uint i,
float matrix[16],
__constant float matrix0[16],
__constant float matrix1[16])
{
matrix[i] = 0.0f;
matrix[i] += matrix0[(i%4) + (0*4)] * matrix1[(0) + ((i/4)*4)];
matrix[i] += matrix0[(i%4) + (1*4)] * matrix1[(1) + ((i/4)*4)];
matrix[i] += matrix0[(i%4) + (2*4)] * matrix1[(2) + ((i/4)*4)];
matrix[i] += matrix0[(i%4) + (3*4)] * matrix1[(3) + ((i/4)*4)];
}
void matrixMul(
float matrix[16],
__constant float matrix0[16],
__constant float matrix1[16])
{
matrixMulLoopBody(0, matrix, matrix0, matrix1);
matrixMulLoopBody(1, matrix, matrix0, matrix1);
matrixMulLoopBody(2, matrix, matrix0, matrix1);
matrixMulLoopBody(3, matrix, matrix0, matrix1);
matrixMulLoopBody(4, matrix, matrix0, matrix1);
matrixMulLoopBody(5, matrix, matrix0, matrix1);
matrixMulLoopBody(6, matrix, matrix0, matrix1);
matrixMulLoopBody(7, matrix, matrix0, matrix1);
matrixMulLoopBody(8, matrix, matrix0, matrix1);
matrixMulLoopBody(9, matrix, matrix0, matrix1);
matrixMulLoopBody(10, matrix, matrix0, matrix1);
matrixMulLoopBody(11, matrix, matrix0, matrix1);
matrixMulLoopBody(12, matrix, matrix0, matrix1);
matrixMulLoopBody(13, matrix, matrix0, matrix1);
matrixMulLoopBody(14, matrix, matrix0, matrix1);
matrixMulLoopBody(15, matrix, matrix0, matrix1);
}
float4 matrixVectorMul(float matrix[16], float4 vector)
{
float4 result;
result.x = matrix[0]*vector.x + matrix[4+0]*vector.y + matrix[8+0]*vector.z + matrix[12+0]*vector.w;
result.y = matrix[1]*vector.x + matrix[4+1]*vector.y + matrix[8+1]*vector.z + matrix[12+1]*vector.w;
result.z = matrix[2]*vector.x + matrix[4+2]*vector.y + matrix[8+2]*vector.z + matrix[12+2]*vector.w;
result.w = matrix[3]*vector.x + matrix[4+3]*vector.y + matrix[8+3]*vector.z + matrix[12+3]*vector.w;
return result;
}
float3 matrixVector3Mul(__constant float matrix[9], float3 vector)
{
float3 result;
result.x = matrix[0]*vector.x + matrix[3+0]*vector.y + matrix[6+0]*vector.z;
result.y = matrix[1]*vector.x + matrix[3+1]*vector.y + matrix[6+1]*vector.z;
result.z = matrix[2]*vector.x + matrix[3+2]*vector.y + matrix[6+2]*vector.z;
return result;
}
//------------------------------------------------------------------------------
//#define DEVICE_CPU 1
#if defined(DEVICE_CPU)
void printMatrix(char * name, __constant float matrix[16])
{
printf("%s[0] = %f, %f, %f, %f\n", name, matrix[0], matrix[1], matrix[2], matrix[3]);
printf("%s[1] = %f, %f, %f, %f\n", name, matrix[4], matrix[5], matrix[6], matrix[7]);
printf("%s[2] = %f, %f, %f, %f\n", name, matrix[8], matrix[9], matrix[10], matrix[11]);
printf("%s[3] = %f, %f, %f, %f\n", name, matrix[12], matrix[13], matrix[14], matrix[15]);
}
#endif
#if 1
__kernel void vertexShader(
__constant float modelview[16],
__constant float projection[16],
__global float4 * inputPrimitives,
__global float4 * outputPrimitives)
{
float matrix[16];
float4 gl_Vertex;
float4 gl_Position;
uint id = get_global_id(0);
gl_Vertex = inputPrimitives[id];
// gl_ProjectionMatrix * gl_ModelViewMatrix * gl_Vertex
matrixMul(matrix, projection, modelview);
gl_Position = matrixVectorMul(matrix, gl_Vertex);
outputPrimitives[id] = gl_Position;
}
#else
__kernel void vertexShader(
__constant float modelview[16],
__constant float projection[16],
__global float4 * inputPrimitives,
__global float4 * outputPrimitives)
{
uint id = get_global_id(0);
outputPrimitives[id] = inputPrimitives[id];
}
#endif
//-----------------------------------------------------------------------------------
__kernel void
clearImage(
__write_only image2d_t image,
float4 color)
{
int2 coords = (int2)(get_global_id(0), get_global_id(1));
write_imagef(image, coords, color);
}
// OpenGL viewport transformation
// The site http://research.cs.queensu.ca/~jstewart/454/notes/pipeline/
// contains a description of this process
void
viewportTransform(float4 v, __constant int4 viewport[1], float2 * output)
{
int4 vp = viewport[0];
*output
= 0.5f *
(float2)(v.x+1,v.y+1) *
(float2)((vp.s2-vp.s0) + vp.s0,
(vp.s3-vp.s1) + vp.s1);
}
#define PARTICLE_WIDTH 32.0f
#define PARTICLE_HEIGHT 32.0f
// Unoptimized triangle rasterizer function
// Details of the algorithm can be found here:
// http://www.devmaster.net/forums/showthread.php?t=1884
//
void
rasterizerUnOpt(
__global struct __GSSpriteOut * outputPrimitives,
// __global float4 * outputPrimitives,
__constant int4 viewport[1],
__write_only image2d_t screen,
__read_only image2d_t particle,
uint v1Offset,
uint v2Offset,
uint v3Offset,
__global float4 * debugOut1)
{
sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
uint id = get_global_id(0);
struct __GSSpriteOut output;
float2 v1, v2, v3;
float2 uv1, uv2, uv3;
output = outputPrimitives[id*4+v1Offset];
uv1 = output.textureUV;
viewportTransform(output.position, viewport, &v1);
output = outputPrimitives[id*4+v2Offset];
uv2 = output.textureUV;
viewportTransform(output.position, viewport, &v2);
output = outputPrimitives[id*4+v3Offset];
uv3 = output.textureUV;
viewportTransform(output.position, viewport, &v3);
// Bounding rectangle
int2 min_ = convert_int2(min(v1, min(v2, v3)));
int2 max_ = convert_int2(max(v1, max(v2, v3)));
// naive bi-linear interploation for texture coords, note this is
// broken with respect to OpenGL and needs to be fixed for the
// general case.
float p1x = v2.x - v1.x;
float p1y = v2.y - v1.y;
float p2x = v3.x - v1.x;
float p2y = v3.y - v1.y;
// Scan through bounding rectangle
for(int y = min_.y; y < max_.y; y++) {
for(int x = min_.x; x < max_.x; x++) {
// When all half-space functions positive, pixel is in triangle
if((v1.x - v2.x) * (y - v1.y) - (v1.y - v2.y) * (x - v1.x) > 0 &&
(v2.x - v3.x) * (y - v2.y) - (v2.y - v3.y) * (x - v2.x) > 0 &&
(v3.x - v1.x) * (y - v3.y) - (v3.y - v1.y) * (x - v3.x) > 0) {
float px = x - v1.x;
float py = y - v1.y;
write_imagef(
screen,
(int2)(x,y),
// texel);
(float4)(1.0f,1.0f,1.0f,1.0f));
}
}
}
}
// Optimized rasterizer function
// Details of the algorithm can be found here:
// http://www.devmaster.net/forums/showthread.php?t=1884
//
// Currently has a bug, still work in progess
__kernel void
rasterizerXX(
__global float4 * outputPrimitives,
__write_only image2d_t screen,
__global float4 * debugOut1,
__global int2 * debugOut2)
{
uint id = get_global_id(0);
// printf("ras\n");
float4 v1 = outputPrimitives[id*4+0];
float4 v2 = outputPrimitives[id*4+1];
float4 v3 = outputPrimitives[id*4+2];
float y1 = 0.5f* (v1.y+1) * (T - B) + B;
float y2 = 0.5f* (v2.y+1) * (T - B) + B;
float y3 = 0.5f* (v3.y+1) * (T - B) + B;
float x1 = 0.5f * (v1.x+1) * (R - L) + L;
float x2 = 0.5f * (v2.x+1) * (R - L) + L;
float x3 = 0.5f * (v3.x+1) * (R - L) + L;
const int Y1 = convert_int(shiftValue * y1);
const int Y2 = convert_int(shiftValue * y2);
const int Y3 = convert_int(shiftValue * y3);
const int X1 = convert_int(shiftValue * x1);
const int X2 = convert_int(shiftValue * x2);
const int X3 = convert_int(shiftValue * x3);
debugOut1[id*4+0] = v1;
debugOut1[id*4+1] = v2;
debugOut1[id*4+2] = v3;
debugOut2[id*3+0] = (int2)(X1, Y1);
debugOut2[id*3+1] = (int2)(X2, Y2);
debugOut2[id*3+2] = (int2)(X3, Y3);
// Deltas
const int DX12 = X1 - X2;
const int DX23 = X2 - X3;
const int DX31 = X3 - X1;
const int DY12 = Y1 - Y2;
const int DY23 = Y2 - Y3;
const int DY31 = Y3 - Y1;
// Fixed-point deltas
const int FDX12 = DX12 << shiftNumber;
const int FDX23 = DX23 << shiftNumber;
const int FDX31 = DX31 << shiftNumber;
const int FDY12 = DY12 << shiftNumber;
const int FDY23 = DY23 << shiftNumber;
const int FDY31 = DY31 << shiftNumber;
// Bounding rectangle
int minx = (min(X1, min(X2, X3)) + shiftMask) >> shiftNumber;
//minx = max(0,minx);
int maxx = (max(X1, min(X2, X3)) + shiftMask) >> shiftNumber;
//min(maxx , screenWidth1SubOne);
int miny = (min(Y1, min(Y2, Y3)) + shiftMask) >> shiftNumber;
//max(0,miny);
int maxy = (max(Y1, min(Y2, Y3)) + shiftMask) >> shiftNumber;
//min(maxy , screenHeight1SubOne);
//(char*&)colorBuffer += miny * stride;
int offset = miny * stride;
// Half-edge constants
int C1 = DY12 * X1 - DX12 * Y1;
int C2 = DY23 * X2 - DX23 * Y2;
int C3 = DY31 * X3 - DX31 * Y3;
// Correct for fill convention
if(DY12 < 0 || (DY12 == 0 && DX12 > 0)) C1++;
if(DY23 < 0 || (DY23 == 0 && DX23 > 0)) C2++;
if(DY31 < 0 || (DY31 == 0 && DX31 > 0)) C3++;
int CY1 = C1 + DX12 * (miny << shiftNumber) - DY12 * (minx << shiftNumber);
int CY2 = C2 + DX23 * (miny << shiftNumber) - DY23 * (minx << shiftNumber);
int CY3 = C3 + DX31 * (miny << shiftNumber) - DY31 * (minx << shiftNumber);
for(int y = miny; y < maxy; y++) {
int CX1 = CY1;
int CX2 = CY2;
int CX3 = CY3;
debugOut2[id*3+0] = (int2)(minx, maxx);
for(int x = minx; x < maxx; x++) {
debugOut2[id*3+0] = (int2)(CX1, CX2);
if(CX1 > 0 && CX2 > 0 && CX3 > 0) {
debugOut2[id*3+0] = (int2)(1, 1);
write_imagef(
screen,
(int2)(x,y),
(float4)(1.0f,1.0f,1.0f,1.0f));
}
CX1 -= FDY12;
CX2 -= FDY23;
CX3 -= FDY31;
}
CY1 += FDX12;
CY2 += FDX23;
CY3 += FDX31;
//(char*&)colorBuffer += stride;
offset += stride;
}
}
//------------------------------------------------------------------------------
void geometryShader(
__constant float modelview[16],
__constant float projection[16],
__constant float inverseView[9],
__constant int4 viewport[1],
__local struct __VSSpriteOut * vsOutputPrimitives,
__global struct __GSSpriteOut * outputPrimitives,
// __global float4 * outputPrimitives,
__write_only image2d_t screen,
__read_only image2d_t particle,
__global float4 * debugOut1,
__global int * debugOut2)
{
float2 texcoords[4] =
{
(float2)(0.0f,0.0f),
(float2)(1.0f,0.0f),
(float2)(0.0f,1.0f),
(float2)(1.0f,1.0f)
};
float matrix[16];
uint id = get_global_id(0);
uint lid = get_local_id(0);
float4 vsPosition = vsOutputPrimitives[lid].position;
matrixMul(matrix, projection, modelview);
//
// Emit two new triangles
//
for (uint i = 0; i<4; i++) {
float3 position = g_positions[i] * PARTICLE_RADIUS;
position = matrixVector3Mul(inverseView, position) + vsPosition;
float3 particlePosition =
matrixVector3Mul(
inverseView,
(float4)(0.0f,0.0f,0.0f,0.0f)) + vsPosition; // world space
// Compute view space position
position.w = 1.0f;
position = matrixVectorMul(matrix, position);
//perspective division
position /= position.w;
struct __GSSpriteOut output;
output.position = position;
//output.textureUV = g_texcoords[i];
output.textureUV = texcoords[i];
outputPrimitives[id*4+i] = output;
}
// Render QUAD - Triangle 1
rasterizerUnOpt(
outputPrimitives,
viewport,
screen,
particle,
0,
1,
2,
debugOut1);
// Render QUAD - Triangle 2
rasterizerUnOpt(
outputPrimitives,
viewport,
screen,
particle,
2,
1,
3,
debugOut1);
}
__kernel void vertexShaderSprite(
__constant float modelview[16],
__constant float projection[16],
__constant float inverseView[9],
__constant int4 viewport[1],
__local struct __VSSpriteOut * vsOutputPrimitives,
__global float4 * inputPrimitives,
__global struct __GSSpriteOut * outputPrimitives,
// __global float4 * outputPrimitives,
__write_only image2d_t screen,
__read_only image2d_t particle,
__global float4 * debugOut1,
__global int * debugOut2)
{
float matrix[16];
uint id = get_global_id(0);
uint lid = get_local_id(0);
// gl_ProjectionMatrix * gl_ModelViewMatrix * gl_Vertex
matrixMul(matrix, projection, modelview);
float4 position = inputPrimitives[id];
vsOutputPrimitives[lid].position = position;
vsOutputPrimitives[lid].particlePosition =
matrixVectorMul(matrix, position);
geometryShader(
modelview,
projection,
inverseView,
viewport,
vsOutputPrimitives,
outputPrimitives,
screen,
particle,
debugOut1,
debugOut2);
}