I have an implementation of the SAT algorithm as described by Dirk's GDC talk and I'm trying to improve the performance. I'm only doing discrete collision detection. This is how my collision detection currently works:

1. Perform broadphase

2. Transform all potentially colliding convex polyhedra verts/normals into world space

3. Perform SAT on all pairs found by broadphase and regenerate all contact points

I don't do any caching of separating axes, I found it slower than just brute force retesting surprisingly. Currently the slowest part of my SAT implementation is the edge-edge tests.

For my implementation, I basically just dumbly converted Dirk's algorithm to SIMD and it seems to be around 3-4x faster than a scalar implementation.

First I just find all potential edge pairs and then iterate over the pairs checking for overlap or separation.

Finding the pairs tends to takes around 50-70% of the time. I tried also computing edge distance in the inner loop but it's maybe 30% slower than computing edge distances after pruning.

Code: Select all

```
// From Daniel Lemire's simd prune library
static inline __m128i prune_epi32(__m128i x, int mask)
{
return _mm_shuffle_epi8(x, _mm_loadu_si128((const __m128i *)mask128_epi32 + mask));
}
struct TransformedConvexShapeData
{
int num_edges;
float *edge_tangents_x;
float *edge_tangents_y;
float *edge_tangents_z;
float *edge_normals_L_x;
float *edge_normals_L_y;
float *edge_normals_L_z;
float *edge_normals_R_x;
float *edge_normals_R_y;
float *edge_normals_R_z;
float *edge_verts_x;
float *edge_verts_y;
float *edge_verts_z;
};
bool test_edge_axes_sse(const TransformedConvexShapeData &a, const TransformedConvexShapeData &b, TestEdgeAxesRes &res)
{
int edges_a[MAX_EDGE_PAIRS];
int edges_b[MAX_EDGE_PAIRS];
int num_pairs = 0;
__m128i index_a = _mm_set_epi32(3,2,1,0);
__m128i max_edges_a = _mm_set1_epi32(a.num_edges);
for(int i = 0; i < a.num_edges; i += 4)
{
// Load normals for the 2 faces connected to the edge.
// These normals are the arc endpoints on the unit sphere.
__m128 Ax = _mm_loadu_ps(a.edge_normals_L_x+i);
__m128 Ay = _mm_loadu_ps(a.edge_normals_L_y+i);
__m128 Az = _mm_loadu_ps(a.edge_normals_L_z+i);
__m128 Bx = _mm_loadu_ps(a.edge_normals_R_x+i);
__m128 By = _mm_loadu_ps(a.edge_normals_R_y+i);
__m128 Bz = _mm_loadu_ps(a.edge_normals_R_z+i);
// Load the edge's direction which is normal to the plane through points A and B
__m128 B_x_Ax = _mm_loadu_ps(a.edge_tangents_x+i);
__m128 B_x_Ay = _mm_loadu_ps(a.edge_tangents_y+i);
__m128 B_x_Az = _mm_loadu_ps(a.edge_tangents_z+i);
// This will mask out lanes which are out of bounds.
__m128i out_of_bounds = _mm_xor_si128(_mm_cmplt_epi32(index_a, max_edges_a), _mm_set1_epi32(~0));
__m128i index_b = _mm_set1_epi32(0);
for (int j = 0; j < b.num_edges; j++)
{
// Load normals as in the outer loop, we need to negate these since we're testing a minkowski _difference_
__m128 Cx = _mm_xor_ps(_mm_set1_ps(-0.), _mm_broadcast_ss(b.edge_normals_L_x+j));
__m128 Cy = _mm_xor_ps(_mm_set1_ps(-0.), _mm_broadcast_ss(b.edge_normals_L_y+j));
__m128 Cz = _mm_xor_ps(_mm_set1_ps(-0.), _mm_broadcast_ss(b.edge_normals_L_z+j));
__m128 Dx = _mm_xor_ps(_mm_set1_ps(-0.), _mm_broadcast_ss(b.edge_normals_R_x+j));
__m128 Dy = _mm_xor_ps(_mm_set1_ps(-0.), _mm_broadcast_ss(b.edge_normals_R_y+j));
__m128 Dz = _mm_xor_ps(_mm_set1_ps(-0.), _mm_broadcast_ss(b.edge_normals_R_z+j));
// Same as outer loop.
__m128 D_x_Cx = _mm_broadcast_ss(b.edge_tangents_x+j);
__m128 D_x_Cy = _mm_broadcast_ss(b.edge_tangents_y+j);
__m128 D_x_Cz = _mm_broadcast_ss(b.edge_tangents_z+j);
// Two edges form a face on the MD if [C,B,A]*[D,B,A] < 0 && [A,D,C]*[B,D,C] < 0 && [C,B,A]*[B,D,C] > 0 (brackets denote triple product)
// - [C,B,A]*[D,B,A] < 0 Implies the points C and D lie on different sides of the plane A,B and the origin.
// - [A,D,C]*[B,D,C] < 0 Implies the points A and B lie on different sides of the plane through C,D and the origin.
// - [C,B,A]*[B,D,C] > 0 We need to test if the arcs AB and CD are in the same hemisphere. This is true if A and D lie on the same side of the plane through C,D and the origin.
// This can be tested with 0 < [A,C,B]*[D,C,B] == [C,B,A]*[B,D,C] since the triple product is invariant under cyclic permutations.
__m128 CBA = _mm_add_ps(_mm_add_ps(_mm_mul_ps(Cx,B_x_Ax), _mm_mul_ps(Cy,B_x_Ay)), _mm_mul_ps(Cz,B_x_Az));
__m128 DBA = _mm_add_ps(_mm_add_ps(_mm_mul_ps(Dx,B_x_Ax), _mm_mul_ps(Dy,B_x_Ay)), _mm_mul_ps(Dz,B_x_Az));
__m128 ADC = _mm_add_ps(_mm_add_ps(_mm_mul_ps(Ax,D_x_Cx), _mm_mul_ps(Ay,D_x_Cy)), _mm_mul_ps(Az,D_x_Cz));
__m128 BDC = _mm_add_ps(_mm_add_ps(_mm_mul_ps(Bx,D_x_Cx), _mm_mul_ps(By,D_x_Cy)), _mm_mul_ps(Bz,D_x_Cz));
__m128 CBA_DBA = _mm_mul_ps(CBA, DBA);
__m128 ADC_BDC = _mm_mul_ps(ADC, BDC);
__m128 CBA_BDC = _mm_mul_ps(CBA, BDC);
__m128 not_mf = _mm_or_ps(_mm_or_ps(_mm_cmpge_ps(CBA_DBA,_mm_set1_ps(0)), _mm_cmpge_ps(ADC_BDC,_mm_set1_ps(0))), _mm_cmple_ps(CBA_BDC,_mm_set1_ps(0)));
// We store only the pairs which form a minkowski face
__m128 prune_mask = _mm_or_ps(_mm_castsi128_ps(out_of_bounds), not_mf);
int move_mask = _mm_movemask_ps(prune_mask);
__m128i store_index_a = prune_epi32(index_a,move_mask);
__m128i store_index_b = prune_epi32(index_b,move_mask);
_mm_storeu_si128((__m128i*)(edges_a+num_pairs),store_index_a);
_mm_storeu_si128((__m128i*)(edges_b+num_pairs),store_index_b);
num_pairs += 4-__popcnt(move_mask);
index_b = _mm_add_epi32(index_b,_mm_set1_epi32(1));
}
index_a = _mm_add_epi32(index_a, _mm_set1_epi32(4));
}
// Naively Iterate over all potential pairs and find contact points/early out/etc.
}
```