From bdea14cdf51919c655f5826d0289bd4e464cc5a5 Mon Sep 17 00:00:00 2001 From: Erin Catto Date: Sat, 25 Jan 2025 16:28:44 -0800 Subject: [PATCH] SIMD rolling resistance --- samples/car.cpp | 1 + samples/sample_benchmark.cpp | 2 + samples/sample_joints.cpp | 8 +- samples/sample_shapes.cpp | 6 +- src/constraint_graph.c | 2 +- src/contact_solver.c | 302 +++++++++++++---------------------- 6 files changed, 122 insertions(+), 199 deletions(-) diff --git a/samples/car.cpp b/samples/car.cpp index cb7109b07..d9e7f34e0 100644 --- a/samples/car.cpp +++ b/samples/car.cpp @@ -53,6 +53,7 @@ void Car::Spawn( b2WorldId worldId, b2Vec2 position, float scale, float hertz, f shapeDef.density = 2.0f / scale; shapeDef.friction = 1.5f; + shapeDef.rollingResistance = 0.1f; bodyDef.position = b2Add( { -1.0f * scale, 0.35f * scale }, position ); bodyDef.allowFastRotation = true; diff --git a/samples/sample_benchmark.cpp b/samples/sample_benchmark.cpp index 62dac89a4..c02dbf64a 100644 --- a/samples/sample_benchmark.cpp +++ b/samples/sample_benchmark.cpp @@ -221,6 +221,7 @@ class BenchmarkBarrel : public Sample { m_bodies[index] = b2CreateBody( m_worldId, &bodyDef ); circle.radius = RandomFloatRange( 0.25f, 0.75f ); + shapeDef.rollingResistance = 0.2f; b2CreateCircleShape( m_bodies[index], &shapeDef, &circle ); } else if ( m_shapeType == e_capsuleShape ) @@ -230,6 +231,7 @@ class BenchmarkBarrel : public Sample float length = RandomFloatRange( 0.25f, 1.0f ); capsule.center1 = { 0.0f, -0.5f * length }; capsule.center2 = { 0.0f, 0.5f * length }; + shapeDef.rollingResistance = 0.2f; b2CreateCapsuleShape( m_bodies[index], &shapeDef, &capsule ); } else if ( m_shapeType == e_mixShape ) diff --git a/samples/sample_joints.cpp b/samples/sample_joints.cpp index 43b88c72e..3dd06d951 100644 --- a/samples/sample_joints.cpp +++ b/samples/sample_joints.cpp @@ -2072,11 +2072,11 @@ class Driving : public Sample m_throttle = 0.0f; m_speed = 35.0f; - m_torque = 2.5f; + m_torque = 5.0f; m_hertz = 5.0f; m_dampingRatio = 0.7f; - m_car.Spawn( m_worldId, { 0.0f, 0.0f }, 1.0f, m_hertz, m_dampingRatio, m_torque, NULL ); + m_car.Spawn( m_worldId, { 0.0f, 0.0f }, 1.0f, m_hertz, m_dampingRatio, m_torque, nullptr ); } void UpdateUI() override @@ -2103,7 +2103,7 @@ class Driving : public Sample m_car.SetSpeed( m_throttle * m_speed ); } - if ( ImGui::SliderFloat( "Torque", &m_torque, 0.0f, 5.0f, "%.1f" ) ) + if ( ImGui::SliderFloat( "Torque", &m_torque, 0.0f, 10.0f, "%.1f" ) ) { m_car.SetTorque( m_torque ); } @@ -2491,7 +2491,7 @@ class ScissorLift : public Sample m_liftJointId = b2CreateDistanceJoint( m_worldId, &distanceDef ); Car car; - car.Spawn( m_worldId, { 0.0f, y + 2.0f }, 1.0f, 3.0f, 0.7f, 0.0f, NULL ); + car.Spawn( m_worldId, { 0.0f, y + 2.0f }, 1.0f, 3.0f, 0.7f, 0.0f, nullptr ); } void UpdateUI() override diff --git a/samples/sample_shapes.cpp b/samples/sample_shapes.cpp index aaa831a2b..26d8ff3af 100644 --- a/samples/sample_shapes.cpp +++ b/samples/sample_shapes.cpp @@ -967,6 +967,7 @@ class RollingResistance : public Sample } m_lift = 0.0f; + m_resistScale = 0.02f; CreateScene(); } @@ -990,7 +991,7 @@ class RollingResistance : public Sample bodyDef.linearVelocity = { 5.0f, 0.0f }; b2BodyId bodyId = b2CreateBody( m_worldId, &bodyDef ); - shapeDef.rollingResistance = 0.1f * i; + shapeDef.rollingResistance = m_resistScale * i; b2CreateCircleShape( bodyId, &shapeDef, &circle ); } } @@ -1029,7 +1030,7 @@ class RollingResistance : public Sample for ( int i = 0; i < 20; ++i ) { - g_draw.DrawString( { -41.5f, 2.0f * i + 1.0f }, "%.2f", 0.1f * i ); + g_draw.DrawString( { -41.5f, 2.0f * i + 1.0f }, "%.2f", m_resistScale * i ); } } @@ -1038,6 +1039,7 @@ class RollingResistance : public Sample return new RollingResistance( settings ); } + float m_resistScale; float m_lift; }; diff --git a/src/constraint_graph.c b/src/constraint_graph.c index fbd843f76..950ee6542 100644 --- a/src/constraint_graph.c +++ b/src/constraint_graph.c @@ -23,7 +23,7 @@ // cause horrible cache stalls. To make this feasible I would need a way to block these writes. // This is used for debugging by making all constraints be assigned to overflow. -#define B2_FORCE_OVERFLOW 1 +#define B2_FORCE_OVERFLOW 0 _Static_assert( B2_GRAPH_COLOR_COUNT == 12, "graph color count assumed to be 12" ); diff --git a/src/contact_solver.c b/src/contact_solver.c index 4731ddd64..f8b8cf6bf 100644 --- a/src/contact_solver.c +++ b/src/contact_solver.c @@ -203,8 +203,8 @@ void b2WarmStartOverflowContacts( b2StepContext* context ) vB = b2MulAdd( vB, mB, P ); } - wA -= constraint->rollingImpulse; - wB += constraint->rollingImpulse; + wA -= iA * constraint->rollingImpulse; + wB += iB * constraint->rollingImpulse; stateA->linearVelocity = vA; stateA->angularVelocity = wA; @@ -349,8 +349,6 @@ void b2SolveOverflowContacts( b2StepContext* context, bool useBias ) { float deltaLambda = -constraint->rollingMass * ( wB - wA ); float lambda = constraint->rollingImpulse; - constraint->rollingImpulse = lambda + deltaLambda; - float maxLambda = constraint->rollingResistance * totalNormalImpulse; constraint->rollingImpulse = b2ClampFloat( lambda + deltaLambda, -maxLambda, maxLambda ); deltaLambda = constraint->rollingImpulse - lambda; @@ -590,6 +588,13 @@ static inline b2FloatW b2MaxW( b2FloatW a, b2FloatW b ) return _mm256_max_ps( a, b ); } +// a = clamp(a, -b, b) +static inline b2FloatW b2ClampSymW( b2FloatW a, b2FloatW b ) +{ + b2FloatW nb = _mm256_sub_ps( _mm256_setzero_ps(), b ); + return _mm256_max_ps(nb, _mm256_min_ps( a, b )); +} + static inline b2FloatW b2OrW( b2FloatW a, b2FloatW b ) { return _mm256_or_ps( a, b ); @@ -664,6 +669,13 @@ static inline b2FloatW b2MaxW( b2FloatW a, b2FloatW b ) return vmaxq_f32( a, b ); } +// a = clamp(a, -b, b) +static inline b2FloatW b2ClampSymW( b2FloatW a, b2FloatW b ) +{ + b2FloatW nb = vnegq_f32( b ); + return vmaxq_f32( nb, vminq_f32( a, b ) ); +} + static inline b2FloatW b2OrW( b2FloatW a, b2FloatW b ) { return vreinterpretq_f32_u32( vorrq_u32( vreinterpretq_u32_f32( a ), vreinterpretq_u32_f32( b ) ) ); @@ -772,6 +784,18 @@ static inline b2FloatW b2MaxW( b2FloatW a, b2FloatW b ) return _mm_max_ps( a, b ); } +// a = clamp(a, -b, b) +static inline b2FloatW b2ClampSymW( b2FloatW a, b2FloatW b ) +{ + // Create a mask with the sign bit set for each element + __m128 mask = _mm_set1_ps( -0.0f ); + + // XOR the input with the mask to negate each element + __m128 nb = _mm_xor_ps( b, mask ); + + return _mm_max_ps( nb, _mm_min_ps( a, b ) ); +} + static inline b2FloatW b2OrW( b2FloatW a, b2FloatW b ) { return _mm_or_ps( a, b ); @@ -870,6 +894,17 @@ static inline b2FloatW b2MaxW( b2FloatW a, b2FloatW b ) return r; } +// a = clamp(a, -b, b) +static inline b2FloatW b2ClampSymW( b2FloatW a, b2FloatW b ) +{ + b2FloatW r; + r.x = b2ClampFloat(a.x, -b.x, b.x); + r.y = b2ClampFloat(a.y, -b.y, b.y); + r.z = b2ClampFloat(a.z, -b.z, b.z); + r.w = b2ClampFloat(a.w, -b.w, b.w); + return r; +} + static inline b2FloatW b2OrW( b2FloatW a, b2FloatW b ) { b2FloatW r; @@ -942,6 +977,9 @@ typedef struct b2ContactConstraintSIMD b2FloatW invIA, invIB; b2Vec2W normal; b2FloatW friction; + b2FloatW rollingResistance; + b2FloatW rollingMass; + b2FloatW rollingImpulse; b2FloatW biasRate; b2FloatW massScale; b2FloatW impulseScale; @@ -967,20 +1005,20 @@ int b2GetContactConstraintSIMDByteCount( void ) } // wide version of b2BodyState -typedef struct b2SimdBody +typedef struct b2BodyStateW { b2Vec2W v; b2FloatW w; b2FloatW flags; b2Vec2W dp; b2RotW dq; -} b2SimdBody; +} b2BodyStateW; // Custom gather/scatter for each SIMD type #if defined( B2_SIMD_AVX2 ) // This is a load and 8x8 transpose -static b2SimdBody b2GatherBodies( const b2BodyState* B2_RESTRICT states, int* B2_RESTRICT indices ) +static b2BodyStateW b2GatherBodies( const b2BodyState* B2_RESTRICT states, int* B2_RESTRICT indices ) { _Static_assert( sizeof( b2BodyState ) == 32, "b2BodyState not 32 bytes" ); B2_ASSERT( ( (uintptr_t)states & 0x1F ) == 0 ); @@ -1012,7 +1050,7 @@ static b2SimdBody b2GatherBodies( const b2BodyState* B2_RESTRICT states, int* B2 b2FloatW tt6 = _mm256_shuffle_ps( t5, t7, _MM_SHUFFLE( 1, 0, 1, 0 ) ); b2FloatW tt7 = _mm256_shuffle_ps( t5, t7, _MM_SHUFFLE( 3, 2, 3, 2 ) ); - b2SimdBody simdBody; + b2BodyStateW simdBody; simdBody.v.X = _mm256_permute2f128_ps( tt0, tt4, 0x20 ); simdBody.v.Y = _mm256_permute2f128_ps( tt1, tt5, 0x20 ); simdBody.w = _mm256_permute2f128_ps( tt2, tt6, 0x20 ); @@ -1025,7 +1063,7 @@ static b2SimdBody b2GatherBodies( const b2BodyState* B2_RESTRICT states, int* B2 } // This writes everything back to the solver bodies but only the velocities change -static void b2ScatterBodies( b2BodyState* B2_RESTRICT states, int* B2_RESTRICT indices, const b2SimdBody* B2_RESTRICT simdBody ) +static void b2ScatterBodies( b2BodyState* B2_RESTRICT states, int* B2_RESTRICT indices, const b2BodyStateW* B2_RESTRICT simdBody ) { _Static_assert( sizeof( b2BodyState ) == 32, "b2BodyState not 32 bytes" ); B2_ASSERT( ( (uintptr_t)states & 0x1F ) == 0 ); @@ -1069,7 +1107,7 @@ static void b2ScatterBodies( b2BodyState* B2_RESTRICT states, int* B2_RESTRICT i #elif defined( B2_SIMD_NEON ) // This is a load and transpose -static b2SimdBody b2GatherBodies( const b2BodyState* B2_RESTRICT states, int* B2_RESTRICT indices ) +static b2BodyStateW b2GatherBodies( const b2BodyState* B2_RESTRICT states, int* B2_RESTRICT indices ) { _Static_assert( sizeof( b2BodyState ) == 32, "b2BodyState not 32 bytes" ); B2_ASSERT( ( (uintptr_t)states & 0x1F ) == 0 ); @@ -1102,7 +1140,7 @@ static b2SimdBody b2GatherBodies( const b2BodyState* B2_RESTRICT states, int* B2 // [w2 w4 f2 f4] b2FloatW t4a = b2UnpackHiW( b2a, b4a ); - b2SimdBody simdBody; + b2BodyStateW simdBody; simdBody.v.X = b2UnpackLoW( t1a, t2a ); simdBody.v.Y = b2UnpackHiW( t1a, t2a ); simdBody.w = b2UnpackLoW( t3a, t4a ); @@ -1123,7 +1161,7 @@ static b2SimdBody b2GatherBodies( const b2BodyState* B2_RESTRICT states, int* B2 // This writes only the velocities back to the solver bodies // https://developer.arm.com/documentation/102107a/0100/Floating-point-4x4-matrix-transposition -static void b2ScatterBodies( b2BodyState* B2_RESTRICT states, int* B2_RESTRICT indices, const b2SimdBody* B2_RESTRICT simdBody ) +static void b2ScatterBodies( b2BodyState* B2_RESTRICT states, int* B2_RESTRICT indices, const b2BodyStateW* B2_RESTRICT simdBody ) { _Static_assert( sizeof( b2BodyState ) == 32, "b2BodyState not 32 bytes" ); B2_ASSERT( ( (uintptr_t)states & 0x1F ) == 0 ); @@ -1175,7 +1213,7 @@ static void b2ScatterBodies( b2BodyState* B2_RESTRICT states, int* B2_RESTRICT i #elif defined( B2_SIMD_SSE2 ) // This is a load and transpose -static b2SimdBody b2GatherBodies( const b2BodyState* B2_RESTRICT states, int* B2_RESTRICT indices ) +static b2BodyStateW b2GatherBodies( const b2BodyState* B2_RESTRICT states, int* B2_RESTRICT indices ) { _Static_assert( sizeof( b2BodyState ) == 32, "b2BodyState not 32 bytes" ); B2_ASSERT( ( (uintptr_t)states & 0x1F ) == 0 ); @@ -1207,7 +1245,7 @@ static b2SimdBody b2GatherBodies( const b2BodyState* B2_RESTRICT states, int* B2 // [w2 w4 f2 f4] b2FloatW t4a = b2UnpackHiW( b2a, b4a ); - b2SimdBody simdBody; + b2BodyStateW simdBody; simdBody.v.X = b2UnpackLoW( t1a, t2a ); simdBody.v.Y = b2UnpackHiW( t1a, t2a ); simdBody.w = b2UnpackLoW( t3a, t4a ); @@ -1227,7 +1265,7 @@ static b2SimdBody b2GatherBodies( const b2BodyState* B2_RESTRICT states, int* B2 } // This writes only the velocities back to the solver bodies -static void b2ScatterBodies( b2BodyState* B2_RESTRICT states, int* B2_RESTRICT indices, const b2SimdBody* B2_RESTRICT simdBody ) +static void b2ScatterBodies( b2BodyState* B2_RESTRICT states, int* B2_RESTRICT indices, const b2BodyStateW* B2_RESTRICT simdBody ) { _Static_assert( sizeof( b2BodyState ) == 32, "b2BodyState not 32 bytes" ); B2_ASSERT( ( (uintptr_t)states & 0x1F ) == 0 ); @@ -1271,7 +1309,7 @@ static void b2ScatterBodies( b2BodyState* B2_RESTRICT states, int* B2_RESTRICT i #else // This is a load and transpose -static b2SimdBody b2GatherBodies( const b2BodyState* B2_RESTRICT states, int* B2_RESTRICT indices ) +static b2BodyStateW b2GatherBodies( const b2BodyState* B2_RESTRICT states, int* B2_RESTRICT indices ) { b2BodyState identity = b2_identityBodyState; @@ -1280,7 +1318,7 @@ static b2SimdBody b2GatherBodies( const b2BodyState* B2_RESTRICT states, int* B2 b2BodyState s3 = indices[2] == B2_NULL_INDEX ? identity : states[indices[2]]; b2BodyState s4 = indices[3] == B2_NULL_INDEX ? identity : states[indices[3]]; - b2SimdBody simdBody; + b2BodyStateW simdBody; simdBody.v.X = ( b2FloatW ){ s1.linearVelocity.x, s2.linearVelocity.x, s3.linearVelocity.x, s4.linearVelocity.x }; simdBody.v.Y = ( b2FloatW ){ s1.linearVelocity.y, s2.linearVelocity.y, s3.linearVelocity.y, s4.linearVelocity.y }; simdBody.w = ( b2FloatW ){ s1.angularVelocity, s2.angularVelocity, s3.angularVelocity, s4.angularVelocity }; @@ -1294,7 +1332,7 @@ static b2SimdBody b2GatherBodies( const b2BodyState* B2_RESTRICT states, int* B2 } // This writes only the velocities back to the solver bodies -static void b2ScatterBodies( b2BodyState* B2_RESTRICT states, int* B2_RESTRICT indices, const b2SimdBody* B2_RESTRICT simdBody ) +static void b2ScatterBodies( b2BodyState* B2_RESTRICT states, int* B2_RESTRICT indices, const b2BodyStateW* B2_RESTRICT simdBody ) { if ( indices[0] != B2_NULL_INDEX ) { @@ -1402,6 +1440,11 @@ void b2PrepareContactsTask( int startIndex, int endIndex, b2StepContext* context ( (float*)&constraint->invIA )[j] = iA; ( (float*)&constraint->invIB )[j] = iB; + { + float k = iA + iB; + ( (float*)&constraint->rollingMass )[j] = k > 0.0f ? 1.0f / k : 0.0f; + } + b2Softness soft = ( indexA == B2_NULL_INDEX || indexB == B2_NULL_INDEX ) ? staticSoftness : contactSoftness; b2Vec2 normal = manifold->normal; @@ -1410,6 +1453,9 @@ void b2PrepareContactsTask( int startIndex, int endIndex, b2StepContext* context ( (float*)&constraint->friction )[j] = contactSim->friction; ( (float*)&constraint->restitution )[j] = contactSim->restitution; + ( (float*)&constraint->rollingResistance )[j] = contactSim->rollingResistance; + ( (float*)&constraint->rollingImpulse )[j] = warmStartScale * manifold->rollingImpulse; + ( (float*)&constraint->biasRate )[j] = soft.biasRate; ( (float*)&constraint->massScale )[j] = soft.massScale; ( (float*)&constraint->impulseScale )[j] = soft.impulseScale; @@ -1561,8 +1607,8 @@ void b2WarmStartContactsTask( int startIndex, int endIndex, b2StepContext* conte for ( int i = startIndex; i < endIndex; ++i ) { b2ContactConstraintSIMD* c = constraints + i; - b2SimdBody bA = b2GatherBodies( states, c->indexA ); - b2SimdBody bB = b2GatherBodies( states, c->indexB ); + b2BodyStateW bA = b2GatherBodies( states, c->indexA ); + b2BodyStateW bB = b2GatherBodies( states, c->indexB ); b2FloatW tangentX = c->normal.Y; b2FloatW tangentY = b2SubW( b2ZeroW(), c->normal.X ); @@ -1599,6 +1645,9 @@ void b2WarmStartContactsTask( int startIndex, int endIndex, b2StepContext* conte bB.v.Y = b2MulAddW( bB.v.Y, c->invMassB, P.Y ); } + bA.w = b2MulSubW(bA.w, c->invIA, c->rollingImpulse); + bB.w = b2MulAddW(bB.w, c->invIB, c->rollingImpulse); + b2ScatterBodies( states, c->indexA, &bA ); b2ScatterBodies( states, c->indexB, &bB ); } @@ -1619,8 +1668,8 @@ void b2SolveContactsTask( int startIndex, int endIndex, b2StepContext* context, { b2ContactConstraintSIMD* c = constraints + i; - b2SimdBody bA = b2GatherBodies( states, c->indexA ); - b2SimdBody bB = b2GatherBodies( states, c->indexB ); + b2BodyStateW bA = b2GatherBodies( states, c->indexA ); + b2BodyStateW bB = b2GatherBodies( states, c->indexB ); b2FloatW biasRate, massScale, impulseScale; if ( useBias ) @@ -1636,6 +1685,8 @@ void b2SolveContactsTask( int startIndex, int endIndex, b2StepContext* context, impulseScale = b2ZeroW(); } + b2FloatW totalNormalImpulse = b2ZeroW(); + b2Vec2W dp = { b2SubW( bB.dp.X, bA.dp.X ), b2SubW( bB.dp.Y, bA.dp.Y ) }; // point1 non-penetration constraint @@ -1675,6 +1726,8 @@ void b2SolveContactsTask( int startIndex, int endIndex, b2StepContext* context, c->normalImpulse1 = newImpulse; c->maxNormalImpulse1 = b2MaxW( c->maxNormalImpulse1, newImpulse ); + totalNormalImpulse = b2AddW( totalNormalImpulse, newImpulse ); + // Apply contact impulse b2FloatW Px = b2MulW( impulse, c->normal.X ); b2FloatW Py = b2MulW( impulse, c->normal.Y ); @@ -1722,6 +1775,8 @@ void b2SolveContactsTask( int startIndex, int endIndex, b2StepContext* context, c->normalImpulse2 = newImpulse; c->maxNormalImpulse2 = b2MaxW( c->maxNormalImpulse2, newImpulse ); + totalNormalImpulse = b2AddW( totalNormalImpulse, newImpulse ); + // Apply contact impulse b2FloatW Px = b2MulW( impulse, c->normal.X ); b2FloatW Py = b2MulW( impulse, c->normal.Y ); @@ -1806,6 +1861,18 @@ void b2SolveContactsTask( int startIndex, int endIndex, b2StepContext* context, bB.w = b2MulAddW( bB.w, c->invIB, b2SubW( b2MulW( rB.X, Py ), b2MulW( rB.Y, Px ) ) ); } + // Rolling resistance + { + b2FloatW deltaLambda = b2MulW( c->rollingMass, b2SubW( bA.w, bB.w )); + b2FloatW lambda = c->rollingImpulse; + b2FloatW maxLambda = b2MulW( c->rollingResistance, totalNormalImpulse ); + c->rollingImpulse = b2ClampSymW( b2AddW(lambda, deltaLambda), maxLambda ); + deltaLambda = b2SubW(c->rollingImpulse, lambda); + + bA.w = b2MulSubW( bA.w, c->invIA, deltaLambda ); + bB.w = b2MulAddW( bB.w, c->invIB, deltaLambda ); + } + b2ScatterBodies( states, c->indexA, &bA ); b2ScatterBodies( states, c->indexB, &bB ); } @@ -1826,8 +1893,8 @@ void b2ApplyRestitutionTask( int startIndex, int endIndex, b2StepContext* contex { b2ContactConstraintSIMD* c = constraints + i; - b2SimdBody bA = b2GatherBodies( states, c->indexA ); - b2SimdBody bB = b2GatherBodies( states, c->indexB ); + b2BodyStateW bA = b2GatherBodies( states, c->indexA ); + b2BodyStateW bB = b2GatherBodies( states, c->indexB ); // first point non-penetration constraint { @@ -1912,9 +1979,6 @@ void b2ApplyRestitutionTask( int startIndex, int endIndex, b2StepContext* contex b2TracyCZoneEnd( restitution ); } -#if B2_SIMD_WIDTH == 8 - -// todo try making an inner loop on B2_SIMD_WIDTH to have a single implementation of this function void b2StoreImpulsesTask( int startIndex, int endIndex, b2StepContext* context ) { b2TracyCZoneNC( store_impulses, "Store", b2_colorFireBrick, true ); @@ -1924,9 +1988,10 @@ void b2StoreImpulsesTask( int startIndex, int endIndex, b2StepContext* context ) b2Manifold dummy = { 0 }; - for ( int i = startIndex; i < endIndex; ++i ) + for ( int constraintIndex = startIndex; constraintIndex < endIndex; ++constraintIndex ) { - const b2ContactConstraintSIMD* c = constraints + i; + const b2ContactConstraintSIMD* c = constraints + constraintIndex; + const float* rollingImpulse = (float*)&c->rollingImpulse; const float* normalImpulse1 = (float*)&c->normalImpulse1; const float* normalImpulse2 = (float*)&c->normalImpulse2; const float* tangentImpulse1 = (float*)&c->tangentImpulse1; @@ -1936,171 +2001,24 @@ void b2StoreImpulsesTask( int startIndex, int endIndex, b2StepContext* context ) const float* normalVelocity1 = (float*)&c->relativeVelocity1; const float* normalVelocity2 = (float*)&c->relativeVelocity2; - int base = 8 * i; - b2Manifold* m0 = contacts[base + 0] == NULL ? &dummy : &contacts[base + 0]->manifold; - b2Manifold* m1 = contacts[base + 1] == NULL ? &dummy : &contacts[base + 1]->manifold; - b2Manifold* m2 = contacts[base + 2] == NULL ? &dummy : &contacts[base + 2]->manifold; - b2Manifold* m3 = contacts[base + 3] == NULL ? &dummy : &contacts[base + 3]->manifold; - b2Manifold* m4 = contacts[base + 4] == NULL ? &dummy : &contacts[base + 4]->manifold; - b2Manifold* m5 = contacts[base + 5] == NULL ? &dummy : &contacts[base + 5]->manifold; - b2Manifold* m6 = contacts[base + 6] == NULL ? &dummy : &contacts[base + 6]->manifold; - b2Manifold* m7 = contacts[base + 7] == NULL ? &dummy : &contacts[base + 7]->manifold; - - m0->points[0].normalImpulse = normalImpulse1[0]; - m0->points[0].tangentImpulse = tangentImpulse1[0]; - m0->points[0].maxNormalImpulse = maxNormalImpulse1[0]; - m0->points[0].normalVelocity = normalVelocity1[0]; - - m0->points[1].normalImpulse = normalImpulse2[0]; - m0->points[1].tangentImpulse = tangentImpulse2[0]; - m0->points[1].maxNormalImpulse = maxNormalImpulse2[0]; - m0->points[1].normalVelocity = normalVelocity2[0]; - - m1->points[0].normalImpulse = normalImpulse1[1]; - m1->points[0].tangentImpulse = tangentImpulse1[1]; - m1->points[0].maxNormalImpulse = maxNormalImpulse1[1]; - m1->points[0].normalVelocity = normalVelocity1[1]; - - m1->points[1].normalImpulse = normalImpulse2[1]; - m1->points[1].tangentImpulse = tangentImpulse2[1]; - m1->points[1].maxNormalImpulse = maxNormalImpulse2[1]; - m1->points[1].normalVelocity = normalVelocity2[1]; - - m2->points[0].normalImpulse = normalImpulse1[2]; - m2->points[0].tangentImpulse = tangentImpulse1[2]; - m2->points[0].maxNormalImpulse = maxNormalImpulse1[2]; - m2->points[0].normalVelocity = normalVelocity1[2]; - - m2->points[1].normalImpulse = normalImpulse2[2]; - m2->points[1].tangentImpulse = tangentImpulse2[2]; - m2->points[1].maxNormalImpulse = maxNormalImpulse2[2]; - m2->points[1].normalVelocity = normalVelocity2[2]; - - m3->points[0].normalImpulse = normalImpulse1[3]; - m3->points[0].tangentImpulse = tangentImpulse1[3]; - m3->points[0].maxNormalImpulse = maxNormalImpulse1[3]; - m3->points[0].normalVelocity = normalVelocity1[3]; - - m3->points[1].normalImpulse = normalImpulse2[3]; - m3->points[1].tangentImpulse = tangentImpulse2[3]; - m3->points[1].maxNormalImpulse = maxNormalImpulse2[3]; - m3->points[1].normalVelocity = normalVelocity2[3]; - - m4->points[0].normalImpulse = normalImpulse1[4]; - m4->points[0].tangentImpulse = tangentImpulse1[4]; - m4->points[0].maxNormalImpulse = maxNormalImpulse1[4]; - m4->points[0].normalVelocity = normalVelocity1[4]; - - m4->points[1].normalImpulse = normalImpulse2[4]; - m4->points[1].tangentImpulse = tangentImpulse2[4]; - m4->points[1].maxNormalImpulse = maxNormalImpulse2[4]; - m4->points[1].normalVelocity = normalVelocity2[4]; - - m5->points[0].normalImpulse = normalImpulse1[5]; - m5->points[0].tangentImpulse = tangentImpulse1[5]; - m5->points[0].maxNormalImpulse = maxNormalImpulse1[5]; - m5->points[0].normalVelocity = normalVelocity1[5]; - - m5->points[1].normalImpulse = normalImpulse2[5]; - m5->points[1].tangentImpulse = tangentImpulse2[5]; - m5->points[1].maxNormalImpulse = maxNormalImpulse2[5]; - m5->points[1].normalVelocity = normalVelocity2[5]; - - m6->points[0].normalImpulse = normalImpulse1[6]; - m6->points[0].tangentImpulse = tangentImpulse1[6]; - m6->points[0].maxNormalImpulse = maxNormalImpulse1[6]; - m6->points[0].normalVelocity = normalVelocity1[6]; - - m6->points[1].normalImpulse = normalImpulse2[6]; - m6->points[1].tangentImpulse = tangentImpulse2[6]; - m6->points[1].maxNormalImpulse = maxNormalImpulse2[6]; - m6->points[1].normalVelocity = normalVelocity2[6]; - - m7->points[0].normalImpulse = normalImpulse1[7]; - m7->points[0].tangentImpulse = tangentImpulse1[7]; - m7->points[0].maxNormalImpulse = maxNormalImpulse1[7]; - m7->points[0].normalVelocity = normalVelocity1[7]; - - m7->points[1].normalImpulse = normalImpulse2[7]; - m7->points[1].tangentImpulse = tangentImpulse2[7]; - m7->points[1].maxNormalImpulse = maxNormalImpulse2[7]; - m7->points[1].normalVelocity = normalVelocity2[7]; - } - - b2TracyCZoneEnd( store_impulses ); -} - -#else - -void b2StoreImpulsesTask( int startIndex, int endIndex, b2StepContext* context ) -{ - b2TracyCZoneNC( store_impulses, "Store", b2_colorFirebrick, true ); - - b2ContactSim** contacts = context->contacts; - const b2ContactConstraintSIMD* constraints = context->simdContactConstraints; - - b2Manifold dummy = { 0 }; - - for ( int i = startIndex; i < endIndex; ++i ) - { - const b2ContactConstraintSIMD* c = constraints + i; - const float* normalImpulse1 = (float*)&c->normalImpulse1; - const float* normalImpulse2 = (float*)&c->normalImpulse2; - const float* tangentImpulse1 = (float*)&c->tangentImpulse1; - const float* tangentImpulse2 = (float*)&c->tangentImpulse2; - const float* maxNormalImpulse1 = (float*)&c->maxNormalImpulse1; - const float* maxNormalImpulse2 = (float*)&c->maxNormalImpulse2; - const float* normalVelocity1 = (float*)&c->relativeVelocity1; - const float* normalVelocity2 = (float*)&c->relativeVelocity2; + int baseIndex = B2_SIMD_WIDTH * constraintIndex; - int base = 4 * i; - b2Manifold* m0 = contacts[base + 0] == NULL ? &dummy : &contacts[base + 0]->manifold; - b2Manifold* m1 = contacts[base + 1] == NULL ? &dummy : &contacts[base + 1]->manifold; - b2Manifold* m2 = contacts[base + 2] == NULL ? &dummy : &contacts[base + 2]->manifold; - b2Manifold* m3 = contacts[base + 3] == NULL ? &dummy : &contacts[base + 3]->manifold; - - m0->points[0].normalImpulse = normalImpulse1[0]; - m0->points[0].tangentImpulse = tangentImpulse1[0]; - m0->points[0].maxNormalImpulse = maxNormalImpulse1[0]; - m0->points[0].normalVelocity = normalVelocity1[0]; - - m0->points[1].normalImpulse = normalImpulse2[0]; - m0->points[1].tangentImpulse = tangentImpulse2[0]; - m0->points[1].maxNormalImpulse = maxNormalImpulse2[0]; - m0->points[1].normalVelocity = normalVelocity2[0]; - - m1->points[0].normalImpulse = normalImpulse1[1]; - m1->points[0].tangentImpulse = tangentImpulse1[1]; - m1->points[0].maxNormalImpulse = maxNormalImpulse1[1]; - m1->points[0].normalVelocity = normalVelocity1[1]; - - m1->points[1].normalImpulse = normalImpulse2[1]; - m1->points[1].tangentImpulse = tangentImpulse2[1]; - m1->points[1].maxNormalImpulse = maxNormalImpulse2[1]; - m1->points[1].normalVelocity = normalVelocity2[1]; - - m2->points[0].normalImpulse = normalImpulse1[2]; - m2->points[0].tangentImpulse = tangentImpulse1[2]; - m2->points[0].maxNormalImpulse = maxNormalImpulse1[2]; - m2->points[0].normalVelocity = normalVelocity1[2]; - - m2->points[1].normalImpulse = normalImpulse2[2]; - m2->points[1].tangentImpulse = tangentImpulse2[2]; - m2->points[1].maxNormalImpulse = maxNormalImpulse2[2]; - m2->points[1].normalVelocity = normalVelocity2[2]; - - m3->points[0].normalImpulse = normalImpulse1[3]; - m3->points[0].tangentImpulse = tangentImpulse1[3]; - m3->points[0].maxNormalImpulse = maxNormalImpulse1[3]; - m3->points[0].normalVelocity = normalVelocity1[3]; - - m3->points[1].normalImpulse = normalImpulse2[3]; - m3->points[1].tangentImpulse = tangentImpulse2[3]; - m3->points[1].maxNormalImpulse = maxNormalImpulse2[3]; - m3->points[1].normalVelocity = normalVelocity2[3]; + for ( int laneIndex = 0; laneIndex < B2_SIMD_WIDTH; ++laneIndex ) + { + b2Manifold* m = contacts[baseIndex + laneIndex] == NULL ? &dummy : &contacts[baseIndex + laneIndex]->manifold; + m->rollingImpulse = rollingImpulse[laneIndex]; + + m->points[0].normalImpulse = normalImpulse1[laneIndex]; + m->points[0].tangentImpulse = tangentImpulse1[laneIndex]; + m->points[0].maxNormalImpulse = maxNormalImpulse1[laneIndex]; + m->points[0].normalVelocity = normalVelocity1[laneIndex]; + + m->points[1].normalImpulse = normalImpulse2[laneIndex]; + m->points[1].tangentImpulse = tangentImpulse2[laneIndex]; + m->points[1].maxNormalImpulse = maxNormalImpulse2[laneIndex]; + m->points[1].normalVelocity = normalVelocity2[laneIndex]; + } } b2TracyCZoneEnd( store_impulses ); } - -#endif