-
Notifications
You must be signed in to change notification settings - Fork 2.2k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Apply integer math narrowing before VFPU sin/cos #14406
Changes from 6 commits
86585e9
e9076c9
ad9ad0f
cb8745b
8f41c78
ad876f0
07cb37c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -29,6 +29,11 @@ | |
#define V(i) (currentMIPS->v[voffset[i]]) | ||
#define VI(i) (currentMIPS->vi[voffset[i]]) | ||
|
||
union float2int { | ||
uint32_t i; | ||
float f; | ||
}; | ||
|
||
void GetVectorRegs(u8 regs[4], VectorSize N, int vectorReg) { | ||
int mtx = (vectorReg >> 2) & 7; | ||
int col = vectorReg & 3; | ||
|
@@ -610,10 +615,7 @@ bool GetVFPUCtrlMask(int reg, u32 *mask) { | |
|
||
float Float16ToFloat32(unsigned short l) | ||
{ | ||
union float2int { | ||
unsigned int i; | ||
float f; | ||
} float2int; | ||
float2int f2i; | ||
|
||
unsigned short float16 = l; | ||
unsigned int sign = (float16 >> VFPU_SH_FLOAT16_SIGN) & VFPU_MASK_FLOAT16_SIGN; | ||
|
@@ -623,10 +625,10 @@ float Float16ToFloat32(unsigned short l) | |
float f; | ||
if (exponent == VFPU_FLOAT16_EXP_MAX) | ||
{ | ||
float2int.i = sign << 31; | ||
float2int.i |= 255 << 23; | ||
float2int.i |= fraction; | ||
f = float2int.f; | ||
f2i.i = sign << 31; | ||
f2i.i |= 255 << 23; | ||
f2i.i |= fraction; | ||
f = f2i.f; | ||
} | ||
else if (exponent == 0 && fraction == 0) | ||
{ | ||
|
@@ -647,10 +649,10 @@ float Float16ToFloat32(unsigned short l) | |
} | ||
|
||
/* Convert to 32-bit single-precision IEEE754. */ | ||
float2int.i = sign << 31; | ||
float2int.i |= (exponent + 112) << 23; | ||
float2int.i |= fraction << 13; | ||
f=float2int.f; | ||
f2i.i = sign << 31; | ||
f2i.i |= (exponent + 112) << 23; | ||
f2i.i |= fraction << 13; | ||
f=f2i.f; | ||
} | ||
return f; | ||
} | ||
|
@@ -674,10 +676,6 @@ static int32_t get_sign(uint32_t x) { | |
|
||
float vfpu_dot(float a[4], float b[4]) { | ||
static const int EXTRA_BITS = 2; | ||
union float2int { | ||
uint32_t i; | ||
float f; | ||
}; | ||
float2int result; | ||
float2int src[2]; | ||
|
||
|
@@ -791,31 +789,27 @@ float vfpu_dot(float a[4], float b[4]) { | |
|
||
// TODO: This is still not completely accurate compared to the PSP's vsqrt. | ||
float vfpu_sqrt(float a) { | ||
union float2int { | ||
uint32_t u; | ||
float f; | ||
}; | ||
float2int val; | ||
val.f = a; | ||
|
||
if ((val.u & 0xff800000) == 0x7f800000) { | ||
if ((val.u & 0x007fffff) != 0) { | ||
val.u = 0x7f800001; | ||
if ((val.i & 0xff800000) == 0x7f800000) { | ||
if ((val.i & 0x007fffff) != 0) { | ||
val.i = 0x7f800001; | ||
} | ||
return val.f; | ||
} | ||
if ((val.u & 0x7f800000) == 0) { | ||
if ((val.i & 0x7f800000) == 0) { | ||
// Kill any sign. | ||
val.u = 0; | ||
val.i = 0; | ||
return val.f; | ||
} | ||
if (val.u & 0x80000000) { | ||
val.u = 0x7f800001; | ||
if (val.i & 0x80000000) { | ||
val.i = 0x7f800001; | ||
return val.f; | ||
} | ||
|
||
int k = get_exp(val.u); | ||
uint32_t sp = get_mant(val.u); | ||
int k = get_exp(val.i); | ||
uint32_t sp = get_mant(val.i); | ||
int less_bits = k & 1; | ||
k >>= 1; | ||
|
||
|
@@ -826,9 +820,9 @@ float vfpu_sqrt(float a) { | |
z = (z >> 1) + (uint32_t)(halfsp / z); | ||
} | ||
|
||
val.u = ((k + 127) << 23) | ((z << less_bits) & 0x007FFFFF); | ||
val.i = ((k + 127) << 23) | ((z << less_bits) & 0x007FFFFF); | ||
// The lower two bits never end up set on the PSP, it seems like. | ||
val.u &= 0xFFFFFFFC; | ||
val.i &= 0xFFFFFFFC; | ||
|
||
return val.f; | ||
} | ||
|
@@ -842,31 +836,27 @@ static inline uint32_t mant_mul(uint32_t a, uint32_t b) { | |
} | ||
|
||
float vfpu_rsqrt(float a) { | ||
union float2int { | ||
uint32_t u; | ||
float f; | ||
}; | ||
float2int val; | ||
val.f = a; | ||
|
||
if (val.u == 0x7f800000) { | ||
if (val.i == 0x7f800000) { | ||
return 0.0f; | ||
} | ||
if ((val.u & 0x7fffffff) > 0x7f800000) { | ||
val.u = (val.u & 0x80000000) | 0x7f800001; | ||
if ((val.i & 0x7fffffff) > 0x7f800000) { | ||
val.i = (val.i & 0x80000000) | 0x7f800001; | ||
return val.f; | ||
} | ||
if ((val.u & 0x7f800000) == 0) { | ||
val.u = (val.u & 0x80000000) | 0x7f800000; | ||
if ((val.i & 0x7f800000) == 0) { | ||
val.i = (val.i & 0x80000000) | 0x7f800000; | ||
return val.f; | ||
} | ||
if (val.u & 0x80000000) { | ||
val.u = 0xff800001; | ||
if (val.i & 0x80000000) { | ||
val.i = 0xff800001; | ||
return val.f; | ||
} | ||
|
||
int k = get_exp(val.u); | ||
uint32_t sp = get_mant(val.u); | ||
int k = get_exp(val.i); | ||
uint32_t sp = get_mant(val.i); | ||
int less_bits = k & 1; | ||
k = -(k >> 1); | ||
|
||
|
@@ -889,8 +879,8 @@ float vfpu_rsqrt(float a) { | |
|
||
z >>= less_bits; | ||
|
||
val.u = ((k + 127) << 23) | (z & 0x007FFFFF); | ||
val.u &= 0xFFFFFFFC; | ||
val.i = ((k + 127) << 23) | (z & 0x007FFFFF); | ||
val.i &= 0xFFFFFFFC; | ||
|
||
return val.f; | ||
} | ||
|
@@ -946,34 +936,182 @@ void vfpu_sincos_single(float angle, float &sine, float &cosine) { | |
} | ||
} | ||
|
||
float vfpu_sin_double(float angle) { | ||
return (float)sin((double)angle * M_PI_2); | ||
float vfpu_sin_mod2(float a) { | ||
float2int val; | ||
val.f = a; | ||
|
||
int32_t k = get_uexp(val.i); | ||
if (k == 255) { | ||
val.i = (val.i & 0xFF800001) | 1; | ||
return val.f; | ||
} | ||
|
||
if (k < 0x68) { | ||
val.i &= 0x80000000; | ||
return val.f; | ||
} | ||
|
||
// Okay, now modulus by 4 to begin with (identical wave every 4.) | ||
int32_t mantissa = get_mant(val.i); | ||
if (k > 0x80) { | ||
const uint8_t over = k & 0x1F; | ||
mantissa = (mantissa << over) & 0x00FFFFFF; | ||
k = 0x80; | ||
} | ||
// This subtracts off the 2. If we do, flip sign to inverse the wave. | ||
if (k == 0x80 && mantissa >= (1 << 23)) { | ||
val.i ^= 0x80000000; | ||
mantissa -= 1 << 23; | ||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Couldn't we just add a phase shift here for one of [sin, cos], and share the rest of the code between them? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I almost had it shared, and I'm still waffling on it. But the NAN handling is a bit different, and I was worried about the -0 cases making it messy. -[Unknown] There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We definitely can swap sin/cos and negate though (and have to for CORDIC); I was just trying to minimize perf impact here. -[Unknown] |
||
|
||
int8_t norm_shift = mantissa == 0 ? 32 : (int8_t)clz32_nonzero(mantissa) - 8; | ||
mantissa <<= norm_shift; | ||
k -= norm_shift; | ||
|
||
if (k <= 0 || mantissa == 0) { | ||
val.i &= 0x80000000; | ||
return val.f; | ||
} | ||
|
||
// This is the value with modulus applied. | ||
val.i = (val.i & 0x80000000) | (k << 23) | (mantissa & ~(1 << 23)); | ||
val.f = (float)sin((double)val.f * M_PI_2); | ||
val.i &= 0xFFFFFFFC; | ||
return val.f; | ||
} | ||
|
||
float vfpu_cos_double(float angle) { | ||
return (float)cos((double)angle * M_PI_2); | ||
float vfpu_cos_mod2(float a) { | ||
float2int val; | ||
val.f = a; | ||
bool negate = false; | ||
|
||
int32_t k = get_uexp(val.i); | ||
if (k == 255) { | ||
// Note: unlike sin, cos always returns +NAN. | ||
val.i = (val.i & 0x7F800001) | 1; | ||
return val.f; | ||
} | ||
|
||
if (k < 0x68) | ||
return 1.0f; | ||
|
||
// Okay, now modulus by 4 to begin with (identical wave every 4.) | ||
int32_t mantissa = get_mant(val.i); | ||
if (k > 0x80) { | ||
const uint8_t over = k & 0x1F; | ||
mantissa = (mantissa << over) & 0x00FFFFFF; | ||
k = 0x80; | ||
} | ||
// This subtracts off the 2. If we do, negate the result value. | ||
if (k == 0x80 && mantissa >= (1 << 23)) { | ||
mantissa -= 1 << 23; | ||
negate = true; | ||
} | ||
|
||
int8_t norm_shift = mantissa == 0 ? 32 : (int8_t)clz32_nonzero(mantissa) - 8; | ||
mantissa <<= norm_shift; | ||
k -= norm_shift; | ||
|
||
if (k <= 0 || mantissa == 0) | ||
return negate ? -1.0f : 1.0f; | ||
|
||
// This is the value with modulus applied. | ||
val.i = (val.i & 0x80000000) | (k << 23) | (mantissa & ~(1 << 23)); | ||
if (val.f == 1.0f || val.f == -1.0f) { | ||
return negate ? 0.0f : -0.0f; | ||
} | ||
val.f = (float)cos((double)val.f * M_PI_2); | ||
val.i &= 0xFFFFFFFC; | ||
return negate ? -val.f : val.f; | ||
} | ||
|
||
void vfpu_sincos_double(float angle_f, float &sine, float &cosine) { | ||
double angle = (double)angle_f * M_PI_2; | ||
void vfpu_sincos_mod2(float a, float &s, float &c) { | ||
float2int val; | ||
val.f = a; | ||
// For sin, negate the input, for cos negate the output. | ||
bool negate = false; | ||
|
||
int32_t k = get_uexp(val.i); | ||
if (k == 255) { | ||
val.i = (val.i & 0xFF800001) | 1; | ||
s = val.f; | ||
val.i &= 0x7F800001; | ||
c = val.f; | ||
return; | ||
} | ||
|
||
if (k < 0x68) { | ||
val.i &= 0x80000000; | ||
s = val.f; | ||
c = 1.0f; | ||
return; | ||
} | ||
|
||
// Okay, now modulus by 4 to begin with (identical wave every 4.) | ||
int32_t mantissa = get_mant(val.i); | ||
if (k > 0x80) { | ||
const uint8_t over = k & 0x1F; | ||
mantissa = (mantissa << over) & 0x00FFFFFF; | ||
k = 0x80; | ||
} | ||
// This subtracts off the 2. If we do, flip signs. | ||
if (k == 0x80 && mantissa >= (1 << 23)) { | ||
mantissa -= 1 << 23; | ||
negate = true; | ||
} | ||
|
||
int8_t norm_shift = mantissa == 0 ? 32 : (int8_t)clz32_nonzero(mantissa) - 8; | ||
mantissa <<= norm_shift; | ||
k -= norm_shift; | ||
|
||
if (k <= 0 || mantissa == 0) { | ||
val.i &= 0x80000000; | ||
if (negate) | ||
val.i ^= 0x80000000; | ||
s = val.f; | ||
c = 1.0f; | ||
return; | ||
} | ||
|
||
// This is the value with modulus applied. | ||
val.i = (val.i & 0x80000000) | (k << 23) | (mantissa & ~(1 << 23)); | ||
float2int i_sine, i_cosine; | ||
if (val.f == 1.0f) { | ||
i_sine.f = negate ? -1.0f : 1.0f; | ||
i_cosine.f = negate ? 0.0f : -0.0f; | ||
} else if (val.f == -1.0f) { | ||
i_sine.f = negate ? 1.0f : -1.0f; | ||
i_cosine.f = negate ? 0.0f : -0.0f; | ||
} else if (negate) { | ||
i_sine.f = (float)sin((double)-val.f * M_PI_2); | ||
i_cosine.f = -(float)cos((double)val.f * M_PI_2); | ||
} else { | ||
double angle = (double)val.f * M_PI_2; | ||
#if defined(__linux__) | ||
double d_sine; | ||
double d_cosine; | ||
sincos(angle, &d_sine, &d_cosine); | ||
sine = (float)d_sine; | ||
cosine = (float)d_cosine; | ||
double d_sine; | ||
double d_cosine; | ||
sincos(angle, &d_sine, &d_cosine); | ||
i_sine.f = (float)d_sine; | ||
i_cosine.f = (float)d_cosine; | ||
#else | ||
sine = (float)sin(angle); | ||
cosine = (float)cos(angle); | ||
i_sine.f = (float)sin(angle); | ||
i_cosine.f = (float)cos(angle); | ||
#endif | ||
} | ||
|
||
i_sine.i &= 0xFFFFFFFC; | ||
i_cosine.i &= 0xFFFFFFFC; | ||
s = i_sine.f; | ||
c = i_cosine.f; | ||
return ; | ||
} | ||
|
||
float (*vfpu_sin)(float); | ||
float (*vfpu_cos)(float); | ||
void (*vfpu_sincos)(float, float&, float&); | ||
|
||
void InitVFPUSinCos(bool useDoublePrecision) { | ||
vfpu_sin = useDoublePrecision ? vfpu_sin_double : vfpu_sin_single; | ||
vfpu_cos = useDoublePrecision ? vfpu_cos_double : vfpu_cos_single; | ||
vfpu_sincos = useDoublePrecision ? vfpu_sincos_double : vfpu_sincos_single; | ||
vfpu_sin = useDoublePrecision ? vfpu_sin_mod2 : vfpu_sin_single; | ||
vfpu_cos = useDoublePrecision ? vfpu_cos_mod2 : vfpu_cos_single; | ||
vfpu_sincos = useDoublePrecision ? vfpu_sincos_mod2 : vfpu_sincos_single; | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Wouldn't mod4 be a more accurate name?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The first step is a modulus by 4 (since the wave repeats every 4), but then it mods by 2 right below that (which may negate the result or negate the input). Ultimately, as in the other note, I may want to mod by 1 and swap sin/cos (which I was already doing in my CORDIC test code), but that part adds unnecessary extra instructions without much accuracy benefit at this point.
-[Unknown]