[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Qemu-devel] [RFC] target-ppc/fpu_helper.c: Use C99 code to speed up flo
From: |
Programmingkid |
Subject: |
[Qemu-devel] [RFC] target-ppc/fpu_helper.c: Use C99 code to speed up floating point unit |
Date: |
Sat, 3 Dec 2016 00:59:52 -0500 |
The floating point code used in fpu_helper.c can be sped up by using the IEEE
754 support added to the C99 standard. To test this code out simply set and
unset the I_NEED_SPEED macro. The program to test out each version of the
helper_fmadd() function is below the patch. It needs to be ran in the guest.
The emulator to use is qemu-system-ppc. I used a Mac OS X guest, but the test
program would compile on a Linux guest.
This patch does make the fused multiply-add instruction fmadd work faster and
still give a correct result.
This documentation might be of help to those who want to learn more about C99's
IEEE 754 support:
http://grouper.ieee.org/groups/754/meeting-materials/2001-07-18-c99.pdf
---
target-ppc/fpu_helper.c | 105 ++++++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 102 insertions(+), 3 deletions(-)
diff --git a/target-ppc/fpu_helper.c b/target-ppc/fpu_helper.c
index b0760f0..05eb26e 100644
--- a/target-ppc/fpu_helper.c
+++ b/target-ppc/fpu_helper.c
@@ -20,10 +20,22 @@
#include "cpu.h"
#include "exec/helper-proto.h"
#include "exec/exec-all.h"
+#include <inttypes.h>
+#include <math.h>
+#include <fenv.h>
#define float64_snan_to_qnan(x) ((x) | 0x0008000000000000ULL)
#define float32_snan_to_qnan(x) ((x) | 0x00400000)
+#define DEBUG_FPU 0
+
+#define DPRINTF(fmt, ...) do { \
+ if (DEBUG_FPU) { \
+ printf("FPU: " fmt , ## __VA_ARGS__); \
+ } \
+} while (0);
+
+
/*****************************************************************************/
/* Floating point operations helpers */
uint64_t helper_float32_to_float64(CPUPPCState *env, uint32_t arg)
@@ -281,29 +293,36 @@ static inline void float_inexact_excp(CPUPPCState *env)
static inline void fpscr_set_rounding_mode(CPUPPCState *env)
{
- int rnd_type;
+ int rnd_type, result = 0;
/* Set rounding mode */
switch (fpscr_rn) {
case 0:
/* Best approximation (round to nearest) */
rnd_type = float_round_nearest_even;
+ result = fesetround(FE_TONEAREST);
break;
case 1:
/* Smaller magnitude (round toward zero) */
rnd_type = float_round_to_zero;
+ result = fesetround(FE_TOWARDZERO);
break;
case 2:
/* Round toward +infinite */
rnd_type = float_round_up;
+ result = fesetround(FE_UPWARD);
break;
default:
case 3:
/* Round toward -infinite */
rnd_type = float_round_down;
+ result = fesetround(FE_DOWNWARD);
break;
}
set_float_rounding_mode(rnd_type, &env->fp_status);
+ if (result != 0) {
+ printf("Error: rounding mode was not set\n");
+ }
}
void helper_fpscr_clrbit(CPUPPCState *env, uint32_t bit)
@@ -534,6 +553,7 @@ void helper_float_check_status(CPUPPCState *env)
void helper_reset_fpstatus(CPUPPCState *env)
{
set_float_exception_flags(0, &env->fp_status);
+ feclearexcept(FE_ALL_EXCEPT);
}
/* fadd - fadd. */
@@ -737,16 +757,94 @@ uint64_t helper_frim(CPUPPCState *env, uint64_t arg)
return do_fri(env, arg, float_round_down);
}
-/* fmadd - fmadd. */
+#define I_NEED_SPEED 1
+#ifdef I_NEED_SPEED
+
+union Converter {
+ uint64_t i;
+ double d;
+};
+
+typedef union Converter Converter;
+
+/* fmadd - fmadd. - fast */
uint64_t helper_fmadd(CPUPPCState *env, uint64_t arg1, uint64_t arg2,
uint64_t arg3)
{
+ DPRINTF("Fast helper_fmadd() called\n");
+ Converter farg1, farg2, farg3, result;
+
+ farg1.i = arg1;
+ farg2.i = arg2;
+ farg3.i = arg3;
+
+ DPRINTF("farg1.d = %f\n", farg1.d);
+ DPRINTF("farg2.d = %f\n", farg2.d);
+ DPRINTF("farg3.d = %f\n", farg3.d);
+
+ /* if signalling NaN operation */
+ if (unlikely(float64_is_signaling_nan(farg1.d, &env->fp_status) ||
+ float64_is_signaling_nan(farg2.d, &env->fp_status) ||
+ float64_is_signaling_nan(farg3.d, &env->fp_status))) {
+ float_invalid_op_excp(env, POWERPC_EXCP_FP_VXSNAN, 1);
+ }
+
+ result.d = fma(farg1.d, farg2.d, farg3.d); /* fused multiply-add function
*/
+ if (fetestexcept(FE_INEXACT)) {
+ DPRINTF("FE_INEXACT\n");
+ float_inexact_excp(env);
+ }
+ if (fetestexcept(FE_INVALID)) {
+ DPRINTF("FE_INVALID\n");
+
+ /* 0 * infinity */
+ if ((fpclassify(farg1.d) == FP_ZERO) && isinf(farg2.d)) {
+ result.i = float_invalid_op_excp(env, POWERPC_EXCP_FP_VXIMZ, 1);
+ }
+
+ /* infinity * 0 */
+ else if (isinf(farg1.d) && (fpclassify(farg2.d) == FP_ZERO)) {
+ result.i = float_invalid_op_excp(env, POWERPC_EXCP_FP_VXIMZ, 1);
+ }
+
+ /* infinity - infinity */
+ else if (isinf(farg1.d * farg2.d) && isinf(farg3.d) &&
(signbit(farg3.d) != 0)) {
+ result.i = float_invalid_op_excp(env, POWERPC_EXCP_FP_VXISI, 1);
+ }
+ }
+ if (fetestexcept(FE_OVERFLOW)) {
+ DPRINTF("FE_OVERFLOW\n");
+ float_overflow_excp(env);
+ }
+ if (fetestexcept(FE_UNDERFLOW)) {
+ DPRINTF("FE_UNDERFLOW\n");
+ float_underflow_excp(env);
+ }
+
+ DPRINTF("result.d = %f\n", result.d);
+ DPRINTF("result.i = 0x%" PRIx64 "\n", result.i);
+
+ return result.i;
+}
+
+#else
+
+/* fmadd - fmadd. - original */
+uint64_t helper_fmadd(CPUPPCState *env, uint64_t arg1, uint64_t arg2,
+ uint64_t arg3)
+{
+ DPRINTF("old helper_fmadd() called\n");
+
CPU_DoubleU farg1, farg2, farg3;
farg1.ll = arg1;
farg2.ll = arg2;
farg3.ll = arg3;
+ DPRINTF("farg1.d = %f\n", farg1.d);
+ DPRINTF("farg2.d = %f\n", farg2.d);
+ DPRINTF("farg3.d = %f\n", farg3.d);
+
if (unlikely((float64_is_infinity(farg1.d) && float64_is_zero(farg2.d)) ||
(float64_is_zero(farg1.d) && float64_is_infinity(farg2.d)))) {
/* Multiplication of zero by infinity */
@@ -775,9 +873,10 @@ uint64_t helper_fmadd(CPUPPCState *env, uint64_t arg1,
uint64_t arg2,
farg1.d = float128_to_float64(ft0_128, &env->fp_status);
}
}
-
+ DPRINTF("farg1.ll = 0x%" PRIx64 "\n", farg1.ll);
return farg1.ll;
}
+#endif
/* fmsub - fmsub. */
uint64_t helper_fmsub(CPUPPCState *env, uint64_t arg1, uint64_t arg2,
--
2.7.2
/********************************************/
/************** Test program ****************/
/********************************************/
#include <stdio.h>
#include <time.h>
#include <math.h>
/*
fmadds basically does this frD = frA * frC + frB
*/
int main (int argc, const char * argv[]) {
const int iteration_count = pow(10,6); // You might have to change the
second argument
double iter, frD, frA, frB, frC, total, answer;
clock_t start_time, end_time;
frA = 1.0014;
frB = 0.00001;
frC = 0.000015;
total = 0.0;
answer = 88351739272997344.0;
start_time = clock();
for(iter = 0; iter < iteration_count; iter++)
{
frA *= 1.00001;
frB *= 1.00002;
frC *= 1.00003;
asm volatile("fmadds %0, %1, %2, %3" : "=f" (frD) : "f" (frA), "f"
(frC), "f" (frB));
total = total + frD;
}
end_time = clock();
printf("Total = %f\n", total);
if (total == answer) {
printf("Value is correct\n");
} else {
printf("Value is incorrect. Answer should be %f\n", answer);
}
printf("Time elapsed: %0.2f seconds\n", (float)(end_time - start_time) /
CLOCKS_PER_SEC);
return 0;
}