[Qemu-devel] [RFC] target-ppc/fpu_helper.c: Use C99 code to speed up flo

qemu-devel

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Qemu-devel] [RFC] target-ppc/fpu_helper.c: Use C99 code to speed up flo

From:	Programmingkid
Subject:	[Qemu-devel] [RFC] target-ppc/fpu_helper.c: Use C99 code to speed up floating point unit
Date:	Sat, 3 Dec 2016 00:59:52 -0500

The floating point code used in fpu_helper.c can be sped up by using the IEEE 
754 support added to the C99 standard. To test this code out simply set and 
unset the I_NEED_SPEED macro. The program to test out each version of the 
helper_fmadd() function is below the patch. It needs to be ran in the guest. 
The emulator to use is qemu-system-ppc. I used a Mac OS X guest, but the test 
program would compile on a Linux guest.

This patch does make the fused multiply-add instruction fmadd work faster and 
still give a correct result.

This documentation might be of help to those who want to learn more about C99's 
IEEE 754 support:
http://grouper.ieee.org/groups/754/meeting-materials/2001-07-18-c99.pdf

---
 target-ppc/fpu_helper.c | 105 ++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 102 insertions(+), 3 deletions(-)

diff --git a/target-ppc/fpu_helper.c b/target-ppc/fpu_helper.c
index b0760f0..05eb26e 100644
--- a/target-ppc/fpu_helper.c
+++ b/target-ppc/fpu_helper.c
@@ -20,10 +20,22 @@
 #include "cpu.h"
 #include "exec/helper-proto.h"
 #include "exec/exec-all.h"
+#include <inttypes.h>
+#include <math.h>
+#include <fenv.h>
 
 #define float64_snan_to_qnan(x) ((x) | 0x0008000000000000ULL)
 #define float32_snan_to_qnan(x) ((x) | 0x00400000)
 
+#define DEBUG_FPU 0
+
+#define DPRINTF(fmt, ...) do {                       \
+    if (DEBUG_FPU) {                                 \
+        printf("FPU: " fmt , ## __VA_ARGS__);        \
+    }                                                \
+} while (0);
+
+
 /*****************************************************************************/
 /* Floating point operations helpers */
 uint64_t helper_float32_to_float64(CPUPPCState *env, uint32_t arg)
@@ -281,29 +293,36 @@ static inline void float_inexact_excp(CPUPPCState *env)
 
 static inline void fpscr_set_rounding_mode(CPUPPCState *env)
 {
-    int rnd_type;
+    int rnd_type, result = 0;
 
     /* Set rounding mode */
     switch (fpscr_rn) {
     case 0:
         /* Best approximation (round to nearest) */
         rnd_type = float_round_nearest_even;
+        result = fesetround(FE_TONEAREST);
         break;
     case 1:
         /* Smaller magnitude (round toward zero) */
         rnd_type = float_round_to_zero;
+        result = fesetround(FE_TOWARDZERO);
         break;
     case 2:
         /* Round toward +infinite */
         rnd_type = float_round_up;
+        result = fesetround(FE_UPWARD);
         break;
     default:
     case 3:
         /* Round toward -infinite */
         rnd_type = float_round_down;
+        result = fesetround(FE_DOWNWARD);
         break;
     }
     set_float_rounding_mode(rnd_type, &env->fp_status);
+    if (result != 0) {
+        printf("Error: rounding mode was not set\n");
+    }
 }
 
 void helper_fpscr_clrbit(CPUPPCState *env, uint32_t bit)
@@ -534,6 +553,7 @@ void helper_float_check_status(CPUPPCState *env)
 void helper_reset_fpstatus(CPUPPCState *env)
 {
     set_float_exception_flags(0, &env->fp_status);
+    feclearexcept(FE_ALL_EXCEPT);
 }
 
 /* fadd - fadd. */
@@ -737,16 +757,94 @@ uint64_t helper_frim(CPUPPCState *env, uint64_t arg)
     return do_fri(env, arg, float_round_down);
 }
 
-/* fmadd - fmadd. */
+#define I_NEED_SPEED 1
+#ifdef I_NEED_SPEED
+
+union Converter {
+    uint64_t i;
+    double d;
+};
+
+typedef union Converter Converter;
+
+/* fmadd - fmadd. - fast */
 uint64_t helper_fmadd(CPUPPCState *env, uint64_t arg1, uint64_t arg2,
                       uint64_t arg3)
 {
+    DPRINTF("Fast helper_fmadd() called\n");
+    Converter farg1, farg2, farg3, result;
+
+    farg1.i = arg1;
+    farg2.i = arg2;
+    farg3.i = arg3;
+
+    DPRINTF("farg1.d = %f\n", farg1.d);
+    DPRINTF("farg2.d = %f\n", farg2.d);
+    DPRINTF("farg3.d = %f\n", farg3.d);
+
+    /* if signalling NaN operation */
+    if (unlikely(float64_is_signaling_nan(farg1.d, &env->fp_status) ||
+                 float64_is_signaling_nan(farg2.d, &env->fp_status) ||
+                 float64_is_signaling_nan(farg3.d, &env->fp_status))) {
+        float_invalid_op_excp(env, POWERPC_EXCP_FP_VXSNAN, 1);
+    }
+
+    result.d = fma(farg1.d, farg2.d, farg3.d); /* fused multiply-add function 
*/
+    if (fetestexcept(FE_INEXACT)) {
+        DPRINTF("FE_INEXACT\n");
+        float_inexact_excp(env);
+    }
+    if (fetestexcept(FE_INVALID)) {
+        DPRINTF("FE_INVALID\n");
+
+        /* 0 * infinity */
+        if ((fpclassify(farg1.d) == FP_ZERO) && isinf(farg2.d)) {
+            result.i = float_invalid_op_excp(env, POWERPC_EXCP_FP_VXIMZ, 1);
+        }
+
+        /* infinity * 0 */
+        else if (isinf(farg1.d) && (fpclassify(farg2.d) == FP_ZERO)) {
+            result.i = float_invalid_op_excp(env, POWERPC_EXCP_FP_VXIMZ, 1);
+        }
+
+        /* infinity - infinity */
+        else if (isinf(farg1.d * farg2.d) && isinf(farg3.d) && 
(signbit(farg3.d) != 0)) {
+            result.i = float_invalid_op_excp(env, POWERPC_EXCP_FP_VXISI, 1);
+        }
+    }
+    if (fetestexcept(FE_OVERFLOW)) {
+        DPRINTF("FE_OVERFLOW\n");
+        float_overflow_excp(env);
+    }
+    if (fetestexcept(FE_UNDERFLOW)) {
+        DPRINTF("FE_UNDERFLOW\n");
+        float_underflow_excp(env);
+    }
+
+    DPRINTF("result.d = %f\n", result.d);
+    DPRINTF("result.i = 0x%" PRIx64 "\n", result.i);
+
+    return result.i;
+}
+
+#else
+
+/* fmadd - fmadd. - original */
+uint64_t helper_fmadd(CPUPPCState *env, uint64_t arg1, uint64_t arg2,
+                      uint64_t arg3)
+{
+    DPRINTF("old helper_fmadd() called\n");
+
     CPU_DoubleU farg1, farg2, farg3;
 
     farg1.ll = arg1;
     farg2.ll = arg2;
     farg3.ll = arg3;
 
+    DPRINTF("farg1.d = %f\n", farg1.d);
+    DPRINTF("farg2.d = %f\n", farg2.d);
+    DPRINTF("farg3.d = %f\n", farg3.d);
+
     if (unlikely((float64_is_infinity(farg1.d) && float64_is_zero(farg2.d)) ||
                  (float64_is_zero(farg1.d) && float64_is_infinity(farg2.d)))) {
         /* Multiplication of zero by infinity */
@@ -775,9 +873,10 @@ uint64_t helper_fmadd(CPUPPCState *env, uint64_t arg1, 
uint64_t arg2,
             farg1.d = float128_to_float64(ft0_128, &env->fp_status);
         }
     }
-
+    DPRINTF("farg1.ll = 0x%" PRIx64 "\n", farg1.ll);
     return farg1.ll;
 }
+#endif
 
 /* fmsub - fmsub. */
 uint64_t helper_fmsub(CPUPPCState *env, uint64_t arg1, uint64_t arg2,
-- 
2.7.2


/********************************************/
/************** Test program ****************/
/********************************************/


#include <stdio.h>
#include <time.h>
#include <math.h>

/*
    fmadds basically does this frD = frA * frC + frB
*/

int main (int argc, const char * argv[]) {
    const int iteration_count = pow(10,6); // You might have to change the 
second argument
    double iter, frD, frA, frB, frC, total, answer;
    clock_t start_time, end_time;
    
    frA = 1.0014;
    frB = 0.00001;
    frC = 0.000015;
    total = 0.0;
    answer = 88351739272997344.0;
    
    start_time = clock();
    for(iter = 0; iter < iteration_count; iter++)
    {
        frA *= 1.00001;
        frB *= 1.00002;
        frC *= 1.00003;
        asm volatile("fmadds %0, %1, %2, %3" : "=f" (frD) : "f" (frA), "f" 
(frC), "f" (frB));
        total = total + frD;
    }
    end_time = clock();
    printf("Total = %f\n", total);
    
    if (total == answer) {
        printf("Value is correct\n");
    } else {
        printf("Value is incorrect. Answer should be %f\n", answer);
    }
    
    printf("Time elapsed: %0.2f seconds\n", (float)(end_time - start_time) / 
CLOCKS_PER_SEC);
    
    return 0;
}

[Prev in Thread]

Current Thread

[Next in Thread]

[Qemu-devel] [RFC] target-ppc/fpu_helper.c: Use C99 code to speed up floating point unit, Programmingkid <=
- Re: [Qemu-devel] [RFC] target-ppc/fpu_helper.c: Use C99 code to speed up floating point unit, Paolo Bonzini, 2016/12/03
  - Re: [Qemu-devel] [RFC] target-ppc/fpu_helper.c: Use C99 code to speed up floating point unit, Programmingkid, 2016/12/03
    - Re: [Qemu-devel] [RFC] target-ppc/fpu_helper.c: Use C99 code to speed up floating point unit, Richard Henderson, 2016/12/03
    - Re: [Qemu-devel] [RFC] target-ppc/fpu_helper.c: Use C99 code to speed up floating point unit, Eric Blake, 2016/12/05
- Re: [Qemu-devel] [RFC] target-ppc/fpu_helper.c: Use C99 code to speed up floating point unit, no-reply, 2016/12/05

Prev by Date: [Qemu-devel] [PATCH 7/7] target-ppc: Implement bcdtrunc. instruction
Next by Date: Re: [Qemu-devel] [PATCH v2 1/6] arm: Uniquely name imx25 I2C buses.
Previous by thread: [Qemu-devel] [PATCH 0/7] POWER9 TCG enablements - BCD functions - final part
Next by thread: Re: [Qemu-devel] [RFC] target-ppc/fpu_helper.c: Use C99 code to speed up floating point unit
Index(es):
- Date
- Thread