libjit
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Libjit] looking at the generated code


From: Tom Tromey
Subject: [Libjit] looking at the generated code
Date: Sun, 18 Feb 2018 06:19:36 -0700

The attached program is a simple test case that emulates what my Emacs
Lisp JIT does for a very simple function: (lambda (x) (consp x)).

Emacs uses tagged objects, like many lisps.  So, the above expands to
something like:

    if x & 7 == 3:
      result = t
    else
      result = nil
    return result

The generated function looks like:

    7f48c9cc2144:       55                      push   %rbp
    7f48c9cc2145:       48 8b ec                mov    %rsp,%rbp
    7f48c9cc2148:       48 83 ec 10             sub    $0x10,%rsp
    7f48c9cc214c:       4c 89 3c 24             mov    %r15,(%rsp)
    7f48c9cc2150:       48 89 7d f8             mov    %rdi,-0x8(%rbp)
    7f48c9cc2154:       48 8b 45 f8             mov    -0x8(%rbp),%rax
    7f48c9cc2158:       48 83 e0 07             and    $0x7,%rax
    7f48c9cc215c:       48 83 f8 03             cmp    $0x3,%rax
    7f48c9cc2160:       0f 85 0b 00 00 00       jne    0x7f48c9cc2171
    7f48c9cc2166:       41 bf f0 c0 00 00       mov    $0xc0f0,%r15d
    7f48c9cc216c:       e9 03 00 00 00          jmpq   0x7f48c9cc2174
    7f48c9cc2171:       45 33 ff                xor    %r15d,%r15d
    7f48c9cc2174:       49 8b c7                mov    %r15,%rax
    7f48c9cc2177:       4c 8b 3c 24             mov    (%rsp),%r15
    7f48c9cc217b:       48 8b e5                mov    %rbp,%rsp
    7f48c9cc217e:       5d                      pop    %rbp
    7f48c9cc217f:       c3                      retq   


Much of this seems sensible to me, but there are a few bits that seem
like they could be improved.  The program has a few comments that show
my attempts to improve things -- but I only ever succeeded in making the
generated code worse, not better.  (Also, in my real JIT, not this
simple example, there are a couple of differences -- more stack space is
used -- but I couldn't easily see what extra things the JIT is doing.)

First, notice the type check:

    7f48c9cc2158:       48 83 e0 07             and    $0x7,%rax
    7f48c9cc215c:       48 83 f8 03             cmp    $0x3,%rax

If I compile similar code with gcc (-g -O0 -- see below for what the
optimizer does), I get:

   0x00000000005e2b2a <+16>:    83 e0 07        and    $0x7,%eax
   0x00000000005e2b2d <+19>:    83 f8 03        cmp    $0x3,%eax

gcc notices that the comparison can be done in a narrower mode.  I don't
know if this is faster, but it does use less space.


Second, notice the extra moves using %r15.  I don't think this register
needs to be used at all.  This would remove the spills as well.  Maybe
this is just a limitation of the register allocation?


Here's what gcc -O2 emits for essentially the same code:

   0x000000000054aec0 <+0>:     83 e7 07        and    $0x7,%edi
   0x000000000054aec3 <+3>:     ba 00 00 00 00  mov    $0x0,%edx
   0x000000000054aec8 <+8>:     b8 f0 c0 00 00  mov    $0xc0f0,%eax
   0x000000000054aecd <+13>:    83 ff 03        cmp    $0x3,%edi
   0x000000000054aed0 <+16>:    48 0f 45 c2     cmovne %rdx,%rax
   0x000000000054aed4 <+20>:    c3      retq   


Wow!  Probably this is a bit too hard to get to for libjit, but I do
wonder if it would be possible to use cmov somehow when computing the
nil-or-t result.  Like maybe if there were a conditional store
instruction?  The reason I'm interested in this is that, in my JIT, the
kernel of this function (the type check) is inlined wherever consp is
used in lisp.  That is, it would be nice if the prologue and register
shuffling were gone, but optimizing this short sequence would probably
have bigger benefits.

Tom

#include <jit/jit.h>
#include <jit/jit-dump.h>

// defining as void_ptr or ubyte makes no difference
#define ITYPE jit_type_ubyte

static jit_value_t
get_type (jit_function_t func, jit_value_t val)
{
#if 0 // makes it worse
  val = jit_insn_convert (func, val, jit_type_ubyte, 0);
#endif
  jit_value_t mask = jit_value_create_nint_constant (func, ITYPE, 7);
  return jit_insn_and (func, val, mask);
}

static jit_value_t
compare_type (jit_function_t func, jit_value_t val, int to_be_checked)
{
  jit_value_t real_type = get_type (func, val);
  jit_value_t type_val
    = jit_value_create_nint_constant (func, ITYPE, to_be_checked);
  return jit_insn_eq (func, real_type, type_val);
}

#define CONSTANT(FUNC, VAL) \
  jit_value_create_long_constant (FUNC, jit_type_void_ptr, (jit_long) (VAL))

#define Qnil 0
#define Qt 0xc0f0

static void
emit_qnil_or_qt (jit_function_t func, jit_value_t compare,
                 jit_value_t dest, jit_label_t *next_insn)
{
  // first version is obvious approach, second version loads both
  // values first, then conditionally moves
  // first one works best
#if 1
  jit_value_t tem;

  jit_label_t nil_label = jit_label_undefined;
  jit_insn_branch_if_not (func, compare, &nil_label);
  tem = CONSTANT (func, Qt);
  jit_insn_store (func, dest, tem);
  jit_insn_branch (func, next_insn);
  jit_insn_label (func, &nil_label);
  tem = CONSTANT (func, Qnil);
  jit_insn_store (func, dest, tem);
#else
  jit_value_t tval = CONSTANT (func, Qt);
  jit_value_t nilval = CONSTANT (func, Qnil);
  jit_insn_store (func, dest, nilval);
  jit_insn_branch_if (func, compare, next_insn);
  jit_insn_store (func, dest, tval);
#endif
}

int main()
{
  jit_init ();
  jit_context_t ctx = jit_context_create ();

  jit_type_t params[1] = { jit_type_void_ptr };
  jit_type_t signature = jit_type_create_signature (jit_abi_cdecl,
                                                    jit_type_void_ptr,
                                                    params, 1, 1);

  jit_function_t func = jit_function_create (ctx, signature);

  jit_value_t stack0 = jit_value_create (func, jit_type_void_ptr);
#define WANT_DUP
#ifdef WANT_DUP
  jit_value_t stack1 = jit_value_create (func, jit_type_void_ptr);
#endif

  jit_value_t temp = jit_value_create (func, jit_type_void_ptr);

  jit_insn_store (func, stack0, jit_value_get_param (func, 0));
  

#ifdef WANT_DUP
  // the byte compiler emits a Bdup here, so we do it as well.
  jit_insn_store (func, stack1, stack0);
#endif

#ifdef WANT_DUP
#define INTER stack1
#else
#define INTER stack0
#endif

  jit_value_t compare = compare_type (func, INTER, 3);
  jit_label_t next = jit_label_undefined;

  // define as "stack1" to emulate the stack machine
  // this makes no difference though
#define DEST temp

  emit_qnil_or_qt (func, compare, DEST, &next);
  jit_insn_label (func, &next);

  jit_insn_return (func, DEST);

  jit_function_compile (func);
  jit_dump_function (stdout, func, "Function");

  return 0;
}

reply via email to

[Prev in Thread] Current Thread [Next in Thread]