qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [PULL v2 07/14] accel/tcg: Use interval tree for user-only page trac


From: Ilya Leoshkevich
Subject: Re: [PULL v2 07/14] accel/tcg: Use interval tree for user-only page tracking
Date: Fri, 23 Dec 2022 15:32:39 +0100

On Tue, Dec 20, 2022 at 09:03:06PM -0800, Richard Henderson wrote:
> Finish weaning user-only away from PageDesc.
> 
> Using an interval tree to track page permissions means that
> we can represent very large regions efficiently.
> 
> Resolves: https://gitlab.com/qemu-project/qemu/-/issues/290
> Resolves: https://gitlab.com/qemu-project/qemu/-/issues/967
> Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1214
> Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
>  accel/tcg/internal.h           |   4 +-
>  accel/tcg/tb-maint.c           |  20 +-
>  accel/tcg/user-exec.c          | 615 ++++++++++++++++++++++-----------
>  tests/tcg/multiarch/test-vma.c |  22 ++
>  4 files changed, 451 insertions(+), 210 deletions(-)
>  create mode 100644 tests/tcg/multiarch/test-vma.c

Hi,

After staring at vma-pthread.c failures for some time, I finally
spotted a few lines here that look suspicious.

<skip>

>  int page_get_flags(target_ulong address)
>  {
> -    PageDesc *p;
> +    PageFlagsNode *p = pageflags_find(address, address);
>  
> -    p = page_find(address >> TARGET_PAGE_BITS);
> -    if (!p) {
> +    /*
> +     * See util/interval-tree.c re lockless lookups: no false positives but
> +     * there are false negatives.  If we find nothing, retry with the mmap
> +     * lock acquired.
> +     */
> +    if (p) {
> +        return p->flags;
> +    }
> +    if (have_mmap_lock()) {
>          return 0;
>      }
> -    return p->flags;
> +
> +    mmap_lock();
> +    p = pageflags_find(address, address);
> +    mmap_unlock();

How does the code ensure that p is not freed here?

> +    return p ? p->flags : 0;
> +}

<skip>

>  int page_check_range(target_ulong start, target_ulong len, int flags)
>  {
> -    PageDesc *p;
> -    target_ulong end;
> -    target_ulong addr;
> -
> -    /*
> -     * This function should never be called with addresses outside the
> -     * guest address space.  If this assert fires, it probably indicates
> -     * a missing call to h2g_valid.
> -     */
> -    if (TARGET_ABI_BITS > L1_MAP_ADDR_SPACE_BITS) {
> -        assert(start < ((target_ulong)1 << L1_MAP_ADDR_SPACE_BITS));
> -    }
> +    target_ulong last;
>  
>      if (len == 0) {
> -        return 0;
> -    }
> -    if (start + len - 1 < start) {
> -        /* We've wrapped around.  */
> -        return -1;
> +        return 0;  /* trivial length */
>      }
>  
> -    /* must do before we loose bits in the next step */
> -    end = TARGET_PAGE_ALIGN(start + len);
> -    start = start & TARGET_PAGE_MASK;
> +    last = start + len - 1;
> +    if (last < start) {
> +        return -1; /* wrap around */
> +    }
> +
> +    while (true) {
> +        PageFlagsNode *p = pageflags_find(start, last);

We can end up here without mmap_lock if we come from the syscall code.
Do we need a retry like in page_get_flags()?
Or would it make sense to just take mmap_lock in lock_user()?

Speaking of which: does lock_user() actually guarantee that it's safe
to access the respective pages until unlock_user()? If yes, doesn't
this mean that mmap_lock must be held between the two? And if no, and
the SEGV handler is already supposed to gracefully handle SEGVs in
syscall.c, do we need to call access_ok_untagged() there at all?

> +        int missing;
>  
> -    for (addr = start, len = end - start;
> -         len != 0;
> -         len -= TARGET_PAGE_SIZE, addr += TARGET_PAGE_SIZE) {
> -        p = page_find(addr >> TARGET_PAGE_BITS);
>          if (!p) {
> -            return -1;
> +            return -1; /* entire region invalid */
>          }
> -        if (!(p->flags & PAGE_VALID)) {
> -            return -1;
> +        if (start < p->itree.start) {
> +            return -1; /* initial bytes invalid */
>          }
>  
> -        if ((flags & PAGE_READ) && !(p->flags & PAGE_READ)) {
> -            return -1;
> +        missing = flags & ~p->flags;
> +        if (missing & PAGE_READ) {
> +            return -1; /* page not readable */
>          }
> -        if (flags & PAGE_WRITE) {
> +        if (missing & PAGE_WRITE) {
>              if (!(p->flags & PAGE_WRITE_ORG)) {
> +                return -1; /* page not writable */
> +            }
> +            /* Asking about writable, but has been protected: undo. */
> +            if (!page_unprotect(start, 0)) {
>                  return -1;
>              }
> -            /* unprotect the page if it was put read-only because it
> -               contains translated code */
> -            if (!(p->flags & PAGE_WRITE)) {
> -                if (!page_unprotect(addr, 0)) {
> -                    return -1;
> -                }
> +            /* TODO: page_unprotect should take a range, not a single page. 
> */
> +            if (last - start < TARGET_PAGE_SIZE) {
> +                return 0; /* ok */
>              }
> +            start += TARGET_PAGE_SIZE;
> +            continue;
>          }
> +
> +        if (last <= p->itree.last) {
> +            return 0; /* ok */
> +        }
> +        start = p->itree.last + 1;
>      }
> -    return 0;
>  }

<skip>

>  int page_unprotect(target_ulong address, uintptr_t pc)
>  {
> -    unsigned int prot;
> +    PageFlagsNode *p;
>      bool current_tb_invalidated;
> -    PageDesc *p;
> -    target_ulong host_start, host_end, addr;
>  
>      /*
>       * Technically this isn't safe inside a signal handler.  However we
> @@ -429,40 +627,54 @@ int page_unprotect(target_ulong address, uintptr_t pc)
>       */
>      mmap_lock();
>  
> -    p = page_find(address >> TARGET_PAGE_BITS);
> -    if (!p) {
> +    p = pageflags_find(address, address);
> +
> +    /* If this address was not really writable, nothing to do. */
> +    if (!p || !(p->flags & PAGE_WRITE_ORG)) {
>          mmap_unlock();
>          return 0;
>      }
>  
> -    /*
> -     * If the page was really writable, then we change its
> -     * protection back to writable.
> -     */
> -    if (p->flags & PAGE_WRITE_ORG) {
> -        current_tb_invalidated = false;
> -        if (p->flags & PAGE_WRITE) {
> -            /*
> -             * If the page is actually marked WRITE then assume this is 
> because
> -             * this thread raced with another one which got here first and
> -             * set the page to PAGE_WRITE and did the TB invalidate for us.
> -             */
> +    current_tb_invalidated = false;
> +    if (p->flags & PAGE_WRITE) {
> +        /*
> +         * If the page is actually marked WRITE then assume this is because
> +         * this thread raced with another one which got here first and
> +         * set the page to PAGE_WRITE and did the TB invalidate for us.
> +         */
>  #ifdef TARGET_HAS_PRECISE_SMC
> -            TranslationBlock *current_tb = tcg_tb_lookup(pc);
> -            if (current_tb) {
> -                current_tb_invalidated = tb_cflags(current_tb) & CF_INVALID;
> -            }
> +        TranslationBlock *current_tb = tcg_tb_lookup(pc);
> +        if (current_tb) {
> +            current_tb_invalidated = tb_cflags(current_tb) & CF_INVALID;
> +        }
>  #endif
> +    } else {
> +        target_ulong start, len, i;
> +        int prot;
> +
> +        if (qemu_host_page_size <= TARGET_PAGE_SIZE) {
> +            start = address & TARGET_PAGE_MASK;
> +            len = TARGET_PAGE_SIZE;
> +            prot = p->flags | PAGE_WRITE;
> +            pageflags_set_clear(start, start + len - 1, PAGE_WRITE, 0);
> +            current_tb_invalidated = tb_invalidate_phys_page_unwind(start, 
> pc);

When we come from page_check_range(), pc == 0 and the assertion in
tb_invalidate_phys_page_unwind() fires. Should we pass
current_cpu->cc->get_pc() to page_unprotect() instead of 0, so that
current_tb is resolved to the TB that invoked the syscall?

>          } else {
> -            host_start = address & qemu_host_page_mask;
> -            host_end = host_start + qemu_host_page_size;
> -
> +            start = address & qemu_host_page_mask;
> +            len = qemu_host_page_size;
>              prot = 0;
> -            for (addr = host_start; addr < host_end; addr += 
> TARGET_PAGE_SIZE) {
> -                p = page_find(addr >> TARGET_PAGE_BITS);
> -                p->flags |= PAGE_WRITE;
> -                prot |= p->flags;
>  
> +            for (i = 0; i < len; i += TARGET_PAGE_SIZE) {
> +                target_ulong addr = start + i;
> +
> +                p = pageflags_find(addr, addr);
> +                if (p) {
> +                    prot |= p->flags;
> +                    if (p->flags & PAGE_WRITE_ORG) {
> +                        prot |= PAGE_WRITE;
> +                        pageflags_set_clear(addr, addr + TARGET_PAGE_SIZE - 
> 1,
> +                                            PAGE_WRITE, 0);
> +                    }
> +                }
>                  /*
>                   * Since the content will be modified, we must invalidate
>                   * the corresponding translated code.
> @@ -470,15 +682,16 @@ int page_unprotect(target_ulong address, uintptr_t pc)
>                  current_tb_invalidated |=
>                      tb_invalidate_phys_page_unwind(addr, pc);
>              }
> -            mprotect((void *)g2h_untagged(host_start), qemu_host_page_size,
> -                     prot & PAGE_BITS);
>          }
> -        mmap_unlock();
> -        /* If current TB was invalidated return to main loop */
> -        return current_tb_invalidated ? 2 : 1;
> +        if (prot & PAGE_EXEC) {
> +            prot = (prot & ~PAGE_EXEC) | PAGE_READ;
> +        }
> +        mprotect((void *)g2h_untagged(start), len, prot & PAGE_BITS);
>      }
>      mmap_unlock();
> -    return 0;
> +
> +    /* If current TB was invalidated return to main loop */
> +    return current_tb_invalidated ? 2 : 1;
>  }

Best regards,
Ilya



reply via email to

[Prev in Thread] Current Thread [Next in Thread]