Re: [PATCH 1/3] pmap: dynamically allocate the whole user page tree map
From: Samuel Thibault
Subject: Re: [PATCH 1/3] pmap: dynamically allocate the whole user page tree map
Date: Sun, 21 May 2023 21:17:54 +0200
User-agent: NeoMutt/20170609 (1.8.3)
Applied, thanks!!
Luca Dariz, on Sun, 21 May 2023 10:57:56 +0200, wrote:
> * i386/intel/pmap.c: switch to dynamic allocation of all the page tree
> map levels for the user-space address range, using a separate kmem
> cache for each level. This allows extending the usable memory space
> on x86_64 to use more than one L3 page for user space. The kernel
> address map is left untouched for now as it needs a different
> initialization.
> * i386/intel/pmap.h: remove hardcoded user pages and add a macro to
> reconstruct the page-to-virtual mapping
> ---
> i386/intel/pmap.c | 544 ++++++++++++++++++++++------------------------
> i386/intel/pmap.h | 21 +-
> 2 files changed, 277 insertions(+), 288 deletions(-)
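
For readers following along: the key idea is that every level of the user
half of the page-table tree now comes from its own kmem cache and is
allocated on demand, instead of user PDPs and page directories being
preallocated in pmap_create(). The per-level step looks roughly like this
(a hypothetical helper for illustration only -- no locking and no
MACH_PV_PAGETABLES handling; the real logic is pmap_expand_level() further
down in the patch):

  static pt_entry_t *
  walk_or_alloc(pt_entry_t *table, int index, struct kmem_cache *cache)
  {
      /* Descend one level, allocating the next table from this
         level's dedicated cache if the entry is still empty. */
      if (!(table[index] & INTEL_PTE_VALID)) {
          vm_offset_t nt = kmem_cache_alloc(cache);
          memset((void *) nt, 0, INTEL_PGBYTES);
          WRITE_PTE(&table[index], pa_to_pte(kvtophys(nt))
                    | INTEL_PTE_VALID | INTEL_PTE_WRITE | INTEL_PTE_USER);
      }
      return (pt_entry_t *) ptetokv(table[index]);
  }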
>
> diff --git a/i386/intel/pmap.c b/i386/intel/pmap.c
> index e867ed59..3a30271e 100644
> --- a/i386/intel/pmap.c
> +++ b/i386/intel/pmap.c
> @@ -398,6 +398,7 @@ struct pmap kernel_pmap_store;
> pmap_t kernel_pmap;
>
> struct kmem_cache pmap_cache; /* cache of pmap structures */
> +struct kmem_cache pt_cache; /* cache of page tables */
> struct kmem_cache pd_cache; /* cache of page directories */
> #if PAE
> struct kmem_cache pdpt_cache; /* cache of page directory pointer tables */
> @@ -429,6 +430,14 @@ pt_entry_t *kernel_page_dir;
> */
> static pmap_mapwindow_t mapwindows[PMAP_NMAPWINDOWS * NCPUS];
>
> +#ifdef __x86_64__
> +static inline pt_entry_t *
> +pmap_l4base(const pmap_t pmap, vm_offset_t lin_addr)
> +{
> + return &pmap->l4base[lin2l4num(lin_addr)];
> +}
> +#endif
> +
> #ifdef PAE
> static inline pt_entry_t *
> pmap_ptp(const pmap_t pmap, vm_offset_t lin_addr)
> @@ -443,7 +452,7 @@ pmap_ptp(const pmap_t pmap, vm_offset_t lin_addr)
> #else /* __x86_64__ */
> pdp_table = pmap->pdpbase;
> #endif /* __x86_64__ */
> - return pdp_table;
> + return &pdp_table[lin2pdpnum(lin_addr)];
> }
> #endif
>
> @@ -456,7 +465,9 @@ pmap_pde(const pmap_t pmap, vm_offset_t addr)
> #if PAE
> pt_entry_t *pdp_table;
> pdp_table = pmap_ptp(pmap, addr);
> - pt_entry_t pde = pdp_table[lin2pdpnum(addr)];
> + if (pdp_table == 0)
> + return(PT_ENTRY_NULL);
> + pt_entry_t pde = *pdp_table;
> if ((pde & INTEL_PTE_VALID) == 0)
> return PT_ENTRY_NULL;
> page_dir = (pt_entry_t *) ptetokv(pde);
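
Note the interface change here: pmap_ptp() now returns a pointer to the L3
entry for the address rather than the table base (and, judging from the
null check above, 0 when the path is missing), so pmap_pde() must test the
returned pointer before dereferencing. A caller now reads like this sketch:

  pt_entry_t *pdpe = pmap_ptp(pmap, addr); /* &pdp_table[lin2pdpnum(addr)] */
  if (pdpe == 0 || !(*pdpe & INTEL_PTE_VALID))
      return PT_ENTRY_NULL;  /* intermediate level not present */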
> @@ -1092,15 +1103,18 @@ void pmap_init(void)
> */
> s = (vm_size_t) sizeof(struct pmap);
> kmem_cache_init(&pmap_cache, "pmap", s, 0, NULL, 0);
> - kmem_cache_init(&pd_cache, "pd",
> + kmem_cache_init(&pt_cache, "pmap_L1",
> + INTEL_PGBYTES, INTEL_PGBYTES, NULL,
> + KMEM_CACHE_PHYSMEM);
> + kmem_cache_init(&pd_cache, "pmap_L2",
> INTEL_PGBYTES, INTEL_PGBYTES, NULL,
> KMEM_CACHE_PHYSMEM);
> #if PAE
> - kmem_cache_init(&pdpt_cache, "pdpt",
> + kmem_cache_init(&pdpt_cache, "pmap_L3",
> INTEL_PGBYTES, INTEL_PGBYTES, NULL,
> KMEM_CACHE_PHYSMEM);
> #ifdef __x86_64__
> - kmem_cache_init(&l4_cache, "L4",
> + kmem_cache_init(&l4_cache, "pmap_L4",
> INTEL_PGBYTES, INTEL_PGBYTES, NULL,
> KMEM_CACHE_PHYSMEM);
> #endif /* __x86_64__ */
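
The renames make the cache-to-level correspondence explicit; for reference:

  pmap_L1  pt_cache    page tables (L1)
  pmap_L2  pd_cache    page directories (L2)
  pmap_L3  pdpt_cache  page directory pointer tables (L3, PAE)
  pmap_L4  l4_cache    PML4 root (L4, x86_64)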
> @@ -1244,6 +1258,11 @@ pmap_page_table_page_dealloc(vm_offset_t pa)
> vm_object_lock(pmap_object);
> m = vm_page_lookup(pmap_object, pa);
> vm_page_lock_queues();
> +#ifdef MACH_PV_PAGETABLES
> + if (!hyp_mmuext_op_mfn (MMUEXT_UNPIN_TABLE, pa_to_mfn(pa)))
> + panic("couldn't unpin page %llx(%lx)\n", pa, (vm_offset_t)
> kv_to_ma(pa));
> + pmap_set_page_readwrite((void*) phystokv(pa));
> +#endif /* MACH_PV_PAGETABLES */
> vm_page_free(m);
> inuse_ptepages_count--;
> vm_page_unlock_queues();
> @@ -1265,7 +1284,7 @@ pmap_page_table_page_dealloc(vm_offset_t pa)
> pmap_t pmap_create(vm_size_t size)
> {
> #ifdef __x86_64__
> - // needs to be reworked if we want to dynamically allocate PDPs
> + // needs to be reworked if we want to dynamically allocate PDPs for kernel
> const int PDPNUM = PDPNUM_KERNEL;
> #endif
> pt_entry_t *page_dir[PDPNUM];
> @@ -1360,30 +1379,6 @@ pmap_t pmap_create(vm_size_t size)
> memset(p->l4base, 0, INTEL_PGBYTES);
> WRITE_PTE(&p->l4base[lin2l4num(VM_MIN_KERNEL_ADDRESS)],
> pa_to_pte(kvtophys((vm_offset_t) pdp_kernel)) | INTEL_PTE_VALID | INTEL_PTE_WRITE);
> -#if lin2l4num(VM_MIN_KERNEL_ADDRESS) != lin2l4num(VM_MAX_USER_ADDRESS)
> - // kernel vm and user vm are not in the same l4 entry, so add the user one
> - // TODO alloc only PDPTE for the user range VM_MIN_USER_ADDRESS, VM_MAX_USER_ADDRESS
> - // and keep the same for kernel range, in l4 table we have different entries
> - pt_entry_t *pdp_user = (pt_entry_t *) kmem_cache_alloc(&pdpt_cache);
> - if (pdp_user == NULL) {
> - panic("pmap create");
> - }
> - memset(pdp_user, 0, INTEL_PGBYTES);
> - WRITE_PTE(&p->l4base[lin2l4num(VM_MIN_USER_ADDRESS)],
> - pa_to_pte(kvtophys((vm_offset_t) pdp_user)) | INTEL_PTE_VALID | INTEL_PTE_WRITE | INTEL_PTE_USER);
> -#endif /* lin2l4num(VM_MIN_KERNEL_ADDRESS) != lin2l4num(VM_MAX_USER_ADDRESS) */
> - for (int i = 0; i < PDPNUM_USER; i++) {
> - pt_entry_t *user_page_dir = (pt_entry_t *) kmem_cache_alloc(&pd_cache);
> - memset(user_page_dir, 0, INTEL_PGBYTES);
> - WRITE_PTE(&pdp_user[i + lin2pdpnum(VM_MIN_USER_ADDRESS)], // pdp_user
> - pa_to_pte(kvtophys((vm_offset_t)user_page_dir))
> - | INTEL_PTE_VALID
> -#if (defined(__x86_64__) && !defined(MACH_HYP)) || defined(MACH_PV_PAGETABLES)
> - | INTEL_PTE_WRITE | INTEL_PTE_USER
> -#endif
> - );
> - }
> -
> #ifdef MACH_PV_PAGETABLES
> // FIXME: use kmem_cache_alloc instead
> if (kmem_alloc_wired(kernel_map,
> @@ -1443,15 +1438,7 @@ pmap_t pmap_create(vm_size_t size)
>
> void pmap_destroy(pmap_t p)
> {
> -#if PAE
> - int i;
> -#endif
> - boolean_t free_all;
> - pt_entry_t *page_dir;
> - pt_entry_t *pdep;
> - phys_addr_t pa;
> int c, s;
> - vm_page_t m;
>
> if (p == PMAP_NULL)
> return;
> @@ -1466,87 +1453,54 @@ void pmap_destroy(pmap_t p)
> return; /* still in use */
> }
>
> + /*
> + * Free the page table tree.
> + */
> #if PAE
> - for (i = 0; i < lin2pdpnum(VM_MAX_USER_ADDRESS); i++) {
> #ifdef __x86_64__
> -#ifdef USER32
> - /* In this case we know we have one PDP for user space */
> - pt_entry_t *pdp = (pt_entry_t *) ptetokv(p->l4base[lin2l4num(VM_MIN_USER_ADDRESS)]);
> -#else
> -#warning "TODO do 64-bit userspace need more that 512G?"
> - pt_entry_t *pdp = (pt_entry_t *) ptetokv(p->l4base[lin2l4num(VM_MIN_USER_ADDRESS)]);
> -#endif /* USER32 */
> - page_dir = (pt_entry_t *) ptetokv(pdp[i]);
> + for (int l4i = 0; l4i < lin2l4num(VM_MAX_USER_ADDRESS); l4i++) {
> + pt_entry_t pdp = (pt_entry_t) p->l4base[l4i];
> + if (!(pdp & INTEL_PTE_VALID))
> + continue;
> + pt_entry_t *pdpbase = (pt_entry_t*) ptetokv(pdp);
> + for (int l3i = 0; l3i < 512; l3i++) {
> #else /* __x86_64__ */
> - page_dir = (pt_entry_t *) ptetokv(p->pdpbase[i]);
> + pt_entry_t *pdpbase = p->pdpbase;
> + for (int l3i = 0; l3i < lin2pdpnum(VM_MAX_USER_ADDRESS); l3i++) {
> #endif /* __x86_64__ */
> - free_all = i < lin2pdpnum(LINEAR_MIN_KERNEL_ADDRESS);
> + pt_entry_t pde = (pt_entry_t) pdpbase[l3i];
> + if (!(pde & INTEL_PTE_VALID))
> + continue;
> + pt_entry_t *pdebase = (pt_entry_t*) ptetokv(pde);
> + for (int l2i = 0; l2i < 512; l2i++) {
> #else /* PAE */
> - free_all = FALSE;
> - page_dir = p->dirbase;
> + pt_entry_t *pdebase = p->dirbase;
> + for (int l2i = 0; l2i < lin2pdenum(VM_MAX_USER_ADDRESS); l2i++) {
> #endif /* PAE */
> -
> -#ifdef __x86_64__
> -#warning FIXME 64bit need to free l3
> -#endif
> - /*
> - * Free the memory maps, then the
> - * pmap structure.
> - */
> - for (pdep = page_dir;
> - (free_all
> - || pdep < &page_dir[lin2pdenum(LINEAR_MIN_KERNEL_ADDRESS)])
> - && pdep < &page_dir[NPTES];
> - pdep += ptes_per_vm_page) {
> - if (*pdep & INTEL_PTE_VALID) {
> - pa = pte_to_pa(*pdep);
> - assert(pa == (vm_offset_t) pa);
> - vm_object_lock(pmap_object);
> - m = vm_page_lookup(pmap_object, pa);
> - if (m == VM_PAGE_NULL)
> - panic("pmap_destroy: pte page not in object");
> - vm_page_lock_queues();
> -#ifdef MACH_PV_PAGETABLES
> - if (!hyp_mmuext_op_mfn (MMUEXT_UNPIN_TABLE, pa_to_mfn(pa)))
> - panic("pmap_destroy: couldn't unpin page %llx(%lx)\n",
> pa, (vm_offset_t) kv_to_ma(pa));
> - pmap_set_page_readwrite((void*) phystokv(pa));
> -#endif /* MACH_PV_PAGETABLES */
> - vm_page_free(m);
> - inuse_ptepages_count--;
> - vm_page_unlock_queues();
> - vm_object_unlock(pmap_object);
> - }
> - }
> -#ifdef MACH_PV_PAGETABLES
> - pmap_set_page_readwrite((void*) page_dir);
> -#endif /* MACH_PV_PAGETABLES */
> - kmem_cache_free(&pd_cache, (vm_offset_t) page_dir);
> + pt_entry_t pte = (pt_entry_t) pdebase[l2i];
> + if (!(pte & INTEL_PTE_VALID))
> + continue;
> + kmem_cache_free(&pt_cache, (vm_offset_t)ptetokv(pte));
> + }
> #if PAE
> - }
> -
> -#ifdef MACH_PV_PAGETABLES
> + kmem_cache_free(&pd_cache, (vm_offset_t)pdebase);
> + }
> #ifdef __x86_64__
> - pmap_set_page_readwrite(p->l4base);
> - pmap_set_page_readwrite(p->user_l4base);
> - pmap_set_page_readwrite(p->user_pdpbase);
> + kmem_cache_free(&pdpt_cache, (vm_offset_t)pdpbase);
> + }
> #endif /* __x86_64__ */
> - pmap_set_page_readwrite(p->pdpbase);
> -#endif /* MACH_PV_PAGETABLES */
> +#endif /* PAE */
>
> + /* Finally, free the page table tree root and the pmap itself */
> +#if PAE
> #ifdef __x86_64__
> - kmem_cache_free(&pdpt_cache, (vm_offset_t) pmap_ptp(p, VM_MIN_USER_ADDRESS));
> -#if lin2l4num(VM_MIN_KERNEL_ADDRESS) != lin2l4num(VM_MAX_USER_ADDRESS)
> - // TODO kernel vm and user vm are not in the same l4 entry
> -#endif
> kmem_cache_free(&l4_cache, (vm_offset_t) p->l4base);
> -#ifdef MACH_PV_PAGETABLES
> - kmem_free(kernel_map, (vm_offset_t)p->user_l4base, INTEL_PGBYTES);
> - kmem_free(kernel_map, (vm_offset_t)p->user_pdpbase, INTEL_PGBYTES);
> -#endif /* MACH_PV_PAGETABLES */
> #else /* __x86_64__ */
> - kmem_cache_free(&pdpt_cache, (vm_offset_t) p->pdpbase);
> + kmem_cache_free(&pdpt_cache, (vm_offset_t) p->pdpbase);
> #endif /* __x86_64__ */
> -#endif /* PAE */
> +#else /* PAE */
> + kmem_cache_free(&pd_cache, (vm_offset_t) p->dirbase);
> +#endif /* PAE */
> kmem_cache_free(&pmap_cache, (vm_offset_t) p);
> }
>
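If I read the new loop right, teardown is now strictly leaf-to-root, and
only over the user range; schematically (x86_64 PAE case):

  for each valid L4 entry below VM_MAX_USER_ADDRESS:
      for each valid L3 entry:
          for each valid L2 entry:
              kmem_cache_free(&pt_cache, L1 page)
          kmem_cache_free(&pd_cache, L2 page)
      kmem_cache_free(&pdpt_cache, L3 page)
  kmem_cache_free(&l4_cache, root), then the pmap itself
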
> @@ -1756,7 +1710,7 @@ void pmap_remove(
> l = (s + PDE_MAPPED_SIZE) & ~(PDE_MAPPED_SIZE-1);
> if (l > e)
> l = e;
> - if (*pde & INTEL_PTE_VALID) {
> + if (pde && (*pde & INTEL_PTE_VALID)) {
> spte = (pt_entry_t *)ptetokv(*pde);
> spte = &spte[ptenum(s)];
> epte = &spte[intel_btop(l-s)];
> @@ -2036,86 +1990,24 @@ void pmap_protect(
> SPLX(spl);
> }
>
> +typedef pt_entry_t* (*pmap_level_getter_t)(const pmap_t pmap, vm_offset_t addr);
> /*
> - * Insert the given physical page (p) at
> - * the specified virtual address (v) in the
> - * target physical map with the protection requested.
> - *
> - * If specified, the page will be wired down, meaning
> - * that the related pte can not be reclaimed.
> - *
> - * NB: This is the only routine which MAY NOT lazy-evaluate
> - * or lose information. That is, this routine must actually
> - * insert this page into the given map NOW.
> - */
> -void pmap_enter(
> - pmap_t pmap,
> - vm_offset_t v,
> - phys_addr_t pa,
> - vm_prot_t prot,
> - boolean_t wired)
> +* Expand one single level of the page table tree
> +*/
> +static inline pt_entry_t* pmap_expand_level(pmap_t pmap, vm_offset_t v, int spl,
> + pmap_level_getter_t pmap_level,
> + pmap_level_getter_t pmap_level_upper,
> + int n_per_vm_page,
> + struct kmem_cache *cache)
> {
> - boolean_t is_physmem;
> pt_entry_t *pte;
> - pv_entry_t pv_h;
> - unsigned long i, pai;
> - pv_entry_t pv_e;
> - pt_entry_t template;
> - int spl;
> - phys_addr_t old_pa;
> -
> - assert(pa != vm_page_fictitious_addr);
> - if (pmap_debug) printf("pmap(%zx, %llx)\n", v, (unsigned long long) pa);
> - if (pmap == PMAP_NULL)
> - return;
> -
> -#if !MACH_KDB
> - if (pmap == kernel_pmap && (v < kernel_virtual_start || v >= kernel_virtual_end))
> - panic("pmap_enter(%zx, %llx) falls in physical memory area!\n", v, (unsigned long long) pa);
> -#endif
> -#if !(__i486__ || __i586__ || __i686__)
> - if (pmap == kernel_pmap && (prot & VM_PROT_WRITE) == 0
> - && !wired /* hack for io_wire */ ) {
> - /*
> - * Because the 386 ignores write protection in kernel mode,
> - * we cannot enter a read-only kernel mapping, and must
> - * remove an existing mapping if changing it.
> - */
> - PMAP_READ_LOCK(pmap, spl);
> -
> - pte = pmap_pte(pmap, v);
> - if (pte != PT_ENTRY_NULL && *pte != 0) {
> - /*
> - * Invalidate the translation buffer,
> - * then remove the mapping.
> - */
> - pmap_remove_range(pmap, v, pte,
> - pte + ptes_per_vm_page);
> - PMAP_UPDATE_TLBS(pmap, v, v + PAGE_SIZE);
> - }
> - PMAP_READ_UNLOCK(pmap, spl);
> - return;
> - }
> -#endif
> -
> - /*
> - * Must allocate a new pvlist entry while we're unlocked;
> - * Allocating may cause pageout (which will lock the pmap system).
> - * If we determine we need a pvlist entry, we will unlock
> - * and allocate one. Then we will retry, throughing away
> - * the allocated entry later (if we no longer need it).
> - */
> - pv_e = PV_ENTRY_NULL;
> -Retry:
> - PMAP_READ_LOCK(pmap, spl);
>
> /*
> * Expand pmap to include this pte. Assume that
> * pmap is always expanded to include enough hardware
> * pages to map one VM page.
> */
> -
> - while ((pte = pmap_pte(pmap, v)) == PT_ENTRY_NULL) {
> + while ((pte = pmap_level(pmap, v)) == PT_ENTRY_NULL) {
> /*
> * Need to allocate a new page-table page.
> */
> @@ -2136,7 +2028,9 @@ Retry:
> */
> PMAP_READ_UNLOCK(pmap, spl);
>
> - ptp = phystokv(pmap_page_table_page_alloc());
> + while (!(ptp = kmem_cache_alloc(cache)))
> + VM_PAGE_WAIT((void (*)()) 0);
> + memset((void *)ptp, 0, PAGE_SIZE);
>
> /*
> * Re-lock the pmap and check that another thread has
> @@ -2146,12 +2040,12 @@ Retry:
> */
> PMAP_READ_LOCK(pmap, spl);
>
> - if (pmap_pte(pmap, v) != PT_ENTRY_NULL) {
> + if (pmap_level(pmap, v) != PT_ENTRY_NULL) {
> /*
> * Oops...
> */
> PMAP_READ_UNLOCK(pmap, spl);
> - pmap_page_table_page_dealloc(kvtophys(ptp));
> + kmem_cache_free(cache, ptp);
> PMAP_READ_LOCK(pmap, spl);
> continue;
> }
> @@ -2159,8 +2053,8 @@ Retry:
> /*
> * Enter the new page table page in the page directory.
> */
> - i = ptes_per_vm_page;
> - pdp = pmap_pde(pmap, v);
> + i = n_per_vm_page;
> + pdp = pmap_level_upper(pmap, v);
> do {
> #ifdef MACH_PV_PAGETABLES
> pmap_set_page_readonly((void *) ptp);
> @@ -2185,6 +2079,100 @@ Retry:
> */
> continue;
> }
> + return pte;
> +}
> +
> +/*
> + * Expand, if required, the PMAP to include the virtual address V.
> + * PMAP needs to be locked, and it will be still locked on return. It
> + * can temporarily unlock the PMAP, during allocation or deallocation
> + * of physical pages.
> + */
> +static inline pt_entry_t* pmap_expand(pmap_t pmap, vm_offset_t v, int spl)
> +{
> +#ifdef PAE
> +#ifdef __x86_64__
> + pmap_expand_level(pmap, v, spl, pmap_ptp, pmap_l4base, ptes_per_vm_page, &pdpt_cache);
> +#endif /* __x86_64__ */
> + pmap_expand_level(pmap, v, spl, pmap_pde, pmap_ptp, ptes_per_vm_page, &pd_cache);
> +#endif /* PAE */
> + return pmap_expand_level(pmap, v, spl, pmap_pte, pmap_pde, ptes_per_vm_page, &pt_cache);
> +}
> +
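So pmap_expand() just chains one pmap_expand_level() call per level,
top-down; when it returns, the whole path for v exists and pmap_pte(pmap, v)
can no longer fail. Schematically (table names invented for clarity, the
index macros are the patch's own):

  l4base[lin2l4num(v)]    -> L3 table, from pdpt_cache
  l3table[lin2pdpnum(v)]  -> L2 table, from pd_cache
  l2table[lin2pdenum(v)]  -> L1 table, from pt_cache
  return &l1table[ptenum(v)];   /* == pmap_pte(pmap, v) */
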
> +/*
> + * Insert the given physical page (p) at
> + * the specified virtual address (v) in the
> + * target physical map with the protection requested.
> + *
> + * If specified, the page will be wired down, meaning
> + * that the related pte can not be reclaimed.
> + *
> + * NB: This is the only routine which MAY NOT lazy-evaluate
> + * or lose information. That is, this routine must actually
> + * insert this page into the given map NOW.
> + */
> +void pmap_enter(
> + pmap_t pmap,
> + vm_offset_t v,
> + phys_addr_t pa,
> + vm_prot_t prot,
> + boolean_t wired)
> +{
> + boolean_t is_physmem;
> + pt_entry_t *pte;
> + pv_entry_t pv_h;
> + unsigned long i, pai;
> + pv_entry_t pv_e;
> + pt_entry_t template;
> + int spl;
> + phys_addr_t old_pa;
> +
> + assert(pa != vm_page_fictitious_addr);
> + if (pmap_debug) printf("pmap(%zx, %llx)\n", v, (unsigned long long) pa);
> + if (pmap == PMAP_NULL)
> + return;
> +
> +#if !MACH_KDB
> + if (pmap == kernel_pmap && (v < kernel_virtual_start || v >= kernel_virtual_end))
> + panic("pmap_enter(%llx, %llx) falls in physical memory area!\n", v, (unsigned long long) pa);
> +#endif
> +#if !(__i486__ || __i586__ || __i686__)
> + if (pmap == kernel_pmap && (prot & VM_PROT_WRITE) == 0
> + && !wired /* hack for io_wire */ ) {
> + /*
> + * Because the 386 ignores write protection in kernel mode,
> + * we cannot enter a read-only kernel mapping, and must
> + * remove an existing mapping if changing it.
> + */
> + PMAP_READ_LOCK(pmap, spl);
> +
> + pte = pmap_pte(pmap, v);
> + if (pte != PT_ENTRY_NULL && *pte != 0) {
> + /*
> + * Invalidate the translation buffer,
> + * then remove the mapping.
> + */
> + pmap_remove_range(pmap, v, pte,
> + pte + ptes_per_vm_page);
> + PMAP_UPDATE_TLBS(pmap, v, v + PAGE_SIZE);
> + }
> + PMAP_READ_UNLOCK(pmap, spl);
> + return;
> + }
> +#endif
> +
> + /*
> + * Must allocate a new pvlist entry while we're unlocked;
> + * Allocating may cause pageout (which will lock the pmap system).
> + * If we determine we need a pvlist entry, we will unlock
> + * and allocate one. Then we will retry, throughing away
> + * the allocated entry later (if we no longer need it).
> + */
> + pv_e = PV_ENTRY_NULL;
> +Retry:
> + PMAP_READ_LOCK(pmap, spl);
> +
> + pte = pmap_expand(pmap, v, spl);
>
> if (vm_page_ready())
> is_physmem = (vm_page_lookup_pa(pa) != NULL);
> @@ -2462,10 +2450,7 @@ void pmap_copy(
> */
> void pmap_collect(pmap_t p)
> {
> - int i;
> - boolean_t free_all;
> - pt_entry_t *page_dir;
> - pt_entry_t *pdp, *ptp;
> + pt_entry_t *ptp;
> pt_entry_t *eptp;
> phys_addr_t pa;
> int spl, wired;
> @@ -2476,119 +2461,104 @@ void pmap_collect(pmap_t p)
> if (p == kernel_pmap)
> return;
>
> + /*
> + * Free the page table tree.
> + */
> #if PAE
> - for (i = 0; i < lin2pdpnum(VM_MAX_USER_ADDRESS); i++) {
> #ifdef __x86_64__
> -#ifdef USER32
> - /* In this case we know we have one PDP for user space */
> - pdp = (pt_entry_t *) ptetokv(p->l4base[lin2l4num(VM_MIN_USER_ADDRESS)]);
> -#else
> -#warning "TODO do 64-bit userspace need more that 512G?"
> - pdp = (pt_entry_t *) ptetokv(p->l4base[lin2l4num(VM_MIN_USER_ADDRESS)]);
> -#endif /* USER32 */
> - page_dir = (pt_entry_t *) ptetokv(pdp[i]);
> + for (int l4i = 0; l4i < lin2l4num(VM_MAX_USER_ADDRESS); l4i++) {
> + pt_entry_t pdp = (pt_entry_t) p->l4base[l4i];
> + if (!(pdp & INTEL_PTE_VALID))
> + continue;
> + pt_entry_t *pdpbase = (pt_entry_t*) ptetokv(pdp);
> + for (int l3i = 0; l3i < 512; l3i++) {
> #else /* __x86_64__ */
> - page_dir = (pt_entry_t *) ptetokv(p->pdpbase[i]);
> + pt_entry_t *pdpbase = p->pdpbase;
> + for (int l3i = 0; l3i < lin2pdpnum(VM_MAX_USER_ADDRESS); l3i++) {
> #endif /* __x86_64__ */
> - free_all = i < lin2pdpnum(LINEAR_MIN_KERNEL_ADDRESS);
> -#else
> - i = 0;
> - free_all = FALSE;
> - page_dir = p->dirbase;
> -#endif
> -
> - /*
> - * Garbage collect map.
> - */
> - PMAP_READ_LOCK(p, spl);
> - for (pdp = page_dir;
> - (free_all
> - || pdp < &page_dir[lin2pdenum(LINEAR_MIN_KERNEL_ADDRESS)])
> - && pdp < &page_dir[NPTES];
> - pdp += ptes_per_vm_page) {
> - if (*pdp & INTEL_PTE_VALID) {
> -
> - pa = pte_to_pa(*pdp);
> - ptp = (pt_entry_t *)phystokv(pa);
> - eptp = ptp + NPTES*ptes_per_vm_page;
> -
> - /*
> - * If the pte page has any wired mappings, we cannot
> - * free it.
> - */
> - wired = 0;
> - {
> - pt_entry_t *ptep;
> - for (ptep = ptp; ptep < eptp; ptep++) {
> - if (*ptep & INTEL_PTE_WIRED) {
> - wired = 1;
> - break;
> - }
> - }
> - }
> - if (!wired) {
> - /*
> - * Remove the virtual addresses mapped by this pte page.
> - */
> - { /*XXX big hack*/
> - vm_offset_t va = pdenum2lin(pdp - page_dir
> - + i * NPTES);
> - if (p == kernel_pmap)
> - va = lintokv(va);
> - pmap_remove_range(p,
> - va,
> - ptp,
> - eptp);
> - }
> -
> - /*
> - * Invalidate the page directory pointer.
> - */
> - {
> - int i = ptes_per_vm_page;
> - pt_entry_t *pdep = pdp;
> - do {
> + pt_entry_t pde = (pt_entry_t ) pdpbase[l3i];
> + if (!(pde & INTEL_PTE_VALID))
> + continue;
> + pt_entry_t *pdebase = (pt_entry_t*) ptetokv(pde);
> + for (int l2i = 0; l2i < 512; l2i++) {
> +#else /* PAE */
> + pt_entry_t *pdebase = p->dirbase;
> + for (int l2i = 0; l2i < lin2pdenum(VM_MAX_USER_ADDRESS); l2i++) {
> +#endif /* PAE */
> + pt_entry_t pte = (pt_entry_t) pdebase[l2i];
> + if (!(pte & INTEL_PTE_VALID))
> + continue;
> +
> + pa = pte_to_pa(pte);
> + ptp = (pt_entry_t *)phystokv(pa);
> + eptp = ptp + NPTES*ptes_per_vm_page;
> +
> + /*
> + * If the pte page has any wired mappings, we cannot
> + * free it.
> + */
> + wired = 0;
> + {
> + pt_entry_t *ptep;
> + for (ptep = ptp; ptep < eptp; ptep++) {
> + if (*ptep & INTEL_PTE_WIRED) {
> + wired = 1;
> + break;
> + }
> + }
> + }
> + if (!wired) {
> + /*
> + * Remove the virtual addresses mapped by this pte page.
> + */
> + { /*XXX big hack*/
> + vm_offset_t va = pagenum2lin(l4i, l3i, l2i, 0);
> + if (p == kernel_pmap)
> + va = lintokv(va);
> + pmap_remove_range(p, va, ptp, eptp);
> + }
> +
> + /*
> + * Invalidate the page directory pointer.
> + */
> + {
> + int i = ptes_per_vm_page;
> + pt_entry_t *pdep = &pdebase[l2i];
> + do {
> #ifdef MACH_PV_PAGETABLES
> - unsigned long pte = *pdep;
> - void *ptable = (void*) ptetokv(pte);
> - if (!(hyp_mmu_update_pte(pa_to_ma(kvtophys((vm_offset_t)pdep++)), 0)))
> - panic("%s:%d could not clear pde %p\n",__FILE__,__LINE__,pdep-1);
> - if (!hyp_mmuext_op_mfn (MMUEXT_UNPIN_TABLE, kv_to_mfn(ptable)))
> - panic("couldn't unpin page %p(%lx)\n", ptable, (vm_offset_t) pa_to_ma(kvtophys((vm_offset_t)ptable)));
> - pmap_set_page_readwrite(ptable);
> + unsigned long pte = *pdep;
> + void *ptable = (void*) ptetokv(pte);
> + if (!(hyp_mmu_update_pte(pa_to_ma(kvtophys((vm_offset_t)pdep++)), 0)))
> + panic("%s:%d could not clear pde %p\n",__FILE__,__LINE__,pdep-1);
> + if (!hyp_mmuext_op_mfn (MMUEXT_UNPIN_TABLE, kv_to_mfn(ptable)))
> + panic("couldn't unpin page %p(%lx)\n", ptable, (vm_offset_t) pa_to_ma(kvtophys((vm_offset_t)ptable)));
> + pmap_set_page_readwrite(ptable);
> #else /* MACH_PV_PAGETABLES */
> - *pdep++ = 0;
> + *pdep++ = 0;
> #endif /* MACH_PV_PAGETABLES */
> - } while (--i > 0);
> - }
> + } while (--i > 0);
> + }
>
> - PMAP_READ_UNLOCK(p, spl);
> + PMAP_READ_UNLOCK(p, spl);
>
> - /*
> - * And free the pte page itself.
> - */
> - {
> - vm_page_t m;
> -
> - vm_object_lock(pmap_object);
> - assert(pa == (vm_offset_t) pa);
> - m = vm_page_lookup(pmap_object, pa);
> - if (m == VM_PAGE_NULL)
> - panic("pmap_collect: pte page not in object");
> - vm_page_lock_queues();
> - vm_page_free(m);
> - inuse_ptepages_count--;
> - vm_page_unlock_queues();
> - vm_object_unlock(pmap_object);
> - }
> + /*
> + * And free the pte page itself.
> + */
> + kmem_cache_free(&pt_cache, (vm_offset_t)ptetokv(pte));
>
> - PMAP_READ_LOCK(p, spl);
> - }
> - }
> - }
> + PMAP_READ_LOCK(p, spl);
> +
> + }
> + }
> #if PAE
> + // TODO check l2?
> + }
> +#ifdef __x86_64__
> + // TODO check l3?
> }
> -#endif
> +#endif /* __x86_64__ */
> +#endif /* PAE */
> +
> PMAP_UPDATE_TLBS(p, VM_MIN_USER_ADDRESS, VM_MAX_USER_ADDRESS);
>
> PMAP_READ_UNLOCK(p, spl);
> diff --git a/i386/intel/pmap.h b/i386/intel/pmap.h
> index 4c1b9bd5..5fc7fb25 100644
> --- a/i386/intel/pmap.h
> +++ b/i386/intel/pmap.h
> @@ -75,7 +75,6 @@ typedef phys_addr_t pt_entry_t;
> #define L4SHIFT 39 /* L4 shift */
> #define L4MASK 0x1ff /* mask for L4 index */
> #define PDPNUM_KERNEL (((VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS) >> PDPSHIFT) + 1)
> -#define PDPNUM_USER (((VM_MAX_USER_ADDRESS - VM_MIN_USER_ADDRESS) >> PDPSHIFT) + 1)
> #define PDPMASK 0x1ff /* mask for page directory pointer index */
> #else /* __x86_64__ */
> #define PDPNUM 4 /* number of page directory pointers */
> @@ -130,6 +129,26 @@ typedef phys_addr_t pt_entry_t;
> */
> #define pdenum2lin(a) ((vm_offset_t)(a) << PDESHIFT)
>
> +#if PAE
> +#ifdef __x86_64__
> +#define pagenum2lin(l4num, l3num, l2num, l1num) \
> + (((vm_offset_t)(l4num) << L4SHIFT) + \
> + ((vm_offset_t)(l3num) << PDPSHIFT) + \
> + ((vm_offset_t)(l2num) << PDESHIFT) + \
> + ((vm_offset_t)(l1num) << PTESHIFT))
> +#else /* __x86_64__ */
> +#define pagenum2lin(l4num, l3num, l2num, l1num) \
> + (((vm_offset_t)(l3num) << PDPSHIFT) + \
> + ((vm_offset_t)(l2num) << PDESHIFT) + \
> + ((vm_offset_t)(l1num) << PTESHIFT))
> +#endif
> +#else /* PAE */
> +#define pagenum2lin(l4num, l3num, l2num, l1num) \
> + (((vm_offset_t)(l2num) << PDESHIFT) + \
> + ((vm_offset_t)(l1num) << PTESHIFT))
> +#endif
> +
> +
> /*
> * Convert linear offset to page table index
> */
> --
> 2.30.2
>
>
--
Samuel
---
For an independent, transparent and rigorous evaluation!
I support the Inria Commission d'Évaluation.