From: Samuel Thibault
Subject: Re: [PATCH 5/9] use L4 page table directly on x86_64 instead of short-circuiting to pdpbase
Date: Sun, 12 Feb 2023 19:01:03 +0100
User-agent: NeoMutt/20170609 (1.8.3)
Applied, thanks!
Luca Dariz, on Sun, 12 Feb 2023 18:28:14 +0100, wrote:
> This is a preparation to run the kernel on high addresses, where the
> user vm region and the kernel vm region will use different L3 page
> tables.
>
> * i386/intel/pmap.c: on x86_64, retrieve the value of pdpbase from the
> L4 table, and add the pmap_ptp() helper (useful also for PAE).
> * i386/intel/pmap.h: remove pdpbase on x86_64.
> ---
> i386/intel/pmap.c | 97 ++++++++++++++++++++++++++++++++++++-----------
> i386/intel/pmap.h | 7 ++--
> 2 files changed, 78 insertions(+), 26 deletions(-)
>
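
For readers following the diff: lin2l4num() and lin2pdpnum() extract the
level-4 and level-3 table indices from a linear address, 9 bits each above
the 12-bit page offset in x86_64 4-level paging. A minimal standalone sketch
of that decomposition (the shift constants are the standard x86_64 ones and
the helpers merely mirror gnumach's macros; nothing here is copied from the
tree):

    #include <stdint.h>
    #include <stdio.h>

    /* Standard x86_64 4-level paging: a 12-bit page offset, then four
     * 9-bit indices selecting entries in the PT, PD, PDPT and PML4
     * (gnumach's "L4") tables, from bit 12 upward. */
    #define L4_SHIFT 39                  /* PML4 / L4 index */
    #define L3_SHIFT 30                  /* PDPT index */
    #define IDX_MASK 0x1ffUL             /* 9-bit table index */

    static unsigned lin2l4num(uint64_t a)  { return (a >> L4_SHIFT) & IDX_MASK; }
    static unsigned lin2pdpnum(uint64_t a) { return (a >> L3_SHIFT) & IDX_MASK; }

    int main(void)
    {
        /* An illustrative high kernel base; the real value of
         * VM_MIN_KERNEL_ADDRESS depends on the configuration. */
        uint64_t addr = 0xffffffff80000000ULL;
        printf("l4 index = %u, pdp index = %u\n",
               lin2l4num(addr), lin2pdpnum(addr));   /* prints 511, 510 */
        return 0;
    }

With a low kernel base the L4 index of VM_MIN_KERNEL_ADDRESS is 0, which is
why the old code could hardwire l4base[0]; once the kernel moves to high
addresses it lands in a different L4 slot than user space, so the slot has
to be computed.
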
> diff --git a/i386/intel/pmap.c b/i386/intel/pmap.c
> index 470be744..9e9f91db 100644
> --- a/i386/intel/pmap.c
> +++ b/i386/intel/pmap.c
> @@ -430,14 +430,11 @@ pt_entry_t *kernel_page_dir;
> static pmap_mapwindow_t mapwindows[PMAP_NMAPWINDOWS];
> def_simple_lock_data(static, pmapwindows_lock)
>
> +#ifdef PAE
> static inline pt_entry_t *
> -pmap_pde(const pmap_t pmap, vm_offset_t addr)
> +pmap_ptp(const pmap_t pmap, vm_offset_t addr)
> {
> - pt_entry_t *page_dir;
> - if (pmap == kernel_pmap)
> - addr = kvtolin(addr);
> -#if PAE
> - pt_entry_t *pdp_table, pdp, pde;
> + pt_entry_t *pdp_table, pdp;
> #ifdef __x86_64__
> pdp = pmap->l4base[lin2l4num(addr)];
> if ((pdp & INTEL_PTE_VALID) == 0)
> @@ -446,6 +443,19 @@ pmap_pde(const pmap_t pmap, vm_offset_t addr)
> #else /* __x86_64__ */
> pdp_table = pmap->pdpbase;
> #endif /* __x86_64__ */
> + return pdp_table;
> +}
> +#endif
> +
> +static inline pt_entry_t *
> +pmap_pde(const pmap_t pmap, vm_offset_t addr)
> +{
> + pt_entry_t *page_dir;
> + if (pmap == kernel_pmap)
> + addr = kvtolin(addr);
> +#if PAE
> + pt_entry_t *pdp_table, pde;
> + pdp_table = pmap_ptp(pmap, addr);
> pde = pdp_table[lin2pdpnum(addr)];
> if ((pde & INTEL_PTE_VALID) == 0)
> return PT_ENTRY_NULL;
> @@ -585,6 +595,7 @@ vm_offset_t pmap_map_bd(
> static void pmap_bootstrap_pae(void)
> {
> vm_offset_t addr;
> + pt_entry_t *pdp_kernel;
>
> #ifdef __x86_64__
> #ifdef MACH_HYP
> @@ -595,13 +606,15 @@ static void pmap_bootstrap_pae(void)
> memset(kernel_pmap->l4base, 0, INTEL_PGBYTES);
> #endif /* x86_64 */
>
> + // TODO: allocate only the PDPTE for kernel virtual space
> + // this means all directmap and the stupid limit above it
> init_alloc_aligned(PDPNUM * INTEL_PGBYTES, &addr);
> kernel_page_dir = (pt_entry_t*)phystokv(addr);
>
> - kernel_pmap->pdpbase = (pt_entry_t*)phystokv(pmap_grab_page());
> - memset(kernel_pmap->pdpbase, 0, INTEL_PGBYTES);
> + pdp_kernel = (pt_entry_t*)phystokv(pmap_grab_page());
> + memset(pdp_kernel, 0, INTEL_PGBYTES);
> for (int i = 0; i < PDPNUM; i++)
> - WRITE_PTE(&kernel_pmap->pdpbase[i],
> + WRITE_PTE(&pdp_kernel[i],
> pa_to_pte(_kvtophys((void *) kernel_page_dir
> + i * INTEL_PGBYTES))
> | INTEL_PTE_VALID
> @@ -611,10 +624,14 @@ static void pmap_bootstrap_pae(void)
> );
>
> #ifdef __x86_64__
> -	WRITE_PTE(&kernel_pmap->l4base[0], pa_to_pte(_kvtophys(kernel_pmap->pdpbase)) | INTEL_PTE_VALID | INTEL_PTE_WRITE);
> + /* only fill the kernel pdpte during bootstrap */
> + WRITE_PTE(&kernel_pmap->l4base[lin2l4num(VM_MIN_KERNEL_ADDRESS)],
> +		  pa_to_pte(_kvtophys(pdp_kernel)) | INTEL_PTE_VALID | INTEL_PTE_WRITE);
> #ifdef MACH_PV_PAGETABLES
> pmap_set_page_readonly_init(kernel_pmap->l4base);
> -#endif
> +#endif /* MACH_PV_PAGETABLES */
> +#else /* x86_64 */
> + kernel_pmap->pdpbase = pdp_kernel;
> #endif /* x86_64 */
> }
> #endif /* PAE */
> @@ -1243,7 +1260,7 @@ pmap_page_table_page_dealloc(vm_offset_t pa)
> */
> pmap_t pmap_create(vm_size_t size)
> {
> - pt_entry_t *page_dir[PDPNUM];
> + pt_entry_t *page_dir[PDPNUM], *pdp_kernel;
> int i;
> pmap_t p;
> pmap_statistics_t stats;
> @@ -1301,34 +1318,40 @@ pmap_t pmap_create(vm_size_t size)
> #endif /* MACH_PV_PAGETABLES */
>
> #if PAE
> - p->pdpbase = (pt_entry_t *) kmem_cache_alloc(&pdpt_cache);
> - if (p->pdpbase == NULL) {
> + pdp_kernel = (pt_entry_t *) kmem_cache_alloc(&pdpt_cache);
> + if (pdp_kernel == NULL) {
> for (i = 0; i < PDPNUM; i++)
> kmem_cache_free(&pd_cache, (vm_address_t) page_dir[i]);
> kmem_cache_free(&pmap_cache, (vm_address_t) p);
> return PMAP_NULL;
> }
>
> - memset(p->pdpbase, 0, INTEL_PGBYTES);
> + memset(pdp_kernel, 0, INTEL_PGBYTES);
> {
> for (i = 0; i < PDPNUM; i++)
> - WRITE_PTE(&p->pdpbase[i],
> + WRITE_PTE(&pdp_kernel[i],
> pa_to_pte(kvtophys((vm_offset_t) page_dir[i]))
> | INTEL_PTE_VALID
> #if (defined(__x86_64__) && !defined(MACH_HYP)) || defined(MACH_PV_PAGETABLES)
> | INTEL_PTE_WRITE
> #ifdef __x86_64__
> | INTEL_PTE_USER
> -#endif
> +#endif /* __x86_64__ */
> #endif
> );
> }
> #ifdef __x86_64__
> +	// TODO alloc only PDPTE for the user range VM_MIN_ADDRESS, VM_MAX_ADDRESS
> +	// and keep the same for kernel range, in l4 table we have different entries
> p->l4base = (pt_entry_t *) kmem_cache_alloc(&l4_cache);
> if (p->l4base == NULL)
> panic("pmap_create");
> memset(p->l4base, 0, INTEL_PGBYTES);
> -	WRITE_PTE(&p->l4base[0], pa_to_pte(kvtophys((vm_offset_t) p->pdpbase)) | INTEL_PTE_VALID | INTEL_PTE_WRITE | INTEL_PTE_USER);
> + WRITE_PTE(&p->l4base[lin2l4num(VM_MIN_KERNEL_ADDRESS)],
> +		  pa_to_pte(kvtophys((vm_offset_t) pdp_kernel)) | INTEL_PTE_VALID | INTEL_PTE_WRITE | INTEL_PTE_USER);
> +#if lin2l4num(VM_MIN_KERNEL_ADDRESS) != lin2l4num(VM_MAX_ADDRESS)
> +	// TODO kernel vm and user vm are not in the same l4 entry, so add the user one
> +#endif
> #ifdef MACH_PV_PAGETABLES
> // FIXME: use kmem_cache_alloc instead
> if (kmem_alloc_wired(kernel_map,
> @@ -1349,6 +1372,8 @@ pmap_t pmap_create(vm_size_t size)
> memset(p->user_l4base, 0, INTEL_PGBYTES);
> 	WRITE_PTE(&p->user_l4base[0], pa_to_pte(kvtophys((vm_offset_t) p->user_pdpbase)) | INTEL_PTE_VALID | INTEL_PTE_WRITE);
> #endif /* MACH_PV_PAGETABLES */
> +#else /* x86_64 */
> +	p->pdpbase = pdp_kernel;
> #endif /* x86_64 */
> #ifdef MACH_PV_PAGETABLES
> #ifdef __x86_64__
> @@ -1411,12 +1436,22 @@ void pmap_destroy(pmap_t p)
>
> #if PAE
> for (i = 0; i <= lin2pdpnum(LINEAR_MIN_KERNEL_ADDRESS); i++) {
> - free_all = i < lin2pdpnum(LINEAR_MIN_KERNEL_ADDRESS);
> - page_dir = (pt_entry_t *) ptetokv(p->pdpbase[i]);
> +#ifdef __x86_64__
> +#ifdef USER32
> + /* In this case we know we have one PDP for user space */
> +	    pt_entry_t *pdp = (pt_entry_t *) ptetokv(p->l4base[lin2l4num(VM_MIN_ADDRESS)]);
> #else
> +#error "TODO: does 64-bit userspace need more than 512G?"
> +#endif /* USER32 */
> + page_dir = (pt_entry_t *) ptetokv(pdp[i]);
> +#else /* __x86_64__ */
> + page_dir = (pt_entry_t *) ptetokv(p->pdpbase[i]);
> +#endif /* __x86_64__ */
> + free_all = i < lin2pdpnum(LINEAR_MIN_KERNEL_ADDRESS);
> +#else /* PAE */
> free_all = FALSE;
> page_dir = p->dirbase;
> -#endif
> +#endif /* PAE */
>
> #ifdef __x86_64__
> #warning FIXME 64bit need to free l3
> @@ -1464,14 +1499,20 @@ void pmap_destroy(pmap_t p)
> #endif /* __x86_64__ */
> pmap_set_page_readwrite(p->pdpbase);
> #endif /* MACH_PV_PAGETABLES */
> +
> #ifdef __x86_64__
> + kmem_cache_free(&pdpt_cache, (vm_offset_t) pmap_ptp(p, VM_MIN_ADDRESS));
> +#if lin2l4num(VM_MIN_KERNEL_ADDRESS) != lin2l4num(VM_MAX_ADDRESS)
> + // TODO kernel vm and user vm are not in the same l4 entry
> +#endif
> kmem_cache_free(&l4_cache, (vm_offset_t) p->l4base);
> #ifdef MACH_PV_PAGETABLES
> kmem_free(kernel_map, (vm_offset_t)p->user_l4base, INTEL_PGBYTES);
> kmem_free(kernel_map, (vm_offset_t)p->user_pdpbase, INTEL_PGBYTES);
> #endif /* MACH_PV_PAGETABLES */
> -#endif /* __x86_64__ */
> +#else /* __x86_64__ */
> kmem_cache_free(&pdpt_cache, (vm_offset_t) p->pdpbase);
> +#endif /* __x86_64__ */
> #endif /* PAE */
> kmem_cache_free(&pmap_cache, (vm_offset_t) p);
> }
> @@ -2404,8 +2445,18 @@ void pmap_collect(pmap_t p)
>
> #if PAE
> for (i = 0; i <= lin2pdpnum(LINEAR_MIN_KERNEL_ADDRESS); i++) {
> - free_all = i < lin2pdpnum(LINEAR_MIN_KERNEL_ADDRESS);
> +#ifdef __x86_64__
> +#ifdef USER32
> + /* In this case we know we have one PDP for user space */
> + pdp = (pt_entry_t *) ptetokv(p->l4base[lin2l4num(VM_MIN_ADDRESS)]);
> +#else
> +#error "TODO: does 64-bit userspace need more than 512G?"
> +#endif /* USER32 */
> + page_dir = (pt_entry_t *) ptetokv(pdp[i]);
> +#else /* __x86_64__ */
> page_dir = (pt_entry_t *) ptetokv(p->pdpbase[i]);
> +#endif /* __x86_64__ */
> + free_all = i < lin2pdpnum(LINEAR_MIN_KERNEL_ADDRESS);
> #else
> i = 0;
> free_all = FALSE;
> diff --git a/i386/intel/pmap.h b/i386/intel/pmap.h
> index 92247faa..1c6a0d30 100644
> --- a/i386/intel/pmap.h
> +++ b/i386/intel/pmap.h
> @@ -182,16 +182,17 @@ typedef volatile long	cpu_set;	/* set of CPUs - must be <= 32 */
> struct pmap {
> #if ! PAE
> pt_entry_t *dirbase; /* page directory table */
> -#else
> - pt_entry_t *pdpbase; /* page directory pointer table */
> -#endif /* ! PAE */
> +#else /* PAE */
> #ifdef __x86_64__
> pt_entry_t *l4base; /* l4 table */
> #ifdef MACH_HYP
> pt_entry_t *user_l4base; /* Userland l4 table */
> 	pt_entry_t *user_pdpbase;	/* Userland pdp table */
> #endif /* MACH_HYP */
> +#else /* x86_64 */
> + pt_entry_t *pdpbase; /* page directory pointer table */
> #endif /* x86_64 */
> +#endif /* PAE */
> int ref_count; /* reference count */
> decl_simple_lock_data(,lock)
> /* lock on map */
> --
> 2.30.2
>
>
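
To summarize the change, here is the lookup the new pmap_ptp() helper
performs, restated as a self-contained, compilable sketch; INTEL_PTE_VALID
and pte_to_kv() below are placeholders standing in for gnumach's real PTE
bit and ptetokv() macro, and the hard-coded shift repeats the standard L4
indexing from the sketch above:

    #include <stdint.h>

    typedef uint64_t pt_entry_t;

    #define INTEL_PTE_VALID 0x1             /* placeholder for the real bit */
    #define PT_ENTRY_NULL   ((pt_entry_t *) 0)

    struct pmap {
        pt_entry_t *l4base;                 /* L4 (PML4) table */
    };

    /* Placeholder for gnumach's ptetokv(): turn the physical address
     * stored in a PTE back into a kernel-virtual pointer. */
    extern pt_entry_t *pte_to_kv(pt_entry_t pte);

    /* Sketch of the new pmap_ptp(): find the L3 (PDP) table for addr by
     * indexing the L4 table, instead of short-circuiting through a
     * cached pmap->pdpbase pointer as the old x86_64 code did. */
    static pt_entry_t *
    pmap_ptp_sketch(const struct pmap *pmap, uint64_t addr)
    {
        pt_entry_t pdp = pmap->l4base[(addr >> 39) & 0x1ff];
        if ((pdp & INTEL_PTE_VALID) == 0)
            return PT_ENTRY_NULL;           /* no PDP mapped for this range */
        return pte_to_kv(pdp);
    }

pmap_pde() then indexes the returned PDP table with lin2pdpnum() exactly as
before; the only difference is that the PDP pointer is now read out of the
L4 entry each time rather than cached in the pmap.
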
--
Samuel
---
For an independent, transparent and rigorous evaluation!
I support Inria's Commission d'Évaluation.