qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Qemu-devel] [PATCH v3 2/5] intel-iommu: introduce Intel IOMMU (VT-d) emulation


From: Jan Kiszka
Subject: Re: [Qemu-devel] [PATCH v3 2/5] intel-iommu: introduce Intel IOMMU (VT-d) emulation
Date: Tue, 12 Aug 2014 09:34:30 +0200
User-agent: Mozilla/5.0 (X11; U; Linux i686 (x86_64); de; rv:1.8.1.12) Gecko/20080226 SUSE/2.0.0.12-1.1 Thunderbird/2.0.0.12 Mnenhy/0.7.5.666

On 2014-08-11 09:04, Le Tan wrote:
> Add support for emulating Intel IOMMU according to the VT-d specification for
> the q35 chipset machine. Implement the logics for DMAR (DMA remapping) without
> PASID support. The emulation supports register-based invalidation and primary
> fault logging.

Some arbitrary comments below (meaning I didn't read every line and
likely missed some things). In general, this looks and works pretty well!

> 
> Signed-off-by: Le Tan <address@hidden>
> ---
>  hw/i386/Makefile.objs          |    1 +
>  hw/i386/intel_iommu.c          | 1345 
> ++++++++++++++++++++++++++++++++++++++++
>  hw/i386/intel_iommu_internal.h |  345 +++++++++++
>  include/hw/i386/intel_iommu.h  |   90 +++
>  4 files changed, 1781 insertions(+)
>  create mode 100644 hw/i386/intel_iommu.c
>  create mode 100644 hw/i386/intel_iommu_internal.h
>  create mode 100644 include/hw/i386/intel_iommu.h
> 
> diff --git a/hw/i386/Makefile.objs b/hw/i386/Makefile.objs
> index 48014ab..6936111 100644
> --- a/hw/i386/Makefile.objs
> +++ b/hw/i386/Makefile.objs
> @@ -2,6 +2,7 @@ obj-$(CONFIG_KVM) += kvm/
>  obj-y += multiboot.o smbios.o
>  obj-y += pc.o pc_piix.o pc_q35.o
>  obj-y += pc_sysfw.o
> +obj-y += intel_iommu.o
>  obj-$(CONFIG_XEN) += ../xenpv/ xen/
>  
>  obj-y += kvmvapic.o
> diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
> new file mode 100644
> index 0000000..b3a4f78
> --- /dev/null
> +++ b/hw/i386/intel_iommu.c
> @@ -0,0 +1,1345 @@
> +/*
> + * QEMU emulation of an Intel IOMMU (VT-d)
> + *   (DMA Remapping device)
> + *
> + * Copyright (C) 2013 Knut Omang, Oracle <address@hidden>
> + * Copyright (C) 2014 Le Tan, <address@hidden>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> +
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> +
> + * You should have received a copy of the GNU General Public License along
> + * with this program; if not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#include "hw/sysbus.h"
> +#include "exec/address-spaces.h"
> +#include "intel_iommu_internal.h"
> +
> +
> +/*#define DEBUG_INTEL_IOMMU*/
> +#ifdef DEBUG_INTEL_IOMMU
> +enum {
> +    DEBUG_GENERAL, DEBUG_CSR, DEBUG_INV, DEBUG_MMU, DEBUG_FLOG,
> +};
> +#define VTD_DBGBIT(x)   (1 << DEBUG_##x)
> +static int vtd_dbgflags = VTD_DBGBIT(GENERAL) | VTD_DBGBIT(CSR) |
> +                          VTD_DBGBIT(FLOG);
> +
> +#define VTD_DPRINTF(what, fmt, ...) do { \
> +    if (vtd_dbgflags & VTD_DBGBIT(what)) { \
> +        fprintf(stderr, "(vtd)%s: " fmt "\n", __func__, \
> +                ## __VA_ARGS__); } \
> +    } while (0)
> +#else
> +#define VTD_DPRINTF(what, fmt, ...) do {} while (0)
> +#endif
> +
> +static inline void define_quad(IntelIOMMUState *s, hwaddr addr, uint64_t val,
> +                               uint64_t wmask, uint64_t w1cmask)

In general, don't declare functions inline needlessly. It makes sense
for trivial ones you export via a header, but even a 2- or 3-liner like
this can be more efficient as a stand-alone function. Anything bigger
definitely does not deserve that tag.

Inline is just a hint to the compiler anyway, so you can safely leave
it out for any almost-trivial static function.

> +{
> +    stq_le_p(&s->csr[addr], val);
> +    stq_le_p(&s->wmask[addr], wmask);
> +    stq_le_p(&s->w1cmask[addr], w1cmask);
> +}
> +
> +static inline void define_quad_wo(IntelIOMMUState *s, hwaddr addr,
> +                                  uint64_t mask)
> +{
> +    stq_le_p(&s->womask[addr], mask);
> +}
> +
> +static inline void define_long(IntelIOMMUState *s, hwaddr addr, uint32_t val,
> +                               uint32_t wmask, uint32_t w1cmask)
> +{
> +    stl_le_p(&s->csr[addr], val);
> +    stl_le_p(&s->wmask[addr], wmask);
> +    stl_le_p(&s->w1cmask[addr], w1cmask);
> +}
> +
> +static inline void define_long_wo(IntelIOMMUState *s, hwaddr addr,
> +                                  uint32_t mask)
> +{
> +    stl_le_p(&s->womask[addr], mask);
> +}
> +
> +/* "External" get/set operations */
> +static inline void set_quad(IntelIOMMUState *s, hwaddr addr, uint64_t val)
> +{
> +    uint64_t oldval = ldq_le_p(&s->csr[addr]);
> +    uint64_t wmask = ldq_le_p(&s->wmask[addr]);
> +    uint64_t w1cmask = ldq_le_p(&s->w1cmask[addr]);
> +    stq_le_p(&s->csr[addr],
> +             ((oldval & ~wmask) | (val & wmask)) & ~(w1cmask & val));
> +}
> +
> +static inline void set_long(IntelIOMMUState *s, hwaddr addr, uint32_t val)
> +{
> +    uint32_t oldval = ldl_le_p(&s->csr[addr]);
> +    uint32_t wmask = ldl_le_p(&s->wmask[addr]);
> +    uint32_t w1cmask = ldl_le_p(&s->w1cmask[addr]);
> +    stl_le_p(&s->csr[addr],
> +             ((oldval & ~wmask) | (val & wmask)) & ~(w1cmask & val));
> +}
> +
> +static inline uint64_t get_quad(IntelIOMMUState *s, hwaddr addr)
> +{
> +    uint64_t val = ldq_le_p(&s->csr[addr]);
> +    uint64_t womask = ldq_le_p(&s->womask[addr]);
> +    return val & ~womask;
> +}
> +
> +
> +static inline uint32_t get_long(IntelIOMMUState *s, hwaddr addr)
> +{
> +    uint32_t val = ldl_le_p(&s->csr[addr]);
> +    uint32_t womask = ldl_le_p(&s->womask[addr]);
> +    return val & ~womask;
> +}
> +
> +/* "Internal" get/set operations */
> +static inline uint64_t get_quad_raw(IntelIOMMUState *s, hwaddr addr)
> +{
> +    return ldq_le_p(&s->csr[addr]);
> +}
> +
> +static inline uint32_t get_long_raw(IntelIOMMUState *s, hwaddr addr)
> +{
> +    return ldl_le_p(&s->csr[addr]);
> +}
> +
> +static inline void set_quad_raw(IntelIOMMUState *s, hwaddr addr, uint64_t 
> val)
> +{
> +    stq_le_p(&s->csr[addr], val);
> +}
> +
> +static inline uint32_t set_clear_mask_long(IntelIOMMUState *s, hwaddr addr,
> +                                           uint32_t clear, uint32_t mask)
> +{
> +    uint32_t new_val = (ldl_le_p(&s->csr[addr]) & ~clear) | mask;
> +    stl_le_p(&s->csr[addr], new_val);
> +    return new_val;
> +}
> +
> +static inline uint64_t set_clear_mask_quad(IntelIOMMUState *s, hwaddr addr,
> +                                           uint64_t clear, uint64_t mask)
> +{
> +    uint64_t new_val = (ldq_le_p(&s->csr[addr]) & ~clear) | mask;
> +    stq_le_p(&s->csr[addr], new_val);
> +    return new_val;
> +}
> +
> +/* Given the reg addr of both the message data and address, generate an
> + * interrupt via MSI.
> + */
> +static void vtd_generate_interrupt(IntelIOMMUState *s, hwaddr mesg_addr_reg,
> +                                   hwaddr mesg_data_reg)
> +{
> +    hwaddr addr;
> +    uint32_t data;
> +
> +    assert(mesg_data_reg < DMAR_REG_SIZE);
> +    assert(mesg_addr_reg < DMAR_REG_SIZE);
> +
> +    addr = get_long_raw(s, mesg_addr_reg);
> +    data = get_long_raw(s, mesg_data_reg);
> +
> +    VTD_DPRINTF(FLOG, "msi: addr 0x%"PRIx64 " data 0x%"PRIx32, addr, data);
> +    stl_le_phys(&address_space_memory, addr, data);
> +}
> +
> +/* Generate a fault event to software via MSI if conditions are met.
> + * Notice that the value of FSTS_REG being passed to it should be the one
> + * before any update.
> + */
> +static void vtd_generate_fault_event(IntelIOMMUState *s, uint32_t pre_fsts)
> +{
> +    /* Check if there are any previously reported interrupt conditions */
> +    if (pre_fsts & VTD_FSTS_PPF || pre_fsts & VTD_FSTS_PFO ||
> +        pre_fsts & VTD_FSTS_IQE) {
> +        VTD_DPRINTF(FLOG, "there are previous interrupt conditions "
> +                    "to be serviced by software, fault event is not 
> generated "
> +                    "(FSTS_REG 0x%"PRIx32 ")", pre_fsts);
> +        return;
> +    }
> +    set_clear_mask_long(s, DMAR_FECTL_REG, 0, VTD_FECTL_IP);
> +    if (get_long_raw(s, DMAR_FECTL_REG) & VTD_FECTL_IM) {
> +        /* Interrupt Mask */
> +        VTD_DPRINTF(FLOG, "Interrupt Mask set, fault event is not 
> generated");
> +    } else {
> +        /* generate interrupt */
> +        vtd_generate_interrupt(s, DMAR_FEADDR_REG, DMAR_FEDATA_REG);
> +        set_clear_mask_long(s, DMAR_FECTL_REG, VTD_FECTL_IP, 0);
> +    }
> +}
> +
> +/* Check if the Fault (F) field of the Fault Recording Register referenced by
> + * @index is Set.
> + */
> +static inline bool is_frcd_set(IntelIOMMUState *s, uint16_t index)
> +{
> +    /* Each reg is 128-bit */
> +    hwaddr addr = DMAR_FRCD_REG_OFFSET + (((uint64_t)index) << 4);
> +    addr += 8; /* Access the high 64-bit half */
> +
> +    assert(index < DMAR_FRCD_REG_NR);
> +
> +    return get_quad_raw(s, addr) & VTD_FRCD_F;
> +}
> +
> +/* Update the PPF field of Fault Status Register.
> + * Should be called whenever change the F field of any fault recording
> + * registers.
> + */
> +static inline void update_fsts_ppf(IntelIOMMUState *s)
> +{
> +    uint32_t i;
> +    uint32_t ppf_mask = 0;
> +
> +    for (i = 0; i < DMAR_FRCD_REG_NR; i++) {
> +        if (is_frcd_set(s, i)) {
> +            ppf_mask = VTD_FSTS_PPF;
> +            break;
> +        }
> +    }
> +    set_clear_mask_long(s, DMAR_FSTS_REG, VTD_FSTS_PPF, ppf_mask);
> +    VTD_DPRINTF(FLOG, "set PPF of FSTS_REG to %d", ppf_mask ? 1 : 0);
> +}
> +
> +static inline void set_frcd_and_update_ppf(IntelIOMMUState *s, uint16_t 
> index)
> +{
> +    /* Each reg is 128-bit */
> +    hwaddr addr = DMAR_FRCD_REG_OFFSET + (((uint64_t)index) << 4);
> +    addr += 8; /* Access the high 64-bit half */
> +
> +    assert(index < DMAR_FRCD_REG_NR);
> +
> +    set_clear_mask_quad(s, addr, 0, VTD_FRCD_F);
> +    update_fsts_ppf(s);
> +}
> +
> +/* Must not update F field now, should be done later */
> +static void record_frcd(IntelIOMMUState *s, uint16_t index, uint16_t 
> source_id,
> +                        hwaddr addr, VTDFaultReason fault, bool is_write)
> +{
> +    uint64_t hi = 0, lo;
> +    hwaddr frcd_reg_addr = DMAR_FRCD_REG_OFFSET + (((uint64_t)index) << 4);
> +
> +    assert(index < DMAR_FRCD_REG_NR);
> +
> +    lo = VTD_FRCD_FI(addr);
> +    hi = VTD_FRCD_SID(source_id) | VTD_FRCD_FR(fault);
> +    if (!is_write) {
> +        hi |= VTD_FRCD_T;
> +    }
> +
> +    set_quad_raw(s, frcd_reg_addr, lo);
> +    set_quad_raw(s, frcd_reg_addr + 8, hi);
> +    VTD_DPRINTF(FLOG, "record to FRCD_REG #%"PRIu16 ": hi 0x%"PRIx64
> +                ", lo 0x%"PRIx64, index, hi, lo);
> +}
> +
> +/* Try to collapse multiple pending faults from the same requester */
> +static inline bool try_collapse_fault(IntelIOMMUState *s, uint16_t source_id)
> +{
> +    uint32_t i;
> +    uint64_t frcd_reg;
> +    hwaddr addr = DMAR_FRCD_REG_OFFSET + 8; /* The high 64-bit half */
> +
> +    for (i = 0; i < DMAR_FRCD_REG_NR; i++) {
> +        frcd_reg = get_quad_raw(s, addr);
> +        VTD_DPRINTF(FLOG, "frcd_reg #%d 0x%"PRIx64, i, frcd_reg);
> +        if ((frcd_reg & VTD_FRCD_F) &&
> +            ((frcd_reg & VTD_FRCD_SID_MASK) == source_id)) {
> +            return true;
> +        }
> +        addr += 16; /* 128-bit for each */
> +    }
> +
> +    return false;
> +}
> +
> +/* Log and report an DMAR (address translation) fault to software */
> +static void vtd_report_dmar_fault(IntelIOMMUState *s, uint16_t source_id,
> +                                  hwaddr addr, VTDFaultReason fault,
> +                                  bool is_write)
> +{
> +    uint32_t fsts_reg = get_long_raw(s, DMAR_FSTS_REG);
> +
> +    assert(fault < VTD_FR_MAX);
> +
> +    if (fault == VTD_FR_RESERVED_ERR) {
> +        /* This is not a normal fault reason case. Drop it. */
> +        return;
> +    }
> +
> +    VTD_DPRINTF(FLOG, "sid 0x%"PRIx16 ", fault %d, addr 0x%"PRIx64
> +                ", is_write %d", source_id, fault, addr, is_write);
> +
> +    /* Check PFO field in FSTS_REG */
> +    if (fsts_reg & VTD_FSTS_PFO) {
> +        VTD_DPRINTF(FLOG, "new fault is not recorded due to "
> +                    "Primary Fault Overflow");
> +        return;
> +    }
> +
> +    /* Compression of multiple faults from the same requester */
> +    if (try_collapse_fault(s, source_id)) {
> +        VTD_DPRINTF(FLOG, "new fault is not recorded due to "
> +                    "compression of faults");
> +        return;
> +    }
> +
> +    /* Check next_frcd_reg to see whether it is overflow now */
> +    if (is_frcd_set(s, s->next_frcd_reg)) {
> +        VTD_DPRINTF(FLOG, "Primary Fault Overflow and "
> +                    "new fault is not recorded, set PFO field");
> +        set_clear_mask_long(s, DMAR_FSTS_REG, 0, VTD_FSTS_PFO);
> +        return;
> +    }
> +
> +    record_frcd(s, s->next_frcd_reg, source_id, addr, fault, is_write);
> +
> +    if (fsts_reg & VTD_FSTS_PPF) {
> +        /* There are already one or more pending faults */
> +        VTD_DPRINTF(FLOG, "there are pending faults already, "
> +                    "fault event is not generated");
> +        set_frcd_and_update_ppf(s, s->next_frcd_reg);
> +        s->next_frcd_reg++;
> +        if (s->next_frcd_reg == DMAR_FRCD_REG_NR) {
> +            s->next_frcd_reg = 0;
> +        }
> +    } else {
> +        set_clear_mask_long(s, DMAR_FSTS_REG, VTD_FSTS_FRI_MASK,
> +                            VTD_FSTS_FRI(s->next_frcd_reg));
> +        set_frcd_and_update_ppf(s, s->next_frcd_reg); /* It will also set 
> PPF */
> +        s->next_frcd_reg++;
> +        if (s->next_frcd_reg == DMAR_FRCD_REG_NR) {
> +            s->next_frcd_reg = 0;
> +        }
> +
> +        /* This case actually cause the PPF to be Set.
> +         * So generate fault event (interrupt).
> +         */
> +         vtd_generate_fault_event(s, fsts_reg);
> +    }
> +}
> +
> +static inline bool root_entry_present(VTDRootEntry *root)
> +{
> +    return root->val & VTD_ROOT_ENTRY_P;
> +}
> +
> +static int get_root_entry(IntelIOMMUState *s, uint32_t index, VTDRootEntry 
> *re)
> +{
> +    dma_addr_t addr;
> +
> +    assert(index < VTD_ROOT_ENTRY_NR);
> +
> +    addr = s->root + index * sizeof(*re);
> +
> +    if (dma_memory_read(&address_space_memory, addr, re, sizeof(*re))) {
> +        VTD_DPRINTF(GENERAL, "error: fail to access root-entry at 0x%"PRIx64
> +                    " + %"PRIu32, s->root, index);
> +        re->val = 0;
> +        return -VTD_FR_ROOT_TABLE_INV;
> +    }
> +
> +    re->val = le64_to_cpu(re->val);
> +    return VTD_FR_RESERVED;

This looks a bit weird, here and elsewhere: VTD_FR_RESERVED is a
reserved error code in the VT-d specification, and it's 0. OK, but here
the meaning of returning 0 is actually "everything went well". So either
provide a constant that documents this or simply use 0 consistently to
declare the absence of errors.

> +}
> +
> +static inline bool context_entry_present(VTDContextEntry *context)
> +{
> +    return context->lo & VTD_CONTEXT_ENTRY_P;
> +}
> +
> +static int get_context_entry_from_root(VTDRootEntry *root, uint32_t index,
> +                                       VTDContextEntry *ce)
> +{
> +    dma_addr_t addr;
> +
> +    if (!root_entry_present(root)) {
> +        ce->lo = 0;
> +        ce->hi = 0;
> +        VTD_DPRINTF(GENERAL, "error: root-entry is not present");
> +        return -VTD_FR_ROOT_ENTRY_P;
> +    }
> +
> +    assert(index < VTD_CONTEXT_ENTRY_NR);
> +
> +    addr = (root->val & VTD_ROOT_ENTRY_CTP) + index * sizeof(*ce);
> +
> +    if (dma_memory_read(&address_space_memory, addr, ce, sizeof(*ce))) {
> +        VTD_DPRINTF(GENERAL, "error: fail to access context-entry at 
> 0x%"PRIx64
> +                    " + %"PRIu32,
> +                    (uint64_t)(root->val & VTD_ROOT_ENTRY_CTP), index);
> +        ce->lo = 0;
> +        ce->hi = 0;
> +        return -VTD_FR_CONTEXT_TABLE_INV;
> +    }
> +
> +    ce->lo = le64_to_cpu(ce->lo);
> +    ce->hi = le64_to_cpu(ce->hi);
> +    return VTD_FR_RESERVED;
> +}
> +
> +static inline dma_addr_t get_slpt_base_from_context(VTDContextEntry *ce)
> +{
> +    return ce->lo & VTD_CONTEXT_ENTRY_SLPTPTR;
> +}
> +
> +/* The shift of an addr for a certain level of paging structure */
> +static inline uint32_t slpt_level_shift(uint32_t level)
> +{
> +    return VTD_PAGE_SHIFT_4K + (level - 1) * VTD_SL_LEVEL_BITS;
> +}
> +
> +static inline uint64_t get_slpte_addr(uint64_t slpte)
> +{
> +    return slpte & VTD_SL_PT_BASE_ADDR_MASK;
> +}
> +
> +/* Whether the pte indicates the address of the page frame */
> +static inline bool is_last_slpte(uint64_t slpte, uint32_t level)
> +{
> +    return level == VTD_SL_PT_LEVEL || (slpte & VTD_SL_PT_PAGE_SIZE_MASK);
> +}
> +
> +/* Get the content of a spte located in @address@hidden */
> +static inline uint64_t get_slpte(dma_addr_t base_addr, uint32_t index)
> +{
> +    uint64_t slpte;
> +
> +    assert(index < VTD_SL_PT_ENTRY_NR);
> +
> +    if (dma_memory_read(&address_space_memory,
> +                        base_addr + index * sizeof(slpte), &slpte,
> +                        sizeof(slpte))) {
> +        slpte = (uint64_t)-1;
> +        return slpte;
> +    }
> +
> +    slpte = le64_to_cpu(slpte);
> +    return slpte;
> +}
> +
> +/* Given a gpa and the level of paging structure, return the offset of 
> current
> + * level.
> + */
> +static inline uint32_t gpa_level_offset(uint64_t gpa, uint32_t level)
> +{
> +    return (gpa >> slpt_level_shift(level)) & ((1ULL << VTD_SL_LEVEL_BITS) - 
> 1);
> +}
> +
> +/* Check Capability Register to see if the @level of page-table is supported 
> */
> +static inline bool is_level_supported(IntelIOMMUState *s, uint32_t level)
> +{
> +    return VTD_CAP_SAGAW_MASK & s->cap &
> +           (1ULL << (level - 2 + VTD_CAP_SAGAW_SHIFT));
> +}
> +
> +/* Get the page-table level that hardware should use for the second-level
> + * page-table walk from the Address Width field of context-entry.
> + */
> +static inline uint32_t get_level_from_context_entry(VTDContextEntry *ce)
> +{
> +    return 2 + (ce->hi & VTD_CONTEXT_ENTRY_AW);
> +}
> +
> +static inline uint32_t get_agaw_from_context_entry(VTDContextEntry *ce)
> +{
> +    return 30 + (ce->hi & VTD_CONTEXT_ENTRY_AW) * 9;
> +}
> +
> +static const uint64_t paging_entry_rsvd_field[] = {
> +    [0] = ~0ULL,
> +    /* For not large page */
> +    [1] = 0x800ULL | ~(VTD_HAW_MASK | VTD_SL_IGN_COM),
> +    [2] = 0x800ULL | ~(VTD_HAW_MASK | VTD_SL_IGN_COM),
> +    [3] = 0x800ULL | ~(VTD_HAW_MASK | VTD_SL_IGN_COM),
> +    [4] = 0x880ULL | ~(VTD_HAW_MASK | VTD_SL_IGN_COM),
> +    /* For large page */
> +    [5] = 0x800ULL | ~(VTD_HAW_MASK | VTD_SL_IGN_COM),
> +    [6] = 0x1ff800ULL | ~(VTD_HAW_MASK | VTD_SL_IGN_COM),
> +    [7] = 0x3ffff800ULL | ~(VTD_HAW_MASK | VTD_SL_IGN_COM),
> +    [8] = 0x880ULL | ~(VTD_HAW_MASK | VTD_SL_IGN_COM),
> +};
> +
> +static inline bool slpte_nonzero_rsvd(uint64_t slpte, uint32_t level)
> +{
> +    if (slpte & VTD_SL_PT_PAGE_SIZE_MASK) {
> +        /* Maybe large page */
> +        return slpte & paging_entry_rsvd_field[level + 4];
> +    } else {
> +        return slpte & paging_entry_rsvd_field[level];
> +    }
> +}
> +
> +/* Given the @gpa, get relevant @slptep. @slpte_level will be the last level
> + * of the translation, can be used for deciding the size of large page.
> + * @slptep and @slpte_level will not be touched if error happens.
> + */
> +static int gpa_to_slpte(VTDContextEntry *ce, uint64_t gpa, bool is_write,
> +                        uint64_t *slptep, uint32_t *slpte_level)
> +{
> +    dma_addr_t addr = get_slpt_base_from_context(ce);
> +    uint32_t level = get_level_from_context_entry(ce);
> +    uint32_t offset;
> +    uint64_t slpte;
> +    uint32_t ce_agaw = get_agaw_from_context_entry(ce);
> +    uint64_t access_right_check;
> +
> +    /* Check if @gpa is above 2^X-1, where X is the minimum of MGAW in 
> CAP_REG
> +     * and AW in context-entry.
> +     */
> +    if (gpa & ~((1ULL << MIN(ce_agaw, VTD_MGAW)) - 1)) {
> +        VTD_DPRINTF(GENERAL, "error: gpa 0x%"PRIx64 " exceeds limits", gpa);
> +        return -VTD_FR_ADDR_BEYOND_MGAW;
> +    }
> +
> +    /* FIXME: what is the Atomics request here? */
> +    access_right_check = is_write ? VTD_SL_W : VTD_SL_R;
> +
> +    while (true) {
> +        offset = gpa_level_offset(gpa, level);
> +        slpte = get_slpte(addr, offset);
> +
> +        if (slpte == (uint64_t)-1) {
> +            VTD_DPRINTF(GENERAL, "error: fail to access second-level paging "
> +                        "entry at level %"PRIu32 " for gpa 0x%"PRIx64,
> +                        level, gpa);
> +            if (level == get_level_from_context_entry(ce)) {
> +                /* Invalid programming of context-entry */
> +                return -VTD_FR_CONTEXT_ENTRY_INV;
> +            } else {
> +                return -VTD_FR_PAGING_ENTRY_INV;
> +            }
> +        }
> +        if (!(slpte & access_right_check)) {
> +            VTD_DPRINTF(GENERAL, "error: lack of %s permission for "
> +                        "gpa 0x%"PRIx64 " slpte 0x%"PRIx64,
> +                        (is_write ? "write" : "read"), gpa, slpte);
> +            return is_write ? -VTD_FR_WRITE : -VTD_FR_READ;
> +        }
> +        if (slpte_nonzero_rsvd(slpte, level)) {
> +            VTD_DPRINTF(GENERAL, "error: non-zero reserved field in second "
> +                        "level paging entry level %"PRIu32 " slpte 
> 0x%"PRIx64,
> +                        level, slpte);
> +            return -VTD_FR_PAGING_ENTRY_RSVD;
> +        }
> +
> +        if (is_last_slpte(slpte, level)) {
> +            *slptep = slpte;
> +            *slpte_level = level;
> +            return VTD_FR_RESERVED;
> +        }
> +        addr = get_slpte_addr(slpte);
> +        level--;
> +    }
> +}
> +
> +/* Map a device to its corresponding domain (context-entry). @ce will be set
> + * to Zero if error happens while accessing the context-entry.
> + */
> +static inline int dev_to_context_entry(IntelIOMMUState *s, int bus_num,
> +                                       int devfn, VTDContextEntry *ce)
> +{
> +    VTDRootEntry re;
> +    int ret_fr;
> +
> +    assert(0 <= bus_num && bus_num < VTD_PCI_BUS_MAX);
> +    assert(0 <= devfn && devfn < VTD_PCI_SLOT_MAX * VTD_PCI_FUNC_MAX);

Use the proper type for bus_num and devfn (uint8_t), and you can get rid
of these assertions. I know that the PCI layer improperly uses int
for them in many places, but you don't need to copy this.

> +
> +    ret_fr = get_root_entry(s, bus_num, &re);
> +    if (ret_fr) {
> +        ce->hi = 0;
> +        ce->lo = 0;

That's a bit too defensive: the context entry is simply
invalid when such a function returns an error, no? You can document that
in the function description.

> +        return ret_fr;
> +    }
> +
> +    if (!root_entry_present(&re)) {
> +        VTD_DPRINTF(GENERAL, "error: root-entry #%d is not present", 
> bus_num);
> +        ce->hi = 0;
> +        ce->lo = 0;
> +        return -VTD_FR_ROOT_ENTRY_P;
> +    } else if (re.rsvd || (re.val & VTD_ROOT_ENTRY_RSVD)) {
> +        VTD_DPRINTF(GENERAL, "error: non-zero reserved field in root-entry "
> +                    "hi 0x%"PRIx64 " lo 0x%"PRIx64, re.rsvd, re.val);
> +        ce->hi = 0;
> +        ce->lo = 0;
> +        return -VTD_FR_ROOT_ENTRY_RSVD;
> +    }
> +
> +    ret_fr = get_context_entry_from_root(&re, devfn, ce);
> +    if (ret_fr) {
> +        return ret_fr;
> +    }
> +
> +    if (!context_entry_present(ce)) {
> +        VTD_DPRINTF(GENERAL,
> +                    "error: context-entry #%d(bus #%d) is not present", 
> devfn,
> +                    bus_num);
> +        return -VTD_FR_CONTEXT_ENTRY_P;
> +    } else if ((ce->hi & VTD_CONTEXT_ENTRY_RSVD_HI) ||
> +               (ce->lo & VTD_CONTEXT_ENTRY_RSVD_LO)) {
> +        VTD_DPRINTF(GENERAL,
> +                    "error: non-zero reserved field in context-entry "
> +                    "hi 0x%"PRIx64 " lo 0x%"PRIx64, ce->hi, ce->lo);
> +        return -VTD_FR_CONTEXT_ENTRY_RSVD;
> +    }
> +
> +    /* Check if the programming of context-entry is valid */
> +    if (!is_level_supported(s, get_level_from_context_entry(ce))) {
> +        VTD_DPRINTF(GENERAL, "error: unsupported Address Width value in "
> +                    "context-entry hi 0x%"PRIx64 " lo 0x%"PRIx64,
> +                    ce->hi, ce->lo);
> +        return -VTD_FR_CONTEXT_ENTRY_INV;
> +    } else if (ce->lo & VTD_CONTEXT_ENTRY_TT) {
> +        VTD_DPRINTF(GENERAL, "error: unsupported Translation Type in "
> +                    "context-entry hi 0x%"PRIx64 " lo 0x%"PRIx64,
> +                    ce->hi, ce->lo);
> +        return -VTD_FR_CONTEXT_ENTRY_INV;
> +    }
> +
> +    return VTD_FR_RESERVED;
> +}
> +
> +static inline uint16_t make_source_id(int bus_num, int devfn)
> +{
> +    return ((bus_num & 0xffUL) << 8) | (devfn & 0xffUL);
> +}
> +
> +static const bool qualified_faults[] = {
> +    [VTD_FR_RESERVED] = false,
> +    [VTD_FR_ROOT_ENTRY_P] = false,
> +    [VTD_FR_CONTEXT_ENTRY_P] = true,
> +    [VTD_FR_CONTEXT_ENTRY_INV] = true,
> +    [VTD_FR_ADDR_BEYOND_MGAW] = true,
> +    [VTD_FR_WRITE] = true,
> +    [VTD_FR_READ] = true,
> +    [VTD_FR_PAGING_ENTRY_INV] = true,
> +    [VTD_FR_ROOT_TABLE_INV] = false,
> +    [VTD_FR_CONTEXT_TABLE_INV] = false,
> +    [VTD_FR_ROOT_ENTRY_RSVD] = false,
> +    [VTD_FR_PAGING_ENTRY_RSVD] = true,
> +    [VTD_FR_CONTEXT_ENTRY_TT] = true,
> +    [VTD_FR_RESERVED_ERR] = false,
> +    [VTD_FR_MAX] = false,
> +};
> +
> +/* To see if a fault condition is "qualified", which is reported to software
> + * only if the FPD field in the context-entry used to process the faulting
> + * request is 0.
> + */
> +static inline bool is_qualified_fault(VTDFaultReason fault)
> +{
> +    return qualified_faults[fault];
> +}
> +
> +static inline bool is_interrupt_addr(hwaddr addr)
> +{
> +    return VTD_INTERRUPT_ADDR_FIRST <= addr && addr <= 
> VTD_INTERRUPT_ADDR_LAST;
> +}
> +
> +/* Map dev to context-entry then do a paging-structures walk to do a iommu
> + * translation.
> + * @bus_num: The bus number
> + * @devfn: The devfn, which is the  combined of device and function number
> + * @is_write: The access is a write operation
> + * @entry: IOMMUTLBEntry that contain the addr to be translated and result
> + */
> +static void iommu_translate(IntelIOMMUState *s, int bus_num, int devfn,
> +                            hwaddr addr, bool is_write, IOMMUTLBEntry *entry)
> +{
> +    VTDContextEntry ce;
> +    uint64_t slpte;
> +    uint32_t level;
> +    uint64_t page_mask;
> +    uint16_t source_id = make_source_id(bus_num, devfn);
> +    int ret_fr;
> +    bool is_fpd_set = false;
> +
> +    /* Check if the request is in interrupt address range */
> +    if (is_interrupt_addr(addr)) {
> +        if (is_write) {
> +            /* FIXME: since we don't know the length of the access here, we
> +             * treat Non-DWORD length write requests without PASID as
> +             * interrupt requests, too. Withoud interrupt remapping support,
> +             * we just use 1:1 mapping.
> +             */
> +            VTD_DPRINTF(MMU, "write request to interrupt address "
> +                        "gpa 0x%"PRIx64, addr);
> +            entry->iova = addr & VTD_PAGE_MASK_4K;
> +            entry->translated_addr = addr & VTD_PAGE_MASK_4K;
> +            entry->addr_mask = ~VTD_PAGE_MASK_4K;
> +            entry->perm = IOMMU_WO;
> +            return;
> +        } else {
> +            VTD_DPRINTF(GENERAL, "error: read request from interrupt address 
> "
> +                        "gpa 0x%"PRIx64, addr);
> +            vtd_report_dmar_fault(s, source_id, addr, VTD_FR_READ, is_write);
> +            return;
> +        }
> +    }
> +
> +    ret_fr = dev_to_context_entry(s, bus_num, devfn, &ce);
> +    is_fpd_set = ce.lo & VTD_CONTEXT_ENTRY_FPD;
> +    if (ret_fr) {
> +        ret_fr = -ret_fr;
> +        if (is_fpd_set && is_qualified_fault(ret_fr)) {
> +            VTD_DPRINTF(FLOG, "fault processing is disabled for DMA requests 
> "
> +                        "through this context-entry (with FPD Set)");
> +        } else {
> +            vtd_report_dmar_fault(s, source_id, addr, ret_fr, is_write);
> +        }
> +        return;
> +    }
> +
> +    ret_fr = gpa_to_slpte(&ce, addr, is_write, &slpte, &level);
> +    if (ret_fr) {
> +        ret_fr = -ret_fr;
> +        if (is_fpd_set && is_qualified_fault(ret_fr)) {
> +            VTD_DPRINTF(FLOG, "fault processing is disabled for DMA requests 
> "
> +                        "through this context-entry (with FPD Set)");
> +        } else {
> +            vtd_report_dmar_fault(s, source_id, addr, ret_fr, is_write);
> +        }
> +        return;
> +    }
> +
> +    if (level == VTD_SL_PT_LEVEL) {
> +        /* 4-KB page */
> +        page_mask = VTD_PAGE_MASK_4K;
> +    } else if (level == VTD_SL_PDP_LEVEL) {
> +        /* 1-GB page */
> +        page_mask = VTD_PAGE_MASK_1G;
> +    } else {
> +        /* 2-MB page */
> +        page_mask = VTD_PAGE_MASK_2M;
> +    }

You don't declare 1G and 2M pages as supported in caps.sllps, do you?
I'm wondering if we should have some device property for intel-iommu
that enables all available features, even if our emulated chipset never
supported them (I guess Q35 had no support - my newer QM57 does not
have it either). Then you could do "-global intel-iommu.full_featured=on"
and have all those nice things available.

> +
> +    entry->iova = addr & page_mask;
> +    entry->translated_addr = get_slpte_addr(slpte) & page_mask;
> +    entry->addr_mask = ~page_mask;
> +    entry->perm = slpte & VTD_SL_RW_MASK;
> +}
> +
> +static void vtd_root_table_setup(IntelIOMMUState *s)
> +{
> +    s->root = get_quad_raw(s, DMAR_RTADDR_REG);
> +    s->root_extended = s->root & VTD_RTADDR_RTT;
> +    s->root &= VTD_RTADDR_ADDR_MASK;
> +
> +    VTD_DPRINTF(CSR, "root_table addr 0x%"PRIx64 " %s", s->root,
> +                (s->root_extended ? "(extended)" : ""));
> +}
> +
> +/* Context-cache invalidation
> + * Returns the Context Actual Invalidation Granularity.
> + * @val: the content of the CCMD_REG
> + */
> +static uint64_t vtd_context_cache_invalidate(IntelIOMMUState *s, uint64_t 
> val)
> +{
> +    uint64_t caig;
> +    uint64_t type = val & VTD_CCMD_CIRG_MASK;
> +
> +    switch (type) {
> +    case VTD_CCMD_GLOBAL_INVL:
> +        VTD_DPRINTF(INV, "Global invalidation request");
> +        caig = VTD_CCMD_GLOBAL_INVL_A;
> +        break;
> +
> +    case VTD_CCMD_DOMAIN_INVL:
> +        VTD_DPRINTF(INV, "Domain-selective invalidation request");
> +        caig = VTD_CCMD_DOMAIN_INVL_A;
> +        break;
> +
> +    case VTD_CCMD_DEVICE_INVL:
> +        VTD_DPRINTF(INV, "Domain-selective invalidation request");
> +        caig = VTD_CCMD_DEVICE_INVL_A;
> +        break;
> +
> +    default:
> +        VTD_DPRINTF(GENERAL,
> +                    "error: wrong context-cache invalidation granularity");
> +        caig = 0;
> +    }
> +
> +    return caig;
> +}
> +
> +/* Flush IOTLB
> + * Returns the IOTLB Actual Invalidation Granularity.
> + * @val: the content of the IOTLB_REG
> + */
> +static uint64_t vtd_iotlb_flush(IntelIOMMUState *s, uint64_t val)
> +{
> +    uint64_t iaig;
> +    uint64_t type = val & VTD_TLB_FLUSH_GRANU_MASK;
> +
> +    switch (type) {
> +    case VTD_TLB_GLOBAL_FLUSH:
> +        VTD_DPRINTF(INV, "Global IOTLB flush");
> +        iaig = VTD_TLB_GLOBAL_FLUSH_A;
> +        break;
> +
> +    case VTD_TLB_DSI_FLUSH:
> +        VTD_DPRINTF(INV, "Domain-selective IOTLB flush");
> +        iaig = VTD_TLB_DSI_FLUSH_A;
> +        break;
> +
> +    case VTD_TLB_PSI_FLUSH:
> +        VTD_DPRINTF(INV, "Page-selective-within-domain IOTLB flush");
> +        iaig = VTD_TLB_PSI_FLUSH_A;
> +        break;
> +
> +    default:
> +        VTD_DPRINTF(GENERAL, "error: wrong iotlb flush granularity");
> +        iaig = 0;
> +    }
> +
> +    return iaig;
> +}
> +
> +/* Set Root Table Pointer */
> +static void handle_gcmd_srtp(IntelIOMMUState *s)
> +{
> +    VTD_DPRINTF(CSR, "set Root Table Pointer");
> +
> +    vtd_root_table_setup(s);
> +    /* Ok - report back to driver */
> +    set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_RTPS);
> +}
> +
> +/* Handle Translation Enable/Disable */
> +static void handle_gcmd_te(IntelIOMMUState *s, bool en)
> +{
> +    VTD_DPRINTF(CSR, "Translation Enable %s", (en ? "on" : "off"));
> +
> +    if (en) {
> +        s->dmar_enabled = true;
> +        /* Ok - report back to driver */
> +        set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_TES);
> +    } else {
> +        s->dmar_enabled = false;
> +
> +        /* Clear the index of Fault Recording Register */
> +        s->next_frcd_reg = 0;
> +        /* Ok - report back to driver */
> +        set_clear_mask_long(s, DMAR_GSTS_REG, VTD_GSTS_TES, 0);
> +    }
> +}
> +
> +/* Handle write to Global Command Register */
> +static void handle_gcmd_write(IntelIOMMUState *s)
> +{
> +    uint32_t status = get_long_raw(s, DMAR_GSTS_REG);
> +    uint32_t val = get_long_raw(s, DMAR_GCMD_REG);
> +    uint32_t changed = status ^ val;
> +
> +    VTD_DPRINTF(CSR, "value 0x%"PRIx32 " status 0x%"PRIx32, val, status);
> +    if (changed & VTD_GCMD_TE) {
> +        /* Translation enable/disable */
> +        handle_gcmd_te(s, val & VTD_GCMD_TE);
> +    }
> +    if (val & VTD_GCMD_SRTP) {
> +        /* Set/update the root-table pointer */
> +        handle_gcmd_srtp(s);
> +    }
> +}
> +
> +/* Handle write to Context Command Register */
> +static void handle_ccmd_write(IntelIOMMUState *s)
> +{
> +    uint64_t ret;
> +    uint64_t val = get_quad_raw(s, DMAR_CCMD_REG);
> +
> +    /* Context-cache invalidation request */
> +    if (val & VTD_CCMD_ICC) {
> +        ret = vtd_context_cache_invalidate(s, val);
> +
> +        /* Invalidation completed. Change something to show */
> +        set_clear_mask_quad(s, DMAR_CCMD_REG, VTD_CCMD_ICC, 0ULL);
> +        ret = set_clear_mask_quad(s, DMAR_CCMD_REG, VTD_CCMD_CAIG_MASK, ret);
> +        VTD_DPRINTF(INV, "CCMD_REG write-back val: 0x%"PRIx64, ret);
> +    }
> +}
> +
> +/* Handle write to IOTLB Invalidation Register */
> +static void handle_iotlb_write(IntelIOMMUState *s)
> +{
> +    uint64_t ret;
> +    uint64_t val = get_quad_raw(s, DMAR_IOTLB_REG);
> +
> +    /* IOTLB invalidation request */
> +    if (val & VTD_TLB_IVT) {
> +        ret = vtd_iotlb_flush(s, val);
> +
> +        /* Invalidation completed. Change something to show */
> +        set_clear_mask_quad(s, DMAR_IOTLB_REG, VTD_TLB_IVT, 0ULL);
> +        ret = set_clear_mask_quad(s, DMAR_IOTLB_REG,
> +                                  VTD_TLB_FLUSH_GRANU_MASK_A, ret);
> +        VTD_DPRINTF(INV, "IOTLB_REG write-back val: 0x%"PRIx64, ret);
> +    }
> +}
> +
> +static inline void handle_fsts_write(IntelIOMMUState *s)
> +{
> +    uint32_t fsts_reg = get_long_raw(s, DMAR_FSTS_REG);
> +    uint32_t fectl_reg = get_long_raw(s, DMAR_FECTL_REG);
> +    uint32_t status_fields = VTD_FSTS_PFO | VTD_FSTS_PPF | VTD_FSTS_IQE;
> +
> +    if ((fectl_reg & VTD_FECTL_IP) && !(fsts_reg & status_fields)) {
> +        set_clear_mask_long(s, DMAR_FECTL_REG, VTD_FECTL_IP, 0);
> +        VTD_DPRINTF(FLOG, "all pending interrupt conditions serviced, clear "
> +                    "IP field of FECTL_REG");
> +    }
> +}
> +
> +static inline void handle_fectl_write(IntelIOMMUState *s)
> +{
> +    uint32_t fectl_reg;
> +    /* When software clears the IM field, check the IP field. But do we
> +     * need to compare the old value and the new value to conclude that
> +     * software clears the IM field? Or just check if the IM field is zero?
> +     */
> +    fectl_reg = get_long_raw(s, DMAR_FECTL_REG);
> +    if ((fectl_reg & VTD_FECTL_IP) && !(fectl_reg & VTD_FECTL_IM)) {
> +        vtd_generate_interrupt(s, DMAR_FEADDR_REG, DMAR_FEDATA_REG);
> +        set_clear_mask_long(s, DMAR_FECTL_REG, VTD_FECTL_IP, 0);
> +        VTD_DPRINTF(FLOG, "IM field is cleared, generate "
> +                    "fault event interrupt");
> +    }
> +}
> +
> +static uint64_t vtd_mem_read(void *opaque, hwaddr addr, unsigned size)
> +{
> +    IntelIOMMUState *s = opaque;
> +    uint64_t val;
> +
> +    if (addr + size > DMAR_REG_SIZE) {
> +        VTD_DPRINTF(GENERAL, "error: addr outside region: max 0x%"PRIx64
> +                    ", got 0x%"PRIx64 " %d",
> +                    (uint64_t)DMAR_REG_SIZE, addr, size);
> +        return (uint64_t)-1;
> +    }
> +
> +    assert(size == 4 || size == 8);

You already declare in the ops that you only support 4 and 8 byte
accesses, no?

> +
> +    switch (addr) {
> +    /* Root Table Address Register, 64-bit */
> +    case DMAR_RTADDR_REG:
> +        if (size == 4) {
> +            val = s->root & ((1ULL << 32) - 1);
> +        } else {
> +            val = s->root;
> +        }
> +        break;
> +
> +    case DMAR_RTADDR_REG_HI:
> +        assert(size == 4);
> +        val = s->root >> 32;
> +        break;
> +
> +    default:
> +        if (size == 4) {
> +            val = get_long(s, addr);
> +        } else {
> +            val = get_quad(s, addr);
> +        }
> +    }
> +
> +    VTD_DPRINTF(CSR, "addr 0x%"PRIx64 " size %d val 0x%"PRIx64,
> +                addr, size, val);
> +    return val;
> +}
> +
> +static void vtd_mem_write(void *opaque, hwaddr addr,
> +                          uint64_t val, unsigned size)
> +{
> +    IntelIOMMUState *s = opaque;
> +
> +    if (addr + size > DMAR_REG_SIZE) {
> +        VTD_DPRINTF(GENERAL, "error: addr outside region: max 0x%"PRIx64
> +                    ", got 0x%"PRIx64 " %d",
> +                    (uint64_t)DMAR_REG_SIZE, addr, size);
> +        return;
> +    }
> +
> +    assert(size == 4 || size == 8);
> +
> +    switch (addr) {
> +    /* Global Command Register, 32-bit */
> +    case DMAR_GCMD_REG:
> +        VTD_DPRINTF(CSR, "DMAR_GCMD_REG write addr 0x%"PRIx64
> +                    ", size %d, val 0x%"PRIx64, addr, size, val);
> +        set_long(s, addr, val);
> +        handle_gcmd_write(s);
> +        break;
> +
> +    /* Context Command Register, 64-bit */
> +    case DMAR_CCMD_REG:
> +        VTD_DPRINTF(CSR, "DMAR_CCMD_REG write addr 0x%"PRIx64
> +                    ", size %d, val 0x%"PRIx64, addr, size, val);
> +        if (size == 4) {
> +            set_long(s, addr, val);
> +        } else {
> +            set_quad(s, addr, val);
> +            handle_ccmd_write(s);
> +        }
> +        break;
> +
> +    case DMAR_CCMD_REG_HI:
> +        VTD_DPRINTF(CSR, "DMAR_CCMD_REG_HI write addr 0x%"PRIx64
> +                    ", size %d, val 0x%"PRIx64, addr, size, val);
> +        assert(size == 4);
> +        set_long(s, addr, val);
> +        handle_ccmd_write(s);
> +        break;
> +
> +
> +    /* IOTLB Invalidation Register, 64-bit */
> +    case DMAR_IOTLB_REG:
> +        VTD_DPRINTF(INV, "DMAR_IOTLB_REG write addr 0x%"PRIx64
> +                    ", size %d, val 0x%"PRIx64, addr, size, val);
> +        if (size == 4) {
> +            set_long(s, addr, val);
> +        } else {
> +            set_quad(s, addr, val);
> +            handle_iotlb_write(s);
> +        }
> +        break;
> +
> +    case DMAR_IOTLB_REG_HI:
> +        VTD_DPRINTF(INV, "DMAR_IOTLB_REG_HI write addr 0x%"PRIx64
> +                    ", size %d, val 0x%"PRIx64, addr, size, val);
> +        assert(size == 4);
> +        set_long(s, addr, val);
> +        handle_iotlb_write(s);
> +        break;
> +
> +    /* Fault Status Register, 32-bit */
> +    case DMAR_FSTS_REG:
> +        VTD_DPRINTF(FLOG, "DMAR_FSTS_REG write addr 0x%"PRIx64
> +                    ", size %d, val 0x%"PRIx64, addr, size, val);
> +        assert(size == 4);
> +        set_long(s, addr, val);
> +        handle_fsts_write(s);
> +        break;
> +
> +    /* Fault Event Control Register, 32-bit */
> +    case DMAR_FECTL_REG:
> +        VTD_DPRINTF(FLOG, "DMAR_FECTL_REG write addr 0x%"PRIx64
> +                    ", size %d, val 0x%"PRIx64, addr, size, val);
> +        assert(size == 4);
> +        set_long(s, addr, val);
> +        handle_fectl_write(s);
> +        break;
> +
> +    /* Fault Event Data Register, 32-bit */
> +    case DMAR_FEDATA_REG:
> +        VTD_DPRINTF(FLOG, "DMAR_FEDATA_REG write addr 0x%"PRIx64
> +                    ", size %d, val 0x%"PRIx64, addr, size, val);
> +        assert(size == 4);
> +        set_long(s, addr, val);
> +        break;
> +
> +    /* Fault Event Address Register, 32-bit */
> +    case DMAR_FEADDR_REG:
> +        VTD_DPRINTF(FLOG, "DMAR_FEADDR_REG write addr 0x%"PRIx64
> +                    ", size %d, val 0x%"PRIx64, addr, size, val);
> +        assert(size == 4);
> +        set_long(s, addr, val);
> +        break;
> +
> +    /* Fault Event Upper Address Register, 32-bit */
> +    case DMAR_FEUADDR_REG:
> +        VTD_DPRINTF(FLOG, "DMAR_FEUADDR_REG write addr 0x%"PRIx64
> +                    ", size %d, val 0x%"PRIx64, addr, size, val);
> +        assert(size == 4);
> +        set_long(s, addr, val);
> +        break;
> +
> +    /* Protected Memory Enable Register, 32-bit */
> +    case DMAR_PMEN_REG:
> +        VTD_DPRINTF(CSR, "DMAR_PMEN_REG write addr 0x%"PRIx64
> +                    ", size %d, val 0x%"PRIx64, addr, size, val);
> +        assert(size == 4);
> +        set_long(s, addr, val);
> +        break;
> +
> +
> +    /* Root Table Address Register, 64-bit */
> +    case DMAR_RTADDR_REG:
> +        VTD_DPRINTF(CSR, "DMAR_RTADDR_REG write addr 0x%"PRIx64
> +                    ", size %d, val 0x%"PRIx64, addr, size, val);
> +        if (size == 4) {
> +            set_long(s, addr, val);
> +        } else {
> +            set_quad(s, addr, val);
> +        }
> +        break;
> +
> +    case DMAR_RTADDR_REG_HI:
> +        VTD_DPRINTF(CSR, "DMAR_RTADDR_REG_HI write addr 0x%"PRIx64
> +                    ", size %d, val 0x%"PRIx64, addr, size, val);
> +        assert(size == 4);
> +        set_long(s, addr, val);
> +        break;
> +
> +    /* Fault Recording Registers, 128-bit */
> +    case DMAR_FRCD_REG_0_0:
> +        VTD_DPRINTF(FLOG, "DMAR_FRCD_REG_0_0 write addr 0x%"PRIx64
> +                    ", size %d, val 0x%"PRIx64, addr, size, val);
> +        if (size == 4) {
> +            set_long(s, addr, val);
> +        } else {
> +            set_quad(s, addr, val);
> +        }
> +        break;
> +
> +    case DMAR_FRCD_REG_0_1:
> +        VTD_DPRINTF(FLOG, "DMAR_FRCD_REG_0_1 write addr 0x%"PRIx64
> +                    ", size %d, val 0x%"PRIx64, addr, size, val);
> +        assert(size == 4);
> +        set_long(s, addr, val);
> +        break;
> +
> +    case DMAR_FRCD_REG_0_2:
> +        VTD_DPRINTF(FLOG, "DMAR_FRCD_REG_0_2 write addr 0x%"PRIx64
> +                    ", size %d, val 0x%"PRIx64, addr, size, val);
> +        if (size == 4) {
> +            set_long(s, addr, val);
> +        } else {
> +            set_quad(s, addr, val);
> +            /* May clear bit 127 (Fault), update PPF */
> +            update_fsts_ppf(s);
> +        }
> +        break;
> +
> +    case DMAR_FRCD_REG_0_3:
> +        VTD_DPRINTF(FLOG, "DMAR_FRCD_REG_0_3 write addr 0x%"PRIx64
> +                    ", size %d, val 0x%"PRIx64, addr, size, val);
> +        assert(size == 4);
> +        set_long(s, addr, val);
> +        /* May clear bit 127 (Fault), update PPF */
> +        update_fsts_ppf(s);
> +        break;
> +
> +    default:
> +        VTD_DPRINTF(GENERAL, "error: unhandled reg write addr 0x%"PRIx64
> +                    ", size %d, val 0x%"PRIx64, addr, size, val);
> +        if (size == 4) {
> +            set_long(s, addr, val);
> +        } else {
> +            set_quad(s, addr, val);
> +        }
> +    }
> +
> +}
> +
> +static IOMMUTLBEntry vtd_iommu_translate(MemoryRegion *iommu, hwaddr addr,
> +                                         bool is_write)
> +{
> +    VTDAddressSpace *vtd_as = container_of(iommu, VTDAddressSpace, iommu);
> +    IntelIOMMUState *s = vtd_as->iommu_state;
> +    int bus_num = vtd_as->bus_num;
> +    int devfn = vtd_as->devfn;
> +    IOMMUTLBEntry ret = {
> +        .target_as = &address_space_memory,
> +        .iova = addr,
> +        .translated_addr = 0,
> +        .addr_mask = ~(hwaddr)0,
> +        .perm = IOMMU_NONE,
> +    };
> +
> +    if (!s->dmar_enabled) {
> +        /* DMAR disabled, passthrough, use 4k-page*/
> +        ret.iova = addr & VTD_PAGE_MASK_4K;
> +        ret.translated_addr = addr & VTD_PAGE_MASK_4K;
> +        ret.addr_mask = ~VTD_PAGE_MASK_4K;
> +        ret.perm = IOMMU_RW;
> +        return ret;
> +    }
> +
> +    iommu_translate(s, bus_num, devfn, addr, is_write, &ret);
> +
> +    VTD_DPRINTF(MMU,
> +                "bus %d slot %d func %d devfn %d gpa %"PRIx64 " hpa %"PRIx64,
> +                bus_num, VTD_PCI_SLOT(devfn), VTD_PCI_FUNC(devfn), devfn, addr,
> +                ret.translated_addr);
> +    return ret;
> +}
> +
> +static const VMStateDescription vtd_vmstate = {
> +    .name = "iommu_intel",
> +    .version_id = 1,
> +    .minimum_version_id = 1,
> +    .minimum_version_id_old = 1,
> +    .fields = (VMStateField[]) {
> +        VMSTATE_UINT8_ARRAY(csr, IntelIOMMUState, DMAR_REG_SIZE),
> +        VMSTATE_END_OF_LIST()
> +    }
> +};

Did you test migration? I suppose not. :)

Background: you mirror several register states into IntelIOMMUState
fields, I guess to make them handier to use. However, those need to
be updated on vmload. And there is surely more internal state that
has to be migrated as well, e.g. the currently active root pointer.

I would suggest either reviewing and fixing this or leaving migration
support out for now (".unmigratable = 1").

> +
> +static const MemoryRegionOps vtd_mem_ops = {
> +    .read = vtd_mem_read,
> +    .write = vtd_mem_write,
> +    .endianness = DEVICE_LITTLE_ENDIAN,
> +    .impl = {
> +        .min_access_size = 4,
> +        .max_access_size = 8,
> +    },
> +    .valid = {
> +        .min_access_size = 4,
> +        .max_access_size = 8,
> +    },
> +};
> +
> +static Property iommu_properties[] = {
> +    DEFINE_PROP_UINT32("version", IntelIOMMUState, version, 0),
> +    DEFINE_PROP_END_OF_LIST(),
> +};
> +
> +/* Do the real initialization. It will also be called when reset, so pay
> + * attention when adding new initialization stuff.
> + */
> +static void do_vtd_init(IntelIOMMUState *s)
> +{
> +    memset(s->csr, 0, DMAR_REG_SIZE);
> +    memset(s->wmask, 0, DMAR_REG_SIZE);
> +    memset(s->w1cmask, 0, DMAR_REG_SIZE);
> +    memset(s->womask, 0, DMAR_REG_SIZE);
> +
> +    s->iommu_ops.translate = vtd_iommu_translate;
> +    s->root = 0;
> +    s->root_extended = false;
> +    s->dmar_enabled = false;
> +    s->iq_head = 0;
> +    s->iq_tail = 0;
> +    s->iq = 0;
> +    s->iq_size = 0;
> +    s->qi_enabled = false;
> +    s->iq_last_desc_type = VTD_INV_DESC_NONE;
> +    s->next_frcd_reg = 0;
> +
> +    /* b.0:2 = 6: Number of domains supported: 64K using 16 bit ids
> +     * b.3   = 0: Advanced fault logging not supported
> +     * b.4   = 0: Required write buffer flushing not supported
> +     * b.5   = 0: Protected low memory region not supported
> +     * b.6   = 0: Protected high memory region not supported
> +     * b.8:12 = 2: SAGAW(Supported Adjusted Guest Address Widths), 39-bit,
> +     *             3-level page-table
> +     * b.16:21 = 38: MGAW(Maximum Guest Address Width) = 39
> +     * b.22 = 0: ZLR(Zero Length Read) zero length DMA read requests
> +     *           to write-only pages not supported
> +     * b.24:33 = 34: FRO(Fault-recording Register offset)
> +     * b.54 = 0: DWD(Write Draining), draining of write requests not supported
> +     * b.55 = 0: DRD(Read Draining), draining of read requests not supported
> +     */

I think this level of documentation is a bit overkill. You already
document the register layout implicitly by defining the constants.
Applies elsewhere, too.

> +    s->cap = VTD_CAP_FRO | VTD_CAP_NFR | VTD_CAP_ND | VTD_CAP_MGAW |
> +             VTD_CAP_SAGAW;
> +
> +    /* b.1 = 0: QI(Queued Invalidation support) not supported
> +     * b.2 = 0: DT(Device-TLB support) not supported
> +     * b.3 = 0: IR(Interrupt Remapping support) not supported
> +     * b.4 = 0: EIM(Extended Interrupt Mode) not supported
> +     * b.8:17 = 15: IRO(IOTLB Register Offset)
> +     * b.20:23 = 0: MHMV(Maximum Handle Mask Value) not valid
> +     */
> +    s->ecap = VTD_ECAP_IRO;
> +
> +    /* Define registers with default values and bit semantics */
> +    define_long(s, DMAR_VER_REG, 0x10UL, 0, 0);  /* set MAX = 1, RO */
> +    define_quad(s, DMAR_CAP_REG, s->cap, 0, 0);
> +    define_quad(s, DMAR_ECAP_REG, s->ecap, 0, 0);
> +    define_long(s, DMAR_GCMD_REG, 0, 0xff800000UL, 0);
> +    define_long_wo(s, DMAR_GCMD_REG, 0xff800000UL);
> +    define_long(s, DMAR_GSTS_REG, 0, 0, 0); /* All bits RO, default 0 */
> +    define_quad(s, DMAR_RTADDR_REG, 0, 0xfffffffffffff000ULL, 0);
> +    define_quad(s, DMAR_CCMD_REG, 0, 0xe0000003ffffffffULL, 0);
> +    define_quad_wo(s, DMAR_CCMD_REG, 0x3ffff0000ULL);
> +
> +    /* Advanced Fault Logging not supported */
> +    define_long(s, DMAR_FSTS_REG, 0, 0, 0x11UL);
> +    define_long(s, DMAR_FECTL_REG, 0x80000000UL, 0x80000000UL, 0);
> +    define_long(s, DMAR_FEDATA_REG, 0, 0x0000ffffUL, 0); /* 15:0 RW */
> +    define_long(s, DMAR_FEADDR_REG, 0, 0xfffffffcUL, 0); /* 31:2 RW */
> +
> +    /* Treated as RsvdZ when EIM in ECAP_REG is not supported
> +     * define_long(s, DMAR_FEUADDR_REG, 0, 0xffffffffUL, 0);
> +     */
> +    define_long(s, DMAR_FEUADDR_REG, 0, 0, 0);
> +
> +    /* Treated as RO for implementations that PLMR and PHMR fields reported
> +     * as Clear in the CAP_REG.
> +     * define_long(s, DMAR_PMEN_REG, 0, 0x80000000UL, 0);
> +     */
> +    define_long(s, DMAR_PMEN_REG, 0, 0, 0);
> +
> +    /* IOTLB registers */
> +    define_quad(s, DMAR_IOTLB_REG, 0, 0Xb003ffff00000000ULL, 0);
> +    define_quad(s, DMAR_IVA_REG, 0, 0xfffffffffffff07fULL, 0);
> +    define_quad_wo(s, DMAR_IVA_REG, 0xfffffffffffff07fULL);
> +
> +    /* Fault Recording Registers, 128-bit */
> +    define_quad(s, DMAR_FRCD_REG_0_0, 0, 0, 0);
> +    define_quad(s, DMAR_FRCD_REG_0_2, 0, 0, 0x8000000000000000ULL);
> +}
> +
> +/* Reset function of QOM
> + * Should not reset address_spaces when reset

What does "should not" mean here? Is it an open todo?

> + */
> +static void vtd_reset(DeviceState *dev)
> +{
> +    IntelIOMMUState *s = INTEL_IOMMU_DEVICE(dev);
> +
> +    VTD_DPRINTF(GENERAL, "");
> +    do_vtd_init(s);
> +}
> +
> +/* Initialization function of QOM */
> +static void vtd_realize(DeviceState *dev, Error **errp)
> +{
> +    IntelIOMMUState *s = INTEL_IOMMU_DEVICE(dev);
> +
> +    VTD_DPRINTF(GENERAL, "");
> +    memset(s->address_spaces, 0, sizeof(s->address_spaces));
> +    memory_region_init_io(&s->csrmem, OBJECT(s), &vtd_mem_ops, s,
> +                          "intel_iommu", DMAR_REG_SIZE);
> +    sysbus_init_mmio(SYS_BUS_DEVICE(s), &s->csrmem);
> +    do_vtd_init(s);
> +}
> +
> +static void vtd_class_init(ObjectClass *klass, void *data)
> +{
> +    DeviceClass *dc = DEVICE_CLASS(klass);
> +
> +    dc->reset = vtd_reset;
> +    dc->realize = vtd_realize;
> +    dc->vmsd = &vtd_vmstate;
> +    dc->props = iommu_properties;
> +}
> +
> +static const TypeInfo vtd_info = {
> +    .name          = TYPE_INTEL_IOMMU_DEVICE,
> +    .parent        = TYPE_SYS_BUS_DEVICE,
> +    .instance_size = sizeof(IntelIOMMUState),
> +    .class_init    = vtd_class_init,
> +};
> +
> +static void vtd_register_types(void)
> +{
> +    VTD_DPRINTF(GENERAL, "");
> +    type_register_static(&vtd_info);
> +}
> +
> +type_init(vtd_register_types)
> diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
> new file mode 100644
> index 0000000..7bc679a
> --- /dev/null
> +++ b/hw/i386/intel_iommu_internal.h
> @@ -0,0 +1,345 @@
> +/*
> + * QEMU emulation of an Intel IOMMU (VT-d)
> + *   (DMA Remapping device)
> + *
> + * Copyright (C) 2013 Knut Omang, Oracle <address@hidden>
> + * Copyright (C) 2014 Le Tan, <address@hidden>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> +
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> +
> + * You should have received a copy of the GNU General Public License along
> + * with this program; if not, see <http://www.gnu.org/licenses/>.
> + *
> + * Lots of defines copied from kernel/include/linux/intel-iommu.h:
> + *   Copyright (C) 2006-2008 Intel Corporation
> + *   Author: Ashok Raj <address@hidden>
> + *   Author: Anil S Keshavamurthy <address@hidden>
> + *
> + */
> +
> +#ifndef HW_I386_INTEL_IOMMU_INTERNAL_H
> +#define HW_I386_INTEL_IOMMU_INTERNAL_H
> +#include "hw/i386/intel_iommu.h"
> +
> +/*
> + * Intel IOMMU register specification
> + */
> +#define DMAR_VER_REG    0x0 /* Arch version supported by this IOMMU */
> +#define DMAR_CAP_REG    0x8 /* Hardware supported capabilities */
> +#define DMAR_CAP_REG_HI 0xc /* High 32-bit of DMAR_CAP_REG */
> +#define DMAR_ECAP_REG   0x10    /* Extended capabilities supported */
> +#define DMAR_ECAP_REG_HI    0X14
> +#define DMAR_GCMD_REG   0x18    /* Global command register */
> +#define DMAR_GSTS_REG   0x1c    /* Global status register */
> +#define DMAR_RTADDR_REG 0x20    /* Root entry table */
> +#define DMAR_RTADDR_REG_HI  0X24
> +#define DMAR_CCMD_REG   0x28  /* Context command reg */
> +#define DMAR_CCMD_REG_HI    0x2c
> +#define DMAR_FSTS_REG   0x34  /* Fault Status register */
> +#define DMAR_FECTL_REG  0x38 /* Fault control register */
> +#define DMAR_FEDATA_REG 0x3c    /* Fault event interrupt data register */
> +#define DMAR_FEADDR_REG 0x40    /* Fault event interrupt addr register */
> +#define DMAR_FEUADDR_REG    0x44   /* Upper address register */
> +#define DMAR_AFLOG_REG  0x58 /* Advanced Fault control */
> +#define DMAR_AFLOG_REG_HI   0X5c
> +#define DMAR_PMEN_REG   0x64  /* Enable Protected Memory Region */
> +#define DMAR_PLMBASE_REG    0x68    /* PMRR Low addr */
> +#define DMAR_PLMLIMIT_REG 0x6c  /* PMRR low limit */
> +#define DMAR_PHMBASE_REG 0x70   /* pmrr high base addr */
> +#define DMAR_PHMBASE_REG_HI 0X74
> +#define DMAR_PHMLIMIT_REG 0x78  /* pmrr high limit */
> +#define DMAR_PHMLIMIT_REG_HI 0x7c
> +#define DMAR_IQH_REG    0x80   /* Invalidation queue head register */
> +#define DMAR_IQH_REG_HI 0X84
> +#define DMAR_IQT_REG    0x88   /* Invalidation queue tail register */
> +#define DMAR_IQT_REG_HI 0X8c
> +#define DMAR_IQ_SHIFT   4 /* Invalidation queue head/tail shift */
> +#define DMAR_IQA_REG    0x90   /* Invalidation queue addr register */
> +#define DMAR_IQA_REG_HI 0x94
> +#define DMAR_ICS_REG    0x9c   /* Invalidation complete status register */
> +#define DMAR_IRTA_REG   0xb8    /* Interrupt remapping table addr register */
> +#define DMAR_IRTA_REG_HI    0xbc

Please align all those constants:

#define CONSTANT                        0x1234
#define CONSTANT_WITH_LONGER_NAME       0x5678

> +
> +#define DMAR_IECTL_REG  0xa0    /* Invalidation event control register */
> +#define DMAR_IEDATA_REG 0xa4    /* Invalidation event data register */
> +#define DMAR_IEADDR_REG 0xa8    /* Invalidation event address register */
> +#define DMAR_IEUADDR_REG 0xac    /* Invalidation event address register */
> +#define DMAR_PQH_REG    0xc0    /* Page request queue head register */
> +#define DMAR_PQH_REG_HI 0xc4
> +#define DMAR_PQT_REG    0xc8    /* Page request queue tail register*/
> +#define DMAR_PQT_REG_HI     0xcc
> +#define DMAR_PQA_REG    0xd0    /* Page request queue address register */
> +#define DMAR_PQA_REG_HI 0xd4
> +#define DMAR_PRS_REG    0xdc    /* Page request status register */
> +#define DMAR_PECTL_REG  0xe0    /* Page request event control register */
> +#define DMAR_PEDATA_REG 0xe4    /* Page request event data register */
> +#define DMAR_PEADDR_REG 0xe8    /* Page request event address register */
> +#define DMAR_PEUADDR_REG  0xec  /* Page event upper address register */
> +#define DMAR_MTRRCAP_REG 0x100  /* MTRR capability register */
> +#define DMAR_MTRRCAP_REG_HI 0x104
> +#define DMAR_MTRRDEF_REG 0x108  /* MTRR default type register */
> +#define DMAR_MTRRDEF_REG_HI 0x10c
> +
> +/* IOTLB */
> +#define DMAR_IOTLB_REG_OFFSET 0xf0  /* Offset to the IOTLB registers */
> +#define DMAR_IVA_REG DMAR_IOTLB_REG_OFFSET  /* Invalidate Address Register */
> +#define DMAR_IVA_REG_HI (DMAR_IVA_REG + 4)
> +/* IOTLB Invalidate Register */
> +#define DMAR_IOTLB_REG (DMAR_IOTLB_REG_OFFSET + 0x8)
> +#define DMAR_IOTLB_REG_HI (DMAR_IOTLB_REG + 4)
> +
> +/* FRCD */
> +#define DMAR_FRCD_REG_OFFSET 0x220 /* Offset to the Fault Recording Registers */
> +/* NOTICE: If you change the DMAR_FRCD_REG_NR, please remember to change the
> + * DMAR_REG_SIZE in include/hw/i386/intel_iommu.h.
> + * #define DMAR_REG_SIZE   (DMAR_FRCD_REG_OFFSET + 16 * DMAR_FRCD_REG_NR)
> + */
> +#define DMAR_FRCD_REG_NR 1ULL /* Num of Fault Recording Registers */
> +
> +#define DMAR_FRCD_REG_0_0    0x220 /* The 0th Fault Recording Register */
> +#define DMAR_FRCD_REG_0_1    0x224
> +#define DMAR_FRCD_REG_0_2    0x228
> +#define DMAR_FRCD_REG_0_3    0x22c
> +
> +/* Interrupt Address Range */
> +#define VTD_INTERRUPT_ADDR_FIRST    0xfee00000ULL
> +#define VTD_INTERRUPT_ADDR_LAST     0xfeefffffULL
> +
> +/* IOTLB_REG */
> +#define VTD_TLB_GLOBAL_FLUSH (1ULL << 60) /* Global invalidation */
> +#define VTD_TLB_DSI_FLUSH (2ULL << 60)  /* Domain-selective invalidation */
> +#define VTD_TLB_PSI_FLUSH (3ULL << 60)  /* Page-selective invalidation */
> +#define VTD_TLB_FLUSH_GRANU_MASK (3ULL << 60)
> +#define VTD_TLB_GLOBAL_FLUSH_A (1ULL << 57)
> +#define VTD_TLB_DSI_FLUSH_A (2ULL << 57)
> +#define VTD_TLB_PSI_FLUSH_A (3ULL << 57)
> +#define VTD_TLB_FLUSH_GRANU_MASK_A (3ULL << 57)
> +#define VTD_TLB_IVT (1ULL << 63)
> +
> +/* GCMD_REG */
> +#define VTD_GCMD_TE (1UL << 31)
> +#define VTD_GCMD_SRTP (1UL << 30)
> +#define VTD_GCMD_SFL (1UL << 29)
> +#define VTD_GCMD_EAFL (1UL << 28)
> +#define VTD_GCMD_WBF (1UL << 27)
> +#define VTD_GCMD_QIE (1UL << 26)
> +#define VTD_GCMD_IRE (1UL << 25)
> +#define VTD_GCMD_SIRTP (1UL << 24)
> +#define VTD_GCMD_CFI (1UL << 23)
> +
> +/* GSTS_REG */
> +#define VTD_GSTS_TES (1UL << 31)
> +#define VTD_GSTS_RTPS (1UL << 30)
> +#define VTD_GSTS_FLS (1UL << 29)
> +#define VTD_GSTS_AFLS (1UL << 28)
> +#define VTD_GSTS_WBFS (1UL << 27)
> +#define VTD_GSTS_QIES (1UL << 26)
> +#define VTD_GSTS_IRES (1UL << 25)
> +#define VTD_GSTS_IRTPS (1UL << 24)
> +#define VTD_GSTS_CFIS (1UL << 23)
> +
> +/* CCMD_REG */
> +#define VTD_CCMD_ICC (1ULL << 63)
> +#define VTD_CCMD_GLOBAL_INVL (1ULL << 61)
> +#define VTD_CCMD_DOMAIN_INVL (2ULL << 61)
> +#define VTD_CCMD_DEVICE_INVL (3ULL << 61)
> +#define VTD_CCMD_CIRG_MASK (3ULL << 61)
> +#define VTD_CCMD_GLOBAL_INVL_A (1ULL << 59)
> +#define VTD_CCMD_DOMAIN_INVL_A (2ULL << 59)
> +#define VTD_CCMD_DEVICE_INVL_A (3ULL << 59)
> +#define VTD_CCMD_CAIG_MASK (3ULL << 59)
> +
> +/* RTADDR_REG */
> +#define VTD_RTADDR_RTT (1ULL << 11)
> +#define VTD_RTADDR_ADDR_MASK (VTD_HAW_MASK ^ 0xfffULL)
> +
> +/* ECAP_REG */
> +#define VTD_ECAP_IRO (DMAR_IOTLB_REG_OFFSET << 4)  /* (offset >> 4) << 8 */
> +#define VTD_ECAP_QI  (1ULL << 1)
> +
> +/* CAP_REG */
> +#define VTD_CAP_FRO  (DMAR_FRCD_REG_OFFSET << 20) /* (offset >> 4) << 24 */
> +#define VTD_CAP_NFR  ((DMAR_FRCD_REG_NR - 1) << 40)
> +#define VTD_DOMAIN_ID_SHIFT     16  /* 16-bit domain id for 64K domains */
> +#define VTD_CAP_ND  (((VTD_DOMAIN_ID_SHIFT - 4) / 2) & 7ULL)
> +#define VTD_MGAW    39  /* Maximum Guest Address Width */
> +#define VTD_CAP_MGAW    (((VTD_MGAW - 1) & 0x3fULL) << 16)
> +
> +/* Supported Adjusted Guest Address Widths */
> +#define VTD_CAP_SAGAW_SHIFT (8)
> +#define VTD_CAP_SAGAW_MASK  (0x1fULL << VTD_CAP_SAGAW_SHIFT)
> + /* 39-bit AGAW, 3-level page-table */
> +#define VTD_CAP_SAGAW_39bit (0x2ULL << VTD_CAP_SAGAW_SHIFT)
> + /* 48-bit AGAW, 4-level page-table */
> +#define VTD_CAP_SAGAW_48bit (0x4ULL << VTD_CAP_SAGAW_SHIFT)
> +#define VTD_CAP_SAGAW       VTD_CAP_SAGAW_39bit
> +
> +/* IQT_REG */
> +#define VTD_IQT_QT(val)     (((val) >> 4) & 0x7fffULL)
> +
> +/* IQA_REG */
> +#define VTD_IQA_IQA_MASK    (VTD_HAW_MASK ^ 0xfffULL)
> +#define VTD_IQA_QS          (0x7ULL)
> +
> +/* IQH_REG */
> +#define VTD_IQH_QH_SHIFT    (4)
> +#define VTD_IQH_QH_MASK     (0x7fff0ULL)

No need for parentheses around plain values (i.e. when there are no
operators), here and elsewhere.

> +
> +/* ICS_REG */
> +#define VTD_ICS_IWC         (1UL)
> +
> +/* IECTL_REG */
> +#define VTD_IECTL_IM        (1UL << 31)
> +#define VTD_IECTL_IP        (1UL << 30)
> +
> +/* FSTS_REG */
> +#define VTD_FSTS_FRI_MASK  (0xff00)
> +#define VTD_FSTS_FRI(val)  ((((uint32_t)(val)) << 8) & VTD_FSTS_FRI_MASK)
> +#define VTD_FSTS_IQE       (1UL << 4)
> +#define VTD_FSTS_PPF       (1UL << 1)
> +#define VTD_FSTS_PFO       (1UL)
> +
> +/* FECTL_REG */
> +#define VTD_FECTL_IM       (1UL << 31)
> +#define VTD_FECTL_IP       (1UL << 30)
> +
> +/* Fault Recording Register */
> +/* For the high 64-bit of 128-bit */
> +#define VTD_FRCD_F         (1ULL << 63)
> +#define VTD_FRCD_T         (1ULL << 62)
> +#define VTD_FRCD_FR(val)   (((val) & 0xffULL) << 32)
> +#define VTD_FRCD_SID_MASK   0xffffULL
> +#define VTD_FRCD_SID(val)  ((val) & VTD_FRCD_SID_MASK)
> +/* For the low 64-bit of 128-bit */
> +#define VTD_FRCD_FI(val)   ((val) & (((1ULL << VTD_MGAW) - 1) ^ 0xfffULL))
> +
> +/* DMA Remapping Fault Conditions */
> +typedef enum VTDFaultReason {
> +    /* Reserved for Advanced Fault logging. We use this to represent the case
> +     * with no fault event.
> +     */
> +    VTD_FR_RESERVED = 0,
> +    VTD_FR_ROOT_ENTRY_P = 1, /* The Present(P) field of root-entry is 0 */
> +    VTD_FR_CONTEXT_ENTRY_P, /* The Present(P) field of context-entry is 0 */
> +    VTD_FR_CONTEXT_ENTRY_INV, /* Invalid programming of a context-entry */
> +    VTD_FR_ADDR_BEYOND_MGAW, /* Input-address above (2^x-1) */
> +    VTD_FR_WRITE, /* No write permission */
> +    VTD_FR_READ, /* No read permission */
> +    /* Fail to access a second-level paging entry (not SL_PML4E) */
> +    VTD_FR_PAGING_ENTRY_INV,
> +    VTD_FR_ROOT_TABLE_INV, /* Fail to access a root-entry */
> +    VTD_FR_CONTEXT_TABLE_INV, /* Fail to access a context-entry */
> +    /* Non-zero reserved field in a present root-entry */
> +    VTD_FR_ROOT_ENTRY_RSVD,
> +    /* Non-zero reserved field in a present context-entry */
> +    VTD_FR_CONTEXT_ENTRY_RSVD,
> +    /* Non-zero reserved field in a second-level paging entry with at lease one
> +     * Read(R) and Write(W) or Execute(E) field is Set.
> +     */
> +    VTD_FR_PAGING_ENTRY_RSVD,
> +    /* Translation request or translated request explicitly blocked dut to the
> +     * programming of the Translation Type (T) field in the present
> +     * context-entry.
> +     */
> +    VTD_FR_CONTEXT_ENTRY_TT,
> +    /* This is not a normal fault reason. We use this to indicate some faults
> +     * that are not referenced by the VT-d specification.
> +     * Fault event with such reason should not be recorded.
> +     */
> +    VTD_FR_RESERVED_ERR,
> +    /* Guard */
> +    VTD_FR_MAX,
> +} VTDFaultReason;
> +
> +
> +/* Masks for Queued Invalidation Descriptor */
> +#define VTD_INV_DESC_TYPE  (0xf)
> +#define VTD_INV_DESC_CC    (0x1) /* Context-cache Invalidate Descriptor */
> +#define VTD_INV_DESC_IOTLB (0x2)
> +#define VTD_INV_DESC_WAIT  (0x5) /* Invalidation Wait Descriptor */
> +#define VTD_INV_DESC_NONE  (0)   /* Not an Invalidate Descriptor */
> +
> +
> +/* Pagesize of VTD paging structures, including root and context tables */
> +#define VTD_PAGE_SHIFT      (12)
> +#define VTD_PAGE_SIZE       (1ULL << VTD_PAGE_SHIFT)
> +
> +#define VTD_PAGE_SHIFT_4K   (12)
> +#define VTD_PAGE_MASK_4K    (~((1ULL << VTD_PAGE_SHIFT_4K) - 1))
> +#define VTD_PAGE_SHIFT_2M   (21)
> +#define VTD_PAGE_MASK_2M    (~((1ULL << VTD_PAGE_SHIFT_2M) - 1))
> +#define VTD_PAGE_SHIFT_1G   (30)
> +#define VTD_PAGE_MASK_1G    (~((1ULL << VTD_PAGE_SHIFT_1G) - 1))
> +
> +/* Root-Entry
> + * 0: Present
> + * 1-11: Reserved
> + * 12-63: Context-table Pointer
> + * 64-127: Reserved
> + */
> +struct VTDRootEntry {
> +    uint64_t val;
> +    uint64_t rsvd;
> +};
> +typedef struct VTDRootEntry VTDRootEntry;
> +
> +/* Masks for struct VTDRootEntry */
> +#define VTD_ROOT_ENTRY_P (1ULL << 0)
> +#define VTD_ROOT_ENTRY_CTP  (~0xfffULL)
> +
> +#define VTD_ROOT_ENTRY_NR   (VTD_PAGE_SIZE / sizeof(VTDRootEntry))
> +#define VTD_ROOT_ENTRY_RSVD (0xffeULL | ~VTD_HAW_MASK)
> +
> +/* Context-Entry */
> +struct VTDContextEntry {
> +    uint64_t lo;
> +    uint64_t hi;
> +};
> +typedef struct VTDContextEntry VTDContextEntry;
> +
> +/* Masks for struct VTDContextEntry */
> +/* lo */
> +#define VTD_CONTEXT_ENTRY_P (1ULL << 0)
> +#define VTD_CONTEXT_ENTRY_FPD   (1ULL << 1) /* Fault Processing Disable */
> +#define VTD_CONTEXT_ENTRY_TT    (3ULL << 2) /* Translation Type */
> +#define VTD_CONTEXT_TT_MULTI_LEVEL  (0)
> +#define VTD_CONTEXT_TT_DEV_IOTLB    (1)
> +#define VTD_CONTEXT_TT_PASS_THROUGH (2)
> +/* Second Level Page Translation Pointer*/
> +#define VTD_CONTEXT_ENTRY_SLPTPTR   (~0xfffULL)
> +#define VTD_CONTEXT_ENTRY_RSVD_LO   (0xff0ULL | ~VTD_HAW_MASK)
> +/* hi */
> +#define VTD_CONTEXT_ENTRY_AW    (7ULL) /* Adjusted guest-address-width */
> +#define VTD_CONTEXT_ENTRY_DID   (0xffffULL << 8)    /* Domain Identifier */
> +#define VTD_CONTEXT_ENTRY_RSVD_HI   (0xffffffffff000080ULL)
> +
> +#define VTD_CONTEXT_ENTRY_NR    (VTD_PAGE_SIZE / sizeof(VTDContextEntry))
> +
> +
> +/* Paging Structure common */
> +#define VTD_SL_PT_PAGE_SIZE_MASK   (1ULL << 7)
> +#define VTD_SL_LEVEL_BITS   9   /* Bits to decide the offset for each level */
> +
> +/* Second Level Paging Structure */
> +#define VTD_SL_PML4_LEVEL   4
> +#define VTD_SL_PDP_LEVEL    3
> +#define VTD_SL_PD_LEVEL     2
> +#define VTD_SL_PT_LEVEL     1
> +#define VTD_SL_PT_ENTRY_NR  512
> +
> +/* Masks for Second Level Paging Entry */
> +#define VTD_SL_RW_MASK              (3ULL)
> +#define VTD_SL_R                    (1ULL)
> +#define VTD_SL_W                    (1ULL << 1)
> +#define VTD_SL_PT_BASE_ADDR_MASK    (~(VTD_PAGE_SIZE - 1) & VTD_HAW_MASK)
> +#define VTD_SL_IGN_COM    (0xbff0000000000000ULL)
> +
> +#endif
> diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
> new file mode 100644
> index 0000000..6601e62
> --- /dev/null
> +++ b/include/hw/i386/intel_iommu.h
> @@ -0,0 +1,90 @@
> +/*
> + * QEMU emulation of an Intel IOMMU (VT-d)
> + *   (DMA Remapping device)
> + *
> + * Copyright (C) 2013 Knut Omang, Oracle <address@hidden>
> + * Copyright (C) 2014 Le Tan, <address@hidden>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> +
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> +
> + * You should have received a copy of the GNU General Public License along
> + * with this program; if not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#ifndef INTEL_IOMMU_H
> +#define INTEL_IOMMU_H
> +#include "hw/qdev.h"
> +#include "sysemu/dma.h"
> +
> +#define TYPE_INTEL_IOMMU_DEVICE "intel-iommu"
> +#define INTEL_IOMMU_DEVICE(obj) \
> +     OBJECT_CHECK(IntelIOMMUState, (obj), TYPE_INTEL_IOMMU_DEVICE)
> +
> +/* DMAR Hardware Unit Definition address (IOMMU unit) */
> +#define Q35_HOST_BRIDGE_IOMMU_ADDR 0xfed90000ULL
> +
> +#define VTD_PCI_BUS_MAX 256
> +#define VTD_PCI_SLOT_MAX 32
> +#define VTD_PCI_FUNC_MAX 8
> +#define VTD_PCI_SLOT(devfn)         (((devfn) >> 3) & 0x1f)
> +#define VTD_PCI_FUNC(devfn)         ((devfn) & 0x07)
> +
> +#define DMAR_REG_SIZE   0x230
> +
> +/* FIXME: do not know how to decide the haw */

Nothing to fix IMHO. Just state that this definition is arbitrary, just
large enough to cover all currently expected guest RAM sizes.

> +#define VTD_HOST_ADDRESS_WIDTH  39
> +#define VTD_HAW_MASK    ((1ULL << VTD_HOST_ADDRESS_WIDTH) - 1)
> +
> +typedef struct IntelIOMMUState IntelIOMMUState;
> +typedef struct VTDAddressSpace VTDAddressSpace;
> +
> +struct VTDAddressSpace {
> +    int bus_num;
> +    int devfn;
> +    AddressSpace as;
> +    MemoryRegion iommu;
> +    IntelIOMMUState *iommu_state;
> +};
> +
> +/* The iommu (DMAR) device state struct */
> +struct IntelIOMMUState {
> +    SysBusDevice busdev;
> +    MemoryRegion csrmem;
> +    uint8_t csr[DMAR_REG_SIZE];     /* register values */
> +    uint8_t wmask[DMAR_REG_SIZE];   /* R/W bytes */
> +    uint8_t w1cmask[DMAR_REG_SIZE]; /* RW1C(Write 1 to Clear) bytes */
> +    uint8_t womask[DMAR_REG_SIZE]; /* WO (write only - read returns 0) */
> +    uint32_t version;
> +
> +    dma_addr_t root;        /* Current root table pointer */
> +    bool root_extended;     /* Type of root table (extended or not) */
> +    bool dmar_enabled;      /* Set if DMA remapping is enabled */
> +
> +    uint16_t iq_head;       /* Current invalidation queue head */
> +    uint16_t iq_tail;       /* Current invalidation queue tail */
> +    dma_addr_t iq;          /* Current invalidation queue (IQ) pointer */
> +    uint16_t iq_size;       /* IQ Size in number of entries */
> +    bool qi_enabled;        /* Set if the QI is enabled */
> +    uint8_t iq_last_desc_type; /* The type of last completed descriptor */
> +
> +    /* The index of the Fault Recording Register to be used next.
> +     * Wraps around from N-1 to 0, where N is the number of FRCD_REG.
> +     */
> +    uint16_t next_frcd_reg;
> +
> +    uint64_t cap;           /* The value of Capability Register */
> +    uint64_t ecap;          /* The value of Extended Capability Register */
> +
> +    MemoryRegionIOMMUOps iommu_ops;
> +    VTDAddressSpace **address_spaces[VTD_PCI_BUS_MAX];
> +};
> +
> +#endif
> 

Very nice job!

Jan

Attachment: signature.asc
Description: OpenPGP digital signature


reply via email to

[Prev in Thread] Current Thread [Next in Thread]