qemu-riscv
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [PATCH v2 03/15] hw/riscv: add RISC-V IOMMU base emulation


From: Daniel Henrique Barboza
Subject: Re: [PATCH v2 03/15] hw/riscv: add RISC-V IOMMU base emulation
Date: Tue, 14 May 2024 17:06:05 -0300
User-agent: Mozilla Thunderbird

Hi Jason,

On 5/1/24 08:57, Jason Chien wrote:
Daniel Henrique Barboza 於 2024/3/8 上午 12:03 寫道:
From: Tomasz Jeznach<tjeznach@rivosinc.com>

The RISC-V IOMMU specification is now ratified as-per the RISC-V
international process. The latest frozen specifcation can be found
at:

https://github.com/riscv-non-isa/riscv-iommu/releases/download/v1.0/riscv-iommu.pdf

Add the foundation of the device emulation for RISC-V IOMMU, which
includes an IOMMU that has no capabilities but MSI interrupt support and
fault queue interfaces. We'll add add more features incrementally in the
next patches.

Co-developed-by: Sebastien Boeuf<seb@rivosinc.com>
Signed-off-by: Sebastien Boeuf<seb@rivosinc.com>
Signed-off-by: Tomasz Jeznach<tjeznach@rivosinc.com>
Signed-off-by: Daniel Henrique Barboza<dbarboza@ventanamicro.com>
---
  hw/riscv/Kconfig         |    4 +
  hw/riscv/meson.build     |    1 +
  hw/riscv/riscv-iommu.c   | 1492 ++++++++++++++++++++++++++++++++++++++
  hw/riscv/riscv-iommu.h   |  141 ++++
  hw/riscv/trace-events    |   11 +
  hw/riscv/trace.h         |    2 +
  include/hw/riscv/iommu.h |   36 +
  meson.build              |    1 +
  8 files changed, 1688 insertions(+)
  create mode 100644 hw/riscv/riscv-iommu.c
  create mode 100644 hw/riscv/riscv-iommu.h
  create mode 100644 hw/riscv/trace-events
  create mode 100644 hw/riscv/trace.h
  create mode 100644 include/hw/riscv/iommu.h

diff --git a/hw/riscv/Kconfig b/hw/riscv/Kconfig
index 5d644eb7b1..faf6a10029 100644
--- a/hw/riscv/Kconfig
+++ b/hw/riscv/Kconfig
@@ -1,3 +1,6 @@
+config RISCV_IOMMU
+    bool
+

(...)

+
+/* IOMMU index for transactions without PASID specified. */
+#define RISCV_IOMMU_NOPASID 0
+
+static void riscv_iommu_notify(RISCVIOMMUState *s, int vec)
+{
+    const uint32_t ipsr =
+        riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_IPSR, (1 << vec), 0);
+    const uint32_t ivec = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_IVEC);
+    if (s->notify && !(ipsr & (1 << vec))) {
+        s->notify(s, (ivec >> (vec * 4)) & 0x0F);
+    }
+}
The RISC-V IOMMU also supports WSI.
+

I mentioned in the review with Frank that this impl does not support WSI, but
it really seems clearer to do the check here nevertheless. I'll add it.


+static void riscv_iommu_fault(RISCVIOMMUState *s,
+                              struct riscv_iommu_fq_record *ev)
+{

(...)

+
+    /*
+     * Check supported device id width (in bits).
+     * See IOMMU Specification, Chapter 6. Software guidelines.
+     * - if extended device-context format is used:
+     *   1LVL: 6, 2LVL: 15, 3LVL: 24
+     * - if base device-context format is used:
+     *   1LVL: 7, 2LVL: 16, 3LVL: 24
+     */
+    if (ctx->devid >= (1 << (depth * 9 + 6 + (dc_fmt && depth != 2)))) {
+        return RISCV_IOMMU_FQ_CAUSE_DDT_INVALID;

The cause should be 260 not 258.

 From the RISC-V IOMMU Architecture Spec v1.0.0 section 2.3:
If the device_id is wider than that supported by the IOMMU mode, as determined by the 
following checks then stop and report "Transaction type disallowed" (cause = 
260).
a. ddtp.iommu_mode is 2LVL and DDI[2] is not 0
b. ddtp.iommu_mode is 1LVL and either DDI[2] is not 0 or DDI[1] is not 0


Changed.

+    }
+
+    /* Device directory tree walk */
+    for (; depth-- > 0; ) {
+        /*
+         * Select device id index bits based on device directory tree level
+         * and device context format.
+         * See IOMMU Specification, Chapter 2. Data Structures.
+         * - if extended device-context format is used:
+         *   device index: [23:15][14:6][5:0]
+         * - if base device-context format is used:
+         *   device index: [23:16][15:7][6:0]
+         */
+        const int split = depth * 9 + 6 + dc_fmt;
+        addr |= ((ctx->devid >> split) << 3) & ~TARGET_PAGE_MASK;
+        if (dma_memory_read(s->target_as, addr, &de, sizeof(de),
+                            MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
+            return RISCV_IOMMU_FQ_CAUSE_DDT_LOAD_FAULT;
+        }
+        le64_to_cpus(&de);
+        if (!(de & RISCV_IOMMU_DDTE_VALID)) {
+            /* invalid directory entry */
+            return RISCV_IOMMU_FQ_CAUSE_DDT_INVALID;
+        }
+        if (de & ~(RISCV_IOMMU_DDTE_PPN | RISCV_IOMMU_DDTE_VALID)) {
+            /* reserved bits set */
+            return RISCV_IOMMU_FQ_CAUSE_DDT_INVALID;

The cause should be 259 not 258.

 From RISC-V IOMMU Architecture Spec v1.0.0 section 2.3.1:
If any bits or encoding that are reserved for future standard use are set within ddte, 
stop and report "DDT entry misconfigured" (cause = 259).

Changed


+        }
+        addr = PPN_PHYS(get_field(de, RISCV_IOMMU_DDTE_PPN));
+    }
+
+    /* index into device context entry page */
+    addr |= (ctx->devid * dc_len) & ~TARGET_PAGE_MASK;
+
+    memset(&dc, 0, sizeof(dc));
+    if (dma_memory_read(s->target_as, addr, &dc, dc_len,
+                        MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
+        return RISCV_IOMMU_FQ_CAUSE_DDT_LOAD_FAULT;
+    }
+
+    /* Set translation context. */
+    ctx->tc = le64_to_cpu(dc.tc);
+    ctx->ta = le64_to_cpu(dc.ta);
+    ctx->msiptp = le64_to_cpu(dc.msiptp);
+    ctx->msi_addr_mask = le64_to_cpu(dc.msi_addr_mask);
+    ctx->msi_addr_pattern = le64_to_cpu(dc.msi_addr_pattern);
+
According to RISC-V IOMMU Architecture spec v1.0.0 section 2.1.4, we should do 
some checks for the found device context.

I added a new helper to validate the device context at this point, following
section 2.1.4 steps.

+    if (!(ctx->tc & RISCV_IOMMU_DC_TC_V)) {
+        return RISCV_IOMMU_FQ_CAUSE_DDT_INVALID;
+    }
+
+    if (!(ctx->tc & RISCV_IOMMU_DC_TC_PDTV)) {
+        if (ctx->pasid != RISCV_IOMMU_NOPASID) {
+            /* PASID is disabled */
+            return RISCV_IOMMU_FQ_CAUSE_TTYPE_BLOCKED;
+        }
+        return 0;
+    }
+

(...)

+
+static void riscv_iommu_process_cq_control(RISCVIOMMUState *s)
+{
+    uint64_t base;
+    uint32_t ctrl_set = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_CQCSR);
+    uint32_t ctrl_clr;
+    bool enable = !!(ctrl_set & RISCV_IOMMU_CQCSR_CQEN);
+    bool active = !!(ctrl_set & RISCV_IOMMU_CQCSR_CQON);
+
+    if (enable && !active) {
+        base = riscv_iommu_reg_get64(s, RISCV_IOMMU_REG_CQB);
+        s->cq_mask = (2ULL << get_field(base, RISCV_IOMMU_CQB_LOG2SZ)) - 1;
+        s->cq_addr = PPN_PHYS(get_field(base, RISCV_IOMMU_CQB_PPN));
+        stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_CQT], ~s->cq_mask);
+        stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_CQH], 0);
+        stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_CQT], 0);
+        ctrl_set = RISCV_IOMMU_CQCSR_CQON;
+        ctrl_clr = RISCV_IOMMU_CQCSR_BUSY | RISCV_IOMMU_CQCSR_CQMF |
+            RISCV_IOMMU_CQCSR_CMD_ILL | RISCV_IOMMU_CQCSR_CMD_TO;
cqcsr.fence_w_ip should be set to 0 as well.

Done.


+    } else if (!enable && active) {
+        stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_CQT], ~0);
+        ctrl_set = 0;
+        ctrl_clr = RISCV_IOMMU_CQCSR_BUSY | RISCV_IOMMU_CQCSR_CQON;
+    } else {

(...)

+}
+
+static MemTxResult riscv_iommu_mmio_write(void *opaque, hwaddr addr,
+    uint64_t data, unsigned size, MemTxAttrs attrs)
+{
+    RISCVIOMMUState *s = opaque;
+    uint32_t regb = addr & ~3;
+    uint32_t busy = 0;
+    uint32_t exec = 0;
+
+    if (size == 0 || size > 8 || (addr & (size - 1)) != 0) {
+        /* Unsupported MMIO alignment or access size */
+        return MEMTX_ERROR;
+    }
+
+    if (addr + size > RISCV_IOMMU_REG_MSI_CONFIG) {
+        /* Unsupported MMIO access location. */
+        return MEMTX_ACCESS_ERROR;
+    }
+
+    /* Track actionable MMIO write. */
+    switch (regb) {

There should be a case for IPSR register.

 From RISC-V IOMMU Architecture Spec v1.0.0 section 5.18:
If a bit in ipsr is 1 then a write of 1 to the bit transitions the bit from 
1→0. If the conditions to set that bit are still present (See [IPSR_FIELDS]) or 
if they occur after the bit is cleared then that bit transitions again from 0→1.


A new helper to handle ipsr updates via mmio_write was created.

+    case RISCV_IOMMU_REG_DDTP:
+    case RISCV_IOMMU_REG_DDTP + 4:
+        exec = BIT(RISCV_IOMMU_EXEC_DDTP);

(...)

+static void riscv_iommu_realize(DeviceState *dev, Error **errp)
+{
+    RISCVIOMMUState *s = RISCV_IOMMU(dev);
+
+    s->cap = s->version & RISCV_IOMMU_CAP_VERSION;
+    if (s->enable_msi) {
+        s->cap |= RISCV_IOMMU_CAP_MSI_FLAT | RISCV_IOMMU_CAP_MSI_MRIF;
+    }
+    /* Report QEMU target physical address space limits */
+    s->cap = set_field(s->cap, RISCV_IOMMU_CAP_PAS,
+                       TARGET_PHYS_ADDR_SPACE_BITS);
+
+    /* TODO: method to report supported PASID bits */
+    s->pasid_bits = 8; /* restricted to size of MemTxAttrs.pasid */
+    s->cap |= RISCV_IOMMU_CAP_PD8;
+
+    /* Out-of-reset translation mode: OFF (DMA disabled) BARE (passthrough) */
+    s->ddtp = set_field(0, RISCV_IOMMU_DDTP_MODE, s->enable_off ?
+                        RISCV_IOMMU_DDTP_MODE_OFF : 
RISCV_IOMMU_DDTP_MODE_BARE);
+
+    /* register storage */
+    s->regs_rw = g_new0(uint8_t, RISCV_IOMMU_REG_SIZE);
+    s->regs_ro = g_new0(uint8_t, RISCV_IOMMU_REG_SIZE);
+    s->regs_wc = g_new0(uint8_t, RISCV_IOMMU_REG_SIZE);
+
+     /* Mark all registers read-only */
+    memset(s->regs_ro, 0xff, RISCV_IOMMU_REG_SIZE);
+
+    /*
+     * Register complete MMIO space, including MSI/PBA registers.
+     * Note, PCIDevice implementation will add overlapping MR for MSI/PBA,
+     * managed directly by the PCIDevice implementation.
+     */
+    memory_region_init_io(&s->regs_mr, OBJECT(dev), &riscv_iommu_mmio_ops, s,
+        "riscv-iommu-regs", RISCV_IOMMU_REG_SIZE);
+
+    /* Set power-on register state */
+    stq_le_p(&s->regs_rw[RISCV_IOMMU_REG_CAP], s->cap);
+    stq_le_p(&s->regs_rw[RISCV_IOMMU_REG_FCTL], s->fctl);
s->fctl is not initialized.

I believe the idea is to init it as zero. I'll change it to init as zero
explicitly.

+    stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_DDTP],
+        ~(RISCV_IOMMU_DDTP_PPN | RISCV_IOMMU_DDTP_MODE));
+    stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_CQB],
+        ~(RISCV_IOMMU_CQB_LOG2SZ | RISCV_IOMMU_CQB_PPN));
+    stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_FQB],

(...)

+void riscv_iommu_pci_setup_iommu(RISCVIOMMUState *iommu, PCIBus *bus,
+        Error **errp)
+{
+    if (bus->iommu_ops &&
+        bus->iommu_ops->get_address_space == riscv_iommu_find_as) {
+        /* Allow multiple IOMMUs on the same PCIe bus, link known devices */
+        RISCVIOMMUState *last = (RISCVIOMMUState *)bus->iommu_opaque;
+        QLIST_INSERT_AFTER(last, iommu, iommus);
+    } else if (bus->iommu_ops == NULL) {
+        pci_setup_iommu(bus, &riscv_iommu_ops, iommu);
The original bus->iommu_op and bus->iommu_opaque will be lost.

Not sure what you meant with 'iommu_op'. We have 'iommu_ops', which is being 
checked
for NULL before calling pci_setup_iommu().

As for overwriting the original bus->iommu_opaque, I added an extra 
iommu_opaque == NULL
check. We'll make:

    } else if (!bus->iommu_ops && !bus->iommu_opaque) {
        pci_setup_iommu(bus, &riscv_iommu_ops, iommu);

This will guarantee that we're not overwriting any existing ops or opaque by 
accident.



Thanks,


Daniel




reply via email to

[Prev in Thread] Current Thread [Next in Thread]