Unexpected work by certain Windows guests equipped with the e1000
interface can cause the network to go down and never come back up
again unless the guest's interface is reset.
To reproduce the failure:
1. Set up two guests with a Windows 2016 or 2019 server operating
system.
2. Set up the e1000 interface for the guests.
3. Pressurize the network slightly between two guests using the iPerf tool.
The network goes down after a few days (2-5days), and the issue
is the result of not adhering to the e1000 specification. Refer
to the details of the specification at the following link:
https://www.intel.com/content/dam/doc/manual/pci-pci-x-family-gbe-controllers-software-dev-manual.pdf
Chapter 3.2.6 describe the Receive Descriptor Tail register(RDT)
as following:
This register holds a value that is an offset from the base, and
identifies the location beyond the last descriptor hardware can
process. Note that tail should still point to an area in the
descriptor ring (somewhere between RDBA and RDBA + RDLEN).
This is because tail points to the location where software writes
the first new descriptor.
This means that if the provider—in this case, QEMU—has not yet
loaded the packet, RDT should never point to that place. When
implementing the emulation of the e1000 interface, QEMU evaluates
if the receive ring buffer is full once the RDT equals the RDH,
based on the assumption that guest drivers adhere to this
criterion strictly.
We applied the following log patch to assist in analyzing the
issue and eventually obtained the unexpected information.
Log patch:
-----------------------------------------------------------------
|--- a/hw/net/e1000.c
|+++ b/hw/net/e1000.c
|@@ -836,6 +836,9 @@ e1000_set_link_status(NetClientState *nc)
| static bool e1000_has_rxbufs(E1000State *s, size_t total_size)
| {
| int bufs;
|+ DBGOUT(RX, "rxbuf_size = %u, s->mac_reg[RDLEN] = %u, s->mac_reg[RDH] = %u, s->mac_reg[RDT] = %u\n",
|+ s->rxbuf_size, s->mac_reg[RDLEN], s->mac_reg[RDH], s->mac_reg[RDT]);
|+
| /* Fast-path short packets */
| if (total_size <= s->rxbuf_size) {
| if (s->mac_reg[RDH] == s->mac_reg[RDT] && s->last_overrun)
|@@ -1022,6 +1025,9 @@ e1000_receive_iov(NetClientState *nc, const struct iovec *iov, int iovcnt)
| s->rxbuf_min_shift)
| n |= E1000_ICS_RXDMT0;
|
|+ DBGOUT(RX, "rxbuf_size = %u, s->mac_reg[RDLEN] = %u, s->mac_reg[RDH] = %u, s->mac_reg[RDT] = %u\n",
|+ s->rxbuf_size, s->mac_reg[RDLEN], s->mac_reg[RDH], s->mac_reg[RDT]);
|+
-----------------------------------------------------------------
The last few logs of information when the network is down:
e1000: total_size = 1, rxbuf_size = 2048, s->mac_reg[RDLEN] = 16384, s->mac_reg[RDH] = 897, s->mac_reg[RDT] = 885
<- the receive ring buffer is checked for fullness in the
e1000_has_rxbufs function, not full.
e1000: total_size = 64, rxbuf_size = 2048, s->mac_reg[RDLEN] = 16384, s->mac_reg[RDH] = 898, s->mac_reg[RDT] = 885
<- RDT stays the same, RDH updates to 898, and 1 descriptor
utilized after putting the packet to ring buffer.
e1000: total_size = 1, rxbuf_size = 2048, s->mac_reg[RDLEN] = 16384, s->mac_reg[RDH] = 898, s->mac_reg[RDT] = 885
<- the receive ring buffer is checked for fullness in the
e1000_has_rxbufs function, not full.
e1000: total_size = 64, rxbuf_size = 2048, s->mac_reg[RDLEN] = 16384, s->mac_reg[RDH] = 899, s->mac_reg[RDT] = 885
<- RDT stays the same, RDH updates to 899, and 1 descriptor
utilized after putting the packet to ring buffer.
e1000: total_size = 1, rxbuf_size = 2048, s->mac_reg[RDLEN] = 16384, s->mac_reg[RDH] = 899, s->mac_reg[RDT] = 885
<- the receive ring buffer is checked for fullness in the
e1000_has_rxbufs function, not full.
e1000: total_size = 64, rxbuf_size = 2048, s->mac_reg[RDLEN] = 16384, s->mac_reg[RDH] = 900, s->mac_reg[RDT] = 885
<- RDT stays the same, RDH updates to 900 , and 1 descriptor
utilized after putting the packet to ring buffer.
e1000: total_size = 1, rxbuf_size = 2048, s->mac_reg[RDLEN] = 16384, s->mac_reg[RDH] = 900, s->mac_reg[RDT] = 900
<- The ring is full, according to e1000_has_rxbufs, because
of the RDT update to 900 and equals RDH ! But in reality,
the state of the ring buffer is empty because the producer
only used one descriptor the last time, and the ring buffer
was not full after that.
To sum up, QEMU claims that the receive ring buffer is full
in the aforementioned scenario, placing the packet in the
self-maintained queue and unregistering the tap device's
readable fd handler and then waiting for the guest to consume
the receive ring buffer. This brings down the network since
guests have nothing to consume and never update the RDT
location.
In the above scenario, QEMU assert that the ring is full,
put the packet on the queue, unregister the readable fd
handler of the tap device, waiting the guest to consume
the receive ring. While, guest have nothing to consume
on the receive ring and never update the RDT location,
this makes the network down.
To get around this issue, just mark the overrun if RDH
equals RDT at the end of placing the packet on the ring
buffer for the producer.
Signed-off-by: Hyman Huang <yong.huang@smartx.com>
---
hw/net/e1000.c | 22 ++++++++++++++++++++--
1 file changed, 20 insertions(+), 2 deletions(-)
diff --git a/hw/net/e1000.c b/hw/net/e1000.c
index 5012b96464..f80cb70283 100644
--- a/hw/net/e1000.c
+++ b/hw/net/e1000.c
@@ -126,6 +126,12 @@ struct E1000State_st {
QEMUTimer *flush_queue_timer;
+ /*
+ * Indicate that the receive circular buffer queue overrun
+ * the last time hardware produced packets.
+ */
+ bool last_overrun;
+
/* Compatibility flags for migration to/from qemu 1.3.0 and older */
#define E1000_FLAG_MAC_BIT 2
#define E1000_FLAG_TSO_BIT 3
@@ -832,7 +838,12 @@ static bool e1000_has_rxbufs(E1000State *s, size_t total_size)
int bufs;
/* Fast-path short packets */
if (total_size <= s->rxbuf_size) {
- return s->mac_reg[RDH] != s->mac_reg[RDT];
+ if (s->mac_reg[RDH] == s->mac_reg[RDT] && s->last_overrun) {
+ return false;
+ }
+
+ DBGOUT(RX, "Receive ring buffer is not full unexpectedly!\n");
I'll drop this DBGOUT in the next version now that "treat RX ring with RDH == RDT as empty"
is the expected and original conclusion in commit:
+ return true;
}
if (s->mac_reg[RDH] < s->mac_reg[RDT]) {
bufs = s->mac_reg[RDT] - s->mac_reg[RDH];
@@ -840,7 +851,12 @@ static bool e1000_has_rxbufs(E1000State *s, size_t total_size)
bufs = s->mac_reg[RDLEN] / sizeof(struct e1000_rx_desc) +
s->mac_reg[RDT] - s->mac_reg[RDH];
} else {
- return false;
+ if (s->last_overrun) {
+ return false;
+ }
+
+ DBGOUT(RX, "Receive ring buffer is not full unexpectedly!\n");
+ return true;
}
return total_size <= bufs * s->rxbuf_size;
}
@@ -999,6 +1015,8 @@ e1000_receive_iov(NetClientState *nc, const struct iovec *iov, int iovcnt)
e1000x_update_rx_total_stats(s->mac_reg, pkt_type, size, total_size);
+ s->last_overrun = (s->mac_reg[RDH] == s->mac_reg[RDT]) ? true : false;
+
n = E1000_ICS_RXT0;
if ((rdt = s->mac_reg[RDT]) < s->mac_reg[RDH])
rdt += s->mac_reg[RDLEN] / sizeof(desc);
--
2.39.1