[dev][net][e1000] First stab at a working e1000 driver.

-Works against qemu's e1000 and e1000e driver.
-Untested on real hardware yet.
This commit is contained in:
Travis Geiselbrecht
2021-12-27 20:27:51 -08:00
parent b5a9c2d8b2
commit caafb3e2ad
6 changed files with 690 additions and 1 deletions

545
dev/net/e1000/e1000.cpp Normal file
View File

@@ -0,0 +1,545 @@
//
// Copyright (c) 2021 Travis Geiselbrecht
//
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file or at
// https://opensource.org/licenses/MIT
#include <lk/init.h>
#include <lk/err.h>
#include <lk/cpp.h>
#include <lk/trace.h>
#include <lk/list.h>
#include <dev/bus/pci.h>
#include <kernel/event.h>
#include <kernel/thread.h>
#include <kernel/vm.h>
#include <lib/minip.h>
#include <lib/pktbuf.h>
#include <string.h>
#include <platform/interrupts.h>
#include <type_traits>
#include "e1000_hw.h"
#define LOCAL_TRACE 0
// forward declaration so the file-scope pointer below can be declared ahead of the class body
class e1000;
// XXX hack: file-scope pointer that lets e1000_tx() reach a device on behalf of minip.
// e1000_init() assigns it after every successful init_device(), so when several
// adapters are present it ends up pointing at the one that was initialized last.
static e1000 *the_e;
// One entry of the table of known 8086:x e1000 devices to match against.
struct e1000_id_features {
    // PCI device id to look for
    uint16_t id;
    // true when the part takes the e1000e code paths (see e1000::is_e1000e())
    bool e1000e;
};
// Device ids (all under PCI vendor 8086) that e1000_init() probes for. The
// second field marks the parts that take the e1000e code paths.
const e1000_id_features e1000_ids[] = {
{ 0x100c, false }, // 82544GC QEMU 'e1000-82544gc'
{ 0x100e, false }, // 82540EM QEMU 'e1000'
{ 0x100f, false }, // 82545EM QEMU 'e1000-82545em'
{ 0x10d3, true }, // 82574L QEMU 'e1000e'
{ 0x1533, true }, // i210
};
// i210 ids (only 0x1533 is present in the table above)
// 0x1533
// 0x1536
// 0x1537
// 0x1538
// Driver state for one e1000/e1000e network adapter.
//
// An instance is inert after construction; init_device() maps the registers,
// allocates the descriptor rings, hooks the interrupt and starts the rx worker
// thread. Received packets are queued by the interrupt handler and pushed up
// the network stack from the worker thread.
class e1000 {
public:
    e1000();
    ~e1000();

    // init_device() hands `this` to the interrupt handler and to the rx worker
    // thread, and the object owns raw pointers to the rings and buffers, so a
    // copy would alias all of that state. Forbid copying outright.
    e1000(const e1000 &) = delete;
    e1000 &operator=(const e1000 &) = delete;

    // Bring up the device at `loc`. `id` is stored, not copied, so it must
    // stay valid for as long as this object is used.
    status_t init_device(pci_location_t loc, const e1000_id_features *id);

    // Queue a packet on the tx ring and notify the hardware.
    int tx(pktbuf_t *p);

    // True if the matched device id selects the e1000e code paths.
    // Only valid once init_device() has stored the id table entry.
    bool is_e1000e() const { return id_feat_->e1000e; }

private:
    // number of descriptors in each ring and size of every receive buffer
    static const size_t rxring_len = 64;
    static const size_t txring_len = 64;
    static const size_t rxbuffer_len = 2048;

    // 32-bit accessors for the register window mapped from BAR 0
    uint32_t read_reg(e1000_reg reg);
    void write_reg(e1000_reg reg, uint32_t val);

    // read one 16-bit word from the EEPROM through the EERD register
    uint16_t read_eeprom(uint8_t offset);

    handler_return irq_handler();

    // hand an empty pktbuf to the hardware at the rx tail; the _locked variant
    // expects the caller to already hold lock_
    void add_pktbuf_to_rxring(pktbuf_t *pkt);
    void add_pktbuf_to_rxring_locked(pktbuf_t *pkt);

    // main spinlock
    spin_lock_t lock_ = SPIN_LOCK_INITIAL_VALUE;

    // configuration
    pci_location_t loc_ = {};
    void *bar0_regs_ = nullptr;
    uint8_t mac_addr_[6] = {};
    const e1000_id_features *id_feat_ = nullptr;

    // rx ring
    rdesc *rxring_ = nullptr;
    uint32_t rx_last_head_ = 0;
    uint32_t rx_tail_ = 0;
    pktbuf_t *rx_pktbuf_[rxring_len] = {};
    uint8_t *rx_buf_ = nullptr; // rxbuffer_len * rxring_len byte buffer that rx_pktbuf[] points to

    // rx worker thread
    list_node rx_queue_ = LIST_INITIAL_VALUE(rx_queue_);
    event_t rx_event_ = EVENT_INITIAL_VALUE(rx_event_, 0, EVENT_FLAG_AUTOUNSIGNAL);
    thread_t *rx_worker_thread_ = nullptr;
    int rx_worker_routine();

    // tx ring
    tdesc *txring_ = nullptr;
    uint32_t tx_last_head_ = 0;
    uint32_t tx_tail_ = 0;
    pktbuf_t *tx_pktbuf_[txring_len] = {};
};
// Read the 32-bit device register located `reg` bytes into the BAR 0 mapping.
uint32_t e1000::read_reg(e1000_reg reg) {
    const uintptr_t addr = reinterpret_cast<uintptr_t>(bar0_regs_) + static_cast<size_t>(reg);
    // volatile access so the load from the device is not elided or merged
    return *reinterpret_cast<volatile uint32_t *>(addr);
}
// Store `val` into the 32-bit device register located `reg` bytes into the BAR 0 mapping.
void e1000::write_reg(e1000_reg reg, uint32_t val) {
    const uintptr_t addr = reinterpret_cast<uintptr_t>(bar0_regs_) + static_cast<size_t>(reg);
    // volatile access so the store to the device is not elided or merged
    *reinterpret_cast<volatile uint32_t *>(addr) = val;
}
// Fetch the 16-bit EEPROM word at `offset` through the EERD register.
//
// Writes the word address together with the start bit, then busy-waits until
// the device sets its DONE bit and returns the upper 16 bits of the register.
// There is no timeout on the wait.
uint16_t e1000::read_eeprom(uint8_t offset) {
    // 8257x+ seems to have a different EERD layout: the address field starts
    // at bit 2 and DONE is bit 1, versus address at bit 8 and DONE at bit 4.
    const bool e1000e_layout = is_e1000e();
    const unsigned addr_shift = e1000e_layout ? 2 : 8;
    const uint32_t done_bit = e1000e_layout ? (1u << 1) : (1u << 4);

    write_reg(e1000_reg::EERD, (offset << addr_shift) | 0x1); // address + start bit

    // spin while DONE is clear
    uint32_t eerd;
    do {
        eerd = read_reg(e1000_reg::EERD);
    } while ((eerd & done_bit) == 0);

    return eerd >> 16;
}
// Default constructor: every member takes its in-class initializer. The
// hardware itself is only touched later, by init_device().
e1000::e1000() = default;
// Destructor. Currently empty: nothing that init_device() set up is released
// here yet.
e1000::~e1000() {
// TODO: free resources
}
// Interrupt handler for the device.
//
// Reads the interrupt cause register and, for receive interrupts, walks the rx
// ring from the last seen head up to the hardware head. Good packets are moved
// onto rx_queue_ for the worker thread; everything else is handed straight
// back to the ring.
//
// Returns INT_RESCHEDULE if at least one packet was queued for the worker,
// INT_NO_RESCHEDULE otherwise.
handler_return e1000::irq_handler() {
// read the interrupt cause register, which also auto clears all bits
auto icr = read_reg(e1000_reg::ICR);
if (!icr) {
// no cause bits set, nothing to do for this device
return INT_NO_RESCHEDULE;
}
LTRACEF("icr %#x\n", icr);
// the ring state and rx_queue_ are shared with the rx worker thread
AutoSpinLockNoIrqSave guard(&lock_);
handler_return ret = INT_NO_RESCHEDULE;
if (icr & (1<<0)) { // TXDW - transmit descriptor written back
PANIC_UNIMPLEMENTED;
}
if (icr & (1<<1)) { // TXQE - transmit queue empty
//PANIC_UNIMPLEMENTED;
// nothing to really do here
}
if (icr & (1<<6)) { // RXO - receiver overrun, only reported
printf("e1000: RX OVERRUN\n");
}
if (icr & (1<<7)) { // RXTO - rx timer interrupt
// rx timer fired, packets are probably ready
auto rdh = read_reg(e1000_reg::RDH);
auto rdt = read_reg(e1000_reg::RDT);
// walk every descriptor between our last seen head and the hardware head
while (rx_last_head_ != rdh) {
// copy the current rx descriptor locally for better cache performance
rdesc rxd;
copy(&rxd, rxring_ + rx_last_head_);
LTRACEF("last_head %#x RDH %#x RDT %#x\n", rx_last_head_, rdh, rdt);
if (LOCAL_TRACE) rxd.dump();
// recover the pktbuf we queued in this spot
DEBUG_ASSERT(rx_pktbuf_[rx_last_head_]);
DEBUG_ASSERT(pktbuf_data_phys(rx_pktbuf_[rx_last_head_]) == rxd.addr);
pktbuf_t *pkt = rx_pktbuf_[rx_last_head_];
bool consumed_pkt = false;
if (rxd.status & (1 << 0)) { // descriptor done, we own it now
if (rxd.status & (1<<1)) { // end of packet
if (rxd.errors == 0) {
// good packet, trim data len according to the rx descriptor
pkt->dlen = rxd.length;
pkt->flags |= PKTBUF_FLAG_EOF; // just to make sure
// queue it in the rx queue
list_add_tail(&rx_queue_, &pkt->list);
// wake up the rx worker
event_signal(&rx_event_, false);
ret = INT_RESCHEDULE;
consumed_pkt = true;
}
}
}
if (!consumed_pkt) {
// not done, not end-of-packet, or errored: give the buffer straight
// back to the hardware at the ring tail
add_pktbuf_to_rxring_locked(pkt);
}
// advance to the next descriptor, wrapping at the end of the ring
rx_last_head_ = (rx_last_head_ + 1) % rxring_len;
}
}
return ret;
}
int e1000::rx_worker_routine() {
for (;;) {
event_wait(&rx_event_);
// pull some packets from the received queue
for (;;) {
pktbuf_t *p;
{
AutoSpinLock guard(&lock_);
p = list_remove_head_type(&rx_queue_, pktbuf_t, list);
}
if (!p) {
break; // nothing left in the queue, go back to waiting
}
if (LOCAL_TRACE) {
LTRACEF("got packet: ");
pktbuf_dump(p);
}
// push it up the stack
minip_rx_driver_callback(p);
// we own the pktbuf again
// set the data pointer to the start of the buffer and set dlen to 0
pktbuf_reset(p, 0);
// add it back to the rx ring at the current tail
add_pktbuf_to_rxring(p);
}
}
return 0;
}
// Queue packet `p` for transmission.
//
// Fills the descriptor at the current tx tail with the packet's physical
// address and length, remembers the pktbuf in the matching tx_pktbuf_ slot,
// advances the tail and writes it to TDT. Always returns NO_ERROR.
//
// NOTE(review): nothing here checks for a full ring, takes lock_, or releases
// the pktbuf previously stored in the slot - confirm whether callers rely on
// the driver to free transmitted pktbufs.
int e1000::tx(pktbuf_t *p) {
    LTRACE;
    if (LOCAL_TRACE) {
        pktbuf_dump(p);
    }

    const uint32_t slot = tx_tail_;

    // build the tx descriptor locally, then copy it into the ring
    tdesc desc = {};
    desc.addr = pktbuf_data_phys(p);
    desc.length = p->dlen;
    desc.cmd = (1<<0); // end of packet (EOP)
    copy(&txring_[slot], &desc);

    // keep track of which pktbuf sits in this ring slot
    tx_pktbuf_[slot] = p;

    // bump tail forward and tell the hardware about it
    tx_tail_ = (slot + 1) % txring_len;
    write_reg(e1000_reg::TDT, tx_tail_);

    LTRACEF("TDH %#x TDT %#x\n", read_reg(e1000_reg::TDH), read_reg(e1000_reg::TDT));
    return NO_ERROR;
}
// Give the empty pktbuf `p` to the hardware at the tail of the rx ring.
//
// `p` must be non-null, hold no data (dlen == 0) and wrap a buffer of exactly
// rxbuffer_len bytes. The ring state is not protected here; callers serialize
// access themselves (add_pktbuf_to_rxring() is the variant that takes lock_).
void e1000::add_pktbuf_to_rxring_locked(pktbuf_t *p) {
    DEBUG_ASSERT(p);
    DEBUG_ASSERT(p->dlen == 0);
    DEBUG_ASSERT(p->blen == rxbuffer_len);

    const uint32_t slot = rx_tail_;

    // fresh descriptor pointing at the pktbuf's data, written to the tail slot
    rdesc desc = {};
    desc.addr = pktbuf_data_phys(p);
    copy(&rxring_[slot], &desc);

    // remember which pktbuf backs this slot so irq_handler() can recover it
    rx_pktbuf_[slot] = p;

    // bump tail forward and publish it to the hardware
    rx_tail_ = (slot + 1) % rxring_len;
    write_reg(e1000_reg::RDT, rx_tail_);

    LTRACEF("after RDH %#x RDT %#x\n", read_reg(e1000_reg::RDH), read_reg(e1000_reg::RDT));
}
// Take lock_ and return `pkt` to the rx ring; used from thread context, where
// the lock is not already held.
void e1000::add_pktbuf_to_rxring(pktbuf_t *pkt) {
    AutoSpinLock guard(&lock_);

    add_pktbuf_to_rxring_locked(pkt);
}
// Bring up the adapter found at PCI location `loc`.
//
// Steps, in order: map BAR 0, read the MAC address from the EEPROM, allocate
// the rx/tx descriptor rings and the receive buffers, mask device interrupts,
// program interrupt moderation, hook an MSI (or legacy) interrupt, fill the rx
// ring, start the rx worker thread and finally enable the receiver and the
// transmitter.
//
// `id` is the matching e1000_ids entry. The pointer is stored in id_feat_ and
// dereferenced by is_e1000e(), so it must outlive this object.
//
// Returns NO_ERROR on success. Failures to read the BARs or to allocate an
// interrupt return the underlying error; mapping and allocation failures are
// reported as ERR_NOT_FOUND. Resources acquired before a failure are not
// released.
status_t e1000::init_device(pci_location_t loc, const e1000_id_features *id) {
    loc_ = loc;
    id_feat_ = id;

    char str[14];
    LTRACEF("pci location %s\n", pci_loc_string(loc_, str));

    pci_bar_t bars[6];
    size_t count;
    status_t err = pci_bus_mgr_read_bars(loc_, bars, &count);
    if (err != NO_ERROR) return err;
    if (count < 2) {
        return ERR_NOT_FOUND;
    }

    LTRACEF("e1000 BARS:\n");
    if (LOCAL_TRACE) pci_dump_bars(bars, count);

    if (!bars[0].valid) {
        return ERR_NOT_FOUND;
    }

    pci_bus_mgr_enable_device(loc_);

    // map bar 0, main memory mapped register interface, 128KB
    err = vmm_alloc_physical(vmm_get_kernel_aspace(), "e1000_bar0", 128*1024, &bar0_regs_, 0,
                             bars[0].addr, /* vmm_flags */ 0, ARCH_MMU_FLAG_UNCACHED_DEVICE);
    if (err != NO_ERROR) {
        return ERR_NOT_FOUND;
    }
    LTRACEF("bar 0 regs mapped to %p\n", bar0_regs_);

    // read the mac address out of the eeprom: three 16-bit words, low byte first
    uint16_t tmp;
    tmp = read_eeprom(0);
    mac_addr_[0] = tmp & 0xff;
    mac_addr_[1] = tmp >> 8;
    tmp = read_eeprom(1);
    mac_addr_[2] = tmp & 0xff;
    mac_addr_[3] = tmp >> 8;
    tmp = read_eeprom(2);
    mac_addr_[4] = tmp & 0xff;
    mac_addr_[5] = tmp >> 8;
    printf("e1000: mac address %02x:%02x:%02x:%02x:%02x:%02x\n", mac_addr_[0], mac_addr_[1], mac_addr_[2],
           mac_addr_[3], mac_addr_[4], mac_addr_[5]);

    // allocate and map space for the rx and tx ring
    err = vmm_alloc_contiguous(vmm_get_kernel_aspace(), "e1000 rxring", rxring_len * sizeof(rdesc), (void **)&rxring_, 0, 0, ARCH_MMU_FLAG_UNCACHED);
    if (err != NO_ERROR) {
        return ERR_NOT_FOUND;
    }
    memset(rxring_, 0, rxring_len * sizeof(rdesc));
    paddr_t rxring_phys = vaddr_to_paddr(rxring_);
    LTRACEF("rx ring at %p, physical %#lx\n", rxring_, rxring_phys);

    err = vmm_alloc_contiguous(vmm_get_kernel_aspace(), "e1000 txring", txring_len * sizeof(tdesc), (void **)&txring_, 0, 0, ARCH_MMU_FLAG_UNCACHED);
    if (err != NO_ERROR) {
        return ERR_NOT_FOUND;
    }
    // clear exactly what was allocated: the tx ring is made of tdesc entries
    memset(txring_, 0, txring_len * sizeof(tdesc));
    paddr_t txring_phys = vaddr_to_paddr(txring_);
    LTRACEF("tx ring at %p, physical %#lx\n", txring_, txring_phys);

    // allocate a large array of contiguous buffers to receive into
    err = vmm_alloc_contiguous(vmm_get_kernel_aspace(), "e1000 rx buffers", rxring_len * rxbuffer_len, (void **)&rx_buf_, 0, 0, 0);
    if (err != NO_ERROR) {
        return ERR_NOT_FOUND;
    }

    // mask all IRQs; write every bit of the 32-bit mask-clear register so no
    // cause in the upper half is left enabled
    write_reg(e1000_reg::IMC, 0xffffffff);

    // qemus 82574 emulation seems to want IAME to be set to auto-clear ICR bits.
    if (is_e1000e()) {
        auto ctrl_ext = read_reg(e1000_reg::CTL_EXT);
        write_reg(e1000_reg::CTL_EXT, ctrl_ext | (1<<27)); // IAME - interrupt ack auto-mask
        write_reg(e1000_reg::IAM, 0); // set such that no IMS bits are auto cleared
    }

    // set the interrupt threshold reg
    const uint32_t irq_rate = 10000; // max 10k irqs/sec
    const uint32_t itr_val = 1000000 / irq_rate * 4;
    write_reg(e1000_reg::ITR, itr_val);
    if (is_e1000e()) {
        write_reg(e1000_reg::EITR0, itr_val);
        write_reg(e1000_reg::EITR1, itr_val);
        write_reg(e1000_reg::EITR2, itr_val);
        write_reg(e1000_reg::EITR3, itr_val);
        write_reg(e1000_reg::EITR4, itr_val);
    }

    // set up minip's macaddr
    // TODO: move to something smarter
    minip_set_macaddr(mac_addr_);

    // disable tx and rx
    write_reg(e1000_reg::RCTL, 0);
    write_reg(e1000_reg::TCTL, 0);

    // irq handler lambda to get to inner method
    auto irq_handler_wrapper = [](void *arg) -> handler_return {
        e1000 *e = (e1000 *)arg;
        return e->irq_handler();
    };

    // allocate a MSI interrupt
    uint irq_base;
    err = pci_bus_mgr_allocate_msi(loc_, 1, &irq_base);
    if (err != NO_ERROR) {
        // fall back to regular IRQs
        err = pci_bus_mgr_allocate_irq(loc_, &irq_base);
        if (err != NO_ERROR) {
            printf("e1000: unable to allocate IRQ\n");
            return err;
        }
        register_int_handler(irq_base, irq_handler_wrapper, this);
    } else {
        register_int_handler_msi(irq_base, irq_handler_wrapper, this, true);
    }
    LTRACEF("IRQ number %#x\n", irq_base);

    unmask_interrupt(irq_base);

    // set up the rx ring
    write_reg(e1000_reg::RDBAL, rxring_phys & 0xffffffff);
#if __INTPTR_WIDTH__ == 64
    write_reg(e1000_reg::RDBAH, rxring_phys >> 32);
#else
    write_reg(e1000_reg::RDBAH, 0);
#endif
    write_reg(e1000_reg::RDLEN, rxring_len * sizeof(rdesc));

    // set head and tail to 0
    write_reg(e1000_reg::RDH, 0);
    write_reg(e1000_reg::RDT, 0);

    // disable receive delay timer and absolute delay timer
    write_reg(e1000_reg::RDTR, 0);
    write_reg(e1000_reg::RADV, 0);

    // disable small packet detect
    write_reg(e1000_reg::RSRPD, 0);

    // set up the flow control thresholds
    write_reg(e1000_reg::FCRTL, 0);
    write_reg(e1000_reg::FCRTH, 0);

    // fill the rx ring with pktbufs; one slot is left empty (rxring_len - 1
    // buffers). The unlocked helper is used because device interrupts are
    // still masked and the rx worker has not been started yet.
    rx_last_head_ = read_reg(e1000_reg::RDH);
    rx_tail_ = read_reg(e1000_reg::RDT);
    for (size_t i = 0; i < rxring_len - 1; i++) {
        // construct a 2K pktbuf, pointing into our rx_buf_ block of memory
        auto *pkt = pktbuf_alloc_empty();
        if (!pkt) {
            break;
        }
        pktbuf_add_buffer(pkt, rx_buf_ + i * rxbuffer_len, rxbuffer_len, 0, 0, nullptr, nullptr);
        add_pktbuf_to_rxring_locked(pkt);
    }
    //hexdump(rxring_, rxring_len * sizeof(rdesc));

    // start rx worker thread
    auto wrapper_lambda = [](void *arg) -> int {
        e1000 *e = (e1000 *)arg;
        return e->rx_worker_routine();
    };
    rx_worker_thread_ = thread_create("e1000 rx worker", wrapper_lambda, this, HIGH_PRIORITY, DEFAULT_STACK_SIZE);
    thread_resume(rx_worker_thread_);

    // start receiver
    // enable RX, unicast promiscuous, multicast promiscuous, broadcast accept, BSIZE 2048
    write_reg(e1000_reg::RCTL, (1<<1) | (1<<3) | (1<<4) | (1<<15) | (0<<16));

    // unmask receive irq
    auto ims = read_reg(e1000_reg::IMS);
    write_reg(e1000_reg::IMS, ims | (1<<7) | (1<<6)); // RXO, RXTO

    // set up the tx path
    write_reg(e1000_reg::TDH, 0);
    write_reg(e1000_reg::TDT, 0);
    tx_last_head_ = 0;
    tx_tail_ = 0;

    // set up the tx ring
    write_reg(e1000_reg::TDBAL, txring_phys & 0xffffffff);
#if __INTPTR_WIDTH__ == 64
    write_reg(e1000_reg::TDBAH, txring_phys >> 32);
#else
    write_reg(e1000_reg::TDBAH, 0);
#endif
    write_reg(e1000_reg::TDLEN, txring_len * sizeof(tdesc));

    // enable the transmitter and appropriate irqs
    write_reg(e1000_reg::TCTL, (1<<3) | (1<<1)); // short packet pad, tx enable

    // unmask tx irq
    ims = read_reg(e1000_reg::IMS);
    write_reg(e1000_reg::IMS, ims | (1<<1) | (1<<0)); // transmit queue empty, tx descriptor write back

    return NO_ERROR;
}
// XXX REMOVE HACK
// C-linkage transmit hook: forwards `p` to the device remembered in the_e.
// Does nothing when no device has been registered. Always returns NO_ERROR.
extern "C"
int e1000_tx(pktbuf_t *p) {
    if (!the_e) {
        return NO_ERROR;
    }

    the_e->tx(p);
    return NO_ERROR;
}
// Init hook: probe the PCI bus for every device id listed in e1000_ids and
// bring up each device found.
//
// `level` is the init level the hook runs at; it is not used.
static void e1000_init(uint level) {
    LTRACE_ENTRY;
    auto ac = lk::make_auto_call([]() { LTRACE_EXIT; });

    // probe pci to find a device
    //
    // Iterate by reference: init_device() stores the pointer it is given, so it
    // has to point at the table entry itself rather than at a per-iteration
    // copy that dies at the end of the loop body.
    for (const auto &id : e1000_ids) {
        // `i` selects the i-th device matching this id; stop at the first miss
        for (size_t i = 0; ; i++) {
            pci_location_t loc;
            status_t err = pci_bus_mgr_find_device(&loc, id.id, 0x8086, i);
            if (err != NO_ERROR) {
                break;
            }

            // we maybe found one, create a new device and initialize it
            auto e = new e1000;
            err = e->init_device(loc, &id);
            if (err != NO_ERROR) {
                char str[14];
                printf("e1000: device at %s failed to initialize\n", pci_loc_string(loc, str));
                delete e;
                continue;
            }

            // XXX remember the device for the minip tx hook; with more than one
            // adapter the one initialized last wins
            the_e = e;
        }
    }
}
// register e1000_init() to run one init level after LK_INIT_LEVEL_PLATFORM
LK_INIT_HOOK(e1000, &e1000_init, LK_INIT_LEVEL_PLATFORM + 1);

128
dev/net/e1000/e1000_hw.h Normal file
View File

@@ -0,0 +1,128 @@
//
// Copyright (c) 2021 Travis Geiselbrecht
//
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file or at
// https://opensource.org/licenses/MIT
#pragma once
#include <stdint.h>
#include <stdio.h>
// Byte offsets of the device registers inside the BAR 0 register window.
// from 8254x Family SDM Table 13-2
//
// FCAL, FCAH and PBA carry the datasheet spelling; the older FACL, FACH and
// DMA enumerators are kept as aliases with the same value so existing users
// keep compiling.
enum class e1000_reg {
    // general
    CTRL = 0x0,
    STATUS = 0x8,
    EECD = 0x10,
    EERD = 0x14,
    CTL_EXT = 0x18,
    MDIC = 0x20,
    FCAL = 0x28,
    FACL = FCAL, // legacy spelling
    FCAH = 0x2c,
    FACH = FCAH, // legacy spelling
    FCT = 0x30,
    VET = 0x38,
    FCTTV = 0x170,
    TXCW = 0x178,
    RXCW = 0x180,
    LEDCTL = 0xe00,

    // DMA
    PBA = 0x1000, // packet buffer allocation
    DMA = PBA,    // legacy name

    // interrupt
    ICR = 0xc0,
    ITR = 0xc4,
    IMS = 0xd0,
    IMC = 0xd8,
    IAM = 0xe0,
    EITR0 = 0x1680, // e1000e only (i210)+
    EITR1 = 0x1684,
    EITR2 = 0x1688,
    EITR3 = 0x168c,
    EITR4 = 0x1690,

    // receive
    RCTL = 0x100,
    FCRTL = 0x2160,
    FCRTH = 0x2168,
    RDBAL = 0x2800,
    RDBAH = 0x2804,
    RDLEN = 0x2808,
    RDH = 0x2810,
    RDT = 0x2818,
    RDTR = 0x2820,
    RADV = 0x282c,
    RSRPD = 0x2c00,

    // transmit
    TCTL = 0x400,
    TIPG = 0x410,
    AIFS = 0x458,
    TDBAL = 0x3800,
    TDBAH = 0x3804,
    TDLEN = 0x3808,
    TDH = 0x3810,
    TDT = 0x3818,
    TIDV = 0x3820,

    // tx dma
    TXDMAC = 0x3000,
    TXDCTL = 0x3828,
    TADV = 0x382c,
    TSPMT = 0x3830,

    // rx dma
    RXDCTL = 0x2828,
    RXCSUM = 0x5000,
};
// receive descriptor
//
// In-memory layout of one rx ring entry; the static_assert below pins the size
// at 16 bytes.
struct rdesc {
    uint64_t addr;     // physical address of the receive buffer
    uint16_t length;   // length of the received data
    uint16_t checksum;
    uint8_t status;    // bit 0: descriptor done, bit 1: end of packet
    uint8_t errors;    // zero for a good packet
    uint16_t special;

    // Print every field of the descriptor; debugging aid.
    void dump() const {
        // %p wants a void pointer and %llx an unsigned long long; uint64_t is
        // not that type on every target, so convert explicitly
        printf("rdsec %p: addr %#llx len %hu cksum %#hx stat %#hhx err %#hhx spec %#hx\n",
               static_cast<const void *>(this), static_cast<unsigned long long>(addr),
               length, checksum, status, errors, special);
    }
};
static_assert(sizeof(rdesc) == 16, "");
// transmit descriptor (legacy)
//
// In-memory layout of one tx ring entry; the static_assert below pins the size
// at 16 bytes.
struct tdesc {
    uint64_t addr;    // physical address of the data to send
    uint16_t length;  // number of bytes to send
    uint8_t cso;
    uint8_t cmd;      // bit 0: end of packet (EOP)
    uint8_t sta_rsv;
    uint8_t css;
    uint16_t special;

    // Print every field of the descriptor; debugging aid.
    void dump() const {
        // %p wants a void pointer and %llx an unsigned long long; uint64_t is
        // not that type on every target, so convert explicitly
        printf("tdsec %p: addr %#llx len %hu cso %#hhx cmd %#hhx sta_rsv %#hhx css %#hhx spec %#hx\n",
               static_cast<const void *>(this), static_cast<unsigned long long>(addr),
               length, cso, cmd, sta_rsv, css, special);
    }
};
static_assert(sizeof(tdesc) == 16, "");
// efficient copy for rx/tx descriptors out/into uncached memory
//
// Moves a 16-byte object from `_src` to `_dst` as two 64-bit word assignments.
// Any type whose size is not exactly 16 bytes is rejected at compile time.
template <typename T>
inline void copy(T *_dst, const T *_src) {
    static_assert(sizeof(T) == 16, "");

    // view both objects as a pair of 8 byte words
    auto *out = reinterpret_cast<uint64_t *>(_dst);
    const auto *in = reinterpret_cast<const uint64_t *>(_src);

    out[0] = in[0];
    out[1] = in[1];
}

10
dev/net/e1000/rules.mk Normal file
View File

@@ -0,0 +1,10 @@
# Build rules for the e1000 network driver module.
LOCAL_DIR := $(GET_LOCAL_DIR)
MODULE := $(LOCAL_DIR)
# single translation unit; e1000_hw.h is pulled in by it
MODULE_SRCS += $(LOCAL_DIR)/e1000.cpp
# the driver calls into the PCI bus manager and the minip network stack
MODULE_DEPS += dev/bus/pci
MODULE_DEPS += lib/minip
include make/module.mk

View File

@@ -296,4 +296,9 @@ void platform_init(void) {
#endif
platform_init_mmu_mappings();
#if WITH_LIB_MINIP
extern int e1000_tx(pktbuf_t *p);
minip_init_dhcp(e1000_tx, 0);
#endif
}

View File

@@ -13,6 +13,7 @@ MODULE_DEPS += \
ifneq ($(CPU),legacy)
MODULE_DEPS += dev/bus/pci
MODULE_DEPS += dev/net/e1000
endif
MODULE_SRCS += \

View File

@@ -110,7 +110,7 @@ if (( $DO_NET )); then
exit 1
fi
else
ARGS+=" -nic none"
ARGS+=" -net none"
fi
$DIR/make-parallel $MAKE_VARS $PROJECT &&