[arch][riscv] more work on riscv MMU code

Use a callback-based shared walker to implement the
same page table walking code for different operations.

Add SBI hooks for TLB flushing.
Travis Geiselbrecht
2021-03-05 02:21:00 -08:00
parent c6d8476adb
commit 45a27cbf14
6 changed files with 308 additions and 96 deletions
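
The mmu.cpp diff below centralizes page table traversal in a single walker, riscv_pt_walk, which each operation (map, unmap, query) drives with a small callback. As a hedged illustration of the pattern (a hypothetical sketch, not code from this commit), a future arch_mmu_protect could reuse the same walker using only the helpers the diff defines (walk_cb_ret, mmu_flags_to_pte, RISCV_PTE_*):

    // hypothetical sketch only; arch_mmu_protect is not part of this commit
    int arch_mmu_protect_sketch(arch_aspace_t *aspace, vaddr_t vaddr, uint count, uint flags) {
        auto protect_cb = [&count, flags](uint level, uint index, riscv_pte_t pte, vaddr_t *va) -> walk_cb_ret {
            if (!(pte & RISCV_PTE_V)) {
                return walk_cb_ret::OpHalt(ERR_NOT_FOUND); // hole in the mapping
            }
            if (level > 0) {
                return walk_cb_ret::OpHalt(ERR_NOT_IMPLEMENTED); // large pages not handled in this sketch
            }
            // keep the PPN and A/D/G bits, swap in the new permission bits
            riscv_pte_t new_pte = (pte & ~RISCV_PTE_PERM_MASK) | mmu_flags_to_pte(flags);
            *va += PAGE_SIZE; // advance the walk one page
            count--;
            return (count == 0) ? walk_cb_ret::OpCommitHalt(new_pte, false, NO_ERROR)
                                : walk_cb_ret::OpCommitRestart(new_pte, false);
        };
        return riscv_pt_walk(aspace, vaddr, protect_cb);
    }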


@@ -59,6 +59,10 @@ void arch_early_init(void) {
#if RISCV_S_MODE
sbi_early_init();
#endif
#if RISCV_MMU
riscv_early_mmu_init();
#endif
}
@@ -89,6 +93,7 @@ void arch_init(void) {
dprintf(INFO, "RISCV: Supervisor mode\n");
#if RISCV_MMU
dprintf(INFO, "RISCV: MMU enabled sv%u\n", RISCV_MMU);
riscv_mmu_init();
#endif
sbi_init();
#endif


@@ -8,11 +8,14 @@
#pragma once
#include <lk/compiler.h>
#include <stdint.h>
#include <stdbool.h>
#include <sys/types.h>
#include <arch/riscv.h>
__BEGIN_CDECLS
#if !defined(RISCV_M_MODE) || !(RISCV_M_MODE)
struct sbiret {
@@ -61,10 +64,14 @@ void sbi_send_ipis(const unsigned long *hart_mask);
void sbi_clear_ipi(void);
status_t sbi_boot_hart(uint hartid, paddr_t start_addr, ulong arg);
void sbi_rfence_vma(const unsigned long *hart_mask, vaddr_t vma, size_t size);
bool sbi_probe_extension(ulong extension);
struct sbiret sbi_generic_call_2(ulong extension, ulong function);
struct sbiret sbi_generic_call_3(ulong extension, ulong function);
#endif
__END_CDECLS
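
A hedged usage sketch for the generic call hooks above, assuming the usual sbiret error/value fields and that sbi_generic_call_2 follows the SBI calling convention (extension id in a7, function id in a6); the 0x10/0 pair is the base extension's get_spec_version call:

    struct sbiret ret = sbi_generic_call_2(0x10, 0); // SBI base extension, FID 0: get_spec_version
    if (ret.error == 0) {
        dprintf(INFO, "SBI spec version %#lx\n", ret.value);
    }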


@@ -18,6 +18,7 @@
#include <arch/mmu.h>
#include <arch/riscv.h>
#include <arch/riscv/csr.h>
#include <arch/riscv/sbi.h>
#include <kernel/vm.h>
#define LOCAL_TRACE 0
@@ -30,6 +31,7 @@
riscv_pte_t kernel_pgtable[512] __ALIGNED(PAGE_SIZE);
paddr_t kernel_pgtable_phys; // filled in by start.S
static ulong riscv_asid_mask;
// initial memory mappings. the VM layer uses these to construct mappings after the fact
struct mmu_initial_mapping mmu_initial_mappings[] = {
@@ -52,6 +54,24 @@ struct mmu_initial_mapping mmu_initial_mappings[] = {
{ }
};
// called once on the boot cpu during very early (single threaded) init
extern "C"
void riscv_early_mmu_init() {
// figure out the number of supported ASID bits by writing all 1s to
// the asid field in satp and seeing which ones 'stick'
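// (for example, a hart implementing 8 low ASID bits typically reads back 0xff, leaving riscv_asid_mask = 0xff)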
auto satp_orig = riscv_csr_read(satp);
auto satp = satp_orig | (RISCV_SATP_ASID_MASK << RISCV_SATP_ASID_SHIFT);
riscv_csr_write(satp, satp);
riscv_asid_mask = (riscv_csr_read(satp) >> RISCV_SATP_ASID_SHIFT) & RISCV_SATP_ASID_MASK;
riscv_csr_write(satp, satp_orig);
}
// called a bit later once on the boot cpu
extern "C"
void riscv_mmu_init() {
printf("RISCV: MMU ASID mask %#lx\n", riscv_asid_mask);
}
static inline void riscv_set_satp(uint asid, paddr_t pt) {
ulong satp;
@@ -62,17 +82,40 @@ static inline void riscv_set_satp(uint asid, paddr_t pt) {
#endif
// make sure the asid is in range
DEBUG_ASSERT((asid & RISCV_SATP_ASID_MASK) == 0);
DEBUG_ASSERT((asid & riscv_asid_mask) == asid);
satp |= (ulong)asid << RISCV_SATP_ASID_SHIFT;
// make sure the page table is aligned
DEBUG_ASSERT(IS_PAGE_ALIGNED(pt));
satp |= pt;
satp |= pt >> PAGE_SIZE_SHIFT;
riscv_csr_write(RISCV_CSR_SATP, satp);
// TODO: TLB flush here or use asid properly
// sfence.vma zero, zero
asm("sfence.vma zero, zero");
}
static void riscv_tlb_flush_vma_range(vaddr_t base, size_t count) {
if (count == 0)
return;
// Use SBI to shoot down a range of vaddrs on all the cpus
ulong hart_mask = -1; // TODO: be more selective about the cpus
sbi_rfence_vma(&hart_mask, base, count * PAGE_SIZE);
// locally shoot down
// XXX: is this needed or does the sbi call do it if included in the local hart mask?
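// (per the SBI spec the remote fence is executed by every hart in the mask, so with the full mask above this local loop is likely redundant; kept out of caution)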
while (count > 0) {
asm volatile("sfence.vma %0, zero" :: "r"(base));
base += PAGE_SIZE;
count--;
}
}
static void riscv_tlb_flush_global() {
// Use SBI to do a global TLB shoot down on all cpus
ulong hart_mask = -1; // TODO: be more selective about the cpus
sbi_rfence_vma(&hart_mask, 0, -1);
}
// given a va address and the level, compute the index in the current PT
@@ -154,7 +197,7 @@ status_t arch_mmu_init_aspace(arch_aspace_t *aspace, vaddr_t base, size_t size,
aspace->flags = flags;
if (flags & ARCH_ASPACE_FLAG_KERNEL) {
// at the moment we can only deal with address spaces as globally defined
// kernel aspace is special and should be constructed once
DEBUG_ASSERT(base == KERNEL_ASPACE_BASE);
DEBUG_ASSERT(size == KERNEL_ASPACE_SIZE);
@@ -163,59 +206,81 @@ status_t arch_mmu_init_aspace(arch_aspace_t *aspace, vaddr_t base, size_t size,
aspace->pt_virt = kernel_pgtable;
aspace->pt_phys = kernel_pgtable_phys;
} else {
PANIC_UNIMPLEMENTED;
// at the moment can only deal with user aspaces that perfectly
// cover the predefined range
DEBUG_ASSERT(base == USER_ASPACE_BASE);
DEBUG_ASSERT(size == USER_ASPACE_SIZE);
// implement
return ERR_NOT_IMPLEMENTED;
}
LTRACEF("pt phys %#lx, pt virt %p\n", aspace->pt_phys, aspace->pt_virt);
return NO_ERROR;
}
status_t arch_mmu_destroy_aspace(arch_aspace_t *aspace) {
LTRACEF("aspace %p\n", aspace);
PANIC_UNIMPLEMENTED;
DEBUG_ASSERT(aspace);
if (aspace->flags & ARCH_ASPACE_FLAG_KERNEL) {
panic("trying to destroy kernel aspace\n");
} else {
return ERR_NOT_IMPLEMENTED;
}
}
// routines to map/unmap/query mappings per address space
int arch_mmu_map(arch_aspace_t *aspace, vaddr_t vaddr, paddr_t paddr, uint count, const uint flags) {
LTRACEF("vaddr %#lx paddr %#lx count %u flags %#x\n", vaddr, paddr, count, flags);
enum class walk_cb_ret_op {
HALT,
RESTART,
ALLOC_PT
};
struct walk_cb_ret {
static walk_cb_ret OpHalt(int err) { return { walk_cb_ret_op::HALT, err, false, 0, false }; }
static walk_cb_ret OpRestart() { return { walk_cb_ret_op::RESTART, NO_ERROR, false, 0, false }; }
static walk_cb_ret OpCommitHalt(riscv_pte_t pte, bool unmap, int err) { return { walk_cb_ret_op::HALT, err, true, pte, unmap }; }
static walk_cb_ret OpCommitRestart(riscv_pte_t pte, bool unmap) { return { walk_cb_ret_op::RESTART, NO_ERROR, true, pte, unmap }; }
static walk_cb_ret OpAllocPT() { return { walk_cb_ret_op::ALLOC_PT, 0, false, 0, false }; }
// overall continuation op
walk_cb_ret_op op;
// if halting, return error
int err;
// commit the pte entry
bool commit;
riscv_pte_t new_pte;
bool unmap; // we are unmapping, so test for empty page tables
};
// in the callback arg, define a function or lambda that matches this signature
using page_walk_cb = walk_cb_ret(*)(uint level, uint index, riscv_pte_t pte, vaddr_t *vaddr);
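// e.g. auto cb = [](uint level, uint index, riscv_pte_t pte, vaddr_t *vaddr) -> walk_cb_ret { return walk_cb_ret::OpHalt(NO_ERROR); };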
// generic walker routine to automate drilling through a page table structure
template <typename F>
static int riscv_pt_walk(arch_aspace_t *aspace, vaddr_t vaddr, F callback) {
LTRACEF("vaddr %#lx\n", vaddr);
DEBUG_ASSERT(aspace);
restart:
if (count == 0)
return NO_ERROR;
// bootstrap the top level walk
uint level = RISCV_MMU_PT_LEVELS - 1;
uint index = vaddr_to_index(vaddr, level);
volatile riscv_pte_t *ptep = aspace->pt_virt + index;
for (;;) {
LTRACEF_LEVEL(2, "level %u, index %u, pte %p (%#lx) va %#lx pa %#lx\n",
level, index, ptep, *ptep, vaddr, paddr);
LTRACEF_LEVEL(2, "level %u, index %u, pte %p (%#lx) va %#lx\n",
level, index, ptep, *ptep, vaddr);
// look at our page table entry
riscv_pte_t pte = *ptep;
if (level > 0 && !(pte & RISCV_PTE_V)) {
// invalid entry, will have to add a page table
paddr_t ptp;
volatile riscv_pte_t *ptv = alloc_ptable(&ptp);
if (!ptv) {
return ERR_NO_MEMORY;
}
LTRACEF_LEVEL(2, "new ptable table %p, pa %#lx\n", ptv, ptp);
// link it in. RWX == 0 is a page table link
pte = RISCV_PTE_PPN_TO_PTE(ptp) | RISCV_PTE_V;
*ptep = pte;
// go one level deeper
level--;
index = vaddr_to_index(vaddr, level);
ptep = ptv + index;
} else if ((pte & RISCV_PTE_V) && !(pte & RISCV_PTE_PERM_MASK)) {
if ((pte & RISCV_PTE_V) && !(pte & RISCV_PTE_PERM_MASK)) {
// next level page table pointer (RWX = 0)
paddr_t ptp = RISCV_PTE_PPN(pte);
volatile riscv_pte_t *ptv = (riscv_pte_t *)paddr_to_kvaddr(ptp);
@@ -226,32 +291,49 @@ restart:
level--;
index = vaddr_to_index(vaddr, level);
ptep = ptv + index;
} else if (pte & RISCV_PTE_V) {
// terminal entry already exists
if (level > 0) {
PANIC_UNIMPLEMENTED_MSG("terminal large page entry");
} else {
PANIC_UNIMPLEMENTED_MSG("terminal page entry");
}
} else {
DEBUG_ASSERT(level == 0 && !(pte & RISCV_PTE_V));
// it's an invalid page entry or a valid terminal entry
// call the callback, seeing what the user wants
auto ret = callback(level, index, pte, &vaddr);
switch (ret.op) {
case walk_cb_ret_op::HALT:
case walk_cb_ret_op::RESTART:
// see if we're being asked to commit a change
if (ret.commit) {
// commit the change
*ptep = ret.new_pte;
if (ret.unmap) {
// TODO: this was an unmap, test to see if we have emptied a page table
}
}
// hit an open terminal page table entry, let's add ours
pte = RISCV_PTE_PPN_TO_PTE(paddr);
pte |= mmu_flags_to_pte(flags);
pte |= RISCV_PTE_A | RISCV_PTE_D | RISCV_PTE_V;
pte |= (aspace->flags & ARCH_ASPACE_FLAG_KERNEL) ? RISCV_PTE_G : 0;
if (ret.op == walk_cb_ret_op::HALT) {
// stop here
return ret.err;
} else { // RESTART
// user should have modified vaddr or we'll probably be in a loop
goto restart;
}
case walk_cb_ret_op::ALLOC_PT:
// user wants us to add a page table and continue
paddr_t ptp;
volatile riscv_pte_t *ptv = alloc_ptable(&ptp);
if (!ptv) {
return ERR_NO_MEMORY;
}
LTRACEF_LEVEL(2, "added new terminal entry: pte %#lx\n", pte);
LTRACEF_LEVEL(2, "new ptable table %p, pa %#lx\n", ptv, ptp);
*ptep = pte;
// link it in. RWX == 0 is a page table link
pte = RISCV_PTE_PPN_TO_PTE(ptp) | RISCV_PTE_V;
*ptep = pte;
// simple algorithm: restart walk from top, one page at a time
// TODO: more efficiently deal with runs and large pages
count--;
paddr += PAGE_SIZE;
vaddr += PAGE_SIZE;
goto restart;
// go one level deeper
level--;
index = vaddr_to_index(vaddr, level);
ptep = ptv + index;
break;
}
}
// make sure we didn't decrement level one too many
@@ -260,49 +342,96 @@ restart:
// unreachable
}
int arch_mmu_unmap(arch_aspace_t *aspace, vaddr_t vaddr, uint count) {
LTRACEF("vaddr %#lx count %u\n", vaddr, count);
// routines to map/unmap/query mappings per address space
int arch_mmu_map(arch_aspace_t *aspace, const vaddr_t _vaddr, paddr_t paddr, uint count, const uint flags) {
LTRACEF("vaddr %#lx paddr %#lx count %u flags %#x\n", _vaddr, paddr, count, flags);
PANIC_UNIMPLEMENTED;
DEBUG_ASSERT(aspace);
if (count == 0) {
return NO_ERROR;
}
// trim the vaddr to the aspace
if (_vaddr < aspace->base || _vaddr > aspace->base + aspace->size - 1) {
return ERR_OUT_OF_RANGE;
}
// TODO: make sure _vaddr + count * PAGE_SIZE is within the address space
// construct a local callback for the walker routine that
// a) tells the walker to build a page table if it's not present
// b) fills in a terminal page table entry with a page and tells the walker to start over
auto map_cb = [&paddr, &count, aspace, flags](uint level, uint index, riscv_pte_t pte, vaddr_t *vaddr) -> walk_cb_ret {
LTRACEF("level %u, index %u, pte %#lx, vaddr %#lx [paddr %#lx count %u flags %#x]\n",
level, index, pte, *vaddr, paddr, count, flags);
if ((pte & RISCV_PTE_V)) {
// we have hit a valid pte of some kind
// assert that it's not a page table pointer, which we shouldn't be hitting in the callback
DEBUG_ASSERT(pte & RISCV_PTE_PERM_MASK);
// for now, panic
if (level > 0) {
PANIC_UNIMPLEMENTED_MSG("terminal large page entry");
} else {
PANIC_UNIMPLEMENTED_MSG("terminal page entry");
}
return walk_cb_ret::OpHalt(ERR_ALREADY_EXISTS);
}
// hit an open page table entry
if (level > 0) {
// level is > 0, allocate a page table here
// TODO: optimize by allocating large page here if possible
return walk_cb_ret::OpAllocPT();
}
// adding a terminal page at level 0
riscv_pte_t temp_pte = RISCV_PTE_PPN_TO_PTE(paddr);
temp_pte |= mmu_flags_to_pte(flags);
temp_pte |= RISCV_PTE_A | RISCV_PTE_D | RISCV_PTE_V;
temp_pte |= (aspace->flags & ARCH_ASPACE_FLAG_KERNEL) ? RISCV_PTE_G : 0;
LTRACEF_LEVEL(2, "added new terminal entry: pte %#lx\n", temp_pte);
// modify what the walker handed us
*vaddr += PAGE_SIZE;
// bump our state forward
paddr += PAGE_SIZE;
count--;
// if we're done, tell the caller to commit our changes and either restart the walk or halt
if (count == 0) {
return walk_cb_ret::OpCommitHalt(temp_pte, false, NO_ERROR);
} else {
return walk_cb_ret::OpCommitRestart(temp_pte, false);
}
};
return riscv_pt_walk(aspace, _vaddr, map_cb);
}
status_t arch_mmu_query(arch_aspace_t *aspace, const vaddr_t vaddr, paddr_t *paddr, uint *flags) {
LTRACEF("aspace %p, vaddr %#lx\n", aspace, vaddr);
status_t arch_mmu_query(arch_aspace_t *aspace, const vaddr_t _vaddr, paddr_t *paddr, uint *flags) {
LTRACEF("aspace %p, vaddr %#lx\n", aspace, _vaddr);
DEBUG_ASSERT(aspace);
// trim the vaddr to the aspace
if (vaddr < aspace->base || vaddr > aspace->base + aspace->size - 1) {
if (_vaddr < aspace->base || _vaddr > aspace->base + aspace->size - 1) {
return ERR_OUT_OF_RANGE;
}
uint level = RISCV_MMU_PT_LEVELS - 1;
uint index = vaddr_to_index(vaddr, level);
volatile riscv_pte_t *ptep = aspace->pt_virt + index;
// construct a local callback for the walker routine that
// a) if it hits a terminal entry construct the flags we want and halt
// b) all other cases just halt and return ERR_NOT_FOUND
auto query_cb = [paddr, flags](uint level, uint index, riscv_pte_t pte, vaddr_t *vaddr) -> walk_cb_ret {
LTRACEF("level %u, index %u, pte %#lx, vaddr %#lx\n", level, index, pte, *vaddr);
// walk down through the levels, looking for a terminal entry that matches our address
for (;;) {
LTRACEF_LEVEL(2, "level %u, index %u, pte %p (%#lx)\n", level, index, ptep, *ptep);
// look at our page table entry
riscv_pte_t pte = *ptep;
if ((pte & RISCV_PTE_V) == 0) {
// invalid entry, terminate search
return ERR_NOT_FOUND;
} else if ((pte & RISCV_PTE_PERM_MASK) == 0) {
// next level page table pointer (RWX = 0)
paddr_t ptp = RISCV_PTE_PPN(pte);
volatile riscv_pte_t *ptv = (riscv_pte_t *)paddr_to_kvaddr(ptp);
LTRACEF_LEVEL(2, "next level page table at %p, pa %#lx\n", ptv, ptp);
// go one level deeper
level--;
index = vaddr_to_index(vaddr, level);
ptep = ptv + index;
} else {
// terminal entry
LTRACEF_LEVEL(3, "terminal entry\n");
if (pte & RISCV_PTE_V) {
// we have hit a valid pte of some kind
// assert that it's not a page table pointer, which we shouldn't be hitting in the callback
DEBUG_ASSERT(pte & RISCV_PTE_PERM_MASK);
if (paddr) {
// extract the ppn
@@ -310,7 +439,7 @@ status_t arch_mmu_query(arch_aspace_t *aspace, const vaddr_t vaddr, paddr_t *pad
uintptr_t page_mask = page_mask_per_level(level);
// add the va offset into the physical address
*paddr = pa | (vaddr & page_mask);
*paddr = pa | (*vaddr & page_mask);
LTRACEF_LEVEL(3, "raw pa %#lx, page_mask %#lx, final pa %#lx\n", pa, page_mask, *paddr);
}
@@ -319,16 +448,76 @@ status_t arch_mmu_query(arch_aspace_t *aspace, const vaddr_t vaddr, paddr_t *pad
*flags = pte_flags_to_mmu_flags(pte);
LTRACEF_LEVEL(3, "computed flags %#x\n", *flags);
}
return NO_ERROR;
// we found our page, so stop
return walk_cb_ret::OpHalt(NO_ERROR);
} else {
// couldn't find our page, stop
return walk_cb_ret::OpHalt(ERR_NOT_FOUND);
}
};
// make sure we didn't decrement level one too many
DEBUG_ASSERT(level < RISCV_MMU_PT_LEVELS);
}
// unreachable
return riscv_pt_walk(aspace, _vaddr, query_cb);
}
int arch_mmu_unmap(arch_aspace_t *aspace, const vaddr_t _vaddr, const uint _count) {
LTRACEF("vaddr %#lx count %u\n", _vaddr, _count);
DEBUG_ASSERT(aspace);
if (_count == 0) {
return NO_ERROR;
}
// trim the vaddr to the aspace
if (_vaddr < aspace->base || _vaddr > aspace->base + aspace->size - 1) {
return ERR_OUT_OF_RANGE;
}
// TODO: make sure _vaddr + count * PAGE_SIZE is within the address space
// construct a local callback for the walker routine that
// a) if it hits a terminal 4K entry write zeros to it
// b) if it hits an empty spot continue
auto count = _count;
auto unmap_cb = [&count]
(uint level, uint index, riscv_pte_t pte, vaddr_t *vaddr) -> walk_cb_ret {
LTRACEF("level %u, index %u, pte %#lx, vaddr %#lx\n", level, index, pte, *vaddr);
if (pte & RISCV_PTE_V) {
// we have hit a valid pte of some kind
// assert that it's not a page table pointer, which we shouldn't be hitting in the callback
DEBUG_ASSERT(pte & RISCV_PTE_PERM_MASK);
if (level > 0) {
PANIC_UNIMPLEMENTED_MSG("cannot handle unmapping of large page");
}
// zero it out, which should unmap the page
// TODO: handle freeing upper level page tables
*vaddr += PAGE_SIZE;
count--;
if (count == 0) {
return walk_cb_ret::OpCommitHalt(0, true, NO_ERROR);
} else {
return walk_cb_ret::OpCommitRestart(0, true);
}
} else {
// nothing here so skip forward and try the next page
*vaddr += PAGE_SIZE;
count--;
if (count == 0) {
return walk_cb_ret::OpHalt(NO_ERROR);
} else {
return walk_cb_ret::OpRestart();
}
}
};
int ret = riscv_pt_walk(aspace, _vaddr, unmap_cb);
// TLB shootdown the range we've unmapped
riscv_tlb_flush_vma_range(_vaddr, _count);
return ret;
}
// load a new user address space context.
// aspace argument NULL should load kernel-only context
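
Given riscv_set_satp above, a hedged sketch of that context switch (hypothetical; the real body is outside this hunk, and ASID allocation is still TODO):

    // hypothetical sketch, assuming aspace->pt_phys as set up in arch_mmu_init_aspace
    void arch_mmu_context_switch_sketch(arch_aspace_t *aspace) {
        if (!aspace) {
            riscv_set_satp(0, kernel_pgtable_phys); // kernel-only context, ASID 0 for now
        } else {
            riscv_set_satp(0, aspace->pt_phys); // TODO: hand out real ASIDs from riscv_asid_mask
        }
    }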


@@ -13,4 +13,6 @@ void riscv_early_init_percpu(void);
void riscv_init_percpu(void);
void riscv_boot_secondaries(void);
void riscv_configure_percpu_mp_early(uint hart_id, uint cpu_num);
void riscv_early_mmu_init(void);
void riscv_mmu_init(void);


@@ -101,6 +101,15 @@ status_t sbi_boot_hart(uint hartid, paddr_t start_addr, ulong arg) {
return NO_ERROR;
}
void sbi_rfence_vma(const unsigned long *hart_mask, vaddr_t vma, size_t size) {
// use the SBI RFENCE extension
if (likely(sbi_ext_present(SBI_EXTENSION_RFENCE))) {
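// (RFENCE extension FID 1 is sbi_remote_sfence_vma(hart_mask, hart_mask_base, start_addr, size) per the SBI spec)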
sbi_call(SBI_EXT_RFENCE_SIG, 1, *hart_mask, 0, vma, size);
} else {
PANIC_UNIMPLEMENTED;
}
}
void sbi_early_init(void) {
// read the presence of some features
sbi_ext |= sbi_probe_extension(SBI_EXT_TIMER_SIG) ? (1<<SBI_EXTENSION_TIMER) : 0;


@@ -53,8 +53,8 @@ static inline void hexdump8(const void *ptr, size_t len) {
/* systemwide halts */
void panic(const char *fmt, ...) __PRINTFLIKE(1, 2) __NO_RETURN;
#define PANIC_UNIMPLEMENTED panic("%s unimplemented\n", __PRETTY_FUNCTION__)
#define PANIC_UNIMPLEMENTED_MSG(x...) panic("%s unimplemented: %s\n", __PRETTY_FUNCTION__, x)
#define PANIC_UNIMPLEMENTED panic("%s:%d unimplemented\n", __PRETTY_FUNCTION__, __LINE__)
#define PANIC_UNIMPLEMENTED_MSG(x...) panic("%s:%d unimplemented: %s\n", __PRETTY_FUNCTION__, __LINE__, x)
/* spin the cpu for a period of (short) time */
void spin(uint32_t usecs);