[arch][riscv] add fpu context switch support

Currently only implemented for double precision floating point.

Caveat: it is currently not possible to compile only part of the code with
float support and the rest without. The linker is extremely picky about mixing
float and no-float objects, so stick with all on or all off for now.

It's not as much of a problem currently because the toolchain is not
using any riscv vector instructions to assist normal code, so it's
generally only emitting fpu instructions for floating point code.
This commit is contained in:
Travis Geiselbrecht
2022-07-17 23:27:42 -07:00
parent 6462cbf51c
commit b0d8aeed18
7 changed files with 214 additions and 29 deletions

View File

@@ -5,7 +5,7 @@
* license that can be found in the LICENSE file or at
* https://opensource.org/licenses/MIT
*/
#if ARM_WITH_VFP || ARCH_ARM64 || X86_WITH_FPU
#if ARM_WITH_VFP || ARCH_ARM64 || X86_WITH_FPU || (ARCH_RISCV && RISCV_FPU)
#include <stdio.h>
#include <rand.h>

View File

@@ -55,6 +55,14 @@ void riscv_early_init_percpu(void) {
riscv_csr_clear(RISCV_CSR_XSTATUS, RISCV_CSR_XSTATUS_IE);
riscv_csr_clear(RISCV_CSR_XIE, RISCV_CSR_XIE_SIE | RISCV_CSR_XIE_TIE | RISCV_CSR_XIE_EIE);
#if RISCV_FPU
// enable the fpu and zero it out
riscv_csr_clear(RISCV_CSR_XSTATUS, RISCV_CSR_XSTATUS_FS_MASK);
riscv_csr_set(RISCV_CSR_XSTATUS, RISCV_CSR_XSTATUS_FS_INITIAL);
riscv_fpu_zero();
#endif
// enable cycle counter (disabled for now, unimplemented on sifive-e)
//riscv_csr_set(mcounteren, 1);
}
@@ -145,10 +153,6 @@ void arch_enter_uspace(vaddr_t entry_point, vaddr_t user_stack_top) {
status = RISCV_CSR_XSTATUS_PIE |
RISCV_CSR_XSTATUS_SUM;
#if RISCV_FPU
status |= (1ul << RISCV_CSR_XSTATUS_FS_SHIFT); // mark fpu state 'initial'
#endif
printf("user sstatus %#lx\n", status);
arch_disable_ints();
@@ -157,7 +161,10 @@ void arch_enter_uspace(vaddr_t entry_point, vaddr_t user_stack_top) {
riscv_csr_write(sepc, entry_point);
riscv_csr_write(sscratch, kernel_stack_top);
#if RISCV_FPU
status |= RISCV_CSR_XSTATUS_FS_INITIAL; // mark fpu state 'initial'
riscv_fpu_zero();
#endif
// put the current tp (percpu pointer) just below the top of the stack
// the exception code will recover it when coming from user space

View File

@@ -9,6 +9,8 @@
#include <arch/riscv.h>
#include <arch/riscv/asm.h>
#if RISCV_FPU
// enable full use of all of the fpu instructions
#if __riscv_xlen == 32
.attribute arch, "rv32imafdc"
@@ -30,7 +32,6 @@
// called just before entering user space for the first time.
// must not use the stack and is okay to be called with interrupts disabled.
FUNCTION(riscv_fpu_zero)
#if RISCV_FPU
// zero out the fpu state
// TODO: handle single precision implementations
csrw fcsr, zero
@@ -66,7 +67,93 @@ FUNCTION(riscv_fpu_zero)
ZERO_FPU_REG f29, d
ZERO_FPU_REG f30, d
ZERO_FPU_REG f31, d
#endif
// put the hardware in the initial state
// FS[1:0] == 1 set in two steps: one to set bit 0, second one to clear bit 1
// this ensures it doesn't go through the disabled state (00)
li a0, (1 << 13)
csrs RISCV_CSR_XSTATUS, a0
li a0, (1 << 14)
csrc RISCV_CSR_XSTATUS, a0
ret
END_FUNCTION(riscv_fpu_zero)
// void riscv_fpu_save(struct riscv_fpu_state *state);
//
// Store the fpu register file into the state structure addressed by a0.
// Register fN is written as a 64 bit double at byte offset N*8 and fcsr is
// written at byte offset 32*8. These offsets are hard-coded here and have to
// stay in sync with the field order of struct riscv_fpu_state.
// Overwrites a1. Only double precision (fsd) is handled.
FUNCTION(riscv_fpu_save)
fsd f0, 0*8(a0)
fsd f1, 1*8(a0)
fsd f2, 2*8(a0)
fsd f3, 3*8(a0)
fsd f4, 4*8(a0)
fsd f5, 5*8(a0)
fsd f6, 6*8(a0)
fsd f7, 7*8(a0)
fsd f8, 8*8(a0)
fsd f9, 9*8(a0)
fsd f10, 10*8(a0)
fsd f11, 11*8(a0)
fsd f12, 12*8(a0)
fsd f13, 13*8(a0)
fsd f14, 14*8(a0)
fsd f15, 15*8(a0)
fsd f16, 16*8(a0)
fsd f17, 17*8(a0)
fsd f18, 18*8(a0)
fsd f19, 19*8(a0)
fsd f20, 20*8(a0)
fsd f21, 21*8(a0)
fsd f22, 22*8(a0)
fsd f23, 23*8(a0)
fsd f24, 24*8(a0)
fsd f25, 25*8(a0)
fsd f26, 26*8(a0)
fsd f27, 27*8(a0)
fsd f28, 28*8(a0)
fsd f29, 29*8(a0)
fsd f30, 30*8(a0)
fsd f31, 31*8(a0)
// the control/status register is saved in the slot right after the 32 registers.
// sw writes 32 bits; the matching restore routine reads it back with lw.
csrr a1, fcsr
sw a1, 32*8(a0)
ret
END_FUNCTION(riscv_fpu_save)
// void riscv_fpu_restore(struct riscv_fpu_state *state);
//
// Load the fpu register file from the state structure addressed by a0.
// Register fN is read as a 64 bit double from byte offset N*8 and fcsr is
// read from byte offset 32*8, mirroring the layout written by riscv_fpu_save.
// Overwrites a1. Only double precision (fld) is handled.
FUNCTION(riscv_fpu_restore)
fld f0, 0*8(a0)
fld f1, 1*8(a0)
fld f2, 2*8(a0)
fld f3, 3*8(a0)
fld f4, 4*8(a0)
fld f5, 5*8(a0)
fld f6, 6*8(a0)
fld f7, 7*8(a0)
fld f8, 8*8(a0)
fld f9, 9*8(a0)
fld f10, 10*8(a0)
fld f11, 11*8(a0)
fld f12, 12*8(a0)
fld f13, 13*8(a0)
fld f14, 14*8(a0)
fld f15, 15*8(a0)
fld f16, 16*8(a0)
fld f17, 17*8(a0)
fld f18, 18*8(a0)
fld f19, 19*8(a0)
fld f20, 20*8(a0)
fld f21, 21*8(a0)
fld f22, 22*8(a0)
fld f23, 23*8(a0)
fld f24, 24*8(a0)
fld f25, 25*8(a0)
fld f26, 26*8(a0)
fld f27, 27*8(a0)
fld f28, 28*8(a0)
fld f29, 29*8(a0)
fld f30, 30*8(a0)
fld f31, 31*8(a0)
// the saved control/status word lives right after the 32 registers.
// lw reads the 32 bits that the save routine stored with sw.
lw a1, 32*8(a0)
csrw fcsr, a1
ret
END_FUNCTION(riscv_fpu_restore)
#endif // RISCV_FPU

View File

@@ -8,6 +8,12 @@
#pragma once
#include <sys/types.h>
#include <stdbool.h>
// Saved fpu register state for one thread.
//
// The assembly routines riscv_fpu_save/riscv_fpu_restore access this structure
// by fixed byte offsets (register N at N*8, the control/status word at 32*8),
// so the order and size of these fields must not change without updating them.
struct riscv_fpu_state {
// f0-f31, stored as 64 bit double precision values
double f[32];
// saved value of the fcsr control/status register; the assembly only
// reads and writes the first 32 bits of this slot.
// NOTE(review): the field name looks like a transposition of "fcsr" -
// confirm nothing else depends on the current spelling before renaming.
unsigned long fscr;
};
struct riscv_context_switch_frame {
unsigned long ra; // return address (x1)
@@ -26,6 +32,11 @@ struct riscv_context_switch_frame {
unsigned long s9;
unsigned long s10;
unsigned long s11;
#if RISCV_FPU
bool fpu_dirty;
struct riscv_fpu_state fpu;
#endif
};
struct arch_thread {
@@ -35,3 +46,13 @@ struct arch_thread {
void riscv_context_switch(struct riscv_context_switch_frame *oldcs,
struct riscv_context_switch_frame *newcs);
#if RISCV_FPU
// Assembly helpers operating on the hardware fpu registers.
// riscv_fpu_save stores f0-f31 and fcsr into *state;
// riscv_fpu_restore loads f0-f31 and fcsr back from *state.
void riscv_fpu_save(struct riscv_fpu_state *state);
void riscv_fpu_restore(struct riscv_fpu_state *state);
// initialize the fpu state to zero
void riscv_fpu_zero(void);
#endif

View File

@@ -65,6 +65,10 @@
#define RISCV_CSR_XSTATUS_MXR (1ul << 19)
// FS (fpu state) field of the status CSR: a two bit field starting at bit 13.
// FS_MASK covers both bits; the four values below are the possible encodings
// of the field (0 = off, 1 = initial, 2 = clean, 3 = dirty), already shifted
// into position so they can be compared against (status & FS_MASK).
#define RISCV_CSR_XSTATUS_FS_SHIFT (13)
#define RISCV_CSR_XSTATUS_FS_MASK (3ul << RISCV_CSR_XSTATUS_FS_SHIFT)
#define RISCV_CSR_XSTATUS_FS_OFF (0ul << RISCV_CSR_XSTATUS_FS_SHIFT)
#define RISCV_CSR_XSTATUS_FS_INITIAL (1ul << RISCV_CSR_XSTATUS_FS_SHIFT)
#define RISCV_CSR_XSTATUS_FS_CLEAN (2ul << RISCV_CSR_XSTATUS_FS_SHIFT)
#define RISCV_CSR_XSTATUS_FS_DIRTY (3ul << RISCV_CSR_XSTATUS_FS_SHIFT)
#define RISCV_CSR_XIE_SIE (1ul << (RISCV_XMODE_OFFSET + 0))
#define RISCV_CSR_XIE_TIE (1ul << (RISCV_XMODE_OFFSET + 4))
@@ -216,9 +220,6 @@ enum handler_return riscv_software_exception(void);
enum handler_return riscv_platform_irq(void);
void riscv_syscall_handler(struct riscv_short_iframe *frame);
// initialize the fpu state to zero
void riscv_fpu_zero(void);
// If using S mode, time seems to be implemented in clint.h
// TODO: clean up by moving into its own header
#if RISCV_S_MODE

View File

@@ -32,15 +32,6 @@ ifeq (true,$(call TOBOOL,$(WITH_SMP)))
GLOBAL_DEFINES += WITH_SMP=1
endif
ifeq (true,$(call TOBOOL,$(RISCV_FPU)))
GLOBAL_DEFINES += RISCV_FPU=1
endif
# for the moment leave out all fpu support, even if the above flag is set
# TODO: add full riscv fpu context switch support
ARCH_COMPILEFLAGS_FLOAT :=
ARCH_COMPILEFLAGS_NOFLOAT :=
ifeq ($(strip $(RISCV_MODE)),machine)
$(info RISCV: Machine Mode)
GLOBAL_DEFINES += RISCV_M_MODE=1
@@ -129,27 +120,47 @@ GLOBAL_DEFINES += MEMSIZE=$(MEMSIZE)
# if ARCH_riscv{32|64}_TOOLCHAIN_PREFIX is set use it as an override
# for toolchain prefix.
ifdef ARCH_$(ARCH)$(SUBARCH)_TOOLCHAIN_PREFIX
TOOLCHAIN_PREFIX := $(ARCH_$(ARCH)$(SUBARCH)_TOOLCHAIN_PREFIX)
TOOLCHAIN_PREFIX := $(ARCH_$(ARCH)$(SUBARCH)_TOOLCHAIN_PREFIX)
endif
# default toolchain is riscv{32|64}-elf-. assume its in the path.
ifndef TOOLCHAIN_PREFIX
TOOLCHAIN_PREFIX := riscv$(SUBARCH)-elf-
TOOLCHAIN_PREFIX := riscv$(SUBARCH)-elf-
endif
ifeq (true,$(call TOBOOL,$(RISCV_FPU)))
GLOBAL_DEFINES += RISCV_FPU=1
endif
# for the moment simply build all sources the same way, with or without float based on
# the configuration of the platform
ARCH_COMPILEFLAGS_FLOAT :=
ARCH_COMPILEFLAGS_NOFLOAT :=
# based on 32 or 64 bitness, select the right toolchain and some
# compiler codegen flags
ifeq ($(SUBARCH),32)
ARCH_COMPILEFLAGS := -march=rv32imac -mabi=ilp32
# override machine for ld -r
GLOBAL_MODULE_LDFLAGS += -m elf32lriscv
ifeq (true,$(call TOBOOL,$(RISCV_FPU)))
ARCH_COMPILEFLAGS := -march=rv32gc -mabi=ilp32d
else
ARCH_COMPILEFLAGS := -march=rv32imac -mabi=ilp32
endif
# override machine for ld -r
GLOBAL_MODULE_LDFLAGS += -m elf32lriscv
else ifeq ($(SUBARCH),64)
GLOBAL_DEFINES += IS_64BIT=1
ARCH_COMPILEFLAGS := -march=rv64imac -mabi=lp64 -mcmodel=medany
# override machine for ld -r
GLOBAL_MODULE_LDFLAGS += -m elf64lriscv
GLOBAL_DEFINES += IS_64BIT=1
ifeq (true,$(call TOBOOL,$(RISCV_FPU)))
ARCH_COMPILEFLAGS := -march=rv64gc -mabi=lp64d -mcmodel=medany
else
ARCH_COMPILEFLAGS := -march=rv64imac -mabi=lp64 -mcmodel=medany
endif
# override machine for ld -r
GLOBAL_MODULE_LDFLAGS += -m elf64lriscv
else
$(error SUBARCH not set or set to something unknown)
$(error SUBARCH not set or set to something unknown)
endif
# test to see if -misa-spec=2.2 is a valid switch.
@@ -206,3 +217,5 @@ endif
$(info ARCH_COMPILEFLAGS = $(ARCH_COMPILEFLAGS))
include make/module.mk
# vim: set ts=4 sw=4 expandtab:

View File

@@ -44,6 +44,8 @@ void arch_thread_initialize(thread_t *t) {
/* zero out the thread context */
memset(&t->arch.cs_frame, 0, sizeof(t->arch.cs_frame));
/* if the FPU is implemented, the zeroed frame above is also the thread's default fpu state (not dirty, all registers zero) */
/* make sure the top of the stack is 16 byte aligned */
vaddr_t stack_top = ROUNDDOWN((vaddr_t)t->stack + t->stack_size, 16);
@@ -58,12 +60,66 @@ void arch_context_switch(thread_t *oldthread, thread_t *newthread) {
/* Switch the cpu from oldthread to newthread: first the fpu state (only when
 * RISCV_FPU is set), then the integer registers and the stack.
 */
LTRACEF("old %p (%s), new %p (%s)\n", oldthread, oldthread->name, newthread, newthread->name);
/* floating point context switch */
#if RISCV_FPU
/* based on a combination of current fpu dirty state in hardware and saved state
* on the new thread, do a partial or full context switch
*/
/* NOTE(review): status is read here and written back in full at the end of this
 * section, so any change made to the CSR in between is overwritten; presumably
 * this runs with interrupts disabled - confirm against the callers.
 */
ulong status = riscv_csr_read(RISCV_CSR_XSTATUS);
ulong hw_state = status & RISCV_CSR_XSTATUS_FS_MASK;
LTRACEF("old fpu dirty %d, new fpu dirty %d, status %#lx\n", oldthread->arch.cs_frame.fpu_dirty, newthread->arch.cs_frame.fpu_dirty,
hw_state >> RISCV_CSR_XSTATUS_FS_SHIFT);
/* drop the current FS bits from the local copy; the new value is or-ed in below */
status &= ~(RISCV_CSR_XSTATUS_FS_MASK);
/* record the outgoing thread's fpu state according to what the hardware reports.
 * There is no case for RISCV_CSR_XSTATUS_FS_CLEAN: in that state nothing is saved
 * and the thread's fpu_dirty flag is left as it was.
 * NOTE(review): presumably intentional, since the previously saved copy would
 * still be current in that state - confirm.
 */
switch (hw_state) {
case RISCV_CSR_XSTATUS_FS_DIRTY:
/* hardware is in the dirty state, so save the fpu regs and mark the thread as dirty */
oldthread->arch.cs_frame.fpu_dirty = true;
riscv_fpu_save(&oldthread->arch.cs_frame.fpu);
break;
case RISCV_CSR_XSTATUS_FS_INITIAL:
/* hardware is still in the initial state, so there is nothing to save */
oldthread->arch.cs_frame.fpu_dirty = false;
break;
case RISCV_CSR_XSTATUS_FS_OFF:
// TODO: handle fpu being disabled
PANIC_UNIMPLEMENTED;
}
if (newthread->arch.cs_frame.fpu_dirty) {
/* if the new thread has dirty saved state, load it here and mark the cpu as in the
* clean state, which will transition to dirty if any regs are modified
*/
/* the clean state only takes effect at the status write-back below, which
 * happens after the registers have been loaded
 */
status |= RISCV_CSR_XSTATUS_FS_CLEAN;
riscv_fpu_restore(&newthread->arch.cs_frame.fpu);
} else {
/* if the thread previously hadn't dirtied the state, zero out the fpu
* state and mark hardware as initial.
*/
status |= RISCV_CSR_XSTATUS_FS_INITIAL;
riscv_fpu_zero();
}
/* writeback the modified state to hardware */
riscv_csr_write(RISCV_CSR_XSTATUS, status);
#endif
/* integer context switch.
* stack is swapped as part of this routine, so the code will return only when
* the current thread context is switched back to.
*/
riscv_context_switch(&oldthread->arch.cs_frame, &newthread->arch.cs_frame);
}
/* Print the architecture specific part of a thread's saved context.
 *
 * Nothing is printed for a thread in the THREAD_RUNNING state. For any other
 * thread the saved stack pointer is printed and, when RISCV_FPU is set, the
 * thread's fpu dirty flag as well.
 */
void arch_dump_thread(thread_t *t) {
    /* only threads that are not currently running are dumped */
    if (t->state == THREAD_RUNNING) {
        return;
    }

    dprintf(INFO, "\tarch: ");
#if RISCV_FPU
    dprintf(INFO, "fpu dirty %u, ", t->arch.cs_frame.fpu_dirty);
#endif
    dprintf(INFO, "sp %#lx\n", t->arch.cs_frame.sp);
}