From fd79fccdde78e039de71b1b72f131b62f97a61eb Mon Sep 17 00:00:00 2001 From: Travis Geiselbrecht Date: Fri, 6 Dec 2024 00:03:23 -0800 Subject: [PATCH 01/26] WIP x86 SMP --- .clang-tidy | 1 + arch/x86/32/mmu.c | 2 +- arch/x86/64/mmu.c | 8 +- arch/x86/64/start.S | 5 + arch/x86/arch.c | 24 +++-- arch/x86/descriptor.c | 123 +++++++++++++++---------- arch/x86/gdt.S | 3 + arch/x86/include/arch/defines.h | 9 +- arch/x86/include/arch/x86/descriptor.h | 36 ++++++-- arch/x86/include/arch/x86/feature.h | 1 - arch/x86/include/arch/x86/mp.h | 28 ++++++ arch/x86/mp.c | 42 +++++++++ arch/x86/rules.mk | 1 + 13 files changed, 203 insertions(+), 80 deletions(-) create mode 100644 arch/x86/include/arch/x86/mp.h create mode 100644 arch/x86/mp.c diff --git a/.clang-tidy b/.clang-tidy index 617972c5..f9a24d32 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -3,6 +3,7 @@ Checks: > -*, bugprone-*, -bugprone-easily-swappable-parameters, + -bugprone-reserved-identifier, clang-diagnostic-*, -clang-diagnostic-unused-command-line-argument, diff --git a/arch/x86/32/mmu.c b/arch/x86/32/mmu.c index cd6f3b23..7e17e445 100644 --- a/arch/x86/32/mmu.c +++ b/arch/x86/32/mmu.c @@ -421,7 +421,7 @@ bool arch_mmu_supports_user_aspaces(void) { return false; } void x86_mmu_early_init(void) { /* Set WP bit in CR0*/ - volatile uint32_t cr0 = x86_get_cr0(); + uint32_t cr0 = x86_get_cr0(); cr0 |= X86_CR0_WP; x86_set_cr0(cr0); diff --git a/arch/x86/64/mmu.c b/arch/x86/64/mmu.c index 858448e7..46767db1 100644 --- a/arch/x86/64/mmu.c +++ b/arch/x86/64/mmu.c @@ -624,15 +624,13 @@ bool arch_mmu_supports_ns_mappings(void) { return false; } bool arch_mmu_supports_user_aspaces(void) { return false; } void x86_mmu_early_init(void) { - volatile uint64_t efer_msr, cr0, cr4; - /* Set WP bit in CR0*/ - cr0 = x86_get_cr0(); + uint64_t cr0 = x86_get_cr0(); cr0 |= X86_CR0_WP; x86_set_cr0(cr0); /* Setting the SMEP & SMAP bit in CR4 */ - cr4 = x86_get_cr4(); + uint64_t cr4 = x86_get_cr4(); if (x86_feature_test(X86_FEATURE_SMEP)) 
cr4 |= X86_CR4_SMEP; if (x86_feature_test(X86_FEATURE_SMAP)) @@ -640,7 +638,7 @@ void x86_mmu_early_init(void) { x86_set_cr4(cr4); /* Set NXE bit in MSR_EFER*/ - efer_msr = read_msr(X86_MSR_IA32_EFER); + uint64_t efer_msr = read_msr(X86_MSR_IA32_EFER); efer_msr |= X86_EFER_NXE; write_msr(X86_MSR_IA32_EFER, efer_msr); diff --git a/arch/x86/64/start.S b/arch/x86/64/start.S index 7637525a..cc24658d 100644 --- a/arch/x86/64/start.S +++ b/arch/x86/64/start.S @@ -204,6 +204,11 @@ highaddr: /* set up the idt */ call setup_idt + /* set up the percpu data structure pointer for the boot cpu */ + xor %edi, %edi + xor %esi, %esi + call x86_percpu_init_early + /* call the main module */ call lk_main diff --git a/arch/x86/arch.c b/arch/x86/arch.c index 6493c82e..cd3854e5 100644 --- a/arch/x86/arch.c +++ b/arch/x86/arch.c @@ -19,35 +19,31 @@ #include #include #include -#include /* Describe how start.S sets up the MMU. * These data structures are later used by vm routines to lookup pointers * to physical pages based on physical addresses. */ struct mmu_initial_mapping mmu_initial_mappings[] = { -#if ARCH_X86_64 - /* 64GB of memory mapped where the kernel lives */ + /* 64GB of the first 64GB of memory mapped 1:1 */ { .phys = MEMBASE, .virt = KERNEL_ASPACE_BASE, - .size = PHYSMAP_SIZE, /* x86-64 maps first 64GB by default, 1GB on x86-32 */ + .size = PHYSMAP_SIZE, /* x86-64 maps first 64GB by default, 1GB on x86-32, 16MB in legacy mode */ .flags = 0, .name = "physmap" }, -#endif - /* 1GB of memory mapped where the kernel lives */ +#if ARCH_X86_64 + /* Another linear map of the first GB of memory where the kernel image + * lives at the top of the address space. 
*/ { .phys = MEMBASE, .virt = KERNEL_BASE, -#if X86_LEGACY - .size = 16*MB, /* only map the first 16MB on legacy x86 due to page table usage */ -#else - .size = 1*GB, /* x86 maps first 1GB by default */ -#endif + .size = 1*GB, .flags = 0, .name = "kernel" }, +#endif /* null entry to terminate the list */ { 0 } @@ -70,6 +66,7 @@ void arch_early_init(void) { /* enable caches here for now */ clear_in_cr0(X86_CR0_NW | X86_CR0_CD); + /* configure the system TSS */ #if ARCH_X86_32 system_tss.esp0 = 0; system_tss.ss0 = DATA_SELECTOR; @@ -78,9 +75,10 @@ void arch_early_init(void) { system_tss.eflags = 0x00003002; system_tss.bitmap = offsetof(tss_32_t, tss_bitmap); system_tss.trace = 1; // trap on hardware task switch +#elif ARCH_X86_64 + /* nothing to be done here, a fully zeroed TSS is a good starting point */ #endif - - set_global_desc(TSS_SELECTOR, &system_tss, sizeof(system_tss), 1, 0, 0, SEG_TYPE_TSS, 0, 0); + x86_set_gdt_descriptor(TSS_SELECTOR, &system_tss, sizeof(system_tss), 1, 0, 0, SEG_TYPE_TSS, 0, 0); x86_ltr(TSS_SELECTOR); x86_feature_early_init(); diff --git a/arch/x86/descriptor.c b/arch/x86/descriptor.c index 883fb8ec..58ec998c 100644 --- a/arch/x86/descriptor.c +++ b/arch/x86/descriptor.c @@ -5,62 +5,89 @@ * license that can be found in the LICENSE file or at * https://opensource.org/licenses/MIT */ - -#include #include -/* not the best way to do this, but easy for now */ -typedef union { - struct { - uint16_t limit_15_0; - uint16_t base_15_0; - uint8_t base_23_16; +#include +#include - uint8_t type : 4; - uint8_t s : 1; - uint8_t dpl : 2; - uint8_t p : 1; +extern uint64_t _gdt[]; - uint8_t limit_19_16 : 4; - uint8_t avl : 1; - uint8_t reserved_0 : 1; - uint8_t d_b : 1; - uint8_t g : 1; - - uint8_t base_31_24; - } __PACKED seg_desc_legacy; - - struct { - uint32_t base_63_32; - uint32_t reserved_1; - } __PACKED seg_desc_64; -} __PACKED seg_desc_t; - -extern seg_desc_t _gdt[]; - -void set_global_desc(seg_sel_t sel, void *base, uint32_t limit, +void 
x86_set_gdt_descriptor(seg_sel_t sel, void *base, uint32_t limit, uint8_t present, uint8_t ring, uint8_t sys, uint8_t type, uint8_t gran, uint8_t bits) { - // convert selector into index + typedef struct { + struct { + uint16_t limit_15_0; + uint16_t base_15_0; + uint8_t base_23_16; + + uint8_t type : 4; + uint8_t s : 1; + uint8_t dpl : 2; + uint8_t p : 1; + + uint8_t limit_19_16 : 4; + uint8_t avl : 1; + uint8_t reserved : 1; + uint8_t d_b : 1; + uint8_t g : 1; + + uint8_t base_31_24; + } seg_desc_legacy; + +#if ARCH_X86_64 + // some descriptors have additional fields for x86-64 + struct { + uint32_t base_63_32; + uint32_t reserved; + } seg_desc_64; +#endif + } seg_desc_t; + +#if ARCH_X86_64 + static_assert(sizeof(seg_desc_t) == 16, "seg_desc_t size mismatch"); +#else + static_assert(sizeof(seg_desc_t) == 8, "seg_desc_t size mismatch"); +#endif + + seg_desc_t desc = {0}; + + desc.seg_desc_legacy.limit_15_0 = limit & 0x0000ffff; + desc.seg_desc_legacy.limit_19_16 = (limit & 0x000f0000) >> 16; + + desc.seg_desc_legacy.base_15_0 = ((uintptr_t) base) & 0x0000ffff; + desc.seg_desc_legacy.base_23_16 = (((uintptr_t) base) & 0x00ff0000) >> 16; + desc.seg_desc_legacy.base_31_24 = ((uintptr_t) base) >> 24; + + desc.seg_desc_legacy.type = type & 0x0f; // segment type + desc.seg_desc_legacy.s = sys != 0; // system / non-system + desc.seg_desc_legacy.dpl = ring & 0x03; // descriptor privilege level + desc.seg_desc_legacy.p = present != 0; // present + desc.seg_desc_legacy.avl = 0; + desc.seg_desc_legacy.reserved = 0; + desc.seg_desc_legacy.d_b = bits != 0; // 16 / 32 bit + desc.seg_desc_legacy.g = gran != 0; // granularity + + // convert selector into index, which are always 8 byte indexed uint16_t index = sel >> 3; - - _gdt[index].seg_desc_legacy.limit_15_0 = limit & 0x0000ffff; - _gdt[index].seg_desc_legacy.limit_19_16 = (limit & 0x000f0000) >> 16; - - _gdt[index].seg_desc_legacy.base_15_0 = ((uintptr_t) base) & 0x0000ffff; - _gdt[index].seg_desc_legacy.base_23_16 = 
(((uintptr_t) base) & 0x00ff0000) >> 16; - _gdt[index].seg_desc_legacy.base_31_24 = ((uintptr_t) base) >> 24; - - _gdt[index].seg_desc_legacy.type = type & 0x0f; // segment type - _gdt[index].seg_desc_legacy.p = present != 0; // present - _gdt[index].seg_desc_legacy.dpl = ring & 0x03; // descriptor privilege level - _gdt[index].seg_desc_legacy.g = gran != 0; // granularity - _gdt[index].seg_desc_legacy.s = sys != 0; // system / non-system - _gdt[index].seg_desc_legacy.d_b = bits != 0; // 16 / 32 bit + seg_desc_t *entry = (seg_desc_t *)&_gdt[index]; + entry->seg_desc_legacy = desc.seg_desc_legacy; #ifdef ARCH_X86_64 - if (TSS_SELECTOR == sel) { - _gdt[index + 1].seg_desc_64.base_63_32 = (uint32_t)((uintptr_t) base >> 32); - _gdt[index + 1].seg_desc_64.reserved_1 = 0; + if (sys == 0) { + // some of the system descriptors have two more words + switch (type) { + case SEG_TYPE_TSS: + case SEG_TYPE_TSS_BUSY: + case SEG_TYPE_LDT: + case SEG_TYPE_CALL_GATE: + // copy the lower 32 bits of the descriptor (base and limit) + desc.seg_desc_64.base_63_32 = (uint32_t)((uintptr_t) base >> 32); + desc.seg_desc_64.reserved = 0; + + // copy the upper 64 bits of the descriptor + entry->seg_desc_64 = desc.seg_desc_64; + break; + } } #endif } diff --git a/arch/x86/gdt.S b/arch/x86/gdt.S index abb9c794..577ee840 100644 --- a/arch/x86/gdt.S +++ b/arch/x86/gdt.S @@ -123,7 +123,10 @@ _tss_gde: .byte 0x89 /* P(1) DPL(00) S(0) TYPE(9) */ .byte 0x80 /* G(1) D/B(0) L(0) AVL(0) limit 19:16 */ .byte 0 /* base 31:24 */ +#if ARCH_X86_64 + /* 64-bit TSSs are 16 bytes long */ .quad 0x0000000000000000 +#endif .set i, i+1 .endr diff --git a/arch/x86/include/arch/defines.h b/arch/x86/include/arch/defines.h index 583e86ba..6244fbeb 100644 --- a/arch/x86/include/arch/defines.h +++ b/arch/x86/include/arch/defines.h @@ -18,8 +18,13 @@ /* based on how start.S sets up the physmap */ #if ARCH_X86_64 -#define PHYSMAP_SIZE (64ULL*GB) +#define PHYSMAP_SIZE (64ULL*1024*1024*1024) +#elif X86_LEGACY +/* Only map the 
first 16MB on legacy x86 due to page table usage + * due to lack of 4MB pages. */ +#define PHYSMAP_SIZE (16ULL*1024*1024) #elif ARCH_X86_32 -#define PHYSMAP_SIZE (1ULL*GB) +/* Map 1GB by default for x86-32 */ +#define PHYSMAP_SIZE (1ULL*1024*1024*1024) #endif diff --git a/arch/x86/include/arch/x86/descriptor.h b/arch/x86/include/arch/x86/descriptor.h index 804a9f66..be95817f 100644 --- a/arch/x86/include/arch/x86/descriptor.h +++ b/arch/x86/include/arch/x86/descriptor.h @@ -25,17 +25,33 @@ #define USER_CODE_64_SELECTOR 0x38 #define USER_DATA_64_SELECTOR 0x40 +/* base selector for a list of TSSes, one per cpu (SMP_MAX_CPUS) */ #define TSS_SELECTOR 0x48 -/* - * Descriptor Types - */ -#define SEG_TYPE_TSS 0x9 -#define SEG_TYPE_TSS_BUSY 0xb -#define SEG_TYPE_TASK_GATE 0x5 -#define SEG_TYPE_INT_GATE 0xe // 32 bit -#define SEG_TYPE_DATA_RW 0x2 -#define SEG_TYPE_CODE_RW 0xa +/* code/data segment types (S = 1) */ +/* bit 0 is accessed */ +#define SEG_TYPE_DATA_RO 0x0 +#define SEG_TYPE_DATA_RW 0x2 +#define SEG_TYPE_DATA_RO_EXPAND_DOWN 0x4 +#define SEG_TYPE_DATA_RW_EXPAND_DOWN 0x6 +#define SEG_TYPE_CODE_XO 0x8 +#define SEG_TYPE_CODE_RO 0xa +#define SEG_TYPE_CODE_XO_CONFORMING 0xc +#define SEG_TYPE_CODE_RO_CONFORMING 0xe + +/* system segment types (S = 0) */ +#define SEG_TYPE_TSS_16 0x1 +#define SEG_TYPE_LDT 0x2 +#define SEG_TYPE_TSS_16_BUSY 0x3 +#define SEG_TYPE_CALL_GATE_16 0x4 +#define SEG_TYPE_TASK_GATE 0x5 +#define SEG_TYPE_INT_GATE_16 0x6 +#define SEG_TYPE_TRAP_GATE_16 0x7 +#define SEG_TYPE_TSS 0x9 +#define SEG_TYPE_TSS_BUSY 0xb +#define SEG_TYPE_CALL_GATE 0xc +#define SEG_TYPE_INT_GATE 0xe +#define SEG_TYPE_TRAP_GATE 0xf #ifndef ASSEMBLY @@ -43,7 +59,7 @@ typedef uint16_t seg_sel_t; -void set_global_desc(seg_sel_t sel, void *base, uint32_t limit, +void x86_set_gdt_descriptor(seg_sel_t sel, void *base, uint32_t limit, uint8_t present, uint8_t ring, uint8_t sys, uint8_t type, uint8_t gran, uint8_t bits); #endif diff --git a/arch/x86/include/arch/x86/feature.h 
b/arch/x86/include/arch/x86/feature.h index bba5db4b..afc12bac 100644 --- a/arch/x86/include/arch/x86/feature.h +++ b/arch/x86/include/arch/x86/feature.h @@ -24,7 +24,6 @@ #pragma once #include -#include #include #include #include diff --git a/arch/x86/include/arch/x86/mp.h b/arch/x86/include/arch/x86/mp.h new file mode 100644 index 00000000..518bf8df --- /dev/null +++ b/arch/x86/include/arch/x86/mp.h @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2024 Travis Geiselbrecht + * + * Use of this source code is governed by a MIT-style + * license that can be found in the LICENSE file or at + * https://opensource.org/licenses/MIT + */ +#pragma once + +#include + +// per cpu pointer pointed to by a segment register on x86 +typedef struct x86_percpu { + // pointer back to ourselves so we can get a raw pointer via segment:0 + struct x86_percpu *self; + + uint cpu_num; + uint apic_id; + + struct thread *current_thread; + + // XXX add more stuff: + // per cpu TSS + // per cpu doublefault/nmi stacks +} x86_percpu_t; + +// called extremely early on the boot cpu and each secondary cpu +void x86_percpu_init_early(uint cpu_num, uint apic_id); \ No newline at end of file diff --git a/arch/x86/mp.c b/arch/x86/mp.c new file mode 100644 index 00000000..536c68dc --- /dev/null +++ b/arch/x86/mp.c @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2024 Travis Geiselbrecht + * + * Use of this source code is governed by a MIT-style + * license that can be found in the LICENSE file or at + * https://opensource.org/licenses/MIT + */ +#include + +#include +#include +#include + +// the boot cpu's percpu struct +static x86_percpu_t x86_boot_percpu; +// pointer to an array of percpu structs for each of the secondary cpus +static x86_percpu_t **x86_ap_percpus; + +static x86_percpu_t *percpu_for_cpu(uint cpu_num) { + DEBUG_ASSERT(cpu_num < SMP_MAX_CPUS); + if (cpu_num == 0) { + return &x86_boot_percpu; + } + DEBUG_ASSERT(x86_ap_percpus); + return x86_ap_percpus[cpu_num - 1]; +} + +void x86_percpu_init_early(uint 
cpu_num, uint apic_id) { + x86_percpu_t *percpu = percpu_for_cpu(cpu_num); + + percpu->self = percpu; + percpu->cpu_num = cpu_num; + percpu->apic_id = apic_id; + + // XXX load into gs:/fs/etc +#if ARCH_X86_64 + write_msr(X86_MSR_IA32_KERNEL_GS_BASE, 0); + write_msr(X86_MSR_IA32_GS_BASE, (uint64_t)percpu); +#else +//#error implement +#endif +} \ No newline at end of file diff --git a/arch/x86/rules.mk b/arch/x86/rules.mk index c01e957a..2a11a3f1 100644 --- a/arch/x86/rules.mk +++ b/arch/x86/rules.mk @@ -58,6 +58,7 @@ MODULE_SRCS += \ $(LOCAL_DIR)/faults.c \ $(LOCAL_DIR)/feature.c \ $(LOCAL_DIR)/gdt.S \ + $(LOCAL_DIR)/mp.c \ $(LOCAL_DIR)/thread.c \ # legacy x86's dont have fpu support From 902e2fcb8a8cfc58cce69fb129da62c90ee5d7fa Mon Sep 17 00:00:00 2001 From: Travis Geiselbrecht Date: Fri, 6 Dec 2024 21:11:51 -0800 Subject: [PATCH 02/26] WIP set up per cpu structures for x86-64 only on the boot cpu for now --- arch/x86/gdt.S | 3 +- arch/x86/include/arch/arch_ops.h | 15 +++++++++ arch/x86/include/arch/x86.h | 36 ++++++++++++++++++++++ arch/x86/include/arch/x86/mp.h | 52 ++++++++++++++++++++++++++++++-- arch/x86/mp.c | 19 +++++++++--- arch/x86/rules.mk | 15 ++++++++- arch/x86/thread.c | 7 +++-- kernel/mp.c | 3 ++ 8 files changed, 139 insertions(+), 11 deletions(-) diff --git a/arch/x86/gdt.S b/arch/x86/gdt.S index 577ee840..41c71f88 100644 --- a/arch/x86/gdt.S +++ b/arch/x86/gdt.S @@ -125,7 +125,8 @@ _tss_gde: .byte 0 /* base 31:24 */ #if ARCH_X86_64 /* 64-bit TSSs are 16 bytes long */ - .quad 0x0000000000000000 + .int 0 /* base 63:32 */ + .int 0 /* type(0) + reserved */ #endif .set i, i+1 .endr diff --git a/arch/x86/include/arch/arch_ops.h b/arch/x86/include/arch/arch_ops.h index b3092d60..6b539c00 100644 --- a/arch/x86/include/arch/arch_ops.h +++ b/arch/x86/include/arch/arch_ops.h @@ -50,6 +50,20 @@ static inline ulong arch_cycle_count(void) { #endif } +#if WITH_SMP +#include +static inline struct thread *arch_get_current_thread(void) { + return 
x86_get_current_thread(); +} + +static inline void arch_set_current_thread(struct thread *t) { + x86_set_current_thread(t); +} + +static inline uint arch_curr_cpu_num(void) { + return x86_get_cpu_num(); +} +#else /* use a global pointer to store the current_thread */ extern struct thread *_current_thread; @@ -64,6 +78,7 @@ static inline void arch_set_current_thread(struct thread *t) { static inline uint arch_curr_cpu_num(void) { return 0; } +#endif #if ARCH_X86_64 // relies on SSE2 diff --git a/arch/x86/include/arch/x86.h b/arch/x86/include/arch/x86.h index c23a812c..e212b116 100644 --- a/arch/x86/include/arch/x86.h +++ b/arch/x86/include/arch/x86.h @@ -471,6 +471,42 @@ static inline void write_msr (uint32_t msr_id, uint64_t msr_write_val) { : : "c" (msr_id), "a" (low_val), "d"(high_val)); } +static inline uint64_t x86_read_gs_offset64(uintptr_t offset) { + uint64_t ret; + __asm__("movq %%gs:%1, %0" : "=r"(ret) : "m"(*(uint64_t*)(offset))); + return ret; +} + +static inline void x86_write_gs_offset64(uintptr_t offset, uint64_t val) { + __asm__("movq %0, %%gs:%1" : : "ir"(val), "m"(*(uint64_t*)(offset)) : "memory"); +} + +static inline uint32_t x86_read_gs_offset32(uintptr_t offset) { + uint32_t ret; + __asm__("movl %%gs:%1, %0" : "=r"(ret) : "m"(*(uint32_t*)(offset))); + return ret; +} + +static inline void x86_write_gs_offset32(uintptr_t offset, uint32_t val) { + __asm__("movl %0, %%gs:%1" : : "ir"(val), "m"(*(uint32_t*)(offset)) : "memory"); +} + +#if __SIZEOF_POINTER__ == 8 +static inline void *x86_read_gs_offset_ptr(uintptr_t offset) { + return (void *)x86_read_gs_offset64(offset); +} +static inline void x86_write_gs_offset_ptr(uintptr_t offset, void *val) { + x86_write_gs_offset64(offset, (uint64_t)(val)); +} +#else +static inline void *x86_read_gs_offset_ptr(uintptr_t offset) { + return (void *)x86_read_gs_offset32(offset); +} +static inline void x86_write_gs_offset_ptr(uintptr_t offset, void *val) { + x86_write_gs_offset32(offset, (uint64_t)(val)); +} 
+#endif + typedef ulong x86_flags_t; static inline x86_flags_t x86_save_flags(void) { diff --git a/arch/x86/include/arch/x86/mp.h b/arch/x86/include/arch/x86/mp.h index 518bf8df..ecb608c6 100644 --- a/arch/x86/include/arch/x86/mp.h +++ b/arch/x86/include/arch/x86/mp.h @@ -8,8 +8,9 @@ #pragma once #include +#include -// per cpu pointer pointed to by a segment register on x86 +// per cpu pointer pointed to by gs: typedef struct x86_percpu { // pointer back to ourselves so we can get a raw pointer via segment:0 struct x86_percpu *self; @@ -24,5 +25,52 @@ typedef struct x86_percpu { // per cpu doublefault/nmi stacks } x86_percpu_t; +#define X86_PERCPU_FIELD_OFFSET(field) offsetof(x86_percpu_t, field) + // called extremely early on the boot cpu and each secondary cpu -void x86_percpu_init_early(uint cpu_num, uint apic_id); \ No newline at end of file +void x86_percpu_init_early(uint cpu_num, uint apic_id); + +// get the percpu struct for the current cpu +static inline x86_percpu_t *x86_get_percpu(void) { + x86_percpu_t *percpu; + __asm__ volatile("mov %%gs:0, %0" : "=r" (percpu)); + return percpu; +} + +// get the percpu struct for a specific cpu +x86_percpu_t *x86_get_percpu_for_cpu(uint cpu_num); + +#if 0 +#define X86_PERCPU_GET(field) (_Generic(((x86_get_percpu())->field), \ + uint32_t: x86_read_gs_offset32, \ + uint64_t: x86_read_gs_offset64, \ + struct thread*: x86_read_gs_offset_ptr) \ + (X86_PERCPU_FIELD_OFFSET(field))) + +#define X86_PERCPU_SET(field, value) (_Generic(((x86_get_percpu())->field), \ + uint32_t: x86_write_gs_offset32, \ + uint64_t: x86_write_gs_offset64, \ + struct thread*: x86_write_gs_offset_ptr) \ + (X86_PERCPU_FIELD_OFFSET(field), value)) +#endif + +// get the current cpu number +static inline uint x86_get_cpu_num(void) { + return x86_read_gs_offset32(X86_PERCPU_FIELD_OFFSET(cpu_num)); +} + +// get the current apic id +static inline uint x86_get_apic_id(void) { + return x86_read_gs_offset32(X86_PERCPU_FIELD_OFFSET(apic_id)); +} + +// get/set 
the current thread +struct thread; + +static inline struct thread *x86_get_current_thread(void) { + return (struct thread *)x86_read_gs_offset_ptr(X86_PERCPU_FIELD_OFFSET(current_thread)); +} + +static inline void x86_set_current_thread(struct thread *t) { + x86_write_gs_offset_ptr(X86_PERCPU_FIELD_OFFSET(current_thread), t); +} \ No newline at end of file diff --git a/arch/x86/mp.c b/arch/x86/mp.c index 536c68dc..7a58c2ba 100644 --- a/arch/x86/mp.c +++ b/arch/x86/mp.c @@ -8,7 +8,10 @@ #include #include +#include +#include #include +#include #include // the boot cpu's percpu struct @@ -16,7 +19,7 @@ static x86_percpu_t x86_boot_percpu; // pointer to an array of percpu structs for each of the secondary cpus static x86_percpu_t **x86_ap_percpus; -static x86_percpu_t *percpu_for_cpu(uint cpu_num) { +x86_percpu_t *x86_get_percpu_for_cpu(uint cpu_num) { DEBUG_ASSERT(cpu_num < SMP_MAX_CPUS); if (cpu_num == 0) { return &x86_boot_percpu; @@ -26,8 +29,9 @@ static x86_percpu_t *percpu_for_cpu(uint cpu_num) { } void x86_percpu_init_early(uint cpu_num, uint apic_id) { - x86_percpu_t *percpu = percpu_for_cpu(cpu_num); + x86_percpu_t *percpu = x86_get_percpu_for_cpu(cpu_num); + // initialize the percpu structure for this cpu percpu->self = percpu; percpu->cpu_num = cpu_num; percpu->apic_id = apic_id; @@ -37,6 +41,13 @@ void x86_percpu_init_early(uint cpu_num, uint apic_id) { write_msr(X86_MSR_IA32_KERNEL_GS_BASE, 0); write_msr(X86_MSR_IA32_GS_BASE, (uint64_t)percpu); #else -//#error implement +#error implement #endif -} \ No newline at end of file +} + +status_t arch_mp_send_ipi(mp_cpu_mask_t target, mp_ipi_t ipi) { + PANIC_UNIMPLEMENTED; +} + +void arch_mp_init_percpu(void) { +} diff --git a/arch/x86/rules.mk b/arch/x86/rules.mk index 2a11a3f1..0ae86ce0 100644 --- a/arch/x86/rules.mk +++ b/arch/x86/rules.mk @@ -6,6 +6,11 @@ MODULE_OPTIONS := extra_warnings # x86 code always runs with the mmu enabled WITH_KERNEL_VM := 1 +ifneq ($(CPU),legacy) +WITH_SMP ?= 1 +else +WITH_SMP ?= 0 
+endif ifeq ($(SUBARCH),x86-32) MEMBASE ?= 0x00000000 @@ -41,9 +46,15 @@ GLOBAL_DEFINES += \ KERNEL_LOAD_OFFSET=$(KERNEL_LOAD_OFFSET) \ KERNEL_ASPACE_BASE=$(KERNEL_ASPACE_BASE) \ KERNEL_ASPACE_SIZE=$(KERNEL_ASPACE_SIZE) \ - SMP_MAX_CPUS=1 \ ARCH_HAS_MMU=1 +ifeq ($(WITH_SMP),1) +SMP_MAX_CPUS ?= 8 +GLOBAL_DEFINES += \ + WITH_SMP=1 \ + SMP_MAX_CPUS=$(SMP_MAX_CPUS) +endif + MODULE_SRCS += \ $(SUBARCH_DIR)/start.S \ \ @@ -121,10 +132,12 @@ else ifeq ($(SUBARCH),x86-32) ARCH_COMPILEFLAGS += -march=i686 ARCH_OPTFLAGS := -O2 GLOBAL_DEFINES += X86_LEGACY=0 +GLOBAL_DEFINES += WITH_SMP=1 else ifeq ($(SUBARCH),x86-64) ARCH_COMPILEFLAGS += -march=x86-64 ARCH_OPTFLAGS := -O2 GLOBAL_DEFINES += X86_LEGACY=0 +GLOBAL_DEFINES += WITH_SMP=1 endif LIBGCC := $(shell $(TOOLCHAIN_PREFIX)gcc $(GLOBAL_COMPILEFLAGS) $(ARCH_COMPILEFLAGS) -print-libgcc-file-name) diff --git a/arch/x86/thread.c b/arch/x86/thread.c index 94aa3f5f..c8275372 100644 --- a/arch/x86/thread.c +++ b/arch/x86/thread.c @@ -17,18 +17,19 @@ #include #include +#if !WITH_SMP /* we're uniprocessor at this point for x86, so store a global pointer to the current thread */ struct thread *_current_thread; +#endif static void initial_thread_func(void) __NO_RETURN; static void initial_thread_func(void) { - int ret; - /* release the thread lock that was implicitly held across the reschedule */ spin_unlock(&thread_lock); arch_enable_ints(); - ret = _current_thread->entry(_current_thread->arg); + thread_t *ct = arch_get_current_thread(); + int ret = ct->entry(ct->arg); thread_exit(ret); } diff --git a/kernel/mp.c b/kernel/mp.c index 0787cab8..ea121458 100644 --- a/kernel/mp.c +++ b/kernel/mp.c @@ -38,6 +38,9 @@ void mp_reschedule(mp_cpu_mask_t target, uint flags) { target &= ~mp.realtime_cpus; } target &= ~(1U << local_cpu); + if (target == 0) { + return; + } LTRACEF("local %d, post mask target now 0x%x\n", local_cpu, target); From 181796e84378c2428f2b8ec03982b35dec715a4c Mon Sep 17 00:00:00 2001 From: Travis Geiselbrecht Date: Fri, 6 
Dec 2024 22:10:49 -0800 Subject: [PATCH 03/26] WIP x86 get x86-32 working with a per-cpu gs: segment register for the kernel --- arch/x86/32/exceptions.S | 6 +++--- arch/x86/32/start.S | 5 +++++ arch/x86/64/exceptions.S | 2 ++ arch/x86/arch.c | 4 ++-- arch/x86/gdt.S | 14 ++++++++++++-- arch/x86/include/arch/x86.h | 24 +++++++++++++++++++++++- arch/x86/include/arch/x86/descriptor.h | 5 ++++- arch/x86/mp.c | 9 +++++++-- arch/x86/rules.mk | 5 +++-- 9 files changed, 61 insertions(+), 13 deletions(-) diff --git a/arch/x86/32/exceptions.S b/arch/x86/32/exceptions.S index e1bca374..9e980e07 100644 --- a/arch/x86/32/exceptions.S +++ b/arch/x86/32/exceptions.S @@ -45,7 +45,8 @@ LOCAL_FUNCTION(interrupt_common) pushl %ds pusha /* save general purpose registers */ movl $DATA_SELECTOR, %eax /* put known good value in segment registers */ - movl %eax, %gs + // do not reset %gs, as it is used by the kernel + // TODO: when dealing with user space, we need to reset %gs here movl %eax, %fs movl %eax, %es movl %eax, %ds @@ -61,8 +62,7 @@ LOCAL_FUNCTION(interrupt_common) popl %ds /* restore segment registers */ popl %es popl %fs - popl %gs - addl $8, %esp /* drop exception number and error code */ + addl $12, %esp /* drop gs, exception number, and error code */ iret END_FUNCTION(interrupt_common) diff --git a/arch/x86/32/start.S b/arch/x86/32/start.S index 4c7d585a..bd0ff483 100644 --- a/arch/x86/32/start.S +++ b/arch/x86/32/start.S @@ -182,6 +182,11 @@ main_lk: /* set up the idt */ call setup_idt + /* set up the percpu data structure pointer for the boot cpu */ + pushl $0 + pushl $0 + call x86_percpu_init_early + /* call the main module */ call lk_main 0: /* just sit around waiting for interrupts */ diff --git a/arch/x86/64/exceptions.S b/arch/x86/64/exceptions.S index 1a506ef1..c8f7ed9e 100644 --- a/arch/x86/64/exceptions.S +++ b/arch/x86/64/exceptions.S @@ -65,6 +65,8 @@ LOCAL_FUNCTION(interrupt_common) pushq %rsi pushq %rdi + /* TODO: deal with swapgs if coming from user space */ 
+ /* pass the iframe using rdi */ movq %rsp, %rdi diff --git a/arch/x86/arch.c b/arch/x86/arch.c index cd3854e5..9db1bb40 100644 --- a/arch/x86/arch.c +++ b/arch/x86/arch.c @@ -78,8 +78,8 @@ void arch_early_init(void) { #elif ARCH_X86_64 /* nothing to be done here, a fully zeroed TSS is a good starting point */ #endif - x86_set_gdt_descriptor(TSS_SELECTOR, &system_tss, sizeof(system_tss), 1, 0, 0, SEG_TYPE_TSS, 0, 0); - x86_ltr(TSS_SELECTOR); + x86_set_gdt_descriptor(TSS_SELECTOR_BASE, &system_tss, sizeof(system_tss), 1, 0, 0, SEG_TYPE_TSS, 0, 0); + x86_ltr(TSS_SELECTOR_BASE); x86_feature_early_init(); diff --git a/arch/x86/gdt.S b/arch/x86/gdt.S index 41c71f88..acea65df 100644 --- a/arch/x86/gdt.S +++ b/arch/x86/gdt.S @@ -112,8 +112,7 @@ _user_data_64_gde: .byte 0b11001111 /* G(1) B(1) 0 0 limit 19:16 */ .byte 0x0 /* base 31:24 */ -/* TSS descriptor */ -.set tsssel, . - _gdt +/* per-cpu TSS descriptor */ _tss_gde: .set i, 1 .rept SMP_MAX_CPUS @@ -131,6 +130,17 @@ _tss_gde: .set i, i+1 .endr +/* per-cpu GS descriptor for x86-32 */ +#if ARCH_X86_32 +_percpu_gde: +.set i, 1 +.rept SMP_MAX_CPUS + .int 0 /* filled in by C code later */ + .int 0 +.set i, i+1 +.endr +#endif + END_DATA(_gdt) DATA(_gdt_end) diff --git a/arch/x86/include/arch/x86.h b/arch/x86/include/arch/x86.h index e212b116..558d521b 100644 --- a/arch/x86/include/arch/x86.h +++ b/arch/x86/include/arch/x86.h @@ -316,6 +316,23 @@ static inline void x86_set_cr4(ulong in_val) { __asm__ __volatile__("mov %0,%%cr4 \n\t" : : "r"(in_val)); } +#define DEFINE_REGISTER_ACCESSOR(REG) \ + static inline void x86_set_##REG(uint16_t value) { \ + __asm__ volatile("mov %0, %%" #REG : : "r"(value)); \ + } \ + static inline uint16_t x86_get_##REG(void) { \ + uint16_t value; \ + __asm__ volatile("mov %%" #REG ", %0" : "=r"(value)); \ + return value; \ + } + +DEFINE_REGISTER_ACCESSOR(ds) +DEFINE_REGISTER_ACCESSOR(es) +DEFINE_REGISTER_ACCESSOR(fs) +DEFINE_REGISTER_ACCESSOR(gs) + +#undef DEFINE_REGISTER_ACCESSOR + static inline 
uint8_t inp(uint16_t _port) { uint8_t rv; __asm__ __volatile__("inb %1, %0" : "=a"(rv) : "dN"(_port)); @@ -471,6 +488,9 @@ static inline void write_msr (uint32_t msr_id, uint64_t msr_write_val) { : : "c" (msr_id), "a" (low_val), "d"(high_val)); } +#pragma GCC diagnostic push +/* The dereference of offset in the inline asm below generates this warning in GCC */ +#pragma GCC diagnostic ignored "-Warray-bounds" static inline uint64_t x86_read_gs_offset64(uintptr_t offset) { uint64_t ret; __asm__("movq %%gs:%1, %0" : "=r"(ret) : "m"(*(uint64_t*)(offset))); @@ -490,7 +510,9 @@ static inline uint32_t x86_read_gs_offset32(uintptr_t offset) { static inline void x86_write_gs_offset32(uintptr_t offset, uint32_t val) { __asm__("movl %0, %%gs:%1" : : "ir"(val), "m"(*(uint32_t*)(offset)) : "memory"); } +#pragma GCC diagnostic pop +/* cannot easily use C generics or C++ templates here, so do it the hard way */ #if __SIZEOF_POINTER__ == 8 static inline void *x86_read_gs_offset_ptr(uintptr_t offset) { return (void *)x86_read_gs_offset64(offset); @@ -503,7 +525,7 @@ static inline void *x86_read_gs_offset_ptr(uintptr_t offset) { return (void *)x86_read_gs_offset32(offset); } static inline void x86_write_gs_offset_ptr(uintptr_t offset, void *val) { - x86_write_gs_offset32(offset, (uint64_t)(val)); + x86_write_gs_offset32(offset, (uint32_t)(val)); } #endif diff --git a/arch/x86/include/arch/x86/descriptor.h b/arch/x86/include/arch/x86/descriptor.h index be95817f..cf4da2cb 100644 --- a/arch/x86/include/arch/x86/descriptor.h +++ b/arch/x86/include/arch/x86/descriptor.h @@ -26,7 +26,10 @@ #define USER_DATA_64_SELECTOR 0x40 /* base selector for a list of TSSes, one per cpu (SMP_MAX_CPUS) */ -#define TSS_SELECTOR 0x48 +#define TSS_SELECTOR_BASE 0x48 + +/* base selector for a gs segment per cpu (SMP_MAX_CPUS) */ +#define PERCPU_SELECTOR_BASE (TSS_SELECTOR_BASE + 8 * SMP_MAX_CPUS) /* code/data segment types (S = 1) */ /* bit 0 is accessed */ diff --git a/arch/x86/mp.c b/arch/x86/mp.c index 
7a58c2ba..d0173aa0 100644 --- a/arch/x86/mp.c +++ b/arch/x86/mp.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -36,13 +37,17 @@ void x86_percpu_init_early(uint cpu_num, uint apic_id) { percpu->cpu_num = cpu_num; percpu->apic_id = apic_id; - // XXX load into gs:/fs/etc #if ARCH_X86_64 + // use the 64-bit gs base msr to set up a pointer to the percpu struct write_msr(X86_MSR_IA32_KERNEL_GS_BASE, 0); write_msr(X86_MSR_IA32_GS_BASE, (uint64_t)percpu); #else -#error implement + // set up a gs descriptor for this cpu + uint16_t selector = PERCPU_SELECTOR_BASE + cpu_num; + x86_set_gdt_descriptor(selector, percpu, sizeof(*percpu), 1, 0, 1, SEG_TYPE_DATA_RW, 0, 1); + x86_set_gs(selector); #endif + __UNUSED volatile uint foo = x86_get_cpu_num(); } status_t arch_mp_send_ipi(mp_cpu_mask_t target, mp_ipi_t ipi) { diff --git a/arch/x86/rules.mk b/arch/x86/rules.mk index 0ae86ce0..985f8f53 100644 --- a/arch/x86/rules.mk +++ b/arch/x86/rules.mk @@ -53,6 +53,9 @@ SMP_MAX_CPUS ?= 8 GLOBAL_DEFINES += \ WITH_SMP=1 \ SMP_MAX_CPUS=$(SMP_MAX_CPUS) +else +GLOBAL_DEFINES += \ + SMP_MAX_CPUS=1 endif MODULE_SRCS += \ @@ -132,12 +135,10 @@ else ifeq ($(SUBARCH),x86-32) ARCH_COMPILEFLAGS += -march=i686 ARCH_OPTFLAGS := -O2 GLOBAL_DEFINES += X86_LEGACY=0 -GLOBAL_DEFINES += WITH_SMP=1 else ifeq ($(SUBARCH),x86-64) ARCH_COMPILEFLAGS += -march=x86-64 ARCH_OPTFLAGS := -O2 GLOBAL_DEFINES += X86_LEGACY=0 -GLOBAL_DEFINES += WITH_SMP=1 endif LIBGCC := $(shell $(TOOLCHAIN_PREFIX)gcc $(GLOBAL_COMPILEFLAGS) $(ARCH_COMPILEFLAGS) -print-libgcc-file-name) From 1afb5d7a66c216c71f84329b23de446257c90eff Mon Sep 17 00:00:00 2001 From: Travis Geiselbrecht Date: Fri, 6 Dec 2024 23:40:26 -0800 Subject: [PATCH 04/26] WIP x86 smp: start the framework for detecting and starting secondary cores --- arch/x86/mp.c | 9 ++- lib/acpi_lite/acpi_lite.cpp | 35 ++++++++++- lib/acpi_lite/include/lib/acpi_lite.h | 6 +- lib/acpi_lite/include/lib/acpi_lite/structs.h | 7 +++ platform/pc/mp.c | 63 
+++++++++++++++++++ platform/pc/platform.c | 44 +++++-------- platform/pc/platform_p.h | 2 + platform/pc/rules.mk | 1 + 8 files changed, 132 insertions(+), 35 deletions(-) create mode 100644 platform/pc/mp.c diff --git a/arch/x86/mp.c b/arch/x86/mp.c index d0173aa0..96e20c84 100644 --- a/arch/x86/mp.c +++ b/arch/x86/mp.c @@ -15,6 +15,8 @@ #include #include +#if WITH_SMP + // the boot cpu's percpu struct static x86_percpu_t x86_boot_percpu; // pointer to an array of percpu structs for each of the secondary cpus @@ -47,7 +49,6 @@ void x86_percpu_init_early(uint cpu_num, uint apic_id) { x86_set_gdt_descriptor(selector, percpu, sizeof(*percpu), 1, 0, 1, SEG_TYPE_DATA_RW, 0, 1); x86_set_gs(selector); #endif - __UNUSED volatile uint foo = x86_get_cpu_num(); } status_t arch_mp_send_ipi(mp_cpu_mask_t target, mp_ipi_t ipi) { @@ -56,3 +57,9 @@ status_t arch_mp_send_ipi(mp_cpu_mask_t target, mp_ipi_t ipi) { void arch_mp_init_percpu(void) { } + +#else + +void x86_percpu_init_early(uint cpu_num, uint apic_id) {} + +#endif \ No newline at end of file diff --git a/lib/acpi_lite/acpi_lite.cpp b/lib/acpi_lite/acpi_lite.cpp index beb1c2b5..5752a586 100644 --- a/lib/acpi_lite/acpi_lite.cpp +++ b/lib/acpi_lite/acpi_lite.cpp @@ -407,7 +407,7 @@ void acpi_lite_dump_tables(bool full_dump) { } } -status_t acpi_process_madt_entries_etc(const uint8_t search_type, const madt_entry_callback callback) { +status_t acpi_process_madt_entries_etc(const uint8_t search_type, const madt_entry_callback callback, void *cookie) { const acpi_madt_table* madt = reinterpret_cast(acpi_get_table_by_sig(ACPI_MADT_SIG)); if (!madt) { @@ -417,14 +417,17 @@ status_t acpi_process_madt_entries_etc(const uint8_t search_type, const madt_ent // bytewise array of the same table const uint8_t* madt_array = reinterpret_cast(madt); + LTRACEF("table at %p\n", madt_array); + // walk the table off the end of the header, looking for the requested type size_t off = sizeof(*madt); while (off < madt->header.length) { uint8_t 
type = madt_array[off]; uint8_t length = madt_array[off + 1]; + LTRACEF("type %u, length %u\n", type, length); if (type == search_type) { - callback(static_cast(&madt_array[off]), length); + callback(static_cast(&madt_array[off]), length, cookie); } off += length; @@ -433,4 +436,32 @@ status_t acpi_process_madt_entries_etc(const uint8_t search_type, const madt_ent return NO_ERROR; } +void acpi_lite_dump_madt_table() { + auto local_apic_callback = [](const void *_entry, size_t entry_len, void *cookie) { + const auto *entry = reinterpret_cast(_entry); + + printf("\tLOCAL APIC id %d, processor id %d, flags %#x\n", + entry->apic_id, entry->processor_id, entry->flags); + }; + + auto io_apic_callback = [](const void *_entry, size_t entry_len, void *cookie) { + const auto *entry = reinterpret_cast(_entry); + + printf("\tIO APIC id %d, address %#x gsi base %u\n", + entry->io_apic_id, entry->io_apic_address, entry->global_system_interrupt_base); + }; + + auto int_source_override_callback = [](const void *_entry, size_t entry_len, void *cookie) { + const auto *entry = reinterpret_cast(_entry); + + printf("\tINT OVERRIDE bus %u, source %u, gsi %u, flags %#x\n", + entry->bus, entry->source, entry->global_sys_interrupt, entry->flags); + }; + printf("MADT/APIC table:\n"); + acpi_process_madt_entries_etc(ACPI_MADT_TYPE_LOCAL_APIC, local_apic_callback, nullptr); + acpi_process_madt_entries_etc(ACPI_MADT_TYPE_IO_APIC, io_apic_callback, nullptr); + acpi_process_madt_entries_etc(ACPI_MADT_TYPE_INT_SOURCE_OVERRIDE, int_source_override_callback, nullptr); +} + + // vim: set ts=2 sw=2 expandtab: diff --git a/lib/acpi_lite/include/lib/acpi_lite.h b/lib/acpi_lite/include/lib/acpi_lite.h index 47a63942..01fbaadf 100644 --- a/lib/acpi_lite/include/lib/acpi_lite.h +++ b/lib/acpi_lite/include/lib/acpi_lite.h @@ -17,12 +17,14 @@ __BEGIN_CDECLS status_t acpi_lite_init(paddr_t rsdt); void acpi_lite_dump_tables(bool full_dump); +void acpi_lite_dump_madt_table(void); const struct acpi_sdt_header* 
acpi_get_table_by_sig(const char* sig); // A routine to iterate over all the MADT entries of a particular type via a callback //using MadtEntryCallback = fbl::Function; -typedef void (*madt_entry_callback)(const void* entry, size_t entry_len); -status_t acpi_process_madt_entries_etc(uint8_t search_type, const madt_entry_callback); +typedef void (*madt_entry_callback)(const void* entry, size_t entry_len, void *cookie); +status_t acpi_process_madt_entries_etc(uint8_t search_type, const madt_entry_callback, void *cookie); + __END_CDECLS diff --git a/lib/acpi_lite/include/lib/acpi_lite/structs.h b/lib/acpi_lite/include/lib/acpi_lite/structs.h index b84c810d..df9d0d5c 100644 --- a/lib/acpi_lite/include/lib/acpi_lite/structs.h +++ b/lib/acpi_lite/include/lib/acpi_lite/structs.h @@ -230,6 +230,7 @@ static_assert(sizeof(struct acpi_madt_int_source_override_entry) == 10, ""); #define ACPI_MADT_FLAG_TRIGGER_MASK 0b1100 // DBG2 table +// From https://learn.microsoft.com/en-us/windows-hardware/drivers/bringup/acpi-debug-port-table #define ACPI_DBG2_SIG "DBG2" struct acpi_dbg2_table { struct acpi_sdt_header header; @@ -263,7 +264,13 @@ static_assert(sizeof(struct acpi_dbg2_device) == 22, ""); // debug port subtypes #define ACPI_DBG2_SUBTYPE_16550_COMPATIBLE 0x0000 #define ACPI_DBG2_SUBTYPE_16550_SUBSET 0x0001 +#define ACPI_DBG2_SUBTYPE_PL011 0x0003 +#define ACPI_DBG2_SUBTYPE_ARM_SBSA 0x000e +#define ACPI_DBG2_SUBTYPE_16550_DESCRIBED 0x0012 +#define ACPI_DBG2_SUBTYPE_RISCV_SBI 0x0015 + #define ACPI_DBG2_SUBTYPE_1394_STANDARD 0x0000 + #define ACPI_DBG2_SUBTYPE_USB_XHCI 0x0000 #define ACPI_DBG2_SUBTYPE_USB_EHCI 0x0001 diff --git a/platform/pc/mp.c b/platform/pc/mp.c new file mode 100644 index 00000000..2df28721 --- /dev/null +++ b/platform/pc/mp.c @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2024 Travis Geiselbrecht + * + * Use of this source code is governed by a MIT-style + * license that can be found in the LICENSE file or at + * https://opensource.org/licenses/MIT + */ + +#include 
"platform_p.h" + +#include +#include +#include + +#define LOCAL_TRACE 1 + +static void start_cpu(uint cpu_num, uint32_t apic_id) { + LTRACEF("cpu_num %u, apic_id %u\n", cpu_num, apic_id); + + // XXX do work here +} + +struct detected_cpus { + uint32_t num_detected; + uint32_t apic_ids[SMP_MAX_CPUS]; +}; + +static void local_apic_callback(const void *_entry, size_t entry_len, void *cookie) { + const struct acpi_madt_local_apic_entry *entry = _entry; + struct detected_cpus *cpus = cookie; + + if (entry->apic_id == 0) { + // skip the boot cpu + return; + } + if (cpus->num_detected < SMP_MAX_CPUS) { + cpus->apic_ids[cpus->num_detected++] = entry->apic_id; + } +} + +void platform_start_secondary_cpus(void) { + struct detected_cpus cpus; + cpus.num_detected = 1; + cpus.apic_ids[0] = 0; // the boot cpu + + acpi_process_madt_entries_etc(ACPI_MADT_TYPE_LOCAL_APIC, &local_apic_callback, &cpus); + + // TODO: fall back to legacy methods if ACPI fails + // TODO: deal with cpu topology + + // start up the secondary cpus + if (cpus.num_detected > 1) { + dprintf(INFO, "PC: detected %u cpus\n", cpus.num_detected); + + lk_init_secondary_cpus(cpus.num_detected - 1); + + for (uint i = 1; i < cpus.num_detected; i++) { + dprintf(INFO, "PC: starting cpu %u\n", cpus.apic_ids[i]); + start_cpu(i, cpus.apic_ids[i]); + } + } +} + diff --git a/platform/pc/platform.c b/platform/pc/platform.c index c01c248e..7bd968ac 100644 --- a/platform/pc/platform.c +++ b/platform/pc/platform.c @@ -13,7 +13,6 @@ #include #include #include -#include "platform_p.h" #include #include #include @@ -22,12 +21,13 @@ #include #include #include -#include #include #include #include #include +#include "platform_p.h" + #if WITH_DEV_BUS_PCI #include #endif @@ -218,44 +218,28 @@ void platform_early_init(void) { dprintf(INFO, "PC: total memory detected %" PRIu64 " bytes\n", total_mem); } -void local_apic_callback(const void *_entry, size_t entry_len) { - const struct acpi_madt_local_apic_entry *entry = _entry; - - 
printf("\tLOCAL APIC id %d, processor id %d, flags %#x\n", - entry->apic_id, entry->processor_id, entry->flags); -} - -void io_apic_callback(const void *_entry, size_t entry_len) { - const struct acpi_madt_io_apic_entry *entry = _entry; - - printf("\tIO APIC id %d, address %#x gsi base %u\n", - entry->io_apic_id, entry->io_apic_address, entry->global_system_interrupt_base); -} - -void int_source_override_callback(const void *_entry, size_t entry_len) { - const struct acpi_madt_int_source_override_entry *entry = _entry; - - printf("\tINT OVERRIDE bus %u, source %u, gsi %u, flags %#x\n", - entry->bus, entry->source, entry->global_sys_interrupt, entry->flags); -} - void platform_init(void) { platform_init_debug(); platform_init_keyboard(&console_input_buf); -#if WITH_DEV_BUS_PCI - bool pci_initted = false; + // Look for the root ACPI table + bool found_acpi = false; if (acpi_lite_init(0) == NO_ERROR) { if (LOCAL_TRACE) { acpi_lite_dump_tables(false); } + acpi_lite_dump_madt_table(); + found_acpi = true; + } - // dump the APIC table - printf("MADT/APIC table:\n"); - acpi_process_madt_entries_etc(ACPI_MADT_TYPE_LOCAL_APIC, &local_apic_callback); - acpi_process_madt_entries_etc(ACPI_MADT_TYPE_IO_APIC, &io_apic_callback); - acpi_process_madt_entries_etc(ACPI_MADT_TYPE_INT_SOURCE_OVERRIDE, &int_source_override_callback); + // Look for secondary cpus + platform_start_secondary_cpus(); + +#if WITH_DEV_BUS_PCI + bool pci_initted = false; + if (found_acpi) { + // TODO: handle interrupt source overrides from the MADT table // try to find the mcfg table const struct acpi_mcfg_table *table = (const struct acpi_mcfg_table *)acpi_get_table_by_sig(ACPI_MCFG_SIG); diff --git a/platform/pc/platform_p.h b/platform/pc/platform_p.h index 59e340fd..516aea8e 100644 --- a/platform/pc/platform_p.h +++ b/platform/pc/platform_p.h @@ -26,3 +26,5 @@ void pic_mask_interrupts(void); void lapic_init(void); void lapic_eoi(unsigned int vector); +// secondary cpus +void 
platform_start_secondary_cpus(void); diff --git a/platform/pc/rules.mk b/platform/pc/rules.mk index 9b62107d..dba1abfe 100644 --- a/platform/pc/rules.mk +++ b/platform/pc/rules.mk @@ -23,6 +23,7 @@ MODULE_SRCS += \ $(LOCAL_DIR)/interrupts.c \ $(LOCAL_DIR)/keyboard.c \ $(LOCAL_DIR)/lapic.c \ + $(LOCAL_DIR)/mp.c \ $(LOCAL_DIR)/pic.c \ $(LOCAL_DIR)/platform.c \ $(LOCAL_DIR)/timer.c \ From 1ca821ec542f1adfa68264c0ea6b0ced4c17b6a9 Mon Sep 17 00:00:00 2001 From: Travis Geiselbrecht Date: Fri, 6 Dec 2024 23:44:19 -0800 Subject: [PATCH 05/26] WIP x86-smp squelch some warnings in no smp mode --- platform/pc/mp.c | 3 +++ platform/pc/platform.c | 4 +++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/platform/pc/mp.c b/platform/pc/mp.c index 2df28721..616a10d2 100644 --- a/platform/pc/mp.c +++ b/platform/pc/mp.c @@ -12,6 +12,8 @@ #include #include +#if WITH_SMP + #define LOCAL_TRACE 1 static void start_cpu(uint cpu_num, uint32_t apic_id) { @@ -61,3 +63,4 @@ void platform_start_secondary_cpus(void) { } } +#endif // WITH_SMP \ No newline at end of file diff --git a/platform/pc/platform.c b/platform/pc/platform.c index 7bd968ac..8095d6cb 100644 --- a/platform/pc/platform.c +++ b/platform/pc/platform.c @@ -224,7 +224,7 @@ void platform_init(void) { platform_init_keyboard(&console_input_buf); // Look for the root ACPI table - bool found_acpi = false; + __UNUSED bool found_acpi = false; if (acpi_lite_init(0) == NO_ERROR) { if (LOCAL_TRACE) { acpi_lite_dump_tables(false); @@ -234,7 +234,9 @@ void platform_init(void) { } // Look for secondary cpus +#if WITH_SMP platform_start_secondary_cpus(); +#endif #if WITH_DEV_BUS_PCI bool pci_initted = false; From 6538baea708cbf1023d4a977ef4d34b11e23f2ac Mon Sep 17 00:00:00 2001 From: Travis Geiselbrecht Date: Wed, 11 Dec 2024 00:17:09 -0800 Subject: [PATCH 06/26] WIP x86-smp add uspace mmu support for x86-64 trampoline x86-64 cpus to long mode and into the kernel aspace --- arch/arm64/mmu.c | 2 +- arch/x86/32/start.S | 20 +-- 
arch/x86/64/mmu.c | 86 ++++++++++--- arch/x86/64/start.S | 11 +- arch/x86/include/arch/aspace.h | 12 +- arch/x86/rules.mk | 10 +- lib/acpi_lite/acpi_lite.cpp | 3 +- lib/acpi_lite/include/lib/acpi_lite.h | 3 +- platform/pc/lapic.c | 99 ++++++++++++++- platform/pc/mp-boot.S | 167 ++++++++++++++++++++++++++ platform/pc/mp.c | 88 ++++++++++++-- platform/pc/platform_p.h | 3 + platform/pc/rules.mk | 1 + 13 files changed, 446 insertions(+), 59 deletions(-) create mode 100644 platform/pc/mp-boot.S diff --git a/arch/arm64/mmu.c b/arch/arm64/mmu.c index e5fbd83c..51e3b5ea 100644 --- a/arch/arm64/mmu.c +++ b/arch/arm64/mmu.c @@ -34,7 +34,7 @@ __SECTION(".bss.prebss.translation_table"); /* the base TCR flags, computed from early init code in start.S */ uint64_t arm64_mmu_tcr_flags __SECTION(".bss.prebss.tcr_flags"); -static inline bool is_valid_vaddr(arch_aspace_t *aspace, vaddr_t vaddr) { +static inline bool is_valid_vaddr(const arch_aspace_t *aspace, vaddr_t vaddr) { return (vaddr >= aspace->base && vaddr <= aspace->base + aspace->size - 1); } diff --git a/arch/x86/32/start.S b/arch/x86/32/start.S index bd0ff483..79b616fe 100644 --- a/arch/x86/32/start.S +++ b/arch/x86/32/start.S @@ -118,15 +118,6 @@ paging_setup: addl $4096, %eax movl %eax, 12(%esi) movl %eax, 12(%edi) - - /* Set PD in CR3 */ - movl $PHYS(kernel_pd), %eax - mov %eax, %cr3 - - /* Enabling Paging and from this point we are in */ - mov %cr0, %eax - btsl $(31), %eax - mov %eax, %cr0 #else /* map the first 1GB 1:1 using 4MB pages */ movl $PHYS(kernel_pd), %esi @@ -154,19 +145,20 @@ paging_setup: addl $0x00400000, %eax loop .Lfill_pd2 + /* enable PSE (4MB pages) */ + mov %cr4, %eax + orl $(1<<4), %eax + mov %eax, %cr4 +#endif + /* Set PD in CR3 */ movl $PHYS(kernel_pd), %eax mov %eax, %cr3 /* Enabling Paging and from this point we are in */ - mov %cr4, %eax - orl $0x10, %eax - mov %eax, %cr4 - mov %cr0, %eax btsl $(31), %eax mov %eax, %cr0 -#endif /* load the high kernel stack */ movl $(_kstack + 4096), %esp diff 
--git a/arch/x86/64/mmu.c b/arch/x86/64/mmu.c index 46767db1..a0343a73 100644 --- a/arch/x86/64/mmu.c +++ b/arch/x86/64/mmu.c @@ -24,6 +24,7 @@ #include #define LOCAL_TRACE 0 +#define TRACE_CONTEXT_SWITCH 1 /* Address width including virtual/physical address*/ static uint8_t vaddr_width = 0; @@ -85,6 +86,11 @@ static bool x86_mmu_check_paddr(const paddr_t paddr) { return paddr <= max_paddr; } +/* is the address within the aspace */ +static bool is_valid_vaddr(const arch_aspace_t *aspace, vaddr_t vaddr) { + return (vaddr >= aspace->base && vaddr <= aspace->base + aspace->size - 1); +} + static inline uint64_t get_pfn_from_pte(uint64_t pte) { uint64_t pfn = (pte & (X86_PG_FRAME & X86_PHY_ADDR_MASK)); return pfn; @@ -517,16 +523,13 @@ int arch_mmu_unmap(arch_aspace_t * const aspace, const vaddr_t vaddr, const uint DEBUG_ASSERT(aspace); - if (!(x86_mmu_check_vaddr(vaddr))) + if (!is_valid_vaddr(aspace, vaddr)) return ERR_INVALID_ARGS; if (count == 0) return NO_ERROR; - DEBUG_ASSERT(x86_get_cr3()); - paddr_t current_cr3_val = x86_get_cr3(); - - return (x86_mmu_unmap(paddr_to_kvaddr(current_cr3_val), vaddr, count)); + return (x86_mmu_unmap(aspace->cr3, vaddr, count)); } /** @@ -573,13 +576,12 @@ status_t arch_mmu_query(arch_aspace_t * const aspace, const vaddr_t vaddr, paddr if (!paddr) return ERR_INVALID_ARGS; - DEBUG_ASSERT(x86_get_cr3()); - paddr_t current_cr3_val = (addr_t)x86_get_cr3(); - uint64_t *cr3_virt = paddr_to_kvaddr(current_cr3_val); + if (!is_valid_vaddr(aspace, vaddr)) + return ERR_INVALID_ARGS; arch_flags_t ret_flags; uint32_t ret_level; - status_t stat = x86_mmu_get_mapping(cr3_virt, vaddr, &ret_level, &ret_flags, paddr); + status_t stat = x86_mmu_get_mapping(aspace->cr3, vaddr, &ret_level, &ret_flags, paddr); if (stat) return stat; @@ -602,26 +604,23 @@ int arch_mmu_map(arch_aspace_t *const aspace, const vaddr_t vaddr, const paddr_t if ((!x86_mmu_check_paddr(paddr))) return ERR_INVALID_ARGS; - if (!x86_mmu_check_vaddr(vaddr)) + if 
(!is_valid_vaddr(aspace, vaddr)) return ERR_INVALID_ARGS; if (count == 0) return NO_ERROR; - DEBUG_ASSERT(x86_get_cr3()); - addr_t current_cr3_val = (addr_t)x86_get_cr3(); - struct map_range range; range.start_vaddr = vaddr; range.start_paddr = paddr; range.size = count * PAGE_SIZE; - return (x86_mmu_map_range(paddr_to_kvaddr(current_cr3_val), &range, flags)); + return (x86_mmu_map_range(aspace->cr3, &range, flags)); } bool arch_mmu_supports_nx_mappings(void) { return true; } bool arch_mmu_supports_ns_mappings(void) { return false; } -bool arch_mmu_supports_user_aspaces(void) { return false; } +bool arch_mmu_supports_user_aspaces(void) { return true; } void x86_mmu_early_init(void) { /* Set WP bit in CR0*/ @@ -671,20 +670,69 @@ void x86_mmu_init(void) { status_t arch_mmu_init_aspace(arch_aspace_t * const aspace, const vaddr_t base, const size_t size, const uint flags) { DEBUG_ASSERT(aspace); - if ((flags & ARCH_ASPACE_FLAG_KERNEL) == 0) { - return ERR_NOT_SUPPORTED; + LTRACEF("aspace %p, base %#lx, size %zu, flags %#x\n", aspace, base, size, flags); + + /* validate that the base + size is sane and doesn't wrap */ + DEBUG_ASSERT(size > PAGE_SIZE); + DEBUG_ASSERT(base + size - 1 > base); + + aspace->flags = flags; + if (flags & ARCH_ASPACE_FLAG_KERNEL) { + /* at the moment we can only deal with address spaces as globally defined */ + DEBUG_ASSERT(base == KERNEL_ASPACE_BASE); + DEBUG_ASSERT(size == KERNEL_ASPACE_SIZE); + + aspace->base = base; + aspace->size = size; + aspace->cr3 = kernel_pml4; + aspace->cr3_phys = vaddr_to_paddr(aspace->cr3); + } else { + DEBUG_ASSERT(base == USER_ASPACE_BASE); + DEBUG_ASSERT(size == USER_ASPACE_SIZE); + + aspace->base = base; + aspace->size = size; + + map_addr_t *va = pmm_alloc_kpages(1, NULL); + if (!va) { + return ERR_NO_MEMORY; + } + + aspace->cr3 = va; + aspace->cr3_phys = vaddr_to_paddr(aspace->cr3); + + /* copy the top entries from the kernel top table */ + memcpy(aspace->cr3 + NO_OF_PT_ENTRIES/2, kernel_pml4 + 
NO_OF_PT_ENTRIES/2, PAGE_SIZE/2); + + /* zero out the rest */ + memset(aspace->cr3, 0, PAGE_SIZE/2); } return NO_ERROR; } status_t arch_mmu_destroy_aspace(arch_aspace_t *aspace) { + PANIC_UNIMPLEMENTED; return NO_ERROR; } void arch_mmu_context_switch(arch_aspace_t *aspace) { - if (aspace != NULL) { - PANIC_UNIMPLEMENTED; + if (TRACE_CONTEXT_SWITCH) + TRACEF("aspace %p\n", aspace); + + uint64_t cr3; + if (aspace) { + DEBUG_ASSERT((aspace->flags & ARCH_ASPACE_FLAG_KERNEL) == 0); + + cr3 = aspace->cr3_phys; + } else { + // TODO save copy of this + cr3 = vaddr_to_paddr(kernel_pml4); } + if (TRACE_CONTEXT_SWITCH) { + TRACEF("cr3 %#llx\n", cr3); + } + + x86_set_cr3(cr3); } diff --git a/arch/x86/64/start.S b/arch/x86/64/start.S index cc24658d..a8420441 100644 --- a/arch/x86/64/start.S +++ b/arch/x86/64/start.S @@ -96,7 +96,7 @@ paging_setup: /* PAE bit must be enabled for 64 bit paging*/ mov %cr4, %eax - btsl $(5), %eax + or $(1<<5), %eax mov %eax, %cr4 /* load the physical pointer to the top level page table */ @@ -201,6 +201,15 @@ highaddr: /* reload the gdtr */ lgdt _gdtr + /* zero out the data selectors */ + xor %eax, %eax + movw %ax, %ds + movw %ax, %es + movw %ax, %fs + movw %ax, %ss + movw %ax, %gs + movw %ax, %ss + /* set up the idt */ call setup_idt diff --git a/arch/x86/include/arch/aspace.h b/arch/x86/include/arch/aspace.h index e8d26ff0..32cbac86 100644 --- a/arch/x86/include/arch/aspace.h +++ b/arch/x86/include/arch/aspace.h @@ -8,11 +8,21 @@ #pragma once #include +#include +#include __BEGIN_CDECLS struct arch_aspace { - // nothing for now, does not support address spaces other than the kernel + /* pointer to the root page table */ + paddr_t cr3_phys; + map_addr_t *cr3; + + uint flags; + + /* range of address space */ + vaddr_t base; + size_t size; }; __END_CDECLS diff --git a/arch/x86/rules.mk b/arch/x86/rules.mk index 985f8f53..148c0eb7 100644 --- a/arch/x86/rules.mk +++ b/arch/x86/rules.mk @@ -18,8 +18,8 @@ KERNEL_BASE ?= 0x80000000 KERNEL_LOAD_OFFSET ?= 
0x00200000 KERNEL_ASPACE_BASE ?= 0x80000000 KERNEL_ASPACE_SIZE ?= 0x7ff00000 -USER_ASPACE_BASE ?= 0 -USER_ASPACE_SIZE ?= 0x80000000 +USER_ASPACE_BASE ?= 0x1000 # 4KB +USER_ASPACE_SIZE ?= 0x7fffe000 # 2GB - 2*4KB SUBARCH_DIR := $(LOCAL_DIR)/32 endif @@ -32,8 +32,8 @@ KERNEL_BASE ?= 0xffffffff80000000 KERNEL_LOAD_OFFSET ?= 0x00200000 KERNEL_ASPACE_BASE ?= 0xffffff8000000000UL # -512GB KERNEL_ASPACE_SIZE ?= 0x0000008000000000UL -USER_ASPACE_BASE ?= 0x0000000000000000UL -USER_ASPACE_SIZE ?= 0x0000800000000000UL +USER_ASPACE_BASE ?= 0x0000000000001000UL # 4KB +USER_ASPACE_SIZE ?= 0x00007fffffffe000UL # ((1<<47) - 2*4KB) SUBARCH_DIR := $(LOCAL_DIR)/64 endif @@ -46,6 +46,8 @@ GLOBAL_DEFINES += \ KERNEL_LOAD_OFFSET=$(KERNEL_LOAD_OFFSET) \ KERNEL_ASPACE_BASE=$(KERNEL_ASPACE_BASE) \ KERNEL_ASPACE_SIZE=$(KERNEL_ASPACE_SIZE) \ + USER_ASPACE_BASE=$(USER_ASPACE_BASE) \ + USER_ASPACE_SIZE=$(USER_ASPACE_SIZE) \ ARCH_HAS_MMU=1 ifeq ($(WITH_SMP),1) diff --git a/lib/acpi_lite/acpi_lite.cpp b/lib/acpi_lite/acpi_lite.cpp index 5752a586..7b5b09f6 100644 --- a/lib/acpi_lite/acpi_lite.cpp +++ b/lib/acpi_lite/acpi_lite.cpp @@ -407,7 +407,7 @@ void acpi_lite_dump_tables(bool full_dump) { } } -status_t acpi_process_madt_entries_etc(const uint8_t search_type, const madt_entry_callback callback, void *cookie) { +status_t acpi_process_madt_entries_etc(const uint8_t search_type, const madt_entry_callback callback, void * const cookie) { const acpi_madt_table* madt = reinterpret_cast(acpi_get_table_by_sig(ACPI_MADT_SIG)); if (!madt) { @@ -463,5 +463,4 @@ void acpi_lite_dump_madt_table() { acpi_process_madt_entries_etc(ACPI_MADT_TYPE_INT_SOURCE_OVERRIDE, int_source_override_callback, nullptr); } - // vim: set ts=2 sw=2 expandtab: diff --git a/lib/acpi_lite/include/lib/acpi_lite.h b/lib/acpi_lite/include/lib/acpi_lite.h index 01fbaadf..4c6a19c7 100644 --- a/lib/acpi_lite/include/lib/acpi_lite.h +++ b/lib/acpi_lite/include/lib/acpi_lite.h @@ -11,7 +11,6 @@ #include #include #include -#include 
__BEGIN_CDECLS @@ -24,7 +23,7 @@ const struct acpi_sdt_header* acpi_get_table_by_sig(const char* sig); // A routine to iterate over all the MADT entries of a particular type via a callback //using MadtEntryCallback = fbl::Function; typedef void (*madt_entry_callback)(const void* entry, size_t entry_len, void *cookie); -status_t acpi_process_madt_entries_etc(uint8_t search_type, const madt_entry_callback, void *cookie); +status_t acpi_process_madt_entries_etc(uint8_t search_type, madt_entry_callback, void *cookie); __END_CDECLS diff --git a/platform/pc/lapic.c b/platform/pc/lapic.c index 145574a9..45ab1845 100644 --- a/platform/pc/lapic.c +++ b/platform/pc/lapic.c @@ -22,10 +22,59 @@ #include #include -#define LOCAL_TRACE 0 +#define LOCAL_TRACE 1 static bool lapic_present = false; -static uint8_t *lapic_mmio; +static volatile uint32_t *lapic_mmio; + +// local apic registers +enum lapic_regs { + LAPIC_ID = 0x20, + LAPIC_VERSION = 0x30, + LAPIC_TPR = 0x80, + LAPIC_APR = 0x90, + LAPIC_PPR = 0xa0, + LAPIC_EOI = 0xb0, + LAPIC_RRD = 0xc0, + LAPIC_LDR = 0xd0, + LAPIC_DFR = 0xe0, + LAPIC_SVR = 0xf0, + LAPIC_ISR0 = 0x100, + + LAPIC_TMR0 = 0x180, + + LAPIC_IRR0 = 0x200, + + LAPIC_ESR = 0x280, + + LAPIC_CMCI = 0x2f0, + LAPIC_ICRLO = 0x300, + LAPIC_ICRHI = 0x310, + LAPIC_TIMER = 0x320, + LAPIC_THERMAL = 0x330, + LAPIC_PERF = 0x340, + LAPIC_LINT0 = 0x350, + LAPIC_LINT1 = 0x360, + LAPIC_ERROR = 0x370, + LAPIC_TICR = 0x380, + LAPIC_TCCR = 0x390, + LAPIC_DIV = 0x3e0, + + // Extended features + LAPIC_EXT_FEATURES = 0x400, + LAPIC_EXT_CONTROL = 0x410, + LAPIC_EXT_SEOI = 0x420, + LAPIC_EXT_IER0 = 0x480, + LAPIC_EXT_LVT0 = 0x500, +}; + +static uint32_t lapic_read(enum lapic_regs reg) { + return mmio_read32(lapic_mmio + reg / 4); +} + +static void lapic_write(enum lapic_regs reg, uint32_t val) { + mmio_write32(lapic_mmio + reg / 4, val); +} void lapic_init(void) { // discover the presence of the local apic and map it @@ -43,9 +92,14 @@ void lapic_init_postvm(uint level) { // 
IA32_APIC_BASE_MSR uint64_t apic_base = read_msr(0x1b); - LTRACEF("apic base %#llx\n", apic_base); + LTRACEF("raw apic base msr %#llx\n", apic_base); - // TODO: assert that it's enabled + // make sure it's enabled + if ((apic_base & 0x800) == 0) { + dprintf(INFO, "X86: enabling lapic\n"); + apic_base |= 0x800; + write_msr(0x1b, apic_base); + } apic_base &= ~0xfff; dprintf(INFO, "X86: lapic physical address %#llx\n", apic_base); @@ -54,6 +108,20 @@ void lapic_init_postvm(uint level) { status_t err = vmm_alloc_physical(vmm_get_kernel_aspace(), "lapic", PAGE_SIZE, (void **)&lapic_mmio, 0, apic_base & ~0xfff, /* vmm_flags */ 0, ARCH_MMU_FLAG_UNCACHED_DEVICE); ASSERT(err == NO_ERROR); + + // Read the local apic id and version and features + uint32_t id = lapic_read(LAPIC_ID); + uint32_t version = lapic_read(LAPIC_VERSION); + bool eas = version & (1u<<31); + uint32_t max_lvt = (version >> 16) & 0xff; + version &= 0xff; + dprintf(INFO, "X86: local apic id %#x version %#x\n", id, version); + dprintf(INFO, "X86: local apic max lvt entries %u\n", max_lvt); + if (eas) { + dprintf(INFO, "X86: local apic EAS features %#x\n", lapic_read(LAPIC_EXT_FEATURES)); + } + + } LK_INIT_HOOK(lapic, lapic_init_postvm, LK_INIT_LEVEL_VM); @@ -61,7 +129,28 @@ LK_INIT_HOOK(lapic, lapic_init_postvm, LK_INIT_LEVEL_VM); void lapic_eoi(unsigned int vector) { LTRACEF("vector %#x\n", vector); if (lapic_present) { - *REG32(lapic_mmio + 0xb0) = 1; + lapic_write(LAPIC_EOI, 0); } } +void lapic_send_init_ipi(uint32_t apic_id, bool level) { + if (lapic_present) { + lapic_write(LAPIC_ICRHI, apic_id << 24); + lapic_write(LAPIC_ICRLO, (5u << 8) | (level ? 
(1u << 14) : 0)); + } +} + +void lapic_send_startup_ipi(uint32_t apic_id, uint32_t startup_vector) { + if (lapic_present) { + lapic_write(LAPIC_ICRHI, apic_id << 24); + lapic_write(LAPIC_ICRLO, (6u << 8) | (startup_vector >> 12)); + } +} + +void lapic_send_ipi(uint32_t apic_id, uint32_t vector) { + if (lapic_present) { + lapic_write(LAPIC_ICRHI, apic_id << 24); + // XXX add correct flag bits + lapic_write(LAPIC_ICRLO, vector); + } +} \ No newline at end of file diff --git a/platform/pc/mp-boot.S b/platform/pc/mp-boot.S new file mode 100644 index 00000000..f693819b --- /dev/null +++ b/platform/pc/mp-boot.S @@ -0,0 +1,167 @@ +#include +#include + +#define LOAD_ADDRESS 0x4000 +#define MSR_EFER 0xc0000080 +#define EFER_LME 0x00000100 + +#define ARGS_ADDRESS (LOAD_ADDRESS + 0x1000) +#define ARGS_CR3 (ARGS_ADDRESS + 0x00) +#define ARGS_STACK (ARGS_ADDRESS + 0x08) + +.text +.code16 +// secondary cpu boot entry point and switch to protected mode +// enters with the following state: +// real mode, CS 0x0400, PC 0 (physical address 0x4000) +FUNCTION(mp_boot_start) + // jump over the temp GDT below and switch to a flat memory segment (0) + ljmp $0, $(LOAD_ADDRESS + 0x28) + +.org 0x8 +.Lgdt: + // temporary GDT to get us into protected mode + // stuff the GDTR in the first entry + .short (8*4) + .int (LOAD_ADDRESS + 0x8) // address of .Lgdt + .short 0 + + // 0x8 code flat 32bit + .short 0xffff /* limit 15:00 */ + .short 0x0000 /* base 15:00 */ + .byte 0x00 /* base 23:16 */ + .byte 0b10011010 /* P(1) DPL(00) S(1) 1 C(0) R(1) A(0) */ + .byte 0b11001111 /* G(1) D(1) 0 0 limit 19:16 */ + .byte 0x0 /* base 31:24 */ + + // 0x10 data flat 32bit + .short 0xffff /* limit 15:00 */ + .short 0x0000 /* base 15:00 */ + .byte 0x00 /* base 23:16 */ + .byte 0b10010010 /* P(1) DPL(00) S(1) 0 E(0) W(1) A(0) */ + .byte 0b11001111 /* G(1) B(1) 0 0 limit 19:16 */ + .byte 0x0 /* base 31:24 */ + + // 0x18 code 64bit + .short 0xffff /* limit 15:00 */ + .short 0x0000 /* base 15:00 */ + .byte 0x00 /* 
base 23:16 */ + .byte 0b10011010 /* P(1) DPL(00) S(1) 1 C(0) R(1) A(0) */ + .byte 0b10101111 /* G(1) D(0) L(1) AVL(0) limit 19:16 */ + .byte 0x0 /* base 31:24 */ + +.org 0x28 // 0x08 + 0x20 + // load the above GDT + lgdt (LOAD_ADDRESS + 0x08) + + // switch to protected mode + movl %cr0, %eax + orl $1, %eax + movl %eax, %cr0 + + // jump to 32bit mode + ljmpl $0x8, $(LOAD_ADDRESS + 0x40) +.org 0x40 + .code32 +.Lprot: + // we're now in 32bit mode, set up the 32bit data segment registers + mov $0x10, %ax + mov %ax, %ss + mov %ax, %ds + mov %ax, %es + mov %ax, %fs + mov %ax, %gs + +#if ARCH_X86_64 + // set up 64bit paging + // set PAE bit in CR4 + mov %cr4, %eax + or $(1<<5), %eax + mov %eax, %cr4 + + // Enable Long mode + movl $MSR_EFER ,%ecx + rdmsr + orl $EFER_LME,%eax + wrmsr + + // load trampoline page table + movl (ARGS_CR3), %eax + mov %eax, %cr3 + + // enable paging, now we're in 32bit compatibility mode + mov %cr0, %eax + btsl $(31), %eax + mov %eax, %cr0 + + movl $(LOAD_ADDRESS + 0x800), %esp + + // Use a far jump to get into 64bit mode + pushl $0x18 + pushl $(LOAD_ADDRESS + 0x90) + lret + +.org 0x90 +.code64 +farjump64: + /* branch to our high address */ + movq (.Lhigh_addr), %rax + jmp *%rax +.Lhigh_addr: +.quad mp_boot_start_high + +#else // ARCH_X86_32 + // set up 32bit paging + + // set PSE bit in CR4 + mov %cr4, %eax + or $(1<<4), %eax + mov %eax, %cr4 + + // XXX load trampoline page table + + // get into high address + + // set up stack pointer + + // call into C + cld + jmp . 
+#endif + +DATA(mp_boot_end) +END_FUNCTION(mp_boot_start) + +FUNCTION(mp_boot_start_high) +#if ARCH_X86_64 + // set up stack pointer + mov $(ARGS_STACK), %rsp + + // load the real GDT + lgdt _gdtr + + push $CODE_64_SELECTOR + lea .Lnext(%rip), %rax + push %rax + lretq + +.Lnext: + + // zero out the segment registers + xor %ax, %ax + mov %ax, %ds + mov %ax, %es + mov %ax, %fs + mov %ax, %gs + mov %ax, %ss +#else // ARCH_X86_32 + + +#endif + + // set up stack pointer + + // call into C + cld + jmp . +END_FUNCTION(mp_boot_start_high) \ No newline at end of file diff --git a/platform/pc/mp.c b/platform/pc/mp.c index 616a10d2..88aa68be 100644 --- a/platform/pc/mp.c +++ b/platform/pc/mp.c @@ -10,16 +10,52 @@ #include #include +#include #include +#include #if WITH_SMP +#define TRAMPOLINE_ADDRESS 0x4000 + #define LOCAL_TRACE 1 -static void start_cpu(uint cpu_num, uint32_t apic_id) { +extern void mp_boot_start(void); +extern void mp_boot_end(void); + +struct bootstrap_args { + uintptr_t trampoline_cr3; +}; + +static void start_cpu(uint cpu_num, uint32_t apic_id, struct bootstrap_args *args) { LTRACEF("cpu_num %u, apic_id %u\n", cpu_num, apic_id); // XXX do work here + + arch_disable_ints(); + + // start x86 secondary cpu + + // send INIT IPI + lapic_send_init_ipi(apic_id, true); + thread_sleep(10); + + // deassert INIT + lapic_send_init_ipi(apic_id, false); + thread_sleep(10); + + lapic_send_startup_ipi(apic_id, TRAMPOLINE_ADDRESS); + + // wait 200us + thread_sleep(1); + + // send SIPI again + lapic_send_startup_ipi(apic_id, TRAMPOLINE_ADDRESS); + + // wait 10ms + thread_sleep(10); + + for (;;); } struct detected_cpus { @@ -51,16 +87,48 @@ void platform_start_secondary_cpus(void) { // TODO: deal with cpu topology // start up the secondary cpus - if (cpus.num_detected > 1) { - dprintf(INFO, "PC: detected %u cpus\n", cpus.num_detected); - - lk_init_secondary_cpus(cpus.num_detected - 1); - - for (uint i = 1; i < cpus.num_detected; i++) { - dprintf(INFO, "PC: starting cpu 
%u\n", cpus.apic_ids[i]); - start_cpu(i, cpus.apic_ids[i]); - } + if (cpus.num_detected < 2) { + dprintf(INFO, "PC: no secondary cpus detected\n"); + return; } + + // create a new aspace to build an identity map in + vmm_aspace_t *aspace; + status_t err = vmm_create_aspace(&aspace, "identity map", 0); + if (err < 0) { + panic("failed to create identity map aspace\n"); + } + + // set up an identity map for the trampoline code + + void *ptr = (void *)TRAMPOLINE_ADDRESS; + err = vmm_alloc_physical(aspace, "trampoline", 0x10000, &ptr, 0, + TRAMPOLINE_ADDRESS, VMM_FLAG_VALLOC_SPECIFIC, ARCH_MMU_FLAG_CACHED); + if (err < 0) { + panic("failed to allocate trampoline memory\n"); + } + + vmm_aspace_t *old_aspace = vmm_set_active_aspace(aspace); + + // set up bootstrap code page at TRAMPOLINE_ADDRESS for secondary cpu + memcpy(ptr, mp_boot_start, mp_boot_end - mp_boot_start); + + // next page has args in it + struct bootstrap_args *args = (struct bootstrap_args *)((uintptr_t)ptr + 0x1000); + args->trampoline_cr3 = aspace->arch_aspace.cr3_phys; + + dprintf(INFO, "PC: detected %u cpus\n", cpus.num_detected); + + lk_init_secondary_cpus(cpus.num_detected - 1); + + for (uint i = 1; i < cpus.num_detected; i++) { + dprintf(INFO, "PC: starting cpu %u\n", cpus.apic_ids[i]); + start_cpu(i, cpus.apic_ids[i], args); + } + + // XXX restore old aspace + vmm_set_active_aspace(old_aspace); + // XXX free aspace when done } #endif // WITH_SMP \ No newline at end of file diff --git a/platform/pc/platform_p.h b/platform/pc/platform_p.h index 516aea8e..f673c732 100644 --- a/platform/pc/platform_p.h +++ b/platform/pc/platform_p.h @@ -25,6 +25,9 @@ void pic_mask_interrupts(void); // local apic void lapic_init(void); void lapic_eoi(unsigned int vector); +void lapic_send_init_ipi(uint32_t apic_id, bool level); +void lapic_send_startup_ipi(uint32_t apic_id, uint32_t startup_vector); +void lapic_send_ipi(uint32_t apic_id, uint32_t vector); // secondary cpus void platform_start_secondary_cpus(void); 
diff --git a/platform/pc/rules.mk b/platform/pc/rules.mk index dba1abfe..7d7b0d7f 100644 --- a/platform/pc/rules.mk +++ b/platform/pc/rules.mk @@ -24,6 +24,7 @@ MODULE_SRCS += \ $(LOCAL_DIR)/keyboard.c \ $(LOCAL_DIR)/lapic.c \ $(LOCAL_DIR)/mp.c \ + $(LOCAL_DIR)/mp-boot.S \ $(LOCAL_DIR)/pic.c \ $(LOCAL_DIR)/platform.c \ $(LOCAL_DIR)/timer.c \ From 3ea007a237a10da62db2a0e490ad9e25665756a1 Mon Sep 17 00:00:00 2001 From: Travis Geiselbrecht Date: Thu, 12 Dec 2024 22:23:50 -0800 Subject: [PATCH 07/26] [arch][x86] split the single GDT.S into two separate ones per subarch It's getting too hard to maintain a single layout that works with both, so go ahead and split it. Also redo the layout so it should be usable with user space and syscall and sysenter instructions from either mode. --- arch/x86/32/gdt.S | 100 +++++++++++++++++++++++++ arch/x86/{ => 64}/gdt.S | 92 ++++++++--------------- arch/x86/include/arch/x86/descriptor.h | 84 +++++++++++++++------ arch/x86/rules.mk | 2 +- 4 files changed, 195 insertions(+), 83 deletions(-) create mode 100644 arch/x86/32/gdt.S rename arch/x86/{ => 64}/gdt.S (78%) diff --git a/arch/x86/32/gdt.S b/arch/x86/32/gdt.S new file mode 100644 index 00000000..078333b2 --- /dev/null +++ b/arch/x86/32/gdt.S @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2009 Corey Tabaka + * Copyright (c) 2015 Intel Corporation + * Copyright (c) 2016 Travis Geiselbrecht + * + * Use of this source code is governed by a MIT-style + * license that can be found in the LICENSE file or at + * https://opensource.org/licenses/MIT + */ +#include +#include + +#define PHYS_LOAD_ADDRESS (MEMBASE + KERNEL_LOAD_OFFSET) +#define PHYS_ADDR_DELTA (KERNEL_BASE + KERNEL_LOAD_OFFSET - PHYS_LOAD_ADDRESS) +#define PHYS(x) ((x) - PHYS_ADDR_DELTA) + +.section .rodata + +.balign 8 +DATA(_gdtr_phys) + .short _gdt_end - _gdt - 1 + .int PHYS(_gdt) +END_DATA(_gdtr_phys) + +.balign 8 +DATA(_gdtr) + .short _gdt_end - _gdt - 1 + .int _gdt +END_DATA(_gdtr) + + +// 32bit GDT, laid out in a specific way 
due to requirements by the SYSENTER/SYSEXIT and +// SYSCALL/SYSRET instructions: +// +// CODE32 <- IA32_SYSENTER_CS, IA32_STAR.SYSCALL_CS +// DATA +// UCODE32 <- IA32_STAR.SYSRET_CS +// UDATA +.data +.balign 8 +DATA(_gdt) + .int 0 + .int 0 + +/* ring 0 code 32bit (for bootstrapping into 64bit) */ + .short 0xffff /* limit 15:00 */ + .short 0x0000 /* base 15:00 */ + .byte 0x00 /* base 23:16 */ + .byte 0b10011010 /* P(1) DPL(00) S(1) 1 C(0) R(1) A(0) */ + .byte 0b11001111 /* G(1) D(1) 0 0 limit 19:16 */ + .byte 0x0 /* base 31:24 */ + +/* ring 0 data 32bit */ + .short 0xffff /* limit 15:00 */ + .short 0x0000 /* base 15:00 */ + .byte 0x00 /* base 23:16 */ + .byte 0b10010010 /* P(1) DPL(00) S(1) 0 E(0) W(1) A(0) */ + .byte 0b11001111 /* G(1) B(1) 0 0 limit 19:16 */ + .byte 0x0 /* base 31:24 */ + +/* ring 3 code 32bit */ + .short 0xffff /* limit 15:00 */ + .short 0x0000 /* base 15:00 */ + .byte 0x00 /* base 23:16 */ + .byte 0b11111010 /* P(1) DPL(11) S(1) 1 C(0) R(1) A(0) */ + .byte 0b11001111 /* G(1) D(1) 0 0 limit 19:16 */ + .byte 0x0 /* base 31:24 */ + +/* ring 3 data 32bit */ + .short 0xffff /* limit 15:00 */ + .short 0x0000 /* base 15:00 */ + .byte 0x00 /* base 23:16 */ + .byte 0b11110010 /* P(1) DPL(11) S(1) 0 E(0) W(1) A(0) */ + .byte 0b11001111 /* G(1) B(1) 0 0 limit 19:16 */ + .byte 0x0 /* base 31:24 */ + +/* per-cpu TSS descriptor */ +.set i, 1 +.rept SMP_MAX_CPUS + .short 0 /* limit 15:00 */ + .short 0 /* base 15:00 */ + .byte 0 /* base 23:16 */ + .byte 0x89 /* P(1) DPL(00) S(0) TYPE(9) */ + .byte 0x80 /* G(1) D/B(0) L(0) AVL(0) limit 19:16 */ + .byte 0 /* base 31:24 */ +.set i, i+1 +.endr + +/* per-cpu GS descriptor for x86-32 */ +.set i, 1 +.rept SMP_MAX_CPUS + .int 0 /* filled in by C code later */ + .int 0 +.set i, i+1 +.endr + +END_DATA(_gdt) + +DATA(_gdt_end) + diff --git a/arch/x86/gdt.S b/arch/x86/64/gdt.S similarity index 78% rename from arch/x86/gdt.S rename to arch/x86/64/gdt.S index acea65df..50b45e42 100644 --- a/arch/x86/gdt.S +++ 
b/arch/x86/64/gdt.S @@ -25,22 +25,26 @@ END_DATA(_gdtr_phys) .balign 8 DATA(_gdtr) .short _gdt_end - _gdt - 1 -#if ARCH_X86_32 - .int _gdt -#elif ARCH_X86_64 .quad _gdt -#endif END_DATA(_gdtr) +// 64bit GDT, laid out in a specific way due to requirements by the SYSENTER/SYSEXIT and +// SYSCALL/SYSRET instructions: +// +// CODE32 (for bootstrap purposes) +// CODE64 <- IA32_SYSENTER_CS, IA32_STAR.SYSCALL_CS +// DATA64 +// UCODE32 <- IA32_STAR.SYSRET_CS +// UDATA32 +// UCODE64 +// UDATA64 (optional if no 64bit sysenter support) .data .balign 8 DATA(_gdt) .int 0 .int 0 -/* ring 0 descriptors */ -.set codesel_32, . - _gdt -_code_32_gde: +/* ring 0 code 32bit (for bootstrapping into 64bit) */ .short 0xffff /* limit 15:00 */ .short 0x0000 /* base 15:00 */ .byte 0x00 /* base 23:16 */ @@ -48,36 +52,7 @@ _code_32_gde: .byte 0b11001111 /* G(1) D(1) 0 0 limit 19:16 */ .byte 0x0 /* base 31:24 */ -.set datasel, . - _gdt -_data_gde: - .short 0xffff /* limit 15:00 */ - .short 0x0000 /* base 15:00 */ - .byte 0x00 /* base 23:16 */ - .byte 0b10010010 /* P(1) DPL(00) S(1) 0 E(0) W(1) A(0) */ - .byte 0b11001111 /* G(1) B(1) 0 0 limit 19:16 */ - .byte 0x0 /* base 31:24 */ - -.set user_codesel_32, . - _gdt -_user_code_32_gde: - .short 0xffff /* limit 15:00 */ - .short 0x0000 /* base 15:00 */ - .byte 0x00 /* base 23:16 */ - .byte 0b11111010 /* P(1) DPL(11) S(1) 1 C(0) R(1) A(0) */ - .byte 0b11001111 /* G(1) D(1) 0 0 limit 19:16 */ - .byte 0x0 /* base 31:24 */ - - -.set user_datasel, . - _gdt -_user_data_32_gde: - .short 0xffff /* limit 15:00 */ - .short 0x0000 /* base 15:00 */ - .byte 0x00 /* base 23:16 */ - .byte 0b11110010 /* P(1) DPL(11) S(1) 0 E(0) W(1) A(0) */ - .byte 0b11001111 /* G(1) B(1) 0 0 limit 19:16 */ - .byte 0x0 /* base 31:24 */ - -.set codesel_64, . 
- _gdt -_code_64_gde: +/* ring 0 code 64bit */ .short 0xffff /* limit 15:00 */ .short 0x0000 /* base 15:00 */ .byte 0x00 /* base 23:16 */ @@ -85,17 +60,31 @@ _code_64_gde: .byte 0b10101111 /* G(1) D(0) L(1) AVL(0) limit 19:16 */ .byte 0x0 /* base 31:24 */ -.set datasel_64, . - _gdt -_data_64_gde: +/* ring 0 data 64bit */ .short 0xffff /* limit 15:00 */ .short 0x0000 /* base 15:00 */ .byte 0x00 /* base 23:16 */ - .byte 0b10010010 /* P(1) DPL(00) S(1) 1 C(0) R(1) A(0) */ - .byte 0b11001111 /* G(1) B(1) 0 AVL(0) limit 19:16 */ + .byte 0b10010010 /* P(1) DPL(00) S(1) 0 E(0) W(1) A(0) */ + .byte 0b11001111 /* G(1) B(1) 0 0 limit 19:16 */ .byte 0x0 /* base 31:24 */ -.set user_codesel_64, . - _gdt -_user_code_64_gde: +/* ring 3 code 32bit */ + .short 0xffff /* limit 15:00 */ + .short 0x0000 /* base 15:00 */ + .byte 0x00 /* base 23:16 */ + .byte 0b11111010 /* P(1) DPL(11) S(1) 1 C(0) R(1) A(0) */ + .byte 0b11001111 /* G(1) D(1) 0 0 limit 19:16 */ + .byte 0x0 /* base 31:24 */ + +/* ring 3 data 32bit */ + .short 0xffff /* limit 15:00 */ + .short 0x0000 /* base 15:00 */ + .byte 0x00 /* base 23:16 */ + .byte 0b11110010 /* P(1) DPL(11) S(1) 0 E(0) W(1) A(0) */ + .byte 0b11001111 /* G(1) B(1) 0 0 limit 19:16 */ + .byte 0x0 /* base 31:24 */ + +/* ring 3 code 64bit */ .short 0xffff /* limit 15:00 */ .short 0x0000 /* base 15:00 */ .byte 0x00 /* base 23:16 */ @@ -103,8 +92,7 @@ _user_code_64_gde: .byte 0b10101111 /* G(1) D(1) L(0) AVL(0) limit 19:16 */ .byte 0x0 /* base 31:24 */ -.set user_datasel_64, . 
- _gdt -_user_data_64_gde: +/* ring 3 data 64bit */ .short 0xffff /* limit 15:00 */ .short 0x0000 /* base 15:00 */ .byte 0x00 /* base 23:16 */ @@ -113,7 +101,6 @@ _user_data_64_gde: .byte 0x0 /* base 31:24 */ /* per-cpu TSS descriptor */ -_tss_gde: .set i, 1 .rept SMP_MAX_CPUS .short 0 /* limit 15:00 */ @@ -122,25 +109,12 @@ _tss_gde: .byte 0x89 /* P(1) DPL(00) S(0) TYPE(9) */ .byte 0x80 /* G(1) D/B(0) L(0) AVL(0) limit 19:16 */ .byte 0 /* base 31:24 */ -#if ARCH_X86_64 /* 64-bit TSSs are 16 bytes long */ .int 0 /* base 63:32 */ .int 0 /* type(0) + reserved */ -#endif .set i, i+1 .endr -/* per-cpu GS descriptor for x86-32 */ -#if ARCH_X86_32 -_percpu_gde: -.set i, 1 -.rept SMP_MAX_CPUS - .int 0 /* filled in by C code later */ - .int 0 -.set i, i+1 -.endr -#endif - END_DATA(_gdt) DATA(_gdt_end) diff --git a/arch/x86/include/arch/x86/descriptor.h b/arch/x86/include/arch/x86/descriptor.h index cf4da2cb..03221654 100644 --- a/arch/x86/include/arch/x86/descriptor.h +++ b/arch/x86/include/arch/x86/descriptor.h @@ -1,6 +1,7 @@ /* * Copyright (c) 2009 Corey Tabaka * Copyright (c) 2014 Intel Corporation + * Copyright (c) 2024 Travis Geiselbrecht * * Use of this source code is governed by a MIT-style * license that can be found in the LICENSE file or at @@ -8,53 +9,90 @@ */ #pragma once -/* - * System Selectors - */ +// System Selectors #define NULL_SELECTOR 0x00 -/********* x86 selectors *********/ +// ********* x86 selectors ********* +// Laid out slightly differently based on 32 vs 64 bit mode +#if ARCH_X86_64 + +#define CODE_SELECTOR 0x08 +#define CODE_64_SELECTOR 0x10 +#define DATA_SELECTOR 0x18 +#define USER_CODE_32_SELECTOR 0x20 +#define USER_DATA_32_SELECTOR 0x28 +#define USER_CODE_64_SELECTOR 0x30 +#define USER_DATA_64_SELECTOR 0x38 +#define TSS_SELECTOR_BASE 0x40 + +#elif ARCH_X86_32 + #define CODE_SELECTOR 0x08 #define DATA_SELECTOR 0x10 #define USER_CODE_32_SELECTOR 0x18 #define USER_DATA_32_SELECTOR 0x20 +#define TSS_SELECTOR_BASE 0x28 -/******* x86-64 selectors 
********/ -#define CODE_64_SELECTOR 0x28 -#define STACK_64_SELECTOR 0x30 -#define USER_CODE_64_SELECTOR 0x38 -#define USER_DATA_64_SELECTOR 0x40 +#else +#error unknown architecture +#endif -/* base selector for a list of TSSes, one per cpu (SMP_MAX_CPUS) */ -#define TSS_SELECTOR_BASE 0x48 - -/* base selector for a gs segment per cpu (SMP_MAX_CPUS) */ +// Base selector for a gs segment per cpu (SMP_MAX_CPUS) #define PERCPU_SELECTOR_BASE (TSS_SELECTOR_BASE + 8 * SMP_MAX_CPUS) -/* code/data segment types (S = 1) */ -/* bit 0 is accessed */ +// Worksheet of what the syscall instructions do which affects the GDT layout: +// SYSENTER +// CS = IA32_SYSENTER_CS +// SS = IA32_SYSENTER_CS + 8 +// SYSEXIT 32 +// CS = IA32_SYSENTER_CS + 16 +// SS = IA32_SYSENTER_CS + 24 +// SYSEXIT 64 +// CS = IA32_SYSENTER_CS + 32 +// SS = IA32_SYSENTER_CS + 40 + +// SYSCALL +// CS = IA32_STAR.SYSCALL_CS +// SS = IA32_STAR.SYSCALL_CS + 8 +// SYSRET 32 +// CS = IA32_STAR.SYSRET_CS +// SS = IA32_STAR.SYSRET_CS + 8 +// SYSRET 64 +// CS = IA32_STAR.SYSRET_CS + 16 +// SS = IA32_STAR.SYSRET_CS + 8 + +// code/data segment types (S = 1) +// bit 0 is A (accessed) +// bit 1 is W (writable) +// bit 2 is E (expand-down) +// bit 3 is data (0) vs code (1) + +// data segment types: #define SEG_TYPE_DATA_RO 0x0 #define SEG_TYPE_DATA_RW 0x2 #define SEG_TYPE_DATA_RO_EXPAND_DOWN 0x4 #define SEG_TYPE_DATA_RW_EXPAND_DOWN 0x6 -#define SEG_TYPE_CODE_XO 0x8 + +// code segment types: +// bit 2 is C (conforming) +#define SEG_TYPE_CODE_XO 0x9 #define SEG_TYPE_CODE_RO 0xa #define SEG_TYPE_CODE_XO_CONFORMING 0xc #define SEG_TYPE_CODE_RO_CONFORMING 0xe -/* system segment types (S = 0) */ +// system segment types (S = 0) #define SEG_TYPE_TSS_16 0x1 -#define SEG_TYPE_LDT 0x2 +#define SEG_TYPE_LDT 0x2 // usable in 64bit #define SEG_TYPE_TSS_16_BUSY 0x3 #define SEG_TYPE_CALL_GATE_16 0x4 #define SEG_TYPE_TASK_GATE 0x5 #define SEG_TYPE_INT_GATE_16 0x6 #define SEG_TYPE_TRAP_GATE_16 0x7 #define SEG_TYPE_TSS 0x9 -#define 
SEG_TYPE_TSS_BUSY 0xb -#define SEG_TYPE_CALL_GATE 0xc -#define SEG_TYPE_INT_GATE 0xe -#define SEG_TYPE_TRAP_GATE 0xf +#define SEG_TYPE_TSS 0x9 // usable in 64bit +#define SEG_TYPE_TSS_BUSY 0xb // usable in 64bit +#define SEG_TYPE_CALL_GATE 0xc // usable in 64bit +#define SEG_TYPE_INT_GATE 0xe // usable in 64bit +#define SEG_TYPE_TRAP_GATE 0xf // usable in 64bit #ifndef ASSEMBLY diff --git a/arch/x86/rules.mk b/arch/x86/rules.mk index 148c0eb7..c6d83503 100644 --- a/arch/x86/rules.mk +++ b/arch/x86/rules.mk @@ -65,6 +65,7 @@ MODULE_SRCS += \ \ $(SUBARCH_DIR)/asm.S \ $(SUBARCH_DIR)/exceptions.S \ + $(SUBARCH_DIR)/gdt.S \ $(SUBARCH_DIR)/mmu.c \ $(SUBARCH_DIR)/ops.S \ \ @@ -73,7 +74,6 @@ MODULE_SRCS += \ $(LOCAL_DIR)/descriptor.c \ $(LOCAL_DIR)/faults.c \ $(LOCAL_DIR)/feature.c \ - $(LOCAL_DIR)/gdt.S \ $(LOCAL_DIR)/mp.c \ $(LOCAL_DIR)/thread.c \ From 6b89609887aead33cbb45bfb292498d2484d8d84 Mon Sep 17 00:00:00 2001 From: Travis Geiselbrecht Date: Fri, 13 Dec 2024 00:07:46 -0800 Subject: [PATCH 08/26] WIP x86-64 SMP: get the 64bit secondaries fully started Rearrange some of the cpu initialization code to be runnable on each cpu as they come up. Complete the 64bit bootstrap mechanism and call into C code. Makes it as far as trying to reschedule via an IPI. Need to implement local apic based IPI mechanism. 
--- arch/x86/32/mmu.c | 7 ++- arch/x86/32/start.S | 2 +- arch/x86/64/mmu.c | 21 ++++++-- arch/x86/64/start.S | 2 +- arch/x86/arch.c | 29 ++++++---- arch/x86/fpu.c | 95 +++++++++++++++++---------------- arch/x86/include/arch/fpu.h | 1 + arch/x86/include/arch/x86.h | 2 + arch/x86/include/arch/x86/mmu.h | 1 + arch/x86/include/arch/x86/mp.h | 14 ++++- arch/x86/mp.c | 49 +++++++++++++++-- platform/pc/mp-boot.S | 18 +++---- platform/pc/mp.c | 66 ++++++++++++++++++----- 13 files changed, 214 insertions(+), 93 deletions(-) diff --git a/arch/x86/32/mmu.c b/arch/x86/32/mmu.c index 7e17e445..ff36507f 100644 --- a/arch/x86/32/mmu.c +++ b/arch/x86/32/mmu.c @@ -419,12 +419,15 @@ bool arch_mmu_supports_nx_mappings(void) { return false; } bool arch_mmu_supports_ns_mappings(void) { return false; } bool arch_mmu_supports_user_aspaces(void) { return false; } -void x86_mmu_early_init(void) { - /* Set WP bit in CR0*/ +/* called once per cpu as it is brought up */ +void x86_mmu_early_init_percpu(void) { + /* Set WP bit in CR0 */ uint32_t cr0 = x86_get_cr0(); cr0 |= X86_CR0_WP; x86_set_cr0(cr0); +} +void x86_mmu_early_init(void) { /* unmap the lower identity mapping */ for (uint i = 0; i < (1024*1024*1024) / (4*1024*1024); i++) { kernel_pd[i] = 0; diff --git a/arch/x86/32/start.S b/arch/x86/32/start.S index 79b616fe..57524d86 100644 --- a/arch/x86/32/start.S +++ b/arch/x86/32/start.S @@ -177,7 +177,7 @@ main_lk: /* set up the percpu data structure pointer for the boot cpu */ pushl $0 pushl $0 - call x86_percpu_init_early + call x86_configure_percpu_early /* call the main module */ call lk_main diff --git a/arch/x86/64/mmu.c b/arch/x86/64/mmu.c index a0343a73..8038d7a9 100644 --- a/arch/x86/64/mmu.c +++ b/arch/x86/64/mmu.c @@ -24,7 +24,7 @@ #include #define LOCAL_TRACE 0 -#define TRACE_CONTEXT_SWITCH 1 +#define TRACE_CONTEXT_SWITCH 0 /* Address width including virtual/physical address*/ static uint8_t vaddr_width = 0; @@ -622,8 +622,8 @@ bool arch_mmu_supports_nx_mappings(void) { return 
true; } bool arch_mmu_supports_ns_mappings(void) { return false; } bool arch_mmu_supports_user_aspaces(void) { return true; } -void x86_mmu_early_init(void) { - /* Set WP bit in CR0*/ +void x86_mmu_early_init_percpu(void) { + /* Set WP bit in CR0 */ uint64_t cr0 = x86_get_cr0(); cr0 |= X86_CR0_WP; x86_set_cr0(cr0); @@ -636,11 +636,13 @@ void x86_mmu_early_init(void) { cr4 |= X86_CR4_SMAP; x86_set_cr4(cr4); - /* Set NXE bit in MSR_EFER*/ + /* Set NXE bit in MSR_EFER */ uint64_t efer_msr = read_msr(X86_MSR_IA32_EFER); efer_msr |= X86_EFER_NXE; write_msr(X86_MSR_IA32_EFER, efer_msr); +} +void x86_mmu_early_init(void) { /* getting the address width from CPUID instr */ paddr_width = x86_get_paddr_width(); vaddr_width = x86_get_vaddr_width(); @@ -712,7 +714,16 @@ status_t arch_mmu_init_aspace(arch_aspace_t * const aspace, const vaddr_t base, } status_t arch_mmu_destroy_aspace(arch_aspace_t *aspace) { - PANIC_UNIMPLEMENTED; + // TODO: assert that we're not active on any cpus + if (aspace->flags & ARCH_ASPACE_FLAG_KERNEL) { + // can't destroy the kernel aspace + panic("attempt to destroy kernel aspace\n"); + return ERR_NOT_ALLOWED; + } + + // free the page table + pmm_free_kpages(aspace->cr3, 1); + return NO_ERROR; } diff --git a/arch/x86/64/start.S b/arch/x86/64/start.S index a8420441..a3aa9d41 100644 --- a/arch/x86/64/start.S +++ b/arch/x86/64/start.S @@ -216,7 +216,7 @@ highaddr: /* set up the percpu data structure pointer for the boot cpu */ xor %edi, %edi xor %esi, %esi - call x86_percpu_init_early + call x86_configure_percpu_early /* call the main module */ call lk_main diff --git a/arch/x86/arch.c b/arch/x86/arch.c index 9db1bb40..daa1dd3f 100644 --- a/arch/x86/arch.c +++ b/arch/x86/arch.c @@ -59,14 +59,12 @@ __SECTION(".data") uint32_t _multiboot_info; /* main tss */ static tss_t system_tss __ALIGNED(16); -/* early initialization of the system, on the boot cpu, usually before any sort of - * printf output is available. 
- */ -void arch_early_init(void) { - /* enable caches here for now */ +void x86_early_init_percpu(void) { + // enable caches clear_in_cr0(X86_CR0_NW | X86_CR0_CD); - /* configure the system TSS */ + // configure the system TSS + // XXX move to a per cpu TSS in the percpu structure #if ARCH_X86_32 system_tss.esp0 = 0; system_tss.ss0 = DATA_SELECTOR; @@ -78,16 +76,27 @@ void arch_early_init(void) { #elif ARCH_X86_64 /* nothing to be done here, a fully zeroed TSS is a good starting point */ #endif - x86_set_gdt_descriptor(TSS_SELECTOR_BASE, &system_tss, sizeof(system_tss), 1, 0, 0, SEG_TYPE_TSS, 0, 0); - x86_ltr(TSS_SELECTOR_BASE); + const uint selector = TSS_SELECTOR_BASE + 8 * arch_curr_cpu_num(); + x86_set_gdt_descriptor(selector, &system_tss, sizeof(system_tss), 1, 0, 0, SEG_TYPE_TSS, 0, 0); + x86_ltr(selector); + x86_mmu_early_init_percpu(); +#if X86_WITH_FPU + x86_fpu_early_init_percpu(); +#endif +} + +/* early initialization of the system, on the boot cpu, usually before any sort of + * printf output is available. 
+ */ +void arch_early_init(void) { x86_feature_early_init(); - x86_mmu_early_init(); - #if X86_WITH_FPU x86_fpu_early_init(); #endif + + x86_early_init_percpu(); } /* later initialization pass, once the main kernel is initialized and scheduling has begun */ diff --git a/arch/x86/fpu.c b/arch/x86/fpu.c index 5ddf5eba..0777af21 100644 --- a/arch/x86/fpu.c +++ b/arch/x86/fpu.c @@ -63,6 +63,56 @@ typedef struct { static fpu_features_t fpu_features; +/* called per cpu as they're brought up */ +void x86_fpu_early_init_percpu(void) { + if (!fp_supported) { + return; + } + + /* No x87 emul, monitor co-processor */ + ulong x = x86_get_cr0(); + x &= ~X86_CR0_EM; + x |= X86_CR0_NE; + x |= X86_CR0_MP; + x86_set_cr0(x); + + /* Init x87 */ + uint16_t fcw; + __asm__ __volatile__ ("finit"); + __asm__ __volatile__("fstcw %0" : "=m" (fcw)); +#if FPU_MASK_ALL_EXCEPTIONS + /* mask all exceptions */ + fcw |= 0x3f; +#else + /* unmask all exceptions */ + fcw &= 0xffc0; +#endif + __asm__ __volatile__("fldcw %0" : : "m" (fcw)); + + /* Init SSE */ + x = x86_get_cr4(); + x |= X86_CR4_OSXMMEXPT; // supports exceptions + x |= X86_CR4_OSFXSR; // supports fxsave + x &= ~X86_CR4_OSXSAVE; // no support for xsave (currently) + x86_set_cr4(x); + + uint32_t mxcsr; + __asm__ __volatile__("stmxcsr %0" : "=m" (mxcsr)); +#if FPU_MASK_ALL_EXCEPTIONS + /* mask all exceptions */ + mxcsr = (0x3f << 7); +#else + /* unmask all exceptions */ + mxcsr &= 0x0000003f; +#endif + __asm__ __volatile__("ldmxcsr %0" : : "m" (mxcsr)); + + /* save fpu initial states, and used when new thread creates */ + __asm__ __volatile__("fxsave %0" : "=m" (fpu_init_states)); + + x86_set_cr0(x86_get_cr0() | X86_CR0_TS); +} + /* called on the first cpu before the kernel is initialized. 
printfs may not work here */ void x86_fpu_early_init(void) { fp_supported = false; @@ -115,51 +165,6 @@ void x86_fpu_early_init(void) { } } } - - /* No x87 emul, monitor co-processor */ - ulong x = x86_get_cr0(); - x &= ~X86_CR0_EM; - x |= X86_CR0_NE; - x |= X86_CR0_MP; - x86_set_cr0(x); - - /* Init x87 */ - uint16_t fcw; - __asm__ __volatile__ ("finit"); - __asm__ __volatile__("fstcw %0" : "=m" (fcw)); -#if FPU_MASK_ALL_EXCEPTIONS - /* mask all exceptions */ - fcw |= 0x3f; -#else - /* unmask all exceptions */ - fcw &= 0xffc0; -#endif - __asm__ __volatile__("fldcw %0" : : "m" (fcw)); - - /* Init SSE */ - x = x86_get_cr4(); - x |= X86_CR4_OSXMMEXPT; // supports exceptions - x |= X86_CR4_OSFXSR; // supports fxsave - x &= ~X86_CR4_OSXSAVE; // no support for xsave (currently) - x86_set_cr4(x); - - uint32_t mxcsr; - __asm__ __volatile__("stmxcsr %0" : "=m" (mxcsr)); -#if FPU_MASK_ALL_EXCEPTIONS - /* mask all exceptions */ - mxcsr = (0x3f << 7); -#else - /* unmask all exceptions */ - mxcsr &= 0x0000003f; -#endif - __asm__ __volatile__("ldmxcsr %0" : : "m" (mxcsr)); - - /* save fpu initial states, and used when new thread creates */ - __asm__ __volatile__("fxsave %0" : "=m" (fpu_init_states)); - - x86_set_cr0(x86_get_cr0() | X86_CR0_TS); - - return; } void x86_fpu_init(void) { diff --git a/arch/x86/include/arch/fpu.h b/arch/x86/include/arch/fpu.h index 59f62961..3e1f930e 100644 --- a/arch/x86/include/arch/fpu.h +++ b/arch/x86/include/arch/fpu.h @@ -26,6 +26,7 @@ void x86_fpu_early_init(void); void x86_fpu_init(void); +void x86_fpu_early_init_percpu(void); void fpu_init_thread_states(thread_t *t); void fpu_context_switch(thread_t *old_thread, thread_t *new_thread); void fpu_dev_na_handler(void); diff --git a/arch/x86/include/arch/x86.h b/arch/x86/include/arch/x86.h index 558d521b..1930d2a6 100644 --- a/arch/x86/include/arch/x86.h +++ b/arch/x86/include/arch/x86.h @@ -555,4 +555,6 @@ static inline void tlbsync_local(vaddr_t address) { asm volatile("invlpg %0" :: 
"m"(*(uint8_t *)address)); } +void x86_early_init_percpu(void); + __END_CDECLS diff --git a/arch/x86/include/arch/x86/mmu.h b/arch/x86/include/arch/x86/mmu.h index 6b483f81..d1fddbc8 100644 --- a/arch/x86/include/arch/x86/mmu.h +++ b/arch/x86/include/arch/x86/mmu.h @@ -117,6 +117,7 @@ typedef uint32_t arch_flags_t; void x86_mmu_early_init(void); void x86_mmu_init(void); +void x86_mmu_early_init_percpu(void); __END_CDECLS diff --git a/arch/x86/include/arch/x86/mp.h b/arch/x86/include/arch/x86/mp.h index ecb608c6..c668f4d0 100644 --- a/arch/x86/include/arch/x86/mp.h +++ b/arch/x86/include/arch/x86/mp.h @@ -20,6 +20,9 @@ typedef struct x86_percpu { struct thread *current_thread; + // per cpu bootstrap stack + uint8_t bootstrap_stack[PAGE_SIZE] __ALIGNED(sizeof(uintptr_t) * 2); + // XXX add more stuff: // per cpu TSS // per cpu doublefault/nmi stacks @@ -27,8 +30,15 @@ typedef struct x86_percpu { #define X86_PERCPU_FIELD_OFFSET(field) offsetof(x86_percpu_t, field) -// called extremely early on the boot cpu and each secondary cpu -void x86_percpu_init_early(uint cpu_num, uint apic_id); +// called extremely early on the boot cpu and each secondary cpu to set +// up the percpu struct and segment descriptors pointing to it +void x86_configure_percpu_early(uint cpu_num, uint apic_id); + +// C entry point for secondary cpus +__NO_RETURN void x86_secondary_entry(uint cpu_num); + +// allocate and initialize secondary cpu percpu structs +status_t x86_allocate_percpu_array(uint num_cpus); // get the percpu struct for the current cpu static inline x86_percpu_t *x86_get_percpu(void) { diff --git a/arch/x86/mp.c b/arch/x86/mp.c index 96e20c84..284d3614 100644 --- a/arch/x86/mp.c +++ b/arch/x86/mp.c @@ -8,19 +8,25 @@ #include #include +#include #include +#include +#include #include +#include #include #include #include #include +#define LOCAL_TRACE 1 + #if WITH_SMP // the boot cpu's percpu struct static x86_percpu_t x86_boot_percpu; // pointer to an array of percpu structs for each 
of the secondary cpus -static x86_percpu_t **x86_ap_percpus; +static x86_percpu_t *x86_ap_percpus; x86_percpu_t *x86_get_percpu_for_cpu(uint cpu_num) { DEBUG_ASSERT(cpu_num < SMP_MAX_CPUS); @@ -28,10 +34,10 @@ x86_percpu_t *x86_get_percpu_for_cpu(uint cpu_num) { return &x86_boot_percpu; } DEBUG_ASSERT(x86_ap_percpus); - return x86_ap_percpus[cpu_num - 1]; + return &x86_ap_percpus[cpu_num - 1]; } -void x86_percpu_init_early(uint cpu_num, uint apic_id) { +void x86_configure_percpu_early(uint cpu_num, uint apic_id) { x86_percpu_t *percpu = x86_get_percpu_for_cpu(cpu_num); // initialize the percpu structure for this cpu @@ -52,14 +58,49 @@ void x86_percpu_init_early(uint cpu_num, uint apic_id) { } status_t arch_mp_send_ipi(mp_cpu_mask_t target, mp_ipi_t ipi) { + LTRACEF("caller %#x target 0x%x, ipi 0x%x\n", arch_curr_cpu_num(), target, ipi); + + // XXX call into local apic code to send IPI + PANIC_UNIMPLEMENTED; } void arch_mp_init_percpu(void) { } +static uintptr_t x86_get_apic_id_from_hardware(void) { + // read the apic id out of the hardware + return read_msr(X86_MSR_IA32_APIC_BASE) >> 24; +} + +void x86_secondary_entry(uint cpu_num) { + x86_configure_percpu_early(cpu_num, x86_get_apic_id_from_hardware()); + + x86_early_init_percpu(); + + // run early secondary cpu init routines up to the threading level + lk_init_level(LK_INIT_FLAG_SECONDARY_CPUS, LK_INIT_LEVEL_EARLIEST, LK_INIT_LEVEL_THREADING - 1); + + dprintf(INFO, "SMP: secondary cpu %u started\n", arch_curr_cpu_num()); + + lk_secondary_cpu_entry(); + + // should never get here except for an error condition + for (;;); +} + +status_t x86_allocate_percpu_array(uint num_cpus) { + x86_ap_percpus = memalign(_Alignof(x86_percpu_t), num_cpus * sizeof(x86_percpu_t)); + if (!x86_ap_percpus) { + return ERR_NO_MEMORY; + } + + memset(x86_ap_percpus, 0, num_cpus * sizeof(x86_percpu_t)); + return NO_ERROR; +} + #else -void x86_percpu_init_early(uint cpu_num, uint apic_id) {} +void x86_configure_percpu_early(uint cpu_num, 
uint apic_id) {} #endif \ No newline at end of file diff --git a/platform/pc/mp-boot.S b/platform/pc/mp-boot.S index f693819b..a3d15bc9 100644 --- a/platform/pc/mp-boot.S +++ b/platform/pc/mp-boot.S @@ -135,7 +135,7 @@ END_FUNCTION(mp_boot_start) FUNCTION(mp_boot_start_high) #if ARCH_X86_64 // set up stack pointer - mov $(ARGS_STACK), %rsp + mov (ARGS_STACK), %rsp // load the real GDT lgdt _gdtr @@ -144,9 +144,7 @@ FUNCTION(mp_boot_start_high) lea .Lnext(%rip), %rax push %rax lretq - .Lnext: - // zero out the segment registers xor %ax, %ax mov %ax, %ds @@ -154,14 +152,16 @@ FUNCTION(mp_boot_start_high) mov %ax, %fs mov %ax, %gs mov %ax, %ss -#else // ARCH_X86_32 - - -#endif - - // set up stack pointer // call into C cld + mov $(ARGS_ADDRESS), %rdi + call secondary_entry jmp . + +#else // ARCH_X86_32 + + jmp . + +#endif END_FUNCTION(mp_boot_start_high) \ No newline at end of file diff --git a/platform/pc/mp.c b/platform/pc/mp.c index 88aa68be..0900f4ac 100644 --- a/platform/pc/mp.c +++ b/platform/pc/mp.c @@ -8,11 +8,12 @@ #include "platform_p.h" -#include -#include -#include -#include +#include #include +#include +#include +#include +#include #if WITH_SMP @@ -24,15 +25,36 @@ extern void mp_boot_start(void); extern void mp_boot_end(void); struct bootstrap_args { + // referenced in mp-boot.S, do not move without updating assembly uintptr_t trampoline_cr3; + uintptr_t stack_top; + + uintptr_t cpu_num; + volatile uint32_t *boot_completed_ptr; // set by the secondary cpu when it's done }; +__NO_RETURN void secondary_entry(struct bootstrap_args *args) { + volatile uint32_t *boot_completed = args->boot_completed_ptr; + uint cpu_num = args->cpu_num; + + // context switch to the kernels cr3 + x86_set_cr3(vmm_get_kernel_aspace()->arch_aspace.cr3_phys); + // from now on out the boot args structure is not visible + + // we're done, let the primary cpu know so it can reuse the args + *boot_completed = 1; + + x86_secondary_entry(cpu_num); +} + static void start_cpu(uint cpu_num, 
uint32_t apic_id, struct bootstrap_args *args) { LTRACEF("cpu_num %u, apic_id %u\n", cpu_num, apic_id); - // XXX do work here + // assert that this thread is pinned to the current cpu + DEBUG_ASSERT(thread_pinned_cpu(get_current_thread()) == (int)arch_curr_cpu_num()); - arch_disable_ints(); + volatile uint32_t boot_completed = 0; + args->boot_completed_ptr = &boot_completed; // start x86 secondary cpu @@ -44,18 +66,20 @@ static void start_cpu(uint cpu_num, uint32_t apic_id, struct bootstrap_args *arg lapic_send_init_ipi(apic_id, false); thread_sleep(10); + // send SIPI and wait 200us lapic_send_startup_ipi(apic_id, TRAMPOLINE_ADDRESS); - - // wait 200us thread_sleep(1); - // send SIPI again + // send SIPI again for good measure and wait 10ms lapic_send_startup_ipi(apic_id, TRAMPOLINE_ADDRESS); - - // wait 10ms thread_sleep(10); - for (;;); + // wait for the cpu to finish booting + while (!boot_completed) { + thread_yield(); + } + + LTRACEF("cpu %u booted\n", cpu_num); } struct detected_cpus { @@ -120,15 +144,29 @@ void platform_start_secondary_cpus(void) { dprintf(INFO, "PC: detected %u cpus\n", cpus.num_detected); lk_init_secondary_cpus(cpus.num_detected - 1); + err = x86_allocate_percpu_array(cpus.num_detected - 1); + if (err < 0) { + panic("failed to allocate percpu array\n"); + } for (uint i = 1; i < cpus.num_detected; i++) { dprintf(INFO, "PC: starting cpu %u\n", cpus.apic_ids[i]); + + args->cpu_num = i; + + x86_percpu_t *percpu = x86_get_percpu_for_cpu(i); + args->stack_top = (uintptr_t)percpu->bootstrap_stack + sizeof(percpu->bootstrap_stack); + + LTRACEF("args for cpu %lu: trampoline_cr3 %#lx, stack_top 0x%lx\n", args->cpu_num, args->trampoline_cr3, args->stack_top); + start_cpu(i, cpus.apic_ids[i], args); } - // XXX restore old aspace + // restore old aspace vmm_set_active_aspace(old_aspace); - // XXX free aspace when done + + // free the trampoline aspace + vmm_free_aspace(aspace); } #endif // WITH_SMP \ No newline at end of file From 
d1a332891c17a07305d9b45d3f6a1344dce31938 Mon Sep 17 00:00:00 2001 From: Travis Geiselbrecht Date: Tue, 17 Dec 2024 23:57:56 -0800 Subject: [PATCH 09/26] [arch][x86] add x2apic mode to the local apic driver Fill in some more x86 feature bits while at it. --- arch/x86/include/arch/x86.h | 1 + arch/x86/include/arch/x86/feature.h | 104 +++++++++++++++++++++++++++- platform/pc/lapic.c | 86 ++++++++++++++++------- 3 files changed, 165 insertions(+), 26 deletions(-) diff --git a/arch/x86/include/arch/x86.h b/arch/x86/include/arch/x86.h index 1930d2a6..585c2b2f 100644 --- a/arch/x86/include/arch/x86.h +++ b/arch/x86/include/arch/x86.h @@ -199,6 +199,7 @@ typedef tss_64_t tss_t; #define X86_MSR_IA32_PM_ENABLE 0x00000770 /* enable/disable HWP */ #define X86_MSR_IA32_HWP_CAPABILITIES 0x00000771 /* HWP performance range enumeration */ #define X86_MSR_IA32_HWP_REQUEST 0x00000774 /* power manage control hints */ +#define X86_MSR_IA32_X2APIC_BASE 0x00000800 /* X2APIC base register */ #define X86_MSR_IA32_EFER 0xc0000080 /* EFER */ #define X86_MSR_IA32_STAR 0xc0000081 /* system call address */ #define X86_MSR_IA32_LSTAR 0xc0000082 /* long mode call address */ diff --git a/arch/x86/include/arch/x86/feature.h b/arch/x86/include/arch/x86/feature.h index afc12bac..f32d7bb8 100644 --- a/arch/x86/include/arch/x86/feature.h +++ b/arch/x86/include/arch/x86/feature.h @@ -182,71 +182,171 @@ static inline bool x86_feature_test(struct x86_cpuid_bit bit) { /* add feature bits to test here */ /* format: X86_CPUID_BIT(cpuid leaf, register (eax-edx:0-3), bit) */ #define X86_FEATURE_SSE3 X86_CPUID_BIT(0x1, 2, 0) +#define X86_FEATURE_PCLMULQDQ X86_CPUID_BIT(0x1, 2, 1) +#define X86_FEATURE_DTES64 X86_CPUID_BIT(0x1, 2, 2) #define X86_FEATURE_MON X86_CPUID_BIT(0x1, 2, 3) +#define X86_FEATURE_DSCPL X86_CPUID_BIT(0x1, 2, 4) #define X86_FEATURE_VMX X86_CPUID_BIT(0x1, 2, 5) +#define X86_FEATURE_SMX X86_CPUID_BIT(0x1, 2, 6) +#define X86_FEATURE_EIST X86_CPUID_BIT(0x1, 2, 7) #define X86_FEATURE_TM2 
X86_CPUID_BIT(0x1, 2, 8) #define X86_FEATURE_SSSE3 X86_CPUID_BIT(0x1, 2, 9) +#define X86_FEATURE_CNXT_ID X86_CPUID_BIT(0x1, 2, 10) +#define X86_FEATURE_SDBG X86_CPUID_BIT(0x1, 2, 11) +#define X86_FEATURE_FMA X86_CPUID_BIT(0x1, 2, 12) +#define X86_FEATURE_CMPXCHG16B X86_CPUID_BIT(0x1, 2, 13) +#define X86_FEATURE_XTPR X86_CPUID_BIT(0x1, 2, 14) #define X86_FEATURE_PDCM X86_CPUID_BIT(0x1, 2, 15) #define X86_FEATURE_PCID X86_CPUID_BIT(0x1, 2, 17) +#define X86_FEATURE_DCA X86_CPUID_BIT(0x1, 2, 18) #define X86_FEATURE_SSE4_1 X86_CPUID_BIT(0x1, 2, 19) #define X86_FEATURE_SSE4_2 X86_CPUID_BIT(0x1, 2, 20) #define X86_FEATURE_X2APIC X86_CPUID_BIT(0x1, 2, 21) +#define X86_FEATURE_MOVBE X86_CPUID_BIT(0x1, 2, 22) +#define X86_FEATURE_POPCNT X86_CPUID_BIT(0x1, 2, 23) #define X86_FEATURE_TSC_DEADLINE X86_CPUID_BIT(0x1, 2, 24) #define X86_FEATURE_AESNI X86_CPUID_BIT(0x1, 2, 25) #define X86_FEATURE_XSAVE X86_CPUID_BIT(0x1, 2, 26) +#define X86_FEATURE_OSXSAVE X86_CPUID_BIT(0x1, 2, 27) #define X86_FEATURE_AVX X86_CPUID_BIT(0x1, 2, 28) #define X86_FEATURE_RDRAND X86_CPUID_BIT(0x1, 2, 30) #define X86_FEATURE_HYPERVISOR X86_CPUID_BIT(0x1, 2, 31) #define X86_FEATURE_FPU X86_CPUID_BIT(0x1, 3, 0) +#define X86_FEATURE_VM86 X86_CPUID_BIT(0x1, 3, 1) +#define X86_FEATURE_DE X86_CPUID_BIT(0x1, 3, 2) #define X86_FEATURE_PSE X86_CPUID_BIT(0x1, 3, 3) +#define X86_FEATURE_TSC X86_CPUID_BIT(0x1, 3, 4) +#define X86_FEATURE_MSR X86_CPUID_BIT(0x1, 3, 5) #define X86_FEATURE_PAE X86_CPUID_BIT(0x1, 3, 6) +#define X86_FEATURE_MCE X86_CPUID_BIT(0x1, 3, 7) +#define X86_FEATURE_CX8 X86_CPUID_BIT(0x1, 3, 8) #define X86_FEATURE_APIC X86_CPUID_BIT(0x1, 3, 9) #define X86_FEATURE_SEP X86_CPUID_BIT(0x1, 3, 11) +#define X86_FEATURE_MTRR X86_CPUID_BIT(0x1, 3, 12) #define X86_FEATURE_PGE X86_CPUID_BIT(0x1, 3, 13) +#define X86_FEATURE_MCA X86_CPUID_BIT(0x1, 3, 14) +#define X86_FEATURE_CMOV X86_CPUID_BIT(0x1, 3, 15) #define X86_FEATURE_PAT X86_CPUID_BIT(0x1, 3, 16) #define X86_FEATURE_PSE36 X86_CPUID_BIT(0x1, 3, 17) 
+#define X86_FEATURE_PSN X86_CPUID_BIT(0x1, 3, 18) #define X86_FEATURE_CLFLUSH X86_CPUID_BIT(0x1, 3, 19) +#define X86_FEATURE_DS X86_CPUID_BIT(0x1, 3, 21) #define X86_FEATURE_ACPI X86_CPUID_BIT(0x1, 3, 22) #define X86_FEATURE_MMX X86_CPUID_BIT(0x1, 3, 23) #define X86_FEATURE_FXSR X86_CPUID_BIT(0x1, 3, 24) #define X86_FEATURE_SSE X86_CPUID_BIT(0x1, 3, 25) #define X86_FEATURE_SSE2 X86_CPUID_BIT(0x1, 3, 26) +#define X86_FEATURE_SS X86_CPUID_BIT(0x1, 3, 27) +#define X86_FEATURE_HTT X86_CPUID_BIT(0x1, 3, 28) #define X86_FEATURE_TM X86_CPUID_BIT(0x1, 3, 29) +#define X86_FEATURE_PBE X86_CPUID_BIT(0x1, 3, 31) + #define X86_FEATURE_DTS X86_CPUID_BIT(0x6, 0, 0) #define X86_FEATURE_TURBO X86_CPUID_BIT(0x6, 0, 1) +#define X86_FEATURE_ARAT X86_CPUID_BIT(0x6, 0, 2) #define X86_FEATURE_PLN X86_CPUID_BIT(0x6, 0, 4) +#define X86_FEATURE_ECMD X86_CPUID_BIT(0x6, 0, 5) #define X86_FEATURE_PTM X86_CPUID_BIT(0x6, 0, 6) #define X86_FEATURE_HWP X86_CPUID_BIT(0x6, 0, 7) #define X86_FEATURE_HWP_NOT X86_CPUID_BIT(0x6, 0, 8) #define X86_FEATURE_HWP_ACT X86_CPUID_BIT(0x6, 0, 9) #define X86_FEATURE_HWP_PREF X86_CPUID_BIT(0x6, 0, 10) +#define X86_FEATURE_HWP_EPP X86_CPUID_BIT(0x6, 0, 11) +#define X86_FEATURE_HWP_PKG X86_CPUID_BIT(0x6, 0, 12) +#define X86_FEATURE_HDC X86_CPUID_BIT(0x6, 0, 13) #define X86_FEATURE_TURBO_MAX X86_CPUID_BIT(0x6, 0, 14) +#define X86_FEATURE_HWP_CAP X86_CPUID_BIT(0x6, 0, 15) +#define X86_FEATURE_HWP_PECI X86_CPUID_BIT(0x6, 0, 16) +#define X86_FEATURE_HWP_FLEX X86_CPUID_BIT(0x6, 0, 17) +#define X86_FEATURE_HWP_FAST X86_CPUID_BIT(0x6, 0, 18) #define X86_FEATURE_HW_FEEDBACK X86_CPUID_BIT(0x6, 2, 0) #define X86_FEATURE_PERF_BIAS X86_CPUID_BIT(0x6, 2, 3) + #define X86_FEATURE_FSGSBASE X86_CPUID_BIT(0x7, 1, 0) #define X86_FEATURE_TSC_ADJUST X86_CPUID_BIT(0x7, 1, 1) +#define X86_FEATURE_SGX X86_CPUID_BIT(0x7, 1, 2) +#define X86_FEATURE_BMI1 X86_CPUID_BIT(0x7, 1, 3) +#define X86_FEATURE_HLE X86_CPUID_BIT(0x7, 1, 4) #define X86_FEATURE_AVX2 X86_CPUID_BIT(0x7, 1, 5) #define 
X86_FEATURE_SMEP X86_CPUID_BIT(0x7, 1, 7) +#define X86_FEATURE_BMI2 X86_CPUID_BIT(0x7, 1, 8) #define X86_FEATURE_ERMS X86_CPUID_BIT(0x7, 1, 9) #define X86_FEATURE_INVPCID X86_CPUID_BIT(0x7, 1, 10) +#define X86_FEATURE_RTM X86_CPUID_BIT(0x7, 1, 11) +#define X86_FEATURE_MPX X86_CPUID_BIT(0x7, 1, 14) +#define X86_FEATURE_AVX512F X86_CPUID_BIT(0x7, 1, 16) +#define X86_FEATURE_AVX512DQ X86_CPUID_BIT(0x7, 1, 17) #define X86_FEATURE_RDSEED X86_CPUID_BIT(0x7, 1, 18) +#define X86_FEATURE_ADX X86_CPUID_BIT(0x7, 1, 19) #define X86_FEATURE_SMAP X86_CPUID_BIT(0x7, 1, 20) +#define X86_FEATURE_AVX512IFMA X86_CPUID_BIT(0x7, 1, 21) #define X86_FEATURE_CLFLUSHOPT X86_CPUID_BIT(0x7, 1, 23) #define X86_FEATURE_CLWB X86_CPUID_BIT(0x7, 1, 24) #define X86_FEATURE_PT X86_CPUID_BIT(0x7, 1, 25) +#define X86_FEATURE_AVX512PF X86_CPUID_BIT(0x7, 1, 26) +#define X86_FEATURE_AVX512ER X86_CPUID_BIT(0x7, 1, 27) +#define X86_FEATURE_AVX512CD X86_CPUID_BIT(0x7, 1, 28) +#define X86_FEATURE_SHA X86_CPUID_BIT(0x7, 1, 29) +#define X86_FEATURE_AVX512BW X86_CPUID_BIT(0x7, 1, 30) +#define X86_FEATURE_AVX512VL X86_CPUID_BIT(0x7, 1, 31) +#define X86_FEATURE_PREFETCHWT1 X86_CPUID_BIT(0x7, 2, 0) +#define X86_FEATURE_AVX512VBMI X86_CPUID_BIT(0x7, 2, 1) #define X86_FEATURE_UMIP X86_CPUID_BIT(0x7, 2, 2) #define X86_FEATURE_PKU X86_CPUID_BIT(0x7, 2, 3) +#define X86_FEATURE_OSPKE X86_CPUID_BIT(0x7, 2, 4) +#define X86_FEATURE_WAITPKG X86_CPUID_BIT(0x7, 2, 5) +#define X86_FEATURE_AVX512_VBMI2 X86_CPUID_BIT(0x7, 2, 6) +#define X86_FEATURE_CET_SS X86_CPUID_BIT(0x7, 2, 7) +#define X86_FEATURE_GFNI X86_CPUID_BIT(0x7, 2, 8) +#define X86_FEATURE_VAES X86_CPUID_BIT(0x7, 2, 9) +#define X86_FEATURE_VPCLMULQDQ X86_CPUID_BIT(0x7, 2, 10) +#define X86_FEATURE_AVX512_VNNI X86_CPUID_BIT(0x7, 2, 11) +#define X86_FEATURE_AVX512_BITALG X86_CPUID_BIT(0x7, 2, 12) +#define X86_FEATURE_TIME_EN X86_CPUID_BIT(0x7, 2, 13) +#define X86_FEATURE_AVX512_VPOPCNTDQ X86_CPUID_BIT(0x7, 2, 14) +#define X86_FEATURE_LA57 X86_CPUID_BIT(0x7, 2, 16) 
+#define X86_FEATURE_RDPID X86_CPUID_BIT(0x7, 2, 22) +#define X86_FEATURE_KL X86_CPUID_BIT(0x7, 2, 23) +#define X86_FEATURE_CLDEMOTE X86_CPUID_BIT(0x7, 2, 25) +#define X86_FEATURE_MOVDIRI X86_CPUID_BIT(0x7, 2, 27) +#define X86_FEATURE_MOVDIR64B X86_CPUID_BIT(0x7, 2, 28) +#define X86_FEATURE_SGX_LC X86_CPUID_BIT(0x7, 2, 30) +#define X86_FEATURE_PKS X86_CPUID_BIT(0x7, 2, 31) +#define X86_FEATURE_AVX512_4VNNIW X86_CPUID_BIT(0x7, 3, 2) +#define X86_FEATURE_AVX512_4FMAPS X86_CPUID_BIT(0x7, 3, 3) +#define X86_FEATURE_FSRM X86_CPUID_BIT(0x7, 3, 4) +#define X86_FEATURE_AVX512_VP2INTERSECT X86_CPUID_BIT(0x7, 3, 8) #define X86_FEATURE_MD_CLEAR X86_CPUID_BIT(0x7, 3, 10) +#define X86_FEATURE_SERIALIZE X86_CPUID_BIT(0x7, 3, 14) +#define X86_FEATURE_HYBRID X86_CPUID_BIT(0x7, 3, 15) +#define X86_FEATURE_PCONFIG X86_CPUID_BIT(0x7, 3, 18) +#define X86_FEATURE_CET_IBT X86_CPUID_BIT(0x7, 3, 20) #define X86_FEATURE_IBRS_IBPB X86_CPUID_BIT(0x7, 3, 26) #define X86_FEATURE_STIBP X86_CPUID_BIT(0x7, 3, 27) #define X86_FEATURE_L1D_FLUSH X86_CPUID_BIT(0x7, 3, 28) #define X86_FEATURE_ARCH_CAPABILITIES X86_CPUID_BIT(0x7, 3, 29) +#define X86_FEATURE_CORE_CAPABILITIES X86_CPUID_BIT(0x7, 3, 30) #define X86_FEATURE_SSBD X86_CPUID_BIT(0x7, 3, 31) -#define X86_FEATURE_KVM_PV_CLOCK X86_CPUID_BIT(0x40000001, 0, 3) +#define X86_FEATURE_KVM_CLOCKSOURCE X86_CPUID_BIT(0x40000000, 0, 0) +#define X86_FEATURE_KVM_NOP_IO_DELAY X86_CPUID_BIT(0x40000000, 0, 1) +#define X86_FEATURE_KVM_MMU_OP X86_CPUID_BIT(0x40000000, 0, 2) +#define X86_FEATURE_KVM_CLOCKSOURCE2 X86_CPUID_BIT(0x40000000, 0, 3) +#define X86_FEATURE_KVM_ASYNC_PF X86_CPUID_BIT(0x40000000, 0, 4) +#define X86_FEATURE_KVM_STEAL_TIME X86_CPUID_BIT(0x40000000, 0, 5) #define X86_FEATURE_KVM_PV_EOI X86_CPUID_BIT(0x40000001, 0, 6) +#define X86_FEATURE_KVM_PV_UNHALT X86_CPUID_BIT(0x40000001, 0, 7) +#define X86_FEATURE_KVM_PV_TLB_FLUSH X86_CPUID_BIT(0x40000001, 0, 9) +#define X86_FEATURE_KVM_ASYNC_PF_VMEXIT X86_CPUID_BIT(0x40000001, 0, 10) #define 
X86_FEATURE_KVM_PV_IPI X86_CPUID_BIT(0x40000001, 0, 11) -#define X86_FEATURE_KVM_PV_CLOCK_STABLE X86_CPUID_BIT(0x40000001, 0, 24) +#define X86_FEATURE_KVM_POLL_CONTROL X86_CPUID_BIT(0x40000001, 0, 12) +#define X86_FEATURE_KVM_PV_SCHED_YIELD X86_CPUID_BIT(0x40000001, 0, 13) +#define X86_FEATURE_KVM_ASYNC_PF_INT X86_CPUID_BIT(0x40000001, 0, 14) +#define X86_FEATURE_KVM_MSI_EXT_DEST_ID X86_CPUID_BIT(0x40000001, 0, 15) +#define X86_FEATURE_KVM_HC_MAP_GPA_RANGE X86_CPUID_BIT(0x40000001, 0, 16) +#define X86_FEATURE_KVM_MIGRATION_CONTROL X86_CPUID_BIT(0x40000001, 0, 17) +#define X86_FEATURE_KVM_CLOCKSOURCE_STABLE X86_CPUID_BIT(0x40000001, 0, 24) + #define X86_FEATURE_AMD_TOPO X86_CPUID_BIT(0x80000001, 2, 22) #define X86_FEATURE_SSE4A X86_CPUID_BIT(0x80000001, 3, 6) diff --git a/platform/pc/lapic.c b/platform/pc/lapic.c index 45ab1845..decc3adf 100644 --- a/platform/pc/lapic.c +++ b/platform/pc/lapic.c @@ -25,6 +25,7 @@ #define LOCAL_TRACE 1 static bool lapic_present = false; +static bool lapic_x2apic = false; static volatile uint32_t *lapic_mmio; // local apic registers @@ -69,13 +70,38 @@ enum lapic_regs { }; static uint32_t lapic_read(enum lapic_regs reg) { - return mmio_read32(lapic_mmio + reg / 4); + LTRACEF("reg %#x\n", reg); + DEBUG_ASSERT(reg != LAPIC_ICRLO && reg != LAPIC_ICRHI); + if (lapic_x2apic) { + // TODO: do we need barriers here? 
+ return read_msr(X86_MSR_IA32_X2APIC_BASE + reg / 0x10); + } else { + return mmio_read32(lapic_mmio + reg / 4); + } } static void lapic_write(enum lapic_regs reg, uint32_t val) { - mmio_write32(lapic_mmio + reg / 4, val); + LTRACEF("reg %#x val %#x\n", reg, val); + DEBUG_ASSERT(reg != LAPIC_ICRLO && reg != LAPIC_ICRHI); + if (lapic_x2apic) { + write_msr(X86_MSR_IA32_X2APIC_BASE + reg / 0x10, val); + } else { + mmio_write32(lapic_mmio + reg / 4, val); + } } +// special case to write to the ICR register +static void lapic_write_icr(uint32_t low, uint32_t apic_id) { + LTRACEF("%#x apic_id %#x\n", low, apic_id); + if (lapic_x2apic) { + write_msr(X86_MSR_IA32_X2APIC_BASE + 0x30, ((uint64_t)apic_id << 32) | low); + } else { + lapic_write(LAPIC_ICRHI, apic_id << 24); + lapic_write(LAPIC_ICRLO, low); + } +} + + void lapic_init(void) { // discover the presence of the local apic and map it LTRACE_ENTRY; @@ -91,23 +117,33 @@ void lapic_init_postvm(uint level) { dprintf(INFO, "X86: local apic detected\n"); // IA32_APIC_BASE_MSR - uint64_t apic_base = read_msr(0x1b); + uint64_t apic_base = read_msr(X86_MSR_IA32_APIC_BASE); LTRACEF("raw apic base msr %#llx\n", apic_base); // make sure it's enabled - if ((apic_base & 0x800) == 0) { + if ((apic_base & (1u<<11)) == 0) { dprintf(INFO, "X86: enabling lapic\n"); - apic_base |= 0x800; + apic_base |= (1u<<11); write_msr(0x1b, apic_base); } - apic_base &= ~0xfff; - dprintf(INFO, "X86: lapic physical address %#llx\n", apic_base); + dprintf(INFO, "X86: lapic physical address %#llx\n", apic_base & ~0xfff); + + // see if x2APIC mode is supported and enable + if (x86_feature_test(X86_FEATURE_X2APIC)) { + lapic_x2apic = true; + dprintf(INFO, "X86: local apic supports x2APIC mode\n"); + + write_msr(X86_MSR_IA32_APIC_BASE, apic_base | (1u<<10)); + } // map the lapic into the kernel since it's not guaranteed that the physmap covers it - status_t err = vmm_alloc_physical(vmm_get_kernel_aspace(), "lapic", PAGE_SIZE, (void **)&lapic_mmio, 0, - 
apic_base & ~0xfff, /* vmm_flags */ 0, ARCH_MMU_FLAG_UNCACHED_DEVICE); - ASSERT(err == NO_ERROR); + if (!lapic_mmio) { + dprintf(INFO, "X86: mapping lapic into kernel\n"); + status_t err = vmm_alloc_physical(vmm_get_kernel_aspace(), "lapic", PAGE_SIZE, (void **)&lapic_mmio, 0, + apic_base & ~0xfff, /* vmm_flags */ 0, ARCH_MMU_FLAG_UNCACHED_DEVICE); + ASSERT(err == NO_ERROR); + } // Read the local apic id and version and features uint32_t id = lapic_read(LAPIC_ID); @@ -120,37 +156,39 @@ void lapic_init_postvm(uint level) { if (eas) { dprintf(INFO, "X86: local apic EAS features %#x\n", lapic_read(LAPIC_EXT_FEATURES)); } - - } LK_INIT_HOOK(lapic, lapic_init_postvm, LK_INIT_LEVEL_VM); void lapic_eoi(unsigned int vector) { LTRACEF("vector %#x\n", vector); - if (lapic_present) { - lapic_write(LAPIC_EOI, 0); + if (!lapic_present) { + return; } + + lapic_write(LAPIC_EOI, 0); } void lapic_send_init_ipi(uint32_t apic_id, bool level) { - if (lapic_present) { - lapic_write(LAPIC_ICRHI, apic_id << 24); - lapic_write(LAPIC_ICRLO, (5u << 8) | (level ? (1u << 14) : 0)); + if (!lapic_present) { + return; } + + lapic_write_icr((5u << 8) | (level ? 
(1u << 14) : 0), apic_id); } void lapic_send_startup_ipi(uint32_t apic_id, uint32_t startup_vector) { - if (lapic_present) { - lapic_write(LAPIC_ICRHI, apic_id << 24); - lapic_write(LAPIC_ICRLO, (6u << 8) | (startup_vector >> 12)); + if (!lapic_present) { + return; } + + lapic_write_icr((6u << 8) | (startup_vector >> 12), apic_id); } void lapic_send_ipi(uint32_t apic_id, uint32_t vector) { - if (lapic_present) { - lapic_write(LAPIC_ICRHI, apic_id << 24); - // XXX add correct flag bits - lapic_write(LAPIC_ICRLO, vector); + if (!lapic_present) { + return; } + + lapic_write_icr(vector, apic_id); } \ No newline at end of file From 164f9fa47e90bf1fc436055d907f4f56ff0526a2 Mon Sep 17 00:00:00 2001 From: Travis Geiselbrecht Date: Fri, 3 Jan 2025 20:54:30 -0800 Subject: [PATCH 10/26] WIP working with lapic to trigger a timer --- platform/pc/lapic.c | 51 ++++++++++++++++++++++++++++++++++++++++++++- scripts/do-qemux86 | 2 +- 2 files changed, 51 insertions(+), 2 deletions(-) diff --git a/platform/pc/lapic.c b/platform/pc/lapic.c index decc3adf..045d766f 100644 --- a/platform/pc/lapic.c +++ b/platform/pc/lapic.c @@ -18,10 +18,12 @@ #include #include #include -#include "platform_p.h" +#include #include #include +#include "platform_p.h" + #define LOCAL_TRACE 1 static bool lapic_present = false; @@ -69,6 +71,19 @@ enum lapic_regs { LAPIC_EXT_LVT0 = 0x500, }; +enum lapic_interrupts { + LAPIC_INT_TIMER = 0xf8, + LAPIC_INT_GENERIC, + LAPIC_INT_RESCHEDULE, +}; + +enum lapic_timer_mode { + LAPIC_TIMER_MODE_ONESHOT = 0, + LAPIC_TIMER_MODE_PERIODIC = 1, + LAPIC_TIMER_MODE_TSC_DEADLINE = 2, +}; + + static uint32_t lapic_read(enum lapic_regs reg) { LTRACEF("reg %#x\n", reg); DEBUG_ASSERT(reg != LAPIC_ICRLO && reg != LAPIC_ICRHI); @@ -101,6 +116,29 @@ static void lapic_write_icr(uint32_t low, uint32_t apic_id) { } } +void lapic_set_oneshot_timer(uint32_t tick) { + LTRACEF("tick %u\n", tick); + + // set the initial count, which should trigger the timer + lapic_write(LAPIC_TICR, tick); +} + 
+void lapic_cancel_oneshot_timer(void) { + LTRACE; + + // set the counter to 0 which disables it + lapic_write(LAPIC_TICR, 0); +} + +enum handler_return lapic_timer_handler(void *arg) { + //PANIC_UNIMPLEMENTED; + +// return timer_tick(NULL, current_time()); + + lapic_set_oneshot_timer(100000000); + + return INT_NO_RESCHEDULE; +} void lapic_init(void) { // discover the presence of the local apic and map it @@ -156,6 +194,17 @@ void lapic_init_postvm(uint level) { if (eas) { dprintf(INFO, "X86: local apic EAS features %#x\n", lapic_read(LAPIC_EXT_FEATURES)); } + + lapic_cancel_oneshot_timer(); + + // configure the local timer and make sure it is not set to fire + uint32_t val = (LAPIC_TIMER_MODE_ONESHOT << 17) | LAPIC_INT_TIMER; + lapic_write(LAPIC_TIMER, val); + + // register the local apic interrupts + register_int_handler_msi(LAPIC_INT_TIMER, &lapic_timer_handler, NULL, false); + + lapic_set_oneshot_timer(1000000); } LK_INIT_HOOK(lapic, lapic_init_postvm, LK_INIT_LEVEL_VM); diff --git a/scripts/do-qemux86 b/scripts/do-qemux86 index 685718b4..b2db507a 100755 --- a/scripts/do-qemux86 +++ b/scripts/do-qemux86 @@ -70,7 +70,7 @@ elif (( $DO_LEGACY )); then else QEMU="qemu-system-i386" PROJECT="pc-x86-test" - CPU=max + CPU=pentium3 MACHINE=pc fi From 09412c194fb05e49fa8a9c19955a5c68cda77cdc Mon Sep 17 00:00:00 2001 From: Travis Geiselbrecht Date: Sun, 30 Mar 2025 14:49:48 -0700 Subject: [PATCH 11/26] [platform][pc] refactor existing PIT code into separate file Extend the PIT driver to allow for one shot timers even though it monotonically runs a 1kHz tick. This allows it to keep time and provide one shot events, though only at 1ms resolution. 
--- platform/pc/lapic.c | 2 +- platform/pc/pit.c | 194 +++++++++++++++++++++++++++++++++++++++ platform/pc/platform.c | 3 - platform/pc/platform_p.h | 10 +- platform/pc/rules.mk | 4 + platform/pc/timer.c | 191 ++++++++++---------------------------- 6 files changed, 254 insertions(+), 150 deletions(-) create mode 100644 platform/pc/pit.c diff --git a/platform/pc/lapic.c b/platform/pc/lapic.c index 045d766f..cc7c2448 100644 --- a/platform/pc/lapic.c +++ b/platform/pc/lapic.c @@ -24,7 +24,7 @@ #include "platform_p.h" -#define LOCAL_TRACE 1 +#define LOCAL_TRACE 0 static bool lapic_present = false; static bool lapic_x2apic = false; diff --git a/platform/pc/pit.c b/platform/pc/pit.c new file mode 100644 index 00000000..d9930ff9 --- /dev/null +++ b/platform/pc/pit.c @@ -0,0 +1,194 @@ +/* + * Copyright (c) 2009 Corey Tabaka + * + * Use of this source code is governed by a MIT-style + * license that can be found in the LICENSE file or at + * https://opensource.org/licenses/MIT + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "platform_p.h" +#include +#include + +#define LOCAL_TRACE 0 + +static platform_timer_callback t_callback; +static void *callback_arg; +static spin_lock_t lock = SPIN_LOCK_INITIAL_VALUE; + +static uint64_t ticks_per_ms; + +// next callback event time in 32.32 fixed point milliseconds +static uint64_t next_trigger_time; + +// if periodic, the delta to set to the next event. 
if oneshot, 0 +static uint64_t next_trigger_delta; + +// time in 32.32 fixed point milliseconds +static volatile uint64_t timer_current_time; +// delta time per periodic tick in 32.32 +static uint64_t timer_delta_time; + +#define INTERNAL_FREQ 1193182ULL +#define INTERNAL_FREQ_3X 3579546ULL + +/* Maximum amount of time that can be program on the timer to schedule the next + * interrupt, in milliseconds */ +#define MAX_TIMER_INTERVAL 55 + +lk_time_t pit_current_time(void) { + spin_lock_saved_state_t state; + spin_lock_irqsave(&lock, state); + + lk_time_t time = (lk_time_t) (timer_current_time >> 32); + + spin_unlock_irqrestore(&lock, state); + + return time; +} + +lk_bigtime_t pit_current_time_hires(void) { + spin_lock_saved_state_t state; + spin_lock_irqsave(&lock, state); + + lk_bigtime_t time = (lk_bigtime_t) ((timer_current_time >> 22) * 1000) >> 10; + + spin_unlock_irqrestore(&lock, state); + + return time; +} + +static enum handler_return pit_timer_tick(void *arg) { + if (next_trigger_time != 0 || next_trigger_delta) { + LTRACEF("ntt %#" PRIx64 ", ntd %#" PRIx64 "\n", next_trigger_time, next_trigger_delta); + } + + spin_lock(&lock); + timer_current_time += timer_delta_time; + spin_unlock(&lock); + + lk_time_t time = current_time(); + + if (t_callback && next_trigger_time != 0 && timer_current_time >= next_trigger_time) { + if (next_trigger_delta != 0) { + uint64_t delta = timer_current_time - next_trigger_time; + next_trigger_time = timer_current_time + next_trigger_delta - delta; + } else { + next_trigger_time = 0; + } + + return t_callback(callback_arg, time); + } else { + return INT_NO_RESCHEDULE; + } +} + +static void set_pit_frequency(uint32_t frequency) { + uint32_t count, remainder; + + LTRACEF("frequency %u\n", frequency); + + /* figure out the correct divisor for the desired frequency */ + if (frequency <= 18) { + count = 0xffff; + } else if (frequency >= INTERNAL_FREQ) { + count = 1; + } else { + count = INTERNAL_FREQ_3X / frequency; + remainder = 
INTERNAL_FREQ_3X % frequency; + + if (remainder >= INTERNAL_FREQ_3X / 2) { + count += 1; + } + + count /= 3; + remainder = count % 3; + + if (remainder >= 1) { + count += 1; + } + } + + uint16_t divisor = count & 0xffff; + + /* + * funky math that i don't feel like explaining. essentially 32.32 fixed + * point representation of the configured timer delta. + */ + timer_delta_time = (3685982306ULL * count) >> 10; + + LTRACEF("dt 0x%016" PRIx64 "\n", timer_delta_time); + LTRACEF("divisor 0x%04" PRIx16 "\n", divisor); + + /* + * setup the Programmable Interval Timer + * timer 0, mode 2, binary counter, LSB followed by MSB + */ + outp(I8253_CONTROL_REG, 0x34); + outp(I8253_DATA_REG, divisor & 0xff); // LSB + outp(I8253_DATA_REG, divisor >> 8); // MSB +} + +void pit_init(void) { + timer_current_time = 0; + ticks_per_ms = INTERNAL_FREQ/1000; + set_pit_frequency(1000); // ~1ms granularity + register_int_handler(INT_PIT, &pit_timer_tick, NULL); +} + +status_t pit_set_periodic_timer(platform_timer_callback callback, void *arg, lk_time_t interval) { + LTRACEF("pit_set_periodic_timer: interval %u\n", interval); + + spin_lock_saved_state_t state; + spin_lock_irqsave(&lock, state); + + t_callback = callback; + callback_arg = arg; + + next_trigger_delta = (uint64_t) interval << 32; + next_trigger_time = timer_current_time + next_trigger_delta; + + unmask_interrupt(INT_PIT); + spin_unlock_irqrestore(&lock, state); + + return NO_ERROR; +} + +status_t pit_set_oneshot_timer(platform_timer_callback callback, void *arg, lk_time_t interval) { + LTRACEF("pit_set_oneshot_timer: interval %u\n", interval); + + spin_lock_saved_state_t state; + spin_lock_irqsave(&lock, state); + + t_callback = callback; + callback_arg = arg; + + next_trigger_delta = 0; + next_trigger_time = timer_current_time + ((uint64_t)interval << 32); + + unmask_interrupt(INT_PIT); + spin_unlock_irqrestore(&lock, state); + + return NO_ERROR; +} + +void pit_stop_timer(void) { + LTRACE; + + spin_lock_saved_state_t state; + 
spin_lock_irqsave(&lock, state); + + mask_interrupt(INT_PIT); + spin_unlock_irqrestore(&lock, state); +} \ No newline at end of file diff --git a/platform/pc/platform.c b/platform/pc/platform.c index 8095d6cb..69dc9033 100644 --- a/platform/pc/platform.c +++ b/platform/pc/platform.c @@ -188,9 +188,6 @@ void platform_early_init(void) { /* initialize the interrupt controller */ platform_init_interrupts(); - /* initialize the timer */ - platform_init_timer(); - /* look at multiboot to determine our memory size */ size_t found_arenas; platform_parse_multiboot_info(&found_arenas); diff --git a/platform/pc/platform_p.h b/platform/pc/platform_p.h index f673c732..5e17c41e 100644 --- a/platform/pc/platform_p.h +++ b/platform/pc/platform_p.h @@ -8,13 +8,13 @@ #pragma once #include +#include extern cbuf_t console_input_buf; void platform_init_debug_early(void); void platform_init_debug(void); void platform_init_interrupts(void); -void platform_init_timer(void); // legacy programmable interrupt controller void pic_init(void); @@ -29,5 +29,13 @@ void lapic_send_init_ipi(uint32_t apic_id, bool level); void lapic_send_startup_ipi(uint32_t apic_id, uint32_t startup_vector); void lapic_send_ipi(uint32_t apic_id, uint32_t vector); +// programable interval timer +void pit_init(void); +status_t pit_set_periodic_timer(platform_timer_callback callback, void *arg, lk_time_t interval); +status_t pit_set_oneshot_timer(platform_timer_callback callback, void *arg, lk_time_t interval); +void pit_stop_timer(void); +lk_time_t pit_current_time(void); +lk_bigtime_t pit_current_time_hires(void); + // secondary cpus void platform_start_secondary_cpus(void); diff --git a/platform/pc/rules.mk b/platform/pc/rules.mk index 7d7b0d7f..1f5d5e23 100644 --- a/platform/pc/rules.mk +++ b/platform/pc/rules.mk @@ -26,11 +26,15 @@ MODULE_SRCS += \ $(LOCAL_DIR)/mp.c \ $(LOCAL_DIR)/mp-boot.S \ $(LOCAL_DIR)/pic.c \ + $(LOCAL_DIR)/pit.c \ $(LOCAL_DIR)/platform.c \ $(LOCAL_DIR)/timer.c \ $(LOCAL_DIR)/uart.c \ 
LK_HEAP_IMPLEMENTATION ?= dlmalloc +GLOBAL_DEFINES += \ + PLATFORM_HAS_DYNAMIC_TIMER=1 + include make/module.mk diff --git a/platform/pc/timer.c b/platform/pc/timer.c index ba4b852b..c92a2d99 100644 --- a/platform/pc/timer.c +++ b/platform/pc/timer.c @@ -1,184 +1,85 @@ /* - * Copyright (c) 2009 Corey Tabaka + * Copyright (c) 2025 Travis Geiselbrecht * * Use of this source code is governed by a MIT-style * license that can be found in the LICENSE file or at * https://opensource.org/licenses/MIT */ #include -#include -#include #include +#include +#include +#include +#include #include -#include #include -#include -#include #include #include #include "platform_p.h" #include -static platform_timer_callback t_callback; -static void *callback_arg; -static spin_lock_t lock; +#define LOCAL_TRACE 0 -static uint64_t next_trigger_time; -static uint64_t next_trigger_delta; -static uint64_t ticks_per_ms; +// Deals with all of the various clock sources and event timers on the PC platform. -static uint64_t timer_delta_time; -static volatile uint64_t timer_current_time; +static enum clock_source { + CLOCK_SOURCE_INITIAL, + CLOCK_SOURCE_PIT, + CLOCK_SOURCE_TSC, + CLOCK_SOURCE_HPET, +} clock_source = CLOCK_SOURCE_INITIAL; -static uint16_t divisor; - -#define INTERNAL_FREQ 1193182ULL -#define INTERNAL_FREQ_3X 3579546ULL - -/* Maximum amount of time that can be program on the timer to schedule the next - * interrupt, in milliseconds */ -#define MAX_TIMER_INTERVAL 55 - - - -status_t platform_set_periodic_timer(platform_timer_callback callback, void *arg, lk_time_t interval) { - t_callback = callback; - callback_arg = arg; - - next_trigger_delta = (uint64_t) interval << 32; - next_trigger_time = timer_current_time + next_trigger_delta; - - return NO_ERROR; +static const char *clock_source_name(void) { + switch (clock_source) { + case CLOCK_SOURCE_INITIAL: + return "initial"; + case CLOCK_SOURCE_PIT: + return "pit"; + case CLOCK_SOURCE_TSC: + return "tsc"; + case CLOCK_SOURCE_HPET: + 
return "hpet"; + default: + return "unknown"; + } } lk_time_t current_time(void) { - lk_time_t time; - - // XXX slight race - time = (lk_time_t) (timer_current_time >> 32); - - return time; + switch (clock_source) { + case CLOCK_SOURCE_PIT: + return pit_current_time(); + default: + return 0; + } } lk_bigtime_t current_time_hires(void) { - lk_bigtime_t time; - - // XXX slight race - time = (lk_bigtime_t) ((timer_current_time >> 22) * 1000) >> 10; - - return time; -} -static enum handler_return os_timer_tick(void *arg) { - uint64_t delta; - - timer_current_time += timer_delta_time; - - lk_time_t time = current_time(); - //lk_bigtime_t btime = current_time_hires(); - //printf_xy(71, 0, WHITE, "%08u", (uint32_t) time); - //printf_xy(63, 1, WHITE, "%016llu", (uint64_t) btime); - - if (t_callback && timer_current_time >= next_trigger_time) { - delta = timer_current_time - next_trigger_time; - next_trigger_time = timer_current_time + next_trigger_delta - delta; - - return t_callback(callback_arg, time); - } else { - return INT_NO_RESCHEDULE; + switch (clock_source) { + case CLOCK_SOURCE_PIT: + return pit_current_time_hires(); + default: + return 0; } } -static void set_pit_frequency(uint32_t frequency) { - uint32_t count, remainder; +void pc_init_timer(unsigned int level) { + LTRACE_ENTRY; - /* figure out the correct divisor for the desired frequency */ - if (frequency <= 18) { - count = 0xffff; - } else if (frequency >= INTERNAL_FREQ) { - count = 1; - } else { - count = INTERNAL_FREQ_3X / frequency; - remainder = INTERNAL_FREQ_3X % frequency; - - if (remainder >= INTERNAL_FREQ_3X / 2) { - count += 1; - } - - count /= 3; - remainder = count % 3; - - if (remainder >= 1) { - count += 1; - } - } - - divisor = count & 0xffff; - - /* - * funky math that i don't feel like explaining. essentially 32.32 fixed - * point representation of the configured timer delta. 
- */ - timer_delta_time = (3685982306ULL * count) >> 10; - - //dprintf(DEBUG, "set_pit_frequency: dt=%016llx\n", timer_delta_time); - //dprintf(DEBUG, "set_pit_frequency: divisor=%04x\n", divisor); - - /* - * setup the Programmable Interval Timer - * timer 0, mode 2, binary counter, LSB followed by MSB - */ - outp(I8253_CONTROL_REG, 0x34); - outp(I8253_DATA_REG, divisor & 0xff); // LSB - outp(I8253_DATA_REG, divisor >> 8); // MSB + pit_init(); + clock_source = CLOCK_SOURCE_PIT; } -void platform_init_timer(void) { +LK_INIT_HOOK(pc_timer, pc_init_timer, LK_INIT_LEVEL_VM); - timer_current_time = 0; - ticks_per_ms = INTERNAL_FREQ/1000; - set_pit_frequency(1000); // ~1ms granularity - register_int_handler(INT_PIT, &os_timer_tick, NULL); - unmask_interrupt(INT_PIT); -} - -static void platform_halt_timers(void) { - mask_interrupt(INT_PIT); +status_t platform_set_periodic_timer(platform_timer_callback callback, void *arg, lk_time_t interval) { + return pit_set_periodic_timer(callback, arg, interval); } status_t platform_set_oneshot_timer(platform_timer_callback callback, void *arg, lk_time_t interval) { - uint32_t count; - - spin_lock_saved_state_t state; - spin_lock_irqsave(&lock, state); - - t_callback = callback; - callback_arg = arg; - - - if (interval > MAX_TIMER_INTERVAL) - interval = MAX_TIMER_INTERVAL; - if (interval < 1) interval = 1; - - count = ticks_per_ms * interval; - - divisor = count & 0xffff; - timer_delta_time = (3685982306ULL * count) >> 10; - /* Program PIT in the software strobe configuration, to send one pulse - * after the count reach 0 */ - outp(I8253_CONTROL_REG, 0x38); - outp(I8253_DATA_REG, divisor & 0xff); // LSB - outp(I8253_DATA_REG, divisor >> 8); // MSB - - - unmask_interrupt(INT_PIT); - spin_unlock_irqrestore(&lock, state); - - return NO_ERROR; + return pit_set_oneshot_timer(callback, arg, interval); } void platform_stop_timer(void) { - /* Enable interrupt mode that will stop the decreasing counter of the PIT */ - outp(I8253_CONTROL_REG, 
0x30); - return; + pit_stop_timer(); } From 2987f73d088c02e98fbc49a9f78765164aaf143b Mon Sep 17 00:00:00 2001 From: Travis Geiselbrecht Date: Sun, 30 Mar 2025 21:59:39 -0700 Subject: [PATCH 12/26] [platform][pc] add support for TSC based clock -Detect if under KVM hypervisor and read tick rate or -calibrate tick against PIT --- arch/x86/feature.c | 36 +++++- arch/x86/include/arch/x86/feature.h | 13 +- lib/acpi_lite/acpi_lite.cpp | 2 +- platform/pc/lapic.c | 10 -- platform/pc/mp.c | 1 + platform/pc/pit.c | 59 ++++++++- platform/pc/platform_p.h | 1 + platform/pc/rules.mk | 8 +- platform/pc/timer.c | 179 +++++++++++++++++++++++++++- scripts/do-qemux86 | 4 +- 10 files changed, 276 insertions(+), 37 deletions(-) diff --git a/arch/x86/feature.c b/arch/x86/feature.c index e4811652..ce28c280 100644 --- a/arch/x86/feature.c +++ b/arch/x86/feature.c @@ -133,10 +133,13 @@ static void x86_cpu_detect(void) { // read max hypervisor leaf cpuid(X86_CPUID_HYP_BASE, &a, &b, &c, &d); - // TODO: actually check that it's an understood hypervisor before setting this. - // It's possible on real hardware it's just returning the last valid regular cpuid. 
- if (a >= X86_CPUID_HYP_BASE) { + + // Check that it's an understood hypervisor leaf + if ((b == 0x4b4d564b && c == 0x564b4d56 && d == 0x4d) || /* KVMKVMKVM */ + (b == 0x54474354 && c == 0x43544743 && d == 0x47435447)) { /* TCGTCGTCGTCG */ max_cpuid_leaf_hyp = MIN(a, __X86_MAX_SUPPORTED_CPUID_HYP); + } else { + max_cpuid_leaf_hyp = 0; } } else { __x86_cpu_vendor = X86_CPU_VENDOR_INTEL; // intrinsically Intel without cpuid @@ -191,12 +194,12 @@ void x86_feature_early_init(void) { // cache a copy of the cpuid bits if (has_cpuid) { - for (uint32_t i = 1; i <= max_cpuid_leaf; i++) { + for (uint32_t i = 0; i <= max_cpuid_leaf; i++) { cpuid_c(i, 0, &saved_cpuids[i].a, &saved_cpuids[i].b, &saved_cpuids[i].c, &saved_cpuids[i].d); } if (max_cpuid_leaf_ext > 0) { - for (uint32_t i = X86_CPUID_EXT_BASE + 1; i - 1 < max_cpuid_leaf_ext; i++) { + for (uint32_t i = X86_CPUID_EXT_BASE; i <= max_cpuid_leaf_ext; i++) { uint32_t index = i - X86_CPUID_EXT_BASE; cpuid_c(i, 0, &saved_cpuids_ext[index].a, &saved_cpuids_ext[index].b, &saved_cpuids_ext[index].c, &saved_cpuids_ext[index].d); @@ -204,7 +207,7 @@ void x86_feature_early_init(void) { } if (max_cpuid_leaf_hyp > 0) { - for (uint32_t i = X86_CPUID_HYP_BASE + 1; i - 1 < max_cpuid_leaf_hyp; i++) { + for (uint32_t i = X86_CPUID_HYP_BASE; i <= max_cpuid_leaf_hyp; i++) { uint32_t index = i - X86_CPUID_HYP_BASE; cpuid_c(i, 0, &saved_cpuids_hyp[index].a, &saved_cpuids_hyp[index].b, &saved_cpuids_hyp[index].c, &saved_cpuids_hyp[index].d); @@ -213,6 +216,23 @@ void x86_feature_early_init(void) { } } +static void x86_feature_dump_cpuid(void) { + for (uint32_t i = X86_CPUID_BASE; i <= max_cpuid_leaf; i++) { + printf("X86: cpuid leaf %#x: %08x %08x %08x %08x\n", i, + saved_cpuids[i - X86_CPUID_BASE].a, saved_cpuids[i - X86_CPUID_BASE].b, saved_cpuids[i - X86_CPUID_BASE].c, saved_cpuids[i - X86_CPUID_BASE].d); + } + for (uint32_t i = X86_CPUID_HYP_BASE; i <= max_cpuid_leaf_hyp; i++) { + uint32_t index = i - X86_CPUID_HYP_BASE; + printf("X86: 
cpuid leaf %#x: %08x %08x %08x %08x\n", i, + saved_cpuids_hyp[index].a, saved_cpuids_hyp[index].b, saved_cpuids_hyp[index].c, saved_cpuids_hyp[index].d); + } + for (uint32_t i = X86_CPUID_EXT_BASE; i <= max_cpuid_leaf_ext; i++) { + uint32_t index = i - X86_CPUID_EXT_BASE; + printf("X86: cpuid leaf %#x: %08x %08x %08x %08x\n", i, + saved_cpuids[index].a, saved_cpuids[index].b, saved_cpuids[index].c, saved_cpuids[index].d); + } +} + /* later feature init hook, called after the kernel is able to schedule */ void x86_feature_init(void) { dprintf(SPEW, "X86: detected cpu level %d has_cpuid %d\n", x86_get_cpu_level(), has_cpuid); @@ -243,6 +263,10 @@ void x86_feature_init(void) { printf("X86: processor model info type %#x family %#x model %#x stepping %#x\n", model->processor_type, model->family, model->model, model->stepping); printf("\tdisplay_family %#x display_model %#x\n", model->display_family, model->display_model); + + if (has_cpuid && LK_DEBUGLEVEL > 1) { + x86_feature_dump_cpuid(); + } } bool x86_get_cpuid_subleaf(enum x86_cpuid_leaf_num num, uint32_t subleaf, struct x86_cpuid_leaf* leaf) { diff --git a/arch/x86/include/arch/x86/feature.h b/arch/x86/include/arch/x86/feature.h index f32d7bb8..a743a634 100644 --- a/arch/x86/include/arch/x86/feature.h +++ b/arch/x86/include/arch/x86/feature.h @@ -328,12 +328,12 @@ static inline bool x86_feature_test(struct x86_cpuid_bit bit) { #define X86_FEATURE_CORE_CAPABILITIES X86_CPUID_BIT(0x7, 3, 30) #define X86_FEATURE_SSBD X86_CPUID_BIT(0x7, 3, 31) -#define X86_FEATURE_KVM_CLOCKSOURCE X86_CPUID_BIT(0x40000000, 0, 0) -#define X86_FEATURE_KVM_NOP_IO_DELAY X86_CPUID_BIT(0x40000000, 0, 1) -#define X86_FEATURE_KVM_MMU_OP X86_CPUID_BIT(0x40000000, 0, 2) -#define X86_FEATURE_KVM_CLOCKSOURCE2 X86_CPUID_BIT(0x40000000, 0, 3) -#define X86_FEATURE_KVM_ASYNC_PF X86_CPUID_BIT(0x40000000, 0, 4) -#define X86_FEATURE_KVM_STEAL_TIME X86_CPUID_BIT(0x40000000, 0, 5) +#define X86_FEATURE_KVM_CLOCKSOURCE X86_CPUID_BIT(0x40000001, 0, 0) 
+#define X86_FEATURE_KVM_NOP_IO_DELAY X86_CPUID_BIT(0x40000001, 0, 1) +#define X86_FEATURE_KVM_MMU_OP X86_CPUID_BIT(0x40000001, 0, 2) +#define X86_FEATURE_KVM_CLOCKSOURCE2 X86_CPUID_BIT(0x40000001, 0, 3) +#define X86_FEATURE_KVM_ASYNC_PF X86_CPUID_BIT(0x40000001, 0, 4) +#define X86_FEATURE_KVM_STEAL_TIME X86_CPUID_BIT(0x40000001, 0, 5) #define X86_FEATURE_KVM_PV_EOI X86_CPUID_BIT(0x40000001, 0, 6) #define X86_FEATURE_KVM_PV_UNHALT X86_CPUID_BIT(0x40000001, 0, 7) #define X86_FEATURE_KVM_PV_TLB_FLUSH X86_CPUID_BIT(0x40000001, 0, 9) @@ -355,6 +355,7 @@ static inline bool x86_feature_test(struct x86_cpuid_bit bit) { #define X86_FEATURE_HUGE_PAGE X86_CPUID_BIT(0x80000001, 3, 26) #define X86_FEATURE_RDTSCP X86_CPUID_BIT(0x80000001, 3, 27) #define X86_FEATURE_INVAR_TSC X86_CPUID_BIT(0x80000007, 3, 8) +#define X86_FEATURE_CONSTANT_TSC X86_CPUID_BIT(0x80000007, 3, 8) // accessor to read some fields out of a register static inline uint32_t x86_get_vaddr_width(void) { diff --git a/lib/acpi_lite/acpi_lite.cpp b/lib/acpi_lite/acpi_lite.cpp index 7b5b09f6..1059cf0d 100644 --- a/lib/acpi_lite/acpi_lite.cpp +++ b/lib/acpi_lite/acpi_lite.cpp @@ -17,7 +17,7 @@ #include // uses the vm to map in ACPI tables as they are found -static_assert(WITH_KERNEL_VM, ""); +static_assert(WITH_KERNEL_VM); #define LOCAL_TRACE 0 diff --git a/platform/pc/lapic.c b/platform/pc/lapic.c index cc7c2448..bb38ed8b 100644 --- a/platform/pc/lapic.c +++ b/platform/pc/lapic.c @@ -141,14 +141,6 @@ enum handler_return lapic_timer_handler(void *arg) { } void lapic_init(void) { - // discover the presence of the local apic and map it - LTRACE_ENTRY; - - // check feature bit 9 in edx of leaf 1 for presence of lapic - lapic_present = x86_feature_test(X86_FEATURE_APIC); -} - -void lapic_init_postvm(uint level) { if (!lapic_present) return; @@ -207,8 +199,6 @@ void lapic_init_postvm(uint level) { lapic_set_oneshot_timer(1000000); } -LK_INIT_HOOK(lapic, lapic_init_postvm, LK_INIT_LEVEL_VM); - void lapic_eoi(unsigned int 
vector) { LTRACEF("vector %#x\n", vector); if (!lapic_present) { diff --git a/platform/pc/mp.c b/platform/pc/mp.c index 0900f4ac..07ddf487 100644 --- a/platform/pc/mp.c +++ b/platform/pc/mp.c @@ -91,6 +91,7 @@ static void local_apic_callback(const void *_entry, size_t entry_len, void *cook const struct acpi_madt_local_apic_entry *entry = _entry; struct detected_cpus *cpus = cookie; + // TODO: read the current APIC id and skip it, instead of assuming 0 is the boot cpu if (entry->apic_id == 0) { // skip the boot cpu return; diff --git a/platform/pc/pit.c b/platform/pc/pit.c index d9930ff9..6b6faa2f 100644 --- a/platform/pc/pit.c +++ b/platform/pc/pit.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -23,6 +24,9 @@ #define LOCAL_TRACE 0 + +// TODO: switch this logic to lib/fixed_point math + static platform_timer_callback t_callback; static void *callback_arg; static spin_lock_t lock = SPIN_LOCK_INITIAL_VALUE; @@ -42,6 +46,7 @@ static uint64_t timer_delta_time; #define INTERNAL_FREQ 1193182ULL #define INTERNAL_FREQ_3X 3579546ULL +#define INTERNAL_FREQ_TICKS_PER_MS (INTERNAL_FREQ / 1000u) /* Maximum amount of time that can be program on the timer to schedule the next * interrupt, in milliseconds */ @@ -128,8 +133,8 @@ static void set_pit_frequency(uint32_t frequency) { */ timer_delta_time = (3685982306ULL * count) >> 10; - LTRACEF("dt 0x%016" PRIx64 "\n", timer_delta_time); - LTRACEF("divisor 0x%04" PRIx16 "\n", divisor); + LTRACEF("dt %#x.%08x\n", (uint32_t)(timer_delta_time >> 32), (uint32_t)(timer_delta_time & 0xffffffff)); + LTRACEF("divisor %" PRIu16 "\n", divisor); /* * setup the Programmable Interval Timer @@ -191,4 +196,54 @@ void pit_stop_timer(void) { mask_interrupt(INT_PIT); spin_unlock_irqrestore(&lock, state); +} + +uint64_t pit_calibrate_tsc(void) { + DEBUG_ASSERT(arch_ints_disabled()); + + uint64_t tsc_ticks[5] = {0}; + uint32_t countdown_ms[5] = {0}; + + uint64_t tsc_freq = 0; + for (uint i = 0; i < 
countof(tsc_ticks); i++) { + // calibrate the tsc frequency using the PIT + countdown_ms[i] = 2 * (i + 1); + + uint16_t pic_ticks = INTERNAL_FREQ_TICKS_PER_MS * countdown_ms[i]; + outp(I8253_CONTROL_REG, 0x30); + outp(I8253_DATA_REG, pic_ticks & 0xff); // LSB + outp(I8253_DATA_REG, pic_ticks >> 8); // MSB + + // read the tsc + uint64_t tsc_start = __builtin_ia32_rdtsc(); + + // wait for countdown_ms + uint8_t status = 0; + do { + // Send a read-back command that latches the status of ch0 + outp(I8253_CONTROL_REG, 0xe2); + status = inp(I8253_DATA_REG); + // Wait for bit 7 (output) to go high and for bit 6 (null count) to go low + } while ((status & 0xc0) != 0x80); + + uint64_t tsc_end = __builtin_ia32_rdtsc(); + tsc_ticks[i] = tsc_end - tsc_start; + } + + // find the best time + uint best_index = 0; + for (uint i = 1; i < countof(tsc_ticks); i++) { + if (tsc_ticks[i] < tsc_ticks[best_index]) { + best_index = i; + } + } + + // calculate the tsc frequency + tsc_freq = (tsc_ticks[best_index] * 1000) / countdown_ms[best_index]; + dprintf(INFO, "PIT: calibrated TSC frequency: %" PRIu64 "Hz\n", tsc_freq); + + // put the PIT back to 1ms countdown + set_pit_frequency(1000); + + return tsc_freq; } \ No newline at end of file diff --git a/platform/pc/platform_p.h b/platform/pc/platform_p.h index 5e17c41e..59df1e0f 100644 --- a/platform/pc/platform_p.h +++ b/platform/pc/platform_p.h @@ -21,6 +21,7 @@ void pic_init(void); void pic_enable(unsigned int vector, bool enable); void pic_eoi(unsigned int vector); void pic_mask_interrupts(void); +uint64_t pit_calibrate_tsc(void); // local apic void lapic_init(void); diff --git a/platform/pc/rules.mk b/platform/pc/rules.mk index 1f5d5e23..dbee0748 100644 --- a/platform/pc/rules.mk +++ b/platform/pc/rules.mk @@ -6,10 +6,10 @@ MODULE := $(LOCAL_DIR) # legacy implies older hardware, pre pentium, pre pci CPU ?= modern -MODULE_DEPS += \ - lib/acpi_lite \ - lib/bio \ - lib/cbuf +MODULE_DEPS += lib/acpi_lite +MODULE_DEPS += lib/bio 
+MODULE_DEPS += lib/cbuf +MODULE_DEPS += lib/fixed_point ifneq ($(CPU),legacy) MODULE_DEPS += dev/bus/pci/drivers diff --git a/platform/pc/timer.c b/platform/pc/timer.c index c92a2d99..03e77cdc 100644 --- a/platform/pc/timer.c +++ b/platform/pc/timer.c @@ -12,13 +12,17 @@ #include #include #include +#include #include #include #include #include "platform_p.h" #include +#include +#include +#include -#define LOCAL_TRACE 0 +#define LOCAL_TRACE 1 // Deals with all of the various clock sources and event timers on the PC platform. @@ -29,16 +33,19 @@ static enum clock_source { CLOCK_SOURCE_HPET, } clock_source = CLOCK_SOURCE_INITIAL; +struct fp_32_64 tsc_to_timebase; +struct fp_32_64 tsc_to_timebase_hires; + static const char *clock_source_name(void) { switch (clock_source) { case CLOCK_SOURCE_INITIAL: return "initial"; case CLOCK_SOURCE_PIT: - return "pit"; + return "PIT"; case CLOCK_SOURCE_TSC: - return "tsc"; + return "TSC"; case CLOCK_SOURCE_HPET: - return "hpet"; + return "HPET"; default: return "unknown"; } @@ -48,6 +55,8 @@ lk_time_t current_time(void) { switch (clock_source) { case CLOCK_SOURCE_PIT: return pit_current_time(); + case CLOCK_SOURCE_TSC: + return u32_mul_u64_fp32_64(__builtin_ia32_rdtsc(), tsc_to_timebase); default: return 0; } @@ -57,16 +66,174 @@ lk_bigtime_t current_time_hires(void) { switch (clock_source) { case CLOCK_SOURCE_PIT: return pit_current_time_hires(); + case CLOCK_SOURCE_TSC: + return u64_mul_u64_fp32_64(__builtin_ia32_rdtsc(), tsc_to_timebase_hires); default: return 0; } } -void pc_init_timer(unsigned int level) { - LTRACE_ENTRY; +// From https://www.kernel.org/doc/html/v6.14/virt/kvm/x86/msr.html +struct pvclock_wall_clock { + uint32_t version; + uint32_t sec; + uint32_t nsec; +} __PACKED; +static_assert(sizeof(struct pvclock_wall_clock) == 12, "pvclock_wall_clock size mismatch"); +struct pvclock_vcpu_time_info { + uint32_t version; + uint32_t pad0; + uint64_t tsc_timestamp; + uint64_t system_time; + uint32_t tsc_to_system_mul; + 
int8_t tsc_shift; + uint8_t flags; + uint8_t pad[2]; +} __PACKED; +static_assert(sizeof(struct pvclock_vcpu_time_info) == 32, "pvclock_vcpu_time_info size mismatch"); + +static volatile struct pvclock_wall_clock *wall_clock; +static volatile struct pvclock_vcpu_time_info *vcpu_time_info; + +status_t pvclock_init(void) { + uint32_t clocksource_msr_base = 0; + if (x86_feature_test(X86_FEATURE_KVM_CLOCKSOURCE)) { + clocksource_msr_base = 0x11; + } + if (x86_feature_test(X86_FEATURE_KVM_CLOCKSOURCE2)) { + clocksource_msr_base = 0x4b564d00; + } + if (!clocksource_msr_base) { + return ERR_NOT_SUPPORTED; + } + dprintf(INFO, "pv_clock: clocksource detected, msr base %#x\n", clocksource_msr_base); + + // map a page of memory and point the KVM clocksource msrs at it + void *clocksource_page; + status_t err = vmm_alloc(vmm_get_kernel_aspace(), "lapic", PAGE_SIZE, &clocksource_page, 0, 0, 0); + if (err != NO_ERROR) { + printf("pv_clock: failed to allocate page for clocksource msrs\n"); + return err; + } + + paddr_t paddr; + arch_mmu_query(&vmm_get_kernel_aspace()->arch_aspace, (vaddr_t)clocksource_page, &paddr, NULL); + LTRACEF("clocksource page %p, paddr %#" PRIxPTR "\n", clocksource_page, paddr); + + write_msr(clocksource_msr_base, paddr); + write_msr(clocksource_msr_base + 1, paddr + sizeof(struct pvclock_wall_clock) + 1); + + wall_clock = (struct pvclock_wall_clock *)clocksource_page; + vcpu_time_info = (struct pvclock_vcpu_time_info *)(wall_clock + 1); + + dprintf(SPEW, "pv_clock: wall clock version %u, sec %u, nsec %u\n", + wall_clock->version, wall_clock->sec, wall_clock->nsec); + + dprintf(SPEW, "pv_clock: vcpu time info version %u, tsc timestamp %llu, system time %llu\n", + vcpu_time_info->version, vcpu_time_info->tsc_timestamp, vcpu_time_info->system_time); + dprintf(SPEW, "pv_clock: tsc to system mul %u, tsc shift %d, flags %u\n", + vcpu_time_info->tsc_to_system_mul, vcpu_time_info->tsc_shift, vcpu_time_info->flags); + + return NO_ERROR; +} + +uint64_t 
pvclock_get_tsc_freq(void) { + uint32_t tsc_mul = 0; + int8_t tsc_shift = 0; + + if (!vcpu_time_info) { + return 0; + } + + uint32_t pre_version = 0, post_version = 0; + do { + pre_version = vcpu_time_info->version; + if (pre_version % 2 != 0) { + asm("pause"); + continue; + } + tsc_mul = vcpu_time_info->tsc_to_system_mul; + tsc_shift = vcpu_time_info->tsc_shift; + post_version = vcpu_time_info->version; + } while (pre_version != post_version); + + uint64_t tsc_khz = 1000000ULL << 32; + tsc_khz = tsc_khz / tsc_mul; + if (tsc_shift > 0) { + tsc_khz >>= tsc_shift; + } else { + tsc_khz <<= -tsc_shift; + } + return tsc_khz * 1000; +} + +bool pv_clock_is_stable(void) { + if (!vcpu_time_info) { + return false; + } + bool is_stable = (vcpu_time_info->flags & (1<<0)) || + x86_feature_test(X86_FEATURE_KVM_CLOCKSOURCE_STABLE); + return is_stable; +} + +void pc_init_timer(unsigned int level) { + // Initialize the PIT, it's always present in PC hardware pit_init(); clock_source = CLOCK_SOURCE_PIT; + + lapic_init(); + +#if !X86_LEGACY + // XXX update note about what invariant TSC means + bool invariant_tsc = x86_feature_test(X86_FEATURE_INVAR_TSC); + LTRACEF("invariant TSC %d\n", invariant_tsc); + + // Test for hypervisor PV clock, which also effectively says if TSC is invariant across + // all cpus. + if (pvclock_init() == NO_ERROR) { + bool pv_clock_stable = pv_clock_is_stable(); + + invariant_tsc |= pv_clock_stable; + + printf("pv_clock: Clocksource is %sstable\n", (pv_clock_stable ? "" : "not ")); + } + + // XXX test for HPET and use it over PIT if present + + if (invariant_tsc) { + // We're going to try to use the TSC as a time base, obtain the TSC frequency. 
+ uint64_t tsc_hz = 0; + + tsc_hz = pvclock_get_tsc_freq(); + if (tsc_hz == 0) { + // TODO: some x86 cores describe the TSC and lapic clocks in cpuid + + // Calibrate the TSC against the PIT, which should always be present + tsc_hz = pit_calibrate_tsc(); + if (tsc_hz == 0) { + dprintf(CRITICAL, "PC: failed to calibrate TSC frequency\n"); + goto out; + } + } + + dprintf(INFO, "PC: TSC frequency %" PRIu64 "Hz\n", tsc_hz); + + // Compute the ratio of TSC to timebase + fp_32_64_div_32_32(&tsc_to_timebase, 1000, tsc_hz); + dprintf(INFO, "PC: TSC to timebase ratio %u.%08u...\n", + tsc_to_timebase.l0, tsc_to_timebase.l32); + + fp_32_64_div_32_32(&tsc_to_timebase_hires, 1000*1000, tsc_hz); + dprintf(INFO, "PC: TSC to hires timebase ratio %u.%08u...\n", + tsc_to_timebase_hires.l0, tsc_to_timebase_hires.l32); + + clock_source = CLOCK_SOURCE_TSC; + } +out: +#endif // !X86_LEGACY + + dprintf(INFO, "PC: using %s clock source\n", clock_source_name()); } LK_INIT_HOOK(pc_timer, pc_init_timer, LK_INIT_LEVEL_VM); diff --git a/scripts/do-qemux86 b/scripts/do-qemux86 index b2db507a..96e3997c 100755 --- a/scripts/do-qemux86 +++ b/scripts/do-qemux86 @@ -70,7 +70,7 @@ elif (( $DO_LEGACY )); then else QEMU="qemu-system-i386" PROJECT="pc-x86-test" - CPU=pentium3 + CPU=max MACHINE=pc fi @@ -86,7 +86,7 @@ fi ARGS="" if (( $DO_KVM )); then - ARGS+=" -enable-kvm -cpu host" + ARGS+=" -accel kvm -cpu host" else ARGS+=" -cpu $CPU" fi From d05bed3a25fb0902a177bbafd70212f6ec2366c2 Mon Sep 17 00:00:00 2001 From: Travis Geiselbrecht Date: Mon, 31 Mar 2025 00:01:45 -0700 Subject: [PATCH 13/26] [platform][pc] add local apic timer support Supports deadline TSC and regular timer support. Calibrated from the PIT if regular timer support is used. 
--- platform/pc/lapic.c | 129 ++++++++++++++++++++++++++++++++------- platform/pc/pit.c | 71 ++++++++++++++++++++- platform/pc/platform_p.h | 10 ++- platform/pc/timer.c | 53 ++++++++++++---- 4 files changed, 229 insertions(+), 34 deletions(-) diff --git a/platform/pc/lapic.c b/platform/pc/lapic.c index bb38ed8b..a5ab505f 100644 --- a/platform/pc/lapic.c +++ b/platform/pc/lapic.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -28,7 +29,13 @@ static bool lapic_present = false; static bool lapic_x2apic = false; +static bool use_tsc_deadline = false; static volatile uint32_t *lapic_mmio; +static struct fp_32_64 timebase_to_lapic; + +// TODO: move these callbacks into the shared timer code +static platform_timer_callback t_callback; +static void *callback_arg; // local apic registers enum lapic_regs { @@ -83,7 +90,6 @@ enum lapic_timer_mode { LAPIC_TIMER_MODE_TSC_DEADLINE = 2, }; - static uint32_t lapic_read(enum lapic_regs reg) { LTRACEF("reg %#x\n", reg); DEBUG_ASSERT(reg != LAPIC_ICRLO && reg != LAPIC_ICRHI); @@ -116,33 +122,60 @@ static void lapic_write_icr(uint32_t low, uint32_t apic_id) { } } -void lapic_set_oneshot_timer(uint32_t tick) { - LTRACEF("tick %u\n", tick); +status_t lapic_set_oneshot_timer(platform_timer_callback callback, void *arg, lk_time_t interval) { + LTRACEF("tick %u\n", interval); - // set the initial count, which should trigger the timer - lapic_write(LAPIC_TICR, tick); + t_callback = callback; + callback_arg = arg; + + if (use_tsc_deadline) { + uint64_t now = __builtin_ia32_rdtsc(); + uint64_t delta = time_to_tsc_ticks(interval); + uint64_t deadline = now + delta; + LTRACEF("now %llu delta %llu deadline %llu\n", now, delta, deadline); + write_msr(X86_MSR_IA32_TSC_DEADLINE, deadline); + } else { + // set the initial count, which should trigger the timer + uint64_t ticks = u64_mul_u32_fp32_64(interval, timebase_to_lapic); + if (ticks > UINT32_MAX) { + ticks = UINT32_MAX; + } + + lapic_write(LAPIC_TICR, 
ticks & 0xffffffff); + } + + return NO_ERROR; } -void lapic_cancel_oneshot_timer(void) { +void lapic_cancel_timer(void) { LTRACE; - // set the counter to 0 which disables it - lapic_write(LAPIC_TICR, 0); + if (use_tsc_deadline) { + write_msr(X86_MSR_IA32_TSC_DEADLINE, 0); + } else { + lapic_write(LAPIC_TICR, 0); + } } -enum handler_return lapic_timer_handler(void *arg) { - //PANIC_UNIMPLEMENTED; +static enum handler_return lapic_timer_handler(void *arg) { + LTRACE; -// return timer_tick(NULL, current_time()); + enum handler_return ret = INT_NO_RESCHEDULE; + if (t_callback) { + ret = t_callback(callback_arg, current_time()); + } - lapic_set_oneshot_timer(100000000); - - return INT_NO_RESCHEDULE; + return ret; } void lapic_init(void) { - if (!lapic_present) + lapic_present = x86_feature_test(X86_FEATURE_APIC); +} + +void lapic_init_postvm(uint level) { + if (!lapic_present) { return; + } dprintf(INFO, "X86: local apic detected\n"); @@ -169,7 +202,7 @@ void lapic_init(void) { // map the lapic into the kernel since it's not guaranteed that the physmap covers it if (!lapic_mmio) { - dprintf(INFO, "X86: mapping lapic into kernel\n"); + LTRACEF("mapping lapic into kernel\n"); status_t err = vmm_alloc_physical(vmm_get_kernel_aspace(), "lapic", PAGE_SIZE, (void **)&lapic_mmio, 0, apic_base & ~0xfff, /* vmm_flags */ 0, ARCH_MMU_FLAG_UNCACHED_DEVICE); ASSERT(err == NO_ERROR); @@ -186,17 +219,71 @@ void lapic_init(void) { if (eas) { dprintf(INFO, "X86: local apic EAS features %#x\n", lapic_read(LAPIC_EXT_FEATURES)); } +} +LK_INIT_HOOK(lapic_init_postvm, lapic_init_postvm, LK_INIT_LEVEL_VM + 1); - lapic_cancel_oneshot_timer(); +static uint32_t lapic_read_current_tick(void) { + if (!lapic_present) { + return 0; + } - // configure the local timer and make sure it is not set to fire - uint32_t val = (LAPIC_TIMER_MODE_ONESHOT << 17) | LAPIC_INT_TIMER; - lapic_write(LAPIC_TIMER, val); + return lapic_read(LAPIC_TCCR); +} + +void lapic_timer_init_percpu(uint level) { + // check for 
deadline mode + if (use_tsc_deadline) { + // put the timer in TSC deadline and clear the match register + uint32_t val = (LAPIC_TIMER_MODE_TSC_DEADLINE << 17) | LAPIC_INT_TIMER; + lapic_write(LAPIC_TIMER, val); + write_msr(X86_MSR_IA32_TSC_DEADLINE, 0); + } else { + // configure the local timer and make sure it is not set to fire + uint32_t val = (LAPIC_TIMER_MODE_ONESHOT << 17) | LAPIC_INT_TIMER; + lapic_write(LAPIC_TIMER, val); + lapic_write(LAPIC_TICR, 0); + } + + // register the local apic interrupts + register_int_handler_msi(LAPIC_INT_TIMER, &lapic_timer_handler, NULL, false); +} + +LK_INIT_HOOK_FLAGS(lapic_timer_init_percpu, lapic_timer_init_percpu, LK_INIT_LEVEL_VM, LK_INIT_FLAG_SECONDARY_CPUS); + +status_t lapic_timer_init(bool invariant_tsc_supported) { + if (!lapic_present) { + return ERR_NOT_FOUND; + } + + lapic_cancel_timer(); + + // check for deadline mode + bool tsc_deadline = x86_feature_test(X86_FEATURE_TSC_DEADLINE); + if (invariant_tsc_supported && tsc_deadline) { + dprintf(INFO, "X86: local apic timer supports TSC deadline mode\n"); + use_tsc_deadline = true; + } else { + // configure the local timer and make sure it is not set to fire + uint32_t val = (LAPIC_TIMER_MODE_ONESHOT << 17) | LAPIC_INT_TIMER; + lapic_write(LAPIC_TIMER, val); + + // calibrate the timer frequency + lapic_write(LAPIC_TICR, 0xffffffff); // countdown from the max count + uint32_t lapic_hz = pit_calibrate_lapic(&lapic_read_current_tick); + lapic_write(LAPIC_TICR, 0); + printf("X86: local apic timer frequency %uHz\n", lapic_hz); + + fp_32_64_div_32_32(&timebase_to_lapic, lapic_hz, 1000); + dprintf(INFO, "X86: timebase to local apic timer ratio %u.%08u...\n", + timebase_to_lapic.l0, timebase_to_lapic.l32); + } + + lapic_timer_init_percpu(0); // register the local apic interrupts register_int_handler_msi(LAPIC_INT_TIMER, &lapic_timer_handler, NULL, false); - lapic_set_oneshot_timer(1000000); + return NO_ERROR; } void lapic_eoi(unsigned int vector) { diff --git 
a/platform/pc/pit.c b/platform/pc/pit.c index 6b6faa2f..0b57f5f2 100644 --- a/platform/pc/pit.c +++ b/platform/pc/pit.c @@ -24,7 +24,6 @@ #define LOCAL_TRACE 0 - // TODO: switch this logic to lib/fixed_point math static platform_timer_callback t_callback; @@ -146,10 +145,12 @@ static void set_pit_frequency(uint32_t frequency) { } void pit_init(void) { + // start the PIT at 1Khz in free-running mode to keep a time base timer_current_time = 0; ticks_per_ms = INTERNAL_FREQ/1000; set_pit_frequency(1000); // ~1ms granularity register_int_handler(INT_PIT, &pit_timer_tick, NULL); + unmask_interrupt(INT_PIT); } status_t pit_set_periodic_timer(platform_timer_callback callback, void *arg, lk_time_t interval) { @@ -188,13 +189,32 @@ status_t pit_set_oneshot_timer(platform_timer_callback callback, void *arg, lk_t return NO_ERROR; } +void pit_cancel_timer(void) { + LTRACE; + + spin_lock_saved_state_t state; + spin_lock_irqsave(&lock, state); + + next_trigger_time = 0; + + spin_unlock_irqrestore(&lock, state); +} + void pit_stop_timer(void) { LTRACE; spin_lock_saved_state_t state; spin_lock_irqsave(&lock, state); + next_trigger_time = 0; + next_trigger_delta = 0; + + // stop the PIT + outp(I8253_CONTROL_REG, 0x34); + outp(I8253_DATA_REG, 0); // LSB + outp(I8253_DATA_REG, 0); // MSB mask_interrupt(INT_PIT); + spin_unlock_irqrestore(&lock, state); } @@ -246,4 +266,53 @@ uint64_t pit_calibrate_tsc(void) { set_pit_frequency(1000); return tsc_freq; +} + +uint32_t pit_calibrate_lapic(uint32_t (*lapic_read_tick)(void)) { + DEBUG_ASSERT(arch_ints_disabled()); + + uint64_t lapic_ticks[5] = {0}; + uint32_t countdown_ms[5] = {0}; + + for (uint i = 0; i < countof(lapic_ticks); i++) { + // calibrate the tsc frequency using the PIT + countdown_ms[i] = 2 * (i + 1); + + uint16_t pic_ticks = INTERNAL_FREQ_TICKS_PER_MS * countdown_ms[i]; + outp(I8253_CONTROL_REG, 0x30); + outp(I8253_DATA_REG, pic_ticks & 0xff); // LSB + outp(I8253_DATA_REG, pic_ticks >> 8); // MSB + + // read the tsc + uint32_t 
tick_start = lapic_read_tick(); + + // wait for countdown_ms + uint8_t status = 0; + do { + // Send a read-back command that latches the status of ch0 + outp(I8253_CONTROL_REG, 0xe2); + status = inp(I8253_DATA_REG); + // Wait for bit 7 (output) to go high and for bit 6 (null count) to go low + } while ((status & 0xc0) != 0x80); + + uint32_t tick_end = lapic_read_tick(); + lapic_ticks[i] = tick_start - tick_end; + } + + // find the best time + uint best_index = 0; + for (uint i = 1; i < countof(lapic_ticks); i++) { + if (lapic_ticks[i] < lapic_ticks[best_index]) { + best_index = i; + } + } + + // calculate the tsc frequency + uint32_t freq = (lapic_ticks[best_index] * 1000) / countdown_ms[best_index]; + dprintf(INFO, "PIT: calibrated local apic frequency: %" PRIu32 "Hz\n", freq); + + // put the PIT back to 1ms countdown + set_pit_frequency(1000); + + return freq; } \ No newline at end of file diff --git a/platform/pc/platform_p.h b/platform/pc/platform_p.h index 59df1e0f..7e945aab 100644 --- a/platform/pc/platform_p.h +++ b/platform/pc/platform_p.h @@ -21,22 +21,30 @@ void pic_init(void); void pic_enable(unsigned int vector, bool enable); void pic_eoi(unsigned int vector); void pic_mask_interrupts(void); -uint64_t pit_calibrate_tsc(void); // local apic void lapic_init(void); +status_t lapic_timer_init(bool invariant_tsc_supported); void lapic_eoi(unsigned int vector); void lapic_send_init_ipi(uint32_t apic_id, bool level); void lapic_send_startup_ipi(uint32_t apic_id, uint32_t startup_vector); void lapic_send_ipi(uint32_t apic_id, uint32_t vector); +status_t lapic_set_oneshot_timer(platform_timer_callback callback, void *arg, lk_time_t interval); +void lapic_cancel_timer(void); + +uint64_t time_to_tsc_ticks(lk_time_t time); + // programable interval timer void pit_init(void); status_t pit_set_periodic_timer(platform_timer_callback callback, void *arg, lk_time_t interval); status_t pit_set_oneshot_timer(platform_timer_callback callback, void *arg, lk_time_t 
interval); +void pit_cancel_timer(void); void pit_stop_timer(void); lk_time_t pit_current_time(void); lk_bigtime_t pit_current_time_hires(void); +uint64_t pit_calibrate_tsc(void); +uint32_t pit_calibrate_lapic(uint32_t (*lapic_read_tick)(void)); // secondary cpus void platform_start_secondary_cpus(void); diff --git a/platform/pc/timer.c b/platform/pc/timer.c index 03e77cdc..71460e22 100644 --- a/platform/pc/timer.c +++ b/platform/pc/timer.c @@ -22,7 +22,7 @@ #include #include -#define LOCAL_TRACE 1 +#define LOCAL_TRACE 0 // Deals with all of the various clock sources and event timers on the PC platform. @@ -33,8 +33,10 @@ static enum clock_source { CLOCK_SOURCE_HPET, } clock_source = CLOCK_SOURCE_INITIAL; -struct fp_32_64 tsc_to_timebase; -struct fp_32_64 tsc_to_timebase_hires; +static struct fp_32_64 tsc_to_timebase; +static struct fp_32_64 tsc_to_timebase_hires; +static struct fp_32_64 timebase_to_tsc; +static bool use_lapic_timer = false; static const char *clock_source_name(void) { switch (clock_source) { @@ -138,6 +140,11 @@ status_t pvclock_init(void) { return NO_ERROR; } +// Convert lk_time_t to TSC ticks +uint64_t time_to_tsc_ticks(lk_time_t time) { + return u64_mul_u32_fp32_64(time, timebase_to_tsc); +} + uint64_t pvclock_get_tsc_freq(void) { uint32_t tsc_mul = 0; int8_t tsc_shift = 0; @@ -182,26 +189,24 @@ void pc_init_timer(unsigned int level) { pit_init(); clock_source = CLOCK_SOURCE_PIT; - lapic_init(); - #if !X86_LEGACY // XXX update note about what invariant TSC means - bool invariant_tsc = x86_feature_test(X86_FEATURE_INVAR_TSC); - LTRACEF("invariant TSC %d\n", invariant_tsc); + bool use_invariant_tsc = x86_feature_test(X86_FEATURE_INVAR_TSC); + LTRACEF("invariant TSC %d\n", use_invariant_tsc); // Test for hypervisor PV clock, which also effectively says if TSC is invariant across // all cpus. 
if (pvclock_init() == NO_ERROR) { bool pv_clock_stable = pv_clock_is_stable(); - invariant_tsc |= pv_clock_stable; + use_invariant_tsc |= pv_clock_stable; printf("pv_clock: Clocksource is %sstable\n", (pv_clock_stable ? "" : "not ")); } // XXX test for HPET and use it over PIT if present - if (invariant_tsc) { + if (use_invariant_tsc) { // We're going to try to use the TSC as a time base, obtain the TSC frequency. uint64_t tsc_hz = 0; @@ -228,25 +233,51 @@ void pc_init_timer(unsigned int level) { dprintf(INFO, "PC: TSC to hires timebase ratio %u.%08u...\n", tsc_to_timebase_hires.l0, tsc_to_timebase_hires.l32); + fp_32_64_div_32_32(&timebase_to_tsc, tsc_hz, 1000); + dprintf(INFO, "PC: timebase to TSC ratio %u.%08u...\n", + timebase_to_tsc.l0, timebase_to_tsc.l32); + clock_source = CLOCK_SOURCE_TSC; } out: + + // Set up the local apic for event timer interrupts + if (lapic_timer_init(use_invariant_tsc) == NO_ERROR) { + dprintf(INFO, "PC: using LAPIC timer for event timer\n"); + use_lapic_timer = true; + } + + // If we're not using the PIT for time base and using the LAPIC timer for events, stop the PIT. 
+ if (use_lapic_timer && clock_source != CLOCK_SOURCE_PIT) { + pit_stop_timer(); + } + #endif // !X86_LEGACY dprintf(INFO, "PC: using %s clock source\n", clock_source_name()); } -LK_INIT_HOOK(pc_timer, pc_init_timer, LK_INIT_LEVEL_VM); +LK_INIT_HOOK(pc_timer, pc_init_timer, LK_INIT_LEVEL_VM + 2); status_t platform_set_periodic_timer(platform_timer_callback callback, void *arg, lk_time_t interval) { + if (use_lapic_timer) { + PANIC_UNIMPLEMENTED; + } return pit_set_periodic_timer(callback, arg, interval); } status_t platform_set_oneshot_timer(platform_timer_callback callback, void *arg, lk_time_t interval) { + if (use_lapic_timer) { + return lapic_set_oneshot_timer(callback, arg, interval); + } return pit_set_oneshot_timer(callback, arg, interval); } void platform_stop_timer(void) { - pit_stop_timer(); + if (use_lapic_timer) { + lapic_cancel_timer(); + } else { + pit_cancel_timer(); + } } From 21ce5333272b2c16d241bb1bef8cd3a75cfc6537 Mon Sep 17 00:00:00 2001 From: Travis Geiselbrecht Date: Mon, 31 Mar 2025 01:05:13 -0700 Subject: [PATCH 14/26] [platform][pc] make sure lapic is initialized per cpu Actually boots secondaries to the point where it actually panics because of missing IPI support. 
--- platform/pc/lapic.c | 24 +++++++++++++++++++----- platform/pc/mp.c | 1 + 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/platform/pc/lapic.c b/platform/pc/lapic.c index a5ab505f..36030e87 100644 --- a/platform/pc/lapic.c +++ b/platform/pc/lapic.c @@ -37,6 +37,8 @@ static struct fp_32_64 timebase_to_lapic; static platform_timer_callback t_callback; static void *callback_arg; +static void lapic_timer_init_percpu(void); + // local apic registers enum lapic_regs { LAPIC_ID = 0x20, @@ -187,7 +189,7 @@ void lapic_init_postvm(uint level) { if ((apic_base & (1u<<11)) == 0) { dprintf(INFO, "X86: enabling lapic\n"); apic_base |= (1u<<11); - write_msr(0x1b, apic_base); + write_msr(X86_MSR_IA32_APIC_BASE, apic_base); } dprintf(INFO, "X86: lapic physical address %#llx\n", apic_base & ~0xfff); @@ -222,6 +224,20 @@ void lapic_init_postvm(uint level) { } LK_INIT_HOOK(lapic_init_postvm, lapic_init_postvm, LK_INIT_LEVEL_VM + 1); +void lapic_init_percpu(uint level) { + // Make sure the apic is enabled and x2apic mode is set (if supported) + uint64_t apic_base = read_msr(X86_MSR_IA32_APIC_BASE); + apic_base |= (1u<<11); + if (lapic_x2apic) { + apic_base |= (1u<<10); + } + write_msr(X86_MSR_IA32_APIC_BASE, apic_base); + + lapic_timer_init_percpu(); +} + +LK_INIT_HOOK_FLAGS(lapic_init_percpu, lapic_init_percpu, LK_INIT_LEVEL_VM, LK_INIT_FLAG_SECONDARY_CPUS); + static uint32_t lapic_read_current_tick(void) { if (!lapic_present) { return 0; @@ -230,7 +246,7 @@ static uint32_t lapic_read_current_tick(void) { return lapic_read(LAPIC_TCCR); } -void lapic_timer_init_percpu(uint level) { +static void lapic_timer_init_percpu(void) { // check for deadline mode if (use_tsc_deadline) { // put the timer in TSC deadline and clear the match register @@ -248,8 +264,6 @@ void lapic_timer_init_percpu(uint level) { register_int_handler_msi(LAPIC_INT_TIMER, &lapic_timer_handler, NULL, false); } -LK_INIT_HOOK_FLAGS(lapic_timer_init_percpu, lapic_timer_init_percpu, LK_INIT_LEVEL_VM, 
LK_INIT_FLAG_SECONDARY_CPUS); - status_t lapic_timer_init(bool invariant_tsc_supported) { if (!lapic_present) { return ERR_NOT_FOUND; @@ -278,7 +292,7 @@ status_t lapic_timer_init(bool invariant_tsc_supported) { timebase_to_lapic.l0, timebase_to_lapic.l32); } - lapic_timer_init_percpu(0); + lapic_timer_init_percpu(); // register the local apic interrupts register_int_handler_msi(LAPIC_INT_TIMER, &lapic_timer_handler, NULL, false); diff --git a/platform/pc/mp.c b/platform/pc/mp.c index 07ddf487..2fc4908c 100644 --- a/platform/pc/mp.c +++ b/platform/pc/mp.c @@ -92,6 +92,7 @@ static void local_apic_callback(const void *_entry, size_t entry_len, void *cook struct detected_cpus *cpus = cookie; // TODO: read the current APIC id and skip it, instead of assuming 0 is the boot cpu + // read BSP from X86_IA32_APIC_BASE_MSR bit 8? if (entry->apic_id == 0) { // skip the boot cpu return; From 5a520eca3ed6313e0aac3b7eec6101ebc8b8906b Mon Sep 17 00:00:00 2001 From: Travis Geiselbrecht Date: Tue, 1 Apr 2025 00:40:50 -0700 Subject: [PATCH 15/26] [arch][x86] start getting inter-processor-interrupts working -Move the local apic driver to arch/x86 -Add routines to send IPIs between cpus Something is unstable at the moment and the system crashes after a while with random corruptions when using SMP. 
--- arch/x86/32/exceptions.S | 2 - arch/x86/64/exceptions.S | 2 - arch/x86/arch.c | 3 + arch/x86/include/arch/x86/lapic.h | 26 +++++++ arch/x86/include/arch/x86/mp.h | 9 ++- {platform/pc => arch/x86}/lapic.c | 91 +++++++++++++++++++------ arch/x86/mp.c | 45 +++++++++--- arch/x86/pv.c | 7 ++ arch/x86/rules.mk | 3 + platform/pc/include/platform/pc/timer.h | 15 ++++ platform/pc/interrupts.c | 1 + platform/pc/mp.c | 1 + platform/pc/pit.c | 1 + platform/pc/platform_p.h | 14 ---- platform/pc/rules.mk | 1 - platform/pc/timer.c | 5 +- 16 files changed, 172 insertions(+), 54 deletions(-) create mode 100644 arch/x86/include/arch/x86/lapic.h rename {platform/pc => arch/x86}/lapic.c (78%) create mode 100644 arch/x86/pv.c create mode 100644 platform/pc/include/platform/pc/timer.h diff --git a/arch/x86/32/exceptions.S b/arch/x86/32/exceptions.S index 9e980e07..6cb06277 100644 --- a/arch/x86/32/exceptions.S +++ b/arch/x86/32/exceptions.S @@ -83,8 +83,6 @@ FUNCTION(setup_idt) loop .Lloop - lidt _idtr - ret END_FUNCTION(setup_idt) diff --git a/arch/x86/64/exceptions.S b/arch/x86/64/exceptions.S index c8f7ed9e..092056a1 100644 --- a/arch/x86/64/exceptions.S +++ b/arch/x86/64/exceptions.S @@ -113,8 +113,6 @@ FUNCTION(setup_idt) loop .Lloop - lidt _idtr - ret END_FUNCTION(setup_idt) diff --git a/arch/x86/arch.c b/arch/x86/arch.c index daa1dd3f..a20dc873 100644 --- a/arch/x86/arch.c +++ b/arch/x86/arch.c @@ -80,6 +80,9 @@ void x86_early_init_percpu(void) { x86_set_gdt_descriptor(selector, &system_tss, sizeof(system_tss), 1, 0, 0, SEG_TYPE_TSS, 0, 0); x86_ltr(selector); + /* load the kernel's IDT */ + asm("lidt _idtr"); + x86_mmu_early_init_percpu(); #if X86_WITH_FPU x86_fpu_early_init_percpu(); diff --git a/arch/x86/include/arch/x86/lapic.h b/arch/x86/include/arch/x86/lapic.h new file mode 100644 index 00000000..2fa421af --- /dev/null +++ b/arch/x86/include/arch/x86/lapic.h @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2025 Travis Geiselbrecht + * + * Use of this source code is governed by a 
MIT-style + * license that can be found in the LICENSE file or at + * https://opensource.org/licenses/MIT + */ + +#pragma once + +#include +#include +#include +#include + +// local apic +void lapic_init(void); +status_t lapic_timer_init(bool invariant_tsc_supported); +void lapic_eoi(unsigned int vector); +void lapic_send_init_ipi(uint32_t apic_id, bool level); +void lapic_send_startup_ipi(uint32_t apic_id, uint32_t startup_vector); +void lapic_send_ipi(uint32_t apic_id, mp_ipi_t ipi); + +status_t lapic_set_oneshot_timer(platform_timer_callback callback, void *arg, lk_time_t interval); +void lapic_cancel_timer(void); + diff --git a/arch/x86/include/arch/x86/mp.h b/arch/x86/include/arch/x86/mp.h index c668f4d0..8b4c6e0b 100644 --- a/arch/x86/include/arch/x86/mp.h +++ b/arch/x86/include/arch/x86/mp.h @@ -16,7 +16,7 @@ typedef struct x86_percpu { struct x86_percpu *self; uint cpu_num; - uint apic_id; + uint32_t apic_id; struct thread *current_thread; @@ -70,10 +70,13 @@ static inline uint x86_get_cpu_num(void) { } // get the current apic id -static inline uint x86_get_apic_id(void) { +static inline uint32_t x86_get_apic_id(void) { return x86_read_gs_offset32(X86_PERCPU_FIELD_OFFSET(apic_id)); } +// read it from hardware directly +uint32_t x86_get_apic_id_from_hardware(void); + // get/set the current thread struct thread; @@ -83,4 +86,4 @@ static inline struct thread *x86_get_current_thread(void) { static inline void x86_set_current_thread(struct thread *t) { x86_write_gs_offset_ptr(X86_PERCPU_FIELD_OFFSET(current_thread), t); -} \ No newline at end of file +} diff --git a/platform/pc/lapic.c b/arch/x86/lapic.c similarity index 78% rename from platform/pc/lapic.c rename to arch/x86/lapic.c index 36030e87..4b7e0793 100644 --- a/platform/pc/lapic.c +++ b/arch/x86/lapic.c @@ -5,6 +5,8 @@ * license that can be found in the LICENSE file or at * https://opensource.org/licenses/MIT */ +#include "arch/x86/lapic.h" + #include #include #include @@ -20,10 +22,10 @@ #include 
#include #include -#include +#include +#include #include - -#include "platform_p.h" +#include #define LOCAL_TRACE 0 @@ -37,7 +39,7 @@ static struct fp_32_64 timebase_to_lapic; static platform_timer_callback t_callback; static void *callback_arg; -static void lapic_timer_init_percpu(void); +static void lapic_init_percpu(uint level); // local apic registers enum lapic_regs { @@ -82,6 +84,7 @@ enum lapic_regs { enum lapic_interrupts { LAPIC_INT_TIMER = 0xf8, + LAPIC_INT_SPURIOUS, LAPIC_INT_GENERIC, LAPIC_INT_RESCHEDULE, }; @@ -93,7 +96,7 @@ enum lapic_timer_mode { }; static uint32_t lapic_read(enum lapic_regs reg) { - LTRACEF("reg %#x\n", reg); + LTRACEF_LEVEL(2, "reg %#x\n", reg); DEBUG_ASSERT(reg != LAPIC_ICRLO && reg != LAPIC_ICRHI); if (lapic_x2apic) { // TODO: do we need barriers here? @@ -104,7 +107,7 @@ static uint32_t lapic_read(enum lapic_regs reg) { } static void lapic_write(enum lapic_regs reg, uint32_t val) { - LTRACEF("reg %#x val %#x\n", reg, val); + LTRACEF_LEVEL(2, "reg %#x val %#x\n", reg, val); DEBUG_ASSERT(reg != LAPIC_ICRLO && reg != LAPIC_ICRHI); if (lapic_x2apic) { write_msr(X86_MSR_IA32_X2APIC_BASE + reg / 0x10, val); @@ -115,7 +118,7 @@ static void lapic_write(enum lapic_regs reg, uint32_t val) { // special case to write to the ICR register static void lapic_write_icr(uint32_t low, uint32_t apic_id) { - LTRACEF("%#x apic_id %#x\n", low, apic_id); + LTRACEF_LEVEL(2, "%#x apic_id %#x\n", low, apic_id); if (lapic_x2apic) { write_msr(X86_MSR_IA32_X2APIC_BASE + 0x30, ((uint64_t)apic_id << 32) | low); } else { @@ -125,7 +128,9 @@ static void lapic_write_icr(uint32_t low, uint32_t apic_id) { } status_t lapic_set_oneshot_timer(platform_timer_callback callback, void *arg, lk_time_t interval) { - LTRACEF("tick %u\n", interval); + LTRACEF("cpu %u interval %u\n", arch_curr_cpu_num(), interval); + + DEBUG_ASSERT(arch_ints_disabled()); t_callback = callback; callback_arg = arg; @@ -152,6 +157,8 @@ status_t lapic_set_oneshot_timer(platform_timer_callback 
callback, void *arg, lk void lapic_cancel_timer(void) { LTRACE; + DEBUG_ASSERT(arch_ints_disabled()); + if (use_tsc_deadline) { write_msr(X86_MSR_IA32_TSC_DEADLINE, 0); } else { @@ -160,7 +167,7 @@ void lapic_cancel_timer(void) { } static enum handler_return lapic_timer_handler(void *arg) { - LTRACE; + LTRACEF("cpu %u\n", arch_curr_cpu_num()); enum handler_return ret = INT_NO_RESCHEDULE; if (t_callback) { @@ -170,11 +177,29 @@ static enum handler_return lapic_timer_handler(void *arg) { return ret; } +static enum handler_return lapic_spurious_handler(void *arg) { + LTRACEF("cpu %u, arg %p\n", arch_curr_cpu_num(), arg); + + return INT_NO_RESCHEDULE; +} + +static enum handler_return lapic_generic_handler(void *arg) { + LTRACEF("cpu %u, arg %p\n", arch_curr_cpu_num(), arg); + + return INT_NO_RESCHEDULE; +} + +static enum handler_return lapic_reschedule_handler(void *arg) { + LTRACEF("cpu %u, arg %p\n", arch_curr_cpu_num(), arg); + + return mp_mbx_reschedule_irq(); +} + void lapic_init(void) { lapic_present = x86_feature_test(X86_FEATURE_APIC); } -void lapic_init_postvm(uint level) { +static void lapic_init_postvm(uint level) { if (!lapic_present) { return; } @@ -221,10 +246,13 @@ void lapic_init_postvm(uint level) { if (eas) { dprintf(INFO, "X86: local apic EAS features %#x\n", lapic_read(LAPIC_EXT_FEATURES)); } + + // Finish up some local initialization that all cpus will want to do + lapic_init_percpu(0); } LK_INIT_HOOK(lapic_init_postvm, lapic_init_postvm, LK_INIT_LEVEL_VM + 1); -void lapic_init_percpu(uint level) { +static void lapic_init_percpu(uint level) { // Make sure the apic is enabled and x2apic mode is set (if supported) uint64_t apic_base = read_msr(X86_MSR_IA32_APIC_BASE); apic_base |= (1u<<11); @@ -233,7 +261,15 @@ void lapic_init_percpu(uint level) { } write_msr(X86_MSR_IA32_APIC_BASE, apic_base); - lapic_timer_init_percpu(); + // set the spurious vector register + uint32_t svr = (LAPIC_INT_SPURIOUS | (1u<<8)); // enable + lapic_write(LAPIC_SVR, svr); + 
+ TRACEF("lapic svr %#x\n", lapic_read(LAPIC_SVR)); + + register_int_handler_msi(LAPIC_INT_SPURIOUS, &lapic_spurious_handler, NULL, false); + register_int_handler_msi(LAPIC_INT_GENERIC, &lapic_generic_handler, NULL, false); + register_int_handler_msi(LAPIC_INT_RESCHEDULE, &lapic_reschedule_handler, NULL, false); } LK_INIT_HOOK_FLAGS(lapic_init_percpu, lapic_init_percpu, LK_INIT_LEVEL_VM, LK_INIT_FLAG_SECONDARY_CPUS); @@ -246,7 +282,7 @@ static uint32_t lapic_read_current_tick(void) { return lapic_read(LAPIC_TCCR); } -static void lapic_timer_init_percpu(void) { +static void lapic_timer_init_percpu(uint level) { // check for deadline mode if (use_tsc_deadline) { // put the timer in TSC deadline and clear the match register @@ -260,17 +296,16 @@ static void lapic_timer_init_percpu(void) { lapic_write(LAPIC_TICR, 0); } - // register the local apic interrupts + // register the timer interrupt vector register_int_handler_msi(LAPIC_INT_TIMER, &lapic_timer_handler, NULL, false); } +LK_INIT_HOOK_FLAGS(lapic_timer_init_percpu, lapic_timer_init_percpu, LK_INIT_LEVEL_VM + 1, LK_INIT_FLAG_SECONDARY_CPUS); status_t lapic_timer_init(bool invariant_tsc_supported) { if (!lapic_present) { return ERR_NOT_FOUND; } - lapic_cancel_timer(); - // check for deadline mode bool tsc_deadline = x86_feature_test(X86_FEATURE_TSC_DEADLINE); if (invariant_tsc_supported && tsc_deadline) { @@ -292,10 +327,7 @@ status_t lapic_timer_init(bool invariant_tsc_supported) { timebase_to_lapic.l0, timebase_to_lapic.l32); } - lapic_timer_init_percpu(); - - // register the local apic interrupts - register_int_handler_msi(LAPIC_INT_TIMER, &lapic_timer_handler, NULL, false); + lapic_timer_init_percpu(0); return NO_ERROR; } @@ -325,10 +357,25 @@ void lapic_send_startup_ipi(uint32_t apic_id, uint32_t startup_vector) { lapic_write_icr((6u << 8) | (startup_vector >> 12), apic_id); } -void lapic_send_ipi(uint32_t apic_id, uint32_t vector) { +void lapic_send_ipi(uint32_t apic_id, mp_ipi_t ipi) { if (!lapic_present) { 
return; } - lapic_write_icr(vector, apic_id); + LTRACEF("cpu %u target apic_id %#x, ipi %u\n", arch_curr_cpu_num(), apic_id, ipi); + + uint32_t vector; + switch (ipi) { + case MP_IPI_GENERIC: + vector = LAPIC_INT_GENERIC; + break; + case MP_IPI_RESCHEDULE: + vector = LAPIC_INT_RESCHEDULE; + break; + default: + panic("X86: unknown IPI %u\n", ipi); + } + + // send fixed mode, level asserted, no destination shorthand interrupt + lapic_write_icr(vector | (1U << 14), apic_id); } \ No newline at end of file diff --git a/arch/x86/mp.c b/arch/x86/mp.c index 284d3614..36c11e88 100644 --- a/arch/x86/mp.c +++ b/arch/x86/mp.c @@ -18,8 +18,9 @@ #include #include #include +#include -#define LOCAL_TRACE 1 +#define LOCAL_TRACE 0 #if WITH_SMP @@ -58,30 +59,56 @@ void x86_configure_percpu_early(uint cpu_num, uint apic_id) { } status_t arch_mp_send_ipi(mp_cpu_mask_t target, mp_ipi_t ipi) { - LTRACEF("caller %#x target 0x%x, ipi 0x%x\n", arch_curr_cpu_num(), target, ipi); + LTRACEF("cpu %u target 0x%x, ipi 0x%x\n", arch_curr_cpu_num(), target, ipi); - // XXX call into local apic code to send IPI + DEBUG_ASSERT(arch_ints_disabled()); + uint curr_cpu_num = arch_curr_cpu_num(); - PANIC_UNIMPLEMENTED; + // translate the target bitmap to apic id + while (target) { + uint cpu_num = __builtin_ctz(target); + target &= ~(1u << cpu_num); + + // skip the current cpu + if (cpu_num == curr_cpu_num) { + continue; + } + + x86_percpu_t *percpu = x86_get_percpu_for_cpu(cpu_num); + uint32_t apic_id = percpu->apic_id; + + // send the ipi to the target cpu + lapic_send_ipi(apic_id, ipi); + } + + return NO_ERROR; } void arch_mp_init_percpu(void) { } -static uintptr_t x86_get_apic_id_from_hardware(void) { - // read the apic id out of the hardware - return read_msr(X86_MSR_IA32_APIC_BASE) >> 24; +uint32_t x86_get_apic_id_from_hardware(void) { + // read the apic id out of cpuid leaf 1, which should be present if SMP is enabled. 
+ uint32_t apic_id, unused; + cpuid(0x1, &unused, &apic_id, &unused, &unused); + + apic_id >>= 24; + + // TODO: read full 32bit apic id from x2apic msr if available + + return apic_id; } void x86_secondary_entry(uint cpu_num) { - x86_configure_percpu_early(cpu_num, x86_get_apic_id_from_hardware()); + uint32_t apic_id = x86_get_apic_id_from_hardware(); + x86_configure_percpu_early(cpu_num, apic_id); x86_early_init_percpu(); // run early secondary cpu init routines up to the threading level lk_init_level(LK_INIT_FLAG_SECONDARY_CPUS, LK_INIT_LEVEL_EARLIEST, LK_INIT_LEVEL_THREADING - 1); - dprintf(INFO, "SMP: secondary cpu %u started\n", arch_curr_cpu_num()); + dprintf(INFO, "SMP: secondary cpu %u started, apic id %u\n", arch_curr_cpu_num(), apic_id); lk_secondary_cpu_entry(); diff --git a/arch/x86/pv.c b/arch/x86/pv.c new file mode 100644 index 00000000..fdf8b40f --- /dev/null +++ b/arch/x86/pv.c @@ -0,0 +1,7 @@ +/* + * Copyright (c) 2025 Travis Geiselbrecht + * + * Use of this source code is governed by a MIT-style + * license that can be found in the LICENSE file or at + * https://opensource.org/licenses/MIT + */ diff --git a/arch/x86/rules.mk b/arch/x86/rules.mk index c6d83503..a6b31948 100644 --- a/arch/x86/rules.mk +++ b/arch/x86/rules.mk @@ -3,6 +3,7 @@ LOCAL_DIR := $(GET_LOCAL_DIR) MODULE := $(LOCAL_DIR) MODULE_OPTIONS := extra_warnings +MODULE_DEPS := lib/fixed_point # x86 code always runs with the mmu enabled WITH_KERNEL_VM := 1 @@ -74,7 +75,9 @@ MODULE_SRCS += \ $(LOCAL_DIR)/descriptor.c \ $(LOCAL_DIR)/faults.c \ $(LOCAL_DIR)/feature.c \ + $(LOCAL_DIR)/lapic.c \ $(LOCAL_DIR)/mp.c \ + $(LOCAL_DIR)/pv.c \ $(LOCAL_DIR)/thread.c \ # legacy x86's dont have fpu support diff --git a/platform/pc/include/platform/pc/timer.h b/platform/pc/include/platform/pc/timer.h new file mode 100644 index 00000000..a06aceff --- /dev/null +++ b/platform/pc/include/platform/pc/timer.h @@ -0,0 +1,15 @@ +/* + * Copyright (c) 2025 Travis Geiselbrecht + * + * Use of this source code is 
governed by a MIT-style + * license that can be found in the LICENSE file or at + * https://opensource.org/licenses/MIT + */ +#pragma once + +#include +#include + +// A few shared timer routines needed by the arch/x86 layer +uint32_t pit_calibrate_lapic(uint32_t (*lapic_read_tick)(void)); +uint64_t time_to_tsc_ticks(lk_time_t time); diff --git a/platform/pc/interrupts.c b/platform/pc/interrupts.c index 89dad926..47c6fbaf 100644 --- a/platform/pc/interrupts.c +++ b/platform/pc/interrupts.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include "platform_p.h" #include diff --git a/platform/pc/mp.c b/platform/pc/mp.c index 2fc4908c..85ffdd7f 100644 --- a/platform/pc/mp.c +++ b/platform/pc/mp.c @@ -14,6 +14,7 @@ #include #include #include +#include #if WITH_SMP diff --git a/platform/pc/pit.c b/platform/pc/pit.c index 0b57f5f2..f0f7c97c 100644 --- a/platform/pc/pit.c +++ b/platform/pc/pit.c @@ -18,6 +18,7 @@ #include #include #include +#include #include "platform_p.h" #include #include diff --git a/platform/pc/platform_p.h b/platform/pc/platform_p.h index 7e945aab..bd6e428a 100644 --- a/platform/pc/platform_p.h +++ b/platform/pc/platform_p.h @@ -22,19 +22,6 @@ void pic_enable(unsigned int vector, bool enable); void pic_eoi(unsigned int vector); void pic_mask_interrupts(void); -// local apic -void lapic_init(void); -status_t lapic_timer_init(bool invariant_tsc_supported); -void lapic_eoi(unsigned int vector); -void lapic_send_init_ipi(uint32_t apic_id, bool level); -void lapic_send_startup_ipi(uint32_t apic_id, uint32_t startup_vector); -void lapic_send_ipi(uint32_t apic_id, uint32_t vector); - -status_t lapic_set_oneshot_timer(platform_timer_callback callback, void *arg, lk_time_t interval); -void lapic_cancel_timer(void); - -uint64_t time_to_tsc_ticks(lk_time_t time); - // programable interval timer void pit_init(void); status_t pit_set_periodic_timer(platform_timer_callback callback, void *arg, lk_time_t interval); @@ -44,7 +31,6 @@ void 
pit_stop_timer(void); lk_time_t pit_current_time(void); lk_bigtime_t pit_current_time_hires(void); uint64_t pit_calibrate_tsc(void); -uint32_t pit_calibrate_lapic(uint32_t (*lapic_read_tick)(void)); // secondary cpus void platform_start_secondary_cpus(void); diff --git a/platform/pc/rules.mk b/platform/pc/rules.mk index dbee0748..2ddfd39c 100644 --- a/platform/pc/rules.mk +++ b/platform/pc/rules.mk @@ -22,7 +22,6 @@ MODULE_SRCS += \ $(LOCAL_DIR)/ide.c \ $(LOCAL_DIR)/interrupts.c \ $(LOCAL_DIR)/keyboard.c \ - $(LOCAL_DIR)/lapic.c \ $(LOCAL_DIR)/mp.c \ $(LOCAL_DIR)/mp-boot.S \ $(LOCAL_DIR)/pic.c \ diff --git a/platform/pc/timer.c b/platform/pc/timer.c index 71460e22..ec50e712 100644 --- a/platform/pc/timer.c +++ b/platform/pc/timer.c @@ -16,12 +16,15 @@ #include #include #include -#include "platform_p.h" +#include #include #include +#include #include #include +#include "platform_p.h" + #define LOCAL_TRACE 0 // Deals with all of the various clock sources and event timers on the PC platform. From 8fdadd9b339d43914e65002ed9b4847385a12664 Mon Sep 17 00:00:00 2001 From: Travis Geiselbrecht Date: Tue, 1 Apr 2025 20:02:00 -0700 Subject: [PATCH 16/26] [arch][x86] implement basic spinlocks -This fixes the instability, seems stable on x86-64. 
--- arch/x86/32/spinlock.S | 44 ++++++++++++++++++++++++++++++++ arch/x86/64/spinlock.S | 40 +++++++++++++++++++++++++++++ arch/x86/include/arch/spinlock.h | 13 ++++++++-- arch/x86/rules.mk | 1 + 4 files changed, 96 insertions(+), 2 deletions(-) create mode 100644 arch/x86/32/spinlock.S create mode 100644 arch/x86/64/spinlock.S diff --git a/arch/x86/32/spinlock.S b/arch/x86/32/spinlock.S new file mode 100644 index 00000000..bf48ca2b --- /dev/null +++ b/arch/x86/32/spinlock.S @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2025 Travis Geiselbrecht + * + * Use of this source code is governed by a MIT-style + * license that can be found in the LICENSE file or at + * https://opensource.org/licenses/MIT + */ +#include + +#if WITH_SMP + +// void arch_spin_lock(spin_lock_t *lock); +FUNCTION(arch_spin_lock) + mov 4(%esp), %ecx + + mov $1, %edx +0: + xor %eax, %eax + lock cmpxchg %edx, (%ecx) + jz 1f + pause + jmp 0b +1: + ret +END_FUNCTION(arch_spin_lock) + +// int arch_spin_trylock(spin_lock_t *lock); +FUNCTION(arch_spin_trylock) + mov 4(%esp), %ecx + + mov $1, %eax + lock xchg %eax, (%ecx) + + ret +END_FUNCTION(arch_spin_trylock) + +// void arch_spin_unlock(spin_lock_t *lock); +FUNCTION(arch_spin_unlock) + mov 4(%esp), %ecx + movl $0, (%ecx) + ret +END_FUNCTION(arch_spin_unlock) + +#endif // WITH_SMP \ No newline at end of file diff --git a/arch/x86/64/spinlock.S b/arch/x86/64/spinlock.S new file mode 100644 index 00000000..53082af7 --- /dev/null +++ b/arch/x86/64/spinlock.S @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2025 Travis Geiselbrecht + * + * Use of this source code is governed by a MIT-style + * license that can be found in the LICENSE file or at + * https://opensource.org/licenses/MIT + */ +#include + +#if WITH_SMP + +// void arch_spin_lock(spin_lock_t *lock); +FUNCTION(arch_spin_lock) + mov $1, %esi +0: + xor %eax, %eax + lock cmpxchg %esi, (%rdi) + jz 1f + pause + jmp 0b +1: + ret +END_FUNCTION(arch_spin_lock) + +// int arch_spin_trylock(spin_lock_t *lock); 
+FUNCTION(arch_spin_trylock) + mov $1, %eax + + lock xchg %eax, (%rdi) + + ret +END_FUNCTION(arch_spin_trylock) + +// void arch_spin_unlock(spin_lock_t *lock); +FUNCTION(arch_spin_unlock) + movl $0, (%rdi) + ret +END_FUNCTION(arch_spin_unlock) + +#endif // WITH_SMP \ No newline at end of file diff --git a/arch/x86/include/arch/spinlock.h b/arch/x86/include/arch/spinlock.h index a875846c..5c796028 100644 --- a/arch/x86/include/arch/spinlock.h +++ b/arch/x86/include/arch/spinlock.h @@ -7,13 +7,16 @@ */ #pragma once +#include #include #include #include #define SPIN_LOCK_INITIAL_VALUE (0) -typedef unsigned long spin_lock_t; +__BEGIN_CDECLS + +typedef unsigned int spin_lock_t; typedef x86_flags_t spin_lock_saved_state_t; typedef uint spin_lock_save_flags_t; @@ -27,6 +30,11 @@ static inline bool arch_spin_lock_held(spin_lock_t *lock) { return *lock != 0; } +#if WITH_SMP +void arch_spin_lock(spin_lock_t *lock); +int arch_spin_trylock(spin_lock_t *lock); +void arch_spin_unlock(spin_lock_t *lock); +#else static inline void arch_spin_lock(spin_lock_t *lock) { *lock = 1; } @@ -38,6 +46,7 @@ static inline int arch_spin_trylock(spin_lock_t *lock) { static inline void arch_spin_unlock(spin_lock_t *lock) { *lock = 0; } +#endif /* flags are unused on x86 */ #define ARCH_DEFAULT_SPIN_LOCK_FLAG_INTERRUPTS 0 @@ -53,4 +62,4 @@ arch_interrupt_restore(spin_lock_saved_state_t old_state, spin_lock_save_flags_t x86_restore_flags(old_state); } - +__END_CDECLS diff --git a/arch/x86/rules.mk b/arch/x86/rules.mk index a6b31948..1737cce2 100644 --- a/arch/x86/rules.mk +++ b/arch/x86/rules.mk @@ -69,6 +69,7 @@ MODULE_SRCS += \ $(SUBARCH_DIR)/gdt.S \ $(SUBARCH_DIR)/mmu.c \ $(SUBARCH_DIR)/ops.S \ + $(SUBARCH_DIR)/spinlock.S \ \ $(LOCAL_DIR)/arch.c \ $(LOCAL_DIR)/cache.c \ From 71e795de19c79caa6ee0053b243eb5766cefa9b2 Mon Sep 17 00:00:00 2001 From: Travis Geiselbrecht Date: Sun, 6 Apr 2025 19:09:32 -0700 Subject: [PATCH 17/26] [arch][x86] get SMP working on x86-32 - Added very basic user page table 
support (needed to bootstrap the secondary cpus) - Added MP bootup code for 32bit. --- arch/x86/32/mmu.c | 79 +++++++++++++++++++++++++++++++++---------- arch/x86/64/mmu.c | 6 +++- arch/x86/faults.c | 1 + arch/x86/mp.c | 5 ++- platform/pc/mp-boot.S | 69 +++++++++++++++++++++++++++---------- platform/pc/mp.c | 1 + 6 files changed, 121 insertions(+), 40 deletions(-) diff --git a/arch/x86/32/mmu.c b/arch/x86/32/mmu.c index ff36507f..e00a28c1 100644 --- a/arch/x86/32/mmu.c +++ b/arch/x86/32/mmu.c @@ -23,7 +23,12 @@ #include #include +// TODO: +// - proper tlb flush (local and SMP) +// - synchronization of top level page tables for user space aspaces + #define LOCAL_TRACE 0 +#define TRACE_CONTEXT_SWITCH 0 /* top level kernel page tables, initialized in start.S */ #if X86_LEGACY @@ -309,8 +314,6 @@ static status_t x86_mmu_unmap(map_addr_t * const init_table, const vaddr_t vaddr } int arch_mmu_unmap(arch_aspace_t * const aspace, const vaddr_t vaddr, const uint count) { - map_addr_t init_table_from_cr3; - LTRACEF("aspace %p, vaddr %#lx, count %u\n", aspace, vaddr, count); DEBUG_ASSERT(aspace); @@ -321,10 +324,7 @@ int arch_mmu_unmap(arch_aspace_t * const aspace, const vaddr_t vaddr, const uint if (count == 0) return NO_ERROR; - DEBUG_ASSERT(x86_get_cr3()); - init_table_from_cr3 = x86_get_cr3(); - - return (x86_mmu_unmap(paddr_to_kvaddr(init_table_from_cr3), vaddr, count)); + return (x86_mmu_unmap(aspace->cr3, vaddr, count)); } /** @@ -372,12 +372,9 @@ status_t arch_mmu_query(arch_aspace_t * const aspace, const vaddr_t vaddr, paddr if (!paddr) return ERR_INVALID_ARGS; - DEBUG_ASSERT(x86_get_cr3()); - uint32_t current_cr3_val = (map_addr_t)x86_get_cr3(); - arch_flags_t ret_flags; uint32_t ret_level; - status_t stat = x86_mmu_get_mapping(paddr_to_kvaddr(current_cr3_val), vaddr, &ret_level, &ret_flags, paddr); + status_t stat = x86_mmu_get_mapping(aspace->cr3, vaddr, &ret_level, &ret_flags, paddr); if (stat) return stat; @@ -404,15 +401,12 @@ int arch_mmu_map(arch_aspace_t * 
const aspace, const vaddr_t vaddr, const paddr_ if (count == 0) return NO_ERROR; - DEBUG_ASSERT(x86_get_cr3()); - uint32_t current_cr3_val = (map_addr_t)x86_get_cr3(); - struct map_range range; range.start_vaddr = vaddr; range.start_paddr = (map_addr_t)paddr; range.size = count * PAGE_SIZE; - return (x86_mmu_map_range(paddr_to_kvaddr(current_cr3_val), &range, flags)); + return (x86_mmu_map_range(aspace->cr3, &range, flags)); } bool arch_mmu_supports_nx_mappings(void) { return false; } @@ -447,8 +441,42 @@ void x86_mmu_init(void) { status_t arch_mmu_init_aspace(arch_aspace_t * const aspace, const vaddr_t base, const size_t size, const uint flags) { DEBUG_ASSERT(aspace); - if ((flags & ARCH_ASPACE_FLAG_KERNEL) == 0) { - return ERR_NOT_SUPPORTED; + TRACEF("aspace %p, base %#lx, size %#zx, flags %#x\n", aspace, base, size, flags); + + /* validate that the base + size is sane and doesn't wrap */ + DEBUG_ASSERT(size > PAGE_SIZE); + DEBUG_ASSERT(base + size - 1 > base); + + aspace->flags = flags; + if (flags & ARCH_ASPACE_FLAG_KERNEL) { + /* at the moment we can only deal with address spaces as globally defined */ + DEBUG_ASSERT(base == KERNEL_ASPACE_BASE); + DEBUG_ASSERT(size == KERNEL_ASPACE_SIZE); + + aspace->base = base; + aspace->size = size; + aspace->cr3 = kernel_pd; + aspace->cr3_phys = vaddr_to_paddr(aspace->cr3); + } else { + DEBUG_ASSERT(base == USER_ASPACE_BASE); + DEBUG_ASSERT(size == USER_ASPACE_SIZE); + + aspace->base = base; + aspace->size = size; + + map_addr_t *va = pmm_alloc_kpages(1, NULL); + if (!va) { + return ERR_NO_MEMORY; + } + + aspace->cr3 = va; + aspace->cr3_phys = vaddr_to_paddr(aspace->cr3); + + /* copy the top entries from the kernel top table */ + memcpy(aspace->cr3 + NO_OF_PT_ENTRIES/2, kernel_pd + NO_OF_PT_ENTRIES/2, PAGE_SIZE/2); + + /* zero out the rest */ + memset(aspace->cr3, 0, PAGE_SIZE/2); } return NO_ERROR; @@ -459,8 +487,22 @@ status_t arch_mmu_destroy_aspace(arch_aspace_t * const aspace) { } void
arch_mmu_context_switch(arch_aspace_t * const aspace) { - if (aspace != NULL) { - PANIC_UNIMPLEMENTED; + if (TRACE_CONTEXT_SWITCH) + TRACEF("aspace %p\n", aspace); + + uint64_t cr3; + if (aspace) { + DEBUG_ASSERT((aspace->flags & ARCH_ASPACE_FLAG_KERNEL) == 0); + + cr3 = aspace->cr3_phys; + } else { + // TODO save copy of this + cr3 = vaddr_to_paddr(kernel_pd); } + if (TRACE_CONTEXT_SWITCH) { + TRACEF("cr3 %#llx\n", cr3); + } + + x86_set_cr3(cr3); } diff --git a/arch/x86/64/mmu.c b/arch/x86/64/mmu.c index 8038d7a9..fd08fb1a 100644 --- a/arch/x86/64/mmu.c +++ b/arch/x86/64/mmu.c @@ -26,6 +26,10 @@ #define LOCAL_TRACE 0 #define TRACE_CONTEXT_SWITCH 0 +// TODO: +// - proper tlb flush (local and SMP) +// - synchronization of top level page tables for user space aspaces + /* Address width including virtual/physical address*/ static uint8_t vaddr_width = 0; static uint8_t paddr_width = 0; @@ -672,7 +676,7 @@ void x86_mmu_init(void) { status_t arch_mmu_init_aspace(arch_aspace_t * const aspace, const vaddr_t base, const size_t size, const uint flags) { DEBUG_ASSERT(aspace); - LTRACEF("aspace %p, base %#lx, size %zu, flags %#x\n", aspace, base, size, flags); + LTRACEF("aspace %p, base %#lx, size %#zx, flags %#x\n", aspace, base, size, flags); /* validate that the base + size is sane and doesn't wrap */ DEBUG_ASSERT(size > PAGE_SIZE); diff --git a/arch/x86/faults.c b/arch/x86/faults.c index a59ec33e..2d1fbcd4 100644 --- a/arch/x86/faults.c +++ b/arch/x86/faults.c @@ -26,6 +26,7 @@ extern enum handler_return platform_irq(x86_iframe_t *frame); static void dump_fault_frame(x86_iframe_t *frame) { + dprintf(CRITICAL, "cpu %u:\n", arch_curr_cpu_num()); #if ARCH_X86_32 dprintf(CRITICAL, " CS: %04hx EIP: %08x EFL: %08x CR2: %08lx\n", frame->cs, frame->ip, frame->flags, x86_get_cr2()); diff --git a/arch/x86/mp.c b/arch/x86/mp.c index 36c11e88..c45cd592 100644 --- a/arch/x86/mp.c +++ b/arch/x86/mp.c @@ -52,7 +52,7 @@ void x86_configure_percpu_early(uint cpu_num, uint apic_id) { 
write_msr(X86_MSR_IA32_GS_BASE, (uint64_t)percpu); #else // set up a gs descriptor for this cpu - uint16_t selector = PERCPU_SELECTOR_BASE + cpu_num; + uint16_t selector = PERCPU_SELECTOR_BASE + cpu_num * 8; x86_set_gdt_descriptor(selector, percpu, sizeof(*percpu), 1, 0, 1, SEG_TYPE_DATA_RW, 0, 1); x86_set_gs(selector); #endif @@ -84,8 +84,7 @@ status_t arch_mp_send_ipi(mp_cpu_mask_t target, mp_ipi_t ipi) { return NO_ERROR; } -void arch_mp_init_percpu(void) { -} +void arch_mp_init_percpu(void) {} uint32_t x86_get_apic_id_from_hardware(void) { // read the apic id out of cpuid leaf 1, which should be present if SMP is enabled. diff --git a/platform/pc/mp-boot.S b/platform/pc/mp-boot.S index a3d15bc9..c902153c 100644 --- a/platform/pc/mp-boot.S +++ b/platform/pc/mp-boot.S @@ -1,26 +1,32 @@ #include #include +#if WITH_SMP + #define LOAD_ADDRESS 0x4000 #define MSR_EFER 0xc0000080 #define EFER_LME 0x00000100 #define ARGS_ADDRESS (LOAD_ADDRESS + 0x1000) #define ARGS_CR3 (ARGS_ADDRESS + 0x00) +#if ARCH_X86_64 #define ARGS_STACK (ARGS_ADDRESS + 0x08) +#else +#define ARGS_STACK (ARGS_ADDRESS + 0x04) +#endif .text .code16 // secondary cpu boot entry point and switch to protected mode // enters with the following state: // real mode, CS 0x0400, PC 0 (physical address 0x4000) +// LOAD_ADDRESS (physical) == mp_boot_start (virtual) FUNCTION(mp_boot_start) // jump over the temp GDT below and switch to a flat memory segment (0) - ljmp $0, $(LOAD_ADDRESS + 0x28) + ljmp $0, $(LOAD_ADDRESS + (.Lafter_gdt - mp_boot_start)) .org 0x8 .Lgdt: - // temporary GDT to get us into protected mode // stuff the GDTR in the first entry .short (8*4) .int (LOAD_ADDRESS + 0x8) // address of .Lgdt @@ -50,7 +56,7 @@ FUNCTION(mp_boot_start) .byte 0b10101111 /* G(1) D(0) L(1) AVL(0) limit 19:16 */ .byte 0x0 /* base 31:24 */ -.org 0x28 // 0x08 + 0x20 +.Lafter_gdt: // load the above GDT lgdt (LOAD_ADDRESS + 0x08) @@ -60,10 +66,9 @@ FUNCTION(mp_boot_start) movl %eax, %cr0 // jump to 32bit mode - ljmpl $0x8, 
$(LOAD_ADDRESS + 0x40) -.org 0x40 - .code32 + ljmpl $0x8, $(LOAD_ADDRESS + (.Lprot - mp_boot_start)) .Lprot: + .code32 // we're now in 32bit mode, set up the 32bit data segment registers mov $0x10, %ax mov %ax, %ss @@ -94,16 +99,16 @@ FUNCTION(mp_boot_start) btsl $(31), %eax mov %eax, %cr0 + // load a very temporary stack pointer movl $(LOAD_ADDRESS + 0x800), %esp // Use a far jump to get into 64bit mode pushl $0x18 - pushl $(LOAD_ADDRESS + 0x90) + pushl $(LOAD_ADDRESS + (.Lfarjump64 - mp_boot_start)) lret -.org 0x90 .code64 -farjump64: +.Lfarjump64: /* branch to our high address */ movq (.Lhigh_addr), %rax jmp *%rax @@ -118,15 +123,18 @@ farjump64: or $(1<<4), %eax mov %eax, %cr4 - // XXX load trampoline page table + // load trampoline page table + movl (ARGS_CR3), %eax + mov %eax, %cr3 - // get into high address + // enable paging + mov %cr0, %eax + btsl $(31), %eax + mov %eax, %cr0 - // set up stack pointer - - // call into C - cld - jmp . + // Branch to the high address + lea mp_boot_start_high, %eax + jmp *%eax #endif DATA(mp_boot_end) @@ -155,13 +163,38 @@ FUNCTION(mp_boot_start_high) // call into C cld - mov $(ARGS_ADDRESS), %rdi + mov $ARGS_ADDRESS, %rdi call secondary_entry jmp . #else // ARCH_X86_32 + // set up stack pointer + mov (ARGS_STACK), %esp + // load the real GDT + lgdt _gdtr + + push $CODE_SELECTOR + lea .Lnext, %eax + push %eax + lret +.Lnext: + + // Load the real segment registers + mov $DATA_SELECTOR, %ax + mov %ax, %ds + mov %ax, %es + mov %ax, %fs + mov %ax, %gs + mov %ax, %ss + + // call into C + cld + push $ARGS_ADDRESS + call secondary_entry jmp . 
#endif -END_FUNCTION(mp_boot_start_high) \ No newline at end of file +END_FUNCTION(mp_boot_start_high) + +#endif // WITH_SMP \ No newline at end of file diff --git a/platform/pc/mp.c b/platform/pc/mp.c index 85ffdd7f..d60ebe0f 100644 --- a/platform/pc/mp.c +++ b/platform/pc/mp.c @@ -30,6 +30,7 @@ struct bootstrap_args { uintptr_t trampoline_cr3; uintptr_t stack_top; + // fields above are read at fixed offsets by mp-boot.S; fields below are only referenced in C, okay to move uintptr_t cpu_num; volatile uint32_t *boot_completed_ptr; // set by the secondary cpu when it's done }; From 30136623455d2e4a54f23129b5ddb5f99610c311 Mon Sep 17 00:00:00 2001 From: Travis Geiselbrecht Date: Sun, 6 Apr 2025 19:28:08 -0700 Subject: [PATCH 18/26] [arch][x86] move the KVM clock stuff to arch from platform One can argue it more logically fits there, and eventually more KVM features will arrive in this file that have nothing to do with timers. --- arch/x86/pv.c | 122 ++++++++++++++++++++++++++++++++++++++++++++ platform/pc/timer.c | 105 +------------------------------------- 2 files changed, 123 insertions(+), 104 deletions(-) diff --git a/arch/x86/pv.c b/arch/x86/pv.c index fdf8b40f..65e669d2 100644 --- a/arch/x86/pv.c +++ b/arch/x86/pv.c @@ -5,3 +5,125 @@ * license that can be found in the LICENSE file or at * https://opensource.org/licenses/MIT */ +#include "arch/x86/pv.h" + +#include +#include +#include +#include +#include +#include +#include + +#define LOCAL_TRACE 0 + +#if !X86_LEGACY + +// Deals with paravirtualized clock sources and event timers on the PC platform, +// specifically KVM.
+ +// From https://www.kernel.org/doc/html/v6.14/virt/kvm/x86/msr.html +struct pvclock_wall_clock { + uint32_t version; + uint32_t sec; + uint32_t nsec; +} __PACKED; +static_assert(sizeof(struct pvclock_wall_clock) == 12, "pvclock_wall_clock size mismatch"); + +struct pvclock_vcpu_time_info { + uint32_t version; + uint32_t pad0; + uint64_t tsc_timestamp; + uint64_t system_time; + uint32_t tsc_to_system_mul; + int8_t tsc_shift; + uint8_t flags; + uint8_t pad[2]; +} __PACKED; +static_assert(sizeof(struct pvclock_vcpu_time_info) == 32, "pvclock_vcpu_time_info size mismatch"); +#define VCPU_TIME_INFO_FLAG_STABLE 0x1 + +static volatile struct pvclock_wall_clock *wall_clock; +static volatile struct pvclock_vcpu_time_info *vcpu_time_info; + +status_t pvclock_init(void) { + uint32_t clocksource_msr_base = 0; + if (x86_feature_test(X86_FEATURE_KVM_CLOCKSOURCE)) { + clocksource_msr_base = 0x11; + } + if (x86_feature_test(X86_FEATURE_KVM_CLOCKSOURCE2)) { + clocksource_msr_base = 0x4b564d00; + } + if (!clocksource_msr_base) { + return ERR_NOT_SUPPORTED; + } + dprintf(INFO, "pv_clock: clocksource detected, msr base %#x\n", clocksource_msr_base); + + // map a page of memory and point the KVM clocksource msrs at it + void *clocksource_page; + status_t err = vmm_alloc(vmm_get_kernel_aspace(), "lapic", PAGE_SIZE, &clocksource_page, 0, 0, 0); + if (err != NO_ERROR) { + printf("pv_clock: failed to allocate page for clocksource msrs\n"); + return err; + } + + paddr_t paddr; + arch_mmu_query(&vmm_get_kernel_aspace()->arch_aspace, (vaddr_t)clocksource_page, &paddr, NULL); + LTRACEF("clocksource page %p, paddr %#" PRIxPTR "\n", clocksource_page, paddr); + + write_msr(clocksource_msr_base, paddr); + write_msr(clocksource_msr_base + 1, paddr + sizeof(struct pvclock_wall_clock) + 1); + + wall_clock = (struct pvclock_wall_clock *)clocksource_page; + vcpu_time_info = (struct pvclock_vcpu_time_info *)(wall_clock + 1); + + dprintf(SPEW, "pv_clock: wall clock version %u, sec %u, nsec %u\n", + 
wall_clock->version, wall_clock->sec, wall_clock->nsec); + + dprintf(SPEW, "pv_clock: vcpu time info version %u, tsc timestamp %llu, system time %llu\n", + vcpu_time_info->version, vcpu_time_info->tsc_timestamp, vcpu_time_info->system_time); + dprintf(SPEW, "pv_clock: tsc to system mul %u, tsc shift %d, flags %u\n", + vcpu_time_info->tsc_to_system_mul, vcpu_time_info->tsc_shift, vcpu_time_info->flags); + + return NO_ERROR; +} + +uint64_t pvclock_get_tsc_freq(void) { + if (!vcpu_time_info) { + return 0; + } + + uint32_t tsc_mul = 0; + int8_t tsc_shift = 0; + uint32_t pre_version = 0, post_version = 0; + do { + pre_version = vcpu_time_info->version; + if (pre_version % 2 != 0) { + asm("pause"); + continue; + } + tsc_mul = vcpu_time_info->tsc_to_system_mul; + tsc_shift = vcpu_time_info->tsc_shift; + post_version = vcpu_time_info->version; + } while (pre_version != post_version); + + uint64_t tsc_khz = 1000000ULL << 32; + tsc_khz = tsc_khz / tsc_mul; + if (tsc_shift > 0) { + tsc_khz >>= tsc_shift; + } else { + tsc_khz <<= -tsc_shift; + } + return tsc_khz * 1000; +} + +bool pv_clock_is_stable(void) { + if (!vcpu_time_info) { + return false; + } + bool is_stable = (vcpu_time_info->flags & VCPU_TIME_INFO_FLAG_STABLE) || + x86_feature_test(X86_FEATURE_KVM_CLOCKSOURCE_STABLE); + return is_stable; +} + +#endif // !X86_LEGACY \ No newline at end of file diff --git a/platform/pc/timer.c b/platform/pc/timer.c index ec50e712..0cf1a904 100644 --- a/platform/pc/timer.c +++ b/platform/pc/timer.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -78,115 +79,11 @@ lk_bigtime_t current_time_hires(void) { } } -// From https://www.kernel.org/doc/html/v6.14/virt/kvm/x86/msr.html -struct pvclock_wall_clock { - uint32_t version; - uint32_t sec; - uint32_t nsec; -} __PACKED; -static_assert(sizeof(struct pvclock_wall_clock) == 12, "pvclock_wall_clock size mismatch"); - -struct pvclock_vcpu_time_info { - uint32_t version; - uint32_t pad0; - uint64_t tsc_timestamp; - 
uint64_t system_time; - uint32_t tsc_to_system_mul; - int8_t tsc_shift; - uint8_t flags; - uint8_t pad[2]; -} __PACKED; -static_assert(sizeof(struct pvclock_vcpu_time_info) == 32, "pvclock_vcpu_time_info size mismatch"); - -static volatile struct pvclock_wall_clock *wall_clock; -static volatile struct pvclock_vcpu_time_info *vcpu_time_info; - -status_t pvclock_init(void) { - uint32_t clocksource_msr_base = 0; - if (x86_feature_test(X86_FEATURE_KVM_CLOCKSOURCE)) { - clocksource_msr_base = 0x11; - } - if (x86_feature_test(X86_FEATURE_KVM_CLOCKSOURCE2)) { - clocksource_msr_base = 0x4b564d00; - } - if (!clocksource_msr_base) { - return ERR_NOT_SUPPORTED; - } - dprintf(INFO, "pv_clock: clocksource detected, msr base %#x\n", clocksource_msr_base); - - // map a page of memory and point the KVM clocksource msrs at it - void *clocksource_page; - status_t err = vmm_alloc(vmm_get_kernel_aspace(), "lapic", PAGE_SIZE, &clocksource_page, 0, 0, 0); - if (err != NO_ERROR) { - printf("pv_clock: failed to allocate page for clocksource msrs\n"); - return err; - } - - paddr_t paddr; - arch_mmu_query(&vmm_get_kernel_aspace()->arch_aspace, (vaddr_t)clocksource_page, &paddr, NULL); - LTRACEF("clocksource page %p, paddr %#" PRIxPTR "\n", clocksource_page, paddr); - - write_msr(clocksource_msr_base, paddr); - write_msr(clocksource_msr_base + 1, paddr + sizeof(struct pvclock_wall_clock) + 1); - - wall_clock = (struct pvclock_wall_clock *)clocksource_page; - vcpu_time_info = (struct pvclock_vcpu_time_info *)(wall_clock + 1); - - dprintf(SPEW, "pv_clock: wall clock version %u, sec %u, nsec %u\n", - wall_clock->version, wall_clock->sec, wall_clock->nsec); - - dprintf(SPEW, "pv_clock: vcpu time info version %u, tsc timestamp %llu, system time %llu\n", - vcpu_time_info->version, vcpu_time_info->tsc_timestamp, vcpu_time_info->system_time); - dprintf(SPEW, "pv_clock: tsc to system mul %u, tsc shift %d, flags %u\n", - vcpu_time_info->tsc_to_system_mul, vcpu_time_info->tsc_shift, 
vcpu_time_info->flags); - - return NO_ERROR; -} - // Convert lk_time_t to TSC ticks uint64_t time_to_tsc_ticks(lk_time_t time) { return u64_mul_u32_fp32_64(time, timebase_to_tsc); } -uint64_t pvclock_get_tsc_freq(void) { - uint32_t tsc_mul = 0; - int8_t tsc_shift = 0; - - if (!vcpu_time_info) { - return 0; - } - - uint32_t pre_version = 0, post_version = 0; - do { - pre_version = vcpu_time_info->version; - if (pre_version % 2 != 0) { - asm("pause"); - continue; - } - tsc_mul = vcpu_time_info->tsc_to_system_mul; - tsc_shift = vcpu_time_info->tsc_shift; - post_version = vcpu_time_info->version; - } while (pre_version != post_version); - - uint64_t tsc_khz = 1000000ULL << 32; - tsc_khz = tsc_khz / tsc_mul; - if (tsc_shift > 0) { - tsc_khz >>= tsc_shift; - } else { - tsc_khz <<= -tsc_shift; - } - return tsc_khz * 1000; -} - -bool pv_clock_is_stable(void) { - if (!vcpu_time_info) { - return false; - } - bool is_stable = (vcpu_time_info->flags & (1<<0)) || - x86_feature_test(X86_FEATURE_KVM_CLOCKSOURCE_STABLE); - return is_stable; -} - void pc_init_timer(unsigned int level) { // Initialize the PIT, it's always present in PC hardware pit_init(); From a8bc048648fde7239ad199c8e5e7710ee672f90d Mon Sep 17 00:00:00 2001 From: Travis Geiselbrecht Date: Sun, 6 Apr 2025 21:13:06 -0700 Subject: [PATCH 19/26] [arch][x86] tighten up x86 cpu bootstrap code - Fix an assert in local apic code when not using x2apic and starting secondaries. - Follow the spec a bit closer and wait up till a second for each secondary core to start. 
--- arch/x86/lapic.c | 23 +++++++++++++++++++---- platform/pc/mp.c | 37 ++++++++++++++++++++++++++----------- 2 files changed, 45 insertions(+), 15 deletions(-) diff --git a/arch/x86/lapic.c b/arch/x86/lapic.c index 4b7e0793..c7a84b63 100644 --- a/arch/x86/lapic.c +++ b/arch/x86/lapic.c @@ -97,9 +97,9 @@ enum lapic_timer_mode { static uint32_t lapic_read(enum lapic_regs reg) { LTRACEF_LEVEL(2, "reg %#x\n", reg); - DEBUG_ASSERT(reg != LAPIC_ICRLO && reg != LAPIC_ICRHI); if (lapic_x2apic) { // TODO: do we need barriers here? + DEBUG_ASSERT(reg != LAPIC_ICRLO && reg != LAPIC_ICRHI); return read_msr(X86_MSR_IA32_X2APIC_BASE + reg / 0x10); } else { return mmio_read32(lapic_mmio + reg / 4); @@ -108,14 +108,26 @@ static uint32_t lapic_read(enum lapic_regs reg) { static void lapic_write(enum lapic_regs reg, uint32_t val) { LTRACEF_LEVEL(2, "reg %#x val %#x\n", reg, val); - DEBUG_ASSERT(reg != LAPIC_ICRLO && reg != LAPIC_ICRHI); if (lapic_x2apic) { + DEBUG_ASSERT(reg != LAPIC_ICRLO && reg != LAPIC_ICRHI); write_msr(X86_MSR_IA32_X2APIC_BASE + reg / 0x10, val); } else { mmio_write32(lapic_mmio + reg / 4, val); } } +static void lapic_wait_for_icr_delivery(void) { + LTRACEF_LEVEL(2, "waiting for icr\n"); + uint32_t val; + do { + if (lapic_x2apic) { + val = read_msr(X86_MSR_IA32_X2APIC_BASE + 0x30); + } else { + val = lapic_read(LAPIC_ICRLO); + } + } while (val & (1u << 12)); +} + // special case to write to the ICR register static void lapic_write_icr(uint32_t low, uint32_t apic_id) { LTRACEF_LEVEL(2, "%#x apic_id %#x\n", low, apic_id); @@ -124,6 +136,7 @@ static void lapic_write_icr(uint32_t low, uint32_t apic_id) { } else { lapic_write(LAPIC_ICRHI, apic_id << 24); lapic_write(LAPIC_ICRLO, low); + lapic_wait_for_icr_delivery(); } } @@ -265,7 +278,7 @@ static void lapic_init_percpu(uint level) { uint32_t svr = (LAPIC_INT_SPURIOUS | (1u<<8)); // enable lapic_write(LAPIC_SVR, svr); - TRACEF("lapic svr %#x\n", lapic_read(LAPIC_SVR)); + LTRACEF("lapic svr %#x\n", 
lapic_read(LAPIC_SVR)); register_int_handler_msi(LAPIC_INT_SPURIOUS, &lapic_spurious_handler, NULL, false); register_int_handler_msi(LAPIC_INT_GENERIC, &lapic_generic_handler, NULL, false); @@ -346,7 +359,8 @@ void lapic_send_init_ipi(uint32_t apic_id, bool level) { return; } - lapic_write_icr((5u << 8) | (level ? (1u << 14) : 0), apic_id); + // Level triggered mode, level according to arg, INIT delivery mode, no shorthand + lapic_write_icr((1u << 15) | (level ? (1u << 14) : 0) | (5u << 8), apic_id); } void lapic_send_startup_ipi(uint32_t apic_id, uint32_t startup_vector) { @@ -354,6 +368,7 @@ void lapic_send_startup_ipi(uint32_t apic_id, uint32_t startup_vector) { return; } + // Startup IPI, no shorthand lapic_write_icr((6u << 8) | (startup_vector >> 12), apic_id); } diff --git a/platform/pc/mp.c b/platform/pc/mp.c index d60ebe0f..c20c7ee3 100644 --- a/platform/pc/mp.c +++ b/platform/pc/mp.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -35,6 +36,7 @@ struct bootstrap_args { volatile uint32_t *boot_completed_ptr; // set by the secondary cpu when it's done }; +// called from assembly code in mp-boot.S __NO_RETURN void secondary_entry(struct bootstrap_args *args) { volatile uint32_t *boot_completed = args->boot_completed_ptr; uint cpu_num = args->cpu_num; @@ -49,7 +51,7 @@ __NO_RETURN void secondary_entry(struct bootstrap_args *args) { x86_secondary_entry(cpu_num); } -static void start_cpu(uint cpu_num, uint32_t apic_id, struct bootstrap_args *args) { +static status_t start_cpu(uint cpu_num, uint32_t apic_id, struct bootstrap_args *args) { LTRACEF("cpu_num %u, apic_id %u\n", cpu_num, apic_id); // assert that this thread is pinned to the current cpu @@ -68,20 +70,33 @@ static void start_cpu(uint cpu_num, uint32_t apic_id, struct bootstrap_args *arg lapic_send_init_ipi(apic_id, false); thread_sleep(10); - // send SIPI and wait 200us - lapic_send_startup_ipi(apic_id, TRAMPOLINE_ADDRESS); - thread_sleep(1); + // send Startup IPI up 
to 2 times as recommended by Intel + for (int i = 0; i < 2; i++) { + lapic_send_startup_ipi(apic_id, TRAMPOLINE_ADDRESS); - // send SIPI again for good measure and wait 10ms - lapic_send_startup_ipi(apic_id, TRAMPOLINE_ADDRESS); - thread_sleep(10); - - // wait for the cpu to finish booting - while (!boot_completed) { - thread_yield(); + // Wait a little bit for the cpu to start before trying a second time + thread_sleep(10); + if (boot_completed) { + goto booted; + } } + // Wait up to a second for the cpu to finish starting + for (int i = 0; i < 1000; i++) { + if (boot_completed) { + goto booted; + } + thread_sleep(10); + } + + // we have failed to start this core + // TODO: handle trying to shut the core down before moving on. + printf("PC: failed to start cpu %u\n", cpu_num); + return ERR_TIMED_OUT; + +booted: LTRACEF("cpu %u booted\n", cpu_num); + return NO_ERROR; } struct detected_cpus { From 699ec6344e0a3495656f8117a0fb5941033453a7 Mon Sep 17 00:00:00 2001 From: Travis Geiselbrecht Date: Sun, 6 Apr 2025 22:22:20 -0700 Subject: [PATCH 20/26] [arch][x86][smp] few misc tweaks - Skip cpus in the MADT table that are not enabled - Bump count to 16 cpus - Move the spurious interrupt vector to 0xff since it needs to end in 0xf on <=P6. 
--- arch/x86/lapic.c | 3 ++- arch/x86/rules.mk | 2 +- platform/pc/mp.c | 4 ++++ 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/x86/lapic.c b/arch/x86/lapic.c index c7a84b63..4b997e65 100644 --- a/arch/x86/lapic.c +++ b/arch/x86/lapic.c @@ -84,9 +84,10 @@ enum lapic_regs { enum lapic_interrupts { LAPIC_INT_TIMER = 0xf8, - LAPIC_INT_SPURIOUS, LAPIC_INT_GENERIC, LAPIC_INT_RESCHEDULE, + + LAPIC_INT_SPURIOUS = 0xff, // Bits 0-3 must be 1 for P6 and below compatibility }; enum lapic_timer_mode { diff --git a/arch/x86/rules.mk b/arch/x86/rules.mk index 1737cce2..fdb0b380 100644 --- a/arch/x86/rules.mk +++ b/arch/x86/rules.mk @@ -52,7 +52,7 @@ GLOBAL_DEFINES += \ ARCH_HAS_MMU=1 ifeq ($(WITH_SMP),1) -SMP_MAX_CPUS ?= 8 +SMP_MAX_CPUS ?= 16 GLOBAL_DEFINES += \ WITH_SMP=1 \ SMP_MAX_CPUS=$(SMP_MAX_CPUS) diff --git a/platform/pc/mp.c b/platform/pc/mp.c index c20c7ee3..149e4656 100644 --- a/platform/pc/mp.c +++ b/platform/pc/mp.c @@ -108,6 +108,10 @@ static void local_apic_callback(const void *_entry, size_t entry_len, void *cook const struct acpi_madt_local_apic_entry *entry = _entry; struct detected_cpus *cpus = cookie; + if ((entry->flags & ACPI_MADT_FLAG_ENABLED) == 0) { + return; + } + // TODO: read the current APIC id and skip it, instead of assuming 0 is the boot cpu // read BSP from X86_IA32_APIC_BASE_MSR bit 8? if (entry->apic_id == 0) { From 80a08c177d1eb30699d8c42b3a74de329e1a790a Mon Sep 17 00:00:00 2001 From: Travis Geiselbrecht Date: Tue, 8 Apr 2025 23:45:55 -0700 Subject: [PATCH 21/26] [platform][pc] add a few comments to the top of timer.c --- platform/pc/timer.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/platform/pc/timer.c b/platform/pc/timer.c index 0cf1a904..46c3e18a 100644 --- a/platform/pc/timer.c +++ b/platform/pc/timer.c @@ -29,6 +29,9 @@ #define LOCAL_TRACE 0 // Deals with all of the various clock sources and event timers on the PC platform. 
+// TODO: +// HPET +// cpuid leaves that describe clock rates static enum clock_source { CLOCK_SOURCE_INITIAL, From bc01491bc91835e631d03812479117ad222e9659 Mon Sep 17 00:00:00 2001 From: Travis Geiselbrecht Date: Tue, 8 Apr 2025 23:46:18 -0700 Subject: [PATCH 22/26] [arch][x86][mmu] disable SMAP, add PGE feature 32 and 64 bit: - For now SMAP causes the mmu unit tests to fail, so disable. - Make sure CR4.PGE is set if present. - Make sure the rest of the system knows that user aspaces are available on 32bit. --- arch/x86/32/mmu.c | 14 +++++++++++++- arch/x86/64/mmu.c | 11 +++++++---- 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/arch/x86/32/mmu.c b/arch/x86/32/mmu.c index e00a28c1..3b2a6a51 100644 --- a/arch/x86/32/mmu.c +++ b/arch/x86/32/mmu.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -411,7 +412,7 @@ int arch_mmu_map(arch_aspace_t * const aspace, const vaddr_t vaddr, const paddr_ bool arch_mmu_supports_nx_mappings(void) { return false; } bool arch_mmu_supports_ns_mappings(void) { return false; } -bool arch_mmu_supports_user_aspaces(void) { return false; } +bool arch_mmu_supports_user_aspaces(void) { return true; } /* called once per cpu as it is brought up */ void x86_mmu_early_init_percpu(void) { @@ -419,6 +420,17 @@ void x86_mmu_early_init_percpu(void) { uint32_t cr0 = x86_get_cr0(); cr0 |= X86_CR0_WP; x86_set_cr0(cr0); + + /* Set some mmu control bits in CR4 */ + uint32_t cr4 = x86_get_cr4(); + if (x86_feature_test(X86_FEATURE_PGE)) + cr4 |= X86_CR4_PGE; + if (x86_feature_test(X86_FEATURE_SMEP)) + cr4 |= X86_CR4_SMEP; + /* TODO: enable SMAP when the rest of the system is ready for it */ + //if (x86_feature_test(X86_FEATURE_SMAP)) + // cr4 |= X86_CR4_SMAP; + x86_set_cr4(cr4); } void x86_mmu_early_init(void) { diff --git a/arch/x86/64/mmu.c b/arch/x86/64/mmu.c index fd08fb1a..3d3c78f9 100644 --- a/arch/x86/64/mmu.c +++ b/arch/x86/64/mmu.c @@ -632,12 +632,15 @@ void x86_mmu_early_init_percpu(void) { cr0 
|= X86_CR0_WP; x86_set_cr0(cr0); - /* Setting the SMEP & SMAP bit in CR4 */ - uint64_t cr4 = x86_get_cr4(); + /* Set some mmu control bits in CR4 */ + uint32_t cr4 = x86_get_cr4(); + if (x86_feature_test(X86_FEATURE_PGE)) + cr4 |= X86_CR4_PGE; if (x86_feature_test(X86_FEATURE_SMEP)) cr4 |= X86_CR4_SMEP; - if (x86_feature_test(X86_FEATURE_SMAP)) - cr4 |= X86_CR4_SMAP; + /* TODO: enable SMAP when the rest of the system is ready for it */ + //if (x86_feature_test(X86_FEATURE_SMAP)) + // cr4 |= X86_CR4_SMAP; x86_set_cr4(cr4); /* Set NXE bit in MSR_EFER */ From bdf62a7e41ac4ccc5147162cd0efc9b452f7b91d Mon Sep 17 00:00:00 2001 From: Travis Geiselbrecht Date: Tue, 8 Apr 2025 23:47:54 -0700 Subject: [PATCH 23/26] [clang-tidy] turn off readability-math-missing-parenthesis This one is just too annoying for me. --- .clang-tidy | 1 + 1 file changed, 1 insertion(+) diff --git a/.clang-tidy b/.clang-tidy index f9a24d32..f82b1fa1 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -39,6 +39,7 @@ Checks: > -readability-implicit-bool-conversion, -readability-isolate-declaration, -readability-magic-numbers, + -readability-math-missing-parentheses, -readability-named-parameter, -readability-qualified-auto, -readability-uppercase-literal-suffix, From f52ef453fe723cb9a47d3304e914410f24df8653 Mon Sep 17 00:00:00 2001 From: Travis Geiselbrecht Date: Tue, 8 Apr 2025 23:48:30 -0700 Subject: [PATCH 24/26] [tests][mmu] update the file a bit for C++, add another test Make sure a page appears to be unmapped after it is told to. 
--- arch/test/mmu.cpp | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/arch/test/mmu.cpp b/arch/test/mmu.cpp index 0807d2a1..b6fd6aa5 100644 --- a/arch/test/mmu.cpp +++ b/arch/test/mmu.cpp @@ -15,7 +15,9 @@ #include #include -static bool create_user_aspace(void) { +namespace { + +bool create_user_aspace() { BEGIN_TEST; if (arch_mmu_supports_user_aspaces()) { @@ -34,7 +36,7 @@ static bool create_user_aspace(void) { END_TEST; } -static bool map_user_pages(void) { +bool map_user_pages() { BEGIN_TEST; if (arch_mmu_supports_user_aspaces()) { @@ -89,7 +91,7 @@ static bool map_user_pages(void) { END_TEST; } -static bool map_region_query_result(vmm_aspace_t *aspace, uint arch_flags) { +bool map_region_query_result(vmm_aspace_t *aspace, uint arch_flags) { BEGIN_TEST; void *ptr = NULL; @@ -109,10 +111,17 @@ static bool map_region_query_result(vmm_aspace_t *aspace, uint arch_flags) { // free this region we made EXPECT_EQ(NO_ERROR, vmm_free_region(aspace, (vaddr_t)ptr), "free region"); + // query that the page is not there anymore + { + paddr_t pa = 0; + uint flags = ~arch_flags; + EXPECT_EQ(ERR_NOT_FOUND, arch_mmu_query(&aspace->arch_aspace, (vaddr_t)ptr, &pa, &flags), "arch_query"); + } + END_TEST; } -static bool map_region_expect_failure(vmm_aspace_t *aspace, uint arch_flags, int expected_error) { +bool map_region_expect_failure(vmm_aspace_t *aspace, uint arch_flags, int expected_error) { BEGIN_TEST; void *ptr = NULL; @@ -123,7 +132,7 @@ static bool map_region_expect_failure(vmm_aspace_t *aspace, uint arch_flags, int END_TEST; } -static bool map_query_pages(void) { +bool map_query_pages() { BEGIN_TEST; vmm_aspace_t *kaspace = vmm_get_kernel_aspace(); @@ -153,9 +162,12 @@ static bool map_query_pages(void) { END_TEST; } -static bool context_switch(void) { +bool context_switch() { BEGIN_TEST; + // create a user space, map a page or two and access it + // NOTE: this assumes that kernel code can directly access user space, which isn't 
necessarily true + // on all architectures. See SMAP on x86, PAN on ARM, and SUM on RISC-V. if (arch_mmu_supports_user_aspaces()) { arch_aspace_t as; status_t err = arch_mmu_init_aspace(&as, USER_ASPACE_BASE, USER_ASPACE_SIZE, 0); @@ -218,4 +230,6 @@ RUN_TEST(map_query_pages); RUN_TEST(context_switch); END_TEST_CASE(arch_mmu_tests) +} // namespace + #endif // ARCH_HAS_MMU From c054ee89c247c87f15bb639c9296a485cd6be9fa Mon Sep 17 00:00:00 2001 From: Travis Geiselbrecht Date: Thu, 10 Apr 2025 00:45:41 -0700 Subject: [PATCH 25/26] [x86][mmu] only write to CR4 if necessary In legacy builds it's possible to boot on a cpu that doesn't appear to have CR4 implemented (Am586 to be precise), but there's no features needed to set, so it seems that this was architecturally okay. --- arch/x86/32/mmu.c | 23 +++++++++++++---------- arch/x86/64/mmu.c | 21 ++++++++++++--------- 2 files changed, 25 insertions(+), 19 deletions(-) diff --git a/arch/x86/32/mmu.c b/arch/x86/32/mmu.c index 3b2a6a51..34264019 100644 --- a/arch/x86/32/mmu.c +++ b/arch/x86/32/mmu.c @@ -422,15 +422,18 @@ void x86_mmu_early_init_percpu(void) { x86_set_cr0(cr0); /* Set some mmu control bits in CR4 */ - uint32_t cr4 = x86_get_cr4(); - if (x86_feature_test(X86_FEATURE_PGE)) - cr4 |= X86_CR4_PGE; - if (x86_feature_test(X86_FEATURE_SMEP)) - cr4 |= X86_CR4_SMEP; - /* TODO: enable SMAP when the rest of the system is ready for it */ - //if (x86_feature_test(X86_FEATURE_SMAP)) - // cr4 |= X86_CR4_SMAP; - x86_set_cr4(cr4); + uint32_t bits = 0; + bits |= x86_feature_test(X86_FEATURE_PGE) ? X86_CR4_PGE : 0; + bits |= x86_feature_test(X86_FEATURE_PSE) ? X86_CR4_PSE : 0; + bits |= x86_feature_test(X86_FEATURE_SMEP) ? X86_CR4_SMEP : 0; + /* for now, we dont support SMAP due to some tests that assume they can access user space */ + // bits |= x86_feature_test(X86_FEATURE_SMAP) ? 
X86_CR4_SMAP : 0; + if (bits) { + /* don't touch cr4 unless we need to, early cpus will fault if it's not implemented */ + uint32_t cr4 = x86_get_cr4(); + cr4 |= bits; + x86_set_cr4(cr4); + } } void x86_mmu_early_init(void) { @@ -453,7 +456,7 @@ void x86_mmu_init(void) { status_t arch_mmu_init_aspace(arch_aspace_t * const aspace, const vaddr_t base, const size_t size, const uint flags) { DEBUG_ASSERT(aspace); - TRACEF("aspace %p, base %#lx, size %#zx, flags %#x\n", aspace, base, size, flags); + LTRACEF("aspace %p, base %#lx, size %#zx, flags %#x\n", aspace, base, size, flags); /* validate that the base + size is sane and doesn't wrap */ DEBUG_ASSERT(size > PAGE_SIZE); diff --git a/arch/x86/64/mmu.c b/arch/x86/64/mmu.c index 3d3c78f9..8f9dbca8 100644 --- a/arch/x86/64/mmu.c +++ b/arch/x86/64/mmu.c @@ -633,15 +633,18 @@ void x86_mmu_early_init_percpu(void) { x86_set_cr0(cr0); /* Set some mmu control bits in CR4 */ - uint32_t cr4 = x86_get_cr4(); - if (x86_feature_test(X86_FEATURE_PGE)) - cr4 |= X86_CR4_PGE; - if (x86_feature_test(X86_FEATURE_SMEP)) - cr4 |= X86_CR4_SMEP; - /* TODO: enable SMAP when the rest of the system is ready for it */ - //if (x86_feature_test(X86_FEATURE_SMAP)) - // cr4 |= X86_CR4_SMAP; - x86_set_cr4(cr4); + uint32_t bits = 0; + bits |= x86_feature_test(X86_FEATURE_PGE) ? X86_CR4_PGE : 0; + bits |= x86_feature_test(X86_FEATURE_PSE) ? X86_CR4_PSE : 0; + bits |= x86_feature_test(X86_FEATURE_SMEP) ? X86_CR4_SMEP : 0; + /* for now, we don't support SMAP due to some tests that assume they can access user space */ + // bits |= x86_feature_test(X86_FEATURE_SMAP) ?
X86_CR4_SMAP : 0; + if (bits) { + /* don't touch cr4 unless we need to, early cpus will fault if its not implemented */ + uint32_t cr4 = x86_get_cr4(); + cr4 |= bits; + x86_set_cr4(cr4); + } /* Set NXE bit in MSR_EFER */ uint64_t efer_msr = read_msr(X86_MSR_IA32_EFER); From c80efdc4ce63d35377ea1f91652f7d75225bed81 Mon Sep 17 00:00:00 2001 From: Travis Geiselbrecht Date: Thu, 10 Apr 2025 21:35:40 -0700 Subject: [PATCH 26/26] [arch][x86] add missing pv.h file added a few commits back --- arch/x86/include/arch/x86/pv.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 arch/x86/include/arch/x86/pv.h diff --git a/arch/x86/include/arch/x86/pv.h b/arch/x86/include/arch/x86/pv.h new file mode 100644 index 00000000..b6ef8c4c --- /dev/null +++ b/arch/x86/include/arch/x86/pv.h @@ -0,0 +1,19 @@ +/* + * Copyright (c) 2025 Travis Geiselbrecht + * + * Use of this source code is governed by a MIT-style + * license that can be found in the LICENSE file or at + * https://opensource.org/licenses/MIT + */ +#pragma once + +#include +#include + +#if !X86_LEGACY + +status_t pvclock_init(void); +uint64_t pvclock_get_tsc_freq(void); +bool pv_clock_is_stable(void); + +#endif // !X86_LEGACY