diff --git a/.clang-tidy b/.clang-tidy index 617972c5..f82b1fa1 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -3,6 +3,7 @@ Checks: > -*, bugprone-*, -bugprone-easily-swappable-parameters, + -bugprone-reserved-identifier, clang-diagnostic-*, -clang-diagnostic-unused-command-line-argument, @@ -38,6 +39,7 @@ Checks: > -readability-implicit-bool-conversion, -readability-isolate-declaration, -readability-magic-numbers, + -readability-math-missing-parentheses, -readability-named-parameter, -readability-qualified-auto, -readability-uppercase-literal-suffix, diff --git a/arch/arm64/mmu.c b/arch/arm64/mmu.c index e5fbd83c..51e3b5ea 100644 --- a/arch/arm64/mmu.c +++ b/arch/arm64/mmu.c @@ -34,7 +34,7 @@ __SECTION(".bss.prebss.translation_table"); /* the base TCR flags, computed from early init code in start.S */ uint64_t arm64_mmu_tcr_flags __SECTION(".bss.prebss.tcr_flags"); -static inline bool is_valid_vaddr(arch_aspace_t *aspace, vaddr_t vaddr) { +static inline bool is_valid_vaddr(const arch_aspace_t *aspace, vaddr_t vaddr) { return (vaddr >= aspace->base && vaddr <= aspace->base + aspace->size - 1); } diff --git a/arch/test/mmu.cpp b/arch/test/mmu.cpp index 0807d2a1..b6fd6aa5 100644 --- a/arch/test/mmu.cpp +++ b/arch/test/mmu.cpp @@ -15,7 +15,9 @@ #include #include -static bool create_user_aspace(void) { +namespace { + +bool create_user_aspace() { BEGIN_TEST; if (arch_mmu_supports_user_aspaces()) { @@ -34,7 +36,7 @@ static bool create_user_aspace(void) { END_TEST; } -static bool map_user_pages(void) { +bool map_user_pages() { BEGIN_TEST; if (arch_mmu_supports_user_aspaces()) { @@ -89,7 +91,7 @@ static bool map_user_pages(void) { END_TEST; } -static bool map_region_query_result(vmm_aspace_t *aspace, uint arch_flags) { +bool map_region_query_result(vmm_aspace_t *aspace, uint arch_flags) { BEGIN_TEST; void *ptr = NULL; @@ -109,10 +111,17 @@ static bool map_region_query_result(vmm_aspace_t *aspace, uint arch_flags) { // free this region we made EXPECT_EQ(NO_ERROR, vmm_free_region(aspace, (vaddr_t)ptr), "free region"); + // query that the page is not there anymore + { + paddr_t pa = 0; + uint flags = ~arch_flags; + EXPECT_EQ(ERR_NOT_FOUND, arch_mmu_query(&aspace->arch_aspace, (vaddr_t)ptr, &pa, &flags), "arch_query"); + } + END_TEST; } -static bool map_region_expect_failure(vmm_aspace_t *aspace, uint arch_flags, int expected_error) { +bool map_region_expect_failure(vmm_aspace_t *aspace, uint arch_flags, int expected_error) { BEGIN_TEST; void *ptr = NULL; @@ -123,7 +132,7 @@ static bool map_region_expect_failure(vmm_aspace_t *aspace, uint arch_flags, int END_TEST; } -static bool map_query_pages(void) { +bool map_query_pages() { BEGIN_TEST; vmm_aspace_t *kaspace = vmm_get_kernel_aspace(); @@ -153,9 +162,12 @@ static bool map_query_pages(void) { END_TEST; } -static bool context_switch(void) { +bool context_switch() { BEGIN_TEST; + // create a user space, map a page or two and access it + // NOTE: this assumes that kernel code can directly access user space, which isn't necessarily true + // on all architectures. See SMAP on x86, PAN on ARM, and SUM on RISC-V. 
if (arch_mmu_supports_user_aspaces()) { arch_aspace_t as; status_t err = arch_mmu_init_aspace(&as, USER_ASPACE_BASE, USER_ASPACE_SIZE, 0); @@ -218,4 +230,6 @@ RUN_TEST(map_query_pages); RUN_TEST(context_switch); END_TEST_CASE(arch_mmu_tests) +} // namespace + #endif // ARCH_HAS_MMU diff --git a/arch/x86/32/exceptions.S b/arch/x86/32/exceptions.S index e1bca374..6cb06277 100644 --- a/arch/x86/32/exceptions.S +++ b/arch/x86/32/exceptions.S @@ -45,7 +45,8 @@ LOCAL_FUNCTION(interrupt_common) pushl %ds pusha /* save general purpose registers */ movl $DATA_SELECTOR, %eax /* put known good value in segment registers */ - movl %eax, %gs + // do not reset %gs, as it is used by the kernel + // TODO: when dealing with user space, we need to reset %gs here movl %eax, %fs movl %eax, %es movl %eax, %ds @@ -61,8 +62,7 @@ LOCAL_FUNCTION(interrupt_common) popl %ds /* restore segment registers */ popl %es popl %fs - popl %gs - addl $8, %esp /* drop exception number and error code */ + addl $12, %esp /* drop gs, exception number, and error code */ iret END_FUNCTION(interrupt_common) @@ -83,8 +83,6 @@ FUNCTION(setup_idt) loop .Lloop - lidt _idtr - ret END_FUNCTION(setup_idt) diff --git a/arch/x86/32/gdt.S b/arch/x86/32/gdt.S new file mode 100644 index 00000000..078333b2 --- /dev/null +++ b/arch/x86/32/gdt.S @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2009 Corey Tabaka + * Copyright (c) 2015 Intel Corporation + * Copyright (c) 2016 Travis Geiselbrecht + * + * Use of this source code is governed by a MIT-style + * license that can be found in the LICENSE file or at + * https://opensource.org/licenses/MIT + */ +#include +#include + +#define PHYS_LOAD_ADDRESS (MEMBASE + KERNEL_LOAD_OFFSET) +#define PHYS_ADDR_DELTA (KERNEL_BASE + KERNEL_LOAD_OFFSET - PHYS_LOAD_ADDRESS) +#define PHYS(x) ((x) - PHYS_ADDR_DELTA) + +.section .rodata + +.balign 8 +DATA(_gdtr_phys) + .short _gdt_end - _gdt - 1 + .int PHYS(_gdt) +END_DATA(_gdtr_phys) + +.balign 8 +DATA(_gdtr) + .short _gdt_end - _gdt - 1 + .int _gdt +END_DATA(_gdtr) + + +// 32bit GDT, laid out in a specific way due to requirements by the SYSENTER/SYSEXIT and +// SYSCALL/SYSRET instructions: +// +// CODE32 <- IA32_SYSENTER_CS, IA32_STAR.SYSCALL_CS +// DATA +// UCODE32 <- IA32_STAR.SYSRET_CS +// UDATA +.data +.balign 8 +DATA(_gdt) + .int 0 + .int 0 + +/* ring 0 code 32bit (for bootstrapping into 64bit) */ + .short 0xffff /* limit 15:00 */ + .short 0x0000 /* base 15:00 */ + .byte 0x00 /* base 23:16 */ + .byte 0b10011010 /* P(1) DPL(00) S(1) 1 C(0) R(1) A(0) */ + .byte 0b11001111 /* G(1) D(1) 0 0 limit 19:16 */ + .byte 0x0 /* base 31:24 */ + +/* ring 0 data 32bit */ + .short 0xffff /* limit 15:00 */ + .short 0x0000 /* base 15:00 */ + .byte 0x00 /* base 23:16 */ + .byte 0b10010010 /* P(1) DPL(00) S(1) 0 E(0) W(1) A(0) */ + .byte 0b11001111 /* G(1) B(1) 0 0 limit 19:16 */ + .byte 0x0 /* base 31:24 */ + +/* ring 3 code 32bit */ + .short 0xffff /* limit 15:00 */ + .short 0x0000 /* base 15:00 */ + .byte 0x00 /* base 23:16 */ + .byte 0b11111010 /* P(1) DPL(11) S(1) 1 C(0) R(1) A(0) */ + .byte 0b11001111 /* G(1) D(1) 0 0 limit 19:16 */ + .byte 0x0 /* base 31:24 */ + +/* ring 3 data 32bit */ + .short 0xffff /* limit 15:00 */ + .short 0x0000 /* base 15:00 */ + .byte 0x00 /* base 23:16 */ + .byte 0b11110010 /* P(1) DPL(11) S(1) 0 E(0) W(1) A(0) */ + .byte 0b11001111 /* G(1) B(1) 0 0 limit 19:16 */ + .byte 0x0 /* base 31:24 */ + +/* per-cpu TSS descriptor */ +.set i, 1 +.rept SMP_MAX_CPUS + .short 0 /* limit 15:00 */ + .short 0 /* base 15:00 */ + .byte 0 /* base 23:16 */ + .byte 0x89 /* 
P(1) DPL(00) S(0) TYPE(9) */ .byte 0x80 /* G(1) D/B(0) L(0) AVL(0) limit 19:16 */ .byte 0 /* base 31:24 */ .set i, i+1 .endr + +/* per-cpu GS descriptor for x86-32 */ +.set i, 1 +.rept SMP_MAX_CPUS + .int 0 /* filled in by C code later */ + .int 0 +.set i, i+1 +.endr + +END_DATA(_gdt) + +DATA(_gdt_end) + diff --git a/arch/x86/32/mmu.c b/arch/x86/32/mmu.c index cd6f3b23..34264019 100644 --- a/arch/x86/32/mmu.c +++ b/arch/x86/32/mmu.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -23,7 +24,12 @@ #include #include +// TODO: +// - proper tlb flush (local and SMP) +// - synchronization of top level page tables for user space aspaces + #define LOCAL_TRACE 0 +#define TRACE_CONTEXT_SWITCH 0 /* top level kernel page tables, initialized in start.S */ #if X86_LEGACY @@ -309,8 +315,6 @@ static status_t x86_mmu_unmap(map_addr_t * const init_table, const vaddr_t vaddr } int arch_mmu_unmap(arch_aspace_t * const aspace, const vaddr_t vaddr, const uint count) { - map_addr_t init_table_from_cr3; - LTRACEF("aspace %p, vaddr %#lx, count %u\n", aspace, vaddr, count); DEBUG_ASSERT(aspace); @@ -321,10 +325,7 @@ int arch_mmu_unmap(arch_aspace_t * const aspace, const vaddr_t vaddr, const uint if (count == 0) return NO_ERROR; - DEBUG_ASSERT(x86_get_cr3()); - init_table_from_cr3 = x86_get_cr3(); - - return (x86_mmu_unmap(paddr_to_kvaddr(init_table_from_cr3), vaddr, count)); + return (x86_mmu_unmap(aspace->cr3, vaddr, count)); } /** @@ -372,12 +373,9 @@ status_t arch_mmu_query(arch_aspace_t * const aspace, const vaddr_t vaddr, paddr if (!paddr) return ERR_INVALID_ARGS; - DEBUG_ASSERT(x86_get_cr3()); - uint32_t current_cr3_val = (map_addr_t)x86_get_cr3(); - arch_flags_t ret_flags; uint32_t ret_level; - status_t stat = x86_mmu_get_mapping(paddr_to_kvaddr(current_cr3_val), vaddr, &ret_level, &ret_flags, paddr); + status_t stat = x86_mmu_get_mapping(aspace->cr3, vaddr, &ret_level, &ret_flags, paddr); if (stat) return stat; @@ -404,27 +402,41 @@ int arch_mmu_map(arch_aspace_t * const aspace, const vaddr_t vaddr, const paddr_ if (count == 0) return NO_ERROR; - DEBUG_ASSERT(x86_get_cr3()); - uint32_t current_cr3_val = (map_addr_t)x86_get_cr3(); - struct map_range range; range.start_vaddr = vaddr; range.start_paddr = (map_addr_t)paddr; range.size = count * PAGE_SIZE; - return (x86_mmu_map_range(paddr_to_kvaddr(current_cr3_val), &range, flags)); + return (x86_mmu_map_range(aspace->cr3, &range, flags)); } bool arch_mmu_supports_nx_mappings(void) { return false; } bool arch_mmu_supports_ns_mappings(void) { return false; } -bool arch_mmu_supports_user_aspaces(void) { return false; } +bool arch_mmu_supports_user_aspaces(void) { return true; } -void x86_mmu_early_init(void) { - /* Set WP bit in CR0*/ - volatile uint32_t cr0 = x86_get_cr0(); +/* called once per cpu as it is brought up */ +void x86_mmu_early_init_percpu(void) { + /* Set WP bit in CR0 */ + uint32_t cr0 = x86_get_cr0(); cr0 |= X86_CR0_WP; x86_set_cr0(cr0); + /* Set some mmu control bits in CR4 */ + uint32_t bits = 0; + bits |= x86_feature_test(X86_FEATURE_PGE) ? X86_CR4_PGE : 0; + bits |= x86_feature_test(X86_FEATURE_PSE) ? X86_CR4_PSE : 0; + bits |= x86_feature_test(X86_FEATURE_SMEP) ? X86_CR4_SMEP : 0; + /* for now, we don't support SMAP due to some tests that assume they can access user space */ + // bits |= x86_feature_test(X86_FEATURE_SMAP) ? 
X86_CR4_SMAP : 0; + if (bits) { + /* don't touch cr4 unless we need to, early cpus will fault if it's not implemented */ + uint32_t cr4 = x86_get_cr4(); + cr4 |= bits; + x86_set_cr4(cr4); + } +} + +void x86_mmu_early_init(void) { /* unmap the lower identity mapping */ for (uint i = 0; i < (1024*1024*1024) / (4*1024*1024); i++) { kernel_pd[i] = 0; } @@ -444,8 +456,43 @@ void x86_mmu_init(void) { status_t arch_mmu_init_aspace(arch_aspace_t * const aspace, const vaddr_t base, const size_t size, const uint flags) { DEBUG_ASSERT(aspace); - if ((flags & ARCH_ASPACE_FLAG_KERNEL) == 0) { - return ERR_NOT_SUPPORTED; + LTRACEF("aspace %p, base %#lx, size %#zx, flags %#x\n", aspace, base, size, flags); + + /* validate that the base + size is sane and doesn't wrap */ + DEBUG_ASSERT(size > PAGE_SIZE); + DEBUG_ASSERT(base + size - 1 > base); + + aspace->flags = flags; + if (flags & ARCH_ASPACE_FLAG_KERNEL) { + /* at the moment we can only deal with address spaces as globally defined */ + DEBUG_ASSERT(base == KERNEL_ASPACE_BASE); + DEBUG_ASSERT(size == KERNEL_ASPACE_SIZE); + + aspace->base = base; + aspace->size = size; + aspace->cr3 = kernel_pd; + aspace->cr3_phys = vaddr_to_paddr(aspace->cr3); + } else { + DEBUG_ASSERT(base == USER_ASPACE_BASE); + DEBUG_ASSERT(size == USER_ASPACE_SIZE); + + aspace->base = base; + aspace->size = size; + + map_addr_t *va = pmm_alloc_kpages(1, NULL); + if (!va) { + return ERR_NO_MEMORY; + } + + aspace->cr3 = va; + aspace->cr3_phys = vaddr_to_paddr(aspace->cr3); + + /* copy the top entries from the kernel top table */ + memcpy(aspace->cr3 + NO_OF_PT_ENTRIES/2, kernel_pd + NO_OF_PT_ENTRIES/2, PAGE_SIZE/2); + + /* zero out the rest */ + memset(aspace->cr3, 0, PAGE_SIZE/2); } return NO_ERROR; @@ -456,8 +503,22 @@ status_t arch_mmu_destroy_aspace(arch_aspace_t * const aspace) { } void arch_mmu_context_switch(arch_aspace_t * const aspace) { - if (aspace != NULL) { - PANIC_UNIMPLEMENTED; + if (TRACE_CONTEXT_SWITCH) + TRACEF("aspace %p\n", aspace); + + uint64_t cr3; + if (aspace) { + DEBUG_ASSERT((aspace->flags & ARCH_ASPACE_FLAG_KERNEL) == 0); + + cr3 = aspace->cr3_phys; + } else { + // TODO save copy of this + cr3 = vaddr_to_paddr(kernel_pd); } + if (TRACE_CONTEXT_SWITCH) { + TRACEF("cr3 %#llx\n", cr3); + } + + x86_set_cr3(cr3); } diff --git a/arch/x86/32/spinlock.S b/arch/x86/32/spinlock.S new file mode 100644 index 00000000..bf48ca2b --- /dev/null +++ b/arch/x86/32/spinlock.S @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2025 Travis Geiselbrecht + * + * Use of this source code is governed by a MIT-style + * license that can be found in the LICENSE file or at + * https://opensource.org/licenses/MIT + */ +#include + +#if WITH_SMP + +// void arch_spin_lock(spin_lock_t *lock); +FUNCTION(arch_spin_lock) + mov 4(%esp), %ecx + + mov $1, %edx +0: + xor %eax, %eax + lock cmpxchg %edx, (%ecx) + jz 1f + pause + jmp 0b +1: + ret +END_FUNCTION(arch_spin_lock) + +// int arch_spin_trylock(spin_lock_t *lock); +FUNCTION(arch_spin_trylock) + mov 4(%esp), %ecx + + mov $1, %eax + lock xchg %eax, (%ecx) + + ret +END_FUNCTION(arch_spin_trylock) + +// void arch_spin_unlock(spin_lock_t *lock); +FUNCTION(arch_spin_unlock) + mov 4(%esp), %ecx + movl $0, (%ecx) + ret +END_FUNCTION(arch_spin_unlock) + +#endif // WITH_SMP \ No newline at end of file diff --git a/arch/x86/32/start.S b/arch/x86/32/start.S index 4c7d585a..57524d86 100644 --- a/arch/x86/32/start.S +++ b/arch/x86/32/start.S @@ -118,15 +118,6 @@ paging_setup: addl $4096, %eax movl %eax, 12(%esi) movl %eax, 12(%edi) - - /* Set PD 
in CR3 */ - movl $PHYS(kernel_pd), %eax - mov %eax, %cr3 - - /* Enabling Paging and from this point we are in */ - mov %cr0, %eax - btsl $(31), %eax - mov %eax, %cr0 #else /* map the first 1GB 1:1 using 4MB pages */ movl $PHYS(kernel_pd), %esi @@ -154,19 +145,20 @@ paging_setup: addl $0x00400000, %eax loop .Lfill_pd2 + /* enable PSE (4MB pages) */ + mov %cr4, %eax + orl $(1<<4), %eax + mov %eax, %cr4 +#endif + /* Set PD in CR3 */ movl $PHYS(kernel_pd), %eax mov %eax, %cr3 /* Enabling Paging and from this point we are in */ - mov %cr4, %eax - orl $0x10, %eax - mov %eax, %cr4 - mov %cr0, %eax btsl $(31), %eax mov %eax, %cr0 -#endif /* load the high kernel stack */ movl $(_kstack + 4096), %esp @@ -182,6 +174,11 @@ main_lk: /* set up the idt */ call setup_idt + /* set up the percpu data structure pointer for the boot cpu */ + pushl $0 + pushl $0 + call x86_configure_percpu_early + /* call the main module */ call lk_main 0: /* just sit around waiting for interrupts */ diff --git a/arch/x86/64/exceptions.S b/arch/x86/64/exceptions.S index 1a506ef1..092056a1 100644 --- a/arch/x86/64/exceptions.S +++ b/arch/x86/64/exceptions.S @@ -65,6 +65,8 @@ LOCAL_FUNCTION(interrupt_common) pushq %rsi pushq %rdi + /* TODO: deal with swapgs if coming from user space */ + /* pass the iframe using rdi */ movq %rsp, %rdi @@ -111,8 +113,6 @@ FUNCTION(setup_idt) loop .Lloop - lidt _idtr - ret END_FUNCTION(setup_idt) diff --git a/arch/x86/gdt.S b/arch/x86/64/gdt.S similarity index 80% rename from arch/x86/gdt.S rename to arch/x86/64/gdt.S index abb9c794..50b45e42 100644 --- a/arch/x86/gdt.S +++ b/arch/x86/64/gdt.S @@ -25,22 +25,26 @@ END_DATA(_gdtr_phys) .balign 8 DATA(_gdtr) .short _gdt_end - _gdt - 1 -#if ARCH_X86_32 - .int _gdt -#elif ARCH_X86_64 .quad _gdt -#endif END_DATA(_gdtr) +// 64bit GDT, laid out in a specific way due to requirements by the SYSENTER/SYSEXIT and +// SYSCALL/SYSRET instructions: +// +// CODE32 (for bootstrap purposes) +// CODE64 <- IA32_SYSENTER_CS, IA32_STAR.SYSCALL_CS +// DATA64 +// UCODE32 <- IA32_STAR.SYSRET_CS +// UDATA32 +// UCODE64 +// UDATA64 (optional if no 64bit sysenter support) .data .balign 8 DATA(_gdt) .int 0 .int 0 -/* ring 0 descriptors */ -.set codesel_32, . - _gdt -_code_32_gde: +/* ring 0 code 32bit (for bootstrapping into 64bit) */ .short 0xffff /* limit 15:00 */ .short 0x0000 /* base 15:00 */ .byte 0x00 /* base 23:16 */ @@ -48,36 +52,7 @@ _code_32_gde: .byte 0b11001111 /* G(1) D(1) 0 0 limit 19:16 */ .byte 0x0 /* base 31:24 */ -.set datasel, . - _gdt -_data_gde: - .short 0xffff /* limit 15:00 */ - .short 0x0000 /* base 15:00 */ - .byte 0x00 /* base 23:16 */ - .byte 0b10010010 /* P(1) DPL(00) S(1) 0 E(0) W(1) A(0) */ - .byte 0b11001111 /* G(1) B(1) 0 0 limit 19:16 */ - .byte 0x0 /* base 31:24 */ - -.set user_codesel_32, . - _gdt -_user_code_32_gde: - .short 0xffff /* limit 15:00 */ - .short 0x0000 /* base 15:00 */ - .byte 0x00 /* base 23:16 */ - .byte 0b11111010 /* P(1) DPL(11) S(1) 1 C(0) R(1) A(0) */ - .byte 0b11001111 /* G(1) D(1) 0 0 limit 19:16 */ - .byte 0x0 /* base 31:24 */ - - -.set user_datasel, . - _gdt -_user_data_32_gde: - .short 0xffff /* limit 15:00 */ - .short 0x0000 /* base 15:00 */ - .byte 0x00 /* base 23:16 */ - .byte 0b11110010 /* P(1) DPL(11) S(1) 0 E(0) W(1) A(0) */ - .byte 0b11001111 /* G(1) B(1) 0 0 limit 19:16 */ - .byte 0x0 /* base 31:24 */ - -.set codesel_64, . 
- _gdt -_code_64_gde: +/* ring 0 code 64bit */ .short 0xffff /* limit 15:00 */ .short 0x0000 /* base 15:00 */ .byte 0x00 /* base 23:16 */ @@ -85,26 +60,23 @@ _code_64_gde: .byte 0b10101111 /* G(1) D(0) L(1) AVL(0) limit 19:16 */ .byte 0x0 /* base 31:24 */ -.set datasel_64, . - _gdt -_data_64_gde: +/* ring 0 data 64bit */ .short 0xffff /* limit 15:00 */ .short 0x0000 /* base 15:00 */ .byte 0x00 /* base 23:16 */ - .byte 0b10010010 /* P(1) DPL(00) S(1) 1 C(0) R(1) A(0) */ - .byte 0b11001111 /* G(1) B(1) 0 AVL(0) limit 19:16 */ + .byte 0b10010010 /* P(1) DPL(00) S(1) 0 E(0) W(1) A(0) */ + .byte 0b11001111 /* G(1) B(1) 0 0 limit 19:16 */ .byte 0x0 /* base 31:24 */ -.set user_codesel_64, . - _gdt -_user_code_64_gde: +/* ring 3 code 32bit */ .short 0xffff /* limit 15:00 */ .short 0x0000 /* base 15:00 */ .byte 0x00 /* base 23:16 */ .byte 0b11111010 /* P(1) DPL(11) S(1) 1 C(0) R(1) A(0) */ - .byte 0b10101111 /* G(1) D(1) L(0) AVL(0) limit 19:16 */ + .byte 0b11001111 /* G(1) D(1) 0 0 limit 19:16 */ .byte 0x0 /* base 31:24 */ -.set user_datasel_64, . - _gdt -_user_data_64_gde: +/* ring 3 data 32bit */ .short 0xffff /* limit 15:00 */ .short 0x0000 /* base 15:00 */ .byte 0x00 /* base 23:16 */ @@ -112,9 +84,23 @@ _user_data_64_gde: .byte 0b11110010 /* P(1) DPL(11) S(1) 0 E(0) W(1) A(0) */ .byte 0b11001111 /* G(1) B(1) 0 0 limit 19:16 */ .byte 0x0 /* base 31:24 */ -/* TSS descriptor */ -.set tsssel, . - _gdt -_tss_gde: +/* ring 3 code 64bit */ + .short 0xffff /* limit 15:00 */ + .short 0x0000 /* base 15:00 */ + .byte 0x00 /* base 23:16 */ + .byte 0b11111010 /* P(1) DPL(11) S(1) 1 C(0) R(1) A(0) */ + .byte 0b10101111 /* G(1) D(0) L(1) AVL(0) limit 19:16 */ + .byte 0x0 /* base 31:24 */ + +/* ring 3 data 64bit */ + .short 0xffff /* limit 15:00 */ + .short 0x0000 /* base 15:00 */ + .byte 0x00 /* base 23:16 */ + .byte 0b11110010 /* P(1) DPL(11) S(1) 0 E(0) W(1) A(0) */ + .byte 0b11001111 /* G(1) B(1) 0 0 limit 19:16 */ + .byte 0x0 /* base 31:24 */ + +/* per-cpu TSS descriptor */ .set i, 1 .rept SMP_MAX_CPUS .short 0 /* limit 15:00 */ .short 0 /* base 15:00 */ .byte 0 /* base 23:16 */ .byte 0x89 /* P(1) DPL(00) S(0) TYPE(9) */ .byte 0x80 /* G(1) D/B(0) L(0) AVL(0) limit 19:16 */ .byte 0 /* base 31:24 */ - .quad 0x0000000000000000 + /* 64-bit TSS descriptors are 16 bytes long */ + .int 0 /* base 63:32 */ + .int 0 /* type(0) + reserved */ .set i, i+1 .endr diff --git a/arch/x86/64/mmu.c b/arch/x86/64/mmu.c index 858448e7..8f9dbca8 100644 --- a/arch/x86/64/mmu.c +++ b/arch/x86/64/mmu.c @@ -24,6 +24,11 @@ #include #define LOCAL_TRACE 0 +#define TRACE_CONTEXT_SWITCH 0 + +// TODO: +// - proper tlb flush (local and SMP) +// - synchronization of top level page tables for user space aspaces /* Address width including virtual/physical address*/ static uint8_t vaddr_width = 0; @@ -85,6 +90,11 @@ static bool x86_mmu_check_paddr(const paddr_t paddr) { return paddr <= max_paddr; } +/* is the address within the aspace */ +static bool is_valid_vaddr(const arch_aspace_t *aspace, vaddr_t vaddr) { + return (vaddr >= aspace->base && vaddr <= aspace->base + aspace->size - 1); +} + static inline uint64_t get_pfn_from_pte(uint64_t pte) { uint64_t pfn = (pte & (X86_PG_FRAME & X86_PHY_ADDR_MASK)); return pfn; } @@ -517,16 +527,13 @@ int arch_mmu_unmap(arch_aspace_t * const aspace, const vaddr_t vaddr, const uint DEBUG_ASSERT(aspace); - if (!(x86_mmu_check_vaddr(vaddr))) + if (!is_valid_vaddr(aspace, vaddr)) return ERR_INVALID_ARGS; if (count == 0) return NO_ERROR; - DEBUG_ASSERT(x86_get_cr3()); - paddr_t current_cr3_val = x86_get_cr3(); - - return (x86_mmu_unmap(paddr_to_kvaddr(current_cr3_val), vaddr, count)); + return 
(x86_mmu_unmap(aspace->cr3, vaddr, count)); } /** @@ -573,13 +580,12 @@ status_t arch_mmu_query(arch_aspace_t * const aspace, const vaddr_t vaddr, paddr if (!paddr) return ERR_INVALID_ARGS; - DEBUG_ASSERT(x86_get_cr3()); - paddr_t current_cr3_val = (addr_t)x86_get_cr3(); - uint64_t *cr3_virt = paddr_to_kvaddr(current_cr3_val); + if (!is_valid_vaddr(aspace, vaddr)) + return ERR_INVALID_ARGS; arch_flags_t ret_flags; uint32_t ret_level; - status_t stat = x86_mmu_get_mapping(cr3_virt, vaddr, &ret_level, &ret_flags, paddr); + status_t stat = x86_mmu_get_mapping(aspace->cr3, vaddr, &ret_level, &ret_flags, paddr); if (stat) return stat; @@ -602,48 +608,51 @@ int arch_mmu_map(arch_aspace_t *const aspace, const vaddr_t vaddr, const paddr_t if ((!x86_mmu_check_paddr(paddr))) return ERR_INVALID_ARGS; - if (!x86_mmu_check_vaddr(vaddr)) + if (!is_valid_vaddr(aspace, vaddr)) return ERR_INVALID_ARGS; if (count == 0) return NO_ERROR; - DEBUG_ASSERT(x86_get_cr3()); - addr_t current_cr3_val = (addr_t)x86_get_cr3(); - struct map_range range; range.start_vaddr = vaddr; range.start_paddr = paddr; range.size = count * PAGE_SIZE; - return (x86_mmu_map_range(paddr_to_kvaddr(current_cr3_val), &range, flags)); + return (x86_mmu_map_range(aspace->cr3, &range, flags)); } bool arch_mmu_supports_nx_mappings(void) { return true; } bool arch_mmu_supports_ns_mappings(void) { return false; } -bool arch_mmu_supports_user_aspaces(void) { return false; } +bool arch_mmu_supports_user_aspaces(void) { return true; }
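// A sketch of the map/query/unmap round trip these aspace-based routines now
// support, mirroring what arch/test/mmu.cpp above exercises. The helper name is
// hypothetical and for illustration only; ARCH_MMU_FLAG_PERM_USER is assumed
// from the generic arch mmu flag set, and error handling is elided.
static void user_aspace_round_trip_sketch(paddr_t pa) {
    arch_aspace_t as;
    arch_mmu_init_aspace(&as, USER_ASPACE_BASE, USER_ASPACE_SIZE, 0);
    arch_mmu_map(&as, USER_ASPACE_BASE, pa, 1, ARCH_MMU_FLAG_PERM_USER);

    paddr_t qpa;
    uint qflags;
    arch_mmu_query(&as, USER_ASPACE_BASE, &qpa, &qflags); // qpa == pa on success

    arch_mmu_unmap(&as, USER_ASPACE_BASE, 1);
    arch_mmu_context_switch(NULL); // make sure the table is not live anywhere
    arch_mmu_destroy_aspace(&as);
}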
-void x86_mmu_early_init(void) { - volatile uint64_t efer_msr, cr0, cr4; - - /* Set WP bit in CR0*/ - cr0 = x86_get_cr0(); +void x86_mmu_early_init_percpu(void) { + /* Set WP bit in CR0 */ + uint64_t cr0 = x86_get_cr0(); cr0 |= X86_CR0_WP; x86_set_cr0(cr0); - /* Setting the SMEP & SMAP bit in CR4 */ - cr4 = x86_get_cr4(); - if (x86_feature_test(X86_FEATURE_SMEP)) - cr4 |= X86_CR4_SMEP; - if (x86_feature_test(X86_FEATURE_SMAP)) - cr4 |= X86_CR4_SMAP; - x86_set_cr4(cr4); + /* Set some mmu control bits in CR4 */ + uint32_t bits = 0; + bits |= x86_feature_test(X86_FEATURE_PGE) ? X86_CR4_PGE : 0; + bits |= x86_feature_test(X86_FEATURE_PSE) ? X86_CR4_PSE : 0; + bits |= x86_feature_test(X86_FEATURE_SMEP) ? X86_CR4_SMEP : 0; + /* for now, we don't support SMAP due to some tests that assume they can access user space */ + // bits |= x86_feature_test(X86_FEATURE_SMAP) ? X86_CR4_SMAP : 0; + if (bits) { + /* don't touch cr4 unless we need to, early cpus will fault if it's not implemented */ + ulong cr4 = x86_get_cr4(); + cr4 |= bits; + x86_set_cr4(cr4); + } - /* Set NXE bit in MSR_EFER*/ - efer_msr = read_msr(X86_MSR_IA32_EFER); + /* Set NXE bit in MSR_EFER */ + uint64_t efer_msr = read_msr(X86_MSR_IA32_EFER); efer_msr |= X86_EFER_NXE; write_msr(X86_MSR_IA32_EFER, efer_msr); +} +void x86_mmu_early_init(void) { /* getting the address width from CPUID instr */ paddr_width = x86_get_paddr_width(); vaddr_width = x86_get_vaddr_width(); @@ -673,20 +682,78 @@ void x86_mmu_init(void) { status_t arch_mmu_init_aspace(arch_aspace_t * const aspace, const vaddr_t base, const size_t size, const uint flags) { DEBUG_ASSERT(aspace); - if ((flags & ARCH_ASPACE_FLAG_KERNEL) == 0) { - return ERR_NOT_SUPPORTED; + LTRACEF("aspace %p, base %#lx, size %#zx, flags %#x\n", aspace, base, size, flags); + + /* validate that the base + size is sane and doesn't wrap */ + DEBUG_ASSERT(size > PAGE_SIZE); + DEBUG_ASSERT(base + size - 1 > base); + + aspace->flags = flags; + if (flags & ARCH_ASPACE_FLAG_KERNEL) { + /* at the moment we can only deal with address spaces as globally defined */ + DEBUG_ASSERT(base == KERNEL_ASPACE_BASE); + DEBUG_ASSERT(size == KERNEL_ASPACE_SIZE); + + aspace->base = base; + aspace->size = size; + aspace->cr3 = kernel_pml4; + aspace->cr3_phys = vaddr_to_paddr(aspace->cr3); + } else { + DEBUG_ASSERT(base == USER_ASPACE_BASE); + DEBUG_ASSERT(size == USER_ASPACE_SIZE); + + aspace->base = base; + aspace->size = size; + + map_addr_t *va = pmm_alloc_kpages(1, NULL); + if (!va) { + return ERR_NO_MEMORY; + } + + aspace->cr3 = va; + aspace->cr3_phys = vaddr_to_paddr(aspace->cr3); + + /* copy the top entries from the kernel top table */ + memcpy(aspace->cr3 + NO_OF_PT_ENTRIES/2, kernel_pml4 + NO_OF_PT_ENTRIES/2, PAGE_SIZE/2); + + /* zero out the rest */ + memset(aspace->cr3, 0, PAGE_SIZE/2); } return NO_ERROR; } status_t arch_mmu_destroy_aspace(arch_aspace_t *aspace) { + // TODO: assert that we're not active on any cpus + if (aspace->flags & ARCH_ASPACE_FLAG_KERNEL) { + // can't destroy the kernel aspace + panic("attempt to destroy kernel aspace\n"); + return ERR_NOT_ALLOWED; + } + + // free the page table + pmm_free_kpages(aspace->cr3, 1); + return NO_ERROR; } void arch_mmu_context_switch(arch_aspace_t *aspace) { - if (aspace != NULL) { - PANIC_UNIMPLEMENTED; + if (TRACE_CONTEXT_SWITCH) + TRACEF("aspace %p\n", aspace); + + uint64_t cr3; + if (aspace) { + DEBUG_ASSERT((aspace->flags & ARCH_ASPACE_FLAG_KERNEL) == 0); + + cr3 = aspace->cr3_phys; + } else { + // TODO save copy of this + cr3 = vaddr_to_paddr(kernel_pml4); } + if (TRACE_CONTEXT_SWITCH) { + TRACEF("cr3 %#llx\n", cr3); + } + + x86_set_cr3(cr3); } diff --git a/arch/x86/64/spinlock.S b/arch/x86/64/spinlock.S new file mode 100644 index 00000000..53082af7 --- /dev/null +++ b/arch/x86/64/spinlock.S @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2025 Travis Geiselbrecht + * + * Use of this source code is governed by a MIT-style + * license that can be found in the LICENSE file or at + * https://opensource.org/licenses/MIT + */ +#include + +#if WITH_SMP + +// void arch_spin_lock(spin_lock_t *lock); +FUNCTION(arch_spin_lock) + mov $1, %esi +0: + xor %eax, %eax + lock cmpxchg %esi, (%rdi) + jz 1f + pause + jmp 0b +1: + ret +END_FUNCTION(arch_spin_lock) + +// int arch_spin_trylock(spin_lock_t *lock); +FUNCTION(arch_spin_trylock) + mov $1, %eax + + lock xchg %eax, (%rdi) + + ret +END_FUNCTION(arch_spin_trylock) + +// void arch_spin_unlock(spin_lock_t *lock); +FUNCTION(arch_spin_unlock) + movl $0, (%rdi) + ret +END_FUNCTION(arch_spin_unlock) + +#endif // WITH_SMP \ No newline at end of file
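// A C rendering of the spinlock assembly above (a sketch; the .S files are the
// real implementation, and the *_sketch names are hypothetical). The lock word
// is 0 when free: arch_spin_lock spins on cmpxchg(0 -> 1) with a pause in the
// wait loop, trylock does a single xchg and returns the previous value (0 means
// acquired), and unlock is a plain store of 0.
static inline void arch_spin_lock_sketch(spin_lock_t *lock) {
    for (;;) {
        spin_lock_t expected = 0;
        if (__atomic_compare_exchange_n(lock, &expected, 1, false,
                                        __ATOMIC_ACQUIRE, __ATOMIC_RELAXED))
            return;
        __builtin_ia32_pause(); // the pause in the 0: retry loop
    }
}

static inline int arch_spin_trylock_sketch(spin_lock_t *lock) {
    return __atomic_exchange_n(lock, 1, __ATOMIC_ACQUIRE); // 0 means acquired
}

static inline void arch_spin_unlock_sketch(spin_lock_t *lock) {
    __atomic_store_n(lock, 0, __ATOMIC_RELEASE);
}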
diff --git a/arch/x86/64/start.S b/arch/x86/64/start.S index 7637525a..a3aa9d41 100644 --- a/arch/x86/64/start.S +++ b/arch/x86/64/start.S @@ -96,7 +96,7 @@ paging_setup: /* PAE bit must be enabled for 64 bit paging*/ mov %cr4, %eax - btsl $(5), %eax + or $(1<<5), %eax mov %eax, %cr4 /* load the physical pointer to the top level page table */ @@ -201,9 +201,23 @@ highaddr: /* reload the gdtr */ lgdt _gdtr + /* zero out the data selectors */ + xor %eax, %eax + movw %ax, %ds + movw %ax, %es + movw %ax, %fs + movw %ax, %gs + movw %ax, %ss + /* set up the idt */ call setup_idt + /* set up the percpu data structure pointer for the boot cpu */ + xor %edi, %edi + xor %esi, %esi + call x86_configure_percpu_early + /* call the main module */ call lk_main diff --git a/arch/x86/arch.c b/arch/x86/arch.c index 6493c82e..a20dc873 100644 --- a/arch/x86/arch.c +++ b/arch/x86/arch.c @@ -19,35 +19,31 @@ #include #include #include -#include /* Describe how start.S sets up the MMU. * These data structures are later used by vm routines to lookup pointers * to physical pages based on physical addresses. */ struct mmu_initial_mapping mmu_initial_mappings[] = { -#if ARCH_X86_64 - /* 64GB of memory mapped where the kernel lives */ + /* the first chunk of physical memory mapped 1:1 (the physmap) */ { .phys = MEMBASE, .virt = KERNEL_ASPACE_BASE, - .size = PHYSMAP_SIZE, /* x86-64 maps first 64GB by default, 1GB on x86-32 */ + .size = PHYSMAP_SIZE, /* x86-64 maps first 64GB by default, 1GB on x86-32, 16MB in legacy mode */ .flags = 0, .name = "physmap" }, -#endif - /* 1GB of memory mapped where the kernel lives */ +#if ARCH_X86_64 + /* Another linear map of the first GB of memory where the kernel image + * lives at the top of the address space. */ { .phys = MEMBASE, .virt = KERNEL_BASE, -#if X86_LEGACY - .size = 16*MB, /* only map the first 16MB on legacy x86 due to page table usage */ -#else - .size = 1*GB, /* x86 maps first 1GB by default */ -#endif + .size = 1*GB, .flags = 0, .name = "kernel" }, +#endif /* null entry to terminate the list */ { 0 } @@ -63,13 +59,12 @@ __SECTION(".data") uint32_t _multiboot_info; /* main tss */ static tss_t system_tss __ALIGNED(16); -/* early initialization of the system, on the boot cpu, usually before any sort of - * printf output is available. 
- */ -void arch_early_init(void) { - /* enable caches here for now */ +void x86_early_init_percpu(void) { + // enable caches clear_in_cr0(X86_CR0_NW | X86_CR0_CD); + // configure the system TSS + // XXX move to a per cpu TSS in the percpu structure #if ARCH_X86_32 system_tss.esp0 = 0; system_tss.ss0 = DATA_SELECTOR; @@ -78,18 +73,33 @@ system_tss.eflags = 0x00003002; system_tss.bitmap = offsetof(tss_32_t, tss_bitmap); system_tss.trace = 1; // trap on hardware task switch +#elif ARCH_X86_64 + /* nothing to be done here, a fully zeroed TSS is a good starting point */ #endif +#if ARCH_X86_64 + /* 64-bit TSS descriptors occupy two 8-byte GDT slots */ + const uint selector = TSS_SELECTOR_BASE + 16 * arch_curr_cpu_num(); +#else + const uint selector = TSS_SELECTOR_BASE + 8 * arch_curr_cpu_num(); +#endif + x86_set_gdt_descriptor(selector, &system_tss, sizeof(system_tss), 1, 0, 0, SEG_TYPE_TSS, 0, 0); + x86_ltr(selector); - set_global_desc(TSS_SELECTOR, &system_tss, sizeof(system_tss), 1, 0, 0, SEG_TYPE_TSS, 0, 0); - x86_ltr(TSS_SELECTOR); + /* load the kernel's IDT */ + asm("lidt _idtr"); + x86_mmu_early_init_percpu(); +#if X86_WITH_FPU + x86_fpu_early_init_percpu(); +#endif +} + +/* early initialization of the system, on the boot cpu, usually before any sort of + * printf output is available. + */ +void arch_early_init(void) { x86_feature_early_init(); - x86_mmu_early_init(); - #if X86_WITH_FPU x86_fpu_early_init(); #endif + + x86_early_init_percpu(); } /* later initialization pass, once the main kernel is initialized and scheduling has begun */ diff --git a/arch/x86/descriptor.c b/arch/x86/descriptor.c index 883fb8ec..58ec998c 100644 --- a/arch/x86/descriptor.c +++ b/arch/x86/descriptor.c @@ -5,62 +5,89 @@ * license that can be found in the LICENSE file or at * https://opensource.org/licenses/MIT */ - -#include #include -/* not the best way to do this, but easy for now */ -typedef union { - struct { - uint16_t limit_15_0; - uint16_t base_15_0; - uint8_t base_23_16; +#include +#include - uint8_t type : 4; - uint8_t s : 1; - uint8_t dpl : 2; - uint8_t p : 1; +extern uint64_t _gdt[]; - uint8_t limit_19_16 : 4; - uint8_t avl : 1; - uint8_t reserved_0 : 1; - uint8_t d_b : 1; - uint8_t g : 1; - - uint8_t base_31_24; - } __PACKED seg_desc_legacy; - - struct { - uint32_t base_63_32; - uint32_t reserved_1; - } __PACKED seg_desc_64; -} __PACKED seg_desc_t; - -extern seg_desc_t _gdt[]; - -void set_global_desc(seg_sel_t sel, void *base, uint32_t limit, +void x86_set_gdt_descriptor(seg_sel_t sel, void *base, uint32_t limit, uint8_t present, uint8_t ring, uint8_t sys, uint8_t type, uint8_t gran, uint8_t bits) { - // convert selector into index + typedef struct { + struct { + uint16_t limit_15_0; + uint16_t base_15_0; + uint8_t base_23_16; + + uint8_t type : 4; + uint8_t s : 1; + uint8_t dpl : 2; + uint8_t p : 1; + + uint8_t limit_19_16 : 4; + uint8_t avl : 1; + uint8_t reserved : 1; + uint8_t d_b : 1; + uint8_t g : 1; + + uint8_t base_31_24; + } seg_desc_legacy; + +#if ARCH_X86_64 + // some descriptors have additional fields for x86-64 + struct { + uint32_t base_63_32; + uint32_t reserved; + } seg_desc_64; +#endif + } seg_desc_t; + +#if ARCH_X86_64 + static_assert(sizeof(seg_desc_t) == 16, "seg_desc_t size mismatch"); +#else + static_assert(sizeof(seg_desc_t) == 8, "seg_desc_t size mismatch"); +#endif + + seg_desc_t desc = {0}; + + desc.seg_desc_legacy.limit_15_0 = limit & 0x0000ffff; + desc.seg_desc_legacy.limit_19_16 = (limit & 0x000f0000) >> 16; + + desc.seg_desc_legacy.base_15_0 = ((uintptr_t) base) & 0x0000ffff; + desc.seg_desc_legacy.base_23_16 = (((uintptr_t) base) & 0x00ff0000) >> 16; + desc.seg_desc_legacy.base_31_24 = 
((uintptr_t) base) >> 24; + + desc.seg_desc_legacy.type = type & 0x0f; // segment type + desc.seg_desc_legacy.s = sys != 0; // system / non-system + desc.seg_desc_legacy.dpl = ring & 0x03; // descriptor privilege level + desc.seg_desc_legacy.p = present != 0; // present + desc.seg_desc_legacy.avl = 0; + desc.seg_desc_legacy.reserved = 0; + desc.seg_desc_legacy.d_b = bits != 0; // 16 / 32 bit + desc.seg_desc_legacy.g = gran != 0; // granularity + + // convert the selector into an index; GDT entries are indexed in 8 byte units uint16_t index = sel >> 3; - - _gdt[index].seg_desc_legacy.limit_15_0 = limit & 0x0000ffff; - _gdt[index].seg_desc_legacy.limit_19_16 = (limit & 0x000f0000) >> 16; - - _gdt[index].seg_desc_legacy.base_15_0 = ((uintptr_t) base) & 0x0000ffff; - _gdt[index].seg_desc_legacy.base_23_16 = (((uintptr_t) base) & 0x00ff0000) >> 16; - _gdt[index].seg_desc_legacy.base_31_24 = ((uintptr_t) base) >> 24; - - _gdt[index].seg_desc_legacy.type = type & 0x0f; // segment type - _gdt[index].seg_desc_legacy.p = present != 0; // present - _gdt[index].seg_desc_legacy.dpl = ring & 0x03; // descriptor privilege level - _gdt[index].seg_desc_legacy.g = gran != 0; // granularity - _gdt[index].seg_desc_legacy.s = sys != 0; // system / non-system - _gdt[index].seg_desc_legacy.d_b = bits != 0; // 16 / 32 bit + seg_desc_t *entry = (seg_desc_t *)&_gdt[index]; + entry->seg_desc_legacy = desc.seg_desc_legacy; #ifdef ARCH_X86_64 - if (TSS_SELECTOR == sel) { - _gdt[index + 1].seg_desc_64.base_63_32 = (uint32_t)((uintptr_t) base >> 32); - _gdt[index + 1].seg_desc_64.reserved_1 = 0; + if (sys == 0) { + // some of the system descriptors have two more words + switch (type) { + case SEG_TYPE_TSS: + case SEG_TYPE_TSS_BUSY: + case SEG_TYPE_LDT: + case SEG_TYPE_CALL_GATE: + // fill in the upper 32 bits of the base + desc.seg_desc_64.base_63_32 = (uint32_t)((uintptr_t) base >> 32); + desc.seg_desc_64.reserved = 0; + + // write the upper 8 bytes of the 16-byte descriptor + entry->seg_desc_64 = desc.seg_desc_64; + break; + } } #endif } diff --git a/arch/x86/faults.c b/arch/x86/faults.c index a59ec33e..2d1fbcd4 100644 --- a/arch/x86/faults.c +++ b/arch/x86/faults.c @@ -26,6 +26,7 @@ extern enum handler_return platform_irq(x86_iframe_t *frame); static void dump_fault_frame(x86_iframe_t *frame) { + dprintf(CRITICAL, "cpu %u:\n", arch_curr_cpu_num()); #if ARCH_X86_32 dprintf(CRITICAL, " CS: %04hx EIP: %08x EFL: %08x CR2: %08lx\n", frame->cs, frame->ip, frame->flags, x86_get_cr2());
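// Cross-checking x86_set_gdt_descriptor against the tables in gdt.S (a sketch,
// not part of the patch): the flat ring 0 32-bit code segment (base 0, 20-bit
// limit 0xfffff, 4K granularity) would be expressed through this interface as
//
//   x86_set_gdt_descriptor(CODE_SELECTOR, NULL, 0xfffff,
//                          1 /* present */, 0 /* ring 0 */, 1 /* non-system */,
//                          SEG_TYPE_CODE_RO, 1 /* 4K gran */, 1 /* 32-bit */);
//
// which packs to the access/flags bytes 0b10011010 and 0b11001111 seen in the
// 32-bit GDT above.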
diff --git a/arch/x86/feature.c b/arch/x86/feature.c index e4811652..ce28c280 100644 --- a/arch/x86/feature.c +++ b/arch/x86/feature.c @@ -133,10 +133,13 @@ static void x86_cpu_detect(void) { // read max hypervisor leaf cpuid(X86_CPUID_HYP_BASE, &a, &b, &c, &d); - // TODO: actually check that it's an understood hypervisor before setting this. - // It's possible on real hardware it's just returning the last valid regular cpuid. - if (a >= X86_CPUID_HYP_BASE) { + + // Check that it's an understood hypervisor leaf + if ((b == 0x4b4d564b && c == 0x564b4d56 && d == 0x4d) || /* KVMKVMKVM */ + (b == 0x54474354 && c == 0x43544743 && d == 0x47435447)) { /* TCGTCGTCGTCG */ max_cpuid_leaf_hyp = MIN(a, __X86_MAX_SUPPORTED_CPUID_HYP); + } else { + max_cpuid_leaf_hyp = 0; } } else { __x86_cpu_vendor = X86_CPU_VENDOR_INTEL; // intrinsically Intel without cpuid @@ -191,12 +194,12 @@ void x86_feature_early_init(void) { // cache a copy of the cpuid bits if (has_cpuid) { - for (uint32_t i = 1; i <= max_cpuid_leaf; i++) { + for (uint32_t i = 0; i <= max_cpuid_leaf; i++) { cpuid_c(i, 0, &saved_cpuids[i].a, &saved_cpuids[i].b, &saved_cpuids[i].c, &saved_cpuids[i].d); } if (max_cpuid_leaf_ext > 0) { - for (uint32_t i = X86_CPUID_EXT_BASE + 1; i - 1 < max_cpuid_leaf_ext; i++) { + for (uint32_t i = X86_CPUID_EXT_BASE; i <= max_cpuid_leaf_ext; i++) { uint32_t index = i - X86_CPUID_EXT_BASE; cpuid_c(i, 0, &saved_cpuids_ext[index].a, &saved_cpuids_ext[index].b, &saved_cpuids_ext[index].c, &saved_cpuids_ext[index].d); @@ -204,7 +207,7 @@ } if (max_cpuid_leaf_hyp > 0) { - for (uint32_t i = X86_CPUID_HYP_BASE + 1; i - 1 < max_cpuid_leaf_hyp; i++) { + for (uint32_t i = X86_CPUID_HYP_BASE; i <= max_cpuid_leaf_hyp; i++) { uint32_t index = i - X86_CPUID_HYP_BASE; cpuid_c(i, 0, &saved_cpuids_hyp[index].a, &saved_cpuids_hyp[index].b, &saved_cpuids_hyp[index].c, &saved_cpuids_hyp[index].d); @@ -213,6 +216,23 @@ } } +static void x86_feature_dump_cpuid(void) { + for (uint32_t i = X86_CPUID_BASE; i <= max_cpuid_leaf; i++) { + printf("X86: cpuid leaf %#x: %08x %08x %08x %08x\n", i, + saved_cpuids[i - X86_CPUID_BASE].a, saved_cpuids[i - X86_CPUID_BASE].b, saved_cpuids[i - X86_CPUID_BASE].c, saved_cpuids[i - X86_CPUID_BASE].d); + } + for (uint32_t i = X86_CPUID_HYP_BASE; i <= max_cpuid_leaf_hyp; i++) { + uint32_t index = i - X86_CPUID_HYP_BASE; + printf("X86: cpuid leaf %#x: %08x %08x %08x %08x\n", i, + saved_cpuids_hyp[index].a, saved_cpuids_hyp[index].b, saved_cpuids_hyp[index].c, saved_cpuids_hyp[index].d); + } + for (uint32_t i = X86_CPUID_EXT_BASE; i <= max_cpuid_leaf_ext; i++) { + uint32_t index = i - X86_CPUID_EXT_BASE; + printf("X86: cpuid leaf %#x: %08x %08x %08x %08x\n", i, + saved_cpuids_ext[index].a, saved_cpuids_ext[index].b, saved_cpuids_ext[index].c, saved_cpuids_ext[index].d); + } +} + /* later feature init hook, called after the kernel is able to schedule */ void x86_feature_init(void) { dprintf(SPEW, "X86: detected cpu level %d has_cpuid %d\n", x86_get_cpu_level(), has_cpuid); @@ -243,6 +263,10 @@ printf("X86: processor model info type %#x family %#x model %#x stepping %#x\n", model->processor_type, model->family, model->model, model->stepping); printf("\tdisplay_family %#x display_model %#x\n", model->display_family, model->display_model); + + if (has_cpuid && LK_DEBUGLEVEL > 1) { + x86_feature_dump_cpuid(); + } } bool x86_get_cpuid_subleaf(enum x86_cpuid_leaf_num num, uint32_t subleaf, struct x86_cpuid_leaf* leaf) { diff --git a/arch/x86/fpu.c b/arch/x86/fpu.c index 5ddf5eba..0777af21 100644 --- a/arch/x86/fpu.c +++ b/arch/x86/fpu.c @@ -63,6 +63,56 @@ typedef struct { static fpu_features_t fpu_features; +/* called per cpu as they're brought up */ +void x86_fpu_early_init_percpu(void) { + if (!fp_supported) { + return; + } + + /* No x87 emul, monitor co-processor */ + ulong x = x86_get_cr0(); + x &= ~X86_CR0_EM; + x |= X86_CR0_NE; + x |= X86_CR0_MP; + 
x86_set_cr0(x); + + /* Init x87 */ + uint16_t fcw; + __asm__ __volatile__ ("finit"); + __asm__ __volatile__("fstcw %0" : "=m" (fcw)); +#if FPU_MASK_ALL_EXCEPTIONS + /* mask all exceptions */ + fcw |= 0x3f; +#else + /* unmask all exceptions */ + fcw &= 0xffc0; +#endif + __asm__ __volatile__("fldcw %0" : : "m" (fcw)); + + /* Init SSE */ + x = x86_get_cr4(); + x |= X86_CR4_OSXMMEXPT; // supports exceptions + x |= X86_CR4_OSFXSR; // supports fxsave + x &= ~X86_CR4_OSXSAVE; // no support for xsave (currently) + x86_set_cr4(x); + + uint32_t mxcsr; + __asm__ __volatile__("stmxcsr %0" : "=m" (mxcsr)); +#if FPU_MASK_ALL_EXCEPTIONS + /* mask all exceptions */ + mxcsr = (0x3f << 7); +#else + /* unmask all exceptions */ + mxcsr &= 0x0000003f; +#endif + __asm__ __volatile__("ldmxcsr %0" : : "m" (mxcsr)); + + /* save fpu initial states, and used when new thread creates */ + __asm__ __volatile__("fxsave %0" : "=m" (fpu_init_states)); + + x86_set_cr0(x86_get_cr0() | X86_CR0_TS); +} + /* called on the first cpu before the kernel is initialized. printfs may not work here */ void x86_fpu_early_init(void) { fp_supported = false; @@ -115,51 +165,6 @@ void x86_fpu_early_init(void) { } } } - - /* No x87 emul, monitor co-processor */ - ulong x = x86_get_cr0(); - x &= ~X86_CR0_EM; - x |= X86_CR0_NE; - x |= X86_CR0_MP; - x86_set_cr0(x); - - /* Init x87 */ - uint16_t fcw; - __asm__ __volatile__ ("finit"); - __asm__ __volatile__("fstcw %0" : "=m" (fcw)); -#if FPU_MASK_ALL_EXCEPTIONS - /* mask all exceptions */ - fcw |= 0x3f; -#else - /* unmask all exceptions */ - fcw &= 0xffc0; -#endif - __asm__ __volatile__("fldcw %0" : : "m" (fcw)); - - /* Init SSE */ - x = x86_get_cr4(); - x |= X86_CR4_OSXMMEXPT; // supports exceptions - x |= X86_CR4_OSFXSR; // supports fxsave - x &= ~X86_CR4_OSXSAVE; // no support for xsave (currently) - x86_set_cr4(x); - - uint32_t mxcsr; - __asm__ __volatile__("stmxcsr %0" : "=m" (mxcsr)); -#if FPU_MASK_ALL_EXCEPTIONS - /* mask all exceptions */ - mxcsr = (0x3f << 7); -#else - /* unmask all exceptions */ - mxcsr &= 0x0000003f; -#endif - __asm__ __volatile__("ldmxcsr %0" : : "m" (mxcsr)); - - /* save fpu initial states, and used when new thread creates */ - __asm__ __volatile__("fxsave %0" : "=m" (fpu_init_states)); - - x86_set_cr0(x86_get_cr0() | X86_CR0_TS); - - return; } void x86_fpu_init(void) { diff --git a/arch/x86/include/arch/arch_ops.h b/arch/x86/include/arch/arch_ops.h index b3092d60..6b539c00 100644 --- a/arch/x86/include/arch/arch_ops.h +++ b/arch/x86/include/arch/arch_ops.h @@ -50,6 +50,20 @@ static inline ulong arch_cycle_count(void) { #endif } +#if WITH_SMP +#include +static inline struct thread *arch_get_current_thread(void) { + return x86_get_current_thread(); +} + +static inline void arch_set_current_thread(struct thread *t) { + x86_set_current_thread(t); +} + +static inline uint arch_curr_cpu_num(void) { + return x86_get_cpu_num(); +} +#else /* use a global pointer to store the current_thread */ extern struct thread *_current_thread; @@ -64,6 +78,7 @@ static inline void arch_set_current_thread(struct thread *t) { static inline uint arch_curr_cpu_num(void) { return 0; } +#endif #if ARCH_X86_64 // relies on SSE2 diff --git a/arch/x86/include/arch/aspace.h b/arch/x86/include/arch/aspace.h index e8d26ff0..32cbac86 100644 --- a/arch/x86/include/arch/aspace.h +++ b/arch/x86/include/arch/aspace.h @@ -8,11 +8,21 @@ #pragma once #include +#include +#include __BEGIN_CDECLS struct arch_aspace { - // nothing for now, does not support address spaces other than the kernel + /* pointer to 
the root page table */ + paddr_t cr3_phys; + map_addr_t *cr3; + + uint flags; + + /* range of address space */ + vaddr_t base; + size_t size; }; __END_CDECLS diff --git a/arch/x86/include/arch/defines.h b/arch/x86/include/arch/defines.h index 583e86ba..6244fbeb 100644 --- a/arch/x86/include/arch/defines.h +++ b/arch/x86/include/arch/defines.h @@ -18,8 +18,13 @@ /* based on how start.S sets up the physmap */ #if ARCH_X86_64 -#define PHYSMAP_SIZE (64ULL*GB) +#define PHYSMAP_SIZE (64ULL*1024*1024*1024) +#elif X86_LEGACY +/* Only map the first 16MB on legacy x86 due to page table usage + * due to lack of 4MB pages. */ +#define PHYSMAP_SIZE (16ULL*1024*1024) #elif ARCH_X86_32 -#define PHYSMAP_SIZE (1ULL*GB) +/* Map 1GB by default for x86-32 */ +#define PHYSMAP_SIZE (1ULL*1024*1024*1024) #endif diff --git a/arch/x86/include/arch/fpu.h b/arch/x86/include/arch/fpu.h index 59f62961..3e1f930e 100644 --- a/arch/x86/include/arch/fpu.h +++ b/arch/x86/include/arch/fpu.h @@ -26,6 +26,7 @@ void x86_fpu_early_init(void); void x86_fpu_init(void); +void x86_fpu_early_init_percpu(void); void fpu_init_thread_states(thread_t *t); void fpu_context_switch(thread_t *old_thread, thread_t *new_thread); void fpu_dev_na_handler(void); diff --git a/arch/x86/include/arch/spinlock.h b/arch/x86/include/arch/spinlock.h index a875846c..5c796028 100644 --- a/arch/x86/include/arch/spinlock.h +++ b/arch/x86/include/arch/spinlock.h @@ -7,13 +7,16 @@ */ #pragma once +#include #include #include #include #define SPIN_LOCK_INITIAL_VALUE (0) -typedef unsigned long spin_lock_t; +__BEGIN_CDECLS + +typedef unsigned int spin_lock_t; typedef x86_flags_t spin_lock_saved_state_t; typedef uint spin_lock_save_flags_t; @@ -27,6 +30,11 @@ static inline bool arch_spin_lock_held(spin_lock_t *lock) { return *lock != 0; } +#if WITH_SMP +void arch_spin_lock(spin_lock_t *lock); +int arch_spin_trylock(spin_lock_t *lock); +void arch_spin_unlock(spin_lock_t *lock); +#else static inline void arch_spin_lock(spin_lock_t *lock) { *lock = 1; } @@ -38,6 +46,7 @@ static inline int arch_spin_trylock(spin_lock_t *lock) { static inline void arch_spin_unlock(spin_lock_t *lock) { *lock = 0; } +#endif /* flags are unused on x86 */ #define ARCH_DEFAULT_SPIN_LOCK_FLAG_INTERRUPTS 0 @@ -53,4 +62,4 @@ arch_interrupt_restore(spin_lock_saved_state_t old_state, spin_lock_save_flags_t x86_restore_flags(old_state); } - +__END_CDECLS diff --git a/arch/x86/include/arch/x86.h b/arch/x86/include/arch/x86.h index c23a812c..585c2b2f 100644 --- a/arch/x86/include/arch/x86.h +++ b/arch/x86/include/arch/x86.h @@ -199,6 +199,7 @@ typedef tss_64_t tss_t; #define X86_MSR_IA32_PM_ENABLE 0x00000770 /* enable/disable HWP */ #define X86_MSR_IA32_HWP_CAPABILITIES 0x00000771 /* HWP performance range enumeration */ #define X86_MSR_IA32_HWP_REQUEST 0x00000774 /* power manage control hints */ +#define X86_MSR_IA32_X2APIC_BASE 0x00000800 /* X2APIC base register */ #define X86_MSR_IA32_EFER 0xc0000080 /* EFER */ #define X86_MSR_IA32_STAR 0xc0000081 /* system call address */ #define X86_MSR_IA32_LSTAR 0xc0000082 /* long mode call address */ @@ -316,6 +317,23 @@ static inline void x86_set_cr4(ulong in_val) { __asm__ __volatile__("mov %0,%%cr4 \n\t" : : "r"(in_val)); } +#define DEFINE_REGISTER_ACCESSOR(REG) \ + static inline void x86_set_##REG(uint16_t value) { \ + __asm__ volatile("mov %0, %%" #REG : : "r"(value)); \ + } \ + static inline uint16_t x86_get_##REG(void) { \ + uint16_t value; \ + __asm__ volatile("mov %%" #REG ", %0" : "=r"(value)); \ + return value; \ + } + 
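// For reference, each DEFINE_REGISTER_ACCESSOR(reg) invocation below expands to
// a pair of inline helpers; DEFINE_REGISTER_ACCESSOR(ds) produces the
// equivalent of:
//
//   static inline void x86_set_ds(uint16_t value) {
//       __asm__ volatile("mov %0, %%ds" : : "r"(value));
//   }
//   static inline uint16_t x86_get_ds(void) {
//       uint16_t value;
//       __asm__ volatile("mov %%ds, %0" : "=r"(value));
//       return value;
//   }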
+DEFINE_REGISTER_ACCESSOR(ds) +DEFINE_REGISTER_ACCESSOR(es) +DEFINE_REGISTER_ACCESSOR(fs) +DEFINE_REGISTER_ACCESSOR(gs) + +#undef DEFINE_REGISTER_ACCESSOR + static inline uint8_t inp(uint16_t _port) { uint8_t rv; __asm__ __volatile__("inb %1, %0" : "=a"(rv) : "dN"(_port)); @@ -471,6 +489,47 @@ static inline void write_msr (uint32_t msr_id, uint64_t msr_write_val) { : : "c" (msr_id), "a" (low_val), "d"(high_val)); } +#pragma GCC diagnostic push +/* The dereference of offset in the inline asm below generates this warning in GCC */ +#pragma GCC diagnostic ignored "-Warray-bounds" +static inline uint64_t x86_read_gs_offset64(uintptr_t offset) { + uint64_t ret; + __asm__("movq %%gs:%1, %0" : "=r"(ret) : "m"(*(uint64_t*)(offset))); + return ret; +} + +static inline void x86_write_gs_offset64(uintptr_t offset, uint64_t val) { + __asm__("movq %0, %%gs:%1" : : "ir"(val), "m"(*(uint64_t*)(offset)) : "memory"); +} + +static inline uint32_t x86_read_gs_offset32(uintptr_t offset) { + uint32_t ret; + __asm__("movl %%gs:%1, %0" : "=r"(ret) : "m"(*(uint32_t*)(offset))); + return ret; +} + +static inline void x86_write_gs_offset32(uintptr_t offset, uint32_t val) { + __asm__("movl %0, %%gs:%1" : : "ir"(val), "m"(*(uint32_t*)(offset)) : "memory"); +} +#pragma GCC diagnostic pop + +/* cannot easily use C generics or C++ templates here, so do it the hard way */ +#if __SIZEOF_POINTER__ == 8 +static inline void *x86_read_gs_offset_ptr(uintptr_t offset) { + return (void *)x86_read_gs_offset64(offset); +} +static inline void x86_write_gs_offset_ptr(uintptr_t offset, void *val) { + x86_write_gs_offset64(offset, (uint64_t)(val)); +} +#else +static inline void *x86_read_gs_offset_ptr(uintptr_t offset) { + return (void *)x86_read_gs_offset32(offset); +} +static inline void x86_write_gs_offset_ptr(uintptr_t offset, void *val) { + x86_write_gs_offset32(offset, (uint32_t)(val)); +} +#endif + typedef ulong x86_flags_t; static inline x86_flags_t x86_save_flags(void) { @@ -497,4 +556,6 @@ static inline void tlbsync_local(vaddr_t address) { asm volatile("invlpg %0" :: "m"(*(uint8_t *)address)); } +void x86_early_init_percpu(void); + __END_CDECLS diff --git a/arch/x86/include/arch/x86/descriptor.h b/arch/x86/include/arch/x86/descriptor.h index 804a9f66..03221654 100644 --- a/arch/x86/include/arch/x86/descriptor.h +++ b/arch/x86/include/arch/x86/descriptor.h @@ -1,6 +1,7 @@ /* * Copyright (c) 2009 Corey Tabaka * Copyright (c) 2014 Intel Corporation + * Copyright (c) 2024 Travis Geiselbrecht * * Use of this source code is governed by a MIT-style * license that can be found in the LICENSE file or at @@ -8,34 +9,90 @@ */ #pragma once -/* - * System Selectors - */ +// System Selectors #define NULL_SELECTOR 0x00 -/********* x86 selectors *********/ +// ********* x86 selectors ********* +// Laid out slightly differently based on 32 vs 64 bit mode +#if ARCH_X86_64 + +#define CODE_SELECTOR 0x08 +#define CODE_64_SELECTOR 0x10 +#define DATA_SELECTOR 0x18 +#define USER_CODE_32_SELECTOR 0x20 +#define USER_DATA_32_SELECTOR 0x28 +#define USER_CODE_64_SELECTOR 0x30 +#define USER_DATA_64_SELECTOR 0x38 +#define TSS_SELECTOR_BASE 0x40 + +#elif ARCH_X86_32 + #define CODE_SELECTOR 0x08 #define DATA_SELECTOR 0x10 #define USER_CODE_32_SELECTOR 0x18 #define USER_DATA_32_SELECTOR 0x20 +#define TSS_SELECTOR_BASE 0x28 -/******* x86-64 selectors ********/ -#define CODE_64_SELECTOR 0x28 -#define STACK_64_SELECTOR 0x30 -#define USER_CODE_64_SELECTOR 0x38 -#define USER_DATA_64_SELECTOR 0x40 +#else +#error unknown architecture +#endif -#define TSS_SELECTOR 
0x48 +// Base selector for a gs segment per cpu (SMP_MAX_CPUS) +#define PERCPU_SELECTOR_BASE (TSS_SELECTOR_BASE + 8 * SMP_MAX_CPUS) -/* - * Descriptor Types - */ -#define SEG_TYPE_TSS 0x9 -#define SEG_TYPE_TSS_BUSY 0xb -#define SEG_TYPE_TASK_GATE 0x5 -#define SEG_TYPE_INT_GATE 0xe // 32 bit -#define SEG_TYPE_DATA_RW 0x2 -#define SEG_TYPE_CODE_RW 0xa +// Worksheet of what the syscall instructions do which affects the GDT layout: +// SYSENTER +// CS = IA32_SYSENTER_CS +// SS = IA32_SYSENTER_CS + 8 +// SYSEXIT 32 +// CS = IA32_SYSENTER_CS + 16 +// SS = IA32_SYSENTER_CS + 24 +// SYSEXIT 64 +// CS = IA32_SYSENTER_CS + 32 +// SS = IA32_SYSENTER_CS + 40 + +// SYSCALL +// CS = IA32_STAR.SYSCALL_CS +// SS = IA32_STAR.SYSCALL_CS + 8 +// SYSRET 32 +// CS = IA32_STAR.SYSRET_CS +// SS = IA32_STAR.SYSRET_CS + 8 +// SYSRET 64 +// CS = IA32_STAR.SYSRET_CS + 16 +// SS = IA32_STAR.SYSRET_CS + 8 + +// code/data segment types (S = 1) +// bit 0 is A (accessed) +// bit 1 is W (writable) +// bit 2 is E (expand-down) +// bit 3 is data (0) vs code (1) + +// data segment types: +#define SEG_TYPE_DATA_RO 0x0 +#define SEG_TYPE_DATA_RW 0x2 +#define SEG_TYPE_DATA_RO_EXPAND_DOWN 0x4 +#define SEG_TYPE_DATA_RW_EXPAND_DOWN 0x6 + +// code segment types: +// bit 1 is R (readable), bit 2 is C (conforming) +#define SEG_TYPE_CODE_XO 0x9 +#define SEG_TYPE_CODE_RO 0xa +#define SEG_TYPE_CODE_XO_CONFORMING 0xc +#define SEG_TYPE_CODE_RO_CONFORMING 0xe + +// system segment types (S = 0) +#define SEG_TYPE_TSS_16 0x1 +#define SEG_TYPE_LDT 0x2 // usable in 64bit +#define SEG_TYPE_TSS_16_BUSY 0x3 +#define SEG_TYPE_CALL_GATE_16 0x4 +#define SEG_TYPE_TASK_GATE 0x5 +#define SEG_TYPE_INT_GATE_16 0x6 +#define SEG_TYPE_TRAP_GATE_16 0x7 +#define SEG_TYPE_TSS 0x9 // usable in 64bit +#define SEG_TYPE_TSS_BUSY 0xb // usable in 64bit +#define SEG_TYPE_CALL_GATE 0xc // usable in 64bit +#define SEG_TYPE_INT_GATE 0xe // usable in 64bit +#define SEG_TYPE_TRAP_GATE 0xf // usable in 64bit #ifndef ASSEMBLY @@ -43,7 +100,7 @@ typedef uint16_t seg_sel_t; -void set_global_desc(seg_sel_t sel, void *base, uint32_t limit, +void x86_set_gdt_descriptor(seg_sel_t sel, void *base, uint32_t limit, uint8_t present, uint8_t ring, uint8_t sys, uint8_t type, uint8_t gran, uint8_t bits); #endif diff --git a/arch/x86/include/arch/x86/feature.h b/arch/x86/include/arch/x86/feature.h index bba5db4b..a743a634 100644 --- a/arch/x86/include/arch/x86/feature.h +++ b/arch/x86/include/arch/x86/feature.h @@ -24,7 +24,6 @@ #pragma once #include -#include #include #include #include @@ -183,71 +182,171 @@ static inline bool x86_feature_test(struct x86_cpuid_bit bit) { /* add feature bits to test here */ /* format: X86_CPUID_BIT(cpuid leaf, register (eax-edx:0-3), bit) */ #define X86_FEATURE_SSE3 X86_CPUID_BIT(0x1, 2, 0) +#define X86_FEATURE_PCLMULQDQ X86_CPUID_BIT(0x1, 2, 1) +#define X86_FEATURE_DTES64 X86_CPUID_BIT(0x1, 2, 2) #define X86_FEATURE_MON X86_CPUID_BIT(0x1, 2, 3) +#define X86_FEATURE_DSCPL X86_CPUID_BIT(0x1, 2, 4) #define X86_FEATURE_VMX X86_CPUID_BIT(0x1, 2, 5) +#define X86_FEATURE_SMX X86_CPUID_BIT(0x1, 2, 6) +#define X86_FEATURE_EIST X86_CPUID_BIT(0x1, 2, 7) #define X86_FEATURE_TM2 X86_CPUID_BIT(0x1, 2, 8) #define X86_FEATURE_SSSE3 X86_CPUID_BIT(0x1, 2, 9) +#define X86_FEATURE_CNXT_ID X86_CPUID_BIT(0x1, 2, 10) +#define X86_FEATURE_SDBG X86_CPUID_BIT(0x1, 2, 11) +#define X86_FEATURE_FMA X86_CPUID_BIT(0x1, 2, 12) +#define X86_FEATURE_CMPXCHG16B X86_CPUID_BIT(0x1, 2, 13) +#define X86_FEATURE_XTPR X86_CPUID_BIT(0x1, 2, 14) #define X86_FEATURE_PDCM X86_CPUID_BIT(0x1, 2, 15) #define 
X86_FEATURE_PCID X86_CPUID_BIT(0x1, 2, 17) +#define X86_FEATURE_DCA X86_CPUID_BIT(0x1, 2, 18) #define X86_FEATURE_SSE4_1 X86_CPUID_BIT(0x1, 2, 19) #define X86_FEATURE_SSE4_2 X86_CPUID_BIT(0x1, 2, 20) #define X86_FEATURE_X2APIC X86_CPUID_BIT(0x1, 2, 21) +#define X86_FEATURE_MOVBE X86_CPUID_BIT(0x1, 2, 22) +#define X86_FEATURE_POPCNT X86_CPUID_BIT(0x1, 2, 23) #define X86_FEATURE_TSC_DEADLINE X86_CPUID_BIT(0x1, 2, 24) #define X86_FEATURE_AESNI X86_CPUID_BIT(0x1, 2, 25) #define X86_FEATURE_XSAVE X86_CPUID_BIT(0x1, 2, 26) +#define X86_FEATURE_OSXSAVE X86_CPUID_BIT(0x1, 2, 27) #define X86_FEATURE_AVX X86_CPUID_BIT(0x1, 2, 28) #define X86_FEATURE_RDRAND X86_CPUID_BIT(0x1, 2, 30) #define X86_FEATURE_HYPERVISOR X86_CPUID_BIT(0x1, 2, 31) #define X86_FEATURE_FPU X86_CPUID_BIT(0x1, 3, 0) +#define X86_FEATURE_VM86 X86_CPUID_BIT(0x1, 3, 1) +#define X86_FEATURE_DE X86_CPUID_BIT(0x1, 3, 2) #define X86_FEATURE_PSE X86_CPUID_BIT(0x1, 3, 3) +#define X86_FEATURE_TSC X86_CPUID_BIT(0x1, 3, 4) +#define X86_FEATURE_MSR X86_CPUID_BIT(0x1, 3, 5) #define X86_FEATURE_PAE X86_CPUID_BIT(0x1, 3, 6) +#define X86_FEATURE_MCE X86_CPUID_BIT(0x1, 3, 7) +#define X86_FEATURE_CX8 X86_CPUID_BIT(0x1, 3, 8) #define X86_FEATURE_APIC X86_CPUID_BIT(0x1, 3, 9) #define X86_FEATURE_SEP X86_CPUID_BIT(0x1, 3, 11) +#define X86_FEATURE_MTRR X86_CPUID_BIT(0x1, 3, 12) #define X86_FEATURE_PGE X86_CPUID_BIT(0x1, 3, 13) +#define X86_FEATURE_MCA X86_CPUID_BIT(0x1, 3, 14) +#define X86_FEATURE_CMOV X86_CPUID_BIT(0x1, 3, 15) #define X86_FEATURE_PAT X86_CPUID_BIT(0x1, 3, 16) #define X86_FEATURE_PSE36 X86_CPUID_BIT(0x1, 3, 17) +#define X86_FEATURE_PSN X86_CPUID_BIT(0x1, 3, 18) #define X86_FEATURE_CLFLUSH X86_CPUID_BIT(0x1, 3, 19) +#define X86_FEATURE_DS X86_CPUID_BIT(0x1, 3, 21) #define X86_FEATURE_ACPI X86_CPUID_BIT(0x1, 3, 22) #define X86_FEATURE_MMX X86_CPUID_BIT(0x1, 3, 23) #define X86_FEATURE_FXSR X86_CPUID_BIT(0x1, 3, 24) #define X86_FEATURE_SSE X86_CPUID_BIT(0x1, 3, 25) #define X86_FEATURE_SSE2 X86_CPUID_BIT(0x1, 3, 26) +#define X86_FEATURE_SS X86_CPUID_BIT(0x1, 3, 27) +#define X86_FEATURE_HTT X86_CPUID_BIT(0x1, 3, 28) #define X86_FEATURE_TM X86_CPUID_BIT(0x1, 3, 29) +#define X86_FEATURE_PBE X86_CPUID_BIT(0x1, 3, 31) + #define X86_FEATURE_DTS X86_CPUID_BIT(0x6, 0, 0) #define X86_FEATURE_TURBO X86_CPUID_BIT(0x6, 0, 1) +#define X86_FEATURE_ARAT X86_CPUID_BIT(0x6, 0, 2) #define X86_FEATURE_PLN X86_CPUID_BIT(0x6, 0, 4) +#define X86_FEATURE_ECMD X86_CPUID_BIT(0x6, 0, 5) #define X86_FEATURE_PTM X86_CPUID_BIT(0x6, 0, 6) #define X86_FEATURE_HWP X86_CPUID_BIT(0x6, 0, 7) #define X86_FEATURE_HWP_NOT X86_CPUID_BIT(0x6, 0, 8) #define X86_FEATURE_HWP_ACT X86_CPUID_BIT(0x6, 0, 9) #define X86_FEATURE_HWP_PREF X86_CPUID_BIT(0x6, 0, 10) +#define X86_FEATURE_HWP_EPP X86_CPUID_BIT(0x6, 0, 11) +#define X86_FEATURE_HWP_PKG X86_CPUID_BIT(0x6, 0, 12) +#define X86_FEATURE_HDC X86_CPUID_BIT(0x6, 0, 13) #define X86_FEATURE_TURBO_MAX X86_CPUID_BIT(0x6, 0, 14) +#define X86_FEATURE_HWP_CAP X86_CPUID_BIT(0x6, 0, 15) +#define X86_FEATURE_HWP_PECI X86_CPUID_BIT(0x6, 0, 16) +#define X86_FEATURE_HWP_FLEX X86_CPUID_BIT(0x6, 0, 17) +#define X86_FEATURE_HWP_FAST X86_CPUID_BIT(0x6, 0, 18) #define X86_FEATURE_HW_FEEDBACK X86_CPUID_BIT(0x6, 2, 0) #define X86_FEATURE_PERF_BIAS X86_CPUID_BIT(0x6, 2, 3) + #define X86_FEATURE_FSGSBASE X86_CPUID_BIT(0x7, 1, 0) #define X86_FEATURE_TSC_ADJUST X86_CPUID_BIT(0x7, 1, 1) +#define X86_FEATURE_SGX X86_CPUID_BIT(0x7, 1, 2) +#define X86_FEATURE_BMI1 X86_CPUID_BIT(0x7, 1, 3) +#define X86_FEATURE_HLE X86_CPUID_BIT(0x7, 1, 4) #define X86_FEATURE_AVX2 
X86_CPUID_BIT(0x7, 1, 5) #define X86_FEATURE_SMEP X86_CPUID_BIT(0x7, 1, 7) +#define X86_FEATURE_BMI2 X86_CPUID_BIT(0x7, 1, 8) #define X86_FEATURE_ERMS X86_CPUID_BIT(0x7, 1, 9) #define X86_FEATURE_INVPCID X86_CPUID_BIT(0x7, 1, 10) +#define X86_FEATURE_RTM X86_CPUID_BIT(0x7, 1, 11) +#define X86_FEATURE_MPX X86_CPUID_BIT(0x7, 1, 14) +#define X86_FEATURE_AVX512F X86_CPUID_BIT(0x7, 1, 16) +#define X86_FEATURE_AVX512DQ X86_CPUID_BIT(0x7, 1, 17) #define X86_FEATURE_RDSEED X86_CPUID_BIT(0x7, 1, 18) +#define X86_FEATURE_ADX X86_CPUID_BIT(0x7, 1, 19) #define X86_FEATURE_SMAP X86_CPUID_BIT(0x7, 1, 20) +#define X86_FEATURE_AVX512IFMA X86_CPUID_BIT(0x7, 1, 21) #define X86_FEATURE_CLFLUSHOPT X86_CPUID_BIT(0x7, 1, 23) #define X86_FEATURE_CLWB X86_CPUID_BIT(0x7, 1, 24) #define X86_FEATURE_PT X86_CPUID_BIT(0x7, 1, 25) +#define X86_FEATURE_AVX512PF X86_CPUID_BIT(0x7, 1, 26) +#define X86_FEATURE_AVX512ER X86_CPUID_BIT(0x7, 1, 27) +#define X86_FEATURE_AVX512CD X86_CPUID_BIT(0x7, 1, 28) +#define X86_FEATURE_SHA X86_CPUID_BIT(0x7, 1, 29) +#define X86_FEATURE_AVX512BW X86_CPUID_BIT(0x7, 1, 30) +#define X86_FEATURE_AVX512VL X86_CPUID_BIT(0x7, 1, 31) +#define X86_FEATURE_PREFETCHWT1 X86_CPUID_BIT(0x7, 2, 0) +#define X86_FEATURE_AVX512VBMI X86_CPUID_BIT(0x7, 2, 1) #define X86_FEATURE_UMIP X86_CPUID_BIT(0x7, 2, 2) #define X86_FEATURE_PKU X86_CPUID_BIT(0x7, 2, 3) +#define X86_FEATURE_OSPKE X86_CPUID_BIT(0x7, 2, 4) +#define X86_FEATURE_WAITPKG X86_CPUID_BIT(0x7, 2, 5) +#define X86_FEATURE_AVX512_VBMI2 X86_CPUID_BIT(0x7, 2, 6) +#define X86_FEATURE_CET_SS X86_CPUID_BIT(0x7, 2, 7) +#define X86_FEATURE_GFNI X86_CPUID_BIT(0x7, 2, 8) +#define X86_FEATURE_VAES X86_CPUID_BIT(0x7, 2, 9) +#define X86_FEATURE_VPCLMULQDQ X86_CPUID_BIT(0x7, 2, 10) +#define X86_FEATURE_AVX512_VNNI X86_CPUID_BIT(0x7, 2, 11) +#define X86_FEATURE_AVX512_BITALG X86_CPUID_BIT(0x7, 2, 12) +#define X86_FEATURE_TME_EN X86_CPUID_BIT(0x7, 2, 13) +#define X86_FEATURE_AVX512_VPOPCNTDQ X86_CPUID_BIT(0x7, 2, 14) +#define X86_FEATURE_LA57 X86_CPUID_BIT(0x7, 2, 16) +#define X86_FEATURE_RDPID X86_CPUID_BIT(0x7, 2, 22) +#define X86_FEATURE_KL X86_CPUID_BIT(0x7, 2, 23) +#define X86_FEATURE_CLDEMOTE X86_CPUID_BIT(0x7, 2, 25) +#define X86_FEATURE_MOVDIRI X86_CPUID_BIT(0x7, 2, 27) +#define X86_FEATURE_MOVDIR64B X86_CPUID_BIT(0x7, 2, 28) +#define X86_FEATURE_SGX_LC X86_CPUID_BIT(0x7, 2, 30) +#define X86_FEATURE_PKS X86_CPUID_BIT(0x7, 2, 31) +#define X86_FEATURE_AVX512_4VNNIW X86_CPUID_BIT(0x7, 3, 2) +#define X86_FEATURE_AVX512_4FMAPS X86_CPUID_BIT(0x7, 3, 3) +#define X86_FEATURE_FSRM X86_CPUID_BIT(0x7, 3, 4) +#define X86_FEATURE_AVX512_VP2INTERSECT X86_CPUID_BIT(0x7, 3, 8) #define X86_FEATURE_MD_CLEAR X86_CPUID_BIT(0x7, 3, 10) +#define X86_FEATURE_SERIALIZE X86_CPUID_BIT(0x7, 3, 14) +#define X86_FEATURE_HYBRID X86_CPUID_BIT(0x7, 3, 15) +#define X86_FEATURE_PCONFIG X86_CPUID_BIT(0x7, 3, 18) +#define X86_FEATURE_CET_IBT X86_CPUID_BIT(0x7, 3, 20) #define X86_FEATURE_IBRS_IBPB X86_CPUID_BIT(0x7, 3, 26) #define X86_FEATURE_STIBP X86_CPUID_BIT(0x7, 3, 27) #define X86_FEATURE_L1D_FLUSH X86_CPUID_BIT(0x7, 3, 28) #define X86_FEATURE_ARCH_CAPABILITIES X86_CPUID_BIT(0x7, 3, 29) +#define X86_FEATURE_CORE_CAPABILITIES X86_CPUID_BIT(0x7, 3, 30) #define X86_FEATURE_SSBD X86_CPUID_BIT(0x7, 3, 31) -#define X86_FEATURE_KVM_PV_CLOCK X86_CPUID_BIT(0x40000001, 0, 3) +#define X86_FEATURE_KVM_CLOCKSOURCE X86_CPUID_BIT(0x40000001, 0, 0) +#define X86_FEATURE_KVM_NOP_IO_DELAY X86_CPUID_BIT(0x40000001, 0, 1) +#define X86_FEATURE_KVM_MMU_OP X86_CPUID_BIT(0x40000001, 0, 2) +#define 
X86_FEATURE_KVM_CLOCKSOURCE2 X86_CPUID_BIT(0x40000001, 0, 3) +#define X86_FEATURE_KVM_ASYNC_PF X86_CPUID_BIT(0x40000001, 0, 4) +#define X86_FEATURE_KVM_STEAL_TIME X86_CPUID_BIT(0x40000001, 0, 5) #define X86_FEATURE_KVM_PV_EOI X86_CPUID_BIT(0x40000001, 0, 6) +#define X86_FEATURE_KVM_PV_UNHALT X86_CPUID_BIT(0x40000001, 0, 7) +#define X86_FEATURE_KVM_PV_TLB_FLUSH X86_CPUID_BIT(0x40000001, 0, 9) +#define X86_FEATURE_KVM_ASYNC_PF_VMEXIT X86_CPUID_BIT(0x40000001, 0, 10) #define X86_FEATURE_KVM_PV_IPI X86_CPUID_BIT(0x40000001, 0, 11) -#define X86_FEATURE_KVM_PV_CLOCK_STABLE X86_CPUID_BIT(0x40000001, 0, 24) +#define X86_FEATURE_KVM_POLL_CONTROL X86_CPUID_BIT(0x40000001, 0, 12) +#define X86_FEATURE_KVM_PV_SCHED_YIELD X86_CPUID_BIT(0x40000001, 0, 13) +#define X86_FEATURE_KVM_ASYNC_PF_INT X86_CPUID_BIT(0x40000001, 0, 14) +#define X86_FEATURE_KVM_MSI_EXT_DEST_ID X86_CPUID_BIT(0x40000001, 0, 15) +#define X86_FEATURE_KVM_HC_MAP_GPA_RANGE X86_CPUID_BIT(0x40000001, 0, 16) +#define X86_FEATURE_KVM_MIGRATION_CONTROL X86_CPUID_BIT(0x40000001, 0, 17) +#define X86_FEATURE_KVM_CLOCKSOURCE_STABLE X86_CPUID_BIT(0x40000001, 0, 24) + #define X86_FEATURE_AMD_TOPO X86_CPUID_BIT(0x80000001, 2, 22) #define X86_FEATURE_SSE4A X86_CPUID_BIT(0x80000001, 3, 6) @@ -256,6 +355,7 @@ static inline bool x86_feature_test(struct x86_cpuid_bit bit) { #define X86_FEATURE_HUGE_PAGE X86_CPUID_BIT(0x80000001, 3, 26) #define X86_FEATURE_RDTSCP X86_CPUID_BIT(0x80000001, 3, 27) #define X86_FEATURE_INVAR_TSC X86_CPUID_BIT(0x80000007, 3, 8) +#define X86_FEATURE_CONSTANT_TSC X86_CPUID_BIT(0x80000007, 3, 8) // accessor to read some fields out of a register static inline uint32_t x86_get_vaddr_width(void) { diff --git a/arch/x86/include/arch/x86/lapic.h b/arch/x86/include/arch/x86/lapic.h new file mode 100644 index 00000000..2fa421af --- /dev/null +++ b/arch/x86/include/arch/x86/lapic.h @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2025 Travis Geiselbrecht + * + * Use of this source code is governed by a MIT-style + * license that can be found in the LICENSE file or at + * https://opensource.org/licenses/MIT + */ + +#pragma once + +#include +#include +#include +#include + +// local apic +void lapic_init(void); +status_t lapic_timer_init(bool invariant_tsc_supported); +void lapic_eoi(unsigned int vector); +void lapic_send_init_ipi(uint32_t apic_id, bool level); +void lapic_send_startup_ipi(uint32_t apic_id, uint32_t startup_vector); +void lapic_send_ipi(uint32_t apic_id, mp_ipi_t ipi); + +status_t lapic_set_oneshot_timer(platform_timer_callback callback, void *arg, lk_time_t interval); +void lapic_cancel_timer(void); + diff --git a/arch/x86/include/arch/x86/mmu.h b/arch/x86/include/arch/x86/mmu.h index 6b483f81..d1fddbc8 100644 --- a/arch/x86/include/arch/x86/mmu.h +++ b/arch/x86/include/arch/x86/mmu.h @@ -117,6 +117,7 @@ typedef uint32_t arch_flags_t; void x86_mmu_early_init(void); void x86_mmu_init(void); +void x86_mmu_early_init_percpu(void); __END_CDECLS diff --git a/arch/x86/include/arch/x86/mp.h b/arch/x86/include/arch/x86/mp.h new file mode 100644 index 00000000..8b4c6e0b --- /dev/null +++ b/arch/x86/include/arch/x86/mp.h @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2024 Travis Geiselbrecht + * + * Use of this source code is governed by a MIT-style + * license that can be found in the LICENSE file or at + * https://opensource.org/licenses/MIT + */ +#pragma once + +#include +#include + +// per cpu pointer pointed to by gs: +typedef struct x86_percpu { + // pointer back to ourselves so we can get a raw pointer via segment:0 + struct x86_percpu *self; + 
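+    // (the self pointer exists because the gs base itself is awkward to
+    // recover: an MSR read on 64-bit, a GDT lookup on 32-bit, as set up in
+    // x86_configure_percpu_early(). Loading segment:0 yields the struct's
+    // linear address directly, which x86_get_percpu() below relies on.)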
+ uint cpu_num; + uint32_t apic_id; + + struct thread *current_thread; + + // per cpu bootstrap stack + uint8_t bootstrap_stack[PAGE_SIZE] __ALIGNED(sizeof(uintptr_t) * 2); + + // XXX add more stuff: + // per cpu TSS + // per cpu doublefault/nmi stacks +} x86_percpu_t; + +#define X86_PERCPU_FIELD_OFFSET(field) offsetof(x86_percpu_t, field) + +// called extremely early on the boot cpu and each secondary cpu to set +// up the percpu struct and segment descriptors pointing to it +void x86_configure_percpu_early(uint cpu_num, uint apic_id); + +// C entry point for secondary cpus +__NO_RETURN void x86_secondary_entry(uint cpu_num); + +// allocate and initialize secondary cpu percpu structs +status_t x86_allocate_percpu_array(uint num_cpus); + +// get the percpu struct for the current cpu +static inline x86_percpu_t *x86_get_percpu(void) { + x86_percpu_t *percpu; + __asm__ volatile("mov %%gs:0, %0" : "=r" (percpu)); + return percpu; +} + +// get the percpu struct for a specific cpu +x86_percpu_t *x86_get_percpu_for_cpu(uint cpu_num); + +#if 0 +#define X86_PERCPU_GET(field) (_Generic(((x86_get_percpu())->field), \ + uint32_t: x86_read_gs_offset32, \ + uint64_t: x86_read_gs_offset64, \ + struct thread*: x86_read_gs_offset_ptr) \ + (X86_PERCPU_FIELD_OFFSET(field))) + +#define X86_PERCPU_SET(field, value) (_Generic(((x86_get_percpu())->field), \ + uint32_t: x86_write_gs_offset32, \ + uint64_t: x86_write_gs_offset64, \ + struct thread*: x86_write_gs_offset_ptr) \ + (X86_PERCPU_FIELD_OFFSET(field), value)) +#endif + +// get the current cpu number +static inline uint x86_get_cpu_num(void) { + return x86_read_gs_offset32(X86_PERCPU_FIELD_OFFSET(cpu_num)); +} + +// get the current apic id +static inline uint32_t x86_get_apic_id(void) { + return x86_read_gs_offset32(X86_PERCPU_FIELD_OFFSET(apic_id)); +} + +// read it from hardware directly +uint32_t x86_get_apic_id_from_hardware(void); + +// get/set the current thread +struct thread; + +static inline struct thread *x86_get_current_thread(void) { + return (struct thread *)x86_read_gs_offset_ptr(X86_PERCPU_FIELD_OFFSET(current_thread)); +} + +static inline void x86_set_current_thread(struct thread *t) { + x86_write_gs_offset_ptr(X86_PERCPU_FIELD_OFFSET(current_thread), t); +} diff --git a/arch/x86/include/arch/x86/pv.h b/arch/x86/include/arch/x86/pv.h new file mode 100644 index 00000000..b6ef8c4c --- /dev/null +++ b/arch/x86/include/arch/x86/pv.h @@ -0,0 +1,19 @@ +/* + * Copyright (c) 2025 Travis Geiselbrecht + * + * Use of this source code is governed by a MIT-style + * license that can be found in the LICENSE file or at + * https://opensource.org/licenses/MIT + */ +#pragma once + +#include +#include + +#if !X86_LEGACY + +status_t pvclock_init(void); +uint64_t pvclock_get_tsc_freq(void); +bool pv_clock_is_stable(void); + +#endif // !X86_LEGACY diff --git a/arch/x86/lapic.c b/arch/x86/lapic.c new file mode 100644 index 00000000..4b997e65 --- /dev/null +++ b/arch/x86/lapic.c @@ -0,0 +1,397 @@ +/* + * Copyright (c) 2021 Travis Geiselbrecht + * + * Use of this source code is governed by a MIT-style + * license that can be found in the LICENSE file or at + * https://opensource.org/licenses/MIT + */ +#include "arch/x86/lapic.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define LOCAL_TRACE 0 + +static bool lapic_present = false; +static bool lapic_x2apic = false; +static bool use_tsc_deadline = false; +static 
volatile uint32_t *lapic_mmio; +static struct fp_32_64 timebase_to_lapic; + +// TODO: move these callbacks into the shared timer code +static platform_timer_callback t_callback; +static void *callback_arg; + +static void lapic_init_percpu(uint level); + +// local apic registers +enum lapic_regs { + LAPIC_ID = 0x20, + LAPIC_VERSION = 0x30, + LAPIC_TPR = 0x80, + LAPIC_APR = 0x90, + LAPIC_PPR = 0xa0, + LAPIC_EOI = 0xb0, + LAPIC_RRD = 0xc0, + LAPIC_LDR = 0xd0, + LAPIC_DFR = 0xe0, + LAPIC_SVR = 0xf0, + LAPIC_ISR0 = 0x100, + + LAPIC_TMR0 = 0x180, + + LAPIC_IRR0 = 0x200, + + LAPIC_ESR = 0x280, + + LAPIC_CMCI = 0x2f0, + LAPIC_ICRLO = 0x300, + LAPIC_ICRHI = 0x310, + LAPIC_TIMER = 0x320, + LAPIC_THERMAL = 0x330, + LAPIC_PERF = 0x340, + LAPIC_LINT0 = 0x350, + LAPIC_LINT1 = 0x360, + LAPIC_ERROR = 0x370, + LAPIC_TICR = 0x380, + LAPIC_TCCR = 0x390, + LAPIC_DIV = 0x3e0, + + // Extended features + LAPIC_EXT_FEATURES = 0x400, + LAPIC_EXT_CONTROL = 0x410, + LAPIC_EXT_SEOI = 0x420, + LAPIC_EXT_IER0 = 0x480, + LAPIC_EXT_LVT0 = 0x500, +}; + +enum lapic_interrupts { + LAPIC_INT_TIMER = 0xf8, + LAPIC_INT_GENERIC, + LAPIC_INT_RESCHEDULE, + + LAPIC_INT_SPURIOUS = 0xff, // Bits 0-3 must be 1 for P6 and below compatibility +}; + +enum lapic_timer_mode { + LAPIC_TIMER_MODE_ONESHOT = 0, + LAPIC_TIMER_MODE_PERIODIC = 1, + LAPIC_TIMER_MODE_TSC_DEADLINE = 2, +}; + +static uint32_t lapic_read(enum lapic_regs reg) { + LTRACEF_LEVEL(2, "reg %#x\n", reg); + if (lapic_x2apic) { + // TODO: do we need barriers here? + DEBUG_ASSERT(reg != LAPIC_ICRLO && reg != LAPIC_ICRHI); + return read_msr(X86_MSR_IA32_X2APIC_BASE + reg / 0x10); + } else { + return mmio_read32(lapic_mmio + reg / 4); + } +} + +static void lapic_write(enum lapic_regs reg, uint32_t val) { + LTRACEF_LEVEL(2, "reg %#x val %#x\n", reg, val); + if (lapic_x2apic) { + DEBUG_ASSERT(reg != LAPIC_ICRLO && reg != LAPIC_ICRHI); + write_msr(X86_MSR_IA32_X2APIC_BASE + reg / 0x10, val); + } else { + mmio_write32(lapic_mmio + reg / 4, val); + } +} + +static void lapic_wait_for_icr_delivery(void) { + LTRACEF_LEVEL(2, "waiting for icr\n"); + uint32_t val; + do { + if (lapic_x2apic) { + val = read_msr(X86_MSR_IA32_X2APIC_BASE + 0x30); + } else { + val = lapic_read(LAPIC_ICRLO); + } + } while (val & (1u << 12)); +} + +// special case to write to the ICR register +static void lapic_write_icr(uint32_t low, uint32_t apic_id) { + LTRACEF_LEVEL(2, "%#x apic_id %#x\n", low, apic_id); + if (lapic_x2apic) { + write_msr(X86_MSR_IA32_X2APIC_BASE + 0x30, ((uint64_t)apic_id << 32) | low); + } else { + lapic_write(LAPIC_ICRHI, apic_id << 24); + lapic_write(LAPIC_ICRLO, low); + lapic_wait_for_icr_delivery(); + } +} + +status_t lapic_set_oneshot_timer(platform_timer_callback callback, void *arg, lk_time_t interval) { + LTRACEF("cpu %u interval %u\n", arch_curr_cpu_num(), interval); + + DEBUG_ASSERT(arch_ints_disabled()); + + t_callback = callback; + callback_arg = arg; + + if (use_tsc_deadline) { + uint64_t now = __builtin_ia32_rdtsc(); + uint64_t delta = time_to_tsc_ticks(interval); + uint64_t deadline = now + delta; + LTRACEF("now %llu delta %llu deadline %llu\n", now, delta, deadline); + write_msr(X86_MSR_IA32_TSC_DEADLINE, deadline); + } else { + // set the initial count, which should trigger the timer + uint64_t ticks = u64_mul_u32_fp32_64(interval, timebase_to_lapic); + if (ticks > UINT32_MAX) { + ticks = UINT32_MAX; + } + + lapic_write(LAPIC_TICR, ticks & 0xffffffff); + } + + return NO_ERROR; +} + +void lapic_cancel_timer(void) { + LTRACE; + + DEBUG_ASSERT(arch_ints_disabled()); + + if 
(use_tsc_deadline) { + write_msr(X86_MSR_IA32_TSC_DEADLINE, 0); + } else { + lapic_write(LAPIC_TICR, 0); + } +} + +static enum handler_return lapic_timer_handler(void *arg) { + LTRACEF("cpu %u\n", arch_curr_cpu_num()); + + enum handler_return ret = INT_NO_RESCHEDULE; + if (t_callback) { + ret = t_callback(callback_arg, current_time()); + } + + return ret; +} + +static enum handler_return lapic_spurious_handler(void *arg) { + LTRACEF("cpu %u, arg %p\n", arch_curr_cpu_num(), arg); + + return INT_NO_RESCHEDULE; +} + +static enum handler_return lapic_generic_handler(void *arg) { + LTRACEF("cpu %u, arg %p\n", arch_curr_cpu_num(), arg); + + return INT_NO_RESCHEDULE; +} + +static enum handler_return lapic_reschedule_handler(void *arg) { + LTRACEF("cpu %u, arg %p\n", arch_curr_cpu_num(), arg); + + return mp_mbx_reschedule_irq(); +} + +void lapic_init(void) { + lapic_present = x86_feature_test(X86_FEATURE_APIC); +} + +static void lapic_init_postvm(uint level) { + if (!lapic_present) { + return; + } + + dprintf(INFO, "X86: local apic detected\n"); + + // IA32_APIC_BASE_MSR + uint64_t apic_base = read_msr(X86_MSR_IA32_APIC_BASE); + LTRACEF("raw apic base msr %#llx\n", apic_base); + + // make sure it's enabled + if ((apic_base & (1u<<11)) == 0) { + dprintf(INFO, "X86: enabling lapic\n"); + apic_base |= (1u<<11); + write_msr(X86_MSR_IA32_APIC_BASE, apic_base); + } + + dprintf(INFO, "X86: lapic physical address %#llx\n", apic_base & ~0xfff); + + // see if x2APIC mode is supported and enable + if (x86_feature_test(X86_FEATURE_X2APIC)) { + lapic_x2apic = true; + dprintf(INFO, "X86: local apic supports x2APIC mode\n"); + + write_msr(X86_MSR_IA32_APIC_BASE, apic_base | (1u<<10)); + } + + // map the lapic into the kernel since it's not guaranteed that the physmap covers it + if (!lapic_mmio) { + LTRACEF("mapping lapic into kernel\n"); + status_t err = vmm_alloc_physical(vmm_get_kernel_aspace(), "lapic", PAGE_SIZE, (void **)&lapic_mmio, 0, + apic_base & ~0xfff, /* vmm_flags */ 0, ARCH_MMU_FLAG_UNCACHED_DEVICE); + ASSERT(err == NO_ERROR); + } + + // Read the local apic id and version and features + uint32_t id = lapic_read(LAPIC_ID); + uint32_t version = lapic_read(LAPIC_VERSION); + bool eas = version & (1u<<31); + uint32_t max_lvt = (version >> 16) & 0xff; + version &= 0xff; + dprintf(INFO, "X86: local apic id %#x version %#x\n", id, version); + dprintf(INFO, "X86: local apic max lvt entries %u\n", max_lvt); + if (eas) { + dprintf(INFO, "X86: local apic EAS features %#x\n", lapic_read(LAPIC_EXT_FEATURES)); + } + + // Finish up some local initialization that all cpus will want to do + lapic_init_percpu(0); +} +LK_INIT_HOOK(lapic_init_postvm, lapic_init_postvm, LK_INIT_LEVEL_VM + 1); + +static void lapic_init_percpu(uint level) { + // Make sure the apic is enabled and x2apic mode is set (if supported) + uint64_t apic_base = read_msr(X86_MSR_IA32_APIC_BASE); + apic_base |= (1u<<11); + if (lapic_x2apic) { + apic_base |= (1u<<10); + } + write_msr(X86_MSR_IA32_APIC_BASE, apic_base); + + // set the spurious vector register + uint32_t svr = (LAPIC_INT_SPURIOUS | (1u<<8)); // enable + lapic_write(LAPIC_SVR, svr); + + LTRACEF("lapic svr %#x\n", lapic_read(LAPIC_SVR)); + + register_int_handler_msi(LAPIC_INT_SPURIOUS, &lapic_spurious_handler, NULL, false); + register_int_handler_msi(LAPIC_INT_GENERIC, &lapic_generic_handler, NULL, false); + register_int_handler_msi(LAPIC_INT_RESCHEDULE, &lapic_reschedule_handler, NULL, false); +} + +LK_INIT_HOOK_FLAGS(lapic_init_percpu, lapic_init_percpu, LK_INIT_LEVEL_VM, 
LK_INIT_FLAG_SECONDARY_CPUS); + +static uint32_t lapic_read_current_tick(void) { + if (!lapic_present) { + return 0; + } + + return lapic_read(LAPIC_TCCR); +} + +static void lapic_timer_init_percpu(uint level) { + // check for deadline mode + if (use_tsc_deadline) { + // put the timer in TSC deadline and clear the match register + uint32_t val = (LAPIC_TIMER_MODE_TSC_DEADLINE << 17) | LAPIC_INT_TIMER; + lapic_write(LAPIC_TIMER, val); + write_msr(X86_MSR_IA32_TSC_DEADLINE, 0); + } else { + // configure the local timer and make sure it is not set to fire + uint32_t val = (LAPIC_TIMER_MODE_ONESHOT << 17) | LAPIC_INT_TIMER; + lapic_write(LAPIC_TIMER, val); + lapic_write(LAPIC_TICR, 0); + } + + // register the timer interrupt vector + register_int_handler_msi(LAPIC_INT_TIMER, &lapic_timer_handler, NULL, false); +} +LK_INIT_HOOK_FLAGS(lapic_timer_init_percpu, lapic_timer_init_percpu, LK_INIT_LEVEL_VM + 1, LK_INIT_FLAG_SECONDARY_CPUS); + +status_t lapic_timer_init(bool invariant_tsc_supported) { + if (!lapic_present) { + return ERR_NOT_FOUND; + } + + // check for deadline mode + bool tsc_deadline = x86_feature_test(X86_FEATURE_TSC_DEADLINE); + if (invariant_tsc_supported && tsc_deadline) { + dprintf(INFO, "X86: local apic timer supports TSC deadline mode\n"); + use_tsc_deadline = true; + } else { + // configure the local timer and make sure it is not set to fire + uint32_t val = (LAPIC_TIMER_MODE_ONESHOT << 17) | LAPIC_INT_TIMER; + lapic_write(LAPIC_TIMER, val); + + // calibrate the timer frequency + lapic_write(LAPIC_TICR, 0xffffffff); // countdown from the max count + uint32_t lapic_hz = pit_calibrate_lapic(&lapic_read_current_tick); + lapic_write(LAPIC_TICR, 0); + printf("X86: local apic timer frequency %uHz\n", lapic_hz); + + fp_32_64_div_32_32(&timebase_to_lapic, lapic_hz, 1000); + dprintf(INFO, "X86: timebase to local apic timer ratio %u.%08u...\n", + timebase_to_lapic.l0, timebase_to_lapic.l32); + } + + lapic_timer_init_percpu(0); + + return NO_ERROR; +} + +void lapic_eoi(unsigned int vector) { + LTRACEF("vector %#x\n", vector); + if (!lapic_present) { + return; + } + + lapic_write(LAPIC_EOI, 0); +} + +void lapic_send_init_ipi(uint32_t apic_id, bool level) { + if (!lapic_present) { + return; + } + + // Level triggered mode, level according to arg, INIT delivery mode, no shorthand + lapic_write_icr((1u << 15) | (level ? 
(1u << 14) : 0) | (5u << 8), apic_id); +} + +void lapic_send_startup_ipi(uint32_t apic_id, uint32_t startup_vector) { + if (!lapic_present) { + return; + } + + // Startup IPI, no shorthand + lapic_write_icr((6u << 8) | (startup_vector >> 12), apic_id); +} + +void lapic_send_ipi(uint32_t apic_id, mp_ipi_t ipi) { + if (!lapic_present) { + return; + } + + LTRACEF("cpu %u target apic_id %#x, ipi %u\n", arch_curr_cpu_num(), apic_id, ipi); + + uint32_t vector; + switch (ipi) { + case MP_IPI_GENERIC: + vector = LAPIC_INT_GENERIC; + break; + case MP_IPI_RESCHEDULE: + vector = LAPIC_INT_RESCHEDULE; + break; + default: + panic("X86: unknown IPI %u\n", ipi); + } + + // send fixed mode, level asserted, no destination shorthand interrupt + lapic_write_icr(vector | (1U << 14), apic_id); +} \ No newline at end of file diff --git a/arch/x86/mp.c b/arch/x86/mp.c new file mode 100644 index 00000000..c45cd592 --- /dev/null +++ b/arch/x86/mp.c @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2024 Travis Geiselbrecht + * + * Use of this source code is governed by a MIT-style + * license that can be found in the LICENSE file or at + * https://opensource.org/licenses/MIT + */ +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define LOCAL_TRACE 0 + +#if WITH_SMP + +// the boot cpu's percpu struct +static x86_percpu_t x86_boot_percpu; +// pointer to an array of percpu structs for each of the secondary cpus +static x86_percpu_t *x86_ap_percpus; + +x86_percpu_t *x86_get_percpu_for_cpu(uint cpu_num) { + DEBUG_ASSERT(cpu_num < SMP_MAX_CPUS); + if (cpu_num == 0) { + return &x86_boot_percpu; + } + DEBUG_ASSERT(x86_ap_percpus); + return &x86_ap_percpus[cpu_num - 1]; +} + +void x86_configure_percpu_early(uint cpu_num, uint apic_id) { + x86_percpu_t *percpu = x86_get_percpu_for_cpu(cpu_num); + + // initialize the percpu structure for this cpu + percpu->self = percpu; + percpu->cpu_num = cpu_num; + percpu->apic_id = apic_id; + +#if ARCH_X86_64 + // use the 64-bit gs base msr to set up a pointer to the percpu struct + write_msr(X86_MSR_IA32_KERNEL_GS_BASE, 0); + write_msr(X86_MSR_IA32_GS_BASE, (uint64_t)percpu); +#else + // set up a gs descriptor for this cpu + uint16_t selector = PERCPU_SELECTOR_BASE + cpu_num * 8; + x86_set_gdt_descriptor(selector, percpu, sizeof(*percpu), 1, 0, 1, SEG_TYPE_DATA_RW, 0, 1); + x86_set_gs(selector); +#endif +} + +status_t arch_mp_send_ipi(mp_cpu_mask_t target, mp_ipi_t ipi) { + LTRACEF("cpu %u target 0x%x, ipi 0x%x\n", arch_curr_cpu_num(), target, ipi); + + DEBUG_ASSERT(arch_ints_disabled()); + uint curr_cpu_num = arch_curr_cpu_num(); + + // translate the target bitmap to apic id + while (target) { + uint cpu_num = __builtin_ctz(target); + target &= ~(1u << cpu_num); + + // skip the current cpu + if (cpu_num == curr_cpu_num) { + continue; + } + + x86_percpu_t *percpu = x86_get_percpu_for_cpu(cpu_num); + uint32_t apic_id = percpu->apic_id; + + // send the ipi to the target cpu + lapic_send_ipi(apic_id, ipi); + } + + return NO_ERROR; +} + +void arch_mp_init_percpu(void) {} + +uint32_t x86_get_apic_id_from_hardware(void) { + // read the apic id out of cpuid leaf 1, which should be present if SMP is enabled. 
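+    // cpuid leaf 1 returns the initial apic id in EBX bits 31:24, hence the
+    // shift by 24 below. This truncates the id to 8 bits, so systems with
+    // apic ids of 255 or greater need the x2apic id from the TODO below.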
+ uint32_t apic_id, unused; + cpuid(0x1, &unused, &apic_id, &unused, &unused); + + apic_id >>= 24; + + // TODO: read full 32bit apic id from x2apic msr if available + + return apic_id; +} + +void x86_secondary_entry(uint cpu_num) { + uint32_t apic_id = x86_get_apic_id_from_hardware(); + x86_configure_percpu_early(cpu_num, apic_id); + + x86_early_init_percpu(); + + // run early secondary cpu init routines up to the threading level + lk_init_level(LK_INIT_FLAG_SECONDARY_CPUS, LK_INIT_LEVEL_EARLIEST, LK_INIT_LEVEL_THREADING - 1); + + dprintf(INFO, "SMP: secondary cpu %u started, apic id %u\n", arch_curr_cpu_num(), apic_id); + + lk_secondary_cpu_entry(); + + // should never get here except for an error condition + for (;;); +} + +status_t x86_allocate_percpu_array(uint num_cpus) { + x86_ap_percpus = memalign(_Alignof(x86_percpu_t), num_cpus * sizeof(x86_percpu_t)); + if (!x86_ap_percpus) { + return ERR_NO_MEMORY; + } + + memset(x86_ap_percpus, 0, num_cpus * sizeof(x86_percpu_t)); + return NO_ERROR; +} + +#else + +void x86_configure_percpu_early(uint cpu_num, uint apic_id) {} + +#endif \ No newline at end of file diff --git a/arch/x86/pv.c b/arch/x86/pv.c new file mode 100644 index 00000000..65e669d2 --- /dev/null +++ b/arch/x86/pv.c @@ -0,0 +1,129 @@ +/* + * Copyright (c) 2025 Travis Geiselbrecht + * + * Use of this source code is governed by a MIT-style + * license that can be found in the LICENSE file or at + * https://opensource.org/licenses/MIT + */ +#include "arch/x86/pv.h" + +#include +#include +#include +#include +#include +#include +#include + +#define LOCAL_TRACE 0 + +#if !X86_LEGACY + +// Deals with paravirtualized clock sources and event timers on the PC platform, +// specifically KVM. + +// From https://www.kernel.org/doc/html/v6.14/virt/kvm/x86/msr.html +struct pvclock_wall_clock { + uint32_t version; + uint32_t sec; + uint32_t nsec; +} __PACKED; +static_assert(sizeof(struct pvclock_wall_clock) == 12, "pvclock_wall_clock size mismatch"); + +struct pvclock_vcpu_time_info { + uint32_t version; + uint32_t pad0; + uint64_t tsc_timestamp; + uint64_t system_time; + uint32_t tsc_to_system_mul; + int8_t tsc_shift; + uint8_t flags; + uint8_t pad[2]; +} __PACKED; +static_assert(sizeof(struct pvclock_vcpu_time_info) == 32, "pvclock_vcpu_time_info size mismatch"); +#define VCPU_TIME_INFO_FLAG_STABLE 0x1 + +static volatile struct pvclock_wall_clock *wall_clock; +static volatile struct pvclock_vcpu_time_info *vcpu_time_info; + +status_t pvclock_init(void) { + uint32_t clocksource_msr_base = 0; + if (x86_feature_test(X86_FEATURE_KVM_CLOCKSOURCE)) { + clocksource_msr_base = 0x11; + } + if (x86_feature_test(X86_FEATURE_KVM_CLOCKSOURCE2)) { + clocksource_msr_base = 0x4b564d00; + } + if (!clocksource_msr_base) { + return ERR_NOT_SUPPORTED; + } + dprintf(INFO, "pv_clock: clocksource detected, msr base %#x\n", clocksource_msr_base); + + // map a page of memory and point the KVM clocksource msrs at it + void *clocksource_page; + status_t err = vmm_alloc(vmm_get_kernel_aspace(), "lapic", PAGE_SIZE, &clocksource_page, 0, 0, 0); + if (err != NO_ERROR) { + printf("pv_clock: failed to allocate page for clocksource msrs\n"); + return err; + } + + paddr_t paddr; + arch_mmu_query(&vmm_get_kernel_aspace()->arch_aspace, (vaddr_t)clocksource_page, &paddr, NULL); + LTRACEF("clocksource page %p, paddr %#" PRIxPTR "\n", clocksource_page, paddr); + + write_msr(clocksource_msr_base, paddr); + write_msr(clocksource_msr_base + 1, paddr + sizeof(struct pvclock_wall_clock) + 1); + + wall_clock = (struct pvclock_wall_clock 
*)clocksource_page; + vcpu_time_info = (struct pvclock_vcpu_time_info *)(wall_clock + 1); + + dprintf(SPEW, "pv_clock: wall clock version %u, sec %u, nsec %u\n", + wall_clock->version, wall_clock->sec, wall_clock->nsec); + + dprintf(SPEW, "pv_clock: vcpu time info version %u, tsc timestamp %llu, system time %llu\n", + vcpu_time_info->version, vcpu_time_info->tsc_timestamp, vcpu_time_info->system_time); + dprintf(SPEW, "pv_clock: tsc to system mul %u, tsc shift %d, flags %u\n", + vcpu_time_info->tsc_to_system_mul, vcpu_time_info->tsc_shift, vcpu_time_info->flags); + + return NO_ERROR; +} + +uint64_t pvclock_get_tsc_freq(void) { + if (!vcpu_time_info) { + return 0; + } + + uint32_t tsc_mul = 0; + int8_t tsc_shift = 0; + uint32_t pre_version = 0, post_version = 0; + do { + pre_version = vcpu_time_info->version; + if (pre_version % 2 != 0) { + asm("pause"); + continue; + } + tsc_mul = vcpu_time_info->tsc_to_system_mul; + tsc_shift = vcpu_time_info->tsc_shift; + post_version = vcpu_time_info->version; + } while (pre_version != post_version); + + uint64_t tsc_khz = 1000000ULL << 32; + tsc_khz = tsc_khz / tsc_mul; + if (tsc_shift > 0) { + tsc_khz >>= tsc_shift; + } else { + tsc_khz <<= -tsc_shift; + } + return tsc_khz * 1000; +} + +bool pv_clock_is_stable(void) { + if (!vcpu_time_info) { + return false; + } + bool is_stable = (vcpu_time_info->flags & VCPU_TIME_INFO_FLAG_STABLE) || + x86_feature_test(X86_FEATURE_KVM_CLOCKSOURCE_STABLE); + return is_stable; +} + +#endif // !X86_LEGACY \ No newline at end of file diff --git a/arch/x86/rules.mk b/arch/x86/rules.mk index c01e957a..fdb0b380 100644 --- a/arch/x86/rules.mk +++ b/arch/x86/rules.mk @@ -3,9 +3,15 @@ LOCAL_DIR := $(GET_LOCAL_DIR) MODULE := $(LOCAL_DIR) MODULE_OPTIONS := extra_warnings +MODULE_DEPS := lib/fixed_point # x86 code always runs with the mmu enabled WITH_KERNEL_VM := 1 +ifneq ($(CPU),legacy) +WITH_SMP ?= 1 +else +WITH_SMP ?= 0 +endif ifeq ($(SUBARCH),x86-32) MEMBASE ?= 0x00000000 @@ -13,8 +19,8 @@ KERNEL_BASE ?= 0x80000000 KERNEL_LOAD_OFFSET ?= 0x00200000 KERNEL_ASPACE_BASE ?= 0x80000000 KERNEL_ASPACE_SIZE ?= 0x7ff00000 -USER_ASPACE_BASE ?= 0 -USER_ASPACE_SIZE ?= 0x80000000 +USER_ASPACE_BASE ?= 0x1000 # 4KB +USER_ASPACE_SIZE ?= 0x7fffe000 # 2GB - 2*4KB SUBARCH_DIR := $(LOCAL_DIR)/32 endif @@ -27,8 +33,8 @@ KERNEL_BASE ?= 0xffffffff80000000 KERNEL_LOAD_OFFSET ?= 0x00200000 KERNEL_ASPACE_BASE ?= 0xffffff8000000000UL # -512GB KERNEL_ASPACE_SIZE ?= 0x0000008000000000UL -USER_ASPACE_BASE ?= 0x0000000000000000UL -USER_ASPACE_SIZE ?= 0x0000800000000000UL +USER_ASPACE_BASE ?= 0x0000000000001000UL # 4KB +USER_ASPACE_SIZE ?= 0x00007fffffffe000UL # ((1<<47) - 2*4KB) SUBARCH_DIR := $(LOCAL_DIR)/64 endif @@ -41,23 +47,38 @@ GLOBAL_DEFINES += \ KERNEL_LOAD_OFFSET=$(KERNEL_LOAD_OFFSET) \ KERNEL_ASPACE_BASE=$(KERNEL_ASPACE_BASE) \ KERNEL_ASPACE_SIZE=$(KERNEL_ASPACE_SIZE) \ - SMP_MAX_CPUS=1 \ + USER_ASPACE_BASE=$(USER_ASPACE_BASE) \ + USER_ASPACE_SIZE=$(USER_ASPACE_SIZE) \ ARCH_HAS_MMU=1 +ifeq ($(WITH_SMP),1) +SMP_MAX_CPUS ?= 16 +GLOBAL_DEFINES += \ + WITH_SMP=1 \ + SMP_MAX_CPUS=$(SMP_MAX_CPUS) +else +GLOBAL_DEFINES += \ + SMP_MAX_CPUS=1 +endif + MODULE_SRCS += \ $(SUBARCH_DIR)/start.S \ \ $(SUBARCH_DIR)/asm.S \ $(SUBARCH_DIR)/exceptions.S \ + $(SUBARCH_DIR)/gdt.S \ $(SUBARCH_DIR)/mmu.c \ $(SUBARCH_DIR)/ops.S \ + $(SUBARCH_DIR)/spinlock.S \ \ $(LOCAL_DIR)/arch.c \ $(LOCAL_DIR)/cache.c \ $(LOCAL_DIR)/descriptor.c \ $(LOCAL_DIR)/faults.c \ $(LOCAL_DIR)/feature.c \ - $(LOCAL_DIR)/gdt.S \ + $(LOCAL_DIR)/lapic.c \ + $(LOCAL_DIR)/mp.c \ + 
$(LOCAL_DIR)/pv.c \ $(LOCAL_DIR)/thread.c \ # legacy x86's dont have fpu support diff --git a/arch/x86/thread.c b/arch/x86/thread.c index 94aa3f5f..c8275372 100644 --- a/arch/x86/thread.c +++ b/arch/x86/thread.c @@ -17,18 +17,19 @@ #include #include +#if !WITH_SMP /* we're uniprocessor at this point for x86, so store a global pointer to the current thread */ struct thread *_current_thread; +#endif static void initial_thread_func(void) __NO_RETURN; static void initial_thread_func(void) { - int ret; - /* release the thread lock that was implicitly held across the reschedule */ spin_unlock(&thread_lock); arch_enable_ints(); - ret = _current_thread->entry(_current_thread->arg); + thread_t *ct = arch_get_current_thread(); + int ret = ct->entry(ct->arg); thread_exit(ret); } diff --git a/kernel/mp.c b/kernel/mp.c index 0787cab8..ea121458 100644 --- a/kernel/mp.c +++ b/kernel/mp.c @@ -38,6 +38,9 @@ void mp_reschedule(mp_cpu_mask_t target, uint flags) { target &= ~mp.realtime_cpus; } target &= ~(1U << local_cpu); + if (target == 0) { + return; + } LTRACEF("local %d, post mask target now 0x%x\n", local_cpu, target); diff --git a/lib/acpi_lite/acpi_lite.cpp b/lib/acpi_lite/acpi_lite.cpp index beb1c2b5..1059cf0d 100644 --- a/lib/acpi_lite/acpi_lite.cpp +++ b/lib/acpi_lite/acpi_lite.cpp @@ -17,7 +17,7 @@ #include // uses the vm to map in ACPI tables as they are found -static_assert(WITH_KERNEL_VM, ""); +static_assert(WITH_KERNEL_VM); #define LOCAL_TRACE 0 @@ -407,7 +407,7 @@ void acpi_lite_dump_tables(bool full_dump) { } } -status_t acpi_process_madt_entries_etc(const uint8_t search_type, const madt_entry_callback callback) { +status_t acpi_process_madt_entries_etc(const uint8_t search_type, const madt_entry_callback callback, void * const cookie) { const acpi_madt_table* madt = reinterpret_cast(acpi_get_table_by_sig(ACPI_MADT_SIG)); if (!madt) { @@ -417,14 +417,17 @@ status_t acpi_process_madt_entries_etc(const uint8_t search_type, const madt_ent // bytewise array of the same table const uint8_t* madt_array = reinterpret_cast(madt); + LTRACEF("table at %p\n", madt_array); + // walk the table off the end of the header, looking for the requested type size_t off = sizeof(*madt); while (off < madt->header.length) { uint8_t type = madt_array[off]; uint8_t length = madt_array[off + 1]; + LTRACEF("type %u, length %u\n", type, length); if (type == search_type) { - callback(static_cast(&madt_array[off]), length); + callback(static_cast(&madt_array[off]), length, cookie); } off += length; @@ -433,4 +436,31 @@ status_t acpi_process_madt_entries_etc(const uint8_t search_type, const madt_ent return NO_ERROR; } +void acpi_lite_dump_madt_table() { + auto local_apic_callback = [](const void *_entry, size_t entry_len, void *cookie) { + const auto *entry = reinterpret_cast(_entry); + + printf("\tLOCAL APIC id %d, processor id %d, flags %#x\n", + entry->apic_id, entry->processor_id, entry->flags); + }; + + auto io_apic_callback = [](const void *_entry, size_t entry_len, void *cookie) { + const auto *entry = reinterpret_cast(_entry); + + printf("\tIO APIC id %d, address %#x gsi base %u\n", + entry->io_apic_id, entry->io_apic_address, entry->global_system_interrupt_base); + }; + + auto int_source_override_callback = [](const void *_entry, size_t entry_len, void *cookie) { + const auto *entry = reinterpret_cast(_entry); + + printf("\tINT OVERRIDE bus %u, source %u, gsi %u, flags %#x\n", + entry->bus, entry->source, entry->global_sys_interrupt, entry->flags); + }; + printf("MADT/APIC table:\n"); + 
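+  // (these dump callbacks carry no state, hence the nullptr cookie; callers
+  // that accumulate results, like the cpu enumeration in platform/pc/mp.c,
+  // pass a pointer to their own struct instead)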
acpi_process_madt_entries_etc(ACPI_MADT_TYPE_LOCAL_APIC, local_apic_callback, nullptr); + acpi_process_madt_entries_etc(ACPI_MADT_TYPE_IO_APIC, io_apic_callback, nullptr); + acpi_process_madt_entries_etc(ACPI_MADT_TYPE_INT_SOURCE_OVERRIDE, int_source_override_callback, nullptr); +} + // vim: set ts=2 sw=2 expandtab: diff --git a/lib/acpi_lite/include/lib/acpi_lite.h b/lib/acpi_lite/include/lib/acpi_lite.h index 47a63942..4c6a19c7 100644 --- a/lib/acpi_lite/include/lib/acpi_lite.h +++ b/lib/acpi_lite/include/lib/acpi_lite.h @@ -11,18 +11,19 @@ #include #include #include -#include __BEGIN_CDECLS status_t acpi_lite_init(paddr_t rsdt); void acpi_lite_dump_tables(bool full_dump); +void acpi_lite_dump_madt_table(void); const struct acpi_sdt_header* acpi_get_table_by_sig(const char* sig); // A routine to iterate over all the MADT entries of a particular type via a callback //using MadtEntryCallback = fbl::Function; -typedef void (*madt_entry_callback)(const void* entry, size_t entry_len); -status_t acpi_process_madt_entries_etc(uint8_t search_type, const madt_entry_callback); +typedef void (*madt_entry_callback)(const void* entry, size_t entry_len, void *cookie); +status_t acpi_process_madt_entries_etc(uint8_t search_type, madt_entry_callback, void *cookie); + __END_CDECLS diff --git a/lib/acpi_lite/include/lib/acpi_lite/structs.h b/lib/acpi_lite/include/lib/acpi_lite/structs.h index b84c810d..df9d0d5c 100644 --- a/lib/acpi_lite/include/lib/acpi_lite/structs.h +++ b/lib/acpi_lite/include/lib/acpi_lite/structs.h @@ -230,6 +230,7 @@ static_assert(sizeof(struct acpi_madt_int_source_override_entry) == 10, ""); #define ACPI_MADT_FLAG_TRIGGER_MASK 0b1100 // DBG2 table +// From https://learn.microsoft.com/en-us/windows-hardware/drivers/bringup/acpi-debug-port-table #define ACPI_DBG2_SIG "DBG2" struct acpi_dbg2_table { struct acpi_sdt_header header; @@ -263,7 +264,13 @@ static_assert(sizeof(struct acpi_dbg2_device) == 22, ""); // debug port subtypes #define ACPI_DBG2_SUBTYPE_16550_COMPATIBLE 0x0000 #define ACPI_DBG2_SUBTYPE_16550_SUBSET 0x0001 +#define ACPI_DBG2_SUBTYPE_PL011 0x0003 +#define ACPI_DBG2_SUBTYPE_ARM_SBSA 0x000e +#define ACPI_DBG2_SUBTYPE_16550_DESCRIBED 0x0012 +#define ACPI_DBG2_SUBTYPE_RISCV_SBI 0x0015 + #define ACPI_DBG2_SUBTYPE_1394_STANDARD 0x0000 + #define ACPI_DBG2_SUBTYPE_USB_XHCI 0x0000 #define ACPI_DBG2_SUBTYPE_USB_EHCI 0x0001 diff --git a/platform/pc/include/platform/pc/timer.h b/platform/pc/include/platform/pc/timer.h new file mode 100644 index 00000000..a06aceff --- /dev/null +++ b/platform/pc/include/platform/pc/timer.h @@ -0,0 +1,15 @@ +/* + * Copyright (c) 2025 Travis Geiselbrecht + * + * Use of this source code is governed by a MIT-style + * license that can be found in the LICENSE file or at + * https://opensource.org/licenses/MIT + */ +#pragma once + +#include +#include + +// A few shared timer routines needed by the arch/x86 layer +uint32_t pit_calibrate_lapic(uint32_t (*lapic_read_tick)(void)); +uint64_t time_to_tsc_ticks(lk_time_t time); diff --git a/platform/pc/interrupts.c b/platform/pc/interrupts.c index 89dad926..47c6fbaf 100644 --- a/platform/pc/interrupts.c +++ b/platform/pc/interrupts.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include "platform_p.h" #include diff --git a/platform/pc/lapic.c b/platform/pc/lapic.c deleted file mode 100644 index 145574a9..00000000 --- a/platform/pc/lapic.c +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Copyright (c) 2021 Travis Geiselbrecht - * - * Use of this source code is governed by a MIT-style - * license that 
can be found in the LICENSE file or at - * https://opensource.org/licenses/MIT - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "platform_p.h" -#include -#include - -#define LOCAL_TRACE 0 - -static bool lapic_present = false; -static uint8_t *lapic_mmio; - -void lapic_init(void) { - // discover the presence of the local apic and map it - LTRACE_ENTRY; - - // check feature bit 9 in edx of leaf 1 for presence of lapic - lapic_present = x86_feature_test(X86_FEATURE_APIC); -} - -void lapic_init_postvm(uint level) { - if (!lapic_present) - return; - - dprintf(INFO, "X86: local apic detected\n"); - - // IA32_APIC_BASE_MSR - uint64_t apic_base = read_msr(0x1b); - LTRACEF("apic base %#llx\n", apic_base); - - // TODO: assert that it's enabled - - apic_base &= ~0xfff; - dprintf(INFO, "X86: lapic physical address %#llx\n", apic_base); - - // map the lapic into the kernel since it's not guaranteed that the physmap covers it - status_t err = vmm_alloc_physical(vmm_get_kernel_aspace(), "lapic", PAGE_SIZE, (void **)&lapic_mmio, 0, - apic_base & ~0xfff, /* vmm_flags */ 0, ARCH_MMU_FLAG_UNCACHED_DEVICE); - ASSERT(err == NO_ERROR); -} - -LK_INIT_HOOK(lapic, lapic_init_postvm, LK_INIT_LEVEL_VM); - -void lapic_eoi(unsigned int vector) { - LTRACEF("vector %#x\n", vector); - if (lapic_present) { - *REG32(lapic_mmio + 0xb0) = 1; - } -} - diff --git a/platform/pc/mp-boot.S b/platform/pc/mp-boot.S new file mode 100644 index 00000000..c902153c --- /dev/null +++ b/platform/pc/mp-boot.S @@ -0,0 +1,200 @@ +#include +#include + +#if WITH_SMP + +#define LOAD_ADDRESS 0x4000 +#define MSR_EFER 0xc0000080 +#define EFER_LME 0x00000100 + +#define ARGS_ADDRESS (LOAD_ADDRESS + 0x1000) +#define ARGS_CR3 (ARGS_ADDRESS + 0x00) +#if ARCH_X86_64 +#define ARGS_STACK (ARGS_ADDRESS + 0x08) +#else +#define ARGS_STACK (ARGS_ADDRESS + 0x04) +#endif + +.text +.code16 +// secondary cpu boot entry point and switch to protected mode +// enters with the following state: +// real mode, CS 0x0400, PC 0 (physical address 0x4000) +// LOAD_ADDRESS (physical) == mp_boot_start (virtual) +FUNCTION(mp_boot_start) + // jump over the temp GDT below and switch to a flat memory segment (0) + ljmp $0, $(LOAD_ADDRESS + (.Lafter_gdt - mp_boot_start)) + +.org 0x8 +.Lgdt: + // stuff the GDTR in the first entry + .short (8*4) + .int (LOAD_ADDRESS + 0x8) // address of .Lgdt + .short 0 + + // 0x8 code flat 32bit + .short 0xffff /* limit 15:00 */ + .short 0x0000 /* base 15:00 */ + .byte 0x00 /* base 23:16 */ + .byte 0b10011010 /* P(1) DPL(00) S(1) 1 C(0) R(1) A(0) */ + .byte 0b11001111 /* G(1) D(1) 0 0 limit 19:16 */ + .byte 0x0 /* base 31:24 */ + + // 0x10 data flat 32bit + .short 0xffff /* limit 15:00 */ + .short 0x0000 /* base 15:00 */ + .byte 0x00 /* base 23:16 */ + .byte 0b10010010 /* P(1) DPL(00) S(1) 0 E(0) W(1) A(0) */ + .byte 0b11001111 /* G(1) B(1) 0 0 limit 19:16 */ + .byte 0x0 /* base 31:24 */ + + // 0x18 code 64bit + .short 0xffff /* limit 15:00 */ + .short 0x0000 /* base 15:00 */ + .byte 0x00 /* base 23:16 */ + .byte 0b10011010 /* P(1) DPL(00) S(1) 1 C(0) R(1) A(0) */ + .byte 0b10101111 /* G(1) D(0) L(1) AVL(0) limit 19:16 */ + .byte 0x0 /* base 31:24 */ + +.Lafter_gdt: + // load the above GDT + lgdt (LOAD_ADDRESS + 0x08) + + // switch to protected mode + movl %cr0, %eax + orl $1, %eax + movl %eax, %cr0 + + // jump to 32bit mode + ljmpl $0x8, $(LOAD_ADDRESS + (.Lprot - mp_boot_start)) +.Lprot: + .code32 + // we're now in 32bit mode, set up the 32bit data 
segment registers + mov $0x10, %ax + mov %ax, %ss + mov %ax, %ds + mov %ax, %es + mov %ax, %fs + mov %ax, %gs + +#if ARCH_X86_64 + // set up 64bit paging + // set PAE bit in CR4 + mov %cr4, %eax + or $(1<<5), %eax + mov %eax, %cr4 + + // Enable Long mode + movl $MSR_EFER ,%ecx + rdmsr + orl $EFER_LME,%eax + wrmsr + + // load trampoline page table + movl (ARGS_CR3), %eax + mov %eax, %cr3 + + // enable paging, now we're in 32bit compatibility mode + mov %cr0, %eax + btsl $(31), %eax + mov %eax, %cr0 + + // load a very temporary stack pointer + movl $(LOAD_ADDRESS + 0x800), %esp + + // Use a far jump to get into 64bit mode + pushl $0x18 + pushl $(LOAD_ADDRESS + (.Lfarjump64 - mp_boot_start)) + lret + +.code64 +.Lfarjump64: + /* branch to our high address */ + movq (.Lhigh_addr), %rax + jmp *%rax +.Lhigh_addr: +.quad mp_boot_start_high + +#else // ARCH_X86_32 + // set up 32bit paging + + // set PSE bit in CR4 + mov %cr4, %eax + or $(1<<4), %eax + mov %eax, %cr4 + + // load trampoline page table + movl (ARGS_CR3), %eax + mov %eax, %cr3 + + // enable paging + mov %cr0, %eax + btsl $(31), %eax + mov %eax, %cr0 + + // Branch to the high address + lea mp_boot_start_high, %eax + jmp *%eax +#endif + +DATA(mp_boot_end) +END_FUNCTION(mp_boot_start) + +FUNCTION(mp_boot_start_high) +#if ARCH_X86_64 + // set up stack pointer + mov (ARGS_STACK), %rsp + + // load the real GDT + lgdt _gdtr + + push $CODE_64_SELECTOR + lea .Lnext(%rip), %rax + push %rax + lretq +.Lnext: + // zero out the segment registers + xor %ax, %ax + mov %ax, %ds + mov %ax, %es + mov %ax, %fs + mov %ax, %gs + mov %ax, %ss + + // call into C + cld + mov $ARGS_ADDRESS, %rdi + call secondary_entry + jmp . + +#else // ARCH_X86_32 + // set up stack pointer + mov (ARGS_STACK), %esp + + // load the real GDT + lgdt _gdtr + + push $CODE_SELECTOR + lea .Lnext, %eax + push %eax + lret +.Lnext: + + // Load the real segment registers + mov $DATA_SELECTOR, %ax + mov %ax, %ds + mov %ax, %es + mov %ax, %fs + mov %ax, %gs + mov %ax, %ss + + // call into C + cld + push $ARGS_ADDRESS + call secondary_entry + jmp . 
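+    // (ARGS_CR3 and ARGS_STACK, consumed above, live in the page at
+    // LOAD_ADDRESS + 0x1000 and mirror the first two fields of
+    // struct bootstrap_args in platform/pc/mp.c, which fills them in)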
+ +#endif +END_FUNCTION(mp_boot_start_high) + +#endif // WITH_SMP \ No newline at end of file diff --git a/platform/pc/mp.c b/platform/pc/mp.c new file mode 100644 index 00000000..149e4656 --- /dev/null +++ b/platform/pc/mp.c @@ -0,0 +1,195 @@ +/* + * Copyright (c) 2024 Travis Geiselbrecht + * + * Use of this source code is governed by a MIT-style + * license that can be found in the LICENSE file or at + * https://opensource.org/licenses/MIT + */ + +#include "platform_p.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#if WITH_SMP + +#define TRAMPOLINE_ADDRESS 0x4000 + +#define LOCAL_TRACE 1 + +extern void mp_boot_start(void); +extern void mp_boot_end(void); + +struct bootstrap_args { + // referenced in mp-boot.S, do not move without updating assembly + uintptr_t trampoline_cr3; + uintptr_t stack_top; + + // referenced in C, okay to move + uintptr_t cpu_num; + volatile uint32_t *boot_completed_ptr; // set by the secondary cpu when it's done +}; + +// called from assembly code in mp-boot.S +__NO_RETURN void secondary_entry(struct bootstrap_args *args) { + volatile uint32_t *boot_completed = args->boot_completed_ptr; + uint cpu_num = args->cpu_num; + + // context switch to the kernels cr3 + x86_set_cr3(vmm_get_kernel_aspace()->arch_aspace.cr3_phys); + // from now on out the boot args structure is not visible + + // we're done, let the primary cpu know so it can reuse the args + *boot_completed = 1; + + x86_secondary_entry(cpu_num); +} + +static status_t start_cpu(uint cpu_num, uint32_t apic_id, struct bootstrap_args *args) { + LTRACEF("cpu_num %u, apic_id %u\n", cpu_num, apic_id); + + // assert that this thread is pinned to the current cpu + DEBUG_ASSERT(thread_pinned_cpu(get_current_thread()) == (int)arch_curr_cpu_num()); + + volatile uint32_t boot_completed = 0; + args->boot_completed_ptr = &boot_completed; + + // start x86 secondary cpu + + // send INIT IPI + lapic_send_init_ipi(apic_id, true); + thread_sleep(10); + + // deassert INIT + lapic_send_init_ipi(apic_id, false); + thread_sleep(10); + + // send Startup IPI up to 2 times as recommended by Intel + for (int i = 0; i < 2; i++) { + lapic_send_startup_ipi(apic_id, TRAMPOLINE_ADDRESS); + + // Wait a little bit for the cpu to start before trying a second time + thread_sleep(10); + if (boot_completed) { + goto booted; + } + } + + // Wait up to a second for the cpu to finish starting + for (int i = 0; i < 1000; i++) { + if (boot_completed) { + goto booted; + } + thread_sleep(10); + } + + // we have failed to start this core + // TODO: handle trying to shut the core down before moving on. + printf("PC: failed to start cpu %u\n", cpu_num); + return ERR_TIMED_OUT; + +booted: + LTRACEF("cpu %u booted\n", cpu_num); + return NO_ERROR; +} + +struct detected_cpus { + uint32_t num_detected; + uint32_t apic_ids[SMP_MAX_CPUS]; +}; + +static void local_apic_callback(const void *_entry, size_t entry_len, void *cookie) { + const struct acpi_madt_local_apic_entry *entry = _entry; + struct detected_cpus *cpus = cookie; + + if ((entry->flags & ACPI_MADT_FLAG_ENABLED) == 0) { + return; + } + + // TODO: read the current APIC id and skip it, instead of assuming 0 is the boot cpu + // read BSP from X86_IA32_APIC_BASE_MSR bit 8? 
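+    // (bit 8 of IA32_APIC_BASE is the BSP flag: it reads as 1 only on the
+    // bootstrap processor, so checking it would identify the boot cpu
+    // without assuming its apic id is 0)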
+ if (entry->apic_id == 0) { + // skip the boot cpu + return; + } + if (cpus->num_detected < SMP_MAX_CPUS) { + cpus->apic_ids[cpus->num_detected++] = entry->apic_id; + } +} + +void platform_start_secondary_cpus(void) { + struct detected_cpus cpus; + cpus.num_detected = 1; + cpus.apic_ids[0] = 0; // the boot cpu + + acpi_process_madt_entries_etc(ACPI_MADT_TYPE_LOCAL_APIC, &local_apic_callback, &cpus); + + // TODO: fall back to legacy methods if ACPI fails + // TODO: deal with cpu topology + + // start up the secondary cpus + if (cpus.num_detected < 2) { + dprintf(INFO, "PC: no secondary cpus detected\n"); + return; + } + + // create a new aspace to build an identity map in + vmm_aspace_t *aspace; + status_t err = vmm_create_aspace(&aspace, "identity map", 0); + if (err < 0) { + panic("failed to create identity map aspace\n"); + } + + // set up an identity map for the trampoline code + + void *ptr = (void *)TRAMPOLINE_ADDRESS; + err = vmm_alloc_physical(aspace, "trampoline", 0x10000, &ptr, 0, + TRAMPOLINE_ADDRESS, VMM_FLAG_VALLOC_SPECIFIC, ARCH_MMU_FLAG_CACHED); + if (err < 0) { + panic("failed to allocate trampoline memory\n"); + } + + vmm_aspace_t *old_aspace = vmm_set_active_aspace(aspace); + + // set up bootstrap code page at TRAMPOLINE_ADDRESS for secondary cpu + memcpy(ptr, mp_boot_start, mp_boot_end - mp_boot_start); + + // next page has args in it + struct bootstrap_args *args = (struct bootstrap_args *)((uintptr_t)ptr + 0x1000); + args->trampoline_cr3 = aspace->arch_aspace.cr3_phys; + + dprintf(INFO, "PC: detected %u cpus\n", cpus.num_detected); + + lk_init_secondary_cpus(cpus.num_detected - 1); + err = x86_allocate_percpu_array(cpus.num_detected - 1); + if (err < 0) { + panic("failed to allocate percpu array\n"); + } + + for (uint i = 1; i < cpus.num_detected; i++) { + dprintf(INFO, "PC: starting cpu %u\n", cpus.apic_ids[i]); + + args->cpu_num = i; + + x86_percpu_t *percpu = x86_get_percpu_for_cpu(i); + args->stack_top = (uintptr_t)percpu->bootstrap_stack + sizeof(percpu->bootstrap_stack); + + LTRACEF("args for cpu %lu: trampoline_cr3 %#lx, stack_top 0x%lx\n", args->cpu_num, args->trampoline_cr3, args->stack_top); + + start_cpu(i, cpus.apic_ids[i], args); + } + + // restore old aspace + vmm_set_active_aspace(old_aspace); + + // free the trampoline aspace + vmm_free_aspace(aspace); +} + +#endif // WITH_SMP \ No newline at end of file diff --git a/platform/pc/pit.c b/platform/pc/pit.c new file mode 100644 index 00000000..f0f7c97c --- /dev/null +++ b/platform/pc/pit.c @@ -0,0 +1,319 @@ +/* + * Copyright (c) 2009 Corey Tabaka + * + * Use of this source code is governed by a MIT-style + * license that can be found in the LICENSE file or at + * https://opensource.org/licenses/MIT + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "platform_p.h" +#include +#include + +#define LOCAL_TRACE 0 + +// TODO: switch this logic to lib/fixed_point math + +static platform_timer_callback t_callback; +static void *callback_arg; +static spin_lock_t lock = SPIN_LOCK_INITIAL_VALUE; + +static uint64_t ticks_per_ms; + +// next callback event time in 32.32 fixed point milliseconds +static uint64_t next_trigger_time; + +// if periodic, the delta to set to the next event. 
if oneshot, 0 +static uint64_t next_trigger_delta; + +// time in 32.32 fixed point milliseconds +static volatile uint64_t timer_current_time; +// delta time per periodic tick in 32.32 +static uint64_t timer_delta_time; + +#define INTERNAL_FREQ 1193182ULL +#define INTERNAL_FREQ_3X 3579546ULL +#define INTERNAL_FREQ_TICKS_PER_MS (INTERNAL_FREQ / 1000u) + +/* Maximum amount of time that can be program on the timer to schedule the next + * interrupt, in milliseconds */ +#define MAX_TIMER_INTERVAL 55 + +lk_time_t pit_current_time(void) { + spin_lock_saved_state_t state; + spin_lock_irqsave(&lock, state); + + lk_time_t time = (lk_time_t) (timer_current_time >> 32); + + spin_unlock_irqrestore(&lock, state); + + return time; +} + +lk_bigtime_t pit_current_time_hires(void) { + spin_lock_saved_state_t state; + spin_lock_irqsave(&lock, state); + + lk_bigtime_t time = (lk_bigtime_t) ((timer_current_time >> 22) * 1000) >> 10; + + spin_unlock_irqrestore(&lock, state); + + return time; +} + +static enum handler_return pit_timer_tick(void *arg) { + if (next_trigger_time != 0 || next_trigger_delta) { + LTRACEF("ntt %#" PRIx64 ", ntd %#" PRIx64 "\n", next_trigger_time, next_trigger_delta); + } + + spin_lock(&lock); + timer_current_time += timer_delta_time; + spin_unlock(&lock); + + lk_time_t time = current_time(); + + if (t_callback && next_trigger_time != 0 && timer_current_time >= next_trigger_time) { + if (next_trigger_delta != 0) { + uint64_t delta = timer_current_time - next_trigger_time; + next_trigger_time = timer_current_time + next_trigger_delta - delta; + } else { + next_trigger_time = 0; + } + + return t_callback(callback_arg, time); + } else { + return INT_NO_RESCHEDULE; + } +} + +static void set_pit_frequency(uint32_t frequency) { + uint32_t count, remainder; + + LTRACEF("frequency %u\n", frequency); + + /* figure out the correct divisor for the desired frequency */ + if (frequency <= 18) { + count = 0xffff; + } else if (frequency >= INTERNAL_FREQ) { + count = 1; + } else { + count = INTERNAL_FREQ_3X / frequency; + remainder = INTERNAL_FREQ_3X % frequency; + + if (remainder >= INTERNAL_FREQ_3X / 2) { + count += 1; + } + + count /= 3; + remainder = count % 3; + + if (remainder >= 1) { + count += 1; + } + } + + uint16_t divisor = count & 0xffff; + + /* + * funky math that i don't feel like explaining. essentially 32.32 fixed + * point representation of the configured timer delta. 
+ */ + timer_delta_time = (3685982306ULL * count) >> 10; + + LTRACEF("dt %#x.%08x\n", (uint32_t)(timer_delta_time >> 32), (uint32_t)(timer_delta_time & 0xffffffff)); + LTRACEF("divisor %" PRIu16 "\n", divisor); + + /* + * setup the Programmable Interval Timer + * timer 0, mode 2, binary counter, LSB followed by MSB + */ + outp(I8253_CONTROL_REG, 0x34); + outp(I8253_DATA_REG, divisor & 0xff); // LSB + outp(I8253_DATA_REG, divisor >> 8); // MSB +} + +void pit_init(void) { + // start the PIT at 1Khz in free-running mode to keep a time base + timer_current_time = 0; + ticks_per_ms = INTERNAL_FREQ/1000; + set_pit_frequency(1000); // ~1ms granularity + register_int_handler(INT_PIT, &pit_timer_tick, NULL); + unmask_interrupt(INT_PIT); +} + +status_t pit_set_periodic_timer(platform_timer_callback callback, void *arg, lk_time_t interval) { + LTRACEF("pit_set_periodic_timer: interval %u\n", interval); + + spin_lock_saved_state_t state; + spin_lock_irqsave(&lock, state); + + t_callback = callback; + callback_arg = arg; + + next_trigger_delta = (uint64_t) interval << 32; + next_trigger_time = timer_current_time + next_trigger_delta; + + unmask_interrupt(INT_PIT); + spin_unlock_irqrestore(&lock, state); + + return NO_ERROR; +} + +status_t pit_set_oneshot_timer(platform_timer_callback callback, void *arg, lk_time_t interval) { + LTRACEF("pit_set_oneshot_timer: interval %u\n", interval); + + spin_lock_saved_state_t state; + spin_lock_irqsave(&lock, state); + + t_callback = callback; + callback_arg = arg; + + next_trigger_delta = 0; + next_trigger_time = timer_current_time + ((uint64_t)interval << 32); + + unmask_interrupt(INT_PIT); + spin_unlock_irqrestore(&lock, state); + + return NO_ERROR; +} + +void pit_cancel_timer(void) { + LTRACE; + + spin_lock_saved_state_t state; + spin_lock_irqsave(&lock, state); + + next_trigger_time = 0; + + spin_unlock_irqrestore(&lock, state); +} + +void pit_stop_timer(void) { + LTRACE; + + spin_lock_saved_state_t state; + spin_lock_irqsave(&lock, state); + + next_trigger_time = 0; + next_trigger_delta = 0; + + // stop the PIT + outp(I8253_CONTROL_REG, 0x34); + outp(I8253_DATA_REG, 0); // LSB + outp(I8253_DATA_REG, 0); // MSB + mask_interrupt(INT_PIT); + + spin_unlock_irqrestore(&lock, state); +} + +uint64_t pit_calibrate_tsc(void) { + DEBUG_ASSERT(arch_ints_disabled()); + + uint64_t tsc_ticks[5] = {0}; + uint32_t countdown_ms[5] = {0}; + + uint64_t tsc_freq = 0; + for (uint i = 0; i < countof(tsc_ticks); i++) { + // calibrate the tsc frequency using the PIT + countdown_ms[i] = 2 * (i + 1); + + uint16_t pic_ticks = INTERNAL_FREQ_TICKS_PER_MS * countdown_ms[i]; + outp(I8253_CONTROL_REG, 0x30); + outp(I8253_DATA_REG, pic_ticks & 0xff); // LSB + outp(I8253_DATA_REG, pic_ticks >> 8); // MSB + + // read the tsc + uint64_t tsc_start = __builtin_ia32_rdtsc(); + + // wait for countdown_ms + uint8_t status = 0; + do { + // Send a read-back command that latches the status of ch0 + outp(I8253_CONTROL_REG, 0xe2); + status = inp(I8253_DATA_REG); + // Wait for bit 7 (output) to go high and for bit 6 (null count) to go low + } while ((status & 0xc0) != 0x80); + + uint64_t tsc_end = __builtin_ia32_rdtsc(); + tsc_ticks[i] = tsc_end - tsc_start; + } + + // find the best time + uint best_index = 0; + for (uint i = 1; i < countof(tsc_ticks); i++) { + if (tsc_ticks[i] < tsc_ticks[best_index]) { + best_index = i; + } + } + + // calculate the tsc frequency + tsc_freq = (tsc_ticks[best_index] * 1000) / countdown_ms[best_index]; + dprintf(INFO, "PIT: calibrated TSC frequency: %" PRIu64 "Hz\n", 
tsc_freq); + + // put the PIT back to 1ms countdown + set_pit_frequency(1000); + + return tsc_freq; +} + +uint32_t pit_calibrate_lapic(uint32_t (*lapic_read_tick)(void)) { + DEBUG_ASSERT(arch_ints_disabled()); + + uint64_t lapic_ticks[5] = {0}; + uint32_t countdown_ms[5] = {0}; + + for (uint i = 0; i < countof(lapic_ticks); i++) { + // calibrate the tsc frequency using the PIT + countdown_ms[i] = 2 * (i + 1); + + uint16_t pic_ticks = INTERNAL_FREQ_TICKS_PER_MS * countdown_ms[i]; + outp(I8253_CONTROL_REG, 0x30); + outp(I8253_DATA_REG, pic_ticks & 0xff); // LSB + outp(I8253_DATA_REG, pic_ticks >> 8); // MSB + + // read the tsc + uint32_t tick_start = lapic_read_tick(); + + // wait for countdown_ms + uint8_t status = 0; + do { + // Send a read-back command that latches the status of ch0 + outp(I8253_CONTROL_REG, 0xe2); + status = inp(I8253_DATA_REG); + // Wait for bit 7 (output) to go high and for bit 6 (null count) to go low + } while ((status & 0xc0) != 0x80); + + uint32_t tick_end = lapic_read_tick(); + lapic_ticks[i] = tick_start - tick_end; + } + + // find the best time + uint best_index = 0; + for (uint i = 1; i < countof(lapic_ticks); i++) { + if (lapic_ticks[i] < lapic_ticks[best_index]) { + best_index = i; + } + } + + // calculate the tsc frequency + uint32_t freq = (lapic_ticks[best_index] * 1000) / countdown_ms[best_index]; + dprintf(INFO, "PIT: calibrated local apic frequency: %" PRIu32 "Hz\n", freq); + + // put the PIT back to 1ms countdown + set_pit_frequency(1000); + + return freq; +} \ No newline at end of file diff --git a/platform/pc/platform.c b/platform/pc/platform.c index c01c248e..69dc9033 100644 --- a/platform/pc/platform.c +++ b/platform/pc/platform.c @@ -13,7 +13,6 @@ #include #include #include -#include "platform_p.h" #include #include #include @@ -22,12 +21,13 @@ #include #include #include -#include #include #include #include #include +#include "platform_p.h" + #if WITH_DEV_BUS_PCI #include #endif @@ -188,9 +188,6 @@ void platform_early_init(void) { /* initialize the interrupt controller */ platform_init_interrupts(); - /* initialize the timer */ - platform_init_timer(); - /* look at multiboot to determine our memory size */ size_t found_arenas; platform_parse_multiboot_info(&found_arenas); @@ -218,44 +215,30 @@ void platform_early_init(void) { dprintf(INFO, "PC: total memory detected %" PRIu64 " bytes\n", total_mem); } -void local_apic_callback(const void *_entry, size_t entry_len) { - const struct acpi_madt_local_apic_entry *entry = _entry; - - printf("\tLOCAL APIC id %d, processor id %d, flags %#x\n", - entry->apic_id, entry->processor_id, entry->flags); -} - -void io_apic_callback(const void *_entry, size_t entry_len) { - const struct acpi_madt_io_apic_entry *entry = _entry; - - printf("\tIO APIC id %d, address %#x gsi base %u\n", - entry->io_apic_id, entry->io_apic_address, entry->global_system_interrupt_base); -} - -void int_source_override_callback(const void *_entry, size_t entry_len) { - const struct acpi_madt_int_source_override_entry *entry = _entry; - - printf("\tINT OVERRIDE bus %u, source %u, gsi %u, flags %#x\n", - entry->bus, entry->source, entry->global_sys_interrupt, entry->flags); -} - void platform_init(void) { platform_init_debug(); platform_init_keyboard(&console_input_buf); -#if WITH_DEV_BUS_PCI - bool pci_initted = false; + // Look for the root ACPI table + __UNUSED bool found_acpi = false; if (acpi_lite_init(0) == NO_ERROR) { if (LOCAL_TRACE) { acpi_lite_dump_tables(false); } + acpi_lite_dump_madt_table(); + found_acpi = true; + } - // dump 
the APIC table - printf("MADT/APIC table:\n"); - acpi_process_madt_entries_etc(ACPI_MADT_TYPE_LOCAL_APIC, &local_apic_callback); - acpi_process_madt_entries_etc(ACPI_MADT_TYPE_IO_APIC, &io_apic_callback); - acpi_process_madt_entries_etc(ACPI_MADT_TYPE_INT_SOURCE_OVERRIDE, &int_source_override_callback); + // Look for secondary cpus +#if WITH_SMP + platform_start_secondary_cpus(); +#endif + +#if WITH_DEV_BUS_PCI + bool pci_initted = false; + if (found_acpi) { + // TODO: handle interrupt source overrides from the MADT table // try to find the mcfg table const struct acpi_mcfg_table *table = (const struct acpi_mcfg_table *)acpi_get_table_by_sig(ACPI_MCFG_SIG); diff --git a/platform/pc/platform_p.h b/platform/pc/platform_p.h index 59e340fd..bd6e428a 100644 --- a/platform/pc/platform_p.h +++ b/platform/pc/platform_p.h @@ -8,13 +8,13 @@ #pragma once #include +#include extern cbuf_t console_input_buf; void platform_init_debug_early(void); void platform_init_debug(void); void platform_init_interrupts(void); -void platform_init_timer(void); // legacy programmable interrupt controller void pic_init(void); @@ -22,7 +22,15 @@ void pic_enable(unsigned int vector, bool enable); void pic_eoi(unsigned int vector); void pic_mask_interrupts(void); -// local apic -void lapic_init(void); -void lapic_eoi(unsigned int vector); +// programable interval timer +void pit_init(void); +status_t pit_set_periodic_timer(platform_timer_callback callback, void *arg, lk_time_t interval); +status_t pit_set_oneshot_timer(platform_timer_callback callback, void *arg, lk_time_t interval); +void pit_cancel_timer(void); +void pit_stop_timer(void); +lk_time_t pit_current_time(void); +lk_bigtime_t pit_current_time_hires(void); +uint64_t pit_calibrate_tsc(void); +// secondary cpus +void platform_start_secondary_cpus(void); diff --git a/platform/pc/rules.mk b/platform/pc/rules.mk index 9b62107d..2ddfd39c 100644 --- a/platform/pc/rules.mk +++ b/platform/pc/rules.mk @@ -6,10 +6,10 @@ MODULE := $(LOCAL_DIR) # legacy implies older hardware, pre pentium, pre pci CPU ?= modern -MODULE_DEPS += \ - lib/acpi_lite \ - lib/bio \ - lib/cbuf +MODULE_DEPS += lib/acpi_lite +MODULE_DEPS += lib/bio +MODULE_DEPS += lib/cbuf +MODULE_DEPS += lib/fixed_point ifneq ($(CPU),legacy) MODULE_DEPS += dev/bus/pci/drivers @@ -22,13 +22,18 @@ MODULE_SRCS += \ $(LOCAL_DIR)/ide.c \ $(LOCAL_DIR)/interrupts.c \ $(LOCAL_DIR)/keyboard.c \ - $(LOCAL_DIR)/lapic.c \ + $(LOCAL_DIR)/mp.c \ + $(LOCAL_DIR)/mp-boot.S \ $(LOCAL_DIR)/pic.c \ + $(LOCAL_DIR)/pit.c \ $(LOCAL_DIR)/platform.c \ $(LOCAL_DIR)/timer.c \ $(LOCAL_DIR)/uart.c \ LK_HEAP_IMPLEMENTATION ?= dlmalloc +GLOBAL_DEFINES += \ + PLATFORM_HAS_DYNAMIC_TIMER=1 + include make/module.mk diff --git a/platform/pc/timer.c b/platform/pc/timer.c index ba4b852b..46c3e18a 100644 --- a/platform/pc/timer.c +++ b/platform/pc/timer.c @@ -1,184 +1,186 @@ /* - * Copyright (c) 2009 Corey Tabaka + * Copyright (c) 2025 Travis Geiselbrecht * * Use of this source code is governed by a MIT-style * license that can be found in the LICENSE file or at * https://opensource.org/licenses/MIT */ #include -#include -#include #include +#include +#include +#include +#include #include -#include +#include #include -#include -#include #include #include -#include "platform_p.h" +#include #include +#include +#include +#include +#include +#include -static platform_timer_callback t_callback; -static void *callback_arg; -static spin_lock_t lock; +#include "platform_p.h" -static uint64_t next_trigger_time; -static uint64_t next_trigger_delta; -static uint64_t 
diff --git a/platform/pc/timer.c b/platform/pc/timer.c
index ba4b852b..46c3e18a 100644
--- a/platform/pc/timer.c
+++ b/platform/pc/timer.c
@@ -1,184 +1,186 @@
 /*
- * Copyright (c) 2009 Corey Tabaka
+ * Copyright (c) 2025 Travis Geiselbrecht
  *
  * Use of this source code is governed by a MIT-style
  * license that can be found in the LICENSE file or at
  * https://opensource.org/licenses/MIT
  */
 #include
-#include
-#include
 #include
+#include
+#include
+#include
+#include
 #include
-#include
 #include
-#include
-#include
 #include
 #include
-#include "platform_p.h"
 #include
+#include
 #include
+#include
+#include
+#include
+#include
+#include

-static platform_timer_callback t_callback;
-static void *callback_arg;
-static spin_lock_t lock;
+#include "platform_p.h"

-static uint64_t next_trigger_time;
-static uint64_t next_trigger_delta;
-static uint64_t ticks_per_ms;
+#define LOCAL_TRACE 0

-static uint64_t timer_delta_time;
-static volatile uint64_t timer_current_time;
+// Deals with all of the various clock sources and event timers on the PC platform.
+// TODO:
+//   HPET
+//   cpuid leaves that describe clock rates

-static uint16_t divisor;
+static enum clock_source {
+    CLOCK_SOURCE_INITIAL,
+    CLOCK_SOURCE_PIT,
+    CLOCK_SOURCE_TSC,
+    CLOCK_SOURCE_HPET,
+} clock_source = CLOCK_SOURCE_INITIAL;

-#define INTERNAL_FREQ 1193182ULL
-#define INTERNAL_FREQ_3X 3579546ULL
+static struct fp_32_64 tsc_to_timebase;
+static struct fp_32_64 tsc_to_timebase_hires;
+static struct fp_32_64 timebase_to_tsc;
+static bool use_lapic_timer = false;

-/* Maximum amount of time that can be program on the timer to schedule the next
- * interrupt, in milliseconds */
-#define MAX_TIMER_INTERVAL 55
-
-
-
-status_t platform_set_periodic_timer(platform_timer_callback callback, void *arg, lk_time_t interval) {
-    t_callback = callback;
-    callback_arg = arg;
-
-    next_trigger_delta = (uint64_t) interval << 32;
-    next_trigger_time = timer_current_time + next_trigger_delta;
-
-    return NO_ERROR;
+static const char *clock_source_name(void) {
+    switch (clock_source) {
+        case CLOCK_SOURCE_INITIAL:
+            return "initial";
+        case CLOCK_SOURCE_PIT:
+            return "PIT";
+        case CLOCK_SOURCE_TSC:
+            return "TSC";
+        case CLOCK_SOURCE_HPET:
+            return "HPET";
+        default:
+            return "unknown";
+    }
 }

 lk_time_t current_time(void) {
-    lk_time_t time;
-
-    // XXX slight race
-    time = (lk_time_t) (timer_current_time >> 32);
-
-    return time;
+    switch (clock_source) {
+        case CLOCK_SOURCE_PIT:
+            return pit_current_time();
+        case CLOCK_SOURCE_TSC:
+            return u32_mul_u64_fp32_64(__builtin_ia32_rdtsc(), tsc_to_timebase);
+        default:
+            return 0;
+    }
 }

 lk_bigtime_t current_time_hires(void) {
-    lk_bigtime_t time;
-
-    // XXX slight race
-    time = (lk_bigtime_t) ((timer_current_time >> 22) * 1000) >> 10;
-
-    return time;
-}
-
-static enum handler_return os_timer_tick(void *arg) {
-    uint64_t delta;
-
-    timer_current_time += timer_delta_time;
-
-    lk_time_t time = current_time();
-    //lk_bigtime_t btime = current_time_hires();
-    //printf_xy(71, 0, WHITE, "%08u", (uint32_t) time);
-    //printf_xy(63, 1, WHITE, "%016llu", (uint64_t) btime);
-
-    if (t_callback && timer_current_time >= next_trigger_time) {
-        delta = timer_current_time - next_trigger_time;
-        next_trigger_time = timer_current_time + next_trigger_delta - delta;
-
-        return t_callback(callback_arg, time);
-    } else {
-        return INT_NO_RESCHEDULE;
+    switch (clock_source) {
+        case CLOCK_SOURCE_PIT:
+            return pit_current_time_hires();
+        case CLOCK_SOURCE_TSC:
+            return u64_mul_u64_fp32_64(__builtin_ia32_rdtsc(), tsc_to_timebase_hires);
+        default:
+            return 0;
     }
 }

-static void set_pit_frequency(uint32_t frequency) {
-    uint32_t count, remainder;
+// Convert lk_time_t to TSC ticks
+uint64_t time_to_tsc_ticks(lk_time_t time) {
+    return u64_mul_u32_fp32_64(time, timebase_to_tsc);
+}

-    /* figure out the correct divisor for the desired frequency */
-    if (frequency <= 18) {
-        count = 0xffff;
-    } else if (frequency >= INTERNAL_FREQ) {
-        count = 1;
-    } else {
-        count = INTERNAL_FREQ_3X / frequency;
-        remainder = INTERNAL_FREQ_3X % frequency;
+void pc_init_timer(unsigned int level) {
+    // Initialize the PIT; it's always present on PC hardware
+    pit_init();
+    clock_source = CLOCK_SOURCE_PIT;

-        if (remainder >= INTERNAL_FREQ_3X / 2) {
-            count += 1;
-        }
+#if !X86_LEGACY
+    // XXX update note about what invariant TSC means
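+    // An invariant TSC (CPUID.80000007H:EDX[8]) ticks at a constant rate regardless
+    // of P-state, C-state, or turbo transitions, which is what makes it usable as a
+    // steady time base.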
+    bool use_invariant_tsc = x86_feature_test(X86_FEATURE_INVAR_TSC);
+    LTRACEF("invariant TSC %d\n", use_invariant_tsc);

-        count /= 3;
-        remainder = count % 3;
+    // Test for hypervisor PV clock, which also effectively tells us whether the TSC
+    // is invariant across all cpus.
+    if (pvclock_init() == NO_ERROR) {
+        bool pv_clock_stable = pv_clock_is_stable();

-        if (remainder >= 1) {
-            count += 1;
-        }
+        use_invariant_tsc |= pv_clock_stable;
+
+        printf("pv_clock: Clocksource is %sstable\n", (pv_clock_stable ? "" : "not "));
     }

-    divisor = count & 0xffff;
+    // XXX test for HPET and use it over PIT if present

-    /*
-     * funky math that i don't feel like explaining. essentially 32.32 fixed
-     * point representation of the configured timer delta.
-     */
-    timer_delta_time = (3685982306ULL * count) >> 10;
+    if (use_invariant_tsc) {
+        // We're going to try to use the TSC as a time base, so obtain the TSC frequency.
+        uint64_t tsc_hz = 0;

-    //dprintf(DEBUG, "set_pit_frequency: dt=%016llx\n", timer_delta_time);
-    //dprintf(DEBUG, "set_pit_frequency: divisor=%04x\n", divisor);
+        tsc_hz = pvclock_get_tsc_freq();
+        if (tsc_hz == 0) {
+            // TODO: some x86 cores describe the TSC and lapic clocks in cpuid

-    /*
-     * setup the Programmable Interval Timer
-     * timer 0, mode 2, binary counter, LSB followed by MSB
-     */
-    outp(I8253_CONTROL_REG, 0x34);
-    outp(I8253_DATA_REG, divisor & 0xff); // LSB
-    outp(I8253_DATA_REG, divisor >> 8); // MSB
+            // Calibrate the TSC against the PIT, which should always be present
+            tsc_hz = pit_calibrate_tsc();
+            if (tsc_hz == 0) {
+                dprintf(CRITICAL, "PC: failed to calibrate TSC frequency\n");
+                goto out;
+            }
+        }
+
+        dprintf(INFO, "PC: TSC frequency %" PRIu64 "Hz\n", tsc_hz);
+
+        // Compute the ratio of TSC to timebase
+        fp_32_64_div_32_32(&tsc_to_timebase, 1000, tsc_hz);
+        dprintf(INFO, "PC: TSC to timebase ratio %u.%08u...\n",
+                tsc_to_timebase.l0, tsc_to_timebase.l32);
+
+        fp_32_64_div_32_32(&tsc_to_timebase_hires, 1000*1000, tsc_hz);
+        dprintf(INFO, "PC: TSC to hires timebase ratio %u.%08u...\n",
+                tsc_to_timebase_hires.l0, tsc_to_timebase_hires.l32);
+
+        fp_32_64_div_32_32(&timebase_to_tsc, tsc_hz, 1000);
+        dprintf(INFO, "PC: timebase to TSC ratio %u.%08u...\n",
+                timebase_to_tsc.l0, timebase_to_tsc.l32);
+
+        clock_source = CLOCK_SOURCE_TSC;
+    }
+out:
+
+    // Set up the local apic for event timer interrupts
+    if (lapic_timer_init(use_invariant_tsc) == NO_ERROR) {
+        dprintf(INFO, "PC: using LAPIC timer for event timer\n");
+        use_lapic_timer = true;
+    }
+
+    // If the PIT is not the time base and the LAPIC timer is handling events,
+    // the PIT is no longer needed; stop it.
+    if (use_lapic_timer && clock_source != CLOCK_SOURCE_PIT) {
+        pit_stop_timer();
+    }
+
+#endif // !X86_LEGACY
+
+    dprintf(INFO, "PC: using %s clock source\n", clock_source_name());
 }

-void platform_init_timer(void) {
+LK_INIT_HOOK(pc_timer, pc_init_timer, LK_INIT_LEVEL_VM + 2);

-    timer_current_time = 0;
-    ticks_per_ms = INTERNAL_FREQ/1000;
-    set_pit_frequency(1000); // ~1ms granularity
-    register_int_handler(INT_PIT, &os_timer_tick, NULL);
-    unmask_interrupt(INT_PIT);
-}
-
-static void platform_halt_timers(void) {
-    mask_interrupt(INT_PIT);
+status_t platform_set_periodic_timer(platform_timer_callback callback, void *arg, lk_time_t interval) {
+    if (use_lapic_timer) {
+        PANIC_UNIMPLEMENTED;
+    }
+    return pit_set_periodic_timer(callback, arg, interval);
 }

 status_t platform_set_oneshot_timer(platform_timer_callback callback, void *arg, lk_time_t interval) {
-    uint32_t count;
-
-    spin_lock_saved_state_t state;
-    spin_lock_irqsave(&lock, state);
-
-    t_callback = callback;
-    callback_arg = arg;
-
-
-    if (interval > MAX_TIMER_INTERVAL)
-        interval = MAX_TIMER_INTERVAL;
-    if (interval < 1) interval = 1;
-
-    count = ticks_per_ms * interval;
-
-    divisor = count & 0xffff;
-    timer_delta_time = (3685982306ULL * count) >> 10;
-
-    /* Program PIT in the software strobe configuration, to send one pulse
-     * after the count reach 0 */
-    outp(I8253_CONTROL_REG, 0x38);
-    outp(I8253_DATA_REG, divisor & 0xff); // LSB
-    outp(I8253_DATA_REG, divisor >> 8); // MSB
-
-
-    unmask_interrupt(INT_PIT);
-    spin_unlock_irqrestore(&lock, state);
-
-    return NO_ERROR;
+    if (use_lapic_timer) {
+        return lapic_set_oneshot_timer(callback, arg, interval);
+    }
+    return pit_set_oneshot_timer(callback, arg, interval);
 }

 void platform_stop_timer(void) {
-    /* Enable interrupt mode that will stop the decreasing counter of the PIT */
-    outp(I8253_CONTROL_REG, 0x30);
-    return;
+    if (use_lapic_timer) {
+        lapic_cancel_timer();
+    } else {
+        pit_cancel_timer();
+    }
 }
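For reference, the fp_32_64 pattern above replaces a per-query division with a ratio computed once at calibration time plus a single multiply-and-shift on each read; that is why current_time() stays cheap on hot paths. A standalone sketch of the same idea follows; fp_32_64_div_32_32() and u32_mul_u64_fp32_64() are taken from the calls above (their exact signatures inferred from usage), and the example_ names are illustrative.

    // Sketch of the TSC -> millisecond conversion used in timer.c above,
    // assuming lib/fixed_point's fp_32_64 type and the two helpers invoked there.
    #include <lib/fixed_point.h>
    #include <stdint.h>

    static struct fp_32_64 tsc_to_ms;

    // Precompute milliseconds-per-tick once: ratio = 1000 / tsc_hz.
    // e.g. for a 3GHz TSC this stores ~0.000000333 in 32.32 fixed point.
    static void example_tsc_time_init(uint32_t tsc_hz) {
        fp_32_64_div_32_32(&tsc_to_ms, 1000, tsc_hz);
    }

    // Hot path: one multiply-and-shift per query, no 64-bit division.
    static uint32_t example_tsc_to_ms(uint64_t tsc) {
        return u32_mul_u64_fp32_64(tsc, tsc_to_ms);
    }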
diff --git a/scripts/do-qemux86 b/scripts/do-qemux86
index 685718b4..96e3997c 100755
--- a/scripts/do-qemux86
+++ b/scripts/do-qemux86
@@ -86,7 +86,7 @@ fi
 ARGS=""
 if (( $DO_KVM )); then
-    ARGS+=" -enable-kvm -cpu host"
+    ARGS+=" -accel kvm -cpu host"
 else
     ARGS+=" -cpu $CPU"
 fi