diff --git a/arch/x86/feature.c b/arch/x86/feature.c
index e4811652..ce28c280 100644
--- a/arch/x86/feature.c
+++ b/arch/x86/feature.c
@@ -133,10 +133,13 @@ static void x86_cpu_detect(void) {
 
         // read max hypervisor leaf
         cpuid(X86_CPUID_HYP_BASE, &a, &b, &c, &d);
-        // TODO: actually check that it's an understood hypervisor before setting this.
-        // It's possible on real hardware it's just returning the last valid regular cpuid.
-        if (a >= X86_CPUID_HYP_BASE) {
+
+        // Check that it's an understood hypervisor leaf
+        if ((b == 0x4b4d564b && c == 0x564b4d56 && d == 0x4d) || /* KVMKVMKVM */
+            (b == 0x54474354 && c == 0x43544743 && d == 0x47435447)) { /* TCGTCGTCGTCG */
             max_cpuid_leaf_hyp = MIN(a, __X86_MAX_SUPPORTED_CPUID_HYP);
+        } else {
+            max_cpuid_leaf_hyp = 0;
         }
     } else {
         __x86_cpu_vendor = X86_CPU_VENDOR_INTEL; // intrinsically Intel without cpuid
@@ -191,12 +194,12 @@ void x86_feature_early_init(void) {
 
     // cache a copy of the cpuid bits
     if (has_cpuid) {
-        for (uint32_t i = 1; i <= max_cpuid_leaf; i++) {
+        for (uint32_t i = 0; i <= max_cpuid_leaf; i++) {
             cpuid_c(i, 0, &saved_cpuids[i].a, &saved_cpuids[i].b, &saved_cpuids[i].c, &saved_cpuids[i].d);
         }
 
         if (max_cpuid_leaf_ext > 0) {
-            for (uint32_t i = X86_CPUID_EXT_BASE + 1; i - 1 < max_cpuid_leaf_ext; i++) {
+            for (uint32_t i = X86_CPUID_EXT_BASE; i <= max_cpuid_leaf_ext; i++) {
                 uint32_t index = i - X86_CPUID_EXT_BASE;
                 cpuid_c(i, 0, &saved_cpuids_ext[index].a, &saved_cpuids_ext[index].b, &saved_cpuids_ext[index].c,
                         &saved_cpuids_ext[index].d);
@@ -204,7 +207,7 @@ void x86_feature_early_init(void) {
         }
 
         if (max_cpuid_leaf_hyp > 0) {
-            for (uint32_t i = X86_CPUID_HYP_BASE + 1; i - 1 < max_cpuid_leaf_hyp; i++) {
+            for (uint32_t i = X86_CPUID_HYP_BASE; i <= max_cpuid_leaf_hyp; i++) {
                 uint32_t index = i - X86_CPUID_HYP_BASE;
                 cpuid_c(i, 0, &saved_cpuids_hyp[index].a, &saved_cpuids_hyp[index].b, &saved_cpuids_hyp[index].c,
                         &saved_cpuids_hyp[index].d);
@@ -213,6 +216,24 @@ void x86_feature_early_init(void) {
     }
 }
 
+// Print every cached cpuid leaf (basic, hypervisor, then extended) to the console.
+static void x86_feature_dump_cpuid(void) {
+    for (uint32_t i = X86_CPUID_BASE; i <= max_cpuid_leaf; i++) {
+        printf("X86: cpuid leaf %#x: %08x %08x %08x %08x\n", i,
+               saved_cpuids[i - X86_CPUID_BASE].a, saved_cpuids[i - X86_CPUID_BASE].b, saved_cpuids[i - X86_CPUID_BASE].c, saved_cpuids[i - X86_CPUID_BASE].d);
+    }
+    for (uint32_t i = X86_CPUID_HYP_BASE; i <= max_cpuid_leaf_hyp; i++) {
+        uint32_t index = i - X86_CPUID_HYP_BASE;
+        printf("X86: cpuid leaf %#x: %08x %08x %08x %08x\n", i,
+               saved_cpuids_hyp[index].a, saved_cpuids_hyp[index].b, saved_cpuids_hyp[index].c, saved_cpuids_hyp[index].d);
+    }
+    for (uint32_t i = X86_CPUID_EXT_BASE; i <= max_cpuid_leaf_ext; i++) {
+        uint32_t index = i - X86_CPUID_EXT_BASE;
+        printf("X86: cpuid leaf %#x: %08x %08x %08x %08x\n", i,
+               saved_cpuids_ext[index].a, saved_cpuids_ext[index].b, saved_cpuids_ext[index].c, saved_cpuids_ext[index].d);
+    }
+}
+
 /* later feature init hook, called after the kernel is able to schedule */
 void x86_feature_init(void) {
     dprintf(SPEW, "X86: detected cpu level %d has_cpuid %d\n", x86_get_cpu_level(), has_cpuid);
@@ -243,6 +264,10 @@ void x86_feature_init(void) {
     printf("X86: processor model info type %#x family %#x model %#x stepping %#x\n",
            model->processor_type, model->family, model->model, model->stepping);
     printf("\tdisplay_family %#x display_model %#x\n", model->display_family, model->display_model);
+
+    if (has_cpuid && LK_DEBUGLEVEL > 1) {
+        x86_feature_dump_cpuid();
+    }
 }
 
 bool x86_get_cpuid_subleaf(enum x86_cpuid_leaf_num num, uint32_t subleaf, struct x86_cpuid_leaf* leaf) {
diff --git a/arch/x86/include/arch/x86/feature.h b/arch/x86/include/arch/x86/feature.h
index f32d7bb8..a743a634 100644
--- a/arch/x86/include/arch/x86/feature.h
+++ b/arch/x86/include/arch/x86/feature.h
@@ -328,12 +328,12 @@ static inline bool x86_feature_test(struct x86_cpuid_bit bit) {
 #define X86_FEATURE_CORE_CAPABILITIES X86_CPUID_BIT(0x7, 3, 30)
 #define X86_FEATURE_SSBD X86_CPUID_BIT(0x7, 3, 31)
 
-#define X86_FEATURE_KVM_CLOCKSOURCE X86_CPUID_BIT(0x40000000, 0, 0)
-#define X86_FEATURE_KVM_NOP_IO_DELAY X86_CPUID_BIT(0x40000000, 0, 1)
-#define X86_FEATURE_KVM_MMU_OP X86_CPUID_BIT(0x40000000, 0, 2)
-#define X86_FEATURE_KVM_CLOCKSOURCE2 X86_CPUID_BIT(0x40000000, 0, 3)
-#define X86_FEATURE_KVM_ASYNC_PF X86_CPUID_BIT(0x40000000, 0, 4)
-#define X86_FEATURE_KVM_STEAL_TIME X86_CPUID_BIT(0x40000000, 0, 5)
+#define X86_FEATURE_KVM_CLOCKSOURCE X86_CPUID_BIT(0x40000001, 0, 0)
+#define X86_FEATURE_KVM_NOP_IO_DELAY X86_CPUID_BIT(0x40000001, 0, 1)
+#define X86_FEATURE_KVM_MMU_OP X86_CPUID_BIT(0x40000001, 0, 2)
+#define X86_FEATURE_KVM_CLOCKSOURCE2 X86_CPUID_BIT(0x40000001, 0, 3)
+#define X86_FEATURE_KVM_ASYNC_PF X86_CPUID_BIT(0x40000001, 0, 4)
+#define X86_FEATURE_KVM_STEAL_TIME X86_CPUID_BIT(0x40000001, 0, 5)
 #define X86_FEATURE_KVM_PV_EOI X86_CPUID_BIT(0x40000001, 0, 6)
 #define X86_FEATURE_KVM_PV_UNHALT X86_CPUID_BIT(0x40000001, 0, 7)
 #define X86_FEATURE_KVM_PV_TLB_FLUSH X86_CPUID_BIT(0x40000001, 0, 9)
@@ -355,6 +355,7 @@ static inline bool x86_feature_test(struct x86_cpuid_bit bit) {
 #define X86_FEATURE_HUGE_PAGE X86_CPUID_BIT(0x80000001, 3, 26)
 #define X86_FEATURE_RDTSCP X86_CPUID_BIT(0x80000001, 3, 27)
 #define X86_FEATURE_INVAR_TSC X86_CPUID_BIT(0x80000007, 3, 8)
+#define X86_FEATURE_CONSTANT_TSC X86_CPUID_BIT(0x80000007, 3, 8)
 
 // accessor to read some fields out of a register
 static inline uint32_t x86_get_vaddr_width(void) {
diff --git a/lib/acpi_lite/acpi_lite.cpp b/lib/acpi_lite/acpi_lite.cpp
index 7b5b09f6..1059cf0d 100644
--- a/lib/acpi_lite/acpi_lite.cpp
+++ b/lib/acpi_lite/acpi_lite.cpp
@@ -17,7 +17,7 @@
 #include
 
 // uses the vm to map in ACPI tables as they are found
-static_assert(WITH_KERNEL_VM, "");
+static_assert(WITH_KERNEL_VM);
 
 #define LOCAL_TRACE 0
 
diff --git a/platform/pc/lapic.c b/platform/pc/lapic.c
index cc7c2448..bb38ed8b 100644
--- a/platform/pc/lapic.c
+++ b/platform/pc/lapic.c
@@ -141,14 +141,6 @@ enum handler_return lapic_timer_handler(void *arg) {
 }
 
 void lapic_init(void) {
-    // discover the presence of the local apic and map it
-    LTRACE_ENTRY;
-
-    // check feature bit 9 in edx of leaf 1 for presence of lapic
-    lapic_present = x86_feature_test(X86_FEATURE_APIC);
-}
-
-void lapic_init_postvm(uint level) {
     if (!lapic_present)
         return;
 
@@ -207,8 +199,6 @@ void lapic_init_postvm(uint level) {
     lapic_set_oneshot_timer(1000000);
 }
 
-LK_INIT_HOOK(lapic, lapic_init_postvm, LK_INIT_LEVEL_VM);
-
 void lapic_eoi(unsigned int vector) {
     LTRACEF("vector %#x\n", vector);
     if (!lapic_present) {
diff --git a/platform/pc/mp.c b/platform/pc/mp.c
index 0900f4ac..07ddf487 100644
--- a/platform/pc/mp.c
+++ b/platform/pc/mp.c
@@ -91,6 +91,7 @@ static void local_apic_callback(const void *_entry, size_t entry_len, void *cook
     const struct acpi_madt_local_apic_entry *entry = _entry;
     struct detected_cpus *cpus = cookie;
 
+    // TODO: read the current APIC id and skip it, instead of assuming 0 is the boot cpu
     if (entry->apic_id == 0) {
         // skip the boot cpu
         return;
diff --git a/platform/pc/pit.c b/platform/pc/pit.c
index d9930ff9..6b6faa2f 100644
--- a/platform/pc/pit.c
+++ b/platform/pc/pit.c
@@ -10,6 +10,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -23,6 +24,9 @@
 
 #define LOCAL_TRACE 0
 
+
+// TODO: switch this logic to lib/fixed_point math
+
 static platform_timer_callback t_callback;
 static void *callback_arg;
 static spin_lock_t lock = SPIN_LOCK_INITIAL_VALUE;
@@ -42,6 +46,7 @@ static uint64_t timer_delta_time;
 
 #define INTERNAL_FREQ 1193182ULL
 #define INTERNAL_FREQ_3X 3579546ULL
+#define INTERNAL_FREQ_TICKS_PER_MS (INTERNAL_FREQ / 1000u)
 
 /* Maximum amount of time that can be program on the timer to schedule the next
  * interrupt, in milliseconds */
@@ -128,8 +133,8 @@ static void set_pit_frequency(uint32_t frequency) {
      */
     timer_delta_time = (3685982306ULL * count) >> 10;
 
-    LTRACEF("dt 0x%016" PRIx64 "\n", timer_delta_time);
-    LTRACEF("divisor 0x%04" PRIx16 "\n", divisor);
+    LTRACEF("dt %#x.%08x\n", (uint32_t)(timer_delta_time >> 32), (uint32_t)(timer_delta_time & 0xffffffff));
+    LTRACEF("divisor %" PRIu16 "\n", divisor);
 
     /*
      * setup the Programmable Interval Timer
@@ -191,4 +196,59 @@ void pit_stop_timer(void) {
     mask_interrupt(INT_PIT);
 
     spin_unlock_irqrestore(&lock, state);
+}
+
+// Calibrate the TSC against the PIT and return the measured TSC frequency in Hz.
+// Runs several one-shot countdowns of increasing length on PIT channel 0 and
+// times each with rdtsc. Must be called with interrupts disabled.
+uint64_t pit_calibrate_tsc(void) {
+    DEBUG_ASSERT(arch_ints_disabled());
+
+    uint64_t tsc_ticks[5] = {0};
+    uint32_t countdown_ms[5] = {0};
+
+    uint64_t tsc_freq = 0;
+    for (uint i = 0; i < countof(tsc_ticks); i++) {
+        // calibrate the tsc frequency using the PIT
+        countdown_ms[i] = 2 * (i + 1);
+
+        uint16_t pic_ticks = INTERNAL_FREQ_TICKS_PER_MS * countdown_ms[i];
+        outp(I8253_CONTROL_REG, 0x30);
+        outp(I8253_DATA_REG, pic_ticks & 0xff); // LSB
+        outp(I8253_DATA_REG, pic_ticks >> 8); // MSB
+
+        // read the tsc
+        uint64_t tsc_start = __builtin_ia32_rdtsc();
+
+        // wait for countdown_ms
+        uint8_t status = 0;
+        do {
+            // Send a read-back command that latches the status of ch0
+            outp(I8253_CONTROL_REG, 0xe2);
+            status = inp(I8253_DATA_REG);
+            // Wait for bit 7 (output) to go high and for bit 6 (null count) to go low
+        } while ((status & 0xc0) != 0x80);
+
+        uint64_t tsc_end = __builtin_ia32_rdtsc();
+        tsc_ticks[i] = tsc_end - tsc_start;
+    }
+
+    // find the best run. Each window has a different length, so compare the
+    // tick rate (ticks per ms, via cross-multiplication) rather than raw tick
+    // counts, which would always select the shortest window.
+    uint best_index = 0;
+    for (uint i = 1; i < countof(tsc_ticks); i++) {
+        if (tsc_ticks[i] * countdown_ms[best_index] < tsc_ticks[best_index] * countdown_ms[i]) {
+            best_index = i;
+        }
+    }
+
+    // calculate the tsc frequency
+    tsc_freq = (tsc_ticks[best_index] * 1000) / countdown_ms[best_index];
+    dprintf(INFO, "PIT: calibrated TSC frequency: %" PRIu64 "Hz\n", tsc_freq);
+
+    // put the PIT back to 1ms countdown
+    set_pit_frequency(1000);
+
+    return tsc_freq;
 }
\ No newline at end of file
diff --git a/platform/pc/platform_p.h b/platform/pc/platform_p.h
index 5e17c41e..59df1e0f 100644
--- a/platform/pc/platform_p.h
+++ b/platform/pc/platform_p.h
@@ -21,6 +21,7 @@ void pic_init(void);
 void pic_enable(unsigned int vector, bool enable);
 void pic_eoi(unsigned int vector);
 void pic_mask_interrupts(void);
+uint64_t pit_calibrate_tsc(void);
 
 // local apic
 void lapic_init(void);
diff --git a/platform/pc/rules.mk b/platform/pc/rules.mk
index 1f5d5e23..dbee0748 100644
--- a/platform/pc/rules.mk
+++ b/platform/pc/rules.mk
@@ -6,10 +6,10 @@ MODULE := $(LOCAL_DIR)
 # legacy implies older hardware, pre pentium, pre pci
 CPU ?= modern
 
-MODULE_DEPS += \
-    lib/acpi_lite \
-    lib/bio \
-    lib/cbuf
+MODULE_DEPS += lib/acpi_lite
+MODULE_DEPS += lib/bio
+MODULE_DEPS += lib/cbuf
+MODULE_DEPS += lib/fixed_point
 
 ifneq ($(CPU),legacy)
 MODULE_DEPS += dev/bus/pci/drivers
diff --git a/platform/pc/timer.c b/platform/pc/timer.c
index c92a2d99..03e77cdc 100644
--- a/platform/pc/timer.c
+++ b/platform/pc/timer.c
@@ -12,13 +12,17 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
 #include "platform_p.h"
 #include
+#include
+#include
+#include
 
-#define LOCAL_TRACE 0
+#define LOCAL_TRACE 1
 
 // Deals with all of the various clock sources and event timers on the PC platform.
 
@@ -29,16 +33,19 @@ static enum clock_source {
     CLOCK_SOURCE_HPET,
 } clock_source = CLOCK_SOURCE_INITIAL;
 
+struct fp_32_64 tsc_to_timebase;
+struct fp_32_64 tsc_to_timebase_hires;
+
 static const char *clock_source_name(void) {
     switch (clock_source) {
         case CLOCK_SOURCE_INITIAL:
             return "initial";
         case CLOCK_SOURCE_PIT:
-            return "pit";
+            return "PIT";
         case CLOCK_SOURCE_TSC:
-            return "tsc";
+            return "TSC";
         case CLOCK_SOURCE_HPET:
-            return "hpet";
+            return "HPET";
         default:
             return "unknown";
     }
@@ -48,6 +55,8 @@ lk_time_t current_time(void) {
     switch (clock_source) {
         case CLOCK_SOURCE_PIT:
             return pit_current_time();
+        case CLOCK_SOURCE_TSC:
+            return u32_mul_u64_fp32_64(__builtin_ia32_rdtsc(), tsc_to_timebase);
         default:
             return 0;
     }
@@ -57,16 +66,193 @@ lk_bigtime_t current_time_hires(void) {
     switch (clock_source) {
         case CLOCK_SOURCE_PIT:
             return pit_current_time_hires();
+        case CLOCK_SOURCE_TSC:
+            return u64_mul_u64_fp32_64(__builtin_ia32_rdtsc(), tsc_to_timebase_hires);
         default:
             return 0;
     }
 }
 
-void pc_init_timer(unsigned int level) {
-    LTRACE_ENTRY;
+// From https://www.kernel.org/doc/html/v6.14/virt/kvm/x86/msr.html
+struct pvclock_wall_clock {
+    uint32_t version;
+    uint32_t sec;
+    uint32_t nsec;
+} __PACKED;
+static_assert(sizeof(struct pvclock_wall_clock) == 12, "pvclock_wall_clock size mismatch");
 
+struct pvclock_vcpu_time_info {
+    uint32_t version;
+    uint32_t pad0;
+    uint64_t tsc_timestamp;
+    uint64_t system_time;
+    uint32_t tsc_to_system_mul;
+    int8_t tsc_shift;
+    uint8_t flags;
+    uint8_t pad[2];
+} __PACKED;
+static_assert(sizeof(struct pvclock_vcpu_time_info) == 32, "pvclock_vcpu_time_info size mismatch");
+
+static volatile struct pvclock_wall_clock *wall_clock;
+static volatile struct pvclock_vcpu_time_info *vcpu_time_info;
+
+// Detect the KVM paravirtualized clock and, if present, allocate a page and point
+// the wall clock and system time MSRs at it.
+// Returns ERR_NOT_SUPPORTED if neither KVM clocksource feature bit is set.
+status_t pvclock_init(void) {
+    uint32_t clocksource_msr_base = 0;
+    if (x86_feature_test(X86_FEATURE_KVM_CLOCKSOURCE)) {
+        clocksource_msr_base = 0x11;
+    }
+    if (x86_feature_test(X86_FEATURE_KVM_CLOCKSOURCE2)) {
+        clocksource_msr_base = 0x4b564d00;
+    }
+    if (!clocksource_msr_base) {
+        return ERR_NOT_SUPPORTED;
+    }
+    dprintf(INFO, "pv_clock: clocksource detected, msr base %#x\n", clocksource_msr_base);
+
+    // map a page of memory and point the KVM clocksource msrs at it
+    void *clocksource_page;
+    status_t err = vmm_alloc(vmm_get_kernel_aspace(), "pvclock", PAGE_SIZE, &clocksource_page, 0, 0, 0);
+    if (err != NO_ERROR) {
+        printf("pv_clock: failed to allocate page for clocksource msrs\n");
+        return err;
+    }
+
+    paddr_t paddr;
+    err = arch_mmu_query(&vmm_get_kernel_aspace()->arch_aspace, (vaddr_t)clocksource_page, &paddr, NULL);
+    if (err != NO_ERROR) {
+        printf("pv_clock: failed to query physical address of clocksource page\n");
+        vmm_free_region(vmm_get_kernel_aspace(), (vaddr_t)clocksource_page);
+        return err;
+    }
+    LTRACEF("clocksource page %p, paddr %#" PRIxPTR "\n", clocksource_page, paddr);
+
+    write_msr(clocksource_msr_base, paddr);
+    // bit 0 of the system time MSR value is the enable bit
+    write_msr(clocksource_msr_base + 1, paddr + sizeof(struct pvclock_wall_clock) + 1);
+
+    wall_clock = (struct pvclock_wall_clock *)clocksource_page;
+    vcpu_time_info = (struct pvclock_vcpu_time_info *)(wall_clock + 1);
+
+    dprintf(SPEW, "pv_clock: wall clock version %u, sec %u, nsec %u\n",
+            wall_clock->version, wall_clock->sec, wall_clock->nsec);
+
+    dprintf(SPEW, "pv_clock: vcpu time info version %u, tsc timestamp %" PRIu64 ", system time %" PRIu64 "\n",
+            vcpu_time_info->version, vcpu_time_info->tsc_timestamp, vcpu_time_info->system_time);
+    dprintf(SPEW, "pv_clock: tsc to system mul %u, tsc shift %d, flags %u\n",
+            vcpu_time_info->tsc_to_system_mul, vcpu_time_info->tsc_shift, vcpu_time_info->flags);
+
+    return NO_ERROR;
+}
+
+// Compute the TSC frequency in Hz from the hypervisor-provided pvclock scaling
+// factors. Returns 0 if the pvclock is not initialized or reports no multiplier.
+uint64_t pvclock_get_tsc_freq(void) {
+    uint32_t tsc_mul = 0;
+    int8_t tsc_shift = 0;
+
+    if (!vcpu_time_info) {
+        return 0;
+    }
+
+    // An odd version means the hypervisor is mid-update; retry until the version
+    // is even and unchanged across the reads of the scaling fields.
+    for (;;) {
+        uint32_t pre_version = vcpu_time_info->version;
+        if (pre_version % 2 == 0) {
+            tsc_mul = vcpu_time_info->tsc_to_system_mul;
+            tsc_shift = vcpu_time_info->tsc_shift;
+            if (vcpu_time_info->version == pre_version) {
+                break;
+            }
+        }
+        asm("pause");
+    }
+
+    if (tsc_mul == 0) {
+        // structure not populated; avoid dividing by zero and let the caller fall back
+        return 0;
+    }
+
+    uint64_t tsc_khz = 1000000ULL << 32;
+    tsc_khz = tsc_khz / tsc_mul;
+    if (tsc_shift > 0) {
+        tsc_khz >>= tsc_shift;
+    } else {
+        tsc_khz <<= -tsc_shift;
+    }
+    return tsc_khz * 1000;
+}
+
+// Returns true if bit 0 of the pvclock flags is set or the KVM clocksource-stable feature bit is present.
+bool pv_clock_is_stable(void) {
+    if (!vcpu_time_info) {
+        return false;
+    }
+    bool is_stable = (vcpu_time_info->flags & (1<<0)) ||
+                     x86_feature_test(X86_FEATURE_KVM_CLOCKSOURCE_STABLE);
+    return is_stable;
+}
+
+void pc_init_timer(unsigned int level) {
+    // Initialize the PIT, it's always present in PC hardware
     pit_init();
     clock_source = CLOCK_SOURCE_PIT;
+
+    lapic_init();
+
+#if !X86_LEGACY
+    // XXX update note about what invariant TSC means
+    bool invariant_tsc = x86_feature_test(X86_FEATURE_INVAR_TSC);
+    LTRACEF("invariant TSC %d\n", invariant_tsc);
+
+    // Test for hypervisor PV clock, which also effectively says if TSC is invariant across
+    // all cpus.
+    if (pvclock_init() == NO_ERROR) {
+        bool pv_clock_stable = pv_clock_is_stable();
+
+        invariant_tsc |= pv_clock_stable;
+
+        printf("pv_clock: Clocksource is %sstable\n", (pv_clock_stable ? "" : "not "));
+    }
+
+    // XXX test for HPET and use it over PIT if present
+
+    if (invariant_tsc) {
+        // We're going to try to use the TSC as a time base, obtain the TSC frequency.
+        uint64_t tsc_hz = 0;
+
+        tsc_hz = pvclock_get_tsc_freq();
+        if (tsc_hz == 0) {
+            // TODO: some x86 cores describe the TSC and lapic clocks in cpuid
+
+            // Calibrate the TSC against the PIT, which should always be present
+            tsc_hz = pit_calibrate_tsc();
+            if (tsc_hz == 0) {
+                dprintf(CRITICAL, "PC: failed to calibrate TSC frequency\n");
+                goto out;
+            }
+        }
+
+        dprintf(INFO, "PC: TSC frequency %" PRIu64 "Hz\n", tsc_hz);
+
+        // Compute the ratio of TSC to timebase
+        fp_32_64_div_32_32(&tsc_to_timebase, 1000, tsc_hz);
+        dprintf(INFO, "PC: TSC to timebase ratio %u.%08u...\n",
+                tsc_to_timebase.l0, tsc_to_timebase.l32);
+
+        fp_32_64_div_32_32(&tsc_to_timebase_hires, 1000*1000, tsc_hz);
+        dprintf(INFO, "PC: TSC to hires timebase ratio %u.%08u...\n",
+                tsc_to_timebase_hires.l0, tsc_to_timebase_hires.l32);
+
+        clock_source = CLOCK_SOURCE_TSC;
+    }
+out:
+#endif // !X86_LEGACY
+
+    dprintf(INFO, "PC: using %s clock source\n", clock_source_name());
 }
 
 LK_INIT_HOOK(pc_timer, pc_init_timer, LK_INIT_LEVEL_VM);
diff --git a/scripts/do-qemux86 b/scripts/do-qemux86
index b2db507a..96e3997c 100755
--- a/scripts/do-qemux86
+++ b/scripts/do-qemux86
@@ -70,7 +70,7 @@ elif (( $DO_LEGACY )); then
 else
     QEMU="qemu-system-i386"
     PROJECT="pc-x86-test"
-    CPU=pentium3
+    CPU=max
     MACHINE=pc
 fi
 
@@ -86,7 +86,7 @@ fi
 
 ARGS=""
 if (( $DO_KVM )); then
-    ARGS+=" -enable-kvm -cpu host"
+    ARGS+=" -accel kvm -cpu host"
 else
     ARGS+=" -cpu $CPU"
 fi