[platform][pc] add support for TSC based clock

-Detect if under KVM hypervisor and read tick rate or
-calibrate tick against PIT
This commit is contained in:
Travis Geiselbrecht
2025-03-30 21:59:39 -07:00
parent 09412c194f
commit 2987f73d08
10 changed files with 276 additions and 37 deletions

View File

@@ -133,10 +133,13 @@ static void x86_cpu_detect(void) {
// read max hypervisor leaf
cpuid(X86_CPUID_HYP_BASE, &a, &b, &c, &d);
// TODO: actually check that it's an understood hypervisor before setting this.
// It's possible on real hardware it's just returning the last valid regular cpuid.
if (a >= X86_CPUID_HYP_BASE) {
// Check that it's an understood hypervisor leaf
if ((b == 0x4b4d564b && c == 0x564b4d56 && d == 0x4d) || /* KVMKVMKVM */
(b == 0x54474354 && c == 0x43544743 && d == 0x47435447)) { /* TCGTCGTCGTCG */
max_cpuid_leaf_hyp = MIN(a, __X86_MAX_SUPPORTED_CPUID_HYP);
} else {
max_cpuid_leaf_hyp = 0;
}
} else {
__x86_cpu_vendor = X86_CPU_VENDOR_INTEL; // intrinsically Intel without cpuid
@@ -191,12 +194,12 @@ void x86_feature_early_init(void) {
// cache a copy of the cpuid bits
if (has_cpuid) {
for (uint32_t i = 1; i <= max_cpuid_leaf; i++) {
for (uint32_t i = 0; i <= max_cpuid_leaf; i++) {
cpuid_c(i, 0, &saved_cpuids[i].a, &saved_cpuids[i].b, &saved_cpuids[i].c, &saved_cpuids[i].d);
}
if (max_cpuid_leaf_ext > 0) {
for (uint32_t i = X86_CPUID_EXT_BASE + 1; i - 1 < max_cpuid_leaf_ext; i++) {
for (uint32_t i = X86_CPUID_EXT_BASE; i <= max_cpuid_leaf_ext; i++) {
uint32_t index = i - X86_CPUID_EXT_BASE;
cpuid_c(i, 0, &saved_cpuids_ext[index].a, &saved_cpuids_ext[index].b, &saved_cpuids_ext[index].c,
&saved_cpuids_ext[index].d);
@@ -204,7 +207,7 @@ void x86_feature_early_init(void) {
}
if (max_cpuid_leaf_hyp > 0) {
for (uint32_t i = X86_CPUID_HYP_BASE + 1; i - 1 < max_cpuid_leaf_hyp; i++) {
for (uint32_t i = X86_CPUID_HYP_BASE; i <= max_cpuid_leaf_hyp; i++) {
uint32_t index = i - X86_CPUID_HYP_BASE;
cpuid_c(i, 0, &saved_cpuids_hyp[index].a, &saved_cpuids_hyp[index].b, &saved_cpuids_hyp[index].c,
&saved_cpuids_hyp[index].d);
@@ -213,6 +216,23 @@ void x86_feature_early_init(void) {
}
}
static void x86_feature_dump_cpuid(void) {
for (uint32_t i = X86_CPUID_BASE; i <= max_cpuid_leaf; i++) {
printf("X86: cpuid leaf %#x: %08x %08x %08x %08x\n", i,
saved_cpuids[i - X86_CPUID_BASE].a, saved_cpuids[i - X86_CPUID_BASE].b, saved_cpuids[i - X86_CPUID_BASE].c, saved_cpuids[i - X86_CPUID_BASE].d);
}
for (uint32_t i = X86_CPUID_HYP_BASE; i <= max_cpuid_leaf_hyp; i++) {
uint32_t index = i - X86_CPUID_HYP_BASE;
printf("X86: cpuid leaf %#x: %08x %08x %08x %08x\n", i,
saved_cpuids_hyp[index].a, saved_cpuids_hyp[index].b, saved_cpuids_hyp[index].c, saved_cpuids_hyp[index].d);
}
for (uint32_t i = X86_CPUID_EXT_BASE; i <= max_cpuid_leaf_ext; i++) {
uint32_t index = i - X86_CPUID_EXT_BASE;
printf("X86: cpuid leaf %#x: %08x %08x %08x %08x\n", i,
saved_cpuids[index].a, saved_cpuids[index].b, saved_cpuids[index].c, saved_cpuids[index].d);
}
}
/* later feature init hook, called after the kernel is able to schedule */
void x86_feature_init(void) {
dprintf(SPEW, "X86: detected cpu level %d has_cpuid %d\n", x86_get_cpu_level(), has_cpuid);
@@ -243,6 +263,10 @@ void x86_feature_init(void) {
printf("X86: processor model info type %#x family %#x model %#x stepping %#x\n",
model->processor_type, model->family, model->model, model->stepping);
printf("\tdisplay_family %#x display_model %#x\n", model->display_family, model->display_model);
if (has_cpuid && LK_DEBUGLEVEL > 1) {
x86_feature_dump_cpuid();
}
}
bool x86_get_cpuid_subleaf(enum x86_cpuid_leaf_num num, uint32_t subleaf, struct x86_cpuid_leaf* leaf) {

View File

@@ -328,12 +328,12 @@ static inline bool x86_feature_test(struct x86_cpuid_bit bit) {
#define X86_FEATURE_CORE_CAPABILITIES X86_CPUID_BIT(0x7, 3, 30)
#define X86_FEATURE_SSBD X86_CPUID_BIT(0x7, 3, 31)
#define X86_FEATURE_KVM_CLOCKSOURCE X86_CPUID_BIT(0x40000000, 0, 0)
#define X86_FEATURE_KVM_NOP_IO_DELAY X86_CPUID_BIT(0x40000000, 0, 1)
#define X86_FEATURE_KVM_MMU_OP X86_CPUID_BIT(0x40000000, 0, 2)
#define X86_FEATURE_KVM_CLOCKSOURCE2 X86_CPUID_BIT(0x40000000, 0, 3)
#define X86_FEATURE_KVM_ASYNC_PF X86_CPUID_BIT(0x40000000, 0, 4)
#define X86_FEATURE_KVM_STEAL_TIME X86_CPUID_BIT(0x40000000, 0, 5)
#define X86_FEATURE_KVM_CLOCKSOURCE X86_CPUID_BIT(0x40000001, 0, 0)
#define X86_FEATURE_KVM_NOP_IO_DELAY X86_CPUID_BIT(0x40000001, 0, 1)
#define X86_FEATURE_KVM_MMU_OP X86_CPUID_BIT(0x40000001, 0, 2)
#define X86_FEATURE_KVM_CLOCKSOURCE2 X86_CPUID_BIT(0x40000001, 0, 3)
#define X86_FEATURE_KVM_ASYNC_PF X86_CPUID_BIT(0x40000001, 0, 4)
#define X86_FEATURE_KVM_STEAL_TIME X86_CPUID_BIT(0x40000001, 0, 5)
#define X86_FEATURE_KVM_PV_EOI X86_CPUID_BIT(0x40000001, 0, 6)
#define X86_FEATURE_KVM_PV_UNHALT X86_CPUID_BIT(0x40000001, 0, 7)
#define X86_FEATURE_KVM_PV_TLB_FLUSH X86_CPUID_BIT(0x40000001, 0, 9)
@@ -355,6 +355,7 @@ static inline bool x86_feature_test(struct x86_cpuid_bit bit) {
#define X86_FEATURE_HUGE_PAGE X86_CPUID_BIT(0x80000001, 3, 26)
#define X86_FEATURE_RDTSCP X86_CPUID_BIT(0x80000001, 3, 27)
#define X86_FEATURE_INVAR_TSC X86_CPUID_BIT(0x80000007, 3, 8)
#define X86_FEATURE_CONSTANT_TSC X86_CPUID_BIT(0x80000007, 3, 8)
// accessor to read some fields out of a register
static inline uint32_t x86_get_vaddr_width(void) {

View File

@@ -17,7 +17,7 @@
#include <kernel/vm.h>
// uses the vm to map in ACPI tables as they are found
static_assert(WITH_KERNEL_VM, "");
static_assert(WITH_KERNEL_VM);
#define LOCAL_TRACE 0

View File

@@ -141,14 +141,6 @@ enum handler_return lapic_timer_handler(void *arg) {
}
void lapic_init(void) {
// discover the presence of the local apic and map it
LTRACE_ENTRY;
// check feature bit 9 in edx of leaf 1 for presence of lapic
lapic_present = x86_feature_test(X86_FEATURE_APIC);
}
void lapic_init_postvm(uint level) {
if (!lapic_present)
return;
@@ -207,8 +199,6 @@ void lapic_init_postvm(uint level) {
lapic_set_oneshot_timer(1000000);
}
LK_INIT_HOOK(lapic, lapic_init_postvm, LK_INIT_LEVEL_VM);
void lapic_eoi(unsigned int vector) {
LTRACEF("vector %#x\n", vector);
if (!lapic_present) {

View File

@@ -91,6 +91,7 @@ static void local_apic_callback(const void *_entry, size_t entry_len, void *cook
const struct acpi_madt_local_apic_entry *entry = _entry;
struct detected_cpus *cpus = cookie;
// TODO: read the current APIC id and skip it, instead of assuming 0 is the boot cpu
if (entry->apic_id == 0) {
// skip the boot cpu
return;

View File

@@ -10,6 +10,7 @@
#include <lk/reg.h>
#include <lk/debug.h>
#include <lk/trace.h>
#include <assert.h>
#include <kernel/thread.h>
#include <kernel/spinlock.h>
#include <platform.h>
@@ -23,6 +24,9 @@
#define LOCAL_TRACE 0
// TODO: switch this logic to lib/fixed_point math
static platform_timer_callback t_callback;
static void *callback_arg;
static spin_lock_t lock = SPIN_LOCK_INITIAL_VALUE;
@@ -42,6 +46,7 @@ static uint64_t timer_delta_time;
#define INTERNAL_FREQ 1193182ULL
#define INTERNAL_FREQ_3X 3579546ULL
#define INTERNAL_FREQ_TICKS_PER_MS (INTERNAL_FREQ / 1000u)
/* Maximum amount of time that can be programmed on the timer to schedule the next
* interrupt, in milliseconds */
@@ -128,8 +133,8 @@ static void set_pit_frequency(uint32_t frequency) {
*/
timer_delta_time = (3685982306ULL * count) >> 10;
LTRACEF("dt 0x%016" PRIx64 "\n", timer_delta_time);
LTRACEF("divisor 0x%04" PRIx16 "\n", divisor);
LTRACEF("dt %#x.%08x\n", (uint32_t)(timer_delta_time >> 32), (uint32_t)(timer_delta_time & 0xffffffff));
LTRACEF("divisor %" PRIu16 "\n", divisor);
/*
* setup the Programmable Interval Timer
@@ -191,4 +196,54 @@ void pit_stop_timer(void) {
mask_interrupt(INT_PIT);
spin_unlock_irqrestore(&lock, state);
}
uint64_t pit_calibrate_tsc(void) {
DEBUG_ASSERT(arch_ints_disabled());
uint64_t tsc_ticks[5] = {0};
uint32_t countdown_ms[5] = {0};
uint64_t tsc_freq = 0;
for (uint i = 0; i < countof(tsc_ticks); i++) {
// calibrate the tsc frequency using the PIT
countdown_ms[i] = 2 * (i + 1);
uint16_t pic_ticks = INTERNAL_FREQ_TICKS_PER_MS * countdown_ms[i];
outp(I8253_CONTROL_REG, 0x30);
outp(I8253_DATA_REG, pic_ticks & 0xff); // LSB
outp(I8253_DATA_REG, pic_ticks >> 8); // MSB
// read the tsc
uint64_t tsc_start = __builtin_ia32_rdtsc();
// wait for countdown_ms
uint8_t status = 0;
do {
// Send a read-back command that latches the status of ch0
outp(I8253_CONTROL_REG, 0xe2);
status = inp(I8253_DATA_REG);
// Wait for bit 7 (output) to go high and for bit 6 (null count) to go low
} while ((status & 0xc0) != 0x80);
uint64_t tsc_end = __builtin_ia32_rdtsc();
tsc_ticks[i] = tsc_end - tsc_start;
}
// find the best time
uint best_index = 0;
for (uint i = 1; i < countof(tsc_ticks); i++) {
if (tsc_ticks[i] < tsc_ticks[best_index]) {
best_index = i;
}
}
// calculate the tsc frequency
tsc_freq = (tsc_ticks[best_index] * 1000) / countdown_ms[best_index];
dprintf(INFO, "PIT: calibrated TSC frequency: %" PRIu64 "Hz\n", tsc_freq);
// put the PIT back to 1ms countdown
set_pit_frequency(1000);
return tsc_freq;
}

View File

@@ -21,6 +21,7 @@ void pic_init(void);
void pic_enable(unsigned int vector, bool enable);
void pic_eoi(unsigned int vector);
void pic_mask_interrupts(void);
uint64_t pit_calibrate_tsc(void);
// local apic
void lapic_init(void);

View File

@@ -6,10 +6,10 @@ MODULE := $(LOCAL_DIR)
# legacy implies older hardware, pre pentium, pre pci
CPU ?= modern
MODULE_DEPS += \
lib/acpi_lite \
lib/bio \
lib/cbuf
MODULE_DEPS += lib/acpi_lite
MODULE_DEPS += lib/bio
MODULE_DEPS += lib/cbuf
MODULE_DEPS += lib/fixed_point
ifneq ($(CPU),legacy)
MODULE_DEPS += dev/bus/pci/drivers

View File

@@ -12,13 +12,17 @@
#include <lk/reg.h>
#include <lk/trace.h>
#include <kernel/thread.h>
#include <kernel/vm.h>
#include <platform.h>
#include <platform/timer.h>
#include <platform/pc.h>
#include "platform_p.h"
#include <arch/x86.h>
#include <arch/x86/feature.h>
#include <inttypes.h>
#include <lib/fixed_point.h>
#define LOCAL_TRACE 0
#define LOCAL_TRACE 1
// Deals with all of the various clock sources and event timers on the PC platform.
@@ -29,16 +33,19 @@ static enum clock_source {
CLOCK_SOURCE_HPET,
} clock_source = CLOCK_SOURCE_INITIAL;
struct fp_32_64 tsc_to_timebase;
struct fp_32_64 tsc_to_timebase_hires;
static const char *clock_source_name(void) {
switch (clock_source) {
case CLOCK_SOURCE_INITIAL:
return "initial";
case CLOCK_SOURCE_PIT:
return "pit";
return "PIT";
case CLOCK_SOURCE_TSC:
return "tsc";
return "TSC";
case CLOCK_SOURCE_HPET:
return "hpet";
return "HPET";
default:
return "unknown";
}
@@ -48,6 +55,8 @@ lk_time_t current_time(void) {
switch (clock_source) {
case CLOCK_SOURCE_PIT:
return pit_current_time();
case CLOCK_SOURCE_TSC:
return u32_mul_u64_fp32_64(__builtin_ia32_rdtsc(), tsc_to_timebase);
default:
return 0;
}
@@ -57,16 +66,174 @@ lk_bigtime_t current_time_hires(void) {
switch (clock_source) {
case CLOCK_SOURCE_PIT:
return pit_current_time_hires();
case CLOCK_SOURCE_TSC:
return u64_mul_u64_fp32_64(__builtin_ia32_rdtsc(), tsc_to_timebase_hires);
default:
return 0;
}
}
void pc_init_timer(unsigned int level) {
LTRACE_ENTRY;
// From https://www.kernel.org/doc/html/v6.14/virt/kvm/x86/msr.html
//
// Wall clock record shared with the hypervisor. pvclock_init() places one of
// these at the start of the clocksource page, writes that page's physical
// address to the first clocksource MSR, and logs the three fields.
// The layout is fixed by the hypervisor interface, hence __PACKED and the
// size assert below.
struct pvclock_wall_clock {
uint32_t version;
// NOTE(review): sec/nsec are presumably a wall clock time as described in the
// document linked above; nothing in this file interprets them beyond logging.
uint32_t sec;
uint32_t nsec;
} __PACKED;
static_assert(sizeof(struct pvclock_wall_clock) == 12, "pvclock_wall_clock size mismatch");
// Per-cpu time record shared with the hypervisor. pvclock_init() places it
// directly after the wall clock record in the clocksource page and hands its
// physical address to the second clocksource MSR.
// The layout is fixed by the hypervisor interface, hence __PACKED and the
// size assert below.
struct pvclock_vcpu_time_info {
// Sampled before and after reading the other fields by
// pvclock_get_tsc_freq(); an odd value is treated as "update in progress".
uint32_t version;
uint32_t pad0;
uint64_t tsc_timestamp;
uint64_t system_time;
// Multiplier and shift that pvclock_get_tsc_freq() uses to derive the TSC
// frequency.
uint32_t tsc_to_system_mul;
int8_t tsc_shift;
// Bit 0 is tested by pv_clock_is_stable().
uint8_t flags;
uint8_t pad[2];
} __PACKED;
static_assert(sizeof(struct pvclock_vcpu_time_info) == 32, "pvclock_vcpu_time_info size mismatch");
// Pointers into the clocksource page. Both are only assigned by
// pvclock_init() once the page has been registered, so they stay NULL when no
// PV clock is available; the readers below check vcpu_time_info for NULL.
// NOTE(review): volatile presumably because the hypervisor updates the page
// behind the compiler's back -- confirm against the KVM documentation.
static volatile struct pvclock_wall_clock *wall_clock;
static volatile struct pvclock_vcpu_time_info *vcpu_time_info;
/*
 * Detect the KVM paravirtual clock and register a shared page with it.
 *
 * Picks the MSR pair from the KVM feature bits (the CLOCKSOURCE2 pair wins if
 * both are advertised), allocates one page, and points the wall clock MSR at
 * the start of the page and the system time MSR right behind the wall clock
 * record. On success wall_clock and vcpu_time_info point into that page.
 *
 * Returns NO_ERROR on success, ERR_NOT_SUPPORTED if neither clocksource
 * feature bit is set, or the error from the allocation / address lookup.
 */
status_t pvclock_init(void) {
    uint32_t clocksource_msr_base = 0;
    if (x86_feature_test(X86_FEATURE_KVM_CLOCKSOURCE)) {
        clocksource_msr_base = 0x11;
    }
    if (x86_feature_test(X86_FEATURE_KVM_CLOCKSOURCE2)) {
        clocksource_msr_base = 0x4b564d00;
    }
    if (!clocksource_msr_base) {
        return ERR_NOT_SUPPORTED;
    }
    dprintf(INFO, "pv_clock: clocksource detected, msr base %#x\n", clocksource_msr_base);

    // map a page of memory and point the KVM clocksource msrs at it
    void *clocksource_page;
    status_t err = vmm_alloc(vmm_get_kernel_aspace(), "pvclock", PAGE_SIZE, &clocksource_page, 0, 0, 0);
    if (err != NO_ERROR) {
        printf("pv_clock: failed to allocate page for clocksource msrs\n");
        return err;
    }

    // The MSRs take a physical address. Do not program them with garbage if
    // the lookup fails.
    paddr_t paddr = 0;
    err = arch_mmu_query(&vmm_get_kernel_aspace()->arch_aspace, (vaddr_t)clocksource_page, &paddr, NULL);
    if (err != NO_ERROR) {
        printf("pv_clock: failed to look up physical address of clocksource page\n");
        vmm_free_region(vmm_get_kernel_aspace(), (vaddr_t)clocksource_page);
        return err;
    }
    LTRACEF("clocksource page %p, paddr %#" PRIxPTR "\n", clocksource_page, paddr);

    // Wall clock record at the start of the page, vcpu time info right after it.
    // Per the KVM MSR documentation, bit 0 of the system time MSR value is the
    // enable bit, hence the "+ 1" on the (4 byte aligned) address.
    write_msr(clocksource_msr_base, paddr);
    write_msr(clocksource_msr_base + 1, paddr + sizeof(struct pvclock_wall_clock) + 1);

    wall_clock = (struct pvclock_wall_clock *)clocksource_page;
    vcpu_time_info = (struct pvclock_vcpu_time_info *)(wall_clock + 1);

    dprintf(SPEW, "pv_clock: wall clock version %u, sec %u, nsec %u\n",
            wall_clock->version, wall_clock->sec, wall_clock->nsec);
    dprintf(SPEW, "pv_clock: vcpu time info version %u, tsc timestamp %" PRIu64 ", system time %" PRIu64 "\n",
            vcpu_time_info->version, vcpu_time_info->tsc_timestamp, vcpu_time_info->system_time);
    dprintf(SPEW, "pv_clock: tsc to system mul %u, tsc shift %d, flags %u\n",
            vcpu_time_info->tsc_to_system_mul, vcpu_time_info->tsc_shift, vcpu_time_info->flags);

    return NO_ERROR;
}
/*
 * Derive the TSC frequency from the PV clock's vcpu time info record.
 *
 * Returns the frequency in Hz, or 0 if the PV clock has not been set up
 * (vcpu_time_info is NULL) or the record holds a zero multiplier. Callers
 * treat 0 as "unknown" and fall back to another calibration source.
 */
uint64_t pvclock_get_tsc_freq(void) {
    if (!vcpu_time_info) {
        return 0;
    }

    // Take a consistent snapshot of the multiplier and shift: the version
    // field is odd while an update is in progress and changes across an
    // update, so retry until the same even version is seen on both sides of
    // the read. The comparison uses only values sampled in this iteration; a
    // version left over from an earlier pass must never satisfy the check.
    uint32_t tsc_mul = 0;
    int8_t tsc_shift = 0;
    for (;;) {
        const uint32_t pre_version = vcpu_time_info->version;
        if (pre_version % 2 != 0) {
            asm("pause");
            continue;
        }
        tsc_mul = vcpu_time_info->tsc_to_system_mul;
        tsc_shift = vcpu_time_info->tsc_shift;
        if (vcpu_time_info->version == pre_version) {
            break;
        }
    }

    // A zero multiplier would divide by zero below; report "unknown" instead.
    if (tsc_mul == 0) {
        return 0;
    }

    uint64_t tsc_khz = 1000000ULL << 32;
    tsc_khz = tsc_khz / tsc_mul;
    if (tsc_shift > 0) {
        tsc_khz >>= tsc_shift;
    } else {
        tsc_khz <<= -tsc_shift;
    }
    return tsc_khz * 1000;
}
/*
 * Report whether the PV clock is flagged as stable.
 *
 * False when the PV clock has not been set up. Otherwise true if bit 0 of the
 * vcpu time info flags is set, or failing that, if the
 * X86_FEATURE_KVM_CLOCKSOURCE_STABLE feature bit is present.
 */
bool pv_clock_is_stable(void) {
    if (!vcpu_time_info) {
        return false;
    }

    // flag bit first; only consult cpuid when the flag is clear
    if (vcpu_time_info->flags & (1<<0)) {
        return true;
    }
    return x86_feature_test(X86_FEATURE_KVM_CLOCKSOURCE_STABLE);
}
/*
 * Init hook that selects the platform clock source.
 *
 * Always brings up the PIT and starts with it as the clock source. On
 * non-legacy builds it then tries to switch to the TSC: if the TSC is
 * invariant (per cpuid, or per the hypervisor PV clock), the TSC frequency is
 * taken from the PV clock or, failing that, calibrated against the PIT, and
 * the TSC to millisecond / microsecond conversion ratios are computed.
 * Any failure along the way leaves the PIT as the clock source.
 *
 * level: init level handed in by the LK_INIT_HOOK machinery; unused.
 */
void pc_init_timer(unsigned int level) {
    // Initialize the PIT, it's always present in PC hardware
    pit_init();
    clock_source = CLOCK_SOURCE_PIT;

    lapic_init();

#if !X86_LEGACY
    // XXX update note about what invariant TSC means
    bool invariant_tsc = x86_feature_test(X86_FEATURE_INVAR_TSC);
    LTRACEF("invariant TSC %d\n", invariant_tsc);

    // Test for hypervisor PV clock, which also effectively says if TSC is invariant across
    // all cpus.
    if (pvclock_init() == NO_ERROR) {
        bool pv_clock_stable = pv_clock_is_stable();
        invariant_tsc |= pv_clock_stable;
        printf("pv_clock: Clocksource is %sstable\n", (pv_clock_stable ? "" : "not "));
    }

    // XXX test for HPET and use it over PIT if present

    if (invariant_tsc) {
        // We're going to try to use the TSC as a time base, obtain the TSC frequency.
        // The PV clock reports 0 when it is not available.
        uint64_t tsc_hz = pvclock_get_tsc_freq();
        if (tsc_hz == 0) {
            // TODO: some x86 cores describe the TSC and lapic clocks in cpuid

            // Calibrate the TSC against the PIT, which should always be present
            tsc_hz = pit_calibrate_tsc();
            if (tsc_hz == 0) {
                dprintf(CRITICAL, "PC: failed to calibrate TSC frequency\n");
                goto out;
            }
        }
        dprintf(INFO, "PC: TSC frequency %" PRIu64 "Hz\n", tsc_hz);

        // fp_32_64_div_32_32 divides 32 bit operands. A TSC faster than
        // UINT32_MAX Hz (~4.29GHz) would be silently truncated and make the
        // clock run at the wrong rate, so stay on the PIT in that case.
        if (tsc_hz > UINT32_MAX) {
            dprintf(CRITICAL, "PC: TSC frequency too high for 32 bit timebase math, not using TSC\n");
            goto out;
        }

        // Compute the ratio of TSC to timebase
        fp_32_64_div_32_32(&tsc_to_timebase, 1000, (uint32_t)tsc_hz);
        dprintf(INFO, "PC: TSC to timebase ratio %u.%08u...\n",
                tsc_to_timebase.l0, tsc_to_timebase.l32);

        fp_32_64_div_32_32(&tsc_to_timebase_hires, 1000*1000, (uint32_t)tsc_hz);
        dprintf(INFO, "PC: TSC to hires timebase ratio %u.%08u...\n",
                tsc_to_timebase_hires.l0, tsc_to_timebase_hires.l32);

        clock_source = CLOCK_SOURCE_TSC;
    }
out:
#endif // !X86_LEGACY

    dprintf(INFO, "PC: using %s clock source\n", clock_source_name());
}
LK_INIT_HOOK(pc_timer, pc_init_timer, LK_INIT_LEVEL_VM);

View File

@@ -70,7 +70,7 @@ elif (( $DO_LEGACY )); then
else
QEMU="qemu-system-i386"
PROJECT="pc-x86-test"
CPU=pentium3
CPU=max
MACHINE=pc
fi
@@ -86,7 +86,7 @@ fi
ARGS=""
if (( $DO_KVM )); then
ARGS+=" -enable-kvm -cpu host"
ARGS+=" -accel kvm -cpu host"
else
ARGS+=" -cpu $CPU"
fi