diff --git a/app/tests/fibo.c b/app/tests/fibo.c index ac6a395f..3c3320f2 100644 --- a/app/tests/fibo.c +++ b/app/tests/fibo.c @@ -41,12 +41,15 @@ static int fibo_thread(void *argv) if (fibo == 1) return 1; - t[0] = thread_create("fibo", &fibo_thread, (void *)(fibo - 1), DEFAULT_PRIORITY, DEFAULT_STACK_SIZE); + char name[32]; + snprintf(name, sizeof(name), "fibo %lu", fibo - 1); + t[0] = thread_create(name, &fibo_thread, (void *)(fibo - 1), DEFAULT_PRIORITY, DEFAULT_STACK_SIZE); if (!t[0]) { printf("error creating thread for fibo %d\n", fibo-1); return 0; } - t[1] = thread_create("fibo", &fibo_thread, (void *)(fibo - 2), DEFAULT_PRIORITY, DEFAULT_STACK_SIZE); + snprintf(name, sizeof(name), "fibo %lu", fibo - 2); + t[1] = thread_create(name, &fibo_thread, (void *)(fibo - 2), DEFAULT_PRIORITY, DEFAULT_STACK_SIZE); if (!t[1]) { printf("error creating thread for fibo %d\n", fibo-2); thread_resume(t[0]); @@ -89,4 +92,5 @@ int fibo(int argc, const cmd_args *argv) return NO_ERROR; } +// vim: set noexpandtab: diff --git a/app/tests/include/app/tests.h b/app/tests/include/app/tests.h index 69a0bd63..19bb8408 100644 --- a/app/tests/include/app/tests.h +++ b/app/tests/include/app/tests.h @@ -32,6 +32,7 @@ void clock_tests(void); void float_tests(void); void benchmarks(void); int fibo(int argc, const cmd_args *argv); +int spinner(int argc, const cmd_args *argv); #endif diff --git a/app/tests/tests.c b/app/tests/tests.c index 275b968f..3ef56f61 100644 --- a/app/tests/tests.c +++ b/app/tests/tests.c @@ -38,6 +38,7 @@ STATIC_COMMAND("float_tests", "floating point test", (console_cmd)&float_tests) #endif STATIC_COMMAND("bench", "miscellaneous benchmarks", (console_cmd)&benchmarks) STATIC_COMMAND("fibo", "threaded fibonacci", (console_cmd)&fibo) +STATIC_COMMAND("spinner", "create a spinning thread", (console_cmd)&spinner) STATIC_COMMAND_END(tests); #endif diff --git a/app/tests/thread_tests.c b/app/tests/thread_tests.c index bc34c1a8..b065172d 100644 --- a/app/tests/thread_tests.c +++ b/app/tests/thread_tests.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -130,7 +131,7 @@ static int semaphore_test(void) static int mutex_thread(void *arg) { int i; - const int iterations = 50000; + const int iterations = 1000000; static volatile int shared = 0; @@ -405,9 +406,11 @@ static int atomic_tester(void *arg) int add = (intptr_t)arg; int i; - TRACEF("add %d\n", add); + const int iter = 10000000; - for (i=0; i < 1000000; i++) { + TRACEF("add %d, %d iterations\n", add, iter); + + for (i=0; i < iter; i++) { atomic_add(&atomic, add); } @@ -455,6 +458,7 @@ static int preempt_tester(void *arg) printf("exiting ts %lld\n", current_time_hires()); atomic_add(&preempt_count, -1); +#undef COUNT return 0; } @@ -571,12 +575,53 @@ static void join_test(void) printf("thread_join returns err %d, retval %d (should be 0 and 55)\n", err, ret); } +static void spinlock_test(void) +{ + spin_lock_saved_state_t state; + spin_lock_t lock; + + spin_lock_init(&lock); + + // verify basic functionality (single core) + printf("testing spinlock:\n"); + ASSERT(!spin_lock_held(&lock)); + ASSERT(!arch_ints_disabled()); + spin_lock_irqsave(&lock, state); + ASSERT(arch_ints_disabled()); + ASSERT(spin_lock_held(&lock)); + spin_unlock_irqrestore(&lock, state); + ASSERT(!spin_lock_held(&lock)); + ASSERT(!arch_ints_disabled()); + printf("seems to work\n"); + +#define COUNT (1024*1024) + uint32_t c = arch_cycle_count(); + for (uint i = 0; i < COUNT; i++) { + spin_lock(&lock); + spin_unlock(&lock); + } + c = 
arch_cycle_count() - c; + + printf("%u cycles to acquire/release lock %u times (%u cycles per)\n", c, COUNT, c / COUNT); + + c = arch_cycle_count(); + for (uint i = 0; i < COUNT; i++) { + spin_lock_irqsave(&lock, state); + spin_unlock_irqrestore(&lock, state); + } + c = arch_cycle_count() - c; + + printf("%u cycles to acquire/release lock w/irqsave %u times (%u cycles per)\n", c, COUNT, c / COUNT); +#undef COUNT +} + int thread_tests(void) { mutex_test(); semaphore_test(); event_test(); + spinlock_test(); atomic_test(); thread_sleep(200); @@ -589,4 +634,27 @@ int thread_tests(void) return 0; } +static int spinner_thread(void *arg) +{ + for (;;) + ; + + return 0; +} + +int spinner(int argc, const cmd_args *argv) +{ + if (argc < 2) { + printf("not enough args\n"); + printf("usage: %s \n", argv[0].str); + return -1; + } + + thread_t *t = thread_create("spinner", spinner_thread, NULL, argv[1].u, DEFAULT_STACK_SIZE); + if (t) + thread_resume(t); + + return 0; +} + /* vim: set ts=4 sw=4 noexpandtab: */ diff --git a/arch/arm/arm-m/arch.c b/arch/arm/arm-m/arch.c index 47266e34..41f6873e 100644 --- a/arch/arm/arm-m/arch.c +++ b/arch/arm/arm-m/arch.c @@ -122,8 +122,6 @@ void _arm_cm_set_irqpri(uint32_t pri) void arm_cm_irq_entry(void) { - inc_critical_section(); - THREAD_STATS_INC(interrupts); KEVLOG_IRQ_ENTER(__get_IPSR()); } @@ -134,7 +132,6 @@ void arm_cm_irq_exit(bool reschedule) arm_cm_trigger_preempt(); KEVLOG_IRQ_EXIT(__get_IPSR()); - dec_critical_section(); } void arch_chain_load(void *entry, ulong arg0, ulong arg1, ulong arg2, ulong arg3) diff --git a/arch/arm/arm-m/exceptions.c b/arch/arm/arm-m/exceptions.c index c66faa8d..ab566606 100644 --- a/arch/arm/arm-m/exceptions.c +++ b/arch/arm/arm-m/exceptions.c @@ -43,7 +43,6 @@ static void dump_frame(const struct arm_cm_exception_frame *frame) static void hardfault(struct arm_cm_exception_frame *frame) { - inc_critical_section(); printf("hardfault: "); dump_frame(frame); @@ -54,7 +53,6 @@ static void hardfault(struct arm_cm_exception_frame *frame) static void usagefault(struct arm_cm_exception_frame *frame) { - inc_critical_section(); printf("usagefault: "); dump_frame(frame); @@ -63,7 +61,6 @@ static void usagefault(struct arm_cm_exception_frame *frame) static void busfault(struct arm_cm_exception_frame *frame) { - inc_critical_section(); printf("busfault: "); dump_frame(frame); @@ -74,7 +71,6 @@ static void busfault(struct arm_cm_exception_frame *frame) void _nmi(void) { - inc_critical_section(); printf("nmi\n"); platform_halt(HALT_ACTION_HALT, HALT_REASON_SW_PANIC); } @@ -92,7 +88,6 @@ __NAKED void _hardfault(void) void _memmanage(void) { - inc_critical_section(); printf("memmanage\n"); platform_halt(HALT_ACTION_HALT, HALT_REASON_SW_PANIC); } @@ -122,7 +117,6 @@ void _usagefault(void) /* systick handler */ void __WEAK _systick(void) { - inc_critical_section(); printf("systick\n"); platform_halt(HALT_ACTION_HALT, HALT_REASON_SW_PANIC); } diff --git a/arch/arm/arm-m/systick/systick.c b/arch/arm/arm-m/systick/systick.c index 03f7292f..e9f0765f 100644 --- a/arch/arm/arm-m/systick/systick.c +++ b/arch/arm/arm-m/systick/systick.c @@ -90,8 +90,6 @@ status_t platform_set_periodic_timer(platform_timer_callback callback, void *arg DEBUG_ASSERT(tick_rate != 0 && tick_rate_mhz != 0); - enter_critical_section(); - cb = callback; cb_args = arg; @@ -99,8 +97,6 @@ status_t platform_set_periodic_timer(platform_timer_callback callback, void *arg tick_interval_us = interval * 1000; arm_cm_systick_set_periodic(interval); - exit_critical_section(); - return 
NO_ERROR; } diff --git a/arch/arm/arm-m/thread.c b/arch/arm/arm-m/thread.c index 32285b6a..301a5e6b 100644 --- a/arch/arm/arm-m/thread.c +++ b/arch/arm/arm-m/thread.c @@ -57,8 +57,9 @@ static void initial_thread_func(void) dump_thread(_current_thread); #endif - /* exit the implicit critical section we're within */ - exit_critical_section(); + /* release the thread lock that was implicitly held across the reschedule */ + spin_unlock(&thread_lock); + arch_enable_ints(); ret = _current_thread->entry(_current_thread->arg); @@ -89,9 +90,6 @@ volatile struct arm_cm_exception_frame_long *preempt_frame; static void pendsv(struct arm_cm_exception_frame_long *frame) { arch_disable_ints(); - inc_critical_section(); - - ASSERT(critical_section_count == 1); LTRACEF("preempting thread %p (%s)\n", _current_thread, _current_thread->name); @@ -104,7 +102,6 @@ static void pendsv(struct arm_cm_exception_frame_long *frame) /* if we got here, there wasn't anything to switch to, so just fall through and exit */ preempt_frame = NULL; - dec_critical_section(); arch_enable_ints(); } @@ -190,13 +187,6 @@ void arch_context_switch(struct thread *oldthread, struct thread *newthread) { LTRACE_ENTRY; - if (newthread->arch.was_preempted) { - /* we're about to return directly to a thread that was preempted (in user space), - * so push its critical section count back down to zero - */ - critical_section_count = newthread->saved_critical_section_count = 0; - } - /* if preempt_frame is set, we are being preempted */ if (preempt_frame) { oldthread->arch.was_preempted = true; @@ -250,3 +240,12 @@ void arch_context_switch(struct thread *oldthread, struct thread *newthread) } +void arch_dump_thread(thread_t *t) +{ + if (t->state != THREAD_RUNNING) { + dprintf(INFO, "\tarch: "); + dprintf(INFO, "sp 0x%lx, was preempted %u\n", t->arch.sp, t->arch.was_preempted); + } +} + + diff --git a/arch/arm/arm/arch.c b/arch/arm/arm/arch.c index 13ef3316..697f91c2 100644 --- a/arch/arm/arm/arch.c +++ b/arch/arm/arm/arch.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008-2014 Travis Geiselbrecht + * Copyright (c) 2008-2015 Travis Geiselbrecht * * Permission is hereby granted, free of charge, to any person obtaining * a copy of this software and associated documentation files @@ -24,48 +24,197 @@ #include #include #include +#include +#include +#include #include #include #include #include #include +#include +#include +#include +#include +#include #include #include #include #define LOCAL_TRACE 0 +#if WITH_DEV_TIMER_ARM_CORTEX_A9 +#include +#endif +#if WITH_DEV_INTERRUPT_ARM_GIC +#include +#endif +#if WITH_DEV_CACHE_PL310 +#include +#endif + +/* initial and abort stacks */ +uint8_t abort_stack[ARCH_DEFAULT_STACK_SIZE * SMP_MAX_CPUS] __CPU_ALIGN; + +static void arm_basic_setup(void); +static void spinlock_test(void); +static void spinlock_test_secondary(void); + +#if WITH_SMP +/* smp boot lock */ +spin_lock_t arm_boot_cpu_lock = 1; +volatile int secondaries_to_init = 0; +#endif + void arch_early_init(void) { /* turn off the cache */ arch_disable_cache(UCACHE); +#if WITH_DEV_CACHE_PL310 + pl310_set_enable(false); +#endif - /* set the vector base to our exception vectors so we dont need to double map at 0 */ -#if ARM_ISA_ARMV7 - arm_write_vbar(KERNEL_BASE + KERNEL_LOAD_OFFSET); + arm_basic_setup(); + +#if WITH_SMP && ARM_CPU_CORTEX_A9 + /* enable snoop control */ + addr_t scu_base = arm_read_cbar(); + *REG32(scu_base) |= (1<<0); /* enable SCU */ #endif #if ARM_WITH_MMU - arm_mmu_init(); + arm_mmu_early_init(); platform_init_mmu_mappings(); #endif /* turn the 
cache back on */ +#if WITH_DEV_CACHE_PL310 + pl310_set_enable(true); +#endif arch_enable_cache(UCACHE); +} -#if ARM_WITH_VFP - /* enable cp10 and cp11 */ - uint32_t val = arm_read_cpacr(); - val |= (3<<22)|(3<<20); - arm_write_cpacr(val); +void arch_init(void) +{ +#if WITH_SMP + arch_mp_init_percpu(); - /* make sure the fpu starts off disabled */ - arm_fpu_set_enable(false); + LTRACEF("midr 0x%x\n", arm_read_midr()); + LTRACEF("sctlr 0x%x\n", arm_read_sctlr()); + LTRACEF("actlr 0x%x\n", arm_read_actlr()); +#if ARM_CPU_CORTEX_A9 + LTRACEF("cbar 0x%x\n", arm_read_cbar()); +#endif + LTRACEF("mpidr 0x%x\n", arm_read_mpidr()); + LTRACEF("ttbcr 0x%x\n", arm_read_ttbcr()); + LTRACEF("ttbr0 0x%x\n", arm_read_ttbr0()); + LTRACEF("dacr 0x%x\n", arm_read_dacr()); +#if ARM_CPU_CORTEX_A7 + LTRACEF("l2ctlr 0x%x\n", arm_read_l2ctlr()); + LTRACEF("l2ectlr 0x%x\n", arm_read_l2ectlr()); #endif -#if ENABLE_CYCLE_COUNTER -#if ARM_ISA_ARMV7 +#if ARM_CPU_CORTEX_A9 + addr_t scu_base = arm_read_cbar(); + uint32_t scu_config = *REG32(scu_base + 4); + secondaries_to_init = scu_config & 0x3; +#elif ARM_CPU_CORTEX_A7 + uint32_t l2ctlr = arm_read_l2ctlr(); + secondaries_to_init = (l2ctlr >> 24); +#else + secondaries_to_init = SMP_MAX_CPUS - 1; /* TODO: get count from somewhere else, or add cpus as they boot */ +#endif + + lk_init_secondary_cpus(secondaries_to_init); + + dprintf(SPEW, "releasing %d secondary cpu%c\n", secondaries_to_init, secondaries_to_init > 1 ? 's' : ' '); + + /* release the secondary cpus */ + spin_unlock(&arm_boot_cpu_lock); + + /* flush the release of the lock, since the secondary cpus are running without cache on */ + arch_clean_cache_range((addr_t)&arm_boot_cpu_lock, sizeof(arm_boot_cpu_lock)); +#endif + + //spinlock_test(); + + /* finish intializing the mmu */ + arm_mmu_init(); +} + +#if WITH_SMP +void arm_secondary_entry(uint asm_cpu_num) +{ + uint cpu = arch_curr_cpu_num(); + if (cpu != asm_cpu_num) + return; + + arm_basic_setup(); + + /* enable the local L1 cache */ + //arch_enable_cache(UCACHE); + + // XXX may not be safe, but just hard enable i and d cache here + // at the moment cannot rely on arch_enable_cache not dumping the L2 + uint32_t sctlr = arm_read_sctlr(); + sctlr |= (1<<12) | (1<<2); // enable i and dcache + arm_write_sctlr(sctlr); + + /* run early secondary cpu init routines up to the threading level */ + lk_init_level(LK_INIT_FLAG_SECONDARY_CPUS, LK_INIT_LEVEL_EARLIEST, LK_INIT_LEVEL_THREADING - 1); + + arch_mp_init_percpu(); + + LTRACEF("cpu num %d\n", cpu); + LTRACEF("sctlr 0x%x\n", arm_read_sctlr()); + LTRACEF("actlr 0x%x\n", arm_read_actlr()); + + /* we're done, tell the main cpu we're up */ + atomic_add(&secondaries_to_init, -1); + smp_mb(); + __asm__ volatile("sev"); + + lk_secondary_cpu_entry(); +} +#endif + +static void arm_basic_setup(void) +{ + uint32_t sctlr = arm_read_sctlr(); + + /* ARMV7 bits */ + sctlr &= ~(1<<10); /* swp disable */ + sctlr |= (1<<11); /* enable program flow prediction */ + sctlr &= ~(1<<14); /* random cache/tlb replacement */ + sctlr &= ~(1<<25); /* E bit set to 0 on exception */ + sctlr &= ~(1<<30); /* no thumb exceptions */ + + arm_write_sctlr(sctlr); + + uint32_t actlr = arm_read_actlr(); +#if ARM_CPU_CORTEX_A9 + actlr |= (1<<2); /* enable dcache prefetch */ +#if WITH_DEV_CACHE_PL310 + actlr |= (1<<7); /* L2 exclusive cache */ + actlr |= (1<<3); /* L2 write full line of zeroes */ + actlr |= (1<<1); /* L2 prefetch hint enable */ +#endif +#if WITH_SMP + /* enable smp mode, cache and tlb broadcast */ + actlr |= (1<<6) | (1<<0); +#endif 
+#endif // ARM_CPU_CORTEX_A9 +#if ARM_CPU_CORTEX_A7 +#if WITH_SMP + /* enable smp mode */ + actlr |= (1<<6); +#endif +#endif // ARM_CPU_CORTEX_A7 + + arm_write_actlr(actlr); + +#if ENABLE_CYCLE_COUNTER && ARM_ISA_ARMV7 /* enable the cycle count register */ uint32_t en; __asm__ volatile("mrc p15, 0, %0, c9, c12, 0" : "=r" (en)); @@ -77,11 +226,26 @@ void arch_early_init(void) en = (1<<31); __asm__ volatile("mcr p15, 0, %0, c9, c12, 1" :: "r" (en)); #endif -#endif -} -void arch_init(void) -{ +#if ARM_WITH_VFP + /* enable cp10 and cp11 */ + uint32_t val = arm_read_cpacr(); + val |= (3<<22)|(3<<20); + arm_write_cpacr(val); + + /* set enable bit in fpexc */ + __asm__ volatile("mrc p10, 7, %0, c8, c0, 0" : "=r" (val)); + val |= (1<<30); + __asm__ volatile("mcr p10, 7, %0, c8, c0, 0" :: "r" (val)); + + /* make sure the fpu starts off disabled */ + arm_fpu_set_enable(false); +#endif + + /* set the vector base to our exception vectors so we dont need to double map at 0 */ +#if ARM_ISA_ARMV7 + arm_write_vbar(KERNEL_BASE + KERNEL_LOAD_OFFSET); +#endif } void arch_quiesce(void) @@ -112,9 +276,15 @@ void arch_quiesce(void) /* virtual to physical translation */ status_t arm_vtop(addr_t va, addr_t *pa) { - arm_write_ats1cpr(va & 0xfffff000); + spin_lock_saved_state_t irqstate; + + arch_interrupt_save(&irqstate, SPIN_LOCK_FLAG_INTERRUPTS); + + arm_write_ats1cpr(va & ~(PAGE_SIZE-1)); uint32_t par = arm_read_par(); + arch_interrupt_restore(irqstate, SPIN_LOCK_FLAG_INTERRUPTS); + if (par & 1) return ERR_NOT_FOUND; @@ -131,7 +301,7 @@ void arch_chain_load(void *entry, ulong arg0, ulong arg1, ulong arg2, ulong arg3 LTRACEF("entry %p, args 0x%lx 0x%lx 0x%lx 0x%lx\n", entry, arg0, arg1, arg2, arg3); /* we are going to shut down the system, start by disabling interrupts */ - enter_critical_section(); + arch_disable_ints(); /* give target and platform a chance to put hardware into a suitable * state for chain loading. 
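Aside: the change just above (enter_critical_section() replaced by arch_disable_ints() in arch_chain_load) is the pattern repeated throughout this patch: the global critical-section counter goes away, and exclusion is expressed either as a plain interrupt disable or as a spinlock taken with interrupt state saved. A minimal sketch of the spinlock form, reusing only the spin_lock_irqsave()/spin_unlock_irqrestore() wrappers that spinlock_test() above already exercises; the header path and the function/variable names here are illustrative assumptions, not part of the patch:

    /* sketch: SMP-safe protection of shared state, assuming the kernel
     * spinlock wrappers used elsewhere in this patch are in scope */
    #include <kernel/spinlock.h>   /* assumed header location */

    static spin_lock_t stats_lock = SPIN_LOCK_INITIAL_VALUE;
    static unsigned int stats_counter;

    static void stats_increment(void)
    {
        spin_lock_saved_state_t state;

        /* disables IRQs, then spins for the lock */
        spin_lock_irqsave(&stats_lock, state);
        stats_counter++;
        spin_unlock_irqrestore(&stats_lock, state);
    }
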
@@ -172,6 +342,9 @@ void arch_chain_load(void *entry, ulong arg0, ulong arg1, ulong arg2, ulong arg3 LTRACEF("disabling instruction/data cache\n"); arch_disable_cache(UCACHE); +#if WITH_DEV_CACHE_PL310 + pl310_set_enable(false); +#endif LTRACEF("branching to physical address of loader\n"); @@ -183,4 +356,37 @@ void arch_chain_load(void *entry, ulong arg0, ulong arg1, ulong arg2, ulong arg3 #endif } +static spin_lock_t lock = 0; + +static void spinlock_test(void) +{ + TRACE_ENTRY; + + spin_lock_saved_state_t state; + spin_lock_irqsave(&lock, state); + + TRACEF("cpu0: i have the lock\n"); + spin(1000000); + TRACEF("cpu0: releasing it\n"); + + spin_unlock_irqrestore(&lock, state); + + spin(1000000); +} + +static void spinlock_test_secondary(void) +{ + TRACE_ENTRY; + + spin(500000); + spin_lock_saved_state_t state; + spin_lock_irqsave(&lock, state); + + TRACEF("cpu1: i have the lock\n"); + spin(250000); + TRACEF("cpu1: releasing it\n"); + + spin_unlock_irqrestore(&lock, state); +} + /* vim: set ts=4 sw=4 noexpandtab: */ diff --git a/arch/arm/arm/asm.S b/arch/arm/arm/asm.S index bab4ee2d..9e4a34fe 100644 --- a/arch/arm/arm/asm.S +++ b/arch/arm/arm/asm.S @@ -71,6 +71,9 @@ strex_spot: FUNCTION(arm_save_mode_regs) mrs r1, cpsr + stmia r0, { r13, r14 }^ /* usr */ + add r0, #8 + cps #0x11 /* fiq */ str r13, [r0], #4 str r14, [r0], #4 diff --git a/arch/arm/arm/cache-ops.S b/arch/arm/arm/cache-ops.S index ce36db40..05715dc1 100644 --- a/arch/arm/arm/cache-ops.S +++ b/arch/arm/arm/cache-ops.S @@ -137,11 +137,6 @@ FUNCTION(arch_disable_cache) bic r0, #(1<<1) mcr p15, 0, r0, c1, c0, 1 // disable L2 dcache #endif -#if WITH_DEV_CACHE_PL310 - bl pl310_flush_invalidate - mov r0, #0 - bl pl310_set_enable -#endif .Licache_disable: tst r7, #ICACHE @@ -185,11 +180,6 @@ FUNCTION(arch_enable_cache) orr r0, #(1<<1) mcr p15, 0, r0, c1, c0, 1 // enable L2 dcache #endif -#if WITH_DEV_CACHE_PL310 - bl pl310_invalidate - mov r0, #1 - bl pl310_set_enable -#endif mrc p15, 0, r0, c1, c0, 0 // cr1 orr r0, #(1<<2) @@ -207,6 +197,7 @@ FUNCTION(arch_enable_cache) mcr p15, 0, r0, c1, c0, 0 // enable icache .Ldone_enable: + isb msr cpsr, r8 ldmfd sp!, {r4-r12, pc} diff --git a/arch/arm/arm/exceptions.S b/arch/arm/arm/exceptions.S index 40154995..7df629f2 100644 --- a/arch/arm/arm/exceptions.S +++ b/arch/arm/arm/exceptions.S @@ -176,6 +176,7 @@ FUNCTION(arm_undefined) restore +#ifndef WITH_LIB_SYSCALL FUNCTION(arm_syscall) saveall #0x13 /* r0 now holds pointer to iframe */ @@ -183,6 +184,7 @@ FUNCTION(arm_syscall) bl arm_syscall_handler restoreall +#endif FUNCTION(arm_prefetch_abort) saveall_offset #4, #0x17 @@ -214,12 +216,6 @@ FUNCTION(arm_irq) /* r0 now holds pointer to iframe */ - /* increment the global critical section count */ - LOADCONST(r2, critical_section_count) - ldr r1, [r2] - add r1, #1 - str r1, [r2] - /* track that we're inside an irq handler */ LOADCONST(r2, __arm_in_handler) mov r1, #1 @@ -237,12 +233,6 @@ FUNCTION(arm_irq) cmp r0, #0 blne thread_preempt - /* decrement the global critical section count */ - LOADCONST(r1, critical_section_count) - ldr r0, [r1] - sub r0, r0, #1 - str r0, [r1] - restore FUNCTION(arm_fiq) diff --git a/arch/arm/arm/faults.c b/arch/arm/arm/faults.c index d6397fa3..ad4cd56f 100644 --- a/arch/arm/arm/faults.c +++ b/arch/arm/arm/faults.c @@ -31,6 +31,7 @@ static void dump_mode_regs(uint32_t spsr) struct arm_mode_regs regs; arm_save_mode_regs(&regs); + dprintf(CRITICAL, "%c%s r13 0x%08x r14 0x%08x\n", ((spsr & MODE_MASK) == MODE_USR) ? 
'*' : ' ', "usr", regs.usr_r13, regs.usr_r14); dprintf(CRITICAL, "%c%s r13 0x%08x r14 0x%08x\n", ((spsr & MODE_MASK) == MODE_FIQ) ? '*' : ' ', "fiq", regs.fiq_r13, regs.fiq_r14); dprintf(CRITICAL, "%c%s r13 0x%08x r14 0x%08x\n", ((spsr & MODE_MASK) == MODE_IRQ) ? '*' : ' ', "irq", regs.irq_r13, regs.irq_r14); dprintf(CRITICAL, "%c%s r13 0x%08x r14 0x%08x\n", ((spsr & MODE_MASK) == MODE_SVC) ? '*' : ' ', "svc", regs.svc_r13, regs.svc_r14); @@ -67,6 +68,11 @@ static void dump_mode_regs(uint32_t spsr) static void dump_fault_frame(struct arm_fault_frame *frame) { + struct thread *current_thread = get_current_thread(); + + dprintf(CRITICAL, "current_thread %p, name %s\n", + current_thread, current_thread ? current_thread->name : ""); + dprintf(CRITICAL, "r0 0x%08x r1 0x%08x r2 0x%08x r3 0x%08x\n", frame->r[0], frame->r[1], frame->r[2], frame->r[3]); dprintf(CRITICAL, "r4 0x%08x r5 0x%08x r6 0x%08x r7 0x%08x\n", frame->r[4], frame->r[5], frame->r[6], frame->r[7]); dprintf(CRITICAL, "r8 0x%08x r9 0x%08x r10 0x%08x r11 0x%08x\n", frame->r[8], frame->r[9], frame->r[10], frame->r[11]); @@ -87,7 +93,6 @@ static void dump_iframe(struct arm_iframe *frame) static void exception_die(struct arm_fault_frame *frame, const char *msg) { - inc_critical_section(); dprintf(CRITICAL, msg); dump_fault_frame(frame); @@ -97,7 +102,6 @@ static void exception_die(struct arm_fault_frame *frame, const char *msg) static void exception_die_iframe(struct arm_iframe *frame, const char *msg) { - inc_critical_section(); dprintf(CRITICAL, msg); dump_iframe(frame); @@ -112,8 +116,6 @@ void arm_syscall_handler(struct arm_fault_frame *frame) void arm_undefined_handler(struct arm_iframe *frame) { - inc_critical_section(); - /* look at the undefined instruction, figure out if it's something we can handle */ bool in_thumb = frame->spsr & (1<<5); if (in_thumb) { @@ -157,7 +159,6 @@ void arm_undefined_handler(struct arm_iframe *frame) #if ARM_WITH_VFP fpu: arm_fpu_undefined_instruction(frame); - dec_critical_section(); #endif } @@ -168,7 +169,7 @@ void arm_data_abort_handler(struct arm_fault_frame *frame) uint32_t fault_status = (BIT(fsr, 10) ? (1<<4) : 0) | BITS(fsr, 3, 0); - dprintf(CRITICAL, "\n\ndata abort, "); + dprintf(CRITICAL, "\n\ncpu %u data abort, ", arch_curr_cpu_num()); bool write = !!BIT(fsr, 11); /* decode the fault status (from table B3-23) */ @@ -228,7 +229,7 @@ void arm_prefetch_abort_handler(struct arm_fault_frame *frame) uint32_t fault_status = (BIT(fsr, 10) ? 
(1<<4) : 0) | BITS(fsr, 3, 0); - dprintf(CRITICAL, "\n\nprefetch abort, "); + dprintf(CRITICAL, "\n\ncpu %u prefetch abort, ", arch_curr_cpu_num()); /* decode the fault status (from table B3-23) */ switch (fault_status) { diff --git a/arch/arm/arm/mmu.c b/arch/arm/arm/mmu.c index d8054da7..165f313f 100644 --- a/arch/arm/arm/mmu.c +++ b/arch/arm/arm/mmu.c @@ -55,6 +55,9 @@ static uint32_t mmu_flags_to_l1_arch_flags(uint flags) switch (flags & ARCH_MMU_FLAG_CACHE_MASK) { case ARCH_MMU_FLAG_CACHED: arch_flags |= MMU_MEMORY_L1_TYPE_NORMAL_WRITE_BACK_ALLOCATE; +#if WITH_SMP + arch_flags |= MMU_MEMORY_L1_SECTION_SHAREABLE; +#endif break; case ARCH_MMU_FLAG_UNCACHED: arch_flags |= MMU_MEMORY_L1_TYPE_STRONGLY_ORDERED; @@ -97,7 +100,13 @@ static uint32_t mmu_flags_to_l2_arch_flags(uint flags) uint32_t arch_flags = 0; switch (flags & ARCH_MMU_FLAG_CACHE_MASK) { case ARCH_MMU_FLAG_CACHED: +#if WITH_SMP + arch_flags |= MMU_MEMORY_L2_SHAREABLE; +#endif arch_flags |= MMU_MEMORY_L2_TYPE_NORMAL_WRITE_BACK_ALLOCATE; +#if WITH_SMP + arch_flags |= MMU_MEMORY_L2_SHAREABLE; +#endif break; case ARCH_MMU_FLAG_UNCACHED: arch_flags |= MMU_MEMORY_L2_TYPE_STRONGLY_ORDERED; @@ -151,14 +160,23 @@ static void arm_mmu_map_section(addr_t paddr, addr_t vaddr, uint flags) arm_kernel_translation_table[index] = (paddr & ~(MB-1)) | (MMU_MEMORY_DOMAIN_MEM << 5) | MMU_MEMORY_L1_DESCRIPTOR_SECTION | flags; } +static void arm_mmu_unmap_l1_entry(uint32_t index) +{ + DEBUG_ASSERT(index < countof(arm_kernel_translation_table)); + + arm_kernel_translation_table[index] = 0; + DSB; + arm_invalidate_tlb_mva_no_barrier((vaddr_t)index * SECTION_SIZE); +} + static void arm_mmu_unmap_section(addr_t vaddr) { DEBUG_ASSERT(IS_SECTION_ALIGNED(vaddr)); + arm_mmu_unmap_l1_entry(vaddr / SECTION_SIZE); +} - uint index = vaddr / SECTION_SIZE; - arm_kernel_translation_table[index] = 0; - - arm_invalidate_tlb_mva(vaddr); +void arm_mmu_early_init(void) +{ } void arm_mmu_init(void) @@ -180,6 +198,7 @@ void arm_mmu_init(void) } map++; } + arm_after_invalidate_tlb_barrier(); } void arch_disable_mmu(void) @@ -211,6 +230,8 @@ status_t arch_mmu_query(vaddr_t vaddr, paddr_t *paddr, uint *flags) if (flags) { *flags = 0; + if (tt_entry & MMU_MEMORY_L1_SECTION_NON_SECURE) + *flags |= ARCH_MMU_FLAG_NS; switch (tt_entry & MMU_MEMORY_L1_TYPE_MASK) { case MMU_MEMORY_L1_TYPE_STRONGLY_ORDERED: *flags |= ARCH_MMU_FLAG_UNCACHED; @@ -256,6 +277,9 @@ status_t arch_mmu_query(vaddr_t vaddr, paddr_t *paddr, uint *flags) if (flags) { *flags = 0; + /* NS flag is only present on L1 entry */ + if (tt_entry & MMU_MEMORY_L1_SECTION_NON_SECURE) + *flags |= ARCH_MMU_FLAG_NS; switch (l2_entry & MMU_MEMORY_L2_TYPE_MASK) { case MMU_MEMORY_L2_TYPE_STRONGLY_ORDERED: *flags |= ARCH_MMU_FLAG_UNCACHED; @@ -291,10 +315,138 @@ status_t arch_mmu_query(vaddr_t vaddr, paddr_t *paddr, uint *flags) return NO_ERROR; } + +/* + * We allow up to 4 adjacent L1 entries to point within the same memory page + * allocated for L2 page tables. + * + * L1: | 0 | 1 | 2 | 3 | .... 
| N+0 | N+1 | N+2 | N+3 | + * L2: [ 0 | .....[ (N/4) | + */ +#define L1E_PER_PAGE 4 + +static status_t get_l2_table(uint32_t l1_index, paddr_t *ppa) +{ + status_t ret; + paddr_t pa; + uint32_t tt_entry; + + DEBUG_ASSERT(ppa); + + /* lookup an existing l2 pagetable */ + for(uint i = 0; i < L1E_PER_PAGE; i++) { + tt_entry = arm_kernel_translation_table[ROUNDDOWN(l1_index, L1E_PER_PAGE) + i]; + if ((tt_entry & MMU_MEMORY_L1_DESCRIPTOR_MASK) + == MMU_MEMORY_L1_DESCRIPTOR_PAGE_TABLE) { + *ppa = (paddr_t)ROUNDDOWN(MMU_MEMORY_L1_PAGE_TABLE_ADDR(tt_entry), PAGE_SIZE) + + (PAGE_SIZE / L1E_PER_PAGE) * (l1_index & (L1E_PER_PAGE-1)); + return NO_ERROR; + } + } + + /* not found: allocate it */ + uint32_t *l2_va = pmm_alloc_kpage(); + if (!l2_va) + return ERR_NO_MEMORY; + + /* wipe it clean to set no access */ + memset(l2_va, 0, PAGE_SIZE); + + /* get physical address */ + ret = arm_vtop((vaddr_t)l2_va, &pa); + ASSERT(!ret); + ASSERT(paddr_to_kvaddr(pa)); + + DEBUG_ASSERT(IS_PAGE_ALIGNED((vaddr_t)l2_va)); + DEBUG_ASSERT(IS_PAGE_ALIGNED(pa)); + + *ppa = pa + (PAGE_SIZE / L1E_PER_PAGE) * (l1_index & (L1E_PER_PAGE-1)); + + LTRACEF("allocated pagetable at %p, pa 0x%lx, pa 0x%lx\n", l2_va, pa, *ppa); + return NO_ERROR; +} + + +vm_page_t *address_to_page(paddr_t addr); // move to common + +static void put_l2_table(uint32_t l1_index, paddr_t l2_pa) +{ + /* check if any l1 entry points to this l2 table */ + for (uint i = 0; i < L1E_PER_PAGE; i++) { + uint32_t tt_entry = arm_kernel_translation_table[ROUNDDOWN(l1_index, L1E_PER_PAGE) + i]; + if ((tt_entry & MMU_MEMORY_L1_DESCRIPTOR_MASK) + == MMU_MEMORY_L1_DESCRIPTOR_PAGE_TABLE) { + return; + } + } + + /* we can free this l2 table */ + vm_page_t *page = address_to_page(l2_pa); + if (!page) + panic("bad page table paddr 0x%lx\n", l2_pa); + + LTRACEF("freeing pagetable at 0x%lx\n", l2_pa); + pmm_free_page(page); +} + +#if WITH_ARCH_MMU_PICK_SPOT + +static inline bool are_regions_compatible(uint new_region_flags, + uint adjacent_region_flags) +{ + /* + * Two regions are compatible if NS flag matches. 
+ */ + uint mask = ARCH_MMU_FLAG_NS; + + if ((new_region_flags & mask) == (adjacent_region_flags & mask)) + return true; + + return false; +} + + +vaddr_t arch_mmu_pick_spot(vaddr_t base, uint prev_region_flags, + vaddr_t end, uint next_region_flags, + vaddr_t align, size_t size, uint flags) +{ + LTRACEF("base = 0x%lx, end=0x%lx, align=%ld, size=%zd, flags=0x%x\n", + base, end, align, size, flags); + + vaddr_t spot; + + if (align >= SECTION_SIZE || + are_regions_compatible(flags, prev_region_flags)) { + spot = ALIGN(base, align); + } else { + spot = ALIGN(base, SECTION_SIZE); + } + + vaddr_t spot_end = spot + size - 1; + if (spot_end < spot || spot_end > end) + return end; /* wrapped around or it does not fit */ + + if ((spot_end / SECTION_SIZE) == (end / SECTION_SIZE)) { + if (!are_regions_compatible(flags, next_region_flags)) + return end; + } + + return spot; +} +#endif /* WITH_ARCH_MMU_PICK_SPOT */ + + int arch_mmu_map(vaddr_t vaddr, paddr_t paddr, uint count, uint flags) { LTRACEF("vaddr 0x%lx paddr 0x%lx count %u flags 0x%x\n", vaddr, paddr, count, flags); +#if !WITH_ARCH_MMU_PICK_SPOT + if (flags & ARCH_MMU_FLAG_NS) { + /* WITH_ARCH_MMU_PICK_SPOT is required to support NS memory */ + panic("NS mem is not supported\n"); + } +#endif + /* paddr and vaddr must be aligned */ DEBUG_ASSERT(IS_PAGE_ALIGNED(vaddr)); DEBUG_ASSERT(IS_PAGE_ALIGNED(paddr)); @@ -332,33 +484,16 @@ int arch_mmu_map(vaddr_t vaddr, paddr_t paddr, uint count, uint flags) PANIC_UNIMPLEMENTED; break; case MMU_MEMORY_L1_DESCRIPTOR_INVALID: { - /* alloc and put in a L2 page table */ - uint32_t *l2_table = pmm_alloc_kpage(); - if (!l2_table) { + paddr_t l2_pa = 0; + if (get_l2_table(l1_index, &l2_pa) != NO_ERROR) { TRACEF("failed to allocate pagetable\n"); goto done; } + tt_entry = l2_pa | MMU_MEMORY_L1_DESCRIPTOR_PAGE_TABLE; + if (flags & ARCH_MMU_FLAG_NS) + tt_entry |= MMU_MEMORY_L1_PAGETABLE_NON_SECURE; - /* get physical address */ - paddr_t l2_pa = 0; - arm_vtop((vaddr_t)l2_table, &l2_pa); - - LTRACEF("allocated pagetable at %p, pa 0x%lx\n", l2_table, l2_pa); - - DEBUG_ASSERT(IS_PAGE_ALIGNED((vaddr_t)l2_table)); - DEBUG_ASSERT(IS_PAGE_ALIGNED(l2_pa)); - - /* zero the L2 table and add it to the L1 table */ - memset(l2_table, 0, PAGE_SIZE); - - /* put it in the adjacent 4 entries filling in 1K page tables at once */ - l1_index = ROUNDDOWN(l1_index, 4); - arm_kernel_translation_table[l1_index] = l2_pa | MMU_MEMORY_L1_DESCRIPTOR_PAGE_TABLE; - arm_kernel_translation_table[l1_index + 1] = (l2_pa + 1024) | MMU_MEMORY_L1_DESCRIPTOR_PAGE_TABLE; - arm_kernel_translation_table[l1_index + 2] = (l2_pa + 2048) | MMU_MEMORY_L1_DESCRIPTOR_PAGE_TABLE; - arm_kernel_translation_table[l1_index + 3] = (l2_pa + 3072) | MMU_MEMORY_L1_DESCRIPTOR_PAGE_TABLE; - tt_entry = arm_kernel_translation_table[l1_index]; - + arm_kernel_translation_table[l1_index] = tt_entry; /* fallthrough */ } case MMU_MEMORY_L1_DESCRIPTOR_PAGE_TABLE: { @@ -373,14 +508,14 @@ int arch_mmu_map(vaddr_t vaddr, paddr_t paddr, uint count, uint flags) uint arch_flags = mmu_flags_to_l2_arch_flags(flags) | MMU_MEMORY_L2_DESCRIPTOR_SMALL_PAGE; - /* add the entry */ uint l2_index = (vaddr % SECTION_SIZE) / PAGE_SIZE; - l2_table[l2_index] = paddr | arch_flags; - - count--; - mapped++; - vaddr += PAGE_SIZE; - paddr += PAGE_SIZE; + do { + l2_table[l2_index++] = paddr | arch_flags; + count--; + mapped++; + vaddr += PAGE_SIZE; + paddr += PAGE_SIZE; + } while (count && (l2_index != (SECTION_SIZE / PAGE_SIZE))); break; } default: @@ -390,6 +525,7 @@ int arch_mmu_map(vaddr_t vaddr, 
paddr_t paddr, uint count, uint flags) } done: + DSB; return mapped; } @@ -399,15 +535,21 @@ int arch_mmu_unmap(vaddr_t vaddr, uint count) if (!IS_PAGE_ALIGNED(vaddr)) return ERR_INVALID_ARGS; + LTRACEF("vaddr 0x%lx count %u\n", vaddr, count); + int unmapped = 0; while (count > 0) { uint l1_index = vaddr / SECTION_SIZE; uint32_t tt_entry = arm_kernel_translation_table[l1_index]; switch (tt_entry & MMU_MEMORY_L1_DESCRIPTOR_MASK) { - case MMU_MEMORY_L1_DESCRIPTOR_INVALID: + case MMU_MEMORY_L1_DESCRIPTOR_INVALID: { /* this top level page is not mapped, move on to the next one */ - goto next_page; + uint page_cnt = MIN((SECTION_SIZE - (vaddr % SECTION_SIZE)) / PAGE_SIZE, count); + vaddr += page_cnt * PAGE_SIZE; + count -= page_cnt; + break; + } case MMU_MEMORY_L1_DESCRIPTOR_SECTION: if (IS_SECTION_ALIGNED(vaddr) && count >= SECTION_SIZE / PAGE_SIZE) { /* we're asked to remove at least all of this section, so just zero it out */ @@ -417,25 +559,60 @@ int arch_mmu_unmap(vaddr_t vaddr, uint count) vaddr += SECTION_SIZE; count -= SECTION_SIZE / PAGE_SIZE; unmapped += SECTION_SIZE / PAGE_SIZE; - goto next; } else { // XXX handle unmapping just part of a section // will need to convert to a L2 table and then unmap the parts we are asked to PANIC_UNIMPLEMENTED; } break; + case MMU_MEMORY_L1_DESCRIPTOR_PAGE_TABLE: { + uint32_t *l2_table = paddr_to_kvaddr(MMU_MEMORY_L1_PAGE_TABLE_ADDR(tt_entry)); + uint page_idx = (vaddr % SECTION_SIZE) / PAGE_SIZE; + uint page_cnt = MIN((SECTION_SIZE / PAGE_SIZE) - page_idx, count); + + /* unmap page run */ + for (uint i = 0; i < page_cnt; i++) { + l2_table[page_idx++] = 0; + } + DSB; + + /* invalidate tlb */ + for (uint i = 0; i < page_cnt; i++) { + arm_invalidate_tlb_mva_no_barrier(vaddr); + vaddr += PAGE_SIZE; + } + count -= page_cnt; + unmapped += page_cnt; + + /* + * Check if all pages related to this l1 entry are deallocated. + * We only need to check pages that we did not clear above starting + * from page_idx and wrapped around SECTION. + */ + page_cnt = (SECTION_SIZE / PAGE_SIZE) - page_cnt; + while (page_cnt) { + if (page_idx == (SECTION_SIZE / PAGE_SIZE)) + page_idx = 0; + if (l2_table[page_idx++]) + break; + page_cnt--; + } + if (!page_cnt) { + /* we can kill l1 entry */ + arm_mmu_unmap_l1_entry(l1_index); + + /* try to free l2 page itself */ + put_l2_table(l1_index, MMU_MEMORY_L1_PAGE_TABLE_ADDR(tt_entry)); + } + break; + } + default: // XXX not implemented supersections or L2 tables PANIC_UNIMPLEMENTED; } - -next_page: - vaddr += PAGE_SIZE; - count--; -next: - ; } - + arm_after_invalidate_tlb_barrier(); return unmapped; } diff --git a/arch/arm/arm/mp.c b/arch/arm/arm/mp.c new file mode 100644 index 00000000..3e4fa95e --- /dev/null +++ b/arch/arm/arm/mp.c @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2014 Travis Geiselbrecht + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files + * (the "Software"), to deal in the Software without restriction, + * including without limitation the rights to use, copy, modify, merge, + * publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, + * subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ +#include + +#include +#include +#include +#include +#include + +#if WITH_DEV_INTERRUPT_ARM_GIC +#include +#elif PLATFORM_BCM2835 +/* bcm2835 has a weird custom interrupt controller for MP */ +extern void bcm2835_send_ipi(uint irq, uint cpu_mask); +#else +#error need other implementation of interrupt controller that can ipi +#endif + +#define LOCAL_TRACE 0 + +#define GIC_IPI_BASE (14) + +status_t arch_mp_send_ipi(mp_cpu_mask_t target, mp_ipi_t ipi) +{ + LTRACEF("target 0x%x, ipi %u\n", target, ipi); + +#if WITH_DEV_INTERRUPT_ARM_GIC + uint gic_ipi_num = ipi + GIC_IPI_BASE; + + /* filter out targets outside of the range of cpus we care about */ + target &= ((1UL << SMP_MAX_CPUS) - 1); + if (target != 0) { + LTRACEF("target 0x%x, gic_ipi %u\n", target, gic_ipi_num); + u_int flags = 0; +#if WITH_LIB_SM + flags |= ARM_GIC_SGI_FLAG_NS; +#endif + arm_gic_sgi(gic_ipi_num, flags, target); + } +#elif PLATFORM_BCM2835 + /* filter out targets outside of the range of cpus we care about */ + target &= ((1UL << SMP_MAX_CPUS) - 1); + if (target != 0) { + bcm2835_send_ipi(ipi, target); + } +#endif + + return NO_ERROR; +} + +enum handler_return arm_ipi_generic_handler(void *arg) +{ + LTRACEF("cpu %u, arg %p\n", arch_curr_cpu_num(), arg); + + return INT_NO_RESCHEDULE; +} + +enum handler_return arm_ipi_reschedule_handler(void *arg) +{ + LTRACEF("cpu %u, arg %p\n", arch_curr_cpu_num(), arg); + + return mp_mbx_reschedule_irq(); +} + +void arch_mp_init_percpu(void) +{ +#if WITH_DEV_INTERRUPT_ARM_GIC + register_int_handler(MP_IPI_GENERIC + GIC_IPI_BASE, &arm_ipi_generic_handler, 0); + register_int_handler(MP_IPI_RESCHEDULE + GIC_IPI_BASE, &arm_ipi_reschedule_handler, 0); +#endif +} + diff --git a/arch/arm/arm/ops.S b/arch/arm/arm/ops.S index cae2baed..7e75a952 100644 --- a/arch/arm/arm/ops.S +++ b/arch/arm/arm/ops.S @@ -93,7 +93,7 @@ FUNCTION(_atomic_or) mov r0, r12 bx lr -FUNCTION(spin_trylock) +FUNCTION(arch_spin_trylock) mov r2, r0 mov r1, #1 ldrex r0, [r2] @@ -102,7 +102,7 @@ FUNCTION(spin_trylock) dmb bx lr -FUNCTION(spin_lock) +FUNCTION(arch_spin_lock) mov r1, #1 1: ldrex r2, [r0] @@ -114,7 +114,7 @@ FUNCTION(spin_lock) dmb bx lr -FUNCTION(spin_unlock) +FUNCTION(arch_spin_unlock) mov r1, #0 dmb str r1, [r0] diff --git a/arch/arm/arm/start.S b/arch/arm/arm/start.S index f791daa5..8a2419a2 100644 --- a/arch/arm/arm/start.S +++ b/arch/arm/arm/start.S @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008-2013 Travis Geiselbrecht + * Copyright (c) 2008-2015 Travis Geiselbrecht * * Permission is hereby granted, free of charge, to any person obtaining * a copy of this software and associated documentation files @@ -36,6 +36,9 @@ _start: b arm_reserved b arm_irq b arm_fiq +#if WITH_SMP + b arm_reset +#endif .weak platform_reset platform_reset: @@ -43,29 +46,42 @@ platform_reset: .globl arm_reset arm_reset: - /* do some cpu setup */ -#if ARM_WITH_CP15 + /* do some early cpu setup */ mrc p15, 0, r12, c1, c0, 0 - /* XXX this is currently for arm926, revist with armv6 cores */ - /* new thumb behavior, low exception vectors, i/d cache 
disable, mmu disabled */ - bic r12, #(1<<15| 1<<13 | 1<<12) - bic r12, #(1<<2 | 1<<1 | 1<<0) -#if ARM_ARCH_LEVEL < 6 - /* enable alignment faults on pre-ARMv6 hardware. On v6+, - * GCC is free to generate unaligned accesses. - */ - orr r12, #(1<<1) -#endif + /* i/d cache disable, mmu disabled */ + bic r12, #(1<<12) + bic r12, #(1<<2 | 1<<0) +#if WITH_KERNEL_VM + /* enable caches so atomics and spinlocks work */ + orr r12, r12, #(1<<12) + orr r12, r12, #(1<<2) +#endif // WITH_KERNEL_VM mcr p15, 0, r12, c1, c0, 0 -#endif + + /* calculate the physical offset from our eventual virtual location */ +.Lphys_offset: + ldr r4, =.Lphys_offset + adr r11, .Lphys_offset + sub r11, r11, r4 + +#if WITH_SMP + /* figure out our cpu number */ + mrc p15, 0, r12, c0, c0, 5 /* read MPIDR */ + + /* mask off the bottom bits to test cluster number:cpu number */ + ubfx r12, r12, #0, #SMP_CPU_ID_BITS + + /* if we're not cpu 0:0, fall into a trap and wait */ + teq r12, #0 + movne r0, r12 + bne arm_secondary_setup +#endif // WITH_SMP #if WITH_CPU_EARLY_INIT /* call platform/arch/etc specific init code */ bl __cpu_early_init -#endif +#endif // WITH_CPU_EARLY_INIT -#if WITH_KERNEL_VM -__relocate_start: #if WITH_NO_PHYS_RELOCATION /* assume that image is properly loaded in physical memory */ #else @@ -73,7 +89,7 @@ __relocate_start: adr r4, _start /* this emits sub r4, pc, #constant */ ldr r5, =(MEMBASE + KERNEL_LOAD_OFFSET) /* calculate the binary's physical load address */ subs r12, r4, r5 /* calculate the delta between where we're loaded and the proper spot */ - beq .Lsetup_mmu + beq .Lrelocate_done /* we need to relocate ourselves to the proper spot */ ldr r6, =__data_end @@ -89,25 +105,22 @@ __relocate_start: /* we're relocated, jump to the right address */ sub pc, r12 - nop -#endif + nop /* skipped in the add to pc */ -__mmu_start: + /* recalculate the physical offset */ + sub r11, r11, r12 + +.Lrelocate_done: +#endif // !WITH_NO_PHYS_RELOCATION + +#if WITH_KERNEL_VM .Lsetup_mmu: + /* set up the mmu according to mmu_initial_mappings */ - /* calculate our physical to virtual offset */ - mov r12, pc - ldr r5, =.Laddr1 -.Laddr1: - sub r12, r5 - - /* r12 now holds the offset from virtual to physical: - * virtual + r12 = physical */ - /* load the base of the translation table and clear the table */ ldr r4, =arm_kernel_translation_table - add r4, r12 + add r4, r4, r11 /* r4 = physical address of translation table */ mov r5, #0 @@ -122,13 +135,19 @@ __mmu_start: /* load the address of the mmu_initial_mappings table and start processing */ ldr r5, =mmu_initial_mappings - add r5, r12 + add r5, r5, r11 /* r5 = physical address of mmu initial mapping table */ .Linitial_mapping_loop: ldmia r5!, { r6-r10 } /* r6 = phys, r7 = virt, r8 = size, r9 = flags, r10 = name */ + /* round size up to 1MB alignment */ + ubfx r10, r6, #0, #20 + add r8, r8, r10 + add r8, r8, #(1 << 20) + sub r8, r8, #1 + /* mask all the addresses and sizes to 1MB boundaries */ lsr r6, #20 /* r6 = physical address / 1MB */ lsr r7, #20 /* r7 = virtual address / 1MB */ @@ -148,11 +167,11 @@ __mmu_start: /* r10 = mmu entry flags */ 0: - orr r11, r10, r6, lsl #20 - /* r11 = phys addr | flags */ + orr r12, r10, r6, lsl #20 + /* r12 = phys addr | flags */ /* store into appropriate translation table entry */ - str r11, [r4, r7, lsl #2] + str r12, [r4, r7, lsl #2] /* loop until we're done */ add r6, #1 @@ -164,81 +183,49 @@ __mmu_start: .Linitial_mapping_done: - /* set up the mmu */ +#if MMU_WITH_TRAMPOLINE + /* move arm_kernel_translation_table address to r8 and + * 
set cacheable attributes on translation walk + */ + orr r8, r4, #MMU_TTBRx_FLAGS - /* Invalidate TLB */ - mov r12, #0 - mcr p15, 0, r12, c8, c7, 0 - isb + /* Prepare tt_trampoline page table */ + /* Calculate pagetable physical addresses */ + ldr r4, =tt_trampoline /* r4 = tt_trampoline vaddr */ + add r4, r4, r11 /* r4 = tt_trampoline paddr */ - /* Write 0 to TTBCR */ - mcr p15, 0, r12, c2, c0, 2 - isb - - /* set cacheable attributes on translation walk */ - /* (SMP extensions) non-shareable, inner write-back write-allocate */ - orr r4, #(1<<6 | 0<<1) - /* outer write-back write-allocate */ - orr r4, #(1<<3) - - /* Write ttbr with phys addr of the translation table */ - mcr p15, 0, r4, c2, c0, 0 - isb - - /* Write DACR */ - mov r12, #0x1 - mcr p15, 0, r12, c3, c0, 0 - isb - - /* Read SCTLR */ - mrc p15, 0, r12, c1, c0, 0 - - /* Disable TRE/AFE */ - bic r12, #(1<<29 | 1<<28) - - /* Turn on the MMU */ - orr r12, #0x1 - - /* Write back SCTLR */ - mcr p15, 0, r12, c1, c0, 0 - isb - - /* Jump to virtual code address */ - ldr pc, =1f + /* Zero tt_trampoline translation tables */ + mov r6, #0 + mov r7, #0 1: + str r7, [r4, r6, lsl#2] + add r6, #1 + cmp r6, #0x1000 + blt 1b - /* Invalidate TLB */ - mov r12, #0 - mcr p15, 0, r12, c8, c7, 0 - isb + /* Setup 1M section mapping at + * phys -> phys and + * virt -> phys + */ + lsr r6, pc, #20 /* r6 = paddr index */ + ldr r7, =MMU_KERNEL_L1_PTE_FLAGS + add r7, r7, r6, lsl #20 /* r7 = pt entry */ -#else - /* see if we need to relocate */ - mov r4, pc - sub r4, r4, #(.Laddr - _start) -.Laddr: - ldr r5, =_start - cmp r4, r5 - beq .Lstack_setup + str r7, [r4, r6, lsl #2] /* tt_trampoline[paddr index] = pt entry */ - /* we need to relocate ourselves to the proper spot */ - ldr r6, =__data_end + rsb r6, r11, r6, lsl #20 /* r6 = vaddr */ + str r7, [r4, r6, lsr #(20 - 2)] /* tt_trampoline[vaddr index] = pt entry */ +#endif // MMU_WITH_TRAMPOLINE -.Lrelocate_loop: - ldr r7, [r4], #4 - str r7, [r5], #4 - cmp r5, r6 - bne .Lrelocate_loop - - /* we're relocated, jump to the right address */ - ldr r4, =.Lstack_setup - bx r4 -#endif + /* set up the mmu */ + bl .Lmmu_setup +#endif // WITH_KERNEL_VM /* at this point we're running at our final location in virtual memory (if enabled) */ .Lstack_setup: /* set up the stack for irq, fiq, abort, undefined, system/user, and lastly supervisor mode */ - ldr r12, =abort_stack_top + ldr r12, =abort_stack + add r12, #ARCH_DEFAULT_STACK_SIZE cpsid i,#0x12 /* irq */ mov sp, r12 @@ -287,18 +274,162 @@ __mmu_start: bl lk_main b . 
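For reference, the MMU_WITH_TRAMPOLINE path prepared above maps the 1MB section the boot code is currently executing from twice in the throwaway tt_trampoline table: once at its physical address, so the instruction stream survives the instant the MMU is switched on, and once at its final virtual address; the .Lmmu_setup routine added below then switches TTBR0 to the real kernel translation table once running at virtual addresses. A C rendering of what that assembly builds, as a sketch only; the function and parameter names are illustrative, not part of the patch:

    #include <stdint.h>
    #include <string.h>

    #define MB (1u << 20)

    /* tt: the 4096-entry L1 trampoline table, pc_paddr: physical address the
     * boot code is executing at, phys_offset: r11 (physical minus virtual
     * load address), l1_pte_flags: MMU_KERNEL_L1_PTE_FLAGS */
    static void build_trampoline(uint32_t tt[4096], uint32_t pc_paddr,
                                 uint32_t phys_offset, uint32_t l1_pte_flags)
    {
        memset(tt, 0, 4096 * sizeof(uint32_t));              /* zero the table */

        uint32_t paddr_index = pc_paddr / MB;                /* 1MB section we run in */
        uint32_t pte = (paddr_index * MB) | l1_pte_flags;    /* section descriptor */

        tt[paddr_index] = pte;                               /* phys -> phys identity map */
        tt[(paddr_index * MB - phys_offset) / MB] = pte;     /* virt -> same phys section */
    }
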
+#if WITH_KERNEL_VM + /* per cpu mmu setup, shared between primary and secondary cpus + args: + r4 == translation table physical + r8 == final translation table physical (if using trampoline) + */ +.Lmmu_setup: + /* Invalidate TLB */ + mov r12, #0 + mcr p15, 0, r12, c8, c7, 0 + isb + + /* Write 0 to TTBCR */ + mcr p15, 0, r12, c2, c0, 2 + isb + + /* set cacheable attributes on translation walk */ + /* inner write-back write-allocate */ + orr r12, r4, #(1<<6 | 0<<1) + /* outer write-back write-allocate */ + orr r12, #(1<<3) +#if WITH_SMP + /* (SMP extensions) shareable, outer shareable */ + orr r12, #(1<<1 | 0<<5) +#endif + + /* Write ttbr with phys addr of the translation table */ + mcr p15, 0, r12, c2, c0, 0 + isb + + /* Write DACR */ + mov r12, #0x1 + mcr p15, 0, r12, c3, c0, 0 + isb + + /* Read SCTLR into r12 */ + mrc p15, 0, r12, c1, c0, 0 + + /* Disable TRE/AFE */ + bic r12, #(1<<29 | 1<<28) + + /* Turn on the MMU */ + orr r12, #0x1 + + /* Write back SCTLR */ + mcr p15, 0, r12, c1, c0, 0 + isb + + /* Jump to virtual code address */ + ldr pc, =1f +1: + +#if MMU_WITH_TRAMPOLINE + /* Switch to main page table */ + mcr p15, 0, r8, c2, c0, 0 + isb +#endif + + /* Invalidate TLB */ + mov r12, #0 + mcr p15, 0, r12, c8, c7, 0 + isb + + /* assume lr was in physical memory, adjust it before returning */ + sub lr, r11 + bx lr +#endif + +#if WITH_SMP + /* secondary cpu entry point */ + /* r0 holds cpu number */ + /* r11 hold phys offset */ +FUNCTION(arm_secondary_setup) + /* all other cpus, trap and wait to be released */ +1: + wfe + ldr r12, =arm_boot_cpu_lock + add r12, r12, r11 + ldr r12, [r12] + cmp r12, #0 + bne 1b + + and r1, r0, #0xff + cmp r1, #(1 << SMP_CPU_CLUSTER_SHIFT) + bge unsupported_cpu_trap + bic r0, r0, #0xff + orr r0, r1, r0, LSR #(8 - SMP_CPU_CLUSTER_SHIFT) + + cmp r0, #SMP_MAX_CPUS + bge unsupported_cpu_trap + mov r5, r0 /* save cpu num */ + + /* set up the stack for irq, fiq, abort, undefined, system/user, and lastly supervisor mode */ + ldr r1, =abort_stack + mov r2, #ARCH_DEFAULT_STACK_SIZE + add r0, #1 + mul r2, r2, r0 + add r1, r2 + + cpsid i,#0x12 /* irq */ + mov sp, r1 + + cpsid i,#0x11 /* fiq */ + mov sp, r1 + + cpsid i,#0x17 /* abort */ + mov sp, r1 + + cpsid i,#0x1b /* undefined */ + mov sp, r1 + + cpsid i,#0x1f /* system */ + mov sp, r1 + + cpsid i,#0x13 /* supervisor */ + mov sp, r1 + +#if WITH_KERNEL_VM + /* load the physical base of the translation table and clear the table */ + ldr r4, =arm_kernel_translation_table + add r4, r4, r11 + +#if MMU_WITH_TRAMPOLINE + /* move arm_kernel_translation_table address to r8 and + * set cacheable attributes on translation walk + */ + orr r8, r4, #MMU_TTBRx_FLAGS + + /* Prepare tt_trampoline page table */ + /* Calculate pagetable physical addresses */ + ldr r4, =tt_trampoline /* r4 = tt_trampoline vaddr */ + add r4, r4, r11 /* r4 = tt_trampoline paddr */ +#endif + + /* set up the mmu on this cpu and switch to virtual memory */ + bl .Lmmu_setup +#endif + + /* stay in supervisor and call into arm arch code to continue setup */ + mov r0, r5 + bl arm_secondary_entry + + /* cpus above the number we claim to support get trapped here */ +unsupported_cpu_trap: + wfe + b unsupported_cpu_trap +#endif + .ltorg -.bss -.align 3 - /* the abort stack is for unrecoverable errors. - * also note the initial working stack is set to here. 
- * when the threading system starts up it'll switch to a new - * dynamically allocated stack, so we don't need it for very long - */ -LOCAL_DATA(abort_stack) - .skip 4096 -LOCAL_DATA(abort_stack_top) +#if WITH_KERNEL_VM && MMU_WITH_TRAMPOLINE +.section ".bss.prebss.translation_table" +.align 14 +DATA(tt_trampoline) + .skip 16384 +#endif .data .align 2 diff --git a/arch/arm/arm/thread.c b/arch/arm/arm/thread.c index ec377742..3e605b48 100644 --- a/arch/arm/arm/thread.c +++ b/arch/arm/arm/thread.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008 Travis Geiselbrecht + * Copyright (c) 2008-2014 Travis Geiselbrecht * * Permission is hereby granted, free of charge, to any person obtaining * a copy of this software and associated documentation files @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -49,8 +50,9 @@ static void initial_thread_func(void) // dprintf("initial_thread_func: thread %p calling %p with arg %p\n", current_thread, current_thread->entry, current_thread->arg); // dump_thread(current_thread); - /* exit the implicit critical section we're within */ - exit_critical_section(); + /* release the thread lock that was implicitly held across the reschedule */ + spin_unlock(&thread_lock); + arch_enable_ints(); thread_t *ct = get_current_thread(); ret = ct->entry(ct->arg); @@ -85,7 +87,7 @@ void arch_thread_initialize(thread_t *t) void arch_context_switch(thread_t *oldthread, thread_t *newthread) { -// dprintf("arch_context_switch: old %p (%s), new %p (%s)\n", oldthread, oldthread->name, newthread, newthread->name); +// TRACEF("arch_context_switch: cpu %u old %p (%s), new %p (%s)\n", arch_curr_cpu_num(), oldthread, oldthread->name, newthread, newthread->name); #if ARM_WITH_VFP arm_fpu_thread_swap(oldthread, newthread); #endif @@ -94,3 +96,11 @@ void arch_context_switch(thread_t *oldthread, thread_t *newthread) } +void arch_dump_thread(thread_t *t) +{ + if (t->state != THREAD_RUNNING) { + dprintf(INFO, "\tarch: "); + dprintf(INFO, "sp 0x%lx\n", t->arch.sp); + } +} + diff --git a/arch/arm/include/arch/arch_ops.h b/arch/arm/include/arch/arch_ops.h index b7d181b5..aa2ea9b8 100644 --- a/arch/arm/include/arch/arch_ops.h +++ b/arch/arm/include/arch/arch_ops.h @@ -20,8 +20,7 @@ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ -#ifndef __ARCH_ARM_OPS_H -#define __ARCH_ARM_OPS_H +#pragma once #ifndef ASSEMBLY @@ -196,7 +195,7 @@ static inline int atomic_cmpxchg(volatile int *ptr, int oldval, int newval) "ldrex %[old], [%[ptr]]\n" "mov %[test], #0\n" "teq %[old], %[oldval]\n" -#if ARM_ISA_ARMV7M +#if (ARM_ISA_ARMV7M || __thumb__) "bne 0f\n" "strex %[test], %[newval], [%[ptr]]\n" "0:\n" @@ -233,6 +232,19 @@ static inline uint32_t arch_cycle_count(void) #endif } +#if WITH_SMP && ARM_ISA_ARMV7 +static inline uint arch_curr_cpu_num(void) +{ + uint32_t mpidr = arm_read_mpidr(); + return ((mpidr & ((1U << SMP_CPU_ID_BITS) - 1)) >> 8 << SMP_CPU_CLUSTER_SHIFT) | (mpidr & 0xff); +} +#else +static inline uint arch_curr_cpu_num(void) +{ + return 0; +} +#endif + /* defined in kernel/thread.h */ #if !ARM_ISA_ARMV7M @@ -305,63 +317,5 @@ static inline uint32_t arch_cycle_count(void) { return _arch_cycle_count(); } #define smp_rmb() CF #endif -typedef unsigned long spin_lock_t; -void spin_lock(spin_lock_t *lock); /* interrupts should already be disabled */ -int spin_trylock(spin_lock_t *lock); /* Returns 0 on success, non-0 on failure */ -void spin_unlock(spin_lock_t *lock); - -typedef ulong spin_lock_saved_state_t; -typedef ulong spin_lock_save_flags_t; - -enum { - /* Possible future flags: - * SPIN_LOCK_FLAG_PMR_MASK = 0x000000ff, - * SPIN_LOCK_FLAG_PREEMPTION = 0x10000000, - * SPIN_LOCK_FLAG_SET_PMR = 0x20000000, - */ - - /* ARM specific flags */ - SPIN_LOCK_FLAG_IRQ = 0x40000000, - SPIN_LOCK_FLAG_FIQ = 0x80000000, /* Do not use unless IRQs are already disabled */ - SPIN_LOCK_FLAG_IRQ_FIQ = SPIN_LOCK_FLAG_IRQ | SPIN_LOCK_FLAG_FIQ, - - /* Generic flags */ - SPIN_LOCK_FLAG_INTERRUPTS = SPIN_LOCK_FLAG_IRQ, -}; - -enum { - /* private */ - SPIN_LOCK_STATE_RESTORE_IRQ = 1, - SPIN_LOCK_STATE_RESTORE_FIQ = 2, -}; - -static inline void -spin_lock_save(spin_lock_t *lock, spin_lock_saved_state_t *statep, spin_lock_save_flags_t flags) -{ - spin_lock_saved_state_t state = 0; - if ((flags & SPIN_LOCK_FLAG_IRQ) && !arch_ints_disabled()) { - state |= SPIN_LOCK_STATE_RESTORE_IRQ; - arch_disable_ints(); - } - if ((flags & SPIN_LOCK_FLAG_FIQ) && !arch_fiqs_disabled()) { - state |= SPIN_LOCK_STATE_RESTORE_FIQ; - arch_disable_fiqs(); - } - *statep = state; - spin_lock(lock); -} - -static inline void -spin_unlock_restore(spin_lock_t *lock, spin_lock_saved_state_t old_state, spin_lock_save_flags_t flags) -{ - spin_unlock(lock); - if ((flags & SPIN_LOCK_FLAG_FIQ) && (old_state & SPIN_LOCK_STATE_RESTORE_FIQ)) - arch_enable_fiqs(); - if ((flags & SPIN_LOCK_FLAG_IRQ) && (old_state & SPIN_LOCK_STATE_RESTORE_IRQ)) - arch_enable_ints(); -} - #endif // ASSEMBLY -#endif - diff --git a/arch/arm/include/arch/arm.h b/arch/arm/include/arch/arm.h index 38d137ef..47c7824d 100644 --- a/arch/arm/include/arch/arm.h +++ b/arch/arm/include/arch/arm.h @@ -47,6 +47,7 @@ __BEGIN_CDECLS #else #error unhandled arm isa #endif +#define NOP __asm__ volatile("nop"); void arm_context_switch(vaddr_t *old_sp, vaddr_t new_sp); @@ -98,6 +99,7 @@ struct arm_fault_frame { #define MODE_SYS 0x1f struct arm_mode_regs { + uint32_t usr_r13, usr_r14; uint32_t fiq_r13, fiq_r14; uint32_t irq_r13, irq_r14; uint32_t svc_r13, svc_r14; @@ -115,6 +117,12 @@ static inline __ALWAYS_INLINE uint32_t arm_read_##reg(void) { \ return val; \ } \ \ +static inline __ALWAYS_INLINE uint32_t arm_read_##reg##_relaxed(void) { \ + uint32_t val; \ + __asm__("mrc " #cp ", " #op1 ", %0, " #c1 "," #c2 "," #op2 : "=r" (val)); \ + return val; \ +} \ +\ static inline __ALWAYS_INLINE void arm_write_##reg(uint32_t val) 
{ \ __asm__ volatile("mcr " #cp ", " #op1 ", %0, " #c1 "," #c2 "," #op2 :: "r" (val)); \ ISB; \ @@ -156,6 +164,7 @@ GEN_CP15_REG_FUNCS(tpidrprw, 0, c13, c0, 4); GEN_CP15_REG_FUNCS(midr, 0, c0, c0, 0); GEN_CP15_REG_FUNCS(mpidr, 0, c0, c0, 5); GEN_CP15_REG_FUNCS(vbar, 0, c12, c0, 0); +GEN_CP15_REG_FUNCS(cbar, 4, c15, c0, 0); GEN_CP15_REG_FUNCS(ats1cpr, 0, c7, c8, 0); GEN_CP15_REG_FUNCS(ats1cpw, 0, c7, c8, 1); @@ -167,6 +176,11 @@ GEN_CP15_REG_FUNCS(ats12nsour, 0, c7, c8, 6); GEN_CP15_REG_FUNCS(ats12nsouw, 0, c7, c8, 7); GEN_CP15_REG_FUNCS(par, 0, c7, c4, 0); +/* Branch predictor invalidate */ +GEN_CP15_REG_FUNCS(bpiall, 0, c7, c5, 6); +GEN_CP15_REG_FUNCS(bpimva, 0, c7, c5, 7); +GEN_CP15_REG_FUNCS(bpiallis, 0, c7, c1, 6); + /* tlb registers */ GEN_CP15_REG_FUNCS(tlbiallis, 0, c8, c3, 0); GEN_CP15_REG_FUNCS(tlbimvais, 0, c8, c3, 1); @@ -183,6 +197,9 @@ GEN_CP15_REG_FUNCS(tlbimva, 0, c8, c7, 1); GEN_CP15_REG_FUNCS(tlbiasid, 0, c8, c7, 2); GEN_CP15_REG_FUNCS(tlbimvaa, 0, c8, c7, 3); +GEN_CP15_REG_FUNCS(l2ctlr, 1, c9, c0, 2); +GEN_CP15_REG_FUNCS(l2ectlr, 1, c9, c0, 3); + /* debug registers */ GEN_CP14_REG_FUNCS(dbddidr, 0, c0, c0, 0); GEN_CP14_REG_FUNCS(dbgdrar, 0, c1, c0, 0); diff --git a/arch/arm/include/arch/arm/mmu.h b/arch/arm/include/arch/arm/mmu.h index 4057f3a7..b32d267e 100644 --- a/arch/arm/include/arch/arm/mmu.h +++ b/arch/arm/include/arch/arm/mmu.h @@ -51,7 +51,9 @@ #define MMU_MEMORY_L1_TYPE_NORMAL_WRITE_THROUGH ((0x0 << 12) | (0x2 << 2)) #define MMU_MEMORY_L1_TYPE_NORMAL_WRITE_BACK_NO_ALLOCATE ((0x0 << 12) | (0x3 << 2)) #define MMU_MEMORY_L1_TYPE_NORMAL_WRITE_BACK_ALLOCATE ((0x1 << 12) | (0x3 << 2)) -#define MMU_MEMORY_L1_TYPE_MASK ((0x3 << 12) | (0x3 << 2)) +#define MMU_MEMORY_L1_TYPE_MASK ((0x7 << 12) | (0x3 << 2)) + +#define MMU_MEMORY_L1_TYPE_INNER_WRITE_BACK_ALLOCATE ((0x4 << 12) | (0x1 << 2)) /* C, B and TEX[2:0] encodings without TEX remap (for second level descriptors) */ /* TEX | CB */ @@ -62,7 +64,7 @@ #define MMU_MEMORY_L2_TYPE_NORMAL_WRITE_THROUGH ((0x0 << 6) | (0x2 << 2)) #define MMU_MEMORY_L2_TYPE_NORMAL_WRITE_BACK_NO_ALLOCATE ((0x0 << 6) | (0x3 << 2)) #define MMU_MEMORY_L2_TYPE_NORMAL_WRITE_BACK_ALLOCATE ((0x1 << 6) | (0x3 << 2)) -#define MMU_MEMORY_L2_TYPE_MASK ((0x3 << 6) | (0x3 << 2)) +#define MMU_MEMORY_L2_TYPE_MASK ((0x7 << 6) | (0x3 << 2)) #define MMU_MEMORY_DOMAIN_MEM (0) @@ -109,6 +111,13 @@ #define MMU_MEMORY_L1_SECTION_NON_GLOBAL (1 << 17) #define MMU_MEMORY_L1_SECTION_XN (1 << 4) +#define MMU_MEMORY_L1_CB_SHIFT 2 +#define MMU_MEMORY_L1_TEX_SHIFT 12 + +#define MMU_MEMORY_SET_L1_INNER(val) (((val) & 0x3) << MMU_MEMORY_L1_CB_SHIFT) +#define MMU_MEMORY_SET_L1_OUTER(val) (((val) & 0x3) << MMU_MEMORY_L1_TEX_SHIFT) +#define MMU_MEMORY_SET_L1_CACHEABLE_MEM (0x4 << MMU_MEMORY_L1_TEX_SHIFT) + #define MMU_MEMORY_L2_SHAREABLE (1 << 10) #define MMU_MEMORY_L2_NON_GLOBAL (1 << 11) @@ -134,6 +143,8 @@ /* IRGN[1:0] is encoded as: IRGN[0] in TTBRx[6], and IRGN[1] in TTBRx[0] */ #define MMU_MEMORY_TTBR_IRGN(x) ((((x) & 0x1) << 6) | \ ((((x) >> 1) & 0x1) << 0)) +#define MMU_MEMORY_TTBR_S (1 << 1) +#define MMU_MEMORY_TTBR_NOS (1 << 5) /* Default configuration for main kernel page table: * - section mappings for memory @@ -142,18 +153,31 @@ /* Enable cached page table walks: * inner/outer (IRGN/RGN): write-back + write-allocate + * (select inner sharable on smp) */ +#if WITH_SMP +#define MMU_TTBRx_SHARABLE_FLAGS (MMU_MEMORY_TTBR_S | MMU_MEMORY_TTBR_NOS) +#else +#define MMU_TTBRx_SHARABLE_FLAGS (0) +#endif #define MMU_TTBRx_FLAGS \ 
(MMU_MEMORY_TTBR_RGN(MMU_MEMORY_WRITE_BACK_ALLOCATE) |\ - MMU_MEMORY_TTBR_IRGN(MMU_MEMORY_WRITE_BACK_ALLOCATE)) + MMU_MEMORY_TTBR_IRGN(MMU_MEMORY_WRITE_BACK_ALLOCATE) | \ + MMU_TTBRx_SHARABLE_FLAGS) /* Section mapping, TEX[2:0]=001, CB=11, S=1, AP[2:0]=001 */ +#if WITH_SMP +#define MMU_KERNEL_L1_PTE_FLAGS \ + (MMU_MEMORY_L1_DESCRIPTOR_SECTION | \ + MMU_MEMORY_L1_TYPE_NORMAL_WRITE_BACK_ALLOCATE | \ + MMU_MEMORY_L1_AP_P_RW_U_NA | \ + MMU_MEMORY_L1_SECTION_SHAREABLE) +#else #define MMU_KERNEL_L1_PTE_FLAGS \ (MMU_MEMORY_L1_DESCRIPTOR_SECTION | \ MMU_MEMORY_L1_TYPE_NORMAL_WRITE_BACK_ALLOCATE | \ MMU_MEMORY_L1_AP_P_RW_U_NA) -/* XXX add with smp to above */ -// MMU_MEMORY_L1_SECTION_SHAREABLE | +#endif #define MMU_INITIAL_MAP_STRONGLY_ORDERED \ (MMU_MEMORY_L1_DESCRIPTOR_SECTION | \ @@ -176,48 +200,77 @@ __BEGIN_CDECLS +void arm_mmu_early_init(void); void arm_mmu_init(void); status_t arm_vtop(addr_t va, addr_t *pa); /* tlb routines */ -static inline void arm_invalidate_tlb_global(void) { - CF; + +static inline void arm_after_invalidate_tlb_barrier(void) { +#if WITH_SMP + arm_write_bpiallis(0); +#else + arm_write_bpiall(0); +#endif + DSB; + ISB; +} + +static inline void arm_invalidate_tlb_global_no_barrier(void) { #if WITH_SMP arm_write_tlbiallis(0); #else arm_write_tlbiall(0); #endif - DSB; } -static inline void arm_invalidate_tlb_mva(vaddr_t va) { - CF; +static inline void arm_invalidate_tlb_global(void) { + DSB; + arm_invalidate_tlb_global_no_barrier(); + arm_after_invalidate_tlb_barrier(); +} + +static inline void arm_invalidate_tlb_mva_no_barrier(vaddr_t va) { #if WITH_SMP arm_write_tlbimvaais(va & 0xfffff000); #else arm_write_tlbimvaa(va & 0xfffff000); #endif - DSB; } -static inline void arm_invalidate_tlb_asid(uint8_t asid) { - CF; +static inline void arm_invalidate_tlb_mva(vaddr_t va) { + DSB; + arm_invalidate_tlb_mva_no_barrier(va); + arm_after_invalidate_tlb_barrier(); +} + + +static inline void arm_invalidate_tlb_asid_no_barrier(uint8_t asid) { #if WITH_SMP arm_write_tlbiasidis(asid); #else arm_write_tlbiasid(asid); #endif - DSB; } -static inline void arm_invalidate_tlb_mva_asid(vaddr_t va, uint8_t asid) { - CF; +static inline void arm_invalidate_tlb_asid(uint8_t asid) { + DSB; + arm_invalidate_tlb_asid_no_barrier(asid); + arm_after_invalidate_tlb_barrier(); +} + +static inline void arm_invalidate_tlb_mva_asid_no_barrier(vaddr_t va, uint8_t asid) { #if WITH_SMP arm_write_tlbimvais((va & 0xfffff000) | asid); #else arm_write_tlbimva((va & 0xfffff000) | asid); #endif +} + +static inline void arm_invalidate_tlb_mva_asid(vaddr_t va, uint8_t asid) { DSB; + arm_invalidate_tlb_mva_asid_no_barrier(va, asid); + arm_after_invalidate_tlb_barrier(); } __END_CDECLS diff --git a/arch/arm/include/arch/defines.h b/arch/arm/include/arch/defines.h index 899afc32..bf33b97d 100644 --- a/arch/arm/include/arch/defines.h +++ b/arch/arm/include/arch/defines.h @@ -34,12 +34,16 @@ #define CACHE_LINE 32 #elif ARM_CPU_ARM1136 #define CACHE_LINE 32 +#elif ARM_CPU_CORTEX_A7 +#define CACHE_LINE 64 /* XXX L1 icache is 32 bytes */ #elif ARM_CPU_CORTEX_A8 #define CACHE_LINE 64 #elif ARM_CPU_CORTEX_A9 #define CACHE_LINE 32 #elif ARM_CPU_CORTEX_M3 || ARM_CPU_CORTEX_M4 #define CACHE_LINE 32 /* doesn't actually matter */ +#elif ARM_CPU_CORTEX_A15 +#define CACHE_LINE 64 #else #error unknown cpu #endif diff --git a/arch/arm/include/arch/spinlock.h b/arch/arm/include/arch/spinlock.h new file mode 100644 index 00000000..b232024d --- /dev/null +++ b/arch/arm/include/arch/spinlock.h @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2014 
Travis Geiselbrecht + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files + * (the "Software"), to deal in the Software without restriction, + * including without limitation the rights to use, copy, modify, merge, + * publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, + * subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ +#pragma once + +#include +#include + +#define SPIN_LOCK_INITIAL_VALUE (0) + +typedef unsigned long spin_lock_t; + +typedef unsigned long spin_lock_saved_state_t; +typedef unsigned long spin_lock_save_flags_t; + +static inline void arch_spin_lock_init(spin_lock_t *lock) +{ + *lock = SPIN_LOCK_INITIAL_VALUE; +} + +static inline bool arch_spin_lock_held(spin_lock_t *lock) +{ + return *lock != 0; +} + +#if WITH_SMP + +void arch_spin_lock(spin_lock_t *lock); +int arch_spin_trylock(spin_lock_t *lock); +void arch_spin_unlock(spin_lock_t *lock); + +#else + +static inline void arch_spin_lock(spin_lock_t *lock) +{ + *lock = 1; +} + +static inline int arch_spin_trylock(spin_lock_t *lock) +{ + return 0; +} + +static inline void arch_spin_unlock(spin_lock_t *lock) +{ + *lock = 0; +} + +#endif + + /* ARM specific flags */ +#define SPIN_LOCK_FLAG_IRQ 0x40000000 +#define SPIN_LOCK_FLAG_FIQ 0x80000000 /* Do not use unless IRQs are already disabled */ +#define SPIN_LOCK_FLAG_IRQ_FIQ (SPIN_LOCK_FLAG_IRQ | SPIN_LOCK_FLAG_FIQ) + + /* default arm flag is to just disable plain irqs */ +#define ARCH_DEFAULT_SPIN_LOCK_FLAG_INTERRUPTS SPIN_LOCK_FLAG_IRQ + +enum { + /* private */ + SPIN_LOCK_STATE_RESTORE_IRQ = 1, + SPIN_LOCK_STATE_RESTORE_FIQ = 2, +}; + +static inline void +arch_interrupt_save(spin_lock_saved_state_t *statep, spin_lock_save_flags_t flags) +{ + spin_lock_saved_state_t state = 0; + if ((flags & SPIN_LOCK_FLAG_IRQ) && !arch_ints_disabled()) { + state |= SPIN_LOCK_STATE_RESTORE_IRQ; + arch_disable_ints(); + } + if ((flags & SPIN_LOCK_FLAG_FIQ) && !arch_fiqs_disabled()) { + state |= SPIN_LOCK_STATE_RESTORE_FIQ; + arch_disable_fiqs(); + } + *statep = state; +} + +static inline void +arch_interrupt_restore(spin_lock_saved_state_t old_state, spin_lock_save_flags_t flags) +{ + if ((flags & SPIN_LOCK_FLAG_FIQ) && (old_state & SPIN_LOCK_STATE_RESTORE_FIQ)) + arch_enable_fiqs(); + if ((flags & SPIN_LOCK_FLAG_IRQ) && (old_state & SPIN_LOCK_STATE_RESTORE_IRQ)) + arch_enable_ints(); +} + + diff --git a/arch/arm/rules.mk b/arch/arm/rules.mk index 2cda3059..7ccc954a 100644 --- a/arch/arm/rules.mk +++ b/arch/arm/rules.mk @@ -20,7 +20,6 @@ GLOBAL_DEFINES += \ ARM_ISA_ARMv7M=1 \ ARM_WITH_THUMB=1 \ ARM_WITH_THUMB2=1 -GLOBAL_COMPILEFLAGS += -mcpu=$(ARM_CPU) HANDLED_CORE := true ENABLE_THUMB := true SUBARCH := arm-m @@ -32,7 +31,6 @@ GLOBAL_DEFINES += \ ARM_ISA_ARMv7M=1 \ ARM_WITH_THUMB=1 \ ARM_WITH_THUMB2=1 
-GLOBAL_COMPILEFLAGS += -mcpu=$(ARM_CPU) HANDLED_CORE := true ENABLE_THUMB := true SUBARCH := arm-m @@ -47,11 +45,40 @@ GLOBAL_DEFINES += \ ARM_WITH_THUMB2=1 \ ARM_WITH_VFP=1 \ __FPU_PRESENT=1 -GLOBAL_COMPILEFLAGS += -mcpu=cortex-m4 -mfloat-abi=softfp HANDLED_CORE := true ENABLE_THUMB := true SUBARCH := arm-m endif +ifeq ($(ARM_CPU),cortex-a7) +GLOBAL_DEFINES += \ + ARM_WITH_CP15=1 \ + ARM_WITH_MMU=1 \ + ARM_ISA_ARMv7=1 \ + ARM_ISA_ARMv7A=1 \ + ARM_WITH_VFP=1 \ + ARM_WITH_NEON=1 \ + ARM_WITH_THUMB=1 \ + ARM_WITH_THUMB2=1 \ + ARM_WITH_CACHE=1 +HANDLED_CORE := true +endif +ifeq ($(ARM_CPU),cortex-a15) +GLOBAL_DEFINES += \ + ARM_WITH_CP15=1 \ + ARM_WITH_MMU=1 \ + ARM_ISA_ARMv7=1 \ + ARM_ISA_ARMv7A=1 \ + ARM_WITH_THUMB=1 \ + ARM_WITH_THUMB2=1 \ + ARM_WITH_CACHE=1 \ + ARM_WITH_L2=1 +ifneq ($(ARM_WITHOUT_VFP_NEON),true) +GLOBAL_DEFINES += \ + ARM_WITH_VFP=1 \ + ARM_WITH_NEON=1 +endif +HANDLED_CORE := true +endif ifeq ($(ARM_CPU),cortex-a8) GLOBAL_DEFINES += \ ARM_WITH_CP15=1 \ @@ -64,9 +91,7 @@ GLOBAL_DEFINES += \ ARM_WITH_THUMB2=1 \ ARM_WITH_CACHE=1 \ ARM_WITH_L2=1 -GLOBAL_COMPILEFLAGS += -mcpu=$(ARM_CPU) HANDLED_CORE := true -GLOBAL_COMPILEFLAGS += -mfpu=neon -mfloat-abi=softfp endif ifeq ($(ARM_CPU),cortex-a9) GLOBAL_DEFINES += \ @@ -77,7 +102,6 @@ GLOBAL_DEFINES += \ ARM_WITH_THUMB=1 \ ARM_WITH_THUMB2=1 \ ARM_WITH_CACHE=1 -GLOBAL_COMPILEFLAGS += -mcpu=$(ARM_CPU) HANDLED_CORE := true endif ifeq ($(ARM_CPU),cortex-a9-neon) @@ -92,11 +116,7 @@ GLOBAL_DEFINES += \ ARM_WITH_THUMB=1 \ ARM_WITH_THUMB2=1 \ ARM_WITH_CACHE=1 -GLOBAL_COMPILEFLAGS += -mcpu=cortex-a9 HANDLED_CORE := true -# XXX cannot enable neon right now because compiler generates -# neon code for 64bit integer ops -GLOBAL_COMPILEFLAGS += -mfpu=vfpv3 -mfloat-abi=softfp endif ifeq ($(ARM_CPU),arm1136j-s) GLOBAL_DEFINES += \ @@ -106,7 +126,6 @@ GLOBAL_DEFINES += \ ARM_WITH_THUMB=1 \ ARM_WITH_CACHE=1 \ ARM_CPU_ARM1136=1 -GLOBAL_COMPILEFLAGS += -mcpu=$(ARM_CPU) HANDLED_CORE := true endif ifeq ($(ARM_CPU),arm1176jzf-s) @@ -118,7 +137,6 @@ GLOBAL_DEFINES += \ ARM_WITH_THUMB=1 \ ARM_WITH_CACHE=1 \ ARM_CPU_ARM1136=1 -GLOBAL_COMPILEFLAGS += -mcpu=$(ARM_CPU) HANDLED_CORE := true endif @@ -176,6 +194,31 @@ KERNEL_LOAD_OFFSET ?= 0 GLOBAL_DEFINES += \ KERNEL_BASE=$(KERNEL_BASE) \ KERNEL_LOAD_OFFSET=$(KERNEL_LOAD_OFFSET) + +# if its requested we build with SMP, arm generically supports 4 cpus +ifeq ($(WITH_SMP),1) +SMP_MAX_CPUS ?= 4 +SMP_CPU_CLUSTER_SHIFT ?= 8 +SMP_CPU_ID_BITS ?= 24 + +GLOBAL_DEFINES += \ + WITH_SMP=1 \ + SMP_MAX_CPUS=$(SMP_MAX_CPUS) \ + SMP_CPU_CLUSTER_SHIFT=$(SMP_CPU_CLUSTER_SHIFT) \ + SMP_CPU_ID_BITS=$(SMP_CPU_ID_BITS) + +MODULE_SRCS += \ + $(LOCAL_DIR)/arm/mp.c +else +GLOBAL_DEFINES += \ + SMP_MAX_CPUS=1 +endif + +ifeq (true,$(call TOBOOL,$(WITH_NS_MAPPING))) +GLOBAL_DEFINES += \ + WITH_ARCH_MMU_PICK_SPOT=1 +endif + endif ifeq ($(SUBARCH),arm-m) MODULE_SRCS += \ @@ -192,49 +235,20 @@ GLOBAL_INCLUDES += \ # we're building for small binaries GLOBAL_DEFINES += \ ARM_ONLY_THUMB=1 \ - ARCH_DEFAULT_STACK_SIZE=1024 + ARCH_DEFAULT_STACK_SIZE=1024 \ + SMP_MAX_CPUS=1 ARCH_OPTFLAGS := -Os WITH_LINKER_GC ?= 1 endif -# try to find the toolchain -ifndef TOOLCHAIN_PREFIX -TOOLCHAIN_PREFIX := arm-eabi- -FOUNDTOOL=$(shell which $(TOOLCHAIN_PREFIX)gcc) -ifeq ($(FOUNDTOOL),) -TOOLCHAIN_PREFIX := arm-elf- -FOUNDTOOL=$(shell which $(TOOLCHAIN_PREFIX)gcc) -ifeq ($(FOUNDTOOL),) -TOOLCHAIN_PREFIX := arm-none-eabi- -FOUNDTOOL=$(shell which $(TOOLCHAIN_PREFIX)gcc) -ifeq ($(FOUNDTOOL),) -TOOLCHAIN_PREFIX := arm-linux-gnueabi- -FOUNDTOOL=$(shell which 
$(TOOLCHAIN_PREFIX)gcc) - -# Set no stack protection if we found our gnueabi toolchain. We don't -# need it. -# -# Stack protection is default in this toolchain and we get such errors -# final linking stage: -# -# undefined reference to `__stack_chk_guard' -# undefined reference to `__stack_chk_fail' -# undefined reference to `__stack_chk_guard' -# -ifneq (,$(findstring arm-linux-gnueabi-,$(FOUNDTOOL))) - GLOBAL_COMPILEFLAGS += -fno-stack-protector -endif - -endif -endif -endif -ifeq ($(FOUNDTOOL),) -$(error cannot find toolchain, please set TOOLCHAIN_PREFIX or add it to your path) -endif -endif +# try to find toolchain +include $(LOCAL_DIR)/toolchain.mk +TOOLCHAIN_PREFIX := $(ARCH_$(ARCH)_TOOLCHAIN_PREFIX) $(info TOOLCHAIN_PREFIX = $(TOOLCHAIN_PREFIX)) +ARCH_COMPILEFLAGS += $(ARCH_$(ARCH)_COMPILEFLAGS) + GLOBAL_COMPILEFLAGS += $(THUMBINTERWORK) # make sure some bits were set up @@ -249,7 +263,10 @@ ifeq ($(MEMVARS_SET),0) $(error missing MEMBASE or MEMSIZE variable, please set in target rules.mk) endif -LIBGCC := $(shell $(TOOLCHAIN_PREFIX)gcc $(GLOBAL_COMPILEFLAGS) $(THUMBCFLAGS) -print-libgcc-file-name) +LIBGCC := $(shell $(TOOLCHAIN_PREFIX)gcc $(GLOBAL_COMPILEFLAGS) $(ARCH_COMPILEFLAGS) $(THUMBCFLAGS) -print-libgcc-file-name) +$(info LIBGCC = $(LIBGCC)) + +$(info GLOBAL_COMPILEFLAGS = $(GLOBAL_COMPILEFLAGS) $(ARCH_COMPILEFLAGS) $(THUMBCFLAGS)) # potentially generated files that should be cleaned out with clean make rule GENERATED += \ diff --git a/arch/arm/toolchain.mk b/arch/arm/toolchain.mk new file mode 100644 index 00000000..5a4c8819 --- /dev/null +++ b/arch/arm/toolchain.mk @@ -0,0 +1,80 @@ +ifndef ARCH_arm_TOOLCHAIN_INCLUDED +ARCH_arm_TOOLCHAIN_INCLUDED := 1 + +# try to find the toolchain +ifndef ARCH_arm_TOOLCHAIN_PREFIX +ARCH_arm_TOOLCHAIN_PREFIX := arm-eabi- +FOUNDTOOL=$(shell which $(ARCH_arm_TOOLCHAIN_PREFIX)gcc) +ifeq ($(FOUNDTOOL),) +ARCH_arm_TOOLCHAIN_PREFIX := arm-elf- +FOUNDTOOL=$(shell which $(ARCH_arm_TOOLCHAIN_PREFIX)gcc) +ifeq ($(FOUNDTOOL),) +ARCH_arm_TOOLCHAIN_PREFIX := arm-none-eabi- +FOUNDTOOL=$(shell which $(ARCH_arm_TOOLCHAIN_PREFIX)gcc) +ifeq ($(FOUNDTOOL),) +ARCH_arm_TOOLCHAIN_PREFIX := arm-linux-gnueabi- +FOUNDTOOL=$(shell which $(ARCH_arm_TOOLCHAIN_PREFIX)gcc) + +# Set no stack protection if we found our gnueabi toolchain. We don't +# need it. 
+# +# Stack protection is default in this toolchain and we get such errors +# final linking stage: +# +# undefined reference to `__stack_chk_guard' +# undefined reference to `__stack_chk_fail' +# undefined reference to `__stack_chk_guard' +# +ifneq (,$(findstring arm-linux-gnueabi-,$(FOUNDTOOL))) + ARCH_arm_COMPILEFLAGS += -fno-stack-protector +endif + +endif +endif +endif +ifeq ($(FOUNDTOOL),) +$(error cannot find toolchain, please set ARCH_arm_TOOLCHAIN_PREFIX or add it to your path) +endif +endif + + +ifeq ($(ARM_CPU),cortex-m3) +ARCH_arm_COMPILEFLAGS += -mcpu=$(ARM_CPU) +endif +ifeq ($(ARM_CPU),cortex-m4) +ARCH_arm_COMPILEFLAGS += -mcpu=$(ARM_CPU) +endif +ifeq ($(ARM_CPU),cortex-m4f) +ARCH_arm_COMPILEFLAGS += -mcpu=cortex-m4 -mfloat-abi=softfp +endif +ifeq ($(ARM_CPU),cortex-a7) +ARCH_arm_COMPILEFLAGS += -mcpu=$(ARM_CPU) +ARCH_arm_COMPILEFLAGS += -mfpu=vfpv3 -mfloat-abi=softfp +endif +ifeq ($(ARM_CPU),cortex-a8) +ARCH_arm_COMPILEFLAGS += -mcpu=$(ARM_CPU) +ARCH_arm_COMPILEFLAGS += -mfpu=neon -mfloat-abi=softfp +endif +ifeq ($(ARM_CPU),cortex-a9) +ARCH_arm_COMPILEFLAGS += -mcpu=$(ARM_CPU) +endif +ifeq ($(ARM_CPU),cortex-a9-neon) +ARCH_arm_COMPILEFLAGS += -mcpu=cortex-a9 +# XXX cannot enable neon right now because compiler generates +# neon code for 64bit integer ops +ARCH_arm_COMPILEFLAGS += -mfpu=vfpv3 -mfloat-abi=softfp +endif +ifeq ($(ARM_CPU),cortex-a15) +ARCH_arm_COMPILEFLAGS += -mcpu=$(ARM_CPU) +ifneq ($(ARM_WITHOUT_VFP_NEON),true) +ARCH_arm_COMPILEFLAGS += -mfpu=vfpv3 -mfloat-abi=softfp +endif +endif +ifeq ($(ARM_CPU),arm1136j-s) +ARCH_arm_COMPILEFLAGS += -mcpu=$(ARM_CPU) +endif +ifeq ($(ARM_CPU),arm1176jzf-s) +ARCH_arm_COMPILEFLAGS += -mcpu=$(ARM_CPU) +endif + +endif diff --git a/arch/arm64/arch.c b/arch/arm64/arch.c index e66ecbba..d4758b13 100644 --- a/arch/arm64/arch.c +++ b/arch/arm64/arch.c @@ -24,9 +24,23 @@ #include #include #include +#include +#include +#include +#include +#include #include +#include -void arch_early_init(void) +#define LOCAL_TRACE 0 + +#if WITH_SMP +/* smp boot lock */ +static spin_lock_t arm_boot_cpu_lock = 1; +static volatile int secondaries_to_init = 0; +#endif + +static void arm64_cpu_early_init(void) { /* set the vector base */ ARM64_WRITE_SYSREG(VBAR_EL1, (uint64_t)&arm64_exception_base); @@ -38,8 +52,31 @@ void arch_early_init(void) } } +void arch_early_init(void) +{ + arm64_cpu_early_init(); + platform_init_mmu_mappings(); +} + void arch_init(void) { + arch_mp_init_percpu(); + +#if WITH_SMP + LTRACEF("midr_el1 0x%llx\n", ARM64_READ_SYSREG(midr_el1)); + + secondaries_to_init = SMP_MAX_CPUS - 1; /* TODO: get count from somewhere else, or add cpus as they boot */ + + lk_init_secondary_cpus(secondaries_to_init); + + LTRACEF("releasing %d secondary cpus\n", secondaries_to_init); + + /* release the secondary cpus */ + spin_unlock(&arm_boot_cpu_lock); + + /* flush the release of the lock, since the secondary cpus are running without cache on */ + arch_clean_cache_range((addr_t)&arm_boot_cpu_lock, sizeof(arm_boot_cpu_lock)); +#endif } void arch_quiesce(void) @@ -56,4 +93,30 @@ void arch_chain_load(void *entry, ulong arg0, ulong arg1, ulong arg2, ulong arg3 PANIC_UNIMPLEMENTED; } +#if WITH_SMP +void arm64_secondary_entry(ulong asm_cpu_num) +{ + uint cpu = arch_curr_cpu_num(); + if (cpu != asm_cpu_num) + return; + + arm64_cpu_early_init(); + + spin_lock(&arm_boot_cpu_lock); + spin_unlock(&arm_boot_cpu_lock); + + /* run early secondary cpu init routines up to the threading level */ + lk_init_level(LK_INIT_FLAG_SECONDARY_CPUS, LK_INIT_LEVEL_EARLIEST, 
                          LK_INIT_LEVEL_THREADING - 1);
+
+    arch_mp_init_percpu();
+
+    LTRACEF("cpu num %d\n", cpu);
+
+    /* we're done, tell the main cpu we're up */
+    atomic_add(&secondaries_to_init, -1);
+    __asm__ volatile("sev");
+
+    lk_secondary_cpu_entry();
+}
+#endif
diff --git a/arch/arm64/asm.S b/arch/arm64/asm.S
index 617547d3..d60ca848 100644
--- a/arch/arm64/asm.S
+++ b/arch/arm64/asm.S
@@ -21,14 +21,7 @@
  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  */
 #include
-
-.macro push ra, rb
-stp \ra, \rb, [sp,#-16]!
-.endm
-
-.macro pop ra, rb
-ldp \ra, \rb, [sp], #16
-.endm
+#include

 /* void arm64_context_switch(vaddr_t *old_sp, vaddr_t new_sp); */
 FUNCTION(arm64_context_switch)
diff --git a/arch/arm64/cache-ops.S b/arch/arm64/cache-ops.S
new file mode 100644
index 00000000..f5fa49d1
--- /dev/null
+++ b/arch/arm64/cache-ops.S
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2014, Google Inc. All rights reserved
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files
+ * (the "Software"), to deal in the Software without restriction,
+ * including without limitation the rights to use, copy, modify, merge,
+ * publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so,
+ * subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include
+#include
+#include
+
+.text
+
+.macro cache_range_op, cache op
+    add     x2, x0, x1                  // calculate the end address
+    bic     x3, x0, #(CACHE_LINE-1)     // align the start with a cache line
+.Lcache_range_op_loop\@:
+    \cache  \op, x3
+    add     x3, x3, #CACHE_LINE
+    cmp     x3, x2
+    blo     .Lcache_range_op_loop\@
+    dsb     sy
+.endm
+
+    /* void arch_clean_cache_range(addr_t start, size_t len); */
+FUNCTION(arch_clean_cache_range)
+    cache_range_op dc cvac              // clean cache to PoC by MVA
+    ret
+
+    /* void arch_clean_invalidate_cache_range(addr_t start, size_t len); */
+FUNCTION(arch_clean_invalidate_cache_range)
+    cache_range_op dc civac             // clean & invalidate dcache to PoC by MVA
+    ret
+
+    /* void arch_invalidate_cache_range(addr_t start, size_t len); */
+FUNCTION(arch_invalidate_cache_range)
+    cache_range_op dc ivac              // invalidate dcache to PoC by MVA
+    ret
+
+    /* void arch_sync_cache_range(addr_t start, size_t len); */
+FUNCTION(arch_sync_cache_range)
+    cache_range_op dc cvau              // clean dcache to PoU by MVA
+    cache_range_op ic ivau              // invalidate icache to PoU by MVA
+    ret
diff --git a/arch/arm64/exceptions.S b/arch/arm64/exceptions.S
index 693d6571..38e3f3ff 100644
--- a/arch/arm64/exceptions.S
+++ b/arch/arm64/exceptions.S
@@ -21,18 +21,11 @@
  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  */
 #include
+#include

 .section .text.boot.vectab
 .align 12

-.macro push ra, rb
-stp \ra, \rb, [sp,#-16]!
-.endm - -.macro pop ra, rb -ldp \ra, \rb, [sp], #16 -.endm - #define lr x30 #define regsave_long_offset 0xf0 #define regsave_short_offset 0x90 @@ -127,6 +120,17 @@ add sp, sp, #32 b . .endm +.macro irq_exception + regsave_short + mov x0, sp + bl platform_irq + cbz x0, .Lirq_exception_no_preempt\@ + bl thread_preempt +1: +.Lirq_exception_no_preempt\@: + b arm64_exc_shared_restore_short +.endm + FUNCTION(arm64_exception_base) /* exceptions from current EL, using SP0 */ @@ -155,10 +159,7 @@ LOCAL_FUNCTION(arm64_sync_exc_current_el_SPx) .org 0x280 LOCAL_FUNCTION(arm64_irq_current_el_SPx) - regsave_short - mov x0, sp - bl platform_irq - b arm64_exc_shared_restore_short + irq_exception .org 0x300 LOCAL_FUNCTION(arm64_fiq_current_el_SPx) @@ -191,15 +192,21 @@ LOCAL_FUNCTION(arm64_err_exc_lower_el_64) /* exceptions from lower EL, running arm32 */ .org 0x600 LOCAL_FUNCTION(arm64_sync_exc_lower_el_32) - invalid_exception 0x30 + regsave_long + mov x0, sp + bl arm64_sync_exception + b arm64_exc_shared_restore_long .org 0x680 LOCAL_FUNCTION(arm64_irq_lower_el_32) - invalid_exception 0x31 + irq_exception .org 0x700 LOCAL_FUNCTION(arm64_fiq_lower_el_32) - invalid_exception 0x32 + regsave_short + mov x0, sp + bl platform_fiq + b arm64_exc_shared_restore_short .org 0x780 LOCAL_FUNCTION(arm64_err_exc_lower_el_32) diff --git a/arch/arm64/exceptions_c.c b/arch/arm64/exceptions_c.c index a83ca7cb..dfb0edd9 100644 --- a/arch/arm64/exceptions_c.c +++ b/arch/arm64/exceptions_c.c @@ -43,14 +43,22 @@ static void dump_iframe(const struct arm64_iframe_long *iframe) void arm64_sync_exception(struct arm64_iframe_long *iframe) { - printf("sync_exception\n"); - dump_iframe(iframe); - uint32_t esr = ARM64_READ_SYSREG(esr_el1); uint32_t ec = esr >> 26; uint32_t il = (esr >> 25) & 0x1; uint32_t iss = esr & ((1<<24) - 1); +#ifdef WITH_LIB_SYSCALL + if (ec == 0x15 || ec == 0x11) { // syscall 64/32 + void arm64_syscall(struct arm64_iframe_long *iframe); + arm64_syscall(iframe); + return; + } +#endif + + printf("sync_exception\n"); + dump_iframe(iframe); + printf("ESR 0x%x: ec 0x%x, il 0x%x, iss 0x%x\n", esr, ec, il, iss); if (ec == 0x15) { // syscall diff --git a/arch/arm64/include/arch/arch_ops.h b/arch/arm64/include/arch/arch_ops.h index 2a12d7be..b1cc6aca 100644 --- a/arch/arm64/include/arch/arch_ops.h +++ b/arch/arm64/include/arch/arch_ops.h @@ -36,26 +36,62 @@ static inline void arch_enable_ints(void) { CF; - __asm__ volatile("msr daifclr, #3" ::: "memory"); + __asm__ volatile("msr daifclr, #2" ::: "memory"); } static inline void arch_disable_ints(void) { - __asm__ volatile("msr daifset, #3" ::: "memory"); + __asm__ volatile("msr daifset, #2" ::: "memory"); CF; } -// XXX static inline bool arch_ints_disabled(void) { unsigned int state; - __asm__ volatile("mrs %0, cpsr" : "=r"(state)); + __asm__ volatile("mrs %0, daif" : "=r"(state)); state &= (1<<7); return !!state; } +static inline void arch_enable_fiqs(void) +{ + CF; + __asm__ volatile("msr daifclr, #1" ::: "memory"); +} + +static inline void arch_disable_fiqs(void) +{ + __asm__ volatile("msr daifset, #1" ::: "memory"); + CF; +} + +// XXX +static inline bool arch_fiqs_disabled(void) +{ + unsigned int state; + + __asm__ volatile("mrs %0, daif" : "=r"(state)); + state &= (1<<6); + + return !!state; +} + +#define mb() __asm__ volatile("dsb sy" : : : "memory") +#define rmb() __asm__ volatile("dsb ld" : : : "memory") +#define wmb() __asm__ volatile("dsb st" : : : "memory") + +#ifdef WITH_SMP +#define smp_mb() __asm__ volatile("dmb ish" : : : "memory") +#define smp_rmb() 
__asm__ volatile("dmb ishld" : : : "memory") +#define smp_wmb() __asm__ volatile("dmb ishst" : : : "memory") +#else +#define smp_mb() CF +#define smp_wmb() CF +#define smp_rmb() CF +#endif + static inline int atomic_add(volatile int *ptr, int val) { #if USE_GCC_ATOMICS @@ -152,6 +188,11 @@ static inline int atomic_swap(volatile int *ptr, int val) static inline int atomic_cmpxchg(volatile int *ptr, int oldval, int newval) { +#if USE_GCC_ATOMICS + __atomic_compare_exchange_n(ptr, &oldval, newval, false, + __ATOMIC_RELAXED, __ATOMIC_RELAXED); + return oldval; +#else int old; int test; @@ -174,6 +215,7 @@ static inline int atomic_cmpxchg(volatile int *ptr, int oldval, int newval) } while (test != 0); return old; +#endif } static inline uint32_t arch_cycle_count(void) @@ -208,5 +250,11 @@ static inline void set_current_thread(struct thread *t) ARM64_WRITE_SYSREG(tpidr_el1, (uint64_t)t); } +static inline uint arch_curr_cpu_num(void) +{ + uint64_t mpidr = ARM64_READ_SYSREG(mpidr_el1); + return ((mpidr & ((1U << SMP_CPU_ID_BITS) - 1)) >> 8 << SMP_CPU_CLUSTER_SHIFT) | (mpidr & 0xff); +} + #endif // ASSEMBLY diff --git a/arch/arm64/include/arch/arm64.h b/arch/arm64/include/arch/arm64.h index 997d1c26..ba9f731c 100644 --- a/arch/arm64/include/arch/arm64.h +++ b/arch/arm64/include/arch/arm64.h @@ -28,19 +28,22 @@ __BEGIN_CDECLS -#define DSB __asm__ volatile("dsb" ::: "memory") +#define DSB __asm__ volatile("dsb sy" ::: "memory") #define ISB __asm__ volatile("isb" ::: "memory") +#define STRINGIFY(x) #x +#define TOSTRING(x) STRINGIFY(x) + #define ARM64_READ_SYSREG(reg) \ ({ \ uint64_t _val; \ - __asm__ volatile("mrs %0," #reg : "=r" (_val)); \ + __asm__ volatile("mrs %0," TOSTRING(reg) : "=r" (_val)); \ _val; \ }) #define ARM64_WRITE_SYSREG(reg, val) \ ({ \ - __asm__ volatile("msr " #reg ", %0" :: "r" (val)); \ + __asm__ volatile("msr " TOSTRING(reg) ", %0" :: "r" (val)); \ ISB; \ }) diff --git a/arch/arm64/include/arch/arm64/mmu.h b/arch/arm64/include/arch/arm64/mmu.h new file mode 100644 index 00000000..8681564f --- /dev/null +++ b/arch/arm64/include/arch/arm64/mmu.h @@ -0,0 +1,337 @@ +/* + * Copyright (c) 2014 Google Inc. All rights reserved + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files + * (the "Software"), to deal in the Software without restriction, + * including without limitation the rights to use, copy, modify, merge, + * publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, + * subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#ifndef __ARCH_ARM64_MMU_H +#define __ARCH_ARM64_MMU_H + +#include + +#define IFTE(c,t,e) (!!(c) * (t) | !(c) * (e)) +#define NBITS01(n) IFTE(n, 1, 0) +#define NBITS02(n) IFTE((n) >> 1, 1 + NBITS01((n) >> 1), NBITS01(n)) +#define NBITS04(n) IFTE((n) >> 2, 2 + NBITS02((n) >> 2), NBITS02(n)) +#define NBITS08(n) IFTE((n) >> 4, 4 + NBITS04((n) >> 4), NBITS04(n)) +#define NBITS16(n) IFTE((n) >> 8, 8 + NBITS08((n) >> 8), NBITS08(n)) +#define NBITS32(n) IFTE((n) >> 16, 16 + NBITS16((n) >> 16), NBITS16(n)) +#define NBITS(n) IFTE((n) >> 32, 32 + NBITS32((n) >> 32), NBITS32(n)) + +#ifndef MMU_KERNEL_SIZE_SHIFT +#define KERNEL_ASPACE_BITS (NBITS(0xffffffffffffffff-KERNEL_ASPACE_BASE)) +#define KERNEL_BASE_BITS (NBITS(0xffffffffffffffff-KERNEL_BASE)) +#if KERNEL_BASE_BITS > KERNEL_ASPACE_BITS +#define KERNEL_ASPACE_BITS KERNEL_BASE_BITS /* KERNEL_BASE should not be below KERNEL_ASPACE_BASE */ +#endif + +#if KERNEL_ASPACE_BITS < 25 +#define MMU_KERNEL_SIZE_SHIFT (25) +#else +#define MMU_KERNEL_SIZE_SHIFT (KERNEL_ASPACE_BITS) +#endif +#endif + +#ifndef MMU_USER_SIZE_SHIFT +#define MMU_USER_SIZE_SHIFT 48 +#endif + +#ifndef MMU_IDENT_SIZE_SHIFT +#define MMU_IDENT_SIZE_SHIFT 42 /* Max size supported by block mappings */ +#endif + +#define MMU_KERNEL_PAGE_SIZE_SHIFT (PAGE_SIZE_SHIFT) +#define MMU_USER_PAGE_SIZE_SHIFT (USER_PAGE_SIZE_SHIFT) + +#if MMU_IDENT_SIZE_SHIFT < 25 +#error MMU_IDENT_SIZE_SHIFT too small +#elif MMU_IDENT_SIZE_SHIFT <= 29 /* Use 2MB block mappings (4K page size) */ +#define MMU_IDENT_PAGE_SIZE_SHIFT (SHIFT_4K) +#elif MMU_IDENT_SIZE_SHIFT <= 30 /* Use 512MB block mappings (64K page size) */ +#define MMU_IDENT_PAGE_SIZE_SHIFT (SHIFT_64K) +#elif MMU_IDENT_SIZE_SHIFT <= 39 /* Use 1GB block mappings (4K page size) */ +#define MMU_IDENT_PAGE_SIZE_SHIFT (SHIFT_4K) +#elif MMU_IDENT_SIZE_SHIFT <= 42 /* Use 512MB block mappings (64K page size) */ +#define MMU_IDENT_PAGE_SIZE_SHIFT (SHIFT_64K) +#else +#error MMU_IDENT_SIZE_SHIFT too large +#endif + +/* + * TCR TGx values + * + * Page size: 4K 16K 64K + * TG0: 0 2 1 + * TG1: 2 1 3 + */ + +#define MMU_TG0(page_size_shift) ((((page_size_shift == 14) & 1) << 1) | \ + ((page_size_shift == 16) & 1)) + +#define MMU_TG1(page_size_shift) ((((page_size_shift == 12) & 1) << 1) | \ + ((page_size_shift == 14) & 1) | \ + ((page_size_shift == 16) & 1) | \ + (((page_size_shift == 16) & 1) << 1)) + +#define MMU_LX_X(page_shift, level) ((4 - (level)) * ((page_shift) - 3) + 3) + +#if MMU_USER_SIZE_SHIFT > MMU_LX_X(MMU_USER_PAGE_SIZE_SHIFT, 0) +#define MMU_USER_TOP_SHIFT MMU_LX_X(MMU_USER_PAGE_SIZE_SHIFT, 0) +#elif MMU_USER_SIZE_SHIFT > MMU_LX_X(MMU_USER_PAGE_SIZE_SHIFT, 1) +#define MMU_USER_TOP_SHIFT MMU_LX_X(MMU_USER_PAGE_SIZE_SHIFT, 1) +#elif MMU_USER_SIZE_SHIFT > MMU_LX_X(MMU_USER_PAGE_SIZE_SHIFT, 2) +#define MMU_USER_TOP_SHIFT MMU_LX_X(MMU_USER_PAGE_SIZE_SHIFT, 2) +#elif MMU_USER_SIZE_SHIFT > MMU_LX_X(MMU_USER_PAGE_SIZE_SHIFT, 3) +#define MMU_USER_TOP_SHIFT MMU_LX_X(MMU_USER_PAGE_SIZE_SHIFT, 3) +#else +#error User address space size must be larger than page size +#endif +#define MMU_USER_PAGE_TABLE_ENTRIES_TOP (0x1 << (MMU_USER_SIZE_SHIFT - MMU_USER_TOP_SHIFT)) + +#if MMU_KERNEL_SIZE_SHIFT > MMU_LX_X(MMU_KERNEL_PAGE_SIZE_SHIFT, 0) +#define MMU_KERNEL_TOP_SHIFT MMU_LX_X(MMU_KERNEL_PAGE_SIZE_SHIFT, 0) +#elif MMU_KERNEL_SIZE_SHIFT > MMU_LX_X(MMU_KERNEL_PAGE_SIZE_SHIFT, 1) +#define MMU_KERNEL_TOP_SHIFT MMU_LX_X(MMU_KERNEL_PAGE_SIZE_SHIFT, 1) +#elif MMU_KERNEL_SIZE_SHIFT > MMU_LX_X(MMU_KERNEL_PAGE_SIZE_SHIFT, 2) +#define MMU_KERNEL_TOP_SHIFT 
MMU_LX_X(MMU_KERNEL_PAGE_SIZE_SHIFT, 2) +#elif MMU_KERNEL_SIZE_SHIFT > MMU_LX_X(MMU_KERNEL_PAGE_SIZE_SHIFT, 3) +#define MMU_KERNEL_TOP_SHIFT MMU_LX_X(MMU_KERNEL_PAGE_SIZE_SHIFT, 3) +#else +#error Kernel address space size must be larger than page size +#endif +#define MMU_KERNEL_PAGE_TABLE_ENTRIES_TOP (0x1 << (MMU_KERNEL_SIZE_SHIFT - MMU_KERNEL_TOP_SHIFT)) + +#if MMU_IDENT_SIZE_SHIFT > MMU_LX_X(MMU_IDENT_PAGE_SIZE_SHIFT, 0) +#define MMU_IDENT_TOP_SHIFT MMU_LX_X(MMU_IDENT_PAGE_SIZE_SHIFT, 0) +#elif MMU_IDENT_SIZE_SHIFT > MMU_LX_X(MMU_IDENT_PAGE_SIZE_SHIFT, 1) +#define MMU_IDENT_TOP_SHIFT MMU_LX_X(MMU_IDENT_PAGE_SIZE_SHIFT, 1) +#elif MMU_IDENT_SIZE_SHIFT > MMU_LX_X(MMU_IDENT_PAGE_SIZE_SHIFT, 2) +#define MMU_IDENT_TOP_SHIFT MMU_LX_X(MMU_IDENT_PAGE_SIZE_SHIFT, 2) +#elif MMU_IDENT_SIZE_SHIFT > MMU_LX_X(MMU_IDENT_PAGE_SIZE_SHIFT, 3) +#define MMU_IDENT_TOP_SHIFT MMU_LX_X(MMU_IDENT_PAGE_SIZE_SHIFT, 3) +#else +#error Ident address space size must be larger than page size +#endif +#define MMU_PAGE_TABLE_ENTRIES_IDENT_SHIFT (MMU_IDENT_SIZE_SHIFT - MMU_IDENT_TOP_SHIFT) +#define MMU_PAGE_TABLE_ENTRIES_IDENT (0x1 << MMU_PAGE_TABLE_ENTRIES_IDENT_SHIFT) + +#define MMU_PTE_DESCRIPTOR_BLOCK_MAX_SHIFT (30) + +#ifndef ASSEMBLY +#define BM(base, count, val) (((val) & ((1UL << (count)) - 1)) << (base)) +#else +#define BM(base, count, val) (((val) & ((0x1 << (count)) - 1)) << (base)) +#endif + +#define MMU_SH_NON_SHAREABLE (0) +#define MMU_SH_OUTER_SHAREABLE (2) +#define MMU_SH_INNER_SHAREABLE (3) + +#define MMU_RGN_NON_CACHEABLE (0) +#define MMU_RGN_WRITE_BACK_ALLOCATE (1) +#define MMU_RGN_WRITE_THROUGH_NO_ALLOCATE (2) +#define MMU_RGN_WRITE_BACK_NO_ALLOCATE (3) + +#define MMU_TCR_TBI1 BM(38, 1, 1) +#define MMU_TCR_TBI0 BM(37, 1, 1) +#define MMU_TCR_AS BM(36, 1, 1) +#define MMU_TCR_IPS(size) BM(32, 3, (size)) +#define MMU_TCR_TG1(granule_size) BM(30, 2, (granule_size)) +#define MMU_TCR_SH1(shareability_flags) BM(28, 2, (shareability_flags)) +#define MMU_TCR_ORGN1(cache_flags) BM(26, 2, (cache_flags)) +#define MMU_TCR_IRGN1(cache_flags) BM(24, 2, (cache_flags)) +#define MMU_TCR_EPD1 BM(23, 1, 1) +#define MMU_TCR_A1 BM(22, 1, 1) +#define MMU_TCR_T1SZ(size) BM(16, 6, (size)) +#define MMU_TCR_TG0(granule_size) BM(14, 2, (granule_size)) +#define MMU_TCR_SH0(shareability_flags) BM(12, 2, (shareability_flags)) +#define MMU_TCR_ORGN0(cache_flags) BM(10, 2, (cache_flags)) +#define MMU_TCR_IRGN0(cache_flags) BM( 8, 2, (cache_flags)) +#define MMU_TCR_EPD0 BM( 7, 1, 1) +#define MMU_TCR_T0SZ(size) BM( 0, 6, (size)) + +#define MMU_MAIR_ATTR(index, attr) BM(index * 8, 8, (attr)) + + +/* L0/L1/L2/L3 descriptor types */ +#define MMU_PTE_DESCRIPTOR_INVALID BM(0, 2, 0) +#define MMU_PTE_DESCRIPTOR_MASK BM(0, 2, 3) + +/* L0/L1/L2 descriptor types */ +#define MMU_PTE_L012_DESCRIPTOR_BLOCK BM(0, 2, 1) +#define MMU_PTE_L012_DESCRIPTOR_TABLE BM(0, 2, 3) + +/* L3 descriptor types */ +#define MMU_PTE_L3_DESCRIPTOR_PAGE BM(0, 2, 3) + +/* Output address mask */ +#define MMU_PTE_OUTPUT_ADDR_MASK BM(12, 36, 0xfffffffff) + +/* Table attrs */ +#define MMU_PTE_ATTR_NS_TABLE BM(63, 1, 1) +#define MMU_PTE_ATTR_AP_TABLE_NO_WRITE BM(62, 1, 1) +#define MMU_PTE_ATTR_AP_TABLE_NO_EL0 BM(61, 1, 1) +#define MMU_PTE_ATTR_UXN_TABLE BM(60, 1, 1) +#define MMU_PTE_ATTR_PXN_TABLE BM(59, 1, 1) + +/* Block/Page attrs */ +#define MMU_PTE_ATTR_RES_SOFTWARE BM(55, 4, 0xf) +#define MMU_PTE_ATTR_UXN BM(54, 1, 1) +#define MMU_PTE_ATTR_PXN BM(53, 1, 1) +#define MMU_PTE_ATTR_CONTIGUOUS BM(52, 1, 1) + +#define MMU_PTE_ATTR_NON_GLOBAL BM(11, 1, 1) +#define MMU_PTE_ATTR_AF 
BM(10, 1, 1) + +#define MMU_PTE_ATTR_SH_NON_SHAREABLE BM(8, 2, 0) +#define MMU_PTE_ATTR_SH_OUTER_SHAREABLE BM(8, 2, 2) +#define MMU_PTE_ATTR_SH_INNER_SHAREABLE BM(8, 2, 3) + +#define MMU_PTE_ATTR_AP_P_RW_U_NA BM(6, 2, 0) +#define MMU_PTE_ATTR_AP_P_RW_U_RW BM(6, 2, 1) +#define MMU_PTE_ATTR_AP_P_RO_U_NA BM(6, 2, 2) +#define MMU_PTE_ATTR_AP_P_RO_U_RO BM(6, 2, 3) +#define MMU_PTE_ATTR_AP_MASK BM(6, 2, 3) + +#define MMU_PTE_ATTR_NON_SECURE BM(5, 1, 1) + +#define MMU_PTE_ATTR_ATTR_INDEX(attrindex) BM(2, 3, attrindex) +#define MMU_PTE_ATTR_ATTR_INDEX_MASK MMU_PTE_ATTR_ATTR_INDEX(7) + +/* Default configuration for main kernel page table: + * - do cached translation walks + */ + +/* Device-nGnRnE memory */ +#define MMU_MAIR_ATTR0 MMU_MAIR_ATTR(0, 0x00) +#define MMU_PTE_ATTR_STRONGLY_ORDERED MMU_PTE_ATTR_ATTR_INDEX(0) + +/* Device-nGnRE memory */ +#define MMU_MAIR_ATTR1 MMU_MAIR_ATTR(1, 0x04) +#define MMU_PTE_ATTR_DEVICE MMU_PTE_ATTR_ATTR_INDEX(1) + +/* Normal Memory, Outer Write-back non-transient Read/Write allocate, + * Inner Write-back non-transient Read/Write allocate + */ +#define MMU_MAIR_ATTR2 MMU_MAIR_ATTR(2, 0xff) +#define MMU_PTE_ATTR_NORMAL_MEMORY MMU_PTE_ATTR_ATTR_INDEX(2) + +#define MMU_MAIR_ATTR3 (0) +#define MMU_MAIR_ATTR4 (0) +#define MMU_MAIR_ATTR5 (0) +#define MMU_MAIR_ATTR6 (0) +#define MMU_MAIR_ATTR7 (0) + +#define MMU_MAIR_VAL (MMU_MAIR_ATTR0 | MMU_MAIR_ATTR1 | \ + MMU_MAIR_ATTR2 | MMU_MAIR_ATTR3 | \ + MMU_MAIR_ATTR4 | MMU_MAIR_ATTR5 | \ + MMU_MAIR_ATTR6 | MMU_MAIR_ATTR7 ) + +#define MMU_TCR_IPS_DEFAULT MMU_TCR_IPS(2) /* TODO: read at runtime, or configure per platform */ + +/* Enable cached page table walks: + * inner/outer (IRGN/ORGN): write-back + write-allocate + */ +#define MMU_TCR_FLAGS1 (MMU_TCR_TG1(MMU_TG1(MMU_KERNEL_PAGE_SIZE_SHIFT)) | \ + MMU_TCR_SH1(MMU_SH_INNER_SHAREABLE) | \ + MMU_TCR_ORGN1(MMU_RGN_WRITE_BACK_ALLOCATE) | \ + MMU_TCR_IRGN1(MMU_RGN_WRITE_BACK_ALLOCATE) | \ + MMU_TCR_T1SZ(64 - MMU_KERNEL_SIZE_SHIFT)) +#define MMU_TCR_FLAGS0 (MMU_TCR_TG0(MMU_TG0(MMU_USER_PAGE_SIZE_SHIFT)) | \ + MMU_TCR_SH0(MMU_SH_INNER_SHAREABLE) | \ + MMU_TCR_ORGN0(MMU_RGN_WRITE_BACK_ALLOCATE) | \ + MMU_TCR_IRGN0(MMU_RGN_WRITE_BACK_ALLOCATE) | \ + MMU_TCR_T0SZ(64 - MMU_USER_SIZE_SHIFT)) +#define MMU_TCR_FLAGS0_IDENT \ + (MMU_TCR_TG0(MMU_TG0(MMU_IDENT_PAGE_SIZE_SHIFT)) | \ + MMU_TCR_SH0(MMU_SH_INNER_SHAREABLE) | \ + MMU_TCR_ORGN0(MMU_RGN_WRITE_BACK_ALLOCATE) | \ + MMU_TCR_IRGN0(MMU_RGN_WRITE_BACK_ALLOCATE) | \ + MMU_TCR_T0SZ(64 - MMU_IDENT_SIZE_SHIFT)) +#define MMU_TCR_FLAGS_IDENT (MMU_TCR_IPS_DEFAULT | MMU_TCR_FLAGS1 | MMU_TCR_FLAGS0_IDENT) +#define MMU_TCR_FLAGS_KERNEL (MMU_TCR_IPS_DEFAULT | MMU_TCR_FLAGS1 | MMU_TCR_FLAGS0 | MMU_TCR_EPD0) +#define MMU_TCR_FLAGS_USER (MMU_TCR_IPS_DEFAULT | MMU_TCR_FLAGS1 | MMU_TCR_FLAGS0) + + +#if MMU_IDENT_SIZE_SHIFT > MMU_LX_X(MMU_IDENT_PAGE_SIZE_SHIFT, 2) +#define MMU_PTE_IDENT_DESCRIPTOR MMU_PTE_L012_DESCRIPTOR_BLOCK +#else +#define MMU_PTE_IDENT_DESCRIPTOR MMU_PTE_L3_DESCRIPTOR_PAGE +#endif +#define MMU_PTE_IDENT_FLAGS \ + (MMU_PTE_IDENT_DESCRIPTOR | \ + MMU_PTE_ATTR_AF | \ + MMU_PTE_ATTR_SH_INNER_SHAREABLE | \ + MMU_PTE_ATTR_NORMAL_MEMORY | \ + MMU_PTE_ATTR_AP_P_RW_U_NA) + +#define MMU_PTE_KERNEL_FLAGS \ + (MMU_PTE_ATTR_AF | \ + MMU_PTE_ATTR_SH_INNER_SHAREABLE | \ + MMU_PTE_ATTR_NORMAL_MEMORY | \ + MMU_PTE_ATTR_AP_P_RW_U_NA) + +#define MMU_INITIAL_MAP_STRONGLY_ORDERED \ + (MMU_PTE_ATTR_AF | \ + MMU_PTE_ATTR_STRONGLY_ORDERED | \ + MMU_PTE_ATTR_AP_P_RW_U_NA) + +#define MMU_INITIAL_MAP_DEVICE \ + (MMU_PTE_ATTR_AF | \ + MMU_PTE_ATTR_DEVICE | \ + 
MMU_PTE_ATTR_AP_P_RW_U_NA) + +#ifndef ASSEMBLY + +#include +#include +#include +#include + +typedef uint64_t pte_t; + +__BEGIN_CDECLS + +#define ARM64_TLBI_NOADDR(op) \ +({ \ + __asm__ volatile("tlbi " #op::); \ + ISB; \ +}) + +#define ARM64_TLBI(op, val) \ +({ \ + __asm__ volatile("tlbi " #op ", %0" :: "r" (val)); \ + ISB; \ +}) + +#define MMU_ARM64_GLOBAL_ASID (~0U) +int arm64_mmu_map(vaddr_t vaddr, paddr_t paddr, size_t size, pte_t attrs, + vaddr_t vaddr_base, uint top_size_shift, + uint top_index_shift, uint page_size_shift, + pte_t *top_page_table, uint asid); +int arm64_mmu_unmap(vaddr_t vaddr, size_t size, + vaddr_t vaddr_base, uint top_size_shift, + uint top_index_shift, uint page_size_shift, + pte_t *top_page_table, uint asid); + +__END_CDECLS +#endif /* ASSEMBLY */ + +#endif diff --git a/arch/arm64/include/arch/asm_macros.h b/arch/arm64/include/arch/asm_macros.h new file mode 100644 index 00000000..a07b783e --- /dev/null +++ b/arch/arm64/include/arch/asm_macros.h @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2014 Travis Geiselbrecht + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files + * (the "Software"), to deal in the Software without restriction, + * including without limitation the rights to use, copy, modify, merge, + * publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, + * subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ +#pragma once + +.macro push ra, rb +stp \ra, \rb, [sp,#-16]! 
+.endm + +.macro pop ra, rb +ldp \ra, \rb, [sp], #16 +.endm + +.macro tbzmask, reg, mask, label, shift=0 +.if \shift >= 64 + .error "tbzmask: unsupported mask, \mask" +.elseif \mask == 1 << \shift + tbz \reg, #\shift, \label +.else + tbzmask \reg, \mask, \label, "(\shift + 1)" +.endif +.endm + +.macro tbnzmask, reg, mask, label, shift=0 +.if \shift >= 64 + .error "tbnzmask: unsupported mask, \mask" +.elseif \mask == 1 << \shift + tbnz \reg, #\shift, \label +.else + tbnzmask \reg, \mask, \label, "(\shift + 1)" +.endif +.endm + +.macro calloc_bootmem_aligned, new_ptr, new_ptr_end, tmp, size_shift, phys_offset=0 +.if \size_shift < 4 + .error "calloc_bootmem_aligned: Unsupported size_shift, \size_shift" +.endif + + /* load boot_alloc_end */ + adrp \tmp, boot_alloc_end + ldr \new_ptr, [\tmp, #:lo12:boot_alloc_end] + + /* align to page */ +.if \size_shift > 12 + add \new_ptr, \new_ptr, #(1 << \size_shift) + sub \new_ptr, \new_ptr, #1 +.else + add \new_ptr, \new_ptr, #(1 << \size_shift) - 1 +.endif + and \new_ptr, \new_ptr, #~((1 << \size_shift) - 1) + + /* add one page and store boot_alloc_end */ + add \new_ptr_end, \new_ptr, #(1 << \size_shift) + str \new_ptr_end, [\tmp, #:lo12:boot_alloc_end] + +.if \phys_offset != 0 + /* clear page */ + sub \new_ptr, \new_ptr, \phys_offset + sub \new_ptr_end, \new_ptr_end, \phys_offset +.endif + + /* clear page */ + mov \tmp, \new_ptr +.Lcalloc_bootmem_aligned_clear_loop\@: + stp xzr, xzr, [\tmp], #16 + cmp \tmp, \new_ptr_end + b.lo .Lcalloc_bootmem_aligned_clear_loop\@ +.endm diff --git a/arch/arm64/include/arch/defines.h b/arch/arm64/include/arch/defines.h index 34eb68c3..42ad4240 100644 --- a/arch/arm64/include/arch/defines.h +++ b/arch/arm64/include/arch/defines.h @@ -22,8 +22,22 @@ */ #pragma once +#define SHIFT_4K (12) +#define SHIFT_16K (14) +#define SHIFT_64K (16) + /* arm specific stuff */ -#define PAGE_SIZE 4096 +#ifdef ARM64_LARGE_PAGESIZE_64K +#define PAGE_SIZE_SHIFT (SHIFT_64K) +#elif ARM64_LARGE_PAGESIZE_16K +#define PAGE_SIZE_SHIFT (SHIFT_16K) +#else +#define PAGE_SIZE_SHIFT (SHIFT_4K) +#endif +#define USER_PAGE_SIZE_SHIFT SHIFT_4K + +#define PAGE_SIZE (1UL << PAGE_SIZE_SHIFT) +#define USER_PAGE_SIZE (1UL << USER_PAGE_SIZE_SHIFT) #define CACHE_LINE 32 diff --git a/arch/arm64/include/arch/spinlock.h b/arch/arm64/include/arch/spinlock.h new file mode 100644 index 00000000..f063cee0 --- /dev/null +++ b/arch/arm64/include/arch/spinlock.h @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2014 Travis Geiselbrecht + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files + * (the "Software"), to deal in the Software without restriction, + * including without limitation the rights to use, copy, modify, merge, + * publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, + * subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ +#pragma once + +#include +#include + +#define SPIN_LOCK_INITIAL_VALUE (0) + +typedef unsigned long spin_lock_t; + +typedef unsigned int spin_lock_saved_state_t; +typedef unsigned int spin_lock_save_flags_t; + +#if WITH_SMP +void arch_spin_lock(spin_lock_t *lock); +int arch_spin_trylock(spin_lock_t *lock); +void arch_spin_unlock(spin_lock_t *lock); +#else +static inline void arch_spin_lock(spin_lock_t *lock) +{ + *lock = 1; +} + +static inline int arch_spin_trylock(spin_lock_t *lock) +{ + return 0; +} + +static inline void arch_spin_unlock(spin_lock_t *lock) +{ + *lock = 0; +} +#endif + +static inline void arch_spin_lock_init(spin_lock_t *lock) +{ + *lock = SPIN_LOCK_INITIAL_VALUE; +} + +static inline bool arch_spin_lock_held(spin_lock_t *lock) +{ + return *lock != 0; +} + +enum { + /* Possible future flags: + * SPIN_LOCK_FLAG_PMR_MASK = 0x000000ff, + * SPIN_LOCK_FLAG_PREEMPTION = 0x10000000, + * SPIN_LOCK_FLAG_SET_PMR = 0x20000000, + */ + + /* ARM specific flags */ + SPIN_LOCK_FLAG_IRQ = 0x40000000, + SPIN_LOCK_FLAG_FIQ = 0x80000000, /* Do not use unless IRQs are already disabled */ + SPIN_LOCK_FLAG_IRQ_FIQ = SPIN_LOCK_FLAG_IRQ | SPIN_LOCK_FLAG_FIQ, + + /* Generic flags */ + SPIN_LOCK_FLAG_INTERRUPTS = SPIN_LOCK_FLAG_IRQ, +}; + + /* default arm flag is to just disable plain irqs */ +#define ARCH_DEFAULT_SPIN_LOCK_FLAG_INTERRUPTS SPIN_LOCK_FLAG_INTERRUPTS + +enum { + /* private */ + SPIN_LOCK_STATE_RESTORE_IRQ = 1, + SPIN_LOCK_STATE_RESTORE_FIQ = 2, +}; + +static inline void +arch_interrupt_save(spin_lock_saved_state_t *statep, spin_lock_save_flags_t flags) +{ + spin_lock_saved_state_t state = 0; + if ((flags & SPIN_LOCK_FLAG_IRQ) && !arch_ints_disabled()) { + state |= SPIN_LOCK_STATE_RESTORE_IRQ; + arch_disable_ints(); + } + if ((flags & SPIN_LOCK_FLAG_FIQ) && !arch_fiqs_disabled()) { + state |= SPIN_LOCK_STATE_RESTORE_FIQ; + arch_disable_fiqs(); + } + *statep = state; +} + +static inline void +arch_interrupt_restore(spin_lock_saved_state_t old_state, spin_lock_save_flags_t flags) +{ + if ((flags & SPIN_LOCK_FLAG_FIQ) && (old_state & SPIN_LOCK_STATE_RESTORE_FIQ)) + arch_enable_fiqs(); + if ((flags & SPIN_LOCK_FLAG_IRQ) && (old_state & SPIN_LOCK_STATE_RESTORE_IRQ)) + arch_enable_ints(); +} + + + diff --git a/arch/arm64/mmu.c b/arch/arm64/mmu.c new file mode 100644 index 00000000..02cb1f47 --- /dev/null +++ b/arch/arm64/mmu.c @@ -0,0 +1,484 @@ +/* + * Copyright (c) 2014 Google Inc. All rights reserved + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files + * (the "Software"), to deal in the Software without restriction, + * including without limitation the rights to use, copy, modify, merge, + * publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, + * subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#define LOCAL_TRACE 0
+
+STATIC_ASSERT(((long)KERNEL_BASE >> MMU_KERNEL_SIZE_SHIFT) == -1);
+STATIC_ASSERT(((long)KERNEL_ASPACE_BASE >> MMU_KERNEL_SIZE_SHIFT) == -1);
+STATIC_ASSERT(MMU_KERNEL_SIZE_SHIFT <= 48);
+STATIC_ASSERT(MMU_KERNEL_SIZE_SHIFT >= 25);
+
+/* the main translation table */
+pte_t arm64_kernel_translation_table[MMU_KERNEL_PAGE_TABLE_ENTRIES_TOP] __ALIGNED(MMU_KERNEL_PAGE_TABLE_ENTRIES_TOP * 8) __SECTION(".bss.prebss.translation_table");
+
+/* convert user level mmu flags to flags that go in L1 descriptors */
+static pte_t mmu_flags_to_pte_attr(uint flags)
+{
+    pte_t attr = MMU_PTE_ATTR_AF;
+
+    switch (flags & ARCH_MMU_FLAG_CACHE_MASK) {
+        case ARCH_MMU_FLAG_CACHED:
+            attr |= MMU_PTE_ATTR_NORMAL_MEMORY | MMU_PTE_ATTR_SH_INNER_SHAREABLE;
+            break;
+        case ARCH_MMU_FLAG_UNCACHED:
+            attr |= MMU_PTE_ATTR_STRONGLY_ORDERED;
+            break;
+        case ARCH_MMU_FLAG_UNCACHED_DEVICE:
+            attr |= MMU_PTE_ATTR_DEVICE;
+            break;
+        default:
+            /* invalid user-supplied flag */
+            DEBUG_ASSERT(0);
+            return ERR_INVALID_ARGS;
+    }
+
+    switch (flags & (ARCH_MMU_FLAG_PERM_USER | ARCH_MMU_FLAG_PERM_RO)) {
+        case 0:
+            attr |= MMU_PTE_ATTR_AP_P_RW_U_NA;
+            break;
+        case ARCH_MMU_FLAG_PERM_RO:
+            attr |= MMU_PTE_ATTR_AP_P_RO_U_NA;
+            break;
+        case ARCH_MMU_FLAG_PERM_USER:
+            attr |= MMU_PTE_ATTR_AP_P_RW_U_RW;
+            break;
+        case ARCH_MMU_FLAG_PERM_USER | ARCH_MMU_FLAG_PERM_RO:
+            attr |= MMU_PTE_ATTR_AP_P_RO_U_RO;
+            break;
+    }
+
+    if (flags & ARCH_MMU_FLAG_NS) {
+        attr |= MMU_PTE_ATTR_NON_SECURE;
+    }
+
+    return attr;
+}
+
+status_t arch_mmu_query(vaddr_t vaddr, paddr_t *paddr, uint *flags)
+{
+    uint index;
+    uint index_shift;
+    pte_t pte;
+    pte_t pte_addr;
+    uint descriptor_type;
+    pte_t *page_table;
+    vaddr_t kernel_base = ~0UL << MMU_KERNEL_SIZE_SHIFT;
+    vaddr_t vaddr_rem;
+
+    if (vaddr < kernel_base) {
+        TRACEF("vaddr 0x%lx < base 0x%lx\n", vaddr, kernel_base);
+        return ERR_INVALID_ARGS;
+    }
+
+    index_shift = MMU_KERNEL_TOP_SHIFT;
+    page_table = arm64_kernel_translation_table;
+
+    vaddr_rem = vaddr - kernel_base;
+    index = vaddr_rem >> index_shift;
+    ASSERT(index < MMU_KERNEL_PAGE_TABLE_ENTRIES_TOP);
+
+    while (true) {
+        index = vaddr_rem >> index_shift;
+        vaddr_rem -= (vaddr_t)index << index_shift;
+        pte = page_table[index];
+        descriptor_type = pte & MMU_PTE_DESCRIPTOR_MASK;
+        pte_addr = pte & MMU_PTE_OUTPUT_ADDR_MASK;
+
+        LTRACEF("va 0x%lx, index %d, index_shift %d, rem 0x%lx, pte 0x%llx\n",
+                vaddr, index, index_shift, vaddr_rem, pte);
+
+        if (descriptor_type == MMU_PTE_DESCRIPTOR_INVALID)
+            return ERR_NOT_FOUND;
+
+        if (descriptor_type == ((index_shift > MMU_KERNEL_PAGE_SIZE_SHIFT) ?
+ MMU_PTE_L012_DESCRIPTOR_BLOCK : + MMU_PTE_L3_DESCRIPTOR_PAGE)) { + break; + } + + if (index_shift <= MMU_KERNEL_PAGE_SIZE_SHIFT || + descriptor_type != MMU_PTE_L012_DESCRIPTOR_TABLE) { + PANIC_UNIMPLEMENTED; + } + + page_table = paddr_to_kvaddr(pte_addr); + index_shift -= MMU_KERNEL_PAGE_SIZE_SHIFT - 3; + } + + if (paddr) + *paddr = pte_addr + vaddr_rem; + if (flags) { + *flags = 0; + if (pte & MMU_PTE_ATTR_NON_SECURE) + *flags |= ARCH_MMU_FLAG_NS; + switch (pte & MMU_PTE_ATTR_ATTR_INDEX_MASK) { + case MMU_PTE_ATTR_STRONGLY_ORDERED: + *flags |= ARCH_MMU_FLAG_UNCACHED; + break; + case MMU_PTE_ATTR_DEVICE: + *flags |= ARCH_MMU_FLAG_UNCACHED_DEVICE; + break; + case MMU_PTE_ATTR_NORMAL_MEMORY: + break; + default: + PANIC_UNIMPLEMENTED; + } + switch (pte & MMU_PTE_ATTR_AP_MASK) { + case MMU_PTE_ATTR_AP_P_RW_U_NA: + break; + case MMU_PTE_ATTR_AP_P_RW_U_RW: + *flags |= ARCH_MMU_FLAG_PERM_USER; + break; + case MMU_PTE_ATTR_AP_P_RO_U_NA: + *flags |= ARCH_MMU_FLAG_PERM_RO; + break; + case MMU_PTE_ATTR_AP_P_RO_U_RO: + *flags |= ARCH_MMU_FLAG_PERM_USER | ARCH_MMU_FLAG_PERM_RO; + break; + } + } + LTRACEF("va 0x%lx, paddr 0x%lx, flags 0x%x\n", + vaddr, paddr ? *paddr : ~0UL, flags ? *flags : ~0U); + return 0; +} + +static int alloc_page_table(paddr_t *paddrp, uint page_size_shift) +{ + int ret; + int count; + size_t size = 1U << page_size_shift; + void *vaddr; + + if (size >= PAGE_SIZE) { + count = size / PAGE_SIZE; + ret = pmm_alloc_contiguous(count, page_size_shift, paddrp, NULL); + if (ret != count) + return ERR_NO_MEMORY; + } else { + vaddr = heap_alloc(size, size); + if (!vaddr) + return ERR_NO_MEMORY; + ret = arch_mmu_query((vaddr_t)vaddr, paddrp, NULL); + if (ret) { + heap_free(vaddr); + return ret; + } + } + return 0; +} + +static void free_page_table(void *vaddr, paddr_t paddr, uint page_size_shift) +{ + vm_page_t *address_to_page(paddr_t addr); /* TODO: remove */ + + size_t size = 1U << page_size_shift; + vm_page_t *page; + + if (size >= PAGE_SIZE) { + page = address_to_page(paddr); + if (!page) + panic("bad page table paddr 0x%lx\n", paddr); + pmm_free_page(page); + } else { + heap_free(vaddr); + } +} + +static pte_t *arm64_mmu_get_page_table(vaddr_t index, uint page_size_shift, pte_t *page_table) +{ + pte_t pte; + paddr_t paddr; + void *vaddr; + int ret; + + pte = page_table[index]; + switch (pte & MMU_PTE_DESCRIPTOR_MASK) { + case MMU_PTE_DESCRIPTOR_INVALID: + ret = alloc_page_table(&paddr, page_size_shift); + if (ret) { + TRACEF("failed to allocate page table\n"); + return NULL; + } + vaddr = paddr_to_kvaddr(paddr); + LTRACEF("allocated page table, vaddr %p, paddr 0x%lx\n", vaddr, paddr); + memset(vaddr, MMU_PTE_DESCRIPTOR_INVALID, 1U << page_size_shift); + __asm__ volatile("dmb ishst" ::: "memory"); + pte = paddr | MMU_PTE_L012_DESCRIPTOR_TABLE; + page_table[index] = pte; + LTRACEF("pte %p[0x%lx] = 0x%llx\n", page_table, index, pte); + return vaddr; + + case MMU_PTE_L012_DESCRIPTOR_TABLE: + paddr = pte & MMU_PTE_OUTPUT_ADDR_MASK; + LTRACEF("found page table 0x%lx\n", paddr); + return paddr_to_kvaddr(paddr); + + case MMU_PTE_L012_DESCRIPTOR_BLOCK: + return NULL; + + default: + PANIC_UNIMPLEMENTED; + } +} + +static bool page_table_is_clear(pte_t *page_table, uint page_size_shift) +{ + int i; + int count = 1U << (page_size_shift - 3); + pte_t pte; + + for (i = 0; i < count; i++) { + pte = page_table[i]; + if (pte != MMU_PTE_DESCRIPTOR_INVALID) { + LTRACEF("page_table at %p still in use, index %d is 0x%llx\n", + page_table, i, pte); + return false; + } + } + + LTRACEF("page table at %p is 
clear\n", page_table); + return true; +} + +static void arm64_mmu_unmap_pt(vaddr_t vaddr, vaddr_t vaddr_rel, + size_t size, + uint index_shift, uint page_size_shift, + pte_t *page_table, uint asid) +{ + pte_t *next_page_table; + vaddr_t index; + size_t chunk_size; + vaddr_t vaddr_rem; + vaddr_t block_size; + vaddr_t block_mask; + pte_t pte; + paddr_t page_table_paddr; + + LTRACEF("vaddr 0x%lx, vaddr_rel 0x%lx, size 0x%lx, index shift %d, page_size_shift %d, page_table %p\n", + vaddr, vaddr_rel, size, index_shift, page_size_shift, page_table); + + while (size) { + block_size = 1UL << index_shift; + block_mask = block_size - 1; + vaddr_rem = vaddr_rel & block_mask; + chunk_size = MIN(size, block_size - vaddr_rem); + index = vaddr_rel >> index_shift; + + pte = page_table[index]; + + if (index_shift > page_size_shift && + (pte & MMU_PTE_DESCRIPTOR_MASK) == MMU_PTE_L012_DESCRIPTOR_TABLE) { + page_table_paddr = pte & MMU_PTE_OUTPUT_ADDR_MASK; + next_page_table = paddr_to_kvaddr(page_table_paddr); + arm64_mmu_unmap_pt(vaddr, vaddr_rem, chunk_size, + index_shift - (page_size_shift - 3), + page_size_shift, + next_page_table, asid); + if (chunk_size == block_size || + page_table_is_clear(next_page_table, page_size_shift)) { + LTRACEF("pte %p[0x%lx] = 0 (was page table)\n", page_table, index); + page_table[index] = MMU_PTE_DESCRIPTOR_INVALID; + __asm__ volatile("dmb ishst" ::: "memory"); + free_page_table(next_page_table, page_table_paddr, page_size_shift); + } + } else if (pte) { + LTRACEF("pte %p[0x%lx] = 0\n", page_table, index); + page_table[index] = MMU_PTE_DESCRIPTOR_INVALID; + CF; + if (asid == MMU_ARM64_GLOBAL_ASID) + ARM64_TLBI(vaae1is, vaddr >> 12); + else + ARM64_TLBI(vae1is, vaddr >> 12 | (vaddr_t)asid << 48); + } else { + LTRACEF("pte %p[0x%lx] already clear\n", page_table, index); + } + vaddr += chunk_size; + vaddr_rel += chunk_size; + size -= chunk_size; + } +} + +static int arm64_mmu_map_pt(vaddr_t vaddr_in, vaddr_t vaddr_rel_in, + paddr_t paddr_in, + size_t size_in, pte_t attrs, + uint index_shift, uint page_size_shift, + pte_t *page_table, uint asid) +{ + int ret; + pte_t *next_page_table; + vaddr_t index; + vaddr_t vaddr = vaddr_in; + vaddr_t vaddr_rel = vaddr_rel_in; + paddr_t paddr = paddr_in; + size_t size = size_in; + size_t chunk_size; + vaddr_t vaddr_rem; + vaddr_t block_size; + vaddr_t block_mask; + pte_t pte; + + LTRACEF("vaddr 0x%lx, vaddr_rel 0x%lx, paddr 0x%lx, size 0x%lx, attrs 0x%llx, index shift %d, page_size_shift %d, page_table %p\n", + vaddr, vaddr_rel, paddr, size, attrs, + index_shift, page_size_shift, page_table); + + if ((vaddr_rel | paddr | size) & ((1UL << page_size_shift) - 1)) { + TRACEF("not page aligned\n"); + return ERR_INVALID_ARGS; + } + + while (size) { + block_size = 1UL << index_shift; + block_mask = block_size - 1; + vaddr_rem = vaddr_rel & block_mask; + chunk_size = MIN(size, block_size - vaddr_rem); + index = vaddr_rel >> index_shift; + + if (((vaddr_rel | paddr) & block_mask) || + (chunk_size != block_size) || + (index_shift > MMU_PTE_DESCRIPTOR_BLOCK_MAX_SHIFT)) { + next_page_table = arm64_mmu_get_page_table(index, page_size_shift, + page_table); + if (!next_page_table) + goto err; + + ret = arm64_mmu_map_pt(vaddr, vaddr_rem, paddr, chunk_size, attrs, + index_shift - (page_size_shift - 3), + page_size_shift, next_page_table, asid); + if (ret) + goto err; + } else { + pte = page_table[index]; + if (pte) { + TRACEF("page table entry already in use, index 0x%lx, 0x%llx\n", + index, pte); + goto err; + } + + pte = paddr | attrs; + if (index_shift > 
page_size_shift) + pte |= MMU_PTE_L012_DESCRIPTOR_BLOCK; + else + pte |= MMU_PTE_L3_DESCRIPTOR_PAGE; + + LTRACEF("pte %p[0x%lx] = 0x%llx\n", page_table, index, pte); + page_table[index] = pte; + } + vaddr += chunk_size; + vaddr_rel += chunk_size; + paddr += chunk_size; + size -= chunk_size; + } + + return 0; + +err: + arm64_mmu_unmap_pt(vaddr_in, vaddr_rel_in, size_in - size, + index_shift, page_size_shift, page_table, asid); + DSB; + return ERR_GENERIC; +} + +int arm64_mmu_map(vaddr_t vaddr, paddr_t paddr, size_t size, pte_t attrs, + vaddr_t vaddr_base, uint top_size_shift, + uint top_index_shift, uint page_size_shift, + pte_t *top_page_table, uint asid) +{ + int ret; + vaddr_t vaddr_rel = vaddr - vaddr_base; + vaddr_t vaddr_rel_max = 1UL << top_size_shift; + + LTRACEF("vaddr 0x%lx, paddr 0x%lx, size 0x%lx, attrs 0x%llx, asid 0x%x\n", + vaddr, paddr, size, attrs, asid); + + if (vaddr_rel > vaddr_rel_max - size || size > vaddr_rel_max) { + TRACEF("vaddr 0x%lx, size 0x%lx out of range vaddr 0x%lx, size 0x%lx\n", + vaddr, size, vaddr_base, vaddr_rel_max); + return ERR_INVALID_ARGS; + } + + if (!top_page_table) { + TRACEF("page table is NULL\n"); + return ERR_INVALID_ARGS; + } + + ret = arm64_mmu_map_pt(vaddr, vaddr_rel, paddr, size, attrs, + top_index_shift, page_size_shift, top_page_table, asid); + DSB; + return ret; +} + +int arm64_mmu_unmap(vaddr_t vaddr, size_t size, + vaddr_t vaddr_base, uint top_size_shift, + uint top_index_shift, uint page_size_shift, + pte_t *top_page_table, uint asid) +{ + vaddr_t vaddr_rel = vaddr - vaddr_base; + vaddr_t vaddr_rel_max = 1UL << top_size_shift; + + LTRACEF("vaddr 0x%lx, size 0x%lx, asid 0x%x\n", vaddr, size, asid); + + if (vaddr_rel > vaddr_rel_max - size || size > vaddr_rel_max) { + TRACEF("vaddr 0x%lx, size 0x%lx out of range vaddr 0x%lx, size 0x%lx\n", + vaddr, size, vaddr_base, vaddr_rel_max); + return ERR_INVALID_ARGS; + } + + if (!top_page_table) { + TRACEF("page table is NULL\n"); + return ERR_INVALID_ARGS; + } + + arm64_mmu_unmap_pt(vaddr, vaddr_rel, size, + top_index_shift, page_size_shift, top_page_table, asid); + DSB; + return 0; +} + +int arch_mmu_map(vaddr_t vaddr, paddr_t paddr, uint count, uint flags) +{ + return arm64_mmu_map(vaddr, paddr, count * PAGE_SIZE, + mmu_flags_to_pte_attr(flags), + ~0UL << MMU_KERNEL_SIZE_SHIFT, MMU_KERNEL_SIZE_SHIFT, + MMU_KERNEL_TOP_SHIFT, MMU_KERNEL_PAGE_SIZE_SHIFT, + arm64_kernel_translation_table, MMU_ARM64_GLOBAL_ASID); +} + +int arch_mmu_unmap(vaddr_t vaddr, uint count) +{ + return arm64_mmu_unmap(vaddr, count * PAGE_SIZE, + ~0UL << MMU_KERNEL_SIZE_SHIFT, MMU_KERNEL_SIZE_SHIFT, + MMU_KERNEL_TOP_SHIFT, MMU_KERNEL_PAGE_SIZE_SHIFT, + arm64_kernel_translation_table, + MMU_ARM64_GLOBAL_ASID); +} diff --git a/arch/arm64/mp.c b/arch/arm64/mp.c new file mode 100644 index 00000000..0760ed32 --- /dev/null +++ b/arch/arm64/mp.c @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2014 Travis Geiselbrecht + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files + * (the "Software"), to deal in the Software without restriction, + * including without limitation the rights to use, copy, modify, merge, + * publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, + * subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ +#include + +#include +#include +#include +#include +#include + +#if WITH_DEV_INTERRUPT_ARM_GIC +#include +#else +#error need other implementation of interrupt controller that can ipi +#endif + +#define LOCAL_TRACE 0 + +#define GIC_IPI_BASE (14) + +status_t arch_mp_send_ipi(mp_cpu_mask_t target, mp_ipi_t ipi) +{ + LTRACEF("target 0x%x, ipi %u\n", target, ipi); + +#if WITH_DEV_INTERRUPT_ARM_GIC + uint gic_ipi_num = ipi + GIC_IPI_BASE; + + /* filter out targets outside of the range of cpus we care about */ + target &= ((1UL << SMP_MAX_CPUS) - 1); + if (target != 0) { + LTRACEF("target 0x%x, gic_ipi %u\n", target, gic_ipi_num); + arm_gic_sgi(gic_ipi_num, ARM_GIC_SGI_FLAG_NS, target); + } +#endif + + return NO_ERROR; +} + +enum handler_return arm_ipi_generic_handler(void *arg) +{ + LTRACEF("cpu %u, arg %p\n", arch_curr_cpu_num(), arg); + + return INT_NO_RESCHEDULE; +} + +enum handler_return arm_ipi_reschedule_handler(void *arg) +{ + LTRACEF("cpu %u, arg %p\n", arch_curr_cpu_num(), arg); + + return mp_mbx_reschedule_irq(); +} + +void arch_mp_init_percpu(void) +{ + register_int_handler(MP_IPI_GENERIC + GIC_IPI_BASE, &arm_ipi_generic_handler, 0); + register_int_handler(MP_IPI_RESCHEDULE + GIC_IPI_BASE, &arm_ipi_reschedule_handler, 0); + + //unmask_interrupt(MP_IPI_GENERIC); + //unmask_interrupt(MP_IPI_RESCHEDULE); +} + diff --git a/arch/arm64/rules.mk b/arch/arm64/rules.mk index b413eea8..1faf7b2a 100644 --- a/arch/arm64/rules.mk +++ b/arch/arm64/rules.mk @@ -4,7 +4,8 @@ MODULE := $(LOCAL_DIR) GLOBAL_DEFINES += \ ARM64_CPU_$(ARM_CPU)=1 \ - ARM_ISA_ARMV8=1 + ARM_ISA_ARMV8=1 \ + IS_64BIT=1 GLOBAL_INCLUDES += \ $(LOCAL_DIR)/include @@ -15,32 +16,77 @@ MODULE_SRCS += \ $(LOCAL_DIR)/exceptions.S \ $(LOCAL_DIR)/exceptions_c.c \ $(LOCAL_DIR)/thread.c \ + $(LOCAL_DIR)/spinlock.S \ $(LOCAL_DIR)/start.S \ + $(LOCAL_DIR)/cache-ops.S \ # $(LOCAL_DIR)/arm/start.S \ - $(LOCAL_DIR)/arm/cache-ops.S \ $(LOCAL_DIR)/arm/cache.c \ $(LOCAL_DIR)/arm/ops.S \ $(LOCAL_DIR)/arm/faults.c \ - $(LOCAL_DIR)/arm/mmu.c \ $(LOCAL_DIR)/arm/dcc.S GLOBAL_DEFINES += \ - ARCH_DEFAULT_STACK_SIZE=8192 + ARCH_DEFAULT_STACK_SIZE=4096 + +# if its requested we build with SMP, arm generically supports 4 cpus +ifeq ($(WITH_SMP),1) +SMP_MAX_CPUS ?= 4 +SMP_CPU_CLUSTER_SHIFT ?= 8 +SMP_CPU_ID_BITS ?= 24 # Ignore aff3 bits for now since they are not next to aff2 + +GLOBAL_DEFINES += \ + WITH_SMP=1 \ + SMP_MAX_CPUS=$(SMP_MAX_CPUS) \ + SMP_CPU_CLUSTER_SHIFT=$(SMP_CPU_CLUSTER_SHIFT) \ + SMP_CPU_ID_BITS=$(SMP_CPU_ID_BITS) + +MODULE_SRCS += \ + $(LOCAL_DIR)/mp.c +else +GLOBAL_DEFINES += \ + SMP_MAX_CPUS=1 +endif ARCH_OPTFLAGS := -O2 -# try to find the toolchain -ifndef TOOLCHAIN_PREFIX -TOOLCHAIN_PREFIX := aarch64-elf- +# we have a mmu and want the vmm/pmm +WITH_KERNEL_VM ?= 1 + +ifeq ($(WITH_KERNEL_VM),1) + +MODULE_SRCS += \ + $(LOCAL_DIR)/mmu.c + +KERNEL_ASPACE_BASE ?= 0xffff000000000000 +KERNEL_ASPACE_SIZE ?= 0x0001000000000000 + +GLOBAL_DEFINES += \ + KERNEL_ASPACE_BASE=$(KERNEL_ASPACE_BASE) \ + KERNEL_ASPACE_SIZE=$(KERNEL_ASPACE_SIZE) + +KERNEL_BASE ?= 
0xffff000000000000 +KERNEL_LOAD_OFFSET ?= 0 + +GLOBAL_DEFINES += \ + KERNEL_BASE=$(KERNEL_BASE) \ + KERNEL_LOAD_OFFSET=$(KERNEL_LOAD_OFFSET) + +else + +KERNEL_BASE ?= $(MEMBASE) +KERNEL_LOAD_OFFSET ?= 0 + endif -FOUNDTOOL=$(shell which $(TOOLCHAIN_PREFIX)gcc) -ifeq ($(FOUNDTOOL),) -$(error cannot find toolchain, please set TOOLCHAIN_PREFIX or add it to your path) -endif + +# try to find the toolchain +include $(LOCAL_DIR)/toolchain.mk +TOOLCHAIN_PREFIX := $(ARCH_$(ARCH)_TOOLCHAIN_PREFIX) $(info TOOLCHAIN_PREFIX = $(TOOLCHAIN_PREFIX)) +ARCH_COMPILEFLAGS += $(ARCH_$(ARCH)_COMPILEFLAGS) + # make sure some bits were set up MEMVARS_SET := 0 ifneq ($(MEMBASE),) @@ -63,6 +109,6 @@ GENERATED += \ $(BUILDDIR)/system-onesegment.ld: $(LOCAL_DIR)/system-onesegment.ld $(wildcard arch/*.ld) @echo generating $@ @$(MKDIR) - $(NOECHO)sed "s/%MEMBASE%/$(MEMBASE)/;s/%MEMSIZE%/$(MEMSIZE)/" < $< > $@ + $(NOECHO)sed "s/%MEMBASE%/$(MEMBASE)/;s/%MEMSIZE%/$(MEMSIZE)/;s/%KERNEL_BASE%/$(KERNEL_BASE)/;s/%KERNEL_LOAD_OFFSET%/$(KERNEL_LOAD_OFFSET)/" < $< > $@ include make/module.mk diff --git a/arch/arm64/spinlock.S b/arch/arm64/spinlock.S new file mode 100644 index 00000000..ef5b3d13 --- /dev/null +++ b/arch/arm64/spinlock.S @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2014 Google Inc. All rights reserved + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files + * (the "Software"), to deal in the Software without restriction, + * including without limitation the rights to use, copy, modify, merge, + * publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, + * subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
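Looking back at arch/arm64/mp.c above: with GIC_IPI_BASE at 14, the kernel IPIs land on the last two software-generated interrupts (SGIs are vectors 0-15 on the GIC). A short usage sketch, assuming the MP_IPI_* values come from kernel/mp.h:

    /* ask cpu 1 and cpu 2 to reschedule: this becomes SGI (MP_IPI_RESCHEDULE + 14)
       delivered only to the cpus named in the mask */
    mp_cpu_mask_t targets = (1u << 1) | (1u << 2);
    arch_mp_send_ipi(targets, MP_IPI_RESCHEDULE);
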
+ */ +#include + +.text + +FUNCTION(arch_spin_trylock) + mov x2, x0 + mov x1, #1 + ldaxr x0, [x2] + cbnz x0, 1f + stxr w0, x1, [x2] +1: + ret + +FUNCTION(arch_spin_lock) + mov x1, #1 + sevl +1: + wfe + ldaxr x2, [x0] + cbnz x2, 1b + stxr w2, x1, [x0] + cbnz w2, 1b + ret + +FUNCTION(arch_spin_unlock) + stlr xzr, [x0] + ret diff --git a/arch/arm64/start.S b/arch/arm64/start.S index 250a7246..9f76a48e 100644 --- a/arch/arm64/start.S +++ b/arch/arm64/start.S @@ -1,33 +1,342 @@ #include +#include +#include +#include + +/* + * Register use: + * x0-x3 Arguments + * x9-x15 Scratch + * x19-x28 Globals + */ +tmp .req x9 +tmp2 .req x10 +index .req x11 +index_shift .req x12 +page_table .req x13 +new_page_table .req x14 +phys_offset .req x15 + +cpuid .req x19 +page_table0 .req x20 +page_table1 .req x21 +mmu_initial_mapping .req x22 +vaddr .req x23 +paddr .req x24 +size .req x25 +attr .req x26 .section .text.boot FUNCTION(_start) - ldr x0, =__stack_end - mov sp, x0 +#if WITH_KERNEL_VM + /* enable caches so atomics and spinlocks work */ + mrs tmp, sctlr_el1 + orr tmp, tmp, #(1<<12) /* Enable icache */ + orr tmp, tmp, #(1<<2) /* Enable dcache/ucache */ + bic tmp, tmp, #(1<<3) /* Disable Stack Alignment Check */ /* TODO: don't use unaligned stacks */ + msr sctlr_el1, tmp + + /* set up the mmu according to mmu_initial_mappings */ + + /* load the base of the translation table and clear the table */ + adrp page_table1, arm64_kernel_translation_table + add page_table1, page_table1, #:lo12:arm64_kernel_translation_table + + /* Prepare tt_trampoline page table */ + /* Calculate pagetable physical addresses */ + adrp page_table0, tt_trampoline + add page_table0, page_table0, #:lo12:tt_trampoline + +#if WITH_SMP + mrs cpuid, mpidr_el1 + ubfx cpuid, cpuid, #0, #SMP_CPU_ID_BITS + cbnz cpuid, .Lmmu_enable_secondary +#endif + + mov tmp, #0 + + /* walk through all the entries in the translation table, setting them up */ +.Lclear_top_page_table_loop: + str xzr, [page_table1, tmp, lsl #3] + add tmp, tmp, #1 + cmp tmp, #MMU_KERNEL_PAGE_TABLE_ENTRIES_TOP + bne .Lclear_top_page_table_loop + + /* load the address of the mmu_initial_mappings table and start processing */ + adrp mmu_initial_mapping, mmu_initial_mappings + add mmu_initial_mapping, mmu_initial_mapping, #:lo12:mmu_initial_mappings + +.Linitial_mapping_loop: + ldp paddr, vaddr, [mmu_initial_mapping, #__MMU_INITIAL_MAPPING_PHYS_OFFSET] + ldp size, tmp, [mmu_initial_mapping, #__MMU_INITIAL_MAPPING_SIZE_OFFSET] + + tbzmask tmp, MMU_INITIAL_MAPPING_FLAG_DYNAMIC, .Lnot_dynamic + adr paddr, _start + mov size, x0 + str paddr, [mmu_initial_mapping, #__MMU_INITIAL_MAPPING_PHYS_OFFSET] + str size, [mmu_initial_mapping, #__MMU_INITIAL_MAPPING_SIZE_OFFSET] + +.Lnot_dynamic: + /* if size == 0, end of list */ + cbz size, .Linitial_mapping_done + + /* set up the flags */ + ldr attr, =MMU_PTE_KERNEL_FLAGS + tbzmask tmp, MMU_INITIAL_MAPPING_FLAG_UNCACHED, .Lnot_uncached + ldr attr, =MMU_INITIAL_MAP_STRONGLY_ORDERED + b .Lmem_type_done +.Lnot_uncached: + tbzmask tmp, MMU_INITIAL_MAPPING_FLAG_DEVICE, .Lmem_type_done + ldr attr, =MMU_INITIAL_MAP_DEVICE +.Lmem_type_done: + + /* Check that paddr, vaddr and size are page aligned */ + orr tmp, vaddr, paddr + orr tmp, tmp, size + tst tmp, #(1 << MMU_KERNEL_PAGE_SIZE_SHIFT) - 1 + bne . /* Error: not page aligned */ + + /* Clear top bits of virtual address (should be all set) */ + eor vaddr, vaddr, #(~0 << MMU_KERNEL_SIZE_SHIFT) + + /* Check that top bits were all set */ + tst vaddr, #(~0 << MMU_KERNEL_SIZE_SHIFT) + bne . 
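The AArch64 spin lock primitives above get their ordering guarantees from the acquire/release instructions (ldaxr/stlr). A rough C model of their behaviour, using GCC atomic builtins; this is a sketch of the semantics, not the implementation:

    /* trylock: returns 0 on success, non-zero if the lock was already held
       (or the exclusive store failed) */
    static inline int model_spin_trylock(volatile unsigned long *lock)
    {
        unsigned long expected = 0;
        int ok = __atomic_compare_exchange_n(lock, &expected, 1, 0,
                                             __ATOMIC_ACQUIRE, __ATOMIC_RELAXED);
        return ok ? 0 : 1;
    }

    static inline void model_spin_unlock(volatile unsigned long *lock)
    {
        __atomic_store_n(lock, 0, __ATOMIC_RELEASE);   /* stlr xzr, [x0] */
    }
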
/* Error: vaddr out of range */ + +.Lmap_range_top_loop: + /* Select top level page table */ + mov page_table, page_table1 + mov index_shift, #MMU_KERNEL_TOP_SHIFT + + lsr index, vaddr, index_shift + +.Lmap_range_one_table_loop: + /* Check if current level allow block descriptors */ + cmp index_shift, #MMU_PTE_DESCRIPTOR_BLOCK_MAX_SHIFT + b.hi .Lmap_range_need_page_table + + /* Check if paddr and vaddr alignment allows a block descriptor */ + orr tmp2, vaddr, paddr + lsr tmp, tmp2, index_shift + lsl tmp, tmp, index_shift + cmp tmp, tmp2 + b.ne .Lmap_range_need_page_table + + /* Check if size is large enough for a block mapping */ + lsr tmp, size, index_shift + cbz tmp, .Lmap_range_need_page_table + + /* Select descriptor type, page for level 3, block for level 0-2 */ + orr tmp, attr, #MMU_PTE_L3_DESCRIPTOR_PAGE + cmp index_shift, MMU_KERNEL_PAGE_SIZE_SHIFT + beq .Lmap_range_l3 + orr tmp, attr, #MMU_PTE_L012_DESCRIPTOR_BLOCK +.Lmap_range_l3: + + /* Write page table entry */ + orr tmp, tmp, paddr + str tmp, [page_table, index, lsl #3] + + /* Move to next page table entry */ + mov tmp, #1 + lsl tmp, tmp, index_shift + add vaddr, vaddr, tmp + add paddr, paddr, tmp + subs size, size, tmp + /* TODO: add local loop if next entry is in the same page table */ + b.ne .Lmap_range_top_loop /* size != 0 */ + + /* Move to next mmu_initial_mappings entry */ + add mmu_initial_mapping, mmu_initial_mapping, __MMU_INITIAL_MAPPING_SIZE + b .Linitial_mapping_loop + +.Lmap_range_need_page_table: + /* Check if page table entry is unused */ + ldr new_page_table, [page_table, index, lsl #3] + cbnz new_page_table, .Lmap_range_has_page_table + + /* Calculate phys offset (needed for memory allocation) */ +.Lphys_offset: + adr phys_offset, .Lphys_offset /* phys */ + ldr tmp, =.Lphys_offset /* virt */ + sub phys_offset, tmp, phys_offset + + /* Allocate new page table */ + calloc_bootmem_aligned new_page_table, tmp, tmp2, MMU_KERNEL_PAGE_SIZE_SHIFT, phys_offset + + /* Write page table entry (with allocated page table) */ + orr new_page_table, new_page_table, #MMU_PTE_L012_DESCRIPTOR_TABLE + str new_page_table, [page_table, index, lsl #3] + +.Lmap_range_has_page_table: + /* Check descriptor type */ + and tmp, new_page_table, #MMU_PTE_DESCRIPTOR_MASK + cmp tmp, #MMU_PTE_L012_DESCRIPTOR_TABLE + b.ne . 
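The mapping loop above decides, per translation level, whether the current chunk can be covered by a single block descriptor or needs to descend into a lower-level table. In C the test amounts to the following sketch, assuming the MMU_* constants from the arm64 headers:

    static bool can_use_block_descriptor(uint index_shift, uint64_t vaddr,
                                         uint64_t paddr, uint64_t size_left)
    {
        if (index_shift > MMU_PTE_DESCRIPTOR_BLOCK_MAX_SHIFT)
            return false;                      /* this level only allows table entries */

        uint64_t block_size = 1ULL << index_shift;
        if ((vaddr | paddr) & (block_size - 1))
            return false;                      /* vaddr/paddr not aligned to the block size */

        return size_left >= block_size;        /* enough of the region left to fill a block */
    }
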
/* Error: entry already in use (as a block entry) */ + + /* switch to next page table level */ + bic page_table, new_page_table, #MMU_PTE_DESCRIPTOR_MASK + mov tmp, #~0 + lsl tmp, tmp, index_shift + bic tmp, vaddr, tmp + sub index_shift, index_shift, #(MMU_KERNEL_PAGE_SIZE_SHIFT - 3) + lsr index, tmp, index_shift + + b .Lmap_range_one_table_loop + +.Linitial_mapping_done: + + /* Prepare tt_trampoline page table */ + + /* Zero tt_trampoline translation tables */ + mov tmp, #0 +.Lclear_tt_trampoline: + str xzr, [page_table0, tmp, lsl#3] + add tmp, tmp, #1 + cmp tmp, #MMU_PAGE_TABLE_ENTRIES_IDENT + blt .Lclear_tt_trampoline + + /* Setup mapping at phys -> phys */ + adr tmp, .Lmmu_on_pc + lsr tmp, tmp, #MMU_IDENT_TOP_SHIFT /* tmp = paddr index */ + ldr tmp2, =MMU_PTE_IDENT_FLAGS + add tmp2, tmp2, tmp, lsl #MMU_IDENT_TOP_SHIFT /* tmp2 = pt entry */ + + str tmp2, [page_table0, tmp, lsl #3] /* tt_trampoline[paddr index] = pt entry */ + +#if WITH_SMP + adr tmp, page_tables_not_ready + str wzr, [tmp] + b .Lpage_tables_ready + +.Lmmu_enable_secondary: + adr tmp, page_tables_not_ready +.Lpage_tables_not_ready: + ldr tmp2, [tmp] + cbnz tmp2, .Lpage_tables_not_ready +.Lpage_tables_ready: +#endif + + /* set up the mmu */ + + /* Invalidate TLB */ + tlbi vmalle1is + isb + dsb sy + + /* Initialize Memory Attribute Indirection Register */ + ldr tmp, =MMU_MAIR_VAL + msr mair_el1, tmp + + /* Initialize TCR_EL1 */ + /* set cacheable attributes on translation walk */ + /* (SMP extensions) non-shareable, inner write-back write-allocate */ + ldr tmp, =MMU_TCR_FLAGS_IDENT + msr tcr_el1, tmp + + isb + + /* Write ttbr with phys addr of the translation table */ + msr ttbr0_el1, page_table0 + msr ttbr1_el1, page_table1 + isb + + /* Read SCTLR */ + mrs tmp, sctlr_el1 + + /* Turn on the MMU */ + orr tmp, tmp, #0x1 + + /* Write back SCTLR */ + msr sctlr_el1, tmp +.Lmmu_on_pc: + isb + + /* Jump to virtual code address */ + ldr tmp, =.Lmmu_on_vaddr + br tmp + +.Lmmu_on_vaddr: + + /* Disable trampoline page-table in ttbr0 */ + ldr tmp, =MMU_TCR_FLAGS_KERNEL + msr tcr_el1, tmp + isb + + + /* Invalidate TLB */ + tlbi vmalle1 + isb + +#if WITH_SMP + cbnz cpuid, .Lsecondary_boot +#endif +#endif /* WITH_KERNEL_VM */ + + ldr tmp, =__stack_end + mov sp, tmp /* clear bss */ .L__do_bss: /* clear out the bss */ /* NOTE: relies on __bss_start and __bss_end being 8 byte aligned */ - ldr x0, =__bss_start - ldr x1, =__bss_end - mov x2, #0 - sub x1, x1, x0 - cbz x1, .L__bss_loop_done + ldr tmp, =__bss_start + ldr tmp2, =__bss_end + sub tmp2, tmp2, tmp + cbz tmp2, .L__bss_loop_done .L__bss_loop: - sub x1, x1, #8 - str x2, [x0], #8 - cbnz x1, .L__bss_loop + sub tmp2, tmp2, #8 + str xzr, [tmp], #8 + cbnz tmp2, .L__bss_loop .L__bss_loop_done: bl lk_main b . 
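The .Lsecondary_boot path that follows folds the MPIDR affinity fields into a small linear cpu number and carves each secondary a slice of the boot stack. Roughly, in C (aff2 and above are ignored here, matching the SMP_CPU_ID_BITS comment in rules.mk):

    extern char __stack_end[];

    /* linear cpu number: aff0 indexes within the cluster, aff1 selects the cluster;
       aff0 must be < (1 << SMP_CPU_CLUSTER_SHIFT) and the result < SMP_MAX_CPUS */
    uint aff0 = mpidr & 0xff;
    uint aff1 = (mpidr >> 8) & 0xff;
    uint cpu  = aff0 | (aff1 << SMP_CPU_CLUSTER_SHIFT);

    /* each secondary gets its own slice of the boot stack below __stack_end */
    uintptr_t sp = (uintptr_t)__stack_end - cpu * ARCH_DEFAULT_STACK_SIZE;
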
+#if WITH_SMP +.Lsecondary_boot: + and tmp, cpuid, #0xff + cmp tmp, #(1 << SMP_CPU_CLUSTER_SHIFT) + bge .Lunsupported_cpu_trap + bic cpuid, cpuid, #0xff + orr cpuid, tmp, cpuid, LSR #(8 - SMP_CPU_CLUSTER_SHIFT) + + cmp cpuid, #SMP_MAX_CPUS + bge .Lunsupported_cpu_trap + + /* Set up the stack */ + ldr tmp, =__stack_end + mov tmp2, #ARCH_DEFAULT_STACK_SIZE + mul tmp2, tmp2, cpuid + sub sp, tmp, tmp2 + + mov x0, cpuid + bl arm64_secondary_entry + +.Lunsupported_cpu_trap: + wfe + b .Lunsupported_cpu_trap +#endif + .ltorg +#if WITH_SMP +.data +DATA(page_tables_not_ready) + .long 1 +DATA(secondary_cpu_allocated_stack) + .quad 0 +#endif + .section .bss.prebss.stack .align 4 DATA(__stack) - .skip 0x2000 + .skip ARCH_DEFAULT_STACK_SIZE * SMP_MAX_CPUS DATA(__stack_end) +#if WITH_KERNEL_VM +.section ".bss.prebss.translation_table" +.align 3 + MMU_PAGE_TABLE_ENTRIES_IDENT_SHIFT +DATA(tt_trampoline) + .skip 8 * MMU_PAGE_TABLE_ENTRIES_IDENT +#endif diff --git a/arch/arm64/system-onesegment.ld b/arch/arm64/system-onesegment.ld index 719474c5..8677be7c 100644 --- a/arch/arm64/system-onesegment.ld +++ b/arch/arm64/system-onesegment.ld @@ -4,10 +4,11 @@ OUTPUT_ARCH(aarch64) ENTRY(_start) SECTIONS { - . = %MEMBASE%; + . = %KERNEL_BASE% + %KERNEL_LOAD_OFFSET%; /* text/read-only data */ - .text : { + /* set the load address to physical MEMBASE */ + .text : AT(%MEMBASE% + %KERNEL_LOAD_OFFSET%) { KEEP(*(.text.boot)) KEEP(*(.text.boot.vectab)) *(.text* .sram.text.glue_7* .gnu.linkonce.t.*) @@ -93,7 +94,7 @@ INCLUDE "arch/shared_data_sections.ld" . = ALIGN(8); _end = .; - . = %MEMBASE% + %MEMSIZE%; + . = %KERNEL_BASE% + %MEMSIZE%; _end_of_ram = .; /* Strip unnecessary stuff */ diff --git a/arch/arm64/thread.c b/arch/arm64/thread.c index edecae3d..5d21b278 100644 --- a/arch/arm64/thread.c +++ b/arch/arm64/thread.c @@ -57,8 +57,9 @@ static void initial_thread_func(void) LTRACEF("initial_thread_func: thread %p calling %p with arg %p\n", current_thread, current_thread->entry, current_thread->arg); - /* exit the implicit critical section we're within */ - exit_critical_section(); + /* release the thread lock that was implicitly held across the reschedule */ + spin_unlock(&thread_lock); + arch_enable_ints(); ret = current_thread->entry(current_thread->arg); @@ -92,3 +93,10 @@ void arch_context_switch(thread_t *oldthread, thread_t *newthread) arm64_context_switch(&oldthread->arch.sp, newthread->arch.sp); } +void arch_dump_thread(thread_t *t) +{ + if (t->state != THREAD_RUNNING) { + dprintf(INFO, "\tarch: "); + dprintf(INFO, "sp 0x%lx\n", t->arch.sp); + } +} diff --git a/arch/arm64/toolchain.mk b/arch/arm64/toolchain.mk new file mode 100644 index 00000000..4fd8c644 --- /dev/null +++ b/arch/arm64/toolchain.mk @@ -0,0 +1,18 @@ +ifndef ARCH_arm64_TOOLCHAIN_INCLUDED +ARCH_arm64_TOOLCHAIN_INCLUDED := 1 + +ifndef ARCH_arm64_TOOLCHAIN_PREFIX +ARCH_arm64_TOOLCHAIN_PREFIX := aarch64-elf- +FOUNDTOOL=$(shell which $(ARCH_arm64_TOOLCHAIN_PREFIX)gcc) +ifeq ($(FOUNDTOOL),) +ARCH_arm64_TOOLCHAIN_PREFIX := aarch64-linux-android- +FOUNDTOOL=$(shell which $(ARCH_arm64_TOOLCHAIN_PREFIX)gcc) +ifeq ($(FOUNDTOOL),) +$(error cannot find toolchain, please set ARCH_arm64_TOOLCHAIN_PREFIX or add it to your path) +endif +endif +endif + +ARCH_arm64_COMPILEFLAGS := -mgeneral-regs-only -DWITH_NO_FP=1 + +endif diff --git a/arch/microblaze/exceptions.c b/arch/microblaze/exceptions.c index 807b5aca..857662ee 100644 --- a/arch/microblaze/exceptions.c +++ b/arch/microblaze/exceptions.c @@ -31,11 +31,7 @@ enum handler_return platform_irq_handler(void); void 
microblaze_irq(void) { - inc_critical_section(); - if (platform_irq_handler() == INT_RESCHEDULE) thread_preempt(); - - dec_critical_section(); } diff --git a/arch/microblaze/include/arch/arch_ops.h b/arch/microblaze/include/arch/arch_ops.h index 7a54bdaa..7d5fd942 100644 --- a/arch/microblaze/include/arch/arch_ops.h +++ b/arch/microblaze/include/arch/arch_ops.h @@ -56,54 +56,37 @@ static inline void arch_disable_ints(void) CF; } +static inline bool arch_ints_disabled(void) +{ + uint32_t state; + + __asm__ volatile( + "mfs %0, rmsr;" + : "=r" (state)); + + return !(state & (1<<1)); +} + static inline int atomic_add(volatile int *ptr, int val) { - return __atomic_fetch_add(ptr, val, __ATOMIC_RELAXED); + return __atomic_fetch_add(ptr, val, __ATOMIC_RELAXED); } static inline int atomic_or(volatile int *ptr, int val) { - return __atomic_fetch_or(ptr, val, __ATOMIC_RELAXED); + return __atomic_fetch_or(ptr, val, __ATOMIC_RELAXED); } static inline int atomic_and(volatile int *ptr, int val) { - return __atomic_fetch_and(ptr, val, __ATOMIC_RELAXED); + return __atomic_fetch_and(ptr, val, __ATOMIC_RELAXED); } static inline int atomic_swap(volatile int *ptr, int val) { - return __atomic_exchange_n(ptr, val, __ATOMIC_RELAXED); + return __atomic_exchange_n(ptr, val, __ATOMIC_RELAXED); } -#if 0 -static inline int atomic_cmpxchg(volatile int *ptr, int oldval, int newval) -{ - int old; - int test; - - do { - __asm__ volatile( - "ldrex %[old], [%[ptr]]\n" - "mov %[test], #0\n" - "teq %[old], %[oldval]\n" -#if ARM_ISA_ARMV7M - "bne 0f\n" - "strex %[test], %[newval], [%[ptr]]\n" - "0:\n" -#else - "strexeq %[test], %[newval], [%[ptr]]\n" -#endif - : [old]"=&r" (old), [test]"=&r" (test) - : [ptr]"r" (ptr), [oldval]"Ir" (oldval), [newval]"r" (newval) - : "cc"); - - } while (test != 0); - - return old; -} -#endif - /* use a global pointer to store the current_thread */ extern struct thread *_current_thread; @@ -119,3 +102,9 @@ static inline void set_current_thread(struct thread *t) static inline uint32_t arch_cycle_count(void) { return 0; } +static inline uint arch_curr_cpu_num(void) +{ + return 0; +} + + diff --git a/arch/microblaze/include/arch/spinlock.h b/arch/microblaze/include/arch/spinlock.h new file mode 100644 index 00000000..5c50c5b9 --- /dev/null +++ b/arch/microblaze/include/arch/spinlock.h @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2015 Travis Geiselbrecht + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files + * (the "Software"), to deal in the Software without restriction, + * including without limitation the rights to use, copy, modify, merge, + * publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, + * subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
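The uniprocessor spin lock header that follows only makes sense together with the interrupt save/restore hooks it defines; the generic wrapper (which lives in the common spinlock header, not part of this diff) is expected to combine them roughly as in this sketch:

    /* sketch of the generic irqsave wrappers built on the arch hooks */
    static inline void spin_lock_irqsave_sketch(spin_lock_t *lock,
                                                spin_lock_saved_state_t *state)
    {
        arch_interrupt_save(state, ARCH_DEFAULT_SPIN_LOCK_FLAG_INTERRUPTS);
        arch_spin_lock(lock);            /* a plain store on single-cpu ports */
    }

    static inline void spin_unlock_irqrestore_sketch(spin_lock_t *lock,
                                                     spin_lock_saved_state_t state)
    {
        arch_spin_unlock(lock);
        arch_interrupt_restore(state, ARCH_DEFAULT_SPIN_LOCK_FLAG_INTERRUPTS);
    }
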
+ */ +#pragma once + +#include +#include + +#if WITH_SMP +#error microblaze does not support SMP +#endif + +#define SPIN_LOCK_INITIAL_VALUE (0) + +typedef unsigned int spin_lock_t; + +typedef unsigned int spin_lock_saved_state_t; +typedef unsigned int spin_lock_save_flags_t; + +static inline void arch_spin_lock(spin_lock_t *lock) +{ + *lock = 1; +} + +static inline int arch_spin_trylock(spin_lock_t *lock) +{ + return 0; +} + +static inline void arch_spin_unlock(spin_lock_t *lock) +{ + *lock = 0; +} + +static inline void arch_spin_lock_init(spin_lock_t *lock) +{ + *lock = SPIN_LOCK_INITIAL_VALUE; +} + +static inline bool arch_spin_lock_held(spin_lock_t *lock) +{ + return *lock != 0; +} + + /* default arm flag is to just disable plain irqs */ +#define ARCH_DEFAULT_SPIN_LOCK_FLAG_INTERRUPTS 0 + +enum { + /* private */ + SPIN_LOCK_STATE_RESTORE_IRQ = 1, +}; + +static inline void +arch_interrupt_save(spin_lock_saved_state_t *statep, spin_lock_save_flags_t flags) +{ + spin_lock_saved_state_t state = 0; + if (!arch_ints_disabled()) { + state |= SPIN_LOCK_STATE_RESTORE_IRQ; + arch_disable_ints(); + } + *statep = state; +} + +static inline void +arch_interrupt_restore(spin_lock_saved_state_t old_state, spin_lock_save_flags_t flags) +{ + if (old_state & SPIN_LOCK_STATE_RESTORE_IRQ) + arch_enable_ints(); +} + + + + diff --git a/arch/microblaze/rules.mk b/arch/microblaze/rules.mk index 48a80211..7c3ebe21 100644 --- a/arch/microblaze/rules.mk +++ b/arch/microblaze/rules.mk @@ -19,6 +19,9 @@ MODULE_SRCS += \ $(LOCAL_DIR)/faults.c \ $(LOCAL_DIR)/descriptor.c +GLOBAL_DEFINES += \ + SMP_MAX_CPUS=1 + # set the default toolchain to microblaze elf and set a #define ifndef TOOLCHAIN_PREFIX TOOLCHAIN_PREFIX := microblaze-elf- diff --git a/arch/microblaze/thread.c b/arch/microblaze/thread.c index 22fd0d2b..2237971b 100644 --- a/arch/microblaze/thread.c +++ b/arch/microblaze/thread.c @@ -42,8 +42,9 @@ static void initial_thread_func(void) dump_thread(ct); #endif - /* exit the implicit critical section we're within */ - exit_critical_section(); + /* release the thread lock that was implicitly held across the reschedule */ + spin_unlock(&thread_lock); + arch_enable_ints(); int ret = ct->entry(ct->arg); @@ -82,3 +83,11 @@ void arch_context_switch(thread_t *oldthread, thread_t *newthread) microblaze_context_switch(&oldthread->arch.cs_frame, &newthread->arch.cs_frame); } +void arch_dump_thread(thread_t *t) +{ + if (t->state != THREAD_RUNNING) { + dprintf(INFO, "\tarch: "); + dprintf(INFO, "sp 0x%x\n", t->arch.cs_frame.r1); + } +} + diff --git a/arch/x86-64/thread.c b/arch/x86-64/thread.c index dea6fbdf..15ae7cb4 100644 --- a/arch/x86-64/thread.c +++ b/arch/x86-64/thread.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include @@ -45,8 +46,9 @@ static void initial_thread_func(void) { int ret; - /* exit the implicit critical section we're within */ - exit_critical_section(); + /* release the thread lock that was implicitly held across the reschedule */ + spin_unlock(&thread_lock); + arch_enable_ints(); ret = _current_thread->entry(_current_thread->arg); @@ -126,3 +128,5 @@ void arch_context_switch(thread_t *oldthread, thread_t *newthread) : "g" (newthread->arch.rsp) ); } + +/* vim: noexpandtab */ diff --git a/arch/x86/crt0.S b/arch/x86/crt0.S index 4550b8ab..26cc8436 100644 --- a/arch/x86/crt0.S +++ b/arch/x86/crt0.S @@ -173,8 +173,6 @@ interrupt_common: movl %esp, %eax /* store pointer to iframe, using same method */ pushl %eax - incl critical_section_count - call platform_irq cmpl $0,%eax @@ -182,8 
+180,6 @@ interrupt_common: call thread_preempt 0: - decl critical_section_count - popl %eax /* drop pointer to iframe */ popl %eax /* restore task_esp, stack switch can occur here if task_esp is modified */ movl %eax, %esp diff --git a/arch/x86/faults.c b/arch/x86/faults.c index afdc0bb9..32c732f7 100644 --- a/arch/x86/faults.c +++ b/arch/x86/faults.c @@ -46,7 +46,6 @@ static void dump_fault_frame(struct x86_iframe *frame) static void exception_die(struct x86_iframe *frame, const char *msg) { - inc_critical_section(); dprintf(CRITICAL, msg); dump_fault_frame(frame); diff --git a/arch/x86/include/arch/arch_ops.h b/arch/x86/include/arch/arch_ops.h index f0816cdb..f8c5f169 100644 --- a/arch/x86/include/arch/arch_ops.h +++ b/arch/x86/include/arch/arch_ops.h @@ -37,13 +37,13 @@ static inline void arch_enable_ints(void) __asm__ volatile("sti"); } -static inline inline void arch_disable_ints(void) +static inline void arch_disable_ints(void) { __asm__ volatile("cli"); CF; } -static inline inline bool arch_ints_disabled(void) +static inline bool arch_ints_disabled(void) { unsigned int state; @@ -53,7 +53,7 @@ static inline inline bool arch_ints_disabled(void) : "=a" (state) :: "memory"); - return !!(state & (1<<9)); + return !(state & (1<<9)); } int _atomic_and(volatile int *ptr, int val); @@ -110,6 +110,11 @@ static inline void set_current_thread(struct thread *t) _current_thread = t; } +static inline uint arch_curr_cpu_num(void) +{ + return 0; +} + #endif // !ASSEMBLY #endif diff --git a/arch/x86/include/arch/spinlock.h b/arch/x86/include/arch/spinlock.h new file mode 100644 index 00000000..42d35a7a --- /dev/null +++ b/arch/x86/include/arch/spinlock.h @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2015 Travis Geiselbrecht + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files + * (the "Software"), to deal in the Software without restriction, + * including without limitation the rights to use, copy, modify, merge, + * publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, + * subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
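IF is bit 9 of EFLAGS and is set when interrupts are enabled, which is why the arch_ints_disabled() predicate above had to be inverted. The x86.h helpers added below make the usual save/disable/restore sequence explicit; a sketch of an open-coded interrupt-off region using them:

    /* this is essentially what arch_interrupt_save/restore below boil down to on x86 */
    uint32_t flags = x86_save_eflags();   /* remember IF along with the rest of EFLAGS */
    arch_disable_ints();
    /* ... touch per-cpu or device state ... */
    x86_restore_eflags(flags);            /* re-enables interrupts only if they were on */
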
+ */ +#pragma once + +#include +#include +#include + +#define SPIN_LOCK_INITIAL_VALUE (0) + +typedef unsigned long spin_lock_t; + +typedef uint32_t spin_lock_saved_state_t; +typedef uint spin_lock_save_flags_t; + +/* simple implementation of spinlocks for no smp support */ +static inline void arch_spin_lock_init(spin_lock_t *lock) +{ + *lock = SPIN_LOCK_INITIAL_VALUE; +} + +static inline bool arch_spin_lock_held(spin_lock_t *lock) +{ + return *lock != 0; +} + +static inline void arch_spin_lock(spin_lock_t *lock) +{ + *lock = 1; +} + +static inline int arch_spin_trylock(spin_lock_t *lock) +{ + return 0; +} + +static inline void arch_spin_unlock(spin_lock_t *lock) +{ + *lock = 0; +} + +/* flags are unused on x86 */ +#define ARCH_DEFAULT_SPIN_LOCK_FLAG_INTERRUPTS 0 + +static inline void +arch_interrupt_save(spin_lock_saved_state_t *statep, spin_lock_save_flags_t flags) +{ + *statep = x86_save_eflags(); + arch_disable_ints(); +} + +static inline void +arch_interrupt_restore(spin_lock_saved_state_t old_state, spin_lock_save_flags_t flags) +{ + x86_restore_eflags(old_state); +} + + diff --git a/arch/x86/include/arch/x86.h b/arch/x86/include/arch/x86.h index 0caa0560..02493532 100644 --- a/arch/x86/include/arch/x86.h +++ b/arch/x86/include/arch/x86.h @@ -118,6 +118,28 @@ static inline uint32_t x86_get_cr2(void) return rv; } +static inline uint32_t x86_save_eflags(void) +{ + unsigned int state; + + __asm__ volatile( + "pushfl;" + "popl %0" + : "=rm" (state) + :: "memory"); + + return state; +} + +static inline void x86_restore_eflags(uint32_t eflags) +{ + __asm__ volatile( + "pushl %0;" + "popfl" + :: "g" (eflags) + : "memory", "cc"); +} + #define rdtsc(low,high) \ __asm__ __volatile__("rdtsc" : "=a" (low), "=d" (high)) diff --git a/arch/x86/rules.mk b/arch/x86/rules.mk index 8eb4dccf..eee65490 100644 --- a/arch/x86/rules.mk +++ b/arch/x86/rules.mk @@ -22,6 +22,10 @@ ifndef TOOLCHAIN_PREFIX TOOLCHAIN_PREFIX := i386-elf- endif +# for the moment, SMP is not supported on x86 +GLOBAL_DEFINES += \ + SMP_MAX_CPUS=1 + LIBGCC := $(shell $(TOOLCHAIN_PREFIX)gcc $(CFLAGS) -print-libgcc-file-name) #$(info LIBGCC = $(LIBGCC)) diff --git a/arch/x86/thread.c b/arch/x86/thread.c index 190ef9f5..fff6d348 100644 --- a/arch/x86/thread.c +++ b/arch/x86/thread.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include @@ -50,11 +51,9 @@ static void initial_thread_func(void) { int ret; -// dprintf("initial_thread_func: thread %p calling %p with arg %p\n", _current_thread, _current_thread->entry, _current_thread->arg); -// dump_thread(_current_thread); - - /* exit the implicit critical section we're within */ - exit_critical_section(); + /* release the thread lock that was implicitly held across the reschedule */ + spin_unlock(&thread_lock); + arch_enable_ints(); ret = _current_thread->entry(_current_thread->arg); @@ -89,6 +88,14 @@ void arch_thread_initialize(thread_t *t) t->arch.esp = (vaddr_t)frame; } +void arch_dump_thread(thread_t *t) +{ + if (t->state != THREAD_RUNNING) { + dprintf(INFO, "\tarch: "); + dprintf(INFO, "sp 0x%lx\n", t->arch.esp); + } +} + void arch_context_switch(thread_t *oldthread, thread_t *newthread) { //dprintf(DEBUG, "arch_context_switch: old %p (%s), new %p (%s)\n", oldthread, oldthread->name, newthread, newthread->name); diff --git a/dev/cache/pl310/pl310.c b/dev/cache/pl310/pl310.c index 5e1b7f4e..13c59163 100644 --- a/dev/cache/pl310/pl310.c +++ b/dev/cache/pl310/pl310.c @@ -104,6 +104,10 @@ static void pl310_init(uint level) PL310_REG(REG1_TAG_RAM_CONTROL) = 
PL310_TAG_RAM_LATENCY; PL310_REG(REG1_DATA_RAM_CONTROL) = PL310_DATA_RAM_LATENCY; + /* configure */ + /* early BRESP enable, instruction/data prefetch, exclusive cache, full line of zero */ + PL310_REG(REG1_AUX_CONTROL) |= (1<<30)|(1<<29)|(1<<28)|(1<<12)|(1<<0); + /* flush all the ways */ PL310_REG(REG7_INV_WAY) = 0xffff; } @@ -120,6 +124,7 @@ status_t pl310_set_enable(bool enable) if (enable) { if ((PL310_REG(REG1_CONTROL) & 1) == 0) { /* if disabled */ + pl310_invalidate(); PL310_REG(REG1_CONTROL) = 1; } } else { diff --git a/dev/interrupt/arm_gic/arm_gic.c b/dev/interrupt/arm_gic/arm_gic.c index e49a404b..0069d1fa 100644 --- a/dev/interrupt/arm_gic/arm_gic.c +++ b/dev/interrupt/arm_gic/arm_gic.c @@ -20,6 +20,8 @@ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include +#include #include #include #include @@ -27,9 +29,9 @@ #include #include #include +#include #include #include -#include #include #include #if WITH_LIB_SM @@ -60,11 +62,17 @@ static bool arm_gic_interrupt_change_allowed(int irq) TRACEF("change to interrupt %d ignored after booting ns\n", irq); return false; } + +static void suspend_resume_fiq(bool resume_gicc, bool resume_gicd); #else static bool arm_gic_interrupt_change_allowed(int irq) { return true; } + +static void suspend_resume_fiq(bool resume_gicc, bool resume_gicd) +{ +} #endif @@ -73,10 +81,22 @@ struct int_handler_struct { void *arg; }; -static struct int_handler_struct int_handler_table[MAX_INT]; +static struct int_handler_struct int_handler_table_per_cpu[GIC_MAX_PER_CPU_INT][SMP_MAX_CPUS]; +static struct int_handler_struct int_handler_table_shared[MAX_INT-GIC_MAX_PER_CPU_INT]; + +static struct int_handler_struct *get_int_handler(unsigned int vector, uint cpu) +{ + if (vector < GIC_MAX_PER_CPU_INT) + return &int_handler_table_per_cpu[vector][cpu]; + else + return &int_handler_table_shared[vector - GIC_MAX_PER_CPU_INT]; +} void register_int_handler(unsigned int vector, int_handler handler, void *arg) { + struct int_handler_struct *h; + uint cpu = arch_curr_cpu_num(); + spin_lock_saved_state_t state; if (vector >= MAX_INT) @@ -85,8 +105,9 @@ void register_int_handler(unsigned int vector, int_handler handler, void *arg) spin_lock_save(&gicd_lock, &state, GICD_LOCK_FLAGS); if (arm_gic_interrupt_change_allowed(vector)) { - int_handler_table[vector].handler = handler; - int_handler_table[vector].arg = arg; + h = get_int_handler(vector, cpu); + h->handler = handler; + h->arg = arg; } spin_unlock_restore(&gicd_lock, state, GICD_LOCK_FLAGS); @@ -130,6 +151,19 @@ void register_int_handler(unsigned int vector, int_handler handler, void *arg) #define GICD_CPENDSGIR(n) (GICD_OFFSET + 0xf10 + (n) * 4) #define GICD_SPENDSGIR(n) (GICD_OFFSET + 0xf20 + (n) * 4) +#define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d)) +#define GIC_REG_COUNT(bit_per_reg) DIV_ROUND_UP(MAX_INT, (bit_per_reg)) +#define DEFINE_GIC_SHADOW_REG(name, bit_per_reg, init_val, init_from) \ + uint32_t (name)[GIC_REG_COUNT(bit_per_reg)] = { \ + [(init_from / bit_per_reg) ... 
\ + (GIC_REG_COUNT(bit_per_reg) - 1)] = (init_val) \ + } + +#if WITH_LIB_SM +static DEFINE_GIC_SHADOW_REG(gicd_igroupr, 32, ~0U, 0); +#endif +static DEFINE_GIC_SHADOW_REG(gicd_itargetsr, 4, 0x01010101, 32); + static void gic_set_enable(uint vector, bool enable) { int reg = vector / 32; @@ -141,17 +175,49 @@ static void gic_set_enable(uint vector, bool enable) GICREG(0, GICD_ICENABLER(reg)) = mask; } -void arm_gic_init_secondary_cpu(void) +static void arm_gic_init_percpu(uint level) { #if WITH_LIB_SM GICREG(0, GICC_CTLR) = 0xb; // enable GIC0 and select fiq mode for secure - GICREG(0, GICD_IGROUPR(0)) = ~0UL; /* GICD_IGROUPR0 is banked */ + GICREG(0, GICD_IGROUPR(0)) = ~0U; /* GICD_IGROUPR0 is banked */ #else GICREG(0, GICC_CTLR) = 1; // enable GIC0 #endif GICREG(0, GICC_PMR) = 0xFF; // unmask interrupts at all priority levels } +LK_INIT_HOOK_FLAGS(arm_gic_init_percpu, + arm_gic_init_percpu, + LK_INIT_LEVEL_PLATFORM_EARLY, LK_INIT_FLAG_SECONDARY_CPUS); + +static void arm_gic_suspend_cpu(uint level) +{ + suspend_resume_fiq(false, false); +} + +LK_INIT_HOOK_FLAGS(arm_gic_suspend_cpu, arm_gic_suspend_cpu, + LK_INIT_LEVEL_PLATFORM, LK_INIT_FLAG_CPU_SUSPEND); + +static void arm_gic_resume_cpu(uint level) +{ + spin_lock_saved_state_t state; + bool resume_gicd = false; + + spin_lock_save(&gicd_lock, &state, GICD_LOCK_FLAGS); + if (!(GICREG(0, GICD_CTLR) & 1)) { + dprintf(SPEW, "%s: distibutor is off, calling arm_gic_init instead\n", __func__); + arm_gic_init(); + resume_gicd = true; + } else { + arm_gic_init_percpu(0); + } + spin_unlock_restore(&gicd_lock, state, GICD_LOCK_FLAGS); + suspend_resume_fiq(true, resume_gicd); +} + +LK_INIT_HOOK_FLAGS(arm_gic_resume_cpu, arm_gic_resume_cpu, + LK_INIT_LEVEL_PLATFORM, LK_INIT_FLAG_CPU_RESUME); + static int arm_gic_max_cpu(void) { return (GICREG(0, GICD_TYPER) >> 5) & 0x7; @@ -169,21 +235,24 @@ void arm_gic_init(void) if (arm_gic_max_cpu() > 0) { /* Set external interrupts to target cpu 0 */ for (i = 32; i < MAX_INT; i += 4) { - GICREG(0, GICD_ITARGETSR(i / 4)) = 0x01010101; + GICREG(0, GICD_ITARGETSR(i / 4)) = gicd_itargetsr[i / 4]; } } GICREG(0, GICD_CTLR) = 1; // enable GIC0 #if WITH_LIB_SM + GICREG(0, GICD_CTLR) = 3; // enable GIC0 ns interrupts /* * Iterate through all IRQs and set them to non-secure * mode. This will allow the non-secure side to handle * all the interrupts we don't explicitly claim. 
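DEFINE_GIC_SHADOW_REG above relies on GCC's designated range initializers. For a concrete feel, with an illustrative MAX_INT of 160 the ITARGETSR shadow (4 interrupts per 32-bit register, SPIs starting at interrupt 32) expands to:

    /* 160 / 4 = 40 registers; SPIs start at register 32 / 4 = 8;
       each 8-bit field is 0x01, i.e. every SPI initially routed to cpu 0 */
    uint32_t gicd_itargetsr[40] = {
        [8 ... 39] = 0x01010101,
    };
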
*/ - for (i = 32; i < MAX_INT; i += 32) - GICREG(0, GICD_IGROUPR(i / 32)) = ~0UL; + for (i = 32; i < MAX_INT; i += 32) { + u_int reg = i / 32; + GICREG(0, GICD_IGROUPR(reg)) = gicd_igroupr[reg]; + } #endif - arm_gic_init_secondary_cpu(); + arm_gic_init_percpu(0); } static status_t arm_gic_set_secure_locked(u_int irq, bool secure) @@ -196,9 +265,9 @@ static status_t arm_gic_set_secure_locked(u_int irq, bool secure) return ERR_INVALID_ARGS; if (secure) - GICREG(0, GICD_IGROUPR(reg)) &= ~mask; + GICREG(0, GICD_IGROUPR(reg)) = (gicd_igroupr[reg] &= ~mask); else - GICREG(0, GICD_IGROUPR(reg)) |= mask; + GICREG(0, GICD_IGROUPR(reg)) = (gicd_igroupr[reg] |= mask); LTRACEF("irq %d, secure %d, GICD_IGROUP%d = %x\n", irq, secure, reg, GICREG(0, GICD_IGROUPR(reg))); #endif @@ -216,8 +285,8 @@ static status_t arm_gic_set_target_locked(u_int irq, u_int cpu_mask, u_int enabl enable_mask = (enable_mask << shift) & cpu_mask; old_val = GICREG(0, GICD_ITARGETSR(reg)); - new_val = (old_val & ~cpu_mask) | enable_mask; - GICREG(0, GICD_ITARGETSR(reg)) = new_val; + new_val = (gicd_itargetsr[reg] & ~cpu_mask) | enable_mask; + GICREG(0, GICD_ITARGETSR(reg)) = gicd_itargetsr[reg] = new_val; LTRACEF("irq %i, GICD_ITARGETSR%d %x => %x (got %x)\n", irq, reg, old_val, new_val, GICREG(0, GICD_ITARGETSR(reg))); @@ -292,7 +361,8 @@ static enum handler_return __platform_irq(struct arm_iframe *frame) { // get the current vector - unsigned int vector = GICREG(0, GICC_IAR) & 0x3ff; + uint32_t iar = GICREG(0, GICC_IAR); + unsigned int vector = iar & 0x3ff; if (vector >= 0x3fe) { // spurious @@ -302,18 +372,22 @@ enum handler_return __platform_irq(struct arm_iframe *frame) THREAD_STATS_INC(interrupts); KEVLOG_IRQ_ENTER(vector); -// printf("platform_irq: spsr 0x%x, pc 0x%x, currthread %p, vector %d\n", frame->spsr, frame->pc, current_thread, vector); + uint cpu = arch_curr_cpu_num(); + +// printf("platform_irq: iar 0x%x cpu %u spsr 0x%x, pc 0x%x, currthread %p, vector %d\n", +// iar, cpu, frame->spsr, frame->pc, get_current_thread(), vector); // deliver the interrupt enum handler_return ret; ret = INT_NO_RESCHEDULE; - if (int_handler_table[vector].handler) - ret = int_handler_table[vector].handler(int_handler_table[vector].arg); + struct int_handler_struct *handler = get_int_handler(vector, cpu); + if (handler->handler) + ret = handler->handler(handler->arg); - GICREG(0, GICC_EOIR) = vector; + GICREG(0, GICC_EOIR) = iar; -// printf("platform_irq: exit %d\n", ret); +// printf("platform_irq: cpu %u exit %d\n", cpu, ret); KEVLOG_IRQ_EXIT(vector); @@ -325,9 +399,11 @@ enum handler_return platform_irq(struct arm_iframe *frame) #if WITH_LIB_SM uint32_t ahppir = GICREG(0, GICC_AHPPIR); uint32_t pending_irq = ahppir & 0x3ff; + struct int_handler_struct *h; + uint cpu = arch_curr_cpu_num(); LTRACEF("ahppir %d\n", ahppir); - if (pending_irq < MAX_INT && int_handler_table[pending_irq].handler) { + if (pending_irq < MAX_INT && get_int_handler(pending_irq, cpu)->handler) { enum handler_return ret = 0; uint32_t irq; uint8_t old_priority; @@ -348,8 +424,8 @@ enum handler_return platform_irq(struct arm_iframe *frame) spin_unlock_restore(&gicd_lock, state, GICD_LOCK_FLAGS); LTRACEF("irq %d\n", irq); - if (irq < MAX_INT && int_handler_table[irq].handler) - ret = int_handler_table[irq].handler(int_handler_table[irq].arg); + if (irq < MAX_INT && (h = get_int_handler(pending_irq, cpu))->handler) + ret = h->handler(h->arg); else TRACEF("unexpected irq %d != %d may get lost\n", irq, pending_irq); GICREG(0, GICC_AEOIR) = irq; @@ -364,7 +440,7 @@ enum 
handler_return platform_irq(struct arm_iframe *frame) void platform_fiq(struct arm_iframe *frame) { #if WITH_LIB_SM - sm_handle_irq(); + sm_handle_fiq(); #else PANIC_UNIMPLEMENTED; #endif @@ -375,12 +451,13 @@ static status_t arm_gic_get_next_irq_locked(u_int min_irq, bool per_cpu) { u_int irq; u_int max_irq = per_cpu ? GIC_MAX_PER_CPU_INT : MAX_INT; + uint cpu = arch_curr_cpu_num(); if (!per_cpu && min_irq < GIC_MAX_PER_CPU_INT) min_irq = GIC_MAX_PER_CPU_INT; for (irq = min_irq; irq < max_irq; irq++) - if (int_handler_table[irq].handler) + if (get_int_handler(irq, cpu)->handler) return irq; return SM_ERR_END_OF_INPUT; @@ -395,12 +472,27 @@ long smc_intc_get_next_irq(smc32_args_t *args) arm_gic_non_secure_interrupts_frozen = true; ret = arm_gic_get_next_irq_locked(args->params[0], args->params[1]); + LTRACEF("min_irq %d, per_cpu %d, ret %d\n", + args->params[0], args->params[1], ret); spin_unlock_restore(&gicd_lock, state, GICD_LOCK_FLAGS); return ret; } +static u_long enabled_fiq_mask[BITMAP_NUM_WORDS(MAX_INT)]; + +static void bitmap_update_locked(u_long *bitmap, u_int bit, bool set) +{ + u_long mask = 1UL << BITMAP_BIT_IN_WORD(bit); + + bitmap += BITMAP_WORD(bit); + if (set) + *bitmap |= mask; + else + *bitmap &= ~mask; +} + long smc_intc_request_fiq(smc32_args_t *args) { u_int fiq = args->params[0]; @@ -415,6 +507,7 @@ long smc_intc_request_fiq(smc32_args_t *args) arm_gic_set_priority_locked(fiq, 0); gic_set_enable(fiq, enable); + bitmap_update_locked(enabled_fiq_mask, fiq, enable); dprintf(SPEW, "%s: fiq %d, enable %d done\n", __func__, fiq, enable); @@ -423,21 +516,52 @@ long smc_intc_request_fiq(smc32_args_t *args) return NO_ERROR; } -static uint32_t read_mpidr(void) +static u_int current_fiq[8] = { 0x3ff, 0x3ff, 0x3ff, 0x3ff, 0x3ff, 0x3ff, 0x3ff, 0x3ff }; + +static bool update_fiq_targets(u_int cpu, bool enable, u_int triggered_fiq, bool resume_gicd) { - int mpidr; - __asm__ volatile("mrc p15, 0, %0, c0, c0, 5" - : "=r" (mpidr) - ); - return mpidr; + u_int i, j; + u_long mask; + u_int fiq; + bool smp = arm_gic_max_cpu() > 0; + bool ret = false; + + spin_lock(&gicd_lock); /* IRQs and FIQs are already masked */ + for (i = 0; i < BITMAP_NUM_WORDS(MAX_INT); i++) { + mask = enabled_fiq_mask[i]; + while (mask) { + j = _ffz(~mask); + mask &= ~(1UL << j); + fiq = i * BITMAP_BITS_PER_WORD + j; + if (fiq == triggered_fiq) + ret = true; + LTRACEF("cpu %d, irq %i, enable %d\n", cpu, fiq, enable); + if (smp) + arm_gic_set_target_locked(fiq, 1U << cpu, enable ? 
~0 : 0); + if (!smp || resume_gicd) + gic_set_enable(fiq, enable); + } + } + spin_unlock(&gicd_lock); + return ret; } -static u_int current_fiq[8] = { 0x3ff, 0x3ff, 0x3ff, 0x3ff, 0x3ff, 0x3ff, 0x3ff, 0x3ff }; +static void suspend_resume_fiq(bool resume_gicc, bool resume_gicd) +{ + u_int cpu = arch_curr_cpu_num(); + + ASSERT(cpu < 8); + + update_fiq_targets(cpu, resume_gicc, ~0, resume_gicd); +} status_t sm_intc_fiq_enter(void) { - u_int cpu = read_mpidr() & 7; + u_int cpu = arch_curr_cpu_num(); u_int irq = GICREG(0, GICC_IAR) & 0x3ff; + bool fiq_enabled; + + ASSERT(cpu < 8); LTRACEF("cpu %d, irq %i\n", cpu, irq); @@ -446,20 +570,19 @@ status_t sm_intc_fiq_enter(void) return ERR_NO_MSG; } - if (arm_gic_max_cpu() > 0) { - spin_lock(&gicd_lock); /* IRQs and FIQs are already masked */ - arm_gic_set_target_locked(irq, 1U << cpu, 0); - spin_unlock(&gicd_lock); - } else { - /* target register has no effect on uniprocessor systems */ - gic_set_enable(irq, 0); - } + fiq_enabled = update_fiq_targets(cpu, false, irq, false); GICREG(0, GICC_EOIR) = irq; if (current_fiq[cpu] != 0x3ff) { dprintf(INFO, "more than one fiq active: cpu %d, old %d, new %d\n", cpu, current_fiq[cpu], irq); return ERR_ALREADY_STARTED; } + + if (!fiq_enabled) { + dprintf(INFO, "got disabled fiq: cpu %d, new %d\n", cpu, irq); + return ERR_NOT_READY; + } + current_fiq[cpu] = irq; return 0; @@ -467,19 +590,16 @@ status_t sm_intc_fiq_enter(void) void sm_intc_fiq_exit(void) { - u_int cpu = read_mpidr() & 7; + u_int cpu = arch_curr_cpu_num(); + + ASSERT(cpu < 8); + LTRACEF("cpu %d, irq %i\n", cpu, current_fiq[cpu]); if (current_fiq[cpu] == 0x3ff) { dprintf(INFO, "%s: no fiq active, cpu %d\n", __func__, cpu); return; } - if (arm_gic_max_cpu() > 0) { - spin_lock(&gicd_lock); /* IRQs and FIQs are already masked */ - arm_gic_set_target_locked(current_fiq[cpu], 1U << cpu, ~0); - spin_unlock(&gicd_lock); - } else { - gic_set_enable(current_fiq[cpu], 1); - } + update_fiq_targets(cpu, true, current_fiq[cpu], false); current_fiq[cpu] = 0x3ff; } #endif diff --git a/dev/interrupt/arm_gic/include/dev/interrupt/arm_gic.h b/dev/interrupt/arm_gic/include/dev/interrupt/arm_gic.h index 7b38e4b6..ee0fe614 100644 --- a/dev/interrupt/arm_gic/include/dev/interrupt/arm_gic.h +++ b/dev/interrupt/arm_gic/include/dev/interrupt/arm_gic.h @@ -26,7 +26,6 @@ #include void arm_gic_init(void); -void arm_gic_init_secondary_cpu(void); enum { /* Ignore cpu_mask and forward interrupt to all CPUs other than the current cpu */ diff --git a/dev/timer/arm_cortex_a9/arm_cortex_a9_timer.c b/dev/timer/arm_cortex_a9/arm_cortex_a9_timer.c index b7491560..ce7077b9 100644 --- a/dev/timer/arm_cortex_a9/arm_cortex_a9_timer.c +++ b/dev/timer/arm_cortex_a9/arm_cortex_a9_timer.c @@ -30,9 +30,11 @@ #include #include #include +#include #include #include #include +#include /* driver for cortex-a9's private timer */ #define LOCAL_TRACE 0 @@ -69,6 +71,7 @@ static platform_timer_callback t_callback; static addr_t scu_control_base; +static spin_lock_t lock = SPIN_LOCK_INITIAL_VALUE; static lk_time_t periodic_interval; static lk_time_t oneshot_interval; @@ -77,6 +80,8 @@ static struct fp_32_64 timer_freq_msec_conversion; static struct fp_32_64 timer_freq_usec_conversion_inverse; static struct fp_32_64 timer_freq_msec_conversion_inverse; +static void arm_cortex_a9_timer_init_percpu(uint level); + uint64_t get_global_val(void) { uint32_t lo, hi; @@ -118,7 +123,8 @@ status_t platform_set_periodic_timer(platform_timer_callback callback, void *arg if (unlikely(ticks > 0xffffffff)) ticks = 
0xffffffff; - enter_critical_section(); + spin_lock_saved_state_t state; + spin_lock_irqsave(&lock, state); t_callback = callback; @@ -130,7 +136,7 @@ status_t platform_set_periodic_timer(platform_timer_callback callback, void *arg TIMREG(TIMER_LOAD) = ticks; TIMREG(TIMER_CONTROL) = (1<<2) | (1<<1) | (1<<0); // irq enable, autoreload, enable - exit_critical_section(); + spin_unlock_irqrestore(&lock, state); return NO_ERROR; } @@ -145,7 +151,8 @@ status_t platform_set_oneshot_timer (platform_timer_callback callback, void *arg if (unlikely(ticks > 0xffffffff)) ticks = 0xffffffff; - enter_critical_section(); + spin_lock_saved_state_t state; + spin_lock_irqsave(&lock, state); t_callback = callback; oneshot_interval = interval; @@ -156,7 +163,7 @@ status_t platform_set_oneshot_timer (platform_timer_callback callback, void *arg TIMREG(TIMER_LOAD) = ticks; TIMREG(TIMER_CONTROL) = (1<<2) | (1<<0) | (1<<0); // irq enable, oneshot, enable - exit_critical_section(); + spin_unlock_irqrestore(&lock, state); return NO_ERROR; } @@ -185,6 +192,19 @@ void arm_cortex_a9_timer_init(addr_t _scu_control_base, uint32_t freq) { scu_control_base = _scu_control_base; + arm_cortex_a9_timer_init_percpu(0); + + /* save the timer frequency for later calculations */ + timer_freq = freq; + + /* precompute the conversion factor for global time to real time */ + fp_32_64_div_32_32(&timer_freq_msec_conversion, timer_freq, 1000); + fp_32_64_div_32_32(&timer_freq_usec_conversion_inverse, 1000000, timer_freq); + fp_32_64_div_32_32(&timer_freq_msec_conversion_inverse, 1000, timer_freq); +} + +static void arm_cortex_a9_timer_init_percpu(uint level) +{ /* disable timer */ TIMREG(TIMER_CONTROL) = 0; @@ -194,16 +214,14 @@ void arm_cortex_a9_timer_init(addr_t _scu_control_base, uint32_t freq) /* ack any irqs that may be pending */ TIMREG(TIMER_ISR) = 1; - /* save the timer frequency for later calculations */ - timer_freq = freq; - - /* precompute the conversion factor for global time to real time */ - fp_32_64_div_32_32(&timer_freq_msec_conversion, timer_freq, 1000); - fp_32_64_div_32_32(&timer_freq_usec_conversion_inverse, 1000000, timer_freq); - fp_32_64_div_32_32(&timer_freq_msec_conversion_inverse, 1000, timer_freq); - + /* register the platform tick on each cpu */ register_int_handler(CPU_PRIV_TIMER_INT, &platform_tick, NULL); unmask_interrupt(CPU_PRIV_TIMER_INT); } +/* secondary cpu initialize the timer just before the kernel starts with interrupts enabled */ +LK_INIT_HOOK_FLAGS(arm_cortex_a9_timer_init_percpu, + arm_cortex_a9_timer_init_percpu, + LK_INIT_LEVEL_THREADING - 1, LK_INIT_FLAG_SECONDARY_CPUS); + /* vim: set ts=4 sw=4 expandtab: */ diff --git a/dev/timer/arm_generic/arm_generic_timer.c b/dev/timer/arm_generic/arm_generic_timer.c index 83d5542c..6193be3b 100644 --- a/dev/timer/arm_generic/arm_generic_timer.c +++ b/dev/timer/arm_generic/arm_generic_timer.c @@ -21,7 +21,9 @@ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
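To make the precomputed conversion factors above concrete, take an illustrative private-timer frequency of 24 MHz (the real value is platform-defined, a fraction of the CPU clock); they work out to roughly:

    timer_freq_msec_conversion          = 24000000 / 1000     = 24000 ticks per ms
    timer_freq_usec_conversion_inverse  = 1000000  / 24000000 ~ 0.0417 us per tick
    timer_freq_msec_conversion_inverse  = 1000     / 24000000 ~ 0.0000417 ms per tick
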
*/ +#include #include +#include #include #include #include @@ -33,7 +35,98 @@ #include +#if ARCH_ARM64 + +/* CNTFRQ AArch64 register */ +#define TIMER_REG_CNTFRQ cntfrq_el0 + +/* CNTP AArch64 registers */ +#define TIMER_REG_CNTP_CTL cntp_ctl_el0 +#define TIMER_REG_CNTP_CVAL cntp_cval_el0 +#define TIMER_REG_CNTP_TVAL cntp_tval_el0 +#define TIMER_REG_CNTPCT cntpct_el0 + +/* CNTPS AArch64 registers */ +#define TIMER_REG_CNTPS_CTL cntps_ctl_el1 +#define TIMER_REG_CNTPS_CVAL cntps_cval_el1 +#define TIMER_REG_CNTPS_TVAL cntps_tval_el1 +#define TIMER_REG_CNTPSCT cntpct_el0 + +/* CNTV AArch64 registers */ +#define TIMER_REG_CNTV_CTL cntv_ctl_el0 +#define TIMER_REG_CNTV_CVAL cntv_cval_el0 +#define TIMER_REG_CNTV_TVAL cntv_tval_el0 +#define TIMER_REG_CNTVCT cntvct_el0 + +#define READ_TIMER_REG32(reg) ARM64_READ_SYSREG(reg) +#define READ_TIMER_REG64(reg) ARM64_READ_SYSREG(reg) +#define WRITE_TIMER_REG32(reg, val) ARM64_WRITE_SYSREG(reg, val) +#define WRITE_TIMER_REG64(reg, val) ARM64_WRITE_SYSREG(reg, val) + +#else + +/* CNTFRQ AArch32 register */ +#define TIMER_REG_CNTFRQ "c0, 0" + +/* CNTP AArch32 registers */ +#define TIMER_REG_CNTP_CTL "c2, 1" +#define TIMER_REG_CNTP_CVAL "2" +#define TIMER_REG_CNTP_TVAL "c2, 0" +#define TIMER_REG_CNTPCT "0" + +/* CNTPS AArch32 registers are banked and accessed though CNTP */ +#define CNTPS CNTP + +/* CNTV AArch32 registers */ +#define TIMER_REG_CNTV_CTL "c3, 1" +#define TIMER_REG_CNTV_CVAL "3" +#define TIMER_REG_CNTV_TVAL "c3, 0" +#define TIMER_REG_CNTVCT "1" + +#define READ_TIMER_REG32(reg) \ +({ \ + uint32_t _val; \ + __asm__ volatile("mrc p15, 0, %0, c14, " reg : "=r" (_val)); \ + _val; \ +}) + +#define READ_TIMER_REG64(reg) \ +({ \ + uint64_t _val; \ + __asm__ volatile("mrrc p15, " reg ", %0, %H0, c14" : "=r" (_val)); \ + _val; \ +}) + +#define WRITE_TIMER_REG32(reg, val) \ +({ \ + __asm__ volatile("mcr p15, 0, %0, c14, " reg :: "r" (val)); \ + ISB; \ +}) + +#define WRITE_TIMER_REG64(reg, val) \ +({ \ + __asm__ volatile("mcrr p15, " reg ", %0, %H0, c14" :: "r" (val)); \ + ISB; \ +}) + +#endif + +#ifndef TIMER_ARM_GENERIC_SELECTED +#define TIMER_ARM_GENERIC_SELECTED CNTP +#endif + +#define COMBINE3(a,b,c) a ## b ## c +#define XCOMBINE3(a,b,c) COMBINE3(a, b, c) + +#define SELECTED_TIMER_REG(reg) XCOMBINE3(TIMER_REG_, TIMER_ARM_GENERIC_SELECTED, reg) +#define TIMER_REG_CTL SELECTED_TIMER_REG(_CTL) +#define TIMER_REG_CVAL SELECTED_TIMER_REG(_CVAL) +#define TIMER_REG_TVAL SELECTED_TIMER_REG(_TVAL) +#define TIMER_REG_CT SELECTED_TIMER_REG(CT) + + static platform_timer_callback t_callback; +static int timer_irq; struct fp_32_64 cntpct_per_ms; struct fp_32_64 ms_per_cntpct; @@ -58,7 +151,7 @@ static uint32_t read_cntfrq(void) { uint32_t cntfrq; - __asm__ volatile("mrc p15, 0, %0, c14, c0, 0" : "=r" (cntfrq)); + cntfrq = READ_TIMER_REG32(TIMER_REG_CNTFRQ); LTRACEF("cntfrq: 0x%08x, %u\n", cntfrq, cntfrq); return cntfrq; } @@ -67,33 +160,33 @@ static uint32_t read_cntp_ctl(void) { uint32_t cntp_ctl; - __asm__ volatile("mrc p15, 0, %0, c14, c2, 1" : "=r" (cntp_ctl)); + cntp_ctl = READ_TIMER_REG32(TIMER_REG_CTL); return cntp_ctl; } static void write_cntp_ctl(uint32_t cntp_ctl) { - LTRACEF_LEVEL(3, "cntp_ctl: 0x%x\n", cntp_ctl); - __asm__ volatile("mcr p15, 0, %0, c14, c2, 1" :: "r" (cntp_ctl)); + LTRACEF_LEVEL(3, "cntp_ctl: 0x%x %x\n", cntp_ctl, read_cntp_ctl()); + WRITE_TIMER_REG32(TIMER_REG_CTL, cntp_ctl); } static void write_cntp_cval(uint64_t cntp_cval) { LTRACEF_LEVEL(3, "cntp_cval: 0x%016llx, %llu\n", cntp_cval, cntp_cval); - __asm__ volatile("mcrr p15, 2, %0, %H0, 
c14" :: "r" (cntp_cval)); + WRITE_TIMER_REG64(TIMER_REG_CVAL, cntp_cval); } static void write_cntp_tval(int32_t cntp_tval) { LTRACEF_LEVEL(3, "cntp_tval: 0x%08x, %d\n", cntp_tval, cntp_tval); - __asm__ volatile("mcr p15, 0, %0, c14, c2, 0" :: "r" (cntp_tval)); + WRITE_TIMER_REG32(TIMER_REG_TVAL, cntp_tval); } static uint64_t read_cntpct(void) { uint64_t cntpct; - __asm__ volatile("mrrc p15, 0, %0, %H0, c14" : "=r" (cntpct)); + cntpct = READ_TIMER_REG64(TIMER_REG_CT); LTRACEF_LEVEL(3, "cntpct: 0x%016llx, %llu\n", cntpct, cntpct); return cntpct; } @@ -120,6 +213,7 @@ status_t platform_set_oneshot_timer(platform_timer_callback callback, void *arg, else write_cntp_cval(read_cntpct() + cntpct_interval); write_cntp_ctl(1); + return 0; } @@ -138,10 +232,6 @@ lk_time_t current_time(void) return cntpct_to_lk_time(read_cntpct()); } -void arm_generic_timer_init_secondary_cpu(void) -{ -} - static uint32_t abs_int32(int32_t a) { return (a > 0) ? a : -a; @@ -231,13 +321,19 @@ static void arm_generic_timer_init_conversion_factors(uint32_t cntfrq) LTRACEF("us_per_cntpct: %08x.%08x%08x\n", us_per_cntpct.l0, us_per_cntpct.l32, us_per_cntpct.l64); } -void arm_generic_timer_init(int irq) +void arm_generic_timer_init(int irq, uint32_t freq_override) { - uint32_t cntfrq = read_cntfrq(); + uint32_t cntfrq; - if (!cntfrq) { - TRACEF("Failed to initialize timer, frequency is 0\n"); - return; + if (freq_override == 0) { + cntfrq = read_cntfrq(); + + if (!cntfrq) { + TRACEF("Failed to initialize timer, frequency is 0\n"); + return; + } + } else { + cntfrq = freq_override; } #if LOCAL_TRACE @@ -252,7 +348,33 @@ void arm_generic_timer_init(int irq) arm_generic_timer_init_conversion_factors(cntfrq); test_time_conversions(cntfrq); + LTRACEF("register irq %d on cpu %d\n", irq, arch_curr_cpu_num()); register_int_handler(irq, &platform_tick, NULL); unmask_interrupt(irq); + + timer_irq = irq; } +static void arm_generic_timer_init_secondary_cpu(uint level) +{ + LTRACEF("register irq %d on cpu %d\n", timer_irq, arch_curr_cpu_num()); + register_int_handler(timer_irq, &platform_tick, NULL); + unmask_interrupt(timer_irq); +} + +/* secondary cpu initialize the timer just before the kernel starts with interrupts enabled */ +LK_INIT_HOOK_FLAGS(arm_generic_timer_init_secondary_cpu, + arm_generic_timer_init_secondary_cpu, + LK_INIT_LEVEL_THREADING - 1, LK_INIT_FLAG_SECONDARY_CPUS); + +static void arm_generic_timer_resume_cpu(uint level) +{ + /* Always trigger a timer interrupt on each cpu for now */ + write_cntp_tval(0); + write_cntp_ctl(1); +} + +LK_INIT_HOOK_FLAGS(arm_generic_timer_resume_cpu, arm_generic_timer_resume_cpu, + LK_INIT_LEVEL_PLATFORM, LK_INIT_FLAG_CPU_RESUME); + +/* vim: set noexpandtab: */ diff --git a/dev/timer/arm_generic/include/dev/timer/arm_generic.h b/dev/timer/arm_generic/include/dev/timer/arm_generic.h index f7f95ced..368d43c3 100644 --- a/dev/timer/arm_generic/include/dev/timer/arm_generic.h +++ b/dev/timer/arm_generic/include/dev/timer/arm_generic.h @@ -25,8 +25,8 @@ #include -void arm_generic_timer_init(int irq); -void arm_generic_timer_init_secondary_cpu(void); +/* if freq_override != 0, use that as the operating frequency instead of CNTFRQ register */ +void arm_generic_timer_init(int irq, uint32_t freq_override); #endif diff --git a/engine.mk b/engine.mk index 690a906e..a511eb19 100644 --- a/engine.mk +++ b/engine.mk @@ -49,7 +49,7 @@ OUTBIN := $(BUILDDIR)/lk.bin OUTELF := $(BUILDDIR)/lk.elf CONFIGHEADER := $(BUILDDIR)/config.h -GLOBAL_INCLUDES := $(BUILDDIR) $(LKROOT)/include $(addsuffix 
/include,$(LKINC)) +GLOBAL_INCLUDES := $(BUILDDIR) $(addsuffix /include,$(LKINC)) GLOBAL_OPTFLAGS ?= $(ARCH_OPTFLAGS) GLOBAL_COMPILEFLAGS := -g -fno-builtin -finline -include $(CONFIGHEADER) GLOBAL_COMPILEFLAGS += -W -Wall -Wno-multichar -Wno-unused-parameter -Wno-unused-function -Wno-unused-label @@ -60,7 +60,13 @@ GLOBAL_CPPFLAGS := -fno-exceptions -fno-rtti -fno-threadsafe-statics GLOBAL_ASMFLAGS := -DASSEMBLY GLOBAL_LDFLAGS := -GLOBAL_LDFLAGS += -L $(LKROOT) +GLOBAL_LDFLAGS += $(addprefix -L,$(LKINC)) + +# Architecture specific compile flags +ARCH_COMPILEFLAGS := +ARCH_CFLAGS := +ARCH_CPPFLAGS := +ARCH_ASMFLAGS := # top level rule all:: $(OUTBIN) $(OUTELF).lst $(OUTELF).debug.lst $(OUTELF).sym $(OUTELF).sym.sorted $(OUTELF).size @@ -93,6 +99,9 @@ ALLMODULES := # add any external module dependencies MODULES := $(EXTERNAL_MODULES) +# any .mk specified here will be included before build.mk +EXTRA_BUILDRULES := + # any rules you put here will also be built by the system before considered being complete EXTRA_BUILDDEPS := @@ -179,6 +188,7 @@ OBJCOPY := $(TOOLCHAIN_PREFIX)objcopy CPPFILT := $(TOOLCHAIN_PREFIX)c++filt SIZE := $(TOOLCHAIN_PREFIX)size NM := $(TOOLCHAIN_PREFIX)nm +STRIP := $(TOOLCHAIN_PREFIX)strip # try to have the compiler output colorized error messages if available export GCC_COLORS ?= 1 diff --git a/include/alloca.h b/include/alloca.h new file mode 100644 index 00000000..ce617fed --- /dev/null +++ b/include/alloca.h @@ -0,0 +1,6 @@ +#if !defined(__ALLOCA_H) +#define __ALLOCA_H + +#define alloca(size) __builtin_alloca (size) + +#endif /* !__ALLOCA_H */ diff --git a/include/arch/mmu.h b/include/arch/mmu.h index 3180436f..dc6386b3 100644 --- a/include/arch/mmu.h +++ b/include/arch/mmu.h @@ -37,10 +37,14 @@ __BEGIN_CDECLS #define ARCH_MMU_FLAG_PERM_RO (1<<3) #define ARCH_MMU_FLAG_PERM_NO_EXECUTE (1<<4) #define ARCH_MMU_FLAG_NS (1<<5) /* NON-SECURE */ +#define ARCH_MMU_FLAG_INVALID (1<<7) /* indicates that flags are not specified */ int arch_mmu_map(vaddr_t vaddr, paddr_t paddr, uint count, uint flags); int arch_mmu_unmap(vaddr_t vaddr, uint count); status_t arch_mmu_query(vaddr_t vaddr, paddr_t *paddr, uint *flags); +vaddr_t arch_mmu_pick_spot(vaddr_t base, uint prev_region_arch_mmu_flags, + vaddr_t end, uint next_region_arch_mmu_flags, + vaddr_t align, size_t size, uint arch_mmu_flags); void arch_disable_mmu(void); diff --git a/include/arch/mp.h b/include/arch/mp.h new file mode 100644 index 00000000..2c3b2c56 --- /dev/null +++ b/include/arch/mp.h @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2014 Travis Geiselbrecht + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files + * (the "Software"), to deal in the Software without restriction, + * including without limitation the rights to use, copy, modify, merge, + * publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, + * subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
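arch_mmu_pick_spot, declared above, gives the architecture a say in where the VM places a region relative to its neighbours' mapping flags (useful when cached and uncached mappings must not share a large translation entry). For an architecture with no such constraint, a minimal sketch would just honour the requested alignment (ROUNDUP assumed from the usual lk macros):

    vaddr_t arch_mmu_pick_spot(vaddr_t base, uint prev_region_arch_mmu_flags,
                               vaddr_t end, uint next_region_arch_mmu_flags,
                               vaddr_t align, size_t size, uint arch_mmu_flags)
    {
        /* no architectural placement rules: just align up within [base, end) */
        return ROUNDUP(base, align);
    }
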
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ +#pragma once + +#include +#include + +/* send inter processor interrupt, if supported */ +status_t arch_mp_send_ipi(mp_cpu_mask_t target, mp_ipi_t ipi); + +void arch_mp_init_percpu(void); diff --git a/include/arch/ops.h b/include/arch/ops.h index d0e4f2ee..f0977519 100644 --- a/include/arch/ops.h +++ b/include/arch/ops.h @@ -45,6 +45,11 @@ static int atomic_or(volatile int *ptr, int val); static uint32_t arch_cycle_count(void); +static uint arch_curr_cpu_num(void); + +/* Use to align structures on cache lines to avoid cpu aliasing. */ +#define __CPU_ALIGN __ALIGNED(CACHE_LINE) + #endif // !ASSEMBLY #define ICACHE 1 #define DCACHE 2 diff --git a/include/bits.h b/include/bits.h index b430b7b5..a2ea571a 100644 --- a/include/bits.h +++ b/include/bits.h @@ -28,30 +28,34 @@ #define clz(x) __builtin_clz(x) #define ctz(x) __builtin_ctz(x) -#define BIT(x, bit) ((x) & (1 << (bit))) +#define BIT(x, bit) ((x) & (1UL << (bit))) #define BIT_SHIFT(x, bit) (((x) >> (bit)) & 1) -#define BITS(x, high, low) ((x) & (((1<<((high)+1))-1) & ~((1<<(low))-1))) -#define BITS_SHIFT(x, high, low) (((x) >> (low)) & ((1<<((high)-(low)+1))-1)) -#define BIT_SET(x, bit) (((x) & (1 << (bit))) ? 1 : 0) +#define BITS(x, high, low) ((x) & (((1UL<<((high)+1))-1) & ~((1UL<<(low))-1))) +#define BITS_SHIFT(x, high, low) (((x) >> (low)) & ((1UL<<((high)-(low)+1))-1)) +#define BIT_SET(x, bit) (((x) & (1UL << (bit))) ? 1 : 0) #define BITMAP_BITS_PER_WORD (sizeof(unsigned long) * 8) #define BITMAP_NUM_WORDS(x) (((x) + BITMAP_BITS_PER_WORD - 1) / BITMAP_BITS_PER_WORD) #define BITMAP_WORD(x) ((x) / BITMAP_BITS_PER_WORD) #define BITMAP_BIT_IN_WORD(x) ((x) & (BITMAP_BITS_PER_WORD - 1)) +#define BITMAP_BITS_PER_INT (sizeof(unsigned int) * 8) +#define BITMAP_BIT_IN_INT(x) ((x) & (BITMAP_BITS_PER_INT - 1)) +#define BITMAP_INT(x) ((x) / BITMAP_BITS_PER_INT) + #define BIT_MASK(x) (((x) >= sizeof(unsigned long) * 8) ? (0UL-1) : ((1UL << (x)) - 1)) static inline int bitmap_set(unsigned long *bitmap, int bit) { - unsigned long mask = 1 << BITMAP_BIT_IN_WORD(bit); - return atomic_or((int*)&bitmap[BITMAP_WORD(bit)], mask) & mask ? 1 : 0; + unsigned long mask = 1 << BITMAP_BIT_IN_INT(bit); + return atomic_or(&((int*)bitmap)[BITMAP_INT(bit)], mask) & mask ? 1 : 0; } static inline int bitmap_clear(unsigned long *bitmap, int bit) { - unsigned long mask = 1 << BITMAP_BIT_IN_WORD(bit); + unsigned long mask = 1 << BITMAP_BIT_IN_INT(bit); - return atomic_and((int*)&bitmap[BITMAP_WORD(bit)], ~mask) & mask ? 1:0; + return atomic_and(&((int*)bitmap)[BITMAP_INT(bit)], ~mask) & mask ? 
1:0; } static inline int bitmap_test(unsigned long *bitmap, int bit) diff --git a/include/debug.h b/include/debug.h index e347fe5a..27b9b909 100644 --- a/include/debug.h +++ b/include/debug.h @@ -27,8 +27,7 @@ #include #include #include - -__BEGIN_CDECLS +#include #if !defined(LK_DEBUGLEVEL) #define LK_DEBUGLEVEL 0 @@ -40,11 +39,20 @@ __BEGIN_CDECLS #define INFO 1 #define SPEW 2 +typedef struct __print_callback print_callback_t; +struct __print_callback { + struct list_node entry; + void (*print)(print_callback_t *cb, const char *str, size_t len); +}; + +__BEGIN_CDECLS + #if !DISABLE_DEBUG_OUTPUT /* input/output */ -#define _dputc(c) platform_dputc(c) +void _dputc(char c); int _dputs(const char *str); +int _dwrite(const char *ptr, size_t len); int _dprintf(const char *fmt, ...) __PRINTFLIKE(1, 2); int _dvprintf(const char *fmt, va_list ap); @@ -57,6 +65,7 @@ void hexdump8(const void *ptr, size_t len); /* input/output */ static inline void _dputc(char c) { } static inline int _dputs(const char *str) { return 0; } +static inline int _dwrite(const char *ptr, size_t len) { return 0; } static inline int __PRINTFLIKE(1, 2) _dprintf(const char *fmt, ...) { return 0; } static inline int _dvprintf(const char *fmt, va_list ap) { return 0; } @@ -66,8 +75,13 @@ static inline void hexdump8(const void *ptr, size_t len) { } #endif /* DISABLE_DEBUG_OUTPUT */ +/* register callback to receive debug prints */ +void register_print_callback(print_callback_t *cb); +void unregister_print_callback(print_callback_t *cb); + #define dputc(level, str) do { if ((level) <= LK_DEBUGLEVEL) { _dputc(str); } } while (0) #define dputs(level, str) do { if ((level) <= LK_DEBUGLEVEL) { _dputs(str); } } while (0) +#define dwrite(level, ptr, len) do { if ((level) <= LK_DEBUGLEVEL) { _dwrite(ptr, len); } } while(0) #define dprintf(level, x...) do { if ((level) <= LK_DEBUGLEVEL) { _dprintf(x); } } while (0) #define dvprintf(level, x...) 
do { if ((level) <= LK_DEBUGLEVEL) { _dvprintf(x); } } while (0) diff --git a/include/endian.h b/include/endian.h index bff71dbf..90da5eab 100644 --- a/include/endian.h +++ b/include/endian.h @@ -69,6 +69,10 @@ #define ntohl(n) BE32(n) #define htonl(h) BE32(h) +/* 64-bit network byte swap stuff */ +#define htobe64(h) BE64(h) +#define be64toh(b) BE64(b) + // some memory access macros #if __POWERPC__ #include diff --git a/include/err.h b/include/err.h index a24459b2..f315dd39 100644 --- a/include/err.h +++ b/include/err.h @@ -65,6 +65,10 @@ #define ERR_OUT_OF_RANGE (-37) #define ERR_NOT_CONFIGURED (-38) #define ERR_NOT_MOUNTED (-39) +#define ERR_FAULT (-40) +#define ERR_NO_RESOURCES (-41) +#define ERR_BAD_HANDLE (-42) +#define ERR_ACCESS_DENIED (-43) #define ERR_USER_BASE (-16384) diff --git a/include/inttypes.h b/include/inttypes.h index f6681e38..4e542d4a 100644 --- a/include/inttypes.h +++ b/include/inttypes.h @@ -23,6 +23,9 @@ #ifndef __INTTYPES_H #define __INTTYPES_H +#define PRIu32 "u" +#define PRIx32 "x" + #include #endif diff --git a/include/kernel/mp.h b/include/kernel/mp.h new file mode 100644 index 00000000..90df39e1 --- /dev/null +++ b/include/kernel/mp.h @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2014 Travis Geiselbrecht + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files + * (the "Software"), to deal in the Software without restriction, + * including without limitation the rights to use, copy, modify, merge, + * publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, + * subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ +#pragma once + +#include +#include +#include +#include + +void mp_init(void); + +typedef uint32_t mp_cpu_mask_t; + +#define MP_CPU_ALL_BUT_LOCAL (UINT32_MAX) + +/* by default, mp_mbx_reschedule does not signal to cpus that are running realtime + * threads. Override this behavior. 
+ */ +#define MP_RESCHEDULE_FLAG_REALTIME (0x1) + +void mp_reschedule(mp_cpu_mask_t target, uint flags); +void mp_set_curr_cpu_active(bool active); + +typedef enum { + MP_IPI_GENERIC, + MP_IPI_RESCHEDULE, +} mp_ipi_t; + +/* called from arch code during reschedule irq */ +enum handler_return mp_mbx_reschedule_irq(void); + +/* global mp state to track what the cpus are up to */ +struct mp_state { + volatile mp_cpu_mask_t active_cpus; + + /* only safely accessible with thread lock held */ + mp_cpu_mask_t idle_cpus; + mp_cpu_mask_t realtime_cpus; +}; + +extern struct mp_state mp; + +/* must be called with the thread lock held */ +static inline void mp_set_cpu_idle(uint cpu) +{ + mp.idle_cpus |= 1UL << cpu; +} + +static inline void mp_set_cpu_busy(uint cpu) +{ + mp.idle_cpus &= ~(1UL << cpu); +} + +static inline mp_cpu_mask_t mp_get_idle_mask(void) +{ + return mp.idle_cpus; +} + +static inline void mp_set_cpu_realtime(uint cpu) +{ + mp.realtime_cpus |= 1UL << cpu; +} + +static inline void mp_set_cpu_non_realtime(uint cpu) +{ + mp.realtime_cpus &= ~(1UL << cpu); +} + +static inline mp_cpu_mask_t mp_get_realtime_mask(void) +{ + return mp.realtime_cpus; +} + + diff --git a/include/kernel/spinlock.h b/include/kernel/spinlock.h new file mode 100644 index 00000000..b9bf246f --- /dev/null +++ b/include/kernel/spinlock.h @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2014 Travis Geiselbrecht + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files + * (the "Software"), to deal in the Software without restriction, + * including without limitation the rights to use, copy, modify, merge, + * publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, + * subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ +#pragma once + +#include +#include + +__BEGIN_CDECLS + +/* interrupts should already be disabled */ +static inline void spin_lock(spin_lock_t *lock) +{ + arch_spin_lock(lock); +} + + /* Returns 0 on success, non-0 on failure */ +static inline int spin_trylock(spin_lock_t *lock) +{ + return arch_spin_trylock(lock); +} + +/* interrupts should already be disabled */ +static inline void spin_unlock(spin_lock_t *lock) +{ + arch_spin_unlock(lock); +} + +static inline void spin_lock_init(spin_lock_t *lock) +{ + arch_spin_lock_init(lock); +} + +static inline bool spin_lock_held(spin_lock_t *lock) +{ + return arch_spin_lock_held(lock); +} + +/* spin lock irq save flags: */ + +/* Possible future flags: + * SPIN_LOCK_FLAG_PMR_MASK = 0x000000ff + * SPIN_LOCK_FLAG_PREEMPTION = 0x00000100 + * SPIN_LOCK_FLAG_SET_PMR = 0x00000200 + */ + +/* Generic flags */ +#define SPIN_LOCK_FLAG_INTERRUPTS ARCH_DEFAULT_SPIN_LOCK_FLAG_INTERRUPTS + +/* same as spin lock, but save disable and save interrupt state first */ +static inline void spin_lock_save( + spin_lock_t *lock, + spin_lock_saved_state_t *statep, + spin_lock_save_flags_t flags) +{ + arch_interrupt_save(statep, flags); + spin_lock(lock); +} + +/* restore interrupt state before unlocking */ +static inline void spin_unlock_restore( + spin_lock_t *lock, + spin_lock_saved_state_t old_state, + spin_lock_save_flags_t flags) +{ + spin_unlock(lock); + arch_interrupt_restore(old_state, flags); +} + +/* hand(ier) routines */ +#define spin_lock_irqsave(lock, statep) spin_lock_save(lock, &(statep), SPIN_LOCK_FLAG_INTERRUPTS) +#define spin_unlock_irqrestore(lock, statep) spin_unlock_restore(lock, statep, SPIN_LOCK_FLAG_INTERRUPTS) + +__END_CDECLS diff --git a/include/kernel/thread.h b/include/kernel/thread.h index 4b981b23..e84f9af9 100644 --- a/include/kernel/thread.h +++ b/include/kernel/thread.h @@ -30,6 +30,7 @@ #include #include #include +#include #include enum thread_state { @@ -45,6 +46,9 @@ typedef int (*thread_start_routine)(void *arg); /* thread local storage */ enum thread_tls_list { +#ifdef WITH_LIB_UTHREAD + TLS_ENTRY_UTHREAD, +#endif MAX_TLS_ENTRY }; @@ -52,6 +56,7 @@ enum thread_tls_list { #define THREAD_FLAG_FREE_STACK 0x2 #define THREAD_FLAG_FREE_STRUCT 0x4 #define THREAD_FLAG_REAL_TIME 0x8 +#define THREAD_FLAG_IDLE 0x10 #define THREAD_MAGIC 'thrd' @@ -63,9 +68,10 @@ typedef struct thread { struct list_node queue_node; int priority; enum thread_state state; - int saved_critical_section_count; int remaining_quantum; unsigned int flags; + int curr_cpu; + int pinned_cpu; /* only run on pinned_cpu if >= 0 */ /* if blocked, a pointer to the wait queue */ struct wait_queue *blocking_wait_queue; @@ -113,6 +119,8 @@ typedef struct thread { void thread_init_early(void); void thread_init(void); void thread_become_idle(void) __NO_RETURN; +void thread_secondary_cpu_init_early(void); +void thread_secondary_cpu_entry(void) __NO_RETURN; void thread_set_name(const char *name); void thread_set_priority(int priority); thread_t *thread_create(const char *name, thread_start_routine entry, void *arg, int priority, size_t stack_size); @@ -126,6 +134,7 @@ status_t thread_detach_and_resume(thread_t *t); status_t thread_set_real_time(thread_t *t); void dump_thread(thread_t *t); +void arch_dump_thread(thread_t *t); void dump_all_threads(void); /* scheduler routines */ @@ -134,6 +143,10 @@ void thread_preempt(void); /* get preempted (inserted into head of run queue) */ void thread_block(void); /* block on something and reschedule */ void thread_unblock(thread_t *t, bool 
resched); /* go back in the run queue */ +#ifdef WITH_LIB_UTHREAD +void uthread_context_switch(thread_t *oldthread, thread_t *newthread); +#endif + /* called on every timer tick for the scheduler to do quantum expiration */ enum handler_return thread_timer_tick(void); @@ -141,36 +154,11 @@ enum handler_return thread_timer_tick(void); thread_t *get_current_thread(void); void set_current_thread(thread_t *); -/* critical sections */ -extern int critical_section_count; +/* scheduler lock */ +extern spin_lock_t thread_lock; -static inline __ALWAYS_INLINE void enter_critical_section(void) -{ - CF; - if (critical_section_count == 0) - arch_disable_ints(); - critical_section_count++; - CF; -} - -static inline __ALWAYS_INLINE void exit_critical_section(void) -{ - CF; - critical_section_count--; - if (critical_section_count == 0) - arch_enable_ints(); - CF; -} - -static inline __ALWAYS_INLINE bool in_critical_section(void) -{ - CF; - return critical_section_count > 0; -} - -/* only used by interrupt glue */ -static inline void inc_critical_section(void) { critical_section_count++; } -static inline void dec_critical_section(void) { critical_section_count--; } +#define THREAD_LOCK(state) spin_lock_saved_state_t state; spin_lock_irqsave(&thread_lock, state) +#define THREAD_UNLOCK(state) spin_unlock_irqrestore(&thread_lock, state) /* thread local storage */ static inline __ALWAYS_INLINE uintptr_t tls_get(uint entry) @@ -195,18 +183,22 @@ static inline __ALWAYS_INLINE uintptr_t tls_set(uint entry, uintptr_t val) struct thread_stats { lk_bigtime_t idle_time; lk_bigtime_t last_idle_timestamp; - int reschedules; - int context_switches; - int preempts; - int yields; - int interrupts; /* platform code increment this */ - int timer_ints; /* timer code increment this */ - int timers; /* timer code increment this */ + ulong reschedules; + ulong context_switches; + ulong preempts; + ulong yields; + ulong interrupts; /* platform code increment this */ + ulong timer_ints; /* timer code increment this */ + ulong timers; /* timer code increment this */ + +#if WITH_SMP + ulong reschedule_ipis; +#endif }; -extern struct thread_stats thread_stats; +extern struct thread_stats thread_stats[SMP_MAX_CPUS]; -#define THREAD_STATS_INC(name) do { thread_stats.name++; } while(0) +#define THREAD_STATS_INC(name) do { thread_stats[arch_curr_cpu_num()].name++; } while(0) #else @@ -216,3 +208,4 @@ extern struct thread_stats thread_stats; #endif +/* vim: set ts=4 sw=4 noexpandtab: */ diff --git a/include/kernel/vm.h b/include/kernel/vm.h index 5c1aa2c1..cce6f5e7 100644 --- a/include/kernel/vm.h +++ b/include/kernel/vm.h @@ -22,10 +22,26 @@ */ #pragma once +/* some assembly #defines, need to match the structure below */ +#if IS_64BIT +#define __MMU_INITIAL_MAPPING_PHYS_OFFSET 0 +#define __MMU_INITIAL_MAPPING_VIRT_OFFSET 8 +#define __MMU_INITIAL_MAPPING_SIZE_OFFSET 16 +#define __MMU_INITIAL_MAPPING_FLAGS_OFFSET 24 +#define __MMU_INITIAL_MAPPING_SIZE 40 +#else +#define __MMU_INITIAL_MAPPING_PHYS_OFFSET 0 +#define __MMU_INITIAL_MAPPING_VIRT_OFFSET 4 +#define __MMU_INITIAL_MAPPING_SIZE_OFFSET 8 +#define __MMU_INITIAL_MAPPING_FLAGS_OFFSET 12 +#define __MMU_INITIAL_MAPPING_SIZE 20 +#endif + /* flags for initial mapping struct */ #define MMU_INITIAL_MAPPING_TEMPORARY (0x1) #define MMU_INITIAL_MAPPING_FLAG_UNCACHED (0x2) #define MMU_INITIAL_MAPPING_FLAG_DEVICE (0x4) +#define MMU_INITIAL_MAPPING_FLAG_DYNAMIC (0x8) /* entry has to be patched up by platform_reset */ #ifndef ASSEMBLY @@ -50,6 +66,13 @@ struct mmu_initial_mapping { const char 
*name; }; +/* Assert that the assembly macros above match this struct. */ +STATIC_ASSERT(__offsetof(struct mmu_initial_mapping, phys) == __MMU_INITIAL_MAPPING_PHYS_OFFSET); +STATIC_ASSERT(__offsetof(struct mmu_initial_mapping, virt) == __MMU_INITIAL_MAPPING_VIRT_OFFSET); +STATIC_ASSERT(__offsetof(struct mmu_initial_mapping, size) == __MMU_INITIAL_MAPPING_SIZE_OFFSET); +STATIC_ASSERT(__offsetof(struct mmu_initial_mapping, flags) == __MMU_INITIAL_MAPPING_FLAGS_OFFSET); +STATIC_ASSERT(sizeof(struct mmu_initial_mapping) == __MMU_INITIAL_MAPPING_SIZE); + /* Platform or target must fill out one of these to set up the initial memory map * for kernel and enough IO space to boot. */ @@ -77,7 +100,7 @@ STATIC_ASSERT(KERNEL_ASPACE_BASE + (KERNEL_ASPACE_SIZE - 1) > KERNEL_ASPACE_BASE static inline bool is_kernel_address(vaddr_t va) { - return (va >= KERNEL_ASPACE_BASE && va <= (KERNEL_ASPACE_BASE + KERNEL_ASPACE_SIZE)); + return (va >= KERNEL_ASPACE_BASE && va <= (KERNEL_ASPACE_BASE + KERNEL_ASPACE_SIZE - 1)); } /* physical allocator */ @@ -183,7 +206,7 @@ status_t vmm_reserve_space(vmm_aspace_t *aspace, const char *name, size_t size, /* allocate a region of virtual space that maps a physical piece of address space. the physical pages that back this are not allocated from the pmm. */ -status_t vmm_alloc_physical(vmm_aspace_t *aspace, const char *name, size_t size, void **ptr, paddr_t paddr, uint vmm_flags, uint arch_mmu_flags) +status_t vmm_alloc_physical(vmm_aspace_t *aspace, const char *name, size_t size, void **ptr, uint8_t align_log2, paddr_t paddr, uint vmm_flags, uint arch_mmu_flags) __NONNULL((1)); /* allocate a region of memory backed by newly allocated contiguous physical memory */ diff --git a/include/kernel/wait.h b/include/kernel/wait.h index 8d23d69c..7f2c156c 100644 --- a/include/kernel/wait.h +++ b/include/kernel/wait.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008-2012 Travis Geiselbrecht + * Copyright (c) 2008-2014 Travis Geiselbrecht * * Permission is hereby granted, free of charge, to any person obtaining * a copy of this software and associated documentation files diff --git a/include/lib/cbuf.h b/include/lib/cbuf.h index 45e116dd..c8e77c03 100644 --- a/include/lib/cbuf.h +++ b/include/lib/cbuf.h @@ -25,6 +25,7 @@ #include #include +#include #include typedef struct cbuf { @@ -33,6 +34,7 @@ typedef struct cbuf { uint len_pow2; char *buf; event_t event; + spin_lock_t lock; } cbuf_t; /** diff --git a/include/lk/init.h b/include/lk/init.h index fd1b1ae2..d5744b38 100644 --- a/include/lk/init.h +++ b/include/lk/init.h @@ -7,8 +7,6 @@ * LK's init system */ -int lk_init_level(uint level); - typedef void (*lk_init_hook)(uint level); enum lk_init_level { @@ -29,25 +27,50 @@ enum lk_init_level { LK_INIT_LEVEL_LAST = UINT_MAX, }; +enum lk_init_flags { + LK_INIT_FLAG_PRIMARY_CPU = 0x1, + LK_INIT_FLAG_SECONDARY_CPUS = 0x2, + LK_INIT_FLAG_ALL_CPUS = LK_INIT_FLAG_PRIMARY_CPU | LK_INIT_FLAG_SECONDARY_CPUS, + LK_INIT_FLAG_CPU_SUSPEND = 0x4, + LK_INIT_FLAG_CPU_RESUME = 0x8, +}; + +void lk_init_level(enum lk_init_flags flags, uint start_level, uint stop_level); + +static inline void lk_primary_cpu_init_level(uint start_level, uint stop_level) { + lk_init_level(LK_INIT_FLAG_PRIMARY_CPU, start_level, stop_level); +} + +static inline void lk_init_level_all(enum lk_init_flags flags) { + lk_init_level(flags, LK_INIT_LEVEL_EARLIEST, LK_INIT_LEVEL_LAST); +} + struct lk_init_struct { uint level; + uint flags; lk_init_hook hook; const char *name; }; #ifdef ARCH_X86_64 -#define LK_INIT_HOOK(_name, _hook, _level) \ 
+#define LK_INIT_HOOK_FLAGS(_name, _hook, _level, _flags) \ const struct lk_init_struct _init_struct_##_name __ALIGNED(8) __SECTION(".lk_init") = { \ .level = _level, \ + .flags = _flags, \ .hook = _hook, \ .name = #_name, \ }; #else -#define LK_INIT_HOOK(_name, _hook, _level) \ +#define LK_INIT_HOOK_FLAGS(_name, _hook, _level, _flags) \ const struct lk_init_struct _init_struct_##_name __SECTION(".lk_init") = { \ .level = _level, \ + .flags = _flags, \ .hook = _hook, \ .name = #_name, \ }; #endif + +#define LK_INIT_HOOK(_name, _hook, _level) \ + LK_INIT_HOOK_FLAGS(_name, _hook, _level, LK_INIT_FLAG_PRIMARY_CPU) + // vim: set ts=4 sw=4 expandtab: diff --git a/include/lk/main.h b/include/lk/main.h new file mode 100644 index 00000000..bcf00428 --- /dev/null +++ b/include/lk/main.h @@ -0,0 +1,9 @@ +#pragma once + +#include +#include + + +void lk_main(ulong arg0, ulong arg1, ulong arg2, ulong arg3) __NO_RETURN __EXTERNALLY_VISIBLE; +void lk_secondary_cpu_entry(void); +void lk_init_secondary_cpus(uint secondary_cpu_count); diff --git a/include/stdlib.h b/include/stdlib.h index 540febb9..46eeedab 100644 --- a/include/stdlib.h +++ b/include/stdlib.h @@ -49,7 +49,12 @@ unsigned long long atoull(const char *num); #define STACKBUF_DMA_ALIGN(var, size) \ uint8_t __##var[(size) + CACHE_LINE]; uint8_t *var = (uint8_t *)(ROUNDUP((addr_t)__##var, CACHE_LINE)) +void abort(void) __attribute__((noreturn)); void qsort(void *aa, size_t n, size_t es, int (*cmp)(const void *, const void *)); +void *bsearch(const void *key, const void *base, size_t num_elems, size_t size, + int (*compare)(const void *, const void *)); +unsigned long int strtoul(const char *nptr, char **endptr, int base); +char *getenv(const char *name); #endif diff --git a/include/strings.h b/include/strings.h new file mode 100644 index 00000000..324dd8c6 --- /dev/null +++ b/include/strings.h @@ -0,0 +1,6 @@ +#if !defined(__STRINGS_H) +#define __STRINGS_H + +int strcasecmp(const char *s1, const char *s2); + +#endif /* !__STRINGS_H */ diff --git a/include/sys/types.h b/include/sys/types.h index e65e5d05..d1769eb5 100644 --- a/include/sys/types.h +++ b/include/sys/types.h @@ -46,14 +46,14 @@ typedef uintptr_t paddr_t; typedef int kobj_id; -typedef unsigned long lk_time_t; +typedef uint32_t lk_time_t; typedef unsigned long long lk_bigtime_t; -#define INFINITE_TIME ULONG_MAX +#define INFINITE_TIME UINT32_MAX -#define TIME_GTE(a, b) ((long)((a) - (b)) >= 0) -#define TIME_LTE(a, b) ((long)((a) - (b)) <= 0) -#define TIME_GT(a, b) ((long)((a) - (b)) > 0) -#define TIME_LT(a, b) ((long)((a) - (b)) < 0) +#define TIME_GTE(a, b) ((int32_t)((a) - (b)) >= 0) +#define TIME_LTE(a, b) ((int32_t)((a) - (b)) <= 0) +#define TIME_GT(a, b) ((int32_t)((a) - (b)) > 0) +#define TIME_LT(a, b) ((int32_t)((a) - (b)) < 0) enum handler_return { INT_NO_RESCHEDULE = 0, diff --git a/kernel/debug.c b/kernel/debug.c index c28402ae..d66cbaa0 100644 --- a/kernel/debug.c +++ b/kernel/debug.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include @@ -73,43 +74,74 @@ static int cmd_threads(int argc, const cmd_args *argv) #if THREAD_STATS static int cmd_threadstats(int argc, const cmd_args *argv) { - printf("thread stats:\n"); - printf("\ttotal idle time: %lld\n", thread_stats.idle_time); - printf("\ttotal busy time: %lld\n", current_time_hires() - thread_stats.idle_time); - printf("\treschedules: %d\n", thread_stats.reschedules); - printf("\tcontext_switches: %d\n", thread_stats.context_switches); - printf("\tpreempts: %d\n", thread_stats.preempts); - printf("\tyields: 
%d\n", thread_stats.yields); - printf("\tinterrupts: %d\n", thread_stats.interrupts); - printf("\ttimer interrupts: %d\n", thread_stats.timer_ints); - printf("\ttimers: %d\n", thread_stats.timers); + for (uint i = 0; i < SMP_MAX_CPUS; i++) { + if (!(mp.active_cpus & (1 << i))) + continue; + + printf("thread stats (cpu %d):\n", i); + printf("\ttotal idle time: %lld\n", thread_stats[i].idle_time); + printf("\ttotal busy time: %lld\n", current_time_hires() - thread_stats[i].idle_time); + printf("\treschedules: %lu\n", thread_stats[i].reschedules); +#if WITH_SMP + printf("\treschedule_ipis: %lu\n", thread_stats[i].reschedule_ipis); +#endif + printf("\tcontext_switches: %lu\n", thread_stats[i].context_switches); + printf("\tpreempts: %lu\n", thread_stats[i].preempts); + printf("\tyields: %lu\n", thread_stats[i].yields); + printf("\tinterrupts: %lu\n", thread_stats[i].interrupts); + printf("\ttimer interrupts: %lu\n", thread_stats[i].timer_ints); + printf("\ttimers: %lu\n", thread_stats[i].timers); + } return 0; } static enum handler_return threadload(struct timer *t, lk_time_t now, void *arg) { - static struct thread_stats old_stats; - static lk_bigtime_t last_idle_time; + static struct thread_stats old_stats[SMP_MAX_CPUS]; + static lk_bigtime_t last_idle_time[SMP_MAX_CPUS]; - lk_bigtime_t idle_time = thread_stats.idle_time; - if (get_current_thread()->priority == IDLE_PRIORITY) { - idle_time += current_time_hires() - thread_stats.last_idle_timestamp; + for (uint i = 0; i < SMP_MAX_CPUS; i++) { + /* dont display time for inactiv cpus */ + if (!(mp.active_cpus & (1 << i))) + continue; + + lk_bigtime_t idle_time = thread_stats[i].idle_time; + + /* if the cpu is currently idle, add the time since it went idle up until now to the idle counter */ + bool is_idle = !!(mp.idle_cpus & (1 << i)); + if (is_idle) { + idle_time += current_time_hires() - thread_stats[i].last_idle_timestamp; + } + + lk_bigtime_t delta_time = idle_time - last_idle_time[i]; + lk_bigtime_t busy_time = 1000000ULL - (delta_time > 1000000ULL ? 1000000ULL : delta_time); + uint busypercent = (busy_time * 10000) / (1000000); + + printf("cpu %u LOAD: " + "%u.%02u%%, " + "cs %lu, " + "pmpts %lu, " +#if WITH_SMP + "rs_ipis %lu, " +#endif + "ints %lu, " + "tmr ints %lu, " + "tmrs %lu\n", + i, + busypercent / 100, busypercent % 100, + thread_stats[i].context_switches - old_stats[i].context_switches, + thread_stats[i].preempts - old_stats[i].preempts, +#if WITH_SMP + thread_stats[i].reschedule_ipis - old_stats[i].reschedule_ipis, +#endif + thread_stats[i].interrupts - old_stats[i].interrupts, + thread_stats[i].timer_ints - old_stats[i].timer_ints, + thread_stats[i].timers - old_stats[i].timers); + + old_stats[i] = thread_stats[i]; + last_idle_time[i] = idle_time; } - lk_bigtime_t delta_time = idle_time - last_idle_time; - lk_bigtime_t busy_time = 1000000ULL - (delta_time > 1000000ULL ? 
1000000ULL : delta_time); - - uint busypercent = (busy_time * 10000) / (1000000); - -// printf("idle_time %lld, busytime %lld\n", idle_time - last_idle_time, busy_time); - printf("LOAD: %d.%02d%%, cs %d, ints %d, timer ints %d, timers %d\n", busypercent / 100, busypercent % 100, - thread_stats.context_switches - old_stats.context_switches, - thread_stats.interrupts - old_stats.interrupts, - thread_stats.timer_ints - old_stats.timer_ints, - thread_stats.timers - old_stats.timers); - - old_stats = thread_stats; - last_idle_time = idle_time; return INT_NO_RESCHEDULE; } @@ -119,8 +151,6 @@ static int cmd_threadload(int argc, const cmd_args *argv) static bool showthreadload = false; static timer_t tltimer; - enter_critical_section(); - if (showthreadload == false) { // start the display timer_initialize(&tltimer); @@ -131,8 +161,6 @@ static int cmd_threadload(int argc, const cmd_args *argv) showthreadload = false; } - exit_critical_section(); - return 0; } @@ -160,7 +188,7 @@ void kernel_evlog_add(uintptr_t id, uintptr_t arg0, uintptr_t arg1) uint index = evlog_bump_head(&kernel_evlog); kernel_evlog.items[index] = (uintptr_t)current_time_hires(); - kernel_evlog.items[index+1] = id; + kernel_evlog.items[index+1] = (arch_curr_cpu_num() << 16) | id; kernel_evlog.items[index+2] = arg0; kernel_evlog.items[index+3] = arg1; } @@ -170,24 +198,24 @@ void kernel_evlog_add(uintptr_t id, uintptr_t arg0, uintptr_t arg1) static void kevdump_cb(const uintptr_t *i) { - switch (i[1]) { + switch (i[1] & 0xffff) { case KERNEL_EVLOG_CONTEXT_SWITCH: - printf("%lu: context switch from %p to %p\n", i[0], (void *)i[2], (void *)i[3]); + printf("%lu.%lu: context switch from %p to %p\n", i[0], i[1] >> 16, (void *)i[2], (void *)i[3]); break; case KERNEL_EVLOG_PREEMPT: - printf("%lu: preempt on thread %p\n", i[0], (void *)i[2]); + printf("%lu.%lu: preempt on thread %p\n", i[0], i[1] >> 16, (void *)i[2]); break; case KERNEL_EVLOG_TIMER_TICK: - printf("%lu: timer tick\n", i[0]); + printf("%lu.%lu: timer tick\n", i[0], i[1] >> 16); break; case KERNEL_EVLOG_TIMER_CALL: - printf("%lu: timer call %p, arg %p\n", i[0], (void *)i[2], (void *)i[3]); + printf("%lu.%lu: timer call %p, arg %p\n", i[0], i[1] >> 16, (void *)i[2], (void *)i[3]); break; case KERNEL_EVLOG_IRQ_ENTER: - printf("%lu: irq entry %u\n", i[0], (uint)i[2]); + printf("%lu.%lu: irq entry %u\n", i[0], i[1] >> 16, (uint)i[2]); break; case KERNEL_EVLOG_IRQ_EXIT: - printf("%lu: irq exit %u\n", i[0], (uint)i[2]); + printf("%lu.%lu: irq exit %u\n", i[0], i[1] >> 16, (uint)i[2]); break; default: printf("%lu: unknown id 0x%x 0x%x 0x%x\n", i[0], i[1], (uint)i[2], (uint)i[3]); @@ -213,4 +241,4 @@ static int cmd_kevlog(int argc, const cmd_args *argv) #endif // WITH_KERNEL_EVLOG - +// vim: set noexpandtab: diff --git a/kernel/event.c b/kernel/event.c index 62f5b78b..b289b75f 100644 --- a/kernel/event.c +++ b/kernel/event.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008-2013 Travis Geiselbrecht + * Copyright (c) 2008-2014 Travis Geiselbrecht * * Permission is hereby granted, free of charge, to any person obtaining * a copy of this software and associated documentation files @@ -40,10 +40,11 @@ * @{ */ +#include #include #include #include -#include +#include /** * @brief Initialize an event object @@ -70,14 +71,14 @@ void event_destroy(event_t *e) { DEBUG_ASSERT(e->magic == EVENT_MAGIC); - enter_critical_section(); + THREAD_LOCK(state); e->magic = 0; e->signalled = false; e->flags = 0; wait_queue_destroy(&e->wait, true); - exit_critical_section(); + THREAD_UNLOCK(state); } /** @@ 
-101,7 +102,7 @@ status_t event_wait_timeout(event_t *e, lk_time_t timeout) DEBUG_ASSERT(e->magic == EVENT_MAGIC); - enter_critical_section(); + THREAD_LOCK(state); if (e->signalled) { /* signalled, we're going to fall through */ @@ -112,12 +113,9 @@ status_t event_wait_timeout(event_t *e, lk_time_t timeout) } else { /* unsignalled, block here */ ret = wait_queue_block(&e->wait, timeout); - if (ret < 0) - goto err; } -err: - exit_critical_section(); + THREAD_UNLOCK(state); return ret; } @@ -143,7 +141,7 @@ status_t event_signal(event_t *e, bool reschedule) { DEBUG_ASSERT(e->magic == EVENT_MAGIC); - enter_critical_section(); + THREAD_LOCK(state); if (!e->signalled) { if (e->flags & EVENT_FLAG_AUTOUNSIGNAL) { @@ -163,7 +161,7 @@ status_t event_signal(event_t *e, bool reschedule) } } - exit_critical_section(); + THREAD_UNLOCK(state); return NO_ERROR; } diff --git a/kernel/init.c b/kernel/init.c index 2a990832..557266c8 100644 --- a/kernel/init.c +++ b/kernel/init.c @@ -22,15 +22,20 @@ */ #include #include +#include #include #include -#include +#include void kernel_init(void) { // if enabled, configure the kernel's event log kernel_evlog_init(); + // initialize the mp subsystem + dprintf(SPEW, "initializing mp\n"); + mp_init(); + // initialize the threading system dprintf(SPEW, "initializing threads\n"); thread_init(); diff --git a/kernel/mp.c b/kernel/mp.c new file mode 100644 index 00000000..814050b8 --- /dev/null +++ b/kernel/mp.c @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2014 Travis Geiselbrecht + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files + * (the "Software"), to deal in the Software without restriction, + * including without limitation the rights to use, copy, modify, merge, + * publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, + * subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */ + +#include + +#include +#include +#include +#include +#include +#include + +#define LOCAL_TRACE 0 + +/* a global state structure, aligned on cpu cache line to minimize aliasing */ +struct mp_state mp __CPU_ALIGN; + +void mp_init(void) +{ +} + +void mp_reschedule(mp_cpu_mask_t target, uint flags) +{ +#if WITH_SMP + uint local_cpu = arch_curr_cpu_num(); + + LTRACEF("local %d, target 0x%x\n", local_cpu, target); + + /* mask out cpus that are not active and the local cpu */ + target &= mp.active_cpus; + + /* mask out cpus that are currently running realtime code */ + if ((flags & MP_RESCHEDULE_FLAG_REALTIME) == 0) { + target &= ~mp.realtime_cpus; + } + target &= ~(1U << local_cpu); + + LTRACEF("local %d, post mask target now 0x%x\n", local_cpu, target); + + arch_mp_send_ipi(target, MP_IPI_RESCHEDULE); +#endif +} + +void mp_set_curr_cpu_active(bool active) +{ + atomic_or((volatile int *)&mp.active_cpus, 1U << arch_curr_cpu_num()); +} + +#if WITH_SMP +enum handler_return mp_mbx_reschedule_irq(void) +{ + uint cpu = arch_curr_cpu_num(); + + LTRACEF("cpu %u\n", cpu); + + THREAD_STATS_INC(reschedule_ipis); + + return (mp.active_cpus & (1U << cpu)) ? INT_RESCHEDULE : INT_NO_RESCHEDULE; +} +#endif + +// vim: set noexpandtab: + diff --git a/kernel/mutex.c b/kernel/mutex.c index b0b7ea04..69b7f5df 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -60,11 +60,11 @@ void mutex_destroy(mutex_t *m) get_current_thread(), get_current_thread()->name, m, m->holder, m->holder->name); #endif - enter_critical_section(); + THREAD_LOCK(state); m->magic = 0; m->count = 0; wait_queue_destroy(&m->wait, true); - exit_critical_section(); + THREAD_UNLOCK(state); } /** @@ -87,7 +87,7 @@ status_t mutex_acquire_timeout(mutex_t *m, lk_time_t timeout) get_current_thread(), get_current_thread()->name, m); #endif - enter_critical_section(); + THREAD_LOCK(state); status_t ret = NO_ERROR; if (unlikely(++m->count > 1)) { @@ -112,7 +112,7 @@ status_t mutex_acquire_timeout(mutex_t *m, lk_time_t timeout) m->holder = get_current_thread(); err: - exit_critical_section(); + THREAD_UNLOCK(state); return ret; } @@ -130,7 +130,7 @@ status_t mutex_release(mutex_t *m) } #endif - enter_critical_section(); + THREAD_LOCK(state); m->holder = 0; @@ -139,7 +139,7 @@ status_t mutex_release(mutex_t *m) wait_queue_wake_one(&m->wait, true, NO_ERROR); } - exit_critical_section(); + THREAD_UNLOCK(state); return NO_ERROR; } diff --git a/kernel/rules.mk b/kernel/rules.mk index 1dd4ca36..aab917b3 100644 --- a/kernel/rules.mk +++ b/kernel/rules.mk @@ -15,6 +15,7 @@ MODULE_SRCS := \ $(LOCAL_DIR)/thread.c \ $(LOCAL_DIR)/timer.c \ $(LOCAL_DIR)/semaphore.c \ + $(LOCAL_DIR)/mp.c ifeq ($(WITH_KERNEL_VM),1) MODULE_DEPS += kernel/vm diff --git a/kernel/semaphore.c b/kernel/semaphore.c index a1395fae..219360a4 100644 --- a/kernel/semaphore.c +++ b/kernel/semaphore.c @@ -26,17 +26,17 @@ void sem_init(semaphore_t *sem, unsigned int value) void sem_destroy(semaphore_t *sem) { - enter_critical_section(); + THREAD_LOCK(state); sem->count = 0; wait_queue_destroy(&sem->wait, true); - exit_critical_section(); + THREAD_UNLOCK(state); } int sem_post(semaphore_t *sem, bool resched) { int ret = 0; - enter_critical_section(); + THREAD_LOCK(state); /* * If the count is or was negative then a thread is waiting for a resource, otherwise @@ -45,7 +45,7 @@ int sem_post(semaphore_t *sem, bool resched) if (unlikely(++sem->count <= 0)) ret = wait_queue_wake_one(&sem->wait, resched, NO_ERROR); - exit_critical_section(); + THREAD_UNLOCK(state); return ret; } @@ -53,7 +53,7 @@ int 
sem_post(semaphore_t *sem, bool resched) status_t sem_wait(semaphore_t *sem) { status_t ret = NO_ERROR; - enter_critical_section(); + THREAD_LOCK(state); /* * If there are no resources available then we need to @@ -62,28 +62,28 @@ status_t sem_wait(semaphore_t *sem) if (unlikely(--sem->count < 0)) ret = wait_queue_block(&sem->wait, INFINITE_TIME); - exit_critical_section(); + THREAD_UNLOCK(state); return ret; } status_t sem_trywait(semaphore_t *sem) { status_t ret = NO_ERROR; - enter_critical_section(); + THREAD_LOCK(state); if (unlikely(sem->count <= 0)) ret = ERR_NOT_READY; else sem->count--; - exit_critical_section(); + THREAD_UNLOCK(state); return ret; } status_t sem_timedwait(semaphore_t *sem, lk_time_t timeout) { status_t ret = NO_ERROR; - enter_critical_section(); + THREAD_LOCK(state); if (unlikely(--sem->count < 0)) { ret = wait_queue_block(&sem->wait, timeout); @@ -94,7 +94,7 @@ status_t sem_timedwait(semaphore_t *sem, lk_time_t timeout) } } - exit_critical_section(); + THREAD_UNLOCK(state); return ret; } diff --git a/kernel/thread.c b/kernel/thread.c index 721ae19a..180980bb 100644 --- a/kernel/thread.c +++ b/kernel/thread.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008-2009 Travis Geiselbrecht + * Copyright (c) 2008-2014 Travis Geiselbrecht * * Permission is hereby granted, free of charge, to any person obtaining * a copy of this software and associated documentation files @@ -35,11 +35,13 @@ #include #include #include +#include #include #include #include #include #include +#include #include #include #include @@ -49,24 +51,24 @@ #endif #if THREAD_STATS -struct thread_stats thread_stats; +struct thread_stats thread_stats[SMP_MAX_CPUS]; #endif /* global thread list */ static struct list_node thread_list; -/* the global critical section count */ -int critical_section_count; +/* master thread spinlock */ +spin_lock_t thread_lock = SPIN_LOCK_INITIAL_VALUE; /* the run queue */ static struct list_node run_queue[NUM_PRIORITIES]; static uint32_t run_queue_bitmap; -/* the bootstrap thread (statically allocated) */ -static thread_t bootstrap_thread; +/* make sure the bitmap is large enough to cover our number of priorities */ +STATIC_ASSERT(NUM_PRIORITIES <= sizeof(run_queue_bitmap) * 8); -/* the idle thread */ -static thread_t *idle_thread; +/* the idle thread(s) (statically allocated) */ +static thread_t idle_threads[SMP_MAX_CPUS]; /* local routines */ static void thread_resched(void); @@ -74,7 +76,7 @@ static void idle_thread_routine(void) __NO_RETURN; #if PLATFORM_HAS_DYNAMIC_TIMER /* preemption timer */ -static timer_t preempt_timer; +static timer_t preempt_timer[SMP_MAX_CPUS]; #endif /* run queue manipulation */ @@ -84,7 +86,8 @@ static void insert_in_run_queue_head(thread_t *t) ASSERT(t->magic == THREAD_MAGIC); ASSERT(t->state == THREAD_READY); ASSERT(!list_in_list(&t->queue_node)); - ASSERT(in_critical_section()); + ASSERT(arch_ints_disabled()); + ASSERT(spin_lock_held(&thread_lock)); #endif list_add_head(&run_queue[t->priority], &t->queue_node); @@ -97,7 +100,8 @@ static void insert_in_run_queue_tail(thread_t *t) ASSERT(t->magic == THREAD_MAGIC); ASSERT(t->state == THREAD_READY); ASSERT(!list_in_list(&t->queue_node)); - ASSERT(in_critical_section()); + ASSERT(arch_ints_disabled()); + ASSERT(spin_lock_held(&thread_lock)); #endif list_add_tail(&run_queue[t->priority], &t->queue_node); @@ -108,6 +112,7 @@ static void init_thread_struct(thread_t *t, const char *name) { memset(t, 0, sizeof(thread_t)); t->magic = THREAD_MAGIC; + t->pinned_cpu = -1; strlcpy(t->name, name, sizeof(t->name)); } 
@@ -154,10 +159,10 @@ thread_t *thread_create_etc(thread_t *t, const char *name, thread_start_routine t->entry = entry; t->arg = arg; t->priority = priority; - t->saved_critical_section_count = 1; /* we always start inside a critical section */ t->state = THREAD_SUSPENDED; t->blocking_wait_queue = NULL; t->wait_queue_block_ret = NO_ERROR; + t->curr_cpu = -1; t->retcode = 0; wait_queue_init(&t->retcode_wait_queue); @@ -188,9 +193,9 @@ thread_t *thread_create_etc(thread_t *t, const char *name, thread_start_routine arch_thread_initialize(t); /* add it to the global thread list */ - enter_critical_section(); + THREAD_LOCK(state); list_add_head(&thread_list, &t->thread_list_node); - exit_critical_section(); + THREAD_UNLOCK(state); return t; } @@ -216,22 +221,32 @@ status_t thread_set_real_time(thread_t *t) ASSERT(t->magic == THREAD_MAGIC); #endif - enter_critical_section(); + THREAD_LOCK(state); #if PLATFORM_HAS_DYNAMIC_TIMER if (t == get_current_thread()) { /* if we're currently running, cancel the preemption timer. */ - timer_cancel(&preempt_timer); + timer_cancel(&preempt_timer[arch_curr_cpu_num()]); } #endif t->flags |= THREAD_FLAG_REAL_TIME; - exit_critical_section(); + THREAD_UNLOCK(state); return NO_ERROR; } -static bool thread_is_real_time(thread_t *t) +static bool thread_is_realtime(thread_t *t) { - return !!(t->flags & THREAD_FLAG_REAL_TIME); + return (t->flags & THREAD_FLAG_REAL_TIME) && t->priority > DEFAULT_PRIORITY; +} + +static bool thread_is_idle(thread_t *t) +{ + return !!(t->flags & THREAD_FLAG_IDLE); +} + +static bool thread_is_real_time_or_idle(thread_t *t) +{ + return !!(t->flags & (THREAD_FLAG_REAL_TIME | THREAD_FLAG_IDLE)); } /** @@ -251,13 +266,22 @@ status_t thread_resume(thread_t *t) ASSERT(t->state != THREAD_DEATH); #endif - enter_critical_section(); + bool resched = false; + bool ints_disabled = arch_ints_disabled(); + THREAD_LOCK(state); if (t->state == THREAD_SUSPENDED) { t->state = THREAD_READY; insert_in_run_queue_head(t); - thread_yield(); + if (!ints_disabled) /* HACK, don't resched into bootstrap thread before idle thread is set up */ + resched = true; } - exit_critical_section(); + + mp_reschedule(MP_CPU_ALL_BUT_LOCAL, 0); + + THREAD_UNLOCK(state); + + if (resched) + thread_yield(); return NO_ERROR; } @@ -277,11 +301,11 @@ status_t thread_join(thread_t *t, int *retcode, lk_time_t timeout) ASSERT(t->magic == THREAD_MAGIC); #endif - enter_critical_section(); + THREAD_LOCK(state); if (t->flags & THREAD_FLAG_DETACHED) { /* the thread is detached, go ahead and exit */ - exit_critical_section(); + THREAD_UNLOCK(state); return ERR_THREAD_DETACHED; } @@ -289,7 +313,7 @@ status_t thread_join(thread_t *t, int *retcode, lk_time_t timeout) if (t->state != THREAD_DEATH) { status_t err = wait_queue_block(&t->retcode_wait_queue, timeout); if (err < 0) { - exit_critical_section(); + THREAD_UNLOCK(state); return err; } } @@ -311,7 +335,7 @@ status_t thread_join(thread_t *t, int *retcode, lk_time_t timeout) /* clear the structure's magic */ t->magic = 0; - exit_critical_section(); + THREAD_UNLOCK(state); /* free its stack and the thread structure itself */ if (t->flags & THREAD_FLAG_FREE_STACK && t->stack) @@ -329,7 +353,7 @@ status_t thread_detach(thread_t *t) ASSERT(t->magic == THREAD_MAGIC); #endif - enter_critical_section(); + THREAD_LOCK(state); /* if another thread is blocked inside thread_join() on this thread, * wake them up with a specific return code */ @@ -338,11 +362,11 @@ status_t thread_detach(thread_t *t) /* if it's already dead, then just do what join would have
and exit */ if (t->state == THREAD_DEATH) { t->flags &= ~THREAD_FLAG_DETACHED; /* makes sure thread_join continues */ - exit_critical_section(); + THREAD_UNLOCK(state); return thread_join(t, NULL, 0); } else { t->flags |= THREAD_FLAG_DETACHED; - exit_critical_section(); + THREAD_UNLOCK(state); return NO_ERROR; } } @@ -361,11 +385,12 @@ void thread_exit(int retcode) #if THREAD_CHECKS ASSERT(current_thread->magic == THREAD_MAGIC); ASSERT(current_thread->state == THREAD_RUNNING); + ASSERT(!thread_is_idle(current_thread)); #endif // dprintf("thread_exit: current %p\n", current_thread); - enter_critical_section(); + THREAD_LOCK(state); /* enter the dead state */ current_thread->state = THREAD_DEATH; @@ -402,6 +427,34 @@ static void idle_thread_routine(void) arch_idle(); } +static thread_t *get_top_thread(int cpu) +{ + thread_t *newthread; + uint32_t local_run_queue_bitmap = run_queue_bitmap; + uint next_queue; + + while (local_run_queue_bitmap) { + /* find the first (remaining) queue with a thread in it */ + next_queue = HIGHEST_PRIORITY - __builtin_clz(local_run_queue_bitmap) + - (sizeof(run_queue_bitmap) * 8 - NUM_PRIORITIES); + + list_for_every_entry(&run_queue[next_queue], newthread, thread_t, queue_node) { + if (newthread->pinned_cpu < 0 || newthread->pinned_cpu == cpu) { + list_delete(&newthread->queue_node); + + if (list_is_empty(&run_queue[next_queue])) + run_queue_bitmap &= ~(1<state != THREAD_RUNNING); #endif THREAD_STATS_INC(reschedules); - oldthread = current_thread; - - // at the moment, can't deal with more than 32 priority levels - ASSERT(NUM_PRIORITIES <= 32); - - // should at least find the idle thread -#if THREAD_CHECKS - ASSERT(run_queue_bitmap != 0); -#endif - - int next_queue = HIGHEST_PRIORITY - __builtin_clz(run_queue_bitmap) - (32 - NUM_PRIORITIES); - //dprintf(SPEW, "bitmap 0x%x, next %d\n", run_queue_bitmap, next_queue); - - newthread = list_remove_head_type(&run_queue[next_queue], thread_t, queue_node); - - if (list_is_empty(&run_queue[next_queue])) - run_queue_bitmap &= ~(1<state = THREAD_RUNNING; + oldthread = current_thread; + if (newthread == oldthread) return; @@ -463,46 +499,74 @@ void thread_resched(void) newthread->remaining_quantum = 5; // XXX make this smarter } + /* mark the cpu ownership of the threads */ + oldthread->curr_cpu = -1; + newthread->curr_cpu = cpu; + + if (thread_is_idle(newthread)) { + mp_set_cpu_idle(cpu); + } else { + mp_set_cpu_busy(cpu); + } + + if (thread_is_realtime(newthread)) { + mp_set_cpu_realtime(cpu); + } else { + mp_set_cpu_non_realtime(cpu); + } + #if THREAD_STATS THREAD_STATS_INC(context_switches); - if (oldthread == idle_thread) { + if (thread_is_idle(oldthread)) { lk_bigtime_t now = current_time_hires(); - thread_stats.idle_time += now - thread_stats.last_idle_timestamp; + thread_stats[cpu].idle_time += now - thread_stats[cpu].last_idle_timestamp; } - if (newthread == idle_thread) { - thread_stats.last_idle_timestamp = current_time_hires(); + if (thread_is_idle(newthread)) { + thread_stats[cpu].last_idle_timestamp = current_time_hires(); } #endif KEVLOG_THREAD_SWITCH(oldthread, newthread); -#if THREAD_CHECKS - ASSERT(critical_section_count > 0); - ASSERT(newthread->saved_critical_section_count > 0); -#endif - #if PLATFORM_HAS_DYNAMIC_TIMER - if (thread_is_real_time(newthread)) { - if (!thread_is_real_time(oldthread)) { + if (thread_is_real_time_or_idle(newthread)) { + if (!thread_is_real_time_or_idle(oldthread)) { /* if we're switching from a non real time to a real time, cancel * the preemption timer. 
*/ - timer_cancel(&preempt_timer); +#ifdef DEBUG_THREAD_CONTEXT_SWITCH + dprintf(ALWAYS, "arch_context_switch: stop preempt, cpu %d, old %p (%s), new %p (%s)\n", + cpu, oldthread, oldthread->name, newthread, newthread->name); +#endif + timer_cancel(&preempt_timer[cpu]); } - } else if (thread_is_real_time(oldthread)) { + } else if (thread_is_real_time_or_idle(oldthread)) { /* if we're switching from a real time (or idle thread) to a regular one, * set up a periodic timer to run our preemption tick. */ - timer_set_periodic(&preempt_timer, 10, (timer_callback)thread_timer_tick, NULL); +#ifdef DEBUG_THREAD_CONTEXT_SWITCH + dprintf(ALWAYS, "arch_context_switch: start preempt, cpu %d, old %p (%s), new %p (%s)\n", + cpu, oldthread, oldthread->name, newthread, newthread->name); +#endif + timer_set_periodic(&preempt_timer[cpu], 10, (timer_callback)thread_timer_tick, NULL); } #endif /* set some optional target debug leds */ - target_set_debug_led(0, newthread != idle_thread); + target_set_debug_led(0, !thread_is_idle(&idle_threads[cpu])); /* do the switch */ - oldthread->saved_critical_section_count = critical_section_count; set_current_thread(newthread); - critical_section_count = newthread->saved_critical_section_count; + +#ifdef DEBUG_THREAD_CONTEXT_SWITCH + dprintf(ALWAYS, "arch_context_switch: cpu %d, old %p (%s, pri %d, flags 0x%x), new %p (%s, pri %d, flags 0x%x)\n", + cpu, oldthread, oldthread->name, oldthread->priority, + oldthread->flags, newthread, newthread->name, + newthread->priority, newthread->flags); +#endif + +#ifdef WITH_LIB_UTHREAD + uthread_context_switch(oldthread, newthread); +#endif arch_context_switch(oldthread, newthread); } @@ -524,17 +588,19 @@ void thread_yield(void) ASSERT(current_thread->state == THREAD_RUNNING); #endif - enter_critical_section(); + THREAD_LOCK(state); THREAD_STATS_INC(yields); /* we are yielding the cpu, so stick ourselves into the tail of the run queue and reschedule */ current_thread->state = THREAD_READY; current_thread->remaining_quantum = 0; - insert_in_run_queue_tail(current_thread); + if (likely(!thread_is_idle(current_thread))) { /* idle thread doesn't go in the run queue */ + insert_in_run_queue_tail(current_thread); + } thread_resched(); - exit_critical_section(); + THREAD_UNLOCK(state); } /** @@ -559,23 +625,28 @@ void thread_preempt(void) #if THREAD_CHECKS ASSERT(current_thread->magic == THREAD_MAGIC); ASSERT(current_thread->state == THREAD_RUNNING); - ASSERT(in_critical_section()); #endif #if THREAD_STATS - if (current_thread != idle_thread) + if (!thread_is_idle(current_thread)) THREAD_STATS_INC(preempts); /* only track when a meaningful preempt happens */ #endif KEVLOG_THREAD_PREEMPT(current_thread); + THREAD_LOCK(state); + /* we are being preempted, so we get to go back into the front of the run queue if we have quantum left */ current_thread->state = THREAD_READY; - if (current_thread->remaining_quantum > 0) - insert_in_run_queue_head(current_thread); - else - insert_in_run_queue_tail(current_thread); /* if we're out of quantum, go to the tail of the queue */ + if (likely(!thread_is_idle(current_thread))) { /* idle thread doesn't go in the run queue */ + if (current_thread->remaining_quantum > 0) + insert_in_run_queue_head(current_thread); + else + insert_in_run_queue_tail(current_thread); /* if we're out of quantum, go to the tail of the queue */ + } thread_resched(); + + THREAD_UNLOCK(state); } /** @@ -595,7 +666,8 @@ void thread_block(void) ASSERT(current_thread->magic == THREAD_MAGIC); ASSERT(current_thread->state == 
THREAD_BLOCKED); - ASSERT(in_critical_section()); + ASSERT(spin_lock_held(&thread_lock)); + ASSERT(!thread_is_idle(current_thread)); #endif /* we are blocking on something. the blocking code should have already stuck us on a queue */ @@ -607,11 +679,13 @@ void thread_unblock(thread_t *t, bool resched) #if THREAD_CHECKS ASSERT(t->magic == THREAD_MAGIC); ASSERT(t->state == THREAD_BLOCKED); - ASSERT(in_critical_section()); + ASSERT(spin_lock_held(&thread_lock)); + ASSERT(!thread_is_idle(t)); #endif t->state = THREAD_READY; insert_in_run_queue_head(t); + mp_reschedule(MP_CPU_ALL_BUT_LOCAL, 0); if (resched) thread_resched(); } @@ -620,7 +694,7 @@ enum handler_return thread_timer_tick(void) { thread_t *current_thread = get_current_thread(); - if (thread_is_real_time(current_thread)) + if (thread_is_real_time_or_idle(current_thread)) return INT_NO_RESCHEDULE; current_thread->remaining_quantum--; @@ -641,9 +715,13 @@ static enum handler_return thread_sleep_handler(timer_t *timer, lk_time_t now, v ASSERT(t->state == THREAD_SLEEPING); #endif + THREAD_LOCK(state); + t->state = THREAD_READY; insert_in_run_queue_head(t); + THREAD_UNLOCK(state); + return INT_RESCHEDULE; } @@ -666,15 +744,16 @@ void thread_sleep(lk_time_t delay) #if THREAD_CHECKS ASSERT(current_thread->magic == THREAD_MAGIC); ASSERT(current_thread->state == THREAD_RUNNING); + ASSERT(!thread_is_idle(current_thread)); #endif timer_initialize(&timer); - enter_critical_section(); + THREAD_LOCK(state); timer_set_oneshot(&timer, delay, thread_sleep_handler, (void *)current_thread); current_thread->state = THREAD_SLEEPING; thread_resched(); - exit_critical_section(); + THREAD_UNLOCK(state); } /** @@ -686,6 +765,8 @@ void thread_init_early(void) { int i; + DEBUG_ASSERT(arch_curr_cpu_num() == 0); + /* initialize the run queues */ for (i=0; i < NUM_PRIORITIES; i++) list_initialize(&run_queue[i]); @@ -694,14 +775,15 @@ void thread_init_early(void) list_initialize(&thread_list); /* create a thread to cover the current running state */ - thread_t *t = &bootstrap_thread; + thread_t *t = &idle_threads[0]; init_thread_struct(t, "bootstrap"); /* half construct this thread, since we're already running */ t->priority = HIGHEST_PRIORITY; t->state = THREAD_RUNNING; - t->saved_critical_section_count = 1; t->flags = THREAD_FLAG_DETACHED; + t->curr_cpu = 0; + t->pinned_cpu = 0; wait_queue_init(&t->retcode_wait_queue); list_add_head(&thread_list, &t->thread_list_node); set_current_thread(t); @@ -715,7 +797,9 @@ void thread_init_early(void) void thread_init(void) { #if PLATFORM_HAS_DYNAMIC_TIMER - timer_initialize(&preempt_timer); + for (uint i = 0; i < SMP_MAX_CPUS; i++) { + timer_initialize(&preempt_timer[i]); + } #endif } @@ -735,11 +819,21 @@ void thread_set_name(const char *name) */ void thread_set_priority(int priority) { - if (priority < LOWEST_PRIORITY) - priority = LOWEST_PRIORITY; + thread_t *current_thread = get_current_thread(); + + THREAD_LOCK(state); + + if (priority <= IDLE_PRIORITY) + priority = IDLE_PRIORITY + 1; if (priority > HIGHEST_PRIORITY) priority = HIGHEST_PRIORITY; - get_current_thread()->priority = priority; + current_thread->priority = priority; + + current_thread->state = THREAD_READY; + insert_in_run_queue_head(current_thread); + thread_resched(); + + THREAD_UNLOCK(state); } /** @@ -751,17 +845,71 @@ void thread_set_priority(int priority) */ void thread_become_idle(void) { - idle_thread = get_current_thread(); + DEBUG_ASSERT(arch_ints_disabled()); - thread_set_name("idle"); - thread_set_priority(IDLE_PRIORITY); + thread_t *t = 
get_current_thread(); - /* mark the idle thread as real time, to avoid running the preemption - * timer when it is scheduled. */ - thread_set_real_time(idle_thread); + char name[16]; + snprintf(name, sizeof(name), "idle %d", arch_curr_cpu_num()); + thread_set_name(name); - /* release the implicit boot critical section and yield to the scheduler */ - exit_critical_section(); + /* mark ourself as idle */ + t->priority = IDLE_PRIORITY; + t->flags |= THREAD_FLAG_IDLE; + t->pinned_cpu = arch_curr_cpu_num(); + + mp_set_curr_cpu_active(true); + mp_set_cpu_idle(arch_curr_cpu_num()); + + /* enable interrupts and start the scheduler */ + arch_enable_ints(); + thread_yield(); + + idle_thread_routine(); +} + +/* create an idle thread for the cpu we're on, and start scheduling */ + +void thread_secondary_cpu_init_early(void) +{ + DEBUG_ASSERT(arch_ints_disabled()); + + /* construct an idle thread to cover our cpu */ + uint cpu = arch_curr_cpu_num(); + thread_t *t = &idle_threads[cpu]; + + char name[16]; + snprintf(name, sizeof(name), "idle %d", cpu); + init_thread_struct(t, name); + t->pinned_cpu = cpu; + + /* half construct this thread, since we're already running */ + t->priority = HIGHEST_PRIORITY; + t->state = THREAD_RUNNING; + t->flags = THREAD_FLAG_DETACHED | THREAD_FLAG_IDLE; + t->curr_cpu = cpu; + t->pinned_cpu = cpu; + wait_queue_init(&t->retcode_wait_queue); + + THREAD_LOCK(state); + + list_add_head(&thread_list, &t->thread_list_node); + set_current_thread(t); + + THREAD_UNLOCK(state); +} + +void thread_secondary_cpu_entry(void) +{ + uint cpu = arch_curr_cpu_num(); + thread_t *t = get_current_thread(); + t->priority = IDLE_PRIORITY; + + mp_set_curr_cpu_active(true); + mp_set_cpu_idle(cpu); + + /* enable interrupts and start the scheduler on this cpu */ + arch_enable_ints(); thread_yield(); idle_thread_routine(); @@ -786,9 +934,8 @@ static const char *thread_state_to_str(enum thread_state state) void dump_thread(thread_t *t) { dprintf(INFO, "dump_thread: t %p (%s)\n", t, t->name); - dprintf(INFO, "\tstate %s, priority %d, remaining quantum %d, critical section %d\n", - thread_state_to_str(t->state), t->priority, t->remaining_quantum, - t->saved_critical_section_count); + dprintf(INFO, "\tstate %s, curr_cpu %d, pinned_cpu %d, priority %d, remaining quantum %d\n", + thread_state_to_str(t->state), t->curr_cpu, t->pinned_cpu, t->priority, t->remaining_quantum); dprintf(INFO, "\tstack %p, stack_size %zd\n", t->stack, t->stack_size); dprintf(INFO, "\tentry %p, arg %p, flags 0x%x\n", t->entry, t->arg, t->flags); dprintf(INFO, "\twait queue %p, wait queue ret %d\n", t->blocking_wait_queue, t->wait_queue_block_ret); @@ -798,6 +945,7 @@ void dump_thread(thread_t *t) dprintf(INFO, " 0x%lx", t->tls[i]); } dprintf(INFO, "\n"); + arch_dump_thread(t); } /** @@ -807,11 +955,11 @@ void dump_all_threads(void) { thread_t *t; - enter_critical_section(); + THREAD_LOCK(state); list_for_every_entry(&thread_list, t, thread_t, thread_list_node) { dump_thread(t); } - exit_critical_section(); + THREAD_UNLOCK(state); } /** @} */ @@ -834,10 +982,16 @@ static enum handler_return wait_queue_timeout_handler(timer_t *timer, lk_time_t ASSERT(thread->magic == THREAD_MAGIC); #endif - if (thread_unblock_from_wait_queue(thread, ERR_TIMED_OUT) >= NO_ERROR) - return INT_RESCHEDULE; + spin_lock(&thread_lock); - return INT_NO_RESCHEDULE; + enum handler_return ret = INT_NO_RESCHEDULE; + if (thread_unblock_from_wait_queue(thread, ERR_TIMED_OUT) >= NO_ERROR) { + ret = INT_RESCHEDULE; + } + + spin_unlock(&thread_lock); + + return ret; } 
/** @@ -867,7 +1021,8 @@ status_t wait_queue_block(wait_queue_t *wait, lk_time_t timeout) #if THREAD_CHECKS ASSERT(wait->magic == WAIT_QUEUE_MAGIC); ASSERT(current_thread->state == THREAD_RUNNING); - ASSERT(in_critical_section()); + ASSERT(arch_ints_disabled()); + ASSERT(spin_lock_held(&thread_lock)); #endif if (timeout == 0) @@ -885,7 +1040,7 @@ status_t wait_queue_block(wait_queue_t *wait, lk_time_t timeout) timer_set_oneshot(&timer, timeout, wait_queue_timeout_handler, (void *)current_thread); } - thread_block(); + thread_resched(); /* we don't really know if the timer fired or not, so it's better safe to try to cancel it */ if (timeout != INFINITE_TIME) { @@ -918,7 +1073,8 @@ int wait_queue_wake_one(wait_queue_t *wait, bool reschedule, status_t wait_queue #if THREAD_CHECKS ASSERT(wait->magic == WAIT_QUEUE_MAGIC); - ASSERT(in_critical_section()); + ASSERT(arch_ints_disabled()); + ASSERT(spin_lock_held(&thread_lock)); #endif t = list_remove_head_type(&wait->list, thread_t, queue_node); @@ -940,9 +1096,12 @@ int wait_queue_wake_one(wait_queue_t *wait, bool reschedule, status_t wait_queue insert_in_run_queue_head(current_thread); } insert_in_run_queue_head(t); - if (reschedule) + mp_reschedule(MP_CPU_ALL_BUT_LOCAL, 0); + if (reschedule) { thread_resched(); + } ret = 1; + } return ret; @@ -972,7 +1131,8 @@ int wait_queue_wake_all(wait_queue_t *wait, bool reschedule, status_t wait_queue #if THREAD_CHECKS ASSERT(wait->magic == WAIT_QUEUE_MAGIC); - ASSERT(in_critical_section()); + ASSERT(arch_ints_disabled()); + ASSERT(spin_lock_held(&thread_lock)); #endif if (reschedule && wait->count > 0) { @@ -1002,8 +1162,12 @@ int wait_queue_wake_all(wait_queue_t *wait, bool reschedule, status_t wait_queue ASSERT(wait->count == 0); #endif - if (reschedule && ret > 0) - thread_resched(); + if (ret > 0) { + mp_reschedule(MP_CPU_ALL_BUT_LOCAL, 0); + if (reschedule) { + thread_resched(); + } + } return ret; } @@ -1017,7 +1181,8 @@ void wait_queue_destroy(wait_queue_t *wait, bool reschedule) { #if THREAD_CHECKS ASSERT(wait->magic == WAIT_QUEUE_MAGIC); - ASSERT(in_critical_section()); + ASSERT(arch_ints_disabled()); + ASSERT(spin_lock_held(&thread_lock)); #endif wait_queue_wake_all(wait, reschedule, ERR_OBJECT_DESTROYED); wait->magic = 0; @@ -1038,8 +1203,9 @@ void wait_queue_destroy(wait_queue_t *wait, bool reschedule) status_t thread_unblock_from_wait_queue(thread_t *t, status_t wait_queue_error) { #if THREAD_CHECKS - ASSERT(in_critical_section()); ASSERT(t->magic == THREAD_MAGIC); + ASSERT(arch_ints_disabled()); + ASSERT(spin_lock_held(&thread_lock)); #endif if (t->state != THREAD_BLOCKED) @@ -1057,6 +1223,7 @@ status_t thread_unblock_from_wait_queue(thread_t *t, status_t wait_queue_error) t->state = THREAD_READY; t->wait_queue_block_ret = wait_queue_error; insert_in_run_queue_head(t); + mp_reschedule(MP_CPU_ALL_BUT_LOCAL, 0); return NO_ERROR; } diff --git a/kernel/timer.c b/kernel/timer.c index 0b708870..48ec7dad 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008-2009 Travis Geiselbrecht + * Copyright (c) 2008-2014 Travis Geiselbrecht * * Permission is hereby granted, free of charge, to any person obtaining * a copy of this software and associated documentation files @@ -41,12 +41,19 @@ #include #include #include +#include #include #include #define LOCAL_TRACE 0 -static struct list_node timer_queue; +spin_lock_t timer_lock; + +struct timer_state { + struct list_node timer_queue; +} __CPU_ALIGN; + +static struct timer_state timers[SMP_MAX_CPUS]; static enum 
handler_return timer_tick(void *arg, lk_time_t now); @@ -58,13 +65,15 @@ void timer_initialize(timer_t *timer) *timer = (timer_t)TIMER_INITIAL_VALUE(*timer); } -static void insert_timer_in_queue(timer_t *timer) +static void insert_timer_in_queue(uint cpu, timer_t *timer) { timer_t *entry; - LTRACEF("timer %p, scheduled %lu, periodic %lu\n", timer, timer->scheduled_time, timer->periodic_time); + DEBUG_ASSERT(arch_ints_disabled()); - list_for_every_entry(&timer_queue, entry, timer_t, node) { + LTRACEF("timer %p, cpu %u, scheduled %lu, periodic %lu\n", timer, cpu, timer->scheduled_time, timer->periodic_time); + + list_for_every_entry(&timers[cpu].timer_queue, entry, timer_t, node) { if (TIME_GT(entry->scheduled_time, timer->scheduled_time)) { list_add_before(&entry->node, &timer->node); return; @@ -72,14 +81,14 @@ static void insert_timer_in_queue(timer_t *timer) } /* walked off the end of the list */ - list_add_tail(&timer_queue, &timer->node); + list_add_tail(&timers[cpu].timer_queue, &timer->node); } static void timer_set(timer_t *timer, lk_time_t delay, lk_time_t period, timer_callback callback, void *arg) { lk_time_t now; - LTRACEF("timer %p, delay %lu, period %lu, callback %p, arg %p, now %lu\n", timer, delay, period, callback, arg, now); + LTRACEF("timer %p, delay %lu, period %lu, callback %p, arg %p\n", timer, delay, period, callback, arg); DEBUG_ASSERT(timer->magic == TIMER_MAGIC); @@ -95,19 +104,21 @@ static void timer_set(timer_t *timer, lk_time_t delay, lk_time_t period, timer_c LTRACEF("scheduled time %lu\n", timer->scheduled_time); - enter_critical_section(); + spin_lock_saved_state_t state; + spin_lock_irqsave(&timer_lock, state); - insert_timer_in_queue(timer); + uint cpu = arch_curr_cpu_num(); + insert_timer_in_queue(cpu, timer); #if PLATFORM_HAS_DYNAMIC_TIMER - if (list_peek_head_type(&timer_queue, timer_t, node) == timer) { + if (list_peek_head_type(&timers[cpu].timer_queue, timer_t, node) == timer) { /* we just modified the head of the timer queue */ LTRACEF("setting new timer for %u msecs\n", (uint)delay); platform_set_oneshot_timer(timer_tick, NULL, delay); } #endif - exit_critical_section(); + spin_unlock_irqrestore(&timer_lock, state); } /** @@ -159,10 +170,13 @@ void timer_cancel(timer_t *timer) { DEBUG_ASSERT(timer->magic == TIMER_MAGIC); - enter_critical_section(); + spin_lock_saved_state_t state; + spin_lock_irqsave(&timer_lock, state); #if PLATFORM_HAS_DYNAMIC_TIMER - timer_t *oldhead = list_peek_head_type(&timer_queue, timer_t, node); + uint cpu = arch_curr_cpu_num(); + + timer_t *oldhead = list_peek_head_type(&timers[cpu].timer_queue, timer_t, node); #endif if (list_in_list(&timer->node)) @@ -177,7 +191,7 @@ void timer_cancel(timer_t *timer) #if PLATFORM_HAS_DYNAMIC_TIMER /* see if we've just modified the head of the timer queue */ - timer_t *newhead = list_peek_head_type(&timer_queue, timer_t, node); + timer_t *newhead = list_peek_head_type(&timers[cpu].timer_queue, timer_t, node); if (newhead == NULL) { LTRACEF("clearing old hw timer, nothing in the queue\n"); platform_stop_timer(); @@ -195,7 +209,7 @@ void timer_cancel(timer_t *timer) } #endif - exit_critical_section(); + spin_unlock_irqrestore(&timer_lock, state); } /* called at interrupt time to process any pending timers */ @@ -204,14 +218,20 @@ static enum handler_return timer_tick(void *arg, lk_time_t now) timer_t *timer; enum handler_return ret = INT_NO_RESCHEDULE; + DEBUG_ASSERT(arch_ints_disabled()); + THREAD_STATS_INC(timer_ints); // KEVLOG_TIMER_TICK(); // enable only if necessary - LTRACEF("now 
%lu, sp %p\n", now, __GET_FRAME()); + uint cpu = arch_curr_cpu_num(); + + LTRACEF("cpu %u now %lu, sp %p\n", cpu, now, __GET_FRAME()); + + spin_lock(&timer_lock); for (;;) { /* see if there's an event to process */ - timer = list_peek_head_type(&timer_queue, timer_t, node); + timer = list_peek_head_type(&timers[cpu].timer_queue, timer_t, node); if (likely(timer == 0)) break; LTRACEF("next item on timer queue %p at %lu now %lu (%p, arg %p)\n", timer, timer->scheduled_time, now, timer->callback, timer->arg); @@ -223,6 +243,9 @@ static enum handler_return timer_tick(void *arg, lk_time_t now) DEBUG_ASSERT(timer && timer->magic == TIMER_MAGIC); list_delete(&timer->node); + /* we pulled it off the list, release the list lock to handle it */ + spin_unlock(&timer_lock); + LTRACEF("dequeued timer %p, scheduled %lu periodic %lu\n", timer, timer->scheduled_time, timer->periodic_time); THREAD_STATS_INC(timers); @@ -234,19 +257,22 @@ static enum handler_return timer_tick(void *arg, lk_time_t now) if (timer->callback(timer, now, timer->arg) == INT_RESCHEDULE) ret = INT_RESCHEDULE; + /* it may have been requeued or periodic, grab the lock so we can safely inspect it */ + spin_lock(&timer_lock); + /* if it was a periodic timer and it hasn't been requeued * by the callback put it back in the list */ if (periodic && !list_in_list(&timer->node) && timer->periodic_time > 0) { LTRACEF("periodic timer, period %u\n", (uint)timer->periodic_time); timer->scheduled_time = now + timer->periodic_time; - insert_timer_in_queue(timer); + insert_timer_in_queue(cpu, timer); } } #if PLATFORM_HAS_DYNAMIC_TIMER /* reset the timer to the next event */ - timer = list_peek_head_type(&timer_queue, timer_t, node); + timer = list_peek_head_type(&timers[cpu].timer_queue, timer_t, node); if (timer) { /* has to be the case or it would have fired already */ DEBUG_ASSERT(TIME_GT(timer->scheduled_time, now)); @@ -256,25 +282,33 @@ static enum handler_return timer_tick(void *arg, lk_time_t now) LTRACEF("setting new timer for %u msecs for event %p\n", (uint)delay, timer); platform_set_oneshot_timer(timer_tick, NULL, delay); } + + /* we're done manipulating the timer queue */ + spin_unlock(&timer_lock); #else + /* release the timer lock before calling the tick handler */ + spin_unlock(&timer_lock); + /* let the scheduler have a shot to do quantum expiration, etc */ /* in case of dynamic timer, the scheduler will set up a periodic timer */ if (thread_timer_tick() == INT_RESCHEDULE) ret = INT_RESCHEDULE; #endif - DEBUG_ASSERT(in_critical_section()); return ret; } void timer_init(void) { - list_initialize(&timer_queue); - + timer_lock = SPIN_LOCK_INITIAL_VALUE; + for (uint i = 0; i < SMP_MAX_CPUS; i++) { + list_initialize(&timers[i].timer_queue); + } #if !PLATFORM_HAS_DYNAMIC_TIMER /* register for a periodic timer tick */ platform_set_periodic_timer(timer_tick, NULL, 10); /* 10ms */ #endif } +/* vim: set noexpandtab */ diff --git a/kernel/vm/pmm.c b/kernel/vm/pmm.c index 1c2ced7e..e93bae58 100644 --- a/kernel/vm/pmm.c +++ b/kernel/vm/pmm.c @@ -294,8 +294,8 @@ uint pmm_alloc_contiguous(uint count, uint8_t alignment_log2, paddr_t *pa, struc retry: /* search while we're still within the arena and have a chance of finding a slot (start + count < end of arena) */ - while (start + count > start && - start + count <= a->size / PAGE_SIZE) { + while ((start < a->size / PAGE_SIZE) && + ((start + count) <= a->size / PAGE_SIZE)) { vm_page_t *p = &a->page_array[start]; for (uint i = 0; i < count; i++) { if (p->flags & VM_PAGE_FLAG_NONFREE) { diff --git 
a/kernel/vm/vm.c b/kernel/vm/vm.c index 1e7c04c7..6a223568 100644 --- a/kernel/vm/vm.c +++ b/kernel/vm/vm.c @@ -60,6 +60,8 @@ static void mark_pages_in_use(vaddr_t va, size_t len) /* alloate the range, throw the results away */ pmm_alloc_range(pa, 1, &list); + } else { + panic("Could not find pa for va 0x%lx\n", va); } } } diff --git a/kernel/vm/vmm.c b/kernel/vm/vmm.c index 8c7f15f5..7cffb260 100644 --- a/kernel/vm/vmm.c +++ b/kernel/vm/vmm.c @@ -26,11 +26,13 @@ #include #include #include +#include #include "vm_priv.h" #define LOCAL_TRACE 0 static struct list_node aspace_list = LIST_INITIAL_VALUE(aspace_list); +static mutex_t vmm_lock = MUTEX_INITIAL_VALUE(vmm_lock); vmm_aspace_t _kernel_aspace; @@ -161,7 +163,68 @@ static status_t add_region_to_aspace(vmm_aspace_t *aspace, vmm_region_t *r) return ERR_NO_MEMORY; } -static vaddr_t alloc_spot(vmm_aspace_t *aspace, size_t size, uint8_t align_pow2, struct list_node **before) +/* + * Try to pick the spot within specified gap + * + * Arch can override this to impose it's own restrictions. + */ +__WEAK vaddr_t arch_mmu_pick_spot(vaddr_t base, uint prev_region_arch_mmu_flags, + vaddr_t end, uint next_region_arch_mmu_flags, + vaddr_t align, size_t size, uint arch_mmu_flags) +{ + /* just align it by default */ + return ALIGN(base, align); +} + +/* + * Returns true if the caller has to stop search + */ +static inline bool check_gap(vmm_aspace_t *aspace, + vmm_region_t *prev, vmm_region_t *next, + vaddr_t *pva, vaddr_t align, size_t size, + uint arch_mmu_flags) +{ + vaddr_t gap_beg; /* first byte of a gap */ + vaddr_t gap_end; /* last byte of a gap */ + + DEBUG_ASSERT(pva); + + if (prev) + gap_beg = prev->base + prev->size; + else + gap_beg = aspace->base; + + if (next) { + if (gap_beg == next->base) + goto next_gap; /* no gap between regions */ + gap_end = next->base - 1; + } else { + if (gap_beg == (aspace->base + aspace->size)) + goto not_found; /* no gap at the end of address space. Stop search */ + gap_end = aspace->base + aspace->size - 1; + } + + *pva = arch_mmu_pick_spot(gap_beg, prev ? prev->flags : ARCH_MMU_FLAG_INVALID, + gap_end, next ? next->flags : ARCH_MMU_FLAG_INVALID, + align, size, arch_mmu_flags); + if (*pva < gap_beg) + goto not_found; /* address wrapped around */ + + if (*pva < gap_end && ((gap_end - *pva + 1) >= size)) { + /* we have enough room */ + return true; /* found spot, stop search */ + } + +next_gap: + return false; /* continue search */ + +not_found: + *pva = -1; + return true; /* not_found: stop search */ +} + +static vaddr_t alloc_spot(vmm_aspace_t *aspace, size_t size, uint8_t align_pow2, + uint arch_mmu_flags, struct list_node **before) { DEBUG_ASSERT(aspace); DEBUG_ASSERT(size > 0 && IS_PAGE_ALIGNED(size)); @@ -172,65 +235,30 @@ static vaddr_t alloc_spot(vmm_aspace_t *aspace, size_t size, uint8_t align_pow2, align_pow2 = PAGE_SIZE_SHIFT; vaddr_t align = 1UL << align_pow2; - /* start our search */ - vaddr_t spot = ALIGN(aspace->base, align); - if (!is_inside_aspace(aspace, spot)) { - /* the alignment is so big, we can't even allocate in this address space */ - return -1; - } + vaddr_t spot; + vmm_region_t *r = NULL; - vmm_region_t *r = list_peek_head_type(&aspace->region_list, vmm_region_t, node); - if (r) { - /* does it fit before the first element? */ - if (spot < r->base && r->base - spot >= size) { - if (before) - *before = &aspace->region_list; - return spot; - } - } else { - /* nothing is in the list, does it fit in the aspace? 
*/ - if (aspace->base + aspace->size - spot >= size) { - if (before) - *before = &aspace->region_list; - return spot; - } - } + /* try to pick spot at the beginning of address space */ + if (check_gap(aspace, NULL, + list_peek_head_type(&aspace->region_list, vmm_region_t, node), + &spot, align, size, arch_mmu_flags)) + goto done; /* search the middle of the list */ list_for_every_entry(&aspace->region_list, r, vmm_region_t, node) { - /* calculate the aligned spot after r */ - spot = ALIGN(r->base + r->size, align); - if (!is_inside_aspace(aspace, spot)) - break; - - /* get the next element in the list */ - vmm_region_t *next = list_next_type(&aspace->region_list, &r->node, vmm_region_t, node); - - if (next) { - /* see if the aligned spot is between current and next */ - if (spot >= next->base) - continue; - - /* see if it'll fit between the current item and the next */ - if (next->base - spot >= size) { - /* it'll fit here */ - if (before) - *before = &r->node; - return spot; - } - } else { - /* we're at the end of the list, will it fit between us and the end of the aspace? */ - if ((aspace->base + aspace->size) - spot >= size) { - /* it'll fit here */ - if (before) - *before = &r->node; - return spot; - } - } + if (check_gap(aspace, r, + list_next_type(&aspace->region_list, &r->node, vmm_region_t, node), + &spot, align, size, arch_mmu_flags)) + goto done; } /* couldn't find anything */ return -1; + +done: + if (before) + *before = r ? &r->node : &aspace->region_list; + return spot; } /* allocate a region structure and stick it in the address space */ @@ -254,7 +282,8 @@ static vmm_region_t *alloc_region(vmm_aspace_t *aspace, const char *name, size_t } else { /* allocate a virtual slot for it */ struct list_node *before = NULL; - vaddr = alloc_spot(aspace, size, align_pow2, &before); + + vaddr = alloc_spot(aspace, size, align_pow2, arch_mmu_flags, &before); LTRACEF("alloc_spot returns 0x%lx, before %p\n", vaddr, before); if (vaddr == (vaddr_t)-1) { @@ -298,20 +327,23 @@ status_t vmm_reserve_space(vmm_aspace_t *aspace, const char *name, size_t size, /* trim the size */ size = trim_to_aspace(aspace, vaddr, size); + mutex_acquire(&vmm_lock); + /* lookup how it's already mapped */ uint arch_mmu_flags = 0; arch_mmu_query(vaddr, NULL, &arch_mmu_flags); /* build a new region structure */ vmm_region_t *r = alloc_region(aspace, name, size, vaddr, 0, VMM_FLAG_VALLOC_SPECIFIC, VMM_REGION_FLAG_RESERVED, arch_mmu_flags); - if (!r) - return ERR_NO_MEMORY; - return NO_ERROR; + mutex_release(&vmm_lock); + return r ? NO_ERROR : ERR_NO_MEMORY; } -status_t vmm_alloc_physical(vmm_aspace_t *aspace, const char *name, size_t size, void **ptr, paddr_t paddr, uint vmm_flags, uint arch_mmu_flags) +status_t vmm_alloc_physical(vmm_aspace_t *aspace, const char *name, size_t size, void **ptr, uint8_t align_log2, paddr_t paddr, uint vmm_flags, uint arch_mmu_flags) { + status_t ret; + LTRACEF("aspace %p name '%s' size 0x%zx ptr %p paddr 0x%lx vmm_flags 0x%x arch_mmu_flags 0x%x\n", aspace, name, size, ptr ? 
*ptr : 0, paddr, vmm_flags, arch_mmu_flags); @@ -340,10 +372,14 @@ status_t vmm_alloc_physical(vmm_aspace_t *aspace, const char *name, size_t size, vaddr = (vaddr_t)*ptr; } + mutex_acquire(&vmm_lock); + /* allocate a region and put it in the aspace list */ - vmm_region_t *r = alloc_region(aspace, name, size, vaddr, 0, vmm_flags, VMM_REGION_FLAG_PHYSICAL, arch_mmu_flags); - if (!r) - return ERR_NO_MEMORY; + vmm_region_t *r = alloc_region(aspace, name, size, vaddr, align_log2, vmm_flags, VMM_REGION_FLAG_PHYSICAL, arch_mmu_flags); + if (!r) { + ret = ERR_NO_MEMORY; + goto err_alloc_region; + } /* return the vaddr if requested */ if (ptr) @@ -353,7 +389,11 @@ status_t vmm_alloc_physical(vmm_aspace_t *aspace, const char *name, size_t size, int err = arch_mmu_map(r->base, paddr, size / PAGE_SIZE, arch_mmu_flags); LTRACEF("arch_mmu_map returns %d\n", err); - return NO_ERROR; + ret = NO_ERROR; + +err_alloc_region: + mutex_release(&vmm_lock); + return ret; } status_t vmm_alloc_contiguous(vmm_aspace_t *aspace, const char *name, size_t size, void **ptr, uint8_t align_pow2, uint vmm_flags, uint arch_mmu_flags) @@ -396,6 +436,8 @@ status_t vmm_alloc_contiguous(vmm_aspace_t *aspace, const char *name, size_t siz goto err; } + mutex_acquire(&vmm_lock); + /* allocate a region and put it in the aspace list */ vmm_region_t *r = alloc_region(aspace, name, size, vaddr, align_pow2, vmm_flags, VMM_REGION_FLAG_PHYSICAL, arch_mmu_flags); if (!r) { @@ -416,9 +458,11 @@ status_t vmm_alloc_contiguous(vmm_aspace_t *aspace, const char *name, size_t siz list_add_tail(&r->page_list, &p->node); } + mutex_release(&vmm_lock); return NO_ERROR; err1: + mutex_release(&vmm_lock); pmm_free(&page_list); err: return err; @@ -466,6 +510,8 @@ status_t vmm_alloc(vmm_aspace_t *aspace, const char *name, size_t size, void **p goto err1; } + mutex_acquire(&vmm_lock); + /* allocate a region and put it in the aspace list */ vmm_region_t *r = alloc_region(aspace, name, size, vaddr, align_pow2, vmm_flags, VMM_REGION_FLAG_PHYSICAL, arch_mmu_flags); if (!r) { @@ -483,7 +529,7 @@ status_t vmm_alloc(vmm_aspace_t *aspace, const char *name, size_t size, void **p vaddr_t va = r->base; DEBUG_ASSERT(IS_PAGE_ALIGNED(va)); while ((p = list_remove_head_type(&page_list, vm_page_t, node))) { - DEBUG_ASSERT(va < r->base + r->size); + DEBUG_ASSERT(va <= r->base + r->size - 1); paddr_t pa = page_to_address(p); DEBUG_ASSERT(IS_PAGE_ALIGNED(pa)); @@ -496,9 +542,11 @@ status_t vmm_alloc(vmm_aspace_t *aspace, const char *name, size_t size, void **p va += PAGE_SIZE; } + mutex_release(&vmm_lock); return NO_ERROR; err1: + mutex_release(&vmm_lock); pmm_free(&page_list); err: return err; @@ -515,7 +563,7 @@ static vmm_region_t *vmm_find_region(const vmm_aspace_t *aspace, vaddr_t vaddr) /* search the region list */ list_for_every_entry(&aspace->region_list, r, vmm_region_t, node) { - if ((vaddr >= r->base) && (vaddr < r->base + r->size)) + if ((vaddr >= r->base) && (vaddr <= r->base + r->size - 1)) return r; } @@ -524,8 +572,11 @@ static vmm_region_t *vmm_find_region(const vmm_aspace_t *aspace, vaddr_t vaddr) status_t vmm_free_region(vmm_aspace_t *aspace, vaddr_t vaddr) { + mutex_acquire(&vmm_lock); + vmm_region_t *r = vmm_find_region (aspace, vaddr); if (!r) { + mutex_release(&vmm_lock); return ERR_NOT_FOUND; } @@ -535,6 +586,8 @@ status_t vmm_free_region(vmm_aspace_t *aspace, vaddr_t vaddr) /* unmap it */ arch_mmu_unmap(r->base, r->size / PAGE_SIZE); + mutex_release(&vmm_lock); + /* return physical pages if any */ pmm_free (&r->page_list); @@ -571,7 +624,7 @@ usage: 
printf("usage:\n"); printf("%s aspaces\n", argv[0].str); printf("%s alloc \n", argv[0].str); - printf("%s alloc_physical \n", argv[0].str); + printf("%s alloc_physical \n", argv[0].str); printf("%s alloc_contig \n", argv[0].str); return ERR_GENERIC; } @@ -591,7 +644,7 @@ usage: if (argc < 4) goto notenoughargs; void *ptr = (void *)0x99; - status_t err = vmm_alloc_physical(vmm_get_kernel_aspace(), "physical test", argv[3].u, &ptr, argv[2].u, 0, ARCH_MMU_FLAG_UNCACHED_DEVICE); + status_t err = vmm_alloc_physical(vmm_get_kernel_aspace(), "physical test", argv[3].u, &ptr, argv[4].u, argv[2].u, 0, ARCH_MMU_FLAG_UNCACHED_DEVICE); printf("vmm_alloc_physical returns %d, ptr %p\n", err, ptr); } else if (!strcmp(argv[1].str, "alloc_contig")) { if (argc < 4) goto notenoughargs; diff --git a/lib/cbuf/cbuf.c b/lib/cbuf/cbuf.c index 76030174..67855e12 100644 --- a/lib/cbuf/cbuf.c +++ b/lib/cbuf/cbuf.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008-2013 Travis Geiselbrecht + * Copyright (c) 2008-2014 Travis Geiselbrecht * * Permission is hereby granted, free of charge, to any person obtaining * a copy of this software and associated documentation files @@ -50,6 +50,7 @@ void cbuf_initialize_etc(cbuf_t *cbuf, size_t len, void *buf) cbuf->len_pow2 = log2_uint(len); cbuf->buf = buf; event_init(&cbuf->event, false, 0); + spin_lock_init(&cbuf->lock); LTRACEF("len %zd, len_pow2 %u\n", len, cbuf->len_pow2); } @@ -74,7 +75,8 @@ size_t cbuf_write(cbuf_t *cbuf, const void *_buf, size_t len, bool canreschedule DEBUG_ASSERT(cbuf); DEBUG_ASSERT(len < valpow2(cbuf->len_pow2)); - enter_critical_section(); + spin_lock_saved_state_t state; + spin_lock_irqsave(&cbuf->lock, state); size_t write_len; size_t pos = 0; @@ -102,9 +104,13 @@ size_t cbuf_write(cbuf_t *cbuf, const void *_buf, size_t len, bool canreschedule } if (cbuf->head != cbuf->tail) - event_signal(&cbuf->event, canreschedule); + event_signal(&cbuf->event, false); - exit_critical_section(); + spin_unlock_irqrestore(&cbuf->lock, state); + + // XXX convert to only rescheduling if + if (canreschedule) + thread_preempt(); return pos; } @@ -115,11 +121,15 @@ size_t cbuf_read(cbuf_t *cbuf, void *_buf, size_t buflen, bool block) DEBUG_ASSERT(cbuf); - enter_critical_section(); - +retry: + // block on the cbuf outside of the lock, which may + // unblock us early and we'll have to double check below if (block) event_wait(&cbuf->event); + spin_lock_saved_state_t state; + spin_lock_irqsave(&cbuf->lock, state); + // see if there's data available size_t ret = 0; if (cbuf->tail != cbuf->head) { @@ -147,6 +157,7 @@ size_t cbuf_read(cbuf_t *cbuf, void *_buf, size_t buflen, bool block) } if (cbuf->tail == cbuf->head) { + DEBUG_ASSERT(pos > 0); // we've emptied the buffer, unsignal the event event_unsignal(&cbuf->event); } @@ -154,7 +165,11 @@ size_t cbuf_read(cbuf_t *cbuf, void *_buf, size_t buflen, bool block) ret = pos; } - exit_critical_section(); + spin_unlock_irqrestore(&cbuf->lock, state); + + // we apparently blocked but raced with another thread and found no data, retry + if (block && ret == 0) + goto retry; return ret; } @@ -189,7 +204,8 @@ size_t cbuf_write_char(cbuf_t *cbuf, char c, bool canreschedule) { DEBUG_ASSERT(cbuf); - enter_critical_section(); + spin_lock_saved_state_t state; + spin_lock_irqsave(&cbuf->lock, state); size_t ret = 0; if (cbuf_space_avail(cbuf) > 0) { @@ -202,7 +218,7 @@ size_t cbuf_write_char(cbuf_t *cbuf, char c, bool canreschedule) event_signal(&cbuf->event, canreschedule); } - exit_critical_section(); + spin_unlock_irqrestore(&cbuf->lock, state); 
return ret; } @@ -212,11 +228,13 @@ size_t cbuf_read_char(cbuf_t *cbuf, char *c, bool block) DEBUG_ASSERT(cbuf); DEBUG_ASSERT(c); - enter_critical_section(); - +retry: if (block) event_wait(&cbuf->event); + spin_lock_saved_state_t state; + spin_lock_irqsave(&cbuf->lock, state); + // see if there's data available size_t ret = 0; if (cbuf->tail != cbuf->head) { @@ -232,7 +250,10 @@ size_t cbuf_read_char(cbuf_t *cbuf, char *c, bool block) ret = 1; } - exit_critical_section(); + spin_unlock_irqrestore(&cbuf->lock, state); + + if (block && ret == 0) + goto retry; return ret; } diff --git a/lib/debug/debug.c b/lib/debug/debug.c index 81060ecf..4749bc34 100644 --- a/lib/debug/debug.c +++ b/lib/debug/debug.c @@ -33,6 +33,65 @@ #include #include +#if WITH_LIB_SM +#define PRINT_LOCK_FLAGS SPIN_LOCK_FLAG_IRQ_FIQ +#else +#define PRINT_LOCK_FLAGS SPIN_LOCK_FLAG_INTERRUPTS +#endif + +static spin_lock_t print_spin_lock = 0; +static struct list_node print_callbacks = LIST_INITIAL_VALUE(print_callbacks); +/* print lock must be held when invoking out, outs, outc */ +static void out_count(const char *str, size_t len) +{ + print_callback_t *cb; + size_t i; + + /* print to any registered loggers */ + list_for_every_entry(&print_callbacks, cb, print_callback_t, entry) { + if (cb->print) + cb->print(cb, str, len); + } + + /* write out the serial port */ + for (i = 0; i < len; i++) { + platform_dputc(str[i]); + } +} + +static void out_string(const char *str) +{ + out_count(str, strlen(str)); +} + +static void out_char(char c) +{ + out_count(&c, 1); +} + +static int input_char(char *c) +{ + return platform_dgetc(c, true); +} + +void register_print_callback(print_callback_t *cb) +{ + spin_lock_saved_state_t state; + + spin_lock_save(&print_spin_lock, &state, PRINT_LOCK_FLAGS); + list_add_head(&print_callbacks, &cb->entry); + spin_unlock_restore(&print_spin_lock, state, PRINT_LOCK_FLAGS); +} + +void unregister_print_callback(print_callback_t *cb) +{ + spin_lock_saved_state_t state; + + spin_lock_save(&print_spin_lock, &state, PRINT_LOCK_FLAGS); + list_delete(&cb->entry); + spin_unlock_restore(&print_spin_lock, state, PRINT_LOCK_FLAGS); +} + void spin(uint32_t usecs) { lk_bigtime_t start = current_time_hires(); @@ -69,7 +128,7 @@ static int __debug_stdio_fgetc(void *ctx) char c; int err; - err = platform_dgetc(&c, true); + err = input_char(&c); if (err < 0) return err; return (unsigned char)c; @@ -98,34 +157,55 @@ FILE __stdio_FILEs[3] = { #if !DISABLE_DEBUG_OUTPUT +void _dputc(char c) +{ + spin_lock_saved_state_t state; + + spin_lock_save(&print_spin_lock, &state, PRINT_LOCK_FLAGS); + out_char(c); + spin_unlock_restore(&print_spin_lock, state, PRINT_LOCK_FLAGS); +} + int _dputs(const char *str) { - while (*str != 0) { - _dputc(*str++); - } + spin_lock_saved_state_t state; + + spin_lock_save(&print_spin_lock, &state, PRINT_LOCK_FLAGS); + out_string(str); + spin_unlock_restore(&print_spin_lock, state, PRINT_LOCK_FLAGS); + + return 0; +} + +int _dwrite(const char *ptr, size_t len) +{ + spin_lock_saved_state_t state; + + spin_lock_save(&print_spin_lock, &state, PRINT_LOCK_FLAGS); + out_count(ptr, len); + spin_unlock_restore(&print_spin_lock, state, PRINT_LOCK_FLAGS); return 0; } static int _dprintf_output_func(const char *str, size_t len, void *state) { - size_t count = 0; - while (count < len && *str) { - _dputc(*str); - str++; - count++; - } + size_t n = strnlen(str, len); - return count; + out_count(str, n); + return n; } int _dprintf(const char *fmt, ...) 
{ + spin_lock_saved_state_t state; int err; - va_list ap; + va_start(ap, fmt); + spin_lock_save(&print_spin_lock, &state, PRINT_LOCK_FLAGS); err = _printf_engine(&_dprintf_output_func, NULL, fmt, ap); + spin_unlock_restore(&print_spin_lock, state, PRINT_LOCK_FLAGS); va_end(ap); return err; @@ -133,9 +213,12 @@ int _dprintf(const char *fmt, ...) int _dvprintf(const char *fmt, va_list ap) { + spin_lock_saved_state_t state; int err; + spin_lock_save(&print_spin_lock, &state, PRINT_LOCK_FLAGS); err = _printf_engine(&_dprintf_output_func, NULL, fmt, ap); + spin_unlock_restore(&print_spin_lock, state, PRINT_LOCK_FLAGS); return err; } diff --git a/lib/heap/heap.c b/lib/heap/heap.c index 7364cd1f..7e99bf6d 100644 --- a/lib/heap/heap.c +++ b/lib/heap/heap.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008-2009,2012,2014 Travis Geiselbrecht + * Copyright (c) 2008-2009,2012-2014 Travis Geiselbrecht * Copyright (c) 2009 Corey Tabaka * * Permission is hereby granted, free of charge, to any person obtaining @@ -32,6 +32,7 @@ #include #include #include +#include #include #define LOCAL_TRACE 0 @@ -87,6 +88,7 @@ struct heap { mutex_t lock; struct list_node free_list; struct list_node delayed_free_list; + spin_lock_t delayed_free_lock; }; // heap static vars @@ -124,12 +126,15 @@ static void heap_dump(void) list_for_every_entry(&theheap.free_list, chunk, struct free_heap_chunk, node) { dump_free_chunk(chunk); } + mutex_release(&theheap.lock); dprintf(INFO, "\tdelayed free list:\n"); + spin_lock_saved_state_t state; + spin_lock_irqsave(&theheap.delayed_free_lock, state); list_for_every_entry(&theheap.delayed_free_list, chunk, struct free_heap_chunk, node) { dump_free_chunk(chunk); } - mutex_release(&theheap.lock); + spin_unlock_irqrestore(&theheap.delayed_free_lock, state); } static void heap_test(void) @@ -269,13 +274,14 @@ static void heap_free_delayed_list(void) list_initialize(&list); - enter_critical_section(); + spin_lock_saved_state_t state; + spin_lock_irqsave(&theheap.delayed_free_lock, state); struct free_heap_chunk *chunk; while ((chunk = list_remove_head_type(&theheap.delayed_free_list, struct free_heap_chunk, node))) { list_add_head(&list, &chunk->node); } - exit_critical_section(); + spin_unlock_irqrestore(&theheap.delayed_free_lock, state); while ((chunk = list_remove_head_type(&list, struct free_heap_chunk, node))) { LTRACEF("freeing chunk %p\n", chunk); @@ -464,9 +470,10 @@ void heap_delayed_free(void *ptr) struct free_heap_chunk *chunk = heap_create_free_chunk(as->ptr, as->size, false); - enter_critical_section(); + spin_lock_saved_state_t state; + spin_lock_irqsave(&theheap.delayed_free_lock, state); list_add_head(&theheap.delayed_free_list, &chunk->node); - exit_critical_section(); + spin_unlock_irqrestore(&theheap.delayed_free_lock, state); } void heap_get_stats(struct heap_stats *ptr) @@ -507,8 +514,10 @@ static ssize_t heap_grow(size_t size) size = ROUNDUP(size, PAGE_SIZE); void *ptr = pmm_alloc_kpages(size / PAGE_SIZE, NULL); - if (!ptr) + if (!ptr) { + TRACEF("failed to grow kernel heap by 0x%zx bytes\n", size); return ERR_NO_MEMORY; + } LTRACEF("growing heap by 0x%zx bytes, new ptr %p\n", size, ptr); @@ -541,6 +550,7 @@ void heap_init(void) // initialize the delayed free list list_initialize(&theheap.delayed_free_list); + spin_lock_init(&theheap.delayed_free_lock); // set the heap range #if WITH_KERNEL_VM diff --git a/lib/libc/atoi.c b/lib/libc/atoi.c index f5610835..71dfd0f6 100644 --- a/lib/libc/atoi.c +++ b/lib/libc/atoi.c @@ -27,6 +27,7 @@ #include #include +#include #define 
LONG_IS_INT 1 @@ -120,4 +121,70 @@ unsigned long long atoull(const char *num) return value; } +unsigned long strtoul(const char *nptr, char **endptr, int base) { + int neg = 0; + unsigned long ret = 0; + if (base < 0 || base == 1 || base > 36) { + errno = EINVAL; + return 0; + } + + while (isspace(*nptr)) { + nptr++; + } + + if (*nptr == '+') { + nptr++; + } else if (*nptr == '-') { + neg = 1; + nptr++; + } + + if ((base == 0 || base == 16) && nptr[0] == '0' && nptr[1] == 'x') { + base = 16; + nptr += 2; + } else if (base == 0 && nptr[0] == '0') { + base = 8; + nptr++; + } else if (base == 0) { + base = 10; + } + + for (;;) { + char c = *nptr; + int v = -1; + unsigned long new_ret; + + if (c >= 'A' && c <= 'Z') { + v = c - 'A' + 10; + } else if (c >= 'a' && c <= 'z') { + v = c - 'a' + 10; + } else if (c >= '0' && c <= '9') { + v = c - '0'; + } + + if (v < 0 || v >= base) { + *endptr = (char *) nptr; + break; + } + + new_ret = ret * base; + if (new_ret / base != ret || + new_ret + v < new_ret || + ret == ULONG_MAX) { + ret = ULONG_MAX; + errno = ERANGE; + } else { + ret = new_ret + v; + } + + nptr++; + } + + if (neg && ret != ULONG_MAX) { + ret = -ret; + } + + return ret; +} diff --git a/lib/libc/bsearch.c b/lib/libc/bsearch.c new file mode 100644 index 00000000..53aafb87 --- /dev/null +++ b/lib/libc/bsearch.c @@ -0,0 +1,44 @@ +/* Copyright (c) 2014, Google Inc. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
*/ + +#include + +void *bsearch(const void *key, const void *base, size_t num_elems, size_t size, + int (*compare)(const void *, const void *)) { + size_t low = 0, high = num_elems - 1; + + if (num_elems == 0) { + return NULL; + } + + for (;;) { + size_t mid = low + ((high - low) / 2); + const void *mid_elem = ((unsigned char*) base) + mid*size; + int r = compare(key, mid_elem); + + if (r < 0) { + if (mid == 0) { + return NULL; + } + high = mid - 1; + } else if (r > 0) { + low = mid + 1; + if (low < mid || low > high) { + return NULL; + } + } else { + return (void*) mid_elem; + } + } +} diff --git a/lib/libc/rules.mk b/lib/libc/rules.mk index 132fe5b5..a774683d 100644 --- a/lib/libc/rules.mk +++ b/lib/libc/rules.mk @@ -4,14 +4,17 @@ MODULE := $(LOCAL_DIR) MODULE_SRCS += \ $(LOCAL_DIR)/atoi.c \ + $(LOCAL_DIR)/bsearch.c \ $(LOCAL_DIR)/ctype.c \ $(LOCAL_DIR)/printf.c \ - $(LOCAL_DIR)/malloc.c \ $(LOCAL_DIR)/rand.c \ $(LOCAL_DIR)/stdio.c \ $(LOCAL_DIR)/qsort.c \ $(LOCAL_DIR)/eabi.c +ifneq ($(WITH_CUSTOM_MALLOC),true) +MODULE_SRCS += $(LOCAL_DIR)/malloc.c +endif include $(LOCAL_DIR)/string/rules.mk diff --git a/lib/minip/pktbuf.c b/lib/minip/pktbuf.c index 69bf9041..27e3df91 100644 --- a/lib/minip/pktbuf.c +++ b/lib/minip/pktbuf.c @@ -28,6 +28,7 @@ #include #include +#include #include #include @@ -45,6 +46,7 @@ static struct list_node pb_freelist = LIST_INITIAL_VALUE(pb_freelist); static struct list_node pb_buflist = LIST_INITIAL_VALUE(pb_buflist); static semaphore_t pb_sem = SEMAPHORE_INITIAL_VALUE(pb_sem, -1); +static spin_lock_t lock; static unsigned int cur_id = 0; @@ -90,6 +92,8 @@ static inline pktbuf_buf_t *pktbuf_get_buf(void) { } pktbuf_t *pktbuf_alloc(void) { + spin_lock_saved_state_t state; + pktbuf_t *p = NULL; pktbuf_buf_t *b = NULL; @@ -97,12 +101,12 @@ pktbuf_t *pktbuf_alloc(void) { * pointer but no buffer and would otherwise have to do sem / list bookkeeping on * cleanup */ sem_wait(&pb_sem); - enter_critical_section(); + spin_lock_irqsave(&lock, state); b = pktbuf_get_buf(); if (b) { p = list_remove_head_type(&pb_freelist, pktbuf_t, list); } - exit_critical_section(); + spin_unlock_irqrestore(&lock, state); if (b->magic != PKTBUF_BUF_MAGIC) { panic("pktbuf id %u has corrupted buffer magic value\n" @@ -127,12 +131,13 @@ pktbuf_t *pktbuf_alloc(void) { } pktbuf_t *pktbuf_alloc_empty(void *buf, size_t dlen) { + spin_lock_saved_state_t state; pktbuf_t *p; sem_wait(&pb_sem); - enter_critical_section(); + spin_lock_irqsave(&lock, state); p = list_remove_head_type(&pb_freelist, pktbuf_t, list); - exit_critical_section(); + spin_unlock_irqrestore(&lock, state); if (!p) { return NULL; @@ -147,7 +152,8 @@ pktbuf_t *pktbuf_alloc_empty(void *buf, size_t dlen) { } int pktbuf_free(pktbuf_t *p, bool reschedule) { - enter_critical_section(); + spin_lock_saved_state_t state; + spin_lock_irqsave(&lock, state); list_add_tail(&pb_freelist, &(p->list)); if (p->managed && p->buffer) { pktbuf_buf_t *pkt = (pktbuf_buf_t *)p->buffer; @@ -158,7 +164,7 @@ int pktbuf_free(pktbuf_t *p, bool reschedule) { p->eof = false; p->managed = false; p->flags = 0; - exit_critical_section(); + spin_unlock_irqrestore(&lock, state); return sem_post(&pb_sem, reschedule); } diff --git a/make/build.mk b/make/build.mk index 5d6f4b4a..2259a471 100644 --- a/make/build.mk +++ b/make/build.mk @@ -4,6 +4,10 @@ GLOBAL_COMPILEFLAGS += -ffunction-sections -fdata-sections GLOBAL_LDFLAGS += --gc-sections endif +ifneq (,$(EXTRA_BUILDRULES)) +-include $(EXTRA_BUILDRULES) +endif + $(OUTBIN): $(OUTELF) @echo generating image: $@ 
$(NOECHO)$(SIZE) $< diff --git a/make/compile.mk b/make/compile.mk index 798f269f..c2c98788 100644 --- a/make/compile.mk +++ b/make/compile.mk @@ -40,33 +40,33 @@ $(MODULE_OBJS): MODULE_INCLUDES:=$(MODULE_INCLUDES) $(MODULE_COBJS): $(BUILDDIR)/%.o: %.c $(MODULE_SRCDEPS) @$(MKDIR) @echo compiling $< - $(NOECHO)$(CC) $(GLOBAL_OPTFLAGS) $(MODULE_OPTFLAGS) $(GLOBAL_COMPILEFLAGS) $(MODULE_COMPILEFLAGS) $(GLOBAL_CFLAGS) $(MODULE_CFLAGS) $(THUMBCFLAGS) $(GLOBAL_INCLUDES) $(MODULE_INCLUDES) -c $< -MD -MP -MT $@ -MF $(@:%o=%d) -o $@ + $(NOECHO)$(CC) $(GLOBAL_OPTFLAGS) $(MODULE_OPTFLAGS) $(GLOBAL_COMPILEFLAGS) $(ARCH_COMPILEFLAGS) $(MODULE_COMPILEFLAGS) $(GLOBAL_CFLAGS) $(ARCH_CFLAGS) $(MODULE_CFLAGS) $(THUMBCFLAGS) $(GLOBAL_INCLUDES) $(MODULE_INCLUDES) -c $< -MD -MP -MT $@ -MF $(@:%o=%d) -o $@ $(MODULE_CPPOBJS): $(BUILDDIR)/%.o: %.cpp $(MODULE_SRCDEPS) @$(MKDIR) @echo compiling $< - $(NOECHO)$(CC) $(GLOBAL_OPTFLAGS) $(MODULE_OPTFLAGS) $(GLOBAL_COMPILEFLAGS) $(MODULE_COMPILEFLAGS) $(GLOBAL_CPPFLAGS) $(MODULE_CPPFLAGS) $(THUMBCFLAGS) $(GLOBAL_INCLUDES) $(MODULE_INCLUDES) -c $< -MD -MP -MT $@ -MF $(@:%o=%d) -o $@ + $(NOECHO)$(CC) $(GLOBAL_OPTFLAGS) $(MODULE_OPTFLAGS) $(GLOBAL_COMPILEFLAGS) $(ARCH_COMPILEFLAGS) $(MODULE_COMPILEFLAGS) $(GLOBAL_CPPFLAGS) $(ARCH_CPPFLAGS) $(MODULE_CPPFLAGS) $(THUMBCFLAGS) $(GLOBAL_INCLUDES) $(MODULE_INCLUDES) -c $< -MD -MP -MT $@ -MF $(@:%o=%d) -o $@ $(MODULE_ASMOBJS): $(BUILDDIR)/%.o: %.S $(MODULE_SRCDEPS) @$(MKDIR) @echo compiling $< - $(NOECHO)$(CC) $(GLOBAL_OPTFLAGS) $(MODULE_OPTFLAGS) $(GLOBAL_COMPILEFLAGS) $(MODULE_COMPILEFLAGS) $(GLOBAL_ASMFLAGS) $(MODULE_ASMFLAGS) $(THUMBCFLAGS) $(GLOBAL_INCLUDES) $(MODULE_INCLUDES) -c $< -MD -MP -MT $@ -MF $(@:%o=%d) -o $@ + $(NOECHO)$(CC) $(GLOBAL_OPTFLAGS) $(MODULE_OPTFLAGS) $(GLOBAL_COMPILEFLAGS) $(ARCH_COMPILEFLAGS) $(MODULE_COMPILEFLAGS) $(GLOBAL_ASMFLAGS) $(ARCH_ASMFLAGS) $(MODULE_ASMFLAGS) $(THUMBCFLAGS) $(GLOBAL_INCLUDES) $(MODULE_INCLUDES) -c $< -MD -MP -MT $@ -MF $(@:%o=%d) -o $@ # overridden arm versions $(MODULE_ARM_COBJS): $(BUILDDIR)/%.o: %.c $(MODULE_SRCDEPS) @$(MKDIR) @echo compiling $< - $(NOECHO)$(CC) $(GLOBAL_OPTFLAGS) $(MODULE_OPTFLAGS) $(GLOBAL_COMPILEFLAGS) $(MODULE_COMPILEFLAGS) $(GLOBAL_CFLAGS) $(MODULE_CFLAGS) $(GLOBAL_INCLUDES) $(MODULE_INCLUDES) -c $< -MD -MP -MT $@ -MF $(@:%o=%d) -o $@ + $(NOECHO)$(CC) $(GLOBAL_OPTFLAGS) $(MODULE_OPTFLAGS) $(GLOBAL_COMPILEFLAGS) $(ARCH_COMPILEFLAGS) $(MODULE_COMPILEFLAGS) $(GLOBAL_CFLAGS) $(ARCH_CFLAGS) $(MODULE_CFLAGS) $(GLOBAL_INCLUDES) $(MODULE_INCLUDES) -c $< -MD -MP -MT $@ -MF $(@:%o=%d) -o $@ $(MODULE_ARM_CPPOBJS): $(BUILDDIR)/%.o: %.cpp $(MODULE_SRCDEPS) @$(MKDIR) @echo compiling $< - $(NOECHO)$(CC) $(GLOBAL_OPTFLAGS) $(MODULE_OPTFLAGS) $(GLOBAL_COMPILEFLAGS) $(MODULE_COMPILEFLAGS) $(GLOBAL_CPPFLAGS) $(MODULE_CPPFLAGS) $(GLOBAL_INCLUDES) $(MODULE_INCLUDES) -c $< -MD -MP -MT $@ -MF $(@:%o=%d) -o $@ + $(NOECHO)$(CC) $(GLOBAL_OPTFLAGS) $(MODULE_OPTFLAGS) $(GLOBAL_COMPILEFLAGS) $(ARCH_COMPILEFLAGS) $(MODULE_COMPILEFLAGS) $(GLOBAL_CPPFLAGS) $(ARCH_CPPFLAGS) $(MODULE_CPPFLAGS) $(GLOBAL_INCLUDES) $(MODULE_INCLUDES) -c $< -MD -MP -MT $@ -MF $(@:%o=%d) -o $@ $(MODULE_ARM_ASMOBJS): $(BUILDDIR)/%.o: %.S $(MODULE_SRCDEPS) @$(MKDIR) @echo compiling $< - $(NOECHO)$(CC) $(GLOBAL_OPTFLAGS) $(MODULE_OPTFLAGS) $(GLOBAL_COMPILEFLAGS) $(MODULE_COMPILEFLAGS) $(GLOBAL_ASMFLAGS) $(MODULE_ASMFLAGS) $(GLOBAL_INCLUDES) $(MODULE_INCLUDES) -c $< -MD -MP -MT $@ -MF $(@:%o=%d) -o $@ + $(NOECHO)$(CC) $(GLOBAL_OPTFLAGS) $(MODULE_OPTFLAGS) $(GLOBAL_COMPILEFLAGS) $(ARCH_COMPILEFLAGS) 
$(MODULE_COMPILEFLAGS) $(GLOBAL_ASMFLAGS) $(ARCH_ASMFLAGS) $(MODULE_ASMFLAGS) $(GLOBAL_INCLUDES) $(MODULE_INCLUDES) -c $< -MD -MP -MT $@ -MF $(@:%o=%d) -o $@ # clear some variables we set here MODULE_CSRCS := diff --git a/make/macros.mk b/make/macros.mk index 5eebdca9..a45aca14 100644 --- a/make/macros.mk +++ b/make/macros.mk @@ -7,6 +7,9 @@ MKDIR = if [ ! -d $(dir $@) ]; then mkdir -p $(dir $@); fi # prepends the BUILD_DIR var to each item in the list TOBUILDDIR = $(addprefix $(BUILDDIR)/,$(1)) +# converts specified variable to boolean value +TOBOOL = $(if $(filter-out 0 false,$1),true,false) + COMMA := , SPACE := SPACE += @@ -32,11 +35,11 @@ define MAKECONFIGHEADER $(MKDIR); \ echo generating $1; \ rm -f $1.tmp; \ - LDEF=`echo $1 | tr '/\\.-' '_'`; \ + LDEF=`echo $1 | tr '/\\.-' '_' | sed "s/C++/CPP/g;s/c++/cpp/g"`; \ echo \#ifndef __$${LDEF}_H > $1.tmp; \ echo \#define __$${LDEF}_H >> $1.tmp; \ for d in `echo $($2) | tr '[:lower:]' '[:upper:]'`; do \ - echo "#define $$d" | sed "s/=/\ /g;s/-/_/g;s/\//_/g;s/\./_/g;s/\//_/g" >> $1.tmp; \ + echo "#define $$d" | sed "s/=/\ /g;s/-/_/g;s/\//_/g;s/\./_/g;s/\//_/g;s/C++/CPP/g" >> $1.tmp; \ done; \ echo \#endif >> $1.tmp; \ $(call TESTANDREPLACEFILE,$1.tmp,$1) diff --git a/makefile b/makefile index 4bb9befe..4efdcb93 100644 --- a/makefile +++ b/makefile @@ -9,7 +9,10 @@ BUILDROOT ?= . DEFAULT_PROJECT ?= TOOLCHAIN_PREFIX ?= +# check if LKROOT is already a part of LKINC list and add it only if it is not +ifneq ($(findstring $(LKROOT),$(LKINC)), $(LKROOT)) LKINC := $(LKROOT) $(LKINC) +endif export LKMAKEROOT export LKROOT diff --git a/platform/alterasoc/platform.c b/platform/alterasoc/platform.c index c694594a..b5832979 100644 --- a/platform/alterasoc/platform.c +++ b/platform/alterasoc/platform.c @@ -74,6 +74,8 @@ void platform_early_init(void) { uart_init_early(); + printf("stat 0x%x\n", *REG32(0xffd05000)); + /* initialize the interrupt controller */ arm_gic_init(); @@ -81,6 +83,9 @@ void platform_early_init(void) arm_cortex_a9_timer_init(CPUPRIV_BASE, TIMER_CLOCK_FREQ); pmm_add_arena(&sdram_arena); + + /* start the secondary cpu */ + *REG32(0xffd05010) = 0; } void platform_init(void) diff --git a/platform/alterasoc/rules.mk b/platform/alterasoc/rules.mk index db1a7973..1122ea4c 100644 --- a/platform/alterasoc/rules.mk +++ b/platform/alterasoc/rules.mk @@ -4,6 +4,7 @@ MODULE := $(LOCAL_DIR) ARCH := arm ARM_CPU := cortex-a9-neon +WITH_SMP := 1 MODULE_DEPS := \ lib/cbuf \ diff --git a/platform/alterasoc/uart.c b/platform/alterasoc/uart.c index 660a8f7e..cf131c45 100644 --- a/platform/alterasoc/uart.c +++ b/platform/alterasoc/uart.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -57,6 +58,8 @@ static cbuf_t uart1_rx_buf; static inline uintptr_t uart_to_ptr(unsigned int n) { return (n == 0) ? UART0_BASE : UART1_BASE; } static inline cbuf_t *uart_to_rxbuf(unsigned int n) { return (n == 0) ? 
&uart0_rx_buf : &uart1_rx_buf; } +static spin_lock_t lock = SPIN_LOCK_INITIAL_VALUE; + static enum handler_return uart_irq(void *arg) { bool resched = false; @@ -114,34 +117,26 @@ int uart_putc(int port, char c) { uintptr_t base = uart_to_ptr(port); -#if 1 + spin_lock_saved_state_t state; + spin_lock_irqsave(&lock, state); + /* spin while fifo is full */ - while ((UARTREG(base, UART_USR) & (1<<1)) == 0) - ; -#else - /* spin while fifo is not empty */ - while ((UARTREG(base, UART_USR) & (1<<2)) == 0) - ; -#endif + while ((UARTREG(base, UART_USR) & (1<<1)) == 0) { + } UARTREG(base, UART_THR) = c; + spin_unlock_irqrestore(&lock, state); + return 1; } int uart_getc(int port, bool wait) { -#if 0 - uintptr_t base = uart_to_ptr(port); - - if ((UARTREG(base, UART_USR) & (1<<3))) - return UARTREG(base, UART_RBR); -#else cbuf_t *rxbuf = uart_to_rxbuf(port); char c; if (cbuf_read_char(rxbuf, &c, wait) == 1) return c; -#endif return -1; } diff --git a/platform/bcm2835/include/platform/bcm2835.h b/platform/bcm2835/include/platform/bcm2835.h new file mode 100644 index 00000000..b44a0d2a --- /dev/null +++ b/platform/bcm2835/include/platform/bcm2835.h @@ -0,0 +1,171 @@ +/* + * Copyright (c) 2015 Travis Geiselbrecht + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files + * (the "Software"), to deal in the Software without restriction, + * including without limitation the rights to use, copy, modify, merge, + * publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, + * subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ +#pragma once + +#define SDRAM_BASE 0 + +#define BCM_PERIPH_BASE_PHYS (0x3f000000U) +#define BCM_PERIPH_SIZE (0x01100000U) +#define BCM_PERIPH_BASE_VIRT (0xe0000000U) + +/* pointer to 'local' peripherals at 0x40000000 */ +#define BCM_LOCAL_PERIPH_BASE_VIRT (BCM_PERIPH_BASE_VIRT + 0x01000000) + +#define IC0_BASE (BCM_PERIPH_BASE_VIRT + 0x2000) +#define ST_BASE (BCM_PERIPH_BASE_VIRT + 0x3000) +#define MPHI_BASE (BCM_PERIPH_BASE_VIRT + 0x6000) +#define DMA_BASE (BCM_PERIPH_BASE_VIRT + 0x7000) +#define ARM_BASE (BCM_PERIPH_BASE_VIRT + 0xB000) +#define PM_BASE (BCM_PERIPH_BASE_VIRT + 0x100000) +#define PCM_CLOCK_BASE (BCM_PERIPH_BASE_VIRT + 0x101098) +#define RNG_BASE (BCM_PERIPH_BASE_VIRT + 0x104000) +#define GPIO_BASE (BCM_PERIPH_BASE_VIRT + 0x200000) +#define UART0_BASE (BCM_PERIPH_BASE_VIRT + 0x201000) +#define MMCI0_BASE (BCM_PERIPH_BASE_VIRT + 0x202000) +#define I2S_BASE (BCM_PERIPH_BASE_VIRT + 0x203000) +#define SPI0_BASE (BCM_PERIPH_BASE_VIRT + 0x204000) +#define BSC0_BASE (BCM_PERIPH_BASE_VIRT + 0x205000) +#define UART1_BASE (BCM_PERIPH_BASE_VIRT + 0x215000) +#define EMMC_BASE (BCM_PERIPH_BASE_VIRT + 0x300000) +#define SMI_BASE (BCM_PERIPH_BASE_VIRT + 0x600000) +#define BSC1_BASE (BCM_PERIPH_BASE_VIRT + 0x804000) +#define USB_BASE (BCM_PERIPH_BASE_VIRT + 0x980000) +#define MCORE_BASE (BCM_PERIPH_BASE_VIRT + 0x0000) + +#define ARMCTRL_BASE (ARM_BASE + 0x000) +#define ARMCTRL_INTC_BASE (ARM_BASE + 0x200) +#define ARMCTRL_TIMER0_1_BASE (ARM_BASE + 0x400) +#define ARMCTRL_0_SBM_BASE (ARM_BASE + 0x800) + +#define ARM_LOCAL_BASE (BCM_LOCAL_PERIPH_BASE_VIRT) + +/* interrupts */ +#define ARM_IRQ1_BASE 0 +#define INTERRUPT_TIMER0 (ARM_IRQ1_BASE + 0) +#define INTERRUPT_TIMER1 (ARM_IRQ1_BASE + 1) +#define INTERRUPT_TIMER2 (ARM_IRQ1_BASE + 2) +#define INTERRUPT_TIMER3 (ARM_IRQ1_BASE + 3) +#define INTERRUPT_CODEC0 (ARM_IRQ1_BASE + 4) +#define INTERRUPT_CODEC1 (ARM_IRQ1_BASE + 5) +#define INTERRUPT_CODEC2 (ARM_IRQ1_BASE + 6) +#define INTERRUPT_VC_JPEG (ARM_IRQ1_BASE + 7) +#define INTERRUPT_ISP (ARM_IRQ1_BASE + 8) +#define INTERRUPT_VC_USB (ARM_IRQ1_BASE + 9) +#define INTERRUPT_VC_3D (ARM_IRQ1_BASE + 10) +#define INTERRUPT_TRANSPOSER (ARM_IRQ1_BASE + 11) +#define INTERRUPT_MULTICORESYNC0 (ARM_IRQ1_BASE + 12) +#define INTERRUPT_MULTICORESYNC1 (ARM_IRQ1_BASE + 13) +#define INTERRUPT_MULTICORESYNC2 (ARM_IRQ1_BASE + 14) +#define INTERRUPT_MULTICORESYNC3 (ARM_IRQ1_BASE + 15) +#define INTERRUPT_DMA0 (ARM_IRQ1_BASE + 16) +#define INTERRUPT_DMA1 (ARM_IRQ1_BASE + 17) +#define INTERRUPT_VC_DMA2 (ARM_IRQ1_BASE + 18) +#define INTERRUPT_VC_DMA3 (ARM_IRQ1_BASE + 19) +#define INTERRUPT_DMA4 (ARM_IRQ1_BASE + 20) +#define INTERRUPT_DMA5 (ARM_IRQ1_BASE + 21) +#define INTERRUPT_DMA6 (ARM_IRQ1_BASE + 22) +#define INTERRUPT_DMA7 (ARM_IRQ1_BASE + 23) +#define INTERRUPT_DMA8 (ARM_IRQ1_BASE + 24) +#define INTERRUPT_DMA9 (ARM_IRQ1_BASE + 25) +#define INTERRUPT_DMA10 (ARM_IRQ1_BASE + 26) +#define INTERRUPT_DMA11 (ARM_IRQ1_BASE + 27) +#define INTERRUPT_DMA12 (ARM_IRQ1_BASE + 28) +#define INTERRUPT_AUX (ARM_IRQ1_BASE + 29) +#define INTERRUPT_ARM (ARM_IRQ1_BASE + 30) +#define INTERRUPT_VPUDMA (ARM_IRQ1_BASE + 31) + +#define ARM_IRQ2_BASE 32 +#define INTERRUPT_HOSTPORT (ARM_IRQ2_BASE + 0) +#define INTERRUPT_VIDEOSCALER (ARM_IRQ2_BASE + 1) +#define INTERRUPT_CCP2TX (ARM_IRQ2_BASE + 2) +#define INTERRUPT_SDC (ARM_IRQ2_BASE + 3) +#define INTERRUPT_DSI0 (ARM_IRQ2_BASE + 4) +#define INTERRUPT_AVE (ARM_IRQ2_BASE + 5) +#define INTERRUPT_CAM0 (ARM_IRQ2_BASE + 6) +#define INTERRUPT_CAM1 (ARM_IRQ2_BASE + 7) +#define INTERRUPT_HDMI0 
(ARM_IRQ2_BASE + 8) +#define INTERRUPT_HDMI1 (ARM_IRQ2_BASE + 9) +#define INTERRUPT_PIXELVALVE1 (ARM_IRQ2_BASE + 10) +#define INTERRUPT_I2CSPISLV (ARM_IRQ2_BASE + 11) +#define INTERRUPT_DSI1 (ARM_IRQ2_BASE + 12) +#define INTERRUPT_PWA0 (ARM_IRQ2_BASE + 13) +#define INTERRUPT_PWA1 (ARM_IRQ2_BASE + 14) +#define INTERRUPT_CPR (ARM_IRQ2_BASE + 15) +#define INTERRUPT_SMI (ARM_IRQ2_BASE + 16) +#define INTERRUPT_GPIO0 (ARM_IRQ2_BASE + 17) +#define INTERRUPT_GPIO1 (ARM_IRQ2_BASE + 18) +#define INTERRUPT_GPIO2 (ARM_IRQ2_BASE + 19) +#define INTERRUPT_GPIO3 (ARM_IRQ2_BASE + 20) +#define INTERRUPT_VC_I2C (ARM_IRQ2_BASE + 21) +#define INTERRUPT_VC_SPI (ARM_IRQ2_BASE + 22) +#define INTERRUPT_VC_I2SPCM (ARM_IRQ2_BASE + 23) +#define INTERRUPT_VC_SDIO (ARM_IRQ2_BASE + 24) +#define INTERRUPT_VC_UART (ARM_IRQ2_BASE + 25) +#define INTERRUPT_SLIMBUS (ARM_IRQ2_BASE + 26) +#define INTERRUPT_VEC (ARM_IRQ2_BASE + 27) +#define INTERRUPT_CPG (ARM_IRQ2_BASE + 28) +#define INTERRUPT_RNG (ARM_IRQ2_BASE + 29) +#define INTERRUPT_VC_ARASANSDIO (ARM_IRQ2_BASE + 30) +#define INTERRUPT_AVSPMON (ARM_IRQ2_BASE + 31) + +/* ARM interrupts, which are mostly mirrored from bank 1 and 2 */ +#define ARM_IRQ0_BASE 64 +#define INTERRUPT_ARM_TIMER (ARM_IRQ0_BASE + 0) +#define INTERRUPT_ARM_MAILBOX (ARM_IRQ0_BASE + 1) +#define INTERRUPT_ARM_DOORBELL_0 (ARM_IRQ0_BASE + 2) +#define INTERRUPT_ARM_DOORBELL_1 (ARM_IRQ0_BASE + 3) +#define INTERRUPT_VPU0_HALTED (ARM_IRQ0_BASE + 4) +#define INTERRUPT_VPU1_HALTED (ARM_IRQ0_BASE + 5) +#define INTERRUPT_ILLEGAL_TYPE0 (ARM_IRQ0_BASE + 6) +#define INTERRUPT_ILLEGAL_TYPE1 (ARM_IRQ0_BASE + 7) +#define INTERRUPT_PENDING1 (ARM_IRQ0_BASE + 8) +#define INTERRUPT_PENDING2 (ARM_IRQ0_BASE + 9) +#define INTERRUPT_JPEG (ARM_IRQ0_BASE + 10) +#define INTERRUPT_USB (ARM_IRQ0_BASE + 11) +#define INTERRUPT_3D (ARM_IRQ0_BASE + 12) +#define INTERRUPT_DMA2 (ARM_IRQ0_BASE + 13) +#define INTERRUPT_DMA3 (ARM_IRQ0_BASE + 14) +#define INTERRUPT_I2C (ARM_IRQ0_BASE + 15) +#define INTERRUPT_SPI (ARM_IRQ0_BASE + 16) +#define INTERRUPT_I2SPCM (ARM_IRQ0_BASE + 17) +#define INTERRUPT_SDIO (ARM_IRQ0_BASE + 18) +#define INTERRUPT_UART (ARM_IRQ0_BASE + 19) +#define INTERRUPT_ARASANSDIO (ARM_IRQ0_BASE + 20) + +#define ARM_IRQ_LOCAL_BASE 96 +#define INTERRUPT_ARM_LOCAL_CNTPSIRQ (ARM_IRQ_LOCAL_BASE + 0) +#define INTERRUPT_ARM_LOCAL_CNTPNSIRQ (ARM_IRQ_LOCAL_BASE + 1) +#define INTERRUPT_ARM_LOCAL_CNTHPIRQ (ARM_IRQ_LOCAL_BASE + 2) +#define INTERRUPT_ARM_LOCAL_CNTVIRQ (ARM_IRQ_LOCAL_BASE + 3) +#define INTERRUPT_ARM_LOCAL_MAILBOX0 (ARM_IRQ_LOCAL_BASE + 4) +#define INTERRUPT_ARM_LOCAL_MAILBOX1 (ARM_IRQ_LOCAL_BASE + 5) +#define INTERRUPT_ARM_LOCAL_MAILBOX2 (ARM_IRQ_LOCAL_BASE + 6) +#define INTERRUPT_ARM_LOCAL_MAILBOX3 (ARM_IRQ_LOCAL_BASE + 7) +#define INTERRUPT_ARM_LOCAL_GPU_FAST (ARM_IRQ_LOCAL_BASE + 8) +#define INTERRUPT_ARM_LOCAL_PMU_FAST (ARM_IRQ_LOCAL_BASE + 9) +#define INTERRUPT_ARM_LOCAL_ZERO (ARM_IRQ_LOCAL_BASE + 10) +#define INTERRUPT_ARM_LOCAL_TIMER (ARM_IRQ_LOCAL_BASE + 11) + +#define MAX_INT INTERRUPT_ARM_LOCAL_TIMER + + diff --git a/platform/bcm2835/include/platform/gic.h b/platform/bcm2835/include/platform/gic.h new file mode 100644 index 00000000..d1e7c8ca --- /dev/null +++ b/platform/bcm2835/include/platform/gic.h @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2015 Travis Geiselbrecht + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files + * (the "Software"), to deal in the Software without restriction, + * including without limitation the rights to 
use, copy, modify, merge, + * publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, + * subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ +#pragma once + +#include + +#define GICBASE(n) (CPUPRIV_BASE_PHYS) +#define GICC_OFFSET (0x0100) +#define GICD_OFFSET (0x1000) + + diff --git a/platform/bcm2835/intc.c b/platform/bcm2835/intc.c new file mode 100644 index 00000000..97688ab0 --- /dev/null +++ b/platform/bcm2835/intc.c @@ -0,0 +1,286 @@ +/* + * Copyright (c) 2015 Travis Geiselbrecht + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files + * (the "Software"), to deal in the Software without restriction, + * including without limitation the rights to use, copy, modify, merge, + * publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, + * subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define LOCAL_TRACE 0 + +/* global interrupt controller */ +#define INTC_PEND0 (ARMCTRL_INTC_BASE + 0x0) +#define INTC_PEND1 (ARMCTRL_INTC_BASE + 0x4) +#define INTC_PEND2 (ARMCTRL_INTC_BASE + 0x8) +#define INTC_FAST (ARMCTRL_INTC_BASE + 0xc) +#define INTC_ENABLE1 (ARMCTRL_INTC_BASE + 0x10) +#define INTC_ENABLE2 (ARMCTRL_INTC_BASE + 0x14) +#define INTC_ENABLE3 (ARMCTRL_INTC_BASE + 0x18) +#define INTC_DISABLE1 (ARMCTRL_INTC_BASE + 0x1c) +#define INTC_DISABLE2 (ARMCTRL_INTC_BASE + 0x20) +#define INTC_DISABLE3 (ARMCTRL_INTC_BASE + 0x24) + +/* per-cpu local interrupt controller bits. + * each is repeated 4 times, one per cpu. 
+ */ +#define INTC_LOCAL_TIMER_INT_CONTROL0 (ARM_LOCAL_BASE + 0x40) +#define INTC_LOCAL_TIMER_INT_CONTROL1 (ARM_LOCAL_BASE + 0x44) +#define INTC_LOCAL_TIMER_INT_CONTROL2 (ARM_LOCAL_BASE + 0x48) +#define INTC_LOCAL_TIMER_INT_CONTROL3 (ARM_LOCAL_BASE + 0x4c) + +#define INTC_LOCAL_MAILBOX_INT_CONTROL0 (ARM_LOCAL_BASE + 0x50) +#define INTC_LOCAL_MAILBOX_INT_CONTROL1 (ARM_LOCAL_BASE + 0x54) +#define INTC_LOCAL_MAILBOX_INT_CONTROL2 (ARM_LOCAL_BASE + 0x58) +#define INTC_LOCAL_MAILBOX_INT_CONTROL3 (ARM_LOCAL_BASE + 0x5c) + +#define INTC_LOCAL_IRQ_PEND0 (ARM_LOCAL_BASE + 0x60) +#define INTC_LOCAL_IRQ_PEND1 (ARM_LOCAL_BASE + 0x64) +#define INTC_LOCAL_IRQ_PEND2 (ARM_LOCAL_BASE + 0x68) +#define INTC_LOCAL_IRQ_PEND3 (ARM_LOCAL_BASE + 0x6c) + +#define INTC_LOCAL_FIQ_PEND0 (ARM_LOCAL_BASE + 0x70) +#define INTC_LOCAL_FIQ_PEND1 (ARM_LOCAL_BASE + 0x74) +#define INTC_LOCAL_FIQ_PEND2 (ARM_LOCAL_BASE + 0x78) +#define INTC_LOCAL_FIQ_PEND3 (ARM_LOCAL_BASE + 0x7c) + +#define INTC_LOCAL_MAILBOX0_SET0 (ARM_LOCAL_BASE + 0x80) +#define INTC_LOCAL_MAILBOX0_SET1 (ARM_LOCAL_BASE + 0x90) +#define INTC_LOCAL_MAILBOX0_SET2 (ARM_LOCAL_BASE + 0xa0) +#define INTC_LOCAL_MAILBOX0_SET3 (ARM_LOCAL_BASE + 0xb0) + +#define INTC_LOCAL_MAILBOX0_CLR0 (ARM_LOCAL_BASE + 0xc0) +#define INTC_LOCAL_MAILBOX0_CLR1 (ARM_LOCAL_BASE + 0xd0) +#define INTC_LOCAL_MAILBOX0_CLR2 (ARM_LOCAL_BASE + 0xe0) +#define INTC_LOCAL_MAILBOX0_CLR3 (ARM_LOCAL_BASE + 0xf0) + +struct int_handler_struct { + int_handler handler; + void *arg; +}; + +static struct int_handler_struct int_handler_table[MAX_INT]; + +static spin_lock_t lock = SPIN_LOCK_INITIAL_VALUE; + +status_t mask_interrupt(unsigned int vector) +{ + LTRACEF("vector %u\n", vector); + + spin_lock_saved_state_t state; + spin_lock_irqsave(&lock, state); + + if (vector >= INTERRUPT_ARM_LOCAL_CNTPSIRQ && vector <= INTERRUPT_ARM_LOCAL_CNTVIRQ) { + // local timer interrupts, mask on all cpus + for (uint cpu = 0; cpu < 4; cpu++) { + uintptr_t reg = INTC_LOCAL_TIMER_INT_CONTROL0 + cpu * 4; + + *REG32(reg) &= (1 << (vector - INTERRUPT_ARM_LOCAL_CNTPSIRQ)); + } + } else if (/* vector >= ARM_IRQ1_BASE && */ vector < (ARM_IRQ0_BASE + 32)) { + uintptr_t reg; + if (vector >= ARM_IRQ0_BASE) + reg = INTC_DISABLE3; + else if (vector >= ARM_IRQ2_BASE) + reg = INTC_DISABLE2; + else + reg = INTC_DISABLE1; + + *REG32(reg) = 1 << (vector % 32); + } else { + PANIC_UNIMPLEMENTED; + } + + spin_unlock_irqrestore(&lock, state); + + return NO_ERROR; +} + +status_t unmask_interrupt(unsigned int vector) +{ + LTRACEF("vector %u\n", vector); + + spin_lock_saved_state_t state; + spin_lock_irqsave(&lock, state); + + if (vector >= INTERRUPT_ARM_LOCAL_CNTPSIRQ && vector <= INTERRUPT_ARM_LOCAL_CNTVIRQ) { + // local timer interrupts, unmask for all cpus + for (uint cpu = 0; cpu < 4; cpu++) { + uintptr_t reg = INTC_LOCAL_TIMER_INT_CONTROL0 + cpu * 4; + + *REG32(reg) |= (1 << (vector - INTERRUPT_ARM_LOCAL_CNTPSIRQ)); + } + } else if (/* vector >= ARM_IRQ1_BASE && */ vector < (ARM_IRQ0_BASE + 32)) { + uintptr_t reg; + if (vector >= ARM_IRQ0_BASE) + reg = INTC_ENABLE3; + else if (vector >= ARM_IRQ2_BASE) + reg = INTC_ENABLE2; + else + reg = INTC_ENABLE1; + + *REG32(reg) = 1 << (vector % 32); + } else { + PANIC_UNIMPLEMENTED; + } + + spin_unlock_irqrestore(&lock, state); + + return NO_ERROR; +} + +void register_int_handler(unsigned int vector, int_handler handler, void *arg) +{ + if (vector >= MAX_INT) + panic("register_int_handler: vector out of range %d\n", vector); + + spin_lock_saved_state_t state; + spin_lock_irqsave(&lock, state); + + 
int_handler_table[vector].handler = handler; + int_handler_table[vector].arg = arg; + + spin_unlock_irqrestore(&lock, state); +} + +enum handler_return platform_irq(struct arm_iframe *frame) +{ + uint vector; + uint cpu = arch_curr_cpu_num(); + + THREAD_STATS_INC(interrupts); + + // see what kind of irq it is + uint32_t pend = *REG32(INTC_LOCAL_IRQ_PEND0 + cpu * 4); + + pend &= ~(1 << (INTERRUPT_ARM_LOCAL_GPU_FAST % 32)); // mask out gpu interrupts + + if (pend != 0) { + // it's a local interrupt + LTRACEF("local pend 0x%x\n", pend); + vector = ARM_IRQ_LOCAL_BASE + ctz(pend); + goto decoded; + } + + // XXX disable for now, since all of the interesting irqs are mirrored into the other banks +#if 0 + // look in bank 0 (ARM interrupts) + pend = *REG32(INTC_PEND0); + LTRACEF("pend0 0x%x\n", pend); + pend &= ~((1<<8)|(1<<9)); // mask out bit 8 and 9 + if (pend != 0) { + // it's a bank 0 interrupt + vector = ARM_IRQ0_BASE + ctz(pend); + goto decoded; + } +#endif + + // look for VC interrupt bank 1 + pend = *REG32(INTC_PEND1); + LTRACEF("pend1 0x%x\n", pend); + if (pend != 0) { + // it's a bank 1 interrupt + vector = ARM_IRQ1_BASE + ctz(pend); + goto decoded; + } + + // look for VC interrupt bank 2 + pend = *REG32(INTC_PEND2); + LTRACEF("pend2 0x%x\n", pend); + if (pend != 0) { + // it's a bank 2 interrupt + vector = ARM_IRQ2_BASE + ctz(pend); + goto decoded; + } + + vector = 0xffffffff; + +decoded: + LTRACEF("cpu %u vector %u\n", cpu, vector); + + // dispatch the irq + enum handler_return ret = INT_NO_RESCHEDULE; + +#if WITH_SMP + if (vector == INTERRUPT_ARM_LOCAL_MAILBOX0) { + pend = *REG32(INTC_LOCAL_MAILBOX0_CLR0 + 0x10 * cpu); + LTRACEF("mailbox0 clr 0x%x\n", pend); + + // ack it + *REG32(INTC_LOCAL_MAILBOX0_CLR0 + 0x10 * cpu) = pend; + + if (pend & (1 << MP_IPI_GENERIC)) { + PANIC_UNIMPLEMENTED; + } + if (pend & (1 << MP_IPI_RESCHEDULE)) { + ret = mp_mbx_reschedule_irq(); + } + } else +#endif // WITH_SMP + if (vector == 0xffffffff) { + ret = INT_NO_RESCHEDULE; + } else if (int_handler_table[vector].handler) { + ret = int_handler_table[vector].handler(int_handler_table[vector].arg); + } else { + panic("irq %u fired on cpu %u but no handler set!\n", vector, cpu); + } + + return ret; +} + +enum handler_return platform_fiq(struct arm_iframe *frame) +{ + PANIC_UNIMPLEMENTED; +} + +void bcm2835_send_ipi(uint irq, uint cpu_mask) +{ + LTRACEF("irq %u, cpu_mask 0x%x\n", irq, cpu_mask); + + for (uint i = 0; i < 4; i++) { + if (cpu_mask & (1< +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +extern void intc_init(void); +extern void arm_reset(void); + +/* initial memory mappings. 
parsed by start.S */ +struct mmu_initial_mapping mmu_initial_mappings[] = { + /* 1GB of sdram space */ + { .phys = SDRAM_BASE, + .virt = KERNEL_BASE, + .size = MEMSIZE, + .flags = 0, + .name = "memory" }, + + /* peripherals */ + { .phys = BCM_PERIPH_BASE_PHYS, + .virt = BCM_PERIPH_BASE_VIRT, + .size = BCM_PERIPH_SIZE, + .flags = MMU_INITIAL_MAPPING_FLAG_DEVICE, + .name = "bcm peripherals" }, + + /* identity map to let the boot code run */ + { .phys = SDRAM_BASE, + .virt = SDRAM_BASE, + .size = 16*1024*1024, + .flags = MMU_INITIAL_MAPPING_TEMPORARY }, + + /* null entry to terminate the list */ + { 0 } +}; + +static pmm_arena_t arena = { + .name = "sdram", + .base = SDRAM_BASE, + .size = MEMSIZE, + .flags = PMM_ARENA_FLAG_KMAP, +}; + +void platform_init_mmu_mappings(void) +{ +} + +void platform_early_init(void) +{ + uart_init_early(); + + intc_init(); + + arm_generic_timer_init(INTERRUPT_ARM_LOCAL_CNTPNSIRQ, 1000000); + + /* add the main memory arena */ + pmm_add_arena(&arena); + +#if WITH_SMP + /* start the other cpus */ + uintptr_t sec_entry = (uintptr_t)&arm_reset; + sec_entry -= (KERNEL_BASE - MEMBASE); + for (uint i = 1; i <= 3; i++) { + *REG32(ARM_LOCAL_BASE + 0x8c + 0x10 * i) = sec_entry; + } +#endif +} + +void platform_init(void) +{ + uart_init(); +} + +#define DEBUG_UART 0 + +void platform_dputc(char c) +{ + if (c == '\n') + uart_putc(DEBUG_UART, '\r'); + uart_putc(DEBUG_UART, c); +} + +int platform_dgetc(char *c, bool wait) +{ + int ret = uart_getc(DEBUG_UART, wait); + if (ret == -1) + return -1; + *c = ret; + return 0; +} + diff --git a/platform/bcm2835/rules.mk b/platform/bcm2835/rules.mk new file mode 100644 index 00000000..3a4d44d4 --- /dev/null +++ b/platform/bcm2835/rules.mk @@ -0,0 +1,45 @@ +LOCAL_DIR := $(GET_LOCAL_DIR) + +MODULE := $(LOCAL_DIR) + +ARCH := arm +ARM_CPU := cortex-a7 +WITH_SMP := 1 +SMP_CPU_ID_BITS := 8 + +MODULE_DEPS := \ + dev/timer/arm_generic \ + lib/cbuf + +#lib/bio \ + lib/cbuf \ + lib/minip \ + dev/interrupt/arm_gic \ + dev/timer/arm_cortex_a9 + +GLOBAL_INCLUDES += \ + $(LOCAL_DIR)/include + +MODULE_SRCS += \ + $(LOCAL_DIR)/intc.c \ + $(LOCAL_DIR)/platform.c \ + $(LOCAL_DIR)/uart.c \ + +# default to no sdram unless the target calls it out +ZYNQ_SDRAM_SIZE ?= 0 + +MEMBASE := 0x00000000 +MEMSIZE ?= 0x10000000 # 256MB +KERNEL_LOAD_OFFSET := 0x00008000 # loaded 32KB into physical + +# put our kernel at 0x80000000 +KERNEL_BASE = 0x80000000 + +GLOBAL_DEFINES += \ + MEMBASE=$(MEMBASE) \ + MEMSIZE=$(MEMSIZE) + +LINKER_SCRIPT += \ + $(BUILDDIR)/system-onesegment.ld + +include make/module.mk diff --git a/platform/bcm2835/uart.c b/platform/bcm2835/uart.c new file mode 100644 index 00000000..5c3d5ab6 --- /dev/null +++ b/platform/bcm2835/uart.c @@ -0,0 +1,158 @@ +/* + * Copyright (c) 2015 Travis Geiselbrecht + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files + * (the "Software"), to deal in the Software without restriction, + * including without limitation the rights to use, copy, modify, merge, + * publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, + * subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ +#include +#include +#include +#include +#include +#include +#include +#include + +/* TODO: extract this into a generic PL011 driver */ + +/* PL011 implementation */ +#define UART_DR (0x00) +#define UART_RSR (0x04) +#define UART_TFR (0x18) +#define UART_ILPR (0x20) +#define UART_IBRD (0x24) +#define UART_FBRD (0x28) +#define UART_LCRH (0x2c) +#define UART_CR (0x30) +#define UART_IFLS (0x34) +#define UART_IMSC (0x38) +#define UART_TRIS (0x3c) +#define UART_TMIS (0x40) +#define UART_ICR (0x44) +#define UART_DMACR (0x48) + +#define UARTREG(base, reg) (*REG32((base) + (reg))) + +#define RXBUF_SIZE 16 +#define NUM_UART 1 + +static cbuf_t uart_rx_buf[NUM_UART]; + +static inline uintptr_t uart_to_ptr(unsigned int n) +{ + switch (n) { + default: + case 0: return UART0_BASE; + } +} + +static enum handler_return uart_irq(void *arg) +{ + bool resched = false; + uint port = (uint)arg; + uintptr_t base = uart_to_ptr(port); + + /* read interrupt status and mask */ + uint32_t isr = UARTREG(base, UART_TMIS); + + if (isr & ((1<<6) | (1<<4))) { // rtmis, rxmis + UARTREG(base, UART_ICR) = (1<<4); + cbuf_t *rxbuf = &uart_rx_buf[port]; + + /* while fifo is not empty, read chars out of it */ + while ((UARTREG(base, UART_TFR) & (1<<4)) == 0) { + char c = UARTREG(base, UART_DR); + cbuf_write_char(rxbuf, c, false); + + resched = true; + } + } + + return resched ? 
INT_RESCHEDULE : INT_NO_RESCHEDULE; +} + +void uart_init(void) +{ + for (size_t i = 0; i < NUM_UART; i++) { + // create circular buffer to hold received data + cbuf_initialize(&uart_rx_buf[i], RXBUF_SIZE); + + // assumes interrupts are contiguous + register_int_handler(INTERRUPT_VC_UART + i, &uart_irq, (void *)i); + + // clear all irqs + UARTREG(uart_to_ptr(i), UART_ICR) = 0x3ff; + + // set fifo trigger level + UARTREG(uart_to_ptr(i), UART_IFLS) = 0; // 1/8 rxfifo, 1/8 txfifo + + // enable rx interrupt + UARTREG(uart_to_ptr(i), UART_IMSC) = (1<<6)|(1<<4); // rtim, rxim + + // enable receive + UARTREG(uart_to_ptr(i), UART_CR) |= (1<<9); // rxen + + // enable interrupt + unmask_interrupt(INTERRUPT_VC_UART + i); + } +} + +void uart_init_early(void) +{ + for (size_t i = 0; i < NUM_UART; i++) { + UARTREG(uart_to_ptr(i), UART_CR) = (1<<8)|(1<<0); // tx_enable, uarten + } +} + +int uart_putc(int port, char c) +{ + uintptr_t base = uart_to_ptr(port); + + /* spin while fifo is full */ + while (UARTREG(base, UART_TFR) & (1<<5)) + ; + UARTREG(base, UART_DR) = c; + + return 1; +} + +int uart_getc(int port, bool wait) +{ + cbuf_t *rxbuf = &uart_rx_buf[port]; + + char c; + if (cbuf_read_char(rxbuf, &c, wait) == 1) + return c; + + return -1; +} + +void uart_flush_tx(int port) +{ +} + +void uart_flush_rx(int port) +{ +} + +void uart_init_port(int port, uint baud) +{ +} + + diff --git a/platform/foundation-emu/interrupts.c b/platform/foundation-emu/interrupts.c index de0a6952..f1fb345e 100644 --- a/platform/foundation-emu/interrupts.c +++ b/platform/foundation-emu/interrupts.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -39,18 +40,20 @@ struct int_handler_struct { }; static struct int_handler_struct int_handler_table[MAX_INT]; +static spin_lock_t lock; void register_int_handler(unsigned int vector, int_handler handler, void *arg) { if (vector >= MAX_INT) panic("register_int_handler: vector out of range %d\n", vector); - enter_critical_section(); + spin_lock_saved_state_t state; + spin_lock_irqsave(&lock, state); int_handler_table[vector].handler = handler; int_handler_table[vector].arg = arg; - exit_critical_section(); + spin_unlock_irqrestore(&lock, state); } #define GICCPUREG(reg) (*REG32(GIC_PROC_BASE + (reg))) @@ -156,11 +159,12 @@ status_t mask_interrupt(unsigned int vector) if (vector >= MAX_INT) return -1; - enter_critical_section(); + spin_lock_saved_state_t state; + spin_lock_irqsave(&lock, state); gic_set_enable(vector, false); - exit_critical_section(); + spin_unlock_irqrestore(&lock, state); return NO_ERROR; } @@ -170,11 +174,12 @@ status_t unmask_interrupt(unsigned int vector) if (vector >= MAX_INT) return -1; - enter_critical_section(); + spin_lock_saved_state_t state; + spin_lock_irqsave(&lock, state); gic_set_enable(vector, true); - exit_critical_section(); + spin_unlock_irqrestore(&lock, state); return NO_ERROR; } @@ -191,8 +196,6 @@ enum handler_return platform_irq(struct arm64_iframe_short *frame) return INT_NO_RESCHEDULE; } - inc_critical_section(); - THREAD_STATS_INC(interrupts); KEVLOG_IRQ_ENTER(vector); @@ -212,8 +215,6 @@ enum handler_return platform_irq(struct arm64_iframe_short *frame) if (ret != INT_NO_RESCHEDULE) thread_preempt(); - dec_critical_section(); - return ret; } diff --git a/platform/foundation-emu/timer.c b/platform/foundation-emu/timer.c index 2fc98a50..301f9431 100644 --- a/platform/foundation-emu/timer.c +++ b/platform/foundation-emu/timer.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ 
-37,6 +38,7 @@ #define LOCAL_TRACE 0 static platform_timer_callback t_callback; +static spin_lock_t lock; /* armv8 specified timer */ @@ -64,7 +66,8 @@ status_t platform_set_periodic_timer(platform_timer_callback callback, void *arg { LTRACEF("callback %p, arg %p, interval %lu\n", callback, arg, interval); - enter_critical_section(); + spin_lock_saved_state_t state; + spin_lock_irqsave(&lock, state); t_callback = callback; @@ -83,7 +86,7 @@ status_t platform_set_periodic_timer(platform_timer_callback callback, void *arg unmask_interrupt(INT_PPI_NSPHYS_TIMER); - exit_critical_section(); + spin_unlock_irqrestore(&lock, state); return NO_ERROR; } @@ -92,7 +95,8 @@ status_t platform_set_oneshot_timer (platform_timer_callback callback, void *arg { LTRACEF("callback %p, arg %p, interval %lu\n", callback, arg, interval); - enter_critical_section(); + spin_lock_saved_state_t state; + spin_lock_irqsave(&lock, state); t_callback = callback; @@ -121,7 +125,7 @@ status_t platform_set_oneshot_timer (platform_timer_callback callback, void *arg unmask_interrupt(INT_PPI_NSPHYS_TIMER); - exit_critical_section(); + spin_unlock_irqrestore(&lock, state); return NO_ERROR; } diff --git a/platform/microblaze/intc.c b/platform/microblaze/intc.c index 0510400f..f70b9a70 100644 --- a/platform/microblaze/intc.c +++ b/platform/microblaze/intc.c @@ -45,6 +45,8 @@ #define INTC_REG(reg) (*REG32(INTC_BASEADDR + (reg) * 4)) +static spin_lock_t lock; + struct int_handler_struct { int_handler handler; void *arg; @@ -59,12 +61,13 @@ void register_int_handler(unsigned int vector, int_handler handler, void *arg) if (vector >= MAX_INT) return; - enter_critical_section(); + spin_lock_saved_state_t state; + spin_lock_irqsave(&lock, state); int_handler_table[vector].handler = handler; int_handler_table[vector].arg = arg; - exit_critical_section(); + spin_unlock_irqrestore(&lock, state); } status_t mask_interrupt(unsigned int vector) diff --git a/platform/pc/interrupts.c b/platform/pc/interrupts.c index 8bd284fb..240ef283 100644 --- a/platform/pc/interrupts.c +++ b/platform/pc/interrupts.c @@ -28,9 +28,12 @@ #include #include #include +#include #include "platform_p.h" #include +static spin_lock_t lock; + void x86_gpf_handler(struct x86_iframe *frame); void x86_invop_handler(struct x86_iframe *frame); void x86_unhandled_exception(struct x86_iframe *frame); @@ -161,11 +164,12 @@ status_t mask_interrupt(unsigned int vector) // dprintf(DEBUG, "%s: vector %d\n", __PRETTY_FUNCTION__, vector); - enter_critical_section(); + spin_lock_saved_state_t state; + spin_lock_irqsave(&lock, state); enable(vector, false); - exit_critical_section(); + spin_unlock_irqrestore(&lock, state); return NO_ERROR; } @@ -190,11 +194,12 @@ status_t unmask_interrupt(unsigned int vector) // dprintf("%s: vector %d\n", __PRETTY_FUNCTION__, vector); - enter_critical_section(); + spin_lock_saved_state_t state; + spin_lock_irqsave(&lock, state); enable(vector, true); - exit_critical_section(); + spin_unlock_irqrestore(&lock, state); return NO_ERROR; } @@ -246,12 +251,14 @@ void register_int_handler(unsigned int vector, int_handler handler, void *arg) if (vector >= INT_VECTORS) panic("register_int_handler: vector out of range %d\n", vector); - enter_critical_section(); + spin_lock_saved_state_t state; + spin_lock_irqsave(&lock, state); int_handler_table[vector].arg = arg; int_handler_table[vector].handler = handler; - exit_critical_section(); + spin_unlock_irqrestore(&lock, state); } +/* vim: set noexpandtab: */ diff --git a/platform/pc/pci.c b/platform/pc/pci.c index 
49ad95cd..9547513a 100644 --- a/platform/pc/pci.c +++ b/platform/pc/pci.c @@ -25,10 +25,12 @@ #include #include #include +#include #include #include static int last_bus = 0; +static spin_lock_t lock; typedef struct { uint16_t size; @@ -64,87 +66,95 @@ int (*g_pci_set_irq_hw_int)(const pci_location_t *state, uint8_t int_pin, uint8_ int pci_find_pci_device(pci_location_t *state, uint16_t device_id, uint16_t vendor_id, uint16_t index) { - enter_critical_section(); + spin_lock_saved_state_t irqstate; + spin_lock_irqsave(&lock, irqstate); int res = g_pci_find_pci_device(state, device_id, vendor_id, index); - exit_critical_section(); + spin_unlock_irqrestore(&lock, irqstate); return res; } int pci_find_pci_class_code(pci_location_t *state, uint32_t class_code, uint16_t index) { - enter_critical_section(); + spin_lock_saved_state_t irqstate; + spin_lock_irqsave(&lock, irqstate); int res = g_pci_find_pci_class_code(state, class_code, index); - exit_critical_section(); + spin_unlock_irqrestore(&lock, irqstate); return res; } int pci_read_config_byte(const pci_location_t *state, uint32_t reg, uint8_t *value) { - enter_critical_section(); + spin_lock_saved_state_t irqstate; + spin_lock_irqsave(&lock, irqstate); int res = g_pci_read_config_byte(state, reg, value); - exit_critical_section(); + spin_unlock_irqrestore(&lock, irqstate); return res; } int pci_read_config_half(const pci_location_t *state, uint32_t reg, uint16_t *value) { - enter_critical_section(); + spin_lock_saved_state_t irqstate; + spin_lock_irqsave(&lock, irqstate); int res = g_pci_read_config_half(state, reg, value); - exit_critical_section(); + spin_unlock_irqrestore(&lock, irqstate); return res; } int pci_read_config_word(const pci_location_t *state, uint32_t reg, uint32_t *value) { - enter_critical_section(); + spin_lock_saved_state_t irqstate; + spin_lock_irqsave(&lock, irqstate); int res = g_pci_read_config_word(state, reg, value); - exit_critical_section(); + spin_unlock_irqrestore(&lock, irqstate); return res; } int pci_write_config_byte(const pci_location_t *state, uint32_t reg, uint8_t value) { - enter_critical_section(); + spin_lock_saved_state_t irqstate; + spin_lock_irqsave(&lock, irqstate); int res = g_pci_write_config_byte(state, reg, value); - exit_critical_section(); + spin_unlock_irqrestore(&lock, irqstate); return res; } int pci_write_config_half(const pci_location_t *state, uint32_t reg, uint16_t value) { - enter_critical_section(); + spin_lock_saved_state_t irqstate; + spin_lock_irqsave(&lock, irqstate); int res = g_pci_write_config_half(state, reg, value); - exit_critical_section(); + spin_unlock_irqrestore(&lock, irqstate); return res; } int pci_write_config_word(const pci_location_t *state, uint32_t reg, uint32_t value) { - enter_critical_section(); + spin_lock_saved_state_t irqstate; + spin_lock_irqsave(&lock, irqstate); int res = g_pci_write_config_word(state, reg, value); - exit_critical_section(); + spin_unlock_irqrestore(&lock, irqstate); return res; } @@ -152,29 +162,31 @@ int pci_write_config_word(const pci_location_t *state, uint32_t reg, uint32_t va int pci_get_irq_routing_options(irq_routing_entry *entries, uint16_t *count, uint16_t *pci_irqs) { - enter_critical_section(); - irq_routing_options_t options; options.size = sizeof(irq_routing_entry) * *count; options.selector = DATA_SELECTOR; options.offset = entries; + spin_lock_saved_state_t irqstate; + spin_lock_irqsave(&lock, irqstate); + int res = g_pci_get_irq_routing_options(&options, pci_irqs); - *count = options.size / sizeof(irq_routing_entry); + 
spin_unlock_irqrestore(&lock, irqstate); - exit_critical_section(); + *count = options.size / sizeof(irq_routing_entry); return res; } int pci_set_irq_hw_int(const pci_location_t *state, uint8_t int_pin, uint8_t irq) { - enter_critical_section(); + spin_lock_saved_state_t irqstate; + spin_lock_irqsave(&lock, irqstate); int res = g_pci_set_irq_hw_int(state, int_pin, irq); - exit_critical_section(); + spin_unlock_irqrestore(&lock, irqstate); return res; } diff --git a/platform/pc/timer.c b/platform/pc/timer.c index 22ec13ca..fafc7689 100644 --- a/platform/pc/timer.c +++ b/platform/pc/timer.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -40,7 +41,7 @@ static uint64_t next_trigger_time; static uint64_t next_trigger_delta; static uint64_t timer_delta_time; -static uint64_t timer_current_time; +static volatile uint64_t timer_current_time; static uint16_t divisor; @@ -49,16 +50,12 @@ static uint16_t divisor; status_t platform_set_periodic_timer(platform_timer_callback callback, void *arg, lk_time_t interval) { - enter_critical_section(); - t_callback = callback; callback_arg = arg; next_trigger_delta = (uint64_t) interval << 32; next_trigger_time = timer_current_time + next_trigger_delta; - exit_critical_section(); - return NO_ERROR; } @@ -66,9 +63,8 @@ lk_time_t current_time(void) { lk_time_t time; - enter_critical_section(); + // XXX slight race time = (lk_time_t) (timer_current_time >> 32); - exit_critical_section(); return time; } @@ -77,9 +73,8 @@ lk_bigtime_t current_time_hires(void) { lk_bigtime_t time; - enter_critical_section(); + // XXX slight race time = (lk_bigtime_t) ((timer_current_time >> 22) * 1000) >> 10; - exit_critical_section(); return time; } @@ -164,3 +159,4 @@ void platform_halt_timers(void) mask_interrupt(INT_PIT); } +/* vim: set noexpandtab */ diff --git a/platform/power.c b/platform/power.c index 0978e183..ecb74457 100644 --- a/platform/power.c +++ b/platform/power.c @@ -35,7 +35,7 @@ __WEAK void platform_halt(platform_halt_action suggested_action, platform_halt_reason reason) { dprintf(ALWAYS, "HALT: spinning forever... 
(reason = %d)\n", reason); - enter_critical_section(); + arch_disable_ints(); for(;;); } diff --git a/platform/vexpress-a9/include/platform/gic.h b/platform/vexpress-a9/include/platform/gic.h index 4cbf42b8..8270dd5c 100644 --- a/platform/vexpress-a9/include/platform/gic.h +++ b/platform/vexpress-a9/include/platform/gic.h @@ -24,7 +24,7 @@ #include -#define GICBASE(n) (CPUPRIV_BASE_VIRT) +#define GICBASE(n) (CPUPRIV_BASE_PHYS) #define GICC_OFFSET (0x0100) #define GICD_OFFSET (0x1000) diff --git a/platform/vexpress-a9/platform.c b/platform/vexpress-a9/platform.c index 72d45bd6..8494082f 100644 --- a/platform/vexpress-a9/platform.c +++ b/platform/vexpress-a9/platform.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -64,7 +65,7 @@ struct mmu_initial_mapping mmu_initial_mappings[] = { /* cortex-a9 private memory area */ { .phys = CPUPRIV_BASE_PHYS, - .virt = CPUPRIV_BASE_VIRT, + .virt = CPUPRIV_BASE_PHYS, // XXX move back to CPUPRIV_BASE_VIRT .size = CPUPRIV_SIZE, .flags = MMU_INITIAL_MAPPING_FLAG_DEVICE, .name = "cpu_priv"}, @@ -96,7 +97,7 @@ void platform_early_init(void) arm_gic_init(); /* initialize the timer block */ - arm_cortex_a9_timer_init(CPUPRIV_BASE_VIRT, 100000000); + arm_cortex_a9_timer_init(CPUPRIV_BASE_PHYS, 100000000); uart_init_early(); diff --git a/platform/vexpress-a9/rules.mk b/platform/vexpress-a9/rules.mk index e5fe7eb1..5971d669 100644 --- a/platform/vexpress-a9/rules.mk +++ b/platform/vexpress-a9/rules.mk @@ -4,6 +4,7 @@ MODULE := $(LOCAL_DIR) ARCH := arm ARM_CPU := cortex-a9-neon +WITH_SMP ?= 1 GLOBAL_INCLUDES += \ $(LOCAL_DIR)/include diff --git a/platform/vexpress-a9/secondary_boot.S b/platform/vexpress-a9/secondary_boot.S new file mode 100644 index 00000000..fc85ea73 --- /dev/null +++ b/platform/vexpress-a9/secondary_boot.S @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2014 Travis Geiselbrecht + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files + * (the "Software"), to deal in the Software without restriction, + * including without limitation the rights to use, copy, modify, merge, + * publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, + * subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#include +#include + +.section .text + +/* true reset vector, to catch non boot cpus and hold them until later */ +FUNCTION(platform_reset) + mrc p15, 0, r12, c0, c0, 5 /* read MPIDR */ + + /* mask off the bottom 12 bits to test cluster number:cpu number */ + ubfx r12, r12, #0, #12 + + /* if we're cpu 0:0, continue to the usual arm reset vector */ + cmp r12, #0 + beq arm_reset + + /* all other cpus, trap and wait to be released */ +1: + wfe + ldr r12, =boot_cpu_lock + ldr r12, [r12] + cmp r12, #0 + bne 1b + + b arm_secondary_reset + +.ltorg + diff --git a/platform/zynq/debug.c b/platform/zynq/debug.c index fe7db058..84244160 100644 --- a/platform/zynq/debug.c +++ b/platform/zynq/debug.c @@ -65,13 +65,13 @@ void platform_halt(platform_halt_action suggested_action, case HALT_ACTION_SHUTDOWN: case HALT_ACTION_HALT: printf("HALT: spinning forever... (reason = %d)\n", reason); - enter_critical_section(); + arch_disable_ints(); for(;;) arch_idle(); break; case HALT_ACTION_REBOOT: printf("REBOOT\n"); - enter_critical_section(); + arch_disable_ints(); for (;;) { zynq_slcr_unlock(); SLCR->PSS_RST_CTRL = 1; diff --git a/platform/zynq/gem.c b/platform/zynq/gem.c index 0351535f..6df7d96e 100644 --- a/platform/zynq/gem.c +++ b/platform/zynq/gem.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include #include @@ -55,6 +56,8 @@ #define GEM_RX_BUF_SIZE 1536 #define GEM_TX_BUF_SIZE 1536 +static spin_lock_t lock = SPIN_LOCK_INITIAL_VALUE; + struct gem_desc { uint32_t addr; uint32_t ctrl; @@ -132,7 +135,8 @@ void queue_pkts_in_tx_tbl(void) { pktbuf_t *p; unsigned int cur_pos; - enter_critical_section(); + spin_lock_saved_state_t irqstate; + spin_lock_irqsave(&lock, irqstate); if (list_is_empty(&gem.tx_queue)) { goto exit; } @@ -168,7 +172,7 @@ void queue_pkts_in_tx_tbl(void) { gem.regs->net_ctrl |= NET_CTRL_START_TX; exit: - exit_critical_section(); + spin_unlock_irqrestore(&lock, irqstate); } int gem_send_raw_pkt(struct pktbuf *p) @@ -186,10 +190,11 @@ int gem_send_raw_pkt(struct pktbuf *p) // XXX handle multi part buffers arch_clean_cache_range((vaddr_t)p->data, p->dlen); - enter_critical_section(); + spin_lock_saved_state_t irqstate; + spin_lock_irqsave(&lock, irqstate); list_add_tail(&gem.tx_queue, &p->list); queue_pkts_in_tx_tbl(); - exit_critical_section(); + spin_unlock_irqrestore(&lock, irqstate); err: return ret; diff --git a/platform/zynq/include/platform/zynq.h b/platform/zynq/include/platform/zynq.h index 2bbfd4b6..bd434333 100644 --- a/platform/zynq/include/platform/zynq.h +++ b/platform/zynq/include/platform/zynq.h @@ -29,6 +29,7 @@ #define SDRAM_BASE (0x00100000) #define SDRAM_APERTURE_SIZE (0x3ff00000) #define SRAM_BASE (0x0) +#define SRAM_BASE_HIGH (0xfffc0000) #define SRAM_APERTURE_SIZE (0x00040000) #define SRAM_SIZE (0x00040000) diff --git a/platform/zynq/platform.c b/platform/zynq/platform.c index 2ebd1f3b..91743cfa 100644 --- a/platform/zynq/platform.c +++ b/platform/zynq/platform.c @@ -278,6 +278,12 @@ struct mmu_initial_mapping mmu_initial_mappings[] = { .flags = MMU_INITIAL_MAPPING_FLAG_DEVICE, .name = "hw-fc000000" }, + /* sram high aperture */ + { .phys = 0xfff00000, + .virt = 0xfff00000, + .size = 0x00100000, + .flags = MMU_INITIAL_MAPPING_FLAG_DEVICE }, + /* identity map to let the boot code run */ { .phys = SRAM_BASE, .virt = SRAM_BASE, diff --git a/platform/zynq/rules.mk b/platform/zynq/rules.mk index e1629a37..9829fdde 100644 --- a/platform/zynq/rules.mk +++ b/platform/zynq/rules.mk @@ -4,6 +4,8 @@ MODULE := $(LOCAL_DIR) ARCH := arm ARM_CPU := 
cortex-a9-neon +WITH_SMP ?= 1 +SMP_MAX_CPUS := 2 MODULE_DEPS := \ lib/bio \ @@ -13,6 +15,7 @@ MODULE_DEPS := \ dev/interrupt/arm_gic \ dev/timer/arm_cortex_a9 + GLOBAL_INCLUDES += \ $(LOCAL_DIR)/include diff --git a/platform/zynq/start.S b/platform/zynq/start.S index 942ed09b..c97c5241 100644 --- a/platform/zynq/start.S +++ b/platform/zynq/start.S @@ -41,14 +41,15 @@ FUNCTION(platform_reset) str r11, [r12] dsb +#if !WITH_SMP 0: /* stay trapped here forever */ wfe b 0b - - ldr pc, foo -foo: - .word 0xa +#else + /* pass on through the reset vector, where the arm arch code will trap the cpu */ + b arm_reset +#endif DATA(__cpu_trapped) .word 0 diff --git a/project/pc-x86-test.mk b/project/pc-x86-test.mk index a9d1ce6d..20a28dc5 100644 --- a/project/pc-x86-test.mk +++ b/project/pc-x86-test.mk @@ -5,6 +5,7 @@ LOCAL_DIR := $(GET_LOCAL_DIR) ARCH := x86 TARGET := pc-x86 MODULES += \ + lib/debugcommands \ lib/libm \ app/tests \ app/shell \ diff --git a/project/rpi2-test.mk b/project/rpi2-test.mk new file mode 100644 index 00000000..c71a7e7c --- /dev/null +++ b/project/rpi2-test.mk @@ -0,0 +1,11 @@ +LOCAL_DIR := $(GET_LOCAL_DIR) + +TARGET := rpi2 + +MODULES += \ + app/shell \ + app/stringtests \ + app/tests \ + lib/cksum \ + lib/debugcommands \ + diff --git a/project/vexpress-a9-test.mk b/project/vexpress-a9-test.mk index 8ad23041..1bc72820 100644 --- a/project/vexpress-a9-test.mk +++ b/project/vexpress-a9-test.mk @@ -9,7 +9,10 @@ MODULES += \ lib/bytes \ lib/cksum \ lib/debugcommands \ + lib/evlog \ lib/libm WITH_LINKER_GC := 0 +GLOBAL_DEFINES += WITH_KERNEL_EVLOG=1 + diff --git a/target/rpi2/rules.mk b/target/rpi2/rules.mk new file mode 100644 index 00000000..41ffb2ed --- /dev/null +++ b/target/rpi2/rules.mk @@ -0,0 +1,9 @@ +LOCAL_DIR := $(GET_LOCAL_DIR) + +GLOBAL_INCLUDES += \ + $(LOCAL_DIR)/include + +PLATFORM := bcm2835 + +#include make/module.mk + diff --git a/top/init.c b/top/init.c index 0fd8a747..ed5e0970 100644 --- a/top/init.c +++ b/top/init.c @@ -26,8 +26,10 @@ * a init hook that is called at increasing init levels as the system is * initialized. 
*/ +#include #include +#include #include #include #include @@ -38,13 +40,13 @@ extern const struct lk_init_struct __lk_init[]; extern const struct lk_init_struct __lk_init_end[]; -static uint last_init_level = 0; - -int lk_init_level(uint level) +void lk_init_level(enum lk_init_flags required_flag, uint start_level, uint stop_level) { - LTRACEF("level %#x, last_init_level %#x\n", level, last_init_level); + LTRACEF("flags %#x, start_level %#x, stop_level %#x\n", + required_flag, start_level, stop_level); - uint last_called_level = last_init_level; + ASSERT(start_level > 0); + uint last_called_level = start_level - 1; const struct lk_init_struct *last = NULL; for (;;) { /* search for the lowest uncalled hook to call */ @@ -53,13 +55,15 @@ int lk_init_level(uint level) const struct lk_init_struct *found = NULL; bool seen_last = false; for (const struct lk_init_struct *ptr = __lk_init; ptr != __lk_init_end; ptr++) { - LTRACEF("looking at %p (%s) level %#x, seen_last %d\n", ptr, ptr->name, ptr->level, seen_last); + LTRACEF("looking at %p (%s) level %#x, flags %#x, seen_last %d\n", ptr, ptr->name, ptr->level, ptr->flags, seen_last); if (ptr == last) seen_last = true; /* reject the easy ones */ - if (ptr->level > level) + if (!(ptr->flags & required_flag)) + continue; + if (ptr->level > stop_level) continue; if (ptr->level < last_called_level) continue; @@ -67,7 +71,7 @@ int lk_init_level(uint level) continue; /* keep the lowest one we haven't called yet */ - if (ptr->level > last_init_level && ptr->level > last_called_level) { + if (ptr->level >= start_level && ptr->level > last_called_level) { found = ptr; continue; } @@ -86,16 +90,13 @@ int lk_init_level(uint level) break; #if TRACE_INIT - printf("INIT: calling hook %p (%s) at level %#x\n", found->hook, found->name, found->level); + printf("INIT: cpu %d, calling hook %p (%s) at level %#x, flags %#x\n", + arch_curr_cpu_num(), found->hook, found->name, found->level, found->flags); #endif found->hook(found->level); last_called_level = found->level; last = found; } - - last_init_level = level; - - return 0; } #if 0 diff --git a/top/main.c b/top/main.c index ae52c9f2..a31964d3 100644 --- a/top/main.c +++ b/top/main.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013 Travis Geiselbrecht + * Copyright (c) 2013-2015 Travis Geiselbrecht * * Permission is hereby granted, free of charge, to any person obtaining * a copy of this software and associated documentation files @@ -33,8 +33,10 @@ #include #include #include +#include #include #include +#include /* saved boot arguments from whoever loaded the system */ ulong lk_boot_args[4]; @@ -44,6 +46,11 @@ extern void *__ctor_end; extern int __bss_start; extern int _end; +#if WITH_SMP +static thread_t *secondary_bootstrap_threads[SMP_MAX_CPUS - 1]; +static uint secondary_bootstrap_thread_count; +#endif + static int bootstrap2(void *arg); extern void kernel_init(void); @@ -64,11 +71,8 @@ static void call_constructors(void) } /* called from arch code */ -void lk_main(ulong arg0, ulong arg1, ulong arg2, ulong arg3) __NO_RETURN __EXTERNALLY_VISIBLE; void lk_main(ulong arg0, ulong arg1, ulong arg2, ulong arg3) { - inc_critical_section(); - // save the boot args lk_boot_args[0] = arg0; lk_boot_args[1] = arg1; @@ -79,18 +83,22 @@ void lk_main(ulong arg0, ulong arg1, ulong arg2, ulong arg3) thread_init_early(); // early arch stuff - lk_init_level(LK_INIT_LEVEL_ARCH_EARLY - 1); + lk_primary_cpu_init_level(LK_INIT_LEVEL_EARLIEST, LK_INIT_LEVEL_ARCH_EARLY - 1); arch_early_init(); // do any super early platform initialization - 
lk_init_level(LK_INIT_LEVEL_PLATFORM_EARLY - 1); + lk_primary_cpu_init_level(LK_INIT_LEVEL_ARCH_EARLY, LK_INIT_LEVEL_PLATFORM_EARLY - 1); platform_early_init(); // do any super early target initialization - lk_init_level(LK_INIT_LEVEL_TARGET_EARLY - 1); + lk_primary_cpu_init_level(LK_INIT_LEVEL_PLATFORM_EARLY, LK_INIT_LEVEL_TARGET_EARLY - 1); target_early_init(); - dprintf(INFO, "welcome to lk\n\n"); +#if WITH_SMP + dprintf(INFO, "\nwelcome to lk/MP\n\n"); +#else + dprintf(INFO, "\nwelcome to lk\n\n"); +#endif dprintf(INFO, "boot args 0x%lx 0x%lx 0x%lx 0x%lx\n", lk_boot_args[0], lk_boot_args[1], lk_boot_args[2], lk_boot_args[3]); @@ -100,18 +108,19 @@ void lk_main(ulong arg0, ulong arg1, ulong arg2, ulong arg3) // bring up the kernel heap dprintf(SPEW, "initializing heap\n"); - lk_init_level(LK_INIT_LEVEL_HEAP - 1); + lk_primary_cpu_init_level(LK_INIT_LEVEL_TARGET_EARLY, LK_INIT_LEVEL_HEAP - 1); heap_init(); // initialize the kernel - lk_init_level(LK_INIT_LEVEL_KERNEL - 1); + lk_primary_cpu_init_level(LK_INIT_LEVEL_HEAP, LK_INIT_LEVEL_KERNEL - 1); kernel_init(); - lk_init_level(LK_INIT_LEVEL_THREADING - 1); + lk_primary_cpu_init_level(LK_INIT_LEVEL_KERNEL, LK_INIT_LEVEL_THREADING - 1); // create a thread to complete system initialization dprintf(SPEW, "creating bootstrap completion thread\n"); thread_t *t = thread_create("bootstrap2", &bootstrap2, NULL, DEFAULT_PRIORITY, DEFAULT_STACK_SIZE); + t->pinned_cpu = 0; thread_detach(t); thread_resume(t); @@ -123,26 +132,71 @@ static int bootstrap2(void *arg) { dprintf(SPEW, "top of bootstrap2()\n"); - lk_init_level(LK_INIT_LEVEL_ARCH - 1); + lk_primary_cpu_init_level(LK_INIT_LEVEL_THREADING, LK_INIT_LEVEL_ARCH - 1); arch_init(); // initialize the rest of the platform dprintf(SPEW, "initializing platform\n"); - lk_init_level(LK_INIT_LEVEL_PLATFORM - 1); + lk_primary_cpu_init_level(LK_INIT_LEVEL_ARCH, LK_INIT_LEVEL_PLATFORM - 1); platform_init(); // initialize the target dprintf(SPEW, "initializing target\n"); - lk_init_level(LK_INIT_LEVEL_TARGET - 1); + lk_primary_cpu_init_level(LK_INIT_LEVEL_PLATFORM, LK_INIT_LEVEL_TARGET - 1); target_init(); dprintf(SPEW, "calling apps_init()\n"); - lk_init_level(LK_INIT_LEVEL_APPS - 1); + lk_primary_cpu_init_level(LK_INIT_LEVEL_TARGET, LK_INIT_LEVEL_APPS - 1); apps_init(); - lk_init_level(LK_INIT_LEVEL_LAST); + lk_primary_cpu_init_level(LK_INIT_LEVEL_APPS, LK_INIT_LEVEL_LAST); return 0; } +#if WITH_SMP +void lk_secondary_cpu_entry(void) +{ + uint cpu = arch_curr_cpu_num(); + + if (cpu > secondary_bootstrap_thread_count) { + dprintf(CRITICAL, "Invalid secondary cpu num %d, SMP_MAX_CPUS %d, secondary_bootstrap_thread_count %d\n", + cpu, SMP_MAX_CPUS, secondary_bootstrap_thread_count); + return; + } + + thread_secondary_cpu_init_early(); + thread_resume(secondary_bootstrap_threads[cpu - 1]); + + dprintf(SPEW, "entering scheduler on cpu %d\n", cpu); + thread_secondary_cpu_entry(); +} + +static int secondary_cpu_bootstrap2(void *arg) +{ + /* secondary cpu initialize from threading level up. 
0 to threading was handled in arch */ + lk_init_level(LK_INIT_FLAG_SECONDARY_CPUS, LK_INIT_LEVEL_THREADING, LK_INIT_LEVEL_LAST); + + return 0; +} + +void lk_init_secondary_cpus(uint secondary_cpu_count) +{ + if (secondary_cpu_count >= SMP_MAX_CPUS) { + dprintf(CRITICAL, "Invalid secondary_cpu_count %d, SMP_MAX_CPUS %d\n", + secondary_cpu_count, SMP_MAX_CPUS); + secondary_cpu_count = SMP_MAX_CPUS - 1; + } + for (uint i = 0; i < secondary_cpu_count; i++) { + dprintf(SPEW, "creating bootstrap completion thread for cpu %d\n", i + 1); + thread_t *t = thread_create("secondarybootstrap2", + &secondary_cpu_bootstrap2, NULL, + DEFAULT_PRIORITY, DEFAULT_STACK_SIZE); + t->pinned_cpu = i + 1; + thread_detach(t); + secondary_bootstrap_threads[i] = t; + } + secondary_bootstrap_thread_count = secondary_cpu_count; +} +#endif // vim: noexpandtab:
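For orientation, here is a minimal sketch of how a platform is expected to drive the new secondary-cpu entry points added in top/main.c above. It is illustrative only and not part of the commit; it assumes a hypothetical platform with three secondary cpus and a made-up release_secondary_cpus() helper standing in for the platform-specific release mechanism, while lk_init_secondary_cpus(), lk_secondary_cpu_entry() and the pinned "secondarybootstrap2" threads are the pieces introduced by this change.

/* illustrative sketch only -- not part of this change */

/* prototypes as introduced in top/main.c above (uint is LK's unsigned int typedef) */
extern void lk_init_secondary_cpus(uint secondary_cpu_count);
extern void lk_secondary_cpu_entry(void);

/* hypothetical platform-specific helper that releases the held secondary cpus */
extern void release_secondary_cpus(void);

/* boot cpu, e.g. from platform_early_init(): create the pinned
 * "secondarybootstrap2" threads first, then let the other cpus run */
void example_start_secondaries(void)
{
    lk_init_secondary_cpus(3);   /* 3 secondaries, i.e. 4 cpus total */
    release_secondary_cpus();    /* write the entry point / send the wake event, platform specific */
}

/* each secondary cpu, from its arch/platform startup path once per-cpu early
 * init is done: hands control to LK, which resumes that cpu's bootstrap
 * thread, runs the LK_INIT_FLAG_SECONDARY_CPUS hooks and enters the
 * scheduler, so it does not normally return */
void example_secondary_startup(void)
{
    lk_secondary_cpu_entry();
}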