|
Description
|
Prior to Xen 3.0.3, when we wrote level-0 PTE entries, we needed to include
PT_USER by hand (and hence we invented PT_KERN). In 3.0.3, they broke
compatibility such that a 64-bit guest must *not* use PT_USER except for
the usual user mappings.
We cleaned up most PT_KERN usage, but we missed a spot where we used 'PT_USER'
directly (tut):
usr/src/uts/i86pc/vm/hat_i86.c:
566 for (i = 0; i <= mmu.max_page_level; ++i) {
567 mmu.pte_bits[i] = PT_VALID;
568 #if defined(__xpv) && defined(__amd64)
569 mmu.pte_bits[i] |= PT_USER;
570 #endif
571 if (i > 0)
572 mmu.pte_bits[i] |= PT_PAGESIZE;
573 }
All our kernel level-0 pte entries end up with the PT_USER bit when we
go to Xen. What effect does that have?
xen/arch/x86/mm.c :
670 #ifdef USER_MAPPINGS_ARE_GLOBAL
671 #define adjust_guest_l1e(pl1e) \
672 do { \
673 if ( likely(l1e_get_flags((pl1e)) & _PAGE_PRESENT) ) \
674 { \
675 /* _PAGE_GUEST_KERNEL page cannot have the Global bit set. */ \
676 if ( (l1e_get_flags((pl1e)) & (_PAGE_GUEST_KERNEL|_PAGE_GLOBAL)) \
677 == (_PAGE_GUEST_KERNEL|_PAGE_GLOBAL) ) \
678 MEM_LOG("Global bit is set to kernel page %lx", \
679 l1e_get_pfn((pl1e))); \
680 if ( !(l1e_get_flags((pl1e)) & _PAGE_USER) ) \
681 l1e_add_flags((pl1e), (_PAGE_GUEST_KERNEL|_PAGE_USER)); \
682 if ( !(l1e_get_flags((pl1e)) & _PAGE_GUEST_KERNEL) ) \
683 l1e_add_flags((pl1e), (_PAGE_GLOBAL|_PAGE_USER)); \
684 } \
685 } while ( 0 )
An unpleasant one: by default (USER_MAPPINGS_ARE_GLOBAL), any PTE with PT_USER (aka
_PAGE_USER) set also gains _PAGE_GLOBAL. This is the performance hack that was
introduced shortly after the PT_USER requirement changed.
So, even though we have separate pagetables for the kernel and user, a stale global TLB
mapping could in theory be used to let userspace access kernel memory.
However, for most (and possibly all) cases, we go through the emulated pt write code,
which has a bug that meant _PAGE_GLOBAL was never getting set:
xen/arch/x86/mm.c :
3113 static int ptwr_emulated_update(
...
/* set val to the guest's PTE value to write */
...
3171 /* Check the new PTE. */
3172 nl1e = l1e_from_intpte(val);
...
3196
3197 adjust_guest_l1e(nl1e);
3198
...
3202 if ( do_cmpxchg )
3203 {
3204 if ( shadow_mode_enabled(d) )
3205 shadow_lock(d);
3206 ol1e = l1e_from_intpte(old);
3207 if ( cmpxchg((intpte_t *)pl1e, old, val) != old )
Note that when we do the cmpxchg (a path which apparently *only* Solaris uses),
we're writing 'val' (the guest's original PTE value), not the adjusted PTE 'nl1e',
which is the one that has had _PAGE_GLOBAL set by adjust_guest_l1e().
If we fix this, can we reproduce a problem? Call this 'readkern':
#include <signal.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/signal.h>
/*
* Verify that userspace can't read kernel addresses. Example usage:
*
* dtrace -qn 'fbt:genunix::entry /arg0 > `kernelbase/ \
* { printf("%p ", arg0); }' | xargs readkern
*/
volatile uint64_t *address;
/*
 * SIGSEGV handler: the attempted read faulted, which is the expected (safe)
 * outcome, so terminate immediately with status 0.
 *
 * The handler takes the signal number so that its type matches the
 * void (*)(int) required by sigaction's sa_handler; the value is unused.
 * _exit() is used rather than exit() because it is async-signal-safe.
 */
void
sigsegv(int sig)
{
	(void) sig;
	_exit(0);
}
/*
 * readkern <address>: attempt to read one 64-bit word from the given
 * (hexadecimal) virtual address.
 *
 * If the load faults, the SIGSEGV handler terminates the process with
 * status 0 and nothing is printed.  If the load succeeds, the address is
 * printed on stdout and the exit status is 1.  Usage and parse errors are
 * reported on stderr, also with status 1.
 */
int
main(int argc, char *argv[])
{
	struct sigaction segv;
	unsigned long long val;
	uint64_t tmp;

	if (argc < 2) {
		fprintf(stderr, "usage: readkern <address>\n");
		exit(1);
	}

	segv.sa_handler = sigsegv;
	sigemptyset(&segv.sa_mask);
	segv.sa_flags = 0;
	if (sigaction(SIGSEGV, &segv, NULL) == -1) {
		fprintf(stderr, "failed to sigaction.\n");
		exit(1);
	}

	/*
	 * Parse into an integer of the type %llx actually expects, then
	 * convert to a pointer.  Scanning straight into the pointer object
	 * is a type mismatch and overruns it when pointers are 32 bits wide,
	 * so also reject values that cannot be represented as a pointer.
	 */
	if (sscanf(argv[1], "%llx", &val) != 1 || val > UINTPTR_MAX) {
		fprintf(stderr, "failed to parse address %s.\n",
		    argv[1]);
		exit(1);
	}
	address = (volatile uint64_t *)(uintptr_t)val;

	/*
	 * NOTE(review): the system call before the load is presumably there
	 * to make a trip through the kernel first -- confirm with the author.
	 */
	(void) getpid();

	/* The pointee is volatile, so this load is always performed. */
	tmp = *address;
	(void) tmp;

	/* Still here: the read did not fault, so report the address. */
	printf("%p\n", (void *)address);
	return (1);
}
And run it like this:
dtrace -qn 'fbt:genunix::entry /arg0 > `kernelbase/ { printf("%p ", arg0); }' | \
xargs -n 1 ~johnlev/bin/i386/readkern | while read ln; do echo $ln::whatis | mdb -k ; done
We can see output such as:
ffffff01d6f09c00 is ffffff01d6f09c00+0, allocated as a thread structure
ffffff01c8c98438 is ffffff01c8c983e8+50, bufctl ffffff01c8ebf8d0 allocated from as_cache
ffffff01d6f09c00 is ffffff01d6f09c00+0, allocated as a thread structure
ffffff01d44d7e80 is ffffff01d44d7e80+0, bufctl ffffff01d3a2b388 allocated from kmem_alloc_40
ffffff01d44d7e80 is ffffff01d44d7e80+0, bufctl ffffff01d3a2b388 allocated from kmem_alloc_40
Note that the Xen bug is probably hurting performance quite badly.
|