arch/x86/xen/mmu.c  +124 −1

@@ -1463,6 +1463,119 @@ static int xen_pgd_alloc(struct mm_struct *mm)
 	return ret;
 }
 
+#ifdef CONFIG_X86_64
+static __initdata u64 __last_pgt_set_rw = 0;
+static __initdata u64 __pgt_buf_start = 0;
+static __initdata u64 __pgt_buf_end = 0;
+static __initdata u64 __pgt_buf_top = 0;
+/*
+ * As a consequence of the commit:
+ *
+ * commit 4b239f458c229de044d6905c2b0f9fe16ed9e01e
+ * Author: Yinghai Lu <yinghai@kernel.org>
+ * Date:   Fri Dec 17 16:58:28 2010 -0800
+ *
+ *     x86-64, mm: Put early page table high
+ *
+ * at some point init_memory_mapping is going to reach the pagetable pages
+ * area and map those pages too (mapping them as normal memory that falls
+ * in the range of addresses passed to init_memory_mapping as argument).
+ * Some of those pages are already pagetable pages (they are in the range
+ * pgt_buf_start-pgt_buf_end) therefore they are going to be mapped RO and
+ * everything is fine.
+ * Some of these pages are not pagetable pages yet (they fall in the range
+ * pgt_buf_end-pgt_buf_top; for example the page at pgt_buf_end) so they
+ * are going to be mapped RW.  When these pages become pagetable pages and
+ * are hooked into the pagetable, xen will find that the guest already has
+ * a RW mapping of them somewhere and fail the operation.
+ * The reason Xen requires pagetables to be RO is that the hypervisor needs
+ * to verify that the pagetables are valid before using them. The validation
+ * operations are called "pinning".
+ *
+ * In order to fix the issue we mark all the pages in the entire range
+ * pgt_buf_start-pgt_buf_top as RO, however when the pagetable allocation
+ * is completed only the range pgt_buf_start-pgt_buf_end is reserved by
+ * init_memory_mapping. Hence the kernel is going to crash as soon as one
+ * of the pages in the range pgt_buf_end-pgt_buf_top is reused (b/c those
+ * ranges are RO).
+ *
+ * For this reason, 'mark_rw_past_pgt' is introduced which is called _after_
+ * the init_memory_mapping has completed (in a perfect world we would
+ * call this function from init_memory_mapping, but let's ignore that).
+ *
+ * Because we are called _after_ init_memory_mapping the pgt_buf_[start,
+ * end,top] have all changed to new values (b/c init_memory_mapping is
+ * called again and sets up another new page-table). Hence, the first time
+ * we enter this function, we save away the pgt_buf_start value and update
+ * the pgt_buf_[end,top].
+ *
+ * When we detect that the "old" pgt_buf_start through pgt_buf_end
+ * PFNs have been reserved (so memblock_x86_reserve_range has been called),
+ * we immediately set out to RW the "old" pgt_buf_end through pgt_buf_top.
+ *
+ * And then we update those "old" pgt_buf_[end|top] with the new ones
+ * so that we can redo this on the next pagetable.
+ */
+static __init void mark_rw_past_pgt(void) {
+
+	if (pgt_buf_end > pgt_buf_start) {
+		u64 addr, size;
+
+		/* Save it away. */
+		if (!__pgt_buf_start) {
+			__pgt_buf_start = pgt_buf_start;
+			__pgt_buf_end = pgt_buf_end;
+			__pgt_buf_top = pgt_buf_top;
+			return;
+		}
+		/* If we get the range that starts at __pgt_buf_end that means
+		 * the range is reserved, and that in 'init_memory_mapping'
+		 * the 'memblock_x86_reserve_range' has been called with the
+		 * outdated __pgt_buf_start, __pgt_buf_end (the "new"
+		 * pgt_buf_[start|end|top] refer now to a new pagetable).
+		 * Note: we are called _after_ the pgt_buf_[..] have been
+		 * updated. */
+
+		addr = memblock_x86_find_in_range_size(PFN_PHYS(__pgt_buf_start),
+						       &size, PAGE_SIZE);
+
+		/* Still not reserved, meaning 'memblock_x86_reserve_range'
+		 * hasn't been called yet. Update the _end and _top. */
+		if (addr == PFN_PHYS(__pgt_buf_start)) {
+			__pgt_buf_end = pgt_buf_end;
+			__pgt_buf_top = pgt_buf_top;
+			return;
+		}
+
+		/* OK, the area is reserved, meaning it is time for us to
+		 * set RW for the old end->top PFNs. */
+
+		/* ..unless we had already done this. */
+		if (__pgt_buf_end == __last_pgt_set_rw)
+			return;
+
+		addr = PFN_PHYS(__pgt_buf_end);
+
+		/* set as RW the rest */
+		printk(KERN_DEBUG "xen: setting RW the range %llx - %llx\n",
+			PFN_PHYS(__pgt_buf_end), PFN_PHYS(__pgt_buf_top));
+
+		while (addr < PFN_PHYS(__pgt_buf_top)) {
+			make_lowmem_page_readwrite(__va(addr));
+			addr += PAGE_SIZE;
+		}
+		/* And update everything so that we are ready for the next
+		 * pagetable (the one created for regions past 4GB). */
+		__last_pgt_set_rw = __pgt_buf_end;
+		__pgt_buf_start = pgt_buf_start;
+		__pgt_buf_end = pgt_buf_end;
+		__pgt_buf_top = pgt_buf_top;
+	}
+	return;
+}
+#else
+static __init void mark_rw_past_pgt(void) { }
+#endif
 static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
 {
 #ifdef CONFIG_X86_64
@@ -1488,6 +1601,14 @@ static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
 {
 	unsigned long pfn = pte_pfn(pte);
 
+	/*
+	 * A bit of optimization. We do not need to call the workaround
+	 * when xen_set_pte_init is called with a PTE with 0 as PFN.
+	 * That is b/c the pagetables at that point are just being populated
+	 * with empty values and we can save some cycles by not calling
+	 * the 'memblock' code. */
+	if (pfn)
+		mark_rw_past_pgt();
 	/*
 	 * If the new pfn is within the range of the newly allocated
 	 * kernel pagetable, and it isn't being mapped into an
@@ -1495,7 +1616,7 @@ static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
 	 * it is RO.
 	 */
 	if (((!is_early_ioremap_ptep(ptep) &&
-			pfn >= pgt_buf_start && pfn < pgt_buf_end)) ||
+			pfn >= pgt_buf_start && pfn < pgt_buf_top)) ||
 			(is_early_ioremap_ptep(ptep) && pfn != (pgt_buf_end - 1)))
 		pte = pte_wrprotect(pte);
 
@@ -1997,6 +2118,8 @@ __init void xen_ident_map_ISA(void)
 
 static __init void xen_post_allocator_init(void)
 {
+	mark_rw_past_pgt();
+
 #ifdef CONFIG_XEN_DEBUG
 	pv_mmu_ops.make_pte = PV_CALLEE_SAVE(xen_make_pte_debug);
 #endif
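The long comment above describes mark_rw_past_pgt() as a small state machine driven by repeated calls: snapshot the live pgt_buf_{start,end,top} on the first call, keep refreshing the saved end/top while the old buffer is still unreserved in memblock, and once the old range shows up as reserved, flip the old end..top pages back to RW exactly once before re-snapshotting for the next pagetable round. A rough user-space sketch of that flow only; pfn_range_reserved(), set_pfn_readwrite() and mark_rw_past_pgt_sketch() are stand-in names invented here, and the sketch works in PFNs where the real code uses physical addresses via memblock_x86_find_in_range_size() and make_lowmem_page_readwrite():

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Live values, updated by a stand-in for init_memory_mapping. */
static uint64_t pgt_buf_start, pgt_buf_end, pgt_buf_top;

/* Saved copies, mirroring the __pgt_buf_* statics in the patch. */
static uint64_t saved_start, saved_end, saved_top, last_set_rw;

/* Stand-ins for the memblock query and the RW remapping helper. */
static bool pfn_range_reserved(uint64_t start_pfn) { (void)start_pfn; return true; }
static void set_pfn_readwrite(uint64_t pfn) { printf("RW pfn %#llx\n", (unsigned long long)pfn); }

static void mark_rw_past_pgt_sketch(void)
{
	if (pgt_buf_end <= pgt_buf_start)
		return;

	if (!saved_start) {			/* first call: snapshot */
		saved_start = pgt_buf_start;
		saved_end = pgt_buf_end;
		saved_top = pgt_buf_top;
		return;
	}

	if (!pfn_range_reserved(saved_start)) {	/* old buffer not reserved yet */
		saved_end = pgt_buf_end;
		saved_top = pgt_buf_top;
		return;
	}

	if (saved_end == last_set_rw)		/* already handled this round */
		return;

	/* Undo the blanket RO marking for the unused tail of the old buffer. */
	for (uint64_t pfn = saved_end; pfn < saved_top; pfn++)
		set_pfn_readwrite(pfn);

	last_set_rw = saved_end;		/* remember, then re-snapshot */
	saved_start = pgt_buf_start;
	saved_end = pgt_buf_end;
	saved_top = pgt_buf_top;
}

int main(void)
{
	pgt_buf_start = 0x100; pgt_buf_end = 0x104; pgt_buf_top = 0x110;
	mark_rw_past_pgt_sketch();	/* snapshots 0x100..0x104..0x110 */
	mark_rw_past_pgt_sketch();	/* "reserved" -> sets 0x104..0x10f RW */
	return 0;
}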
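The one-line change in mask_rw_pte() is easiest to see in isolation: the write-protect window for non-early-ioremap PTEs widens from pgt_buf_start..pgt_buf_end to pgt_buf_start..pgt_buf_top, while early_ioremap PTEs keep the existing exception for the page at pgt_buf_end - 1. A minimal sketch of just that predicate, with should_wrprotect() as an illustrative name (the kernel applies the result via pte_wrprotect() inside mask_rw_pte()):

#include <stdbool.h>
#include <stdint.h>

static uint64_t pgt_buf_start, pgt_buf_end, pgt_buf_top;	/* PFNs */

static bool should_wrprotect(uint64_t pfn, bool is_early_ioremap_pte)
{
	if (!is_early_ioremap_pte)
		/* Upper bound was pgt_buf_end before this patch. */
		return pfn >= pgt_buf_start && pfn < pgt_buf_top;

	/* early_ioremap PTEs: only the page at pgt_buf_end - 1 stays RW. */
	return pfn != pgt_buf_end - 1;
}

int main(void)
{
	pgt_buf_start = 0x100; pgt_buf_end = 0x104; pgt_buf_top = 0x110;
	/* 0x108 is past pgt_buf_end but below pgt_buf_top: now RO too. */
	return should_wrprotect(0x108, false) ? 0 : 1;
}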