Conflict checking for PAT mappings

This ensures that there are no illegal caching attribute aliases.

The attributes are put into a red-black tree. The checking is done
in change_page_attr, which fails on illegal aliases. Each user gets
a reference, which is undone when change_page_attr is reverted to
write-back (PAGE_KERNEL).

This also adds support to the main VM to be able to undo linear
mapping changes on unmap. This makes it easy to remap write-combined
pages into user space. The preferred interface for this is mmap on
/proc/bus/pci/**/* for the individual devices.

Signed-off-by: Andi Kleen

Index: linux/arch/x86_64/mm/memattr.c
===================================================================
--- /dev/null
+++ linux/arch/x86_64/mm/memattr.c
@@ -0,0 +1,207 @@
+/* Manage caching attributes of physical memory areas. This avoids
+   any illegal aliases. */
+
+#include
+#include
+#include
+
+/*
+ * Tree of attribute ranges, keyed by page frame number and kept free of
+ * overlaps. There is no lock in this file; every entry point relies on
+ * its caller for serialization.
+ */
+static struct rb_root mattr_root;
+
+/* One tracked range [start, end) of page frames sharing one attribute. */
+struct memattr {
+	struct rb_node nd;
+	unsigned long start, end;
+	pgprot_t attr;
+	int count;		/* references held on this range */
+};
+
+/*
+ * Return the lowest entry intersecting [start, end), or NULL if no entry
+ * intersects it.
+ */
+static struct memattr *mattr_lookup(unsigned long start, unsigned long end)
+{
+	struct rb_node *p;
+
+	/* Descend until any intersecting node is found. */
+	for (p = mattr_root.rb_node; p != NULL; ) {
+		struct memattr *n = rb_entry(p, struct memattr, nd);
+		if (start >= n->end)
+			p = p->rb_right;
+		else if (end <= n->start)
+			p = p->rb_left;
+		else
+			break;
+	}
+	if (!p)
+		return NULL;
+	/* Step back over predecessors that still reach into the range. */
+	for (;;) {
+		struct memattr *w;
+		struct rb_node *prev = rb_prev(p);
+		if (!prev)
+			break;
+		w = rb_entry(prev, struct memattr, nd);
+		if (w->end <= start)
+			break;
+		p = prev;
+	}
+	return rb_entry(p, struct memattr, nd);
+}
+
+/*
+ * Link a new range into the tree. The range must not overlap any entry
+ * already present; an overlap is a BUG().
+ */
+static void mattr_insert(struct memattr *new)
+{
+	struct rb_node **p = &mattr_root.rb_node;
+	struct rb_node *parent = NULL;
+	struct memattr *nd;
+
+	while (*p) {
+		parent = *p;
+		nd = rb_entry(parent, struct memattr, nd);
+		if (new->end <= nd->start)
+			p = &(*p)->rb_left;
+		else if (new->start >= nd->end)
+			p = &(*p)->rb_right;
+		else
+			BUG();
+	}
+	rb_link_node(&new->nd, parent, p);
+	rb_insert_color(&new->nd, &mattr_root);
+}
+
+/*
+ * Allocate an unlinked entry with a zero reference count. Returns NULL
+ * if the allocation fails.
+ *
+ * GFP_ATOMIC: the i386 change_page_attr_pfn() calls mattr_get() with
+ * cpa_lock held and interrupts disabled, so this must not sleep.
+ */
+static struct memattr *
+mattr_alloc(unsigned long start, unsigned long end, pgprot_t attr)
+{
+	struct memattr *n = kmalloc(sizeof(*n), GFP_ATOMIC);
+
+	if (n != NULL) {
+		n->start = start;
+		n->end = end;
+		n->attr = attr;
+		n->count = 0;
+	}
+	return n;
+}
+
+/* Unlink an entry from the tree and release its memory. */
+static void mattr_free(struct memattr *m)
+{
+	rb_erase(&m->nd, &mattr_root);
+	kfree(m);
+}
+
+/**
+ * mattr_get - Get reference for attribute of physical memory range
+ * @start: first page frame number of the range
+ * @end: page frame number one past the end of the range
+ * @attr: caching attribute requested for the range
+ *
+ * Entries overlapping the range are merged with it into one entry that
+ * carries the sum of their reference counts plus one. No entry is created
+ * when pgprot_nonstd(@attr) is false.
+ *
+ * Returns 0 on success, -EIO if an overlapping entry differs from @attr
+ * in its _PAGE_MA_MASK bits, or -ENOMEM if no entry could be allocated.
+ * The tree is unchanged on failure.
+ *
+ * Caller ensures locking.
+ */
+int mattr_get(unsigned long start, unsigned long end, pgprot_t attr)
+{
+	struct rb_node *next;
+	struct memattr *m, *overlap, *new = NULL;
+	int count = 1;
+
+	overlap = mattr_lookup(start, end);
+	/* First check for conflicts before doing anything final */
+	for (m = overlap;
+	     m != NULL && m->start < end;
+	     m = rb_entry(next, struct memattr, nd)) {
+		/* Conflict? */
+		if ((pgprot_val(m->attr) & _PAGE_MA_MASK) !=
+		    (pgprot_val(attr) & _PAGE_MA_MASK))
+			return -EIO;
+		/* Grow the range to the union with this entry. */
+		if (m->start < start)
+			start = m->start;
+		if (m->end > end)
+			end = m->end;
+		count += m->count;
+		next = rb_next(&m->nd);
+		if (!next)
+			break;
+	}
+	/* Get new one */
+	if (pgprot_nonstd(attr)) {
+		new = mattr_alloc(start, end, attr);
+		if (!new)
+			return -ENOMEM;
+	}
+	/*
+	 * Now free all old attributes.
+	 * NOTE(review): when no new entry is allocated the overlapped
+	 * entries are dropped whatever their count, and a request that
+	 * differs in _PAGE_MA_MASK bits never gets here. Confirm this is
+	 * the intended way to undo a reference.
+	 */
+	for (m = overlap;
+	     m && m->start < end;
+	     m = rb_entry(next, struct memattr, nd)) {
+		next = rb_next(&m->nd);	/* fetch before m is erased */
+		mattr_free(m);
+		if (!next)
+			break;
+	}
+	/* And replace with new one */
+	if (new != NULL) {
+		new->count = count;
+		mattr_insert(new);
+	}
+	return 0;
+}
+
+/**
+ * mattr_put - free reference of attribute for physical memory range.
+ * @start: first page frame number of the range
+ * @end: page frame number one past the end of the range
+ * @attr: caching attribute the reference was taken with
+ *
+ * The range must lie inside a single tracked entry whose attribute equals
+ * @attr; anything else is a BUG(). The entry is freed when its count
+ * drops to zero. When pgprot_nonstd(@attr) is false, no entry may
+ * intersect the range.
+ *
+ * Caller ensures locking.
+ */
+void mattr_put(unsigned long start, unsigned long end, pgprot_t attr)
+{
+	struct memattr *m = mattr_lookup(start, end);
+
+	if (!pgprot_nonstd(attr)) {
+		BUG_ON(m != NULL);
+		return;
+	}
+	BUG_ON(m == NULL);
+	BUG_ON(m->start > start);
+	BUG_ON(m->end < end);
+	BUG_ON(pgprot_val(m->attr) != pgprot_val(attr));
+	BUG_ON(m->count <= 0);
+	if (--m->count == 0)
+		mattr_free(m);
+}
Index: linux/arch/x86_64/mm/Makefile
===================================================================
--- linux.orig/arch/x86_64/mm/Makefile
+++ linux/arch/x86_64/mm/Makefile
@@ -2,7 +2,7 @@
 # Makefile for the linux x86_64-specific parts of the memory manager.
# -obj-y := init.o fault.o ioremap.o extable.o pageattr.o +obj-y := init.o fault.o ioremap.o extable.o pageattr.o memattr.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o obj-$(CONFIG_NUMA) += numa.o obj-$(CONFIG_K8_NUMA) += k8topology.o Index: linux/include/asm-i386/cacheflush.h =================================================================== --- linux.orig/include/asm-i386/cacheflush.h +++ linux/include/asm-i386/cacheflush.h @@ -26,6 +26,8 @@ void global_flush_tlb(void); int change_page_attr(struct page *page, int numpages, pgprot_t prot); int change_page_attr_pfn(unsigned long pfn, int numpages, pgprot_t prot); +int mattr_get(unsigned long pfn, unsigned long end, pgprot_t prot); +void mattr_put(unsigned long pfn, unsigned long end, pgprot_t prot); #ifdef CONFIG_DEBUG_PAGEALLOC /* internal debugging function */ Index: linux/arch/i386/mm/Makefile =================================================================== --- linux.orig/arch/i386/mm/Makefile +++ linux/arch/i386/mm/Makefile @@ -2,9 +2,12 @@ # Makefile for the linux i386-specific parts of the memory manager. 
# -obj-y := init.o pgtable.o fault.o ioremap.o extable.o pageattr.o mmap.o +obj-y := init.o pgtable.o fault.o ioremap.o extable.o pageattr.o mmap.o \ + memattr.o obj-$(CONFIG_NUMA) += discontig.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o obj-$(CONFIG_HIGHMEM) += highmem.o obj-$(CONFIG_BOOT_IOREMAP) += boot_ioremap.o + +memattr-y += ../../x86_64/mm/memattr.o Index: linux/arch/i386/mm/pageattr.c =================================================================== --- linux.orig/arch/i386/mm/pageattr.c +++ linux/arch/i386/mm/pageattr.c @@ -179,13 +179,21 @@ int change_page_attr_pfn(unsigned long p int err = 0; int i; unsigned long flags; + pgprot_t attr = __pgprot(pgprot_val(prot) & _PAGE_MA_MASK); spin_lock_irqsave(&cpa_lock, flags); + err = mattr_get(pfn, pfn+numpages, attr); + if (err) { + spin_unlock_irqrestore(&cpa_lock, flags); + return err; + } for (i = 0; i < numpages; i++, pfn++) { err = __change_page_attr(pfn, prot); if (err) break; } + if (err) + mattr_put(pfn, pfn+numpages, attr); spin_unlock_irqrestore(&cpa_lock, flags); return err; } Index: linux/include/asm-x86_64/cacheflush.h =================================================================== --- linux.orig/include/asm-x86_64/cacheflush.h +++ linux/include/asm-x86_64/cacheflush.h @@ -26,6 +26,8 @@ void global_flush_tlb(void); int change_page_attr(struct page *page, int numpages, pgprot_t prot); int change_page_attr_pfn(unsigned long pfn, int numpages, pgprot_t prot); +int mattr_get(unsigned long pfn, unsigned long end, pgprot_t prot); +void mattr_put(unsigned long pfn, unsigned long end, pgprot_t prot); #define ARCH_HAS_CPA 1 Index: linux/arch/x86_64/mm/pageattr.c =================================================================== --- linux.orig/arch/x86_64/mm/pageattr.c +++ linux/arch/x86_64/mm/pageattr.c @@ -14,6 +14,8 @@ #include #include +static DECLARE_MUTEX(cpa_sem); + static inline pte_t *lookup_address(unsigned long address) { pgd_t *pgd = pgd_offset_k(address); @@ -185,8 +187,14 @@ int 
change_page_attr_pfn(unsigned long p unsigned long address = pfn << PAGE_SHIFT; int err = 0; int i; + pgprot_t attr = __pgprot(pgprot_val(prot) & _PAGE_MA_MASK); - down_write(&init_mm.mmap_sem); + down(&cpa_sem); + err = mattr_get(pfn, pfn + numpages, attr); + if (err) { + up(&cpa_sem); + return err; + } for (i = 0; i < numpages; i++, address += PAGE_SIZE) { unsigned long pfn = __pa(address) >> PAGE_SHIFT; @@ -201,9 +209,13 @@ int change_page_attr_pfn(unsigned long p addr2 = __START_KERNEL_map + __pa(address); pgprot_val(prot2) &= ~_PAGE_NX; err = __change_page_attr(addr2, pfn, prot2, PAGE_KERNEL_EXEC); + if (err) + break; } } - up_write(&init_mm.mmap_sem); + if (err) + mattr_put(pfn, pfn + numpages, attr); + up(&cpa_sem); return err; } Index: linux/include/asm-x86_64/pgtable.h =================================================================== --- linux.orig/include/asm-x86_64/pgtable.h +++ linux/include/asm-x86_64/pgtable.h @@ -119,6 +119,8 @@ static inline pte_t ptep_get_and_clear_f #define pte_same(a, b) ((a).pte == (b).pte) +#define pte_caching_nonstd(pte) (pte_val(pte) & _PAGE_MA_MASK) + #define PMD_SIZE (1UL << PMD_SHIFT) #define PMD_MASK (~(PMD_SIZE-1)) #define PUD_SIZE (1UL << PUD_SHIFT) @@ -448,8 +450,12 @@ static inline pte_t pte_modify(pte_t pte extern int kern_addr_valid(unsigned long addr); -#define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \ - remap_pfn_range(vma, vaddr, pfn, size, prot) +#define io_remap_page_range(vma, vaddr, page, size, prot) \ + io_remap_pfn_range(vma, vaddr, page_to_pfn(page), size, prot) + +extern int io_remap_pfn_range(struct vm_area_struct *vma, unsigned long + vaddr, unsigned long pfn, unsigned long size, + pgprot_t prot); #define MK_IOSPACE_PFN(space, pfn) (pfn) #define GET_IOSPACE(pfn) 0 Index: linux/include/asm-i386/pgtable.h =================================================================== --- linux.orig/include/asm-i386/pgtable.h +++ linux/include/asm-i386/pgtable.h @@ -455,6 +455,14 @@ extern void 
noexec_setup(const char *str } \ } while (0) + +#define io_remap_page_range(vma, vaddr, page, size, prot) \ + io_remap_pfn_range(vma, vaddr, page_to_pfn(page), size, prot) + +extern int io_remap_pfn_range(struct vm_area_struct *vma, unsigned long + vaddr, unsigned long pfn, unsigned long size, + pgprot_t prot); + #endif /* !__ASSEMBLY__ */ #ifdef CONFIG_FLATMEM Index: linux/arch/i386/mm/ioremap.c =================================================================== --- linux.orig/arch/i386/mm/ioremap.c +++ linux/arch/i386/mm/ioremap.c @@ -301,3 +301,20 @@ void __init bt_iounmap(void *addr, unsig --nrpages; } } + +int io_remap_pfn_range(struct vm_area_struct *vma, unsigned long + vaddr, unsigned long pfn, unsigned long size, + pgprot_t prot) +{ + int err; + err = change_page_attr_pfn(pfn, (size + PAGE_SIZE - 1)>>PAGE_SHIFT, + prot); + if (err) { + printk("io_remap_pfn_range: changing memattributes failed %lx\n", + pgprot_val(prot)); + return err; + } + return remap_pfn_range(vma, vaddr, pfn, size, prot); +} +EXPORT_SYMBOL(io_remap_pfn_range); + Index: linux/arch/x86_64/mm/ioremap.c =================================================================== --- linux.orig/arch/x86_64/mm/ioremap.c +++ linux/arch/x86_64/mm/ioremap.c @@ -260,3 +260,18 @@ void iounmap(volatile void __iomem *addr } EXPORT_SYMBOL(iounmap); +int io_remap_pfn_range(struct vm_area_struct *vma, unsigned long + vaddr, unsigned long pfn, unsigned long size, + pgprot_t prot) +{ + int err; + err = change_page_attr_pfn(pfn, (size + PAGE_SIZE - 1)>>PAGE_SHIFT, + prot); + if (err) { + printk("io_remap_pfn_range: changing memattributes failed %lx\n", + pgprot_val(prot)); + return err; + } + return remap_pfn_range(vma, vaddr, pfn, size, prot); +} +EXPORT_SYMBOL(io_remap_pfn_range); Index: linux/arch/i386/pci/i386.c =================================================================== --- linux.orig/arch/i386/pci/i386.c +++ linux/arch/i386/pci/i386.c @@ -282,10 +282,7 @@ int pci_mmap_page_range(struct pci_dev * /* 
Write-combine setting may also be controlled via * the mtrr interfaces on this platform. */ - if (io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, + return io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, vma->vm_end - vma->vm_start, - vma->vm_page_prot)) - return -EAGAIN; - - return 0; + vma->vm_page_prot); } Index: linux/mm/memory.c =================================================================== --- linux.orig/mm/memory.c +++ linux/mm/memory.c @@ -544,6 +544,10 @@ static void zap_pte_range(struct mmu_gat if (PageReserved(page)) page = NULL; } +#ifdef ARCH_HAS_CPA + if (pte_caching_nonstd(ptent)) + change_page_attr_pfn(pte_pfn(ptent), 1, PAGE_KERNEL); +#endif if (unlikely(details) && page) { /* * unmap_shared_mapping_pages() wants to Index: linux/include/asm-i386/pgtable-3level.h =================================================================== --- linux.orig/include/asm-i386/pgtable-3level.h +++ linux/include/asm-i386/pgtable-3level.h @@ -66,6 +66,8 @@ static inline void set_pte(pte_t *ptep, #define set_pud(pudptr,pudval) \ (*(pudptr) = (pudval)) +#define pte_caching_nonstd(pte) ((pte).pte_low & _PAGE_MA_MASK) + /* * Pentium-II erratum A13: in PAE mode we explicitly have to flush * the TLB via cr3 if the top-level pgd is changed... Index: linux/include/asm-i386/pgtable-2level.h =================================================================== --- linux.orig/include/asm-i386/pgtable-2level.h +++ linux/include/asm-i386/pgtable-2level.h @@ -26,6 +26,8 @@ #define pfn_pte(pfn, prot) __pte(((pfn) << PAGE_SHIFT) | pgprot_val(prot)) #define pfn_pmd(pfn, prot) __pmd(((pfn) << PAGE_SHIFT) | pgprot_val(prot)) +#define pte_caching_nonstd(pte) ((pte).pte_low & _PAGE_MA_MASK) + #define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT)) #define pmd_page_kernel(pmd) \