x86: Basic PAT implementation Originally based on a patch from Eric Biederman, but heavily changed. --- arch/i386/kernel/cpu/common.c | 3 ++ arch/i386/mm/Makefile | 4 ++- arch/i386/pci/i386.c | 13 +++------ arch/x86_64/kernel/setup64.c | 3 ++ arch/x86_64/mm/Makefile | 2 - arch/x86_64/mm/pat.c | 53 ++++++++++++++++++++++++++++++++++++++++ include/asm-i386/cpufeature.h | 2 + include/asm-i386/msr.h | 2 + include/asm-i386/pgtable.h | 39 +++++++++++++++++++++++++---- include/asm-i386/processor.h | 2 + include/asm-x86_64/cpufeature.h | 1 include/asm-x86_64/msr.h | 2 + include/asm-x86_64/pgtable.h | 35 ++++++++++++++++++++++---- include/asm-x86_64/processor.h | 2 + 14 files changed, 142 insertions(+), 21 deletions(-) Index: linux/arch/i386/pci/i386.c =================================================================== --- linux.orig/arch/i386/pci/i386.c +++ linux/arch/i386/pci/i386.c @@ -265,8 +265,6 @@ void pcibios_set_master(struct pci_dev * int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, enum pci_mmap_state mmap_state, int write_combine) { - unsigned long prot; - /* I/O space cannot be accessed via normal processor loads and * stores on this platform. */ @@ -276,14 +274,11 @@ int pci_mmap_page_range(struct pci_dev * /* Leave vm_pgoff as-is, the PCI space address is the physical * address on this platform. */ - prot = pgprot_val(vma->vm_page_prot); - if (boot_cpu_data.x86 > 3) - prot |= _PAGE_PCD | _PAGE_PWT; - vma->vm_page_prot = __pgprot(prot); + if (write_combine) + vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); + else + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); - /* Write-combine setting is ignored, it is changed via the mtrr - * interfaces on this platform. 
- */ if (io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, vma->vm_end - vma->vm_start, vma->vm_page_prot)) Index: linux/include/asm-i386/cpufeature.h =================================================================== --- linux.orig/include/asm-i386/cpufeature.h +++ linux/include/asm-i386/cpufeature.h @@ -135,6 +135,8 @@ #define cpu_has_pmm boot_cpu_has(X86_FEATURE_PMM) #define cpu_has_pmm_enabled boot_cpu_has(X86_FEATURE_PMM_EN) +#define cpu_has_pat boot_cpu_has(X86_FEATURE_PAT) + #endif /* __ASM_I386_CPUFEATURE_H */ /* Index: linux/include/asm-i386/msr.h =================================================================== --- linux.orig/include/asm-i386/msr.h +++ linux/include/asm-i386/msr.h @@ -136,6 +136,8 @@ static inline void wrmsrl (unsigned long #define MSR_IA32_LASTINTFROMIP 0x1dd #define MSR_IA32_LASTINTTOIP 0x1de +#define MSR_IA32_CR_PAT 0x277 + #define MSR_IA32_MC0_CTL 0x400 #define MSR_IA32_MC0_STATUS 0x401 #define MSR_IA32_MC0_ADDR 0x402 Index: linux/include/asm-i386/pgtable.h =================================================================== --- linux.orig/include/asm-i386/pgtable.h +++ linux/include/asm-i386/pgtable.h @@ -119,6 +119,12 @@ void paging_init(void); #define _PAGE_UNUSED2 0x400 #define _PAGE_UNUSED3 0x800 +/* We redefine PWT|PCD to be write combining. 
PAT bit is not used */ + +#define _PAGE_WC (_PAGE_PWT|_PAGE_PCD) + +#define _PAGE_CACHE_MASK (_PAGE_PWT|_PAGE_PCD) + /* If _PAGE_PRESENT is clear, we use these: */ #define _PAGE_FILE 0x040 /* nonlinear file mapping, saved PTE; unset:swap */ #define _PAGE_PROTNONE 0x080 /* if the user mapped it with PROT_NONE; @@ -159,6 +165,7 @@ void paging_init(void); extern unsigned long long __PAGE_KERNEL, __PAGE_KERNEL_EXEC; #define __PAGE_KERNEL_RO (__PAGE_KERNEL & ~_PAGE_RW) #define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_PCD) +#define __PAGE_KERNEL_WC (__PAGE_KERNEL | _PAGE_WC) #define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE) #define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE) @@ -166,8 +173,10 @@ extern unsigned long long __PAGE_KERNEL, #define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO) #define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC) #define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE) +#define PAGE_KERNEL_WC __pgprot(__PAGE_KERNEL_WC) #define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE) #define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC) +#define PAGE_KERNEL_ATTR(x) __pgprot(__PAGE_KERNEL | (x)) /* * The i386 can't do page protection for execute, and considers that @@ -360,11 +369,31 @@ static inline void clone_pgd_range(pgd_t } /* - * Macro to mark a page protection value as "uncacheable". On processors which do not support - * it, this is a no-op. - */ -#define pgprot_noncached(prot) ((boot_cpu_data.x86 > 3) \ - ? (__pgprot(pgprot_val(prot) | _PAGE_PCD | _PAGE_PWT)) : (prot)) + * Macro to mark a page protection value as "uncacheable". + * Accesses through an uncached translation bypass the cache + * and do not allow for consecutive writes to be combined. + * On processors which do not support it, this is a no-op. + */ +#define pgprot_noncached(prot) \ + ((boot_cpu_data.x86 > 3) ?
\ + (__pgprot((pgprot_val(prot) & ~_PAGE_CACHE_MASK) | _PAGE_PCD)): \ + (prot)) + +/* + * Macro to mark a page protection value as "write-combining". + * Accesses through a write-combining translation bypass the + * caches, but do allow for consecutive writes to be combined into + * single (but larger) write transactions. + * On processors that do not support PAT this setting allows + * mtrrs to set write combining. + * On processors which do not support it, this is a no-op. + */ +#define pgprot_writecombine(prot) \ + ((boot_cpu_data.x86 > 3) ? \ + (__pgprot((pgprot_val(prot) & ~_PAGE_CACHE_MASK) | _PAGE_WC)): \ + (prot)) + +#define pgprot_nonstd(prot) (pgprot_val(prot) & _PAGE_CACHE_MASK) /* * Conversion functions: convert a page and protection to a page entry, Index: linux/include/asm-x86_64/cpufeature.h =================================================================== --- linux.orig/include/asm-x86_64/cpufeature.h +++ linux/include/asm-x86_64/cpufeature.h @@ -112,5 +112,6 @@ #define cpu_has_cyrix_arr 0 #define cpu_has_centaur_mcr 0 #define cpu_has_clflush boot_cpu_has(X86_FEATURE_CLFLSH) +#define cpu_has_pat 1 #endif /* __ASM_X8664_CPUFEATURE_H */ Index: linux/include/asm-x86_64/msr.h =================================================================== --- linux.orig/include/asm-x86_64/msr.h +++ linux/include/asm-x86_64/msr.h @@ -223,6 +223,8 @@ static inline unsigned int cpuid_edx(uns #define MSR_MTRRfix4K_F8000 0x26f #define MSR_MTRRdefType 0x2ff +#define MSR_IA32_CR_PAT 0x277 + #define MSR_IA32_MC0_CTL 0x400 #define MSR_IA32_MC0_STATUS 0x401 #define MSR_IA32_MC0_ADDR 0x402 Index: linux/include/asm-x86_64/pgtable.h =================================================================== --- linux.orig/include/asm-x86_64/pgtable.h +++ linux/include/asm-x86_64/pgtable.h @@ -158,6 +158,12 @@ static inline pte_t ptep_get_and_clear_f #define _PAGE_FILE 0x040 /* nonlinear file mapping, saved PTE; unset:swap */ #define _PAGE_GLOBAL 0x100 /* Global TLB entry */
+/* We redefine PWT|PCD to be write combining. PAT bit is not used */ + +#define _PAGE_WC (_PAGE_PWT|_PAGE_PCD) + +#define _PAGE_CACHE_MASK (_PAGE_PWT|_PAGE_PCD) + #define _PAGE_PROTNONE 0x080 /* If not present */ #define _PAGE_NX (1UL<<_PAGE_BIT_NX) @@ -174,12 +180,12 @@ static inline pte_t ptep_get_and_clear_f #define PAGE_COPY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED) #define PAGE_READONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX) #define PAGE_READONLY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED) -#define __PAGE_KERNEL \ - (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_NX) +#define __PAGE_KERNEL_ATTR(x) \ + (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED |_PAGE_NX|(x)) +#define __PAGE_KERNEL __PAGE_KERNEL_ATTR(0) #define __PAGE_KERNEL_EXEC \ (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED) -#define __PAGE_KERNEL_NOCACHE \ - (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_PCD | _PAGE_ACCESSED | _PAGE_NX) +#define __PAGE_KERNEL_NOCACHE __PAGE_KERNEL_ATTR(_PAGE_PCD) #define __PAGE_KERNEL_RO \ (_PAGE_PRESENT | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_NX) #define __PAGE_KERNEL_VSYSCALL \ @@ -197,10 +203,12 @@ static inline pte_t ptep_get_and_clear_f #define PAGE_KERNEL_EXEC MAKE_GLOBAL(__PAGE_KERNEL_EXEC) #define PAGE_KERNEL_RO MAKE_GLOBAL(__PAGE_KERNEL_RO) #define PAGE_KERNEL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_NOCACHE) +#define PAGE_KERNEL_WC MAKE_GLOBAL(__PAGE_KERNEL_WC) #define PAGE_KERNEL_VSYSCALL32 __pgprot(__PAGE_KERNEL_VSYSCALL) #define PAGE_KERNEL_VSYSCALL MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL) #define PAGE_KERNEL_LARGE MAKE_GLOBAL(__PAGE_KERNEL_LARGE) #define PAGE_KERNEL_VSYSCALL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL_NOCACHE) +#define PAGE_KERNEL_ATTR(x) MAKE_GLOBAL(__PAGE_KERNEL_ATTR(x)) /* xwr */ #define __P000 PAGE_NONE @@ -305,8 +313,24 @@ static inline void ptep_set_wrprotect(st /* * Macro to mark a page protection value as "uncacheable". 
+ * Accesses through an uncached translation bypass the cache + * and do not allow for consecutive writes to be combined. */ -#define pgprot_noncached(prot) (__pgprot(pgprot_val(prot) | _PAGE_PCD | _PAGE_PWT)) +#define pgprot_noncached(prot) \ + __pgprot((pgprot_val(prot) & ~_PAGE_CACHE_MASK) | _PAGE_PCD) + +/* + * Macro to mark a page protection value as "write-combining". + * Accesses through a write-combining translation bypass the + * caches, but do allow for consecutive writes to be combined into + * single (but larger) write transactions. + * This is mostly useful for IO accesses, for memory it is often slower. + * It also implies uncached. + */ +#define pgprot_writecombine(prot) \ + __pgprot((pgprot_val(prot) & ~_PAGE_CACHE_MASK) | _PAGE_WC) + +#define pgprot_nonstd(prot) (pgprot_val(prot) & _PAGE_CACHE_MASK) static inline int pmd_large(pmd_t pte) { return (pmd_val(pte) & __LARGE_PTE) == __LARGE_PTE; } @@ -433,6 +457,7 @@ extern int kern_addr_valid(unsigned long #define pgtable_cache_init() do { } while (0) #define check_pgt_cache() do { } while (0) +/* AGP users use MTRRs for now. Need to add an ioctl to agpgart for WC */ #define PAGE_AGP PAGE_KERNEL_NOCACHE #define HAVE_PAGE_AGP 1 Index: linux/arch/x86_64/mm/pat.c =================================================================== --- /dev/null +++ linux/arch/x86_64/mm/pat.c @@ -0,0 +1,53 @@ +/* Handle caching attributes in page tables (PAT) */ +#include +#include +#include +#include +#include +#include + +static u64 boot_pat_state; + +enum { + PAT_UC = 0, /* uncached */ + PAT_WC = 1, /* Write combining */ + PAT_WT = 4, /* Write Through */ + PAT_WP = 5, /* Write Protected */ + PAT_WB = 6, /* Write Back (default) */ + PAT_UC_MINUS = 7, /* UC, but can be overridden by MTRR */ +}; + +#define PAT(x,y) ((u64)PAT_ ## y << ((x)*8)) + +void __cpuinit pat_init(void) +{ + /* Set PWT+PCD to Write-Combining.
All other bits stay the same */ + if (cpu_has_pat) { + u64 pat; + /* PTE encoding used in Linux: + PAT + |PCD + ||PWT + ||| + 000 WB default + 010 UC_MINUS _PAGE_PCD + 011 WC _PAGE_WC + PAT bit unused */ + pat = PAT(0,WB) | PAT(1,WT) | PAT(2,UC_MINUS) | PAT(3,WC) | + PAT(4,WC) | PAT(5,WT) | PAT(6,UC_MINUS) | PAT(7,UC); + rdmsrl(MSR_IA32_CR_PAT, boot_pat_state); + wrmsrl(MSR_IA32_CR_PAT, pat); + } + __flush_tlb_all(); + asm volatile("wbinvd"); +} + +#undef PAT + +void pat_shutdown(void) +{ + /* Restore CPU default pat state */ + if (cpu_has_pat) + wrmsrl(MSR_IA32_CR_PAT, boot_pat_state); +} + Index: linux/arch/x86_64/kernel/setup64.c =================================================================== --- linux.orig/arch/x86_64/kernel/setup64.c +++ linux/arch/x86_64/kernel/setup64.c @@ -284,9 +284,12 @@ void __cpuinit cpu_init (void) fpu_init(); + pat_init(); + raw_local_save_flags(kernel_eflags); } void cpu_shutdown(void) { + pat_shutdown(); } Index: linux/arch/i386/kernel/cpu/common.c =================================================================== --- linux.orig/arch/i386/kernel/cpu/common.c +++ linux/arch/i386/kernel/cpu/common.c @@ -699,10 +699,13 @@ old_gdt: current_thread_info()->status = 0; clear_used_math(); mxcsr_feature_mask_init(); + + pat_init(); } void cpu_shutdown(void) { + pat_shutdown(); } #ifdef CONFIG_HOTPLUG_CPU Index: linux/arch/i386/mm/Makefile =================================================================== --- linux.orig/arch/i386/mm/Makefile +++ linux/arch/i386/mm/Makefile @@ -2,9 +2,11 @@ # Makefile for the linux i386-specific parts of the memory manager.
# -obj-y := init.o pgtable.o fault.o ioremap.o extable.o pageattr.o mmap.o +obj-y := init.o pgtable.o fault.o ioremap.o extable.o pageattr.o mmap.o pat.o obj-$(CONFIG_NUMA) += discontig.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o obj-$(CONFIG_HIGHMEM) += highmem.o obj-$(CONFIG_BOOT_IOREMAP) += boot_ioremap.o + +pat-y := ../../x86_64/mm/pat.o Index: linux/arch/x86_64/mm/Makefile =================================================================== --- linux.orig/arch/x86_64/mm/Makefile +++ linux/arch/x86_64/mm/Makefile @@ -2,7 +2,7 @@ # Makefile for the linux x86_64-specific parts of the memory manager. # -obj-y := init.o fault.o ioremap.o extable.o pageattr.o mmap.o +obj-y := init.o fault.o ioremap.o extable.o pageattr.o mmap.o pat.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o obj-$(CONFIG_NUMA) += numa.o obj-$(CONFIG_K8_NUMA) += k8topology.o Index: linux/include/asm-i386/processor.h =================================================================== --- linux.orig/include/asm-i386/processor.h +++ linux/include/asm-i386/processor.h @@ -116,6 +116,8 @@ extern void cpu_shutdown(void); extern void print_cpu_info(struct cpuinfo_x86 *); extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c); extern unsigned short num_cache_leaves; +extern void pat_shutdown(void); +extern void pat_init(void); #ifdef CONFIG_X86_HT extern void detect_ht(struct cpuinfo_x86 *c); Index: linux/include/asm-x86_64/processor.h =================================================================== --- linux.orig/include/asm-x86_64/processor.h +++ linux/include/asm-x86_64/processor.h @@ -102,6 +102,8 @@ extern void cpu_shutdown(void); extern void print_cpu_info(struct cpuinfo_x86 *); extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c); extern unsigned short num_cache_leaves; +extern void pat_init(void); +extern void pat_shutdown(void); /* * EFLAGS bits