From: ebiederm@xmission.com (Eric W. Biederman) [Mostly based on code from Eric with some changes by me -AK] PAT (or setting caching policy in the page table entries) has been a long desired feature in the kernel and as large memory sizes become more prevalent it becomes increasingly hard to specify all of the regions that need write-back caching with just 8 MTRRs, much less add in the write-combining regions. This implementation does not attempt to change the world but it is instead an incremental improvement on what we already have. We already have page attributes of write-back (neither PCD nor PWT), uncached (PCD and PWT), and uncached but allow write-combining (PCD but not PWT) in the x86 page tables. This implementation simply promotes (PCD but not PWT) to request write-combining instead of simply allowing it. If PAT is not implemented on the cpu, someone requesting write-combining will get an uncached area that will allow the mtrrs to specify write-combining. The way we used the existing page attributes was not completely consistent, and it is cumbersome to use if you don't understand the architecture minutiae. So I have added an implementation of pgprot_writecombine and the flags _PAGE_MA_WB, _PAGE_MA_WC, _PAGE_MA_UC so it is clearer what the users are doing. There should probably be an ioremap_writecombine added as well but for now you can use __ioremap(..., _PAGE_MA_WC); In previous conversations concerns have been raised about aliasing issues caused by the same physical addresses being cached in different ways. Currently this code only allows for an additional flavor of uncached access to physical memory addresses which should be hard to abuse, and should raise no additional aliasing problems. No attempt has been made to fix theoretical aliasing problems. I have tested this code and it works for me but it probably needs to sit in the -mm tree for a little while, to get broader exposure. Signed-off-by: Eric W.
Biederman Signed-off-by: Andi Kleen Index: linux/arch/i386/pci/i386.c =================================================================== --- linux.orig/arch/i386/pci/i386.c +++ linux/arch/i386/pci/i386.c @@ -274,8 +274,6 @@ void pcibios_set_master(struct pci_dev * int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, enum pci_mmap_state mmap_state, int write_combine) { - unsigned long prot; - /* I/O space cannot be accessed via normal processor loads and * stores on this platform. */ @@ -285,14 +283,11 @@ int pci_mmap_page_range(struct pci_dev * /* Leave vm_pgoff as-is, the PCI space address is the physical * address on this platform. */ - prot = pgprot_val(vma->vm_page_prot); - if (boot_cpu_data.x86 > 3) - prot |= _PAGE_PCD | _PAGE_PWT; - vma->vm_page_prot = __pgprot(prot); + if (write_combine) + vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); + else + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); - /* Write-combine setting is ignored, it is changed via the mtrr - * interfaces on this platform. 
- */ if (io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, vma->vm_end - vma->vm_start, vma->vm_page_prot)) Index: linux/include/asm-i386/cpufeature.h =================================================================== --- linux.orig/include/asm-i386/cpufeature.h +++ linux/include/asm-i386/cpufeature.h @@ -135,6 +135,8 @@ #define cpu_has_pmm boot_cpu_has(X86_FEATURE_PMM) #define cpu_has_pmm_enabled boot_cpu_has(X86_FEATURE_PMM_EN) +#define cpu_has_pat boot_cpu_has(X86_FEATURE_PAT) + #endif /* __ASM_I386_CPUFEATURE_H */ /* Index: linux/include/asm-i386/msr.h =================================================================== --- linux.orig/include/asm-i386/msr.h +++ linux/include/asm-i386/msr.h @@ -136,6 +136,8 @@ static inline void wrmsrl (unsigned long #define MSR_IA32_LASTINTFROMIP 0x1dd #define MSR_IA32_LASTINTTOIP 0x1de +#define MSR_IA32_CR_PAT 0x277 + #define MSR_IA32_MC0_CTL 0x400 #define MSR_IA32_MC0_STATUS 0x401 #define MSR_IA32_MC0_ADDR 0x402 Index: linux/include/asm-i386/pgtable.h =================================================================== --- linux.orig/include/asm-i386/pgtable.h +++ linux/include/asm-i386/pgtable.h @@ -119,6 +119,12 @@ void paging_init(void); #define _PAGE_UNUSED2 0x400 #define _PAGE_UNUSED3 0x800 +/* We redefine PWT|PCD to be write combining. 
PAT bit is not used */ + +#define _PAGE_WC (_PAGE_PWT|_PAGE_PCD) + +#define _PAGE_CACHE_MASK (_PAGE_PWT|_PAGE_PCD) + /* If _PAGE_PRESENT is clear, we use these: */ #define _PAGE_FILE 0x040 /* nonlinear file mapping, saved PTE; unset:swap */ #define _PAGE_PROTNONE 0x080 /* if the user mapped it with PROT_NONE; @@ -159,6 +165,7 @@ void paging_init(void); extern unsigned long long __PAGE_KERNEL, __PAGE_KERNEL_EXEC; #define __PAGE_KERNEL_RO (__PAGE_KERNEL & ~_PAGE_RW) #define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_PCD) +#define __PAGE_KERNEL_WC (__PAGE_KERNEL | _PAGE_WC) #define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE) #define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE) @@ -166,6 +173,7 @@ extern unsigned long long __PAGE_KERNEL, #define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO) #define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC) #define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE) +#define PAGE_KERNEL_WC __pgprot(__PAGE_KERNEL_WC) #define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE) #define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC) @@ -293,11 +301,31 @@ static inline void clone_pgd_range(pgd_t } /* - * Macro to mark a page protection value as "uncacheable". On processors which do not support - * it, this is a no-op. - */ -#define pgprot_noncached(prot) ((boot_cpu_data.x86 > 3) \ - ? (__pgprot(pgprot_val(prot) | _PAGE_PCD | _PAGE_PWT)) : (prot)) + * Macro to mark a page protection value as "uncacheable". + * Accesses through a uncached translation bypasses the cache + * and do not allow for consecutive writes to be combined. + * On processors which do not support it, this is a no-op. + */ +#define pgprot_noncached(prot) \ + ((boot_cpu_data.x86 > 3) ? \ + (__pgprot((pgprot_val(prot) & ~_PAGE_CACHE_MASK) | _PAGE_PCD)): \ + (prot)) + +/* + * Macro to make mark a page protection value as "write-combining". 
+ * Accesses through a write-combining translation bypasses the + * caches, but does allow for consecutive writes to be combined into + * single (but larger) write transactions. + * On processors that do not support PAT this setting allows + * mtrrs to set write combining. + * On processors which do not support it, this is a no-op. + */ +#define pgprot_writecombine(prot) \ + ((boot_cpu_data.x86 > 3) ? \ + (__pgprot((pgprot_val(prot) & ~_PAGE_CACHE_MASK) | _PAGE_WC)): \ + (prot)) + +#define pgprot_nonstd(prot) (pgprot_val(prot) & _PAGE_CACHE_MASK) /* * Conversion functions: convert a page and protection to a page entry, Index: linux/include/asm-x86_64/cpufeature.h =================================================================== --- linux.orig/include/asm-x86_64/cpufeature.h +++ linux/include/asm-x86_64/cpufeature.h @@ -112,5 +112,6 @@ #define cpu_has_cyrix_arr 0 #define cpu_has_centaur_mcr 0 #define cpu_has_clflush boot_cpu_has(X86_FEATURE_CLFLSH) +#define cpu_has_pat 1 #endif /* __ASM_X8664_CPUFEATURE_H */ Index: linux/include/asm-x86_64/msr.h =================================================================== --- linux.orig/include/asm-x86_64/msr.h +++ linux/include/asm-x86_64/msr.h @@ -212,6 +212,8 @@ static inline unsigned int cpuid_edx(uns #define MSR_MTRRfix4K_F8000 0x26f #define MSR_MTRRdefType 0x2ff +#define MSR_IA32_CR_PAT 0x277 + #define MSR_IA32_MC0_CTL 0x400 #define MSR_IA32_MC0_STATUS 0x401 #define MSR_IA32_MC0_ADDR 0x402 Index: linux/include/asm-x86_64/pgtable.h =================================================================== --- linux.orig/include/asm-x86_64/pgtable.h +++ linux/include/asm-x86_64/pgtable.h @@ -164,6 +164,12 @@ static inline pte_t ptep_get_and_clear_f #define _PAGE_FILE 0x040 /* nonlinear file mapping, saved PTE; unset:swap */ #define _PAGE_GLOBAL 0x100 /* Global TLB entry */ +/* We redefine PWT|PCD to be write combining. 
PAT bit is not used */ + +#define _PAGE_WC (_PAGE_PWT|_PAGE_PCD) + +#define _PAGE_CACHE_MASK (_PAGE_PWT|_PAGE_PCD) + #define _PAGE_PROTNONE 0x080 /* If not present */ #define _PAGE_NX (1UL<<_PAGE_BIT_NX) @@ -203,6 +209,7 @@ static inline pte_t ptep_get_and_clear_f #define PAGE_KERNEL_EXEC MAKE_GLOBAL(__PAGE_KERNEL_EXEC) #define PAGE_KERNEL_RO MAKE_GLOBAL(__PAGE_KERNEL_RO) #define PAGE_KERNEL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_NOCACHE) +#define PAGE_KERNEL_WC MAKE_GLOBAL(__PAGE_KERNEL_WC) #define PAGE_KERNEL_VSYSCALL32 __pgprot(__PAGE_KERNEL_VSYSCALL) #define PAGE_KERNEL_VSYSCALL MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL) #define PAGE_KERNEL_LARGE MAKE_GLOBAL(__PAGE_KERNEL_LARGE) @@ -310,8 +317,24 @@ static inline void ptep_set_wrprotect(st /* * Macro to mark a page protection value as "uncacheable". + * Accesses through a uncached translation bypasses the cache + * and do not allow for consecutive writes to be combined. */ -#define pgprot_noncached(prot) (__pgprot(pgprot_val(prot) | _PAGE_PCD | _PAGE_PWT)) +#define pgprot_noncached(prot) \ + __pgprot((pgprot_val(prot) & ~_PAGE_CACHE_MASK) | _PAGE_PCD) + +/* + * Macro to make mark a page protection value as "write-combining". + * Accesses through a write-combining translation works bypasses the + * caches, but does allow for consecutive writes to be combined into + * single (but larger) write transactions. + * This is mostly useful for IO accesses, for memory it is often slower. + * It also implies uncached. + */ +#define pgprot_writecombine(prot) \ + __pgprot((pgprot_val(prot) & ~_PAGE_CACHE_MASK) | _PAGE_WC) + +#define pgprot_nonstd(prot) (pgprot_val(prot) & _PAGE_CACHE_MASK) static inline int pmd_large(pmd_t pte) { return (pmd_val(pte) & __LARGE_PTE) == __LARGE_PTE; @@ -434,6 +457,7 @@ extern int kern_addr_valid(unsigned long #define pgtable_cache_init() do { } while (0) #define check_pgt_cache() do { } while (0) +/* AGP users use MTRRs for now. 
Need to add an ioctl to agpgart for WC */ #define PAGE_AGP PAGE_KERNEL_NOCACHE #define HAVE_PAGE_AGP 1 Index: linux/arch/x86_64/mm/pat.c =================================================================== --- /dev/null +++ linux/arch/x86_64/mm/pat.c @@ -0,0 +1,210 @@ +/* Handle caching attributes in page tables (PAT) */ +#include +#include +#include +#include +#include +#include +#include + +static u64 boot_pat_state; + +enum { + PAT_UC = 0, /* uncached */ + PAT_WC = 1, /* Write combining */ + PAT_WT = 4, /* Write Through */ + PAT_WP = 5, /* Write Protected */ + PAT_WB = 6, /* Write Back (default) */ + PAT_UC_MINUS = 7, /* UC, but can be overriden by MTRR */ +}; + +#define PAT(x,y) ((u64)PAT_ ## y << ((x)*8)) + +void __cpuinit pat_init(void) +{ + /* Set PWT+PCD to Write-Combining. All other bits stay the same */ + if (cpu_has_pat) { + u64 pat; + /* PTE encoding used in Linux: + PAT + |PCD + ||PWT + ||| + 000 WB default + 010 UC_MINUS _PAGE_PCD + 011 WC _PAGE_WC + PAT bit unused */ + pat = PAT(0,WB) | PAT(1,WT) | PAT(2,UC_MINUS) | PAT(3,UC) | + PAT(4,WC) | PAT(5,WT) | PAT(6,UC_MINUS) | PAT(7,UC); + rdmsrl(MSR_IA32_CR_PAT, boot_pat_state); + wrmsrl(MSR_IA32_CR_PAT, pat); + } + __flush_tlb_global(); + asm volatile("wbinvd"); +} + +#undef PAT + +void pat_shutdown(void) +{ + /* Restore CPU default pat state */ + if (cpu_has_pat) + wrmsrl(MSR_IA32_CR_PAT, boot_pat_state); +} + +/* Manage caching attributes of physical memory areas. This avoids + any illegal aliases. 
*/ + +/* Protected by the init_mm semaphore */ +static struct rb_root mattr_root = RB_ROOT; + +struct memattr { + struct rb_node nd; + unsigned long start, end; + pgprot_t attr; + int count; +}; + +/* lookup first element intersecting start-end */ +static struct memattr *mattr_lookup(unsigned long start, unsigned long end) +{ + struct rb_node *p; + + for (p = mattr_root.rb_node; p != NULL; ) { + struct memattr *n = rb_entry(p, struct memattr, nd); + if (start >= n->end) + p = p->rb_right; + else if (end <= n->start) + p = p->rb_left; + else + break; + } + if (!p) + return NULL; + for (;;) { + struct memattr *w = NULL; + struct rb_node *prev = rb_prev(p); + if (!prev) + break; + w = rb_entry(prev, struct memattr, nd); + if (w->end <= start) + break; + p = prev; + } + return rb_entry(p, struct memattr, nd); +} + +/* Insert a new shared policy into the list. */ +static void mattr_insert(struct memattr *new) +{ + struct rb_node **p = &mattr_root.rb_node; + struct rb_node *parent = NULL; + struct memattr *nd; + + while (*p) { + parent = *p; + nd = rb_entry(parent, struct memattr, nd); + if (new->start < nd->start) + p = &(*p)->rb_left; + else if (new->end > nd->end) + p = &(*p)->rb_right; + else + BUG(); + } + rb_link_node(&new->nd, parent, p); + rb_insert_color(&new->nd, &mattr_root); +} + +struct memattr * +mattr_alloc(unsigned long start, unsigned long end, pgprot_t attr) +{ + struct memattr *n = kmalloc(sizeof(struct memattr), GFP_KERNEL); + if (!n) + return NULL; + n->start = start; + n->end = end; + n->attr = attr; + n->count = 0; + return n; +} + +static void mattr_free(struct memattr *m) +{ + rb_erase(&m->nd, &mattr_root); + kfree(m); +} + +/** mattr_get - Get reference for attribute of physical memory range + * @start: + * @end: + * @attr: + * Caller ensures locking. 
+ */
+int mattr_get(unsigned long start, unsigned long end, pgprot_t attr)
+{
+	struct rb_node *next;
+	struct memattr *m, *overlap, *new = NULL;
+	int count = 1;
+
+	overlap = mattr_lookup(start, end);
+	/* First check for conflicts before doing anything final */
+	for (m = overlap;
+	     m != NULL && m->start < end;
+	     m = rb_entry(next, struct memattr, nd)) {
+		/* Conflict? Compare only the caching bits of the attributes. */
+		if ((pgprot_val(m->attr) & _PAGE_CACHE_MASK) !=
+		    (pgprot_val(attr) & _PAGE_CACHE_MASK))
+			return -EIO;
+		if (m->start < start)
+			start = m->start;
+		if (m->end > end)
+			end = m->end;
+		count += m->count;
+		next = rb_next(&m->nd);
+		if (!next)
+			break;
+	}
+	/* Get new one */
+	if (pgprot_nonstd(attr)) {
+		new = mattr_alloc(start, end, attr);
+		if (!new)
+			return -ENOMEM;
+	}
+	/* Now free all old attributes */
+	for (m = overlap;
+	     m && m->start < end;
+	     m = rb_entry(next, struct memattr, nd)) {
+		next = rb_next(&m->nd);
+		mattr_free(m);
+		if (!next)
+			break;
+	}
+	/* And replace with new one */
+	if (new != NULL) {
+		new->count = count;
+		mattr_insert(new);
+	}
+	return 0;
+}
+
+/** mattr_put - free reference of attribute for physical memory range.
+ * @start: start of the physical address range
+ * @end: end of the physical address range (exclusive)
+ * @attr: attribute previously passed to mattr_get() for this range
+ * Caller ensures locking.
+ */ +void mattr_put(unsigned long start, unsigned long end, pgprot_t attr) +{ + struct memattr *m = mattr_lookup(start, end); + if (!pgprot_nonstd(attr)) { + BUG_ON(m != NULL); + return; + } + BUG_ON(m == NULL); + BUG_ON(m->start > start); + BUG_ON(m->end < end); + BUG_ON(pgprot_val(m->attr) != pgprot_val(attr)); + BUG_ON(m->count <= 0); + if (--m->count == 0) + mattr_free(m); +} Index: linux/arch/x86_64/kernel/setup64.c =================================================================== --- linux.orig/arch/x86_64/kernel/setup64.c +++ linux/arch/x86_64/kernel/setup64.c @@ -290,8 +290,11 @@ void __cpuinit cpu_init (void) set_debugreg(0UL, 7); fpu_init(); + + pat_init(); } void cpu_shutdown(void) { + pat_shutdown(); } Index: linux/arch/i386/kernel/cpu/common.c =================================================================== --- linux.orig/arch/i386/kernel/cpu/common.c +++ linux/arch/i386/kernel/cpu/common.c @@ -691,10 +691,13 @@ old_gdt: current_thread_info()->status = 0; clear_used_math(); mxcsr_feature_mask_init(); + + pat_init(); } void cpu_shutdown(void) { + pat_shutdown(); } #ifdef CONFIG_HOTPLUG_CPU Index: linux/arch/i386/mm/Makefile =================================================================== --- linux.orig/arch/i386/mm/Makefile +++ linux/arch/i386/mm/Makefile @@ -2,9 +2,11 @@ # Makefile for the linux i386-specific parts of the memory manager. # -obj-y := init.o pgtable.o fault.o ioremap.o extable.o pageattr.o mmap.o +obj-y := init.o pgtable.o fault.o ioremap.o extable.o pageattr.o mmap.o pat.o obj-$(CONFIG_NUMA) += discontig.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o obj-$(CONFIG_HIGHMEM) += highmem.o obj-$(CONFIG_BOOT_IOREMAP) += boot_ioremap.o + +pat-y := ../../x86_64/mm/pat.o Index: linux/arch/x86_64/mm/Makefile =================================================================== --- linux.orig/arch/x86_64/mm/Makefile +++ linux/arch/x86_64/mm/Makefile @@ -2,7 +2,7 @@ # Makefile for the linux x86_64-specific parts of the memory manager. 
# -obj-y := init.o fault.o ioremap.o extable.o pageattr.o mmap.o +obj-y := init.o fault.o ioremap.o extable.o pageattr.o mmap.o pat.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o obj-$(CONFIG_NUMA) += numa.o obj-$(CONFIG_K8_NUMA) += k8topology.o Index: linux/include/asm-i386/processor.h =================================================================== --- linux.orig/include/asm-i386/processor.h +++ linux/include/asm-i386/processor.h @@ -116,6 +116,8 @@ extern void cpu_shutdown(void); extern void print_cpu_info(struct cpuinfo_x86 *); extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c); extern unsigned short num_cache_leaves; +extern void pat_shutdown(void); +extern void pat_init(void); #ifdef CONFIG_X86_HT extern void detect_ht(struct cpuinfo_x86 *c); Index: linux/include/asm-x86_64/processor.h =================================================================== --- linux.orig/include/asm-x86_64/processor.h +++ linux/include/asm-x86_64/processor.h @@ -102,6 +102,8 @@ extern void cpu_shutdown(void); extern void print_cpu_info(struct cpuinfo_x86 *); extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c); extern unsigned short num_cache_leaves; +extern void pat_init(void); +extern void pat_shutdown(void); /* * EFLAGS bits