From 248d9a5b63bba72bfc316b8a48c6163fce5acc22 Mon Sep 17 00:00:00 2001
From: Paulius Zaleckas <paulius.zaleckas@gmail.com>
Date: Thu, 18 Feb 2010 21:53:01 +0200
Subject: [PATCH] ARM: Use cache alignment from asm/cache.h

Replace the hard-coded ".align 5" (32-byte) code alignment with
L1_CACHE_SHIFT from asm/cache.h, so that alignment matches the actual
cache line size on ARM variants whose cache lines are not 32 bytes.

Signed-off-by: Paulius Zaleckas <paulius.zaleckas@gmail.com>
---
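
Note: the GNU assembler treats ".align n" on ARM as 2^n-byte alignment,
so ".align L1_CACHE_SHIFT" aligns to exactly one cache line; on CPUs
with 32-byte lines this is identical to the old ".align 5", while CPUs
with larger lines now get matching alignment.

For reference, asm/cache.h derives the shift from Kconfig (a sketch of
the relevant definitions; exact contents depend on the kernel version):

  /* arch/arm/include/asm/cache.h (excerpt, assumed layout) */
  #define L1_CACHE_SHIFT  CONFIG_ARM_L1_CACHE_SHIFT   /* e.g. 5 or 6 */
  #define L1_CACHE_BYTES  (1 << L1_CACHE_SHIFT)       /* line size in bytes */

The .S files below gain "#include <asm/cache.h>" where it was not
already pulled in, so the macro is visible when the assembler sources
are preprocessed.
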
 arch/arm/boot/compressed/head.S    |   11 ++++++-----
 arch/arm/include/asm/dma-mapping.h |    2 +-
 arch/arm/kernel/entry-armv.S       |   31 ++++++++++++++++---------------
 arch/arm/kernel/entry-common.S     |    7 ++++---
 arch/arm/kernel/head.S             |    3 ++-
 arch/arm/kernel/vmlinux.lds.S      |    5 +++--
 arch/arm/lib/copy_page.S           |    2 +-
 arch/arm/lib/memchr.S              |    3 ++-
 arch/arm/lib/memset.S              |    3 ++-
 arch/arm/lib/memzero.S             |    3 ++-
 arch/arm/lib/strchr.S              |    3 ++-
 arch/arm/lib/strncpy_from_user.S   |    3 ++-
 arch/arm/lib/strnlen_user.S        |    3 ++-
 arch/arm/lib/strrchr.S             |    3 ++-
 arch/arm/mm/abort-ev4.S            |    3 ++-
 arch/arm/mm/abort-nommu.S          |    3 ++-
 16 files changed, 51 insertions(+), 37 deletions(-)

--- a/arch/arm/boot/compressed/head.S
+++ b/arch/arm/boot/compressed/head.S
@@ -9,6 +9,7 @@
  * published by the Free Software Foundation.
  */
 #include <linux/linkage.h>
+#include <asm/cache.h>
 
 /*
  * Debugging stuff
@@ -355,7 +356,7 @@ params:		ldr	r0, =0x10000100		@ params_p
  * This routine must preserve:
  *  r4, r5, r6, r7, r8
  */
-		.align	5
+		.align	L1_CACHE_SHIFT
 cache_on:	mov	r3, #8			@ cache_on function
 		b	call_cache_fn
 
@@ -544,7 +545,7 @@ __common_mmu_cache_on:
 		mcr	p15, 0, r3, c2, c0, 0	@ load page table pointer
 		mcr	p15, 0, r1, c3, c0, 0	@ load domain access control
 		b	1f
-		.align	5			@ cache line aligned
+		.align	L1_CACHE_SHIFT		@ cache line aligned
 1:		mcr	p15, 0, r0, c1, c0, 0	@ load control register
 		mrc	p15, 0, r0, c1, c0, 0	@ and read it back to
 		sub	pc, lr, r0, lsr #32	@ properly flush pipeline
@@ -563,7 +564,7 @@ __common_mmu_cache_on:
  * r8     = atags pointer
  * r9-r12,r14 = corrupted
  */
-		.align	5
+		.align	L1_CACHE_SHIFT
 reloc_start:	add	r9, r5, r0
 		sub	r9, r9, #128		@ do not copy the stack
 		debug_reloc_start
@@ -793,7 +794,7 @@ proc_types:
  * This routine must preserve:
  *  r4, r6, r7
  */
-		.align	5
+		.align	L1_CACHE_SHIFT
 cache_off:	mov	r3, #12			@ cache_off function
 		b	call_cache_fn
 
@@ -868,7 +869,7 @@ __armv3_mmu_cache_off:
  * This routine must preserve:
  *  r0, r4, r5, r6, r7
  */
-		.align	5
+		.align	L1_CACHE_SHIFT
 cache_clean_flush:
 		mov	r3, #16
 		b	call_cache_fn
--- a/arch/arm/kernel/entry-armv.S
+++ b/arch/arm/kernel/entry-armv.S
@@ -23,6 +23,7 @@
 #include <asm/unwind.h>
 #include <asm/unistd.h>
 #include <asm/tls.h>
+#include <asm/cache.h>
 
 #include "entry-header.S"
 
@@ -167,7 +168,7 @@ ENDPROC(__und_invalid)
 	stmia	r5, {r0 - r4}
 	.endm
 
-	.align	5
+	.align	L1_CACHE_SHIFT
 __dabt_svc:
 	svc_entry
 
@@ -215,7 +216,7 @@ __dabt_svc:
  UNWIND(.fnend		)
 ENDPROC(__dabt_svc)
 
-	.align	5
+	.align	L1_CACHE_SHIFT
 __irq_svc:
 	svc_entry
 
@@ -259,7 +260,7 @@ svc_preempt:
 	b	1b
 #endif
 
-	.align	5
+	.align	L1_CACHE_SHIFT
 __und_svc:
 #ifdef CONFIG_KPROBES
 	@ If a kprobe is about to simulate a "stmdb sp..." instruction,
@@ -305,7 +306,7 @@ __und_svc:
  UNWIND(.fnend		)
 ENDPROC(__und_svc)
 
-	.align	5
+	.align	L1_CACHE_SHIFT
 __pabt_svc:
 	svc_entry
 
@@ -341,7 +342,7 @@ __pabt_svc:
  UNWIND(.fnend		)
 ENDPROC(__pabt_svc)
 
-	.align	5
+	.align	L1_CACHE_SHIFT
 .LCcralign:
 	.word	cr_alignment
 #ifdef MULTI_DABORT
@@ -414,7 +415,7 @@ ENDPROC(__pabt_svc)
 #endif
 	.endm
 
-	.align	5
+	.align	L1_CACHE_SHIFT
 __dabt_usr:
 	usr_entry
 	kuser_cmpxchg_check
@@ -446,7 +447,7 @@ __dabt_usr:
  UNWIND(.fnend		)
 ENDPROC(__dabt_usr)
 
-	.align	5
+	.align	L1_CACHE_SHIFT
 __irq_usr:
 	usr_entry
 	kuser_cmpxchg_check
@@ -475,7 +476,7 @@ ENDPROC(__irq_usr)
 
 	.ltorg
 
-	.align	5
+	.align	L1_CACHE_SHIFT
 __und_usr:
 	usr_entry
 
@@ -691,7 +692,7 @@ __und_usr_unknown:
 	b	do_undefinstr
 ENDPROC(__und_usr_unknown)
 
-	.align	5
+	.align	L1_CACHE_SHIFT
 __pabt_usr:
 	usr_entry
 
@@ -805,7 +806,7 @@ ENDPROC(__switch_to)
 #endif
 	.endm
 
-	.align	5
+	.align	L1_CACHE_SHIFT
 	.globl	__kuser_helper_start
 __kuser_helper_start:
 
@@ -845,7 +846,7 @@ __kuser_memory_barrier:				@ 0xffff0fa0
 	smp_dmb
 	usr_ret	lr
 
-	.align	5
+	.align	L1_CACHE_SHIFT
 
 /*
  * Reference prototype:
@@ -972,7 +973,7 @@ kuser_cmpxchg_fixup:
 
 #endif
 
-	.align	5
+	.align	L1_CACHE_SHIFT
 
 /*
  * Reference prototype:
@@ -1050,7 +1051,7 @@ __kuser_helper_end:
  * of which is copied into r0 for the mode specific abort handler.
  */
 	.macro	vector_stub, name, mode, correction=0
-	.align	5
+	.align	L1_CACHE_SHIFT
 
 vector_\name:
 	.if \correction
@@ -1181,7 +1182,7 @@ __stubs_start:
 	.long	__und_invalid			@  e
 	.long	__und_invalid			@  f
 
-	.align	5
+	.align	L1_CACHE_SHIFT
 
 /*=============================================================================
  * Undefined FIQs
@@ -1211,7 +1212,7 @@ vector_addrexcptn:
  * We group all the following data together to optimise
  * for CPUs with separate I & D caches.
  */
-	.align	5
+	.align	L1_CACHE_SHIFT
 
 .LCvswi:
 	.word	vector_swi
--- a/arch/arm/kernel/entry-common.S
+++ b/arch/arm/kernel/entry-common.S
@@ -10,13 +10,14 @@
 
 #include <asm/unistd.h>
 #include <asm/ftrace.h>
+#include <asm/cache.h>
 #include <mach/entry-macro.S>
 #include <asm/unwind.h>
 
 #include "entry-header.S"
 
 
-	.align	5
+	.align	L1_CACHE_SHIFT
 /*
  * This is the fast syscall return path.  We do as little as
  * possible here, and this includes saving r0 back into the SVC
@@ -260,7 +261,7 @@ ENDPROC(ftrace_stub)
 #define A710(code...)
 #endif
 
-	.align	5
+	.align	L1_CACHE_SHIFT
 ENTRY(vector_swi)
 	sub	sp, sp, #S_FRAME_SIZE
 	stmia	sp, {r0 - r12}			@ Calling r0 - r12
@@ -404,7 +405,7 @@ __sys_trace_return:
 	bl	syscall_trace
 	b	ret_slow_syscall
 
-	.align	5
+	.align	L1_CACHE_SHIFT
 #ifdef CONFIG_ALIGNMENT_TRAP
 	.type	__cr_alignment, #object
 __cr_alignment:
--- a/arch/arm/kernel/head.S
+++ b/arch/arm/kernel/head.S
@@ -21,6 +21,7 @@
 #include <asm/memory.h>
 #include <asm/thread_info.h>
 #include <asm/system.h>
+#include <asm/cache.h>
 
 #ifdef CONFIG_DEBUG_LL
 #include <mach/debug-macro.S>
@@ -373,7 +374,7 @@ ENDPROC(__enable_mmu)
  *
  * other registers depend on the function called upon completion
  */
-	.align	5
+	.align	L1_CACHE_SHIFT
 __turn_mmu_on:
 	mov	r0, r0
 	mcr	p15, 0, r0, c1, c0, 0		@ write control reg
--- a/arch/arm/kernel/vmlinux.lds.S
+++ b/arch/arm/kernel/vmlinux.lds.S
@@ -7,6 +7,7 @@
 #include <asm/thread_info.h>
 #include <asm/memory.h>
 #include <asm/page.h>
+#include <asm/cache.h>
 	
 #define PROC_INFO							\
 	VMLINUX_SYMBOL(__proc_info_begin) = .;				\
--- a/arch/arm/lib/copy_page.S
+++ b/arch/arm/lib/copy_page.S
@@ -17,7 +17,7 @@
 #define COPY_COUNT (PAGE_SZ / (2 * L1_CACHE_BYTES) PLD( -1 ))
 
 		.text
-		.align	5
+		.align	L1_CACHE_SHIFT
 /*
  * StrongARM optimised copy_page routine
  * now 1.78bytes/cycle, was 1.60 bytes/cycle (50MHz bus -> 89MB/s)
--- a/arch/arm/lib/memchr.S
+++ b/arch/arm/lib/memchr.S
@@ -11,9 +11,10 @@
  */
 #include <linux/linkage.h>
 #include <asm/assembler.h>
+#include <asm/cache.h>
 
 	.text
-	.align	5
+	.align	L1_CACHE_SHIFT
 ENTRY(memchr)
 1:	subs	r2, r2, #1
 	bmi	2f
--- a/arch/arm/lib/memset.S
+++ b/arch/arm/lib/memset.S
@@ -11,9 +11,10 @@
  */
 #include <linux/linkage.h>
 #include <asm/assembler.h>
+#include <asm/cache.h>
 
 	.text
-	.align	5
+	.align	L1_CACHE_SHIFT
 	.word	0
 
 1:	subs	r2, r2, #4		@ 1 do we have enough
--- a/arch/arm/lib/memzero.S
+++ b/arch/arm/lib/memzero.S
@@ -9,9 +9,10 @@
  */
 #include <linux/linkage.h>
 #include <asm/assembler.h>
+#include <asm/cache.h>
 
 	.text
-	.align	5
+	.align	L1_CACHE_SHIFT
 	.word	0
 /*
  * Align the pointer in r0.  r3 contains the number of bytes that we are
--- a/arch/arm/lib/strchr.S
+++ b/arch/arm/lib/strchr.S
@@ -11,9 +11,10 @@
  */
 #include <linux/linkage.h>
 #include <asm/assembler.h>
+#include <asm/cache.h>
 
 		.text
-		.align	5
+		.align	L1_CACHE_SHIFT
 ENTRY(strchr)
 		and	r1, r1, #0xff
 1:		ldrb	r2, [r0], #1
--- a/arch/arm/lib/strncpy_from_user.S
+++ b/arch/arm/lib/strncpy_from_user.S
@@ -10,9 +10,10 @@
 #include <linux/linkage.h>
 #include <asm/assembler.h>
 #include <asm/errno.h>
+#include <asm/cache.h>
 
 	.text
-	.align	5
+	.align	L1_CACHE_SHIFT
 
 /*
  * Copy a string from user space to kernel space.
--- a/arch/arm/lib/strnlen_user.S
+++ b/arch/arm/lib/strnlen_user.S
@@ -10,9 +10,10 @@
 #include <linux/linkage.h>
 #include <asm/assembler.h>
 #include <asm/errno.h>
+#include <asm/cache.h>
 
 	.text
-	.align	5
+	.align	L1_CACHE_SHIFT
 
 /* Prototype: unsigned long __strnlen_user(const char *str, long n)
  * Purpose  : get length of a string in user memory
--- a/arch/arm/lib/strrchr.S
+++ b/arch/arm/lib/strrchr.S
@@ -11,9 +11,10 @@
  */
 #include <linux/linkage.h>
 #include <asm/assembler.h>
+#include <asm/cache.h>
 
 		.text
-		.align	5
+		.align	L1_CACHE_SHIFT
 ENTRY(strrchr)
 		mov	r3, #0
 1:		ldrb	r2, [r0], #1
--- a/arch/arm/mm/abort-ev4.S
+++ b/arch/arm/mm/abort-ev4.S
@@ -1,5 +1,6 @@
 #include <linux/linkage.h>
 #include <asm/assembler.h>
+#include <asm/cache.h>
 /*
  * Function: v4_early_abort
  *
@@ -17,7 +18,7 @@
  * abort here if the I-TLB and D-TLB aren't seeing the same
  * picture.  Unfortunately, this does happen.  We live with it.
  */
-	.align	5
+	.align	L1_CACHE_SHIFT
 ENTRY(v4_early_abort)
 	mrc	p15, 0, r1, c5, c0, 0		@ get FSR
 	mrc	p15, 0, r0, c6, c0, 0		@ get FAR
--- a/arch/arm/mm/abort-nommu.S
+++ b/arch/arm/mm/abort-nommu.S
@@ -1,5 +1,6 @@
 #include <linux/linkage.h>
 #include <asm/assembler.h>
+#include <asm/cache.h>
 /*
  * Function: nommu_early_abort
  *
@@ -12,7 +13,7 @@
  * Note: There is no FSR/FAR on !CPU_CP15_MMU cores.
  *       Just fill zero into the registers.
  */
-	.align	5
+	.align	L1_CACHE_SHIFT
 ENTRY(nommu_early_abort)
 	mov	r0, #0				@ clear r0, r1 (no FSR/FAR)
 	mov	r1, #0