mirror of
git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2025-09-04 20:19:47 +08:00
For many sun4v processor types, reading or writing a privileged register
has a latency of 40 to 70 cycles. Use a combination of the low-latency
allclean, otherw, normalw, and nop instructions in etrap and rtrap to
replace 2 rdpr and 5 wrpr instructions and improve etrap/rtrap
performance. allclean, otherw, and normalw are available on NG2 and
later processors.
The average ticks to execute the flush windows trap ("ta 0x3") with and
without this patch on select platforms:
CPU Not patched Patched % Latency Reduction
NG2 1762 1558 -11.58
NG4 3619 3204 -11.47
M7 3015 2624 -12.97
SPARC64-X 829 770 -7.12
Signed-off-by: Anthony Yznaga <anthony.yznaga@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
349 lines
9.0 KiB
ArmAsm
349 lines
9.0 KiB
ArmAsm
/*
|
|
* rtrap.S: Preparing for return from trap on Sparc V9.
|
|
*
|
|
* Copyright (C) 1997,1998 Jakub Jelinek (jj@sunsite.mff.cuni.cz)
|
|
* Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu)
|
|
*/
|
|
|
|
|
|
#include <asm/asi.h>
|
|
#include <asm/pstate.h>
|
|
#include <asm/ptrace.h>
|
|
#include <asm/spitfire.h>
|
|
#include <asm/head.h>
|
|
#include <asm/visasm.h>
|
|
#include <asm/processor.h>
|
|
|
|
#ifdef CONFIG_CONTEXT_TRACKING
|
|
# define SCHEDULE_USER schedule_user
|
|
#else
|
|
# define SCHEDULE_USER schedule
|
|
#endif
|
|
|
|
.text
|
|
.align 32
|
|
__handle_preemption:
|
|
call SCHEDULE_USER
|
|
wrpr %g0, RTRAP_PSTATE, %pstate
|
|
ba,pt %xcc, __handle_preemption_continue
|
|
wrpr %g0, RTRAP_PSTATE_IRQOFF, %pstate
|
|
|
|
__handle_user_windows:
|
|
call fault_in_user_windows
|
|
wrpr %g0, RTRAP_PSTATE, %pstate
|
|
ba,pt %xcc, __handle_preemption_continue
|
|
wrpr %g0, RTRAP_PSTATE_IRQOFF, %pstate
|
|
|
|
__handle_userfpu:
|
|
rd %fprs, %l5
|
|
andcc %l5, FPRS_FEF, %g0
|
|
sethi %hi(TSTATE_PEF), %o0
|
|
be,a,pn %icc, __handle_userfpu_continue
|
|
andn %l1, %o0, %l1
|
|
ba,a,pt %xcc, __handle_userfpu_continue
|
|
|
|
__handle_signal:
|
|
mov %l5, %o1
|
|
add %sp, PTREGS_OFF, %o0
|
|
mov %l0, %o2
|
|
call do_notify_resume
|
|
wrpr %g0, RTRAP_PSTATE, %pstate
|
|
wrpr %g0, RTRAP_PSTATE_IRQOFF, %pstate
|
|
|
|
/* Signal delivery can modify pt_regs tstate, so we must
|
|
* reload it.
|
|
*/
|
|
ldx [%sp + PTREGS_OFF + PT_V9_TSTATE], %l1
|
|
sethi %hi(0xf << 20), %l4
|
|
and %l1, %l4, %l4
|
|
ba,pt %xcc, __handle_preemption_continue
|
|
andn %l1, %l4, %l1
|
|
|
|
/* When returning from a NMI (%pil==15) interrupt we want to
|
|
* avoid running softirqs, doing IRQ tracing, preempting, etc.
|
|
*/
|
|
.globl rtrap_nmi
|
|
rtrap_nmi: ldx [%sp + PTREGS_OFF + PT_V9_TSTATE], %l1
|
|
sethi %hi(0xf << 20), %l4
|
|
and %l1, %l4, %l4
|
|
andn %l1, %l4, %l1
|
|
srl %l4, 20, %l4
|
|
ba,pt %xcc, rtrap_no_irq_enable
|
|
nop
|
|
/* Do not actually set the %pil here. We will do that
|
|
* below after we clear PSTATE_IE in the %pstate register.
|
|
* If we re-enable interrupts here, we can recurse down
|
|
* the hardirq stack potentially endlessly, causing a
|
|
* stack overflow.
|
|
*/
|
|
|
|
.align 64
|
|
.globl rtrap_irq, rtrap, irqsz_patchme, rtrap_xcall
|
|
rtrap_irq:
|
|
rtrap:
|
|
/* mm/ultra.S:xcall_report_regs KNOWS about this load. */
|
|
ldx [%sp + PTREGS_OFF + PT_V9_TSTATE], %l1
|
|
rtrap_xcall:
|
|
sethi %hi(0xf << 20), %l4
|
|
and %l1, %l4, %l4
|
|
andn %l1, %l4, %l1
|
|
srl %l4, 20, %l4
|
|
#ifdef CONFIG_TRACE_IRQFLAGS
|
|
brnz,pn %l4, rtrap_no_irq_enable
|
|
nop
|
|
call trace_hardirqs_on
|
|
nop
|
|
/* Do not actually set the %pil here. We will do that
|
|
* below after we clear PSTATE_IE in the %pstate register.
|
|
* If we re-enable interrupts here, we can recurse down
|
|
* the hardirq stack potentially endlessly, causing a
|
|
* stack overflow.
|
|
*
|
|
* It is tempting to put this test and trace_hardirqs_on
|
|
* call at the 'rt_continue' label, but that will not work
|
|
* as that path hits unconditionally and we do not want to
|
|
* execute this in NMI return paths, for example.
|
|
*/
|
|
#endif
|
|
rtrap_no_irq_enable:
|
|
andcc %l1, TSTATE_PRIV, %l3
|
|
bne,pn %icc, to_kernel
|
|
nop
|
|
|
|
/* We must hold IRQs off and atomically test schedule+signal
|
|
* state, then hold them off all the way back to userspace.
|
|
* If we are returning to kernel, none of this matters. Note
|
|
* that we are disabling interrupts via PSTATE_IE, not using
|
|
* %pil.
|
|
*
|
|
* If we do not do this, there is a window where we would do
|
|
* the tests, later the signal/resched event arrives but we do
|
|
* not process it since we are still in kernel mode. It would
|
|
* take until the next local IRQ before the signal/resched
|
|
* event would be handled.
|
|
*
|
|
* This also means that if we have to deal with user
|
|
* windows, we have to redo all of these sched+signal checks
|
|
* with IRQs disabled.
|
|
*/
|
|
to_user: wrpr %g0, RTRAP_PSTATE_IRQOFF, %pstate
|
|
wrpr 0, %pil
|
|
__handle_preemption_continue:
|
|
ldx [%g6 + TI_FLAGS], %l0
|
|
sethi %hi(_TIF_USER_WORK_MASK), %o0
|
|
or %o0, %lo(_TIF_USER_WORK_MASK), %o0
|
|
andcc %l0, %o0, %g0
|
|
sethi %hi(TSTATE_PEF), %o0
|
|
be,pt %xcc, user_nowork
|
|
andcc %l1, %o0, %g0
|
|
andcc %l0, _TIF_NEED_RESCHED, %g0
|
|
bne,pn %xcc, __handle_preemption
|
|
andcc %l0, _TIF_DO_NOTIFY_RESUME_MASK, %g0
|
|
bne,pn %xcc, __handle_signal
|
|
ldub [%g6 + TI_WSAVED], %o2
|
|
brnz,pn %o2, __handle_user_windows
|
|
nop
|
|
sethi %hi(TSTATE_PEF), %o0
|
|
andcc %l1, %o0, %g0
|
|
|
|
/* This fpdepth clear is necessary for non-syscall rtraps only */
|
|
user_nowork:
|
|
bne,pn %xcc, __handle_userfpu
|
|
stb %g0, [%g6 + TI_FPDEPTH]
|
|
__handle_userfpu_continue:
|
|
|
|
rt_continue: ldx [%sp + PTREGS_OFF + PT_V9_G1], %g1
|
|
ldx [%sp + PTREGS_OFF + PT_V9_G2], %g2
|
|
|
|
ldx [%sp + PTREGS_OFF + PT_V9_G3], %g3
|
|
ldx [%sp + PTREGS_OFF + PT_V9_G4], %g4
|
|
ldx [%sp + PTREGS_OFF + PT_V9_G5], %g5
|
|
brz,pt %l3, 1f
|
|
mov %g6, %l2
|
|
|
|
/* Must do this before thread reg is clobbered below. */
|
|
LOAD_PER_CPU_BASE(%g5, %g6, %i0, %i1, %i2)
|
|
1:
|
|
ldx [%sp + PTREGS_OFF + PT_V9_G6], %g6
|
|
ldx [%sp + PTREGS_OFF + PT_V9_G7], %g7
|
|
|
|
/* Normal globals are restored, go to trap globals. */
|
|
661: wrpr %g0, RTRAP_PSTATE_AG_IRQOFF, %pstate
|
|
nop
|
|
.section .sun4v_2insn_patch, "ax"
|
|
.word 661b
|
|
wrpr %g0, RTRAP_PSTATE_IRQOFF, %pstate
|
|
SET_GL(1)
|
|
.previous
|
|
|
|
mov %l2, %g6
|
|
|
|
ldx [%sp + PTREGS_OFF + PT_V9_I0], %i0
|
|
ldx [%sp + PTREGS_OFF + PT_V9_I1], %i1
|
|
|
|
ldx [%sp + PTREGS_OFF + PT_V9_I2], %i2
|
|
ldx [%sp + PTREGS_OFF + PT_V9_I3], %i3
|
|
ldx [%sp + PTREGS_OFF + PT_V9_I4], %i4
|
|
ldx [%sp + PTREGS_OFF + PT_V9_I5], %i5
|
|
ldx [%sp + PTREGS_OFF + PT_V9_I6], %i6
|
|
ldx [%sp + PTREGS_OFF + PT_V9_I7], %i7
|
|
ldx [%sp + PTREGS_OFF + PT_V9_TPC], %l2
|
|
ldx [%sp + PTREGS_OFF + PT_V9_TNPC], %o2
|
|
|
|
ld [%sp + PTREGS_OFF + PT_V9_Y], %o3
|
|
wr %o3, %g0, %y
|
|
wrpr %l4, 0x0, %pil
|
|
wrpr %g0, 0x1, %tl
|
|
andn %l1, TSTATE_SYSCALL, %l1
|
|
wrpr %l1, %g0, %tstate
|
|
wrpr %l2, %g0, %tpc
|
|
wrpr %o2, %g0, %tnpc
|
|
|
|
brnz,pn %l3, kern_rtt
|
|
mov PRIMARY_CONTEXT, %l7
|
|
|
|
661: ldxa [%l7 + %l7] ASI_DMMU, %l0
|
|
.section .sun4v_1insn_patch, "ax"
|
|
.word 661b
|
|
ldxa [%l7 + %l7] ASI_MMU, %l0
|
|
.previous
|
|
|
|
sethi %hi(sparc64_kern_pri_nuc_bits), %l1
|
|
ldx [%l1 + %lo(sparc64_kern_pri_nuc_bits)], %l1
|
|
or %l0, %l1, %l0
|
|
|
|
661: stxa %l0, [%l7] ASI_DMMU
|
|
.section .sun4v_1insn_patch, "ax"
|
|
.word 661b
|
|
stxa %l0, [%l7] ASI_MMU
|
|
.previous
|
|
|
|
sethi %hi(KERNBASE), %l7
|
|
flush %l7
|
|
rdpr %wstate, %l1
|
|
rdpr %otherwin, %l2
|
|
srl %l1, 3, %l1
|
|
|
|
661: wrpr %l2, %g0, %canrestore
|
|
.section .fast_win_ctrl_1insn_patch, "ax"
|
|
.word 661b
|
|
.word 0x89880000 ! normalw
|
|
.previous
|
|
|
|
wrpr %l1, %g0, %wstate
|
|
brnz,pt %l2, user_rtt_restore
|
|
661: wrpr %g0, %g0, %otherwin
|
|
.section .fast_win_ctrl_1insn_patch, "ax"
|
|
.word 661b
|
|
nop
|
|
.previous
|
|
|
|
ldx [%g6 + TI_FLAGS], %g3
|
|
wr %g0, ASI_AIUP, %asi
|
|
rdpr %cwp, %g1
|
|
andcc %g3, _TIF_32BIT, %g0
|
|
sub %g1, 1, %g1
|
|
bne,pt %xcc, user_rtt_fill_32bit
|
|
wrpr %g1, %cwp
|
|
ba,a,pt %xcc, user_rtt_fill_64bit
|
|
nop
|
|
|
|
user_rtt_fill_fixup_dax:
|
|
ba,pt %xcc, user_rtt_fill_fixup_common
|
|
mov 1, %g3
|
|
|
|
user_rtt_fill_fixup_mna:
|
|
ba,pt %xcc, user_rtt_fill_fixup_common
|
|
mov 2, %g3
|
|
|
|
user_rtt_fill_fixup:
|
|
ba,pt %xcc, user_rtt_fill_fixup_common
|
|
clr %g3
|
|
|
|
user_rtt_pre_restore:
|
|
add %g1, 1, %g1
|
|
wrpr %g1, 0x0, %cwp
|
|
|
|
user_rtt_restore:
|
|
restore
|
|
rdpr %canrestore, %g1
|
|
wrpr %g1, 0x0, %cleanwin
|
|
retry
|
|
nop
|
|
|
|
kern_rtt: rdpr %canrestore, %g1
|
|
brz,pn %g1, kern_rtt_fill
|
|
nop
|
|
kern_rtt_restore:
|
|
stw %g0, [%sp + PTREGS_OFF + PT_V9_MAGIC]
|
|
restore
|
|
retry
|
|
|
|
to_kernel:
|
|
#ifdef CONFIG_PREEMPT
|
|
ldsw [%g6 + TI_PRE_COUNT], %l5
|
|
brnz %l5, kern_fpucheck
|
|
ldx [%g6 + TI_FLAGS], %l5
|
|
andcc %l5, _TIF_NEED_RESCHED, %g0
|
|
be,pt %xcc, kern_fpucheck
|
|
nop
|
|
cmp %l4, 0
|
|
bne,pn %xcc, kern_fpucheck
|
|
nop
|
|
call preempt_schedule_irq
|
|
nop
|
|
ba,pt %xcc, rtrap
|
|
#endif
|
|
kern_fpucheck: ldub [%g6 + TI_FPDEPTH], %l5
|
|
brz,pt %l5, rt_continue
|
|
srl %l5, 1, %o0
|
|
add %g6, TI_FPSAVED, %l6
|
|
ldub [%l6 + %o0], %l2
|
|
sub %l5, 2, %l5
|
|
|
|
add %g6, TI_GSR, %o1
|
|
andcc %l2, (FPRS_FEF|FPRS_DU), %g0
|
|
be,pt %icc, 2f
|
|
and %l2, FPRS_DL, %l6
|
|
andcc %l2, FPRS_FEF, %g0
|
|
be,pn %icc, 5f
|
|
sll %o0, 3, %o5
|
|
rd %fprs, %g1
|
|
|
|
wr %g1, FPRS_FEF, %fprs
|
|
ldx [%o1 + %o5], %g1
|
|
add %g6, TI_XFSR, %o1
|
|
sll %o0, 8, %o2
|
|
add %g6, TI_FPREGS, %o3
|
|
brz,pn %l6, 1f
|
|
add %g6, TI_FPREGS+0x40, %o4
|
|
|
|
membar #Sync
|
|
ldda [%o3 + %o2] ASI_BLK_P, %f0
|
|
ldda [%o4 + %o2] ASI_BLK_P, %f16
|
|
membar #Sync
|
|
1: andcc %l2, FPRS_DU, %g0
|
|
be,pn %icc, 1f
|
|
wr %g1, 0, %gsr
|
|
add %o2, 0x80, %o2
|
|
membar #Sync
|
|
ldda [%o3 + %o2] ASI_BLK_P, %f32
|
|
ldda [%o4 + %o2] ASI_BLK_P, %f48
|
|
1: membar #Sync
|
|
ldx [%o1 + %o5], %fsr
|
|
2: stb %l5, [%g6 + TI_FPDEPTH]
|
|
ba,pt %xcc, rt_continue
|
|
nop
|
|
5: wr %g0, FPRS_FEF, %fprs
|
|
sll %o0, 8, %o2
|
|
|
|
add %g6, TI_FPREGS+0x80, %o3
|
|
add %g6, TI_FPREGS+0xc0, %o4
|
|
membar #Sync
|
|
ldda [%o3 + %o2] ASI_BLK_P, %f32
|
|
ldda [%o4 + %o2] ASI_BLK_P, %f48
|
|
membar #Sync
|
|
wr %g0, FPRS_DU, %fprs
|
|
ba,pt %xcc, rt_continue
|
|
stb %l5, [%g6 + TI_FPDEPTH]
|