ENTRY(entry_SYSCALL_64)
	/*
	 * Interrupts are off on entry.
	 * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
	 * it is too small to ever cause noticeable irq latency.
	 */
	SWAPGS_UNSAFE_STACK

	/* KPTI: entering kernel mode requires switching to the kernel page tables. */
	SWITCH_KERNEL_CR3_NO_STACK
/*
 * A hypervisor implementation might want to use a label
 * after the swapgs, so that it can do the swapgs
 * for the guest and jump here on syscall.
 */
GLOBAL(entry_SYSCALL_64_after_swapgs)
	/* Stash the user stack pointer in the per-cpu variable rsp_scratch. */
	movq	%rsp, PER_CPU_VAR(rsp_scratch)

	/* Load the kernel stack pointer for this CPU. */
	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp

	TRACE_IRQS_OFF

	/* Construct struct pt_regs on stack */
	pushq	$__USER_DS			/* pt_regs->ss */
	pushq	PER_CPU_VAR(rsp_scratch)	/* pt_regs->sp */
	pushq	%r11				/* pt_regs->flags */
	pushq	$__USER_CS			/* pt_regs->cs */
	pushq	%rcx				/* pt_regs->ip */
	pushq	%rax				/* pt_regs->orig_ax */
	pushq	%rdi				/* pt_regs->di */
	pushq	%rsi				/* pt_regs->si */
	pushq	%rdx				/* pt_regs->dx */
	pushq	%rcx				/* pt_regs->cx */
	pushq	$-ENOSYS			/* pt_regs->ax */
	pushq	%r8				/* pt_regs->r8 */
	pushq	%r9				/* pt_regs->r9 */
	pushq	%r10				/* pt_regs->r10 */
	pushq	%r11				/* pt_regs->r11 */

	/* Reserve (but do not fill) the six slots for r12-r15, rbp and rbx. */
	subq	$(6*8), %rsp			/* pt_regs->bp, bx, r12-15 not saved */

	/*
	 * If we need to do entry work or if we guess we'll need to do
	 * exit work, go straight to the slow path.
	 */
	movq	PER_CPU_VAR(current_task), %r11
	testl	$_TIF_WORK_SYSCALL_ENTRY|_TIF_ALLWORK_MASK, TASK_TI_flags(%r11)
	jnz	entry_SYSCALL64_slow_path
entry_SYSCALL_64_fastpath:
	/*
	 * Easy case: enable interrupts and issue the syscall.  If the syscall
	 * needs pt_regs, we'll call a stub that disables interrupts again
	 * and jumps to the slow path.
	 */
	TRACE_IRQS_ON
	ENABLE_INTERRUPTS(CLBR_NONE)

	/*
	 * Make sure the syscall number does not exceed the maximum; if it
	 * does, jump forward to local label 1 and return.
	 *
	 * Exactly one of the two range checks is assembled.  When
	 * __SYSCALL_MASK is ~0 there are no bits to strip, so the full 64-bit
	 * number is compared.  Otherwise the masked-off bits are cleared first
	 * and the 32-bit result is compared.  The comparison is unsigned (ja).
	 */
#if __SYSCALL_MASK == ~0
	cmpq	$__NR_syscall_max, %rax
#else
	andl	$__SYSCALL_MASK, %eax
	cmpl	$__NR_syscall_max, %eax
#endif
	ja	1f				/* return -ENOSYS (already in pt_regs->ax) */

	/*
	 * Calls other than system calls pass their fourth argument in rcx,
	 * so copy r10 into rcx before calling the handler.
	 */
	movq	%r10, %rcx

	/*
	 * This call instruction is handled specially in stub_ptregs_64.
	 * It might end up jumping to the slow path.  If it jumps, RAX
	 * and all argument registers are clobbered.
	 */
	/* Call the handler found at index rax of the system call table. */
	call	*sys_call_table(, %rax, 8)
.Lentry_SYSCALL_64_after_fastpath_call:
	/*
	 * Store the handler's return value in the saved-RAX slot of the
	 * on-stack frame so that it is restored on the way out.
	 */
	movq	%rax, RAX(%rsp)
1:
	/*
	 * If we get here, then we know that pt_regs is clean for SYSRET64.
	 * If we see that no exit work is required (which we are required
	 * to check with IRQs off), then we can go straight to SYSRET64.
	 */
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	movq	PER_CPU_VAR(current_task), %r11
	testl	$_TIF_ALLWORK_MASK, TASK_TI_flags(%r11)
	jnz	1f				/* NOTE(review): target label lies beyond this excerpt */

	/*
	 * The implementation of this macro depends on the
	 * CONFIG_DEBUG_LOCK_ALLOC kernel option, which allows locks to be
	 * debugged when leaving a system call.
	 */
	LOCKDEP_SYS_EXIT
	TRACE_IRQS_ON				/* user mode is traced as IRQs on */
	movq	RIP(%rsp), %rcx
	movq	EFLAGS(%rsp), %r11

	/*
	 * Restore every general-purpose register except rcx and r11: rcx
	 * holds the return address of the application that issued the system
	 * call, and r11 holds the old flags register.
	 */
	RESTORE_C_REGS_EXCEPT_RCX_R11

	/*
	 * This opens a window where we have a user CR3, but are
	 * running in the kernel.  This makes using the CS
	 * register useless for telling whether or not we need to
	 * switch CR3 in NMIs.  Normal interrupts are OK because
	 * they are off here.
	 */
	/* KPTI: returning to user mode requires switching back to the user page tables. */
	SWITCH_USER_CR3

	/* Restore rsp to the user-mode stack pointer saved in the frame. */
	movq	RSP(%rsp), %rsp

	/*
	 * USERGS_SYSRET64 expands to a swapgs, which exchanges the user and
	 * kernel GS, followed by sysret, which leaves the system call handler.
	 */
	USERGS_SYSRET64
........
........