-
-
[旧帖] [原创]init进程创建过程——基于0.12内核 0.00雪花
-
2011-7-23 03:54 2667
-
求职没收到一个电话,真悲剧。算了还是先写点东西吧,本来赵炯博士的书已经非常perfect了,我算是从中剽窃吧,写到晚上四点,没绘图,没条理,高手就点吐掉口水吧。上传文本出现笑脸,格式也变了,最后有doc可下载。内容是关于任务0、1的执行流程,时钟中断,信号处理,任务调度等。
0. 说明
本文C代码和AT&T格式汇编均摘自linux-0.12内核, Intel格式汇编摘自bochs。
1. 从开机到任务0执行
bootsect.S--->setup.S--->head.s--->main.c
1)bootsect.S: 引导代码,将setup代码和system模块加载到内存指定位置处。
2)setup.S : 读取时钟、硬盘参数表等信息,main函数会用到。
3)head.s : 内核模块(system)最先执行的代码,主要设置page directory table 和四个page tables,映射整个物理内存(high effective physical memory = 16M),供内核代码和任务0使用;并设置main函数。
00005400: (   push 0x00000000 ; 6a00 :main函数三个参数
00005402: (   push 0x00000000 ; 6a00
00005404: (   push 0x00000000 ; 6a00
00005406: (   push 0x00005412 ; 6812540000
0000540b: (   push 0x000066b0 ; 68b0660000 :main地址
00005410: (   jmp .+62 ; eb3e :页目录、页表设置
00005412: (   jmp .-2 ; ebfe
4)main
sched_init(); :任务0设置,并clear NT标志,move_to_user_mode有用
buffer_init(buffer_memory_end);
hd_init();
floppy_init();
sti();
move_to_user_mode(); :内核初始化完毕后移动到任务0执行
if (!fork()) { :任务1创建
init(); : 任务1执行文件系统加载等操作
}
for(;;) :任务0执行任务切换操作
__asm__("int $0x80"::"a" (__NR_pause):"ax");
Tips:
○1fork对子进程返回0,因为copy_process时设置新任务的eax=0,见下文。
○2kernel/sched.c中定义:
long user_stack [ PAGE_SIZE>>2 ];
struct {
long * a;
short b;
} stack_start = { & user_stack [PAGE_SIZE>>2] , 0x10 };
_stack_start: 0x222bc (见System.map)
head.s开始执行时,将user_stack用做内核初始化堆栈,并在切换到任务0时,用作任务0的user stack:
00000000: (   mov eax, 0x00000010 ; b810000000
00000005: (   mov ds, ax ; 8ed8
00000007: (   mov es, ax ; 8ec0
00000009: (   mov fs, ax ; 8ee0
0000000b: (   mov gs, ax ; 8ee8
0000000d: (   lss esp, ds:0x222bc ; 0fb225bc220200 :esp-_stack_start
00000014: (   call .+86 ; e856000000 :设置idt
00000019: (   call .+129 ; e881000000 :设置gdt
0000001e: (   mov eax, 0x00000010 ; b810000000
00000023: (   mov ds, ax ; 8ed8
00000025: (   mov es, ax ; 8ec0
00000027: (   mov fs, ax ; 8ee0
00000029: (   mov gs, ax ; 8ee8
0000002b: (   lss esp, ds:0x222bc ; 0fb225bc220200
move_to_user_mode:
00006825: (   mov eax, esp ; 89e0 :内核堆栈将用作任务0 user stack
00006827: (   push 0x00000017 ; 6a17 :任务0的ss:esp
00006829: (   push eax ; 50
0000682a: (   pushfd ; 9c
0000682b: (   push 0x0000000f ; 6a0f :任务0的cs:eip
0000682d: (   push 0x00006833 ; 6833680000
00006832: (   iretd ; cf :模拟中断返回,切换到任务0
00006833: (   mov eax, 0x00000017 ; b817000000
00006838: (   mov ds, ax ; 8ed8
0000683a: (   mov es, ax ; 8ec0
0000683c: (   mov fs, ax ; 8ee0
0000683e: (   mov gs, ax ; 8ee8
00006840: (   add esp, 0x0000000c ; 83c40c
Tips:
○1iretd执行时,eflags= 0x00000206: id vip vif ac vm rf nt IOPL=0 of df IF tf sf zf af PF cf,nt标志已复位;
2. 任务0到任务1
fork:
(1) main.c中定义:
static inline _syscall0(int,fork)
Tips:
○1main.c将fork定义为inline(避免函数调用时使用堆栈),因为创建进程时,任务0、1共用page directory和物理页面(任务1有自己的page table,但其page entry指向与任务0相同的物理页),任务1的page entry的r/w位被清零(只读,写时复制),任务0对应的page entry无变化。所以在对堆栈进行写操作时,任务1必须先运行,以免弄乱堆栈。见上面的main中任务0执行pause系统调用,详见后文分析。
(2) include/unistd.h中定义:
#define _syscall0(type,name) \
type name(void) \
{ \
long __res; \
__asm__ volatile ("int $0x80" \
: "=a" (__res) \
: "0" (__NR_##name)); \
if (__res >= 0) \
return (type) __res; \
errno = -__res; \
return -1; \
}
对应汇编为:
00006843: (   mov eax, 0x00000002 ; b802000000
00006848: (   int 0x80 ; cd80 :fork系统调用(eax=0x2)
0000684a: (   mov edx, eax ; 89c2 :eax返回PID,对子进程为0
0000684c: (   test edx, edx ; 85d2
0000684e: (   jnl .+13 ; 7d0d
00006850: (   neg edx ; f7da
00006852: (   mov dword ptr ds:0x38ea8, edx ; 8915a88e0300
00006858: (   mov edx, 0xffffffff ; baffffffff
0000685d: (   test edx, edx ; 85d2
0000685f: (   jnz .+5 ; 7505
执行fork系统调用前堆栈:
| STACK 0x00038604 [0x00000000]
| STACK 0x00038608 [0x00000050]
| STACK 0x0003860c [0x000001cb]
| STACK 0x00038610 [0x00005412]
| STACK 0x00038614 [0x00000000]
| STACK 0x00038618 [0x00000000]
| STACK 0x0003861c [0x00000000]
fork进入后堆栈:
| STACK 0x00022194 [0x0000684a] : eip (user space)
| STACK 0x00022198 [0x0000000f] : cs (user space)
| STACK 0x0002219c [0x00000212] : eflags
| STACK 0x000221a0 [0x00038604] : esp (user space)
| STACK 0x000221a4 [0x00000017] : ss (user space)
Tips:
○1系统调用接口(_system_call)
.align 2
_system_call:
push %ds :保存段寄存器
push %es
push %fs
pushl %eax # save the orig_eax :系统调用号,此处为0x2
pushl %edx : edx、ecx、ebx用做系统调用执行函数的参数,参考上面的_syscall0(无参数)
pushl %ecx # push %ebx,%ecx,%edx as parameters
pushl %ebx # to the system call
movl $0x10,%edx # set up ds,es to kernel space :linus将gdt表1-内核代码段,2-内核数据段,0-不用
mov %dx,%ds
mov %dx,%es
movl $0x17,%edx # fs points to local data space :fs用于user space/kernel space交换数据;ldt表2-用户空间数据段,1-用户空间代码段,0-不用
mov %dx,%fs
cmpl _NR_syscalls,%eax :linux/sys.h 中定义 int NR_syscalls = sizeof(sys_call_table)/sizeof(fn_ptr);
jae bad_sys_call
call _sys_call_table(,%eax,4) :调用system routine,此处为_sys_fork
pushl %eax :系统调用返回值,此处为PID,对任务0是1,对任务1是0,因为_sys_fork调用copy_process时设置任务1的eax=0,见下文
2: :重新调度任务,因为内核在执行move_to_user_mode前,已开启中断允许标志(见sti()); 时钟中断处理程序会递减当前任务(current)的运行时间片,并根据current的运行级别(cpl)决定是否抢占任务,见后文
movl _current,%eax
cmpl $0,state(%eax) # state :state表示任务运行状态(RUNNING/INTERRUPTIBLE/UNINTERRUPTIBLE/ZOMBIE/STOPPED, 见linux/sched.h)
jne reschedule
cmpl $0,counter(%eax) # counter :时间片,初始时为任务的priority
je reschedule
ret_from_sys_call: :下面代码到标号3之间,用于处理current的signals;进程在内核(cpl=0,ss=0x10)发生中断时不捕获信号;do_signal用于信号前预处理,把对应信号处理函数和恢复函数保存在user space stack中(所以用户可定义自己的信号处理函数),_system_call执行后,先执行信号处理,再执行发生系统调用后的一条指令,详见后文
movl _current,%eax
cmpl _task,%eax # task[0] cannot have signals :任务0是不捕获任何信号的,参考后文schedule说明
je 3f
cmpw $0x0f,CS(%esp) # was old code segment supervisor ? :内核中断不捕获信号
jne 3f
cmpw $0x17,OLDSS(%esp) # was stack segment = 0x17 ?
jne 3f
movl signal(%eax),%ebx :信号标识码(信号signal对应的位为 (1<<(signal-1)),参考后文schedule中的SIGALRM)
movl blocked(%eax),%ecx :信号阻塞标识码
notl %ecx
andl %ebx,%ecx
bsfl %ecx,%ecx
je 3f
btrl %ecx,%ebx
movl %ebx,signal(%eax)
incl %ecx
pushl %ecx
call _do_signal :信号预处理,见后文
popl %ecx
testl %eax, %eax
jne 2b # see if we need to switch tasks, or do more signals :是否还有可捕获信号
3: popl %eax
popl %ebx
popl %ecx
popl %edx
addl $4, %esp # skip orig_eax
pop %fs
pop %es
pop %ds
iret
_system_call fork (1):
00007984: (   push ds ; 1e
00007985: (   push es ; 06
00007986: (   push fs ; 0fa0
00007988: (   push eax ; 50 :_NR_FORK(0x2)
00007989: (   push edx ; 52
0000798a: (   push ecx ; 51
0000798b: (   push ebx ; 53
0000798c: (   mov edx, 0x00000010 ; ba10000000
00007991: (   mov ds, dx ; 8eda
00007993: (   mov es, dx ; 8ec2
00007995: (   mov edx, 0x00000017 ; ba17000000
0000799a: (   mov fs, dx ; 8ee2
0000799c: (   cmp eax, dword ptr ds:0x211a4 ; 3b05a4110200 :_NR_syscalls: 0x211a4(System.map)
000079a2: (   jnb .-48 ; 73d0
000079a4: (   call dword ptr ds:[eax*4+135240] ; ff148548100200 :call sys_call_table[NR_FORK],即_sys_fork;135240 == 0x21048; _sys_call_table: 0x21048(System.map)
000079ab: (   push eax ; 50 :见下文copy_process
_sys_fork:
00007abc: (   call .+8375 ; e8b7200000 :find_empty_process
00007ac1: (   test eax, eax ; 85c0
00007ac3: (   js .+14 ; 780e
00007ac5: (   push gs ; 0fa8
00007ac7: (   push esi ; 56
00007ac8: (   push edi ; 57
00007ac9: (   push ebp ; 55
00007aca: (   push eax ; 50
00007acb: (   call .+7284 ; e8741c0000 :copy_process,创建子进程主要代码
00007ad0: (   add esp, 0x00000014 ; 83c414
00007ad3: (   ret ; c3
find_empty_process:
主要确定全局唯一进程号,并判断系统是否还可以创建进程。
int find_empty_process(void)
{
int i;
repeat:
if ((++last_pid)<0) last_pid=1; :PID (1 for task 1)
for(i=0 ; i<NR_TASKS ; i++)
if (task[i] && ((task[i]->pid == last_pid) ||
(task[i]->pgrp == last_pid)))
goto repeat;
for(i=1 ; i<NR_TASKS ; i++) :进程队列是否已满
if (!task[i])
return i; :任务队列索引(eax)
return -EAGAIN;
}
copy_process:
声明:
int copy_process(int nr,long ebp,long edi,long esi,long gs,long none,
long ebx,long ecx,long edx, long orig_eax,
long fs,long es,long ds,
long eip,long cs,long eflags,long esp,long ss)
进入前的堆栈:
| STACK 0x00022160 [0x00000001] :index of tasks table (task 1)
| STACK 0x00022164 [0x0003860c]
| STACK 0x00022168 [0x00000ffc]
| STACK 0x0002216c [0x000e0000]
| STACK 0x00022170 [0x00000017]
| STACK 0x00022174 [0x000079ab] : eip(instruction after sys_fork)
| STACK 0x00022178 [0x00000000]
| STACK 0x0002217c [0x00056800]
| STACK 0x00022180 [0x00000021]
| STACK 0x00022184 [0x00000002] : NR_FORK
| STACK 0x00022188 [0x00000017] :_system_call put
| STACK 0x0002218c [0x00000017]
| STACK 0x00022190 [0x00000017]
| STACK 0x00022194 [0x0000684a] :eip (instruction after fork _system_ call)
| STACK 0x00022198 [0x0000000f] :cs (user cs of task 0)
| STACK 0x0002219c [0x00000212] :eflags
| STACK 0x000221a0 [0x00038604] :esp (user stack of task 0)
| STACK 0x000221a4 [0x00000017] :ss (user stack of task 0)
copy_process函数体
{
struct task_struct *p;
int i;
struct file *f;
p = (struct task_struct *) get_free_page(); :在main memory中申请一页存放process descriptor,该任务的内核堆栈栈顶位于此页末尾(见下文tss.esp0 = PAGE_SIZE + (long) p)
if (!p)
return -EAGAIN;
task[nr] = p;
*p = *current; /* NOTE! this doesn't copy the supervisor stack */
:以下重新设置子进程的相关值
p->state = TASK_UNINTERRUPTIBLE; :参考下文进程创建成功后设置其为已就绪状态
p->pid = last_pid; :见上文find_empty_process
p->counter = p->priority; :初始化时间片,覆盖parent的counter
p->signal = 0; :no signal
p->alarm = 0; : no alarm timer(不继承parent的报警定时)
p->leader = 0; /* process leadership doesn't inherit */
p->utime = p->stime = 0;
p->cutime = p->cstime = 0;
p->start_time = jiffies; :进程创建时间,当前系统滴答值
--------------------------------------------------任务状态段---------------------------------------------------------------
p->tss.back_link = 0; :back link field字段,用于任务门切换后根据NT标志返回前一任务
p->tss.esp0 = PAGE_SIZE + (long) p; :进程创建后,内核堆栈是不变的,即任务切换时不被cpu保存
p->tss.ss0 = 0x10;
p->tss.eip = eip; :子进程eip,此处为0x0000684a
p->tss.eflags = eflags;
p->tss.eax = 0; :子进程返回的PID,上文中子进程fork返回0的原因
p->tss.ecx = ecx;
p->tss.edx = edx;
p->tss.ebx = ebx;
p->tss.esp = esp;
p->tss.ebp = ebp;
p->tss.esi = esi;
p->tss.edi = edi;
p->tss.es = es & 0xffff;
p->tss.cs = cs & 0xffff;
p->tss.ss = ss & 0xffff;
p->tss.ds = ds & 0xffff;
p->tss.fs = fs & 0xffff;
p->tss.gs = gs & 0xffff;
p->tss.ldt = _LDT(nr); :gdt中ldt索引,每个任务在gdt中占两项(tss,ldt),任务0(tss=4,ldt=5),任务1(tss=6,ldt=7)
p->tss.trace_bitmap = 0x80000000; :i/o位图
--------------------------------------------------任务状态段---------------------------------------------------------------
if (last_task_used_math == current):保存数学协处理器状态,见下文
__asm__("clts ; fnsave %0 ; frstor %0"::"m" (p->tss.i387));
if (copy_mem(nr,p)) { :复制parent的page directory, page tables,见下文
task[nr] = NULL;
free_page((long) p);
return -EAGAIN;
}
for (i=0; i<NR_OPEN;i++) :文件句柄,因为继承了parent
if (f=p->filp[i])
f->f_count++;
if (current->pwd) :当前目录i节点(i_node 用于描述目录与文件,见文件系统代码)
current->pwd->i_count++;
if (current->root) :当前根目录i_node
current->root->i_count++;
if (current->executable) :当前执行文件i_node
current->executable->i_count++;
if (current->library) :当前库文件i_node, 库文件加载在逻辑地址开始的60M处
current->library->i_count++;
set_tss_desc(gdt+(nr<<1)+FIRST_TSS_ENTRY,&(p->tss)); :设置gdt,见上文p->tss.ldt = _LDT(nr);
set_ldt_desc(gdt+(nr<<1)+FIRST_LDT_ENTRY,&(p->ldt));
p->p_pptr = current; :以下指针用于task tree
p->p_cptr = 0;
p->p_ysptr = 0;
p->p_osptr = current->p_cptr;
if (p->p_osptr)
p->p_osptr->p_ysptr = p;
current->p_cptr = p;
p->state = TASK_RUNNING; /* do this last, just in case */ :置为就绪状态
return last_pid;
}
copy_process 中数学协处理器状态保存(task 0--->task 1):
_current: 0x221b4 (System.map)
_last_task_used_math: 0x221b8
<bochs:82> x /xw ds:0x221b4
[bochs]:
0x000221b4 <bogus+ 0>: 0x000211a8
<bochs:83> x /xw ds:0x221b8
[bochs]:
0x000221b8 <bogus+ 0>: 0x00000000
000098fb: (   mov eax, dword ptr ds:0x221b4 ; a1b4210200
00009900: (   cmp eax, dword ptr ds:0x221b8 ; 3b05b8210200
00009906: (   jnz .+14 ; 750e :可知current != last_task_used_math (current=task 0)
00009908: (   clts ; 0f06
0000990a: (   fnsave ds:[ebx+1072] ; ddb330040000
00009910: (   frstor ds:[ebx+1072] ; dda330040000
copy_memory:
设置 ldt of child task, parent/ child共用page directory, page tables,参考copy_page_tables。
copy_mem: int copy_mem(int nr,struct task_struct * p)
{
unsigned long old_data_base,new_data_base,data_limit;
unsigned long old_code_base,new_code_base,code_limit;
code_limit=get_limit(0x0f);
data_limit=get_limit(0x17);
old_code_base = get_base(current->ldt[1]);
old_data_base = get_base(current->ldt[2]);
if (old_data_base != old_code_base)
panic("We don't support separate I&D");
if (data_limit < code_limit)
panic("Bad data_limit");
new_data_base = new_code_base = nr * TASK_SIZE;
p->start_code = new_code_base;
set_base(p->ldt[1],new_code_base);
set_base(p->ldt[2],new_data_base);
if (copy_page_tables(old_data_base,new_data_base,data_limit)) { :linus said this is most complicated code in memory management
free_page_tables(new_data_base,data_limit);
return -ENOMEM;
}
return 0;
}
copy_page_tables:
此处我们主要关注task 0--->task 1
int copy_page_tables(unsigned long from,unsigned long to,long size)
{
unsigned long * from_page_table;
unsigned long * to_page_table;
unsigned long this_page;
unsigned long * from_dir, * to_dir;
unsigned long new_page;
unsigned long nr;
if ((from&0x3fffff) || (to&0x3fffff))
panic("copy_page_tables called with wrong alignment");
from_dir = (unsigned long *) ((from>>20) & 0xffc); /* _pg_dir = 0 */ :base address of page directory,见下文
to_dir = (unsigned long *) ((to>>20) & 0xffc); :from(to)都是线性地址(段基址),22-31(index of page directory),12-21(index of page table), 0-11(offset address to physical page)
size = ((unsigned) (size+0x3fffff)) >> 22;
for( ; size-->0 ; from_dir++,to_dir++) {
if (1 & *to_dir) :page entry低12位(bit 11..0)含义如下
| bit 11-9: AVL | 8: - | 7: - | 6: D | 5: A | 4: - | 3: - | 2: U/S | 1: R/W | 0: P |
panic("copy_page_tables: already exist");
if (!(1 & *from_dir))
continue;
from_page_table = (unsigned long *) (0xfffff000 & *from_dir);
if (!(to_page_table = (unsigned long *) get_free_page())) :存放page table entries
return -1; /* Out of memory, see freeing */
*to_dir = ((unsigned long) to_page_table) | 7; :set page table = to_page_table | 111(2), present、user task can write
nr = (from==0)?0xA0:1024; :Note! 此时from=0,所以task 1只复制0xA0个pages
for ( ; nr-- > 0 ; from_page_table++,to_page_table++) {
this_page = *from_page_table;
if (!this_page)
continue;
if (!(1 & this_page)) { :说明被复制页面在交换设备中,(task0不会出现,task 0在内核代码中)
if (!(new_page = get_free_page())) :申请页面,下文读出交换设备
return -1;
read_swap_page(this_page>>1, (char *) new_page);
*to_page_table = this_page; :目标页面依然存于设备文件
*from_page_table = new_page | (PAGE_DIRTY | 7); :被复制页面已交换进main ram
continue;
}
this_page &= ~2;
*to_page_table = this_page; :否则目标页面写保护,(task 1 执行写时复制机制, task 0 页面属性无变化, 所以task 1 创建后要在task 0 执行写操作前执行,以免弄乱堆栈)
if (this_page > LOW_MEM) { :用于用户进程(task 0 位于 0—640kb)
*from_page_table = this_page;
this_page -= LOW_MEM;
this_page >>= 12;
mem_map[this_page]++;
}
}
}
invalidate(); :刷新页高速缓冲
return 0;
}
_page_dir:
参考前面head.s 将main入栈后,执行jmp +62,在物理地址0X00000000开始设置page directory, page tables.
.text
.globl _idt,_gdt,_pg_dir,_tmp_floppy_area
_pg_dir: :_pg_dir起始地址,位于0x0000, size of every page is 4k
startup_32:
.org 0x1000 :address of first page table
pg0:
.org 0x2000 :second
pg1:
.org 0x3000 :third
pg2:
.org 0x4000 :fourth
pg3:
.org 0x5000 :fifth
.align 2
setup_paging:
movl $1024*5,%ecx /* 5 pages - pg_dir+4 page tables */
xorl %eax,%eax
xorl %edi,%edi /* pg_dir is at 0x000 */
cld;rep;stosl
movl $pg0+7,_pg_dir /* set present bit/user r/w */
movl $pg1+7,_pg_dir+4 /* --------- " " --------- */
movl $pg2+7,_pg_dir+8 /* --------- " " --------- */
movl $pg3+7,_pg_dir+12 /* --------- " " --------- */
movl $pg3+4092,%edi
movl $0xfff007,%eax /* 16Mb - 4096 + 7 (r/w user,p) */
std
1: stosl /* fill pages backwards - more efficient :-) */
subl $0x1000,%eax
jge 1b
xorl %eax,%eax /* pg_dir is at 0x0000 */
movl %eax,%cr3 /* cr3 - page directory start */
movl %cr0,%eax
orl $0x80000000,%eax
movl %eax,%cr0 /* set paging (PG) bit */
ret /* this also flushes prefetch-queue */
_system_call fork (2):
000079ab: (   push eax ; 50 :PID of task 1
000079ac: (   mov eax, dword ptr ds:0x221b4 ; a1b4210200 : task 0 Is RUNNING?
000079b1: (   cmp dword ptr ds:[eax], 0x00000000 ; 83780000
000079b5: (   jnz .-63 ; 75c1
000079b7: (   cmp dword ptr ds:[eax+4], 0x00000000 ; 83780400 : counter out?
000079bb: (   jz .-69 ; 74bb
此后task 1创建成功,task 0 返回PID of task 1:
<bochs:158> ? eax
0x1 1
检测task 0的状态与时间片,判断是否需要切换任务(因为main调用sched_init中开启了时钟中断)。
sched_init:
outb_p(0x36,0x43); /* binary, mode 3, LSB/MSB, ch 0 */
outb_p(LATCH & 0xff , 0x40); /* LSB */
outb(LATCH >> 8 , 0x40); /* MSB */
set_intr_gate(0x20,&timer_interrupt); :idt中时钟中断
outb(inb_p(0x21)&~0x01,0x21); :设置时钟中断允许标志
_timer_interrupt:
.align 2
_timer_interrupt:
push %ds # save ds,es and put kernel data space
push %es # into them. %fs is used by _system_call
push %fs
pushl $-1 # fill in -1 for orig_eax :对应_system_call为系统调用号
pushl %edx # we save %eax,%ecx,%edx as gcc doesn't
pushl %ecx # save those across function calls. %ebx
pushl %ebx # is saved as we use that in ret_sys_call
pushl %eax
movl $0x10,%eax
mov %ax,%ds
mov %ax,%es
movl $0x17,%eax
mov %ax,%fs
incl _jiffies
movb $0x20,%al # EOI to interrupt controller #1
outb %al,$0x20
movl CS(%esp),%eax
andl $3,%eax # %eax is CPL (0 or 3, 0=supervisor)
pushl %eax
call _do_timer # 'do_timer(long CPL)' does everything from
addl $4,%esp # task switching to accounting ...
jmp ret_from_sys_call
do_timer:
if (cpl)
current->utime++; :user space time
else
current->stime++; :kernel space time
if ((--current->counter)>0) return; :时间片递减,若发生中断时,当前进程counter>=2,不切换任务
current->counter=0;
if (!cpl) return; :进程处于内核时不被抢占
schedule();
_system_call fork (3):
进程唯一捕获信号的地方,task[0]忽略信号的处理。参见下文do_signal
<bochs:163> x /xw ds:0x221b4
[bochs]:
0x000221b4 <bogus+ 0>: 0x000211a8
<bochs:164> x /xw ds:0x221bc
[bochs]:
0x000221bc <bogus+ 0>: 0x000211a8
000079bd: (   mov eax, dword ptr ds:0x221b4 ; a1b4210200 :_current: 0x221b4
000079c2: (   cmp eax, dword ptr ds:0x221bc ; 3b05bc210200 :_task: 0x221bc
000079c8: (   jz .+52 ; 7434 :由上可知,将执行0x000079fe处指令
000079ca: (   cmp word ptr ss:[esp+36], 0x000f ; 66837c24240f
000079d0: (   jnz .+44 ; 752c
000079d2: (   cmp word ptr ss:[esp+48], 0x0017 ; 66837c243017
000079d8: (   jnz .+36 ; 7524
000079da: (   mov ebx, dword ptr ds:[eax+12] ; 8b580c
000079dd: (   mov ecx, dword ptr ds:[eax+528] ; 8b8810020000
000079e3: (   not ecx ; f7d1
000079e5: (   and ecx, ebx ; 21d9
000079e7: (   bsf ecx, ecx ; 0fbcc9
000079ea: (   jz .+18 ; 7412
000079ec: (   btr ebx, ecx ; 0fb3cb
000079ef: (   mov dword ptr ds:[eax+12], ebx ; 89580c
000079f2: (   inc ecx ; 41
000079f3: (   push ecx ; 51
000079f4: (   call .+18019 ; e863460000
000079f9: (   pop ecx ; 59
000079fa: (   test eax, eax ; 85c0
000079fc: (   jnz .-82 ; 75ae
Tips:
○1 do_signal:
do_signal将信号处理函数置于user stack, 此处task 0不捕获信号
int do_signal(long signr,long eax,long ebx, long ecx, long edx, long orig_eax,
long fs, long es, long ds,
long eip, long cs, long eflags,
unsigned long * esp, long ss)
_system_call fork (4):
000079fe: (   pop eax ; 58
000079ff: (   pop ebx ; 5b
00007a00: (   pop ecx ; 59
00007a01: (   pop edx ; 5a
00007a02: (   add esp, 0x00000004 ; 83c404
00007a05: (   pop fs ; 0fa1
00007a07: (   pop es ; 07
00007a08: (   pop ds ; 1f
00007a09: (   iretd ; cf
系统调用中断前后段寄存器变化为:
es:0x0010, dh=0x00c09300, dl=0x00000fff, valid=1
Data segment, base=0x00000000, limit=0x00ffffff, Read/Write, Accessed
cs:0x0008, dh=0x00c09b00, dl=0x00000fff, valid=1
Code segment, base=0x00000000, limit=0x00ffffff, Execute/Read, Accessed, 32-bit
ss:0x0010, dh=0x00c09300, dl=0x00000fff, valid=7
Data segment, base=0x00000000, limit=0x00ffffff, Read/Write, Accessed
ds:0x0010, dh=0x00c09300, dl=0x00000fff, valid=7
Data segment, base=0x00000000, limit=0x00ffffff, Read/Write, Accessed
fs:0x0017, dh=0x00c0f300, dl=0x0000009f, valid=1
Data segment, base=0x00000000, limit=0x0009ffff, Read/Write, Accessed
gs:0x0017, dh=0x00c0f300, dl=0x0000009f, valid=1
Data segment, base=0x00000000, limit=0x0009ffff, Read/Write, Accessed
es:0x0017, dh=0x00c0f300, dl=0x0000009f, valid=1
Data segment, base=0x00000000, limit=0x0009ffff, Read/Write, Accessed
cs:0x000f, dh=0x00c0fb00, dl=0x0000009f, valid=1
Code segment, base=0x00000000, limit=0x0009ffff, Execute/Read, Accessed, 32-bit
ss:0x0017, dh=0x00c0f300, dl=0x0000009f, valid=1
Data segment, base=0x00000000, limit=0x0009ffff, Read/Write, Accessed
ds:0x0017, dh=0x00c0f300, dl=0x0000009f, valid=1
Data segment, base=0x00000000, limit=0x0009ffff, Read/Write, Accessed
fs:0x0017, dh=0x00c0f300, dl=0x0000009f, valid=1
Data segment, base=0x00000000, limit=0x0009ffff, Read/Write, Accessed
gs:0x0017, dh=0x00c0f300, dl=0x0000009f, valid=1
Data segment, base=0x00000000, limit=0x0009ffff, Read/Write, Accessed
此后,task 0调用pause系统调用,切换到task 1上运行。(Note! task 0 与task 1 此时共用user stack, task 1运行前,task 0 不能对ss:esp执行写操作)
00006866: (   mov eax, 0x0000001d ; b81d000000
0000686b: (   int 0x80 ; cd80 :见下文sys_pause
0000686d: (   jmp .-9 ; ebf7
sys_pause:
int sys_pause(void)
{
current->state = TASK_INTERRUPTIBLE; :task 0不起作用
schedule();
return 0;
}
schedule:
void schedule(void)
{
int i,next,c;
struct task_struct ** p;
/* check alarm, wake up any interruptible tasks that have got a signal */
for(p = &LAST_TASK ; p > &FIRST_TASK ; --p) :从最后一个任务向前遍历到task 1(不含task 0)
if (*p) {
if ((*p)->timeout && (*p)->timeout < jiffies) { :超时且可中断则置为就绪态
(*p)->timeout = 0;
if ((*p)->state == TASK_INTERRUPTIBLE)
(*p)->state = TASK_RUNNING;
}
if ((*p)->alarm && (*p)->alarm < jiffies) { :系统向其发送ALARM signal
(*p)->signal |= (1<<(SIGALRM-1));
(*p)->alarm = 0;
}
if (((*p)->signal & ~(_BLOCKABLE & (*p)->blocked)) && :除屏蔽信号外还有信号,且处于可中断状态,则置为就绪态
(*p)->state==TASK_INTERRUPTIBLE)
(*p)->state=TASK_RUNNING;
}
/* this is the scheduler proper: */
while (1) {
c = -1;
next = 0;
i = NR_TASKS;
p = &task[NR_TASKS];
while (--i) { :到task 1,选取时间片最大者;若有task 0外其它任务,且都counter=0,则重设counter
if (!*--p)
continue;
if ((*p)->state == TASK_RUNNING && (*p)->counter > c)
c = (*p)->counter, next = i;
}
if (c) break;
for(p = &LAST_TASK ; p > &FIRST_TASK ; --p)
if (*p)
(*p)->counter = ((*p)->counter >> 1) +
(*p)->priority;
}
switch_to(next);
}
switch_to:
#define switch_to(n) {\
struct {long a,b;} __tmp; \
__asm__("cmpl %%ecx,_current\n\t" \
"je 1f\n\t" \
"movw %%dx,%1\n\t" \
"xchgl %%ecx,_current\n\t" \
"ljmp %0\n\t" \ : 至此,task 0切换到task 1执行,task 0被切换回后执行cmpl指令
"cmpl %%ecx,_last_task_used_math\n\t" \
"jne 1f\n\t" \
"clts\n" \
"1:" \
::"m" (*&__tmp.a),"m" (*&__tmp.b), \
"d" (_TSS(n)),"c" ((long) task[n])); \
}
00006f2b: (   mov word ptr ss:[ebp-4], dx ; 668955fc
00006f2f: (   xchg dword ptr ds:0x221b4, ecx ; 870db4210200
00006f35: (   jmp far ss:[ebp-8] ; ff6df8 :任务切换在此
00006f38: (   cmp dword ptr ds:0x221b8, ecx ; 390db8210200
00006f3e: (   jnz .+2 ; 7502
00006f40: (   clts ; 0f06
Task 0此时位于kernel space
<bochs:3> sreg
es:0x0010, dh=0x00c09300, dl=0x00000fff, valid=1
Data segment, base=0x00000000, limit=0x00ffffff, Read/Write, Accessed
cs:0x0008, dh=0x00c09b00, dl=0x00000fff, valid=1
Code segment, base=0x00000000, limit=0x00ffffff, Execute/Read, Accessed, 32-bit
ss:0x0010, dh=0x00c09300, dl=0x00000fff, valid=7
Data segment, base=0x00000000, limit=0x00ffffff, Read/Write, Accessed
ds:0x0010, dh=0x00c09300, dl=0x00000fff, valid=7
Data segment, base=0x00000000, limit=0x00ffffff, Read/Write, Accessed
fs:0x0017, dh=0x00c0f300, dl=0x0000009f, valid=1
Data segment, base=0x00000000, limit=0x0009ffff, Read/Write, Accessed
gs:0x0017, dh=0x00c0f300, dl=0x0000009f, valid=1
Data segment, base=0x00000000, limit=0x0009ffff, Read/Write, Accessed
ldtr:0x0028, dh=0x00008202, dl=0x15580068, valid=1
tr:0x0020, dh=0x00008b02, dl=0x15700068, valid=1
gdtr:base=0x00005cb8, limit=0x7ff
idtr:base=0x000054b8, limit=0x7ff
<bochs:4> u
00006f35: (   jmp far ss:[ebp-8] ; ff6df8
<bochs:5> s
Next at t=52090203
(0) [0x0000684a] 000f:0000684a (unk. ctxt): mov edx, eax ; 89c2
切换至task 1
<bochs:6> sreg
es:0x0017, dh=0x04c0f300, dl=0x0000009f, valid=1
Data segment, base=0x04000000, limit=0x0009ffff, Read/Write, Accessed
cs:0x000f, dh=0x04c0fb00, dl=0x0000009f, valid=1
Code segment, base=0x04000000, limit=0x0009ffff, Execute/Read, Accessed, 32-bit
ss:0x0017, dh=0x04c0f300, dl=0x0000009f, valid=1
Data segment, base=0x04000000, limit=0x0009ffff, Read/Write, Accessed
ds:0x0017, dh=0x04c0f300, dl=0x0000009f, valid=1
Data segment, base=0x04000000, limit=0x0009ffff, Read/Write, Accessed
fs:0x0017, dh=0x04c0f300, dl=0x0000009f, valid=1
Data segment, base=0x04000000, limit=0x0009ffff, Read/Write, Accessed
gs:0x0017, dh=0x04c0f300, dl=0x0000009f, valid=1
Data segment, base=0x04000000, limit=0x0009ffff, Read/Write, Accessed
ldtr:0x0038, dh=0x000082ff, dl=0xf3b00068, valid=1
tr:0x0030, dh=0x00008bff, dl=0xf3c80068, valid=1
task 1 开始执行。
0. 说明
本文C代码和AT&T格式汇编均摘自linux-0.12内核, Intel格式汇编摘自bochs。
1. 从开机到任务0执行
bootsect.S--->setup.S--->head.s--->main.c
1)bootsect.S: 引导代码,将setup代码和system模块加载到内存指定位置处。
2)setup.S : 读取时钟、硬盘参数表等信息,main函数会用到。
3)head.s : 内核模块(system)最先执行的代码,主要设置page directory table 和四个page tables,映射整个物理内存(high effective physical memory = 16M),供内核代码和任务0使用;并设置main函数。
00005400: (   push 0x00000000 ; 6a00 :main函数三个参数
00005402: (   push 0x00000000 ; 6a00
00005404: (   push 0x00000000 ; 6a00
00005406: (   push 0x00005412 ; 6812540000
0000540b: (   push 0x000066b0 ; 68b0660000 :main地址
00005410: (   jmp .+62 ; eb3e :页目录、页表设置
00005412: (   jmp .-2 ; ebfe
4)main
sched_init(); :任务0设置,并clear NT标志,move_to_user_mode有用
buffer_init(buffer_memory_end);
hd_init();
floppy_init();
sti();
move_to_user_mode(); :内核初始化完毕后移动到任务0执行
if (!fork()) { :任务1创建
init(); : 任务1执行文件系统加载等操作
}
for(;;) :任务0执行任务切换操作
__asm__("int $0x80"::"a" (__NR_pause):"ax");
Tips:
○1fork对子进程返回0,因为copy_process时设置新任务的eax=0,见下文。
○2kernel/sched.c中定义:
long user_stack [ PAGE_SIZE>>2 ];
struct {
long * a;
short b;
} stack_start = { & user_stack [PAGE_SIZE>>2] , 0x10 };
_stack_start: 0x222bc (见System.map)
head.s开始执行时,将user_stack用做内核初始化堆栈,并在切换到任务0时,用作任务0的user stack:
00000000: (   mov eax, 0x00000010 ; b810000000
00000005: (   mov ds, ax ; 8ed8
00000007: (   mov es, ax ; 8ec0
00000009: (   mov fs, ax ; 8ee0
0000000b: (   mov gs, ax ; 8ee8
0000000d: (   lss esp, ds:0x222bc ; 0fb225bc220200 :esp-_stack_start
00000014: (   call .+86 ; e856000000 :设置idt
00000019: (   call .+129 ; e881000000 :设置gdt
0000001e: (   mov eax, 0x00000010 ; b810000000
00000023: (   mov ds, ax ; 8ed8
00000025: (   mov es, ax ; 8ec0
00000027: (   mov fs, ax ; 8ee0
00000029: (   mov gs, ax ; 8ee8
0000002b: (   lss esp, ds:0x222bc ; 0fb225bc220200
move_to_user_mode:
00006825: (   mov eax, esp ; 89e0 :内核堆栈将用作任务0 user stack
00006827: (   push 0x00000017 ; 6a17 :任务0的ss:esp
00006829: (   push eax ; 50
0000682a: (   pushfd ; 9c
0000682b: (   push 0x0000000f ; 6a0f :任务0的cs:eip
0000682d: (   push 0x00006833 ; 6833680000
00006832: (   iretd ; cf :模拟中断返回,切换到任务0
00006833: (   mov eax, 0x00000017 ; b817000000
00006838: (   mov ds, ax ; 8ed8
0000683a: (   mov es, ax ; 8ec0
0000683c: (   mov fs, ax ; 8ee0
0000683e: (   mov gs, ax ; 8ee8
00006840: (   add esp, 0x0000000c ; 83c40c
Tips:
○1iretd执行时,eflags= 0x00000206: id vip vif ac vm rf nt IOPL=0 of df IF tf sf zf af PF cf,nt标志已复位;
2. 任务0到任务1
fork:
(1) main.c中定义:
static inline _syscall0(int,fork)
Tips:
○1main.c将fork定义为inline(避免函数调用时使用堆栈),因为创建进程时,任务0、1共用page directory和物理页面(任务1有自己的page table,但其page entry指向与任务0相同的物理页),任务1的page entry的r/w位被清零(只读,写时复制),任务0对应的page entry无变化。所以在对堆栈进行写操作时,任务1必须先运行,以免弄乱堆栈。见上面的main中任务0执行pause系统调用,详见后文分析。
(2) include/unistd.h中定义:
#define _syscall0(type,name) \
type name(void) \
{ \
long __res; \
__asm__ volatile ("int $0x80" \
: "=a" (__res) \
: "0" (__NR_##name)); \
if (__res >= 0) \
return (type) __res; \
errno = -__res; \
return -1; \
}
对应汇编为:
00006843: (   mov eax, 0x00000002 ; b802000000
00006848: (   int 0x80 ; cd80 :fork系统调用(eax=0x2)
0000684a: (   mov edx, eax ; 89c2 :eax返回PID,对子进程为0
0000684c: (   test edx, edx ; 85d2
0000684e: (   jnl .+13 ; 7d0d
00006850: (   neg edx ; f7da
00006852: (   mov dword ptr ds:0x38ea8, edx ; 8915a88e0300
00006858: (   mov edx, 0xffffffff ; baffffffff
0000685d: (   test edx, edx ; 85d2
0000685f: (   jnz .+5 ; 7505
执行fork系统调用前堆栈:
| STACK 0x00038604 [0x00000000]
| STACK 0x00038608 [0x00000050]
| STACK 0x0003860c [0x000001cb]
| STACK 0x00038610 [0x00005412]
| STACK 0x00038614 [0x00000000]
| STACK 0x00038618 [0x00000000]
| STACK 0x0003861c [0x00000000]
fork进入后堆栈:
| STACK 0x00022194 [0x0000684a] : eip (user space)
| STACK 0x00022198 [0x0000000f] : cs (user space)
| STACK 0x0002219c [0x00000212] : eflags
| STACK 0x000221a0 [0x00038604] : esp (user space)
| STACK 0x000221a4 [0x00000017] : ss (user space)
Tips:
○1系统调用接口(_system_call)
.align 2
_system_call:
push %ds :保存段寄存器
push %es
push %fs
pushl %eax # save the orig_eax :系统调用号,此处为0x2
pushl %edx : edx、ecx、ebx用做系统调用执行函数的参数,参考上面的_syscall0(无参数)
pushl %ecx # push %ebx,%ecx,%edx as parameters
pushl %ebx # to the system call
movl $0x10,%edx # set up ds,es to kernel space :linus将gdt表1-内核代码段,2-内核数据段,0-不用
mov %dx,%ds
mov %dx,%es
movl $0x17,%edx # fs points to local data space :fs用于user space/kernel space交换数据;ldt表2-用户空间数据段,1-用户空间代码段,0-不用
mov %dx,%fs
cmpl _NR_syscalls,%eax :linux/sys.h 中定义 int NR_syscalls = sizeof(sys_call_table)/sizeof(fn_ptr);
jae bad_sys_call
call _sys_call_table(,%eax,4) :调用system routine,此处为_sys_fork
pushl %eax :系统调用返回值,此处为PID,对任务0是1,对任务1是0,因为_sys_fork调用copy_process时设置任务1的eax=0,见下文
2: :重新调度任务,因为内核在执行move_to_user_mode前,已开启中断允许标志(见sti()); 时钟中断处理程序会递减当前任务(current)的运行时间片,并根据current的运行级别(cpl)决定是否抢占任务,见后文
movl _current,%eax
cmpl $0,state(%eax) # state :state表示任务运行状态(RUNNING/INTERRUPTIBLE/UNINTERRUPTIBLE/ZOMBIE/STOPPED, 见linux/sched.h)
jne reschedule
cmpl $0,counter(%eax) # counter :时间片,初始时为任务的priority
je reschedule
ret_from_sys_call: :下面代码到标号3之间,用于处理current的signals;进程在内核(cpl=0,ss=0x10)发生中断时不捕获信号;do_signal用于信号前预处理,把对应信号处理函数和恢复函数保存在user space stack中(所以用户可定义自己的信号处理函数),_system_call执行后,先执行信号处理,再执行发生系统调用后的一条指令,详见后文
movl _current,%eax
cmpl _task,%eax # task[0] cannot have signals :任务0是不捕获任何信号的,参考后文schedule说明
je 3f
cmpw $0x0f,CS(%esp) # was old code segment supervisor ? :内核中断不捕获信号
jne 3f
cmpw $0x17,OLDSS(%esp) # was stack segment = 0x17 ?
jne 3f
movl signal(%eax),%ebx :信号标识码(信号signal对应的位为 (1<<(signal-1)),参考后文schedule中的SIGALRM)
movl blocked(%eax),%ecx :信号阻塞标识码
notl %ecx
andl %ebx,%ecx
bsfl %ecx,%ecx
je 3f
btrl %ecx,%ebx
movl %ebx,signal(%eax)
incl %ecx
pushl %ecx
call _do_signal :信号预处理,见后文
popl %ecx
testl %eax, %eax
jne 2b # see if we need to switch tasks, or do more signals :是否还有可捕获信号
3: popl %eax
popl %ebx
popl %ecx
popl %edx
addl $4, %esp # skip orig_eax
pop %fs
pop %es
pop %ds
iret
_system_call fork (1):
00007984: (   push ds ; 1e
00007985: (   push es ; 06
00007986: (   push fs ; 0fa0
00007988: (   push eax ; 50 :_NR_FORK(0x2)
00007989: (   push edx ; 52
0000798a: (   push ecx ; 51
0000798b: (   push ebx ; 53
0000798c: (   mov edx, 0x00000010 ; ba10000000
00007991: (   mov ds, dx ; 8eda
00007993: (   mov es, dx ; 8ec2
00007995: (   mov edx, 0x00000017 ; ba17000000
0000799a: (   mov fs, dx ; 8ee2
0000799c: (   cmp eax, dword ptr ds:0x211a4 ; 3b05a4110200 :_NR_syscalls: 0x211a4(System.map)
000079a2: (   jnb .-48 ; 73d0
000079a4: (   call dword ptr ds:[eax*4+135240] ; ff148548100200 :call sys_call_table[NR_FORK],即_sys_fork;135240 == 0x21048; _sys_call_table: 0x21048(System.map)
000079ab: (   push eax ; 50 :见下文copy_process
_sys_fork:
00007abc: (   call .+8375 ; e8b7200000 :find_empty_process
00007ac1: (   test eax, eax ; 85c0
00007ac3: (   js .+14 ; 780e
00007ac5: (   push gs ; 0fa8
00007ac7: (   push esi ; 56
00007ac8: (   push edi ; 57
00007ac9: (   push ebp ; 55
00007aca: (   push eax ; 50
00007acb: (   call .+7284 ; e8741c0000 :copy_process,创建子进程主要代码
00007ad0: (   add esp, 0x00000014 ; 83c414
00007ad3: (   ret ; c3
find_empty_process:
主要确定全局唯一进程号,并判断系统是否还可以创建进程。
int find_empty_process(void)
{
int i;
repeat:
if ((++last_pid)<0) last_pid=1; :PID (1 for task 1)
for(i=0 ; i<NR_TASKS ; i++)
if (task[i] && ((task[i]->pid == last_pid) ||
(task[i]->pgrp == last_pid)))
goto repeat;
for(i=1 ; i<NR_TASKS ; i++) :进程队列是否已满
if (!task[i])
return i; :任务队列索引(eax)
return -EAGAIN;
}
copy_process:
声明:
int copy_process(int nr,long ebp,long edi,long esi,long gs,long none,
long ebx,long ecx,long edx, long orig_eax,
long fs,long es,long ds,
long eip,long cs,long eflags,long esp,long ss)
进入前的堆栈:
| STACK 0x00022160 [0x00000001] :index of tasks table (task 1)
| STACK 0x00022164 [0x0003860c]
| STACK 0x00022168 [0x00000ffc]
| STACK 0x0002216c [0x000e0000]
| STACK 0x00022170 [0x00000017]
| STACK 0x00022174 [0x000079ab] : eip(instruction after sys_fork)
| STACK 0x00022178 [0x00000000]
| STACK 0x0002217c [0x00056800]
| STACK 0x00022180 [0x00000021]
| STACK 0x00022184 [0x00000002] : NR_FORK
| STACK 0x00022188 [0x00000017] :_system_call put
| STACK 0x0002218c [0x00000017]
| STACK 0x00022190 [0x00000017]
| STACK 0x00022194 [0x0000684a] :eip (instruction after fork _system_ call)
| STACK 0x00022198 [0x0000000f] :cs (user cs of task 0)
| STACK 0x0002219c [0x00000212] :eflags
| STACK 0x000221a0 [0x00038604] :esp (user stack of task 0)
| STACK 0x000221a4 [0x00000017] :ss (user stack of task 0)
copy_process函数体
{
struct task_struct *p;
int i;
struct file *f;
p = (struct task_struct *) get_free_page(); :在main memory中申请一页存放process descriptor(task_struct);该进程内核栈的初始栈顶位于这一页的末尾,即PAGE_SIZE + (long) p(见下文p->tss.esp0)
if (!p)
return -EAGAIN;
task[nr] = p;
*p = *current; /* NOTE! this doesn't copy the supervisor stack */
:以下重新设置子进程的相关值
p->state = TASK_UNINTERRUPTIBLE; :参考下文进程创建成功后设置其为已就绪状态
p->pid = last_pid; :见上文find_empty_process
p->counter = p->priority; :初始化时间片,覆盖parent的counter
p->signal = 0; :no signal
p->alarm = 0; :no alarm(清除从parent复制来的报警定时值,参见下文schedule中对alarm的检查)
p->leader = 0; /* process leadership doesn't inherit */
p->utime = p->stime = 0;
p->cutime = p->cstime = 0;
p->start_time = jiffies; :进程创建时间,当前系统滴答值
--------------------------------------------------任务状态段---------------------------------------------------------------
p->tss.back_link = 0; :back link field字段,用于任务门切换后根据NT标志返回前一任务
p->tss.esp0 = PAGE_SIZE + (long) p; :进程创建后,内核堆栈是不变的,即任务切换时不被cpu保存
p->tss.ss0 = 0x10;
p->tss.eip = eip; :子进程eip,此处为0x0000684a
p->tss.eflags = eflags;
p->tss.eax = 0; :子进程返回的PID,上文中子进程fork返回0的原因
p->tss.ecx = ecx;
p->tss.edx = edx;
p->tss.ebx = ebx;
p->tss.esp = esp;
p->tss.ebp = ebp;
p->tss.esi = esi;
p->tss.edi = edi;
p->tss.es = es & 0xffff;
p->tss.cs = cs & 0xffff;
p->tss.ss = ss & 0xffff;
p->tss.ds = ds & 0xffff;
p->tss.fs = fs & 0xffff;
p->tss.gs = gs & 0xffff;
p->tss.ldt = _LDT(nr); :gdt中ldt索引,每个任务在gdt中占两项(tss,ldt),任务0(tss=4,ldt=5),任务1(tss=6,ldt=7)
p->tss.trace_bitmap = 0x80000000; :i/o位图
--------------------------------------------------任务状态段---------------------------------------------------------------
if (last_task_used_math == current):保存数学协处理器状态,见下文
__asm__("clts ; fnsave %0 ; frstor %0"::"m" (p->tss.i387));
if (copy_mem(nr,p)) { :复制parent的page directory, page tables,见下文
task[nr] = NULL;
free_page((long) p);
return -EAGAIN;
}
for (i=0; i<NR_OPEN;i++) :文件句柄,因为继承了parent
if (f=p->filp[i])
f->f_count++;
if (current->pwd) :当前目录i节点(i_node 用于描述目录与文件,见文件系统代码)
current->pwd->i_count++;
if (current->root) :当前根目录i_node
current->root->i_count++;
if (current->executable) :当前执行文件i_node
current->executable->i_count++;
if (current->library) :当前库文件i_node, 库文件加载在逻辑地址开始的60M处
current->library->i_count++;
set_tss_desc(gdt+(nr<<1)+FIRST_TSS_ENTRY,&(p->tss)); :设置gdt,见上文p->tss.ldt = _LDT(nr);
set_ldt_desc(gdt+(nr<<1)+FIRST_LDT_ENTRY,&(p->ldt));
p->p_pptr = current; :以下指针用于task tree
p->p_cptr = 0;
p->p_ysptr = 0;
p->p_osptr = current->p_cptr;
if (p->p_osptr)
p->p_osptr->p_ysptr = p;
current->p_cptr = p;
p->state = TASK_RUNNING; /* do this last, just in case */ :置为就绪状态
return last_pid;
}
copy_process 中数学协处理器状态保存(task 0--->task 1):
_current: 0x221b4 (System.map)
_last_task_used_math: 0x221b8
<bochs:82> x /xw ds:0x221b4
[bochs]:
0x000221b4 <bogus+ 0>: 0x000211a8
<bochs:83> x /xw ds:0x221b8
[bochs]:
0x000221b8 <bogus+ 0>: 0x00000000
000098fb: (   mov eax, dword ptr ds:0x221b4 ; a1b4210200
00009900: (   cmp eax, dword ptr ds:0x221b8 ; 3b05b8210200
00009906: (   jnz .+14 ; 750e :可知current != last_task_used_math (current=task 0)
00009908: (   clts ; 0f06
0000990a: (   fnsave ds:[ebx+1072] ; ddb330040000
00009910: (   frstor ds:[ebx+1072] ; dda330040000
copy_mem:
设置 ldt of child task(代码段、数据段基址均为 nr * TASK_SIZE),再调用copy_page_tables复制parent的页表项:parent/child位于同一个page directory中,并指向相同的物理页面,但child使用新申请的page tables,参考下文copy_page_tables。
int copy_mem(int nr,struct task_struct * p)
{
unsigned long old_data_base,new_data_base,data_limit;
unsigned long old_code_base,new_code_base,code_limit;
code_limit=get_limit(0x0f);
data_limit=get_limit(0x17);
old_code_base = get_base(current->ldt[1]);
old_data_base = get_base(current->ldt[2]);
if (old_data_base != old_code_base)
panic("We don't support separate I&D");
if (data_limit < code_limit)
panic("Bad data_limit");
new_data_base = new_code_base = nr * TASK_SIZE;
p->start_code = new_code_base;
set_base(p->ldt[1],new_code_base);
set_base(p->ldt[2],new_data_base);
if (copy_page_tables(old_data_base,new_data_base,data_limit)) { :linus said this is most complicated code in memory management
free_page_tables(new_data_base,data_limit);
return -ENOMEM;
}
return 0;
}
copy_page_tables:
此处我们主要关注task 0--->task 1
int copy_page_tables(unsigned long from,unsigned long to,long size)
{
unsigned long * from_page_table;
unsigned long * to_page_table;
unsigned long this_page;
unsigned long * from_dir, * to_dir;
unsigned long new_page;
unsigned long nr;
if ((from&0x3fffff) || (to&0x3fffff))
panic("copy_page_tables called with wrong alignment");
from_dir = (unsigned long *) ((from>>20) & 0xffc); /* _pg_dir = 0 */ :from对应的页目录项地址:页目录基址_pg_dir=0(见下文),(from>>20)&0xffc 即 (from>>22)*4
to_dir = (unsigned long *) ((to>>20) & 0xffc); :from(to)都是线性地址(且须4M对齐,见上面的检查),22-31(index of page directory),12-21(index of page table), 0-11(offset address to physical page)
size = ((unsigned) (size+0x3fffff)) >> 22;
for( ; size-->0 ; from_dir++,to_dir++) {
if (1 & *to_dir) :页目录项/页表项低12位格式:bit 11-9 = AVL, bit 8-7 = 保留, bit 6 = D, bit 5 = A, bit 4-3 = 保留, bit 2 = U/S, bit 1 = R/W, bit 0 = P;此处检查P(present)位
panic("copy_page_tables: already exist");
if (!(1 & *from_dir))
continue;
from_page_table = (unsigned long *) (0xfffff000 & *from_dir);
if (!(to_page_table = (unsigned long *) get_free_page())) :存放page table entries
return -1; /* Out of memory, see freeing */
*to_dir = ((unsigned long) to_page_table) | 7; :set page table = to_page_table | 111(2), present、user task can write
nr = (from==0)?0xA0:1024; :Note! 此时from=0,所以task 1只复制0xA0个pages(即前640KB)
for ( ; nr-- > 0 ; from_page_table++,to_page_table++) {
this_page = *from_page_table;
if (!this_page)
continue;
if (!(1 & this_page)) { :说明被复制页面在交换设备中,(task0不会出现,task 0在内核代码中)
if (!(new_page = get_free_page())) :申请页面,下文读出交换设备
return -1;
read_swap_page(this_page>>1, (char *) new_page);
*to_page_table = this_page; :目标页面依然存于设备文件
*from_page_table = new_page | (PAGE_DIRTY | 7); :被复制页面已交换进main ram
continue;
}
this_page &= ~2;
*to_page_table = this_page; :否则目标页面写保护,(task 1 执行写时复制机制, task 0 页面属性无变化, 所以task 1 创建后要在task 0 执行写操作前执行,以免弄乱堆栈)
if (this_page > LOW_MEM) { :用于用户进程(task 0 位于 0—640kb)
*from_page_table = this_page;
this_page -= LOW_MEM;
this_page >>= 12;
mem_map[this_page]++;
}
}
}
invalidate(); :刷新页高速缓冲
return 0;
}
_pg_dir:
参考前面head.s 将main入栈后,执行jmp .+62,在物理地址0x00000000开始设置page directory, page tables.
.text
.globl _idt,_gdt,_pg_dir,_tmp_floppy_area
_pg_dir: :_pg_dir起始地址,位于0x0000, size of every page is 4k
startup_32:
.org 0x1000 :address of first page table
pg0:
.org 0x2000 :second
pg1:
.org 0x3000 :third
pg2:
.org 0x4000 :fourth
pg3:
.org 0x5000 :end of fourth page table(pg_dir + 4 page tables共5页到此结束,此处不是第五个页表)
.align 2
setup_paging:
movl $1024*5,%ecx /* 5 pages - pg_dir+4 page tables */
xorl %eax,%eax
xorl %edi,%edi /* pg_dir is at 0x000 */
cld;rep;stosl
movl $pg0+7,_pg_dir /* set present bit/user r/w */
movl $pg1+7,_pg_dir+4 /* --------- " " --------- */
movl $pg2+7,_pg_dir+8 /* --------- " " --------- */
movl $pg3+7,_pg_dir+12 /* --------- " " --------- */
movl $pg3+4092,%edi
movl $0xfff007,%eax /* 16Mb - 4096 + 7 (r/w user,p) */
std
1: stosl /* fill pages backwards - more efficient :-) */
subl $0x1000,%eax
jge 1b
xorl %eax,%eax /* pg_dir is at 0x0000 */
movl %eax,%cr3 /* cr3 - page directory start */
movl %cr0,%eax
orl $0x80000000,%eax
movl %eax,%cr0 /* set paging (PG) bit */
ret /* this also flushes prefetch-queue */
_system_call fork (2):
000079ab: (   push eax ; 50 :PID of task 1
000079ac: (   mov eax, dword ptr ds:0x221b4 ; a1b4210200 : task 0 Is RUNNING?
000079b1: (   cmp dword ptr ds:[eax], 0x00000000 ; 83780000
000079b5: (   jnz .-63 ; 75c1
000079b7: (   cmp dword ptr ds:[eax+4], 0x00000000 ; 83780400 : counter out?
000079bb: (   jz .-69 ; 74bb
此后task 1创建成功,task 0 返回PID of task 1:
<bochs:158> ? eax
0x1 1
检测task 0的状态与时间片,判断是否需要切换任务(因为main调用sched_init中开启了时钟中断)。
sched_init:
outb_p(0x36,0x43); /* binary, mode 3, LSB/MSB, ch 0 */
outb_p(LATCH & 0xff , 0x40); /* LSB */
outb(LATCH >> 8 , 0x40); /* MSB */
set_intr_gate(0x20,&timer_interrupt); :idt中时钟中断
outb(inb_p(0x21)&~0x01,0x21); :设置时钟中断允许标志
_timer_interrupt:
.align 2
_timer_interrupt:
push %ds # save ds,es and put kernel data space
push %es # into them. %fs is used by _system_call
push %fs
pushl $-1 # fill in -1 for orig_eax :对应_system_call为系统调用号
pushl %edx # we save %eax,%ecx,%edx as gcc doesn't
pushl %ecx # save those across function calls. %ebx
pushl %ebx # is saved as we use that in ret_sys_call
pushl %eax
movl $0x10,%eax
mov %ax,%ds
mov %ax,%es
movl $0x17,%eax
mov %ax,%fs
incl _jiffies
movb $0x20,%al # EOI to interrupt controller #1
outb %al,$0x20
movl CS(%esp),%eax
andl $3,%eax # %eax is CPL (0 or 3, 0=supervisor)
pushl %eax
call _do_timer # 'do_timer(long CPL)' does everything from
addl $4,%esp # task switching to accounting ...
jmp ret_from_sys_call
do_timer:
if (cpl)
current->utime++; :user space time
else
current->stime++; :kernel space time
if ((--current->counter)>0) return; :时间片递减,若发生中断时,当前进程counter>=2,不切换任务
current->counter=0;
if (!cpl) return; :进程处于内核时不被抢占
schedule();
_system_call fork (3):
进程唯一捕获信号的地方,task[0]忽略信号的处理。参见下文do_signal
<bochs:163> x /xw ds:0x221b4
[bochs]:
0x000221b4 <bogus+ 0>: 0x000211a8
<bochs:164> x /xw ds:0x221bc
[bochs]:
0x000221bc <bogus+ 0>: 0x000211a8
000079bd: (   mov eax, dword ptr ds:0x221b4 ; a1b4210200 :_current: 0x221b4
000079c2: (   cmp eax, dword ptr ds:0x221bc ; 3b05bc210200 :_task: 0x221bc
000079c8: (   jz .+52 ; 7434 :由上可知,将执行0x000079fe处指令
000079ca: (   cmp word ptr ss:[esp+36], 0x000f ; 66837c24240f
000079d0: (   jnz .+44 ; 752c
000079d2: (   cmp word ptr ss:[esp+48], 0x0017 ; 66837c243017
000079d8: (   jnz .+36 ; 7524
000079da: (   mov ebx, dword ptr ds:[eax+12] ; 8b580c
000079dd: (   mov ecx, dword ptr ds:[eax+528] ; 8b8810020000
000079e3: (   not ecx ; f7d1
000079e5: (   and ecx, ebx ; 21d9
000079e7: (   bsf ecx, ecx ; 0fbcc9
000079ea: (   jz .+18 ; 7412
000079ec: (   btr ebx, ecx ; 0fb3cb
000079ef: (   mov dword ptr ds:[eax+12], ebx ; 89580c
000079f2: (   inc ecx ; 41
000079f3: (   push ecx ; 51
000079f4: (   call .+18019 ; e863460000
000079f9: (   pop ecx ; 59
000079fa: (   test eax, eax ; 85c0
000079fc: (   jnz .-82 ; 75ae
Tips:
○1 do_signal:
do_signal将信号处理函数置于user stack, 此处task 0不捕获信号
int do_signal(long signr,long eax,long ebx, long ecx, long edx, long orig_eax,
long fs, long es, long ds,
long eip, long cs, long eflags,
unsigned long * esp, long ss)
_system_call fork (4):
000079fe: (   pop eax ; 58
000079ff: (   pop ebx ; 5b
00007a00: (   pop ecx ; 59
00007a01: (   pop edx ; 5a
00007a02: (   add esp, 0x00000004 ; 83c404
00007a05: (   pop fs ; 0fa1
00007a07: (   pop es ; 07
00007a08: (   pop ds ; 1f
00007a09: (   iretd ; cf
系统调用中断前后段寄存器变化为:
es:0x0010, dh=0x00c09300, dl=0x00000fff, valid=1
Data segment, base=0x00000000, limit=0x00ffffff, Read/Write, Accessed
cs:0x0008, dh=0x00c09b00, dl=0x00000fff, valid=1
Code segment, base=0x00000000, limit=0x00ffffff, Execute/Read, Accessed, 32-bit
ss:0x0010, dh=0x00c09300, dl=0x00000fff, valid=7
Data segment, base=0x00000000, limit=0x00ffffff, Read/Write, Accessed
ds:0x0010, dh=0x00c09300, dl=0x00000fff, valid=7
Data segment, base=0x00000000, limit=0x00ffffff, Read/Write, Accessed
fs:0x0017, dh=0x00c0f300, dl=0x0000009f, valid=1
Data segment, base=0x00000000, limit=0x0009ffff, Read/Write, Accessed
gs:0x0017, dh=0x00c0f300, dl=0x0000009f, valid=1
Data segment, base=0x00000000, limit=0x0009ffff, Read/Write, Accessed
es:0x0017, dh=0x00c0f300, dl=0x0000009f, valid=1
Data segment, base=0x00000000, limit=0x0009ffff, Read/Write, Accessed
cs:0x000f, dh=0x00c0fb00, dl=0x0000009f, valid=1
Code segment, base=0x00000000, limit=0x0009ffff, Execute/Read, Accessed, 32-bit
ss:0x0017, dh=0x00c0f300, dl=0x0000009f, valid=1
Data segment, base=0x00000000, limit=0x0009ffff, Read/Write, Accessed
ds:0x0017, dh=0x00c0f300, dl=0x0000009f, valid=1
Data segment, base=0x00000000, limit=0x0009ffff, Read/Write, Accessed
fs:0x0017, dh=0x00c0f300, dl=0x0000009f, valid=1
Data segment, base=0x00000000, limit=0x0009ffff, Read/Write, Accessed
gs:0x0017, dh=0x00c0f300, dl=0x0000009f, valid=1
Data segment, base=0x00000000, limit=0x0009ffff, Read/Write, Accessed
此后,task 0调用pause系统调用,切换到task 1上运行。(Note! task 0 与task 1 此时共用user stack, task 1运行前,task 0 不能对ss:esp执行写操作)
00006866: (   mov eax, 0x0000001d ; b81d000000
0000686b: (   int 0x80 ; cd80 :见下文sys_pause
0000686d: (   jmp .-9 ; ebf7
sys_pause:
int sys_pause(void)
{
current->state = TASK_INTERRUPTIBLE; :task 0不起作用
schedule();
return 0;
}
schedule:
void schedule(void)
{
int i,next,c;
struct task_struct ** p;
/* check alarm, wake up any interruptible tasks that have got a signal */
for(p = &LAST_TASK ; p > &FIRST_TASK ; --p) :从最后一个任务向前扫描到task 1(不含task 0)
if (*p) {
if ((*p)->timeout && (*p)->timeout < jiffies) { :超时且可中断则置为就绪态
(*p)->timeout = 0;
if ((*p)->state == TASK_INTERRUPTIBLE)
(*p)->state = TASK_RUNNING;
}
if ((*p)->alarm && (*p)->alarm < jiffies) { :系统向其发送ALARM signal
(*p)->signal |= (1<<(SIGALRM-1));
(*p)->alarm = 0;
}
if (((*p)->signal & ~(_BLOCKABLE & (*p)->blocked)) && :除屏蔽信号外还有信号,且处于可中断状态,则置为就绪态
(*p)->state==TASK_INTERRUPTIBLE)
(*p)->state=TASK_RUNNING;
}
/* this is the scheduler proper: */
while (1) {
c = -1;
next = 0;
i = NR_TASKS;
p = &task[NR_TASKS];
while (--i) { :到task 1,选取时间片最大者;若有task 0外其它任务,且都counter=0,则重设counter
if (!*--p)
continue;
if ((*p)->state == TASK_RUNNING && (*p)->counter > c)
c = (*p)->counter, next = i;
}
if (c) break;
for(p = &LAST_TASK ; p > &FIRST_TASK ; --p)
if (*p)
(*p)->counter = ((*p)->counter >> 1) +
(*p)->priority;
}
switch_to(next);
}
switch_to:
#define switch_to(n) {\
struct {long a,b;} __tmp; \
__asm__("cmpl %%ecx,_current\n\t" \
"je 1f\n\t" \
"movw %%dx,%1\n\t" \
"xchgl %%ecx,_current\n\t" \
"ljmp %0\n\t" \ : 至此,task 0切换到task 1执行;task 0被切换回后,从ljmp的下一条指令(cmpl)继续执行
"cmpl %%ecx,_last_task_used_math\n\t" \
"jne 1f\n\t" \
"clts\n" \
"1:" \
::"m" (*&__tmp.a),"m" (*&__tmp.b), \
"d" (_TSS(n)),"c" ((long) task[n])); \
}
00006f2b: (   mov word ptr ss:[ebp-4], dx ; 668955fc
00006f2f: (   xchg dword ptr ds:0x221b4, ecx ; 870db4210200
00006f35: (   jmp far ss:[ebp-8] ; ff6df8 :任务切换在此
00006f38: (   cmp dword ptr ds:0x221b8, ecx ; 390db8210200
00006f3e: (   jnz .+2 ; 7502
00006f40: (   clts ; 0f06
Task 0此时位于kernel space
<bochs:3> sreg
es:0x0010, dh=0x00c09300, dl=0x00000fff, valid=1
Data segment, base=0x00000000, limit=0x00ffffff, Read/Write, Accessed
cs:0x0008, dh=0x00c09b00, dl=0x00000fff, valid=1
Code segment, base=0x00000000, limit=0x00ffffff, Execute/Read, Accessed, 32-bit
ss:0x0010, dh=0x00c09300, dl=0x00000fff, valid=7
Data segment, base=0x00000000, limit=0x00ffffff, Read/Write, Accessed
ds:0x0010, dh=0x00c09300, dl=0x00000fff, valid=7
Data segment, base=0x00000000, limit=0x00ffffff, Read/Write, Accessed
fs:0x0017, dh=0x00c0f300, dl=0x0000009f, valid=1
Data segment, base=0x00000000, limit=0x0009ffff, Read/Write, Accessed
gs:0x0017, dh=0x00c0f300, dl=0x0000009f, valid=1
Data segment, base=0x00000000, limit=0x0009ffff, Read/Write, Accessed
ldtr:0x0028, dh=0x00008202, dl=0x15580068, valid=1
tr:0x0020, dh=0x00008b02, dl=0x15700068, valid=1
gdtr:base=0x00005cb8, limit=0x7ff
idtr:base=0x000054b8, limit=0x7ff
<bochs:4> u
00006f35: (   jmp far ss:[ebp-8] ; ff6df8
<bochs:5> s
Next at t=52090203
(0) [0x0000684a] 000f:0000684a (unk. ctxt): mov edx, eax ; 89c2
切换至task 1
<bochs:6> sreg
es:0x0017, dh=0x04c0f300, dl=0x0000009f, valid=1
Data segment, base=0x04000000, limit=0x0009ffff, Read/Write, Accessed
cs:0x000f, dh=0x04c0fb00, dl=0x0000009f, valid=1
Code segment, base=0x04000000, limit=0x0009ffff, Execute/Read, Accessed, 32-bit
ss:0x0017, dh=0x04c0f300, dl=0x0000009f, valid=1
Data segment, base=0x04000000, limit=0x0009ffff, Read/Write, Accessed
ds:0x0017, dh=0x04c0f300, dl=0x0000009f, valid=1
Data segment, base=0x04000000, limit=0x0009ffff, Read/Write, Accessed
fs:0x0017, dh=0x04c0f300, dl=0x0000009f, valid=1
Data segment, base=0x04000000, limit=0x0009ffff, Read/Write, Accessed
gs:0x0017, dh=0x04c0f300, dl=0x0000009f, valid=1
Data segment, base=0x04000000, limit=0x0009ffff, Read/Write, Accessed
ldtr:0x0038, dh=0x000082ff, dl=0xf3b00068, valid=1
tr:0x0030, dh=0x00008bff, dl=0xf3c80068, valid=1
task 1 开始执行。
[培训]二进制漏洞攻防(第3期);满10人开班;模糊测试与工具使用二次开发;网络协议漏洞挖掘;Linux内核漏洞挖掘与利用;AOSP漏洞挖掘与利用;代码审计。
赞赏
看原图