操作系统
操作系统是指在整个系统中完成最基本功能以及系统管理的那部分。包括:
- 内核
- 设备驱动程序
- 启动引导程序
- 命令行SHELL或者其他用户界面
-
内核
中断服务程序
- 进程调度程序
- 内存管理程序
-
进程
定义
处于执行期程序以及资源总称。进程就是处于执行期间的代码段,但不仅限于代码段,通常包含其他资源:
打开的文件
- 挂起的信号
- 内核内部数据
- 处理器状态
- 映射内存地址空间
- 多线程
- 用来存放全局变量的数据段
- … …
线程是进程中活动中的对象,是内核调度的基本单元。在Linux系统中,线程是一种特殊的进程:包含独立的程序计数器、进程栈以及进程寄存器。
生命周期
父进程(fork) 创建子进程;
父进程(exec)子进程创建新的地址空间,载入程序,执行进程;
子进程(exit)退出执行,并释放资源;
描述符(task_struct)
数据结构
内核把进程的列表存放于叫做任务队列(task list)的双向循环链表中。链表中的每一项都是类型为task_struct、称为进程描述符的结构。进程描述符包含一个具体进程所有信息。
// kernel/sched.h
struct task_struct {
volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */
void *stack;
atomic_t usage;
unsigned int flags; /* per process flags, defined below */
unsigned int ptrace;
int lock_depth; /* BKL lock depth */
unsigned int policy;
cpumask_t cpus_allowed;
struct sched_info sched_info;
struct list_head tasks;
struct mm_struct *mm, *active_mm;
#if defined(SPLIT_RSS_COUNTING)
struct task_rss_stat rss_stat;
#endif
/* task state */
int exit_state;
int exit_code, exit_signal;
int pdeath_signal; /* The signal sent when the parent dies */
/* ??? */
unsigned int personality;
unsigned did_exec:1;
unsigned in_execve:1; /* Tell the LSMs that the process is doing an
* execve */
unsigned in_iowait:1;
/* Revert to default priority/policy when forking */
unsigned sched_reset_on_fork:1;
pid_t pid;
pid_t tgid;
#ifdef CONFIG_CC_STACKPROTECTOR
/* Canary value for the -fstack-protector gcc feature */
unsigned long stack_canary;
#endif
/*
* pointers to (original) parent process, youngest child, younger sibling,
* older sibling, respectively. (p->father can be replaced with
* p->real_parent->pid)
*/
struct task_struct *real_parent; /* real parent process */
struct task_struct *parent; /* recipient of SIGCHLD, wait4() reports */
/*
* children/sibling forms the list of my natural children
*/
struct list_head children; /* list of my children */
struct list_head sibling; /* linkage in my parent's children list */
struct task_struct *group_leader; /* threadgroup leader */
/*
* ptraced is the list of tasks this task is using ptrace on.
* This includes both natural children and PTRACE_ATTACH targets.
* p->ptrace_entry is p's link on the p->parent->ptraced list.
*/
struct list_head ptraced;
struct list_head ptrace_entry;
/* PID/PID hash table linkage. */
struct pid_link pids[PIDTYPE_MAX];
struct list_head thread_group;
struct completion *vfork_done; /* for vfork() */
int __user *set_child_tid; /* CLONE_CHILD_SETTID */
int __user *clear_child_tid; /* CLONE_CHILD_CLEARTID */
cputime_t utime, stime, utimescaled, stimescaled;
cputime_t gtime;
#ifndef CONFIG_VIRT_CPU_ACCOUNTING
cputime_t prev_utime, prev_stime;
#endif
unsigned long nvcsw, nivcsw; /* context switch counts */
struct timespec start_time; /* monotonic time */
struct timespec real_start_time; /* boot based time */
/* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
unsigned long min_flt, maj_flt;
struct task_cputime cputime_expires;
struct list_head cpu_timers[3];
/* process credentials */
const struct cred __rcu *real_cred; /* objective and real subjective task
* credentials (COW) */
const struct cred __rcu *cred; /* effective (overridable) subjective task
* credentials (COW) */
struct cred *replacement_session_keyring; /* for KEYCTL_SESSION_TO_PARENT */
char comm[TASK_COMM_LEN]; /* executable name excluding path
- access with [gs]et_task_comm (which lock
it with task_lock())
- initialized normally by setup_new_exec */
/* file system info */
int link_count, total_link_count;
#ifdef CONFIG_SYSVIPC
/* ipc stuff */
struct sysv_sem sysvsem;
#endif
#ifdef CONFIG_DETECT_HUNG_TASK
/* hung task detection */
unsigned long last_switch_count;
#endif
/* CPU-specific state of this task */
struct thread_struct thread;
/* filesystem information */
struct fs_struct *fs;
/* open file information */
struct files_struct *files;
/* namespaces */
struct nsproxy *nsproxy;
/* signal handlers */
struct signal_struct *signal;
struct sighand_struct *sighand;
sigset_t blocked, real_blocked;
sigset_t saved_sigmask; /* restored if set_restore_sigmask() was used */
struct sigpending pending;
unsigned long sas_ss_sp;
size_t sas_ss_size;
int (*notifier)(void *priv);
void *notifier_data;
sigset_t *notifier_mask;
struct audit_context *audit_context;
#ifdef CONFIG_AUDITSYSCALL
uid_t loginuid;
unsigned int sessionid;
#endif
seccomp_t seccomp;
/* Thread group tracking */
u32 parent_exec_id;
u32 self_exec_id;
/* Protection of (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed,
* mempolicy */
spinlock_t alloc_lock;
#ifdef CONFIG_GENERIC_HARDIRQS
/* IRQ handler threads */
struct irqaction *irqaction;
#endif
/* Protection of the PI data structures: */
raw_spinlock_t pi_lock;
#ifdef CONFIG_RT_MUTEXES
/* PI waiters blocked on a rt_mutex held by this task */
struct plist_head pi_waiters;
/* Deadlock detection and priority inheritance handling */
struct rt_mutex_waiter *pi_blocked_on;
#endif
#ifdef CONFIG_DEBUG_MUTEXES
/* mutex deadlock detection */
struct mutex_waiter *blocked_on;
#endif
#ifdef CONFIG_TRACE_IRQFLAGS
unsigned int irq_events;
unsigned long hardirq_enable_ip;
unsigned long hardirq_disable_ip;
unsigned int hardirq_enable_event;
unsigned int hardirq_disable_event;
int hardirqs_enabled;
int hardirq_context;
unsigned long softirq_disable_ip;
unsigned long softirq_enable_ip;
unsigned int softirq_disable_event;
unsigned int softirq_enable_event;
int softirqs_enabled;
int softirq_context;
#endif
#ifdef CONFIG_LOCKDEP
# define MAX_LOCK_DEPTH 48UL
u64 curr_chain_key;
int lockdep_depth;
unsigned int lockdep_recursion;
struct held_lock held_locks[MAX_LOCK_DEPTH];
gfp_t lockdep_reclaim_gfp;
#endif
/* journalling filesystem info */
void *journal_info;
/* stacked block device info */
struct bio_list *bio_list;
#ifdef CONFIG_BLOCK
/* stack plugging */
struct blk_plug *plug;
#endif
/* VM state */
struct reclaim_state *reclaim_state;
struct backing_dev_info *backing_dev_info;
struct io_context *io_context;
unsigned long ptrace_message;
siginfo_t *last_siginfo; /* For ptrace use. */
struct task_io_accounting ioac;
#if defined(CONFIG_TASK_XACCT)
u64 acct_rss_mem1; /* accumulated rss usage */
u64 acct_vm_mem1; /* accumulated virtual memory usage */
cputime_t acct_timexpd; /* stime + utime since last update */
#endif
#ifdef CONFIG_CPUSETS
nodemask_t mems_allowed; /* Protected by alloc_lock */
int mems_allowed_change_disable;
int cpuset_mem_spread_rotor;
int cpuset_slab_spread_rotor;
#endif
#ifdef CONFIG_CGROUPS
/* Control Group info protected by css_set_lock */
struct css_set __rcu *cgroups;
/* cg_list protected by css_set_lock and tsk->alloc_lock */
struct list_head cg_list;
#endif
#ifdef CONFIG_FUTEX
struct robust_list_head __user *robust_list;
#ifdef CONFIG_COMPAT
struct compat_robust_list_head __user *compat_robust_list;
#endif
struct list_head pi_state_list;
struct futex_pi_state *pi_state_cache;
#endif
#ifdef CONFIG_PERF_EVENTS
struct perf_event_context *perf_event_ctxp[perf_nr_task_contexts];
struct mutex perf_event_mutex;
struct list_head perf_event_list;
#endif
#ifdef CONFIG_NUMA
struct mempolicy *mempolicy; /* Protected by alloc_lock */
short il_next;
short pref_node_fork;
#endif
atomic_t fs_excl; /* holding fs exclusive resources */
struct rcu_head rcu;
/*
* cache last used pipe for splice
*/
struct pipe_inode_info *splice_pipe;
/*
* time slack values; these are used to round up poll() and
* select() etc timeout values. These are in nanoseconds.
*/
unsigned long timer_slack_ns;
unsigned long default_timer_slack_ns;
struct list_head *scm_work_list;
};
分配进程描述符
Linux通过slab分配task_struct结构,这样能够达到对象复用以及缓存着色的目的。
创建与释放
创建分为两个过程去执行: fork()和exec().
fork() -> clone() -> do_fork() -> copy_process(). copy_process 定义在 kernel/fork.c中:
- 调用dum_task_struct() 为新进程创建内核栈、thread_info结构和task_struct;
- 进程描述符内成员设为初始值,进程状态设置为TASK_UNINTERRUPTIBLE(保证它不会投入运行);
- 置为task_struct中的flags成员;
- 调用alloc_pid()分配一个有效PID;
- 返回一个指向子进程的指针。
进程终结:do_exit().
- task_struct 中 flags 设置为 PF_EXITING;
- del_timer_sync() 确保没有定时器在排队或者处理程序;
- exit_mm() 释放进程占用的 mm_struct;
- sem __exit();
- exit_files() 和 exit_fs(), 递减文件描述符和文件系统数据引用计数。
- 完成内核机制规定的退出动作;
- exit_notify()通知父进程,给子进程重新找养父(线程组中其他进程或者init进程);
- 进程状态 task_struct->exit_state = EXIT_ZOMBIE;
- 调用schedule()切换新的进程,而改进程因为处于僵尸状态不会再次被调度。
- 执行完以上过程,进程不可被运行,它所占用内存只包含内核栈、thread_info结构和task_struct结构,这些信息供父进程检索。
- 父进程检索到信息后,通知内核那是无关信息,进程所持有剩余内存释放,归还系统:release_task():
- 调用 __exit_signal() -> _unhash_process() -> detach_pid() 从pidhash上删除该进程,同时任务列表中删除该进程;
- _exit_gignal() 释放僵死进程所有剩余资源,并统计和记录;
- release_task()调用put_task_struct()释放进程内核栈和thread_info结构页,释放slab高速缓存。
子进程要寻找养父的目的,是防止子进程切换到僵死状态时,没有父进程索引,白白耗费内存。