Linux内核进程调度:进程控制块task_struct

2025-10-12 4652 words 10 minutes

Contents

struct task_struct 是 Linux 内核中用于表示进程的核心数据结构。它包含了进程的各种信息，如状态、调度信息、内存管理信息、文件描述符等。理解 task_struct 的结构和各个字段的作用，对于深入了解 Linux 进程调度机制至关重要。

以下代码摘自 Linux 5.8 版本的内核源码，不同版本可能会有所不同。

linux内核源码路径: include/linux/sched.h

/*
 * struct task_struct - the kernel's per-task descriptor (process control
 * block).
 *
 * Layout constraints visible in this definition:
 *  - with CONFIG_THREAD_INFO_IN_TASK, 'thread_info' must be the first member;
 *  - only scheduling-critical members live above
 *    randomized_struct_fields_start;
 *  - new members belong between randomized_struct_fields_start and
 *    randomized_struct_fields_end;
 *  - 'thread' must stay the last member (variable-sized on x86).
 */
struct task_struct {
#ifdef CONFIG_THREAD_INFO_IN_TASK
	/*
	 * For reasons of header soup (see current_thread_info()), this
	 * must be the first element of task_struct.
	 */
	struct thread_info		thread_info;
#endif
	/* -1 unrunnable, 0 runnable, >0 stopped: */
	volatile long			state;

	/*
	 * This begins the randomizable portion of task_struct. Only
	 * scheduling-critical items should be added above here.
	 */
	randomized_struct_fields_start

	void				*stack;
	/* Reference count controlling the lifetime of this task_struct: */
	refcount_t			usage;
	/* Per task flags (PF_*), defined further below: */
	unsigned int			flags;
	unsigned int			ptrace;

#ifdef CONFIG_SMP
	/* Whether the task is currently running on a CPU: */
	int				on_cpu;
	struct __call_single_node	wake_entry;
#ifdef CONFIG_THREAD_INFO_IN_TASK
	/* Current CPU: */
	unsigned int			cpu;
#endif
	unsigned int			wakee_flips;
	unsigned long			wakee_flip_decay_ts;
	struct task_struct		*last_wakee;

	/*
	 * recent_used_cpu is initially set as the last CPU used by a task
	 * that wakes affine another task. Waker/wakee relationships can
	 * push tasks around a CPU where each wakeup moves to the next one.
	 * Tracking a recently used CPU allows a quick search for a recently
	 * used CPU that may be idle.
	 */
	int				recent_used_cpu;
	int				wake_cpu;
#endif
	/* Whether the task is on a run queue: */
	int				on_rq;

	/* Dynamic priority, the one actually used when scheduling: */
	int				prio;
	/* Static priority (derived from the nice value): */
	int				static_prio;
	/* Normalized priority (static_prio combined with the policy): */
	int				normal_prio;
	/* Real-time priority; 0 means a normal (non-RT) task: */
	unsigned int			rt_priority;

	/* Scheduling class that defines this task's scheduling behaviour: */
	const struct sched_class	*sched_class;
	/* Scheduling entities for the CFS, RT and deadline classes: */
	struct sched_entity		se;
	struct sched_rt_entity		rt;
#ifdef CONFIG_CGROUP_SCHED
	struct task_group		*sched_task_group;
#endif
	struct sched_dl_entity		dl;

#ifdef CONFIG_UCLAMP_TASK
	/* Clamp values requested for a scheduling entity */
	struct uclamp_se		uclamp_req[UCLAMP_CNT];
	/* Effective clamp values used for a scheduling entity */
	struct uclamp_se		uclamp[UCLAMP_CNT];
#endif

#ifdef CONFIG_PREEMPT_NOTIFIERS
	/* List of struct preempt_notifier: */
	struct hlist_head		preempt_notifiers;
#endif

#ifdef CONFIG_BLK_DEV_IO_TRACE
	unsigned int			btrace_seq;
#endif

	/* Scheduling policy (SCHED_NORMAL, SCHED_FIFO, SCHED_RR, ...): */
	unsigned int			policy;
	/* CPU affinity: the set of CPUs this task may run on: */
	int				nr_cpus_allowed;
	const cpumask_t			*cpus_ptr;
	cpumask_t			cpus_mask;

#ifdef CONFIG_PREEMPT_RCU
	int				rcu_read_lock_nesting;
	union rcu_special		rcu_read_unlock_special;
	struct list_head		rcu_node_entry;
	struct rcu_node			*rcu_blocked_node;
#endif /* #ifdef CONFIG_PREEMPT_RCU */

#ifdef CONFIG_TASKS_RCU
	unsigned long			rcu_tasks_nvcsw;
	u8				rcu_tasks_holdout;
	u8				rcu_tasks_idx;
	int				rcu_tasks_idle_cpu;
	struct list_head		rcu_tasks_holdout_list;
#endif /* #ifdef CONFIG_TASKS_RCU */

#ifdef CONFIG_TASKS_TRACE_RCU
	int				trc_reader_nesting;
	int				trc_ipi_to_cpu;
	union rcu_special		trc_reader_special;
	bool				trc_reader_checked;
	struct list_head		trc_holdout_list;
#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */

	struct sched_info		sched_info;

	struct list_head		tasks;
#ifdef CONFIG_SMP
	struct plist_node		pushable_tasks;
	struct rb_node			pushable_dl_tasks;
#endif

	/* User address space descriptor; NULL for kernel threads: */
	struct mm_struct		*mm;
	/* Address space currently in use (kernel threads borrow one): */
	struct mm_struct		*active_mm;

	/* Per-thread vma caching: */
	struct vmacache			vmacache;

#ifdef SPLIT_RSS_COUNTING
	struct task_rss_stat		rss_stat;
#endif
	int				exit_state;
	int				exit_code;
	int				exit_signal;
	/* The signal sent when the parent dies: */
	int				pdeath_signal;
	/* JOBCTL_*, siglock protected: */
	unsigned long			jobctl;

	/* Used for emulating ABI behavior of previous Linux versions: */
	unsigned int			personality;

	/* Scheduler bits, serialized by scheduler locks: */
	unsigned			sched_reset_on_fork:1;
	unsigned			sched_contributes_to_load:1;
	unsigned			sched_migrated:1;
	unsigned			sched_remote_wakeup:1;
#ifdef CONFIG_PSI
	unsigned			sched_psi_wake_requeue:1;
#endif

	/* Force alignment to the next boundary: */
	unsigned			:0;

	/* Unserialized, strictly 'current' */

	/* Bit to tell LSMs we're in execve(): */
	unsigned			in_execve:1;
	unsigned			in_iowait:1;
#ifndef TIF_RESTORE_SIGMASK
	unsigned			restore_sigmask:1;
#endif
#ifdef CONFIG_MEMCG
	unsigned			in_user_fault:1;
#endif
#ifdef CONFIG_COMPAT_BRK
	unsigned			brk_randomized:1;
#endif
#ifdef CONFIG_CGROUPS
	/* disallow userland-initiated cgroup migration */
	unsigned			no_cgroup_migration:1;
	/* task is frozen/stopped (used by the cgroup freezer) */
	unsigned			frozen:1;
#endif
#ifdef CONFIG_BLK_CGROUP
	unsigned			use_memdelay:1;
#endif
#ifdef CONFIG_PSI
	/* Stalled due to lack of memory */
	unsigned			in_memstall:1;
#endif

	unsigned long			atomic_flags; /* Flags requiring atomic access. */

	struct restart_block		restart_block;

	/* Task ID and thread group ID (pid == tgid for the main thread): */
	pid_t				pid;
	pid_t				tgid;

#ifdef CONFIG_STACKPROTECTOR
	/* Canary value for the -fstack-protector GCC feature: */
	unsigned long			stack_canary;
#endif
	/*
	 * Pointers to the (original) parent process, youngest child, younger sibling,
	 * older sibling, respectively.  (p->father can be replaced with
	 * p->real_parent->pid)
	 */

	/* Real parent process: */
	struct task_struct __rcu	*real_parent;

	/* Recipient of SIGCHLD, wait4() reports: */
	struct task_struct __rcu	*parent;

	/*
	 * Children/sibling form the list of natural children:
	 */
	struct list_head		children;
	struct list_head		sibling;
	/* Main thread of this task's thread group: */
	struct task_struct		*group_leader;

	/*
	 * 'ptraced' is the list of tasks this task is using ptrace() on.
	 *
	 * This includes both natural children and PTRACE_ATTACH targets.
	 * 'ptrace_entry' is this task's link on the p->parent->ptraced list.
	 */
	struct list_head		ptraced;
	struct list_head		ptrace_entry;

	/* PID/PID hash table linkage. */
	struct pid			*thread_pid;
	struct hlist_node		pid_links[PIDTYPE_MAX];
	struct list_head		thread_group;
	struct list_head		thread_node;

	/* Used for vfork() synchronization: */
	struct completion		*vfork_done;

	/* CLONE_CHILD_SETTID: */
	int __user			*set_child_tid;

	/* CLONE_CHILD_CLEARTID: */
	int __user			*clear_child_tid;

	/* Time spent in user mode and in kernel mode: */
	u64				utime;
	u64				stime;
#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
	u64				utimescaled;
	u64				stimescaled;
#endif
	u64				gtime;
	struct prev_cputime		prev_cputime;
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
	struct vtime			vtime;
#endif

#ifdef CONFIG_NO_HZ_FULL
	atomic_t			tick_dep_mask;
#endif
	/* Context switch counts (voluntary and involuntary): */
	unsigned long			nvcsw;
	unsigned long			nivcsw;

	/* Monotonic time in nsecs: */
	u64				start_time;

	/* Boot based time in nsecs: */
	u64				start_boottime;

	/* MM fault and swap info: this can arguably be seen as either mm-specific or thread-specific: */
	unsigned long			min_flt;
	unsigned long			maj_flt;

	/* Empty if CONFIG_POSIX_CPUTIMERS=n */
	struct posix_cputimers		posix_cputimers;

	/* Process credentials: */

	/* Tracer's credentials at attach: */
	const struct cred __rcu		*ptracer_cred;

	/* Objective and real subjective task credentials (COW): */
	const struct cred __rcu		*real_cred;

	/* Effective (overridable) subjective task credentials (COW): */
	const struct cred __rcu		*cred;

#ifdef CONFIG_KEYS
	/* Cached requested key. */
	struct key			*cached_requested_key;
#endif

	/*
	 * executable name, excluding path.
	 *
	 * - normally initialized setup_new_exec()
	 * - access it with [gs]et_task_comm()
	 * - lock it with task_lock()
	 */
	char				comm[TASK_COMM_LEN];

	struct nameidata		*nameidata;

#ifdef CONFIG_SYSVIPC
	struct sysv_sem			sysvsem;
	struct sysv_shm			sysvshm;
#endif
#ifdef CONFIG_DETECT_HUNG_TASK
	unsigned long			last_switch_count;
	unsigned long			last_switch_time;
#endif
	/* Filesystem information: */
	struct fs_struct		*fs;

	/* Open file information: */
	struct files_struct		*files;

#ifdef CONFIG_IO_URING
	struct io_uring_task		*io_uring;
#endif

	/* Namespaces: */
	struct nsproxy			*nsproxy;

	/* Signal handlers: */
	struct signal_struct		*signal;
	struct sighand_struct __rcu		*sighand;
	sigset_t			blocked;
	sigset_t			real_blocked;
	/* Restored if set_restore_sigmask() was used: */
	sigset_t			saved_sigmask;
	struct sigpending		pending;
	unsigned long			sas_ss_sp;
	size_t				sas_ss_size;
	unsigned int			sas_ss_flags;

	struct callback_head		*task_works;

#ifdef CONFIG_AUDIT
#ifdef CONFIG_AUDITSYSCALL
	struct audit_context		*audit_context;
#endif
	kuid_t				loginuid;
	unsigned int			sessionid;
#endif
	struct seccomp			seccomp;

	/* Thread group tracking: */
	u64				parent_exec_id;
	u64				self_exec_id;

	/* Protection against (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed, mempolicy: */
	spinlock_t			alloc_lock;

	/* Protection of the PI data structures: */
	raw_spinlock_t			pi_lock;

	struct wake_q_node		wake_q;

#ifdef CONFIG_RT_MUTEXES
	/* PI waiters blocked on a rt_mutex held by this task: */
	struct rb_root_cached		pi_waiters;
	/* Updated under owner's pi_lock and rq lock */
	struct task_struct		*pi_top_task;
	/* Deadlock detection and priority inheritance handling: */
	struct rt_mutex_waiter		*pi_blocked_on;
#endif

#ifdef CONFIG_DEBUG_MUTEXES
	/* Mutex deadlock detection: */
	struct mutex_waiter		*blocked_on;
#endif

#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
	int				non_block_count;
#endif

#ifdef CONFIG_TRACE_IRQFLAGS
	unsigned int			irq_events;
	unsigned int			hardirq_threaded;
	unsigned long			hardirq_enable_ip;
	unsigned long			hardirq_disable_ip;
	unsigned int			hardirq_enable_event;
	unsigned int			hardirq_disable_event;
	int				hardirqs_enabled;
	int				hardirq_context;
	u64				hardirq_chain_key;
	unsigned long			softirq_disable_ip;
	unsigned long			softirq_enable_ip;
	unsigned int			softirq_disable_event;
	unsigned int			softirq_enable_event;
	int				softirqs_enabled;
	int				softirq_context;
	int				irq_config;
#endif

#ifdef CONFIG_LOCKDEP
# define MAX_LOCK_DEPTH			48UL
	u64				curr_chain_key;
	int				lockdep_depth;
	unsigned int			lockdep_recursion;
	struct held_lock		held_locks[MAX_LOCK_DEPTH];
#endif

#ifdef CONFIG_UBSAN
	unsigned int			in_ubsan;
#endif

	/* Journalling filesystem info: */
	void				*journal_info;

	/* Stacked block device info: */
	struct bio_list			*bio_list;

#ifdef CONFIG_BLOCK
	/* Stack plugging: */
	struct blk_plug			*plug;
#endif

	/* VM state: */
	struct reclaim_state		*reclaim_state;

	struct backing_dev_info		*backing_dev_info;

	struct io_context		*io_context;

#ifdef CONFIG_COMPACTION
	struct capture_control		*capture_control;
#endif
	/* Ptrace state: */
	unsigned long			ptrace_message;
	kernel_siginfo_t		*last_siginfo;

	struct task_io_accounting	ioac;
#ifdef CONFIG_PSI
	/* Pressure stall state */
	unsigned int			psi_flags;
#endif
#ifdef CONFIG_TASK_XACCT
	/* Accumulated RSS usage: */
	u64				acct_rss_mem1;
	/* Accumulated virtual memory usage: */
	u64				acct_vm_mem1;
	/* stime + utime since last update: */
	u64				acct_timexpd;
#endif
#ifdef CONFIG_CPUSETS
	/* Protected by ->alloc_lock: */
	nodemask_t			mems_allowed;
	/* Sequence number to catch updates: */
	seqcount_t			mems_allowed_seq;
	int				cpuset_mem_spread_rotor;
	int				cpuset_slab_spread_rotor;
#endif
#ifdef CONFIG_CGROUPS
	/* Control Group info protected by css_set_lock: */
	struct css_set __rcu		*cgroups;
	/* cg_list protected by css_set_lock and tsk->alloc_lock: */
	struct list_head		cg_list;
#endif
#ifdef CONFIG_X86_CPU_RESCTRL
	u32				closid;
	u32				rmid;
#endif
#ifdef CONFIG_FUTEX
	struct robust_list_head __user	*robust_list;
#ifdef CONFIG_COMPAT
	struct compat_robust_list_head __user *compat_robust_list;
#endif
	struct list_head		pi_state_list;
	struct futex_pi_state		*pi_state_cache;
	struct mutex			futex_exit_mutex;
	unsigned int			futex_state;
#endif
#ifdef CONFIG_PERF_EVENTS
	struct perf_event_context	*perf_event_ctxp[perf_nr_task_contexts];
	struct mutex			perf_event_mutex;
	struct list_head		perf_event_list;
#endif
#ifdef CONFIG_DEBUG_PREEMPT
	unsigned long			preempt_disable_ip;
#endif
#ifdef CONFIG_NUMA
	/* Protected by alloc_lock: */
	struct mempolicy		*mempolicy;
	short				il_prev;
	short				pref_node_fork;
#endif
#ifdef CONFIG_NUMA_BALANCING
	int				numa_scan_seq;
	unsigned int			numa_scan_period;
	unsigned int			numa_scan_period_max;
	int				numa_preferred_nid;
	unsigned long			numa_migrate_retry;
	/* Migration stamp: */
	u64				node_stamp;
	u64				last_task_numa_placement;
	u64				last_sum_exec_runtime;
	struct callback_head		numa_work;

	/*
	 * This pointer is only modified for current in syscall and
	 * pagefault context (and for tasks being destroyed), so it can be read
	 * from any of the following contexts:
	 *  - RCU read-side critical section
	 *  - current->numa_group from everywhere
	 *  - task's runqueue locked, task not running
	 */
	struct numa_group __rcu		*numa_group;

	/*
	 * numa_faults is an array split into four regions:
	 * faults_memory, faults_cpu, faults_memory_buffer, faults_cpu_buffer
	 * in this precise order.
	 *
	 * faults_memory: Exponential decaying average of faults on a per-node
	 * basis. Scheduling placement decisions are made based on these
	 * counts. The values remain static for the duration of a PTE scan.
	 * faults_cpu: Track the nodes the process was running on when a NUMA
	 * hinting fault was incurred.
	 * faults_memory_buffer and faults_cpu_buffer: Record faults per node
	 * during the current scan window. When the scan completes, the counts
	 * in faults_memory and faults_cpu decay and these values are copied.
	 */
	unsigned long			*numa_faults;
	unsigned long			total_numa_faults;

	/*
	 * numa_faults_locality tracks if faults recorded during the last
	 * scan window were remote/local or failed to migrate. The task scan
	 * period is adapted based on the locality of the faults with different
	 * weights depending on whether they were shared or private faults
	 */
	unsigned long			numa_faults_locality[3];

	unsigned long			numa_pages_migrated;
#endif /* CONFIG_NUMA_BALANCING */

#ifdef CONFIG_RSEQ
	struct rseq __user *rseq;
	u32 rseq_sig;
	/*
	 * RmW on rseq_event_mask must be performed atomically
	 * with respect to preemption.
	 */
	unsigned long rseq_event_mask;
#endif

	struct tlbflush_unmap_batch	tlb_ubc;

	union {
		refcount_t		rcu_users;
		struct rcu_head		rcu;
	};

	/* Cache last used pipe for splice(): */
	struct pipe_inode_info		*splice_pipe;

	struct page_frag		task_frag;

#ifdef CONFIG_TASK_DELAY_ACCT
	struct task_delay_info		*delays;
#endif

#ifdef CONFIG_FAULT_INJECTION
	int				make_it_fail;
	unsigned int			fail_nth;
#endif
	/*
	 * When (nr_dirtied >= nr_dirtied_pause), it's time to call
	 * balance_dirty_pages() for a dirty throttling pause:
	 */
	int				nr_dirtied;
	int				nr_dirtied_pause;
	/* Start of a write-and-pause period: */
	unsigned long			dirty_paused_when;

#ifdef CONFIG_LATENCYTOP
	int				latency_record_count;
	struct latency_record		latency_record[LT_SAVECOUNT];
#endif
	/*
	 * Time slack values; these are used to round up poll() and
	 * select() etc timeout values. These are in nanoseconds.
	 */
	u64				timer_slack_ns;
	u64				default_timer_slack_ns;

#ifdef CONFIG_KASAN
	unsigned int			kasan_depth;
#endif
#ifdef CONFIG_KCSAN
	struct kcsan_ctx		kcsan_ctx;
#endif

#ifdef CONFIG_FUNCTION_GRAPH_TRACER
	/* Index of current stored address in ret_stack: */
	int				curr_ret_stack;
	int				curr_ret_depth;

	/* Stack of return addresses for return function tracing: */
	struct ftrace_ret_stack		*ret_stack;

	/* Timestamp for last schedule: */
	unsigned long long		ftrace_timestamp;

	/*
	 * Number of functions that haven't been traced
	 * because of depth overrun:
	 */
	atomic_t			trace_overrun;

	/* Pause tracing: */
	atomic_t			tracing_graph_pause;
#endif

#ifdef CONFIG_TRACING
	/* State flags for use by tracers: */
	unsigned long			trace;

	/* Bitmask and counter of trace recursion: */
	unsigned long			trace_recursion;
#endif /* CONFIG_TRACING */

#ifdef CONFIG_KCOV
	/* See kernel/kcov.c for more details. */

	/* Coverage collection mode enabled for this task (0 if disabled): */
	unsigned int			kcov_mode;

	/* Size of the kcov_area: */
	unsigned int			kcov_size;

	/* Buffer for coverage collection: */
	void				*kcov_area;

	/* KCOV descriptor wired with this task or NULL: */
	struct kcov			*kcov;

	/* KCOV common handle for remote coverage collection: */
	u64				kcov_handle;

	/* KCOV sequence number: */
	int				kcov_sequence;

	/* Collect coverage from softirq context: */
	unsigned int			kcov_softirq;
#endif

#ifdef CONFIG_MEMCG
	struct mem_cgroup		*memcg_in_oom;
	gfp_t				memcg_oom_gfp_mask;
	int				memcg_oom_order;

	/* Number of pages to reclaim on returning to userland: */
	unsigned int			memcg_nr_pages_over_high;

	/* Used by memcontrol for targeted memcg charge: */
	struct mem_cgroup		*active_memcg;
#endif

#ifdef CONFIG_BLK_CGROUP
	struct request_queue		*throttle_queue;
#endif

#ifdef CONFIG_UPROBES
	struct uprobe_task		*utask;
#endif
#if defined(CONFIG_BCACHE) || defined(CONFIG_BCACHE_MODULE)
	unsigned int			sequential_io;
	unsigned int			sequential_io_avg;
#endif
#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
	unsigned long			task_state_change;
#endif
	int				pagefault_disabled;
#ifdef CONFIG_MMU
	struct task_struct		*oom_reaper_list;
#endif
#ifdef CONFIG_VMAP_STACK
	struct vm_struct		*stack_vm_area;
#endif
#ifdef CONFIG_THREAD_INFO_IN_TASK
	/* A live task holds one reference: */
	refcount_t			stack_refcount;
#endif
#ifdef CONFIG_LIVEPATCH
	int patch_state;
#endif
#ifdef CONFIG_SECURITY
	/* Used by LSM modules for access restriction: */
	void				*security;
#endif

#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
	unsigned long			lowest_stack;
	unsigned long			prev_lowest_stack;
#endif

#ifdef CONFIG_X86_MCE
	u64				mce_addr;
	__u64				mce_ripv : 1,
					mce_whole_page : 1,
					__mce_reserved : 62;
	struct callback_head		mce_kill_me;
#endif

	/*
	 * New fields for task_struct should be added above here, so that
	 * they are included in the randomized portion of task_struct.
	 */
	randomized_struct_fields_end

	/* CPU-specific state of this task: */
	struct thread_struct		thread;

	/*
	 * WARNING: on x86, 'thread_struct' contains a variable-sized
	 * structure.  It *MUST* be at the end of 'task_struct'.
	 *
	 * Do not put anything below here!
	 */
};

🧩 一、任务基础信息区

字段	含义
`volatile long state`	当前任务的状态：`TASK_RUNNING`、`TASK_INTERRUPTIBLE`、`TASK_UNINTERRUPTIBLE`、`TASK_STOPPED` 等；`EXIT_ZOMBIE` 等退出状态则保存在 `exit_state` 字段中。
`void *stack`	指向该任务内核栈所在内存区域的起始（最低）地址。每个任务在内核态都有独立的内核栈，大小为 `THREAD_SIZE`（x86-32 上为 8KB，x86-64 上为 16KB）。
`refcount_t usage`	引用计数，用于控制该 task_struct 的生命周期。
`unsigned int flags`	标志位（`PF_*` 系列），标记任务特性，例如 `PF_KTHREAD`（内核线程）、`PF_NOFREEZE` 等。
`unsigned int ptrace`	用于 `ptrace` 调试机制，标识被调试状态。
`pid_t pid`	进程号（唯一标识）。
`pid_t tgid`	线程组 ID，组内线程共享资源（如地址空间）。主线程的 `pid == tgid`。
`char comm[TASK_COMM_LEN]`	当前任务的可读名称（通常是程序名，不含路径；`TASK_COMM_LEN` 为 16 字节，含结尾的 `\0`，即最多 15 个字符）。

⚙️ 二、调度器相关字段

这些字段由内核调度器（kernel/sched/）使用，决定任务何时、在哪个 CPU 上执行。调度器通常通过优先级指定任务的调度顺序，普通任务通过 CFS（完全公平调度器）调度，进程的nice值确定了静态优先级，nice值范围是[-20, 19]，数字越低优先级越高，通过top命令的NI列可以查看；实时任务的优先级范围是[1, 99]，数字越大优先级越高；Deadline类型的进程优先级为-1。

字段	含义
`int prio`	动态优先级，调度时实际使用的优先级。
`int static_prio`	静态优先级（通常用户设置的 `nice` 值会影响它）。nice 值的范围是[-20, 19], 而实时优先级的有效范围是[1, 99], 二者的重叠部分如何处理呢？内核在处理 nice 时会加上120, 完成 nice 值与静态优先级之间的转换。
`int normal_prio`	归一化优先级（综合 static_prio + 调度策略）。`rt_priority` 数字越大，优先级越高；nice 值则相反；而 deadline 进程始终要维持最高优先级。为了便于管理，内核设计了一种归一化算法，将所有的优先级统一到 [-1, 139] 这个区间上，并且数字越小优先级越高，该优先级就叫做归一化优先级。
`unsigned int rt_priority`	实时任务优先级（0~99），数字越大优先级越高，当数值是0时表示该进程是普通进程。
`unsigned int policy`	调度策略：`SCHED_NORMAL`、`SCHED_FIFO`、`SCHED_RR`、`SCHED_DEADLINE` 等。
`const struct sched_class *sched_class`	指向调度类对象（如 `fair_sched_class`, `rt_sched_class`），定义任务的调度行为。
`struct sched_entity se`	普通任务的调度实体（CFS调度器使用）。
`struct sched_rt_entity rt`	实时任务的调度实体。
`struct sched_dl_entity dl`	deadline 调度任务的调度实体。
`int on_cpu`	当前任务是否正在某个 CPU 上运行。
`int on_rq`	是否在运行队列中。
`unsigned int cpu`	当前任务所在的 CPU。
`int nr_cpus_allowed` / `cpumask_t cpus_mask`	该任务允许运行的 CPU 集合（CPU亲和性）。

优先级归一化的实现：

/*
 * __normal_prio - base priority of a task, ignoring its scheduling policy.
 * @p: task to query.
 *
 * Returns p->static_prio unchanged.
 */
static inline int __normal_prio(struct task_struct *p)
{
	return p->static_prio;
}

/*
 * normal_prio - compute the normalized priority of a task.
 * @p: task to query.
 *
 * Maps the deadline, real-time and normal policies onto one scale on
 * which a smaller number means a higher priority.
 */
static inline int normal_prio(struct task_struct *p)
{
	/*
	 * Deadline tasks always get MAX_DL_PRIO - 1 (per the original note,
	 * MAX_DL_PRIO is 0, so this is -1).
	 */
	if (task_has_dl_policy(p))
		return MAX_DL_PRIO - 1;

	/*
	 * Real-time tasks: a larger rt_priority means a higher priority, so
	 * the value is flipped to make higher priorities map to smaller
	 * numbers (per the original note, MAX_RT_PRIO is 100 and rt_priority
	 * ranges over [1, 99]).
	 */
	if (task_has_rt_policy(p))
		return MAX_RT_PRIO - 1 - p->rt_priority;

	/* Normal tasks simply use their static priority. */
	return __normal_prio(p);
}

🧠 三、内存管理相关

字段	含义
`struct mm_struct *mm`	用户空间内存描述符，包含页表、VMA等。内核线程中此项为 `NULL`。
`struct mm_struct *active_mm`	当前活跃的 `mm`。即使是内核线程，在切换时也会借用上一个用户进程的 `mm`。
`struct vmacache vmacache`	最近访问的虚拟内存区域缓存，加速 `find_vma()` 查找。
`struct task_rss_stat rss_stat`	内存页统计信息。

🧾 四、父子关系与进程树

字段	含义
`struct task_struct __rcu *real_parent`	实际的父进程。
`struct task_struct __rcu *parent`	接收 SIGCHLD 的进程。
`struct list_head children`	子进程链表头。
`struct list_head sibling`	兄弟进程链表节点。
`struct task_struct *group_leader`	线程组的主线程。

🧩 五、文件系统与文件描述符表

字段	含义
`struct fs_struct *fs`	文件系统上下文（当前工作目录、根目录等）。
`struct files_struct *files`	打开文件表（文件描述符表）。
`struct nsproxy *nsproxy`	命名空间（UTS、IPC、PID、Mount、NET 等）信息。

🔒 六、安全与权限

字段	含义
`const struct cred __rcu *cred`	有效凭证（权限、UID、GID、能力等）。
`const struct cred __rcu *real_cred`	实际凭证。
`struct seccomp seccomp`	seccomp 安全策略（系统调用过滤）。
`void *security`	LSM 安全模块（SELinux/AppArmor）使用。

🔄 七、信号与异常处理

字段	含义
`struct signal_struct *signal`	信号共享信息（线程组共享）。
`struct sighand_struct *sighand`	信号处理函数表。
`sigset_t blocked`	屏蔽的信号集合。
`struct sigpending pending`	待处理的信号队列。
`int exit_state, exit_code, exit_signal`	退出状态和信号。

📊 八、时间与统计

字段	含义
`u64 utime, stime`	用户态时间与内核态时间。
`u64 start_time, start_boottime`	任务启动时间。
`unsigned long nvcsw, nivcsw`	上下文切换次数（自愿与非自愿）。
`unsigned long min_flt, maj_flt`	次缺页和主缺页次数。

💡 九、同步与锁机制

字段	含义
`spinlock_t alloc_lock`	分配保护锁。
`raw_spinlock_t pi_lock`	优先级继承（PI）锁。
`struct wake_q_node wake_q`	唤醒队列节点。
`struct mutex_waiter *blocked_on`	当前阻塞的互斥量。

🧰 十、性能、追踪、调试

字段	含义
`struct perf_event_context *perf_event_ctxp[]`	perf 性能计数器上下文。
`struct io_context *io_context`	I/O 调度信息。
`struct audit_context *audit_context`	审计系统信息。
`struct task_delay_info *delays`	延迟统计信息。
`struct kcov *kcov`	内核代码覆盖率工具使用。

⚡ 十一、NUMA（非一致内存访问）与调度平衡

字段	含义
`struct mempolicy *mempolicy`	NUMA 内存策略。
`int numa_preferred_nid`	首选 NUMA 节点。
`struct numa_group *numa_group`	关联的 NUMA 组。

🧱 十二、内核线程与特殊任务支持

字段	含义
`struct thread_struct thread`	CPU 特定的寄存器上下文（上下文切换时用于保存和恢复该任务的 CPU 状态）。
`struct thread_info thread_info`	每线程的低层信息（栈、标志等）。
`struct reclaim_state *reclaim_state`	内存回收状态。
`struct page_frag task_frag`	页面片段缓存。
`struct bio_list *bio_list`	I/O 操作链表。

🚀 十三、cgroup（控制组）

字段	含义
`struct css_set *cgroups`	当前所属的控制组集合。
`struct list_head cg_list`	所在控制组链表节点。
`struct task_group *sched_task_group`	调度器的 cgroup 控制信息。

🧮 十四、其它辅助信息

字段	含义
`struct completion *vfork_done`	用于 `vfork()` 同步。
`int pdeath_signal`	父进程退出时发送给子进程的信号。
`struct io_uring_task *io_uring`	io_uring 上下文。
`struct bio_list *bio_list`	块 I/O 任务队列。
`void *journal_info`	文件系统日志。

🧩 十五、最后的 `thread_struct`

这一部分保存了CPU特定上下文信息，如：

内核栈指针等上下文切换所需的寄存器（如 x86 上的 `sp`；其余通用寄存器保存在内核栈上）
FPU寄存器
调试寄存器
段寄存器等

在进程上下文切换时（context_switch()），内核会把当前任务的寄存器状态保存到它的 thread_struct 中，再从下一个任务的 thread_struct 中载入寄存器值，实现任务切换。

🧭 总结

模块	功能
调度器字段	决定任务运行位置与时间
内存管理字段	指向地址空间、页表等
信号处理字段	响应用户信号与中断
文件系统字段	打开文件、目录、命名空间
父子关系字段	构建进程树
安全字段	权限控制、LSM 支持
NUMA/性能字段	多核调度与性能计数
thread_struct	CPU 寄存器上下文

参考文献

Linux核心概念详解：s3.shizhz.me/linux-sched/concepts/priority