Next, let's go through the following functions:
jump_label_init()
parse_early_param()
random_init_early(command_line)
setup_log_buf(0)
vfs_caches_init_early()
sort_main_extable()
trap_init() // important!
mm_core_init()
poking_init()
ftrace_init()
early_trace_init()
sched_init() // important!
jump_label_init()
impl in /kernel/jump_label.c
void __init jump_label_init(void)
{
	struct jump_entry *iter_start = __start___jump_table;
	struct jump_entry *iter_stop = __stop___jump_table;
	struct static_key *key = NULL;
	struct jump_entry *iter;

	/*
	 * Since we are initializing the static_key.enabled field with
	 * with the 'raw' int values (to avoid pulling in atomic.h) in
	 * jump_label.h, let's make sure that is safe. There are only two
	 * cases to check since we initialize to 0 or 1.
	 */
	BUILD_BUG_ON((int)ATOMIC_INIT(0) != 0);
	BUILD_BUG_ON((int)ATOMIC_INIT(1) != 1);

	if (static_key_initialized)
		return;

	cpus_read_lock();
	jump_label_lock();
	jump_label_sort_entries(iter_start, iter_stop);

	for (iter = iter_start; iter < iter_stop; iter++) {
		struct static_key *iterk;
		bool in_init;

		/* rewrite NOPs */
		if (jump_label_type(iter) == JUMP_LABEL_NOP)
			arch_jump_label_transform_static(iter, JUMP_LABEL_NOP);

		in_init = init_section_contains((void *)jump_entry_code(iter), 1);
		jump_entry_set_init(iter, in_init);

		iterk = jump_entry_key(iter);
		if (iterk == key)
			continue;

		key = iterk;
		static_key_set_entries(key, iter);
	}
	static_key_initialized = true;
	jump_label_unlock();
	cpus_read_unlock();
}
This initializes the kernel's jump labels, the mechanism underlying static keys. A jump label is a branch site in the code that can be rewritten at run time: while its static key is disabled the site is a plain NOP, and enabling the key patches the NOP into a jump. This makes rarely-toggled conditional features (tracepoints, debugging switches, and the like) essentially free on the hot path, because the decision is baked into the instruction stream instead of being tested on every execution.
The function above walks the __jump_table section collected by the linker: it sorts the entries, rewrites the initial NOPs, and associates each run of entries with its static_key. A sketch of how the API is used follows.
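A minimal sketch of the static-key API that jump labels implement (my_feature_key, hot_path, and enable_feature are hypothetical names for illustration, not kernel symbols):

#include <linux/jump_label.h>
#include <linux/printk.h>

/* Hypothetical feature flag, disabled by default. */
static DEFINE_STATIC_KEY_FALSE(my_feature_key);

void hot_path(void)
{
	/* Compiles to a NOP while the key is off; static_branch_enable()
	 * patches the NOP into a jump to this slow path at run time. */
	if (static_branch_unlikely(&my_feature_key))
		pr_info("rarely-enabled feature active\n");

	/* ...fast path continues with no conditional test... */
}

/* Elsewhere, e.g. when a sysctl flips the feature on: */
void enable_feature(void)
{
	static_branch_enable(&my_feature_key);
}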
parse_early_param()
Parses the kernel's early boot parameters. The kernel has not finished initializing at this point, so the full command line cannot be processed yet; only options whose handlers were registered with the early_param() macro are handled here, and the rest are parsed later in start_kernel().
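As an illustration (the "myflag" option and its handler are hypothetical), this is the registration interface whose handlers parse_early_param() invokes:

#include <linux/init.h>

static bool myflag_enabled __initdata;

/* Called by parse_early_param() if "myflag" appears on the command line. */
static int __init myflag_setup(char *str)
{
	myflag_enabled = true;
	return 0;
}
early_param("myflag", myflag_setup);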
random_init_early(command_line)
Initializes the random-number generator before the memory allocators come up; the command line is passed in so it can be mixed into the early entropy pool.
setup_log_buf(0)
Initializes the kernel log buffer, the ring buffer behind printk() and dmesg, so that messages produced during early initialization are recorded. Its size can be enlarged with the log_buf_len= boot parameter, which is one reason parse_early_param() must run first.
vfs_caches_init_early()
Initializes the early hash tables of the virtual file system (VFS), namely the dentry and inode hash tables.
- About VFS:
VFS is an important abstraction layer in the Linux kernel that gives user-space applications a single, uniform interface for file and file-system operations.
This design lets very different file-system types be used uniformly on Linux, whether they are on-disk file systems (such as EXT4, XFS, NTFS), network file systems (such as NFS), or special pseudo file systems (such as /proc or /sys; "everything is a file" on Linux); see the sketch after this note.
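A minimal sketch of the interface a driver or file system plugs into (the mydev_* names are hypothetical): by filling in a file_operations table, any object becomes reachable through the same open()/read()/write() system calls.

#include <linux/fs.h>
#include <linux/module.h>

/* Always reports EOF; a real driver would copy data to 'buf'. */
static ssize_t mydev_read(struct file *file, char __user *buf,
			  size_t count, loff_t *ppos)
{
	return 0;
}

static const struct file_operations mydev_fops = {
	.owner = THIS_MODULE,
	.read  = mydev_read,
};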
sort_main_extable()
Sorts the kernel's built-in exception table (__ex_table). Each entry pairs the address of an instruction that is allowed to fault, such as a user-space access inside copy_from_user(), with the location of its fixup code; sorting the table once here lets the fault handler binary-search it later. A sketch of that lookup follows.
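A hedged sketch of how the sorted table pays off at fault time, loosely modeled on kernel/extable.c and the x86 fault path (simplified; ex_fixup_addr() and die() are x86 helpers, and the details vary by architecture):

/* On a kernel-mode fault, binary-search the sorted exception table
 * for the faulting instruction. */
const struct exception_table_entry *e;

e = search_exception_tables(regs->ip);
if (e)
	regs->ip = ex_fixup_addr(e);	/* resume at the fixup stub */
else
	die("kernel oops", regs, error_code);	/* genuine kernel bug */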
trap_init()
impl in /arch/<arch>/kernel/traps.c, /init/main.c
The version defined in /init/main.c carries the __weak attribute, marking it as a weak symbol; the real implementations live in each architecture's traps.c, under names that differ slightly between architectures, so consult the source for your platform.
- About weak symbols:
During linking, if a strong symbol (one not marked weak) and a weak symbol share the same name, the strong symbol overrides the weak one. If only a weak symbol exists, the linker uses it. If a weak symbol is referenced but never defined, the linker resolves it to zero instead of raising an error, which lets the program link and run even though the symbol has no definition.
The x86 implementation:
void __init trap_init(void)
{
	/* Init cpu_entry_area before IST entries are set up */
	setup_cpu_entry_areas();

	/* Init GHCB memory pages when running as an SEV-ES guest */
	sev_es_init_vc_handling();

	/* Initialize TSS before setting up traps so ISTs work */
	cpu_init_exception_handling();
	/* Setup traps as cpu_init() might #GP */
	idt_setup_traps();

	cpu_init();
}
This initializes the machinery for handling interrupts and exceptions: it sets up the IDT, installs the exception handlers, and performs part of the per-CPU initialization.
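For a sense of what idt_setup_traps() installs, here is an abridged excerpt of the shape of the def_idts table in arch/x86/kernel/idt.c (exact entries and handler names vary between kernel versions):

/* Abridged: idt_setup_traps() loads this table into the IDT. */
static const __initconst struct idt_data def_idts[] = {
	INTG(X86_TRAP_DE,	asm_exc_divide_error),
	INTG(X86_TRAP_BR,	asm_exc_bounds),
	/* ...one entry per architectural exception... */
};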
mm_core_init()
Initializes the core of memory management: it brings up the zone lists, the page allocator, the slab allocator, vmalloc, and the rest of the central mm data structures.
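One practical consequence: after mm_core_init() the slab allocator works, which is why sched_init() further down can already call kzalloc(). Allocations this early must use GFP_NOWAIT, since the scheduler is not running yet and nothing may sleep; a minimal illustration (the function name is hypothetical):

#include <linux/slab.h>

static void __init early_alloc_example(size_t size)
{
	/* GFP_NOWAIT: no sleeping before the scheduler is up (compare the
	 * kzalloc() call inside sched_init() below). */
	void *buf = kzalloc(size, GFP_NOWAIT);

	if (!buf)
		panic("early allocation failed");
}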
poking_init()
Initializes the infrastructure for "text poking", that is, patching live kernel code: it sets up a dedicated temporary mm that text_poke() uses to write to otherwise read-only code pages safely. Run-time code patching by the jump labels above and by ftrace below relies on this facility.
ftrace_init()
Initializes the kernel's function-tracing (ftrace) machinery, which lets the kernel trace function calls and returns and record those events.
early_trace_init()
Initializes tracing at an early stage, complementing ftrace_init(): it brings up the tracing ring buffer and any tracers that must start during boot. The comment at this point in start_kernel() notes that trace_printk() can be enabled from here on.
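A small illustration of trace_printk() (my_fn is a hypothetical function): unlike printk(), it writes into the ftrace ring buffer, read back via /sys/kernel/tracing/trace, which makes it cheap enough for hot paths while debugging.

#include <linux/kernel.h>

static int my_fn(int x)
{
	trace_printk("my_fn called with x=%d\n", x);
	return x * 2;
}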
sched_init()
impl in /kernel/sched/core.c
This file is roughly 12,016 lines long and extremely important; anyone who wants a deep understanding of process scheduling in the kernel should read it carefully.
void __init sched_init(void)
{
	unsigned long ptr = 0;
	int i;

	/* Make sure the linker didn't screw up */
	BUG_ON(&idle_sched_class != &fair_sched_class + 1 ||
	       &fair_sched_class != &rt_sched_class + 1 ||
	       &rt_sched_class != &dl_sched_class + 1);
#ifdef CONFIG_SMP
	BUG_ON(&dl_sched_class != &stop_sched_class + 1);
#endif

	wait_bit_init();

#ifdef CONFIG_FAIR_GROUP_SCHED
	ptr += 2 * nr_cpu_ids * sizeof(void **);
#endif
#ifdef CONFIG_RT_GROUP_SCHED
	ptr += 2 * nr_cpu_ids * sizeof(void **);
#endif
	if (ptr) {
		ptr = (unsigned long)kzalloc(ptr, GFP_NOWAIT);

#ifdef CONFIG_FAIR_GROUP_SCHED
		root_task_group.se = (struct sched_entity **)ptr;
		ptr += nr_cpu_ids * sizeof(void **);

		root_task_group.cfs_rq = (struct cfs_rq **)ptr;
		ptr += nr_cpu_ids * sizeof(void **);

		root_task_group.shares = ROOT_TASK_GROUP_LOAD;
		init_cfs_bandwidth(&root_task_group.cfs_bandwidth, NULL);
#endif /* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_RT_GROUP_SCHED
		root_task_group.rt_se = (struct sched_rt_entity **)ptr;
		ptr += nr_cpu_ids * sizeof(void **);

		root_task_group.rt_rq = (struct rt_rq **)ptr;
		ptr += nr_cpu_ids * sizeof(void **);
#endif /* CONFIG_RT_GROUP_SCHED */
	}

	init_rt_bandwidth(&def_rt_bandwidth, global_rt_period(), global_rt_runtime());

#ifdef CONFIG_SMP
	init_defrootdomain();
#endif

#ifdef CONFIG_RT_GROUP_SCHED
	init_rt_bandwidth(&root_task_group.rt_bandwidth,
			global_rt_period(), global_rt_runtime());
#endif /* CONFIG_RT_GROUP_SCHED */

#ifdef CONFIG_CGROUP_SCHED
	task_group_cache = KMEM_CACHE(task_group, 0);

	list_add(&root_task_group.list, &task_groups);
	INIT_LIST_HEAD(&root_task_group.children);
	INIT_LIST_HEAD(&root_task_group.siblings);
	autogroup_init(&init_task);
#endif /* CONFIG_CGROUP_SCHED */

	for_each_possible_cpu(i) {
		struct rq *rq;

		rq = cpu_rq(i);
		raw_spin_lock_init(&rq->__lock);
		rq->nr_running = 0;
		rq->calc_load_active = 0;
		rq->calc_load_update = jiffies + LOAD_FREQ;
		init_cfs_rq(&rq->cfs);
		init_rt_rq(&rq->rt);
		init_dl_rq(&rq->dl);
#ifdef CONFIG_FAIR_GROUP_SCHED
		INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
		rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
		/*
		 * How much CPU bandwidth does root_task_group get?
		 *
		 * In case of task-groups formed thr' the cgroup filesystem, it
		 * gets 100% of the CPU resources in the system. This overall
		 * system CPU resource is divided among the tasks of
		 * root_task_group and its child task-groups in a fair manner,
		 * based on each entity's (task or task-group's) weight
		 * (se->load.weight).
		 *
		 * In other words, if root_task_group has 10 tasks of weight
		 * 1024) and two child groups A0 and A1 (of weight 1024 each),
		 * then A0's share of the CPU resource is:
		 *
		 *	A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
		 *
		 * We achieve this by letting root_task_group's tasks sit
		 * directly in rq->cfs (i.e root_task_group->se[] = NULL).
		 */
		init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
#endif /* CONFIG_FAIR_GROUP_SCHED */

		rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
#ifdef CONFIG_RT_GROUP_SCHED
		init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
#endif
#ifdef CONFIG_SMP
		rq->sd = NULL;
		rq->rd = NULL;
		rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE;
		rq->balance_callback = &balance_push_callback;
		rq->active_balance = 0;
		rq->next_balance = jiffies;
		rq->push_cpu = 0;
		rq->cpu = i;
		rq->online = 0;
		rq->idle_stamp = 0;
		rq->avg_idle = 2*sysctl_sched_migration_cost;
		rq->wake_stamp = jiffies;
		rq->wake_avg_idle = rq->avg_idle;
		rq->max_idle_balance_cost = sysctl_sched_migration_cost;

		INIT_LIST_HEAD(&rq->cfs_tasks);

		rq_attach_root(rq, &def_root_domain);
#ifdef CONFIG_NO_HZ_COMMON
		rq->last_blocked_load_update_tick = jiffies;
		atomic_set(&rq->nohz_flags, 0);

		INIT_CSD(&rq->nohz_csd, nohz_csd_func, rq);
#endif
#ifdef CONFIG_HOTPLUG_CPU
		rcuwait_init(&rq->hotplug_wait);
#endif
#endif /* CONFIG_SMP */
		hrtick_rq_init(rq);
		atomic_set(&rq->nr_iowait, 0);

#ifdef CONFIG_SCHED_CORE
		rq->core = rq;
		rq->core_pick = NULL;
		rq->core_enabled = 0;
		rq->core_tree = RB_ROOT;
		rq->core_forceidle_count = 0;
		rq->core_forceidle_occupation = 0;
		rq->core_forceidle_start = 0;

		rq->core_cookie = 0UL;
#endif
		zalloc_cpumask_var_node(&rq->scratch_mask, GFP_KERNEL, cpu_to_node(i));
	}

	set_load_weight(&init_task, false);

	/*
	 * The boot idle thread does lazy MMU switching as well:
	 */
	mmgrab_lazy_tlb(&init_mm);
	enter_lazy_tlb(&init_mm, current);

	/*
	 * The idle task doesn't need the kthread struct to function, but it
	 * is dressed up as a per-CPU kthread and thus needs to play the part
	 * if we want to avoid special-casing it in code that deals with per-CPU
	 * kthreads.
	 */
	WARN_ON(!set_kthread_struct(current));

	/*
	 * Make us the idle thread. Technically, schedule() should not be
	 * called from this thread, however somewhere below it might be,
	 * but because we are the idle thread, we just pick up running again
	 * when this runqueue becomes "idle".
	 */
	init_idle(current, smp_processor_id());

	calc_load_update = jiffies + LOAD_FREQ;

#ifdef CONFIG_SMP
	idle_thread_set_boot_cpu();
	balance_push_set(smp_processor_id(), false);
#endif
	init_sched_fair_class();

	psi_init();

	init_uclamp();

	preempt_dynamic_init();

	scheduler_running = 1;
}
Initializes the scheduler, the key kernel component that decides which process runs on the CPU. This step must complete before any interrupts (such as the timer interrupt) are enabled, because an interrupt could preempt the running process.
The scheduler is still being refined even in today's kernels (v6.8.9); its machinery is genuinely complex and the author has not untangled all of it either, so the code is simply listed here. With the comments, though, the initialization path is still quite readable; one detail is decoded below.
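That detail is the BUG_ON() block at the top of sched_init(): it verifies that the linker laid out the sched_class descriptors contiguously and in priority order, because the core scheduler walks them by plain pointer arithmetic when picking the next task. A simplified sketch of that walk (modeled on the slow path of pick_next_task() in kernel/sched/core.c; not the literal code):

/* Iterate the scheduling classes from highest to lowest priority.
 * This pointer walk only works because the descriptors are adjacent
 * and ordered in memory, exactly what the BUG_ON() asserts. */
const struct sched_class *class;
struct task_struct *p;

for_each_class(class) {
	p = class->pick_next_task(rq);
	if (p)
		return p;
}
BUG();	/* the idle class should always provide a runnable task */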