Linux Kernel Boot Flow Analysis

While learning Linux drivers I always felt I was peering at the kernel through a fog, unable to see its core. The Linux kernel has many subsystems, so I think it is worth starting from the kernel's boot process to get a macro view of the moment each subsystem is brought up. I will use ARM64 as the example for analyzing the Linux kernel boot flow. Before the Linux kernel runs there is U-Boot, which performs some early initialization such as setting up the DDR. The kernel source version I use is 4.19.232.

1. The Kernel Linker Script

The object file produced by compiling the kernel is vmlinux, an ELF file obtained by linking the compiled objects according to the rules laid down in vmlinux.lds. It is not a bootable executable and cannot run on an ARM platform as-is. It is usually converted and compressed: on 32-bit ARM and x86 this yields zImage or bzImage, whose header embeds a decompression stub so the image can self-decompress at boot; on ARM64 the kernel is instead objcopy'd into a raw Image (optionally compressed to Image.gz, which the bootloader decompresses).

The kernel's linker script source is arch/arm64/kernel/vmlinux.lds.S. At build time the kernel generates vmlinux.lds from vmlinux.lds.S; vmlinux.lds is the final link script, used to link the kernel image vmlinux.

//arch/arm64/kernel/vmlinux.lds.S
/* SPDX-License-Identifier: GPL-2.0 */
/*
* ld script to make ARM Linux kernel
* taken from the i386 version by Russell King
* Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>
*/

#include <asm-generic/vmlinux.lds.h>
#include <asm/cache.h>
#include <asm/kernel-pgtable.h>
#include <asm/thread_info.h>
#include <asm/memory.h>
#include <asm/page.h>
#include <asm/pgtable.h>

#include "image.h"

/* .exit.text needed in case of alternative patching */
#define ARM_EXIT_KEEP(x) x
#define ARM_EXIT_DISCARD(x)

OUTPUT_ARCH(aarch64)
ENTRY(_text)

jiffies = jiffies_64;


#define HYPERVISOR_EXTABLE \
. = ALIGN(SZ_8); \
__start___kvm_ex_table = .; \
*(__kvm_ex_table) \
__stop___kvm_ex_table = .;

#define HYPERVISOR_TEXT \
/* \
* Align to 4 KB so that \
* a) the HYP vector table is at its minimum \
* alignment of 2048 bytes \
* b) the HYP init code will not cross a page \
* boundary if its size does not exceed \
* 4 KB (see related ASSERT() below) \
*/ \
. = ALIGN(SZ_4K); \
__hyp_idmap_text_start = .; \
*(.hyp.idmap.text) \
__hyp_idmap_text_end = .; \
__hyp_text_start = .; \
*(.hyp.text) \
HYPERVISOR_EXTABLE \
__hyp_text_end = .;

#define IDMAP_TEXT \
. = ALIGN(SZ_4K); \
__idmap_text_start = .; \
*(.idmap.text) \
__idmap_text_end = .;

#ifdef CONFIG_HIBERNATION
#define HIBERNATE_TEXT \
. = ALIGN(SZ_4K); \
__hibernate_exit_text_start = .; \
*(.hibernate_exit.text) \
__hibernate_exit_text_end = .;
#else
#define HIBERNATE_TEXT
#endif

#ifdef CONFIG_UNMAP_KERNEL_AT_EL0
#define TRAMP_TEXT \
. = ALIGN(PAGE_SIZE); \
__entry_tramp_text_start = .; \
*(.entry.tramp.text) \
. = ALIGN(PAGE_SIZE); \
__entry_tramp_text_end = .;
#else
#define TRAMP_TEXT
#endif

/*
* The size of the PE/COFF section that covers the kernel image, which
* runs from stext to _edata, must be a round multiple of the PE/COFF
* FileAlignment, which we set to its minimum value of 0x200. 'stext'
* itself is 4 KB aligned, so padding out _edata to a 0x200 aligned
* boundary should be sufficient.
*/
PECOFF_FILE_ALIGNMENT = 0x200;

#ifdef CONFIG_EFI
#define PECOFF_EDATA_PADDING \
.pecoff_edata_padding : { BYTE(0); . = ALIGN(PECOFF_FILE_ALIGNMENT); }
#else
#define PECOFF_EDATA_PADDING
#endif

SECTIONS
{
/*
* XXX: The linker does not define how output sections are
* assigned to input sections when there are multiple statements
* matching the same input section name. There is no documented
* order of matching.
*/
/DISCARD/ : {
ARM_EXIT_DISCARD(EXIT_TEXT)
ARM_EXIT_DISCARD(EXIT_DATA)
EXIT_CALL
*(.discard)
*(.discard.*)
*(.interp .dynamic)
*(.dynsym .dynstr .hash .gnu.hash)
*(.eh_frame)
}

. = KIMAGE_VADDR + TEXT_OFFSET;

.head.text : {
_text = .;
HEAD_TEXT
}
.text : { /* Real text segment */
_stext = .; /* Text and read-only data */
__exception_text_start = .;
*(.exception.text)
__exception_text_end = .;
IRQENTRY_TEXT
SOFTIRQENTRY_TEXT
ENTRY_TEXT
TEXT_TEXT
SCHED_TEXT
CPUIDLE_TEXT
LOCK_TEXT
KPROBES_TEXT
HYPERVISOR_TEXT
IDMAP_TEXT
HIBERNATE_TEXT
TRAMP_TEXT
*(.fixup)
*(.gnu.warning)
. = ALIGN(16);
*(.got) /* Global offset table */
}

. = ALIGN(SEGMENT_ALIGN);
_etext = .; /* End of text section */

RO_DATA(PAGE_SIZE) /* everything from this point to */
EXCEPTION_TABLE(8) /* __init_begin will be marked RO NX */
NOTES

. = ALIGN(SEGMENT_ALIGN);
__init_begin = .;
__inittext_begin = .;

INIT_TEXT_SECTION(8)
.exit.text : {
ARM_EXIT_KEEP(EXIT_TEXT)
}

. = ALIGN(4);
.altinstructions : {
__alt_instructions = .;
*(.altinstructions)
__alt_instructions_end = .;
}

. = ALIGN(PAGE_SIZE);
__inittext_end = .;
__initdata_begin = .;

.init.data : {
INIT_DATA
INIT_SETUP(16)
INIT_CALLS
CON_INITCALL
SECURITY_INITCALL
INIT_RAM_FS
*(.init.rodata.* .init.bss) /* from the EFI stub */
}
.exit.data : {
ARM_EXIT_KEEP(EXIT_DATA)
}

PERCPU_SECTION(L1_CACHE_BYTES)

.rela.dyn : ALIGN(8) {
*(.rela .rela*)
}

__rela_offset = ABSOLUTE(ADDR(.rela.dyn) - KIMAGE_VADDR);
__rela_size = SIZEOF(.rela.dyn);

. = ALIGN(SEGMENT_ALIGN);
__initdata_end = .;
__init_end = .;

_data = .;
_sdata = .;
RW_DATA_SECTION(L1_CACHE_BYTES, PAGE_SIZE, THREAD_ALIGN)

/*
* Data written with the MMU off but read with the MMU on requires
* cache lines to be invalidated, discarding up to a Cache Writeback
* Granule (CWG) of data from the cache. Keep the section that
* requires this type of maintenance to be in its own Cache Writeback
* Granule (CWG) area so the cache maintenance operations don't
* interfere with adjacent data.
*/
.mmuoff.data.write : ALIGN(SZ_2K) {
__mmuoff_data_start = .;
*(.mmuoff.data.write)
}
. = ALIGN(SZ_2K);
.mmuoff.data.read : {
*(.mmuoff.data.read)
__mmuoff_data_end = .;
}

PECOFF_EDATA_PADDING
__pecoff_data_rawsize = ABSOLUTE(. - __initdata_begin);
_edata = .;

BSS_SECTION(0, 0, 0)

. = ALIGN(PAGE_SIZE);
idmap_pg_dir = .;
. += IDMAP_DIR_SIZE;

#ifdef CONFIG_UNMAP_KERNEL_AT_EL0
tramp_pg_dir = .;
. += PAGE_SIZE;
#endif

#ifdef CONFIG_ARM64_SW_TTBR0_PAN
reserved_ttbr0 = .;
. += RESERVED_TTBR0_SIZE;
#endif
swapper_pg_dir = .;
. += SWAPPER_DIR_SIZE;
swapper_pg_end = .;

__pecoff_data_size = ABSOLUTE(. - __initdata_begin);
_end = .;

STABS_DEBUG

HEAD_SYMBOLS
}

/*
* The HYP init code and ID map text can't be longer than a page each,
* and should not cross a page boundary.
*/
ASSERT(__hyp_idmap_text_end - (__hyp_idmap_text_start & ~(SZ_4K - 1)) <= SZ_4K,
"HYP init code too big or misaligned")
ASSERT(__idmap_text_end - (__idmap_text_start & ~(SZ_4K - 1)) <= SZ_4K,
"ID map text too big or misaligned")
#ifdef CONFIG_HIBERNATION
ASSERT(__hibernate_exit_text_end - (__hibernate_exit_text_start & ~(SZ_4K - 1))
<= SZ_4K, "Hibernate exit text too big or misaligned")
#endif
#ifdef CONFIG_UNMAP_KERNEL_AT_EL0
ASSERT((__entry_tramp_text_end - __entry_tramp_text_start) == PAGE_SIZE,
"Entry trampoline text too big")
#endif
/*
* If padding is applied before .head.text, virt<->phys conversions will fail.
*/
ASSERT(_text == (KIMAGE_VADDR + TEXT_OFFSET), "HEAD is misaligned")

  • At the top of this file, the output architecture and the kernel entry point _text are specified:

    OUTPUT_ARCH(aarch64)
    ENTRY(_text)
  • _text is the start address of the text segment, defined in the SECTIONS part below. As the listing shows, _text marks the beginning of the .head.text section, whose address is:

    KIMAGE_VADDR + TEXT_OFFSET; these two macros are defined in arch/arm64/include/asm/memory.h (see the C sketch after this listing for how such linker symbols are used)

    (screenshot: definitions of KIMAGE_VADDR and TEXT_OFFSET in arch/arm64/include/asm/memory.h)


    . = KIMAGE_VADDR + TEXT_OFFSET;

    .head.text : {
    _text = .;
    HEAD_TEXT
    }
    .text : { /* Real text segment */
    _stext = .; /* Text and read-only data */
    __exception_text_start = .;
    *(.exception.text)
    __exception_text_end = .;
    IRQENTRY_TEXT
    SOFTIRQENTRY_TEXT
    ENTRY_TEXT
    TEXT_TEXT
    SCHED_TEXT
    CPUIDLE_TEXT
    LOCK_TEXT
    KPROBES_TEXT
    HYPERVISOR_TEXT
    IDMAP_TEXT
    HIBERNATE_TEXT
    TRAMP_TEXT
    *(.fixup)
    *(.gnu.warning)
    . = ALIGN(16);
    *(.got) /* Global offset table */
    }
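
The symbols assigned inside SECTIONS (_text, _etext, _end, ...) are ordinary linker symbols, so C code can take their addresses; the kernel declares them in asm/sections.h. A minimal sketch of that pattern (the helper function is illustrative, not a kernel API):

/* linker-script symbols carry no storage of their own; declare them as
 * arrays and use only their addresses, as asm/sections.h does */
extern char _text[], _etext[];

static unsigned long kernel_text_size(void)
{
        /* size of the text segment laid out by vmlinux.lds */
        return (unsigned long)(_etext - _text);
}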

2. Kernel Boot: Stage One

2.1 The Kernel Entry Point

I have a 迅为 (Xunwei) RK3588 board on hand, so let's read the compiled vmlinux ELF file and find its entry address:

(screenshot: ELF header of vmlinux showing the start address)

Disassemble it with the toolchain 迅为 provides, then look up the entry address 0xffffffc008000000 in the resulting vmlinux.s:

vmlinux:     file format elf64-littleaarch64
vmlinux
architecture: aarch64, flags 0x00000150:
HAS_SYMS, DYNAMIC, D_PAGED
start address 0xffffffc008000000

Disassembly of section .head.text:

ffffffc008000000 <_text>:
ffffffc008000000: 91005a4d add x13, x18, #0x16
ffffffc008000004: 146c7fff b ffffffc009b20000 <primary_entry>

The disassembly shows that the kernel's first instruction is add x13, x18, #0x16, located in the .head.text section. include/linux/init.h contains the following definitions:

/* For assembly routines */
#define __HEAD .section ".head.text","ax"
#define __INIT .section ".init.text","ax"

The __HEAD macro stands for the .head.text section, so we look for where __HEAD is used and find it in arch/arm64/kernel/head.S:

/*
* Kernel startup entry point.
* ---------------------------
*
* The requirements are:
* MMU = off, D-cache = off, I-cache = on or off,
* x0 = physical address to the FDT blob.
*
* This code is mostly position independent so you call this at
* __pa(PAGE_OFFSET).
*
* Note that the callee-saved registers are used for storing variables
* that are useful before the MMU is enabled. The allocations are described
* in the entry routines.
*/
__HEAD
_head:
/*
* DO NOT MODIFY. Image header expected by Linux boot-loaders.
*/
#ifdef CONFIG_EFI
/*
* This add instruction has no meaningful effect except that
* its opcode forms the magic "MZ" signature required by UEFI.
*/
add x13, x18, #0x16
b primary_entry
#else
b primary_entry // branch to kernel start, magic
.long 0 // reserved
#endif
.quad 0 // Image load offset from start of RAM, little-endian
le64sym _kernel_size_le // Effective size of kernel image, little-endian
le64sym _kernel_flags_le // Informative flags, little-endian
.quad 0 // reserved
.quad 0 // reserved
.quad 0 // reserved
.ascii ARM64_IMAGE_MAGIC // Magic number
#ifdef CONFIG_EFI
.long pe_header - _head // Offset to the PE header.

pe_header:
__EFI_PE_HEADER
#else
.long 0 // reserved
#endif

This is the kernel's entry point. The comment above spells out the boot requirements: the MMU and the D-cache must be off when the kernel is entered, the I-cache may be on or off, and x0 must hold the physical address of the FDT blob.

  • D-cache: the data cache
  • I-cache: the instruction cache
  • FDT: the flattened device tree handed over by U-Boot

The data cache may still hold the bootloader's data; if it were left valid, the kernel could read stale data. The bootloader's instructions have nothing to do with the kernel's, so the instruction cache does not have to be turned off.

add x13, x18, #0x16 performs no meaningful computation; its opcode was picked so that the image begins with the "MZ" signature that UEFI firmware requires, letting the kernel image be recognized and booted as a PE/COFF executable. It is, in effect, a magic number. The CPU then executes b primary_entry, branching to primary_entry:
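
A quick way to convince yourself of the "MZ" trick: the instruction encodes to 0x91005a4d (see the disassembly above), and stored little-endian its first two bytes are 0x4d 0x5a, the ASCII for "MZ". A small host-side C check:

#include <stdio.h>

int main(void)
{
        unsigned int insn = 0x91005a4d;         /* add x13, x18, #0x16 */
        unsigned char *p = (unsigned char *)&insn;

        /* on a little-endian host this prints "MZ", the PE/COFF
         * signature UEFI expects at offset 0 of the image */
        printf("%c%c\n", p[0], p[1]);
        return 0;
}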

2.2 primary_entry

	__INIT

/*
* The following callee saved general purpose registers are used on the
* primary lowlevel boot path:
*
* Register Scope Purpose
* x21 primary_entry() .. start_kernel() FDT pointer passed at boot in x0
* x23 primary_entry() .. start_kernel() physical misalignment/KASLR offset
* x28 __create_page_tables() callee preserved temp register
* x19/x20 __primary_switch() callee preserved temp registers
* x24 __primary_switch() .. relocate_kernel() current RELR displacement
*/
SYM_CODE_START(primary_entry)
bl preserve_boot_args
bl init_kernel_el // w0=cpu_boot_mode
adrp x23, __PHYS_OFFSET
and x23, x23, MIN_KIMG_ALIGN - 1 // KASLR offset, defaults to 0
bl set_cpu_boot_mode_flag
bl __create_page_tables
/*
* The following calls CPU setup code, see arch/arm64/mm/proc.S for
* details.
* On return, the CPU will be ready for the MMU to be turned on and
* the TCR will have been set.
*/
bl __cpu_setup // initialise processor
b __primary_switch
SYM_CODE_END(primary_entry)

2.2.1 preserve_boot_args

/*
* Preserve the arguments passed by the bootloader in x0 .. x3
*/
SYM_CODE_START_LOCAL(preserve_boot_args)
mov x21, x0 // x21 = FDT: stash the dtb address in x21, freeing x0

adr_l x0, boot_args // load the address of the boot_args array into x0
stp x21, x1, [x0] // save x21 and x1 into boot_args[0] and boot_args[1]
stp x2, x3, [x0, #16] // save x2 and x3 into boot_args[2] and boot_args[3]

dmb sy // memory barrier, needed before dc ivac with
// MMU off

mov x1, #0x20 // 4 x 8 bytes: x1 = 32
b __inval_dcache_area // tail call: invalidate the data cache covering boot_args
SYM_CODE_END(preserve_boot_args)
  • boot_args is defined in arch/arm64/kernel/setup.c; it is an array that records the arguments passed to the kernel at boot

    /*
    * The recorded values of x0 .. x3 upon kernel entry.
    */
    u64 __cacheline_aligned boot_args[4];
  • stp is a store-pair instruction: it stores two registers to consecutive memory locations. A C sketch of what preserve_boot_args achieves follows below.
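
What preserve_boot_args accomplishes, expressed as a C sketch (kernel types assumed; the trailing barrier and cache invalidation stay in assembly, since they must run with the MMU off):

#include <linux/cache.h>
#include <linux/types.h>

u64 __cacheline_aligned boot_args[4];

/* conceptual equivalent of the assembly above */
static void preserve_boot_args_sketch(u64 x0, u64 x1, u64 x2, u64 x3)
{
        boot_args[0] = x0;      /* physical address of the FDT blob */
        boot_args[1] = x1;      /* x1..x3 are reserved by the boot protocol */
        boot_args[2] = x2;
        boot_args[3] = x3;
        /* followed by: dmb sy; __inval_dcache_area(boot_args, 4 * 8); */
}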

2.2.2 init_kernel_el

/*
* Starting from EL2 or EL1, configure the CPU to execute at the highest
* reachable EL supported by the kernel in a chosen default state. If dropping
* from EL2 to EL1, configure EL2 before configuring EL1.
*
* Since we cannot always rely on ERET synchronizing writes to sysregs (e.g. if
* SCTLR_ELx.EOS is clear), we place an ISB prior to ERET.
*
* Returns either BOOT_CPU_MODE_EL1 or BOOT_CPU_MODE_EL2 in w0 if
* booted in EL1 or EL2 respectively.
*/
SYM_FUNC_START(init_kernel_el)
mov_q x0, INIT_SCTLR_EL1_MMU_OFF // load the INIT_SCTLR_EL1_MMU_OFF constant into x0
msr sctlr_el1, x0 // write x0 to sctlr_el1, configuring the EL1 system control register

mrs x0, CurrentEL // read the current exception level (EL) into x0
cmp x0, #CurrentEL_EL2 // is the current EL equal to EL2?
b.eq init_el2 // if so, branch to the init_el2 label

SYM_INNER_LABEL(init_el1, SYM_L_LOCAL)
isb
mov_q x0, INIT_PSTATE_EL1 // load the INIT_PSTATE_EL1 constant into x0; it initializes PSTATE
msr spsr_el1, x0 // write x0 to spsr_el1, the EL1 Saved Program Status Register
msr elr_el1, lr // write the link register (lr) to elr_el1, the EL1 Exception Link Register
mov w0, #BOOT_CPU_MODE_EL1 // w0 = BOOT_CPU_MODE_EL1: report that this CPU booted at EL1
eret

SYM_INNER_LABEL(init_el2, SYM_L_LOCAL)
mov_q x0, HCR_HOST_NVHE_FLAGS // load HCR_HOST_NVHE_FLAGS, the EL2 Hypervisor Configuration Register value
msr hcr_el2, x0 // write x0 to hcr_el2
isb

init_el2_state

/* Hypervisor stub */
adr_l x0, __hyp_stub_vectors // load the address of __hyp_stub_vectors, the EL2 exception vector table
msr vbar_el2, x0 // write x0 to vbar_el2, the EL2 vector base address register
isb

msr elr_el2, lr // write the link register (lr) to elr_el2, the EL2 Exception Link Register
mov w0, #BOOT_CPU_MODE_EL2 // w0 = BOOT_CPU_MODE_EL2: report that this CPU booted at EL2
eret
SYM_FUNC_END(init_kernel_el)

//arch/arm64/include/asm/virt.h
#define BOOT_CPU_MODE_EL1 (0xe11)
#define BOOT_CPU_MODE_EL2 (0xe12)

This code initializes the processor at whichever exception level it entered (EL2 or EL1) and sets the corresponding registers and state so that the kernel can run correctly. The main steps:

  1. Configure the EL1 system control register
  2. Check the current exception level
  3. If at EL1:
    • Configure PSTATE and the exception link register.
    • Return into EL1 and continue.
  4. If at EL2:
    • Configure the Hypervisor Configuration Register.
    • Initialize further EL2 state.
    • Set the exception vector base address.
    • Return into EL2 and continue.
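
How does code find out which EL it is running at? The mrs CurrentEL read above can also be expressed from C with inline assembly; a minimal sketch (CurrentEL keeps the EL number in bits [3:2], so CurrentEL_EL2 == 0x8):

/* sketch: read the current exception level the way init_kernel_el does;
 * compiles only for aarch64 targets */
static inline unsigned int current_el(void)
{
        unsigned long el;

        asm volatile("mrs %0, CurrentEL" : "=r" (el));
        return (el >> 2) & 3;   /* 1 = EL1, 2 = EL2 */
}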

2.2.3 set_cpu_boot_mode_flag

/*
* Sets the __boot_cpu_mode flag depending on the CPU boot mode passed
* in w0. See arch/arm64/include/asm/virt.h for more info.
*/
SYM_FUNC_START_LOCAL(set_cpu_boot_mode_flag)
adr_l x1, __boot_cpu_mode // load the address of __boot_cpu_mode into x1
cmp w0, #BOOT_CPU_MODE_EL2 // compare the boot mode in w0 against BOOT_CPU_MODE_EL2
b.ne 1f // if it is not EL2, skip to label 1
add x1, x1, #4 // if w0 == BOOT_CPU_MODE_EL2, advance x1 by 4 to the second word of __boot_cpu_mode
1: str w0, [x1] // store w0 to the memory location x1 points at
dmb sy
dc ivac, x1 // Invalidate potentially stale cache line
ret
SYM_FUNC_END(set_cpu_boot_mode_flag)

/*
* We need to find out the CPU boot mode long after boot, so we need to
* store it in a writable variable.
*
* This is not in .bss, because we set it sufficiently early that the boot-time
* zeroing of .bss would clobber it.
*/
SYM_DATA_START(__boot_cpu_mode)
.long BOOT_CPU_MODE_EL2
.long BOOT_CPU_MODE_EL1
SYM_DATA_END(__boot_cpu_mode)


//arch/arm64/include/asm/virt.h
#define BOOT_CPU_MODE_EL1 (0xe11)
#define BOOT_CPU_MODE_EL2 (0xe12)

  • The w0 register holds the CPU boot mode
  • __boot_cpu_mode is an 8-byte global variable recording the CPU boot mode; its first 4-byte word starts out as 0xe12 (BOOT_CPU_MODE_EL2) and the second as 0xe11 (BOOT_CPU_MODE_EL1). set_cpu_boot_mode_flag overwrites the second word when the CPU booted at EL2 and the first word otherwise.
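
Later C code decides whether every CPU entered the kernel at EL2 by inspecting both words. A simplified sketch modeled on arch/arm64/include/asm/virt.h (treat the exact form as an assumption; the logic is the point):

#include <linux/types.h>

#define BOOT_CPU_MODE_EL1       (0xe11)
#define BOOT_CPU_MODE_EL2       (0xe12)

extern u32 __boot_cpu_mode[2];

/* EL2 boots write word 1, anything else writes word 0, so both words
 * read EL2 only if every CPU really booted at EL2 */
static inline bool is_hyp_mode_available(void)
{
        return __boot_cpu_mode[0] == BOOT_CPU_MODE_EL2 &&
               __boot_cpu_mode[1] == BOOT_CPU_MODE_EL2;
}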

2.2.4 __create_page_tables

/*
* Setup the initial page tables. We only setup the barest amount which is
* required to get the kernel running. The following sections are required:
* - identity mapping to enable the MMU (low address, TTBR0)
* - first few MB of the kernel linear mapping to jump to once the MMU has
* been enabled
*/
SYM_FUNC_START_LOCAL(__create_page_tables)
mov x28, lr

/*
* Invalidate the init page tables to avoid potential dirty cache lines
* being evicted. Other page tables are allocated in rodata as part of
* the kernel image, and thus are clean to the PoC per the boot
* protocol.
*/
adrp x0, init_pg_dir
adrp x1, init_pg_end
sub x1, x1, x0
bl __inval_dcache_area

/*
* Clear the init page tables.
*/
adrp x0, init_pg_dir
adrp x1, init_pg_end
sub x1, x1, x0
1: stp xzr, xzr, [x0], #16
stp xzr, xzr, [x0], #16
stp xzr, xzr, [x0], #16
stp xzr, xzr, [x0], #16
subs x1, x1, #64
b.ne 1b

mov x7, SWAPPER_MM_MMUFLAGS

/*
* Create the identity mapping.
*/
adrp x0, idmap_pg_dir
adrp x3, __idmap_text_start // __pa(__idmap_text_start)

#ifdef CONFIG_ARM64_VA_BITS_52
mrs_s x6, SYS_ID_AA64MMFR2_EL1
and x6, x6, #(0xf << ID_AA64MMFR2_LVA_SHIFT)
mov x5, #52
cbnz x6, 1f
#endif
mov x5, #VA_BITS_MIN
1:
adr_l x6, vabits_actual
str x5, [x6]
dmb sy
dc ivac, x6 // Invalidate potentially stale cache line

/*
* VA_BITS may be too small to allow for an ID mapping to be created
* that covers system RAM if that is located sufficiently high in the
* physical address space. So for the ID map, use an extended virtual
* range in that case, and configure an additional translation level
* if needed.
*
* Calculate the maximum allowed value for TCR_EL1.T0SZ so that the
* entire ID map region can be mapped. As T0SZ == (64 - #bits used),
* this number conveniently equals the number of leading zeroes in
* the physical address of __idmap_text_end.
*/
adrp x5, __idmap_text_end
clz x5, x5
cmp x5, TCR_T0SZ(VA_BITS_MIN) // default T0SZ small enough?
b.ge 1f // .. then skip VA range extension

adr_l x6, idmap_t0sz
str x5, [x6]
dmb sy
dc ivac, x6 // Invalidate potentially stale cache line

#if (VA_BITS < 48)
#define EXTRA_SHIFT (PGDIR_SHIFT + PAGE_SHIFT - 3)
#define EXTRA_PTRS (1 << (PHYS_MASK_SHIFT - EXTRA_SHIFT))

/*
* If VA_BITS < 48, we have to configure an additional table level.
* First, we have to verify our assumption that the current value of
* VA_BITS was chosen such that all translation levels are fully
* utilised, and that lowering T0SZ will always result in an additional
* translation level to be configured.
*/
#if VA_BITS != EXTRA_SHIFT
#error "Mismatch between VA_BITS and page size/number of translation levels"
#endif

mov x4, EXTRA_PTRS
create_table_entry x0, x3, EXTRA_SHIFT, x4, x5, x6
#else
/*
* If VA_BITS == 48, we don't have to configure an additional
* translation level, but the top-level table has more entries.
*/
mov x4, #1 << (PHYS_MASK_SHIFT - PGDIR_SHIFT)
str_l x4, idmap_ptrs_per_pgd, x5
#endif
1:
ldr_l x4, idmap_ptrs_per_pgd
mov x5, x3 // __pa(__idmap_text_start)
adr_l x6, __idmap_text_end // __pa(__idmap_text_end)

map_memory x0, x1, x3, x6, x7, x3, x4, x10, x11, x12, x13, x14

/*
* Map the kernel image (starting with PHYS_OFFSET).
*/
adrp x0, init_pg_dir
mov_q x5, KIMAGE_VADDR // compile time __va(_text)
add x5, x5, x23 // add KASLR displacement
mov x4, PTRS_PER_PGD
adrp x6, _end // runtime __pa(_end)
adrp x3, _text // runtime __pa(_text)
sub x6, x6, x3 // _end - _text
add x6, x6, x5 // runtime __va(_end)

map_memory x0, x1, x5, x6, x7, x3, x4, x10, x11, x12, x13, x14

/*
* Since the page tables have been populated with non-cacheable
* accesses (MMU disabled), invalidate those tables again to
* remove any speculatively loaded cache lines.
*/
dmb sy

adrp x0, idmap_pg_dir
adrp x1, idmap_pg_end
sub x1, x1, x0
bl __inval_dcache_area

adrp x0, init_pg_dir
adrp x1, init_pg_end
sub x1, x1, x0
bl __inval_dcache_area

ret x28
SYM_FUNC_END(__create_page_tables)

Page-table creation is fairly involved and will be analyzed separately later. In outline, this routine invalidates the caches, zeroes the init page tables, records the virtual-address configuration, creates the identity mapping, and then maps the kernel image.
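
The comment in the middle of the routine describes a neat trick worth a worked example: since T0SZ == 64 - (number of VA bits), the largest T0SZ that still covers the whole ID map equals the number of leading zero bits in the physical address of __idmap_text_end. A minimal userspace sketch of that arithmetic (the address value is hypothetical):

#include <stdio.h>

int main(void)
{
        /* hypothetical physical address of __idmap_text_end on a board
         * whose RAM sits high in the physical address space */
        unsigned long long pa_end = 0x881234000ULL;

        /* leading zeroes of the PA == 64 - number of address bits used,
         * which is exactly the largest T0SZ that still covers the ID map */
        int t0sz = __builtin_clzll(pa_end);

        printf("T0SZ = %d -> ID map needs %d VA bits\n", t0sz, 64 - t0sz);
        return 0;
}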

2.2.5 __cpu_setup

//arch/arm64/mm/proc.S

/*
* __cpu_setup
*
* Initialise the processor for turning the MMU on.
*
* Output:
* Return in x0 the value of the SCTLR_EL1 register.
*/
.pushsection ".idmap.text", "awx"
SYM_FUNC_START(__cpu_setup)
tlbi vmalle1 // invalidate the local TLB so the mappings set up later start clean
dsb nsh

mov x1, #3 << 20
msr cpacr_el1, x1 // Enable FP/ASIMD
mov x1, #1 << 12 // Reset mdscr_el1 and disable
msr mdscr_el1, x1 // access to the DCC from EL0
isb // Unmask debug exceptions now,
enable_dbg // since this is per-cpu
reset_pmuserenr_el0 x1 // Disable PMU access from EL0
reset_amuserenr_el0 x1 // Disable AMU access from EL0

/*
* Memory region attributes
*/
mov_q x5, MAIR_EL1_SET // memory-attribute values to program into MAIR_EL1, defining each memory region's attributes
msr mair_el1, x5
/*
* Set/prepare TCR and TTBR. TCR_EL1.T1SZ gets further
* adjusted if the kernel is compiled with 52bit VA support.
*/
mov_q x10, TCR_TxSZ(VA_BITS) | TCR_CACHE_FLAGS | TCR_SMP_FLAGS | \
TCR_TG_FLAGS | TCR_KASLR_FLAGS | TCR_ASID16 | \
TCR_TBI0 | TCR_A1 | TCR_KASAN_SW_FLAGS | TCR_MTE_FLAGS

tcr_clear_errata_bits x10, x9, x5

#ifdef CONFIG_ARM64_VA_BITS_52
ldr_l x9, vabits_actual
sub x9, xzr, x9
add x9, x9, #64
tcr_set_t1sz x10, x9
#else
ldr_l x9, idmap_t0sz
#endif
tcr_set_t0sz x10, x9

/*
* Set the IPS bits in TCR_EL1.
*/
tcr_compute_pa_size x10, #TCR_IPS_SHIFT, x5, x6
#ifdef CONFIG_ARM64_HW_AFDBM
/*
* Enable hardware update of the Access Flags bit.
* Hardware dirty bit management is enabled later,
* via capabilities.
*/
mrs x9, ID_AA64MMFR1_EL1
and x9, x9, #0xf
cbz x9, 1f
orr x10, x10, #TCR_HA // hardware Access flag update
1:
#endif /* CONFIG_ARM64_HW_AFDBM */
msr tcr_el1, x10
/*
* Prepare SCTLR
*/
mov_q x0, INIT_SCTLR_EL1_MMU_ON
ret // return to head.S
SYM_FUNC_END(__cpu_setup)

__cpu_setup is defined in arch/arm64/mm/proc.S. It initializes the processor through a series of steps so that the memory management unit (MMU) can be turned on safely:

  1. Invalidate the local TLB
  2. Enable the FP and SIMD units
  3. Set up the debug registers
  4. Configure the memory attribute register (MAIR_EL1)
  5. Configure the translation control register (TCR_EL1) and prepare the translation table base registers
  6. Set the physical address size (IPS) and hardware Access-flag updating
  7. Prepare the SCTLR_EL1 value and return

These steps ensure the processor handles memory accesses and management correctly once the MMU is enabled.
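
For step 5, the TxSZ fields written by mov_q are worth a worked example. A sketch of the computation, with field offsets from the ARMv8 ARM and macro shapes mirroring arch/arm64/include/asm/pgtable-hwdef.h (treat the exact macro names as assumptions):

#include <stdio.h>

/* T0SZ lives in TCR_EL1[5:0], T1SZ in TCR_EL1[21:16];
 * TxSZ = 64 - VA_BITS for each half of the address space */
#define TCR_T0SZ_OFFSET 0
#define TCR_T1SZ_OFFSET 16
#define TCR_TxSZ(va)    (((64ULL - (va)) << TCR_T0SZ_OFFSET) | \
                         ((64ULL - (va)) << TCR_T1SZ_OFFSET))

int main(void)
{
        /* with 48-bit virtual addresses, T0SZ = T1SZ = 16: each half of
         * the address space spans 2^48 bytes */
        printf("TCR TxSZ bits for VA_BITS=48: 0x%llx\n", TCR_TxSZ(48));
        return 0;
}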

2.2.6 __primary_switch

SYM_FUNC_START_LOCAL(__primary_switch)
#ifdef CONFIG_RANDOMIZE_BASE
mov x19, x0 // preserve new SCTLR_EL1 value
mrs x20, sctlr_el1 // preserve old SCTLR_EL1 value
#endif

adrp x1, init_pg_dir
bl __enable_mmu
#ifdef CONFIG_RELOCATABLE
#ifdef CONFIG_RELR
mov x24, #0 // no RELR displacement yet
#endif
bl __relocate_kernel
#ifdef CONFIG_RANDOMIZE_BASE
ldr x8, =__primary_switched
adrp x0, __PHYS_OFFSET
blr x8

/*
* If we return here, we have a KASLR displacement in x23 which we need
* to take into account by discarding the current kernel mapping and
* creating a new one.
*/
pre_disable_mmu_workaround
msr sctlr_el1, x20 // disable the MMU
isb
bl __create_page_tables // recreate kernel mapping

tlbi vmalle1 // Remove any stale TLB entries
dsb nsh
isb

set_sctlr_el1 x19 // re-enable the MMU

bl __relocate_kernel
#endif
#endif
ldr x8, =__primary_switched
adrp x0, __PHYS_OFFSET
br x8
SYM_FUNC_END(__primary_switch)

This code relocates the kernel mapping: if the kernel must run at a randomized address (KASLR), the page tables are rebuilt and the kernel image is remapped and relocated again. At the very end,

ldr	x8, =__primary_switched
adrp x0, __PHYS_OFFSET
br x8

execution branches to the __primary_switched function and continues there:

/*
* The following fragment of code is executed with the MMU enabled.
*
* x0 = __PHYS_OFFSET
*/
SYM_FUNC_START_LOCAL(__primary_switched)
adrp x4, init_thread_union
add sp, x4, #THREAD_SIZE
adr_l x5, init_task
msr sp_el0, x5 // Save thread_info

adr_l x8, vectors // load VBAR_EL1 with virtual
msr vbar_el1, x8 // vector table address
isb

stp xzr, x30, [sp, #-16]!
mov x29, sp

#ifdef CONFIG_SHADOW_CALL_STACK
adr_l scs_sp, init_shadow_call_stack // Set shadow call stack
#endif

str_l x21, __fdt_pointer, x5 // Save FDT pointer

ldr_l x4, kimage_vaddr // Save the offset between
sub x4, x4, x0 // the kernel virtual and
str_l x4, kimage_voffset, x5 // physical mappings

// Clear BSS
adr_l x0, __bss_start
mov x1, xzr
adr_l x2, __bss_stop
sub x2, x2, x0
bl __pi_memset
dsb ishst // Make zero page visible to PTW

#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
bl kasan_early_init
#endif
mov x0, x21 // pass FDT address in x0
bl early_fdt_map // Try mapping the FDT early
bl init_feature_override // Parse cpu feature overrides
#ifdef CONFIG_RANDOMIZE_BASE
tst x23, ~(MIN_KIMG_ALIGN - 1) // already running randomized?
b.ne 0f
bl kaslr_early_init // parse FDT for KASLR options
cbz x0, 0f // KASLR disabled? just proceed
orr x23, x23, x0 // record KASLR offset
ldp x29, x30, [sp], #16 // we must enable KASLR, return
ret // to __primary_switch()
0:
#endif
bl switch_to_vhe // Prefer VHE if possible
add sp, sp, #16
mov x29, #0
mov x30, #0
b start_kernel
SYM_FUNC_END(__primary_switched)

This function runs with the MMU enabled. It initializes various system registers and data structures (the stack pointer, the exception vector table), clears the BSS section, maps the device tree and parses CPU feature overrides, and handles kernel address space layout randomization (KASLR) when needed. Finally it branches to start_kernel, beginning the kernel's main startup sequence.
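
One detail worth calling out: the "Clear BSS" block above must run before any C code relies on zero-initialized globals, which is why it calls the position-independent __pi_memset directly. As plain C it amounts to the following sketch (symbol declarations as in the kernel's asm/sections.h; the helper name is illustrative):

#include <string.h>

/* the linker script places these symbols around the .bss output section */
extern char __bss_start[], __bss_stop[];

static void clear_bss(void)
{
        /* zero everything between the two linker symbols */
        memset(__bss_start, 0, __bss_stop - __bss_start);
}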

3. Kernel Boot: Stage Two

The second stage of the Linux kernel boot is the so-called C-language stage, which starts at start_kernel(). start_kernel() is the common entry point into kernel initialization on all Linux platforms. It completes the remaining platform-related initialization; some of this work is unconditional, and some only runs when configured in. The modules the kernel needs are then initialized one after another: memory management, the scheduler, exception handling, and so on.

3.1 start_kernel

The start_kernel() function lives in init/main.c and carries out the initialization of the Linux subsystems:

asmlinkage __visible void __init __no_sanitize_address start_kernel(void)
{
char *command_line;
char *after_dashes;

set_task_stack_end_magic(&init_task);
smp_setup_processor_id();
debug_objects_early_init();

cgroup_init_early();

local_irq_disable();
early_boot_irqs_disabled = true;

/*
* Interrupts are still disabled. Do necessary setups, then
* enable them.
*/
boot_cpu_init();
page_address_init();
pr_notice("%s", linux_banner);
early_security_init();
setup_arch(&command_line);
setup_boot_config(command_line);
setup_command_line(command_line);
setup_nr_cpu_ids();
setup_per_cpu_areas();
smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */
boot_cpu_hotplug_init();

build_all_zonelists(NULL);
page_alloc_init();

#ifdef CONFIG_ARCH_ROCKCHIP
{
const char *s = saved_command_line;
const char *e = &saved_command_line[strlen(saved_command_line)];
int n = pr_notice("Kernel command line: %s\n", saved_command_line);
n -= strlen("Kernel command line: ");
s += n;
/* command line maybe too long to print one time */
while (n > 0 && s < e) {
n = pr_cont("%s\n", s);
s += n;
}
}
#else
pr_notice("Kernel command line: %s\n", saved_command_line);
#endif
/* parameters may set static keys */
jump_label_init();
parse_early_param();
after_dashes = parse_args("Booting kernel",
static_command_line, __start___param,
__stop___param - __start___param,
-1, -1, NULL, &unknown_bootoption);
if (!IS_ERR_OR_NULL(after_dashes))
parse_args("Setting init args", after_dashes, NULL, 0, -1, -1,
NULL, set_init_arg);
if (extra_init_args)
parse_args("Setting extra init args", extra_init_args,
NULL, 0, -1, -1, NULL, set_init_arg);

/*
* These use large bootmem allocations and must precede
* kmem_cache_init()
*/
setup_log_buf(0);
vfs_caches_init_early();
sort_main_extable();
trap_init();
mm_init();
poking_init();
ftrace_init();

/* trace_printk can be enabled here */
early_trace_init();

/*
* Set up the scheduler prior starting any interrupts (such as the
* timer interrupt). Full topology setup happens at smp_init()
* time - but meanwhile we still have a functioning scheduler.
*/
sched_init();

if (WARN(!irqs_disabled(),
"Interrupts were enabled *very* early, fixing it\n"))
local_irq_disable();
radix_tree_init();

/*
* Set up housekeeping before setting up workqueues to allow the unbound
* workqueue to take non-housekeeping into account.
*/
housekeeping_init();

/*
* Allow workqueue creation and work item queueing/cancelling
* early. Work item execution depends on kthreads and starts after
* workqueue_init().
*/
workqueue_init_early();

rcu_init();

/* Trace events are available after this */
trace_init();

if (initcall_debug)
initcall_debug_enable();

context_tracking_init();
/* init some links before init_ISA_irqs() */
early_irq_init();
init_IRQ();
tick_init();
rcu_init_nohz();
init_timers();
hrtimers_init();
softirq_init();
timekeeping_init();
kfence_init();
time_init();

/*
* For best initial stack canary entropy, prepare it after:
* - setup_arch() for any UEFI RNG entropy and boot cmdline access
* - timekeeping_init() for ktime entropy used in random_init()
* - time_init() for making random_get_entropy() work on some platforms
* - random_init() to initialize the RNG from early entropy sources
*/
random_init(command_line);
boot_init_stack_canary();

perf_event_init();
profile_init();
call_function_init();
WARN(!irqs_disabled(), "Interrupts were enabled early\n");

early_boot_irqs_disabled = false;
local_irq_enable();

kmem_cache_init_late();

/*
* HACK ALERT! This is early. We're enabling the console before
* we've done PCI setups etc, and console_init() must be aware of
* this. But we do want output early, in case something goes wrong.
*/
console_init();
if (panic_later)
panic("Too many boot %s vars at `%s'", panic_later,
panic_param);

lockdep_init();

/*
* Need to run this when irqs are enabled, because it wants
* to self-test [hard/soft]-irqs on/off lock inversion bugs
* too:
*/
locking_selftest();

#ifdef CONFIG_BLK_DEV_INITRD
if (initrd_start && !initrd_below_start_ok &&
page_to_pfn(virt_to_page((void *)initrd_start)) < min_low_pfn) {
pr_crit("initrd overwritten (0x%08lx < 0x%08lx) - disabling it.\n",
page_to_pfn(virt_to_page((void *)initrd_start)),
min_low_pfn);
initrd_start = 0;
}
#endif
setup_per_cpu_pageset();
numa_policy_init();
acpi_early_init();
if (late_time_init)
late_time_init();
sched_clock_init();
calibrate_delay();

arch_cpu_finalize_init();

pid_idr_init();
anon_vma_init();
#ifdef CONFIG_X86
if (efi_enabled(EFI_RUNTIME_SERVICES))
efi_enter_virtual_mode();
#endif
thread_stack_cache_init();
cred_init();
fork_init();
proc_caches_init();
uts_ns_init();
buffer_init();
key_init();
security_init();
dbg_late_init();
vfs_caches_init();
pagecache_init();
signals_init();
seq_file_init();
proc_root_init();
nsfs_init();
cpuset_init();
cgroup_init();
taskstats_init_early();
delayacct_init();

acpi_subsystem_init();
arch_post_acpi_subsys_init();
sfi_init_late();
kcsan_init();

/* Do the rest non-__init'ed, we're now alive */
arch_call_rest_init();

prevent_tail_call_optimization();
}

As you can see, start_kernel() invokes a long chain of initialization functions in sequence: setup_arch() for the platform, mm_init() for memory management, sched_init() for the scheduler, early_irq_init()/init_IRQ() and the timer setup for interrupts and timekeeping, and finally arch_call_rest_init(), which spawns kernel_init and runs the registered initcalls.
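
To see where device drivers enter this picture, recall the INIT_CALLS section from the linker script: built-in drivers register initcalls that do_initcalls() runs after arch_call_rest_init() has forked kernel_init. A minimal sketch of such a driver (hypothetical names):

#include <linux/init.h>
#include <linux/module.h>
#include <linux/printk.h>

/* For built-in code, module_init() expands to a device-level (level 6)
 * initcall; the function pointer is collected by INIT_CALLS in
 * vmlinux.lds and invoked from do_initcalls() during kernel_init. */
static int __init my_driver_init(void)
{
        pr_info("my_driver: initialized during do_initcalls()\n");
        return 0;
}
module_init(my_driver_init);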
