Xhyper剖析[4]--XhyperCPU虚拟化

5.1 vcpu构建

在 CPU 虚拟化中，首先明确一个概念就是 Guest VM 依然还是运行在实际的物理 CPU 上，那么这里提到的 VCPU 是一个什么概念呢？

VCPU 的运行与进程的上下文切换是类似的，当 Guest VM 需要切换到 VCPU 运行时，Hypervisor 就将 VCPU 对应的 Guest VM 的上下文恢复到对应的物理 CPU 中。当 Guest OS 由于执行一些敏感指令、访问没有权限的内存等情况时，会陷入运行在更高特权级的 Hypervisor，然后 Hypervisor 负责将物理 CPU 的状态保存到 VCPU 上下文中。所以我们这里就可以理解 VCPU 就是对物理 CPU 资源的抽象，Hypervisor 实现了物理 CPU 对 Guest VM 的时间复用。
在 Hypervisor 中可以控制所有的系统资源，包括之前讲到的物理内存、IO 设备、中断、指令等。Hypervisor 可以配置 Guest VM 在执行到敏感指令时触发异常，该异常会被路由到 EL2 的 Hypervisor 中，然后可以对这些敏感指令进行模拟。

首先对Guest VM进行抽象：

一个VM包括其需要的CPU个数，二阶段地址转换的页表基地址寄存器和VCPU集合

// vm.h
/*
 * Description of one guest virtual machine.
 */
typedef struct vm {
    /* Name of the VM. */
    char       name[32];
    /* Number of vCPUs in this VM; used as the upper bound for a PSCI target cpu. */
    int        nvcpu;
    /* Value written to VTTBR_EL2 (stage-2 translation table base) when a vCPU of this VM is switched in. */
    u64       *vttbr;
    /* NOTE(review): presumably protects this structure; no user is visible here. */
    spinlock_t vm_lock;
    /* vCPUs belonging to this VM, indexed by vCPU id. */
    struct vcpu *vcpus[NCPU];
} vm_t;

vcpu需要保存的内容如下：

/*
 * Life-cycle state of a vCPU.
 * NOTE(review): only VCPU_READY and VCPU_RUNNING are assigned in the code
 * shown here; the meaning of the other two is inferred from their names.
 */
enum vcpu_state {
    VCPU_UNUSED,    /* presumably: slot not allocated */
    VCPU_ALLOCED,   /* presumably: allocated but not yet runnable */
    VCPU_READY,     /* set by the PSCI CPU_ON path once an entry address is installed */
    VCPU_RUNNING,   /* set by switch_vcpu() right before entering the guest */
};

/*
 * Saved context of one virtual CPU.
 *
 * `regs` must stay the first member and keep this exact field order: the
 * assembly in vector.s takes the vcpu pointer from TPIDR_EL2 and accesses
 * x[i] at offset 8*i, spsr at 8*31 and elr at 8*32.
 */
typedef struct vcpu {
    /* General-purpose context saved/restored on every trap into EL2. */
    struct {
        u64 x[31];      /* guest x0..x30 */
        u64 spsr;       /* loaded into / saved from SPSR_EL2 */
        u64 elr;        /* loaded into / saved from ELR_EL2 (guest resume address) */
    } regs;

    /*
     * EL1/EL0 system-register state of the guest.
     * NOTE(review): presumably loaded by restore_sysreg(); that helper is
     * not shown here.
     */
    struct {
        u64 spsr_el1;
        u64 elr_el1;
        u64 mpidr_el1;      /* faked core id, set by create_vcpu() */
        u64 midr_el1;       /* faked CPU identification, set by create_vcpu() */
        u64 sp_el0;
        u64 sp_el1;
        u64 ttbr0_el1;
        u64 ttbr1_el1;
        u64 tcr_el1;
        u64 vbar_el1;
        u64 sctlr_el1;
        u64 cntv_ctl_el0;
        u64 cntv_tval_el0;
        u64 cntfrq_el0;
    } sys_regs;
    
    const char *core_name;  /* human-readable core model name */
    struct vm  *vm;         /* owning VM */
    int        cpuid;       /* vCPU id inside the VM */
    enum vcpu_state state;  /* current life-cycle state */
} vcpu_t;

保存通用寄存器regs和系统寄存器sys_regs

在Xhyper中一个物理cpu对应一个vcpu

创建一个vcpu

/*
 * Allocate a vCPU for `vm` and prepare its initial context.
 *
 * vm     - VM that owns the new vCPU
 * vcpuid - id of the vCPU inside the VM; also used as the faked MPIDR_EL1
 * entry  - guest address at which the vCPU starts executing
 *
 * Returns the initialised vCPU. Aborts when no vCPU can be allocated.
 */
vcpu_t *create_vcpu(vm_t *vm, int vcpuid, u64 entry)
{
    vcpu_t *v = vcpu_alloc();

    if (!v) {
        abort("Unable to alloc a vcpu");
    }

    /* Identity of the vCPU. */
    v->vm        = vm;
    v->cpuid     = vcpuid;
    v->core_name = "Cortex-A72";

    /*
     * Values later loaded into SPSR_EL2 / ELR_EL2 so that the first eret
     * lands in the guest:
     *   SPSR_M(5)  - mode field 0b0101, i.e. EL1 using SP_EL1 (EL1h)
     *   SPSR_DAIF  - D, A, I and F exception mask bits (bits 9..6) all set
     *   elr        - exception return address = guest entry point
     */
    v->regs.elr  = entry;
    v->regs.spsr = SPSR_M(5) | SPSR_DAIF;

    /* Identification registers presented to the guest. */
    v->sys_regs.midr_el1  = 0x410FD081;     /* fake the core as a Cortex-A72 */
    v->sys_regs.mpidr_el1 = vcpuid;         /* fake core number */

    return v;
}

修改regs.spsr，该值在进入 Guest VM 之前会被加载到 SPSR_EL2 寄存器中，可以保证异常返回到 Aarch64 状态的 EL1。
修改regs.elr，该值在进入 Guest VM 之前会被加载到 ELR_EL2 寄存器中，可以保证异常返回到 Guest VM 的入口地址。
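修改sys_regs.mpidr_el1，该值用于虚拟化 Guest VM 看到的核号。MPIDR_EL1 本身是只读寄存器，Hypervisor 在进入 Guest VM 之前需要把该值写入 VMPIDR_EL2，这样 Guest VM 在 EL1 读取 MPIDR_EL1 时得到的就是这个虚假的核号。
修改sys_regs.midr_el1，同理，MIDR_EL1 也是只读寄存器，该值在进入 Guest VM 之前需要写入 VPIDR_EL2，这样 Guest VM 读取 MIDR_EL1 时就会看到虚假的 CPU 身份信息。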

切换到vcpu

/*
 * Make `vcpu` the vCPU running on the current physical CPU and enter the
 * guest. Control leaves through switch_out(), which ends in `eret`.
 */
static void switch_vcpu(vcpu_t *vcpu)
{
    /* Record this vCPU as the one running on the current physical CPU. */
    cur_pcpu()->vcpu = vcpu;
    /*
     * TPIDR_EL2 holds the address of the running vCPU; the trap entry/exit
     * code reads it back to locate the vCPU context.
     */
    write_sysreg(tpidr_el2, vcpu);

    vcpu->state = VCPU_RUNNING;
    /* Install the stage-2 translation table base of the owning VM. */
    write_sysreg(vttbr_el2, vcpu->vm->vttbr);
    /* NOTE(review): presumably drops stale translations after the VTTBR change; helper not shown. */
    flush_tlb();
    /* Set up the EL1/EL0 system registers for the guest (helper not shown). */
    restore_sysreg(vcpu);
    isb();
    /* Restore the general-purpose context and eret into EL1. */
    switch_out();
}

首先将当前 VCPU 的地址保存到 TPIDR_EL2 寄存器中，这样做的目的是当 Guest VM 的该 VCPU 发生异常时，Hypervisor 可以通过读取 TPIDR_EL2 来获取发生异常的 VCPU。
修改 VTTBR_EL2，保证该 Guest VM 可以访问其 IPA。
然后将 VCPU 中的通用寄存器和系统寄存器的值加载到实际的物理寄存器中。
调用 switch_out 返回 Guest VM。

有个重要的函数switch_out：

从tpidr_el2取出vcpu的地址，根据此指针去恢复寄存器的值
恢复完成寄存器的值后，调用eret指令返回EL1执行

# vector.s
/*
 * restore_vm_regs: reload the guest context of the vCPU whose address is in
 * TPIDR_EL2. Loads SPSR_EL2, ELR_EL2 and x0-x30 from vcpu->regs.
 * x0 is used as the base pointer throughout, so guest x0/x1 are parked on
 * the stack and reloaded last.
 */
.macro restore_vm_regs
    mrs x0,  tpidr_el2          /* x0 = &vcpu->regs */
    ldp x30, x1, [x0, #8 * 30]  /* x30 = guest x30, x1 = saved spsr */
    ldr x2, [x0, #8 * 32]       /* x2 = saved elr */
    msr spsr_el2, x1            /* spsr_el2: program state used by eret */
    msr elr_el2,  x2            /* elr_el2: address eret returns to */

    ldp x3, x4, [x0, #8 * 0]    /* x3 = guest x0, x4 = guest x1 */
    stp x3, x4, [sp, #-16]!     /* push guest x0/x1; sp is pre-decremented by 16 bytes */
    /* restore x2-x29 */
    ldp x2, x3, [x0, #8 * 2]
    ldp x4, x5, [x0, #8 * 4]
    ldp x6, x7, [x0, #8 * 6]
    ldp x8, x9, [x0, #8 * 8]
    ldp x10, x11, [x0, #8 * 10]
    ldp x12, x13, [x0, #8 * 12]
    ldp x14, x15, [x0, #8 * 14]
    ldp x16, x17, [x0, #8 * 16]
    ldp x18, x19, [x0, #8 * 18]
    ldp x20, x21, [x0, #8 * 20]
    ldp x22, x23, [x0, #8 * 22]
    ldp x24, x25, [x0, #8 * 24]
    ldp x26, x27, [x0, #8 * 26]
    ldp x28, x29, [x0, #8 * 28]
    /* pop guest x0/x1 last, overwriting the base pointer */
    ldp x0, x1, [sp], #16
.endm

/*
 * switch_out: restore the guest context of the vCPU in TPIDR_EL2 and
 * return to the guest with eret. Does not return to its caller.
 */
.global  switch_out
.type    switch_out, function
switch_out:
    restore_vm_regs
    eret

5.2 启动cpu 0

在Xhyper的main.c的hyper_init_primary函数中首先调用了hyper_setup();

/* Entry of the EL2 exception vector table, defined in vector.s. */
extern void hyper_vector();

/*
 * Per-core EL2 setup: program HCR_EL2 with the virtualization controls and
 * point VBAR_EL2 at the hypervisor vector table.
 */
void hyper_setup()
{
    /*
     *  HCR_TSC : trap guest SMC (Secure Monitor Call) instructions to the hypervisor
     *  HCR_RW  : selects whether the guest runs in AArch64 or AArch32
     *  HCR_VM  : enables stage-2 address translation
     */
    u64 hcr = HCR_TSC | HCR_RW | HCR_VM;

    /*
     * The original format string had a conversion but no argument, so the
     * logger read a garbage value. Pass hcr, printed with %p like the other
     * 64-bit register values in this code base.
     */
    LOG_INFO("Setting hcr_el2 to %p and enable stage 2 address translation\n", hcr);
    write_sysreg(hcr_el2, hcr);

    LOG_INFO("Setting Vector Base Address Register for EL2\n");
    write_sysreg(vbar_el2, (u64)hyper_vector);

    /* Make the system-register writes above take effect before continuing. */
    isb();
}

vbar_el2寄存器是EL2的异常向量表基地址寄存器，保存异常陷入EL2时所使用的异常向量表的地址，这里设置成了hyper_vector

hyper_vector定义在vector.s中，关于arm异常向量表的知识参考2.3节

/*
 * save_vm_regs: store the trapped guest context into the vCPU whose address
 * is in TPIDR_EL2. Saves x0-x30, SPSR_EL2 and ELR_EL2 into vcpu->regs.
 * x0/x1 are pushed on the EL2 stack first because x0 is needed as the base
 * pointer and x1/x2 as scratch.
 */
.macro save_vm_regs
    stp x0, x1, [sp, #-16]!     /* save x0, x1 on stack */
    mrs x0, tpidr_el2           /* x0 = &vcpu->regs */

    // save general-purpose registers x2-x29
    stp x2, x3, [x0, #8 * 2]
    stp x4, x5, [x0, #8 * 4]
    stp x6, x7, [x0, #8 * 6]
    stp x8, x9, [x0, #8 * 8]
    stp x10, x11, [x0, #8 * 10]
    stp x12, x13, [x0, #8 * 12]
    stp x14, x15, [x0, #8 * 14]
    stp x16, x17, [x0, #8 * 16]
    stp x18, x19, [x0, #8 * 18]
    stp x20, x21, [x0, #8 * 20]
    stp x22, x23, [x0, #8 * 22]
    stp x24, x25, [x0, #8 * 24]
    stp x26, x27, [x0, #8 * 26]
    stp x28, x29, [x0, #8 * 28]

    mrs x1, spsr_el2 /* x1 = spsr_el2 */
    mrs x2, elr_el2  /* x2 = elr_el2 */
    ldp x3, x4, [sp], #16   /* x3 = guest x0, x4 = guest x1, popped from the stack (x3/x4 were already saved above) */
    stp x30, x1, [x0, #8 *30] /* x30 -> offset 8*30, spsr_el2 -> offset 8*31 */
    str x2, [x0, #8 * 32]  /* save elr_el2 */ 
    stp x3, x4, [x0, #8 * 0] /* save guest x0, x1 */
.endm


/*
 * hyper_vector: EL2 exception vector table, installed in VBAR_EL2 by
 * hyper_setup(). Four groups of four 0x80-byte entries (sync, IRQ, FIQ,
 * error). Entries that are not handled spin in place.
 *
 * The alignment directive must come BEFORE the label. With the label first,
 * hyper_vector would name the unaligned address in front of the padding,
 * and the `.org vector_base` below would try to move the location counter
 * backwards whenever padding was inserted.
 */
.balign 0x800                   // align the table to 2048 bytes (2KB)
.global hyper_vector
hyper_vector:
.set  vector_base, hyper_vector // vector_base = address of hyper_vector
.org  vector_base               // location counter is already here; kept for symmetry with the entries below

/* Current EL with SP_EL0 */
.org (vector_base + 0x00)
    b .                     // sync
.org (vector_base + 0x80)
    b .                     // IRQ
.org (vector_base + 0x100)
    b .                     // FIQ
.org (vector_base + 0x180)
    b .                     // error

/* Current EL with SP_ELx */
.org (vector_base + 0x200)
    b vector_el2_sync       // sync  (handler is currently a stub)
.org (vector_base + 0x280)
    b vector_el2_irq        // IRQ   (handler is currently a stub)
.org (vector_base + 0x300)
    b .                     // FIQ
.org (vector_base + 0x380)
    b .                     // error

/* Lower EL using aarch64 */
.org (vector_base + 0x400)
    b vector_el1_sync       // sync  synchronous exceptions taken from EL1
.org (vector_base + 0x480)
    b vector_el1_irq        // IRQ   (handler is currently a stub)
.org (vector_base + 0x500)
    b .                     // FIQ
.org (vector_base + 0x580)
    b .                     // error

/* Lower EL using aarch32 */
.org (vector_base + 0x600)
    b .                     // sync
.org (vector_base + 0x680)
    b .                     // IRQ
.org (vector_base + 0x700)
    b .                     // FIQ
.org (vector_base + 0x780)
    b .                     // error

/* Synchronous exception taken from EL2 itself: not handled, spin forever. */
vector_el2_sync:
    b .

/* IRQ taken while running at EL2: not handled, spin forever. */
vector_el2_irq:
    b .

/*
 * Synchronous exception taken from EL1 (guest): save the guest context into
 * the current vCPU, handle the trap in C, then restore the (possibly
 * modified) context and return to the guest.
 */
vector_el1_sync:
    save_vm_regs
    bl el1_sync_proc
    restore_vm_regs
    eret

/* IRQ taken from EL1 (guest): not handled, spin forever. */
vector_el1_irq:
    b .

可以看见当异常为vector_el1_sync时会首先调用save_vm_regs保存当前cpu的执行状态到对应的vcpu中
然后调用el1_sync_proc处理异常

//el1_sync.c
/*
 * C handler for synchronous exceptions taken from EL1 into EL2.
 *
 * Called from vector_el1_sync after the guest context has been saved into
 * the vCPU whose address is in TPIDR_EL2. Decodes ESR_EL2 and dispatches
 * HVC and trapped SMC calls; any other exception class aborts.
 * Changes made to vcpu->regs here (return value in x[0], adjusted elr) are
 * applied when the vector code restores the context.
 */
void el1_sync_proc()
{
    /* ESR_EL2.EC values handled here. */
    enum {
        ESR_EC_HVC64 = 0x16,    /* HVC instruction executed in AArch64 state */
        ESR_EC_SMC64 = 0x17,    /* SMC instruction in AArch64 state, trapped to EL2 */
    };

    vcpu_t *vcpu;
    /* which vcpu has been trapped into EL2 */
    read_sysreg(vcpu, tpidr_el2);

    u64 esr, elr, far;
    /* Exception Syndrome Register */
    read_sysreg(esr, esr_el2);
    /* Exception Link Register */
    read_sysreg(elr, elr_el2);
    /* Holds the faulting Virtual Address */
    read_sysreg(far, far_el2);

    /* Exception Class: bits 31..26 of esr_el2 */
    u64 esr_ec  = (esr >> 26) & 0x3F;
    /* Instruction Specific Syndrome: bits 24..0 of esr_el2 */
    u64 esr_iss = esr & 0x1FFFFFF;

    switch(esr_ec) {
        /* The guest used HVC to call into the hypervisor. */
        case ESR_EC_HVC64:
            LOG_INFO("\033[32m [el1_sync_proc] hvc trap from EL1\033[0m\n");
            if(hvc_smc_handler(vcpu, esr_iss) != 0) {
                abort("Unknown HVC call #%d", esr_iss);
            }
            /*
             * For HVC the preferred exception return address is already
             * pc+4, so the saved elr needs no adjustment.
             */
            break;
        /* The guest executed SMC and it was trapped to EL2. */
        case ESR_EC_SMC64:
            LOG_INFO("\033[32m[el1_sync_proc] smc trap from EL1\033[0m\n");
            /* on smc call, iss is the imm of a smc */
            if(hvc_smc_handler(vcpu, esr_iss) != 0) {
                abort("Unknown SMC call #%d", esr_iss);
            }
            /*
             * For a trapped SMC the preferred exception return address is
             * the SMC itself, so step over it to resume at the next
             * instruction.
             */
            vcpu->regs.elr += 4;
            break;
        default:
            abort("Unknown el1 sync: esr_ec %p, esr_iss %p, elr %p, far %p", esr_ec, esr_iss, elr, far);
            break;
    }

    return;
}

esr_el2寄存器的ec字段保存了异常产生的原因，即根据ec字段的值来判断是什么异常
- 0x16：表示 Guest VM 执行 HVC 指令触发了异常，并陷入到 EL2。
- 0x17: Trapped SMC 指令（由 HCR_TSC 触发）。
esr_el2寄存器的iss字段提供更详细的异常信息

通过switch_out函数就会返回到EL1中执行，入口函数为guest os的_start函数：

/*
 * Guest entry point. Gives each core its own stack, then core 0 runs
 * vm_primary_init and every other core runs vm_secondary_init.
 */
.section .text, "ax"
.global  _start
.type    _start, function
.align 4

_start:
    /* Set stack for c code: x0 = page address of sp_stack */
    adrp    x0, sp_stack
    /* Get current core id: low 4 bits of mpidr_el1 */
    mrs     x1, mpidr_el1
    and     x1, x1, #0x0f
    /* sp = x0 + SZ_4K * (core id + 1) */
    add     x2, x1, 1
    mov     x3, #SZ_4K
    mul     x3, x3, x2
    add     x0, x0, x3
    mov     sp, x0
    /* core 0 -> primary init (plain branch, no link); others -> secondary init */
    cbz     x1, vm_primary_init
    bl      vm_secondary_init
    /* spin here */
    b       .

此函数会读取当前的core id：如果为0，则会跳转到vm_primary_init函数执行；如果不为0（例如core id为1），则会跳转到vm_secondary_init函数执行

vm_primary_init函数如下：

/*
 * Guest entry for core 0: initialise the console, print the logo, ask for
 * core 1 to be started at _start, then print a heartbeat message forever.
 * Never returns in practice; the trailing return only satisfies the
 * signature.
 */
int vm_primary_init()
{
    pl011_init();
    print_logo();

    /* test code for wakeup vcore 1: function id 0xc4000003, target cpu 1, entry _start */
    smc_call((u64)0xc4000003, (u64)1, (u64)_start);

    while(1) {
        printf("I am vm 1 on core 0\n");
        /*
         * Crude delay between messages. The counter is volatile so that an
         * optimising compiler cannot delete the otherwise empty loop.
         */
        for(volatile int i = 0; i < 100000000; i++) {
        }
    }

    return 0;
}

smc_call函数可以用来启用另外一个cpu核心主要是调用smc指令来实现

/*
 * smc_call: issue `smc #0` with the caller's arguments left untouched in
 * their registers (x0 = function id, x1/x2 = arguments), then return to the
 * caller with whatever the SMC handler left in x0.
 */
.global smc_call
.type    smc_call, function
smc_call:
    smc #0
    ret

smc指令对应的function id如下：

/* https://developer.aliyun.com/article/1205031 */
#define PSCI_VERSION            0x84000000 // returns the PSCI major and minor version numbers (32-bit value)
#define PSCI_MIGRATE_INFO_TYPE  0x84000006 // returns whether the system supports CPU migration, and the migration type
#define PSCI_SYSTEM_OFF         0x84000008 // asks the system to shut down completely (power off)
#define PSCI_SYSTEM_RESET       0x84000009 // asks the system to reboot (soft or hard reset)
#define PSCI_SYSTEM_CPUON       0xc4000003 // wakes a CPU that is off or in low power, setting its entry address and context
#define PSCI_FEATURE            0x8400000a // checks whether a given PSCI function is available: takes a function id, returns its support status

5.3 启动cpu 1

通过smc指令就会从guest os陷入到el2然后通过查找异常向量表调用到el1_sync_proc()函数，根据esr_el2的ec字段的值来判断进行下一步调用

如果是0x17即smc call的话会进入到hvc_smc_handler处理

//el1_sync.c
/*
 * Forward a guest PSCI request to vpsci_trap_smc() and hand the result back
 * to the guest.
 *
 * The request is taken from the saved guest registers:
 *   x0 - function id
 *   x1 - target cpu
 *   x2 - entry addr (an address in the guest, not in the hypervisor)
 * The return value is stored in the saved x0.
 */
static void vpsci_handler(vcpu_t *vcpu)
{
    u64 *gpr = vcpu->regs.x;

    gpr[0] = vpsci_trap_smc(vcpu, gpr[0], gpr[1], gpr[2]);
}

/*
 * Dispatch an HVC/SMC trapped from the guest by its immediate value.
 * Only immediate 0 is supported and is treated as a PSCI call.
 *
 * Returns 0 when the call was handled, -1 for an unknown immediate.
 */
static int hvc_smc_handler(vcpu_t *vcpu, int imm)
{
    if (imm != 0) {
        return -1;
    }

    vpsci_handler(vcpu);
    return 0;
}

vpsci_trap_smc进一步处理：

// vpsci.c
/* Issue the PSCI_VERSION call via smc_call() and return its result. */
static u32 vpsci_version()
{
    u32 version = smc_call(PSCI_VERSION, 0, 0);

    return version;
}

/* Issue the PSCI_MIGRATE_INFO_TYPE call via smc_call() and return its result. */
static s32 vpsci_migrate_info_type()
{
    s32 migrate_type = smc_call(PSCI_MIGRATE_INFO_TYPE, 0, 0);

    return migrate_type;
}

/*
 * Handle a guest PSCI CPU_ON request.
 *
 * vcpu       - vCPU that issued the request
 * funid      - PSCI function id (unused here)
 * target_cpu - index of the vCPU to start inside the caller's VM
 * entry_addr - guest address the target vCPU starts executing at
 *
 * Installs entry_addr as the target vCPU's return address, marks it ready,
 * then powers on the physical core with the same index at the hypervisor's
 * _start.
 *
 * Returns the result of the physical CPU_ON call, or the PSCI
 * INVALID_PARAMETERS code when target_cpu does not name a usable vCPU.
 */
static s32 vpsci_cpu_on(vcpu_t *vcpu, u64 funid, u64 target_cpu, u64 entry_addr)
{
    /* PSCI return code for an invalid argument. */
    enum { VPSCI_INVALID_PARAMETERS = -2 };

    LOG_INFO("Vpsci cpu on call for vcpu %d on entrypoint %p\n", target_cpu, entry_addr);

    /*
     * Reject an out-of-range target instead of only warning: falling
     * through would index vcpus[] out of bounds.
     */
    if(target_cpu >= (u64)vcpu->vm->nvcpu) {
        LOG_WARN("Vpsci failed to wakeup vcpu\n");
        return VPSCI_INVALID_PARAMETERS;
    }

    vcpu_t *target = vcpu->vm->vcpus[target_cpu];
    if(target == NULL) {
        LOG_WARN("Vpsci failed to wakeup vcpu\n");
        return VPSCI_INVALID_PARAMETERS;
    }

    target->regs.elr = entry_addr;
    target->state = VCPU_READY;
    /* wakeup the physical cpu */
    return smc_call(PSCI_SYSTEM_CPUON, target_cpu, (u64)_start);
}

/*
 * Emulate a PSCI call trapped from a guest.
 *
 * vcpu       - vCPU that issued the call; must not be NULL
 * funid      - PSCI function id
 * target_cpu - second call argument (target cpu for CPU_ON)
 * entry_addr - third call argument (guest entry address for CPU_ON)
 *
 * Returns the value to hand back to the guest in x0. Unsupported functions
 * return -1; an unknown function id aborts.
 */
u64 vpsci_trap_smc(vcpu_t *vcpu, u64 funid, u64 target_cpu, u64 entry_addr)
{
    u64 result = -1;

    if(!vcpu) {
        abort("vpsci_trap_smc with NULL vcpu");
    }

    switch(funid) {
        case PSCI_VERSION:
            result = vpsci_version();
            break;
        case PSCI_MIGRATE_INFO_TYPE:
            result = (s64)vpsci_migrate_info_type();
            break;
        case PSCI_SYSTEM_CPUON:
            result = (s64)vpsci_cpu_on(vcpu, funid, target_cpu, entry_addr);
            break;
        case PSCI_FEATURE:
            /* Linux queries this function id for PSCI features; fake the answer. */
            result = 0;
            break;
        case PSCI_SYSTEM_OFF:
            LOG_WARN("Unsupported PSCI CPU OFF\n");
            break;
        case PSCI_SYSTEM_RESET:
            LOG_WARN("Unsupported PSCI CPU RESET\n");
            break;
        default:
            abort("Unknown function id : %p from hvc/smc call", funid);
            break;
    }

    return result;
}

当调用的id号为PSCI_SYSTEM_CPUON时会去唤醒一个cpu核心：

这里有个重要点，将唤醒的cpu对应的vcpu的返回地址设置为了entry_addr，这样被唤醒的cpu从el2返回到el1执行时就会返回到entry_addr处执行

/*
 * Handle a guest PSCI CPU_ON request.
 *
 * vcpu       - vCPU that issued the request
 * funid      - PSCI function id (unused here)
 * target_cpu - index of the vCPU to start inside the caller's VM
 * entry_addr - guest address the target vCPU starts executing at
 *
 * Installs entry_addr as the target vCPU's return address, marks it ready,
 * then powers on the physical core with the same index at the hypervisor's
 * _start.
 *
 * Returns the result of the physical CPU_ON call, or the PSCI
 * INVALID_PARAMETERS code when target_cpu does not name a usable vCPU.
 */
static s32 vpsci_cpu_on(vcpu_t *vcpu, u64 funid, u64 target_cpu, u64 entry_addr)
{
    /* PSCI return code for an invalid argument. */
    enum { VPSCI_INVALID_PARAMETERS = -2 };

    LOG_INFO("Vpsci cpu on call for vcpu %d on entrypoint %p\n", target_cpu, entry_addr);

    /*
     * Reject an out-of-range target instead of only warning: falling
     * through would index vcpus[] out of bounds.
     */
    if(target_cpu >= (u64)vcpu->vm->nvcpu) {
        LOG_WARN("Vpsci failed to wakeup vcpu\n");
        return VPSCI_INVALID_PARAMETERS;
    }

    vcpu_t *target = vcpu->vm->vcpus[target_cpu];
    if(target == NULL) {
        LOG_WARN("Vpsci failed to wakeup vcpu\n");
        return VPSCI_INVALID_PARAMETERS;
    }

    target->regs.elr = entry_addr;
    target->state = VCPU_READY;
    /* wakeup the physical cpu */
    return smc_call(PSCI_SYSTEM_CPUON, target_cpu, (u64)_start);
}

唤醒的cpu核心在el2的执行地址为_start：

/*
 * Hypervisor entry point. Gives each core its own stack, then core 0 runs
 * hyper_init_primary and every other core runs hyper_init_secondary.
 */
.section .text, "ax"
.global  _start
.type    _start, function
.align 4

# Set up the stack pointer; if core_id == 0, jump to hyper_init_primary
_start:
    /* Set stack for c code: x0 = page address of sp_stack */
    adrp    x0, sp_stack
    /* Get current core id: low 4 bits of mpidr_el1 */
    mrs     x1, mpidr_el1
    and     x1, x1, #0x0f
    /* sp = x0 + SZ_4K * (core id + 1) */
    add     x2, x1, 1
    mov     x3, #SZ_4K
    mul     x3, x3, x2
    add     x0, x0, x3
    mov     sp, x0
    /* core 0 -> primary init (plain branch, no link); others -> secondary init */
    cbz     x1, hyper_init_primary
    bl      hyper_init_secondary
    /* spin here */
    b       .

_start函数会判断当前的core id是否为0，如果不为0（这里是1），则说明是被唤醒的一个新的core，因此会跳转到hyper_init_secondary函数执行

/*
 * Hypervisor entry for a secondary core: log the activation, initialise
 * stage-2 translation and the EL2 configuration on this core, then start
 * its vCPU. Spins forever if start_vcpu() ever returns.
 */
int hyper_init_secondary()
{
    LOG_INFO("core %d is activated\n", coreid());

    stage2_mmu_init();
    hyper_setup();
    start_vcpu();

    for(;;) {
    }

    return 0;
}

到这里core 1也已经被启动了，core 1通过start_vcpu函数就可以返回到el1执行，core 0在处理完成el1来的异常后也会返回到el1

/*
 * Synchronous exception taken from EL1 (guest): save the guest context into
 * the current vCPU, handle the trap in C, then restore the (possibly
 * modified) context and return to the guest.
 */
vector_el1_sync:
    save_vm_regs
    bl el1_sync_proc
    restore_vm_regs
    eret

整体的流程如下：