首页 > 系统相关 >linux源码解析07–缺页异常之架构相关部分(ARM64为例)分析

linux源码解析07–缺页异常之架构相关部分(ARM64为例)分析

时间:2023-02-13 12:32:08浏览次数:54  
标签:07 vmf fault pte vma mm 源码 flags 缺页

缺页异常:

也叫缺页中断,页错误,是操作系统虚拟内存管理重要的一种机制,属于处理器的同步异常;

访问虚拟内存的时候,虚拟地址和物理地址没有建立映射关系,或者有访问权限错误发生时,会触发缺页异常;

内核必须处理此异常,而且对于进程来说时透明的。

缺页异常实现过程

缺页异常,属于同步异常,触发时,CPU会自动跳转到异常向量表; 异常向量表入口地址在arch/arm64/kernel/entry.S

相关源码文件

异常入口:arch/arm64/kernel/entry.S arm64架构处理:arch/arm64/mm/fault.c 通用代码: mm/memory.c

总体调用过程:

vectors    ///arch/arm64/kernel/entry.S架构相关
->el1_sync
->el1_sync_handler
->el1_abort
->do_mem_abort     ///arch/arm64/mm/fault.c  架构相关
 ->do_page_fault
 ->__do_page_fault
	->handle_mm_fault   ///mm/memory.c架构无关
	->__handle_mm_fault
	  ->handle_pte_fault
	    ->do_annoymous_page  ///匿名映射缺页异常
		->do_fault           ///文件映射缺页异常
		->do_swap_page       ///swap缺页异常
		->do_wp_page         ///写时复制缺页异常

源码分析

异常处理汇编入口:

/*
 * Exception vectors.
 */
	.pushsection ".entry.text", "ax"

	.align	11
SYM_CODE_START(vectors)
	kernel_ventry	1, sync_invalid			// Synchronous EL1t
	kernel_ventry	1, irq_invalid			// IRQ EL1t
	kernel_ventry	1, fiq_invalid			// FIQ EL1t
	kernel_ventry	1, error_invalid		// Error EL1t

	kernel_ventry	1, sync				// Synchronous EL1h ///linux异常向量入口,这里是同步异常,kernel_ventry宏展开为el1_sync
	kernel_ventry	1, irq				// IRQ EL1h
	kernel_ventry	1, fiq				// FIQ EL1h
	kernel_ventry	1, error			// Error EL1h
...

el1_sync函数:

SYM_CODE_START_LOCAL_NOALIGN(el1_sync)
	kernel_entry 1  ///保存上下文
	mov	x0, sp      ///sp作为参数
	bl	el1_sync_handler
	kernel_exit 1   ///恢复上下文
SYM_CODE_END(el1_sync)

el1_sync_handler函数:

asmlinkage void noinstr el1_sync_handler(struct pt_regs *regs)
{
	unsigned long esr = read_sysreg(esr_el1);

	switch (ESR_ELx_EC(esr)) {   ///读取esr_el1的EC域,判断异常类型
	case ESR_ELx_EC_DABT_CUR:  ///0x25,表示来自当前的异常等级的数据异常
	case ESR_ELx_EC_IABT_CUR:
		el1_abort(regs, esr);  ///数据异常入口
		break;
...
}

do_mem_abort()->do_page_fault()

static int __kprobes do_page_fault(unsigned long far, unsigned int esr,
				   struct pt_regs *regs)
{
	const struct fault_info *inf;
	struct mm_struct *mm = current->mm;
	vm_fault_t fault;
	unsigned long vm_flags;
	unsigned int mm_flags = FAULT_FLAG_DEFAULT;
	unsigned long addr = untagged_addr(far);

	if (kprobe_page_fault(regs, esr))
		return 0;

	/*
	 * If we're in an interrupt or have no user context, we must not take
	 * the fault.
	 */  ///检查异常发生时的路径, 若不合条件,直接跳转no_context,出错处理
	if (faulthandler_disabled() || !mm)  ///是否关闭缺页中断,是否在中断上下文,是否内核空间,内核进程的mm总是NULL
		goto no_context;

	if (user_mode(regs))
		mm_flags |= FAULT_FLAG_USER;

	/*
	 * vm_flags tells us what bits we must have in vma->vm_flags
	 * for the fault to be benign, __do_page_fault() would check
	 * vma->vm_flags & vm_flags and returns an error if the
	 * intersection is empty
	 */
	if (is_el0_instruction_abort(esr)) {    ///判断是否为低异常等级的指令异常,若为指令异常,说明该地址具有可执行权限
		/* It was exec fault */
		vm_flags = VM_EXEC;
		mm_flags |= FAULT_FLAG_INSTRUCTION;
	} else if (is_write_abort(esr)) { ///写内存区错误
		/* It was write fault */
		vm_flags = VM_WRITE;
		mm_flags |= FAULT_FLAG_WRITE;
	} else {
		/* It was read fault */
		vm_flags = VM_READ;
		/* Write implies read */
		vm_flags |= VM_WRITE;
		/* If EPAN is absent then exec implies read */
		if (!cpus_have_const_cap(ARM64_HAS_EPAN))
			vm_flags |= VM_EXEC;
	}
	///判断是否为用户空间,是否EL1权限错误,都满足时,表明处于少见的特殊情况,直接报错处理
	if (is_ttbr0_addr(addr) && is_el1_permission_fault(addr, esr, regs)) {
		if (is_el1_instruction_abort(esr))
			die_kernel_fault("execution of user memory",
					 addr, esr, regs);

		if (!search_exception_tables(regs->pc))
			die_kernel_fault("access to user memory outside uaccess routines",
					 addr, esr, regs);
	}

	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr);

	/*
	 * As per x86, we may deadlock here. However, since the kernel only
	 * validly references user space from well defined areas of the code,
	 * we can bug out early if this is from code which shouldn't.
	 */   ///执行到这里,可以断定缺页异常没有发生在中断上下文,没有发生在内核线程,以及一些特殊情况;接下来检查由于地址空间引发的缺页异常
	if (!mmap_read_trylock(mm)) {
		if (!user_mode(regs) && !search_exception_tables(regs->pc))
			goto no_context;
retry:
		mmap_read_lock(mm);  ///睡眠,等待锁释放
	} else {
		/*
		 * The above mmap_read_trylock() might have succeeded in which
		 * case, we'll have missed the might_sleep() from down_read().
		 */
		might_sleep();
#ifdef CONFIG_DEBUG_VM
		if (!user_mode(regs) && !search_exception_tables(regs->pc)) {
			mmap_read_unlock(mm);
			goto no_context;
		}
#endif
	}
	///进一步处理缺页以异常
	fault = __do_page_fault(mm, addr, mm_flags, vm_flags, regs);

	...

no_context:
	__do_kernel_fault(addr, esr, regs);  ///报错处理
	return 0;
}

__do_page_fault()

static vm_fault_t __do_page_fault(struct mm_struct *mm, unsigned long addr,
				  unsigned int mm_flags, unsigned long vm_flags,
				  struct pt_regs *regs)
{
	struct vm_area_struct *vma = find_vma(mm, addr); ///查找失效地址addr对应的vma

	if (unlikely(!vma)) ///找不到vma,说明addr还没在进程地址空间中,返回VM_FAULT_BADMAP
		return VM_FAULT_BADMAP;

	/*
	 * Ok, we have a good vm_area for this memory access, so we can handle
	 * it.
	 */
	if (unlikely(vma->vm_start > addr)) {   ///vm_start>addr,特殊情况(栈,向下增长),检查能否vma扩展到addr,否则报错
		if (!(vma->vm_flags & VM_GROWSDOWN))
			return VM_FAULT_BADMAP;
		if (expand_stack(vma, addr))
			return VM_FAULT_BADMAP;
	}

	/*
	 * Check that the permissions on the VMA allow for the fault which
	 * occurred.
	 */
	if (!(vma->vm_flags & vm_flags)) ///判断vma属性,无权限,直接返回
		return VM_FAULT_BADACCESS;
	return handle_mm_fault(vma, addr & PAGE_MASK, mm_flags, regs);  ///缺页中断处理的核心处理函数
}

handle_mm_fault()->__handle_mm_fault()

进入通用代码处理阶段

* By the time we get here, we already hold the mm semaphore
 *
 * The mmap_lock may have been released depending on flags and our
 * return value.  See filemap_fault() and __lock_page_or_retry().
 */
static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
		unsigned long address, unsigned int flags)
{
	struct vm_fault vmf = {  ///构建vma描述结构体
		.vma = vma,
		.address = address & PAGE_MASK,
		.flags = flags,
		.pgoff = linear_page_index(vma, address),
		.gfp_mask = __get_fault_gfp_mask(vma),
	};
	unsigned int dirty = flags & FAULT_FLAG_WRITE;
	struct mm_struct *mm = vma->vm_mm;
	pgd_t *pgd;
	p4d_t *p4d;
	vm_fault_t ret;

	pgd = pgd_offset(mm, address);     ///计算pgd页表项
	p4d = p4d_alloc(mm, pgd, address); ///计算p4d页表项,这里等于pgd
	if (!p4d)
		return VM_FAULT_OOM;

	vmf.pud = pud_alloc(mm, p4d, address);  ///计算pud页表项
	if (!vmf.pud)
		return VM_FAULT_OOM;
...

	vmf.pmd = pmd_alloc(mm, vmf.pud, address);  ///计算pmd页表项
	if (!vmf.pmd)
		return VM_FAULT_OOM;
...

	return handle_pte_fault(&vmf); ///进入通用代码处理
}

handle_pte_fault()

分类处理各种类型缺页异常

static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
{
	pte_t entry;

	if (unlikely(pmd_none(*vmf->pmd))) {
		/*
		 * Leave __pte_alloc() until later: because vm_ops->fault may
		 * want to allocate huge page, and if we expose page table
		 * for an instant, it will be difficult to retract from
		 * concurrent faults and from rmap lookups.
		 */
		vmf->pte = NULL;
	} else {
		/*
		 * If a huge pmd materialized under us just retry later.  Use
		 * pmd_trans_unstable() via pmd_devmap_trans_unstable() instead
		 * of pmd_trans_huge() to ensure the pmd didn't become
		 * pmd_trans_huge under us and then back to pmd_none, as a
		 * result of MADV_DONTNEED running immediately after a huge pmd
		 * fault in a different thread of this mm, in turn leading to a
		 * misleading pmd_trans_huge() retval. All we have to ensure is
		 * that it is a regular pmd that we can walk with
		 * pte_offset_map() and we can do that through an atomic read
		 * in C, which is what pmd_trans_unstable() provides.
		 */
		if (pmd_devmap_trans_unstable(vmf->pmd))
			return 0;
		/*
		 * A regular pmd is established and it can't morph into a huge
		 * pmd from under us anymore at this point because we hold the
		 * mmap_lock read mode and khugepaged takes it in write mode.
		 * So now it's safe to run pte_offset_map().
		 */
		vmf->pte = pte_offset_map(vmf->pmd, vmf->address);  ///计算pte页表项
		vmf->orig_pte = *vmf->pte;                          ///读取pte内容到vmf->orig_pte

		/*
		 * some architectures can have larger ptes than wordsize,
		 * e.g.ppc44x-defconfig has CONFIG_PTE_64BIT=y and
		 * CONFIG_32BIT=y, so READ_ONCE cannot guarantee atomic
		 * accesses.  The code below just needs a consistent view
		 * for the ifs and we later double check anyway with the
		 * ptl lock held. So here a barrier will do.
		 */
		barrier();  ///有的处理器PTE会大于字长,所以READ_ONCE()不能保证原子性,添加内存屏障以保证正确读取了PTE内容
		if (pte_none(vmf->orig_pte)) {
			pte_unmap(vmf->pte);
			vmf->pte = NULL;
		}
	}
///pte为空
	if (!vmf->pte) {
		if (vma_is_anonymous(vmf->vma))
			return do_anonymous_page(vmf);  ///处理匿名映射
		else
			return do_fault(vmf);           ///文件映射
	}
	
///pte不为空
	if (!pte_present(vmf->orig_pte))   ///pte存在,但是不在内存中,从交换分区读回页面
		return do_swap_page(vmf); 

	if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma))  ///处理numa调度页面
		return do_numa_page(vmf);

	vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
	spin_lock(vmf->ptl);
	entry = vmf->orig_pte;
	if (unlikely(!pte_same(*vmf->pte, entry))) {
		update_mmu_tlb(vmf->vma, vmf->address, vmf->pte);
		goto unlock;
	}
	if (vmf->flags & FAULT_FLAG_WRITE) {  ///FAULT_FLAG_WRITE标志,根据ESR_ELx_WnR设置
		if (!pte_write(entry))
			return do_wp_page(vmf);       ///vma可写,pte只读,触发缺页异常,父子进程共享的内存,写时复制
		entry = pte_mkdirty(entry);
	}
	entry = pte_mkyoung(entry);  ///访问标志位错误,设置PTE_AF位
	if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry,  ///更新PTE和缓存
				vmf->flags & FAULT_FLAG_WRITE)) {
		update_mmu_cache(vmf->vma, vmf->address, vmf->pte);
	} else {
		/* Skip spurious TLB flush for retried page fault */
		if (vmf->flags & FAULT_FLAG_TRIED)
			goto unlock;
		/*
		 * This is needed only for protection faults but the arch code
		 * is not yet telling us if this is a protection fault or not.
		 * This still avoids useless tlb flushes for .text page faults
		 * with threads.
		 */
		if (vmf->flags & FAULT_FLAG_WRITE)
			flush_tlb_fix_spurious_fault(vmf->vma, vmf->address);
	}
unlock:
	pte_unmap_unlock(vmf->pte, vmf->ptl);
	return 0;
}

到这里,进入通用代码部分,涉及页面处理相当复杂,参考以下流程图:

缺页异常流程图:

主要缺页异常类型:

匿名映射缺页异常 文件映射缺页异常 swap缺页异常 写时复制缺页异常

匿名映射:无文件支持的内存映射,把物理内存映射到进程的虚拟地址空间,如进程堆、栈。

文件映射:有文件支持的内存映射,将文件的一个区间映射到进程的虚拟地址空间,数据来源是存储设备上的文件,如mmap映射文件。

主缺页:缺页异常时,需要做IO 操作才能获得所需要的数据页,如从磁盘读取文件页(VM_FAULT_MAJOR)。

次缺页:缺页异常时,不需要做IO操作就能获得所需要的数据页,如从page cache查找到页。

交换:是指先将内存某部分的程序或数据写入外存交换区(swap分区或文件),使用的时候再从外存交换区中调入指定的程序或数据到内存中来,并让其执行的一种内存扩充技术。

写实复制:(Copy-on-write)进程试图访问虚拟内存可写而页表没有写权限的页时,处理器发生缺页异常,将分配新的页面并复制旧的页面到新页面。

标签:07,vmf,fault,pte,vma,mm,源码,flags,缺页
From: https://blog.51cto.com/u_7784550/6052174

相关文章

  • RTSP Server(LIVE555)源码分析(一)-重要关系类
        live项目包括四个基本的库,程序入口类(在mediaServer中),各种测试代码(测试代码在testProgs里面)。四个基本的库分别是:UsageEnvironment&TaskScheduler、Groupsoc......
  • 0源码基础学习Spring源码系列(一)——Bean注入流程
    作者:京东科技韩国凯通过本文,读者可以0源码基础的初步学习spring源码,并能够举一反三从此进入源码世界的大米!由于是第一次阅读源码,文章之中难免存在一些问题,还望包涵指正!......
  • Handler源码解析
    Handler源码解析一、基本原理回顾在android开发中,经常会在子线程中进行一些操作,当操作完毕后会通过handler发送一些数据给主线程,通知主线程做相应的操作。探索其背后的......
  • SAP MM事务代码S_P00_07000139可以用于跟踪物料移动细节
    SAPMM事务代码S_P00_07000139可以用于跟踪物料移动细节SAP系统里有很多标准的报表可以查询指定时间段的期初期末库存,出库入库的数量金额等信息。除了比较常见的事务代码以......
  • ChatGPT热度再起,今天教你搭建微信小程序(附源码)
    最近几天ChatGPT的热度又重新起来了,在微信指数中搜索量一度飙升到3100万,大部分人都是看着ChatGPT的热度一点点消散,然后什么也做不了。做电商的卖账号、电话卡,做自媒体的写......
  • 07 自定义分页器
    自定义分页器推导current_page=request.GET.get("page",1)#获取用户访问的页码,没有则默认展示第一页try:#由于后端接收到的前端数据是字符串类型所以做类型转换......
  • STM32F407VET6 / BLACK_F407VE开发板间隔0.5秒不断重启
    有一块STM32F407VET6的故障开发板,之前的问题是经常无法烧录,必须reset之后才能连接,具体查看这篇STM32F407VET6烧录出现flashdownloadfailedtargetdllhasbeen......
  • [07] 案例&优化
    1.通信框架功能设计1.1功能描述通信框架承载了业务内部各模块之间的消息交互和服务调用,它的主要功能如下:基于Netty的NIO通信框架,提供高性能的异步通信能力;提供......
  • linux007之文件、目录操作命令
    查看当前所在目录:pwd:查看当前所在目录路径ls:查看当前目录所有文件ls路径:查看指定目录下所有文件ls-l:以列的形式查看当前目录所有文件ls-l路......
  • day07-SpringMVC底层机制简单实现-03
    SpringMVC底层机制简单实现-03https://github.com/liyuelian/springmvc-demo.git7.任务6-完成控制器方法获取参数-@RequestParam功能说明:自定义@RequestParam注解和......