首页 > 系统相关 >linux内存管理(七)- 写时复制

linux内存管理(七)- 写时复制

时间:2024-06-11 16:32:42浏览次数:31  
标签:folio struct vmf pte vma 内存 linux page 写时

在fork进程的时候子进程会复制父进程的页表,并与父进程共享同一批物理页,并没有分配新页。此时页表项是只读的,如果父进程或者子进程写内存就会触发page fault,内核会重新分配内存并更改页表,从此分道扬镳。因此写时复制包含两部分内容,第一是fork进程时复制页表并设置pte为只读,第二是写内存发生page fault。

先来看看第一部分。

复制页表的操作发生在dup_mmap中,调用链是kernel_clone->copy_process->copy_mm->dup_mm->dup_mmap

/*
 * Excerpt of dup_mmap(); parts elided by the article are marked "...".
 *
 * The visible part walks the VMAs with for_each_vma() and, for every VMA
 * whose tmp->vm_flags does not have VM_WIPEONFORK set, copies the page
 * tables by calling copy_page_range(tmp, mpnt).
 *
 * NOTE(review): mm/oldmm are presumably the child's and the parent's
 * mm_struct, and tmp the child's duplicate of mpnt - the code that sets
 * them up is elided here, confirm against the full source.
 */
static __latent_entropy int dup_mmap(struct mm_struct *mm,
                    struct mm_struct *oldmm)
{
...
    for_each_vma(vmi, mpnt) {
...
        if (!(tmp->vm_flags & VM_WIPEONFORK))
            // copy the page tables for this VMA
            retval = copy_page_range(tmp, mpnt);
...
    }
...
}
/*
 * Excerpt of copy_page_range(); parts elided by the article are marked "...".
 *
 * The visible loop advances dst_pgd/src_pgd in lock step over [addr, end),
 * one top-level (pgd) entry per iteration, and hands each populated entry
 * to copy_p4d_range().
 *
 * Returns ret; the visible code sets it to -ENOMEM when copy_p4d_range()
 * fails (its initial value is in the elided part).
 */
int
copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
{
...
    do {
        /* End of the range covered by the current pgd entry. */
        next = pgd_addr_end(addr, end);
        /* Nothing to copy for an empty (or bad, now cleared) source entry. */
        if (pgd_none_or_clear_bad(src_pgd))
            continue;
        if (unlikely(copy_p4d_range(dst_vma, src_vma, dst_pgd, src_pgd,
                        addr, next))) {
            /* Lower-level copy failed: undo pfn tracking on dst and bail out. */
            untrack_pfn_clear(dst_vma);
            ret = -ENOMEM;
            break;
        }
    } while (dst_pgd++, src_pgd++, addr = next, addr != end);
...
    return ret;
}

copy_page_range循环复制各级页表,copy_p4d_range->copy_pud_range->copy_pmd_range->copy_pte_range->copy_present_pte

/*
 * Copy one present pte from src_vma into dst_vma at @addr.
 *
 * Takes a reference on the backing folio (if there is a normal page),
 * duplicates its reverse mapping, bumps the matching rss[] counter and
 * installs the resulting pte in dst_pte.  For a COW mapping with a writable
 * pte, both the source pte and the copy are write-protected.
 *
 * Returns 0 after installing the pte, or whatever copy_present_page()
 * returns when the anon rmap could not be duplicated and the page has to
 * be copied right away.
 */
static inline int
copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
         pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss,
         struct folio **prealloc)
{
    struct mm_struct *src_mm = src_vma->vm_mm;
    unsigned long vm_flags = src_vma->vm_flags;
    pte_t pte = ptep_get(src_pte);
    struct page *page;
    struct folio *folio;

    page = vm_normal_page(src_vma, addr, pte);
    /* folio is only assigned (and only used below) when page is non-NULL. */
    if (page)
        folio = page_folio(page);
    if (page && folio_test_anon(folio)) {
        /*
         * If this page may have been pinned by the parent process,
         * copy the page immediately for the child so that we'll always
         * guarantee the pinned page won't be randomly replaced in the
         * future.
         */
        folio_get(folio);
        if (unlikely(folio_try_dup_anon_rmap_pte(folio, page, src_vma))) {
            /* Page may be pinned, we have to copy. */
            folio_put(folio);
            return copy_present_page(dst_vma, src_vma, dst_pte, src_pte,
                         addr, rss, prealloc, page);
        }
        rss[MM_ANONPAGES]++;
    } else if (page) {
        folio_get(folio);
        folio_dup_file_rmap_pte(folio, page);
        rss[mm_counter_file(page)]++;
    }

    /*
     * If it's a COW mapping, write protect it both
     * in the parent and the child
     */
    // COW mapping with a writable pte: switch the pte to write-protected
    if (is_cow_mapping(vm_flags) && pte_write(pte)) {
        ptep_set_wrprotect(src_mm, addr, src_pte);
        pte = pte_wrprotect(pte);
    }
    VM_BUG_ON(page && folio_test_anon(folio) && PageAnonExclusive(page));

    /*
     * If it's a shared mapping, mark it clean in
     * the child
     */
    if (vm_flags & VM_SHARED)
        pte = pte_mkclean(pte);
    pte = pte_mkold(pte);

    if (!userfaultfd_wp(dst_vma))
        pte = pte_clear_uffd_wp(pte);

    set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte);
    return 0;
}

如果vma是cow mapping,pte将会被设置为写保护。如果写对应的内存就会触发page fault,最终由handle_pte_fault处理。

在分析handle_pte_fault时我们只关注了分配匿名页的情况,下面我们看看写时复制。

/*
 * Excerpt of handle_pte_fault(), truncated by the article after the
 * write/unshare check; elided parts are marked "...".
 *
 * Visible logic: for a fault carrying FAULT_FLAG_WRITE or FAULT_FLAG_UNSHARE,
 * a pte without write permission is handed to do_wp_page(); otherwise, for a
 * write fault, the entry is marked dirty.
 */
static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
{
    pte_t entry;
...
    if (vmf->flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) {
        /* pte is not writable: let the write-protect fault handler deal with it */
        if (!pte_write(entry))
            return do_wp_page(vmf);
        else if (likely(vmf->flags & FAULT_FLAG_WRITE))
            entry = pte_mkdirty(entry);

什么情况会走到处理写时复制的地方呢?如果pte不为0,页面在内存中,错误是写错误,页面非共享且当前pte没有可写属性,我们就认为此时发生了写时复制,do_wp_page会处理此错误。

/*
 * Excerpt of do_wp_page(); parts elided by the article are marked "...".
 *
 * The visible part looks up the page behind the faulting pte with
 * vm_normal_page(), stores it in vmf->page and finally returns the result
 * of wp_page_copy().  Annotated with __releases(vmf->ptl).
 *
 * NOTE(review): the elided section presumably contains the paths that do
 * not need a copy - confirm against the full source.
 */
static vm_fault_t do_wp_page(struct vm_fault *vmf)
    __releases(vmf->ptl)
{
    const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
    struct vm_area_struct *vma = vmf->vma;
    struct folio *folio = NULL;
    pte_t pte;
    ...
    /* Page mapped by the original pte, as returned by vm_normal_page(). */
    vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte);

...
    return wp_page_copy(vmf);
}

忽略special映射,vm_normal_page会返回普通映射页面。wp_page_copy会去处理写时复制的情况。

static vm_fault_t wp_page_copy(struct vm_fault *vmf)
{
    const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
    struct vm_area_struct *vma = vmf->vma;
    struct mm_struct *mm = vma->vm_mm;
    struct folio *old_folio = NULL;
    struct folio *new_folio = NULL;
    pte_t entry;
    int page_copied = 0;
    struct mmu_notifier_range range;
    vm_fault_t ret;
    bool pfn_is_zero;

    delayacct_wpcopy_start();

    if (vmf->page)
        old_folio = page_folio(vmf->page);
    //准备反向映射
    ret = vmf_anon_prepare(vmf);
    if (unlikely(ret))
        goto out;
       
    pfn_is_zero = is_zero_pfn(pte_pfn(vmf->orig_pte));
    //分配内存
    new_folio = folio_prealloc(mm, vma, vmf->address, pfn_is_zero);
    if (!new_folio)
        goto oom;

    if (!pfn_is_zero) {
        int err;
        //copy 旧页到新分配的页面
        err = __wp_page_copy_user(&new_folio->page, vmf->page, vmf);
        if (err) {
            /*
             * COW failed, if the fault was solved by other,
             * it's fine. If not, userspace would re-fault on
             * the same address and we will handle the fault
             * from the second attempt.
             * The -EHWPOISON case will not be retried.
             */
            folio_put(new_folio);
            if (old_folio)
                folio_put(old_folio);

            delayacct_wpcopy_end();
            return err == -EHWPOISON ? VM_FAULT_HWPOISON : 0;
        }
        kmsan_copy_page_meta(&new_folio->page, vmf->page);
    }

    __folio_mark_uptodate(new_folio);

    mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
                vmf->address & PAGE_MASK,
                (vmf->address & PAGE_MASK) + PAGE_SIZE);
    mmu_notifier_invalidate_range_start(&range);

    /*
     * Re-check the pte - we dropped the lock
     */
    vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl);
    if (likely(vmf->pte && pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
        if (old_folio) {
            if (!folio_test_anon(old_folio)) {
                dec_mm_counter(mm, mm_counter_file(&old_folio->page));
                inc_mm_counter(mm, MM_ANONPAGES);
            }
        } else {
            ksm_might_unmap_zero_page(mm, vmf->orig_pte);
            inc_mm_counter(mm, MM_ANONPAGES);
        }
        flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
        entry = mk_pte(&new_folio->page, vma->vm_page_prot);
        entry = pte_sw_mkyoung(entry);
        if (unlikely(unshare)) {
            if (pte_soft_dirty(vmf->orig_pte))
                entry = pte_mksoft_dirty(entry);
            if (pte_uffd_wp(vmf->orig_pte))
                entry = pte_mkuffd_wp(entry);
        } else {
            entry = maybe_mkwrite(pte_mkdirty(entry), vma);
        }

        /*
         * Clear the pte entry and flush it first, before updating the
         * pte with the new entry, to keep TLBs on different CPUs in
         * sync. This code used to set the new PTE then flush TLBs, but
         * that left a window where the new PTE could be loaded into
         * some TLBs while the old PTE remains in others.
         */
        ptep_clear_flush(vma, vmf->address, vmf->pte);
        //将vma的anon_vma设置到folio的i_mapping字段
        folio_add_new_anon_rmap(new_folio, vma, vmf->address);
//将folio加入lru folio_add_lru_vma(new_folio, vma); /* * We call the notify macro here because, when using secondary * mmu page tables (such as kvm shadow page tables), we want the * new page to be mapped directly into the secondary page table. */ BUG_ON(unshare && pte_write(entry)); //将新的page地址设置到pte上 set_pte_at_notify(mm, vmf->address, vmf->pte, entry); update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1); if (old_folio) { /* * Only after switching the pte to the new page may * we remove the mapcount here. Otherwise another * process may come and find the rmap count decremented * before the pte is switched to the new page, and * "reuse" the old page writing into it while our pte * here still points into it and can be read by other * threads. * * The critical issue is to order this * folio_remove_rmap_pte() with the ptp_clear_flush * above. Those stores are ordered by (if nothing else,) * the barrier present in the atomic_add_negative * in folio_remove_rmap_pte(); * * Then the TLB flush in ptep_clear_flush ensures that * no process can access the old page before the * decremented mapcount is visible. And the old page * cannot be reused until after the decremented * mapcount is visible. So transitively, TLBs to * old page will be flushed before it can be reused. */ //pte已经切换到新的page,反向映射该remove了 folio_remove_rmap_pte(old_folio, vmf->page, vma); } /* Free the old page.. */ new_folio = old_folio; page_copied = 1; pte_unmap_unlock(vmf->pte, vmf->ptl); } else if (vmf->pte) { update_mmu_tlb(vma, vmf->address, vmf->pte); pte_unmap_unlock(vmf->pte, vmf->ptl); } mmu_notifier_invalidate_range_end(&range); if (new_folio) folio_put(new_folio); if (old_folio) { if (page_copied) free_swap_cache(&old_folio->page); folio_put(old_folio); } delayacct_wpcopy_end(); return 0; oom: ret = VM_FAULT_OOM; out: if (old_folio) folio_put(old_folio); delayacct_wpcopy_end(); return ret; }

wp_page_copy的流程:

1. vmf_anon_prepare准备反向映射;

2. folio_prealloc分配新页;

3. __wp_page_copy_user将旧页的内容复制到新页;

4. folio_add_new_anon_rmap将vma->anon_vma + 1(即带上PAGE_MAPPING_ANON标记)设置到folio->mapping;

5. set_pte_at_notify设置新的page pfn到pte中;

6. folio_remove_rmap_pte删除旧的folio反向映射;

标签:folio,struct,vmf,pte,vma,内存,linux,page,写时
From: https://www.cnblogs.com/banshanjushi/p/17997552

相关文章

  • 【Linux驱动设备开发详解】14.Linux网络设备架构
    1.Linux网络设备驱动的结构与字符设备和块设备不同,网络设备并不对应于/dev目录下的文件,应用程序最终使用套接字完成与网络设备的接口。Linux系统对网络设备驱动定义了4个层次,这4个层次为:网络协议接口层:向网络层协议提供同一的数据包收发接口,无论是IP还是ARP,都是通过dev_queue_......
  • petalinux 交叉编译指定内核驱动
    需要编译u-dma-buf驱动。ThisrepositorycontainsaMakefie.MakefilehasthefollowingParameters:ParameterNameDescriptionDefaultValueARCHArchitectureName$(shelluname-m|sed-es/arm.*/arm/-es/aarch64.*/arm64/)KERNEL_SRCKernelSourc......
  • rockylinux8编译安装zabbix6.0.30-LTS
    zabbix6.和mysql安装系统环境:rockylinux8.10zabbix版本:zabbix-6.0.30LTS版本php版本:php7.2nginx版本:1.26mysql版本:mysql8#下载软件包wgethttps://cdn.zabbix.com/zabbix/sources/stable/6.0/zabbix-6.0.30.tar.gztarxvfzabbix-6.0.30.tar.gzln-s/tools/zabbix-6.0......
  • 【工作必备知识】Linux系统网络诊断与netstat命令
    【工作必备知识】Linux系统网络诊断与netstat命令大家好,我叫秋意零。今天分享一篇Linux系统中与网络相关的干货(包含相关面试题),有可能对你理解网络有一定帮助。同时工作中网络诊断也时常使用,对排查问题有帮助,绝对干货。如果有帮助记得点赞三连呀。netstat命令netstat......
  • linux内存管理(六)- 内核新struct - folio
    folio大概是5.16引入的,看起来像是page的封装,这里有一篇讲解folio很好的博客,论好名字的重要性:Linux内核page到folio的变迁-CSDN博客structfolio{/*private:don'tdocumenttheanonunion*/union{struct{/*public:*/unsignedlon......
  • 每天学一个 Linux 命令(6):shutdown
    Github地址:https://github.com/mingongge/Learn-a-Linux-command-every-day命令介绍shutdown命令可以用执行系统关机或系统重启,shutdown可以关闭系统的所有应用程序,并按用户的指定要求,进行系统关闭或重启的动作执行。此命令需要具备系统管理员权限才能使用。命令格式shutdo......
  • linux内存管理(五)- 缺页处理
    分析一下缺页的处理。缺页的意思是在访问内存的时候该地址还没有建好页表,页面尚未分配,或者页面被swap出去或者没有权限。缺页是同步异常,用户态发生缺页异常会等待内核解决,当然这一切对于用户态都是透明的。缺页处理的核心函数是do_page_fault,这个函数是架构相关的所以这个函数分布......
  • 在Linux中,当用户反馈网站访问慢,如何处理?
    当用户反馈网站访问慢时,在Linux环境中进行问题排查和解决可以遵循以下步骤:确认问题存在:首先,尝试复现问题。自己或让同事从不同地点和网络环境下访问网站,看是否同样慢。使用浏览器的开发者工具(如Chrome的Network面板)检查页面加载时间,识别哪个资源加载慢。定位问题源头:......
  • 在Linux中,文件权限有哪些?
    在Linux中,文件权限是确保系统安全的重要机制,它们控制着用户能够对文件或目录执行的操作类型。Linux文件权限分为以下几种基本类型:读权限(r):对于文件:允许用户查看文件的内容,例如使用cat、less或more命令阅读文件。对于目录:允许用户查看目录中的文件列表,即可以执行ls命令。......
  • 在Linux中,性能调优都有哪几种方法?
    在Linux中,性能调优是一个综合性的过程,旨在提升系统的运行效率、响应速度和资源利用率。以下是一些关键的性能调优方法:监控与分析使用工具如top,htop,vmstat,iostat,netstat,dstat,iftop,nmon等监控CPU使用率、内存使用、磁盘I/O、网络流量等,以便识别瓶颈。利用sysdig......