linux内存管理（十）- 页面回收（二）

标签：folio pgdat lru 内存 linux sc nr reclaim 页面

本篇了解一下内核是怎样触发页面回收的。

触发内存回收的方式有两种：同步回收和异步回收。alloc_pages在分配内存的时候，如果内存短缺会主动回收内存，这是同步回收；内核有一个或多个kswapd内核线程负责在后台回收内存，这是异步回收。

先看一下shrink_lruvec

/*
 * shrink_lruvec() - scan the evictable LRU lists of @lruvec for reclaim.
 *
 * @lruvec: the LRU vector whose lists are scanned.
 * @sc: scan control; sc->nr_to_reclaim and sc->priority are read here and
 *      sc->nr_reclaimed is increased by the number of pages reclaimed.
 *
 * If lru_gen_enabled() is true and this is not root reclaim, the work is
 * handed to lru_gen_shrink_lruvec() and nothing else is done.
 *
 * Otherwise get_scan_count() fills nr[] with a scan target per LRU list and
 * the lists are scanned through shrink_list() in batches of at most
 * SWAP_CLUSTER_MAX for as long as the inactive-anon, active-file or
 * inactive-file target is non-zero.  Once nr_to_reclaim pages have been
 * reclaimed (and proportional_reclaim is false) the loop either stops, or
 * zeroes the targets of the LRU type with less work left and scales down the
 * targets of the other type.
 *
 * Finally, one SWAP_CLUSTER_MAX batch of the active anon list is shrunk when
 * can_age_anon_pages() and inactive_is_low() both say so.
 */
static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
{
    unsigned long nr[NR_LRU_LISTS];
    unsigned long targets[NR_LRU_LISTS];
    unsigned long nr_to_scan;
    enum lru_list lru;
    unsigned long nr_reclaimed = 0;
    unsigned long nr_to_reclaim = sc->nr_to_reclaim;
    bool proportional_reclaim;
    struct blk_plug plug;

    if (lru_gen_enabled() && !root_reclaim(sc)) {
        lru_gen_shrink_lruvec(lruvec, sc);
        return;
    }
    /* Compute how many pages need to be scanned on each LRU list */
    get_scan_count(lruvec, sc, nr);

    /* Record the original scan target for proportional adjustments later */
    memcpy(targets, nr, sizeof(nr));

    /*
     * Global reclaiming within direct reclaim at DEF_PRIORITY is a normal
     * event that can occur when there is little memory pressure e.g.
     * multiple streaming readers/writers. Hence, we do not abort scanning
     * when the requested number of pages are reclaimed when scanning at
     * DEF_PRIORITY on the assumption that the fact we are direct
     * reclaiming implies that kswapd is not keeping up and it is best to
     * do a batch of work at once. For memcg reclaim one check is made to
     * abort proportional reclaim if either the file or anon lru has already
     * dropped to zero at the first pass.
     */
    proportional_reclaim = (!cgroup_reclaim(sc) && !current_is_kswapd() &&
                sc->priority == DEF_PRIORITY);

    blk_start_plug(&plug);
    while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||          /* the active file list is shrunk as well */
                    nr[LRU_INACTIVE_FILE]) {
        unsigned long nr_anon, nr_file, percentage;
        unsigned long nr_scanned;

        for_each_evictable_lru(lru) {
            if (nr[lru]) {
                /* Scan at most SWAP_CLUSTER_MAX pages of this list per pass */
                nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX);
                nr[lru] -= nr_to_scan;
                /* shrink_list() calls shrink_active_list() or shrink_inactive_list() */
                nr_reclaimed += shrink_list(lru, nr_to_scan,
                                lruvec, sc);
            }
        }

        cond_resched();

        /*
         * Keep scanning with the unmodified targets while the goal has not
         * been met, or whenever proportional_reclaim is set.
         */
        if (nr_reclaimed < nr_to_reclaim || proportional_reclaim)
            continue;

        /*
         * For kswapd and memcg, reclaim at least the number of pages
         * requested. Ensure that the anon and file LRUs are scanned
         * proportionally what was requested by get_scan_count(). We
         * stop reclaiming one LRU and reduce the amount scanning
         * proportional to the original scan target.
         */
        nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE];
        nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON];

        /*
         * It's just vindictive to attack the larger once the smaller
         * has gone to zero.  And given the way we stop scanning the
         * smaller below, this makes sure that we only make one nudge
         * towards proportionality once we've got nr_to_reclaim.
         */
        if (!nr_file || !nr_anon)
            break;

        /*
         * Pick the LRU type with less work remaining; percentage is the
         * share of its original target that is still unscanned.  The "+ 1"
         * keeps the divisor non-zero.
         */
        if (nr_file > nr_anon) {
            unsigned long scan_target = targets[LRU_INACTIVE_ANON] +
                        targets[LRU_ACTIVE_ANON] + 1;
            lru = LRU_BASE;
            percentage = nr_anon * 100 / scan_target;
        } else {
            unsigned long scan_target = targets[LRU_INACTIVE_FILE] +
                        targets[LRU_ACTIVE_FILE] + 1;
            lru = LRU_FILE;
            percentage = nr_file * 100 / scan_target;
        }

        /* Stop scanning the smaller of the LRU */
        nr[lru] = 0;
        nr[lru + LRU_ACTIVE] = 0;

        /*
         * Recalculate the other LRU scan count based on its original
         * scan target and the percentage scanning already complete
         */
        lru = (lru == LRU_FILE) ? LRU_BASE : LRU_FILE;
        nr_scanned = targets[lru] - nr[lru];
        nr[lru] = targets[lru] * (100 - percentage) / 100;
        /* min() clamps the subtraction so nr[lru] cannot wrap below zero */
        nr[lru] -= min(nr[lru], nr_scanned);

        /* Same recalculation for the active list of that LRU type */
        lru += LRU_ACTIVE;
        nr_scanned = targets[lru] - nr[lru];
        nr[lru] = targets[lru] * (100 - percentage) / 100;
        nr[lru] -= min(nr[lru], nr_scanned);
    }
    blk_finish_plug(&plug);
    sc->nr_reclaimed += nr_reclaimed;

    /*
     * Even if we did not try to evict anon pages at all, we want to
     * rebalance the anon lru active/inactive ratio.
     */
    if (can_age_anon_pages(lruvec_pgdat(lruvec), sc) &&
        inactive_is_low(lruvec, LRU_INACTIVE_ANON))
        shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
                   sc, LRU_ACTIVE_ANON);
}

看看shrink_list

/*
 * shrink_list() - scan up to @nr_to_scan pages of one LRU list of @lruvec.
 *
 * An inactive list is passed to shrink_inactive_list() and that function's
 * result is returned.
 *
 * An active list never contributes to the return value (always 0): it is
 * passed to shrink_active_list() only when bit (1 << is_file_lru(lru)) is
 * set in sc->may_deactivate; otherwise sc->skipped_deactivate is set to 1 to
 * record that the list was skipped.
 */
static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
                 struct lruvec *lruvec, struct scan_control *sc)
{
    /* Inactive lists are the only ones handled by the reclaiming path. */
    if (!is_active_lru(lru))
        return shrink_inactive_list(nr_to_scan, lruvec, sc, lru);

    /* Deactivation of this LRU type is not permitted: just note the skip. */
    if (!(sc->may_deactivate & (1 << is_file_lru(lru)))) {
        sc->skipped_deactivate = 1;
        return 0;
    }

    shrink_active_list(nr_to_scan, lruvec, sc, lru);
    return 0;
}

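如果当前是active lru，且sc->may_deactivate允许deactivate这种类型的lru，那么调用shrink_active_list；否则只把sc->skipped_deactivate置1。这两种情况都返回0。如果当前是inactive lru，则调用shrink_inactive_list并返回它回收的页数。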

/*
 * shrink_active_list() moves folios from the active LRU to the inactive LRU.
 *
 * We move them the other way if the folio is referenced by one or more
 * processes.
 *
 * If the folios are mostly unmapped, the processing is fast and it is
 * appropriate to hold lru_lock across the whole operation.  But if
 * the folios are mapped, the processing is slow (folio_referenced()), so
 * we should drop lru_lock around each folio.  It's impossible to balance
 * this, so instead we remove the folios from the LRU while processing them.
 * It is safe to rely on the active flag against the non-LRU folios in here
 * because nobody will play with that bit on a non-LRU folio.
 *
 * The downside is that we have to touch folio->_refcount against each folio.
 * But we had to alter folio->flags anyway.
 *
 * @nr_to_scan: number of pages to isolate from the list for scanning.
 * @lruvec: the LRU vector that owns the list; its lru_lock is taken here.
 * @sc: scan control, passed on to isolate_lru_folios(); its
 *      target_mem_cgroup and priority fields are also read.
 * @lru: which list of @lruvec to isolate from.
 *
 * The function works in three phases: isolate folios onto the local l_hold
 * list under lru_lock, classify each folio onto l_active or l_inactive with
 * the lock dropped, then move both lists back to the LRU under lru_lock.
 */
static void shrink_active_list(unsigned long nr_to_scan,
                   struct lruvec *lruvec,
                   struct scan_control *sc,
                   enum lru_list lru)
{
    unsigned long nr_taken;
    unsigned long nr_scanned;
    unsigned long vm_flags;
    LIST_HEAD(l_hold);    /* The folios which were snipped off */
    LIST_HEAD(l_active);
    LIST_HEAD(l_inactive);
    unsigned nr_deactivate, nr_activate;
    unsigned nr_rotated = 0;
    int file = is_file_lru(lru);
    struct pglist_data *pgdat = lruvec_pgdat(lruvec);
    /* Drain the per-CPU LRU cache */
    lru_add_drain();

    spin_lock_irq(&lruvec->lru_lock);
    /*
     * Detach the folios to be scanned from the LRU onto l_hold for later
     * processing; presumably this is done to shorten the time lru_lock is
     * held (see the function header comment).
     */
    nr_taken = isolate_lru_folios(nr_to_scan, lruvec, &l_hold,
                     &nr_scanned, sc, lru);

    /* NR_ISOLATED_ANON + file selects the anon or the file isolated counter */
    __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);

    if (!cgroup_reclaim(sc))
        __count_vm_events(PGREFILL, nr_scanned);
    __count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned);

    spin_unlock_irq(&lruvec->lru_lock);
    /* Walk l_hold */
    while (!list_empty(&l_hold)) {
        struct folio *folio;

        cond_resched();
        folio = lru_to_folio(&l_hold);
        list_del(&folio->lru);

        if (unlikely(!folio_evictable(folio))) {
            folio_putback_lru(folio);
            continue;
        }
        /* TODO: the buffer_heads_over_limit case is left for later analysis */
        if (unlikely(buffer_heads_over_limit)) {
            if (folio_needs_release(folio) &&
                folio_trylock(folio)) {
                filemap_release_folio(folio, 0);
                folio_unlock(folio);
            }
        }

        /* Referenced or rmap lock contention: rotate */
        if (folio_referenced(folio, 0, sc->target_mem_cgroup,
                     &vm_flags) != 0) {
            /*
             * Identify referenced, file-backed active folios and
             * give them one more trip around the active list. So
             * that executable code get better chances to stay in
             * memory under moderate memory pressure.  Anon folios
             * are not likely to be evicted by use-once streaming
             * IO, plus JVM can create lots of anon VM_EXEC folios,
             * so we ignore them here.
             */
            /* Executable file cache that was referenced goes back to the active list */
            if ((vm_flags & VM_EXEC) && folio_is_file_lru(folio)) {
                nr_rotated += folio_nr_pages(folio);
                list_add(&folio->lru, &l_active);
                continue;
            }
        }
        /* Clear the folio's active flag */
        folio_clear_active(folio);    /* we are de-activating */
        folio_set_workingset(folio);
        /* Add the folio to the temporary inactive list */
        list_add(&folio->lru, &l_inactive);
    }

    /*
     * Move folios back to the lru list.
     */
    spin_lock_irq(&lruvec->lru_lock);

    nr_activate = move_folios_to_lru(lruvec, &l_active);
    nr_deactivate = move_folios_to_lru(lruvec, &l_inactive);
    /* Keep all free folios in l_active list */
    list_splice(&l_inactive, &l_active);

    __count_vm_events(PGDEACTIVATE, nr_deactivate);
    __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate);

    /* Undo the isolated-page accounting done right after isolation */
    __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
    spin_unlock_irq(&lruvec->lru_lock);

    if (nr_rotated)
        lru_note_cost(lruvec, file, 0, nr_rotated);
    /* Whatever is still on l_active at this point is uncharged and freed */
    mem_cgroup_uncharge_list(&l_active);
    free_unref_page_list(&l_active);
    trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate,
            nr_deactivate, nr_rotated, sc->priority, file);
}

看看shrink_inactive_list

/*
 * shrink_inactive_list() - isolate folios from an inactive LRU list and try
 * to reclaim them with shrink_folio_list().
 *
 * @nr_to_scan: number of pages to isolate from the list for scanning.
 * @lruvec: the LRU vector that owns the list; its lru_lock is taken here.
 * @sc: scan control; the sc->nr.* statistics are updated before returning.
 * @lru: which list of @lruvec to isolate from.
 *
 * Returns the number of pages reclaimed, with these special cases:
 *  - 0 if too_many_isolated() is still true after throttling once;
 *  - SWAP_CLUSTER_MAX if a fatal signal is pending after throttling;
 *  - 0 if no folio could be isolated.
 */
static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
        struct lruvec *lruvec, struct scan_control *sc,
        enum lru_list lru)
{
    LIST_HEAD(folio_list);
    unsigned long nr_scanned;
    unsigned int nr_reclaimed = 0;
    unsigned long nr_taken;
    struct reclaim_stat stat;
    bool file = is_file_lru(lru);
    enum vm_event_item item;
    struct pglist_data *pgdat = lruvec_pgdat(lruvec);
    bool stalled = false;

    /* Throttle at most once; give up if still over the limit afterwards */
    while (unlikely(too_many_isolated(pgdat, file, sc))) {
        if (stalled)
            return 0;

        /* wait a bit for the reclaimer. */
        stalled = true;
        reclaim_throttle(pgdat, VMSCAN_THROTTLE_ISOLATED);

        /* We are about to die and free our memory. Return now. */
        if (fatal_signal_pending(current))
            return SWAP_CLUSTER_MAX;
    }

    lru_add_drain();

    spin_lock_irq(&lruvec->lru_lock);
    /* Detach the LRU folios to be scanned onto folio_list */
    nr_taken = isolate_lru_folios(nr_to_scan, lruvec, &folio_list,
                     &nr_scanned, sc, lru);

    /* NR_ISOLATED_ANON + file selects the anon or the file isolated counter */
    __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
    item = PGSCAN_KSWAPD + reclaimer_offset();
    if (!cgroup_reclaim(sc))
        __count_vm_events(item, nr_scanned);
    __count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned);
    __count_vm_events(PGSCAN_ANON + file, nr_scanned);

    spin_unlock_irq(&lruvec->lru_lock);

    if (nr_taken == 0)
        return 0;
    /* Reclaim the folios on folio_list; returns the number of pages reclaimed */
    nr_reclaimed = shrink_folio_list(&folio_list, pgdat, sc, &stat, false);

    spin_lock_irq(&lruvec->lru_lock);
    /* Folios that were not reclaimed go back onto the LRU */
    move_folios_to_lru(lruvec, &folio_list);

    /* Undo the isolated-page accounting done right after isolation */
    __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
    item = PGSTEAL_KSWAPD + reclaimer_offset();
    if (!cgroup_reclaim(sc))
        __count_vm_events(item, nr_reclaimed);
    __count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed);
    __count_vm_events(PGSTEAL_ANON + file, nr_reclaimed);
    spin_unlock_irq(&lruvec->lru_lock);

    lru_note_cost(lruvec, file, stat.nr_pageout, nr_scanned - nr_reclaimed);
    mem_cgroup_uncharge_list(&folio_list);
    /* Anything still left on folio_list is freed (back to the buddy allocator) */
    free_unref_page_list(&folio_list);

    /*
     * If dirty folios are scanned that are not queued for IO, it
     * implies that flushers are not doing their job. This can
     * happen when memory pressure pushes dirty folios to the end of
     * the LRU before the dirty limits are breached and the dirty
     * data has expired. It can also happen when the proportion of
     * dirty folios grows not through writes but through memory
     * pressure reclaiming all the clean cache. And in some cases,
     * the flushers simply cannot keep up with the allocation
     * rate. Nudge the flusher threads in case they are asleep.
     */
    if (stat.nr_unqueued_dirty == nr_taken) {
        wakeup_flusher_threads(WB_REASON_VMSCAN);
        /*
         * For cgroupv1 dirty throttling is achieved by waking up
         * the kernel flusher here and later waiting on folios
         * which are in writeback to finish (see shrink_folio_list()).
         *
         * Flusher may not be able to issue writeback quickly
         * enough for cgroupv1 writeback throttling to work
         * on a large system.
         */
        if (!writeback_throttling_sane(sc))
            reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK);
    }

    /* Fold this batch's statistics into the scan-control totals */
    sc->nr.dirty += stat.nr_dirty;
    sc->nr.congested += stat.nr_congested;
    sc->nr.unqueued_dirty += stat.nr_unqueued_dirty;
    sc->nr.writeback += stat.nr_writeback;
    sc->nr.immediate += stat.nr_immediate;
    sc->nr.taken += nr_taken;
    if (file)
        sc->nr.file_taken += nr_taken;

    trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id,
            nr_scanned, nr_reclaimed, &stat, sc->priority, file);
    return nr_reclaimed;
}

shrink_inactive_list调用shrink_folio_list去回收页。

看一下shrink_folio_list

/*
 * shrink_folio_list() - try to reclaim every folio on @folio_list.
 *
 * @folio_list: folios to process.  The list is emptied by the main loop;
 *              before returning, the folios that were kept (ret_folios) are
 *              spliced back onto it.
 * @pgdat: node whose node_id and flags (PGDAT_WRITEBACK, PGDAT_DIRTY) are
 *         consulted.
 * @sc: scan control; sc->nr_scanned is increased for every folio that could
 *      be locked.
 * @stat: cleared on entry and then filled with the statistics of this call.
 * @ignore_references: when true, folio_check_references() is not called and
 *                     each folio keeps the default FOLIOREF_RECLAIM.
 *
 * Inside the loop a folio is normally routed to one of three local lists:
 * free_folios (freed at the end of the function), demote_folios (handed to
 * demote_folio_list()) or ret_folios (given back to the caller, possibly
 * after being marked active).  Large folios that are reclaimed are destroyed
 * directly with destroy_large_folio().
 *
 * The labels near the bottom of the loop fall through into each other:
 * activate_locked_split -> activate_locked -> keep_locked -> keep.
 *
 * Returns the number of pages reclaimed, including the count returned by
 * demote_folio_list().
 */
static unsigned int shrink_folio_list(struct list_head *folio_list,
        struct pglist_data *pgdat, struct scan_control *sc,
        struct reclaim_stat *stat, bool ignore_references)
{
    LIST_HEAD(ret_folios);
    LIST_HEAD(free_folios);
    LIST_HEAD(demote_folios);
    unsigned int nr_reclaimed = 0;
    unsigned int pgactivate = 0;
    bool do_demote_pass;
    struct swap_iocb *plug = NULL;

    memset(stat, 0, sizeof(*stat));
    cond_resched();
    do_demote_pass = can_demote(pgdat->node_id, sc);

retry:
    /* Walk folio_list */
    while (!list_empty(folio_list)) {
        struct address_space *mapping;
        struct folio *folio;
        enum folio_references references = FOLIOREF_RECLAIM;
        bool dirty, writeback;
        unsigned int nr_pages;

        cond_resched();

        folio = lru_to_folio(folio_list);
        list_del(&folio->lru);

        /* A folio that cannot be locked right now is simply kept */
        if (!folio_trylock(folio))
            goto keep;

        VM_BUG_ON_FOLIO(folio_test_active(folio), folio);

        nr_pages = folio_nr_pages(folio);

        /* Account the number of base pages */
        sc->nr_scanned += nr_pages;

        if (unlikely(!folio_evictable(folio)))
            goto activate_locked;

        if (!sc->may_unmap && folio_mapped(folio))
            goto keep_locked;

        /* folio_update_gen() tried to promote this page? */
        if (lru_gen_enabled() && !ignore_references &&
            folio_mapped(folio) && folio_test_referenced(folio))
            goto keep_locked;

        /*
         * The number of dirty pages determines if a node is marked
         * reclaim_congested. kswapd will stall and start writing
         * folios if the tail of the LRU is all dirty unqueued folios.
         */
        folio_check_dirty_writeback(folio, &dirty, &writeback);
        if (dirty || writeback)
            stat->nr_dirty += nr_pages;

        if (dirty && !writeback)
            stat->nr_unqueued_dirty += nr_pages;

        /*
         * Treat this folio as congested if folios are cycling
         * through the LRU so quickly that the folios marked
         * for immediate reclaim are making it to the end of
         * the LRU a second time.
         */
        if (writeback && folio_test_reclaim(folio))
            stat->nr_congested += nr_pages;

        /*
         * If a folio at the tail of the LRU is under writeback, there
         * are three cases to consider.
         *
         * 1) If reclaim is encountering an excessive number
         *    of folios under writeback and this folio has both
         *    the writeback and reclaim flags set, then it
         *    indicates that folios are being queued for I/O but
         *    are being recycled through the LRU before the I/O
         *    can complete. Waiting on the folio itself risks an
         *    indefinite stall if it is impossible to writeback
         *    the folio due to I/O error or disconnected storage
         *    so instead note that the LRU is being scanned too
         *    quickly and the caller can stall after the folio
         *    list has been processed.
         *
         * 2) Global or new memcg reclaim encounters a folio that is
         *    not marked for immediate reclaim, or the caller does not
         *    have __GFP_FS (or __GFP_IO if it's simply going to swap,
         *    not to fs). In this case mark the folio for immediate
         *    reclaim and continue scanning.
         *
         *    Require may_enter_fs() because we would wait on fs, which
         *    may not have submitted I/O yet. And the loop driver might
         *    enter reclaim, and deadlock if it waits on a folio for
         *    which it is needed to do the write (loop masks off
         *    __GFP_IO|__GFP_FS for this reason); but more thought
         *    would probably show more reasons.
         *
         * 3) Legacy memcg encounters a folio that already has the
         *    reclaim flag set. memcg does not have any dirty folio
         *    throttling so we could easily OOM just because too many
         *    folios are in writeback and there is nothing else to
         *    reclaim. Wait for the writeback to complete.
         *
         * In cases 1) and 2) we activate the folios to get them out of
         * the way while we continue scanning for clean folios on the
         * inactive list and refilling from the active list. The
         * observation here is that waiting for disk writes is more
         * expensive than potentially causing reloads down the line.
         * Since they're marked for immediate reclaim, they won't put
         * memory pressure on the cache working set any longer than it
         * takes to write them to disk.
         */
        if (folio_test_writeback(folio)) {
            /* Case 1 above */
            if (current_is_kswapd() &&
                folio_test_reclaim(folio) &&
                test_bit(PGDAT_WRITEBACK, &pgdat->flags)) {
                stat->nr_immediate += nr_pages;
                goto activate_locked;

            /* Case 2 above */
            } else if (writeback_throttling_sane(sc) ||
                !folio_test_reclaim(folio) ||
                !may_enter_fs(folio, sc->gfp_mask)) {
                /*
                 * This is slightly racy -
                 * folio_end_writeback() might have
                 * just cleared the reclaim flag, then
                 * setting the reclaim flag here ends up
                 * interpreted as the readahead flag - but
                 * that does not matter enough to care.
                 * What we do want is for this folio to
                 * have the reclaim flag set next time
                 * memcg reclaim reaches the tests above,
                 * so it will then wait for writeback to
                 * avoid OOM; and it's also appropriate
                 * in global reclaim.
                 */
                folio_set_reclaim(folio);
                stat->nr_writeback += nr_pages;
                goto activate_locked;

            /* Case 3 above */
            } else {
                folio_unlock(folio);
                folio_wait_writeback(folio);
                /* then go back and try same folio again */
                list_add_tail(&folio->lru, folio_list);
                continue;
            }
        }

        if (!ignore_references)
            /* Decide how the current folio has to be handled */
            references = folio_check_references(folio, sc);

        switch (references) {
        case FOLIOREF_ACTIVATE:
            goto activate_locked;
        case FOLIOREF_KEEP:
            stat->nr_ref_keep += nr_pages;
            goto keep_locked;
        case FOLIOREF_RECLAIM:
        case FOLIOREF_RECLAIM_CLEAN:
            ; /* try to reclaim the folio below */
        }

        /*
         * Before reclaiming the folio, try to relocate
         * its contents to another node.
         */
        if (do_demote_pass &&
            (thp_migration_supported() || !folio_test_large(folio))) {
            /* The folio can be migrated to another node */
            list_add(&folio->lru, &demote_folios);
            folio_unlock(folio);
            continue;
        }

        /*
         * Anonymous process memory has backing store?
         * Try to allocate it some swap space here.
         * Lazyfree folio could be freed directly
         */
        if (folio_test_anon(folio) && folio_test_swapbacked(folio)) {
            if (!folio_test_swapcache(folio)) {
                if (!(sc->gfp_mask & __GFP_IO))
                    goto keep_locked;
                if (folio_maybe_dma_pinned(folio))
                    goto keep_locked;
                if (folio_test_large(folio)) {
                    /* cannot split folio, skip it */
                    if (!can_split_folio(folio, NULL))
                        goto activate_locked;
                    /*
                     * Split folios without a PMD map right
                     * away. Chances are some or all of the
                     * tail pages can be freed without IO.
                     */
                    if (!folio_entire_mapcount(folio) &&
                        split_folio_to_list(folio,
                                folio_list))
                        goto activate_locked;
                }
                if (!add_to_swap(folio)) {
                    if (!folio_test_large(folio))
                        goto activate_locked_split;
                    /* Fallback to swap normal pages */
                    if (split_folio_to_list(folio,
                                folio_list))
                        goto activate_locked;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
                    count_memcg_folio_events(folio, THP_SWPOUT_FALLBACK, 1);
                    count_vm_event(THP_SWPOUT_FALLBACK);
#endif
                    if (!add_to_swap(folio))
                        goto activate_locked_split;
                }
            }
        } else if (folio_test_swapbacked(folio) &&
               folio_test_large(folio)) {
            /* Split shmem folio */
            if (split_folio_to_list(folio, folio_list))
                goto keep_locked;
        }

        /*
         * If the folio was split above, the tail pages will make
         * their own pass through this function and be accounted
         * then.
         */
        if ((nr_pages > 1) && !folio_test_large(folio)) {
            sc->nr_scanned -= (nr_pages - 1);
            nr_pages = 1;
        }

        /*
         * The folio is mapped into the page tables of one or more
         * processes. Try to unmap it here.
         */
        if (folio_mapped(folio)) {
            enum ttu_flags flags = TTU_BATCH_FLUSH;
            bool was_swapbacked = folio_test_swapbacked(folio);

            if (folio_test_pmd_mappable(folio))
                flags |= TTU_SPLIT_HUGE_PMD;

            try_to_unmap(folio, flags);
            if (folio_mapped(folio)) {
                stat->nr_unmap_fail += nr_pages;
                if (!was_swapbacked &&
                    folio_test_swapbacked(folio))
                    stat->nr_lazyfree_fail += nr_pages;
                goto activate_locked;
            }
        }

        /*
         * Folio is unmapped now so it cannot be newly pinned anymore.
         * No point in trying to reclaim folio if it is pinned.
         * Furthermore we don't want to reclaim underlying fs metadata
         * if the folio is pinned and thus potentially modified by the
         * pinning process as that may upset the filesystem.
         */
        if (folio_maybe_dma_pinned(folio))
            goto activate_locked;

        mapping = folio_mapping(folio);
        if (folio_test_dirty(folio)) {
            /*
             * Only kswapd can writeback filesystem folios
             * to avoid risk of stack overflow. But avoid
             * injecting inefficient single-folio I/O into
             * flusher writeback as much as possible: only
             * write folios when we've encountered many
             * dirty folios, and when we've already scanned
             * the rest of the LRU for clean folios and see
             * the same dirty folios again (with the reclaim
             * flag set).
             */
            if (folio_is_file_lru(folio) &&
                (!current_is_kswapd() ||
                 !folio_test_reclaim(folio) ||
                 !test_bit(PGDAT_DIRTY, &pgdat->flags))) {
                /*
                 * Immediately reclaim when written back.
                 * Similar in principle to folio_deactivate()
                 * except we already have the folio isolated
                 * and know it's dirty
                 */
                node_stat_mod_folio(folio, NR_VMSCAN_IMMEDIATE,
                        nr_pages);
                folio_set_reclaim(folio);

                goto activate_locked;
            }

            if (references == FOLIOREF_RECLAIM_CLEAN)
                goto keep_locked;
            if (!may_enter_fs(folio, sc->gfp_mask))
                goto keep_locked;
            if (!sc->may_writepage)
                goto keep_locked;

            /*
             * Folio is dirty. Flush the TLB if a writable entry
             * potentially exists to avoid CPU writes after I/O
             * starts and then write it out here.
             */
            try_to_unmap_flush_dirty();
            switch (pageout(folio, mapping, &plug)) {
            case PAGE_KEEP:
                goto keep_locked;
            case PAGE_ACTIVATE:
                goto activate_locked;
            case PAGE_SUCCESS:
                stat->nr_pageout += nr_pages;

                if (folio_test_writeback(folio))
                    goto keep;
                if (folio_test_dirty(folio))
                    goto keep;

                /*
                 * A synchronous write - probably a ramdisk.  Go
                 * ahead and try to reclaim the folio.
                 */
                if (!folio_trylock(folio))
                    goto keep;
                if (folio_test_dirty(folio) ||
                    folio_test_writeback(folio))
                    goto keep_locked;
                mapping = folio_mapping(folio);
                fallthrough;
            case PAGE_CLEAN:
                ; /* try to free the folio below */
            }
        }

        /*
         * If the folio has buffers, try to free the buffer
         * mappings associated with this folio. If we succeed
         * we try to free the folio as well.
         *
         * We do this even if the folio is dirty.
         * filemap_release_folio() does not perform I/O, but it
         * is possible for a folio to have the dirty flag set,
         * but it is actually clean (all its buffers are clean).
         * This happens if the buffers were written out directly,
         * with submit_bh(). ext3 will do this, as well as
         * the blockdev mapping.  filemap_release_folio() will
         * discover that cleanness and will drop the buffers
         * and mark the folio clean - it can be freed.
         *
         * Rarely, folios can have buffers and no ->mapping.
         * These are the folios which were not successfully
         * invalidated in truncate_cleanup_folio().  We try to
         * drop those buffers here and if that worked, and the
         * folio is no longer mapped into process address space
         * (refcount == 1) it can be freed.  Otherwise, leave
         * the folio on the LRU so it is swappable.
         */
        if (folio_needs_release(folio)) {
            if (!filemap_release_folio(folio, sc->gfp_mask))
                goto activate_locked;
            if (!mapping && folio_ref_count(folio) == 1) {
                folio_unlock(folio);
                if (folio_put_testzero(folio))
                    goto free_it;
                else {
                    /*
                     * rare race with speculative reference.
                     * the speculative reference will free
                     * this folio shortly, so we may
                     * increment nr_reclaimed here (and
                     * leave it off the LRU).
                     */
                    nr_reclaimed += nr_pages;
                    continue;
                }
            }
        }

        if (folio_test_anon(folio) && !folio_test_swapbacked(folio)) {
            /* follow __remove_mapping for reference */
            if (!folio_ref_freeze(folio, 1))
                goto keep_locked;
            /*
             * The folio has only one reference left, which is
             * from the isolation. After the caller puts the
             * folio back on the lru and drops the reference, the
             * folio will be freed anyway. It doesn't matter
             * which lru it goes on. So we don't bother checking
             * the dirty flag here.
             */
            count_vm_events(PGLAZYFREED, nr_pages);
            count_memcg_folio_events(folio, PGLAZYFREED, nr_pages);
        } else if (!mapping || !__remove_mapping(mapping, folio, true,
                             sc->target_mem_cgroup))
            goto keep_locked;

        folio_unlock(folio);
free_it:
        /*
         * Folio may get swapped out as a whole, need to account
         * all pages in it.
         */
        nr_reclaimed += nr_pages;

        /*
         * Is there need to periodically free_folio_list? It would
         * appear not as the counts should be low
         */
        if (unlikely(folio_test_large(folio)))
            destroy_large_folio(folio);
        else
            /* This folio is going to be freed: queue it on the free_folios list */
            list_add(&folio->lru, &free_folios);
        continue;

activate_locked_split:
        /*
         * The tail pages that are failed to add into swap cache
         * reach here.  Fixup nr_scanned and nr_pages.
         */
        if (nr_pages > 1) {
            sc->nr_scanned -= (nr_pages - 1);
            nr_pages = 1;
        }
activate_locked:
        /* Not a candidate for swapping, so reclaim swap space. */
        if (folio_test_swapcache(folio) &&
            (mem_cgroup_swap_full(folio) || folio_test_mlocked(folio)))
            folio_free_swap(folio);
        VM_BUG_ON_FOLIO(folio_test_active(folio), folio);
        if (!folio_test_mlocked(folio)) {
            /* type indexes stat->nr_activate[]: value of folio_is_file_lru() */
            int type = folio_is_file_lru(folio);
            folio_set_active(folio);
            stat->nr_activate[type] += nr_pages;
            count_memcg_folio_events(folio, PGACTIVATE, nr_pages);
        }
keep_locked:
        folio_unlock(folio);
keep:
        /* Kept folios are collected on ret_folios and handed back at the end */
        list_add(&folio->lru, &ret_folios);
        VM_BUG_ON_FOLIO(folio_test_lru(folio) ||
                folio_test_unevictable(folio), folio);
    }
    /* 'folio_list' is always empty here */

    /* Migrate folios selected for demotion */
    nr_reclaimed += demote_folio_list(&demote_folios, pgdat);
    /* Folios that could not be demoted are still in @demote_folios */
    if (!list_empty(&demote_folios)) {
        /* Folios which weren't demoted go back on @folio_list */
        list_splice_init(&demote_folios, folio_list);

        /*
         * goto retry to reclaim the undemoted folios in folio_list if
         * desired.
         *
         * Reclaiming directly from top tier nodes is not often desired
         * due to it breaking the LRU ordering: in general memory
         * should be reclaimed from lower tier nodes and demoted from
         * top tier nodes.
         *
         * However, disabling reclaim from top tier nodes entirely
         * would cause ooms in edge scenarios where lower tier memory
         * is unreclaimable for whatever reason, eg memory being
         * mlocked or too hot to reclaim. We can disable reclaim
         * from top tier nodes in proactive reclaim though as that is
         * not real memory pressure.
         */
        if (!sc->proactive) {
            /* Second pass runs with demotion off, so it cannot loop again */
            do_demote_pass = false;
            goto retry;
        }
    }

    pgactivate = stat->nr_activate[0] + stat->nr_activate[1];

    mem_cgroup_uncharge_list(&free_folios);
    try_to_unmap_flush();
    /* Free the pages to the buddy allocator; this is where reclaim really happens */
    free_unref_page_list(&free_folios);

    list_splice(&ret_folios, folio_list);
    count_vm_events(PGACTIVATE, pgactivate);

    if (plug)
        swap_write_unplug(plug);
    return nr_reclaimed;
}

shrink_folio_list是一个很复杂的函数，现在还没完全看懂。回头看。

目前来看，决定哪些页面被扫描的函数是isolate_lru_folios：

/*
 * Walk one LRU list of @lruvec and move the folios that can be isolated
 * onto @dst.
 *
 * @nr_to_scan: scan budget in pages; skipped folios are not charged to it.
 * @lruvec:     lruvec whose list @lru is walked.
 * @dst:        list that receives the isolated folios.
 * @nr_scanned: out parameter, set to the number of pages looked at,
 *              skipped folios included.
 * @sc:         scan control; reclaim_idx, may_unmap and order are read here.
 * @lru:        which of @lruvec's lists to walk.
 *
 * Returns the number of pages moved to @dst.
 */
static unsigned long isolate_lru_folios(unsigned long nr_to_scan,
        struct lruvec *lruvec, struct list_head *dst,
        unsigned long *nr_scanned, struct scan_control *sc,
        enum lru_list lru)
{
    struct list_head *src = &lruvec->lists[lru];
    unsigned long nr_taken = 0;
    unsigned long nr_zone_taken[MAX_NR_ZONES] = { 0 };
    unsigned long nr_skipped[MAX_NR_ZONES] = { 0, };
    unsigned long skipped = 0;
    unsigned long scan, total_scan, nr_pages;
    LIST_HEAD(folios_skipped);

    total_scan = 0;
    scan = 0;
    while (scan < nr_to_scan && !list_empty(src)) {
        /* Default destination: back onto the list being scanned. */
        struct list_head *move_to = src;
        struct folio *folio;

        folio = lru_to_folio(src);
        prefetchw_prev_lru_folio(folio, src, flags);

        nr_pages = folio_nr_pages(folio);
        total_scan += nr_pages;

        /*
         * Folios in a zone above sc->reclaim_idx, or rejected by
         * skip_cma(), are parked on a private list and accounted
         * per zone; they do not advance 'scan'.
         */
        if (folio_zonenum(folio) > sc->reclaim_idx ||
                skip_cma(folio, sc)) {
            nr_skipped[folio_zonenum(folio)] += nr_pages;
            move_to = &folios_skipped;
            goto move;
        }

        /*
         * Do not count skipped folios because that makes the function
         * return with no isolated folios if the LRU mostly contains
         * ineligible folios.  This causes the VM to not reclaim any
         * folios, triggering a premature OOM.
         * Account all pages in a folio.
         */
        scan += nr_pages;

        /* Each failed check below leaves move_to == src. */
        if (!folio_test_lru(folio))
            goto move;
        if (!sc->may_unmap && folio_mapped(folio))
            goto move;

        /*
         * Be careful not to clear the lru flag until after we're
         * sure the folio is not being freed elsewhere -- the
         * folio release code relies on it.
         */
        if (unlikely(!folio_try_get(folio)))
            goto move;

        if (!folio_test_clear_lru(folio)) {
            /* Another thread is already isolating this folio */
            folio_put(folio);
            goto move;
        }

        /* Isolation succeeded: account it and hand the folio to @dst. */
        nr_taken += nr_pages;
        nr_zone_taken[folio_zonenum(folio)] += nr_pages;
        move_to = dst;
move:
        list_move(&folio->lru, move_to);
    }

    /*
     * Splice any skipped folios to the start of the LRU list. Note that
     * this disrupts the LRU order when reclaiming for lower zones but
     * we cannot splice to the tail. If we did then the SWAP_CLUSTER_MAX
     * scanning would soon rescan the same folios to skip and waste lots
     * of cpu cycles.
     */
    if (!list_empty(&folios_skipped)) {
        int zid;

        list_splice(&folios_skipped, src);
        for (zid = 0; zid < MAX_NR_ZONES; zid++) {
            if (!nr_skipped[zid])
                continue;

            __count_zid_vm_events(PGSCAN_SKIP, zid, nr_skipped[zid]);
            skipped += nr_skipped[zid];
        }
    }
    /* Report everything that was looked at, skipped folios included. */
    *nr_scanned = total_scan;
    trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan,
                    total_scan, skipped, nr_taken, lru);
    update_lru_sizes(lruvec, lru, nr_zone_taken);
    return nr_taken;
}

扫描lru链表，满足要求就加到folio_list中，后面会扫描folio_list从中找出要回收的页面。get_scan_count函数会计算各lru链表需要扫描的数量。

/*
 * Determine how aggressively the anon and file LRU lists should be
 * scanned.
 *
 * nr[0] = anon inactive folios to scan; nr[1] = anon active folios to scan
 * nr[2] = file inactive folios to scan; nr[3] = file active folios to scan
 *
 * @lruvec: lruvec whose list sizes are sampled.
 * @sc:     scan control; may_swap, priority, file_is_tiny, cache_trim_mode,
 *          anon_cost/file_cost, reclaim_idx, target_mem_cgroup and
 *          memcg_low_reclaim are read, memcg_low_skipped may be set.
 * @nr:     output array; one entry is written for every evictable LRU.
 *
 * The first half picks a scan_balance mode (and, for SCAN_FRACT, the
 * anon/file fractions); the loop after the out: label turns that mode
 * into a per-list scan target.
 */
static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
               unsigned long *nr)
{
    struct pglist_data *pgdat = lruvec_pgdat(lruvec);
    struct mem_cgroup *memcg = lruvec_memcg(lruvec);
    unsigned long anon_cost, file_cost, total_cost;
    int swappiness = mem_cgroup_swappiness(memcg);
    u64 fraction[ANON_AND_FILE];
    u64 denominator = 0;    /* gcc */
    enum scan_balance scan_balance;
    unsigned long ap, fp;
    enum lru_list lru;

    /* If we have no swap space, do not bother scanning anon folios. */
    if (!sc->may_swap || !can_reclaim_anon_pages(memcg, pgdat->node_id, sc)) {
        scan_balance = SCAN_FILE;
        goto out;
    }

    /*
     * Global reclaim will swap to prevent OOM even with no
     * swappiness, but memcg users want to use this knob to
     * disable swapping for individual groups completely when
     * using the memory controller's swap limit feature would be
     * too expensive.
     */
    if (cgroup_reclaim(sc) && !swappiness) {
        scan_balance = SCAN_FILE;
        goto out;
    }

    /*
     * Do not apply any pressure balancing cleverness when the
     * system is close to OOM, scan both anon and file equally
     * (unless the swappiness setting disagrees with swapping).
     */
    if (!sc->priority && swappiness) {
        scan_balance = SCAN_EQUAL;
        goto out;
    }

    /*
     * If the system is almost out of file pages, force-scan anon.
     */
    if (sc->file_is_tiny) {
        scan_balance = SCAN_ANON;
        goto out;
    }

    /*
     * If there is enough inactive page cache, we do not reclaim
     * anything from the anonymous working right now.
     */
    if (sc->cache_trim_mode) {
        scan_balance = SCAN_FILE;
        goto out;
    }

    scan_balance = SCAN_FRACT;
    /*
     * Calculate the pressure balance between anon and file pages.
     *
     * The amount of pressure we put on each LRU is inversely
     * proportional to the cost of reclaiming each list, as
     * determined by the share of pages that are refaulting, times
     * the relative IO cost of bringing back a swapped out
     * anonymous page vs reloading a filesystem page (swappiness).
     *
     * Although we limit that influence to ensure no list gets
     * left behind completely: at least a third of the pressure is
     * applied, before swappiness.
     *
     * With swappiness at 100, anon and file have equal IO cost.
     */
    total_cost = sc->anon_cost + sc->file_cost;
    anon_cost = total_cost + sc->anon_cost;
    file_cost = total_cost + sc->file_cost;
    total_cost = anon_cost + file_cost;

    /* The "+ 1" terms keep both divisions away from a zero divisor. */
    ap = swappiness * (total_cost + 1);
    ap /= anon_cost + 1;

    /* File gets the complement of swappiness out of 200. */
    fp = (200 - swappiness) * (total_cost + 1);
    fp /= file_cost + 1;

    fraction[0] = ap;
    fraction[1] = fp;
    denominator = ap + fp;
out:
    for_each_evictable_lru(lru) {
        int file = is_file_lru(lru);
        unsigned long lruvec_size;
        unsigned long low, min;
        unsigned long scan;

        lruvec_size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
        mem_cgroup_protection(sc->target_mem_cgroup, memcg,
                      &min, &low);

        if (min || low) {
            /*
             * Scale a cgroup's reclaim pressure by proportioning
             * its current usage to its memory.low or memory.min
             * setting.
             *
             * This is important, as otherwise scanning aggression
             * becomes extremely binary -- from nothing as we
             * approach the memory protection threshold, to totally
             * nominal as we exceed it.  This results in requiring
             * setting extremely liberal protection thresholds. It
             * also means we simply get no protection at all if we
             * set it too low, which is not ideal.
             *
             * If there is any protection in place, we reduce scan
             * pressure by how much of the total memory used is
             * within protection thresholds.
             *
             * There is one special case: in the first reclaim pass,
             * we skip over all groups that are within their low
             * protection. If that fails to reclaim enough pages to
             * satisfy the reclaim goal, we come back and override
             * the best-effort low protection. However, we still
             * ideally want to honor how well-behaved groups are in
             * that case instead of simply punishing them all
             * equally. As such, we reclaim them based on how much
             * memory they are using, reducing the scan pressure
             * again by how much of the total memory used is under
             * hard protection.
             */
            unsigned long cgroup_size = mem_cgroup_size(memcg);
            unsigned long protection;

            /* memory.low scaling, make sure we retry before OOM */
            if (!sc->memcg_low_reclaim && low > min) {
                protection = low;
                sc->memcg_low_skipped = 1;
            } else {
                protection = min;
            }

            /* Avoid TOCTOU with earlier protection check */
            cgroup_size = max(cgroup_size, protection);

            scan = lruvec_size - lruvec_size * protection /
                (cgroup_size + 1);

            /*
             * Minimally target SWAP_CLUSTER_MAX pages to keep
             * reclaim moving forwards, avoiding decrementing
             * sc->priority further than desirable.
             */
            scan = max(scan, SWAP_CLUSTER_MAX);
        } else {
            scan = lruvec_size;
        }

        /* A smaller priority value leaves a larger scan target. */
        scan >>= sc->priority;

        /*
         * If the cgroup's already been deleted, make sure to
         * scrape out the remaining cache.
         */
        if (!scan && !mem_cgroup_online(memcg))
            scan = min(lruvec_size, SWAP_CLUSTER_MAX);

        switch (scan_balance) {
        case SCAN_EQUAL:
            /* Scan lists relative to size */
            break;
        case SCAN_FRACT:
            /*
             * Scan types proportional to swappiness and
             * their relative recent reclaim efficiency.
             * Make sure we don't miss the last page on
             * the offlined memory cgroups because of a
             * round-off error.
             */
            scan = mem_cgroup_online(memcg) ?
                   div64_u64(scan * fraction[file], denominator) :
                   DIV64_U64_ROUND_UP(scan * fraction[file],
                          denominator);
            break;
        case SCAN_FILE:
        case SCAN_ANON:
            /* Scan one type exclusively */
            if ((scan_balance == SCAN_FILE) != file)
                scan = 0;
            break;
        default:
            /* Look ma, no brain */
            BUG();
        }

        nr[lru] = scan;
    }
}

计算的关键参数是swappiness。从上面的 fp = (200 - swappiness) 可以看出，这个值的范围是0-200（较老的内核是0-100），默认60，值越大需要被扫描的匿名页越多；100表示换回匿名页和重新读入文件cache页的IO代价相同，两者承受同样的回收压力。这个值可以通过/proc/sys/vm/swappiness修改。

上面我们是按直接回收的路径分析的，下面看看异步回收的路径。

异步内存回收是通过一个内核线程kswapd，它的初始化路径是

/*
 * Init-time entry point for background reclaim: runs swap_setup() and
 * then asks kswapd_run() to start a kswapd thread for each node in the
 * N_MEMORY state.  Always returns 0.
 */
static int __init kswapd_init(void)
{
        int node;

        swap_setup();

        /* One kswapd thread per node that has memory. */
        for_each_node_state(node, N_MEMORY) {
                kswapd_run(node);
        }

        return 0;
}

module_init(kswapd_init)

/*
 * This kswapd start function will be called by init and node-hot-add.
 *
 * Starts the kswapd thread of node @nid unless pgdat->kswapd is already
 * set.  The check and the assignment both happen between
 * pgdat_kswapd_lock() and pgdat_kswapd_unlock().  If thread creation
 * fails, pgdat->kswapd is reset to NULL after the error is logged.
 */
void __meminit kswapd_run(int nid)
{
        pg_data_t *node = NODE_DATA(nid);

        pgdat_kswapd_lock(node);

        /* A thread is already recorded for this node: nothing to do. */
        if (node->kswapd)
                goto unlock;

        /* Create the kswapd thread and wake it up. */
        node->kswapd = kthread_run(kswapd, node, "kswapd%d", nid);
        if (!IS_ERR(node->kswapd))
                goto unlock;

        /* failure at boot is fatal */
        pr_err("Failed to start kswapd on node %d，ret=%ld\n",
               nid, PTR_ERR(node->kswapd));
        BUG_ON(system_state < SYSTEM_RUNNING);
        node->kswapd = NULL;

unlock:
        pgdat_kswapd_unlock(node);
}

在初始化node的时候会初始化kswapd_wait。

/*
 * Abridged excerpt ("..." marks code left out, so the braces shown do not
 * balance).  The part kept here shows only that the per-node loop
 * (for_each_node) leads to a free_area_init_node(nid) call; the exact
 * conditions around that call are in the omitted lines.
 */
void __init free_area_init(unsigned long *max_zone_pfn)
{
...
        for_each_node(nid) {
                pg_data_t *pgdat;

                if (!node_online(nid)) {
                     ...
                        free_area_init_node(nid);
...
}

/*
 * Abridged excerpt ("..." marks omitted code, including the definition
 * of pgdat).  Shown only for its tail: it calls free_area_init_core()
 * and then lru_gen_init_pgdat() on the node's pgdat.
 */
static void __init free_area_init_node(int nid)
{
...
        free_area_init_core(pgdat);
        lru_gen_init_pgdat(pgdat);
}

/*
 * Abridged excerpt ("..." marks omitted code).  The relevant step is the
 * pgdat_init_internals() call, made before the omitted remainder of the
 * function.
 */
static void __init free_area_init_core(struct pglist_data *pgdat)
{
        enum zone_type j;
        int nid = pgdat->node_id;

        pgdat_init_internals(pgdat);
...
}

/*
 * Initialise the bookkeeping members of a node's pglist_data.  Besides
 * calling the pgdat_*_init helpers it sets up the wait queue heads
 * kswapd_wait and pfmemalloc_wait plus every reclaim_wait[] slot, and
 * finishes with the node's own lruvec.
 */
static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
{
        int slot = 0;

        pgdat_resize_init(pgdat);
        pgdat_kswapd_lock_init(pgdat);

        pgdat_init_split_queue(pgdat);
        pgdat_init_kcompactd(pgdat);

        /* Wait queue heads: kswapd, pfmemalloc, then each reclaim_wait[] slot. */
        init_waitqueue_head(&pgdat->kswapd_wait);
        init_waitqueue_head(&pgdat->pfmemalloc_wait);
        while (slot < NR_VMSCAN_THROTTLE) {
                init_waitqueue_head(&pgdat->reclaim_wait[slot]);
                slot++;
        }

        pgdat_page_ext_init(pgdat);
        lruvec_init(&pgdat->__lruvec);
}

可知kswapd_wait是每个node都有一个。

但是光有一个等待队列头没用，还得把kswapd线程加进队列。

/*
 * The background pageout daemon, started as a kernel thread
 * from the init process.
 *
 * This basically trickles out pages so that we have _some_
 * free memory available even if there is no other activity
 * that frees anything up. This is needed for things like routing
 * etc, where we otherwise might have all activity going on in
 * asynchronous contexts that cannot page things out.
 *
 * If there are applications that are active memory-allocators
 * (most normal use), this basically shouldn't matter.
 *
 * @p is the pg_data_t of the node this thread serves.  Returns 0 once
 * kthread_should_stop() reports that the thread has been asked to stop.
 */
static int kswapd(void *p)
{
    unsigned int alloc_order, reclaim_order;
    unsigned int highest_zoneidx = MAX_NR_ZONES - 1;
    pg_data_t *pgdat = (pg_data_t *)p;
    struct task_struct *tsk = current;
    const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);

    /* Restrict the thread to its node's CPUs when that mask is not empty. */
    if (!cpumask_empty(cpumask))
        set_cpus_allowed_ptr(tsk, cpumask);

    /*
     * Tell the memory management that we're a "memory allocator",
     * and that if we need more memory we should get access to it
     * regardless (see "__alloc_pages()"). "kswapd" should
     * never get caught in the normal page freeing logic.
     *
     * (Kswapd normally doesn't need memory anyway, but sometimes
     * you need a small amount of memory in order to be able to
     * page out something else, and this flag essentially protects
     * us from recursively trying to free more memory as we're
     * trying to free the first piece of memory in the first place).
     */
    tsk->flags |= PF_MEMALLOC | PF_KSWAPD;
    set_freezable();

    /* Start with no pending request: order 0, zone index MAX_NR_ZONES. */
    WRITE_ONCE(pgdat->kswapd_order, 0);
    WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES);
    atomic_set(&pgdat->nr_writeback_throttled, 0);
    /* Everything above runs once, when the thread starts. */
    for ( ; ; ) {
        /* From here on the thread stays in this loop until told to stop. */
        bool ret;

        alloc_order = reclaim_order = READ_ONCE(pgdat->kswapd_order);
        highest_zoneidx = kswapd_highest_zoneidx(pgdat,
                            highest_zoneidx);

kswapd_try_sleep:
        /* Queue ourselves on the wait queue, schedule away and wait to be woken. */
        kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order,
                    highest_zoneidx);

        /* Read the new order and highest_zoneidx */
        alloc_order = READ_ONCE(pgdat->kswapd_order);
        highest_zoneidx = kswapd_highest_zoneidx(pgdat,
                            highest_zoneidx);
        /* The request has been picked up; reset the pending values. */
        WRITE_ONCE(pgdat->kswapd_order, 0);
        WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES);

        ret = try_to_freeze();
        if (kthread_should_stop())
            break;

        /*
         * We can speed up thawing tasks if we don't call balance_pgdat
         * after returning from the refrigerator
         */
        if (ret)
            continue;

        /*
         * Reclaim begins at the requested order but if a high-order
         * reclaim fails then kswapd falls back to reclaiming for
         * order-0. If that happens, kswapd will consider sleeping
         * for the order it finished reclaiming at (reclaim_order)
         * but kcompactd is woken to compact for the original
         * request (alloc_order).
         */
        trace_mm_vmscan_kswapd_wake(pgdat->node_id, highest_zoneidx,
                        alloc_order);
        /* The actual reclaim work happens here. */
        reclaim_order = balance_pgdat(pgdat, alloc_order,
                        highest_zoneidx);
        if (reclaim_order < alloc_order)
            goto kswapd_try_sleep;
    }

    tsk->flags &= ~(PF_MEMALLOC | PF_KSWAPD);

    return 0;
}

/*
 * Put the calling kswapd thread to sleep on pgdat->kswapd_wait.
 *
 * The sleep has two stages, each gated by prepare_kswapd_sleep(): first
 * a timed sleep of HZ/10, then -- only if that sleep ran its full length
 * and the check still passes -- an open-ended schedule().
 *
 * @alloc_order is handed to kcompactd; @reclaim_order and
 * @highest_zoneidx are what the sleep checks are made against.
 * Returns immediately if the task is freezing or has been asked to stop.
 */
static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order,
                unsigned int highest_zoneidx)
{
    long remaining = 0;
    DEFINE_WAIT(wait);

    if (freezing(current) || kthread_should_stop())
        return;
    /* Add ourselves to the kswapd_wait wait queue. */
    prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);

    /*
     * Try to sleep for a short interval. Note that kcompactd will only be
     * woken if it is possible to sleep for a short interval. This is
     * deliberate on the assumption that if reclaim cannot keep an
     * eligible zone balanced that it's also unlikely that compaction will
     * succeed.
     */
    /* Stage one: a short nap. */
    if (prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) {
        /*
         * Compaction records what page blocks it recently failed to
         * isolate pages from and skips them in the future scanning.
         * When kswapd is going to sleep, it is reasonable to assume
         * that pages and compaction may succeed so reset the cache.
         */
        reset_isolation_suitable(pgdat);

        /*
         * We have freed the memory, now we should compact it to make
         * allocation of the requested order possible.
         */
        wakeup_kcompactd(pgdat, alloc_order, highest_zoneidx);

        remaining = schedule_timeout(HZ/10);

        /*
         * If woken prematurely then reset kswapd_highest_zoneidx and
         * order. The values will either be from a wakeup request or
         * the previous request that slept prematurely.
         */
        if (remaining) {
            WRITE_ONCE(pgdat->kswapd_highest_zoneidx,
                    kswapd_highest_zoneidx(pgdat,
                            highest_zoneidx));

            if (READ_ONCE(pgdat->kswapd_order) < reclaim_order)
                WRITE_ONCE(pgdat->kswapd_order, reclaim_order);
        }

        /* Leave the wait queue and rejoin it before the second check. */
        finish_wait(&pgdat->kswapd_wait, &wait);
        prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
    }

    /*
     * After a short sleep, check if it was a premature sleep. If not, then
     * go fully to sleep until explicitly woken up.
     */
    if (!remaining &&
        prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) {
        trace_mm_vmscan_kswapd_sleep(pgdat->node_id);

        /*
         * vmstat counters are not perfectly accurate and the estimated
         * value for counters such as NR_FREE_PAGES can deviate from the
         * true value by nr_online_cpus * threshold. To avoid the zone
         * watermarks being breached while under pressure, we reduce the
         * per-cpu vmstat threshold while kswapd is awake and restore
         * them before going back to sleep.
         */
        set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);

        if (!kthread_should_stop())
            schedule();

        set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
    } else {
        /* No full sleep: count which of the two cases prevented it. */
        if (remaining)
            count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
        else
            count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY);
    }
    finish_wait(&pgdat->kswapd_wait, &wait);
}

唤醒kswapd通常发生在分配内存时，调用路径是：alloc_pages()-->__alloc_pages()（5.13之前的内核叫__alloc_pages_nodemask()）-->__alloc_pages_slowpath()-->wake_all_kswapds()-->wakeup_kswapd()

/*
 * A zone is low on free memory or too fragmented for high-order memory.  If
 * kswapd should reclaim (direct reclaim is deferred), wake it up for the zone's
 * pgdat.  It will wake up kcompactd after reclaiming memory.  If kswapd reclaim
 * has failed or is not needed, still wake up kcompactd if only compaction is
 * needed.
 *
 * The requested @order and @highest_zoneidx are recorded in the pgdat (only
 * ever raising what is already pending) before the decision to wake anyone
 * is taken.
 */
void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
                   enum zone_type highest_zoneidx)
{
        pg_data_t *node;
        enum zone_type pending_idx;

        /* Nothing to do for unmanaged zones or zones the cpuset forbids. */
        if (!managed_zone(zone) || !cpuset_zone_allowed(zone, gfp_flags))
                return;

        node = zone->zone_pgdat;

        /* MAX_NR_ZONES means "no request pending"; otherwise only raise. */
        pending_idx = READ_ONCE(node->kswapd_highest_zoneidx);
        if (pending_idx == MAX_NR_ZONES || pending_idx < highest_zoneidx)
                WRITE_ONCE(node->kswapd_highest_zoneidx, highest_zoneidx);

        if (READ_ONCE(node->kswapd_order) < order)
                WRITE_ONCE(node->kswapd_order, order);

        /* Nobody is sleeping on the queue, so there is no one to wake. */
        if (!waitqueue_active(&node->kswapd_wait))
                return;

        /*
         * Wake kswapd only while the node is not hopeless and is either
         * unbalanced or has a boosted watermark.
         */
        if (node->kswapd_failures < MAX_RECLAIM_RETRIES &&
            (!pgdat_balanced(node, order, highest_zoneidx) ||
             pgdat_watermark_boosted(node, highest_zoneidx))) {
                trace_mm_vmscan_wakeup_kswapd(node->node_id, highest_zoneidx,
                                              order, gfp_flags);
                wake_up_interruptible(&node->kswapd_wait);
                return;
        }

        /*
         * Hopeless node, leave it to direct reclaim if possible.
         *
         * There may be plenty of free memory available, but it's too
         * fragmented for high-order allocations.  Wake up kcompactd
         * and rely on compaction_suitable() to determine if it's
         * needed.  If it fails, it will defer subsequent attempts to
         * ratelimit its work.
         */
        if (!(gfp_flags & __GFP_DIRECT_RECLAIM))
                wakeup_kcompactd(node, order, highest_zoneidx);
}

下面看看balance_pgdat

/*
 * For kswapd, balance_pgdat() will reclaim pages across a node from zones
 * that are eligible for use by the caller until at least one zone is
 * balanced.
 *
 * Returns the order kswapd finished reclaiming at.
 *
 * kswapd scans the zones in the highmem->normal->dma direction.  It skips
 * zones which have free_pages > high_wmark_pages(zone), but once a zone is
 * found to have free_pages <= high_wmark_pages(zone), any page in that zone
 * or lower is eligible for reclaim until at least one usable zone is
 * balanced.
 *
 * @pgdat:           node to balance.
 * @order:           allocation order the wakeup was for; the starting sc.order.
 * @highest_zoneidx: highest zone index that is considered.
 */
static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx)
{
    int i;
    unsigned long nr_soft_reclaimed;
    unsigned long nr_soft_scanned;
    unsigned long pflags;
    unsigned long nr_boost_reclaim;
    unsigned long zone_boosts[MAX_NR_ZONES] = { 0, };
    bool boosted;
    struct zone *zone;
    /* Control structure describing this reclaim run. */
    struct scan_control sc = {
        .gfp_mask = GFP_KERNEL,
        .order = order,
        .may_unmap = 1,
    };
    /* Register sc.reclaim_state with the current task; reset to NULL on exit. */
    set_task_reclaim_state(current, &sc.reclaim_state);
    psi_memstall_enter(&pflags);
    __fs_reclaim_acquire(_THIS_IP_);

    count_vm_event(PAGEOUTRUN);

    /*
     * Account for the reclaim boost. Note that the zone boost is left in
     * place so that parallel allocations that are near the watermark will
     * stall or direct reclaim until kswapd is finished.
     */
    nr_boost_reclaim = 0;
    for (i = 0; i <= highest_zoneidx; i++) {
        zone = pgdat->node_zones + i;
        if (!managed_zone(zone))
            continue;

        nr_boost_reclaim += zone->watermark_boost;
        zone_boosts[i] = zone->watermark_boost;
    }
    /* Remember whether any boost existed; nr_boost_reclaim is consumed below. */
    boosted = nr_boost_reclaim;

restart:
    /*
     * Presumably marks the zones up to highest_zoneidx as being under
     * active reclaim (ZONE_RECLAIM_ACTIVE) -- the helper is not shown
     * here; clear_reclaim_active() at out: is its counterpart.
     */
    set_reclaim_active(pgdat, highest_zoneidx);
    /*
     * The priority controls how much is scanned: get_scan_count() shifts
     * each LRU's scan target right by sc->priority.  (The original note
     * gives DEF_PRIORITY as 12; its definition is outside this excerpt.)
     */
    sc.priority = DEF_PRIORITY;
    do {
        unsigned long nr_reclaimed = sc.nr_reclaimed;
        bool raise_priority = true;
        bool balanced;
        bool ret;

        sc.reclaim_idx = highest_zoneidx;

        /*
         * If the number of buffer_heads exceeds the maximum allowed
         * then consider reclaiming from all zones. This has a dual
         * purpose -- on 64-bit systems it is expected that
         * buffer_heads are stripped during active rotation. On 32-bit
         * systems, highmem pages can pin lowmem memory and shrinking
         * buffers can relieve lowmem pressure. Reclaim may still not
         * go ahead if all eligible zones for the original allocation
         * request are balanced to avoid excessive reclaim from kswapd.
         */
        if (buffer_heads_over_limit) {
            for (i = MAX_NR_ZONES - 1; i >= 0; i--) {
                zone = pgdat->node_zones + i;
                if (!managed_zone(zone))
                    continue;

                sc.reclaim_idx = i;
                break;
            }
        }

        /*
         * If the pgdat is imbalanced then ignore boosting and preserve
         * the watermarks for a later time and restart. Note that the
         * zone watermarks will be still reset at the end of balancing
         * on the grounds that the normal reclaim should be enough to
         * re-evaluate if boosting is required when kswapd next wakes.
         */
        /*
         * Presumably true when some eligible zone has free pages above
         * its high watermark and can satisfy a 2^order allocation --
         * pgdat_balanced() itself is not shown here.
         */
        balanced = pgdat_balanced(pgdat, sc.order, highest_zoneidx);
        if (!balanced && nr_boost_reclaim) {
            nr_boost_reclaim = 0;
            goto restart;
        }

        /*
         * If boosting is not active then only reclaim if there are no
         * eligible zones. Note that sc.reclaim_idx is not used as
         * buffer_heads_over_limit may have adjusted it.
         */
        /* No boost pending and already balanced: nothing to reclaim. */
        if (!nr_boost_reclaim && balanced)
            goto out;

        /* Limit the priority of boosting to avoid reclaim writeback */
        if (nr_boost_reclaim && sc.priority == DEF_PRIORITY - 2)
            raise_priority = false;

        /*
         * Do not writeback or swap pages for boosted reclaim. The
         * intent is to relieve pressure not issue sub-optimal IO
         * from reclaim context. If no pages are reclaimed, the
         * reclaim will be aborted.
         */
        sc.may_writepage = !laptop_mode && !nr_boost_reclaim;
        sc.may_swap = !nr_boost_reclaim;

        /*
         * Do some background aging, to give pages a chance to be
         * referenced before reclaiming. All pages are rotated
         * regardless of classzone as this is about consistent aging.
         */
        /* NOTE(review): kswapd_age_node() is not shown here; see the comment above for its intent. */
        kswapd_age_node(pgdat, &sc);

        /*
         * If we're getting trouble reclaiming, start doing writepage
         * even in laptop mode.
         */
        if (sc.priority < DEF_PRIORITY - 2)
            sc.may_writepage = 1;

        /* Call soft limit reclaim before calling shrink_node. */
        sc.nr_scanned = 0;
        nr_soft_scanned = 0;
        nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(pgdat, sc.order,
                        sc.gfp_mask, &nr_soft_scanned);
        sc.nr_reclaimed += nr_soft_reclaimed;

        /*
         * There should be no need to raise the scanning priority if
         * enough pages are already being scanned that that high
         * watermark would be met at 100% efficiency.
         */
        /* Enter the reclaim path. */
        if (kswapd_shrink_node(pgdat, &sc))
            raise_priority = false;

        /*
         * If the low watermark is met there is no need for processes
         * to be throttled on pfmemalloc_wait as they should now be
         * able to safely make forward progress. Wake them
         */

        /* Wake every waiter on pfmemalloc_wait once allow_direct_reclaim() agrees. */
        if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
                allow_direct_reclaim(pgdat))
            wake_up_all(&pgdat->pfmemalloc_wait);

        /* Check if kswapd should be suspending */
        __fs_reclaim_release(_THIS_IP_);
        ret = try_to_freeze();
        __fs_reclaim_acquire(_THIS_IP_);
        if (ret || kthread_should_stop())
            break;

        /*
         * Raise priority if scanning rate is too low or there was no
         * progress in reclaiming pages
         */
        /* From here on nr_reclaimed holds what this pass alone reclaimed. */
        nr_reclaimed = sc.nr_reclaimed - nr_reclaimed;
        nr_boost_reclaim -= min(nr_boost_reclaim, nr_reclaimed);

        /*
         * If reclaim made no progress for a boost, stop reclaim as
         * IO cannot be queued and it could be an infinite loop in
         * extreme circumstances.
         */
        /* Boost reclaim is still outstanding but this pass freed nothing: give up. */
        if (nr_boost_reclaim && !nr_reclaimed)
            break;

        if (raise_priority || !nr_reclaimed)
            sc.priority--;
    } while (sc.priority >= 1);    /* each lower priority value scans more pages */

    if (!sc.nr_reclaimed)
        pgdat->kswapd_failures++;     /* nothing was reclaimed in this run: record it on the node */

out:
    clear_reclaim_active(pgdat, highest_zoneidx);

    /* If reclaim was boosted, account for the reclaim done in this pass */
    if (boosted) {
        unsigned long flags;

        for (i = 0; i <= highest_zoneidx; i++) {
            if (!zone_boosts[i])
                continue;

            /* Increments are under the zone lock */
            zone = pgdat->node_zones + i;
            spin_lock_irqsave(&zone->lock, flags);
            zone->watermark_boost -= min(zone->watermark_boost, zone_boosts[i]);
            spin_unlock_irqrestore(&zone->lock, flags);
        }

        /*
         * As there is now likely space, wakeup kcompact to defragment
         * pageblocks.
         */
        wakeup_kcompactd(pgdat, pageblock_order, highest_zoneidx);
    }

    snapshot_refaults(NULL, pgdat);
    __fs_reclaim_release(_THIS_IP_);
    psi_memstall_leave(&pflags);
    set_task_reclaim_state(current, NULL);

    /*
     * Return the order kswapd stopped reclaiming at as
     * prepare_kswapd_sleep() takes it into account. If another caller
     * entered the allocator slow path while kswapd was awake, order will
     * remain at the higher level.
     */
    return sc.order;
}

kswapd_shrink_node是shrink_node的包装。

/*
 * kswapd's wrapper around shrink_node().
 *
 * Sets sc->nr_to_reclaim from the high watermarks of the managed zones
 * up to sc->reclaim_idx, runs shrink_node(), and may drop sc->order to 0
 * afterwards.  Returns true when at least sc->nr_to_reclaim pages were
 * scanned.
 */
static bool kswapd_shrink_node(pg_data_t *pgdat,
                   struct scan_control *sc)
{
    int idx;

    /*
     * Reclaim a number of pages proportional to the number of zones:
     * each managed zone contributes its high watermark, but never less
     * than SWAP_CLUSTER_MAX.
     */
    sc->nr_to_reclaim = 0;
    for (idx = 0; idx <= sc->reclaim_idx; idx++) {
        struct zone *z = &pgdat->node_zones[idx];

        if (managed_zone(z))
            sc->nr_to_reclaim += max(high_wmark_pages(z), SWAP_CLUSTER_MAX);
    }

    /*
     * Historically care was taken to put equal pressure on all zones but
     * now pressure is applied based on node LRU order.
     */
    shrink_node(pgdat, sc);

    /*
     * Fragmentation may mean that the system cannot be rebalanced for
     * high-order allocations. If twice the allocation size has been
     * reclaimed then recheck watermarks only at order-0 to prevent
     * excessive reclaim. Assume that a process requested a high-order
     * can direct reclaim/compact.
     */
    if (sc->order != 0 && sc->nr_reclaimed >= compact_gap(sc->order))
        sc->order = 0;

    /* Did the scan cover at least as many pages as the reclaim target? */
    return sc->nr_to_reclaim <= sc->nr_scanned;
}

shrink_node_memcgs里面除了shrink_lruvec还有shrink_slab没有看。

/*
 * Abridged excerpt ("..." marks omitted code, including the definition
 * of lruvec).  Iterates over the memcgs returned by mem_cgroup_iter(),
 * starting from sc->target_mem_cgroup, and for each one calls
 * shrink_lruvec() followed by shrink_slab().
 */
static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
{
    struct mem_cgroup *target_memcg = sc->target_mem_cgroup;
    struct mem_cgroup *memcg;

    memcg = mem_cgroup_iter(target_memcg, NULL, NULL);
    do {
...        
        /* LRU pages first ... */
        shrink_lruvec(lruvec, sc);

        /* ... then the slab caches, for the same node, memcg and priority. */
        shrink_slab(sc->gfp_mask, pgdat->node_id, memcg,
                sc->priority);
...

    } while ((memcg = mem_cgroup_iter(target_memcg, memcg, NULL)));
}

看看shrink_slab

/**
 * shrink_slab - shrink slab caches
 * @gfp_mask: allocation context
 * @nid: node whose slab caches to target
 * @memcg: memory cgroup whose slab caches to target
 * @priority: the reclaim priority
 *
 * Call the shrink functions to age shrinkable caches.
 *
 * @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set,
 * unaware shrinkers will receive a node id of 0 instead.
 *
 * @memcg specifies the memory cgroup to target. Unaware shrinkers
 * are called only if it is the root cgroup.
 *
 * @priority is sc->priority, we take the number of objects and >> by priority
 * in order to get the scan target.
 *
 * Returns the number of reclaimed slab objects.
 */
unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg,
              int priority)
{
    unsigned long ret, freed = 0;
    struct shrinker *shrinker;

    /*
     * The root memcg might be allocated even though memcg is disabled
     * via "cgroup_disable=memory" boot parameter.  This could make
     * mem_cgroup_is_root() return false, then just run memcg slab
     * shrink, but skip global shrink.  This may result in premature
     * oom.
     */
    /* Non-root memcg with memcg enabled: take the per-memcg path and return. */
    if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg))
        return shrink_slab_memcg(gfp_mask, nid, memcg, priority);

    /*
     * lockless algorithm of global shrink.
     *
     * In the unregistration step, the shrinker will be freed asynchronously
     * via RCU after its refcount reaches 0. So both rcu_read_lock() and
     * shrinker_try_get() can be used to ensure the existence of the shrinker.
     *
     * So in the global shrink:
     *  step 1: use rcu_read_lock() to guarantee existence of the shrinker
     *          and the validity of the shrinker_list walk.
     *  step 2: use shrinker_try_get() to try get the refcount, if successful,
     *          then the existence of the shrinker can also be guaranteed,
     *          so we can release the RCU lock to do do_shrink_slab() that
     *          may sleep.
     *  step 3: *MUST* to reacquire the RCU lock before calling shrinker_put(),
     *          which ensures that neither this shrinker nor the next shrinker
     *          will be freed in the next traversal operation.
     *  step 4: do shrinker_put() paired with step 2 to put the refcount,
     *          if the refcount reaches 0, then wake up the waiter in
     *          shrinker_free() by calling complete().
     */
    rcu_read_lock();
    /* Walk the global shrinker_list (step 1: under the RCU read lock). */
    list_for_each_entry_rcu(shrinker, &shrinker_list, list) {
        struct shrink_control sc = {
            .gfp_mask = gfp_mask,
            .nid = nid,
            .memcg = memcg,
        };

        /* Step 2: skip shrinkers whose reference cannot be taken. */
        if (!shrinker_try_get(shrinker))
            continue;

        /* The reference pins the shrinker, so the RCU lock can be dropped. */
        rcu_read_unlock();
        /* Run this shrinker against its cache. */
        ret = do_shrink_slab(&sc, shrinker, priority);
        /* SHRINK_EMPTY is a sentinel, not a count: do not add it to freed. */
        if (ret == SHRINK_EMPTY)
            ret = 0;
        freed += ret;

        /* Steps 3 and 4: retake the RCU lock before dropping the reference. */
        rcu_read_lock();
        shrinker_put(shrinker);
    }

    rcu_read_unlock();
    /* Scheduling point after the whole list has been walked. */
    cond_resched();
    return freed;
}

很多模块会通过shrinker_register把自己的shrinker注册到全局的shrinker_list上；这里遍历shrinker_list，对每个shrinker调用do_shrink_slab来回收slab缓存。

/*
 * A callback you can register to apply pressure to ageable caches.
 *
 * @count_objects should return the number of freeable items in the cache. If
 * there are no objects to free, it should return SHRINK_EMPTY, while 0 is
 * returned in cases of the number of freeable items cannot be determined
 * or shrinker should skip this cache for this time (e.g., their number
 * is below shrinkable limit). No deadlock checks should be done during the
 * count callback - the shrinker relies on aggregating scan counts that couldn't
 * be executed due to potential deadlocks to be run at a later call when the
 * deadlock condition is no longer pending.
 *
 * @scan_objects will only be called if @count_objects returned a non-zero
 * value for the number of freeable objects. The callout should scan the cache
 * and attempt to free items from the cache. It should then return the number
 * of objects freed during the scan, or SHRINK_STOP if progress cannot be made
 * due to potential deadlocks. If SHRINK_STOP is returned, then no further
 * attempts to call the @scan_objects will be made from the current reclaim
 * context.
 *
 * @flags determine the shrinker abilities, like numa awareness
 */
struct shrinker {
    /* Reports how many objects in the cache are freeable (see above). */
    unsigned long (*count_objects)(struct shrinker *,
                       struct shrink_control *sc);
    /* Scans the cache and tries to free objects (see above). */
    unsigned long (*scan_objects)(struct shrinker *,
                      struct shrink_control *sc);

    long batch;    /* reclaim batch size, 0 = default */
    int seeks;    /* seeks to recreate an obj */
    unsigned flags;    /* shrinker abilities, e.g. numa awareness */

    /*
     * The reference count of this shrinker. Registered shrinker have an
     * initial refcount of 1, then the lookup operations are now allowed
     * to use it via shrinker_try_get(). Later in the unregistration step,
     * the initial refcount will be discarded, and will free the shrinker
     * asynchronously via RCU after its refcount reaches 0.
     */
    refcount_t refcount;
    struct completion done;    /* use to wait for refcount to reach 0 */
    /* NOTE(review): presumably used for the asynchronous RCU free - confirm. */
    struct rcu_head rcu;

    /* NOTE(review): looks like an opaque pointer for the shrinker's owner - confirm. */
    void *private_data;

    /* These are for internal use */
    /* Linkage used when walking the global shrinker_list. */
    struct list_head list;
#ifdef CONFIG_MEMCG
    /* ID in shrinker_idr */
    int id;
#endif
#ifdef CONFIG_SHRINKER_DEBUG
    /* NOTE(review): debugfs bookkeeping, only built with CONFIG_SHRINKER_DEBUG - confirm semantics. */
    int debugfs_id;
    const char *name;
    struct dentry *debugfs_entry;
#endif
    /* objs pending delete, per node */
    atomic_long_t *nr_deferred;
};

可以看到shrinker结构包含了一个cache的回调函数及一些参数：count_objects负责返回cache中可释放的对象数量，scan_objects负责扫描cache并尝试释放其中的对象。

页面回收先分析到这里，非常简略，很多地方尚不清楚。

标签：folio,pgdat,lru,内存,linux,sc,nr,reclaim,页面
From： https://www.cnblogs.com/banshanjushi/p/18004102

linux内存管理（十）- 页面回收（二）

相关文章

赞助商

阅读排行