Analysis of balance_dirty_pages_ratelimited
void balance_dirty_pages_ratelimited(struct address_space *mapping)
{
struct inode *inode = mapping->host;
//inode_to_bdi(inode) maps an inode to the backing_dev_info (BDI) of the device it lives on.
struct backing_dev_info *bdi = inode_to_bdi(inode);
//bdi_writeback describes a writeback context whose worker flushes dirty pages (pages modified in memory but not yet written to disk) to disk asynchronously.
struct bdi_writeback *wb = NULL;
int ratelimit;
int *p;
//backing_dev_info (BDI): a structure describing a block device (e.g. a disk), including its writeback behaviour.
//Dirty-page accounting: the kernel tracks which pages have been modified but not yet written to disk (dirty pages).
//bdi_cap_account_dirty() returns true if this device takes part in dirty-page accounting and false if it does not.
//BDI_CAP_NO_ACCT_DIRTY: if this capability flag is set in the BDI, the device is excluded from dirty accounting and the function simply returns.
if (!bdi_cap_account_dirty(bdi))
return;
//inode_cgwb_enabled() checks a series of conditions to decide whether cgroup writeback is enabled for this inode.
if (inode_cgwb_enabled(inode))
//Get or create the bdi_writeback associated with this BDI for the current cgroup; it represents the writeback context that will flush the dirty pages to disk.
wb = wb_get_create_current(bdi, GFP_KERNEL);
if (!wb)
//If no bdi_writeback could be obtained or created (wb == NULL), fall back to the default bdi_writeback embedded in the bdi.
wb = &bdi->wb;
//current is the task_struct of the current process.
//Fetch the task's dirty-page pause limit: how many pages it may dirty before it has to pause.
ratelimit = current->nr_dirtied_pause;
//wb->dirty_exceeded means this writeback context has exceeded its dirty limits.
if (wb->dirty_exceeded)
//If the device has exceeded its dirty limits, lower ratelimit so the task may dirty fewer pages before being throttled.
//With PAGE_SHIFT == 12: 32 >> (12 - 10) = 32 >> 2 = 8 pages.
//How this value throttles the task's dirtying rate is shown below.
ratelimit = min(ratelimit, 32 >> (PAGE_SHIFT - 10));
//Disable kernel preemption (we are about to work on per-CPU variables).
preempt_disable();
/*
* This prevents one CPU to accumulate too many dirtied pages without
* calling into balance_dirty_pages(), which can happen when there are
* 1000+ tasks, all of them start dirtying pages at exactly the same
* time, hence all honoured too large initial task->nr_dirtied_pause.
*/
//Get a pointer to this CPU's instance of the per-CPU variable bdp_ratelimits.
p = this_cpu_ptr(&bdp_ratelimits);
//If the task has already dirtied at least ratelimit pages, it will fall into balance_dirty_pages() below anyway, so simply reset this CPU's counter to 0.
if (unlikely(current->nr_dirtied >= ratelimit))
*p = 0;
//Otherwise, if this CPU's counter *p has reached the per-CPU threshold ratelimit_pages:
else if (unlikely(*p >= ratelimit_pages)) {
//reset the counter *p to 0, and
//set ratelimit to 0 so the current task is not allowed to dirty any more pages and is forced into balance_dirty_pages().
*p = 0;
ratelimit = 0;
}
/*
* Pick up the dirtied pages by the exited tasks. This avoids lots of
* short-lived tasks (eg. gcc invocations in a kernel build) escaping
* the dirty throttling and livelock other long-run dirtiers.
*/
// per-CPU variable recording dirty pages that were never charged to a specific task (the "leaked" dirty pages, e.g. left behind by exited tasks)
p = this_cpu_ptr(&dirty_throttle_leaks);
//Only absorb leaked pages if the task is still below ratelimit and there are leaked pages to pick up.
//In testing this branch is rarely taken; dirty_throttle_leaks is usually 0, occasionally greater than 0.
if (*p > 0 && current->nr_dirtied < ratelimit) {
unsigned long nr_pages_dirtied;
//Take the minimum of *p (the leaked dirty pages) and ratelimit - current->nr_dirtied (the pages this task may still dirty):
//never absorb more than what actually leaked, and
//after absorbing, current->nr_dirtied will not exceed ratelimit.
nr_pages_dirtied = min(*p, ratelimit - current->nr_dirtied);
//Subtract the absorbed pages from *p.
*p -= nr_pages_dirtied;
//Charge the absorbed pages to the task's counter current->nr_dirtied.
current->nr_dirtied += nr_pages_dirtied;
}
//The block above keeps leaked dirty pages from accumulating, which would otherwise let lots of short-lived tasks escape throttling and livelock the long-running dirtiers.
//Re-enable kernel preemption.
preempt_enable();
//If the task's dirtied-page count has reached the limit, run balance_dirty_pages().
if (unlikely(current->nr_dirtied >= ratelimit))
balance_dirty_pages(wb, current->nr_dirtied);
wb_put(wb);
}
So balance_dirty_pages() runs when:
- the current task has dirtied at least ratelimit pages:
(unlikely(current->nr_dirtied >= ratelimit))
- the count still reaches the limit even after compensating from the leak pool:
- the kernel tries to charge pages from dirty_throttle_leaks (the pool of leaked dirty pages) to the current task;
- after that compensation, current->nr_dirtied reaches or exceeds ratelimit.
p = this_cpu_ptr(&dirty_throttle_leaks);
if (*p > 0 && current->nr_dirtied < ratelimit) {
unsigned long nr_pages_dirtied;
nr_pages_dirtied = min(*p, ratelimit - current->nr_dirtied);
*p -= nr_pages_dirtied;
current->nr_dirtied += nr_pages_dirtied;
}
if (unlikely(current->nr_dirtied >= ratelimit))
balance_dirty_pages(wb, current->nr_dirtied);
- if the device's dirty pages exceed its limit, ratelimit is lowered:
if (wb->dirty_exceeded)
ratelimit = min(ratelimit, 32 >> (PAGE_SHIFT - 10));
A lower ratelimit makes balance_dirty_pages easier to trigger.
- when a CPU's dirty-page counter exceeds the per-CPU limit, ratelimit is set to 0, which forces throttling:
p = this_cpu_ptr(&bdp_ratelimits);
if (unlikely(*p >= ratelimit_pages)) {
*p = 0;
ratelimit = 0;
}
This blocks further dirtying by the task and sends it into balance_dirty_pages on the next check. A small userspace model of the whole decision follows.
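To make the interplay of nr_dirtied, ratelimit, the per-CPU counter and the leak pool easier to follow, here is a minimal userspace C sketch of the decision logic above. All names (should_throttle, cpu_counter, leak_pool, RATELIMIT_PAGES) are hypothetical stand-ins for the kernel state, not kernel APIs, and the per-CPU/preemption handling is omitted.
#include <stdbool.h>
#include <stdio.h>

#define RATELIMIT_PAGES 64   /* stand-in for the kernel's ratelimit_pages */

/* Models one call of balance_dirty_pages_ratelimited() for one task on one CPU. */
static bool should_throttle(unsigned int *task_nr_dirtied, unsigned int ratelimit,
                            unsigned int *cpu_counter, unsigned int *leak_pool)
{
    if (*task_nr_dirtied >= ratelimit) {
        *cpu_counter = 0;                /* task will be throttled anyway */
    } else if (*cpu_counter >= RATELIMIT_PAGES) {
        *cpu_counter = 0;
        ratelimit = 0;                   /* force throttling on this CPU */
    }

    /* absorb dirty pages "leaked" by exited tasks */
    if (*leak_pool > 0 && *task_nr_dirtied < ratelimit) {
        unsigned int take = *leak_pool;
        if (take > ratelimit - *task_nr_dirtied)
            take = ratelimit - *task_nr_dirtied;
        *leak_pool -= take;
        *task_nr_dirtied += take;
    }

    return *task_nr_dirtied >= ratelimit;    /* true -> balance_dirty_pages() */
}

int main(void)
{
    unsigned int nr_dirtied = 30, cpu_counter = 70, leaks = 5;
    bool throttle = should_throttle(&nr_dirtied, 32, &cpu_counter, &leaks);

    printf("throttle=%d nr_dirtied=%u cpu_counter=%u leaks=%u\n",
           throttle, nr_dirtied, cpu_counter, leaks);
    return 0;
}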
The balance_dirty_pages function
#define GDTC_INIT(__wb) .wb = (__wb), \
.wb_completions = &(__wb)->completions
#define GDTC_INIT_NO_WB
#define MDTC_INIT(__wb, __gdtc)
/* consolidated parameters for balance_dirty_pages() and its subroutines */
struct dirty_throttle_control {
#ifdef CONFIG_CGROUP_WRITEBACK
struct wb_domain *dom;
struct dirty_throttle_control *gdtc; /* only set in memcg dtc's */
#endif
struct bdi_writeback *wb;
// fprop_local_percpu is a per-CPU local event counter used for fraction/proportion tracking of writeback completions.
struct fprop_local_percpu *wb_completions;
unsigned long avail; /* dirtyable */
unsigned long dirty; /* file_dirty + write + nfs */
unsigned long thresh; /* dirty threshold */
unsigned long bg_thresh; /* dirty background threshold */
unsigned long wb_dirty; /* per-wb counterparts */
unsigned long wb_thresh;
unsigned long wb_bg_thresh;
unsigned long pos_ratio;
};
static bool mdtc_valid(struct dirty_throttle_control *dtc)
{
return false;
}
static void balance_dirty_pages(struct bdi_writeback *wb,
unsigned long pages_dirtied)
{
//gdtc_stor is the global dirty-throttle control structure; GDTC_INIT(wb) fills in the bdi_writeback pointer and points wb_completions at wb->completions.
struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) };
//mdtc_stor is the dirty-throttle control structure for the memory cgroup domain.
struct dirty_throttle_control mdtc_stor = { MDTC_INIT(wb, &gdtc_stor) };
//gdtc points to the global dirty-throttle control structure.
struct dirty_throttle_control * const gdtc = &gdtc_stor;
//mdtc points to the memcg dirty-throttle control structure when the memcg domain is valid, otherwise it is NULL. Here it is NULL because CONFIG_CGROUP_WRITEBACK is not set.
struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ?
&mdtc_stor : NULL;
struct dirty_throttle_control *sdtc;
unsigned long nr_reclaimable; /* = file_dirty + unstable_nfs */
long period;
long pause;
long max_pause;
long min_pause;
int nr_dirtied_pause;
bool dirty_exceeded = false;
unsigned long task_ratelimit;
unsigned long dirty_ratelimit;
struct backing_dev_info *bdi = wb->bdi;
//Check whether the backing device's (BDI, backing_dev_info) capability flags include BDI_CAP_STRICTLIMIT; this boolean decides whether dirtying is strictly limited against the per-wb counters.
bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT;
unsigned long start_time = jiffies;
for (;;) {
//now records the current jiffies value for the time calculations and comparisons below.
unsigned long now = jiffies;
unsigned long dirty, thresh, bg_thresh;
unsigned long m_dirty = 0; /* stop bogus uninit warnings */
unsigned long m_thresh = 0;
unsigned long m_bg_thresh = 0;
/*
* Unstable writes are a feature of certain networked
* filesystems (i.e. NFS) in which data may have been
* written to the server's write cache, but has not yet
* been flushed to permanent storage.
*/
//global_node_page_state(NR_FILE_DIRTY) sums the dirty file pages across all NUMA nodes.
//global_node_page_state(NR_UNSTABLE_NFS) sums the unstable NFS pages across all nodes.
//Their sum is the total number of reclaimable dirty pages in the system.
nr_reclaimable = global_node_page_state(NR_FILE_DIRTY) +
global_node_page_state(NR_UNSTABLE_NFS);
//number of pages that may be dirtied (dirtyable memory)
gdtc->avail = global_dirtyable_memory();
//total dirty pages in the system = reclaimable dirty pages + pages currently under writeback
gdtc->dirty = nr_reclaimable + global_node_page_state(NR_WRITEBACK);
//compute the global dirty limits for this control structure
domain_dirty_limits(gdtc);
//When strictlimit is enabled, the per-wb counters (wb_dirty) and limits (wb_thresh, wb_bg_thresh) are used so the device cannot pile up too many dirty pages.
if (unlikely(strictlimit)) {
wb_dirty_limits(gdtc);
dirty = gdtc->wb_dirty;
thresh = gdtc->wb_thresh;
bg_thresh = gdtc->wb_bg_thresh;
} else {
/* otherwise use the global numbers; thresh and bg_thresh were filled in by
domain_dirty_limits() via dtc->thresh = thresh; and dtc->bg_thresh = bg_thresh; */
dirty = gdtc->dirty;
thresh = gdtc->thresh;
bg_thresh = gdtc->bg_thresh;
}
//mdtc is NULL here, so this block is skipped;
if (mdtc) {
unsigned long filepages, headroom, writeback;
/*
* If @wb belongs to !root memcg, repeat the same
* basic calculations for the memcg domain.
*/
mem_cgroup_wb_stats(wb, &filepages, &headroom,
&mdtc->dirty, &writeback);
mdtc->dirty += writeback;
mdtc_calc_avail(mdtc, filepages, headroom);
domain_dirty_limits(mdtc);
if (unlikely(strictlimit)) {
wb_dirty_limits(mdtc);
m_dirty = mdtc->wb_dirty;
m_thresh = mdtc->wb_thresh;
m_bg_thresh = mdtc->wb_bg_thresh;
} else {
m_dirty = mdtc->dirty;
m_thresh = mdtc->thresh;
m_bg_thresh = mdtc->bg_thresh;
}
}
/*
* Throttle it only when the background writeback cannot
* catch-up. This avoids (excessively) small writeouts
* when the wb limits are ramping up in case of !strictlimit.
*
* In strictlimit case make decision based on the wb counters
* and limits. Small writeouts when the wb limits are ramping
* up are the price we consciously pay for strictlimit-ing.
*
* If memcg domain is in effect, @dirty should be under
* both global and memcg freerun ceilings.
*/
//This block decides whether throttling is needed at all. dirty_freerun_ceiling(thresh, bg_thresh) returns (thresh + bg_thresh) / 2, the "freerun ceiling": as long as the dirty count stays at or below it, tasks may keep dirtying at full speed.
//dirty <= dirty_freerun_ceiling(thresh, bg_thresh): the global dirty count is within the freerun range.
//(!mdtc || m_dirty <= dirty_freerun_ceiling(m_thresh, m_bg_thresh)): either there is no memcg domain, or the memcg dirty count is within its freerun range as well.
//If both hold, the task records its state and leaves the loop without throttling.
if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh) &&
(!mdtc ||
m_dirty <= dirty_freerun_ceiling(m_thresh, m_bg_thresh))) {
/*
 * intv = dirty_poll_interval(dirty, thresh) computes how many more pages the
 * task may dirty before it must come back and re-check the limits (despite the
 * name it is a page count, not a time). The further dirty is below thresh,
 * the larger the value, which keeps this path from being entered too often.
 * current->dirty_paused_when = now records the time, and
 * current->nr_dirtied = 0 resets the task's dirtied-page counter.
 */
//If thresh is well above dirty, dirty_poll_interval() returns a large value.
//Once dirty reaches thresh, it returns the minimum value 1, so the limits are re-checked after every dirtied page.
unsigned long intv = dirty_poll_interval(dirty, thresh);
//#define ULONG_MAX (~0UL): 0 with all bits inverted, i.e. the largest unsigned long.
//Initialize m_intv to the largest unsigned long value.
unsigned long m_intv = ULONG_MAX;
//Store the current time in current->dirty_paused_when: the moment this task entered the freerun state.
current->dirty_paused_when = now;
//Reset the task's dirtied-page counter to zero.
current->nr_dirtied = 0;
//skipped here (mdtc == NULL)
if (mdtc)
m_intv = dirty_poll_interval(m_dirty, m_thresh);
//Pick the smaller interval:
//current->nr_dirtied_pause = min(intv, m_intv) is the number of pages the task may dirty before balance_dirty_pages_ratelimited() sends it back into this function.
//Without a memcg domain the global interval is used.
current->nr_dirtied_pause = min(intv, m_intv);
//break: the dirty counts are in the freerun range, so leave the loop without sleeping.
break;
}
//writeback_in_progress(wb) checks whether writeback is already running on this wb. If not, !writeback_in_progress(wb) is true; unlikely() hints that this is the rare case.
if (unlikely(!writeback_in_progress(wb)))
//If no writeback is running, wb_start_background_writeback(wb) wakes the flusher thread so it starts writing dirty pages back to disk.
wb_start_background_writeback(wb);
/*
* Calculate global domain's pos_ratio and select the
* global dtc by default.
*/
//If strictlimit is false, the per-wb numbers have not been computed for this iteration yet,
if (!strictlimit)
//so compute wb_thresh, wb_bg_thresh and the wb's current dirty count wb_dirty.
wb_dirty_limits(gdtc);
//gdtc->wb_dirty > gdtc->wb_thresh: this wb's dirty pages exceed its writeback threshold.
//gdtc->dirty > gdtc->thresh: the global dirty pages exceed the global threshold.
//dirty_exceeded is true when the wb is over its threshold and either the global count is over its threshold or strictlimit is set.
//When it is true, throttling has to bring the dirty page count back down.
dirty_exceeded = (gdtc->wb_dirty > gdtc->wb_thresh) &&
((gdtc->dirty > gdtc->thresh) || strictlimit);
//wb_position_ratio() computes pos_ratio, a feedback factor derived from the dirty counts and the write bandwidth that scales the wb's dirty ratelimit up or down.
wb_position_ratio(gdtc);
//Use the global control structure (gdtc) as the one to throttle against (sdtc).
sdtc = gdtc;
//skipped (mdtc == NULL)
if (mdtc) {
/*
* If memcg domain is in effect, calculate its
* pos_ratio. @wb should satisfy constraints from
* both global and memcg domains. Choose the one
* w/ lower pos_ratio.
*/
if (!strictlimit)
wb_dirty_limits(mdtc);
dirty_exceeded |= (mdtc->wb_dirty > mdtc->wb_thresh) &&
((mdtc->dirty > mdtc->thresh) || strictlimit);
wb_position_ratio(mdtc);
if (mdtc->pos_ratio < gdtc->pos_ratio)
sdtc = mdtc;
}
if (dirty_exceeded && !wb->dirty_exceeded)
wb->dirty_exceeded = 1;
if (time_is_before_jiffies(wb->bw_time_stamp +
BANDWIDTH_INTERVAL)) {
spin_lock(&wb->list_lock);
__wb_update_bandwidth(gdtc, mdtc, start_time, true);
spin_unlock(&wb->list_lock);
}
/* throttle according to the chosen dtc */
dirty_ratelimit = wb->dirty_ratelimit;
task_ratelimit = ((u64)dirty_ratelimit * sdtc->pos_ratio) >>
RATELIMIT_CALC_SHIFT;
max_pause = wb_max_pause(wb, sdtc->wb_dirty);
min_pause = wb_min_pause(wb, max_pause,
task_ratelimit, dirty_ratelimit,
&nr_dirtied_pause);
if (unlikely(task_ratelimit == 0)) {
period = max_pause;
pause = max_pause;
goto pause;
}
period = HZ * pages_dirtied / task_ratelimit;
pause = period;
if (current->dirty_paused_when)
pause -= now - current->dirty_paused_when;
/*
* For less than 1s think time (ext3/4 may block the dirtier
* for up to 800ms from time to time on 1-HDD; so does xfs,
* however at much less frequency), try to compensate it in
* future periods by updating the virtual time; otherwise just
* do a reset, as it may be a light dirtier.
*/
if (pause < min_pause) {
trace_balance_dirty_pages(wb,
sdtc->thresh,
sdtc->bg_thresh,
sdtc->dirty,
sdtc->wb_thresh,
sdtc->wb_dirty,
dirty_ratelimit,
task_ratelimit,
pages_dirtied,
period,
min(pause, 0L),
start_time);
if (pause < -HZ) {
current->dirty_paused_when = now;
current->nr_dirtied = 0;
} else if (period) {
current->dirty_paused_when += period;
current->nr_dirtied = 0;
} else if (current->nr_dirtied_pause <= pages_dirtied)
current->nr_dirtied_pause += pages_dirtied;
break;
}
if (unlikely(pause > max_pause)) {
/* for occasional dropped task_ratelimit */
now += min(pause - max_pause, max_pause);
pause = max_pause;
}
pause:
trace_balance_dirty_pages(wb,
sdtc->thresh,
sdtc->bg_thresh,
sdtc->dirty,
sdtc->wb_thresh,
sdtc->wb_dirty,
dirty_ratelimit,
task_ratelimit,
pages_dirtied,
period,
pause,
start_time);
__set_current_state(TASK_KILLABLE);
wb->dirty_sleep = now;
io_schedule_timeout(pause);
current->dirty_paused_when = now + pause;
current->nr_dirtied = 0;
current->nr_dirtied_pause = nr_dirtied_pause;
/*
* This is typically equal to (dirty < thresh) and can also
* keep "1000+ dd on a slow USB stick" under control.
*/
if (task_ratelimit)
break;
/*
* In the case of an unresponding NFS server and the NFS dirty
* pages exceeds dirty_thresh, give the other good wb's a pipe
* to go through, so that tasks on them still remain responsive.
*
* In theory 1 page is enough to keep the consumer-producer
* pipe going: the flusher cleans 1 page => the task dirties 1
* more page. However wb_dirty has accounting errors. So use
* the larger and more IO friendly wb_stat_error.
*/
if (sdtc->wb_dirty <= wb_stat_error())
break;
if (fatal_signal_pending(current))
break;
}
if (!dirty_exceeded && wb->dirty_exceeded)
wb->dirty_exceeded = 0;
if (writeback_in_progress(wb))
return;
/*
* In laptop mode, we wait until hitting the higher threshold before
* starting background writeout, and then write out all the way down
* to the lower threshold. So slow writers cause minimal disk activity.
*
* In normal mode, we start background writeout at the lower
* background_thresh, to keep the amount of dirty memory low.
*/
if (laptop_mode)
return;
if (nr_reclaimable > gdtc->bg_thresh)
wb_start_background_writeback(wb);
}
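Before moving on to the helpers, the pause arithmetic at the heart of the loop is worth a standalone look: a task that has dirtied pages_dirtied pages at an allowed rate of task_ratelimit pages per second should sleep about HZ * pages_dirtied / task_ratelimit jiffies, minus the "think time" it already spent since its last pause, clamped to [min_pause, max_pause]. The numbers below (HZ, the rates, the bounds) are made up for illustration; only the formula mirrors the code above.
#include <stdio.h>

#define HZ 250   /* assumed jiffies per second */

int main(void)
{
    /* Hypothetical values for one loop iteration. */
    unsigned long pages_dirtied  = 32;     /* pages dirtied since the last pause  */
    unsigned long task_ratelimit = 400;    /* allowed dirtying rate, pages/second */
    long min_pause = 4, max_pause = 50;    /* bounds computed by wb_min/max_pause */
    long think_time = 5;                   /* jiffies since dirty_paused_when     */

    long period = HZ * pages_dirtied / task_ratelimit;  /* ideal sleep: 20 jiffies */
    long pause  = period - think_time;                  /* credit the think time   */

    if (pause < min_pause)
        pause = 0;              /* too short: skip sleeping this round            */
    else if (pause > max_pause)
        pause = max_pause;      /* too long: cap it to keep the task responsive   */

    printf("period=%ld jiffies, pause=%ld jiffies (%ld ms)\n",
           period, pause, pause * 1000 / HZ);
    return 0;
}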
Helper functions used above
domain_dirty_limits
This function computes the dirty thresholds that control when data modified in memory but not yet written to disk (dirty pages) must be flushed. The thresholds determine how many dirty pages the system tolerates before background writeback is triggered.
static struct dirty_throttle_control *mdtc_gdtc(struct dirty_throttle_control *mdtc)
{
return NULL;
}
Detailed walkthrough:
static void domain_dirty_limits(struct dirty_throttle_control *dtc)
{
//available_memory is the number of dirtyable pages in this domain.
const unsigned long available_memory = dtc->avail;
//gdtc is NULL here (this is the global domain)
struct dirty_throttle_control *gdtc = mdtc_gdtc(dtc);
//This value is tunable via sysctl.
/*
vm_dirty_bytes limits the amount of dirty page cache. Dirty pages are pages that have been modified and must be written back to disk; vm_dirty_bytes gives an absolute byte value at which processes generating dirty data start writing it back.
vm_dirty_bytes works together with vm_dirty_ratio, but only one of them is in effect at a time: setting vm_dirty_bytes causes vm_dirty_ratio to be ignored and vice versa.
If vm_dirty_bytes is 0, the byte limit is disabled and the vm_dirty_ratio percentage decides when writeback starts.
Tuning it affects write performance and memory usage: set too high, large amounts of dirty data accumulate and more data is lost on a crash; set too low, writeback becomes too frequent and performance suffers.
The sysctl node is /proc/sys/vm/dirty_bytes.
*/
unsigned long bytes = vm_dirty_bytes;
unsigned long bg_bytes = dirty_background_bytes;
/* convert ratios to per-PAGE_SIZE for higher precision */
//Convert the percentage to a per-PAGE_SIZE fraction for higher precision. For example, if vm_dirty_ratio is 10 (10%), ratio becomes 10 * PAGE_SIZE / 100, i.e. 0.1 * PAGE_SIZE.
unsigned long ratio = (vm_dirty_ratio * PAGE_SIZE) / 100;
unsigned long bg_ratio = (dirty_background_ratio * PAGE_SIZE) / 100;
unsigned long thresh;
unsigned long bg_thresh;
struct task_struct *tsk;
/* gdtc is !NULL iff @dtc is for memcg domain */
//not taken here (global domain)
if (gdtc) {
unsigned long global_avail = gdtc->avail;
/*
* The byte settings can't be applied directly to memcg
* domains. Convert them to ratios by scaling against
* globally available memory. As the ratios are in
* per-PAGE_SIZE, they can be obtained by dividing bytes by
* number of pages.
*/
if (bytes)
ratio = min(DIV_ROUND_UP(bytes, global_avail),
PAGE_SIZE);
if (bg_bytes)
bg_ratio = min(DIV_ROUND_UP(bg_bytes, global_avail),
PAGE_SIZE);
bytes = bg_bytes = 0;
}
//The byte value takes precedence when computing the foreground threshold
if (bytes)
thresh = DIV_ROUND_UP(bytes, PAGE_SIZE);
else
//otherwise compute the threshold from the ratio and the available memory.
thresh = (ratio * available_memory) / PAGE_SIZE;
if (bg_bytes)
bg_thresh = DIV_ROUND_UP(bg_bytes, PAGE_SIZE);
else
bg_thresh = (bg_ratio * available_memory) / PAGE_SIZE;
//Keep the background threshold below the foreground threshold:
if (bg_thresh >= thresh)
bg_thresh = thresh / 2;
tsk = current;
//Extra headroom for realtime tasks and tasks flagged PF_LESS_THROTTLE:
if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) {
//Both bg_thresh and thresh are raised:
//by 25% of their current value,
//plus a small share of the global dirty limit (global_wb_domain.dirty_limit / 32).
bg_thresh += bg_thresh / 4 + global_wb_domain.dirty_limit / 32;
thresh += thresh / 4 + global_wb_domain.dirty_limit / 32;
}
//With the raised thresholds, such tasks may accumulate more dirty data before the kernel throttles or blocks them.
dtc->thresh = thresh;
dtc->bg_thresh = bg_thresh;
/* we should eventually report the domain in the TP */
if (!gdtc)
trace_global_dirty_state(bg_thresh, thresh);
}
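As a rough illustration of the global-domain arithmetic above, the sketch below recomputes thresh and bg_thresh in userspace for a hypothetical machine. The input values (AVAIL_PAGES, the two ratios) are made up; in the kernel they come from the vm.dirty_* sysctls and global_dirtyable_memory().
#include <stdio.h>

#define PAGE_SIZE 4096UL
#define AVAIL_PAGES 1048576UL          /* 4 GiB of dirtyable memory in 4 KiB pages */
#define VM_DIRTY_RATIO         20UL    /* vm.dirty_ratio */
#define DIRTY_BACKGROUND_RATIO 10UL    /* vm.dirty_background_ratio */
#define VM_DIRTY_BYTES          0UL    /* vm.dirty_bytes (0 = use the ratio) */
#define DIRTY_BACKGROUND_BYTES  0UL

int main(void)
{
    unsigned long ratio    = VM_DIRTY_RATIO * PAGE_SIZE / 100;
    unsigned long bg_ratio = DIRTY_BACKGROUND_RATIO * PAGE_SIZE / 100;
    unsigned long thresh, bg_thresh;

    thresh = VM_DIRTY_BYTES ?
             (VM_DIRTY_BYTES + PAGE_SIZE - 1) / PAGE_SIZE :      /* DIV_ROUND_UP */
             ratio * AVAIL_PAGES / PAGE_SIZE;
    bg_thresh = DIRTY_BACKGROUND_BYTES ?
             (DIRTY_BACKGROUND_BYTES + PAGE_SIZE - 1) / PAGE_SIZE :
             bg_ratio * AVAIL_PAGES / PAGE_SIZE;

    if (bg_thresh >= thresh)            /* keep background below foreground */
        bg_thresh = thresh / 2;

    printf("thresh    = %lu pages (~%lu MiB)\n", thresh, thresh * PAGE_SIZE >> 20);
    printf("bg_thresh = %lu pages (~%lu MiB)\n", bg_thresh, bg_thresh * PAGE_SIZE >> 20);
    return 0;
}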
dirty_freerun_ceiling:
static unsigned long dirty_freerun_ceiling(unsigned long thresh,
unsigned long bg_thresh)
{
return (thresh + bg_thresh) / 2;
}
dirty_poll_interval
static unsigned long dirty_poll_interval(unsigned long dirty,
unsigned long thresh)
{
if (thresh > dirty)
return 1UL << (ilog2(thresh - dirty) >> 1);
return 1;
}
Here ilog2(thresh - dirty) takes the base-2 logarithm of the gap between the threshold and the current dirty count, and the >> 1 halves it, so the returned value is roughly the square root of the gap. The further the dirty count is from the threshold, the more pages a task may dirty before re-checking; the closer it gets, the sooner it is checked again. This keeps the checks cheap while still reacting quickly near the limit.
If thresh is greater than dirty, dirty_poll_interval() returns a correspondingly large value.
If dirty has reached thresh, it returns the minimum value 1, so the limits are checked after every dirtied page.
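A quick userspace check of this behaviour; ilog2_ul() is a trivial stand-in for the kernel's ilog2() bit-scan helper and the threshold value is made up:
#include <stdio.h>

static unsigned long ilog2_ul(unsigned long v)   /* stand-in for the kernel's ilog2() */
{
    unsigned long log = 0;
    while (v >>= 1)
        log++;
    return log;
}

static unsigned long dirty_poll_interval(unsigned long dirty, unsigned long thresh)
{
    if (thresh > dirty)
        return 1UL << (ilog2_ul(thresh - dirty) >> 1);
    return 1;
}

int main(void)
{
    unsigned long thresh = 200000;   /* hypothetical foreground threshold, in pages */
    unsigned long dirty;

    for (dirty = 0; dirty <= thresh; dirty += 50000)
        printf("dirty=%6lu  poll interval=%lu pages\n",
               dirty, dirty_poll_interval(dirty, thresh));
    return 0;
}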
wb_start_background_writeback
void wb_start_background_writeback(struct bdi_writeback *wb)
{
/*
* We just wake up the flusher thread. It will perform background
* writeback as soon as there is no other work to do.
*/
trace_writeback_wake_background(wb);
wb_wakeup(wb);
}
wb_start_background_writeback() starts background writeback by waking the flusher for this wb so it begins writing dirty pages back to disk.
Details
- Wake the writeback worker
- wb_wakeup(wb);: wakes the background writeback worker; it will perform background writeback as soon as it has no other work to do.
- Trace the wakeup
- trace_writeback_wake_background(wb);: emits a tracepoint recording that background writeback was kicked off, useful for logging and performance analysis.
Purpose:
The background writeback worker periodically writes dirty pages (pages modified but not yet persisted) back to disk, freeing memory and keeping data consistent. wb_start_background_writeback() only makes sure that worker is awake; the actual flushing happens asynchronously, so memory is not left clogged with dirty pages.
In short, wb_start_background_writeback() is a small scheduling helper that ensures the writeback of dirty pages gets kicked off.
writeback_in_progress
This function checks whether writeback is currently in progress on the wb.
static inline bool writeback_in_progress(struct bdi_writeback *wb)
{
return test_bit(WB_writeback_running, &wb->state);
}
test_bit(WB_writeback_running, &wb->state):
- test_bit() checks the WB_writeback_running bit in the wb->state bitmap; if the bit is 1, a writeback run is in progress and the function returns true.
- test_bit is a helper that tests whether a particular bit in a bitmap is set: it returns true if the bit is 1 and false otherwise.
WB_writeback_running:
- this flag indicates that a writeback run is in progress; it is set when the run starts and cleared when it finishes.
wb_dirty_limits
wb_dirty_limits() computes the per-wb thresholds (wb_thresh and wb_bg_thresh) and the wb's current dirty page count (wb_dirty).
static inline void wb_dirty_limits(struct dirty_throttle_control *dtc)
{
//the wb this control structure belongs to.
struct bdi_writeback *wb = dtc->wb;
//wb_reclaimable will hold the number of reclaimable dirty pages on this wb.
unsigned long wb_reclaimable;
/*
* wb_thresh is not treated as some limiting factor as
* dirty_thresh, due to reasons
* - in JBOD setup, wb_thresh can fluctuate a lot
* - in a system with HDD and USB key, the USB key may somehow
* go into state (wb_dirty >> wb_thresh) either because
* wb_dirty starts high, or because wb_thresh drops low.
* In this case we don't want to hard throttle the USB key
* dirtiers for 100 seconds until wb_dirty drops under
* wb_thresh. Instead the auxiliary wb control line in
* wb_position_ratio() will let the dirtier task progress
* at some rate <= (write_bw / 2) for bringing down wb_dirty.
*/
//__wb_calc_thresh() computes this wb's share of the global threshold and stores it in dtc->wb_thresh.
dtc->wb_thresh = __wb_calc_thresh(dtc);
//Derive the per-wb background threshold by the same proportion: wb_bg_thresh = wb_thresh * bg_thresh / thresh, or 0 if thresh is 0.
dtc->wb_bg_thresh = dtc->thresh ?
div_u64((u64)dtc->wb_thresh * dtc->bg_thresh, dtc->thresh) : 0;
/*
* In order to avoid the stacked BDI deadlock we need
* to ensure we accurately count the 'dirty' pages when
* the threshold is low.
*
* Otherwise it would be possible to get thresh+n pages
* reported dirty, even though there are thresh-m pages
* actually dirty; with m+n sitting in the percpu
* deltas.
*/
if (dtc->wb_thresh < 2 * wb_stat_error()) {
wb_reclaimable = wb_stat_sum(wb, WB_RECLAIMABLE);
dtc->wb_dirty = wb_reclaimable + wb_stat_sum(wb, WB_WRITEBACK);
} else {
wb_reclaimable = wb_stat(wb, WB_RECLAIMABLE);
dtc->wb_dirty = wb_reclaimable + wb_stat(wb, WB_WRITEBACK);
}
}
Updating wb_thresh and wb_bg_thresh:
- dtc->wb_thresh = __wb_calc_thresh(dtc);: computes the per-wb writeback threshold, i.e. this device's share of the global threshold; the exact value depends on the device's recent writeback activity.
- dtc->wb_bg_thresh = dtc->thresh ? div_u64((u64)dtc->wb_thresh * dtc->bg_thresh, dtc->thresh) : 0;: scales the background threshold by the same wb_thresh/thresh proportion; if thresh is 0, wb_bg_thresh is set to 0.
Avoiding the stacked-BDI deadlock:
- The "stacked BDI deadlock" mentioned in the comment refers to the situation where, with a very low threshold, the per-CPU counter deltas alone could make the reported dirty count exceed the threshold even though the real count is below it. To avoid this, when dtc->wb_thresh is below 2 * wb_stat_error() the exact (but more expensive) wb_stat_sum() is used to compute wb_dirty.
Computing wb_dirty:
- If wb_thresh is low, wb_dirty is the exact sum wb_stat_sum(WB_RECLAIMABLE) + wb_stat_sum(WB_WRITEBACK).
- Otherwise the cheaper approximate counters wb_stat(WB_RECLAIMABLE) + wb_stat(WB_WRITEBACK) are added up instead.
__wb_calc_thresh
static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc)
{
//get the writeback domain this control structure belongs to.
struct wb_domain *dom = dtc_dom(dtc);
//the global dirty threshold.
unsigned long thresh = dtc->thresh;
u64 wb_thresh;
long numerator, denominator;
unsigned long wb_min_ratio, wb_max_ratio;
/*
* Calculate this BDI's share of the thresh ratio.
*/
//Compute this BDI's fraction of the domain's writeback completions as numerator / denominator.
fprop_fraction_percpu(&dom->completions, dtc->wb_completions,
&numerator, &denominator);
//bdi_min_ratio is the global sum of reserved minimum ratios (see bdi_set_min_ratio below); that reserved share is set aside before the rest is distributed proportionally.
wb_thresh = (thresh * (100 - bdi_min_ratio)) / 100;
//scale the threshold by the completion fraction.
wb_thresh *= numerator;
do_div(wb_thresh, denominator);
//wb_min_max_ratio(dtc->wb, &wb_min_ratio, &wb_max_ratio);: fetch this BDI's min and max ratios.
wb_min_max_ratio(dtc->wb, &wb_min_ratio, &wb_max_ratio);
//add the guaranteed minimum share.
wb_thresh += (thresh * wb_min_ratio) / 100;
//and make sure the threshold does not exceed the maximum share.
if (wb_thresh > (thresh * wb_max_ratio) / 100)
wb_thresh = thresh * wb_max_ratio / 100;
//return wb_thresh;: return the resulting per-wb threshold.
return wb_thresh;
}
In short, the function computes wb_thresh: this wb's share of the global threshold, proportional to its recent writeback completions and bounded by the BDI's min/max ratios.
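A minimal sketch of that arithmetic with made-up numbers (the kernel obtains numerator/denominator from the per-CPU fprop completion counters and the ratios from the BDI):
#include <stdio.h>
#include <stdint.h>

int main(void)
{
    /* Hypothetical numbers: global threshold and this device's stats. */
    uint64_t thresh        = 200000;   /* global dirty threshold, pages */
    uint64_t numerator     = 3;        /* this wb's share of recent completions */
    uint64_t denominator   = 10;
    unsigned bdi_min_ratio = 0;        /* sum of reserved min ratios, percent */
    unsigned wb_min_ratio  = 0;        /* this BDI's min_ratio, percent */
    unsigned wb_max_ratio  = 100;      /* this BDI's max_ratio, percent */

    uint64_t wb_thresh = thresh * (100 - bdi_min_ratio) / 100;
    wb_thresh = wb_thresh * numerator / denominator;   /* proportional share  */
    wb_thresh += thresh * wb_min_ratio / 100;          /* guaranteed minimum  */
    if (wb_thresh > thresh * wb_max_ratio / 100)       /* cap at the maximum  */
        wb_thresh = thresh * wb_max_ratio / 100;

    printf("wb_thresh = %llu pages\n", (unsigned long long)wb_thresh);
    return 0;
}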
wb_min_max_ratio
static void wb_min_max_ratio(struct bdi_writeback *wb,
unsigned long *minp, unsigned long *maxp)
{
*minp = wb->bdi->min_ratio;
*maxp = wb->bdi->max_ratio;
}
- Store the writeback device's minimum dirty ratio in the variable pointed to by minp.
- Store the writeback device's maximum dirty ratio in the variable pointed to by maxp.
bdi_min_ratio
static unsigned int bdi_min_ratio;
//bdi_min_ratio is a global variable holding the sum of the minimum dirty ratios reserved by all writeback devices.
int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio)
{
int ret = 0;
spin_lock_bh(&bdi_lock);
if (min_ratio > bdi->max_ratio) {
ret = -EINVAL;
} else {
//bdi->min_ratio is this device's current minimum dirty ratio; compute the requested increase.
min_ratio -= bdi->min_ratio;
//check that the global sum plus the increase stays below 100.
if (bdi_min_ratio + min_ratio < 100) {
//if so, update both the global sum bdi_min_ratio and the device's bdi->min_ratio.
bdi_min_ratio += min_ratio;
bdi->min_ratio += min_ratio;
} else {
ret = -EINVAL;
}
}
spin_unlock_bh(&bdi_lock);
return ret;
}
bdi_set_min_ratio() sets the minimum dirty-page ratio of a particular writeback device (BDI).
In short, it keeps each device's minimum ratio within a sensible range and ensures the minimum ratios of all devices sum to less than 100%.
wb_position_ratio
static void wb_position_ratio(struct dirty_throttle_control *dtc)
{
struct bdi_writeback *wb = dtc->wb;
unsigned long write_bw = wb->avg_write_bandwidth;
unsigned long freerun = dirty_freerun_ceiling(dtc->thresh, dtc->bg_thresh);
unsigned long limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh);
unsigned long wb_thresh = dtc->wb_thresh;
unsigned long x_intercept;
unsigned long setpoint; /* dirty pages' target balance point */
unsigned long wb_setpoint;
unsigned long span;
long long pos_ratio; /* for scaling up/down the rate limit */
long x;
dtc->pos_ratio = 0;
if (unlikely(dtc->dirty >= limit))
return;
/*
* global setpoint
*
* See comment for pos_ratio_polynom().
*/
setpoint = (freerun + limit) / 2;
pos_ratio = pos_ratio_polynom(setpoint, dtc->dirty, limit);
/*
* The strictlimit feature is a tool preventing mistrusted filesystems
* from growing a large number of dirty pages before throttling. For
* such filesystems balance_dirty_pages always checks wb counters
* against wb limits. Even if global "nr_dirty" is under "freerun".
* This is especially important for fuse which sets bdi->max_ratio to
* 1% by default. Without strictlimit feature, fuse writeback may
* consume arbitrary amount of RAM because it is accounted in
* NR_WRITEBACK_TEMP which is not involved in calculating "nr_dirty".
*
* Here, in wb_position_ratio(), we calculate pos_ratio based on
* two values: wb_dirty and wb_thresh. Let's consider an example:
* total amount of RAM is 16GB, bdi->max_ratio is equal to 1%, global
* limits are set by default to 10% and 20% (background and throttle).
* Then wb_thresh is 1% of 20% of 16GB. This amounts to ~8K pages.
* wb_calc_thresh(wb, bg_thresh) is about ~4K pages. wb_setpoint is
* about ~6K pages (as the average of background and throttle wb
* limits). The 3rd order polynomial will provide positive feedback if
* wb_dirty is under wb_setpoint and vice versa.
*
* Note, that we cannot use global counters in these calculations
* because we want to throttle process writing to a strictlimit wb
* much earlier than global "freerun" is reached (~23MB vs. ~2.3GB
* in the example above).
*/
if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
long long wb_pos_ratio;
if (dtc->wb_dirty < 8) {
dtc->pos_ratio = min_t(long long, pos_ratio * 2,
2 << RATELIMIT_CALC_SHIFT);
return;
}
if (dtc->wb_dirty >= wb_thresh)
return;
wb_setpoint = dirty_freerun_ceiling(wb_thresh,
dtc->wb_bg_thresh);
if (wb_setpoint == 0 || wb_setpoint == wb_thresh)
return;
wb_pos_ratio = pos_ratio_polynom(wb_setpoint, dtc->wb_dirty,
wb_thresh);
/*
* Typically, for strictlimit case, wb_setpoint << setpoint
* and pos_ratio >> wb_pos_ratio. In the other words global
* state ("dirty") is not limiting factor and we have to
* make decision based on wb counters. But there is an
* important case when global pos_ratio should get precedence:
* global limits are exceeded (e.g. due to activities on other
* wb's) while given strictlimit wb is below limit.
*
* "pos_ratio * wb_pos_ratio" would work for the case above,
* but it would look too non-natural for the case of all
* activity in the system coming from a single strictlimit wb
* with bdi->max_ratio == 100%.
*
* Note that min() below somewhat changes the dynamics of the
* control system. Normally, pos_ratio value can be well over 3
* (when globally we are at freerun and wb is well below wb
* setpoint). Now the maximum pos_ratio in the same situation
* is 2. We might want to tweak this if we observe the control
* system is too slow to adapt.
*/
dtc->pos_ratio = min(pos_ratio, wb_pos_ratio);
return;
}
/*
* We have computed basic pos_ratio above based on global situation. If
* the wb is over/under its share of dirty pages, we want to scale
* pos_ratio further down/up. That is done by the following mechanism.
*/
/*
* wb setpoint
*
* f(wb_dirty) := 1.0 + k * (wb_dirty - wb_setpoint)
*
* x_intercept - wb_dirty
* := --------------------------
* x_intercept - wb_setpoint
*
* The main wb control line is a linear function that subjects to
*
* (1) f(wb_setpoint) = 1.0
* (2) k = - 1 / (8 * write_bw) (in single wb case)
* or equally: x_intercept = wb_setpoint + 8 * write_bw
*
* For single wb case, the dirty pages are observed to fluctuate
* regularly within range
* [wb_setpoint - write_bw/2, wb_setpoint + write_bw/2]
* for various filesystems, where (2) can yield in a reasonable 12.5%
* fluctuation range for pos_ratio.
*
* For JBOD case, wb_thresh (not wb_dirty!) could fluctuate up to its
* own size, so move the slope over accordingly and choose a slope that
* yields 100% pos_ratio fluctuation on suddenly doubled wb_thresh.
*/
if (unlikely(wb_thresh > dtc->thresh))
wb_thresh = dtc->thresh;
/*
* It's very possible that wb_thresh is close to 0 not because the
* device is slow, but that it has remained inactive for long time.
* Honour such devices a reasonable good (hopefully IO efficient)
* threshold, so that the occasional writes won't be blocked and active
* writes can rampup the threshold quickly.
*/
wb_thresh = max(wb_thresh, (limit - dtc->dirty) / 8);
/*
* scale global setpoint to wb's:
* wb_setpoint = setpoint * wb_thresh / thresh
*/
x = div_u64((u64)wb_thresh << 16, dtc->thresh | 1);
wb_setpoint = setpoint * (u64)x >> 16;
/*
* Use span=(8*write_bw) in single wb case as indicated by
* (thresh - wb_thresh ~= 0) and transit to wb_thresh in JBOD case.
*
* wb_thresh thresh - wb_thresh
* span = --------- * (8 * write_bw) + ------------------ * wb_thresh
* thresh thresh
*/
span = (dtc->thresh - wb_thresh + 8 * write_bw) * (u64)x >> 16;
x_intercept = wb_setpoint + span;
if (dtc->wb_dirty < x_intercept - span / 4) {
pos_ratio = div64_u64(pos_ratio * (x_intercept - dtc->wb_dirty),
(x_intercept - wb_setpoint) | 1);
} else
pos_ratio /= 4;
/*
* wb reserve area, safeguard against dirty pool underrun and disk idle
* It may push the desired control point of global dirty pages higher
* than setpoint.
*/
x_intercept = wb_thresh / 2;
if (dtc->wb_dirty < x_intercept) {
if (dtc->wb_dirty > x_intercept / 8)
pos_ratio = div_u64(pos_ratio * x_intercept,
dtc->wb_dirty);
else
pos_ratio *= 8;
}
dtc->pos_ratio = pos_ratio;
}
This function's main job is to adjust the throttling factor pos_ratio according to the system's writeback state (dirty pages), so that dirty-page writeback stays balanced. It fine-tunes the throttling in the presence of strictlimit and of multiple writeback devices (e.g. a JBOD setup), so that dirty pages neither pile up excessively nor exhaust system resources. A sketch of pos_ratio_polynom() follows this list.
Initialize the inputs:
- write_bw: the device's average write bandwidth.
- freerun: the freerun ceiling of the dirty counts.
- limit: the hard dirty limit (hard_dirty_limit).
- wb_thresh: the per-wb writeback threshold from dtc.
Compute setpoint and pos_ratio:
- setpoint is the target dirty balance point (the dirty-page count we would like to settle at), the midpoint of freerun and limit.
- pos_ratio is the position ratio, computed from the global dirty count (dtc->dirty) and the dirty limit (limit); it is used to scale the writeback ratelimit.
The strictlimit case:
- If the device advertises BDI_CAP_STRICTLIMIT (strict limiting), pos_ratio is adjusted against the per-wb counters even while the global dirty count is low. This prevents excessive dirty accumulation on mistrusted filesystems (such as FUSE), which otherwise could dirty large amounts of memory without ever being throttled.
- In the strictlimit case pos_ratio is further adjusted around wb_setpoint: if wb_dirty is very small, pos_ratio is doubled (capped at 2); once wb_dirty reaches wb_thresh, pos_ratio stays at 0 (full throttle); otherwise the smaller of the global and per-wb ratios is used.
The non-strictlimit case:
- In the ordinary case the computed pos_ratio is scaled further against the per-wb state; once the device's dirty count (wb_dirty) moves past the control line, pos_ratio is cut down sharply.
- If wb_thresh is larger than the global threshold (dtc->thresh), it is clamped to the global threshold.
- From the device's write bandwidth and dirty counts a per-wb wb_setpoint and a span are computed, and these values scale pos_ratio dynamically so it reflects the device's current dirty state.
Dynamic adjustment of pos_ratio:
- As the dirty count approaches the writeback threshold, pos_ratio is scaled according to the gap between the device's dirty count and the target balance point, taking the write bandwidth into account, so the writeback rate is controlled smoothly.
- In addition, if the device's dirty count falls below a reserve level (x_intercept = wb_thresh / 2), pos_ratio is scaled back up so the writeback rate does not drop too low and dirty-page writeback never stalls completely.
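The global pos_ratio above comes from pos_ratio_polynom(), the "3rd order polynomial" mentioned in the kernel comment: pos_ratio = 1 + x^3 with x = (setpoint - dirty) / (limit - setpoint), clamped to [0, 2] and kept in fixed point. The sketch below is a userspace model of that curve; RATELIMIT_CALC_SHIFT is taken as 10 here and the page counts are assumptions for illustration.
#include <stdio.h>
#include <stdint.h>

#define RATELIMIT_CALC_SHIFT 10   /* fixed-point scale: 1.0 == 1 << 10 */

/* Model of the cubic feedback curve: 1 + x^3, clamped to [0, 2]. */
static int64_t pos_ratio_polynom(uint64_t setpoint, uint64_t dirty, uint64_t limit)
{
    const int64_t one = 1 << RATELIMIT_CALC_SHIFT;
    int64_t x = ((int64_t)setpoint - (int64_t)dirty) * one /
                ((int64_t)(limit - setpoint) | 1);
    int64_t pos_ratio = x;

    pos_ratio = pos_ratio * x / one;     /* x^2 */
    pos_ratio = pos_ratio * x / one;     /* x^3 */
    pos_ratio += one;                    /* 1 + x^3 */

    if (pos_ratio < 0)
        pos_ratio = 0;
    if (pos_ratio > 2 * one)
        pos_ratio = 2 * one;
    return pos_ratio;
}

int main(void)
{
    uint64_t freerun = 150000, limit = 200000;      /* hypothetical page counts */
    uint64_t setpoint = (freerun + limit) / 2;      /* global setpoint          */
    uint64_t dirty;

    for (dirty = 150000; dirty <= 200000; dirty += 10000)
        printf("dirty=%6llu  pos_ratio=%.3f\n",
               (unsigned long long)dirty,
               (double)pos_ratio_polynom(setpoint, dirty, limit) /
               (1 << RATELIMIT_CALC_SHIFT));
    return 0;
}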
__wb_update_bandwidth
static void __wb_update_bandwidth(struct dirty_throttle_control *gdtc,
struct dirty_throttle_control *mdtc,
unsigned long start_time,
bool update_ratelimit)
{
struct bdi_writeback *wb = gdtc->wb;
unsigned long now = jiffies;
unsigned long elapsed = now - wb->bw_time_stamp;
unsigned long dirtied;
unsigned long written;
lockdep_assert_held(&wb->list_lock);
/*
* rate-limit, only update once every 200ms.
*/
if (elapsed < BANDWIDTH_INTERVAL)
return;
dirtied = percpu_counter_read(&wb->stat[WB_DIRTIED]);
written = percpu_counter_read(&wb->stat[WB_WRITTEN]);
/*
* Skip quiet periods when disk bandwidth is under-utilized.
* (at least 1s idle time between two flusher runs)
*/
if (elapsed > HZ && time_before(wb->bw_time_stamp, start_time))
goto snapshot;
if (update_ratelimit) {
domain_update_bandwidth(gdtc, now);
wb_update_dirty_ratelimit(gdtc, dirtied, elapsed);
/*
* @mdtc is always NULL if !CGROUP_WRITEBACK but the
* compiler has no way to figure that out. Help it.
*/
if (IS_ENABLED(CONFIG_CGROUP_WRITEBACK) && mdtc) {
domain_update_bandwidth(mdtc, now);
wb_update_dirty_ratelimit(mdtc, dirtied, elapsed);
}
}
wb_update_write_bandwidth(wb, elapsed, written);
snapshot:
wb->dirtied_stamp = dirtied;
wb->written_stamp = written;
wb->bw_time_stamp = now;
}
__wb_update_bandwidth() updates the writeback-bandwidth statistics for the device and manages the dirty ratelimit. Based on the time elapsed since the last update, it refreshes the write bandwidth, the dirty-page ratelimit and the related timestamps. A step-by-step walkthrough:
- Get the current time and the elapsed interval
unsigned long now = jiffies;
unsigned long elapsed = now - wb->bw_time_stamp;
- now is the current system time in jiffies; elapsed is the time since the last update.
- Check whether an update is due
if (elapsed < BANDWIDTH_INTERVAL)
return;
- If less than BANDWIDTH_INTERVAL (about 200ms) has passed since the last update, return immediately. This rate-limits the update itself and avoids unnecessary overhead.
- Read the dirtied and written counters
dirtied = percpu_counter_read(&wb->stat[WB_DIRTIED]);
written = percpu_counter_read(&wb->stat[WB_WRITTEN]);
- Read this wb's cumulative dirtied-page (WB_DIRTIED) and written-page (WB_WRITTEN) counters from wb->stat.
- Skip quiet periods (disk idle)
if (elapsed > HZ && time_before(wb->bw_time_stamp, start_time))
goto snapshot;
- If more than a second has elapsed (elapsed > HZ) and the last update predates start_time, the disk was idle; skip the bandwidth update and only snapshot the counters.
- Update the ratelimits (if requested)
if (update_ratelimit) {
domain_update_bandwidth(gdtc, now);
wb_update_dirty_ratelimit(gdtc, dirtied, elapsed);
if (IS_ENABLED(CONFIG_CGROUP_WRITEBACK) && mdtc) {
domain_update_bandwidth(mdtc, now);
wb_update_dirty_ratelimit(mdtc, dirtied, elapsed);
}
}
- If update_ratelimit is true, update the writeback ratelimits:
- domain_update_bandwidth() refreshes the domain-wide bandwidth information.
- wb_update_dirty_ratelimit() recomputes the dirty ratelimit from the pages dirtied during the elapsed interval.
- If CONFIG_CGROUP_WRITEBACK is enabled and mdtc is non-NULL, the cgroup domain's bandwidth and ratelimit are updated as well.
- Update the write bandwidth
wb_update_write_bandwidth(wb, elapsed, written);
- wb_update_write_bandwidth() updates the device's write-bandwidth estimate from the pages written during the elapsed interval.
- Snapshot the counters and the timestamp
snapshot:
wb->dirtied_stamp = dirtied;
wb->written_stamp = written;
wb->bw_time_stamp = now;
- Save the current dirtied count, written count and timestamp into the wb so the next update can compute deltas against them.
Summary:
__wb_update_bandwidth() refreshes the device's bandwidth statistics at most once per BANDWIDTH_INTERVAL and recomputes the dirty-page ratelimit when asked to (including the cgroup domain's, if configured). It skips updates while the disk is idle and snapshots the counters and timestamps so the next invocation can work with deltas; this feedback loop keeps the per-wb dirty ratelimit tracking the device's real write throughput.
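wb_update_write_bandwidth() itself is not quoted in this post; conceptually it converts the written-page delta into a rate and folds it into a running average over a period of a few seconds. The sketch below is a simplified userspace model of that idea; the constants (HZ, AVG_PERIOD) and the exact smoothing formula are assumptions, not the kernel code.
#include <stdio.h>

#define HZ                  250            /* assumed jiffies per second */
#define BANDWIDTH_INTERVAL  (HZ / 5)       /* update at most every ~200ms */
#define AVG_PERIOD          (3 * HZ)       /* smoothing window, ~3s (assumed) */

struct wb_model {
    unsigned long written_stamp;   /* pages written at the last update */
    unsigned long bw_time_stamp;   /* jiffies of the last update */
    unsigned long avg_write_bw;    /* smoothed bandwidth, pages per AVG_PERIOD */
};

static void update_write_bandwidth(struct wb_model *wb,
                                   unsigned long now, unsigned long written)
{
    unsigned long elapsed = now - wb->bw_time_stamp;
    unsigned long bw;

    if (elapsed < BANDWIDTH_INTERVAL)
        return;                                     /* rate-limit the update */
    if (elapsed > AVG_PERIOD)
        elapsed = AVG_PERIOD;                       /* keep the weights sane */

    /* instantaneous bandwidth over the elapsed interval, in pages/AVG_PERIOD */
    bw = (written - wb->written_stamp) * AVG_PERIOD / elapsed;

    /* weighted average: pull the old estimate toward the instantaneous value */
    wb->avg_write_bw = (wb->avg_write_bw * (AVG_PERIOD - elapsed) +
                        bw * elapsed) / AVG_PERIOD;

    wb->written_stamp = written;
    wb->bw_time_stamp = now;
}

int main(void)
{
    struct wb_model wb = { .written_stamp = 0, .bw_time_stamp = 0, .avg_write_bw = 0 };
    unsigned long now = 0, written = 0;

    for (int i = 1; i <= 10; i++) {
        now += BANDWIDTH_INTERVAL;
        written += 5000;                    /* pretend 5000 pages written per tick */
        update_write_bandwidth(&wb, now, written);
        printf("t=%lu jiffies  avg_write_bw=%lu pages/period\n",
               now, wb.avg_write_bw);
    }
    return 0;
}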