【驱动】块设备驱动（三）-IO调度层

标签：bio 请求队列 request list 调度 IO 驱动 struct

前言

每个块设备驱动程序都维持着自己的请求队列，它包含设备待处理的请求链表。如果磁盘控制器正在处理几个磁盘，那么通常每个物理块设备都有一个请求队列。在每个请求队列上单独执行I/O调度，这样可以提高磁盘的性能。

关键API

struct request_queue

请求队列是由一个大的数据结构request_queue表示的。每个磁盘对应一个request_queue。该队列挂的就是request请求。

/*
 * struct request_queue - descriptor of one block request queue.
 *
 * Groups, in order: the pending-request list and elevator state, the
 * function-pointer hooks, the blk-mq (multi-queue) software/hardware
 * contexts, locking and sysfs objects, queue settings and limits, and
 * bookkeeping for timeouts, flush, requeue and freeze/bypass handling.
 *
 * Several members exist only when the matching CONFIG_* option is set,
 * so the layout differs between kernel configurations.
 */
struct request_queue {
	/*
	 * Together with queue_head for cacheline sharing
	 */
	struct list_head	queue_head;
	struct request		*last_merge;
	struct elevator_queue	*elevator;
	int			nr_rqs[2];	/* # allocated [a]sync rqs */
	int			nr_rqs_elvpriv;	/* # allocated rqs w/ elvpriv */

	/*
	 * If blkcg is not used, @q->root_rl serves all requests.  If blkcg
	 * is used, root blkg allocates from @q->root_rl and all other
	 * blkgs from their own blkg->rl.  Which one to use should be
	 * determined using bio_request_list().
	 */
	struct request_list	root_rl;

	/*
	 * Function-pointer hooks (the *_fn types are declared elsewhere).
	 * generic_make_request() calls ->make_request_fn for every bio it
	 * hands to this queue.
	 */
	request_fn_proc		*request_fn;
	make_request_fn		*make_request_fn;
	prep_rq_fn		*prep_rq_fn;
	unprep_rq_fn		*unprep_rq_fn;
	softirq_done_fn		*softirq_done_fn;
	rq_timed_out_fn		*rq_timed_out_fn;
	dma_drain_needed_fn	*dma_drain_needed;
	lld_busy_fn		*lld_busy_fn;

	/*
	 * blk-mq operations.  blk_get_request() tests this pointer: non-NULL
	 * selects blk_mq_alloc_request(), NULL the legacy
	 * blk_old_get_request() path.
	 */
	struct blk_mq_ops	*mq_ops;

	/* NOTE(review): presumably a CPU -> hw queue index table; confirm */
	unsigned int		*mq_map;

	/* sw queues */
	struct blk_mq_ctx __percpu	*queue_ctx;
	unsigned int		nr_queues;

	/* hw dispatch queues */
	struct blk_mq_hw_ctx	**queue_hw_ctx;
	unsigned int		nr_hw_queues;

	/*
	 * Dispatch queue sorting
	 */
	sector_t		end_sector;
	struct request		*boundary_rq;

	/*
	 * Delayed queue handling
	 */
	struct delayed_work	delay_work;

	struct backing_dev_info	backing_dev_info;

	/*
	 * The queue owner gets to use this for whatever they like.
	 * ll_rw_blk doesn't touch it.
	 */
	void			*queuedata;

	/*
	 * various queue flags, see QUEUE_* below
	 */
	unsigned long		queue_flags;

	/*
	 * ida allocated id for this queue.  Used to index queues from
	 * ioctx.
	 */
	int			id;

	/*
	 * queue needs bounce pages for pages above this limit
	 */
	gfp_t			bounce_gfp;

	/*
	 * protects queue structures from reentrancy. ->__queue_lock should
	 * _never_ be used directly, it is queue private. always use
	 * ->queue_lock.
	 */
	spinlock_t		__queue_lock;
	spinlock_t		*queue_lock;

	/*
	 * queue kobject
	 */
	struct kobject kobj;

	/*
	 * mq queue kobject
	 */
	struct kobject mq_kobj;

#ifdef  CONFIG_BLK_DEV_INTEGRITY
	struct blk_integrity integrity;
#endif	/* CONFIG_BLK_DEV_INTEGRITY */

#ifdef CONFIG_PM
	struct device		*dev;
	int			rpm_status;
	unsigned int		nr_pending;
#endif

	/*
	 * queue settings
	 */
	unsigned long		nr_requests;	/* Max # of requests */
	unsigned int		nr_congestion_on;
	unsigned int		nr_congestion_off;
	unsigned int		nr_batching;

	unsigned int		dma_drain_size;
	void			*dma_drain_buffer;
	unsigned int		dma_pad_mask;
	unsigned int		dma_alignment;

	struct blk_queue_tag	*queue_tags;
	struct list_head	tag_busy_list;

	unsigned int		nr_sorted;
	unsigned int		in_flight[2];
	/*
	 * Number of active block driver functions for which blk_drain_queue()
	 * must wait. Must be incremented around functions that unlock the
	 * queue_lock internally, e.g. scsi_request_fn().
	 */
	unsigned int		request_fn_active;

	unsigned int		rq_timeout;
	struct timer_list	timeout;
	struct list_head	timeout_list;

	struct list_head	icq_list;
#ifdef CONFIG_BLK_CGROUP
	DECLARE_BITMAP		(blkcg_pols, BLKCG_MAX_POLS);
	struct blkcg_gq		*root_blkg;
	struct list_head	blkg_list;
#endif

	struct queue_limits	limits;

	/*
	 * sg stuff
	 */
	unsigned int		sg_timeout;
	unsigned int		sg_reserved_size;
	/* passed to create_io_context() by blk_old_get_request() */
	int			node;
#ifdef CONFIG_BLK_DEV_IO_TRACE
	struct blk_trace	*blk_trace;
#endif
	/*
	 * for flush operations
	 */
	unsigned int		flush_flags;
	unsigned int		flush_not_queueable:1;
	struct blk_flush_queue	*fq;

	struct list_head	requeue_list;
	spinlock_t		requeue_lock;
	struct work_struct	requeue_work;

	struct mutex		sysfs_lock;

	int			bypass_depth;
	atomic_t		mq_freeze_depth;

#if defined(CONFIG_BLK_DEV_BSG)
	bsg_job_fn		*bsg_job_fn;
	int			bsg_job_size;
	struct bsg_class_device bsg_dev;
#endif

#ifdef CONFIG_BLK_DEV_THROTTLING
	/* Throttle data */
	struct throtl_data *td;
#endif
	struct rcu_head		rcu_head;
	wait_queue_head_t	mq_freeze_wq;
	struct percpu_ref	q_usage_counter;
	struct list_head	all_q_node;

	struct blk_mq_tag_set	*tag_set;
	struct list_head	tag_set_list;
	struct bio_set		*bio_split;

	bool			mq_sysfs_init_done;
};

queue_head：请求队列的链表头，待处理的request挂在该链表上（代码注释中的“cacheline sharing”是指紧随其后的几个字段与queue_head放在一起，以便共享缓存行）。
last_merge：指向最后一个合并的请求。
elevator：指向调度器队列的指针。
nr_rqs：分别表示已分配的同步和异步请求的数量。
nr_rqs_elvpriv：已分配的具有 elvpriv 的请求的数量。
root_rl：用于块控制组的根请求列表。
request_fn：请求处理函数。
make_request_fn：构建请求的函数。
prep_rq_fn：准备请求的函数。
unprep_rq_fn：取消准备请求的函数。
softirq_done_fn：软中断完成处理函数。
rq_timed_out_fn：请求超时处理函数。
dma_drain_needed：DMA排空所需的函数。
lld_busy_fn：低级驱动程序繁忙处理函数。
mq_ops：多队列操作的函数指针集合。
mq_map：用于多队列映射的数组（类型为unsigned int *，并非位图）。
queue_ctx：用于软件队列的上下文。
nr_queues：软件队列的数量。
queue_hw_ctx：用于硬件调度队列的上下文。
nr_hw_queues：硬件调度队列的数量。
end_sector：队列的结束扇区。
boundary_rq：边界请求。
delay_work：延迟处理工作。
backing_dev_info：后备设备信息。
queuedata：队列所有者可以使用的指针。
queue_flags：队列的标志位。
id：为该队列分配的唯一ID。
bounce_gfp：当页面超过限制、需要使用回弹页（bounce pages）时所用的内存分配（gfp）标志。
__queue_lock 和 queue_lock：保护队列结构的自旋锁。
kobj：队列的内核对象。
mq_kobj：多队列的内核对象。
integrity：用于块设备完整性的结构体。
dev：设备结构指针。
rpm_status：设备的电源管理状态。
nr_pending：挂起请求的数量。
nr_requests：队列允许的最大请求数量。
nr_congestion_on 和 nr_congestion_off：拥塞控制相关的参数。
nr_batching：批量请求的数量。
dma_drain_size：DMA排空的大小。
dma_drain_buffer：DMA排空缓冲区的指针。
dma_pad_mask 和 dma_alignment：DMA对齐相关的参数。
queue_tags：队列标签的指针。
tag_busy_list：标签繁忙列表。
nr_sorted：已排序的请求数量。
in_flight：请求在飞行中的数量。
request_fn_active：活动的块驱动程序函数的数量。
rq_timeout：请求超时时间。
timeout 和 timeout_list：请求超时处理相关的计时器和列表。
icq_list：IO控制队列的列表。
blkcg_pols：用于块控制组的位图。
root_blkg：根块组调度的指针。
blkg_list：块组的列表。
limits：队列的限制。
sg_timeout 和 sg_reserved_size：sg（SCSI Generic）相关的超时时间和保留缓冲区大小。
node：队列所在的NUMA节点。
blk_trace：用于块设备IO跟踪的指针。
flush_flags 和 flush_not_queueable：刷新操作相关的标志。
fq：刷新队列。
requeue_list、requeue_lock 和 requeue_work：用于重新排队的列表、自旋锁和工作。
sysfs_lock：用于保护sysfs文件系统的锁。
bypass_depth：绕过深度。
mq_freeze_depth：多队列冻结的深度。
bsg_job_fn、bsg_job_size 和 bsg_dev：用于块SG（SCSI Generic）的作业处理。
td：限流数据。
rcu_head：用于RCU（Read-Copy-Update）的头部。
mq_freeze_wq：用于多队列冻结的等待队列头。
q_usage_counter：队列使用计数器。
all_q_node：所有队列的节点。
tag_set：标签集的指针。
tag_set_list：标签集的列表。
bio_split：用于拆分BIO的指针。
mq_sysfs_init_done：多队列sysfs初始化完成标志。

struct request

每个块设备的待处理请求都是用一个请求描述符来表示的，一个request中包含了一个或多个bio，为什么要有request这个结构呢？它存在的目的就是为了进行io的调度。通过request这个辅助结构，我们来给bio进行某种调度方法的排序，从而最大化地提高磁盘访问速度。

/*
 * struct request - descriptor of one pending block I/O request.
 *
 * A request belongs to a request_queue (@q) and carries the bios attached
 * to it (@bio / @biotail).  Several members are overlaid in anonymous
 * unions because, per the comments below, they are needed at different
 * stages of a request's life (scheduler vs. dispatch/completion).
 */
struct request {
	struct list_head queuelist;
	union {
		struct call_single_data csd;
		unsigned long fifo_time;
	};

	/* owning queue, and (NOTE(review): presumably for blk-mq) sw context */
	struct request_queue *q;
	struct blk_mq_ctx *mq_ctx;

	u64 cmd_flags;
	unsigned cmd_type;
	unsigned long atomic_flags;
	
	int cpu;

	/* the following two fields are internal, NEVER access directly */
	unsigned int __data_len;	/* total data len */
	sector_t __sector;		/* sector cursor */

	/* NOTE(review): presumably first and last bio of the chain; confirm */
	struct bio *bio;
	struct bio *biotail;

	/*
	 * The hash is used inside the scheduler, and killed once the
	 * request reaches the dispatch list. The ipi_list is only used
	 * to queue the request for softirq completion, which is long
	 * after the request has been unhashed (and even removed from
	 * the dispatch list).
	 */
	union {
		struct hlist_node hash;	/* merge hash */
		struct list_head ipi_list;
	};

	/*
	 * The rb_node is only used inside the io scheduler, requests
	 * are pruned when moved to the dispatch queue. So let the
	 * completion_data share space with the rb_node.
	 */
	union {
		struct rb_node rb_node;	/* sort/lookup */
		void *completion_data;
	};

	/*
	 * Three pointers are available for the IO schedulers, if they need
	 * more they have to dynamically allocate it.  Flush requests are
	 * never put on the IO scheduler. So let the flush fields share
	 * space with the elevator data.
	 */
	union {
		struct {
			struct io_cq		*icq;
			void			*priv[2];
		} elv;

		struct {
			unsigned int		seq;
			struct list_head	list;
			rq_end_io_fn		*saved_end_io;
		} flush;
	};

	struct gendisk *rq_disk;
	struct hd_struct *part;
	unsigned long start_time;
#ifdef CONFIG_BLK_CGROUP
	struct request_list *rl;		/* rl this rq is alloced from */
	unsigned long long start_time_ns;
	unsigned long long io_start_time_ns;    /* when passed to hardware */
#endif
	/* Number of scatter-gather DMA addr+len pairs after
	 * physical address coalescing is performed.
	 */
	unsigned short nr_phys_segments;
#if defined(CONFIG_BLK_DEV_INTEGRITY)
	unsigned short nr_integrity_segments;
#endif

	unsigned short ioprio;

	void *special;		/* opaque pointer available for LLD use */

	int tag;
	int errors;

	/*
	 * when request is used as a packet command carrier
	 */
	unsigned char __cmd[BLK_MAX_CDB];
	unsigned char *cmd;
	unsigned short cmd_len;

	unsigned int extra_len;	/* length of alignment and padding */
	unsigned int sense_len;
	unsigned int resid_len;	/* residual count */
	void *sense;

	/* timeout bookkeeping */
	unsigned long deadline;
	struct list_head timeout_list;
	unsigned int timeout;
	int retries;

	/*
	 * completion callback.
	 */
	rq_end_io_fn *end_io;
	void *end_io_data;

	/* for bidi */
	struct request *next_rq;

	/* NOTE(review): names suggest latency-histogram accounting; confirm */
	ktime_t			lat_hist_io_start;
	int			lat_hist_enabled;
};

queuelist：用于将请求链接到请求队列中的链表节点。
csd 和 fifo_time：用于处理单个CPU的调用数据和FIFO时间。
q：指向请求队列的指针。
mq_ctx：指向块多队列上下文的指针。
cmd_flags：命令标志。
cmd_type：命令类型。
atomic_flags：原子标志。
cpu：处理请求的CPU编号。
__data_len：数据长度。
__sector：扇区位置。
bio 和 biotail：与请求相关联的BIO（块输入/输出）链表。
hash 和 ipi_list：用于请求合并的哈希节点，以及IPI（Inter-Processor Interrupt，处理器间中断）列表；后者仅用于把请求排队等待软中断完成处理，二者共用同一块空间。
rb_node 和 completion_data：用于在IO调度器中进行排序和查找的红黑树节点，以及完成数据。
elv 和 flush：用于IO调度器的指针和特殊字段。
rq_disk：关联的块设备。
part：关联的分区。
start_time：请求开始时间。
rl：分配请求的请求列表。
start_time_ns 和 io_start_time_ns：请求开始时间的纳秒表示。
nr_phys_segments 和 nr_integrity_segments：物理和完整性段的数量。
ioprio：IO优先级。
special：用于低级驱动程序使用的不透明指针。
tag：请求的标签。
errors：请求的错误数。
__cmd、cmd 和 cmd_len：命令缓冲区和命令长度。
extra_len：对齐和填充的长度。
sense_len 和 resid_len：感知数据的长度和剩余计数。
sense：感知数据。
deadline：请求的截止时间。
timeout_list、timeout 和 retries：超时处理相关的链表、超时时间和重试次数。
end_io 和 end_io_data：请求完成时的回调函数和数据。
next_rq：下一个请求（用于双向请求）。
lat_hist_io_start 和 lat_hist_enabled：用于延迟直方图的IO开始时间和启用标志。

generic_make_request

请求到达block层后，通过generic_make_request这个入口函数，再通过调用一系列相关的函数把bio变成了request。具体的做法如下：如果几个bio要读写的区域是连续的，就积攒成一个request（一个request上挂多个连续的bio，就是我们通常说的“合并bio请求”），如果一个bio跟其他的bio都连不上，那它就自己创建一个新的request，把自己挂在这个request下。当然，合并bio的个数也是有限的，这个可以通过配置文件配置。

/*
 * generic_make_request - hand a bio to the make_request_fn of its queue.
 * @bio: the bio to submit; bio->bi_next must be NULL (enforced by BUG_ON).
 *
 * Recursive submissions from inside a make_request_fn are not executed
 * recursively: when current->bio_list is already set, the bio is only
 * appended to that list and processed later by the outermost invocation's
 * loop.
 *
 * Returns BLK_QC_T_NONE when the bio fails generic_make_request_checks()
 * or is merely queued on current->bio_list; otherwise the value returned
 * by the most recent ->make_request_fn call made in the loop.
 */
blk_qc_t generic_make_request(struct bio *bio)
{
	/*
	 * bio_list_on_stack[0] contains bios submitted by the current
	 * make_request_fn.
	 * bio_list_on_stack[1] contains bios that were submitted before
	 * the current make_request_fn, but that haven't been processed
	 * yet.
	 */
	struct bio_list bio_list_on_stack[2];
	blk_qc_t ret = BLK_QC_T_NONE;

	if (!generic_make_request_checks(bio))
		goto out;

	/*
	 * We only want one ->make_request_fn to be active at a time, else
	 * stack usage with stacked devices could be a problem.  So use
	 * current->bio_list to keep a list of requests submitted by a
	 * make_request_fn function.  current->bio_list is also used as a
	 * flag to say if generic_make_request is currently active in this
	 * task or not.  If it is NULL, then no make_request is active.  If
	 * it is non-NULL, then a make_request is active, and new requests
	 * should be added at the tail
	 */
	if (current->bio_list) {
		bio_list_add(&current->bio_list[0], bio);
		goto out;
	}

	/* following loop may be a bit non-obvious, and so deserves some
	 * explanation.
	 * Before entering the loop, bio->bi_next is NULL (as all callers
	 * ensure that) so we have a list with a single bio.
	 * We pretend that we have just taken it off a longer list, so
	 * we assign bio_list to a pointer to the bio_list_on_stack,
	 * thus initialising the bio_list of new bios to be
	 * added.  ->make_request() may indeed add some more bios
	 * through a recursive call to generic_make_request.  If it
	 * did, we find a non-NULL value in bio_list and re-enter the loop
	 * from the top.  In this case we really did just take the bio
	 * off the top of the list (no pretending) and so remove it from
	 * bio_list, and call into ->make_request() again.
	 */
	BUG_ON(bio->bi_next);
	bio_list_init(&bio_list_on_stack[0]);
	current->bio_list = bio_list_on_stack;
	do {
		struct request_queue *q = bdev_get_queue(bio->bi_bdev);

		if (likely(blk_queue_enter(q, __GFP_DIRECT_RECLAIM) == 0)) {
			struct bio_list lower, same;

			/* Create a fresh bio_list for all subordinate requests */
			bio_list_on_stack[1] = bio_list_on_stack[0];
			bio_list_init(&bio_list_on_stack[0]);

			ret = q->make_request_fn(q, bio);

			blk_queue_exit(q);
			/* sort new bios into those for a lower level
			 * and those for the same level
			 */
			bio_list_init(&lower);
			bio_list_init(&same);
			while ((bio = bio_list_pop(&bio_list_on_stack[0])) != NULL)
				if (q == bdev_get_queue(bio->bi_bdev))
					bio_list_add(&same, bio);
				else
					bio_list_add(&lower, bio);
			/* now assemble so we handle the lowest level first */
			bio_list_merge(&bio_list_on_stack[0], &lower);
			bio_list_merge(&bio_list_on_stack[0], &same);
			bio_list_merge(&bio_list_on_stack[0], &bio_list_on_stack[1]);
		} else {
			/* could not enter the queue: fail this bio, keep going */
			bio_io_error(bio);
		}
		bio = bio_list_pop(&bio_list_on_stack[0]);
	} while (bio);
	current->bio_list = NULL; /* deactivate */

out:
	return ret;
}

首先，定义了一个bio_list_on_stack数组，其中bio_list_on_stack[0]用于存储当前make_request_fn提交的bio，bio_list_on_stack[1]用于存储之前提交但尚未处理的bio。
然后，初始化ret为BLK_QC_T_NONE，表示没有请求完成。
调用generic_make_request_checks函数对bio进行检查，如果检查失败，则跳转到out标签处。
判断当前是否已有活动的make_request_fn，通过检查current->bio_list是否为空。如果不为空，将当前的bio添加到current->bio_list[0]中，并跳转到out标签处。
进入一个循环，处理bio链表中的每个bio请求。
在循环中，首先获取bio->bi_bdev对应的请求队列q。
判断是否可以进入请求队列，如果可以，则执行以下操作：
- 创建一个新的bio_list用于存储下层请求。
- 调用请求队列的make_request_fn函数处理当前bio，返回值赋给ret。
- 退出请求队列。
- 将新的请求按照下层和同层进行分类，存放到lower和same两个bio_list中。
- 合并lower、same以及之前存储的bio_list_on_stack[1]中的请求，并存放到bio_list_on_stack[0]中。
如果无法进入请求队列，说明发生了错误，调用bio_io_error函数处理当前bio的错误。
弹出bio_list_on_stack[0]中的下一个bio，继续处理下一个请求。
重复上述循环中的步骤，直到处理完所有的请求。
将current->bio_list设置为NULL，表示当前没有活动的make_request_fn。
返回ret，表示请求的完成状态。

blk_get_request

blk_get_request函数根据请求队列的类型选择相应的方法来获取请求。

首先，函数检查请求队列的mq_ops字段是否存在。如果存在，表示使用了多队列（multi-queue）的块设备驱动模型，进入相应的代码分支。

在多队列模型下，函数调用blk_mq_alloc_request函数来分配一个请求。该函数会根据指定的请求队列q、读写方向rw和内存分配标志gfp_mask来创建一个新的请求结构，并返回该请求的指针。

如果请求队列的mq_ops字段不存在，表示使用的是旧的块设备驱动模型，进入另一个代码分支。

在旧的模型下，函数调用blk_old_get_request函数来获取请求。该函数会根据指定的请求队列q、读写方向rw和内存分配标志gfp_mask来获取一个已经存在的请求结构，并返回该请求的指针。

/*
 * blk_get_request - obtain a request from @q.
 * @q:        queue to allocate from
 * @rw:       data direction, forwarded unchanged to the allocator
 * @gfp_mask: allocation flags, forwarded unchanged to the allocator
 *
 * Dispatches on q->mq_ops: queues without blk-mq operations use the
 * legacy allocator, all others go through blk_mq_alloc_request() with
 * reserved == false.  The chosen helper's return value is passed through.
 */
struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask)
{
	/* Legacy (single-queue) path. */
	if (!q->mq_ops)
		return blk_old_get_request(q, rw, gfp_mask);

	/* Multi-queue path; never asks for a reserved request. */
	return blk_mq_alloc_request(q, rw, gfp_mask, false);
}

blk_mq_alloc_request

/*
 * blk_mq_alloc_request - allocate a request from a blk-mq queue.
 * @q:        queue to allocate from
 * @rw:       data direction, forwarded to __blk_mq_alloc_request()
 * @gfp:      allocation flags
 * @reserved: forwarded to blk_mq_set_alloc_data()
 *
 * Makes a first attempt with __GFP_DIRECT_RECLAIM masked out of @gfp.  If
 * that yields nothing and the caller did pass __GFP_DIRECT_RECLAIM, the
 * hardware queue is run and the allocation is retried with the caller's
 * flags unchanged.
 *
 * Returns the request on success.  On failure returns ERR_PTR() of the
 * blk_queue_enter() error, or ERR_PTR(-EWOULDBLOCK) when no request could
 * be allocated (blk_queue_exit() has been called in that case).
 */
struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp,
		bool reserved)
{
	struct blk_mq_alloc_data data;
	struct blk_mq_hw_ctx *hw_ctx;
	struct blk_mq_ctx *sw_ctx;
	struct request *req;
	int err;

	err = blk_queue_enter(q, gfp);
	if (err)
		return ERR_PTR(err);

	/* First attempt: caller's flags minus __GFP_DIRECT_RECLAIM. */
	sw_ctx = blk_mq_get_ctx(q);
	hw_ctx = q->mq_ops->map_queue(q, sw_ctx->cpu);
	blk_mq_set_alloc_data(&data, q, gfp & ~__GFP_DIRECT_RECLAIM,
			reserved, sw_ctx, hw_ctx);
	req = __blk_mq_alloc_request(&data, rw);

	if (!req && (gfp & __GFP_DIRECT_RECLAIM)) {
		/* Run the hw queue and drop the ctx before retrying. */
		__blk_mq_run_hw_queue(hw_ctx);
		blk_mq_put_ctx(sw_ctx);

		/* Second attempt: caller's flags as given. */
		sw_ctx = blk_mq_get_ctx(q);
		hw_ctx = q->mq_ops->map_queue(q, sw_ctx->cpu);
		blk_mq_set_alloc_data(&data, q, gfp, reserved, sw_ctx, hw_ctx);
		req = __blk_mq_alloc_request(&data, rw);

		/*
		 * Release whichever ctx the allocation data now refers to
		 * (NOTE(review): presumably it can change during the
		 * allocation; confirm in __blk_mq_alloc_request()).
		 */
		sw_ctx = data.ctx;
	}
	blk_mq_put_ctx(sw_ctx);

	if (req)
		return req;

	blk_queue_exit(q);
	return ERR_PTR(-EWOULDBLOCK);
}

首先，函数调用blk_queue_enter函数“进入”请求队列。该函数并不获取队列的自旋锁，而是增加队列的使用计数（q_usage_counter）；如果队列正处于冻结状态，则根据内存分配标志gfp决定是睡眠等待还是直接返回错误。进入失败时，blk_mq_alloc_request直接返回ERR_PTR(ret)。

接下来，函数获取请求队列的上下文ctx和硬件队列上下文hctx。上下文用于跟踪请求的状态和执行信息。通过调用请求队列的map_queue方法，函数根据上下文的CPU绑定信息将请求映射到相应的硬件队列上下文。

然后，函数设置请求分配数据结构alloc_data，其中包含了请求队列、内存分配标志、是否保留等信息。

接着，函数调用__blk_mq_alloc_request函数进行请求的分配。该函数根据分配数据结构alloc_data和读写方向rw来创建一个新的请求结构，并返回该请求的指针。如果分配失败并且内存分配标志gfp中包含__GFP_DIRECT_RECLAIM标志，函数会尝试运行硬件队列以释放一些资源，并重新获取上下文和硬件队列，然后再次尝试分配请求。

在获取到请求后，函数通过调用blk_mq_put_ctx函数释放上下文。

最后，函数检查请求是否为空。如果为空，表示请求分配失败，函数调用blk_queue_exit函数退出请求队列的临界区，并返回一个错误指针ERR_PTR(-EWOULDBLOCK)。

blk_old_get_request

/*
 * blk_old_get_request - get a request through the legacy (non-mq) path.
 * @q:        queue to allocate from
 * @rw:       must be READ or WRITE, anything else trips BUG_ON()
 * @gfp_mask: allocation flags, also used for the io context
 *
 * Returns whatever get_request() returned (a request or an ERR_PTR value).
 * q->queue_lock is not held on return.
 */
static struct request *blk_old_get_request(struct request_queue *q, int rw,
		gfp_t gfp_mask)
{
	struct request *req;

	BUG_ON(rw != READ && rw != WRITE);

	/* Set up the io context before taking the queue lock. */
	create_io_context(gfp_mask, q->node);

	spin_lock_irq(q->queue_lock);
	req = get_request(q, rw, NULL, gfp_mask);
	if (IS_ERR(req)) {
		/* Only the error path unlocks here. */
		spin_unlock_irq(q->queue_lock);
	}
	/*
	 * Either way q->queue_lock is unlocked at this point
	 * (NOTE(review): on success get_request() is presumably the one
	 * that dropped it; confirm there).
	 */

	return req;
}

首先，函数调用BUG_ON宏来断言读写方向rw必须是READ或WRITE。接下来，函数调用create_io_context函数来创建 I/O 上下文。I/O 上下文用于跟踪请求的相关信息。该函数会根据指定的内存分配标志gfp_mask和节点信息q->node来创建一个新的 I/O 上下文。

然后，函数获取请求队列的队列锁queue_lock，并进入自旋锁保护的临界区。在临界区内，函数调用get_request函数来获取请求。该函数会根据指定的请求队列q、读写方向rw、请求关联的块设备NULL和内存分配标志gfp_mask来获取一个已经存在的请求结构，并返回该请求的指针。如果获取请求失败，函数会调用spin_unlock_irq函数解锁队列锁。

IO调度算法简介

Noop算法

最简单的 I/O调度算法。该算法仅适当合并用户请求，并不排序请求：新的请求通常被插在调度队列的开头或末尾，下一个要处理的请求总是队列中的第一个请求。这种算法是为不需要寻道的块设备设计的，如SSD。

CFQ算法

“CFQ（完全公平队列）”算法的主要目标是在触发I/O请求的所有进程中确保磁盘I/O带宽的公平分配。为了达到这个目标，算法使用许多个排序队列——缺省为64。它们存放了不同进程发出的请求。当算法处理一个请求时，内核调用一个散列函数将当前进程的线程组标识符(PID)转换为队列的索引值；然后，算法将一个新的请求插入该队列的末尾。因此，同一个进程发出的请求通常被插入相同的队列中。

算法本质上采用轮询方式扫描I/O输入队列，选择第一个非空队列，依次调度不同队列中特定个数(公平)的请求，然后将这些请求移动到调度队列的末尾。

最后期限算法

除了调度队列外，“最后期限”算法还使用了四个队列。其中的两个排序队列分别包含读请求和写请求，其中的请求是根据起始扇区号排序的。另外两个最后期限队列包含相同的读和写请求，但这是根据它们的“最后期限”排序的。引入这些队列是为了避免请求饿死，由于电梯策略(曾经的调度算法)优先处理与上一个所处理的请求最近的请求，因而就会对某个请求忽略很长一段时间，这时就会发生这种情况。请求的最后期限本质上就是一个超时定时器，当请求被传给电梯算法时开始计时。缺省情况下，读请求的超时时间是500ms，写请求的超时时间是5s——读请求优先于写请求，因为读请求通常阻塞发出请求的进程。最后期限保证了调度程序照顾等待很长一段时间的那个请求，即使它位于排序队列的末尾。

当算法要补充调度队列时，首先确定下一个请求的数据方向。如果同时要调度读和写两个请求，算法会选择“读”方向，除非该“写”方向已经被放弃很多次了（为了避免写请求饿死）。

接下来，算法检查与被选择方向相关的最后期限队列：如果队列中的第一个请求的最后期限已用完，那么算法将该请求移到调度队列的末尾。同时，它也会移动该过期的请求后面的一组来自排序队列的相同扇区号的请求。如果将要移动的请求在磁盘上物理相邻，那么这一批队列的长度会很长，否则就很短。

最后，如果没有请求超时，算法对来自于排序队列的最后一个请求连带之后的一组相同扇区的请求进行调度。当指针到达排序队列的末尾时，搜索又从头开始（“单方向算法”）。

预期算法

“预期”算法是Linux提供的最复杂的一种I/O调度算法。基本上，它是“最后期限”算法的一个演变，借用了“最后期限”算法的基本机制：两个最后期限队列和两个排序队列；I/O调度程序在读和写请求之间交互扫描排序队列，不过更倾向于读请求。扫描基本上是连续的，除非有某个请求超时。读请求的缺省超时时间是125ms，写请求的缺省超时时间是250ms。但是，该算法还遵循一些附加的启发式准则：

有些情况下，算法可能选择排序队列当前位置之前的一个请求，从而强制磁头反向寻道。这种情况通常发生在：该请求（位于当前位置之前）的寻道距离小于排序队列当前位置之后那个请求的寻道距离的一半时。

算法统计系统中每个进程触发的I/O操作的种类。当刚刚调度了由某个进程p发出的一个读请求之后，算法马上检查排序队列中的下一个请求是否来自同一个进程p。如果是，立即调度下一个请求。否则，查看关于该进程p的统计信息：如果确定进程p可能很快发出另一个读请求，那么就延迟一小段时间（缺省大约为7ms）。因此，算法预测进程p发出的读请求与刚被调度的请求在磁盘上可能是“近邻”。

关于调度算法详细的介绍可以参考：https://zhuanlan.zhihu.com/p/548619385

本文参考

https://blog.csdn.net/weixin_43780260/article/details/88993543

《Linux内核设计与实现》

标签：bio,请求,队列,request,list,调度,IO,驱动,struct
From： https://www.cnblogs.com/dongxb/p/18007825