
epoll Internals


1. Process and socket structures

[Figure: process fd table, file, and socket, with the socket's wait queue]

2. A simplified accept flow

[Figure: accept flow]

SYSCALL_DEFINE4(accept4, int, fd, struct sockaddr __user *, upeer_sockaddr,
		int __user *, upeer_addrlen, int, flags)
{
	struct socket *sock, *newsock;
	struct file *newfile;
	int err, len, newfd, fput_needed;
	struct sockaddr_storage address;

	if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
		return -EINVAL;

	if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK))
		flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
	// Look up the listening socket by fd
	sock = sockfd_lookup_light(fd, &err, &fput_needed);
	if (!sock)
		goto out;

	err = -ENFILE;
	newsock = sock_alloc();
	if (!newsock)
		goto out_put;

	newsock->type = sock->type;
	newsock->ops = sock->ops;

	/*
	 * We don't need try_module_get here, as the listening socket (sock)
	 * has the protocol module (sock->ops->owner) held.
	 */
	__module_get(newsock->ops->owner);

	newfd = get_unused_fd_flags(flags);
	if (unlikely(newfd < 0)) {
		err = newfd;
		sock_release(newsock);
		goto out_put;
	}
	// Allocate a new file object and attach it to the new socket
	newfile = sock_alloc_file(newsock, flags, sock->sk->sk_prot_creator->name);
	if (IS_ERR(newfile)) {
		err = PTR_ERR(newfile);
		put_unused_fd(newfd);
		sock_release(newsock);
		goto out_put;
	}

	err = security_socket_accept(sock, newsock);
	if (err)
		goto out_fd;
	// Accept the connection
	err = sock->ops->accept(sock, newsock, sock->file->f_flags);
	if (err < 0)
		goto out_fd;

	if (upeer_sockaddr) {
		if (newsock->ops->getname(newsock, (struct sockaddr *)&address,
					  &len, 2) < 0) {
			err = -ECONNABORTED;
			goto out_fd;
		}
		err = move_addr_to_user(&address,
					len, upeer_sockaddr, upeer_addrlen);
		if (err < 0)
			goto out_fd;
	}

	/* File flags are not inherited via accept() unlike another OSes. */

	fd_install(newfd, newfile);
	err = newfd;

out_put:
	fput_light(sock->file, fput_needed);
out:
	return err;
out_fd:
	fput(newfile);
	put_unused_fd(newfd);
	goto out_put;
}
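
From user space, the path above is reached through the accept4() wrapper. The following is a minimal sketch, assuming listen_fd is an already bound and listening socket:

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/socket.h>
#include <netinet/in.h>

/* Minimal sketch: accept one connection on an already-listening fd.
 * listen_fd is assumed to come from socket()/bind()/listen(). */
int accept_one(int listen_fd)
{
	struct sockaddr_in peer;
	socklen_t len = sizeof(peer);

	/* Traps into SYSCALL_DEFINE4(accept4, ...) above; SOCK_NONBLOCK
	 * is translated to O_NONBLOCK on the new file, as the code shows. */
	int conn_fd = accept4(listen_fd, (struct sockaddr *)&peer, &len,
			      SOCK_NONBLOCK | SOCK_CLOEXEC);
	if (conn_fd < 0)
		perror("accept4");
	return conn_fd;
}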

3. The eventpoll structure and the process

[Figure: eventpoll structure and its lists]

3.1 Key structures inside eventpoll

  • wq: the wait queue used by epoll_wait
  • rdllist: the ready list
  • rbr: the red-black tree holding the epitem nodes

3.2 What the members mean

  • wq: the wait-queue list head
  • rbr: manages the socket connections the user process has registered
  • rdllist: the list of ready descriptors; when a connection becomes ready, the kernel puts it on rdllist, so the application only has to walk this list to find the ready connections

[Figure: eventpoll wait queue, ready list, and red-black tree]
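
For reference, here is a condensed sketch of struct eventpoll showing only the members discussed here; field names follow fs/eventpoll.c of this kernel generation, and everything else is omitted:

/* Condensed from fs/eventpoll.c (3.x-era kernels); other fields omitted. */
struct eventpoll {
	spinlock_t lock;
	struct mutex mtx;		/* serializes epoll_ctl operations */
	wait_queue_head_t wq;		/* processes sleeping in epoll_wait() */
	wait_queue_head_t poll_wait;	/* used when the epoll fd itself is polled */
	struct list_head rdllist;	/* ready list: epitems with pending events */
	struct rb_root rbr;		/* red-black tree of registered epitems */
	struct epitem *ovflist;		/* overflow list while rdllist is scanned */
	struct user_struct *user;	/* owner, for the max_user_watches limit */
	struct file *file;		/* anonymous file backing this instance */
};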

3.3 Related source

SYSCALL_DEFINE1(epoll_create1, int, flags)
{
	int error, fd;
	struct eventpoll *ep = NULL;
	struct file *file;

	/* Check the EPOLL_* constant for consistency.  */
	BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);

	if (flags & ~EPOLL_CLOEXEC)
		return -EINVAL;
	/*
	 * Create the internal data structure ("struct eventpoll").
	 */
	// Allocate and initialize the eventpoll object
	error = ep_alloc(&ep);
	if (error < 0)
		return error;
	/*
	 * Creates all the items needed to setup an eventpoll file. That is,
	 * a file structure and a free file descriptor.
	 */
	// Grab an unused file descriptor
	fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC));
	if (fd < 0) {
		error = fd;
		goto out_free_ep;
	}
	// Bind the epoll file operations to the file and hang the eventpoll object on file->private_data
	file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep,
				 O_RDWR | (flags & O_CLOEXEC));
	if (IS_ERR(file)) {
		error = PTR_ERR(file);
		goto out_free_fd;
	}
	ep->file = file;
	// Install the file object into the file descriptor fd
	fd_install(fd, file);
	return fd;

out_free_fd:
	put_unused_fd(fd);
out_free_ep:
	ep_free(ep);
	return error;
}
static int ep_alloc(struct eventpoll **pep)
{
	int error;
	struct user_struct *user;
	struct eventpoll *ep;

	user = get_current_user();
	error = -ENOMEM;
	// Allocate the eventpoll structure
	ep = kzalloc(sizeof(*ep), GFP_KERNEL);
	if (unlikely(!ep))
		goto free_uid;

	spin_lock_init(&ep->lock);
	mutex_init(&ep->mtx);
	// Initialize the wait queue heads
	init_waitqueue_head(&ep->wq);
	init_waitqueue_head(&ep->poll_wait);
	// Initialize the ready list
	INIT_LIST_HEAD(&ep->rdllist);
	ep->rbr = RB_ROOT;
	ep->ovflist = EP_UNACTIVE_PTR;
	ep->user = user;

	*pep = ep;

	return 0;

free_uid:
	free_uid(user);
	return error;
}
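
From user space, everything above collapses into a single call; a minimal sketch:

#include <stdio.h>
#include <sys/epoll.h>

int main(void)
{
	/* Traps into SYSCALL_DEFINE1(epoll_create1, ...) above: allocates the
	 * eventpoll object and returns the fd of the anonymous [eventpoll] file. */
	int epfd = epoll_create1(EPOLL_CLOEXEC);
	if (epfd < 0) {
		perror("epoll_create1");
		return 1;
	}
	printf("epoll fd = %d\n", epfd);
	return 0;
}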

3.4 Revisiting the handshake internals

[Figure: handshake internals]

3.5 Adding a socket with epoll_ctl

As above, several client socket connections have been established and the epoll object has been created. Registering a socket with epoll_ctl does three things:

  • Allocate a red-black tree node, the epitem
  • Add a wait entry to the socket's wait queue
  • Insert the epitem into the eventpoll object's red-black tree

[Figure: epitem hooked into the socket's wait queue and the eventpoll red-black tree]

As the figure shows, once an event occurs on eventpoll, the kernel can locate the corresponding epitem through the red-black tree and thus quickly reach the matching socket object.
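
From user space this whole sequence is triggered by one epoll_ctl(EPOLL_CTL_ADD) call; a minimal sketch, assuming epfd comes from epoll_create1() and conn_fd from accept4():

#include <stdio.h>
#include <sys/epoll.h>

/* Sketch: register conn_fd with epfd for readable events.
 * epfd and conn_fd are assumptions (see the sections above). */
int watch_fd(int epfd, int conn_fd)
{
	struct epoll_event ev = {
		.events = EPOLLIN,	/* the kernel ORs in POLLERR | POLLHUP itself */
		.data   = { .fd = conn_fd },
	};
	/* Traps into SYSCALL_DEFINE4(epoll_ctl, ...) below, which allocates
	 * the epitem, hooks the socket's wait queue, and inserts it into
	 * the red-black tree via ep_insert(). */
	if (epoll_ctl(epfd, EPOLL_CTL_ADD, conn_fd, &ev) < 0) {
		perror("epoll_ctl(ADD)");
		return -1;
	}
	return 0;
}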

3.6 epoll_ctl source

SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
		struct epoll_event __user *, event)
{
	int error;
	int full_check = 0;
	struct fd f, tf;
	struct eventpoll *ep;
	struct epitem *epi;
	struct epoll_event epds;
	struct eventpoll *tep = NULL;

	error = -EFAULT;
	if (ep_op_has_event(op) &&
	    copy_from_user(&epds, event, sizeof(struct epoll_event)))
		goto error_return;

	error = -EBADF;
	// Find the corresponding eventpoll object by epfd
	f = fdget(epfd);
	if (!f.file)
		goto error_return;

	/* Get the "struct file *" for the target file */
	// Find the file object for the target socket fd
	tf = fdget(fd);
	if (!tf.file)
		goto error_fput;

	/* The target file descriptor must support poll */
	error = -EPERM;
	if (!tf.file->f_op->poll)
		goto error_tgt_fput;

	/* Check if EPOLLWAKEUP is allowed */
	if (ep_op_has_event(op))
		ep_take_care_of_epollwakeup(&epds);

	...........
	epi = ep_find(ep, tf.file, fd);

	error = -EINVAL;
	switch (op) {
	case EPOLL_CTL_ADD:
		if (!epi) {
			epds.events |= POLLERR | POLLHUP;
			error = ep_insert(ep, &epds, tf.file, fd, full_check);
		} else
			error = -EEXIST;
		if (full_check)
			clear_tfile_check_list();
		break;
	case EPOLL_CTL_DEL:
		if (epi)
			error = ep_remove(ep, epi);
		else
			error = -ENOENT;
		break;
	case EPOLL_CTL_MOD:
		if (epi) {
			epds.events |= POLLERR | POLLHUP;
			error = ep_modify(ep, epi, &epds);
		} else
			error = -ENOENT;
		break;
	}
..............

	return error;
}
static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
		     struct file *tfile, int fd, int full_check)
{
	int error, revents, pwake = 0;
	unsigned long flags;
	long user_watches;
	struct epitem *epi;
	struct ep_pqueue epq;

	user_watches = atomic_long_read(&ep->user->epoll_watches);
	if (unlikely(user_watches >= max_user_watches))
		return -ENOSPC;
	if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL)))
		return -ENOMEM;

	/* Item initialization follow here ... */
	INIT_LIST_HEAD(&epi->rdllink);
	INIT_LIST_HEAD(&epi->fllink);
	INIT_LIST_HEAD(&epi->pwqlist);
	epi->ep = ep;
	ep_set_ffd(&epi->ffd, tfile, fd);
	epi->event = *event;
	epi->nwait = 0;
	epi->next = EP_UNACTIVE_PTR;
	if (epi->event.events & EPOLLWAKEUP) {
		error = ep_create_wakeup_source(epi);
		if (error)
			goto error_create_wakeup_source;
	} else {
		RCU_INIT_POINTER(epi->ws, NULL);
	}

	/* Initialize the poll table using the queue callback */
	// Set up the poll-table callback that will hook the socket's wait queue
	epq.epi = epi;
	init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);

	/*
	 * Attach the item to the poll hooks and get current event bits.
	 * We can safely use the file* here because its usage count has
	 * been increased by the caller of this function. Note that after
	 * this operation completes, the poll callback can start hitting
	 * the new item.
	 */
	revents = ep_item_poll(epi, &epq.pt);

	/*
	 * We have to check if something went wrong during the poll wait queue
	 * install process. Namely an allocation for a wait queue failed due
	 * high memory pressure.
	 */
	error = -ENOMEM;
	if (epi->nwait < 0)
		goto error_unregister;

	/* Add the current item to the list of active epoll hook for this file */
	spin_lock(&tfile->f_lock);
	list_add_tail_rcu(&epi->fllink, &tfile->f_ep_links);
	spin_unlock(&tfile->f_lock);

	/*
	 * Add the current item to the RB tree. All RB tree operations are
	 * protected by "mtx", and ep_insert() is called with "mtx" held.
	 */
	ep_rbtree_insert(ep, epi);

	/* now check if we've created too many backpaths */
	error = -EINVAL;
	if (full_check && reverse_path_check())
		goto error_remove_epi;

	/* We have to drop the new item inside our item list to keep track of it */
	spin_lock_irqsave(&ep->lock, flags);

	/* If the file is already "ready" we drop it inside the ready list */
	if ((revents & event->events) && !ep_is_linked(&epi->rdllink)) {
		list_add_tail(&epi->rdllink, &ep->rdllist);
		ep_pm_stay_awake(epi);

		/* Notify waiting tasks that events are available */
		if (waitqueue_active(&ep->wq))
			wake_up_locked(&ep->wq);
		if (waitqueue_active(&ep->poll_wait))
			pwake++;
	}

	spin_unlock_irqrestore(&ep->lock, flags);

	atomic_long_inc(&ep->user->epoll_watches);

	/* We have to call this outside the lock */
	if (pwake)
		ep_poll_safewake(&ep->poll_wait);

	return 0;

error_remove_epi:
	spin_lock(&tfile->f_lock);
	list_del_rcu(&epi->fllink);
	spin_unlock(&tfile->f_lock);

	rb_erase(&epi->rbn, &ep->rbr);

error_unregister:
	ep_unregister_pollwait(ep, epi);

	/*
	 * We need to do this because an event could have been arrived on some
	 * allocated wait queue. Note that we don't care about the ep->ovflist
	 * list, since that is used/cleaned only inside a section bound by "mtx".
	 * And ep_insert() is called with "mtx" held.
	 */
	spin_lock_irqsave(&ep->lock, flags);
	if (ep_is_linked(&epi->rdllink))
		list_del_init(&epi->rdllink);
	spin_unlock_irqrestore(&ep->lock, flags);

	wakeup_source_unregister(ep_wakeup_source(epi));

error_create_wakeup_source:
	kmem_cache_free(epi_cache, epi);

	return error;
}
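
ep_insert() registers ep_ptable_queue_proc via init_poll_funcptr(), but the function itself is not quoted above. In kernels of this generation it looks roughly like the following (lightly trimmed from fs/eventpoll.c); it is what actually hangs ep_poll_callback on the socket's wait queue:

/* Called back from the socket's poll() during ep_item_poll():
 * allocate an eppoll_entry and hook ep_poll_callback onto the
 * socket's wait queue. Lightly trimmed from fs/eventpoll.c. */
static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
				 poll_table *pt)
{
	struct epitem *epi = ep_item_from_epqueue(pt);
	struct eppoll_entry *pwq;

	if (epi->nwait >= 0 && (pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL))) {
		/* No task attached: private is NULL, func is ep_poll_callback */
		init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
		pwq->whead = whead;
		pwq->base = epi;	/* lets the callback find the epitem later */
		add_wait_queue(whead, &pwq->wait);
		list_add_tail(&pwq->llink, &epi->pwqlist);
		epi->nwait++;
	} else {
		/* Signal that an allocation error occurred */
		epi->nwait = -1;
	}
}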

3.7 Waiting for events with epoll_wait

  • Check whether the eventpoll rdllist contains entries; if it does, return them. If not, create a wait-queue entry, add it to the eventpoll's wait queue, and block the current process. A minimal user-space wait loop follows the figure below.

[Figure: epoll_wait blocking on the eventpoll wait queue]
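
In user space the waiting side is a single loop; a minimal sketch, assuming epfd comes from epoll_create1() and handle_ready() is a hypothetical per-event handler:

#include <stdio.h>
#include <sys/epoll.h>

#define MAX_EVENTS 64

/* Hypothetical handler for a ready descriptor. */
static void handle_ready(int fd, unsigned int events)
{
	printf("fd %d ready, events 0x%x\n", fd, events);
}

/* Sketch: block until events arrive, then walk the returned batch. */
void event_loop(int epfd)
{
	struct epoll_event events[MAX_EVENTS];

	for (;;) {
		/* Traps into SYSCALL_DEFINE4(epoll_wait, ...) below and sleeps
		 * on ep->wq until ep_poll_callback fills the ready list. */
		int n = epoll_wait(epfd, events, MAX_EVENTS, -1);
		if (n < 0) {
			perror("epoll_wait");
			break;
		}
		for (int i = 0; i < n; i++)
			handle_ready(events[i].data.fd, events[i].events);
	}
}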

3.8 epoll_wait source

SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
		int, maxevents, int, timeout)
{
	int error;
	struct fd f;
	struct eventpoll *ep;

	/* The maximum number of event must be greater than zero */
	if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
		return -EINVAL;

	/* Verify that the area passed by the user is writeable */
	if (!access_ok(VERIFY_WRITE, events, maxevents * sizeof(struct epoll_event)))
		return -EFAULT;

	/* Get the "struct file *" for the eventpoll file */
	f = fdget(epfd);
	if (!f.file)
		return -EBADF;

	/*
	 * We have to check that the file structure underneath the fd
	 * the user passed to us _is_ an eventpoll file.
	 */
	error = -EINVAL;
	if (!is_file_epoll(f.file))
		goto error_fput;

	/*
	 * At this point it is safe to assume that the "private_data" contains
	 * our own data structure.
	 */
	ep = f.file->private_data;

	/* Time to fish for events ... */
	error = ep_poll(ep, events, maxevents, timeout);

error_fput:
	fdput(f);
	return error;
}
static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
		   int maxevents, long timeout)
{
	int res = 0, eavail, timed_out = 0;
	unsigned long flags;
	u64 slack = 0;
	wait_queue_t wait;
	ktime_t expires, *to = NULL;

	if (timeout > 0) {
		struct timespec end_time = ep_set_mstimeout(timeout);

		slack = select_estimate_accuracy(&end_time);
		to = &expires;
		*to = timespec_to_ktime(end_time);
	} else if (timeout == 0) {
		/*
		 * Avoid the unnecessary trip to the wait queue loop, if the
		 * caller specified a non blocking operation.
		 */
		timed_out = 1;
		spin_lock_irqsave(&ep->lock, flags);
		goto check_events;
	}

fetch_events:
	spin_lock_irqsave(&ep->lock, flags);
	// Check whether any events are pending on the ready list
	if (!ep_events_available(ep)) {
		/*
		 * We don't have any available event to return to the caller.
		 * We need to sleep here, and we will be wake up by
		 * ep_poll_callback() when events will become available.
		 */
		// Create a wait-queue entry bound to the current process
		init_waitqueue_entry(&wait, current);
		// Add the new entry to the eventpoll's wait queue
		__add_wait_queue_exclusive(&ep->wq, &wait);

		for (;;) {
			/*
			 * We don't want to sleep if the ep_poll_callback() sends us
			 * a wakeup in between. That's why we set the task state
			 * to TASK_INTERRUPTIBLE before doing the checks.
			 */
			set_current_state(TASK_INTERRUPTIBLE);
			if (ep_events_available(ep) || timed_out)
				break;
			if (signal_pending(current)) {
				res = -EINTR;
				break;
			}

			spin_unlock_irqrestore(&ep->lock, flags);
			// Yield the CPU and go to sleep
			if (!freezable_schedule_hrtimeout_range(to, slack,
								HRTIMER_MODE_ABS))
				timed_out = 1;

			spin_lock_irqsave(&ep->lock, flags);
		}

	...................

	return res;
}

3.9 When data arrives

  • After the epoll_wait of the previous section, there are now two wait queues: the socket's wait queue and the eventpoll's wait queue.
  • On the socket's wait queue the callback is ep_poll_callback; on the eventpoll's wait queue the callback is default_wake_function.
  • Moreover, the private field of the socket's wait entry is unused (the socket has been handed over to epoll) and points to NULL, whereas the private field of the eventpoll's wait entry points to the waiting user process. The initializers shown after the figure make the contrast explicit.

[Figure: the two wait queues and their callbacks]
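
The contrast between the two entries is easiest to see in the wait-queue helpers themselves; these two initializers are from the wait.h of this kernel generation:

//file: include/linux/wait.h
static inline void init_waitqueue_entry(wait_queue_t *q, struct task_struct *p)
{
    q->flags = 0;
    q->private = p;                    /* the task to wake */
    q->func = default_wake_function;   /* really wakes the task */
}

static inline void init_waitqueue_func_entry(wait_queue_t *q,
                                             wait_queue_func_t func)
{
    q->flags = 0;
    q->private = NULL;                 /* no task: epoll manages the socket */
    q->func = func;                    /* ep_poll_callback for epoll's hook */
}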

3.10 The data-arrival processing flow

  • When data arrives at the NIC and passes through the driver into the protocol stack, TCP processing starts at tcp_v4_rcv, which uses the source and destination fields in the packet header to look up the corresponding socket on this host. Once found, the TCP connection state is checked; assume here it is ESTABLISHED.
  • tcp_rcv_established then calls tcp_queue_rcv, which places the received data on the socket's receive queue.
  • After tcp_queue_rcv finishes, sk_data_ready is called to wake up any user process waiting on the socket.
  • When data becomes ready on the socket, the kernel enters through sock_def_readable and reaches ep_poll_callback, the callback that epoll_ctl installed when the socket was added.
  • wake_up_interruptible_sync_poll only invokes the callbacks installed on the wait queue; it does not necessarily wake up a process.
//file: net/core/sock.c
static void sock_def_readable(struct sock *sk, int len)
{
    struct socket_wq *wq;

    rcu_read_lock();
    wq = rcu_dereference(sk->sk_wq);
    // Instead, check that the wait queue is not empty
    if (wq_has_sleeper(wq))
        // Invoke the callbacks on the wait-queue entries
        wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
                        POLLRDNORM | POLLRDBAND);
    sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
    rcu_read_unlock();
}
//file: include/linux/wait.h
#define wake_up_interruptible_sync_poll(x, m)       \
    __wake_up_sync_key((x), TASK_INTERRUPTIBLE, 1, (void *) (m))
//file: kernel/sched/core.c
void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
            int nr_exclusive, void *key)
{
    ...
    __wake_up_common(q, mode, nr_exclusive, wake_flags, key);
}
static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
            int nr_exclusive, int wake_flags, void *key)
{
    wait_queue_t *curr, *next;

    list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
        unsigned flags = curr->flags;

        if (curr->func(curr, mode, wake_flags, key) &&
                (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
            break;
    }
}
  • curr->func is the function that ep_insert set to ep_poll_callback.
  • Inside ep_poll_callback, the base pointer of the wait-queue entry leads to the epitem, and from there to the owning eventpoll object.
  • It first adds the epitem to the eventpoll's ready list, then checks whether the eventpoll's wait queue has a waiter (epoll_wait installs one when it runs).
  • If there is a waiter, it looks up the callback stored in that wait entry.
  • That again means calling curr->func, which for the entry installed at epoll_wait time is default_wake_function.
  • default_wake_function finds the process descriptor stored in the wait entry and wakes it. A condensed sketch of ep_poll_callback follows.
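
Putting those steps together, here is a heavily condensed sketch of ep_poll_callback; locking, the ovflist path, and the event-mask checks are omitted, so treat it as an outline rather than the real function:

/* Condensed outline of fs/eventpoll.c:ep_poll_callback();
 * locking, ovflist handling, and event-mask filtering omitted. */
static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
	/* pwq->base was set in ep_ptable_queue_proc: wait entry -> epitem */
	struct epitem *epi = ep_item_from_wait(wait);
	struct eventpoll *ep = epi->ep;

	/* 1. Put the epitem on the ready list if it is not already there */
	if (!ep_is_linked(&epi->rdllink))
		list_add_tail(&epi->rdllink, &ep->rdllist);

	/* 2. If a task sleeps in epoll_wait(), walk ep->wq and invoke each
	 *    entry's func, i.e. default_wake_function, which wakes the task */
	if (waitqueue_active(&ep->wq))
		wake_up_locked(&ep->wq);

	return 1;
}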

4. Summary

  epoll execution is split into two parts:
  1. The user process in kernel mode: calls such as epoll_wait trap into the kernel to run. This part only checks the ready list and, when nothing is ready, blocks the current process and yields the CPU.
  2. The downstream data path, i.e. the interrupt context: when NIC data arrives, the kernel runs the protocol stack and places the data on the socket's receive queue. For epoll, it then finds the epitem associated with the socket, adds it to the eventpoll object's ready list, checks whether a process is blocked on that eventpoll, and wakes it if so. The minimal end-to-end sketch below exercises both halves from user space.
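
To tie the two halves together, here is a minimal end-to-end sketch: a nonblocking TCP echo server that exercises every syscall walked through above. Port 8080 is an arbitrary choice, and error handling is kept to a bare minimum:

#define _GNU_SOURCE
#include <stdio.h>
#include <unistd.h>
#include <sys/socket.h>
#include <sys/epoll.h>
#include <netinet/in.h>

#define MAX_EVENTS 64

int main(void)
{
	/* Listening socket (the accept path of section 2 once clients connect) */
	int lfd = socket(AF_INET, SOCK_STREAM | SOCK_NONBLOCK, 0);
	struct sockaddr_in addr = {
		.sin_family = AF_INET,
		.sin_port   = htons(8080),	/* arbitrary example port */
		.sin_addr   = { .s_addr = htonl(INADDR_ANY) },
	};
	if (bind(lfd, (struct sockaddr *)&addr, sizeof(addr)) < 0 ||
	    listen(lfd, SOMAXCONN) < 0) {
		perror("bind/listen");
		return 1;
	}

	/* Section 3.3: allocate the eventpoll object */
	int epfd = epoll_create1(0);

	/* Sections 3.5/3.6: create the epitem for the listening socket */
	struct epoll_event ev = { .events = EPOLLIN, .data = { .fd = lfd } };
	epoll_ctl(epfd, EPOLL_CTL_ADD, lfd, &ev);

	for (;;) {
		/* Sections 3.7/3.8: sleep on ep->wq until ep_poll_callback fires */
		struct epoll_event events[MAX_EVENTS];
		int n = epoll_wait(epfd, events, MAX_EVENTS, -1);

		for (int i = 0; i < n; i++) {
			int fd = events[i].data.fd;
			if (fd == lfd) {
				/* Section 2: accept and watch the new connection */
				int cfd = accept4(lfd, NULL, NULL, SOCK_NONBLOCK);
				if (cfd >= 0) {
					struct epoll_event cev = { .events = EPOLLIN,
								   .data = { .fd = cfd } };
					epoll_ctl(epfd, EPOLL_CTL_ADD, cfd, &cev);
				}
			} else {
				/* Sections 3.9/3.10 delivered this fd: echo or close */
				char buf[4096];
				ssize_t r = read(fd, buf, sizeof(buf));
				if (r > 0)
					write(fd, buf, (size_t)r);
				else
					close(fd);	/* the kernel drops the epitem on close */
			}
		}
	}
}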

From: https://blog.51cto.com/u_15059356/6171179
