【调试】kprobes(二)使用方法

标签：kprobe pt kretprobe regs kprobes handler 方法调试 struct

前言

上一节介绍了kprobe的基本概念，下面我们将使用几个具体的例子，看下kprobe在实际使用中有哪些应用场景。

kprobe

内核的samples/kprobe目录下有kprobe相关的例子，我们以这些例子为基础，简单修改下。

查看函数的入参

我们所有的例子都是探测do_sys_open()（及其内部调用的do_filp_open()）或者_do_fork()，以下是内核中的源码。

do_sys_open

struct audit_names;
/*
 * Pathname object used by the open path: do_sys_open() obtains one with
 * getname() and releases it with putname(); the kprobe/jprobe handlers in
 * this article read ->name from it.
 */
struct filename {
	const char		*name;	/* pointer to actual string */
	const __user char	*uptr;	/* original userland pointer */
	struct audit_names	*aname;
	int			refcnt;	/* presumably a reference count - TODO confirm */
	const char		iname[];
};

/*
 * Open the file named by the user-space string @filename relative to @dfd.
 *
 * Returns the installed file descriptor on success; otherwise the error
 * value produced by build_open_flags(), getname(), get_unused_fd_flags()
 * or do_filp_open().
 */
long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode)
{
	struct open_flags op;
	int fd = build_open_flags(flags, mode, &op);
	struct filename *tmp;

	/* A non-zero result from build_open_flags() is returned unchanged. */
	if (fd)
		return fd;

	/* Build the struct filename that is later handed to do_filp_open(). */
	tmp = getname(filename);
	if (IS_ERR(tmp))
		return PTR_ERR(tmp);

	fd = get_unused_fd_flags(flags);
	if (fd >= 0) {
		/* do_filp_open() is the function probed by the examples. */
		struct file *f = do_filp_open(dfd, tmp, &op);
		if (IS_ERR(f)) {
			/* Give the reserved descriptor back and report the error. */
			put_unused_fd(fd);
			fd = PTR_ERR(f);
		} else {
			fsnotify_open(f);
			fd_install(fd, f);
		}
	}
	/* tmp is released on both the success and the failure path. */
	putname(tmp);
	return fd;
}

_do_fork

/*
 * Create a new task with copy_process(), wake it up and return its pid
 * number as computed by pid_vnr(). When copy_process() fails, the
 * PTR_ERR() value of its result is returned instead.
 */
long _do_fork(unsigned long clone_flags,
	      unsigned long stack_start,
	      unsigned long stack_size,
	      int __user *parent_tidptr,
	      int __user *child_tidptr,
	      unsigned long tls)
{
	struct task_struct *p;
	int trace = 0;
	long nr;

	/*
	 * Determine whether and which event to report to ptracer.  When
	 * called from kernel_thread or CLONE_UNTRACED is explicitly
	 * requested, no event is reported; otherwise, report if the event
	 * for the type of forking is enabled.
	 */
	if (!(clone_flags & CLONE_UNTRACED)) {
		if (clone_flags & CLONE_VFORK)
			trace = PTRACE_EVENT_VFORK;
		else if ((clone_flags & CSIGNAL) != SIGCHLD)
			trace = PTRACE_EVENT_CLONE;
		else
			trace = PTRACE_EVENT_FORK;

		if (likely(!ptrace_event_enabled(current, trace)))
			trace = 0;
	}

	p = copy_process(clone_flags, stack_start, stack_size,
			 child_tidptr, NULL, trace, tls, NUMA_NO_NODE);
	/*
	 * Do this prior waking up the new thread - the thread pointer
	 * might get invalid after that point, if the thread exits quickly.
	 */
	if (!IS_ERR(p)) {
		struct completion vfork;
		struct pid *pid;

		cpufreq_task_times_alloc(p);

		trace_sched_process_fork(current, p);

		pid = get_task_pid(p, PIDTYPE_PID);
		/* nr is the value returned on success; the offset probe reads it. */
		nr = pid_vnr(pid);

		/* Copy nr out to the user-space address in parent_tidptr. */
		if (clone_flags & CLONE_PARENT_SETTID)
			put_user(nr, parent_tidptr);

		if (clone_flags & CLONE_VFORK) {
			p->vfork_done = &vfork;
			init_completion(&vfork);
			get_task_struct(p);
		}

		wake_up_new_task(p);

		/* forking complete and child started to run, tell ptracer */
		if (unlikely(trace))
			ptrace_event_pid(trace, pid);

		if (clone_flags & CLONE_VFORK) {
			if (!wait_for_vfork_done(p, &vfork))
				ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid);
		}

		/* Drop the reference taken by get_task_pid() above. */
		put_pid(pid);
	} else {
		nr = PTR_ERR(p);
	}
	return nr;
}

实际调试中经常需要调查函数使用的变量的值。要在kprobes的侦测器内显示某个函数的局部变量的值，需要一些技巧，原因是在printk的参数中无法直接指定变量名，因此必须给侦测器函数提供一个pt_regs结构，其中保存了指定地址的命令执行时的寄存器信息。

当然，不同架构下该结构的成员变量不尽相同，但用该结构可以显示变量等更为详细的信息。

ARM64,ARM32,X86的寄存器及其访问方式可以看文末的目录

kprobe_example.c

/*
 * NOTE: This example has been adapted to read arguments on x86 and arm64.
 * Here's a sample kernel module showing the use of kprobes to print the
 * dfd and file name arguments when do_filp_open() is called for "testfile".
 *
 * For more information on theory of operation of kprobes, see
 * Documentation/kprobes.txt
 *
 * You will see the trace data in /var/log/messages and on the console
 * whenever do_filp_open() is invoked to open a file named "testfile".
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/kprobes.h>
#define TRACE_SYMBOL "do_filp_open"

/* For each probe you need to allocate a kprobe structure */
static struct kprobe kp = {
	.symbol_name	= TRACE_SYMBOL,
};
/*
 * kprobe pre_handler: called just before the probed instruction is executed.
 *
 * The probe is planted on the entry of do_filp_open(dfd, pathname, op), so
 * the first two arguments are still in the argument registers:
 *   x86_64 : rdi, rsi   (argument order: rdi rsi rdx rcx r8 r9)
 *   aarch64: x0, x1     (x0-x7 carry the arguments)
 *
 * Logs dfd and the file name when the name is exactly "testfile".
 * Always returns 0.
 */
static int handler_pre(struct kprobe *p, struct pt_regs *regs)
{
	int dfd = -1;
	struct filename *filename = NULL;

	/*
	 * Guard on CONFIG_X86_64 rather than CONFIG_X86: the latter is also
	 * set for 32-bit builds, where di/si do not hold the first arguments.
	 */
#ifdef CONFIG_X86_64
	dfd = (int)regs->di;
	filename = (struct filename *)regs->si;
#endif

#ifdef CONFIG_ARM64
	dfd = (int)regs->regs[0];
	filename = (struct filename *)regs->regs[1];
#endif

	/* filename stays NULL on other architectures, so nothing is printed. */
	if (filename && !strcmp(filename->name, "testfile"))
		printk(KERN_INFO "handler_pre:%s: dfd=%d, name=%s\n",
		       p->symbol_name, dfd, filename->name);

	return 0;
}

/* kprobe post_handler: called after the probed instruction is executed */
static void handler_post(struct kprobe *p, struct pt_regs *regs,
				unsigned long flags)
{
	/* Left empty on purpose: the log line is disabled to keep dmesg short. */
	//printk(KERN_INFO "handler_post\n");
}

/*
 * fault_handler: this is called if an exception is generated for any
 * instruction within the pre- or post-handler, or when Kprobes
 * single-steps the probed instruction.
 */
static int handler_fault(struct kprobe *p, struct pt_regs *regs, int trapnr)
{
	/* Logging is disabled in this variant; re-enable the call below if needed. */
	/*printk(KERN_INFO "fault_handler: p->addr = 0x%p, trap #%d\n",
		p->addr, trapnr);*/
	/* Return 0 because we don't handle the fault. */
	return 0;
}

/*
 * Module entry point: attach the three handlers to kp and register it.
 * Returns 0 on success, or the negative value from register_kprobe().
 */
static int __init kprobe_init(void)
{
	int err;

	kp.fault_handler = handler_fault;
	kp.post_handler = handler_post;
	kp.pre_handler = handler_pre;

	err = register_kprobe(&kp);
	if (err >= 0) {
		printk(KERN_INFO "Planted kprobe at %p\n", kp.addr);
		return 0;
	}

	printk(KERN_INFO "register_kprobe failed, returned %d\n", err);
	return err;
}

/* Module exit point: remove the probe registered by kprobe_init(). */
static void __exit kprobe_exit(void)
{
	unregister_kprobe(&kp);
	/* kp.addr is only printed here, for the log. */
	printk(KERN_INFO "kprobe at %p unregistered\n", kp.addr);
}

module_init(kprobe_init)
module_exit(kprobe_exit)
MODULE_LICENSE("GPL");

我们以内核目录下的例程做一个简单修改，探测do_filp_open函数，当打开testfile文件时，自动打印出文件的路径。

为了减少无效信息的打印，我们将handler_post，handler_fault直接注释掉。

当探测点do_filp_open命中时，Kprobes调用handler_pre。在handler_pre根据struct filename *pathname来获得文件的名字。

在x86_64架构中，函数的参数从左到右分别保存在rdi、rsi、rdx、rcx、r8、r9中，因此查看rdi和rsi就能得到第1个、第2个参数的值。

同理，在ARM64架构中，函数的参数1~参数8分别保存到 X0~X7 寄存器中，剩下的参数从右往左依次入栈。因此，X0和X1分别存放dfd, pathname的值。

makefile

# Out-of-tree build of the kprobe sample modules for an arm64 target.
CROSS_COMPILE := aarch64-linux-gnu-
ARCH := arm64

PWD := $(shell pwd)
obj-m := kprobe_example.o jprobe_example.o kretprobe_example.o

KERNELDIR := /home/zhongyi/code/rk3399_linux_release_v2.5.1_20210301/kernel

.PHONY: all clean

# Hand ARCH and CROSS_COMPILE to kbuild explicitly so it selects the cross
# toolchain itself; $(MAKE) keeps jobserver and flag propagation working.
all:
	$(MAKE) -C $(KERNELDIR) M=$(PWD) ARCH=$(ARCH) CROSS_COMPILE=$(CROSS_COMPILE) modules

clean:
	rm -f *.o
	rm -f *.symvers
	rm -f *.order
	rm -f *.ko
	rm -f *.mod.c

执行make编译后，在开发板上将驱动加载后，手动打开testfile文件。

insmod kprobe_example.ko
vim testfile
rmmod kprobe_example.ko
dmesg

使用dmesg可以看到成功输出文件名和dfd。

[  307.572314] Planted kprobe at ffffff80081fdf84
[  311.997767] handler_pre:do_filp_open: dfd=-100, name=testfile
[  312.034774] handler_pre:do_filp_open: dfd=-100, name=testfile
[  347.969572] kprobe at ffffff80081fdf84 unregistered

显示栈跟踪

使用kprobes的另一个有效的调试方法，就是显示栈跟踪。

我们只需要在handler_pre中调用dump_stack();即可。

/* x86_64 argument register order: rdi rsi rdx rcx r8 r9 */
/* aarch64: x0-x7 carry the arguments */
/* kprobe pre_handler: called just before the probed instruction is executed */
static int handler_pre(struct kprobe *p, struct pt_regs *regs)
{
	/* Print a stack backtrace each time the probe is hit. */
	dump_stack();
	return 0;
}

编译加载

insmod kprobe_example.ko
rmmod kprobe_example.ko
dmesg

成功打印出栈的信息。

[  451.620803] CPU: 4 PID: 1299 Comm: rmmod Tainted: G           O    4.4.194+ #18
[  451.620809] Hardware name: Firefly-RK3399 Board (Linux Opensource) (DT)
[  451.620813] Call trace:
[  451.620820] [<ffffff8008088410>] dump_backtrace+0x0/0x220
[  451.620828] [<ffffff8008088654>] show_stack+0x24/0x30
[  451.620834] [<ffffff80084f842c>] dump_stack+0x94/0xbc
[  451.620842] [<ffffff8000f22048>] handler_pre+0x14/0x24 [kprobe_example]
[  451.620848] [<ffffff8008efd824>] kprobe_breakpoint_handler+0x100/0x14c
[  451.620855] [<ffffff8008084128>] brk_handler+0x54/0x80
[  451.620860] [<ffffff8008080b0c>] do_debug_exception+0x58/0xc0
[  451.620866] Exception stack(0xffffffc0f2ef7c40 to 0xffffffc0f2ef7d70)
[  451.620879] 7c40: ffffffc0ef782000 0000008000000000 ffffffc0f2ef7e20 ffffff80081fdf84
[  451.620886] 7c60: 0000000060000145 ffffff8008efc228 ffffffc0ceff2a50 ffffffc0ee7d2988
[  451.620892] 7c80: ffffffc0f2ef7ca0 ffffff80081c0dc8 ffffffc0f0582e70 00e80000e95f3f53
[  451.620898] 7ca0: ffffffc0f2ef7d70 ffffff8008efe3e8 ffffffc0f2ef7ec0 0000005583d31928
[  451.620905] 7cc0: 0000000000000055 0000000092000047 ffffffc0ceec5100 ffffffc0dccbd500
[  451.620911] 7ce0: 0000000000000024 ffffffc0dccbd580 00000000ffffff9c ffffffc0ef782000
[  451.620917] 7d00: ffffffc0f2ef7e78 0000000000000000 0000000000000000 0000000000000003
[  451.620923] 7d20: ffffffc0dcfc9a80 0000007fd94380e8 0000000000000000 fefefefefefefeff
[  451.620929] 7d40: 0000000000000001 0000007fd9437db8 0000000000000000 0000000000000000
[  451.620934] 7d60: 0000000000000000 000000007fffffde
[  451.620940] [<ffffff8008082668>] el1_dbg+0x18/0x7c
[  451.620947] [<ffffff80081ed9a4>] SyS_openat+0x3c/0x4c
[  451.620953] [<ffffff8008082f70>] el0_svc_naked+0x24/0x28
[  451.630032] kprobe at ffffff80081fdf84 unregistered

任意位置通过变量名获取信息

kprobes拥有更加强大的功能，那就是它能在内核的任意地址插入侦测器。此外，侦测器可以在任意地址的指令执行之前或之后执行，或者前后都执行。

因此，应当观察汇编代码，找到源代码中想要调查的位置对应于编译后的二进制文件中的什么地址，并调查希望显示的变量保存在哪个寄存器、哪个内存地址。

通常，我们希望查看的是函数执行过程中某个变量的值，即打印流程中间的一些信息，而不是仅仅在函数被调用时触发。此时不能只把 kprobe->symbol_name 设置为函数名，还需要指定探测点在函数内的偏移。假设我们期望获取 _do_fork函数变量 nr 的值：

将vmlinux进行反汇编，找出_do_fork的地址。

aarch64-linux-gnu-objdump -s -d vmlinux > vmlinux.asm

_do_fork 反汇编如下所示，地址为ffffff80080ba83c。

ffffff80080ba83c <_do_fork>:
ffffff80080ba83c:       a9b97bfd        stp     x29, x30, [sp, #-112]!
ffffff80080ba840:       910003fd        mov     x29, sp
ffffff80080ba844:       a90153f3        stp     x19, x20, [sp, #16]
ffffff80080ba848:       a9025bf5        stp     x21, x22, [sp, #32]
ffffff80080ba84c:       a90363f7        stp     x23, x24, [sp, #48]
ffffff80080ba850:       aa0003f5        mov     x21, x0
ffffff80080ba854:       aa0103f3        mov     x19, x1
ffffff80080ba858:       aa0203f6        mov     x22, x2
ffffff80080ba85c:       aa0303f7        mov     x23, x3
ffffff80080ba860:       aa0403f8        mov     x24, x4
ffffff80080ba864:       aa1e03e0        mov     x0, x30
ffffff80080ba868:       97ff4e8a        bl      ffffff800808e290 <_mcount>
ffffff80080ba86c:       37b814f5        tbnz    w21, #23, ffffff80080bab08 <_do_fork+0x2cc>
ffffff80080ba870:       37701495        tbnz    w21, #14, ffffff80080bab00 <_do_fork+0x2c4>
ffffff80080ba874:       92401ea0        and     x0, x21, #0xff
ffffff80080ba878:       52800074        mov     w20, #0x3                       // #3
ffffff80080ba87c:       f100441f        cmp     x0, #0x11
ffffff80080ba880:       1a9f1694        csinc   w20, w20, wzr, ne  // ne = any
ffffff80080ba884:       11000e81        add     w1, w20, #0x3
............................
ffffff80080ba91c:       b5000fb6        cbnz    x22, ffffff80080bab10 <_do_fork+0x2d4>
ffffff80080ba920:       52800001        mov     w1, #0x0                        // #0
ffffff80080ba924:       aa1303e0        mov     x0, x19
ffffff80080ba928:       94006a17        bl      ffffff80080d5184 <get_task_pid>
ffffff80080ba92c:       aa0003f6        mov     x22, x0
ffffff80080ba930:       94006a85        bl      ffffff80080d5344 <pid_vnr>
ffffff80080ba934:       93407c18        sxtw    x24, w0
ffffff80080ba938:       36a00195        tbz     w21, #20, ffffff80080ba968 <_do_fork+0x12c>
ffffff80080ba93c:       d5384101        mrs     x1, sp_el0
ffffff80080ba940:       f9400422        ldr     x2, [x1, #8]
ffffff80080ba944:       aa1703e1        mov     x1, x23
ffffff80080ba948:       b1001021        adds    x1, x1, #0x4

nr 变量是函数pid_vnr的返回值（也是子进程的pid），根据ARM调用规范，调用完成pid_vnr()后，寄存器x0存放的就是其函数返回值。

参考：ARM64调用标准 https://blog.51cto.com/u_15333820/3452605

通过反汇编可以知道，pid_vnr在 ffffff80080ba930地址处被调用，因此，侦测器的插入地址就是在ffffff80080ba930之后，并且x0被改变之前。只要符合这两个条件，放在哪里都无所谓。

因此，我们将kprobe的点设置为ffffff80080ba934，然后获取 x0，就能获取变量nr的值。

.offset 是探测点相对于_do_fork的偏移，在注册时指定。我们这里的 offset = ffffff80080ba934 - ffffff80080ba83c = F8。

另外，反汇编能力就是多看汇编以及找到几个关键点（例如常量，跳转语句）就能定位到汇编对应的源码了，这里不再展开了。

/*
 * NOTE: This example is works on x86 and powerpc.
 * Here's a sample kernel module showing the use of kprobes to dump a
 * stack trace and selected registers when _do_fork() is called.
 *
 * For more information on theory of operation of kprobes, see
 * Documentation/kprobes.txt
 *
 * You will see the trace data in /var/log/messages and on the console
 * whenever _do_fork() is invoked to create a new process.
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/kprobes.h>

/* For each probe you need to allocate a kprobe structure */
static struct kprobe kp = {
	.symbol_name	= "_do_fork",
    .offset = 0xF8,
};

/* kprobe pre_handler: called just before the probed instruction is executed */
static int handler_pre(struct kprobe *p, struct pt_regs *regs)
{
#ifdef CONFIG_X86
	printk(KERN_INFO "pre_handler: p->addr = 0x%p, ip = %lx,"
			" flags = 0x%lx,rax = 0x%lx\n",
		p->addr, regs->ip, regs->flags,regs->ax);
#endif

#ifdef CONFIG_ARM64
	pr_info("<%s> pre_handler: p->addr = 0x%p, pc = 0x%lx,"
			" pstate = 0x%lx,x0 = 0x%lx\n",
		p->symbol_name, p->addr, (long)regs->pc, (long)regs->pstate,(long)regs->regs[0]);
#endif

	/* A dump_stack() here will give a stack backtrace */
	return 0;
}

/* kprobe post_handler: called after the probed instruction is executed */
/*
 * Logs the probe address and the status flags (flags on x86, pstate on
 * arm64). The @flags parameter is not used.
 */
static void handler_post(struct kprobe *p, struct pt_regs *regs,
				unsigned long flags)
{
#ifdef CONFIG_X86
	printk(KERN_INFO "post_handler: p->addr = 0x%p, flags = 0x%lx\n",
		p->addr, regs->flags);
#endif

#ifdef CONFIG_ARM64
	pr_info("<%s> post_handler: p->addr = 0x%p, pstate = 0x%lx\n",
		p->symbol_name, p->addr, (long)regs->pstate);
#endif
}

/*
 * fault_handler: this is called if an exception is generated for any
 * instruction within the pre- or post-handler, or when Kprobes
 * single-steps the probed instruction.
 */
static int handler_fault(struct kprobe *p, struct pt_regs *regs, int trapnr)
{
	/*
	 * Log the probe address and trap number. The format ends in "\n";
	 * the previous "%dn" printed a literal 'n' and left the record
	 * unterminated.
	 */
	printk(KERN_INFO "fault_handler: p->addr = 0x%p, trap #%d\n",
		p->addr, trapnr);
	/* Return 0 because we don't handle the fault. */
	return 0;
}

/*
 * Module entry point: attach the three handlers to kp and register it.
 * Returns 0 on success, or the negative value from register_kprobe().
 */
static int __init kprobe_init(void)
{
	int err;

	kp.fault_handler = handler_fault;
	kp.post_handler = handler_post;
	kp.pre_handler = handler_pre;

	err = register_kprobe(&kp);
	if (err >= 0) {
		printk(KERN_INFO "Planted kprobe at %p\n", kp.addr);
		return 0;
	}

	printk(KERN_INFO "register_kprobe failed, returned %d\n", err);
	return err;
}

/* Module exit point: remove the probe registered by kprobe_init(). */
static void __exit kprobe_exit(void)
{
	unregister_kprobe(&kp);
	/* kp.addr is only printed here, for the log. */
	printk(KERN_INFO "kprobe at %p unregistered\n", kp.addr);
}

module_init(kprobe_init)
module_exit(kprobe_exit)
MODULE_LICENSE("GPL");

insmod kprobe_example.ko
rmmod kprobe_example.ko
dmesg

编译加载后，成功打印出rax的值（以下日志来自x86_64平台；在该平台上offset需要根据其自身的反汇编结果重新计算）。

[  245.080636] pre_handler: p->addr = 0x0000000050a6c3dd, ip = ffffffffa5ca0009, flags = 0x246,rax = 0x2
[  245.080640] post_handler: p->addr = 0x0000000050a6c3dd, flags = 0x246
[  245.080936] pre_handler: p->addr = 0x0000000050a6c3dd, ip = ffffffffa5ca0009, flags = 0x246,rax = 0x2
[  245.080938] post_handler: p->addr = 0x0000000050a6c3dd, flags = 0x246
[  245.457340] pre_handler: p->addr = 0x0000000050a6c3dd, ip = ffffffffa5ca0009, flags = 0x246,rax = 0x2
[  245.457345] post_handler: p->addr = 0x0000000050a6c3dd, flags = 0x246
[  245.457643] pre_handler: p->addr = 0x0000000050a6c3dd, ip = ffffffffa5ca0009, flags = 0x246,rax = 0x2
[  245.457645] post_handler: p->addr = 0x0000000050a6c3dd, flags = 0x246
[  245.719208] pre_handler: p->addr = 0x0000000050a6c3dd, ip = ffffffffa5ca0009, flags = 0x246,rax = 0x2
[  245.719213] post_handler: p->addr = 0x0000000050a6c3dd, flags = 0x246
[  245.719505] pre_handler: p->addr = 0x0000000050a6c3dd, ip = ffffffffa5ca0009, flags = 0x246,rax = 0x2
[  245.719507] post_handler: p->addr = 0x0000000050a6c3dd, flags = 0x246
[  245.820761] pre_handler: p->addr = 0x0000000050a6c3dd, ip = ffffffffa5ca0009, flags = 0x246,rax = 0x2
[  245.820765] post_handler: p->addr = 0x0000000050a6c3dd, flags = 0x246
[  245.821061] pre_handler: p->addr = 0x0000000050a6c3dd, ip = ffffffffa5ca0009, flags = 0x246,rax = 0x2
[  245.821063] post_handler: p->addr = 0x0000000050a6c3dd, flags = 0x246
[  246.092572] pre_handler: p->addr = 0x0000000050a6c3dd, ip = ffffffffa5ca0009, flags = 0x246,rax = 0x2
[  246.092577] post_handler: p->addr = 0x0000000050a6c3dd, flags = 0x246
[  246.095863] pre_handler: p->addr = 0x0000000050a6c3dd, ip = ffffffffa5ca0009, flags = 0x246,rax = 0x2
[  246.095867] post_handler: p->addr = 0x0000000050a6c3dd, flags = 0x246
[  246.126196] kprobe at 0000000050a6c3dd unregistered

jprobe

与kprobes相比，jprobes能更容易地获取传给函数的参数。有几点需要注意：

处理程序应该有与被探测函数相同的参数列表和返回类型；
返回之前，必须调用jprobe_return()（处理程序实际上从未返回，因为jprobe_return()将控制权返回给Kprobes）。

查看函数的参数

/*
 * Here's a sample kernel module showing the use of jprobes to dump
 * the arguments of _do_fork().
 *
 * For more information on theory of operation of jprobes, see
 * Documentation/kprobes.txt
 *
 * Build and insert the kernel module as done in the kprobe example.
 * You will see the trace data in /var/log/messages and on the
 * console whenever _do_fork() is invoked to create a new process.
 * (Some messages may be suppressed if syslogd is configured to
 * eliminate duplicate messages.)
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/kprobes.h>

/*
 * Jumper probe for _do_fork.
 * Mirror principle enables access to arguments of the probed routine
 * from the probe handler.
 */

/* Proxy routine having the same arguments as actual do_filp_open() routine */
#define TRACE_SYMBOL "do_filp_open"
/*与do_filp_open 的参数完全相同*/
static struct file * jp_do_filp_open(int dfd, struct filename *pathname,
		const struct open_flags *op)
{
	if (pathname && !(strcmp(pathname->name, "testfile")))
	printk(KERN_INFO "jprobe: dfd = %d, pathname = %s\n", dfd, pathname->name);

	/* Always end with a call to jprobe_return(). */
	jprobe_return();
	return 0;
}

static struct jprobe my_jprobe = {
	.entry			= jp_do_filp_open,
	.kp = {
		.symbol_name	= TRACE_SYMBOL,
	},
};

/*
 * Module entry point: register my_jprobe.
 * Returns 0 on success, or the negative error from register_jprobe().
 */
static int __init jprobe_init(void)
{
	int ret;

	ret = register_jprobe(&my_jprobe);
	if (ret < 0) {
		printk(KERN_INFO "register_jprobe failed, returned %d\n", ret);
		/* Propagate the real error code instead of a hard-coded -1. */
		return ret;
	}
	printk(KERN_INFO "Planted jprobe at %p, handler addr %p\n",
	       my_jprobe.kp.addr, my_jprobe.entry);
	return 0;
}

/* Module exit point: remove the jprobe registered by jprobe_init(). */
static void __exit jprobe_exit(void)
{
	unregister_jprobe(&my_jprobe);
	/* The address is only printed here, for the log. */
	printk(KERN_INFO "jprobe at %p unregistered\n", my_jprobe.kp.addr);
}

module_init(jprobe_init)
module_exit(jprobe_exit)
MODULE_LICENSE("GPL");

使用kprobes时，必须通过寄存器或栈才能计算出参数的值。此外，计算方法还依赖于架构。

如果使用jprobes，那么无须了解架构的详细知识，也能简单地查看参数的值。

编译加载驱动程序

insmod jprobe_example.ko
vim testfile
rmmod jprobe_example.ko
dmesg

成功打印出函数的参数

[  612.670453] jprobe at ffffff80081fdf84 unregistered
[  867.293765] Planted jprobe at ffffff80081fdf84, handler addr ffffff8000f1a000
[  871.107502] jprobe: dfd = -100, pathname = testfile
[  871.147747] jprobe: dfd = -100, pathname = testfile
[  875.723761] jprobe at ffffff80081fdf84 unregistered
[  907.706066] Planted jprobe at ffffff80081fdf84, handler addr ffffff8000f22000
[  911.661891] jprobe: dfd = -100, pathname = testfile
[  911.694903] jprobe: dfd = -100, pathname = testfile
[  919.272187] jprobe at ffffff80081fdf84 unregistered
[ 2296.830613] Planted jprobe at ffffff80081fdf84, handler addr ffffff8000f2a000
[ 2302.164861] jprobe: dfd = -100, pathname = testfile
[ 2302.200634] jprobe: dfd = -100, pathname = testfile
[ 2307.407014] jprobe at ffffff80081fdf84 unregistered

kretprobe

kretprobe 也是基于kprobe的，相比于kprobe和jprobe，实现相对复杂。下面我们以内核目录下的例程，简单分析下。

kretprobe_example.c

/*
 * kretprobe_example.c
 *
 * Here's a sample kernel module showing the use of return probes to
 * report the return value and total time taken for probed function
 * to run.
 *
 * usage: insmod kretprobe_example.ko func=<func_name>
 *
 * If no func_name is specified, _do_fork is instrumented
 *
 * For more information on theory of operation of kretprobes, see
 * Documentation/kprobes.txt
 *
 * Build and insert the kernel module as done in the kprobe example.
 * You will see the trace data in /var/log/messages and on the console
 * whenever the probed function returns. (Some messages may be suppressed
 * if syslogd is configured to eliminate duplicate messages.)
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/kprobes.h>
#include <linux/ktime.h>
#include <linux/limits.h>
#include <linux/sched.h>

static char func_name[NAME_MAX] = "do_sys_open";
module_param_string(func, func_name, NAME_MAX, S_IRUGO);
MODULE_PARM_DESC(func, "Function to kretprobe; this module will report the"
			" function's execution time");

/* per-instance private data */
struct my_data {
	ktime_t entry_stamp;
};

/*
 * Here we use the entry_handler to timestamp function entry.
 *
 * Returns 1 for tasks without an mm, so no return handler runs for that
 * call; returns 0 after storing the entry time in the instance's private
 * data.
 */
static int entry_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
{
	struct my_data *data;

	if (!current->mm)
		return 1;	/* Skip kernel threads */

	/* ri->data is the per-instance area sized by my_kretprobe.data_size. */
	data = (struct my_data *)ri->data;
	data->entry_stamp = ktime_get();
	return 0;
}

/*
 * Return-probe handler: Log the return value and duration. Duration may turn
 * out to be zero consistently, depending upon the granularity of time
 * accounting on the platform.
 *
 * The return value is kept as a long: regs_return_value() yields an
 * unsigned long, and storing it in an int would truncate 64-bit results.
 * Always returns 0.
 */
static int ret_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
{
	long retval = (long)regs_return_value(regs);
	struct my_data *data = (struct my_data *)ri->data;
	s64 delta;
	ktime_t now;

	/* Elapsed time since entry_handler() stamped this instance. */
	now = ktime_get();
	delta = ktime_to_ns(ktime_sub(now, data->entry_stamp));
	printk(KERN_INFO "%s returned %ld and took %lld ns to execute\n",
			func_name, retval, (long long)delta);
	return 0;
}

static struct kretprobe my_kretprobe = {
	.handler		= ret_handler,
	.entry_handler		= entry_handler,
	.data_size		= sizeof(struct my_data),
	/* Probe up to 20 instances concurrently. */
	.maxactive		= 20,
};

/*
 * Module entry point: point the kretprobe at the symbol named by the "func"
 * module parameter and register it.
 * Returns 0 on success, or the negative error from register_kretprobe().
 */
static int __init kretprobe_init(void)
{
	int ret;

	my_kretprobe.kp.symbol_name = func_name;
	ret = register_kretprobe(&my_kretprobe);
	if (ret < 0) {
		printk(KERN_INFO "register_kretprobe failed, returned %d\n",
				ret);
		/* Propagate the real error code instead of a hard-coded -1. */
		return ret;
	}
	printk(KERN_INFO "Planted return probe at %s: %p\n",
			my_kretprobe.kp.symbol_name, my_kretprobe.kp.addr);
	return 0;
}

/*
 * Module exit point: unregister the kretprobe and report how many
 * invocations were missed.
 */
static void __exit kretprobe_exit(void)
{
	unregister_kretprobe(&my_kretprobe);
	printk(KERN_INFO "kretprobe at %p unregistered\n",
			my_kretprobe.kp.addr);

	/* nmissed > 0 suggests that maxactive was set too low. */
	printk(KERN_INFO "Missed probing %d instances of %s\n",
		my_kretprobe.nmissed, my_kretprobe.kp.symbol_name);
}

module_init(kretprobe_init)
module_exit(kretprobe_exit)
MODULE_LICENSE("GPL");

struct kretprobe

/*
 * Function-return probe -
 * Note:
 * User needs to provide a handler function, and initialize maxactive.
 * maxactive - The maximum number of instances of the probed function that
 * can be active concurrently.
 * nmissed - tracks the number of times the probed function's return was
 * ignored, due to maxactive being too low.
 *
 */
struct kretprobe {
	struct kprobe kp;			/* entry probe; register_kretprobe() registers it */
	kretprobe_handler_t handler;		/* user callback, run when the probed function returns */
	kretprobe_handler_t entry_handler;	/* optional user callback, run at function entry */
	int maxactive;				/* number of instances pre-allocated at registration */
	int nmissed;				/* count of probe hits that were skipped */
	size_t data_size;			/* bytes of private data appended to each instance */
	struct hlist_head free_instances;	/* list of currently unused instances */
	raw_spinlock_t lock;			/* taken around free_instances updates */
};

typedef int (*kretprobe_handler_t) (struct kretprobe_instance *,
				    struct pt_regs *);

其中我们可以看到 struct kretprobe 结构体中有struct kprobe成员（kretprobe是基于 kprobe实现的）。
handler：用户自定义回调函数，被探测函数返回后被调用，一般在这个函数中获取被探测函数的返回值。
entry_handler：用户自定义回调函数，这是Kretprobes 提供了一个可选的用户指定的处理程序，它在函数入口上运行。每当 kretprobe 放置在函数入口处的 kprobe 被命中时，都会调用用户定义的 entry_handler，如果有的话。如果 entry_handler 返回 0（成功），则保证在函数返回时调用相应的返回处理程序。如果 entry_handler 返回非零错误，则 Kprobes 将返回地址保持原样，并且 kretprobe 对该特定函数实例没有进一步的影响。
maxactive：被探测函数可以同时活动的最大实例数。来指定可以同时探测多少个指定函数的实例。
register_kretprobe() 预分配指定数量的 kretprobe_instance 对象。
nmissed：跟踪被探测函数的返回被忽略的次数（maxactive设置的过低）。
data_size：表示kretprobe私有数据的大小，在注册kretprobe时会根据该大小预留空间。
free_instances ：表示空闲的kretprobe运行实例链表，它链接了本kretprobe的所有空闲实例（每个实例由struct kretprobe_instance结构体表示）。

struct kretprobe_instance

/* One in-flight invocation of a function covered by a kretprobe. */
struct kretprobe_instance {
	struct hlist_node hlist;	/* links into free_instances or kretprobe_inst_table */
	struct kretprobe *rp;		/* owning kretprobe */
	kprobe_opcode_t *ret_addr;	/* original return address saved by arch_prepare_kretprobe() */
	struct task_struct *task;	/* task that hit the probe */
	char data[0];			/* rp->data_size bytes of user private data */
};

这个结构体表示kretprobe的运行实例，前文说过被探测函数在跟踪期间可能存在并发执行的现象，因此kretprobe使用一个kretprobe_instance来跟踪一个执行流，支持的上限为maxactive。在没有触发探测时，所有的kretprobe_instance实例都保存在free_instances表中，每当有执行流触发一次kretprobe探测，都会从该表中取出一个空闲的kretprobe_instance实例用来跟踪。
kretprobe_instance结构体中的rp指针指向所属的kretprobe；
ret_addr用于保存原始被探测函数的返回地址（后文会看到被探测函数返回地址会被暂时替换）；
task用于绑定其跟踪的进程；
data保存用户使用的kretprobe私有数据，它会在整个kretprobe探测运行期间在entry_handler和handler回调函数之间进行传递（一般用于实现统计被探测函数的执行耗时）。

register_kretprobe

kretprobe探测点的blackpoint，用来表示不支持kretprobe探测的函数的信息。name表示该函数名，addr表示该函数的地址。

/* Describes one function that must not be return-probed. */
struct kretprobe_blackpoint {
	const char *name;	/* function name */
	void *addr;		/* function address, compared in register_kretprobe() */
};

blackpoint与架构相关，x86架构不支持的kretprobe探测点如下：

// arch/x86/kernel/kprobes/core.c
// Functions that cannot be return-probed; the table ends with a NULL name.
struct kretprobe_blackpoint kretprobe_blacklist[] = {
	{"__switch_to", }, /* This function switches only current task, but
			      doesn't switch kernel stack.*/
	{NULL, NULL}	/* Terminator */
};

const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist);

函数的开头首先处理 kretprobe_blacklist，如果指定的被探测函数在这个blacklist中就直接返回-EINVAL，表示不支持探测。在x86架构中，blacklist里只有 __switch_to 这个函数，即该函数不能被kretprobe探测。

/*
 * Register a return probe.
 *
 * Rejects blacklisted targets, installs pre_handler_kretprobe as the entry
 * handler, pre-allocates rp->maxactive instances and finally registers the
 * embedded kprobe.
 *
 * Returns 0 on success, -EINVAL for a blacklisted address, -ENOMEM when an
 * instance cannot be allocated, or the error from kprobe_addr() /
 * register_kprobe().
 */
int register_kretprobe(struct kretprobe *rp)
{
	int ret = 0;
	struct kretprobe_instance *inst;
	int i;
	void *addr;

	if (kretprobe_blacklist_size) {
		addr = kprobe_addr(&rp->kp);
		if (IS_ERR(addr))
			return PTR_ERR(addr);
		/* Return -EINVAL if the target is listed in kretprobe_blacklist. */
		for (i = 0; kretprobe_blacklist[i].name != NULL; i++) {
			if (kretprobe_blacklist[i].addr == addr)
				return -EINVAL;
		}
	}

	/*
	 * The kernel installs its own pre_handler_kretprobe callback here.
	 * Unlike a plain kprobe, a kretprobe does not let the user supply
	 * pre_handler, post_handler and the other kprobe callbacks.
	 */
	rp->kp.pre_handler = pre_handler_kretprobe;
	rp->kp.post_handler = NULL;
	rp->kp.fault_handler = NULL;
	rp->kp.break_handler = NULL;

	/* Pre-allocate memory for max kretprobe instances */
	if (rp->maxactive <= 0) {
#ifdef CONFIG_PREEMPT
		rp->maxactive = max_t(unsigned int, 10, 2*num_possible_cpus());
#else
		rp->maxactive = num_possible_cpus();
#endif
	}
	raw_spin_lock_init(&rp->lock);
	INIT_HLIST_HEAD(&rp->free_instances);
	/* Allocate maxactive instances, each followed by data_size private bytes. */
	for (i = 0; i < rp->maxactive; i++) {
		inst = kmalloc(sizeof(struct kretprobe_instance) +
			       rp->data_size, GFP_KERNEL);
		if (inst == NULL) {
			free_rp_inst(rp);
			return -ENOMEM;
		}
		INIT_HLIST_NODE(&inst->hlist);
		hlist_add_head(&inst->hlist, &rp->free_instances);
	}

	rp->nmissed = 0;
	/* Establish function entry probe point */
	/* Register the embedded kprobe at the function entry. */
	ret = register_kprobe(&rp->kp);
	if (ret != 0)
		free_rp_inst(rp);
	return ret;
}
EXPORT_SYMBOL_GPL(register_kretprobe);

最后调用 register_kprobe(&rp->kp)，注册kprobe点，可以看出kretprobe也是基于kprobe机制实现的，kretprobe也是一种特殊形式的kprobe。

kretprobe注册完成后就默认启动探测。

pre_handler_kretprobe

pre_handler_kretprobe这个函数是内核自己定义的，内核已经指定该回调函数，不支持用户自定义。这个 kprobe pre_handler 在每个 kretprobe 中注册。当探针命中时，它将设置返回探针。

#ifdef CONFIG_KRETPROBES
/*
 * This kprobe pre_handler is registered with every kretprobe. When probe
 * hits it will set up the return probe.
 *
 * It takes a free kretprobe_instance, binds it to the current task, runs
 * the optional user entry_handler, lets the architecture code swap the
 * return address and then files the instance in kretprobe_inst_table.
 * When no free instance is left, or in NMI context, only nmissed is
 * incremented. Always returns 0.
 */
static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
{
	struct kretprobe *rp = container_of(p, struct kretprobe, kp);
	unsigned long hash, flags = 0;
	struct kretprobe_instance *ri;

	/*
	 * To avoid deadlocks, prohibit return probing in NMI contexts,
	 * just skip the probe and increase the (inexact) 'nmissed'
	 * statistical counter, so that the user is informed that
	 * something happened:
	 */
	if (unlikely(in_nmi())) {
		rp->nmissed++;
		return 0;
	}

	/* TODO: consider to only swap the RA after the last pre_handler fired */
	hash = hash_ptr(current, KPROBE_HASH_BITS);
	raw_spin_lock_irqsave(&rp->lock, flags);
	if (!hlist_empty(&rp->free_instances)) {
		ri = hlist_entry(rp->free_instances.first,
				struct kretprobe_instance, hlist);
		hlist_del(&ri->hlist);
		raw_spin_unlock_irqrestore(&rp->lock, flags);

		ri->rp = rp;
		ri->task = current;

		/*
		 * (1) entry_handler: a non-zero result means "do not
		 * return-probe this call"; the instance goes back on the
		 * free list.
		 */
		if (rp->entry_handler && rp->entry_handler(ri, regs)) {
			raw_spin_lock_irqsave(&rp->lock, flags);
			hlist_add_head(&ri->hlist, &rp->free_instances);
			raw_spin_unlock_irqrestore(&rp->lock, flags);
			return 0;
		}

		/* (2) arch_prepare_kretprobe: architecture-specific setup. */
		arch_prepare_kretprobe(ri, regs);

		/* XXX(hch): why is there no hlist_move_head? */
		INIT_HLIST_NODE(&ri->hlist);
		kretprobe_table_lock(hash, &flags);
		hlist_add_head(&ri->hlist, &kretprobe_inst_table[hash]);
		kretprobe_table_unlock(hash, &flags);
	} else {
		/* No free instance: this invocation is not return-probed. */
		rp->nmissed++;
		raw_spin_unlock_irqrestore(&rp->lock, flags);
	}
	return 0;
}
NOKPROBE_SYMBOL(pre_handler_kretprobe);

entry_handler

struct kretprobe *rp
rp->entry_handler && rp->entry_handler(ri, regs)

entry_handler这个回调函数就是用户自己定义的回调函数（可选的用户指定的处理程序），前面我们已经介绍过了，在这里不再介绍。

/* Here we use the entry_handler to timestamp function entry */
static int entry_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
{
	struct my_data *data;

	/* Kernel threads have task->mm == NULL. */
	if (!current->mm)
		return 1;	/* Skip kernel threads */

	/* Store the entry time in this instance's private data area. */
	data = (struct my_data *)ri->data;
	data->entry_stamp = ktime_get();
	return 0;
}

arch_prepare_kretprobe

arch_prepare_kretprobe(ri, regs)该函数架构相关，struct kretprobe_instance结构体的 ret_addr 成员用于保存并替换regs中的返回地址。返回地址被替换为kretprobe_trampoline。

x86架构

// arch/x86/kernel/kprobes/core.c

#define stack_addr(regs) ((unsigned long *)kernel_stack_pointer(regs))

//	x86_64
//	arch/x86/include/asm/ptrace.h
/* Return the stack pointer value saved in @regs. */
static inline unsigned long kernel_stack_pointer(struct pt_regs *regs)
{
	return regs->sp;
}
// arch/x86/kernel/kprobes/core.c
/*
 * x86: save the word at the saved stack pointer (the probed function's
 * return address) in ri->ret_addr, then overwrite it with the address of
 * kretprobe_trampoline.
 */
void arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs)
{
	unsigned long *sara = stack_addr(regs);

	ri->ret_addr = (kprobe_opcode_t *) *sara;

	/* Replace the return addr with trampoline addr */
	*sara = (unsigned long) &kretprobe_trampoline;
}
NOKPROBE_SYMBOL(arch_prepare_kretprobe);

//struct kretprobe_instance *ri；
//ri->ret_addr；

/* Abridged view showing only the member discussed here. */
struct kretprobe_instance {
	kprobe_opcode_t *ret_addr;  /* holds the probed function's original return address */
};

ARM64架构

// arch/arm64/kernel/probes/kprobes.c

/*
 * arm64: save the return address held in x30 in ri->ret_addr, then make
 * x30 point at kretprobe_trampoline.
 */
void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
				      struct pt_regs *regs)
{
	ri->ret_addr = (kprobe_opcode_t *)regs->regs[30];

	/* replace return addr (x30) with trampoline */
	regs->regs[30] = (long)&kretprobe_trampoline;
}

ARM64架构中regs->regs[30]是LR（procedure link register）寄存器（X30 ：LR）。

小结

kretprobe是基于kprobe实现的，有一个固定的pre_handler回调函数，在内核中实现，无需用户编写。而在kprobe中pre_handler函数是提供给用户的回调函数。

rp->kp.pre_handler = pre_handler_kretprobe;  //内核中已经实现
rp->kp.post_handler = NULL;
rp->kp.fault_handler = NULL;
rp->kp.break_handler = NULL;

kretprobe提供给用户的两个回调函数：

kretprobe_handler_t handler;
kretprobe_handler_t entry_handler; // (可选)

pre_handler回调函数会为kretprobe探测函数执行的返回值做准备工作，其中最主要的就是替换掉正常流程的返回地址，让被探测函数在执行之后能够跳转到kretprobe设计的函数 kretprobe_trampoline中去。

kretprobe_trampoline

pre_handler_kretprobe函数返回后，kprobe流程接着执行singlestep流程并返回到正常的执行流程，被探测函数（do_fork）继续执行，直到它执行完毕并返回。

由于返回地址被替换为kretprobe_trampoline，所以跳转到kretprobe_trampoline执行，该函数架构相关且由嵌入汇编实现。

该函数会获取被探测函数的寄存器信息并调用用户定义的回调函数输出其中的返回值，最后函数返回正常的执行流程。

static int ret_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
{
	unsigned long retval = regs_return_value(regs);
	......
}


static struct kretprobe my_kretprobe = {
	.handler		= ret_handler,
};

x86架构

(1)

kretprobe_trampoline
	-->trampoline_handler
kretprobe_trampoline

(2)
kretprobe_trampoline

// arch/x86/kernel/kprobes/core.c

/*
 * When a retprobed function returns, this code saves registers and
 * calls trampoline_handler(), which calls the kretprobe's handler.
 */
asm(
	".global kretprobe_trampoline\n"
	".type kretprobe_trampoline, @function\n"
	"kretprobe_trampoline:\n"
#ifdef CONFIG_X86_64
	/* We don't bother saving the ss register */
	"	pushq %rsp\n"
	"	pushfq\n"
	SAVE_REGS_STRING
	/* Pass the address of the saved register frame to trampoline_handler(). */
	"	movq %rsp, %rdi\n"
	"	call trampoline_handler\n"
	/* Replace saved sp with true return address. */
	"	movq %rax, 152(%rsp)\n"
	RESTORE_REGS_STRING
	"	popfq\n"
#else
	"	pushf\n"
	SAVE_REGS_STRING
	/* NOTE(review): eax presumably carries the regs argument on 32-bit - confirm. */
	"	movl %esp, %eax\n"
	"	call trampoline_handler\n"
	/* Move flags to cs */
	"	movl 56(%esp), %edx\n"
	"	movl %edx, 52(%esp)\n"
	/* Replace saved flags with true return address. */
	"	movl %eax, 56(%esp)\n"
	RESTORE_REGS_STRING
	"	popf\n"
#endif
	/* Return through the address that was just written onto the stack. */
	"	ret\n"
	".size kretprobe_trampoline, .-kretprobe_trampoline\n"
);
NOKPROBE_SYMBOL(kretprobe_trampoline);
STACK_FRAME_NON_STANDARD(kretprobe_trampoline);

(3)
trampoline_handler

// arch/x86/kernel/kprobes/core.c

/*
 * Called from kretprobe_trampoline
 *
 * Finds the kretprobe instances that belong to the current task for the
 * call that is returning, runs their user handlers, recycles the instances
 * and reports where execution has to continue.
 *
 * @regs: register frame saved by kretprobe_trampoline; it is patched below
 *        and then passed on to every handler.
 *
 * Returns the original (real) return address of the probed function.
 */
__visible __used void *trampoline_handler(struct pt_regs *regs)
{
	struct kretprobe_instance *ri = NULL;
	struct hlist_head *head, empty_rp;
	struct hlist_node *tmp;
	unsigned long flags, orig_ret_address = 0;
	unsigned long trampoline_address = (unsigned long)&kretprobe_trampoline;
	kprobe_opcode_t *correct_ret_addr = NULL;

	/* empty_rp collects instances that are freed after the lock is dropped */
	INIT_HLIST_HEAD(&empty_rp);
	kretprobe_hash_lock(current, &head, &flags);
	/* fixup registers */
#ifdef CONFIG_X86_64
	regs->cs = __KERNEL_CS;
#else
	regs->cs = __KERNEL_CS | get_kernel_rpl();
	regs->gs = 0;
#endif
	/* Handlers will see the trampoline address as the instruction pointer */
	regs->ip = trampoline_address;
	regs->orig_ax = ~0UL;

	/*
	 * It is possible to have multiple instances associated with a given
	 * task either because multiple functions in the call path have
	 * return probes installed on them, and/or more than one
	 * return probe was registered for a target function.
	 *
	 * We can handle this because:
	 *     - instances are always pushed into the head of the list
	 *     - when multiple return probes are registered for the same
	 *       function, the (chronologically) first instance's ret_addr
	 *       will be the real return address, and all the rest will
	 *       point to kretprobe_trampoline.
	 *
	 * First pass: only locate the real return address, call no handler.
	 */
	hlist_for_each_entry_safe(ri, tmp, head, hlist) {
		if (ri->task != current)
			/* another task is sharing our hash bucket */
			continue;

		orig_ret_address = (unsigned long)ri->ret_addr;

		if (orig_ret_address != trampoline_address)
			/*
			 * This is the real return address. Any other
			 * instances associated with this task are for
			 * other calls deeper on the call stack
			 */
			break;
	}

	/* NOTE(review): presumably fails hard if no real address was found */
	kretprobe_assert(ri, orig_ret_address, trampoline_address);

	/*
	 * Second pass: walk the same instances again, this time running the
	 * handlers. Each instance's ret_addr is set to the real return
	 * address before its handler is called.
	 */
	correct_ret_addr = ri->ret_addr;
	hlist_for_each_entry_safe(ri, tmp, head, hlist) {
		if (ri->task != current)
			/* another task is sharing our hash bucket */
			continue;

		/* Remember the stored address; ri->ret_addr is overwritten below */
		orig_ret_address = (unsigned long)ri->ret_addr;
		if (ri->rp && ri->rp->handler) {
			/* Mark this probe as the active one while its handler runs */
			__this_cpu_write(current_kprobe, &ri->rp->kp);
			get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE;
			ri->ret_addr = correct_ret_addr;
			ri->rp->handler(ri, regs);
			__this_cpu_write(current_kprobe, NULL);
		}

		/* Take the instance off the hash list (reuse it or queue it for free) */
		recycle_rp_inst(ri, &empty_rp);

		if (orig_ret_address != trampoline_address)
			/*
			 * This is the real return address. Any other
			 * instances associated with this task are for
			 * other calls deeper on the call stack
			 */
			break;
	}

	kretprobe_hash_unlock(current, &flags);

	/* Free the instances that recycle_rp_inst() placed on empty_rp */
	hlist_for_each_entry_safe(ri, tmp, &empty_rp, hlist) {
		hlist_del(&ri->hlist);
		kfree(ri);
	}
	return (void *)orig_ret_address;
}
NOKPROBE_SYMBOL(trampoline_handler);

(4)
ri->rp->handler(ri, regs)表示执行用户自定义的回调函数handler（该回调运行在内核态，用来获取_do_fork函数的返回值）。handler回调函数执行完毕以后，调用recycle_rp_inst函数将当前的kretprobe_instance实例从kretprobe_inst_table哈希表中摘除，重新链入free_instances中，以备后面kretprobe触发时使用；另外，如果kretprobe已经被注销，则将该实例添加到销毁表中待销毁。

ri->rp->handler(ri, regs);
	->recycle_rp_inst(ri, &empty_rp);

/*
 * Take a kretprobe instance off the list it is currently on and decide
 * where it goes next.
 *
 * @ri:   instance to recycle
 * @head: list that receives the instance when it has no owning kretprobe
 *
 * If ri->rp is set, the instance is put back on rp->free_instances under
 * rp->lock so it can be reused. Otherwise (the probe is being unregistered)
 * it is added to @head; in the callers shown in this article, the entries
 * on that list are kfree()d afterwards.
 */
void recycle_rp_inst(struct kretprobe_instance *ri,
		     struct hlist_head *head)
{
	struct kretprobe *rp = ri->rp;

	/* remove rp inst off the rprobe_inst_table */
	hlist_del(&ri->hlist);
	/* Re-initialise the node before it is linked into another list */
	INIT_HLIST_NODE(&ri->hlist);
	if (likely(rp)) {
		raw_spin_lock(&rp->lock);
		hlist_add_head(&ri->hlist, &rp->free_instances);
		raw_spin_unlock(&rp->lock);
	} else
		/* Unregistering */
		hlist_add_head(&ri->hlist, head);
}
NOKPROBE_SYMBOL(recycle_rp_inst);

(5)
trampoline_handler函数执行完后，返回被探测函数的原始返回地址，执行流程再次回到kretprobe_trampoline函数中，将保存的 sp 替换为真实的返回地址。
从rax寄存器中取出原始的返回地址，然后恢复原始函数调用栈空间，最后跳转到原始返回地址执行，至此函数调用的流程就回归正常流程了，整个kretprobe探测结束。

/* Replace saved sp with true return address. */
	"	movq %rax, 152(%rsp)\n"
	RESTORE_REGS_STRING
	"	popfq\n"

ARM64架构

(1)

kretprobe_trampoline 
	-->trampoline_probe_handler
kretprobe_trampoline

(2)
kretprobe_trampoline

// arch/arm64/kernel/probes/kprobes_trampoline.S

/*
 * Trampoline that a probed function returns into. It saves the registers,
 * lets trampoline_probe_handler() run the return handlers and then returns
 * to the address that handler reports.
 */
ENTRY(kretprobe_trampoline)
	/* Reserve S_FRAME_SIZE bytes of stack for the saved registers */
	sub sp, sp, #S_FRAME_SIZE

	save_all_base_regs

	/* x0 = sp: argument for trampoline_probe_handler() */
	mov x0, sp
	bl trampoline_probe_handler
	/*
	 * Replace trampoline address in lr with actual orig_ret_addr return
	 * address.
	 */
	mov lr, x0

	restore_all_base_regs

	/* Release the stack space reserved above, then return through lr */
	add sp, sp, #S_FRAME_SIZE
	ret

ENDPROC(kretprobe_trampoline)

(3)
trampoline_probe_handler

// arch/arm64/kernel/probes/kprobes.c

/*
 * Called from kretprobe_trampoline on ARM64.
 *
 * Finds the kretprobe instances that belong to the current task for the
 * call that is returning, runs their user handlers, recycles the instances
 * and reports where execution has to continue.
 *
 * @regs: register frame saved by kretprobe_trampoline, passed on to every
 *        handler.
 *
 * Returns the original (real) return address of the probed function.
 */
void __kprobes __used *trampoline_probe_handler(struct pt_regs *regs)
{
	struct kretprobe_instance *ri = NULL;
	struct hlist_head *head, empty_rp;
	struct hlist_node *tmp;
	unsigned long flags, orig_ret_address = 0;
	unsigned long trampoline_address =
		(unsigned long)&kretprobe_trampoline;
	kprobe_opcode_t *correct_ret_addr = NULL;

	/* empty_rp collects instances that are freed after the lock is dropped */
	INIT_HLIST_HEAD(&empty_rp);
	kretprobe_hash_lock(current, &head, &flags);

	/*
	 * It is possible to have multiple instances associated with a given
	 * task either because multiple functions in the call path have
	 * return probes installed on them, and/or more than one
	 * return probe was registered for a target function.
	 *
	 * We can handle this because:
	 *     - instances are always pushed into the head of the list
	 *     - when multiple return probes are registered for the same
	 *       function, the (chronologically) first instance's ret_addr
	 *       will be the real return address, and all the rest will
	 *       point to kretprobe_trampoline.
	 *
	 * First pass: only locate the real return address, call no handler.
	 */
	hlist_for_each_entry_safe(ri, tmp, head, hlist) {
		if (ri->task != current)
			/* another task is sharing our hash bucket */
			continue;

		orig_ret_address = (unsigned long)ri->ret_addr;

		if (orig_ret_address != trampoline_address)
			/*
			 * This is the real return address. Any other
			 * instances associated with this task are for
			 * other calls deeper on the call stack
			 */
			break;
	}

	/* NOTE(review): presumably fails hard if no real address was found */
	kretprobe_assert(ri, orig_ret_address, trampoline_address);

	/*
	 * Second pass: walk the same instances again, this time running the
	 * handlers. Each instance's ret_addr is set to the real return
	 * address before its handler is called.
	 */
	correct_ret_addr = ri->ret_addr;
	hlist_for_each_entry_safe(ri, tmp, head, hlist) {
		if (ri->task != current)
			/* another task is sharing our hash bucket */
			continue;

		/* Remember the stored address; ri->ret_addr is overwritten below */
		orig_ret_address = (unsigned long)ri->ret_addr;
		if (ri->rp && ri->rp->handler) {
			/* Mark this probe as the active one while its handler runs */
			__this_cpu_write(current_kprobe, &ri->rp->kp);
			get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE;
			ri->ret_addr = correct_ret_addr;
			ri->rp->handler(ri, regs);
			__this_cpu_write(current_kprobe, NULL);
		}

		/* Take the instance off the hash list (reuse it or queue it for free) */
		recycle_rp_inst(ri, &empty_rp);

		if (orig_ret_address != trampoline_address)
			/*
			 * This is the real return address. Any other
			 * instances associated with this task are for
			 * other calls deeper on the call stack
			 */
			break;
	}

	kretprobe_hash_unlock(current, &flags);

	/* Free the instances that recycle_rp_inst() placed on empty_rp */
	hlist_for_each_entry_safe(ri, tmp, &empty_rp, hlist) {
		hlist_del(&ri->hlist);
		kfree(ri);
	}
	return (void *)orig_ret_address;
}

(4)
将 lr寄存器中的trampoline地址替换为实际的 orig_ret_addr 返回地址。
从x0寄存器中取出原始的返回地址，然后恢复原始函数调用栈空间，最后跳转到原始返回地址执行，至此函数调用的流程就回归正常流程了，整个kretprobe探测结束。

/*
	 * Replace trampoline address in lr with actual orig_ret_addr return
	 * address.
	 */
	mov lr, x0

	restore_all_base_regs

	add sp, sp, #S_FRAME_SIZE
	ret

编译运行

insmod kprobe_example.ko
vim testfile
rmmod kprobe_example.ko
dmesg

成功打印出函数的执行时间

[ 1056.875938] do_sys_open returned -2 and took 10500 ns to execute
[ 1057.567400] do_sys_open returned 34 and took 59208 ns to execute
[ 1058.382932] do_sys_open returned 3 and took 31469101 ns to execute
[ 1058.567046] do_sys_open returned 34 and took 61250 ns to execute
[ 1058.975879] do_sys_open returned 3 and took 224084 ns to execute
[ 1058.975935] do_sys_open returned 3 and took 16917 ns to execute
[ 1058.976041] do_sys_open returned 3 and took 13417 ns to execute
[ 1058.976148] do_sys_open returned 3 and took 15167 ns to execute
[ 1058.976254] do_sys_open returned 3 and took 15750 ns to execute
[ 1058.976356] do_sys_open returned 3 and took 16042 ns to execute
[ 1058.978036] do_sys_open returned -2 and took 23041 ns to execute
[ 1058.978074] do_sys_open returned 3 and took 24500 ns to execute
[ 1058.978175] do_sys_open returned -2 and took 9334 ns to execute
[ 1058.978211] do_sys_open returned 3 and took 23333 ns to execute
[ 1058.978246] do_sys_open returned 3 and took 13417 ns to execute
[ 1058.978286] do_sys_open returned 3 and took 14583 ns to execute
[ 1058.989701] kretprobe at ffffff80081ed6c8 unregistered
[ 1058.989709] Missed probing 0 instances of do_sys_open

Kprobe-based Event Tracing

这些事件类似于基于tracepoint的事件。与Tracepoint不同，它是基于kprobes（kprobe和kretprobe）的。所以它可以探测任何kprobes可以探测的地方。与基于Tracepoint的事件不同的是，它可以动态地添加和删除。

要启用这个功能，在编译内核时CONFIG_KPROBE_EVENTS=y

与 Event Tracing类似，这不需要通过current_tracer来激活。可以通过/sys/kernel/debug/tracing/kprobe_events添加探测点，并通过/sys/kernel/debug/tracing/events/kprobes/<EVENT>/enable来启用它。

你也可以使用/sys/kernel/debug/tracing/dynamic_events，而不是kprobe_events。该接口也将提供对其他动态事件的统一访问。

Synopsis of kprobe_events

kprobe和内核的ftrace结合使用，需要对内核进行配置，然后添加探测点、进行探测、查看结果。

kprobe配置

CONFIG_KPROBES=y
CONFIG_OPTPROBES=y
CONFIG_KPROBES_ON_FTRACE=y
CONFIG_UPROBES=y
CONFIG_KRETPROBES=y
CONFIG_HAVE_KPROBES=y
CONFIG_HAVE_KRETPROBES=y
CONFIG_HAVE_OPTPROBES=y
CONFIG_HAVE_KPROBES_ON_FTRACE=y
CONFIG_KPROBE_EVENTS=y

kprobe trace events使用

kprobe事件相关的节点有如下：

/sys/kernel/debug/tracing/kprobe_events-----------------------配置kprobe事件属性，增加事件之后会在kprobes下面生成对应目录。
/sys/kernel/debug/tracing/kprobe_profile----------------------kprobe事件统计属性文件。
/sys/kernel/debug/tracing/events/<GRP>/<EVENT>/enable---------使能kprobe事件（<GRP>缺省为kprobes）
/sys/kernel/debug/tracing/events/<GRP>/<EVENT>/filter---------过滤kprobe事件
/sys/kernel/debug/tracing/events/<GRP>/<EVENT>/format---------查询kprobe事件显示格式

kprobe事件配置

新增一个kprobe事件，通过写kprobe_events来设置。

p[:[GRP/]EVENT] [MOD:]SYM[+offs]|MEMADDR [FETCHARGS]-------------------设置一个probe探测点
r[:[GRP/]EVENT] [MOD:]SYM[+0] [FETCHARGS]------------------------------设置一个return probe探测点
-:[GRP/]EVENT----------------------------------------------------------删除一个探测点

细节解释如下：

 GRP        : Group name. If omitted, use "kprobes" for it.------------设置后会在events下创建<GRP>目录（缺省为events/kprobes）。
 EVENT        : Event name. If omitted, the event name is generated based on SYM+offs or MEMADDR.---指定后在events/<GRP>下生成<EVENT>目录。
 MOD        : Module name which has given SYM.--------------------------模块名，一般不设
 SYM[+offs]    : Symbol+offset where the probe is inserted.-------------被探测函数名和偏移
 MEMADDR    : Address where the probe is inserted.----------------------指定被探测的内存绝对地址
 FETCHARGS    : Arguments. Each probe can have up to 128 args.----------指定要获取的参数信息。
 %REG        : Fetch register REG---------------------------------------获取指定寄存器值
 @ADDR        : Fetch memory at ADDR (ADDR should be in kernel)--------获取指定内存地址的值
 @SYM[+|-offs]    : Fetch memory at SYM +|- offs (SYM should be a data symbol)---获取全局变量的值
 $stackN    : Fetch Nth entry of stack (N >= 0)----------------------------------获取指定栈空间值，即sp寄存器+N后的位置值
 $stack    : Fetch stack address.-----------------------------------------------获取sp寄存器值
 $retval    : Fetch return value.(*)--------------------------------------------获取返回值，用于return probe
 $comm        : Fetch current task comm.----------------------------------------获取对应进程名称。
 +|-offs(FETCHARG) : Fetch memory at FETCHARG +|- offs address.(**)-------------获取FETCHARG +|- offs地址处的内存值
 NAME=FETCHARG : Set NAME as the argument name of FETCHARG.---------------------为参数FETCHARG指定名称NAME
 FETCHARG:TYPE : Set TYPE as the type of FETCHARG. Currently, basic types (u8/u16/u32/u64/s8/s16/s32/s64), hexadecimal types
          (x8/x16/x32/x64), "string" and bitfield are supported.----------------设置参数的类型，可以支持字符串和比特类型
  (*) only for return probe.
  (**) this is useful for fetching a field of data structures.

执行如下两条命令就会生成目录/sys/kernel/debug/tracing/events/kprobes/myprobe；第三条命令则可以删除指定kprobe事件，如果要全部删除则echo > /sys/kernel/debug/tracing/kprobe_events。

echo 'p:myprobe do_sys_open dfd=%x0 filename=%x1 flags=%x2 mode=+4($stack)' > /sys/kernel/debug/tracing/kprobe_events
echo 'r:myretprobe do_sys_open ret=$retval' >> /sys/kernel/debug/tracing/kprobe_events-----------------------------------------------------这里面一定要用">>"，不然就会覆盖前面的设置。

echo '-:myprobe' >> /sys/kernel/debug/tracing/kprobe_events
echo '-:myretprobe' >> /sys/kernel/debug/tracing/kprobe_events

参数后面的寄存器是跟架构相关的：在ARM64上，%x0、%x1、%x2分别表示第1/2/3个参数（按照AAPCS64调用约定，前8个整型参数依次通过x0~x7传递），寄存器放不下的参数才需要通过$stack从栈上获取。

函数返回值保存在$retval中

kprobe使能

kprobe事件的使能通过对应事件的enable文件来控制：写1开启探测；写0暂停探测。

echo > /sys/kernel/debug/tracing/trace
echo 'p:myprobe do_sys_open dfd=%x0 filename=%x1 flags=%x2 mode=+4($stack)' > /sys/kernel/debug/tracing/kprobe_events
echo 'r:myretprobe do_sys_open ret=$retval' >> /sys/kernel/debug/tracing/kprobe_events

echo 1 > /sys/kernel/debug/tracing/events/kprobes/myprobe/enable
echo 1 > /sys/kernel/debug/tracing/events/kprobes/myretprobe/enable
ls
echo 0 > /sys/kernel/debug/tracing/events/kprobes/myprobe/enable
echo 0 > /sys/kernel/debug/tracing/events/kprobes/myretprobe/enable

cat /sys/kernel/debug/tracing/trace

然后在/sys/kernel/debug/tracing/trace中可以看到结果。

总结

附录

ARM32,ARM64,X86寄存器及访问方式

ARM32

"r0", pt_regs->r0
"r1", pt_regs->r1
"r2", pt_regs->r2
"r3", pt_regs->r3
"r4", pt_regs->r4
"r5", pt_regs->r5
"r6", pt_regs->r6
"r7", pt_regs->r7
"r8", pt_regs->r8
"r9", pt_regs->r9
"r10",pt_regs->r10
"fp", pt_regs->fp
"ip", pt_regs->ip
"sp", pt_regs->sp
"lr", pt_regs->lr
"pc", pt_regs->pc

ARM64

"x0", pt_regs->regs[0]
"x1", pt_regs->regs[1]
"x2", pt_regs->regs[2]
"x3", pt_regs->regs[3]
"x4", pt_regs->regs[4]
"x5", pt_regs->regs[5]
"x6", pt_regs->regs[6]
"x7", pt_regs->regs[7]
"x8", pt_regs->regs[8]
"x9", pt_regs->regs[9]
"x10", pt_regs->regs[10]
"x11", pt_regs->regs[11]
"x12", pt_regs->regs[12]
"x13", pt_regs->regs[13]
"x14", pt_regs->regs[14]
"x15", pt_regs->regs[15]
"x16", pt_regs->regs[16]
"x17", pt_regs->regs[17]
"x18", pt_regs->regs[18]
"x19", pt_regs->regs[19]
"x20", pt_regs->regs[20]
"x21", pt_regs->regs[21]
"x22", pt_regs->regs[22]
"x23", pt_regs->regs[23]
"x24", pt_regs->regs[24]
"x25", pt_regs->regs[25]
"x26", pt_regs->regs[26]
"x27", pt_regs->regs[27]
"x28", pt_regs->regs[28]
"x29", pt_regs->regs[29]
"x30", pt_regs->regs[30]
"sp",  pt_regs->sp
"pc",  pt_regs->pc
"pstate",pt_regs->pstate

X86

rax     pt_regs->ax 
rcx     pt_regs->cx 
rdx     pt_regs->dx
rbx     pt_regs->bx 
rsp     pt_regs->sp 
rbp     pt_regs->bp 
rdi     pt_regs->di 
rsi     pt_regs->si 
r8      pt_regs->r8 
r9      pt_regs->r9 
r10     pt_regs->r10 
r11     pt_regs->r11 
r12     pt_regs->r12 
r13     pt_regs->r13 
r14     pt_regs->r14 
r15     pt_regs->r15