x86宕机日志解读1

标签：00 016lx x86 宕机 regs module TAINT address 日志

下面以一个典型的x86服务的宕机日志为例进行解读：

[330931.874444] BUG: unable to handle kernel paging request at ffffffffa22a4668
[330931.874532] PGD 3a23067 P4D 3a23067 PUD 3a24063 PMD 1ee9909067 PTE 0
[330931.874618] Oops: 0000 [#1] SMP KASAN NOPTI
[330931.874694] CPU: 14 PID: 2231 Comm: rmmod Kdump: loaded Tainted: GF       W  O     --------- -t - 4.18.0-6.10.0 #1
[330931.874785] Hardware name: Supermicro X10DRi/X10DRi, BIOS 2.0 12/28/2015
[330931.874870] RIP: 0010:kprobes_module_callback+0x144/0x820
[330931.874948] Code: 85 b6 00 00 00 48 8b 1b 48 85 db 0f 84 b7 00 00 00 48 8d 7b 28 48 89 f8 48 c1 e8 03 42 80 3c 38 00 0f 85 66 03 00 00 48 89 e8 <48> 8b 4b 28 48 c1 e8 03 42 80 3c 38 00 0f 85 5a 03 00 00 49 8b 86
[330931.875583] RSP: 0000:ffff888175727d38 EFLAGS: 00010246
[330931.875928] RAX: ffffffffa028d650 RBX: ffffffffa22a4640 RCX: ffffffff81cf0610
[330931.876538] RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffffffffa22a4668
[330931.877150] RBP: ffffffffa028d650 R08: ffffed102eae4f9c R09: 0000000000000000
[330931.877760] R10: 0000000000000001 R11: ffffed102eae4f9b R12: 0000000000000002
[330931.878369] R13: ffffffff853a2a80 R14: ffffffffa028d440 R15: dffffc0000000000
[330931.878980] FS:  00007fdbaabdb700(0000) GS:ffff888814800000(0000) knlGS:0000000000000000
[330931.879592] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[330931.879933] CR2: ffffffffa22a4668 CR3: 0000001d72d72001 CR4: 00000000001626e0
[330931.880541] Call Trace:
[330931.880879]  ? mutex_unlock+0x1d/0x40
[330931.881219]  ? trace_module_notify+0xf4/0x3e0
[330931.881559]  notifier_call_chain+0xc1/0x130
[330931.881910]  __blocking_notifier_call_chain+0x67/0xa0
[330931.882256]  __x64_sys_delete_module+0x324/0x4a0
[330931.882598]  ? __ia32_sys_delete_module+0x4a0/0x4a0
[330931.882941]  ? task_work_run+0x66/0x180
[330931.883280]  ? exit_to_usermode_loop+0xec/0x180
[330931.883623]  ? filp_close+0xf0/0x130
[330931.883958]  do_syscall_64+0xa0/0x370
[330931.884294]  ? page_fault+0x8/0x30
[330931.884628]  entry_SYSCALL_64_after_hwframe+0x65/0xca
[330931.884979] RIP: 0033:0x7fdba9ec2927
[330931.885322] Code: 73 01 c3 48 8b 0d 71 c5 2b 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 b8 b0 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 41 c5 2b 00 f7 d8 64 89 01 48
[330931.886240] RSP: 002b:00007ffc513d8398 EFLAGS: 00000202 ORIG_RAX: 00000000000000b0
[330931.886861] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007fdba9ec2927
[330931.887478] RDX: 0000000000000800 RSI: 0000000000000800 RDI: 0000000002422398
[330931.888091] RBP: 00007ffc513d83c0 R08: 0000000000000000 R09: 1999999999999999
[330931.888705] R10: 0000000000000883 R11: 0000000000000202 R12: 0000000000402410
[330931.889314] R13: 00007ffc513d8600 R14: 0000000000000000 R15: 0000000000000000
[330931.889927] Modules linked in: dm_round_robin sch_netem sch_prio arc4 libarc4 md4 sha512_ssse3 sha512_generic cmac nls_utf8 cifs nfsv3 rpcsec_gss_krb5 nfsv4 dns_resolver xt_comment iptable_nat nf_nat_ipv4 nf_nat ipt_REJECT nf_reject_ipv4 act_gact cls_u32 sch_ingress fuse nfsd auth_rpcgss nfs_acl nfs lockd grace fscache sunrpc ramoops reed_solomon 8021q garp mrp nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack libcrc32c crc32c_intel ip6table_filter ip6_tables iptable_filter ip_tables vhost_net vhost tap mlx5_ib(O) mlx5_core(O) mlxfw tls(t) rdma_ucm(O) ib_uverbs(O) rdma_cm(O) iw_cm(O) ib_cm(O) ib_core(O) mlx_compat(O) iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi vfio_pci vfio_virqfd vfio_iommu_type1 vfio i40e(O) igb(O) i2c_algo_bit dm_multipath mlx4_en mlx4_core tipc ip6_udp_tunnel udp_tunnel tun nbd sb_edac nfit libnvdimm k10temp coretemp bridge stp llc watch_reboot sffs(O) cl_lock(O) cl_softdog(O) kvm_intel kvm irqbypass squashfs overlay loop dm_mod sg sd_mod usbhid iTCO_wdt
[330931.889987]  iTCO_vendor_support mpt3sas(O) mxm_wmi ahci libahci i2c_i801 lpc_ich libata pcspkr ioatdma i2c_core mfd_core raid_class scsi_transport_sas dca ipmi_si(-) ipmi_devintf ipmi_msghandler wmi acpi_power_meter acpi_pad acpi_cpufreq sch_fq_codel [last unloaded: fi_socket]
[330931.894918] Features: eBPF/event
[330931.895255] CR2: ffffffffa22a4668

上面这段日志是因为在内核态访问了一个没有映射到物理地址的虚拟地址引发的，这些日志是在no_context函数里输出的：

no_context

/*
 * no_context - last-resort handling of a kernel page fault.
 *
 * If an exception fixup is registered for the faulting instruction it is
 * applied (optionally forcing a signal on the task) and the function
 * returns. Otherwise, after the stack-overflow, prefetch, erratum and EFI
 * checks below, the oops report is printed and the path ends in oops_end().
 *
 * @regs:       register state saved at the time of the fault
 * @error_code: page-fault error code
 * @address:    faulting virtual address
 * @signal:     signal to force when the sig_on_uaccess_err path is taken
 * @si_code:    si_code passed along with that signal
 */
static noinline void
no_context(struct pt_regs *regs, unsigned long error_code,
	   unsigned long address, int signal, int si_code)
{
	struct task_struct *tsk = current;
	unsigned long flags;
	int sig;

	/* Are we prepared to handle this kernel fault? */
	if (fixup_exception(regs, X86_TRAP_PF)) {
		/*
		 * Any interrupt that takes a fault gets the fixup. This makes
		 * the below recursive fault logic only apply to a faults from
		 * task context.
		 */
		if (in_interrupt())
			return;

		/*
		 * Per the above we're !in_interrupt(), aka. task context.
		 *
		 * In this case we need to make sure we're not recursively
		 * faulting through the emulate_vsyscall() logic.
		 */
		if (current->thread.sig_on_uaccess_err && signal) {
			tsk->thread.trap_nr = X86_TRAP_PF;
			tsk->thread.error_code = error_code | X86_PF_USER;
			tsk->thread.cr2 = address;

			/* XXX: hwpoison faults will set the wrong code. */
			force_sig_fault(signal, si_code, (void __user *)address,
					tsk);
		}

		/*
		 * Barring that, we can do the fixup and be happy.
		 */
		return;
	}

#ifdef CONFIG_VMAP_STACK
	/*
	 * Stack overflow?  During boot, we can fault near the initial
	 * stack in the direct map, but that's not an overflow -- check
	 * that we're in vmalloc space to avoid this.
	 */
	if (is_vmalloc_addr((void *)address) &&
	    (((unsigned long)tsk->stack - 1 - address < PAGE_SIZE) ||
	     address - ((unsigned long)tsk->stack + THREAD_SIZE) < PAGE_SIZE)) {
		unsigned long stack = this_cpu_read(orig_ist.ist[DOUBLEFAULT_STACK]) - sizeof(void *);
		/*
		 * We're likely to be running with very little stack space
		 * left.  It's plausible that we'd hit this condition but
		 * double-fault even before we get this far, in which case
		 * we're fine: the double-fault handler will deal with it.
		 *
		 * We don't want to make it all the way into the oops code
		 * and then double-fault, though, because we're likely to
		 * break the console driver and lose most of the stack dump.
		 */
		/*
		 * Switch RSP to the double-fault IST stack, call
		 * handle_stack_overflow(message, regs, address) and spin
		 * forever if it ever returns.
		 */
		asm volatile ("movq %[stack], %%rsp\n\t"
			      "call handle_stack_overflow\n\t"
			      "1: jmp 1b"
			      : ASM_CALL_CONSTRAINT
			      : "D" ("kernel stack overflow (page fault)"),
				"S" (regs), "d" (address),
				[stack] "rm" (stack));
		unreachable();
	}
#endif

	/*
	 * 32-bit:
	 *
	 *   Valid to do another page fault here, because if this fault
	 *   had been triggered by is_prefetch fixup_exception would have
	 *   handled it.
	 *
	 * 64-bit:
	 *
	 *   Hall of shame of CPU/BIOS bugs.
	 */
	if (is_prefetch(regs, error_code, address))
		return;

	if (is_errata93(regs, address))
		return;

	/*
	 * Buggy firmware could access regions which might page fault, try to
	 * recover from such faults.
	 */
	if (IS_ENABLED(CONFIG_EFI))
		efi_recover_from_page_fault(address);

	/*
	 * Oops. The kernel tried to access some bad page. We will have to
	 * terminate things with extreme prejudice:
	 */
	flags = oops_begin();

	// Print the reason for the address translation fault
	show_fault_oops(regs, error_code, address);

	// Check whether the end of the task's kernel stack still holds the preset
	// magic value STACK_END_MAGIC; if not, the kernel stack has overflowed
	if (task_stack_end_corrupted(tsk))
		printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");

	// Record the address whose translation faulted
	tsk->thread.cr2		= address;
	// Trap number: page fault
	tsk->thread.trap_nr	= X86_TRAP_PF;
	// Error code
	tsk->thread.error_code	= error_code;

	/* Default to SIGKILL; drop the signal when __die() returns non-zero. */
	sig = SIGKILL;
	if (__die("Oops", regs, error_code))
		sig = 0;

	/* Executive summary in case the body of the oops scrolled away */
	printk(KERN_DEFAULT "CR2: %016lx\n", address);

	oops_end(flags, regs, sig);
}

[330931.874444] BUG: unable to handle kernel paging request at ffffffffa22a4668
这行日志是说内核无法处理对地址ffffffffa22a4668的paging request。x86的地址翻译分为分段和分页两个阶段，这个异常发生在分页阶段，也就是MMU页表翻译阶段，导致地址翻译错误的虚拟地址是ffffffffa22a4668。这行日志对应的源码是：

	pr_alert("BUG: unable to handle kernel %s at %px\n",
		 address < PAGE_SIZE ? "NULL pointer dereference" : "paging request",
		 (void *)address);

[330931.874532] PGD 3a23067 P4D 3a23067 PUD 3a24063 PMD 1ee9909067 PTE 0
既然是地址翻译出错了，那么内核为了方便调试，就用CPU模拟MMU的地址翻译过程，并且把翻译过程中涉及到的页表项都输出出来，便于分析到底是哪一级页表翻译出了问题。这条日志依次输出了翻译这个虚拟地址时用到的PGD、P4D、PUD、PMD以及PTE的页表项的内容，可以明显看到，PTE这一级的页表项的内容是0，也就是没有建立页表映射。这行日志对应的源码是：

dump_pagetable

/*
 * dump_pagetable - print the page-table entries used to translate @address.
 *
 * Starts from the page-table base read from CR3 and prints, on one log line,
 * the PGD, P4D, PUD, PMD and PTE entries in turn. The walk stops early at the
 * first entry that is not present or (from P4D down to PMD) maps a large
 * page. Each entry pointer is checked with bad_address() before it is
 * dereferenced; if that check fails, "BAD" is printed instead.
 */
static void dump_pagetable(unsigned long address)
{
	/* Kernel-virtual pointer to the top-level table named by CR3. */
	pgd_t *base = __va(read_cr3_pa());
	pgd_t *pgd = base + pgd_index(address);
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	if (bad_address(pgd))
		goto bad;

	/* First piece of the line; the lower levels are appended with pr_cont(). */
	pr_info("PGD %lx ", pgd_val(*pgd));

	if (!pgd_present(*pgd))
		goto out;

	p4d = p4d_offset(pgd, address);
	if (bad_address(p4d))
		goto bad;

	pr_cont("P4D %lx ", p4d_val(*p4d));
	/* A non-present or large entry has no lower-level table to descend into. */
	if (!p4d_present(*p4d) || p4d_large(*p4d))
		goto out;

	pud = pud_offset(p4d, address);
	if (bad_address(pud))
		goto bad;

	pr_cont("PUD %lx ", pud_val(*pud));
	if (!pud_present(*pud) || pud_large(*pud))
		goto out;

	pmd = pmd_offset(pud, address);
	if (bad_address(pmd))
		goto bad;

	pr_cont("PMD %lx ", pmd_val(*pmd));
	if (!pmd_present(*pmd) || pmd_large(*pmd))
		goto out;

	pte = pte_offset_kernel(pmd, address);
	if (bad_address(pte))
		goto bad;

	pr_cont("PTE %lx", pte_val(*pte));
out:
	/* Terminate the line built up by the pr_info()/pr_cont() calls above. */
	pr_cont("\n");
	return;
bad:
	/* An entry pointer could not be read safely. */
	pr_info("BAD\n");
}

从这个函数的实现中可以看到，在遍历页表过程中，用到了多种函数：

bad_address: 判断指定的地址是否可以访问，因为接下来会访问这个地址，如果不检查直接访问，可能会再次触发异常。这里用到了probe_kernel_address函数来判断一个内核地址是否可以访问，跟copy_from_user的原理类似，如果地址不能访问，probe_kernel_address同样会触发地址异常，不过这个函数的特殊之处是提供了修复机制，即使发生了异常，还是可以正常返回，通过返回值我们可以判断这个地址是否可以访问，如果返回0，表示访问正常，如果是-EFAULT，表示在访问时发生了地址异常，这个地址不能访问
xxx_present: 判断页表项的present标志是否置位，即该页表项当前是否指向有效的下一级页表或物理页；如果没有置位（例如还没有建立映射，或者对应内存的内容被swap到磁盘上了），就没有必要继续往下遍历了
xxx_large: 判断页表项是否以大页的方式映射（如pud_large、pmd_large等），如果是的话，那么就没有必要继续往下遍历了；如果输出到了PTE一级，说明没有采用大页的映射方式。

接下来的大部分内容都是通过__die输出的：

__die

/*
 * __die - print the oops banner, the register dump and the module list.
 *
 * @str:  banner prefix (printed before the colon)
 * @regs: register state at the time of the fault
 * @err:  error code; only the low 16 bits are printed
 *
 * Returns 1 if a die notifier answered NOTIFY_STOP, otherwise 0.
 */
int __die(const char *str, struct pt_regs *regs, long err)
{
	/* Save the regs of the first oops for the executive summary later. */
	if (!die_counter)
		exec_summary_regs = *regs;

	/*
	 * Banner line: "<str>: <err & 0xffff> [#<count>]" followed by one tag
	 * per enabled feature. die_counter is incremented as part of printing,
	 * so the first oops shows [#1].
	 */
	printk(KERN_DEFAULT
	       "%s: %04lx [#%d]%s%s%s%s%s\n", str, err & 0xffff, ++die_counter,
	       IS_ENABLED(CONFIG_PREEMPT) ? " PREEMPT"         : "",
	       IS_ENABLED(CONFIG_SMP)     ? " SMP"             : "",
	       debug_pagealloc_enabled()  ? " DEBUG_PAGEALLOC" : "",
	       IS_ENABLED(CONFIG_KASAN)   ? " KASAN"           : "",
	       IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION) ?
	       (boot_cpu_has(X86_FEATURE_PTI) ? " PTI" : " NOPTI") : "");

	show_regs(regs);
	print_modules();

	/* Let registered die notifiers see the oops; NOTIFY_STOP is reported as 1. */
	if (notify_die(DIE_OOPS, str, regs, err,
			current->thread.trap_nr, SIGSEGV) == NOTIFY_STOP)
		return 1;

	return 0;
}

[330931.874618] Oops: 0000 [#1] SMP KASAN NOPTI
其中"Oops"是传进来的，"0000"表示错误码的低16位（即err & 0xffff），"[#1]"表示die_counter的计数值，每执行一次__die，这个计数值就加1，后面的几个标志的含义如下：


CONFIG_PREEMPT	" PREEMPT"	内核在编译时是否开启了内核抢占
CONFIG_SMP	" SMP"	是否配置了对对称多处理器的支持
debug_pagealloc_enabled()	" DEBUG_PAGEALLOC"	开启这个功能后，当page被释放后，page对应的映射页表的present标志会被清除，以便发现UAF这样的问题
CONFIG_KASAN	" KASAN"	是否开启了KASAN，用于发现内存访问问题，如UAF、越界等
CONFIG_PAGE_TABLE_ISOLATION	" PTI" 或者" NOPTI" 或者""，如果是空，表示内核配置没有开启；如果是PTI，表示内核配置开启了，并且运行时启用了PTI（boot_cpu_has(X86_FEATURE_PTI)为真）；如果是NOPTI，表示内核配置虽然开启，但是运行时没有启用PTI（例如CPU不受Meltdown影响，或者通过nopti启动参数关闭）	是否支持内核页表隔离，参考Page Table Isolation (PTI)

下面的一大段是发生异常时的CPU上下文信息：

[330931.874694] CPU: 14 PID: 2231 Comm: rmmod Kdump: loaded Tainted: GF       W  O     --------- -t - 4.18.0-6.10.0 #1
[330931.874785] Hardware name: Supermicro X10DRi/X10DRi, BIOS 2.0 12/28/2015
[330931.874870] RIP: 0010:kprobes_module_callback+0x144/0x820
[330931.874948] Code: 85 b6 00 00 00 48 8b 1b 48 85 db 0f 84 b7 00 00 00 48 8d 7b 28 48 89 f8 48 c1 e8 03 42 80 3c 38 00 0f 85 66 03 00 00 48 89 e8 <48> 8b 4b 28 48 c1 e8 03 42 80 3c 38 00 0f 85 5a 03 00 00 49 8b 86
[330931.875583] RSP: 0000:ffff888175727d38 EFLAGS: 00010246
[330931.875928] RAX: ffffffffa028d650 RBX: ffffffffa22a4640 RCX: ffffffff81cf0610
[330931.876538] RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffffffffa22a4668
[330931.877150] RBP: ffffffffa028d650 R08: ffffed102eae4f9c R09: 0000000000000000
[330931.877760] R10: 0000000000000001 R11: ffffed102eae4f9b R12: 0000000000000002
[330931.878369] R13: ffffffff853a2a80 R14: ffffffffa028d440 R15: dffffc0000000000
[330931.878980] FS:  00007fdbaabdb700(0000) GS:ffff888814800000(0000) knlGS:0000000000000000
[330931.879592] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[330931.879933] CR2: ffffffffa22a4668 CR3: 0000001d72d72001 CR4: 00000000001626e0
[330931.880541] Call Trace:
[330931.880879]  ? mutex_unlock+0x1d/0x40
[330931.881219]  ? trace_module_notify+0xf4/0x3e0
[330931.881559]  notifier_call_chain+0xc1/0x130
[330931.881910]  __blocking_notifier_call_chain+0x67/0xa0
[330931.882256]  __x64_sys_delete_module+0x324/0x4a0
[330931.882598]  ? __ia32_sys_delete_module+0x4a0/0x4a0
[330931.882941]  ? task_work_run+0x66/0x180
[330931.883280]  ? exit_to_usermode_loop+0xec/0x180
[330931.883623]  ? filp_close+0xf0/0x130
[330931.883958]  do_syscall_64+0xa0/0x370
[330931.884294]  ? page_fault+0x8/0x30
[330931.884628]  entry_SYSCALL_64_after_hwframe+0x65/0xca
[330931.884979] RIP: 0033:0x7fdba9ec2927
[330931.885322] Code: 73 01 c3 48 8b 0d 71 c5 2b 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 b8 b0 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 41 c5 2b 00 f7 d8 64 89 01 48
[330931.886240] RSP: 002b:00007ffc513d8398 EFLAGS: 00000202 ORIG_RAX: 00000000000000b0
[330931.886861] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007fdba9ec2927
[330931.887478] RDX: 0000000000000800 RSI: 0000000000000800 RDI: 0000000002422398
[330931.888091] RBP: 00007ffc513d83c0 R08: 0000000000000000 R09: 1999999999999999
[330931.888705] R10: 0000000000000883 R11: 0000000000000202 R12: 0000000000402410
[330931.889314] R13: 00007ffc513d8600 R14: 0000000000000000 R15: 0000000000000000

上面的这段日志是通过show_regs输出的：

show_regs

/*
 * show_regs - print the generic header, the register dump and, for a fault
 * taken in kernel mode, a backtrace of the current task's kernel stack.
 */
void show_regs(struct pt_regs *regs)
{
	int from_user;

	show_regs_print_info(KERN_DEFAULT);

	from_user = user_mode(regs);

	/* Faults from user mode get the user register set and no stack trace. */
	if (from_user) {
		__show_regs(regs, SHOW_REGS_USER);
		return;
	}

	/* In-kernel fault: print the full register set ... */
	__show_regs(regs, SHOW_REGS_ALL);

	/* ... and the stack at the time of the fault, as a backtrace. */
	show_trace_log_lvl(current, regs, NULL, KERN_DEFAULT);
}

/**
 * show_regs_print_info - print generic debug info for show_regs()
 * @log_lvl: log level
 *
 * show_regs() implementations can use this function to print out generic
 * debug information.
 */
void show_regs_print_info(const char *log_lvl)
{
	/* Same header as the generic dump_stack() output. */
	dump_stack_print_info(log_lvl);
}

/**
 * dump_stack_print_info - print generic debug info for dump_stack()
 * @log_lvl: log level
 *
 * Arch-specific dump_stack() implementations can use this function to
 * print out the same debug information as the generic dump_stack().
 */
void dump_stack_print_info(const char *log_lvl)
{
	/*
	 * Header line: CPU number, PID, command name (at most 20 characters),
	 * optional "Kdump: loaded " marker, taint string, kernel release, and
	 * the part of the version string that precedes its first space.
	 */
	printk("%sCPU: %d PID: %d Comm: %.20s %s%s %s %.*s\n",
	       log_lvl, raw_smp_processor_id(), current->pid, current->comm,
	       kexec_crash_loaded() ? "Kdump: loaded " : "",
	       print_tainted(),
	       init_utsname()->release,
	       (int)strcspn(init_utsname()->version, " "),
	       init_utsname()->version);

	// Print the machine's hardware description, if one has been recorded
	if (dump_stack_arch_desc_str[0] != '\0')
		printk("%sHardware name: %s\n",
		       log_lvl, dump_stack_arch_desc_str);

	// If the faulting task is a workqueue worker thread, print some information about that worker,
	// such as the name of the workqueue it belongs to and the callback it is currently executing
	print_worker_info(log_lvl, current);
}

[330931.874694] CPU: 14 PID: 2231 Comm: rmmod Kdump: loaded Tainted: GF W O --------- -t - 4.18.0-6.10.0 #1
其中"14"表示触发异常的CPU的编号，"2231"表示触发异常时CPU14上正在执行的进程的进程号，"rmmod"表示进程2231的名字，"Kdump: loaded"表示kdump已经把second kernel加载成功，可以执行kdump，否则不打印kdump的任何信息。"Tainted: GF W O --------- -t -"表示内核的污染信息，通过print_tainted()输出，关于taint相关的信息，参考Tainted kernels，对于这些标志的含义见下文。"4.18.0-6.10.0"表示内核的版本号（对应uname -r的输出），"#1"来自version中第一个空格之前的部分（通过"%.*s"配合strcspn截取），而version对应uname -v的输出。

参考：https://docs.kernel.org/admin-guide/tainted-kernels.html#more-detailed-explanation-for-tainting

标志	含义	备注
G	if all modules loaded have a GPL or compatible license	`P` if any proprietary module has been loaded. Modules without a MODULE_LICENSE or with a MODULE_LICENSE that is not recognised by insmod as GPL compatible are assumed to be proprietary.
F	if any module was force loaded by insmod -f	' ' if all modules were loaded normally
W	if a warning has previously been issued by the kernel.
O	if an externally-built ("out-of-tree") module has been loaded.

print_tainted的定义如下：

点击查看代码

/* Describes how a single taint bit is rendered in taint strings. */
struct taint_flag {
	char c_true;	/* character printed when tainted */
	char c_false;	/* character printed when not tainted */
	bool module;	/* also show as a per-module taint flag */
};

/*
 * Table of taint characters, indexed by taint bit number. Each entry is
 * { c_true, c_false, module }: the character shown when the bit is set, the
 * character shown when it is clear, and whether the flag is also reported
 * per module.
 *
 * TAINT_FORCED_RMMOD could be a per-module flag but the module
 * is being removed anyway.
 */
const struct taint_flag taint_flags[TAINT_FLAGS_COUNT] = {
	[ TAINT_PROPRIETARY_MODULE ]	= { 'P', 'G', true },
	[ TAINT_FORCED_MODULE ]		= { 'F', ' ', true },
	[ TAINT_CPU_OUT_OF_SPEC ]	= { 'S', ' ', false },
	[ TAINT_FORCED_RMMOD ]		= { 'R', ' ', false },
	[ TAINT_MACHINE_CHECK ]		= { 'M', ' ', false },
	[ TAINT_BAD_PAGE ]		= { 'B', ' ', false },
	[ TAINT_USER ]			= { 'U', ' ', false },
	[ TAINT_DIE ]			= { 'D', ' ', false },
	[ TAINT_OVERRIDDEN_ACPI_TABLE ]	= { 'A', ' ', false },
	[ TAINT_WARN ]			= { 'W', ' ', false },
	[ TAINT_CRAP ]			= { 'C', ' ', true },
	[ TAINT_FIRMWARE_WORKAROUND ]	= { 'I', ' ', false },
	[ TAINT_OOT_MODULE ]		= { 'O', ' ', true },
	[ TAINT_UNSIGNED_MODULE ]	= { 'E', ' ', true },
	[ TAINT_SOFTLOCKUP ]		= { 'L', ' ', false },
	[ TAINT_LIVEPATCH ]		= { 'K', ' ', true },
	[ TAINT_AUX ]			= { 'X', ' ', true },
	[ TAINT_RANDSTRUCT ]		= { 'T', ' ', true },
};

/**
 * print_tainted - build the textual form of the kernel taint state.
 *
 * Returns "Not tainted" when no taint bit is set; otherwise "Tainted: "
 * followed by one character per taint bit, taken from taint_flags[]
 * (c_true when the bit is set, c_false when it is clear).
 *
 * For individual taint flag meanings, see Documentation/sysctl/kernel.txt
 *
 * The result lives in a static buffer: it is overwritten by the next call
 * to print_tainted(), but is always NUL terminated.
 */
const char *print_tainted(void)
{
	static char buf[TAINT_FLAGS_COUNT + sizeof("Tainted: ")];
	char *out;
	int bit;

	BUILD_BUG_ON(ARRAY_SIZE(taint_flags) != TAINT_FLAGS_COUNT);

	/* Nothing set: fixed message, no per-flag characters. */
	if (!tainted_mask) {
		snprintf(buf, sizeof(buf), "Not tainted");
		return buf;
	}

	/* Append one character per taint bit right after the prefix. */
	out = buf + sprintf(buf, "Tainted: ");
	for (bit = 0; bit < TAINT_FLAGS_COUNT; bit++) {
		if (test_bit(bit, &tainted_mask))
			*out++ = taint_flags[bit].c_true;
		else
			*out++ = taint_flags[bit].c_false;
	}
	*out = 0;

	return buf;
}

[330931.874785] Hardware name: Supermicro X10DRi/X10DRi, BIOS 2.0 12/28/2015
这行日志是在dump_stack_print_info中输出的。
寄存器输出部分

[330931.874870] RIP: 0010:kprobes_module_callback+0x144/0x820
[330931.874948] Code: 85 b6 00 00 00 48 8b 1b 48 85 db 0f 84 b7 00 00 00 48 8d 7b 28 48 89 f8 48 c1 e8 03 42 80 3c 38 00 0f 85 66 03 00 00 48 89 e8 <48> 8b 4b 28 48 c1 e8 03 42 80 3c 38 00 0f 85 5a 03 00 00 49 8b 86
[330931.875583] RSP: 0000:ffff888175727d38 EFLAGS: 00010246
[330931.875928] RAX: ffffffffa028d650 RBX: ffffffffa22a4640 RCX: ffffffff81cf0610
[330931.876538] RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffffffffa22a4668
[330931.877150] RBP: ffffffffa028d650 R08: ffffed102eae4f9c R09: 0000000000000000
[330931.877760] R10: 0000000000000001 R11: ffffed102eae4f9b R12: 0000000000000002
[330931.878369] R13: ffffffff853a2a80 R14: ffffffffa028d440 R15: dffffc0000000000
[330931.878980] FS:  00007fdbaabdb700(0000) GS:ffff888814800000(0000) knlGS:0000000000000000
[330931.879592] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[330931.879933] CR2: ffffffffa22a4668 CR3: 0000001d72d72001 CR4: 00000000001626e0

这个是在__show_regs中完成的：

__show_regs

/*
 * __show_regs - print the register state for an oops.
 *
 * @regs: saved register frame
 * @mode: how much to print:
 *        SHOW_REGS_SHORT - the frame from show_iret_regs() plus the
 *                          general-purpose registers;
 *        SHOW_REGS_USER  - the above plus the FS base and the
 *                          MSR_KERNEL_GS_BASE value;
 *        any other mode  - the above plus segment selectors, FS/GS bases,
 *                          control registers and, when not in their default
 *                          state, the debug registers, plus PKRU if
 *                          X86_FEATURE_OSPKE is set.
 *
 * Prints also some state that isn't saved in the pt_regs
 */
void __show_regs(struct pt_regs *regs, enum show_regs_mode mode)
{
	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
	unsigned long d0, d1, d2, d3, d6, d7;
	unsigned int fsindex, gsindex;
	unsigned int ds, cs, es;

	show_iret_regs(regs);

	/* ORIG_RAX is appended to the current line unless it is -1. */
	if (regs->orig_ax != -1)
		pr_cont(" ORIG_RAX: %016lx\n", regs->orig_ax);
	else
		pr_cont("\n");

	printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n",
	       regs->ax, regs->bx, regs->cx);
	printk(KERN_DEFAULT "RDX: %016lx RSI: %016lx RDI: %016lx\n",
	       regs->dx, regs->si, regs->di);
	printk(KERN_DEFAULT "RBP: %016lx R08: %016lx R09: %016lx\n",
	       regs->bp, regs->r8, regs->r9);
	printk(KERN_DEFAULT "R10: %016lx R11: %016lx R12: %016lx\n",
	       regs->r10, regs->r11, regs->r12);
	printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n",
	       regs->r13, regs->r14, regs->r15);

	/* Short mode stops after the general-purpose registers. */
	if (mode == SHOW_REGS_SHORT)
		return;

	/* User mode: only the FS base and the MSR_KERNEL_GS_BASE value are added. */
	if (mode == SHOW_REGS_USER) {
		rdmsrl(MSR_FS_BASE, fs);
		rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
		printk(KERN_DEFAULT "FS:  %016lx GS:  %016lx\n",
		       fs, shadowgs);
		return;
	}

	/* Read the segment selectors currently loaded on this CPU. */
	asm("movl %%ds,%0" : "=r" (ds));
	asm("movl %%cs,%0" : "=r" (cs));
	asm("movl %%es,%0" : "=r" (es));
	asm("movl %%fs,%0" : "=r" (fsindex));
	asm("movl %%gs,%0" : "=r" (gsindex));

	/* Segment base addresses come from MSRs, not from the selectors. */
	rdmsrl(MSR_FS_BASE, fs);
	rdmsrl(MSR_GS_BASE, gs);
	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

	cr0 = read_cr0();
	cr2 = read_cr2();
	cr3 = __read_cr3();
	cr4 = __read_cr4();

	printk(KERN_DEFAULT "FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
	       fs, fsindex, gs, gsindex, shadowgs);
	printk(KERN_DEFAULT "CS:  %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
			es, cr0);
	printk(KERN_DEFAULT "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
			cr4);

	get_debugreg(d0, 0);
	get_debugreg(d1, 1);
	get_debugreg(d2, 2);
	get_debugreg(d3, 3);
	get_debugreg(d6, 6);
	get_debugreg(d7, 7);

	/* Only print out debug registers if they are in their non-default state. */
	if (!((d0 == 0) && (d1 == 0) && (d2 == 0) && (d3 == 0) &&
	    (d6 == DR6_RESERVED) && (d7 == 0x400))) {
		printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n",
		       d0, d1, d2);
		printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n",
		       d3, d6, d7);
	}

	/* PKRU is printed only when the CPU reports X86_FEATURE_OSPKE. */
	if (boot_cpu_has(X86_FEATURE_OSPKE))
		printk(KERN_DEFAULT "PKRU: %08x\n", read_pkru());
}

模块列表

[330931.889927] Modules linked in: dm_round_robin sch_netem sch_prio arc4 libarc4 md4 sha512_ssse3 sha512_generic cmac nls_utf8 cifs nfsv3 rpcsec_gss_krb5 nfsv4 dns_resolver xt_comment iptable_nat nf_nat_ipv4 nf_nat ipt_REJECT nf_reject_ipv4 act_gact cls_u32 sch_ingress fuse nfsd auth_rpcgss nfs_acl nfs lockd grace fscache sunrpc ramoops reed_solomon 8021q garp mrp nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack libcrc32c crc32c_intel ip6table_filter ip6_tables iptable_filter ip_tables vhost_net vhost tap mlx5_ib(O) mlx5_core(O) mlxfw tls(t) rdma_ucm(O) ib_uverbs(O) rdma_cm(O) iw_cm(O) ib_cm(O) ib_core(O) mlx_compat(O) iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi vfio_pci vfio_virqfd vfio_iommu_type1 vfio i40e(O) igb(O) i2c_algo_bit dm_multipath mlx4_en mlx4_core tipc ip6_udp_tunnel udp_tunnel tun nbd sb_edac nfit libnvdimm k10temp coretemp bridge stp llc watch_reboot sffs(O) cl_lock(O) cl_softdog(O) kvm_intel kvm irqbypass squashfs overlay loop dm_mod sg sd_mod usbhid iTCO_wdt
[330931.889987]  iTCO_vendor_support mpt3sas(O) mxm_wmi ahci libahci i2c_i801 lpc_ich libata pcspkr ioatdma i2c_core mfd_core raid_class scsi_transport_sas dca ipmi_si(-) ipmi_devintf ipmi_msghandler wmi acpi_power_meter acpi_pad acpi_cpufreq sch_fq_codel [last unloaded: fi_socket]

这个是通过print_modules输出的：

print_modules

/*
 * print_modules - emit the "Modules linked in:" line of an oops.
 *
 * Every module that is past the UNFORMED state is listed by name, followed
 * by its flag suffix from module_flags(); the name of the last unloaded
 * module, if any, closes the line.
 *
 * Don't grab lock, we're oopsing.
 */
void print_modules(void)
{
	char flagbuf[MODULE_FLAGS_BUF_SIZE];
	struct module *m;

	printk(KERN_DEFAULT "Modules linked in:");

	/* Most callers should already have preempt disabled, but make sure */
	preempt_disable();
	list_for_each_entry_rcu(m, &modules, list) {
		if (m->state != MODULE_STATE_UNFORMED)
			pr_cont(" %s%s", m->name, module_flags(m, flagbuf));
	}
	preempt_enable();

	if (last_unloaded_module[0] != '\0')
		pr_cont(" [last unloaded: %s]", last_unloaded_module);

	pr_cont("\n");
}

其中每个模块名的后面可能会输出一个标志，表示模块的当前状态，上面的module_flags就是负责输出模块标志的：

module_flags

/*
 * module_flags - format the "(...)" suffix printed after a module name.
 *
 * Writes an empty string when the module has no taints and is neither being
 * loaded nor unloaded. Otherwise writes '(' + per-module taint characters +
 * '-' (being unloaded) or '+' (being loaded) + ')'. Returns @buf.
 *
 * Keep in sync with MODULE_FLAGS_BUF_SIZE !!!
 */
static char *module_flags(struct module *mod, char *buf)
{
	int pos = 0;
	int going, coming;

	BUG_ON(mod->state == MODULE_STATE_UNFORMED);

	going = (mod->state == MODULE_STATE_GOING);	/* module is being removed */
	coming = (mod->state == MODULE_STATE_COMING);	/* module is being loaded */

	/* Nothing to report: return an empty suffix. */
	if (!mod->taints && !going && !coming) {
		buf[pos] = '\0';
		return buf;
	}

	buf[pos++] = '(';
	pos += module_flags_taint(mod, buf + pos);
	/* Show a - for module-is-being-unloaded */
	if (going)
		buf[pos++] = '-';
	/* Show a + for module-is-being-loaded */
	if (coming)
		buf[pos++] = '+';
	buf[pos++] = ')';
	buf[pos] = '\0';

	return buf;
}

/*
 * module_flags_taint - write the per-module taint characters of @mod.
 *
 * For each taint bit that is marked as a per-module flag in taint_flags[]
 * and is set in mod->taints, the flag's c_true character is stored in @buf.
 * No NUL terminator is written. Returns the number of characters stored.
 */
static size_t module_flags_taint(struct module *mod, char *buf)
{
	size_t len = 0;
	int bit;

	for (bit = 0; bit < TAINT_FLAGS_COUNT; bit++) {
		/* Skip flags that are not reported per module. */
		if (!taint_flags[bit].module)
			continue;
		/* Skip taint bits this module has not set. */
		if (!test_bit(bit, &mod->taints))
			continue;
		buf[len++] = taint_flags[bit].c_true;
	}

	return len;
}

/*
 * Table of taint characters, indexed by taint bit number. Each entry is
 * { c_true, c_false, module }; entries whose last field is true are the ones
 * module_flags_taint() prints after a module name.
 *
 * TAINT_FORCED_RMMOD could be a per-module flag but the module
 * is being removed anyway.
 */
const struct taint_flag taint_flags[TAINT_FLAGS_COUNT] = {
	[ TAINT_PROPRIETARY_MODULE ]	= { 'P', 'G', true },
	[ TAINT_FORCED_MODULE ]		= { 'F', ' ', true },
	[ TAINT_CPU_OUT_OF_SPEC ]	= { 'S', ' ', false },
	[ TAINT_FORCED_RMMOD ]		= { 'R', ' ', false },
	[ TAINT_MACHINE_CHECK ]		= { 'M', ' ', false },
	[ TAINT_BAD_PAGE ]		= { 'B', ' ', false },
	[ TAINT_USER ]			= { 'U', ' ', false },
	[ TAINT_DIE ]			= { 'D', ' ', false },
	[ TAINT_OVERRIDDEN_ACPI_TABLE ]	= { 'A', ' ', false },
	[ TAINT_WARN ]			= { 'W', ' ', false },
	[ TAINT_CRAP ]			= { 'C', ' ', true },
	[ TAINT_FIRMWARE_WORKAROUND ]	= { 'I', ' ', false },
	[ TAINT_OOT_MODULE ]		= { 'O', ' ', true },
	[ TAINT_UNSIGNED_MODULE ]	= { 'E', ' ', true },
	[ TAINT_SOFTLOCKUP ]		= { 'L', ' ', false },
	[ TAINT_LIVEPATCH ]		= { 'K', ' ', true },
	[ TAINT_AUX ]			= { 'X', ' ', true },
	[ TAINT_RANDSTRUCT ]		= { 'T', ' ', true },
};

参考：https://docs.kernel.org/admin-guide/tainted-kernels.html#more-detailed-explanation-for-tainting

标签：00,016lx,x86,宕机,regs,module,TAINT,address,日志
From： https://www.cnblogs.com/pengdonglin137/p/17815343.html

相关文章

赞助商

阅读排行