core 的堆栈是这样子的:
(gdb) bt
#0 0x00007ffff4a96a7c in pthread_kill () from /lib/x86_64-linux-gnu/libc.so.6
#1 0x00007ffff4a42476 in raise () from /lib/x86_64-linux-gnu/libc.so.6
#2 0x00007ffff4a287f3 in abort () from /lib/x86_64-linux-gnu/libc.so.6
#3 0x00007ffff4a896f6 in ?? () from /lib/x86_64-linux-gnu/libc.so.6
#4 0x00007ffff4b3676a in __fortify_fail () from /lib/x86_64-linux-gnu/libc.so.6
#5 0x00007ffff4b350c6 in __chk_fail () from /lib/x86_64-linux-gnu/libc.so.6
#6 0x00007ffff4b34b09 in __strncpy_chk () from /lib/x86_64-linux-gnu/libc.so.6
#7 0x0000555555586c68 in strncpy (__len=38, __src=0x5555555be828 "Some of the addresses are not on link.", __dest=0x55555965bf58 <parp_lc_nsi_pkt_pool+344> "All addresses still on link.")
at /usr/include/x86_64-linux-gnu/bits/string_fortified.h:95
#8 rx_fr_ge_dhcp6_pkt_00 (dhcp6_hdr=0x55555965bf2e <parp_lc_nsi_pkt_pool+302>, dhcp6_hdr@entry=0x55555965bef4 <parp_lc_nsi_pkt_pool+244>, dhcp6_len=70, dhcp6_len@entry=128,
rsp_eth_ipv6_pkt=rsp_eth_ipv6_pkt@entry=0x555555619580 <pkt_resp0+256> "\002\020\030dB0", sock_vrf_id=sock_vrf_id@entry=0 '\000') at /home/jenkins/casa/parp/dhcpv6_pkt.c:2382
#9 0x0000555555582583 in rx_fr_ge_dhcp6_pkt_0 (dhcp6_hdr=dhcp6_hdr@entry=0x55555965bef4 <parp_lc_nsi_pkt_pool+244>, dhcp6_len=dhcp6_len@entry=128, sock_vrf_id=sock_vrf_id@entry=0 '\000')
at /home/jenkins/casa/parp/smm_rx_ipv6.c:192
#10 0x000055555556b34a in lc_nsi_pkt_processing (pkt_desc=pkt_desc@entry=0x55555965be00 <parp_lc_nsi_pkt_pool>) at /home/jenkins/casa/parp/smm_arp_main.c:755
#11 0x00005555555a732a in parp_lc_nsi_msg_queue_thread () at /home/jenkins/casa/parp/smm_arp_queue.c:376
frame 8 里面的代码片段是这样的:
2376 int orignal_len = ntohs(status_code->dh6_status_len);
2377 int new_len = strlen(NotOnLink);
2378 u_int8_t *ptr = (u_int8_t *)&status_code->dh6_status_code;
2379 signed short offset_len = new_len - (orignal_len - 2);
2380 status_code->dh6_status_len = htons(new_len + 2);
2381 memmove(ptr+new_len+2, ptr+orignal_len, dhcp6_len - (ptr+orignal_len -(u_int8_t*)dhcp6_hdr));
2382 strncpy((char*)(ptr+2), NotOnLink, new_len); // <===== 挂在这儿
2383 status_code->dh6_status_code = DH6OPT_STCODE_NOTONLINK;
2384 dhcp6_len += offset_len;
ptr 指向的其实是一个预分配好的全局 buffer(parp_lc_nsi_pkt_pool),用来构造要发出去的报文;status_code 则是这个报文里 DHCPv6 的一个 option 头。
buffer 大小有 2K,ptr 距离 buffer 头才 300 多字节,逻辑上来说长度完全足够,而且 __src 和 __dest 都没有被破坏,
但偏偏 strncpy 的检查就是没有通过。
看一下 strncpy 的代码:
91 __fortify_function char *
92 __NTH (strncpy (char *__restrict __dest, const char *__restrict __src,
93 size_t __len))
94 {
95 return __builtin___strncpy_chk (__dest, __src, __len,
96 __glibc_objsize (__dest));
97 }
检查的是 __dest(也就是 ptr+2)所指对象的剩余长度,想不明白:
按上面的分析,ptr 后面剩下的缓存绝对够用。
后面算是灵光乍现:
因为 ptr 初始化的时候指向的是 status_code 中一个成员的地址,strncpy 检查长度的时候,是不是按照 status_code 的大小来检查了?
于是写了一段验证代码:
942
943 uint8_t zyc_buf[1024];
944
945 int zyc_dbg()
946 {
947
948 struct dhcp6_status_info *dhcp6 = (struct dhcp6_status_info *)&zyc_buf[100];
949 char src[128] = "Some of the addresses are not on link.";
950 uint8_t *ptr;
951 dhcp6->dh6_status_type = 10;
952 dhcp6->dh6_status_len = 12;
953 dhcp6->dh6_status_code = 16;
954 #if 1
955 ptr = (uint8_t *)&dhcp6->dh6_status_code; // ①
956 strncpy((char *)(ptr + 2), src, strlen(src));
957 printf("[%s():%d] (%s)\n", __func__, __LINE__, (char *)(ptr + 2));
958 #else
959 ptr = (uint8_t *)(dhcp6 + 1); // ②
960 strncpy((char *)ptr, src, strlen(src));
961 printf("[%s():%d] [%s]\n", __func__, __LINE__, (char *)(ptr));
962 #endif
963
964 return 0;
965 }
用 955 行的第①种方式来初始化,就复现了这个 core;
用 959 行的第②种方式来初始化,进程就能顺利跑下去。
极其吊诡。
后来在我自己的虚拟机上跑了一下上面的验证代码,欸,两种初始化方式都没问题。
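Google 了一下发现,__glibc_objsize 的结果跟编译器和编译选项有关:它最终展开成 __builtin_object_size(ptr, type)(_FORTIFY_SOURCE=3 时是 __builtin_dynamic_object_size),其中 _FORTIFY_SOURCE >= 2 时 type 为 1,表示按"最近的外层子对象"来计算剩余大小。
第①种写法里 ptr 取的是成员 dh6_status_code 的地址,编译器认定的子对象就是这个成员本身(从 ptr+2 的用法看应该只有 2 字节),于是 ptr+2 处的剩余空间被算成 0,拷贝任何长度都会触发 __chk_fail;第②种写法的指针不是从某个成员推导出来的,所以不会被限制在成员大小之内。
如果编译器推导不出对象大小,或者没有开优化 / 没有开 _FORTIFY_SOURCE,__builtin_object_size 返回 (size_t)-1,相当于不做检查——我的虚拟机上两种写法都没问题,很可能就是这个原因(待确认两边的编译选项)。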
问了下别的同事,触发场景是这样的:用 Linux 做 CPE,在这台 Linux CPE 上用 "ifconfig xxx down" 把端口 down 掉的时候,(down 掉之前)CPE 会发 DHCPv6 报文通知 DHCP server 释放这个 interface 上的 IP,
然后 DHCP server 会回复一个 DHCPv6 报文,表示"OK,已经释放了"。现在出问题的地方,就是 parp 对 DHCP server 回复的这个报文做改动的时候。