一、现象
昨天运维人员被告知,在升级完客户集群环境后,访问管理页面偶尔会报 502 Bad Gateway。登录客户环境,发现只要请求分发到 node2,就会报 502,开始解决问题...
二、排查思路
1、看到 502,第一时间想到的应该是 php-fpm 出问题了。先看下 nginx 日志,日志显示连接被对端关闭,说明 php-fpm 进程提前退出了,那是什么原因导致 php-fpm 退出的呢?
2、查看 php-fpm 日志,有很多 warning 级别日志,而且都是进程重复地退出、重建。网上有很多文章说调整 php-fpm 执行时间之类的参数,但对 502 作用不大(如果报的是 504,才可以考虑适当调整执行时间参数)。为了方便下一步用 strace 追踪,先把 php-fpm 的 worker 进程数限制为 1 个:修改为如下配置,保存并重启 php-fpm
pm.max_children = 1
pm.start_servers = 1
pm.min_spare_servers = 1
pm.max_spare_servers = 1
3、使用 strace 命令追踪 php-fpm worker 进程
[mlcdm@node2 ~]$ sudo strace -p 9277 strace: Process 9277 attached accept(5, {sa_family=AF_INET, sin_port=htons(46636), sin_addr=inet_addr("127.0.0.1")}, [112->16]) = 3 poll([{fd=3, events=POLLIN}], 1, 5000) = 1 ([{fd=3, revents=POLLIN}]) times({tms_utime=0, tms_stime=0, tms_cutime=0, tms_cstime=0}) = 8082094440 read(3, "\1\1\0\1\0\10\0\0", 8) = 8 read(3, "\0\1\0\0\0\0\0\0", 8) = 8 read(3, "\1\4\0\1\3a\7\0", 8) = 8 read(3, "\0174SCRIPT_FILENAME/var/www/mlclou"..., 872) = 872 read(3, "\1\4\0\1\0\0\0\0", 8) = 8 lstat("/var/www/mlcloud/public//index.php/api/appfile/pluto", 0x7ffd136a20b0) = -1 ENOTDIR (Not a directory) stat("/var/www/mlcloud/public//index.php/api/appfile", 0x7ffd136a44a0) = -1 ENOTDIR (Not a directory) stat("/var/www/mlcloud/public//index.php/api", 0x7ffd136a44a0) = -1 ENOTDIR (Not a directory) stat("/var/www/mlcloud/public//index.php", {st_mode=S_IFREG|0666, st_size=1604, ...}) = 0 read(3, "\1\5\0\1\0\275\3\0", 8) = 8 read(3, "method=dbf.task.offlinetimeoutta"..., 189) = 189 read(3, "\0\0\0", 3) = 3 stat("/var/www/mlcloud/public/.user.ini", 0x7ffd136a2fc0) = -1 ENOENT (No such file or directory) rt_sigaction(SIGPROF, NULL, {sa_handler=SIG_DFL, sa_mask=[], sa_flags=0}, 8) = 0 rt_sigaction(SIGPROF, {sa_handler=0x9d26a0, sa_mask=~[ILL TRAP ABRT BUS FPE KILL SEGV CONT STOP TSTP TTIN TTOU SYS RTMIN RT_1], sa_flags=SA_RESTORER|SA_SIGINFO, sa_restorer=0x7fc5c4ae3630}, NULL, 8) = 0 rt_sigaction(SIGHUP, NULL, {sa_handler=SIG_DFL, sa_mask=[], sa_flags=0}, 8) = 0 rt_sigaction(SIGHUP, {sa_handler=0x9d26a0, sa_mask=~[ILL TRAP ABRT BUS FPE KILL SEGV CONT STOP TSTP TTIN TTOU SYS RTMIN RT_1], sa_flags=SA_RESTORER|SA_SIGINFO, sa_restorer=0x7fc5c4ae3630}, NULL, 8) = 0 rt_sigaction(SIGINT, NULL, {sa_handler=SIG_DFL, sa_mask=[], sa_flags=SA_RESTORER, sa_restorer=0x7fc5c4ae3630}, 8) = 0 rt_sigaction(SIGINT, {sa_handler=0x9d26a0, sa_mask=~[ILL TRAP ABRT BUS FPE KILL SEGV CONT STOP TSTP TTIN TTOU SYS RTMIN RT_1], sa_flags=SA_RESTORER|SA_SIGINFO, 
sa_restorer=0x7fc5c4ae3630}, NULL, 8) = 0 rt_sigaction(SIGQUIT, NULL, {sa_handler=0xa3ac20, sa_mask=[], sa_flags=SA_RESTORER|SA_RESTART, sa_restorer=0x7fc5c4ae3630}, 8) = 0 rt_sigaction(SIGQUIT, {sa_handler=0x9d26a0, sa_mask=~[ILL TRAP ABRT BUS FPE KILL SEGV CONT STOP TSTP TTIN TTOU SYS RTMIN RT_1], sa_flags=SA_RESTORER|SA_SIGINFO, sa_restorer=0x7fc5c4ae3630}, NULL, 8) = 0 rt_sigaction(SIGTERM, NULL, {sa_handler=SIG_DFL, sa_mask=[], sa_flags=SA_RESTORER, sa_restorer=0x7fc5c4ae3630}, 8) = 0 rt_sigaction(SIGTERM, {sa_handler=0x9d26a0, sa_mask=~[ILL TRAP ABRT BUS FPE KILL SEGV CONT STOP TSTP TTIN TTOU SYS RTMIN RT_1], sa_flags=SA_RESTORER|SA_SIGINFO, sa_restorer=0x7fc5c4ae3630}, NULL, 8) = 0 rt_sigaction(SIGUSR1, NULL, {sa_handler=SIG_DFL, sa_mask=[], sa_flags=SA_RESTORER, sa_restorer=0x7fc5c4ae3630}, 8) = 0 rt_sigaction(SIGUSR1, {sa_handler=0x9d26a0, sa_mask=~[ILL TRAP ABRT BUS FPE KILL SEGV CONT STOP TSTP TTIN TTOU SYS RTMIN RT_1], sa_flags=SA_RESTORER|SA_SIGINFO, sa_restorer=0x7fc5c4ae3630}, NULL, 8) = 0 rt_sigaction(SIGUSR2, NULL, {sa_handler=SIG_DFL, sa_mask=[], sa_flags=SA_RESTORER, sa_restorer=0x7fc5c4ae3630}, 8) = 0 rt_sigaction(SIGUSR2, {sa_handler=0x9d26a0, sa_mask=~[ILL TRAP ABRT BUS FPE KILL SEGV CONT STOP TSTP TTIN TTOU SYS RTMIN RT_1], sa_flags=SA_RESTORER|SA_SIGINFO, sa_restorer=0x7fc5c4ae3630}, NULL, 8) = 0 setitimer(ITIMER_PROF, {it_interval={tv_sec=0, tv_usec=0}, it_value={tv_sec=60, tv_usec=0}}, NULL) = 0 rt_sigaction(SIGPROF, {sa_handler=0x9d26a0, sa_mask=~[ILL TRAP ABRT BUS FPE KILL SEGV CONT STOP TSTP TTIN TTOU SYS RTMIN RT_1], sa_flags=SA_RESTORER|SA_SIGINFO, sa_restorer=0x7fc5c4ae3630}, NULL, 8) = 0 rt_sigprocmask(SIG_UNBLOCK, [PROF], NULL, 8) = 0 lstat("/var/www/mlcloud/public//index.php", {st_mode=S_IFREG|0666, st_size=1604, ...}) = 0 lstat("/var/www/mlcloud/public", {st_mode=S_IFDIR|0777, st_size=4096, ...}) = 0 lstat("/var/www/mlcloud", {st_mode=S_IFDIR|0775, st_size=4096, ...}) = 0 lstat("/var/www", {st_mode=S_IFDIR|0775, st_size=4096, 
...}) = 0 lstat("/var", {st_mode=S_IFDIR|0755, st_size=4096, ...}) = 0 open("/var/www/mlcloud/public/index.php", O_RDONLY) = 4 fstat(4, {st_mode=S_IFREG|0666, st_size=1604, ...}) = 0 getcwd("/home/mlcdm", 4095) = 12 chdir("/var/www/mlcloud/public/") = 0 setitimer(ITIMER_PROF, {it_interval={tv_sec=0, tv_usec=0}, it_value={tv_sec=30, tv_usec=0}}, NULL) = 0 stat("/etc/sysconfig/64bit_strstr_via_64bit_strstr_sse2_unaligned", 0x7ffd136a1470) = -1 ENOENT (No such file or directory) open("/var/www/mlcloud/public//index.php", O_RDONLY) = 6 fstat(6, {st_mode=S_IFREG|0666, st_size=1604, ...}) = 0 fstat(6, {st_mode=S_IFREG|0666, st_size=1604, ...}) = 0 mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7fc5c5145000 read(6, "<?php\n\n// Valid PHP Version?\n$mi"..., 4096) = 1604 close(6) = 0 munmap(0x7fc5c5145000, 4096) = 0 stat("/tmp", {st_mode=S_IFDIR|0700, st_size=61440, ...}) = 0 open("/tmp/tmpftRfMVP", O_RDWR|O_CREAT|O_EXCL, 0600) = -1 EACCES (Permission denied) --- SIGSEGV {si_signo=SIGSEGV, si_code=SEGV_MAPERR, si_addr=NULL} --- +++ killed by SIGSEGV +++
找到问题了:php-fpm 在 /tmp/ 目录创建临时文件时返回 EACCES(Permission denied),随后进程因 SIGSEGV 崩溃退出,导致页面响应 502
4、看下 /tmp/ 目录的属主及权限,OMG,运维部署时忘记修改 node2 的 /tmp/ 目录权限了:目录属主是 mlcdm 且权限为 0700,而 php-fpm 进程属主是 www,因此无法在此目录写文件
[mlcdm@node2 ~]$ stat /tmp/ File: ‘/tmp/’ Size: 61440 Blocks: 128 IO Block: 4096 directory Device: fd01h/64769d Inode: 524291 Links: 12 Access: (0700/drwx------) Uid: ( 1003/ mlcdm) Gid: ( 1003/ mlcdm) Access: 2024-07-24 15:00:24.774052000 +0800 Modify: 2024-07-24 15:16:39.705052000 +0800 Change: 2024-07-24 15:16:39.705052000 +0800 Birth: -
5、修改 /tmp/ 目录权限。这里之所以设置为 1777,是因为需要给 /tmp/ 加上粘滞位(sticky bit):任何用户都可以在 /tmp/ 目录创建文件,但文件只能由文件属主、目录属主或 root 删除,防止文件被其他用户意外删除导致服务异常
[mlcdm@node2 ~]$ sudo chmod 1777 /tmp/
三、总结
1、当页面响应 502 时,多半是网关后面的程序(CGI/FastCGI,本例中为 php-fpm)异常,比如服务未启动、异常退出等,可着重排查这类程序
2、strace 命令功能强大,对了解进程具体做了哪些系统调用很有帮助
3、Linux 用户权限是遇到问题时首先应该想到的一个排查点,可以参考《鸟哥的 Linux 私房菜》中用户权限相关章节
标签:rt,PHP,st,Bad,sa,NULL,502,SA,php From: https://www.cnblogs.com/gentsir/p/18325576