eBPF实战之超时问题排查
问题描述
线上A模块访问B模块,A模块部署在物理机,B模块部署在容器上。早晚高峰A访问B会出现超时报警。
问题排查
业务方找到我们排查,我首先申请了监控中提示超时的两台机器权限,看一下超时规律 登录10.167.8.25容器所在的宿主机常规排查
dmesg显示如下,联系值班同学和系统部同学硬件报修。
EDAC sbridge MC1: HANDLING MCE MEMORY ERROR
漂移容器10.167.8.252后,10.167.41.18 在两台物理机上小时级别日志中会出现近10次
申请节点权限,全量统计超时数据(单机统计top10数据,汇总后再top10)
问题指向10.167.41.18 这个容器。在之前排查过程中,早晚高峰期设置了定时任务,会ping问题容器10.167.8.252和10.167.41.18 ,检查log发现10.167.41.18容器有严重延时跳变。 为了排查是否由网络链路原因,增加2个脚本,ping容器所在物理机10.89.90.53 ,ping物理机同网段机器 10.89.90.52 进行对比。 最终发现问题容器所在物理机10.89.90.53存在延时跳变,而同网段的机器10.89.90.52没有出现跳变,网络组同学也反馈交换机层没有发现延迟常见的报错。
借鉴debugging-network-stalls-on-kubernetes的排查思路和手段,首先准备了目标机器上的bcc脚本
0#!/usr/bin/python3
1bpf_text = """
2#include <linux/ptrace.h>
3#include <linux/sched.h> /* For TASK_COMM_LEN */
4#include <linux/icmp.h>
5#include <linux/netdevice.h>
6struct probe_icmp_data_t
7{
8 u64 timestamp_ns;
9 u32 tgid;
10 u32 pid;
11 char comm[TASK_COMM_LEN];
12 int v0;
13};
14BPF_PERF_OUTPUT(probe_icmp_events);
15static inline unsigned char *my_skb_transport_header(const struct sk_buff *skb)
16{
17 return skb->head + skb->transport_header;
18}
19static inline struct icmphdr *my_icmp_hdr(const struct sk_buff *skb)
20{
21 return (struct icmphdr *)my_skb_transport_header(skb);
22}
23int probe_icmp(struct pt_regs *ctx, struct sk_buff *skb)
24{
25 void *addr = NULL;
26 u64 __pid_tgid = bpf_get_current_pid_tgid();
27 u32 __tgid = __pid_tgid >> 32;
28 u32 __pid = __pid_tgid; // implicit cast to u32 for bottom half
29
30 struct probe_icmp_data_t __data = {0};
31 __data.timestamp_ns = bpf_ktime_get_ns();
32 __data.tgid = __tgid;
33 __data.pid = __pid;
34 bpf_get_current_comm(&__data.comm, sizeof(__data.comm));
35 __be16 seq;
36 addr = &my_icmp_hdr(skb)->un.echo.sequence;
37 bpf_probe_read(&seq, sizeof(seq), addr);
38 __data.v0 = (int)seq;
39 probe_icmp_events.perf_submit(ctx, &__data, sizeof(__data));
40 return 0;
41}
42"""
43
44from bcc import BPF
45
46b = BPF(text=bpf_text)
47b.attach_kprobe(event="icmp_echo", fn_name="probe_icmp")
48
49print("%-7s %-7s %-15s %s" % ('TGID', 'PID', 'PROCESS NAME', 'ICMP_SEQ') )
50
51def print_icmp_event(cpu, data, size):
52 event = b["probe_icmp_events"].event(data)
53 # print(event)
54 print("%-7d %-7d %-15s %s" %
55 (event.tgid, event.pid,
56 event.comm.decode('utf-8', 'replace'),
57 event.v0))
58
59b["probe_icmp_events"].open_perf_buffer(print_icmp_event)
60while 1:
61 try:
62 b.perf_buffer_poll()
63 except KeyboardInterrupt:
64 exit()
hping3 10.89.90.53 --icmp -i u10000 |grep 'rtt=1.* ms'
目标机器上执行bcc脚本
汇总源和目标主机上的数据
0#!/bin/sh
1while read line ;
2do
3 seq=$(echo $line | awk '{print $5}' | sed 's/icmp_seq=//g');
4 cost=$(echo $line | awk '{print $6}' | sed 's/rtt=//g');
5 bcc=$(grep " ${seq}$" trace.log);
6 echo -e "${bcc}\t**${cost} ms";
7done < icmp.log
对比相同序号的包 发现目标机器上正在执行ksoftirqd 。 汇总数据对比,最后一列为源机器上的耗时
至此已经可以发现是23号cpu 带来的延迟,以下分析是尝试分析延迟的具体原因
问题机器上perf抓下23号cpu的情况
perf record -C 23 -g -F 999
再看调用栈情况
perf script 2>/dev/null | ./FlameGraph/stackcollapse-perf-ordered.pl | grep ksoftir -B 100
火焰图
perf script 2>/dev/null | FlameGraph/stackcollapse-perf.pl out.perf | FlameGraph/flamegraph.pl > perf.svg
后来得知内核组钱君
同学也在分析ksoftirq的延迟问题,钱君
同学建议分析softirq的耗时,所以又搞了两个脚本计算softirq entry和exit的时间差
0#!/bin/bash
1trap 'end;exit' SIGINT
2
3# 脚本会将指定cpu的softirq_entry 和 softirq_exist事件写入到文件
4# CTRL-C 会自动关闭events trace
5
6if [ $# -ne 1 ];then
7 echo "input cpu index, eg. 023 short for cpu#[023]"
8 exit 1
9fi
10
11function init(){
12 echo 1 > /sys/kernel/debug/tracing/events/irq/softirq_entry/enable
13 echo 1 > /sys/kernel/debug/tracing/events/irq/softirq_exit/enable
14
15}
16
17function task(){
18 cat /sys/kernel/debug/tracing/trace_pipe |grep "\[$1\]" | tee trace.log;
19}
20
21function end(){
22 echo 0 > /sys/kernel/debug/tracing/events/irq/softirq_entry/enable
23 echo 0 > /sys/kernel/debug/tracing/events/irq/softirq_exit/enable
24}
25
26function main(){
27 init
28 task $1
29 end
30}
31main $@
0// go run trace.go stack.go -file=xxxx.log
1// 2020.09.21 update
2// 分析上面shell脚本产生的trace文件,通过-file指定绝对路径
3package main
4
5import (
6 "flag"
7 "fmt"
8 "io/ioutil"
9 "os"
10 "sort"
11 "strconv"
12 "strings"
13 "time"
14)
15
16type (
17 event struct {
18 task string
19 cpu string
20 flag string
21 ts float64
22 timestamp time.Time
23 function string
24 vec string
25 action string
26 line string
27 match int
28 }
29
30 obj struct {
31 value float64
32 line string
33 }
34 events []event
35 objs []obj
36)
37
38func (e events) Len() int {
39 return len(e)
40}
41
42func (e events) Less(i, j int) bool {
43 return e[i].ts < e[i].ts
44}
45
46func (e events) Swap(i, j int) {
47 e[i], e[j] = e[j], e[i]
48}
49
50func (ox objs) Len() int {
51 return len(ox)
52}
53func (ox objs) Less(i, j int) bool {
54 if ox[i].value < ox[j].value {
55 return true
56 }
57 return false
58}
59
60func (ox objs) Swap(i, j int) {
61 ox[i], ox[j] = ox[j], ox[i]
62}
63
64var (
65 file string
66)
67
68func init() {
69 flag.StringVar(&file, "file", "/sys/kernerl/debug/tracing/trace", "-file=xxxx.xxx")
70}
71
72func main() {
73 flag.Parse()
74 // 分割 event 数据
75 data, err := ioutil.ReadFile(file)
76 if err != nil {
77 panic(err)
78 }
79 lines := strings.Split(string(data), "\n")
80 traces := make([]event, 0, len(lines))
81
82 for _, line := range lines {
83 ss := strings.Fields(line)
84 if len(ss) != 7 {
85 fmt.Fprintf(os.Stderr, "%s format invalid\n", line)
86 continue
87 }
88
89 ts, err := strconv.ParseFloat(strings.TrimRight(ss[3], ":"), 10)
90 if err != nil {
91 fmt.Fprintf(os.Stderr, "line %s [%s] time format invalid", line, ss[3])
92 continue
93 }
94 // 0 task-pid
95 // 1 cpu#
96 // 2 flag
97 // 3 timestamp
98 // 4 function
99 // 5 vec
100 // 6 action
101 event := event{
102 task: ss[0],
103 cpu: ss[1],
104 flag: ss[2],
105 ts: ts,
106 function: strings.TrimRight(ss[4], ":"),
107 vec: ss[5],
108 action: ss[6],
109 line: line,
110 match: 0,
111 }
112 traces = append(traces, event)
113 }
114 // 按照event的timestamp 排序先
115 sort.Sort(events(traces))
116
117 if len(traces) > 0 && strings.HasSuffix(traces[0].function, "_exit") {
118 traces = traces[1:]
119 }
120
121 i := 0
122 objects := make([]obj, 0, len(traces)/2)
123 for idx, tr := range traces {
124 if strings.HasSuffix(tr.function, "_exit") {
125 i = idx - 1
126 for i >= 0 {
127 if traces[i].match > 0 || strings.HasSuffix(traces[i].function, "_exit") {
128 // 已匹配的忽略
129 i--
130 continue
131 }
132 if traces[i].cpu == tr.cpu && traces[i].vec == tr.vec && traces[i].action == tr.action {
133 // 完成匹配
134 cost := tr.ts - traces[i].ts
135 ll := fmt.Sprintf("%s\t\n\t%*f s\n%s\t\n", traces[i].line, len(traces[i].line),
136 cost, tr.line)
137 ll += fmt.Sprintf("%s\n", "----------------------------------------")
138 o := obj{
139 value: cost,
140 line: ll,
141 }
142 objects = append(objects, o)
143 traces[i].match = 1
144 traces[idx].match = 1
145 break
146 }
147 i--
148 }
149 if i < 0 {
150 traces[idx].match = 2
151 fmt.Fprintf(os.Stderr, "%s has no match event\n", tr.line)
152 continue
153 }
154 }
155 }
156 // 按照耗时排序
157 sort.Sort(objs(objects))
158 fmt.Printf("%d objects sort finished\n", len(objects))
159 for i := range objects {
160 fmt.Println(objects[i].line)
161 }
162}
问题结论
超时报警由物理机10.89.90.53引起,漂移容器10.167.41.18后,超时报警消失
物理机10.89.90.53上,超时跟ksoftirqd有关,细节还在分析中,补基础ing
致谢
感谢内核组亚方
、钱君
的大力支持和指导,感谢网络组同学范斌
的支持和建议