you are better than you think

eBPF实战之超时问题排查

· by thur · Read in about 5 min · (954 Words)
bcc ebpf ksoftirq perf

问题描述

线上A模块访问B模块,A模块部署在物理机,B模块部署在容器上。早晚高峰A访问B会出现超时报警。

问题排查

业务方找到我们排查,我首先申请了监控中提示超时的两台机器权限,看一下超时规律 timeout 登录10.167.8.25容器所在的宿主机常规排查

dmesg显示如下,联系值班同学和系统部同学硬件报修。

EDAC sbridge MC1: HANDLING MCE MEMORY ERROR

漂移容器10.167.8.252后,10.167.41.18 在两台物理机上小时级别日志中会出现近10次

timeout

申请节点权限,全量统计超时数据(单机统计top10数据,汇总后再top10) timeout

问题指向10.167.41.18 这个容器。在之前排查过程中,早晚高峰期设置了定时任务,会ping问题容器10.167.8.252和10.167.41.18 ,检查log发现10.167.41.18容器有严重延时跳变。 timeout 为了排查是否由网络链路原因,增加2个脚本,ping容器所在物理机10.89.90.53 ,ping物理机同网段机器 10.89.90.52 进行对比。 最终发现问题容器所在物理机10.89.90.53存在延时跳变,而同网段的机器10.89.90.52没有出现跳变,网络组同学也反馈交换机层没有发现延迟常见的报错。 timeout

借鉴debugging-network-stalls-on-kubernetes的排查思路和手段,首先准备了目标机器上的bcc脚本

 0#!/usr/bin/python3
 1bpf_text = """
 2#include <linux/ptrace.h>
 3#include <linux/sched.h>        /* For TASK_COMM_LEN */
 4#include <linux/icmp.h>
 5#include <linux/netdevice.h>
 6struct probe_icmp_data_t
 7{
 8        u64 timestamp_ns;
 9        u32 tgid;
10        u32 pid;
11        char comm[TASK_COMM_LEN];
12        int v0;
13};
14BPF_PERF_OUTPUT(probe_icmp_events);
15static inline unsigned char *my_skb_transport_header(const struct sk_buff *skb)
16{
17    return skb->head + skb->transport_header;
18}
19static inline struct icmphdr *my_icmp_hdr(const struct sk_buff *skb)
20{
21    return (struct icmphdr *)my_skb_transport_header(skb);
22}
23int probe_icmp(struct pt_regs *ctx, struct sk_buff *skb)
24{
25        void *addr = NULL;
26        u64 __pid_tgid = bpf_get_current_pid_tgid();
27        u32 __tgid = __pid_tgid >> 32;
28        u32 __pid = __pid_tgid; // implicit cast to u32 for bottom half
29 
30        struct probe_icmp_data_t __data = {0};
31        __data.timestamp_ns = bpf_ktime_get_ns();
32        __data.tgid = __tgid;
33        __data.pid = __pid;
34        bpf_get_current_comm(&__data.comm, sizeof(__data.comm));
35        __be16 seq;
36        addr = &my_icmp_hdr(skb)->un.echo.sequence;
37        bpf_probe_read(&seq, sizeof(seq), addr);
38        __data.v0 = (int)seq;
39        probe_icmp_events.perf_submit(ctx, &__data, sizeof(__data));
40        return 0;
41}
42"""
43 
44from bcc import BPF
45 
46b = BPF(text=bpf_text)
47b.attach_kprobe(event="icmp_echo", fn_name="probe_icmp")
48 
49print("%-7s %-7s %-15s %s" % ('TGID', 'PID', 'PROCESS NAME', 'ICMP_SEQ') )
50 
51def print_icmp_event(cpu, data, size):
52    event = b["probe_icmp_events"].event(data)
53    # print(event)
54    print("%-7d %-7d %-15s %s" %
55                      (event.tgid, event.pid,
56                       event.comm.decode('utf-8', 'replace'),
57                       event.v0))
58 
59b["probe_icmp_events"].open_perf_buffer(print_icmp_event)
60while 1:
61    try:
62         b.perf_buffer_poll()
63    except KeyboardInterrupt:
64        exit()
源机器上也准备好ping 目标主机. 源机器(10.89.46.61)上每10ms发送一次ping

hping3 10.89.90.53 --icmp -i u10000 |grep 'rtt=1.* ms'

hping3.png 目标机器上执行bcc脚本 bcc.png

汇总源和目标主机上的数据

0#!/bin/sh
1while read line ;
2do
3	seq=$(echo $line | awk '{print $5}' | sed 's/icmp_seq=//g');
4	cost=$(echo $line | awk '{print $6}' | sed 's/rtt=//g');
5	bcc=$(grep " ${seq}$" trace.log);
6	echo -e "${bcc}\t**${cost} ms";
7done < icmp.log

对比相同序号的包 发现目标机器上正在执行ksoftirqd 。 汇总数据对比,最后一列为源机器上的耗时 icmp_trace.png


至此已经可以发现是23号cpu 带来的延迟,以下分析是尝试分析延迟的具体原因

问题机器上perf抓下23号cpu的情况 perf record -C 23 -g -F 999

再看调用栈情况 perf script 2>/dev/null | ./FlameGraph/stackcollapse-perf-ordered.pl | grep ksoftir -B 100 perf_stack.png

火焰图 perf script 2>/dev/null | FlameGraph/stackcollapse-perf.pl out.perf | FlameGraph/flamegraph.pl > perf.svg perf_stack.png

后来得知内核组钱君同学也在分析ksoftirq的延迟问题,钱君同学建议分析softirq的耗时,所以又搞了两个脚本计算softirq entry和exit的时间差

 0#!/bin/bash
 1trap 'end;exit' SIGINT
 2
 3# 脚本会将指定cpu的softirq_entry 和 softirq_exist事件写入到文件
 4# CTRL-C 会自动关闭events trace  
 5 
 6if [ $# -ne 1 ];then
 7    echo "input cpu index, eg. 023 short for cpu#[023]"
 8    exit 1
 9fi
10 
11function init(){
12    echo 1 > /sys/kernel/debug/tracing/events/irq/softirq_entry/enable
13    echo 1 > /sys/kernel/debug/tracing/events/irq/softirq_exit/enable
14 
15}
16 
17function task(){
18    cat /sys/kernel/debug/tracing/trace_pipe |grep "\[$1\]" | tee trace.log;
19}
20 
21function end(){
22    echo 0 > /sys/kernel/debug/tracing/events/irq/softirq_entry/enable
23    echo 0 > /sys/kernel/debug/tracing/events/irq/softirq_exit/enable
24}
25 
26function main(){
27    init
28    task $1
29    end
30}
31main $@

  0// go run trace.go stack.go -file=xxxx.log
  1// 2020.09.21 update
  2// 分析上面shell脚本产生的trace文件,通过-file指定绝对路径
  3package main
  4
  5import (
  6	"flag"
  7	"fmt"
  8	"io/ioutil"
  9	"os"
 10	"sort"
 11	"strconv"
 12	"strings"
 13	"time"
 14)
 15
 16type (
 17	event struct {
 18		task      string
 19		cpu       string
 20		flag      string
 21		ts        float64
 22		timestamp time.Time
 23		function  string
 24		vec       string
 25		action    string
 26		line      string
 27		match     int
 28	}
 29
 30	obj struct {
 31		value float64
 32		line  string
 33	}
 34	events []event
 35	objs   []obj
 36)
 37
 38func (e events) Len() int {
 39	return len(e)
 40}
 41
 42func (e events) Less(i, j int) bool {
 43	return e[i].ts < e[i].ts
 44}
 45
 46func (e events) Swap(i, j int) {
 47	e[i], e[j] = e[j], e[i]
 48}
 49
 50func (ox objs) Len() int {
 51	return len(ox)
 52}
 53func (ox objs) Less(i, j int) bool {
 54	if ox[i].value < ox[j].value {
 55		return true
 56	}
 57	return false
 58}
 59
 60func (ox objs) Swap(i, j int) {
 61	ox[i], ox[j] = ox[j], ox[i]
 62}
 63
 64var (
 65	file string
 66)
 67
 68func init() {
 69	flag.StringVar(&file, "file", "/sys/kernerl/debug/tracing/trace", "-file=xxxx.xxx")
 70}
 71
 72func main() {
 73	flag.Parse()
 74	// 分割 event 数据
 75	data, err := ioutil.ReadFile(file)
 76	if err != nil {
 77		panic(err)
 78	}
 79	lines := strings.Split(string(data), "\n")
 80	traces := make([]event, 0, len(lines))
 81
 82	for _, line := range lines {
 83		ss := strings.Fields(line)
 84		if len(ss) != 7 {
 85			fmt.Fprintf(os.Stderr, "%s format invalid\n", line)
 86			continue
 87		}
 88
 89		ts, err := strconv.ParseFloat(strings.TrimRight(ss[3], ":"), 10)
 90		if err != nil {
 91			fmt.Fprintf(os.Stderr, "line %s [%s] time format invalid", line, ss[3])
 92			continue
 93		}
 94		// 0 task-pid
 95		// 1 cpu#
 96		// 2 flag
 97		// 3 timestamp
 98		// 4 function
 99		// 5 vec
100		// 6 action
101		event := event{
102			task:     ss[0],
103			cpu:      ss[1],
104			flag:     ss[2],
105			ts:       ts,
106			function: strings.TrimRight(ss[4], ":"),
107			vec:      ss[5],
108			action:   ss[6],
109			line:     line,
110			match:    0,
111		}
112		traces = append(traces, event)
113	}
114	// 按照event的timestamp 排序先
115	sort.Sort(events(traces))
116
117	if len(traces) > 0 && strings.HasSuffix(traces[0].function, "_exit") {
118		traces = traces[1:]
119	}
120
121	i := 0
122	objects := make([]obj, 0, len(traces)/2)
123	for idx, tr := range traces {
124		if strings.HasSuffix(tr.function, "_exit") {
125			i = idx - 1
126			for i >= 0 {
127				if traces[i].match > 0 || strings.HasSuffix(traces[i].function, "_exit") {
128					// 已匹配的忽略
129					i--
130					continue
131				}
132				if traces[i].cpu == tr.cpu && traces[i].vec == tr.vec && traces[i].action == tr.action {
133					// 完成匹配
134					cost := tr.ts - traces[i].ts
135					ll := fmt.Sprintf("%s\t\n\t%*f s\n%s\t\n", traces[i].line, len(traces[i].line),
136						cost, tr.line)
137					ll += fmt.Sprintf("%s\n", "----------------------------------------")
138					o := obj{
139						value: cost,
140						line:  ll,
141					}
142					objects = append(objects, o)
143					traces[i].match = 1
144					traces[idx].match = 1
145					break
146				}
147				i--
148			}
149			if i < 0 {
150				traces[idx].match = 2
151				fmt.Fprintf(os.Stderr, "%s has no match event\n", tr.line)
152				continue
153			}
154		}
155	}
156	// 按照耗时排序
157	sort.Sort(objs(objects))
158	fmt.Printf("%d objects sort finished\n", len(objects))
159	for i := range objects {
160		fmt.Println(objects[i].line)
161	}
162}
softirq entry 和 sortirq 时间差 softirq_cost.png

问题结论

  1. 超时报警由物理机10.89.90.53引起,漂移容器10.167.41.18后,超时报警消失

  2. 物理机10.89.90.53上,超时跟ksoftirqd有关,细节还在分析中,补基础ing

致谢

感谢内核组亚方钱君的大力支持和指导,感谢网络组同学范斌的支持和建议

版权所有,转载请注明作者和出处

Comments