you are better than you think

snmp插件采集超时问题

· by thur · Read in about 4 min · (707 Words)
snmp timeout

一 背景

客户反馈用categrafsnmp插件采集交换机出现数据断点,检查了categraf日志,发现里面有大量超时

categraf: 2023/09/22 08:45:16 instances.go:180: agent udp://1.2.3.4:161 ins: gathering table bgp error: performing bulk walk for field peer_addr: request timeout (after 0 retries)
...

手工用snmpwalk命令跑是正常的。作为对比,准备了一个脚本如下, 每1分钟请求一遍用户配置的oid。看categraf报超时的时刻,脚本是否也超时。最终发现categraf日志中出现超时,脚本正常采集。

#!/bin/sh

while true
do
    cat oids   | while read oid
    do
         echo "$(date) oid: ${oid}"
         snmpwalk -v2c -c public  1.2.3.4   ${oid}
    done >>debug.log 2>&1
sleep 60s
done

之前社区也提了一个issue ,反馈v0.3.27版本采集snmp出现断点,v0.3.25 却没有问题。 https://github.com/flashcatcloud/categraf/issues/639

对比v0.3.25到v0.3.27 的代码,v0.3.27 给snmp 增加了一个snmp_up 指标。 代码见 https://github.com/flashcatcloud/categraf/pull/618/files。 为了快速上报up指标,将snmp_up和其他oid放到了两个goroutine中。

categraf底层用的gosnmp做数据收发和解析。主要逻辑

	if uconn, ok := x.Conn.(net.PacketConn); ok && x.uaddr != nil {
			_, err = uconn.WriteTo(outBuf, x.uaddr)
		} else {
			_, err = x.Conn.Write(outBuf)
		}
		if err != nil {
			continue
		}
		if x.OnSent != nil {
			x.OnSent(x)
		}

		// all sends wait for the return packet, except for SNMPv2Trap
		if !wait {
			return &SnmpPacket{}, nil
		}

	waitingResponse:
    	for {
			x.Logger.Print("WAITING RESPONSE...")
			// Receive response and try receiving again on any decoding error.
			// Let the deadline abort us if we don't receive a valid response.

			var resp []byte
			resp, err = x.receive()
			if err == io.EOF && strings.HasPrefix(x.Transport, tcp) {
				// EOF on TCP: reconnect and retry. Do not count
				// as retry as socket was broken
				x.Logger.Printf("ERROR: EOF. Performing reconnect")
				err = x.netConnect()
				if err != nil {
					return nil, err
				}
				retries--
				break
			} else if err != nil {
				// receive error. retrying won't help. abort
				break
			}
			if x.OnRecv != nil {
				x.OnRecv(x)
			}
			x.Logger.Printf("GET RESPONSE OK: %+v", resp)
			result = new(SnmpPacket)
			result.Logger = x.Logger

			result.MsgFlags = packetOut.MsgFlags
			if packetOut.SecurityParameters != nil {
				result.SecurityParameters = packetOut.SecurityParameters.Copy()
			}

			var cursor int
			cursor, err = x.unmarshalHeader(resp, result)
			if err != nil {
				x.Logger.Printf("ERROR on unmarshall header: %s", err)
				break
			}

			if x.Version == Version3 {
				useResponseSecurityParameters := false
				if usp, ok := x.SecurityParameters.(*UsmSecurityParameters); ok {
					if usp.AuthoritativeEngineID == "" {
						useResponseSecurityParameters = true
					}
				}
				err = x.testAuthentication(resp, result, useResponseSecurityParameters)
				if err != nil {
					x.Logger.Printf("ERROR on Test Authentication on v3: %s", err)
					break
				}
				resp, cursor, err = x.decryptPacket(resp, cursor, result)
				if err != nil {
					x.Logger.Printf("ERROR on decryptPacket on v3: %s", err)
					break
				}
			}

			err = x.unmarshalPayload(resp, cursor, result)
			if err != nil {
				x.Logger.Printf("ERROR on UnmarshalPayload on v3: %s", err)
				break
			}

            ...

            validID := false
			for _, id := range allReqIDs {
				if id == result.RequestID {
					validID = true
				}
			}
			if result.RequestID == 0 {
				validID = true
			}
			if !validID {
				x.Logger.Print("ERROR out of order")
				continue
			}

看代码和注释会发现,gosnmp发送包之后是等待解析返回内容(非trap请求)。同一个进程的两个goroutine可能会出现解析对方响应数据的问题。解析后 request id匹配不到,goroutineA 就丢了这个响应数据,另一个goroutineB 等不到对应的数据包,直到超时。

搞个demo复现一下。demo的主要逻辑是准备了一份oid列表,然后多个goroutine去发送和解析。

package main

import (
	"fmt"
	"sync"
	"time"

	"github.com/gosnmp/gosnmp"
)

func main() {
	oidList := []string{
		".1.3.6.1.4.1.2011.5.25.31.1.1.10.1.6.1.1",
		".1.3.6.1.4.1.2011.5.25.31.1.1.10.1.6.1.2",
		".1.3.6.1.4.1.2011.5.25.31.1.1.10.1.6.2.1",
		".1.3.6.1.4.1.2011.5.25.31.1.1.10.1.6.2.2",
		".1.3.6.1.4.1.2011.5.25.31.1.1.10.1.7.1.1",
		".1.3.6.1.4.1.2011.5.25.31.1.1.10.1.7.1.2",
		".1.3.6.1.4.1.2011.5.25.31.1.1.10.1.7.2.1",
		".1.3.6.1.4.1.2011.5.25.31.1.1.10.1.7.2.2",
		".1.3.6.1.4.1.2011.5.25.31.1.1.18.1.5.1.1",
		".1.3.6.1.4.1.2011.5.25.31.1.1.18.1.5.1.2",
		".1.3.6.1.4.1.2011.5.25.31.1.1.18.1.5.2.1",
		".1.3.6.1.4.1.2011.5.25.31.1.1.18.1.5.2.2",
		".1.3.6.1.4.1.2011.5.25.31.1.1.18.1.6.1.1",
		".1.3.6.1.4.1.2011.5.25.31.1.1.18.1.6.1.2",
		".1.3.6.1.4.1.2011.5.25.31.1.1.18.1.6.2.1",
		".1.3.6.1.4.1.2011.5.25.31.1.1.18.1.6.2.2",
		".1.3.6.1.4.1.2011.5.25.31.1.1.1.1.11.16842753",
		".1.3.6.1.4.1.2011.5.25.31.1.1.1.1.11.16908289",
	}
	var snmp gosnmp.GoSNMP
	snmp.Target = "1.2.3.4" // pfsense box
	snmp.Port = 161
	snmp.Version = gosnmp.Version2c
	snmp.Timeout = time.Duration(40) * time.Second
	snmp.Community = "public"
	snmp.Retries = 0

	ec := snmp.Connect()

	if ec != nil {
		fmt.Printf("Error snmp %s\n", ec)
		return
	}

	wg := &sync.WaitGroup{}
	for idx := range oidList {
		wg.Add(1)
		go func(oid string) {
			defer wg.Done()
			result, err := snmp.Get([]string{oid})
			if err != nil {
				fmt.Printf("Error get %s, %+v\n", err, result)
				return
			}
			for _, v := range result.Variables {
				fmt.Printf("%+v\n", v)
			}
		}(oidList[idx])
	}
	wg.Wait()
}

只有第一条oid 正常返回和解析了

{Value:1 Name:.1.3.6.1.4.1.2011.5.25.31.1.1.10.1.7.1.1 Type:Integer}
Error get request timeout(40s) (after 0 retries), error reading from socket: read udp 192.168.0.2:44953->1.2.3.4:161: i/o timeout , origin error:%!s(<nil>), <nil>

去掉goroutine后,代码如下

package main

import (
	"fmt"
	"sync"
	"time"

	"github.com/gosnmp/gosnmp"
)

func main() {
	oidList := []string{
		".1.3.6.1.4.1.2011.5.25.31.1.1.10.1.6.1.1",
		".1.3.6.1.4.1.2011.5.25.31.1.1.10.1.6.1.2",
		".1.3.6.1.4.1.2011.5.25.31.1.1.10.1.6.2.1",
		".1.3.6.1.4.1.2011.5.25.31.1.1.10.1.6.2.2",
		".1.3.6.1.4.1.2011.5.25.31.1.1.10.1.7.1.1",
		".1.3.6.1.4.1.2011.5.25.31.1.1.10.1.7.1.2",
		".1.3.6.1.4.1.2011.5.25.31.1.1.10.1.7.2.1",
		".1.3.6.1.4.1.2011.5.25.31.1.1.10.1.7.2.2",
		".1.3.6.1.4.1.2011.5.25.31.1.1.18.1.5.1.1",
		".1.3.6.1.4.1.2011.5.25.31.1.1.18.1.5.1.2",
		".1.3.6.1.4.1.2011.5.25.31.1.1.18.1.5.2.1",
		".1.3.6.1.4.1.2011.5.25.31.1.1.18.1.5.2.2",
		".1.3.6.1.4.1.2011.5.25.31.1.1.18.1.6.1.1",
		".1.3.6.1.4.1.2011.5.25.31.1.1.18.1.6.1.2",
		".1.3.6.1.4.1.2011.5.25.31.1.1.18.1.6.2.1",
		".1.3.6.1.4.1.2011.5.25.31.1.1.18.1.6.2.2",
		".1.3.6.1.4.1.2011.5.25.31.1.1.1.1.11.16842753",
		".1.3.6.1.4.1.2011.5.25.31.1.1.1.1.11.16908289",
	}
	var snmp gosnmp.GoSNMP
	snmp.Target = "1.2.3.4" // pfsense box
	snmp.Port = 161
	snmp.Version = gosnmp.Version2c
	snmp.Timeout = time.Duration(40) * time.Second
	snmp.Community = "public"
	snmp.Retries = 0

	ec := snmp.Connect()

	if ec != nil {
		fmt.Printf("Error snmp %s\n", ec)
		return
	}

		result, error_get := snmp.Get(oidList)
		if error_get != nil {
			fmt.Printf("Error get %s \n", error_get, result)
			return
		}

		for _, v := range result.Variables {
			fmt.Printf("%+v\n", v)
		}
}

数据正常返回

{Value:1 Name:.1.3.6.1.4.1.2011.5.25.31.1.1.10.1.6.1.1 Type:Integer}
{Value:1 Name:.1.3.6.1.4.1.2011.5.25.31.1.1.10.1.6.1.2 Type:Integer}
{Value:1 Name:.1.3.6.1.4.1.2011.5.25.31.1.1.10.1.6.2.1 Type:Integer}
{Value:1 Name:.1.3.6.1.4.1.2011.5.25.31.1.1.10.1.6.2.2 Type:Integer}
{Value:1 Name:.1.3.6.1.4.1.2011.5.25.31.1.1.10.1.7.1.1 Type:Integer}
{Value:1 Name:.1.3.6.1.4.1.2011.5.25.31.1.1.10.1.7.1.2 Type:Integer}
{Value:1 Name:.1.3.6.1.4.1.2011.5.25.31.1.1.10.1.7.2.1 Type:Integer}
{Value:1 Name:.1.3.6.1.4.1.2011.5.25.31.1.1.10.1.7.2.2 Type:Integer}
{Value:1 Name:.1.3.6.1.4.1.2011.5.25.31.1.1.18.1.5.1.1 Type:Integer}
{Value:1 Name:.1.3.6.1.4.1.2011.5.25.31.1.1.18.1.5.1.2 Type:Integer}
{Value:1 Name:.1.3.6.1.4.1.2011.5.25.31.1.1.18.1.5.2.1 Type:Integer}
{Value:1 Name:.1.3.6.1.4.1.2011.5.25.31.1.1.18.1.5.2.2 Type:Integer}
{Value:1 Name:.1.3.6.1.4.1.2011.5.25.31.1.1.18.1.6.1.1 Type:Integer}
{Value:1 Name:.1.3.6.1.4.1.2011.5.25.31.1.1.18.1.6.1.2 Type:Integer}
{Value:1 Name:.1.3.6.1.4.1.2011.5.25.31.1.1.18.1.6.2.1 Type:Integer}
{Value:1 Name:.1.3.6.1.4.1.2011.5.25.31.1.1.18.1.6.2.2 Type:Integer}
{Value:42 Name:.1.3.6.1.4.1.2011.5.25.31.1.1.1.1.11.16842753 Type:Integer}
{Value:41 Name:.1.3.6.1.4.1.2011.5.25.31.1.1.1.1.11.16908289 Type:Integer}

最后修复也就简单了 https://github.com/flashcatcloud/categraf/pull/680

Comments