我的团队频道上的 Kubernetes Pod 重启通知警报



我的 Pod 运行在 AKS 集群上。每当 Pod 重启时,我需要在团队的频道中收到通知。有没有相关的文章或命令可以用来配置这类通知?

为此,您可以使用类似botkube的工具或应用程序:https://www.botkube.io/

还要检查Kubewatch:https://github.com/bitnami-labs/kubewatch

您还可以将 Grafana 与 Prometheus Alertmanager 搭配使用,实现监控和告警:https://github.com/grafana-operator/grafana-operator

不过,如果找不到合适的工具或应用程序,你也可以用 Python、Node 或任何你擅长的语言编写自定义脚本,监控 Pod 重启事件,并通过 Slack webhook 发送通知。

下面分享一段示例 Python 代码:它检查 Pod 是处于就绪状态还是已经崩溃,并向 Slack 发送通知。您可以根据需要调整其中的逻辑。

import json
import logging
import time

import requests
from kubernetes import client, config, watch
# Poll the "default" namespace and post a Slack message whenever a pod that
# was previously seen with a not-ready container becomes ready again.
logger = logging.getLogger('k8s_events')
logger.setLevel(logging.DEBUG)

# If running inside pod
# config.load_incluster_config()
# If running locally
config.load_kube_config()
v1 = client.CoreV1Api()

# Pods last seen with a not-ready container, keyed by pod name. Each value
# holds the "start_time" / "end_time" transition timestamps of that pod.
mydict = {}
# Slack incoming-webhook URL; fill this in before running the script.
webhook_url = ''
# Container names for which no Slack notification is posted.
excluded_containers = (
    'conversation-auto-close-service-scheduler',
    'admin-service-trail-fllow-up-scheduler',
    'bot-trial-email-scheduler',
    'conversation-service-scheduler',
    'faq-service-scheduler',
    'nlp-service-scheduler',
    'refresh-add-on-scheduler',
    'response-sheet-scheduler',
)
# Pause between two polls of the API server, in seconds.
poll_interval = 1

while True:
    pod_list = v1.list_namespaced_pod("default")
    for i in pod_list.items:
        pod_name = i.metadata.name
        conditions = i.status.conditions or []
        if len(conditions) < 2:
            # Both timestamps below are read from conditions[0] and
            # conditions[1]; skip pods that do not report them yet.
            continue
        # container_statuses may be None, so fall back to an empty list.
        for c in i.status.container_statuses or []:
            if not c.ready:
                # Remember when the pod was seen not ready.
                mydict[pod_name] = {
                    "start_time": conditions[0].last_transition_time,
                    "end_time": conditions[1].last_transition_time,
                }
                continue
            if pod_name not in mydict:
                continue

            print("Inside mydict If")
            print("Pod updated : ", pod_name)
            print("My dict value : ", mydict)
            # The tracked pod is ready again: stop tracking it, whether or
            # not a notification is sent for this container.
            record = mydict.pop(pod_name)
            record['end_time'] = conditions[1].last_transition_time
            dt_started = record['start_time'].replace(tzinfo=None)
            dt_ended = record['end_time'].replace(tzinfo=None)
            duration = str((dt_ended - dt_started).total_seconds()) + ' Sec'

            if c.name in excluded_containers:
                continue

            fields = [
                {"title": "Status", "value": "READY", "short": False},
                {"title": "Pod name", "value": pod_name, "short": False},
                {"title": "Duration", "value": duration, "short": False},
                {"title": "Service name", "value": c.name, "short": False},
            ]
            text = c.name + " Pod is started"
            data = {
                "text": text,
                "mrkdwn": True,
                "attachments": [
                    {
                        "color": "#FBBC05",
                        "title": "Pod Details",
                        "fields": fields,
                        "footer": "Manvar",
                        "footer_icon": "https://cdn.test.manvar.com/assets/manvar-icon.png",
                    },
                ],
            }
            print("Final data to post: ", data)
            # A timeout keeps a stalled webhook from blocking the loop forever.
            response = requests.post(
                webhook_url,
                data=json.dumps(data),
                headers={'Content-Type': 'application/json'},
                timeout=10,
            )
            if response.status_code != 200:
                raise ValueError(
                    'Request to slack returned an error %s, the response is:\n%s'
                    % (response.status_code, response.text)
                )
            time.sleep(1)
    # Avoid hammering the API server with back-to-back list calls.
    time.sleep(poll_interval)

我尝试过 Botkube,但我不想公开我的集群端点,所以我在 @Harsh-Manvar 的代码基础上编写了以下脚本。您可以通过 Microsoft Teams 的 Incoming Webhook 应用将它连接到 Teams。

from kubernetes import client, config
import json
import requests
import time

def monitorNamespace(namespace: str, webhookUrl: str):
    """Alert on every pod in *namespace* whose status is not "Running".

    Lists the pods of the namespace, collects the name and status of each
    pod for which ``getPodStatus`` returns something other than "Running",
    and passes the report to ``sendAlert``. Nothing is sent when every pod
    is running.

    Args:
        namespace: Namespace whose pods are listed.
        webhookUrl: URL handed to ``sendAlert`` together with the report.
    """
    v1 = client.CoreV1Api()
    pod_list = v1.list_namespaced_pod(namespace)
    podsNotRunning = {"Namespace": namespace, "Pods": []}
    for pod in pod_list.items:
        status = getPodStatus(pod)
        if status != "Running":
            podsNotRunning["Pods"].append({"Podname": pod.metadata.name, "status": status})
    # Test the pod list, not the report dict: the dict always has two keys,
    # so checking its length would send an alert on every call.
    if podsNotRunning["Pods"]:
        sendAlert(podsNotRunning, webhookUrl)

def sendAlert(podsNotRunning, webhookUrl):
    """Print the report and POST it as JSON to *webhookUrl*.

    Delivery is best effort: a failed request or a non-200 response is
    printed and otherwise ignored, so the caller's loop keeps running.

    Args:
        podsNotRunning: JSON-serializable report to send.
        webhookUrl: Destination URL of the POST request.
    """
    print(podsNotRunning)
    try:
        # A timeout keeps a stalled webhook from blocking the caller forever.
        response = requests.post(
            webhookUrl,
            data=json.dumps(podsNotRunning),
            headers={'Content-Type': 'application/json'},
            timeout=10,
        )
    except requests.RequestException as err:
        print('Response error:', err)
        return
    if response.status_code != 200:
        print('Response error:', response.status_code, response.text)

def getPodStatus(pod: "client.models.v1_pod.V1Pod") -> str:
    """Return the pod's phase, or the waiting reason of a stuck container.

    Every container status is inspected. The first container that is not
    started or not ready and whose waiting state carries a message
    determines the result: its waiting ``reason`` is returned. Otherwise
    the pod phase is returned unchanged.

    Args:
        pod: Pod object exposing ``status.phase`` and
            ``status.container_statuses``.

    Returns:
        The waiting reason of the first such container, else the pod phase.
    """
    status = pod.status.phase
    # container_statuses may be None, so fall back to an empty list.
    for containerStatus in pod.status.container_statuses or []:
        if containerStatus.started is False or containerStatus.ready is False:
            state = containerStatus.state
            # The waiting state is absent when the container is not waiting.
            waitingState = state.waiting if state is not None else None
            if waitingState is not None and waitingState.message is not None:
                return waitingState.reason
    return status
if __name__ == "__main__":
    # Script entry point: load the cluster configuration, then check the
    # namespace every `interval` seconds until the process is stopped.

    # If running inside pod:
    # config.load_incluster_config()
    # If running locally:
    config.load_kube_config()

    webhookUrl = 'http://webhookurl'
    namespace = 'default'
    interval = 10
    while True:
        monitorNamespace(namespace, webhookUrl)
        time.sleep(interval)


最新更新