The Azure VM runs out of memory (OOM) and SSH connections are refused, so we need to capture logs at the moment an OOM event happens.
OOM

[root@az-dxb-nl-rh-uat-operation01 ~]# vim /etc/systemd/system/oom_capture.service
cat /etc/systemd/system/oom_capture.service
# systemd unit that runs /usr/local/sbin/oom_capture as a service.
[Unit]
Description=OOM Capture and Analysis Service
# Start ordering only: this unit is started after network.target.
After=network.target
[Service]
# The monitoring script; it is expected to keep running in the foreground.
ExecStart=/usr/local/sbin/oom_capture
# Restart the script whenever it exits, whatever the reason.
Restart=always
User=root
[Install]
# "systemctl enable" hooks the unit into multi-user.target.
WantedBy=multi-user.target
[root@az-dxb-nl-rh-uat-operation01 ~]# vim /usr/local/sbin/oom_capture
#!/bin/bash
# oom_capture: follow the system log and analyse kernel OOM-killer messages.
# The interpreter line above is required: systemd executes this file directly
# (ExecStart=/usr/local/sbin/oom_capture), and the script uses bash features.

# Define log file to monitor
LOG_FILE="/var/log/messages"
# Function to analyze OOM events
#######################################
# Print details about one OOM-killer log line and append the same analysis
# to the OOM analysis log.
# Globals:
#   OOM_ANALYSIS_LOG (read, optional) - file the analysis is appended to;
#     defaults to /var/log/oom_analysis.log
# Arguments:
#   $1 - the log line that reports the OOM kill
# Outputs:
#   Human-readable analysis on stdout; the same data appended to the log.
# Returns:
#   0 when no PID could be found in the line; otherwise the status of the
#   final append to the analysis log.
#######################################
analyze_oom_event() {
  local line="$1"
  local analysis_log="${OOM_ANALYSIS_LOG:-/var/log/oom_analysis.log}"
  # Everything is local so the caller's PID/USER variables are not clobbered.
  local pid proc_name ps_info cmdline memory_usage proc_user uid

  echo "OOM Event Detected: $line"

  # Extract PID of the killed process. Both spellings are accepted because
  # kernels word the message as "Kill process N" or "Killed process N".
  pid=$(printf '%s\n' "$line" | grep -oP 'Kill(ed)? process \K[0-9]+' | head -n 1)
  if [ -z "$pid" ]; then
    echo "No PID found in the OOM event line."
    return 0
  fi

  echo "Process Details:"

  # The process has usually been reaped by the time the line is read, so take
  # its name from the "(name)" part of the log line when it is present.
  proc_name=$(printf '%s\n' "$line" | grep -oP 'process [0-9]+ \(\K[^)]+' | head -n 1)
  if [ -n "$proc_name" ]; then
    echo "Process Name: $proc_name"
  fi

  # Query ps once and reuse the result for stdout and for the log file.
  if ! ps_info=$(ps -fp "$pid" 2>/dev/null); then
    ps_info="Process with PID $pid no longer exists."
  fi
  echo "$ps_info"

  # Get the full command line of the process. Test the captured text itself:
  # the exit status of a "cat | tr" pipeline is tr's and is always 0.
  cmdline=""
  if [ -r "/proc/$pid/cmdline" ]; then
    cmdline=$({ tr '\0' ' ' < "/proc/$pid/cmdline"; } 2>/dev/null)
  fi
  if [ -n "$cmdline" ]; then
    echo "Command Line: $cmdline"
  else
    cmdline="not available"
    echo "Command line for PID $pid not available."
  fi

  # Get memory usage details, keeping each label next to its value.
  memory_usage=$(printf '%s\n' "$line" \
    | grep -oP '(total-vm|anon-rss|file-rss|shmem-rss):[0-9]+kB' \
    | paste -sd ' ' -)
  echo "Memory Usage: $memory_usage"

  # Get the user who ran the process. If ps can no longer see the process,
  # fall back to a "UID:<n>" field in the log line (only if the line has one).
  proc_user=$(ps -o user= -p "$pid" 2>/dev/null)
  if [ -z "$proc_user" ]; then
    uid=$(printf '%s\n' "$line" | grep -oP 'UID:\K[0-9]+' | head -n 1)
    if [ -n "$uid" ]; then
      proc_user=$(getent passwd "$uid" 2>/dev/null | cut -d: -f1)
      proc_user="${proc_user:-uid $uid}"
    fi
  fi
  if [ -n "$proc_user" ]; then
    echo "User: $proc_user"
  else
    proc_user="unknown"
    echo "User information for PID $pid not available."
  fi

  # Log the analysis to a file
  {
    echo "OOM Analysis:"
    echo "$line"
    if [ -n "$proc_name" ]; then
      echo "Process Name: $proc_name"
    fi
    echo "$ps_info"
    echo "Command Line: $cmdline"
    echo "Memory Usage: $memory_usage"
    echo "User: $proc_user"
    echo "-------------------------"
  } >> "$analysis_log"
}
# Continuously monitor the log file for OOM events.
# tail -F keeps following the file by name (it survives log rotation) and
# -n0 skips lines that already exist. A single grep filters the stream,
# line-buffered so each match is passed on immediately, instead of forking
# an echo|grep pair for every log line. Only matching lines reach the loop.
tail -Fn0 "$LOG_FILE" \
  | grep --line-buffered -i "out of memory" \
  | while IFS= read -r line; do
      analyze_oom_event "$line"
    done
[root@az-dxb-nl-rh-uat-operation01 ~]# chmod +x /usr/local/sbin/oom_capture
[root@az-dxb-nl-rh-uat-operation01 ~]# systemctl daemon-reload
[root@az-dxb-nl-rh-uat-operation01 ~]# systemctl enable oom_capture
[root@az-dxb-nl-rh-uat-operation01 ~]# systemctl start oom_capture
[root@az-dxb-nl-rh-uat-operation01 ~]# systemctl status oom_capture
● oom_capture.service - OOM Capture and Analysis Service
Loaded: loaded (/etc/systemd/system/oom_capture.service; enabled; preset: disabled)
Active: active (running) since Fri 2025-04-18 17:06:07 +04; 13min ago
Main PID: 47265 (oom_capture)
Tasks: 3 (limit: 48561)
Memory: 3.0M
CPU: 500ms
CGroup: /system.slice/oom_capture.service
├─47265 /bin/bash /usr/local/sbin/oom_capture
├─47266 tail -Fn0 /var/log/messages
└─47267 /bin/bash /usr/local/sbin/oom_capture
Apr 18 17:06:07 az-dxb-nl-rh-uat-operation01 systemd[1]: Started OOM Capture and Analysis Service.
# The captured and analysed OOM events are saved in /var/log/oom_analysis.log
[root@az-dxb-nl-rh-uat-operation01 ~]# less /var/log/oom_analysis.log
CPU
[root@az-dxb-nl-rh-uat-operation01 ~]# vim /usr/local/sbin/cpu_capture
#!/bin/bash
# cpu_capture: periodically sample overall CPU usage and record the busiest
# process whenever usage reaches THRESHOLD.

# Threshold for CPU usage, in percent; the monitoring loop triggers an
# analysis when the measured usage is greater than or equal to this value.
THRESHOLD=90
# File the analysis reports are appended to.
LOG_FILE="/var/log/cpu_analysis.log"
#######################################
# Report the process that ps ranks first by CPU usage and append the same
# report to the CPU analysis log.
# Globals:
#   LOG_FILE (read) - file the report is appended to
# Arguments:
#   None
# Outputs:
#   Report on stdout; the same report, with a timestamp header and a
#   separator line, appended to LOG_FILE.
# Returns:
#   1 if ps produced no process line; otherwise the status of the append.
#######################################
analyze_cpu_usage() {
  # Local, differently named variables on purpose: bash makes PPID read-only,
  # so assigning to it fails and aborts the running command, and USER is the
  # login-name environment variable.
  local top_process pid parent_pid owner cpu mem cmd details

  echo "High CPU Usage Detected at $(date)"

  # Get top process by CPU usage. "name=" suppresses every column header, so
  # the first line is already the top entry. args is the last column so the
  # command is not cut to a column width and may contain spaces.
  top_process=$(ps -eo pid=,ppid=,user=,pcpu=,pmem=,args= --sort=-pcpu | head -n 1)
  if [ -z "$top_process" ]; then
    echo "Unable to read the process list from ps." >&2
    return 1
  fi

  # One read splits the fixed leading fields; the remainder goes to cmd.
  read -r pid parent_pid owner cpu mem cmd <<< "$top_process"

  details=$(printf '%s\n' \
    "PID: $pid" \
    "PPID: $parent_pid" \
    "User: $owner" \
    "Command: $cmd" \
    "CPU Usage: $cpu%" \
    "Memory Usage: $mem%")
  printf '%s\n' "$details"

  # Log the analysis
  {
    echo "CPU Analysis - $(date)"
    printf '%s\n' "$details"
    echo "-----------------------------"
  } >> "$LOG_FILE"
}
# Continuous monitoring: every 10 seconds read the overall idle percentage
# from top and run the analysis when usage reaches THRESHOLD.
while true; do
  # Locate the idle value by its "id" label, not by field position: top can
  # print it glued to the previous column (e.g. "ni,100.0 id"), which shifts
  # whitespace-separated fields. LC_ALL=C keeps "." as the decimal separator.
  CPU_IDLE=$(LC_ALL=C top -bn1 | grep -m1 "Cpu(s)" \
    | sed -n 's/.*[ ,]\([0-9][0-9.]*\)[ %]*id.*/\1/p')

  if [[ "$CPU_IDLE" =~ ^[0-9]+(\.[0-9]+)?$ ]]; then
    # awk always prints a leading digit ("0.5"); bc would print ".5", which
    # left CPU_INT empty and made the integer comparison below fail.
    CPU_USAGE=$(awk -v idle="$CPU_IDLE" 'BEGIN { printf "%.1f", 100 - idle }')
    CPU_INT=${CPU_USAGE%.*}
    if [ "$CPU_INT" -ge "$THRESHOLD" ]; then
      analyze_cpu_usage
    fi
  else
    echo "Could not read the CPU idle percentage from top output." >&2
  fi

  sleep 10
done
[root@az-dxb-nl-rh-uat-operation01 ~]# vim /etc/systemd/system/cpu_capture.service
# systemd unit that runs /usr/local/sbin/cpu_capture as a service.
[Unit]
Description=CPU Usage Capture and Analysis Service
# Start ordering only: this unit is started after network.target.
After=network.target
[Service]
# The monitoring script; it loops forever, sleeping between samples.
ExecStart=/usr/local/sbin/cpu_capture
# Restart the script whenever it exits, whatever the reason.
Restart=always
User=root
[Install]
# "systemctl enable" hooks the unit into multi-user.target.
WantedBy=multi-user.target
[root@az-dxb-nl-rh-uat-operation01 ~]# systemctl daemon-reload
[root@az-dxb-nl-rh-uat-operation01 ~]# systemctl enable cpu_capture && systemctl start cpu_capture && systemctl status cpu_capture
[root@az-dxb-nl-rh-uat-operation01 ~]# systemctl status cpu_capture.service
● cpu_capture.service - CPU Usage Capture and Analysis Service
Loaded: loaded (/etc/systemd/system/cpu_capture.service; enabled; preset: disabled)
Active: active (running) since Mon 2025-04-21 17:49:24 +04; 8min ago
Main PID: 44793 (cpu_capture)
Tasks: 2 (limit: 48560)
Memory: 2.2M
CPU: 1.067s
CGroup: /system.slice/cpu_capture.service
├─44793 /bin/bash /usr/local/sbin/cpu_capture
└─47388 sleep 10
Apr 21 17:49:24 az-dxb-nl-rh-uat-operation01 systemd[1]: Started CPU Usage Capture and Analysis Service.
[root@az-dxb-nl-rh-uat-operation01 ~]# tail -f /var/log/cpu_analysis.log