OOM Capture and Analyse

Linux

An Azure VM ran out of memory (OOM) and refused SSH connections, so we need to capture the logs whenever an OOM event happens.

OOM

image-20250418170958276

[root@az-dxb-nl-rh-uat-operation01 ~]# vim /etc/systemd/system/oom_capture.service
cat /etc/systemd/system/oom_capture.service
# Unit that keeps the OOM log watcher script (/usr/local/sbin/oom_capture) running.
[Unit]
Description=OOM Capture and Analysis Service
# Ordering only: this unit is started after network.target.
After=network.target

[Service]
# The watcher script is the service's main process.
ExecStart=/usr/local/sbin/oom_capture
# Start the script again whenever it exits, whatever the exit status.
Restart=always
# Run as root (presumably needed to read /var/log/messages and to write the
# analysis log under /var/log - confirm against the host's file permissions).
User=root

[Install]
# "systemctl enable" attaches the unit to multi-user.target.
WantedBy=multi-user.target
[root@az-dxb-nl-rh-uat-operation01 ~]# vim /usr/local/sbin/oom_capture
#!/bin/bash
#
# oom_capture - follow the system log and analyse every out-of-memory event.
# Runs as the main process of oom_capture.service. The interpreter line above
# is required because the service executes this file directly.

# Define log file to monitor
LOG_FILE="/var/log/messages"

# Function to analyze OOM events
#
# Parses one OOM log line, prints what can still be found out about the killed
# process to stdout and appends the same report to the analysis log.
#
# Globals:   OOM_ANALYSIS_LOG (read, optional) - report file; defaults to
#            /var/log/oom_analysis.log
# Arguments: $1 - the log line that reported the OOM event
# Outputs:   human-readable report on stdout
# Returns:   0 when no PID is found (reported on stdout, not an error);
#            otherwise the status of the final append to the report file
analyze_oom_event() {
    local line="$1"
    local report_file="${OOM_ANALYSIS_LOG:-/var/log/oom_analysis.log}"
    # The kernel message is worded "Kill process <pid>" or "Killed process <pid>"
    # depending on the kernel version; accept both spellings.
    local pid_re='Kill(ed)? process ([0-9]+)'
    # Lower-case locals so that neither the caller's variables nor the USER
    # environment variable are overwritten.
    local pid="" ps_info="" cmdline="" memory_usage="" owner=""

    echo "OOM Event Detected: $line"

    # Extract PID of the killed process
    if [[ "$line" =~ $pid_re ]]; then
        pid="${BASH_REMATCH[2]}"
    fi

    if [[ -z "$pid" ]]; then
        echo "No PID found in the OOM event line."
        return 0
    fi

    echo "Process Details:"

    # The victim may already be gone by the time its log line is read, so every
    # lookup below is best effort and its errors are deliberately silenced.
    if ps_info=$(ps -fp "$pid" 2>/dev/null); then
        echo "$ps_info"
    else
        ps_info=""
        echo "Process with PID $pid no longer exists."
    fi

    # Get the full command line of the process. The status tested is that of
    # the read itself (the group fails when /proc/<pid>/cmdline cannot be
    # opened), not that of a later pipeline stage.
    if cmdline=$({ tr '\0' ' ' <"/proc/$pid/cmdline"; } 2>/dev/null); then
        echo "Command Line: $cmdline"
    else
        cmdline=""
        echo "Command line for PID $pid not available."
    fi

    # Get memory usage details, keeping each label next to its value so the
    # figures stay identifiable in the report.
    memory_usage=$(grep -oE '(total-vm|anon-rss|file-rss|shmem-rss):[0-9]+kB' <<<"$line" | tr '\n' ' ')
    memory_usage="${memory_usage% }"
    echo "Memory Usage: $memory_usage"

    # Get the user who ran the process
    if owner=$(ps -o user= -p "$pid" 2>/dev/null) && [[ -n "$owner" ]]; then
        echo "User: $owner"
    else
        owner=""
        echo "User information for PID $pid not available."
    fi

    # Log the analysis to a file
    {
        echo "OOM Analysis:"
        echo "$line"
        if [[ -n "$ps_info" ]]; then
            echo "$ps_info"
        fi
        echo "Command Line: $cmdline"
        echo "Memory Usage: $memory_usage"
        echo "User: $owner"
        echo "-------------------------"
    } >> "$report_file"
}

# Continuously monitor the log file for OOM events
# Only lines appended after start-up are seen (-n0). The file is followed by
# name (-F), so tail re-opens it after it has been replaced.
# IFS= together with read -r hands each line to the analysis unmodified: no
# trimming of surrounding whitespace and no backslash processing.
tail -Fn0 "$LOG_FILE" | while IFS= read -r line; do
    # Case-insensitive substring test done in the shell itself, so no extra
    # processes are started for every log line.
    if [[ "${line,,}" == *"out of memory"* ]]; then
        analyze_oom_event "$line"
    fi
done
[root@az-dxb-nl-rh-uat-operation01 ~]# chmod +x /usr/local/sbin/oom_capture
[root@az-dxb-nl-rh-uat-operation01 ~]# systemctl daemon-reload
[root@az-dxb-nl-rh-uat-operation01 ~]# systemctl enable oom_capture
[root@az-dxb-nl-rh-uat-operation01 ~]# systemctl start oom_capture
[root@az-dxb-nl-rh-uat-operation01 ~]# systemctl status oom_capture
● oom_capture.service - OOM Capture and Analysis Service
     Loaded: loaded (/etc/systemd/system/oom_capture.service; enabled; preset: disabled)
     Active: active (running) since Fri 2025-04-18 17:06:07 +04; 13min ago
   Main PID: 47265 (oom_capture)
      Tasks: 3 (limit: 48561)
     Memory: 3.0M
        CPU: 500ms
     CGroup: /system.slice/oom_capture.service
             ├─47265 /bin/bash /usr/local/sbin/oom_capture
             ├─47266 tail -Fn0 /var/log/messages
             └─47267 /bin/bash /usr/local/sbin/oom_capture

Apr 18 17:06:07 az-dxb-nl-rh-uat-operation01 systemd[1]: Started OOM Capture and Analysis Service.
# The captured and analysed logs are saved to /var/log/oom_analysis.log
[root@az-dxb-nl-rh-uat-operation01 ~]# less /var/log/oom_analysis.log

CPU

[root@az-dxb-nl-rh-uat-operation01 ~]# vim /usr/local/sbin/cpu_capture
#!/bin/bash
#
# cpu_capture - poll overall CPU usage and log the top CPU consumer whenever
# the usage reaches the threshold.
#
# Optional environment overrides (for example via Environment= in the unit):
#   CPU_CAPTURE_THRESHOLD  usage percentage that triggers an analysis (default 90)
#   CPU_CAPTURE_LOG        report file (default /var/log/cpu_analysis.log)

# Threshold for CPU usage
THRESHOLD="${CPU_CAPTURE_THRESHOLD:-90}"
LOG_FILE="${CPU_CAPTURE_LOG:-/var/log/cpu_analysis.log}"

# Report the process that ps ranks first by %cpu.
#
# Prints the report to stdout and appends the same fields to the analysis log.
#
# Globals:   LOG_FILE (read) - report file; falls back to
#            /var/log/cpu_analysis.log when unset or empty
# Arguments: none
# Outputs:   human-readable report on stdout
# Returns:   status of the final append to the report file
analyze_cpu_usage() {
    local log_file="${LOG_FILE:-/var/log/cpu_analysis.log}"
    # Lower-case locals on purpose: PPID is a readonly bash variable (assigning
    # to it is an error) and USER belongs to the environment, so neither may be
    # used as a scratch name.
    local top_process="" pid="" parent_pid="" owner="" cpu="" mem="" cmd=""
    local report=""

    echo "High CPU Usage Detected at $(date)"

    # Get top process by CPU usage. The header is suppressed and the command is
    # requested last, so the fixed fields can be split positionally and the
    # command keeps its embedded spaces.
    top_process=$(ps -eo pid,ppid,user,%cpu,%mem,cmd --no-headers --sort=-%cpu 2>/dev/null | head -n 1)
    read -r pid parent_pid owner cpu mem cmd <<<"$top_process"

    # Build the report once and reuse it for stdout and for the log file.
    report=$(printf '%s\n' \
        "PID: $pid" \
        "PPID: $parent_pid" \
        "User: $owner" \
        "Command: $cmd" \
        "CPU Usage: $cpu%" \
        "Memory Usage: $mem%")

    echo "$report"

    # Log the analysis
    {
        echo "CPU Analysis - $(date)"
        echo "$report"
        echo "-----------------------------"
    } >> "$log_file"
}

# Continuous monitoring
# Every 10 seconds: read the overall CPU usage from one batch-mode top run and
# call analyze_cpu_usage when it has reached THRESHOLD.
while true; do
    # The idle value is located by its "id" label instead of by field position,
    # because top's columns can run together (e.g. "ni,100.0 id") and shift the
    # field numbers. LC_ALL=C keeps the decimal separator a dot. awk does the
    # subtraction and truncates to an integer, so fractional values below 1
    # become 0 instead of an empty string.
    CPU_INT=$(LC_ALL=C top -bn1 2>/dev/null | awk '
        /Cpu\(s\)/ {
            if (match($0, /[0-9.]+[ \t%]*id/)) {
                idle = substr($0, RSTART, RLENGTH)
                sub(/[ \t%]*id$/, "", idle)
                printf "%d", 100 - idle
            }
            exit
        }')

    # Compare only when a plain non-negative integer was parsed; an unparsable
    # sample is skipped rather than fed to the integer test.
    if [[ "$CPU_INT" =~ ^[0-9]+$ ]] && [ "$CPU_INT" -ge "$THRESHOLD" ]; then
        analyze_cpu_usage
    fi

    sleep 10
done
[root@az-dxb-nl-rh-uat-operation01 ~]# vim /etc/systemd/system/cpu_capture.service
# Unit that keeps the CPU usage watcher script (/usr/local/sbin/cpu_capture) running.
[Unit]
Description=CPU Usage Capture and Analysis Service
# Ordering only: this unit is started after network.target.
After=network.target

[Service]
# The watcher script is the service's main process.
ExecStart=/usr/local/sbin/cpu_capture
# Start the script again whenever it exits, whatever the exit status.
Restart=always
# Run as root (presumably needed to write the analysis log under /var/log -
# confirm against the host's file permissions).
User=root

[Install]
# "systemctl enable" attaches the unit to multi-user.target.
WantedBy=multi-user.target
[root@az-dxb-nl-rh-uat-operation01 ~]# systemctl daemon-reload
[root@az-dxb-nl-rh-uat-operation01 ~]# systemctl enable cpu_capture && systemctl start cpu_capture && systemctl status cpu_capture
[root@az-dxb-nl-rh-uat-operation01 ~]# systemctl status cpu_capture.service
● cpu_capture.service - CPU Usage Capture and Analysis Service
     Loaded: loaded (/etc/systemd/system/cpu_capture.service; enabled; preset: disabled)
     Active: active (running) since Mon 2025-04-21 17:49:24 +04; 8min ago
   Main PID: 44793 (cpu_capture)
      Tasks: 2 (limit: 48560)
     Memory: 2.2M
        CPU: 1.067s
     CGroup: /system.slice/cpu_capture.service
             ├─44793 /bin/bash /usr/local/sbin/cpu_capture
             └─47388 sleep 10

Apr 21 17:49:24 az-dxb-nl-rh-uat-operation01 systemd[1]: Started CPU Usage Capture and Analysis Service.
[root@az-dxb-nl-rh-uat-operation01 ~]# tail -f /var/log/cpu_analysis.log