【shell】监控系统资源脚本

1.监控系统资源

编写脚本来监控CPU、内存、磁盘等系统资源的使用情况。

--------------------------------------------------------------------------------
CPU使用率:        0.8%       [状态: NORMAL]
内存使用率:     4.2%       [状态: NORMAL]
内存详情:        Total: 3.7G | Used: 157M | Free: 2.8G | Available: 3.3G
磁盘使用率(根分区): 4%         [状态: NORMAL]
磁盘详情:
Filesystem           Size       Used       Avail      Use%   Mounted
/dev/mapper/centos-root 37G        1.4G       36G        4%     /
/dev/sda1            509M       125M       384M       25%    /boot
系统负载:        0.00, 0.01, 0.05
进程数量:        371
系统运行时间:  up 1 hour, 7 minutes
--------------------------------------------------------------------------------

脚本

#!/bin/bash

# 系统资源监控脚本
# 功能:监控CPU、内存、磁盘使用情况

# ANSI color escape sequences. They are stored as literal backslash text
# (single quotes) and only turned into real escapes by printf's %b.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[0;33m'
CYAN='\033[0;36m'
NC='\033[0m'

# Alert thresholds, in percent, compared by check_threshold
CPU_WARNING=80
CPU_CRITICAL=95
MEM_WARNING=80
MEM_CRITICAL=95
DISK_WARNING=80
DISK_CRITICAL=90

# Log file that log_message appends to
LOG_FILE="/var/log/system_monitor.log"

# Report output file (set by the -o option); empty means "screen only"
REPORT_FILE=""

# Print the current local time as "YYYY-MM-DD HH:MM:SS".
get_timestamp() {
    date '+%F %T'
}

# Append one "[timestamp] [LEVEL] message" line to $LOG_FILE.
# Arguments: $1 - severity label, $2 - message text
log_message() {
    local severity="$1"
    local text="$2"
    local now
    now=$(get_timestamp)
    printf '%s\n' "[$now] [$severity] $text" >> "$LOG_FILE"
}

# Print one report line to the screen and, when $REPORT_FILE is set,
# append a color-free copy of it to that file.
# Arguments: $1 - line text; may contain backslash escapes such as \033[0;31m
output() {
    local line="$1"
    if [ -n "$REPORT_FILE" ]; then
        # The color variables hold literal "\033[..." text, so the line has to
        # be expanded with %b first; only then does it contain real ESC bytes
        # that sed can strip. $'...' supplies a real ESC to any sed flavor.
        printf '%b\n' "$line" | sed $'s/\x1b\\[[0-9;]*m//g' >> "$REPORT_FILE"
    fi
    printf '%b\n' "$line"
}

# Print the CPU usage percentage (one decimal), derived as 100 minus the
# idle figure found on the "Cpu(s)" line of a single batch-mode top run.
get_cpu_usage() {
    local idle_pct busy_pct
    idle_pct=$(top -bn1 \
        | grep "Cpu(s)" \
        | sed 's/.*, *\([0-9.]*\)%* id.*/\1/' \
        | awk '{print $1}')
    busy_pct=$(awk -v idle="$idle_pct" 'BEGIN {printf "%.1f", 100 - idle}')
    printf "%.1f" "$busy_pct"
}

# Print used memory as a percentage of total memory (one decimal).
# Outputs: e.g. "4.2"; prints "0.0" when the Mem: row cannot be read, so
#          callers always receive a number.
get_memory_usage() {
    # LC_ALL=C keeps the row label as "Mem:"; some locales translate it,
    # which would make the match fail.
    LC_ALL=C free | awk '
        /^Mem:/ { total = $2; used = $3 }
        END {
            # Guard against a missing or zero total instead of dividing by zero.
            if (total > 0) printf "%.1f\n", (used / total) * 100
            else print "0.0"
        }'
}

# 获取内存详细信息
get_memory_detail() {
    local mem_info=$(free -h | grep Mem)
    local total=$(echo "$mem_info" | awk '{print $2}')
    local used=$(echo "$mem_info" | awk '{print $3}')
    local free=$(echo "$mem_info" | awk '{print $4}')
    local available=$(echo "$mem_info" | awk '{print $7}')
    echo "Total: $total | Used: $used | Free: $free | Available: $available"
}

# Print the root filesystem's usage percentage as a bare number (no "%").
get_disk_usage() {
    # -P forces the POSIX one-line-per-filesystem layout, so a long device
    # name (e.g. an LVM path) cannot wrap and shift the columns.
    df -P / | awk 'NR == 2 { sub(/%$/, "", $5); print $5 }'
}

# Print a fixed-width table of `df -h` rows: the first row that survives the
# tmpfs/devtmpfs/overlay filter (normally the header) plus every row whose
# device name starts with /dev or /mapper.
get_disk_detail() {
    df -h | awk '
        /^tmpfs/ || /^devtmpfs/ || /^overlay/ { next }
        { kept++ }
        kept == 1 || $1 ~ /^\/dev/ || $1 ~ /^\/mapper/ {
            printf "  %-20s %-10s %-10s %-10s %-6s %-20s\n", $1, $2, $3, $4, $5, $6
        }'
}

# Print the text that follows "load average:" in uptime's output
# (the 1/5/15-minute figures), with leading spaces removed.
get_load_average() {
    uptime | awk -F'load average:' '{ sub(/^ */, "", $2); print $2 }'
}

# Print the number of processes listed by `ps aux`.
get_process_count() {
    # Skip the first row: it is the column header, not a process.
    ps aux | awk 'NR > 1 { count++ } END { print count + 0 }'
}

# Print how long the system has been up. Uses `uptime -p` when it succeeds;
# otherwise falls back to the part of plain uptime output before the
# first comma, with leading spaces removed.
get_uptime() {
    if ! uptime -p 2>/dev/null; then
        uptime | awk -F',' '{print $1}' | sed 's/^ *//'
    fi
}

# Classify a value against two thresholds.
# Arguments: $1 - measured value, $2 - warning threshold, $3 - critical threshold
# Outputs:   CRITICAL when value >= critical, WARNING when value >= warning,
#            NORMAL otherwise
check_threshold() {
    local current=$1
    local warn_at=$2
    local crit_at=$3

    # awk does the comparison so that decimal values work.
    awk -v v="$current" -v w="$warn_at" -v c="$crit_at" 'BEGIN {
        level = "NORMAL"
        if (v >= w) level = "WARNING"
        if (v >= c) level = "CRITICAL"
        print level
    }'
}

# Print the color code for a status label: $RED for CRITICAL,
# $YELLOW for WARNING and $GREEN for anything else.
get_status_color() {
    local level=$1
    if [ "$level" = "CRITICAL" ]; then
        echo "$RED"
    elif [ "$level" = "WARNING" ]; then
        echo "$YELLOW"
    else
        echo "$GREEN"
    fi
}

# Emit a horizontal rule of dashes through output().
print_line() {
    local rule="--------------------------------------------------------------------------------"
    output "$rule"
}

# Emit the report banner: a rule, the title with the current timestamp,
# and another rule.
print_header() {
    local stamp
    stamp=$(get_timestamp)
    print_line
    output "系统资源监控报告                 ${stamp}"
    print_line
}

# Main monitoring routine: collect CPU, memory, disk, load, process and
# uptime figures, print the report (mirrored to $REPORT_FILE when set) and
# append a summary plus any threshold alerts to the log.
# Globals: REPORT_FILE, CYAN, NC, CPU_*/MEM_*/DISK_* thresholds (read)
# Outputs: report on stdout via output(); log lines via log_message()
monitor_system() {
    # Start from an empty report file when one was requested.
    if [ -n "$REPORT_FILE" ]; then
        : > "$REPORT_FILE"
    fi

    print_header

    # CPU
    local cpu_usage cpu_status cpu_color
    cpu_usage=$(get_cpu_usage)
    cpu_status=$(check_threshold "$cpu_usage" "$CPU_WARNING" "$CPU_CRITICAL")
    cpu_color=$(get_status_color "$cpu_status")

    output "${CYAN}CPU使用率:${NC}        ${cpu_usage}%       [状态: ${cpu_color}${cpu_status}${NC}]"

    # Memory
    local mem_usage mem_status mem_color
    mem_usage=$(get_memory_usage)
    mem_status=$(check_threshold "$mem_usage" "$MEM_WARNING" "$MEM_CRITICAL")
    mem_color=$(get_status_color "$mem_status")

    output "${CYAN}内存使用率:${NC}     ${mem_usage}%       [状态: ${mem_color}${mem_status}${NC}]"
    output "${CYAN}内存详情:${NC}        $(get_memory_detail)"

    # Disk
    local disk_usage disk_status disk_color
    disk_usage=$(get_disk_usage)
    disk_status=$(check_threshold "$disk_usage" "$DISK_WARNING" "$DISK_CRITICAL")
    disk_color=$(get_status_color "$disk_status")

    output "${CYAN}磁盘使用率(根分区):${NC} ${disk_usage}%         [状态: ${disk_color}${disk_status}${NC}]"
    output "${CYAN}磁盘详情:${NC}"
    # IFS= and -r keep each table row verbatim; a plain `read` would strip
    # the leading indentation that get_disk_detail prints.
    local line
    get_disk_detail | while IFS= read -r line; do
        output "$line"
    done

    # Load average: sampled once so the report and the log agree.
    local load_avg
    load_avg=$(get_load_average)
    output "${CYAN}系统负载:${NC}        ${load_avg}"

    # Process count
    output "${CYAN}进程数量:${NC}        $(get_process_count)"

    # Uptime
    output "${CYAN}系统运行时间:${NC}  $(get_uptime)"

    print_line

    # Tell the user where the report went.
    if [ -n "$REPORT_FILE" ]; then
        output ""
        output "报告已保存到: $REPORT_FILE"
    fi

    # Summary line, then one extra entry per metric that crossed a threshold.
    log_message "INFO" "CPU: ${cpu_usage}%, MEM: ${mem_usage}%, DISK: ${disk_usage}%, Load: ${load_avg}"

    if [ "$cpu_status" = "WARNING" ] || [ "$cpu_status" = "CRITICAL" ]; then
        log_message "$cpu_status" "CPU使用率过高: ${cpu_usage}%"
    fi

    if [ "$mem_status" = "WARNING" ] || [ "$mem_status" = "CRITICAL" ]; then
        log_message "$mem_status" "内存使用率过高: ${mem_usage}%"
    fi

    if [ "$disk_status" = "WARNING" ] || [ "$disk_status" = "CRITICAL" ]; then
        log_message "$disk_status" "磁盘使用率过高: ${disk_usage}%"
    fi
}

# Continuous mode: clear the screen and rerun monitor_system every
# $interval until interrupted.
# Arguments: $1 - pause between runs (default 5); a number with an optional
#                 s/m/h/d suffix
# Returns:   1 when the interval is not valid or sleep fails; otherwise
#            loops until the script is interrupted
continuous_monitor() {
    local interval=${1:-5}

    # Reject anything sleep cannot handle: a failing sleep would otherwise
    # turn this loop into a busy loop.
    if ! [[ "$interval" =~ ^[0-9]+([.][0-9]+)?[smhd]?$ ]]; then
        echo "错误: 无效的间隔时间: ${interval}" >&2
        return 1
    fi

    echo "进入持续监控模式,间隔 ${interval} 秒..."
    echo "按 Ctrl+C 退出"
    while true; do
        clear
        monitor_system
        sleep "$interval" || return 1
    done
}

# Print the usage text, with the script's invocation name filled in.
show_help() {
    local script="$0"
    cat << EOF
用法: ${script} [选项]

选项:
    -c, --continuous [秒]   持续监控模式,默认间隔5秒
    -i, --interval 秒         设置监控间隔时间
    -o, --output 文件         将报告保存到指定文件
    -l, --log                 查看日志文件
    -h, --help                显示帮助信息

示例:
    ${script}                        单次监控
    ${script} -c                     持续监控(5秒间隔)
    ${script} -c 10                  持续监控(10秒间隔)
    ${script} -o /tmp/report.txt     保存报告到文件
    ${script} -o report.txt          保存报告到当前目录
    ${script} -l                     查看监控日志
EOF
}

# Show the last 50 lines of $LOG_FILE, or a notice when the file is absent.
view_log() {
    if [ ! -f "$LOG_FILE" ]; then
        echo "日志文件不存在: $LOG_FILE"
        return
    fi

    echo "监控日志内容:"
    print_line
    tail -n 50 "$LOG_FILE"
}

# Entry point: prepare the log file, parse all options, then run either a
# single report or the continuous mode.
# Arguments: the script's command-line arguments
# Globals:   LOG_FILE, REPORT_FILE (may be rewritten)
# Note: no external calculator is checked for; all arithmetic in this
#       script is done with awk.
main() {
    # Fall back to /tmp when the log directory cannot be created or the
    # log file cannot be appended to (e.g. when run without root).
    local log_dir
    log_dir=$(dirname "$LOG_FILE")
    if ! mkdir -p "$log_dir" 2>/dev/null || ! { : >> "$LOG_FILE"; } 2>/dev/null; then
        LOG_FILE="/tmp/system_monitor.log"
    fi

    # Options are collected first and acted on afterwards, so combinations
    # such as "-c -o file" or "-i 10 -o file" work in any order.
    local continuous=false
    local interval=5

    while [ $# -gt 0 ]; do
        case "$1" in
            -c|--continuous)
                continuous=true
                shift
                # The interval is optional: take the next word only when it
                # is not itself an option.
                if [ $# -gt 0 ] && [[ "$1" != -* ]]; then
                    interval="$1"
                    shift
                fi
                ;;
            -i|--interval)
                shift
                if [ -z "${1:-}" ]; then
                    echo "错误: 请指定间隔时间"
                    show_help
                    exit 1
                fi
                continuous=true
                interval="$1"
                shift
                ;;
            -o|--output)
                shift
                if [ -z "${1:-}" ]; then
                    echo "错误: 请指定输出文件路径"
                    show_help
                    exit 1
                fi
                REPORT_FILE="$1"
                shift
                ;;
            -l|--log)
                view_log
                exit 0
                ;;
            -h|--help)
                show_help
                exit 0
                ;;
            *)
                echo "未知选项: $1"
                show_help
                exit 1
                ;;
        esac
    done

    if [ "$continuous" = true ]; then
        continuous_monitor "$interval"
        exit $?
    fi

    # Default: a single report.
    monitor_system
}

# Script entry point: hand every command-line argument to main.
main "$@"

每天凌晨2点保存报告(用 crontab -e 添加下面这条定时任务;crontab 中的 % 有特殊含义,所以 date 的格式串里要写成 \%):

0 2 * * * /home/shells/system_monitor.sh -o /var/log/daily_report_$(date +\%Y\%m\%d).txt

2.监控网络连接

--------------------------------------------------------------------------------
网络监控报告           2026-05-18 14:07:40
--------------------------------------------------------------------------------
[网卡状态]
  网卡: eno16777728
    状态: UP
    IP地址: 192.168.31.43/24
    MAC地址: 00:0c:29:6e:18:be

[网关信息]
  网关: 192.168.31.1
  连通性: OK (延迟: 4.64ms)

[网络连通性]
  目标: 223.5.5.5
    状态: OK
    丢包率: 0%
    平均延迟: 18.905ms
    最小延迟: 16.574ms

  目标: 114.114.114.114
    状态: OK
    丢包率: 100%
    平均延迟: Unknownms
    最小延迟: Unknownms

  目标: 8.8.8.8
    状态: OK
    丢包率: 0%
    平均延迟: 44.771ms
    最小延迟: 44.522ms

[HTTP 服务检测]
  URL: http://www.baidu.com
    状态: OK (HTTP 200)
    响应时间: 0.041s

  URL: http://www.aliyun.com
    状态: 异常 (HTTP 403)

[DNS 解析]
  DNS 服务器:
    192.168.31.1
    114.114.114.114
    223.5.5.5

[连接统计]
  TCP总数: 9
  已建立: 2
  监听中: 4
  等待中: 3

[带宽统计]
  网卡: eno16777728
    接收: 65MiB
    发送: 1.6MiB

编写脚本来监控网络连接状态,检查网络是否正常。

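为什么会报错?-bash: ./network_monitor.sh: /bin/bash^M: bad interpreter: No such file or directory

原因:脚本文件使用了 Windows 风格的换行符 CRLF(\r\n),通常是在 Windows 下编辑或传输过。^M 就是回车符 \r,系统会把第一行的解释器路径当成 "/bin/bash\r",而这个路径并不存在。用下面的命令删除所有 \r 后再执行生成的 network_monitor_fixed.sh 即可(也可以使用 dos2unix network_monitor.sh):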

cat network_monitor.sh | tr -d '\r' > network_monitor_fixed.sh
#!/bin/bash

# Network Connection Monitor Script
# Check network status, connectivity, DNS, bandwidth

# ANSI color escape sequences, stored as literal backslash text
# (single quotes); printf's %b turns them into real escapes on output.
R='\033[0;31m'
G='\033[0;32m'
Y='\033[0;33m'
C='\033[0;36m'
N='\033[0m'

# Probe settings: PING_COUNT and PING_TIMEOUT are passed to ping as
# -c and -W; LOG_FILE receives log_message lines; REPORT_FILE is the
# optional report path (empty means "screen only").
PING_COUNT=4
PING_TIMEOUT=5
LOG_FILE="/var/log/network_monitor.log"
REPORT_FILE=""

# Default ping targets (more can be appended with -t)
PING_TARGETS=(
    "223.5.5.5"
    "114.114.114.114"
    "8.8.8.8"
)

# URLs probed by the HTTP check
URL_TARGETS=(
    "http://www.baidu.com"
    "http://www.aliyun.com"
)

# Print the current local time as "YYYY-MM-DD HH:MM:SS".
get_timestamp() {
    local fmt='+%Y-%m-%d %H:%M:%S'
    date "$fmt"
}

# Print one report line to the screen and, when $REPORT_FILE is set,
# append a color-free copy of it to that file.
# Arguments: $1 - line text; may contain backslash escapes such as \033[0;31m
output() {
    local line="$1"
    if [ -n "$REPORT_FILE" ]; then
        # Callers pass colors as literal "\033[..." text, so expand the line
        # with %b before stripping; only then are there real ESC bytes for
        # sed to remove. $'...' supplies a real ESC to any sed flavor.
        printf '%b\n' "$line" | sed $'s/\x1b\\[[0-9;]*m//g' >> "$REPORT_FILE"
    fi
    printf '%b\n' "$line"
}

# Append one "[timestamp] [LEVEL] message" line to $LOG_FILE.
# Arguments: $1 - severity label, $2 - message text
log_message() {
    local level="$1" message="$2"
    local stamp
    stamp=$(get_timestamp)
    printf '[%s] [%s] %s\n' "$stamp" "$level" "$message" >> "$LOG_FILE"
}

# Emit a horizontal rule of dashes through output().
print_line() {
    local rule="--------------------------------------------------------------------------------"
    output "$rule"
}

# Emit the report banner: a rule, the title with the current timestamp,
# and another rule.
print_header() {
    local stamp
    stamp=$(get_timestamp)
    print_line
    output "网络监控报告           ${stamp}"
    print_line
}

# Report state, first IPv4 address and MAC address of every interface
# except "lo". Uses `ip` when it knows the interface, `ifconfig` otherwise.
# Globals: R, G (read)
check_interface() {
    output "\033[0;36m[网卡状态]\033[0m"

    # Interface names come from the numbered header lines of `ip link show`.
    # A trailing "@peer" (shown for veth/VLAN devices, e.g. "eth0@if5") is
    # not part of the name, so it is cut off before the name is used in
    # further lookups.
    local interfaces
    interfaces=$(ip link show 2>/dev/null | awk -F: '/^[0-9]/ {
        name = $2
        gsub(/ /, "", name)
        sub(/@.*/, "", name)
        print name
    }')
    if [ -z "$interfaces" ]; then
        interfaces=$(ifconfig -a 2>/dev/null | grep "^[^ ]" | awk '{print $1}' | sed 's/://g')
    fi

    local iface status ip_addr mac_addr status_color link_info
    # Intentionally unquoted: one interface name per word.
    for iface in $interfaces; do
        if [ "$iface" = "lo" ]; then continue; fi
        status="DOWN"
        ip_addr="No IP"
        mac_addr="Unknown"
        if link_info=$(ip link show "$iface" 2>/dev/null); then
            if [[ "$link_info" == *"state UP"* ]]; then status="UP"; fi
            ip_addr=$(ip addr show "$iface" 2>/dev/null | awk '/inet / {print $2; exit}')
            mac_addr=$(printf '%s\n' "$link_info" | awk '/link\/ether/ {print $2}')
        elif ifconfig "$iface" &>/dev/null; then
            if ifconfig "$iface" | grep -q "RUNNING"; then status="UP"; fi
            ip_addr=$(ifconfig "$iface" 2>/dev/null | awk '/inet / {print $2; exit}')
            mac_addr=$(ifconfig "$iface" 2>/dev/null | awk '/ether/ {print $2}')
        fi
        status_color="$R"
        if [ "$status" = "UP" ]; then status_color="$G"; fi
        output "  网卡: \033[0;33m${iface}\033[0m"
        output "    状态: ${status_color}${status}\033[0m"
        output "    IP地址: ${ip_addr:-No IP}"
        output "    MAC地址: ${mac_addr:-Unknown}"
    done
    output ""
}

# Report the default gateway and whether it answers a single ping.
# The gateway is taken from `ip route`, falling back to `route -n`.
check_gateway() {
    output "\033[0;36m[网关信息]\033[0m"

    local gateway
    gateway=$(ip route 2>/dev/null | awk '/default/ {print $3; exit}')
    if [ -z "$gateway" ]; then
        gateway=$(route -n 2>/dev/null | awk '/^0\.0\.0\.0/ {print $2; exit}')
    fi

    if [ -z "$gateway" ]; then
        output "  网关: \033[0;31m未配置\033[0m"
        log_message "WARNING" "No default gateway found"
        output ""
        return
    fi

    output "  网关: \033[0;32m${gateway}\033[0m"

    # Ping once and reuse that output for the latency; a second ping would
    # double the wait and could fail after the first one succeeded.
    local ping_out delay
    if ping_out=$(ping -c 1 -W 3 "$gateway" 2>/dev/null); then
        delay=$(printf '%s\n' "$ping_out" | sed -n 's/.*time=\([0-9.]*\).*/\1/p' | head -1)
        output "  连通性: \033[0;32mOK\033[0m (延迟: ${delay:-Unknown}ms)"
    else
        output "  连通性: \033[0;31mFAILED\033[0m"
        log_message "CRITICAL" "Gateway ${gateway} unreachable"
    fi
    output ""
}

# Ping every address in PING_TARGETS and report status, packet loss and
# round-trip times for each.
# Globals: PING_TARGETS, PING_COUNT, PING_TIMEOUT (read)
check_internet() {
    output "\033[0;36m[网络连通性]\033[0m"

    local target ping_result packet_loss rtt_values min_rtt avg_rtt
    for target in "${PING_TARGETS[@]}"; do
        output "  目标: \033[0;33m${target}\033[0m"

        # ping's exit status is tested directly. Reading $? after
        # `local var=$(ping ...)` would report the status of `local`, which
        # is always 0, and mark unreachable hosts as OK.
        if ping_result=$(ping -c "${PING_COUNT}" -W "${PING_TIMEOUT}" "$target" 2>/dev/null); then
            # Anchor on the "% packet loss" text rather than a comma-separated
            # column, which moves when ping adds an "errors" field.
            packet_loss=$(printf '%s\n' "$ping_result" | grep -o '[0-9.]*% packet loss' | awk '{print $1}')
            # Summary line looks like "rtt min/avg/max/mdev = a/b/c/d ms".
            rtt_values=$(printf '%s\n' "$ping_result" | sed -n '/min\/avg\/max/ s/.*= *//p' | head -1)
            IFS='/' read -r min_rtt avg_rtt _ <<<"$rtt_values"
            if [ -z "$packet_loss" ]; then packet_loss="0%"; fi
            output "    状态: \033[0;32mOK\033[0m"
            output "    丢包率: ${packet_loss}"
            output "    平均延迟: ${avg_rtt:-Unknown}ms"
            output "    最小延迟: ${min_rtt:-Unknown}ms"
            log_message "INFO" "Ping ${target} OK, Loss: ${packet_loss}, RTT: ${avg_rtt}ms"
        else
            output "    状态: \033[0;31mFAILED\033[0m"
            output "    Reason: 主机不可达"
            log_message "CRITICAL" "Ping ${target} failed"
        fi
        output ""
    done
}

# Probe every URL in URL_TARGETS with curl (or wget when curl is missing)
# and report reachability, HTTP status code and response time.
# Globals: URL_TARGETS (read)
check_http() {
    output "\033[0;36m[HTTP 服务检测]\033[0m"

    local url curl_out http_code response_time
    for url in "${URL_TARGETS[@]}"; do
        output "  URL: \033[0;33m${url}\033[0m"
        if command -v curl &>/dev/null; then
            # One request yields both the status code and the total time.
            # curl's exit status is tested directly: reading $? after
            # `local var=$(curl ...)` would always give 0, so connection
            # failures could never reach the "unreachable" branch.
            if curl_out=$(curl -o /dev/null -s -w '%{http_code} %{time_total}' \
                --connect-timeout 10 --max-time 15 "$url" 2>/dev/null); then
                read -r http_code response_time <<<"$curl_out"
                if [ "$http_code" = "200" ]; then
                    output "    状态: \033[0;32mOK\033[0m (HTTP ${http_code})"
                    output "    响应时间: ${response_time}s"
                    log_message "INFO" "HTTP ${url} OK, Code: ${http_code}"
                else
                    output "    状态: \033[0;33m异常\033[0m (HTTP ${http_code})"
                    log_message "WARNING" "HTTP ${url} returned ${http_code}"
                fi
            else
                output "    状态: \033[0;31m不可达\033[0m"
                log_message "CRITICAL" "HTTP ${url} failed"
            fi
        elif command -v wget &>/dev/null; then
            if wget --timeout=10 --tries=1 -q -O /dev/null "$url" 2>/dev/null; then
                output "    状态: \033[0;32mOK\033[0m"
                log_message "INFO" "HTTP ${url} OK (wget)"
            else
                output "    状态: \033[0;31m不可达\033[0m"
                log_message "CRITICAL" "HTTP ${url} failed (wget)"
            fi
        else
            output "    状态: \033[0;33m已跳过\033[0m (curl/wget 未安装)"
        fi
        output ""
    done
}

# List the DNS servers configured in a resolv.conf file.
# Arguments: $1 - resolver configuration path (optional,
#                 default /etc/resolv.conf)
check_dns() {
    local resolv_conf="${1:-/etc/resolv.conf}"

    output "\033[0;36m[DNS 解析]\033[0m"

    # Only lines whose first word is exactly "nameserver" count; matching
    # the word anywhere would also pick up commented-out entries.
    local dns_servers
    dns_servers=$(awk '$1 == "nameserver" {print $2}' "$resolv_conf" 2>/dev/null)

    if [ -z "$dns_servers" ]; then
        output "  DNS 服务器: \033[0;31m未配置\033[0m"
        log_message "WARNING" "DNS not configured"
    else
        output "  DNS 服务器:"
        local dns
        # Intentionally unquoted: one address per word.
        for dns in $dns_servers; do
            output "    ${dns}"
        done
    fi
    output ""
}

# Report connection counts (established / listening / time-wait) using ss,
# or netstat when ss is missing. With ss the TCP total from `ss -s` is
# shown as well.
check_connections() {
    output "\033[0;36m[连接统计]\033[0m"

    local established listen time_wait

    if command -v ss &>/dev/null; then
        local total_conn
        total_conn=$(ss -s 2>/dev/null | awk '/TCP:/ {print $2}')
        established=$(ss -tunap 2>/dev/null | grep -c "ESTAB")
        listen=$(ss -tunap 2>/dev/null | grep -c "LISTEN")
        time_wait=$(ss -tunap 2>/dev/null | grep -c "TIME-WAIT")
        output "  TCP总数: ${total_conn:-Unknown}"
        output "  已建立: ${established}"
        output "  监听中: ${listen}"
        output "  等待中: ${time_wait}"
    elif command -v netstat &>/dev/null; then
        established=$(netstat -tunap 2>/dev/null | grep -c "ESTABLISHED")
        listen=$(netstat -tunap 2>/dev/null | grep -c "LISTEN")
        time_wait=$(netstat -tunap 2>/dev/null | grep -c "TIME_WAIT")
        output "  已建立: ${established}"
        output "  监听中: ${listen}"
        output "  等待中: ${time_wait}"
    else
        output "  ss/netstat 未安装,已跳过"
    fi

    output ""
}

# Report cumulative received/sent byte counters for every interface that
# is up (except "lo"), in human-readable form when numfmt is available.
check_bandwidth() {
    output "\033[0;36m[带宽统计]\033[0m"

    # A trailing "@peer" (shown by `ip link` for veth/VLAN devices) is not
    # part of the interface name; with it the /sys lookup below would miss.
    local interfaces
    interfaces=$(ip link show 2>/dev/null | awk -F: '/state UP/ {
        name = $2
        gsub(/ /, "", name)
        sub(/@.*/, "", name)
        print name
    }')
    if [ -z "$interfaces" ]; then
        interfaces=$(ifconfig 2>/dev/null | grep "RUNNING" | awk -F: '{print $1}' | sed 's/ //g')
    fi

    local iface rx_bytes tx_bytes rx_human tx_human
    # Intentionally unquoted: one interface name per word.
    for iface in $interfaces; do
        if [ "$iface" = "lo" ]; then continue; fi
        rx_bytes="0"
        tx_bytes="0"
        if [ -f "/sys/class/net/${iface}/statistics/rx_bytes" ]; then
            rx_bytes=$(cat "/sys/class/net/${iface}/statistics/rx_bytes" 2>/dev/null)
            tx_bytes=$(cat "/sys/class/net/${iface}/statistics/tx_bytes" 2>/dev/null)
        elif ifconfig "$iface" &>/dev/null; then
            # Expects "RX bytes:N (...)  TX bytes:N (...)" on one line.
            rx_bytes=$(ifconfig "$iface" 2>/dev/null | grep "RX bytes" | awk -F'bytes:' '{print $2}' | awk '{print $1}')
            tx_bytes=$(ifconfig "$iface" 2>/dev/null | grep "TX bytes" | awk -F'bytes:' '{print $3}' | awk '{print $1}')
        fi
        # A failed read leaves the counter empty; report it as 0.
        rx_bytes=${rx_bytes:-0}
        tx_bytes=${tx_bytes:-0}
        rx_human=$(numfmt --to=iec-i --suffix=B "$rx_bytes" 2>/dev/null || echo "${rx_bytes}B")
        tx_human=$(numfmt --to=iec-i --suffix=B "$tx_bytes" 2>/dev/null || echo "${tx_bytes}B")
        output "  网卡: \033[0;33m${iface}\033[0m"
        output "    接收: ${rx_human}"
        output "    发送: ${tx_human}"
    done
    output ""
}

# Run every network check in order, framed by the header and a closing
# rule. When $REPORT_FILE is set it is emptied first and its path is
# announced at the end.
monitor_network() {
    if [ -n "$REPORT_FILE" ]; then
        : > "$REPORT_FILE"
    fi

    local step
    for step in \
        print_header \
        check_interface \
        check_gateway \
        check_internet \
        check_http \
        check_dns \
        check_connections \
        check_bandwidth \
        print_line; do
        "$step"
    done

    if [ -n "$REPORT_FILE" ]; then
        output ""
        output "报告已保存至: $REPORT_FILE"
    fi
}

# Continuous mode: clear the screen and rerun monitor_network every
# $interval until interrupted.
# Arguments: $1 - pause between runs (default 30); a number with an
#                 optional s/m/h/d suffix
# Returns:   1 when the interval is not valid or sleep fails; otherwise
#            loops until the script is interrupted
continuous_monitor() {
    local interval=${1:-30}

    # Reject anything sleep cannot handle: a failing sleep would otherwise
    # turn this loop into a busy loop.
    if ! [[ "$interval" =~ ^[0-9]+([.][0-9]+)?[smhd]?$ ]]; then
        echo "错误: 无效的间隔时间: ${interval}" >&2
        return 1
    fi

    echo "连续监控模式,间隔: ${interval}s..."
    echo "按 Ctrl+C 退出"
    while true; do
        clear
        monitor_network
        sleep "$interval" || return 1
    done
}

# Print the usage text, with the script's invocation name filled in.
show_help() {
    local script="$0"
    cat << EOF
用法: ${script} [options]

选项:
    -c, --continuous [sec]  连续模式,默认30秒
    -i, --interval sec      设置监控间隔
    -o, --output file       保存报告到文件
    -t, --target IP         添加自定义ping目标
    -l, --log               查看日志文件
    -h, --help              显示帮助

示例:
    ${script}                      单次检查
    ${script} -c                   连续监控 (30秒)
    ${script} -c 60                连续监控 (60秒)
    ${script} -o /tmp/report.txt   保存到文件
    ${script} -t 192.168.1.1       添加目标
    ${script} -l                   查看日志
EOF
}

# Show the last 50 lines of $LOG_FILE, or a notice when the file is absent.
view_log() {
    if [ ! -f "$LOG_FILE" ]; then
        echo "日志文件未找到: $LOG_FILE"
        return
    fi

    echo "网络监控日志:"
    print_line
    tail -n 50 "$LOG_FILE"
}

# Entry point: prepare the log file, parse all options, then run either a
# single check or the continuous mode.
# Arguments: the script's command-line arguments
# Globals:   LOG_FILE, REPORT_FILE, PING_TARGETS (may be rewritten)
main() {
    # Fall back to /tmp when the log directory cannot be created or the
    # log file cannot be appended to (e.g. when run without root).
    local log_dir
    log_dir=$(dirname "$LOG_FILE")
    if ! mkdir -p "$log_dir" 2>/dev/null || ! { : >> "$LOG_FILE"; } 2>/dev/null; then
        LOG_FILE="/tmp/network_monitor.log"
    fi

    # Options are collected first and acted on afterwards, so combinations
    # such as "-c -t IP" or "-i 10 -o file" work in any order.
    local continuous=false
    local interval=30

    while [ $# -gt 0 ]; do
        case "$1" in
            -c|--continuous)
                continuous=true
                shift
                # The interval is optional: take the next word only when it
                # is not itself an option.
                if [ $# -gt 0 ] && [[ "$1" != -* ]]; then
                    interval="$1"
                    shift
                fi
                ;;
            -i|--interval)
                shift
                if [ -z "${1:-}" ]; then
                    echo "错误: 请指定间隔时间"
                    show_help
                    exit 1
                fi
                continuous=true
                interval="$1"
                shift
                ;;
            -o|--output)
                shift
                if [ -z "${1:-}" ]; then
                    echo "错误: 请指定输出文件"
                    show_help
                    exit 1
                fi
                REPORT_FILE="$1"
                shift
                ;;
            -t|--target)
                shift
                if [ -z "${1:-}" ]; then
                    echo "错误: 请指定目标IP"
                    show_help
                    exit 1
                fi
                PING_TARGETS+=("$1")
                shift
                ;;
            -l|--log)
                view_log
                exit 0
                ;;
            -h|--help)
                show_help
                exit 0
                ;;
            *)
                echo "未知选项: $1"
                show_help
                exit 1
                ;;
        esac
    done

    if [ "$continuous" = true ]; then
        continuous_monitor "$interval"
        exit $?
    fi

    monitor_network
}

# Script entry point: hand every command-line argument to main.
main "$@"

3.监控服务进程

编写脚本来监控服务状态,如Nginx、MySQL,或者是特定的进程。

--------------------------------------------------------------------------------
服务状态监控报告           2026-05-18 15:04:35
--------------------------------------------------------------------------------
[系统服务检查]

服务: nginx
    状态: 已停止

服务: mysql
    状态: 已停止

服务: mariadb
    状态: 运行中 (active)
    运行时间: Mon 2026-05-18 14:37:58 CST 
    CPU占用: 0.1%
    内存占用: 2.2% (84524KB)
    线程数: 19
    打开文件: 34

服务: redis
    状态: 已停止

服务: ssh
    状态: 运行中 (active)
    运行时间: Mon 2026-05-18 12:11:27 CST 
    CPU占用: 0.0%
    内存占用: 0.1% (3584KB)
    线程数: 1
    打开文件: 5

[检查汇总]
  总服务数: 5
  正常: 2
  异常: 3

--------------------------------------------------------------------------------

测试案例

sudo ./service_monitor.sh -p "python3||python3 -m http.server 8080" -a

编写脚本

#!/bin/bash

# 服务状态监控脚本
# 支持监控: Nginx, MySQL, Redis, SSH, 以及自定义进程
# 支持: 单次检查 / 连续监控 / 自动重启 / 告警通知

# ANSI color escape sequences, stored as literal backslash text
# (single quotes); printf's %b turns them into real escapes on output.
R='\033[0;31m'
G='\033[0;32m'
Y='\033[0;33m'
C='\033[0;36m'
N='\033[0m'

# Default configuration
CHECK_INTERVAL=30
LOG_FILE="/var/log/service_monitor.log"
REPORT_FILE=""
AUTO_RESTART=false
NOTIFY_CMD=""

# Service table
# Entry format: "display name|check method|check parameter|start command"
# Check method: systemctl / port / process
SERVICES=(
    "nginx|systemctl|nginx|systemctl start nginx"
    "mysql|systemctl|mysqld|systemctl start mysqld"
    "mariadb|systemctl|mariadb|systemctl start mariadb"
    "redis|systemctl|redis|systemctl start redis"
    "ssh|systemctl|sshd|systemctl start sshd"
)

# Custom processes (checked by process name or by PID file)
# Entry format: "process name|PID file path|start command"
# Examples:
#   CUSTOM_PROCESSES=("java|/var/run/tomcat.pid|systemctl start tomcat")
#   CUSTOM_PROCESSES=("redis-server||redis-server /etc/redis.conf")
#   CUSTOM_PROCESSES=("myapp||/opt/myapp/bin/start.sh")
# The PID file path may be empty; the process is then matched by name.
CUSTOM_PROCESSES=()

# Print the current local time as "YYYY-MM-DD HH:MM:SS".
get_timestamp() {
    date +'%Y-%m-%d %T'
}

# Print one report line to the screen and, when $REPORT_FILE is set,
# append a color-free copy of it to that file.
# Arguments: $1 - line text; may contain backslash escapes such as \033[0;31m
output() {
    local line="$1"
    if [ -n "$REPORT_FILE" ]; then
        # The color variables hold literal "\033[..." text, so expand the
        # line with %b before stripping; only then are there real ESC bytes
        # for sed to remove. $'...' supplies a real ESC to any sed flavor.
        printf '%b\n' "$line" | sed $'s/\x1b\\[[0-9;]*m//g' >> "$REPORT_FILE"
    fi
    printf '%b\n' "$line"
}

# Append one "[timestamp] [LEVEL] message" line to $LOG_FILE.
# Arguments: $1 - severity label, $2 - message text
log_message() {
    local entry
    entry="[$(get_timestamp)] [$1] $2"
    printf '%s\n' "$entry" >> "$LOG_FILE"
}

# Emit a horizontal rule of dashes through output().
print_line() {
    local rule="--------------------------------------------------------------------------------"
    output "$rule"
}

# Emit the report banner: a rule, the title with the current timestamp,
# and another rule.
print_header() {
    local stamp
    stamp=$(get_timestamp)
    print_line
    output "服务状态监控报告           ${stamp}"
    print_line
}

# Check a service through systemctl and report its state.
# Arguments: $1 - display name, $2 - systemd unit name
# Returns:   0 when the unit is active; 1 when it is not active or
#            systemctl is unavailable
check_systemctl_service() {
    local service_name="$1"
    local system_name="$2"

    # Every outcome starts with the same heading line.
    output "服务: ${Y}${service_name}${N}"

    if ! command -v systemctl &>/dev/null; then
        output "    状态: ${R}无法检查${N} (systemctl 不可用)"
        return 1
    fi

    if ! systemctl is-active --quiet "$system_name" 2>/dev/null; then
        output "    状态: ${R}已停止${N}"
        log_message "CRITICAL" "${service_name} 已停止"
        return 1
    fi

    local active_state since
    active_state=$(systemctl is-active "$system_name" 2>/dev/null)
    # Keep only the text between "since " and the first ";" of the Active: line.
    since=$(systemctl status "$system_name" 2>/dev/null | grep "Active:" | sed 's/.*since //;s/;.*/ /')
    output "    状态: ${G}运行中${N} (${active_state})"
    output "    运行时间: ${since:-未知}"

    # Find a PID for the resource report: MainPID from systemctl first, then
    # an exact process-name match, then a full command-line match. Each
    # fallback runs only while the PID so far is empty, 0 or not signalable.
    local main_pid lookup
    main_pid=$(systemctl show "$system_name" --property=MainPID --value 2>/dev/null)
    for lookup in -x -f; do
        if [ -z "$main_pid" ] || [ "$main_pid" = "0" ] || ! kill -0 "$main_pid" 2>/dev/null; then
            main_pid=$(pgrep "$lookup" "$system_name" 2>/dev/null | head -1)
        fi
    done

    check_service_resources "$service_name" "$main_pid"
    log_message "INFO" "${service_name} 运行正常"
    return 0
}

# Check a service by looking for a listening socket on a port.
# Arguments: $1 - display name
#            $2 - port number
#            $3 - protocol, "tcp" (default) or "udp"
# Returns:   0 when a socket is listening on the port, 1 otherwise
check_port_service() {
    local service_name="$1"
    local port="$2"
    local proto="${3:-tcp}"

    # Honor the protocol argument: -t lists TCP sockets, -u lists UDP ones.
    local sock_opts="-tlnp"
    if [ "$proto" = "udp" ]; then
        sock_opts="-ulnp"
    fi

    # Query the socket table once and reuse the listing.
    local listing=""
    if command -v ss &>/dev/null; then
        listing=$(ss "$sock_opts" 2>/dev/null)
    elif command -v netstat &>/dev/null; then
        listing=$(netstat "$sock_opts" 2>/dev/null)
    fi

    # ":PORT " (with the trailing space) keeps 80 from matching 8080.
    local matches
    matches=$(printf '%s\n' "$listing" | grep -F -- ":${port} ")

    if [ -n "$matches" ]; then
        # ss prints the owner as "pid=N"; netstat prints "N/program".
        local port_pid
        port_pid=$(printf '%s\n' "$matches" | grep -o 'pid=[0-9][0-9]*' | head -1 | cut -d= -f2)
        if [ -z "$port_pid" ]; then
            port_pid=$(printf '%s\n' "$matches" | grep -oE '(^|[[:space:]])[0-9]+/' | head -1 | tr -dc '0-9')
        fi

        output "服务: ${Y}${service_name}${N}"
        output "    状态: ${G}运行中${N} (端口 ${port} 监听中)"
        check_service_resources "$service_name" "$port_pid"
        log_message "INFO" "${service_name} 端口 ${port} 正常"
        return 0
    fi

    output "服务: ${Y}${service_name}${N}"
    output "    状态: ${R}已停止${N} (端口 ${port} 未监听)"
    log_message "CRITICAL" "${service_name} 端口 ${port} 未监听"
    return 1
}

# Check whether a process is running and report it.
# Lookup order: PID file (when given and present), exact process-name
# match, then a match against full command lines.
# Arguments: $1 - process name or command-line pattern
#            $2 - PID file path (may be empty)
# Returns:   0 when a matching live process was found, 1 otherwise
check_process() {
    local process_name="$1"
    local pid_file="$2"

    local pid=""
    local match_note=""
    # Captured here because $BASHPID changes inside the pipelines below.
    local self_pid=$$ shell_pid=$BASHPID parent_pid=$PPID

    # 1. PID file, accepted only when the PID is still alive.
    if [ -n "$pid_file" ] && [ -f "$pid_file" ]; then
        pid=$(cat "$pid_file" 2>/dev/null)
        if [ -z "$pid" ] || ! kill -0 "$pid" 2>/dev/null; then
            pid=""
        fi
    fi

    # 2. Exact process-name match.
    if [ -z "$pid" ]; then
        pid=$(pgrep -x "$process_name" 2>/dev/null | head -1)
    fi

    # 3. Command-line match. This script's own command line (and that of
    #    its parent, e.g. sudo) can contain the pattern when it was passed
    #    as an option, so those PIDs are excluded to avoid reporting the
    #    monitor itself as the monitored process.
    if [ -z "$pid" ]; then
        pid=$(pgrep -f "$process_name" 2>/dev/null \
            | grep -vx -e "$self_pid" -e "$shell_pid" -e "$parent_pid" \
            | head -1)
        if [ -n "$pid" ]; then
            match_note=", 模糊匹配"
        fi
    fi

    output "进程: ${Y}${process_name}${N}"

    if [ -z "$pid" ]; then
        output "    状态: ${R}未运行${N}"
        log_message "CRITICAL" "${process_name} 未运行"
        return 1
    fi

    output "    状态: ${G}运行中${N} (PID: ${pid}${match_note})"
    check_service_resources "$process_name" "$pid"
    log_message "INFO" "${process_name} 运行正常 (PID: ${pid})"
    return 0
}

# Print CPU share, memory share, thread count and open-file count of one
# process, read from /proc. Prints nothing when the PID is empty or cannot
# be signalled.
# Arguments: $1 - display name (kept for interface symmetry; not printed)
#            $2 - PID to inspect
check_service_resources() {
    local service_name="$1"
    local pid="$2"

    if [ -z "$pid" ] || ! kill -0 "$pid" 2>/dev/null; then
        return
    fi

    # CPU: (utime + stime) over the process lifetime, as a percentage.
    local cpu_usage="0"
    if [ -r "/proc/${pid}/stat" ]; then
        local hz uptime_s
        hz=$(getconf CLK_TCK 2>/dev/null || echo 100)
        uptime_s=$(awk '{print $1}' /proc/uptime 2>/dev/null)
        # The command name in field 2 is wrapped in parentheses and may
        # contain spaces, so everything up to the last ") " is dropped
        # first. After that stat fields 14/15/22 are at positions 12/13/20.
        cpu_usage=$(sed 's/^.*) //' "/proc/${pid}/stat" 2>/dev/null \
            | awk -v hz="$hz" -v up="${uptime_s:-0}" '{
                busy = ($12 + $13) / hz
                alive = up - $20 / hz
                if (alive > 0) printf "%.1f", 100 * busy / alive
                else printf "0"
            }')
        cpu_usage=${cpu_usage:-0}
    fi

    # Memory: resident set size (kB) relative to MemTotal.
    local mem_usage="0"
    local mem_rss=""
    if [ -r "/proc/${pid}/status" ]; then
        mem_rss=$(awk '/^VmRSS:/ {print $2}' "/proc/${pid}/status" 2>/dev/null)
        local mem_total
        mem_total=$(awk '/^MemTotal:/ {print $2}' /proc/meminfo 2>/dev/null)
        if [ -n "$mem_rss" ] && [ -n "$mem_total" ] && [ "$mem_total" -gt 0 ]; then
            mem_usage=$(awk -v rss="$mem_rss" -v total="$mem_total" \
                'BEGIN {printf "%.1f", 100 * rss / total}')
        fi
    fi
    # No VmRSS line was found: report zero.
    mem_rss=${mem_rss:-0}

    # Human-readable size. The value is a plain number of 1024-byte units,
    # so it is passed without a suffix; a "K" suffix is rejected by numfmt
    # unless --from is given, which forced the KB fallback every time.
    local mem_human="${mem_rss}KB"
    if command -v numfmt &>/dev/null; then
        mem_human=$(numfmt --to=iec-i --suffix=B --from-unit=1024 "$mem_rss" 2>/dev/null \
            || echo "${mem_rss}KB")
    fi

    # Thread count: one entry per thread under task/.
    local threads
    threads=$(ls "/proc/${pid}/task" 2>/dev/null | wc -l)

    # Open files: one entry per descriptor under fd/.
    local open_files
    open_files=$(ls "/proc/${pid}/fd" 2>/dev/null | wc -l)

    output "    CPU占用: ${cpu_usage}%"
    output "    内存占用: ${mem_usage}% (${mem_human})"
    output "    线程数: ${threads}"
    output "    打开文件: ${open_files}"
}

# Try to start a stopped service, verify it, and send a notification.
# Does nothing (returns 1) unless $AUTO_RESTART is "true".
# Arguments: $1 - service name, $2 - start command (shell command line)
# Globals:   AUTO_RESTART, NOTIFY_CMD (read)
# Returns:   0 when the service is running after the restart, 1 otherwise
auto_restart_service() {
    local service_name="$1"
    local start_cmd="$2"

    if [ "$AUTO_RESTART" != true ]; then
        return 1
    fi

    output "    操作: ${Y}尝试自动重启...${N}"
    log_message "WARNING" "尝试自动重启 ${service_name}"

    # NOTE(review): start_cmd and NOTIFY_CMD are eval'd as shell code. That
    # is fine for values from the script's own configuration, but they must
    # never come from untrusted input.
    local restarted=false
    if eval "$start_cmd" &>/dev/null; then
        # Give the service a moment to come up before verifying.
        sleep 2
        if check_service_quick "$service_name"; then
            restarted=true
        fi
    fi

    local notify_msg
    if [ "$restarted" = true ]; then
        output "    结果: ${G}重启成功${N}"
        log_message "INFO" "${service_name} 自动重启成功"
        notify_msg="${service_name} 已自动重启"
    else
        output "    结果: ${R}重启失败${N}"
        log_message "CRITICAL" "${service_name} 自动重启失败"
        notify_msg="${service_name} 自动重启失败,请人工介入"
    fi

    if [ -n "$NOTIFY_CMD" ]; then
        # The message is referenced by name inside the eval'd text, so it
        # reaches the notify command as one argument. Passing it to eval as
        # a separate word would re-split it on spaces and re-interpret any
        # shell characters in the service name.
        eval "$NOTIFY_CMD"' "$notify_msg"' &>/dev/null
    fi

    [ "$restarted" = true ]
}

# Quick liveness probe used to verify a restart.
# Arguments: $1 - service / process name
# Returns:   0 when systemctl reports the unit active, or when pgrep finds
#            the name exactly or anywhere in a command line; 1 otherwise
check_service_quick() {
    local name="$1"

    if command -v systemctl &>/dev/null; then
        systemctl is-active --quiet "$name" 2>/dev/null && return 0
    fi

    pgrep -x "$name" &>/dev/null && return 0
    pgrep -f "$name" &>/dev/null && return 0

    return 1
}

# Normalize a service entry to its first four "|"-separated fields
# (name|method|check parameter|start command) and print them re-joined.
# Arguments: $1 - raw configuration entry
parse_service() {
    local config="$1"
    local -a fields=()
    local idx

    for idx in 1 2 3 4; do
        fields+=("$(cut -d'|' -f"$idx" <<<"$config")")
    done

    # Join the four fields with "|" again.
    local IFS='|'
    echo "${fields[*]}"
}

# Check every entry in SERVICES and CUSTOM_PROCESSES, attempt a restart
# for failed entries that have a start command, and print a summary.
# Globals: SERVICES, CUSTOM_PROCESSES, C, Y, G, R, N (read)
check_all_services() {
    local failed_count=0
    local total_count=0
    local service_config name method check_param start_cmd check_result
    local proc_config proc_name pid_file

    output "${C}[系统服务检查]${N}"
    output ""

    for service_config in "${SERVICES[@]}"; do
        total_count=$((total_count + 1))

        # Split "name|method|parameter|start command" with read: missing
        # fields come back empty, and the last variable takes the rest of
        # the entry, so a start command may itself contain "|". (cut would
        # repeat the whole entry for every field when no "|" is present.)
        IFS='|' read -r name method check_param start_cmd <<<"$service_config"

        check_result=1

        case "$method" in
            systemctl)
                check_systemctl_service "$name" "$check_param"
                check_result=$?
                ;;
            port)
                check_port_service "$name" "$check_param"
                check_result=$?
                ;;
            process)
                check_process "$check_param" ""
                check_result=$?
                ;;
            *)
                output "服务: ${Y}${name}${N}"
                output "    状态: ${R}未知检查方式${N} (${method})"
                check_result=1
                ;;
        esac

        if [ "$check_result" -ne 0 ]; then
            failed_count=$((failed_count + 1))
            if [ -n "$start_cmd" ]; then
                auto_restart_service "$name" "$start_cmd"
            fi
        fi

        output ""
    done

    # Custom processes: "process name|PID file|start command"
    if [ ${#CUSTOM_PROCESSES[@]} -gt 0 ]; then
        output "${C}[自定义进程检查]${N}"
        output ""

        for proc_config in "${CUSTOM_PROCESSES[@]}"; do
            total_count=$((total_count + 1))

            IFS='|' read -r proc_name pid_file start_cmd <<<"$proc_config"

            check_process "$proc_name" "$pid_file"
            check_result=$?

            if [ "$check_result" -ne 0 ]; then
                failed_count=$((failed_count + 1))
                if [ -n "$start_cmd" ]; then
                    auto_restart_service "$proc_name" "$start_cmd"
                fi
            fi

            output ""
        done
    fi

    # Summary
    output "${C}[检查汇总]${N}"
    output "  总服务数: ${total_count}"
    output "  正常: ${G}$((total_count - failed_count))${N}"
    output "  异常: ${R}${failed_count}${N}"

    if [ "$failed_count" -gt 0 ]; then
        log_message "WARNING" "检查完成: ${failed_count}/${total_count} 个服务异常"
    else
        log_message "INFO" "检查完成: 所有服务正常"
    fi

    output ""
}

# Run one complete monitoring pass: header, all checks, closing line.
# Globals: REPORT_FILE (read) - when non-empty the file is emptied before the
#          pass and its path is announced afterwards.
monitor_services() {
    # Start from an empty report so each pass does not append to the last one.
    if [[ -n "$REPORT_FILE" ]]; then
        printf '' > "$REPORT_FILE"
    fi

    print_header
    check_all_services
    print_line

    # Tell the user where the report ended up.
    if [[ -n "$REPORT_FILE" ]]; then
        output ""
        output "报告已保存至: $REPORT_FILE"
    fi
}

# Continuous monitoring: clear the screen and re-run monitor_services forever.
# Globals:   CHECK_INTERVAL (read) - default interval when $1 is not given
# Arguments: $1 - optional pause between passes, as accepted by sleep
#                 (a number, optionally with an s/m/h/d suffix)
# Returns:   1 if the interval is not usable; otherwise loops until the
#            process is interrupted.
continuous_monitor() {
    # Fall back to 30 (the default the help text advertises) when neither an
    # argument nor CHECK_INTERVAL is available.
    local interval=${1:-${CHECK_INTERVAL:-30}}
    local interval_re='^[0-9]+([.][0-9]+)?[smhd]?$'

    # An invalid interval makes `sleep` fail immediately, which would turn the
    # loop below into a busy loop; refuse to start instead.
    if [[ ! "$interval" =~ $interval_re ]]; then
        echo "错误: 无效的监控间隔: ${interval}" >&2
        return 1
    fi

    echo "连续监控模式,间隔: ${interval}秒..."
    echo "按 Ctrl+C 退出"
    echo ""

    while true; do
        clear
        monitor_services
        sleep "$interval"
    done
}

# Print the usage text to stdout.
# The here-document delimiter is unquoted on purpose so that $0 expands to
# the name the script was invoked with.
show_help() {
    cat << EOF
用法: $0 [选项]

选项:
    -c, --continuous [秒]     连续监控模式,默认30秒
    -i, --interval 秒         设置监控间隔
    -o, --output 文件         保存报告到文件
    -a, --auto-restart        服务异常时自动重启
    -n, --notify 命令         设置告警通知命令
    -s, --service 配置        添加自定义服务
    -p, --process 配置        添加自定义进程
                              格式: "进程名|PID文件|启动命令"
                              示例: "java|/var/run/tomcat.pid|systemctl start tomcat"
                              PID文件可为空
    -l, --log                 查看日志
    -h, --help                显示帮助

服务配置格式: 名称|检查方式|检查参数|启动命令
    检查方式: systemctl / port / process

进程配置格式: 名称|PID文件|启动命令
    PID文件可为空

示例:
    $0                          单次检查
    $0 -c                       连续监控 (30秒)
    $0 -c 60                    连续监控 (60秒)
    $0 -o /tmp/report.txt       保存报告
    $0 -a                       启用自动重启
    $0 -n "mail -s '告警' admin@example.com"   邮件告警
    $0 -s "nginx|port|80|systemctl start nginx"  添加端口检查
    $0 -p "myapp||/opt/myapp/start.sh"           添加自定义进程
    $0 -s "tomcat|process|java|systemctl start tomcat"  添加Tomcat

预置服务:
    nginx, mysql, redis, ssh
EOF
}

# Show the last 50 lines of the monitor log.
# Globals: LOG_FILE (read)
# Outputs: a heading, a separator from print_line and the log tail, or a
#          "not found" message when LOG_FILE is not a regular file.
view_log() {
    # Nothing to show: report it and leave.
    if [[ ! -f "$LOG_FILE" ]]; then
        echo "日志文件未找到: $LOG_FILE"
        return
    fi

    echo "服务监控日志:"
    print_line
    tail -n 50 "$LOG_FILE"
}

# Register an additional service definition.
# Globals:   SERVICES (appended to)
# Arguments: $1 - service config string, stored verbatim
add_service() {
    local entry="${1}"
    SERVICES+=("${entry}")
}

# Register an additional custom-process definition.
# Globals:   CUSTOM_PROCESSES (appended to)
# Arguments: $1 - process config string, stored verbatim
add_process() {
    local entry="${1}"
    CUSTOM_PROCESSES+=("${entry}")
}

# Entry point: parse command-line options, then run a single check or the
# continuous monitor.
# Globals:   LOG_FILE (read; replaced by a /tmp path if its directory cannot
#            be created), CHECK_INTERVAL, REPORT_FILE, AUTO_RESTART,
#            NOTIFY_CMD (written from options)
# Arguments: the script's command-line arguments
# Exits:     0 after -l / -h or when the continuous monitor returns,
#            1 on an unknown option or a missing option argument.
main() {
    # Make sure the log directory exists; fall back to /tmp if it cannot be
    # created.
    local log_dir
    log_dir=$(dirname "$LOG_FILE")
    if [ ! -d "$log_dir" ]; then
        mkdir -p "$log_dir" 2>/dev/null || LOG_FILE="/tmp/service_monitor.log"
    fi

    # Continuous mode is only recorded here and started after ALL options have
    # been parsed. Starting it from inside the loop made every option that
    # followed -c / -i (e.g. -a, -o, -s) silently ineffective.
    local continuous=false
    local interval_re='^[0-9]+([.][0-9]+)?[smhd]?$'

    while [ $# -gt 0 ]; do
        case "$1" in
            -c|--continuous)
                continuous=true
                shift
                # The interval is optional: consume the next word only when it
                # looks like one, so "-c -a" no longer uses "-a" as interval.
                if [[ "${1:-}" =~ $interval_re ]]; then
                    CHECK_INTERVAL="$1"
                    shift
                fi
                ;;
            -i|--interval)
                shift
                if [ -z "${1:-}" ]; then
                    echo "错误: 请指定间隔时间"
                    show_help
                    exit 1
                fi
                CHECK_INTERVAL="$1"
                # -i has always implied continuous monitoring; keep that.
                continuous=true
                shift
                ;;
            -o|--output)
                shift
                if [ -z "${1:-}" ]; then
                    echo "错误: 请指定输出文件"
                    show_help
                    exit 1
                fi
                REPORT_FILE="$1"
                shift
                ;;
            -a|--auto-restart)
                AUTO_RESTART=true
                shift
                ;;
            -n|--notify)
                shift
                if [ -z "${1:-}" ]; then
                    echo "错误: 请指定通知命令"
                    show_help
                    exit 1
                fi
                NOTIFY_CMD="$1"
                shift
                ;;
            -s|--service)
                shift
                if [ -z "${1:-}" ]; then
                    echo "错误: 请指定服务配置"
                    show_help
                    exit 1
                fi
                add_service "$1"
                shift
                ;;
            -p|--process)
                shift
                if [ -z "${1:-}" ]; then
                    echo "错误: 请指定进程配置"
                    show_help
                    exit 1
                fi
                add_process "$1"
                shift
                ;;
            -l|--log)
                view_log
                exit 0
                ;;
            -h|--help)
                show_help
                exit 0
                ;;
            *)
                echo "未知选项: $1"
                show_help
                exit 1
                ;;
        esac
    done

    if [ "$continuous" = true ]; then
        continuous_monitor "$CHECK_INTERVAL"
        exit 0
    fi

    monitor_services
}

# Script entry: hand every command-line argument to main, each kept as a
# separate word.
main "$@"
posted @ 2026-05-18 17:40  giyaYON  阅读(12)  评论(0)    收藏  举报