【shell】监控系统资源脚本
1.监控系统资源
编写脚本来监控CPU、内存、磁盘等系统资源的使用情况。
--------------------------------------------------------------------------------
CPU使用率: 0.8% [状态: NORMAL]
内存使用率: 4.2% [状态: NORMAL]
内存详情: Total: 3.7G | Used: 157M | Free: 2.8G | Available: 3.3G
磁盘使用率(根分区): 4% [状态: NORMAL]
磁盘详情:
Filesystem Size Used Avail Use% Mounted
/dev/mapper/centos-root 37G 1.4G 36G 4% /
/dev/sda1 509M 125M 384M 25% /boot
系统负载: 0.00, 0.01, 0.05
进程数量: 371
系统运行时间: up 1 hour, 7 minutes
--------------------------------------------------------------------------------
脚本
#!/bin/bash
# System resource monitoring script.
# Purpose: report CPU, memory and disk usage.
# Color definitions. The values are single-quoted, so they hold the literal
# text "\033[...m"; they only turn into terminal colors when printed through
# printf "%b" (see output()).
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[0;33m'
CYAN='\033[0;36m'
NC='\033[0m'
# Thresholds in percent, handed to check_threshold by monitor_system.
CPU_WARNING=80
CPU_CRITICAL=95
MEM_WARNING=80
MEM_CRITICAL=95
DISK_WARNING=80
DISK_CRITICAL=90
# Log file that log_message appends to (main may replace it with a /tmp path).
LOG_FILE="/var/log/system_monitor.log"
# Report output file (set by the -o option); empty means no report is written.
REPORT_FILE=""
# Print the current local time as "YYYY-MM-DD HH:MM:SS".
get_timestamp() {
  local fmt='+%Y-%m-%d %H:%M:%S'
  date "$fmt"
}
# Append one "[timestamp] [level] message" line to LOG_FILE.
# Globals:   LOG_FILE (read)
# Arguments: $1 - severity label, $2 - message text
log_message() {
  local severity="$1" text="$2" stamp
  stamp=$(get_timestamp)
  printf '[%s] [%s] %s\n' "$stamp" "$severity" "$text" >> "$LOG_FILE"
}
# Print one report line on the screen and, when REPORT_FILE is set, append a
# plain-text copy (color escapes removed) to that file.
# Globals:   REPORT_FILE (read)
# Arguments: $1 - line to print; may contain backslash escapes like \033[0m
# Outputs:   the line, with escapes interpreted, on stdout
output() {
  local line="$1"
  if [ -n "$REPORT_FILE" ]; then
    # The color variables hold the literal text "\033[...m", not ESC bytes,
    # so the escapes must be expanded with %b before sed can match and strip
    # them; otherwise the raw "\033[0;36m" text lands in the report file.
    printf '%b\n' "$line" | sed $'s/\x1b\\[[0-9;]*m//g' >> "$REPORT_FILE"
  fi
  printf '%b\n' "$line"
}
# Print the CPU usage percentage (one decimal), computed as 100 minus the
# idle value found on the "Cpu(s)" line of a single batch-mode top sample.
get_cpu_usage() {
  local idle_pct busy_pct
  idle_pct=$(
    top -bn1 \
      | grep "Cpu(s)" \
      | sed 's/.*, *\([0-9.]*\)%* id.*/\1/' \
      | awk '{print $1}'
  )
  busy_pct=$(awk -v idle="$idle_pct" 'BEGIN {printf "%.1f", 100 - idle}')
  printf "%.1f" "$busy_pct"
}
# Print used/total memory as a percentage with one decimal, taken from the
# "Mem" row of free.
get_memory_usage() {
  local mem_row mem_total mem_used pct
  mem_row=$(free | grep Mem)
  mem_total=$(awk '{print $2}' <<< "$mem_row")
  mem_used=$(awk '{print $3}' <<< "$mem_row")
  pct=$(awk -v used="$mem_used" -v total="$mem_total" \
    'BEGIN {printf "%.1f", (used/total)*100}')
  printf '%s\n' "$pct"
}
# Print a one-line, human-readable memory summary built from columns 2, 3, 4
# and 7 of the "Mem" row of "free -h".
get_memory_detail() {
  local mem_row
  mem_row=$(free -h | grep Mem)
  awk '{print "Total: " $2 " | Used: " $3 " | Free: " $4 " | Available: " $7}' \
    <<< "$mem_row"
}
# Print the usage percentage of the root filesystem as a bare number.
# Outputs: e.g. "4" for 4% used
get_disk_usage() {
  # -P requests the single-line POSIX layout. The percentage is located by
  # its trailing "%" rather than by column position, so a device name that
  # wraps onto its own line cannot shift the wanted field.
  df -P / | awk '
    NR > 1 {
      for (i = 1; i <= NF; i++) {
        if ($i ~ /^[0-9]+%$/) { pct = $i }
      }
    }
    END {
      sub(/%$/, "", pct)
      print pct
    }'
}
# Print a column-aligned table of "df -h": the header line plus every row
# whose device starts with /dev or /mapper. Rows starting with tmpfs,
# devtmpfs or overlay are dropped first.
get_disk_detail() {
  df -h | awk '
    /^(tmpfs|devtmpfs|overlay)/ { next }
    { kept++ }
    kept == 1 || $1 ~ /^\/dev/ || $1 ~ /^\/mapper/ {
      printf " %-20s %-10s %-10s %-10s %-6s %-20s\n", $1, $2, $3, $4, $5, $6
    }'
}
# Print the text that follows "load average:" in the uptime output, with
# leading spaces removed.
get_load_average() {
  local raw
  raw=$(uptime)
  awk -F'load average:' '{print $2}' <<< "$raw" | sed 's/^ *//'
}
# Print the number of processes listed by "ps aux".
# The first line of ps output is the column header and is not a process, so
# it is excluded from the count.
get_process_count() {
  ps aux | awk 'NR > 1 { count++ } END { print count + 0 }'
}
# Print the system uptime. Uses "uptime -p" when that succeeds; otherwise
# falls back to the part of plain uptime output before the first comma.
get_uptime() {
  if ! uptime -p 2>/dev/null; then
    uptime | awk -F',' '{print $1}' | sed 's/^ *//'
  fi
}
# Classify a value against two thresholds.
# Arguments: $1 - value, $2 - warning threshold, $3 - critical threshold
# Outputs:   CRITICAL when value >= critical, WARNING when value >= warning,
#            NORMAL otherwise
check_threshold() {
  local value=$1 warning=$2 critical=$3
  awk -v v="$value" -v w="$warning" -v c="$critical" 'BEGIN {
    level = "NORMAL"
    if (v >= w) level = "WARNING"
    if (v >= c) level = "CRITICAL"
    print level
  }'
}
# Print the color variable that matches a status label.
# Globals:   RED, YELLOW, GREEN (read)
# Arguments: $1 - status label; anything other than CRITICAL/WARNING is green
get_status_color() {
  local state=$1
  if [ "$state" = "CRITICAL" ]; then
    echo "$RED"
  elif [ "$state" = "WARNING" ]; then
    echo "$YELLOW"
  else
    echo "$GREEN"
  fi
}
# Emit a horizontal rule of dashes through output().
print_line() {
  local rule="--------------------------------------------------------------------------------"
  output "$rule"
}
# Emit the report title with the current timestamp, framed by two rules.
print_header() {
  local title
  print_line
  title="系统资源监控报告 $(get_timestamp)"
  output "$title"
  print_line
}
# Run one monitoring pass: print CPU, memory, disk, load, process-count and
# uptime lines, then record a summary (and any threshold alerts) in the log.
# Globals:   REPORT_FILE, CYAN, NC, *_WARNING, *_CRITICAL (read)
# Outputs:   the report through output(); log entries through log_message()
monitor_system() {
  local cpu_usage cpu_status cpu_color
  local mem_usage mem_status mem_color
  local disk_usage disk_status disk_color
  local load_avg line

  # When a report file was requested, create it or empty it first.
  if [ -n "$REPORT_FILE" ]; then
    : > "$REPORT_FILE"
  fi
  print_header

  # CPU
  cpu_usage=$(get_cpu_usage)
  cpu_status=$(check_threshold "$cpu_usage" "$CPU_WARNING" "$CPU_CRITICAL")
  cpu_color=$(get_status_color "$cpu_status")
  output "${CYAN}CPU使用率:${NC} ${cpu_usage}% [状态: ${cpu_color}${cpu_status}${NC}]"

  # Memory
  mem_usage=$(get_memory_usage)
  mem_status=$(check_threshold "$mem_usage" "$MEM_WARNING" "$MEM_CRITICAL")
  mem_color=$(get_status_color "$mem_status")
  output "${CYAN}内存使用率:${NC} ${mem_usage}% [状态: ${mem_color}${mem_status}${NC}]"
  output "${CYAN}内存详情:${NC} $(get_memory_detail)"

  # Disk
  disk_usage=$(get_disk_usage)
  disk_status=$(check_threshold "$disk_usage" "$DISK_WARNING" "$DISK_CRITICAL")
  disk_color=$(get_status_color "$disk_status")
  output "${CYAN}磁盘使用率(根分区):${NC} ${disk_usage}% [状态: ${disk_color}${disk_status}${NC}]"
  output "${CYAN}磁盘详情:${NC}"
  # -r keeps any backslash in the df output from being eaten by read.
  get_disk_detail | while read -r line; do
    output "$line"
  done

  # Sample the load once so the displayed and the logged value always agree.
  load_avg=$(get_load_average)
  output "${CYAN}系统负载:${NC} ${load_avg}"
  output "${CYAN}进程数量:${NC} $(get_process_count)"
  output "${CYAN}系统运行时间:${NC} $(get_uptime)"
  print_line

  # Tell the user where the report went.
  if [ -n "$REPORT_FILE" ]; then
    output ""
    output "报告已保存到: $REPORT_FILE"
  fi

  # Log the summary, then one extra entry per metric above its threshold.
  log_message "INFO" "CPU: ${cpu_usage}%, MEM: ${mem_usage}%, DISK: ${disk_usage}%, Load: ${load_avg}"
  case "$cpu_status" in
    WARNING|CRITICAL) log_message "$cpu_status" "CPU使用率过高: ${cpu_usage}%" ;;
  esac
  case "$mem_status" in
    WARNING|CRITICAL) log_message "$mem_status" "内存使用率过高: ${mem_usage}%" ;;
  esac
  case "$disk_status" in
    WARNING|CRITICAL) log_message "$disk_status" "磁盘使用率过高: ${disk_usage}%" ;;
  esac
}
# Repeat monitor_system forever, clearing the screen before each pass.
# Arguments: $1 - pause between passes in seconds (default 5)
continuous_monitor() {
  local interval=${1:-5}
  printf '%s\n' "进入持续监控模式,间隔 ${interval} 秒..." "按 Ctrl+C 退出"
  while :; do
    clear
    monitor_system
    sleep "$interval"
  done
}
# Print the usage text; $0 is shown as the program name.
show_help() {
  printf '%s\n' \
    "用法: $0 [选项]" \
    "选项:" \
    "-c, --continuous [秒] 持续监控模式,默认间隔5秒" \
    "-i, --interval 秒 设置监控间隔时间" \
    "-o, --output 文件 将报告保存到指定文件" \
    "-l, --log 查看日志文件" \
    "-h, --help 显示帮助信息" \
    "示例:" \
    "$0 单次监控" \
    "$0 -c 持续监控(5秒间隔)" \
    "$0 -c 10 持续监控(10秒间隔)" \
    "$0 -o /tmp/report.txt 保存报告到文件" \
    "$0 -o report.txt 保存报告到当前目录" \
    "$0 -l 查看监控日志"
}
# Show the last 50 lines of LOG_FILE, or a notice when the file is missing.
view_log() {
  if [ ! -f "$LOG_FILE" ]; then
    echo "日志文件不存在: $LOG_FILE"
    return
  fi
  echo "监控日志内容:"
  print_line
  tail -n 50 "$LOG_FILE"
}
# Entry point: choose a writable log file, parse the command line, then run
# either a single monitoring pass or the continuous loop.
# Globals:   LOG_FILE, REPORT_FILE (written)
# Arguments: the script's command-line options (see show_help)
# Returns:   exits 1 on a usage error, 0 after -l, -h or continuous mode
main() {
  local mode="single"
  local interval=""
  local log_dir
  local number_re='^[0-9]+([.][0-9]+)?$'

  # Diagnostic only (nothing in this file calls bc); sent to stderr so it
  # never mixes into the report on stdout.
  if ! command -v bc &> /dev/null; then
    echo "警告: bc 命令未安装,脚本使用awk替代计算" >&2
  fi

  # Fall back to /tmp when the log cannot be written. An existing directory
  # is not enough: an unprivileged user usually cannot write under /var/log.
  log_dir=$(dirname "$LOG_FILE")
  if ! mkdir -p "$log_dir" 2>/dev/null \
    || { [ -e "$LOG_FILE" ] && [ ! -w "$LOG_FILE" ]; } \
    || { [ ! -e "$LOG_FILE" ] && [ ! -w "$log_dir" ]; }; then
    LOG_FILE="/tmp/system_monitor.log"
  fi

  # Parse every option before acting, so "-c -o file" and "-o file -c" behave
  # the same and -c never swallows the following option as its interval.
  while [ $# -gt 0 ]; do
    case "$1" in
      -c|--continuous)
        mode="continuous"
        shift
        # The interval is optional: consume the next word only if numeric.
        if [[ "${1:-}" =~ $number_re ]]; then
          interval="$1"
          shift
        fi
        ;;
      -i|--interval)
        shift
        if [ -z "${1:-}" ]; then
          echo "错误: 请指定间隔时间"
          show_help
          exit 1
        fi
        # A non-numeric interval would make sleep fail and the loop spin.
        if ! [[ "$1" =~ $number_re ]]; then
          echo "错误: 间隔时间必须是数字: $1"
          show_help
          exit 1
        fi
        mode="continuous"
        interval="$1"
        shift
        ;;
      -o|--output)
        shift
        if [ -z "${1:-}" ]; then
          echo "错误: 请指定输出文件路径"
          show_help
          exit 1
        fi
        REPORT_FILE="$1"
        shift
        ;;
      -l|--log)
        view_log
        exit 0
        ;;
      -h|--help)
        show_help
        exit 0
        ;;
      *)
        echo "未知选项: $1"
        show_help
        exit 1
        ;;
    esac
  done

  if [ "$mode" = "continuous" ]; then
    continuous_monitor "${interval:-5}"
    exit 0
  fi
  # Default: one monitoring pass.
  monitor_system
}
# Script entry: hand all command-line arguments to main unchanged.
main "$@"
每天凌晨2点保存报告
0 2 * * * /home/shells/system_monitor.sh -o /var/log/daily_report_$(date +\%Y\%m\%d).txt
2.监控网络连接
--------------------------------------------------------------------------------
网络监控报告 2026-05-18 14:07:40
--------------------------------------------------------------------------------
[网卡状态]
网卡: eno16777728
状态: UP
IP地址: 192.168.31.43/24
MAC地址: 00:0c:29:6e:18:be
[网关信息]
网关: 192.168.31.1
连通性: OK (延迟: 4.64ms)
[网络连通性]
目标: 223.5.5.5
状态: OK
丢包率: 0%
平均延迟: 18.905ms
最小延迟: 16.574ms
目标: 114.114.114.114
状态: OK
丢包率: 100%
平均延迟: Unknownms
最小延迟: Unknownms
目标: 8.8.8.8
状态: OK
丢包率: 0%
平均延迟: 44.771ms
最小延迟: 44.522ms
[HTTP 服务检测]
URL: http://www.baidu.com
状态: OK (HTTP 200)
响应时间: 0.041s
URL: http://www.aliyun.com
状态: 异常 (HTTP 403)
[DNS 解析]
DNS 服务器:
192.168.31.1
114.114.114.114
223.5.5.5
[连接统计]
TCP总数: 9
已建立: 2
监听中: 4
等待中: 3
[带宽统计]
网卡: eno16777728
接收: 65MiB
发送: 1.6MiB
编写脚本来监控网络连接状态,检查网络是否正常。
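为什么执行脚本时会报错?-bash: ./network_monitor.sh: /bin/bash^M: bad interpreter: No such file or directory
原因:脚本文件使用了 Windows 的换行符(CRLF),首行 #!/bin/bash 末尾多了一个回车符(显示为 ^M),系统找不到名为 "/bin/bash^M" 的解释器。用下面的命令删除回车符后,再执行生成的 network_monitor_fixed.sh 即可: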
cat network_monitor.sh | tr -d '\r' > network_monitor_fixed.sh
#!/bin/bash
# Network connection monitor script.
# Checks interface status, gateway and internet connectivity, HTTP
# reachability, DNS configuration, connection counts and traffic counters.
# Colors. The values are single-quoted, so they hold the literal text
# "\033[...m" and only become terminal colors when printed with printf "%b".
R='\033[0;31m'
G='\033[0;32m'
Y='\033[0;33m'
C='\033[0;36m'
N='\033[0m'
# Config: PING_COUNT is passed to "ping -c" and PING_TIMEOUT to "ping -W" in
# check_internet; LOG_FILE is appended to by log_message; REPORT_FILE is set
# by the -o option (empty means no report file).
PING_COUNT=4
PING_TIMEOUT=5
LOG_FILE="/var/log/network_monitor.log"
REPORT_FILE=""
# Default ping targets (the -t option appends more).
PING_TARGETS=(
"223.5.5.5"
"114.114.114.114"
"8.8.8.8"
)
# URLs probed by check_http.
URL_TARGETS=(
"http://www.baidu.com"
"http://www.aliyun.com"
)
# Print the current local time as "YYYY-MM-DD HH:MM:SS".
get_timestamp() {
  local now
  now=$(date "+%Y-%m-%d %H:%M:%S")
  printf '%s\n' "$now"
}
# Print one report line on the screen and, when REPORT_FILE is set, append a
# plain-text copy (color escapes removed) to that file.
# Globals:   REPORT_FILE (read)
# Arguments: $1 - line to print; may contain backslash escapes like \033[0m
# Outputs:   the line, with escapes interpreted, on stdout
output() {
  local line="$1"
  if [ -n "$REPORT_FILE" ]; then
    # Callers pass the literal text "\033[...m", not ESC bytes, so the
    # escapes must be expanded with %b before sed can match and strip them;
    # otherwise the raw "\033[0;36m" text lands in the report file.
    printf '%b\n' "$line" | sed $'s/\x1b\\[[0-9;]*m//g' >> "$REPORT_FILE"
  fi
  printf '%b\n' "$line"
}
# Append one "[timestamp] [level] message" line to LOG_FILE.
# Globals:   LOG_FILE (read)
# Arguments: $1 - severity label, $2 - message text
log_message() {
  local severity="$1"
  local text="$2"
  printf '[%s] [%s] %s\n' "$(get_timestamp)" "$severity" "$text" >> "$LOG_FILE"
}
# Emit a horizontal rule of dashes through output().
print_line() {
  local rule="--------------------------------------------------------------------------------"
  output "$rule"
}
# Emit the report title with the current timestamp, framed by two rules.
print_header() {
  local title
  print_line
  title="网络监控报告 $(get_timestamp)"
  output "$title"
  print_line
}
# Report link state, IP address and MAC address of every network interface
# except "lo".
# Globals:   R, G (read)
# Outputs:   the "[网卡状态]" section through output()
check_interface() {
  local interfaces iface status ip_addr mac_addr status_color
  output "\033[0;36m[网卡状态]\033[0m"
  # "ip link" lists virtual links as "name@parent" (for example eth0@if5).
  # Only the part before "@" is the interface name, so the suffix is cut off
  # before the name is used in the per-interface lookups below.
  interfaces=$(ip link show 2>/dev/null \
    | grep "^[0-9]" \
    | awk -F: '{print $2}' \
    | sed 's/ //g; s/@.*//')
  if [ -z "$interfaces" ]; then
    interfaces=$(ifconfig -a 2>/dev/null | grep "^[^ ]" | awk '{print $1}' | sed 's/://g')
  fi
  # Word splitting is intentional here: one interface name per word.
  for iface in $interfaces; do
    if [ "$iface" = "lo" ]; then continue; fi
    status="DOWN"
    ip_addr="No IP"
    mac_addr="Unknown"
    if ip link show "$iface" &>/dev/null; then
      if ip link show "$iface" | grep -q "state UP"; then status="UP"; fi
      ip_addr=$(ip addr show "$iface" 2>/dev/null | grep "inet " | awk '{print $2}' | head -1)
      mac_addr=$(ip link show "$iface" 2>/dev/null | grep "link/ether" | awk '{print $2}')
    elif ifconfig "$iface" &>/dev/null; then
      if ifconfig "$iface" | grep -q "RUNNING"; then status="UP"; fi
      ip_addr=$(ifconfig "$iface" 2>/dev/null | grep "inet " | awk '{print $2}' | head -1)
      mac_addr=$(ifconfig "$iface" 2>/dev/null | grep "ether" | awk '{print $2}')
    fi
    status_color="$R"
    if [ "$status" = "UP" ]; then status_color="$G"; fi
    output " 网卡: \033[0;33m${iface}\033[0m"
    output " 状态: ${status_color}${status}\033[0m"
    output " IP地址: ${ip_addr:-No IP}"
    output " MAC地址: ${mac_addr:-Unknown}"
  done
  output ""
}
# Report the default gateway and whether it answers a single ping.
# Outputs: the "[网关信息]" section through output(); log entries when the
#          gateway is missing or unreachable
check_gateway() {
  local gateway ping_out delay
  output "\033[0;36m[网关信息]\033[0m"
  gateway=$(ip route 2>/dev/null | grep "default" | awk '{print $3}' | head -1)
  if [ -z "$gateway" ]; then
    gateway=$(route -n 2>/dev/null | grep "^0.0.0.0" | awk '{print $2}' | head -1)
  fi
  if [ -n "$gateway" ]; then
    output " 网关: \033[0;32m${gateway}\033[0m"
    # Ping once and reuse the output for both the reachability verdict and
    # the latency; a second ping could fail and leave the latency blank.
    if ping_out=$(ping -c 1 -W 3 "$gateway" 2>/dev/null); then
      delay=$(printf '%s\n' "$ping_out" | grep "time=" | sed 's/.*time=//;s/ ms.*//')
      output " 连通性: \033[0;32mOK\033[0m (延迟: ${delay:-Unknown}ms)"
    else
      output " 连通性: \033[0;31mFAILED\033[0m"
      log_message "CRITICAL" "Gateway ${gateway} unreachable"
    fi
  else
    output " 网关: \033[0;31m未配置\033[0m"
    log_message "WARNING" "No default gateway found"
  fi
  output ""
}
# Ping every entry of PING_TARGETS and report status, packet loss and
# round-trip times.
# Globals:   PING_TARGETS, PING_COUNT, PING_TIMEOUT (read)
# Outputs:   the "[网络连通性]" section through output(); one log entry per
#            target
check_internet() {
  local target ping_result ping_status packet_loss rtt_stats avg_rtt min_rtt
  output "\033[0;36m[网络连通性]\033[0m"
  for target in "${PING_TARGETS[@]}"; do
    output " 目标: \033[0;33m${target}\033[0m"
    # The assignment is kept apart from "local": "local v=$(cmd)" reports
    # the status of "local" (always 0), which would hide a failed ping and
    # make the FAILED branch unreachable.
    ping_status=0
    ping_result=$(ping -c "${PING_COUNT}" -W "${PING_TIMEOUT}" "$target" 2>/dev/null) \
      || ping_status=$?
    if [ "$ping_status" -eq 0 ]; then
      # Match the "<n>% packet loss" token itself instead of counting commas;
      # ping inserts extra comma-separated fields (e.g. "+4 errors").
      packet_loss=$(printf '%s\n' "$ping_result" \
        | grep -o '[0-9.]*% packet loss' | awk '{print $1}')
      # Summary looks like "rtt min/avg/max/mdev = a/b/c/d ms"; take the
      # numbers after "=" so the field positions do not depend on the label.
      rtt_stats=$(printf '%s\n' "$ping_result" \
        | grep "min/avg/max" | awk -F'=' '{print $2}' | tr -d ' ')
      min_rtt=$(printf '%s\n' "$rtt_stats" | awk -F'/' '{print $1}')
      avg_rtt=$(printf '%s\n' "$rtt_stats" | awk -F'/' '{print $2}')
      if [ -z "$packet_loss" ]; then packet_loss="0%"; fi
      output " 状态: \033[0;32mOK\033[0m"
      output " 丢包率: ${packet_loss}"
      output " 平均延迟: ${avg_rtt:-Unknown}ms"
      output " 最小延迟: ${min_rtt:-Unknown}ms"
      log_message "INFO" "Ping ${target} OK, Loss: ${packet_loss}, RTT: ${avg_rtt}ms"
    else
      output " 状态: \033[0;31mFAILED\033[0m"
      output " Reason: 主机不可达"
      log_message "CRITICAL" "Ping ${target} failed"
    fi
    output ""
  done
}
# Probe every URL in URL_TARGETS with curl (or wget when curl is missing)
# and report the result.
# Globals:   URL_TARGETS (read)
# Outputs:   the "[HTTP 服务检测]" section through output(); one log entry
#            per URL
check_http() {
  local url curl_out curl_status http_code response_time wget_status
  output "\033[0;36m[HTTP 服务检测]\033[0m"
  for url in "${URL_TARGETS[@]}"; do
    output " URL: \033[0;33m${url}\033[0m"
    if command -v curl &>/dev/null; then
      # One request yields both the status code and the total time. The
      # assignment is kept apart from "local" because "local v=$(cmd)"
      # always reports status 0 and would hide a failed transfer.
      curl_status=0
      curl_out=$(curl -o /dev/null -s -w "%{http_code} %{time_total}" \
        --connect-timeout 10 --max-time 15 "$url" 2>/dev/null) || curl_status=$?
      http_code=${curl_out%% *}
      response_time=${curl_out#* }
      if [ "$curl_status" -eq 0 ] && [ "$http_code" = "200" ]; then
        output " 状态: \033[0;32mOK\033[0m (HTTP ${http_code})"
        output " 响应时间: ${response_time}s"
        log_message "INFO" "HTTP ${url} OK, Code: ${http_code}"
      elif [ "$curl_status" -eq 0 ]; then
        output " 状态: \033[0;33m异常\033[0m (HTTP ${http_code})"
        log_message "WARNING" "HTTP ${url} returned ${http_code}"
      else
        output " 状态: \033[0;31m不可达\033[0m"
        log_message "CRITICAL" "HTTP ${url} failed"
      fi
    elif command -v wget &>/dev/null; then
      wget_status=0
      wget --timeout=10 --tries=1 -q -O /dev/null "$url" 2>/dev/null || wget_status=$?
      if [ "$wget_status" -eq 0 ]; then
        output " 状态: \033[0;32mOK\033[0m"
        log_message "INFO" "HTTP ${url} OK (wget)"
      else
        output " 状态: \033[0;31m不可达\033[0m"
        log_message "CRITICAL" "HTTP ${url} failed (wget)"
      fi
    else
      output " 状态: \033[0;33m已跳过\033[0m (curl/wget 未安装)"
    fi
    output ""
  done
}
# List the DNS servers configured in a resolv.conf file.
# Arguments: $1 - path of the resolver configuration (optional, defaults to
#                 /etc/resolv.conf)
# Outputs:   the "[DNS 解析]" section through output(); a log entry when no
#            server is configured
check_dns() {
  local resolv_conf="${1:-/etc/resolv.conf}"
  local dns_servers dns
  output "\033[0;36m[DNS 解析]\033[0m"
  # Accept only lines whose first word is exactly "nameserver", so commented
  # entries such as "# nameserver 1.2.3.4" are not reported as servers.
  dns_servers=$(awk '$1 == "nameserver" && $2 != "" {print $2}' "$resolv_conf" 2>/dev/null)
  if [ -z "$dns_servers" ]; then
    output " DNS 服务器: \033[0;31m未配置\033[0m"
    log_message "WARNING" "DNS not configured"
  else
    output " DNS 服务器:"
    # Word splitting is intentional: one server address per word.
    for dns in $dns_servers; do
      output " ${dns}"
    done
  fi
  output ""
}
# Report socket counts by state, using ss when available and netstat
# otherwise.
# Outputs: the "[连接统计]" section through output()
check_connections() {
  local total_conn established listen time_wait
  output "\033[0;36m[连接统计]\033[0m"
  if command -v ss &>/dev/null; then
    total_conn=$(ss -s 2>/dev/null | awk '/TCP:/ {print $2}')
    established=$(ss -tunap 2>/dev/null | grep -c "ESTAB")
    listen=$(ss -tunap 2>/dev/null | grep -c "LISTEN")
    time_wait=$(ss -tunap 2>/dev/null | grep -c "TIME-WAIT")
    output " TCP总数: ${total_conn:-Unknown}"
    output " 已建立: ${established}"
    output " 监听中: ${listen}"
    output " 等待中: ${time_wait}"
  elif command -v netstat &>/dev/null; then
    established=$(netstat -tunap 2>/dev/null | grep -c "ESTABLISHED")
    listen=$(netstat -tunap 2>/dev/null | grep -c "LISTEN")
    time_wait=$(netstat -tunap 2>/dev/null | grep -c "TIME_WAIT")
    output " 已建立: ${established}"
    output " 监听中: ${listen}"
    output " 等待中: ${time_wait}"
  else
    output " ss/netstat 未安装,已跳过"
  fi
  output ""
}
# Report cumulative received/sent byte counters for every interface that is
# UP (except "lo").
# Outputs: the "[带宽统计]" section through output()
check_bandwidth() {
  local interfaces iface rx_bytes tx_bytes rx_human tx_human
  output "\033[0;36m[带宽统计]\033[0m"
  # "ip link" lists virtual links as "name@parent" (for example eth0@if5);
  # cut the suffix off so the name matches the /sys/class/net entry.
  interfaces=$(ip link show 2>/dev/null \
    | grep "state UP" \
    | awk -F: '{print $2}' \
    | sed 's/ //g; s/@.*//')
  if [ -z "$interfaces" ]; then
    interfaces=$(ifconfig 2>/dev/null | grep "RUNNING" | awk -F: '{print $1}' | sed 's/ //g')
  fi
  # Word splitting is intentional here: one interface name per word.
  for iface in $interfaces; do
    if [ "$iface" = "lo" ]; then continue; fi
    rx_bytes="0"
    tx_bytes="0"
    if [ -f "/sys/class/net/${iface}/statistics/rx_bytes" ]; then
      rx_bytes=$(cat "/sys/class/net/${iface}/statistics/rx_bytes" 2>/dev/null)
      tx_bytes=$(cat "/sys/class/net/${iface}/statistics/tx_bytes" 2>/dev/null)
    elif ifconfig "$iface" &>/dev/null; then
      rx_bytes=$(ifconfig "$iface" 2>/dev/null | grep "RX bytes" | awk -F'bytes:' '{print $2}' | awk '{print $1}')
      tx_bytes=$(ifconfig "$iface" 2>/dev/null | grep "TX bytes" | awk -F'bytes:' '{print $3}' | awk '{print $1}')
    fi
    # A counter that could not be parsed becomes 0 rather than a bare "B".
    rx_bytes=${rx_bytes:-0}
    tx_bytes=${tx_bytes:-0}
    rx_human=$(numfmt --to=iec-i --suffix=B "$rx_bytes" 2>/dev/null || echo "${rx_bytes}B")
    tx_human=$(numfmt --to=iec-i --suffix=B "$tx_bytes" 2>/dev/null || echo "${tx_bytes}B")
    output " 网卡: \033[0;33m${iface}\033[0m"
    output " 接收: ${rx_human}"
    output " 发送: ${tx_human}"
  done
  output ""
}
# Run one full network check: header, every check_* section in order, and a
# closing rule. When REPORT_FILE is set the file is emptied first and its
# path is announced at the end.
monitor_network() {
  local step
  if [ -n "$REPORT_FILE" ]; then
    : > "$REPORT_FILE"
  fi
  print_header
  for step in check_interface check_gateway check_internet check_http \
    check_dns check_connections check_bandwidth; do
    "$step"
  done
  print_line
  if [ -n "$REPORT_FILE" ]; then
    output ""
    output "报告已保存至: $REPORT_FILE"
  fi
}
# Repeat monitor_network forever, clearing the screen before each pass.
# Arguments: $1 - pause between passes in seconds (default 30)
continuous_monitor() {
  local interval=${1:-30}
  printf '%s\n' "连续监控模式,间隔: ${interval}s..." "按 Ctrl+C 退出"
  while :; do
    clear
    monitor_network
    sleep "$interval"
  done
}
# Print the usage text; $0 is shown as the program name.
show_help() {
  local -a help_lines=(
    "用法: $0 [options]"
    "选项:"
    "-c, --continuous [sec] 连续模式,默认30秒"
    "-i, --interval sec 设置监控间隔"
    "-o, --output file 保存报告到文件"
    "-t, --target IP 添加自定义ping目标"
    "-l, --log 查看日志文件"
    "-h, --help 显示帮助"
    "示例:"
    "$0 单次检查"
    "$0 -c 连续监控 (30秒)"
    "$0 -c 60 连续监控 (60秒)"
    "$0 -o /tmp/report.txt 保存到文件"
    "$0 -t 192.168.1.1 添加目标"
    "$0 -l 查看日志"
  )
  printf '%s\n' "${help_lines[@]}"
}
# Show the last 50 lines of LOG_FILE, or a notice when the file is missing.
view_log() {
  if [ ! -f "$LOG_FILE" ]; then
    echo "日志文件未找到: $LOG_FILE"
    return
  fi
  echo "网络监控日志:"
  print_line
  tail -n 50 "$LOG_FILE"
}
# Entry point: choose a writable log file, parse the command line, then run
# either a single network check or the continuous loop.
# Globals:   LOG_FILE, REPORT_FILE, PING_TARGETS (written)
# Arguments: the script's command-line options (see show_help)
# Returns:   exits 1 on a usage error, 0 after -l, -h or continuous mode
main() {
  local mode="single"
  local interval=""
  local log_dir
  local number_re='^[0-9]+([.][0-9]+)?$'

  # Fall back to /tmp when the log cannot be written. An existing directory
  # is not enough: an unprivileged user usually cannot write under /var/log.
  log_dir=$(dirname "$LOG_FILE")
  if ! mkdir -p "$log_dir" 2>/dev/null \
    || { [ -e "$LOG_FILE" ] && [ ! -w "$LOG_FILE" ]; } \
    || { [ ! -e "$LOG_FILE" ] && [ ! -w "$log_dir" ]; }; then
    LOG_FILE="/tmp/network_monitor.log"
  fi

  # Parse every option before acting, so options given after -c/-i (for
  # example "-c -t 10.0.0.1") are honored and -c never swallows the next
  # option as its interval.
  while [ $# -gt 0 ]; do
    case "$1" in
      -c|--continuous)
        mode="continuous"
        shift
        # The interval is optional: consume the next word only if numeric.
        if [[ "${1:-}" =~ $number_re ]]; then
          interval="$1"
          shift
        fi
        ;;
      -i|--interval)
        shift
        if [ -z "${1:-}" ]; then
          echo "错误: 请指定间隔时间"
          show_help
          exit 1
        fi
        # A non-numeric interval would make sleep fail and the loop spin.
        if ! [[ "$1" =~ $number_re ]]; then
          echo "错误: 间隔时间必须是数字: $1"
          show_help
          exit 1
        fi
        mode="continuous"
        interval="$1"
        shift
        ;;
      -o|--output)
        shift
        if [ -z "${1:-}" ]; then
          echo "错误: 请指定输出文件"
          show_help
          exit 1
        fi
        REPORT_FILE="$1"
        shift
        ;;
      -t|--target)
        shift
        if [ -z "${1:-}" ]; then
          echo "错误: 请指定目标IP"
          show_help
          exit 1
        fi
        PING_TARGETS+=("$1")
        shift
        ;;
      -l|--log)
        view_log
        exit 0
        ;;
      -h|--help)
        show_help
        exit 0
        ;;
      *)
        echo "未知选项: $1"
        show_help
        exit 1
        ;;
    esac
  done

  if [ "$mode" = "continuous" ]; then
    continuous_monitor "${interval:-30}"
    exit 0
  fi
  monitor_network
}
# Script entry: hand all command-line arguments to main unchanged.
main "$@"
3.监控服务进程
编写脚本来监控服务状态,如Nginx、MySQL,或者是特定的进程。
--------------------------------------------------------------------------------
服务状态监控报告 2026-05-18 15:04:35
--------------------------------------------------------------------------------
[系统服务检查]
服务: nginx
状态: 已停止
服务: mysql
状态: 已停止
服务: mariadb
状态: 运行中 (active)
运行时间: Mon 2026-05-18 14:37:58 CST
CPU占用: 0.1%
内存占用: 2.2% (84524KB)
线程数: 19
打开文件: 34
服务: redis
状态: 已停止
服务: ssh
状态: 运行中 (active)
运行时间: Mon 2026-05-18 12:11:27 CST
CPU占用: 0.0%
内存占用: 0.1% (3584KB)
线程数: 1
打开文件: 5
[检查汇总]
总服务数: 5
正常: 2
异常: 3
--------------------------------------------------------------------------------
测试案例
sudo ./service_monitor.sh -p "python3||python3 -m http.server 8080" -a
编写脚本
#!/bin/bash
# Service status monitoring script.
# Monitors Nginx, MySQL, Redis, SSH and user-defined processes.
# Supports a single check, continuous monitoring, automatic restart and
# alert notification.
# Color definitions. The values are single-quoted, so they hold the literal
# text "\033[...m" and only become terminal colors when printed with
# printf "%b".
R='\033[0;31m'
G='\033[0;32m'
Y='\033[0;33m'
C='\033[0;36m'
N='\033[0m'
# Default configuration.
CHECK_INTERVAL=30
LOG_FILE="/var/log/service_monitor.log"
REPORT_FILE=""
AUTO_RESTART=false
NOTIFY_CMD=""
# Service configuration array.
# Entry format: "service name|check method|check parameter|start command"
# Check method: systemctl / port / process
SERVICES=(
"nginx|systemctl|nginx|systemctl start nginx"
"mysql|systemctl|mysqld|systemctl start mysqld"
"mariadb|systemctl|mariadb|systemctl start mariadb"
"redis|systemctl|redis|systemctl start redis"
"ssh|systemctl|sshd|systemctl start sshd"
)
# Custom processes (checked by process name or by PID file).
# Entry format: "process name|PID file path|start command"
# Examples:
# CUSTOM_PROCESSES=("java|/var/run/tomcat.pid|systemctl start tomcat")
# CUSTOM_PROCESSES=("redis-server||redis-server /etc/redis.conf")
# CUSTOM_PROCESSES=("myapp||/opt/myapp/bin/start.sh")
# The PID file path may be empty; the process is then matched by name.
CUSTOM_PROCESSES=()
# Print the current local time as "YYYY-MM-DD HH:MM:SS".
get_timestamp() {
  printf '%s\n' "$(date "+%Y-%m-%d %H:%M:%S")"
}
# Print one report line on the screen and, when REPORT_FILE is set, append a
# plain-text copy (color escapes removed) to that file.
# Globals:   REPORT_FILE (read)
# Arguments: $1 - line to print; may contain backslash escapes like \033[0m
# Outputs:   the line, with escapes interpreted, on stdout
output() {
  local line="$1"
  if [ -n "$REPORT_FILE" ]; then
    # The color variables hold the literal text "\033[...m", not ESC bytes,
    # so the escapes must be expanded with %b before sed can match and strip
    # them; otherwise the raw "\033[0;36m" text lands in the report file.
    printf '%b\n' "$line" | sed $'s/\x1b\\[[0-9;]*m//g' >> "$REPORT_FILE"
  fi
  printf '%b\n' "$line"
}
# Append one "[timestamp] [level] message" line to LOG_FILE.
# Globals:   LOG_FILE (read)
# Arguments: $1 - severity label, $2 - message text
log_message() {
  local severity="$1" text="$2" stamp
  stamp=$(get_timestamp)
  printf '[%s] [%s] %s\n' "$stamp" "$severity" "$text" >> "$LOG_FILE"
}
# Emit a horizontal rule of dashes through output().
print_line() {
  local rule="--------------------------------------------------------------------------------"
  output "$rule"
}
# Emit the report title with the current timestamp, framed by two rules.
print_header() {
  local title
  print_line
  title="服务状态监控报告 $(get_timestamp)"
  output "$title"
  print_line
}
# Check a systemd unit and, when it is active, report its resource usage.
# Arguments: $1 - display name of the service
#            $2 - systemd unit name
# Outputs:   status lines through output(); one log entry
# Returns:   0 when the unit is active, 1 when it is stopped or systemctl is
#            not available
check_systemctl_service() {
  local service_name="$1"
  local system_name="$2"
  local status uptime main_pid

  if ! command -v systemctl &>/dev/null; then
    output "服务: ${Y}${service_name}${N}"
    output " 状态: ${R}无法检查${N} (systemctl 不可用)"
    return 1
  fi
  if ! systemctl is-active --quiet "$system_name" 2>/dev/null; then
    output "服务: ${Y}${service_name}${N}"
    output " 状态: ${R}已停止${N}"
    log_message "CRITICAL" "${service_name} 已停止"
    return 1
  fi

  status=$(systemctl is-active "$system_name" 2>/dev/null)
  # The text after "since" on the "Active:" line is the unit's start time.
  uptime=$(systemctl status "$system_name" 2>/dev/null | grep "Active:" | sed 's/.*since //;s/;.*/ /')
  output "服务: ${Y}${service_name}${N}"
  output " 状态: ${G}运行中${N} (${status})"
  output " 运行时间: ${uptime:-未知}"

  # Read the main PID by parsing "MainPID=<n>" instead of passing --value:
  # older systemctl releases reject that option (NOTE(review): believed to
  # be absent before systemd 230, e.g. on CentOS 7 — confirm), which would
  # silently force the less precise pgrep guess below.
  main_pid=$(systemctl show "$system_name" --property=MainPID 2>/dev/null \
    | sed -n 's/^MainPID=//p')
  # If systemd gave no usable PID, look the process up by exact name.
  if [ -z "$main_pid" ] || [ "$main_pid" = "0" ] || ! kill -0 "$main_pid" 2>/dev/null; then
    main_pid=$(pgrep -x "$system_name" 2>/dev/null | head -1)
  fi
  # Last resort: match the full command line, skipping this script's own PID
  # because its arguments may contain the unit name.
  if [ -z "$main_pid" ] || [ "$main_pid" = "0" ] || ! kill -0 "$main_pid" 2>/dev/null; then
    main_pid=$(pgrep -f "$system_name" 2>/dev/null | grep -vx -- "$$" | head -1)
  fi
  check_service_resources "$service_name" "$main_pid"
  log_message "INFO" "${service_name} 运行正常"
  return 0
}
# Check whether something is listening on a port and, if so, report the
# owning process's resource usage.
# Arguments: $1 - display name of the service
#            $2 - port number
#            $3 - protocol, "tcp" (default) or "udp"
# Outputs:   status lines through output(); one log entry
# Returns:   0 when the port is being listened on, 1 otherwise
check_port_service() {
  local service_name="$1"
  local port="$2"
  local proto="${3:-tcp}"
  local port_pid=""
  local is_running=false
  local sock_flags listing

  # Honor the protocol argument; it used to be accepted but ignored, so UDP
  # services were always looked up among TCP sockets.
  case "$proto" in
    udp) sock_flags="-ulnp" ;;
    *) sock_flags="-tlnp" ;;
  esac

  if command -v ss &>/dev/null; then
    listing=$(ss "$sock_flags" 2>/dev/null | grep ":${port} ")
    if [ -n "$listing" ]; then
      is_running=true
      # Extract the first "pid=<n>" with plain sed (no PCRE grep needed).
      port_pid=$(printf '%s\n' "$listing" | sed -n 's/.*pid=\([0-9][0-9]*\).*/\1/p' | head -1)
    fi
  elif command -v netstat &>/dev/null; then
    listing=$(netstat "$sock_flags" 2>/dev/null | grep ":${port} ")
    if [ -n "$listing" ]; then
      is_running=true
      port_pid=$(printf '%s\n' "$listing" | awk '{print $NF}' | cut -d'/' -f1 | grep -E '^[0-9]+$' | head -1)
    fi
  fi

  if [ "$is_running" = true ]; then
    output "服务: ${Y}${service_name}${N}"
    output " 状态: ${G}运行中${N} (端口 ${port} 监听中)"
    check_service_resources "$service_name" "$port_pid"
    log_message "INFO" "${service_name} 端口 ${port} 正常"
    return 0
  fi
  output "服务: ${Y}${service_name}${N}"
  output " 状态: ${R}已停止${N} (端口 ${port} 未监听)"
  log_message "CRITICAL" "${service_name} 端口 ${port} 未监听"
  return 1
}
# Helper for check_process: succeed when the given PID is this script or one
# of its ancestors (walked through the PPid field of /proc/<pid>/status).
_check_process_is_own_tree() {
  local candidate="$1"
  local cur="$$"
  while [ -n "$cur" ] && [ "$cur" -gt 1 ] 2>/dev/null; do
    if [ "$cur" = "$candidate" ]; then
      return 0
    fi
    cur=$(awk '/^PPid:/ {print $2}' "/proc/${cur}/status" 2>/dev/null)
  done
  return 1
}

# Check whether a process is running, trying in order: the PID file, an
# exact process-name match, then a fuzzy match on the full command line.
# Arguments: $1 - process name
#            $2 - PID file path (may be empty)
# Outputs:   status lines through output(); one log entry
# Returns:   0 when the process was found, 1 otherwise
check_process() {
  local process_name="$1"
  local pid_file="$2"
  local pid=""
  local match_note=""
  local candidate

  # 1) The PID file, when given and pointing at a live process.
  if [ -n "$pid_file" ] && [ -f "$pid_file" ]; then
    pid=$(cat "$pid_file" 2>/dev/null)
    if [ -z "$pid" ] || ! kill -0 "$pid" 2>/dev/null; then
      pid=""
    fi
  fi
  # 2) Exact process name.
  if [ -z "$pid" ]; then
    pid=$(pgrep -x "$process_name" 2>/dev/null | head -1)
  fi
  # 3) Fuzzy match on the command line. The monitor's own command line (and
  # that of its parents, e.g. sudo) contains the process name whenever it is
  # passed through -p/-s, so those PIDs must not count as the process being
  # up. Candidates that have already exited (short-lived subshells created
  # while pgrep ran) are skipped as well.
  if [ -z "$pid" ]; then
    for candidate in $(pgrep -f "$process_name" 2>/dev/null); do
      if _check_process_is_own_tree "$candidate"; then
        continue
      fi
      if kill -0 "$candidate" 2>/dev/null || [ -d "/proc/${candidate}" ]; then
        pid="$candidate"
        match_note=", 模糊匹配"
        break
      fi
    done
  fi

  if [ -n "$pid" ]; then
    output "进程: ${Y}${process_name}${N}"
    output " 状态: ${G}运行中${N} (PID: ${pid}${match_note})"
    check_service_resources "$process_name" "$pid"
    log_message "INFO" "${process_name} 运行正常 (PID: ${pid})"
    return 0
  fi
  output "进程: ${Y}${process_name}${N}"
  output " 状态: ${R}未运行${N}"
  log_message "CRITICAL" "${process_name} 未运行"
  return 1
}
# Report CPU share, memory share, thread count and open-file count of one
# process, all read from /proc.
# Arguments: $1 - service name (not used in the output)
#            $2 - PID to inspect; nothing is printed when it is empty or the
#                 process cannot be signalled
# Outputs:   four indented lines through output()
check_service_resources() {
  local service_name="$1"
  local pid="$2"
  local cpu_usage="0" mem_usage="0" mem_rss="0" mem_human
  local stat_line utime stime start_time uptime hz proc_uptime mem_total
  local threads open_files
  local -a stat_fields

  if [ -z "$pid" ] || ! kill -0 "$pid" 2>/dev/null; then
    return 0
  fi

  # CPU: total CPU time of the process divided by how long it has existed.
  if [ -r "/proc/${pid}/stat" ]; then
    stat_line=$(cat "/proc/${pid}/stat" 2>/dev/null)
    # Field 2 is "(command name)" and may itself contain spaces, so counting
    # whitespace-separated fields from the start is unreliable. Drop
    # everything up to the last ") "; the rest starts at field 3.
    read -r -a stat_fields <<< "${stat_line##*) }"
    utime=${stat_fields[11]:-0}      # field 14
    stime=${stat_fields[12]:-0}      # field 15
    start_time=${stat_fields[19]:-0} # field 22, in clock ticks since boot
    uptime=$(awk '{print int($1)}' /proc/uptime 2>/dev/null)
    hz=$(getconf CLK_TCK 2>/dev/null || echo 100)
    proc_uptime=$(( ${uptime:-0} - start_time / ${hz:-100} ))
    if [ "$proc_uptime" -gt 0 ]; then
      cpu_usage=$(awk -v t="$((utime + stime))" -v hz="${hz:-100}" -v up="$proc_uptime" \
        'BEGIN {printf "%.1f", 100 * t / hz / up}')
    fi
  fi

  # Memory: resident set size (kB) relative to total memory.
  if [ -r "/proc/${pid}/status" ]; then
    mem_rss=$(awk '/^VmRSS:/ {print $2}' "/proc/${pid}/status" 2>/dev/null)
    # No VmRSS line was found for this PID: report 0 instead of a bare "KB".
    mem_rss=${mem_rss:-0}
    mem_total=$(awk '/^MemTotal:/ {print $2}' /proc/meminfo 2>/dev/null)
    if [ -n "$mem_total" ] && [ "$mem_total" -gt 0 ]; then
      mem_usage=$(awk -v rss="$mem_rss" -v total="$mem_total" \
        'BEGIN {printf "%.1f", 100 * rss / total}')
    fi
  fi

  # Human-readable size. The value is passed as a plain number of 1024-byte
  # units; appending "K" made numfmt reject the input (no --from mode), so
  # the raw "<n>KB" fallback was always shown.
  mem_human="${mem_rss}KB"
  if command -v numfmt &>/dev/null; then
    mem_human=$(numfmt --to=iec-i --suffix=B --from-unit=1024 "$mem_rss" 2>/dev/null \
      || echo "${mem_rss}KB")
  fi

  # One directory entry per thread / per open file descriptor.
  threads=$(ls "/proc/${pid}/task" 2>/dev/null | wc -l)
  open_files=$(ls "/proc/${pid}/fd" 2>/dev/null | wc -l)

  output " CPU占用: ${cpu_usage}%"
  output " 内存占用: ${mem_usage}% (${mem_human})"
  output " 线程数: ${threads}"
  output " 打开文件: ${open_files}"
}
# Try to start a failed service and send a notification about the outcome.
# Globals:   AUTO_RESTART, NOTIFY_CMD (read)
# Arguments: $1 - service name
#            $2 - start command (shell text taken from the configuration)
# Returns:   0 when the service is up after the restart; 1 when auto-restart
#            is disabled or the restart did not bring the service up
auto_restart_service() {
  local service_name="$1"
  local start_cmd="$2"
  local notify_msg

  if [ "$AUTO_RESTART" != true ]; then
    return 1
  fi
  output " 操作: ${Y}尝试自动重启...${N}"
  log_message "WARNING" "尝试自动重启 ${service_name}"
  # NOTE(review): start_cmd and NOTIFY_CMD are evaluated as shell code. That
  # is only safe while they come from the operator's own configuration or
  # command line; never feed untrusted text into them.
  if eval "$start_cmd" &>/dev/null; then
    sleep 2
    # Verify that the service actually came up.
    if check_service_quick "$service_name"; then
      output " 结果: ${G}重启成功${N}"
      log_message "INFO" "${service_name} 自动重启成功"
      if [ -n "$NOTIFY_CMD" ]; then
        notify_msg="${service_name} 已自动重启"
        # The message is expanded inside the eval'd text as ONE quoted
        # argument; handing it to eval as a separate word would re-split it
        # on spaces (and expose it to shell parsing).
        eval "$NOTIFY_CMD"' "$notify_msg"' &>/dev/null
      fi
      return 0
    fi
  fi
  output " 结果: ${R}重启失败${N}"
  log_message "CRITICAL" "${service_name} 自动重启失败"
  if [ -n "$NOTIFY_CMD" ]; then
    notify_msg="${service_name} 自动重启失败,请人工介入"
    eval "$NOTIFY_CMD"' "$notify_msg"' &>/dev/null
  fi
  return 1
}
# Quick liveness probe used to verify a restart.
# Arguments: $1 - service or process name
# Returns:   0 when systemd reports the unit active, or a process matches
#            the name exactly, or another live process matches it on its
#            command line; 1 otherwise
check_service_quick() {
  local name="$1"
  local candidate

  # Try systemctl first.
  if command -v systemctl &>/dev/null && systemctl is-active --quiet "$name" 2>/dev/null; then
    return 0
  fi
  # Exact process name.
  if pgrep -x "$name" &>/dev/null; then
    return 0
  fi
  # Fuzzy command-line match. This script and its parent (e.g. sudo) carry
  # the name in their own arguments when it was passed through -p/-s, so
  # they are ignored; otherwise every restart would be reported as
  # successful. Candidates that already exited are ignored as well.
  for candidate in $(pgrep -f "$name" 2>/dev/null); do
    if [ "$candidate" = "$$" ] || [ "$candidate" = "$PPID" ]; then
      continue
    fi
    if kill -0 "$candidate" 2>/dev/null || [ -d "/proc/${candidate}" ]; then
      return 0
    fi
  done
  return 1
}
# Normalize a service configuration entry to exactly four "|"-separated
# fields: name|method|check parameter|start command.
# Arguments: $1 - configuration string
# Outputs:   the normalized string on stdout. Missing fields are empty, and
#            everything after the third "|" belongs to the start command, so
#            a start command may itself contain "|".
parse_service() {
  local config="$1"
  local name method check_param start_cmd
  # read assigns the whole remainder to the last variable. cut would instead
  # drop text after a fourth "|", and would repeat the entire string for
  # every field when the entry contains no "|" at all.
  IFS='|' read -r name method check_param start_cmd <<< "$config"
  printf '%s|%s|%s|%s\n' "$name" "$method" "$check_param" "$start_cmd"
}
# Check every configured service and custom process, attempt a restart for
# each failure that has a start command, and print a summary.
# Globals:   SERVICES, CUSTOM_PROCESSES (read)
# Outputs:   per-service sections and the summary through output(); one
#            summary log entry
check_all_services() {
  local failed_count=0
  local total_count=0
  local service_config proc_config
  local name method check_param start_cmd check_result
  local proc_name pid_file

  output "${C}[系统服务检查]${N}"
  output ""
  for service_config in "${SERVICES[@]}"; do
    total_count=$((total_count + 1))
    # Split on "|"; read gives the remainder to the last variable, so a
    # start command that contains "|" stays whole (cut -f4 truncated it).
    IFS='|' read -r name method check_param start_cmd <<< "$service_config"
    check_result=1
    case "$method" in
      systemctl)
        check_systemctl_service "$name" "$check_param"
        check_result=$?
        ;;
      port)
        check_port_service "$name" "$check_param"
        check_result=$?
        ;;
      process)
        check_process "$check_param" ""
        check_result=$?
        ;;
      *)
        output "服务: ${Y}${name}${N}"
        output " 状态: ${R}未知检查方式${N} (${method})"
        check_result=1
        ;;
    esac
    if [ "$check_result" -ne 0 ]; then
      # A service found down is counted as failed even if the restart below
      # succeeds.
      failed_count=$((failed_count + 1))
      if [ -n "$start_cmd" ]; then
        auto_restart_service "$name" "$start_cmd"
      fi
    fi
    output ""
  done

  # Custom processes.
  if [ ${#CUSTOM_PROCESSES[@]} -gt 0 ]; then
    output "${C}[自定义进程检查]${N}"
    output ""
    for proc_config in "${CUSTOM_PROCESSES[@]}"; do
      total_count=$((total_count + 1))
      IFS='|' read -r proc_name pid_file start_cmd <<< "$proc_config"
      check_result=0
      check_process "$proc_name" "$pid_file" || check_result=$?
      if [ "$check_result" -ne 0 ]; then
        failed_count=$((failed_count + 1))
        if [ -n "$start_cmd" ]; then
          auto_restart_service "$proc_name" "$start_cmd"
        fi
      fi
      output ""
    done
  fi

  # Summary.
  output "${C}[检查汇总]${N}"
  output " 总服务数: ${total_count}"
  output " 正常: ${G}$((total_count - failed_count))${N}"
  output " 异常: ${R}${failed_count}${N}"
  if [ "$failed_count" -gt 0 ]; then
    log_message "WARNING" "检查完成: ${failed_count}/${total_count} 个服务异常"
  else
    log_message "INFO" "检查完成: 所有服务正常"
  fi
  output ""
}
# Run one full service check: header, all checks, closing rule. When
# REPORT_FILE is set the file is emptied first and its path is announced at
# the end.
monitor_services() {
  local report_path="$REPORT_FILE"
  if [ -n "$report_path" ]; then
    : > "$report_path"
  fi
  print_header
  check_all_services
  print_line
  if [ -n "$REPORT_FILE" ]; then
    output ""
    output "报告已保存至: $REPORT_FILE"
  fi
}
# Repeat monitor_services forever, clearing the screen before each pass.
# Arguments: $1 - pause between passes in seconds (default CHECK_INTERVAL)
continuous_monitor() {
  local interval=${1:-$CHECK_INTERVAL}
  printf '%s\n' "连续监控模式,间隔: ${interval}秒..." "按 Ctrl+C 退出" ""
  while :; do
    clear
    monitor_services
    sleep "$interval"
  done
}
# Print the usage text; $0 is shown as the program name.
# Two fixes against the earlier text: the mail example uses a real
# placeholder address instead of the garbled "[email protected]", and the
# preset list names mariadb, which is part of the default SERVICES array.
show_help() {
cat << EOF
用法: $0 [选项]
选项:
-c, --continuous [秒] 连续监控模式,默认30秒
-i, --interval 秒 设置监控间隔
-o, --output 文件 保存报告到文件
-a, --auto-restart 服务异常时自动重启
-n, --notify 命令 设置告警通知命令
-s, --service 配置 添加自定义服务
-p, --process 配置 添加自定义进程
格式: "进程名|PID文件|启动命令"
示例: "java|/var/run/tomcat.pid|systemctl start tomcat"
PID文件可为空
-l, --log 查看日志
-h, --help 显示帮助
服务配置格式: 名称|检查方式|检查参数|启动命令
检查方式: systemctl / port / process
进程配置格式: 名称|PID文件|启动命令
PID文件可为空
示例:
$0 单次检查
$0 -c 连续监控 (30秒)
$0 -c 60 连续监控 (60秒)
$0 -o /tmp/report.txt 保存报告
$0 -a 启用自动重启
$0 -n "mail -s '告警' admin@example.com" 邮件告警
$0 -s "nginx|port|80|systemctl start nginx" 添加端口检查
$0 -p "myapp||/opt/myapp/start.sh" 添加自定义进程
$0 -s "tomcat|process|java|systemctl start tomcat" 添加Tomcat
预置服务:
nginx, mysql, mariadb, redis, ssh
EOF
}
# Show the last 50 lines of LOG_FILE, or a notice when the file is missing.
view_log() {
  if [ ! -f "$LOG_FILE" ]; then
    echo "日志文件未找到: $LOG_FILE"
    return
  fi
  echo "服务监控日志:"
  print_line
  tail -n 50 "$LOG_FILE"
}
# Append one service configuration entry to SERVICES.
# Arguments: $1 - "name|method|check parameter|start command"
add_service() {
  SERVICES[${#SERVICES[@]}]="$1"
}
# Append one custom process entry to CUSTOM_PROCESSES.
# Arguments: $1 - "process name|PID file|start command"
add_process() {
  CUSTOM_PROCESSES[${#CUSTOM_PROCESSES[@]}]="$1"
}
# Entry point: choose a writable log file, parse the command line, then run
# either a single service check or the continuous loop.
# Globals:   LOG_FILE, REPORT_FILE, AUTO_RESTART, NOTIFY_CMD, CHECK_INTERVAL
#            (written); SERVICES / CUSTOM_PROCESSES via add_service and
#            add_process
# Arguments: the script's command-line options (see show_help)
# Returns:   exits 1 on a usage error, 0 after -l, -h or continuous mode
main() {
  local mode="single"
  local interval=""
  local log_dir
  local number_re='^[0-9]+([.][0-9]+)?$'

  # Fall back to /tmp when the log cannot be written. An existing directory
  # is not enough: an unprivileged user usually cannot write under /var/log.
  log_dir=$(dirname "$LOG_FILE")
  if ! mkdir -p "$log_dir" 2>/dev/null \
    || { [ -e "$LOG_FILE" ] && [ ! -w "$LOG_FILE" ]; } \
    || { [ ! -e "$LOG_FILE" ] && [ ! -w "$log_dir" ]; }; then
    LOG_FILE="/tmp/service_monitor.log"
  fi

  # Parse every option before acting, so "-c -a -p ..." enables auto-restart
  # and the custom process too, and -c never swallows the next option as its
  # interval.
  while [ $# -gt 0 ]; do
    case "$1" in
      -c|--continuous)
        mode="continuous"
        shift
        # The interval is optional: consume the next word only if numeric.
        if [[ "${1:-}" =~ $number_re ]]; then
          interval="$1"
          shift
        fi
        ;;
      -i|--interval)
        shift
        if [ -z "${1:-}" ]; then
          echo "错误: 请指定间隔时间"
          show_help
          exit 1
        fi
        # A non-numeric interval would make sleep fail and the loop spin.
        if ! [[ "$1" =~ $number_re ]]; then
          echo "错误: 间隔时间必须是数字: $1"
          show_help
          exit 1
        fi
        CHECK_INTERVAL="$1"
        mode="continuous"
        interval="$1"
        shift
        ;;
      -o|--output)
        shift
        if [ -z "${1:-}" ]; then
          echo "错误: 请指定输出文件"
          show_help
          exit 1
        fi
        REPORT_FILE="$1"
        shift
        ;;
      -a|--auto-restart)
        AUTO_RESTART=true
        shift
        ;;
      -n|--notify)
        shift
        if [ -z "${1:-}" ]; then
          echo "错误: 请指定通知命令"
          show_help
          exit 1
        fi
        NOTIFY_CMD="$1"
        shift
        ;;
      -s|--service)
        shift
        if [ -z "${1:-}" ]; then
          echo "错误: 请指定服务配置"
          show_help
          exit 1
        fi
        add_service "$1"
        shift
        ;;
      -p|--process)
        shift
        if [ -z "${1:-}" ]; then
          echo "错误: 请指定进程配置"
          show_help
          exit 1
        fi
        add_process "$1"
        shift
        ;;
      -l|--log)
        view_log
        exit 0
        ;;
      -h|--help)
        show_help
        exit 0
        ;;
      *)
        echo "未知选项: $1"
        show_help
        exit 1
        ;;
    esac
  done

  if [ "$mode" = "continuous" ]; then
    continuous_monitor "${interval:-$CHECK_INTERVAL}"
    exit 0
  fi
  monitor_services
}
# Script entry: hand all command-line arguments to main unchanged.
main "$@"

浙公网安备 33010602011771号