promthues 告警规则配置

node_exporter告警规则

# Rule group: host resource alerts evaluated over node_* metrics.
groups:
- name: 服务器资源
  rules:

    # Fires when the ratio of available to total memory has stayed below 0.10
    # for 2 minutes.
    - alert: 内存不足
      expr: '(node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes < .10)'
      for: 2m  # pending duration; the alert is sent to Alertmanager only after the condition has held this long
      labels:
        severity: 警告
      annotations:
        summary: "主机内存不足 (实例 {{ $labels.instance }})"
        description: "节点可用内存过低(剩余 < 10%)\n  当前值 = {{ $value }}\n  标签 = {{ $labels }}"

    # Fires as soon as the 5-minute rate of node_vmstat_pgmajfault exceeds
    # 1000 (for: 0m, so there is no pending period).
    - alert: 内存压力过大
      expr: '(rate(node_vmstat_pgmajfault[5m]) > 1000)'
      for: 0m
      labels:
        severity: 警告
      annotations:
        summary: "主机内存压力过大 (实例 {{ $labels.instance }})"
        description: "节点内存压力大,频繁从磁盘加载内存页\n  当前值 = {{ $value }}\n  标签 = {{ $labels }}"


    # Fires immediately when the 5-minute receive byte rate exceeds 0.80 of
    # node_network_speed_bytes.
    - alert: 网卡入站流量异常
      expr: '((rate(node_network_receive_bytes_total[5m]) / node_network_speed_bytes) > .80)'
      for: 0m
      labels:
        severity: 警告
      annotations:
        summary: "主机网卡入站流量异常 (实例 {{ $labels.instance }})"
        description: "网卡接收带宽占用过高(> 80%)\n  当前值 = {{ $value }}\n  标签 = {{ $labels }}"

    # Same check as above for the transmit direction.
    - alert: 网卡出站流量异常
      expr: '((rate(node_network_transmit_bytes_total[5m]) / node_network_speed_bytes) > .80)'
      for: 0m
      labels:
        severity: 警告
      annotations:
        summary: "主机网卡出站流量异常 (实例 {{ $labels.instance }})"
        description: "网卡发送带宽占用过高(> 80%)\n  当前值 = {{ $value }}\n  标签 = {{ $labels }}"

    # Fires immediately when the 5-minute rate of
    # node_disk_io_time_seconds_total exceeds 0.80.
    # NOTE(review): the alert name mentions reads, but the metric name is not
    # read-specific, and the rule 磁盘IO异常繁忙 in this group uses the same
    # expression with for: 5m — confirm both rules are wanted.
    - alert: 磁盘读取繁忙
      expr: '(rate(node_disk_io_time_seconds_total[5m]) > .80)'
      for: 0m
      labels:
        severity: 警告
      annotations:
        summary: "主机磁盘读取繁忙 (实例 {{ $labels.instance }})"
        description: "磁盘IO占用过高(IO等待 > 80%)\n  当前值 = {{ $value }}\n  标签 = {{ $labels }}"

    # Fires when available/size has been below 0.15 for 2 minutes. Filesystems
    # whose fstype matches fuse.*, tmpfs, cifs or nfs are excluded, and the
    # "and on (...)" clause keeps only filesystems that are not read-only.
    - alert: 磁盘空间不足
      expr: '(node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} / node_filesystem_size_bytes < .15 and on (instance, device, mountpoint) node_filesystem_readonly == 0)'
      for: 2m
      labels:
        severity: 警告
      annotations:
        summary: "主机磁盘空间不足 (实例 {{ $labels.instance }})"
        description: "磁盘空间即将耗尽(剩余 < 15%)\n  当前值 = {{ $value }}\n  标签 = {{ $labels }}"

    # Same expression as 磁盘空间不足 with a 0.005 threshold and a higher
    # severity label.
    - alert: 磁盘满了
      expr: '(node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} / node_filesystem_size_bytes < .005 and on (instance, device, mountpoint) node_filesystem_readonly == 0)'
      for: 2m
      labels:
        severity: 严重告警
      annotations:
        summary: "主机磁盘满了 (实例 {{ $labels.instance }})"
        description: "磁盘空间已耗尽赶快处理 \n  当前值 = {{ $value }}\n  标签 = {{ $labels }}"


    # Fires when a linear extrapolation of the last 3h of available bytes is
    # at or below zero 86400 seconds (24h) ahead, while some space is still
    # available now. The condition must hold for 2 minutes.
    - alert: 磁盘24小时内将占满
      expr: 'predict_linear(node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"}[3h], 86400) <= 0 and node_filesystem_avail_bytes > 0'
      for: 2m
      labels:
        severity: 警告
      annotations:
        summary: "主机磁盘24小时内将占满 (实例 {{ $labels.instance }})"
        description: "按当前速率,文件系统将在24小时内占满\n  当前值 = {{ $value }}\n  标签 = {{ $labels }}"

    # Fires when free/total inodes has been below 0.10 for 2 minutes on
    # filesystems that are not read-only.
    - alert: inode资源不足
      expr: '(node_filesystem_files_free / node_filesystem_files < .10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0)'
      for: 2m
      labels:
        severity: 严重告警
      annotations:
        summary: "主机inode资源不足 (实例 {{ $labels.instance }})"
        description: "文件inode即将耗尽(剩余 < 10%)\n  当前值 = {{ $value }}\n  标签 = {{ $labels }}"

    # Fires when node_filesystem_device_error equals 1 for 2 minutes. The
    # active expression additionally drops series whose device_error label
    # matches "permission denied"; the earlier expression without that
    # matcher is kept commented out below.
    - alert: 文件系统设备异常
      #expr: 'node_filesystem_device_error{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} == 1'
      expr: 'node_filesystem_device_error{fstype!~"^(fuse.*|tmpfs|cifs|nfs)",device_error!~".*[Pp]ermission denied.*"} == 1'
      for: 2m
      labels:
        severity: 严重告警
      annotations:
        summary: "主机文件系统设备异常 (实例 {{ $labels.instance }})"
        description: "文件系统 {{ $labels.mountpoint }} 状态异常\n  当前值 = {{ $value }}\n  标签 = {{ $labels }}"

    # Inode counterpart of 磁盘24小时内将占满: extrapolates free inodes from
    # the last 1h of samples to 24h ahead.
    - alert: inode24小时内将耗尽
      expr: 'predict_linear(node_filesystem_files_free{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"}[1h], 86400) <= 0 and node_filesystem_files_free > 0'
      for: 2m
      labels:
        severity: 警告
      annotations:
        summary: "主机inode24小时内将耗尽 (实例 {{ $labels.instance }})"
        description: "按当前速率,inode将在24小时内耗尽\n  当前值 = {{ $value }}\n  标签 = {{ $labels }}"

    # Fires when read time divided by completed reads (both as 1-minute
    # rates) exceeds 0.1 for 2 minutes. The trailing clause keeps only devices
    # that actually completed reads in the window.
    - alert: 磁盘读取延迟过高
      expr: '(rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0)'
      for: 2m
      labels:
        severity: 警告
      annotations:
        summary: "主机磁盘读取延迟过高 (实例 {{ $labels.instance }})"
        description: "磁盘读取延迟增大(单次读取 > 100ms)\n  当前值 = {{ $value }}\n  标签 = {{ $labels }}"

    # Same check as above for writes.
    - alert: 磁盘写入延迟过高
      expr: '(rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0)'
      for: 2m
      labels:
        severity: 警告
      annotations:
        summary: "主机磁盘写入延迟过高 (实例 {{ $labels.instance }})"
        description: "磁盘写入延迟增大(单次写入 > 100ms)\n  当前值 = {{ $value }}\n  标签 = {{ $labels }}"

    # Fires when 1 minus the idle-mode rate (5m window, averaged over the cpu
    # label) has been above 0.80 for 1 minute.
    - alert: CPU负载过高
      expr: '1 - (avg without (cpu) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) > .80'
      for: 1m
      labels:
        severity: 警告
      annotations:
        summary: "主机CPU负载过高 (实例 {{ $labels.instance }})"
        description: "CPU使用率 > 80%\n  当前值 = {{ $value }}\n  标签 = {{ $labels }}"
        value: "CPU使用率 > 80% 持续1分钟"


    # Fires immediately when the steal-mode rate, averaged over the cpu label
    # and multiplied by 100, exceeds 10.
    - alert: CPU虚拟化抢占过高
      expr: 'avg without (cpu) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10'
      for: 0m
      labels:
        severity: 警告
      annotations:
        summary: "主机CPU虚拟化抢占过高 (实例 {{ $labels.instance }})"
        description: "CPU抢占时间 > 10%,宿主机资源争抢严重\n  当前值 = {{ $value }}\n  标签 = {{ $labels }}"

    # Fires immediately when the iowait-mode rate, averaged over the cpu
    # label, exceeds 0.10.
    - alert: CPU IO等待过高
      expr: 'avg without (cpu) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) > .10'
      for: 0m
      labels:
        severity: 警告
      annotations:
        summary: "主机CPU IO等待过高 (实例 {{ $labels.instance }})"
        description: "CPU IO等待 > 10%,磁盘性能可能瓶颈\n  当前值 = {{ $value }}\n  标签 = {{ $labels }}"

    # Fires when the 5-minute rate of node_disk_io_time_seconds_total has been
    # above 0.8 for 5 minutes.
    - alert: 磁盘IO异常繁忙
      expr: 'rate(node_disk_io_time_seconds_total[5m]) > 0.8'
      for: 5m
      labels:
        severity: 警告
      annotations:
        summary: "主机磁盘IO异常繁忙 (实例 {{ $labels.instance }})"
        description: "磁盘使用率 > 80%,请检查存储或提升IO能力\n  当前值 = {{ $value }}\n  标签 = {{ $labels }}"

    # Fires immediately when the 15-minute context-switch rate is more than
    # twice the 1-day rate. Both sides are divided by the number of idle-mode
    # node_cpu_seconds_total series (counted without the mode and cpu labels).
    - alert: 上下文切换过高
      expr: '(rate(node_context_switches_total[15m])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) / (rate(node_context_switches_total[1d])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) > 2'
      for: 0m
      labels:
        severity: 警告
      annotations:
        summary: "主机上下文切换过高 (实例 {{ $labels.instance }})"
        description: "节点上下文切换次数达到日均水平的2倍\n  当前值 = {{ $value }}\n  标签 = {{ $labels }}"

    # Fires when (1 - SwapFree/SwapTotal) * 100 has been above 80 for
    # 2 minutes.
    - alert: Swap分区占满
      expr: '((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80)'
      for: 2m
      labels:
        severity: 警告
      annotations:
        summary: "主机Swap分区即将占满 (实例 {{ $labels.instance }})"
        description: "Swap使用率 > 80%\n  当前值 = {{ $value }}\n  标签 = {{ $labels }}"

    # Fires immediately for every node_systemd_unit_state series with
    # state="failed" whose value is 1; the unit is reported via the name label.
    - alert: 系统服务崩溃
      expr: '(node_systemd_unit_state{state="failed"} == 1)'
      for: 0m
      labels:
        severity: 警告
      annotations:
        summary: "主机系统服务崩溃 (实例 {{ $labels.instance }})"
        description: "系统服务 {{ $labels.name }} 运行失败\n  当前值 = {{ $value }}\n  标签 = {{ $labels }}"


    # Fires when the sample value of node_uname_info changed within the last
    # hour.
    # NOTE(review): node_uname_info presumably carries the kernel release as a
    # label; if so, a kernel change creates a new series instead of changing
    # this one's value, and changes() stays 0 — confirm this rule can fire.
    - alert: 内核版本变更
      expr: 'changes(node_uname_info[1h]) > 0'
      for: 0m
      labels:
        severity: info
      annotations:
        summary: "主机内核版本变更 (实例 {{ $labels.instance }})"
        description: "实例内核版本已发生变化\n  当前值 = {{ $value }}\n  标签 = {{ $labels }}"

    # Fires immediately when node_vmstat_oom_kill increased during the last
    # 30 minutes.
    - alert: 触发OOM杀死进程
      expr: '(increase(node_vmstat_oom_kill[30m]) > 0)'
      for: 0m
      labels:
        severity: 警告
      annotations:
        summary: "主机触发OOM杀死进程 (实例 {{ $labels.instance }})"
        description: "系统内存耗尽,触发OOM机制杀死进程\n  当前值 = {{ $value }}\n  标签 = {{ $labels }}"

    # Fires immediately when the correctable EDAC error counter increased
    # during the last minute.
    - alert: 内存可纠正错误
      expr: '(increase(node_edac_correctable_errors_total[1m]) > 0)'
      for: 0m
      labels:
        severity: info
      annotations:
        summary: "主机内存可纠正错误 (实例 {{ $labels.instance }})"
        description: "短时间内检测到内存可纠正错误\n  当前值 = {{ $value }}\n  标签 = {{ $labels }}"

    # Compares the raw uncorrectable EDAC error counter against 0 (no time
    # window), so the alert stays active for as long as the counter is
    # non-zero.
    - alert: 内存不可纠正错误
      expr: '(node_edac_uncorrectable_errors_total > 0)'
      for: 0m
      labels:
        severity: 警告
      annotations:
        summary: "主机内存不可纠正错误 (实例 {{ $labels.instance }})"
        description: "检测到内存不可纠正错误,硬件可能故障\n  当前值 = {{ $value }}\n  标签 = {{ $labels }}"

    # Fires when receive errors divided by received packets (both as 2-minute
    # rates) has been above 0.01 for 2 minutes.
    - alert: 网卡接收错误
      expr: '(rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01)'
      for: 2m
      labels:
        severity: 警告
      annotations:
        summary: "主机网卡接收错误 (实例 {{ $labels.instance }})"
        description: "网卡 {{ $labels.device }} 接收包错误率过高\n  当前值 = {{ $value }}\n  标签 = {{ $labels }}"

    # Same check as above for the transmit direction.
    - alert: 网卡发送错误
      expr: '(rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01)'
      for: 2m
      labels:
        severity: 警告
      annotations:
        summary: "主机网卡发送错误 (实例 {{ $labels.instance }})"
        description: "网卡 {{ $labels.device }} 发送包错误率过高\n  当前值 = {{ $value }}\n  标签 = {{ $labels }}"

    # Fires when node_bonding_active differs from node_bonding_slaves for
    # 2 minutes.
    - alert: 网卡绑定状态降级
      expr: '((node_bonding_active - node_bonding_slaves) != 0)'
      for: 2m
      labels:
        severity: 警告
      annotations:
        summary: "主机网卡绑定状态降级 (实例 {{ $labels.instance }})"
        description: "网卡绑定 {{ $labels.device }} 状态异常\n  当前值 = {{ $value }}\n  标签 = {{ $labels }}"

    # Fires when conntrack entries divided by the entries limit has been above
    # 0.8 for 5 minutes.
    - alert: 连接跟踪表接近上限
      expr: '(node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8)'
      for: 5m
      labels:
        severity: 警告
      annotations:
        summary: "主机连接跟踪表接近上限 (实例 {{ $labels.instance }})"
        description: "conntrack连接数即将达到上限\n  当前值 = {{ $value }}\n  标签 = {{ $labels }}"

    # Fires when the clock offset has been beyond +/-0.05 for 10 minutes and
    # is not moving back towards zero: a positive offset with a non-negative
    # 5-minute derivative, or a negative offset with a non-positive one.
    - alert: 系统时钟偏移过大
      expr: '((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0))'
      for: 10m
      labels:
        severity: 警告
      annotations:
        summary: "主机系统时钟偏移过大 (实例 {{ $labels.instance }})"
        description: "系统时钟偏移异常,请检查NTP配置\n  当前值 = {{ $value }}\n  标签 = {{ $labels }}"

    # Fires when node_timex_sync_status dropped to 0 at some point in the last
    # minute and node_timex_maxerror_seconds is at least 16, for 2 minutes.
    - alert: 系统时钟未同步
      expr: '(min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16)'
      for: 2m
      labels:
        severity: 警告
      annotations:
        summary: "主机系统时钟未同步 (实例 {{ $labels.instance }})"
        description: "系统时钟未与NTP服务器同步\n  当前值 = {{ $value }}\n  标签 = {{ $labels }}"

    # Fires when node_procs_blocked has been above 0 for 5 minutes.
    # severity uses 警告, the value every other warning-level rule in this
    # group carries, so that routing on the severity label treats this alert
    # like the rest. The troubleshooting hint filters on the process state
    # column; a bare grep for "D" would match any line containing that letter.
    - alert: 进程阻塞
      expr: node_procs_blocked > 0
      for: 5m
      labels:
        severity: 警告
        alert_type: 系统进程阻塞
      annotations:
        summary: "实例 {{ $labels.instance }} 存在IO阻塞进程"
        description: "当前阻塞进程数:{{ $value }},可能原因:磁盘IO慢/磁盘故障/网络IO卡住,请登录机器执行 `ps -eo pid,stat,wchan:32,cmd | awk '$2 ~ /^D/'` 查看具体阻塞进程"

 

cadvisor  docker容器告警规则

# Rule group: Docker container alerts evaluated over container_* metrics.
groups:
- name: Docker容器监控
  rules:
  # Fires immediately when a container series (non-empty name and image, name
  # other than "cadvisor") was last seen more than 60 seconds ago.
  - alert: 容器已被杀死
    expr: container_last_seen{name!="", image!="", name!="cadvisor"} and time() - container_last_seen > 60
    for: 0m
    labels:
      severity: 警告
    annotations:
      # Earlier summary/description variants, kept commented out for reference.
      #summary: "容器已消失(实例 {{ $labels.instance }})"
      #description: "容器长时间未采集到数据,可能已被杀死或退出\n  当前值 = {{ $value }}\n  标签 = {{ $labels }}"
      summary: "容器 {{ $labels.name }} 已消失(节点 {{ $labels.instance }})"
      description: |
        容器长时间未采集到监控数据,可能已被杀死或异常退出
        容器镜像:{{ $labels.image }}
        容器名称:{{ $labels.name }}
        节点地址:{{ $labels.instance }}

  # Fires when a named container's CPU usage rate (3m window, summed per
  # container, times 100) has been above 80 for 5 minutes.
  # image is part of the grouping so that {{ $labels.image }} in the
  # description is populated; grouping by (instance, name) alone drops it.
  - alert: 容器CPU使用率过高
    # Earlier expression without the name!="" filter, kept for reference.
    #expr: (sum(rate(container_cpu_usage_seconds_total[3m])) BY (instance, name) * 100) > 80
    expr: sum(rate(container_cpu_usage_seconds_total{name!=""}[3m])) BY (instance, name, image) * 100 > 80
    for: 5m
    labels:
      severity: 警告
    annotations:
      summary: "容器CPU使用率过高(实例 {{ $labels.instance }})"
      # A literal block scalar (|) does no escape processing, so line breaks
      # are written as real newlines instead of "\n".
      description: |
        容器CPU使用率超过80%
          当前值 = {{ $value }}
          标签 = {{ $labels }}
        容器镜像:{{ $labels.image }}
        容器名称:{{ $labels.name }}
        节点地址:{{ $labels.instance }}

  # Fires when a named container's memory usage has been above 80% of its
  # memory limit for 5 minutes.
  # Containers without a limit (limit == 0) are filtered out before the
  # division. The previous form appended
  # "and container_spec_memory_limit_bytes{name!=""} > 0" without an on(...)
  # clause; the aggregated left side only carries (instance, name), so it
  # never matched the fully-labelled right side and the alert could not fire.
  # image is kept in the grouping because the description prints it.
  - alert: 容器内存使用率过高
    expr: |
      sum(container_memory_usage_bytes{name!=""}) BY (instance, name, image)
      / sum(container_spec_memory_limit_bytes{name!=""} > 0) BY (instance, name, image)
      * 100 > 80
    for: 5m
    labels:
      severity: 警告
    annotations:
      summary: "容器内存使用率过高(实例 {{ $labels.instance }})"
      description: "容器内存使用率超过80%\n  当前值 = {{ $value }}\n  容器镜像:{{ $labels.image }} 容器名称:{{ $labels.name }} 节点地址:{{ $labels.instance }} 标签 = {{ $labels }}"

  # Fires when the used-inode percentage per instance has been above 80 for
  # 5 minutes.
  # The subtraction is parenthesised before scaling: "*" binds tighter than
  # "-", so "1 - (free / total) * 100" evaluates to at most 1 and could never
  # exceed 80.
  # NOTE(review): the aggregation keeps only the instance label, so
  # {{ $labels.image }} and {{ $labels.name }} in the description render
  # empty — confirm whether per-container grouping is wanted.
  - alert: 容器磁盘使用率过高
    expr: (1 - sum(container_fs_inodes_free) BY (instance) / sum(container_fs_inodes_total) BY (instance)) * 100 > 80
    for: 5m
    labels:
      severity: 警告
    annotations:
      summary: "容器磁盘使用率过高(实例 {{ $labels.instance }})"
      description: "容器磁盘inode使用率超过80%\n  当前值 = {{ $value }}\n  容器镜像:{{ $labels.image }} 容器名称:{{ $labels.name }} 节点地址:{{ $labels.instance }} 标签 = {{ $labels }}"

  # Fires when container_fs_io_current, summed per (instance, name) and
  # multiplied by 100, has been above 80 for 5 minutes.
  # NOTE(review): the description calls this a usage percentage, but the
  # expression only scales the summed metric by 100 — confirm the metric's
  # unit. The grouping also drops the image label the description prints.
  - alert: 容器磁盘IO使用率过高
    expr: (sum(container_fs_io_current) BY (instance, name) * 100) > 80
    for: 5m
    labels:
      severity: 警告
    annotations:
      summary: "容器磁盘IO使用率过高(实例 {{ $labels.instance }})"
      description: "容器当前磁盘IO使用率超过80%\n  当前值 = {{ $value }}\n  容器镜像:{{ $labels.image }} 容器名称:{{ $labels.name }} 节点地址:{{ $labels.instance }}  标签 = {{ $labels }}"

  # Fires when the 3-minute rate of container_cpu_cfs_throttled_seconds_total
  # has been above 1 for 5 minutes.
  - alert: 容器CPU被频繁限流
    expr: rate(container_cpu_cfs_throttled_seconds_total[3m]) > 1
    for: 5m
    labels:
      severity: 警告
    annotations:
      summary: "容器CPU被频繁限流(实例 {{ $labels.instance }})"
      description: "容器CPU因资源限制被频繁节流,可能影响性能\n  当前值 = {{ $value }}\n  容器镜像:{{ $labels.image }} 容器名称:{{ $labels.name }} 节点地址:{{ $labels.instance }}  标签 = {{ $labels }}"


  # Fires when a named container's receive byte rate (5m window, summed per
  # name and instance) has been above 10485760 bytes/s for 2 minutes.
  - alert: 容器网络入站流量异常
    expr: sum(rate(container_network_receive_bytes_total{name!=""}[5m])) by (name, instance) > 10485760  # inbound > 10 MB/s
    for: 2m
    labels:
      severity: 警告
      category: docker
    annotations:
      summary: "容器入站网络流量异常 ({{ $labels.name }})"
      description: "容器 {{ $labels.name }} 在主机 {{ $labels.instance }} 上入站流量超过10MB/s (当前值: {{ $value | humanize }}/s),持续2分钟。"
      suggestion: "检查是否有异常请求或网络攻击,限制容器网络带宽。"
      
  # Fires immediately when a named container that started more than 300
  # seconds ago shows zero combined transmit + receive byte rate over the last
  # 2 minutes.
  # The two sides are joined with "and on (instance, name)": the left side is
  # aggregated down to (instance, name) while container_start_time_seconds
  # keeps all of its labels, so a bare "and" never matched and the alert could
  # not fire.
  - alert: Docker容器疑似假死
    expr: |
      sum(
        rate(container_network_transmit_bytes_total{name!=""}[2m]) +
        rate(container_network_receive_bytes_total{name!=""}[2m])
      ) BY (instance, name) == 0
      and on (instance, name) time() - container_start_time_seconds{name!=""} > 300
    for: 0m
    labels:
      severity: 警告
    annotations:
      summary: "容器无网络流量 ({{ $labels.instance }} - {{ $labels.name }})"
      description: "容器 {{ $labels.name }} 近2分钟网络收发流量均为0,业务可能异常。"
      value: "容器{{$labels.name}}  已持续2分钟无数据"

 

redis exporter 告警规则

# Rule group: Redis alerts evaluated over redis_* metrics.
groups:
  - name: redis告警
    rules:
      # Fires when redis_up for job "redis-server" has been 0 for 1 minute.
      - alert: Redis实例下线
        expr: redis_up{job="redis-server"} == 0
        for: 1m
        labels:
          severity: 严重告警
        annotations:
          summary: "Redis {{ $labels.instance }} 已下线"
          description: "Redis 实例持续不可用超过 1 分钟,请立即检查服务状态"


      # Fires immediately when more than 5 slow-log entries were recorded in
      # the last 5 minutes.
      # Uses redis_slowlog_last_id (the ID of the newest slow-log entry, which
      # only grows) instead of redis_slowlog_length: the length is a gauge
      # capped by slowlog-max-len, so increase() on it stops seeing new
      # entries once the log is full and misreads a SLOWLOG RESET.
      - alert: Redis慢查询增多
        expr: increase(redis_slowlog_last_id[5m]) > 5
        for: 0m
        labels:
          severity: 警告
        annotations:
          summary: "Redis {{ $labels.instance }} 慢查询数量突增"
          description: "5 分钟内慢查询数量新增超过 5 条,请检查慢查询语句"

      # Fires when connected clients divided by the configured maxclients,
      # times 100, has been above 80 for 2 minutes.
      - alert: Redis连接数过高
        expr: redis_connected_clients / redis_config_maxclients * 100 > 80
        for: 2m
        labels:
          severity: 警告
        annotations:
          summary: "Redis {{ $labels.instance }} 连接数过高"
          description: "Redis 连接使用率已超过 80% 并持续 2 分钟\n当前使用率:{{ $value | printf \"%.2f\" }}%"

      # Fires immediately when redis_rejected_connections_total increased
      # during the last 5 minutes.
      - alert: Redis拒绝新连接
        expr: increase(redis_rejected_connections_total[5m]) > 0
        for: 0m
        labels:
          severity: 严重告警
        annotations:
          summary: "Redis {{ $labels.instance }} 拒绝新连接"
          description: "5 分钟内检测到 Redis 拒绝新连接请求,已达到最大连接数上限"

      # Fires when the keyspace hit ratio over the last 5 minutes has been
      # below 50% for 5 minutes.
      # The ratio is built from rate() of the hit/miss counters. Dividing the
      # raw *_total counters gives the ratio since the counters started, which
      # barely moves on a long-running instance and cannot reflect a recent
      # drop. With no lookups in the window the result is NaN, which does not
      # satisfy "< 50", so idle instances do not alert.
      - alert: Redis缓存命中率过低
        expr: |
          rate(redis_keyspace_hits_total[5m])
          / (rate(redis_keyspace_hits_total[5m]) + rate(redis_keyspace_misses_total[5m]))
          * 100 < 50
        for: 5m
        labels:
          severity: 警告
        annotations:
          summary: "Redis {{ $labels.instance }} 缓存命中率过低"
          description: "Redis 缓存命中率低于 50% 并持续 5 分钟\n当前命中率:{{ $value | printf \"%.2f\" }}%"

 

posted @ 2026-03-16 10:50  追梦$少年  阅读(1)  评论(0)    收藏  举报