Initial commit
This commit is contained in:
88
skills/toolkit/bash/analysis/analyze-logs.meta.yml
Normal file
88
skills/toolkit/bash/analysis/analyze-logs.meta.yml
Normal file
@@ -0,0 +1,88 @@
|
||||
tool_id: BASH-ANALYZE-LOGS-002
|
||||
tool_name: "日志分析器"
|
||||
|
||||
基本信息:
|
||||
语言: bash
|
||||
文件: analyze-logs.sh
|
||||
复杂度: level-2
|
||||
创建日期: 2025-11-14
|
||||
作者: CodeConscious
|
||||
|
||||
用途分类:
|
||||
- DATA # 数据分析
|
||||
- MONITOR # 监控诊断
|
||||
|
||||
功能描述:
|
||||
简介: "分析日志文件,按级别过滤、时间范围筛选、模式匹配和错误统计"
|
||||
详细: |
|
||||
支持功能:
|
||||
- 按日志级别统计(ERROR/WARN/INFO/DEBUG)
|
||||
- 按关键词模式搜索(正则表达式)
|
||||
- 错误模式识别(超时、连接错误、内存问题)
|
||||
- 统计汇总和建议生成
|
||||
- 支持时间范围筛选(需要日志包含日期)
|
||||
|
||||
使用场景:
|
||||
- "分析生产环境错误日志,找到崩溃原因"
|
||||
- "统计API接口错误频率和类型分布"
|
||||
- "监控服务健康状况,识别异常模式"
|
||||
- "排查性能问题,定位慢请求和超时"
|
||||
- "验证修复效果,比较修复前后的日志"
|
||||
|
||||
使用方法:
|
||||
命令: "bash analyze-logs.sh <日志文件> [模式] [级别] [日期范围]"
|
||||
参数:
|
||||
日志文件: "日志文件路径(必需)"
|
||||
模式: "要搜索的正则表达式(可选)"
|
||||
级别: "日志级别: ERROR/WARN/INFO/DEBUG(默认: INFO)"
|
||||
日期范围: "日期范围, 如: 2025-11-01~2025-11-14(可选,要求每行日志以 YYYY-MM-DD 格式的日期开头)"
|
||||
示例:
|
||||
- "分析应用日志: bash analyze-logs.sh /var/log/app.log"
|
||||
- "搜索错误: bash analyze-logs.sh /var/log/app.log 'timeout|error' ERROR"
|
||||
- "按时间筛选: bash analyze-logs.sh /var/log/app.log 'database' WARN 2025-11-01~2025-11-07"
|
||||
|
||||
依赖要求:
|
||||
系统命令:
|
||||
- bash: 支持bash 4.0+
|
||||
- awk: 文本处理
|
||||
- grep: 模式匹配
|
||||
- wc: 计数
|
||||
- du: 文件大小
|
||||
|
||||
输入输出:
|
||||
输入:
|
||||
- 类型: 文本文件
|
||||
- 格式: 任意日志格式(支持自定义分析)
|
||||
- 示例: |
|
||||
2025-11-14 10:30:00 [INFO] User login successful
|
||||
2025-11-14 10:30:01 [ERROR] Database connection timeout
|
||||
2025-11-14 10:30:02 [WARN] High memory usage: 85%
|
||||
输出:
|
||||
- stdout: 统计摘要、错误分析、建议
|
||||
- 格式: 人类可读文本 + 颜色高亮
|
||||
|
||||
上次使用:
|
||||
时间: 2025-11-14 11:00:00
|
||||
用途: "分析auth-service崩溃日志,找到连接池配置问题"
|
||||
结果: "成功识别47次超时错误,12次连接数过多错误,定位到连接池不足问题"
|
||||
满意度: 0.95
|
||||
|
||||
相关工具:
|
||||
- 前置工具: 无
|
||||
- 互补工具:
|
||||
- toolkit/bash/system/check-service.sh(服务健康检查)
|
||||
- toolkit/python/analysis/analyze_logs.py(复杂日志解析)
|
||||
- 替代工具:
|
||||
- grep + wc(手动统计)
|
||||
|
||||
维护记录:
|
||||
2025-11-14:
|
||||
- 初始创建
|
||||
- 支持级别统计、模式匹配、错误模式识别
|
||||
2025-11-15:
|
||||
- 添加时间范围筛选功能(实验性)
|
||||
- 优化颜色输出
|
||||
TODO:
|
||||
- 支持JSON日志格式解析
|
||||
- 添加图表可视化(ASCII)
|
||||
- 支持日志文件压缩格式(.gz)
|
||||
171
skills/toolkit/bash/analysis/analyze-logs.sh
Executable file
171
skills/toolkit/bash/analysis/analyze-logs.sh
Executable file
@@ -0,0 +1,171 @@
|
||||
#!/bin/bash
# Log analyzer - extracts and analyzes information from a log file.
#
# Positional arguments:
#   $1  log file path
#   $2  search pattern (optional)
#   $3  log level to list (defaults to INFO)
#   $4  date range (optional)

set -e

# Positional arguments; omitted ones fall back to empty (or INFO for the level).
LOG_FILE=${1-}
PATTERN=${2-}
LEVEL=${3:-INFO}
DATE_RANGE=${4-}

# ANSI colour escapes, stored as literal backslash sequences that are
# rendered later by `echo -e`.
RED="\033[0;31m"
GREEN="\033[0;32m"
YELLOW="\033[1;33m"
BLUE="\033[0;34m"
NC="\033[0m" # No Color
|
||||
|
||||
# Help text
#
# usage
# Writes the usage summary, argument list and examples to stdout.
# $0 is expanded so the examples show the name the script was invoked with.
usage() {
  cat <<EOF
📝 日志分析器

使用: $0 <日志文件> [模式] [级别] [日期范围]

参数:
 日志文件 日志文件路径 (必需)
 模式 要搜索的正则表达式 (可选)
 级别 日志级别: ERROR/WARN/INFO/DEBUG (默认: INFO)
 日期范围 日期范围, 如: 2025-11-01~2025-11-14 (可选)

示例:
 $0 /var/log/app.log
 $0 /var/log/app.log 'timeout|error' ERROR
 $0 /var/log/app.log 'database' WARN 2025-11-01~2025-11-07

EOF
}
|
||||
|
||||
# Argument validation
# A log file path is mandatory; print the help text and fail without one.
if [ -z "$LOG_FILE" ]; then
  echo "❌ 错误: 请提供日志文件路径"
  usage
  exit 1
fi

# The path must refer to an existing regular file.
if [ ! -f "$LOG_FILE" ]; then
  echo "❌ 错误: 文件不存在: $LOG_FILE"
  exit 1
fi

# Report header: file name, size on disk and line count.
echo "📝 日志分析器"
echo "=========================================="
# The colour variables hold literal "\033[..." text, so they must go through
# %b (or echo -e) to be rendered; a plain echo printed the raw escape text.
# The file name itself is passed through %s so backslashes in it stay literal.
printf '文件: %b%s%b\n' "$BLUE" "$LOG_FILE" "$NC"
echo "大小: $(du -h "$LOG_FILE" | cut -f1)"
echo "行数: $(wc -l < "$LOG_FILE")"
echo "=========================================="
|
||||
|
||||
# Per-level statistics and a sample of the selected level.

# count_level_lines TAG FILE
# Prints how many lines of FILE contain TAG. Prints 0 when nothing matches
# or when FILE cannot be read.
count_level_lines() {
  local total
  # grep -c already prints "0" (and exits 1) when no line matches, so only
  # the exit status is neutralised. Echoing an extra fallback "0" produced
  # the two-line value "0\n0", which broke later integer tests and sums.
  total=$(grep -c -- "$1" "$2" 2>/dev/null) || true
  printf '%s\n' "${total:-0}"
}

echo ""
echo "📊 按级别统计:"
echo "=========================================="

# These counters are read again by the summary and suggestion sections.
ERROR_COUNT=$(count_level_lines "ERROR" "$LOG_FILE")
WARN_COUNT=$(count_level_lines "WARN" "$LOG_FILE")
INFO_COUNT=$(count_level_lines "INFO" "$LOG_FILE")
DEBUG_COUNT=$(count_level_lines "DEBUG" "$LOG_FILE")

echo -e " ${RED}ERROR${NC}: $ERROR_COUNT"
echo -e " ${YELLOW}WARN${NC}: $WARN_COUNT"
echo -e " ${GREEN}INFO${NC}: $INFO_COUNT"
echo -e " ${BLUE}DEBUG${NC}: $DEBUG_COUNT"

# Show the first ten lines that mention the requested level.
echo ""
echo "=========================================="
echo "📋 ${LEVEL} 级别日志 (前10条):"
echo "=========================================="
grep "$LEVEL" "$LOG_FILE" | head -10
|
||||
|
||||
# Pattern search
#
# report_pattern_matches
# When PATTERN is non-empty, prints the number of lines of LOG_FILE matching
# it and the first five matching lines. Sets the global MATCH_COUNT, which
# the summary section reads.
report_pattern_matches() {
  [ -n "$PATTERN" ] || return 0

  echo ""
  echo "=========================================="
  echo "🔍 匹配模式 '${PATTERN}':"
  echo "=========================================="

  # -E: the usage text advertises patterns such as 'timeout|error', which is
  #     alternation only in extended regex syntax.
  # --: keeps a pattern that starts with "-" from being parsed as an option.
  # grep -c prints "0" itself on no match, so only its exit status is ignored.
  MATCH_COUNT=$(grep -cE -- "$PATTERN" "$LOG_FILE" 2>/dev/null) || true
  MATCH_COUNT=${MATCH_COUNT:-0}

  echo "匹配数量: $MATCH_COUNT"
  echo ""
  echo "示例:"
  grep -E -- "$PATTERN" "$LOG_FILE" | head -5
}

report_pattern_matches
|
||||
|
||||
# Date-range filter
#
# check_date_range
# When DATE_RANGE ("START~END", both YYYY-MM-DD) is non-empty, prints up to
# ten lines of LOG_FILE whose leading date lies inside the inclusive range.
# Lines that do not begin with a YYYY-MM-DD date are skipped.
check_date_range() {
  [ -n "$DATE_RANGE" ] || return 0

  local range_start range_rest range_end
  # Split on "~". The tilde is quoted so it is never tilde-expanded.
  range_start=${DATE_RANGE%%"~"*}
  range_rest=${DATE_RANGE#*"~"}
  range_end=${range_rest%%"~"*}

  echo ""
  echo "=========================================="
  echo "📅 时间范围 $range_start ~ $range_end:"
  echo "=========================================="

  # Compare only the 10-character date prefix. Comparing the whole line
  # made "2025-11-07 18:00:00 ..." sort after the bound "2025-11-07", so
  # every entry logged on the end date was dropped.
  awk -v start="$range_start" -v end="$range_end" '
    {
      day = substr($0, 1, 10)
      if (day ~ /^[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]$/ &&
          day >= start && day <= end) {
        print
      }
    }' "$LOG_FILE" | head -10
}

check_date_range
|
||||
|
||||
# Common error pattern analysis
#
# analyze_error_patterns
# Counts lines of LOG_FILE that mention timeouts, refused connections or
# memory, and prints each category that occurs at least once.
# Sets the globals TIMEOUT_COUNT, CONN_ERROR_COUNT and MEMORY_ERROR_COUNT,
# which the suggestion section reads.
analyze_error_patterns() {
  echo ""
  echo "=========================================="
  echo "⚠️ 常见错误模式分析:"
  echo "=========================================="

  # grep -c prints "0" itself (exit status 1) when nothing matches, so only
  # the status is ignored. The former `|| echo "0"` produced "0\n0" and made
  # the integer tests below fail with "integer expression expected".

  # Timeouts
  TIMEOUT_COUNT=$(grep -cE 'timeout|Timeout|TIMEOUT' "$LOG_FILE" 2>/dev/null) || true
  TIMEOUT_COUNT=${TIMEOUT_COUNT:-0}
  if [ "$TIMEOUT_COUNT" -gt 0 ]; then
    echo -e "⏱️ 超时错误: ${RED}$TIMEOUT_COUNT${NC}次"
  fi

  # Refused connections
  CONN_ERROR_COUNT=$(grep -cE 'connection refused|Connection refused|ECONNREFUSED' "$LOG_FILE" 2>/dev/null) || true
  CONN_ERROR_COUNT=${CONN_ERROR_COUNT:-0}
  if [ "$CONN_ERROR_COUNT" -gt 0 ]; then
    echo -e "🔗 连接错误: ${RED}$CONN_ERROR_COUNT${NC}次"
  fi

  # Memory-related lines
  MEMORY_ERROR_COUNT=$(grep -cE 'OutOfMemory|memory|Memory' "$LOG_FILE" 2>/dev/null) || true
  MEMORY_ERROR_COUNT=${MEMORY_ERROR_COUNT:-0}
  if [ "$MEMORY_ERROR_COUNT" -gt 0 ]; then
    echo -e "🧠 内存问题: ${RED}$MEMORY_ERROR_COUNT${NC}次"
  fi
}

analyze_error_patterns
|
||||
|
||||
# Summary, recommendations and closing banner.
section_rule="=========================================="

# Totals: ERROR plus WARN lines, then ERROR lines alone.
printf '\n%s\n%s\n%s\n' "$section_rule" "📊 统计总结:" "$section_rule"
printf '%s\n' "总错误数: $((ERROR_COUNT + WARN_COUNT))"
printf '%s\n' "关键错误数: $ERROR_COUNT"

# The pattern line only appears when a pattern was supplied.
if [ -n "$PATTERN" ]; then
  printf "模式'%s'匹配: %s 次\n" "$PATTERN" "$MATCH_COUNT"
fi

# Recommendations derived from the counters gathered earlier.
printf '\n%s\n%s\n%s\n' "$section_rule" "💡 建议:" "$section_rule"

if [ "$ERROR_COUNT" -gt 10 ]; then
  printf '%s\n' " - ⚠️ ERROR数量较多,建议立即调查"
fi

if [ "$TIMEOUT_COUNT" -gt 5 ]; then
  printf '%s\n' " - ⚠️ 频繁超时,检查网络或服务响应"
fi

if [ "$CONN_ERROR_COUNT" -gt 3 ]; then
  printf '%s\n' " - ⚠️ 连接问题,验证服务状态和配置"
fi

# Healthy only when both errors and warnings stay low.
if [ "$ERROR_COUNT" -le 5 ] && [ "$WARN_COUNT" -le 10 ]; then
  printf '%s\n' " - ✅ 日志健康状况良好"
fi

printf '\n%s\n%s\n%s\n' "$section_rule" "日志分析完成" "$section_rule"
|
||||
86
skills/toolkit/bash/system/check-disk.meta.yml
Normal file
86
skills/toolkit/bash/system/check-disk.meta.yml
Normal file
@@ -0,0 +1,86 @@
|
||||
tool_id: BASH-CHECK-DISK-003
|
||||
tool_name: "磁盘健康检查器"
|
||||
|
||||
基本信息:
|
||||
语言: bash
|
||||
文件: check-disk.sh
|
||||
复杂度: level-2
|
||||
创建日期: 2025-11-14
|
||||
作者: CodeConscious
|
||||
|
||||
用途分类:
|
||||
- MONITOR # 监控诊断
|
||||
|
||||
功能描述:
|
||||
简介: "检查磁盘空间、inode使用和健康状态,提供详细的分析和建议"
|
||||
详细: |
|
||||
支持功能:
|
||||
- 磁盘空间使用检查(总容量、已使用、可用)
|
||||
- Inode使用检查(总数、已使用、可用)
|
||||
- 使用阈值警报(可配置阈值,默认80%)
|
||||
- 文件系统类型检测(ext4, xfs等)
|
||||
- 挂载点和访问权限检查
|
||||
- 性能指标分析(大文件 vs 小文件占比)
|
||||
- 智能建议生成(基于检查结果)
|
||||
|
||||
使用场景:
|
||||
- "监控服务器磁盘空间,预防空间不足导致的服务中断"
|
||||
- "检查日志目录磁盘使用情况,规划日志轮转策略"
|
||||
- '诊断"No space left on device"错误的根本原因'
|
||||
- "定期检查系统健康状态,识别潜在问题"
|
||||
- "在扩容前评估当前磁盘使用情况"
|
||||
- "排查inode耗尽问题(大量小文件导致)"
|
||||
|
||||
使用方法:
|
||||
命令: "bash check-disk.sh [路径] [阈值]"
|
||||
参数:
|
||||
路径: "要检查的磁盘路径(默认: /)"
|
||||
阈值: "空间使用警报阈值百分比(默认: 80%,1-100之间)"
|
||||
示例:
|
||||
- "检查根目录(阈值80%): bash check-disk.sh"
|
||||
- "检查/home(阈值85%): bash check-disk.sh /home 85"
|
||||
- "检查/var/log(阈值90%): bash check-disk.sh /var/log 90"
|
||||
|
||||
依赖要求:
|
||||
系统命令:
|
||||
- bash: 4.0+
|
||||
- df: 磁盘使用情况
|
||||
- du: 文件大小
|
||||
- find: 文件查找
|
||||
- awk: 文本处理
|
||||
- sort: 排序
|
||||
|
||||
输入输出:
|
||||
输入:
|
||||
- 无(从文件系统自动获取)
|
||||
输出:
|
||||
- stdout: 彩色格式化报告(空间使用、inode、警报、建议)
|
||||
- 退出码:
|
||||
- 0: 检查完成(当前实现即使发现警报也返回0,警报仅通过输出展示)
|
||||
- 1: 有警报(尚未实现;当前实现仅在参数错误时返回1,如路径不存在或阈值无效)
|
||||
- 2: 严重问题(预留,当前实现未使用)
|
||||
|
||||
上次使用:
|
||||
时间: 2025-11-14 11:30:00
|
||||
用途: "检查服务器根目录磁盘空间,发现/var/log占用过大"
|
||||
结果: "成功识别日志目录占用85%,建议清理旧日志后立即恢复20%空间"
|
||||
满意度: 0.93
|
||||
|
||||
相关工具:
|
||||
- 前置工具: 无
|
||||
- 互补工具:
|
||||
- toolkit/bash/analysis/analyze-logs.sh(日志分析)
|
||||
- toolkit/bash/system/check-service.sh(服务健康)
|
||||
- 替代工具:
|
||||
- df -h(手动检查)
|
||||
- ncdu(交互式磁盘使用分析)
|
||||
|
||||
维护记录:
|
||||
2025-11-14:
|
||||
- 初始创建
|
||||
- 支持空间和inode检查
|
||||
- 添加性能指标分析
|
||||
TODO:
|
||||
- 支持磁盘健康度检测(S.M.A.R.T.)
|
||||
- 添加磁盘I/O性能测试
|
||||
- 支持磁盘温度监测
|
||||
246
skills/toolkit/bash/system/check-disk.sh
Executable file
246
skills/toolkit/bash/system/check-disk.sh
Executable file
@@ -0,0 +1,246 @@
|
||||
#!/bin/bash
# Disk health checker - inspects disk space, inode usage and health state.
#
# Positional arguments:
#   $1  path to inspect (defaults to /)
#   $2  usage alert threshold in percent (defaults to 80)

set -e

DISK_PATH=${1:-/}
THRESHOLD=${2:-80}

# ANSI colour escapes, stored as literal backslash sequences that are
# rendered later by `echo -e`.
RED="\033[0;31m"
GREEN="\033[0;32m"
YELLOW="\033[1;33m"
BLUE="\033[0;34m"
CYAN="\033[0;36m"
NC="\033[0m" # No Color
|
||||
|
||||
# Help text
#
# usage
# Writes the usage summary, argument list and examples to stdout, one
# printf argument per output line.
usage() {
  printf '%s\n' \
    "💾 磁盘健康检查器" \
    "" \
    "使用: $0 [路径] [阈值]" \
    "" \
    "参数:" \
    " 路径 要检查的磁盘路径 (默认: /)" \
    " 阈值 空间使用警报阈值百分比 (默认: 80%)" \
    "" \
    "示例:" \
    " $0 # 检查根目录,阈值80%" \
    " $0 /home 85 # 检查/home目录,阈值85%" \
    " $0 /var/log 90 # 检查日志目录,阈值90%" \
    ""
}
|
||||
|
||||
# Argument handling
# Help flags are answered before any validation.
case "${1-}" in
  -h | --help)
    usage
    exit 0
    ;;
esac

# The path to inspect must be an existing directory.
if [ ! -d "$DISK_PATH" ]; then
  echo "❌ 错误: 路径不存在: $DISK_PATH"
  exit 1
fi

# The threshold must be a whole number between 1 and 100.
if ! [[ "$THRESHOLD" =~ ^[0-9]+$ ]] || [ "$THRESHOLD" -lt 1 ] || [ "$THRESHOLD" -gt 100 ]; then
  echo "❌ 错误: 阈值必须是1-100之间的数字"
  exit 1
fi

# Report header.
echo "💾 磁盘健康检查器"
echo "=========================================="
# The colour variables hold literal "\033[..." text, so they must go through
# %b (or echo -e) to be rendered; a plain echo printed the raw escape text.
# The path itself is passed through %s so backslashes in it stay literal.
printf '检查路径: %b%s%b\n' "$BLUE" "$DISK_PATH" "$NC"
echo "警报阈值: ${THRESHOLD}%"
echo "=========================================="
echo ""
|
||||
|
||||
# Disk space usage
#
# report_disk_usage
# Reads:  DISK_PATH, THRESHOLD and the colour variables.
# Writes: DISK_INFO, DISK_DEVICE, DISK_SIZE, DISK_USED, DISK_AVAIL,
#         DISK_USAGE_PERCENT, USAGE_COLOR, STATUS.
# Prints the device, sizes and a colour-coded usage percentage.
report_disk_usage() {
  # -P requests the POSIX output format: exactly one line per filesystem,
  # so the field positions below hold even for long device names.
  DISK_INFO=$(df -hP "$DISK_PATH" | tail -1)

  # Split the line once instead of spawning awk per field.
  read -r DISK_DEVICE DISK_SIZE DISK_USED DISK_AVAIL DISK_USAGE_PERCENT _ <<< "$DISK_INFO"
  DISK_USAGE_PERCENT=${DISK_USAGE_PERCENT%\%}

  # Some filesystems report "-" (or df may fail); fall back to 0 so the
  # integer comparisons in this and later sections stay valid.
  case "$DISK_USAGE_PERCENT" in
    '' | *[!0-9]*) DISK_USAGE_PERCENT=0 ;;
  esac

  echo "📊 磁盘使用概况:"
  echo "=========================================="
  echo -e "设备: ${CYAN}$DISK_DEVICE${NC}"
  echo -e "总容量: ${BLUE}$DISK_SIZE${NC}"
  echo -e "已使用: ${YELLOW}$DISK_USED${NC}"
  echo -e "可用: ${GREEN}$DISK_AVAIL${NC}"

  # Above 90% is always critical; above the user threshold is a warning.
  if [ "$DISK_USAGE_PERCENT" -gt 90 ]; then
    USAGE_COLOR="$RED"
    STATUS="🔴 严重"
  elif [ "$DISK_USAGE_PERCENT" -gt "$THRESHOLD" ]; then
    USAGE_COLOR="$YELLOW"
    STATUS="🟡 警告"
  else
    USAGE_COLOR="$GREEN"
    STATUS="🟢 正常"
  fi

  echo -e "使用率: ${USAGE_COLOR}${DISK_USAGE_PERCENT}%${NC} $STATUS"
  echo ""
}

report_disk_usage
|
||||
|
||||
# Inode usage
#
# report_inode_usage
# Reads:  DISK_PATH, THRESHOLD and the colour variables.
# Writes: INODE_INFO, INODE_TOTAL, INODE_USED, INODE_AVAIL,
#         INODE_USED_PERCENT, INODE_COLOR, INODE_STATUS.
# Prints the inode totals and a colour-coded usage percentage.
report_inode_usage() {
  local fs_name

  echo "=========================================="
  echo "📂 Inode使用概况:"
  echo "=========================================="

  # -P requests the POSIX one-line-per-filesystem layout so the field
  # positions are stable.
  INODE_INFO=$(df -iP "$DISK_PATH" | tail -1)
  read -r fs_name INODE_TOTAL INODE_USED INODE_AVAIL INODE_USED_PERCENT _ <<< "$INODE_INFO"
  INODE_USED_PERCENT=${INODE_USED_PERCENT%\%}

  # Filesystems without a fixed inode table report "-"; fall back to 0 so
  # the integer comparisons in this and later sections stay valid.
  case "$INODE_USED_PERCENT" in
    '' | *[!0-9]*) INODE_USED_PERCENT=0 ;;
  esac

  echo -e "总数: ${BLUE}$INODE_TOTAL${NC}"
  echo -e "已使用: ${YELLOW}$INODE_USED${NC}"
  echo -e "可用: ${GREEN}$INODE_AVAIL${NC}"

  # Above 90% is always critical; above the user threshold is a warning.
  if [ "$INODE_USED_PERCENT" -gt 90 ]; then
    INODE_COLOR="$RED"
    INODE_STATUS="🔴 严重"
  elif [ "$INODE_USED_PERCENT" -gt "$THRESHOLD" ]; then
    INODE_COLOR="$YELLOW"
    INODE_STATUS="🟡 警告"
  else
    INODE_COLOR="$GREEN"
    INODE_STATUS="🟢 正常"
  fi

  echo -e "使用率: ${INODE_COLOR}${INODE_USED_PERCENT}%${NC} $INODE_STATUS"
  echo ""
}

report_inode_usage
|
||||
|
||||
# Detailed information: filesystem type, mount point and write access.
printf '%s\n' \
  "==========================================" \
  "🔍 详细信息:" \
  "=========================================="

# Filesystem type: second column of the last line printed by `df -T`.
FS_TYPE=$(df -T "$DISK_PATH" | awk '{ kind = $2 } END { print kind }')
echo -e "文件系统类型: ${CYAN}$FS_TYPE${NC}"

# Mount point: final column of the last line printed by `df`.
MOUNT_POINT=$(df "$DISK_PATH" | awk '{ mount = $NF } END { print mount }')
echo -e "挂载点: ${BLUE}$MOUNT_POINT${NC}"

# Write access: start from "read-only" and upgrade when the current user
# may write to the path.
RW_STATUS="${RED}只读${NC}"
if [ -w "$DISK_PATH" ]; then
  RW_STATUS="${GREEN}可读写${NC}"
fi
echo -e "访问权限: $RW_STATUS"

echo ""
|
||||
|
||||
# Alert checks
#
# run_alert_checks
# Reads:  DISK_USAGE_PERCENT, INODE_USED_PERCENT, THRESHOLD, DISK_PATH.
# Writes: ALERT_COUNT - the number of critical findings (usage above 90%,
#         inode usage above 90%, path not writable).
# Threshold overruns are printed as warnings but not counted.
run_alert_checks() {
  echo "=========================================="
  echo "🚨 警报检查:"
  echo "=========================================="

  ALERT_COUNT=0

  # The counter is bumped with an assignment rather than ((ALERT_COUNT++)).
  # The post-increment evaluates to the old value 0, which gives exit
  # status 1 and aborted the whole script under `set -e` at the first alert.
  if [ "$DISK_USAGE_PERCENT" -gt 90 ]; then
    echo -e "⚠️ 磁盘使用率过高: ${RED}${DISK_USAGE_PERCENT}%${NC}"
    ALERT_COUNT=$((ALERT_COUNT + 1))
  fi

  if [ "$DISK_USAGE_PERCENT" -gt "$THRESHOLD" ]; then
    echo -e "⚠️ 磁盘使用率超过阈值: ${DISK_USAGE_PERCENT}% > ${THRESHOLD}%"
  fi

  if [ "$INODE_USED_PERCENT" -gt 90 ]; then
    echo -e "⚠️ Inode使用率过高: ${RED}${INODE_USED_PERCENT}%${NC}"
    ALERT_COUNT=$((ALERT_COUNT + 1))
  fi

  if [ "$INODE_USED_PERCENT" -gt "$THRESHOLD" ]; then
    echo -e "⚠️ Inode使用率超过阈值: ${INODE_USED_PERCENT}% > ${THRESHOLD}%"
  fi

  if [ ! -w "$DISK_PATH" ]; then
    echo -e "⚠️ ${YELLOW}磁盘为只读状态${NC}"
    ALERT_COUNT=$((ALERT_COUNT + 1))
  fi
}

run_alert_checks
|
||||
|
||||
# Performance indicators
#
# report_top_directories
# Reads DISK_PATH and the colour variables. Prints:
#   1. the five largest entries directly under DISK_PATH (plus its total);
#   2. up to ten directories, at most two levels deep, that directly hold
#      more than 100 regular files, ordered by file count, largest first.
report_top_directories() {
  local size path dir count

  echo ""
  echo "=========================================="
  echo "📈 性能指标:"
  echo "=========================================="

  # Is the disk filled by many small files or by a few large ones?
  echo "按大小排序的顶级目录:"
  # -d 1 keeps the listing to top-level directories, as the heading says.
  # Without it du printed every nested directory of the tree.
  du -h -d 1 "$DISK_PATH" 2>/dev/null | sort -hr | head -5 |
    while read -r size path; do
      printf ' %b%s%b\t%s\n' "$YELLOW" "$size" "$NC" "$path"
    done

  echo ""
  echo "按数量排序的顶级目录:"
  # Count first, then sort and truncate, so the list really is ordered by
  # file count instead of covering whichever ten directories find emits first.
  find "$DISK_PATH" -maxdepth 2 -type d 2>/dev/null |
    while IFS= read -r dir; do
      count=$(find "$dir" -maxdepth 1 -type f 2>/dev/null | wc -l)
      count=$((count)) # strips the padding some wc implementations add
      if [ "$count" -gt 100 ]; then
        printf '%s\t%s\n' "$count" "$dir"
      fi
    done |
    sort -rn |
    head -10 |
    while IFS=$'\t' read -r count dir; do
      printf ' %b%s%b个文件\t%s\n' "$YELLOW" "$count" "$NC" "$dir"
    done
}

report_top_directories
|
||||
|
||||
# Recommendations and closing banner.
banner_rule="=========================================="

printf '%s\n' "" "$banner_rule" "💡 建议:" "$banner_rule"

if [ "$ALERT_COUNT" -gt 0 ]; then
  # At least one critical finding: print advice for each condition that holds.
  printf '%s\n' "🚨 发现 $ALERT_COUNT 个问题需要处理:" ""

  if [ "$DISK_USAGE_PERCENT" -gt 90 ]; then
    printf '%s\n' \
      " 磁盘使用率超过90%:" \
      " - 建议立即清理日志文件(/var/log)" \
      " - 检查临时文件(/tmp)" \
      " - 考虑扩容或迁移数据" \
      ""
  fi

  if [ "$INODE_USED_PERCENT" -gt 90 ]; then
    printf '%s\n' \
      " Inode使用率高:" \
      " - 通常由大量小文件导致" \
      " - 检查并清理临时文件" \
      " - 查找并删除空文件" \
      " - 可能是邮件队列或缓存文件过多" \
      ""
  fi

  if [ ! -w "$DISK_PATH" ]; then
    printf '%s\n' \
      " 磁盘只读:" \
      " - 检查文件系统错误(fsck)" \
      " - 可能是磁盘故障或挂载问题" \
      " - 需要root权限检查和修复" \
      ""
  fi
else
  # No critical finding: confirm health and note comfortable headroom.
  printf '%s\n' "✅ 磁盘健康状况良好" ""

  if [ "$DISK_USAGE_PERCENT" -lt 70 ]; then
    printf '%s\n' " - 使用率充足,暂无扩容需求"
  fi

  if [ "$INODE_USED_PERCENT" -lt 70 ]; then
    printf '%s\n' " - Inode充足,无小文件问题"
  fi

  printf '%s\n' "" " 💡 建议定期运行此工具监控磁盘状态"
fi

printf '%s\n' "" "$banner_rule" "磁盘检查完成" "$banner_rule"

# The script ends with status 0 whatever the findings were.
exit 0
|
||||
80
skills/toolkit/bash/system/check-service.meta.yml
Normal file
80
skills/toolkit/bash/system/check-service.meta.yml
Normal file
@@ -0,0 +1,80 @@
|
||||
tool_id: SERVICE-CHECK-001
|
||||
tool_name: "服务健康检查器"
|
||||
|
||||
基本信息:
|
||||
语言: bash
|
||||
文件: check-service.sh
|
||||
复杂度: level-1
|
||||
创建日期: 2025-11-14
|
||||
作者: CodeConscious
|
||||
|
||||
用途分类:
|
||||
- MONITOR # 监控诊断
|
||||
|
||||
功能描述:
|
||||
简介: "检查服务健康状态(HTTP/数据库/Redis)"
|
||||
详细: |
|
||||
支持多种服务类型的健康检查:
|
||||
- HTTP服务:检查 /health 端点
|
||||
- 数据库:PostgreSQL连接检查
|
||||
- Redis:连接和ping测试
|
||||
|
||||
使用场景:
|
||||
- "部署后验证服务是否正常启动"
|
||||
- "诊断服务不可访问问题"
|
||||
- "CI/CD流程中的健康检查"
|
||||
- "定期监控服务状态"
|
||||
|
||||
使用方法:
|
||||
命令: "bash check-service.sh [服务名] [检查类型] [超时时间]"
|
||||
参数:
|
||||
服务名: "要检查的服务名称(用于显示)"
|
||||
检查类型:
|
||||
- http: "HTTP健康检查"
|
||||
- db/database: "数据库连接检查"
|
||||
- redis: "Redis连接检查"
|
||||
超时时间: "秒(默认:5秒)"
|
||||
示例:
|
||||
- "检查HTTP服务: bash check-service.sh auth-service http"
|
||||
- "检查数据库: bash check-service.sh db-service db"
|
||||
- "检查Redis: bash check-service.sh cache redis"
|
||||
|
||||
依赖要求:
|
||||
系统命令:
|
||||
- curl: "HTTP客户端"
|
||||
- jq: "JSON解析(可选)"
|
||||
- pg_isready: "PostgreSQL客户端"
|
||||
- redis-cli: "Redis客户端"
|
||||
环境变量:
|
||||
- DB_HOST: "数据库主机(默认:localhost)"
|
||||
- DB_PORT: "数据库端口(默认:5432)"
|
||||
- DB_NAME: "数据库名"
|
||||
- DB_USER: "数据库用户"
|
||||
- REDIS_HOST: "Redis主机(默认:localhost)"
|
||||
- REDIS_PORT: "Redis端口(默认:6379)"
|
||||
|
||||
输入输出:
|
||||
输入:
|
||||
- 无(从环境变量读取配置)
|
||||
输出:
|
||||
- stdout: "人类可读的检查结果"
|
||||
- 退出码:
|
||||
- 0: 所有检查通过
|
||||
- 1: 检查失败
|
||||
- 其他: 参数错误(注: 当前实现对未知的检查类型同样返回1)
|
||||
|
||||
上次使用:
|
||||
时间: 2025-11-14 16:45:00
|
||||
用途: "验证auth-service修复后状态"
|
||||
结果: "服务健康,响应时间45ms"
|
||||
满意度: 0.9
|
||||
|
||||
相关工具:
|
||||
- 前置工具: 无
|
||||
- 互补工具:
|
||||
- toolkit/python/analysis/analyze_logs.py(详细日志分析)
|
||||
- toolkit/bash/system/check-disk.sh(磁盘空间检查)
|
||||
|
||||
维护记录:
|
||||
2025-11-14:
|
||||
- 初始创建
|
||||
76
skills/toolkit/bash/system/check-service.sh
Normal file
76
skills/toolkit/bash/system/check-service.sh
Normal file
@@ -0,0 +1,76 @@
|
||||
#!/bin/bash
# Service Health Checker
# Checks the health of a service; supports HTTP, database and Redis checks.
#
# Positional arguments:
#   $1  service name, used for display (defaults to auth-service)
#   $2  check type (defaults to http)
#   $3  timeout in seconds (defaults to 5)

set -e

SERVICE_NAME=${1:-auth-service}
CHECK_TYPE=${2:-http}
TIMEOUT=${3:-5}

# Banner describing the check about to run.
printf '%s\n' \
  "🔍 检查服务健康状态" \
  "服务: $SERVICE_NAME" \
  "检查类型: $CHECK_TYPE" \
  "超时: ${TIMEOUT}s" \
  "=========================================="
|
||||
|
||||
# Run the requested check. Every failing branch exits with status 1.
case "$CHECK_TYPE" in
  http)
    # HTTP health check against the local /health endpoint.
    URL="http://localhost:3000/health"
    # curl's --write-out already prints "000" when no response arrives, so
    # only the exit status is ignored. Echoing a second "000" made the
    # reported status code read "000000".
    STATUS=$(curl -s -o /dev/null -w "%{http_code}" --max-time "$TIMEOUT" "$URL" 2>/dev/null) || true
    # Empty output (e.g. curl is not installed) is reported as "000".
    STATUS=${STATUS:-000}

    if [ "$STATUS" = "200" ]; then
      echo "✅ HTTP服务正常 (状态码: 200)"
      # Pretty-print the response body; falls back to a note when jq is
      # missing or the body is not valid JSON.
      curl -s --max-time 2 "$URL" | jq '.' 2>/dev/null || echo " (无法解析JSON响应)"
    else
      echo "❌ HTTP服务异常 (状态码: $STATUS)"
      exit 1
    fi
    ;;

  db | database)
    # Database connectivity check.
    # NOTE(review): sourcing .env executes it as shell code from the current
    # directory - only run this where that file is trusted.
    if [ -f ".env" ]; then
      # shellcheck source=/dev/null
      source .env
    fi

    # Connection settings, overridable through the environment or .env.
    DB_HOST="${DB_HOST:-localhost}"
    DB_PORT="${DB_PORT:-5432}"
    DB_NAME="${DB_NAME:-myapp}"
    DB_USER="${DB_USER:-postgres}"

    if pg_isready -h "$DB_HOST" -p "$DB_PORT" -U "$DB_USER" -d "$DB_NAME" -t "$TIMEOUT" >/dev/null 2>&1; then
      echo "✅ 数据库连接正常"
      echo " Host: $DB_HOST:$DB_PORT"
      echo " Database: $DB_NAME"
    else
      echo "❌ 数据库连接失败"
      exit 1
    fi
    ;;

  redis)
    # Redis connectivity check using PING.
    REDIS_HOST="${REDIS_HOST:-localhost}"
    REDIS_PORT="${REDIS_PORT:-6379}"

    if redis-cli -h "$REDIS_HOST" -p "$REDIS_PORT" ping >/dev/null 2>&1; then
      echo "✅ Redis连接正常"
      echo " Host: $REDIS_HOST:$REDIS_PORT"
    else
      echo "❌ Redis连接失败"
      exit 1
    fi
    ;;

  *)
    echo "❌ 未知的检查类型: $CHECK_TYPE"
    echo "支持的类型: http, db, redis"
    exit 1
    ;;
esac

# Reached only when the selected check succeeded.
echo "=========================================="
echo "✅ 所有检查通过"
exit 0
|
||||
Reference in New Issue
Block a user