一、环境概述
共有六台HP小型机,telnet IP分别为1.1.1.{1..6},MP卡地址分别为1.1.3.{1..6}。其中1号机使用非管理员用户登录,用户名/密码为user/passwd;2号机管理员(root)密码为123456;其余主机管理员(root)密码为password。脚本分为四部分:telnet到主机并执行巡检的telnet_host.exp;telnet到主机MP卡并查看日志的telnet_console.exp;管理多IP登录并处理上述两个脚本所生成日志的auto_check.sh;以及巡检中需要用到的命令组合(VMSTATE、RUNSTATE、LOGSTATE三个文件)。
二、脚本作用域
1、查看/var/adm/syslog/syslog.log /etc/rc.log /etc/shutdownlog,筛选其中错误信息;
2、筛选dmesg命令输出中的错误信息;
3、检查逻辑卷有无错误;
4、检查文件系统用量有无超过80%;
5、检查硬件有无错误;
6、检查物理内存、虚拟内存使用;
7、查看磁盘I/O情况;
8、查看CPU使用率;
三、脚本代码及解释
#! /bin/bash
# auto_check.sh
#
# Placeholder values only: the loops further down assign the real host
# address, user name and password before every login.
FLAG=flag       # name of the check currently being reported; read by the judge_* functions
PASSWD=passwd   # password used for the login
USER=user       # user name used for the login
HOST=host       # IP address of the machine being checked

# Start from a clean slate: drop the report and the scratch file left by a previous run.
if [ -f ./host.log ]; then
    rm -rf ./host.log
fi
if [ -f ./tmp.log ]; then
    rm -rf ./tmp.log
fi
# Report the outcome of the grep pipeline that ran immediately before this call.
# It must be the very next command after that pipeline because it reads "$?".
# Globals:
#   FLAG (read) - name of the check, used in the messages.
# Outputs:
#   previous status 1 (grep matched nothing): "all clear" line on stdout,
#     also appended to ./host.log
#   previous status 0 (grep matched, i.e. error keywords were found):
#     error notice on stdout only
#   any other status (grep itself failed): failure notice on stdout, also
#     appended to ./host.log, so a broken check is not reported as clean
function judge_grep() {
    # Capture the status first; any other command would overwrite it.
    local rc=$?
    case $rc in
        0)
            echo " $FLAG error! The error log has written to file './host.log', please check."
            ;;
        1)
            echo " All $FLAG take right place." | tee -a ./host.log
            ;;
        *)
            echo " $FLAG check failed (exit status $rc)! Please check." | tee -a ./host.log
            ;;
    esac
}
# Tell the operator whether the command that ran immediately before this call
# succeeded. Reads "$?" on entry, so nothing may run between that command and
# this call.
# Globals:
#   FLAG (read) - name of the check, used in the message.
# Outputs:
#   one status line on stdout.
function judge_state {
    local prev_status=$?
    if [ "$prev_status" -ne 0 ]; then
        echo " $FLAG error! Please check!"
        return
    fi
    echo " $FLAG OK!"
}
# Host health check: log in to each of the six hosts over telnet, run the
# check commands, then sift the captured output into ./host.log one section
# at a time. Every section sets FLAG and then calls a judge_* function
# directly after its pipeline, because those functions read "$?".
# NOTE(review): credentials are hard-coded here and handed to the expect
# script as command-line arguments.
for I in {1..6}
do
    HOST="1.1.1.$I"
    # Defaults; hosts 1 and 2 override them below.
    USER="root"
    PASSWD="password"
    case $I in
        1)
            USER="user"
            PASSWD="passwd"
            ;;
        2)
            PASSWD="123456"
            ;;
    esac
    echo "Auto Checking $HOST@$USER:"
    echo " Checking, please waiting..."
    # telnet_host.exp logs in and runs every argument after the third as a
    # command; the whole session is captured in ./tmp.log for the analysis
    # below. The superuser banner is removed so that it does not show up as
    # a "warning" hit in the log section.
    ./telnet_host.exp "$HOST" "$USER" "$PASSWD" "$(cat ./VMSTATE)" \
        "$(cat ./RUNSTATE)" "bdf" "ioscan -fn" \
        "$(cat ./LOGSTATE)" "vgdisplay -v" \
        | grep -v -i "WARNING: YOU ARE SUPERUSER !!" > ./tmp.log
    echo " Check finished. Starting analyse data..."
    echo "$HOST@$USER:" >> ./host.log
    echo >> ./host.log

    FLAG=sar
    # sar -d / -u / -w: keep each Average line plus the two lines before it.
    grep -B 2 Average ./tmp.log >> ./host.log
    judge_state
    echo >> ./host.log

    FLAG=vmstat
    # vmstat: keep the "procs" header line and the two lines after it.
    grep -A 2 procs ./tmp.log >> ./host.log
    judge_state
    echo >> ./host.log

    FLAG=fs
    # File systems at 80% usage or more: drop every line that shows a one-
    # or two-digit percentage below 80, keep what still contains a "%".
    # The two-digit alternative must not start in the middle of a longer
    # number, otherwise the "00% " inside "100% " would hide full file
    # systems. A hit means a file system is over the threshold, so this
    # section is judged like the other "found something bad" sections.
    grep -C 1 dev ./tmp.log \
        | grep -v -E '(^|[^0-9])[0-7][0-9]% | [0-9]%' \
        | grep % >> ./host.log
    judge_grep
    echo >> ./host.log

    FLAG=log
    # Lines of the captured output that contain an error keyword.
    grep -i -e error -e warning -e mistake -e notice \
        -e "time out" -e "timed out" ./tmp.log >> ./host.log
    judge_grep
    echo >> ./host.log

    FLAG=hardware
    # Hardware list: lines naming a device class that also carry an error keyword.
    grep -i -e processor -e disk -e tape -e memory -e fc -e lan ./tmp.log \
        | grep -i -e no_hw -e error -e unknown >> ./host.log
    judge_grep
    echo >> ./host.log

    FLAG=VGs
    # Volume groups: collect the Status lines around VG / LV / PV entries and
    # keep those that are not plainly "available" (or "available/syncd").
    # The word is matched on its own so that a status which merely contains
    # it (for example "unavailable" or "available/<something else>") is
    # still reported. The result goes to ./host.log like every other
    # section, which is also what the judge_grep message tells the operator.
    grep -i -e "Volume groups" -A 20 -e "LV Name" -C 5 -e "PV Name" -C 5 ./tmp.log \
        | grep -i Status \
        | grep -i -v -E '(^|[^[:alpha:]])available([^[:alpha:]/]|/syncd|$)' >> ./host.log
    judge_grep
    echo >> ./host.log

    FLAG=dmesg
    # dmesg is fetched in a session of its own so that its hits stay
    # separate from the log-file hits above.
    ./telnet_host.exp "$HOST" "$USER" "$PASSWD" "dmesg" \
        | grep -v -i "WARNING: YOU ARE SUPERUSER !!" \
        | grep -i -e error -e warning -e mistake \
            -e notice -e "time out" -e "timed out" >> ./host.log
    judge_grep
    echo >> ./host.log

    # Filtering of top output is still being tested: normally its output
    # overwrites the other results, so it stays disabled until another way
    # is found.
    # ./telnet_host.exp $HOST $USER $PASSWD "top -d 10" > ./tmp.log
    # cat ./tmp.log | grep -i "Cpu states" -A 20 >> ./host.log
    # FLAG=top; judge_state
    # echo >> ./host.log
done
# Event-log check of the MP card of every host, followed by final cleanup.
for I in {1..6}
do
    HOST="1.1.3.$I"
    USER="Admin"
    PASSWD="Admin"
    echo
    # telnet_console.exp takes three arguments (address, user, password);
    # its session output is captured in ./tmp.log for filtering.
    ./telnet_console.exp "$HOST" "$USER" "$PASSWD" > ./tmp.log
    # Label the section so that event-log hits can be attributed to a card,
    # the same way the host sections are labelled.
    echo "$HOST@$USER:" >> ./host.log
    FLAG="Event log"
    # Keep only the lines stamped with today's date, in either of the two
    # formats "DD Mon YYYY" and "MM/DD/YYYY".
    grep -E -e "$(date +"%d %b %Y")" -e "$(date +%m/%d/%Y)" ./tmp.log >> ./host.log
    judge_grep
    # The separator belongs in the same report file as everything else.
    echo >> ./host.log
done
unset HOST USER PASSWD FLAG
rm -f ./tmp.log
exit 0
#! /usr/bin/expect
# telnet_host.exp
#
# This script is interpreted by expect, hence the interpreter line above.
# For background on expect see http://zh.wikipedia.org/wiki/Expect
#
# Usage: telnet_host.exp HOST USER PASSWD COMMAND ?COMMAND ...?
#   HOST    address to telnet to
#   USER    login name
#   PASSWD  login password
#   COMMAND command, or command combination, to run once logged in
# The whole session is written to stdout for the caller to filter.
#
# No timeout: every expect below waits until its pattern shows up.
set timeout -1
set HOST [lindex $argv 0]
set USER [lindex $argv 1]
set PASSWD [lindex $argv 2]
spawn telnet $HOST
expect login:
send "$USER\r"
expect Password:
send "$PASSWD\r"
# Host 1 is entered as a non-administrator whose .profile runs its own
# script, so the last line printed after a successful login is used as the
# marker there; the other hosts show a prompt ending in "]#". In both cases
# PS1 is set to "#" so that the rest of the script can wait on one prompt.
expect {
    " QUIT...........0" {send "\nPS1=#\n"}
    "]#" {send "\nPS1=#\n"}
}
# Arguments 0 to 2 are host, user and password; from the fourth argument on,
# each one is a command or command combination to execute.
# For an unknown reason later commands sometimes show up too early; sending
# several extra newlines works around part of that.
# NOTE(review): presumably the bare "#" pattern also matches a "#" inside
# echoed or printed text, which would let the loop run ahead - confirm.
for {set I 3} {$I<$argc} {incr I} {
    expect "#"
    send "[lindex $argv $I]\n"
    expect "#"
    send "\n"
    expect "#"
    send "\n"
    expect "#"
    send "\n"
}
# Wait for the prompt once more and log out.
expect "#"
send "exit\n"
# Let telnet finish and drain the remaining output before leaving. The wait
# is bounded so that a session which never closes cannot hang the caller.
set timeout 10
expect eof
exit 0
#! /usr/bin/expect
# telnet_console.exp
#
# Usage: telnet_console.exp HOST USER PASSWD
# Logs in to the MP card at HOST, walks through its log viewer and writes
# the whole session to stdout for the caller to filter.
#
# No timeout: every expect below waits until its pattern shows up.
set timeout -1
set HOST [lindex $argv 0]
set USER [lindex $argv 1]
set PASSWD [lindex $argv 2]
spawn telnet $HOST
expect "MP login:"
# Log in with the credentials given on the command line.
send "$USER\n"
expect "MP password"
send "$PASSWD\n"
# The MP cards in this environment run two different command versions, so
# two different markers are handled after the "sl" command.
expect "MP>"
send "sl\n"
expect {
    "MP:VW>" { send "sel\n" }
    "*Quit:" { send "e" }
}
# Keystrokes are spaced out with sleep; sending them too quickly garbles the
# display.
expect "*>"
send "t"
sleep .5
send "a"
sleep .5
send "3"
sleep .5
send "\n"
sleep .5
send "\n"
sleep .5
send "\n"
sleep .5
send "\n"
sleep .5
send "\n"
expect "*>"
# NOTE(review): this sends an empty string; possibly a control character was
# lost when the script was copied into this document - confirm against the
# working script.
send ""
expect "MP>"
send "x\n"
expect eof
exit 0
# Contents of ./VMSTATE (auto_check.sh reads this file and hands it to
# telnet_host.exp as one command to run on the remote host).
# Prints the first two lines of vmstat output, then only the last line of a
# 20-sample run taken at 2-second intervals.
# NOTE(review): these "#" lines label the snippet in this write-up;
# presumably they are not part of the real file, since telnet_host.exp waits
# on "#" as the prompt - confirm.
vmstat | head -2; vmstat 2 20 | tail -1
# Contents of ./LOGSTATE (auto_check.sh reads this file and hands it to
# telnet_host.exp as one command to run on the remote host).
# For each of the three log files: print the file name, then only the lines
# that contain today's "<abbreviated month> <two-digit day>" followed,
# later on the same line, by the current year.
# NOTE(review): %d zero-pads the day; if a log writes single-digit days
# padded with a space, or omits the year, its lines will not match -
# confirm against the real log formats.
# NOTE(review): these "#" lines label the snippet in this write-up;
# presumably they are not part of the real file, since telnet_host.exp waits
# on "#" as the prompt - confirm.
for I in /var/adm/syslog/syslog.log /etc/rc.log /etc/shutdownlog
do
echo "$I:"
cat $I | grep -E "`date +'%b %d'`.*`date +%Y`"
done
# Contents of ./RUNSTATE (auto_check.sh reads this file and hands it to
# telnet_host.exp as one command to run on the remote host).
# For each sar report option (-u, -d, -w):
#   1. print a label naming the report;
#   2. run sar once with the single argument 1 and keep only the lines that
#      contain usr, busy or swpin (presumably the column-header line of the
#      respective report - confirm);
#   3. run sar with interval 2 and count 20, keep the "average" lines, sort
#      them in reverse order starting at field 3 (text order, not numeric)
#      and print the first one.
# NOTE(review): these "#" lines label the snippet in this write-up;
# presumably they are not part of the real file, since telnet_host.exp waits
# on "#" as the prompt - confirm.
for I in u d w
do
echo "sar -$I 2 20:"
sar -$I 1 | grep -e usr -e busy -e swpin
sar -$I 2 20 | grep -i average | sort -rk 3 | head -1
done