On Feb 11, 12:41 am, sujith tv <sujith.swa
...@gmail.com> wrote:
> hi,
> I am getting this error in errpt,
> LABEL: SC_DISK_ERR7
> IDENTIFIER: DE3B8540
> Date/Time: Wed Feb 10 11:05:38 GMT 2010
> Sequence Number: 254129
> Machine Id: 00C8E80B4C00
> Node Id: seuedcsap404
> Class: H
> Type: PERM
> Resource Name: hdisk150
> Resource Class: disk
> Resource Type: NetAppMPIO
> Location: U5791.001.99B06GP-P1-C08-T1-W500A098188FCB408-L0
> VPD:
> Manufacturer................NETAPP
> Machine Type and Model......LUN
> ROS Level and ID............0.2
> Serial Number...............P3VpT4Q10KpX
> Device Specific.(Z0)........FAS2050
> Description
> PATH HAS FAILED
> Probable Causes
> ADAPTER HARDWARE OR CABLE
> DASD DEVICE
> Failure Causes
> UNDETERMINED
> Recommended Actions
> PERFORM PROBLEM DETERMINATION PROCEDURES
> CHECK PATH
> Detail Data
> PATH ID
> 3
> SENSE DATA
> 0600 1200 0000 4004 0000 0000 0000 0000 0000 0000 0000 0000 0200 0600
> 0000 0000
> 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000
> 0000 0000
> Can anyone suggest how to solve this issue ?
> Regards,
> Sujith
This will require a large amount of hacking, as it is set up to run via
a wrapper script and send the output to Nagios,
and the CRLFs at column 80 will make copy and paste awkward.
Basically, it checks VGs, stale PPs, paths, and other stuff I
can't remember because I wrote it ages ago :)
#!/bin/ksh
# $Id$
# check there are no stale PP's on all VG's
# do this by checking each VG for stale PP's
# also check for varyonvg processes running in the background
# send alerts if we get failures from varyonvg, or if there are still
# sync processes after our limit
# functions
function fail_vars {
    # Record a failure.
    #   Arguments: free text describing the failure
    #   Globals:   FAIL         - the text is appended, separated by a space
    #              RC           - severity of the failure being recorded (read)
    #              Alert_Status - raised to RC when RC is the larger value;
    #                             never lowered here
    FAIL="${FAIL} $*"
    if [[ ${Alert_Status} -lt ${RC} ]]; then
        Alert_Status=${RC}
    fi
}
function run {
    # Execute the arguments as a single command line (through eval) and
    # record the outcome.
    #   Output:  a "Running ..." banner, the command's stdout and stderr, and
    #            a failure marker all go to file descriptor 3, which the
    #            caller must have opened.
    #   Globals: RC           - exit status of the command
    #            OK           - command line appended on success
    #            run_commands - command line saved on failure and passed
    #                           on to fail_vars
    echo "Running $*" >&3
    eval "$*" 1>&3 2>&1
    RC=$?
    if [[ $RC = 0 ]]; then
        OK="$OK $*"
    else
        # keep a copy of the command line to hand to fail_vars
        run_commands="$*"
        echo "[**FAIL** RC=$RC]" >&3
        fail_vars "$run_commands"
    fi
}
function check_if_root {
    # Stop the script unless it is being run by root.
    # We only need root access if we found stale PPs (or a path that is not
    # Enabled), so root usage is only checked at that point, not at startup.
    #   Globals: Alert_Status, Alert_Message - set before the alert is sent
    #            alert_command               - command string that sends it
    if [[ $(whoami) != "root" ]]; then
        # output to STDOUT, hopefully cron job will be root user
        Alert_Status=1
        Alert_Message="need to be root user, exiting"
        # left unquoted on purpose: the command string is word-split and
        # re-joined with single spaces before eval parses it
        eval $alert_command
        exit
    fi
}
function check_vg_pps_stale {
    # Store the stale PP count of volume group $vg_name in vg_stale_pps.
    # The value is the sixth field of the `lsvg -L` output line that starts
    # with "STALE PVs:" (on AIX that line reads
    # "STALE PVs: <n> STALE PPs: <m>", so field 6 is the stale PP count).
    #   Globals: vg_name (read), vg_stale_pps (written; empty when lsvg
    #            prints no such line)
    vg_stale_pps=$( lsvg -L "$vg_name" | awk '/^STALE PVs:/ {print $6}' )
}
function check_parent_path_status {
    # Store the status of the path from $pv_name through $path_parent in
    # parent_path_status: the first field of `lspath -l <pv> -p <parent>`.
    #   Globals: pv_name, path_parent (read), parent_path_status (written)
    # NOTE(review): if lspath prints more than one line for this parent the
    # value holds one status per line - confirm whether that can happen.
    parent_path_status=$( lspath -l "$pv_name" -p "$path_parent" | awk '{print $1}' )
}
function check_all_vgs {
    # Check every volume group for stale physical partitions (PPs).
    #
    # - If a migratepv process is running, send an alert and exit the script.
    # - For each VG name printed by `lsvg -L`, read its stale PP count with
    #   check_vg_pps_stale.  A VG reporting "0" is counted as OK.
    # - For a VG with stale PPs: require root, stop if a process is working
    #   on that VG, otherwise wait (busy loop) until no lresynclv/syncvg
    #   process is visible or until stale_pp_loop_seconds_limit seconds have
    #   passed, then read the count again.
    # - Finally report the PPs that are still stale, or set OK.
    #
    #   Globals read:    alert_command, stale_pp_loop_seconds_limit
    #   Globals written: RC, Alert_Message, OK, vg_ok, num_vg_ok,
    #                    stale_vg_name, stale_pp_ctr, bg_process_loop and
    #                    the working variables assigned below
    #   Calls:           check_vg_pps_stale, check_if_root, fail_vars

    # a ksh script shows up in ps as "<pid> <shell> <script> ...", hence $3
    migratepv_running=$( ps -eo pid,args | awk '$3 == "/usr/sbin/migratepv"' )
    set -- $migratepv_running
    migrate_pv_pid=$1
    if [[ -n $migratepv_running ]]; then
        # NOTE(review): RC is set but Alert_Status is not raised here
        # (fail_vars is not called) - confirm that is intended.
        RC=1
        Alert_Message="migratepv PID $migrate_pv_pid is running - exiting"
        eval $alert_command
        exit
    fi

    # have a look at all VGs
    for vg_name in $( lsvg -L ); do
        check_vg_pps_stale
        # keep track of the total number of stale PPs and their VG names
        if [[ $vg_stale_pps != "0" ]]; then
            # this is the seconds since the EPOCH time we found stale PPs
            # in $vg_name
            time_found_stale_pps=$( perl -le 'print time' )
            check_if_root

            # test to see if VG is locked - last field of ps shows the VG
            vg_locked=$( ps -eo pid,args | awk -v vg="$vg_name" '$NF == vg {print $1}' )
            if [[ -n $vg_locked ]]; then
                RC=1
                fail_vars "$vg_stale_pps PPs in VG $vg_name - VG is locked by a sync process $vg_locked"
                # NOTE(review): this also skips every VG not yet examined.
                break
            fi

            # Re-arm the wait loop for this VG.  Without this, the "no" left
            # behind by an earlier VG would make the loop a no-op for every
            # later VG that has stale PPs.
            bg_process_loop=yes

            # loop around until we can't see the background process any more
            # this while loop will create a LOT of CPU activity, no sleep
            # for the wicked
            while [[ $bg_process_loop = "yes" ]]; do
                # the varyonvg runs in the background, lets see if we can
                # see the associated processes
                # they look like this
                #   lresynclv -l 00c4bdbe00004c000000010826ebaf85
                #   /bin/ksh syncvg -v rootvg
                check_sync_running_in_bg=$( ps -eo args | awk '$1 == "lresynclv" || $2 == "syncvg"' )

                # if there are no sync processes running, it could mean we
                # had a very quick resync or, more likely, the sync of the
                # stale PP's has just finished.
                if [[ -z $check_sync_running_in_bg ]]; then
                    bg_process_loop=no
                    # if this is to run via cron, it's worth an alert to
                    # show we found and sync'd stale PPs
                    RC=1
                    fail_vars "successfully synchronised $vg_stale_pps PPs in VG $vg_name login and check"
                fi

                # we're also going to see how long it takes to sync the
                # stale PPs, and if it takes longer than
                # $stale_pp_loop_seconds_limit seconds, we probably have
                # other/bigger problems
                time_now=$( perl -le 'print time' )
                time_waiting_for_sync=$(( time_now - time_found_stale_pps ))
                if (( time_waiting_for_sync > stale_pp_loop_seconds_limit )); then
                    RC=1
                    fail_vars "took longer than $stale_pp_loop_seconds_limit limit to sync PPs on $vg_name"
                    # drop out of the loop, we may have other VGs with
                    # stale PPs too
                    bg_process_loop=no
                fi
            done

            # read the count again now that the wait is over
            check_vg_pps_stale
            stale_vg_name="$stale_vg_name:$vg_name"
        else
            vg_ok="$vg_ok $vg_name"
            num_vg_ok=$(( num_vg_ok + 1 ))
        fi
        stale_pp_ctr=$(( stale_pp_ctr + vg_stale_pps ))
    done

    # if there are STILL stale PPs, send alert
    if [[ $stale_pp_ctr -ne "0" ]]; then
        RC=1
        fail_vars "there are $stale_pp_ctr stale PPs on VG $stale_vg_name"
    else
        # tell Nagios this function ran OK
        OK="VGs: $num_vg_ok"
    fi
}
function check_all_pvs {
    # Check the paths of every active MPIO disk and try to re-enable any
    # path that is not "Enabled".
    #
    # We need to check the PATHing of any MPIO disks, only check PVs that
    # are active (fourth column of lspv is "active").  A PV is treated as
    # MPIO when `lsattr -El <pv> -a PCM` succeeds.
    #
    #   Globals written: pv_ok, num_pv_ok, mpio_status_ok, RC, OK,
    #                    old_parent_path_status and the loop variables
    #                    pv_name / path_parent
    #   Calls:           check_parent_path_status, check_if_root, run,
    #                    fail_vars
    for pv_name in $( lspv | awk '$4 == "active" {print $1}' ); do
        # find out if MPIO
        if lsattr -El "$pv_name" -a PCM > /dev/null 2>&1; then
            # third column of lspath is the parent adapter of each path
            for path_parent in $( lspath -l "$pv_name" | awk '{print $3}' ); do
                check_parent_path_status
                if [[ $parent_path_status = "Enabled" ]]; then
                    pv_ok="$pv_ok $pv_name:$path_parent"
                    num_pv_ok=$(( num_pv_ok + 1 ))
                    continue
                fi
                # run a while loop because we want to check the status twice
                # (both branches below leave the status at "Enabled", so the
                # body executes once per bad path)
                while [[ $parent_path_status != "Enabled" ]]; do
                    mpio_status_ok=n
                    old_parent_path_status=$parent_path_status
                    check_if_root
                    # lets see if we can Enable the $pv_name
                    run chpath -l "$pv_name" -p "$path_parent" -s enable
                    check_parent_path_status
                    # this is our second check to see if the MPIO path is
                    # ok now
                    if [[ $parent_path_status != "Enabled" ]]; then
                        RC=2
                        fail_vars "PV $pv_name on path $path_parent has a status of $parent_path_status"
                        # this will exit the loop
                        parent_path_status=Enabled
                    else
                        RC=1
                        fail_vars "PV $pv_name on path $path_parent had a status of $parent_path_status was $old_parent_path_status"
                    fi
                done
            done
        fi
    done
    if [[ $mpio_status_ok = "y" ]]; then
        # update $OK string
        OK="$OK PVs: $num_pv_ok"
    fi
}
function use_sendmail_and_syslog {
    # Switch reporting to mail and syslog.
    # Both values are command strings that are expanded later, when they
    # are eval'd, so the single quotes are deliberate.
    #   alert_command - mails $Alert_Message to root with a subject naming
    #                   the script and the host
    #   ok_command    - writes $Alert_Message to syslog through logger
    alert_command='echo "$Alert_Message" | mailx -s "$base_name failed on $(hostname)" root 2>/dev/null'
    ok_command='logger $Alert_Message'
}
function check_send_nsca {
    # Decide how results will be reported.
    #
    # A dummy check is sent to $nsca_server to test whether the send_nsca
    # port is open.  If that fails, the NIM master named in /etc/niminfo
    # (Management LPAR/NIM master) replaces nsca_server and the probe is
    # repeated.  If that fails too, reporting falls back to
    # use_sendmail_and_syslog; otherwise OK results use the same command as
    # alerts.
    #
    #   Globals read:    base_name, nsca_cfg, alert_command
    #   Globals written: send_nsca_rc, nsca_server (only on fallback),
    #                    ok_command (directly or via use_sendmail_and_syslog)

    # test to see if send_nsca port is open
    echo "$nsca_server;dummy_check;0;$base_name - dummy test" |
        send_nsca -H "$nsca_server" -d ";" -c "$nsca_cfg" -to 2 >/dev/null
    send_nsca_rc=$?

    if [[ $send_nsca_rc != "0" ]]; then
        # see if we can connect with Management LPAR/NIM master:
        # take the text after "NIM_MASTER_HOSTNAME=" in the second field
        nsca_server=$( awk '{if (match($2,/NIM_MASTER_HOSTNAME=/)) {print substr($2,RLENGTH+1)}}' /etc/niminfo )
        echo "$nsca_server;dummy_check;0;$base_name - dummy test" |
            send_nsca -H "$nsca_server" -d ";" -c "$nsca_cfg" -to 2 >/dev/null
        send_nsca_rc=$?
    fi

    if [[ $send_nsca_rc != "0" ]]; then
        use_sendmail_and_syslog
    else
        # for send_nsca
        ok_command=$alert_command
    fi
}
# entry
# for debug mode
case "$1" in
-d)
    # debug mode: switch tracing on for every function currently defined
    # and for the main script
    typeset -ft $(typeset +f)
    set -x
    ;;
esac
# variables
PATH=/usr/bin:/etc:/usr/sbin:/usr/ucb:/sbin:/opt/local/bin

# counters and limits
vg_stale_pps=0
stale_pp_ctr=0
# seconds to wait for stale PPs to sync before giving up on a VG
stale_pp_loop_seconds_limit=600
bg_process_loop=yes
mpio_status_ok=y

# mail recipients
mail_to=root
MAILGRP=root

# script name and per-day-of-month log file
this_script=$( basename "$0" )
base_name=$this_script
log_file_name=${this_script%%.sh}_$(date +%d).log
log_file=/tmp/$log_file_name

# Nagios / NSCA reporting
nsca_server=wmlimon01
nagios_service=client_disk_sync_check
nsca_cfg=/opt/local/etc/nagios/send_nsca.cfg
# Kept in single quotes: this is a command string that is expanded when it
# is eval'd, not here.  It sends "host@service@status@script - message" to
# send_nsca using "@" as the field delimiter.
alert_command='echo $(hostname)\@$nagios_service\@$Alert_Status\@$(basename $0) - $Alert_Message | send_nsca -H $nsca_server -d @ -c $nsca_cfg >/dev/null'

# result accumulators
Alert_Status=0
OK=""
FAIL=""
pv_ok=""
...
read more »