#!/usr/bin/perl
#
# This script handles a problem in Southbury where, for some bizarre reason,
# java processes started to lose access to their own java classes.  This broke
# the java process(es) but they didn't die.  Instead, they kept running, broken.
# The first time we know this happened was 4/10/2001.
#
# The external symptom of this phenomenon was that users would get cryptic
# errors such as "Mall errors".  The internal symptom was error messages in
# the Net Commerce log files (e.g. /ips/nclogs/ncommerce20010410221515_29702.log)
# of the type
#        ERROR: IPN0001E cannot find Password class
#   or   ERROR: IPN0001E cannot find SurePayRequest class
#   or   ERROR: IPN0001E cannot find com/delphion/ipn/utility/Password class
#
# This script runs out of inst1's crontab and checks for the above messages in
# any of the active Net Commerce log files.  If it finds any, it checks that
# the process is still running (the PID is in the log file's name) and that it
# is still an inst1 process of /usr/lpp/NetCommerce3/bin/server.  If so, the
# process is killed.  We rely on the Net Commerce server to restart the killed
# process.
#
# We also log what we do and send e-mail to interested parties.

use strict;
use warnings;

# Who gets told when a process is killed (comma-separated, passed to mail).
# my $mail_recipients = "inst1";
# my $mail_recipients = "rick\@delphion.com";
my $mail_recipients = "rebecca\@delphion.com,jimmyers\@delphion.com";

# Text that marks a Net Commerce process as having lost its java classes.
my $class_error_marker = 'ERROR: IPN0001E cannot find';

# Appended to on every kill.  The path is relative, so the file lands in
# whatever the current directory is when the script runs.
my $kill_log_file = 'check_running_nc_processes.log';

# Run only when executed as a script, so the helpers below can be loaded
# (e.g. by a test) without touching any process.
main() unless caller;

# Entry point: scan the newest logs, kill broken servers, log and mail the result.
sub main {
    # Determine the latest log files.  There should be 6 of these Net Commerce
    # processes running (sometimes fewer), so we take the six most recently
    # modified log files.  If that guess is wrong, the per-PID checks below
    # keep us from acting on a stale log.
    my @log_files = `/usr/bin/ls -tr /ips/nclogs/ncommerce*.log | /usr/bin/tail -6`;
    chomp(@log_files);    # In list mode, chomp works on each element.

    # PIDs we actually killed, batched so only one notification is sent.
    my @killed_pids;

    for my $log_file (@log_files) {
        next unless log_has_class_error($log_file);

        # E.g. /ips/nclogs/ncommerce20010410201447_3596.log  -> 3596
        #  or  /ips/nclogs/ncommerce20010410201447_25744.log -> 25744
        my $pid = pid_from_log_name($log_file);
        if (!defined $pid) {
            warn "Cannot parse a PID out of log file name $log_file\n";
            next;
        }

        # Sanity checks, so we don't accidentally kill some other process.
        next unless is_nc_server_process($pid);

        # Only report PIDs that were really signalled.
        if (kill('KILL', $pid)) {
            push(@killed_pids, $pid);
        }
        else {
            warn "Could not kill PID $pid: $!\n";
        }
    }

    # If we did anything above, both log it and mail the interested parties.
    return unless @killed_pids;

    # A logging failure only warns: processes have already been killed, so the
    # notification mail must still go out.
    record_kills(@killed_pids);
    mail_kill_report(@killed_pids);
    return;
}

# Returns true if the file at $path contains at least one IPN0001E
# "cannot find ... class" line.  An unreadable file counts as "no match".
sub log_has_class_error {
    my ($path) = @_;

    my $log;
    if (!open($log, '<', $path)) {
        warn "Cannot read $path: $!\n";
        return 0;
    }

    my $found = 0;
    while (my $line = <$log>) {
        if (index($line, $class_error_marker) >= 0) {
            $found = 1;
            last;
        }
    }
    close($log);
    return $found;
}

# Extracts the PID embedded in a Net Commerce log file name
# (/ips/nclogs/ncommerce<timestamp>_<PID>.log).  Returns undef when the name
# does not have that shape or carries no digits where the PID should be.
sub pid_from_log_name {
    my ($path) = @_;

    if ($path =~ m{/ips/nclogs/ncommerce\d*_(\d+)\.log}) {
        return $1;
    }
    return;
}

# Returns true if $pid is currently running as an inst1 Net Commerce server
# for the patents instance.  The "ps -f" line is expected to look like
#   inst1 17664 26262 0 03:00:28 - 1:11 /usr/lpp/NetCommerce3/bin/server \
#     -i /usr/lpp/NetCommerce3/instance/patents/config//ncommerce.conf -s ncpatents_3
sub is_nc_server_process {
    my ($pid) = @_;

    # The PID is interpolated into a shell command; accept digits only.
    return 0 unless defined $pid && $pid =~ /\A\d+\z/;

    # NOTE(review): the '^ inst1 ' anchor depends on the exact column layout of
    # "ps -f" output on this host -- confirm the spacing against real output.
    my $command = join(' | ',
        "/usr/bin/ps -fp $pid",
        "/usr/bin/grep '^ inst1 '",
        "/usr/bin/grep -v grep",
        "/usr/bin/grep '/usr/lpp/NetCommerce3/bin/server -i /usr/lpp/NetCommerce3/instance/patents/config//ncommerce.conf -s ncpatents_'",
    );

    # Only the exit status matters: the final grep exits 0 when a line survived
    # every filter.
    my $ps_output = `$command`;
    return $? == 0 ? 1 : 0;
}

# Builds one log line, "<timestamp> I killed PID(s) <pid> <pid> ...\n".
# The PIDs are space separated so that several of them remain readable.
sub format_log_entry {
    my ($timestamp, @pids) = @_;

    return "$timestamp I killed PID(s) @pids\n";
}

# Appends an entry for the given PIDs to the kill log.  Returns true on
# success; on failure it warns and returns false instead of dying.
sub record_kills {
    my (@pids) = @_;

    my $log;
    if (!open($log, '>>', $kill_log_file)) {
        warn "Cannot open check_running_nc_processes log file: $!\n";
        return 0;
    }
    print {$log} format_log_entry(scalar(localtime), @pids);
    if (!close($log)) {
        warn "Cannot write check_running_nc_processes log file: $!\n";
        return 0;
    }
    return 1;
}

# Mails $mail_recipients a description of which PIDs were killed and why.
# Dies if the mail command cannot be started.
sub mail_kill_report {
    my (@pids) = @_;

    open(my $mail, '|-', "/usr/bin/mail -s'Automatic Net Commerce Process Execution' $mail_recipients")
        or die "Cannot send mail to $mail_recipients: $!";

    my $myhostname = `hostname -s`;    # Returns dephds057, for example.
    chomp($myhostname);

    # E.g. The /dfs/ipntools/nc/check_running_nc_processes.pl process running on dephds057
    #      has killed the following process ID, 12345, on Thu Apr 12 09:47:55 2001.
    #  or  has killed the following process IDs, 12345 67890 13579 24680, on Thu Apr 12 09:47:55 2001.
    print {$mail} "The $0 process running on $myhostname\nhas killed the following process ID";
    print {$mail} "s" if @pids > 1;
    print {$mail} ", @pids, on ", scalar(localtime), ".\n";
    print {$mail} "\n\n";
    print {$mail} "Background: This programs runs from inst1's crontab every 5 minutes,\n";
    print {$mail} "scanning through the most recent Net Commerce log files in the /ips/nclogs\n";
    print {$mail} "directory, looking for instances of these error messages;\n";
    print {$mail} " ERROR: IPN0001E cannot find Password class\n";
    print {$mail} " or ERROR: IPN0001E cannot find SurePayRequest class\n";
    print {$mail} " or ERROR: IPN0001E cannot find com/delphion/ipn/utility/Password class\n";
    print {$mail} "\n";
    print {$mail} "We found that this killed us on April 10, 2001, so we started looking for it to happen\n";
    print {$mail} "and taking action when it did (like just now).\n\n";
    print {$mail} "If you don't understand this e-mail, see Rebecca Hernandez at Delphion.\n";
    print {$mail} "Her e-mail address is rebecca\@delphion.com. Her phone number is (408) 284-8941.\n";

    # Closing a pipe waits for the command; a failure means mail reported an error.
    close($mail) or warn "Mail command to $mail_recipients reported a failure\n";
    return;
}

1;