#!/dfs/prod/ipn/bin/perl
# We need this version of Perl, not AIX's ancient /usr/bin/perl, because I
# use exists on an array (ie, if (exists $CoC_Start[0]) { ...)
# and want to avoid getting this error msg,
#    exists operator argument is not a HASH element at scp.old.US line ...
$debug=1;    # If debug, will be a lot more verbose.
$debug=0;    # NOTE(review): the original comment also said debug "will not scp",
             # but nothing in this file skips the scp when $debug is set.
#
# Input is a DVD name of one of these DVDs, e.g. usp001, and optionally, the
# number of children processes to spawn and/or whether or not to force the
# scp to Southbury should an image NOT be in the Cumulative Index file.
#
# If the DVD name is one of the ones Danny had already partially processed,
# then we look for a "ToDo" file (eg, ~/ToDo/usp001) that should contain the
# output of my ~rickjas/old_US_Images/check.usp.pl program in Southbury.
#
# Unlike previous image-handling Perl programs, we will actually scp the file
# directly to Southbury's /dfs/images/US tree, writing the file with its CD label,
# e.g. /dfs/images/US/30/05/US00000530.usp001.tif.  We even go directly to the DFS
# server that is actually hosting the destination DFS fileset.  We are able to
# do this without passwords due to
#  A) I've set up my rickjas userid on all 8 DFS servers (dephds005-012) to be
#     able to ssh/scp without passwords.  See my ~jasper/aixnotes/ssh.install.notes
#     for details on how this was done.
#  B) I will set up this userid (eg, jasper's AFS id in San Jose) so that I can
#     scp to Southbury.  This entails setting 3 files in my/your ~/.ssh2 directory.
#  C) All the /dfs/images/US/nn/nn directories have the proper DFS ACLs,
#     most importantly, {any_other rwx-i-}.  Note in particular that the delete
#     permission is missing.  This is intentional and prevents an unauthenticated
#     web server from erasing image files via some cgi back door.  We can create
#     files, but not erase or modify them.
#  D) I hardcode in this script, the DFS images.US.nn fileset to DFS server mapping.
#
# Another improvement is the spawning of umpteen children to do all the scp-ing.
# Once we determine the number of images we need to copy, we spawn child
# processes (5 by default) and do that many scp's at a time, splitting the
# work among the children.
#
use Errno qw(EAGAIN);           # Gets you the EAGAIN symbol, used in fork loop.
use POSIX ":sys_wait_h";        # Gets you the WNOHANG symbol, used in waitpid call.
use Fcntl qw(:DEFAULT :flock);  # Gets you the LOCK_* constants used in flock call

if ($ARGV[0]) {    # Was a parameter given to us?
    $cdlabel = lc $ARGV[0];
    if ($cdlabel =~ /^\d+$/) {    # Accept just the numeric portion, eg 1
        $cdlabel = sprintf("usp%03d", $cdlabel);
    }
}
else {
    die "$0 needs the label name of an back-file US DVD, e.g. usp001.\n";
}

$HOME               = $ENV{"HOME"};
$number_of_children = 5;
$heartbeat          = 50;    # Have each child give a heartbeat every 50 image files.
$Force_scp          = 0;

shift @ARGV;
while (@ARGV) {
    if ($ARGV[0] =~ /^\d+$/) {    # If numeric, must be number of children to spawn.
        # An empty or zero count used to be accepted, which spawned no
        # children at all and made the run a silent no-op.
        die "Number of children must be at least 1.\n" if ($ARGV[0] < 1);
        $number_of_children = $ARGV[0];
    }
    elsif ($ARGV[0] =~ /^force$/i) {
        $Force_scp = 1;    # Force-copy even if missing from Cumulative Index file.
    }
    else {
        die "Invalid argument given.\n";
    }
    shift @ARGV;
}

# Determine where our input directory is.  Use in order, the $CDROMROOT
# environment variable, our current directory, or /cdrom (the normal case).
# If none of those hold the DVD we fall back to the current directory, and
# the -d check just below reports the problem.
if    ($ENV{"CDROMROOT"})           { $CDROMROOT = $ENV{"CDROMROOT"} }
elsif (-d "$ENV{'PWD'}/$cdlabel")   { $CDROMROOT = $ENV{'PWD'} }
elsif (-d "/cdrom/$cdlabel")        { $CDROMROOT = "/cdrom" }          # Normal case.
else                                { $CDROMROOT = $ENV{'PWD'} }

if (! -d "$CDROMROOT/$cdlabel") {
    die "It looks like you don't have the $cdlabel CD loaded on this machine.\nPerhaps you want to be logged onto patimg1?";
}

$outdir = "/dfs/cdrom/$cdlabel";    # Establish our output directory.
if (! -d "$outdir") {
    print STDOUT "Creating output directory at $outdir ...\n";
    mkdir("$outdir", 0777) or die "Could not create $outdir: $!\n";
}

# Programs.  Each check tests the program it names; the tiffcp and tiffinfo
# checks previously re-tested $anyinfo by mistake.
$any2any = "/dfs/prod/ipn/bin/any2any";
if (! -r "$any2any")  { die "Could not find or execute $any2any program." }
$anyinfo = "/dfs/prod/ipn/bin/anyinfo";
if (! -r "$anyinfo")  { die "Could not find or execute $anyinfo program." }
$tiffcp = "/local/bin/tiffcp";
if (! -r "$tiffcp")   { die "Could not find or execute $tiffcp program." }
$tiffinfo = "/local/bin/tiffinfo";
if (! -r "$tiffinfo") { die "Could not find or execute $tiffinfo program." }
$SBY_userid = "rickjas";
$scp = "/local/bin/scp";
if (! -r "$scp")      { die "Could not find or execute $scp program." }

# Since I get fancy in this program and scp directly to Southbury, insure this userid's
# ~/.ssh2 directory is set up with the proper keys.  We need 3 things done,
#   1) The string 'IdKey id_dsa_1024_a.009' in the ~/.ssh2/identification file.
#   2) cp -p /afs/d/u/jasper/.ssh2/id_dsa_1024_a.009 ~/.ssh2
#   3) cp -p /afs/d/u/jasper/.ssh2/id_dsa_1024_a.009.pub ~/.ssh2
if (! `/usr/bin/grep id_dsa_1024_a.009 $HOME/.ssh2/identification 2>/dev/null`) {
    `echo 'IdKey id_dsa_1024_a.009' >> $HOME/.ssh2/identification`;
}
if (! -r "$HOME/.ssh2/id_dsa_1024_a.009") {
    `/usr/bin/cp -p /afs/d/u/jasper/.ssh2/id_dsa_1024_a.009 $HOME/.ssh2`;
}
if (! -r "$HOME/.ssh2/id_dsa_1024_a.009.pub") {
    `/usr/bin/cp -p /afs/d/u/jasper/.ssh2/id_dsa_1024_a.009.pub $HOME/.ssh2`;
}

# If this is a DVD that Danny partially copied already, then work off of the
# /dfs/cdrom/$cdlabel/fix.split.ToDo file.
# NOTE(review): the original comment promised "else we'll scan the whole
# bloody DVD and build up our own list", but there is no such branch in this
# file; for any other DVD @file_list stays empty and we die just below with
# "No image files found".
if ($cdlabel le "usp249" &&
    ! grep {m/$cdlabel/} qw(usp212 usp215 usp222 usp223 usp224 usp229 usp230 usp235 usp242 usp243 usp244 usp245 usp248) ) {
    if (! -r "$outdir/fix.split.ToDo") {
        print "scp-ing $cdlabel fix.split.ToDo file from Southbury ...\n";
        `$scp $SBY_userid\@dephds008:$outdir/fix.split.ToDo $outdir/fix.split.ToDo`;
        if (! -r "$outdir/fix.split.ToDo") {
            die "Could not find $outdir/fix.split.ToDo file. Maybe you need to scp from Southbury, e.g.\n scp -p $SBY_userid\@dephds008:$outdir/fix.split.ToDo $outdir\n";
        }
    }
    print STDOUT "Will scp only those files in our fix.split.ToDo list ...\n" if ($debug);
    @file_list = `/usr/bin/cat $outdir/fix.split.ToDo`;
}

# At this point, the file_list array holds all the image files we need to do.
chomp(@file_list);    # In list mode, chomp works on each element.
$total_number_TODO = scalar(@file_list);
if ($total_number_TODO == 0) { die "No image files found at $CDROMROOT/$cdlabel\n" }
if ($total_number_TODO < $number_of_children) { $number_of_children = $total_number_TODO }

if (-s "$outdir/fix.Done") {
    # wc prints "<count> <filename>"; the numeric comparison below uses only
    # the leading count.
    $Done_Count = `/usr/bin/wc -l $outdir/fix.Done 2>/dev/null`;
    chomp $Done_Count;
    if ($Done_Count == $total_number_TODO) {
        print "All $total_number_TODO images are already done.\n";
        exit;
    }

    # Remember every patn already done so the children can skip it.
    print STDOUT "Reading the Done file ($outdir/fix.Done) ...\n";
    @lines = `/usr/bin/awk '{print \$1}' $outdir/fix.Done`;
    chomp(@lines);    # In list mode, chomp works on each element.
    foreach $patn (@lines) {
        $Done{$patn} = 1;    # Lines contain just the patn
    }
}
else {
    $Done_Count = 0;
}

# Before spawning all our children, open up our output files so that we
# all can share the same file descriptors.  Every handle is unbuffered.
open(DONE, '>>', "$outdir/fix.Done") || die "Could not open $outdir/fix.Done: $!";
select(DONE); $| = 1;
open(ERRORS, '>>', "$outdir/fix.Errors") || die "Could not open $outdir/fix.Errors: $!";
select(ERRORS); $| = 1;
open(NONFATAL, '>>', "$outdir/fix.Non-Fatal_Errors") || die "Could not open $outdir/fix.Non-Fatal_Errors: $!";
select(NONFATAL); $| = 1;
select(STDOUT); $| = 1;

print STDOUT "Starting to process $total_number_TODO image files among $number_of_children processes for $cdlabel ...\n";

for my $child (1 .. $number_of_children) {
    # print STDOUT "Spawning child #$child ...\n";
    FORK: {
        if ($pid = fork) {
            # If the fork was successful, it returns to the parent, a non-zero
            # number (true), which is the child's PID.  Save it.
            # print STDOUT "Parent spawned child #$child as PID $pid ...\n";
            $PIDs{$pid} = $child;    # The PIDs hash has key=child's PID & value=child's number (1-n)
        }
        else {    # Else we're in the child process (most likely) or the fork failed.
            if (! defined $pid) {    # If $pid is not defined, then the fork failed.
                print STDERR "fork error spawning Child #$child. Will retry in 2 seconds ...\n";
                sleep 2;
                redo FORK;
            }

            ######################################################
            #                                                    #
            #    CC  H  H  III  L     DDD   RRR   EEEE  N   N    #
            #   C  C H  H   I   L     D  D  R  R  E     NN  N    #
            #   C    HHHH   I   L     D  D  RRR   EEE   N N N    #
            #   C  C H  H   I   L     D  D  R  R  E     N  NN    #
            #    CC  H  H  III  LLLL  DDD   R  R  EEEE  N   N    #
            #                                                    #
            ######################################################
            # If we get here, then we're inside the just-spawned child process,
            # who share the already-opened file descriptors for our output
            # files, and have their own copy of all other variables.
            $child_number = $child;    # Ranges from 1 to $number_of_children

            # Determine the beginning and ending line number for the images this
            # child process should handle.  We carve the list of files to do,
            # into $number_of_children blocks, and each child does their block.
            # If not evenly divisible (the normal case), then all the earlier
            # children do one more image than all the later children.
            #
            # For example, if 10 images to do among 4 children,
            # $number_todo_for_each_child will be 2, but
            #     child 1 will do 3, beginning will be 0 & end will be 2.
            #     child 2 will do 3, beginning will be 3 & end will be 5.
            #     child 3 will do 2, beginning will be 6 & end will be 7.
            # and child 4 will do 2, beginning will be 8 & end will be 9.

            # This is the minimum number each child will do and might be zero.
            # Some number of the first children will do one more each.
            $number_todo_for_each_child = int($total_number_TODO / $number_of_children);

            # This will get adjusted later.
            $beginning = $number_todo_for_each_child * ($child_number - 1);

            # This is the number of first children that will do one more.
            # This might be zero if it's evenly divisible, but that's ok.
            $number_of_first_children = $total_number_TODO % $number_of_children;

            if ($child_number <= $number_of_first_children) {    # One of the first children?
                $beginning += $child_number - 1;
                $end = $beginning + $number_todo_for_each_child;
            }
            else {    # One of the later children.
                $beginning += $number_of_first_children;
                $end = $beginning + $number_todo_for_each_child - 1;
            }
            $number_for_me_TODO = $end - $beginning + 1;
            # print STDOUT "Child #$child_number will handle $beginning to $end inclusive ($number_for_me_TODO total).\n" if ($debug);

            $file_num = 0;    # This child's file counter
            for ($index = $beginning; $index <= $end; $index++) {
                $this_file = $file_list[$index];
                $file_num++;
                # print STDOUT "Child #$child_number file #$file_num out of $number_for_me_TODO is $this_file\n" if ($debug);
                Handle_Image_File();    # Handle this one image file.
                if ($file_num % $heartbeat == 0) {
                    print STDOUT "Child #$child_number working on file #$file_num out of $number_for_me_TODO = $this_file at " . timestamp() . " ...\n";
                }
            }
            # print STDOUT "After $file_num files, child #$child_number (PID $$) is ending at " . timestamp() . ".\n" if ($debug);
            exit;    # That is, this one child is exiting.
        }
    }
}

###########################################
#                                         #
#   PPP    A    RRR   EEEE  N   N  TTTTT  #
#   P  P  A A   R  R  E     NN  N    T    #
#   PPP  AAAAA  RRR   EEE   N N N    T    #
#   P    A   A  R  R  E     N  NN    T    #
#   P    A   A  R  R  EEEE  N   N    T    #
#                                         #
###########################################
# At this point, we're back in the parent after spawning all our
# children, who are likely running at this point.  All that's left
# for the parent to do, is reap the kiddies.
# print STDOUT "Rick, the Parent's PID list is\n";
# foreach $PID (keys(%PIDs)) { print STDOUT "\$PIDs\{$PID} is Child # $PIDs{$PID}\n" }
do {
    $kidpid = waitpid(-1, WNOHANG);
    # waitpid returns  0 if the child is still running
    #                 -1 if the child is no longer around (i.e. it's been reaped)
    #                 the $pid the first time we waitpid after the child dies.
    if ($kidpid == 0) {    # There's at least one child still running.
        sleep 5;           # Wait a bit before checking on kids again.
        # print STDOUT "$$: At ", timestamp(), ", we're waiting on \$PIDs\{$PID}=>";
        # foreach $PID (keys(%PIDs)) {
        #     print STDOUT "$PID=$PIDs{$PID} ";
        # }
        # print STDOUT "<=\n";
    }
    else {
        # Hey, status has changed for some child!  Either one quit/ended,
        # or there are no more children.
        # print STDOUT "In reap loop at ", timestamp(), ", child #$PIDs{$kidpid} (PID=$kidpid) exited with status= $?.\n";
        delete $PIDs{$kidpid} unless ($kidpid == -1);
    }
} until $kidpid == -1;

# At this point, all the children have ended.
close DONE;
close ERRORS;
close NONFATAL;

$Done_Count = `/usr/bin/wc -l $outdir/fix.Done`;
chomp $Done_Count;
if ($Done_Count == $total_number_TODO) {
    print "We've finished with all $total_number_TODO images from $cdlabel.\n\n";
}
exit;    # This is the parent ending.

#####################################################################################
#                                                                                   #
#   Return the current timestamp in yyyy/mm/dd hh:mm:ss format.                     #
#                                                                                   #
#####################################################################################
sub timestamp() {
    # localtime() returns sec, min, hour, mday as you'd expect, but
    #   mon  = 0-11, where January = 0 and December = 11.
    #   year = Number of years since 1900
    my ($sec, $min, $hour, $mday, $mon, $year) = localtime();
    return sprintf("%d/%02d/%02d %02d:%02d:%02d",
                   $year + 1900, $mon + 1, $mday, $hour, $min, $sec);
}    # End of timestamp subroutine.

######################################################
#                                                    #
#    CC  H  H  III  L     DDD   RRR   EEEE  N   N    #
#   C  C H  H   I   L     D  D  R  R  E     NN  N    #
#   C    HHHH   I   L     D  D  RRR   EEE   N N N    #
#   C  C H  H   I   L     D  D  R  R  E     N  NN    #
#    CC  H  H  III  LLLL  DDD   R  R  EEEE  N   N    #
#                                                    #
######################################################
# Do what we need to do for this one image file, $this_file.
# $this_file is for example, /cdrom/usp001/usp001/00/000/00000001.tif
#                        or  /cdrom/usp001/usp001/rx/000/rx000105.tif
#
# Here we process that image with the following 3 steps;
#   1) Check for tif errors, rewriting the image with tiffcp if need be.
#   2) scp this image to Southbury.
#   3) If all's well, then write a line into our DONE file.
#
# Takes no arguments; everything is passed through package globals.
#   Reads:  $this_file, $child_number, $file_num, %Done, $outdir, $tiffcp,
#           $scp, $SBY_userid, $debug, $destination, and the already-open
#           DONE / ERRORS / NONFATAL file handles.
#   Writes: $patn, $size, $Image_fn, $dir1, $dir2, $bad_file, $target, $status,
#           and $this_file itself when the image had to be rewritten (the
#           caller's heartbeat message will then show the rewritten path).
# Returns nothing.  Dies on an unparseable patent file name, a failed tiffcp
# rewrite, or a failed flock; an scp failure is logged and we just return.
sub Handle_Image_File() {

    # Parse out the patent number, in our examples, 00000001 or RX000105.
    if ($this_file !~ /\/(....(..)(..))\.tif$/) {
        print STDERR "Unknown filename ($this_file) ignored by child #$child_number after $file_num files.\n";
        _append_locked(\*ERRORS,
            "Unknown filename ($this_file) ignored by child #$child_number after $file_num files.\n",
            "Can't lock Errors file at 1");
        return;
    }
    $Image_fn = $1;    # Just the file name without the trailing .tif (eg 00000001)
    $dir1     = $2;    # Used for /dfs/images/US directory names
    $dir2     = $3;    # Used also for indexing into DFS_Server associative array.
    # NOTE(review): $dir1 and $dir2 are not used anywhere else in this file.

    # Convert DVD filename, to Delphion-standard patn.  Mostly this is to convert
    # the rare fractional patents (all on usp422).
    if    ($Image_fn =~ /^[A-Z]{0,2}\d{6,8}$/) { $patn = "US${Image_fn}__"; }
    elsif ($Image_fn =~ /^([X0D])(\d{6})H$/ )  { $patn = "US${1}0${2}12"; }
    elsif ($Image_fn =~ /^([X0D])(\d{6})D$/ )  { $patn = "US${1}0${2}14"; }
    elsif ($Image_fn =~ /^([X0D])(\d{6})L$/ )  { $patn = "US${1}0${2}34"; }
    elsif ($Image_fn =~ /^([X0D])(\d{6})N$/ )  { $patn = "US${1}0${2}78"; }
    else  { die "Could not make sense of $Image_fn file name.\n" }

    if ($Done{$patn}) {    # If we've previously scp'd this patent to Southbury,
        return;            # then great.  Just return quietly.
    }

    # Ok, we're definitely going to process this image.
    # Reset all temporary variables.
    $bad_file    = $abs = $desc = $drawing = $claim = $biblio = "";
    $image_pages = $db2_set_clause = $temp_file_list = "";
    # $destination="/dfs/cdrom/old_US_Images/Consolidated_Images/$patn.$cdlabel.tif";
    # NOTE(review): with the assignment above commented out, $destination is
    # never set in this file, so the scp below is given "$target:" with an
    # empty remote path.  Confirm that is intended before relying on the
    # /dfs/images/US/nn/nn layout described at the top of the script.

    # Step 1) Check for tif errors, rewriting the image with tiffcp if need be.
    #
    # If there are errors, the tiffcp program will return error messages (thus this "if"
    # statement will be true), or run quietly (thus false) if there were no errors.
    if (`$tiffcp $this_file /dev/null 2>&1`) {
        # Ooops, tiffcp detected an error.  Use tiffcp to really rewrite the
        # image file and change $this_file to the newly-rewritten image file.
        $bad_file = $this_file;
        # Note this change!  From here on, $this_file no longer points to the DVD
        # image file.  It now is pointing to the fixed image in /dfs/cdrom/$cdlabel.
        $this_file = "$outdir/$patn.tif";
        `$tiffcp $bad_file $this_file 2>/dev/null`;
        if ($?) { die "Couldn't rewrite $bad_file to $this_file with tiffcp." }
        print STDOUT "Rewrote $bad_file to fix tif errors. Continuing ...\n";
        _append_locked(\*NONFATAL,
            "Rewrote $bad_file to fix tif errors.\n",
            "Can't lock NONFATAL file at 1");
        $temp_file_list = "$this_file";    # Erase this file when done.
    }

    # Step 2) scp $this_file to Southbury.  Note at this point, $this_file could be
    #   - The original DVD image.
    #   - or the fixed image in /dfs/cdrom/$cdlabel.
    #
    # $size is reported in the error and Done lines below; it used to be
    # interpolated without ever being assigned.  Take it from the file we are
    # actually about to copy.
    $size   = (-s $this_file) || 0;
    $target = "$SBY_userid\@dephds005";
    print STDOUT "scp $this_file $target:$destination\n" if ($debug);
    $scp_string_output = `$scp -p $this_file $target:$destination 2>&1`;
    $status = $?;
    if ($status != 0) {
        print STDERR "scp error: $this_file to $target:$destination ($size bytes).\n";
        _append_locked(\*ERRORS,
            "scp error: $this_file to $target:$destination ($size bytes)\n",
            "Can't lock Errors file at 3");
        # Don't leave the rewritten copy behind when we give up on this image.
        if ($temp_file_list) { unlink $temp_file_list }
        return;    # Abort from our Handle_Image_File routine.  We did not
                   # successfully handle this image file.
    }
    # print STDOUT "Child #$child_number copied $this_file to $destination ok.\n" if ($debug);

    # Step 3) If all's well, then write a line into our DONE file.
    # Plain print, not printf: the line is data, not a format string.
    _append_locked(\*DONE, "$patn $size $destination\n", "Can't lock DONE file at 1");

    # Erase any temporary files we may have created.
    if ($temp_file_list) { unlink $temp_file_list }
}

# Append $text to the shared, already-open handle $fh under an exclusive
# flock, so lines written by sibling child processes don't interleave.
# Dies with $lock_error if the lock cannot be obtained.
sub _append_locked {
    my ($fh, $text, $lock_error) = @_;
    flock($fh, LOCK_EX) or die $lock_error;
    seek($fh, 0, 2);    # Whence 2 = end of file, in case someone appended
                        # while we were waiting.  (Thanks, Tom)
    print $fh $text;
    flock($fh, LOCK_UN);
}