#!/dfs/prod/ipn/bin/perl
#
# Audit one back-file US patent DVD (e.g. usp001) against DFS and the DB2 imag table.
#
# For every image named in the DVD size file, decide whether that image belongs in DFS
# (it must be in the cumulative index and be at least 1000 bytes), find it under
# /dfs/images/US, and then record what has to be done in per-DVD work files (DB2 updates,
# DFS "rm" commands, files still to fetch from DVD, ...).  Progress is checkpointed every
# $checkpoint_frequency patents so that an interrupted run can resume.
#
# An example run is   cd ~rickjas/old_US_Images
#              then   check.usp.pl usp002
#          or maybe   nohup check.usp.pl usp001 | tee Logs/usp001
#
# Another way to run this script and stay authenticated to DCE/DFS
# (maybe -- I haven't tested this), is to use my buddy script like so,
#
#     ~rickjas/bin/buddy -p rickjas \
#                        -k ~rickjas/old_US_Images/.rickjas.ktab \
#                        command to run
#
# NOTE: this script deliberately uses package globals and bareword filehandles (no
# "use strict"); the subroutines further down share $patn, $DFS_fn, $cdlabel, the DB_*
# hashes and the DB2/DFS/NEWDFS/TODO/FIX/SNH/LOG handles with this mainline.

$debug=0;
$debug=1;
$checkpoint_frequency=20;       # For testing.
$checkpoint_frequency=200;

if ($ARGV[0]) {                 # Was a parameter given to us?
    $cdlabel = lc $ARGV[0];
} else {
    die "$0 needs the label name of an back-file US DVD, e.g. usp001.\n";
}
# Anchored with \A..\z (not ^..$) so a trailing newline cannot sneak into $cdlabel,
# which is interpolated into file names and shell commands below.
if ($cdlabel !~ /\Ausp[0-4][0-9][0-9]\z/ && $cdlabel ne "usp999") {
    die "$0 needs a valid label name of an back-file US DVD, e.g. usp002.\n";
}

# Input Files
$DVD_Size_File="DVD.Sizes/$cdlabel";
$Cum_Idx_File="Cum.Idx/$cdlabel";
$DB2_Data_File="DB2/$cdlabel";
$Whole_Cum_Idx_File="usp423.cum.idx";

# Output Files
$Checkpoint_File="Checkpoint/$cdlabel";
$DB2_Update_File="UpdateDB2/$cdlabel";
$DB2_File_Not_Written_To_Yet=1;             # DB2 Header Comment Flag
$DFS_Update_File="UpdateDFS/$cdlabel";
$New_DFS_File_List="NewDFSFiles/$cdlabel";
$Still_To_Get_File="ToDo/$cdlabel";
$Fix_File="TiffCP/$cdlabel";
$SNH_File="SNH/$cdlabel";
$Log_File="Logs/$cdlabel";

# Programs.  Each one is checked against its OWN path (the tiffcp and tiffinfo checks
# used to test $anyinfo by mistake, so a missing tiffcp/tiffinfo went unnoticed).
$any2any="/dfs/prod/ipn/bin/any2any";
if (! -x "$any2any")  { die "Could not find or execute $any2any program."}
$anyinfo="/dfs/prod/ipn/bin/anyinfo";
if (! -x "$anyinfo")  { die "Could not find or execute $anyinfo program."}
$tiffcp="/dfs/ipntools/tiffcp";
if (! -x "$tiffcp")   { die "Could not find or execute $tiffcp program."}
$tiffinfo="/dfs/ipntools/tiffinfo";
if (! -x "$tiffinfo") { die "Could not find or execute $tiffinfo program."}

if (! -r "$DVD_Size_File") {
    die "I can't find the DVD Size file (./$DVD_Size_File).\nAs jasper on patimg1, run ~/get.DVD.sizes $cdlabel\nthen scp it here as directed by the get.DVD.sizes script.\n";
}

$max_erased_patn="";            # Maximum patent rm'd from DFS
$patn_number = 0;               # Heartbeat counter: how many DVD images we have looked at
$number_erased = 0;             # How many images are (or would be) rm'd from DFS
$too_small_only = $not_in_index_only = $not_in_index_and_too_small = $tif_errors_fixed = 0;
$still_todo = $normal_db2_updates = $delete_from_db2 = $insert_missing_db2_data = 0;
$new_original_file = $new_X6_file = $new_Reexam_file = 0;
select(STDOUT); $|=1;

if (! -r "$Cum_Idx_File") {
    die "I can't find the Cumulative Index file (./$Cum_Idx_File).\n";
}
if (! -r "$Whole_Cum_Idx_File") {
    die "I can't find the complete Cumulative Index file (./$Whole_Cum_Idx_File).\n";
}

print "Reading the $cdlabel lines from the Cumulative Index (./$Cum_Idx_File) ...\n";
@lines = `/usr/bin/awk '{print \$1}' $Cum_Idx_File`;
chomp(@lines);                  # In list mode, chomp works on each element.
$Cum_Idx_count=0;
foreach $this_line (@lines) {
    # Lines originally are   patent  CDlabel Pages
    #              like so   0000003 USP001 4
    #                   or   D001504 USP001 2
    #                   or   RD25414 USP001 2
    #                   or   RE00021 USP001 4
    #                   or   X011280 USP001 1
    #                   or   001400H USP356 1
    #                   or   X01681H USP422 3
    # but since I return just the first token, all I've got in the lines are the patent number.
    # Now encode the patent number to Delphion's standard patn for our hash's key, and determine
    # the DVD filename for our hash's value.
    if ($this_line =~ /^\d{7}$/) {
        $patn = "US0${this_line}__";
        $DVD_fn = "0$this_line";
    } elsif ($this_line =~ /^([DHTX])(\d{5,6})$/) {
        $patn = "US${1}0${2}__";
        $DVD_fn = "${1}0$2";
    } elsif ($this_line =~ /^(PP|AI|RE|RD|RX)(\d{5})$/) {
        $patn = "US${1}0${2}__";
        $DVD_fn = "${1}0$2";
    } elsif ($this_line =~ /^([X0D])(\d{5})H$/) {    # Fractional Patent.  H means 1/2, so kind=12.
        $patn = "US${1}00${2}12";
        $DVD_fn = "${1}0${2}H";
    } elsif ($this_line =~ /^([X0D])(\d{5})D$/) {    # Fractional Patent.  D means 1/4, so kind=14.
        $patn = "US${1}00${2}14";
        $DVD_fn = "${1}0${2}D";
    } elsif ($this_line =~ /^([X0D])(\d{5})L$/) {    # Fractional Patent.  L means 3/4, so kind=34.
        $patn = "US${1}00${2}34";
        $DVD_fn = "${1}0${2}L";
    } elsif ($this_line =~ /^([X0D])(\d{5})N$/) {    # Fractional Patent.  N means 7/8, so kind=78.
        $patn = "US${1}00${2}78";
        $DVD_fn = "${1}0${2}N";
    } else {die "Did not match $this_line from Cumulative Index file.\n"}

    $In_Cum_Idx{$patn} = 1;
    $DVD_fn{$patn} = $DVD_fn;
    # eg, Set $In_Cum_Idx{US05224775__}=>1< and $DVD_fn{US05224775__}=>05224775<
    # print "Set \$In_Cum_Idx{$patn}=>$In_Cum_Idx{$patn}< and \$DVD_fn{$patn}=>$DVD_fn<\n" if ($debug);
    $Cum_Idx_count++;
}

# $patn='USX000974112';
# if (exists($DVD_fn{$patn})) {print "$patn is on $cdlabel with filename=$DVD_fn{$patn}.\n"}
# else {print "$patn is not on $cdlabel.\n"}
# $patn='US00000000__';
# if (exists($DVD_fn{$patn})) {print "$patn is on $cdlabel with filename=$DVD_fn{$patn}.\n"}
# else {print "$patn is not on $cdlabel.\n"}

print "Reading the DVD size file (./$DVD_Size_File) ...\n";
$Size_count=0;
open(SIZEFILE,"<$DVD_Size_File") || die "Error opening size file (./$DVD_Size_File).";
while (<SIZEFILE>) {
    # Each line is DVD_filename & size, e.g.  01936479 149867  or  X009370H 38926
    if (/^(\w{8}) (\d*)$/) {
        $DVD_fn=$1;
        $this_size=$2;
        # Convert DVD filename, to Delphion-standard patn to use as our hash's key.
        if ($DVD_fn =~ /^[A-Z]{0,2}\d{6,8}$/) {
            $patn = "US${DVD_fn}__";
        } elsif ($DVD_fn =~ /^([X0D])(\d{6})H$/ ) {
            $patn = "US${1}0${2}12";
        } elsif ($DVD_fn =~ /^([X0D])(\d{6})D$/ ) {
            $patn = "US${1}0${2}14";
        } elsif ($DVD_fn =~ /^([X0D])(\d{6})L$/ ) {
            $patn = "US${1}0${2}34";
        } elsif ($DVD_fn =~ /^([X0D])(\d{6})N$/ ) {
            $patn = "US${1}0${2}78";
        } else {die "Could not make sense of $DVD_fn file name from size file (./$DVD_Size_File).\n"}
        # print "Setting \$DVD_Image_Size{$patn} to $this_size.\n" if ($debug);
        $Size_count++;
        $DVD_Image_Size{$patn} = $this_size;
    } else {die "Did not understand $_ from size file (./$DVD_Size_File).\n"}
}
close(SIZEFILE);
print "\n$Cum_Idx_count out of $Size_count images from $cdlabel should be in Southbury.\n\n";

if (open(DB2_DATA,"<$DB2_Data_File")) {
    print "Reading the DB2 data from the $DB2_Data_File file\n";
} else {                                        # The pre-extracted DB2 data ain't there.
    # NOTE(review): each backtick command runs in its own child shell, so sourcing the
    # db2profile here cannot change the environment seen by the later db2 commands.
    # Left as found; confirm the db2 environment is already set up by the caller.
    if (-f "/home/inst1/db2profile") {          # Go get it from the database itself.
        `. /home/inst1/db2profile`;
        `db2 connect to patent user inst1 using inst1_password`;
    } elsif (-f "/home/caeadmin/sqllib/db2profile") {
        `. /home/caeadmin/sqllib/db2profile`;
        `db2 connect to pdbsrch1 user inst1 using inst1_password`;
    } else {die "Could not open nor create DB2 data file (./$DB2_Data_File).\n"}
    print "Am extracting the DB2 data for $cdlabel from the database. This will take a few minutes ...\n";
    $cols="patn,srh,abs,desc,amend,drawing,claim,biblio,image_pages,cdlabel,datasrc,namesrc";
    $db2cmd="db2 -x \"select $cols from imag where namesrc like '$cdlabel%' order by patn\"";
    open(DB2_DATA, "$db2cmd|") || die "Can't execute DB2 command.\n";
}

$DB2_count=0;
while (<DB2_DATA>) {
    chomp;
    # The Southbury imag table has a Primary Key of patn, datasrc, cdlabel.  This could lead
    # to this interesting situation that I created with one test run of this program.
    #   PATN         IMAGE_PAGES CDLABEL  DATASRC NAMESRC  ABS DESC CLAIM
    #   ------------ ----------- -------- ------- -------- --- ---- -----
    #   US00001383__           2 usp001   USG     usp001     1    2     2  <-- This row added instead
    #   US00001383__           2 UNKNOWN  USG     usp001     -    -     -  <-- of updating this one.
    #   US00001383__           2 20030218 USG     20030218   -    -     -
    #
    # The input file was created with
    #   db2 -x "select patn,srh,abs,desc,amend,drawing,claim,biblio,image_pages,cdlabel,datasrc,namesrc
    #           from imag where namesrc like '$i%' order by patn"
    #
    # So the input doesn't have the DB2 headers.  I just put them here to help me.
    # The input looks like,
    #
    #   PATN         SRH ABS DESC AMEND DRAWING CLAIM BIBLIO IMAGE_PAGES CDLABEL   DATASRC NAMESRC
    #   ------------ --- --- ---- ----- ------- ----- ------ ----------- --------- ------- -------
    #   USD0000013__   -   -    -     -       -     -      -           3 199902231 USG     usp001
    #   USX0009885__   -   -    -     -       -     -      -           2 UNKNOWN   USG     usp001
    if (/^(\w{12})\s*(-|\d*)\s*(-|\d*)\s*(-|\d*)\s*(-|\d*)\s*(-|\d*)\s*(-|\d*)\s*(-|\d*)\s*(-|\d*)\s*(\w*)\s*(\w*)\s*(\w*)\s*$/) {
        $patn=$1;
        $DB_srh{$patn}     = $2;
        $DB_abs{$patn}     = $3;
        $DB_desc{$patn}    = $4;
        $DB_amend{$patn}   = $5;
        $DB_drawing{$patn} = $6;
        $DB_claim{$patn}   = $7;
        $DB_biblio{$patn}  = $8;
        $DB_pages{$patn}   = $9;
        $DB_cdlabel{$patn} = $10;
        $DB_datasrc{$patn} = $11;
        $DB_namesrc{$patn} = $12;
        # print "DB2 Data for $patn is $DB_srh{$patn} $DB_abs{$patn} $DB_desc{$patn} $DB_amend{$patn} $DB_drawing{$patn} $DB_claim{$patn} $DB_biblio{$patn} $DB_pages{$patn} $DB_cdlabel{$patn} $DB_datasrc{$patn} $DB_namesrc{$patn}\n" if ($debug);
        $DB2_count++;
    } else {die "Did not understand $_ from DB2 data.\n"}
}
close(DB2_DATA);
`db2 terminate`;
print "\nFound $DB2_count images from $cdlabel in the imag table.\n\n";

# At this point, for any Delphion-standard $patn, I've got these hashes set up
#   $In_Cum_Idx{$patn}   $DVD_fn{$patn}      $DVD_Image_Size{$patn}
#
#   $DFS_fn{$patn}       $DFS_size{$patn}
#
#   $DB_srh{$patn}       $DB_abs{$patn}      $DB_desc{$patn}
#   $DB_amend{$patn}     $DB_drawing{$patn}  $DB_claim{$patn}
#   $DB_biblio{$patn}    $DB_pages{$patn}    $DB_datasrc{$patn}
#   $DB_namesrc{$patn}

if (-w $Checkpoint_File && -s $Checkpoint_File) {
    # Our checkpoint file exists.  Read the last line, whose first word should be
    # the last patent we have processed.  That will be our $last_checkpoint.
    # Every output file is restored from its .chkpt copy and reopened for append.
    $last_checkpoint=`/usr/bin/tail -1 $Checkpoint_File | awk '{print \$1}'`;
    chomp $last_checkpoint;
    print "Resuming from last checkpoint, after patent $last_checkpoint (per $Checkpoint_File).\n";

    if (! -w "$DB2_Update_File.chkpt") {die "Missing $DB2_Update_File.chkpt"}
    `/usr/bin/cp -p $DB2_Update_File.chkpt $DB2_Update_File`;
    open(DB2,">>$DB2_Update_File") || die "Couldn't open DB2 Update File (./$DB2_Update_File).\n";
    # The header flag belongs to the DB2 update file, so test THAT file (this used to
    # test the DFS update file).  NOTE(review): assumes the header in question is the
    # one written into $DB2_Update_File by Write_Into_DB2_File -- confirm.
    if (-s "$DB2_Update_File") {
        $DB2_File_Not_Written_To_Yet=0;         # Don't write header twice
    }

    if (! -w "$DFS_Update_File.chkpt") {die "Missing $DFS_Update_File.chkpt"}
    `/usr/bin/cp -p $DFS_Update_File.chkpt $DFS_Update_File`;
    open(DFS,">>$DFS_Update_File") || die "Couldn't open DFS Update File (./$DFS_Update_File).\n";

    if (! -w "$New_DFS_File_List.chkpt") {die "Missing $New_DFS_File_List.chkpt"}
    `/usr/bin/cp -p $New_DFS_File_List.chkpt $New_DFS_File_List`;
    open(NEWDFS,">>$New_DFS_File_List") || die "Couldn't open New DFS File List (./$New_DFS_File_List).\n";

    if (! -w "$Still_To_Get_File.chkpt") {die "Missing $Still_To_Get_File.chkpt"}
    `/usr/bin/cp -p $Still_To_Get_File.chkpt $Still_To_Get_File`;
    open(TODO,">>$Still_To_Get_File") || die "Couldn't open Still-To-Do File (./$Still_To_Get_File).\n";

    if (! -w "$Fix_File.chkpt") {die "Missing $Fix_File.chkpt"}
    `/usr/bin/cp -p $Fix_File.chkpt $Fix_File`;
    open(FIX,">>$Fix_File") || die "Couldn't open Fix File (./$Fix_File).\n";

    if (! -w "$SNH_File.chkpt") {die "Missing $SNH_File.chkpt"}
    `/usr/bin/cp -p $SNH_File.chkpt $SNH_File`;
    open(SNH,">>$SNH_File") || die "Couldn't open Should-Never-Happen File (./$SNH_File).\n";

    if (! -w "$Log_File.chkpt") {die "Missing $Log_File.chkpt"}
    `/usr/bin/cp -p $Log_File.chkpt $Log_File`;
    open(LOG,">>$Log_File") || die "Couldn't open Log File (./$Log_File).\n";

} else {
    # Our checkpoint file doesn't exist or is empty, so we'll need to start
    # from the beginning, so set $last_checkpoint to something such that any
    # $patn will be less than.  Open our output files, taking care to preserve
    # one generation.
    $last_checkpoint='!';       # "!" sorts before every letter and digit in ASCII.

    if (-s "$DB2_Update_File") {
        `/usr/bin/mv $DB2_Update_File $DB2_Update_File.old`;
        if ($?) {die "Couldn't checkpoint the DB2 Update file."}
    }
    open(DB2,">$DB2_Update_File") || die "Couldn't open DB2 Update file (./$DB2_Update_File).\n";

    if (-s "$DFS_Update_File") {
        `/usr/bin/mv $DFS_Update_File $DFS_Update_File.old`;
        if ($?) {die "Couldn't checkpoint the DFS Update file."}
    }
    open(DFS,">$DFS_Update_File") || die "Couldn't open DFS Update file (./$DFS_Update_File).\n";

    if (-s "$New_DFS_File_List") {
        `/usr/bin/mv $New_DFS_File_List $New_DFS_File_List.old`;
        if ($?) {die "Couldn't checkpoint the New DFS File list."}
    }
    open(NEWDFS,">$New_DFS_File_List") || die "Couldn't open New DFS File list (./$New_DFS_File_List).\n";

    if (-s "$Still_To_Get_File") {
        `/usr/bin/mv $Still_To_Get_File $Still_To_Get_File.old`;
        if ($?) {die "Couldn't checkpoint the Still-To-Do file."}
    }
    open(TODO,">$Still_To_Get_File") || die "Couldn't open Still-To-Do file (./$Still_To_Get_File).\n";

    if (-s "$Fix_File") {
        `/usr/bin/mv $Fix_File $Fix_File.old`;
        if ($?) {die "Couldn't checkpoint the Fix file."}
    }
    open(FIX,">$Fix_File") || die "Couldn't open Fix file (./$Fix_File).\n";

    if (-s "$SNH_File") {
        `/usr/bin/mv $SNH_File $SNH_File.old`;
        if ($?) {die "Couldn't checkpoint the Should-Never-Happen file."}
    }
    open(SNH,">$SNH_File") || die "Couldn't open Should-Never-Happen file (./$SNH_File).\n";

    if (-s "$Log_File") {
        `/usr/bin/mv $Log_File $Log_File.old`;
        if ($?) {die "Couldn't checkpoint the Log file."}
    }
    open(LOG,">$Log_File") || die "Couldn't open Log file (./$Log_File).\n";
}

# Unbuffer every output handle, then leave STDOUT selected.
select(DB2); $|=1; select(DFS); $|=1; select(NEWDFS); $|=1; select(TODO); $|=1;
select(FIX); $|=1; select(SNH); $|=1; select(LOG);    $|=1; select(STDOUT);

# Now do some serious checking of what all we found.

foreach $patn (sort keys %DVD_Image_Size) {

    # Remember the patn here: foreach restores a global loop variable to its previous
    # value when the loop exits, so $patn cannot be read back after the loop.
    $last_patn = $patn;

    if ($patn le $last_checkpoint) {    # If we're not at our last checkpoint yet,
        $patn_number++;                 # then keep skipping forward while keeping
        next;                           # our heartbeat counter accurate.
    }

    # Reset all temporary variables.
    $DFS_fn = $abs = $desc = $drawing = $claim = $biblio = $image_pages = $db2_set_clause = "";

    # E.G. US00001383__ lives in /dfs/images/US/83/13
    $Image_Dir="/dfs/images/US/" . substr($patn,8,2) . "/" . substr($patn,6,2);

    # What we ultimately want the primary image to be named.
    $Wanted_DFS_fn="$Image_Dir/$patn.$cdlabel.tif";

    $patn_number++;
    print "Checking patent $patn from $cdlabel DVD = $patn_number/$Size_count ...\n" if ($debug);

    # Try to find this image in DFS.  Search in order,
    #   1) $patn.$cdlabel.tif ($Wanted_DFS_fn),
    #   2) $patn.tif (how Danny loaded half the images back in 2001),
    #   3) anything else with $patn in the name and the proper number of bytes.
    # Perl's -r follows links and we don't want to follow links, thus the -l ...
    # (The -l test must look at $Wanted_DFS_fn; $DFS_fn was just reset to "".)
    if (-r $Wanted_DFS_fn && ! -l $Wanted_DFS_fn) {
        # If we find the image with the CD label in its name, then we've probably
        # already processed this image and now, we're only checking our work.  Note
        # that we don't verify this image file with the DVD byte count due to that
        # possible munging (i.e. we've Split_Off_Pieces or fixed tif errors with tiffcp).
        $DFS_fn=$Wanted_DFS_fn;
    } else {
        # Else search for the name Danny would have used, but also confirm the file's
        # size.  This way we won't misidentify an image from another source.
        $tmp_fn="$Image_Dir/$patn.tif";
        if (-r $tmp_fn && ! -l $tmp_fn && (-s $tmp_fn) == $DVD_Image_Size{$patn}) {
            $DFS_fn=$tmp_fn;
        } else {
            # It's not uncommon for our real DVD image to exist in DFS but it's been renamed.
            # For example, US00001383 came in on usp001 in April, 2001, but got replaced in
            # February, 2003 by another image from the normal data stream, getting renamed
            # to US00001383__.tif.old.
            #
            # So we make another attempt to find our DVD image based on the size of the file
            # when it was on the DVD and having the patent number in the filename.  The
            # US00001383__.tif.old image for example, can be found with this find command,
            #     find /dfs/images/US/83/13 -type f -size 159252c -name '*US00001383*'
            #
            # If we can find it, then we'll process it as normal.
            $find_command="/usr/bin/find $Image_Dir -type f -size $DVD_Image_Size{$patn}c -name '*$patn*'";
            my @lines = `$find_command`;
            if ($#lines == -1) {
                print "Couldn't find $patn in /dfs/images anyhow/anyway.\n" if ($debug);
            } elsif ($#lines > 0) {
                print "Found too many $patn images, all the same size ($DVD_Image_Size{$patn}).\n" if ($debug);
                print SNH "Found too many $patn images, all the same size ($DVD_Image_Size{$patn}).\n" if ($debug);
            } else {
                $DFS_fn=$lines[0];      # Aha!  Found it.  We want to process this guy.
                chomp $DFS_fn;
                print "Found $DFS_fn for $patn ($DVD_Image_Size{$patn} bytes).\n" if ($debug);
                print LOG "Found $DFS_fn for $patn ($DVD_Image_Size{$patn} bytes).\n";
            }
        }
    }

    # At this point, $DFS_fn contains the filename of our image from DVD, or null if we
    # couldn't find it in DFS.  Now see if we want this image in DFS and in DB2 at all.
    #
    # There are 2 reasons why an image on DVD, should not get transferred to SBY.
    #   1) It's not in the final Cumulative Index file on the last DVD (usp423).
    #      Each DVD had a Cumulative Index file, but the very last one, is the master.
    #   2) The image is too small.  There are many images on DVD that are about
    #      863 bytes large (give or take) that simply say "WITHDRAWN".  Sometimes
    #      (most of the times?  I don't know, I didn't check), the image is rescanned
    #      on a later DVD (e.g. USRX00002 on usp001 says "WITHDRAWN", but is ok on usp301).
    #      Other times, it's never rescanned (e.g. US00000100 on USP001 or US002163300
    #      on USP422).  Oh, well.
    #
    # Identify and handle the 4 combinations of being in DFS or not, and wanting it or not.
    #
    # For historical interest, here is how the 41,495 patents on usp001 broke down,
    #   Combination 1: In DFS, but we don't want it.      3,932 Total
    #                      3,678 not in the index file (and not too small)
    #                            All but USD0016162 were replaced on later DVDs.
    #                        154 not in the index file and too small
    #                        100 too small
    #   Combination 2: Not in DFS and we don't want it.     510? Total
    #   Combination 3: Not in DFS and we do want it.        143 Total
    #   Combination 4: In DFS and we do want it.         36,910? Total

    # Combination 1: In DFS, but we don't want it.
    if ($DFS_fn && (! $In_Cum_Idx{$patn} || $DVD_Image_Size{$patn} < 1000)) {
        if ((! $In_Cum_Idx{$patn}) && ($DVD_Image_Size{$patn} < 1000)) {
            $reason="not in the index file & too small.";
            $not_in_index_and_too_small++;
        } elsif (! $In_Cum_Idx{$patn}) {
            $reason="not in the index file ($DVD_Image_Size{$patn} bytes).";
            $not_in_index_only++;
            print LOG "$patn will be rm'd for not being in the index file only ($DVD_Image_Size{$patn} bytes).\n";
        } else {
            $reason="too small.";
            $too_small_only++;
        }
        print "rm-ing $DFS_fn 'cause it's $reason\n";
        print DFS "rm -f $DFS_fn   # $reason\n";
        $number_erased++;                       # Count number of patents erased
        if ($patn gt $max_erased_patn) {$max_erased_patn=$patn}
        $rmd_patents{$patn}=$DFS_fn;            # Remember data from this image
        $rmd_reason{$patn}=$reason;
        Remove_From_DB2_If_There();

    # Combination 2: Not in DFS and we don't want it.
    } elsif (! $In_Cum_Idx{$patn} || $DVD_Image_Size{$patn} < 1000) {
        # If this image doesn't belong in DFS & it's not there, good!  That's how it should be.
        if ((! $In_Cum_Idx{$patn}) && ($DVD_Image_Size{$patn} < 1000)) {
            $reason="not in the index file & too small.";
            $not_in_index_and_too_small++;
        } elsif (! $In_Cum_Idx{$patn}) {
            $reason="not in the index file ($DVD_Image_Size{$patn} bytes).";
            $not_in_index_only++;
            print LOG "$patn would have been rm'd for not being in the index file only ($DVD_Image_Size{$patn} bytes).\n";
        } else {
            $reason="too small.";
            $too_small_only++;
        }
        print "$patn is not and should not be in DFS 'cause it's $reason\n" if ($debug);
        # Even tho' we didn't have to erase it, keep track of it for later replacement checking.
        $number_erased++;                       # Count number of patents erased
        if ($patn gt $max_erased_patn) {$max_erased_patn=$patn}
        $rmd_patents{$patn}="No_Real_File";     # Remember data from this image
        $rmd_reason{$patn}=$reason;
        Remove_From_DB2_If_There();

    # Combination 3: Not in DFS and we do want it.
    } elsif (! $DFS_fn) {                               # If our image is not in DFS at all,
        print "Still have $patn to do\n" if ($debug);   # then we need to go get it still.
        $still_todo++;
        # Use the $patn to get the DVD filename, which should be just the 8-character filename
        # (no directory names).  Write the fully-qualified file name in our ToDo file,
        #   eg /cdrom/usp001/usp001/X0/003/X0003089.tif
        if ($DVD_fn{$patn}=~m/^(..)(...)...$/) {
            print TODO "/cdrom/$cdlabel/$cdlabel/$1/$2/$DVD_fn{$patn}.tif\n";
        } else {die "Failed sanity check.  DVD fn for $patn ($DVD_fn{$patn}) is not 8 characters long.\n"}
        Remove_From_DB2_If_There();             # I don't expect it to be in DB2, but ...

    # Combination 4: In DFS and we do want it.
    } else {
        Process_Image_File();   # Warning!  Process_Image_File does a lot and
    }                           # things may have changed in that call, eg $DFS_fn.

    # Checkpoint check.  After finishing $checkpoint_frequency (200) patents, we will
    # close all output files, copy them to a checkpoint name, append to the checkpoint
    # file, and then reopen the output files in append mode.  Be as atomic as possible.
    if ($patn_number % $checkpoint_frequency == 0) {
        Checkpoint();
        open(DB2,">>$DB2_Update_File")      || die "Couldn't open DB2 Update File (./$DB2_Update_File) after checkpointing $patn.\n";
        open(DFS,">>$DFS_Update_File")      || die "Couldn't open DFS Update File (./$DFS_Update_File) after checkpointing $patn.\n";
        open(NEWDFS,">>$New_DFS_File_List") || die "Couldn't open New DFS File List (./$New_DFS_File_List) after checkpointing $patn.\n";
        open(TODO,">>$Still_To_Get_File")   || die "Couldn't open Still-To-Do File (./$Still_To_Get_File) after checkpointing $patn.\n";
        open(FIX,">>$Fix_File")             || die "Couldn't open Fix File (./$Fix_File) after checkpointing $patn.\n";
        open(SNH,">>$SNH_File")             || die "Couldn't open Should-Never-Happen File (./$SNH_File) after checkpointing $patn.\n";
        open(LOG,">>$Log_File")             || die "Couldn't open Log File (./$Log_File) after checkpointing $patn.\n";
        select(DB2); $|=1; select(DFS); $|=1; select(NEWDFS); $|=1; select(TODO); $|=1;
        select(FIX); $|=1; select(SNH); $|=1; select(LOG);    $|=1; select(STDOUT);
    }
}   # End of our foreach $patn loop.

# For completeness, scan through both the Cum.Idx and DB2 arrays, looking for patent
# numbers that were not on DVD (ie, in the DVD_Image_Size array).
# ($last_patn, captured inside the loop above, is used for our last checkpoint below.)

print "Checking for Cumulative Index files that weren't on the DVD ...\n";
foreach $patn (sort keys %In_Cum_Idx) {
    if (! $DVD_Image_Size{$patn}) {
        # print "$patn found in Cumulative Index file, but not on DVD??  How can this be.\n";
        print SNH "$patn found in Cumulative Index file, but not on DVD.\n";
    }
}

print "Checking for patents in the database, but weren't on the DVD ...\n";
foreach $patn (sort keys %DB_datasrc) {
    if (! $DVD_Image_Size{$patn}) {
        # print "$patn found in database, but not on DVD??  How can this be.\n";
        print SNH "$patn found in database, but not on DVD?? How can this be.\n";
    }
}

# If we removed any files in DFS (ok, wrote into our UpdateDFS file to remove
# the files -- we don't actually erase in this script), then look through the
# complete Cumulative Index file to insure the (to-be-) erased file is/will be
# replaced on a future DVD.  Most of the times, they are, but sometimes, not.

if (%rmd_patents) {     # Don't bother with this check if we didn't erase any files.
    print "Checking the $number_erased erased patents to insure they're later replaced ...\n";
    $found=0;
    $found_bad=0;
    close INPUTFILE;
    open(INPUTFILE,"<$Whole_Cum_Idx_File") || die "Error opening Cumulative Index file (./$Whole_Cum_Idx_File).";
    while (<INPUTFILE>) {
        # Lines are like so:   patent  CDlabel Pages
        #                 or   0000003 USP001 4
        #                 or   D001504 USP001 2
        #                 or   RD25414 USP001 2
        # Encode the patent number to Delphion's standard patn for our hash's key, and
        # pick up the label of the DVD the index says this patent lives on.
        if (/^(\d{7}) (\w*)/) {
            $patn = "US0${1}__";
            $cdlbl = $2;
        } elsif (/^([DHTX])(\d{5,6}) (\w*)/) {
            $patn = "US${1}0${2}__";
            $cdlbl = $3;
        } elsif (/^(PP|AI|RE|RD|RX)(\d{5}) (\w*)/) {
            $patn = "US${1}0${2}__";
            $cdlbl = $3;
        } elsif (/^([X0D])(\d{5})H (\w*)/) {    # Fractional Patent.  H means 1/2, so kind=12.
            $patn = "US${1}00${2}12";
            $cdlbl = $3;
        } elsif (/^([X0D])(\d{5})D (\w*)/) {    # Fractional Patent.  D means 1/4, so kind=14.
            $patn = "US${1}00${2}14";
            $cdlbl = $3;
        } elsif (/^([X0D])(\d{5})L (\w*)/) {    # Fractional Patent.  L means 3/4, so kind=34.
            $patn = "US${1}00${2}34";
            $cdlbl = $3;
        } elsif (/^([X0D])(\d{5})N (\w*)/) {    # Fractional Patent.  N means 7/8, so kind=78.
            $patn = "US${1}00${2}78";
            $cdlbl = $3;
        } else {die "Did not match $_ from Cumulative Index file.\n"}

        if (exists $rmd_patents{$patn}) {
            $found++;
            $real_image_cd_label{$patn}=$cdlbl;
            if ($cdlbl eq uc $cdlabel) {
                $found_bad++;
                print "$patn is on $cdlabel ????  That should not be.\n";
                print SNH "$patn is on $cdlabel ???? That should not be.\n";
            }
            # Stop reading once every erased patent is accounted for.  (This used to say
            # "leave", which is not a Perl keyword, so the loop never exited early.)  The
            # old second test, $patn gt $max_erased_patn, is intentionally not used: index
            # order and patn order disagree for fractional patents (X011280 comes before
            # X01681H in the index, yet USX0011280__ gt USX000168112), so it could stop
            # the scan before a replacement was seen.
            last if ($found == $number_erased);
        }
    }
    close INPUTFILE;

    print "Only $found found out of $number_erased.  Here are the patents that were never replaced ...\n";
    foreach $patn (sort keys %rmd_patents) {
        if (! $real_image_cd_label{$patn}) {
            if ($rmd_reason{$patn} =~ /too small/) {
                print "$rmd_patents{$patn} was erased from DFS but never replaced, but it was too small anyway.\n";
                print LOG "$rmd_patents{$patn} was erased from DFS but never replaced, but it was too small anyway.\n";
            } else {
                print "$rmd_patents{$patn} was erased from DFS but never replaced and it was not too small.\n";
                print LOG "$rmd_patents{$patn} was erased from DFS but never replaced and it was not too small.\n";
            }
        }
    }
}

# Checkpoint one last time.  This time, we'll keep everything closed up.
$patn=$last_patn;       # Restore this so our last checkpoint shows the last DVD
Checkpoint();           # patn, not the last one from our last checking.

# NOTE(review): Checkpoint() is defined elsewhere; the in-loop code above has to reopen
# every handle after calling it, so LOG is presumably closed at this point.  Reopen it
# for append so the final stats are not printed to a closed handle.  $timestamp is not
# set in this mainline -- presumably Checkpoint() sets it; confirm.
open(LOG,">>$Log_File") || die "Couldn't open Log File (./$Log_File) after checkpointing $patn.\n";

print LOG "\n\nFinal Stats: Finished at $timestamp.\n";
print LOG " $too_small_only were (maybe) removed from DFS only because they were too small,\n";
print LOG " $not_in_index_only were (maybe) removed from DFS only because they weren't in the index file,\n";
print LOG " $not_in_index_and_too_small were removed from DFS for both reasons,\n";
print LOG " $tif_errors_fixed whose tif errors were fixed,\n";
print LOG " $still_todo were still to be gotten from DVD,\n";
print LOG " $normal_db2_updates required normal DB2 updates,\n";
print LOG " $delete_from_db2 were put in the delete-from-DB2 list,\n";
print LOG " $insert_missing_db2_data were inserted into DB2,\n";
print LOG " $new_original_file original images were extracted from a consolidated image,\n";
print LOG " $new_X6_file X6 were images extracted from a consolidated image,\n";
print LOG " $new_Reexam_file Reexaminations were extracted from a consolidated image.\n";
close(LOG);

exit;   # Not needed, but definitely delimits our mainline.

# Everything below this point, are subroutines.
# We've found an image ($DFS_fn) we believe came from our DVD. Here we process that # image with the following 5 steps; # 1) Count the number of pages in the file. # 2) Extract tagdata & metadata if possible. # 2A: If we successfully extracted the page start information from the tagdata, then # verify the page start columns in DB2 (if of course, it's in DB2 at all). # 2B: If the metadata indicates this is a "Consolidated" image, i.e. it has # Certificate of Corrections and/or Reexaminations tacked on the end, then # split off all the pieces, adding the appropriate rows into DB2 for the pieces. # 2C: If this isn't a "Consolidated" image, then check for tif errors, rewriting # the image with tiffcp if need be. # 3) Verify the non-page-count columns. # 4) If not named properly (i.e. $patn.$cdlabel.tif), then rename the image. # 5) Create a link, i.e. $patn.tif -> $patn.$cdlabel.tif. sub Process_Image_File { # Step 1) Count how many pages are in this image. $pages=Count_Pages($DFS_fn); if (! $pages) { print SNH "Bizarre error counting pages of $DFS_fn.\n"; die "Bizarre error counting pages of $DFS_fn.\n"; # I don't ever expect this. } # Step 2) Try to extract tagdata & metadata so we can split off the pieces & verify the DB2 data. if (There_Is_Tagdata($DFS_fn)) { # If any tag data was found, then the There_Is_Tagdata subroutine has already set # up many variables for us. In DB2 column order, we have # abs, desc, drawing, claim, and biblio. # And if there was also metadata present, we also have $numCoCs and $numReexams, # with their associated Start/End numbers, # ($CoC_Start[$i], $CoC_End[$i], $Reexam_Start[$i] and $Reexam_End[$i]). # We will compare with the DB2 data, $DB_abs, $DB_desc, $DB_drawing, $DB_claim, $DB_biblio. # print "Page start data is abs=$abs, desc=$desc, drawing=$drawing, claim=$claim, biblio=$biblio.\n" if ($debug); # Step 2A) Verify page count information in DB2 (if it's in DB2 at all). 
# Our $db2_set_clause gets built up here and may also get appended to below # in Step 3). After Step 3), we'll use $db2_set_clause to update DB2. if (exists $DB_datasrc{$patn}) { # Is it in DB2? If so, we can update columns # if need be. Else we'll insert all of it below. if (($abs) && ($abs ne $DB_abs{$patn})) { $db2_set_clause .= ($db2_set_clause ? "," : "") . "abs=$abs"; } if (($desc) && ($desc ne $DB_desc{$patn})) { $db2_set_clause .= ($db2_set_clause ? "," : "") . "desc=$desc"; } if (($drawing) && ($drawing ne $DB_drawing{$patn})) { $db2_set_clause .= ($db2_set_clause ? "," : "") . "drawing=$drawing"; } if (($claim) && ($claim ne $DB_claim{$patn})) { $db2_set_clause .= ($db2_set_clause ? "," : "") . "claim=$claim"; } if (($biblio) && ($biblio ne $DB_biblio{$patn})) { $db2_set_clause .= ($db2_set_clause ? "," : "") . "biblio=$biblio"; } } # Step 2B) If there are any CoC's or Reexams, then split up the image. if ($numCoCs || $numReexams) { if ($DFS_fn =~ /\/$patn.$cdlabel.tif$/) { # Verify saneness. # We cannot allow a consolidated image to have the CD label in its name, because # we want to write the original to this name. This should never happen 'cause # we should have already split it up before now. print SNH "DFS Consolidated image file ($DFS_fn) has the CD label in its name.\n"; die "DFS Consolidated image file ($DFS_fn) has the CD label in its name."; } $pages=Split_Off_Pieces(); # We're ok. Break out the CoC's & Reexams, adding rows # to DB2 as we go. Returns number of pages in original. # Also archives the consolidated image, so $DFS_fn is no # longer correct. # Step 2C) There are no CoC or Reexams. Just check for tif errors. (The reason we } else { # don't have to check for tif errors above, is because the Split_Off_Pieces # routine already rewrites the image file, thus correcting any errors.) 
# If there are errors, the tiffcp program will return error messages (thus this "if" # statement will be true), or run quietly (thus false) if there were no errors. if (`$tiffcp $DFS_fn /dev/null 2>&1`) { $tif_errors_fixed++; $archive_fn="~rickjas/old_US_Images/tif_Error_Images/$patn.$cdlabel.tif"; `/usr/bin/mv $DFS_fn $archive_fn`; if ($?) {die "Couldn't archive $DFS_fn to $archive_fn."} `$tiffcp $archive_fn $DFS_fn 2>/dev/null`; if ($?) {die "Couldn't rewrite $archive_fn to $DFS_fn with tiffcp."} print "Rewrote $DFS_fn due to tif errors.\n"; print FIX "$DFS_fn\n"; } } } # Step 3) Verify the non-page-count columns, only if it's already in DB2. if (exists $DB_datasrc{$patn}) { # Note there IS no Search Report in a US patent, so srh should always be null. if ($DB_srh{$patn} ne "-") { $db2_set_clause .= ($db2_set_clause ? "," : "") . "srh=NULL"; } # Note there is also no Amendments for US patents, so that should always be null, too. if ($DB_amend{$patn} ne "-") { $db2_set_clause .= ($db2_set_clause ? "," : "") . "amend=NULL"; } if ($DB_pages{$patn} != $pages) { $db2_set_clause .= ($db2_set_clause ? "," : "") . "image_pages=$pages"; } # Check the cdlabel. if ($DB_cdlabel{$patn} ne $cdlabel) { $db2_set_clause .= ($db2_set_clause ? "," : "") . "cdlabel='$cdlabel'"; } # The datasrc should always be "USG". if ($DB_datasrc{$patn} ne "USG") { $db2_set_clause .= ($db2_set_clause ? "," : "") . "datasrc='USG'"; } # The namesrc should also be the cdlabel. if ($DB_namesrc{$patn} ne $cdlabel) { $db2_set_clause .= ($db2_set_clause ? "," : "") . "namesrc='$cdlabel'"; } # Here finally, we update DB2 if we have columns to update. 
if ($db2_set_clause ne "") { $normal_db2_updates++; # print "updating imag set $db2_set_clause where patn='$patn' and datasrc='$DB_datasrc{$patn}' and cdlabel='$DB_cdlabel{$patn} and namesrc='$DB_namesrc{$patn}'\n" if ($debug); Write_Into_DB2_File("update imag set $db2_set_clause where patn='$patn' and datasrc='$DB_datasrc{$patn}' and cdlabel='$DB_cdlabel{$patn}' and namesrc='$DB_namesrc{$patn}'"); } } else { # The data is not in DB2 at all. We'll add it. $column_names="patn,image_pages,cdlabel,datasrc,namesrc"; $values="'$patn',$pages,'$cdlabel','USG','$cdlabel'"; if ($abs) { $column_names .= ",abs"; $values .= ",$abs"; } if ($desc) { $column_names .= ",desc"; $values .= ",$desc"; } if ($drawing) { $column_names .= ",drawing"; $values .= ",$drawing"; } if ($claim) { $column_names .= ",claim"; $values .= ",$claim"; } if ($biblio) { $column_names .= ",biblio"; $values .= ",$biblio"; } print "inserting into imag ($column_names) values($values)\n"; $insert_missing_db2_data++; Write_Into_DB2_File("insert into imag ($column_names) values($values)"); print LOG "Had to insert $patn into DB2 ($column_names) values($values)\n"; } # Step 4) If not named properly ($patn.$cdlabel.tif), then rename the image. # Careful! If consolidated image or tif error, then $DFS_fn is gone. # print "At Step 4), checking $DFS_fn versus $Wanted_DFS_fn.\n" if ($debug); if ($DFS_fn ne $Wanted_DFS_fn && -s $DFS_fn) { # Note we're really renaming files here, not just writing into our DFS file. print "Renaming $DFS_fn to $Wanted_DFS_fn\n" if ($debug); `/usr/bin/mv $DFS_fn $Wanted_DFS_fn`; if ($?) {die "Couldn't rename $DFS_fn to $Wanted_DFS_fn."} $DFS_fn=$Wanted_DFS_fn; } # Step 5) Create a link, i.e. $patn.tif -> $patn.$cdlabel.tif. if (! -s "$Image_Dir/$patn.tif" && ! -l "$Image_Dir/$patn.tif") { # print "ln -s $patn.$cdlabel.tif $Image_Dir/$patn.tif\n"; `/usr/bin/ln -s $patn.$cdlabel.tif $Image_Dir/$patn.tif`; if ($?) 
{die "Couldn't ln -s $patn.$cdlabel.tif $Image_Dir/$patn.tif."} } else { if (-s "$Image_Dir/$patn.tif") {print LOG "Did not create link for $patn because $patn.tif is a file.\n"} } }

# Search this image file for a "Software" tif header tag (decimal 305) and, if it is
# there and in the right format, parse out the tag data.  If we're lucky, there will
# also be metadata at the end of the image file.
#
# Parameter: the name of the image file to examine.
#
# Returns 0 if the tag data is missing or not in the expected format, else 1
# (1 is also returned when the tag data was found but the metadata could not be
# located).
#
# Results are left in globals: $numPages, $biblio, $abs, $drawing, $desc, $claim,
# $metadata_Offset, $metadata, $numCoCs, $numReexams, and the per-stanza arrays
# @CoC_Start, @CoC_End, @Reexam_Kind, @Reexam_Start, @Reexam_End.
# Reads the globals $patn, $pages, %DVD_fn, $tiffinfo and $debug.
sub There_Is_Tagdata {
  my $fn=$_[0];

  $numCoCs=0;
  $numReexams=0;

  # Clear the page-range arrays so that stanzas left over from a previously parsed
  # image cannot leak into this one (Split_Off_Pieces iterates up to $#CoC_Start
  # and $#Reexam_Start).
  @CoC_Start=();
  @CoC_End=();
  @Reexam_Kind=();
  @Reexam_Start=();
  @Reexam_End=();

  # print "Checking $fn for tag data ...\n";
  $tagdata=`$tiffinfo $fn 2>/dev/null | /usr/bin/grep Software 2>/dev/null | /usr/bin/head -1 | /usr/bin/cut -f2 -d'\"'`;

  # Typically, the tag data is something like [6]0;3;2;6;6;META-535202;002
  # where [nn]        = Total number of pages
  #       n;          = Bibliographic data Page Start
  #       n;          = Abstracts data Page Start
  #       n;          = Drawing data Page Start
  #       n;          = Description data Page Start
  #       n;          = Claim data Page Start
  #       META-nnnnn; = Metadata offset
  # print "Preliminary tagdata =>$tagdata<\n" if ($debug);
  if ($tagdata !~ /\[(\d*)\](\d*);(\d*);(\d*);(\d*);(\d*);META-(\d*)\D/) {
    return 0;   # No tag data, so no metadata (probably - we didn't really check.)
  }
  $numPages=$1;
  $biblio=$2;
  $abs=$3;
  $drawing=$4;
  $desc=$5;
  $claim=$6;
  $metadata_Offset=$7;

  # The metadata starts at that metadata_Offset, which is at the end of the image file.
  $file_size = -s $fn;
  $tail_parm = $file_size - $metadata_Offset +1;

  # When there are no Certificates of Correction or Reexaminations, the metadata usually
  # looks like
  #   D0000002,US,S1,18430224,2,Y,NULL,1,2,0,0,2,2,2,2,0,0,0,0,CERT_OF_CORR=0,RE_EXAM=0
  # The comma-separated fields are, in order:
  #   8-char patn, Country Code, Kind, Issue Date (YYYYMMDD), Total Number of Pages,
  #   Missing Page Flag, Withdrawn Flag, then Start,End page pairs for
  #   Abstracts, Drawing, Description, Claim, Certificate of Corrections, Reexaminations.
  #
  # When there is Certificate of Corrections or Reexaminations data, the metadata is
  #   05224775,US,A1,19930706,28,NULL,NULL,1,1,2,10,11,18,18,20,21,21,22,28,
  #   CERT_OF_CORR=1,19941011,NULL,21,21,RE_EXAM=2,19940719,B1,NULL,22,25,20020423,C2,NULL,26,28
  # After the equal sign is the number of CoC's or Reexams, followed by metadata for each.
  # That per-stanza metadata is the Publication Date (YYYYMMDD), the kind code (for
  # Reexaminations only), a Missing Pages Flag, and the Start,End page numbers.
  if ($tail_parm < 300) {
    $metadata=`/usr/bin/tail -c $tail_parm $fn`;

    # Verify the metadata starts with the 8-character patn.  If not correct, try to find the
    # metadata on our own.  US0036163 on usp001 for example, had the metadata tag truncated
    # (159041, not 1590412), yet the metadata was there in the last 92 bytes.
    if (substr($metadata,0,8) ne $DVD_fn{$patn}) {
      $metadata=`/usr/bin/tail -c 300 $fn`;
      if ($metadata !~ /($DVD_fn{$patn}.*)$/) {
        print "2: $fn metadata could not be salvaged.\n" if ($debug);
        print LOG "2: $fn metadata could not be salvaged.\n";
        return 1;   # There was tag data, but no metadata.
      } else {
        $metadata=$1;
        print "3: $fn metadata salvaged >$metadata<\n" if ($debug);
        print LOG "3: $fn metadata salvaged >$metadata<\n";
      }
    }
  } else {
    # The recorded offset is not within the last 300 bytes of the file, so look for
    # the metadata in the last 300 bytes ourselves.
    $metadata=`/usr/bin/tail -c 300 $fn`;
    if ($metadata !~ /($DVD_fn{$patn}.*)$/) {
      print "4: $fn metadata could not be salvaged.\n" if ($debug);
      print LOG "4: $fn metadata could not be salvaged.\n";
      return 1;   # There was tag data, but no metadata.
    } else {
      $metadata=$1;
      print "5: $fn metadata salvaged >$metadata<\n" if ($debug);
      print LOG "5: $fn metadata salvaged >$metadata<\n";
    }
  }

  # The metadata looks good (well, at least it starts with our 8-character patn).
  # Parse out the CoC & Reexam data if present, and set in variables
  # $numCoCs & $numReexams, along with the page start info in arrays.
  if ($metadata =~ /,CERT_OF_CORR=(\d*),(.*),?RE_EXAM=(\d*),?(.*)/) {
    $numCoCs=$1;
    $CoC_data=$2;
    $numReexams=$3;
    $Reexam_data=$4;
    if ($numCoCs || $numReexams) {
      print "Found $numCoCs CoCs and $numReexams reexaminations in $fn.\n";
    }

    # print "Parsing CoC Data >$CoC_data<\n";
    $counter=0;
    while ($CoC_data =~ /\d*,.*?,(\d*),(\d*),?/g) {
      my ($start_page, $end_page) = ($1, $2);
      # Do a sanity check first.  Is there this many pages?  On US00229322 (usp006),
      # the CoC supposedly on page 12 was missing.  There was only 11 pages total.
      if ($start_page <= $pages) {
        $CoC_Start[$counter]=$start_page;
        # Clamp the end page that was just parsed (not whatever was previously
        # stored in this array slot) to the real page count.
        $CoC_End[$counter]=($end_page > $pages) ? $pages : $end_page;
        $counter++;
      }
    }
    if ($counter != $numCoCs) {
      print "Wrong number of CoC stanzas in the metadata for $patn. Was $counter, should have been $numCoCs.\n";
      print LOG "Wrong number of CoC stanzas in the metadata for $patn. Was $counter, should have been $numCoCs.\n";
    }

    # print "Parsing Reexam Data >$Reexam_data<\n";
    $counter=0;
    while ($Reexam_data =~ /\d*,(\w*),\w*,(\d*),(\d*),?/g) {
      my ($kind, $start_page, $end_page) = ($1, $2, $3);
      $np=$end_page - $start_page + 1;
      # Same sanity check as for the CoCs: only keep stanzas that start within the
      # image, so a rejected stanza is never left behind in the arrays.
      if ($start_page <= $pages) {
        $Reexam_Kind[$counter]=$kind;
        $Reexam_Start[$counter]=$start_page;
        $Reexam_End[$counter]=$end_page;
        $counter++;
      }
    }
    if ($counter != $numReexams) {
      print "Wrong number of Reexam stanzas in the metadata for $patn. Was $counter, should have been $numReexams.\n";
      print LOG "Wrong number of Reexam stanzas in the metadata for $patn. Was $counter, should have been $numReexams.\n";
    }
  }

  return 1;
}

# Count the number of pages in the passed filename.  Returns the number
# of pages in the image or zero if there's an error (i.e. when no line of
# anyinfo's output looks like "<n> page." or "<n> pages.").
sub Count_Pages {
  my $fn=$_[0];
  my @lines = `$anyinfo $fn`;
  foreach my $line (@lines) {
    if ( $line =~ /^([0-9]+) pages?\./ ) {
      return $1;
    }
  }
  return 0;
}

# Private helper for Split_Off_Pieces: write an any2any control file that selects
# pages $first_page .. $last_page of the consolidated image $DFS_fn.
# $label ("Original", "CoC" or "Reexam") only appears in the error messages.
# Dies if the file cannot be opened, written or closed.
sub _Write_CTL_File {
  my ($ctl_file, $label, $first_page, $last_page) = @_;
  open(my $ctl, '>', $ctl_file) or die "Error opening $label control file ($ctl_file).";
  foreach my $page ($first_page .. $last_page) {
    # "or", not "||": with "||" the die would bind to the (always true) string
    # and a failed print would go unnoticed.
    print {$ctl} "filename $DFS_fn $page\n"
      or die "Error writing to $label control file ($ctl_file).";
  }
  # Buffered write errors only show up at close time.
  close($ctl) or die "Error writing to $label control file ($ctl_file).";
}

# Split the consolidated image $DFS_fn up into separate images:
#
#    Original:         Page 1 to just before min(CoC_Start[0] and/or Reexam_Start[0])
#    CoC (if any):     CoC_Start[i] - CoC_End[i]
#    Reexams (if any): Reexam_Start[i] - Reexam_End[i]
#
# A row is written into the UpdateDB2 file for every CoC and Reexam image, the new
# file names are listed in NEWDFS, and the consolidated image is finally moved into
# the Consolidated_Images archive directory.
#
# Takes no parameters; works on the globals $patn, $cdlabel, $DFS_fn, $Image_Dir and
# the @CoC_* / @Reexam_* arrays.  Returns the number of pages any2any wrote into the
# original image (zero if any2any failed).
sub Split_Off_Pieces {
  if (@CoC_Start) {                              # If there are any CoC's, then the original
    $last_original_page= $CoC_Start[0] - 1;      # ends just before where the first CoC begins.
  } else {                                       # Else there are no CoC's, so there must be a
    $last_original_page=$Reexam_Start[0] - 1;    # Reexam and that's where the original ends.
  }

  $Temp_CTL_File="/tmp/$patn.original.ctl";
  _Write_CTL_File($Temp_CTL_File, "Original", 1, $last_original_page);

  # Figure out which DFS name to call it.
  my $Base_fn="$Image_Dir/$patn";        # EG, "/dfs/images/US/72/07/US01010772__"
  my $Base_Patn=substr($patn,0,10);      # EG, US01010772
  $Original_fn="$Base_fn.$cdlabel.tif";  # "Original" referring to the original patent's image,
  if (-r "$Original_fn") {               # not $DFS_fn.
    print SNH "DFS Original image file already exists??? This should never happen.\n";
    die "DFS Original image file already exists??? This should never happen.";
  }

  # Now call any2any to create the original image from the consolidated image.
  my $Original_Pages = Call_any2any($Temp_CTL_File,$Original_fn);

  # The Call_any2any routine returns the number of pages wrote (zero if there was an error).
  # Ensure any2any's page count matches how many I thought I should've written.
  if ($Original_Pages != $last_original_page) {
    # The control file is deliberately left in /tmp in this case.
    print STDERR "Page count mismatch for $Original_fn. Wrote $Original_Pages, not $last_original_page.\n";
    print LOG "Page count mismatch for $Original_fn. Wrote $Original_Pages, not $last_original_page.\n";
  } else {                               # Normal case.  All's ok.
    print "Wrote $Original_Pages pages into $Original_fn at ", scalar localtime, ".\n" if ($debug);
    unlink $Temp_CTL_File;
    print NEWDFS "$Original_fn # Original\n";
    $new_original_file++;
  }

  # If there are any Certificate of Corrections, carve them out of the consolidated image.
  # The filenames for the X6 CoCs will be US01234567X6 instead of US01234567__, and if
  # there are more than 1 CoC (so that just $cdlabel wouldn't be unique), we'll append
  # suffixes to the CD label to ensure uniqueness.  So the first CoC will be
  # US01234567X6.usp001_1.tif, the second, US01234567X6.usp001_2.tif, etc.
  my $This_X6_Patn = $Base_Patn . "X6";  # EG, US01010772X6
  foreach my $CoC_Number (0 .. $#CoC_Start) {
    my $expected_pages = $CoC_End[$CoC_Number] - $CoC_Start[$CoC_Number] + 1;

    $Temp_CTL_File="/tmp/$patn.CoC_$CoC_Number.ctl";
    _Write_CTL_File($Temp_CTL_File, "CoC", $CoC_Start[$CoC_Number], $CoC_End[$CoC_Number]);

    # Figure out which DFS name to call it.  If only 1 CoC (normal case), then use the
    # real CD label.  Otherwise, suffix all CD labels with _digit.
    if (@CoC_Start > 1) {
      $This_X6_CD_Label = "${cdlabel}_" . ($CoC_Number+1);   # EG, usp037_1, usp037_2, ...
    } else {
      $This_X6_CD_Label = $cdlabel;                          # EG, usp037
    }
    # EG, "/dfs/images/US/72/07/US01010772X6.usp037.tif"
    $This_X6_Fn = "$Image_Dir/$This_X6_Patn.$This_X6_CD_Label.tif";

    # Now call any2any to create the image for this CoC from the consolidated image.
    $pages_wrote = Call_any2any($Temp_CTL_File,$This_X6_Fn);

    if ($pages_wrote != $expected_pages) {
      print STDERR "Page count mismatch for CoC #" , $CoC_Number+1 , ". Wrote $pages_wrote, not " , $expected_pages , ".\n";
      print LOG "Page count mismatch for CoC #" , $CoC_Number+1 , ". Wrote $pages_wrote, not " , $expected_pages , ".\n";
    } else {                             # Normal case.  All's ok.
      print "Wrote $pages_wrote pages into $This_X6_Fn at ", scalar localtime, ".\n" if ($debug);
      unlink $Temp_CTL_File;
      print NEWDFS "$This_X6_Fn # CoC # " , $CoC_Number+1 , "\n";
    }

    # Write row into imag to identify this X6 image.
    # NOTE(review): as in the original code, the row is written even when the page
    # counts did not match -- confirm that is intended.
    Write_Into_DB2_File("insert into imag (patn,image_pages,cdlabel,datasrc,namesrc) values('$This_X6_Patn',$pages_wrote,'$This_X6_CD_Label','USG','$This_X6_CD_Label')");
    $new_X6_file++;
  }

  # If there are any Reexaminations, carve them out of the consolidated image.
  # The Reexam kinds are nicely provided to us in the metadata, and will be a "B" or a
  # "C" followed by a digit: "B" if $Reexam_Issue_Year < 2002, or "C" if > 2001.
  # The numeric portion increments for each Reexam.  For example, for 05224775 on
  # usp364, the first reexam was issued in 1994, so it was a B1,
  #         the second reexam was issued in 2002, so it was a C2.
  # Because of that ever-increasing digit, we don't have to play games with the CD label
  # to ensure uniqueness, unlike what we had to do for the CoC's.
  foreach my $Reexam_Number (0 .. $#Reexam_Start) {
    my $expected_pages = $Reexam_End[$Reexam_Number] - $Reexam_Start[$Reexam_Number] + 1;

    if ($Reexam_Kind[$Reexam_Number] !~ /^[BC]\d$/) {   # Sanity check.
      die "Invalid kind ($Reexam_Kind[$Reexam_Number]) in $DFS_fn metadata.";
      # If this ever happens, you could go back and recapture the $Reexam_Issue_Year and
      # build your own kind, ala
      # $kind = ($Reexam_Issue_Year[$Reexam_Number] < 2002 ? "B" : "C") . ($Reexam_Number+1);
    }

    $Temp_CTL_File="/tmp/$patn.Reexam_$Reexam_Number.ctl";
    _Write_CTL_File($Temp_CTL_File, "Reexam", $Reexam_Start[$Reexam_Number], $Reexam_End[$Reexam_Number]);

    my $This_Reexam_Patn="$Base_Patn$Reexam_Kind[$Reexam_Number]";  # EG, US05224775B1
                                                                    #  or US05224775C2
    # Build up the complete name for this reexam image,
    # EG /dfs/images/US/75/47/US05224775C2.usp364.tif
    $This_Reexam_Fn = "$Image_Dir/$This_Reexam_Patn.$cdlabel.tif";

    # Now call any2any to create the image for this Reexam from the consolidated image.
    $pages_wrote = Call_any2any($Temp_CTL_File,$This_Reexam_Fn);

    if ($pages_wrote != $expected_pages) {
      print STDERR "Page count mismatch for Reexam #" , $Reexam_Number+1 , ". Wrote $pages_wrote, not " , $expected_pages , ".\n";
      print LOG "Page count mismatch for Reexam #" , $Reexam_Number+1 , ". Wrote $pages_wrote, not " , $expected_pages , ".\n";
    } else {                             # Normal case.  All's ok.
      print "Wrote $pages_wrote pages into $This_Reexam_Fn at ", scalar localtime, ".\n" if ($debug);
      unlink $Temp_CTL_File;
      print NEWDFS "$This_Reexam_Fn # Reexam # " , $Reexam_Number+1 , "\n";
    }

    # Write row into imag to identify this Reexam image.
    Write_Into_DB2_File("insert into imag (patn,image_pages,cdlabel,datasrc,namesrc) values('$This_Reexam_Patn',$pages_wrote,'$cdlabel','USG','$cdlabel')");
    $new_Reexam_file++;
  }

  # Archive the consolidated images we find and break up, for reference.
  # I may want to look at them later.  It's better than erasing them.
  `/usr/bin/mv $DFS_fn ~rickjas/old_US_Images/Consolidated_Images/$patn.$cdlabel.tif`;
  if ($?) {die "Couldn't archive $DFS_fn to my Consolidated_Images directory."}

  return $Original_Pages;
}

# Given an any2any control file and an output file name, call any2any to convert/create
# the image, and return the number of pages any2any wrote (our caller will probably
# compare the number of pages wrote against what he expected).
#
# If there is an error with any2any, we spit out its messages to STDERR and return 0.
# The exit status of any2any is left in the global $rc.
#
# Called like so,   my $Original_Pages = Call_any2any($Temp_CTL_File,$Original_fn);
sub Call_any2any {
  my ($CTL_File, $Output_File)=@_;
  my $any2any_page_count = 0;

  # print "Calling $any2any $CTL_File $Output_File 2>&1 ...\n" if ($debug);
  my @lines = `$any2any $CTL_File $Output_File 2>&1`;
  $rc = $?;
  if ( $rc ) {                           # Did any2any fail?
    print STDERR "any2any failed for $Output_File at ", scalar localtime, "\n";
    foreach my $line (@lines) {          # Print out any messages from any2any.
      print STDERR "$line";
    }
  } else {                               # any2any worked ok.  Great.  How many pages did it write?
    foreach my $line (@lines) {
      # If only 1 page got written, any2any messages says "1 page", not "1 pages".
      if ( $line =~ /([0-9]+) pages? written OK./) {
        $any2any_page_count = $1;
        last;
      }
    }
  }
  return $any2any_page_count;
}

# If this patent is in DB2's imag table with namesrc=cdlabel, then delete it.
sub Remove_From_DB2_If_There {
  # %DB_datasrc only has an entry for patents already known to DB2; for those,
  # queue a delete keyed on the values DB2 currently holds for this patent.
  if (exists $DB_datasrc{$patn}) {
    $delete_from_db2++;
    Write_Into_DB2_File("delete from imag where patn='$patn' and datasrc='$DB_datasrc{$patn}' and cdlabel='$DB_cdlabel{$patn}' and namesrc='$DB_namesrc{$patn}'");
  }
}

# We come to a common subroutine to write into our UpdateDB2 file because we
# want to first write a comment, but only if we ever actually write to it.
# I.E. if we never write to it, I want it to wind up being an empty file.
#
# Parameter: the statement to append, without a trailing newline.
sub Write_Into_DB2_File {
  my $string=$_[0];
  if ($DB2_File_Not_Written_To_Yet) {
    print DB2 "-- Run this script by db2 -f $DB2_Update_File\n";
    $DB2_File_Not_Written_To_Yet=0;
  }
  print DB2 "$string\n";
}

# Checkpoint this run.  Normally, we're in the middle of the run,
# but we could be finished with it as well.
#
# Every output file is copied to <file>.chkpt.tmp and then renamed to <file>.chkpt,
# "$patn $timestamp" is appended to the checkpoint file, and the global counters
# are printed.  Copy/rename/append failures are reported on STDERR but do not stop
# the run (checkpointing stays best-effort).
#
# NOTE(review): the output filehandles are closed here and are not reopened in this
# subroutine; presumably the caller reopens them when the run continues -- confirm.
sub Checkpoint {
  ($sec, $min, $hour, $mday, $mon, $year) = localtime();
  # YYYY/MM/DD HH:MM:SS, zero padded.
  $timestamp = sprintf("%d/%02d/%02d %02d:%02d:%02d",
                       1900+$year, $mon+1, $mday, $hour, $min, $sec);

  # Close the output files so that everything buffered is on disk before copying.
  close DB2;
  close DFS;
  close NEWDFS;
  close TODO;
  close FIX;
  close SNH;
  close LOG;

  my @output_files = ($DB2_Update_File, $DFS_Update_File, $New_DFS_File_List,
                      $Still_To_Get_File, $Fix_File, $SNH_File, $Log_File);

  # First the slow part: copy every file to a temporary name.
  foreach my $file (@output_files) {
    system('/usr/bin/cp', '-p', $file, "$file.chkpt.tmp") == 0
      or warn "Checkpoint: couldn't copy $file to $file.chkpt.tmp.\n";
  }

  # Only now do we do a quick rename, thus making this checkpointing as atomic as we can.
  foreach my $file (@output_files) {
    system('/usr/bin/mv', "$file.chkpt.tmp", "$file.chkpt") == 0
      or warn "Checkpoint: couldn't rename $file.chkpt.tmp to $file.chkpt.\n";
  }

  # Record which patent this checkpoint was taken at.
  if (open(my $checkpoint_fh, '>>', $Checkpoint_File)) {
    print {$checkpoint_fh} "$patn $timestamp\n";
    close($checkpoint_fh)
      or warn "Checkpoint: couldn't finish writing $Checkpoint_File: $!\n";
  } else {
    warn "Checkpoint: couldn't append to $Checkpoint_File: $!\n";
  }

  # Summarize our run by spitting out our global counters.
  print "Checkpointing $cdlabel: $patn_number/$Size_count at patent $patn at $timestamp.\n";
  print " $too_small_only were (maybe) removed from DFS only because they were too small,\n";
  print " $not_in_index_only were (maybe) removed from DFS only because they weren't in the index file,\n";
  print " $not_in_index_and_too_small were removed from DFS for both reasons,\n";
  print " $tif_errors_fixed whose tif errors were fixed,\n";
  print " $still_todo were still to be gotten from DVD,\n";
  print " $normal_db2_updates required normal DB2 updates,\n";
  print " $delete_from_db2 were put in the delete-from-DB2 list,\n";
  print " $insert_missing_db2_data were inserted into DB2,\n";
  print " $new_original_file original images were extracted from a consolidated image,\n";
  print " $new_X6_file X6 images were extracted from a consolidated image,\n";
  print " $new_Reexam_file Reexaminations were extracted from a consolidated image.\n";
}