#!/dfs/prod/perl/bin/perl
#
# getimage: Generic routine to look for, retrieve if need be, possibly generate,
#           and return the filename for the image file of the requested patent.
#           Typically, this routine is called from (f)cgi-bin programs to return the
#           filename for a patent's image.  For example calls,
#             in the EDC, see /dfs/prod/ipn/cgi-bin/image
#             or in Japan, see /ips/prod/cgi-bin/viewpat.cmd
#
#   EG, $ImageFileName=`../bin/getimage EP00904950A1 type=tif`;
#       if (! -r $ImageFileName || -z _) {    # If failed to find a usable image or
#         error ...                           # it's zero-length, then no image found.
#
# This can also be called from the command line with this syntax,
#   getimage patn (type=tif|pdf|ps) (document=full|clip|abstract) (page=nn) (from TPS|USPTO)
#            (sub|nosub) (query) (outformat=NV) (debug)
#
#   PATN => The Patent number is typically a 2-character country code, 8-digit number,
#           and 2-character kind.  This is the only required parameter.  The middle
#           "number" part may have leading zeros stripped and can also be non-numeric
#           as is the case for non-utility US patents (USPP1234__).
#           You can also specify more than one patn, e.g. getimage patn1 patn2 patn3 type=pdf
#
#   type=Wanted_Image_Type => For example, pdf or tif most commonly, but can be anything
#           understood by any2any.  If omitted, any image type is ok and
#           we return whatever type we find (ie we do no conversions).
#
#   - document=full|clip|abstract => Optional.  Defaults to "full", which means "Please return
#           the full document".  Other possibilities include
#             clip=Please return the clip, for example,
#                  /dfs/images/clips/US/12/12/US6801212B1.drg001.tif
#             abstract=Please return the 1-page abstract, used for
#                  Japanese images, for example,
#                  /dfs/images/JP/2005/34/12/2005021234.tif
#
#   - page=pagenumber => Optional.  EG, page=1 to only return the first page.  Only valid for
#           full document requests.
#
#   - from TPS|USPTO    => Force getting from this source.  Will put the file in the current
#     or from=TPS|USPTO    directory.  This option is only valid from the command line.
#
#   - sub|nosub => Allow or disallow (typically, EP-to-WO) image substitution.
#           E.G. EP01344442A1 -> WO00251230A1 or EP01345714A1 -> WO00251231A1.
#             or CA00932350A1 -> FR02075483A5 or CA01331388A1 -> US05052710A1.
#           The default is to NOT allow these country-to-country substitutions.
#           (Edward uses "sub" in his PDF Express java code.)
#
#   - query => Ask TPS for details on what images it has available for the given patent(s)
#           and display the information on the screen.  This option WAS going to be only
#           available from the command line, but fcgi-bin/getcdlabel.fpl uses this option.
#
#   - outformat=NV => Return the image(s) data in name=value pairs (Sander uses this in
#           Patolis).  "format=NV" is accepted as a synonym.  For example, in query mode
#           each requested patent produces
#             RequestedPatent=<patn>
#             ImageCount=<n>
#           followed, for each of the n images, by
#             Handle=<cc> <number> <kind>
#             Volume=<volume>
#             Method=<method>
#             Pages=<pages>
#             Filer=<file name>
#             Default=yes|no
#
#   - debug => Produce debug output, either to /ips/prod/logs/images.log (or /ips/test/...)
#           or to the console if this is a command-line invocation.
#
# To test this from the command line,
#   getimage US05551212__ type=pdf
#   getimage EP00904950A1 type=tif
#   getimage EP00904950A1 page=1
#   getimage EP1344442A1 sub            (substitutes to WO002051230A1)
#   getimage EP1345714A1 sub            (substitutes to WO020051231A1)
#   getimage WO120000A1 type=tif
#   getimage WO09209959A1 type=tif debug
#   getimage DE10000001A1 DEK4000012U1 q
#   getimage JP24100644A2 document=abstract
#   getimage US06801212__ document=clip
#
# This program is shared between Patolis & EDC, so be aware & keep them in synch.
# The idea was to have one place where the give-me-an-image logic can be centralized
# and is called from
#   * EDC's and Patolis's viewpat.cmd & download.cmd
#   * EDC's cgi-bin/image program, which is the TPS & Patolis interface to EDC's images.
#     Ironically, we might very well have a double getimage call with this sequence,
#       Patolis User does a viewpat, which calls
#       Patolis's cgi-bin/viewpat.cmd, which calls
#       Patolis's bin/getimage, which calls
#       EDC's cgi-bin/image,which calls
#       EDC's bin/getimage, which gets the image and finally returns the image.
#   * EDC's fcgi-bin/getcdlabel.fpl program, which is what Santokh uses for the
#     Net Commerce Fast Buy
#   * EDC's ImageConverter.java, which is Edward's PDF Express code.

ProcessInput();                           # Parse arguments and initialize variables.

# Major Case 1/4: If user is forcing us to get|query image from a specific source,
#                 do so without looking around the local file system for it.
if ($from) {
    if ($from eq "TPS") {
        if (!$TPS_Server) {die "We don't have a usable EDC server from $myhostname.\n"}
        Get_TPS_Info(@in_patns);
        if ($mode eq "QUERY") {
            if ($outformat eq "NV") {     # Satisfy Sander here, Rick. I dunno what he wants exactly.
                foreach $this_patn (@in_patns) {
                    print "RequestedPatent=$this_patn\n";
                    print "ImageCount=$TPS_count{$this_patn}\n";
                    for (my $i=1; $i<=$TPS_count{$this_patn}; $i++) {
                        $thiskey="$this_patn#$i";
                        print "Handle=$Image_cc{$thiskey} $Image_nn{$thiskey} $Image_kind{$thiskey}\n";
                        print "Volume=$Image_volume{$thiskey}\n";
                        print "Method=$Image_method{$thiskey}\n";
                        print "Pages=$Image_pages{$thiskey}\n";
                        print "Filer=$Image_filer{$thiskey}\n";
                        print "Default=" , ($TPS_default{$this_patn} eq $thiskey ? "yes" : "no") , "\n";
                    }
                }
            } else {
                # For example, if getimage WO00132477 q EP1344442A1 USD0012345__
                #
                #  TPS Has 3 Images for WO00132477
                #  Image "Handle"        Volume      Method Pages EDC Local File Name
                # ===================== =========== ====== ===== ====================================
                # WO 2001032477 R8A2    miwo2001049 local      3 /widas/WO/77/24/WO2001032477R8A2.pdf
                # WO 2001032477 R5A2    miwo2001023 local      2 /widas/WO/77/24/WO2001032477R5A2.pdf
                # WO 2001032477 A1      miwo2001019 local     33 /widas/WO/77/24/WO2001032477A1.pdf <-- Default
                #
                #  TPS Has 1 Image for EP1344442A1
                #  Image "Handle"        Volume      Method Pages EDC Local File Name
                # ===================== =========== ====== ===== ====================================
                # WO 2002051230 A1      miwo2002026 local     18 /widas/WO/30/12/WO2002051230A1.pdf <-- Default
                #
                #  TPS Has 2 Images for USD0012345__
                #  Image "Handle"        Volume      Method Pages EDC Local File Name
                # ===================== =========== ====== ===== ====================================
                # US D12345 S1          usp7        local      2 /widas/US/45/23/USD12345S1.pdf <-- Default
                # US D012345 S1                     remote
                #
                # Except that we don't show Patolis the EDC Local File Name.
                foreach $this_patn (@in_patns) {
                    if (! $TPS_count{$this_patn}) {
                        print "The Thompson Patent Store does not have an image for $this_patn.\n" if ($cmdline);
                    } else {
                        print " TPS Has $TPS_count{$this_patn} Image" , $TPS_count{$this_patn}==1 ? "" : "s" , " for $this_patn\n";
                        print " Image \"Handle\" Volume Method Pages";
                        if (! $Patolis) {print " EDC Local File Name"}
                        print "\n";
                        print "===================== =========== ====== =====";
                        if (! $Patolis) {print " ===================================="}
                        print "\n";
                        for (my $i=1; $i<=$TPS_count{$this_patn}; $i++) {
                            $thiskey="$this_patn#$i";
                            logit("g01: this_patn=$this_patn TPS_default=$TPS_default{$this_patn} and thiskey=$thiskey\n");
                            printf("$Image_cc{$thiskey} %11s %-6s %11s %6s %5s %s %s\n",
                                   $Image_nn{$thiskey},
                                   $Image_kind{$thiskey},
                                   lc($Image_volume{$thiskey}),
                                   $Image_method{$thiskey},
                                   $Image_pages{$thiskey},
                                   ($Patolis ? "" : "$Image_filer{$thiskey}") ,
                                   ($TPS_default{$this_patn} eq $thiskey ? "<-- Default" : ""));
                        }
                        # Alert caller when we have a country-to-country substitution which we don't allow.
                        if (($Image_cc{$TPS_default{$this_patn}} ne $cc{$this_patn}) && ! $Automatically_Substitute && $cmdline) {
                            print "\n===> Be aware that we don't allow this country-to-country substitution. <===\n\n";
                        }
                    }
                }
            }
            exit;
        }
        # Not query mode.  We must be getting, and specifically from TPS.  We're likely running
        # in San Jose, maybe doing a viewpat and we need to get the image from EDC.
        Get_TPS_Images();
        # Hopefully at this point, I have an image in $Image_filer{$TPS_default{$this_patn}} for each patn.
    } elsif ($from eq "USPTO") {
        if ($mode eq "QUERY") {die "Query mode from the US PTO not supported.\n"}
        foreach $this_patn (@in_patns) {
            if ($cc{$this_patn} eq "US") {
                logit("g02: Getting $this_patn image from US PTO.\n");
                $test_fn=Get_USPTO_Image($this_patn);
                if (-r $test_fn) {                              # Did we get a usable image?
                    $Gotten_From{$this_patn}="US PTO";          # Yes, we did.  Set this image as the default.
                    $TPS_default{$this_patn}="$this_patn#1";
                    $Image_filer{"$this_patn#1"}=$test_fn;
                } else {
                    print "The US PTO doesn't have an image for $this_patn.\n" if ($cmdline);
                }
            } else {
                print "Can't get a $cc{$this_patn} patent from the US PTO.\n" if ($cmdline);
            }
        }
    } else {
        print "Invalid 'from' value ($from).\n" if ($cmdline);
    }

} elsif ($document eq "CLIP") {
    # Major Case 2/4: Request for a patent's clip image
    foreach $this_patn (@in_patns) {
        my $clip_dir="/dfs/images/clips/$cc{$this_patn}/$Dir1{$this_patn}/$Dir2{$this_patn}";

        if (! $Patolis && $cc{$this_patn} eq "JP" && (! $Wanted_Image_Type || $Wanted_Image_Type eq "tif")) {
            # $page is ignored, even if set
            $thiskey="$this_patn#1";      # Was never set in this branch before, so the image
                                          # got filed under a stale (or empty) key.
            if (Adopt_Local_Image($this_patn,$thiskey,"$clip_dir/JP$num{$this_patn}A1.tif","g03","g04")) {
                $Gotten_From{$this_patn}="local file system";
            }
        } # end JP clip

        if (! $Patolis && $cc{$this_patn} eq "US" && ! $page && (! $Wanted_Image_Type || $Wanted_Image_Type eq "tif")) {
            $thiskey="$this_patn#1";      # I'll need this if I do find an image.
            # Try the A1 clip first, then B1, then B2.  The first usable one wins.
            foreach my $attempt (["A1","g03","g04"], ["B1","g05","g06"], ["B2","g07","g08"]) {
                my ($clip_kind,$look_tag,$set_tag)=@$attempt;
                if (Adopt_Local_Image($this_patn,$thiskey,"$clip_dir/US$num{$this_patn}$clip_kind.tif",$look_tag,$set_tag)) {
                    $Gotten_From{$this_patn}="local file system";
                    last;
                }
            }
        }
    }

} elsif ($document eq "ABSTRACT") {
    # Major Case 3/4: Request for the 1-page Japanese abstract image.
    foreach $this_patn (@in_patns) {
        if (! $Patolis && $cc{$this_patn} eq "JP" && (!$page || ($page eq "1")) && (! $Wanted_Image_Type || $Wanted_Image_Type eq "pdf")) {
            $thiskey="$this_patn#1";      # I'll need this if I do find an image.
            if (Adopt_Local_Image($this_patn,$thiskey,"$PermImageStoreDir{$this_patn}/$default_perm_fn_prefix{$this_patn}.pdf","g09","g0a")) {
                $Gotten_From{$this_patn}="local file system";
            }
        }
    }

} else {
    # Major Case 4/4: This is not a forced "from" request, nor is it a
    #                 request for the 1-page Japanese abstract.
    #                 It's the normal case of a regular (probably tif|pdf) file.
    # For each patent requested, see if we can determine the filename without asking TPS,
    # just to save time.  If we find an image, set $TPS_default and Image_filer, which will
    # short-circuit the Get_TPS_Info & Get_TPS_Images subroutine for this patent.
    #
    foreach $this_patn (@in_patns) {
        $thiskey="$this_patn#1";          # I'll need this if I do find an image.
        my $cache_prefix="$TempImageCacheDir{$this_patn}/$this_patn";
        my $perm_prefix ="$PermImageStoreDir{$this_patn}/$default_perm_fn_prefix{$this_patn}";

        if ($page) {                      # If a specific page number was asked for,
                                          # look for just that page's image.
            if ($Wanted_Image_Type) {     # Was a specific Image Type asked for?
                Adopt_Local_Image($this_patn,$thiskey,"$cache_prefix.page.$page.$Wanted_Image_Type","g10","g11");
            } else {                      # More normal case where no image type specified.
                Adopt_Local_Image($this_patn,$thiskey,"$cache_prefix.page.$page.tif","g12","g13")
                  || Adopt_Local_Image($this_patn,$thiskey,"$cache_prefix.page.$page.pdf","g14","g15");
            }
        }

        # If full document wanted (the normal case) or a specific page was wanted and we
        # didn't find it above, then look for the full document.  Every probe below only
        # runs while we still have no usable image for this patent.
        #
        # Was a specific Image Type asked for?  If so, look for that specific type first.
        if (! $TPS_default{$this_patn} && $Wanted_Image_Type && $PermImageStoreDir{$this_patn}) {
            Adopt_Local_Image($this_patn,$thiskey,"$perm_prefix.$Wanted_Image_Type","g16","g17");
        }
        if (! $TPS_default{$this_patn} && $Wanted_Image_Type) {
            Adopt_Local_Image($this_patn,$thiskey,"$cache_prefix.$Wanted_Image_Type","g18","g19");
        }

        # If no specific type was specified or we didn't find the type asked for,
        # look for anything, preferring pdf to tif.
        #
        if (! $TPS_default{$this_patn} && $Wanted_Image_Type ne "pdf" && $PermImageStoreDir{$this_patn}) {
            Adopt_Local_Image($this_patn,$thiskey,"$perm_prefix.pdf","g1a","g1b");   # Don't duplicate above work.
        }
        if (! $TPS_default{$this_patn} && $Wanted_Image_Type ne "pdf") {
            Adopt_Local_Image($this_patn,$thiskey,"$cache_prefix.pdf","g1c","g1d");
        }
        if (! $TPS_default{$this_patn} && $Wanted_Image_Type ne "tif" && $PermImageStoreDir{$this_patn}) {
            Adopt_Local_Image($this_patn,$thiskey,"$perm_prefix.tif","g1e","g1f");   # Don't duplicate above work.
        }
        if (! $TPS_default{$this_patn}) {
            Adopt_Local_Image($this_patn,$thiskey,"$cache_prefix.tif","g1g","g1h");
        }

        if ($TPS_default{$this_patn}) {                         # Did we get a usable image?
            $Gotten_From{$this_patn}="local file system";       # Yes, we did.
        } elsif (($cc{$this_patn} eq "US") &&
                 (($kind{$this_patn} eq "") || ($kind{$this_patn} eq "__")) &&
                 ($TPS_kind{$this_patn} eq "B1")) {
            # Requested kind was empty or __ and USB1 failed, so try USB2
            my $b2_fn="$perm_prefix.pdf";
            $b2_fn =~ s/B1\./B2\./g;      # Try USB2 instead of USB1
            if (Adopt_Local_Image($this_patn,$thiskey,$b2_fn,"g1g","g1h")) {
                $Gotten_From{$this_patn}="local file system";   # Record the source here too.
            }
            # Unset TPS kind for USB1.  This allows B1 or B2 match from TPS
            $TPS_kind{$this_patn} = "";
        }
    } # End of the foreach loop, trying to find the image file on our own, before asking TPS.

    logit("g20: Calling Get_TPS_Info\n");
    # For those patents that we couldn't find an image for in our local file system, ask TPS.
    Get_TPS_Info(@in_patns);

    # It's possible we know more now, eg EP-to-WO substitutions (EP01344442A1->WO2002051230A1)
    # or a funky TPS_kind (WO00132477A2 -> WO132477R8A2), so recheck our local file systems
    # before calling TPS to fetch the image.
    foreach $this_patn (@in_patns) {
        $thiskey=$TPS_default{$this_patn};
        # There are cases when we're asking for type=tif and/or page=1 of a funky TPS kind
        # or an allowed EP-to-WO substituted image,
        #   e.g. getimage WO00132477A2 type=tif          (TPS_kind=R8A2)
        #     or getimage WO00132477A2 page=1            (TPS_kind=R8A2)
        #     or getimage EP01344442A1 type=tif page=1 sub
        #     or getimage EP01344442A1 type=tif page=1 sub
        #     or getimage EP01344442A1 page=1 sub
        # that we should look around better for an already-generated file.
        # In these cases, we've already failed the initial file scan because we
        # were looking for WO00132477A2 instead of WO00132477R8A2 or EP01344442A1-whatever
        # instead of the substituted WO2002051230A1.  And when we called Get_TPS_Info,
        # it set $TPS_default to the original image we can see right now, but we still may
        # not want to use it.  There may be an already-generated tif or page.1 image in our
        # temporary cache we can use, e.g. /dfs/dlcache/42/44/WO2002051230A1.tif
        #                               or /dfs/dlcache/42/44/WO2002051230A1.page.1.tif
        # A bit of a parley perhaps, but we should detect these cases and find the cached image.
        #
        ############################################################################################
        #                                                                                          #
        # This logic to detect these cases need work here.  Until it gets done and done right,     #
        # we'll be regenerating funky TPS kind page.1 & tif files unnecessarily.                   #
        #                                                                                          #
        # In the interest of time, I'm delaying this logic 'till later.                            #
        #                                                                                          #
        ############################################################################################
        #
        # How do we detect these cases?
        # if ($Wanted_Image_Type && (this image isn't of this type... - parse up $Image_filer{$thiskey}?? )
        # if ($page && (cc=$cc{$this_patn} ne Image_cc{$thiskey})) {   # If page and substituted patent ...
        # if ($page && $test_fn !~ /\.page\.$page\./) {                # If fn has ".page." in it ...
        #
        if (! -r $Image_filer{$thiskey}) {    # Are we still missing a usable image for this patent?
                                              # Yep, search local file system again.
            logit("g21: Starting second scan for $thiskey and page=>$page<\n");
            my $cache_prefix="$TempImageCacheDir{$this_patn}/$this_patn";
            my $perm_prefix ="$PermImageStoreDir{$this_patn}/$default_perm_fn_prefix{$this_patn}";

            if ($page) {                      # If a specific page number was asked for,
                                              # look for just that page's image.
                if ($Wanted_Image_Type) {     # Was a specific Image Type asked for?
                    # Final image put at /dfs/dlcache/42/44/WO2002051230A1.page.1.tif
                    Adopt_Rescanned_Image($thiskey,"$cache_prefix.page.$page.$Wanted_Image_Type","g22","g23");
                } else {                      # More normal case where no image type specified.
                    Adopt_Rescanned_Image($thiskey,"$cache_prefix.page.$page.tif","g24","g25");
                }
            }

            # If full document wanted (the normal case) or a specific page was wanted and we
            # didn't find it above, then look for the full document.
            #
            # These guards used to test %TPS_filer, a hash that nothing here fills in, so they
            # were always true and a later (less preferred) hit overwrote an earlier one.
            # They now test %Image_filer, the hash the probes actually write.
            #
            # Was a specific Image Type asked for?  If so, look for that specific type first.
            if (! -r $Image_filer{$thiskey} && $Wanted_Image_Type && $PermImageStoreDir{$this_patn}) {
                Adopt_Rescanned_Image($thiskey,"$perm_prefix.$Wanted_Image_Type","g26","g27");
            }
            if (! -r $Image_filer{$thiskey} && $Wanted_Image_Type) {
                Adopt_Rescanned_Image($thiskey,"$cache_prefix.$Wanted_Image_Type","g28","g29");
            }

            # If no specific type was specified or we didn't find the type asked for,
            # look for anything, preferring pdf to tif.
            #
            if (! -r $Image_filer{$thiskey} && $Wanted_Image_Type ne "pdf" && $PermImageStoreDir{$this_patn}) {
                Adopt_Rescanned_Image($thiskey,"$perm_prefix.pdf","g2a","g2b");      # Don't duplicate above work.
            }
            if (! -r $Image_filer{$thiskey} && $Wanted_Image_Type ne "pdf") {
                Adopt_Rescanned_Image($thiskey,"$cache_prefix.pdf","g2c","g2c");
            }
            if (! -r $Image_filer{$thiskey} && $Wanted_Image_Type ne "tif" && $PermImageStoreDir{$this_patn}) {
                Adopt_Rescanned_Image($thiskey,"$perm_prefix.tif","g2d","g2e");      # Don't duplicate above work.
            }
            if (! -r $Image_filer{$thiskey}) {
                Adopt_Rescanned_Image($thiskey,"$cache_prefix.tif","g2f","g2g");
            }

            if ($TPS_default{$this_patn}) {                     # Did we get a usable image?
                $Gotten_From{$this_patn}="local file system";   # Yes, we did.
            }
        } # End of are we still missing a usable image for this patent?
    } # End of the foreach loop, trying to find the image file a second time on our own,
      # letting TPS get it for us.

    # Let TPS get the images for all patents that we couldn't find an image for.
    Get_TPS_Images();

    foreach $this_patn (@in_patns) {
        # Still missing any US images?  If so, we have one more trick up our sleeve.
        if (! $TPS_default{$this_patn} && $cc{$this_patn} eq "US") {
            $test_fn=Get_USPTO_Image($this_patn);
            logit("g2h: USPTO returned >$test_fn< for $this_patn.\n");
            if (-r $test_fn) {                                  # Did we get a usable image?
                $Gotten_From{$this_patn}="US PTO";              # Yes, we did.  Set this image as the default.
                $TPS_default{$this_patn}="$this_patn#1";
                $Image_filer{"$this_patn#1"}=$test_fn;
            } else {
                print "The US PTO doesn't have an image for $this_patn.\n" if ($cmdline);
            }
        }
    }
}

# We're done looking for usable image(s) and/or going off and getting one(them).
# Now for each image we have, check its type.  If wrong, convert to $Wanted_Image_Type.
#
foreach $this_patn (@in_patns) {
    $test_fn=$Image_filer{$TPS_default{$this_patn}};
    if (-r $test_fn) {                    # Did we get a usable image?
        logit("g30: Got usable image file at $test_fn from $Gotten_From{$this_patn} when Wanted_Image_Type=$Wanted_Image_Type\n");
        # Split the file name into its pieces, eg /widas/WO/77/24/WO2001032477A1.pdf gives
        #   $fnDirectories = /widas/WO/77/24
        #   $fnPrefix      = WO2001032477A1
        #   $fnType        = pdf
        # The captures are taken by list assignment so that a failed match leaves the three
        # variables empty rather than holding values from an earlier match.  The directory
        # part is optional so that a bare file name (no slash) still yields its type.
        ($fnDirectories,$fnPrefix,$fnType) = ($test_fn =~ /^(?:(.*)\/)?(.*)\.([a-z]{1,4})/i);

        # If we wanted a specific type and what we have now is not that type,
        # or we want just a single page and this image is not the single page image,
        # then we've got more work to do.  We'll put our converted image into
        # our image cache.
        if (($Wanted_Image_Type && $Wanted_Image_Type ne $fnType) ||
            ($page && $test_fn !~ /\.page\.$page\./)) {
            # One "rule" that makes our lives easier is, if you're ever going to write
            # into our temporary image cache directory, use the requested patent in
            # the file name.  This makes finding it next time a whole lot easier.
            # This works fine except in the EP-to-WO substitution case.  Rick, maybe address later.
            # Wrong action when you ask for EP01344442A1 sub type=tif, which delivers WO2002051230A1.pdf,
            # which you convert to /dfs/dlcache/EP01344442A1.tif and from then on, you always
            # find that /dfs/dlcache/EP01344442A1.tif file.  You never go back to the original
            # pdf even if it's locally available and you ask for WO2002051230A1 type=pdf.
            # Maybe I should always query TPS first.  If only it doesn't have a local image,
            # then scan around.
            $permfilePrefix="$TempImageCacheDir{$this_patn}/$this_patn";    # eg /dfs/dlcache/00/00/USD460000__
            logit("g31: We've got work to do on $test_fn\nWanted_Image_Type=$Wanted_Image_Type page=$page and permfilePrefix=$permfilePrefix\nTempImageCacheDir{$this_patn}=$TempImageCacheDir{$this_patn}\n");

            # If only one specific page wanted, we'll put that single page image in
            # $TempImageCacheDir{$this_patn}/$this_patn.page.$page.$Wanted_Image_Type"
            if ($page && $test_fn !~ /\.page\.$page\./) {
                logit("g32: must extract page $page from $test_fn.\n");
                $permfilePrefix.=".page.$page";
                if (! $Wanted_Image_Type) {           # If no type specified, then we'll default
                    $Wanted_Image_Type=$fnType;       # to the same output type as input file
                    logit("g33: Defaulting Wanted_Image_Type to $Wanted_Image_Type.\n");
                }
            }
            $tempfilePrefix="$permfilePrefix.$myhostname.$$";
            logit("g34: fnDirectories=$fnDirectories,fnPrefix=$fnPrefix,\nfnType=$fnType, and permfilePrefix=$permfilePrefix\n");
            logit("g35: Converting to $permfilePrefix $Wanted_Image_Type (page=$page) ... \n");
            if ($fnType eq "pdf") {       # If input file = pdf, got to convert to ps first.
                $Image_filer{$TPS_default{$this_patn}}=Convert_pdf($test_fn,$tempfilePrefix,$permfilePrefix,$Wanted_Image_Type,$page);
            } else {
                $Image_filer{$TPS_default{$this_patn}}=Call_any2any($test_fn,$tempfilePrefix,$permfilePrefix,$Wanted_Image_Type,$page);
            }
            $Gotten_From{$this_patn}.=" and converted";
        } # Else found image is already the type I want and I want the whole image, so
          # there's no need to convert anything.
    } # Else we didn't get an image for this patent.  Oh, well.  Can't convert what we don't have.
}

# Then you can return what you got, if anything.
foreach $this_patn (@in_patns) {
    if (-r $Image_filer{$TPS_default{$this_patn}}) {
        print "$Image_filer{$TPS_default{$this_patn}}";   # Return the image's filename (if any) to our caller.
        print " (from $Gotten_From{$this_patn})" if ($cmdline);
        print "\n" if ($cmdline || $number_of_patns>1);
    } else {
        print "No usable image found for $this_patn.\n" if ($cmdline);
    }
}
exit;

# First-pass probe for one candidate image file.
#   $patn     = requested patent; used as the key into %TPS_default.
#   $key      = image key (eg "$patn#1") to file the image under in %Image_filer.
#   $fn       = candidate file name to test with FileExists().
#   $look_tag = log tag for the "looking for" message.
#   $set_tag  = log tag for the "setting" message.
# When the file is usable, it becomes the default image for $patn.  Returns 1 when the
# file was adopted and 0 otherwise.  The global $test_fn is left holding $fn, just as the
# original in-line code left it.
sub Adopt_Local_Image {
    my ($patn,$key,$fn,$look_tag,$set_tag)=@_;
    $test_fn=$fn;
    logit("${look_tag}: looking for $fn\n");
    return 0 if (! FileExists($fn));      # Is a usable image there?
    $TPS_default{$patn}=$key;             # Yes!  Set this image as the default.
    $Image_filer{$key}=$fn;
    logit("${set_tag}: setting TPS_default{$patn}=$TPS_default{$patn} & Image_filer{$key}=$Image_filer{$key}=\n");
    return 1;
}

# Second-pass probe, used after Get_TPS_Info has been called.  Same as Adopt_Local_Image
# except that it only records the file name in %Image_filer and leaves %TPS_default alone.
#   $key      = image key to file the image under in %Image_filer.
#   $fn       = candidate file name to test with FileExists().
#   $look_tag = log tag for the "looking for" message.
#   $set_tag  = log tag for the "setting" message.
# Returns 1 when the file was adopted and 0 otherwise.
sub Adopt_Rescanned_Image {
    my ($key,$fn,$look_tag,$set_tag)=@_;
    $test_fn=$fn;
    logit("${look_tag}: looking for $fn\n");
    return 0 if (! FileExists($fn));      # Is a usable image there?
    $Image_filer{$key}=$fn;               # Yes!  Use this fn.
    logit("${set_tag}: setting Image_filer{$key}=$Image_filer{$key}=\n");
    return 1;
}

# Process the input and initialize global variables for the rest of the program.
#
# We set the following Input Variables:
#   @in_patns = Array of uppercased patent numbers that were requested.
#   $number_of_patns = Number of entries in the in_patns array.
#   $debug = Binary switch to generate debug lines in log file (or console if
#            called from the command line).  Default is 0 (no debug output).
#   $Wanted_Image_Type = pdf or tif or null, which means our caller doesn't care
#            what type is returned.  They'll take any image type.
#   $page = Page number for single-page requests (eg, to generate or find a
#            one-page thumbnail).  Default is null, meaning return all pages.
#   $from = Forcibly get an image from the specified server (command line only).
#   $Automatically_Substitute = Allow TPS's EP-to-WO substitutions.
#   $Patolis = Binary switch to designate Patolis site
#   $myhostname = EG, ips01i or dweb3 or penguin
#   $cmdline = Binary switch.  Normally false, unless invoked from a command line.
#   $logfile = File name of log file for debugging or tracing.
#   $EDC_Server = I.P. address of EDC's cgi-bin/image interface
#   $TPS_id
#   $TPS_Server
#   $pdftopsCommand
#   $any2anyDir
#   $Erase_Intermediate_PS_File
#   $Send_Any2any_Error_Mail
# Each requested patent also gets $Gotten_From{patn}="unknown" as a starting value and is
# handed to parse_patn().  Dies if the pdftops program or the any2any directory is missing.
sub ProcessInput {
    use LWP::UserAgent;                   # Useful for making remote web requests.
    $myhostname=`hostname -s`;            # Used for conditional code between EDC & Patolis,
    chomp $myhostname;                    # and to differentiate temporary filenames in a
                                          # multi-server & shared file system environment.
    $page="";                             # All pages, please.
    $from="";
    $TPS_Server="www.thomsonpatentstore.net";
    $Wanted_Image_Type="";
    $Automatically_Substitute=0;          # Disallow TPS country-to-country image substitutions
    $Erase_Intermediate_PS_File=1;        # Clean up intermediate files
    $Send_Any2any_Error_Mail=0;           # No e-mail to Rebecca & Rick for any2any errors.
    $mode="GET";                          # Default to get (not query) and
    $document="FULL";                     # the full document (not clip or abstract)
    $outformat="";                        # in normal English output (not Sander's NV)
    $DE{"A"}=10; $DE{"B"}=11; $DE{"C"}=12; $DE{"K"}=20;     # Delphion encoding of German
    $DE{"L"}=21; $DE{"M"}=22; $DE{"U"}=50; $DE{"X"}=60;     # patent numbers

    # Detect command line testing versus true web-server call (where REQUEST_METHOD will
    # be set) or a call from the Tomcat environment (where CATALINA_HOME will be set).
    if ((! $ENV{REQUEST_METHOD}) && (! $ENV{CATALINA_HOME})) {
        $cmdline=1;
    } else {                              # Most likely a true cgi-bin call from a web server
        $cmdline=0;                       # or from the Tomcat world.
    }

    if ($myhostname=~/^ips/) {
        $Patolis=1;                       # For Patolis-specific differences
        $TPS_id="patolis";
        $pdf_creator="Patolis";
        # When called from the web server via a viewpat or download.cmd call, we use
        # the SCRIPT_FILENAME environment variable, which contains our fully-qualified
        # name, to differentiate between the production versus test environments.
        #
        # If this is a command-line call during testing (so the SCRIPT_FILENAME environment
        # variable probably isn't set), then we use $0 (how we called ourselves) to
        # select either environment, prod or test.  EG, /ips/prod/bin/getimage ...
        #
        if ($ENV{'SCRIPT_FILENAME'} =~ /\/prod\// || $0 =~ /\/prod\//) {
            $EDC_Server="84.18.161.12";   # EDC's Download Servers (www5.delphion.com)
            $pdftopsCommand="/ips/prod/bin/pdftops";
            $logfile="/ips/prod/logs/getimage.log";
        } else {
            $EDC_Server="84.18.161.14";   # EDC's Test Server (www7.delphion.com)
            $pdftopsCommand="/ips/test/bin/pdftops";
            $logfile="/ips/test/logs/getimage.log";
        }
        # Don't worry that /ips/prod is hardcoded here even if we're in the test environment.
        # This is only used to convert images and even in the test environment,
        # /ips/test/bin/aps2ras has /ips/prod hardcoded.
        $any2anyDir="/ips/prod/bin";
    } else {                              # We must be running in EDC or maybe, San Jose
        $Patolis=0;
        $TPS_id="delphion";
        $pdf_creator="Thomson Delphion - http://www.delphion.com";
        if ($myhostname=~/^d[a-z]*\d*$/) {    # If we're really in EDC and not just San Jose,
            $EDC=1;                           # remember this fact.
            $EDC_Server="";                   # EDC doesn't go to EDC for images
        } else {                              # Else we're in San Jose
            $EDC=0;                           # remember this fact.
            $EDC_Server="84.18.161.14";       # EDC's Test Server (www7.delphion.com)
        }
        if ($ENV{'SCRIPT_FILENAME'} =~ /\/prod\// || $0 =~ /\/prod\//) {
            $pdftopsCommand="/dfs/prod/ipn/bin/pdftops";
        } else {
            $pdftopsCommand="/dfs/stage/ipn/bin/pdftops";
            if (! -x $pdftopsCommand) {$pdftopsCommand="/dfs/prod/ipn/bin/pdftops"}
        }
        $logfile="/ips/ipn/logs/getimage.log";    # Same place for both prod & stage
        $any2anyDir="/dfs/prod/ipn/bin";
    }
    if (! -x $pdftopsCommand) {die "Can't find $pdftopsCommand program."}
    if (! -d $any2anyDir) {die "Can't see $any2anyDir directory."}

    ##################################################################
    # Now process the input arguments.                               #
    ##################################################################
    $number_of_patns=0;
    while (@ARGV) {
        $thisARG=uc shift @ARGV;
        logit("g40: Checking out thisARG=>$thisARG<\n");  # This msg may not show up in the log if we
                                                          # haven't seen the debug option yet.
        if ($thisARG =~ /^type=(\w\w\w)$/i) {     # This chokes on some otherwise valid types
            $Wanted_Image_Type=lc $1;             # that any2any understands, but tough.
                                                  # Who uses p12 or ras8 or scdim2 anyway?
        } elsif ($thisARG =~ /^document=(full|clip|abstract)$/i) {
            $document=$1;
        } elsif ($thisARG =~ /^d(ebug)?$/i) {
            $debug=1;
            ($sec,$min,$hour)=(localtime)[0,1,2];
            logit(sprintf "g ============================= %02u:%02u:%02u =============================\n",$hour,$min,$sec);
        } elsif ($thisARG =~ /^page=(\d*)$/i) {
            $page=$1;
        } elsif ($thisARG =~ /^sub$/i) {
            $Automatically_Substitute=1;
        } elsif ($thisARG =~ /^nosub$/i) {
            $Automatically_Substitute=0;
        } elsif ($thisARG =~ /^q(uery)?$/i) {
            $mode="QUERY";
            $from="TPS";                  # A query, is presumed to be from TPS.
        } elsif ($thisARG =~ /^from$/i && $cmdline) {
            $from=uc shift @ARGV;         # Uppercase it like every other argument, so that
                                          # "from tps" works the same as "from=tps".
        } elsif ($thisARG =~ /^from=(tps|uspto)$/i && $cmdline) {
            $from=$1;
        } elsif ($thisARG =~ /^(?:out)?format=(nv)$/i) {    # The header documents both spellings.
            $outformat=$1;
        } elsif ($thisARG =~ /^cd=/i) {
            # Quietly ignore old CD label parameter.
        } else {
            if (exists $out_fn{$thisARG}) {
                logit("g41: $thisARG was specified twice. Multiples are ignored.\n");
            } else {
                $in_patns[$number_of_patns]=$thisARG;     # Remember new patent number.
                $out_fn{$thisARG}="";
                $Gotten_From{$thisARG}="unknown";         # Until we learn where its image came from.
                logit("g42: Calling parse_patn for $thisARG\n");
                parse_patn($thisARG);
                $number_of_patns++;
            }
        }
    }
} # End of the ProcessInput subroutine

# Parse the passed patent number into the pieces needed for the rest of the program.
#   $my_patn = The original, untouched requested patent.  Used to key the rest of these hashes.
#   $cc{$my_patn} = The uppercased, 2-character country that was requested.
#   $num{$my_patn} = The "numeric" part of the requested patent used in filenames, in
#            TPS-preferred format.  This may not be all numeric and the format will
#            differ for different countries.
#            For WO patents, this will be in the preferred 4-digit years and
#            6-digit numbers format.  e.g. num=1983004466 for WO08304466A1
#                                          num=2000000001 for WO00000001A2
#                                          num=2002051230 for WO00251230A1
#                                       or num=2002051231 for WO02051231A1
#            For US Applications & Japanese patents, the preferred format is 4-digit
#            years and 7-digit numbers, e.g. num=20010024032 for US21024032A1.
#            We also undo Delphion's mangling if needed, for some DE & JP patents.
#            For all others (eg US granted or EP), remove leading zeros, e.g. 1234 or RE1234.
#   $kind{$my_patn} = The uppercased requested kind.  Normally 2-characters, but might be longer,
#            eg, A9W1A1.  Might also be null for kind-less requests.
#   $TPS_kind{$my_patn} = The uppercased, (normally 2-character) kind that we'll use to request TPS
#            image data, but could be null if this is a kind-less request.  Usually
#            this is $kind{$my_patn}, but we'll suppress Delphion's standard of __ kinds
#            for US patents and overcome a bug in Joachim's code where he doesn't
#            handle null kind requests for US non-utility patents.
#   $default_perm_fn_prefix{$my_patn} = Our guess of TPS's image filename in the Permanent image
#            store, eg "$cc{$my_patn}$num{$my_patn}$TPS_kind{$my_patn}".
#            If TPS_kind is null, I'll never find the image without asking TPS.
#   $TPS_key{$my_patn} = What TPS will use as its key, ie "$cc{$my_patn} $num{$my_patn} $TPS_kind{$my_patn}"
#   $Requested_patn{$TPS_key} = Reverse mapping, TPS_key-to-The patent that was asked for.
#   $Dir1{$my_patn} = Last pair of digits in patent number.
#   $Dir2{$my_patn} = Next-to-last pair of digits in patent number.
#   $PermImageStoreDir{$my_patn} = Permanent, Read-Only Image Store.  EG, /dfs/images in EDC or SJ
#   $TempImageCacheDir{$my_patn} = Temporary, Read-Write Image Cache.  EG, /dfs/dlcache in EDC or SJ,
#            /ips/images/cache in Japan.
#   $is_US_app{$my_patn} = Binary switch.  Is this a US Application?
#   $is_US_special{$my_patn} = Binary switch.  Is this a non-utility US patent?
#   $US_docid1{$my_patn} = Needed/used only if getting from US PTO.
# $US_docid2{$my_patn} = Needed/used only if getting from US PTO.
#
# parse_patn($patn)
#
# Splits one requested patent number into country, number and kind, converts
# the number into the form used to query TPS, and records the results in the
# global per-patent hashes, all keyed by the patent number exactly as passed in:
#   %cc, %num, %kind, %TPS_kind, %TPS_key, %Requested_patn (keyed by TPS key),
#   %Dir1, %Dir2, %PermImageStoreDir, %TempImageCacheDir,
#   %default_perm_fn_prefix, %is_US_app, %is_US_special, %US_docid1, %US_docid2.
#
# The return value is not meaningful.  Dies on a WO number that fits none of
# the known numbering ranges.  On Patolis, exits the program silently for any
# country other than US, WO or EP.
#
# A number that does not match its country's pattern leaves num/kind undefined
# (instead of picking up stale $1/$2 from an earlier, unrelated match).
#
sub parse_patn {
   my $my_patn=shift;

   $cc{$my_patn}=substr($my_patn,0,2);
   logit("g50: parse_patn is looking at $my_patn and cc=$cc{$my_patn}\n");
   $is_US_app{$my_patn}=0;
   $is_US_special{$my_patn}=0;

   if ($cc{$my_patn} eq "US") {
      # Here we allow leading US Prefixes and only allow exactly 2-character kinds.
      #   eg, US0D001234__ -> num=D001234 kind=__
      #       US00001234__ -> num=1234    kind=__
      #       us1400H__    -> num=1400H   kind=__
      # A match in list context returns the captures, or nothing at all on failure.
      ($num{$my_patn},$kind{$my_patn}) = ($my_patn=~/^US0*(.*?)(\D.)?$/);
      logit("g51: patn=$my_patn, num=$num{$my_patn} and kind=$kind{$my_patn}\n");

      if ($num{$my_patn}=~/^(\D+)(\d+([DHLN]?))/) {
         # Handle US Non-Utility Patents, which may be in the NPO standard of
         # US0D277060__ or US0RE29774A1 or US0BRE28576__.  Those take special handling.
         $is_US_special{$my_patn}=1;
         $USPrefix=$1;                             # eg D, PP, RE, H, BRE, etc.
         $nn=$2+0;                                 # The true numeric portion w/o leading 0's.
         $USSuffix=$3;                             # Fractional Patent Suffix (extremely rare)
         $num{$my_patn}="$USPrefix$nn$USSuffix";   # The preferred TPS form (eg D1234 or 1234 or 1400H)
         $US_docid1{$my_patn}="$USPrefix$nn";
         $US_docid2{$my_patn}="US0$USPrefix" . substr("000000$nn",(-8+length($USPrefix)));

         # Determine TPS kind.  Normally, the TPS server correctly handles null or __ kinds,
         # but for US non-utility patents, you need to specify a good kind.  This is a bug
         # in Joachim's code that he hasn't fixed yet.
         if    ($USPrefix eq "D" ) {$TPS_kind{$my_patn}="S1"}
         elsif ($USPrefix eq "PP") {$TPS_kind{$my_patn}="P1"}
         elsif ($USPrefix eq "RD") {$TPS_kind{$my_patn}="E1"}
         elsif ($USPrefix eq "RE") {$TPS_kind{$my_patn}="E1"}
         elsif ($USPrefix eq "RX") {$TPS_kind{$my_patn}="I2"}
         else                      {$TPS_kind{$my_patn}=""}   # Unknown US non-utility type.  Let TPS figure it out.
         logit("g52: Non-Utility US patent ($my_patn) prefix=$USPrefix suffix=$USSuffix\n");
      } else {                                     # Else it's US Granted Utility Patent or US Application
         if ($num{$my_patn} > 20000000 && $num{$my_patn} < 29999999) {   # US App?
            $is_US_app{$my_patn}=1;
            # Convert US Applications from the Delphion standard of US21024032A1
            # to the TPS standard of US20010024032A1
            $num{$my_patn}=sprintf("200%1u%07u", substr($num{$my_patn},1,1), substr($num{$my_patn},2));
            $US_docid1{$my_patn}=$num{$my_patn};
            $US_docid2{$my_patn}="us$num{$my_patn}ki";
            if ($kind{$my_patn}) {
               $TPS_kind{$my_patn}=$kind{$my_patn};   # Use what was given, eg, A1 or P1 for US Applications
            }
            logit("g53: US Application ($my_patn)\n");
         } else {                                  # Else it's a US Granted Utility Patent
            $US_docid1{$my_patn}=$num{$my_patn};
            $US_docid2{$my_patn}="US" . substr("0000000$num{$my_patn}",-9);   # Pad to 9 digits.
            # If the normal case of Delphion's double-underscore kind or no kind specified,
            # then guess the kind from the number, else use what was specified.
            if ($kind{$my_patn} ne "__" && $kind{$my_patn}) {
               $TPS_kind{$my_patn}=$kind{$my_patn};
            } elsif ($num{$my_patn} < 6167569) {
               $TPS_kind{$my_patn}="A1";
            } else {
               $TPS_kind{$my_patn}="B1";           # B2 possible too, but handled as exception later
            }
            logit("g54: US Utility ($my_patn) num=$num{$my_patn} kind=$kind{$my_patn} " .
                  "TPS_kind=$TPS_kind{$my_patn} US_docid1=$US_docid1{$my_patn} and US_docid2=$US_docid2{$my_patn}\n");
         }
      }
      logit("g55: num=$num{$my_patn} kind=$kind{$my_patn} TPS_kind=$TPS_kind{$my_patn} US_docid1=$US_docid1{$my_patn} and US_docid2=$US_docid2{$my_patn}\n");

   } elsif ($cc{$my_patn} eq "WO") {               # Else it's not a US image request.  Is it WO?
      # eg, WO521230A1 -> num=521230 kind=A1   or   WO1A1 -> num=1 kind=A1
      # Leading zeros (if any) are stripped by the pattern.
      ($num{$my_patn},$kind{$my_patn}) = ($my_patn=~/^WO0*(\D*\d+)(\D.{0,5})?$/i);
      $TPS_kind{$my_patn}=$kind{$my_patn};
      logit("g56: my_patn=$my_patn cc=$cc{$my_patn} num=$num{$my_patn} kind=$kind{$my_patn} TPS_kind=$TPS_kind{$my_patn}\n");

      # Handle all WO numeric possibilities.  In numerical order, we have
      my $wo=$num{$my_patn};                       # Snapshot; each branch below rewrites $num{...} at most once.
      if ($wo > 0 && $wo < 80000) {                            # Zero-stripped 2000 YYddddd
         $num{$my_patn}="2000" . substr("00000$wo",-6);        #   becomes 20000ddddd
      } elsif ($wo > 100000 && $wo < 199999) {                 # 2001 YYddddd
         $num{$my_patn}="20010" . substr($wo,-5);              #   becomes 20010ddddd
      } elsif ($wo > 200000 && $wo < 251231) {                 # Jan-June, 2002 YYddddd
         $num{$my_patn}="20020" . substr($wo,-5);              #   becomes 20020ddddd
      } elsif ($wo > 251230 && $wo < 299999) {                 # June-Dec, 2002 YYddddd
         $num{$my_patn}="20020" . substr($wo,-5);              #   becomes 20020ddddd
      } elsif ($wo > 1000000 && $wo < 2051231) {               # 2001-June, 2002 YYddddd
         $num{$my_patn}="200" . substr($wo,0,1) . substr($wo,-6);   # becomes 200Ydddddd
      } elsif ($wo > 2051230 && $wo < 7800000) {               # 2002-2007 YYddddd
         $num{$my_patn}="200" . substr($wo,0,1) . substr($wo,-6);   # becomes 200Ydddddd
      } elsif ($wo > 7800000 && $wo < 9999999) {               # 1978-1999 YYddddd
         $num{$my_patn}="19" . substr($wo,0,2) . "0" . substr($wo,-5);   # becomes 19YY0ddddd
      } elsif ($wo > 78000000 && $wo < 99999999) {             # 1978-1999 YYdddddd
         $num{$my_patn}="19" . substr($wo,0,2) . substr($wo,2);     # becomes 19YYdddddd
      } elsif ($wo > 190000000 && $wo < 200251231) {           # Almost in expanded form YYYYddddd
         $num{$my_patn}=substr($wo,0,4) . "0" . substr($wo,-5);     # becomes YYYY0ddddd
      } elsif ($wo > 200251230 && $wo < 210000000) {           # Almost in expanded form YYYYddddd
         $num{$my_patn}=substr($wo,0,4) . "0" . substr($wo,-5);     # becomes YYYY0ddddd
      } elsif ($wo > 1960000000 && $wo < 2002051231) {         # Already in expanded form YYYYdddddd
         # num is fine as is, YYYYdddddd
      } elsif ($wo > 1900000000 && $wo < 2100000000) {         # Already in expanded form YYYYdddddd
         # num is fine as is, YYYYdddddd
      } else {
         die "Invalid WO patent number ($num{$my_patn}).\n";
      }

   } elsif ($cc{$my_patn} eq "EP") {               # Not US or WO.  Is it EP?
      # eg, EP0012345A1 -> num=12345 kind=A1
      ($num{$my_patn},$kind{$my_patn}) = ($my_patn=~/^EP0*(\D*\d+)(\D.{0,5})?$/i);
      $TPS_kind{$my_patn}=$kind{$my_patn};

   } elsif ($cc{$my_patn} eq "DE") {               # Not US or WO or EP.  German Patent maybe?
      # eg, DEK4000008U1 -> num=K4000008 kind=U1
      ($num{$my_patn},$kind{$my_patn}) = ($my_patn=~/^DE0*(\D*\d+)(\D.{0,5})?$/i);
      $TPS_kind{$my_patn}=$kind{$my_patn};
      my $first_char=substr($num{$my_patn},0,1);
      if ($first_char =~ /[A-Z]/) {                # If so, check for Delphion's mangling.
         # We'll undo Delphion's mangling of the patent number, changing the letter back into
         # two digits (see the %DE hash), as well as adding the 200 back to the year.
         # For example, since $DE{'K'}=20, we'll change num from K4000008 to 202004000008.
         substr($num{$my_patn},0,1)="$DE{$first_char}200";
      }

   } elsif ($cc{$my_patn} eq "JP") {               # Not US or WO or EP or DE.  Is it Japanese?
      # eg, JP23092902A2 -> num=23092902 kind=A2
      ($num{$my_patn},$kind{$my_patn}) = ($my_patn=~/^JP0*(\D*\d+)(\D.{0,5})?$/i);
      # Delphion removes the middle two zeros from some Japanese patents, so we
      # got to put them back in.  For example, JP23092902A2 -> JP2003092902A
      if (($num{$my_patn} =~ /2\d{7}/) && ($kind{$my_patn} =~ /[ATU]2/)) {
         $num{$my_patn}="200" . substr($num{$my_patn},1);
      }                                            # Must be a pre-2000 JP Patent.
      # NOTE(review): the TPS kind is forced to A1 for every JP request, whatever kind
      # was asked for (eg JP23092902A2 is requested as JP_2003092902_A1) - confirm intended.
      $TPS_kind{$my_patn}="A1";
      logit("g57: Japanese patent num=$num{$my_patn} and TPS_kind=$TPS_kind{$my_patn}\n");

   } elsif ($cc{$my_patn} eq "IT") {               # It's not US or WO or EP or DE or JP.  Is it IT?
      # I don't quite know how to handle Italian patent numbers, so leave this for later, Rick.
      # (Nothing is set here: num, kind and TPS_kind stay undefined for IT requests.)
      # Italian patent numbers with the city code, need to have the embedded 0's removed,
      # and Delphion's kind normalized, e.g. ITMI932388A1 -> IT_MI932388_A
      # and Delphion's kind normalized, e.g. ITMN990024A1 -> IT_MN99024_A1
      # FYI, normally, IT-CityCode patents in raid.espace have 6 digits.  There are no IT rows in raid.pdf.
      # The handful of exceptions are   IT T0961019 A      IT T0961019A    I
      #                                 IT MI9602175 A     IT MI9602175A   I
      #                                 IT BO98000329 A    ITBO98000329A   I
      #                                 IT MN99024 A1      IT MN99024A1    I
      #                                 IT MI9602033 A1    IT MI9602033A1  I
      #                                 IT MI20000136 A1   ITMI20000136A1  I
      #                                 IT MI20000852 A1   ITMI20000852A1  I
      # The question is, do any of these exist in our main table and if so, how exactly?
      # if ($my_patn=~/^IT([A-Z]{2})0*(\d+)(\D.{0-1}?)$/i) ...   # if ITMI96002033A1 or ???
      #    $num{$my_patn}=$2;                                    # then 521230 or 12345 or 1 or MI1234
      #    $kind{$my_patn}=$3;                                   #          A1 or    A1 or A1 or     A1

   } else {                                        # Not US or WO or EP or DE or JP or IT.  Who knows what it is?
      if ($my_patn=~/^\D\D0*(\d+)(\D.{0,5})?$/i) { # eg, FR2075483A5
         $num{$my_patn}=$1;                        #     2075483
         $kind{$my_patn}=$2;                       #     A5
         $TPS_kind{$my_patn}=$2;                   #     A5
         $TPS_kind{$my_patn} =~ s/_*$//g;          # Strip any trailing underscores, ie GBnnA_
      }
   }  # End of the big, ol' if the country ($cc) is this, that, or the other thing.

   # Predict what key TPS will use/return in its patent:requested_patent patentnumber="..." field.
   # When we parse TPS's XML reply, we'll need to be able to map this back to what our input was.
   $TPS_key{$my_patn}="$cc{$my_patn} $num{$my_patn}" . ($TPS_kind{$my_patn} ? " $TPS_kind{$my_patn}" : "");
   $Requested_patn{$TPS_key{$my_patn}}=$my_patn;
   logit("g58: For $my_patn, num=$num{$my_patn}, kind=$kind{$my_patn} TPS_kind=$TPS_kind{$my_patn}, TPS_key=$TPS_key{$my_patn} and Requested_patn=$Requested_patn{$TPS_key{$my_patn}}\n");

   # Dir1 is the last pair of digits found in num, Dir2 the pair before it.
   # Pad dirs with zero if num < 1000
   if ($num{$my_patn} =~ /.*(\d\d)(\d\d)/) {
      $Dir1{$my_patn}=$2;
      $Dir2{$my_patn}=$1;
   } elsif ($num{$my_patn} =~ /.*(\d)(\d\d)/) {
      $Dir1{$my_patn}=$2;
      $Dir2{$my_patn}="0$1";
   } elsif ($num{$my_patn} =~ /.*(\d\d)/) {
      $Dir1{$my_patn}=$1;
      $Dir2{$my_patn}="00";
   } elsif ($num{$my_patn} =~ /.*(\d)/) {
      $Dir1{$my_patn}="0$1";
      $Dir2{$my_patn}="00";
   } else {
      logit("g58a: Problem finding directory. Using 00/00 for $my_patn, num=$num{$my_patn}, kind=$kind{$my_patn} TPS_kind=$TPS_kind{$my_patn}, TPS_key=$TPS_key{$my_patn} and Requested_patn=$Requested_patn{$TPS_key{$my_patn}}\n");
      $Dir1{$my_patn}="00";
      $Dir2{$my_patn}="00";
   }

   if ($Patolis) {
      # Here is where we limit Patolis to just US, WO, or EP images.
      if ($cc{$my_patn} ne "US" && $cc{$my_patn} ne "WO" && $cc{$my_patn} ne "EP") {exit}
      $PermImageStoreDir{$my_patn}="";             # Patolis has no permanent image store.
      $TempImageCacheDir{$my_patn}="/ips/images/cache/$Dir1{$my_patn}/$Dir2{$my_patn}";   # Temporary Read-Write Image Cache
   } else {                                        # San Jose & EDC permanent image store.
      $PermImageStoreDir{$my_patn}="/dfs/images/$cc{$my_patn}/$Dir1{$my_patn}/$Dir2{$my_patn}";
      $TempImageCacheDir{$my_patn}="/dfs/dlcache/$Dir1{$my_patn}/$Dir2{$my_patn}";        # Temporary Read-Write Image Cache
   }

   # Predict what filename TPS stores this image as in its /widas tree.
   # These may change later if we allow EP-to-WO substitutions or if TPS uses one of its funky kinds.
   $default_perm_fn_prefix{$my_patn}="$cc{$my_patn}$num{$my_patn}$TPS_kind{$my_patn}";
   logit("g59: PermImageStoreDir=$PermImageStoreDir{$my_patn} TempImageCacheDir=$TempImageCacheDir{$my_patn} and default_perm_fn_prefix=$default_perm_fn_prefix{$my_patn}\n");

   if ($from && $mode eq "GET") {
      # For forced image retrieval from a specific server, we will write the image
      # we're about to go get, into our Temporary Image Cache unless we already have
      # an image there, then target our home directory.
      if (-f "$TempImageCacheDir{$my_patn}/$my_patn.tif" || -f "$TempImageCacheDir{$my_patn}/$my_patn.pdf") {
         # glob() must be called in LIST context here.  In scalar context it is an
         # iterator that returns undef once its list is exhausted, ie on every second
         # call from this same spot when several patents are requested.
         my ($home_dir)=glob("~");
         if (defined $home_dir && $home_dir ne "") {
            $TempImageCacheDir{$my_patn}=$home_dir;
         }
      }
      logit("g5a: Will put image from $from into $TempImageCacheDir{$my_patn}\n");
   }
}  # End of the parse_patn routine

##################################################################################
#                                                                                #
# Query TPS for all the data for the passed patent number(s).                    #
#                                                                                #
# It establishes the following set of arrays for each patent:                    #
#    TPS_count                                                                   #
#    TPS_default (index to image TPS considers "best")                           #
#                                                                                #
# And the following set of arrays for each image for each patent:                #
#    Image_cc                                                                    #
#    Image_nn                                                                    #
#    Image_kind                                                                  #
#    Image_patentnumber                                                          #
#    Image_patentnumber_real                                                     #
#    Image_filer                                                                 #
#    $Image_method (always "local" or "remote" as far as I've ever seen)         #
#    $Image_size   (might be undefined)                                          #
#    $Image_pages  (might be undefined)                                          #
#    $Image_volume (might be undefined)                                          #
#    $Image_type   (always "Original" as far as I've ever seen)                  #
#                                                                                #
##################################################################################
#
# Get_TPS_Info(@patns)
#
# Arguments are patent numbers already processed by parse_patn.  Patents whose
# current default image file is readable are not queried again.  Returns nothing
# useful; returns early if there is nothing to ask or if the HTTP request fails
# (the failure is only logged).  Dies on an unexpected type="..." value.
# Per-image hashes are keyed by "<requested patent>#<image count>".
#
sub Get_TPS_Info {
   my $TPS_patn_list="";

   foreach my $this_patn (@_) {
      if (! -r $Image_filer{$TPS_default{$this_patn}}) {
         logit("g60: \$this_patn=$this_patn cc=$cc{$this_patn} num=$num{$this_patn} TPS_kind=>$TPS_kind{$this_patn}<\n");
         $TPS_patn_list.="&patentnumber=$cc{$this_patn}_$num{$this_patn}" . ($TPS_kind{$this_patn} ? "_$TPS_kind{$this_patn}" : "");
      }
   }
   logit("g61: Final \$TPS_patn_list=$TPS_patn_list\n");
   if (! $TPS_patn_list) {return}                  # No images to get?  That's odd.

   # supercp "http://www.thomsonpatentstore.net/charon/charon?charon:customer=patolis\
   #          &charon:service=patent_information_pdf&patentnumber=EP_1441797_A2" %stdout
   #
   $query_url="charon/charon?charon:customer=$TPS_id&charon:service=patent_information_pdf$TPS_patn_list";
   $query_url=~s/\+/%2B/g;                         # Sometimes kind=C+ as for SE00101465C+

   $ua = LWP::UserAgent->new(timeout=>10);         # Give TPS 10 seconds to respond.  Note this isn't a
                                                   # total time, timeout.  IE if TPS starts to respond
                                                   # but takes 10 minutes to deliver everything, that's
                                                   # ok - we won't time out.  This timeout happens if we
                                                   # ever go this long without receiving any bytes.
   logit("g62: Query URL = \"http://$TPS_Server/$query_url\"\n");
   my $response = $ua->get("http://$TPS_Server/$query_url");   # Query TPS.
   if (! $response->is_success) {                  # Check the outcome of the response.
      # For example, if the server is down, the error message will be
      # Query TPS Error 500 Can't connect to www.thomsonpatentstore.net:80 (Interrupted system call)
      logit("g63: Query TPS Error " . $response->status_line . "\n");
      return;
   }

   # The HTTP Headers returned by TPS aren't interesting.  The content though, is.
   # But to document, here's what we get back for
   # supercp "http://www.thomsonpatentstore.net/charon/charon?charon:customer=delphion\
   #          &charon:service=patent_information_pdf&patentnumber=EP_1441797&patentnumber=WO_132477" %stdout
   #
   #   HTTP Header: Date: Mon, 14 Mar 2005 22:43:46 GMT
   #   HTTP Header: Server: Apache/2.0.48 (Unix) mod_ssl/2.0.48 OpenSSL/0.9.6l mod_jk/1.2.5
   #   HTTP Header: Cache-Control: no-cache, no-store, max-age=0
   #   HTTP Header: Connection: close
   #   HTTP Header: Content-Type: text/xml; charset=UTF-8
   #
   # (The XML body of the sample reply is not reproduced here.)
   #
   $content= $response->content();

   # ------------------------ Parse image data. ------------------------
   #
   # Loop through the output (our input), one "stanza" at a time, where a stanza
   # is the text between a < and the next >.
   # NOTE(review): the loop header below was reconstructed from the way the body
   # uses $1 (the tests all anchor on the tag name without its leading "<");
   # confirm it against the production copy of this file.
   while ($content =~ /<(.*?)>/g) {
      $thisStanzaData=$1;
      logit("g64: Got =>$thisStanzaData<=\n");
      if ($thisStanzaData =~ /^\?xml version=/)           {next}   # Ignore beginning junk stanza
      if ($thisStanzaData =~ /^patent:information_pdf /)  {next}   # Another junk stanza

      # The normal case has '... available="true">' at the end of the patent:requested_patent line,
      # but for the invalid or missing patents, the patent:requested line ends with available="false" />.
      # This was explained to me as being perfectly valid XML syntax.  Oh, ok.
      if ($thisStanzaData =~ /^patent:requested_patent patentnumber="(.*)" available="false"/) {
         # Grab TPS's key and translate it back to the original input'd patent number.
         $this_patent=$Requested_patn{$1};
         $TPS_count{$this_patent}=0;
         logit("g65: There are no images for $this_patent and TPS's key=>$1<.\n");
         next;
      }
      if ($thisStanzaData =~ /^patent:requested_patent patentnumber="(.*)" available="true"/) {
         # Grab TPS's key and translate it back to the original input'd patent number.
         $this_patent=$Requested_patn{$1};
         $TPS_count{$this_patent}=0;
         logit("g66: Am now working on $this_patent and TPS's key=>$1<\n");
         next;
      }

      # Everything below only applies to a patent:patent stanza that carries a
      # parsable patentnumber_real field.
      if ($thisStanzaData !~ /^patent:patent (patentnumber=".*")/) {next}
      my $this_data=$1;

      # First of all, pick out the real patent number to see if we're going to disallow
      # this EP-to-WO substitution.
      if ($this_data !~ / patentnumber_real="(..) ([^ ]+)(?: ?(.+?))"/) {next}
      my ($real_cc,$real_nn,$real_kind)=($1,$2,$3);

      # For EP-to-WO substitutions, patentnumber_real here is the WO substitute.
      if (! (($cc{$this_patent} eq $real_cc) || $Automatically_Substitute || $mode eq "QUERY")) {
         logit("g6f: Ignoring disallowed substitution =>$this_data<\n");
         next;
      }

      # We're cool.  Either this isn't a EP-to-WO substitution or we're allowing
      # substitutions, or we're just querying, either way, accept this image.
      #
      # Save each piece of the patent:patent stanza for later use.
      #    patent:patent
      #       patentnumber="WO 2001032477 R8A2"
      #       patentnumber_real="WO 2001032477 R8A2"
      #       method="local"
      #       size="118591"
      #       pages="3"
      #       default="default"    <-- Perhaps
      #       volume="MIWO2001049"
      #       filer="/widas/WO/77/24/WO2001032477R8A2.pdf"
      #       type="Original"
      $TPS_count{$this_patent}++;
      my $thiskey="$this_patent#$TPS_count{$this_patent}";
      logit("g67: Am now working on Image # $TPS_count{$this_patent} for $this_patent thiskey=>$thiskey<\n");
      logit("g68: this_data=>$this_data<\n");

      $Image_cc{$thiskey}=$real_cc;
      $Image_nn{$thiskey}=$real_nn;
      $Image_kind{$thiskey}=$real_kind;
      $Image_patentnumber_real{$thiskey}="$Image_cc{$thiskey} $Image_nn{$thiskey}" . ($Image_kind{$thiskey} ? " $Image_kind{$thiskey}" : "");
      logit("g69: set Image_cc=$Image_cc{$thiskey} Image_nn=$Image_nn{$thiskey} Image_kind=$Image_kind{$thiskey} Image_patentnumber_real=>$Image_patentnumber_real{$thiskey}<\n");

      if ($this_data=~/patentnumber="(.+?)"/) {
         $Image_patentnumber{$thiskey}=$1;
         logit("g6a: set Image_patentnumber{$thiskey} to >$Image_patentnumber{$thiskey}<\n");
      }
      if ($this_data=~/ method="(.+?)"/) {$Image_method{$thiskey}=$1}
      if ($this_data=~/ size="(.+?)"/)   {$Image_size{$thiskey}=$1}
      if ($this_data=~/ pages="(.+?)"/)  {$Image_pages{$thiskey}=$1}

      # Handle the cases where TPS fails to designate a default patent by setting the
      # first patent we get, as the default.  This will likely get replaced by the real default.
      if ($TPS_count{$this_patent}==1) {
         $TPS_default{$this_patent}=$thiskey;
         logit("g6b: TPS_default initially set to $thiskey\n");
      }
      if ($this_data=~/ default="default"/) {
         $TPS_default{$this_patent}=$thiskey;
         logit("g6c: TPS_default default reset to $thiskey ...\n");
      }
      if ($this_data=~/ volume="(.+?)"/) {$Image_volume{$thiskey}=$1}
      if (! $Patolis && $this_data=~/ filer="(.+?)"/) {$Image_filer{$thiskey}=$1}
      if ($this_data =~ / type="(.+?)"/) {
         $Image_type{$thiskey}=$1;
         logit("g6d: Image_type=$Image_type{$thiskey}\n");
         # Joachim's Charon spec says this may be "JAPIO" if it's an
         # "JP JAPIO PDF, english translation of the JP title page"
         # but I've never seen one of those things.  But it IS indeed "JAPIO" for
         # the Japanese language image, e.g. JP23092902A2 (JP_2003092902_A1).
         if ($Image_type{$thiskey} ne "Original" && $Image_type{$thiskey} ne "JAPIO") {
            die "Strange 'type=' value ($Image_type{$thiskey}) for $thiskey.\n";
         }
      }
      logit("g6e: For key=$thiskey, Image_patentnumber=>$Image_patentnumber{$thiskey}< Image_patentnumber_real=>$Image_patentnumber_real{$thiskey}< Image_method=$Image_method{$thiskey} Image_size=$Image_size{$thiskey} Image_pages=$Image_pages{$thiskey} TPS_default=$TPS_default{$this_patent} Image_volume=$Image_volume{$thiskey} Image_filer=$Image_filer{$thiskey} and Image_type=$Image_type{$thiskey}.\n");
   }  # End of the loop to consume all the output of the TPS query

   foreach my $this_patn (@_) {
      if (! -r $Image_filer{$TPS_default{$this_patn}}) {
         # It's possible that our default patent has changed from our initial parse_patn guess, so
         # adjust it now that we have TPS's opinion of what image the default should be.
         $thiskey=$TPS_default{$this_patn};
         if ($Image_patentnumber_real{$thiskey} =~ /^(..) .*(\d\d)(\d\d)\D* (.*)$/i) {   # WO 200205 12 30 A1
            if ($Patolis) {
               $TempImageCacheDir{$this_patn}="/ips/images/cache/$3/$2";   # Reset Temporary Read-Write Image Cache
            } else {                                                      # San Jose & EDC permanent image store.
               $PermImageStoreDir{$this_patn}="/dfs/images/$1/$3/$2";      # Reset Permanent Image Store and
               $TempImageCacheDir{$this_patn}="/dfs/dlcache/$3/$2";        # Temporary Read-Write Image Cache
            }
            $default_perm_fn_prefix{$this_patn}="$Image_cc{$thiskey}$Image_nn{$thiskey}$Image_kind{$thiskey}";
         }
         logit("g6g: PermImageStoreDir=$PermImageStoreDir{$this_patn} TempImageCacheDir=$TempImageCacheDir{$this_patn} and default_perm_fn_prefix=$default_perm_fn_prefix{$this_patn}\n");
      }
   }
}  # End of the Get_TPS_Info subroutine

# _read_file_edge($file, $count, $from_end)
#
# Private helper for Get_TPS_Images.  Returns up to $count bytes read from the
# start of $file, or from its end when $from_end is true.  Returns "" if the
# file cannot be opened.  Replaces `head -c` / `tail -c`, so the filename (which
# contains the requested patent number) is never handed to a shell.
sub _read_file_edge {
   my ($file, $count, $from_end) = @_;
   my $buffer="";

   open(my $fh, '<', $file) or return "";
   binmode($fh);
   if ($from_end) {
      my $size=-s $fh;
      seek($fh, ($size > $count ? $size-$count : 0), 0);
   }
   read($fh, $buffer, $count);
   close($fh);
   return $buffer;
}

##################################################################################
#                                                                                #
# Get the default image(s) from TPS, write the file(s) into the local image     #
# cache, and save the filename(s) of the file(s) we wrote, into Image_filer.     #
#                                                                                #
# We don't get images if we can already see them in our NFS /widas mount,        #
# and we'll write the retrieved images in our temporary image cache.             #
#                                                                                #
##################################################################################
#
# Get_TPS_Images()
#
# Takes no arguments; works on every patent in @in_patns that has a
# $TPS_default entry.  Returns nothing useful.  Returns early if the request
# came from a TPS server, if nothing needs fetching, or if the link request
# fails (logged only).  Dies if a PDF download fails or a rename fails.
#
sub Get_TPS_Images {
   # Don't go to TPS if this request came from TPS.  This avoids an endless loop
   # when we think they have it and they think we have it.
   # The addresses must be quoted strings: a bare 63.84.162.201 is a v-string
   # literal (four characters) and never equals the dotted text.
   # NOTE(review): REQUEST_ADDR is not a standard CGI variable (REMOTE_ADDR is);
   # confirm that the calling environment really sets it.
   if (($ENV{'REQUEST_ADDR'} eq "63.84.162.201") ||       # marge01us
       ($ENV{'REQUEST_ADDR'} eq "195.27.130.113")) {      # marge01eu
      logit("g70: Endless loop avoided. REQUEST_ADDR=$ENV{'REQUEST_ADDR'}\n");
      return;
   }

   # Verify I can get to each patent's default image.  For those images I can't see,
   # go get it (probably from eSpaceNet), put the image in our temporary image cache,
   # and it's fn in $Image_filer{$TPS_default{$this_patn}}.
   $TPS_patn_list="";
   foreach $this_patn (@in_patns) {
      $thiskey=$TPS_default{$this_patn};
      if (! $thiskey) {next}                       # Skip if TPS didn't have any images
      logit("g71: cc=$cc{$this_patn} Image_cc{$thiskey}=$Image_cc{$thiskey} and we are" . ($Automatically_Substitute ? "" : " not") . " automatically substituting and TPS_default=$thiskey\n");

      # If found in my local file system, great!  Else save in my ask-TPS list.
      if (-r $Image_filer{$thiskey}) {
         $Gotten_From{$this_patn}="local file system";
      } else {
         my $tmp_patn="$Image_cc{$thiskey}_$Image_nn{$thiskey}_$Image_kind{$thiskey}";
         $TPS_patn_list.="&patentnumber=$tmp_patn";
         # Insure I can get back to my original patent number ($this_patn).
         # The key may be different due to a change in kind, e.g. A2 -> R8A2.
         $TPS_key{$this_patn}="$Image_cc{$thiskey} $Image_nn{$thiskey} $Image_kind{$thiskey}";
         $Requested_patn{$TPS_key{$this_patn}}=$this_patn;
         $Requested_key{$TPS_key{$this_patn}}=$thiskey;
         logit("g72: Can't see $Image_filer{$thiskey} so $tmp_patn added to TPS_patn_list and reset TPS_key{$this_patn}=$TPS_key{$this_patn} and Requested_key=$Requested_key{$TPS_key{$this_patn}}\n");
      }
   }
   logit("g73: Final \$TPS_patn_list=$TPS_patn_list\n");
   if (! $TPS_patn_list) {return}                  # Great, we have all the images.  There's nothing to get.

   # "http://www.thomsonpatentstore.net/charon/charon?charon:customer=patolis\
   #  &charon:service=patent_link_pdf&patentnumber=$key" %stdout
   #
   $get_link_url = "http://$TPS_Server/charon/charon?charon:customer=$TPS_id&charon:service=patent_link_pdf$TPS_patn_list";
   $get_link_url =~ s/\+/%2B/g;                    # Sometimes kind=C+ as for SE00101465C+

   $ua = LWP::UserAgent->new(timeout=>300);        # Give TPS 5 minutes to respond.  TPS may have to fetch
                                                   # it from espacenet and this could take a coupla minutes.
   logit("g74: Get Link URL = \"$get_link_url\" \n\n");
   my $response = $ua->get($get_link_url);         # Get Link URL (hopefully) from TPS.
   if (! $response->is_success) {                  # Check the outcome of the response.
      # For example, if the server is down, the error message will be
      # Get TPS Error 500 Can't connect to www.thomsonpatentstore.net:80 (Interrupted system call)
      logit("g75: Get TPS Error: " . $response->status_line . "\n");
      return;
   }

   # To document what's returned by TPS, here is a patent_link_pdf call for WO_1989004114_A3,
   # EP_1344442_A1 and a non-existent, EP_9999442_A1.
   #
   # http://www.thomsonpatentstore.net/charon/charon?charon:customer=patolis&charon:service=patent_link_pdf
   #        patentnumber=WO_1989004114_A3&patentnumber=EP_1344442_A1
   #
   #   HTTP Header: Date: Fri, 18 Mar 2005 04:49:41 GMT
   #   HTTP Header: Server: Apache/2.0.48 (Unix) mod_ssl/2.0.48 OpenSSL/0.9.6l mod_jk/1.2.5
   #   HTTP Header: Cache-Control: no-cache, no-store, max-age=0
   #   HTTP Header: Connection: close
   #   HTTP Header: Content-Type: text/xml; charset=UTF-8
   #
   # The only relevant parts of the response are the patent_link elements, each of
   # which wraps one URL such as
   #   http://marge01eu.thomsonpatentstore.net/pdf/WO1989004114R4A3.pdf
   #   http://marge01eu.thomsonpatentstore.net/pdf/WO2002051230A1.pdf
   #
   $content= $response->content();
   logit("g76: Got content >$content<\n");

   # We use Perl's g modifier (see "Progressive Matching" in the Perl book, page 180) and
   # minimal matching (+?).  Capture 1 is TPS's key, capture 2 is the image URL.
   # NOTE(review): the opening-tag half of this pattern (including the attribute
   # name) was reconstructed from how the loop uses $1 and $2; confirm it against
   # the production copy of this file and a real TPS reply.
   while ($content =~ /<patent:patent_link patentnumber="(.*?)"[^>]*>\s*(http:\/\/.*?)<\/patent:patent_link>/gs) {
      my ($tps_key,$tps_link)=($1,$2);

      # Grab TPS's key and translate it back to the original input'd patent number.
      # For example, use "WO 2001032477 R8A2" to get back to WO00132477A2.
      logit("g77: \$1 is our tmp_TPS_key=>$tps_key<\n");
      $this_patent=$Requested_patn{$tps_key};
      $thiskey=$Requested_key{$tps_key};
      logit("g78: this_patent=$this_patent and thiskey=>$thiskey<\n");
      $temp_fn="$TempImageCacheDir{$this_patent}/$this_patent.$myhostname.$$.pdf";
      logit("g79: Will put image at $temp_fn\n");
      $this_TPS_link=$tps_link;                    # URL, eg http://marge01eu.thomsonpatentstore.net/pdf/WO2001032477R8A2.pdf
      logit("g7a: Got =>$this_TPS_link<= for $this_patent\n");
      if (! $this_TPS_link) {next}                 # Sanity check.  Should have valid URL but may not
                                                   # if TPS doesn't have an image for this patent.

      ###########################################################################
      #                                                                         #
      # At this point, $this_TPS_link is the URL of our image file, e.g.        #
      # http://marge01eu.thomsonpatentstore.net/pdf/WO2001032477R8A2.pdf        #
      #                                                                         #
      # Go get the PDF file, sticking it in a temporary image file, then if     #
      # it looks ok, rename it at the last moment to a permanent image name.    #
      #                                                                         #
      ###########################################################################
      $ua = LWP::UserAgent->new(timeout=>40);      # Give TPS 40 seconds to respond.  Note this isn't a
                                                   # total time, timeout.  IE if TPS starts to respond
                                                   # but takes 20 minutes to deliver everything, that's
                                                   # ok - we won't time out.  This timeout happens if we
                                                   # ever go this long without receiving any bytes.

      # Send output to a temporary filename and rename if complete.  This avoids the problem of
      # the user getting impatient (it takes about 17 seconds for a 1.7 MB file to get copied
      # from TPS to Japan), and retries his request.  What was happening was the second process
      # saw and used the partially-written tif file, resulting in bizarreness.
      my $pdf_response = $ua->get($this_TPS_link,':content_file'=> $temp_fn);
      if (! $pdf_response->is_success) {           # Oops, image retrieval failed for this image server.
         die ("Image TPS error. Status_line=>", $pdf_response->status_line , "\n");
      }

      # The HTTP Headers returned by TPS include
      #   HTTP Header: Date: Tue, 02 Nov 2004 19:25:36 GMT
      #   HTTP Header: Server: Apache/2.0.48 (Unix) mod_ssl/2.0.48 OpenSSL/0.9.6l mod_jk/1.2.5
      #   HTTP Header: Last-Modified: Fri, 08 Aug 2003 14:14:08 GMT
      #   HTTP Header: ETag: "963f20-5e201-73286c00"
      #   HTTP Header: Accept-Ranges: bytes
      #   HTTP Header: Content-Length: 385537
      #   HTTP Header: Connection: close
      #   HTTP Header: Content-Type: application/pdf
      #
      $length= $pdf_response->header('Content-Length');
      my $file_size=-s $temp_fn;
      logit("g7b: $temp_fn file is " . $file_size . "-bytes (" . (($file_size == $length) ? "right" : "wrong") . ").\n");

      # Perform some sanity checking on our resulting image.  PDF files start with %PDF
      # and end with %%EOF.  Also check number of bytes, since it's handy to do so.
      # Only the first problem found is logged.
      my $problem="";
      if    (! -e $temp_fn)                               {$problem="g7c: $temp_fn does not exist.\n"}
      elsif (_read_file_edge($temp_fn,4,0) !~ /%PDF/)     {$problem="g7d: head -c4 $temp_fn is not %PDF\n"}
      elsif (_read_file_edge($temp_fn,7,1) !~ /%%EOF/)    {$problem="g7f: tail -c7 $temp_fn is not %%EOF\n"}
      elsif ($length != $file_size)                       {$problem="g7g: File size(" . $file_size . ") is not $length.\n"}

      if ($problem) {
         logit($problem);
         unlink $temp_fn unless ($debug);          # Image retrieval failed for this image server.
         ##############################################################################
         #                                                                            #
         # Consider retrying this error, not to the other server like I do above,     #
         # but to the same server.  It'll give me a chance to use another goto. ;-)   #
         #                                                                            #
         ##############################################################################
         $perm_fn="";
      } else {
         # We've got a good PDF image file, so determine our permanent
         # file name, and try to rename it to that.  If a file already
         # exists with our permanent name, then leave it with the temp name.
         # One "rule" that makes our lives easier is, if you're ever going to write
         # into our temporary image cache directory, use the requested patent in
         # the file name.  This makes finding it next time a whole lot easier.
         # This works fine except in the EP-to-WO substitution case.  Rick, maybe address later.
         $perm_fn="$TempImageCacheDir{$this_patent}/$this_patent.pdf";
         if (! -f $perm_fn) {
            logit("g7h: Renaming to $perm_fn.\n");
            if (! rename "$temp_fn", "$perm_fn") {
               die "Rename $temp_fn to $perm_fn failed.\n";
            }
         }
      }

      # Recorded even when the sanity check failed, in which case the filename is "".
      $Image_filer{$thiskey}="$perm_fn";
      $Gotten_From{$this_patent}="TPS";
      logit("g7i: Image_filer{$thiskey}=$Image_filer{$thiskey} gotten from $Gotten_From{$this_patent}\n");
   }
}  # End of the Get_TPS_Image subroutine

# Use the USPTO website to pull each of the individual TIFF image pages of a # patent, then use any2any to combine them into a single multi-page TIFF file, # writing $TempImageCacheDir{$this_patn}/$in_patn.tif. # # This code was modified from Rick's /dfs/ipntools/uspto_pull_image.pl # # If the US PTO's image has Reexaminations or Certificate of Corrections # appended at the end, we get them, too. # sub Get_USPTO_Image { my $in_patn=shift; my $pagecount=0, $idkey="", $imagelink_url=""; my $working_dir="$TempImageCacheDir{$this_patn}/$in_patn.$myhostname.$$"; my $ctl_file="$working_dir/$in_patn.ctl"; my $temp_fn="$TempImageCacheDir{$this_patn}/downloading.$myhostname.$$.$in_patn.tif"; my $perm_fn="$TempImageCacheDir{$this_patn}/$in_patn.tif"; # We need to get the US PTO's image link. The URL for granted images looks like # href=http://patimg1.uspto.gov/.piw?Docid=05551212 # &homeurl=http%3A%2F%2F164.195.100.11%2Fnetacgi% ... this piece is pretty long ...
# &PageNum= # &Rtype= # &SectionNum= # &idkey=BD21661A21D6 # # The US Application image URL is similar, but uses a different server and .aix # instead of .piw. From both URL's, the only 2 parms we really need are Docid and # idkey. We know the format of Docid and if we could figure out the idkey format, # we could generate the .piw or .aiw URL's ourselves. Alas though, we can't decipher # idkey, so we gotta "steal" the whole link URL from their details view. if ($is_US_app) { # The URL for US Applications, are a bit different than for Granted images. # The server for example, is different, as is some parms (d=PG01 versus d=PALL). $details_url="http://151.207.241.118/netacgi/nph-Parser?Sect1=PTO1&Sect2=HITOFF&d=PG01&p=1&u=/netahtml/PTO/srchnum.html&r=1&f=G&l=50&s1='$US_docid1'.PGNR.&OS=DN/$US_docid1&RS=DN/$US_docid1"; } else { $details_url="http://164.195.100.11/netacgi/nph-Parser?Sect1=PTO1&Sect2=HITOFF&d=PALL&p=1&u=/netahtml/srchnum.htm&r=1&f=G&l=50&s1=$US_docid1.WKU.&OS=PN/$US_docid1&RS=PN/$US_docid1"; } $ua = LWP::UserAgent->new(timeout=>10); # Give the US PTO 10 seconds to respond. logit("g80: US PTO Details URL = \"$details_url\"\n\n"); my $response = $ua->get($details_url); # Get US PTO Details view. if (! $response->is_success) { # Abort if we fail. return; } $temp=$response->content(); # logit("g81: Details Content=>$temp<\n"); if($temp =~ m/No (?:patents|application publications) have matched your query/) { return; } if ($temp !~ /\[Image]$imagelink_url<\n\n"); # Get what info we can from the image link URL from the US PTO Detais page, # namely the idkey, which is a unique id for this patent/application, # the image server's host name, and the document ID. if($imagelink_url !~ m/idkey=([^&>"\n]*)/i) { logit("g84: Didn't find idkey in image link\n>$imagelink_url<\n"); return; # If not expected image link format, then abort. 
} $idkey = $1; logit("g85: Found idkey=$idkey\n"); # Find hostname, either patimg1.uspto.gov or patimg2.uspto.gov for US Granted, # or aiw1.uspto.gov for US Applications. if($imagelink_url !~ m/http:\/\/([^\/]*)\//) { logit("g86: Didn't find server name in image link\n>$imagelink_url<\n"); return; # If not expected image link format, then abort. } $img_host = $1; logit("g87: Found img_host=$img_host\n"); # Follow that image link, pulling the image's first page to get the total number # of pages in the image. The resulting HTML conveniently has this line near the beginning, # my $response = $ua->get($imagelink_url); # Get US PTO Details view. if (! $response->is_success) { # Abort if we fail. return; } $temp=$response->content(); # logit("g88: Image Content=>$temp<\n"); if($temp !~ m/NumPages=(\d[\d]*)/i) { logit("g89: Didn't find page count in image link. I got back $temp\n"); return; # If no image link, then abort. } $pagecount = $1; $pages_to_get=$pagecount; logit("g8a: There are $pagecount pages to retrieve.\n"); # Pull each TIFF image page. The US PTO presents single-page TIF files inside # a one-page-at-a-time viewer. The relevant link inside that page is # # # The Docid in this .DImg URL is a bit different than the previous details view, # which is why we have US_docid1 & US_docid2. $ua = LWP::UserAgent->new(timeout=>20); # Give US PTO 20 seconds to respond. Note this isn't a # total time, timeout. IE if US PTO starts to respond # but takes 20 minutes to deliver everything, that's # ok - we won't time out. This timeout happens if we # ever go this long without recieving any bytes. if (! -r $working_dir ) { # We finally need to make this directory mkdir $working_dir, 0775; if (! -r $working_dir ) { return } # Just in case we fail to make the directory. } unlink glob("$working_dir/*"); # Insure we are starting fresh. 
open CTLFILE, ">$ctl_file" or die "Can't open $ctl_file."; for ($i=1; $i<=$pages_to_get; $i++) { # Loop to get each page my $img_url="http://$img_host/.DImg?Docid=$US_docid2&PageNum=$i&IDKey=$idkey&ImgFormat=tif"; my $outfile="$working_dir/$in_patn.page.$i.tif"; logit("g8b: Getting page $i of $pages_to_get in $outfile. URL is\n$img_url\n"); my $response = $ua->get($img_url,':content_file'=> $outfile); # Get this image page. if (! $response->is_success || -z $outfile) { logit("g8c: Failure getting pate $i of $pages_to_get in $outfile\n"); unlink glob("$working_dir/*") if (! $debug); rmdir "$working_dir" if (! $debug); return; } print CTLFILE "filename $outfile\n" } close CTLFILE; # At this point, all pages of our US image are sitting in our $working_dir along with # our $ctl_file. Convert to multi-page tif into a temporary file, then rename. logit("g8d: Calling any2any $ctl_file $temp_fn\n"); @lines = `$any2anyDir/any2any $ctl_file $temp_fn 2>&1`; $rc = $?; if ($rc) { # Did any2any fail? $errno = $!; logit("g8e: Write of $temp_fn failed. any2any rc=$rc & errno=$errno\n\n"); foreach $line (@lines) { # Log any messages from anyinfo. logit("g8f: $line"); } } else { # any2any worked ok. Great. How many pages did it write? $any2any_page_count = 0; foreach $this_line (@lines) { # If only 1 page got written, any2any messages says "1 page", not "1 pages". if ( $this_line =~ /([0-9]+) pages? written OK./) { $any2any_page_count = $1; last; } } if ( ! $any2any_page_count ) { logit("g8g: any2any counted zero pages for $temp_fn\n"); foreach $line (@lines) { # Log any messages from any2any. logit("g8h: $line"); } } else { # any2any returned with no errors. # Check page count of the image file we just wrote, with anyinfo. # Maybe we got an error writing the image (AFS/DFS/NFS was down?). 
$anyinfo_page_count = 0; logit("g8i: Calling $any2anyDir/anyinfo $temp_fn\n"); @lines = `$any2anyDir/anyinfo $temp_fn`; foreach $line (@lines) { $_ = $line; if ( /^([0-9]+) pages?\./ ) { $anyinfo_page_count = $1; last; } } if ( ! $anyinfo_page_count ) { $err_count++; logit("g8j: anyinfo counted zero pages for $temp_fn\n"); foreach $line (@lines) { # Log any messages from anyinfo. logit("g8k: $line"); } } else { # Insure the two page counts match. They should. if ( $any2any_page_count != $anyinfo_page_count ) { logit("g8l: Page count mismatch for $temp_fn: $any2any_page_count vs $anyinfo_page_count\n"); foreach $line (@lines) { # Log any messages from anyinfo. logit("g8m: $line"); } } else { # Normal case. All's ok. logit("g8n: Wrote $anyinfo_page_count pages into $temp_fn at " . scalar localtime() . ".\n"); if (! rename "$temp_fn", "$perm_fn") { unlink glob("$working_dir/*") if (! $debug); rmdir "$working_dir" if (! $debug); die "Rename $temp_fn to $perm_fn failed.\n"; } } } # End of "Insure the two page counts match. They should." } # End of "any2any returned with no errors." } # End of "any2any worked ok." if (-r $perm_fn) { # If everything above was successful and we unlink glob("$working_dir/*"); # have a readable image, then erase our rmdir "$working_dir"; # working directories and return the return "$perm_fn"; # result of our labors. } else { unlink glob("$working_dir/*") if (! $debug); rmdir "$working_dir" if (! $debug); return ""; # Something above failed. Return nothing. } } # End of the Get_USPTO_Image subroutine # Convert a given PDF file to desired format, using the supplied filename prefix for my # temporary working files, then when finished, quickly rename it to the supplied # permanent name. # # Converting a PDF file is a two-step process. 
#   First, /ips/prod/bin/pdftops EP00618926B1.pdf EP00618926B1.ps
#   Then,  /ips/prod/bin/any2any EP00618926B1.ps  EP00618926B1.$Wanted_Image_Type
#
# Parameters:
#   $inputfilename  - the PDF file to convert.
#   $tempfilePrefix - path prefix (no extension) for scratch files; ".ps",
#                     ".err" and ".tif" are appended to it here.
#   $permfilePrefix - path prefix (no extension) for the finished files.
#   $type           - wanted output type, handed on to Call_any2any.
#   $pdf_page       - optional; when true, only that page is converted to PS.
#
# Returns whatever Call_any2any returns, or nothing (bare return) when pdftops
# wrote anything to stderr.  Dies if the .ps rename fails.
#
# Globals read: $pdftopsCommand, $Erase_Intermediate_PS_File, $debug, $page.
# Global written: $page is set to 1 whenever $pdf_page is given.
#
sub Convert_pdf {
    my ($inputfilename, $tempfilePrefix, $permfilePrefix, $type, $pdf_page) = @_;
    logit("g90: Inside Convert_pdf with inputfilename=$inputfilename\n tempfilePrefix=$tempfilePrefix\n permfilePrefix=$permfilePrefix\n type=$type\n and pdf_page=$pdf_page\n");

    my $pdftopsArgs = "";
    if ($pdf_page) {
        # If page was set, only convert that specific page of pdf to ps.
        # This speeds up the subsequent PS to xxx conversion.
        # The page number is shell-quoted because it ends up on a command line.
        my $quoted_page = _shell_quote($pdf_page);
        $pdftopsArgs .= " -f $quoted_page -l $quoted_page ";

        # Reset $page to 1, since ps will only have one page to convert.
        # Works for both cached and non-cached .ps
        # NOTE: $page is a package global, so this is visible to the caller.
        $page = 1;
    }

    my $ps_tmp  = "$tempfilePrefix.ps";
    my $err_tmp = "$tempfilePrefix.err";

    # See if we can save ourselves a step.  Might our postscript file exist in
    # our image cache directory, already converted for us?  If it is, save
    # ourselves this step.
    my $testfilename = "$permfilePrefix.ps";
    if (! FileExists($testfilename)) {
        logit("g91: Calling pdftops $pdftopsArgs $inputfilename $tempfilePrefix.ps\n");

        # File names are single-quoted so that shell metacharacters in them
        # cannot alter the command.  The redirection needs the shell, so the
        # list form of system() is not an option here.
        system("$pdftopsCommand $pdftopsArgs "
             . _shell_quote($inputfilename) . " " . _shell_quote($ps_tmp)
             . " 2>" . _shell_quote($err_tmp));

        if (-s $err_tmp) {                 # Error in pdftops call?
            logit("g92: pdftops Error.\n");
            unlink $ps_tmp;
            unlink $err_tmp;
            return;
        }
        unlink $err_tmp;                   # Get rid of zero-length error output file

        # Rename temporary file to real name once the conversion is complete.
        if ((-s $ps_tmp) && (-r _)) {
            if (! rename $ps_tmp, $testfilename) {
                die "Rename $tempfilePrefix.ps to $testfilename failed.\n";
            }
            logit("g93: Renamed $tempfilePrefix.ps to $testfilename\n");
        }
    }

    logit("g94: Calling Call_any2any($testfilename, $tempfilePrefix, $permfilePrefix, $type, $page)\n");
    my $returned_fn = Call_any2any($testfilename, $tempfilePrefix, $permfilePrefix, $type, $page);
    logit("g95: Call_any2any returned $returned_fn\n");

    # Set Erase_Intermediate_PS_File=0 to save this temporary postscript file.
    # The code above will save a step (converting pdf to ps) at the expense of
    # using additional disk space in our image cache.
    if ($Erase_Intermediate_PS_File) { unlink $testfilename }

    unlink $err_tmp if (! $debug);
    unlink $ps_tmp;                        # Shouldn't exist.
    unlink "$tempfilePrefix.tif";          # Shouldn't exist.
    return $returned_fn;
}                                          # End of the Convert_pdf subroutine


# Convert a given file to some other format (hopefully and presumably, a format
# understood by any2any), using the supplied filename prefix for my temporary
# working files, then when finished, quickly rename it to the supplied permanent
# name.
#
# This is a simple any2any call,
#    /ips/prod/bin/any2any -# num EP00618926B1.tif EP00618926B1.pdf
#
# Parameters:
#   $inputfilename  - file to convert.
#   $tempfilePrefix - path prefix (no extension) for the scratch output
#                     ("$tempfilePrefix.$type") and error file (".err").
#   $permfilePrefix - path prefix (no extension) of the finished file.
#   $type           - wanted output type; also used as the file extension.
#   $pageNum        - optional; when true it is passed to any2any as "-# n".
#
# Returns "$permfilePrefix.$type", or nothing (bare return) when any2any
# reported an error AND left no non-empty output file.  Dies if the final
# rename fails.
#
# Globals read: $in_patn, $pdf_creator, $any2anyDir, $Send_Any2any_Error_Mail,
#               $cmdline, $debug, $myhostname, $from.
# Globals written: $mail_TO_recipients, $mail_CC_recipients, %ENV (PDF_* keys).
#
sub Call_any2any {
    my ($inputfilename, $tempfilePrefix, $permfilePrefix, $type, $pageNum) = @_;
    logit("g90: Inside Call_any2any with inputfilename=$inputfilename wanting type=$type\n tempfilePrefix=$tempfilePrefix\n permfilePrefix=$permfilePrefix and pageNum=$pageNum\n");

    if ($type eq "pdf") {
        $ENV{"PDF_TITLE"}   = $in_patn;
        $ENV{"PDF_SUBJECT"} = $in_patn;
        $ENV{"PDF_CREATOR"} = "$pdf_creator";
    }

    my $temp_out = "$tempfilePrefix.$type";
    my $err_file = "$tempfilePrefix.err";
    my $perm_out = "$permfilePrefix.$type";

    # If there's any other output besides the normal
    #    Opened: this, that, and the other file
    #    ...
    #    Opened: /tmp/aaaqffIya
    #    hscale=1.000000, vscale=1.000000     (When writing pdf's only)
    #    Execution of PostScript Interpreter is complete
    #    7 pages written OK.
    # then any2any got an error, e.g. the file system is full.
    #
    # When creating pdf files, one needs to be cd'd into /ips/prod/bin else you get errors.
    my $numParm = $pageNum ? "'-#' " . _shell_quote($pageNum) : "";

    # Strangely enough, ipsrun's PATH does not include the current directory, so would get
    #    sh: aps2ras: not found
    #    sh: aps2ras: not found
    #    Cannot open input file /ips/images/cache/87/56/US04965687__.ps or unsupported format
    # when converting postscript files.  (So how did this EVER work??)  Fix path here.
    #
    # File names are single-quoted so shell metacharacters in them are inert.
    # NOTE(review): the egrep filter also discards "Decompression failed", the
    # very message used as the example further down - confirm that is intended.
    my $any2any_full_command =
          "cd $any2anyDir;export PATH=\$PATH:;export ARCPS=../arcps/current;"
        . "./any2any -e \"ARCPS=../arcps/current\" -e \"PS_FILT=../arcps/current/aps2ras\" "
        . "$numParm " . _shell_quote($inputfilename) . " " . _shell_quote($temp_out)
        . " 2>&1 | /usr/bin/egrep -v '^Opened:|^Execution of PostScript Interpreter is complete|pages? written OK|hscale|Decompression failed' > "
        . _shell_quote($err_file);
    logit("g91: Calling $any2any_full_command\n");
    system($any2any_full_command);

    if (-s $err_file) {                    # Error in any2any call?
        # There is a not uncommon problem with any2any that we code around here.
        # Sometimes any2any spits out an error message, yet writes a partial
        # output file.  An example is US21035478A1.  Converting this 11-page
        # tif file to PDF, with this command
        #    /dfs/prod/ipn/bin/any2any /dfs/images/US/78/54/US21035478A1.tif /dfs/dlcache/US21035478A1.pdf
        # gives this error message,
        #    <<>>:Decompression failed with rc = 8f0e
        # and you get an 11-page PDF file that seems ok 'till you view page 11, when Acrobat Reader
        # gives an error msg or just a blank page.  See bugzilla bug #2264 (closed WORKSFORME by Tom).
        #
        # Eric says to deliver what we can and send e-mail to Rebecca so she can fix the image.
        # In May, 2004, the sending of mail was aborted due to heavy volume.
        if ($Send_Any2any_Error_Mail && ! $cmdline) {
            if ($debug) {
                $mail_TO_recipients = "rick.jasper\@thomson.com";
            }
            else {
                $mail_TO_recipients = "rebecca.hernandez\@thomson.com";
                $mail_CC_recipients = "rick.jasper\@thomson.com";
            }

            # Only pass -c when there really is a CC address.  With an empty
            # value the option would swallow the TO address as its argument.
            my $have_cc = (defined $mail_CC_recipients && length $mail_CC_recipients);
            my $mail_command = "/usr/bin/mail -s"
                             . _shell_quote("any2any Conversion Error for $in_patn on $myhostname");
            $mail_command .= " -c $mail_CC_recipients" if $have_cc;
            $mail_command .= " $mail_TO_recipients";

            if (open(my $mail_fh, '|-', $mail_command)) {
                logit("g93: Sending e-mail to $mail_TO_recipients (CCing "
                    . ($have_cc ? $mail_CC_recipients : "")
                    . ") due to $in_patn conversion error.\n");

                # E.G. This is automatically-generated mail from the getimage program due to an any2any conversion error.
                #
                #      The image came from EDC.  Please investigate.
                #      At Mon Jun  9 19:03:16 2003, getimage on dweb3
                #
                #      The commands were
                #         cd /dfs/prod/ipn/bin
                #         ./any2any  /dfs/images/US/78/54/US21035478A1.tif /dfs/dlcache/US21035478A1dephds043.85854.pdf
                #
                #      The error message generated by any2any was
                #         <<>>:Decompression failed with rc = 8f0e
                print {$mail_fh} "This is automatically-generated mail from the $0 program due to an any2any conversion error.\n\n";
                print {$mail_fh} "The image came from $from. Please investigate.\n";
                print {$mail_fh} "At " . scalar localtime() . ", $0 on $myhostname\n\n";
                print {$mail_fh} "The commands were\n";
                print {$mail_fh} " cd $any2anyDir\n ./any2any $numParm $inputfilename $tempfilePrefix.$type\n\n";
                print {$mail_fh} "The error message generated by any2any was \n";

                # Append any2any's own messages.  A failed open just leaves
                # that part of the mail empty.
                if (open(my $err_fh, '<', $err_file)) {
                    while (my $err_line = <$err_fh>) {
                        print {$mail_fh} " $err_line";
                    }
                    close $err_fh;
                }
                close $mail_fh;
            }
            else {
                logit("g92: Cannot send mail to $mail_TO_recipients\n");
            }
        }

        # Only if there was no output file generated by any2any, do we abort.
        # As described above, often we DO have something we can deliver.
        # It may have errors in it or be incomplete, but it's the best we can do.
        if (! -s $temp_out) {
            logit("g94: any2any error resulting in a zero-length output file.\n");
            unlink $temp_out if (! $debug);
            unlink $err_file if (! $debug);
            return;
        }
    }
    unlink $err_file;

    # Rename temporary file to real name once the conversion is complete.
    if ((-s $temp_out) && (-r _)) {
        if (! rename $temp_out, $perm_out) {
            die "Rename $tempfilePrefix.$type to $permfilePrefix.$type failed.\n";
        }
        logit("g95: Renamed $tempfilePrefix.$type to $permfilePrefix.$type\n");
    }
    unlink $temp_out;                      # Shouldn't exist.
    return $perm_out;
}                                          # End of the Call_any2any subroutine


# Check for the existence of a particular file and while you're at it,
# erase any zero-length files you find.
#
# Returns 1 when $fn names a readable, non-empty plain file, otherwise 0.
# A readable zero-length plain file is unlinked before 0 is returned.
sub FileExists {
    my ($fn) = @_;

    # defined/length rather than plain truth, so a file named "0" still counts.
    return 0 unless defined $fn && length $fn;
    return 0 unless -f $fn && -r _;        # No file found.

    if (-z $fn) {
        unlink $fn;                        # File found, but is zero length.
        return 0;                          # Clean up this junk and report no file found.
    }
    return 1;                              # This file IS there.  All's well.
}                                          # End of the FileExists subroutine


# If we have debug turned on, write the given line to our logfile, or
# to the console if this is a command line invocation.
#
# Only the first argument is written.  Logging is best effort: if the log
# file cannot be opened for append, the message is silently dropped.
# Globals read: $debug, $cmdline, $logfile.
sub logit {
    return unless $debug;

    my $message = defined $_[0] ? $_[0] : "";
    if ($cmdline) {
        print $message;
        return;
    }

    if (open(my $log_fh, '>>', $logfile)) {
        print {$log_fh} $message;
        close $log_fh;
    }
    return;
}                                          # End of the logit subroutine


# Private helper: wrap one word in single quotes for /bin/sh, so that it
# reaches the invoked program as exactly one literal argument.  An embedded
# single quote is written as '\'' (close quote, escaped quote, reopen).
sub _shell_quote {
    my ($word) = @_;
    $word = "" unless defined $word;
    $word =~ s/'/'\\''/g;
    return "'$word'";
}

# $Header: /cvsroot/ipn/bin/getimage,v 1.19 2005/03/19 04:25:43 jasper Exp $