#!/dfs/prod/perl/bin/perl
#
# getimage: Generic routine to look for, retrieve if need be, possibly generate,
# and return the filename for the image file of the requested patent.
# Typically, this routine is called from (f)cgi-bin programs to return the
# filename for a patent's image. For example calls,
# in the EDC, see /dfs/prod/ipn/cgi-bin/image
# or in Japan, see /ips/prod/cgi-bin/viewpat.cmd
#
# EG, $ImageFileName=`../bin/getimage EP00904950A1 type=tif`;
# if (! -r $ImageFileName || -z _) { # If failed to find a usable image or
# error ... # it's zero-length, then no image found.
#
# This can also be called from the command line with this syntax,
# getimage patn (type=tif|pdf|ps) (document=full|clip|abstract) (page=nn) (from TPS|USPTO)
# (sub|nosub) (query) (outformat=NV) (debug)
#
# PATN => The Patent number is typically a 2-character country code, 8-digit number,
# and 2-character kind. This is the only required parameter. The middle
# "number" part may have leading zeros stripped and can also be non-numeric
# as is the case for non-utility US patents (USPP1234__).
# You can also specify more than one patn, e.g. getimage patn1 patn2 patn3 type=pdf
#
# type=Wanted_Image_Type => For example, pdf or tif most commonly, but can be anything
# understood by any2any. If omitted, any image type is ok and
# we return whatever type we find (ie we do no conversions).
#
# - document=full|clip|abstract => Optional. Defaults to "full", which means "Please return
# the full document". Other possibilities include
# clip=Please return the clip, for example,
# /dfs/images/clips/US/12/12/US6801212B1.drg001.tif
# abstract=Please return the 1-page abstract, used for
# Japanese images, for example,
# /dfs/images/JP/2005/34/12/2005021234.tif
#
# - page=pagenumber => Optional. EG, page=1 to only return the first page. Only valid for
# full document requests.
#
# - from TPS|USPTO => Force getting from this source. Will put the file in the current
# or from=TPS|USPTO directory. This option is only valid from the command line.
#
# - sub|nosub => Allow or disallow (typically, EP-to-WO) image substitution.
# E.G. EP01344442A1 -> WO00251230A1 or EP01345714A1 -> WO00251231A1.
# or CA00932350A1 -> FR02075483A5 or CA01331388A1 -> US05052710A1.
# The default is to NOT allow these country-to-country substitutions.
# (Edward uses "sub" in his PDF Express java code.)
#
# - query => Ask TPS for details on what images it has available for the given patent(s)
# and display the information on the screen. This option WAS going to be only
# available from the command line, but fcgi-bin/getcdlabel.fpl uses this option.
#
# - format=NV => Return the image(s) data in name=value pairs (Sander uses this in Patolis).
# For example,
#
# - debug => Produce debug output, either to /ips/prod/logs/images.log (or /ips/test/...)
# or to the console if this is a command-line invocation.
#
# To test this from the command line,
# getimage US05551212__ type=pdf
# getimage EP00904950A1 type=tif
# getimage EP00904950A1 page=1
# getimage EP1344442A1 sub (substitutes to WO002051230A1)
# getimage EP1345714A1 sub (substitutes to WO020051231A1)
# getimage WO120000A1 type=tif
# getimage WO09209959A1 type=tif debug
# getimage DE10000001A1 DEK4000012U1 q
# getimage JP24100644A2 document=abstract
# getimage US06801212__ document=clip
#
# This program is shared between Patolis & EDC, so be aware & keep them in synch.
# The idea was to have one place where the give-me-an-image logic can be centralized
# and is called from
# * EDC's and Patolis's viewpat.cmd & download.cmd
# * EDC's cgi-bin/image program, which is the TPS & Patolis interface to EDC's images.
# Ironically, we might very well have a double getimage call with this sequence,
# Patolis User does a viewpat, which calls
# Patolis's cgi-bin/viewpat.cmd, which calls
# Patolis's bin/getimage, which calls
# EDC's cgi-bin/image,which calls
# EDC's bin/getimage, which gets the image and finally returns the image.
# * EDC's fcgi-bin/getcdlabel.fpl program, which is what Santokh uses for the
# Net Commerce Fast Buy
# * EDC's ImageConverter.java, which is Edward's PDF Express code.
ProcessInput(); # Parse arguments and initialize variables.
# Major Case 1/4: If user is forcing us to get|query image from a specific source,
# do so without looking around the local file system for it.
if ($from) {
# Forced-source request: the caller named the one place ($from) to query or
# fetch from, so the local file system is not scanned in this branch.
if ($from eq "TPS") {
    die "We don't have a usable EDC server from $myhostname.\n" unless $TPS_Server;
    Get_TPS_Info(@in_patns);

    if ($mode eq "QUERY") {
        if ($outformat eq "NV") {
            # Name=value report: one header pair per requested patent, then one
            # stanza per image that TPS reported for it.
            foreach $this_patn (@in_patns) {
                print "RequestedPatent=$this_patn\n";
                print "ImageCount=$TPS_count{$this_patn}\n";
                for my $idx (1 .. $TPS_count{$this_patn}) {
                    $thiskey = "$this_patn#$idx";
                    my $is_default = $TPS_default{$this_patn} eq $thiskey ? "yes" : "no";
                    print "Handle=$Image_cc{$thiskey} $Image_nn{$thiskey} $Image_kind{$thiskey}\n";
                    print "Volume=$Image_volume{$thiskey}\n";
                    print "Method=$Image_method{$thiskey}\n";
                    print "Pages=$Image_pages{$thiskey}\n";
                    print "Filer=$Image_filer{$thiskey}\n";
                    print "Default=" , $is_default , "\n";
                }
            }
        } else {
            # Human-readable table, one per requested patent. For example,
            # getimage WO00132477 q
            #
            # TPS Has 3 Images for WO00132477
            # Image "Handle" Volume Method Pages EDC Local File Name
            # ===================== =========== ====== ===== ====================================
            # WO 2001032477 R8A2 miwo2001049 local 3 /widas/WO/77/24/WO2001032477R8A2.pdf
            # WO 2001032477 R5A2 miwo2001023 local 2 /widas/WO/77/24/WO2001032477R5A2.pdf
            # WO 2001032477 A1 miwo2001019 local 33 /widas/WO/77/24/WO2001032477A1.pdf <-- Default
            #
            # The "EDC Local File Name" column is suppressed when running at Patolis.
            foreach $this_patn (@in_patns) {
                unless ($TPS_count{$this_patn}) {
                    print "The Thompson Patent Store does not have an image for $this_patn.\n" if $cmdline;
                    next;
                }

                my $plural = $TPS_count{$this_patn} == 1 ? "" : "s";
                print " TPS Has $TPS_count{$this_patn} Image" , $plural , " for $this_patn\n";

                print " Image \"Handle\" Volume Method Pages";
                print " EDC Local File Name" unless $Patolis;
                print "\n";
                print "===================== =========== ====== =====";
                print " ====================================" unless $Patolis;
                print "\n";

                for my $idx (1 .. $TPS_count{$this_patn}) {
                    $thiskey = "$this_patn#$idx";
                    logit("g01: this_patn=$this_patn TPS_default=$TPS_default{$this_patn} and thiskey=$thiskey\n");
                    my $local_name = $Patolis ? "" : "$Image_filer{$thiskey}";
                    my $marker     = $TPS_default{$this_patn} eq $thiskey ? "<-- Default" : "";
                    printf("$Image_cc{$thiskey} %11s %-6s %11s %6s %5s %s %s\n",
                           $Image_nn{$thiskey},
                           $Image_kind{$thiskey},
                           lc($Image_volume{$thiskey}),
                           $Image_method{$thiskey},
                           $Image_pages{$thiskey},
                           $local_name,
                           $marker);
                }

                # Warn a command-line user when the default image belongs to a
                # different country than the one requested and substitution is off.
                my $default_cc = $Image_cc{$TPS_default{$this_patn}};
                if ($default_cc ne $cc{$this_patn} && ! $Automatically_Substitute && $cmdline) {
                    print "\n===> Be aware that we don't allow this country-to-country substitution. <===\n\n";
                }
            }
        }
        exit;    # Query mode never goes on to fetch or convert anything.
    }

    # Not query mode, so this is a forced fetch from TPS. Afterwards the image for
    # each patent is expected in $Image_filer{$TPS_default{$this_patn}}.
    Get_TPS_Images();
} elsif ($from eq "USPTO") {
    die "Query mode from the US PTO not supported.\n" if $mode eq "QUERY";

    foreach $this_patn (@in_patns) {
        if ($cc{$this_patn} ne "US") {
            print "Can't get a $cc{$this_patn} patent from the US PTO.\n" if $cmdline;
            next;
        }

        logit("g02: Getting $this_patn image from US PTO.\n");
        $test_fn = Get_USPTO_Image($this_patn);
        unless (-r $test_fn) {
            print "The US PTO doesn't have an image for $this_patn.\n" if $cmdline;
            next;
        }

        # A readable file came back: make it this patent's default image.
        $Gotten_From{$this_patn}     = "US PTO";
        $TPS_default{$this_patn}     = "$this_patn#1";
        $Image_filer{"$this_patn#1"} = $test_fn;
    }
} else {
    print "Invalid 'from' value ($from).\n" if $cmdline;
}
} elsif ($document eq "CLIP") {
# Major Case 2/4: Request for a patent's clip image
# Clip lookup. For each requested patent, probe the local clip store and, when a
# usable file is found, record it as that patent's default image. Only tif clips
# (or "any type") are served, and never at Patolis.
#
# $claim_clip->($look_tag, $set_tag) probes the current $test_fn. On a hit it
# records $test_fn under $thiskey, marks the source as the local file system and
# returns 1; otherwise it returns 0. The two tags are the log-line prefixes.
my $claim_clip = sub {
    my ($look_tag, $set_tag) = @_;
    logit("${look_tag}: looking for $test_fn\n");
    return 0 unless FileExists($test_fn);
    $TPS_default{$this_patn} = $thiskey;
    $Image_filer{$thiskey}   = $test_fn;
    $Gotten_From{$this_patn} = "local file system";
    logit("${set_tag}: setting TPS_default{$this_patn}=$TPS_default{$this_patn} & Image_filer{$thiskey}=$Image_filer{$thiskey}=\n");
    return 1;
};

foreach $this_patn (@in_patns) {
    next if $Patolis;
    next if $Wanted_Image_Type && $Wanted_Image_Type ne "tif";

    if ($cc{$this_patn} eq "JP") {
        # $page is ignored for JP clips, even if set.
        # The key must be set here: previously this branch used whatever value
        # $thiskey happened to hold (possibly none), so a found JP clip was
        # filed under the wrong key.
        $thiskey = "$this_patn#1";
        $test_fn = "/dfs/images/clips/$cc{$this_patn}/$Dir1{$this_patn}/$Dir2{$this_patn}/JP$num{$this_patn}A1.tif";
        $claim_clip->("g03", "g04");
    } # end JP clip

    if ($cc{$this_patn} eq "US" && ! $page) {
        $thiskey = "$this_patn#1";
        $test_fn = "/dfs/images/clips/$cc{$this_patn}/$Dir1{$this_patn}/$Dir2{$this_patn}/US$num{$this_patn}A1.tif";

        # Try kind A1 first, then B1, then B2; stop at the first file found.
        next if $claim_clip->("g03", "g04");
        $test_fn =~ s/A1\./B1\./g;
        next if $claim_clip->("g05", "g06");
        $test_fn =~ s/B1\./B2\./g;
        $claim_clip->("g07", "g08");
    }
}
} elsif ($document eq "ABSTRACT") {
# Major Case 3/4: Request for the 1-page Japanese abstract image.
# Abstract lookup. Only Japanese patents qualify, only outside Patolis, only for
# "all pages" or page 1, and only when the caller wants a pdf or any type.
foreach $this_patn (@in_patns) {
    next if $Patolis;
    next unless $cc{$this_patn} eq "JP";
    next if $page && $page ne "1";
    next if $Wanted_Image_Type && $Wanted_Image_Type ne "pdf";

    $thiskey = "$this_patn#1";    # Key to file the image under, if one is found.
    $test_fn = "$PermImageStoreDir{$this_patn}/$default_perm_fn_prefix{$this_patn}.pdf";
    logit("g09: looking for $test_fn\n");
    next unless FileExists($test_fn);

    # Usable image found: make it this patent's default.
    $TPS_default{$this_patn} = $thiskey;
    $Image_filer{$thiskey}   = $test_fn;
    $Gotten_From{$this_patn} = "local file system";
    logit("g0a: setting TPS_default{$this_patn}=$TPS_default{$this_patn} & Image_filer{$thiskey}=$Image_filer{$thiskey}=\n");
}
} else { # Major Case 4/4: This is not a forced "from" request, nor is it a
# request for the 1-page Japanese abstract.
# It's the normal case of a regular (probably tif|pdf) file.
# For each patent requested, see if we can determine the filename without asking TPS,
# just to save time. If we find an image, set $TPS_default and Image_filer, which will
# short-circuit the Get_TPS_Info & Get_TPS_Images subroutine for this patent.
#
# First local scan. For each requested patent, try to find a usable image on the
# local file systems before asking TPS. Search order:
#   1. the single-page cache file, when page=N was requested;
#   2. the full document in the requested type (permanent store, then cache);
#   3. the full document as pdf (permanent store, then cache);
#   4. the full document as tif (permanent store, then cache).
# The first hit sets $TPS_default{$this_patn} and $Image_filer{$thiskey}.
#
# $claim_local_image->($candidate, $look_tag, $set_tag) probes one file name.
# It leaves the name in the global $test_fn, logs with the given tags, records
# the file on a hit, and returns 1 on a hit or 0 on a miss.
my $claim_local_image = sub {
    my ($candidate, $look_tag, $set_tag) = @_;
    $test_fn = $candidate;
    logit("${look_tag}: looking for $test_fn\n");
    return 0 unless FileExists($test_fn);
    $TPS_default{$this_patn} = $thiskey;
    $Image_filer{$thiskey}   = $test_fn;
    logit("${set_tag}: setting TPS_default{$this_patn}=$TPS_default{$this_patn} & Image_filer{$thiskey}=$Image_filer{$thiskey}=\n");
    return 1;
};

foreach $this_patn (@in_patns) {
    $thiskey = "$this_patn#1";    # Key to file the image under, if one is found.
    my $perm_base  = "$PermImageStoreDir{$this_patn}/$default_perm_fn_prefix{$this_patn}";
    my $cache_base = "$TempImageCacheDir{$this_patn}/$this_patn";

    # 1. A specific page was asked for: look for just that page's cached image.
    if ($page) {
        if ($Wanted_Image_Type) {
            $claim_local_image->("$cache_base.page.$page.$Wanted_Image_Type", "g10", "g11");
        } else {
            # No type given: prefer a cached tif page, then a cached pdf page.
            $claim_local_image->("$cache_base.page.$page.tif", "g12", "g13")
                or $claim_local_image->("$cache_base.page.$page.pdf", "g14", "g15");
        }
    }

    # Full document wanted (the normal case), or the page image was not cached.
    if (! $TPS_default{$this_patn}) {
        # 2. The requested type, if any.
        if ($Wanted_Image_Type) {
            if ($PermImageStoreDir{$this_patn}) {
                $claim_local_image->("$perm_base.$Wanted_Image_Type", "g16", "g17");
            }
            if (! $TPS_default{$this_patn}) {
                $claim_local_image->("$cache_base.$Wanted_Image_Type", "g18", "g19");
            }
        }

        if (! $TPS_default{$this_patn}) {
            # 3. Anything will do now, preferring pdf. Skipped when pdf was the
            #    requested type, because step 2 already looked for it.
            if ($Wanted_Image_Type ne "pdf") {
                if ($PermImageStoreDir{$this_patn}) {
                    $claim_local_image->("$perm_base.pdf", "g1a", "g1b");
                }
                if (! $TPS_default{$this_patn}) {
                    $claim_local_image->("$cache_base.pdf", "g1c", "g1d");
                }
            }

            # 4. Finally tif. The permanent-store probe is skipped when tif was
            #    the requested type; the cache probe is always made.
            if (! $TPS_default{$this_patn}) {
                if ($Wanted_Image_Type ne "tif" && $PermImageStoreDir{$this_patn}) {
                    $claim_local_image->("$perm_base.tif", "g1e", "g1f");
                }
                if (! $TPS_default{$this_patn}) {
                    $claim_local_image->("$cache_base.tif", "g1g", "g1h");
                }
            }
        }
    }

    if ($TPS_default{$this_patn}) {
        $Gotten_From{$this_patn} = "local file system";
    } elsif (($cc{$this_patn} eq "US")
             && (($kind{$this_patn} eq "") || ($kind{$this_patn} eq "__"))
             && ($TPS_kind{$this_patn} eq "B1")) {
        # The requested kind was empty or "__" and the B1 name failed, so try
        # the same permanent-store pdf name with B2 in place of B1.
        (my $b2_fn = "$perm_base.pdf") =~ s/B1\./B2\./g;
        if ($claim_local_image->($b2_fn, "g1g", "g1h")) {
            # Record the source here too. This branch used to leave
            # $Gotten_From unset, so later output showed a blank source.
            $Gotten_From{$this_patn} = "local file system";
        }
        # Clear the TPS kind so that TPS may match either B1 or B2.
        $TPS_kind{$this_patn} = "";
    }
} # End of the first scan, looking for the image on our own before asking TPS.
logit("g20: Calling Get_TPS_Info\n");
# For those patents that we couldn't find an image for in our local file system, ask TPS.
Get_TPS_Info(@in_patns);

# Second local scan. We may know more now, e.g. an EP-to-WO substitution
# (EP01344442A1 -> WO2002051230A1) or a funky TPS kind (WO00132477A2 -> R8A2),
# so recheck the local file systems before having TPS fetch the image.
#
# Known gap, kept from the original design notes: when type=tif and/or page=N
# is requested for a substituted or funky-kind image, TPS may point us at an
# original that is readable right now, so this scan is skipped even though an
# already-generated cache file (e.g. /dfs/dlcache/42/44/WO2002051230A1.page.1.tif)
# might exist. Until those cases are detected, such page/tif files are
# regenerated unnecessarily. Ideas noted for detecting them:
#   - a wanted type that differs from the type of $Image_filer{$thiskey};
#   - a page request where $cc{$this_patn} differs from $Image_cc{$thiskey};
#   - a page request where the file name lacks ".page.N.".
#
# $adopt_if_exists->($candidate, $look_tag, $set_tag) probes one file name.
# It leaves the name in the global $test_fn, logs with the given tags, stores
# the name in $Image_filer{$thiskey} on a hit, and returns 1 (hit) or 0 (miss).
my $adopt_if_exists = sub {
    my ($candidate, $look_tag, $set_tag) = @_;
    $test_fn = $candidate;
    logit("${look_tag}: looking for $test_fn\n");
    return 0 unless FileExists($test_fn);
    $Image_filer{$thiskey} = $test_fn;
    logit("${set_tag}: setting Image_filer{$thiskey}=$Image_filer{$thiskey}=\n");
    return 1;
};

foreach $this_patn (@in_patns) {
    $thiskey = $TPS_default{$this_patn};
    next if -r $Image_filer{$thiskey};    # Already have a usable image for this patent.

    logit("g21: Starting second scan for $thiskey and page=>$page<\n");
    my $perm_base  = "$PermImageStoreDir{$this_patn}/$default_perm_fn_prefix{$this_patn}";
    my $cache_base = "$TempImageCacheDir{$this_patn}/$this_patn";

    # A specific page was asked for: look for just that page's cached image,
    # e.g. /dfs/dlcache/42/44/WO2002051230A1.page.1.tif
    if ($page) {
        if ($Wanted_Image_Type) {
            $adopt_if_exists->("$cache_base.page.$page.$Wanted_Image_Type", "g22", "g23");
        } else {
            $adopt_if_exists->("$cache_base.page.$page.tif", "g24", "g25");
        }
    }

    # Full document wanted (the normal case), or the page image was not cached.
    #
    # Each guard below tests $Image_filer{$thiskey}, the hash this scan writes.
    # The guards used to test $TPS_filer{$thiskey}, which nothing here sets, so
    # they were always true: every candidate was probed and the last hit
    # overwrote earlier, preferred ones (tif beat pdf; the full document beat
    # the cached page).
    if (! -r $Image_filer{$thiskey}) {
        # The requested type, if any: permanent store first, then the cache.
        if ($Wanted_Image_Type) {
            if ($PermImageStoreDir{$this_patn}) {
                $adopt_if_exists->("$perm_base.$Wanted_Image_Type", "g26", "g27");
            }
            if (! -r $Image_filer{$thiskey}) {
                $adopt_if_exists->("$cache_base.$Wanted_Image_Type", "g28", "g29");
            }
        }

        if (! -r $Image_filer{$thiskey}) {
            # Anything will do now, preferring pdf to tif. Skipped when pdf was
            # the requested type, because it was already looked for above.
            if ($Wanted_Image_Type ne "pdf") {
                if ($PermImageStoreDir{$this_patn}) {
                    $adopt_if_exists->("$perm_base.pdf", "g2a", "g2b");
                }
                if (! -r $Image_filer{$thiskey}) {
                    $adopt_if_exists->("$cache_base.pdf", "g2c", "g2c");
                }
            }

            if (! -r $Image_filer{$thiskey}) {
                if ($Wanted_Image_Type ne "tif" && $PermImageStoreDir{$this_patn}) {
                    $adopt_if_exists->("$perm_base.tif", "g2d", "g2e");
                }
                if (! -r $Image_filer{$thiskey}) {
                    $adopt_if_exists->("$cache_base.tif", "g2f", "g2g");
                }
            }
        }
    }

    # NOTE(review): this tests $TPS_default, which Get_TPS_Info may have set even
    # when nothing was found locally; presumably Get_TPS_Images overwrites
    # $Gotten_From when it fetches the image -- confirm.
    if ($TPS_default{$this_patn}) {
        $Gotten_From{$this_patn} = "local file system";
    }
} # End of the second scan, trying to find the image on our own before letting TPS get it.
# letting TPS get it for us.
# Have TPS fetch images for every patent we could not find one for locally.
Get_TPS_Images();

# Last resort for US patents that still have no default image: ask the US PTO.
foreach $this_patn (@in_patns) {
    next if $TPS_default{$this_patn};
    next unless $cc{$this_patn} eq "US";

    $test_fn = Get_USPTO_Image($this_patn);
    logit("g2h: USPTO returned >$test_fn< for $this_patn.\n");

    unless (-r $test_fn) {
        print "The US PTO doesn't have an image for $this_patn.\n" if $cmdline;
        next;
    }

    # A readable file came back: make it this patent's default image.
    $Gotten_From{$this_patn}     = "US PTO";
    $TPS_default{$this_patn}     = "$this_patn#1";
    $Image_filer{"$this_patn#1"} = $test_fn;
}
}
# We're done looking for usable image(s) and/or going off and getting them.
# Now, for each image we have, check its type and page scope. If it is not the
# wanted type, or a single page was requested and this is not that page's image,
# convert it and point $Image_filer at the converted file in the image cache.
#
foreach $this_patn (@in_patns) {
    # The page-extraction path below may default $Wanted_Image_Type to the type
    # of the file found. Scope that to this one patent so it cannot change the
    # decision for later patents. local (not my) is used so that any sub called
    # from here still sees the value through the package variable.
    local $Wanted_Image_Type = $Wanted_Image_Type;

    $test_fn = $Image_filer{$TPS_default{$this_patn}};
    next unless -r $test_fn;    # No image for this patent; can't convert what we don't have.

    logit("g30: Got usable image file at $test_fn from $Gotten_From{$this_patn} when Wanted_Image_Type=$Wanted_Image_Type\n");

    # Split e.g. /widas/WO/77/24/WO2001032477A1.pdf into directories, prefix and
    # type. The directory part is optional so that a bare file name (no "/")
    # still parses, and a failed match resets the pieces instead of silently
    # reusing captures left over from an earlier match.
    if ($test_fn =~ m{^(?:(.*)/)?(.*)\.([a-z]{1,4})}i) {
        $fnDirectories = defined $1 ? $1 : "";    # /widas/WO/77/24
        $fnPrefix      = $2;                      # WO2001032477A1
        $fnType        = $3;                      # pdf
    } else {
        ($fnDirectories, $fnPrefix, $fnType) = ("", "", "");
        logit("g36: Could not parse directory, prefix and type from $test_fn\n");
    }

    # \Q..\E keeps the page value from being read as regex syntax.
    my $needs_page    = ($page && $test_fn !~ /\.page\.\Q$page\E\./) ? 1 : 0;
    my $needs_retype  = ($Wanted_Image_Type && $Wanted_Image_Type ne $fnType) ? 1 : 0;
    next unless $needs_retype || $needs_page;    # Already the right type and scope.

    # Rule: anything written into the temporary image cache is named after the
    # REQUESTED patent, which makes it easy to find next time. This misbehaves
    # for EP-to-WO substitution: "EP01344442A1 sub type=tif" converts the
    # delivered WO2002051230A1.pdf into a cache file named for EP01344442A1,
    # and that cached file is what gets found from then on.
    $permfilePrefix = "$TempImageCacheDir{$this_patn}/$this_patn";    # eg /dfs/dlcache/00/00/USD460000__
    logit("g31: We've got work to do on $test_fn\nWanted_Image_Type=$Wanted_Image_Type page=$page and permfilePrefix=$permfilePrefix\nTempImageCacheDir{$this_patn}=$TempImageCacheDir{$this_patn}\n");

    # A single-page result goes to <cache dir>/<patn>.page.<page>.<type>
    if ($needs_page) {
        logit("g32: must extract page $page from $test_fn.\n");
        $permfilePrefix .= ".page.$page";
        if (! $Wanted_Image_Type) {
            # No type specified: output the same type as the input file.
            $Wanted_Image_Type = $fnType;
            logit("g33: Defaulting Wanted_Image_Type to $Wanted_Image_Type.\n");
        }
    }

    # Host name and process id keep concurrent converters on a shared file
    # system from colliding on the temporary name.
    $tempfilePrefix = "$permfilePrefix.$myhostname.$$";
    logit("g34: fnDirectories=$fnDirectories,fnPrefix=$fnPrefix,\nfnType=$fnType, and permfilePrefix=$permfilePrefix\n");
    logit("g35: Converting to $permfilePrefix $Wanted_Image_Type (page=$page) ... \n");

    if ($fnType eq "pdf") {    # pdf input takes the pdf-specific conversion path.
        $Image_filer{$TPS_default{$this_patn}} = Convert_pdf($test_fn, $tempfilePrefix, $permfilePrefix, $Wanted_Image_Type, $page);
    } else {
        $Image_filer{$TPS_default{$this_patn}} = Call_any2any($test_fn, $tempfilePrefix, $permfilePrefix, $Wanted_Image_Type, $page);
    }
    $Gotten_From{$this_patn} .= " and converted";
}
# Hand the results back to the caller, one patent at a time. A found image's
# file name is printed bare (no newline) for a single-patent, non-command-line
# call, so a backtick caller gets just the name. The source and the "not found"
# message are shown only on the command line.
foreach $this_patn (@in_patns) {
    my $final_fn = $Image_filer{$TPS_default{$this_patn}};

    unless (-r $final_fn) {
        print "No usable image found for $this_patn.\n" if $cmdline;
        next;
    }

    print "$final_fn";
    print " (from $Gotten_From{$this_patn})" if $cmdline;
    print "\n" if $cmdline || $number_of_patns > 1;
}
exit;
# Process the input and initialize global variables for the rest of the program.
#
# We set the following Input Variables:
# @in_patns = Array of uppercased patent numbers that were requested.
# $number_of_patns = Number of entries in the in_patns array.
# $debug = Binary switch to generate debug lines in log file (or console if
# called from the command line). Default is 0 (no debug output).
# $Wanted_Image_Type = pdf or tif or null, which means our caller doesn't care
# what type is returned. They'll take any image type.
# $page = Page number for single-page requests (eg, to generate or find a
# one-page thumbnail). Default is null, meaning return all pages.
# $from = Forcibly get an image from the specified server (command line only).
# $Automatically_Substitute = Allow TPS's EP-to-WO substitutions.
# $Patolis = Binary switch to designate Patolis site
# $myhostname = EG, ips01i or dweb3 or penguin
# $cmdline = Binary switch. Normally false, unless invoked from a command line.
# $logfile = File name of log file for debugging or tracing.
# $EDC_Server = I.P. address of EDC's cgi-bin/image interface
# $TPS_id
# $TPS_Server
# $pdftopsCommand
# $any2anyDir
# $Erase_Intermediate_PS_File
# $Send_Any2any_Error_Mail
sub ProcessInput {
use LWP::UserAgent; # Useful for making remote web requests.
$myhostname=`hostname -s`; # Used for conditional code between EDC & Patolis,
chomp $myhostname; # and to differientiate temporary filenames in a
# multi-server & shared file system environment.
$page=""; # All pages, please.
$from="";
$TPS_Server="www.thomsonpatentstore.net";
$Wanted_Image_Type="";
$Automatically_Substitute=0; # Disallow TPS country-to-country image substitutions
$Erase_Intermediate_PS_File=1; # Clean up intermediate files
$Send_Any2any_Error_Mail=0; # No e-mail to Rebecca & Rick for any2any errors.
$Gotten_From{$this_patn}="unknown";
$mode="GET"; # Default to get (not query) and
$document="FULL"; # the full document (not clip or abstract)
$outformat=""; # in normal English output (not Sander's NV)
$DE{"A"}=10; $DE{"B"}=11; $DE{"C"}=12; $DE{"K"}=20; # Delphion encoding of German
$DE{"L"}=21; $DE{"M"}=22; $DE{"U"}=50; $DE{"X"}=60; # patent numbers
# Detect command line testing versus true web-server call (where REQUEST_METHOD will
# be set) or a call from the Tomcat environment (where CATALINA_HOME will be set).
if ((! $ENV{REQUEST_METHOD}) && (! $ENV{CATALINA_HOME})) {
$cmdline=1;
} else { # Most likely a true cgi-bin call from a web server
$cmdline=0; # or from the Tomcat world.
}
if ($myhostname=~/^ips/) {
$Patolis=1; # For Patolis-specific differences
$TPS_id="patolis";
$pdf_creator="Patolis";
# When called from the web server via a viewpat or download.cmd call, we use
# the SCRIPT_FILENAME environment variable, which contains our fully-qualified
# name, to differentiate between the production versus test environments.
#
# If this is a command-line call during testing (so the SCRIPT_FILENAME environment
# variable probably isn't set), then we use $0 (how we called ourselves) to
# select either environment, prod or test. EG, /ips/prod/bin/getimage ...
#
if ($ENV{'SCRIPT_FILENAME'} =~ /\/prod\// || $0 =~ /\/prod\//) {
$EDC_Server="84.18.161.12"; # EDC's Download Servers (www5.delphion.com)
$pdftopsCommand="/ips/prod/bin/pdftops";
$logfile="/ips/prod/logs/getimage.log";
} else {
$EDC_Server="84.18.161.14"; # EDC's Test Server (www7.delphion.com)
$pdftopsCommand="/ips/test/bin/pdftops";
$logfile="/ips/test/logs/getimage.log";
}
# Don't worry that /ips/prod is hardcoded here even if we're in the test environment.
# This is only used to convert images and even in the test environment,
# /ips/test/bin/aps2ras has /ips/prod hardcoded.
$any2anyDir="/ips/prod/bin";
} else { # We must be running in EDC or maybe, San Jose
$Patolis=0;
$TPS_id="delphion";
$pdf_creator="Thomson Delphion - http://www.delphion.com";
if ($myhostname=~/^d[a-z]*\d*$/) { # If we're really in EDC and not just San Jose,
$EDC=1; # remember this fact.
$EDC_Server=""; # EDC doesn't go to EDC for images
} else { # Else we're in San Jose
$EDC=0; # remember this fact.
$EDC_Server="84.18.161.14"; # EDC's Test Server (www7.delphion.com)
}
if ($ENV{'SCRIPT_FILENAME'} =~ /\/prod\// || $0 =~ /\/prod\//) {
$pdftopsCommand="/dfs/prod/ipn/bin/pdftops";
} else {
$pdftopsCommand="/dfs/stage/ipn/bin/pdftops";
if (! -x $pdftopsCommand) {$pdftopsCommand="/dfs/prod/ipn/bin/pdftops"}
}
$logfile="/ips/ipn/logs/getimage.log"; # Same place for both prod & stage
$any2anyDir="/dfs/prod/ipn/bin";
}
if (! -x $pdftopsCommand) {die "Can't find $pdftopsCommand program."}
if (! -d $any2anyDir) {die "Can't see $any2anyDir directory."}
##################################################################
# Now process the input arguments. #
##################################################################
$number_of_patns=0;
while (@ARGV) {
$thisARG=uc shift @ARGV;
logit("g40: Checking out thisARG=>$thisARG<\n"); # This msg may not show up in the log if we
# haven't seen the debug option yet.
if ($thisARG =~ /^type=(\w\w\w)$/i) { # This chokes on some otherwise valid types
$Wanted_Image_Type=lc $1; # that any2any understands, but tough.
# Who uses p12 or ras8 or scdim2 anyway?
} elsif ($thisARG =~ /^document=(full|clip|abstract)$/i) {
$document=$1;
} elsif ($thisARG =~ /^d(ebug)?$/i) {
$debug=1;
($sec,$min,$hour)=(localtime)[0,1,2];
logit(sprintf "g ============================= %02u:%02u:%02u =============================\n",$hour,$min,$sec);
} elsif ($thisARG =~ /^page=(\d*)$/i) {
$page=$1;
} elsif ($thisARG =~ /^sub$/i) {
$Automatically_Substitute=1;
} elsif ($thisARG =~ /^nosub$/i) {
$Automatically_Substitute=0;
} elsif ($thisARG =~ /^q(uery)?$/i) {
$mode="QUERY";
$from="TPS"; # A query, is presumed to be from TPS.
} elsif ($thisARG =~ /^from$/i && $cmdline) {
$from=shift @ARGV;
} elsif ($thisARG =~ /^from=(tps|uspto)$/i && $cmdline) {
$from=$1;
} elsif ($thisARG =~ /^outformat=(nv)$/i) {
$outformat=$1;
} elsif ($thisARG =~ /^cd=/i) {
# Quietly ignore old CD label parameter.
} else {
if (exists $out_fn{$thisARG}) {
logit("g41: $thisARG was specified twice. Multiples are ignored.\n");
} else {
$in_patns[$number_of_patns]=$thisARG; # Remember new patent number.
$out_fn{$thisARG}="";
logit("g42: Calling parse_patn for $thisARG\n");
parse_patn($thisARG);
$number_of_patns++;
}
}
}
} # End of the ProcessInput subroutine
# Parse the passed patent number into the pieces needed for the rest of the program.
#
# Takes one parameter, the patent number exactly as requested, and returns nothing useful.
# Everything it learns is stored in the global hashes listed below, keyed by that patent
# number. It dies for a WO number it can't place in any known numbering range and, on a
# Patolis host, silently exits the whole program for anything other than US, WO or EP.
#
#   $my_patn = The original, untouched requested patent. Used to key the rest of these hashes.
#   $cc{$my_patn} = The uppercased, 2-character country that was requested.
#   $num{$my_patn} = The "numeric" part of the requested patent used in filenames, in
#          TPS-preferred format. This may not be all numeric and the format will
#          differ for different countries. It is left undefined when the number
#          can't be parsed.
#          For WO patents, this will be in the preferred 4-digit years and
#          6-digit numbers format. e.g. num=1983004466 for WO08304466A1
#                                       num=2000000001 for WO00000001A2
#                                       num=2002051230 for WO00251230A1
#                                    or num=2002051231 for WO02051231A1
#          For US Applications & Japanese patents, the preferred format is 4-digit
#          years and 7-digit numbers, e.g. num=20010024032 for US21024032A1.
#          We also undo Delphion's mangling if needed, for some DE & JP patents.
#          For all others (eg US granted or EP), remove leading zeros, e.g. 1234 or RE1234.
#   $kind{$my_patn} = The uppercased requested kind. Normally 2-characters, but might be longer,
#          eg, A9W1A1. Might also be null for kind-less requests.
#   $TPS_kind{$my_patn} = The uppercased, (normally 2-character) kind that we'll use to request TPS
#          image data, but could be null if this is a kind-less request. Usually
#          this is $kind{$my_patn}, but we'll suppress Delphion's standard of __ kinds
#          for US patents and overcome a bug in Joachim's code where he doesn't
#          handle null kind requests for US non-utility patents.
#   $default_perm_fn_prefix{$my_patn} = Our guess of TPS's image filename in the Permanent image
#          store, eg "$cc{$my_patn}$num{$my_patn}$TPS_kind{$my_patn}".
#          If TPS_kind is null, I'll never find the image without asking TPS.
#   $TPS_key{$my_patn} = What TPS will use as its key, ie "$cc{$my_patn} $num{$my_patn} $TPS_kind{$my_patn}"
#   $Requested_patn{$TPS_key} = Reverse mapping, TPS_key-to-The patent that was asked for.
#   $Dir1{$my_patn} = Last pair of digits in patent number.
#   $Dir2{$my_patn} = Next-to-last pair of digits in patent number.
#   $PermImageStoreDir{$my_patn} = Permanent, Read-Only Image Store. EG, /dfs/images in EDC or SJ
#   $TempImageCacheDir{$my_patn} = Temporary, Read-Write Image Cache. EG, /dfs/dlcache in EDC or SJ,
#          /ips/images/cache in Japan.
#   $is_US_app{$my_patn} = Binary switch. Is this a US Application?
#   $is_US_special{$my_patn} = Binary switch. Is this a non-utility US patent?
#   $US_docid1{$my_patn} = Needed/used only if getting from US PTO.
#   $US_docid2{$my_patn} = Needed/used only if getting from US PTO.
#
# Throughout, the pieces are picked up with a list assignment from the match, so a number
# that fails to match leaves num and kind undefined instead of inheriting $1 and $2 from
# whatever pattern happened to match last.
#
sub parse_patn {
    my $my_patn=shift;
    $cc{$my_patn}=substr($my_patn,0,2);
    logit("g50: parse_patn is looking at $my_patn and cc=$cc{$my_patn}\n");
    $is_US_app{$my_patn}=0;
    $is_US_special{$my_patn}=0;

    if ($cc{$my_patn} eq "US") {
        # Here we allow leading US Prefixes and only allow exactly 2-character kinds.
        #   eg, US0D001234__ or US00001234__ or us1400H__
        #   num = D001234    or 1234         or 1400H
        #   kind = __        or __           or __
        ($num{$my_patn},$kind{$my_patn})=$my_patn=~/^US0*(.*?)(\D.)?$/;
        logit("g51: patn=$my_patn, num=$num{$my_patn} and kind=$kind{$my_patn}\n");
        if ($num{$my_patn}=~/^(\D+)(\d+)([DHLN]?)/) {
            # Handle US Non-Utility Patents, which may be in the NPO standard of US0D277060__
            # or US0RE29774A1 or US0BRE28576__. Those take special handling.
            $is_US_special{$my_patn}=1;
            $USPrefix=$1;                               # eg D, PP, RE, H, BRE, etc.
            $nn=$2+0;                                   # The true numeric portion w/o leading 0's.
            $USSuffix=$3;                               # Fractional Patent Suffix (extremely rare)
            $num{$my_patn}="$USPrefix$nn$USSuffix";     # The preferred TPS form (eg D1234 or 1234 or 1400H)
            $US_docid1{$my_patn}="$USPrefix$nn";
            # The longer the prefix, the fewer digits are kept after it.
            $US_docid2{$my_patn}="US0$USPrefix" . substr("000000$nn",(-8+length($USPrefix)));
            # Determine TPS kind. Normally, the TPS server correctly handles null or __ kinds, but for
            # US non-utility patents, you need to specify a good kind. This is a bug in Joachim's code
            # that he hasn't fixed yet.
            if    ($USPrefix eq "D" ) {$TPS_kind{$my_patn}="S1"}
            elsif ($USPrefix eq "PP") {$TPS_kind{$my_patn}="P1"}
            elsif ($USPrefix eq "RD") {$TPS_kind{$my_patn}="E1"}
            elsif ($USPrefix eq "RE") {$TPS_kind{$my_patn}="E1"}
            elsif ($USPrefix eq "RX") {$TPS_kind{$my_patn}="I2"}
            else  {$TPS_kind{$my_patn}=""}              # Unknown US non-utility type. Let TPS figure it out.
            logit("g52: Non-Utility US patent ($my_patn) prefix=$USPrefix suffix=$USSuffix\n");
        } else {                                        # Else it's US Granted Utility Patent or US Application
            if ($num{$my_patn} > 20000000 && $num{$my_patn} < 29999999) {   # US App?
                $is_US_app{$my_patn}=1;
                # Convert US Applications from the Delphion standard of US21024032A1
                # to the TPS standard of US20010024032A1
                $num{$my_patn}=sprintf("200%1u%07u", substr($num{$my_patn},1,1), substr($num{$my_patn},2));
                $US_docid1{$my_patn}=$num{$my_patn};
                $US_docid2{$my_patn}="us$num{$my_patn}ki";
                if ($kind{$my_patn}) {
                    $TPS_kind{$my_patn}=$kind{$my_patn};    # Use what was given, eg, A1 or P1 for US Applications
                }
                logit("g53: US Application ($my_patn)\n");
            } else {                                    # Else it's a US Granted Utility Patent
                $US_docid1{$my_patn}=$num{$my_patn};
                $US_docid2{$my_patn}="US" . substr("0000000$num{$my_patn}",-9);     # Pad to 9 digits.
                # If a real kind was specified, use it. For the normal case of Delphion's
                # double-underscore kind or no kind at all, pick A1 or B1 from the patent number.
                if ($kind{$my_patn} ne "__" && $kind{$my_patn}) {
                    $TPS_kind{$my_patn}=$kind{$my_patn};
                } elsif ($num{$my_patn} < 6167569) {
                    $TPS_kind{$my_patn}="A1";
                } else {
                    $TPS_kind{$my_patn}="B1";           # B2 possible too, but handled as exception later
                }
                logit("g54: US Utility ($my_patn) num=$num{$my_patn} kind=$kind{$my_patn} " .
                      "TPS_kind=$TPS_kind{$my_patn} US_docid1=$US_docid1{$my_patn} and US_docid2=$US_docid2{$my_patn}\n");
            }
        }
        logit("g55: num=$num{$my_patn} kind=$kind{$my_patn} TPS_kind=$TPS_kind{$my_patn} US_docid1=$US_docid1{$my_patn} and US_docid2=$US_docid2{$my_patn}\n");

    } elsif ($cc{$my_patn} eq "WO") {                   # Else it's not a US image request. Is it WO?
        #   eg, WO521230A1 or WO1A1
        #   num = 521230   or 1       (leading zeros are stripped by the pattern)
        #   kind = A1      or A1
        ($num{$my_patn},$kind{$my_patn})=$my_patn=~/^WO0*(\D*\d+)(\D.{0,5})?$/i;
        $TPS_kind{$my_patn}=$kind{$my_patn};
        logit("g56: my_patn=$my_patn cc=$cc{$my_patn} num=$num{$my_patn} kind=$kind{$my_patn} TPS_kind=$TPS_kind{$my_patn}\n");
        # Handle all WO numeric possibilities. In numerical order, we have
        my $wo=$num{$my_patn};
        if ($wo > 0 && $wo < 80000) {                               # Zero-stripped 2000 YYddddd
            $wo="2000" . substr("00000$wo",-6);                     #   becomes 20000ddddd
        } elsif ($wo > 100000 && $wo < 199999) {                    # 2001 YYddddd
            $wo="20010" . substr($wo,-5);                           #   becomes 20010ddddd
        } elsif ($wo > 200000 && $wo < 251231) {                    # Jan-June, 2002 YYddddd
            $wo="20020" . substr($wo,-5);                           #   becomes 20020ddddd
        } elsif ($wo > 251230 && $wo < 299999) {                    # June-Dec, 2002 YYddddd
            $wo="20020" . substr($wo,-5);                           #   becomes 20020ddddd
        } elsif ($wo > 1000000 && $wo < 2051231) {                  # 2001-June, 2002 Ydddddd
            $wo="200" . substr($wo,0,1) . substr($wo,-6);           #   becomes 200Ydddddd
        } elsif ($wo > 2051230 && $wo < 7800000) {                  # 2002-2007 Ydddddd
            $wo="200" . substr($wo,0,1) . substr($wo,-6);           #   becomes 200Ydddddd
        } elsif ($wo > 7800000 && $wo < 9999999) {                  # 1978-1999 YYddddd
            $wo="19" . substr($wo,0,2) . "0" . substr($wo,-5);      #   becomes 19YY0ddddd
        } elsif ($wo > 78000000 && $wo < 99999999) {                # 1978-1999 YYdddddd
            $wo="19" . substr($wo,0,2) . substr($wo,2);             #   becomes 19YYdddddd
        } elsif ($wo > 190000000 && $wo < 200251231) {              # Almost in expanded form YYYYddddd
            $wo=substr($wo,0,4) . "0" . substr($wo,-5);             #   becomes YYYY0ddddd
        } elsif ($wo > 200251230 && $wo < 210000000) {              # Almost in expanded form YYYYddddd
            $wo=substr($wo,0,4) . "0" . substr($wo,-5);             #   becomes YYYY0ddddd
        } elsif ($wo > 1960000000 && $wo < 2002051231) {            # Already in expanded form YYYYdddddd
            # num is fine as is, YYYYdddddd
        } elsif ($wo > 1900000000 && $wo < 2100000000) {            # Already in expanded form YYYYdddddd
            # num is fine as is, YYYYdddddd
        } else {
            # This is also where an unparseable WO number ends up, since it compares as 0.
            die "Invalid WO patent number ($wo).\n";
        }
        $num{$my_patn}=$wo;

    } elsif ($cc{$my_patn} eq "EP") {                   # Not US or WO. Is it EP?
        #   eg, EP0012345A1 gives num=12345 and kind=A1
        ($num{$my_patn},$kind{$my_patn})=$my_patn=~/^EP0*(\D*\d+)(\D.{0,5})?$/i;
        $TPS_kind{$my_patn}=$kind{$my_patn};

    } elsif ($cc{$my_patn} eq "DE") {                   # Not US or WO or EP. German Patent maybe?
        #   eg, DEK4000008U1 gives num=K4000008 and kind=U1
        ($num{$my_patn},$kind{$my_patn})=$my_patn=~/^DE0*(\D*\d+)(\D.{0,5})?$/i;
        $TPS_kind{$my_patn}=$kind{$my_patn};
        if (substr($num{$my_patn},0,1) =~ /[A-Z]/) {    # If so, check for Delphion's mangling.
            # We'll undo Delphion's mangling of the patent number, changing the letter back into
            # two digits (see the %DE hash set up in ProcessInput), as well as adding the 200 back
            # to the year. For example, since $DE{'K'}=20, we'll change num from K4000008 to
            # 202004000008.
            substr($num{$my_patn},0,1)="$DE{substr($num{$my_patn},0,1)}200";
        }

    } elsif ($cc{$my_patn} eq "JP") {                   # Not US or WO or EP or DE. Is it Japanese?
        #   eg, JP23092902A2 gives num=23092902 and kind=A2
        ($num{$my_patn},$kind{$my_patn})=$my_patn=~/^JP0*(\D*\d+)(\D.{0,5})?$/i;
        # Delphion removes the middle two zeros from some Japanese patents, so we
        # got to put them back in. For example, JP23092902A2 -> JP2003092902A
        # (Numbers that don't look like that are presumably pre-2000 patents and stay as they are.)
        if (($num{$my_patn} =~ /2\d{7}/) && ($kind{$my_patn} =~ /[ATU]2/)) {
            $num{$my_patn}="200" . substr($num{$my_patn},1);
        }
        # Whatever kind was requested, TPS is always asked for A1.
        $TPS_kind{$my_patn}="A1";
        logit("g57: Japanese patent num=$num{$my_patn} and TPS_kind=$TPS_kind{$my_patn}\n");

    } elsif ($cc{$my_patn} eq "IT") {                   # It's not US or WO or EP or DE or JP. Is it IT?
        # Nothing is parsed for Italian patents yet, so num, kind and TPS_kind stay undefined.
        #
        # I don't quite know how to handle Italian patent numbers, so leave this for later, Rick.
        # Italian patent numbers with the city code, need to have the embedded 0's removed,
        # and Delphion's kind normalized, e.g. ITMI932388A1 -> IT_MI932388_A
        # and Delphion's kind normalized, e.g. ITMN990024A1 -> IT_MN99024_A1
        # FYI, normally, IT-CityCode patents in raid.espace have 6 digits. There are no IT rows in raid.pdf.
        # The handful of exceptions are IT T0961019 A     IT T0961019A    I
        #                               IT MI9602175 A    IT MI9602175A   I
        #                               IT BO98000329 A   ITBO98000329A   I
        #                               IT MN99024 A1     IT MN99024A1    I
        #                               IT MI9602033 A1   IT MI9602033A1  I
        #                               IT MI20000136 A1  ITMI20000136A1  I
        #                               IT MI20000852 A1  ITMI20000852A1  I
        # The question is, do any of these exist in our main table and if so, how exactly?
        # if ($my_patn=~/^IT([A-Z]{2})0*(\d+)(\D.{0-1}?)$/i) ...   # if ITMI96002033A1 or ???
        #    $num{$my_patn}=$2;                                    # then 521230 or 12345 or 1 or MI1234
        #    $kind{$my_patn}=$3;                                   # A1 or A1 or A1 or A1

    } else {                # Not US or WO or EP or DE or JP or IT. Who knows what it is?
        if ($my_patn=~/^\D\D0*(\d+)(\D.{0,5})?$/i) {    # eg, FR2075483A5
            $num{$my_patn}=$1;                          # 2075483
            $kind{$my_patn}=$2;                         # A5
            $TPS_kind{$my_patn}=$2;                     # A5
            $TPS_kind{$my_patn} =~ s/_*$//g;            # Strip any trailing underscores, ie GBnnA_
        }
    } # End of the big, ol' if the country ($cc) is this, that, or the other thing.

    # Predict what key TPS will use/return in its patent:requested_patent patentnumber="..." field.
    # When we parse TPS's XML reply, we'll need to be able to map this back to what our input was.
    $TPS_key{$my_patn}="$cc{$my_patn} $num{$my_patn}" . ($TPS_kind{$my_patn} ? " $TPS_kind{$my_patn}" : "");
    $Requested_patn{$TPS_key{$my_patn}}=$my_patn;
    logit("g58: For $my_patn, num=$num{$my_patn}, kind=$kind{$my_patn} TPS_kind=$TPS_kind{$my_patn}, TPS_key=$TPS_key{$my_patn} and Requested_patn=$Requested_patn{$TPS_key{$my_patn}}\n");

    # Work out the two directory levels from the last run of digits in num.
    # Pad dirs with zero if num < 1000
    if ($num{$my_patn} =~ /.*(\d\d)(\d\d)/) {
        $Dir1{$my_patn}=$2;
        $Dir2{$my_patn}=$1;
    } elsif ($num{$my_patn} =~ /.*(\d)(\d\d)/) {
        $Dir1{$my_patn}=$2;
        $Dir2{$my_patn}="0$1";
    } elsif ($num{$my_patn} =~ /.*(\d\d)/) {
        $Dir1{$my_patn}=$1;
        $Dir2{$my_patn}="00";
    } elsif ($num{$my_patn} =~ /.*(\d)/) {
        $Dir1{$my_patn}="0$1";
        $Dir2{$my_patn}="00";
    } else {
        logit("g58a: Problem finding directory. Using 00/00 for $my_patn, num=$num{$my_patn}, kind=$kind{$my_patn} TPS_kind=$TPS_kind{$my_patn}, TPS_key=$TPS_key{$my_patn} and Requested_patn=$Requested_patn{$TPS_key{$my_patn}}\n");
        $Dir1{$my_patn}="00";
        $Dir2{$my_patn}="00";
    }

    if ($Patolis) {
        # Here is where we limit Patolis to just US, WO, or EP images.
        # Note that this ends the whole program, without any output, not just this patent.
        if ($cc{$my_patn} ne "US" && $cc{$my_patn} ne "WO" && $cc{$my_patn} ne "EP") {exit}
        $PermImageStoreDir{$my_patn}="";                # Patolis has no permanent image store.
        $TempImageCacheDir{$my_patn}="/ips/images/cache/$Dir1{$my_patn}/$Dir2{$my_patn}";   # Temporary Read-Write Image Cache
    } else {
        # San Jose & EDC permanent image store.
        $PermImageStoreDir{$my_patn}="/dfs/images/$cc{$my_patn}/$Dir1{$my_patn}/$Dir2{$my_patn}";
        $TempImageCacheDir{$my_patn}="/dfs/dlcache/$Dir1{$my_patn}/$Dir2{$my_patn}";        # Temporary Read-Write Image Cache
    }

    # Predict what filename TPS stores this image as in its /widas tree.
    # These may change later if we allow EP-to-WO substitutions or if TPS uses one of its funky kinds.
    $default_perm_fn_prefix{$my_patn}="$cc{$my_patn}$num{$my_patn}$TPS_kind{$my_patn}";
    logit("g59: PermImageStoreDir=$PermImageStoreDir{$my_patn} TempImageCacheDir=$TempImageCacheDir{$my_patn} and default_perm_fn_prefix=$default_perm_fn_prefix{$my_patn}\n");

    if ($from && $mode eq "GET") {
        # For forced image retrieval from a specific server, we will write the image
        # we're about to go get, into our Temporary Image Cache unless we already have
        # an image there, then target our home directory.
        if (-f "$TempImageCacheDir{$my_patn}/$my_patn.tif" || -f "$TempImageCacheDir{$my_patn}/$my_patn.pdf") {
            # Take the first element of the list: glob in scalar context is an iterator and
            # would hand back undef the second time we come through here in the same run.
            $TempImageCacheDir{$my_patn}=(glob("~"))[0];
        }
        logit("g5a: Will put image from $from into $TempImageCacheDir{$my_patn}\n");
    }
} # End of the parse_patn routine
##################################################################################
#                                                                                #
# Query TPS for all the data for the passed patent number(s).                    #
#                                                                                #
# Parameters: the list of requested patent numbers (the keys parse_patn used).   #
# Patents whose default image we can already read are left out of the query.    #
# Returns nothing useful; it returns early, leaving everything untouched, when  #
# there is nothing to ask about or when the HTTP request fails. It dies on a    #
# 'type=' value it doesn't know.                                                 #
#                                                                                #
# It establishes the following set of arrays for each patent:                    #
#      TPS_count                                                                 #
#      TPS_default      (index to image TPS considers "best")                    #
#                                                                                #
# And the following set of arrays for each image for each patent:                #
#      Image_cc                                                                  #
#      Image_nn                                                                  #
#      Image_kind                                                                #
#      Image_patentnumber                                                        #
#      Image_patentnumber_real                                                   #
#      Image_filer                                                               #
#      $Image_method (always "local" or "remote" as far as I've ever seen)       #
#      $Image_size   (might be undefined)                                        #
#      $Image_pages  (might be undefined)                                        #
#      $Image_volume (might be undefined)                                        #
#      $Image_type   (always "Original" as far as I've ever seen)                #
#                                                                                #
# Finally, for each patent it asked about, it resets PermImageStoreDir,          #
# TempImageCacheDir and default_perm_fn_prefix to match TPS's default image.     #
#                                                                                #
##################################################################################
sub Get_TPS_Info {
    # Build the list of patents to ask TPS about.
    my $TPS_patn_list="";
    foreach my $this_patn (@_) {
        if (! -r $Image_filer{$TPS_default{$this_patn}}) {
            logit("g60: \$this_patn=$this_patn cc=$cc{$this_patn} num=$num{$this_patn} TPS_kind=>$TPS_kind{$this_patn}<\n");
            $TPS_patn_list.="&patentnumber=$cc{$this_patn}_$num{$this_patn}" . ($TPS_kind{$this_patn} ? "_$TPS_kind{$this_patn}" : "");
        }
    }
    logit("g61: Final \$TPS_patn_list=$TPS_patn_list\n");
    if (! $TPS_patn_list) {return}      # No images to get? That's odd.

    # supercp "http://www.thomsonpatentstore.net/charon/charon?charon:customer=patolis\
    #     &charon:service=patent_information_pdf&patentnumber=EP_1441797_A2" %stdout
    #
    $query_url="charon/charon?charon:customer=$TPS_id&charon:service=patent_information_pdf$TPS_patn_list";
    $query_url=~s/\+/%2B/g;             # Sometimes kind=C+ as for SE00101465C+
    $ua = LWP::UserAgent->new(timeout=>10);     # Give TPS 10 seconds to respond. Note this isn't a
                                                # total time, timeout. IE if TPS starts to respond
                                                # but takes 10 minutes to deliver everything, that's
                                                # ok - we won't time out. This timeout happens if we
                                                # ever go this long without receiving any bytes.
    logit("g62: Query URL = \"http://$TPS_Server/$query_url\"\n");
    my $response = $ua->get("http://$TPS_Server/$query_url");   # Query TPS.
    if (! $response->is_success) {              # Check the outcome of the response.
        # For example, if the server is down, the error message will be
        # Query TPS Error 500 Can't connect to www.thomsonpatentstore.net:80 (Interrupted system call)
        logit("g63: Query TPS Error " . $response->status_line . "\n");
        return;
    }

    # The HTTP Headers returned by TPS aren't interesting. The content though, is.
    # But to document, here's what we get back for
    # supercp "http://www.thomsonpatentstore.net/charon/charon?charon:customer=delphion\
    #     &charon:service=patent_information_pdf&patentnumber=EP_1441797&patentnumber=WO_132477" %stdout
    #
    # HTTP Header: Date: Mon, 14 Mar 2005 22:43:46 GMT
    # HTTP Header: Server: Apache/2.0.48 (Unix) mod_ssl/2.0.48 OpenSSL/0.9.6l mod_jk/1.2.5
    # HTTP Header: Cache-Control: no-cache, no-store, max-age=0
    # HTTP Header: Connection: close
    # HTTP Header: Content-Type: text/xml; charset=UTF-8
    #
    # followed by the XML body, including an extra blank line at the end. An "equivalence"
    # request such as EP_1441797 on its own (Wed, 09 Mar 2005) comes back with the same
    # headers and the same kind of body.
    #
    # The sample bodies aren't reproduced here. Going by what the parser below looks for,
    # a body is a series of angle-bracketed stanzas:
    #    ?xml version=...                                  (ignored)
    #    patent:information_pdf ...                        (ignored)
    #    patent:requested_patent patentnumber="..." available="true" or "false"
    #                                                      (one per patent we asked about)
    #    patent:patent patentnumber="..." patentnumber_real="..." method="..." ...
    #                                                      (one per image TPS has for that patent)
    # Anything else, closing tags included, matches none of the tests and is skipped.
    #
    $content= $response->content();

    # ------------------------ Parse image data. ------------------------ #
    # Loop through the output (our input), keying off the angle-bracketed stanzas. Each time
    # around, $1 is the text found between one "<" and the next ">".
    while ($content =~ /<(.*?)>/g) {
        $thisStanzaData=$1;
        logit("g64: Got =>$thisStanzaData<=\n");
        if ($thisStanzaData =~ /^\?xml version=/) {next}                # Ignore beginning junk stanza
        if ($thisStanzaData =~ /^patent:information_pdf /) {next}       # Another junk stanza
        #
        # The normal case has '... available="true">' at the end of the patent:requested_patent line,
        # but for the invalid or missing patents, the patent:requested line ends with available="false" />,
        # that is, something like   patent:requested_patent patentnumber="..." available="true"
        # versus                    patent:requested_patent patentnumber="..." available="false" /
        # This was explained to me as being perfectly valid XML syntax. Oh, ok.
        if ($thisStanzaData =~ /^patent:requested_patent patentnumber="(.*)" available="false"/) {
            # Grab TPS's key and translate it back to the original input'd patent number.
            my $tps_key=$1;
            $this_patent=$Requested_patn{$tps_key};
            $TPS_count{$this_patent}=0;
            logit("g65: There are no images for $this_patent and TPS's key=>$tps_key<.\n");
            next;
        }
        if ($thisStanzaData =~ /^patent:requested_patent patentnumber="(.*)" available="true"/) {
            # Grab TPS's key and translate it back to the original input'd patent number.
            # The patent:patent stanzas that follow belong to this patent.
            my $tps_key=$1;
            $this_patent=$Requested_patn{$tps_key};
            $TPS_count{$this_patent}=0;
            logit("g66: Am now working on $this_patent and TPS's key=>$tps_key<\n");
            next;
        }
        if ($thisStanzaData =~ /^patent:patent (patentnumber=".*")/) {
            my $this_data=$1;
            # First of all, pick out the real patent number to see if we're going to disallow
            # this EP-to-WO substitution. The kind is optional: a kind-less value such as
            # "US 1234" gives an undefined kind, which the code below copes with.
            if ($this_data=~/ patentnumber_real="(..) ([^ "]+)(?: ([^"]+))?"/) {
                # Save the pieces straight away, before any other pattern match can disturb them.
                my ($real_cc,$real_nn,$real_kind)=($1,$2,$3);
                # For EP-to-WO substitutions, patentnumber_real here is the WO substitute.
                if (($cc{$this_patent} eq $real_cc) || $Automatically_Substitute || $mode eq "QUERY") {
                    # We're cool. Either this isn't a EP-to-WO substitution or we're allowing
                    # substitutions, or we're just querying, either way, accept this image.
                    #
                    # Save each piece of the patent:patent stanza for later use.
                    #    patent:patent
                    #    patentnumber="WO 2001032477 R8A2"
                    #    patentnumber_real="WO 2001032477 R8A2"
                    #    method="local"
                    #    size="118591"
                    #    pages="3"
                    #    default="default"      <-- Perhaps
                    #    volume="MIWO2001049"
                    #    filer="/widas/WO/77/24/WO2001032477R8A2.pdf"
                    #    type="Original"
                    $TPS_count{$this_patent}++;
                    my $thiskey="$this_patent#$TPS_count{$this_patent}";    # Key for the per-image hashes
                    logit("g67: Am now working on Image # $TPS_count{$this_patent} for $this_patent thiskey=>$thiskey<\n");
                    logit("g68: this_data=>$this_data<\n");
                    $Image_cc{$thiskey}=$real_cc;
                    $Image_nn{$thiskey}=$real_nn;
                    $Image_kind{$thiskey}=$real_kind;
                    $Image_patentnumber_real{$thiskey}="$Image_cc{$thiskey} $Image_nn{$thiskey}" . ($Image_kind{$thiskey} ? " $Image_kind{$thiskey}" : "");
                    logit("g69: set Image_cc=$Image_cc{$thiskey} Image_nn=$Image_nn{$thiskey} Image_kind=$Image_kind{$thiskey} Image_patentnumber_real=>$Image_patentnumber_real{$thiskey}<\n");
                    if ($this_data=~/patentnumber="(.+?)"/) {
                        $Image_patentnumber{$thiskey}=$1;
                        logit("g6a: set Image_patentnumber{$thiskey} to >$Image_patentnumber{$thiskey}<\n");
                    }
                    if ($this_data=~/ method="(.+?)"/) {$Image_method{$thiskey}=$1}
                    if ($this_data=~/ size="(.+?)"/)   {$Image_size{$thiskey}=$1}
                    if ($this_data=~/ pages="(.+?)"/)  {$Image_pages{$thiskey}=$1}
                    # Handle the cases where TPS fails to designate a default patent by setting the
                    # first patent we get, as the default.
                    if ($TPS_count{$this_patent}==1) {
                        $TPS_default{$this_patent}=$thiskey;    # This will likely get replaced by the real default.
                        logit("g6b: TPS_default initially set to $thiskey\n");
                    }
                    if ($this_data=~/ default="default"/) {
                        $TPS_default{$this_patent}=$thiskey;
                        logit("g6c: TPS_default default reset to $thiskey ...\n");
                    }
                    if ($this_data=~/ volume="(.+?)"/) {$Image_volume{$thiskey}=$1}
                    # TPS's filer path is not recorded on a Patolis host.
                    if (! $Patolis && $this_data=~/ filer="(.+?)"/) {$Image_filer{$thiskey}=$1}
                    if ($this_data =~ / type="(.+?)"/) {
                        $Image_type{$thiskey}=$1;
                        logit("g6d: Image_type=$Image_type{$thiskey}\n");
                        # Joachim's Charon spec says this may be "JAPIO" if it's an
                        # "JP JAPIO PDF, english translation of the JP title page"
                        # but I've never seen one of those things. But it IS indeed "JAPIO" for
                        # the Japanese language image, e.g. JP23092902A2 (JP_2003092902_A1).
                        if ($Image_type{$thiskey} ne "Original" && $Image_type{$thiskey} ne "JAPIO") {
                            die "Strange 'type=' value ($Image_type{$thiskey}) for $thiskey.\n";
                        }
                    }
                    logit("g6e: For key=$thiskey, Image_patentnumber=>$Image_patentnumber{$thiskey}< Image_patentnumber_real=>$Image_patentnumber_real{$thiskey}< Image_method=$Image_method{$thiskey} Image_size=$Image_size{$thiskey} Image_pages=$Image_pages{$thiskey} TPS_default=$TPS_default{$this_patent} Image_volume=$Image_volume{$thiskey} Image_filer=$Image_filer{$thiskey} and Image_type=$Image_type{$thiskey}.\n");
                } else {
                    logit("g6f: Ignoring disallowed substitution =>$this_data<\n");
                } # End of the It's cool to accept this line
            } # End of the parsing check for the "patentnumber_real=" field
        } # End of the check for the "patent:patent (patentnumber=..." line
    } # End of the loop to consume all the output of the TPS query

    foreach my $this_patn (@_) {
        if (! -r $Image_filer{$TPS_default{$this_patn}}) {
            # It's possible that our default patent has changed from our initial parse_patn guess, so
            # adjust it now that we have TPS's opinion of what image the default should be.
            $thiskey=$TPS_default{$this_patn};
            # Split the real patent number into country, the last two pairs of digits, and kind,
            # eg  WO 200205 12 30 A1. A number without a kind, or with fewer than four digits
            # in a row, doesn't match and leaves parse_patn's guesses in place.
            if ($Image_patentnumber_real{$thiskey} =~ /^(..) .*(\d\d)(\d\d)\D* (.*)$/i) {
                if ($Patolis) {
                    $TempImageCacheDir{$this_patn}="/ips/images/cache/$3/$2";   # Reset Temporary Read-Write Image Cache
                } else {
                    # San Jose & EDC permanent image store.
                    $PermImageStoreDir{$this_patn}="/dfs/images/$1/$3/$2";      # Reset Permanent Image Store and
                    $TempImageCacheDir{$this_patn}="/dfs/dlcache/$3/$2";        # Temporary Read-Write Image Cache
                }
                $default_perm_fn_prefix{$this_patn}="$Image_cc{$thiskey}$Image_nn{$thiskey}$Image_kind{$thiskey}";
            }
            logit("g6g: PermImageStoreDir=$PermImageStoreDir{$this_patn} TempImageCacheDir=$TempImageCacheDir{$this_patn} and default_perm_fn_prefix=$default_perm_fn_prefix{$this_patn}\n");
        }
    }
} # End of the Get_TPS_Info subroutine
##################################################################################
# #
# Get the default image(s) from TPS, write the file(s) into the local image #
# cache, and save the filename(s) of the file(s) we wrote, into Image_filer. #
# #
# We don't get images if we can already see them in our NFS /widas mount, #
# and we'll write the retrieved images in our temporary image cache. #
# #
##################################################################################
sub Get_TPS_Images {
# Don't go to TPS if this request came from TPS. This avoids an endless loop
# when we think they have it and they think we have it.
if (($ENV{'REQUEST_ADDR'} eq 63.84.162.201) || # marge01us
($ENV{'REQUEST_ADDR'} eq 195.27.130.113)) { # marge01eu
logit("g70: Endless loop avoided. REQUEST_ADDR=$ENV{'REQUEST_ADDR'}\n");
return;
}
# Verify I can get to each patent's default image. For those images I can't see,
# go get it (probably from eSpaceNet), put the image in our temporary image cache,
# and it's fn in $Image_filer{$TPS_default{$this_patn}}.
$TPS_patn_list="";
foreach $this_patn (@in_patns) {
$thiskey=$TPS_default{$this_patn};
if (! $thiskey) {next} # Skip if TPS didn't have any images
logit("g71: cc=$cc{$this_patn} Image_cc{$thiskey}=$Image_cc{$thiskey} and we are" . ($Automatically_Substitute ? "" : " not") . " automatically substituting and TPS_default=$thiskey\n");
# If found in my local file system, great! Else save in my ask-TPS list.
if (-r $Image_filer{$thiskey}) {
$Gotten_From{$this_patn}="local file system";
} else {
my $tmp_patn="$Image_cc{$thiskey}_$Image_nn{$thiskey}_$Image_kind{$thiskey}";
$TPS_patn_list.="&patentnumber=$tmp_patn";
# Insure I can get back to my original patent number ($this_patn).
# The key may be different due to a change in kind, e.g. A2 -> R8A2.
$TPS_key{$this_patn}="$Image_cc{$thiskey} $Image_nn{$thiskey} $Image_kind{$thiskey}";
$Requested_patn{$TPS_key{$this_patn}}=$this_patn;
$Requested_key{$TPS_key{$this_patn}}=$thiskey;
logit("g72: Can't see $Image_filer{$thiskey} so $tmp_patn added to TPS_patn_list and reset TPS_key{$this_patn}=$TPS_key{$this_patn} and Requested_key=$Requested_key{$TPS_key{$this_patn}}\n");
}
}
logit("g73: Final \$TPS_patn_list=$TPS_patn_list\n");
if (! $TPS_patn_list) {return} # Great, we have all the images. There's nothing to get.
# "http://www.thomsonpatentstore.net/charon/charon?charon:customer=patolis\
# &charon:service=patent_link_pdf&patentnumber=$key" %stdout
#
$get_link_url = "http://$TPS_Server/charon/charon?charon:customer=$TPS_id&charon:service=patent_link_pdf$TPS_patn_list";
$get_link_url =~ s/\+/%2B/g; # Sometimes kind=C+ as for SE00101465C+
$ua = LWP::UserAgent->new(timeout=>300); # Give TPS 5 minutes to respond. TPS may have to fetch
# it from espacenet and this could take a coupla minutes.
logit("g74: Get Link URL = \"$get_link_url\" \n\n");
my $response = $ua->get($get_link_url); # Get Link URL (hopefully) from TPS.
if (! $response->is_success) { # Check the outcome of the response.
# For example, if the server is down, the error message will be
# Get TPS Error 500 Can't connect to www.thomsonpatentstore.net:80 (Interrupted system call)
logit("g75: Get TPS Error: " . $response->status_line . "\n");
return;
}
# To document what's returned by TPS, here is a patent_link_pdf call for WO_1989004114_A3,
# EP_1344442_A1 and a non-existent, EP_9999442_A1.
#
# http://www.thomsonpatentstore.net/charon/charon?charon:customer=patolis&charon:service=patent_link_pdf
# patentnumber=WO_1989004114_A3&patentnumber=EP_1344442_A1
#
# HTTP Header: Date: Fri, 18 Mar 2005 04:49:41 GMT
# HTTP Header: Server: Apache/2.0.48 (Unix) mod_ssl/2.0.48 OpenSSL/0.9.6l mod_jk/1.2.5
# HTTP Header: Cache-Control: no-cache, no-store, max-age=0
# HTTP Header: Connection: close
# HTTP Header: Content-Type: text/xml; charset=UTF-8
#
#
# http://marge01eu.thomsonpatentstore.net/pdf/WO1989004114R4A3.pdf
# http://marge01eu.thomsonpatentstore.net/pdf/WO2002051230A1.pdf
#
#
# <--- Including this extra blank line.
#
# The only relevant lines in our response are those long ones (line breaks added below for readability)
# \
# http://marge01eu.thomsonpatentstore.net/pdf/WO1989004114R4A3.pdf
#
# \
# http://marge01eu.thomsonpatentstore.net/pdf/WO2002051230A1.pdf
#
#
$content= $response->content();
logit("g76: Got content >$content<\n");
#
# We use Perl's g modifier (see "Progressive Matching" in the Perl book, page 180) and
# minimal matching (+?).
#
my $tmp_patn="";
while ($content =~ /(http:\/\/.*?)<\/patent:patent_link>/gs) {
# Grab TPS's key and translate it back to the original input'd patent number.
# For example, use "WO 2001032477 R8A2" to get back to WO00132477A2.
logit("g77: \$1 is our tmp_TPS_key=>$1<\n");
$this_patent=$Requested_patn{$1};
$thiskey=$Requested_key{$1};
logit("g78: this_patent=$this_patent and thiskey=>$thiskey<\n");
$temp_fn="$TempImageCacheDir{$this_patent}/$this_patent.$myhostname.$$.pdf";
logit("g79: Will put image at $temp_fn\n");
$this_TPS_link=$2; # URL, eg http://marge01eu.thomsonpatentstore.net/pdf/WO2001032477R8A2.pdf
logit("g7a: Got =>$this_TPS_link<= for $this_patent\n");
if (! $this_TPS_link) {next} # Sanity check. Should have valid URL but may not
# if TPS doesn't have an image for this patent.
###########################################################################
# #
# At this point, $this_TPS_link is the URL of our image file, e.g. #
# http://marge01eu.thomsonpatentstore.net/pdf/WO2001032477R8A2.pdf #
# #
# Go get the PDF file, sticking it in a temporary image file, then if #
# it looks ok, rename it at the last moment to a permanent image name. #
# #
###########################################################################
$ua = LWP::UserAgent->new(timeout=>40); # Give TPS 40 seconds to respond. Note this isn't a
# total time, timeout. IE if TPS starts to respond
# but takes 20 minutes to deliver everything, that's
# ok - we won't time out. This timeout happens if we
# ever go this long without recieving any bytes.
# Send output to a temporary filename and rename if complete. This avoids the problem of
# the user getting impatient (it takes about 17 seconds for a 1.7 MB file to get copied
# from TPS to Japan), and retries his request. What was happening was the second process
# saw and used the partially-written tif file, resulting in bizarreness.
my $response = $ua->get($this_TPS_link,':content_file'=> $temp_fn);
if ($response->is_success) { # Check the outcome of the response
# The HTTP Headers returned by TPS include
# HTTP Header: Date: Tue, 02 Nov 2004 19:25:36 GMT
# HTTP Header: Server: Apache/2.0.48 (Unix) mod_ssl/2.0.48 OpenSSL/0.9.6l mod_jk/1.2.5
# HTTP Header: Last-Modified: Fri, 08 Aug 2003 14:14:08 GMT
# HTTP Header: ETag: "963f20-5e201-73286c00"
# HTTP Header: Accept-Ranges: bytes
# HTTP Header: Content-Length: 385537
# HTTP Header: Connection: close
# HTTP Header: Content-Type: application/pdf
#
$length= $response->header('Content-Length');
logit("g7b: $temp_fn file is " . (-s $temp_fn) . "-bytes (" . ((-s $temp_fn == $length) ? "right" : "wrong") . ").\n");
# Perform some sanity checking on our resulting image. PDF files start with %PDF
# and end with %%EOF. Also check number of bytes, since it's handy to do so.
if (! stat $temp_fn ||
`/bin/head -c4 $temp_fn` !~ /%PDF/ ||
`/bin/tail -c7 $temp_fn` !~ /%%EOF/ ||
$length != -s $temp_fn) {
if (! stat $temp_fn) {logit("g7c: $temp_fn does not exist.\n")}
elsif (`/bin/head -c4 $temp_fn` !~ /%PDF/) {logit("g7d: head -c4 $temp_fn is not %PDF\n")}
elsif (`/bin/head -c4 $temp_fn` !~ /%PDF/) {logit("g7e: head -c4 $temp_fn is not %PDF\n")}
elsif (`/bin/tail -c7 $temp_fn` !~ /%%EOF/) {logit("g7f: tail -c7 $temp_fn is not %%EOF\n")}
elsif ($length != -s $temp_fn) {logit("g7g: File size(" . (-s $temp_fn) . ") is not $length.\n")};
unlink $temp_fn unless ($debug); # Image retrieval failed for this image server.
##############################################################################
# #
# Consider retrying this error, not to the other server like I do above, #
# but to the same server. It'll give me a chance to use another goto. ;-) #
# #
##############################################################################
$perm_fn="";
} else { # We've got a good PDF image file, so determine our permanent
# file name, and try to rename it to that. If a file already
# exists with our permanent name, then leave it with the temp name.
# One "rule" that makes our lives easier is, if you're ever going to write
# into our temporary image cache directory, use the requested patent in
# the file name. This makes finding it next time a whole lot easier.
# This works fine except in the EP-to-WO substitution case. Rick, maybe address later.
$perm_fn="$TempImageCacheDir{$this_patent}/$this_patent.pdf";
if (! -f $perm_fn) {
logit("g7h: Renaming to $perm_fn.\n");
if (! rename "$temp_fn", "$perm_fn") {
die "Rename $temp_fn to $perm_fn failed.\n";
}
}
}
} else { # Oops, image retrieval failed for this image server.
die ("Image TPS error. Status_line=>", $response->status_line , "\n");
}
$Image_filer{$thiskey}="$perm_fn";
$Gotten_From{$this_patent}="TPS";
logit("g7i: Image_filer{$thiskey}=$Image_filer{$thiskey} gotten from $Gotten_From{$this_patent}\n");
}
} # End of the Get_TPS_Image subroutine
# Use the USPTO website to pull each of the individual TIFF image pages of a
# patent, then use any2any to combine them into a single multi-page TIFF file,
# writing $TempImageCacheDir{$this_patn}/$in_patn.tif.
#
# This code was modified from Rick's /dfs/ipntools/uspto_pull_image.pl
#
# If the US PTO's image has Reexaminations or Certificate of Corrections
# appended at the end, we get them, too.
#
sub Get_USPTO_Image {
# Download every page of one US patent (or US application) image from the
# US PTO web site and combine the pages into a single multi-page TIFF with
# any2any.
#
# Argument:
#   $in_patn - patent number; used to name the working directory, the any2any
#              control file, the per-page files and the final .tif.
# Package variables read here: %TempImageCacheDir, $this_patn, $myhostname,
#   $is_US_app, $US_docid1, $US_docid2, $any2anyDir and $debug.
# Returns:
#   - "$perm_fn" when a readable file exists at the permanent name at the end;
#   - ""         when it does not;
#   - a bare return (undef / empty list) on the early aborts: failed HTTP
#     request, "no match" page, unexpected image-link format, missing page
#     count, working directory that cannot be created, or a page download
#     failure.
# Dies when the control file cannot be opened or the final rename fails.
my $in_patn=shift;
# NOTE(review): "my" binds tighter than the comma, so only $pagecount is
# lexical on the next line; $idkey and $imagelink_url are assigned as package
# variables.
my $pagecount=0, $idkey="", $imagelink_url="";
# NOTE(review): the cache directory is looked up with the package variable
# $this_patn, not with the $in_patn argument - presumably the caller keeps the
# two in step; confirm.
my $working_dir="$TempImageCacheDir{$this_patn}/$in_patn.$myhostname.$$";
my $ctl_file="$working_dir/$in_patn.ctl";
my $temp_fn="$TempImageCacheDir{$this_patn}/downloading.$myhostname.$$.$in_patn.tif";
my $perm_fn="$TempImageCacheDir{$this_patn}/$in_patn.tif";
# We need to get the US PTO's image link. The URL for granted images looks like
# href=http://patimg1.uspto.gov/.piw?Docid=05551212
# &homeurl=http%3A%2F%2F164.195.100.11%2Fnetacgi% ... this piece is pretty long ...
# &PageNum=
# &Rtype=
# &SectionNum=
# &idkey=BD21661A21D6
#
# The US Application image URL is similar, but uses a different server and .aix
# instead of .piw. From both URL's, the only 2 parms we really need are Docid and
# idkey. We know the format of Docid and if we could figure out the idkey format,
# we could generate the .piw or .aiw URL's ourselves. Alas though, we can't decipher
# idkey, so we gotta "steal" the whole link URL from their details view.
if ($is_US_app) {
# The URL for US Applications, are a bit different than for Granted images.
# The server for example, is different, as is some parms (d=PG01 versus d=PALL).
$details_url="http://151.207.241.118/netacgi/nph-Parser?Sect1=PTO1&Sect2=HITOFF&d=PG01&p=1&u=/netahtml/PTO/srchnum.html&r=1&f=G&l=50&s1='$US_docid1'.PGNR.&OS=DN/$US_docid1&RS=DN/$US_docid1";
} else {
$details_url="http://164.195.100.11/netacgi/nph-Parser?Sect1=PTO1&Sect2=HITOFF&d=PALL&p=1&u=/netahtml/srchnum.htm&r=1&f=G&l=50&s1=$US_docid1.WKU.&OS=PN/$US_docid1&RS=PN/$US_docid1";
}
$ua = LWP::UserAgent->new(timeout=>10); # Give the US PTO 10 seconds to respond.
logit("g80: US PTO Details URL = \"$details_url\"\n\n");
my $response = $ua->get($details_url); # Get US PTO Details view.
if (! $response->is_success) { # Abort if we fail.
return;
}
$temp=$response->content();
# logit("g81: Details Content=>$temp<\n");
# The details page says so explicitly when the number matched nothing.
if($temp =~ m/No (?:patents|application publications) have matched your query/) {
return;
}
# NOTE(review): the pattern below and the statements that follow it (which
# presumably capture the image link into $imagelink_url and log it) are not
# intact in this copy of the file - restore them from version control.
if ($temp !~ /
$imagelink_url<\n\n");
# Get what info we can from the image link URL from the US PTO Details page,
# namely the idkey, which is a unique id for this patent/application,
# the image server's host name, and the document ID.
if($imagelink_url !~ m/idkey=([^&>"\n]*)/i) {
logit("g84: Didn't find idkey in image link\n>$imagelink_url<\n");
return; # If not expected image link format, then abort.
}
$idkey = $1;
logit("g85: Found idkey=$idkey\n");
# Find hostname, either patimg1.uspto.gov or patimg2.uspto.gov for US Granted,
# or aiw1.uspto.gov for US Applications.
if($imagelink_url !~ m/http:\/\/([^\/]*)\//) {
logit("g86: Didn't find server name in image link\n>$imagelink_url<\n");
return; # If not expected image link format, then abort.
}
$img_host = $1;
logit("g87: Found img_host=$img_host\n");
# Follow that image link, pulling the image's first page to get the total number
# of pages in the image. The resulting HTML conveniently has this line near the beginning,
#
my $response = $ua->get($imagelink_url); # Follow the image link (first page of the viewer).
if (! $response->is_success) { # Abort if we fail.
return;
}
$temp=$response->content();
# logit("g88: Image Content=>$temp<\n");
if($temp !~ m/NumPages=(\d[\d]*)/i) {
logit("g89: Didn't find page count in image link. I got back $temp\n");
return; # If no image link, then abort.
}
$pagecount = $1;
$pages_to_get=$pagecount;
logit("g8a: There are $pagecount pages to retrieve.\n");
# Pull each TIFF image page. The US PTO presents single-page TIF files inside
# a one-page-at-a-time viewer. The relevant link inside that page is
#
#
# The Docid in this .DImg URL is a bit different than the previous details view,
# which is why we have US_docid1 & US_docid2.
$ua = LWP::UserAgent->new(timeout=>20); # Give US PTO 20 seconds to respond. Note this isn't a
# total time, timeout. IE if US PTO starts to respond
# but takes 20 minutes to deliver everything, that's
# ok - we won't time out. This timeout happens if we
# ever go this long without receiving any bytes.
if (! -r $working_dir ) { # We finally need to make this directory
mkdir $working_dir, 0775;
if (! -r $working_dir ) { return } # Just in case we fail to make the directory.
}
unlink glob("$working_dir/*"); # Insure we are starting fresh.
# The control file lists one "filename <page file>" line per downloaded page;
# it is what gets handed to any2any below.
open CTLFILE, ">$ctl_file" or die "Can't open $ctl_file.";
for ($i=1; $i<=$pages_to_get; $i++) { # Loop to get each page
my $img_url="http://$img_host/.DImg?Docid=$US_docid2&PageNum=$i&IDKey=$idkey&ImgFormat=tif";
my $outfile="$working_dir/$in_patn.page.$i.tif";
logit("g8b: Getting page $i of $pages_to_get in $outfile. URL is\n$img_url\n");
my $response = $ua->get($img_url,':content_file'=> $outfile); # Get this image page.
# A failed request or a zero-length page file aborts the whole retrieval.
if (! $response->is_success || -z $outfile) {
logit("g8c: Failure getting pate $i of $pages_to_get in $outfile\n");
# NOTE(review): CTLFILE is still open when we clean up and return from here.
unlink glob("$working_dir/*") if (! $debug);
rmdir "$working_dir" if (! $debug);
return;
}
print CTLFILE "filename $outfile\n"
}
close CTLFILE;
# At this point, all pages of our US image are sitting in our $working_dir along with
# our $ctl_file. Convert to multi-page tif into a temporary file, then rename.
logit("g8d: Calling any2any $ctl_file $temp_fn\n");
@lines = `$any2anyDir/any2any $ctl_file $temp_fn 2>&1`;
$rc = $?;
if ($rc) { # Did any2any fail?
# NOTE(review): $! here is whatever the last failing system call in this
# process left behind; it may not describe why any2any itself failed.
$errno = $!;
logit("g8e: Write of $temp_fn failed. any2any rc=$rc & errno=$errno\n\n");
foreach $line (@lines) { # Log any messages from any2any.
logit("g8f: $line");
}
} else { # any2any worked ok. Great. How many pages did it write?
$any2any_page_count = 0;
foreach $this_line (@lines) {
# If only 1 page got written, any2any messages says "1 page", not "1 pages".
if ( $this_line =~ /([0-9]+) pages? written OK./) {
$any2any_page_count = $1;
last;
}
}
if ( ! $any2any_page_count ) {
logit("g8g: any2any counted zero pages for $temp_fn\n");
foreach $line (@lines) { # Log any messages from any2any.
logit("g8h: $line");
}
} else { # any2any returned with no errors.
# Check page count of the image file we just wrote, with anyinfo.
# Maybe we got an error writing the image (AFS/DFS/NFS was down?).
$anyinfo_page_count = 0;
logit("g8i: Calling $any2anyDir/anyinfo $temp_fn\n");
@lines = `$any2anyDir/anyinfo $temp_fn`;
foreach $line (@lines) {
$_ = $line;
if ( /^([0-9]+) pages?\./ ) {
$anyinfo_page_count = $1;
last;
}
}
if ( ! $anyinfo_page_count ) {
# $err_count is a package variable; it is only incremented here, never read in this sub.
$err_count++;
logit("g8j: anyinfo counted zero pages for $temp_fn\n");
foreach $line (@lines) { # Log any messages from anyinfo.
logit("g8k: $line");
}
} else { # Insure the two page counts match. They should.
if ( $any2any_page_count != $anyinfo_page_count ) {
logit("g8l: Page count mismatch for $temp_fn: $any2any_page_count vs $anyinfo_page_count\n");
foreach $line (@lines) { # Log any messages from anyinfo.
logit("g8m: $line");
}
} else { # Normal case. All's ok.
logit("g8n: Wrote $anyinfo_page_count pages into $temp_fn at " . scalar localtime() . ".\n");
# Publish the verified file under its permanent name; clean up before dying
# if the rename fails.
if (! rename "$temp_fn", "$perm_fn") {
unlink glob("$working_dir/*") if (! $debug);
rmdir "$working_dir" if (! $debug);
die "Rename $temp_fn to $perm_fn failed.\n";
}
}
} # End of "Insure the two page counts match. They should."
} # End of "any2any returned with no errors."
} # End of "any2any worked ok."
# This test only asks whether $perm_fn is readable; nothing in this sub removes
# an older $perm_fn first, so a file already present there is returned as well.
# On the failure path the working files are kept when $debug is set.
if (-r $perm_fn) { # If everything above was successful and we
unlink glob("$working_dir/*"); # have a readable image, then erase our
rmdir "$working_dir"; # working directories and return the
return "$perm_fn"; # result of our labors.
} else {
unlink glob("$working_dir/*") if (! $debug);
rmdir "$working_dir" if (! $debug);
return ""; # Something above failed. Return nothing.
}
} # End of the Get_USPTO_Image subroutine
# Convert a given PDF file to desired format, using the supplied filename prefix for my
# temporary working files, then when finished, quickly rename it to the supplied
# permanent name.
#
# Converting a PDF file is a two-step process.
# First, /ips/prod/bin/pdftops EP00618926B1.pdf EP00618926B1.ps
# Then, /ips/prod/bin/any2any EP00618926B1.ps EP00618926B1.$Wanted_Image_Type
#
sub Convert_pdf {
    # Turn a PDF into the requested image type by way of PostScript.
    #
    # Arguments (positional):
    #   $inputfilename  - the PDF to convert
    #   $tempfilePrefix - path prefix for scratch files (.ps, .err, .$type)
    #   $permfilePrefix - path prefix for the finished files
    #   $type           - wanted output type, handed through to Call_any2any
    #   $pdf_page       - optional single page to extract from the PDF
    #
    # Returns whatever Call_any2any returns, or nothing when pdftops wrote
    # anything to its error file.  Dies if the .ps file cannot be renamed.
    my ($inputfilename, $tempfilePrefix, $permfilePrefix, $type, $pdf_page) = @_;
    logit("g90: Inside Convert_pdf with inputfilename=$inputfilename\n tempfilePrefix=$tempfilePrefix\n permfilePrefix=$permfilePrefix\n type=$type\n and pdf_page=$pdf_page\n");

    # When one page was asked for, have pdftops emit just that page; this
    # speeds up the later PS conversion.
    my $page_range = $pdf_page ? " -f $pdf_page -l $pdf_page " : "";

    # The resulting PostScript then holds a single page, so the package-level
    # $page (passed on to Call_any2any below) is reset to 1.
    $page = 1 if $pdf_page;

    # A PostScript file already sitting under the permanent prefix lets us
    # skip the pdftops step altogether.
    my $cached_ps = "$permfilePrefix.ps";
    unless (FileExists($cached_ps)) {
        logit("g91: Calling pdftops $page_range $inputfilename $tempfilePrefix.ps\n");
        my $pdftops_call = "$pdftopsCommand $page_range $inputfilename $tempfilePrefix.ps 2>$tempfilePrefix.err";
        system($pdftops_call);

        # Anything on stderr counts as a pdftops failure.
        if (-s "$tempfilePrefix.err") {
            logit("g92: pdftops Error.\n");
            unlink "$tempfilePrefix.ps";
            unlink "$tempfilePrefix.err";
            return;
        }
        unlink "$tempfilePrefix.err";    # Zero-length error output; discard it.

        # Only a non-empty, readable result is moved to its real name.
        if ((-s "$tempfilePrefix.ps") && (-r _)) {
            rename("$tempfilePrefix.ps", $cached_ps)
                or die "Rename $tempfilePrefix.ps to $cached_ps failed.\n";
            logit("g93: Renamed $tempfilePrefix.ps to $cached_ps\n");
        }
    }

    logit("g94: Calling Call_any2any($cached_ps, $tempfilePrefix, $permfilePrefix, $type, $page)\n");
    my $result_fn = Call_any2any($cached_ps, $tempfilePrefix, $permfilePrefix, $type, $page);
    logit("g95: Call_any2any returned $result_fn\n");

    # With Erase_Intermediate_PS_File=0 the PostScript file stays in the
    # cache: one conversion step saved next time, at the cost of disk space.
    unlink $cached_ps if $Erase_Intermediate_PS_File;

    unlink "$tempfilePrefix.err" unless $debug;
    unlink "$tempfilePrefix.ps";     # Shouldn't exist.
    unlink "$tempfilePrefix.tif";    # Shouldn't exist.
    return $result_fn;
} # End of the Convert_pdf subroutine
# Convert a given file to some other format (hopefully and presumably, a format
# understood by any2any), using the supplied filename prefix for my temporary
# working files, then when finished, quickly rename it to the supplied permanent
# name.
#
# This is a simple any2any call,
# /ips/prod/bin/any2any -# num EP00618926B1.tif EP00618926B1.pdf
#
sub Call_any2any {
    # Convert one file to another format with any2any, writing to a temporary
    # name first and renaming to the permanent name once the output is there.
    #
    # Arguments (positional):
    #   $inputfilename  - file to convert
    #   $tempfilePrefix - path prefix for scratch files (.$type and .err)
    #   $permfilePrefix - path prefix of the finished file
    #   $type           - wanted output type, used as the file extension
    #   $pageNum        - optional page number, passed to any2any as '-#' N
    #
    # Package variables read: $any2anyDir, $in_patn, $pdf_creator, $debug,
    #   $cmdline, $Send_Any2any_Error_Mail, $myhostname, $from.
    # Package variables written: $numParm, $mail_TO_recipients,
    #   $mail_CC_recipients, and the PDF_* entries of %ENV for pdf output.
    #
    # Returns "$permfilePrefix.$type", or nothing when any2any reported an
    # error AND left no usable output file.  Note the name is returned even
    # when no rename took place, so callers must still test the file.
    # Dies if the rename to the permanent name fails.
    my ($inputfilename, $tempfilePrefix, $permfilePrefix, $type, $pageNum) = @_;
    logit("g90: Inside Call_any2any with inputfilename=$inputfilename wanting type=$type\n tempfilePrefix=$tempfilePrefix\n permfilePrefix=$permfilePrefix and pageNum=$pageNum\n");
    if ($type eq "pdf") {
        $ENV{"PDF_TITLE"}=$in_patn;
        $ENV{"PDF_SUBJECT"}=$in_patn;
        $ENV{"PDF_CREATOR"}="$pdf_creator";
    }
    # If there's any other output besides the normal
    # Opened: this, that, and the other file
    # ...
    # Opened: /tmp/aaaqffIya
    # hscale=1.000000, vscale=1.000000 (When writing pdf's only)
    # Execution of PostScript Interpreter is complete
    # 7 pages written OK.
    # then any2any got an error, e.g. the file system is full.
    #
    # When creating pdf files, one needs to be cd'd into /ips/prod/bin else you get errors.
    if ($pageNum) { $numParm="'-#' $pageNum" }
    else { $numParm="" }
    my $any2any_full_command = "cd $any2anyDir;export PATH=\$PATH:;export ARCPS=../arcps/current;./any2any -e \"ARCPS=../arcps/current\" -e \"PS_FILT=../arcps/current/aps2ras\" $numParm $inputfilename $tempfilePrefix.$type 2>&1 | /usr/bin/egrep -v '^Opened:|^Execution of PostScript Interpreter is complete|pages? written OK|hscale|Decompression failed' > $tempfilePrefix.err";
    logit("g91: Calling $any2any_full_command");
    # Strangely enough, ipsrun's PATH does not include the current directory, so would get
    # sh: aps2ras: not found
    # sh: aps2ras: not found
    # Cannot open input file /ips/images/cache/87/56/US04965687__.ps or unsupported format
    # when converting postscript files. (So how did this EVER work??) Fix path here.
    system("$any2any_full_command");
    if (-s "$tempfilePrefix.err") { # Error in any2any call?
        # There is a not uncommon problem with any2any that we code around here.
        # Sometimes any2any spits out an error message, yet writes a partial
        # output file. An example is US21035478A1. Converting this 11-page
        # tif file to PDF gives a "Decompression failed with rc = 8f0e" message
        # and an 11-page PDF file that seems ok 'till you view page 11.
        # See bugzilla bug #2264 (closed WORKSFORME by Tom).
        #
        # Eric says to deliver what we can and send e-mail to Rebecca so she can fix the image.
        # In May, 2004, the sending of mail was aborted due to heavy volume.
        if ($Send_Any2any_Error_Mail && ! $cmdline) {
            if ($debug) {
                $mail_TO_recipients="rick.jasper\@thomson.com";
                # No CC while debugging.  Clearing it keeps a stale or empty
                # value from making "-c" swallow the To address below.
                $mail_CC_recipients="";
            } else {
                $mail_TO_recipients="rebecca.hernandez\@thomson.com";
                $mail_CC_recipients="rick.jasper\@thomson.com";
            }
            # Only pass -c when there is somebody to CC.
            my $cc_option = $mail_CC_recipients ? "-c $mail_CC_recipients " : "";
            if (open(my $mail_fh, "|/usr/bin/mail -s'any2any Conversion Error for $in_patn on $myhostname' $cc_option$mail_TO_recipients")) {
                logit("g93: Sending e-mail to $mail_TO_recipients (CCing $mail_CC_recipients) due to $in_patn conversion error.\n");
                # E.G. This is automatically-generated mail from the getimage program due to an any2any conversion error.
                #
                # The image came from EDC. Please investigate.
                # At Mon Jun 9 19:03:16 2003, getimage on dweb3
                #
                # The commands were
                # cd /dfs/prod/ipn/bin
                # ./any2any /dfs/images/US/78/54/US21035478A1.tif /dfs/dlcache/US21035478A1dephds043.85854.pdf
                #
                # The error message generated by any2any was
                # <<>>:Decompression failed with rc = 8f0e
                print {$mail_fh} "This is automatically-generated mail from the $0 program due to an any2any conversion error.\n\n";
                print {$mail_fh} "The image came from $from. Please investigate.\n";
                print {$mail_fh} "At " . scalar localtime() . ", $0 on $myhostname\n\n";
                print {$mail_fh} "The commands were\n";
                print {$mail_fh} " cd $any2anyDir\n ./any2any $numParm $inputfilename $tempfilePrefix.$type\n\n";
                print {$mail_fh} "The error message generated by any2any was \n";
                # Copy any2any's error output into the mail, line by line.
                if (open(my $err_fh, "<", "$tempfilePrefix.err")) {
                    while (my $err_line = <$err_fh>) {
                        print {$mail_fh} " $err_line";
                    }
                    close $err_fh;
                } else {
                    print {$mail_fh} " (unable to read $tempfilePrefix.err: $!)\n";
                }
                close $mail_fh
                    or logit("g92: Cannot send mail to $mail_TO_recipients\n");
            } else {
                # Could not start the mail command: log it and carry on with
                # the conversion result rather than print to a dead handle.
                logit("g92: Cannot send mail to $mail_TO_recipients\n");
            }
        }
        # Only if there was no output file generated by any2any, do we abort.
        # As described above, often we DO have something we can deliver.
        # It may have errors in it or be incomplete, but it's the best we can do.
        if (! -s "$tempfilePrefix.$type") {
            logit("g94: any2any error resulting in a zero-length output file.\n");
            unlink "$tempfilePrefix.$type" if (! $debug);
            unlink "$tempfilePrefix.err" if (! $debug);
            return;
        }
    }
    unlink "$tempfilePrefix.err";
    # Rename temporary file to real name once the conversion is complete.
    if ((-s "$tempfilePrefix.$type") && (-r _)) {
        if (! rename "$tempfilePrefix.$type", "$permfilePrefix.$type") {
            die "Rename $tempfilePrefix.$type to $permfilePrefix.$type failed.\n";
        }
        logit("g95: Renamed $tempfilePrefix.$type to $permfilePrefix.$type\n");
    }
    unlink "$tempfilePrefix.$type"; # Shouldn't exist.
    return "$permfilePrefix.$type";
} # End of the Call_any2any subroutine
# Check for the existence of a particular file and while you're at it,
# erase any zero-length files you find.
sub FileExists {
    # Report whether the named path is a readable, non-empty plain file.
    # A readable plain file of zero length is deleted on the spot and
    # reported as absent.  Returns 1 when the file is usable, 0 otherwise.
    my $path = shift;

    # No name, not a plain file, or not readable: nothing there for us.
    return 0 unless $path && -f $path && -r _;

    # Anything with content is a hit.
    return 1 unless -z $path;

    # Zero-length junk: clean it up and report no file found.
    unlink $path;
    return 0;
} # End of the FileExists subroutine
# If we have debug turned on, write the given line to our logfile, or
# to the console if this is a command line invocation.
sub logit {
    # Debug logging.  Does nothing unless the package variable $debug is true.
    # With $cmdline set the message goes to STDOUT; otherwise it is appended
    # to the file named by $logfile.
    #
    # Argument: the message text (only the first argument is written).
    # Logging is best effort: if the log file cannot be opened the message
    # is dropped rather than printed to an unopened handle.
    return unless $debug;

    my $message = defined $_[0] ? $_[0] : "";

    if ($cmdline) {
        print $message;
        return;
    }

    # Open, append and close on every call so that several processes can
    # share the same log file.
    if (defined $logfile && length $logfile
        && open(my $log_fh, '>>', $logfile)) {
        print {$log_fh} $message;
        close $log_fh;
    }
    return;
} # End of the logit subroutine
# $Header: /cvsroot/ipn/bin/getimage,v 1.19 2005/03/19 04:25:43 jasper Exp $