#!../bin/perl $debug=1; $debug=0; # image: CGI Script to download a patent image file (or just information # about the image). # # This is used by the Patolis/Japanese NPO customer for all of their # images. It will also be used by our other Thomson partner companies, # for example, the Thomson Patent Store. # # Expects a URL like one of these: # http://www.delphion.com/cgi-bin/image/US04936585__ # # - This program is protected by I.P. address. Only requests from "blessed" # addresses are allowed, defined by the $ACCESS_LIST file below. # # - The host can be either www or www5 (Southbury's download server), since they # both share the same cgi-bin directory. # # - The Patent number must be the fully-qualified, 12-character number, that # is, - 2-character country code, 8-digit number, and 2-character kind. # (However, see the not-yet implemented FilterPATN subroutine for ideas) # # - The actual hard work of finding and possibly converting the image, is # all done by the getimage program. See it for more details. # # The following are the optional parameters: # # - type, e.g. type=pdf or type=tif to specify the desired image type. # If unspecified, defaults to whatever image type we can find # (usually tif or pdf). Possible values are whatever the # any2any command can take (or Z if a Japanese patent). # # - cdlabel, e.g. cdlabel=esp1998007 to identify among multiple images # for the same patent. Defaults to null. See the getimage # program for further details on how it handles null cdlabels. # !!! Obsolete. # # - infoonly, e.g. infoonly=1 to return just the information about the # available image(s) for this patent in whatever format # is specified by the type parameter (eg XML), defaulting # to plain text. Data returned is all DB2 columns. # !!! Not implemented. # # - page, e.g. page=1 to only return this specific page. Can also be a # range of pages (used in the NPO-world when printing selected # sections of a patent, eg title & claims). # # # Presuming infoonly was not specified, this cgi-bin script returns the # image in the requested (or defaulted) type. Also returns the following # HTTP headers with example values; # cdlabel: mepa2001063 (if easily available) Not implemented yet. # Content-length: 191691 # Image-Type: pdf # # If infoonly was specified, we return the information about each image # we can find for the given patent. Also not implemented yet. # # To debug from the command line, try # export REMOTE_ADDR=210.197.102.194 # export PATH_INFO=EP00904950A1 # export QUERY_STRING='type=tif&cdlabel=esp1999024' # /dfs/prod/ipn/cgi-bin/image $myhostname = `hostname -s`; chomp $myhostname; $rc = 0; $error = 0; $errorMessage=""; if (! Blessed_User() ) { # Before all else, validate this guy. $error=1; $errorMessage="Access denied."; $rc=403; goto done; } processQueryString(); # In the CGI environment, PATH_INFO is the string after the "cgi-bin/image" and # before the first "?", e.g. "/WO00202709A1", which is the 12-character patent number. $PATN = $ENV{'PATH_INFO'}; $PATN =~ s?/??; # Strip leading slash and possible newline from filename $PATN =~ s/\n//g; if ($PATN eq '') { $error=2; $errorMessage="No patent number specified."; $rc=-1; goto done; } FilterPATN(); # Massage the patent number # At this point, $PATN had better be exactly 12 characters long. if (length($PATN) != 12) { $error=3; $errorMessage="Invalid patent number ($PATN) specified."; $rc=-1; goto done; } # Wanted_Image_Type: Usually "tif", but could be any valid type that # any2any supports. Do a any2any -l to see the valid types, which # include afp af1 bmp can col cra dcx ep2 eps g3 gif im0 im1 ima img ips # irw ix1 jpg lj p12 pbm pcl pct pcx pdf pgm pmp ppd ppm ps ps2 # psc psg ps1 ras ra8 rgb s40 scd sc2 sc4 tek tif xbm xpm xwd yuv # # We could put a check here for valid types if we don't want to allow # everything, or to catch invalid parameters, but I didn't do that. # Instead, I just insure type is 1-3 alphanumerics. # $Wanted_Image_Type = $SARRAY{'type'}; if ($Wanted_Image_Type && $Wanted_Image_Type !~ m/^\w{1,3}$/ ) { $error=4; $errorMessage="Invalid type ($Wanted_Image_Type)"; $rc=-1; goto done; } $cdlabel = $SARRAY{'cdlabel'}; # Get the optional CD label. if ($cdlabel && $cdlabel !~ m/^[\w-]*$/ ) { $error=5; $errorMessage="Invalid CD Label ($cdlabel)"; $rc=-1; goto done; } $pagelist = $SARRAY{'page'}; # Get the optional (and rare) page list. if ($pagelist && $pagelist !~ m/^[\d,-]*$/ ) { $error=5; $errorMessage="Invalid Page List ($pagelist)"; $rc=-1; goto done; } if ($Wanted_Image_Type) {$getimageArgs ="type=$Wanted_Image_Type"} if ($cdlabel ) {$getimageArgs.=" cd=$cdlabel"} if ($pagelist ) {$getimageArgs.=" page=$pagelist"} # Call getimage to do all the hard work of locating & converting an image. print "Calling getimage $PATN $getimageArgs\n" if ($debug); $ImageFileName=`../bin/getimage $PATN $getimageArgs`; print "getimage returned >$ImageFileName<\n" if ($debug); # Log all usage for a while to see who calls me and for which collections. # 210.197.102.194 = ips01i at Patolis # 210.197.102.195 = ips06i at Patolis # 12.44.168.100 = sjcp.delphion.com, so coming from San Jose office $time=scalar localtime(); `echo "$PATN called from $REMOTE_ADDR at $time" >> /dfs/home/rickjas/cgi-bin.image.log`; if (! -r $ImageFileName || -z _) { # If failed to find a usable image, quit now. $error = 3; $errorMessage="No image found for $PATN."; $rc=403; goto done; } done: if ($error != 0) { # Some sort of problem has happened # Cleanup and inform user. $| = 1; # Force flush after every write # On error, echo $errorMessage (if any) and write message out (if any) as text/plain print "Content-type: text/plain\n\n"; print "Sorry, image request failed (error=$error).\n\t"; if ($errorMessage) { print STDOUT "\n$errorMessage\n"; # stdout is for web client and print STDERR "$errorMessage"; # stderr is for our own httpd/error_log } if (-s $errorFile) { system("cat $errorFile"); # Echo to stdout, for web client system("cat $errorFile >&2"); # Echo to stderr, for error_log } print "\nThe image for patent $PATN is not available at this time.\n"; } else { # # Output the download file # if ((-s$ImageFileName) && (-r _) && ($error==0)) { $ImageFileName=~/.*\.(\w{1,3})$/; $Got_Image_Type=$1; if ($Got_Image_Type eq 'pdf') { system( "echo 'Content-type: application/pdf'"); } elsif ($Got_Image_Type eq 'tif') { system( "echo 'Content-type: image/tiff'"); } elsif ($Got_Image_Type eq 'Z' ) { system( "echo 'Content-type: application/Z'"); } elsif ($Got_Image_Type eq 'ps' ) { system( "echo 'Content-type: application/postscript'"); } elsif ($Got_Image_Type eq 'afp') { system( "echo 'Content-type: application/afp'"); } elsif ($Got_Image_Type eq 'gif') { system( "echo 'Content-type: image/gif'"); } elsif ($Got_Image_Type eq 'jpg') { system( "echo 'Content-type: image/jpeg'"); } else { system( "echo 'Content-type: application/x-any2html-dump'"); } # Get filelength (see getimage) $filelength = -s $ImageFileName; # NOTE: no spaces allowed between length and \n if ($ENV{'SERVER_SOFTWARE'}) { system ("echo 'Content-length: $filelength\nImage-Type: $Got_Image_Type\n';cat $ImageFileName;"); } else { # Debugging call from the command line. system ("echo 'Content-length: $filelength\nImage-Type: $Got_Image_Type\n';echo cat $ImageFileName;ls -l $ImageFileName"); } } } exit($rc); ############################################################## # This section is added to restrict remote access # ############################################################## sub Blessed_User { # Restricted access list file. Contains dotted decimal I.P. address # in column 1 for "blessed" hosts. $ACCESS_LIST="/dfs/prod/ipn/config/vpilaccess.conf"; $REMOTE_ADDR = $ENV{'REMOTE_ADDR'}; # The remote address will be null when invoked from if ($REMOTE_ADDR eq '') { # the command line, in which case, we give some help. print "Content-type: text/plain\n\n"; print "REMOTE_ADDR not defined\n"; print "\nTry export REMOTE_ADDR=210.197.102.194\n export PATH_INFO=EP00904950A1\n export QUERY_STRING='cdlabel=esp1999024&type=tif'\n"; exit(-1); } # # Verify remote address $match="0"; # set to false open(ACCESS_FH, $ACCESS_LIST) or die; while ($TrustedHost = ) { chomp $TrustedHost; if (! ($TrustedHost =~ /\*/)) { # If no wildcard on this trusted host entry, then do a complete comparsion if ($TrustedHost eq $REMOTE_ADDR) { return 1; # Got a match. This guy's ok. } } elsif ($TrustedHost =~ /\A[0-9.]+\*\Z/) { # Only process if TrustedHost ends with the * $TrustedHost =~ s/\./\\\./g; $TrustedHost =~ s/\*//g; if ($REMOTE_ADDR =~ /\A$TrustedHost/) { return 1; } } } # If we get here, we never found a line that blessed this guy. Reject him. return 0; } sub processQueryString { $METHOD = $ENV{'REQUEST_METHOD'}; if ( $METHOD eq 'POST'){ read(stdin, $QUERY_STRING, $ENV{'CONTENT_LENGTH'}); # POSTs may have a trailing newline chomp $QUERY_STRING; } else { # GET method, or something else, in which case we treat as GET $QUERY_STRING = $ENV{'QUERY_STRING'}; } # GetRight4.2 and 4.2c encode & into &. 4.1.2 also has some problems # Repair QUERY_STRING if this is the case if ($agent =~ /GetRight\/4\./) { $QUERY_STRING =~ s/&\;/&/g; } # Replace %2B to '+' $QUERY_STRING =~ s/%2B/\+/gi; #Build the array with name/value pairs @combo = split(/&/, $QUERY_STRING); foreach $combo (@combo) { $combo =~ s/\+/ /g; ($input_name, $value) = split(/=/,$combo); $input_name =~ s/%([A-Fa-f0-9]{2})/pack("c",hex($1))/ge; $value =~ s/%([A-Fa-f0-9]{2})/pack("c",hex($1))/ge; #print "[$combo][$input_name][$value]
"; if($SARRAY{$input_name}) { # If an entry appears more than once, append a space and new # value to the old value. (ex: -c coll_f, -c coll_b -> -c coll_f coll_b) $SARRAY{$input_name} = $SARRAY{$input_name} . " " . $value; } else { $SARRAY{$input_name} = $value; } } } # Fix up the patent number before proceeding, by # - Converting NPO-style US Patent numbers to Southbury's standard, # e.g. US0D123456__ to USD0123456__ or US05551212A1 to US05551212__. # - Parsing out the numeric portion and zero-padding it to 8 digits. # # Other things we may want to do, but aren't done yet, are # - Any other conversion of Thomson Patent Store standard (whatever that is) # to Southbury standard. # sub FilterPATN { my $tmpPATN=$PATN; $tmpPATN=~/(..)0*(.*)(..)$/; my $COUNTRY=$1; $tmpPATN=$2; my $KIND=$3; # US Non-Utility Patents are probably in the NPO standard of US0D277060__ # or US0RE29774A1 or US0BRE28576. Those take special handling. if ($COUNTRY eq "US" && $tmpPATN=~/(\D+)(\d+)/) { $USType = $1; # EG D, PP, RE, H, BRE, etc. $tmpPATN = $2; # The true patent number. $PATN="$COUNTRY" . $USType; $PATN.="0" x (10-length($PATN)-length($tmpPATN)) . $tmpPATN . $KIND; } else { $PATN="$COUNTRY" . "0" x (8-length($tmpPATN)) . $tmpPATN . $KIND; } return; }