#!/usr/bin/perl
# uspto_pull_image.pl: Use the USPTO website to pull each of the individual
#   TIFF image pages of a patent, then use any2any to combine them
#   into a single multi-page TIFF file.  Be nice and sleep a few seconds
#   between each page requested from the USPTO.
#
#  8/06/02 TDG - Added support for US applications, USD, USH and USPP
# 11/15/02 RAJ - Changed input to be patn and an optional number of pages.
#                Necessary for when the US PTO has Certificate of Correction
#                pages tacked onto the end.  E.G. US05085918 has 22
#                original pages + 1 page of a COC.  To retrieve just the
#                original 22 pages, say uspto_pull_image.pl 5085918 22.

use strict;
use warnings;

my $supercp_cmd = "/dfs/prod/ipn/bin/supercp";
my $any2any_cmd = "/dfs/prod/ipn/bin/any2any";
my $anyinfo_cmd = "/dfs/prod/ipn/bin/anyinfo";
my $sleeptime   = 5;    # Seconds to wait between pulls

my $usage = "
USAGE: uspto_pull_image.pl patent [nn]
Where patent is the stripped patent number and nn is an optional number of pages to get
(useful for ignoring the US PTO's Certificate of Corrections).
Examples: uspto_pull_image.pl 5551212
       or uspto_pull_image.pl 5085918 22     to get US05085918 without the COC
       or uspto_pull_image.pl PP12918        for a Plant patent
       or uspto_pull_image.pl D462162        for a Design patent
       or uspto_pull_image.pl 20010000044    for a US Application
The results are multi-page TIFF files in the current directory, in our examples:
US05551212__.tif, US05085918__.tif, USPP012918__.tif, USD0462162__.tif or US21000044A1.tif.\n\n";

# Run "supercp URL %stdout" and return the first output line that matches
# $pattern, i.e. the equivalent of "supercp ... | grep PATTERN | head -1".
# supercp is started with the list form of open so that the URL (part of
# which is scraped from a remote page) is never interpreted by a shell.
# Returns undef when supercp cannot be started or when no line matches.
sub first_matching_line {
    my ($url, $pattern) = @_;

    my $fh;
    if (!open($fh, '-|', $supercp_cmd, $url, '%stdout')) {
        print STDERR "Cannot run $supercp_cmd: $!\n";
        return;
    }
    my $found;
    while (my $line = <$fh>) {
        # Keep reading after the first hit so supercp can finish writing
        # instead of being cut off with a broken pipe.
        if (!defined($found) && $line =~ $pattern) {
            $found = $line;
        }
    }
    close($fh);
    return $found;
}

# Combine the downloaded single-page TIFF files into one multi-page TIFF.
#   $pn_pto     - the patent/application number as given on the command line
#   $tmp_file   - name of the any2any control file to (re)write
#   $out_file   - name of the multi-page TIFF to produce
#   @page_files - the per-page TIFF files, already in page order
# Writes the control file, runs any2any, then cross-checks the page count
# any2any reported against the count anyinfo reports for the result.  The
# temporary files are removed only when everything agrees, so a failed run
# leaves them behind for inspection.  Problems are reported on STDERR.
sub combine_pages {
    my ($pn_pto, $tmp_file, $out_file, @page_files) = @_;

    if (!@page_files) {
        print STDERR "Error: no $pn_pto.uspto.*.tif files were retrieved.\n";
        return;
    }

    # The control file holds one "filename <file>" line per page, in order.
    my $ctl_fh;
    if (!open($ctl_fh, '>', $tmp_file)) {
        print STDERR "Error: cannot write control file $tmp_file: $!\n";
        return;
    }
    print {$ctl_fh} map { "filename $_\n" } @page_files;
    if (!close($ctl_fh)) {
        print STDERR "Error: cannot write control file $tmp_file: $!\n";
        return;
    }

    # $tmp_file and $out_file contain only letters, digits, '_' and '.'
    # (the caller validates their components), so they are shell-safe here.
    my @any2any_lines = `$any2any_cmd $tmp_file $out_file 2>&1`;
    my $rc = $?;
    if ($rc) {
        my $errno = $!;
        print STDERR "Write of $out_file failed. any2any rc=$rc & errno=$errno\n\n";
        return;
    }

    # any2any worked ok.  How many pages did it write?
    # If only 1 page got written, any2any says "1 page", not "1 pages".
    my $any2any_page_count = 0;
    foreach my $line (@any2any_lines) {
        if ($line =~ /([0-9]+) pages? written OK./) {
            $any2any_page_count = $1;
            last;
        }
    }
    if (!$any2any_page_count) {
        print STDERR "any2any counted zero pages for $out_file\n";
        print STDERR @any2any_lines;    # Any messages from any2any.
        return;
    }

    # Check the page count of the image file we just wrote, with anyinfo.
    # Maybe we got an error writing the image (AFS/DFS was down?).
    my $anyinfo_page_count = 0;
    my @anyinfo_lines = `$anyinfo_cmd $out_file`;
    foreach my $line (@anyinfo_lines) {
        if ($line =~ /^([0-9]+) pages?\./) {
            $anyinfo_page_count = $1;
            last;
        }
    }
    if (!$anyinfo_page_count) {
        print STDERR "anyinfo counted zero pages for $out_file\n";
        print STDERR @anyinfo_lines;    # Any messages from anyinfo.
        return;
    }

    # Ensure the two page counts match.  They should.
    if ($any2any_page_count != $anyinfo_page_count) {
        print STDERR "Page count mismatch for $out_file: $any2any_page_count vs $anyinfo_page_count\n";
        print STDERR @anyinfo_lines;    # Any messages from anyinfo.
        return;
    }

    # Normal case.  All's ok.
    print STDOUT "Wrote $anyinfo_page_count pages into $out_file at ", scalar localtime, ".\n";
    unlink(glob("$pn_pto.uspto.*.tif"), $tmp_file);
    return;
}

$| = 1;    # Force flush after every write

# NOTE(review): $#ARGV > 2 tolerates a third, ignored argument.  Left as-is
# so existing callers keep working; only the first two are used.
# NOTE(review): the failure paths below exit with status 0 (bare "exit"),
# as they always have; callers cannot detect failure from the exit status.
my $argc = $#ARGV;
if ($argc < 0 || $argc > 2) {
    print STDERR $usage;
    exit(0);
}
my $pn_pto       = $ARGV[0];
my $pages_to_get = $ARGV[1];

# Look for 20010000044, which is an application document number.
my $is_app = ($pn_pto =~ m/\A200\d{8}\z/) ? 1 : 0;

# H (SIR), P/PP (Plant) and D (Design) numbers.  This has to be a string
# test: the old numeric "==" turned both sides into 0 and so matched every
# number that did not start with a digit 1-9.  Only letters and digits are
# accepted because the number ends up in file names and command lines.
# NOTE(review): that old test also let other letter prefixes through by
# accident; add them here explicitly if they need to be supported.
my $is_special = ($pn_pto =~ m/\A[HPD][A-Za-z0-9]*\z/) ? 1 : 0;

if (   !$is_app
    && !$is_special
    && ($pn_pto !~ m/\A\d+\z/ || $pn_pto < 1 || $pn_pto > 6999999))
{
    print STDERR "$pn_pto out of range of 1 to 6999999. STOPPING.\n";
    print STDERR $usage;
    exit(-1);
}

# The optional page count must be a whole number.  Absent, empty or zero
# means "get every page".
my $pages_given = (defined($pages_to_get) && $pages_to_get ne '') ? 1 : 0;
if ($pages_given && $pages_to_get !~ m/\A\d+\z/) {
    print STDERR "Number of pages '$pages_to_get' is not a whole number. STOPPING.\n";
    print STDERR $usage;
    exit(-1);
}
my $pages_requested = ($pages_given && $pages_to_get > 0) ? 1 : 0;

if ($is_app) {
    print STDOUT "\nWorking on application $pn_pto ...\n";
}
else {
    print STDOUT "\nWorking on patent $pn_pto ...\n";
}

my $tmp_file = "$pn_pto.ctl";

# Ensure we are starting fresh.
unlink(glob("$pn_pto.uspto.*.tif"), $tmp_file);

# Plain patent numbers are zero-padded to 8 digits (5551212 -> 05551212).
# Applications and special numbers are not padded here.
my $padded_pn = ($is_app || $is_special) ? $pn_pto : sprintf("%08d", $pn_pto);

my $idkey         = "";
my $pagecount     = 0;
my $imagelink_url = "";
my $docid         = "";

# On 6-11-2002, 164.195.100.11 is/was pto.dwsearch.com
my $details_url = "http://164.195.100.11/netacgi/nph-Parser?Sect1=PTO1&Sect2=HITOFF&d=PALL&p=1&u=/netahtml/srchnum.htm&r=1&f=G&l=50&s1='$pn_pto'.WKU.&OS=PN/$pn_pto&RS=PN/$pn_pto";
if ($is_app) {
    $details_url = "http://appft1.uspto.gov/netacgi/nph-Parser?Sect1=PTO1&Sect2=HITOFF&d=PG01&p=1&u=/netahtml/PTO/srchnum.html&r=1&f=G&l=50&s1='$pn_pto'.PGNR.&OS=DN/$pn_pto&RS=DN/$pn_pto";
}

# On 6-11-2002, a "host patimg1.uspto.gov" command said
# potw1.uspto.gov is 63.71.228.119, Aliases: patimg1.uspto.gov
my $img_host = "patimg1.uspto.gov";
#my $details_url = "http://patft.uspto.gov/netacgi/nph-Parser?Sect1=PTO1&Sect2=HITOFF&d=PALL&p=1&u=/netahtml/srchnum.htm&r=1&f=G&l=50&s1='$pn_pto'.WKU.&OS=PN/$pn_pto&RS=PN/$pn_pto";

# On the details page, they have 2 image links that look something like this:
#    href=http://patimg1.uspto.gov/.piw?Docid=05551212
#         &homeurl=http%3A%2F%2F164.195.100.11%2Fnetacgi% ... this piece is pretty long ...
#         &PageNum=
#         &Rtype=
#         &SectionNum=
#         &idkey=BD21661A21D6
# We only need one of these image links, so we take only the first line that
# mentions the image viewer (.aiw for applications, .piw for patents).  The
# leading "." matches any character, exactly as in the former grep pattern.
my $link_pattern   = $is_app ? qr/.aiw/ : qr/.piw/;
my $details_output = first_matching_line($details_url, $link_pattern);
$details_output = "" unless defined $details_output;

# Parse out the image link URL from HREF on details page.
if ($details_output !~ m/(http:[^>"\n]*)/i) {
    print STDERR "Failed to find the image link URL for patent $pn_pto.\n";
    print STDERR "It's probably the case that the US PTO server is busy.\n\n";
    print STDERR "The supercp command was\n$supercp_cmd \"$details_url\"\n";
    print STDERR "DETAILS: $details_output \n";
    exit;
}
$imagelink_url = $1;

# Get what info we can from the image link URL from the US PTO Details page,
# namely the idkey, which is a unique id for this patent/application,
# the image server's host name, and the document ID.
if ($details_output !~ m/idkey=([^&>"\n]*)/i) {
    print STDERR "Failed to find the idkey for patent $pn_pto.\n";
    print STDERR "It's probably the case that the US PTO server is busy.\n\n";
    exit;
}
$idkey = $1;

# Find hostname, either patimg1.uspto.gov or patimg2.uspto.gov for US Granted,
# or aiw1.uspto.gov for US Applications.
if ($details_output !~ m{http://([^/]*)/}) {
    print STDERR "Failed to find USPTO's image server host name for patent $pn_pto.\n";
    print STDERR "It's probably the case that the US PTO server is busy.\n\n";
    exit;
}
$img_host = $1;

# The docid found on the page is only used for special (H/P/D) numbers.  It
# becomes part of the output file name, so accept letters and digits only.
if ($details_output =~ m/docid=([A-Za-z0-9]+)/i) {
    $docid = $1;
}
if ($is_app) {
    $docid = "us" . $pn_pto . "ki";
}
elsif (!$is_special) {
    $docid = $padded_pn;
}
elsif ($docid eq "") {
    print STDERR "Failed to find the docid for patent $pn_pto.\n";
    print STDERR "It's probably the case that the US PTO server is busy.\n\n";
    exit;
}

sleep $sleeptime;

# Follow that image link, pulling the image's first page to get the total
# number of pages in the image.  The resulting HTML conveniently has a line
# containing NumPages=<n> near the beginning.
my $pagecount_output = first_matching_line($imagelink_url, qr/NumPages/i);
$pagecount_output = "" unless defined $pagecount_output;

# Find the number of pages and squirrel it away.
if ($pagecount_output !~ m/NumPages=(\d+)/i) {
    print STDERR "Failed to find pagecount for patent $pn_pto.\n";
    print STDERR "It's probably the case that the US PTO server is busy.\n\n";
    exit;
}
$pagecount = $1;

if (!$pages_requested) {
    # Normal case where he didn't specify how many pages to get.
    $pages_to_get = $pagecount;
}
elsif ($pages_to_get < $pagecount) {
    # He only wants nn pages.
    print STDOUT "I will get only $pages_to_get out of the $pagecount pages in $pn_pto ...\n";
}
elsif ($pages_to_get == $pagecount) {
    print STDOUT "That's how many pages there are in $pn_pto, $pages_to_get. Will get all $pagecount pages ...\n";
}
else {
    print STDOUT "There are only $pagecount pages in $pn_pto, not $pages_to_get. Will get all $pagecount pages ...\n";
    $pages_to_get = $pagecount;
}

# Pull each TIFF image page.
my @page_files;
for my $page (1 .. $pages_to_get) {
    my $img_url = "http://$img_host/.DImg?Docid=$docid&PageNum=$page&IDKey=$idkey&ImgFormat=tif";
    my $outfile = "$pn_pto.uspto.$page.tif";

    sleep $sleeptime;
    print STDOUT "Getting page $page of $pages_to_get for $pn_pto ...\n";

    # supercp writes the page to $outfile; whatever it prints on stdout is
    # read and thrown away.  NOTE: Should check that output for ERRORS!!
    if (open(my $img_fh, '-|', $supercp_cmd, $img_url, $outfile)) {
        my @ignored_output = <$img_fh>;
        close($img_fh);
    }

    if (!-e $outfile) {
        print STDERR "Error getting page $page for $pn_pto in $outfile. Perhaps you don't have\n";
        print STDERR "write permission in this directory or if this is DFS, you aren't DCE-authenticated?\n";
        exit;
    }
    if (-z $outfile) {
        print STDERR "Error getting page $page for $pn_pto in $outfile: Resulting file is zero-length.\n";
        exit;
    }
    push @page_files, $outfile;
}

my $out_file = "US${padded_pn}__.tif";
if ($is_app) {
    # Map 20010000044 to US21000044A1.tif
    $out_file = "US2" . substr($pn_pto, 3, 1) . substr($pn_pto, 5) . "A1.tif";
}
elsif ($is_special) {
    # Map H209 to USH0000209__.tif
    $out_file = "US" . $docid . "__.tif";
}

# The pages were fetched in page order, so @page_files is already sorted.
combine_pages($pn_pto, $tmp_file, $out_file, @page_files);

exit(0);