#!/usr/bin/perl
#
# Extract the US patent number, the Japanese title and the Japanese abstract
# from an SGML patent record and print them to STDOUT.
#
# Usage: this_script INPUT_FILE
#
# Our input is a bunch of SGML.  We want to pick out:
#   - the US patent number, between <B110> and </B110> tags;
#   - the Japanese title, between <B542> and </B542> tags.  Be careful to
#     pick out the Japanese title, not the English one: the Japanese title
#     is identified by a preceding <B541>JP</B541>;
#   - the Japanese abstract, between <SDOAB> and </SDOAB>.
# Inside the Japanese abstract there are multiple paragraphs, each bracketed
# by <P> and </P>.  The main pattern consumes the first <P> and the last
# </P>, so what is left can be split on the "</P><P>" paragraph boundaries.
#
# NOTE(review): the tag literals in the pattern (<B110>, <B541>, <B542>,
# <SDOAB>, <P>) and the "</P><P>" split separator were reconstructed from the
# description above -- confirm them against a real input record.

use strict;
use warnings;

# slurp_file($path)
#   Returns the whole content of $path as a single string.
#   Dies with a message naming the file if it cannot be opened or closed.
sub slurp_file {
    my ($path) = @_;

    open my $fh, '<', $path
        or die "Cannot open '$path' for reading: $!\n";

    # Undef $/ locally so we can slurp the entire input file at once.
    my $content = do { local $/; <$fh> };

    close $fh
        or die "Cannot close '$path': $!\n";

    return defined $content ? $content : '';
}

# parse_patent_sgml($sgml)
#   Returns a hash reference with the keys
#     patn       - patent number formatted as "US" + 8 zero-padded chars + "__"
#     title      - the Japanese title, exactly as found
#     paragraphs - array reference of the non-empty abstract paragraphs,
#                  in document order, with CR/LF pairs replaced by a space
#   or nothing (empty list / undef) when $sgml does not match the pattern.
sub parse_patent_sgml {
    my ($sgml) = @_;
    return unless defined $sgml;

    # /s lets "." cross line ends; /x lets the pattern be laid out readably.
    # The two bare ".*" are deliberately greedy, as in the original script:
    # they settle on the LAST JP title and the LAST abstract in the record.
    # Optional whitespace is tolerated between adjacent tags.
    my ($number, $title, $abstract) = $sgml =~ m{
        <B110> ([0-9]*) </B110>
        .*
        <B541> JP </B541> \s* <B542> (.*?) </B542>
        .*
        <SDOAB> \s* <P> (.*?) (?:</P>)? \s* </SDOAB>
    }sx
        or return;

    # Fold DOS line ends into spaces so a paragraph prints on one line.
    $abstract =~ s/\r\n/ /g;

    # Keep every paragraph that has any text at all.  Testing the length
    # (rather than truth) keeps a paragraph whose whole text is "0".
    my @paragraphs = grep { length } split m{</P>\s*<P>}, $abstract;

    return {
        patn       => sprintf('US%08s__', $number),
        title      => $title,
        paragraphs => \@paragraphs,
    };
}

# main($path)
#   Reads the SGML file $path, reports what was extracted on STDOUT, or
#   prints an error line on STDOUT when the record does not match.
#   Dies when no file name is given or the file cannot be read.
sub main {
    my ($path) = @_;

    die "Usage: $0 INPUT_FILE\n"
        unless defined $path && length $path;

    my $sgml = slurp_file($path);
    print "I'm working on >$sgml<\n";

    my $record = parse_patent_sgml($sgml);
    if (!$record) {
        print "Error. Input file did not match my pattern.\n";
        return;
    }

    my $patn           = $record->{patn};
    my $japanese_title = $record->{title};
    my @paragraphs     = @{ $record->{paragraphs} };

    # Paragraphs are reported with 1-based numbers.
    for my $index (0 .. $#paragraphs) {
        my $num = $index + 1;
        print "Set \$paragraphs{$num} to =>$paragraphs[$index]<\n";
    }

    print "Aha, I passed my search with patn=>$patn\n";
    print "and japanese title =>$japanese_title<\n";
    print "and japanese abstract\n";

    for my $index (0 .. $#paragraphs) {
        my $num = $index + 1;
        print "Paragraph $num=>$paragraphs[$index]<\n";
    }

    return;
}

# Run only when executed directly, so the subs above can be loaded by a test.
main(@ARGV) unless caller;

1;