#!/usr/bin/perl

# Command-line harness for the IPC class search-term filters.
#
# With no argument it runs one built-in sample; otherwise all command-line
# arguments are joined with spaces and treated as a single search term.  For
# each term it prints the input, the result of orig_filter_IC and the result
# of filter_IC.  A result that differs from the expected answer is shown in
# reverse video.

# Package globals; nothing in the visible part of this file reads them.
$COLLROOT='abc';
$SARRAY{"-c"} = "$COLLROOT/coll_us";

print "\n";
print " Input Original Output New Output\n";
print "================================================================= =============== ==========\n";

if ( !defined $ARGV[0] || $ARGV[0] eq "" ) {
    output_a_line("H01A 1/23", "H01A 00123");
    print "\n";
}
else {
    # output_a_line($ARGV[0], "who knows?");
    output_a_line("@ARGV", "who knows?");
}
print "\n";
exit;

# Print one report row for a search term.
#
#   $test_value     - the raw search term to run through both filters
#   $correct_answer - the expected filtered value; a filter result that
#                     differs from it is printed in reverse video
#
# Returns nothing.
sub output_a_line {
    my ($test_value, $correct_answer) = @_;

    my $orig_answer = orig_filter_IC($test_value);
    my $new_answer  = filter_IC($test_value);

    # The column padding is computed from the plain answers, before any
    # escape sequences are wrapped around them, so the columns stay aligned.
    # A negative repeat count simply yields an empty pad.
    my $right_pad1 = " " x (16 - length($orig_answer));
    my $right_pad2 = " " x (58 - length($new_answer));

    $orig_answer = "\033[7m$orig_answer\033[0m" unless $orig_answer eq $correct_answer;
    $new_answer  = "\033[7m$new_answer\033[0m"  unless $new_answer  eq $correct_answer;

    printf "%-66s %s%s %s%s\n",
        $test_value, $orig_answer, $right_pad1, $new_answer, $right_pad2;
    return;
}

# "Free" input of IPC Class
# Name: IC
# Input: 123 / 1A1[11 . EE] and many more
# Output: 12311111EE and 123*
#
# Removes the first hyphen, then rewrites the first "digits /" group (with any
# surrounding whitespace) as a single space followed by the digits zero-padded
# on the left to three characters.  Input without such a group is returned
# with only the hyphen removed.
sub orig_filter_IC {
    my ($in_var) = @_;

    $in_var =~ s/-//;

    # Only compute the padding when the group is really there; reading $1
    # after a failed match would pick up a capture left by some earlier match.
    if ( $in_var =~ / \s* (\d+) \s* \/ \s* /x ) {
        my $zero_count = 3 - length($1);       # zero or negative => no padding
        my $prefix     = "0" x $zero_count;

        # HACK FOR EXTRA SPACE
        $in_var =~ s/ \s* (\d+) \s* \/ \s* / $prefix$1/x;
    }

    # In the Almaden V4 database, IC has a space after the first 4 chars;
    # in NIPO and UK it doesn't, so, the following line needs to be
    # uncommented there!!!
    #$in_var =~ s/ //g;

    return $in_var;
}

# Filter input of IPC Class Search Term
# Name: IC, MC, MAINCLASS, CLASS
#
# The format of IPC class numbers as rendered to the user and thus, in our input, is in
# three parts, with the first part mandatory and the other two optional (but if the second
# part exists, then so does the third). The parts are separated by a space and slash and
# leading zeros in the second part are removed, eg C09D 3/84.
#
# In DB/2, we normalize the second part (if it exists), zero-padding it on the left to
# three characters. We also remove the slash, leaving us with C09D 00384, for example.
# Verity indexes this as two separate words. When we search, we give search arguments to
# Verity like C09D 00384. Verity treats this as two words which must follow each other,
# thus returns what we want.
#
# But before searching, Verity does its checking to see if this search might return too
# many results and this is done on a word boundary. Thus if the user entered "C09D 5/*",
# we filter this to "C09D 005*" and this will fail because of the 005* term (there are too
# many patents indexed with words starting at 005, e.g. the patent numbers in the 5 million
# range). We've got to do something about that.
#
# Our solution is based upon the length of valid IPC classes. After padding the second
# part to three characters, there are three possible lengths of valid IPC classes.
#     Length of 4,  e.g. C09D
#     Length of 10, e.g. C09D 00384
#     Length of 11, e.g. C09D 003733
# If we filter anything, we will never pass along *'s to Verity in the first two parts.
# Instead, we will always pad the first two parts with ?'s, to these fixed lengths.
#
# For example (these examples are not exhaustive or comprehensive),
#     C*          will get filtered to C???
#     C0*         will get filtered to C0??
#     C09*        will get filtered to C09?
#
#     C09D        will get filtered to C09D, which will also pick up all the 10- and
#     or C09D*    11-character IPC classes, e.g. C09D 003/733 since Verity has indexed
#     or C09D *   these as two words.
#
#     C09D *1     will get filtered to C09D ??1?? or C09D ??1???
#     C09D *12    will get filtered to C09D ?12?? or C09D ?12???
#
#     C09D 1      you might think C09D 1* would get filtered to the very long (C09D 1????
#     or C09D 1*  C09D 1????? C09D 01??? C09D 01???? C09D 001?? C09D 001???)
#                 but due to the 1????? term, this query is too complex for Verity, so just
#                 like we did in the National Class filter (filter_NC below), we'll
#                 interpret this as C09D 1/* and filter both to (C09D 001?? C09D 001???).
#                 If the guy doesn't like it, tough. He can put in his own multi-term search.
#     C09D 12     again, you might think C09D 12* would get filtered to (C09D 12???
#     or C09D 12* C09D 12???? C09D 012?? C09D 012???), but to keep things consistent with
#                 the C09D 1* case above, we simply filter this to (C09D 012?? C09D 012???).
#
#     C09D 1*2    will get filtered to (C09D 1?2?? C09D 1?2??? C09D 012?? C09D 012???)
#
#     C09D 123/x  will never match since the third part is always 2- or 3-characters long,
#                 so we guess he meant 0x and filter to C09D 1230x.
#     C09D 123/*  will get filtered to (C09D 123?? C09D 123???)
# We also try to handle more complex input, for example,
#     C09* 1*2/*  will get filtered to (C09? 1?2?? C09? 1?2??? C09? 012?? C09? 012???)
# We may wind up with *'s in the third part, for example,
#     C09D 3/*x   will get filtered to C09D 003*x.
#
# In the National Patent Offices on the other hand, the space is removed from the IPC
# class when stored in DB/2. This changes completely how Verity indexes and thus, how
# we should filter, namely
# 1) No longer do you need to worry about the "C09D 1*" failing due to the 1????? term.
#    Instead what you could have is "C09D001*" which would work fine. However, we use the
#    same logic as for the US, padding to fixed lengths, and simply remove any spaces at
#    the end of it all.
# 2) In the C*, C0*, C09*, and C09D* cases, the filtered output can no longer be simply
#    the 4-character output it was before. They now have to include the two longer cases.
#    So "C0*" now becomes (C0??C0???????C0????????).
# These differences used to be four separate one-line changes that had to be commented in
# or out together; they are now all driven by the single switch below.

# True selects the National Patent Office behaviour (IPC class stored without the space),
# false selects the US behaviour.  This copy of the file had all four national-office
# lines active, so the switch is on.
# NOTE(review): in this mode every space is stripped from the result, including the ones
# between the alternatives of a parenthesised group; that matches the example in (2)
# above, but confirm it is what Verity expects.
use constant IC_NATIONAL_OFFICE_MODE => 1;

# Filter a complete IPC class search argument.
#
#   $in_var_string - the search argument as typed: one or more terms separated by commas,
#                    e.g. "C09D 1/23,C09D *12/3*"
#
# Returns the filtered search argument.  If the argument contains any character other
# than letters, digits, '/', ' ', '*' and ',' it is returned completely untouched.
sub filter_IC {
    my ($in_var_string) = @_;

    # If there's any kind of funny character in this search string, don't modify it at all.
    if ( $in_var_string !~ /^[a-zA-Z\d\/ \*,]*$/ ) {
        return $in_var_string;
    }

    # Split the search argument into its comma-delimited terms, filter each one, and
    # rebuild the argument.  Spaces following a comma are dropped by the per-term filter,
    # e.g. "A01, B02" is treated as "A01,B02".
    my $output = "";
    foreach my $raw_term ( split /,/, $in_var_string ) {
        my $term = _filter_IC_term($raw_term);

        # While nothing has been collected yet, a term is stored without a leading comma.
        $output = $output eq "" ? $term : $output . "," . $term;
    }

    # The national offices store the class without the space, so search without it too.
    $output =~ s/ //g if IC_NATIONAL_OFFICE_MODE;

    return $output;
}

# Filter one comma-free term of an IPC class search argument.
#
#   $raw_term - a single term, e.g. "C09D 1*2/*"
#
# Returns the filtered term, still containing spaces.  A term that does not follow the
# three-part syntax is returned unchanged.
sub _filter_IC_term {
    my ($raw_term) = @_;

    # Parse out the three parts of the IPC Class search argument,
    #   0-4 characters, delimited by an optional space, with possible asterisks (*),
    #   0-3 characters, delimited by an optional slash (/) with possible asterisks (*),
    #   and the rest.
    # If this term doesn't follow this syntax, then don't muck with it.
    my @parts = $raw_term =~
        /^\s* ([\*a-zA-Z\d]{0,4}) \s* \ ?    # First part
          \s* ([\*a-zA-Z\d]{0,3}) \s* \/?    # Second part
          \s* ([\*a-zA-Z\d]*    )     $/x;   # Third part
    return $raw_term unless @parts;
    my ( $part1, $part2, $part3 ) = @parts;

    # ---- First part ------------------------------------------------------------------
    # We wild-card a part only if he has not asked for a specific class.  A one-, two-,
    # or three-character first part can never be a specific class; a four-character
    # first part without wild cards is specific.  Due to the check for "funny"
    # characters in filter_IC we never see ?'s here, only *'s.
    if ( length($part1) == 0 || $part1 eq "*" ) {
        # Null or "*": in the US just drop it and let Verity search on the rest; the
        # national offices need a four-character placeholder instead.
        $part1 = IC_NATIONAL_OFFICE_MODE ? "????" : "";
    }
    elsif ( length($part1) < 4 && index( $part1, "*" ) < 0 ) {
        $part1 = substr( $part1 . "????", 0, 4 );    # Pad to length 4.
    }
    elsif ( ( $part1 =~ tr/*// ) == 1 && length($part1) <= 4 ) {
        # Exactly one * (C*, C*D, *D): expand it so the part is four characters long,
        # giving C???, C??D, ???D.  Several *'s (e.g. *0*) are left alone, because the
        # 0 could be in any position and it's not reasonable to enumerate them all.
        substr( $part1, index( $part1, "*" ), 1 ) = "?" x ( 5 - length($part1) );
    }

    # ---- Second part -----------------------------------------------------------------
    # The problem with wild-carding the second part is, we also zero-pad it.  A * on the
    # left is easy (H01B *1/04 becomes H01B ??1/04), but a * on the right is not: Verity
    # fails a search such as 1????? as too complex, so H01B 1* is interpreted as
    # H01B 1/*, yielding (H01B 001?? H01B 001???) only.
    my $part2b  = "";                  # Second spelling of part2, used for C*C only.
    my $length2 = length($part2);
    my $length3 = length($part3);

    if (   ( $length2 == 0 || $part2 eq "*" )
        && ( $length3 == 0 || $part3 eq "*" ) )
    {
        # null or / or /* or * or */ or */*
        # In the US nothing more is needed: because of the space kept in the class,
        # C09D finds C09D as well as all C09D * classes whether we like it or not.
        # In the national offices a first part that carries wild cards gets "???"
        # here, which marks it for the three-length expansion done further down.
        $part2 = "";
        if ( IC_NATIONAL_OFFICE_MODE && ( $part1 =~ tr/?*// ) > 0 ) {
            $part2 = "???";
        }
    }
    elsif ( $length2 == 0 || $part2 eq "*" ) {    # /something or */something ==> ???
        $part2 = "???";
    }
    elsif ( $length2 == 1 ) {                     # C   ==> 00C
        $part2 = "00$part2";
    }
    elsif ( $part2 =~ /^\*(.)$/ ) {               # *C  ==> ??C
        $part2 = "??$1";
    }
    elsif ( $part2 =~ /^(.)\*$/ ) {               # C*  ==> 00C (intentionally not C?? or 0C?)
        $part2 = "00$1";
    }
    elsif ( $length2 == 2 ) {                     # CC  ==> 0CC
        $part2 = "0$part2";
    }
    elsif ( $part2 =~ /^\*(..)$/ ) {              # *CC ==> ?CC
        $part2 = "?$1";
    }
    elsif ( $part2 =~ /^(.)\*(.)$/ ) {            # C*C ==> C?C or 0CC, since * could be null.
        ( $part2, $part2b ) = ( "$1?$2", "0$1$2" );
    }
    elsif ( $part2 =~ /^(..)\*$/ ) {              # CC* ==> 0CC (intentionally not CC?, to
        $part2 = "0$1";                           # keep consistent with the C* case above).
    }

    # ---- Third part ------------------------------------------------------------------
    # As a nice guy, we'll convert C09D 123/4 to C09D 123/04, else it'll never match.
    if ( length($part3) == 1 && $part3 ne "*" ) {
        $part3 = "0$part3";
    }

    # ---- Put the term together -------------------------------------------------------
    my $has_second_spelling = length($part2b) > 0;

    # After the fix-up above, a third part shorter than 2 must be null or "*".
    if ( length($part3) < 2 ) {

        # Null part2 & part3 (C*, C0*, C09*, C09D, C09* /, C09D /*, C09D */*): in the US
        # all we need to return is part1 (a bare * or " /" gives a blank term, oh well);
        # the national offices need all three valid lengths.
        if ( length($part2) == 0 || $part2 eq "???" ) {
            return IC_NATIONAL_OFFICE_MODE
                ? "($part1 $part1????? $part1??????)"
                : $part1;
        }

        # E.G. C09D C*C or C09D C*C/*
        if ($has_second_spelling) {
            return "($part1 $part2?? $part1 $part2??? $part1 $part2b?? $part1 $part2b???)";
        }

        # E.G. C09D 123 or C09D 123/* and other cases.
        return "($part1 $part2?? $part1 $part2???)";
    }

    # A third part was given, with or without wild cards in it: keep it as it is, and for
    # C*C search on both spellings of the second part.
    return $has_second_spelling
        ? "$part1 $part2$part3 $part1 $part2b$part3"
        : "$part1 $part2$part3";
}