#!/usr/bin/perl $COLLROOT='abc'; $SARRAY{"-c"} = "$COLLROOT/coll_us"; print "\n"; print " Input Original Output New Output\n"; print "================================================================= =============== ==========\n"; if ( $ARGV[0] eq "" ) { output_a_line("D","00D???*"); output_a_line("5","005???*"); output_a_line("0","000???*"); output_a_line("*","??????*"); output_a_line("/","??????*"); output_a_line(".","??????*"); print "\n"; output_a_line("DD","0DD???*"); output_a_line("5D","05D???*"); output_a_line("12","012???*"); output_a_line("50","050???*"); output_a_line("0D","(0D????*,D0????*)"); output_a_line("0A","00A???*"); output_a_line("01","001???*"); output_a_line("00","000???*"); print "\n"; output_a_line("D*","00D???*"); output_a_line("D/","00D???*"); output_a_line("D.","00D???*"); output_a_line("5*","005???*"); output_a_line("5/","005???*"); output_a_line("5.","005???*"); output_a_line("0*","000???*"); output_a_line("0/","000???*"); output_a_line("0.","000???*"); print "\n"; output_a_line("*D","??D???*"); output_a_line("*5","??5???*"); output_a_line("*0","??0???*"); output_a_line("*A*","00A???*"); print "\n"; output_a_line("**","??????*"); output_a_line("*.","??????*"); output_a_line("*/","??????*"); output_a_line("/*","??????*"); output_a_line("//","??????*"); output_a_line("/.","??????*"); output_a_line(".*","??????*"); output_a_line("..","??????*"); output_a_line("./","./"); print "\n"; output_a_line("/D","???00D*"); output_a_line("/5","???005*"); output_a_line("/0","???000*"); print "\n"; output_a_line(".D","??????D"); output_a_line(".5","??????5"); output_a_line(".0","??????0"); print "\n"; output_a_line("D5","(0D5???*,D05???*)"); output_a_line("D0","(0D0???*,D00???*)"); print "\n"; output_a_line("ABC","ABC???*"); output_a_line("ABC/123","ABC123"); output_a_line("ABC/123.456","ABC123456"); print "\n"; output_a_line("A","00A???*"); output_a_line("AB","0AB???*"); output_a_line("A/","00A???*"); print "\n"; output_a_line("*A/","??A???*"); output_a_line("A*/","00A???*"); output_a_line("*A*/","00A???*"); print "\n"; output_a_line("*BC","?BC???*"); output_a_line("A*C","00A00C"); output_a_line("A*C/","A*C/"); output_a_line("A*C.*","00A00C*"); print "\n"; output_a_line("A*/C.*","00A00C*"); output_a_line("*A*/C","00A00C"); print "\n"; output_a_line("AB*","(AB????*,0AB???*)"); output_a_line("A*B*","00A00B"); output_a_line("A*/B*.","00A00B"); print "\n"; output_a_line("*AB/","?AB???*"); output_a_line("AB*/","(AB????*,0AB???*)"); output_a_line("*AB*","0AB???*"); print "\n"; output_a_line("AB*/CD*","(AB?CD?*,0ABCD?*,AB?0CD*,0AB0CD*)"); print "\n"; output_a_line("A/1","00A001"); output_a_line("A*/1*","00A001"); output_a_line("A*/1*.*","00A001*"); output_a_line("A/1,B/2,C/3", "00A001,00B002,00C003"); output_a_line("A/1,B-2,C/3.4", "A/1,B-2,C/3.4"); print "\n"; output_a_line("084/433", "084433"); # From US03738216__ output_a_line("084/174; 084/177; 084/423.R; 084/108; 084/438; 403/408.1;", "084174,084177,084423R,084108,084438,4034081"); # From US06180859__ output_a_line("084/423.R;424/405", "084423R,424405"); # From US06211450__ print "\n"; output_a_line("084/715; 084/678; 084/DIG.022; 084/DIG.023; 984/350; 984/DIG.001;", "084715,084678,084DIG022,084DIG023,984350,984DIG001"); # From US03725560__ output_a_line("084/001.01; 084/001.17; 084/DIG.22; 084/DIG.23;", "08400101,08400117,084DIG22,084DIG23"); # Also From US03725560__ print "\n"; output_a_line("(junk1,junk2)", "(junk1,junk2)"); output_a_line(" < or > ( junk1 , junk2 ) ", " < or > ( junk1 , junk2 ) "); } else { output_a_line($ARGV[0], "who knows?"); } print "\n"; exit; sub output_a_line { local($test_value)=@_[0]; local($correct_answer)=@_[1]; $orig_answer=orig_filter_NC($test_value); $new_answer=filter_NC($test_value); $orig_answer_length=length($orig_answer); $new_answer_length=length($new_answer); unless ($orig_answer eq $correct_answer) { $orig_answer="\033[7m$orig_answer\033[0m";} unless ($new_answer eq $correct_answer) { $new_answer="\033[7m$new_answer\033[0m";} $right_pad1=" " x (16-$orig_answer_length); $right_pad2=" " x (58-$new_answer_length); printf "%-66s %s%s %s%s\n", $test_value, $orig_answer, $right_pad1, $new_answer, $right_pad2; return; } # "Free" input of National (US) Class # Name: NC # Input: 123 / 1A1[11 . EE] and many more # Output: 12311111EE and 123* sub orig_filter_NC { local($in_var) = @_; if($SARRAY{"-c"} eq "$COLLROOT/coll_us") { # 06/28/2000 -sjb, UKPO fix $in_var =~ s/\s*//g; $in_var =~ /([a-zA-Z\d]+\*?) \/? (\d*[a-zA-Z]*\*?) \.* ([0-9a-zA-Z]*\**) /x; $term1=$1; $term2=$2; $term3=$3; $prefix1= "0" x (3-length($term1)); $prefix2= "0" x (3-length($term2)); $in_var = ($term1 =~ /\*/?"":$prefix1) . $term1 . ((($term2 =~ /\*/)||($term2 eq ""))?"":$prefix2) . $term2 . $term3; if (($in_var !~ /\*/) && (length($in_var) <= 3)) { $in_var .= "*"; } } return $in_var; } # Filter input of US National Class # Name: filter_NC # Example Input: 084/174; 084/423.R, 084/DIG.022, D5, 084DIG23, AB* # Output: 084174, 084423R, 084DIG022, (0D5???*,D05???*), 084DIG23, (AB????*,0AB???*) # Note: This filter should not be used for national GB classes, the UKPO # version of patsearch will only use this filter if only the US # collection is being searched -sjb sub filter_NC { local($in_var_string) = @_; $output = @in_var_string; # Default output to input in case some of these tests fail. # If there's any kind of funny character in this search string, don't modify is at all. # The idea is, if this guy knows what he's doing by specifying a Verity special symbol # (? ) in his search, in which case we don't want to get in his way (Carol's # suggestion), or he's given us some junk, in which case we'll rely on Verity to fail # on this search. if ( $in_var_string !~ /^[a-zA-Z\d\/\.,; \*]*$/ ) { $output = $in_var_string; # Pass everything on untouched. } else { $in_var_string =~ s/\s//g; # Get rid of any whitespace characters. $in_var_string =~ tr?\*\*//\.\.??s; # Also eliminate repeated special characters, eg ** # We want to accept multiple national class search arguments, which in the Verity # search language, are separated by commas or spaces (e.g. 084/423,432/436). # An improvement suggested by CHT is, because the detail page shows national # class separated by semicolons, somebody might "swipe" (cut & paste) that # semicolon-separated list (e.g. 084/423;084/432) and use it. Thus, here we $in_var_string =~ s/;/,/g; # translate all semicolons into commas. $output=""; # Reset output to null 'cause we're gonna # split our input (the search argument) into separate, comma-delimited terms, # and iterate across all search terms, rebuilding our filtered output. foreach (split /,/, $in_var_string) { # Parse out the three parts of the National Class search argument, # 0-3 characters, delimited by an optional slash (/) with possible asterisks (*), # 0-3 characters, delimited by an optional comma (.) with possible asterisks (*), # and the rest. # If this term doesn't follow this syntax, then don't muck with it. if ( /^(\*?) ([a-zA-Z\d]{0,3}) (\*?) (\/?) # First part (\*?) ([a-zA-Z\d]{0,3}) (\*?) (\.?) # Second part ([\*a-zA-Z\d]*) $/x ) { # Third part $ls1=($1 eq "*"); # True if there was a leading star on the first part. $part1=$2; $ts1=($3 eq "*"); # True if there was a trailing star on the first part. $slash=($4 eq "/"); # True if we found a slash. $ls2=($5 eq "*"); # True if there was a leading star on the second part. $part2=$6; $ts2=($7 eq "*"); # True if there was a trailing star on the second part. $dot=($8 eq "."); # True if we found a period. $part3=$9; $length = length($part1); $first2chars = substr($part1,0,2); $part1b = ""; # Interpret part 1. In the comments for these cases below, # l = Any letter [A-Z] # d = Any digit [0-9] # c = Any character [A-Z0-9] # D = The letter D or d # 0 = The digit zero # () = Optional part CASE1: { # If nothing was specified for part1, make it ???. if ( $length == 0 ) { # / or . or * or */ or *. ==> ??? $part1 = "???"; last CASE1; } if ( $length == 1 && $ls1 && !$ts1) { # *C/ ==> ??C $part1 = "??$part1"; last CASE1; } # C* is a special case. One could try to interpret this as a search for # C?????* or 0C????* or 00C???*, but the C?????* guarantees that the # search will fail since there are many words starting with any single # character you choose. So we're going to interpret this as the most # restrictive, yet still plausible and guaranteed to at least work as # far as Verity is concerned, 00C???*. Again, if the user doesn't like # the way we interpreted their search, they can use a different syntax. # # So C* or C*/ is handled the same as C or C/, ignoring the trailing *. # This may be counter-intuitive, but heck, searching for every National # Class starting with a given single character, doesn't make sense. # Or worse, *C*. # # If the intent was that 3*/ would find 030-039/* and/or 300-399/*, # you would need to specify 03?/ or 3??/. if ( $length == 1 ) { # C or C* or *C* ==> 00C $part1 = "00$part1"; last CASE1; } # [Dd][0-9] (e.g. D5) is a very special case because these national classes # are in the DB/2 database and Verity index in two different ways, 0D5 or D05. # The simpliest case, D5, must be converted to 2 terms, D05 or 0D5. Likewise, # 0D has to be converted to 0D? or D0?. But to really complicate things, # D5*/ might be expected to include D50-D59/ as well as D5, so that would # have to be converted to 0D5???* or D05???* or D5????*. Aaaaargh! No, we # only handle the simpliest two cases now. if ( $length == 2 && $first2chars=~"0([Dd])" && !$ls1 && !$ts1 ) { # 0D => 0D? or D0? $part1 = "0$1?"; $part1b = "${1}0?"; last CASE1; } if ( $length == 2 && $first2chars=~"([Dd])([0-9])" && !$ls1 && !$ts1 ) { # D5 => 0D5 or D05 $part1 = "0$1$2"; $part1b = "${1}0$2"; last CASE1; } if ( $length == 2 && $first2chars=~"([Dd])([0-9])" && $ls1 && !$ts1 ) { # *D5 => ?D5 or D05 /..(.)/; $part1 = "?$1$2"; $part1b = "${1}0$2"; last CASE1; } if ( $length == 2 && $ls1 && !$ts1 ) { # *CC ==> ?CC $part1 = "?$part1"; last CASE1; } if ( $length == 2 && !$ls1 && $ts1 ) { # CC* ==> CC? or 0CC $part1b = "0$part1"; $part1 = "$part1?"; last CASE1; } if ( $length == 2 ) { # CC or *CC* ==> 0CC $part1 = "0$part1"; last CASE1; } if ( $length == 3 && $first2chars=~"0([Dd])" && !$ls1 && !$ts1 ) { # 0D5 => 0D5 or D05 $part1b = "${1}0" . substr($part1,2,1); last CASE1; } if ( $length == 3 && $first2chars=~"([Dd])0" && !$ls1 && !$ts1 ) { # D05 => D05 or 0D5 $part1b = "0$1" . substr($part1,2,1); last CASE1; } } # End of CASE1 block. $length = length($part2); $part2b = ""; CASE2: { # If nothing was specified for part2, make it ???. if ( $length == 0 ) { # /. or . or * or */ or *. ==> ??? $part2 = "???"; last CASE2; } if ( $length == 1 && $ls2 && !$ts2) { # *C ==> ??C $part2 = "??$part2"; last CASE2; } if ( $length == 1 ) { # C or C* or *C* ==> 00C $part2 = "00$part2"; last CASE2; } if ( $length == 2 && $ls2 && !$ts2 ) { # *CC ==> ?CC $part2 = "?$part2"; last CASE2; } if ( $length == 2 && !$ls2 && $ts2 ) { # CC* ==> CC? or 0CC $part2b = "0$part2"; $part2 = "$part2?"; last CASE2; } if ( $length == 2 ) { # CC or *CC* ==> 0CC $part2 = "0$part2"; last CASE2; } } # End of CASE2 block. # The philosophy of whether or not to append an asterisk if part3 wasn't specified, # is to do so if he has not asked for a specific class, which is easily identified # by checking for wild card characters (?) in any of the first two parts. # Thus, 123, which is interpreted as 123/*, gets an asterisk (123???*), but # 12/34 is a specific class, so doesn't (012/034). if ($part3 eq "" && index($part1.$part1b.$part2.$part2b,"?")>=0 ) { $part3="*" } if ($part1b eq "" && $part2b eq "") { $term = sprintf '%03s%03s%s', $part1, $part2, $part3; } elsif ($part1b ne "" && $part2b eq "") { $term = sprintf '(%03s%03s%s,%03s%03s%s)', $part1, $part2, $part3, $part1b, $part2, $part3; } elsif ($part1b eq "" && $part2b ne "") { $term = sprintf '(%03s%03s%s,%03s%03s%s)', $part1, $part2, $part3, $part1, $part2b, $part3; } else { $term = sprintf '(%03s%03s%s,%03s%03s%s,%03s%03s%s,%03s%03s%s)', $part1, $part2, $part3, $part1b, $part2, $part3, $part1, $part2b, $part3, $part1b, $part2b, $part3; } } else { # Else this term didn't match our "don't muck with it" match. $term = $_; } # Rebuild the search string with the commas we're parsing on, in our foreach/split loop. if ($output eq "") { # First term? I.E. First time through? $output = $term; # If so, it doesn't get a comma } else { # Else separate this term from earlier $output = $output . "," . $term; # ones with a comma. } } # End of foreach split ... } # End of else don't pass everything on untouched return $output; }