Changeset 231 for trunk/supersearch.cgi

Show
Ignore:
Timestamp:
09/01/03 15:21:11 (5 years ago)
Author:
kake
Message:

0.24

Fix to recent changes so minor changes don't mask major ones.
Fix supersearch.cgi to use a template instead of CGI.pm to avoid
weird errors, also turned it into a module and added tests.

Files:
1 modified

Legend:

Unmodified
Added
Removed
  • trunk/supersearch.cgi

    r202 r231  
    22 
    33use strict; 
    4 use warnings; 
    54 
    6 use vars qw( $VERSION ); 
    7 $VERSION = '1.06'; 
     5use CGI; 
     6use Config::Tiny; 
     7use OpenGuides::SuperSearch; 
    88 
    9 use CGI qw(:standard *ol *div); 
    10 use CGI::Carp qw(fatalsToBrowser);      #Remove fatalsToBrowser if paranoid 
    11  
    12 use Parse::RecDescent; 
    13 use Data::Dumper; 
    14 use File::Spec::Functions qw(:ALL); 
    15 use Config::Tiny; 
    16 use OpenGuides::Utils; 
    17  
    18 my $config = Config::Tiny->read('wiki.conf'); 
    19  
    20 my $wiki = OpenGuides::Utils->make_wiki_object( config => $config ); 
    21  
    22 use vars qw($wiki_dbpath $wikimain $css $head %wikitext); 
    23  
    24 $wikimain = $config->{_}->{script_url} . $config->{_}->{script_name}; 
    25 $css = $config->{_}->{stylesheet_url}; 
    26 $head = $config->{_}->{site_name} . " Search"; 
    27  
    28 # sub matched_items is called with parse tree. Uses horrible subname concatenation - this 
    29 # could be rewritten to us OO instead and be much neater. This would be a major refactor: 
    30 # need to address design issues - patterns? 
    31  
    32 sub matched_items { 
    33         my $op = shift; 
    34          
    35         no strict 'subs'; 
    36         goto &{matched_.$op}; 
    37 } 
    38  
    39 # sub mungepage is used to filter out undesirable markup from the raw wiki text 
    40  
    41 sub mungepage { 
    42         my $text = shift; 
    43  
    44 # Remove HTML tags (sort of) 
    45  
    46         $text =~ s/<.*?>//g; 
    47  
    48 # Change WikiLinks into plain text 
    49  
    50         # titled WikiLink 
    51         $text =~ s/\[\[(.*?)\|(.*?)\]\]/$2/g; 
    52  
    53         # normal WikiLink 
    54         $text =~ s/\[\[(.*?)\]\]/$1/g; 
    55  
    56         # titled web link 
    57         $text =~ s/\[(.*?) (.*?)\]/$2/g; 
    58          
    59  
    60 # Remove WikiFormatting 
    61  
    62         $text =~ s/=//g;        # heading 
    63         $text =~ s/'''//g;      # bold 
    64         $text =~ s/''//g;       # italic 
    65         $text =~ s/\*//g;       # bullet point 
    66         $text =~ s/----//g;     # horizontal rule 
    67  
    68 # Change "#REDIRECT" to something prettier 
    69  
    70         $text =~ s/\#REDIRECT/\(redirect\)/g; 
    71  
    72         $text; 
    73 } 
    74  
    75 sub prime_wikitext { 
    76         my $search = shift; 
    77  
    78         my %res = $wiki->search_nodes($search,' ','or'); 
    79  
    80         for (keys %res) { 
    81                 $wikitext{$wiki->formatter->node_name_to_node_param($_)}  
    82                         ||= mungepage($_ . ' ' . $wiki->retrieve_node($_)); 
    83         } 
    84 } 
    85          
    86 # Build HTML search form with appropriate headers. 
    87 # Don't output it just yet 
    88  
    89 my $outstr = header . start_html(-style => {src => $css}, -title => $head) . 
    90               div({ -id => 'header'}, h1($head)) . start_div({-id => 'content'}) . 
    91  
    92                 p(small("Version $VERSION. See the <a 
    93 href=\"http://grault.net/cgi-bin/grubstreet.pl?Search_Script\">information 
    94 page</a> for help and more details.")). "\n". 
    95                 start_form( -method => "GET" ) .  
    96                 textfield( 
    97                         -name=>'search', 
    98                         -size=>50, 
    99                         -maxlength=>80) . " " . 
    100                         submit('go','Go') . 
    101                 end_form . "\n"; 
    102  
    103 # Do we have an existing search? if so, do it. 
    104  
    105 my $q = CGI->new; 
    106 my $srh = $q->param('search'); 
    107  
    108 RESULTS: 
    109 { 
    110         if ($srh) { 
    111                  
    112 # Check for only valid characters in tainted search param 
    113 # (quoted literals are OK, as they are escaped) 
    114  
    115                 if ($srh !~ /^("[^"]*"|[\w \-'&|()!*%\[\]])+$/i) { #" 
    116                         print $outstr,h1("Search expression contains invalid character"); 
    117                         last RESULTS; 
    118                 } 
    119  
    120 # Build RecDescent grammar for search syntax. 
    121 # Note: '&' and '|' can be replaced with other non-alpha. This may be needed if 
    122 # you need to call the script from METHOD=GET (as & is a separator) 
    123 # Also, word: pattern could be changed to include numbers and handle locales properly. 
    124 # However, quoted literals are usually good enough for most odd characters. 
    125          
    126                 my $parse = Parse::RecDescent->new(q{ 
    127  
    128                         search: list eostring {$return = $item[1]} 
    129  
    130                         list: <leftop: comby '|' comby>  
    131                                 {$return = (@{$item[1]}>1) ? ['OR', @{$item[1]}] : $item[1][0]} 
    132  
    133                         comby: <leftop: term '&' term>  
    134                                 {$return = (@{$item[1]}>1) ? ['AND', @{$item[1]}] : $item[1][0]} 
    135  
    136                         term: '(' list ')' {$return = $item[2]} 
    137                         |               '!' term {$return = ['NOT', @{$item[2]}]} 
    138                         |               '"' /[^"]*/ '"' {$return = ['literal', $item[2]]} 
    139                         |               word(s) {$return = ['word', @{$item[1]}]} 
    140                         |               '[' word(s) ']' {$return = ['title', @{$item[2]}]} 
    141  
    142                         word: /[\w'*%]+/ {$return = $item[1]} 
    143                          
    144                         eostring: /^\Z/ 
    145  
    146                 }) or die $@;   
    147  
    148 # Turn search string into parse tree 
    149          
    150                 my $tree = $parse->search($srh) or (print $outstr,h1("Search syntax error")),last RESULTS; 
    151 #               print $outstr,pre(Dumper($tree)); 
    152  
    153                 my $startpos = $q->param('next') || 0; 
    154  
    155 # Apply search 
    156 # Do different things depending on how many results: 
    157  
    158                 my %results = matched_items(@$tree); 
    159                 my $numres = scalar(keys %results); 
    160  
    161 # 0 results - 'No items matched' 
    162  
    163                 (print $outstr,hr,h2('No items matched')),last RESULTS if !$numres; 
    164  
    165 # 1 result - redirect to the page 
    166  
    167                 if ($numres == 1) { 
    168                         my ($pag) = each %results; 
    169                         print redirect($wikimain.'?'.$pag); 
    170                         exit; 
    171                 } 
    172  
    173 # Otherwise browse selection of results, 20 at a time 
    174  
    175                 print $outstr,hr,h2('Search Results'),start_ol({start=>$startpos+1}),"\n"; 
    176  
    177 # Sort the results - first index of array in HoA is the score. 
    178  
    179                 my @res_selected = sort {$results{$b}[0] <=> $results{$a}[0]} keys %results; 
    180                 my $tot_results = @res_selected; 
    181  
    182 # Skip those before $startpos 
    183  
    184                 splice @res_selected,0,$startpos; 
    185  
    186 # Display 20 
    187  
    188                 for (@res_selected[0..19]) { 
    189                         (print end_ol,"\n"),last RESULTS if !$_; 
    190                          
    191                         print p(li(a({href=>$wikimain."?$_"},b($_)),br,@{$results{$_}}[1..6])); 
    192                 } 
    193  
    194 # More to do: display 'out of' how many, and 'more' button 
    195  
    196                 print end_ol,p($startpos+20,'/',$tot_results,"matches"),"\n"; 
    197  
    198                 if ($tot_results > $startpos + 20) { 
    199                         my $nq = CGI->new(''); 
    200                         print start_form, 
    201                                 $nq->hidden( -name=>'search', 
    202                                         -value=>$srh), 
    203                                 $nq->hidden( -name=>'next', 
    204                                         -value=>($startpos + 20)), 
    205                                 submit( 'More'), 
    206                                 end_form; 
    207                 } 
    208         } else { 
    209                 print $outstr; 
    210         } 
    211 } 
    212  
    213 print end_div, end_html,"\n"; 
    214  
    215 ######### End of main program. 
    216  
    217 # Utility routines to actually do the search 
    218  
    219 sub do_search { 
    220         my $wmatch = shift; 
    221  
    222 # Build regexp from parameter. Gobble upto 60 characters of context either side. 
    223  
    224         my $wexp = qr/\b.{0,60}\b$wmatch\b.{0,60}\b/is; 
    225         my %res; 
    226  
    227 # Search every wiki page for matches 
    228          
    229         while (my ($k,$v) = each %wikitext) { 
    230                 my @out; 
    231                 for ($v =~ /$wexp/g) { 
    232                         my $match .= "...$_..."; 
    233                         $match =~ s/<[^>]+>//gs; 
    234                         $match =~ s!\b($wmatch)\b!<b>$&</b>!i; 
    235                         push @out,$match; 
    236                 } 
    237                 my $temp = $k; 
    238                 $temp =~ s/_/ /g; 
    239  
    240 # Compute score and prepend to array of matches 
    241  
    242                 my $score = @out; 
    243                 $score +=10 if $temp =~ /$wexp/; 
    244                 $res{$k} = unshift(@out,$score) && \@out if @out; 
    245         } 
    246          
    247         %res; 
    248 } 
    249  
    250 sub intersperse { 
    251         my $pagnam = shift; 
    252          
    253         my @mixed;    
    254         my $score = 0; 
    255          
    256         for my $j (@_) { 
    257                 if (exists $j->{$pagnam}) { 
    258                         $score += $j->{$pagnam}[0]; 
    259                         push @mixed,[$_,$j->{$pagnam}[$_]] for 1..$#{$j->{$pagnam}}; 
    260                 } 
    261         } 
    262          
    263         my @interspersed = map $_->[1], sort {$a->[0] <=> $b->[0]} @mixed; 
    264          
    265         unshift @interspersed,$score; 
    266          
    267         \@interspersed; 
    268 } 
    269  
    270 # matched_word - we have a list of adjacent words. Words are allowed to contain 
    271 # wildcards * and % 
    272  
    273 sub matched_word { 
    274  
    275         my $wmatch = join '\W+',@_; 
    276         $wmatch =~ s/%/\\w/g; 
    277         $wmatch =~ s/\*/\\w*/g; 
    278  
    279 # Read in pages from the database that are candidates for the search. 
    280         prime_wikitext(join ' ',@_); 
    281  
    282         do_search($wmatch); 
    283 } 
    284  
    285 # matched_literal - we have a literal. 
    286  
    287 sub matched_literal { 
    288         my $lit = shift; 
    289          
    290         do_search(quotemeta $lit); 
    291 } 
    292  
    293 # matched_title - title only search, we have a list of words 
    294  
    295 sub matched_title { 
    296         my $wmatch = join '\W+',@_; 
    297         $wmatch =~ s/%/\\w/g; 
    298         $wmatch =~ s/\*/\\w*/g; 
    299  
    300         my $wexp = qr/\b$wmatch\b/is; 
    301         my %res; 
    302          
    303         for (keys %wikitext) { 
    304                 $res{$_} = [10] if /$wexp/g; 
    305         } 
    306          
    307         %res; 
    308 } 
    309  
    310  
    311 # matched_AND - we have a combination of subsearches. 
    312  
    313 sub matched_AND { 
    314  
    315 # Do all the searches 
    316  
    317         my @comby_res = map {my %match_hash = matched_items(@$_);\%match_hash} @_; 
    318  
    319 # Use the first one's results as a basis for the output hash 
    320          
    321         my @out= keys %{$comby_res[0]}; 
    322         my %out; 
    323  
    324 # Zap out any entries which do not appear in one of the other searches. 
    325          
    326         PAGE: 
    327         for my $page (@out) { 
    328                 for (@comby_res[1..$#comby_res]) { 
    329                         (delete $out{$page}),next PAGE if !exists $_->{$page}; 
    330                 } 
    331                  
    332                 $out{$page} = intersperse($page,@comby_res); 
    333         } 
    334          
    335         %out; 
    336 } 
    337  
    338 # matched_OR - we have a list of subsearches 
    339  
    340 sub matched_OR { 
    341  
    342 # Do all the searches 
    343  
    344         my @list_res = map {my %match_hash = matched_items(@$_);\%match_hash} @_; 
    345          
    346         my %union; 
    347  
    348 # Apply union of hashes, merging any duplicates. 
    349          
    350         for (@list_res) { 
    351                 while (my ($k,$v) = each %$_) { 
    352                         $union{$k}++; 
    353                 } 
    354         } 
    355          
    356         my %out; 
    357          
    358         $out{$_} = intersperse($_,@list_res) for keys %union; 
    359          
    360         %out; 
    361 } 
    362  
    363 # matched_NOT - Form complement of hash against %wikitext 
    364  
    365 sub matched_NOT { 
    366  
    367         my %excludes = matched_items(@_); 
    368         my %out = map {$_=>[0]} keys %wikitext; 
    369  
    370         delete $out{$_} for keys %excludes; 
    371         %out; 
    372 } 
    373  
    374 =head1 NAME 
    375  
    376 supersearch.cgi - Search script for OpenGuides. 
    377  
    378 =head1 SYNOPSIS 
    379  
    380 Invoked as a CGI script. 
    381  
    382 Examples of search strings: 
    383  
    384 king's head 
    385 king's head&fullers 
    386 coach and horses|crown and anchor 
    387 (vegetarian|vegan)&takeaway 
    388 category restaurants&!expensive 
    389  
    390 =head1 DESCRIPTION 
    391  
    392 This script presents a single search form when called. The search 
    393 string is parsed with a full RecDescent grammar, and the wiki pages 
    394 are searched for matches. 
    395  
    396 Borrowing from Perl (or C) & represents AND, | represents OR, and ! 
    397 represents NOT. 
    398  
    399 =head1 AUTHOR 
    400  
    401 The OpenGuides Project (openguides-dev@openguides.org) 
    402  
    403 =head1 COPYRIGHT 
    404  
    405      Copyright (C) 2003 The OpenGuides Project.  All Rights Reserved. 
    406  
    407 The OpenGuides distribution is free software; you can redistribute it 
    408 and/or modify it under the same terms as Perl itself. 
    409  
    410 =head1 CREDITS 
    411  
    412 Most of the work in this script done by 
    413 I. Williams, E<lt>ivor.williams@tiscali.co.ukE<gt> 
    414  
    415 =head1 SEE ALSO 
    416  
    417 L<OpenGuides> 
     9my $config = Config::Tiny->read( "wiki.conf" ); 
     10my $search = OpenGuides::SuperSearch->new( config => $config ); 
     11my %vars = CGI::Vars(); 
     12$search->run( vars => \%vars );