| 9 | | use CGI qw(:standard *ol *div); |
| 10 | | use CGI::Carp qw(fatalsToBrowser); #Remove fatalsToBrowser if paranoid |
| 11 | | |
| 12 | | use Parse::RecDescent; |
| 13 | | use Data::Dumper; |
| 14 | | use File::Spec::Functions qw(:ALL); |
| 15 | | use Config::Tiny; |
| 16 | | use OpenGuides::Utils; |
| 17 | | |
| 18 | | my $config = Config::Tiny->read('wiki.conf'); |
| 19 | | |
| 20 | | my $wiki = OpenGuides::Utils->make_wiki_object( config => $config ); |
| 21 | | |
| 22 | | use vars qw($wiki_dbpath $wikimain $css $head %wikitext); |
| 23 | | |
| 24 | | $wikimain = $config->{_}->{script_url} . $config->{_}->{script_name}; |
| 25 | | $css = $config->{_}->{stylesheet_url}; |
| 26 | | $head = $config->{_}->{site_name} . " Search"; |
| 27 | | |
| 28 | | # sub matched_items is called with parse tree. Uses horrible subname concatenation - this |
| 29 | | # could be rewritten to us OO instead and be much neater. This would be a major refactor: |
| 30 | | # need to address design issues - patterns? |
| 31 | | |
| 32 | | sub matched_items { |
| 33 | | my $op = shift; |
| 34 | | |
| 35 | | no strict 'subs'; |
| 36 | | goto &{matched_.$op}; |
| 37 | | } |
| 38 | | |
| 39 | | # sub mungepage is used to filter out undesirable markup from the raw wiki text |
| 40 | | |
| 41 | | sub mungepage { |
| 42 | | my $text = shift; |
| 43 | | |
| 44 | | # Remove HTML tags (sort of) |
| 45 | | |
| 46 | | $text =~ s/<.*?>//g; |
| 47 | | |
| 48 | | # Change WikiLinks into plain text |
| 49 | | |
| 50 | | # titled WikiLink |
| 51 | | $text =~ s/\[\[(.*?)\|(.*?)\]\]/$2/g; |
| 52 | | |
| 53 | | # normal WikiLink |
| 54 | | $text =~ s/\[\[(.*?)\]\]/$1/g; |
| 55 | | |
| 56 | | # titled web link |
| 57 | | $text =~ s/\[(.*?) (.*?)\]/$2/g; |
| 58 | | |
| 59 | | |
| 60 | | # Remove WikiFormatting |
| 61 | | |
| 62 | | $text =~ s/=//g; # heading |
| 63 | | $text =~ s/'''//g; # bold |
| 64 | | $text =~ s/''//g; # italic |
| 65 | | $text =~ s/\*//g; # bullet point |
| 66 | | $text =~ s/----//g; # horizontal rule |
| 67 | | |
| 68 | | # Change "#REDIRECT" to something prettier |
| 69 | | |
| 70 | | $text =~ s/\#REDIRECT/\(redirect\)/g; |
| 71 | | |
| 72 | | $text; |
| 73 | | } |
| 74 | | |
| 75 | | sub prime_wikitext { |
| 76 | | my $search = shift; |
| 77 | | |
| 78 | | my %res = $wiki->search_nodes($search,' ','or'); |
| 79 | | |
| 80 | | for (keys %res) { |
| 81 | | $wikitext{$wiki->formatter->node_name_to_node_param($_)} |
| 82 | | ||= mungepage($_ . ' ' . $wiki->retrieve_node($_)); |
| 83 | | } |
| 84 | | } |
| 85 | | |
| 86 | | # Build HTML search form with appropriate headers. |
| 87 | | # Don't output it just yet |
| 88 | | |
| 89 | | my $outstr = header . start_html(-style => {src => $css}, -title => $head) . |
| 90 | | div({ -id => 'header'}, h1($head)) . start_div({-id => 'content'}) . |
| 91 | | |
| 92 | | p(small("Version $VERSION. See the <a |
| 93 | | href=\"http://grault.net/cgi-bin/grubstreet.pl?Search_Script\">information |
| 94 | | page</a> for help and more details.")). "\n". |
| 95 | | start_form( -method => "GET" ) . |
| 96 | | textfield( |
| 97 | | -name=>'search', |
| 98 | | -size=>50, |
| 99 | | -maxlength=>80) . " " . |
| 100 | | submit('go','Go') . |
| 101 | | end_form . "\n"; |
| 102 | | |
| 103 | | # Do we have an existing search? if so, do it. |
| 104 | | |
| 105 | | my $q = CGI->new; |
| 106 | | my $srh = $q->param('search'); |
| 107 | | |
| 108 | | RESULTS: |
| 109 | | { |
| 110 | | if ($srh) { |
| 111 | | |
| 112 | | # Check for only valid characters in tainted search param |
| 113 | | # (quoted literals are OK, as they are escaped) |
| 114 | | |
| 115 | | if ($srh !~ /^("[^"]*"|[\w \-'&|()!*%\[\]])+$/i) { #" |
| 116 | | print $outstr,h1("Search expression contains invalid character"); |
| 117 | | last RESULTS; |
| 118 | | } |
| 119 | | |
| 120 | | # Build RecDescent grammar for search syntax. |
| 121 | | # Note: '&' and '|' can be replaced with other non-alpha. This may be needed if |
| 122 | | # you need to call the script from METHOD=GET (as & is a separator) |
| 123 | | # Also, word: pattern could be changed to include numbers and handle locales properly. |
| 124 | | # However, quoted literals are usually good enough for most odd characters. |
| 125 | | |
| 126 | | my $parse = Parse::RecDescent->new(q{ |
| 127 | | |
| 128 | | search: list eostring {$return = $item[1]} |
| 129 | | |
| 130 | | list: <leftop: comby '|' comby> |
| 131 | | {$return = (@{$item[1]}>1) ? ['OR', @{$item[1]}] : $item[1][0]} |
| 132 | | |
| 133 | | comby: <leftop: term '&' term> |
| 134 | | {$return = (@{$item[1]}>1) ? ['AND', @{$item[1]}] : $item[1][0]} |
| 135 | | |
| 136 | | term: '(' list ')' {$return = $item[2]} |
| 137 | | | '!' term {$return = ['NOT', @{$item[2]}]} |
| 138 | | | '"' /[^"]*/ '"' {$return = ['literal', $item[2]]} |
| 139 | | | word(s) {$return = ['word', @{$item[1]}]} |
| 140 | | | '[' word(s) ']' {$return = ['title', @{$item[2]}]} |
| 141 | | |
| 142 | | word: /[\w'*%]+/ {$return = $item[1]} |
| 143 | | |
| 144 | | eostring: /^\Z/ |
| 145 | | |
| 146 | | }) or die $@; |
| 147 | | |
| 148 | | # Turn search string into parse tree |
| 149 | | |
| 150 | | my $tree = $parse->search($srh) or (print $outstr,h1("Search syntax error")),last RESULTS; |
| 151 | | # print $outstr,pre(Dumper($tree)); |
| 152 | | |
| 153 | | my $startpos = $q->param('next') || 0; |
| 154 | | |
| 155 | | # Apply search |
| 156 | | # Do different things depending on how many results: |
| 157 | | |
| 158 | | my %results = matched_items(@$tree); |
| 159 | | my $numres = scalar(keys %results); |
| 160 | | |
| 161 | | # 0 results - 'No items matched' |
| 162 | | |
| 163 | | (print $outstr,hr,h2('No items matched')),last RESULTS if !$numres; |
| 164 | | |
| 165 | | # 1 result - redirect to the page |
| 166 | | |
| 167 | | if ($numres == 1) { |
| 168 | | my ($pag) = each %results; |
| 169 | | print redirect($wikimain.'?'.$pag); |
| 170 | | exit; |
| 171 | | } |
| 172 | | |
| 173 | | # Otherwise browse selection of results, 20 at a time |
| 174 | | |
| 175 | | print $outstr,hr,h2('Search Results'),start_ol({start=>$startpos+1}),"\n"; |
| 176 | | |
| 177 | | # Sort the results - first index of array in HoA is the score. |
| 178 | | |
| 179 | | my @res_selected = sort {$results{$b}[0] <=> $results{$a}[0]} keys %results; |
| 180 | | my $tot_results = @res_selected; |
| 181 | | |
| 182 | | # Skip those before $startpos |
| 183 | | |
| 184 | | splice @res_selected,0,$startpos; |
| 185 | | |
| 186 | | # Display 20 |
| 187 | | |
| 188 | | for (@res_selected[0..19]) { |
| 189 | | (print end_ol,"\n"),last RESULTS if !$_; |
| 190 | | |
| 191 | | print p(li(a({href=>$wikimain."?$_"},b($_)),br,@{$results{$_}}[1..6])); |
| 192 | | } |
| 193 | | |
| 194 | | # More to do: display 'out of' how many, and 'more' button |
| 195 | | |
| 196 | | print end_ol,p($startpos+20,'/',$tot_results,"matches"),"\n"; |
| 197 | | |
| 198 | | if ($tot_results > $startpos + 20) { |
| 199 | | my $nq = CGI->new(''); |
| 200 | | print start_form, |
| 201 | | $nq->hidden( -name=>'search', |
| 202 | | -value=>$srh), |
| 203 | | $nq->hidden( -name=>'next', |
| 204 | | -value=>($startpos + 20)), |
| 205 | | submit( 'More'), |
| 206 | | end_form; |
| 207 | | } |
| 208 | | } else { |
| 209 | | print $outstr; |
| 210 | | } |
| 211 | | } |
| 212 | | |
| 213 | | print end_div, end_html,"\n"; |
| 214 | | |
| 215 | | ######### End of main program. |
| 216 | | |
| 217 | | # Utility routines to actually do the search |
| 218 | | |
| 219 | | sub do_search { |
| 220 | | my $wmatch = shift; |
| 221 | | |
| 222 | | # Build regexp from parameter. Gobble upto 60 characters of context either side. |
| 223 | | |
| 224 | | my $wexp = qr/\b.{0,60}\b$wmatch\b.{0,60}\b/is; |
| 225 | | my %res; |
| 226 | | |
| 227 | | # Search every wiki page for matches |
| 228 | | |
| 229 | | while (my ($k,$v) = each %wikitext) { |
| 230 | | my @out; |
| 231 | | for ($v =~ /$wexp/g) { |
| 232 | | my $match .= "...$_..."; |
| 233 | | $match =~ s/<[^>]+>//gs; |
| 234 | | $match =~ s!\b($wmatch)\b!<b>$&</b>!i; |
| 235 | | push @out,$match; |
| 236 | | } |
| 237 | | my $temp = $k; |
| 238 | | $temp =~ s/_/ /g; |
| 239 | | |
| 240 | | # Compute score and prepend to array of matches |
| 241 | | |
| 242 | | my $score = @out; |
| 243 | | $score +=10 if $temp =~ /$wexp/; |
| 244 | | $res{$k} = unshift(@out,$score) && \@out if @out; |
| 245 | | } |
| 246 | | |
| 247 | | %res; |
| 248 | | } |
| 249 | | |
| 250 | | sub intersperse { |
| 251 | | my $pagnam = shift; |
| 252 | | |
| 253 | | my @mixed; |
| 254 | | my $score = 0; |
| 255 | | |
| 256 | | for my $j (@_) { |
| 257 | | if (exists $j->{$pagnam}) { |
| 258 | | $score += $j->{$pagnam}[0]; |
| 259 | | push @mixed,[$_,$j->{$pagnam}[$_]] for 1..$#{$j->{$pagnam}}; |
| 260 | | } |
| 261 | | } |
| 262 | | |
| 263 | | my @interspersed = map $_->[1], sort {$a->[0] <=> $b->[0]} @mixed; |
| 264 | | |
| 265 | | unshift @interspersed,$score; |
| 266 | | |
| 267 | | \@interspersed; |
| 268 | | } |
| 269 | | |
| 270 | | # matched_word - we have a list of adjacent words. Words are allowed to contain |
| 271 | | # wildcards * and % |
| 272 | | |
| 273 | | sub matched_word { |
| 274 | | |
| 275 | | my $wmatch = join '\W+',@_; |
| 276 | | $wmatch =~ s/%/\\w/g; |
| 277 | | $wmatch =~ s/\*/\\w*/g; |
| 278 | | |
| 279 | | # Read in pages from the database that are candidates for the search. |
| 280 | | prime_wikitext(join ' ',@_); |
| 281 | | |
| 282 | | do_search($wmatch); |
| 283 | | } |
| 284 | | |
| 285 | | # matched_literal - we have a literal. |
| 286 | | |
| 287 | | sub matched_literal { |
| 288 | | my $lit = shift; |
| 289 | | |
| 290 | | do_search(quotemeta $lit); |
| 291 | | } |
| 292 | | |
| 293 | | # matched_title - title only search, we have a list of words |
| 294 | | |
| 295 | | sub matched_title { |
| 296 | | my $wmatch = join '\W+',@_; |
| 297 | | $wmatch =~ s/%/\\w/g; |
| 298 | | $wmatch =~ s/\*/\\w*/g; |
| 299 | | |
| 300 | | my $wexp = qr/\b$wmatch\b/is; |
| 301 | | my %res; |
| 302 | | |
| 303 | | for (keys %wikitext) { |
| 304 | | $res{$_} = [10] if /$wexp/g; |
| 305 | | } |
| 306 | | |
| 307 | | %res; |
| 308 | | } |
| 309 | | |
| 310 | | |
| 311 | | # matched_AND - we have a combination of subsearches. |
| 312 | | |
| 313 | | sub matched_AND { |
| 314 | | |
| 315 | | # Do all the searches |
| 316 | | |
| 317 | | my @comby_res = map {my %match_hash = matched_items(@$_);\%match_hash} @_; |
| 318 | | |
| 319 | | # Use the first one's results as a basis for the output hash |
| 320 | | |
| 321 | | my @out= keys %{$comby_res[0]}; |
| 322 | | my %out; |
| 323 | | |
| 324 | | # Zap out any entries which do not appear in one of the other searches. |
| 325 | | |
| 326 | | PAGE: |
| 327 | | for my $page (@out) { |
| 328 | | for (@comby_res[1..$#comby_res]) { |
| 329 | | (delete $out{$page}),next PAGE if !exists $_->{$page}; |
| 330 | | } |
| 331 | | |
| 332 | | $out{$page} = intersperse($page,@comby_res); |
| 333 | | } |
| 334 | | |
| 335 | | %out; |
| 336 | | } |
| 337 | | |
| 338 | | # matched_OR - we have a list of subsearches |
| 339 | | |
| 340 | | sub matched_OR { |
| 341 | | |
| 342 | | # Do all the searches |
| 343 | | |
| 344 | | my @list_res = map {my %match_hash = matched_items(@$_);\%match_hash} @_; |
| 345 | | |
| 346 | | my %union; |
| 347 | | |
| 348 | | # Apply union of hashes, merging any duplicates. |
| 349 | | |
| 350 | | for (@list_res) { |
| 351 | | while (my ($k,$v) = each %$_) { |
| 352 | | $union{$k}++; |
| 353 | | } |
| 354 | | } |
| 355 | | |
| 356 | | my %out; |
| 357 | | |
| 358 | | $out{$_} = intersperse($_,@list_res) for keys %union; |
| 359 | | |
| 360 | | %out; |
| 361 | | } |
| 362 | | |
| 363 | | # matched_NOT - Form complement of hash against %wikitext |
| 364 | | |
| 365 | | sub matched_NOT { |
| 366 | | |
| 367 | | my %excludes = matched_items(@_); |
| 368 | | my %out = map {$_=>[0]} keys %wikitext; |
| 369 | | |
| 370 | | delete $out{$_} for keys %excludes; |
| 371 | | %out; |
| 372 | | } |
| 373 | | |
| 374 | | =head1 NAME |
| 375 | | |
| 376 | | supersearch.cgi - Search script for OpenGuides. |
| 377 | | |
| 378 | | =head1 SYNOPSIS |
| 379 | | |
| 380 | | Invoked as a CGI script. |
| 381 | | |
| 382 | | Examples of search strings: |
| 383 | | |
| 384 | | king's head |
| 385 | | king's head&fullers |
| 386 | | coach and horses|crown and anchor |
| 387 | | (vegetarian|vegan)&takeaway |
| 388 | | category restaurants&!expensive |
| 389 | | |
| 390 | | =head1 DESCRIPTION |
| 391 | | |
| 392 | | This script presents a single search form when called. The search |
| 393 | | string is parsed with a full RecDescent grammar, and the wiki pages |
| 394 | | are searched for matches. |
| 395 | | |
| 396 | | Borrowing from Perl (or C) & represents AND, | represents OR, and ! |
| 397 | | represents NOT. |
| 398 | | |
| 399 | | =head1 AUTHOR |
| 400 | | |
| 401 | | The OpenGuides Project (openguides-dev@openguides.org) |
| 402 | | |
| 403 | | =head1 COPYRIGHT |
| 404 | | |
| 405 | | Copyright (C) 2003 The OpenGuides Project. All Rights Reserved. |
| 406 | | |
| 407 | | The OpenGuides distribution is free software; you can redistribute it |
| 408 | | and/or modify it under the same terms as Perl itself. |
| 409 | | |
| 410 | | =head1 CREDITS |
| 411 | | |
| 412 | | Most of the work in this script done by |
| 413 | | I. Williams, E<lt>ivor.williams@tiscali.co.ukE<gt> |
| 414 | | |
| 415 | | =head1 SEE ALSO |
| 416 | | |
| 417 | | L<OpenGuides> |
| | 9 | my $config = Config::Tiny->read( "wiki.conf" ); |
| | 10 | my $search = OpenGuides::SuperSearch->new( config => $config ); |
| | 11 | my %vars = CGI::Vars(); |
| | 12 | $search->run( vars => \%vars ); |