source: trunk/lib/OpenGuides/Search.pm @ 799

Last change on this file since 799 was 799, checked in by nick, 15 years ago

Add in a guide to how the searching code works, to aid in following it

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 23.3 KB
Line 
1package OpenGuides::Search;
2use strict;
3our $VERSION = '0.10';
4
5use CGI qw( :standard );
6use Wiki::Toolkit::Plugin::Locator::Grid;
7use File::Spec::Functions qw(:ALL);
8use OpenGuides::Template;
9use OpenGuides::Utils;
10use Parse::RecDescent;
11
12=head1 NAME
13
14OpenGuides::Search - Search form generation and processing for OpenGuides.
15
16=head1 DESCRIPTION
17
18Does search stuff for OpenGuides.  Distributed and installed as part of
19the OpenGuides project, not intended for independent installation.
20This documentation is probably only useful to OpenGuides developers.
21
22=head1 SYNOPSIS
23
24  use CGI;
25  use OpenGuides::Config;
26  use OpenGuides::Search;
27
28  my $config = OpenGuides::Config->new( file => "wiki.conf" );
29  my $search = OpenGuides::Search->new( config => $config );
30  my %vars = CGI::Vars();
31  $search->run( vars => \%vars );
32
33=head1 METHODS
34
35=over 4
36
37=item B<new>
38
39  my $config = OpenGuides::Config->new( file => "wiki.conf" );
40  my $search = OpenGuides::Search->new( config => $config );
41
42=cut
43
44sub new {
45    my ($class, %args) = @_;
46    my $config = $args{config};
47    my $self   = { config => $config };
48    bless $self, $class;
49
50    my $wiki = OpenGuides::Utils->make_wiki_object( config => $config );
51
52    $self->{wiki}     = $wiki;
53    $self->{wikimain} = $config->script_url . $config->script_name;
54    $self->{css}      = $config->stylesheet_url;
55    $self->{head}     = $config->site_name . " Search";
56
57    my $geo_handler = $config->geo_handler;
58    my %locator_params;
59    if ( $geo_handler == 1 ) {
60        %locator_params = ( x => "os_x", y => "os_y" );
61    } elsif ( $geo_handler == 2 ) {
62        %locator_params = ( x => "osie_x", y => "osie_y" );
63    } elsif ( $geo_handler == 3 ) {
64        %locator_params = ( x => "easting", y => "northing" );
65    }
66
67    my $locator = Wiki::Toolkit::Plugin::Locator::Grid->new( %locator_params );
68    $wiki->register_plugin( plugin => $locator );
69    $self->{locator} = $locator;
70
71    return $self;
72}
73
74=item B<wiki>
75
76  my $wiki = $search->wiki;
77
78An accessor; returns the underlying L<Wiki::Toolkit> object.
79
80=cut
81
82sub wiki {
83    my $self = shift;
84    return $self->{wiki};
85}
86
87=item B<config>
88
89  my $config = $search->config;
90
91An accessor; returns the underlying L<OpenGuides::Config> object.
92
93=cut
94
95sub config {
96    my $self = shift;
97    return $self->{config};
98}
99
100=item B<run>
101
102  my %vars = CGI::Vars();
103  $search->run(
104                vars           => \%vars,
105                return_output  => 1,   # defaults to 0
106                return_tt_vars => 1,  # defaults to 0
107              );
108
109The C<return_output> parameter is optional.  If supplied and true, the
110stuff that would normally be printed to STDOUT will be returned as a
111string instead.
112
113The C<return_tt_vars> parameter is also optional.  If supplied and
114true, the template is not processed and the variables that would have
115been passed to it are returned as a hash.  This parameter takes
116precedence over C<return_output>.
117
118These two parameters exist to make testing easier; you probably don't
119want to use them in production.
120
121
122In case you're struggling to follow the code, it does the following:
1231) Processes the parameters, and bails out if it hit a problem with them
1242) If a search string was given, do a text search
1253) If distance search paramaters were given, do a distance search
1264) If no search has occured, print out the search form
1275) If an error occured, bail out
1286) If we got a single hit on a string search, redirect to it
1297) If no results were found, give an empty search results page
1308) Sort the results by either score or distance
1319) Decide which results to show, based on paging
13210) Display the appropriate page of the results
133
134=back
135
136=cut
137
138sub run {
139    my ($self, %args) = @_;
140    $self->{return_output}  = $args{return_output}  || 0;
141    $self->{return_tt_vars} = $args{return_tt_vars} || 0;
142
143    $self->process_params( $args{vars} );
144    if ( $self->{error} ) {
145        warn $self->{error};
146        my %tt_vars = ( error_message => $self->{error} );
147        $self->process_template( tt_vars => \%tt_vars );
148        return;
149    }
150
151    my %tt_vars = (
152                   ss_version  => $VERSION,
153                   ss_info_url => 'http://openguides.org/page/search_help'
154                  );
155
156    my $doing_search;
157
158    # Run a text search if we have a search string.
159    if ( $self->{search_string} ) {
160        $doing_search = 1;
161        $tt_vars{search_terms} = $self->{search_string};
162        $self->run_text_search;
163    }
164
165    # Run a distance search if we have sufficient criteria.
166    if ( defined $self->{distance_in_metres}
167         && defined $self->{x} && defined $self->{y} ) {
168        $doing_search = 1;
169        # Make sure to pass the criteria to the template.
170        $tt_vars{dist} = $self->{distance_in_metres};
171        $tt_vars{latitude} = $self->{latitude};
172        $tt_vars{longitude} = $self->{longitude};
173        if ( $self->config->geo_handler eq 1 ) {
174            $tt_vars{coord_field_1_value} = $self->{os_x};
175            $tt_vars{coord_field_2_value} = $self->{os_y};
176        } elsif ( $self->config->geo_handler eq 2 ) {
177            $tt_vars{coord_field_1_value} = $self->{osie_x};
178            $tt_vars{coord_field_2_value} = $self->{osie_y};
179        } elsif ( $self->config->geo_handler eq 3 ) {
180            $tt_vars{coord_field_1_value} = $self->{latitude};
181            $tt_vars{coord_field_2_value} = $self->{longitude};
182        }
183        $self->run_distance_search;
184    }
185
186    # If we're not doing a search then just print the search form.
187    unless ( $doing_search ) {
188        return $self->process_template( tt_vars => \%tt_vars );
189    }
190
191    # At this point either $self->{error} or $self->{results} will be filled.
192    if ( $self->{error} ) {
193        $tt_vars{error_message} = $self->{error};
194        $self->process_template( tt_vars => \%tt_vars );
195        return;
196    }
197
198    # So now we know that we have been asked to perform a search, and we
199    # have performed it.
200    #
201    # $self->{results} will be a hash of refs to hashes like so:
202    #   'Node Name' => {
203    #                    name     => 'Node Name',
204    #                    distance => $distance_from_origin_if_any,
205    #                    score    => $relevance_to_search_string
206    #                  }
207
208    my %results_hash = %{ $self->{results} || [] };
209    my @results = values %results_hash;
210    my $numres = scalar @results;
211
212    # If we only have a single hit, and the title is a good enough match
213    # to the search string, redirect to that node.
214    # (Don't try a fuzzy search on a blank search string - Plucene chokes.)
215    if ( $self->{search_string} && $numres == 1 && !$self->{return_tt_vars}) {
216        my %fuzzies = $self->wiki->fuzzy_title_match($self->{search_string});
217        if ( scalar keys %fuzzies ) {
218            my $node = $results[0]{name};
219            my $formatter = $self->wiki->formatter;
220            my $node_param = CGI::escape(
221                            $formatter->node_name_to_node_param( $node )
222                                        );
223            my $output = CGI::redirect( $self->{wikimain} . "?$node_param" );
224            return $output if $self->{return_output};
225            print $output;
226            return;
227        }
228    }
229
230    # If we had no hits then go straight to the template.
231    if ( $numres == 0 ) {
232        %tt_vars = (
233                     %tt_vars,
234                     first_num => 0,
235                     results   => [],
236                   );
237        return $self->process_template( tt_vars => \%tt_vars );
238    }
239
240    # Otherwise, we browse through the results a page at a time.
241
242    # Figure out which results we're going to be showing on this
243    # page, and what the first one for the next page will be.
244    my $startpos = $args{vars}{next} || 0;
245    $tt_vars{first_num} = $numres ? $startpos + 1 : 0;
246    $tt_vars{last_num}  = $numres > $startpos + 20 ? $startpos + 20 : $numres;
247    $tt_vars{total_num} = $numres;
248    if ( $numres > $startpos + 20 ) {
249        $tt_vars{next_page_startpos} = $startpos + 20;
250    }
251
252    # Sort the results - by distance if we're searching on that
253    # or by score otherwise.
254    if ( $self->{distance_in_metres} ) {
255        @results = sort { $a->{distance} <=> $b->{distance} } @results;
256    } else {
257        @results = sort { $b->{score} <=> $a->{score} } @results;
258    }
259
260    # Now snip out just the ones for this page.  The -1 is because
261    # arrays index from 0 and people from 1.
262    my $from = $tt_vars{first_num} ? $tt_vars{first_num} - 1 : 0;
263    my $to   = $tt_vars{last_num} - 1; # kludge to empty arr for no results
264    @results = @results[ $from .. $to ];
265
266    # Add the URL to each result hit.
267    my $formatter = $self->wiki->formatter;
268    foreach my $i ( 0 .. $#results ) {
269        my $name = $results[$i]{name};
270
271        # Add the one-line summary of the node, if there is one.
272        my %node = $self->wiki->retrieve_node($name);
273        $results[$i]{summary} = $node{metadata}{summary}[0];
274
275        my $node_param = $formatter->node_name_to_node_param( $name );
276        $results[$i]{url} = $self->{wikimain} . "?$node_param";
277    }
278
279    # Finally pass the results to the template.
280    $tt_vars{results} = \@results;
281    $self->process_template( tt_vars => \%tt_vars );
282}
283
284sub run_text_search {
285    my $self = shift;
286    my $searchstr = $self->{search_string};
287    my $wiki = $self->wiki;
288
289    # Create parser to parse the search string.
290    my $parser = Parse::RecDescent->new( q{
291
292        search: list eostring {$return = $item[1]}
293
294        list: comby(s)
295            {$return = (@{$item[1]}>1) ? ['AND', @{$item[1]}] : $item[1][0]}
296
297        comby: <leftop: term ',' term>
298            {$return = (@{$item[1]}>1) ? ['OR', @{$item[1]}] : $item[1][0]}
299
300        term: '(' list ')' {$return = $item[2]}
301            |        '-' term {$return = ['NOT', @{$item[2]}]}
302            |        '"' word(s) '"' {$return = ['phrase', join " ", @{$item[2]}]}
303            |        word {$return = ['word', $item[1]]}
304            |        '[' word(s) ']' {$return = ['title', @{$item[2]}]}
305
306        word: /[\w'*%]+/ {$return = $item[1]}
307
308        eostring: /^\Z/
309
310    } );
311
312    unless ( $parser ) {
313        warn $@;
314        $self->{error} = "Can't create parse object - $@";
315        return $self;
316    }
317
318    # Run parser over search string.
319    my $tree = $parser->search( $searchstr );
320    unless ( $tree ) {
321        $self->{error} = "Syntax error in search: $searchstr";
322        return $self;
323    }
324
325    # Run the search over the generated search tree.
326    my %results = $self->_run_search_tree( tree => $tree );
327    $self->{results} = \%results;
328    return $self;
329}
330
331sub _run_search_tree {
332    my ($self, %args) = @_;
333    my $tree = $args{tree};
334    my @tree_arr = @$tree;
335    my $op = shift @tree_arr;
336    my $method = "_run_" . $op . "_search";
337    return $self->can($method) ? $self->$method(@tree_arr) : undef;
338}
339
340=head1 INPUT
341
342=over
343
344=item B<word>
345
346a single word will be matched as-is. For example, a search on
347
348  escalator
349
350will return all pages containing the word "escalator".
351
352=cut
353
354sub _run_word_search {
355    my ($self, $word) = @_;
356    # A word is just a small phrase.
357    return $self->_run_phrase_search( $word );
358}
359
360=item B<AND searches>
361
362A list of words with no punctuation will be ANDed, for example:
363
364  restaurant vegetarian
365
366will return all pages containing both the word "restaurant" and the word
367"vegetarian".
368
369=cut
370
371sub _run_AND_search {
372    my ($self, @subsearches) = @_;
373
374    # Do the first subsearch.
375    my %results = $self->_run_search_tree( tree => $subsearches[0] );
376
377    # Now do the rest one at a time and remove from the results anything
378    # that doesn't come up in each subsearch.  Results that survive will
379    # have a score that's the sum of their score in each subsearch.
380    foreach my $tree ( @subsearches[ 1 .. $#subsearches ] ) {
381        my %subres = $self->_run_search_tree( tree => $tree );
382        my @pages = keys %results;
383        foreach my $page ( @pages ) {
384          if ( exists $subres{$page} ) {
385                $results{$page}{score} += $subres{$page}{score};
386              } else {
387                delete $results{$page};
388            }
389        }
390      }
391
392    return %results;
393}
394
395=item B<OR searches>
396
397A list of words separated by commas (and optional spaces) will be ORed,
398for example:
399
400  restaurant, cafe
401
402will return all pages containing either the word "restaurant" or the
403word "cafe".
404
405=cut
406
407sub _run_OR_search {
408    my ($self, @subsearches) = @_;
409
410    # Do all the searches.  Results will have a score that's the sum
411    # of their score in each subsearch.
412    my %results;
413    foreach my $tree ( @subsearches ) {
414        my %subres = $self->_run_search_tree( tree => $tree );
415        foreach my $page ( keys %subres ) {
416          if ( $results{$page} ) {
417                $results{$page}{score} += $subres{$page}{score};
418              } else {
419                $results{$page} = $subres{$page};
420            }
421        }
422      }
423    return %results;
424}
425
426=item B<phrase searches>
427
428Enclose phrases in double quotes, for example:
429
430  "meat pie"
431
432will return all pages that contain the exact phrase "meat pie" - not pages
433that only contain, for example, "apple pie and meat sausage".
434
435=cut
436
437sub _run_phrase_search {
438    my ($self, $phrase) = @_;
439    my $wiki = $self->wiki;
440
441    # Search title and body.
442    my %contents_res = $wiki->search_nodes( $phrase );
443
444    # Rationalise the scores a little.  The scores returned by
445    # Wiki::Toolkit::Search::Plucene are simply a ranking.
446    my $num_results = scalar keys %contents_res;
447    foreach my $node ( keys %contents_res ) {
448        $contents_res{$node} = int( $contents_res{$node} / $num_results ) + 1;
449    }
450
451    # It'll be a real phrase (as opposed to a word) if it has a space in it.
452    # In this case, dump out the nodes that don't match the search exactly.
453    # I don't know why the phrase searching isn't working properly.  Fix later.
454    if ( $phrase =~ /\s/ ) {
455        my @tmp = keys %contents_res;
456        foreach my $node ( @tmp ) {
457            my $content = $wiki->retrieve_node( $node );
458            unless ( $content =~ /$phrase/i || $node =~ /$phrase/i ) {
459                delete $contents_res{$node};
460            }
461        }
462    }
463
464    my %results = map { $_ => { name => $_, score => $contents_res{$_} } }
465                      keys %contents_res;
466
467    # Bump up the score if the title matches.
468    foreach my $node ( keys %results ) {
469        $results{$node}{score} += 10 if $node =~ /$phrase/i;
470    }
471
472    # Search categories.
473    my @catmatches = $wiki->list_nodes_by_metadata(
474                                 metadata_type  => "category",
475                                 metadata_value => $phrase,
476                                 ignore_case    => 1,
477    );
478
479    foreach my $node ( @catmatches ) {
480        if ( $results{$node} ) {
481            $results{$node}{score} += 3;
482        } else {
483            $results{$node} = { name => $node, score => 3 };
484        }
485    }
486
487    # Search locales.
488    my @locmatches = $wiki->list_nodes_by_metadata(
489                                 metadata_type  => "locale",
490                                 metadata_value => $phrase,
491                                 ignore_case    => 1,
492    );
493
494    foreach my $node ( @locmatches ) {
495        if ( $results{$node} ) {
496            $results{$node}{score} += 3;
497        } else {
498            $results{$node} = { name => $node, score => 3 };
499        }
500    }
501
502    return %results;
503}
504
505=back
506
507=head1 SEARCHING BY DISTANCE
508
509To perform a distance search, you need to supply one of the following
510sets of criteria to specify the distance to search within, and the
511origin (centre) of the search:
512
513=over
514
515=item B<os_dist, os_x, and os_y>
516
517Only works if you chose to use British National Grid in wiki.conf
518
519=item B<osie_dist, osie_x, and osie_y>
520
521Only works if you chose to use Irish National Grid in wiki.conf
522
523=item B<latlong_dist, latitude, and longitude>
524
525Should always work, but has a habit of "finding" things a couple of
526metres away from themselves.
527
528=back
529
530You can perform both pure distance searches and distance searches in
531combination with text searches.
532
533=cut
534
535# Note this is called after any text search is run, and it is only called
536# if there are sufficient criteria to perform the search.
537sub run_distance_search {
538    my $self = shift;
539    my $x    = $self->{x};
540    my $y    = $self->{y};
541    my $dist = $self->{distance_in_metres};
542
543    my @close = $self->{locator}->find_within_distance(
544                                                        x      => $x,
545                                                        y      => $y,
546                                                        metres => $dist,
547                                                      );
548
549    if ( $self->{results} ) {
550        my %close_hash = map { $_ => 1 } @close;
551        my %results = %{ $self->{results} };
552        my @candidates = keys %results;
553        foreach my $node ( @candidates ) {
554            if ( exists $close_hash{$node} ) {
555                my $distance = $self->_get_distance(
556                                                     node => $node,
557                                                     x    => $x,
558                                                     y    => $y,
559                                                   );
560                $results{$node}{distance} = $distance;
561            } else {
562                delete $results{$node};
563            }
564        }
565        $self->{results} = \%results;
566    } else {
567        my %results;
568        foreach my $node ( @close ) {
569            my $distance = $self->_get_distance (
570                                                     node => $node,
571                                                     x    => $x,
572                                                     y    => $y,
573                                                   );
574            $results{$node} = {
575                                name     => $node,
576                                distance => $distance,
577                              };
578        }
579        $self->{results} = \%results;
580    }
581    return $self;
582}
583
584sub _get_distance {
585    my ($self, %args) = @_;
586    my ($node, $x, $y) = @args{ qw( node x y ) };
587    return $self->{locator}->distance(
588                                       from_x  => $x,
589                                       from_y  => $y,
590                                       to_node => $node,
591                                       unit    => "metres"
592                                     );
593}
594
595sub process_params {
596    my ($self, $vars_hashref) = @_;
597    my %vars = %{ $vars_hashref || {} };
598
599    # Make sure that we don't have any data left over from previous invocation.
600    # This is useful for testing purposes at the moment and will be essential
601    # for mod_perl implementations.
602    delete $self->{x};
603    delete $self->{y};
604    delete $self->{distance_in_metres};
605    delete $self->{search_string};
606
607    # Strip out any non-digits from distance and OS co-ords.
608    foreach my $param ( qw( os_x os_y osie_x osie_y
609                            osie_dist os_dist latlong_dist ) ) {
610        if ( defined $vars{$param} ) {
611            $vars{$param} =~ s/[^0-9]//g;
612            # 0 is an allowed value but the empty string isn't.
613            delete $vars{$param} if $vars{$param} eq "";
614        }
615    }
616
617    # Latitude and longitude are also allowed '-' and '.'
618    foreach my $param( qw( latitude longitude ) ) {
619        if ( defined $vars{$param} ) {
620            $vars{$param} =~ s/[^-\.0-9]//g;
621            # 0 is an allowed value but the empty string isn't.
622            delete $vars{$param} if $vars{$param} eq "";
623        }
624    }
625
626    # Set $self->{distance_in_metres}, $self->{x}, $self->{y},
627    # depending on whether we got
628    # OS co-ords or lat/long.  Only store parameters if they're complete,
629    # and supported by our method of distance calculation.
630    if ( defined $vars{os_x} && defined $vars{os_y} && defined $vars{os_dist}
631         && $self->config->geo_handler eq 1 ) {
632        $self->{x} = $vars{os_x};
633        $self->{y} = $vars{os_y};
634        $self->{distance_in_metres} = $vars{os_dist};
635    } elsif ( defined $vars{osie_x} && defined $vars{osie_y}
636         && defined $vars{osie_dist}
637         && $self->config->geo_handler eq 2 ) {
638        $self->{x} = $vars{osie_x};
639        $self->{y} = $vars{osie_y};
640        $self->{distance_in_metres} = $vars{osie_dist};
641    } elsif ( defined $vars{latitude} && defined $vars{longitude}
642              && defined $vars{latlong_dist} ) {
643        # All handlers can do lat/long, but they all do it differently.
644        if ( $self->config->geo_handler eq 1 ) {
645            require Geography::NationalGrid::GB;
646            my $point = Geography::NationalGrid::GB->new(
647                Latitude  => $vars{latitude},
648                Longitude => $vars{longitude},
649            );
650            $self->{x} = $point->easting;
651            $self->{y} = $point->northing;
652        } elsif ( $self->config->geo_handler eq 2 ) {
653            require Geography::NationalGrid::IE;
654            my $point = Geography::NationalGrid::IE->new(
655                Latitude  => $vars{latitude},
656                Longitude => $vars{longitude},
657            );
658            $self->{x} = $point->easting;
659            $self->{y} = $point->northing;
660        } elsif ( $self->config->geo_handler eq 3 ) {
661            require Geo::Coordinates::UTM;
662            my ($zone, $x, $y) = Geo::Coordinates::UTM::latlon_to_utm(
663                                                $self->config->ellipsoid,
664                                                $vars{latitude},
665                                                $vars{longitude},
666                                              );
667            $self->{x} = $x;
668            $self->{y} = $y;
669        }
670        $self->{distance_in_metres} = $vars{latlong_dist};
671    }
672
673    # Store os_x etc so we can pass them to template.
674    foreach my $param ( qw( os_x os_y osie_x osie_y latitude longitude ) ) {
675        $self->{$param} = $vars{$param};
676    }
677
678    # Strip leading and trailing whitespace from search text.
679    $vars{search} ||= ""; # avoid uninitialised value warning
680    $vars{search} =~ s/^\s*//;
681    $vars{search} =~ s/\s*$//;
682
683    # Check for only valid characters in tainted search param
684    # (quoted literals are OK, as they are escaped)
685    # This regex copied verbatim from Ivor's old supersearch.
686    if ( $vars{search}
687         && $vars{search} !~ /^("[^"]*"|[\w \-',()!*%\[\]])+$/i) {
688        $self->{error} = "Search expression $vars{search} contains invalid character(s)";
689        return $self;
690    }
691    $self->{search_string} = $vars{search};
692
693    return $self;
694}
695
696# thin wrapper around OpenGuides::Template
697sub process_template {
698    my ($self, %args) = @_;
699
700    my $tt_vars = $args{tt_vars} || {};
701    $tt_vars->{not_editable} = 1;
702    $tt_vars->{not_deletable} = 1;
703    return %$tt_vars if $self->{return_tt_vars};
704    my $output =  OpenGuides::Template->output(
705                                                wiki     => $self->wiki,
706                                                config   => $self->config,
707                                                template => "search.tt",
708                                                vars     => $tt_vars,
709                                              );
710    return $output if $self->{return_output};
711
712    print $output;
713    return 1;
714}
715
716=head1 OUTPUT
717
718Results will be put into some form of relevance ordering.  These are
719the rules we have tests for so far (and hence the only rules that can
720be relied on):
721
722=over
723
724=item *
725
726A match on page title will score higher than a match on page category
727or locale.
728
729=item *
730
731A match on page category or locale will score higher than a match on
732page content.
733
734=item *
735
736Two matches in the title beats one match in the title and one in the content.
737
738=back
739
740=cut
741
742=head1 AUTHOR
743
744The OpenGuides Project (openguides-dev@openguides.org)
745
746=head1 COPYRIGHT
747
748     Copyright (C) 2003-2004 The OpenGuides Project.  All Rights Reserved.
749
750The OpenGuides distribution is free software; you can redistribute it
751and/or modify it under the same terms as Perl itself.
752
753=head1 SEE ALSO
754
755L<OpenGuides>
756
757=cut
758
7591;
Note: See TracBrowser for help on using the repository browser.