tagging: Allow using titles in for related stories.
[matthijs/upstream/blosxom-plugins.git] / general / find
1 # Blosxom Plugin: Find
2 # Author: Fletcher T. Penney
3 #               advanced search concept and keywords code by Eric Sherman
4 #               Recent Searches feature based on code by Marc Nozell
5 # Version: 0.9
6 package find;
7
# --- Configurable variables -----

# Prefix of the story header line that carries search keywords.
# A value assigned before this file is loaded is left untouched.
defined($keywords_tag) or $keywords_tag = 'meta-keywords:';

# Perform search relative to the current page, not the whole site.
# If set to 1, this will override the advanced search option.
$do_local_search = 0;

# Set to 1 to always show the advanced form.
$show_advanced = 0;

# Set to 1 to display more info about search terms for debugging.
$show_debug = 0;

# Set to 1 to always do "and" searches by default.
$default_to_and = 0;

# Set to 1 to only match whole words by default.
$match_whole_words = 0;

# Log search queries to a file, and where that file lives.
$log_searches = 1;
$log_filename = $blosxom::plugin_state_dir . '/queries';

# Maximum number of old queries to display.
$max_previous_searches = 10;

# Also search writebacks?  They are looked up under $writeback_dir,
# using $writeback_ext in place of the story file extension.
$search_writebacks = 1;
$writeback_dir = $blosxom::plugin_state_dir . '/writeback';
$writeback_ext = 'wb';

# Also search filenames?
$search_filenames = 1;

# --------------------------------

# Template variables filled in by the subs below.
$results = '';
$recentsearches = '';
44
45 use CGI qw/:standard/;
46
47
# start - plugin start-up hook.
#
# Fills three package variables that templates can interpolate:
#   $path_withflavour  current path with the flavour as its extension
#                      ("<dir>/index.<flavour>" when the request has no
#                      extension), used for the "Advanced Search" link
#   $advancedform      extra radio buttons; only non-empty when the
#                      advanced_search request parameter or $show_advanced
#                      is set
#   $searchform        the complete search form
#
# Returns 1.
sub start {

    # Figure out the current path and flavour for the form
    $path_withflavour = $blosxom::path_info;
    unless ($path_withflavour =~ s/\.[^\.]*$//) {
        # No extension was present, so this is a directory-style request:
        # point the link at that directory's index page.
        $path_withflavour =~ s/\/$//;
        $path_withflavour .= "/index";
    }
    $path_withflavour =~ s/^\/*//;
    $path_withflavour .= "\.$blosxom::flavour";

    # Both values originate from the request URL and end up inside HTML
    # attributes, so they must be escaped.
    my $safe_path = escapeHTML($blosxom::path_info);
    my $safe_link = escapeHTML("$blosxom::url/$path_withflavour");

    # Insert this html code only if advanced form is indicated.  The
    # variable is reset first so an earlier request's value cannot linger.
    $advancedform = "";
    if (param('advanced_search') || $show_advanced) {
        $advancedform = qq!<br />Search:<br />
<input checked type="radio" name="domain" value="all" />Entire Site
<input type="radio" name="domain" value="topic" />This Topic Only
<br />Match:<br />
<input checked="checked" type="radio" name="type" value="any" />Any
<input type="radio" name="type" value="all" />All
<br />
<input checked="checked" type="radio" name="match" value="any" />Partial
<input type="radio" name="match" value="whole" />Whole Words only
!;
    }

    # This is the basic form
    $searchform = qq!<form method="get" action="$blosxom::url/index.$blosxom::flavour">
        <div>
                <input type="text" name="find" size="15" value=""/>
                <input type="submit" value="Search" />
                <input type="hidden" name="plugin" value="find"/>
                <input type="hidden" name="path" value="$safe_path"/>
                <br/>
                <a href="$safe_link?advanced_search=1">Advanced Search</a>
$advancedform
</div></form>!;

    1;
}
89
# filter($pkg, \%files) - filter hook.
#
# Does nothing unless the request has a true "find" parameter.  Otherwise
# the query is split into three kinds of terms:
#   optional  ("a b", "a or b")   - at least one must match
#   required  ("a and b", "+a")   - every one must match
#   forbidden ("not a", "-a")     - none may match
# and every key of %files whose story (plus, if enabled, its writeback file
# and its own filename) fails those rules is deleted from the hash.
#
# Request parameters read: find, path, domain, type, match.
# Package variables written: $results (HTML summary), $searchpath.
# The configuration variables are only read; per-request choices are held
# in lexicals so they cannot carry over into a later request served by the
# same interpreter.
#
# Returns 0 when the query contains nothing to search for, 1 otherwise.
sub filter {
    # Check that writebacks are working
    #$search_writebacks = 0 if ( $writeback::writeback_dir eq "");

    my ($pkg, $files_ref) = @_;

    my $terms = param('find');
    return 1 unless $terms;

    # Missing parameters become '' so the comparisons below stay defined
    my %opt;
    for my $name (qw(path domain match type)) {
        my $value = param($name);
        $opt{$name} = defined $value ? $value : '';
    }
    my $searchtype = $opt{type};

    $searchpath = "$blosxom::datadir/$opt{path}";

    my $local_search = ($do_local_search eq 1 || $opt{domain} eq 'topic') ? 1 : 0;

    my $whole_words = ($match_whole_words eq 1) ? 1 : 0;
    $whole_words = 1 if ($opt{match} eq 'whole');
    $whole_words = 0 if ($opt{match} eq 'any');

    # type=any switches off a configured "and" default; type=all forces it
    my $and_search =
        ($searchtype eq 'all' || ($searchtype ne 'any' && $default_to_and eq 1)) ? 1 : 0;

    my (@requiredterms, @forbiddenterms);

    # The query is visitor input and $results is emitted as HTML
    $results = "These pages matched: " . escapeHTML($terms);

    if ($log_searches eq 1) {
        my $is_new_log = !-e $log_filename;
        if (open(my $log, '>>', $log_filename)) {
            chmod(0666, $log_filename) if $is_new_log;
            # One query per line, whatever the visitor sent
            (my $entry = $terms) =~ s/[\r\n]+/ /g;
            print {$log} "$entry\n";
            close($log);
        } else {
            warn "Error in find logging file.";
        }
    }

    $terms = " " . $terms;  # Add a space for pattern matching reasons

    # Handle double quotations (exact phrases)
    $terms =~ s/\"([^\"]+)\"/\[\{$1\}\]/g;
    while ($terms =~ s/\[\{([^\}]*)\s+([^\}]*)/\[\{$1\\s\+$2/g) {
    }
    $terms =~ s/\[\{/(/g;
    $terms =~ s/\}\]/)/g;
    # Any left over quotes were "odd-numbered"
    $terms =~ s/\"//g;

    # Handle parentheses
    while ($terms =~ s/\(([^\)]*)\s+([^\)]*)\)/\($1\|$2\)/g) {
    }

    # Strip trailing spaces to prevent empty terms
    # Don't strip leading spaces yet!
    $terms =~ s/\s+$//;

    # Convert English to symbols
    # The "OR"'s will wait til the end
    # Handle "NOT"'s
    $terms =~ s/\s+not\s+/ \-/ig;
    # Handle "AND"'s and convert to "+", unless preceded by "-"
    $terms =~ s/\s+(\([^\)]+\))\s+and\s+/ \+$1 \+/ig;
    $terms =~ s/\-(\([^\)]+\))\s+and\s+/\-$1 \+/ig;
    $terms =~ s/\s+([^\)]+)\s+and\s+/ \+$1 \+/ig;
    $terms =~ s/\-([^\)]+)\s+and\s+/\-$1 \+/ig;
    $terms =~ s/\+\-/\-/g;  # Fix if the second term already had "-"

    $results = "These pages matched: " . escapeHTML($terms) if ($show_debug eq 1);

    # If doing "all" search, then every term is required
    # Will not override terms already set to "NOT"
    $terms =~ s/\s+\+?([\(\)\|\w]+)/ \+$1/g if ($and_search);

    # Extract all required terms ("AND"  terms)
    while ($terms =~ s/\s+\+([\(\)\|\\\+\w]+)//) {
        my $theterm = $1;
        push(@requiredterms, _compile_term($theterm, $whole_words));
        $results .= "<br>Required Term: "
            . escapeHTML($whole_words ? "\\b$theterm\\b" : $theterm)
            if ($show_debug eq 1);
    }

    # Extract all "forbidden" terms ("NOT" terms)
    while ($terms =~ s/\s+\-([\(\)\|\\\+\w]+)//) {
        my $theterm = $1;
        push(@forbiddenterms, _compile_term($theterm, $whole_words));
        $results .= "<br>Forbidden Term: "
            . escapeHTML($whole_words ? "\\b$theterm\\b" : $theterm)
            if ($show_debug eq 1);
    }

    # Strip "OR"'s with only one term
    while ($terms =~ s/^\s*or\s+//i) {}
    while ($terms =~ s/\s+or\s*$//i) {}

    # Now cleanup for regexp's
    $terms =~ s/^\s+//;     #Strip leading and trailing spaces
    $terms =~ s/\s+$//;
    # Finally, convert all the "OR" terms to a single regexp
    $terms =~ s/\s+(or\s+)?/\|/ig;
    $terms =~ s/(\s)\+/$1/g;        # Loose '+' will crash regexp

    # Debugging Aids
    if ($show_debug eq 1) {
        my $shown = ($whole_words && $terms ne "") ? "\\b(?:$terms)\\b" : $terms;
        $results .= "<br>Remainder regexp: " . escapeHTML($shown) . "<br>";
        $results .= "Search path: " . escapeHTML($searchpath) . " <br>";
    }

    # Quit now if nothing to search for
    if ($terms eq "" && !@requiredterms && !@forbiddenterms) {
        $results = "";
        return 0;
    }

    # An empty remainder means "no optional terms".  It must not be run as
    # a pattern: Perl treats an empty match pattern as "reuse the last
    # successful pattern".
    my $optional_re = ($terms ne "") ? _compile_term($terms, $whole_words) : undef;

    # The path comes from the request, so match it literally
    my $scope_re = qr/^\Q$searchpath\E/;

    foreach my $file (keys %$files_ref) {
        if ($local_search) {
            # Limit search to the current path only
            if ($file !~ $scope_re) {
                delete $files_ref->{$file};
                next;
            }
        }

        # Stories that cannot be opened are left in the list untouched
        my $contents = _searchable_text($file);
        next unless defined $contents;

        # With no optional terms everything is a candidate; the required
        # and forbidden terms below then decide.
        my $keep = (!defined $optional_re || $contents =~ $optional_re) ? 1 : 0;
        my $delete = 0;

        # If we match required terms, keep, else delete for sure
        foreach my $required (@requiredterms) {
            if ($contents =~ $required) {
                $keep = 1;
            } else {
                $delete = 1;
            }
        }

        # If we match forbidden terms, then delete
        foreach my $forbidden (@forbiddenterms) {
            $delete = 1 if ($contents =~ $forbidden);
        }

        # Remove file if marked for delete or not marked to keep
        delete $files_ref->{$file} if ($delete || !$keep);
    }

    1;
}

# _compile_term($term, $whole_words) - compile one search term into a
# case-insensitive, single-line (/si) pattern.  The term is grouped before
# the \b anchors are added so that they apply to every alternative of
# "a|b".  Terms may contain regexp syntax typed by the visitor; if that does
# not compile, the term is matched literally instead of aborting the request.
sub _compile_term {
    my ($term, $whole_words) = @_;
    my ($open, $close) = $whole_words ? ('\b(?:', ')\b') : ('(?:', ')');

    my $compiled = eval { qr/$open$term$close/si };
    if (!defined $compiled) {
        my $literal = quotemeta($term);
        $compiled = qr/$open$literal$close/si;
    }
    return $compiled;
}

# _searchable_text($file) - return the text a story is searched by: its
# contents (keyword header line included with the tag removed, other meta-
# header lines skipped), followed by its writeback file when
# $search_writebacks is 1 and by its filename when $search_filenames is 1.
# Returns undef when the story cannot be opened.
sub _searchable_text {
    my ($file) = @_;

    open(my $story, '<', $file) or return;
    my $contents = "";
    my $past_header = 0;
    while (my $line = <$story>) {
        if (!$past_header) {
            # include keywords (tag removed, matched case-insensitively)
            if ($line =~ s/^$keywords_tag(.*)$/$1/i) {
            }
            # don't read other meta- tags
            elsif ($line =~ /^meta-/i) {
                next;
            }
            # a line starting with whitespace ends the header
            elsif ($line =~ /^\s/) {
                $past_header = 1;
            }
        }
        $contents .= $line;
    }
    close($story);

    # Now scan writebacks for this story
    if ($search_writebacks == 1) {
        my $writeback_file = $file;
        $writeback_file =~ s/\Q$blosxom::datadir\E/$writeback_dir/;
        $writeback_file =~ s/\Q$blosxom::file_extension\E$/$writeback_ext/;

        if (open(my $writeback, '<', $writeback_file)) {
            # We'll just append writebacks to the story
            while (my $line = <$writeback>) {
                $contents .= $line;
            }
            close($writeback);
        }
    }

    # If searching filenames, append that to the story for
    # searching as well
    $contents .= $file if ($search_filenames == 1);

    return $contents;
}
283
# getrecentsearches - build $recentsearches, an HTML unordered list of the
# newest logged queries (newest first, at most $max_previous_searches of
# them), read from $log_filename.
#
# When the log cannot be opened $recentsearches is left unchanged, and a
# warning is issued only if logging is enabled ($log_searches == 1).
# Returns 1.
sub getrecentsearches {
    if (open(my $log, '<', $log_filename)) {
        my @searches = reverse(<$log>);
        close($log);
        chomp(@searches);

        # Show no more entries than the log holds, so a short log does not
        # produce empty list items.
        my $limit = ($max_previous_searches > 0) ? $max_previous_searches : 0;
        splice(@searches, $limit) if (@searches > $limit);

        $recentsearches = "<ul>";
        foreach my $query (@searches) {
            # Logged queries are raw visitor input
            $recentsearches .= '<li>' . escapeHTML($query) . '</li>';
        }
        $recentsearches .= "</ul>";
    } else {
        warn "Couldn't open $log_filename: $!\n" if ($log_searches == 1);
    }
    1;
}
299
# head - hook that refreshes $recentsearches through getrecentsearches().
# Returns 1.
sub head {
    getrecentsearches();
    return 1;
}
304
305
306 1;
307
308 __END__
309
310 =head1 NAME
311
312 Blosxom Plug-in: find
313
314 =head1 DESCRIPTION
315
316 Find searches through the available articles and filters out those that do not match the submitted search terms.  To use it, simply place $find::searchform in your template, and it will create a search box that automatically calls the search routine.  It performs a boolean "OR" search by default, or you can use regular expressions for more complicated search terms.
317
318 This plugin is capable of handling the following search terms
319
320 term1 term2; term1 or term2
321         These match any page with term1 OR term2
322         
323 term1 and term2; +term1 +term2
324         These match any page with both term1 AND term2
325         
326 term1 not term2; term1 -term2
327         This matches pages with term1 that DO NOT contain term2
328         
329 term1 not (term2 term3)
330         This matches pages with term1 that DO NOT contain term2 OR term3
331
332 "term1 term2 term3"
333         This matches the exact phrase, term1 term2 term3
334
335 " pen "
336         This will match the word "pen", but not the word "pencil".
337         
338 You can also use regular expressions within your search terms to further refine your searches, creating a very powerful search engine.
339
340 Additionally, you can include the most recent search requests in your blog by adding $find::recentsearches to your template.  The list is read from the query log, so $log_searches must be enabled for it to be populated.  By default, the last 10 searches are shown in an unordered list; change $max_previous_searches to alter the number displayed.
341
342 =head1 AUTHORS
343
344 Fletcher T. Penney - http://fletcher.freeshell.org
345
346 Eric Sherman            <enkidu@enkidu.bloggedup.com>
347 Marc Nozell             <marc@nozell.com> http://www.nozell.com/blog
348
349 This plugin is now maintained by the Blosxom Sourceforge Team,
350 <blosxom-devel@lists.sourceforge.net>.
351
352 =head1 LICENSE
353
354 This source is submitted to the public domain.  Feel free to use and modify it.  If you like, a comment in your modified source attributing credit for my original work would be appreciated.
355
356 THIS SOFTWARE IS PROVIDED AS IS AND WITHOUT ANY WARRANTY OF ANY KIND.  USE AT YOUR OWN RISK!