1 |
#!/usr/bin/perl -w |
2 |
use strict; |
3 |
|
4 |
=pod |
5 |
|
6 |
This is an example program for use with swish-e's -S prog indexing method. |
7 |
|
8 |
This will scan and index a MailMan (http://www.list.org) mailing list archive. |
9 |
This script is based on original index_hypermail.pl from swish-e distribution. |
10 |
|
11 |
You might use a config file such as: |
12 |
|
13 |
IndexDir /data/swish/mailman/index_mailman.pl |
14 |
SwishProgParameters /var/lib/mailman/archives/public/ |
15 |
|
16 |
MetaNames swishtitle name email |
17 |
PropertyNames name email |
18 |
PropertyNamesDate sent |
19 |
IndexContents HTML2 .html |
20 |
StoreDescription HTML2 <body> 100000 |
21 |
UndefinedMetaTags ignore |
22 |
|
23 |
ReplaceRules replace /var/lib/mailman/archives/public/ /pipermail/ |
24 |
|
25 |
All examples assume that this is Debian system. if it's not, you might want |
26 |
to change paths above. |
27 |
|
28 |
Index with the command: |
29 |
|
30 |
./swish-e -c swish.conf -S prog |
31 |
|
32 |
See perldoc examples/swish.cgi for how to search this index. Here's a possible |
33 |
config file for use with swish.cgi: |
34 |
|
35 |
>cat .swishcgi.conf |
36 |
|
37 |
return { |
38 |
title => "Search the Foo List Archive", |
39 |
swish_binary => '../swish-e', |
40 |
display_props => [qw/ name email sent /], |
41 |
sorts => [qw/swishrank swishtitle email sent/], |
42 |
secondary_sort => [qw/sent desc/], |
43 |
metanames => [qw/swishdefault swishtitle name email/], |
44 |
name_labels => { |
45 |
swishrank => 'Rank', |
46 |
swishtitle => 'Subject Only', |
47 |
name => "Poster's Name", |
48 |
email => "Poster's Email", |
49 |
sent => 'Message Date', |
50 |
swishdefault => 'Subject & Body', |
51 |
}, |
52 |
|
53 |
highlight => { |
54 |
package => 'PhraseHighlight', |
55 |
show_words => 10, # Number of swish words words to show around highlighted word |
56 |
max_words => 100, # If no words are found to highlighted then show this many words |
57 |
occurrences => 6, # Limit number of occurrences of highlighted words |
58 |
highlight_on => '<font style="background:#FFFF99">', |
59 |
highlight_off => '</font>', |
60 |
meta_to_prop_map => { # this maps search metatags to display properties |
61 |
swishdefault => [ qw/swishtitle swishdescription/ ], |
62 |
swishtitle => [ qw/swishtitle/ ], |
63 |
email => [ qw/email/ ], |
64 |
name => [ qw/name/ ], |
65 |
swishdocpath => [ qw/swishdocpath/ ], |
66 |
}, |
67 |
}, |
68 |
date_ranges => { |
69 |
property_name => 'sent', # property name to limit by |
70 |
time_periods => [ |
71 |
'All', |
72 |
'Today', |
73 |
'Yesterday', |
74 |
'This Week', |
75 |
'Last Week', |
76 |
'Last 90 Days', |
77 |
'This Month', |
78 |
'Last Month', |
79 |
], |
80 |
|
81 |
line_break => 0, |
82 |
default => 'All', |
83 |
date_range => 1, |
84 |
}, |
85 |
}; |
86 |
|
87 |
|
88 |
|
89 |
=cut |
90 |
|
91 |
|
92 |
|
93 |
use File::Find; # for recursing a directory tree |
94 |
use Date::Parse; |
95 |
|
96 |
# Recurse the directory(s) passed in on the command line |
97 |
|
98 |
find( { wanted => \&wanted, no_chdir => 1, follow => 1 }, @ARGV ); |
99 |
|
100 |
|
101 |
sub wanted { |
102 |
return if -d; |
103 |
return unless m!(^|/)\d+\.html$!; |
104 |
|
105 |
print STDERR $File::Find::name,"\n"; |
106 |
|
107 |
my $mtime = (stat $File::Find::name )[9]; |
108 |
|
109 |
my $html = format_message($File::Find::name ); |
110 |
return unless $html; |
111 |
|
112 |
my $size = length $html; |
113 |
|
114 |
my $name = $File::Find::name; |
115 |
$name =~ s[^./][]; |
116 |
|
117 |
print <<EOF; |
118 |
Content-Length: $size |
119 |
Last-Mtime: $mtime |
120 |
Path-Name: $name |
121 |
|
122 |
EOF |
123 |
|
124 |
print $html; |
125 |
} |
126 |
|
127 |
|
128 |
|
129 |
sub format_message { |
130 |
my $file = shift; |
131 |
local $_; |
132 |
|
133 |
unless ( open FH, "<$file" ) { |
134 |
warn "Failed to open '$file'. Error: $!"; |
135 |
return; |
136 |
} |
137 |
|
138 |
my %fields; |
139 |
my $content = ''; |
140 |
my $body = ''; |
141 |
|
142 |
while (<FH>) { |
143 |
$content .= $_; |
144 |
} |
145 |
|
146 |
if ($content =~ m{<I>(.+?)</I>}s) { |
147 |
$fields{sent} = str2time $1; |
148 |
} |
149 |
|
150 |
if ($content =~ m{<B>(.+?)</B>}s) { |
151 |
$fields{name} = $1; |
152 |
} |
153 |
|
154 |
if ($content =~ m{<A\s+HREF="mailto:.+?>(.+?)</A>}s) { |
155 |
my $email = $1; |
156 |
$email =~ s/^\s+//s; |
157 |
$email =~ s/\s+$//s; |
158 |
$fields{email} = $email; |
159 |
} |
160 |
|
161 |
if ($content =~ m{<H1>(.+?)</H1>}s) { |
162 |
my $subject = $1; |
163 |
# if you have just one public mail list, you |
164 |
# might want to remove it's name from subject! |
165 |
#$subject =~ s/\[.+?\]//; |
166 |
if ( $subject =~ s/\s*Re:\s*//i ) { |
167 |
$subject .= ' (Re)'; |
168 |
} |
169 |
$fields{subject} = $subject; |
170 |
} |
171 |
|
172 |
if ($content =~ m{<!--beginarticle-->(.*)<!--endarticle-->}s) { |
173 |
$body = $1; |
174 |
} |
175 |
|
176 |
return join "\n", |
177 |
'<html>', |
178 |
'<head>', |
179 |
'<title>', |
180 |
($fields{subject} || '' ), |
181 |
'</title>', |
182 |
map( { qq[<meta name="$_" content="$fields{$_}">] } keys %fields ), |
183 |
'</head><body>', |
184 |
$body, |
185 |
'</body>', |
186 |
'</html>', |
187 |
''; |
188 |
} |
189 |
|