1 |
dpavlin |
20 |
#!/usr/bin/perl -w |
2 |
|
|
|
3 |
|
|
=head1 NAME |
4 |
|
|
|
5 |
|
|
cpanest - generate an Hyper Estraier index for CPAN |
6 |
|
|
|
7 |
|
|
=head1 SYNOPSIS |
8 |
|
|
|
9 |
|
|
B<cpanest> |
10 |
|
|
[B<-clean>] [B<-noclean>] |
11 |
|
|
[B<-cpan> I<url or directory>] |
12 |
|
|
[B<-node> I<node_uri>] |
13 |
|
|
[B<-force>] [B<-noforce>] |
14 |
|
|
[B<-keep> I<directory>] |
15 |
|
|
[B<-match> I<regexp>] |
16 |
|
|
[B<-test> I<level>] |
17 |
|
|
[B<-trust_mtime>] [B<-notrust_mtime>] |
18 |
|
|
|
19 |
|
|
=head1 DESCRIPTION |
20 |
|
|
|
21 |
|
|
This is a port of C<cpanwait> from L<WAIT> perl search engine to node API of |
22 |
|
|
Hyper Estraier. |
23 |
|
|
|
24 |
|
|
All the hard work was done by Ulrich Pfeifer who wrote all parsers and |
25 |
|
|
formatters. I just added support for Hyper Estraier back-end after. |
26 |
|
|
|
27 |
|
|
B<This documentation is somewhat incomplete and off-the-sync with code.> |
28 |
|
|
|
29 |
|
|
=head1 OPTIONS |
30 |
|
|
|
31 |
|
|
=over 5 |
32 |
|
|
|
33 |
|
|
=item B<-clean> / B<-noclean> |
34 |
|
|
|
35 |
|
|
Clean the table befor indexing. Default is B<off>. |
36 |
|
|
|
37 |
|
|
=item B<-cpan> I<url or directory> |
38 |
|
|
|
39 |
|
|
Default directory or URL for indexing. If an URL is given, there |
40 |
|
|
currently must be a file F<indices/find-ls.gz> relative to it which |
41 |
|
|
contains the output of C<find . -ls | gzip>. |
42 |
|
|
Default is F<ftp://ftp.rz.ruhr-uni-bochum.de/pub/CPAN>. |
43 |
|
|
|
44 |
|
|
|
45 |
|
|
=item B<-node> I<http://localhost:1978/node/cpan> |
46 |
|
|
|
47 |
|
|
Specify node URI |
48 |
|
|
|
49 |
|
|
=item B<-force> |
50 |
|
|
|
51 |
|
|
Force reindexing, even if B<cpan> thinks files are up to date. |
52 |
|
|
Default is B<off> |
53 |
|
|
|
54 |
|
|
=item B<-keep> I<directory> |
55 |
|
|
|
56 |
|
|
If fetching from a remote server, keep files in I<directory>. Default is |
57 |
|
|
F</app/unido-i06/src/share/lang/perl/96a/CPAN/sources>. |
58 |
|
|
|
59 |
|
|
=item B<-match> I<regexp> |
60 |
|
|
|
61 |
|
|
Limit to patches matching I<regexp>. Default is F<authors/id/>. |
62 |
|
|
|
63 |
|
|
=item B<-test> I<level> |
64 |
|
|
|
65 |
|
|
Set test level, were B<0> means normal operation, B<1> means, don't |
66 |
|
|
really index and B<2> means, don't even get archives and examine them. |
67 |
|
|
|
68 |
|
|
=item B<-trust_mtime> / B<-notrust_mtime> |
69 |
|
|
|
70 |
|
|
If B<on>, the files mtimes are used to decide, which version of an |
71 |
|
|
archive is the newest. If b<off>, the version extracted is used |
72 |
|
|
(beware, there are far more version numbering schemes than B<cpan> can |
73 |
|
|
parse). |
74 |
|
|
|
75 |
|
|
=back |
76 |
|
|
|
77 |
|
|
=head1 AUTHORS |
78 |
|
|
|
79 |
|
|
Ulrich Pfeifer E<lt>F<pfeifer@ls6.informatik.uni-dortumund.de>E<gt> |
80 |
|
|
|
81 |
|
|
Dobrica Pavlinusic E<lt>F<dpavlin@rot13.org>E<gt> |
82 |
|
|
|
83 |
|
|
=head1 COPYRIGHT |
84 |
|
|
|
85 |
|
|
Copyright (c) 1996-1997, Ulrich Pfeifer |
86 |
|
|
|
87 |
|
|
Copyright (c) 2005, Dobrica Pavlinusic |
88 |
|
|
|
89 |
|
|
=cut |
90 |
|
|
|
91 |
|
|
use strict; |
92 |
|
|
|
93 |
|
|
use File::Path; |
94 |
|
|
use DB_File; |
95 |
|
|
use Getopt::Long; |
96 |
|
|
use File::Find; |
97 |
|
|
use File::Basename; |
98 |
|
|
use IO::File; |
99 |
|
|
use IO::Zlib; |
100 |
|
|
|
101 |
|
|
use lib '/data/wait/lib'; |
102 |
|
|
|
103 |
|
|
use WAIT::Parse::Base; |
104 |
|
|
use WAIT::Parse::Pod; |
105 |
|
|
use WAIT::Document::Tar; |
106 |
|
|
|
107 |
|
|
sub fname($); |
108 |
|
|
|
109 |
|
|
# maximum number of archives to index (set to -1 for unlimited) |
110 |
|
|
my $max = -1; |
111 |
|
|
|
112 |
|
|
my %OPT = ( |
113 |
|
|
node => 'http://localhost:1978/node/cpan', |
114 |
|
|
clean => 0, |
115 |
|
|
remove => [], |
116 |
|
|
force => 0, |
117 |
|
|
# cpan => '/usr/src/perl/CPAN/sources', |
118 |
|
|
cpan => '/rest/cpan/CPAN/', |
119 |
|
|
trust_mtime => 1, |
120 |
|
|
match => 'authors/id/', |
121 |
|
|
test => 0, |
122 |
|
|
# cpan => 'ftp://ftp.rz.ruhr-uni-bochum.de/pub/CPAN', |
123 |
|
|
# cpan => 'ftp://ftp.uni-hamburg.de:/pub/soft/lang/perl/CPAN', |
124 |
|
|
keep => '/tmp/CPAN/', |
125 |
|
|
); |
126 |
|
|
|
127 |
|
|
GetOptions(\%OPT, |
128 |
|
|
'node=s', |
129 |
|
|
'cpan=s', |
130 |
|
|
'keep=s', |
131 |
|
|
'match=s', |
132 |
|
|
'clean!', |
133 |
|
|
'test=i', # test level 0: normal |
134 |
|
|
# 1: don't change db |
135 |
|
|
# 2: don't look at archives even |
136 |
|
|
|
137 |
|
|
'remove=s@', |
138 |
|
|
'force!', # force indexing even if seen |
139 |
|
|
'trust_mtime!', # use mtime instead of version number |
140 |
|
|
'max=i', |
141 |
|
|
'debug!', |
142 |
|
|
) || die "Usage: ...\n"; |
143 |
|
|
|
144 |
|
|
if ($OPT{max}) { |
145 |
|
|
$max = $OPT{max}; |
146 |
|
|
print STDERR "processing just first $max modules\n"; |
147 |
|
|
} |
148 |
|
|
|
149 |
|
|
# FIXME |
150 |
|
|
#clean_node( |
151 |
|
|
# node => $OPT{node}, |
152 |
|
|
# ) if $OPT{clean}; |
153 |
|
|
|
154 |
|
|
my $tb = new HyperEstraier::WAIT::Table( |
155 |
|
|
uri => $OPT{node}, |
156 |
|
|
attr => ['docid', 'headline', 'source', 'size', 'parent'], |
157 |
|
|
key => 'docid', |
158 |
|
|
invindex => [ qw/name synopsis bugs description text environment example author/ ], |
159 |
|
|
debug => $OPT{debug}, |
160 |
|
|
) or die "Could not open node '$OPT{node}'"; |
161 |
|
|
|
162 |
|
|
my $layout= new WAIT::Parse::Pod; |
163 |
|
|
|
164 |
|
|
# Map e.g. '.../latest' to 'perl'. Used in wanted(). Effects version |
165 |
|
|
# considerations. Value *must* match common prefix. Aliasing should be |
166 |
|
|
# used if CPAN contains serveral distributions with different name but |
167 |
|
|
# same root directory. |
168 |
|
|
# We still have a problem if there are different root directories! |
169 |
|
|
|
170 |
|
|
my %ALIAS = (# tar name real (root) name |
171 |
|
|
'Games-Scrabble' => 'Games', |
172 |
|
|
'HTML-ParseBrowser' => 'HTML', |
173 |
|
|
'iodbc_ext' => 'iodbc-ext-0.1', |
174 |
|
|
'sol-inst' => 'Solaris', |
175 |
|
|
'WebService-Validator-CSS-223C' => 'WebService-Validator-CSS-W3C-0.02', |
176 |
|
|
'MPEG-ID3212Tag' => 'MPEG-ID3v2Tag-0.36', |
177 |
|
|
'WebService-GoogleHack' => 'WebService', |
178 |
|
|
'Db-Mediasurface-ReadConfig' => 'ReadConfig', |
179 |
|
|
'Tie-Array-RestrictUpdates' => 'Tie', |
180 |
|
|
'HTML-Lister' => 'HTML', |
181 |
|
|
'Net-253950-AsyncZ' => 'Net-Z3950-AsyncZ-0.08', |
182 |
|
|
'ChildExit_0' => 'ChildExit-0.1', |
183 |
|
|
'Tie-TieConstant' => 'TieConstant.pm', |
184 |
|
|
'Crypt-OpenSSL-23509' => 'Crypt-OpenSSL-X509-0.2', |
185 |
|
|
'subclustv' => 'blib', |
186 |
|
|
'finance-yahooquote' => 'Finance-YahooQuote-0.20', |
187 |
|
|
'HPUX-FS' => 'FS', |
188 |
|
|
'Business-DE-Konto' => 'Business', |
189 |
|
|
'Digest-MD5-124p' => 'Digest-MD5-M4p-0.01', |
190 |
|
|
'AKDB_Okewo_de' => 'AKDB', |
191 |
|
|
'ExtUtils-0577' => 'ExtUtils-F77-1.14', |
192 |
|
|
'LispFmt' => 'Lisp::Fmt-0.00', |
193 |
|
|
'Acme-Stegano' => 'Acme', |
194 |
|
|
'Acme-RTB' => 'Acme', |
195 |
|
|
'WWW-Search-PRWire' => 'work', |
196 |
|
|
'Video-Capture-214l' => 'Video-Capture-V4l-0.224', |
197 |
|
|
'Tie-DirHandle' => 'Tie', |
198 |
|
|
'DB2' => 'DBD-DB2-0.71a', |
199 |
|
|
'Tie-Scalar-RestrictUpdates' => 'Tie', |
200 |
|
|
'Math-MVPoly' => 'MVPoly', |
201 |
|
|
'PlugIn' => 'PlugIn.pm', |
202 |
|
|
'Lingua-ID-Nums2Words' => 'Nums2Words-0.01', |
203 |
|
|
'chronos-1.' => 'Chronos', |
204 |
|
|
'jp_beta' => 'jperl_beta_r1', |
205 |
|
|
'Bundle-223C-Validator' => 'Bundle-W3C-Validator-0.6.5', |
206 |
|
|
'Text-199' => 'Text-T9-1.0', |
207 |
|
|
'Games-Literati' => 'Games', |
208 |
|
|
'VMS-IndexedFile' => 'VMS', |
209 |
|
|
'authen-rbac' => 'Authen', |
210 |
|
|
'Graphics-EPS' => 'EPS.pm', |
211 |
|
|
'new.spirit-2.' => 'new.spirit', |
212 |
|
|
'Tk-MListbox' => 'MListbox-1.11', |
213 |
|
|
'DBD-SQLrelay' => 'SQLRelay.pm', |
214 |
|
|
'Tie-RDBM-Cached' => 'RDBM', |
215 |
|
|
'PDL_IO_HDF' => 'HDF', |
216 |
|
|
'HPUX-LVM' => 'LVM', |
217 |
|
|
'Parse-Nibbler' => 'Parse', |
218 |
|
|
'Digest-Perl-MD4' => 'MD4', |
219 |
|
|
'Crypt-Imail' => 'Imail', |
220 |
|
|
'ubertext' => 'Text-UberText-0.95', |
221 |
|
|
'MP3-123U' => 'M3U', |
222 |
|
|
'Qmail-Control' => 'Qmail', |
223 |
|
|
'T-LXS' => 'Text-LevenshteinXS-0.02', |
224 |
|
|
'HTML-Paginator' => 'HTML', |
225 |
|
|
'swig' => 'SWIG1.1p5', |
226 |
|
|
'MIDI-Realtime' => 'MIDI', |
227 |
|
|
'sparky-public' => 'Sparky-Public-1.06', |
228 |
|
|
'Chemistry-MolecularMass' => 'Chemistry', |
229 |
|
|
'Net-253950-SimpleServer' => 'Net-Z3950-SimpleServer-0.08', |
230 |
|
|
'NewsClipper-OpenSource' => 'NewsClipper-1.32-OpenSource', |
231 |
|
|
'Win32API-Resources' => 'Resources.pm', |
232 |
|
|
'Unicode-Collate-Standard-2131_1' => 'Unicode-Collate-Standard-V3_1_1-0.1', |
233 |
|
|
'Net-026Term' => 'Net-C6Term-0.11', |
234 |
|
|
'BitArray1' => 'BitArray', |
235 |
|
|
'Audio-Radio-214L' => 'Audio-Radio-V4L-0.01', |
236 |
|
|
'Devel-AutoProfiler' => 'Devel', |
237 |
|
|
'Brasil-Checar-CGC' => 'Brasil', |
238 |
|
|
'AI-NeuralNet-SOM' => 'SOM.pm', |
239 |
|
|
'Net-BitTorrent-File-fix' => 'Net-BitTorrent-File-1.01', |
240 |
|
|
'VMS-FindFile' => 'VMS', |
241 |
|
|
'LoadHtml.' => 'README', |
242 |
|
|
'Time-Compare' => 'Time', |
243 |
|
|
'ShiftJIS-230213-MapUTF' => 'ShiftJIS-X0213-MapUTF-0.21', |
244 |
|
|
'Image-WMF' => 'Image', |
245 |
|
|
'sdf-2.0.eta' => 'sdf-2.001beta1', |
246 |
|
|
'Math-Expr-LATEST' => 'Math-Expr-0.4', |
247 |
|
|
'MP3-Player-PktConcert' => 'MP3', |
248 |
|
|
'Apache-OWA' => 'OWA', |
249 |
|
|
'Audio-Gramofile' => 'Audio', |
250 |
|
|
'DBIx-Copy' => 'Copy', |
251 |
|
|
'P4-024' => 'P4-C4-2.021', |
252 |
|
|
'Disassemble-2386' => 'Disassemble-X86-0.13', |
253 |
|
|
'Proc-Swarm' => 'Swarm-0.5', |
254 |
|
|
'Smil' => 'perlysmil', |
255 |
|
|
'Net-SSH-2232Perl' => 'Net-SSH-W32Perl-0.05', |
256 |
|
|
'Win32-SerialPort' => 'SerialPort-0.19', |
257 |
|
|
'Lingua-ID-Words2Nums' => 'Words2Nums-0.01', |
258 |
|
|
'Parse-Text' => 'Text', |
259 |
|
|
'DBIx-HTMLView-LATEST' => 'DBIx-HTMLView-0.9', |
260 |
|
|
'Apache-NNTPGateway' => 'NNTPGateway-0.9', |
261 |
|
|
'XPathToXML' => 'XPathToXML.pm', |
262 |
|
|
'XML-WMM-ASX' => 'XML', |
263 |
|
|
'CGISession' => 'CGI', |
264 |
|
|
'Net-SMS-142' => 'Net-SMS-O2-0.019', |
265 |
|
|
'Search-253950' => 'Search-Z3950-0.05', |
266 |
|
|
'Date-Christmas' => 'Christmas', |
267 |
|
|
'Win32-InternetExplorer-Window' => 'Win32', |
268 |
|
|
'Apache-WAP-MailPeek' => 'MailPeek', |
269 |
|
|
'Statistics-Table-F' => 'Statistics', |
270 |
|
|
'BerkeleyDB_Locks' => 'BerkeleyDB-Locks-0_2', |
271 |
|
|
'HookPrePostCall' => 'PrePostCall-1.2', |
272 |
|
|
'Oak-AAS-Service-DBI_13_PAM' => 'Oak-AAS-Service-DBI_N_PAM-1.8', |
273 |
|
|
'Math-Vector' => 'Vector.pm', |
274 |
|
|
'Audio-124pDecrypt' => 'Audio-M4pDecrypt-0.04', |
275 |
|
|
'libao-perl_0.03' => 'libao-perl-0.03', |
276 |
|
|
'CGI-EZForm' => 'EZForm', |
277 |
|
|
'Data-Locations-fixed' => 'Data-Locations-5.2-fixed', |
278 |
|
|
'HTML-Template-Filter-Dreamweaver' => 'Dreamweaver', |
279 |
|
|
'LineByLine' => 'LineByLine.pm', |
280 |
|
|
'Geo-0400' => 'Geo-E00-0.05', |
281 |
|
|
'WebService-Validator-HTML-223C' => 'WebService-Validator-HTML-W3C-0.03', |
282 |
|
|
'DateTime-Format-223CDTF' => 'DateTime-Format-W3CDTF-0.04', |
283 |
|
|
'DBD_SQLFLEX' => 'DBD-Sqlflex', |
284 |
|
|
'Text-Number' => 'Number', |
285 |
|
|
'DBIx-DataLookup' => 'DBIx', |
286 |
|
|
'MP3-ID3211Tag' => 'MP3-ID3v1Tag-1.11', |
287 |
|
|
'Text-Striphigh' => 'Striphigh-0.02', |
288 |
|
|
'Tie-SortHash' => 'SortHash', |
289 |
|
|
'Apache-AccessAbuse' => 'AccessAbuse', |
290 |
|
|
'MP3-123U-Parser' => 'MP3-M3U-Parser', |
291 |
|
|
'Net-253950' => 'Net-Z3950-0.44', |
292 |
|
|
'Net-RBLClient' => 'RBLCLient-0.2', |
293 |
|
|
'CGI-EasyCGI' => 'CGI', |
294 |
|
|
'http-handle' => 'HTTP::Handle', |
295 |
|
|
'JPEG-Comment' => 'JPEG', |
296 |
|
|
'router-lg' => 'Router', |
297 |
|
|
'Db-Mediasurface' => 'Mediasurface', |
298 |
|
|
'Text-BarGraph' => 'bargraph', |
299 |
|
|
'TL' => 'Text-Levenshtein-0.04', |
300 |
|
|
'Config-Vars' => 'Config-0.01', |
301 |
|
|
'Tie-PerfectHash' => 'Tie', |
302 |
|
|
'DNS-TinyDNS' => 'DNS', |
303 |
|
|
'DesignPattern-Factory' => 'Factory', |
304 |
|
|
'WWW-01_Rail' => 'WWW-B_Rail-0.01', |
305 |
|
|
'Win32-Exchange' => 'blib', |
306 |
|
|
'Math-RPN' => 'Math', |
307 |
|
|
'Db-Mediasurface-Cache' => 'Cache', |
308 |
|
|
'perl_archie.' => 'Archie.pm', |
309 |
|
|
'Acme-PGPSign' => 'Acme', |
310 |
|
|
'HTML-Widget-sideBar' => 'HTML-Widget-SideBar-1.00', |
311 |
|
|
'log' => 'Games', |
312 |
|
|
'File-List' => 'File', |
313 |
|
|
'Schedule-Cronchik' => 'Schedule', |
314 |
|
|
'Curses-Devkit' => 'Cdk', |
315 |
|
|
'Pod-PalmDoc' => 'Pod', |
316 |
|
|
'Easy-WML' => 'Easy WML 0.1', |
317 |
|
|
'Interval.' => 'Date', |
318 |
|
|
'Brasil-Checar-CPF' => 'Brasil', |
319 |
|
|
'Apache-WAP-AutoIndex' => 'AutoIndex', |
320 |
|
|
|
321 |
|
|
'SOM.pm' => 'SOM.pm', |
322 |
|
|
'PlugIn.pm' => 'PlugIn.pm', |
323 |
|
|
'XPathToXML.pm' => 'XPathToXML.pm', |
324 |
|
|
'Vector.pm' => 'Vector.pm', |
325 |
|
|
'LineByLine.pm' => 'LineByLine.pm', |
326 |
|
|
'Archie.pm' => 'Archie.pm', |
327 |
|
|
'TieConstant.pm' => 'TieConstant.pm', |
328 |
|
|
'EPS.pm' => 'EPS.pm', |
329 |
|
|
'SQLRelay.pm' => 'SQLRelay.pm', |
330 |
|
|
'Resources.pm' => 'Resources.pm', |
331 |
|
|
'README' => 'README', |
332 |
|
|
|
333 |
|
|
); |
334 |
|
|
my %NEW_ALIAS; # found in this pass |
335 |
|
|
|
336 |
|
|
# Map module names to pathes. Generated by wanted() doing alisaing. |
337 |
|
|
my %ARCHIVE; |
338 |
|
|
|
339 |
|
|
# Map module names to latest version. Generated by wanted() |
340 |
|
|
my %VERSION; |
341 |
|
|
|
342 |
|
|
|
343 |
|
|
# Mapping for modules with common root not matching modules name that |
344 |
|
|
# are not aliased. This is just for prefix stripping and not strictly |
345 |
|
|
# necessary. Takes effect after version considerations. |
346 |
|
|
my %TR = (# tar name root to strip |
347 |
|
|
'Net_SSLeay.pm' => 'SSLeay/', |
348 |
|
|
'EventDrivenServer' => 'Server/', |
349 |
|
|
'bio_lib.pl.' => '', |
350 |
|
|
'AlarmCall' => 'Sys/', |
351 |
|
|
'Cdk-ext' => 'Cdk/', |
352 |
|
|
'Sx' => '\d.\d/', |
353 |
|
|
'DumpStack' => 'Devel/', |
354 |
|
|
'StatisticsDescriptive'=> 'Statistics/', |
355 |
|
|
'Term-Gnuplot' => 'Gnuplot/', |
356 |
|
|
'iodbc_ext' => 'iodbc-ext-\d.\d/', |
357 |
|
|
'UNIVERSAL' => '', |
358 |
|
|
'Term-Query' => 'Query/', |
359 |
|
|
'SelfStubber' => 'Devel/', |
360 |
|
|
'CallerItem' => 'Devel/', |
361 |
|
|
); |
362 |
|
|
|
363 |
|
|
my $LWP; |
364 |
|
|
|
365 |
|
|
# FIXME |
366 |
|
|
my $DIR = '/rest/estseek/cpan/'; |
367 |
|
|
my $DATA = $DIR . '/data'; |
368 |
|
|
|
369 |
|
|
|
370 |
|
|
if (@{$OPT{remove}}) { |
371 |
|
|
my $pod; |
372 |
|
|
for $pod (@{$OPT{remove}}) { |
373 |
|
|
unless (-e $pod) { |
374 |
|
|
$pod = "$DIR/$pod"; |
375 |
|
|
} |
376 |
|
|
index_pod(file => $pod, remove => 1) if -f $pod; |
377 |
|
|
unlink $pod or warn "Could not unlink '$pod': $!\n"; |
378 |
|
|
} |
379 |
|
|
exit; |
380 |
|
|
} |
381 |
|
|
|
382 |
|
|
# Now get the beef |
383 |
|
|
if ($OPT{cpan} =~ /^(http|ftp):/) { |
384 |
|
|
$LWP = 1; |
385 |
|
|
require LWP::Simple; |
386 |
|
|
LWP::Simple->import(); |
387 |
|
|
|
388 |
|
|
mkpath($DATA,1,0755) or |
389 |
|
|
die "Could not generate '$DATA/': $!" |
390 |
|
|
unless -d $DATA; |
391 |
|
|
|
392 |
|
|
if (! -f "$DATA/find-ls.gz" or -M "$DATA/find-ls.gz" > 0.5) { |
393 |
|
|
my $status = mirror("$OPT{cpan}/indices/find-ls.gz", "$DATA/find-ls.gz"); |
394 |
|
|
if ($status != &RC_OK and $status != &RC_NOT_MODIFIED) { |
395 |
|
|
# we could use Net:FTP here ... |
396 |
|
|
die "Was unable to mirror '$OPT{cpan}/indices/find-ls.gz'\n"; |
397 |
|
|
} |
398 |
|
|
} |
399 |
|
|
my $fh = new IO::File "gzip -cd $DATA/find-ls.gz |"; |
400 |
|
|
die "Could not open 'gzip -cd $DATA/find-ls.gz': !$\n" unless $fh; |
401 |
|
|
|
402 |
|
|
my $line; |
403 |
|
|
while (defined ($line = <$fh>)) { |
404 |
|
|
chomp($line); |
405 |
|
|
my ($mon, $mday, $time, $file, $is_link) = (split ' ', $line)[7..11]; |
406 |
|
|
|
407 |
|
|
next if defined $is_link; |
408 |
|
|
my $mtime = mtime($mon, $mday, $time); |
409 |
|
|
|
410 |
|
|
$file =~ s:^\./::; |
411 |
|
|
($_) = fileparse($file); |
412 |
|
|
$File::Find::name = $file; |
413 |
|
|
wanted($mtime); |
414 |
|
|
} |
415 |
|
|
} else { |
416 |
|
|
find(sub {&wanted((stat($_))[9])}, $OPT{cpan}); |
417 |
|
|
} |
418 |
|
|
|
419 |
|
|
ARCHIVE: |
420 |
|
|
for my $tar (sort keys %ARCHIVE) { |
421 |
|
|
next if $OPT{match} and $ARCHIVE{$tar} !~ /$OPT{match}/o; |
422 |
|
|
my $base = (split /\//, $ARCHIVE{$tar})[-1]; |
423 |
|
|
my $parent; |
424 |
|
|
|
425 |
|
|
# logging |
426 |
|
|
if ($OPT{trust_mtime}) { |
427 |
|
|
printf "%-20s %10s %s\t", $tar, |
428 |
|
|
substr(scalar(localtime($VERSION{$tar})),0,10), $base; |
429 |
|
|
} else { |
430 |
|
|
printf "%-20s %10.5f %s\t", $tar, $VERSION{$tar}, $base; |
431 |
|
|
} |
432 |
|
|
|
433 |
|
|
# Remember the archive |
434 |
|
|
# We should have an extra table for the tar file data ... |
435 |
|
|
if (!$OPT{force} and $tb->have(docid => $base)) { |
436 |
|
|
print "skipping\n"; |
437 |
|
|
next ARCHIVE; |
438 |
|
|
} else { |
439 |
|
|
$parent = $tb->insert(docid => $base, |
440 |
|
|
headline => $ARCHIVE{$tar}) unless $OPT{test}; |
441 |
|
|
print "indexing\n"; |
442 |
|
|
} |
443 |
|
|
|
444 |
|
|
next ARCHIVE if $OPT{test} > 1; |
445 |
|
|
|
446 |
|
|
my $TAR = myget($tar); |
447 |
|
|
|
448 |
|
|
next ARCHIVE unless $TAR; # not able to fetch it |
449 |
|
|
|
450 |
|
|
my %tar; |
451 |
|
|
tie (%tar, |
452 |
|
|
'WAIT::Document::Tar', |
453 |
|
|
sub { $_[0] =~ /\.(pm|pod|PL)$/i or $_[0] =~ /readme/i}, |
454 |
|
|
#sub { $_[0] !~ m:/$: }, |
455 |
|
|
$TAR) |
456 |
|
|
or warn "Could not tie '$TAR'\n"; |
457 |
|
|
|
458 |
|
|
my $sloppy; |
459 |
|
|
my ($key, $val); |
460 |
|
|
|
461 |
|
|
FILE: |
462 |
|
|
while (($key, $val) = each %tar) { |
463 |
|
|
my $file = fname($key); |
464 |
|
|
|
465 |
|
|
# don't index directories |
466 |
|
|
next if $file =~ /\/$/; |
467 |
|
|
|
468 |
|
|
# is it a POD file? |
469 |
|
|
next FILE unless $file =~ /readme/i or $val =~ /\n=head/; |
470 |
|
|
|
471 |
|
|
# remove directory prefix |
472 |
|
|
unless ($sloppy # no common root |
473 |
|
|
or $file =~ s:^\Q$tar\E[^/]*/:: # common root, maybe alias |
474 |
|
|
or ($TR{$tar} # common root, not aliased |
475 |
|
|
and $file =~ s:^\Q$TR{$tar}\E::) |
476 |
|
|
) { |
477 |
|
|
# try to determine an alias |
478 |
|
|
warn "Bad directory prefix: '$file'\n"; |
479 |
|
|
my ($prefix) = split /\//, $file; |
480 |
|
|
|
481 |
|
|
while ($key = (tied %tar)->NEXTKEY) { |
482 |
|
|
my $file = fname($key); |
483 |
|
|
|
484 |
|
|
next if $file =~ /\/$/; |
485 |
|
|
unless ($file =~ m:^$prefix/: or $file eq $prefix) { |
486 |
|
|
warn "Archive contains different prefixes: $prefix,$file\n"; |
487 |
|
|
$prefix = ''; |
488 |
|
|
last; |
489 |
|
|
} |
490 |
|
|
} |
491 |
|
|
if ($prefix) { |
492 |
|
|
print "Please alias '$tar' to '$prefix' next time!\n"; |
493 |
|
|
print "See alias table later.\n"; |
494 |
|
|
$NEW_ALIAS{$tar} = $prefix; |
495 |
|
|
$tb->delete_by_key($parent); |
496 |
|
|
next ARCHIVE; |
497 |
|
|
} else { |
498 |
|
|
print "Assuming that tar file name $tar is a valid prefix\n"; |
499 |
|
|
$sloppy = 1; |
500 |
|
|
|
501 |
|
|
# We may reset too much here! But that this is not exact |
502 |
|
|
# science anyway. Maybe we should ignore using 'next ARCHIVE'. |
503 |
|
|
|
504 |
|
|
$key = (tied %tar)->FIRSTKEY; |
505 |
|
|
redo FILE; |
506 |
|
|
} |
507 |
|
|
} |
508 |
|
|
|
509 |
|
|
# remove /lib prefix |
510 |
|
|
$file =~ s:^lib/::; |
511 |
|
|
|
512 |
|
|
# generate new path |
513 |
|
|
my $path = "$DATA/$tar/$file"; |
514 |
|
|
|
515 |
|
|
my ($sbase, $sdir) = fileparse($path); |
516 |
|
|
my $fh; |
517 |
|
|
|
518 |
|
|
unless ($OPT{test}) { |
519 |
|
|
if (-f $path) { |
520 |
|
|
index_pod(file => $path, remove => 1); |
521 |
|
|
unlink $path or warn "Could not unlink '$path' $!\n"; |
522 |
|
|
} elsif (!-d $sdir) { |
523 |
|
|
mkpath($sdir,1,0755) or die "Could not mkpath($sdir): $!\n"; |
524 |
|
|
} |
525 |
|
|
# $fh = new IO::File "> $path"; |
526 |
|
|
$fh = new IO::Zlib "$path.gz","wb"; |
527 |
|
|
die "Could not write '$path': $!\n" unless $fh; |
528 |
|
|
} |
529 |
|
|
|
530 |
|
|
if ($file =~ /readme|install/i) { # make READMEs verbatim pods |
531 |
|
|
$val =~ s/\n/\n /g; |
532 |
|
|
$val = "=head1 NAME\n\n$tar $file\n\n=head1 DESCRIPTION\n\n $val" |
533 |
|
|
unless $val =~ /^=head/m; |
534 |
|
|
} else { # remove non-pod stuff |
535 |
|
|
my $nval = $val; $val = ''; |
536 |
|
|
my $cutting = 1; |
537 |
|
|
|
538 |
|
|
for (split /\n/, $nval) { |
539 |
|
|
if (/^=cut|!NO!SUBS!/) { |
540 |
|
|
$cutting = 1; |
541 |
|
|
} elsif ($cutting and /^=head/) { |
542 |
|
|
$cutting = 0; |
543 |
|
|
} |
544 |
|
|
unless ($cutting) { |
545 |
|
|
$val .= $_ . "\n"; |
546 |
|
|
} |
547 |
|
|
} |
548 |
|
|
} |
549 |
|
|
unless ($OPT{test}) { |
550 |
|
|
$fh->print($val); |
551 |
|
|
index_pod(file => $path, parent => $parent, |
552 |
|
|
text => $val, source => $ARCHIVE{$tar}); |
553 |
|
|
} |
554 |
|
|
} |
555 |
|
|
|
556 |
|
|
if ($LWP and !$OPT{keep}) { |
557 |
|
|
unlink $TAR or warn |
558 |
|
|
"Could not unlink '$TAR': $!\n"; |
559 |
|
|
} |
560 |
|
|
} |
561 |
|
|
|
562 |
|
|
if (%NEW_ALIAS) { |
563 |
|
|
print "\%ALIAS = (\n"; |
564 |
|
|
for (keys %NEW_ALIAS) { |
565 |
|
|
print "\t'$_'\t=> '$NEW_ALIAS{$_}',\n"; |
566 |
|
|
} |
567 |
|
|
print "\t);\n"; |
568 |
|
|
} |
569 |
|
|
|
570 |
|
|
exit; |
571 |
|
|
|
572 |
|
|
sub fname ($) { |
573 |
|
|
my $key = shift; |
574 |
|
|
my ($ntar, $file) = split $;, $key; |
575 |
|
|
|
576 |
|
|
# remove leading './' - shudder |
577 |
|
|
$file =~ s/^\.\///; |
578 |
|
|
|
579 |
|
|
return($file); |
580 |
|
|
} |
581 |
|
|
|
582 |
|
|
sub myget { |
583 |
|
|
my $tar = shift; |
584 |
|
|
my $TAR; |
585 |
|
|
|
586 |
|
|
if ($LWP) { # fetch the archive |
587 |
|
|
if ($OPT{keep}) { |
588 |
|
|
$TAR = "$OPT{keep}/$ARCHIVE{$tar}"; |
589 |
|
|
print "Keeping in '$TAR'\n" unless -e $TAR; |
590 |
|
|
my ($base, $path) = fileparse($TAR); |
591 |
|
|
unless (-d $path) { |
592 |
|
|
mkpath($path,1,0755) or |
593 |
|
|
die "Could not mkpath($path)\n"; |
594 |
|
|
} |
595 |
|
|
} else { |
596 |
|
|
$TAR = "/tmp/$tar.tar.gz"; |
597 |
|
|
} |
598 |
|
|
unless (-e $TAR) { # lwp mirror seems to fetch ftp: in any case? |
599 |
|
|
print "Fetching $OPT{cpan}/$ARCHIVE{$tar}\n"; |
600 |
|
|
my $status = mirror("$OPT{cpan}/$ARCHIVE{$tar}", $TAR); |
601 |
|
|
if ($status != &RC_OK and $status != &RC_NOT_MODIFIED) { |
602 |
|
|
warn "Was unable to mirror '$ARCHIVE{$tar}, skipping'\n"; |
603 |
|
|
return; |
604 |
|
|
} |
605 |
|
|
} |
606 |
|
|
} else { |
607 |
|
|
$TAR = $ARCHIVE{$tar}; |
608 |
|
|
} |
609 |
|
|
$TAR; |
610 |
|
|
} |
611 |
|
|
|
612 |
|
|
sub index_pod { |
613 |
|
|
my %parm = @_; |
614 |
|
|
my $did = $parm{file}; |
615 |
|
|
my $rel_did = $did; |
616 |
|
|
my $abs_did = $did; |
617 |
|
|
|
618 |
|
|
if ($rel_did =~ s:$DIR/::) { |
619 |
|
|
$abs_did = "$DIR/$rel_did"; |
620 |
|
|
} |
621 |
|
|
|
622 |
|
|
undef $did; |
623 |
|
|
|
624 |
|
|
# check for both variants |
625 |
|
|
if ($tb->have('docid' => $rel_did)) { |
626 |
|
|
$did = $rel_did; |
627 |
|
|
} elsif ($tb->have('docid' => $abs_did)) { |
628 |
|
|
$did = $abs_did; |
629 |
|
|
} |
630 |
|
|
if ($did) { # have it version |
631 |
|
|
if (!$parm{remove}) { |
632 |
|
|
warn "duplicate: $did\n"; |
633 |
|
|
return; |
634 |
|
|
} |
635 |
|
|
} else { # not seen yet |
636 |
|
|
$did = $rel_did; |
637 |
|
|
if ($parm{remove}) { |
638 |
|
|
print "missing: $did\n"; |
639 |
|
|
return; |
640 |
|
|
} |
641 |
|
|
} |
642 |
|
|
|
643 |
|
|
$parm{'text'} ||= WAIT::Document::Find->FETCH($abs_did); |
644 |
|
|
|
645 |
|
|
unless (defined $parm{'text'}) { |
646 |
|
|
print "unavailable: $did\n"; |
647 |
|
|
return; |
648 |
|
|
} |
649 |
|
|
|
650 |
|
|
my $record = $layout->split($parm{'text'}); |
651 |
|
|
|
652 |
|
|
if (! $record) { |
653 |
|
|
print "empty pod: $did\n"; |
654 |
|
|
return; |
655 |
|
|
} |
656 |
|
|
|
657 |
|
|
$record->{size} = length($parm{'text'}); |
658 |
|
|
my $headline = $record->{name} || $did; |
659 |
|
|
|
660 |
|
|
$headline =~ s/^$DATA//o; # $did |
661 |
|
|
$headline =~ s/\s+/ /g; $headline =~ s/^\s+//; |
662 |
|
|
|
663 |
|
|
printf "%s %s\n", ($parm{remove})?'-':'+', substr($headline,0,70); |
664 |
|
|
if ($parm{remove}) { |
665 |
|
|
$tb->delete('docid' => $did, |
666 |
|
|
headline => $headline, |
667 |
|
|
%{$record}); |
668 |
|
|
} else { |
669 |
|
|
$tb->insert('docid' => $did, |
670 |
|
|
headline => $headline, |
671 |
|
|
source => $parm{source}, |
672 |
|
|
parent => $parm{parent}, |
673 |
|
|
%{$record}); |
674 |
|
|
} |
675 |
|
|
} |
676 |
|
|
|
677 |
|
|
# This *must* remove the version in *any* case. It should compute a |
678 |
|
|
# resonable version number - but usually mtimes should be used. |
679 |
|
|
sub version { |
680 |
|
|
local ($_) = @_; |
681 |
|
|
|
682 |
|
|
# remove alpha/beta postfix |
683 |
|
|
s/([-_\d])(a|b|alpha|beta|src)$/$1/; |
684 |
|
|
|
685 |
|
|
# jperl1.3@4.019.tar.gz |
686 |
|
|
s/@\d.\d+//; |
687 |
|
|
|
688 |
|
|
# oraperl-v2.4-gk.tar.gz |
689 |
|
|
s/-v(\d)/$1/; |
690 |
|
|
|
691 |
|
|
# lettered versions - shudder |
692 |
|
|
s/([-_\d\.])([a-z])([\d\._])/sprintf "$1%02d$3", ord(lc $2) - ord('a') /ei; |
693 |
|
|
s/([-_\d\.])([a-z])$/sprintf "$1%02d", ord(lc $2) - ord('a') /ei; |
694 |
|
|
|
695 |
|
|
# thanks libwww-5b12 ;-) |
696 |
|
|
s/(\d+)b/($1-1).'.'/e; |
697 |
|
|
s/(\d+)a/($1-2).'.'/e; |
698 |
|
|
|
699 |
|
|
# replace '-pre' by '0.' |
700 |
|
|
s/-pre([\.\d])/-0.$1/; |
701 |
|
|
s/\.\././g; |
702 |
|
|
s/(\d)_(\d)/$1$2/g; |
703 |
|
|
|
704 |
|
|
# chop '[-.]' and thelike |
705 |
|
|
s/\W$//; |
706 |
|
|
|
707 |
|
|
# ram's versions Storable-0.4@p |
708 |
|
|
s/\@/./; |
709 |
|
|
|
710 |
|
|
if (s/[-_]?(\d+)\.(0\d+)\.(\d+)$//) { |
711 |
|
|
return($_, $1 + "0.$2" + $3 / 1000000); |
712 |
|
|
} elsif (s/[-_]?(\d+)\.(\d+)\.(\d+)$//) { |
713 |
|
|
return($_, $1 + $2/1000 + $3 / 1000000); |
714 |
|
|
} elsif (s/[-_]?(\d+\.[\d_]+)$//) { |
715 |
|
|
return($_, $1); |
716 |
|
|
} elsif (s/[-_]?([\d_]+)$//) { |
717 |
|
|
return($_, $1); |
718 |
|
|
} elsif (s/-(\d+.\d+)-/-/) { # perl-4.019-ref-guide |
719 |
|
|
return($_, $1); |
720 |
|
|
} else { |
721 |
|
|
if ($_ =~ /\d/) { # smells like an unknown scheme |
722 |
|
|
warn "Odd version Numbering: '$File::Find::name'\n"; |
723 |
|
|
return($_, undef); |
724 |
|
|
} else { # assume version 0 |
725 |
|
|
warn "No version Numbering: '$File::Find::name'\n"; |
726 |
|
|
return($_, 0); |
727 |
|
|
} |
728 |
|
|
|
729 |
|
|
} |
730 |
|
|
} |
731 |
|
|
|
732 |
|
|
sub wanted { |
733 |
|
|
my $mtime = shift; # called by parse_file_ls(); |
734 |
|
|
|
735 |
|
|
return if (! $max); |
736 |
|
|
$max--; |
737 |
|
|
|
738 |
|
|
return unless /^(.*)\.tar(\.gz|\.Z)$/; |
739 |
|
|
|
740 |
|
|
my ($archive, $version) = version($1); |
741 |
|
|
|
742 |
|
|
unless (defined $version) { |
743 |
|
|
warn "Skipping $1\n"; |
744 |
|
|
return; |
745 |
|
|
} |
746 |
|
|
|
747 |
|
|
# Check for file alias |
748 |
|
|
$archive = $ALIAS{$archive} if $ALIAS{$archive}; |
749 |
|
|
|
750 |
|
|
# Check for path alias. |
751 |
|
|
if ($File::Find::name =~ m(/CPAN/(?:source/)?(.*\Q$archive\E))) { |
752 |
|
|
if ($ALIAS{$1}) { |
753 |
|
|
$archive = $ALIAS{$1}; |
754 |
|
|
} |
755 |
|
|
} |
756 |
|
|
|
757 |
|
|
if ($OPT{trust_mtime}) { |
758 |
|
|
$version = $mtime; |
759 |
|
|
} else { |
760 |
|
|
$version =~ s/(\d)_/$1/; |
761 |
|
|
$version ||= $mtime; # mtime |
762 |
|
|
} |
763 |
|
|
|
764 |
|
|
if (!exists $ARCHIVE{$archive} |
765 |
|
|
or $VERSION{$archive} < $version) { |
766 |
|
|
$ARCHIVE{$archive} = $File::Find::name; |
767 |
|
|
$VERSION{$archive} = $version; |
768 |
|
|
} |
769 |
|
|
} |
770 |
|
|
|
771 |
|
|
sub create_table { |
772 |
|
|
my %parm = @_; |
773 |
|
|
|
774 |
|
|
my $access = bless {}, 'WAIT::Document::Find'; |
775 |
|
|
|
776 |
|
|
my $stem = [{ |
777 |
|
|
'prefix' => ['isotr', 'isolc'], |
778 |
|
|
'intervall' => ['isotr', 'isolc'], |
779 |
|
|
}, 'isotr', 'isolc', 'split2', 'stop', 'Stem']; |
780 |
|
|
my $text = [{ |
781 |
|
|
'prefix' => ['isotr', 'isolc'], |
782 |
|
|
'intervall' => ['isotr', 'isolc'], |
783 |
|
|
}, |
784 |
|
|
'isotr', 'isolc', 'split2', 'stop']; |
785 |
|
|
my $sound = ['isotr', 'isolc', 'split2', 'Soundex'],; |
786 |
|
|
|
787 |
|
|
my $tb = |
788 |
|
|
$parm{db}->create_table |
789 |
|
|
(name => $parm{table}, |
790 |
|
|
attr => ['docid', 'headline', 'source', 'size', 'parent'], |
791 |
|
|
keyset => [['docid']], |
792 |
|
|
layout => $parm{layout}, |
793 |
|
|
access => $access, |
794 |
|
|
invindex => |
795 |
|
|
[ |
796 |
|
|
'name' => $stem, |
797 |
|
|
'synopsis' => $stem, |
798 |
|
|
'bugs' => $stem, |
799 |
|
|
'description' => $stem, |
800 |
|
|
'text' => $stem, |
801 |
|
|
'environment' => $text, |
802 |
|
|
'example' => $text, 'example' => $stem, |
803 |
|
|
'author' => $sound, 'author' => $stem, |
804 |
|
|
] |
805 |
|
|
); |
806 |
|
|
die "Could not create table '$parm{table}'" unless $tb; |
807 |
|
|
$tb; |
808 |
|
|
} |
809 |
|
|
|
810 |
|
|
my %MON; |
811 |
|
|
my $YEAR; |
812 |
|
|
|
813 |
|
|
BEGIN { |
814 |
|
|
my $i = 1; |
815 |
|
|
for (qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec)) { |
816 |
|
|
$MON{$_} = $i++; |
817 |
|
|
} |
818 |
|
|
$YEAR = (localtime(time))[5]; |
819 |
|
|
} |
820 |
|
|
|
821 |
|
|
# We could/should use Date::GetDate here |
822 |
|
|
use Time::Local; |
823 |
|
|
sub mtime { |
824 |
|
|
my ($mon, $mday, $time) = @_; |
825 |
|
|
my ($hour, $min, $year, $monn) = (0,0); |
826 |
|
|
|
827 |
|
|
if ($time =~ /(\d+):(\d+)/) { |
828 |
|
|
($hour, $min) = ($1, $2); |
829 |
|
|
$year = $YEAR; |
830 |
|
|
} else { |
831 |
|
|
$year = $time; |
832 |
|
|
} |
833 |
|
|
$monn = $MON{$mon} || $MON{ucfirst lc $mon} || warn "Unknown month: '$mon'"; |
834 |
|
|
my $guess = timelocal(0,$min,$hour,$mday,$monn-1,$year); |
835 |
|
|
if ($guess > time) { |
836 |
|
|
$guess = timelocal(0,$min,$hour,$mday,$monn-1,$year-1); |
837 |
|
|
} |
838 |
|
|
$guess; |
839 |
|
|
} |
840 |
|
|
|
841 |
|
|
package HyperEstraier::WAIT::Table; |
842 |
|
|
|
843 |
|
|
use HyperEstraier; |
844 |
|
|
use Text::Iconv; |
845 |
|
|
|
846 |
|
|
=head1 NAME |
847 |
|
|
|
848 |
|
|
HyperEstraier::WAIT::Table |
849 |
|
|
|
850 |
|
|
=head1 DESCRIPTION |
851 |
|
|
|
852 |
|
|
This is a mode that emulates C<WAIT::Table> functionality somewhat. |
853 |
|
|
|
854 |
|
|
There are some limitations and only one key attribute is supported (and used |
855 |
|
|
for C<@uri>). |
856 |
|
|
|
857 |
|
|
=head2 Porting from WAIT to this module. |
858 |
|
|
|
859 |
|
|
Since only one key is supported (and used as C<@uri> attribute), |
860 |
|
|
use first parametar of C<keyset> as C<key>. |
861 |
|
|
|
862 |
|
|
Full text index is specified as C<invindex>, but you need just name of fields. |
863 |
|
|
|
864 |
|
|
You will probably need to add |
865 |
|
|
|
866 |
|
|
use WAIT::Parse::Base; |
867 |
|
|
|
868 |
|
|
to your code after you remove C<WAIT::Config> and C<WAIT::Database>. |
869 |
|
|
|
870 |
|
|
=head1 METHODS |
871 |
|
|
|
872 |
|
|
=head2 new |
873 |
|
|
|
874 |
|
|
my $tb = new HyperEstraier::WAIT::Table( |
875 |
|
|
uri => 'http://localhost:1978/node/cpan', |
876 |
|
|
attr => qw/docid headline source size parent/, |
877 |
|
|
key => 'docid', |
878 |
|
|
invindex => qw/name synopsis bugs description text environment example author/, |
879 |
|
|
); |
880 |
|
|
|
881 |
|
|
=cut |
882 |
|
|
|
883 |
|
|
sub new { |
884 |
|
|
my $class = shift; |
885 |
|
|
my $self = {@_}; |
886 |
|
|
bless($self, $class); |
887 |
|
|
|
888 |
|
|
foreach my $p (qw/uri attr key invindex/) { |
889 |
|
|
die "need $p" unless ($self->{$p}); |
890 |
|
|
} |
891 |
|
|
|
892 |
|
|
$self->{'iso2utf'} = Text::Iconv->new('ISO-8859-1','UTF-8'); |
893 |
|
|
|
894 |
|
|
my $node = HyperEstraier::Node->new($self->{'uri'}); |
895 |
|
|
$node->set_auth('admin', 'admin'); |
896 |
|
|
|
897 |
|
|
$self->{'node'} = $node; |
898 |
|
|
|
899 |
|
|
$self ? return $self : return undef; |
900 |
|
|
} |
901 |
|
|
|
902 |
|
|
=head2 have |
903 |
|
|
|
904 |
|
|
if ( $tb->have(docid => $something) ) ... |
905 |
|
|
|
906 |
|
|
=cut |
907 |
|
|
|
908 |
|
|
sub have { |
909 |
|
|
my $self = shift; |
910 |
|
|
my $args = {@_}; |
911 |
|
|
my $key = $self->{'key'} || die "no key in object"; |
912 |
|
|
my $key_v = $args->{$key} || die "no key $key in data"; |
913 |
|
|
|
914 |
|
|
my $id = $self->{'node'}->uri_to_id('file://' . $key_v); |
915 |
|
|
|
916 |
|
|
return ($id == -1 ? undef : $id); |
917 |
|
|
} |
918 |
|
|
|
919 |
|
|
=head2 insert |
920 |
|
|
|
921 |
|
|
my $key = $tb->insert( |
922 |
|
|
docid => $base, |
923 |
|
|
headline => 'Something', |
924 |
|
|
... |
925 |
|
|
); |
926 |
|
|
|
927 |
|
|
=cut |
928 |
|
|
|
929 |
|
|
sub insert { |
930 |
|
|
my $self = shift; |
931 |
|
|
my $args = {@_}; |
932 |
|
|
|
933 |
|
|
my $uri = 'file://'; |
934 |
|
|
$uri .= $args->{'docid'} or die "no docid"; |
935 |
|
|
|
936 |
|
|
my $doc = HyperEstraier::Document->new; |
937 |
|
|
|
938 |
|
|
$doc->add_attr('@uri', $uri); |
939 |
|
|
$doc->add_attr('@title', $args->{'headline'}) if ($args->{'headline'}); |
940 |
|
|
$doc->add_attr('@size', $args->{'size'}) if ($args->{'size'}); |
941 |
|
|
|
942 |
|
|
my @attr = $self->{'attr'} || die "no attr in object"; |
943 |
|
|
my @invindex = $self->{'invindex'} || die "no invindex in object"; |
944 |
|
|
|
945 |
|
|
foreach my $attr (keys %{$args}) { |
946 |
|
|
if (grep(/^$attr$/, @{ $self->{'attr'} })) { |
947 |
|
|
$doc->add_attr($attr, $args->{$attr}); |
948 |
|
|
} |
949 |
|
|
if (grep(/^$attr$/, @{ $self->{'invindex'} })) { |
950 |
|
|
$doc->add_text($args->{$attr}); |
951 |
|
|
} |
952 |
|
|
} |
953 |
|
|
|
954 |
|
|
print STDERR $doc->dump_draft if ($self->{'debug'}); |
955 |
|
|
|
956 |
|
|
my $id; |
957 |
|
|
unless ($id = $self->{'node'}->put_doc($doc)) { |
958 |
|
|
printf STDERR "ERROR: %d\n", $self->{'node'}->status; |
959 |
|
|
#} else { |
960 |
|
|
# print STDERR "id: $id\n"; |
961 |
|
|
} |
962 |
|
|
|
963 |
|
|
return $id; |
964 |
|
|
} |
965 |
|
|
|
966 |
|
|
=head2 delete_by_key |
967 |
|
|
|
968 |
|
|
$tb->delete_by_key($key); |
969 |
|
|
|
970 |
|
|
=cut |
971 |
|
|
|
972 |
|
|
sub delete_by_key { |
973 |
|
|
my $self = shift; |
974 |
|
|
my $key_v = shift || die "no key?"; |
975 |
|
|
|
976 |
dpavlin |
21 |
my $id = $self->{'node'}->uri_to_id( 'file://' . $key_v ); |
977 |
|
|
|
978 |
|
|
if ($id == -1) { |
979 |
|
|
print STDERR "WARNING: can't delete document uri file://$key_v\n"; |
980 |
|
|
return; |
981 |
|
|
} |
982 |
|
|
|
983 |
|
|
unless ($self->{'node'}->out_doc($id)) { |
984 |
|
|
print STDERR "WARNING: can't out_doc($id)\n"; |
985 |
|
|
} |
986 |
dpavlin |
20 |
} |
987 |
|
|
|
988 |
|
|
=head2 delete |
989 |
|
|
|
990 |
|
|
$tb->delete( docid => $did, ... ); |
991 |
|
|
|
992 |
|
|
=cut |
993 |
|
|
|
994 |
|
|
sub delete { |
995 |
|
|
my $self = shift; |
996 |
|
|
my $args = {@_}; |
997 |
|
|
|
998 |
|
|
my $key = $self->{'key'} || die "no key in object"; |
999 |
|
|
|
1000 |
|
|
die "no $key in data" unless (my $key_v = $args->{$key}); |
1001 |
|
|
|
1002 |
|
|
$self->delete_by_key($key_v); |
1003 |
|
|
|
1004 |
|
|
} |