/[hyperestraier_wrappers]/trunk/perl/scripts/cpanest
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/perl/scripts/cpanest

Parent Directory Parent Directory | Revision Log Revision Log


Revision 23 - (show annotations)
Fri Sep 16 23:29:27 2005 UTC (18 years, 6 months ago) by dpavlin
File size: 28171 byte(s)
add mdate to archives and set all files within archive mdate to archive date.
1 #!/usr/bin/perl -w
2
3 =head1 NAME
4
5 cpanest - generate an Hyper Estraier index for CPAN
6
7 =head1 SYNOPSIS
8
9 B<cpanest>
10 [B<-clean>] [B<-noclean>]
11 [B<-cpan> I<url or directory>]
12 [B<-node> I<node_uri>]
13 [B<-force>] [B<-noforce>]
14 [B<-keep> I<directory>]
15 [B<-match> I<regexp>]
16 [B<-test> I<level>]
17 [B<-trust_mtime>] [B<-notrust_mtime>]
18
19 =head1 DESCRIPTION
20
21 This is a port of C<cpanwait> from L<WAIT> perl search engine to node API of
22 Hyper Estraier.
23
24 All the hard work was done by Ulrich Pfeifer who wrote all parsers and
25 formatters. I just added support for Hyper Estraier back-end after.
26
27 B<This documentation is somewhat incomplete and off-the-sync with code.>
28
29 =head1 OPTIONS
30
31 =over 5
32
33 =item B<-clean> / B<-noclean>
34
35 Clean the table befor indexing. Default is B<off>.
36
37 =item B<-cpan> I<url or directory>
38
39 Default directory or URL for indexing. If an URL is given, there
40 currently must be a file F<indices/find-ls.gz> relative to it which
41 contains the output of C<find . -ls | gzip>.
42 Default is F<ftp://ftp.rz.ruhr-uni-bochum.de/pub/CPAN>.
43
44
45 =item B<-node> I<http://localhost:1978/node/cpan>
46
47 Specify node URI
48
49 =item B<-force>
50
51 Force reindexing, even if B<cpan> thinks files are up to date.
52 Default is B<off>
53
54 =item B<-keep> I<directory>
55
56 If fetching from a remote server, keep files in I<directory>. Default is
57 F</app/unido-i06/src/share/lang/perl/96a/CPAN/sources>.
58
59 =item B<-match> I<regexp>
60
61 Limit to patches matching I<regexp>. Default is F<authors/id/>.
62
63 =item B<-test> I<level>
64
65 Set test level, were B<0> means normal operation, B<1> means, don't
66 really index and B<2> means, don't even get archives and examine them.
67
68 =item B<-trust_mtime> / B<-notrust_mtime>
69
70 If B<on>, the files mtimes are used to decide, which version of an
71 archive is the newest. If b<off>, the version extracted is used
72 (beware, there are far more version numbering schemes than B<cpan> can
73 parse).
74
75 =back
76
77 =head1 AUTHORS
78
79 Ulrich Pfeifer E<lt>F<pfeifer@ls6.informatik.uni-dortumund.de>E<gt>
80
81 Dobrica Pavlinusic E<lt>F<dpavlin@rot13.org>E<gt>
82
83 =head1 COPYRIGHT
84
85 Copyright (c) 1996-1997, Ulrich Pfeifer
86
87 Copyright (c) 2005, Dobrica Pavlinusic
88
89 =cut
90
91 use strict;
92
93 use File::Path;
94 use DB_File;
95 use Getopt::Long;
96 use File::Find;
97 use File::Basename;
98 use IO::File;
99 use IO::Zlib;
100 use POSIX qw/strftime/;
101
102 use lib '/data/wait/lib';
103
104 use WAIT::Parse::Base;
105 use WAIT::Parse::Pod;
106 use WAIT::Document::Tar;
107
108 sub fname($);
109
110 # maximum number of archives to index (set to -1 for unlimited)
111 my $max = -1;
112
113 my %OPT = (
114 node => 'http://localhost:1978/node/cpan',
115 clean => 0,
116 remove => [],
117 force => 0,
118 # cpan => '/usr/src/perl/CPAN/sources',
119 cpan => '/rest/cpan/CPAN/',
120 trust_mtime => 1,
121 match => 'authors/id/',
122 test => 0,
123 # cpan => 'ftp://ftp.rz.ruhr-uni-bochum.de/pub/CPAN',
124 # cpan => 'ftp://ftp.uni-hamburg.de:/pub/soft/lang/perl/CPAN',
125 keep => '/tmp/CPAN/',
126 );
127
128 GetOptions(\%OPT,
129 'node=s',
130 'cpan=s',
131 'keep=s',
132 'match=s',
133 'clean!',
134 'test=i', # test level 0: normal
135 # 1: don't change db
136 # 2: don't look at archives even
137
138 'remove=s@',
139 'force!', # force indexing even if seen
140 'trust_mtime!', # use mtime instead of version number
141 'max=i',
142 'debug!',
143 ) || die "Usage: ...\n";
144
145 if ($OPT{max}) {
146 $max = $OPT{max};
147 print STDERR "processing just first $max modules\n";
148 }
149
150 # FIXME
151 #clean_node(
152 # node => $OPT{node},
153 # ) if $OPT{clean};
154
155 my $tb = new HyperEstraier::WAIT::Table(
156 uri => $OPT{node},
157 attr => ['docid', 'headline', 'source', 'size', 'parent'],
158 key => 'docid',
159 invindex => [ qw/name synopsis bugs description text environment example author/ ],
160 debug => $OPT{debug},
161 ) or die "Could not open node '$OPT{node}'";
162
163 my $layout= new WAIT::Parse::Pod;
164
165 # Map e.g. '.../latest' to 'perl'. Used in wanted(). Effects version
166 # considerations. Value *must* match common prefix. Aliasing should be
167 # used if CPAN contains serveral distributions with different name but
168 # same root directory.
169 # We still have a problem if there are different root directories!
170
171 my %ALIAS = (# tar name real (root) name
172 'Games-Scrabble' => 'Games',
173 'HTML-ParseBrowser' => 'HTML',
174 'iodbc_ext' => 'iodbc-ext-0.1',
175 'sol-inst' => 'Solaris',
176 'WebService-Validator-CSS-223C' => 'WebService-Validator-CSS-W3C-0.02',
177 'MPEG-ID3212Tag' => 'MPEG-ID3v2Tag-0.36',
178 'WebService-GoogleHack' => 'WebService',
179 'Db-Mediasurface-ReadConfig' => 'ReadConfig',
180 'Tie-Array-RestrictUpdates' => 'Tie',
181 'HTML-Lister' => 'HTML',
182 'Net-253950-AsyncZ' => 'Net-Z3950-AsyncZ-0.08',
183 'ChildExit_0' => 'ChildExit-0.1',
184 'Tie-TieConstant' => 'TieConstant.pm',
185 'Crypt-OpenSSL-23509' => 'Crypt-OpenSSL-X509-0.2',
186 'subclustv' => 'blib',
187 'finance-yahooquote' => 'Finance-YahooQuote-0.20',
188 'HPUX-FS' => 'FS',
189 'Business-DE-Konto' => 'Business',
190 'Digest-MD5-124p' => 'Digest-MD5-M4p-0.01',
191 'AKDB_Okewo_de' => 'AKDB',
192 'ExtUtils-0577' => 'ExtUtils-F77-1.14',
193 'LispFmt' => 'Lisp::Fmt-0.00',
194 'Acme-Stegano' => 'Acme',
195 'Acme-RTB' => 'Acme',
196 'WWW-Search-PRWire' => 'work',
197 'Video-Capture-214l' => 'Video-Capture-V4l-0.224',
198 'Tie-DirHandle' => 'Tie',
199 'DB2' => 'DBD-DB2-0.71a',
200 'Tie-Scalar-RestrictUpdates' => 'Tie',
201 'Math-MVPoly' => 'MVPoly',
202 'PlugIn' => 'PlugIn.pm',
203 'Lingua-ID-Nums2Words' => 'Nums2Words-0.01',
204 'chronos-1.' => 'Chronos',
205 'jp_beta' => 'jperl_beta_r1',
206 'Bundle-223C-Validator' => 'Bundle-W3C-Validator-0.6.5',
207 'Text-199' => 'Text-T9-1.0',
208 'Games-Literati' => 'Games',
209 'VMS-IndexedFile' => 'VMS',
210 'authen-rbac' => 'Authen',
211 'Graphics-EPS' => 'EPS.pm',
212 'new.spirit-2.' => 'new.spirit',
213 'Tk-MListbox' => 'MListbox-1.11',
214 'DBD-SQLrelay' => 'SQLRelay.pm',
215 'Tie-RDBM-Cached' => 'RDBM',
216 'PDL_IO_HDF' => 'HDF',
217 'HPUX-LVM' => 'LVM',
218 'Parse-Nibbler' => 'Parse',
219 'Digest-Perl-MD4' => 'MD4',
220 'Crypt-Imail' => 'Imail',
221 'ubertext' => 'Text-UberText-0.95',
222 'MP3-123U' => 'M3U',
223 'Qmail-Control' => 'Qmail',
224 'T-LXS' => 'Text-LevenshteinXS-0.02',
225 'HTML-Paginator' => 'HTML',
226 'swig' => 'SWIG1.1p5',
227 'MIDI-Realtime' => 'MIDI',
228 'sparky-public' => 'Sparky-Public-1.06',
229 'Chemistry-MolecularMass' => 'Chemistry',
230 'Net-253950-SimpleServer' => 'Net-Z3950-SimpleServer-0.08',
231 'NewsClipper-OpenSource' => 'NewsClipper-1.32-OpenSource',
232 'Win32API-Resources' => 'Resources.pm',
233 'Unicode-Collate-Standard-2131_1' => 'Unicode-Collate-Standard-V3_1_1-0.1',
234 'Net-026Term' => 'Net-C6Term-0.11',
235 'BitArray1' => 'BitArray',
236 'Audio-Radio-214L' => 'Audio-Radio-V4L-0.01',
237 'Devel-AutoProfiler' => 'Devel',
238 'Brasil-Checar-CGC' => 'Brasil',
239 'AI-NeuralNet-SOM' => 'SOM.pm',
240 'Net-BitTorrent-File-fix' => 'Net-BitTorrent-File-1.01',
241 'VMS-FindFile' => 'VMS',
242 'LoadHtml.' => 'README',
243 'Time-Compare' => 'Time',
244 'ShiftJIS-230213-MapUTF' => 'ShiftJIS-X0213-MapUTF-0.21',
245 'Image-WMF' => 'Image',
246 'sdf-2.0.eta' => 'sdf-2.001beta1',
247 'Math-Expr-LATEST' => 'Math-Expr-0.4',
248 'MP3-Player-PktConcert' => 'MP3',
249 'Apache-OWA' => 'OWA',
250 'Audio-Gramofile' => 'Audio',
251 'DBIx-Copy' => 'Copy',
252 'P4-024' => 'P4-C4-2.021',
253 'Disassemble-2386' => 'Disassemble-X86-0.13',
254 'Proc-Swarm' => 'Swarm-0.5',
255 'Smil' => 'perlysmil',
256 'Net-SSH-2232Perl' => 'Net-SSH-W32Perl-0.05',
257 'Win32-SerialPort' => 'SerialPort-0.19',
258 'Lingua-ID-Words2Nums' => 'Words2Nums-0.01',
259 'Parse-Text' => 'Text',
260 'DBIx-HTMLView-LATEST' => 'DBIx-HTMLView-0.9',
261 'Apache-NNTPGateway' => 'NNTPGateway-0.9',
262 'XPathToXML' => 'XPathToXML.pm',
263 'XML-WMM-ASX' => 'XML',
264 'CGISession' => 'CGI',
265 'Net-SMS-142' => 'Net-SMS-O2-0.019',
266 'Search-253950' => 'Search-Z3950-0.05',
267 'Date-Christmas' => 'Christmas',
268 'Win32-InternetExplorer-Window' => 'Win32',
269 'Apache-WAP-MailPeek' => 'MailPeek',
270 'Statistics-Table-F' => 'Statistics',
271 'BerkeleyDB_Locks' => 'BerkeleyDB-Locks-0_2',
272 'HookPrePostCall' => 'PrePostCall-1.2',
273 'Oak-AAS-Service-DBI_13_PAM' => 'Oak-AAS-Service-DBI_N_PAM-1.8',
274 'Math-Vector' => 'Vector.pm',
275 'Audio-124pDecrypt' => 'Audio-M4pDecrypt-0.04',
276 'libao-perl_0.03' => 'libao-perl-0.03',
277 'CGI-EZForm' => 'EZForm',
278 'Data-Locations-fixed' => 'Data-Locations-5.2-fixed',
279 'HTML-Template-Filter-Dreamweaver' => 'Dreamweaver',
280 'LineByLine' => 'LineByLine.pm',
281 'Geo-0400' => 'Geo-E00-0.05',
282 'WebService-Validator-HTML-223C' => 'WebService-Validator-HTML-W3C-0.03',
283 'DateTime-Format-223CDTF' => 'DateTime-Format-W3CDTF-0.04',
284 'DBD_SQLFLEX' => 'DBD-Sqlflex',
285 'Text-Number' => 'Number',
286 'DBIx-DataLookup' => 'DBIx',
287 'MP3-ID3211Tag' => 'MP3-ID3v1Tag-1.11',
288 'Text-Striphigh' => 'Striphigh-0.02',
289 'Tie-SortHash' => 'SortHash',
290 'Apache-AccessAbuse' => 'AccessAbuse',
291 'MP3-123U-Parser' => 'MP3-M3U-Parser',
292 'Net-253950' => 'Net-Z3950-0.44',
293 'Net-RBLClient' => 'RBLCLient-0.2',
294 'CGI-EasyCGI' => 'CGI',
295 'http-handle' => 'HTTP::Handle',
296 'JPEG-Comment' => 'JPEG',
297 'router-lg' => 'Router',
298 'Db-Mediasurface' => 'Mediasurface',
299 'Text-BarGraph' => 'bargraph',
300 'TL' => 'Text-Levenshtein-0.04',
301 'Config-Vars' => 'Config-0.01',
302 'Tie-PerfectHash' => 'Tie',
303 'DNS-TinyDNS' => 'DNS',
304 'DesignPattern-Factory' => 'Factory',
305 'WWW-01_Rail' => 'WWW-B_Rail-0.01',
306 'Win32-Exchange' => 'blib',
307 'Math-RPN' => 'Math',
308 'Db-Mediasurface-Cache' => 'Cache',
309 'perl_archie.' => 'Archie.pm',
310 'Acme-PGPSign' => 'Acme',
311 'HTML-Widget-sideBar' => 'HTML-Widget-SideBar-1.00',
312 'log' => 'Games',
313 'File-List' => 'File',
314 'Schedule-Cronchik' => 'Schedule',
315 'Curses-Devkit' => 'Cdk',
316 'Pod-PalmDoc' => 'Pod',
317 'Easy-WML' => 'Easy WML 0.1',
318 'Interval.' => 'Date',
319 'Brasil-Checar-CPF' => 'Brasil',
320 'Apache-WAP-AutoIndex' => 'AutoIndex',
321
322 'SOM.pm' => 'SOM.pm',
323 'PlugIn.pm' => 'PlugIn.pm',
324 'XPathToXML.pm' => 'XPathToXML.pm',
325 'Vector.pm' => 'Vector.pm',
326 'LineByLine.pm' => 'LineByLine.pm',
327 'Archie.pm' => 'Archie.pm',
328 'TieConstant.pm' => 'TieConstant.pm',
329 'EPS.pm' => 'EPS.pm',
330 'SQLRelay.pm' => 'SQLRelay.pm',
331 'Resources.pm' => 'Resources.pm',
332 'README' => 'README',
333
334 );
335 my %NEW_ALIAS; # found in this pass
336
337 # Map module names to pathes. Generated by wanted() doing alisaing.
338 my %ARCHIVE;
339
340 # Map module names to latest version. Generated by wanted()
341 my %VERSION;
342
343
344 # Mapping for modules with common root not matching modules name that
345 # are not aliased. This is just for prefix stripping and not strictly
346 # necessary. Takes effect after version considerations.
347 my %TR = (# tar name root to strip
348 'Net_SSLeay.pm' => 'SSLeay/',
349 'EventDrivenServer' => 'Server/',
350 'bio_lib.pl.' => '',
351 'AlarmCall' => 'Sys/',
352 'Cdk-ext' => 'Cdk/',
353 'Sx' => '\d.\d/',
354 'DumpStack' => 'Devel/',
355 'StatisticsDescriptive'=> 'Statistics/',
356 'Term-Gnuplot' => 'Gnuplot/',
357 'iodbc_ext' => 'iodbc-ext-\d.\d/',
358 'UNIVERSAL' => '',
359 'Term-Query' => 'Query/',
360 'SelfStubber' => 'Devel/',
361 'CallerItem' => 'Devel/',
362 );
363
364 my $LWP;
365
366 # FIXME
367 my $DIR = '/rest/estseek/cpan/';
368 my $DATA = $DIR . '/data';
369
370
371 if (@{$OPT{remove}}) {
372 my $pod;
373 for $pod (@{$OPT{remove}}) {
374 unless (-e $pod) {
375 $pod = "$DIR/$pod";
376 }
377 index_pod(file => $pod, remove => 1) if -f $pod;
378 unlink $pod or warn "Could not unlink '$pod': $!\n";
379 }
380 exit;
381 }
382
383 # Now get the beef
384 if ($OPT{cpan} =~ /^(http|ftp):/) {
385 $LWP = 1;
386 require LWP::Simple;
387 LWP::Simple->import();
388
389 mkpath($DATA,1,0755) or
390 die "Could not generate '$DATA/': $!"
391 unless -d $DATA;
392
393 if (! -f "$DATA/find-ls.gz" or -M "$DATA/find-ls.gz" > 0.5) {
394 my $status = mirror("$OPT{cpan}/indices/find-ls.gz", "$DATA/find-ls.gz");
395 if ($status != &RC_OK and $status != &RC_NOT_MODIFIED) {
396 # we could use Net:FTP here ...
397 die "Was unable to mirror '$OPT{cpan}/indices/find-ls.gz'\n";
398 }
399 }
400 my $fh = new IO::File "gzip -cd $DATA/find-ls.gz |";
401 die "Could not open 'gzip -cd $DATA/find-ls.gz': !$\n" unless $fh;
402
403 my $line;
404 while (defined ($line = <$fh>)) {
405 chomp($line);
406 my ($mon, $mday, $time, $file, $is_link) = (split ' ', $line)[7..11];
407
408 next if defined $is_link;
409 my $mtime = mtime($mon, $mday, $time);
410
411 $file =~ s:^\./::;
412 ($_) = fileparse($file);
413 $File::Find::name = $file;
414 wanted($mtime);
415 }
416 } else {
417 find(sub {&wanted((stat($_))[9])}, $OPT{cpan});
418 }
419
420 ARCHIVE:
421 for my $tar (sort keys %ARCHIVE) {
422 next if $OPT{match} and $ARCHIVE{$tar} !~ /$OPT{match}/o;
423 my $base = (split /\//, $ARCHIVE{$tar})[-1];
424 my $parent;
425 my $parent_mdate;
426
427 my %attr;
428
429 # logging
430 if ($OPT{trust_mtime}) {
431 $attr{'@mdate'} = strftime('%Y-%m-%dT%H:%M:%S+00:00', gmtime($VERSION{$tar}));
432 $parent_mdate = $attr{'@mdate'};
433 printf "%-20s %10s %s\t", $tar, $attr{'@mdate'}, $base;
434 } else {
435 $attr{'version'} = $VERSION{$tar};
436 printf "%-20s %10.5f %s\t", $tar, $attr{'version'}, $base;
437 }
438
439 # Remember the archive
440 # We should have an extra table for the tar file data ...
441 if (!$OPT{force} and $tb->have(docid => $base)) {
442 print "skipping\n";
443 next ARCHIVE;
444 } else {
445 $parent = $tb->insert(docid => $base,
446 headline => $ARCHIVE{$tar},
447 %attr
448 ) unless $OPT{test};
449 print "indexing\n";
450 }
451
452 next ARCHIVE if $OPT{test} > 1;
453
454 my $TAR = myget($tar);
455
456 next ARCHIVE unless $TAR; # not able to fetch it
457
458 my %tar;
459 tie (%tar,
460 'WAIT::Document::Tar',
461 sub { $_[0] =~ /\.(pm|pod|PL)$/i or $_[0] =~ /readme/i},
462 #sub { $_[0] !~ m:/$: },
463 $TAR)
464 or warn "Could not tie '$TAR'\n";
465
466 my $sloppy;
467 my ($key, $val);
468
469 FILE:
470 while (($key, $val) = each %tar) {
471 my $file = fname($key);
472
473 # don't index directories
474 next if $file =~ /\/$/;
475
476 # is it a POD file?
477 next FILE unless $file =~ /readme/i or $val =~ /\n=head/;
478
479 # remove directory prefix
480 unless ($sloppy # no common root
481 or $file =~ s:^\Q$tar\E[^/]*/:: # common root, maybe alias
482 or ($TR{$tar} # common root, not aliased
483 and $file =~ s:^\Q$TR{$tar}\E::)
484 ) {
485 # try to determine an alias
486 warn "Bad directory prefix: '$file'\n";
487 my ($prefix) = split /\//, $file;
488
489 while ($key = (tied %tar)->NEXTKEY) {
490 my $file = fname($key);
491
492 next if $file =~ /\/$/;
493 unless ($file =~ m:^$prefix/: or $file eq $prefix) {
494 warn "Archive contains different prefixes: $prefix,$file\n";
495 $prefix = '';
496 last;
497 }
498 }
499 if ($prefix) {
500 print "Please alias '$tar' to '$prefix' next time!\n";
501 print "See alias table later.\n";
502 $NEW_ALIAS{$tar} = $prefix;
503 $tb->delete_by_key($parent);
504 next ARCHIVE;
505 } else {
506 print "Assuming that tar file name $tar is a valid prefix\n";
507 $sloppy = 1;
508
509 # We may reset too much here! But that this is not exact
510 # science anyway. Maybe we should ignore using 'next ARCHIVE'.
511
512 $key = (tied %tar)->FIRSTKEY;
513 redo FILE;
514 }
515 }
516
517 # remove /lib prefix
518 $file =~ s:^lib/::;
519
520 # generate new path
521 my $path = "$DATA/$tar/$file";
522
523 my ($sbase, $sdir) = fileparse($path);
524 my $fh;
525
526 unless ($OPT{test}) {
527 if (-f $path) {
528 index_pod(file => $path, remove => 1);
529 unlink $path or warn "Could not unlink '$path' $!\n";
530 } elsif (!-d $sdir) {
531 mkpath($sdir,1,0755) or die "Could not mkpath($sdir): $!\n";
532 }
533 # $fh = new IO::File "> $path";
534 $fh = new IO::Zlib "$path.gz","wb";
535 die "Could not write '$path': $!\n" unless $fh;
536 }
537
538 if ($file =~ /readme|install/i) { # make READMEs verbatim pods
539 $val =~ s/\n/\n /g;
540 $val = "=head1 NAME\n\n$tar $file\n\n=head1 DESCRIPTION\n\n $val"
541 unless $val =~ /^=head/m;
542 } else { # remove non-pod stuff
543 my $nval = $val; $val = '';
544 my $cutting = 1;
545
546 for (split /\n/, $nval) {
547 if (/^=cut|!NO!SUBS!/) {
548 $cutting = 1;
549 } elsif ($cutting and /^=head/) {
550 $cutting = 0;
551 }
552 unless ($cutting) {
553 $val .= $_ . "\n";
554 }
555 }
556 }
557 unless ($OPT{test}) {
558 $fh->print($val);
559 index_pod(file => $path, parent => $parent,
560 text => $val, source => $ARCHIVE{$tar},
561 mdate => $parent_mdate,
562 );
563 }
564 }
565
566 if ($LWP and !$OPT{keep}) {
567 unlink $TAR or warn
568 "Could not unlink '$TAR': $!\n";
569 }
570 }
571
572 if (%NEW_ALIAS) {
573 print "\%ALIAS = (\n";
574 for (keys %NEW_ALIAS) {
575 print "\t'$_'\t=> '$NEW_ALIAS{$_}',\n";
576 }
577 print "\t);\n";
578 }
579
580 exit;
581
582 sub fname ($) {
583 my $key = shift;
584 my ($ntar, $file) = split $;, $key;
585
586 # remove leading './' - shudder
587 $file =~ s/^\.\///;
588
589 return($file);
590 }
591
592 sub myget {
593 my $tar = shift;
594 my $TAR;
595
596 if ($LWP) { # fetch the archive
597 if ($OPT{keep}) {
598 $TAR = "$OPT{keep}/$ARCHIVE{$tar}";
599 print "Keeping in '$TAR'\n" unless -e $TAR;
600 my ($base, $path) = fileparse($TAR);
601 unless (-d $path) {
602 mkpath($path,1,0755) or
603 die "Could not mkpath($path)\n";
604 }
605 } else {
606 $TAR = "/tmp/$tar.tar.gz";
607 }
608 unless (-e $TAR) { # lwp mirror seems to fetch ftp: in any case?
609 print "Fetching $OPT{cpan}/$ARCHIVE{$tar}\n";
610 my $status = mirror("$OPT{cpan}/$ARCHIVE{$tar}", $TAR);
611 if ($status != &RC_OK and $status != &RC_NOT_MODIFIED) {
612 warn "Was unable to mirror '$ARCHIVE{$tar}, skipping'\n";
613 return;
614 }
615 }
616 } else {
617 $TAR = $ARCHIVE{$tar};
618 }
619 $TAR;
620 }
621
622 sub index_pod {
623 my %parm = @_;
624 my $did = $parm{file};
625 my $rel_did = $did;
626 my $abs_did = $did;
627
628 if ($rel_did =~ s:$DIR/::) {
629 $abs_did = "$DIR/$rel_did";
630 }
631
632 undef $did;
633
634 # check for both variants
635 if ($tb->have('docid' => $rel_did)) {
636 $did = $rel_did;
637 } elsif ($tb->have('docid' => $abs_did)) {
638 $did = $abs_did;
639 }
640 if ($did) { # have it version
641 if (!$parm{remove} and !$OPT{force}) {
642 warn "duplicate: $did\n";
643 return;
644 }
645 } else { # not seen yet
646 $did = $rel_did;
647 if ($parm{remove}) {
648 print "missing: $did\n";
649 return;
650 }
651 }
652
653 $parm{'text'} ||= WAIT::Document::Find->FETCH($abs_did);
654
655 unless (defined $parm{'text'}) {
656 print "unavailable: $did\n";
657 return;
658 }
659
660 my $record = $layout->split($parm{'text'});
661
662 if (! $record) {
663 print "empty pod: $did\n";
664 return;
665 }
666
667 $record->{size} = length($parm{'text'});
668 my $headline = $record->{name} || $did;
669
670 $record->{'@mdate'} = $parm{'mdate'} if ($parm{'mdate'});
671
672 $headline =~ s/^$DATA//o; # $did
673 $headline =~ s/\s+/ /g; $headline =~ s/^\s+//;
674
675 printf "%s %s\n", ($parm{remove})?'-':'+', substr($headline,0,70);
676 if ($parm{remove}) {
677 $tb->delete('docid' => $did,
678 headline => $headline,
679 %{$record});
680 } else {
681 $tb->insert('docid' => $did,
682 headline => $headline,
683 source => $parm{source},
684 parent => $parm{parent},
685 %{$record});
686 }
687 }
688
689 # This *must* remove the version in *any* case. It should compute a
690 # resonable version number - but usually mtimes should be used.
691 sub version {
692 local ($_) = @_;
693
694 # remove alpha/beta postfix
695 s/([-_\d])(a|b|alpha|beta|src)$/$1/;
696
697 # jperl1.3@4.019.tar.gz
698 s/@\d.\d+//;
699
700 # oraperl-v2.4-gk.tar.gz
701 s/-v(\d)/$1/;
702
703 # lettered versions - shudder
704 s/([-_\d\.])([a-z])([\d\._])/sprintf "$1%02d$3", ord(lc $2) - ord('a') /ei;
705 s/([-_\d\.])([a-z])$/sprintf "$1%02d", ord(lc $2) - ord('a') /ei;
706
707 # thanks libwww-5b12 ;-)
708 s/(\d+)b/($1-1).'.'/e;
709 s/(\d+)a/($1-2).'.'/e;
710
711 # replace '-pre' by '0.'
712 s/-pre([\.\d])/-0.$1/;
713 s/\.\././g;
714 s/(\d)_(\d)/$1$2/g;
715
716 # chop '[-.]' and thelike
717 s/\W$//;
718
719 # ram's versions Storable-0.4@p
720 s/\@/./;
721
722 if (s/[-_]?(\d+)\.(0\d+)\.(\d+)$//) {
723 return($_, $1 + "0.$2" + $3 / 1000000);
724 } elsif (s/[-_]?(\d+)\.(\d+)\.(\d+)$//) {
725 return($_, $1 + $2/1000 + $3 / 1000000);
726 } elsif (s/[-_]?(\d+\.[\d_]+)$//) {
727 return($_, $1);
728 } elsif (s/[-_]?([\d_]+)$//) {
729 return($_, $1);
730 } elsif (s/-(\d+.\d+)-/-/) { # perl-4.019-ref-guide
731 return($_, $1);
732 } else {
733 if ($_ =~ /\d/) { # smells like an unknown scheme
734 warn "Odd version Numbering: '$File::Find::name'\n";
735 return($_, undef);
736 } else { # assume version 0
737 warn "No version Numbering: '$File::Find::name'\n";
738 return($_, 0);
739 }
740
741 }
742 }
743
744 sub wanted {
745 my $mtime = shift; # called by parse_file_ls();
746
747 return if (! $max);
748 $max--;
749
750 return unless /^(.*)\.(tar\.(gz|Z)|tgz)$/;
751
752 my ($archive, $version) = version($1);
753
754 unless (defined $version) {
755 warn "Skipping $1\n";
756 return;
757 }
758
759 # Check for file alias
760 $archive = $ALIAS{$archive} if $ALIAS{$archive};
761
762 # Check for path alias.
763 if ($File::Find::name =~ m(/CPAN/(?:source/)?(.*\Q$archive\E))) {
764 if ($ALIAS{$1}) {
765 $archive = $ALIAS{$1};
766 }
767 }
768
769 if ($OPT{trust_mtime}) {
770 $version = $mtime;
771 } else {
772 $version =~ s/(\d)_/$1/;
773 $version ||= $mtime; # mtime
774 }
775
776 if (!exists $ARCHIVE{$archive}
777 or $VERSION{$archive} < $version) {
778 $ARCHIVE{$archive} = $File::Find::name;
779 $VERSION{$archive} = $version;
780 }
781 }
782
783 sub create_table {
784 my %parm = @_;
785
786 my $access = bless {}, 'WAIT::Document::Find';
787
788 my $stem = [{
789 'prefix' => ['isotr', 'isolc'],
790 'intervall' => ['isotr', 'isolc'],
791 }, 'isotr', 'isolc', 'split2', 'stop', 'Stem'];
792 my $text = [{
793 'prefix' => ['isotr', 'isolc'],
794 'intervall' => ['isotr', 'isolc'],
795 },
796 'isotr', 'isolc', 'split2', 'stop'];
797 my $sound = ['isotr', 'isolc', 'split2', 'Soundex'],;
798
799 my $tb =
800 $parm{db}->create_table
801 (name => $parm{table},
802 attr => ['docid', 'headline', 'source', 'size', 'parent'],
803 keyset => [['docid']],
804 layout => $parm{layout},
805 access => $access,
806 invindex =>
807 [
808 'name' => $stem,
809 'synopsis' => $stem,
810 'bugs' => $stem,
811 'description' => $stem,
812 'text' => $stem,
813 'environment' => $text,
814 'example' => $text, 'example' => $stem,
815 'author' => $sound, 'author' => $stem,
816 ]
817 );
818 die "Could not create table '$parm{table}'" unless $tb;
819 $tb;
820 }
821
822 my %MON;
823 my $YEAR;
824
825 BEGIN {
826 my $i = 1;
827 for (qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec)) {
828 $MON{$_} = $i++;
829 }
830 $YEAR = (localtime(time))[5];
831 }
832
833 # We could/should use Date::GetDate here
834 use Time::Local;
835 sub mtime {
836 my ($mon, $mday, $time) = @_;
837 my ($hour, $min, $year, $monn) = (0,0);
838
839 if ($time =~ /(\d+):(\d+)/) {
840 ($hour, $min) = ($1, $2);
841 $year = $YEAR;
842 } else {
843 $year = $time;
844 }
845 $monn = $MON{$mon} || $MON{ucfirst lc $mon} || warn "Unknown month: '$mon'";
846 my $guess = timelocal(0,$min,$hour,$mday,$monn-1,$year);
847 if ($guess > time) {
848 $guess = timelocal(0,$min,$hour,$mday,$monn-1,$year-1);
849 }
850 $guess;
851 }
852
853 package HyperEstraier::WAIT::Table;
854
855 use HyperEstraier;
856 use Text::Iconv;
857
858 =head1 NAME
859
860 HyperEstraier::WAIT::Table
861
862 =head1 DESCRIPTION
863
864 This is a mode that emulates C<WAIT::Table> functionality somewhat.
865
866 There are some limitations and only one key attribute is supported (and used
867 for C<@uri>).
868
869 =head2 Porting from WAIT to this module.
870
871 Since only one key is supported (and used as C<@uri> attribute),
872 use first parametar of C<keyset> as C<key>.
873
874 Full text index is specified as C<invindex>, but you need just name of fields.
875
876 You will probably need to add
877
878 use WAIT::Parse::Base;
879
880 to your code after you remove C<WAIT::Config> and C<WAIT::Database>.
881
882 =head1 METHODS
883
884 =head2 new
885
886 my $tb = new HyperEstraier::WAIT::Table(
887 uri => 'http://localhost:1978/node/cpan',
888 attr => qw/docid headline source size parent/,
889 key => 'docid',
890 invindex => qw/name synopsis bugs description text environment example author/,
891 );
892
893 =cut
894
895 sub new {
896 my $class = shift;
897 my $self = {@_};
898 bless($self, $class);
899
900 foreach my $p (qw/uri attr key invindex/) {
901 die "need $p" unless ($self->{$p});
902 }
903
904 $self->{'iso2utf'} = Text::Iconv->new('ISO-8859-1','UTF-8');
905
906 my $node = HyperEstraier::Node->new($self->{'uri'});
907 $node->set_auth('admin', 'admin');
908
909 $self->{'node'} = $node;
910
911 $self ? return $self : return undef;
912 }
913
914 =head2 have
915
916 if ( $tb->have(docid => $something) ) ...
917
918 =cut
919
920 sub have {
921 my $self = shift;
922 my $args = {@_};
923 my $key = $self->{'key'} || die "no key in object";
924 my $key_v = $args->{$key} || die "no key $key in data";
925
926 my $id = $self->{'node'}->uri_to_id('file://' . $key_v);
927
928 return ($id == -1 ? undef : $id);
929 }
930
931 =head2 insert
932
933 my $key = $tb->insert(
934 docid => $base,
935 headline => 'Something',
936 ...
937 );
938
939 =cut
940
941 sub insert {
942 my $self = shift;
943 my $args = {@_};
944
945 my $uri = 'file://';
946 $uri .= $args->{'docid'} or die "no docid";
947
948 my $doc = HyperEstraier::Document->new;
949
950 $doc->add_attr('@uri', $uri);
951 $doc->add_attr('@title', $args->{'headline'}) if ($args->{'headline'});
952 $doc->add_attr('@size', $args->{'size'}) if ($args->{'size'});
953
954 my @attr = $self->{'attr'} || die "no attr in object";
955 my @invindex = $self->{'invindex'} || die "no invindex in object";
956
957 foreach my $attr (keys %{$args}) {
958 if (grep(/^$attr$/, @{ $self->{'attr'} }) or $attr =~ m/^@/o) {
959 $doc->add_attr($attr, $args->{$attr});
960 }
961 if (grep(/^$attr$/, @{ $self->{'invindex'} })) {
962 $doc->add_text($args->{$attr});
963 }
964 }
965
966 print STDERR $doc->dump_draft if ($self->{'debug'});
967
968 my $id;
969 unless ($id = $self->{'node'}->put_doc($doc)) {
970 printf STDERR "ERROR: %d\n", $self->{'node'}->status;
971 #} else {
972 # print STDERR "id: $id\n";
973 }
974
975 return $id;
976 }
977
978 =head2 delete_by_key
979
980 $tb->delete_by_key($key);
981
982 =cut
983
984 sub delete_by_key {
985 my $self = shift;
986 my $key_v = shift || die "no key?";
987
988 unless ($self->{'node'}->out_doc_by_uri( 'file://' . $key_v )) {
989 print STDERR "WARNING: can't delete document $key_v\n";
990 }
991 }
992
993 =head2 delete
994
995 $tb->delete( docid => $did, ... );
996
997 =cut
998
999 sub delete {
1000 my $self = shift;
1001 my $args = {@_};
1002
1003 my $key = $self->{'key'} || die "no key in object";
1004
1005 die "no $key in data" unless (my $key_v = $args->{$key});
1006
1007 $self->delete_by_key($key_v);
1008
1009 }

Properties

Name Value
svn:executable *

  ViewVC Help
Powered by ViewVC 1.1.26