/[hyperestraier_wrappers]/trunk/perl/scripts/cpanest
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/perl/scripts/cpanest

Parent Directory Parent Directory | Revision Log Revision Log


Revision 24 - (hide annotations)
Sat Sep 17 00:48:32 2005 UTC (18 years, 6 months ago) by dpavlin
File size: 28337 byte(s)
insert should return document_id and not true/false so that links to parent
(distribution document id) would work
1 dpavlin 20 #!/usr/bin/perl -w
2    
3     =head1 NAME
4    
5     cpanest - generate an Hyper Estraier index for CPAN
6    
7     =head1 SYNOPSIS
8    
9     B<cpanest>
10     [B<-clean>] [B<-noclean>]
11     [B<-cpan> I<url or directory>]
12     [B<-node> I<node_uri>]
13     [B<-force>] [B<-noforce>]
14     [B<-keep> I<directory>]
15     [B<-match> I<regexp>]
16     [B<-test> I<level>]
17     [B<-trust_mtime>] [B<-notrust_mtime>]
18    
19     =head1 DESCRIPTION
20    
21     This is a port of C<cpanwait> from L<WAIT> perl search engine to node API of
22     Hyper Estraier.
23    
24     All the hard work was done by Ulrich Pfeifer who wrote all parsers and
25     formatters. I just added support for Hyper Estraier back-end after.
26    
27     B<This documentation is somewhat incomplete and off-the-sync with code.>
28    
29     =head1 OPTIONS
30    
31     =over 5
32    
33     =item B<-clean> / B<-noclean>
34    
35     Clean the table befor indexing. Default is B<off>.
36    
37     =item B<-cpan> I<url or directory>
38    
39     Default directory or URL for indexing. If an URL is given, there
40     currently must be a file F<indices/find-ls.gz> relative to it which
41     contains the output of C<find . -ls | gzip>.
42     Default is F<ftp://ftp.rz.ruhr-uni-bochum.de/pub/CPAN>.
43    
44    
45     =item B<-node> I<http://localhost:1978/node/cpan>
46    
47     Specify node URI
48    
49     =item B<-force>
50    
51     Force reindexing, even if B<cpan> thinks files are up to date.
52     Default is B<off>
53    
54     =item B<-keep> I<directory>
55    
56     If fetching from a remote server, keep files in I<directory>. Default is
57     F</app/unido-i06/src/share/lang/perl/96a/CPAN/sources>.
58    
59     =item B<-match> I<regexp>
60    
61     Limit to patches matching I<regexp>. Default is F<authors/id/>.
62    
63     =item B<-test> I<level>
64    
65     Set test level, were B<0> means normal operation, B<1> means, don't
66     really index and B<2> means, don't even get archives and examine them.
67    
68     =item B<-trust_mtime> / B<-notrust_mtime>
69    
70     If B<on>, the files mtimes are used to decide, which version of an
71     archive is the newest. If b<off>, the version extracted is used
72     (beware, there are far more version numbering schemes than B<cpan> can
73     parse).
74    
75     =back
76    
77     =head1 AUTHORS
78    
79     Ulrich Pfeifer E<lt>F<pfeifer@ls6.informatik.uni-dortumund.de>E<gt>
80    
81     Dobrica Pavlinusic E<lt>F<dpavlin@rot13.org>E<gt>
82    
83     =head1 COPYRIGHT
84    
85     Copyright (c) 1996-1997, Ulrich Pfeifer
86    
87     Copyright (c) 2005, Dobrica Pavlinusic
88    
89     =cut
90    
91     use strict;
92    
93     use File::Path;
94     use DB_File;
95     use Getopt::Long;
96     use File::Find;
97     use File::Basename;
98     use IO::File;
99     use IO::Zlib;
100 dpavlin 23 use POSIX qw/strftime/;
101 dpavlin 20
102     use lib '/data/wait/lib';
103    
104     use WAIT::Parse::Base;
105     use WAIT::Parse::Pod;
106     use WAIT::Document::Tar;
107    
108     sub fname($);
109    
110     # maximum number of archives to index (set to -1 for unlimited)
111     my $max = -1;
112    
113     my %OPT = (
114     node => 'http://localhost:1978/node/cpan',
115     clean => 0,
116     remove => [],
117     force => 0,
118     # cpan => '/usr/src/perl/CPAN/sources',
119     cpan => '/rest/cpan/CPAN/',
120     trust_mtime => 1,
121     match => 'authors/id/',
122     test => 0,
123     # cpan => 'ftp://ftp.rz.ruhr-uni-bochum.de/pub/CPAN',
124     # cpan => 'ftp://ftp.uni-hamburg.de:/pub/soft/lang/perl/CPAN',
125     keep => '/tmp/CPAN/',
126     );
127    
128     GetOptions(\%OPT,
129     'node=s',
130     'cpan=s',
131     'keep=s',
132     'match=s',
133     'clean!',
134     'test=i', # test level 0: normal
135     # 1: don't change db
136     # 2: don't look at archives even
137    
138     'remove=s@',
139     'force!', # force indexing even if seen
140     'trust_mtime!', # use mtime instead of version number
141     'max=i',
142     'debug!',
143     ) || die "Usage: ...\n";
144    
145     if ($OPT{max}) {
146     $max = $OPT{max};
147     print STDERR "processing just first $max modules\n";
148     }
149    
150     # FIXME
151     #clean_node(
152     # node => $OPT{node},
153     # ) if $OPT{clean};
154    
155     my $tb = new HyperEstraier::WAIT::Table(
156     uri => $OPT{node},
157     attr => ['docid', 'headline', 'source', 'size', 'parent'],
158     key => 'docid',
159     invindex => [ qw/name synopsis bugs description text environment example author/ ],
160     debug => $OPT{debug},
161     ) or die "Could not open node '$OPT{node}'";
162    
163     my $layout= new WAIT::Parse::Pod;
164    
165     # Map e.g. '.../latest' to 'perl'. Used in wanted(). Effects version
166     # considerations. Value *must* match common prefix. Aliasing should be
167     # used if CPAN contains serveral distributions with different name but
168     # same root directory.
169     # We still have a problem if there are different root directories!
170    
171     my %ALIAS = (# tar name real (root) name
172     'Games-Scrabble' => 'Games',
173     'HTML-ParseBrowser' => 'HTML',
174     'iodbc_ext' => 'iodbc-ext-0.1',
175     'sol-inst' => 'Solaris',
176     'WebService-Validator-CSS-223C' => 'WebService-Validator-CSS-W3C-0.02',
177     'MPEG-ID3212Tag' => 'MPEG-ID3v2Tag-0.36',
178     'WebService-GoogleHack' => 'WebService',
179     'Db-Mediasurface-ReadConfig' => 'ReadConfig',
180     'Tie-Array-RestrictUpdates' => 'Tie',
181     'HTML-Lister' => 'HTML',
182     'Net-253950-AsyncZ' => 'Net-Z3950-AsyncZ-0.08',
183     'ChildExit_0' => 'ChildExit-0.1',
184     'Tie-TieConstant' => 'TieConstant.pm',
185     'Crypt-OpenSSL-23509' => 'Crypt-OpenSSL-X509-0.2',
186     'subclustv' => 'blib',
187     'finance-yahooquote' => 'Finance-YahooQuote-0.20',
188     'HPUX-FS' => 'FS',
189     'Business-DE-Konto' => 'Business',
190     'Digest-MD5-124p' => 'Digest-MD5-M4p-0.01',
191     'AKDB_Okewo_de' => 'AKDB',
192     'ExtUtils-0577' => 'ExtUtils-F77-1.14',
193     'LispFmt' => 'Lisp::Fmt-0.00',
194     'Acme-Stegano' => 'Acme',
195     'Acme-RTB' => 'Acme',
196     'WWW-Search-PRWire' => 'work',
197     'Video-Capture-214l' => 'Video-Capture-V4l-0.224',
198     'Tie-DirHandle' => 'Tie',
199     'DB2' => 'DBD-DB2-0.71a',
200     'Tie-Scalar-RestrictUpdates' => 'Tie',
201     'Math-MVPoly' => 'MVPoly',
202     'PlugIn' => 'PlugIn.pm',
203     'Lingua-ID-Nums2Words' => 'Nums2Words-0.01',
204     'chronos-1.' => 'Chronos',
205     'jp_beta' => 'jperl_beta_r1',
206     'Bundle-223C-Validator' => 'Bundle-W3C-Validator-0.6.5',
207     'Text-199' => 'Text-T9-1.0',
208     'Games-Literati' => 'Games',
209     'VMS-IndexedFile' => 'VMS',
210     'authen-rbac' => 'Authen',
211     'Graphics-EPS' => 'EPS.pm',
212     'new.spirit-2.' => 'new.spirit',
213     'Tk-MListbox' => 'MListbox-1.11',
214     'DBD-SQLrelay' => 'SQLRelay.pm',
215     'Tie-RDBM-Cached' => 'RDBM',
216     'PDL_IO_HDF' => 'HDF',
217     'HPUX-LVM' => 'LVM',
218     'Parse-Nibbler' => 'Parse',
219     'Digest-Perl-MD4' => 'MD4',
220     'Crypt-Imail' => 'Imail',
221     'ubertext' => 'Text-UberText-0.95',
222     'MP3-123U' => 'M3U',
223     'Qmail-Control' => 'Qmail',
224     'T-LXS' => 'Text-LevenshteinXS-0.02',
225     'HTML-Paginator' => 'HTML',
226     'swig' => 'SWIG1.1p5',
227     'MIDI-Realtime' => 'MIDI',
228     'sparky-public' => 'Sparky-Public-1.06',
229     'Chemistry-MolecularMass' => 'Chemistry',
230     'Net-253950-SimpleServer' => 'Net-Z3950-SimpleServer-0.08',
231     'NewsClipper-OpenSource' => 'NewsClipper-1.32-OpenSource',
232     'Win32API-Resources' => 'Resources.pm',
233     'Unicode-Collate-Standard-2131_1' => 'Unicode-Collate-Standard-V3_1_1-0.1',
234     'Net-026Term' => 'Net-C6Term-0.11',
235     'BitArray1' => 'BitArray',
236     'Audio-Radio-214L' => 'Audio-Radio-V4L-0.01',
237     'Devel-AutoProfiler' => 'Devel',
238     'Brasil-Checar-CGC' => 'Brasil',
239     'AI-NeuralNet-SOM' => 'SOM.pm',
240     'Net-BitTorrent-File-fix' => 'Net-BitTorrent-File-1.01',
241     'VMS-FindFile' => 'VMS',
242     'LoadHtml.' => 'README',
243     'Time-Compare' => 'Time',
244     'ShiftJIS-230213-MapUTF' => 'ShiftJIS-X0213-MapUTF-0.21',
245     'Image-WMF' => 'Image',
246     'sdf-2.0.eta' => 'sdf-2.001beta1',
247     'Math-Expr-LATEST' => 'Math-Expr-0.4',
248     'MP3-Player-PktConcert' => 'MP3',
249     'Apache-OWA' => 'OWA',
250     'Audio-Gramofile' => 'Audio',
251     'DBIx-Copy' => 'Copy',
252     'P4-024' => 'P4-C4-2.021',
253     'Disassemble-2386' => 'Disassemble-X86-0.13',
254     'Proc-Swarm' => 'Swarm-0.5',
255     'Smil' => 'perlysmil',
256     'Net-SSH-2232Perl' => 'Net-SSH-W32Perl-0.05',
257     'Win32-SerialPort' => 'SerialPort-0.19',
258     'Lingua-ID-Words2Nums' => 'Words2Nums-0.01',
259     'Parse-Text' => 'Text',
260     'DBIx-HTMLView-LATEST' => 'DBIx-HTMLView-0.9',
261     'Apache-NNTPGateway' => 'NNTPGateway-0.9',
262     'XPathToXML' => 'XPathToXML.pm',
263     'XML-WMM-ASX' => 'XML',
264     'CGISession' => 'CGI',
265     'Net-SMS-142' => 'Net-SMS-O2-0.019',
266     'Search-253950' => 'Search-Z3950-0.05',
267     'Date-Christmas' => 'Christmas',
268     'Win32-InternetExplorer-Window' => 'Win32',
269     'Apache-WAP-MailPeek' => 'MailPeek',
270     'Statistics-Table-F' => 'Statistics',
271     'BerkeleyDB_Locks' => 'BerkeleyDB-Locks-0_2',
272     'HookPrePostCall' => 'PrePostCall-1.2',
273     'Oak-AAS-Service-DBI_13_PAM' => 'Oak-AAS-Service-DBI_N_PAM-1.8',
274     'Math-Vector' => 'Vector.pm',
275     'Audio-124pDecrypt' => 'Audio-M4pDecrypt-0.04',
276     'libao-perl_0.03' => 'libao-perl-0.03',
277     'CGI-EZForm' => 'EZForm',
278     'Data-Locations-fixed' => 'Data-Locations-5.2-fixed',
279     'HTML-Template-Filter-Dreamweaver' => 'Dreamweaver',
280     'LineByLine' => 'LineByLine.pm',
281     'Geo-0400' => 'Geo-E00-0.05',
282     'WebService-Validator-HTML-223C' => 'WebService-Validator-HTML-W3C-0.03',
283     'DateTime-Format-223CDTF' => 'DateTime-Format-W3CDTF-0.04',
284     'DBD_SQLFLEX' => 'DBD-Sqlflex',
285     'Text-Number' => 'Number',
286     'DBIx-DataLookup' => 'DBIx',
287     'MP3-ID3211Tag' => 'MP3-ID3v1Tag-1.11',
288     'Text-Striphigh' => 'Striphigh-0.02',
289     'Tie-SortHash' => 'SortHash',
290     'Apache-AccessAbuse' => 'AccessAbuse',
291     'MP3-123U-Parser' => 'MP3-M3U-Parser',
292     'Net-253950' => 'Net-Z3950-0.44',
293     'Net-RBLClient' => 'RBLCLient-0.2',
294     'CGI-EasyCGI' => 'CGI',
295     'http-handle' => 'HTTP::Handle',
296     'JPEG-Comment' => 'JPEG',
297     'router-lg' => 'Router',
298     'Db-Mediasurface' => 'Mediasurface',
299     'Text-BarGraph' => 'bargraph',
300     'TL' => 'Text-Levenshtein-0.04',
301     'Config-Vars' => 'Config-0.01',
302     'Tie-PerfectHash' => 'Tie',
303     'DNS-TinyDNS' => 'DNS',
304     'DesignPattern-Factory' => 'Factory',
305     'WWW-01_Rail' => 'WWW-B_Rail-0.01',
306     'Win32-Exchange' => 'blib',
307     'Math-RPN' => 'Math',
308     'Db-Mediasurface-Cache' => 'Cache',
309     'perl_archie.' => 'Archie.pm',
310     'Acme-PGPSign' => 'Acme',
311     'HTML-Widget-sideBar' => 'HTML-Widget-SideBar-1.00',
312     'log' => 'Games',
313     'File-List' => 'File',
314     'Schedule-Cronchik' => 'Schedule',
315     'Curses-Devkit' => 'Cdk',
316     'Pod-PalmDoc' => 'Pod',
317     'Easy-WML' => 'Easy WML 0.1',
318     'Interval.' => 'Date',
319     'Brasil-Checar-CPF' => 'Brasil',
320     'Apache-WAP-AutoIndex' => 'AutoIndex',
321    
322     'SOM.pm' => 'SOM.pm',
323     'PlugIn.pm' => 'PlugIn.pm',
324     'XPathToXML.pm' => 'XPathToXML.pm',
325     'Vector.pm' => 'Vector.pm',
326     'LineByLine.pm' => 'LineByLine.pm',
327     'Archie.pm' => 'Archie.pm',
328     'TieConstant.pm' => 'TieConstant.pm',
329     'EPS.pm' => 'EPS.pm',
330     'SQLRelay.pm' => 'SQLRelay.pm',
331     'Resources.pm' => 'Resources.pm',
332     'README' => 'README',
333    
334     );
335     my %NEW_ALIAS; # found in this pass
336    
337     # Map module names to pathes. Generated by wanted() doing alisaing.
338     my %ARCHIVE;
339    
340     # Map module names to latest version. Generated by wanted()
341     my %VERSION;
342    
343    
344     # Mapping for modules with common root not matching modules name that
345     # are not aliased. This is just for prefix stripping and not strictly
346     # necessary. Takes effect after version considerations.
347     my %TR = (# tar name root to strip
348     'Net_SSLeay.pm' => 'SSLeay/',
349     'EventDrivenServer' => 'Server/',
350     'bio_lib.pl.' => '',
351     'AlarmCall' => 'Sys/',
352     'Cdk-ext' => 'Cdk/',
353     'Sx' => '\d.\d/',
354     'DumpStack' => 'Devel/',
355     'StatisticsDescriptive'=> 'Statistics/',
356     'Term-Gnuplot' => 'Gnuplot/',
357     'iodbc_ext' => 'iodbc-ext-\d.\d/',
358     'UNIVERSAL' => '',
359     'Term-Query' => 'Query/',
360     'SelfStubber' => 'Devel/',
361     'CallerItem' => 'Devel/',
362     );
363    
364     my $LWP;
365    
366     # FIXME
367     my $DIR = '/rest/estseek/cpan/';
368     my $DATA = $DIR . '/data';
369    
370    
371     if (@{$OPT{remove}}) {
372     my $pod;
373     for $pod (@{$OPT{remove}}) {
374     unless (-e $pod) {
375     $pod = "$DIR/$pod";
376     }
377     index_pod(file => $pod, remove => 1) if -f $pod;
378     unlink $pod or warn "Could not unlink '$pod': $!\n";
379     }
380     exit;
381     }
382    
383     # Now get the beef
384     if ($OPT{cpan} =~ /^(http|ftp):/) {
385     $LWP = 1;
386     require LWP::Simple;
387     LWP::Simple->import();
388    
389     mkpath($DATA,1,0755) or
390     die "Could not generate '$DATA/': $!"
391     unless -d $DATA;
392    
393     if (! -f "$DATA/find-ls.gz" or -M "$DATA/find-ls.gz" > 0.5) {
394     my $status = mirror("$OPT{cpan}/indices/find-ls.gz", "$DATA/find-ls.gz");
395     if ($status != &RC_OK and $status != &RC_NOT_MODIFIED) {
396     # we could use Net:FTP here ...
397     die "Was unable to mirror '$OPT{cpan}/indices/find-ls.gz'\n";
398     }
399     }
400     my $fh = new IO::File "gzip -cd $DATA/find-ls.gz |";
401     die "Could not open 'gzip -cd $DATA/find-ls.gz': !$\n" unless $fh;
402    
403     my $line;
404     while (defined ($line = <$fh>)) {
405     chomp($line);
406     my ($mon, $mday, $time, $file, $is_link) = (split ' ', $line)[7..11];
407    
408     next if defined $is_link;
409     my $mtime = mtime($mon, $mday, $time);
410    
411     $file =~ s:^\./::;
412     ($_) = fileparse($file);
413     $File::Find::name = $file;
414     wanted($mtime);
415     }
416     } else {
417     find(sub {&wanted((stat($_))[9])}, $OPT{cpan});
418     }
419    
420     ARCHIVE:
421     for my $tar (sort keys %ARCHIVE) {
422     next if $OPT{match} and $ARCHIVE{$tar} !~ /$OPT{match}/o;
423     my $base = (split /\//, $ARCHIVE{$tar})[-1];
424     my $parent;
425 dpavlin 23 my $parent_mdate;
426 dpavlin 20
427 dpavlin 23 my %attr;
428    
429 dpavlin 20 # logging
430     if ($OPT{trust_mtime}) {
431 dpavlin 23 $attr{'@mdate'} = strftime('%Y-%m-%dT%H:%M:%S+00:00', gmtime($VERSION{$tar}));
432     $parent_mdate = $attr{'@mdate'};
433     printf "%-20s %10s %s\t", $tar, $attr{'@mdate'}, $base;
434 dpavlin 20 } else {
435 dpavlin 23 $attr{'version'} = $VERSION{$tar};
436     printf "%-20s %10.5f %s\t", $tar, $attr{'version'}, $base;
437 dpavlin 20 }
438    
439     # Remember the archive
440     # We should have an extra table for the tar file data ...
441     if (!$OPT{force} and $tb->have(docid => $base)) {
442     print "skipping\n";
443     next ARCHIVE;
444     } else {
445     $parent = $tb->insert(docid => $base,
446 dpavlin 23 headline => $ARCHIVE{$tar},
447     %attr
448     ) unless $OPT{test};
449 dpavlin 20 print "indexing\n";
450     }
451    
452     next ARCHIVE if $OPT{test} > 1;
453    
454     my $TAR = myget($tar);
455    
456     next ARCHIVE unless $TAR; # not able to fetch it
457    
458     my %tar;
459     tie (%tar,
460     'WAIT::Document::Tar',
461     sub { $_[0] =~ /\.(pm|pod|PL)$/i or $_[0] =~ /readme/i},
462     #sub { $_[0] !~ m:/$: },
463     $TAR)
464     or warn "Could not tie '$TAR'\n";
465    
466     my $sloppy;
467     my ($key, $val);
468    
469     FILE:
470     while (($key, $val) = each %tar) {
471     my $file = fname($key);
472    
473     # don't index directories
474     next if $file =~ /\/$/;
475    
476     # is it a POD file?
477     next FILE unless $file =~ /readme/i or $val =~ /\n=head/;
478    
479     # remove directory prefix
480     unless ($sloppy # no common root
481     or $file =~ s:^\Q$tar\E[^/]*/:: # common root, maybe alias
482     or ($TR{$tar} # common root, not aliased
483     and $file =~ s:^\Q$TR{$tar}\E::)
484     ) {
485     # try to determine an alias
486     warn "Bad directory prefix: '$file'\n";
487     my ($prefix) = split /\//, $file;
488    
489     while ($key = (tied %tar)->NEXTKEY) {
490     my $file = fname($key);
491    
492     next if $file =~ /\/$/;
493     unless ($file =~ m:^$prefix/: or $file eq $prefix) {
494     warn "Archive contains different prefixes: $prefix,$file\n";
495     $prefix = '';
496     last;
497     }
498     }
499     if ($prefix) {
500     print "Please alias '$tar' to '$prefix' next time!\n";
501     print "See alias table later.\n";
502     $NEW_ALIAS{$tar} = $prefix;
503     $tb->delete_by_key($parent);
504     next ARCHIVE;
505     } else {
506     print "Assuming that tar file name $tar is a valid prefix\n";
507     $sloppy = 1;
508    
509     # We may reset too much here! But that this is not exact
510     # science anyway. Maybe we should ignore using 'next ARCHIVE'.
511    
512     $key = (tied %tar)->FIRSTKEY;
513     redo FILE;
514     }
515     }
516    
517     # remove /lib prefix
518     $file =~ s:^lib/::;
519    
520     # generate new path
521     my $path = "$DATA/$tar/$file";
522    
523     my ($sbase, $sdir) = fileparse($path);
524     my $fh;
525    
526     unless ($OPT{test}) {
527     if (-f $path) {
528     index_pod(file => $path, remove => 1);
529     unlink $path or warn "Could not unlink '$path' $!\n";
530     } elsif (!-d $sdir) {
531     mkpath($sdir,1,0755) or die "Could not mkpath($sdir): $!\n";
532     }
533     # $fh = new IO::File "> $path";
534     $fh = new IO::Zlib "$path.gz","wb";
535     die "Could not write '$path': $!\n" unless $fh;
536     }
537    
538     if ($file =~ /readme|install/i) { # make READMEs verbatim pods
539     $val =~ s/\n/\n /g;
540     $val = "=head1 NAME\n\n$tar $file\n\n=head1 DESCRIPTION\n\n $val"
541     unless $val =~ /^=head/m;
542     } else { # remove non-pod stuff
543     my $nval = $val; $val = '';
544     my $cutting = 1;
545    
546     for (split /\n/, $nval) {
547     if (/^=cut|!NO!SUBS!/) {
548     $cutting = 1;
549     } elsif ($cutting and /^=head/) {
550     $cutting = 0;
551     }
552     unless ($cutting) {
553     $val .= $_ . "\n";
554     }
555     }
556     }
557     unless ($OPT{test}) {
558     $fh->print($val);
559     index_pod(file => $path, parent => $parent,
560 dpavlin 23 text => $val, source => $ARCHIVE{$tar},
561     mdate => $parent_mdate,
562     );
563 dpavlin 20 }
564     }
565    
566     if ($LWP and !$OPT{keep}) {
567     unlink $TAR or warn
568     "Could not unlink '$TAR': $!\n";
569     }
570     }
571    
572     if (%NEW_ALIAS) {
573     print "\%ALIAS = (\n";
574     for (keys %NEW_ALIAS) {
575     print "\t'$_'\t=> '$NEW_ALIAS{$_}',\n";
576     }
577     print "\t);\n";
578     }
579    
580     exit;
581    
582     sub fname ($) {
583     my $key = shift;
584     my ($ntar, $file) = split $;, $key;
585    
586     # remove leading './' - shudder
587     $file =~ s/^\.\///;
588    
589     return($file);
590     }
591    
592     sub myget {
593     my $tar = shift;
594     my $TAR;
595    
596     if ($LWP) { # fetch the archive
597     if ($OPT{keep}) {
598     $TAR = "$OPT{keep}/$ARCHIVE{$tar}";
599     print "Keeping in '$TAR'\n" unless -e $TAR;
600     my ($base, $path) = fileparse($TAR);
601     unless (-d $path) {
602     mkpath($path,1,0755) or
603     die "Could not mkpath($path)\n";
604     }
605     } else {
606     $TAR = "/tmp/$tar.tar.gz";
607     }
608     unless (-e $TAR) { # lwp mirror seems to fetch ftp: in any case?
609     print "Fetching $OPT{cpan}/$ARCHIVE{$tar}\n";
610     my $status = mirror("$OPT{cpan}/$ARCHIVE{$tar}", $TAR);
611     if ($status != &RC_OK and $status != &RC_NOT_MODIFIED) {
612     warn "Was unable to mirror '$ARCHIVE{$tar}, skipping'\n";
613     return;
614     }
615     }
616     } else {
617     $TAR = $ARCHIVE{$tar};
618     }
619     $TAR;
620     }
621    
622     sub index_pod {
623     my %parm = @_;
624     my $did = $parm{file};
625     my $rel_did = $did;
626     my $abs_did = $did;
627    
628     if ($rel_did =~ s:$DIR/::) {
629     $abs_did = "$DIR/$rel_did";
630     }
631    
632     undef $did;
633    
634     # check for both variants
635     if ($tb->have('docid' => $rel_did)) {
636     $did = $rel_did;
637     } elsif ($tb->have('docid' => $abs_did)) {
638     $did = $abs_did;
639     }
640     if ($did) { # have it version
641 dpavlin 23 if (!$parm{remove} and !$OPT{force}) {
642 dpavlin 20 warn "duplicate: $did\n";
643     return;
644     }
645     } else { # not seen yet
646     $did = $rel_did;
647     if ($parm{remove}) {
648     print "missing: $did\n";
649     return;
650     }
651     }
652    
653     $parm{'text'} ||= WAIT::Document::Find->FETCH($abs_did);
654    
655     unless (defined $parm{'text'}) {
656     print "unavailable: $did\n";
657     return;
658     }
659    
660     my $record = $layout->split($parm{'text'});
661    
662     if (! $record) {
663     print "empty pod: $did\n";
664     return;
665     }
666    
667     $record->{size} = length($parm{'text'});
668     my $headline = $record->{name} || $did;
669    
670 dpavlin 23 $record->{'@mdate'} = $parm{'mdate'} if ($parm{'mdate'});
671    
672 dpavlin 20 $headline =~ s/^$DATA//o; # $did
673     $headline =~ s/\s+/ /g; $headline =~ s/^\s+//;
674    
675     printf "%s %s\n", ($parm{remove})?'-':'+', substr($headline,0,70);
676     if ($parm{remove}) {
677     $tb->delete('docid' => $did,
678     headline => $headline,
679     %{$record});
680     } else {
681     $tb->insert('docid' => $did,
682     headline => $headline,
683     source => $parm{source},
684     parent => $parm{parent},
685     %{$record});
686     }
687     }
688    
689     # This *must* remove the version in *any* case. It should compute a
690     # resonable version number - but usually mtimes should be used.
691     sub version {
692     local ($_) = @_;
693    
694     # remove alpha/beta postfix
695     s/([-_\d])(a|b|alpha|beta|src)$/$1/;
696    
697     # jperl1.3@4.019.tar.gz
698     s/@\d.\d+//;
699    
700     # oraperl-v2.4-gk.tar.gz
701     s/-v(\d)/$1/;
702    
703     # lettered versions - shudder
704     s/([-_\d\.])([a-z])([\d\._])/sprintf "$1%02d$3", ord(lc $2) - ord('a') /ei;
705     s/([-_\d\.])([a-z])$/sprintf "$1%02d", ord(lc $2) - ord('a') /ei;
706    
707     # thanks libwww-5b12 ;-)
708     s/(\d+)b/($1-1).'.'/e;
709     s/(\d+)a/($1-2).'.'/e;
710    
711     # replace '-pre' by '0.'
712     s/-pre([\.\d])/-0.$1/;
713     s/\.\././g;
714     s/(\d)_(\d)/$1$2/g;
715    
716     # chop '[-.]' and thelike
717     s/\W$//;
718    
719     # ram's versions Storable-0.4@p
720     s/\@/./;
721    
722     if (s/[-_]?(\d+)\.(0\d+)\.(\d+)$//) {
723     return($_, $1 + "0.$2" + $3 / 1000000);
724     } elsif (s/[-_]?(\d+)\.(\d+)\.(\d+)$//) {
725     return($_, $1 + $2/1000 + $3 / 1000000);
726     } elsif (s/[-_]?(\d+\.[\d_]+)$//) {
727     return($_, $1);
728     } elsif (s/[-_]?([\d_]+)$//) {
729     return($_, $1);
730     } elsif (s/-(\d+.\d+)-/-/) { # perl-4.019-ref-guide
731     return($_, $1);
732     } else {
733     if ($_ =~ /\d/) { # smells like an unknown scheme
734     warn "Odd version Numbering: '$File::Find::name'\n";
735     return($_, undef);
736     } else { # assume version 0
737     warn "No version Numbering: '$File::Find::name'\n";
738     return($_, 0);
739     }
740    
741     }
742     }
743    
744     sub wanted {
745     my $mtime = shift; # called by parse_file_ls();
746    
747     return if (! $max);
748     $max--;
749    
750 dpavlin 23 return unless /^(.*)\.(tar\.(gz|Z)|tgz)$/;
751 dpavlin 20
752     my ($archive, $version) = version($1);
753    
754     unless (defined $version) {
755     warn "Skipping $1\n";
756     return;
757     }
758    
759     # Check for file alias
760     $archive = $ALIAS{$archive} if $ALIAS{$archive};
761    
762     # Check for path alias.
763     if ($File::Find::name =~ m(/CPAN/(?:source/)?(.*\Q$archive\E))) {
764     if ($ALIAS{$1}) {
765     $archive = $ALIAS{$1};
766     }
767     }
768    
769     if ($OPT{trust_mtime}) {
770     $version = $mtime;
771     } else {
772     $version =~ s/(\d)_/$1/;
773     $version ||= $mtime; # mtime
774     }
775    
776     if (!exists $ARCHIVE{$archive}
777     or $VERSION{$archive} < $version) {
778     $ARCHIVE{$archive} = $File::Find::name;
779     $VERSION{$archive} = $version;
780     }
781     }
782    
783     sub create_table {
784     my %parm = @_;
785    
786     my $access = bless {}, 'WAIT::Document::Find';
787    
788     my $stem = [{
789     'prefix' => ['isotr', 'isolc'],
790     'intervall' => ['isotr', 'isolc'],
791     }, 'isotr', 'isolc', 'split2', 'stop', 'Stem'];
792     my $text = [{
793     'prefix' => ['isotr', 'isolc'],
794     'intervall' => ['isotr', 'isolc'],
795     },
796     'isotr', 'isolc', 'split2', 'stop'];
797     my $sound = ['isotr', 'isolc', 'split2', 'Soundex'],;
798    
799     my $tb =
800     $parm{db}->create_table
801     (name => $parm{table},
802     attr => ['docid', 'headline', 'source', 'size', 'parent'],
803     keyset => [['docid']],
804     layout => $parm{layout},
805     access => $access,
806     invindex =>
807     [
808     'name' => $stem,
809     'synopsis' => $stem,
810     'bugs' => $stem,
811     'description' => $stem,
812     'text' => $stem,
813     'environment' => $text,
814     'example' => $text, 'example' => $stem,
815     'author' => $sound, 'author' => $stem,
816     ]
817     );
818     die "Could not create table '$parm{table}'" unless $tb;
819     $tb;
820     }
821    
822     my %MON;
823     my $YEAR;
824    
825     BEGIN {
826     my $i = 1;
827     for (qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec)) {
828     $MON{$_} = $i++;
829     }
830     $YEAR = (localtime(time))[5];
831     }
832    
833     # We could/should use Date::GetDate here
834     use Time::Local;
835     sub mtime {
836     my ($mon, $mday, $time) = @_;
837     my ($hour, $min, $year, $monn) = (0,0);
838    
839     if ($time =~ /(\d+):(\d+)/) {
840     ($hour, $min) = ($1, $2);
841     $year = $YEAR;
842     } else {
843     $year = $time;
844     }
845     $monn = $MON{$mon} || $MON{ucfirst lc $mon} || warn "Unknown month: '$mon'";
846     my $guess = timelocal(0,$min,$hour,$mday,$monn-1,$year);
847     if ($guess > time) {
848     $guess = timelocal(0,$min,$hour,$mday,$monn-1,$year-1);
849     }
850     $guess;
851     }
852    
853     package HyperEstraier::WAIT::Table;
854    
855     use HyperEstraier;
856     use Text::Iconv;
857    
858     =head1 NAME
859    
860     HyperEstraier::WAIT::Table
861    
862     =head1 DESCRIPTION
863    
864     This is a mode that emulates C<WAIT::Table> functionality somewhat.
865    
866     There are some limitations and only one key attribute is supported (and used
867     for C<@uri>).
868    
869     =head2 Porting from WAIT to this module.
870    
871     Since only one key is supported (and used as C<@uri> attribute),
872     use first parametar of C<keyset> as C<key>.
873    
874     Full text index is specified as C<invindex>, but you need just name of fields.
875    
876     You will probably need to add
877    
878     use WAIT::Parse::Base;
879    
880     to your code after you remove C<WAIT::Config> and C<WAIT::Database>.
881    
882     =head1 METHODS
883    
884     =head2 new
885    
886     my $tb = new HyperEstraier::WAIT::Table(
887     uri => 'http://localhost:1978/node/cpan',
888     attr => qw/docid headline source size parent/,
889     key => 'docid',
890     invindex => qw/name synopsis bugs description text environment example author/,
891     );
892    
893     =cut
894    
895     sub new {
896     my $class = shift;
897     my $self = {@_};
898     bless($self, $class);
899    
900     foreach my $p (qw/uri attr key invindex/) {
901     die "need $p" unless ($self->{$p});
902     }
903    
904     $self->{'iso2utf'} = Text::Iconv->new('ISO-8859-1','UTF-8');
905    
906     my $node = HyperEstraier::Node->new($self->{'uri'});
907     $node->set_auth('admin', 'admin');
908    
909     $self->{'node'} = $node;
910    
911     $self ? return $self : return undef;
912     }
913    
914     =head2 have
915    
916     if ( $tb->have(docid => $something) ) ...
917    
918     =cut
919    
920     sub have {
921     my $self = shift;
922     my $args = {@_};
923     my $key = $self->{'key'} || die "no key in object";
924     my $key_v = $args->{$key} || die "no key $key in data";
925    
926     my $id = $self->{'node'}->uri_to_id('file://' . $key_v);
927    
928     return ($id == -1 ? undef : $id);
929     }
930    
931     =head2 insert
932    
933     my $key = $tb->insert(
934     docid => $base,
935     headline => 'Something',
936     ...
937     );
938    
939     =cut
940    
941     sub insert {
942     my $self = shift;
943     my $args = {@_};
944    
945     my $uri = 'file://';
946     $uri .= $args->{'docid'} or die "no docid";
947    
948     my $doc = HyperEstraier::Document->new;
949    
950     $doc->add_attr('@uri', $uri);
951     $doc->add_attr('@title', $args->{'headline'}) if ($args->{'headline'});
952     $doc->add_attr('@size', $args->{'size'}) if ($args->{'size'});
953    
954     my @attr = $self->{'attr'} || die "no attr in object";
955     my @invindex = $self->{'invindex'} || die "no invindex in object";
956    
957     foreach my $attr (keys %{$args}) {
958 dpavlin 23 if (grep(/^$attr$/, @{ $self->{'attr'} }) or $attr =~ m/^@/o) {
959 dpavlin 20 $doc->add_attr($attr, $args->{$attr});
960     }
961     if (grep(/^$attr$/, @{ $self->{'invindex'} })) {
962     $doc->add_text($args->{$attr});
963     }
964     }
965    
966     print STDERR $doc->dump_draft if ($self->{'debug'});
967    
968     my $id;
969 dpavlin 24 unless ($self->{'node'}->put_doc($doc)) {
970 dpavlin 20 printf STDERR "ERROR: %d\n", $self->{'node'}->status;
971 dpavlin 24 } else {
972     $id = $self->{'node'}->uri_to_id( $uri );
973     if ($id != -1) {
974     print STDERR "id: $id\n" if ($self->{'debug'})
975     } else {
976     print STDERR "ERROR: can't find id for newly insrted document $uri\n";
977     }
978 dpavlin 20 }
979    
980     return $id;
981     }
982    
983     =head2 delete_by_key
984    
985     $tb->delete_by_key($key);
986    
987     =cut
988    
989     sub delete_by_key {
990     my $self = shift;
991     my $key_v = shift || die "no key?";
992    
993 dpavlin 22 unless ($self->{'node'}->out_doc_by_uri( 'file://' . $key_v )) {
994     print STDERR "WARNING: can't delete document $key_v\n";
995 dpavlin 21 }
996 dpavlin 20 }
997    
998     =head2 delete
999    
1000     $tb->delete( docid => $did, ... );
1001    
1002     =cut
1003    
1004     sub delete {
1005     my $self = shift;
1006     my $args = {@_};
1007    
1008     my $key = $self->{'key'} || die "no key in object";
1009    
1010     die "no $key in data" unless (my $key_v = $args->{$key});
1011    
1012     $self->delete_by_key($key_v);
1013    
1014     }

Properties

Name Value
svn:executable *

  ViewVC Help
Powered by ViewVC 1.1.26