#!/usr/bin/perl -w
=head1 NAME
cpanest - generate an Hyper Estraier index for CPAN
=head1 SYNOPSIS
B<cpanest>
[B<-clean>] [B<-noclean>]
[B<-cpan> I<url or directory>]
[B<-node> I<node_uri>]
[B<-force>] [B<-noforce>]
[B<-keep> I<directory>]
[B<-match> I<regexp>]
[B<-test> I<level>]
[B<-trust_mtime>] [B<-notrust_mtime>]
=head1 DESCRIPTION
This is a port of C<cpanwait> from L<WAIT> perl search engine to node API of
Hyper Estraier.
All the hard work was done by Ulrich Pfeifer who wrote all parsers and
formatters. I just added support for Hyper Estraier back-end after.
B<This documentation is somewhat incomplete and off-the-sync with code.>
=head1 OPTIONS
=over 5
=item B<-clean> / B<-noclean>
Clean the table befor indexing. Default is B<off>.
=item B<-cpan> I<url or directory>
Default directory or URL for indexing. If an URL is given, there
currently must be a file F<indices/find-ls.gz> relative to it which
contains the output of C<find . -ls | gzip>.
Default is F<ftp://ftp.rz.ruhr-uni-bochum.de/pub/CPAN>.
=item B<-node> I<http://localhost:1978/node/cpan>
Specify node URI
=item B<-force>
Force reindexing, even if B<cpan> thinks files are up to date.
Default is B<off>
=item B<-keep> I<directory>
If fetching from a remote server, keep files in I<directory>. Default is
F</app/unido-i06/src/share/lang/perl/96a/CPAN/sources>.
=item B<-match> I<regexp>
Limit to patches matching I<regexp>. Default is F<authors/id/>.
=item B<-test> I<level>
Set test level, were B<0> means normal operation, B<1> means, don't
really index and B<2> means, don't even get archives and examine them.
=item B<-trust_mtime> / B<-notrust_mtime>
If B<on>, the files mtimes are used to decide, which version of an
archive is the newest. If b<off>, the version extracted is used
(beware, there are far more version numbering schemes than B<cpan> can
parse).
=back
=head1 AUTHORS
Ulrich Pfeifer E<lt>F<pfeifer@ls6.informatik.uni-dortumund.de>E<gt>
Dobrica Pavlinusic E<lt>F<dpavlin@rot13.org>E<gt>
=head1 COPYRIGHT
Copyright (c) 1996-1997, Ulrich Pfeifer
Copyright (c) 2005, Dobrica Pavlinusic
=cut
use strict;
use File::Path;
use Getopt::Long;
use File::Find;
use File::Basename;
use IO::File;
use IO::Zlib;
use POSIX qw/strftime/;
use lib '/data/wait/lib';
use WAIT::Parse::Base;
use WAIT::Parse::Pod;
use WAIT::Document::Tar;
use WAIT::Document::Find;
sub fname($);
# maximum number of archives to index (set to -1 for unlimited)
my $max = -1;
my %OPT = (
node => 'http://localhost:1978/node/cpan',
clean => 0,
remove => [],
force => 0,
# cpan => '/usr/src/perl/CPAN/sources',
cpan => '/rest/cpan/CPAN/',
trust_mtime => 1,
match => 'authors/id/',
test => 0,
# cpan => 'ftp://ftp.rz.ruhr-uni-bochum.de/pub/CPAN',
# cpan => 'ftp://ftp.uni-hamburg.de:/pub/soft/lang/perl/CPAN',
keep => '/tmp/CPAN/',
);
GetOptions(\%OPT,
'node=s',
'cpan=s',
'keep=s',
'match=s',
'clean!',
'test=i', # test level 0: normal
# 1: don't change db
# 2: don't look at archives even
'remove=s@',
'force!', # force indexing even if seen
'trust_mtime!', # use mtime instead of version number
'max=i',
'debug!',
) || die "Usage: ...\n";
if ($OPT{max}) {
$max = $OPT{max};
print STDERR "processing just first $max modules\n";
}
# FIXME
#clean_node(
# node => $OPT{node},
# ) if $OPT{clean};
my $tb = new HyperEstraier::WAIT::Table(
uri => $OPT{node},
attr => ['docid', 'headline', 'source', 'size', 'parent', 'version'],
key => 'docid',
invindex => [ qw/name synopsis bugs description text environment example author/ ],
debug => $OPT{debug},
) or die "Could not open node '$OPT{node}'";
my $layout= new WAIT::Parse::Pod;
# Map e.g. '.../latest' to 'perl'. Used in wanted(). Effects version
# considerations. Value *must* match common prefix. Aliasing should be
# used if CPAN contains serveral distributions with different name but
# same root directory.
# We still have a problem if there are different root directories!
my %ALIAS = (# tar name real (root) name
'Games-Scrabble' => 'Games',
'HTML-ParseBrowser' => 'HTML',
'iodbc_ext' => 'iodbc-ext-0.1',
'sol-inst' => 'Solaris',
'WebService-Validator-CSS-223C' => 'WebService-Validator-CSS-W3C-0.02',
'MPEG-ID3212Tag' => 'MPEG-ID3v2Tag-0.36',
'WebService-GoogleHack' => 'WebService',
'Db-Mediasurface-ReadConfig' => 'ReadConfig',
'Tie-Array-RestrictUpdates' => 'Tie',
'HTML-Lister' => 'HTML',
'Net-253950-AsyncZ' => 'Net-Z3950-AsyncZ-0.08',
'ChildExit_0' => 'ChildExit-0.1',
'Tie-TieConstant' => 'TieConstant.pm',
'Crypt-OpenSSL-23509' => 'Crypt-OpenSSL-X509-0.2',
'subclustv' => 'blib',
'finance-yahooquote' => 'Finance-YahooQuote-0.20',
'HPUX-FS' => 'FS',
'Business-DE-Konto' => 'Business',
'Digest-MD5-124p' => 'Digest-MD5-M4p-0.01',
'AKDB_Okewo_de' => 'AKDB',
'ExtUtils-0577' => 'ExtUtils-F77-1.14',
'LispFmt' => 'Lisp::Fmt-0.00',
'Acme-Stegano' => 'Acme',
'Acme-RTB' => 'Acme',
'WWW-Search-PRWire' => 'work',
'Video-Capture-214l' => 'Video-Capture-V4l-0.224',
'Tie-DirHandle' => 'Tie',
'DB2' => 'DBD-DB2-0.71a',
'Tie-Scalar-RestrictUpdates' => 'Tie',
'Math-MVPoly' => 'MVPoly',
'PlugIn' => 'PlugIn.pm',
'Lingua-ID-Nums2Words' => 'Nums2Words-0.01',
'chronos-1.' => 'Chronos',
'jp_beta' => 'jperl_beta_r1',
'Bundle-223C-Validator' => 'Bundle-W3C-Validator-0.6.5',
'Text-199' => 'Text-T9-1.0',
'Games-Literati' => 'Games',
'VMS-IndexedFile' => 'VMS',
'authen-rbac' => 'Authen',
'Graphics-EPS' => 'EPS.pm',
'new.spirit-2.' => 'new.spirit',
'Tk-MListbox' => 'MListbox-1.11',
'DBD-SQLrelay' => 'SQLRelay.pm',
'Tie-RDBM-Cached' => 'RDBM',
'PDL_IO_HDF' => 'HDF',
'HPUX-LVM' => 'LVM',
'Parse-Nibbler' => 'Parse',
'Digest-Perl-MD4' => 'MD4',
'Crypt-Imail' => 'Imail',
'ubertext' => 'Text-UberText-0.95',
'MP3-123U' => 'M3U',
'Qmail-Control' => 'Qmail',
'T-LXS' => 'Text-LevenshteinXS-0.02',
'HTML-Paginator' => 'HTML',
'swig' => 'SWIG1.1p5',
'MIDI-Realtime' => 'MIDI',
'sparky-public' => 'Sparky-Public-1.06',
'Chemistry-MolecularMass' => 'Chemistry',
'Net-253950-SimpleServer' => 'Net-Z3950-SimpleServer-0.08',
'NewsClipper-OpenSource' => 'NewsClipper-1.32-OpenSource',
'Win32API-Resources' => 'Resources.pm',
'Unicode-Collate-Standard-2131_1' => 'Unicode-Collate-Standard-V3_1_1-0.1',
'Net-026Term' => 'Net-C6Term-0.11',
'BitArray1' => 'BitArray',
'Audio-Radio-214L' => 'Audio-Radio-V4L-0.01',
'Devel-AutoProfiler' => 'Devel',
'Brasil-Checar-CGC' => 'Brasil',
'AI-NeuralNet-SOM' => 'SOM.pm',
'Net-BitTorrent-File-fix' => 'Net-BitTorrent-File-1.01',
'VMS-FindFile' => 'VMS',
'LoadHtml.' => 'README',
'Time-Compare' => 'Time',
'ShiftJIS-230213-MapUTF' => 'ShiftJIS-X0213-MapUTF-0.21',
'Image-WMF' => 'Image',
'sdf-2.0.eta' => 'sdf-2.001beta1',
'Math-Expr-LATEST' => 'Math-Expr-0.4',
'MP3-Player-PktConcert' => 'MP3',
'Apache-OWA' => 'OWA',
'Audio-Gramofile' => 'Audio',
'DBIx-Copy' => 'Copy',
'P4-024' => 'P4-C4-2.021',
'Disassemble-2386' => 'Disassemble-X86-0.13',
'Proc-Swarm' => 'Swarm-0.5',
'Smil' => 'perlysmil',
'Net-SSH-2232Perl' => 'Net-SSH-W32Perl-0.05',
'Win32-SerialPort' => 'SerialPort-0.19',
'Lingua-ID-Words2Nums' => 'Words2Nums-0.01',
'Parse-Text' => 'Text',
'DBIx-HTMLView-LATEST' => 'DBIx-HTMLView-0.9',
'Apache-NNTPGateway' => 'NNTPGateway-0.9',
'XPathToXML' => 'XPathToXML.pm',
'XML-WMM-ASX' => 'XML',
'CGISession' => 'CGI',
'Net-SMS-142' => 'Net-SMS-O2-0.019',
'Search-253950' => 'Search-Z3950-0.05',
'Date-Christmas' => 'Christmas',
'Win32-InternetExplorer-Window' => 'Win32',
'Apache-WAP-MailPeek' => 'MailPeek',
'Statistics-Table-F' => 'Statistics',
'BerkeleyDB_Locks' => 'BerkeleyDB-Locks-0_2',
'HookPrePostCall' => 'PrePostCall-1.2',
'Oak-AAS-Service-DBI_13_PAM' => 'Oak-AAS-Service-DBI_N_PAM-1.8',
'Math-Vector' => 'Vector.pm',
'Audio-124pDecrypt' => 'Audio-M4pDecrypt-0.04',
'libao-perl_0.03' => 'libao-perl-0.03',
'CGI-EZForm' => 'EZForm',
'Data-Locations-fixed' => 'Data-Locations-5.2-fixed',
'HTML-Template-Filter-Dreamweaver' => 'Dreamweaver',
'LineByLine' => 'LineByLine.pm',
'Geo-0400' => 'Geo-E00-0.05',
'WebService-Validator-HTML-223C' => 'WebService-Validator-HTML-W3C-0.03',
'DateTime-Format-223CDTF' => 'DateTime-Format-W3CDTF-0.04',
'DBD_SQLFLEX' => 'DBD-Sqlflex',
'Text-Number' => 'Number',
'DBIx-DataLookup' => 'DBIx',
'MP3-ID3211Tag' => 'MP3-ID3v1Tag-1.11',
'Text-Striphigh' => 'Striphigh-0.02',
'Tie-SortHash' => 'SortHash',
'Apache-AccessAbuse' => 'AccessAbuse',
'MP3-123U-Parser' => 'MP3-M3U-Parser',
'Net-253950' => 'Net-Z3950-0.44',
'Net-RBLClient' => 'RBLCLient-0.2',
'CGI-EasyCGI' => 'CGI',
'http-handle' => 'HTTP::Handle',
'JPEG-Comment' => 'JPEG',
'router-lg' => 'Router',
'Db-Mediasurface' => 'Mediasurface',
'Text-BarGraph' => 'bargraph',
'TL' => 'Text-Levenshtein-0.04',
'Config-Vars' => 'Config-0.01',
'Tie-PerfectHash' => 'Tie',
'DNS-TinyDNS' => 'DNS',
'DesignPattern-Factory' => 'Factory',
'WWW-01_Rail' => 'WWW-B_Rail-0.01',
'Win32-Exchange' => 'blib',
'Math-RPN' => 'Math',
'Db-Mediasurface-Cache' => 'Cache',
'perl_archie.' => 'Archie.pm',
'Acme-PGPSign' => 'Acme',
'HTML-Widget-sideBar' => 'HTML-Widget-SideBar-1.00',
'log' => 'Games',
'File-List' => 'File',
'Schedule-Cronchik' => 'Schedule',
'Curses-Devkit' => 'Cdk',
'Pod-PalmDoc' => 'Pod',
'Easy-WML' => 'Easy WML 0.1',
'Interval.' => 'Date',
'Brasil-Checar-CPF' => 'Brasil',
'Apache-WAP-AutoIndex' => 'AutoIndex',
'SOM.pm' => 'SOM.pm',
'PlugIn.pm' => 'PlugIn.pm',
'XPathToXML.pm' => 'XPathToXML.pm',
'Vector.pm' => 'Vector.pm',
'LineByLine.pm' => 'LineByLine.pm',
'Archie.pm' => 'Archie.pm',
'TieConstant.pm' => 'TieConstant.pm',
'EPS.pm' => 'EPS.pm',
'SQLRelay.pm' => 'SQLRelay.pm',
'Resources.pm' => 'Resources.pm',
'README' => 'README',
);
my %NEW_ALIAS; # found in this pass
# Map module names to pathes. Generated by wanted() doing alisaing.
my %ARCHIVE;
# Map module names to latest version. Generated by wanted()
my %VERSION;
# Mapping for modules with common root not matching modules name that
# are not aliased. This is just for prefix stripping and not strictly
# necessary. Takes effect after version considerations.
my %TR = (# tar name root to strip
'Net_SSLeay.pm' => 'SSLeay/',
'EventDrivenServer' => 'Server/',
'bio_lib.pl.' => '',
'AlarmCall' => 'Sys/',
'Cdk-ext' => 'Cdk/',
'Sx' => '\d.\d/',
'DumpStack' => 'Devel/',
'StatisticsDescriptive'=> 'Statistics/',
'Term-Gnuplot' => 'Gnuplot/',
'iodbc_ext' => 'iodbc-ext-\d.\d/',
'UNIVERSAL' => '',
'Term-Query' => 'Query/',
'SelfStubber' => 'Devel/',
'CallerItem' => 'Devel/',
);
my $LWP;
# FIXME
my $DIR = '/rest/estseek/cpan/';
my $DATA = $DIR . '/data';
if (@{$OPT{remove}}) {
my $pod;
for $pod (@{$OPT{remove}}) {
unless (-e $pod) {
$pod = "$DIR/$pod";
}
index_pod(file => $pod, remove => 1) if -f $pod;
unlink $pod or warn "Could not unlink '$pod': $!\n";
}
exit;
}
# Now get the beef
if ($OPT{cpan} =~ /^(http|ftp):/) {
$LWP = 1;
require LWP::Simple;
LWP::Simple->import();
mkpath($DATA,1,0755) or
die "Could not generate '$DATA/': $!"
unless -d $DATA;
if (! -f "$DATA/find-ls.gz" or -M "$DATA/find-ls.gz" > 0.5) {
my $status = mirror("$OPT{cpan}/indices/find-ls.gz", "$DATA/find-ls.gz");
if ($status != &RC_OK and $status != &RC_NOT_MODIFIED) {
# we could use Net:FTP here ...
die "Was unable to mirror '$OPT{cpan}/indices/find-ls.gz'\n";
}
}
my $fh = new IO::File "gzip -cd $DATA/find-ls.gz |";
die "Could not open 'gzip -cd $DATA/find-ls.gz': !$\n" unless $fh;
my $line;
while (defined ($line = <$fh>)) {
chomp($line);
my ($mon, $mday, $time, $file, $is_link) = (split ' ', $line)[7..11];
next if defined $is_link;
my $mtime = mtime($mon, $mday, $time);
$file =~ s:^\./::;
($_) = fileparse($file);
$File::Find::name = $file;
wanted($mtime);
}
} else {
find(sub {&wanted((stat($_))[9])}, $OPT{cpan});
}
ARCHIVE:
for my $tar (sort keys %ARCHIVE) {
next if $OPT{match} and $ARCHIVE{$tar} !~ /$OPT{match}/o;
my $base = (split /\//, $ARCHIVE{$tar})[-1];
my $parent;
my %attr;
# logging
if ($OPT{trust_mtime}) {
$attr{'@mdate'} = strftime('%Y-%m-%dT%H:%M:%S+00:00', gmtime($VERSION{$tar}));
$parent->{'@mdate'} = $attr{'@mdate'};
printf "%-20s %10s %s\t", $tar, $attr{'@mdate'}, $base;
} else {
$attr{'version'} = $VERSION{$tar};
printf "%-20s %10.5f %s\t", $tar, $attr{'version'}, $base;
}
# Remember the archive
# We should have an extra table for the tar file data ...
if (!$OPT{force} and $tb->have(docid => $base)) {
print "skipping\n";
next ARCHIVE;
} else {
$parent->{_id} = $tb->insert(docid => $base,
headline => $ARCHIVE{$tar},
%attr
) unless $OPT{test};
print "indexing\n";
}
next ARCHIVE if $OPT{test} > 1;
my $TAR = myget($tar);
next ARCHIVE unless $TAR; # not able to fetch it
my %tar;
tie (%tar,
'WAIT::Document::Tar',
sub { $_[0] =~ /\.(pm|pod|PL)$/i or $_[0] =~ /readme/i},
#sub { $_[0] !~ m:/$: },
$TAR)
or warn "Could not tie '$TAR'\n";
my $sloppy;
my ($key, $val);
FILE:
while (($key, $val) = each %tar) {
my $file = fname($key);
# don't index directories
next if $file =~ /\/$/;
# is it a POD file?
next FILE unless $file =~ /readme/i or $val =~ /\n=head/;
# remove directory prefix
unless ($sloppy # no common root
or $file =~ s:^\Q$tar\E[^/]*/:: # common root, maybe alias
or ($TR{$tar} # common root, not aliased
and $file =~ s:^\Q$TR{$tar}\E::)
) {
# try to determine an alias
warn "Bad directory prefix: '$file'\n";
my ($prefix) = split /\//, $file;
while ($key = (tied %tar)->NEXTKEY) {
my $file = fname($key);
next if $file =~ /\/$/;
unless ($file =~ m:^$prefix/: or $file eq $prefix) {
warn "Archive contains different prefixes: $prefix,$file\n";
$prefix = '';
last;
}
}
if ($prefix) {
print "Please alias '$tar' to '$prefix' next time!\n";
print "See alias table later.\n";
$NEW_ALIAS{$tar} = $prefix;
$tb->delete_by_key($parent->{_id});
next ARCHIVE;
} else {
print "Assuming that tar file name $tar is a valid prefix\n";
$sloppy = 1;
# We may reset too much here! But that this is not exact
# science anyway. Maybe we should ignore using 'next ARCHIVE'.
$key = (tied %tar)->FIRSTKEY;
redo FILE;
}
}
# remove /lib prefix
$file =~ s:^lib/::;
# generate new path
my $path = "$DATA/$tar/$file";
my ($sbase, $sdir) = fileparse($path);
my $fh;
unless ($OPT{test}) {
if (-f $path) {
index_pod(file => $path, remove => 1);
unlink $path or warn "Could not unlink '$path' $!\n";
} elsif (!-d $sdir) {
mkpath($sdir,1,0755) or die "Could not mkpath($sdir): $!\n";
}
# $fh = new IO::File "> $path";
$fh = new IO::Zlib "$path.gz","wb";
die "Could not write '$path': $!\n" unless $fh;
}
if ($file =~ /readme|install/i) { # make READMEs verbatim pods
$val =~ s/\n/\n /g;
$val = "=head1 NAME\n\n$tar $file\n\n=head1 DESCRIPTION\n\n $val"
unless $val =~ /^=head/m;
} else { # remove non-pod stuff
my $nval = $val; $val = '';
my $cutting = 1;
for (split /\n/, $nval) {
if (/^=cut|!NO!SUBS!/) {
$cutting = 1;
} elsif ($cutting and /^=head/) {
$cutting = 0;
}
unless ($cutting) {
$val .= $_ . "\n";
}
}
}
unless ($OPT{test}) {
$fh->print($val);
index_pod(file => $path, parent => $parent,
text => $val, source => $ARCHIVE{$tar},
);
}
}
if ($LWP and !$OPT{keep}) {
unlink $TAR or warn
"Could not unlink '$TAR': $!\n";
}
}
if (%NEW_ALIAS) {
print "\%ALIAS = (\n";
for (keys %NEW_ALIAS) {
print "\t'$_'\t=> '$NEW_ALIAS{$_}',\n";
}
print "\t);\n";
}
exit;
sub fname ($) {
my $key = shift;
my ($ntar, $file) = split $;, $key;
# remove leading './' - shudder
$file =~ s/^\.\///;
return($file);
}
sub myget {
my $tar = shift;
my $TAR;
if ($LWP) { # fetch the archive
if ($OPT{keep}) {
$TAR = "$OPT{keep}/$ARCHIVE{$tar}";
print "Keeping in '$TAR'\n" unless -e $TAR;
my ($base, $path) = fileparse($TAR);
unless (-d $path) {
mkpath($path,1,0755) or
die "Could not mkpath($path)\n";
}
} else {
$TAR = "/tmp/$tar.tar.gz";
}
unless (-e $TAR) { # lwp mirror seems to fetch ftp: in any case?
print "Fetching $OPT{cpan}/$ARCHIVE{$tar}\n";
my $status = mirror("$OPT{cpan}/$ARCHIVE{$tar}", $TAR);
if ($status != &RC_OK and $status != &RC_NOT_MODIFIED) {
warn "Was unable to mirror '$ARCHIVE{$tar}, skipping'\n";
return;
}
}
} else {
$TAR = $ARCHIVE{$tar};
}
$TAR;
}
sub index_pod {
my %parm = @_;
my $did = $parm{file};
my $rel_did = $did;
my $abs_did = $did;
if ($rel_did =~ s:$DIR/::) {
$abs_did = "$DIR/$rel_did";
}
undef $did;
# check for both variants
if ($tb->have('docid' => $rel_did)) {
$did = $rel_did;
} elsif ($tb->have('docid' => $abs_did)) {
$did = $abs_did;
}
if ($did) { # have it version
if (!$parm{remove} and !$OPT{force}) {
warn "duplicate: $did\n";
return;
}
} else { # not seen yet
$did = $rel_did;
if ($parm{remove}) {
print "missing: $did\n";
return;
}
}
$parm{'text'} ||= WAIT::Document::Find->FETCH($abs_did);
unless (defined $parm{'text'}) {
print "unavailable: $did\n";
return;
}
my $record = $layout->split($parm{'text'});
if (! $record) {
print "empty pod: $did\n";
return;
}
$record->{size} = length($parm{'text'});
my $headline = $record->{name} || $did;
# additional fields for Hyper Estraier
$record->{'@mdate'} = $parm{'mdate'} if ($parm{'mdate'});
$headline =~ s/^$DATA//o; # $did
$headline =~ s/\s+/ /g; $headline =~ s/^\s+//;
printf "%s %s\n", ($parm{remove})?'-':'+', substr($headline,0,70);
if ($parm{remove}) {
$tb->delete('docid' => $did,
headline => $headline,
%{$record});
} else {
foreach (keys %{$parm{parent}}) {
next if (/^_/);
$record->{$_} = $parm{parent}->{$_} if ($parm{parent}->{$_});
}
$tb->insert('docid' => $did,
headline => $headline,
source => $parm{source},
parent => $parm{parent}->{_id},
%{$record});
}
}
# This *must* remove the version in *any* case. It should compute a
# resonable version number - but usually mtimes should be used.
sub version {
local ($_) = @_;
# remove alpha/beta postfix
s/([-_\d])(a|b|alpha|beta|src)$/$1/;
# jperl1.3@4.019.tar.gz
s/@\d.\d+//;
# oraperl-v2.4-gk.tar.gz
s/-v(\d)/$1/;
# lettered versions - shudder
s/([-_\d\.])([a-z])([\d\._])/sprintf "$1%02d$3", ord(lc $2) - ord('a') /ei;
s/([-_\d\.])([a-z])$/sprintf "$1%02d", ord(lc $2) - ord('a') /ei;
# thanks libwww-5b12 ;-)
s/(\d+)b/($1-1).'.'/e;
s/(\d+)a/($1-2).'.'/e;
# replace '-pre' by '0.'
s/-pre([\.\d])/-0.$1/;
s/\.\././g;
s/(\d)_(\d)/$1$2/g;
# chop '[-.]' and thelike
s/\W$//;
# ram's versions Storable-0.4@p
s/\@/./;
if (s/[-_]?(\d+)\.(0\d+)\.(\d+)$//) {
return($_, $1 + "0.$2" + $3 / 1000000);
} elsif (s/[-_]?(\d+)\.(\d+)\.(\d+)$//) {
return($_, $1 + $2/1000 + $3 / 1000000);
} elsif (s/[-_]?(\d+\.[\d_]+)$//) {
return($_, $1);
} elsif (s/[-_]?([\d_]+)$//) {
return($_, $1);
} elsif (s/-(\d+.\d+)-/-/) { # perl-4.019-ref-guide
return($_, $1);
} else {
if ($_ =~ /\d/) { # smells like an unknown scheme
warn "Odd version Numbering: '$File::Find::name'\n";
return($_, undef);
} else { # assume version 0
warn "No version Numbering: '$File::Find::name'\n";
return($_, 0);
}
}
}
sub wanted {
my $mtime = shift; # called by parse_file_ls();
return if (! $max);
$max--;
return unless /^(.*)\.(tar\.(gz|Z)|tgz)$/;
my ($archive, $version) = version($1);
unless (defined $version) {
warn "Skipping $1\n";
return;
}
# Check for file alias
$archive = $ALIAS{$archive} if $ALIAS{$archive};
# Check for path alias.
if ($File::Find::name =~ m(/CPAN/(?:source/)?(.*\Q$archive\E))) {
if ($ALIAS{$1}) {
$archive = $ALIAS{$1};
}
}
if ($OPT{trust_mtime}) {
$version = $mtime;
} else {
$version =~ s/(\d)_/$1/;
$version ||= $mtime; # mtime
}
if (!exists $ARCHIVE{$archive}
or $VERSION{$archive} < $version) {
$ARCHIVE{$archive} = $File::Find::name;
$VERSION{$archive} = $version;
}
}
my %MON;
my $YEAR;
BEGIN {
my $i = 1;
for (qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec)) {
$MON{$_} = $i++;
}
$YEAR = (localtime(time))[5];
}
# We could/should use Date::GetDate here
use Time::Local;
sub mtime {
my ($mon, $mday, $time) = @_;
my ($hour, $min, $year, $monn) = (0,0);
if ($time =~ /(\d+):(\d+)/) {
($hour, $min) = ($1, $2);
$year = $YEAR;
} else {
$year = $time;
}
$monn = $MON{$mon} || $MON{ucfirst lc $mon} || warn "Unknown month: '$mon'";
my $guess = timelocal(0,$min,$hour,$mday,$monn-1,$year);
if ($guess > time) {
$guess = timelocal(0,$min,$hour,$mday,$monn-1,$year-1);
}
$guess;
}
package HyperEstraier::WAIT::Table;
use Search::Estraier;
use Text::Iconv;
=head1 NAME
HyperEstraier::WAIT::Table
=head1 DESCRIPTION
This is a mode that emulates C<WAIT::Table> functionality somewhat.
There are some limitations and only one key attribute is supported (and used
for C<@uri>).
=head2 Porting from WAIT to this module.
Since only one key is supported (and used as C<@uri> attribute),
use first parametar of C<keyset> as C<key>.
Full text index is specified as C<invindex>, but you need just name of fields.
You will probably need to add
use WAIT::Parse::Base;
to your code after you remove C<WAIT::Config> and C<WAIT::Database>.
=head1 METHODS
=head2 new
my $tb = new HyperEstraier::WAIT::Table(
uri => 'http://localhost:1978/node/cpan',
attr => qw/docid headline source size parent/,
key => 'docid',
invindex => qw/name synopsis bugs description text environment example author/,
);
=cut
sub new {
my $class = shift;
my $self = {@_};
bless($self, $class);
foreach my $p (qw/uri attr key invindex/) {
die "need $p" unless ($self->{$p});
}
$self->{'iso2utf'} = Text::Iconv->new('ISO-8859-1','UTF-8');
my $node = Search::Estraier::Node->new(
url => $self->{'uri'},
user => 'admin',
passwd => 'admin',
create => 1,
);
$self->{'node'} = $node;
$self ? return $self : return undef;
}
=head2 have
if ( $tb->have(docid => $something) ) ...
=cut
sub have {
my $self = shift;
my $args = {@_};
my $key = $self->{'key'} || die "no key in object";
my $key_v = $args->{$key} || die "no key $key in data";
my $id = $self->{'node'}->uri_to_id('file://' . $key_v);
return unless($id);
return ($id == -1 ? undef : $id);
}
=head2 insert
my $key = $tb->insert(
docid => $base,
headline => 'Something',
...
);
=cut
sub insert {
my $self = shift;
my $args = {@_};
my $uri = 'file://';
$uri .= $args->{'docid'} or die "no docid";
my $doc = Search::Estraier::Document->new;
$doc->add_attr('@uri', $uri);
$doc->add_attr('@title', $args->{'headline'}) if ($args->{'headline'});
$doc->add_attr('@size', $args->{'size'}) if ($args->{'size'});
my @attr = $self->{'attr'} || die "no attr in object";
my @invindex = $self->{'invindex'} || die "no invindex in object";
foreach my $attr (keys %{$args}) {
if (grep(/^$attr$/, @{ $self->{'attr'} }) or $attr =~ m/^@/o) {
$doc->add_attr($attr, $args->{$attr});
}
if (grep(/^$attr$/, @{ $self->{'invindex'} })) {
$doc->add_text($args->{$attr});
}
}
print STDERR $doc->dump_draft if ($self->{'debug'});
my $id;
unless ($self->{'node'}->put_doc($doc)) {
printf STDERR "ERROR: %d\n", $self->{'node'}->status;
} else {
$id = $self->{'node'}->uri_to_id( $uri );
if ($id != -1) {
print STDERR "id: $id\n" if ($self->{'debug'})
} else {
print STDERR "ERROR: can't find id for newly insrted document $uri\n";
}
}
return $id;
}
=head2 delete_by_key
$tb->delete_by_key($key);
=cut
sub delete_by_key {
my $self = shift;
my $key_v = shift || die "no key?";
unless ($self->{'node'}->out_doc_by_uri( 'file://' . $key_v )) {
print STDERR "WARNING: can't delete document $key_v\n";
}
}
=head2 delete
$tb->delete( docid => $did, ... );
=cut
sub delete {
my $self = shift;
my $args = {@_};
my $key = $self->{'key'} || die "no key in object";
die "no $key in data" unless (my $key_v = $args->{$key});
$self->delete_by_key($key_v);
}