--- trunk/spider/progspider 2004/01/20 16:01:13 48 +++ trunk/spider/progspider 2004/02/06 13:29:39 63 @@ -1,7 +1,21 @@ #!/usr/local/bin/perl -w use strict; use File::Find; +use Getopt::Long; +use File::Which; +my $collection; # name which will be inserted +my $path_add; # add additional info in path +my $verbose; + +#$verbose = 1; + +my $result = GetOptions( + "collection=s" => \$collection, + "path=s" => \$path_add, + "verbose!" => \$verbose, + "debug!" => \$verbose, +); my $dir = shift @ARGV || die "usage: $0 [dir]"; @@ -9,6 +23,10 @@ $basedir =~ s,/[^/]+$,/,; require "$basedir/filter.pm"; +my $pdftotext = which('pdftotext'); + +print STDERR "using $pdftotext to convert pdf into html\n" if ($pdftotext && $verbose); + find({ wanted => \&file, follow => 1, no_chdir => 1 @@ -16,28 +34,53 @@ sub file { - return if (! -f || ! m/\.html*/i); - my $path = $_; - - open(F,"$path") || die "can't open file: $path"; -# print STDERR "$path"; my $contents; - while() { -# chomp; -# chomp; -# $contents .= " ".$_; - $contents .= $_; - } -# $contents =~ s/<(\/*\w+)\s+>/<$1>/g; + if ($pdftotext && -f $path && $path =~ m/\.pdf$/i) { + + print STDERR "$path {converting}" if ($verbose); + + open(F,"$pdftotext -htmlmeta $path - |") || die "can't open $pdftotext with '$path'"; + my $html; + while() { + # XXX why pdftotext barks if I try to use this is beyond me. + #$contents .= $_; + + $html .= $_; + } + close(F); + + $contents = "\n$html\n"; - $contents = filter($contents); + } else { + + return if (! -f $path || ! m/\.html*$/i); + + # skip index files + return if (m/index_[a-z]\.html*/i || m/index_symbol\.html*/i); + + open(F,"$path") || die "can't open file: $path"; + print STDERR "$path" if ($verbose); + while() { + $contents .= "$_"; + } + $contents .= "\n\n"; + + $contents = filter($contents,$collection); + } + +# die "zero size content in '$path'" if (! $contents); + return if (! $contents); # don't die on empty files my $mtime = time; + use bytes; my $size = length $contents; -# print STDERR " [$size]\n"; + print STDERR " [$size]\n" if ($verbose); + + # add optional components to path + $path .= " $path_add" if ($path_add); # Output the document (to swish) print <