--- trunk/spider/progspider 2004/01/25 16:49:50 57 +++ trunk/spider/progspider 2004/02/06 13:29:39 63 @@ -2,6 +2,7 @@ use strict; use File::Find; use Getopt::Long; +use File::Which; my $collection; # name which will be inserted my $path_add; # add additional info in path @@ -18,11 +19,13 @@ my $dir = shift @ARGV || die "usage: $0 [dir]"; - my $basedir = $0; $basedir =~ s,/[^/]+$,/,; require "$basedir/filter.pm"; +my $pdftotext = which('pdftotext'); + +print STDERR "using $pdftotext to convert pdf into html\n" if ($pdftotext && $verbose); find({ wanted => \&file, follow => 1, @@ -31,22 +34,41 @@ sub file { - return if (! -f || ! m/\.html*/i); + my $path = $_; + my $contents; - # skip index files - return if (m/index_[a-z]\.html*/i || m/index_symbol\.html*/i); + if ($pdftotext && -f $path && $path =~ m/\.pdf$/i) { - my $path = $_; + print STDERR "$path {converting}" if ($verbose); - open(F,"$path") || die "can't open file: $path"; - print STDERR "$path" if ($verbose); - my $contents; - while() { - $contents .= "$_"; - } - $contents .= "\n\n"; + open(F,"$pdftotext -htmlmeta $path - |") || die "can't open $pdftotext with '$path'"; + my $html; + while() { + # XXX why pdftotext barks if I try to use this is beyond me. + #$contents .= $_; - $contents = filter($contents,$collection); + $html .= $_; + } + close(F); + + $contents = "\n$html\n"; + + } else { + + return if (! -f $path || ! m/\.html*$/i); + + # skip index files + return if (m/index_[a-z]\.html*/i || m/index_symbol\.html*/i); + + open(F,"$path") || die "can't open file: $path"; + print STDERR "$path" if ($verbose); + while() { + $contents .= "$_"; + } + $contents .= "\n\n"; + + $contents = filter($contents,$collection); + } # die "zero size content in '$path'" if (! $contents); return if (! $contents); # don't die on empty files