--- trunk/spider/progspider 2004/01/25 16:49:50 57 +++ trunk/spider/progspider 2004/03/17 12:19:42 66 @@ -2,6 +2,7 @@ use strict; use File::Find; use Getopt::Long; +use File::Which; my $collection; # name which will be inserted my $path_add; # add additional info in path @@ -18,47 +19,31 @@ my $dir = shift @ARGV || die "usage: $0 [dir]"; - my $basedir = $0; $basedir =~ s,/[^/]+$,/,; require "$basedir/filter.pm"; +my $pdftotext = which('pdftotext'); + +select(STDERR); $|=1; +select(STDOUT); $|=1; + +print STDERR "using $pdftotext to convert pdf into html\n" if ($pdftotext && $verbose); find({ wanted => \&file, follow => 1, no_chdir => 1 }, $dir); -sub file { - - return if (! -f || ! m/\.html*/i); - - # skip index files - return if (m/index_[a-z]\.html*/i || m/index_symbol\.html*/i); - - my $path = $_; - - open(F,"$path") || die "can't open file: $path"; - print STDERR "$path" if ($verbose); - my $contents; - while() { - $contents .= "$_"; - } - $contents .= "\n\n"; - - $contents = filter($contents,$collection); +sub dump_contents($$$) { + my ($contents,$mtime,$path) = @_; -# die "zero size content in '$path'" if (! $contents); return if (! $contents); # don't die on empty files - my $mtime = time; use bytes; my $size = length $contents; - print STDERR " [$size]\n" if ($verbose); - - # add optional components to path - $path .= " $path_add" if ($path_add); + print STDERR " [$size]" if ($verbose); # Output the document (to swish) print <) { + # XXX why pdftotext barks if I try to use this is beyond me. + #$contents .= $_; + + $html .= $_; + } + close(F); + + my ($pre_html,$pages,$post_html) = ('$path :: page ##page_nr##
',$html,'
'); + + ($pre_html,$pages,$post_html) = ($1,$2,$3) if ($html =~ m/^(.+
)(.+)(<\/pre>.+)$/si);
+
+		$pre_html =~ s/(.+?)<\/title>/<title>$1 :: page ##page_nr##<\/title>/si;
+
+		my $page_nr = 1;
+		foreach my $page (split(/\f/,$pages)) {
+			my $pre_tmp = $pre_html;
+			$pre_tmp =~ s/##page_nr##/$page_nr<\/title>/s;
+			dump_contents($pre_tmp . $page . $post_html,time(), $path);
+			$page_nr++;
+		}
+
+	} else {
+
+		return if (! -f $path || ! m/\.html*$/i);
+
+		# skip index files
+		return if (m/index_[a-z]\.html*/i || m/index_symbol\.html*/i);
+
+		open(F,"$path") || die "can't open file: $path";
+		print STDERR "$path" if ($verbose);
+		while(<F>) {
+			$contents .= "$_";
+		}
+		$contents .= "\n\n";
+
+		$contents = filter($contents,$collection);
+
+		# add optional components to path
+		$path .= " $path_add" if ($path_add);
+
+		dump_contents($contents,time(), $path);
+	}
+
+	print STDERR "\n" if ($verbose);
+#	die "zero size content in '$path'" if (! $contents);
+
+}
+