$path :: page ##page

--- trunk/spider/progspider 2004/01/23 13:10:40 56 +++ trunk/spider/progspider 2004/08/29 21:19:13 84 @@ -1,12 +1,15 @@ -#!/usr/local/bin/perl -w +#!/usr/bin/perl -w use strict; use File::Find; use Getopt::Long; +use File::Which; my $collection; # name which will be inserted my $path_add; # add additional info in path my $verbose; +#$verbose = 1; + my $result = GetOptions( "collection=s" => \$collection, "path=s" => \$path_add, @@ -16,56 +19,112 @@ my $dir = shift @ARGV || die "usage: $0 [dir]"; - my $basedir = $0; $basedir =~ s,/[^/]+$,/,; require "$basedir/filter.pm"; +my $pdftotext = which('pdftotext'); + +select(STDERR); $|=1; +select(STDOUT); $|=1; + +print STDERR "using $pdftotext to convert pdf into html\n" if ($pdftotext && $verbose); find({ wanted => \&file, follow => 1, no_chdir => 1 }, $dir); -sub file { - - return if (! -f || ! m/\.html*/i); - - # skip index files - return if (m/index_[a-z]\.html*/i || m/index_symbol\.html*/i); - - my $path = $_; - - open(F,"$path") || die "can't open file: $path"; - print STDERR "$path" if ($verbose); - my $contents; - while() { - $contents .= "$_"; - } - $contents .= "\n\n"; - - $contents = filter($contents,$collection); +sub dump_contents($$$) { + my ($contents,$mtime,$path) = @_; -# die "zero size content in '$path'" if (! $contents); return if (! $contents); # don't die on empty files - my $mtime = time; use bytes; my $size = length $contents; - print STDERR " [$size]\n" if ($verbose); - - # add optional components to path - $path .= " $path_add" if ($path_add); + print STDERR " [$size]" if ($verbose); # Output the document (to swish) print <) { + # XXX why pdftotext barks if I try to use this is beyond me. + #$contents .= $_; + + $html .= $_; + } + close(F); + + return if (! $html); + + my $file_only = $path; + $file_only =~ s/^.*\/([^\/]+)$/$1/g; + + my ($pre_html,$pages,$post_html) = ('$path :: page ##page_nr##

',$html,'

'); + + ($pre_html,$pages,$post_html) = ($1,$2,$3) if ($html =~ m/^(.+?<pre>)(.+)(<\/pre>.+?)$/si);
+
+		if ($collection) {
+			$pre_html =~ s/<title>(.+?)<\/title>/<title>$collection :: page ##page_nr##<\/title>/si;
+		} else {
+			$pre_html =~ s/<title>(.+?)<\/title>/<title>$1 :: page ##page_nr##<\/title>/si ||
+			$pre_html =~ s/<title><\/title>/<title>$file_only :: page ##page_nr##<\/title>/si;
+		}
+
+		my $page_nr = 1;
+		foreach my $page (split(/\f/s,$pages)) {
+			print STDERR " $page_nr" if ($verbose);
+			my $pre_tmp = $pre_html;
+			$pre_tmp =~ s/##page_nr##/$page_nr<\/title>/s;
+			dump_contents($pre_tmp . $page . $post_html,time(), $path) if ($page !~ m/^\s*$/s);
+			$page_nr++;
+		}
+
+	} else {
+
+		return if (! -f $path || ! m/\.(html*|php|pl|txt|info|log|text)$/i);
+
+		# skip index files
+		return if (m/index_[a-z]\.html*/i || m/index_symbol\.html*/i);
+
+		open(F,"$path") || die "can't open file: $path";
+		print STDERR "$path" if ($verbose);
+		while(<F>) {
+			$contents .= "$_";
+		}
+		$contents .= "\n\n";
+
+		$contents = filter($contents,$collection);
+
+		# add optional components to path
+		$path .= " $path_add" if ($path_add);
+
+		dump_contents($contents,time(), $path);
+	}
+
+	print STDERR "\n" if ($verbose);
+#	die "zero size content in '$path'" if (! $contents);
+
+}
+