$path :: page ##page

--- trunk/spider/progspider 2004/01/20 18:13:32 50 +++ trunk/spider/progspider 2005/04/24 16:33:53 95 @@ -1,7 +1,23 @@ -#!/usr/local/bin/perl -w +#!/usr/bin/perl -w use strict; use File::Find; +use Getopt::Long; +use File::Which; +my $collection; # name which will be inserted +my $path_add; # add additional info in path +my $verbose; +my $exclude; + +#$verbose = 1; + +my $result = GetOptions( + "collection=s" => \$collection, + "path=s" => \$path_add, + "verbose!" => \$verbose, + "debug!" => \$verbose, + "exclude=s" => \$exclude, +); my $dir = shift @ARGV || die "usage: $0 [dir]"; @@ -9,43 +25,115 @@ $basedir =~ s,/[^/]+$,/,; require "$basedir/filter.pm"; +my $pdftotext = which('pdftotext'); + +select(STDERR); $|=1; +select(STDOUT); $|=1; + +print STDERR "using $pdftotext to convert pdf into html\n" if ($pdftotext && $verbose); + find({ wanted => \&file, follow => 1, no_chdir => 1 }, $dir); -sub file { - - return if (! -f || ! m/\.html*/i); +sub dump_contents($$$) { + my ($contents,$mtime,$path) = @_; - my $path = $_; + return unless ($contents); # don't die on empty files - open(F,"$path") || die "can't open file: $path"; -# print STDERR "$path\n"; - my $contents; - while() { - $contents .= $_; + if ($exclude && $path =~ m/$exclude/i) { + print STDERR "skip: $path\n" if ($verbose); + return; } - $contents = filter($contents); - -# die "zero size content in '$path'" if (! $contents); - return if (! $contents); # don't die on empty files - - my $mtime = time; + use bytes; my $size = length $contents; -# print STDERR " [$size]\n"; + print STDERR " [$size]" if ($verbose); # Output the document (to swish) print <) { + # XXX why pdftotext barks if I try to use this is beyond me. + #$contents .= $_; + + $html .= $_; + } + close(F); + + return if (! $html); + + my $file_only = $path; + $file_only =~ s/^.*\/([^\/]+)$/$1/g; + + my ($pre_html,$pages,$post_html) = ('$path :: page ##page_nr##

',$html,'

'); + + ($pre_html,$pages,$post_html) = ($1,$2,$3) if ($html =~ m/^(.+?

)(.+)(<\/pre>.+?)$/si);
+
+		if ($collection) {
+			$pre_html =~ s/(.+?)<\/title>/<title>$collection :: page ##page_nr##<\/title>/si;
+		} else {
+			$pre_html =~ s/<title>(.+?)<\/title>/<title>$1 :: page ##page_nr##<\/title>/si ||
+			$pre_html =~ s/<title><\/title>/<title>$file_only :: page ##page_nr##<\/title>/si;
+		}
+
+		my $page_nr = 1;
+		foreach my $page (split(/\f/s,$pages)) {
+			print STDERR " $page_nr" if ($verbose);
+			my $pre_tmp = $pre_html;
+			$pre_tmp =~ s/##page_nr##/$page_nr<\/title>/s;
+			dump_contents($pre_tmp . $page . $post_html,time(), $path) if ($page !~ m/^\s*$/s);
+			$page_nr++;
+		}
+
+	} else {
+
+		return if (! -f $path || ! m/\.(html*|php|pl|txt|info|log|text)$/i);
+
+		# skip index files
+		return if (m/index_[a-z]\.html*/i || m/index_symbol\.html*/i);
+
+		open(F,"$path") || die "can't open file: $path";
+		print STDERR "$path" if ($verbose);
+		while(<F>) {
+			$contents .= "$_";
+		}
+		$contents .= "\n\n";
+
+		$contents = filter($contents,$collection);
+
+		# add optional components to path
+		$path .= " $path_add" if ($path_add);
+
+		dump_contents($contents,time(), $path);
+	}
+
+	print STDERR "\n" if ($verbose);
+#	die "zero size content in '$path'" if (! $contents);
+
+}
+