--- trunk/spider/progspider 2004/01/20 18:13:32 50 +++ trunk/spider/progspider 2004/03/18 11:14:49 68 @@ -1,7 +1,21 @@ #!/usr/local/bin/perl -w use strict; use File::Find; +use Getopt::Long; +use File::Which; +my $collection; # name which will be inserted +my $path_add; # add additional info in path +my $verbose; + +#$verbose = 1; + +my $result = GetOptions( + "collection=s" => \$collection, + "path=s" => \$path_add, + "verbose!" => \$verbose, + "debug!" => \$verbose, +); my $dir = shift @ARGV || die "usage: $0 [dir]"; @@ -9,33 +23,27 @@ $basedir =~ s,/[^/]+$,/,; require "$basedir/filter.pm"; +my $pdftotext = which('pdftotext'); + +select(STDERR); $|=1; +select(STDOUT); $|=1; + +print STDERR "using $pdftotext to convert pdf into html\n" if ($pdftotext && $verbose); + find({ wanted => \&file, follow => 1, no_chdir => 1 }, $dir); -sub file { - - return if (! -f || ! m/\.html*/i); - - my $path = $_; - - open(F,"$path") || die "can't open file: $path"; -# print STDERR "$path\n"; - my $contents; - while() { - $contents .= $_; - } - - $contents = filter($contents); +sub dump_contents($$$) { + my ($contents,$mtime,$path) = @_; -# die "zero size content in '$path'" if (! $contents); return if (! $contents); # don't die on empty files - my $mtime = time; + use bytes; my $size = length $contents; -# print STDERR " [$size]\n"; + print STDERR " [$size]" if ($verbose); # Output the document (to swish) print <) { + # XXX why pdftotext barks if I try to use this is beyond me. + #$contents .= $_; + + $html .= $_; + } + close(F); + + my ($pre_html,$pages,$post_html) = ('$path :: page ##page_nr##
',$html,'
'); + + ($pre_html,$pages,$post_html) = ($1,$2,$3) if ($html =~ m/^(.+
)(.+)(<\/pre>.+)$/si);
+
+		$pre_html =~ s/(.+?)<\/title>/<title>$1 :: page ##page_nr##<\/title>/si;
+
+		my $page_nr = 1;
+		foreach my $page (split(/\f/,$pages)) {
+			my $pre_tmp = $pre_html;
+			$pre_tmp =~ s/##page_nr##/$page_nr<\/title>/s;
+			dump_contents($pre_tmp . $page . $post_html,time(), $path) if ($page !~ m/^\s*$/s);
+			$page_nr++;
+		}
+
+	} else {
+
+		return if (! -f $path || ! m/\.html*$/i);
+
+		# skip index files
+		return if (m/index_[a-z]\.html*/i || m/index_symbol\.html*/i);
+
+		open(F,"$path") || die "can't open file: $path";
+		print STDERR "$path" if ($verbose);
+		while(<F>) {
+			$contents .= "$_";
+		}
+		$contents .= "\n\n";
+
+		$contents = filter($contents,$collection);
+
+		# add optional components to path
+		$path .= " $path_add" if ($path_add);
+
+		dump_contents($contents,time(), $path);
+	}
+
+	print STDERR "\n" if ($verbose);
+#	die "zero size content in '$path'" if (! $contents);
+
+}
+