--- trunk/spider/progspider 2004/03/17 12:19:14 65 +++ trunk/spider/progspider 2004/03/17 12:19:42 66 @@ -25,6 +25,9 @@ my $pdftotext = which('pdftotext'); +select(STDERR); $|=1; +select(STDOUT); $|=1; + print STDERR "using $pdftotext to convert pdf into html\n" if ($pdftotext && $verbose); find({ wanted => \&file, @@ -32,6 +35,28 @@ no_chdir => 1 }, $dir); +sub dump_contents($$$) { + my ($contents,$mtime,$path) = @_; + + return if (! $contents); # don't die on empty files + + use bytes; + my $size = length $contents; + + print STDERR " [$size]" if ($verbose); + + # Output the document (to swish) + print <) { # XXX why pdftotext barks if I try to use this is beyond me. @@ -51,7 +76,19 @@ } close(F); - $contents = "\n$html\n"; + my ($pre_html,$pages,$post_html) = ('$path :: page ##page_nr##
',$html,'
'); + + ($pre_html,$pages,$post_html) = ($1,$2,$3) if ($html =~ m/^(.+
)(.+)(<\/pre>.+)$/si);
+
+		$pre_html =~ s/(.+?)<\/title>/<title>$1 :: page ##page_nr##<\/title>/si;
+
+		my $page_nr = 1;
+		foreach my $page (split(/\f/,$pages)) {
+			my $pre_tmp = $pre_html;
+			$pre_tmp =~ s/##page_nr##/$page_nr<\/title>/s;
+			dump_contents($pre_tmp . $page . $post_html,time(), $path);
+			$page_nr++;
+		}
 
 	} else {
 
@@ -68,28 +105,15 @@
 		$contents .= "\n\n";
 
 		$contents = filter($contents,$collection);
-	}
 
-#	die "zero size content in '$path'" if (! $contents);
-	return if (! $contents);	# don't die on empty files
+		# add optional components to path
+		$path .= " $path_add" if ($path_add);
 
-	my $mtime = time;
-	use bytes;
-	my $size = length $contents;
-
-	print STDERR " [$size]\n" if ($verbose);
-
-	# add optional components to path
-	$path .= " $path_add" if ($path_add);
-
-	# Output the document (to swish)
-	print <<EOF;
-Path-Name: $path
-Content-Length: $size
-Last-Mtime: $mtime
-Document-Type: HTML
+		dump_contents($contents,time(), $path);
+	}
 
-EOF
-	print $contents;
+	print STDERR "\n" if ($verbose);
+#	die "zero size content in '$path'" if (! $contents);
 
 }
+