Contents of /trunk/spider/progspider

#!/usr/local/bin/perl -w
use strict;
use File::Find;
use Getopt::Long;
use File::Which;

# Command-line options, filled in by GetOptions() below
my $collection;         # name which will be inserted
my $path_add;           # add additional info in path
my $verbose;            # print progress information on STDERR

#$verbose = 1;

# --debug is an alias for --verbose (both store into $verbose)
my $result = GetOptions(
        "collection=s" => \$collection,
        "path=s" => \$path_add,
        "verbose!" => \$verbose,
        "debug!" => \$verbose,
);

# GetOptions() returns false on unknown or malformed options (it has
# already warned about them); stop instead of crawling with options
# that were silently ignored
die "usage: $0 [dir]" if (! $result);

# test for defined/non-empty instead of truth, so that a directory
# named "0" is accepted
my $dir = shift @ARGV;
die "usage: $0 [dir]" if (! defined $dir || $dir eq '');

# filter.pm is loaded from the directory this script lives in.
# If $0 has no directory part (e.g. "perl progspider dir") the
# substitution does not match, so fall back to the current directory.
my $basedir = $0;
$basedir = './' if ($basedir !~ s,/[^/]+$,/,);
require "${basedir}filter.pm";

# full path of pdftotext, or undef if it is not installed; PDF files
# are only converted when it is available
my $pdftotext = which('pdftotext');

# unbuffer both STDERR and STDOUT (STDOUT stays the selected handle)
select(STDERR); $|=1;
select(STDOUT); $|=1;

print STDERR "using $pdftotext to convert pdf into html\n" if ($pdftotext && $verbose);

# Walk $dir and call file() for every entry. With no_chdir set, find()
# does not change directory, so $_ inside file() is the complete path.
find({ wanted => \&file,
        follow => 1,
        no_chdir => 1
}, $dir);

# dump_contents($contents, $mtime, $path)
#
# Print one document on the currently selected output handle: a header
# block (Path-Name, Content-Length, Last-Mtime, Document-Type: HTML),
# an empty line, and then the content itself.
#
#   $contents - document body; nothing is printed if it is undefined
#               or empty
#   $mtime    - value for the Last-Mtime header
#   $path     - value for the Path-Name header
sub dump_contents($$$) {
        my ($contents,$mtime,$path) = @_;

        # don't die on empty files; check definedness and length instead
        # of truth so that a document consisting of just "0" is not lost
        return if (! defined $contents || ! length $contents);

        # Content-Length has to be the size in bytes, not in characters
        use bytes;
        my $size = length $contents;

        print STDERR " [$size]" if ($verbose);

        # Output the document (to swish)
        print <<EOF;
Path-Name: $path
Content-Length: $size
Last-Mtime: $mtime
Document-Type: HTML

EOF
        print $contents;

        return;
}

# file()
#
# "wanted" callback for File::Find; $_ holds the path of the current
# entry (find() is called with no_chdir, so it is the complete path).
#
#  - *.pdf files (only if pdftotext was found) are converted with
#    "pdftotext -htmlmeta" and every non-blank page is passed to
#    dump_contents() as a separate document titled "<title> :: page <n>"
#  - *.htm/*.html files, except index_<letter> and index_symbol pages,
#    are read, run through filter() and passed to dump_contents()
#  - everything else is ignored
#
# Dies if pdftotext cannot be started or an HTML file cannot be opened.
sub file {

        my $path = $_;

        if ($pdftotext && -f $path && $path =~ m/\.pdf$/i) {

                print STDERR "$path {converting}" if ($verbose);

                # Last-Mtime should describe the file, not the time of
                # this run; use the current time only if stat() fails
                my $mtime = (stat($path))[9] || time();

                # list form of pipe open: no shell is involved, so quotes,
                # spaces or '$' in a file name can neither break the
                # command nor inject anything into it
                open(my $pdf_fh, '-|', $pdftotext, '-htmlmeta', $path, '-') || die "can't open $pdftotext with '$path': $!";
                my $html = do { local $/; <$pdf_fh> };
                $html = '' if (! defined $html);        # pdftotext printed nothing
                close($pdf_fh) || warn "$pdftotext failed on '$path' (exit status $?)\n";

                # wrapper used when the output does not have the expected
                # <html>...<pre>pages</pre>... layout
                my ($pre_html,$pages,$post_html) = ("<html><head><title>$path</title></head><body><pre>",$html,'</pre></body></html>');

                ($pre_html,$pages,$post_html) = ($1,$2,$3) if ($html =~ m/^(<html>.+?<pre>)(.+)(<\/pre>.+?)$/si);

                # title of every page: collection name if given, else the
                # title found in the converted document, else the path
                my $title = $collection;
                if (! $title) {
                        ($title) = $pre_html =~ m/<title>(.*?)<\/title>/si;
                        $title = $path if (! defined $title || $title !~ m/\S/);
                }

                # NOTE(review): --path ($path_add) is only appended for HTML
                # files below, and all pages of one PDF share the same
                # Path-Name -- confirm that this is intended

                # pages are separated by form feeds
                my $page_nr = 1;
                foreach my $page (split(/\f/,$pages)) {
                        print STDERR " $page_nr" if ($verbose);
                        if ($page =~ m/\S/) {
                                # blank pages are skipped, but still counted
                                my $pre_tmp = $pre_html;
                                $pre_tmp =~ s/<title>.*?<\/title>/<title>$title :: page $page_nr<\/title>/si;
                                dump_contents($pre_tmp . $page . $post_html, $mtime, $path);
                        }
                        $page_nr++;
                }

        } else {

                return if (! -f $path || $path !~ m/\.html*$/i);

                # skip index files
                return if ($path =~ m/index_[a-z]\.html*/i || $path =~ m/index_symbol\.html*/i);

                # three-argument open with a lexical handle: characters like
                # '>' or '|' in a file name are not taken as an open mode
                open(my $html_fh, '<', $path) || die "can't open file: $path: $!";
                print STDERR "$path" if ($verbose);

                # read the whole file at once, without touching the $_ that
                # File::Find has set up
                my $contents = do { local $/; <$html_fh> };
                close($html_fh);
                $contents = '' if (! defined $contents);
                $contents .= "\n\n";

                # filter() comes from filter.pm
                $contents = filter($contents,$collection);

                # taken before $path is extended below, and from the file
                # itself rather than from the time of this run
                my $mtime = (stat($path))[9] || time();

                # add optional components to path
                $path .= " $path_add" if ($path_add);

                dump_contents($contents, $mtime, $path);
        }

        print STDERR "\n" if ($verbose);

        return;
}

1	#!/usr/local/bin/perl -w
2	use strict;
3	use File::Find;
4	use Getopt::Long;
5	use File::Which;
6
7	my $collection; # name which will be inserted
8	my $path_add; # add additional info in path
9	my $verbose;
10
11	#$verbose = 1;
12
13	my $result = GetOptions(
14	"collection=s" => \$collection,
15	"path=s" => \$path_add,
16	"verbose!" => \$verbose,
17	"debug!" => \$verbose,
18	);
19
20	my $dir = shift @ARGV || die "usage: $0 [dir]";
21
22	my $basedir = $0;
23	$basedir =~ s,/[^/]+$,/,;
24	require "$basedir/filter.pm";
25
26	my $pdftotext = which('pdftotext');
27
28	select(STDERR); $|=1;
29	select(STDOUT); $|=1;
30
31	print STDERR "using $pdftotext to convert pdf into html\n" if ($pdftotext && $verbose);
32
33	find({ wanted => \&file,
34	follow => 1,
35	no_chdir => 1
36	}, $dir);
37
38	sub dump_contents($$$) {
39	my ($contents,$mtime,$path) = @_;
40
41	return if (! $contents); # don't die on empty files
42
43	use bytes;
44	my $size = length $contents;
45
46	print STDERR " [$size]" if ($verbose);
47
48	# Output the document (to swish)
49	print <<EOF;
50	Path-Name: $path
51	Content-Length: $size
52	Last-Mtime: $mtime
53	Document-Type: HTML
54
55	EOF
56	print $contents;
57
58	}
59
60	sub file {
61
62	my $path = $_;
63	my $contents;
64
65	if ($pdftotext && -f $path && $path =~ m/\.pdf$/i) {
66
67	print STDERR "$path {converting}" if ($verbose);
68
69	open(F,"$pdftotext -htmlmeta \"$path\" - |") || die "can't open $pdftotext with '$path'";
70	my $html;
71	while(<F>) {
72	# XXX why pdftotext barks if I try to use this is beyond me.
73	#$contents .= $_;
74
75	$html .= $_;
76	}
77	close(F);
78
79	my ($pre_html,$pages,$post_html) = ('<html><head><title>$path :: page ##page_nr##</title></head><body><pre>',$html,'</pre></body></html>');
80
81	($pre_html,$pages,$post_html) = ($1,$2,$3) if ($html =~ m/^(<html>.+?<pre>)(.+)(<\/pre>.+?)$/si);
82
83	if ($collection) {
84	$pre_html =~ s/<title>(.+?)<\/title>/<title>$collection :: page ##page_nr##<\/title>/si;
85	} else {
86	$pre_html =~ s/<title>(.+?)<\/title>/<title>$1 :: page ##page_nr##<\/title>/si;
87	}
88
89	my $page_nr = 1;
90	foreach my $page (split(/\f/s,$pages)) {
91	print STDERR " $page_nr" if ($verbose);
92	my $pre_tmp = $pre_html;
93	$pre_tmp =~ s/##page_nr##/$page_nr<\/title>/s;
94	dump_contents($pre_tmp . $page . $post_html,time(), $path) if ($page !~ m/^\s*$/s);
95	$page_nr++;
96	}
97
98	} else {
99
100	return if (! -f $path || ! m/\.html*$/i);
101
102	# skip index files
103	return if (m/index_[a-z]\.html/i || m/index_symbol\.html/i);
104
105	open(F,"$path") || die "can't open file: $path";
106	print STDERR "$path" if ($verbose);
107	while(<F>) {
108	$contents .= "$_";
109	}
110	$contents .= "\n\n";
111
112	$contents = filter($contents,$collection);
113
114	# add optional components to path
115	$path .= " $path_add" if ($path_add);
116
117	dump_contents($contents,time(), $path);
118	}
119
120	print STDERR "\n" if ($verbose);
121	# die "zero size content in '$path'" if (! $contents);
122
123	}
124