Contents of /trunk/spider/progspider

#!/usr/bin/perl -w
use strict;
use File::Find;
use Getopt::Long;
use File::Which;

# Command-line state, filled in by GetOptions below.
my $collection;         # name which will be inserted
my $path_add;           # add additional info in path
my $verbose;            # set by either --verbose or --debug
my $exclude;            # regex; matching paths are skipped in dump_contents
my $skip_output;        # when set, documents are sized but not printed

#$verbose = 1;

my $result = GetOptions(
        "collection=s" => \$collection,
        "path=s" => \$path_add,
        "verbose!" => \$verbose,
        "debug!" => \$verbose,
        "exclude=s" => \$exclude,
        "skipoutput!" => \$skip_output,
);

# The directory to crawl is the first remaining argument.
my $dir = shift(@ARGV) || die "usage: $0 [dir]";

# filter.pm is loaded from the directory this script lives in.  Deriving
# that directory by stripping the last path component of $0 breaks when $0
# has no slash at all (e.g. "perl progspider"), and a relative result only
# loads if "." happens to be in @INC.  FindBin gives the absolute directory
# of the invoked script in every case.
use FindBin ();
my $basedir = $FindBin::Bin;
require "$basedir/filter.pm";

# PDF conversion is optional: without pdftotext in PATH, PDFs are not converted.
my $pdftotext = which('pdftotext');

# Unbuffer both streams so progress messages and documents are not delayed.
select(STDERR); $|=1;
select(STDOUT); $|=1;

print STDERR "using $pdftotext to convert pdf into html\n" if ($pdftotext && $verbose);

# Walk the tree; no_chdir keeps $_ equal to the full path inside file().
find({ wanted => \&file, 
        follow => 1,
        no_chdir => 1
}, $dir);

# Print one document (header block plus body) to STDOUT for swish.
#
# Arguments: document content, modification time stamp, path name.
# Reads the file-level options $exclude, $verbose and $skip_output.
# Empty content and excluded paths produce no output.
sub dump_contents($$$) {
        my ($body, $stamp, $name) = @_;

        # Empty (false) content is quietly ignored instead of being an error.
        return if (! $body);

        # --exclude: the pattern is tried case-insensitively against the path.
        if ($exclude) {
                if ($name =~ m/$exclude/i) {
                        print STDERR "skip: $name\n" if ($verbose);
                        return;
                }
        }

        # The length is taken under the bytes pragma, i.e. counted in bytes.
        my $octets = do { use bytes; length $body };

        print STDERR " [$octets]" if ($verbose);

        # --skipoutput: the size is still reported above, nothing goes to STDOUT.
        return if ($skip_output);

        # Headers, one blank separator line, then the document itself.
        print <<HEADERS;
Path-Name: $name
Content-Length: $octets
Last-Mtime: $stamp
Document-Type: html*

HEADERS
        print $body;

}

# File::Find "wanted" callback: turn one directory entry into zero or more
# documents and hand them to dump_contents().
#
# - Symlinks and non-files are ignored.
# - *.pdf files (only when pdftotext was found) are converted with
#   "pdftotext -htmlmeta" and emitted one document per non-blank page, with
#   " :: page N" appended to the title.
# - html/php/pl/txt/info/log/text files are read whole, passed through
#   filter() and emitted once; generated index_* pages are skipped.
#
# Reads $_ (the full path, because find() runs with no_chdir) and the
# file-level options $pdftotext, $collection, $path_add and $verbose.
# Dies if a file or the pdftotext pipe cannot be opened.
sub file {

        my $path = $_;

        return if (-l $path);

        # Last-Mtime should describe the file itself, not the moment it was
        # indexed; fall back to the current time only if stat() fails.
        my $mtime = (stat($path))[9] || time();

        if ($pdftotext && -f $path && $path =~ m/\.pdf$/i) {

                print STDERR "$path {converting}" if ($verbose);

                # List-form pipe open: the file name is passed as a single
                # argument and never seen by a shell, so quotes, '$' or
                # backticks in the name cannot break or inject commands.
                open(my $pdf, '-|', $pdftotext, '-htmlmeta', $path, '-')
                        || die "can't open $pdftotext with '$path': $!";
                my $html = '';
                # A lexical loop variable keeps File::Find's $_ intact.
                while (my $line = <$pdf>) {
                        $html .= $line;
                }
                close($pdf);

                return if (! $html);

                my $file_only = $path;
                $file_only =~ s/^.*\/([^\/]+)$/$1/g;

                # Fallback wrapper, used when the converter output does not
                # have the expected <html>...<pre>...</pre>... shape.  The
                # page marker is added to the title further down, once.
                my ($pre_html,$pages,$post_html) = ("<html><head><title>$path</title></head><body><pre>",$html,'</pre></body></html>');

                ($pre_html,$pages,$post_html) = ($1,$2,$3) if ($html =~ m/^(<html>.+?<pre>)(.+)(<\/pre>.+?)$/si);

                # Title preference: collection name, then the document's own
                # non-empty title, then the bare file name.
                my $title = $collection;
                if (! $title && $pre_html =~ m/<title>(.+?)<\/title>/si) {
                        $title = $1;
                }
                $title = $file_only if (! $title);
                # .*? also matches an empty <title></title>, so the page
                # marker is inserted in every case where a title tag exists.
                $pre_html =~ s/<title>.*?<\/title>/<title>$title :: page ##page_nr##<\/title>/si;

                # pdftotext separates pages with form feeds.
                my $page_nr = 1;
                foreach my $page (split(/\f/,$pages)) {
                        print STDERR " $page_nr" if ($verbose);
                        my $pre_tmp = $pre_html;
                        # Only the number is substituted; the closing
                        # </title> is already part of the template.
                        $pre_tmp =~ s/##page_nr##/$page_nr/;
                        # Blank pages still advance the page counter.
                        dump_contents($pre_tmp . $page . $post_html, $mtime, $path) if ($page =~ m/\S/);
                        $page_nr++;
                }

        } else {

                return if (! -f $path || $path !~ m/\.(html*|php|pl|txt|info|log|text)$/i);

                # skip index files
                return if ($path =~ m/index_[a-z]\.html*/i || $path =~ m/index_symbol\.html*/i);

                # Three-argument open: the name is never parsed for mode
                # characters or surrounding whitespace.
                open(my $in, '<', $path) || die "can't open file: $path: $!";
                print STDERR "$path" if ($verbose);
                my $contents = do { local $/; <$in> };
                close($in);
                $contents = '' unless (defined $contents);
                $contents .= "\n\n";

                $contents = filter($contents,$collection);

                # add optional components to path
                $path .= " $path_add" if ($path_add);

                dump_contents($contents, $mtime, $path);
        }

        print STDERR "\n" if ($verbose);

}

1	#!/usr/bin/perl -w
2	use strict;
3	use File::Find;
4	use Getopt::Long;
5	use File::Which;
6
7	my $collection; # name which will be inserted
8	my $path_add; # add additional info in path
9	my $verbose;
10	my $exclude;
11	my $skip_output;
12
13	#$verbose = 1;
14
15	my $result = GetOptions(
16	"collection=s" => \$collection,
17	"path=s" => \$path_add,
18	"verbose!" => \$verbose,
19	"debug!" => \$verbose,
20	"exclude=s" => \$exclude,
21	"skipoutput!" => \$skip_output,
22	);
23
24	my $dir = shift @ARGV \|\| die "usage: $0 [dir]";
25
26	my $basedir = $0;
27	$basedir =~ s,/[^/]+$,/,;
28	require "$basedir/filter.pm";
29
30	my $pdftotext = which('pdftotext');
31
32	select(STDERR); $\|=1;
33	select(STDOUT); $\|=1;
34
35	print STDERR "using $pdftotext to convert pdf into html\n" if ($pdftotext && $verbose);
36
37	find({ wanted => \&file,
38	follow => 1,
39	no_chdir => 1
40	}, $dir);
41
42	sub dump_contents($$$) {
43	my ($contents,$mtime,$path) = @_;
44
45	return unless ($contents); # don't die on empty files
46
47	if ($exclude && $path =~ m/$exclude/i) {
48	print STDERR "skip: $path\n" if ($verbose);
49	return;
50	}
51
52	use bytes;
53	my $size = length $contents;
54
55	print STDERR " [$size]" if ($verbose);
56
57	return if ($skip_output);
58
59	# Output the document (to swish)
60	print <<EOF;
61	Path-Name: $path
62	Content-Length: $size
63	Last-Mtime: $mtime
64	Document-Type: html*
65
66	EOF
67	print $contents;
68
69	}
70
71	sub file {
72
73	my $path = $_;
74	my $contents;
75
76	return if (-l $path);
77
78	if ($pdftotext && -f $path && $path =~ m/\.pdf$/i) {
79
80	print STDERR "$path {converting}" if ($verbose);
81
82	open(F,"$pdftotext -htmlmeta \"$path\" - \|") \|\| die "can't open $pdftotext with '$path'";
83	my $html;
84	while(<F>) {
85	# XXX why pdftotext barks if I try to use this is beyond me.
86	#$contents .= $_;
87
88	$html .= $_;
89	}
90	close(F);
91
92	return if (! $html);
93
94	my $file_only = $path;
95	$file_only =~ s/^.*\/([^\/]+)$/$1/g;
96
97	my ($pre_html,$pages,$post_html) = ('<html><head><title>$path :: page ##page_nr##</title></head><body><pre>',$html,'</pre></body></html>');
98
99	($pre_html,$pages,$post_html) = ($1,$2,$3) if ($html =~ m/^(<html>.+?<pre>)(.+)(<\/pre>.+?)$/si);
100
101	if ($collection) {
102	$pre_html =~ s/<title>(.+?)<\/title>/<title>$collection :: page ##page_nr##<\/title>/si;
103	} else {
104	$pre_html =~ s/<title>(.+?)<\/title>/<title>$1 :: page ##page_nr##<\/title>/si \|\|
105	$pre_html =~ s/<title><\/title>/<title>$file_only :: page ##page_nr##<\/title>/si;
106	}
107
108	my $page_nr = 1;
109	foreach my $page (split(/\f/s,$pages)) {
110	print STDERR " $page_nr" if ($verbose);
111	my $pre_tmp = $pre_html;
112	$pre_tmp =~ s/##page_nr##/$page_nr<\/title>/s;
113	dump_contents($pre_tmp . $page . $post_html,time(), $path) if ($page !~ m/^\s*$/s);
114	$page_nr++;
115	}
116
117	} else {
118
119	return if (! -f $path \|\| ! m/\.(html*\|php\|pl\|txt\|info\|log\|text)$/i);
120
121	# skip index files
122	return if (m/index_[a-z]\.html/i \|\| m/index_symbol\.html/i);
123
124	open(F,"$path") \|\| die "can't open file: $path";
125	print STDERR "$path" if ($verbose);
126	while(<F>) {
127	$contents .= $_;
128	}
129	$contents .= "\n\n";
130
131	$contents = filter($contents,$collection);
132
133	# add optional components to path
134	$path .= " $path_add" if ($path_add);
135
136	dump_contents($contents,time(), $path);
137	}
138
139	print STDERR "\n" if ($verbose);
140	# die "zero size content in '$path'" if (! $contents);
141
142	}
143