Annotation of /trunk/spider/progspider

#!/usr/bin/perl -w
use strict;
use File::Find;
use Getopt::Long;
use File::Which;

# Locates the directory this script lives in (absolute path), so that
# filter.pm can be loaded no matter how the script was invoked.
use FindBin;

# Command-line state, shared with the subs below.
my $collection;         # name which will be inserted (used as page title / passed to filter())
my $path_add;           # add additional info in path
my $verbose;            # progress messages on STDERR (--verbose or --debug)
my $exclude;            # regex (matched case-insensitively); matching paths are not emitted
my $skip_output;        # walk and convert, but don't print documents

#$verbose = 1;

my $result = GetOptions(
        "collection=s" => \$collection,
        "path=s" => \$path_add,
        "verbose!" => \$verbose,
        "debug!" => \$verbose,
        "exclude=s" => \$exclude,
        "skipoutput!" => \$skip_output,
);

# GetOptions has already warned about the offending option; don't go on
# and index with a half-understood command line.
die "usage: $0 [dir]" unless ($result);

# Explicit definedness/length test so that a directory called "0" is accepted.
my $dir = shift @ARGV;
die "usage: $0 [dir]" unless (defined $dir && length $dir);

# filter.pm sits next to this script.  Use an absolute path: a bare
# "progspider" in $0 has no directory part to strip, and a relative path
# given to require is looked up through @INC (which lacks "." on newer perls).
my $basedir = "$FindBin::Bin/";
require "${basedir}filter.pm";

# undef when pdftotext is not in PATH; PDF files are then not converted
my $pdftotext = which('pdftotext');

# unbuffered STDERR and STDOUT; STDOUT is left as the selected handle
select(STDERR); $|=1;
select(STDOUT); $|=1;

print STDERR "using $pdftotext to convert pdf into html\n" if ($pdftotext && $verbose);

# no_chdir: $_ in the callback is the full path of the entry
find({ wanted => \&file, 
        follow => 1,
        no_chdir => 1
}, $dir);

# Emit one document on the currently selected output handle, as a small
# header block (Path-Name, Content-Length, Last-Mtime, Document-Type)
# followed by a blank line and the document body.
#
#   $contents - document body; false values (empty, undef, "0") are ignored
#   $mtime    - value written into the Last-Mtime header
#   $path     - value written into the Path-Name header, and the string
#               tested against the --exclude pattern
#
# Reads the file-level $exclude, $verbose and $skip_output settings.
sub dump_contents($$$) {
        my ($contents, $mtime, $path) = @_;

        # don't die on empty files
        if (! $contents) {
                return;
        }

        # excluded paths are reported (when verbose) and dropped
        if ($exclude) {
                if ($path =~ m/$exclude/i) {
                        if ($verbose) {
                                print STDERR "skip: $path\n";
                        }
                        return;
                }
        }

        # Content-Length must count bytes, not characters
        use bytes;
        my $byte_count = length($contents);

        if ($verbose) {
                print STDERR " [$byte_count]";
        }

        # --skipoutput: everything above still runs, nothing is printed
        if ($skip_output) {
                return;
        }

        # Output the document (to swish): headers, empty line, body
        my $header = <<EOF;
Path-Name: $path
Content-Length: $byte_count
Last-Mtime: $mtime
Document-Type: html*

EOF
        print $header;
        print $contents;

}

# File::Find "wanted" callback: turn one filesystem entry into zero or
# more documents via dump_contents().
#
# find() is run with no_chdir, so $_ is the full path of the entry.
# Symlinks are skipped.  Two kinds of regular files are handled:
#
#   *.pdf (only when pdftotext was found)
#       converted with "pdftotext -htmlmeta"; the output is split on form
#       feeds and every non-blank page is emitted as its own document whose
#       title ends in ":: page N".  The title base is --collection if given,
#       else the PDF's own title, else the file name.
#
#   *.htm[l], *.php, *.pl, *.txt, *.info, *.log, *.text
#       read whole, passed through filter() and emitted as one document;
#       --path text is appended to the reported path.  Files matching
#       index_<letter>.htm[l] or index_symbol.htm[l] are skipped.
#
# Dies if pdftotext can't be started or a file can't be opened.
#
# NOTE(review): Last-Mtime is given the current time, not the file's
# mtime, and --path is not applied to PDF pages -- both kept as they were;
# confirm whether that is intended.
sub file {

        my $path = $_;

        return if (-l $path);

        if ($pdftotext && -f $path && $path =~ m/\.pdf$/i) {

                print STDERR "$path {converting}" if ($verbose);

                # List-form pipe open runs pdftotext directly, without a shell, so
                # quotes, '$' or backticks in a file name can neither break the
                # command nor be executed.
                open(my $pdf, '-|', $pdftotext, '-htmlmeta', $path, '-')
                        or die "can't open $pdftotext with '$path': $!";
                # slurp through a lexical handle; leaves File::Find's $_ alone
                my $html = do { local $/; <$pdf> };
                close($pdf);

                if (! $html) {
                        print STDERR "\n" if ($verbose);
                        return;
                }

                my $file_only = $path;
                $file_only =~ s/^.*\/([^\/]+)$/$1/g;

                # Fallback wrapper, used when the converter output doesn't have the
                # expected <html>...<pre>...</pre>... shape.  The title holds only
                # the path here; ":: page ##page_nr##" is appended by the title
                # rewrite below, same as for a title taken from the PDF.
                my ($pre_html,$pages,$post_html) = ("<html><head><title>$path</title></head><body><pre>",$html,'</pre></body></html>');

                ($pre_html,$pages,$post_html) = ($1,$2,$3) if ($html =~ m/^(<html>.+?<pre>)(.+)(<\/pre>.+?)$/si);

                if ($collection) {
                        $pre_html =~ s/<title>(.+?)<\/title>/<title>$collection :: page ##page_nr##<\/title>/si;
                } else {
                        # keep the document's own title; an empty one gets the file name
                        $pre_html =~ s/<title>(.+?)<\/title>/<title>$1 :: page ##page_nr##<\/title>/si ||
                        $pre_html =~ s/<title><\/title>/<title>$file_only :: page ##page_nr##<\/title>/si;
                }

                # pages are separated by form feeds in the converter output
                my $page_nr = 1;
                foreach my $page (split(/\f/s,$pages)) {
                        print STDERR " $page_nr" if ($verbose);
                        my $pre_tmp = $pre_html;
                        # only the placeholder is replaced; </title> is already in the template
                        $pre_tmp =~ s/##page_nr##/$page_nr/s;
                        dump_contents($pre_tmp . $page . $post_html,time(), $path) if ($page !~ m/^\s*$/s);
                        $page_nr++;
                }

        } else {

                return if (! -f $path || $path !~ m/\.(html*|php|pl|txt|info|log|text)$/i);

                # skip index files
                return if ($path =~ m/index_[a-z]\.html*/i || $path =~ m/index_symbol\.html*/i);

                # three-arg open: the name is never parsed for mode characters
                open(my $in, '<', $path) or die "can't open file: $path: $!";
                print STDERR "$path" if ($verbose);
                my $contents = do { local $/; <$in> };
                close($in);
                $contents = '' unless (defined $contents);
                $contents .= "\n\n";

                $contents = filter($contents,$collection);

                # add optional components to path
                $path .= " $path_add" if ($path_add);

                dump_contents($contents,time(), $path);
        }

        print STDERR "\n" if ($verbose);

}

1	dpavlin	81	#!/usr/bin/perl -w
2	dpavlin	46	use strict;
3			use File::Find;
4	dpavlin	56	use Getopt::Long;
5	dpavlin	63	use File::Which;
6	dpavlin	46
7	dpavlin	56	my $collection; # name which will be inserted
8			my $path_add; # add additional info in path
9			my $verbose;
10	dpavlin	95	my $exclude;
11	dpavlin	98	my $skip_output;
12	dpavlin	46
13	dpavlin	57	#$verbose = 1;
14
15	dpavlin	56	my $result = GetOptions(
16			"collection=s" => \$collection,
17			"path=s" => \$path_add,
18			"verbose!" => \$verbose,
19			"debug!" => \$verbose,
20	dpavlin	95	"exclude=s" => \$exclude,
21	dpavlin	98	"skipoutput!" => \$skip_output,
22	dpavlin	56	);
23
24	dpavlin	46	my $dir = shift @ARGV \|\| die "usage: $0 [dir]";
25
26			my $basedir = $0;
27			$basedir =~ s,/[^/]+$,/,;
28			require "$basedir/filter.pm";
29
30	dpavlin	63	my $pdftotext = which('pdftotext');
31	dpavlin	56
32	dpavlin	66	select(STDERR); $\|=1;
33			select(STDOUT); $\|=1;
34
35	dpavlin	63	print STDERR "using $pdftotext to convert pdf into html\n" if ($pdftotext && $verbose);
36
37	dpavlin	46	find({ wanted => \&file,
38			follow => 1,
39			no_chdir => 1
40			}, $dir);
41
42	dpavlin	66	sub dump_contents($$$) {
43			my ($contents,$mtime,$path) = @_;
44
45	dpavlin	95	return unless ($contents); # don't die on empty files
46	dpavlin	66
47	dpavlin	95	if ($exclude && $path =~ m/$exclude/i) {
48			print STDERR "skip: $path\n" if ($verbose);
49			return;
50			}
51
52	dpavlin	66	use bytes;
53			my $size = length $contents;
54
55			print STDERR " [$size]" if ($verbose);
56
57	dpavlin	98	return if ($skip_output);
58
59	dpavlin	66	# Output the document (to swish)
60			print <<EOF;
61			Path-Name: $path
62			Content-Length: $size
63			Last-Mtime: $mtime
64	dpavlin	81	Document-Type: html*
65	dpavlin	66
66			EOF
67			print $contents;
68
69			}
70
71	dpavlin	46	sub file {
72
73	dpavlin	63	my $path = $_;
74			my $contents;
75	dpavlin	46
76	dpavlin	92	return if (-l $path);
77
78	dpavlin	63	if ($pdftotext && -f $path && $path =~ m/\.pdf$/i) {
79	dpavlin	56
80	dpavlin	63	print STDERR "$path {converting}" if ($verbose);
81	dpavlin	46
82	dpavlin	66	open(F,"$pdftotext -htmlmeta \"$path\" - \|") \|\| die "can't open $pdftotext with '$path'";
83	dpavlin	63	my $html;
84			while(<F>) {
85			# XXX why pdftotext barks if I try to use this is beyond me.
86			#$contents .= $_;
87
88			$html .= $_;
89			}
90			close(F);
91
92	dpavlin	81	return if (! $html);
93
94	dpavlin	84	my $file_only = $path;
95			$file_only =~ s/^.*\/([^\/]+)$/$1/g;
96
97	dpavlin	66	my ($pre_html,$pages,$post_html) = ('<html><head><title>$path :: page ##page_nr##</title></head><body><pre>',$html,'</pre></body></html>');
98	dpavlin	63
99	dpavlin	72	($pre_html,$pages,$post_html) = ($1,$2,$3) if ($html =~ m/^(<html>.+?<pre>)(.+)(<\/pre>.+?)$/si);
100	dpavlin	66
101	dpavlin	72	if ($collection) {
102			$pre_html =~ s/<title>(.+?)<\/title>/<title>$collection :: page ##page_nr##<\/title>/si;
103			} else {
104	dpavlin	84	$pre_html =~ s/<title>(.+?)<\/title>/<title>$1 :: page ##page_nr##<\/title>/si \|\|
105			$pre_html =~ s/<title><\/title>/<title>$file_only :: page ##page_nr##<\/title>/si;
106	dpavlin	72	}
107	dpavlin	66
108			my $page_nr = 1;
109	dpavlin	72	foreach my $page (split(/\f/s,$pages)) {
110			print STDERR " $page_nr" if ($verbose);
111	dpavlin	66	my $pre_tmp = $pre_html;
112			$pre_tmp =~ s/##page_nr##/$page_nr<\/title>/s;
113	dpavlin	68	dump_contents($pre_tmp . $page . $post_html,time(), $path) if ($page !~ m/^\s*$/s);
114	dpavlin	66	$page_nr++;
115			}
116
117	dpavlin	63	} else {
118
119	dpavlin	81	return if (! -f $path \|\| ! m/\.(html*\|php\|pl\|txt\|info\|log\|text)$/i);
120	dpavlin	63
121			# skip index files
122			return if (m/index_[a-z]\.html/i \|\| m/index_symbol\.html/i);
123
124			open(F,"$path") \|\| die "can't open file: $path";
125			print STDERR "$path" if ($verbose);
126			while(<F>) {
127	dpavlin	98	$contents .= $_;
128	dpavlin	63	}
129			$contents .= "\n\n";
130
131			$contents = filter($contents,$collection);
132	dpavlin	66
133			# add optional components to path
134			$path .= " $path_add" if ($path_add);
135
136			dump_contents($contents,time(), $path);
137	dpavlin	46	}
138
139	dpavlin	66	print STDERR "\n" if ($verbose);
140	dpavlin	50	# die "zero size content in '$path'" if (! $contents);
141
142	dpavlin	66	}
143	dpavlin	46