/[swish]/trunk/spider/progspider
This is a repository of my old source code, which is no longer updated. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/spider/progspider

Parent Directory | Revision Log


Revision 63 - (hide annotations)
Fri Feb 6 13:29:39 2004 UTC (20 years, 2 months ago) by dpavlin
File size: 1888 byte(s)
convert pdf files when indexing with progspider

1 dpavlin 46 #!/usr/local/bin/perl -w
2     use strict;
3     use File::Find;
4 dpavlin 56 use Getopt::Long;
5 dpavlin 63 use File::Which;
6 dpavlin 46
# ---------------------------------------------------------------------------
# Script setup: parse options, load the content filter, locate pdftotext and
# walk the directory tree, handing every entry to file().
# ---------------------------------------------------------------------------

my $collection;    # --collection: passed as second argument to filter()
my $path_add;      # --path: extra text appended to every Path-Name header
my $verbose;       # --verbose / --debug: progress messages on STDERR

#$verbose = 1;

my $result = GetOptions(
    "collection=s" => \$collection,
    "path=s"       => \$path_add,
    "verbose!"     => \$verbose,
    "debug!"       => \$verbose,    # --debug is an alias for --verbose
);

# GetOptions has already printed a diagnostic for each bad option; stop
# here instead of indexing with settings the user did not ask for.
die "usage: $0 [dir]" unless $result;

# Test definedness/length rather than truth so that a directory literally
# named "0" is accepted.
my $dir = shift @ARGV;
die "usage: $0 [dir]" unless (defined $dir && length $dir);

# filter.pm lives next to this script.  $FindBin::Bin is the absolute
# directory of the script, so this works when the script is started as
# "perl progspider" (no slash in $0) and when $0 is a relative path
# (a relative name given to require would be searched for in @INC).
use FindBin;
my $basedir = $FindBin::Bin;
require "$basedir/filter.pm";

# Path of the pdftotext binary; false when it is not installed, in which
# case file() does not convert PDF files.
my $pdftotext = which('pdftotext');

print STDERR "using $pdftotext to convert pdf into html\n" if ($pdftotext && $verbose);

# follow   => 1: follow symbolic links
# no_chdir => 1: do not chdir() into directories, so $_ inside file() is
#                the full path of the entry
find({ wanted => \&file,
       follow => 1,
       no_chdir => 1
     }, $dir);
34    
# file() - File::Find "wanted" callback.
#
# Takes no arguments; the entry to examine is read from $_ (the full path,
# because find() is invoked with no_chdir => 1).
#
# For a regular file ending in .pdf (only when $pdftotext is set) the file
# is converted to HTML with "pdftotext -htmlmeta".  For a regular file
# matching \.html*$ (excluding index_<letter> and index_symbol pages) the
# file is read and passed through filter() together with $collection.
# Everything else is ignored.
#
# Each resulting document is printed on STDOUT as a header block
# (Path-Name, Content-Length, Last-Mtime, Document-Type) followed by an
# empty line and the content.  Dies if a file or the pdftotext pipe cannot
# be opened; a PDF that pdftotext fails to convert is skipped with a warning.
sub file {

    my $path = $_;
    my $contents;

    if ($pdftotext && -f $path && $path =~ m/\.pdf$/i) {

        print STDERR "$path {converting}" if ($verbose);

        # List-form pipe open: the arguments are passed to pdftotext
        # directly, without a shell, so spaces and shell metacharacters
        # in $path are harmless.
        open(my $pdf, '-|', $pdftotext, '-htmlmeta', $path, '-')
            or die "can't open $pdftotext with '$path': $!";

        # Read into a lexical so File::Find's $_ is left untouched, and
        # start from '' so an empty conversion does not warn under -w.
        my $html = '';
        while (my $line = <$pdf>) {
            $html .= $line;
        }

        # close() on a pipe is false when the command exited non-zero.
        unless (close($pdf)) {
            my $why = $! ? "error: $!" : "exit status " . ($? >> 8);
            warn "$pdftotext failed on '$path' ($why), skipping\n";
            return;
        }

        $contents = "<!-- html from $path -->\n$html\n";

    } else {

        return if (! -f $path || $path !~ m/\.html*$/i);

        # skip index files
        return if ($path =~ m/index_[a-z]\.html*/i || $path =~ m/index_symbol\.html*/i);

        # Three-argument open: $path is always treated as a plain file
        # name, whatever characters it contains.
        open(my $in, '<', $path) or die "can't open file: $path: $!";
        print STDERR "$path" if ($verbose);
        $contents = join('', <$in>) . "\n\n";
        close($in);

        # filter() is not defined in this file; presumably it comes from
        # the filter.pm loaded at startup.
        $contents = filter($contents,$collection);
    }

    # die "zero size content in '$path'" if (! $contents);
    return if (! $contents);    # don't die on empty files

    # Report the file's own modification time; fall back to the current
    # time if stat() fails.  Must run before $path_add is appended below.
    my $mtime = (stat $path)[9];
    $mtime = time unless (defined $mtime);

    # Content-Length has to be a byte count, not a character count.
    my $size = do { use bytes; length $contents };

    print STDERR " [$size]\n" if ($verbose);

    # add optional components to path
    $path .= " $path_add" if ($path_add);

    # Output the document (to swish)
    print "Path-Name: $path\n",
          "Content-Length: $size\n",
          "Last-Mtime: $mtime\n",
          "Document-Type: HTML\n",
          "\n",
          $contents;

}

Properties

Name Value
cvs2svn:cvs-rev 1.6
svn:executable *

  ViewVC Help
Powered by ViewVC 1.1.26