/[swish]/trunk/spider/progspider
This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk/spider/progspider

Parent Directory | Revision Log | View Patch

revision 57 by dpavlin, Sun Jan 25 16:49:50 2004 UTC | revision 63 by dpavlin, Fri Feb 6 13:29:39 2004 UTC
# Line 2  Line 2 
2  use strict;  use strict;
3  use File::Find;  use File::Find;
4  use Getopt::Long;  use Getopt::Long;
5    use File::Which;
6    
7  my $collection;         # name which will be inserted  my $collection;         # name which will be inserted
8  my $path_add;           # add additional info in path  my $path_add;           # add additional info in path
# Line 18  my $result = GetOptions( Line 19  my $result = GetOptions(
19    
20  my $dir = shift @ARGV || die "usage: $0 [dir]";  my $dir = shift @ARGV || die "usage: $0 [dir]";
21    
   
22  my $basedir = $0;  my $basedir = $0;
23  $basedir =~ s,/[^/]+$,/,;  $basedir =~ s,/[^/]+$,/,;
24  require "$basedir/filter.pm";  require "$basedir/filter.pm";
25    
26    my $pdftotext = which('pdftotext');
27    
28    print STDERR "using $pdftotext to convert pdf into html\n" if ($pdftotext && $verbose);
29    
30  find({ wanted => \&file,  find({ wanted => \&file,
31          follow => 1,          follow => 1,
# Line 31  find({ wanted => \&file, Line 34  find({ wanted => \&file,
34    
35  sub file {  sub file {
36    
37          return if (! -f || ! m/\.html*/i);          my $path = $_;
38            my $contents;
39    
40          # skip index files          if ($pdftotext && -f $path && $path =~ m/\.pdf$/i) {
         return if (m/index_[a-z]\.html*/i || m/index_symbol\.html*/i);  
41    
42          my $path = $_;                  print STDERR "$path {converting}" if ($verbose);
43    
44          open(F,"$path") || die "can't open file: $path";                  open(F,"$pdftotext -htmlmeta $path - |") || die "can't open $pdftotext with '$path'";
45          print STDERR "$path" if ($verbose);                  my $html;
46          my $contents;                  while(<F>) {
47          while(<F>) {                          # XXX why pdftotext barks if I try to use this is beyond me.
48                  $contents .= "$_";                          #$contents .= $_;
         }  
         $contents .= "\n\n";  
49    
50          $contents = filter($contents,$collection);                          $html .= $_;
51                    }
52                    close(F);
53    
54                    $contents = "<!-- html from $path -->\n$html\n";
55    
56            } else {
57    
58                    return if (! -f $path || ! m/\.html*$/i);
59    
60                    # skip index files
61                    return if (m/index_[a-z]\.html*/i || m/index_symbol\.html*/i);
62    
63                    open(F,"$path") || die "can't open file: $path";
64                    print STDERR "$path" if ($verbose);
65                    while(<F>) {
66                            $contents .= "$_";
67                    }
68                    $contents .= "\n\n";
69    
70                    $contents = filter($contents,$collection);
71            }
72    
73  #       die "zero size content in '$path'" if (! $contents);  #       die "zero size content in '$path'" if (! $contents);
74          return if (! $contents);        # don't die on empty files          return if (! $contents);        # don't die on empty files

Legend:
Removed from v.57  
changed lines
  Added in v.63

  ViewVC Help
Powered by ViewVC 1.1.26