/[swish]/trunk/spider/progspider
This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk/spider/progspider

Parent Directory | Revision Log | View Patch

revision 81 by dpavlin, Sat Aug 28 22:15:59 2004 UTC revision 99 by dpavlin, Sat Apr 30 20:20:42 2005 UTC
# Line 7  use File::Which; Line 7  use File::Which;
7  my $collection;         # name which will be inserted  my $collection;         # name which will be inserted
8  my $path_add;           # add additional info in path  my $path_add;           # add additional info in path
9  my $verbose;  my $verbose;
10    my $exclude;
11    my $skip_output;
12    
13  #$verbose = 1;  #$verbose = 1;
14    
# Line 15  my $result = GetOptions( Line 17  my $result = GetOptions(
17          "path=s" => \$path_add,          "path=s" => \$path_add,
18          "verbose!" => \$verbose,          "verbose!" => \$verbose,
19          "debug!" => \$verbose,          "debug!" => \$verbose,
20            "exclude=s" => \$exclude,
21            "skipoutput!" => \$skip_output,
22  );  );
23    
24  my $dir = shift @ARGV || die "usage: $0 [dir]";  die "usage: $0 [dir] ..." unless (@ARGV);
25    
26  my $basedir = $0;  my $basedir = $0;
27  $basedir =~ s,/[^/]+$,/,;  $basedir =~ s,/[^/]+$,/,;
# Line 30  select(STDOUT); $|=1; Line 34  select(STDOUT); $|=1;
34    
35  print STDERR "using $pdftotext to convert pdf into html\n" if ($pdftotext && $verbose);  print STDERR "using $pdftotext to convert pdf into html\n" if ($pdftotext && $verbose);
36    
37  find({ wanted => \&file,  while (my $dir = shift @ARGV) {
38          follow => 1,          print STDERR "indexing $dir\n" if ($verbose);
39          no_chdir => 1  
40  }, $dir);          find({ wanted => \&file,
41                    follow => 1,
42                    no_chdir => 1
43            }, $dir);
44    }
45    
46  sub dump_contents($$$) {  sub dump_contents($$$) {
47          my ($contents,$mtime,$path) = @_;          my ($contents,$mtime,$path) = @_;
48    
49          return if (! $contents);        # don't die on empty files          return unless ($contents);      # don't die on empty files
50    
51            if ($exclude && $path =~ m/$exclude/i) {
52                    print STDERR "skip: $path\n" if ($verbose);
53                    return;
54            }
55    
56          use bytes;          use bytes;
57          my $size = length $contents;          my $size = length $contents;
58    
59          print STDERR " [$size]" if ($verbose);          print STDERR " [$size]" if ($verbose);
60    
61            return if ($skip_output);
62    
63          # Output the document (to swish)          # Output the document (to swish)
64          print <<EOF;          print <<EOF;
65  Path-Name: $path  Path-Name: $path
# Line 62  sub file { Line 77  sub file {
77          my $path = $_;          my $path = $_;
78          my $contents;          my $contents;
79    
80            return if (-l $path);
81    
82          if ($pdftotext && -f $path && $path =~ m/\.pdf$/i) {          if ($pdftotext && -f $path && $path =~ m/\.pdf$/i) {
83    
84                  print STDERR "$path {converting}" if ($verbose);                  print STDERR "$path {converting}" if ($verbose);
# Line 78  sub file { Line 95  sub file {
95    
96                  return if (! $html);                  return if (! $html);
97    
98                    my $file_only = $path;
99                    $file_only =~ s/^.*\/([^\/]+)$/$1/g;
100    
101                  my ($pre_html,$pages,$post_html) = ('<html><head><title>$path :: page ##page_nr##</title></head><body><pre>',$html,'</pre></body></html>');                  my ($pre_html,$pages,$post_html) = ('<html><head><title>$path :: page ##page_nr##</title></head><body><pre>',$html,'</pre></body></html>');
102    
103                  ($pre_html,$pages,$post_html) = ($1,$2,$3) if ($html =~ m/^(<html>.+?<pre>)(.+)(<\/pre>.+?)$/si);                  ($pre_html,$pages,$post_html) = ($1,$2,$3) if ($html =~ m/^(<html>.+?<pre>)(.+)(<\/pre>.+?)$/si);
# Line 85  sub file { Line 105  sub file {
105                  if ($collection) {                  if ($collection) {
106                          $pre_html =~ s/<title>(.+?)<\/title>/<title>$collection :: page ##page_nr##<\/title>/si;                          $pre_html =~ s/<title>(.+?)<\/title>/<title>$collection :: page ##page_nr##<\/title>/si;
107                  } else {                  } else {
108                          $pre_html =~ s/<title>(.+?)<\/title>/<title>$1 :: page ##page_nr##<\/title>/si;                          $pre_html =~ s/<title>(.+?)<\/title>/<title>$1 :: page ##page_nr##<\/title>/si ||
109                            $pre_html =~ s/<title><\/title>/<title>$file_only :: page ##page_nr##<\/title>/si;
110                  }                  }
111    
112                  my $page_nr = 1;                  my $page_nr = 1;
# Line 107  sub file { Line 128  sub file {
128                  open(F,"$path") || die "can't open file: $path";                  open(F,"$path") || die "can't open file: $path";
129                  print STDERR "$path" if ($verbose);                  print STDERR "$path" if ($verbose);
130                  while(<F>) {                  while(<F>) {
131                          $contents .= "$_";                          $contents .= $_;
132                  }                  }
133                  $contents .= "\n\n";                  $contents .= "\n\n";
134    

Legend:
Removed from v.81  
changed lines
  Added in v.99

  ViewVC Help
Powered by ViewVC 1.1.26