/[swish]/trunk/spider/progspider
This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk/spider/progspider

Parent Directory | Revision Log | View Patch

revision 68 by dpavlin, Thu Mar 18 11:14:49 2004 UTC | revision 92 by dpavlin, Mon Nov 22 17:09:23 2004 UTC
# Line 1  Line 1 
1  #!/usr/local/bin/perl -w  #!/usr/bin/perl -w
2  use strict;  use strict;
3  use File::Find;  use File::Find;
4  use Getopt::Long;  use Getopt::Long;
# Line 50  sub dump_contents($$$) { Line 50  sub dump_contents($$$) {
50  Path-Name: $path  Path-Name: $path
51  Content-Length: $size  Content-Length: $size
52  Last-Mtime: $mtime  Last-Mtime: $mtime
53  Document-Type: HTML  Document-Type: html*
54    
55  EOF  EOF
56          print $contents;          print $contents;
# Line 62  sub file { Line 62  sub file {
62          my $path = $_;          my $path = $_;
63          my $contents;          my $contents;
64    
65            return if (-l $path);
66    
67          if ($pdftotext && -f $path && $path =~ m/\.pdf$/i) {          if ($pdftotext && -f $path && $path =~ m/\.pdf$/i) {
68    
69                  print STDERR "$path {converting}" if ($verbose);                  print STDERR "$path {converting}" if ($verbose);
# Line 76  sub file { Line 78  sub file {
78                  }                  }
79                  close(F);                  close(F);
80    
81                    return if (! $html);
82    
83                    my $file_only = $path;
84                    $file_only =~ s/^.*\/([^\/]+)$/$1/g;
85    
86                  my ($pre_html,$pages,$post_html) = ('<html><head><title>$path :: page ##page_nr##</title></head><body><pre>',$html,'</pre></body></html>');                  my ($pre_html,$pages,$post_html) = ('<html><head><title>$path :: page ##page_nr##</title></head><body><pre>',$html,'</pre></body></html>');
87    
88                  ($pre_html,$pages,$post_html) = ($1,$2,$3) if ($html =~ m/^(<html>.+<pre>)(.+)(<\/pre>.+)$/si);                  ($pre_html,$pages,$post_html) = ($1,$2,$3) if ($html =~ m/^(<html>.+?<pre>)(.+)(<\/pre>.+?)$/si);
89    
90                  $pre_html =~ s/<title>(.+?)<\/title>/<title>$1 :: page ##page_nr##<\/title>/si;                  if ($collection) {
91                            $pre_html =~ s/<title>(.+?)<\/title>/<title>$collection :: page ##page_nr##<\/title>/si;
92                    } else {
93                            $pre_html =~ s/<title>(.+?)<\/title>/<title>$1 :: page ##page_nr##<\/title>/si ||
94                            $pre_html =~ s/<title><\/title>/<title>$file_only :: page ##page_nr##<\/title>/si;
95                    }
96    
97                  my $page_nr = 1;                  my $page_nr = 1;
98                  foreach my $page (split(/\f/,$pages)) {                  foreach my $page (split(/\f/s,$pages)) {
99                            print STDERR " $page_nr" if ($verbose);
100                          my $pre_tmp = $pre_html;                          my $pre_tmp = $pre_html;
101                          $pre_tmp =~ s/##page_nr##/$page_nr<\/title>/s;                          $pre_tmp =~ s/##page_nr##/$page_nr<\/title>/s;
102                          dump_contents($pre_tmp . $page . $post_html,time(), $path) if ($page !~ m/^\s*$/s);                          dump_contents($pre_tmp . $page . $post_html,time(), $path) if ($page !~ m/^\s*$/s);
# Line 92  sub file { Line 105  sub file {
105    
106          } else {          } else {
107    
108                  return if (! -f $path || ! m/\.html*$/i);                  return if (! -f $path || ! m/\.(html*|php|pl|txt|info|log|text)$/i);
109    
110                  # skip index files                  # skip index files
111                  return if (m/index_[a-z]\.html*/i || m/index_symbol\.html*/i);                  return if (m/index_[a-z]\.html*/i || m/index_symbol\.html*/i);

Legend:
Removed from v.68  
changed lines
  Added in v.92

  ViewVC Help
Powered by ViewVC 1.1.26