/[swish]/trunk/spider/progspider
This is a repository of my old source code, which is no longer updated. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk/spider/progspider

Parent Directory | Revision Log | View Patch

revision 72 by dpavlin, Tue Apr 6 15:06:58 2004 UTC | revision 95 by dpavlin, Sun Apr 24 16:33:53 2005 UTC
# Line 1  Line 1 
1  #!/usr/local/bin/perl -w  #!/usr/bin/perl -w
2  use strict;  use strict;
3  use File::Find;  use File::Find;
4  use Getopt::Long;  use Getopt::Long;
# Line 7  use File::Which; Line 7  use File::Which;
7  my $collection;         # name which will be inserted  my $collection;         # name which will be inserted
8  my $path_add;           # add additional info in path  my $path_add;           # add additional info in path
9  my $verbose;  my $verbose;
10    my $exclude;
11    
12  #$verbose = 1;  #$verbose = 1;
13    
# Line 15  my $result = GetOptions( Line 16  my $result = GetOptions(
16          "path=s" => \$path_add,          "path=s" => \$path_add,
17          "verbose!" => \$verbose,          "verbose!" => \$verbose,
18          "debug!" => \$verbose,          "debug!" => \$verbose,
19            "exclude=s" => \$exclude,
20  );  );
21    
22  my $dir = shift @ARGV || die "usage: $0 [dir]";  my $dir = shift @ARGV || die "usage: $0 [dir]";
# Line 38  find({ wanted => \&file, Line 40  find({ wanted => \&file,
40  sub dump_contents($$$) {  sub dump_contents($$$) {
41          my ($contents,$mtime,$path) = @_;          my ($contents,$mtime,$path) = @_;
42    
43          return if (! $contents);        # don't die on empty files          return unless ($contents);      # don't die on empty files
44    
45            if ($exclude && $path =~ m/$exclude/i) {
46                    print STDERR "skip: $path\n" if ($verbose);
47                    return;
48            }
49    
50          use bytes;          use bytes;
51          my $size = length $contents;          my $size = length $contents;
# Line 50  sub dump_contents($$$) { Line 57  sub dump_contents($$$) {
57  Path-Name: $path  Path-Name: $path
58  Content-Length: $size  Content-Length: $size
59  Last-Mtime: $mtime  Last-Mtime: $mtime
60  Document-Type: HTML  Document-Type: html*
61    
62  EOF  EOF
63          print $contents;          print $contents;
# Line 62  sub file { Line 69  sub file {
69          my $path = $_;          my $path = $_;
70          my $contents;          my $contents;
71    
72            return if (-l $path);
73    
74          if ($pdftotext && -f $path && $path =~ m/\.pdf$/i) {          if ($pdftotext && -f $path && $path =~ m/\.pdf$/i) {
75    
76                  print STDERR "$path {converting}" if ($verbose);                  print STDERR "$path {converting}" if ($verbose);
# Line 76  sub file { Line 85  sub file {
85                  }                  }
86                  close(F);                  close(F);
87    
88                    return if (! $html);
89    
90                    my $file_only = $path;
91                    $file_only =~ s/^.*\/([^\/]+)$/$1/g;
92    
93                  my ($pre_html,$pages,$post_html) = ('<html><head><title>$path :: page ##page_nr##</title></head><body><pre>',$html,'</pre></body></html>');                  my ($pre_html,$pages,$post_html) = ('<html><head><title>$path :: page ##page_nr##</title></head><body><pre>',$html,'</pre></body></html>');
94    
95                  ($pre_html,$pages,$post_html) = ($1,$2,$3) if ($html =~ m/^(<html>.+?<pre>)(.+)(<\/pre>.+?)$/si);                  ($pre_html,$pages,$post_html) = ($1,$2,$3) if ($html =~ m/^(<html>.+?<pre>)(.+)(<\/pre>.+?)$/si);
# Line 83  sub file { Line 97  sub file {
97                  if ($collection) {                  if ($collection) {
98                          $pre_html =~ s/<title>(.+?)<\/title>/<title>$collection :: page ##page_nr##<\/title>/si;                          $pre_html =~ s/<title>(.+?)<\/title>/<title>$collection :: page ##page_nr##<\/title>/si;
99                  } else {                  } else {
100                          $pre_html =~ s/<title>(.+?)<\/title>/<title>$1 :: page ##page_nr##<\/title>/si;                          $pre_html =~ s/<title>(.+?)<\/title>/<title>$1 :: page ##page_nr##<\/title>/si ||
101                            $pre_html =~ s/<title><\/title>/<title>$file_only :: page ##page_nr##<\/title>/si;
102                  }                  }
103    
104                  my $page_nr = 1;                  my $page_nr = 1;
# Line 97  sub file { Line 112  sub file {
112    
113          } else {          } else {
114    
115                  return if (! -f $path || ! m/\.html*$/i);                  return if (! -f $path || ! m/\.(html*|php|pl|txt|info|log|text)$/i);
116    
117                  # skip index files                  # skip index files
118                  return if (m/index_[a-z]\.html*/i || m/index_symbol\.html*/i);                  return if (m/index_[a-z]\.html*/i || m/index_symbol\.html*/i);

Legend:
Removed from v.72  
changed lines
  Added in v.95

  ViewVC Help
Powered by ViewVC 1.1.26