/[swish]/trunk/spider/progspider
This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/spider/progspider

Parent Directory | Revision Log


Revision 46 - (hide annotations)
Sat Jan 17 23:57:55 2004 UTC (20 years, 3 months ago) by dpavlin
File size: 785 byte(s)
- moved text/html content filtering to filter.pm to facilitate code re-use
- added progspider which can be used with -S prog to crawl files and
  use filtering subroutines

#!/usr/local/bin/perl -w
# progspider - walk a directory tree with File::Find and pass every entry
# to file(), which prints matching HTML files on STDOUT as documents
# (headers followed by the filtered content).
#
# Usage: progspider dir
use strict;
use File::Basename qw(dirname);
use File::Find;

# Directory to crawl; the script dies with a usage message without it.
my $dir = shift @ARGV || die "usage: $0 [dir]";

# filter.pm is loaded from the directory that contains this script.
# dirname() yields "." when $0 has no directory component (for example
# "perl progspider dir"); a plain substitution on $0 would leave the
# script name itself in $basedir in that case and the require would fail.
my $basedir = dirname($0);
require "$basedir/filter.pm";

find({ wanted   => \&file,
       follow   => 1,       # follow symbolic links
       no_chdir => 1        # do not chdir; $_ is the full path in file()
     }, $dir);
16    
# File::Find "wanted" callback: print one HTML file as a document.
#
# Input:  $_ holds the path of the current entry (find() is run with
#         no_chdir, so this is the full path).
# Output: on STDOUT, a header block (Path-Name, Content-Length, Last-Mtime,
#         Document-Type) followed by the content returned by filter();
#         on STDERR, a progress line "path [size]".
# Dies if the file cannot be opened.
sub file {

    my $path = $_;

    # Only regular files whose name ends in .htm or .html.  The pattern is
    # anchored at the end of the path so that names such as
    # "page.html.bak", or any file below a directory like "site.html.d/",
    # are not picked up.
    return unless -f $path && $path =~ m/\.html?\z/i;

    open(my $fh, '<', $path) || die "can't open file: $path: $!";
    print STDERR "$path";

    # Report the modification time of the file itself rather than the time
    # of the crawl; fall back to "now" only if stat fails.
    my $mtime = (stat $fh)[9];
    $mtime = time unless defined $mtime;

    # Slurp the whole file; keep $contents defined even for an empty file
    # so that filter() and length() do not see undef.
    my $contents = do { local $/; <$fh> };
    $contents = '' unless defined $contents;
    close($fh);

    # $contents =~ s/<(\/*\w+)\s+>/<$1>/g;

    $contents = filter($contents);

    # NOTE(review): length() counts characters; if filter() ever returns
    # decoded (non-byte) text this may differ from the byte count of what
    # is printed below - confirm against filter.pm.
    my $size = length $contents;

    print STDERR " [$size]\n";

    # Output the document (to swish)
    print <<EOF;
Path-Name: $path
Content-Length: $size
Last-Mtime: $mtime
Document-Type: HTML

EOF

    print $contents;

}

Properties

Name Value
cvs2svn:cvs-rev 1.1
svn:executable *

  ViewVC Help
Powered by ViewVC 1.1.26