/[perl]/HouseSpider.pl
This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /HouseSpider.pl

Parent Directory | Revision Log


Revision 1.1 - (hide annotations)
Thu May 8 16:05:12 2003 UTC (17 years, 2 months ago) by dpavlin
Branch: MAIN
CVS Tags: HEAD
File MIME type: text/plain
create index for HouseSpider

#!/usr/bin/perl -w
#
# HouseSpider.pl - emit plain-text records for every HTML page under $home.
#
# Walks the directory tree rooted at $home (breadth-first, skipping any
# entry whose name starts with a dot) and, for each *.htm / *.html file,
# prints a three-line record to STDOUT:
#
#   line 1: page text with markup removed and whitespace collapsed
#   line 2: page title (the full file path when no <title> is found)
#   line 3: file path relative to $home
#
# Dies if a directory cannot be opened; warns and skips a file that
# cannot be opened.
#

use strict;

my $home = "/home/dpavlin/private/home_html";

# Queue of directories still to scan; subdirectories are appended as found.
my @dirs = ( $home );

while (@dirs) {
    my $path = shift @dirs;

    opendir(my $dh, $path) || die "can't opendir $path: $!";
    my @entries = readdir($dh);
    closedir($dh);

    foreach my $entry (@entries) {
        next if ($entry =~ /^\./);  # skip .dot files

        my $file = "$path/$entry";

        if (-d $file) {
            push @dirs, $file;
            next;
        }

        next unless (-f $file && $entry =~ /\.html?$/i);

        # Three-argument open: the file name is never interpreted as a mode.
        my $fh;
        unless (open($fh, '<', $file)) {
            warn "can't open $file: $!";
            next;               # do not emit an empty record for this file
        }

        # Join all lines with single spaces so that the patterns below can
        # match across what were line breaks in the file.
        my $html = "";
        while (my $line = <$fh>) {
            chomp $line;
            $html .= "$line ";
        }
        close($fh);

        # Title is taken from the raw markup, before anything is stripped.
        my $title = $file;
        $title = $1 if ($html =~ m/<title[^>]*>([^<]+)<\/title>/i);

        # Remove regions excluded from indexing.  This has to happen
        # BEFORE the generic tag strip below: that strip deletes the
        # <noindex>, <!-- noindex --> and <script> markers themselves,
        # after which these patterns could never match.
        $html =~ s,<noindex>.+?</noindex>,,isg;
        $html =~ s,<!--\s*noindex\s*-->.+?<!--\s*/noindex\s*-->,,isg;
        # remove all script from indexing content (with or without attributes)
        $html =~ s,<script\b[^>]*>.*?</script>,,isg;

        # Strip remaining tags and collapse runs of whitespace.
        $html =~ s/<\/?[^>]+>//g;
        $html =~ s/\s+/ /g;

        # Path relative to $home; \Q..\E keeps any regex metacharacters
        # in $home from being interpreted as a pattern.
        my $url = $file;
        $url =~ s/^\Q$home\E//;
        $url =~ s/^\///;

        print "$html\n$title\n$url\n";
    }
}
51    

  ViewVC Help
Powered by ViewVC 1.1.26