/[perl]/HouseSpider.pl
This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /HouseSpider.pl

Parent Directory | Revision Log


Revision 1.1 - (show annotations)
Thu May 8 16:05:12 2003 UTC (16 years, 1 month ago) by dpavlin
Branch: MAIN
CVS Tags: HEAD
File MIME type: text/plain
create index for HouseSpider

#!/usr/bin/perl -w
#
# HouseSpider.pl - create index for HouseSpider
#
# Walks the directory tree rooted at $home and, for every regular file whose
# name ends in .htm or .html (case-insensitive), prints one three-line record
# to STDOUT:
#
#   line 1: page text with markup removed and whitespace collapsed
#   line 2: contents of <title>, or the full file path when there is none
#   line 3: file path relative to $home
#
# Directory entries whose name starts with a dot are skipped.
# Dies if a directory cannot be opened; warns and skips a file that cannot
# be opened.

use strict;
use warnings;

my $home = "/home/dpavlin/private/home_html";

# Queue of directories still to be scanned. Directories are taken from the
# front and sub-directories appended to the back (breadth-first walk).
my @dirs = ( $home );

while (@dirs) {
    my $path = shift @dirs;

    opendir(my $dh, $path) or die "can't opendir $path: $!";
    my @entries = readdir($dh);
    closedir($dh);

    foreach my $entry (@entries) {
        next if $entry =~ /^\./;    # skip .dot files (covers "." and "..")

        my $file = "$path/$entry";

        if (-d $file) {
            push @dirs, $file;
            next;
        }

        next unless -f $file && $entry =~ /\.html?$/i;

        # Skip unreadable files instead of reading from an unopened handle
        # and emitting an empty record for them.
        my $fh;
        unless (open($fh, '<', $file)) {
            warn "can't open $file: $!";
            next;
        }

        # Join the whole file into a single line so that the patterns
        # below can match across the original line breaks.
        my $html = "";
        while (my $line = <$fh>) {
            chomp $line;
            $html .= "$line ";
        }
        close($fh);

        my $title = $file;
        $title = $1 if ($html =~ m/<title[^>]*>([^<]+)<\/title>/i);

        # The exclusions must run BEFORE the generic tag stripping: once
        # the tags are gone there are no markers left to match against.

        # remove noindex
        $html =~ s,<noindex>.*?</noindex>,,isg;
        $html =~ s,<!--\s*noindex\s*-->.*?<!--\s*/noindex\s*-->,,isg;
        # remove all script from indexing content (tag may carry attributes)
        $html =~ s,<script\b[^>]*>.*?</script>,,isg;

        # strip the remaining markup and collapse runs of whitespace
        $html =~ s/<\/?[^>]+>//g;
        $html =~ s/\s+/ /g;

        # Make the path relative to $home. \Q...\E keeps regex
        # metacharacters in the directory name from being interpreted.
        my $url = $file;
        $url =~ s/^\Q$home\E//;
        $url =~ s/^\///;

        print "$html\n$title\n$url\n";
    }
}

  ViewVC Help
Powered by ViewVC 1.1.26