| Revision 20 (by dpavlin, 2003/05/08 16:05:12) |
create index for HouseSpider
|
#!/usr/bin/perl -w
#
use strict;
# Walk the HTML tree under $home and print one index record per *.htm/*.html
# file to STDOUT. Each record is three lines:
#
#   1. the page text (markup removed, whitespace collapsed to single spaces)
#   2. the page title (contents of <title>, or the file path if none found)
#   3. the file path relative to $home
#
# Dies if a directory cannot be opened; a file that cannot be opened is
# reported with a warning and skipped.

# Root of the tree to index; also the prefix stripped from each file path
# to build the relative URL.
my $home = "/home/dpavlin/private/home_html";

# Queue of directories still to scan. Sub-directories are appended as they
# are found, so the tree is processed breadth-first.
my @dirs = ( $home );

while (@dirs) {
    my $path = shift @dirs;

    opendir(my $dh, $path) or die "can't opendir $path: $!";
    my @entries = readdir($dh);
    closedir($dh);

    ENTRY: foreach my $entry (@entries) {
        next ENTRY if ($entry =~ /^\./);    # skip .dot files

        my $file = "$path/$entry";

        if (-d $file) {
            push @dirs, $file;
            next ENTRY;
        }

        next ENTRY unless (-f $file && $entry =~ /\.html?$/i);

        # Three-argument open so a file name is never interpreted as a mode
        # or a pipe. An unreadable file is skipped rather than emitted as an
        # empty record.
        my $fh;
        unless (open($fh, '<', $file)) {
            warn "can't open $file: $!";
            next ENTRY;
        }

        # Join all lines into one string (a space replaces each newline) so
        # the patterns below can match across the original line breaks.
        my $html = "";
        while (my $line = <$fh>) {
            chomp $line;
            $html .= "$line ";
        }
        close($fh);

        # Title defaults to the file path. An extracted title is taken from
        # the raw markup, so normalise its whitespace (this also drops a
        # stray CR) to keep the record strictly one line per field.
        my $title = $file;
        if ($html =~ m/<title[^>]*>([^<]+)<\/title>/i) {
            my $found = $1;
            $found =~ s/\s+/ /g;
            $found =~ s/^ //;
            $found =~ s/ $//;
            $title = $found if (length $found);
        }

        # Remove excluded regions FIRST, while their delimiting tags still
        # exist. Doing this after the generic tag strip can never match.

        # remove noindex
        $html =~ s,<noindex>.*?</noindex>,,isg;
        $html =~ s,<!--\s*noindex\s*-->.*?<!--\s*/noindex\s*-->,,isg;
        # remove all script from indexing content (opening tag may carry
        # attributes such as type= or src=)
        $html =~ s,<script\b[^>]*>.*?</script\s*>,,isg;

        # Now strip the remaining markup and collapse whitespace.
        $html =~ s/<\/?[^>]+>//g;
        $html =~ s/\s+/ /g;

        # Relative URL: file path without the $home prefix and leading
        # slash. \Q...\E makes $home match literally, not as a pattern.
        my $url = $file;
        $url =~ s/^\Q$home\E//;
        $url =~ s/^\///;

        print "$html\n$title\n$url\n";
    }
}