#!/usr/bin/perl -w
#
# Walk the directory tree rooted at $home and, for every regular file whose
# name ends in .htm or .html (case-insensitive), print three lines on STDOUT:
#
#   1. the text of the page (markup removed, whitespace collapsed)
#   2. the page title (contents of <title>, or the full file path if the
#      page has no title)
#   3. the path of the file relative to $home
#
# Anything inside <noindex>...</noindex>, between <!-- noindex --> and
# <!-- /noindex --> comments, or inside <script> elements is left out of
# the printed text.  Entries whose name starts with a dot are skipped.

use strict;

# Root of the tree to scan.
my $home = "/home/dpavlin/private/home_html";

# Queue of directories still to visit; sub-directories are appended as they
# are found, so the tree is walked breadth-first.
my @dirs = ( $home );

while (@dirs) {
    my $path = shift @dirs;

    opendir(my $dir, $path) || die "can't opendir $path: $!";
    my @entries = readdir($dir);
    closedir($dir);

    foreach my $entry (@entries) {
        next if ($entry =~ /^\./);    # skip .dot files

        my $file = "$path/$entry";

        if (-d $file) {
            push @dirs, $file;
            next;
        }

        next unless (-f $file && $entry =~ /\.html?$/i);

        # Three-argument open with a lexical handle, so odd file names
        # cannot be interpreted as an open mode.  A file that cannot be
        # read is reported and skipped instead of being read from a
        # closed handle.
        my $fh;
        unless (open($fh, '<', $file)) {
            warn "can't open $file: $!";
            next;
        }

        # Join all lines with single spaces so that patterns below can
        # match across what used to be line breaks.
        my $html = "";
        while (my $line = <$fh>) {
            chomp $line;
            $html .= "$line ";
        }
        close($fh);

        # Default title is the file path; use <title> when present.
        my $title = $file;
        $title = $1 if ($html =~ m/<title[^>]*>([^<]+)<\/title>/i);

        # Excluded regions must be removed BEFORE the generic tag strip:
        # once the tags are gone the markers can no longer be found and
        # the excluded text would stay in the output.

        # remove noindex
        $html =~ s,<noindex>.*?</noindex>,,isg;
        $html =~ s,<!--\s*noindex\s*-->.*?<!--\s*/noindex\s*-->,,isg;
        # remove all script from indexing content (with or without
        # attributes on the opening tag, including empty elements)
        $html =~ s,<script\b[^>]*>.*?</script>,,isg;

        # Strip remaining markup and collapse runs of whitespace.
        $html =~ s/<\/?[^>]+>//g;
        $html =~ s/\s+/ /g;

        # Path relative to $home; \Q...\E keeps regex metacharacters in
        # the directory name from being interpreted as a pattern.
        my $url = $file;
        $url =~ s/^\Q$home\E//;
        $url =~ s/^\///;

        print "$html\n$title\n$url\n";
    }
}