#!/usr/bin/perl -w
#
# Walk the directory tree rooted at $home and, for every regular file whose
# name ends in .htm or .html, print three lines on STDOUT:
#
#   1. the page text (markup removed, whitespace collapsed to single spaces)
#   2. the page title (contents of <title>, or the file path if there is none)
#   3. the file path relative to $home
#
# Entries whose names start with a dot are skipped, directories included.

use strict;
use warnings;

my $home = "/home/dpavlin/private/home_html";

# Queue of directories still to be scanned; subdirectories are appended as
# they are discovered.
my @dirs = ( $home );

while (@dirs) {
    my $path = shift @dirs;

    opendir(my $dh, $path) || die "can't opendir $path: $!";
    my @entries = readdir($dh);
    closedir($dh);

    foreach my $entry (@entries) {
        next if ($entry =~ /^\./);    # skip .dot files

        my $file = "$path/$entry";

        if (-d $file) {
            push @dirs, $file;
            next;
        }

        next unless (-f $file && $entry =~ /\.html?$/i);

        # Three-arg open with a lexical handle: the file name can never be
        # interpreted as an open mode, and no global handle is shared.
        my $fh;
        unless (open($fh, '<', $file)) {
            # An unreadable file is reported and skipped instead of being
            # emitted as an empty record.
            warn "can't open $file: $!";
            next;
        }

        # Join the whole file into one line so the patterns below can match
        # across the original line breaks.
        my $html = "";
        while (my $line = <$fh>) {
            chomp $line;
            $html .= "$line ";
        }
        close($fh);

        # Title defaults to the file path when the page has no <title>.
        my $title = $file;
        $title = $1 if ($html =~ m/<title[^>]*>([^<]+)<\/title>/i);

        $html = strip_markup($html);

        # URL is the path relative to $home; \Q..\E makes $home match
        # literally rather than as a regex.
        my $url = $file;
        $url =~ s/^\Q$home\E//;
        $url =~ s/^\///;

        print "$html\n$title\n$url\n";
    }
}

# Return the indexable text of an HTML string: drop noindex regions and
# script elements, then remove all remaining tags and collapse whitespace.
sub strip_markup {
    my ($html) = @_;

    # These removals must run BEFORE the generic tag strip below; once the
    # tags (and comment markers) are gone there is nothing left to anchor on.
    # .*? (not .+?) so an empty element cannot make the match run on to the
    # next closing tag and swallow the text in between.
    $html =~ s,<noindex>.*?</noindex>,,isg;
    $html =~ s,<!--\s*noindex\s*-->.*?<!--\s*/noindex\s*-->,,isg;

    # remove all script from indexing content (with or without attributes)
    $html =~ s,<script\b[^>]*>.*?</script>,,isg;

    $html =~ s/<\/?[^>]+>//g;
    $html =~ s/\s+/ /g;

    return $html;
}