5 |
use File::Which; |
use File::Which; |
6 |
use HyperEstraier; |
use HyperEstraier; |
7 |
use Text::Iconv; |
use Text::Iconv; |
8 |
|
#use File::MMagic; |
9 |
|
use File::MMagic::XS qw/:compat/; |
10 |
|
|
11 |
# do we use Node API? |
# do we use Node API? |
12 |
my $node_url; |
my $node_url; |
35 |
|
|
36 |
my $pdftotext = which('pdftotext'); |
my $pdftotext = which('pdftotext'); |
37 |
|
|
38 |
|
#my $mm = new File::MMagic('/usr/share/misc/file/magic'); |
39 |
|
my $mm = new File::MMagic::XS(); |
40 |
|
|
41 |
my $iconv = new Text::Iconv('iso-8859-2', 'utf-8'); |
my $iconv = new Text::Iconv('iso-8859-2', 'utf-8'); |
42 |
|
|
43 |
select(STDERR); $|=1; |
select(STDERR); $|=1; |
67 |
|
|
68 |
find({ wanted => \&file, |
find({ wanted => \&file, |
69 |
follow => 1, |
follow => 1, |
70 |
no_chdir => 1 |
follow_skip => 2, |
71 |
|
no_chdir => 1, |
72 |
}, $dir); |
}, $dir); |
73 |
|
|
74 |
unless ($node_url) { |
unless ($node_url) { |
100 |
|
|
101 |
my $title = $1 if ($contents =~ m#<title>(.+)</title>#is); |
my $title = $1 if ($contents =~ m#<title>(.+)</title>#is); |
102 |
|
|
103 |
|
# chop long titles to 100 chars |
104 |
|
$title = substr($title, 0, 100) . '...' if ($title && length($title) > 100); |
105 |
|
# use path if no title is found |
106 |
|
$title ||= $path; |
107 |
|
|
108 |
# add attributes to the document object |
# add attributes to the document object |
109 |
$doc->add_attr('@uri', "file:///$path"); |
$doc->add_attr('@uri', "file:///$path"); |
110 |
$doc->add_attr('@title', $title || $path); |
$doc->add_attr('@title', $iconv->convert($title)); |
111 |
$doc->add_attr('@size', $size); |
$doc->add_attr('@size', $size); |
112 |
$doc->add_attr('@mtime', $mtime); |
$doc->add_attr('@mtime', $mtime); |
113 |
|
|
133 |
my $path = $_; |
my $path = $_; |
134 |
my $contents; |
my $contents; |
135 |
|
|
136 |
return if (-l $path); |
return if (-l $path || $path =~ m#/.svn#); |
137 |
|
|
138 |
|
my $type = $mm->checktype_filename($path); |
139 |
|
$type =~ s/\s+/ /gs; |
140 |
|
|
141 |
|
print STDERR "# $path $type\n" if ($verbose); |
142 |
|
|
143 |
if ($pdftotext && -f $path && $path =~ m/\.pdf$/i) { |
if ($pdftotext && -f $path && $type =~ m/pdf/i) { |
144 |
|
|
145 |
print STDERR "$path {converting}" if ($verbose); |
print STDERR "$path {converting}" if ($verbose); |
146 |
|
|
181 |
|
|
182 |
} else { |
} else { |
183 |
|
|
184 |
return if (! -f $path || ! m/\.(html*|php|pl|txt|info|log|text)$/i); |
# return if (! -f $path || ! m/\.(html*|php|pl|txt|info|log|text)$/i); |
185 |
|
return if (! -f $path || ( |
186 |
|
$type !~ m/html/ || |
187 |
|
($type !~ m#text/plain# && m/\.(php|pl|txt|info|log|text)$/) |
188 |
|
)); |
189 |
|
|
190 |
# skip index files |
# skip index files |
191 |
return if (m/index_[a-z]\.html*/i || m/index_symbol\.html*/i); |
return if (m/index_[a-z]\.html*/i || m/index_symbol\.html*/i); |
192 |
|
|
193 |
open(F,"$path") || die "can't open file: $path"; |
open(F,"$path") || die "can't open file: $path"; |
194 |
print STDERR "$path" if ($verbose); |
print STDERR "$path ($type)" if ($verbose); |
195 |
while(<F>) { |
while(<F>) { |
196 |
$contents .= "$_"; |
$contents .= "$_"; |
197 |
} |
} |