--- trunk/perl/scripts/est-spider 2005/09/17 21:22:26 26
+++ trunk/perl/scripts/est-spider 2005/09/17 23:07:52 27
@@ -5,6 +5,8 @@
use File::Which;
use HyperEstraier;
use Text::Iconv;
+#use File::MMagic;
+use File::MMagic::XS qw/:compat/;
# do we use Node API?
my $node_url;
@@ -33,6 +35,9 @@
my $pdftotext = which('pdftotext');
+#my $mm = new File::MMagic('/usr/share/misc/file/magic');
+my $mm = new File::MMagic::XS();
+
my $iconv = new Text::Iconv('iso-8859-2', 'utf-8');
select(STDERR); $|=1;
@@ -95,9 +100,14 @@
my $title = $1 if ($contents =~ m#<title>(.+)</title>#is);
+ # chop long titles to 100 chars
+ $title = substr($title, 0, 100) . '...' if ($title && length($title) > 100);
+ # use path if no title is found
+ $title ||= $path;
+
# add attributes to the document object
$doc->add_attr('@uri', "file:///$path");
- $doc->add_attr('@title', $title || $path);
+ $doc->add_attr('@title', $iconv->convert($title));
$doc->add_attr('@size', $size);
$doc->add_attr('@mtime', $mtime);
@@ -123,9 +133,14 @@
my $path = $_;
my $contents;
- return if (-l $path);
+ return if (-l $path || $path =~ m#/.svn#);
+
+ my $type = $mm->checktype_filename($path);
+ $type =~ s/\s+/ /gs;
+
+ print STDERR "# $path $type\n" if ($verbose);
- if ($pdftotext && -f $path && $path =~ m/\.pdf$/i) {
+ if ($pdftotext && -f $path && $type =~ m/pdf/i) {
print STDERR "$path {converting}" if ($verbose);
@@ -166,13 +181,17 @@
} else {
- return if (! -f $path || ! m/\.(html*|php|pl|txt|info|log|text)$/i);
+# return if (! -f $path || ! m/\.(html*|php|pl|txt|info|log|text)$/i);
+ return if (! -f $path || (
+ $type !~ m/html/ ||
+ ($type !~ m#text/plain# && m/\.(php|pl|txt|info|log|text)$/)
+ ));
# skip index files
return if (m/index_[a-z]\.html*/i || m/index_symbol\.html*/i);
open(F,"$path") || die "can't open file: $path";
- print STDERR "$path" if ($verbose);
+ print STDERR "$path ($type)" if ($verbose);
while(<F>) {
$contents .= "$_";
}