--- trunk/perl/scripts/est-spider 2005/09/17 21:22:26 26
+++ trunk/perl/scripts/est-spider 2005/09/17 23:07:52 27
@@ -5,6 +5,8 @@
 use File::Which;
 use HyperEstraier;
 use Text::Iconv;
+#use File::MMagic;
+use File::MMagic::XS qw/:compat/;
 
 # do we use Node API?
 my $node_url;
@@ -33,6 +35,9 @@
 
 my $pdftotext = which('pdftotext');
 
+#my $mm = new File::MMagic('/usr/share/misc/file/magic');
+my $mm = new File::MMagic::XS();
+
 my $iconv = new Text::Iconv('iso-8859-2', 'utf-8');
 
 select(STDERR); $|=1;
@@ -95,9 +100,14 @@
 
 	my $title = $1 if ($contents =~ m#<title>(.+)</title>#is);
 
+	# chop long titles to 100 chars
+	$title = substr($title, 0, 100) . '...' if ($title && length($title) > 100);
+	# use path if no title is found
+	$title ||= $path;
+
 	# add attributes to the document object
 	$doc->add_attr('@uri', "file:///$path");
-	$doc->add_attr('@title', $title || $path);
+	$doc->add_attr('@title', $iconv->convert($title));
 	$doc->add_attr('@size', $size);
 	$doc->add_attr('@mtime', $mtime);
 
@@ -123,9 +133,14 @@
 	my $path = $_;
 	my $contents;
 
-	return if (-l $path);
+	return if (-l $path || $path =~ m#/.svn#);
+
+	my $type = $mm->checktype_filename($path);
+	$type =~ s/\s+/ /gs;
+
+	print STDERR "# $path $type\n" if ($verbose);
 
-	if ($pdftotext && -f $path && $path =~ m/\.pdf$/i) {
+	if ($pdftotext && -f $path && $type =~ m/pdf/i) {
 
 		print STDERR "$path {converting}" if ($verbose);
 
@@ -166,13 +181,17 @@
 
 	} else {
 
-		return if (! -f $path || ! m/\.(html*|php|pl|txt|info|log|text)$/i);
+#		return if (! -f $path || ! m/\.(html*|php|pl|txt|info|log|text)$/i);
+		return if (! -f $path || (
+			$type !~ m/html/ ||
+			($type !~ m#text/plain# && m/\.(php|pl|txt|info|log|text)$/)
+		));
 
 		# skip index files
 		return if (m/index_[a-z]\.html*/i || m/index_symbol\.html*/i);
 
 		open(F,"$path") || die "can't open file: $path";
-		print STDERR "$path" if ($verbose);
+		print STDERR "$path ($type)" if ($verbose);
 		while(<F>) {
 			$contents .= "$_";
 		}