--- trunk/perl/scripts/est-spider 2005/09/03 19:16:48 5
+++ trunk/perl/scripts/est-spider 2005/09/17 23:55:09 29
@@ -5,6 +5,11 @@
use File::Which;
use HyperEstraier;
use Text::Iconv;
+#use File::MMagic;
+use File::MMagic::XS qw/:compat/;
+
+# do we use Node API?
+my $node_url;
my $collection; # name which will be inserted
my $path_add; # add additional info in path
@@ -12,13 +17,17 @@
my $exclude;
#$verbose = 1;
+my $debug = 0;
+my $force = 0;
my $result = GetOptions(
"collection=s" => \$collection,
"path=s" => \$path_add,
"verbose!" => \$verbose,
- "debug!" => \$verbose,
+ "debug!" => \$debug,
"exclude=s" => \$exclude,
+ "node=s" => \$node_url,
+ "force!" => \$force,
);
my $dir = shift @ARGV || die "usage: $0 [dir]";
@@ -29,6 +38,9 @@
my $pdftotext = which('pdftotext');
+#my $mm = new File::MMagic('/usr/share/misc/file/magic');
+my $mm = new File::MMagic::XS();
+
my $iconv = new Text::Iconv('iso-8859-2', 'utf-8');
select(STDERR); $|=1;
@@ -36,31 +48,39 @@
print STDERR "using $pdftotext to convert pdf into html\n" if ($pdftotext && $verbose);
+my $db;
+if ($node_url) {
+ $db = HyperEstraier::Node->new($node_url);
+ $db->set_auth('admin', 'admin');
+} else {
+ # open the database
+ $db = HyperEstraier::Database->new();
+ $db->open('/tmp/casket', $HyperEstraier::Database::DBWRITER | $HyperEstraier::Database::DBCREAT);
+
+ sub signal {
+ my($sig) = @_;
+ print "\nCaught a SIG$sig--syncing database and shutting down\n";
+ $db->sync();
+ exit(0);
+ }
-# open the database
-my $db = HyperEstraier::Database->new();
-$db->open('/tmp/casket', $HyperEstraier::Database::DBWRITER | $HyperEstraier::Database::DBCREAT);
-
-sub signal {
- my($sig) = @_;
- print "\nCaught a SIG$sig--syncing database and shutting down\n";
- $db->sync();
- exit(0);
+ $SIG{'INT'} = \&signal;
+ $SIG{'QUIT'} = \&signal;
}
-$SIG{'INT'} = \&signal;
-$SIG{'QUIT'} = \&signal;
-
find({ wanted => \&file,
follow => 1,
- no_chdir => 1
+ follow_skip => 2,
+ no_chdir => 1,
}, $dir);
-print "--- sync\n";
-$db->sync();
+unless ($node_url) {
+ print "--- sync\n";
+ $db->sync();
-print "--- optimize...\n";
-$db->optimize(0);
+ print "--- optimize...\n";
+ $db->optimize(0);
+}
exit;
sub dump_contents($$$$) {
@@ -81,11 +101,16 @@
# create a document object
my $doc = HyperEstraier::Document->new;
- my $title = $1 if ($contents =~ m#<title>(.+)</title>#is);
+ my $title = $1 if ($contents =~ m#<title>(.+?)</title>#is);
+
+ # chop long titles to 100 chars
+ $title = substr($title, 0, 100) . '...' if ($title && length($title) > 100);
+ # use path if no title is found
+ $title ||= $path;
# add attributes to the document object
$doc->add_attr('@uri', "file:///$path");
- $doc->add_attr('@title', $title || $path);
+ $doc->add_attr('@title', $iconv->convert($title));
$doc->add_attr('@size', $size);
$doc->add_attr('@mtime', $mtime);
@@ -98,7 +123,11 @@
# print $doc->dump_draft if ($verbose);
# register the document object to the database
- $db->put_doc($doc, $HyperEstraier::Database::PDCLEAN);
+ if ($node_url) {
+ $db->put_doc($doc);
+ } else {
+ $db->put_doc($doc, $HyperEstraier::Database::PDCLEAN);
+ }
}
@@ -107,9 +136,27 @@
my $path = $_;
my $contents;
- return if (-l $path);
+ return if (! $force && -l $path || $path =~ m#/.svn# || $path =~ m/(~|.bak)$/);
+
+ my $mtime = (stat($path))[9];
+ my $mtime_db = $db->get_doc_attr_by_uri("file:///$path", '@mtime') || -2;
+
+ if ($mtime == $mtime_db) {
+ print STDERR "# same: $path $mtime\n" if ($verbose);
+ return unless($force);
+ } else {
+ print STDERR "# changed: $path $mtime != $mtime_db\n" if ($debug);
+ }
+
+ # skip files on which File::MMagic::XS croaks
+ return if ($path =~ m#\.au$#);
+
+ my $type = $mm->checktype_filename($path);
+ $type =~ s/\s+/ /gs;
+
+ print STDERR "# $path $type\n" if ($debug);
- if ($pdftotext && -f $path && $path =~ m/\.pdf$/i) {
+ if ($pdftotext && -f $path && $type =~ m/pdf/i) {
print STDERR "$path {converting}" if ($verbose);
@@ -144,19 +191,22 @@
print STDERR " $page_nr" if ($verbose);
my $pre_tmp = $pre_html;
$pre_tmp =~ s/##page_nr##/$page_nr<\/title>/s;
- dump_contents($db, $pre_tmp . $page . $post_html,time(), $path) if ($page !~ m/^\s*$/s);
+ dump_contents($db, $pre_tmp . $page . $post_html, $mtime, $path) if ($page !~ m/^\s*$/s);
$page_nr++;
}
} else {
- return if (! -f $path || ! m/\.(html*|php|pl|txt|info|log|text)$/i);
+# return if (! -f $path || ! m/\.(html*|php|pl|txt|info|log|text)$/i);
+ return unless (-f $path && $type =~ m/html/ ||
+ ($type =~ m#text# && $path =~ m/\.(php|pl|txt|info|log|text)$/io)
+ );
# skip index files
return if (m/index_[a-z]\.html*/i || m/index_symbol\.html*/i);
open(F,"$path") || die "can't open file: $path";
- print STDERR "$path" if ($verbose);
+ print STDERR "$path ($type)" if ($verbose);
while(<F>) {
$contents .= "$_";
}
@@ -167,7 +217,7 @@
# add optional components to path
$path .= " $path_add" if ($path_add);
- dump_contents($db, $contents,time(), $path);
+ dump_contents($db, $contents, $mtime, $path);
}
print STDERR "\n" if ($verbose);