--- trunk/perl/scripts/est-spider 2005/09/03 19:16:48 5 +++ trunk/perl/scripts/est-spider 2005/09/17 23:55:09 29 @@ -5,6 +5,11 @@ use File::Which; use HyperEstraier; use Text::Iconv; +#use File::MMagic; +use File::MMagic::XS qw/:compat/; + +# do we use Node API? +my $node_url; my $collection; # name which will be inserted my $path_add; # add additional info in path @@ -12,13 +17,17 @@ my $exclude; #$verbose = 1; +my $debug = 0; +my $force = 0; my $result = GetOptions( "collection=s" => \$collection, "path=s" => \$path_add, "verbose!" => \$verbose, - "debug!" => \$verbose, + "debug!" => \$debug, "exclude=s" => \$exclude, + "node=s" => \$node_url, + "force!" => \$force, ); my $dir = shift @ARGV || die "usage: $0 [dir]"; @@ -29,6 +38,9 @@ my $pdftotext = which('pdftotext'); +#my $mm = new File::MMagic('/usr/share/misc/file/magic'); +my $mm = new File::MMagic::XS(); + my $iconv = new Text::Iconv('iso-8859-2', 'utf-8'); select(STDERR); $|=1; @@ -36,31 +48,39 @@ print STDERR "using $pdftotext to convert pdf into html\n" if ($pdftotext && $verbose); +my $db; +if ($node_url) { + $db = HyperEstraier::Node->new($node_url); + $db->set_auth('admin', 'admin'); +} else { + # open the database + $db = HyperEstraier::Database->new(); + $db->open('/tmp/casket', $HyperEstraier::Database::DBWRITER | $HyperEstraier::Database::DBCREAT); + + sub signal { + my($sig) = @_; + print "\nCaught a SIG$sig--syncing database and shutting down\n"; + $db->sync(); + exit(0); + } -# open the database -my $db = HyperEstraier::Database->new(); -$db->open('/tmp/casket', $HyperEstraier::Database::DBWRITER | $HyperEstraier::Database::DBCREAT); - -sub signal { - my($sig) = @_; - print "\nCaught a SIG$sig--syncing database and shutting down\n"; - $db->sync(); - exit(0); + $SIG{'INT'} = \&signal; + $SIG{'QUIT'} = \&signal; } -$SIG{'INT'} = \&signal; -$SIG{'QUIT'} = \&signal; - find({ wanted => \&file, follow => 1, - no_chdir => 1 + follow_skip => 2, + no_chdir => 1, }, $dir); -print "--- sync\n"; -$db->sync(); +unless ($node_url) { + print "--- sync\n"; + $db->sync(); -print "--- optimize...\n"; -$db->optimize(0); + print "--- optimize...\n"; + $db->optimize(0); +} exit; sub dump_contents($$$$) { @@ -81,11 +101,16 @@ # create a document object my $doc = HyperEstraier::Document->new; - my $title = $1 if ($contents =~ m#(.+)#is); + my $title = $1 if ($contents =~ m#(.+?)#is); + + # chop long titles to 100 chars + $title = substr($title, 0, 100) . '...' if ($title && length($title) > 100); + # use path if no title is found + $title ||= $path; # add attributes to the document object $doc->add_attr('@uri', "file:///$path"); - $doc->add_attr('@title', $title || $path); + $doc->add_attr('@title', $iconv->convert($title)); $doc->add_attr('@size', $size); $doc->add_attr('@mtime', $mtime); @@ -98,7 +123,11 @@ # print $doc->dump_draft if ($verbose); # register the document object to the database - $db->put_doc($doc, $HyperEstraier::Database::PDCLEAN); + if ($node_url) { + $db->put_doc($doc); + } else { + $db->put_doc($doc, $HyperEstraier::Database::PDCLEAN); + } } @@ -107,9 +136,27 @@ my $path = $_; my $contents; - return if (-l $path); + return if (! $force && -l $path || $path =~ m#/.svn# || $path =~ m/(~|.bak)$/); + + my $mtime = (stat($path))[9]; + my $mtime_db = $db->get_doc_attr_by_uri("file:///$path", '@mtime') || -2; + + if ($mtime == $mtime_db) { + print STDERR "# same: $path $mtime\n" if ($verbose); + return unless($force); + } else { + print STDERR "# changed: $path $mtime != $mtime_db\n" if ($debug); + } + + # skip files on which File::MMagic::XS croaks + return if ($path =~ m#\.au$#); + + my $type = $mm->checktype_filename($path); + $type =~ s/\s+/ /gs; + + print STDERR "# $path $type\n" if ($debug); - if ($pdftotext && -f $path && $path =~ m/\.pdf$/i) { + if ($pdftotext && -f $path && $type =~ m/pdf/i) { print STDERR "$path {converting}" if ($verbose); @@ -144,19 +191,22 @@ print STDERR " $page_nr" if ($verbose); my $pre_tmp = $pre_html; $pre_tmp =~ s/##page_nr##/$page_nr<\/title>/s; - dump_contents($db, $pre_tmp . $page . $post_html,time(), $path) if ($page !~ m/^\s*$/s); + dump_contents($db, $pre_tmp . $page . $post_html, $mtime, $path) if ($page !~ m/^\s*$/s); $page_nr++; } } else { - return if (! -f $path || ! m/\.(html*|php|pl|txt|info|log|text)$/i); +# return if (! -f $path || ! m/\.(html*|php|pl|txt|info|log|text)$/i); + return unless (-f $path && $type =~ m/html/ || + ($type =~ m#text# && $path =~ m/\.(php|pl|txt|info|log|text)$/io) + ); # skip index files return if (m/index_[a-z]\.html*/i || m/index_symbol\.html*/i); open(F,"$path") || die "can't open file: $path"; - print STDERR "$path" if ($verbose); + print STDERR "$path ($type)" if ($verbose); while() { $contents .= "$_"; } @@ -167,7 +217,7 @@ # add optional components to path $path .= " $path_add" if ($path_add); - dump_contents($db, $contents,time(), $path); + dump_contents($db, $contents, $mtime, $path); } print STDERR "\n" if ($verbose);