--- trunk/perl/scripts/est-spider 2005/09/17 23:07:52 27 +++ trunk/perl/scripts/est-spider 2005/09/17 23:43:20 28 @@ -17,12 +17,13 @@ my $exclude; #$verbose = 1; +my $debug = 0; my $result = GetOptions( "collection=s" => \$collection, "path=s" => \$path_add, "verbose!" => \$verbose, - "debug!" => \$verbose, + "debug!" => \$debug, "exclude=s" => \$exclude, "node=s" => \$node_url, ); @@ -133,12 +134,25 @@ my $path = $_; my $contents; - return if (-l $path || $path =~ m#/.svn#); + return if (-l $path || $path =~ m#/.svn# || $path =~ m/(~|.bak)$/); + + my $mtime = (stat($path))[9]; + my $mtime_db = $db->get_doc_attr_by_uri("file:///$path", '@mtime') || -2; + + if ($mtime == $mtime_db) { + print STDERR "# same: $path $mtime\n" if ($verbose); + return; + } else { + print STDERR "# changed: $path $mtime != $mtime_db\n" if ($debug); + } + + # skip files on which File::MMagic::XS croaks + return if ($path =~ m#\.au$#); my $type = $mm->checktype_filename($path); $type =~ s/\s+/ /gs; - print STDERR "# $path $type\n" if ($verbose); + print STDERR "# $path $type\n" if ($debug); if ($pdftotext && -f $path && $type =~ m/pdf/i) { @@ -175,17 +189,16 @@ print STDERR " $page_nr" if ($verbose); my $pre_tmp = $pre_html; $pre_tmp =~ s/##page_nr##/$page_nr<\/title>/s; - dump_contents($db, $pre_tmp . $page . $post_html,time(), $path) if ($page !~ m/^\s*$/s); + dump_contents($db, $pre_tmp . $page . $post_html, $mtime, $path) if ($page !~ m/^\s*$/s); $page_nr++; } } else { # return if (! -f $path || ! m/\.(html*|php|pl|txt|info|log|text)$/i); - return if (! -f $path || ( - $type !~ m/html/ || - ($type !~ m#text/plain# && m/\.(php|pl|txt|info|log|text)$/) - )); + return unless (-f $path && $type =~ m/html/ || + ($type =~ m#text# && $path =~ m/\.(php|pl|txt|info|log|text)$/io) + ); # skip index files return if (m/index_[a-z]\.html*/i || m/index_symbol\.html*/i); @@ -202,7 +215,7 @@ # add optional components to path $path .= " $path_add" if ($path_add); - dump_contents($db, $contents,time(), $path); + dump_contents($db, $contents, $mtime, $path); } print STDERR "\n" if ($verbose);