/[swish]/trunk/spider/progspider

This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!

Diff of /trunk/spider/progspider

Parent Directory | Revision Log | View Patch

-revision 81 by dpavlin,
Sat Aug 28 22:15:59 2004 UTC
+revision 99 by dpavlin,
Sat Apr 30 20:20:42 2005 UTC
 Line 7 
 use File::Which;
  my $collection;         # name which will be inserted
  my $path_add;           # add additional info in path
  my $verbose;
+ my $exclude;
+ my $skip_output;
  #$verbose = 1;
-Line 15 
 my $result = GetOptions(
+Line 17 
 my $result = GetOptions(
          "path=s" => \$path_add,
          "verbose!" => \$verbose,
          "debug!" => \$verbose,
+         "exclude=s" => \$exclude,
+         "skipoutput!" => \$skip_output,
  );
- my $dir = shift @ARGV || die "usage: $0 [dir]";
+ die "usage: $0 [dir] ..." unless (@ARGV);
  my $basedir = $0;
  $basedir =~ s,/[^/]+$,/,;
-Line 30 
 select(STDOUT); $|=1;
+Line 34 
 select(STDOUT); $|=1;
  print STDERR "using $pdftotext to convert pdf into html\n" if ($pdftotext && $verbose);
- find({ wanted => \&file,
+ while (my $dir = shift @ARGV) {
-         follow => 1,
+         print STDERR "indexing $dir\n" if ($verbose);
-         no_chdir => 1
- }, $dir);
+         find({ wanted => \&file,
+                 follow => 1,
+                 no_chdir => 1
+         }, $dir);
+ }
  sub dump_contents($$$) {
          my ($contents,$mtime,$path) = @_;
-         return if (! $contents);        # don't die on empty files
+         return unless ($contents);      # don't die on empty files
+         if ($exclude && $path =~ m/$exclude/i) {
+                 print STDERR "skip: $path\n" if ($verbose);
+                 return;
+         }
          use bytes;
          my $size = length $contents;
          print STDERR " [$size]" if ($verbose);
+         return if ($skip_output);
          # Output the document (to swish)
          print <<EOF;
  Path-Name: $path
-Line 62 
 sub file {
+Line 77 
 sub file {
          my $path = $_;
          my $contents;
+         return if (-l $path);
          if ($pdftotext && -f $path && $path =~ m/\.pdf$/i) {
                  print STDERR "$path {converting}" if ($verbose);
-Line 78 
 sub file {
+Line 95 
 sub file {
                  return if (! $html);
+                 my $file_only = $path;
+                 $file_only =~ s/^.*\/([^\/]+)$/$1/g;
                  my ($pre_html,$pages,$post_html) = ('<html><head><title>$path :: page ##page_nr##</title></head><body><pre>',$html,'</pre></body></html>');
                  ($pre_html,$pages,$post_html) = ($1,$2,$3) if ($html =~ m/^(<html>.+?<pre>)(.+)(<\/pre>.+?)$/si);
-Line 85 
 sub file {
+Line 105 
 sub file {
                  if ($collection) {
                          $pre_html =~ s/<title>(.+?)<\/title>/<title>$collection :: page ##page_nr##<\/title>/si;
                  } else {
-                         $pre_html =~ s/<title>(.+?)<\/title>/<title>$1 :: page ##page_nr##<\/title>/si;
+                         $pre_html =~ s/<title>(.+?)<\/title>/<title>$1 :: page ##page_nr##<\/title>/si ||
+                         $pre_html =~ s/<title><\/title>/<title>$file_only :: page ##page_nr##<\/title>/si;
                  }
                  my $page_nr = 1;
-Line 107 
 sub file {
+Line 128 
 sub file {
                  open(F,"$path") || die "can't open file: $path";
                  print STDERR "$path" if ($verbose);
                  while(<F>) {
-                         $contents .= "$_";
+                         $contents .= $_;
                  }
                  $contents .= "\n\n";

 Legend:
-Removed from v.81
 changed lines
+Added in v.99

	ViewVC Help
Powered by ViewVC 1.1.26