/[swish]/trunk/spider/progspider

This is a repository of my old source code, which is no longer updated. Go to git.rot13.org for current projects!

Diff of /trunk/spider/progspider

Parent Directory | Revision Log | View Patch Patch

-revision 72 by dpavlin,
Tue Apr  6 15:06:58 2004 UTC
+revision 95 by dpavlin,
Sun Apr 24 16:33:53 2005 UTC
 Line 1
- #!/usr/local/bin/perl -w
+ #!/usr/bin/perl -w
  use strict;
  use File::Find;
  use Getopt::Long;
 Line 7 
 use File::Which;
  my $collection;         # name which will be inserted
  my $path_add;           # add additional info in path
  my $verbose;
+ my $exclude;
  #$verbose = 1;
-Line 15 
 my $result = GetOptions(
+Line 16 
 my $result = GetOptions(
          "path=s" => \$path_add,
          "verbose!" => \$verbose,
          "debug!" => \$verbose,
+         "exclude=s" => \$exclude,
  );
  my $dir = shift @ARGV || die "usage: $0 [dir]";
-Line 38 
 find({ wanted => \&file,
+Line 40 
 find({ wanted => \&file,
  sub dump_contents($$$) {
          my ($contents,$mtime,$path) = @_;
-         return if (! $contents);        # don't die on empty files
+         return unless ($contents);      # don't die on empty files
+         if ($exclude && $path =~ m/$exclude/i) {
+                 print STDERR "skip: $path\n" if ($verbose);
+                 return;
+         }
          use bytes;
          my $size = length $contents;
-Line 50 
 sub dump_contents($$$) {
+Line 57 
 sub dump_contents($$$) {
  Path-Name: $path
  Content-Length: $size
  Last-Mtime: $mtime
- Document-Type: HTML
+ Document-Type: html*
  EOF
          print $contents;
-Line 62 
 sub file {
+Line 69 
 sub file {
          my $path = $_;
          my $contents;
+         return if (-l $path);
          if ($pdftotext && -f $path && $path =~ m/\.pdf$/i) {
                  print STDERR "$path {converting}" if ($verbose);
-Line 76 
 sub file {
+Line 85 
 sub file {
                  }
                  close(F);
+                 return if (! $html);
+                 my $file_only = $path;
+                 $file_only =~ s/^.*\/([^\/]+)$/$1/g;
                  my ($pre_html,$pages,$post_html) = ('<html><head><title>$path :: page ##page_nr##</title></head><body><pre>',$html,'</pre></body></html>');
                  ($pre_html,$pages,$post_html) = ($1,$2,$3) if ($html =~ m/^(<html>.+?<pre>)(.+)(<\/pre>.+?)$/si);
-Line 83 
 sub file {
+Line 97 
 sub file {
                  if ($collection) {
                          $pre_html =~ s/<title>(.+?)<\/title>/<title>$collection :: page ##page_nr##<\/title>/si;
                  } else {
-                         $pre_html =~ s/<title>(.+?)<\/title>/<title>$1 :: page ##page_nr##<\/title>/si;
+                         $pre_html =~ s/<title>(.+?)<\/title>/<title>$1 :: page ##page_nr##<\/title>/si ||
+                         $pre_html =~ s/<title><\/title>/<title>$file_only :: page ##page_nr##<\/title>/si;
                  }
                  my $page_nr = 1;
-Line 97 
 sub file {
+Line 112 
 sub file {
          } else {
-                 return if (! -f $path || ! m/\.html*$/i);
+                 return if (! -f $path || ! m/\.(html*|php|pl|txt|info|log|text)$/i);
                  # skip index files
                  return if (m/index_[a-z]\.html*/i || m/index_symbol\.html*/i);

 Legend:



Removed from v.72
 


changed lines


 
Added in v.95
 Legend:



Removed from v.72
 


changed lines


 
Added in v.95
-Removed from v.72
+Added in v.95

	ViewVC Help
Powered by ViewVC 1.1.26