/[swish]/trunk/spider/progspider

This is a repository of my old source code, which is no longer updated. Go to git.rot13.org for current projects!

Diff of /trunk/spider/progspider

Parent Directory | Revision Log | View Patch Patch

-revision 56 by dpavlin,
Fri Jan 23 13:10:40 2004 UTC
+revision 95 by dpavlin,
Sun Apr 24 16:33:53 2005 UTC
 Line 1
- #!/usr/local/bin/perl -w
+ #!/usr/bin/perl -w
  use strict;
  use File::Find;
  use Getopt::Long;
+ use File::Which;        # supplies which(), used below to locate pdftotext
  my $collection;         # name which will be inserted
  my $path_add;           # add additional info in path
  my $verbose;            # set by --verbose or --debug; enables progress output on STDERR
+ my $exclude;            # set by --exclude; paths matching it (as a regex) are skipped
+ #$verbose = 1;
  my $result = GetOptions(
          "collection=s" => \$collection,
          "path=s" => \$path_add,
          "verbose!" => \$verbose,
          "debug!" => \$verbose,          # --debug sets the same flag as --verbose
+         "exclude=s" => \$exclude,
  );
  my $dir = shift @ARGV || die "usage: $0 [dir]";       # directory to scan (first non-option argument)
  my $basedir = $0;       # the next line strips the script name, leaving its directory
  $basedir =~ s,/[^/]+$,/,;
  require "$basedir/filter.pm";   # presumably defines filter(), which file() calls -- confirm
+ my $pdftotext = which('pdftotext');    # the PDF branch in file() is taken only when this is true
+ select(STDERR); $|=1;           # turn on autoflush for STDERR
+ select(STDOUT); $|=1;           # turn on autoflush for STDOUT and leave it as the selected handle
+ print STDERR "using $pdftotext to convert pdf into html\n" if ($pdftotext && $verbose);
  find({ wanted => \&file,        # file() is the callback for every entry found under $dir
          follow => 1,
          no_chdir => 1
  }, $dir);
- sub file {
+ sub dump_contents($$$) {        # print one document (header lines, then body) on STDOUT
+         my ($contents,$mtime,$path) = @_;       # document text, value for Last-Mtime, value for Path-Name
-         return if (! -f || ! m/\.html*/i);
-         # skip index files
+         return unless ($contents);      # don't die on empty files
-         return if (m/index_[a-z]\.html*/i || m/index_symbol\.html*/i);
-         my $path = $_;
+         if ($exclude && $path =~ m/$exclude/i) {       # --exclude value is applied as a case-insensitive regex
+                 print STDERR "skip: $path\n" if ($verbose);
-         open(F,"$path") || die "can't open file: $path";
+                 return;
-         print STDERR "$path" if ($verbose);
-         my $contents;
-         while(<F>) {
-                 $contents .= "$_";
          }
-         $contents .= "\n\n";
-         $contents = filter($contents,$collection);
- #       die "zero size content in '$path'" if (! $contents);
-         return if (! $contents);        # don't die on empty files
-         my $mtime = time;
          use bytes;      # length() below then counts bytes rather than characters
          my $size = length $contents;
-         print STDERR " [$size]\n" if ($verbose);
+         print STDERR " [$size]" if ($verbose);  # no newline here; file() prints it when it finishes
-         # add optional components to path
-         $path .= " $path_add" if ($path_add);
          # Output the document (to swish)
          print <<EOF;
  Path-Name: $path
  Content-Length: $size
  Last-Mtime: $mtime
- Document-Type: HTML
+ Document-Type: html*
  EOF
          print $contents;
  }
+ sub file {      # File::Find callback: turn one entry into zero or more dump_contents() calls
+         my $path = $_;  # current entry as handed over by File::Find
+         my $contents;
+         return if (-l $path);   # skip symbolic links
+         if ($pdftotext && -f $path && $path =~ m/\.pdf$/i) {   # PDF branch: needs pdftotext and a .pdf name
+                 print STDERR "$path {converting}" if ($verbose);
+                 open(F,"$pdftotext -htmlmeta \"$path\" - |") || die "can't open $pdftotext with '$path'";      # NOTE(review): 2-arg pipe open goes through a shell; a path containing " or shell metacharacters would break or alter the command
+                 my $html;
+                 while(<F>) {
+                         # XXX why pdftotext barks if I try to use this is beyond me.
+                         #$contents .= $_;
+                         $html .= $_;    # accumulate everything pdftotext writes
+                 }
+                 close(F);
+                 return if (! $html);    # nothing produced; returns before the trailing newline below is printed
+                 my $file_only = $path;
+                 $file_only =~ s/^.*\/([^\/]+)$/$1/g;    # keep only the file name (text after the last /)
+                 my ($pre_html,$pages,$post_html) = ('<html><head><title>$path :: page ##page_nr##</title></head><body><pre>',$html,'</pre></body></html>');   # fallback wrapper; NOTE(review): single quotes, so $path here is literal text, not interpolated
+                 ($pre_html,$pages,$post_html) = ($1,$2,$3) if ($html =~ m/^(<html>.+?<pre>)(.+)(<\/pre>.+?)$/si);      # if the output has the <html>..<pre>BODY</pre>.. shape, split it into header, body, footer
+                 if ($collection) {
+                         $pre_html =~ s/<title>(.+?)<\/title>/<title>$collection :: page ##page_nr##<\/title>/si;       # title becomes the collection name plus a page placeholder
+                 } else {
+                         $pre_html =~ s/<title>(.+?)<\/title>/<title>$1 :: page ##page_nr##<\/title>/si ||      # keep a non-empty title and add the placeholder, or ...
+                         $pre_html =~ s/<title><\/title>/<title>$file_only :: page ##page_nr##<\/title>/si;     # ... fill an empty title with the file name
+                 }
+                 my $page_nr = 1;
+                 foreach my $page (split(/\f/s,$pages)) {        # pages are separated by form-feed characters
+                         print STDERR " $page_nr" if ($verbose);
+                         my $pre_tmp = $pre_html;        # per-page copy so the placeholder survives for the next page
+                         $pre_tmp =~ s/##page_nr##/$page_nr<\/title>/s; # NOTE(review): the replacement appends </title>, yet the placeholder is already followed by </title> -- looks like a duplicated closing tag
+                         dump_contents($pre_tmp . $page . $post_html,time(), $path) if ($page !~ m/^\s*$/s);     # skip whitespace-only pages; mtime passed is the current time, not the file's
+                         $page_nr++;     # incremented for skipped pages too
+                 }
+         } else {
+                 return if (! -f $path || ! m/\.(html*|php|pl|txt|info|log|text)$/i);    # only plain files with one of these extensions; the match is against $_
+                 # skip index files
+                 return if (m/index_[a-z]\.html*/i || m/index_symbol\.html*/i);
+                 open(F,"$path") || die "can't open file: $path";
+                 print STDERR "$path" if ($verbose);
+                 while(<F>) {
+                         $contents .= "$_";
+                 }
+                 $contents .= "\n\n";
+                 $contents = filter($contents,$collection);     # filter() is not defined in this view; presumably from filter.pm -- confirm
+                 # add optional components to path
+                 $path .= " $path_add" if ($path_add);
+                 dump_contents($contents,time(), $path); # mtime passed is the current time, not the file's
+         }
+         print STDERR "\n" if ($verbose);        # finish the progress line started above
+ #       die "zero size content in '$path'" if (! $contents);
+ }

 Legend:



Removed from v.56
 


changed lines


 
Added in v.95
 Legend:



Removed from v.56
 


changed lines


 
Added in v.95
-Removed from v.56
+Added in v.95

	ViewVC Help
Powered by ViewVC 1.1.26