/[swish]/trunk/spider/progspider

This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!

Diff of /trunk/spider/progspider

Parent Directory | Revision Log | View Patch Patch

-revision 46 by dpavlin,
Sat Jan 17 23:57:55 2004 UTC
+revision 66 by dpavlin,
Wed Mar 17 12:19:42 2004 UTC
 Line 1
  #!/usr/local/bin/perl -w
  use strict;
  use File::Find;
+ use Getopt::Long;
+ use File::Which;
+ my $collection;         # name which will be inserted
+ my $path_add;           # add additional info in path
+ my $verbose;
+ #$verbose = 1;
+ my $result = GetOptions(
+         "collection=s" => \$collection,
+         "path=s" => \$path_add,
+         "verbose!" => \$verbose,
+         "debug!" => \$verbose,
+ );
  my $dir = shift @ARGV || die "usage: $0 [dir]";
-Line 9 
 my $basedir = $0;
+Line 23 
 my $basedir = $0;
  $basedir =~ s,/[^/]+$,/,;
  require "$basedir/filter.pm";
+ my $pdftotext = which('pdftotext');
+ select(STDERR); $|=1;
+ select(STDOUT); $|=1;
+ print STDERR "using $pdftotext to convert pdf into html\n" if ($pdftotext && $verbose);
  find({ wanted => \&file,
          follow => 1,
          no_chdir => 1
  }, $dir);
- sub file {
+ sub dump_contents($$$) {
+         my ($contents,$mtime,$path) = @_;
-         return if (! -f || ! m/\.html*/i);
-         my $path = $_;
-         open(F,"$path") || die "can't open file: $path";
-         print STDERR "$path";
-         my $contents;
-         while(<F>) {
- #               chomp;
- #               chomp;
- #               $contents .= " ".$_;
-                 $contents .= $_;
-         }
- #       $contents =~ s/<(\/*\w+)\s+>/<$1>/g;
+         return if (! $contents);        # don't die on empty files
-         $contents = filter($contents);
+         use bytes;
-         my $mtime = time;
          my $size = length $contents;
-         print STDERR " [$size]\n";
+         print STDERR " [$size]" if ($verbose);
          # Output the document (to swish)
          print <<EOF;
-Line 47 
 Last-Mtime: $mtime
+Line 53 
 Last-Mtime: $mtime
  Document-Type: HTML
  EOF
          print $contents;
  }
+ # File::Find "wanted" callback: feed one file to swish via dump_contents().
+ #
+ # Reads the current entry from $_ (find() is called with no_chdir => 1, so
+ # this is the same path that is opened below).
+ #  - *.pdf (only when pdftotext was found): converted with
+ #    "pdftotext -htmlmeta" and emitted as one document per page, with
+ #    " :: page N" appended to the title.
+ #  - *.htm / *.html: passed through filter() and emitted as one document;
+ #    index_<letter> and index_symbol pages are skipped.
+ #  - anything else is ignored.
+ # Dies if a file or the pdftotext pipe can't be opened.
+ # NOTE(review): Last-Mtime is the time of the run (time()), not the file's
+ # own mtime -- kept as in the original; confirm this is intended.
+ sub file {
+         my $path = $_;
+         if ($pdftotext && -f $path && $path =~ m/\.pdf$/i) {
+                 print STDERR "$path {converting}" if ($verbose);
+                 # list-form pipe open: no shell is involved, so quotes, spaces or
+                 # metacharacters in $path can't break (or inject into) the command
+                 open(my $pdf, '-|', $pdftotext, '-htmlmeta', $path, '-') || die "can't open $pdftotext with '$path': $!";
+                 # start from '' so an empty conversion doesn't trigger undef warnings
+                 my $html = '';
+                 # named loop variable: don't clobber File::Find's $_
+                 while (my $line = <$pdf>) {
+                         $html .= $line;
+                 }
+                 # closing a pipe reports the child's exit status; warn but carry on
+                 close($pdf) || warn "$pdftotext failed on '$path' (exit status $?)\n";
+                 # fallback wrapper used when the output doesn't have the expected
+                 # <html>...<pre>pages</pre>... shape; double-quoted so that $path is
+                 # really interpolated into the title
+                 my ($pre_html,$pages,$post_html) = ("<html><head><title>$path</title></head><body><pre>",$html,'</pre></body></html>');
+                 ($pre_html,$pages,$post_html) = ($1,$2,$3) if ($html =~ m/^(<html>.+<pre>)(.+)(<\/pre>.+)$/si);
+                 # add exactly one page placeholder to the title (.*? also copes
+                 # with an empty <title></title>)
+                 $pre_html =~ s/<title>(.*?)<\/title>/<title>$1 :: page ##page_nr##<\/title>/si;
+                 my $page_nr = 1;
+                 # form feeds are treated as page breaks: one document per page
+                 foreach my $page (split(/\f/,$pages)) {
+                         my $pre_tmp = $pre_html;
+                         # the closing </title> is already in $pre_html, so only the
+                         # number is substituted here
+                         $pre_tmp =~ s/##page_nr##/$page_nr/;
+                         dump_contents($pre_tmp . $page . $post_html,time(), $path);
+                         $page_nr++;
+                 }
+         } else {
+                 return if (! -f $path || $path !~ m/\.html*$/i);
+                 # skip index files
+                 return if ($path =~ m/index_[a-z]\.html*/i || $path =~ m/index_symbol\.html*/i);
+                 open(my $in, '<', $path) || die "can't open file: $path: $!";
+                 print STDERR "$path" if ($verbose);
+                 my $contents = '';
+                 while (my $line = <$in>) {
+                         $contents .= $line;
+                 }
+                 close($in);
+                 $contents .= "\n\n";
+                 $contents = filter($contents,$collection);
+                 # add optional components to path
+                 $path .= " $path_add" if ($path_add);
+                 dump_contents($contents,time(), $path);
+         }
+         print STDERR "\n" if ($verbose);
+ }

 Legend:



Removed from v.46
 


changed lines


 
Added in v.66
 Legend:



Removed from v.46
 


changed lines


 
Added in v.66
-Removed from v.46
+Added in v.66

	ViewVC Help
Powered by ViewVC 1.1.26