/[hyperestraier_wrappers]/trunk/perl/scripts/est-spider
This is a repository of my old source code, which is no longer updated. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk/perl/scripts/est-spider

Parent Directory | Revision Log | View Patch

revision 26 by dpavlin, Sat Sep 17 21:22:26 2005 UTC | revision 27 by dpavlin, Sat Sep 17 23:07:52 2005 UTC
# Line 5  use Getopt::Long; Line 5  use Getopt::Long;
5  use File::Which;  use File::Which;
6  use HyperEstraier;  use HyperEstraier;
7  use Text::Iconv;  use Text::Iconv;
8    #use File::MMagic;
9    use File::MMagic::XS qw/:compat/;
10    
11  # do we use Node API?  # do we use Node API?
12  my $node_url;  my $node_url;
# Line 33  my $dir = shift @ARGV || die "usage: $0 Line 35  my $dir = shift @ARGV || die "usage: $0
35    
36  my $pdftotext = which('pdftotext');  my $pdftotext = which('pdftotext');
37    
38    #my $mm = new File::MMagic('/usr/share/misc/file/magic');
39    my $mm = new File::MMagic::XS();
40    
41  my $iconv = new Text::Iconv('iso-8859-2', 'utf-8');  my $iconv = new Text::Iconv('iso-8859-2', 'utf-8');
42    
43  select(STDERR); $|=1;  select(STDERR); $|=1;
# Line 95  sub dump_contents($$$$) { Line 100  sub dump_contents($$$$) {
100    
101          my $title = $1 if ($contents =~ m#<title>(.+)</title>#is);          my $title = $1 if ($contents =~ m#<title>(.+)</title>#is);
102    
103            # chop long titles to 100 chars
104            $title = substr($title, 0, 100) . '...' if ($title && length($title) > 100);
105            # use path if no title is found
106            $title ||= $path;
107    
108          # add attributes to the document object          # add attributes to the document object
109          $doc->add_attr('@uri', "file:///$path");          $doc->add_attr('@uri', "file:///$path");
110          $doc->add_attr('@title', $title || $path);          $doc->add_attr('@title', $iconv->convert($title));
111          $doc->add_attr('@size', $size);          $doc->add_attr('@size', $size);
112          $doc->add_attr('@mtime', $mtime);          $doc->add_attr('@mtime', $mtime);
113    
# Line 123  sub file { Line 133  sub file {
133          my $path = $_;          my $path = $_;
134          my $contents;          my $contents;
135    
136          return if (-l $path);          return if (-l $path || $path =~ m#/.svn#);
137    
138            my $type = $mm->checktype_filename($path);
139            $type =~ s/\s+/ /gs;
140    
141            print STDERR "# $path $type\n" if ($verbose);
142    
143          if ($pdftotext && -f $path && $path =~ m/\.pdf$/i) {          if ($pdftotext && -f $path && $type =~ m/pdf/i) {
144    
145                  print STDERR "$path {converting}" if ($verbose);                  print STDERR "$path {converting}" if ($verbose);
146    
# Line 166  sub file { Line 181  sub file {
181    
182          } else {          } else {
183    
184                  return if (! -f $path || ! m/\.(html*|php|pl|txt|info|log|text)$/i);  #               return if (! -f $path || ! m/\.(html*|php|pl|txt|info|log|text)$/i);
185                    return if (! -f $path || (
186                            $type !~ m/html/ ||
187                            ($type !~ m#text/plain# && m/\.(php|pl|txt|info|log|text)$/)
188                    ));
189    
190                  # skip index files                  # skip index files
191                  return if (m/index_[a-z]\.html*/i || m/index_symbol\.html*/i);                  return if (m/index_[a-z]\.html*/i || m/index_symbol\.html*/i);
192    
193                  open(F,"$path") || die "can't open file: $path";                  open(F,"$path") || die "can't open file: $path";
194                  print STDERR "$path" if ($verbose);                  print STDERR "$path ($type)" if ($verbose);
195                  while(<F>) {                  while(<F>) {
196                          $contents .= "$_";                          $contents .= "$_";
197                  }                  }

Legend:
Removed from v.26  
changed lines
  Added in v.27

  ViewVC Help
Powered by ViewVC 1.1.26