/[hyperestraier_wrappers]/trunk/perl/scripts/est-spider
This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/perl/scripts/est-spider

Parent Directory | Revision Log


Revision 28 - (hide annotations)
Sat Sep 17 23:43:20 2005 UTC (18 years, 6 months ago) by dpavlin
File size: 5361 byte(s)
use @mtime to store mtime as unix time and check for changes
1 dpavlin 5 #!/usr/bin/perl -w
2     use strict;
3     use File::Find;
4     use Getopt::Long;
5     use File::Which;
6     use HyperEstraier;
7     use Text::Iconv;
8 dpavlin 27 #use File::MMagic;
9     use File::MMagic::XS qw/:compat/;
10 dpavlin 5
# ---------------------------------------------------------------------------
# Main program: parse options, open the index (a remote node or a local
# database), walk the directory tree handing every entry to file(), then
# sync and optimize the local database.
# ---------------------------------------------------------------------------

# do we use Node API?
my $node_url;

my $collection;     # name which will be inserted
my $path_add;       # add additional info in path
my $verbose;
my $exclude;

#$verbose = 1;
my $debug = 0;

my $usage = "usage: $0 [dir]";

# GetOptions returns false on unknown or malformed options; stop instead of
# silently indexing with settings the user did not intend.
GetOptions(
    "collection=s" => \$collection,
    "path=s"       => \$path_add,
    "verbose!"     => \$verbose,
    "debug!"       => \$debug,
    "exclude=s"    => \$exclude,
    "node=s"       => \$node_url,
) or die $usage;

my $dir = shift @ARGV || die $usage;

#my $basedir = $0;
#$basedir =~ s,/[^/]+$,/,;
#require "$basedir/filter.pm";

# path of the pdftotext binary as returned by which(); false when not found
my $pdftotext = which('pdftotext');

#my $mm = File::MMagic->new('/usr/share/misc/file/magic');
my $mm = File::MMagic::XS->new();

# document text is converted from iso-8859-2 to utf-8 before indexing
my $iconv = Text::Iconv->new('iso-8859-2', 'utf-8');

# unbuffer both output streams so progress messages show up immediately
select(STDERR); $| = 1;
select(STDOUT); $| = 1;

print STDERR "using $pdftotext to convert pdf into html\n" if ($pdftotext && $verbose);

my $db;
if ($node_url) {
    $db = HyperEstraier::Node->new($node_url);
    $db->set_auth('admin', 'admin');
} else {
    # open the database
    $db = HyperEstraier::Database->new();
    $db->open('/tmp/casket', $HyperEstraier::Database::DBWRITER | $HyperEstraier::Database::DBCREAT);

    # on INT/QUIT sync the local database before exiting
    sub signal {
        my ($sig) = @_;
        print "\nCaught a SIG$sig--syncing database and shutting down\n";
        $db->sync();
        exit(0);
    }

    $SIG{'INT'}  = \&signal;
    $SIG{'QUIT'} = \&signal;
}

# no_chdir keeps $_ equal to the full path inside file()
find({
    wanted      => \&file,
    follow      => 1,
    follow_skip => 2,
    no_chdir    => 1,
}, $dir);

# sync and optimize are only issued for the local database
unless ($node_url) {
    print "--- sync\n";
    $db->sync();

    print "--- optimize...\n";
    $db->optimize(0);
}
exit;
83    
# dump_contents($db, $contents, $mtime, $path)
#
# Build a HyperEstraier document from $contents and register it in $db.
#
#   $db       - index handle; put_doc() is called on it
#   $contents - raw (HTML or plain) text of the document
#   $mtime    - modification time stored in the '@mtime' attribute
#   $path     - path used for the '@uri' attribute and as fallback title
#
# Returns nothing. The document is skipped when $contents is false or when
# $path matches the --exclude pattern (file-level $exclude).
# Reads the file-level $exclude, $verbose, $iconv and $node_url.
sub dump_contents($$$$) {
    my ($db, $contents, $mtime, $path) = @_;

    return unless ($contents);    # don't die on empty files

    if ($exclude && $path =~ m/$exclude/i) {
        print STDERR "skip: $path\n" if ($verbose);
        return;
    }

    # size in bytes; "use bytes" is confined to this one expression
    my $size = do { use bytes; length $contents };

    print STDERR " [$size]" if ($verbose);

    # create a document object
    my $doc = HyperEstraier::Document->new;

    # Declare $title unconditionally: "my $x = ... if COND" has undefined
    # behaviour and could carry a title over from the previous call.
    # The capture is non-greedy so it stops at the first </title>.
    my $title;
    if ($contents =~ m#<title>(.+?)</title>#is) {
        $title = $1;
    }

    # chop long titles to 100 chars
    $title = substr($title, 0, 100) . '...' if ($title && length($title) > 100);
    # use path if no title is found
    $title ||= $path;

    # add attributes to the document object
    $doc->add_attr('@uri', "file:///$path");
    $doc->add_attr('@title', $iconv->convert($title));
    $doc->add_attr('@size', $size);
    $doc->add_attr('@mtime', $mtime);

    # html2text: drop tags, then collapse runs of whitespace
    $contents =~ s#<[^>]+/*>##gs;
    $contents =~ s#\s\s+# #gs;

    $doc->add_text($iconv->convert($contents));

    # print $doc->dump_draft if ($verbose);

    # register the document object to the database; only the local
    # database call takes the PDCLEAN option
    if ($node_url) {
        $db->put_doc($doc);
    } else {
        $db->put_doc($doc, $HyperEstraier::Database::PDCLEAN);
    }

    return;
}
131    
# file()
#
# File::Find "wanted" callback: decide whether the entry in $_ should be
# indexed and hand its text to dump_contents().
#
# Skipped: symlinks, Subversion metadata, backup files, anything that is not
# a regular file, *.au files, and files whose stored '@mtime' equals the
# current mtime. PDFs go through pdftotext and are dumped page by page; HTML
# files and selected text files are read as-is.
#
# Reads the file-level $db, $mm, $pdftotext, $collection, $path_add,
# $verbose and $debug. Dies when pdftotext or a file cannot be opened.
sub file {

    # copy $_ straight away and never touch it again; File::Find owns it
    my $path = $_;

    # dots are escaped so only real ".svn" and ".bak" names are skipped
    return if (-l $path || $path =~ m#/\.svn# || $path =~ m/(?:~|\.bak)$/);

    # only regular files are ever indexed
    return unless (-f $path);

    my $mtime = (stat($path))[9];
    return unless (defined $mtime);    # file vanished since the -f test

    # Plain files are stored with --path text appended to the URI, PDFs
    # without it, and the type is not known yet, so probe both forms.
    my @uris = ("file:///$path");
    push @uris, "file:///$path $path_add" if ($path_add);

    my $mtime_db = -2;
    foreach my $uri (@uris) {
        my $stored = $db->get_doc_attr_by_uri($uri, '@mtime') or next;
        $mtime_db = $stored;
        last if ($stored == $mtime);
    }

    if ($mtime == $mtime_db) {
        print STDERR "# same: $path $mtime\n" if ($verbose);
        return;
    } else {
        print STDERR "# changed: $path $mtime != $mtime_db\n" if ($debug);
    }

    # skip files on which File::MMagic::XS croaks
    return if ($path =~ m#\.au$#);

    my $type = $mm->checktype_filename($path);
    $type = '' unless (defined $type);
    $type =~ s/\s+/ /gs;

    print STDERR "# $path $type\n" if ($debug);

    if ($pdftotext && $type =~ m/pdf/i) {

        print STDERR "$path {converting}" if ($verbose);

        # list-form pipe open: $path goes to pdftotext as one argument and
        # is never interpreted by a shell
        open(my $pdf, '-|', $pdftotext, '-htmlmeta', $path, '-')
            || die "can't open $pdftotext with '$path'";
        my $html = do { local $/; <$pdf> };
        close($pdf);

        return if (! $html);

        my $file_only = $path;
        $file_only =~ s{^.*/([^/]+)$}{$1};

        # fallback wrapper for output that lacks the expected
        # <html>...<pre> ... </pre>... framing
        my ($pre_html, $pages, $post_html) = (
            "<html><head><title>$path</title></head><body><pre>",
            $html,
            '</pre></body></html>',
        );

        ($pre_html, $pages, $post_html) = ($1, $2, $3)
            if ($html =~ m/^(<html>.+?<pre>)(.+)(<\/pre>.+?)$/si);

        # put a page-number placeholder into the title; an empty <title>
        # is handled in both branches
        my $marker = ' :: page ##page_nr##';
        if ($collection) {
            $pre_html =~ s/<title>.*?<\/title>/<title>$collection$marker<\/title>/si;
        } else {
            $pre_html =~ s/<title>(.+?)<\/title>/<title>$1$marker<\/title>/si ||
            $pre_html =~ s/<title><\/title>/<title>$file_only$marker<\/title>/si;
        }

        # pdftotext separates pages with form feeds
        # NOTE(review): every page is registered under the same '@uri';
        # presumably a later page replaces the earlier one in the index --
        # confirm against put_doc semantics.
        my $page_nr = 1;
        foreach my $page (split(/\f/, $pages)) {
            print STDERR " $page_nr" if ($verbose);
            if ($page !~ m/^\s*$/s) {
                my $pre_tmp = $pre_html;
                # substitute only the number; </title> is already in place
                $pre_tmp =~ s/##page_nr##/$page_nr/;
                dump_contents($db, $pre_tmp . $page . $post_html, $mtime, $path);
            }
            $page_nr++;
        }

    } else {

        # HTML by content type, or text with a known extension
        return unless ($type =~ m/html/ ||
            ($type =~ m#text# && $path =~ m/\.(php|pl|txt|info|log|text)$/io)
        );

        # skip index files
        return if ($path =~ m/index_[a-z]\.html*/i || $path =~ m/index_symbol\.html*/i);

        # three-argument open: characters such as '>' or '|' in a file name
        # are never taken as an open mode
        open(my $in, '<', $path) || die "can't open file: $path";
        print STDERR "$path ($type)" if ($verbose);
        my $contents = do { local $/; <$in> };
        close($in);
        $contents = '' unless (defined $contents);
        $contents .= "\n\n";

        #$contents = filter($contents,$collection);

        # add optional components to path
        my $doc_path = $path_add ? "$path $path_add" : $path;

        dump_contents($db, $contents, $mtime, $doc_path);
    }

    print STDERR "\n" if ($verbose);
    # die "zero size content in '$path'" if (! $contents);

    return;
}
225    

Properties

Name Value
svn:executable *

  ViewVC Help
Powered by ViewVC 1.1.26