/[hyperestraier_wrappers]/trunk/perl/scripts/est-spider
This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/perl/scripts/est-spider

Parent Directory | Revision Log


Revision 28 - (show annotations)
Sat Sep 17 23:43:20 2005 UTC (17 years, 2 months ago) by dpavlin
File size: 5361 byte(s)
use @mtime to store mtime as unix time and check for changes
#!/usr/bin/perl -w
#
# est-spider - walk a directory tree and index the HTML, text and PDF
# files found there, either into a local Hyper Estraier database or, when
# --node is given, into a node reached through the Node API.
#
# Options:
#   --collection=NAME  title prefix used for pages of converted PDF files
#   --path=TEXT        text appended to the path stored for plain files
#   --exclude=REGEX    case-insensitive pattern; matching paths are skipped
#   --node=URL         use the Node API at URL instead of a local database
#   --verbose, --debug progress / diagnostic output on STDERR
use strict;
use File::Find;
use Getopt::Long;
use File::Which;
use HyperEstraier;
use Text::Iconv;
#use File::MMagic;
use File::MMagic::XS qw/:compat/;

# do we use Node API?
my $node_url;

my $collection;     # name which will be inserted
my $path_add;       # add additional info in path
my $verbose;
my $exclude;

#$verbose = 1;
my $debug = 0;

my $usage = "usage: $0 [dir]";

# GetOptions returns false when it meets an unknown or malformed option;
# stop here instead of crawling with half-parsed settings.
GetOptions(
    "collection=s" => \$collection,
    "path=s"       => \$path_add,
    "verbose!"     => \$verbose,
    "debug!"       => \$debug,
    "exclude=s"    => \$exclude,
    "node=s"       => \$node_url,
) || die $usage;

my $dir = shift @ARGV;
die $usage unless (defined $dir && length $dir);
die "can't find '$dir'" unless (-e $dir);

# $exclude is later interpolated into a match; reject a broken pattern now
# rather than dying on the first file in the middle of a crawl.
if (defined $exclude) {
    eval { qr/$exclude/i; 1 } || die "invalid --exclude pattern '$exclude': $@";
}

#my $basedir = $0;
#$basedir =~ s,/[^/]+$,/,;
#require "$basedir/filter.pm";

# path of the pdftotext binary, or undef when it is not installed
my $pdftotext = which('pdftotext');

#my $mm = new File::MMagic('/usr/share/misc/file/magic');
my $mm = File::MMagic::XS->new();

# input is assumed to be iso-8859-2; everything is indexed as utf-8
my $iconv = Text::Iconv->new('iso-8859-2', 'utf-8');

# unbuffered output on both streams so progress shows up immediately
select(STDERR); $|=1;
select(STDOUT); $|=1;

print STDERR "using $pdftotext to convert pdf into html\n" if ($pdftotext && $verbose);

my $db;
if ($node_url) {
    $db = HyperEstraier::Node->new($node_url);
    $db->set_auth('admin', 'admin');
} else {
    # open the database
    # NOTE(review): the result of open() is not checked -- confirm what the
    # binding returns on failure before adding an "|| die" here.
    $db = HyperEstraier::Database->new();
    $db->open('/tmp/casket', $HyperEstraier::Database::DBWRITER | $HyperEstraier::Database::DBCREAT);

    # only the local database is synced on interrupt
    $SIG{'INT'}  = \&signal;
    $SIG{'QUIT'} = \&signal;
}

# Signal handler for local-database runs: sync what was indexed so far and
# exit successfully.
sub signal {
    my($sig) = @_;
    print "\nCaught a SIG$sig--syncing database and shutting down\n";
    $db->sync();
    exit(0);
}

# no_chdir keeps $_ equal to the full path inside the callback
find({ wanted => \&file,
    follow => 1,
    follow_skip => 2,
    no_chdir => 1,
}, $dir);

unless ($node_url) {
    print "--- sync\n";
    $db->sync();

    print "--- optimize...\n";
    $db->optimize(0);
}
exit;
83
# Build a HyperEstraier document from one chunk of HTML/text and register
# it with the index.
#
# Arguments:
#   $db        database or node object on which put_doc() is called
#   $contents  raw HTML or text; tags are stripped before indexing
#   $mtime     modification time stored as the @mtime attribute
#   $path      path used for the @uri attribute and as fallback title
#
# Returns nothing. Does nothing for empty contents or when $path matches
# the file-level $exclude pattern. Also reads the file-level $verbose,
# $iconv and $node_url.
sub dump_contents($$$$) {
    my ($db,$contents,$mtime,$path) = @_;

    return unless ($contents);  # don't die on empty files

    if ($exclude && $path =~ m/$exclude/i) {
        print STDERR "skip: $path\n" if ($verbose);
        return;
    }

    # size is reported in bytes regardless of how the string is flagged
    my $size = do { use bytes; length $contents };

    print STDERR " [$size]" if ($verbose);

    # create a document object
    my $doc = HyperEstraier::Document->new;

    # Declare first, assign afterwards: "my $x = ... if COND" has undefined
    # behaviour and can carry the value over from a previous call, which
    # would give untitled files the title of an earlier file.
    # Non-greedy match so that only the first <title> element is taken.
    my $title;
    $title = $1 if ($contents =~ m#<title>(.+?)</title>#is);

    # chop long titles to 100 chars
    $title = substr($title, 0, 100) . '...' if ($title && length($title) > 100);
    # use path if no title is found
    $title ||= $path;

    # add attributes to the document object
    $doc->add_attr('@uri', "file:///$path");
    $doc->add_attr('@title', $iconv->convert($title));
    $doc->add_attr('@size', $size);
    $doc->add_attr('@mtime', $mtime);

    # html2text: drop tags, then collapse runs of whitespace
    $contents =~ s#<[^>]+/*>##gs;
    $contents =~ s#\s\s+# #gs;

    $doc->add_text($iconv->convert($contents));

    # print $doc->dump_draft if ($verbose);

    # register the document object to the database
    if ($node_url) {
        $db->put_doc($doc);
    } else {
        $db->put_doc($doc, $HyperEstraier::Database::PDCLEAN);
    }

    return;
}
131
# File::Find callback: decide whether the entry in $_ should be indexed
# and, if so, pass its contents to dump_contents().
#
# The find() call uses no_chdir, so $_ holds the full path. Symlinks,
# Subversion metadata, backup files and entries whose stored @mtime equals
# the current one are skipped. PDF files are converted with pdftotext and
# indexed page by page; HTML files and selected text files are indexed
# whole. Uses the file-level $db, $mm, $pdftotext, $collection, $path_add,
# $verbose and $debug.
sub file {

    my $path = $_;

    # skip symlinks, .svn metadata and editor backups (dots are literal)
    return if (-l $path || $path =~ m#/\.svn# || $path =~ m/(~|\.bak)$/);

    my $mtime = (stat($path))[9];
    # stat failed (entry vanished or is unreadable): nothing to index
    return unless (defined $mtime);

    # -2 can never equal a real mtime, so unknown documents count as changed
    my $mtime_db = $db->get_doc_attr_by_uri("file:///$path", '@mtime') || -2;

    if ($mtime == $mtime_db) {
        print STDERR "# same: $path $mtime\n" if ($verbose);
        return;
    } else {
        print STDERR "# changed: $path $mtime != $mtime_db\n" if ($debug);
    }

    # skip files on which File::MMagic::XS croaks
    return if ($path =~ m#\.au$#);

    my $type = $mm->checktype_filename($path);
    $type = '' unless (defined $type);
    $type =~ s/\s+/ /gs;

    print STDERR "# $path $type\n" if ($debug);

    if ($pdftotext && -f $path && $type =~ m/pdf/i) {

        print STDERR "$path {converting}" if ($verbose);

        # List-form pipe open runs pdftotext without a shell, so quotes,
        # spaces or metacharacters in $path cannot break or alter the command.
        open(my $pdf, '-|', $pdftotext, '-htmlmeta', $path, '-')
            || die "can't open $pdftotext with '$path': $!";
        my $html = do { local $/; <$pdf> };
        close($pdf);

        return if (! $html);

        my $file_only = $path;
        $file_only =~ s/^.*\/([^\/]+)$/$1/g;

        # Fallback wrapper for output that lacks the expected HTML frame.
        # The page-number placeholder is added to the title further down.
        my ($pre_html,$pages,$post_html) = ("<html><head><title>$path</title></head><body><pre>",$html,'</pre></body></html>');

        ($pre_html,$pages,$post_html) = ($1,$2,$3) if ($html =~ m/^(<html>.+?<pre>)(.+)(<\/pre>.+?)$/si);

        if ($collection) {
            # .*? so that an empty <title></title> is replaced as well
            $pre_html =~ s/<title>(.*?)<\/title>/<title>$collection :: page ##page_nr##<\/title>/si;
        } else {
            $pre_html =~ s/<title>(.+?)<\/title>/<title>$1 :: page ##page_nr##<\/title>/si ||
            $pre_html =~ s/<title><\/title>/<title>$file_only :: page ##page_nr##<\/title>/si;
        }

        # pdftotext separates pages with form feeds
        # NOTE(review): every page is registered under the same @uri; if the
        # index keys documents by URI, later pages may replace earlier ones
        # -- confirm.
        my $page_nr = 1;
        foreach my $page (split(/\f/,$pages)) {
            print STDERR " $page_nr" if ($verbose);
            my $pre_tmp = $pre_html;
            # the closing </title> is already part of $pre_html
            $pre_tmp =~ s/##page_nr##/$page_nr/;
            dump_contents($db, $pre_tmp . $page . $post_html, $mtime, $path) if ($page !~ m/^\s*$/s);
            $page_nr++;
        }

    } else {

        # return if (! -f $path || ! m/\.(html*|php|pl|txt|info|log|text)$/i);
        # plain files only: anything detected as html, or text files with
        # one of the known extensions
        return unless (-f $path && (
            $type =~ m/html/ ||
            ($type =~ m#text# && $path =~ m/\.(php|pl|txt|info|log|text)$/i)
        ));

        # skip index files
        return if ($path =~ m/index_[a-z]\.html*/i || $path =~ m/index_symbol\.html*/i);

        open(my $in, '<', $path) || die "can't open file: $path: $!";
        print STDERR "$path ($type)" if ($verbose);
        my $contents = do { local $/; <$in> };
        close($in);
        $contents = '' unless (defined $contents);
        $contents .= "\n\n";

        #$contents = filter($contents,$collection);

        # add optional components to path
        # NOTE(review): the @mtime lookup above uses the path without this
        # suffix, so with --path these files are never seen as unchanged.
        $path .= " $path_add" if ($path_add);

        dump_contents($db, $contents, $mtime, $path);
    }

    print STDERR "\n" if ($verbose);
    # die "zero size content in '$path'" if (! $contents);

}
225

Properties

Name Value
svn:executable *

  ViewVC Help
Powered by ViewVC 1.1.26