/[hyperestraier_wrappers]/trunk/perl/scripts/est-spider
This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/perl/scripts/est-spider

Parent Directory | Revision Log


Revision 33 - (show annotations)
Tue Oct 11 14:04:52 2005 UTC (18 years, 5 months ago) by dpavlin
File size: 5605 byte(s)
save empty entry as a placeholder
1 #!/usr/bin/perl -w
2 use strict;
3 use File::Find;
4 use Getopt::Long;
5 use File::Which;
6 use HyperEstraier;
7 use Text::Iconv;
8 #use File::MMagic;
9 use File::MMagic::XS qw/:compat/;
10
# do we use Node API? (URL of the node, set with --node)
my $node_url;

my $collection; # name which will be inserted
my $path_add; # add additional info in path
my $verbose;
my $exclude; # pattern for paths to skip, set with --exclude

#$verbose = 1;
my $debug = 0;
my $force = 0;

my $result = GetOptions(
    "collection=s" => \$collection,
    "path=s"       => \$path_add,
    "verbose!"     => \$verbose,
    "debug!"       => \$debug,
    "exclude=s"    => \$exclude,
    "node=s"       => \$node_url,
    "force!"       => \$force,
);

# GetOptions returns false when it met an unknown or malformed option
# (it has already warned about it); do not start indexing with settings
# that were only partially parsed.
die "usage: $0 [dir]" unless ($result);

# Test for definedness/length instead of truth, so that a directory
# literally named "0" is still accepted.
my $dir = shift @ARGV;
die "usage: $0 [dir]" unless (defined $dir && length $dir);

if (! -e $dir) {
    warn "directory $dir doesn't exist, skipping\n";
    exit 1;
}

#my $basedir = $0;
#$basedir =~ s,/[^/]+$,/,;
#require "$basedir/filter.pm";

# full path of pdftotext as found by File::Which; false when it is not
# available, in which case the PDF branch of file() is never taken
my $pdftotext = which('pdftotext');

#my $mm = new File::MMagic('/usr/share/misc/file/magic');
my $mm = File::MMagic::XS->new();

# titles and text are converted with this before being stored
my $iconv = Text::Iconv->new('iso-8859-2', 'utf-8');

# unbuffer both STDERR and STDOUT, leaving STDOUT selected
select(STDERR); $|=1;
select(STDOUT); $|=1;

print STDERR "using $pdftotext to convert pdf into html\n" if ($pdftotext && $verbose);
55
# Handle used for all indexing: a node reached through $node_url when
# --node was given, otherwise a local database opened for writing.
my $db;
if (! $node_url) {
    # open the database
    my $open_mode = $HyperEstraier::Database::DBWRITER | $HyperEstraier::Database::DBCREAT;
    $db = HyperEstraier::Database->new();
    $db->open('/tmp/casket', $open_mode);

    # Signal handler: sync the local database, then exit with status 0.
    sub signal {
        my $sig = shift;
        print "\nCaught a SIG$sig--syncing database and shutting down\n";
        $db->sync();
        exit(0);
    }

    # only installed in local-database mode
    foreach my $name ('INT', 'QUIT') {
        $SIG{$name} = \&signal;
    }
} else {
    $db = HyperEstraier::Node->new($node_url);
    $db->set_auth('admin', 'admin');
}
75
# Walk $dir and hand every entry to file(). Symlinks are followed and
# File::Find is told not to chdir, so file() receives full paths.
my %find_options = (
    wanted      => \&file,
    follow      => 1,
    follow_skip => 2,
    no_chdir    => 1,
);
find(\%find_options, $dir);

# Only the local database is synced and optimized here; in node mode
# the script simply exits.
if (! $node_url) {
    print "--- sync\n";
    $db->sync();

    print "--- optimize...\n";
    $db->optimize(0);
}
exit;
90
# Store one chunk of HTML-ish text as a document.
#
# Arguments:
#   $db       - handle on which put_doc() is called
#   $contents - raw text to index; anything that looks like a tag is
#               stripped and runs of whitespace are collapsed first
#   $mtime    - value stored in the @mtime attribute
#   $path     - used for the @uri attribute ("file:///$path") and as the
#               title when $contents has no <title> element
#
# Returns without storing anything when $contents is false or when $path
# matches the --exclude pattern (case-insensitively). Title and text are
# passed through the file-level $iconv converter before being added.
sub dump_contents($$$$) {
    my ($db,$contents,$mtime,$path) = @_;

    return unless ($contents); # don't die on empty files

    if ($exclude && $path =~ m/$exclude/i) {
        print STDERR "skip: $path\n" if ($verbose);
        return;
    }

    # size is reported in bytes, not characters
    use bytes;
    my $size = length $contents;

    print STDERR " [$size]" if ($verbose);

    # create a document object
    my $doc = HyperEstraier::Document->new;

    # Declare first, assign second: "my $x = ... if COND" has undefined
    # behaviour when COND is false (see perlsyn, Statement Modifiers).
    my $title;
    if ($contents =~ m#<title>(.+?)</title>#is) {
        $title = $1;
    }

    # chop long titles to 100 chars
    $title = substr($title, 0, 100) . '...' if ($title && length($title) > 100);
    # use path if no title is found
    $title ||= $path;

    # add attributes to the document object
    $doc->add_attr('@uri', "file:///$path");
    $doc->add_attr('@title', $iconv->convert($title));
    $doc->add_attr('@size', $size);
    $doc->add_attr('@mtime', $mtime);

    # html2text
    $contents =~ s#<[^>]+/*>##gs;
    $contents =~ s#\s\s+# #gs;

    $doc->add_text($iconv->convert($contents));

    # print $doc->dump_draft if ($verbose);

    # register the document object to the database;
    # only the local database call is given the PDCLEAN option
    if ($node_url) {
        $db->put_doc($doc);
    } else {
        $db->put_doc($doc, $HyperEstraier::Database::PDCLEAN);
    }

}
138
# File::Find "wanted" callback: index one filesystem entry.
#
# Takes no arguments; the entry's path is read from $_ (find() is run with
# no_chdir, so this is the full path). Uses the file-level $db, $mm,
# $pdftotext, $collection, $path_add, $force, $verbose and $debug.
#
# Skipped entries: symlinks (unless --force), anything under a ".svn"
# directory, names ending in "~" or ".bak", names ending in ".au", and
# entries whose mtime equals the @mtime stored for the same uri (unless
# --force).
#
# PDF files (when pdftotext is available) are converted to HTML and stored
# as one document per non-blank page under "$path#<page number>", after an
# almost empty placeholder document for $path itself. Other regular files
# are stored whole if they are detected as HTML, or as text with one of
# the extensions php, pl, txt, info, log or text.
#
# Dies when pdftotext cannot be started or a file cannot be opened.
sub file {

    my $path = $_;

    # The dots are escaped so that only a literal "/.svn" and a literal
    # ".bak" suffix are skipped (not e.g. "/asvn" or "foobak").
    return if (! $force && -l $path || $path =~ m#/\.svn# || $path =~ m/(?:~|\.bak)$/);

    # -1 and -2 are distinct sentinels, so "no mtime on disk" never
    # compares equal to "no mtime in the index"
    my $mtime = (stat($path))[9] || -1;
    my $mtime_db = $db->get_doc_attr_by_uri("file:///$path", '@mtime') || -2;

    if ($mtime == $mtime_db) {
        print STDERR "# same: $path $mtime\n" if ($verbose);
        return unless($force);
    } else {
        print STDERR "# changed: $path $mtime != $mtime_db\n" if ($debug);
    }

    # skip files on which File::MMagic::XS croaks
    return if ($path =~ m#\.au$#);

    my $type = $mm->checktype_filename($path);
    # guard against an undefined result before using it in matches
    $type = '' unless (defined $type);
    $type =~ s/\s+/ /gs;

    print STDERR "# $path $type\n" if ($debug);

    if ($pdftotext && -f $path && $type =~ m/pdf/i) {

        print STDERR "$path {converting}" if ($verbose);

        # List-form pipe open: the path is passed to pdftotext as a single
        # argument and never seen by a shell, so quotes, backticks or "$"
        # in file names cannot be interpreted as shell syntax.
        open(my $pdf, '-|', $pdftotext, '-htmlmeta', $path, '-')
            || die "can't open $pdftotext with '$path': $!";

        # Read into a lexical instead of $_, which File::Find owns while
        # this callback runs.
        my $html = '';
        while (my $line = <$pdf>) {
            # XXX why pdftotext barks if I try to use this is beyond me.
            #$contents .= $_;

            $html .= $line;
        }
        close($pdf);

        return if (! $html);

        my $file_only = $path;
        $file_only =~ s/^.*\/([^\/]+)$/$1/g;

        # Wrapper used when the output does not have the expected
        # <html>...<pre>...</pre>... shape. The title is left empty so
        # that the substitutions below fill it in exactly once.
        my ($pre_html,$pages,$post_html) = ('<html><head><title></title></head><body><pre>',$html,'</pre></body></html>');

        ($pre_html,$pages,$post_html) = ($1,$2,$3) if ($html =~ m/^(<html>.+?<pre>)(.+)(<\/pre>.+?)$/si);

        if ($collection) {
            # .*? (not .+?) so that an empty <title></title> is replaced too
            $pre_html =~ s/<title>.*?<\/title>/<title>$collection :: page ##page_nr##<\/title>/si;
        } else {
            # keep an existing title, otherwise fall back to the file name
            $pre_html =~ s/<title>(.+?)<\/title>/<title>$1 :: page ##page_nr##<\/title>/si ||
            $pre_html =~ s/<title><\/title>/<title>$file_only :: page ##page_nr##<\/title>/si;
        }

        # save empty entry as a placeholder
        dump_contents($db, ' ', $mtime, "$path");

        # pages are separated by form feeds in the converted output
        my $page_nr = 1;
        foreach my $page (split(/\f/s,$pages)) {
            print STDERR " $page_nr" if ($verbose);
            my $pre_tmp = $pre_html;
            # the template already carries the closing </title>
            $pre_tmp =~ s/##page_nr##/$page_nr/s;
            dump_contents($db, $pre_tmp . $page . $post_html, $mtime, "$path#$page_nr") if ($page !~ m/^\s*$/s);
            $page_nr++;
        }

    } else {

        # return if (! -f $path || ! m/\.(html*|php|pl|txt|info|log|text)$/i);
        # Explicit grouping: the entry must be a regular file in both the
        # html case and the text-with-known-extension case.
        return unless (-f $path && ($type =~ m/html/ ||
            ($type =~ m#text# && $path =~ m/\.(php|pl|txt|info|log|text)$/io)
        ));

        # skip index files
        return if ($path =~ m/index_[a-z]\.html*/i || $path =~ m/index_symbol\.html*/i);

        # Three-argument open: the path is never parsed for a mode
        # character or a pipe.
        open(my $in, '<', $path) || die "can't open file: $path: $!";
        print STDERR "$path ($type)" if ($verbose);
        my $contents;
        while (my $line = <$in>) {
            $contents .= $line;
        }
        close($in);
        $contents .= "\n\n";

        #$contents = filter($contents,$collection);

        # add optional components to path
        $path .= " $path_add" if ($path_add);

        dump_contents($db, $contents, $mtime, $path);
    }

    print STDERR "\n" if ($verbose);
    # die "zero size content in '$path'" if (! $contents);

}
235

Properties

Name Value
svn:executable *

  ViewVC Help
Powered by ViewVC 1.1.26