/[hyperestraier_wrappers]/trunk/perl/scripts/est-spider
This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/perl/scripts/est-spider

Parent Directory | Revision Log


Revision 30 - (hide annotations)
Sun Sep 18 18:21:06 2005 UTC (17 years, 2 months ago) by dpavlin
File size: 5437 byte(s)
split pdf into pages (using #page in uri)
1 dpavlin 5 #!/usr/bin/perl -w
2     use strict;
3     use File::Find;
4     use Getopt::Long;
5     use File::Which;
6     use HyperEstraier;
7     use Text::Iconv;
8 dpavlin 27 #use File::MMagic;
9     use File::MMagic::XS qw/:compat/;
10 dpavlin 5
# ---------------------------------------------------------------------------
# Main program: parse options, open the index (local database or remote
# node), walk the directory tree handing every entry to file(), and finally
# sync/optimize the local database.
# ---------------------------------------------------------------------------

# do we use Node API?
my $node_url;

my $collection;     # name which will be inserted
my $path_add;       # add additional info in path
my $verbose;
my $exclude;

#$verbose = 1;
my $debug = 0;
my $force = 0;

my $result = GetOptions(
    "collection=s" => \$collection,
    "path=s"       => \$path_add,
    "verbose!"     => \$verbose,
    "debug!"       => \$debug,
    "exclude=s"    => \$exclude,
    "node=s"       => \$node_url,
    "force!"       => \$force,
);

# GetOptions has already printed what was wrong; refuse to crawl with a
# configuration the user did not ask for (e.g. a mistyped --exclude).
die "usage: $0 [--collection=name] [--path=info] [--exclude=regex] [--node=url] [--verbose] [--debug] [--force] dir\n"
    unless ($result);

# directory to index; tested for definedness/length so that a directory
# literally named "0" is accepted
my $dir = shift @ARGV;
die "usage: $0 [dir]" unless (defined $dir && length $dir);

#my $basedir = $0;
#$basedir =~ s,/[^/]+$,/,;
#require "$basedir/filter.pm";

# undef when pdftotext is not installed; PDF files are skipped in that case
my $pdftotext = which('pdftotext');

#my $mm = new File::MMagic('/usr/share/misc/file/magic');
my $mm = File::MMagic::XS->new();

# source files are assumed to be iso-8859-2; the index is fed utf-8
my $iconv = Text::Iconv->new('iso-8859-2', 'utf-8');

# unbuffer both streams so progress output shows up immediately
select(STDERR); $|=1;
select(STDOUT); $|=1;

print STDERR "using $pdftotext to convert pdf into html\n" if ($pdftotext && $verbose);

my $db;
if ($node_url) {
    $db = HyperEstraier::Node->new($node_url);
    $db->set_auth('admin', 'admin');
} else {
    # open the database
    $db = HyperEstraier::Database->new();
    $db->open('/tmp/casket', $HyperEstraier::Database::DBWRITER | $HyperEstraier::Database::DBCREAT);

    # on interrupt, flush what has been indexed so far before exiting
    sub signal {
        my($sig) = @_;
        print "\nCaught a SIG$sig--syncing database and shutting down\n";
        $db->sync();
        exit(0);
    }

    $SIG{'INT'} = \&signal;
    $SIG{'QUIT'} = \&signal;
}

# no_chdir keeps $_ equal to the full path inside file()
find({ wanted => \&file,
    follow => 1,
    follow_skip => 2,
    no_chdir => 1,
}, $dir);

# sync and optimize apply to the local database only
unless ($node_url) {
    print "--- sync\n";
    $db->sync();

    print "--- optimize...\n";
    $db->optimize(0);
}
exit;
85    
# dump_contents($db, $contents, $mtime, $path)
#
# Build one index document from $contents and store it in $db.
#
#   $db        index handle (database or node) providing put_doc()
#   $contents  raw (HTML or plain text) content of the document
#   $mtime     modification time, stored as the @mtime attribute
#   $path      path used for the document URI ("file:///$path") and as
#              the title when the content has no <title>
#
# Returns nothing. Empty content and paths matching --exclude are skipped.
# Reads the file-level $exclude, $verbose, $iconv and $node_url.
sub dump_contents($$$$) {
    my ($db,$contents,$mtime,$path) = @_;

    # don't die on empty files (tested by length, so content "0" is kept)
    return unless (defined $contents && length $contents);

    if ($exclude && $path =~ m/$exclude/i) {
        print STDERR "skip: $path\n" if ($verbose);
        return;
    }

    # from here on length() counts bytes, not characters
    use bytes;
    my $size = length $contents;

    print STDERR " [$size]" if ($verbose);

    # create a document object
    my $doc = HyperEstraier::Document->new;

    # declared separately from the conditional assignment: "my $x = ... if"
    # has undefined behaviour in Perl
    my $title;
    $title = $1 if ($contents =~ m#<title>(.+?)</title>#is);

    # chop long titles to 100 chars
    $title = substr($title, 0, 100) . '...' if ($title && length($title) > 100);
    # use path if no title is found
    $title ||= $path;

    # add attributes to the document object
    $doc->add_attr('@uri', "file:///$path");
    $doc->add_attr('@title', $iconv->convert($title));
    $doc->add_attr('@size', $size);
    $doc->add_attr('@mtime', $mtime);

    # html2text: drop tags, then collapse runs of whitespace
    $contents =~ s#<[^>]+>##gs;
    $contents =~ s#\s\s+# #gs;

    $doc->add_text($iconv->convert($contents));

    # print $doc->dump_draft if ($verbose);

    # register the document object to the database
    if ($node_url) {
        $db->put_doc($doc);
    } else {
        $db->put_doc($doc, $HyperEstraier::Database::PDCLEAN);
    }

    return;
}
133    
# _indexed_mtime($path)
#
# Return the @mtime stored in the index for $path, or -2 when the file is
# not indexed yet. Documents are stored under "file:///$path" (with
# " $path_add" appended when --path is used) or, for PDF files, one
# document per page under "file:///$path#N", so both forms are tried.
sub _indexed_mtime {
    my ($path) = @_;

    my $stored_as = $path_add ? "$path $path_add" : $path;

    foreach my $uri ("file:///$stored_as", "file:///$path#1") {
        my $mtime = $db->get_doc_attr_by_uri($uri, '@mtime');
        return $mtime if ($mtime);
    }

    return -2;
}

# file()
#
# File::Find "wanted" callback; the entry to examine is taken from $_
# (find() runs with no_chdir, so that is the full path).
#
# Only plain files are indexed. Symlinks (unless --force), Subversion
# metadata, backup files and *.au files are skipped, and so are files whose
# mtime equals the one already stored in the index (unless --force).
# PDF files are converted with pdftotext and stored one document per
# non-blank page as "$path#N"; HTML files, and text files with a known
# extension, are stored as a single document.
#
# Reads the file-level $db, $mm, $pdftotext, $collection, $path_add,
# $force, $verbose and $debug. Dies when a file or pdftotext can't be opened.
sub file {

    my $path = $_;

    return if (! $force && -l $path || $path =~ m#/\.svn(/|$)# || $path =~ m/(~|\.bak)$/);

    # nothing but plain files is ever indexed, so don't stat, look up or
    # type-check directories and special files
    return unless (-f $path);

    my $mtime = (stat($path))[9];
    return unless (defined $mtime);     # vanished while we were looking

    my $mtime_db = _indexed_mtime($path);

    if ($mtime == $mtime_db) {
        print STDERR "# same: $path $mtime\n" if ($verbose);
        return unless($force);
    } else {
        print STDERR "# changed: $path $mtime != $mtime_db\n" if ($debug);
    }

    # skip files on which File::MMagic::XS croaks
    return if ($path =~ m#\.au$#);

    my $type = $mm->checktype_filename($path);
    $type = '' unless (defined $type);
    $type =~ s/\s+/ /gs;

    print STDERR "# $path $type\n" if ($debug);

    if ($pdftotext && $type =~ m/pdf/i) {

        print STDERR "$path {converting}" if ($verbose);

        # list-form pipe open: the file name is handed to pdftotext as-is
        # and never interpreted by a shell
        open(my $pdf, '-|', $pdftotext, '-htmlmeta', $path, '-')
            || die "can't open $pdftotext with '$path'";
        my $html = do { local $/; <$pdf> };
        close($pdf)
            || ($debug && print STDERR "# $pdftotext failed on $path (status $?)\n");

        return if (! $html);

        my $file_only = $path;
        $file_only =~ s/^.*\/([^\/]+)$/$1/g;

        # fallback wrapper for output without the expected <html>...<pre>
        # frame; the page marker is added to the title below
        my ($pre_html,$pages,$post_html) = ("<html><head><title>$path</title></head><body><pre>",$html,'</pre></body></html>');

        ($pre_html,$pages,$post_html) = ($1,$2,$3) if ($html =~ m/^(<html>.+?<pre>)(.+)(<\/pre>.+?)$/si);

        if ($collection) {
            # .*? so that an empty <title></title> is replaced as well
            $pre_html =~ s/<title>.*?<\/title>/<title>$collection :: page ##page_nr##<\/title>/si;
        } else {
            $pre_html =~ s/<title>(.+?)<\/title>/<title>$1 :: page ##page_nr##<\/title>/si ||
            $pre_html =~ s/<title><\/title>/<title>$file_only :: page ##page_nr##<\/title>/si;
        }

        # pdftotext separates pages with form feeds
        my $page_nr = 1;
        foreach my $page (split(/\f/s,$pages)) {
            print STDERR " $page_nr" if ($verbose);
            my $pre_tmp = $pre_html;
            # the template already carries the closing </title>
            $pre_tmp =~ s/##page_nr##/$page_nr/s;
            dump_contents($db, $pre_tmp . $page . $post_html, $mtime, "$path#$page_nr") if ($page !~ m/^\s*$/s);
            $page_nr++;
        }

    } else {

        # return if (! -f $path || ! m/\.(html*|php|pl|txt|info|log|text)$/i);
        return unless ($type =~ m/html/ ||
            ($type =~ m#text# && $path =~ m/\.(php|pl|txt|info|log|text)$/i)
        );

        # skip index files
        return if ($path =~ m/index_[a-z]\.html*/i || $path =~ m/index_symbol\.html*/i);

        # 3-arg open: characters in the file name can't change the open mode
        open(my $in, '<', $path) || die "can't open file: $path: $!";
        print STDERR "$path ($type)" if ($verbose);
        my $contents = do { local $/; <$in> };
        close($in);
        $contents = '' unless (defined $contents);
        $contents .= "\n\n";

        #$contents = filter($contents,$collection);

        # add optional components to path
        $path .= " $path_add" if ($path_add);

        dump_contents($db, $contents, $mtime, $path);
    }

    print STDERR "\n" if ($verbose);
    # die "zero size content in '$path'" if (! $contents);

}
227    

Properties

Name Value
svn:executable *

  ViewVC Help
Powered by ViewVC 1.1.26