1 |
#!/usr/bin/env python |
2 |
# vim:fileencoding=utf-8 |
3 |
|
4 |
import HyperEstraier |
5 |
|
6 |
import os |
7 |
|
8 |
class HEGatherer:
    """Feed the files found under a directory tree into a Hyper Estraier index.

    The index handle is kept in ``self.db``; it is opened in ``__init__``
    with the ``DBWRITER | DBCREAT`` flags and stays open until ``close()``
    is called.
    """

    def __init__(self, dbpath):
        """Open the index located at *dbpath* for writing."""
        self.db = HyperEstraier.Database()
        self.db.open(dbpath, HyperEstraier.Database.DBWRITER | HyperEstraier.Database.DBCREAT)

    def close(self):
        """Close the index handle; calling it again is a no-op.

        NOTE(review): relies on the binding exposing ``Database.close()``,
        as the Hyper Estraier native APIs do -- confirm for this binding.
        """
        db, self.db = self.db, None
        if db is not None:
            db.close()

    def _put_doc(self, fname):
        """Register the single file *fname* as one document.

        The path is echoed to stdout, stored as both the ``@uri``
        (prefixed with ``file://``) and ``@title`` attributes, and the file
        content is decoded as ISO-2022-JP (undecodable bytes are dropped)
        and handed to the index as UTF-8 text.
        """
        print(fname)

        doc = HyperEstraier.Document()

        doc.add_attr('@uri', "file://" + fname)
        doc.add_attr('@title', fname)

        # Read raw bytes and close the handle deterministically: one leaked
        # descriptor per file adds up quickly when walking a large tree.
        with open(fname, 'rb') as src:
            raw = src.read()
        text = raw.decode('iso-2022-jp', 'ignore')
        doc.add_text(text.encode('utf-8'))

        self.db.put_doc(doc, HyperEstraier.Database.PDCLEAN)

    def put_dir(self, dirname):
        """Walk *dirname* recursively and register every file found."""
        for root, dirs, files in os.walk(dirname):
            for fname in files:
                # os.path.join avoids the doubled separator the old
                # "%s/%s" formatting produced for roots ending in one.
                self._put_doc(os.path.join(root, fname))
30 |
|
31 |
# Command-line entry point: index every file under <dirname> into <dbname>.
if __name__ == '__main__':
    import sys

    if len(sys.argv) != 3:
        sys.stderr.write("%s dbname dirname\n" % sys.argv[0])
        # A usage error must not report success to the calling shell.
        sys.exit(1)

    dbname, dirname = sys.argv[1:3]

    heg = HEGatherer(dbname)
    try:
        heg.put_dir(dirname)
    finally:
        # Close the writer handle even if indexing fails part-way.
        # NOTE(review): assumes the binding provides Database.close(), as
        # the Hyper Estraier native APIs do -- confirm for this binding.
        heg.db.close()
43 |
|