/[Semantic-Engine]/EPrints/EPrints.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /EPrints/EPrints.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 17 - (hide annotations)
Sat Jun 30 13:46:51 2007 UTC (16 years, 10 months ago) by dpavlin
File size: 2928 byte(s)
move stem to EPrints and make it actually work, completly new
index code (mopefully less nosify in indexed data)
1 dpavlin 1 # Dobrica Pavlinusic, <dpavlin@rot13.org> 06/28/07 23:28:21 CEST
2    
3     package EPrints;
4    
5     use Exporter 'import';
6 dpavlin 4 @EXPORT_OK = qw(_x slogovi);
7 dpavlin 1
8     use Encode qw/from_to decode_utf8 decode/;
9     use Data::Dump qw/dump/;
10 dpavlin 3 use DBI;
11 dpavlin 13 use URI::Escape;
12 dpavlin 17 use Carp qw/confess/;
13 dpavlin 1
14 dpavlin 17 use lib '/home/dpavlin/stem-hr/';
15     use StemHR;
16    
17 dpavlin 1 use strict;
18     use warnings;
19    
20     my $debug = 0;
21    
22     my $connect = "DBI:mysql:dbname=eprints";
23 dpavlin 13 # path to eprints installation
24     my $eprints_archive = '/data/eprints2/archives/ffzg/documents/disk0/';
25 dpavlin 1
26     my $dbh = DBI->connect($connect,"dpavlin","") || die $DBI::errstr;
27    
28     sub dbh {
29     my $self = shift;
30     return $dbh;
31     }
32    
33     my $id;
34    
35     sub id {
36     my $self = shift;
37     if ( defined( $_[0] ) ) {
38     $id = $_[0];
39     warn "# id = $id\n" if $debug;
40     }
41     return $id;
42     }
43    
44     sub lookup {
45     my $self = shift;
46     my $field = shift;
47 dpavlin 11 my $table = shift;
48     my $where = '';
49 dpavlin 1
50 dpavlin 11 if ( ! $table ) {
51     $table = "archive_$field";
52     $where = " and lang = 'hr'";
53     }
54    
55 dpavlin 1 my $sql = qq{
56     SELECT $field
57 dpavlin 11 FROM $table
58     WHERE eprintid = $id $where
59 dpavlin 1 };
60     warn "# sql: $sql\n" if $debug;
61     my @results = map { _x( $_->{$field} ) } @{ $dbh->selectall_arrayref($sql, { Slice => {} }) };
62    
63     warn "# loookup( $field, $id ) = ", dump( @results ),$/ if $debug;
64     return join(" ", @results);
65     }
66    
67 dpavlin 12 sub fulltext {
68     my $self = shift;
69 dpavlin 13 my $fulltext = EPrints->lookup( 'fileinfo', 'archive' );
70     $fulltext =~ s/\s+$//;
71     return split(/;/, $fulltext);
72 dpavlin 12 }
73    
74 dpavlin 13 sub fulltext_content {
75     my $self = shift;
76    
77     my $path = $eprints_archive;
78    
79     my ( $type, $uri ) = EPrints->fulltext;
80     $uri =~ s!http://[^/]+/!!;
81     $uri = uri_unescape($uri);
82     if ( $uri =~ s|^(\d+)/|| ) {
83     my $nr = sprintf("%08d", $1);
84     $nr =~ s!(\d\d)!$1/!g;
85     $path .= "/$nr/$uri";
86     } else {
87     warn "can't find ID in $uri";
88     return;
89     }
90     $path =~ s!//+!/!g;
91     if ( -r $path ) {
92     print "+ $path ", -s $path, " bytes\n";
93     open(my $pdf, "pdftotext $path - | iconv -f utf-8 -t iso-8859-2 -c |") || die "can't open pdftotext $path: $!";
94     local $/;
95     my $content = <$pdf>;
96     print "\t>>", length( $content ), " text bytes\n";
97     close($pdf); # || die "can't close $path: $!";
98     return $content;
99     } else {
100     warn "ERROR: $path: $!\n";
101     }
102    
103     }
104    
105 dpavlin 1 sub _x {
106     my $v = join(" ", @_);
107     decode_utf8( $v );
108     from_to( $v, 'utf-8', 'iso-8859-2' );
109     warn "_x($v)\n" if $debug;
110     return "$v ";
111     }
112    
113 dpavlin 4 sub slogovi {
114 dpavlin 14 my $self = shift;
115 dpavlin 17 my $text = shift || confess "no text?";
116 dpavlin 6
117 dpavlin 17 my $count = 3;
118 dpavlin 6 my $out = '';
119    
120 dpavlin 4 foreach my $w ( split(/\W*\s+\W*/, $text ) ) {
121 dpavlin 6 warn "w: $w\n" if $debug;
122     my @s;
123 dpavlin 4 while ( $w =~ s/^([^aeiou]*[aeiou])//i ) {
124 dpavlin 6 push @s, $1;
125 dpavlin 4 }
126 dpavlin 6 push @s, $w if $w;
127 dpavlin 4 warn "slogovi = ", dump( @s ), $/ if $debug;
128 dpavlin 6 foreach my $p ( 0 .. ( $#s - $count + 1 ) ) {
129     map { $out .= $s[ $p + $_ ] } 0 .. $count - 1;
130     $out .= ' ';
131     }
132 dpavlin 4 }
133 dpavlin 6 warn "$out\n" if $debug;
134     return $out;
135 dpavlin 4 }
136    
137 dpavlin 17 sub stem {
138     my $self = shift;
139     my $text = shift || confess "no text?";
140    
141     my $body = '';
142     foreach my $w ( split(/\W*\s+\W*/, $text ) ) {
143     $body .= StemHR->stem( $w ) . ' ';
144     }
145    
146     return $body;
147     }
148    
149 dpavlin 1 1;

  ViewVC Help
Powered by ViewVC 1.1.26