/[stem-hr]/StemHR.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /StemHR.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 16 - (show annotations)
Thu Jun 28 10:38:34 2007 UTC (12 years, 4 months ago) by dpavlin
File size: 2540 byte(s)
new calling StemHR->stem('word')

1 package StemHR;
2
3 #
4 # Croatian stemmer
5 #
6
7 # promjenjive:
8 # - imenice
9 # - pridjevi
10 # - brojevi
11 # - zamjenice
12 # - prilozi
13 # - glagoli
14 #
15 # nepromjenjive:
16 # - prijedlozi
17 # - veznici
18 # - èestice
19 # - uzvici
20 #
21
22 use strict;
23 use locale;
24
25 use Memoize;
26 #memoize('stem');
27
28 sub kgh {
29 my ($pre,$replace,$post) = @_;
30 $replace =~ s/[cè]/k/g;
31 $replace =~ s/[z¾]/g/g;
32 $replace =~ s/[s¹]/h/g;
33 return $pre . $replace . $post;
34 }
35
36 # samoglasnici
37 my $sa = '[aeiou]';
38 # suglasnici
39 my $su = '[^aeiou]';
40
41 my $palatal = '(lj|nj|j|æ|¾|¹|¾|¹t|¾d)';
42
43 # glagolni razredi
44 my $g_1r = '[td]';
45 my $g_2r = '[sz]';
46 my $g_3r = '[pb]';
47 my $g_4r = '[kgh]';
48 my $g_5r = '[nm]';
49 my $g_6r = '(:?nu|n)';
50
51 sub stem {
52 my $self = shift;
53 die "call with StemHR->stem('word') $self" unless $self eq 'StemHR';
54
55 my $w = shift || return;
56
57 unless (
58 # infinitiv
59 $w =~ s/(\w)(ti|æi)$/$1.$2 100/g ||
60 # 1. razred
61 $w =~ s/([^sk])[td](em|e¹|e|emo|ete|oh|osmo|oste|o¹e|ijah|ija¹e|ijasno|ijaste|ijahu|imo|ite|en|ena|eni)$/$1s.ti 101/g ||
62 # 2. razred
63 $w =~ s/(${sa})[sz](em|e¹|e|e¹emo|emo|ete|u|oh|e|osmo|oste|o¹e|ijah|ija¹e|ijasmo|ijaste|ijahu|imo|ite|uæi|av¹i|ao|la|lo|li|le|la|en|ena|eni)$/$1s.ti 102/ ||
64 # 3. razred
65 $w =~ s/(p|b|sp|su)(em|e¹|e|emo|ete|u|oh|osmo|oste|o¹e|ah|a¹e|asmo|aste|ahu|ijah|ija¹e|ijahu|i|imo|ite||uæi|av¹i|ao|la|lo|en|ena|eni)$/$1s.ti 103/ ||
66 # 4. razred
67 $w =~ s/[è¾¹](em|e¹|e|emo|ete|u|ah|ahu|en|ena)$/.æi 104/g ||
68 $w =~ s/[k](oh|osmo|oste|o¹e|uæi|av¹i|ao|la|lo)$/.æi 105/g ||
69 $w =~ s/[c](ijah|ija¹e|ijasmo|ijaste|ijahu|i|imo|ite)$/.æi 106/g ||
70 $w =~ s/[g](nuti|oh|nuh|nu|av¹i|nuv¹i|ao|nuo|nem|ne¹|ni|imo|nut|nimo)$/.æi 107/g ||
71
72 # imenice
73
74 # vrsta a
75 $w =~ s/(${su}st)a$/$1 13/g ||
76 $w =~ s/(${su})c[ae]$/$1ce 17/g ||
77 # kgh
78 $w =~ s/(\w${sa})([è¾¹czs])(i|e|ima)$/kgh($1,$2,' 1')/gex ||
79 $w =~ s/(${sa}[kgh])(a|u|om)$/$1 2/g ||
80 $w =~ s/(${su})([è¾¹czs])(i|e|ima)$/kgh($1,"a$2",' 3')/gex ||
81 # imenice na palatal
82 $w =~ s/${palatal}${palatal}(a|u|em|i|ima|e)$/$1a$2 9/g ||
83 $w =~ s/${palatal}(a|u|em)$/$1 8/g ||
84 # nepostojano a
85 $w =~ s/(${su}a{$su})a/$1 4/g ||
86 $w =~ s/(${su})(${su})(a|u|i|e|om|ima)$/$1a$2 5/g
87 ) {
88
89 # vrsta a
90 $w =~ s/me$/me 11/g ||
91 $w =~ s/(eta|ena)$/e 12/g ||
92
93 $w =~ s/(\w${sa}${su})(o|e|a|u|om|em|i|ima|ina|eta)$/$1 7/g ||
94
95 $w =~ s/(${su})sa$/$1as 14/g ||
96 $w =~ s/(${su})ena$/$1e 16/g ||
97 $w =~ s/(${su})eta$/$1e 17/g ||
98 $w =~ s/(${su})([oe])$/$1$2 18/g ||
99
100 0;
101
102 }
103
104 # makni broj pravila
105 $w =~ s/\s\d+$//;
106 # makni toèku koja oznaèava korjen rijeèi
107 $w =~ s/\.//g;
108
109 return $w;
110 }
111
112 1;

Properties

Name Value
svn:executable

  ViewVC Help
Powered by ViewVC 1.1.26