# Revision 16 (by dpavlin, 2007/06/28 10:38:34):
# new calling StemHR->stem('word')
package StemHR;
#
# Croatian stemmer
#
# promjenjive:
# - imenice
# - pridjevi
# - brojevi
# - zamjenice
# - prilozi
# - glagoli
#
# nepromjenjive:
# - prijedlozi
# - veznici
# - čestice
# - uzvici
#
use strict;
use locale;
use Memoize;
#memoize('stem');
# Rewrite the consonants of a fragment to k/g/h and glue it between two
# other pieces of text.
#
# Arguments:
#   $pre     - text placed verbatim in front of the rewritten fragment
#   $replace - fragment in which every c/č becomes k, every z/ž becomes g
#              and every s/š becomes h
#   $post    - text placed verbatim after the rewritten fragment
#
# Returns $pre, the rewritten fragment and $post joined in that order.
# (Presumably this restores an original k/g/h stem consonant; stem() calls
# it from its "kgh" rules.)
sub kgh {
    my ( $pre, $replace, $post ) = @_;

    # Applied strictly in this order: k-group, g-group, h-group.
    my @mapping = (
        [ qr/[cč]/, 'k' ],
        [ qr/[zž]/, 'g' ],
        [ qr/[sš]/, 'h' ],
    );
    for my $pair (@mapping) {
        my ( $from, $to ) = @{$pair};
        $replace =~ s/$from/$to/g;
    }

    return join '', $pre, $replace, $post;
}
# Pattern fragments interpolated into the stemming rules of stem().
# vowels
my $sa = '[aeiou]';
# consonants (literally: any character that is not one of the five vowels)
my $su = '[^aeiou]';
# palatals; deliberately a capturing group, the rules read it back via $1/$2.
# NOTE(review): the list used to contain "ž" twice. The duplicate is dropped
# (matching is unaffected); possibly another letter was meant -- confirm.
my $palatal = '(lj|nj|j|ć|ž|š|št|žd)';
# verb classes (not referenced by any rule in this file)
my $g_1r = '[td]';
my $g_2r = '[sz]';
my $g_3r = '[pb]';
my $g_4r = '[kgh]';
my $g_5r = '[nm]';
# non-capturing group; this used to read '(:?nu|n)', i.e. "optional colon"
my $g_6r = '(?:nu|n)';

# Stem a single Croatian word.
#
# Call as a class method: StemHR->stem('word').
#
# Arguments:
#   $self - must be the string 'StemHR'; anything else dies
#   word  - the word to stem; a false value (undef, '', '0') makes the sub
#           return nothing
#
# Returns the word rewritten by the first rule that matches, or the word
# unchanged when no rule matches.
#
# Every rule is a substitution anchored at the end of the word. It rewrites
# the ending and appends " <rule number>" as a tag; the verb rules also put
# a "." at the end of the root. Rules are tried in order and the first match
# wins. The second group of rules is tried only when nothing in the first
# group matched. At the end the tag and all dots are stripped again.
sub stem {
    my $self = shift;
    die "call with StemHR->stem('word') $self" unless $self eq 'StemHR';
    my $w = shift || return;

    unless (
        # infinitive: only mark the root, the ending itself is kept
        $w =~ s/(\w)(ti|ći)$/$1.$2 100/g ||
        # verb class 1 ("ijasmo" was misspelled "ijasno" and never matched)
        $w =~ s/([^sk])[td](em|eš|e|emo|ete|oh|osmo|oste|oše|ijah|ijaše|ijasmo|ijaste|ijahu|imo|ite|en|ena|eni)$/$1s.ti 101/g ||
        # verb class 2
        $w =~ s/(${sa})[sz](em|eš|e|ešemo|emo|ete|u|oh|e|osmo|oste|oše|ijah|ijaše|ijasmo|ijaste|ijahu|imo|ite|ući|avši|ao|la|lo|li|le|la|en|ena|eni)$/$1s.ti 102/ ||
        # verb class 3; the ending list must not contain an empty
        # alternative, otherwise every word ending in p/b gets "sti" appended
        $w =~ s/(p|b|sp|su)(em|eš|e|emo|ete|u|oh|osmo|oste|oše|ah|aše|asmo|aste|ahu|ijah|ijaše|ijahu|i|imo|ite|ući|avši|ao|la|lo|en|ena|eni)$/$1s.ti 103/ ||
        # verb class 4
        $w =~ s/[čžš](em|eš|e|emo|ete|u|ah|ahu|en|ena)$/.ći 104/g ||
        $w =~ s/[k](oh|osmo|oste|oše|ući|avši|ao|la|lo)$/.ći 105/g ||
        $w =~ s/[c](ijah|ijaše|ijasmo|ijaste|ijahu|i|imo|ite)$/.ći 106/g ||
        $w =~ s/[g](nuti|oh|nuh|nu|avši|nuvši|ao|nuo|nem|neš|ni|imo|nut|nimo)$/.ći 107/g ||
        # nouns
        # type a
        $w =~ s/(${su}st)a$/$1 13/g ||
        $w =~ s/(${su})c[ae]$/$1ce 17/g ||
        # kgh: the consonant in front of the ending is mapped through kgh()
        $w =~ s/(\w${sa})([čžšczs])(i|e|ima)$/kgh($1,$2,' 1')/gex ||
        $w =~ s/(${sa}[kgh])(a|u|om)$/$1 2/g ||
        # same, but an "a" is inserted in front of the mapped consonant
        $w =~ s/(${su})([čžšczs])(i|e|ima)$/kgh($1,"a$2",' 3')/gex ||
        # nouns ending in a palatal
        $w =~ s/${palatal}${palatal}(a|u|em|i|ima|e)$/$1a$2 9/g ||
        $w =~ s/${palatal}(a|u|em)$/$1 8/g ||
        # fleeting a; this used to be written a{$su}, which interpolates to
        # a{[^aeiou]} and therefore asked for literal braces, and it lacked
        # the end-of-word anchor all other rules have
        $w =~ s/(${su}a${su})a$/$1 4/g ||
        # consonant cluster before the ending: re-insert the fleeting a
        $w =~ s/(${su})(${su})(a|u|i|e|om|ima)$/$1a$2 5/g
    ) {
        # type a
        $w =~ s/me$/me 11/g ||
        $w =~ s/(eta|ena)$/e 12/g ||
        $w =~ s/(\w${sa}${su})(o|e|a|u|om|em|i|ima|ina|eta)$/$1 7/g ||
        $w =~ s/(${su})sa$/$1as 14/g ||
        # NOTE(review): the next two rules cannot fire, rule 12 above
        # already matches every word ending in "ena"/"eta"
        $w =~ s/(${su})ena$/$1e 16/g ||
        $w =~ s/(${su})eta$/$1e 17/g ||
        $w =~ s/(${su})([oe])$/$1$2 18/g ||
        0;
    }

    # remove the rule number
    $w =~ s/\s\d+$//;
    # remove the dot that marks the root of the word
    $w =~ s/\.//g;

    return $w;
}
1;