1 |
#!/usr/bin/perl -w |
2 |
|
3 |
use lib '.'; |
4 |
use StemHR; |
5 |
|
6 |
# Tallies filled while reading the input and printed in the final report.
#   %rules       times each trailing number (second capture of the rewritten
#                line) was seen
#   %stem_words  times each stem text (first capture of the rewritten line)
#                was seen
my (%rules, %stem_words);

#   $words  input lines that were neither "#" comments nor empty
#   $stems  lines on which the "<text> <number>" rewrite matched (the report
#           labels this count "ops")
my ($words, $stems) = (0, 0);
10 |
|
11 |
|
12 |
# Reference stem of the group currently being read. The empty string means
# that no reference has been recorded yet.
my $last_stem = '';

# Number of stems that differed from their group's reference stem.
my $errors = 0;

# check_stem($stem)
#
# Compares $stem with the reference stem of the current group.
#
#   * While no reference is recorded, $stem becomes the reference.
#   * Otherwise, a $stem that differs from the reference prints the marker
#     "ERROR==> " (without a newline, so it prefixes whatever is printed
#     next) and increments $errors.
#
# An undefined or empty $stem is ignored. Presence is tested with
# defined/length instead of truthiness so that a stem of "0" is still
# recorded and compared rather than silently dropped.
#
# Reads and updates the file-level $last_stem and $errors. The return value
# carries no meaning.
sub check_stem {
    my ($s) = @_;
    return unless defined $s && length $s;

    # First stem of a group: remember it and stop, there is nothing to
    # compare against yet.
    if (!length $last_stem) {
        $last_stem = $s;
        return;
    }

    if ($last_stem ne $s) {
        print "ERROR==> ";
        $errors++;
    }

    return;
}
25 |
|
26 |
# Main pass over the input (files named on the command line, or STDIN).
#
#   * Lines starting with "#" are skipped without being counted or echoed.
#   * An empty line is echoed as an empty line and clears $last_stem, so the
#     next word starts a new group.
#   * Every other line is counted as a word, handed to StemHR->stem, and
#     echoed as "<original, left-aligned in 15 columns> <current line>".
LINE:
while (<>) {
    chomp;

    next LINE if /^#/;

    if (/^$/) {
        $last_stem = '';
        print "\n";
        next LINE;
    }

    my $orig = $_;
    $words += 1;

    # NOTE(review): the return value is never read, while the substitution
    # below inspects $_. Presumably stem() rewrites its argument in place to
    # "<stem> <rule number>" -- confirm against StemHR before touching this.
    my $stem = StemHR->stem($_);

    if (s/^(.+)\s(\d+)$/$1\t$2/g) {
        # Copy the captures right away so no later regex can clobber them.
        my ($stem_text, $rule_nr) = ($1, $2);

        $stems                  += 1;
        $rules{$rule_nr}        += 1;
        $stem_words{$stem_text} += 1;
        check_stem($stem_text);
    }
    else {
        # No trailing number on the line: the whole line becomes the
        # reference stem without being checked.
        $last_stem = $_;
    }

    printf("%-15s %s\n", $orig, $_);
}
53 |
# _percent_of($part, $whole)
#
# Returns $part as a percentage of $whole. Returns 0 when $whole is zero or
# undefined, so that a run without any word lines still prints its summary
# instead of dying with "Illegal division by zero".
sub _percent_of {
    my ($part, $whole) = @_;
    return $whole ? $part * 100 / $whole : 0;
}

# Final report: one summary line, then one line per distinct stem and one
# line per rule number. Every line starts with "#".
my $nr_stems = keys(%stem_words);
printf "\n# %d words, %d stems in %d ops, %.2f%% size [%d errors]\n",
    $words, $nr_stems, $stems, _percent_of($nr_stems, $words), $errors;

# "#stem <count> <stem>", sorted by stem text so that two runs over the same
# input produce the same report (plain hash order differs between runs).
foreach my $s (sort keys %stem_words) {
    print "#stem $stem_words{$s} $s\n";
}

# "#rule <count> <rule>". Rule keys are digit strings, so they are sorted
# numerically (a string sort would put "10" before "2"); the string
# comparison only breaks ties such as "1" versus "01".
foreach my $r (sort { $a <=> $b or $a cmp $b } keys %rules) {
    print "#rule $rules{$r} $r\n";
}