1 |
dpavlin |
12 |
#!/usr/bin/perl -w

# Test driver for the stemmer in stem-hr.pm: feeds every input word through
# StemHr::stem, prints the original next to the result and collects
# statistics that are reported once all input has been read.

use strict;
use warnings;

use lib '.';
require 'stem-hr.pm';

# Number of times each rule number (the trailing digits of a stemmed line)
# was seen.
my %rules;

# Number of input words that produced each stem.
my %stem_words;

# Count of processed input lines (comments and blank lines excluded).
my $words = 0;

# Count of lines on which the "<stem> <rule>" pattern matched; reported as
# "ops" in the summary.
my $stems = 0;

# Reference stem of the current blank-line-delimited group of words; the
# empty string means no reference has been recorded yet.
my $last_stem = '';

# Number of stems that disagreed with the reference stem of their group.
my $errors = 0;
14 |
|
|
# Compare a stem with the reference stem of the current group.
#
# The first stem seen after $last_stem was cleared becomes the reference.
# Any later stem that differs from it prints an "ERROR==> " prefix (no
# newline, so it lands in front of the caller's output line) and bumps
# $errors. A missing or false argument is ignored.
sub check_stem {
    my ($candidate) = @_;
    return unless $candidate;

    if (!$last_stem) {
        # No reference yet: remember this one.
        $last_stem = $candidate;
    } elsif ($candidate ne $last_stem) {
        print "ERROR==> ";
        $errors++;
    }
}
25 |
|
|
|
26 |
|
|
# Main loop: read one word per line from the files named on the command
# line (or STDIN), stem it and print the original beside the result.
while (<>) {
    chomp;

    # Comment lines produce no output and are not counted.
    next if /^#/;

    # A blank line ends the current group of words: echo it and drop the
    # reference stem so the next group starts fresh.
    if (/^$/) {
        $last_stem = '';
        print "\n";
        next;
    }

    $words++;
    my $orig = $_;

    # NOTE(review): the returned value is never read; the code below
    # inspects $_ instead, so stem() presumably rewrites its argument in
    # place to "<stem> <rule number>" -- confirm against stem-hr.pm.
    my $stem = StemHr::stem($_);

    if (s/^(.+)\s(\d+)$/$1\t$2/g) {
        # Name the captures right away, before anything can clobber them.
        my ($found_stem, $rule_nr) = ($1, $2);

        $stems++;
        $rules{$rule_nr}++;
        $stem_words{$found_stem}++;
        check_stem($found_stem);
    } else {
        # No trailing rule number: the line itself becomes the reference
        # stem for the words that follow in this group.
        $last_stem = $_;
    }

    printf "%-15s %s\n", $orig, $_;
}
53 |
|
|
# Final report: totals first, then one "#stem" line per distinct stem and
# one "#rule" line per rule number, each prefixed with its usage count.
my $nr_stems = keys(%stem_words);

# Distinct stems as a percentage of input words. Guarded so that empty or
# comment-only input reports 0 instead of dying on a division by zero.
my $size_pct = $words ? $nr_stems * 100 / $words : 0;

printf "\n# %d words, %d stems in %d ops, %.2f%% size [%d errors]\n",
    $words, $nr_stems, $stems, $size_pct, $errors;

# Sorted so that the report is reproducible between runs (hash order is not).
foreach my $s (sort keys %stem_words) {
    print "#stem $stem_words{$s} $s\n";
}

# Rule keys are captured with \d+, so compare them numerically; the default
# string sort would list rule 10 before rule 2.
foreach my $r (sort { $a <=> $b } keys %rules) {
    print "#rule $rules{$r} $r\n";
}