package Lingua::Spelling::Alternative; require 5.001; use strict; use vars qw($VERSION @ISA @EXPORT @EXPORT_OK); use Data::Dumper; use Exporter; $VERSION = '0.10'; @ISA = ('Exporter'); #@EXPORT = qw(); @EXPORT_OK = qw( &alternatives ); my $debug=0; =head1 NAME Lingua::Spelling::Alternative - alternative spelling of a given word in a given language =head1 SYNOPSIS use Lingua::Spelling::Alternative; my $en = new Lingua::Spelling::Alternative; $en->load_affix('/usr/lib/ispell/default.aff') or die $!; print join(" ",$en->alternatives("cars")),"\n"; =head1 DESCRIPTION This module is designed to return all forms of a given word (for example when you want to see all possible forms of some word entered in search engine) which can be generated using affix file (from ispell) or findaffix output file (also part of ispell package) or using OpenOffice.org affix files. =head1 PUBLIC METHODS =head2 new The new() constructor (without parameters) create container for new language. my $en = new Lingua::Spelling::Alternative( debug => 1, min_length => 3, ); Options: =over 4 =item debug Turns debugging which will be spilled on C. =item min_length Minimum word length (by default B<3>) which will be considered for C. =back =cut sub new { my $class = shift; my $self = {@_}; bless($self, $class); $debug = 1 if ($self->{DEBUG} || $self->{debug}); $self->clear; $self->{'min_length'} ||= 3; $self ? return $self : return undef; } =head2 clear This method will clear internal affix and is called every time you use C, C or C on same object. =cut sub clear { my $self = shift; @{$self->{affix}} = (); return 1; } =head2 load_affix Loads ispell's affix file for later usage. It will create internal structures needed for other methods. $en->load_affix('/etc/dictionaries-common/default.aff'); =cut sub load_affix { my $self = shift; my $filename = shift; my ($prefix,$suffix,$combine)=('','',0); print STDERR "reading affix file $filename\n" if ($debug); open (A,$filename) || die "Can't open affix file $filename: $!"; while() { chomp; next if (/^#|^[\s\t\n\r]*$/); if (/^suffixes/i) { ($prefix,$suffix) = ('','$'); next; } next if (! $suffix && ! $prefix); if (/^flag[\s\t]+(\*?)(.):/i) { $combine = $1; print STDERR "pattern $2",($combine && " combine with prefix"),"\n" if ($debug); next; } my ($reg,$sub,$add); if (/^[\s\t]+([^>#\s\t]+)[\s\t]*>[\s\t]*-?([^\,\s\t]+)?,?([^\s\t]+)?/) { ($reg,$sub,$add) = ($1,$2,$3); print STDERR "rule: $_\n" if ($debug); if (! $add) { $add = $sub; $sub = ''; } } else { print STDERR "skip: $_\n"; next; } printf STDERR "adding: /$reg/ -> s/$sub/$add/i\n" if ($debug); push @{$self->{'affix'}}, [ qr/${prefix}${reg}${suffix}/i, qr/^(.+)${sub}${suffix}/i, $add ]; push @{$self->{'affix'}}, [ qr/${prefix}${add}${suffix}/i, qr/^(.+)${add}${suffix}/i, $sub ]; } print STDERR Dumper($self->{'affix'}) if ($debug); # XXX return 1; } =head2 load_findaffix This function loads output of findaffix program from ispell package. This is better idea (if you are creating affix file for particular language yourself or you can get your hands on one) because affix file from ispell is limited to 26 entries (because each entry is denoted by single character). $en->load_findaffix('findaffix.out'); =cut sub load_findaffix { my $self = shift; my $filename = shift; print STDERR "reading findaffix output $filename\n" if ($debug); open (A,$filename) || die "Can't open findaffix output $filename: $!"; while() { chomp; my ($sub,$add,undef,undef) = split(m;/;,$_,4); if ($sub && $add) { push @{$self->{'affix'}}, [ qr/./, qr/^(.+)${sub}$/i, $add ]; } } return 1; } =head2 alternatives Return array of all alternative spellings of particular word(s). It will also return spellings which are not lexically correct if there is rule like that in affix file. print $en->alternatives(qw(cat dog)); print $en->alternatives('demo'); =cut sub alternatives { my $self = shift; my @out; foreach my $word (@_) { # save original word push @out,$word; # skip short words next if (length($word) < $self->{'min_length'}); foreach my $a (@{$self->{'affix'}}) { next if ($word !~ $a->[0]); if ($word =~ $a->[1]) { push @out,lc($1.$a->[2]); print STDERR $word," -> ",$1.$a->[2], " [",$a->[0]," ... ",$a->[1],"]\n" if ($debug); } } } return @out; } =head2 minimal This function returns minimal of all alternatives of a given word(s). It's a poor man's version of normalize (because we don't know grammatic of particular language, just some spelling rules). print $en->minimal('informations'); Special case is when there is only one argument, and result is expecte in scalar context. In this case it will return just minimal length alternative of this word (new in version 0.10). =cut sub minimal { my $self = shift; my @out; my $argc = 0; # argument count foreach my $word (@_) { $argc++; my @alt = $self->alternatives($word); my $minimal = shift @alt; foreach (@alt) { $minimal=$_ if (length($_) < length($minimal)); } push @out, $minimal; } return @out if wantarray; warn "called in scalar context with more than one word" if ($argc > 1); return shift @out; } ############################################################################### 1; __END__ =head1 EXAMPLES Please see the test program in distribution which exercises some aspects of Alternative.pm. =head1 BUGS There are no known bugs. If you find any, please report it in CPAN's request tracker at: http://rt.cpan.org/ =head1 CONTACT AND COPYRIGHT Copyright 2002-2005 Dobrica Pavlinusic (dpavlin@rot13.org). All rights reserved. This program is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =cut