206 lines
8.0 KiB
Python
206 lines
8.0 KiB
Python
from .bases import _StandardStemmer
|
|
|
|
from whoosh.compat import u
|
|
|
|
|
|
class PortugueseStemmer(_StandardStemmer):
    """
    The Portuguese Snowball stemmer.

    :cvar __vowels: The Portuguese vowels.
    :type __vowels: unicode
    :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm.
    :type __step1_suffixes: tuple
    :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm.
    :type __step2_suffixes: tuple
    :cvar __step4_suffixes: Suffixes to be deleted in step 4 of the algorithm.
    :type __step4_suffixes: tuple
    :note: A detailed description of the Portuguese
           stemming algorithm can be found under
           http://snowball.tartarus.org/algorithms/portuguese/stemmer.html

    """

    __vowels = u("aeiou\xE1\xE9\xED\xF3\xFA\xE2\xEA\xF4")

    # Each suffix tuple lists longer suffixes before shorter ones; the loops
    # in stem() stop at the first match, so the longest suffix wins.
    # Nasal vowels appear here in their two-character working form
    # ("a~", "o~") because stem() rewrites them before matching.
    # NOTE(review): 'uciones' / 'uci\xF3n' look like Spanish forms; the
    # current Snowball Portuguese description appears to use the
    # Portuguese equivalents instead -- confirm before changing, since
    # that would alter stems in existing indexes.
    __step1_suffixes = ('amentos', 'imentos', 'uciones', 'amento',
                        'imento', 'adoras', 'adores', u('a\xE7o~es'),
                        u('log\xEDas'), u('\xEAncias'), 'amente',
                        'idades', 'ismos', 'istas', 'adora',
                        u('a\xE7a~o'), 'antes', u('\xE2ncia'),
                        u('log\xEDa'), u('uci\xF3n'), u('\xEAncia'),
                        'mente', 'idade', 'ezas', 'icos', 'icas',
                        'ismo', u('\xE1vel'), u('\xEDvel'), 'ista',
                        'osos', 'osas', 'ador', 'ante', 'ivas',
                        'ivos', 'iras', 'eza', 'ico', 'ica',
                        'oso', 'osa', 'iva', 'ivo', 'ira')
    __step2_suffixes = (u('ar\xEDamos'), u('er\xEDamos'), u('ir\xEDamos'),
                        u('\xE1ssemos'), u('\xEAssemos'), u('\xEDssemos'),
                        u('ar\xEDeis'), u('er\xEDeis'), u('ir\xEDeis'),
                        u('\xE1sseis'), u('\xE9sseis'), u('\xEDsseis'),
                        u('\xE1ramos'), u('\xE9ramos'), u('\xEDramos'),
                        u('\xE1vamos'), 'aremos', 'eremos', 'iremos',
                        'ariam', 'eriam', 'iriam', 'assem', 'essem',
                        'issem', 'ara~o', 'era~o', 'ira~o', 'arias',
                        'erias', 'irias', 'ardes', 'erdes', 'irdes',
                        'asses', 'esses', 'isses', 'astes', 'estes',
                        'istes', u('\xE1reis'), 'areis', u('\xE9reis'),
                        'ereis', u('\xEDreis'), 'ireis', u('\xE1veis'),
                        u('\xEDamos'), 'armos', 'ermos', 'irmos',
                        'aria', 'eria', 'iria', 'asse', 'esse',
                        'isse', 'aste', 'este', 'iste', 'arei',
                        'erei', 'irei', 'aram', 'eram', 'iram',
                        'avam', 'arem', 'erem', 'irem',
                        'ando', 'endo', 'indo', 'adas', 'idas',
                        u('ar\xE1s'), 'aras', u('er\xE1s'), 'eras',
                        u('ir\xE1s'), 'avas', 'ares', 'eres', 'ires',
                        u('\xEDeis'), 'ados', 'idos', u('\xE1mos'),
                        'amos', 'emos', 'imos', 'iras', 'ada', 'ida',
                        u('ar\xE1'), 'ara', u('er\xE1'), 'era',
                        u('ir\xE1'), 'ava', 'iam', 'ado', 'ido',
                        'ias', 'ais', 'eis', 'ira', 'ia', 'ei', 'am',
                        'em', 'ar', 'er', 'ir', 'as',
                        'es', 'is', 'eu', 'iu', 'ou')
    __step4_suffixes = ("os", "a", "i", "o", u("\xE1"),
                        u("\xED"), u("\xF3"))

    def stem(self, word):
        """
        Stem a Portuguese word and return the stemmed form.

        The word is lower-cased, the nasal vowels \\xE3 / \\xF5 are rewritten
        to the two-character forms "a~" / "o~" for the duration of the
        algorithm, the five suffix-stripping steps are applied, and the
        nasal vowels are restored before returning.

        :param word: The word that is stemmed.
        :type word: str or unicode
        :return: The stemmed form.
        :rtype: unicode

        """
        word = word.lower()

        step1_success = False
        step2_success = False

        # Work on an ASCII-tilde spelling of the nasal vowels so that the
        # suffix tables (which use "a~" / "o~") can match them.
        word = (word.replace(u("\xE3"), "a~")
                    .replace(u("\xF5"), "o~"))

        # r1, r2 and rv are the trailing regions of the word computed by the
        # base-class helpers; they are trimmed in lock-step with ``word``.
        r1, r2 = self._r1r2_standard(word, self.__vowels)
        rv = self._rv_standard(word, self.__vowels)

        # STEP 1: Standard suffix removal
        for suffix in self.__step1_suffixes:
            if word.endswith(suffix):
                if suffix == "amente" and r1.endswith(suffix):
                    step1_success = True

                    word = word[:-6]
                    r2 = r2[:-6]
                    rv = rv[:-6]

                    if r2.endswith("iv"):
                        word = word[:-2]
                        r2 = r2[:-2]
                        rv = rv[:-2]

                        if r2.endswith("at"):
                            word = word[:-2]
                            rv = rv[:-2]

                    elif r2.endswith(("os", "ic", "ad")):
                        word = word[:-2]
                        rv = rv[:-2]

                elif (suffix in ("ira", "iras") and rv.endswith(suffix) and
                      word[-len(suffix) - 1:-len(suffix)] == "e"):
                    # "...eira(s)" -> "...eir"
                    step1_success = True

                    word = "".join((word[:-len(suffix)], "ir"))
                    rv = "".join((rv[:-len(suffix)], "ir"))

                elif r2.endswith(suffix):
                    step1_success = True

                    if suffix in (u("log\xEDa"), u("log\xEDas")):
                        # Replace the whole suffix with "log".  Stripping a
                        # fixed two characters is only right for the
                        # singular; the plural would be left ending in
                        # "log\xED".
                        word = "".join((word[:-len(suffix)], "log"))
                        rv = "".join((rv[:-len(suffix)], "log"))

                    elif suffix in (u("uci\xF3n"), "uciones"):
                        word = "".join((word[:-len(suffix)], "u"))
                        rv = "".join((rv[:-len(suffix)], "u"))

                    elif suffix in (u("\xEAncia"), u("\xEAncias")):
                        word = "".join((word[:-len(suffix)], "ente"))
                        rv = "".join((rv[:-len(suffix)], "ente"))

                    elif suffix == "mente":
                        word = word[:-5]
                        r2 = r2[:-5]
                        rv = rv[:-5]

                        if r2.endswith(("ante", "avel", u("\xEDvel"))):
                            word = word[:-4]
                            rv = rv[:-4]

                    elif suffix in ("idade", "idades"):
                        word = word[:-len(suffix)]
                        r2 = r2[:-len(suffix)]
                        rv = rv[:-len(suffix)]

                        if r2.endswith(("ic", "iv")):
                            word = word[:-2]
                            rv = rv[:-2]

                        elif r2.endswith("abil"):
                            word = word[:-4]
                            rv = rv[:-4]

                    elif suffix in ("iva", "ivo", "ivas", "ivos"):
                        word = word[:-len(suffix)]
                        r2 = r2[:-len(suffix)]
                        rv = rv[:-len(suffix)]

                        if r2.endswith("at"):
                            word = word[:-2]
                            rv = rv[:-2]
                    else:
                        # Every other step-1 suffix is simply deleted.
                        word = word[:-len(suffix)]
                        rv = rv[:-len(suffix)]
                # Only the first (longest) suffix the word ends with is
                # considered, whether or not it was actually removed.
                break

        # STEP 2: Verb suffixes (only if step 1 removed nothing)
        if not step1_success:
            for suffix in self.__step2_suffixes:
                if rv.endswith(suffix):
                    step2_success = True

                    word = word[:-len(suffix)]
                    rv = rv[:-len(suffix)]
                    break

        # STEP 3: drop a trailing "i" in rv when it is preceded by "c"
        if step1_success or step2_success:
            # Slice instead of index so a too-short word cannot raise.
            if rv.endswith("i") and word[-2:-1] == "c":
                word = word[:-1]
                rv = rv[:-1]

        ### STEP 4: Residual suffix (only if steps 1 and 2 removed nothing)
        if not step1_success and not step2_success:
            for suffix in self.__step4_suffixes:
                if rv.endswith(suffix):
                    word = word[:-len(suffix)]
                    rv = rv[:-len(suffix)]
                    break

        # STEP 5: trailing e / \xE9 / \xEA, or cedilla removal
        if rv.endswith(("e", u("\xE9"), u("\xEA"))):
            word = word[:-1]
            rv = rv[:-1]

            # After removing the final vowel, "gu" -> "g" and "ci" -> "c"
            # when the u / i itself lies in rv.
            if ((word.endswith("gu") and rv.endswith("u")) or
                    (word.endswith("ci") and rv.endswith("i"))):
                word = word[:-1]

        elif word.endswith(u("\xE7")):
            word = "".join((word[:-1], "c"))

        # Restore the nasal vowels that were rewritten at the start.
        word = word.replace("a~", u("\xE3")).replace("o~", u("\xF5"))
        return word
|