group-wbl/.venv/lib/python3.13/site-packages/whoosh/lang/snowball/norwegian.py

85 lines
2.7 KiB
Python
Raw Permalink Normal View History

2026-01-09 09:48:03 +08:00
from .bases import _ScandinavianStemmer
from whoosh.compat import u
class NorwegianStemmer(_ScandinavianStemmer):
    """
    Snowball stemmer for the Norwegian language.

    Stemming is done in three suffix-stripping passes. Every pass looks
    for its suffixes only inside the region returned by
    ``_r1_scandinavian`` (called R1 below), never in the whole word.

    :cvar __vowels: Characters treated as vowels when R1 is computed and
        when the letter before a ``k`` is inspected in step 1.
    :type __vowels: unicode
    :cvar __s_ending: Letters after which a word-final ``s`` is removed
        in step 1.
    :type __s_ending: unicode
    :cvar __step1_suffixes: Step 1 suffixes, listed longest first.
    :type __step1_suffixes: tuple
    :cvar __step2_suffixes: Step 2 suffixes; a match loses its last letter.
    :type __step2_suffixes: tuple
    :cvar __step3_suffixes: Step 3 suffixes, listed longest first.
    :type __step3_suffixes: tuple
    :note: The algorithm is described at
           http://snowball.tartarus.org/algorithms/norwegian/stemmer.html
    """

    # a e i o u y plus ae-ligature (E6), a-ring (E5) and o-slash (F8).
    __vowels = u("aeiouy\xE6\xE5\xF8")
    __s_ending = "bcdfghjlmnoprtvyz"
    __step1_suffixes = ("hetenes", "hetene", "hetens", "heter",
                        "heten", "endes", "ande", "ende", "edes",
                        "enes", "erte", "ede", "ane", "ene", "ens",
                        "ers", "ets", "het", "ast", "ert", "en",
                        "ar", "er", "as", "es", "et", "a", "e", "s")
    __step2_suffixes = ("dt", "vt")
    __step3_suffixes = ("hetslov", "eleg", "elig", "elov", "slov",
                        "leg", "eig", "lig", "els", "lov", "ig")

    def stem(self, word):
        """
        Return the stem of a Norwegian word.

        The word is lower-cased first; the three stripping steps are then
        applied in order.

        :param word: The word to stem.
        :type word: str or unicode
        :return: The stemmed, lower-cased form.
        :rtype: unicode
        """
        word = word.lower()
        r1 = self._r1_scandinavian(word, self.__vowels)

        # STEP 1: only the first (i.e. longest) suffix found in R1 is
        # considered, even if its rule ends up leaving the word unchanged.
        hit = next(
            (cand for cand in self.__step1_suffixes if r1.endswith(cand)),
            None,
        )
        if hit is not None:
            cut = len(hit)
            if hit == "s":
                # A final "s" goes only after a valid s-ending letter, or
                # after a "k" that itself follows a non-vowel.
                prev = word[-2]
                removable = prev in self.__s_ending or (
                    prev == "k" and word[-3] not in self.__vowels
                )
                if removable:
                    word = word[:-1]
                    r1 = r1[:-1]
            elif hit in ("erte", "ert"):
                # These two are replaced by "er" rather than deleted.
                word = word[:-cut] + "er"
                r1 = r1[:-cut] + "er"
            else:
                word = word[:-cut]
                r1 = r1[:-cut]

        # STEP 2: "dt"/"vt" at the end of R1 lose their trailing "t".
        if r1.endswith(self.__step2_suffixes):
            word = word[:-1]
            r1 = r1[:-1]

        # STEP 3: drop the first (longest) residual suffix found in R1.
        # R1 is not needed afterwards, so only the word is trimmed.
        tail = next(
            (cand for cand in self.__step3_suffixes if r1.endswith(cand)),
            None,
        )
        if tail is not None:
            word = word[:-len(tail)]

        return word