# coding=utf-8
# Copyright 2012 Matt Chaput. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are
# those of the authors and should not be interpreted as representing official
# policies, either expressed or implied, of Matt Chaput.


# Exceptions

class NoStemmer(Exception):
    pass


class NoStopWords(Exception):
    pass


# Data and functions for language names

languages = ("ar", "da", "nl", "en", "fi", "fr", "de", "hu", "it", "no", "pt",
             "ro", "ru", "es", "sv", "tr")

aliases = {
    # By ISO 639-2 three-letter codes
    "ara": "ar",
    "dan": "da", "nld": "nl", "eng": "en", "fin": "fi", "fra": "fr",
    "deu": "de", "hun": "hu", "ita": "it", "nor": "no", "por": "pt",
    "ron": "ro", "rus": "ru", "spa": "es", "swe": "sv", "tur": "tr",
    # By name in English
    "arabic": "ar",
    "danish": "da",
    "dutch": "nl",
    "english": "en",
    "finnish": "fi",
    "french": "fr",
    "german": "de",
    "hungarian": "hu",
    "italian": "it",
    "norwegian": "no",
    "portuguese": "pt",
    "romanian": "ro",
    "russian": "ru",
    "spanish": "es",
    "swedish": "sv",
    "turkish": "tr",

    # By name in own language
    "العربية": "ar",
    "dansk": "da",
    "nederlands": "nl",
    "suomi": "fi",
    "français": "fr",
    "deutsch": "de",
    "magyar": "hu",
    "italiano": "it",
    "norsk": "no",
    "português": "pt",
    "русский язык": "ru",
    "español": "es",
    "svenska": "sv",
    "türkçe": "tr",
}


def two_letter_code(name):
    if name in languages:
        return name
    if name in aliases:
        return aliases[name]
    return None
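
# Usage sketch (illustrative): two_letter_code() normalizes a two-letter code,
# an ISO 639-2 code, or a language name to the canonical two-letter code, and
# returns None for anything it does not recognize.
#
#   >>> two_letter_code("en")
#   'en'
#   >>> two_letter_code("english")
#   'en'
#   >>> two_letter_code("klingon") is None
#   True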


# Getter functions

def has_stemmer(lang):
    try:
        return bool(stemmer_for_language(lang))
    except NoStemmer:
        return False


def has_stopwords(lang):
    try:
        return bool(stopwords_for_language(lang))
    except NoStopWords:
        return False
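
# Usage sketch (illustrative): the predicates above are safe to call with
# arbitrary strings; unknown languages return False rather than raising.
#
#   >>> has_stemmer("de")
#   True
#   >>> has_stopwords("not-a-language")
#   False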


def stemmer_for_language(lang):
    if lang == "en_porter":
        # Original porter stemming algorithm is several times faster than the
        # more correct porter2 algorithm in snowball package
        from .porter import stem as porter_stem
        return porter_stem

    tlc = two_letter_code(lang)

    if tlc == "ar":
        from .isri import ISRIStemmer
        return ISRIStemmer().stem

    from .snowball import classes as snowball_classes
    if tlc in snowball_classes:
        return snowball_classes[tlc]().stem

    raise NoStemmer("No stemmer available for %r" % lang)
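
# Usage sketch (illustrative; assumes the snowball stemmer classes ship with
# this copy of Whoosh): the function returns a callable that stems one word
# at a time.
#
#   >>> stem = stemmer_for_language("english")
#   >>> stem("running")
#   'run'
#
# Passing "en_porter" instead selects the faster original Porter stemmer.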


def stopwords_for_language(lang):
    from .stopwords import stoplists

    tlc = two_letter_code(lang)
    if tlc in stoplists:
        return stoplists[tlc]

    raise NoStopWords("No stop-word list available for %r" % lang)
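
# Usage sketch (illustrative): "en" is among the shipped stop-word lists, so
# the lookup succeeds; an unknown code raises NoStopWords.
#
#   >>> "the" in stopwords_for_language("en")
#   True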