Метод sent_tokenize библиотеки NLTK отказывается работать
Необходимая библиотека установлена и подключена:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
Прекрасно работает функция word_tokenize:
text_words = "Hello! How are you?"
words = word_tokenize(text=text_words, preserve_line=True)
print(words)
#['Hello', '!', 'How', 'are', 'you', '?']
А вот как выглядит вывод у sent_tokenize:
text_sents = "Hello! How are you?"
sents = sent_tokenize(text=text_sents, language="english")
print(sents)
> { "name": "LookupError", "message": "
> ********************************************************************** Resource punkt_tab not found. Please use the NLTK Downloader to
> obtain the resource:
>
> >>> import nltk
> >>> nltk.download('punkt_tab')
> For more information see: https://www.nltk.org/data.html
>
> Attempted to load tokenizers/punkt_tab/english/
>
> Searched in:
> - 'C:\\\\Users\\\\bem54/nltk_data'
> - 'd:\\\\Anaconda\\\\envs\\\\Neiro\\\\nltk_data'
> - 'd:\\\\Anaconda\\\\envs\\\\Neiro\\\\share\\\\nltk_data'
> - 'd:\\\\Anaconda\\\\envs\\\\Neiro\\\\lib\\\\nltk_data'
> - 'C:\\\\Users\\\\bem54\\\\AppData\\\\Roaming\\\\nltk_data'
> - 'C:\\\\nltk_data'
> - 'D:\\\\nltk_data'
> - 'E:\\\\nltk_data'
> ********************************************************************** ", "stack":
> "--------------------------------------------------------------------------- LookupError Traceback (most recent call
> last) Cell In[87], line 2
> 1 text_sents = \"Hello! How are you?\"
> ----> 2 sents = sent_tokenize(text=text_sents, language=\"english\")
> 3 print(sents)
>
> File d:\\Anaconda\\envs\\Neiro\\Lib\\site-packages\\nltk\\tokenize\\__init__.py:119, in sent_tokenize(text, language)
> 109 def sent_tokenize(text, language=\"english\"):
> 110 \"\"\"
> 111 Return a sentence-tokenized copy of *text*,
> 112 using NLTK's recommended sentence tokenizer (...)
> 117 :param language: the model name in the Punkt corpus
> 118 \"\"\"
> --> 119 tokenizer = _get_punkt_tokenizer(language)
> 120 return tokenizer.tokenize(text)
>
> File d:\\Anaconda\\envs\\Neiro\\Lib\\site-packages\\nltk\\tokenize\\__init__.py:105, in _get_punkt_tokenizer(language)
> 96 @functools.lru_cache
> 97 def _get_punkt_tokenizer(language=\"english\"):
> 98 \"\"\"
> 99 A constructor for the PunktTokenizer that utilizes
> 100 a lru cache for performance. (...)
> 103 :type language: str
> 104 \"\"\"
> --> 105 return PunktTokenizer(language)
>
> File d:\\Anaconda\\envs\\Neiro\\Lib\\site-packages\\nltk\\tokenize\\punkt.py:1744, in PunktTokenizer.__init__(self, lang)
> 1742 def __init__(self, lang=\"english\"): 1743
> PunktSentenceTokenizer.__init__(self)
> -> 1744 self.load_lang(lang)
>
> File d:\\Anaconda\\envs\\Neiro\\Lib\\site-packages\\nltk\\tokenize\\punkt.py:1749, in PunktTokenizer.load_lang(self, lang)
> 1746 def load_lang(self, lang=\"english\"): 1747 from nltk.data
> import find
> -> 1749 lang_dir = find(f\"tokenizers/punkt_tab/{lang}/\") 1750 self._params = load_punkt_params(lang_dir) 1751 self._lang =
> lang
>
> File d:\\Anaconda\\envs\\Neiro\\Lib\\site-packages\\nltk\\data.py:579,
> in find(resource_name, paths)
> 577 sep = \"*\" * 70
> 578 resource_not_found = f\"\\n{sep}\\n{msg}\\n{sep}\\n\"
> --> 579 raise LookupError(resource_not_found)
>
> LookupError:
> ********************************************************************** Resource punkt_tab not found. Please use the NLTK Downloader to
> obtain the resource:
>
> >>> import nltk
> >>> nltk.download('punkt_tab')
> For more information see: https://www.nltk.org/data.html
>
> Attempted to load tokenizers/punkt_tab/english/
>
> Searched in:
> - 'C:\\\\Users\\\\bem54/nltk_data'
> - 'd:\\\\Anaconda\\\\envs\\\\Neiro\\\\nltk_data'
> - 'd:\\\\Anaconda\\\\envs\\\\Neiro\\\\share\\\\nltk_data'
> - 'd:\\\\Anaconda\\\\envs\\\\Neiro\\\\lib\\\\nltk_data'
> - 'C:\\\\Users\\\\bem54\\\\AppData\\\\Roaming\\\\nltk_data'
> - 'C:\\\\nltk_data'
> - 'D:\\\\nltk_data'
> - 'E:\\\\nltk_data'
> ********************************************************************** " }
Изображение текста ошибки (может быть выйдет нагляднее)
Также уточню, что использую дистрибутив Anaconda и платформу Jupyter Notebook.
Ответы (1 шт):
Автор решения: Toporka
→ Ссылка
Решил собственную проблему одним незамысловатым способом, а именно этой строчкой кода:
nltk.download('all')
То есть, скачав все пакеты.