TensorFlow chatbot outputs gibberish
I'm building a generative seq2seq chatbot. I've already written the code and checked it several times, but it still produces gibberish: to any user input it replies "pad pad pad pad". Here pad is the placeholder used to pad the phrases in the training set. That placeholder is in the chatbot's vocabulary and in the training data, so in principle the model is allowed to use it in a conversation, but it never produces anything except pad and the end tag that marks the end of a reply. How do I fix this? During training the network reached an accuracy of 84%.
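To show what I mean by the placeholder, here is a toy illustration (it is not part of the bot; the phrases are made up): the tokenizer reserves index 0, I map that index to 'pad', and pad_sequences fills everything after the real words with zeros.

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

toy_tokenizer = Tokenizer(char_level=False, lower=True)
toy_tokenizer.fit_on_texts(['start', 'end', 'привет как дела'])
toy_tokenizer.index_word[0] = 'pad'  # the same trick as in the full code below
seq = toy_tokenizer.texts_to_sequences(['start привет как дела end'])
print(pad_sequences(seq, 17, padding='post', truncating='post'))
# every position after the real words is 0, i.e. 'pad'

Here is the full code: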
from keras.layers import LSTM, Input, Embedding, Dense
from keras.models import Model
from keras.optimizers import Adam
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from nltk import word_tokenize
from keras.preprocessing.text import Tokenizer
from datasets import load_dataset
import ast
from itertools import chain
import numpy as np
characters = ['.', ',', '?', '!', '-', '=', '+', '-', '*', '^',
              '#', '@', "'", '"', ':', ';', '[', ']', '{', '}',
              '%', '$', '&', '~', '`', '/', '№', '«', '»', '\n']
class Generative_chatbot:
    def __init__(self, max_phrase_len):
        self.max_phrase_len = max_phrase_len + 2
        self.neurons_num = 400
    def data_normalize(self, dataset_easy, dialogues):
        def normalize_normal_dataset():
            dataset = load_dataset("Den4ikAI/russian_dialogues_2")  # load the dataset
            dataset = [dataset['train'][n]['sample'] for n in range(dialogues)]  # pick out the dialogues
            dataset = [ast.literal_eval(dialogue) for dialogue in dataset]  # parse the strangely stored lists, "[1, 5, 0]" -> [1, 5, 0]
            patterns_ = [dialogue[::2] for dialogue in dataset]  # extract the prompts
            responses_ = [dialogue[1::2] for dialogue in dataset]  # extract the replies
            # trim prompts and replies to the same length
            patterns = []
            responses = []
            for pattern, response in zip(patterns_, responses_):
                min_len = min(len(pattern), len(response))
                if min_len == len(pattern):
                    response = response[:min_len]
                else:
                    pattern = pattern[:min_len]
                patterns.append(pattern)
                responses.append(response)
            return patterns, responses

        def normalize_easy_dataset():
            with open('train_data/generative_chat_bot/patterns.txt', encoding='utf-8') as file:
                patterns = file.readlines()
            with open('train_data/generative_chat_bot/responses.txt', encoding='utf-8') as file:
                responses = file.readlines()
            patterns = [[pattern] for pattern in patterns[:dialogues]]
            responses = [[response] for response in responses[:dialogues]]
            return patterns, responses

        if dataset_easy == False:
            patterns, responses = normalize_normal_dataset()
        else:
            patterns, responses = normalize_easy_dataset()
        # flatten the 2D lists into 1D
        patterns = list(chain.from_iterable(patterns))
        responses = list(chain.from_iterable(responses))
        tokenizer_fit_text = ['start', 'end'] + word_tokenize(' '.join(patterns + responses))
        self.tokenizer = Tokenizer(char_level=False, lower=True)
        self.tokenizer.fit_on_texts(tokenizer_fit_text)
        #self.tokenizer.index_word = {k - 1: v for k, v in self.tokenizer.index_word.items()}
        self.tokenizer.index_word[0] = 'pad'
        self.vocab_len = len(self.tokenizer.index_word) + 1
        # split into individual words
        patterns = [word_tokenize(phrase) for phrase in patterns]
        responses = [['start'] + word_tokenize(phrase) + ['end'] for phrase in responses]
        # remove all punctuation
        patterns = [[word for word in phrase if word not in characters] for phrase in patterns]
        responses = [[word for word in phrase if word not in characters] for phrase in responses]
        # convert the text to token indices
        patterns = [list(chain.from_iterable(self.tokenizer.texts_to_sequences(phrase))) for phrase in patterns]
        responses = [list(chain.from_iterable(self.tokenizer.texts_to_sequences(phrase))) for phrase in responses]
        # pad all phrases to the same length
        patterns = pad_sequences(patterns, self.max_phrase_len, padding='post', truncating='post')
        responses = pad_sequences(responses, self.max_phrase_len, padding='post', truncating='post')
        # turn the responses from indices into one-hot vectors
        responses_final = to_categorical(responses, num_classes=self.vocab_len)
        return patterns, responses, responses_final
    def create_model(self):
        enc_inp = Input(shape=(self.max_phrase_len,))
        dec_inp = Input(shape=(self.max_phrase_len,))
        embed = Embedding(self.vocab_len, output_dim=50,
                          input_length=self.max_phrase_len, trainable=True)
        enc_embed = embed(enc_inp)
        enc_lstm = LSTM(self.neurons_num, return_sequences=True, return_state=True)
        enc_op, h, c = enc_lstm(enc_embed)
        enc_state = [h, c]
        dec_embed = embed(dec_inp)
        dec_lstm = LSTM(self.neurons_num, return_sequences=True, return_state=True)
        dec_op, _, _ = enc_lstm(enc_embed)
        self.dense = Dense(self.vocab_len, activation='softmax')
        dense_op = self.dense(dec_op)
        self.model = Model([enc_inp, dec_inp], dense_op)
        self.model.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer=Adam(0.003))
        #######################################
        self.enc_model = Model([enc_inp], enc_state)
        decoder_state_input_h = Input(shape=(self.neurons_num,))
        decoder_state_input_c = Input(shape=(self.neurons_num,))
        decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
        decoder_outputs, state_h, state_c = dec_lstm(dec_embed, initial_state=decoder_states_inputs)
        decoder_states = [state_h, state_c]
        self.dec_model = Model([dec_inp] + decoder_states_inputs, [decoder_outputs] + decoder_states)
    def get_response(self, pattern):
        # normalize the input the same way as the training data
        pattern = word_tokenize(pattern)
        pattern = [word for word in pattern if word not in characters]
        pattern = self.tokenizer.texts_to_sequences(pattern)
        pattern = list(chain.from_iterable(pattern))
        pattern = pad_sequences([pattern], self.max_phrase_len, padding='post', truncating='post')
        print(pattern)
        stat = self.enc_model.predict(pattern, verbose=False)
        empty_target_seq = np.array(self.tokenizer.texts_to_sequences(['start']))
        stop_condition = False
        decoded_translation = ''

        def sample_word(predictions, temperature=1.0):
            predictions = np.asarray(predictions).astype('float64')
            predictions = np.log(predictions) / temperature
            exp_predictions = np.exp(predictions)
            predictions = exp_predictions / np.sum(exp_predictions)
            probas = np.random.multinomial(1, predictions, 1)
            return np.argmax(probas)

        while not stop_condition:
            dec_outputs, h, c = self.dec_model.predict([empty_target_seq] + stat, verbose=False)
            decoder_concat_input = self.dense(dec_outputs)
            decoder_concat_input = decoder_concat_input.numpy().flatten().tolist()
            sampled_word_index = np.argmax(decoder_concat_input)
            #sampled_word_index = sample_word(decoder_concat_input, temperature=0.001)
            sampled_word = self.tokenizer.index_word[sampled_word_index] + ' '
            if sampled_word != 'end ':
                decoded_translation += sampled_word
            if sampled_word == 'end ' or len(decoded_translation.split(' ')) > self.max_phrase_len:
                stop_condition = True
            empty_target_seq = np.array([[sampled_word_index]])
            stat = [h, c]
        return decoded_translation
g_c = Generative_chatbot(15)
patterns, responses, responses_final = g_c.data_normalize(dataset_easy=False, dialogues=1500)
g_c.create_model()
g_c.model.fit([patterns, responses], responses_final, epochs=50, batch_size=1)
while True:
    user = input('Вы: ')
    response = g_c.get_response(user)
    print(f'Аврора: {response}')
I used the dataset from https://huggingface.co/datasets/Den4ikAI/russian_dialogues_2. The network was trained on 1500 dialogues (my RAM doesn't allow more).
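For reference, this is roughly how one raw sample of that dataset can be inspected (assuming the same 'sample' field that my code reads; the field holds a string-encoded list in which prompts and replies alternate):

from datasets import load_dataset
import ast

ds = load_dataset("Den4ikAI/russian_dialogues_2", split="train", streaming=True)
raw = next(iter(ds))['sample']  # a string-encoded list of utterances
print(ast.literal_eval(raw))    # prompts and replies alternate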
If you need any additional information, ask in the comments.