Chatbot on TensorFlow outputs gibberish

I am building a generative seq2seq chatbot. I have already written the code and checked it several times, but it still outputs gibberish. To any user input it replies: "pad pad pad pad". pad is the filler token used to pad phrases in the training set. This token is in the chatbot's vocabulary and in the training data, so in principle the model may use it in a conversation, but it uses nothing except pad and the end tag that marks the end of a sentence. How do I fix this? During training the network reached 84% accuracy. Here is the code:

from keras.layers import LSTM, Input, Embedding, Dense
from keras.models import Model
from keras.optimizers import Adam
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from nltk import word_tokenize
from keras.preprocessing.text import Tokenizer
from datasets import load_dataset
import ast
from itertools import chain
import numpy as np

characters = ['.', ',', '?', '!', '-', '=', '+', '*', '^',
              '#', '@', "'", '"', ':', ';', '[', ']', '{', '}',
              '%', '$', '&', '~', '`', '/', '№', '«', '»', '\n']

class Generative_chatbot:
    def __init__(self, max_phrase_len):
        self.max_phrase_len = max_phrase_len + 2
        self.neurons_num = 400

    def data_normalize(self, dataset_easy, dialogues):
        def normalize_normal_dataset():
            dataset = load_dataset("Den4ikAI/russian_dialogues_2")  # load the dataset

            dataset = [dataset['train'][n]['sample'] for n in range(dialogues)]  # extract the dialogues
            dataset = [ast.literal_eval(dialogue) for dialogue in dataset]  # parse the stringified lists, "[1, 5, 0]" -> [1, 5, 0]

            patterns_ = [dialogue[::2] for dialogue in dataset]  # extract the requests (even turns)
            responses_ = [dialogue[1::2] for dialogue in dataset]  # extract the responses (odd turns)

            # trim requests and responses to the same length
            patterns = []
            responses = []
            for pattern, response in zip(patterns_, responses_):
                min_len = min(len(pattern), len(response))

                if min_len == len(pattern):
                    response = response[:min_len]
                else:
                    pattern = pattern[:min_len]

                patterns.append(pattern)
                responses.append(response)

            return patterns, responses

        def normalize_easy_dataset():
            with open('train_data/generative_chat_bot/patterns.txt', encoding='utf-8') as file:
                patterns = file.readlines()

            with open('train_data/generative_chat_bot/responses.txt', encoding='utf-8') as file:
                responses = file.readlines()

            patterns = [[pattern] for pattern in patterns[: dialogues]]
            responses = [[response] for response in responses[: dialogues]]

            return patterns, responses

        if not dataset_easy:
            patterns, responses = normalize_normal_dataset()
        else:
            patterns, responses = normalize_easy_dataset()

        # flatten the nested lists
        patterns = list(chain.from_iterable(patterns))
        responses = list(chain.from_iterable(responses))

        tokenizer_fit_text = ['start', 'end'] + word_tokenize(' '.join(patterns + responses))
        self.tokenizer = Tokenizer(char_level=False, lower=True)
        self.tokenizer.fit_on_texts(tokenizer_fit_text)
        #self.tokenizer.index_word = {k - 1: v for k, v in self.tokenizer.index_word.items()}
        self.tokenizer.index_word[0] = 'pad'
        self.vocab_len = len(self.tokenizer.index_word) + 1

        # split phrases into individual words
        patterns = [word_tokenize(phrase) for phrase in patterns]
        responses = [['start'] + word_tokenize(phrase) + ['end'] for phrase in responses]

        # remove all punctuation marks
        patterns = [[word for word in phrase if word not in characters] for phrase in patterns]
        responses = [[word for word in phrase if word not in characters] for phrase in responses]

        # convert words to integer token ids
        patterns = [list(chain.from_iterable(self.tokenizer.texts_to_sequences(phrase))) for phrase in patterns]
        responses = [list(chain.from_iterable(self.tokenizer.texts_to_sequences(phrase))) for phrase in responses]

        # pad all phrases to the same length
        patterns = pad_sequences(patterns, self.max_phrase_len, padding='post', truncating='post')
        responses = pad_sequences(responses, self.max_phrase_len, padding='post', truncating='post')

        # turn the integer responses into one-hot vectors
        responses_final = to_categorical(responses, num_classes=self.vocab_len)
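        # responses_final has shape (num_samples, max_phrase_len, vocab_len)
        # and is typically the largest allocation in this pipeline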

        return patterns, responses, responses_final

    def create_model(self):
        enc_inp = Input(shape=(self.max_phrase_len,))
        dec_inp = Input(shape=(self.max_phrase_len,))

        embed = Embedding(self.vocab_len, output_dim=50,
                          input_length=self.max_phrase_len, trainable=True)

        enc_embed = embed(enc_inp)
        enc_lstm = LSTM(self.neurons_num, return_sequences=True, return_state=True)
        enc_op, h, c = enc_lstm(enc_embed)
        enc_state = [h, c]

        dec_embed = embed(dec_inp)
        dec_lstm = LSTM(self.neurons_num, return_sequences=True, return_state=True)
        dec_op, _, _ = enc_lstm(enc_embed)

        self.dense = Dense(self.vocab_len, activation='softmax')

        dense_op = self.dense(dec_op)

        self.model = Model([enc_inp, dec_inp], dense_op)

        self.model.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer=Adam(0.003))
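        # note: 'accuracy' here is averaged over every output position,
        # including the positions that contain only the pad token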

        #######################################
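        # separate encoder and decoder models for step-by-step generation at inference time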
        self.enc_model = Model([enc_inp], enc_state)

        decoder_state_input_h = Input(shape=(self.neurons_num,))
        decoder_state_input_c = Input(shape=(self.neurons_num,))

        decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

        decoder_outputs, state_h, state_c = dec_lstm(dec_embed, initial_state=decoder_states_inputs)
        decoder_states = [state_h, state_c]

        self.dec_model = Model([dec_inp] + decoder_states_inputs, [decoder_outputs] + decoder_states)


    def get_response(self, pattern):
        # normalize the input data
        pattern = word_tokenize(pattern)
        pattern = [word for word in pattern if word not in characters]
        pattern = self.tokenizer.texts_to_sequences(pattern)
        pattern = list(chain.from_iterable(pattern))
        pattern = pad_sequences([pattern], self.max_phrase_len, padding='post', truncating='post')
        print(pattern)

        stat = self.enc_model.predict(pattern, verbose=False)

        empty_target_seq = np.array(self.tokenizer.texts_to_sequences(['start']))
        stop_condition = False
        decoded_translation = ''

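        # optional temperature-sampling helper (currently unused; plain argmax is applied below)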
        def sample_word(predictions, temperature=1.0):
            predictions = np.asarray(predictions).astype('float64')
            predictions = np.log(predictions) / temperature
            exp_predictions = np.exp(predictions)
            predictions = exp_predictions / np.sum(exp_predictions)
            probas = np.random.multinomial(1, predictions, 1)
            return np.argmax(probas)

        while not stop_condition:
            dec_outputs, h, c = self.dec_model.predict([empty_target_seq] + stat, verbose=False)
            decoder_concat_input = self.dense(dec_outputs)
            decoder_concat_input = decoder_concat_input.numpy().flatten().tolist()

            sampled_word_index = np.argmax(decoder_concat_input)
            #sampled_word_index = sample_word(decoder_concat_input, temperature=0.001)
            sampled_word = self.tokenizer.index_word[sampled_word_index] + ' '

            if sampled_word != 'end ':
                decoded_translation += sampled_word

            if sampled_word == 'end ' or len(decoded_translation.split(' ')) > self.max_phrase_len:
                stop_condition = True

            empty_target_seq = np.array([[sampled_word_index]])
            stat = [h, c]

        return decoded_translation

g_c = Generative_chatbot(15)
patterns, responses, responses_final = g_c.data_normalize(dataset_easy=False, dialogues=1500)
g_c.create_model()
g_c.model.fit([patterns, responses], responses_final, epochs=50, batch_size=1)

while True:
    user = input('You: ')
    response = g_c.get_response(user)
    print(f'Aurora: {response}')

The dataset from https://huggingface.co/datasets/Den4ikAI/russian_dialogues_2 was used. The network was trained on 1500 dialogues (my RAM cannot handle more).
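For reference, a quick way to see how much of the padded training targets is just the pad token is a check like the one below (a minimal sketch, assuming responses is the padded integer matrix returned by data_normalize above, where token id 0 is pad):

import numpy as np

# minimal diagnostic sketch: `responses` is assumed to be the padded integer
# matrix returned by data_normalize above; token id 0 is the 'pad' filler
pad_share = np.mean(responses == 0)
print(f'share of pad tokens among all target positions: {pad_share:.1%}')

If that share is large, the per-token training accuracy can be high even when the model mostly predicts pad.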

If you need any additional information, ask in the comments.

