При попытке обучить нейронную сеть получаю ошибку: "lambda x, y: x[name]) KeyError: 'fingerprint'"
Всем привет! Я пытаюсь построить нейронную сеть, которая будет принимать на вход данные анализа модекулы(ppm) и отдавать отпечаток молекулы(fingerprint len=512). csv файл с данными выглядит так:
ppm_and_assign fingerprint
0 [98.76, 687.0, 74.52, 666.0, 72.51, 704.0, 71.... 1111010000101011011011111101001001010101111100...
1 [15.52, 1000.0] 0000000000000000000000000000000000000000000000...
...
10335 [153.05, 662.0, 136.76, 408.0, 128.98, 1000.0,... 0010000001000000010000001000100000001010110000...
10336 [157.65, 96.0, 129.94, 995.0, 2.0, 113.81, 100... 0110000001000000010000000000000000001000110010...
смещение - список разной длинны, отпечаток - бинарное число длинной 512 символов. при попытке обучить нейронную сеть этими двумя значениями, получаю ошибку:
lambda x, y: x[name])
KeyError: 'fingerprint'
как я понимаю ошибка в энкодинге данных отпечатка. Подскажите пожайлуста как исправить ошибку. Мой код:
import os
import keras
from keras import layers
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
import pandas as pd
import tensorflow as tf
dataframe = pd.read_csv('/home/oleksii/PycharmProjects/vae/ppm_fingerprint.csv',
names=['ppm_and_assign', 'fingerprint'], skipinitialspace=True, skiprows=1,
engine="python")
val_dataframe = dataframe.sample(frac=0.2, random_state=1337)
train_dataframe = dataframe.drop(val_dataframe.index)
print(
"Using %d samples for training and %d for validation"
% (len(train_dataframe), len(val_dataframe))
)
def dataframe_to_dataset(dataframe):
dataframe = dataframe.copy()
labels = dataframe.pop("fingerprint")
ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
ds = ds.shuffle(buffer_size=len(dataframe))
return ds
train_ds = dataframe_to_dataset(train_dataframe)
val_ds = dataframe_to_dataset(val_dataframe)
print(train_ds)
print(val_ds)
for x, y in train_ds.take(1):
print("Input:", x)
print("Target:", y)
train_ds = train_ds.batch(32)
val_ds = val_ds.batch(32)
from keras.layers import IntegerLookup
from keras.layers import Normalization
from keras.layers import StringLookup
def encode_numerical_feature(feature, name, dataset):
'''
Didn't work for fingerprint
'''
# Create a Normalization layer for our feature
normalizer = Normalization()
# Prepare a Dataset that only yields our feature
feature_ds = dataset.map(lambda x, y: x[name])
feature_ds = feature_ds.map(lambda x: tf.expand_dims(x, -1))
# Learn the statistics of the data
normalizer.adapt(feature_ds)
# Normalize the input feature
encoded_feature = normalizer(feature)
return encoded_feature
def encode_categorical_feature(feature, name, dataset, is_string):
lookup_class = StringLookup if is_string else IntegerLookup
# Create a lookup layer which will turn strings into integer indices
lookup = lookup_class(output_mode="binary")
# Prepare a Dataset that only yields our feature
feature_ds = dataset.map(lambda x, y: x[name])
feature_ds = feature_ds.map(lambda x: tf.expand_dims(x, -1))
# Learn the set of possible string values and assign them a fixed integer index
lookup.adapt(feature_ds)
# Turn the string input into integer indices
encoded_feature = lookup(feature)
return encoded_feature
ppm_and_assign_1 = keras.Input(shape=(1,), name="ppm_and_assign", dtype="string")
fingerprint = keras.Input(shape=(1,), name="fingerprint", dtype="string")
all_inputs = [
ppm_and_assign_1,
fingerprint,
]
ppm_and_assign_encoded = encode_categorical_feature(ppm_and_assign_1,
"ppm_and_assign", train_ds, True)
fingerprint_encoded = encode_numerical_feature(fingerprint, "fingerprint", train_ds)
all_features = layers.concatenate(
[
ppm_and_assign_encoded,
fingerprint_encoded
]
)
x = layers.Dense(8, activation="relu")(all_features)
x = layers.Dropout(0.5)(x)
output = layers.Dense(1, activation="relu")(x)
model = keras.Model(all_inputs, output)
# model.compile("adam", "mean_squared_error", metrics=["accuracy"])
model.compile(
# loss='categorical_crossentropy',
loss='mean_squared_error',
# loss=tf.keras.losses.mae,
optimizer=tf.keras.optimizers.Adam(),
# metrics=tf.keras.metrics.mae
metrics=['accuracy']
)
model.fit(train_ds, epochs=50, validation_data=val_ds)