Pipeline, обучение модели
Есть код, но выдает ошибку, не могу понять где ошибка?
df = pd.read_csv('homework.csv')
X = df.drop(['price_category'], axis=1)
Y = df['price_category']
def filter_data(X):
columns_to_drop = [
'id',
'url',
'region',
'region_url',
'price',
'manufacturer',
'image_url',
'description',
'posting_date',
'lat',
'long']
return X.drop(columns_to_drop, axis=1)
def calculate_outliers(X):
q25 = df['year'].quantile(0.25)
q75 = df['year'].quantile(0.75)
iqr = q75 - q25
X.loc[X['year'] < (q25 - 1.5 * iqr), 'year'] = round(q25 - 1.5 * iqr)
X.loc[X['year'] > (q75 + 1.5 * iqr), 'year'] = round(q25 + 1.5 * iqr)
return X['year']
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns
imputer_transformer = ColumnTransformer(transformers=[
('imputer_num', SimpleImputer(strategy='median'), numerical_features),
('imputer_cat',SimpleImputer(strategy='most_frequent'), categorical_features)
])
encode_scale_transformer = ColumnTransformer(transformers=[
('scale', StandardScaler(), numerical_features),
('encode',OneHotEncoder(handle_unknown='ignore'),categorical_features)
])
def new_predictor(X):
X.loc[:, 'age_category'] = X['year'].apply(lambda x: 'new' if x > 2013 else ('old' if x < 2006 else 'average'))
X.loc[:, 'short_model'] = X.apply(lambda x: x.model.split(' ')[0], axis=1)
return X
def filter_data2(X):
columns_to_drop = [
'year',
'model',
'fuel',
'odometer',
'title_status',
'transmission',
'short_model',
'state',
'age_category']
return X.drop(columns_to_drop, axis=1)
models = (
LogisticRegression(solver='liblinear'),
RandomForestClassifier(),
SVC()
)
preprocessor = Pipeline(steps=[
('filter', FunctionTransformer(filter_data)),
('outliers', FunctionTransformer(calculate_outliers)),
('imputer_transformer', imputer_transformer),
('predictors', FunctionTransformer(new_predictor)),
('encode_scale_transformer', encode_scale_transformer),
('filter_2', FunctionTransformer(filter_data2))
])
for model in models:
pipe = Pipeline(steps=[
('preprocessor', preprocessor),
('classifier', model)
])
score = cross_val_score(pipe, X, Y, cv=4, scoring='accuracy')
print(f'model:{type(model).__name__}, acc_mean: {score.mean():.4f}, acc_std:{score.std():.4f}'