Flower AI на двух машинах: received 0 results and 1 failures - для каждого раунда, без получения ошибок на стороне клиента
Всем доброго времени суток. Я новичок в использовании federated learning, да и в алгоритмах sklearn тоже. Я запускаю federated learning (Flower AI) на двух своих машинах в локальной сети: sklearn.RandomForestRegressor, в качестве основы беру этот пример
При запуске получаю на сервере следующее:
2025-01-15 20:08:50.448721: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-01-15 20:08:50.452303: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-01-15 20:08:50.463313: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
WARNING: All log messages before absl::InitializeLog() is called are written to STDERR
E0000 00:00:1736960930.481690 36571 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1736960930.487189 36571 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-15 20:08:50.506235: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
{'build-system': {'requires': ['hatchling'], 'build-backend': 'hatchling.build'}, 'project': {'name': 'soup', 'version': '1.0.0', 'description': '', 'license': 'Apache-2.0', 'dependencies': ['flwr[simulation]>=1.13.1', 'flwr-datasets[vision]>=0.3.0', 'scikit-learn>=1.1.1']}, 'tool': {'hatch': {'build': {'targets': {'wheel': {'packages': ['.']}}}}, 'flwr': {'app': {'publisher': 'helenklim', 'config': {'num-server-rounds': 2, 'penalty': 'l2', 'local-epochs': 1}}}}}
Getting model
Initial params after model creation: {'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 10, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': True}
RandomForestRegressor(max_features='sqrt', n_estimators=10, warm_start=True)
Init parameters [[]]
INFO : Starting Flower server, config: num_rounds=2, no round_timeout
INFO : Flower ECE: gRPC server running (2 rounds), SSL is disabled
INFO : [INIT]
INFO : Using initial global parameters provided by strategy
INFO : Starting evaluation of initial global parameters
INFO : Evaluation returned no results (`None`)
INFO :
INFO : [ROUND 1]
INFO : configure_fit: strategy sampled 1 clients (out of 1)
INFO : aggregate_fit: received 0 results and 1 failures
INFO : configure_evaluate: strategy sampled 1 clients (out of 1)
INFO : aggregate_evaluate: received 0 results and 1 failures
INFO :
INFO : [ROUND 2]
INFO : configure_fit: strategy sampled 1 clients (out of 1)
INFO : aggregate_fit: received 0 results and 1 failures
INFO : configure_evaluate: strategy sampled 1 clients (out of 1)
INFO : aggregate_evaluate: received 0 results and 1 failures
INFO :
INFO : [SUMMARY]
INFO : Run finished 2 round(s) in 8.06s
INFO :
На клиенте:
model.get_params() after init: {'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 10, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}
DEBUG:flwr:Opened insecure gRPC connection (no certificates were passed)
DEBUG:flwr:ChannelConnectivity.IDLE
DEBUG:flwr:ChannelConnectivity.CONNECTING
DEBUG:flwr:ChannelConnectivity.READY
INFO :
INFO:flwr:
INFO : Received: reconnect message 98ab9e0e-8b5b-4d23-82a2-d53bf9bf7ea2
INFO:flwr:Received: reconnect message 98ab9e0e-8b5b-4d23-82a2-d53bf9bf7ea2
DEBUG:flwr:gRPC channel closed
INFO : Disconnect and shut down
INFO:flwr:Disconnect and shut down
Мой код на сервере:
from flwr.common import Context, ndarrays_to_parameters
from flwr.server import ServerApp, ServerAppComponents, ServerConfig
from flwr.server.strategy import FedAvg
from task import get_model
import flwr as fl
import toml
import numpy as np
# The two `task` helpers called below are not covered by the
# `from task import get_model` line above, so bring them into scope here.
from task import get_model_params, set_initial_params

# Read the run settings from the [tool.flwr.app.config] table of
# pyproject.toml. The context manager closes the file handle once parsed.
with open('pyproject.toml', encoding='utf-8') as config_file:
    pyproject = toml.load(config_file)
print(pyproject)
tool_flwr_app_conf = pyproject['tool']['flwr']['app']['config']
num_rounds = tool_flwr_app_conf["num-server-rounds"]

# Create the model through task.get_model.
penalty = tool_flwr_app_conf["penalty"]
local_epochs = tool_flwr_app_conf["local-epochs"]
model = get_model(penalty, local_epochs)
print(model)

# Give the unfitted model an (empty) parameter set and read it back.
set_initial_params(model)
raw_initial_parameters = get_model_params(model)
print("Init parameters", raw_initial_parameters)
# FedAvg expects `initial_parameters` to be a Flower `Parameters` object,
# not a plain Python list, so convert before handing it to the strategy.
# NOTE(review): this only covers the empty initial list; fitted tree objects
# are presumably not convertible this way - confirm against task.py.
initial_parameters = ndarrays_to_parameters(raw_initial_parameters)

# Define strategy: a single connected client is enough for every phase.
strategy = FedAvg(
    min_fit_clients=1,
    min_evaluate_clients=1,
    min_available_clients=1,
    initial_parameters=initial_parameters,
)
config = ServerConfig(num_rounds=num_rounds)
def server_fn(context: Context):
    """Bundle the module-level strategy and server config for the ServerApp.

    ``context`` is accepted but not read.
    """
    components = ServerAppComponents(strategy=strategy, config=config)
    return components


app = ServerApp(server_fn=server_fn)
Мой код на клиенте:
import warnings
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import log_loss
from flwr.client import ClientApp, NumPyClient
from flwr.common import Context
from task import (
get_model_params,
load_data,
set_model_params,
)
import flwr as fl
import numpy as np
import logging
# logging.basicConfig(level=logging.DEBUG)
class FlowerClient(fl.client.NumPyClient):
    """Flower NumPy client that fits and evaluates a regressor on local data.

    Model parameters are exchanged through ``set_model_params`` and
    ``get_model_params`` from ``task``.
    NOTE(review): those helpers pass fitted estimator objects rather than
    NumPy arrays; whether Flower can serialize them is not established here.
    """

    def __init__(self, model, X_train, X_test, y_train, y_test):
        """Store the model and the local train/test split.

        Args:
            model: Estimator to train; a default random forest is created
                when ``None`` is passed.
            X_train, X_test: Feature sets used by ``fit`` / ``evaluate``.
            y_train, y_test: Matching targets.

        Raises:
            Exception: Any error during set-up is logged and re-raised so a
                half-initialized client is never handed to Flower.
        """
        # Must be named ``__init__``: a method called ``init`` is never invoked
        # by ``FlowerClient(model, ...)``, leaving the instance without state.
        try:
            print("\nInit client model")
            if model is None:
                model = RandomForestRegressor(n_estimators=10, max_features='sqrt')
            self.model = model
            print("\nmodel.get_params() after init:", self.model.get_params())
            self.X_train = X_train
            self.X_test = X_test
            self.y_train = y_train
            self.y_test = y_test
        except Exception as e:
            logging.error("Error initializing FlowerClient: %s", e)
            raise

    def fit(self, parameters, config=None):
        """Load ``parameters`` into the model, fit it on the training split.

        Returns:
            ``(model parameters, number of training samples, empty metrics)``.
        """
        logging.info("Fitting...")
        set_model_params(self.model, parameters)
        print("model params before fitting:", self.model.get_params())
        print("\nfitting...")
        # Silence warnings emitted while fitting.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            self.model.fit(self.X_train, self.y_train)
        print("\nModel attrs after fitting:", self.model.__dict__)
        return get_model_params(self.model), len(self.X_train), {}

    def evaluate(self, parameters, config=None):
        """Load ``parameters`` and report mean squared error on the test split.

        Returns:
            ``(loss, number of test samples, {"loss": loss})``.
        """
        print("\nModel params before evaluation:", self.model.__dict__)
        set_model_params(self.model, parameters)
        print(parameters)
        print("Client: Starting evaluation.")
        # Flatten both sides: a single-column 2-D target would otherwise not
        # line up element-wise with the 1-D prediction vector.
        predictions = np.asarray(self.model.predict(self.X_test)).ravel()
        targets = np.asarray(self.y_test).ravel()
        # Plain Python float so the value is a simple scalar in the result.
        loss = float(np.mean((predictions - targets) ** 2))
        print(f"Client: Evaluation completed with loss: {loss}.")
        return loss, len(self.X_test), {"loss": loss}
def client_fn(context: Context):
    """Build the Flower client for this node.

    Loads the local data split, creates a fresh random-forest regressor and
    wraps both in a ``FlowerClient``. ``context`` is accepted but not read.

    Returns:
        The client converted with ``to_client()``, because ``ClientApp``
        expects a ``Client`` rather than a ``NumPyClient``.
    """
    X_train, X_test, y_train, y_test = load_data()
    model = RandomForestRegressor(n_estimators=10, max_features='sqrt')
    return FlowerClient(model, X_train, X_test, y_train, y_test).to_client()


app = ClientApp(client_fn=client_fn)
Функции из task.py:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
# Placeholder for a cached FederatedDataset; none of the functions shown
# below read or assign it.
fds = None # Cache FederatedDataset
# CSV file that load_data() reads.
data_path = 'data/dataset-2.csv'
def load_data():
    """Read the CSV at ``data_path`` and split it into train and test parts.

    The last column is the regression target; every preceding column is a
    feature. 20% of the rows are held out for testing. The split is random
    and unseeded, so it differs between calls.

    Returns:
        ``(x_train, x_test, y_train, y_test)``; the targets stay
        single-column 2-D frames.
    """
    # Imported here because no ``read_csv`` import is visible in this module.
    # NOTE(review): assumes the pandas reader was intended - confirm.
    from pandas import read_csv

    dataset = read_csv(data_path, delimiter=',')
    # Positional column split; avoids transposing the whole frame twice.
    features = dataset.iloc[:, :-1]
    target = dataset.iloc[:, -1:]
    x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=0.2)
    print(x_test)
    return x_train, x_test, y_train, y_test
def get_model(penalty: str, local_epochs: int):
    """Create the unfitted random-forest regressor used as the global model.

    ``penalty`` and ``local_epochs`` are accepted but not used when building
    the estimator.

    Returns:
        A ``RandomForestRegressor`` with 10 trees, ``max_features='sqrt'``
        and ``warm_start=True``.
    """
    print("Getting model")
    forest_options = {
        "n_estimators": 10,
        "max_features": 'sqrt',
        "warm_start": True,
    }
    forest = RandomForestRegressor(**forest_options)
    print("Initial params after model creation:", forest.get_params())
    return forest
def set_initial_params(model):
    """Attach an empty ``estimators_`` list to ``model``, in place."""
    no_estimators = list()
    model.estimators_ = no_estimators
def get_model_params(model):
    """Return ``model.estimators_`` wrapped in a one-element list.

    NOTE(review): the element is whatever ``estimators_`` holds, not a NumPy
    array - confirm the consumer can handle that.
    """
    return [model.estimators_]
def set_model_params(model, params):
    """Install ``params[0]`` as the model's ``estimators_`` and return the model.

    Args:
        model: Object whose ``estimators_`` attribute is overwritten in place.
        params: Sequence whose first element is the new ``estimators_`` value.

    Returns:
        The same ``model`` object.

    Raises:
        IndexError: If ``params`` is empty.
    """
    # Imported locally: no ``logging`` import is visible in this module, so
    # the debug call would otherwise raise NameError.
    import logging

    logging.debug("Setting model param")
    model.estimators_ = params[0]
    return model
То есть клиент не выводит никаких принтов, кроме тех, что находятся в init-методе класса FlowerClient.
Что я пробовала:
Менять порт
Перезагружать оба компьютера
использовать другой flwr.client
По-другому сериализовать параметры модели при их передаче
Оборачивать всё в блоки try/except
снести и заново создать venv на клиенте
Спрашивать совет у ChatGPT и вставлять его вариант кода вместо своего (итог тот же)
Подскажите, пожалуйста, знающие люди!)