George Drakoulas
12/03/2024, 12:20 PM

import logging

import matplotlib.pyplot as plt
import mlflow
import numpy as np
import pandas as pd
from lightgbm import LGBMRegressor
from sklearn.base import BaseEstimator
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import RandomizedSearchCV, cross_val_score
from xgboost import XGBRegressor

# Module-level logger (assumed; the original snippet uses `logger` without defining it)
logger = logging.getLogger(__name__)


def automl_train(
    X_train: pd.DataFrame,
    y_train: pd.Series,
    X_test: pd.DataFrame,
    y_test: pd.Series,
) -> BaseEstimator:
    """
    Identify the best regression model and train it using AutoML-style randomized hyperparameter search.

    Args:
        X_train: Training data of independent features.
        y_train: Training data of the dependent feature.
        X_test: Test data of independent features.
        y_test: Test data of the dependent feature.

    Returns:
        The trained model with the best hyperparameters found.
    """
    # Parameter distributions for RandomizedSearchCV (RandomForestRegressor)
    param_dist_rf = {
        'n_estimators': [int(x) for x in np.linspace(start=100, stop=1000, num=10)],
        # 1.0 (all features) replaces the 'auto' option removed in recent scikit-learn releases
        'max_features': [1.0, 'sqrt', 'log2'],
        'max_depth': [int(x) for x in np.linspace(10, 110, num=11)],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'bootstrap': [True, False]
    }
    # Parameter distributions for RandomizedSearchCV (XGBRegressor)
    param_dist_xgb = {
        'n_estimators': [int(x) for x in np.linspace(start=100, stop=1000, num=10)],
        'learning_rate': [0.01, 0.1, 0.2, 0.3],
        'max_depth': [3, 5, 7, 10],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0]
    }
    # Parameter distributions for RandomizedSearchCV (LGBMRegressor)
    param_dist_lgb = {
        'n_estimators': [int(x) for x in np.linspace(start=100, stop=1000, num=10)],
        'learning_rate': [0.01, 0.1, 0.2, 0.3],
        'num_leaves': [31, 50, 100],
        'max_depth': [3, 5, 7, 10],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0]
    }
    # Candidate models (RandomForest and XGBoost are currently disabled)
    models = {
        # 'RandomForest': RandomForestRegressor(),
        # 'XGBoost': XGBRegressor(),
        'LightGBM': LGBMRegressor()
    }
    # Parameter distributions for each model
    param_grids = {
        'RandomForest': param_dist_rf,
        'XGBoost': param_dist_xgb,
        'LightGBM': param_dist_lgb
    }
    # End any active run before starting a new one
    if mlflow.active_run() is not None:
        mlflow.end_run()
    # Enable automatic logging to MLflow
    mlflow.sklearn.autolog()
    # Perform hyperparameter tuning and evaluation for each model
    for model_name, model in models.items():
        with mlflow.start_run(run_name=model_name):
            logger.info(f"Training {model_name}...")
            # RandomizedSearchCV
            random_search = RandomizedSearchCV(
                estimator=model,
                param_distributions=param_grids[model_name],
                n_iter=100,
                cv=3,
                verbose=2,
                random_state=42,
                n_jobs=-1,
            )
            # Fit the model
            random_search.fit(X_train, y_train)
            # Best model
            best_model = random_search.best_estimator_
            # Predictions
            y_pred_train = best_model.predict(X_train)
            y_pred_test = best_model.predict(X_test)
            # Scatter plot for both training and testing data
            plt.figure(figsize=(10, 6))
            # Training data
            plt.scatter(y_train, y_pred_train, color='blue', label='Training Data')
            # Testing data
            plt.scatter(y_test, y_pred_test, color='green', label='Testing Data')
            # Reference diagonal line for perfect predictions
            min_value = min(y_train.min(), y_test.min())
            max_value = max(y_train.max(), y_test.max())
            plt.plot([min_value, max_value], [min_value, max_value], 'k--', lw=2)
            # Labels and title with increased font sizes
            plt.xlabel('True Values', fontsize=14)
            plt.ylabel('Predicted Values', fontsize=14)
            plt.title(f'{model_name} - True vs Predicted Values for Training and Testing Data', fontsize=16)
            plt.legend(fontsize=12)
            plt.grid(True)
            # Save and log the plot
            plot_path = f"{model_name}_scatter_plot_true_vs_pred.png"
            plt.savefig(plot_path)
            mlflow.log_artifact(plot_path)
            # Show the plot
            plt.show()
            # Calculate metrics
            train_r2 = r2_score(y_train, y_pred_train)
            test_r2 = r2_score(y_test, y_pred_test)
            train_mse = mean_squared_error(y_train, y_pred_train)
            test_mse = mean_squared_error(y_test, y_pred_test)
            # Log metrics
            mlflow.log_metric('train_r2', train_r2)
            mlflow.log_metric('test_r2', test_r2)
            mlflow.log_metric('train_mse', train_mse)
            mlflow.log_metric('test_mse', test_mse)
            logger.info(f"{model_name} model R^2 on test data: {test_r2:.3f}")
            # Cross-validation scores
            cv_results = cross_val_score(best_model, X_train, y_train, cv=5, scoring='r2')
            n = len(X_train)      # Number of samples
            p = X_train.shape[1]  # Number of features
            # Adjusted R^2 = 1 - (1 - R^2) * (n - 1) / (n - p - 1)
            adjusted_r2_scores = []
            for r2 in cv_results:
                adjusted_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)
                adjusted_r2_scores.append(adjusted_r2)
            # Convert to a numpy array for easier manipulation if needed
            adjusted_r2_scores = np.array(adjusted_r2_scores)
            # Create the bar plot
            plt.figure(figsize=(10, 6))
            x_values = range(1, len(adjusted_r2_scores) + 1)
            plt.bar(x_values, adjusted_r2_scores, color='b', label=f'{model_name} Cross-validation adjusted R^2')
            # Set x-ticks to be integers
            plt.xticks(x_values)
            # Add labels and title with increased font sizes
            plt.xlabel('Fold', fontsize=14)
            plt.ylabel('Adjusted R^2', fontsize=14)
            plt.title(f'{model_name} Cross-Validation Adjusted R^2 Scores', fontsize=16)
            plt.legend(fontsize=12)
            plt.grid(True)
            # Save and log the plot
            plot_path = f"{model_name}_cross_val_results.png"
            plt.savefig(plot_path)
            mlflow.log_artifact(plot_path)
            # Show the plot
            plt.show()
    return best_model
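
A minimal sketch of how this could be called, just to show the expected inputs and output. The synthetic dataset and column names below are illustrative assumptions (not part of the original function), and MLflow is assumed to use its default local tracking:

import pandas as pd
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split

# Build a small synthetic regression dataset purely to exercise automl_train
X, y = make_regression(n_samples=500, n_features=10, noise=0.1, random_state=42)
X = pd.DataFrame(X, columns=[f"feature_{i}" for i in range(X.shape[1])])
y = pd.Series(y, name="target")

# Hold out 20% of the rows as a test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Run the search; returns the best estimator found (LightGBM, as currently configured)
best_model = automl_train(X_train, y_train, X_test, y_test)
print(best_model.get_params())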