Hi all, I can retrain my pipeline in azure ml with...
# questions
g
Hi all, I can retrain my pipeline in Azure ML with Kedro. However, I want to automatically register the model in Azure ML after training. I have the following node for the model training. How can I accomplish that?
Copy code
def _plot_true_vs_predicted(model_name: str, y_train, y_pred_train, y_test, y_pred_test) -> str:
    """Save a true-vs-predicted scatter plot for both splits and return its file path."""
    fig = plt.figure(figsize=(10, 6))

    plt.scatter(y_train, y_pred_train, color='blue', label='Training Data')
    plt.scatter(y_test, y_pred_test, color='green', label='Testing Data')

    # Reference diagonal for perfect predictions. The bounds are taken over the
    # pooled true values; `y_train + y_test` on pandas Series would add them
    # element-wise (aligned on index) instead of pooling them.
    min_value = min(np.min(y_train), np.min(y_test))
    max_value = max(np.max(y_train), np.max(y_test))
    plt.plot([min_value, max_value], [min_value, max_value], 'k--', lw=2)

    plt.xlabel('True Values', fontsize=14)
    plt.ylabel('Predicted Values', fontsize=14)
    plt.title(f'{model_name} - True vs Predicted Values for Training and Testing Data', fontsize=16)
    plt.legend(fontsize=12)
    plt.grid(True)

    plot_path = f"{model_name}_scatter_plot_true_vs_pred.png"
    plt.savefig(plot_path)
    plt.show()
    # Release the figure so figures do not accumulate across models.
    plt.close(fig)
    return plot_path


def _plot_adjusted_cv_scores(model_name: str, adjusted_r2_scores) -> str:
    """Save a per-fold bar chart of adjusted R^2 scores and return its file path."""
    fig = plt.figure(figsize=(10, 6))
    x_values = range(1, len(adjusted_r2_scores) + 1)
    plt.bar(x_values, adjusted_r2_scores, color='b', label=f'{model_name} Cross-validation R^2')

    # One integer tick per fold.
    plt.xticks(x_values)

    plt.xlabel('Fold', fontsize=14)
    plt.ylabel('R^2', fontsize=14)
    plt.title(f'{model_name} Cross-Validation adjusted R^2 Scores', fontsize=16)
    plt.legend(fontsize=12)
    plt.grid(True)

    plot_path = f"{model_name}_cross_val_results.png"
    plt.savefig(plot_path)
    plt.show()
    # Release the figure so figures do not accumulate across models.
    plt.close(fig)
    return plot_path


def automl_train(
    X_train: pd.DataFrame, y_train: pd.Series,  X_test: pd.DataFrame, y_test: pd.Series) -> RandomForestRegressor:
    """
    Tune each candidate regressor with a randomized search and return the best one.

    For every enabled model, one MLflow run is started in which the model is
    tuned with ``RandomizedSearchCV``, evaluated on the training and test
    data, and its metrics and diagnostic plots are logged.

      Args:
        X_train: Training data of independent features.
        y_train: Training data of the dependent feature.
        X_test: Test data of independent features.
        y_test: Test data of the dependent feature.

      Returns:
        The tuned estimator with the highest mean cross-validated search
        score among the enabled models. Its concrete class is that of the
        winning entry in ``models`` (not necessarily ``RandomForestRegressor``).

      Raises:
        ValueError: If no model is enabled in ``models``.
    """

    # Search space for RandomForestRegressor.
    # 1.0 replaces the former 'auto' option (all features for regressors),
    # which newer scikit-learn releases no longer accept.
    param_dist_rf = {
        'n_estimators': [int(x) for x in np.linspace(start=100, stop=1000, num=10)],
        'max_features': [1.0, 'sqrt', 'log2'],
        'max_depth': [int(x) for x in np.linspace(10, 110, num=11)],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'bootstrap': [True, False],
    }

    # Search space for XGBRegressor.
    param_dist_xgb = {
        'n_estimators': [int(x) for x in np.linspace(start=100, stop=1000, num=10)],
        'learning_rate': [0.01, 0.1, 0.2, 0.3],
        'max_depth': [3, 5, 7, 10],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
    }

    # Search space for LGBMRegressor.
    param_dist_lgb = {
        'n_estimators': [int(x) for x in np.linspace(start=100, stop=1000, num=10)],
        'learning_rate': [0.01, 0.1, 0.2, 0.3],
        'num_leaves': [31, 50, 100],
        'max_depth': [3, 5, 7, 10],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
    }

    # Candidate models; uncomment an entry to include it in the comparison.
    models = {
     #   'RandomForest': RandomForestRegressor(),
     #   'XGBoost': XGBRegressor(),
        'LightGBM': LGBMRegressor()
    }

    # Search spaces keyed by the same names as `models`.
    param_grids = {
        'RandomForest': param_dist_rf,
        'XGBoost': param_dist_xgb,
        'LightGBM': param_dist_lgb,
    }

    # End any active run before starting a new one
    if mlflow.active_run() is not None:
        mlflow.end_run()

    # Enable automatic logging to MLflow
    mlflow.sklearn.autolog()

    # Best estimator seen so far across all candidates, ranked by the
    # search's mean cross-validated score.
    overall_best_model = None
    overall_best_score = float("-inf")

    n = len(X_train)      # Number of samples
    p = X_train.shape[1]  # Number of features

    for model_name, model in models.items():
        with mlflow.start_run(run_name=model_name):
            logger.info("Training %s...", model_name)

            random_search = RandomizedSearchCV(estimator=model, param_distributions=param_grids[model_name],
                                               n_iter=100, cv=3, verbose=2, random_state=42, n_jobs=-1)
            random_search.fit(X_train, y_train)
            best_model = random_search.best_estimator_

            y_pred_train = best_model.predict(X_train)
            y_pred_test = best_model.predict(X_test)

            # Save and log the true-vs-predicted scatter plot.
            scatter_path = _plot_true_vs_predicted(
                model_name, y_train, y_pred_train, y_test, y_pred_test)
            mlflow.log_artifact(scatter_path)

            # Calculate metrics
            train_r2 = r2_score(y_train, y_pred_train)
            test_r2 = r2_score(y_test, y_pred_test)
            train_mse = mean_squared_error(y_train, y_pred_train)
            test_mse = mean_squared_error(y_test, y_pred_test)

            # Log metrics
            mlflow.log_metric('train_r2', train_r2)
            mlflow.log_metric('test_r2', test_r2)
            mlflow.log_metric('train_mse', train_mse)
            mlflow.log_metric('test_mse', test_mse)

            logger.info("%s model R^2 on test data: %.3f", model_name, test_r2)

            # Cross-validation scores of the tuned estimator, converted to
            # adjusted R^2 using the training-set size and feature count.
            cv_results = cross_val_score(best_model, X_train, y_train, cv=5, scoring='r2')
            adjusted_r2_scores = np.array(
                [1 - (1 - r2) * (n - 1) / (n - p - 1) for r2 in cv_results])

            # Save and log the per-fold bar plot.
            cv_plot_path = _plot_adjusted_cv_scores(model_name, adjusted_r2_scores)
            mlflow.log_artifact(cv_plot_path)

            # Keep the candidate with the best search score instead of
            # whichever model happened to be trained last.
            if random_search.best_score_ > overall_best_score:
                overall_best_score = random_search.best_score_
                overall_best_model = best_model

    if overall_best_model is None:
        raise ValueError("No model was trained: enable at least one entry in `models`.")

    return overall_best_model
h
Someone will reply to you shortly. In the meantime, this might help:
y
See kedro-mlflow's MlflowArtifactDataset, MlflowModelDataset and MlflowModelRegistryDataset, which are designed for this.
👍 1
g
Great, thanks! Is there any repo with a tutorial I could take a look at?
y