Module python_scripts.model_rf

Random Forest Model with uncertainty estimates and feature importance.

This package is part of the machine learning project developed for the Agricultural Research Federation (AgReFed).

Expand source code
"""
Random Forest Model with uncertainty estimates and feature importance.

This package is part of the machine learning project developed for the Agricultural Research Federation (AgReFed).
"""

import matplotlib.pyplot as plt
import numpy as np
import os
import random
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.inspection import permutation_importance
from scipy.stats import spearmanr

print_info = False


def pred_ints(model, X, percentile=95):
        """
        Estimate prediction spread from the ensemble of decision trees.

        Collects the prediction of every individual tree in the forest and
        derives the standard deviation plus a confidence interval from them.

        INPUT
        -----
        model: trained sklearn ensemble model (must expose estimators_)
        X: input data matrix with shape (npoints,nfeatures)
        percentile: percentile of confidence interval

        RETURN
        -----
        stddev: standard deviation of prediction
        err_down: lower bound of confidence interval
        err_up: upper bound of confidence interval
        """
        # One row per tree, one column per input point.
        tree_preds = np.asarray([tree.predict(X) for tree in model.estimators_])
        # Symmetric tail mass outside the requested interval.
        tail = (100 - percentile) / 2.
        stddev = tree_preds.std(axis=0)
        err_down = np.percentile(tree_preds, tail, axis=0)
        err_up = np.percentile(tree_preds, 100 - tail, axis=0)
        return stddev, err_down, err_up


def rf_train(X_train, y_train):
        """
        Fit a Random Forest regression model on the training data.

        INPUT
        -----
        X_train: input data matrix with shape (npoints,nfeatures)
        y_train: target variable with shape (npoints)

        RETURN
        -----
        model: trained sklearn RandomForestRegressor
        """
        # Fixed random_state keeps the ensemble reproducible between runs.
        model = RandomForestRegressor(
                n_estimators=1000,
                min_samples_leaf=2,
                max_features=0.3,
                random_state=42)
        model.fit(X_train, y_train)
        return model
        

def rf_predict(X_test, rf_model, y_test = None, outpath = None):
        """
        Predict with a trained Random Forest model and quantify uncertainty.

        INPUT
        -----
        X_test: input data points with shape (ndata,n_feature).
                        The number of features has to be the same as for the training data
        rf_model: pre-trained sklearn Random Forest regression model
        y_test: optional true targets; if given, the normalized RMSE is computed
        outpath: optional directory for a predicted-vs-true error-bar plot

        Return
        -----
        ypred: predicted y values
        ypred_std: standard deviation of prediction (spread of the tree ensemble)
        rmse_test: RMSE of test data normalized by y_test.std() (None if y_test is None)
        """
        ypred = rf_model.predict(X_test)
        # Only the standard deviation of the per-tree predictions is needed here.
        ypred_std = pred_ints(rf_model, X_test, percentile=95)[0]
        rmse_test = None
        if y_test is not None:
                # RMSE normalized by the standard deviation of the true targets.
                rmse_test = np.sqrt(np.mean((ypred - y_test)**2)) / y_test.std()
                if print_info:
                        print("Random Forest normalized RMSE Test: ", np.round(rmse_test, 4))
                if outpath is not None:
                        plt.figure()
                        plt.title('Random Forest Test Data')
                        plt.errorbar(y_test, ypred, ypred_std, linestyle='None', marker = 'o', c = 'b')
                        plt.xlabel('y True')
                        plt.ylabel('y Predict')
                        plt.savefig(os.path.join(outpath,'RF_test_pred_vs_true.png'), dpi = 300)
                        plt.close('all')
        return ypred, ypred_std, rmse_test


def rf_train_predict(X_train, y_train, X_test, y_test = None, outpath = None):
        """
        Trains Random Forest regression model with training data and returns prediction for test data

        INPUT
        -----
        X_train: input data matrix with shape (npoints,nfeatures)
        y_train: target variable with shape (npoints)
        X_test: input data matrix with shape (npoints_test,nfeatures)
        y_test: target variable with shape (npoints_test)
        outpath: path to save plots

        RETURN
        ------
        ypred: predicted y values
        residual: residuals of prediction (zeros when y_test is None)
        """

        # Train RF
        rf_model = rf_train(X_train, y_train)

        # Predict for X_test (std and normalized RMSE are computed but not returned here)
        ypred, ypred_std, nrmse_test = rf_predict(X_test, rf_model, y_test = y_test, outpath = outpath)

        # Calculate residuals. Without ground truth, return zeros shaped like the
        # prediction: np.zeros_like(y_test) with y_test=None would yield a
        # meaningless 0-d object array instead of a proper float array.
        if y_test is not None:
                residual = ypred - y_test
        else:
                residual = np.zeros_like(ypred)
        return ypred, residual


def test_rf(logspace = False, nsamples = 600, nfeatures = 14, ninformative = 12, noise = 0.2, outpath = None):
        """
        Test RF model on synthetic regression data.

        INPUT
        -----
        logspace: if True, transforms features and target to an exponential scale
        nsamples: number of samples
        nfeatures: number of features
        ninformative: number of informative features
        noise: noise level
        outpath: path to save plots (created if it does not exist)
        """
        # Create simulated data
        from sklearn.datasets import make_regression
        Xorig, yorig, coeffs = make_regression(n_samples=nsamples,
                n_features=nfeatures, n_informative=ninformative,
                n_targets=1, bias=2.0, tail_strength=0.2, noise=noise, shuffle=True, coef=True, random_state=42)
        if logspace:
                # Exponentiate to emulate log-normally distributed data.
                Xorig = np.exp(Xorig)
                yorig = np.exp(yorig/100)

        if outpath is not None:
                os.makedirs(outpath, exist_ok = True)

        X_train, X_test, y_train, y_test = train_test_split(Xorig, yorig, test_size=0.5, random_state=42)

        # Run RF
        y_pred, residual = rf_train_predict(X_train, y_train, X_test, y_test = y_test, outpath = outpath)

        # Calculate normalized RMSE and normalized root median squared error:
        nrmse = np.sqrt(np.nanmean(residual**2)) / y_test.std()
        nrmedse = np.sqrt(np.median(residual**2)) / y_test.std()
        if print_info:
                print('Normalized RMSE for test data: ', np.round(nrmse,3))
                print('Normalized ROOT MEDIAN SE for test data: ', np.round(nrmedse,3))
        # Feature Importance
        # NOTE(review): rf_factor_importance is not defined in this module's
        # visible source -- confirm it is provided elsewhere in the package.
        feature_importance = rf_factor_importance(X_train, y_train)
        if print_info:
                print('Model correlation coefficients:', coeffs )
                print('Feature Importance:', feature_importance)

Functions

def pred_ints(model, X, percentile=95)

Predict standard deviation and CI using stats of all decision trees

Input

model: trained sklearn model X: input data matrix with shape (npoints,nfeatures) percentile: percentile of confidence interval

Return

stddev: standard deviation of prediction err_down: lower bound of confidence interval err_up: upper bound of confidence interval

Expand source code
def pred_ints(model, X, percentile=95):
        """
        Estimate prediction spread from the ensemble of decision trees.

        Collects the prediction of every individual tree in the forest and
        derives the standard deviation plus a confidence interval from them.

        INPUT
        -----
        model: trained sklearn ensemble model (must expose estimators_)
        X: input data matrix with shape (npoints,nfeatures)
        percentile: percentile of confidence interval

        RETURN
        -----
        stddev: standard deviation of prediction
        err_down: lower bound of confidence interval
        err_up: upper bound of confidence interval
        """
        # One row per tree, one column per input point.
        tree_preds = np.asarray([tree.predict(X) for tree in model.estimators_])
        # Symmetric tail mass outside the requested interval.
        tail = (100 - percentile) / 2.
        stddev = tree_preds.std(axis=0)
        err_down = np.percentile(tree_preds, tail, axis=0)
        err_up = np.percentile(tree_preds, 100 - tail, axis=0)
        return stddev, err_down, err_up
def rf_predict(X_test, rf_model, y_test=None, outpath=None)

Returns Prediction for Random Forest regression model

Input

X_test: input data points in shape (ndata,n_feature). The number of features has to be the same as for the training data rf_model: pre-trained Random Forest regression model y_test: if not None, uses true y data for normalized RMSE calculation

Return

ypred: predicted y values ypred_std: standard deviation of prediction rmse_test: RMSE of test data (if y_test is not None)

Expand source code
def rf_predict(X_test, rf_model, y_test = None, outpath = None):
        """
        Predict with a trained Random Forest model and quantify uncertainty.

        INPUT
        -----
        X_test: input data points with shape (ndata,n_feature).
                        The number of features has to be the same as for the training data
        rf_model: pre-trained sklearn Random Forest regression model
        y_test: optional true targets; if given, the normalized RMSE is computed
        outpath: optional directory for a predicted-vs-true error-bar plot

        Return
        -----
        ypred: predicted y values
        ypred_std: standard deviation of prediction (spread of the tree ensemble)
        rmse_test: RMSE of test data normalized by y_test.std() (None if y_test is None)
        """
        ypred = rf_model.predict(X_test)
        # Only the standard deviation of the per-tree predictions is needed here.
        ypred_std = pred_ints(rf_model, X_test, percentile=95)[0]
        rmse_test = None
        if y_test is not None:
                # RMSE normalized by the standard deviation of the true targets.
                rmse_test = np.sqrt(np.mean((ypred - y_test)**2)) / y_test.std()
                if print_info:
                        print("Random Forest normalized RMSE Test: ", np.round(rmse_test, 4))
                if outpath is not None:
                        plt.figure()
                        plt.title('Random Forest Test Data')
                        plt.errorbar(y_test, ypred, ypred_std, linestyle='None', marker = 'o', c = 'b')
                        plt.xlabel('y True')
                        plt.ylabel('y Predict')
                        plt.savefig(os.path.join(outpath,'RF_test_pred_vs_true.png'), dpi = 300)
                        plt.close('all')
        return ypred, ypred_std, rmse_test
def rf_train(X_train, y_train)

Trains Random Forest regression model with training data

INPUT X: input data matrix with shape (npoints,nfeatures) y: target variable with shape (npoints)

RETURN rf_model: trained sklearn RF model

Expand source code
def rf_train(X_train, y_train):
        """
        Fit a Random Forest regression model on the training data.

        INPUT
        -----
        X_train: input data matrix with shape (npoints,nfeatures)
        y_train: target variable with shape (npoints)

        RETURN
        -----
        model: trained sklearn RandomForestRegressor
        """
        # Fixed random_state keeps the ensemble reproducible between runs.
        model = RandomForestRegressor(
                n_estimators=1000,
                min_samples_leaf=2,
                max_features=0.3,
                random_state=42)
        model.fit(X_train, y_train)
        return model
def rf_train_predict(X_train, y_train, X_test, y_test=None, outpath=None)

Trains Random Forest regression model with training data and returns prediction for test data

Input

X_train: input data matrix with shape (npoints,nfeatures) y_train: target variable with shape (npoints) X_test: input data matrix with shape (npoints_test,nfeatures) y_test: target variable with shape (npoints_test) outpath: path to save plots

Return

ypred: predicted y values residuals: residuals of prediction

Expand source code
def rf_train_predict(X_train, y_train, X_test, y_test = None, outpath = None):
        """
        Trains Random Forest regression model with training data and returns prediction for test data

        INPUT
        -----
        X_train: input data matrix with shape (npoints,nfeatures)
        y_train: target variable with shape (npoints)
        X_test: input data matrix with shape (npoints_test,nfeatures)
        y_test: target variable with shape (npoints_test)
        outpath: path to save plots

        RETURN
        ------
        ypred: predicted y values
        residual: residuals of prediction (zeros when y_test is None)
        """

        # Train RF
        rf_model = rf_train(X_train, y_train)

        # Predict for X_test (std and normalized RMSE are computed but not returned here)
        ypred, ypred_std, nrmse_test = rf_predict(X_test, rf_model, y_test = y_test, outpath = outpath)

        # Calculate residuals. Without ground truth, return zeros shaped like the
        # prediction: np.zeros_like(y_test) with y_test=None would yield a
        # meaningless 0-d object array instead of a proper float array.
        if y_test is not None:
                residual = ypred - y_test
        else:
                residual = np.zeros_like(ypred)
        return ypred, residual
def test_rf(logspace=False, nsamples=600, nfeatures=14, ninformative=12, noise=0.2, outpath=None)

Test RF model on synthetic data

Input

logspace: if True, uses logarithmic scale for features nsamples: number of samples nfeatures: number of features ninformative: number of informative features noise: noise level outpath: path to save plots

Expand source code
def test_rf(logspace = False, nsamples = 600, nfeatures = 14, ninformative = 12, noise = 0.2, outpath = None):
        """
        Test RF model on synthetic regression data.

        INPUT
        -----
        logspace: if True, transforms features and target to an exponential scale
        nsamples: number of samples
        nfeatures: number of features
        ninformative: number of informative features
        noise: noise level
        outpath: path to save plots (created if it does not exist)
        """
        # Create simulated data
        from sklearn.datasets import make_regression
        Xorig, yorig, coeffs = make_regression(n_samples=nsamples,
                n_features=nfeatures, n_informative=ninformative,
                n_targets=1, bias=2.0, tail_strength=0.2, noise=noise, shuffle=True, coef=True, random_state=42)
        if logspace:
                # Exponentiate to emulate log-normally distributed data.
                Xorig = np.exp(Xorig)
                yorig = np.exp(yorig/100)

        if outpath is not None:
                os.makedirs(outpath, exist_ok = True)

        X_train, X_test, y_train, y_test = train_test_split(Xorig, yorig, test_size=0.5, random_state=42)

        # Run RF
        y_pred, residual = rf_train_predict(X_train, y_train, X_test, y_test = y_test, outpath = outpath)

        # Calculate normalized RMSE and normalized root median squared error:
        nrmse = np.sqrt(np.nanmean(residual**2)) / y_test.std()
        nrmedse = np.sqrt(np.median(residual**2)) / y_test.std()
        if print_info:
                print('Normalized RMSE for test data: ', np.round(nrmse,3))
                print('Normalized ROOT MEDIAN SE for test data: ', np.round(nrmedse,3))
        # Feature Importance
        # NOTE(review): rf_factor_importance is not defined in this module's
        # visible source -- confirm it is provided elsewhere in the package.
        feature_importance = rf_factor_importance(X_train, y_train)
        if print_info:
                print('Model correlation coefficients:', coeffs )
                print('Feature Importance:', feature_importance)