Module python_scripts.soilmod_xval
Probabilistic machine learning models and evaluation using Gaussian Process Priors with mean functions.
Current models implemented: - Gaussian Process with bayesian linear regression (BLR) as mean function and sparse spatial covariance function - Gaussian Process with random forest (RF) regression as mean function and sparse spatial covariance function
Core functions: - train baseline models (mean functions): BLR and RF - hyperparameter optimisation of GP model - n-fold cross-validation of models - model evaluations: RMSE, NRMSE, R2, uncertainty of predictions - residual plots and analysis - ranking of best models
User settings, such as input/output paths and all other options, are set in the settings file (Default filename: settings_soilmodel_xval.yaml) Alternatively, the settings file can be specified as a command line argument with: '-s', or '--settings' followed by PATH-TO-FILE/FILENAME.yaml (e.g. python featureimportance.py -s settings_featureimportance.yaml).
See README.md for more information.
This package is part of the machine learning project developed for the Agricultural Research Federation (AgReFed).
Expand source code
"""
Probabilistic machine learning models and evaluation using Gaussian Process Priors with mean functions.
Current models implemented:
- Gaussian Process with bayesian linear regression (BLR) as mean function and sparse spatial covariance function
- Gaussian Process with random forest (RF) regression as mean function and sparse spatial covariance function
Core functions:
- train baseline models (mean functions): BLR and RF
- hyperparameter optimisation of GP model
- n-fold cross-validation of models
- model evaluations: RMSE, NRMSE, R2, uncertainty of predictions
- residual plots and analysis
- ranking of best models
User settings, such as input/output paths and all other options, are set in the settings file
(Default filename: settings_soilmodel_xval.yaml)
Alternatively, the settings file can be specified as a command line argument with:
'-s', or '--settings' followed by PATH-TO-FILE/FILENAME.yaml
(e.g. python featureimportance.py -s settings_featureimportance.yaml).
See README.md for more information.
This package is part of the machine learning project developed for the Agricultural Research Federation (AgReFed).
"""
import numpy as np
import pandas as pd
import os
import sys
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import yaml
import argparse
from types import SimpleNamespace
# Custom local libraries:
from utils import print2, truncate_data
from preprocessing import gen_kfold
import GPmodel as gp # GP model plus kernel functions and distance matrix calculation
import model_blr as blr
import model_rf as rf
# Default settings yaml file; can be overridden on the command line with -s/--settings
_fname_settings = 'settings_soilmod_xval.yaml'
# flag to show plot figures interactively or not (True/False); figures are always saved to disk
_show = False
def runmodel(dfsel, model_function, settings):
    """
    Train model function on dataframe dfsel and return nfold cross-validation results.

    This function creates multiple diagnostic charts and evaluation statistics
    saved in the output folder.

    Input:
    ------
    dfsel: dataframe with data for training and testing (nfold column required to split data)
    model_function: str, function to train model (supported: 'blr', 'rf', 'blr-gp', 'rf-gp', 'gp-only')
    settings: settings for model function

    Returns:
    --------
    dfsum: dataframe with summary results
    stats_summary: tuple of summary statistics
        (mean nRMSE, std nRMSE, mean Theta, std Theta, mean R2, std R2)
    outpath: path to output files
    """
    # Bug fix: previously imported inside the fold loop, which left `sns`
    # unbound for the summary plots below if the loop body never executed.
    import seaborn as sns

    outpath_root = settings.outpath
    # set conditional mean function
    if model_function in ('blr', 'rf'):
        # only mean function model (no GP correction on the residuals)
        calc_mean_only = True
    else:
        calc_mean_only = False
    if model_function in ('blr-gp', 'blr'):
        mean_function = 'blr'
    if model_function in ('rf-gp', 'rf'):
        mean_function = 'rf'
    if model_function == 'gp-only':
        mean_function = 'const'
    # get train and test data, folds are given by the unique values of the nfold column
    range_nfold = np.sort(dfsel.nfold.unique())
    print(f'Computing {len(range_nfold)}-fold xrossvalidation for function model: {model_function}')
    subdir = 'Xval_' + str(len(range_nfold)) + '-fold_' + model_function + '_' + settings.name_target
    outpath = os.path.join(outpath_root, subdir)
    ### X-fold Crossvalidation ###
    # Initialise lists to hold summary results for n-fold validation
    rmse_nfold = []
    nrmse_nfold = []
    rmedse_nfold = []
    nrmedse_nfold = []
    meansspe_nfold = []
    r2_nfold = []
    histresidual = []
    histsspe = []
    # dataframe to hold all predictions:
    dfpred_all = pd.DataFrame()
    # Loop over all folds: test set is the fold ix, training set is the remainder
    for ix in range_nfold:
        print('Processing for nfold ', ix)
        # update outpath with iteration of cross-validation
        outpath_nfold = os.path.join(outpath, 'nfold_' + str(ix) + '/')
        os.makedirs(outpath_nfold, exist_ok=True)
        # split into train and test data
        dftrain = dfsel[dfsel[settings.name_ixval] != ix].copy()
        dftest = dfsel[dfsel[settings.name_ixval] == ix].copy()
        # Copy dataframe for saving results later
        dfpred = dftest.copy()
        # 3D coordinates in (z, y, x) order as consumed by the GP functions below
        points3D_train = np.asarray([dftrain.z.values, dftrain.y.values, dftrain.x.values]).T
        points3D_test = np.asarray([dftest.z.values, dftest.y.values, dftest.x.values]).T
        y_train = dftrain[settings.name_target].values
        y_test = dftest[settings.name_target].values
        # Uncertainty in coordinates: only the z coordinate carries uncertainty
        # (half the z interval if a 'z_diff' column is present, else zero)
        if 'z_diff' in list(dftrain):
            Xdelta_train = np.asarray([0.5 * dftrain.z_diff.values, dftrain.y.values * 0, dftrain.x.values * 0.]).T
            Xdelta_test = np.asarray([0.5 * dftest.z_diff.values, dftest.y.values * 0, dftest.x.values * 0.]).T
        else:
            Xdelta_train = np.asarray([0 * dftrain.z.values, dftrain.y.values * 0, dftrain.x.values * 0.]).T
            Xdelta_test = np.asarray([0 * dftest.z.values, dftest.y.values * 0, dftest.x.values * 0.]).T
        if mean_function == 'rf':
            # Estimate GP mean function with Random Forest Regressor
            X_train = dftrain[settings.name_features].values
            y_train = dftrain[settings.name_target].values
            X_test = dftest[settings.name_features].values
            y_test = dftest[settings.name_target].values
            rf_model = rf.rf_train(X_train, y_train)
            ypred_rf_train, ynoise_train, nrmse_rf_train = rf.rf_predict(X_train, rf_model, y_test=y_train)
            ypred_rf, ynoise_pred, nrmse_rf_test = rf.rf_predict(X_test, rf_model, y_test=y_test)
            y_train_fmean = ypred_rf_train
            if not calc_mean_only:
                # Diagnostic plot of the mean-function fit (train and test)
                plt.figure()
                plt.title('Random Forest Mean Function')
                plt.errorbar(y_train, ypred_rf_train, ynoise_train, linestyle='None', marker='o', c='r', label='Train Data', alpha=0.5)
                plt.errorbar(y_test, ypred_rf, ynoise_pred, linestyle='None', marker='o', c='b', label='Test Data', alpha=0.5)
                plt.legend(loc='upper left')
                plt.xlabel('y True')
                plt.ylabel('y Predict')
                plt.savefig(os.path.join(outpath_nfold, settings.name_target + '_meanfunction_pred_vs_true.png'), dpi=300)
                if _show:
                    plt.show()
                plt.close('all')
        elif mean_function == 'blr':
            # Estimate GP mean function with Bayesian Linear Regression
            X_train = dftrain[settings.name_features].values
            y_train = dftrain[settings.name_target].values
            X_test = dftest[settings.name_features].values
            y_test = dftest[settings.name_target].values
            # Scale data (scalers are fitted on the training data only)
            Xs_train, ys_train, scale_params = blr.scale_data(X_train, y_train)
            scaler_x, scaler_y = scale_params
            Xs_test = scaler_x.transform(X_test)
            # Train BLR
            blr_model = blr.blr_train(Xs_train, y_train)
            # Predict for X_test
            ypred_blr, ypred_std_blr, nrmse_blr_test = blr.blr_predict(Xs_test, blr_model, y_test=y_test)
            ypred_blr_train, ypred_std_blr_train, nrmse_blr_train = blr.blr_predict(Xs_train, blr_model, y_test=y_train)
            ypred_blr = ypred_blr.flatten()
            ypred_blr_train = ypred_blr_train.flatten()
            y_train_fmean = ypred_blr_train
            ynoise_train = ypred_std_blr_train
            ynoise_pred = ypred_std_blr
            if not calc_mean_only:
                # Diagnostic plot of the mean-function fit (train and test)
                plt.figure()
                plt.title('BLR Mean function')
                plt.errorbar(y_train, ypred_blr_train, ynoise_train, linestyle='None', marker='o', c='r', label='Train Data', alpha=0.5)
                plt.errorbar(y_test, ypred_blr, ynoise_pred, linestyle='None', marker='o', c='b', label='Test Data', alpha=0.5)
                plt.legend(loc='upper left')
                plt.xlabel('y True')
                plt.ylabel('y Predict')
                plt.savefig(os.path.join(outpath_nfold, settings.name_target + '_meanfunction_pred_vs_true.png'), dpi=300)
                if _show:
                    plt.show()
                plt.close('all')
        elif mean_function == 'const':
            # Constant mean function: mean of the training targets everywhere
            y_train_fmean = np.mean(y_train) * np.ones(y_train.shape)
            ypred_const = np.mean(y_train) * np.ones(y_test.shape)
            # Bug fix: was `np.mean(y_train) * y_train_fmean`, which squared
            # the train mean; the constant mean prediction for the training
            # points is simply y_train_fmean.
            ypred_const_train = y_train_fmean.copy()
            # negligible noise so the GP covariance is driven by the data
            ynoise_train = 1e-6 * np.ones(y_train.shape)
            ynoise_pred = 1e-6 * np.ones(y_test.shape)
        # Subtract mean function from training data for GP with zero mean
        y_train -= y_train_fmean
        # plot training and testing spatial distribution
        plt.figure(figsize=(8, 6))
        plt.scatter(dftrain.x.values, dftrain.y.values, alpha=0.3, c='b', label='Train')
        plt.scatter(dftest.x.values, dftest.y.values, alpha=0.3, c='r', label='Test')
        plt.axis('equal')
        plt.xlabel('Easting')
        plt.ylabel('Northing')
        plt.title('Mean subtracted ' + settings.name_target)
        plt.legend()
        plt.tight_layout()
        plt.savefig(os.path.join(outpath_nfold, settings.name_target + '_train.png'), dpi=300)
        if _show:
            plt.show()
        ### Plot histogram of target values after mean subtraction
        plt.clf()
        plt.hist(y_train, bins=30)
        plt.xlabel('Mean subtracted y_train')
        plt.ylabel('N')
        plt.savefig(os.path.join(outpath_nfold, 'Hist_' + settings.name_target + '_train.png'), dpi=300)
        if _show:
            plt.show()
        plt.close('all')
        if not calc_mean_only:
            # optimise GP hyperparameters
            # Use mean of X uncertainty for optimizing since otherwise too many local minima
            print('Mean of Y: ' + str(np.round(np.mean(y_train), 4)) + ' +/- ' + str(np.round(np.std(y_train), 4)))
            print('Mean of Mean function: ' + str(np.round(np.mean(y_train_fmean), 4)) + ' +/- ' + str(np.round(np.std(y_train_fmean), 4)))
            print('Mean of Mean function noise: ' + str(np.round(np.mean(ynoise_train), 4)) + ' +/- ' + str(np.round(np.std(ynoise_train), 4)))
            print('Optimizing GP hyperparameters...')
            Xdelta_mean = Xdelta_train * 0 + np.nanmean(Xdelta_train, axis=0)
            # TBD: find automatic way to set hyperparameter boundaries based on data
            # lower length-scale bounds: half the mean spacing of unique coordinates
            xymin = 0.5 * (points3D_train[:, 1].max() - points3D_train[:, 1].min()) / np.unique(points3D_train[:, 1]).size
            zmin = 0.5 * (points3D_train[:, 0].max() - points3D_train[:, 0].min()) / np.unique(points3D_train[:, 0]).size
            opt_params, opt_logl = gp.optimize_gp_3D(points3D_train, y_train, ynoise_train, xymin=xymin, zmin=zmin, Xdelta=Xdelta_mean)
            params_gp = opt_params
            # Calculate predicted mean values
            points3D_pred = points3D_test.copy()
            print('Computing GP predictions for test set nfold ', ix)
            ypred, ypred_std, logl, gp_train = gp.train_predict_3D(points3D_train, points3D_pred, y_train, ynoise_train, params_gp, Ynoise_pred=ynoise_pred, Xdelta=Xdelta_train)
            ypred_train, ypred_std_train, _, _ = gp.train_predict_3D(points3D_train, points3D_train, y_train, ynoise_train, params_gp, Ynoise_pred=ynoise_train, Xdelta=Xdelta_train)
        else:
            # mean-function-only models: no GP correction, uncertainty is the
            # mean-function noise estimate
            ypred = 0
            ypred_train = 0
            ypred_std = ynoise_pred
            ypred_std_train = ynoise_train
        # Add mean function to prediction
        if mean_function == 'rf':
            y_pred_zmean = ypred_rf
            y_pred_train_zmean = ypred_rf_train
        elif mean_function == 'blr':
            y_pred_zmean = ypred_blr
            y_pred_train_zmean = ypred_blr_train
        elif mean_function == 'const':
            y_pred_zmean = ypred_const
            y_pred_train_zmean = ypred_const_train
        y_pred = ypred + y_pred_zmean
        y_pred_train = ypred_train + y_pred_train_zmean
        # restore the training targets (mean was subtracted in-place above)
        y_train += y_train_fmean
        # Calculate Residual, RMSE, R2, SSPE
        residual_test = y_pred - y_test
        rmse = np.sqrt(np.nanmean(residual_test**2))
        rmse_norm = rmse / y_test.std()
        rmedse = np.sqrt(np.median(residual_test**2))
        rmedse_norm = rmedse / y_test.std()
        # Theta: squared prediction error standardized by the predicted variance
        sspe = residual_test**2 / (ypred_std**2)
        r2 = 1 - np.nanmean(residual_test**2) / np.nanmean((y_test - y_test.mean())**2)
        if not calc_mean_only:
            print("GP Marginal Log-Likelihood: ", np.round(logl, 2))
        print("Normalized RMSE: ", np.round(rmse_norm, 4))
        print("Normalized ROOT MEDIAN SE: ", np.round(rmedse_norm, 4))
        print("R^2: ", np.round(r2, 4))
        print("Mean Theta: ", np.round(np.mean(sspe), 4))
        print("Median Theta: ", np.round(np.median(sspe)))
        # Save results in dataframe
        dfpred[settings.name_target + '_predict'] = y_pred
        dfpred[settings.name_target + '_stddev'] = ypred_std
        dfpred['Residual'] = residual_test
        dfpred['Residual_squared'] = residual_test**2
        dfpred['Theta'] = sspe
        dfpred.to_csv(os.path.join(outpath_nfold, settings.name_target + '_results_nfold' + str(ix) + '.csv'), index=False)
        # add to dataframe for all folds
        dfpred_all = pd.concat([dfpred_all, dfpred], axis=0, ignore_index=True)
        # Residual Map
        plt.figure(figsize=(8, 6))
        plt.scatter(dftest.x.values, dftest.y.values, c=residual_test, alpha=0.3)
        plt.axis('equal')
        plt.xlabel('Easting')
        plt.ylabel('Northing')
        plt.title('Residual Test Data ' + settings.name_target)
        plt.colorbar()
        plt.tight_layout()
        plt.savefig(os.path.join(outpath_nfold, settings.name_target + '_residualmap.png'), dpi=300)
        if _show:
            plt.show()
        # Residual Plot
        # Bug fix: start a fresh figure here; the histogram subplots were
        # previously drawn on top of the still-open residual-map figure.
        plt.close('all')
        plt.figure()
        plt.subplot(2, 1, 1)
        sns.distplot(residual_test, norm_hist=True)
        plt.title(settings.name_target + ' Residual Analysis of Test Data')
        plt.ylabel('Residual')
        plt.subplot(2, 1, 2)
        sns.distplot(sspe, norm_hist=True)
        plt.ylabel(r'$\Theta$')
        plt.savefig(os.path.join(outpath_nfold, 'Residual_hist_' + settings.name_target + '_nfold' + str(ix) + '.png'), dpi=300)
        if _show:
            plt.show()
        plt.close('all')
        # Predicted vs true values with uncertainties
        plt.figure()
        plt.errorbar(y_train, y_pred_train, ypred_std_train, linestyle='None', marker='o', c='r', label='Train Data', alpha=0.5)
        plt.errorbar(y_test, y_pred, ypred_std, linestyle='None', marker='o', c='b', label='Test Data', alpha=0.5)
        plt.legend(loc='upper left')
        plt.xlabel(settings.name_target + ' True')
        plt.ylabel(settings.name_target + ' Predict')
        plt.savefig(os.path.join(outpath_nfold, 'pred_vs_true' + settings.name_target + '_nfold' + str(ix) + '.png'), dpi=300)
        if _show:
            plt.show()
        plt.close('all')
        # collect per-fold statistics
        rmse_nfold.append(rmse)
        nrmse_nfold.append(rmse_norm)
        rmedse_nfold.append(rmedse)
        nrmedse_nfold.append(rmedse_norm)
        meansspe_nfold.append(np.mean(sspe))
        r2_nfold.append(r2)
        histsspe.append(sspe)
        histresidual.append(residual_test)
    # Save prediction results in dataframe
    print("Saving all predictions in dataframe ...")
    dfpred_all.to_csv(os.path.join(outpath, settings.name_target + '_results_nfold_all.csv'), index=False)
    # Plot all predictions vs true for test data
    plt.figure()
    plt.title('Combined Test Data')
    plt.errorbar(dfpred_all[settings.name_target],
                 dfpred_all[settings.name_target + '_predict'],
                 dfpred_all[settings.name_target + '_stddev'],
                 linestyle='None', marker='o', c='b', alpha=0.5)
    plt.xlabel(settings.name_target + ' True')
    plt.ylabel(settings.name_target + ' Predict')
    plt.savefig(os.path.join(outpath, 'pred_vs_true' + settings.name_target + '_combined.png'), dpi=300)
    if _show:
        plt.show()
    plt.close('all')
    # Save and plot summary results of residual analysis for test data:
    rmse_nfold = np.asarray(rmse_nfold)
    nrmse_nfold = np.asarray(nrmse_nfold)
    rmedse_nfold = np.asarray(rmedse_nfold)
    nrmedse_nfold = np.asarray(nrmedse_nfold)
    meansspe_nfold = np.asarray(meansspe_nfold)
    r2_nfold = np.asarray(r2_nfold)
    dfsum = pd.DataFrame({'nfold': range_nfold, 'RMSE': rmse_nfold, 'nRMSE': nrmse_nfold, 'RMEDIANSE': rmedse_nfold, 'nRMEDIANSE': nrmedse_nfold, 'R2': r2_nfold, 'Theta': meansspe_nfold})
    dfsum.to_csv(os.path.join(outpath, settings.name_target + 'nfold_summary_stats.csv'), index=False)
    print("---- X-validation Summary -----")
    print("Mean normalized RMSE: " + str(np.round(np.mean(nrmse_nfold), 3)) + " +/- " + str(np.round(np.std(nrmse_nfold), 3)))
    print("Median normalized RMSE: " + str(np.round(np.median(nrmedse_nfold), 3)))
    print("Mean R^2: " + str(np.round(np.mean(r2_nfold), 3)))
    print("Mean Theta: " + str(np.round(np.mean(meansspe_nfold), 3)) + " +/- " + str(np.round(np.std(meansspe_nfold), 3)))
    # truncate_data presumably clips extreme tails for readable histograms
    # — confirm against its definition in utils
    histresidual_cut = truncate_data(histresidual, 1)
    histsspe_cut = truncate_data(histsspe, 1)
    plt.subplot(2, 1, 1)
    sns.distplot(histresidual_cut, norm_hist=True)
    plt.title(settings.name_target + ' Residual Analysis of Test Data')
    plt.ylabel('Residual')
    plt.subplot(2, 1, 2)
    sns.distplot(histsspe_cut, norm_hist=True)
    plt.ylabel(r'$\Theta$')
    plt.savefig(os.path.join(outpath, 'Xvalidation_Residual_hist_' + settings.name_target + '.png'), dpi=300)
    if _show:
        plt.show()
    plt.close('all')
    stats_summary = (np.round(np.mean(nrmse_nfold), 3), np.round(np.std(nrmse_nfold), 3),
                     np.round(np.mean(meansspe_nfold), 3), np.round(np.std(meansspe_nfold), 3),
                     np.round(np.mean(r2_nfold), 3), np.round(np.std(r2_nfold), 3))
    return dfsum, stats_summary, outpath
######################### Main Function ############################
def main(fname_settings):
    """
    Main script for running n-fold cross-validation of soil models with Gaussian Process.

    fname_settings: str, path and filename of the yaml settings file.
    """
    # Load settings from yaml file
    with open(fname_settings, 'r') as f:
        settings = yaml.load(f, Loader=yaml.FullLoader)
    # Parse settings dictionary as namespace (settings are available as
    # settings.variable_name rather than settings['variable_name'])
    settings = SimpleNamespace(**settings)
    # Add temporal or vertical component: for temporal data the time coordinate
    # takes the role of the z coordinate in everything below
    if settings.axistype == 'temporal':
        settings.colname_zcoord = settings.colname_tcoord
        settings.colname_zmin = settings.colname_tmin
        settings.colname_zmax = settings.colname_tmax
    if not isinstance(settings.model_functions, list):
        settings.model_functions = [settings.model_functions]
    # check if outpath exists, if not create directory
    os.makedirs(settings.outpath, exist_ok=True)
    # Initialise output info file:
    print2('init')
    print2(f'--- Parameter Settings ---')
    print2(f'Selected Model Functions: {settings.model_functions}')
    print2(f'Target Name: {settings.name_target}')
    print2(f'--------------------------')
    print('Reading data into dataframe...')
    # Read in data
    dfsel = pd.read_csv(os.path.join(settings.inpath, settings.infname))
    # Rename x and y coordinates of input data
    if settings.colname_xcoord != 'x':
        dfsel.rename(columns={settings.colname_xcoord: 'x'}, inplace=True)
    if settings.colname_ycoord != 'y':
        dfsel.rename(columns={settings.colname_ycoord: 'y'}, inplace=True)
    # Rename the vertical/temporal coordinate to 'z'.
    # Bug fix: the original merged both checks into one `if/else`, so vertical
    # data whose z column was already named 'z' fell through to the temporal
    # branch (renaming a time column and wrongly appending 'z' to the features).
    if settings.axistype == 'vertical':
        if settings.colname_zcoord != 'z':
            dfsel.rename(columns={settings.colname_zcoord: 'z'}, inplace=True)
    else:
        # temporal: colname_zcoord was set to colname_tcoord above
        if settings.colname_zcoord != 'z':
            dfsel.rename(columns={settings.colname_zcoord: 'z'}, inplace=True)
        # the temporal coordinate is also used as a model feature
        settings.name_features.append('z')
    # Select data between zmin and zmax
    # NOTE(review): despite their names, colname_zmin/colname_zmax are used
    # here as numeric threshold values, not column names — confirm in the
    # settings yaml.
    dfsel = dfsel[(dfsel['z'] >= settings.colname_zmin) & (dfsel['z'] <= settings.colname_zmax)]
    # Generate kfold indices; folds keep unique (x, y) locations together
    if settings.axistype == 'vertical':
        dfsel = gen_kfold(dfsel, nfold=settings.nfold, label_nfold='nfold', id_unique=['x', 'y'], precision_unique=0.01)
    elif settings.axistype == 'temporal':
        dfsel = gen_kfold(dfsel, nfold=settings.nfold, label_nfold='nfold', id_unique=['x', 'y'], precision_unique=0.01)
    ## Get coordinates for training data and set coord origin to (0,0)
    bound_xmin = dfsel.x.min()
    bound_ymin = dfsel.y.min()
    dfsel['x'] = dfsel['x'] - bound_xmin
    dfsel['y'] = dfsel['y'] - bound_ymin
    # Summary statistics collected per model function
    nrmse_meanfunction = []
    nrmse_meanfunction_std = []
    theta_meanfunction = []
    theta_meanfunction_std = []
    r2_meanfunction = []
    r2_meanfunction_std = []
    # Loop over model functions and evaluate
    for model_function in settings.model_functions:
        # run and evaluate model
        dfsum, stats_summary, model_outpath = runmodel(dfsel, model_function, settings)
        print(f'All output files of {model_function} saved in {model_outpath}')
        print('')
        # save results
        nrmse_meanfunction.append(stats_summary[0])
        nrmse_meanfunction_std.append(stats_summary[1])
        theta_meanfunction.append(stats_summary[2])
        theta_meanfunction_std.append(stats_summary[3])
        r2_meanfunction.append(stats_summary[4])
        r2_meanfunction_std.append(stats_summary[5])
    # End of xval loop over all models
    # Print best models sorted by nRMSE (ascending: best first).
    # Bug fix: `list.index` on sorted values repeats the first index for tied
    # nRMSE values; argsort yields each model exactly once.
    ix_meanfunction_sorted = np.argsort(nrmse_meanfunction)
    print('')
    print('-------------------------------')
    print('Models ranked based on nRMSE:')
    print('')
    for ix in ix_meanfunction_sorted:
        print(f'{settings.model_functions[ix]}: Mean nRMSE = {nrmse_meanfunction[ix]} +/- {nrmse_meanfunction_std[ix]}, Mean R2= {r2_meanfunction[ix]} +/- {r2_meanfunction_std[ix]}, Theta = {theta_meanfunction[ix]} +/- {theta_meanfunction_std[ix]}')
if __name__ == '__main__':
    # Entry point: read the settings-file path from the command line and run.
    cli = argparse.ArgumentParser(
        description='Prediction model for machine learning on soil data.')
    cli.add_argument(
        '-s', '--settings',
        type=str,
        required=False,
        default=_fname_settings,
        help='Path and filename of settings file.')
    parsed = cli.parse_args()
    main(parsed.settings)
Functions
def main(fname_settings)
Main script for running 3D cubing with Gaussian Process. See Documentation and comments below for more details.
Expand source code
def main(fname_settings): """ Main script for running 3D cubing with Gaussian Process. See Documentation and comments below for more details. """ # Load settings from yaml file with open(fname_settings, 'r') as f: settings = yaml.load(f, Loader=yaml.FullLoader) # Parse settings dictinary as namespace (settings are available as # settings.variable_name rather than settings['variable_name']) settings = SimpleNamespace(**settings) # Add temporal or vertical componnet if settings.axistype == 'temporal': settings.colname_zcoord = settings.colname_tcoord settings.colname_zmin = settings.colname_tmin settings.colname_zmax = settings.colname_tmax if type(settings.model_functions) != list: settings.model_functions = [settings.model_functions] # check if outpath exists, if not create direcory os.makedirs(settings.outpath, exist_ok = True) # Intialise output info file: print2('init') print2(f'--- Parameter Settings ---') print2(f'Selected Model Functions: {settings.model_functions}') print2(f'Target Name: {settings.name_target}') print2(f'--------------------------') print('Reading data into dataframe...') # Read in data dfsel = pd.read_csv(os.path.join(settings.inpath, settings.infname)) # Rename x and y coordinates of input data if settings.colname_xcoord != 'x': dfsel.rename(columns={settings.colname_xcoord: 'x'}, inplace = True) if settings.colname_ycoord != 'y': dfsel.rename(columns={settings.colname_ycoord: 'y'}, inplace = True) if (settings.axistype == 'vertical') & (settings.colname_zcoord != 'z'): dfsel.rename(columns={settings.colname_zcoord: 'z'}, inplace = True) else: dfsel.rename(columns={settings.colname_tcoord: 'z'}, inplace = True) dfsel.rename(columns={settings.colname_zcoord: 'z'}, inplace = True) settings.name_features.append('z') # Select data between zmin and zmax dfsel = dfsel[(dfsel['z'] >= settings.colname_zmin) & (dfsel['z'] <= settings.colname_zmax)] # Generate kfold indices if settings.axistype == 'vertical': dfsel = gen_kfold(dfsel, nfold = 
settings.nfold, label_nfold = 'nfold', id_unique = ['x','y'], precision_unique = 0.01) elif settings.axistype == 'temporal': #dfsel = gen_kfold(dfsel, nfold = settings.nfold, label_nfold = 'nfold', id_unique = ['x', 'y', 'z'], precision_unique = 0.01) dfsel = gen_kfold(dfsel, nfold = settings.nfold, label_nfold = 'nfold', id_unique = ['x', 'y'], precision_unique = 0.01) ## Get coordinates for training data and set coord origin to (0,0) bound_xmin = dfsel.x.min() bound_xmax = dfsel.x.max() bound_ymin = dfsel.y.min() bound_ymax = dfsel.y.max() # Set origin to (0,0) dfsel['x'] = dfsel['x'] - bound_xmin dfsel['y'] = dfsel['y'] - bound_ymin nrmse_meanfunction = [] nrmse_meanfunction_std = [] theta_meanfunction = [] theta_meanfunction_std = [] r2_meanfunction = [] r2_meanfunction_std = [] # Loop over model functions and evaluate for model_function in settings.model_functions: # run and evaluate model dfsum, stats_summary, model_outpath = runmodel(dfsel, model_function, settings) print(f'All output files of {model_function} saved in {model_outpath}') print('') # save results nrmse_meanfunction.append(stats_summary[0]) nrmse_meanfunction_std.append(stats_summary[1]) theta_meanfunction.append(stats_summary[2]) theta_meanfunction_std.append(stats_summary[3]) r2_meanfunction.append(stats_summary[4]) r2_meanfunction_std.append(stats_summary[5]) #End of xval loop over all models #Print best models sorted with nRMSE ix_meanfunction_sorted = [nrmse_meanfunction.index(x) for x in sorted(nrmse_meanfunction)] print('') print('-------------------------------') print('Models ranked based on nRMSE:') print('') for ix in ix_meanfunction_sorted: print(f'{settings.model_functions[ix]}: Mean nRMSE = {nrmse_meanfunction[ix]} +/- {nrmse_meanfunction_std[ix]}, Mean R2= {r2_meanfunction[ix]} +/- {r2_meanfunction_std[ix]}, Theta = {theta_meanfunction[ix]} +/- {theta_meanfunction_std[ix]}') def runmodel(dfsel, model_function, settings)-
Train model function on dataframe dfsel and return nfold cross-validation results. This function creates multiple diagnostic charts and evaluation statistics saved in the output folder.
Input:
dfsel: dataframe with data for training and testing (nfold column required to split data) model_function: str, function to train model (supported: 'blr', 'rf', 'blr-gp', 'rf-gp', 'gp-only') settings: settings for model functionReturns:
dfsum: dataframe with summary results stats_summary: list of summary statistics outpath: path to output filesExpand source code
def runmodel(dfsel, model_function, settings):
    """
    Train model function on dataframe dfsel and return nfold cross-validation results.

    This function creates multiple diagnostic charts and evaluation statistics
    saved in the output folder.

    Input:
    ------
    dfsel: dataframe with data for training and testing (nfold column required to split data)
    model_function: str, function to train model (supported: 'blr', 'rf', 'blr-gp', 'rf-gp', 'gp-only')
    settings: settings for model function

    Returns:
    --------
    dfsum: dataframe with summary results
    stats_summary: list of summary statistics
    outpath: path to output files
    """
    # Hoisted from the fold loop: sns is also used after the loop for the
    # combined residual histograms, so importing inside the loop would raise
    # a NameError if the loop body never ran.
    import seaborn as sns

    outpath_root = settings.outpath
    # set conditional mean function
    if (model_function == 'blr') | (model_function == 'rf'):
        # only mean function model, no GP on the residuals
        calc_mean_only = True
    else:
        calc_mean_only = False
    if (model_function == 'blr-gp') | (model_function == 'blr'):
        mean_function = 'blr'
        # print('mean function:', mean_function)
    if (model_function == 'rf-gp') | (model_function == 'rf'):
        mean_function = 'rf'
    if model_function == 'gp-only':
        mean_function = 'const'
        # print('mean function:', mean_function)

    # get train and test data, here we can include loop over ix for cross validation
    range_nfold = np.sort(dfsel.nfold.unique())
    print(f'Computing {len(range_nfold)}-fold crossvalidation for function model: {model_function}')
    subdir = 'Xval_' + str(len(range_nfold)) + '-fold_' + model_function + '_' + settings.name_target
    outpath = os.path.join(outpath_root, subdir)

    ### X-fold Crossvalidation ###
    # Intialise lists to hold summary results for n-fold validation
    rmse_nfold = []
    nrmse_nfold = []
    rmedse_nfold = []
    nrmedse_nfold = []
    meansspe_nfold = []
    r2_nfold = []
    histresidual = []
    histsspe = []
    # dataframe to hold all predictions:
    dfpred_all = pd.DataFrame()

    # Loop over all folds
    for ix in range_nfold:
        # Loop over all train/test sets (test sets are designated by ix;
        # training set is defined by the remaining set)
        print('Processing for nfold ', ix)
        # update outpath with iteration of cross-validation
        outpath_nfold = os.path.join(outpath, 'nfold_' + str(ix) + '/')
        os.makedirs(outpath_nfold, exist_ok = True)
        # split into train and test data
        dftrain = dfsel[dfsel[settings.name_ixval] != ix].copy()
        dftest = dfsel[dfsel[settings.name_ixval] == ix].copy()
        # Copy dataframe for saving results later
        dfpred = dftest.copy()
        # 3D coordinates in (z, y, x) order as expected by the gp module
        points3D_train = np.asarray([dftrain.z.values, dftrain.y.values, dftrain.x.values]).T
        points3D_test = np.asarray([dftest.z.values, dftest.y.values, dftest.x.values]).T
        # Check for nan values:
        y_train = dftrain[settings.name_target].values
        y_test = dftest[settings.name_target].values
        # Uncertainty in coordinates (only the vertical component carries
        # uncertainty; x and y are assumed exact)
        if 'z_diff' in list(dftrain):
            Xdelta_train = np.asarray([0.5 * dftrain.z_diff.values, dftrain.y.values * 0, dftrain.x.values * 0.]).T
            Xdelta_test = np.asarray([0.5 * dftest.z_diff.values, dftest.y.values * 0, dftest.x.values * 0.]).T
        else:
            Xdelta_train = np.asarray([0 * dftrain.z.values, dftrain.y.values * 0, dftrain.x.values * 0.]).T
            Xdelta_test = np.asarray([0 * dftest.z.values, dftest.y.values * 0, dftest.x.values * 0.]).T

        if mean_function == 'rf':
            # Estimate GP mean function with Random Forest Regressor
            X_train = dftrain[settings.name_features].values
            y_train = dftrain[settings.name_target].values
            X_test = dftest[settings.name_features].values
            y_test = dftest[settings.name_target].values
            rf_model = rf.rf_train(X_train, y_train)
            ypred_rf_train, ynoise_train, nrmse_rf_train = rf.rf_predict(X_train, rf_model, y_test = y_train)
            ypred_rf, ynoise_pred, nrmse_rf_test = rf.rf_predict(X_test, rf_model, y_test = y_test)
            y_train_fmean = ypred_rf_train
            if not calc_mean_only:
                plt.figure() # inches
                plt.title('Random Forest Mean Function')
                plt.errorbar(y_train, ypred_rf_train, ynoise_train, linestyle='None', marker = 'o', c = 'r', label = 'Train Data', alpha =0.5)
                plt.errorbar(y_test, ypred_rf, ynoise_pred, linestyle='None', marker = 'o', c = 'b', label = 'Test Data', alpha =0.5)
                plt.legend(loc = 'upper left')
                plt.xlabel('y True')
                plt.ylabel('y Predict')
                plt.savefig(os.path.join(outpath_nfold, settings.name_target + '_meanfunction_pred_vs_true.png'), dpi = 300)
                if _show:
                    plt.show()
                plt.close('all')
        elif mean_function == 'blr':
            # Estimate GP mean function with Bayesian Linear Regression
            X_train = dftrain[settings.name_features].values
            y_train = dftrain[settings.name_target].values
            X_test = dftest[settings.name_features].values
            y_test = dftest[settings.name_target].values
            # Scale data
            Xs_train, ys_train, scale_params = blr.scale_data(X_train, y_train)
            scaler_x, scaler_y = scale_params
            Xs_test = scaler_x.transform(X_test)
            # Train BLR
            blr_model = blr.blr_train(Xs_train, y_train)
            # Predict for X_test
            ypred_blr, ypred_std_blr, nrmse_blr_test = blr.blr_predict(Xs_test, blr_model, y_test = y_test)
            ypred_blr_train, ypred_std_blr_train, nrmse_blr_train = blr.blr_predict(Xs_train, blr_model, y_test = y_train)
            ypred_blr = ypred_blr.flatten()
            ypred_blr_train = ypred_blr_train.flatten()
            y_train_fmean = ypred_blr_train
            ynoise_train = ypred_std_blr_train #* fac_noise_train
            ynoise_pred = ypred_std_blr #* fac_noise_pred
            if not calc_mean_only:
                plt.figure() # inches
                plt.title('BLR Mean function')
                plt.errorbar(y_train, ypred_blr_train, ynoise_train, linestyle='None', marker = 'o', c = 'r', label = 'Train Data', alpha =0.5)
                plt.errorbar(y_test, ypred_blr, ynoise_pred, linestyle='None', marker = 'o', c = 'b', label = 'Test Data', alpha =0.5)
                plt.legend(loc = 'upper left')
                plt.xlabel('y True')
                plt.ylabel('y Predict')
                plt.savefig(os.path.join(outpath_nfold, settings.name_target + '_meanfunction_pred_vs_true.png'), dpi = 300)
                if _show:
                    plt.show()
                plt.close('all')
        elif mean_function == 'const':
            # Constant mean function: the training-set mean everywhere,
            # with negligible noise so the GP dominates.
            y_train_fmean = np.mean(y_train) * np.ones(y_train.shape)
            ypred_const = np.mean(y_train) * np.ones(y_test.shape)
            # BUGFIX: was np.mean(y_train) * y_train_fmean, which squared the
            # mean (y_train_fmean is already mean * ones). The training-set
            # prediction of a constant mean function is the mean itself.
            ypred_const_train = y_train_fmean.copy()
            ynoise_train = 1e-6 * np.ones(y_train.shape)
            ynoise_pred = 1e-6 * np.ones(y_test.shape)

        # Subtract mean function from training data for GP with zero mean
        y_train -= y_train_fmean

        # plot training and testing distribution
        plt.figure(figsize=(8,6))
        plt.scatter(dftrain.x.values, dftrain.y.values, alpha=0.3, c = 'b', label = 'Train')
        plt.scatter(dftest.x.values, dftest.y.values, alpha=0.3, c = 'r', label = 'Test')
        plt.axis('equal')
        plt.xlabel('Easting')
        plt.ylabel('Northing')
        #plt.colorbar()
        plt.title('Mean subtracted ' + settings.name_target)
        #plt.colorbar()
        plt.legend()
        plt.tight_layout()
        plt.savefig(os.path.join(outpath_nfold, settings.name_target + '_train.png'), dpi = 300)
        if _show:
            plt.show()
        ### Plot histogram of target values after mean subtraction
        plt.clf()
        plt.hist(y_train, bins=30)
        plt.xlabel('Mean subtracted y_train')
        plt.ylabel('N')
        plt.savefig(os.path.join(outpath_nfold,'Hist_' + settings.name_target + '_train.png'), dpi = 300)
        if _show:
            plt.show()
        plt.close('all')

        if not calc_mean_only:
            # optimise GP hyperparameters
            # Use mean of X uncertainity for optimizing since otherwise too many local minima
            print('Mean of Y: ' +str(np.round(np.mean(y_train),4)) + ' +/- ' + str(np.round(np.std(y_train),4)))
            print('Mean of Mean function: ' +str(np.round(np.mean(y_train_fmean),4)) + ' +/- ' + str(np.round(np.std(y_train_fmean),4)))
            print('Mean of Mean function noise: ' +str(np.round(np.mean(ynoise_train),4)) + ' +/- ' + str(np.round(np.std(ynoise_train),4)))
            print('Optimizing GP hyperparameters...')
            Xdelta_mean = Xdelta_train * 0 + np.nanmean(Xdelta_train,axis=0)
            # TBD: find automatic way to set hyperparameter boundaries based on data
            # Lower bounds set to half the average point spacing per axis.
            xymin = 0.5 * (points3D_train[:,1].max() - points3D_train[:,1].min()) / np.unique(points3D_train[:,1]).size
            zmin = 0.5 * (points3D_train[:,0].max() - points3D_train[:,0].min()) / np.unique(points3D_train[:,0]).size
            opt_params, opt_logl = gp.optimize_gp_3D(points3D_train, y_train, ynoise_train, xymin = xymin, zmin = zmin, Xdelta = Xdelta_mean)
            #opt_params, opt_logl = optimize_gp_3D(points3D_train, y_train, ynoise_train, xymin = 30, zmin = 0.05, Xdelta = Xdelta_train)
            params_gp = opt_params
            # Calculate predicted mean values
            points3D_pred = points3D_test.copy()
            print('Computing GP predictions for test set nfold ', ix)
            ypred, ypred_std, logl, gp_train = gp.train_predict_3D(points3D_train, points3D_pred, y_train, ynoise_train, params_gp,
                Ynoise_pred = ynoise_pred, Xdelta = Xdelta_train)
            ypred_train, ypred_std_train, _ , _ = gp.train_predict_3D(points3D_train, points3D_train, y_train, ynoise_train, params_gp,
                Ynoise_pred = ynoise_train, Xdelta = Xdelta_train)
        else:
            # Mean-function-only model: GP contribution is zero, prediction
            # uncertainty is the mean-function noise.
            ypred = 0
            ypred_train = 0
            ypred_std = ynoise_pred
            ypred_std_train = ynoise_train

        # Add mean function to prediction
        if mean_function == 'rf':
            y_pred_zmean = ypred_rf
            y_pred_train_zmean = ypred_rf_train
        elif mean_function == 'blr':
            y_pred_zmean = ypred_blr
            y_pred_train_zmean = ypred_blr_train
        elif mean_function == 'const':
            y_pred_zmean = ypred_const
            y_pred_train_zmean = ypred_const_train
        y_pred = ypred + y_pred_zmean
        y_pred_train = ypred_train + y_pred_train_zmean
        # Restore y_train (mean was subtracted in place above)
        y_train += y_train_fmean

        # Calculate Residual, RMSE, R2, SSPE
        residual_test = y_pred - y_test
        rmse = np.sqrt(np.nanmean(residual_test**2))
        rmse_norm = rmse / y_test.std()
        rmedse = np.sqrt(np.median(residual_test**2))
        rmedse_norm = rmedse / y_test.std()
        #sspe = residual_test**2 / ystd_test**2
        sspe = residual_test**2 / (ypred_std**2)
        r2 = 1 - np.nanmean(residual_test**2) / np.nanmean((y_test - y_test.mean())**2)
        if not calc_mean_only:
            # logl is only defined when the GP was actually trained
            print("GP Marginal Log-Likelihood: ", np.round(logl,2))
        print("Normalized RMSE: ",np.round(rmse_norm,4))
        print("Normalized ROOT MEDIAN SE: ",np.round(rmedse_norm,4))
        print("R^2: ", np.round(r2,4))
        print("Mean Theta: ", np.round(np.mean(sspe),4))
        # BUGFIX: rounding decimals were omitted here (printed an integer);
        # use 4 decimals consistent with the sibling statistics.
        print("Median Theta: ", np.round(np.median(sspe),4))

        # Save results in dataframe
        dfpred[settings.name_target + '_predict'] = y_pred
        dfpred[settings.name_target + '_stddev'] = ypred_std
        dfpred['Residual'] = residual_test
        dfpred['Residual_squared'] = residual_test**2
        dfpred['Theta'] = sspe
        dfpred.to_csv(os.path.join(outpath_nfold, settings.name_target + '_results_nfold' + str(ix) + '.csv'), index = False)
        # add to dataframe for all folds
        dfpred_all = pd.concat([dfpred_all, dfpred], axis=0, ignore_index = True)

        #Residual Map
        plt.figure(figsize=(8,6))
        plt.scatter(dftest.x.values, dftest.y.values, c=residual_test, alpha=0.3)
        plt.axis('equal')
        plt.xlabel('Easting')
        plt.ylabel('Northing')
        #plt.colorbar()
        plt.title('Residual Test Data ' + settings.name_target)
        plt.colorbar()
        #plt.legend()
        plt.tight_layout()
        plt.savefig(os.path.join(outpath_nfold, settings.name_target + '_residualmap.png'), dpi = 300)
        if _show:
            plt.show()

        # Residual Plot
        plt.subplot(2, 1, 1)
        sns.distplot(residual_test, norm_hist = True)
        plt.title(settings.name_target + ' Residual Analysis of Test Data')
        plt.ylabel('Residual')
        plt.subplot(2, 1, 2)
        sns.distplot(sspe, norm_hist = True)
        plt.ylabel(r'$\Theta$')
        plt.savefig(os.path.join(outpath_nfold, 'Residual_hist_' + settings.name_target + '_nfold' + str(ix) + '.png'), dpi=300)
        if _show:
            plt.show()
        plt.close('all')

        plt.figure()
        # plt.title(model_function)
        plt.errorbar(y_train, y_pred_train, ypred_std_train, linestyle='None', marker = 'o', c = 'r', label = 'Train Data', alpha =0.5)
        plt.errorbar(y_test, y_pred, ypred_std, linestyle='None', marker = 'o', c = 'b', label = 'Test Data', alpha =0.5)
        plt.legend(loc = 'upper left')
        plt.xlabel(settings.name_target + ' True')
        plt.ylabel(settings.name_target + ' Predict')
        plt.savefig(os.path.join(outpath_nfold,'pred_vs_true' + settings.name_target + '_nfold' + str(ix) + '.png'), dpi = 300)
        if _show:
            plt.show()
        plt.close('all')

        # Accumulate per-fold statistics
        rmse_nfold.append(rmse)
        nrmse_nfold.append(rmse_norm)
        rmedse_nfold.append(rmedse)
        nrmedse_nfold.append(rmedse_norm)
        meansspe_nfold.append(np.mean(sspe))
        r2_nfold.append(r2)
        histsspe.append(sspe)
        histresidual.append(residual_test)

    # Save prediton results in dataframe
    print("Saving all predictions in dataframe ...")
    dfpred_all.to_csv(os.path.join(outpath, settings.name_target + '_results_nfold_all.csv'), index = False)

    # Plot all predictions vs true for test data
    plt.figure()
    plt.title('Combined Test Data')
    plt.errorbar(dfpred_all[settings.name_target], dfpred_all[settings.name_target + '_predict'], dfpred_all[settings.name_target + '_stddev'],
        linestyle='None', marker = 'o', c = 'b', alpha = 0.5)
    plt.xlabel(settings.name_target + ' True')
    plt.ylabel(settings.name_target + ' Predict')
    plt.savefig(os.path.join(outpath,'pred_vs_true' + settings.name_target + '_combined.png'), dpi = 300)
    if _show:
        plt.show()
    plt.close('all')

    # Save and plot summary results of residual analysis for test data:
    rmse_nfold = np.asarray(rmse_nfold)
    nrmse_nfold = np.asarray(nrmse_nfold)
    rmedse_nfold = np.asarray(rmedse_nfold)
    nrmedse_nfold = np.asarray(nrmedse_nfold)
    meansspe_nfold = np.asarray(meansspe_nfold)
    r2_nfold = np.asarray(r2_nfold)
    dfsum = pd.DataFrame({'nfold': range_nfold, 'RMSE': rmse_nfold, 'nRMSE': nrmse_nfold,
        'RMEDIANSE': rmedse_nfold, 'nRMEDIANSE': nrmedse_nfold, 'R2': r2_nfold, 'Theta': meansspe_nfold})
    dfsum.to_csv(os.path.join(outpath, settings.name_target + 'nfold_summary_stats.csv'), index = False)
    print("---- X-validation Summary -----")
    print("Mean normalized RMSE: " + str(np.round(np.mean(nrmse_nfold),3)) + " +/- " + str(np.round(np.std(nrmse_nfold),3)))
    print("Median normalized RMSE: " + str(np.round(np.median(nrmedse_nfold),3)))
    print("Mean R^2: " + str(np.round(np.mean(r2_nfold),3)))
    print("Mean Theta: " + str(np.round(np.mean(meansspe_nfold),3)) + " +/- " + str(np.round(np.std(meansspe_nfold),3)))
    # Clip outliers (1st/99th percentile style truncation) before plotting
    histresidual_cut = truncate_data(histresidual, 1)
    histsspe_cut = truncate_data(histsspe, 1)
    plt.subplot(2, 1, 1)
    sns.distplot(histresidual_cut, norm_hist = True)
    plt.title(settings.name_target + ' Residual Analysis of Test Data')
    plt.ylabel('Residual')
    plt.subplot(2, 1, 2)
    sns.distplot(histsspe_cut, norm_hist = True)
    plt.ylabel(r'$\Theta$')
    #plt.title(valuename + ' SSPE Test')
    plt.savefig(os.path.join(outpath, 'Xvalidation_Residual_hist_' + settings.name_target + '.png'), dpi=300)
    if _show:
        plt.show()
    plt.close('all')
    # Summary tuple: (mean nRMSE, std nRMSE, mean Theta, std Theta, mean R2, std R2)
    stats_summary = (np.round(np.mean(nrmse_nfold),3), np.round(np.std(nrmse_nfold),3),
        np.round(np.mean(meansspe_nfold),3), np.round(np.std(meansspe_nfold),3),
        np.round(np.mean(r2_nfold),3), np.round(np.std(r2_nfold),3))
    return dfsum, stats_summary, outpath