Module python_scripts.synthgen

Toolset for generating geospatial synthetic datasets with multiple features, noise, and spatial correlations.

The generated models are regression models and can be either 'linear', 'quadratic', or 'quadratic_pairwise' (quadratic with pairwise interaction terms only). Options for spatial correlations are defined by the spatial correlation length scale and amplitude and are implemented using a squared exponential kernel (currently the only option).

User settings, such as output paths and synthetic data options, are set in the settings file (default filename: settings_synthgen.yaml). Alternatively, the settings file can be specified as a command line argument with '-s' or '--settings' followed by PATH-TO-FILE/FILENAME.yaml (e.g. python synthgen.py -s settings_synthgen.yaml).

This package is part of the machine learning project developed for the Agricultural Research Federation (AgReFed).
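
For orientation, a settings file might look like the following. This is a minimal sketch with illustrative values (not recommended defaults); the keys mirror the attributes that main() reads from the YAML file:

# Hypothetical example: create a minimal settings file for synthgen.
# All values below are illustrative assumptions, not recommended defaults.
import yaml

settings = {
    'outpath': 'synthetic_out',      # output directory
    'n_features': 12,                # total number of features
    'n_informative_features': 5,     # number of informative features
    'n_samples': 50,                 # samples (points per axis if grid: true)
    'model_order': 'quadratic',      # 'linear', 'quadratic', or 'quadratic_pairwise'
    'correlated': False,             # correlate features via low effective rank
    'noise': 0.1,                    # random noise level
    'corr_length': 10,               # spatial correlation length scale
    'corr_amp': 0.2,                 # spatial correlation amplitude
    'spatialsize': 100,              # extent in x and y
    'center': [140, -35],            # center coordinates
    'crs': 'EPSG:4326',              # coordinate reference system
    'grid': True,                    # generate on a regular grid
    'nsample_from_grid': 100,        # random samples drawn from the grid
}
with open('settings_synthgen.yaml', 'w') as f:
    yaml.safe_dump(settings, f)
# Then run: python synthgen.py -s settings_synthgen.yaml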

Expand source code
"""
Toolset for generating geospatial synthetic datasets with multiple features, noise and spatial correlations.

The generated models are regression models and can be either 'linear', 'quadratic', or 'quadratic_pairwise'.
Options for spatial correlations are defined by the spatial correlation length scale and amplitude
and implemented by using a squared exponential kernel (currently the only option).

User settings, such as output paths and synthetic data options, are set in the settings file 
(default filename: settings_synthgen.yaml).
Alternatively, the settings file can be specified as a command line argument with 
'-s' or '--settings' followed by PATH-TO-FILE/FILENAME.yaml 
(e.g. python synthgen.py -s settings_synthgen.yaml).

This package is part of the machine learning project developed for the Agricultural Research Federation (AgReFed).
"""

import os
import itertools
import yaml
import argparse
import datetime
from types import SimpleNamespace
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.datasets import make_regression
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import pairwise_distances

# Settings yaml file
_fname_settings = 'settings_synthgen.yaml'


def create_kernel_expsquared(D, gamma):
    """
    Create exponential squared kernel

    Input: 
        X: distance matrix
        gamma: length scale parameter

    Return:
        kernel: kernel buffer
    """
    return np.exp(-D**2 / (2*gamma**2))


def gen_synthetic(n_features, n_informative_features = 10, 
                n_samples = 200,  outpath = None, 
                model_order = 'quadratic', correlated = False, 
                noise= 0.1, corr_length = 10, corr_amp = 0.2, 
                spatialsize = 100, center = [140,-35],  crs = 'EPSG:4326', grid = False):
        """
        Generate synthetic datasets

        Input:
                n_features: number of features
        n_informative_features: number of important features
                n_samples: number of samples (if grid = True then n_samples corresponds to the number of points along each axis)
        outpath: path to save simulated data    
                model_order: order of the model, either 'linear', 'quadratic', or 'cubic'
                correlated: if True, the features are correlated
                noise: random noise level [range: 0-1] that is added to the synthetic data
        corr_length: spatial correlation length [Gaussian FWHM in meters]
        corr_amp: spatial correlation amplitude
        spatialsize: size in x and y direction [in meters]
        center: [x,y] coordinates of the center of the data in meter (Easting, Northings)
        crs: coordinate reference system [Default: 'EPSG:8059']
                grid: if True, the synthetic data is generated on a regular grid, if not, it is generated on a random distribution

        Return:
                dfsim: dataframe with simulated features
                coefsim: simulated coefficients
                feature_names: list of feature names
        """
        # Initiate Random generator
        random_state = 42
        np.random.seed(random_state)
        if correlated:
                n_rank = int(n_features/2)
        else:
                n_rank = None
        if grid:
                n_samples = n_samples**2
        if crs == 'EPSG:4326':
                # convert from arcsec to degrees
                spatialsize = spatialsize/3600
        # Generate regression features:
        Xsim, ysim, coefsim = make_regression(n_samples=n_samples, n_features = n_features, n_informative=n_informative_features, n_targets=1, 
                bias=0.5, noise=noise, shuffle=False, coef=True, random_state=random_state, effective_rank = n_rank)    
        # Name features:
        feature_names = ["Feature_" + str(i+1) for i in range(n_features)]

        # Scale features to the range [0,1]
        coefsim /= 100
        Xsim = MinMaxScaler().fit_transform(Xsim)
        
        # Make model
        if model_order == 'linear':
                # make first-order model
                ysim_new = np.dot(Xsim, coefsim)
        elif model_order == 'quadratic':
                # make quadratic model
                Xcomb = []
                for i, j in itertools.combinations(Xsim.T, 2):
                        Xcomb.append(i * j) 
                Xcomb = np.asarray(Xcomb).T
                Xcomb = np.hstack((Xsim, Xcomb, Xsim**2))
                coefcomb = []
                for i, j in itertools.combinations(coefsim, 2):
                        coefcomb.append(i * j) 
                coefcomb = np.asarray(coefcomb)
                coefcomb = np.hstack((coefsim, coefcomb, coefsim**2))
                ysim_new = np.dot(Xcomb, coefcomb)
        elif model_order == 'quadratic_pairwise':
                # make quadratic model with pairwise interaction terms only
                Xcomb = []
                for i, j in itertools.combinations(Xsim.T, 2):
                        Xcomb.append(i * j)
                Xcomb = np.asarray(Xcomb).T
                Xcomb = np.hstack((Xsim, Xcomb))
                coefcomb = []
                for i, j in itertools.combinations(coefsim, 2):
                        coefcomb.append(i * j)
                coefcomb = np.asarray(coefcomb)
                coefcomb = np.hstack((coefsim, coefcomb))
                ysim_new = np.dot(Xcomb, coefcomb)
        else:
                raise ValueError(f"model_order must be 'linear', 'quadratic', or 'quadratic_pairwise', got '{model_order}'")
        # Generate point coordinates (regular grid or randomly distributed):
        if grid:
                x, y = np.meshgrid(np.linspace(-spatialsize/2, spatialsize/2, int(np.sqrt(n_samples))), np.linspace(-spatialsize/2, spatialsize/2, int(np.sqrt(n_samples))))
                x = x.flatten()
                y = y.flatten()
        else:
                x, y = np.random.uniform(- 0.5 * spatialsize, + 0.5 * spatialsize, (2, n_samples))

        # Add spatial correlation function:
        if (corr_amp > 0) and (corr_length > 0):
                dist = pairwise_distances(np.asarray([x,y]).T, metric='euclidean')
                # Add spatial correlation with 2-dimensional distance kernel function
                kernel = create_kernel_expsquared(dist, corr_length)
                ycorr = np.dot(kernel, ysim_new)
                # Rescale the correlated component relative to the mean signal amplitude
                fscale = ycorr.mean()/ysim_new.mean()
                ycorr = corr_amp * ycorr / fscale
                ysim_new += ycorr - ycorr.mean()

        # Add coordinate center to x,y
        x += center[0]
        y += center[1]

        # Add random noise as normal distribution
        ysim_new += np.random.normal(scale=noise, size = n_samples)
        # Save data as dataframe and coefficients to file
        if crs == 'EPSG:4326':
                header = np.hstack((feature_names, 'Ytarget', 'Longitude', 'Latitude'))
        else:
                header = np.hstack((feature_names, 'Ytarget', 'Easting', 'Northing'))
        data = np.hstack((Xsim, ysim_new.reshape(-1,1), x.reshape(-1,1), y.reshape(-1,1)))
        df = pd.DataFrame(data, columns = header)
        outfname = None
        if outpath is not None:
                os.makedirs(outpath, exist_ok=True)
                if grid:
                        gridname = '_grid'
                else:
                        gridname = ''
                # Add current date to filename
                date = datetime.datetime.now().strftime("%Y-%m-%d")
                outfname = os.path.join(outpath, f'SyntheticData_{model_order}_{n_features}nfeatures_{date}{gridname}.csv')
                df.to_csv(outfname, index = False)
                # Now save coefficients and other parameters in extra file:
                df_coef = pd.DataFrame(coefsim.reshape(-1,1).T, columns = feature_names)
                # Add columns with spatial correlation parameters
                if (corr_amp > 0) and (corr_length > 0):
                        df_coef['corr_amp'] = corr_amp
                        df_coef['corr_length'] = corr_length
                # Add column with noise level
                df_coef['noise'] = noise
                outfname_coef = os.path.join(outpath, f'SyntheticData_coefficients_{model_order}_{n_features}nfeatures_{date}{gridname}.csv')
                df_coef.to_csv(outfname_coef, index = False)
        return df, coefsim, feature_names, outfname


def sample_fromgrid(fname_grid, nsample):
        """
        Sample random points from grid of synthetic data

        Parameters
        ----------
        fname_grid : str
                Name of grid file
        nsample : int
                Number of samples to be drawn from grid

        Returns
        -------
        filename of output file
        """
        df_grid = pd.read_csv(fname_grid)
        df_grid = df_grid.sample(n = nsample, random_state = 42)
        outfname = os.path.join(Path(fname_grid).parent, f'{Path(fname_grid).stem}_{nsample}sample.csv')
        df_grid.to_csv(outfname, index = False)
        return outfname


def main(fname_settings):
        """
        Main function for generating synthetic data.

        Input:
                fname_settings: path and filename of the settings file
        """
        # Load settings from yaml file
        with open(fname_settings, 'r') as f:
                settings = yaml.load(f, Loader=yaml.FullLoader)
        # Parse settings dictionary as namespace (settings are available as 
        # settings.variable_name rather than settings['variable_name'])
        settings = SimpleNamespace(**settings)

        # Verify output directory and make it if it does not exist
        os.makedirs(settings.outpath, exist_ok = True)

        # Generate synthetic data
        df, coefsim, feature_names, outfname = gen_synthetic(n_features = settings.n_features, 
                n_informative_features = settings.n_informative_features, 
                n_samples = settings.n_samples, outpath = settings.outpath, 
                model_order = settings.model_order, correlated = settings.correlated, 
                noise = settings.noise, corr_length = settings.corr_length, corr_amp = settings.corr_amp, 
                spatialsize = settings.spatialsize, center = settings.center, crs = settings.crs, grid = settings.grid)

        # Draw random samples from grid of synthetic data (if grid is used)
        if settings.grid and (settings.nsample_from_grid > 0):
                outfname_samples = sample_fromgrid(outfname, settings.nsample_from_grid)
                print(f'Samples from grid saved to {outfname_samples}')

if __name__ == '__main__':
        # Parse command line arguments
        parser = argparse.ArgumentParser(description='Generating synthetic data.')
        parser.add_argument('-s', '--settings', type=str, required=False,
                                                help='Path and filename of settings file.',
                                                default = _fname_settings)
        args = parser.parse_args()

        # Run main function
        main(args.settings)

Functions

def create_kernel_expsquared(D, gamma)

Create squared exponential kernel

Input

D: distance matrix
gamma: length scale parameter

Return

kernel: kernel matrix

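To illustrate how this kernel induces spatial correlation (as it is used in gen_synthetic below), here is a minimal sketch with toy coordinates and values, all of which are assumptions for demonstration:

import numpy as np
from sklearn.metrics import pairwise_distances

# Three toy points on a line; correlation length gamma = 10
coords = np.array([[0.0, 0.0], [5.0, 0.0], [50.0, 0.0]])
D = pairwise_distances(coords, metric='euclidean')
K = np.exp(-D**2 / (2 * 10**2))   # same as create_kernel_expsquared(D, 10)
# Nearby points get kernel weights close to 1, distant points close to 0,
# so kernel-weighted sums of values vary smoothly in space:
y = np.array([1.0, 2.0, 3.0])
print(np.round(K, 3))
print(np.round(K.dot(y), 3))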
def gen_synthetic(n_features, n_informative_features=10, n_samples=200, outpath=None, model_order='quadratic', correlated=False, noise=0.1, corr_length=10, corr_amp=0.2, spatialsize=100, center=[140, -35], crs='EPSG:4326', grid=False)

Generate synthetic datasets

Input

n_features: number of features
n_informative_features: number of informative features
n_samples: number of samples (if grid = True then n_samples corresponds to the number of points along each axis)
outpath: path to save simulated data
model_order: order of the model, either 'linear', 'quadratic', or 'quadratic_pairwise'
correlated: if True, the features are correlated
noise: random noise level [range: 0-1] that is added to the synthetic data
corr_length: spatial correlation length [Gaussian FWHM in meters]
corr_amp: spatial correlation amplitude
spatialsize: size in x and y direction [in meters, or arcsec for EPSG:4326]
center: [x,y] coordinates of the center of the data (Easting/Northing in meters, or Longitude/Latitude in degrees)
crs: coordinate reference system [Default: 'EPSG:4326']
grid: if True, the synthetic data is generated on a regular grid, if not, it is generated on a random distribution

Return

dfsim: dataframe with simulated features
coefsim: simulated coefficients
feature_names: list of feature names
outfname: path of the output CSV file (None if outpath is None)

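For example, a grid of synthetic data could be generated and saved as follows. This is a usage sketch; all argument values are illustrative assumptions, not recommended settings:

# Assumes: from python_scripts.synthgen import gen_synthetic
df, coefsim, feature_names, outfname = gen_synthetic(
    n_features=12,
    n_informative_features=5,
    n_samples=50,            # 50 x 50 points, since grid=True
    outpath='synthetic_out',
    model_order='quadratic',
    noise=0.1,
    corr_length=10,
    corr_amp=0.2,
    spatialsize=100,
    center=[140, -35],
    crs='EPSG:4326',
    grid=True)
print(df.head())
print(f'Data written to {outfname}')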
def main(fname_settings)

Main function for generating synthetic data.

Input

fname_settings: path and filename of the settings file

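Besides the command line entry point, main() can also be called directly; a minimal sketch, assuming a settings file already exists at the given path:

# Assumes: from python_scripts.synthgen import main
# Run the generator against an existing settings file (path is an assumption).
main('settings_synthgen.yaml')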
def sample_fromgrid(fname_grid, nsample)

Sample random points from grid of synthetic data

Parameters

fname_grid : str
Name of grid file
nsample : int
Number of samples to be drawn from grid

Returns

filename of output file
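
A short usage sketch, assuming a grid file previously written by gen_synthetic (the filename is illustrative):

# Assumes: from python_scripts.synthgen import sample_fromgrid
# Draw 100 random rows from a generated grid CSV and save them to a new file.
outfname = sample_fromgrid('synthetic_out/SyntheticData_quadratic_12nfeatures_2024-01-01_grid.csv', nsample=100)
print(f'Samples saved to {outfname}')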