Module python_scripts.preprocessing
Preprocessing functions for geospatial soil data.
For more package details see conda environment file: environment.yaml
This package is part of the machine learning project developed for the Agricultural Research Federation (AgReFed).
Expand source code
"""
Preprocessing functions for geospatial soil data.
For more package details see conda environment file: environment.yaml
This package is part of the machine learning project developed for the Agricultural Research Federation (AgReFed).
"""
import numpy as np
import pandas as pd
import geopandas as gpd
import os
import sys
import argparse
import yaml
from types import SimpleNamespace
#from sklearn.model_selection import StratifiedKFold
# Default settings yaml file name:
_fname_settings = 'settings_preprocessing.yaml'
def preprocess(inpath, infname, outpath, outfname, name_target, name_features,
zmin = None, zmax = None, gen_gpkg = False, categorical = None,
colname_depthmin = None, colname_depthmax = None,
colname_xcoord = 'x', colname_ycoord = 'y', colname_lat = None, colname_lng= None,
project_crs = None):
"""
Converts input dataframe into more usable data and saves as new dataframe (csv file) on disk.
Preprocessing steps:
- Cleaning data
- Tries to find Latitude and Longitude entries
- Calculation of depth intervals and their mid-points.
- Trims data to zmin and zmax
- Automatically converts categorical features to binary feature representations.
- Generates indices for cross-validation.
- Generates geospatial dataframe with coordinates (not yet implemented)
Input:
inpath: input path
infname: input file name
outpath: output path
outfname: output file name
name_target: String, Name of target for prediction (column names in infname)
name_features: String or list of strings of covariate features (column names in infname)
zmin: in centimeters, if not None, data only larger than zmin will be selected
zmax: in centimeters, if not None, data only smaller than zmax will be selected
gen_gpkg: if True, data will be also saved as georeferenced geopackage
categorical: name of categorical feature, converts categorical feature into additional features with binary coding
colname_depthmin: name of column for lower depth
colname_depthmax: name of column for upper depth
colname_xcoord: name of column for Easting coordinate (if projected crs this is in meters)
colname_ycoord: name of column for Northing coordinate (if projected crs this is in meters)
project_crs: string, EPSG code for projected coordinate reference system
of colname_xcoord and colname_ycoord (e.g. 'EPSG:28355')
"""
## Keep track of all covariates
#name_features2 = name_features.copy()
# Keep track of all relevant column names
if name_target is not None:
fieldnames = name_features + [name_target]
else:
fieldnames = name_features
#df = pd.read_csv(os.path.join(inpath, infname), usecols = fieldnames)
df = pd.read_csv(os.path.join(inpath, infname))
# Find if Latitude or Longitude values exist:
if ('Latitude' in list(df)) & ('Longitude' in list(df)):
fieldnames += ['Latitude', 'Longitude']
else:
if ('Lat' in list(df)):
df.rename(columns={"Lat": "Latitude"}, inplace=True)
fieldnames += ['Latitude']
if ('Lng' in list(df)):
df.rename(columns={"Lng": "Longitude"}, inplace=True)
fieldnames += ['Longitude']
elif ('Lon' in list(df)):
df.rename(columns={"Lon": "Longitude"}, inplace=True)
fieldnames += ['Longitude']
#if ('Easting' in list(df)) & ('Northing' in list(df)) & ('x' not in list(df)) & ('y' not in list(df)):
# df.rename(columns={"Easting": "x", "Northing": "y"}, inplace=True)
if ('x' not in list(df)) & ('y' not in list(df)):
df.rename(columns={colname_xcoord: "x", colname_ycoord: "y"}, inplace=True)
fieldnames += ['x', 'y']
# Check that all x and y are numeric
if (df['x'].dtype != 'float64') | (df['y'].dtype != 'float64'):
# Convert to numeric
df['x'] = pd.to_numeric(df['x'], errors='coerce')
df['y'] = pd.to_numeric(df['y'], errors='coerce')
# Check if x and y are in meters (projected coordinates) or in degrees
# Here we assume that bounding box is not larger than 5 degree in any directions
if (abs(df.x.max() - df.x.min()) < 5) | (abs(df.y.max() - df.y.min()) < 5):
print('Check whether x and y are in meters (projected coordinates) or in degrees')
print(f'distance in x is {abs(df.x.max() - df.x.min())}')
print(f'WARNING: Coordinates {colname_xcoord} and {colname_ycoord} do not seem to be in meters!')
print('Please check if coordinates are projected or not!')
# Calculate mid point and convert to cm
if (colname_depthmin is not None) & (colname_depthmax is not None):
df['z'] = 0.5 * (df[colname_depthmin] + df[colname_depthmax]) / 100.
df['z_diff'] = (df[colname_depthmin] - df[colname_depthmax]) / 100.
fieldnames += ['z', 'z_diff']
#if isinstance(name_features2, list):
# selcols = ['x', 'y', 'z', 'z_diff']
# selcols.extend(name_features2)
#else:
# selcols = ['x', 'y', 'z', 'z_diff', name_features2]
#selcols.extend([name_target])
# Trim data to zmin and zmax
if zmin is not None:
df = df[df.z >= zmin]
if zmax is not None:
df = df[df.z <= zmax]
# Continue only with relevant fields
df = df[fieldnames]
# Categories to binary coding
if categorical is not None:
# Convert string to list
if isinstance(categorical, str):
categorical = [categorical]
else:
categorical = []
# Find categorical features in dataframe (here we assume all strings are categorical)
categorical += df.select_dtypes(include=['object']).columns.tolist()
#names_categorical df.select_dtypes(exclude=['number','datetime']).columns.tolist()
# Convert categorical features to binary features
if len(categorical) > 0:
# Add categories as features with binary coding
for name_categorical in categorical:
cat_levels = df[name_categorical].unique()
#cat_names = [name_categorical + '_' + str(x) for x in cat_levels]
cat_names = []
for level in cat_levels:
cat_name = name_categorical + '_' + str(level)
df[cat_name] = 0
df.loc[df[name_categorical].values == level, cat_name] = 1
fieldnames.append(cat_name)
cat_names.append(cat_name)
fieldnames.remove(name_categorical)
print('Added following categories as binary features: ', cat_names)
print('fieldnames:' , fieldnames)
df = df[fieldnames]
#Keep only finite values (remove nan, inf, -inf)
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.dropna(inplace = True)
# Finally save dataframe as csv
df.to_csv(os.path.join(outpath, outfname), index=False)
# save also as geopackage:
if gen_gpkg:
if project_crs is not None:
gdf = gpd.GeoDataFrame(df.copy(),
geometry=gpd.points_from_xy(df['x'].values, df['y'].values),
crs = project_crs).to_file(os.path.join(outpath, outfname + '.gpkg'), driver='GPKG')
# Ignore deprecation warning until bug is fixed in Shapely for points_from_xy
elif ('Latitude' in df) & ('Longitude' in df):
lat_cen = df.Latitude.mean()
lng_cen = df.Longitude.mean()
# find zone from center point
#zone, crs = find_zone(lat_cen, lng_cen) # could add this function in future
#crs_epsg = 'EPSG:' + str(crs) # commented out: crs is only defined once find_zone is implemented
# Use general Australian projection for now: EPSG:8059 (GDA2020 / SA Lambert for entire Australia)
# Save as non-projected with Latitude and Longitude
gdf = gpd.GeoDataFrame(df.copy(),
geometry=gpd.points_from_xy(df['Longitude'], df['Latitude']),
crs='EPSG:4326').to_file(os.path.join(outpath, outfname + '.gpkg'), driver='GPKG')
else:
print('WARNING: Cannot generate geopackage file!')
print(' Please check to provide either project crs or include Latitude/Longitude in data!')
def round_nearest_base(x, base=0.5):
"""
Round to nearest multiple of base.
Input:
x: number or array to round
base: base to round to (default: 0.5), can be float or integer
Returns:
rounded number (array)
"""
x = np.asarray(x)
if (base < 1) & (base >0):
return base * np.round(x/base)
elif base >=1:
return (base * np.round(x/base)).astype(int)
else:
print("ERROR: base must be numeric and larger than 0")
return None
def gen_kfold(df, nfold, label_nfold = 'Label_nfold', id_unique = None, precision_unique = None):
"""
Generate k-fold non-overlapping cross-validation indices for dataframe.
This function supports generating a unique identifier and precision based on coordinates or other features.
Input:
df: pandas dataframe
nfold: number of folds
label_nfold: name of column to add to dataframe with fold number
id_unique: name of column(s) to use for k-fold cross validation,
e.g. ['ID'] or alternatively construct unique ID from list, e.g.
['x', 'y', 'z'] will use x, y and z to generate unique ID for each point.
If None, then assume index is unique ID.
precision_unique: float or integer; precision of unique ID, e.g. '0.1' for metric positions
Returns:
df: pandas dataframe with added column with the fold number
"""
if id_unique is None:
# Use index as unique ID
id_unique = df.index.values
df["ID_unique"] = id_unique
elif isinstance(id_unique, list):
# Use joint list of features as unique ID
id_array = np.empty(shape = (len(id_unique), len(df)), dtype='<U21')
for i in range(len(id_unique)):
col = id_unique[i]
# check if pandas column is numeric
if pd.api.types.is_numeric_dtype(df[col]):
if precision_unique is not None:
# Round to precision
id_array[i] = round_nearest_base(df[col].values, base=precision_unique).astype(str)
else:
# Use as is
id_array[i] = df[col].values.astype(str)
else:
id_array[i] = df[col].values
# Create unique ID by concatenating strings
df["ID_unique"] = id_array[0]
for i in range(1, len(id_unique)):
df["ID_unique"] = df["ID_unique"] + '_' + id_array[i]
else:
# Use column name as unique ID
if pd.api.types.is_numeric_dtype(df[id_unique]):
if precision_unique is not None:
# Round to precision
df["ID_unique"] = round_nearest_base(df[id_unique].values, base=precision_unique).astype(str)
else:
# Use as is
df["ID_unique"] = df[id_unique].astype(str)
else:
df["ID_unique"] = df[id_unique]
# Create nfold levels:
nunique = df['ID_unique'].unique()
df['new_id'] = 0
ix = 1
for unique in nunique:
df.loc[df['ID_unique'] == unique, 'new_id'] = ix
ix += 1
size = int(len(nunique)/nfold)
np.random.shuffle(nunique)
df[label_nfold] = 0
start = 0
for i in range(nfold - 1):
stop = start + size
sub = nunique[start:stop]
df.loc[df['ID_unique'].isin(sub),label_nfold] = i + 1
start = stop
df.loc[df[label_nfold] == 0, label_nfold] = nfold
# remove temporary columns
if isinstance(id_unique, list) | (precision_unique is not None):
# keep ID_unique column
df.drop(columns = ['new_id'], inplace = True)
else:
df.drop(columns = ['ID_unique', 'new_id'], inplace = True)
return df
def preprocess_grid(inpath, infname, outpath, outfname, name_features, categorical = None):
"""
Converts input dataframe into more usable data and saves as new dataframe on disk.
Categorical variables are converted into binary features.
Input:
inpath: input path
infname: input file name
outpath: output path
outfname: output file name
name_features: String or list of strings of covariate features (column names in infname)
categorical: name of categorical feature, converts categorical feature into additional features with binary coding
Return:
processed dataframe
names of features with updated categorical (if none, then return original input name_features)
"""
print("Preprocessing grid covariates...")
name_features2 = name_features.copy()
df = pd.read_csv(os.path.join(inpath, infname))
if categorical is not None:
# Add categories as features with binary coding
cat_levels = df[categorical].unique()
print('Adding following categories as binary features: ', cat_levels)
name_features2.extend(cat_levels)
for level in cat_levels:
df[level] = 0
df.loc[df[categorical].values == level, level] = 1
name_features2.remove(categorical)
if ('Easting' in list(df)) & ('Northing' in list(df)) & ('x' not in list(df)) & ('y' not in list(df)):
df.rename(columns={"Easting": "x", "Northing": "y"}, inplace=True)
if isinstance(name_features2, list):
selcols = ['x', 'y']
selcols.extend(name_features2)
else:
selcols = ['x', 'y', name_features2]
df[selcols].to_csv(os.path.join(outpath, outfname), index = False)
return df[selcols], name_features2
def preprocess_grid_poly(path, infname_grid, infname_poly, name_features,
grid_crs, grid_colname_Easting = 'x', grid_colname_Northing = 'y',
categorical = None):
"""
Converts input dataframe into more usable data and saves as new dataframe on disk.
Grid points will be spatially joined with polygons for prediction.
TODO: add conversion to categorical.
Input:
path: input path (same used for output)
infname_grid: input file name of grid covariates
infname_poly: input file name of polygon shape for predictions
name_features: String or list of strings of covariate features (column names in infname)
grid_crs: coordinate reference system of grid points, e.g. 'EPSG:28355'
grid_colname_Easting: column name of Easting coordinate (or Longitude)
grid_colname_Northing: column name of Northing coordinate (or Latitude)
categorical: name of categorical feature, converts categorical feature into additional features with binary coding
Return:
processed dataframe (grid points joined with polygons)
polygon dataframe with added 'ibatch' index
names of features with updated categorical (if none, then return original input name_features)
"""
from shapely.geometry import Point
print("Preprocessing grid covariates and joining with polygon geometry...")
name_features2 = name_features.copy()
df = pd.read_csv(os.path.join(path, infname_grid))
if categorical is not None:
# Add categories as features with binary coding
cat_levels = df[categorical].unique()
print('Adding following categories as binary features: ', cat_levels)
name_features2.extend(cat_levels)
for level in cat_levels:
df[level] = 0
df.loc[df[categorical].values == level, level] = 1
name_features2.remove(categorical)
# Convert grid covariates to geospatial point data
geometry = [Point(xy) for xy in zip(df[grid_colname_Easting], df[grid_colname_Northing])]
gdf = gpd.GeoDataFrame(df, crs=grid_crs, geometry=geometry)
# Read in polygon data
dfpoly = gpd.read_file(os.path.join(path, infname_poly))
dfpoly['ibatch'] = dfpoly.index.values.astype(int)
# Check before merging that grid and poly are in same crs; if not, convert poly to same crs as grid
if not (dfpoly.crs == gdf.crs):
dfpoly = dfpoly.to_crs(gdf.crs)
# Spatially merge points that are within polygons and keep point grid geometry
dfcomb = gpd.sjoin(gdf, dfpoly, how="left", op="within") # note: newer geopandas versions use predicate="within" instead of op
# alternatively use op='intersects' to include also points that are on the boundary of a polygon
# Remove all points that are not in a polygon
dfcomb.dropna(subset = ['ibatch'], inplace=True)
dfcomb['ibatch'] = dfcomb['ibatch'].astype(int) # Need to be converted again to int since merge changes type
if len(dfcomb) == 0:
print('WARNING: NO GRID POINTS IN POLYGONS!')
# if convert_to_crs is not None:
# # Convert to meters and get batch x,y coordinate list for each polygon:
# dfcomb = dfcomb.to_crs(epsgcrs)
# dfpoly = dfpoly.to_crs(epsgcrs)
""" If evalutaion points are not given by grid points, following approaches can be tried
evaluation points need to subdivide polygon in equal areas (voronoi), otherwise not equal spatial weight
options a) rasterize polygon and then turn pixels in points, b) use pygridtools https://github.com/Geosyntec/pygridtools
c) use Centroidal Voronoi tessellation (use vorbin or pytess), d) use iterative inwards buffering, e) find points by dynmaic simulation
or split in halfs iteratively: https://snorfalorpagus.net/blog/2016/03/13/splitting-large-polygons-for-faster-intersections/
use centroid, then any line through centroid splits polygon in half
Propose new algorithm: centroidal hexagonal grid (densest cirle packing),
this need optimise only one parameter: rotational angle so that a) maximal number of poinst are in polygon and
b) that distance of point to nearest polygon edge is maximised for all points (which aligns points towards center)
"""
if isinstance(name_features2, list):
selcols = [grid_colname_Easting, grid_colname_Northing, 'ibatch']
selcols.extend(name_features2)
else:
selcols = ['Sample.ID',grid_colname_Easting, grid_colname_Northing, 'ibatch', name_features2]
return dfcomb[selcols].copy(), dfpoly, name_features2
def main(fname_settings):
"""
Main function for running the script.
Input:
fname_settings: path and filename to settings file
"""
# Load settings from yaml file
with open(fname_settings, 'r') as f:
settings = yaml.load(f, Loader=yaml.FullLoader)
# Parse settings dictionary as namespace (settings are available as
# settings.variable_name rather than settings['variable_name'])
settings = SimpleNamespace(**settings)
# Verify output directory and make it if it does not exist
os.makedirs(settings.outpath, exist_ok = True)
# Preprocess data
preprocess(settings.inpath, settings.infname,
settings.outpath, settings.outfname,
settings.name_target, settings.name_features,
colname_xcoord = settings.colname_xcoord, colname_ycoord = settings.colname_ycoord,
zmin = settings.zmin, zmax= settings.zmax,
categorical = 'Soiltype',
colname_depthmin = settings.colname_depthmin, colname_depthmax = settings.colname_depthmax,
gen_gpkg = False,
project_crs = settings.project_crs)
if __name__ == '__main__':
# Parse command line arguments
parser = argparse.ArgumentParser(description='Preprocess data for machine learning on soil data.')
parser.add_argument('-s', '--settings', type=str, required=False,
help='Path and filename of settings file.',
default = _fname_settings)
args = parser.parse_args()
# Run main function
main(args.settings)
Functions
def gen_kfold(df, nfold, label_nfold='Label_nfold', id_unique=None, precision_unique=None)
Generate k-fold non-overlapping cross-validation indices for dataframe. This function supports generating a unique identifier and precision based on coordinates or other features.
Input
df: pandas dataframe
nfold: number of folds
label_nfold: name of column to add to dataframe with fold number
id_unique: name of column(s) to use for k-fold cross validation, e.g. ['ID'] or alternatively construct unique ID from list, e.g. ['x', 'y', 'z'] will use x, y and z to generate unique ID for each point. If None, then assume index is unique ID.
precision_unique: float or integer; precision of unique ID, e.g. '0.1' for metric positions
Returns
df: pandas dataframe with added column with the fold number
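Example (a hedged sketch; the column names, fold count and 10-unit rounding precision below are illustrative assumptions, not project defaults): samples whose rounded x, y, z coordinates coincide are treated as one unit and always land in the same fold.

import numpy as np
import pandas as pd

# Toy dataframe with coordinate columns (synthetic values)
df_demo = pd.DataFrame({'x': np.random.uniform(0, 1000, 200),
                        'y': np.random.uniform(0, 1000, 200),
                        'z': np.random.uniform(0, 1.5, 200),
                        'target': np.random.normal(size=200)})
# Coordinates rounded to the nearest 10 units define the unique sample ID
df_demo = gen_kfold(df_demo, nfold=5, id_unique=['x', 'y', 'z'], precision_unique=10)
print(df_demo['Label_nfold'].value_counts())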
def main(fname_settings)
Main function for running the script.
Input
fname_settings: path and filename to settings file
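Example settings file (a hedged sketch: all paths, file names and column names below are placeholders, not values shipped with the project). The keys mirror the settings.<name> attributes accessed in main; the script can then be run with something like python -m python_scripts.preprocessing -s settings_preprocessing.yaml. Note that main passes categorical='Soiltype' directly rather than reading it from the settings file.

# settings_preprocessing.yaml -- placeholder values for illustration only
inpath: ./data_raw                      # directory containing the input csv
infname: soil_samples.csv               # hypothetical input file name
outpath: ./data_processed
outfname: soil_samples_processed.csv
name_target: pH                         # hypothetical target column
name_features: [Clay, Sand, Landuse]    # hypothetical covariate columns
colname_xcoord: Easting
colname_ycoord: Northing
colname_depthmin: depth_min
colname_depthmax: depth_max
zmin: 0
zmax: 1.0
project_crs: "EPSG:28355"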
def preprocess(inpath, infname, outpath, outfname, name_target, name_features, zmin=None, zmax=None, gen_gpkg=False, categorical=None, colname_depthmin=None, colname_depthmax=None, colname_xcoord='x', colname_ycoord='y', colname_lat=None, colname_lng=None, project_crs=None)
Converts input dataframe into more usable data and saves as new dataframe (csv file) on disk.
Preprocessing steps:
- Cleaning data
- Tries to find Latitude and Longitude entries
- Calculation of depth intervals and their mid-points.
- Trims data to zmin and zmax
- Automatically converts categorical features to binary feature representations.
- Generates indices for cross-validation.
- Generates geospatial dataframe with coordinates (not yet implemented)
Input
inpath: input path
infname: input file name
outpath: output path
outfname: output file name
name_target: String, Name of target for prediction (column names in infname)
name_features: String or list of strings of covariate features (column names in infname)
zmin: in centimeters, if not None, data only larger than zmin will be selected
zmax: in centimeters, if not None, data only smaller than zmax will be selected
gen_gpkg: if True, data will be also saved as georeferenced geopackage
categorical: name of categorical feature, converts categorical feature into additional features with binary coding
colname_depthmin: name of column for lower depth
colname_depthmax: name of column for upper depth
colname_xcoord: name of column for Easting coordinate (if projected crs this is in meters)
colname_ycoord: name of column for Northing coordinate (if projected crs this is in meters)
project_crs: string, EPSG code for projected coordinate reference system of colname_xcoord and colname_ycoord (e.g. 'EPSG:28355')
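Example call (hedged sketch; the file names, column names and CRS below are placeholders chosen for illustration, not project values):

preprocess(inpath='./data_raw', infname='soil_samples.csv',
           outpath='./data_processed', outfname='soil_samples_processed.csv',
           name_target='pH',
           name_features=['Clay', 'Sand', 'Landuse'],
           zmin=0, zmax=1.0,                   # keep only samples within these depth bounds
           categorical='Landuse',              # expanded into binary Landuse_<level> columns
           colname_depthmin='depth_min', colname_depthmax='depth_max',
           colname_xcoord='Easting', colname_ycoord='Northing',
           gen_gpkg=True, project_crs='EPSG:28355')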
def preprocess_grid(inpath, infname, outpath, outfname, name_features, categorical=None)
Converts input dataframe into more usable data and saves as new dataframe on disk. Categorical variables are converted into binary features.
Input
inpath: input path
infname: input file name
outpath: output path
outfname: output file name
name_features: String or list of strings of covariate features (column names in infname)
categorical: name of categorical feature, converts categorical feature into additional features with binary coding
Return
processed dataframe
names of features with updated categorical (if none, then return original input name_features)
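Example call (hedged sketch; paths and file names are placeholders, and the grid csv is assumed to contain x/y or Easting/Northing columns):

df_grid, features_grid = preprocess_grid(
    inpath='./data_raw', infname='grid_covariates.csv',
    outpath='./data_processed', outfname='grid_covariates_processed.csv',
    name_features=['Clay', 'Sand', 'Landuse'],
    categorical='Landuse')   # each level of Landuse becomes its own binary column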
def preprocess_grid_poly(path, infname_grid, infname_poly, name_features, grid_crs, grid_colname_Easting='x', grid_colname_Northing='y', categorical=None)
Converts input dataframe into more usable data and saves as new dataframe on disk. Grid points will be spatially joined with polygons for prediction. TODO: add conversion to categorical.
Input
path: input path (same used for output)
infname_grid: input file name of grid covariates
infname_poly: input file name of polygon shape for predictions
name_features: String or list of strings of covariate features (column names in infname)
grid_crs: coordinate reference system of grid points, e.g. 'EPSG:28355'
grid_colname_Easting: column name of Easting coordinate (or Longitude)
grid_colname_Northing: column name of Northing coordinate (or Latitude)
categorical: name of categorical feature, converts categorical feature into additional features with binary coding
Return
processed dataframe (grid points joined with polygons)
polygon dataframe with added 'ibatch' index
names of features with updated categorical (if none, then return original input name_features)
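Example call (hedged sketch; the path, file names and CRS are placeholders):

df_points, df_poly, features_grid = preprocess_grid_poly(
    path='./data',
    infname_grid='grid_covariates.csv',
    infname_poly='prediction_polygons.gpkg',
    name_features=['Clay', 'Sand'],
    grid_crs='EPSG:28355',
    grid_colname_Easting='x', grid_colname_Northing='y')
# df_points carries an 'ibatch' column indexing the polygon each grid point falls within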
def round_nearest_base(x, base=0.5)
Round to nearest multiple of base.
Input
x: number or array to round
base: base to round to (default: 0.5), can be float or integer
Returns
rounded number (array)
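A few examples of the rounding behaviour (a base below 1 returns floats, a base of 1 or larger returns integers):

round_nearest_base(3.1)                              # -> 3.0  (nearest multiple of 0.5)
round_nearest_base(3.3)                              # -> 3.5
round_nearest_base([12, 17, 23], base=5)             # -> array([10, 15, 25])
round_nearest_base(np.array([0.12, 0.18]), base=0.1) # -> array([0.1, 0.2])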