Module geodata_harvester.harvest
This script runs the headless version of the geodata-harvester.
The following main steps are automatically executed within the run() function:
- load settings from the config file
- create a bounding box from the input file points if none is provided
- download data layers as specified in the config file
- process data layers as specified in the config file
- save downloaded image files to disk as GeoTIFFs
- save a summary table of downloaded files as CSV
- extract data for the point locations provided in the input file (name specified in settings)
- save extracted point results to disk as CSV and as a geopackage
Example call within Python:
from geodata_harvester import harvest
harvest.run(path_to_config)
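As a quick orientation, the sketch below builds a minimal config and runs the harvester. The config keys shown (infile, colname_lng, colname_lat, outpath, target_res, target_bbox, date_min, date_max, time_intervals, target_sources) are inferred from how run() reads its settings; the file names and the layer selection under target_sources are hypothetical placeholders, not verified identifiers.

# Minimal sketch; config keys inferred from run(), file/layer names are placeholders
import yaml
from geodata_harvester import harvest

config = {
    "infile": "points.geojson",   # hypothetical point file with lon/lat columns
    "colname_lng": "lon",
    "colname_lat": "lat",
    "outpath": "harvest_out/",
    "target_res": 1,              # resolution in arc-seconds
    "target_bbox": None,          # derived from the points when None
    "date_min": "2022-01-01",
    "date_max": "2022-12-31",
    "time_intervals": None,       # None = no temporal aggregation
    "target_sources": {"DEM": ["DEM"]},  # placeholder source/layer selection
}
with open("settings_tmp.yaml", "w") as f:
    yaml.dump(config, f)

harvest.run("settings_tmp.yaml")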
Source code
"""
This script runs the headless version of the geodata-harvester.
The following main steps are automatically executed within the run() function:
- load settings from the config file
- create a bounding box from the input file points if none is provided
- download data layers as specified in the config file
- process data layers as specified in the config file
- save downloaded image files to disk as GeoTIFFs
- save a summary table of downloaded files as CSV
- extract data for the point locations provided in the input file (name specified in settings)
- save extracted point results to disk as CSV and as a geopackage
Example call within Python:
from geodata_harvester import harvest
harvest.run(path_to_config)
"""
import os
from pathlib import Path
import geopandas as gpd
from termcolor import cprint
import yaml
import shutil
import argparse
import numpy as np
from datetime import datetime, timedelta
from geodata_harvester.widgets import harvesterwidgets as hw
from geodata_harvester.utils import init_logtable, update_logtable
from geodata_harvester import (getdata_dea, getdata_dem, getdata_landscape,
getdata_radiometric, getdata_silo, getdata_slga,
utils, temporal)
from eeharvest import harvester as eeharvester
def run(path_to_config, log_name="download_summary", preview=False, return_df=False):
"""
A headless version of the Data-Harvester (with some limitations).
Results are saved to disk.
Parameters
----------
path_to_config : str
Path to YAML config file
    log_name : str, optional
        Name of the log file (default: "download_summary")
preview : bool, optional
Plots a matrix of downloaded images if set to True, by default False
    return_df : bool, optional
        If True, returns a dataframe with the extracted point results (default: False)
Returns
-------
None (if return_df is False)
dataframe (if return_df is True)
"""
cprint("Starting the data harvester -----", "magenta", attrs=["bold"])
# Load config file (based on notebook for now, will optimise later)
settings = hw.load_settings(path_to_config)
# Count number of sources to download from
count_sources = len(settings.target_sources)
list_sources = list(settings.target_sources.keys())
    # If no infile is provided, fall back to None for infile and coordinate column names
try:
settings.infile
if settings.infile is None:
points_available = False
else:
points_available = True
except (AttributeError, KeyError):
settings.infile = None
settings.colname_lng = None
settings.colname_lat = None
points_available = False
# If no resolution set, make it 1 arc-second
if settings.target_res is None:
utils.msg_info(
"No target resolution specified, using default of 1 arc-sec")
settings.target_res = 1
# Create bounding box if infile is provided and target_bbox is not provided
if settings.infile is not None:
gdfpoints = gpd.read_file(settings.infile)
longs = gdfpoints[settings.colname_lng].astype(float)
lats = gdfpoints[settings.colname_lat].astype(float)
coords = np.vstack((longs, lats)).T
if settings.target_bbox is None:
settings.target_bbox = (
min(longs) - 0.05,
min(lats) - 0.05,
max(longs) + 0.05,
max(lats) + 0.05,
)
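            # Worked example (illustrative values, not from the source): points spanning
            # lon 149.1-149.3 and lat -35.4 to -35.2 yield a padded bbox of
            # (149.05, -35.45, 149.35, -35.15), i.e. a 0.05 degree margin on each side.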
# Stop if bounding box cannot be calculated or was not provided
if settings.infile is None and settings.target_bbox is None:
raise ValueError("No sampling file or bounding box provided")
# Temporal range
# convert date strings to datetime objects
date_diff = (datetime.strptime(settings.date_max, "%Y-%m-%d")
- datetime.strptime(settings.date_min, "%Y-%m-%d")).days
if settings.time_intervals is not None:
period_days = date_diff // settings.time_intervals
if period_days == 0:
period_days = 1
else:
period_days = None
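    # Worked example (illustrative values): date_min=2020-01-01 and date_max=2020-12-31
    # give date_diff=365; with time_intervals=4, period_days = 365 // 4 = 91, so each
    # aggregation window spans roughly three months.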
# Create download log
download_log = init_logtable()
# process each data source
utils.msg_info(
f"Found the following {count_sources} sources: {list_sources}")
cprint("\nDownloading from API sources -----", "magenta", attrs=["bold"])
# GEE
if "GEE" in list_sources:
# Try to initialise API if Earth Engine is selected
try:
eeharvester.initialise()
        except Exception:
            eeharvester.initialise(auth_mode="notebook")
cprint("\n⌛ Downloading Google Earth Engine data...", attrs=["bold"])
# get data from GEE with eeharvest
#gee = eeharvester.collect(config=path_to_config)
#gee.preprocess()
#gee.download()
# use auto function to download and preprocess
if period_days is None:
"""
If no time intervals are specified and reduce is not set, download all datasets.
If no time intervals are specified, but reduce is set,
            download the temporally reduced (aggregated) set over the entire time range.
"""
gee_outpath = os.path.join(settings.outpath,'ee')
gee = eeharvester.auto(config=path_to_config, outpath=gee_outpath)
# add settings.outpath to all entries in list of gee.filenames
# if gee.filenames is a list of strings
if not isinstance(gee.filenames, list):
# convert to list
gee.filenames = [gee.filenames]
# get list of filenames in the directory
gee_filenames = []
# Walk through the directory and its subdirectories to find full paths
for root, _, files in os.walk(gee_outpath):
for file in files:
if file in gee.filenames:
# Join the root and file to get the full path
file_path = os.path.join(root, file)
gee_filenames.append(file_path)
outfnames = gee_filenames
agg_list = [None] * len(outfnames)
layernames = [Path(filename).resolve().stem for filename in gee_filenames]
layer_titles = layernames
else:
# run data extraction for each time interval
date_start_list = [datetime.strptime(settings.date_min, "%Y-%m-%d") +
timedelta(days=i*period_days) for i in range(settings.time_intervals)]
date_end_list = [datetime.strptime(settings.date_min, "%Y-%m-%d") +
timedelta(days=i*period_days) for i in range(1, settings.time_intervals+1)]
# clip end date to date_max
date_end_list[-1] = min(date_end_list[-1], datetime.strptime(settings.date_max, "%Y-%m-%d"))
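            # Example (illustrative): with date_min=2020-01-01, period_days=91 and
            # time_intervals=4, the interval starts are 2020-01-01, 2020-04-01,
            # 2020-07-01 and 2020-09-30; the final end date is clipped to date_max
            # if it would overshoot.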
if settings.target_sources['GEE']['preprocess']['reduce'] is None:
agg_type = "median"
else:
agg_type = settings.target_sources['GEE']['preprocess']['reduce']
# Check if agg_type is a list
if isinstance(agg_type, list):
agg_type = agg_type[0]
# run eeharvest extraction for each time interval
outfnames = []
layernames = []
agg_list = []
layer_titles = []
for i in range(settings.time_intervals):
# update settings.date_min and settings.date_max in config file
# make temporary copy of config file
tmp_config = path_to_config.replace(".yaml", "_tmp.yaml")
shutil.copy(path_to_config, tmp_config)
# update config file
with open(tmp_config, "r") as f:
config = yaml.load(f, Loader=yaml.SafeLoader)
                config['date_min'] = date_start_list[i].date()  # must be a date object; eeharvest cannot parse date strings
                config['date_max'] = date_end_list[i].date()
if config['target_sources']['GEE']['preprocess']['reduce'] is None:
config['target_sources']['GEE']['preprocess']['reduce'] = agg_type
with open(tmp_config, "w") as f:
yaml.dump(config, f)
# run eeharvest
gee_outpath = os.path.join(settings.outpath,'ee')
try:
# Try to download gee images for the time interval. If no images available, skip.
gee = eeharvester.auto(config=tmp_config, outpath=gee_outpath)
except Exception as e:
print(e)
continue
if not isinstance(gee.filenames, list):
# convert to list
gee.filenames = [gee.filenames]
gee_filenames = []
# Walk through the directory and its subdirectories to find full paths
for root, _, files in os.walk(gee_outpath):
for file in files:
if file in gee.filenames:
# Join the root and file to get the full path
file_path = os.path.join(root, file)
gee_filenames.append(file_path)
layers = [Path(filename).resolve().stem for filename in gee_filenames]
outfnames += gee_filenames
layernames += layers
agg_list += [agg_type] * len(gee_filenames)
layer_titles += [layer + "_" + agg_type + "_" + date_start_list[i].strftime("%Y-%m-%d") +
"-to-" + date_end_list[i].strftime("%Y-%m-%d") for layer in layers]
# remove temporary config file
os.remove(tmp_config)
if len(outfnames) > 0:
# rename outfname files to layer_titles + .tif
for i in range(len(outfnames)):
os.rename(outfnames[i], os.path.join(os.path.dirname(outfnames[i]), layer_titles[i] + ".tif"))
outfnames[i] = os.path.join(os.path.dirname(outfnames[i]), layer_titles[i] + ".tif")
download_log = update_logtable(
download_log,
outfnames,
layernames,
"GEE",
settings,
layertitles=layer_titles,
agfunctions=agg_list,
loginfos="downloaded",
)
# DEA
if "DEA" in list_sources:
cprint("\n⌛ Downloading DEA data...", attrs=["bold"])
# get data from DEA
dea_layernames = settings.target_sources["DEA"]
outpath_dea = os.path.join(settings.outpath, "dea")
# put into subdirectory
files_dea = getdata_dea.get_dea_layers_daterange(
dea_layernames,
settings.date_min,
settings.date_max,
settings.target_bbox,
settings.target_res,
outpath_dea,
crs="EPSG:4326",
format_out="GeoTIFF",
)
if period_days is not None:
# aggregate temporal data
outfname_dea_list = []
layer_list = []
layer_titles = []
agg_list = []
for layername in dea_layernames:
# get files for layername
#files_layer = [os.path.basename(x) for x in files_dea if layername in x]
files_layer = [x for x in files_dea if layername in x]
# Check if there are multiple files for the same layer,
# if not assume no temporal aggregation and skip loop
if len(files_layer) == 1:
outfname_dea_list += files_layer
layer_titles += [layername]
layer_list += [layername]
agg_list += ['None']
continue
xdr = temporal.multiband_raster_to_xarray(files_layer)
# replace values with nan_value with nan so that aggregation works properly
nan_value = xdr.attrs["_FillValue"]
xdr = xdr.where(xdr != nan_value, np.nan)
"""
Aggregate over temporal period by using median along the time dimension.
                Note that some DEA layers may have quality flags, but these are not applied here because they are layer-specific.
"""
outfname_list, agg_list = temporal.aggregate_temporal(
xdr,
period=period_days,
agg=["median"],
outfile=os.path.join(settings.outpath,f"DEA_{layername}"),
buffer = None)
outfname_dea_list += outfname_list
# create layer titles with proper date range format
for filename in outfname_list:
# get date from filename without extension
date_start = os.path.splitext(os.path.basename(filename).rsplit('_')[-1])[0]
                    # convert date string YYYY-MM-DD to datetime object
date_start = datetime.strptime(date_start, "%Y-%m-%d")
date_end = date_start + timedelta(days=period_days)
date_str = date_start.strftime("%Y-%m-%d") + "-to-" + date_end.strftime("%Y-%m-%d")
new_name = "DEA_" + layername + "_median_" + date_str
layer_titles += [new_name]
layer_list += [layername]*len(outfname_list)
agg_list = ['median']*len(layer_titles)
else:
outfname_dea_list = files_dea
# get string dea_layernames from filenames in files_dea
layer_list = []
for fname in files_dea:
for layername in dea_layernames:
if layername in fname:
layer_list.append(layername)
break
layer_titles = [os.path.basename(path).rsplit('.')[0] for path in files_dea]
agg_list = ['None']*len(layer_titles)
# Update download log table
download_log = update_logtable(
download_log,
outfname_dea_list,
layer_list,
'DEA',
settings,
layertitles=layer_titles,
agfunctions = agg_list,
loginfos='downloaded'
)
# DEM
if "DEM" in list_sources:
cprint("\n⌛ Downloading DEM data...", attrs=["bold"])
dem_layernames = settings.target_sources["DEM"]
        files_dem = None
        try:
files_dem = getdata_dem.get_dem_layers(
dem_layernames,
settings.outpath,
settings.target_bbox,
settings.target_res,
)
except Exception as e:
print(e)
        # Check if output is False (no data available) and skip if so
if (files_dem == [False]) or files_dem is None:
pass
else:
# Add extracted data to log dataframe
download_log = update_logtable(
download_log,
files_dem,
dem_layernames,
'DEM',
settings,
layertitles=dem_layernames,
loginfos='downloaded')
if "Landscape" in list_sources:
cprint("\n⌛ Downloading Landscape data...", attrs=["bold"])
# get data from Landscape
layernames = settings.target_sources["Landscape"]
layertitles = ["landscape_" + layername for layername in layernames]
files_ls = getdata_landscape.get_landscape_layers(
layernames,
settings.target_bbox,
settings.outpath,
resolution=settings.target_res,
)
# Add extracted data to log dataframe
download_log = update_logtable(
download_log,
files_ls,
layernames,
"Landscape",
settings,
layertitles=layertitles,
loginfos="downloaded",
)
if "Radiometric" in list_sources:
cprint("\n⌛ Downloading Radiometric data...", attrs=["bold"])
# get data from Radiometric
# Download radiometrics
layernames = settings.target_sources["Radiometric"]
try:
files_rd = getdata_radiometric.get_radiometric_layers(
settings.outpath,
layernames,
bbox=settings.target_bbox,
resolution=settings.target_res,
)
except Exception as e:
print(e)
var_exists = "files_rd" in locals() or "files_rd" in globals()
if var_exists:
# Add extracted data to log dataframe
download_log = update_logtable(
download_log,
files_rd,
layernames,
"Radiometric",
settings,
layertitles=layernames,
loginfos="downloaded",
)
else:
pass
if "SILO" in list_sources:
cprint("\n⌛ Downloading SILO data.\
This will take a couple of minutes, depending on your internet speed...", attrs=["bold"])
# get data from SILO
fnames_out_silo = []
silo_layernames = list(settings.target_sources["SILO"].keys())
# print(silo_layernames)
try:
# run the download
files_silo = os.path.join(settings.outpath, "silo")
fnames_out = getdata_silo.get_SILO_layers(
silo_layernames,
settings.date_min,
settings.date_max,
files_silo,
bbox=settings.target_bbox,
format_out="tif"
)
# Save the layer name
fnames_out_silo += fnames_out
except Exception as e:
print(e)
# aggregate the data along time windows if period_days is not None
if period_days is not None:
#try:
outfname_list = []
layername_list = []
aggfunction_list = []
layer_titles = []
for i, fname in enumerate(fnames_out_silo):
xdr = temporal.combine_rasters_temporal(fname, channel_name="band", attribute_name="long_name")
outfnames, agg_list = temporal.aggregate_temporal(
xdr,
period=period_days,
agg=[settings.target_sources['SILO'][silo_layernames[i]]],
outfile=f"{fname.split('.')[0]}",
buffer = None)
outfname_list += outfnames
layername_list += [silo_layernames[i]]*len(outfnames)
aggfunction_list += agg_list
# define proper titles for the layers
layername = silo_layernames[i]
for filename in outfnames:
# get date from filename without extension
date_start = os.path.splitext(os.path.basename(filename).rsplit('_')[-1])[0]
                    # convert date string YYYY-MM-DD to datetime object
date_start = datetime.strptime(date_start, "%Y-%m-%d")
date_end = date_start + timedelta(days=period_days)
date_str = date_start.strftime("%Y-%m-%d") + "-to-" + date_end.strftime("%Y-%m-%d")
new_name = "SILO_" + layername + "_" + settings.target_sources['SILO'][silo_layernames[i]] + "_" + date_str
layer_titles += [new_name]
#except Exception as e:
# print(e)
else:
outfname_list = fnames_out_silo
layername_list = silo_layernames
aggfunction_list = ['']*len(fnames_out_silo)
layer_titles = ["SILO_" + layername for layername in silo_layernames]
var_exists = "files_silo" in locals() or "files_silo" in globals()
if var_exists:
# Add download info to log dataframe
download_log = update_logtable(
download_log,
#fnames_out_silo,
filenames = outfname_list,
layernames = layername_list,
datasource = "SILO",
settings = settings,
layertitles=[os.path.basename(fname).split('.')[0] for fname in outfname_list],
agfunctions = aggfunction_list,
loginfos="downloaded",
)
else:
pass
if "SLGA" in list_sources:
cprint("\n⌛ Downloading SLGA data...", attrs=["bold"])
# get data from SLGA
slga_layernames = list(settings.target_sources["SLGA"].keys())
# get min and max depth for each layername
depth_min = []
depth_max = []
for layername in slga_layernames:
depth_bounds = settings.target_sources["SLGA"][layername]
dmin, dmax = getdata_slga.identifier2depthbounds(depth_bounds)
depth_min.append(dmin)
depth_max.append(dmax)
try:
files_slga = getdata_slga.get_slga_layers(
slga_layernames,
settings.target_bbox,
settings.outpath,
depth_min=depth_min,
depth_max=depth_max,
get_ci=True,
)
except Exception as e:
print(e)
var_exists = "files_slga" in locals() or "files_slga" in globals()
if var_exists:
if len(files_slga) != len(slga_layernames):
# get filename stems of files_slga
slga_layernames = [Path(f).stem for f in files_slga]
download_log = update_logtable(
download_log,
files_slga,
slga_layernames,
"SLGA",
settings,
layertitles=[],
loginfos="downloaded",
)
else:
pass
# save log to file
download_log.to_csv(os.path.join(settings.outpath, log_name + ".csv"), index=False)
# extract filename from settings.infile
# Select all processed data
df_sel = download_log.copy()
rasters = df_sel["filename_out"].values.tolist()
titles = df_sel["layertitle"].values.tolist()
if points_available:
fn = Path(settings.infile).resolve().name
cprint(
f"\nExtracting data points for {fn} -----", "magenta", attrs=["bold"])
# Extract datatable from rasters given input coordinates
# gdf = utils.raster_query(longs, lats, rasters, titles) # old slower version
gdf = utils.extract_values_from_rasters(coords, rasters)
# Save as geopackage
gdf.to_file(os.path.join(settings.outpath,
"results.gpkg"), driver="GPKG")
# Save the results table to a csv as well
gdf.drop("geometry", axis=1).to_csv(
os.path.join(settings.outpath, "results.csv"), index=True, mode="w"
)
        utils.msg_success(
            f"Data points extracted to {os.path.join(settings.outpath, 'results.gpkg')}")
if preview and points_available:
utils.plot_rasters(rasters, longs, lats, titles)
elif preview and not points_available:
utils.plot_rasters(rasters, titles=titles)
cprint("\n🎉 🎉 🎉 Harvest complete 🎉 🎉 🎉", "magenta", attrs=["bold"])
if return_df and points_available:
return gdf
else:
return None
Functions
def run(path_to_config, log_name='download_summary', preview=False, return_df=False)
A headless version of the Data-Harvester (with some limitations). Results are saved to disk.

Parameters

path_to_config : str
    Path to YAML config file
log_name : str, optional
    Name of the log file (default: "download_summary")
preview : bool, optional
    Plots a matrix of the downloaded images if set to True, by default False
return_df : bool, optional
    If True, returns a dataframe with the extracted point results (default: False)

Returns

None (if return_df is False)
dataframe (if return_df is True)
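A minimal call sketch (the config path is hypothetical; a dataframe is returned only when return_df=True and point locations are available from the input file):

from geodata_harvester import harvest

# Hypothetical config path; run() writes all outputs to disk and, with
# return_df=True, also returns the table of extracted point values.
gdf = harvest.run("settings_v1.yaml", log_name="my_summary", return_df=True)
if gdf is not None:
    print(gdf.head())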