Module `geodata_harvester.utils`

Utility functions for use in the Agrefed data harvesting pipeline.

–Function List, in order of appearence–

plot_rasters: Plots a list of rasters. _getFeatures (internal): Extracts rasterio compatible test from geodataframe. reproj_mask: Masks a raster to the area of a shape, and reprojects. reproj_rastermatch: Reproject a file to match the shape and projection of existing raster. reproj_raster: Reproject and clip for a given output resolution, crs and bbox. _read_file (internal): Reads raster with rasterio returns numpy array aggregate_rasters: Averages (or similar) over multiple files and multiple channels. aggregate_multiband: Averages (or similar) over multiple files but keeps multi-channels independent. _get_coords_at_point (internal): Finds closest index of a point-location in an array (raster). raster_query: Given a longitude,latitude value, return the value at that point of a raster/tif. extract_values_from_rasters: Given a list of rasters, extract the values at coords. init_logtable: Stores metdata for each step of raster download and processing. update_logtable: Updates each the logtable with new information.

Expand source code

#!/bin/python
"""
Utility functions for use in the Agrefed data harvesting pipeline.

--Function List, in order of appearence--

plot_rasters: Plots a list of rasters.
_getFeatures (internal): Extracts rasterio compatible test from geodataframe.
reproj_mask: Masks a raster to the area of a shape, and reprojects.
reproj_rastermatch: Reproject a file to match the shape and projection of
    existing raster.
reproj_raster: Reproject and clip for a given output resolution, crs and bbox.
_read_file (internal): Reads raster with rasterio returns numpy array
aggregate_rasters: Averages (or similar) over multiple files and multiple
    channels.
aggregate_multiband: Averages (or similar) over multiple files but keeps
    multi-channels independent.
_get_coords_at_point (internal): Finds closest index of a point-location in an
    array (raster).
raster_query: Given a longitude,latitude value, return the value at that point
    of a raster/tif.
extract_values_from_rasters: Given a list of rasters, extract the values at coords.
init_logtable: Stores metdata for each step of raster download and processing.
update_logtable: Updates each the logtable with new information.
"""

from glob import glob
import os
import json

import rasterio
from rasterio.mask import mask
from rasterio.warp import calculate_default_transform, reproject, Resampling
from rasterio.plot import show

import numpy as np
import pandas as pd
import geopandas as gpd
import rioxarray as rxr

from pyproj import CRS
from pathlib import Path

import matplotlib.pyplot as plt
from matplotlib.ticker import FormatStrFormatter

from numba import jit

import warnings
import logging

from termcolor import colored, cprint
from alive_progress import alive_bar, config_handler


config_handler.set_global(
    force_tty=True,
    bar=None,
    spinner="waves",
    monitor=False,
    stats=False,
    receipt=True,
    elapsed="{elapsed}",
)

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(message)s",
    filename="harvest.txt",
    filemode="w",
)

## ------ Functions to show progress and provide feedback to the user ------ ##


def spin(message=None, colour="magenta", events=1, log=False):
    """Spin animation as a progress inidicator"""
    if log:
        logging.info(message)
    return alive_bar(events, title=colored("\u2299 " + message, color=colour))


def msg_info(message, icon=True, log=False):
    """Prints an info message"""
    if log:
        logging.info(message)
    if icon:
        cprint("\u2139 " + message, color="magenta")
    else:
        cprint("  " + message, color="magenta")


def msg_dl(message, log=False):
    """Prints a downloading message"""
    if log:
        logging.info(message)
    cprint("\u29e9 " + message, color="magenta")


def msg_warn(message, log=False):
    """Prints a warning message"""
    if log:
        logging.warning(message)
    cprint("\u2691 " + message, color="yellow")


def msg_err(message, log=False):
    """Prints an error message"""
    if log:
        logging.error(message)
    cprint("\u2716 " + message, color="red", attrs=["bold"])


def msg_success(message, log=False):
    """Prints a success message"""
    if log:
        logging.info(message)
    cprint("\u2714 " + message, color="magenta")


## ------------------------------------------------------------------------- ##


def plot_rasters(rasters, longs=None, lats=None, titles=None):
    """
    Plots multiple raster files (.tif) on a grid of nicely arranged figures.

    Parameters:
        raster: list of filenames (.tif).
            Will only read the first band/channel if multiband.
        longs: optional x values in list like object for plotting as points 
            over raster images.
        lats: optional x values in list like object for plotting as points
            over raster images.
        titles: title of plot default is raster file name.

    Returns:  
        None
    """
    # Set the value for reasonable shaped plot based on the number of datasets
    figlen = int(np.ceil(len(rasters) / 3))
    # Make a blank canvas if there is no data
    figlen = 2 if figlen < 2 else figlen

    # Create the figure
    fig, axes = plt.subplots(figlen, 3, figsize=(12, figlen * 3))

    if titles == None:
        titles = rasters
    # Loop through each subplot/axis on the figure.
    # Use counters to know what axes we are up to.
    i, j = 0, 0
    for a, rast in enumerate(rasters):
        if j == 3:
            j = 0
            i += 1

        # print(a,i,j,figlen,rast)
        src = rasterio.open(rast)
        # Only read first Band for flexibility without complexity
        data = src.read(1)
        # Grab the percentiles for pretty plotting of color ranges
        n95 = np.percentile(data, 5)
        n5 = np.percentile(data, 95)

        # Make the plot and clean it up
        show(
            data,
            ax=axes[i, j],
            title=titles[a],
            transform=src.transform,
            cmap="Greys",
            vmin=n95,
            vmax=n5,
        )
        axes[i, j].scatter(longs, lats, s=1, c="r")
        axes[i, j].xaxis.set_major_formatter(FormatStrFormatter("%.2f"))
        axes[i, j].yaxis.set_major_formatter(FormatStrFormatter("%.2f"))
        axes[i, j].locator_params(axis="y", nbins=4)
        axes[i, j].locator_params(axis="x", nbins=4)

        j += 1

    fig.tight_layout()
    plt.show()


def _getFeatures(gdf):
    """
    Internal function to parse features from GeoDataFrame in such a manner that
    rasterio wants them.

    INPUTS
        gdf: geodataframe

    RETURNS
        json object for rasterio to read
    """
    return [json.loads(gdf.to_json())["features"][0]["geometry"]]


def reproj_mask(filepath, bbox, crscode=4326, filepath_out=None):
    """
    Clips a raster to the area of a shape, and reprojects.

    INPUTS
        filepath: input filename (tif)
        bbox: shapely geometry(polygon) defining mask boundary
        crscode: optional, coordinate reference system as defined by EPSG
        filepath_out: optional, the optional output filename of the raster. If False, 
        does not save a new file

    RETURNS
        out_img: numpy array of the clipped and reprojected raster
    """
    data = rasterio.open(filepath)
    geo = gpd.GeoDataFrame({"geometry": bbox}, index=[0], crs=CRS.from_epsg(crscode))
    geo = geo.to_crs(crs=CRS.from_epsg(crscode))
    coords = _getFeatures(geo)
    out_img, out_transform = mask(data, shapes=coords, crop=True)

    if filepath_out:
        out_meta = data.meta.copy()
        out_meta.update(
            {
                "driver": "GTiff",
                "height": out_img.shape[1],
                "width": out_img.shape[2],
                "transform": out_transform,
                "crs": CRS.from_epsg(crscode),
            }
        )

        with rasterio.open(filepath_out, "w", **out_meta) as dest:
            dest.write(out_img)
        print("Clipped raster written to:", filepath_out)

    return out_img


def reproj_rastermatch(infile, matchfile, outfile, nodata):
    """
    Reproject a file to match the shape and projection of existing raster.
    Output file is written to disk.

    Parameters
    ----------
    infile : (string) path to input file to reproject
    matchfile : (string) path to raster with desired shape and projection
    outfile : (string) path to output file tif
    nodata : (float) nodata value for output raster
    """
    # open input
    with rasterio.open(infile) as src:
        src_transform = src.transform

        # open input to match
        with rasterio.open(matchfile) as match:
            dst_crs = match.crs

            # calculate the output transform matrix
            dst_transform, dst_width, dst_height = calculate_default_transform(
                src.crs,  # input CRS
                dst_crs,  # output CRS
                match.width,  # input width
                match.height,  # input height
                *match.bounds,  # unpacks input outer boundaries (left, bottom, right, top)
            )

        # set properties for output
        dst_kwargs = src.meta.copy()
        dst_kwargs.update(
            {
                "crs": dst_crs,
                "transform": dst_transform,
                "width": dst_width,
                "height": dst_height,
                "nodata": nodata,
            }
        )
        print(
            "Coregistered to shape:", dst_height, dst_width, "\n Affine", dst_transform
        )
        # open output
        with rasterio.open(outfile, "w", **dst_kwargs) as dst:
            # iterate through bands and write using reproject function
            for i in range(1, src.count + 1):
                reproject(
                    source=rasterio.band(src, i),
                    destination=rasterio.band(dst, i),
                    src_transform=src.transform,
                    src_crs=src.crs,
                    dst_transform=dst_transform,
                    dst_crs=dst_crs,
                    resampling=Resampling.nearest,
                )


def reproj_raster(
    infile, outfile, bbox_out, resolution_out=None, crs_out="EPSG:4326", nodata=0
):
    """
    Reproject and clip for a given output resolution, crs and bbox.
    Output file is written to disk.

    Parameters
    ----------
    infile : (string) path to input file to reproject
    outfile : (string) path to output file tif
    bbox_out : (left, bottom, right, top)
    resolution_out : (float) resolution of output raster
    crs_out : default "EPSG:4326"
    nodata : (float) nodata value for output raster
    """
    # open input
    with rasterio.open(infile) as src:
        src_transform = src.transform

        width_out = int((bbox_out[2] - bbox_out[0]) / resolution_out)
        height_out = int((bbox_out[3] - bbox_out[1]) / resolution_out)

        # calculate the output transform matrix
        dst_transform, dst_width, dst_height = calculate_default_transform(
            src.crs,  # input CRS
            crs_out,  # output CRS
            width_out,  # output width
            height_out,  # output height
            *bbox_out,  # unpacks input outer boundaries (left, bottom, right, top)
        )

        # set properties for output
        dst_kwargs = src.meta.copy()
        dst_kwargs.update(
            {
                "crs": crs_out,
                "transform": dst_transform,
                "width": dst_width,
                "height": dst_height,
                "nodata": nodata,
            }
        )
        print("Converting to shape:", dst_height, dst_width, "\n Affine", dst_transform)
        # open output
        with rasterio.open(outfile, "w", **dst_kwargs) as dst:
            # iterate through bands and write using reproject function
            for i in range(1, src.count + 1):
                reproject(
                    source=rasterio.band(src, i),
                    destination=rasterio.band(dst, i),
                    src_transform=src.transform,
                    src_crs=src.crs,
                    dst_transform=dst_transform,
                    dst_crs=crs_out,
                    resampling=Resampling.nearest,
                )


def _read_file(file):
    """
    Internal function to read a raster file with rasterio

    INPUT:
        file: filepath to raster file

    RETURNS:
        Either single data array or multi-dimensional array if input is multiband.
    """
    with rasterio.open(file) as src:
        temp = src.read()
        dims = temp.shape[0]
        if dims == 1:
            return src.read(1)
        else:
            # Returns array in form [channels, long, lat]
            return src.read()


def aggregate_rasters(
    file_list=None, data_dir=None, agg=["mean"], outfile="aggregation"
):
    """
    Aggregrates over multiple files and over all channels
    and writes results to new tif file(s).

    Parameters
    ----------
    file_list : list of strings
        List of files to aggregate
    data_dir : string
        Path to directory containing files
    agg : list of strings
        List of aggregation methods to apply (mean, median, sum, perc95, perc5)
    outfile : string
        Name of output file

    Returns
    -------
    list_outfnames : list of strings of output file names

    """

    if (file_list != None) and (data_dir != None):
        raise RuntimeWarning(
            "file_list and data_dir both set, only the data_dir will be used."
        )

    # Check the aggregation methods are okay
    agg_types = ["mean", "median", "sum", "perc95", "perc5"]
    aggcheck = [a for a in agg if a in agg_types]
    if aggcheck is None:
        raise ValueError("Invalid Aggregation type. Expected any of: %s" % agg_types)
    else:
        print("Finding", aggcheck, " out of possible", agg_types)

    # If a directory has been passed, add all the files to the list
    if data_dir is not None:
        # data_dir = '/path/to/data'
        print("Reading all *.tif files in: ", data_dir)
        file_list = glob(os.path.join(data_dir, "*.tif"))

    # Get metadata from one of the input files
    with rasterio.open(file_list[0]) as src:
        meta = src.meta

    meta.update(dtype=rasterio.float32)

    # Stack all data/channels as a list of numpy arrays
    array_list = []
    for x in file_list:
        array_list.append(_read_file(x))

    array_list = np.asarray(array_list)

    # Perform aggregation over channels axis
    mean_array = np.nanmean(array_list, axis=0)
    median_array = np.nanmedian(array_list, axis=0)
    sum_array = np.nansum(array_list, axis=0)
    mean_array95 = np.nanpercentile(array_list, 95, axis=0)
    mean_array5 = np.nanpercentile(array_list, 5, axis=0)

    aggdict = {}
    aggdict["mean"] = mean_array
    aggdict["median"] = median_array
    aggdict["sum"] = sum_array
    aggdict["perc95"] = mean_array95
    aggdict["perc5"] = mean_array5

    # Write output file
    list_outfnames = []
    for a in aggcheck:
        with rasterio.open(outfile + "_" + a + ".tif", "w", **meta) as dst:
            dst.write(aggdict[a].astype(rasterio.float32), 1)
        print(a, "of filelist saved in: ", outfile + "_" + a + ".tif")
        list_outfnames.append(outfile + "_" + a + ".tif")
    return list_outfnames


def aggregate_multiband(
    file_list=None, data_dir=None, agg=["mean"], outfile="aggregation"
):
    """
    Aggregates over multiple files but keeps channels independently.
    Results are written to new tif files.

    Parameters
    ----------
    file_list : list of strings
        List of files to aggregate
    data_dir : string
        Path to directory containing files
    agg : list of strings
        List of aggregation methods to apply
    outfile : string
        Name of output file

    Returns
    -------
    outfname_list : list of strings of output file names
    channel_list : list of strings of channel names
    agg_list : list of strings of aggregation methods
    """

    if (file_list != None) and (data_dir != None):
        raise RuntimeWarning(
            "file_list and data_dir both set, only the data_dir will be used."
        )

    # Check the aggregation methods are okay
    agg_types = ["mean", "median", "sum", "perc95", "perc5"]
    aggcheck = [a for a in agg if a in agg_types]
    if aggcheck is None:
        raise ValueError("Invalid Aggregation type. Expected any of: %s" % agg_types)
    else:
        print("Finding", aggcheck, " out of possible", agg_types)

    # If a directory has been passed, add all the files to the list
    if data_dir is not None:
        # data_dir = '/path/to/data'
        print("Reading all *.tif files in: ", data_dir)
        file_list = glob(os.path.join(data_dir, "*.tif"))

    # Get metadata from one of the input files
    with rasterio.open(file_list[0]) as src:
        meta = src.meta
        desc = src.descriptions

    meta.update(dtype=rasterio.float32)

    # Append all tif files for each channel as a list of numpy arrays
    array_list = {k: [] for k in desc}

    for x in file_list:
        # print(x)
        data = _read_file(x)
        if data.shape[0] != len(desc):
            print("Band number mismatch between files!")
        for i in range(data.shape[0]):
            # print(i,desc[i])
            array_list[desc[i]].append(data[i, :, :])

    # Perform aggregation over channels axis
    outfname_list = []
    channel_list = []
    agg_list = []
    for i, channel in enumerate(array_list):
        mean_array = np.nanmean(array_list[channel], axis=0)
        median_array = np.nanmedian(array_list[channel], axis=0)
        sum_array = np.nansum(array_list[channel], axis=0)
        mean_array95 = np.nanpercentile(array_list[channel], 95, axis=0)
        mean_array5 = np.nanpercentile(array_list[channel], 5, axis=0)

        aggdict = {}
        aggdict["mean"] = mean_array
        aggdict["median"] = median_array
        aggdict["sum"] = sum_array
        aggdict["perc95"] = mean_array95
        aggdict["perc5"] = mean_array5

        # Write output file
        for a in aggcheck:
            outstring = outfile + "_" + a + "_channel_" + channel + ".tif"
            with rasterio.open(
                outstring, "w", **meta
            ) as dst:
                dst.write(aggdict[a].astype(rasterio.float32), 1)
            print(
                a,
                "of filelist saved in: ",
                outstring,
            )
            outfname_list.append(outstring)
            agg_list.append(a)
            channel_list.append(str(i))
    return outfname_list, channel_list, agg_list


@jit(nopython=True)
def _get_coords_at_point(gt, lon, lat):
    """
    Internal function, given a point in some coordinate reference
    (e.g. lat/lon) Find the closest point to that in an array (e.g.
    a raster) and return the index location of that point in the raster.
    
    INPUTS:
        gt: output from "gdal_data.GetGeoTransform()"
        lon: x/row-coordinate of interest
        lat: y/column-coordinate of interest
    
    RETURNS:
        col: x index value from the raster
        row: y index value from the raster
    """
    row = int((lon - gt[2]) / gt[0])
    col = int((lat - gt[5]) / gt[4])

    return (col, row)


def raster_query(longs, lats, rasters, titles=None):
    """
    DEPRECATED: Use extract_values_from_rasters instead.

    given a longitude,latitude value, return the value at that point of the
        first channel/band in the raster/tif.

    INPUTS
        longs:list of longitudes
        lats:list of latitudes
        rasters:list of raster filenames (as strings)
        titles:list of column titles (as strings) that correspond to rasters (if none provided, rasternames will be used)

    RETURNS
        gdf: geopandas dataframe where each row is long/lat point,
            and columns are rasterfiles
    """

    # Setup the dataframe to store the ML data
    gdf = gpd.GeoDataFrame(
        {"Longitude": longs, "Latitude": lats},
        geometry=gpd.points_from_xy(longs, lats),
        crs="EPSG:4326",
    )

    # Loop through each raster
    for filepath in rasters:
        filename = Path(filepath).resolve().stem
        # print("Opening:", filename)
        # Open the file:
        raster = rasterio.open(filepath)
        # Get the transformation crs data
        gt = raster.transform

        # This will only be the first band, usally multiband has same index.
        arr = raster.read(1)

        if titles is not None:
            colname = titles[rasters.index(filepath)]
        else:
            colname = Path(filepath).stem
            # colname = filepath.split("/")[-1][:-4]

        # Interogate the tiff file as an array

        # FIXME Check the number of bands and print a warning if more than 1

        # Shape of raster
        # print("Raster pixel size:", np.shape(arr))

        # Slowest part of this function.
        # Speed up with Numba/Dask etc
        # (although previous attempts have not been worth it.)
        # Query the raster at the points of interest
        with spin(f"• {filename} | pixel size: {np.shape(arr)}", "blue") as s:
            values = []
            for (lon, lat) in zip(longs, lats):

                # Convert lat/lon to raster units-index
                point = _get_coords_at_point(gt, lon, lat)

                # This will fail for small areas or on boundaries
                try:
                    val = arr[point[0], point[1]]
                except:
                    # print(lon,lat,point[0],point[1],"has failed.")
                    val = 0
                values.append(val)
            s(1)

        # dd the values at the points to the dataframe
        gdf[filepath] = values
        gdf = gdf.rename(columns={filepath: colname})

    return gdf


def extract_values_from_rasters(coords, raster_files, method = "nearest"):
    """
    Extract values from a list of raster files at given coordinates using rioxarray.
    Values will be extracted for all bands in each raster file.
    Return geopandas DataFrame with extracted values and geometry.

    Input:
        coords: A list of tuples containing longitude and latitude coordinates.
                Format: [(lng1, lat1), (lng2, lat2), ...]

        raster_files: A list of raster file paths.
                      Format: ["path/to/raster1.tif", "path/to/raster2.tif", ...]

        method: The method to select values from raster files for 
                inexact matches between input coords and raster coords:
                 {"nearest", "pad", "ffill", "backfill", "bfill"}, optional
            - nearest (Default): use nearest valid index value. 
            - pad / ffill: propagate last valid index value forward
            - backfill / bfill: propagate next valid index value backward
            - None: only exact matches

    Output:
        A geopandas DataFrame containing the extracted values and geometry, where each row represents
        a coordinate point and the columns represent the bands for each raster file.
        Output column names are the raster file name plus the band name.
    """
    all_coords_data = []
    column_names = []

    with spin("Extracting values from raster files...", "blue") as s:
        for raster_file in raster_files:
            # Open the raster file with rioxarray
            ds = rxr.open_rasterio(raster_file)
            
            # Extract values for all coordinates
            coords_data = []
            for lng, lat in coords:
                # Select the nearest lat and lon coordinates from the dataset
                data = ds.sel(x=lng, y=lat, method=method)
                
                # Convert the data to a numpy array and flatten it
                data_array = data.values.flatten().tolist()
                
                # Add the extracted values to the list
                coords_data.append(data_array)

            # Concatenate the extracted values from all raster files
            all_coords_data.append(coords_data)

            # try to det the band names from the dataset, otherwise use the band number
            try:
                band_names = ds.attrs['long_name']
                if isinstance(band_names, str):
                    band_names = [band_names]
                if isinstance(band_names, tuple):
                    band_names = list(band_names)
                if len(band_names) != len(ds.band.values.tolist()):
                    band_names = ds.band.values.tolist()
            except:
                band_names = ds.band.values.tolist()
            # get the raster name
            raster_name = os.path.basename(raster_file).split(".")[0]
            # Add the raster name to the band names
            band_names = [f"{raster_name}_{band_name}" for band_name in band_names]

            # Add the band names to the column names list
            column_names.extend(band_names)

    # Convert the data to a pandas DataFrame and include the column names
    all_coords_data = pd.DataFrame(np.hstack(all_coords_data), columns=column_names)
    
    # Check for potential duplicate column names in all_coords_data
    if all_coords_data.columns.duplicated().any():
        print("Duplicate column names found. Please check the input raster files.")
        # drop duplicate columns and leave only first occurence
        all_coords_data = all_coords_data.loc[:,~all_coords_data.columns.duplicated()].copy()

    # save all_coords_data with coords as geopackage with geopandas
    gdf = gpd.GeoDataFrame(all_coords_data, geometry=gpd.points_from_xy(coords[:,0], coords[:,1]), crs="EPSG:4326")

    # insert the coords into the dataframe
    gdf.insert(0, 'Longitude', coords[:,0])
    gdf.insert(1, 'Latitude', coords[:,1])

    return gdf


def init_logtable():
    """
    Create a log table to store information from the raster download or processing.

    RETURNS:
        df_log: dataframe to update
    """
    return pd.DataFrame(
        columns=[
            "layername",
            "agfunction",
            "dataset",
            "layertitle",
            "filename_out",
            "loginfo",
        ]
    )


def update_logtable(
    df_log,
    filenames,
    layernames,
    datasource,
    settings,
    layertitles=[],
    agfunctions=[],
    loginfos=[],
    force=False
):
    """
    Update the dataframe table with the information from the raster download or processing.
    The dataframe is simultaneoulsy saved to a csv file in default output directory.

    INPUTS
    df_log: dataframe to update
    filenames: list of filenames to add to the dataframe (captured in output of getdata_* functions)
    layernames: list of layernames to add to the dataframe (must be same length as filenames)
    datasource: datasource of the rasters (e.g. 'SLGA', 'SILO', 'DEA', see settings)
    settings: settings Namespace object
    layertitles: list of layer titles to add to the dataframe; if empty or none provided, it will be inferred from settings
    agfunctions: list of aggregation functions to add to the dataframe; if empty or none provided, it will be inferred from settings
    loginfos: string or list of log information strings to add to the dataframe;

    RETURNS
    df_log: updated dataframe
    """
    # First automatically check consistency of inputs and set defaults if necessary
    if len(filenames) != len(layernames):
        print(
            "Error: Number of filenames does not match number of layernames. Dataframe not updated."
        )
        return df_log
    if type(agfunctions) == str:
        agfunctions = [agfunctions] * len(layernames)
    if agfunctions == []:
        try:
            if "agfunctions" in settings.target_sources["SLGA"]:
                agfunctions = settings.target_sources[datasource]["agfunctions"]
            else:
                agfunctions = list(settings.target_sources[datasource].values())
                # flatten possible list of lists
                # check if list of lists
                if type(agfunctions[0]) == list:
                    agfunctions = [item for sublist in agfunctions for item in sublist]
            if agfunctions == None:
                agfunctions = ["None"] * len(layernames)
        except:
            agfunctions = ["None"] * len(layernames)

    if len(agfunctions) != len(layernames):
        print(
            "Error: Number of agfunctions does not match number of layernames. Dataframe not updated."
        )
        return df_log
    if layertitles == []:
        layertitles = [
            layernames[i] + "_" + agfunctions[i] for i in range(len(layernames))
        ]

    # check if you are adding a duplicate entry to the log
    for f in filenames:
        warnings.simplefilter(action="ignore", category=FutureWarning)
        if f in df_log.filename_out.values:
            if force == False:
                print("Error: " + str(f) + " exists in df_log! Dataframe not updated.\nCheck your inputs or overwrite with force=True")
                return df_log
            elif force==True:
                print("Warning: " + str(f) + " exists in df_log and has been overitten by force=True")
                df_log.drop(df_log[df_log.filename_out != f].index, inplace=True)

    # check if loginfos is a list or a string
    if type(loginfos) == str:
        loginfos = [loginfos] * len(layernames)
    else:
        if loginfos == []:
            loginfos = ["processed"] * len(layernames)
        elif len(loginfos) != len(layernames):
            print(
                "Error: Number of loginfos does not match number of layernames. Dataframe not updated."
            )
            return df_log
    datasets = [datasource] * len(layernames)
    data_add = {
        "layername": layernames,
        "agfunction": agfunctions,
        "dataset": datasets,
        "layertitle": layertitles,
        "filename_out": filenames,
        "loginfo": loginfos,
    }
    # Add to log dataframe
    df_log = pd.concat([df_log, pd.DataFrame(data_add)], ignore_index=True)
    # Save to csv in settings.outpath
    df_log.to_csv(os.path.join(settings.outpath, "df_log.csv"), index=False)
    return df_log


@jit(nopython=True)
def points_in_circle(circle, arr):
    """
    A generator to return all points whose indices are within a given circle.
    http://stackoverflow.com/a/2774284
    Warning: If a point is near the the edges of the raster it will not loop
    around to the other side of the raster!

    INPUTS
    circle: a tuple of (i0, j0, r) where i0, j0 are the indices of the center of the circle and r is the radius

    arr: a two-dimensional numpy array

    RETURNS
    A generator that yields all points within the circle    
    """
    i0, j0, r = circle

    def intceil(x):
        return int(np.ceil(x))

    for i in range(intceil(i0 - r), intceil(i0 + r)):
        ri = np.sqrt(r**2 - (i - i0) ** 2)
        for j in range(intceil(j0 - ri), intceil(j0 + ri)):
            if (i >= 0 and i < len(arr[:, 0])) and (j >= 0 and j < len(arr[0, :])):
                yield arr[i][j]


def coreg_raster(i0, j0, data, region):
    """
    Coregisters a point with a buffer region of a raster.
    
    INPUTS
        i0: column-index of point of interest
        j0: row-index of point of interest
        data: two-dimensional numpy array (raster)
        region: integer, same units as data resolution

    RETURNS
        pts: all values from array within region
    """
    pts_iterator = points_in_circle((i0, j0, region), data)
    pts = np.array(list(pts_iterator))

    return pts

Functions

def aggregate_multiband(file_list=None, data_dir=None, agg=['mean'], outfile='aggregation')

Aggregates over multiple files but keeps channels independently. Results are written to new tif files.

Parameters

file_list : list of strings: List of files to aggregate
data_dir : string: Path to directory containing files
agg : list of strings: List of aggregation methods to apply
outfile : string: Name of output file

Returns

outfname_list : list of strings of output file names
channel_list : list of strings of channel names
agg_list : list of strings of aggregation methods

Expand source code

def aggregate_multiband(
    file_list=None, data_dir=None, agg=["mean"], outfile="aggregation"
):
    """
    Aggregates over multiple files but keeps channels independently.
    Results are written to new tif files.

    Parameters
    ----------
    file_list : list of strings
        List of files to aggregate
    data_dir : string
        Path to directory containing files
    agg : list of strings
        List of aggregation methods to apply
    outfile : string
        Name of output file

    Returns
    -------
    outfname_list : list of strings of output file names
    channel_list : list of strings of channel names
    agg_list : list of strings of aggregation methods
    """

    if (file_list != None) and (data_dir != None):
        raise RuntimeWarning(
            "file_list and data_dir both set, only the data_dir will be used."
        )

    # Check the aggregation methods are okay
    agg_types = ["mean", "median", "sum", "perc95", "perc5"]
    aggcheck = [a for a in agg if a in agg_types]
    if aggcheck is None:
        raise ValueError("Invalid Aggregation type. Expected any of: %s" % agg_types)
    else:
        print("Finding", aggcheck, " out of possible", agg_types)

    # If a directory has been passed, add all the files to the list
    if data_dir is not None:
        # data_dir = '/path/to/data'
        print("Reading all *.tif files in: ", data_dir)
        file_list = glob(os.path.join(data_dir, "*.tif"))

    # Get metadata from one of the input files
    with rasterio.open(file_list[0]) as src:
        meta = src.meta
        desc = src.descriptions

    meta.update(dtype=rasterio.float32)

    # Append all tif files for each channel as a list of numpy arrays
    array_list = {k: [] for k in desc}

    for x in file_list:
        # print(x)
        data = _read_file(x)
        if data.shape[0] != len(desc):
            print("Band number mismatch between files!")
        for i in range(data.shape[0]):
            # print(i,desc[i])
            array_list[desc[i]].append(data[i, :, :])

    # Perform aggregation over channels axis
    outfname_list = []
    channel_list = []
    agg_list = []
    for i, channel in enumerate(array_list):
        mean_array = np.nanmean(array_list[channel], axis=0)
        median_array = np.nanmedian(array_list[channel], axis=0)
        sum_array = np.nansum(array_list[channel], axis=0)
        mean_array95 = np.nanpercentile(array_list[channel], 95, axis=0)
        mean_array5 = np.nanpercentile(array_list[channel], 5, axis=0)

        aggdict = {}
        aggdict["mean"] = mean_array
        aggdict["median"] = median_array
        aggdict["sum"] = sum_array
        aggdict["perc95"] = mean_array95
        aggdict["perc5"] = mean_array5

        # Write output file
        for a in aggcheck:
            outstring = outfile + "_" + a + "_channel_" + channel + ".tif"
            with rasterio.open(
                outstring, "w", **meta
            ) as dst:
                dst.write(aggdict[a].astype(rasterio.float32), 1)
            print(
                a,
                "of filelist saved in: ",
                outstring,
            )
            outfname_list.append(outstring)
            agg_list.append(a)
            channel_list.append(str(i))
    return outfname_list, channel_list, agg_list

def aggregate_rasters(file_list=None, data_dir=None, agg=['mean'], outfile='aggregation')

Aggregrates over multiple files and over all channels and writes results to new tif file(s).

Parameters

file_list : list of strings: List of files to aggregate
data_dir : string: Path to directory containing files
agg : list of strings: List of aggregation methods to apply (mean, median, sum, perc95, perc5)
outfile : string: Name of output file

Returns

list_outfnames : list of strings of output file names

Expand source code

def aggregate_rasters(
    file_list=None, data_dir=None, agg=["mean"], outfile="aggregation"
):
    """
    Aggregrates over multiple files and over all channels
    and writes results to new tif file(s).

    Parameters
    ----------
    file_list : list of strings
        List of files to aggregate
    data_dir : string
        Path to directory containing files
    agg : list of strings
        List of aggregation methods to apply (mean, median, sum, perc95, perc5)
    outfile : string
        Name of output file

    Returns
    -------
    list_outfnames : list of strings of output file names

    """

    if (file_list != None) and (data_dir != None):
        raise RuntimeWarning(
            "file_list and data_dir both set, only the data_dir will be used."
        )

    # Check the aggregation methods are okay
    agg_types = ["mean", "median", "sum", "perc95", "perc5"]
    aggcheck = [a for a in agg if a in agg_types]
    if aggcheck is None:
        raise ValueError("Invalid Aggregation type. Expected any of: %s" % agg_types)
    else:
        print("Finding", aggcheck, " out of possible", agg_types)

    # If a directory has been passed, add all the files to the list
    if data_dir is not None:
        # data_dir = '/path/to/data'
        print("Reading all *.tif files in: ", data_dir)
        file_list = glob(os.path.join(data_dir, "*.tif"))

    # Get metadata from one of the input files
    with rasterio.open(file_list[0]) as src:
        meta = src.meta

    meta.update(dtype=rasterio.float32)

    # Stack all data/channels as a list of numpy arrays
    array_list = []
    for x in file_list:
        array_list.append(_read_file(x))

    array_list = np.asarray(array_list)

    # Perform aggregation over channels axis
    mean_array = np.nanmean(array_list, axis=0)
    median_array = np.nanmedian(array_list, axis=0)
    sum_array = np.nansum(array_list, axis=0)
    mean_array95 = np.nanpercentile(array_list, 95, axis=0)
    mean_array5 = np.nanpercentile(array_list, 5, axis=0)

    aggdict = {}
    aggdict["mean"] = mean_array
    aggdict["median"] = median_array
    aggdict["sum"] = sum_array
    aggdict["perc95"] = mean_array95
    aggdict["perc5"] = mean_array5

    # Write output file
    list_outfnames = []
    for a in aggcheck:
        with rasterio.open(outfile + "_" + a + ".tif", "w", **meta) as dst:
            dst.write(aggdict[a].astype(rasterio.float32), 1)
        print(a, "of filelist saved in: ", outfile + "_" + a + ".tif")
        list_outfnames.append(outfile + "_" + a + ".tif")
    return list_outfnames

def coreg_raster(i0, j0, data, region)

Coregisters a point with a buffer region of a raster.

INPUTS i0: column-index of point of interest j0: row-index of point of interest data: two-dimensional numpy array (raster) region: integer, same units as data resolution

RETURNS pts: all values from array within region

Expand source code

def coreg_raster(i0, j0, data, region):
    """
    Coregisters a point with a buffer region of a raster.
    
    INPUTS
        i0: column-index of point of interest
        j0: row-index of point of interest
        data: two-dimensional numpy array (raster)
        region: integer, same units as data resolution

    RETURNS
        pts: all values from array within region
    """
    pts_iterator = points_in_circle((i0, j0, region), data)
    pts = np.array(list(pts_iterator))

    return pts

def extract_values_from_rasters(coords, raster_files, method='nearest')

Extract values from a list of raster files at given coordinates using rioxarray. Values will be extracted for all bands in each raster file. Return geopandas DataFrame with extracted values and geometry.

Input

coords: A list of tuples containing longitude and latitude coordinates. Format: [(lng1, lat1), (lng2, lat2), …]

raster_files: A list of raster file paths. Format: ["path/to/raster1.tif", "path/to/raster2.tif", …]

method: The method to select values from raster files for inexact matches between input coords and raster coords: {"nearest", "pad", "ffill", "backfill", "bfill"}, optional - nearest (Default): use nearest valid index value. - pad / ffill: propagate last valid index value forward - backfill / bfill: propagate next valid index value backward - None: only exact matches

Output

A geopandas DataFrame containing the extracted values and geometry, where each row represents a coordinate point and the columns represent the bands for each raster file. Output column names are the raster file name plus the band name.

Expand source code

def extract_values_from_rasters(coords, raster_files, method = "nearest"):
    """
    Extract values from a list of raster files at given coordinates using rioxarray.
    Values will be extracted for all bands in each raster file.
    Return geopandas DataFrame with extracted values and geometry.

    Input:
        coords: A list of tuples containing longitude and latitude coordinates.
                Format: [(lng1, lat1), (lng2, lat2), ...]

        raster_files: A list of raster file paths.
                      Format: ["path/to/raster1.tif", "path/to/raster2.tif", ...]

        method: The method to select values from raster files for 
                inexact matches between input coords and raster coords:
                 {"nearest", "pad", "ffill", "backfill", "bfill"}, optional
            - nearest (Default): use nearest valid index value. 
            - pad / ffill: propagate last valid index value forward
            - backfill / bfill: propagate next valid index value backward
            - None: only exact matches

    Output:
        A geopandas DataFrame containing the extracted values and geometry, where each row represents
        a coordinate point and the columns represent the bands for each raster file.
        Output column names are the raster file name plus the band name.
    """
    all_coords_data = []
    column_names = []

    with spin("Extracting values from raster files...", "blue") as s:
        for raster_file in raster_files:
            # Open the raster file with rioxarray
            ds = rxr.open_rasterio(raster_file)
            
            # Extract values for all coordinates
            coords_data = []
            for lng, lat in coords:
                # Select the nearest lat and lon coordinates from the dataset
                data = ds.sel(x=lng, y=lat, method=method)
                
                # Convert the data to a numpy array and flatten it
                data_array = data.values.flatten().tolist()
                
                # Add the extracted values to the list
                coords_data.append(data_array)

            # Concatenate the extracted values from all raster files
            all_coords_data.append(coords_data)

            # try to det the band names from the dataset, otherwise use the band number
            try:
                band_names = ds.attrs['long_name']
                if isinstance(band_names, str):
                    band_names = [band_names]
                if isinstance(band_names, tuple):
                    band_names = list(band_names)
                if len(band_names) != len(ds.band.values.tolist()):
                    band_names = ds.band.values.tolist()
            except:
                band_names = ds.band.values.tolist()
            # get the raster name
            raster_name = os.path.basename(raster_file).split(".")[0]
            # Add the raster name to the band names
            band_names = [f"{raster_name}_{band_name}" for band_name in band_names]

            # Add the band names to the column names list
            column_names.extend(band_names)

    # Convert the data to a pandas DataFrame and include the column names
    all_coords_data = pd.DataFrame(np.hstack(all_coords_data), columns=column_names)
    
    # Check for potential duplicate column names in all_coords_data
    if all_coords_data.columns.duplicated().any():
        print("Duplicate column names found. Please check the input raster files.")
        # drop duplicate columns and leave only first occurence
        all_coords_data = all_coords_data.loc[:,~all_coords_data.columns.duplicated()].copy()

    # save all_coords_data with coords as geopackage with geopandas
    gdf = gpd.GeoDataFrame(all_coords_data, geometry=gpd.points_from_xy(coords[:,0], coords[:,1]), crs="EPSG:4326")

    # insert the coords into the dataframe
    gdf.insert(0, 'Longitude', coords[:,0])
    gdf.insert(1, 'Latitude', coords[:,1])

    return gdf

def init_logtable()

Create a log table to store information from the raster download or processing.

Returns

df_log: dataframe to update

Expand source code

def init_logtable():
    """
    Create a log table to store information from the raster download or processing.

    RETURNS:
        df_log: dataframe to update
    """
    return pd.DataFrame(
        columns=[
            "layername",
            "agfunction",
            "dataset",
            "layertitle",
            "filename_out",
            "loginfo",
        ]
    )

def msg_dl(message, log=False)

Prints a downloading message

Expand source code

def msg_dl(message, log=False):
    """Prints a downloading message"""
    if log:
        logging.info(message)
    cprint("\u29e9 " + message, color="magenta")

def msg_err(message, log=False)

Prints an error message

Expand source code

def msg_err(message, log=False):
    """Prints an error message"""
    if log:
        logging.error(message)
    cprint("\u2716 " + message, color="red", attrs=["bold"])

def msg_info(message, icon=True, log=False)

Prints an info message

Expand source code

def msg_info(message, icon=True, log=False):
    """Prints an info message"""
    if log:
        logging.info(message)
    if icon:
        cprint("\u2139 " + message, color="magenta")
    else:
        cprint("  " + message, color="magenta")

def msg_success(message, log=False)

Prints a success message

Expand source code

def msg_success(message, log=False):
    """Prints a success message"""
    if log:
        logging.info(message)
    cprint("\u2714 " + message, color="magenta")

def msg_warn(message, log=False)

Prints a warning message

Expand source code

def msg_warn(message, log=False):
    """Prints a warning message"""
    if log:
        logging.warning(message)
    cprint("\u2691 " + message, color="yellow")

def plot_rasters(rasters, longs=None, lats=None, titles=None)

Plots multiple raster files (.tif) on a grid of nicely arranged figures.

Parameters

raster: list of filenames (.tif). Will only read the first band/channel if multiband. longs: optional x values in list like object for plotting as points over raster images. lats: optional x values in list like object for plotting as points over raster images. titles: title of plot default is raster file name.

Returns:
None

Expand source code

def plot_rasters(rasters, longs=None, lats=None, titles=None):
    """
    Plots multiple raster files (.tif) on a grid of nicely arranged figures.

    Parameters:
        raster: list of filenames (.tif).
            Will only read the first band/channel if multiband.
        longs: optional x values in list like object for plotting as points 
            over raster images.
        lats: optional x values in list like object for plotting as points
            over raster images.
        titles: title of plot default is raster file name.

    Returns:  
        None
    """
    # Set the value for reasonable shaped plot based on the number of datasets
    figlen = int(np.ceil(len(rasters) / 3))
    # Make a blank canvas if there is no data
    figlen = 2 if figlen < 2 else figlen

    # Create the figure
    fig, axes = plt.subplots(figlen, 3, figsize=(12, figlen * 3))

    if titles == None:
        titles = rasters
    # Loop through each subplot/axis on the figure.
    # Use counters to know what axes we are up to.
    i, j = 0, 0
    for a, rast in enumerate(rasters):
        if j == 3:
            j = 0
            i += 1

        # print(a,i,j,figlen,rast)
        src = rasterio.open(rast)
        # Only read first Band for flexibility without complexity
        data = src.read(1)
        # Grab the percentiles for pretty plotting of color ranges
        n95 = np.percentile(data, 5)
        n5 = np.percentile(data, 95)

        # Make the plot and clean it up
        show(
            data,
            ax=axes[i, j],
            title=titles[a],
            transform=src.transform,
            cmap="Greys",
            vmin=n95,
            vmax=n5,
        )
        axes[i, j].scatter(longs, lats, s=1, c="r")
        axes[i, j].xaxis.set_major_formatter(FormatStrFormatter("%.2f"))
        axes[i, j].yaxis.set_major_formatter(FormatStrFormatter("%.2f"))
        axes[i, j].locator_params(axis="y", nbins=4)
        axes[i, j].locator_params(axis="x", nbins=4)

        j += 1

    fig.tight_layout()
    plt.show()

def points_in_circle(circle, arr)

A generator to return all points whose indices are within a given circle. http://stackoverflow.com/a/2774284 Warning: If a point is near the the edges of the raster it will not loop around to the other side of the raster!

INPUTS circle: a tuple of (i0, j0, r) where i0, j0 are the indices of the center of the circle and r is the radius

arr: a two-dimensional numpy array

RETURNS A generator that yields all points within the circle

Expand source code

@jit(nopython=True)
def points_in_circle(circle, arr):
    """
    A generator to return all points whose indices are within a given circle.
    http://stackoverflow.com/a/2774284
    Warning: If a point is near the the edges of the raster it will not loop
    around to the other side of the raster!

    INPUTS
    circle: a tuple of (i0, j0, r) where i0, j0 are the indices of the center of the circle and r is the radius

    arr: a two-dimensional numpy array

    RETURNS
    A generator that yields all points within the circle    
    """
    i0, j0, r = circle

    def intceil(x):
        return int(np.ceil(x))

    for i in range(intceil(i0 - r), intceil(i0 + r)):
        ri = np.sqrt(r**2 - (i - i0) ** 2)
        for j in range(intceil(j0 - ri), intceil(j0 + ri)):
            if (i >= 0 and i < len(arr[:, 0])) and (j >= 0 and j < len(arr[0, :])):
                yield arr[i][j]

def raster_query(longs, lats, rasters, titles=None)

DEPRECATED: Use extract_values_from_rasters instead.

given a longitude,latitude value, return the value at that point of the first channel/band in the raster/tif.

INPUTS longs:list of longitudes lats:list of latitudes rasters:list of raster filenames (as strings) titles:list of column titles (as strings) that correspond to rasters (if none provided, rasternames will be used)

RETURNS gdf: geopandas dataframe where each row is long/lat point, and columns are rasterfiles

Expand source code

def raster_query(longs, lats, rasters, titles=None):
    """
    DEPRECATED: Use extract_values_from_rasters instead.

    given a longitude,latitude value, return the value at that point of the
        first channel/band in the raster/tif.

    INPUTS
        longs:list of longitudes
        lats:list of latitudes
        rasters:list of raster filenames (as strings)
        titles:list of column titles (as strings) that correspond to rasters (if none provided, rasternames will be used)

    RETURNS
        gdf: geopandas dataframe where each row is long/lat point,
            and columns are rasterfiles
    """

    # Setup the dataframe to store the ML data
    gdf = gpd.GeoDataFrame(
        {"Longitude": longs, "Latitude": lats},
        geometry=gpd.points_from_xy(longs, lats),
        crs="EPSG:4326",
    )

    # Loop through each raster
    for filepath in rasters:
        filename = Path(filepath).resolve().stem
        # print("Opening:", filename)
        # Open the file:
        raster = rasterio.open(filepath)
        # Get the transformation crs data
        gt = raster.transform

        # This will only be the first band, usally multiband has same index.
        arr = raster.read(1)

        if titles is not None:
            colname = titles[rasters.index(filepath)]
        else:
            colname = Path(filepath).stem
            # colname = filepath.split("/")[-1][:-4]

        # Interogate the tiff file as an array

        # FIXME Check the number of bands and print a warning if more than 1

        # Shape of raster
        # print("Raster pixel size:", np.shape(arr))

        # Slowest part of this function.
        # Speed up with Numba/Dask etc
        # (although previous attempts have not been worth it.)
        # Query the raster at the points of interest
        with spin(f"• {filename} | pixel size: {np.shape(arr)}", "blue") as s:
            values = []
            for (lon, lat) in zip(longs, lats):

                # Convert lat/lon to raster units-index
                point = _get_coords_at_point(gt, lon, lat)

                # This will fail for small areas or on boundaries
                try:
                    val = arr[point[0], point[1]]
                except:
                    # print(lon,lat,point[0],point[1],"has failed.")
                    val = 0
                values.append(val)
            s(1)

        # dd the values at the points to the dataframe
        gdf[filepath] = values
        gdf = gdf.rename(columns={filepath: colname})

    return gdf

def reproj_mask(filepath, bbox, crscode=4326, filepath_out=None)

Clips a raster to the area of a shape, and reprojects.

INPUTS filepath: input filename (tif) bbox: shapely geometry(polygon) defining mask boundary crscode: optional, coordinate reference system as defined by EPSG filepath_out: optional, the optional output filename of the raster. If False, does not save a new file

RETURNS out_img: numpy array of the clipped and reprojected raster

Expand source code

def reproj_mask(filepath, bbox, crscode=4326, filepath_out=None):
    """
    Clips a raster to the area of a shape, and reprojects.

    INPUTS
        filepath: input filename (tif)
        bbox: shapely geometry(polygon) defining mask boundary
        crscode: optional, coordinate reference system as defined by EPSG
        filepath_out: optional, the optional output filename of the raster. If False, 
        does not save a new file

    RETURNS
        out_img: numpy array of the clipped and reprojected raster
    """
    data = rasterio.open(filepath)
    geo = gpd.GeoDataFrame({"geometry": bbox}, index=[0], crs=CRS.from_epsg(crscode))
    geo = geo.to_crs(crs=CRS.from_epsg(crscode))
    coords = _getFeatures(geo)
    out_img, out_transform = mask(data, shapes=coords, crop=True)

    if filepath_out:
        out_meta = data.meta.copy()
        out_meta.update(
            {
                "driver": "GTiff",
                "height": out_img.shape[1],
                "width": out_img.shape[2],
                "transform": out_transform,
                "crs": CRS.from_epsg(crscode),
            }
        )

        with rasterio.open(filepath_out, "w", **out_meta) as dest:
            dest.write(out_img)
        print("Clipped raster written to:", filepath_out)

    return out_img

def reproj_raster(infile, outfile, bbox_out, resolution_out=None, crs_out='EPSG:4326', nodata=0)

Reproject and clip for a given output resolution, crs and bbox. Output file is written to disk.

Parameters

infile : (string) path to input file to reproject
outfile : (string) path to output file tif
bbox_out : (left, bottom, right, top)
resolution_out : (float) resolution of output raster
crs_out : default "EPSG:4326"
nodata : (float) nodata value for output raster

Expand source code

def reproj_raster(
    infile, outfile, bbox_out, resolution_out=None, crs_out="EPSG:4326", nodata=0
):
    """
    Reproject and clip for a given output resolution, crs and bbox.
    Output file is written to disk.

    Parameters
    ----------
    infile : (string) path to input file to reproject
    outfile : (string) path to output file tif
    bbox_out : (left, bottom, right, top)
    resolution_out : (float) resolution of output raster
    crs_out : default "EPSG:4326"
    nodata : (float) nodata value for output raster
    """
    # open input
    with rasterio.open(infile) as src:
        src_transform = src.transform

        width_out = int((bbox_out[2] - bbox_out[0]) / resolution_out)
        height_out = int((bbox_out[3] - bbox_out[1]) / resolution_out)

        # calculate the output transform matrix
        dst_transform, dst_width, dst_height = calculate_default_transform(
            src.crs,  # input CRS
            crs_out,  # output CRS
            width_out,  # output width
            height_out,  # output height
            *bbox_out,  # unpacks input outer boundaries (left, bottom, right, top)
        )

        # set properties for output
        dst_kwargs = src.meta.copy()
        dst_kwargs.update(
            {
                "crs": crs_out,
                "transform": dst_transform,
                "width": dst_width,
                "height": dst_height,
                "nodata": nodata,
            }
        )
        print("Converting to shape:", dst_height, dst_width, "\n Affine", dst_transform)
        # open output
        with rasterio.open(outfile, "w", **dst_kwargs) as dst:
            # iterate through bands and write using reproject function
            for i in range(1, src.count + 1):
                reproject(
                    source=rasterio.band(src, i),
                    destination=rasterio.band(dst, i),
                    src_transform=src.transform,
                    src_crs=src.crs,
                    dst_transform=dst_transform,
                    dst_crs=crs_out,
                    resampling=Resampling.nearest,
                )

def reproj_rastermatch(infile, matchfile, outfile, nodata)

Reproject a file to match the shape and projection of existing raster. Output file is written to disk.

Parameters

infile : (string) path to input file to reproject
matchfile : (string) path to raster with desired shape and projection
outfile : (string) path to output file tif
nodata : (float) nodata value for output raster

Expand source code

def reproj_rastermatch(infile, matchfile, outfile, nodata):
    """
    Reproject a file to match the shape and projection of existing raster.
    Output file is written to disk.

    Parameters
    ----------
    infile : (string) path to input file to reproject
    matchfile : (string) path to raster with desired shape and projection
    outfile : (string) path to output file tif
    nodata : (float) nodata value for output raster
    """
    # open input
    with rasterio.open(infile) as src:
        src_transform = src.transform

        # open input to match
        with rasterio.open(matchfile) as match:
            dst_crs = match.crs

            # calculate the output transform matrix
            dst_transform, dst_width, dst_height = calculate_default_transform(
                src.crs,  # input CRS
                dst_crs,  # output CRS
                match.width,  # input width
                match.height,  # input height
                *match.bounds,  # unpacks input outer boundaries (left, bottom, right, top)
            )

        # set properties for output
        dst_kwargs = src.meta.copy()
        dst_kwargs.update(
            {
                "crs": dst_crs,
                "transform": dst_transform,
                "width": dst_width,
                "height": dst_height,
                "nodata": nodata,
            }
        )
        print(
            "Coregistered to shape:", dst_height, dst_width, "\n Affine", dst_transform
        )
        # open output
        with rasterio.open(outfile, "w", **dst_kwargs) as dst:
            # iterate through bands and write using reproject function
            for i in range(1, src.count + 1):
                reproject(
                    source=rasterio.band(src, i),
                    destination=rasterio.band(dst, i),
                    src_transform=src.transform,
                    src_crs=src.crs,
                    dst_transform=dst_transform,
                    dst_crs=dst_crs,
                    resampling=Resampling.nearest,
                )

def spin(message=None, colour='magenta', events=1, log=False)

Spin animation as a progress inidicator

Expand source code

def spin(message=None, colour="magenta", events=1, log=False):
    """Spin animation as a progress inidicator"""
    if log:
        logging.info(message)
    return alive_bar(events, title=colored("\u2299 " + message, color=colour))

def update_logtable(df_log, filenames, layernames, datasource, settings, layertitles=[], agfunctions=[], loginfos=[], force=False)

Update the dataframe table with the information from the raster download or processing. The dataframe is simultaneoulsy saved to a csv file in default output directory.

INPUTS df_log: dataframe to update filenames: list of filenames to add to the dataframe (captured in output of getdata_* functions) layernames: list of layernames to add to the dataframe (must be same length as filenames) datasource: datasource of the rasters (e.g. 'SLGA', 'SILO', 'DEA', see settings) settings: settings Namespace object layertitles: list of layer titles to add to the dataframe; if empty or none provided, it will be inferred from settings agfunctions: list of aggregation functions to add to the dataframe; if empty or none provided, it will be inferred from settings loginfos: string or list of log information strings to add to the dataframe;

RETURNS df_log: updated dataframe

Expand source code

def update_logtable(
    df_log,
    filenames,
    layernames,
    datasource,
    settings,
    layertitles=[],
    agfunctions=[],
    loginfos=[],
    force=False
):
    """
    Update the dataframe table with the information from the raster download or processing.
    The dataframe is simultaneoulsy saved to a csv file in default output directory.

    INPUTS
    df_log: dataframe to update
    filenames: list of filenames to add to the dataframe (captured in output of getdata_* functions)
    layernames: list of layernames to add to the dataframe (must be same length as filenames)
    datasource: datasource of the rasters (e.g. 'SLGA', 'SILO', 'DEA', see settings)
    settings: settings Namespace object
    layertitles: list of layer titles to add to the dataframe; if empty or none provided, it will be inferred from settings
    agfunctions: list of aggregation functions to add to the dataframe; if empty or none provided, it will be inferred from settings
    loginfos: string or list of log information strings to add to the dataframe;

    RETURNS
    df_log: updated dataframe
    """
    # First automatically check consistency of inputs and set defaults if necessary
    if len(filenames) != len(layernames):
        print(
            "Error: Number of filenames does not match number of layernames. Dataframe not updated."
        )
        return df_log
    if type(agfunctions) == str:
        agfunctions = [agfunctions] * len(layernames)
    if agfunctions == []:
        try:
            if "agfunctions" in settings.target_sources["SLGA"]:
                agfunctions = settings.target_sources[datasource]["agfunctions"]
            else:
                agfunctions = list(settings.target_sources[datasource].values())
                # flatten possible list of lists
                # check if list of lists
                if type(agfunctions[0]) == list:
                    agfunctions = [item for sublist in agfunctions for item in sublist]
            if agfunctions == None:
                agfunctions = ["None"] * len(layernames)
        except:
            agfunctions = ["None"] * len(layernames)

    if len(agfunctions) != len(layernames):
        print(
            "Error: Number of agfunctions does not match number of layernames. Dataframe not updated."
        )
        return df_log
    if layertitles == []:
        layertitles = [
            layernames[i] + "_" + agfunctions[i] for i in range(len(layernames))
        ]

    # check if you are adding a duplicate entry to the log
    for f in filenames:
        warnings.simplefilter(action="ignore", category=FutureWarning)
        if f in df_log.filename_out.values:
            if force == False:
                print("Error: " + str(f) + " exists in df_log! Dataframe not updated.\nCheck your inputs or overwrite with force=True")
                return df_log
            elif force==True:
                print("Warning: " + str(f) + " exists in df_log and has been overitten by force=True")
                df_log.drop(df_log[df_log.filename_out != f].index, inplace=True)

    # check if loginfos is a list or a string
    if type(loginfos) == str:
        loginfos = [loginfos] * len(layernames)
    else:
        if loginfos == []:
            loginfos = ["processed"] * len(layernames)
        elif len(loginfos) != len(layernames):
            print(
                "Error: Number of loginfos does not match number of layernames. Dataframe not updated."
            )
            return df_log
    datasets = [datasource] * len(layernames)
    data_add = {
        "layername": layernames,
        "agfunction": agfunctions,
        "dataset": datasets,
        "layertitle": layertitles,
        "filename_out": filenames,
        "loginfo": loginfos,
    }
    # Add to log dataframe
    df_log = pd.concat([df_log, pd.DataFrame(data_add)], ignore_index=True)
    # Save to csv in settings.outpath
    df_log.to_csv(os.path.join(settings.outpath, "df_log.csv"), index=False)
    return df_log