SIH Tech Tidbits

Useful tips, libraries and tools from the Sydney Informatics Hub team

A comparison of piping dataframe operations in R Tidyverse vs Python Pandas


Getting the dataset

Flight data downloaded from https://raw.githubusercontent.com/apache-superset/examples-data/master/tutorial_flights.csv:

R

library(dplyr)
library(readr)

data <- read_csv("https://raw.githubusercontent.com/apache-superset/examples-data/master/tutorial_flights.csv")
> data %>% glimpse()
Rows: 2,954
Columns: 20
$ Department                <chr> "Orange Department", "Yellow Department", "Yell…
$ Cost                      <dbl> 81.52, 343.98, 25.98, 12.99, 32.97, 61.48, 159.…
$ `Travel Class`            <chr> "Economy", "Economy", "Economy", "Economy", "Ec…
$ `Ticket Single or Return` <chr> "Return", "Return", "Return", "Return", "Return…
$ Airline                   <chr> "Flybe", "Flybe", "Flybe", "Flybe", "Flybe", "F…
$ `Travel Date`             <date> 2011-02-10, 2011-02-11, 2011-03-02, 2011-03-09…
$ `Origin ICAO`             <chr> "EGPF", "EGPF", "EGHI", "EGHI", "EGPE", "EGPE",…
$ `Origin Name`             <chr> "Glasgow International Airport", "Glasgow Inter…
$ `Origin Municipality`     <chr> "Glasgow", "Glasgow", "Southampton", "Southampt…
$ `Origin Region`           <chr> "Scotland", "Scotland", "England", "England", "…
$ `Origin Country`          <chr> "United Kingdom", "United Kingdom", "United Kin…
$ `Origin Latitude`         <dbl> 55.8719, 55.8719, 50.9503, 50.9503, 57.5425, 57…
$ `Origin Longitude`        <dbl> -4.433060, -4.433060, -1.356800, -1.356800, -4.…
$ `Destination ICAO`        <chr> "EGKK", "EGHI", "EGPB", "EPKK", "EGKK", "EGBB",…
$ `Destination Name`        <chr> "London Gatwick Airport", "Southampton Airport"…
$ `Destination Region`      <chr> "England", "England", "Scotland", "Lesser Polan…
$ `Destination Country`     <chr> "United Kingdom", "United Kingdom", "United Kin…
$ `Destination Latitude`    <chr> "51.148102", "50.9502983093262", "59.8788986206…
$ `Destination Longitude`   <chr> "-0.190278", "-1.35679996013641", "-1.295560002…
$ Distance                  <chr> "595", "584", "993", "1493", "753", "584", "508…

Python

import pandas as pd

data = pd.read_csv("https://raw.githubusercontent.com/apache-superset/examples-data/master/tutorial_flights.csv")
>>> data.columns
Index(['Department', 'Cost', 'Travel Class', 'Ticket Single or Return',
       'Airline', 'Travel Date', 'Origin ICAO', 'Origin Name',
       'Origin Municipality', 'Origin Region', 'Origin Country',
       'Origin Latitude', 'Origin Longitude', 'Destination ICAO',
       'Destination Name', 'Destination Region', 'Destination Country',
       'Destination Latitude', 'Destination Longitude', 'Distance'],
      dtype='object')

Creating New columns from old ones

R

> data %>%
  group_by(`Destination Country`) %>%
  mutate(
    total_cost_per_country = sum(Cost),
    tot_cost_country_euro = total_cost_per_country * 1.3
  ) %>%
  glimpse()
Rows: 2,954
Columns: 22
Groups: Destination Country [65]
$ Department                <chr> "Orange Department", "Yellow Department", "Yello…
$ Cost                      <dbl> 81.52, 343.98, 25.98, 12.99, 32.97, 61.48, 159.8…
$ `Travel Class`            <chr> "Economy", "Economy", "Economy", "Economy", "Eco…
$ `Ticket Single or Return` <chr> "Return", "Return", "Return", "Return", "Return"…
$ Airline                   <chr> "Flybe", "Flybe", "Flybe", "Flybe", "Flybe", "Fl…
$ `Travel Date`             <date> 2011-02-10, 2011-02-11, 2011-03-02, 2011-03-09,…
$ `Origin ICAO`             <chr> "EGPF", "EGPF", "EGHI", "EGHI", "EGPE", "EGPE", …
$ `Origin Name`             <chr> "Glasgow International Airport", "Glasgow Intern…
$ `Origin Municipality`     <chr> "Glasgow", "Glasgow", "Southampton", "Southampto…
$ `Origin Region`           <chr> "Scotland", "Scotland", "England", "England", "S…
$ `Origin Country`          <chr> "United Kingdom", "United Kingdom", "United King…
$ `Origin Latitude`         <dbl> 55.8719, 55.8719, 50.9503, 50.9503, 57.5425, 57.…
$ `Origin Longitude`        <dbl> -4.433060, -4.433060, -1.356800, -1.356800, -4.0…
$ `Destination ICAO`        <chr> "EGKK", "EGHI", "EGPB", "EPKK", "EGKK", "EGBB", …
$ `Destination Name`        <chr> "London Gatwick Airport", "Southampton Airport",…
$ `Destination Region`      <chr> "England", "England", "Scotland", "Lesser Poland…
$ `Destination Country`     <chr> "United Kingdom", "United Kingdom", "United King…
$ `Destination Latitude`    <chr> "51.148102", "50.9502983093262", "59.87889862060…
$ `Destination Longitude`   <chr> "-0.190278", "-1.35679996013641", "-1.2955600023…
$ Distance                  <chr> "595", "584", "993", "1493", "753", "584", "508"…
$ total_cost_per_country    <dbl> 376846.33, 376846.33, 376846.33, 4968.51, 376846…
$ tot_cost_country_euro     <dbl> 489900.229, 489900.229, 489900.229, 6459.063, 48…

Python

Same as above, and printing out the first 5 rows of the last 5 columns:

(
    data.assign(
        total_cost_per_country=lambda x: (
            x[["Destination Country", "Cost"]].groupby("Destination Country")["Cost"]
            .transform("sum")
        ),
        tot_cost_country_euro=lambda x: x["total_cost_per_country"] * 1.3
    )
    .iloc[:5, -5:]
    .to_markdown()
)

Code output:

Destination Latitude Destination Longitude Distance total_cost_per_country tot_cost_country_euro
0 51.1481 -0.190278 595 376846.33 489900.23
1 50.9503 -1.3568 584 376846.33 489900.23
2 59.8789 -1.29556 993 376846.33 489900.23
3 50.0777 19.7848 1493 4968.51 6459.06
4 51.1481 -0.190278 753 376846.33 489900.23

Data viz example

A classic groupby, apply aggregation and filtering and plot stuffs

R

data %>%
  filter(`Origin Country` == "Germany") %>%
  # glimpse()
  select(`Origin Municipality`, Cost, Department) %>%
  group_by(`Origin Municipality`) %>%
  mutate(total_cost = sum(Cost)) %>%
  ggplot(data = ., mapping = aes(x = `Origin Municipality`, y = total_cost, fill = Department)) +
  geom_bar(stat = "identity")

Output:

Python

(
    data[data["Origin Country"] == "Germany"]
    .groupby(["Origin Municipality", "Department"])
    .agg(
        total_cost=pd.NamedAgg(column="Cost", aggfunc="sum"),
    )
    .reset_index()
    .pivot_table(
        index=["Origin Municipality"],
        columns=["Department"],
        values=["total_cost"]
    )
    .plot.bar(stacked=True)
    .get_figure()
)

Output:

Data pipelines

A classic data pipeline with loading, processing, writing steps:

R

read_data <- function(filename) {
  df  <- read_csv(filename, ...)
  return(df)
}

process_data <- function(df) {
  df_out <- df %>%
    group_by("group_col") %>%
    mutate(
      new_col1 = max(col1),
      new_col2 = sum(col2)
    )
  ...
  return(df_out)
}

write_data <- function(df, outfilename) {
  df_out <- df %>%
    select(-col1, -col2)
  write_csv(df_out, outfilename)

  return(df_out)
}
data <- read_data(filename = "path/to/input/file") %>%
  process_data(df = .) %>%
  write_data(df = ., outfilename = "path/to/output/file")

Python

def read_data(filename: str) -> Dataframe:
    df = pd.read_csv(filename, colums = , ...)
    return df

def process_data(df: Dataframe):
    df_out = df
    df_out[["new_col1", "new_col2"]] = (
        df[["group_col", "col1", "col2"]].groupby("group_col")
        .agg({
            "col1": "max",
            "col2": "sum",
        })
    )
    ...
    return df_out

def write_data(df: Dataframe, outfilename):
    df_out = df.drop(["col1", "col2"], axis=1)
    df_out.write(outfilename)
    return df_out
data = (
    read_data("path/to/input/file")
    .pipe(process_data)
    .pipe(write_data, outfilename="path/to/output/file")
)