Source code for passengersim.summaries.legs

from __future__ import annotations

from collections import defaultdict
from typing import TYPE_CHECKING

import numpy as np
import pandas as pd

from passengersim.config.dataframes import legs_to_dataframe
from passengersim.database import common_queries
from passengersim.reporting import report_figure
from passengersim.utils.nested_dict import from_nested_dict

from .generic import DatabaseTableItem, GenericSimulationTables, SimulationTableItem
from .tools import aggregate_by_concat_dataframe, aggregate_by_summing_dataframe, break_on_integer

if TYPE_CHECKING:
    from collections.abc import Collection

    import altair as alt

    from passengersim import Simulation


def extract_legs(sim: Simulation) -> pd.DataFrame | None:
    """Extract leg-level summary data from a Simulation.

    Parameters
    ----------
    sim : Simulation
        The simulation to read from.  Only ``sim.eng.legs`` is accessed; each
        leg must expose the attributes listed in the record below, and its
        ``carrier`` must expose a ``name``.

    Returns
    -------
    pd.DataFrame or None
        One row per leg, indexed by ``leg_id``, or None when the simulation
        engine has no legs at all.
    """
    leg_data = [
        {
            "leg_id": leg.leg_id,
            "carrier": leg.carrier.name,
            "flt_no": leg.flt_no,
            "orig": leg.orig,
            "dest": leg.dest,
            "gt_sold": leg.gt_sold,
            "gt_capacity": leg.gt_capacity,
            "gt_sold_local": leg.gt_sold_local,
            "gt_revenue": leg.gt_revenue,
            "distance": leg.distance,
        }
        for leg in sim.eng.legs
    ]
    if not leg_data:
        # An empty frame would have no "leg_id" column to index on, so signal
        # "no data" explicitly instead.
        return None
    return pd.DataFrame(leg_data).set_index("leg_id")
class SimTabLegs(GenericSimulationTables):
    """Container for summary tables and figures extracted from a Simulation.

    This class is a subclass of GenericSimulationTables, which is defined
    in the generic module.  It lists the items that are available in the
    SimulationTables class, and provides type hints and (optionally, but
    ideally) documentation for the data that is stored in each item.
    """

    legs: pd.DataFrame = SimulationTableItem(
        aggregation_func=aggregate_by_summing_dataframe("legs", ["carrier", "flt_no", "orig", "dest", "distance"]),
        extraction_func=extract_legs,
        computed_fields={
            "avg_load_factor": "100.0 * gt_sold / gt_capacity",
            "avg_local": "100.0 * gt_sold_local / gt_sold",
            # "avg_sold": "gt_sold / @n_total_samples",
        },
        doc="Leg-level summary data.",
    )

    @property
    def leg_defs(self) -> pd.DataFrame:
        """
        A DataFrame containing the definitions of the legs in the simulation.

        This DataFrame is constructed from the leg definitions defined in the
        simulation config, and does not depend on the simulation results.
        The result is cached in ``self._data`` after the first access.

        Returns
        -------
        pd.DataFrame
        """
        if "leg_defs" not in self._data:
            self._data["leg_defs"] = legs_to_dataframe(self.config.legs).set_index("leg_id")
        return self._data["leg_defs"]

    @property
    def legs_(self) -> pd.DataFrame:
        """
        A DataFrame containing the leg summary data, merged with the leg definitions.

        This DataFrame is constructed by merging the `legs` DataFrame with the
        `leg_defs` DataFrame, so it includes all the summary data for each leg,
        as well as all the attributes of each leg defined in the config.
        Columns already present in `legs` are not duplicated from `leg_defs`.
        The result is cached in ``self._data`` after the first access.

        Returns
        -------
        pd.DataFrame
        """
        if "legs_" not in self._data:
            cols_to_use = self.leg_defs.columns.difference(self.legs.columns).tolist()
            self._data["legs_"] = self.legs.merge(
                self.leg_defs[cols_to_use], left_index=True, right_index=True, how="left"
            )
        return self._data["legs_"]

    @property
    def local_fraction_by_place(self) -> pd.DataFrame:
        """
        The local share of passengers by carrier and place.

        The index of this DataFrame contains all possible places, and the
        columns contain the carriers.  For each carrier and place, this is the
        fraction (between 0 and 1, not a percentage) of leg passengers on legs
        arriving or departing from that place that are local passengers (i.e.
        not connecting passengers).  Passengers are considered connecting
        whether the connection is at this place, or at another place.

        If a carrier does not operate any legs to or from a place, or if legs
        are operated but no passengers are booked (which probably indicates a
        config error), the local share is NaN.

        Returns
        -------
        pd.DataFrame
        """
        if "local_fraction_by_place" not in self._data:
            legs = self.legs
            carriers = legs.carrier.unique()
            # The set of places does not depend on the carrier, so build it
            # once rather than once per carrier.
            places = set(legs.orig.unique()) | set(legs.dest.unique())
            result = defaultdict(dict)
            for carrier in carriers:
                is_carrier = legs.carrier == carrier
                for place in places:
                    touches_place = (legs.orig == place) | (legs.dest == place)
                    totals = legs.loc[is_carrier & touches_place, ["gt_sold", "gt_sold_local"]].sum()
                    if totals["gt_sold"] > 0:
                        result[carrier][place] = float(totals["gt_sold_local"] / totals["gt_sold"])
                    else:
                        result[carrier][place] = np.nan
            result = from_nested_dict(result, dims=["carrier", "place"]).T
            result = result.sort_index().sort_index(axis=1)
            self._data["local_fraction_by_place"] = result
        return self._data["local_fraction_by_place"]

    def _fig_leg_factor_distribution(
        self,
        title: str,
        leg_attr: str,
        cat_attr: str,
        by_carrier: bool | str = True,
        breakpoints: Collection[int] | None = None,
        normalize: bool = False,
        *,
        raw_df: bool = False,
        also_df: bool = False,
    ) -> alt.Chart | pd.DataFrame | tuple[alt.Chart, pd.DataFrame]:
        """
        Figure showing the distribution of leg factors.

        Parameters
        ----------
        title : str
            The title of the figure.  When `by_carrier` is True, " by Carrier"
            is appended; when `by_carrier` is a carrier name, " (name)" is
            appended unless the title already ends with it.
        leg_attr : str
            The attribute of the leg to use for the distribution.  This should
            be a percentage value that ranges 0-100, such as "avg_load_factor"
            or "avg_local".
        cat_attr : str
            The name to use for labeling categories in the resulting figure.
        by_carrier : bool or str, default True
            If True, show the distribution by carrier.  If a string, show the
            distribution for that carrier.  If False, show the distribution
            aggregated over all carriers.
        breakpoints : Collection[int, ...], default (25, 30, 35, 40, ..., 90, 95)
            The breakpoints for the load factor ranges, which represent the
            lowest load factor value in each bin.  The first and last
            breakpoints are always bounded to 0 and 101, respectively; these
            bounds can be included explicitly or omitted to be included
            implicitly.  Setting the top value to 101 ensures that the highest
            load factor value (100) is included in the last bin.
        normalize : bool, default False
            If True, normalize the frequencies so that they sum to 1.  When
            showing by carrier, each carrier is normalized separately; for a
            single carrier or for the all-carrier aggregate, the displayed
            frequencies are normalized as a whole.
        raw_df : bool, default False
            Return the raw data for this figure as a pandas DataFrame, instead
            of generating the figure itself.
        also_df : bool, default False
            If True, return the raw data for this figure as a pandas DataFrame,
            in addition to the figure itself.

        Returns
        -------
        alt.Chart or pd.DataFrame or tuple[alt.Chart, pd.DataFrame]
        """
        if breakpoints is None:
            breakpoints = range(25, 100, 5)  # default breakpoints
        leg_cat = f"{leg_attr}_category"
        categories = break_on_integer(
            self.legs[leg_attr],
            breakpoints,
            result_name=leg_cat,
        )
        df_for_chart = (
            self.legs.assign(**{leg_cat: categories})
            .groupby(["carrier", leg_cat], observed=False)
            .size()
            .rename("frequency")
            .reset_index()
        )

        # The string case must be tested first: a carrier name is truthy, so
        # testing ``normalize and by_carrier`` first would normalize every
        # carrier and never filter down to the requested one.
        if isinstance(by_carrier, str):
            df_for_chart = df_for_chart[df_for_chart["carrier"] == by_carrier]
            df_for_chart = df_for_chart.drop(columns=["carrier"])
            if normalize:
                df_for_chart["frequency"] = df_for_chart["frequency"] / df_for_chart["frequency"].sum()
        elif not by_carrier:
            df_for_chart = df_for_chart.groupby([leg_cat], observed=False).frequency.sum().reset_index()
            if normalize:
                df_for_chart["frequency"] = df_for_chart["frequency"] / df_for_chart["frequency"].sum()
        elif normalize:
            df_for_chart["frequency"] = df_for_chart.groupby("carrier")["frequency"].transform(lambda x: x / x.sum())

        freq_label = "Relative Frequency" if normalize else "Count"
        if raw_df:
            return df_for_chart

        import altair as alt

        if by_carrier is True:
            chart = (
                alt.Chart(df_for_chart)
                .mark_bar()
                .encode(
                    x=alt.X(leg_cat, title=cat_attr),
                    y=alt.Y("frequency:Q", title=freq_label),
                    color=alt.Color("carrier:N", title="Carrier"),
                    facet=alt.Facet("carrier:N", columns=2, title="Carrier"),
                    tooltip=[
                        alt.Tooltip("carrier", title="Carrier"),
                        alt.Tooltip(leg_cat, title=cat_attr),
                        alt.Tooltip("frequency", title=freq_label),
                    ],
                )
                .properties(width=300, height=250, title=f"{title} by Carrier")
            )
        else:
            tooltip = [
                alt.Tooltip(leg_cat, title=cat_attr),
                alt.Tooltip("frequency", title=freq_label),
            ]
            # The carrier column is gone once the data has been filtered to a
            # single carrier or aggregated over all carriers; referencing a
            # missing field makes Altair fail when the chart is serialized.
            if "carrier" in df_for_chart.columns:
                tooltip.insert(0, alt.Tooltip("carrier", title="Carrier"))

            chart_title = title
            if by_carrier:
                # Callers may already have appended the carrier to the title;
                # do not repeat it.
                suffix = f" ({by_carrier})"
                if not chart_title.endswith(suffix):
                    chart_title += suffix

            chart = (
                alt.Chart(df_for_chart)
                .mark_bar()
                .encode(
                    x=alt.X(leg_cat, title=cat_attr),
                    y=alt.Y("frequency:Q", title=freq_label),
                    tooltip=tooltip,
                )
                .properties(
                    width=600,
                    height=400,
                    title=chart_title,
                )
            )
        if also_df:
            return chart, df_for_chart
        return chart

    @report_figure
    def fig_leg_load_factor_distribution(
        self,
        by_carrier: bool | str = True,
        breakpoints: Collection[int] | None = None,
        normalize: bool = False,
        *,
        raw_df: bool = False,
        also_df: bool = False,
    ) -> alt.Chart | pd.DataFrame | tuple[alt.Chart, pd.DataFrame]:
        """
        Figure showing the distribution of leg load factors.

        Parameters
        ----------
        by_carrier : bool or str, default True
            If True, show the distribution by carrier.  If a string, show the
            distribution for that carrier.  If False, show the distribution
            aggregated over all carriers.
        breakpoints : Collection[int, ...], default (25, 30, 35, 40, ..., 90, 95)
            The breakpoints for the load factor ranges, which represent the
            lowest load factor value in each bin.  The first and last
            breakpoints are always bounded to 0 and 101, respectively; these
            bounds can be included explicitly or omitted to be included
            implicitly.  Setting the top value to 101 ensures that the highest
            load factor value (100) is included in the last bin.
        normalize : bool, default False
            If True, normalize the frequency by the total number of legs for
            each carrier, so that the sum of the frequencies for each carrier
            is 1.
        raw_df : bool, default False
            Return the raw data for this figure as a pandas DataFrame, instead
            of generating the figure itself.
        also_df : bool, default False
            If True, return the raw data for this figure as a pandas DataFrame,
            in addition to the figure itself.

        Returns
        -------
        alt.Chart or pd.DataFrame or tuple[alt.Chart, pd.DataFrame]
        """
        title = "Load Factor Relative Frequency" if normalize else "Load Factor Frequency"
        if isinstance(by_carrier, str):
            title += f" ({by_carrier})"
        return self._fig_leg_factor_distribution(
            title=title,
            leg_attr="avg_load_factor",
            cat_attr="Load Factor Range",
            by_carrier=by_carrier,
            breakpoints=breakpoints,
            normalize=normalize,
            raw_df=raw_df,
            also_df=also_df,
        )

    @report_figure
    def fig_leg_local_share_distribution(
        self,
        by_carrier: bool | str = True,
        breakpoints: Collection[int] | None = None,
        normalize: bool = False,
        *,
        raw_df: bool = False,
        also_df: bool = False,
    ) -> alt.Chart | pd.DataFrame | tuple[alt.Chart, pd.DataFrame]:
        """
        Figure showing the distribution of leg local shares.

        The local share is the percentage of passengers on a leg that are
        local to the leg's origin and destination (i.e. not connecting).

        Parameters
        ----------
        by_carrier : bool or str, default True
            If True, show the distribution by carrier.  If a string, show the
            distribution for that carrier.  If False, show the distribution
            aggregated over all carriers.
        breakpoints : Collection[int, ...], default (0, 10, 20, ..., 80, 90)
            The breakpoints for the local share ranges, which represent the
            lowest local share value in each bin.  The first and last
            breakpoints are always bounded to 0 and 101, respectively; these
            bounds can be included explicitly or omitted to be included
            implicitly.  Setting the top value to 101 ensures that the highest
            local share value (100) is included in the last bin.
        normalize : bool, default False
            If True, normalize the frequency by the total number of legs for
            each carrier, so that the sum of the frequencies for each carrier
            is 1.
        raw_df : bool, default False
            Return the raw data for this figure as a pandas DataFrame, instead
            of generating the figure itself.
        also_df : bool, default False
            If True, return the raw data for this figure as a pandas DataFrame,
            in addition to the figure itself.

        Returns
        -------
        alt.Chart or pd.DataFrame or tuple[alt.Chart, pd.DataFrame]
        """
        if breakpoints is None:
            breakpoints = range(0, 100, 10)
        title = "Local Share Relative Frequency" if normalize else "Local Share Frequency"
        if isinstance(by_carrier, str):
            title += f" ({by_carrier})"
        return self._fig_leg_factor_distribution(
            title=title,
            leg_attr="avg_local",
            cat_attr="Local Share Range",
            by_carrier=by_carrier,
            breakpoints=breakpoints,
            normalize=normalize,
            raw_df=raw_df,
            also_df=also_df,
        )

    def _leg_filtering(
        self,
        orig: str | None = None,
        dest: str | None = None,
        place: str | None = None,
        carrier: str | None = None,
    ) -> tuple[pd.DataFrame, alt.Color]:
        """
        Filter the legs table and choose a matching color encoding.

        A ``capacity`` column (``gt_capacity`` divided by
        ``self.n_total_samples``) is added before filtering.  Filters are
        applied cumulatively, in the order carrier, orig, dest, place.

        Parameters
        ----------
        orig, dest : str or None
            Keep only legs with this origin / destination.
        place : str or None
            Keep only legs that start or end at this place.  An
            ``other_place`` column holding the opposite endpoint is added.
        carrier : str or None
            Keep only legs operated by this carrier.

        Returns
        -------
        tuple[pd.DataFrame, alt.Color]
            The filtered legs, and a color encoding.  The color defaults to
            carrier; each of the orig / dest / place filters, when given,
            switches it to the remaining endpoint if that endpoint has fewer
            than 11 distinct values, with later filters taking precedence.
        """
        import altair as alt

        df = self.legs.assign(capacity=self.legs.gt_capacity / self.n_total_samples)
        color = alt.Color("carrier:N", title="Carrier")
        if carrier:
            df = df[df.carrier == carrier]
        if orig:
            df = df[df.orig == orig]
            if len(df.dest.unique()) < 11:
                color = alt.Color("dest:N", title="Destination")
        if dest:
            df = df[df.dest == dest]
            if len(df.orig.unique()) < 11:
                color = alt.Color("orig:N", title="Origin")
        if place:
            df = df[(df.orig == place) | (df.dest == place)]
            # other_place is whichever endpoint is not `place`.
            df = df.assign(other_place=df.orig.where(df.orig != place, df.dest))
            if len(df.other_place.unique()) < 11:
                color = alt.Color("other_place:N", title="Other Place")
        return df, color

    @report_figure
    def fig_leg_load_v_local(
        self,
        *,
        orig: str | None = None,
        dest: str | None = None,
        place: str | None = None,
        carrier: str | None = None,
        raw_df: bool = False,
        also_df: bool = False,
        facet_columns: int | None = 2,
        select_leg: bool = False,
    ) -> alt.Chart | pd.DataFrame | tuple[alt.Chart, pd.DataFrame]:
        """
        Figure showing the relationship between leg load factor and local share.

        Parameters
        ----------
        orig : str or None, default None
            Filter the data to only include legs with this origin.
        dest : str or None, default None
            Filter the data to only include legs with this destination.
        place : str or None, default None
            Filter the data to only include legs with this origin or
            destination.
        carrier : str or None, default None
            Filter the data to only include legs operated by this carrier.
        raw_df : bool, default False
            If True, return the raw data for this figure as a pandas DataFrame,
            instead of generating the figure itself.
        also_df : bool, default False
            If True, return the raw data for this figure as a pandas DataFrame,
            in addition to the figure itself.
        facet_columns : int or None, default 2
            The number of columns to use for faceting the plot by carrier.
            If None, all facets will appear on one row.
        select_leg : bool, default False
            If True, return an interactive widget that allows the user to
            select specific legs and view their path_legs.  This feature is
            experimental and may change without notice.

        Returns
        -------
        alt.Chart or pd.DataFrame or tuple
            When `select_leg` is True, an ipywidgets ``VBox`` holding the
            chart and a linked sub-chart is returned in place of the chart.
        """
        import altair as alt

        df, color = self._leg_filtering(orig=orig, dest=dest, place=place, carrier=carrier)
        if raw_df:
            return df

        chart = (
            alt.Chart(df.reset_index())
            .mark_point()
            .encode(
                x=alt.X("avg_local:Q", title="Leg Local Share"),
                y=alt.Y("avg_load_factor:Q", title="Leg Load Factor"),
                size=alt.Size("capacity:Q").scale(zero=True),  # set zero to False for more contrast
                facet=alt.Facet("carrier:N", columns=facet_columns),
                tooltip=[
                    "leg_id",
                    alt.Tooltip("carrier", title="Carrier"),
                    alt.Tooltip("flt_no", title="Flight No"),
                    alt.Tooltip("orig", title="Orig"),
                    alt.Tooltip("dest", title="Dest"),
                    alt.Tooltip("capacity", title="Capacity", format=",.0f"),
                    alt.Tooltip("avg_local", title="Local Share", format=",.2f"),
                    alt.Tooltip("avg_load_factor", title="Load Factor", format=",.2f"),
                ],
                color=color,
            )
        )

        if select_leg:
            point_sel = alt.selection_point(name="point")
            # Shift-drag brushes a region; a plain drag pans/zooms the scales.
            brush_sel = alt.selection_interval(
                name="brush",
                on="[mousedown[event.shiftKey], mouseup] > mousemove",
                translate="[mousedown[event.shiftKey], mouseup] > mousemove!",
            )
            zoom = alt.selection_interval(
                name="zoom",
                bind="scales",
                on="[mousedown[!event.shiftKey], mouseup] > mousemove",
                translate="[mousedown[!event.shiftKey], mouseup] > mousemove!",
            )

            # add_params replaces the deprecated add_selection.
            chart_widget = alt.JupyterChart(chart.add_params(point_sel, brush_sel, zoom))

            from ipywidgets import VBox

            subchart_widget = alt.JupyterChart(self.fig_select_leg_analysis([]))

            def on_select_point(change):
                sel = change.new.value
                subchart_widget.chart = self.fig_select_leg_analysis(df.index[sel])

            def on_select_brush(change):
                try:
                    sel = change.new.value
                    if sel is None or "avg_local" not in sel:
                        filtered = df.iloc[:0]
                    else:
                        # The facet's carrier is taken from the trailing part
                        # of the selection store's "unit" name.
                        carrier_name = change.new.store[0]["unit"].split("_")[-1]
                        sel_local = sel["avg_local"]
                        sel_load = sel["avg_load_factor"]
                        filter_query = (
                            f"{sel_local[0]} <= `avg_local` <= {sel_local[1]} and "
                            f"{sel_load[0]} <= `avg_load_factor` <= {sel_load[1]}"
                        )
                        filter_query += f" and `carrier` == '{carrier_name}'"
                        filtered = df.query(filter_query)
                    subchart_widget.chart = self.fig_select_leg_analysis(filtered.index)
                except Exception:
                    # Deliberately best-effort: a failure inside a widget
                    # callback should blank the sub-chart, not break the UI.
                    subchart_widget.chart = alt.Chart().mark_point()

            chart_widget.selections.observe(on_select_point, ["point"])
            chart_widget.selections.observe(on_select_brush, ["brush"])

            chart_stack = VBox(
                [
                    chart_widget,
                    subchart_widget,
                ]
            )
            if also_df:
                return chart_stack, df
            return chart_stack

        if also_df:
            return chart.interactive(), df
        return chart.interactive()

    @report_figure
    def fig_leg_load_v_distance(
        self,
        *,
        orig: str | None = None,
        dest: str | None = None,
        place: str | None = None,
        carrier: str | None = None,
        raw_df: bool = False,
        also_df: bool = False,
        facet_columns: int | None = 2,
        beeswarm: int | tuple[int, float] = 0,
    ):
        """
        Figure showing leg load factor against distance, colored by local share.

        Parameters
        ----------
        orig : str or None, default None
            Filter the data to only include legs with this origin.
        dest : str or None, default None
            Filter the data to only include legs with this destination.
        place : str or None, default None
            Filter the data to only include legs with this origin or
            destination.
        carrier : str or None, default None
            Filter the data to only include legs operated by this carrier.
        raw_df : bool, default False
            If True, return the filtered legs data as a pandas DataFrame,
            instead of generating the figure itself.
        also_df : bool, default False
            If True, return the data used for the chart as a pandas DataFrame,
            in addition to the figure itself.
        facet_columns : int or None, default 2
            Accepted for signature parity with the other leg figures; this
            figure is not faceted and does not use it.
        beeswarm : int or tuple[int, float], default 0
            If truthy, the points are repositioned with
            ``passengersim.utils.beeswarm.beeswarm`` before plotting.  A tuple
            is passed on as ``(n_hex, aspect_ratio)``; a bare int is used as
            ``n_hex`` with an aspect ratio of 1.25.

        Returns
        -------
        alt.Chart or pd.DataFrame or tuple[alt.Chart, pd.DataFrame]
        """
        import altair as alt

        df, _color = self._leg_filtering(orig=orig, dest=dest, place=place, carrier=carrier)
        if raw_df:
            return df

        # Plot the *filtered* legs.  _leg_filtering has already added the
        # `capacity` column, so rebuilding the frame from self.legs here would
        # only discard the orig/dest/place/carrier filters.
        df = df.reset_index()

        tooltips = [
            "leg_id",
            "flt_no",
            "orig",
            "dest",
            "distance",
            "avg_load_factor",
            "carrier",
            "capacity",
            "avg_local",
        ]

        if beeswarm:
            from passengersim.utils.beeswarm import beeswarm as beeswarm_func

            if isinstance(beeswarm, int):
                # default to aspect ratio of 1.25
                beeswarm = (beeswarm, 1.25)
            df = beeswarm_func(
                df,
                "distance",
                "avg_load_factor",
                n_hex=beeswarm[0],
                aspect_ratio=beeswarm[1],
            )
            # NOTE(review): the original_* columns are presumably added by
            # beeswarm_func to preserve the un-jittered values - confirm.
            tooltips = [
                "leg_id",
                "flt_no",
                "orig",
                "dest",
                alt.Tooltip("original_distance", title="distance"),
                alt.Tooltip("original_avg_load_factor", title="avg_load_factor"),
                "carrier",
                "capacity",
                "avg_local",
            ]

        chart = (
            alt.Chart(df)
            .mark_point(filled=True)
            .encode(
                x="distance",
                y=alt.Y("avg_load_factor", title="Leg Load Factor", scale=alt.Scale(domain=(50, 100))),
                color=alt.Color(
                    "avg_local",
                    title="Leg Local Share",
                    scale=alt.Scale(
                        scheme=alt.SchemeParams(
                            name="viridis",
                            extent=[0.1, 0.9],  # use the middle portion of the colormap for better contrast
                        )
                    ),
                ),
                shape=alt.Shape(
                    "carrier:N",
                    title="Carrier",
                    scale=alt.Scale(range=["circle", "diamond", "cross", "triangle-up", "triangle-down"]),
                ),
                tooltip=tooltips,
            )
            .properties(title="Leg Load Factor, Local Passenger Mix, and Distance")
            .interactive()
        )
        if also_df:
            return chart, df
        return chart

    # Just dump leg_detail: sample / DCP level rows, concatenated across
    # aggregated results rather than summed.
    leg_detail: pd.DataFrame = DatabaseTableItem(
        aggregation_func=aggregate_by_concat_dataframe("leg_detail"),
        query_func=common_queries.leg_detail,
        doc="Sample / DCP level detail for legs - a lot of data",
    )