Source code for passengersim.summaries.carriers

from __future__ import annotations

from collections import defaultdict
from typing import TYPE_CHECKING, Literal

import pandas as pd

from passengersim.database import common_queries
from passengersim.reporting import report_figure

from .generic import (
    DatabaseTableItem,
    GenericSimulationTables,
    SimulationTableItem,
)
from .tools import aggregate_by_concat_dataframe

if TYPE_CHECKING:
    import altair as alt

    from passengersim import Simulation

    from . import SimulationTables



[docs]
def extract_carriers(sim: Simulation) -> pd.DataFrame | None:
    """Extract carrier-level summary data from a Simulation.

    Leg-level counters are first rolled up by carrier name, then combined
    with each carrier's own grand-total counters.  Grand totals are divided
    by the number of completed, non-burned samples to give per-sample
    averages.

    Parameters
    ----------
    sim : Simulation
        The simulation to summarize; only ``sim.eng`` is read.

    Returns
    -------
    pd.DataFrame or None
        One row per carrier, indexed by carrier name.  None is returned when
        the engine has no carriers, or when there are no completed non-burned
        samples to average over.
    """
    eng = sim.eng
    num_samples = eng.num_trials_completed * (eng.num_samples - eng.burn_samples)
    if num_samples <= 0:
        # Every average below divides by num_samples; with nothing to average
        # over there is no meaningful summary, so report "no data" the same
        # way the empty-carrier case does instead of dividing by zero.
        return None

    # Roll leg-level counters up to the carrier that operates each leg.
    carrier_asm = defaultdict(float)
    carrier_rpm = defaultdict(float)
    carrier_leg_lf = defaultdict(float)
    carrier_leg_count = defaultdict(float)
    carrier_local_leg_pax = defaultdict(float)
    carrier_total_leg_pax = defaultdict(float)
    for leg in eng.legs:
        carrier_name = leg.carrier.name
        carrier_asm[carrier_name] += leg.distance * leg.capacity * num_samples
        carrier_rpm[carrier_name] += leg.distance * leg.gt_sold
        # NOTE(review): a leg with zero capacity raises ZeroDivisionError here.
        carrier_leg_lf[carrier_name] += leg.gt_sold / (leg.capacity * num_samples)
        carrier_leg_count[carrier_name] += 1
        carrier_local_leg_pax[carrier_name] += leg.gt_sold_local
        carrier_total_leg_pax[carrier_name] += leg.gt_sold

    carrier_data = []
    for carrier in eng.carriers:
        name = carrier.name
        # Unweighted mean of the per-leg load factors, as a percentage; the
        # max(..., 1) keeps carriers without any legs at 0 rather than 0/0.
        avg_leg_lf = 100 * carrier_leg_lf[name] / max(carrier_leg_count[name], 1)
        # NOTE(review): unlike the other figures, the ancillary total is not
        # divided by num_samples -- confirm whether `anc.sold` is a grand total.
        tot_anc_rev = sum((anc.price * anc.sold for anc in carrier.ancillaries), 0.0)
        carrier_data.append(
            {
                "carrier": name,
                "control": carrier.control,
                "avg_rev": carrier.gt_revenue / num_samples,
                "avg_sold": carrier.gt_sold / num_samples,
                "truncation_rule": carrier.truncation_rule,
                "avg_leg_lf": avg_leg_lf,
                "asm": carrier_asm[name] / num_samples,
                "rpm": carrier_rpm[name] / num_samples,
                "ancillary_rev": tot_anc_rev,
                "avg_local_leg_pax": carrier_local_leg_pax[name] / num_samples,
                "avg_total_leg_pax": carrier_total_leg_pax[name] / num_samples,
                "cp_sold": carrier.gt_cp_sold / num_samples,
                "cp_revenue": carrier.gt_cp_revenue / num_samples,
                "rm_system": carrier.metadata.get("rm_system", {}).get("name"),
            }
        )
    if not carrier_data:
        return None
    return pd.DataFrame(carrier_data).set_index("carrier")




[docs]
def aggregate_carriers(summaries: list[SimulationTables]) -> pd.DataFrame | None:
    """Average the raw carrier-level tables from several summaries.

    Parameters
    ----------
    summaries : list[SimulationTables]
        Summaries to combine; any whose ``_raw_carriers`` is None are skipped.

    Returns
    -------
    pd.DataFrame or None
        The element-wise mean of the numeric columns across the contributing
        tables, or None when no summary has a carriers table.
    """
    # These columns hold qualitative labels that are expected to agree across
    # all of the tables.  Averaging a label is meaningless, so they are moved
    # into the index (where they act as grouping keys) while the numeric
    # columns are summed, and are turned back into columns at the end.
    label_columns = ["control", "truncation_rule", "rm_system"]

    raw_frames = (s._raw_carriers for s in summaries)
    keyed = [frame.set_index(label_columns, append=True) for frame in raw_frames if frame is not None]
    if not keyed:
        return None

    running_total = keyed[0]
    for other in keyed[1:]:
        # fill_value=0 lets a row present in only one of the two operands
        # still contribute to the sum.
        running_total = running_total.add(other, fill_value=0)
    averaged = running_total / len(keyed)
    return averaged.reset_index(label_columns)




[docs]
def extract_carrier_history2(sim: Simulation) -> pd.DataFrame | None:
    """Collect the per-carrier history records into a single table.

    Every carrier in ``sim.eng.carriers`` is asked for its history via
    ``get_carrier_history()`` and the results are concatenated in carrier
    order.

    Returns
    -------
    pd.DataFrame or None
        The combined records indexed by (trial, sample, carrier), or None
        when no carrier reported any history.
    """
    records = [row for carrier in sim.eng.carriers for row in carrier.get_carrier_history()]
    if not records:
        return None
    return pd.DataFrame.from_dict(records).set_index(["trial", "sample", "carrier"])




[docs]
def extract_forecast_accuracy(sim: Simulation) -> pd.DataFrame | None:
    """Collect the per-carrier forecast accuracy records into a single table.

    Every carrier in ``sim.eng.carriers`` is asked for its records via
    ``get_forecast_accuracy()`` and the results are concatenated in carrier
    order.

    Returns
    -------
    pd.DataFrame or None
        The combined records with a default integer index and the
        identifying columns (trial, sample, carrier, booking_class,
        timeframe) placed first, or None when no carrier reported anything.
    """
    key_columns = ["trial", "sample", "carrier", "booking_class", "timeframe"]

    records = []
    for carrier in sim.eng.carriers:
        records.extend(carrier.get_forecast_accuracy())
    if not records:
        return None

    table = pd.DataFrame.from_dict(records)
    # Round-tripping the key columns through the index moves them to the front
    # of the frame (and fails loudly if any of them is missing).
    return table.set_index(key_columns).reset_index()




[docs]
class SimTabCarriers(GenericSimulationTables):
    """Container for carrier-level summary tables and figures from a Simulation.

    This class is a subclass of GenericSimulationTables, which is defined in
    the generic module.  It lists the items that are available in the
    SimulationTables class, and provides type hints and (optionally, but
    ideally) documentation for the data that is stored in each item.

    Each table below is declared with the function that extracts (or queries)
    it for a single run and the function that aggregates it across runs.
    """

    # Per-carrier averages built by `extract_carriers` and averaged across
    # summaries by `aggregate_carriers`.  The `computed_fields` entries are
    # derived columns written as expressions over the extracted columns
    # (presumably evaluated by SimulationTableItem -- see the generic module).
    carriers: pd.DataFrame = SimulationTableItem(
        aggregation_func=aggregate_carriers,
        extraction_func=extract_carriers,
        computed_fields={
            "avg_price": "avg_rev / avg_sold",
            "yield": "avg_rev / rpm",
            "rasm": "avg_rev / asm",
            "sys_lf": "100.0 * rpm / asm",
            "local_pct_leg_pax": "100.0 * avg_local_leg_pax / avg_total_leg_pax",
            "local_pct_bookings": "100.0 * avg_local_leg_pax / avg_sold",
        },
        doc="Carrier-level summary data.",
    )

    # Per-sample carrier history read from the database via the
    # `common_queries.carrier_history` query.
    carrier_history: pd.DataFrame | None = DatabaseTableItem(
        aggregation_func=aggregate_by_concat_dataframe("carrier_history"),
        query_func=common_queries.carrier_history,
        doc="Carrier-level summary data from each sample.",
    )

    # Per-sample carrier history taken from the carrier objects themselves
    # (see `extract_carrier_history2`), indexed by (trial, sample, carrier).
    carrier_history2: pd.DataFrame | None = SimulationTableItem(
        aggregation_func=aggregate_by_concat_dataframe("carrier_history2"),
        extraction_func=extract_carrier_history2,
        doc="Carrier-level summary data from each sample, new version with counters in CoreCarrier.",
    )

    # Forecast accuracy records taken from the carrier objects (see
    # `extract_forecast_accuracy`).
    forecast_accuracy: pd.DataFrame | None = SimulationTableItem(
        aggregation_func=aggregate_by_concat_dataframe("forecast_accuracy"),
        extraction_func=extract_forecast_accuracy,
        doc="Summary of forecast history, based on UA's EDGAR approach",
    )

    def _fig_carrier_attribute(
        self,
        raw_df: bool,
        load_measure: str,
        measure_name: str,
        measure_format: str = ".2f",
        orient: Literal["h", "v"] = "h",
        title: str | None = None,
        also_df: bool = False,
    ) -> alt.Chart | pd.DataFrame | tuple[alt.Chart, pd.DataFrame]:
        """Draw a labelled bar chart of one column of the carriers table.

        Parameters
        ----------
        raw_df : bool
            If True, return the figure's data without building a chart
            (altair is not imported in that case).
        load_measure : str
            Name of the column of ``self.carriers`` to plot.
        measure_name : str
            Human-readable label used for the value axis and the tooltip.
        measure_format : str, default ".2f"
            Number format applied to the value axis, tooltip and bar labels.
        orient : {"h", "v"}, default "h"
            "v" puts carriers on the x-axis; anything else puts them on the
            y-axis.
        title : str, optional
            Chart title; left unset when empty or None.
        also_df : bool, default False
            If True, return a ``(figure, data)`` tuple instead of the figure.
        """
        carriers = self.carriers
        if "rm_system" in carriers.columns:
            df = carriers.reset_index()[["carrier", load_measure, "rm_system"]]
        else:
            # TODO: remove this once no longer using older cached data
            df = carriers.reset_index()[["carrier", load_measure]].assign(rm_system="Unknown")
        if raw_df:
            return df
        import altair as alt

        value_field = f"{load_measure}:Q"
        if orient == "v":
            carrier_key, carrier_channel = "x", alt.X
            value_key, value_channel = "y", alt.Y
            label_style = dict(dx=0, dy=3, color="white", baseline="top")
        else:
            carrier_key, carrier_channel = "y", alt.Y
            value_key, value_channel = "x", alt.X
            label_style = dict(dx=-5, dy=0, color="white", baseline="middle", align="right")

        def positions():
            # The bars and their text labels sit at the same coordinates; each
            # layer gets its own freshly-built channel objects.
            return {
                carrier_key: carrier_channel("carrier:N", title="Carrier"),
                value_key: value_channel(
                    value_field, title=measure_name, axis=alt.Axis(format=measure_format)
                ).stack("zero"),
            }

        chart = alt.Chart(df)
        bars = chart.mark_bar().encode(
            **positions(),
            color=alt.Color("rm_system:N", title="RM System"),
            tooltip=[
                alt.Tooltip("carrier", title="Carrier"),
                alt.Tooltip("rm_system", title="RM System"),
                alt.Tooltip(value_field, title=measure_name, format=measure_format),
            ],
        )
        text = chart.mark_text(**label_style).encode(
            **positions(),
            text=alt.Text(value_field, format=measure_format),
        )

        layered = (bars + text).properties(width=500, height=10 + 20 * len(df))
        layered = layered.configure_axis(labelFontSize=12, titleFontSize=12)
        fig = layered.configure_legend(titleFontSize=12, labelFontSize=15)
        if title:
            fig.title = title
        return (fig, df) if also_df else fig


[docs]
    @report_figure
    def fig_carrier_load_factors(
        self,
        load_measure: Literal["sys_lf", "avg_leg_lf"] = "sys_lf",
        *,
        raw_df: bool = False,
        also_df: bool = False,
        title: str | None = "_default_",
    ):
        """Generate a figure showing carrier load factors.

        Parameters
        ----------
        load_measure : {"sys_lf", "avg_leg_lf"}, default "sys_lf"
            Column of the carriers table to plot.  "sys_lf" is labelled as
            the system load factor; any other value is labelled as the leg
            load factor.
        raw_df : bool, default False
            Return the data for this figure instead of the figure itself.
        also_df : bool, default False
            Return the data for this figure in addition to the figure.
        title : str, optional
            Chart title.  The default sentinel "_default_" produces a title
            derived from the selected measure.
        """
        if load_measure == "sys_lf":
            measure_name = "System Load Factor"
        else:
            measure_name = "Leg Load Factor"
        if title == "_default_":
            title = f"Carrier {measure_name}s"
        return self._fig_carrier_attribute(
            raw_df,
            load_measure,
            measure_name,
            title=title,
            also_df=also_df,
        )



[docs]
    @report_figure
    def fig_carrier_revenues(
        self, *, raw_df: bool = False, also_df: bool = False, title: str | None = "Carrier Revenues"
    ):
        """Generate a figure showing the ``avg_rev`` column for each carrier.

        Parameters
        ----------
        raw_df : bool, default False
            Return the data for this figure instead of the figure itself.
        also_df : bool, default False
            Return the data for this figure in addition to the figure.
        title : str, optional
            Chart title.
        """
        return self._fig_carrier_attribute(
            raw_df,
            load_measure="avg_rev",
            measure_name="Average Revenue",
            measure_format="$.4s",
            title=title,
            also_df=also_df,
        )



[docs]
    @report_figure
    def fig_carrier_yields(self, *, raw_df: bool = False, also_df: bool = False, title: str | None = "Carrier Yields"):
        """Generate a figure showing carrier yields.

        Parameters
        ----------
        raw_df : bool, default False
            Return the data for this figure instead of the figure itself.
        also_df : bool, default False
            Return the data for this figure in addition to the figure.
        title : str, optional
            Chart title.

        Notes
        -----
        Yield is defined as revenue per revenue passenger-mile.  It differs from
        RASM (revenue per available seat mile) in that it only considers revenue
        and miles from paying passengers.  If a seat is flown empty, it does not
        generate revenue or contribute to RPM, so it does not affect yield, but it
        does reduce RASM since it contributes to ASM.  Yield is often considered a
        better measure of the price level that a carrier is achieving, while RASM
        is a better measure of overall revenue efficiency.  Both measures are useful
        for understanding carrier performance, and they can sometimes move in
        different directions, so it's helpful to look at both.
        """
        return self._fig_carrier_attribute(
            raw_df,
            load_measure="yield",
            measure_name="Average Yield",
            measure_format="$.4f",
            title=title,
            also_df=also_df,
        )



[docs]
    @report_figure
    def fig_carrier_rasm(
        self,
        *,
        raw_df: bool = False,
        also_df: bool = False,
        title: str | None = "Carrier Revenue per Available Seat Mile (RASM)",
    ):
        """Generate a figure showing the ``rasm`` column for each carrier.

        Parameters
        ----------
        raw_df : bool, default False
            Return the data for this figure instead of the figure itself.
        also_df : bool, default False
            Return the data for this figure in addition to the figure.
        title : str, optional
            Chart title.
        """
        measure_label = "Revenue per Available Seat Mile"
        return self._fig_carrier_attribute(raw_df, "rasm", measure_label, "$.4f", title=title, also_df=also_df)



[docs]
    @report_figure
    def fig_carrier_total_bookings(
        self: SimulationTables,
        *,
        raw_df: bool = False,
        also_df: bool = False,
        title: str | None = "Carrier Total Bookings",
    ):
        """Generate a figure showing the ``avg_sold`` column for each carrier.

        Parameters
        ----------
        raw_df : bool, default False
            Return the data for this figure instead of the figure itself.
        also_df : bool, default False
            Return the data for this figure in addition to the figure.
        title : str, optional
            Chart title.
        """
        return self._fig_carrier_attribute(
            raw_df,
            load_measure="avg_sold",
            measure_name="Total Bookings",
            measure_format=".4s",
            title=title,
            also_df=also_df,
        )



[docs]
    @report_figure
    def fig_carrier_local_share(
        self,
        load_measure: Literal["bookings", "leg_pax"] = "bookings",
        *,
        raw_df: bool = False,
        also_df: bool = False,
        title: str | None = "_default_",
    ):
        """Generate a figure showing the local share of each carrier's traffic.

        Parameters
        ----------
        load_measure : {"bookings", "leg_pax"}, default "bookings"
            "bookings" plots the ``local_pct_bookings`` column; any other
            value plots ``local_pct_leg_pax``.
        raw_df : bool, default False
            Return the data for this figure instead of the figure itself.
        also_df : bool, default False
            Return the data for this figure in addition to the figure.
        title : str, optional
            Chart title.  The default sentinel "_default_" produces a title
            derived from the selected measure.
        """
        if load_measure == "bookings":
            column = "local_pct_bookings"
            measure_name = "Local Percent of Bookings"
        else:
            column = "local_pct_leg_pax"
            measure_name = "Local Percent of Leg Passengers"
        chart_title = f"Carrier {measure_name}" if title == "_default_" else title
        return self._fig_carrier_attribute(raw_df, column, measure_name, title=chart_title, also_df=also_df)



[docs]
    @report_figure
    def fig_carrier_mileage(
        self, *, raw_df: bool = False, also_df: bool = False
    ) -> alt.Chart | pd.DataFrame | tuple[alt.Chart, pd.DataFrame]:
        """
        Figure showing mileage by carrier.

        ASM is available seat miles, and RPM is revenue passenger miles. Both
        measures are reported as the average across all non-burned samples.

        Parameters
        ----------
        raw_df : bool, default False
            Return the raw data for this figure as a pandas DataFrame, instead
            of generating the figure itself.
        also_df : bool, default False
            Return the raw data for this figure as a pandas DataFrame, in
            addition to the figure itself.
        report : xmle.Reporter, optional
            Also append this figure to the given report.
        trace : pd.ExcelWriter, optional
            Also write the data from this figure to the given Excel file.
        """
        wide = self.carriers.reset_index()[["carrier", "asm", "rpm"]].set_index("carrier")
        # Reshape to long form: one row per (measure, carrier) pair with the
        # mileage in a single "value" column.
        stacked = wide.rename_axis(columns="measure").unstack()
        df = stacked.to_frame("value").reset_index()
        if raw_df:
            return df
        import altair as alt

        base = alt.Chart(df, title="Carrier Loads")
        bars = base.mark_bar().encode(
            x=alt.X("carrier:N", title="Carrier"),
            y=alt.Y("value", stack=None, title="miles"),
            color="measure",
            tooltip=["carrier", "measure", alt.Tooltip("value", format=".4s")],
        )
        # Value labels drawn just inside the top of each (unstacked) bar.
        labels = base.mark_text(dx=0, dy=5, color="white", baseline="top").encode(
            x=alt.X("carrier:N"),
            y=alt.Y("value").stack(None),
            text=alt.Text("value:Q", format=".4s"),
        )

        layered = (bars + labels).properties(width=400, height=300)
        layered = layered.configure_axis(labelFontSize=12, titleFontSize=12)
        fig = layered.configure_legend(titleFontSize=12, labelFontSize=15)
        return (fig, df) if also_df else fig



[docs]
    def fig_carrier_revenue_distribution(self, *, raw_df=False, also_df=False):
        """Figure showing the distribution of carrier revenues.

        A density estimate of the ``revenue`` column of ``carrier_history2``
        is drawn for each carrier, one facet per carrier.

        Parameters
        ----------
        raw_df : bool, default False
            Return the raw data for this figure as a pandas DataFrame, instead
            of generating the figure itself. This is not implemented yet and will
            raise an error if set.
        also_df: bool, default False
            Return the raw data for this figure as a pandas DataFrame, in addition
            to the figure itself. This is not implemented yet, and will be silently
            ignored if set.

        Raises
        ------
        NotImplementedError
            If `raw_df` is set.
        """
        if raw_df:
            raise NotImplementedError("Raw data not available for this figure.")
        import altair as alt

        history = self.carrier_history2.reset_index()
        densities = alt.Chart(history).transform_density(
            "revenue",
            groupby=["carrier"],
            as_=["revenue", "density"],
        )
        areas = densities.mark_area().encode(
            x=alt.X("revenue:Q", axis=alt.Axis(title="Revenue", format="$.3s")),
            y=alt.Y("density:Q", title="Density", axis=alt.Axis(labels=False)),
            color="carrier:N",
        )
        return areas.facet("carrier:N", title="Revenue Distribution by Carrier")



[docs]
    def fig_carrier_head_to_head_revenue(
        self, x_carrier: str, y_carrier: str, *, raw_df=False, mean_adjusted: bool = True
    ):
        """
        Figure comparing carrier revenues head-to-head.

        Each point is one (trial, sample) from ``carrier_history2``, placed at
        the revenue of `x_carrier` and of `y_carrier` in that sample.  A faint
        red diagonal spanning the observed range is overlaid for reference.

        Parameters
        ----------
        x_carrier, y_carrier : str
            The carrier to plot on the x- and y-axis, respectively.  They must
            be two different carriers.
        raw_df : bool, default False
            Return the raw data for this figure as a pandas DataFrame, instead
            of generating the figure itself.
        mean_adjusted : bool, default True
            If True, adjust revenues by dividing by the mean revenue for each carrier,
            so that the plot shows percentage of mean revenue.  If False, use raw
            revenues, which is generally only useful for analyzing symmetric networks,
            such as 3MKT.

        Returns
        -------
        alt.Chart | pd.DataFrame
            The Altair chart object, or the raw data as a pandas DataFrame

        Raises
        ------
        ValueError
            If `x_carrier` and `y_carrier` are the same carrier.
        """
        if x_carrier == y_carrier:
            # Pivoting the same carrier's rows twice would fail inside pandas
            # with an opaque "duplicate entries" ValueError; say what is wrong.
            raise ValueError(f"x_carrier and y_carrier must be different carriers, got {x_carrier!r} for both")

        history = self.carrier_history2
        # Select rows by comparing the index level directly rather than by
        # building a query string, so carrier names containing quotes or
        # other special characters are matched literally.
        carrier_level = history.index.get_level_values("carrier")
        x_revenue = history.loc[carrier_level == x_carrier, "revenue"]
        y_revenue = history.loc[carrier_level == y_carrier, "revenue"]

        if mean_adjusted:
            x_revenue = x_revenue / x_revenue.mean()
            y_revenue = y_revenue / y_revenue.mean()
            axis_label_text = "Percentage of Mean Revenue"
            axis_format = "%"
        else:
            axis_label_text = "Revenue"
            axis_format = "$.2s"

        combined = pd.concat([x_revenue, y_revenue])
        # Overall range across both carriers; these are the end points of the
        # reference diagonal.
        rng = combined.min(), combined.max()
        # One row per (trial, sample), one column per carrier.
        df = combined.unstack("carrier").reset_index()
        if raw_df:
            return df

        # Imported only once a chart is actually needed, so the raw data stays
        # available without altair.
        import altair as alt

        diag = (
            alt.Chart(pd.DataFrame({x_carrier: rng, y_carrier: rng}))
            .mark_line(color="red", opacity=0.3)
            .encode(
                x=x_carrier,
                y=y_carrier,
            )
        )

        fig = (
            alt.Chart(df)
            .mark_circle(opacity=0.3)
            .encode(
                x=alt.X(f"{x_carrier}:Q")
                .axis(format=axis_format)
                .scale(zero=False)
                .title(f"{x_carrier} {axis_label_text}"),
                y=alt.Y(f"{y_carrier}:Q")
                .axis(format=axis_format)
                .scale(zero=False)
                .title(f"{y_carrier} {axis_label_text}"),
            )
            + diag
        )

        return fig.interactive()