Source code for passengersim.summaries.segmentation_detail

from __future__ import annotations

from typing import TYPE_CHECKING

import pandas as pd

from passengersim.database import Database
from passengersim.reporting import report_figure

from .generic import DatabaseTableItem, GenericSimulationTables
from .tools import aggregate_by_averaging_dataframe

if TYPE_CHECKING:
    import altair as alt

    from passengersim.contrast import Contrast



[docs]
def detailed_bookings_by_timeframe(
    cnx: Database,
    *,
    scenario: str | None = None,
    burn_samples: int = 100,
) -> pd.DataFrame:
    """
    Average bookings by carrier, orig, dest, booking class, and timeframe.

    This query requires that the simulation was run while recording supporting
    details (i.e. with the `bookings` or `fare` flags set on `Config.db.write_items`).

    Parameters
    ----------
    cnx : Database
    scenario : str
    burn_samples : int, default 100
        The bookings will be computed ignoring this many samples from the
        beginning of each trial. This argument is nominally ignored by this query
        unless `from_fare_detail` is true, although the simulator will have already
        ignored the burned samples when storing the data in the bookings table.

    Returns
    -------
    pandas.DataFrame
        The resulting dataframe is indexed by `trial`, `carrier`, `orig`, `dest`,
        `booking_class`, and `days_prior`, and has these columns:

        - `avg_sold`: Average number of sales.
        - `avg_business`: Average number of sales to passengers in the business segment.
        - `avg_leisure`: Average number of sales to leisure passengers.
    """
    qry = """
    SELECT trial, carrier, orig, dest, booking_class, days_prior,
           (AVG(sold)) AS avg_sold,
           (AVG(sold_business)) AS avg_business,
           (AVG(sold_leisure)) AS avg_leisure
    FROM (SELECT trial, scenario, carrier, orig, dest, booking_class, days_prior,
                 SUM(sold) AS sold,
                 SUM(sold_business) AS sold_business,
                 SUM(sold - sold_business) AS sold_leisure
          FROM fare_detail LEFT JOIN fare_defs USING (fare_id)
          WHERE
                sample >= ?1
                AND scenario = ?2
          GROUP BY trial, sample, carrier, orig, dest, booking_class, days_prior) a
    GROUP BY carrier, orig, dest, booking_class, days_prior, trial
    ORDER BY carrier, orig, dest, booking_class, days_prior, trial;
    """
    if scenario is None:
        qry = qry.replace("AND scenario = ?2", "")
        params = (burn_samples,)
    else:
        params = (burn_samples, scenario)

    return cnx.dataframe(qry, params).set_index(["trial", "carrier", "orig", "dest", "booking_class", "days_prior"])




[docs]
class SimTabSegmentationDetail(GenericSimulationTables):
    """Container for summary tables and figures extracted from a Simulation.

    This class is a subclass of _Generic
    """

    segmentation_detail: pd.DataFrame = DatabaseTableItem(
        aggregation_func=aggregate_by_averaging_dataframe("segmentation_detail"),
        query_func=detailed_bookings_by_timeframe,
        doc="Segmentation detail.",
    )


[docs]
    @report_figure
    def fig_segmentation_detail(
        self,
        *,
        by_carrier: bool | str = True,
        by_class: bool | str = False,
        orig: str | None = None,
        dest: str | None = None,
        raw_df: bool = False,
        also_df: bool = False,
    ) -> alt.Chart | pd.DataFrame | tuple[alt.Chart, pd.DataFrame]:
        """
        Plot the segmentation detail data.

        Parameters
        ----------
        by_carrier : bool or str, default True
            If True, group by carrier. If a string, filter by carrier.
        by_class : bool or str, default False
            If True, group by booking class. If a string, filter by booking class.
        orig : str, optional
            Filter by origin.
        dest : str, optional
            Filter by destination.
        raw_df : bool, default False
            If True, return the raw dataframe instead of the figure.
        also_df : bool, default False
            If True, return the dataframe as well as the figure.

        Returns
        -------
        alt.Chart or pd.DataFrame or tuple[alt.Chart, pd.DataFrame]
            The segmentation detail figure or dataframe.
        """
        if self.segmentation_detail is None:
            raise ValueError("segmentation_detail not found")
        metric = "bookings"
        df = self.segmentation_detail

        # convert to sales by timeframe
        df = df.groupby(df.index.names[:-1])[["avg_business", "avg_leisure"]].diff(-1)
        df = df.groupby(df.index.names[:-1])[["avg_business", "avg_leisure"]].shift()
        df = df.query("days_prior != 0")

        # rename columns
        df = df.rename(columns={"avg_business": "business", "avg_leisure": "leisure"})
        df.columns.name = "segment"
        df = df.stack().rename("bookings").to_frame()

        if orig is not None:
            df = df.query("orig == @orig")
            df.index = df.index.droplevel("orig")
        else:
            gb = [i for i in df.index.names if i != "orig"]
            df = df.groupby(gb).sum()
        if dest is not None:
            df = df.query("dest == @dest")
            df.index = df.index.droplevel("dest")
        else:
            gb = [i for i in df.index.names if i != "dest"]
            df = df.groupby(gb).sum()

        idxs = list(df.index.names)
        if "trial" in idxs:
            idxs.remove("trial")
            df = df.groupby(idxs).mean()
        df = df.reset_index()

        title = "Detailed Segmentation by Timeframe"
        if orig and dest:
            title = f"{title} ({orig}~{dest})"
        elif orig and not dest:
            title = f"{title} (Orig={orig})"
        elif not orig and dest:
            title = f"{title} (Dest={dest})"
        title_annot = []
        if not by_carrier:
            g = ["days_prior", "segment"]
            if by_class:
                g += ["booking_class"]
            df = df.groupby(g, observed=False)[[metric]].sum().reset_index()
        if by_carrier and not by_class:
            df = df.groupby(["carrier", "days_prior", "segment"], observed=False)[[metric]].sum().reset_index()
        if isinstance(by_carrier, str):
            df = df[df["carrier"] == by_carrier]
            df = df.drop(columns=["carrier"])
            title_annot.append(by_carrier)
            by_carrier = False
        if isinstance(by_class, str):
            df = df[df["booking_class"] == by_class]
            df = df.drop(columns=["booking_class"])
            title_annot.append(f"Class {by_class}")
            by_class = False
        if title_annot:
            title = f"{title} ({', '.join(title_annot)})"
        if raw_df:
            return df

        import altair as alt

        if by_carrier:
            color = "carrier:N"
            color_title = "Carrier"
        elif by_class:
            color = "booking_class:N"
            color_title = "Booking Class"
        else:
            color = "segment:N"
            color_title = "Passenger Type"

        if metric == "revenue":
            metric_fmt = "$,.0f"
        else:
            metric_fmt = ",.2f"

        chart = (
            alt.Chart(df)
            .mark_bar()
            .encode(
                color=alt.Color(color).title(color_title),
                x=alt.X("days_prior:O").scale(reverse=True).title("Days Prior to Departure"),
                y=alt.Y(metric),
                tooltip=([alt.Tooltip("carrier").title("Carrier")] if by_carrier else [])
                + ([alt.Tooltip("booking_class").title("Booking Class")] if by_class else [])
                + [
                    alt.Tooltip("segment", title="Passenger Type"),
                    alt.Tooltip("days_prior", title="Days Prior"),
                    alt.Tooltip(metric, format=metric_fmt, title=metric.title()),
                ],
            )
            .properties(
                width=500,
                height=200,
            )
        )
        if by_carrier or by_class:
            chart = chart.facet(
                row=alt.Row("segment:N", title="Passenger Type"),
                title=title,
            )
        else:
            chart = chart.properties(title=title)
        if also_df:
            return chart, df
        return chart




# Figure for Contrast

[docs]
def fig_segmentation_detail(
    summaries: Contrast,
    *,
    by_carrier: bool | str = True,
    by_class: bool | str = False,
    orig: str | None = None,
    dest: str | None = None,
    raw_df: bool = False,
    also_df: bool = False,
    width: int | None = 400,
    height: int | None = 180,
) -> alt.Chart | pd.DataFrame | tuple[alt.Chart, pd.DataFrame]:
    """
    Plot the segmentation detail data.

    Parameters
    ----------
    summaries : Contrast
        The contrast object containing the segmentation detail data.
    by_carrier : bool or str, default True
        If True, group by carrier. If a string, filter by carrier.
    by_class : bool or str, default False
        If True, group by booking class. If a string, filter by booking class.
    orig : str, optional
        Filter by origin.
    dest : str, optional
        Filter by destination.
    raw_df : bool, default False
        If True, return the raw dataframe instead of the figure.
    also_df : bool, default False
        If True, return the dataframe as well as the figure.
    width : int, optional
        The width of the figure. Default is 400.
    height : int, optional
        The height of the figure. Default is 180.

    Returns
    -------
    alt.Chart or pd.DataFrame or tuple[alt.Chart, pd.DataFrame]
        The segmentation detail figure or dataframe.
    """
    dfs = {
        k: v.fig_segmentation_detail(
            by_carrier=by_carrier,
            by_class=by_class,
            orig=orig,
            dest=dest,
            raw_df=True,
        )
        for k, v in summaries.items()
    }
    df = pd.concat(dfs, names=["source"]).reset_index()
    if raw_df:
        return df

    metric = "bookings"
    if metric == "revenue":
        metric_fmt = "$,.0f"
    else:
        metric_fmt = ",.2f"

    title = "Detailed Segmentation by Timeframe"
    title_annot = []
    if orig and dest:
        title_annot.append(f"{orig}~{dest}")
    elif orig and not dest:
        title_annot.append(f"Orig={orig}")
    elif not orig and dest:
        title_annot.append(f"Dest={dest}")

    if isinstance(by_carrier, str):
        # this filter is already applied in the query against each source
        # and can now be removed
        title_annot.append(by_carrier)
        by_carrier = False

    title = f"{title} ({', '.join(title_annot)})"

    if by_carrier:
        color = "carrier:N"
        color_title = "Carrier"
    elif by_class:
        color = "booking_class:N"
        color_title = "Booking Class"
    else:
        color = "segment:N"
        color_title = "Passenger Type"

    import altair as alt

    fig = (
        alt.Chart(df)
        .mark_bar()
        .encode(
            xOffset=alt.XOffset("source:N"),
            x=alt.X("days_prior:O").scale(reverse=True).title("Days Prior to Departure"),
            y=alt.Y("bookings:Q").title("Bookings"),
            color=alt.Color(color).title(color_title),
            tooltip=[
                alt.Tooltip("source", title="Source"),
            ]
            + ([alt.Tooltip("carrier").title("Carrier")] if by_carrier else [])
            + ([alt.Tooltip("booking_class").title("Booking Class")] if by_class else [])
            + [
                alt.Tooltip("segment", title="Passenger Type"),
                alt.Tooltip("days_prior", title="Days Prior"),
                alt.Tooltip(metric, format=metric_fmt, title=metric.title()),
            ],
        )
    )
    if by_carrier or by_class:
        fig = fig.facet(
            row=alt.Row("segment:N", title="Passenger Type"),
            title=title,
        )
        if width:
            fig.spec.width = width
        if height:
            fig.spec.height = height
    else:
        fig = fig.properties(title=title, width=width, height=height)

    if also_df:
        return fig, df
    return fig