# Source: Hugging Face Space "digitiamosrl/recsys-and-customer-segmentation"
# (Space status: Running; file size: 13,700 bytes)

from collections import defaultdict
import streamlit as st
from utils import load_and_preprocess_data
import pandas as pd
import numpy as np
import altair as alt
from sklearn.mixture import GaussianMixture
import plotly.express as px
import itertools
from typing import Dict, List, Tuple


# Markdown text rendered in the sidebar by `main` (via `st.sidebar.markdown`).
# `.lstrip()` removes the newline that follows the opening triple quote.
SIDEBAR_DESCRIPTION = """
# Client clustering

To cluster a client, we adopt the RFM metrics. They stand for:

- R = recency, that is the number of days since the last purchase
    in the store
- F = frequency, that is the number of times a customer has ordered something
- M = monetary value, that is how much a customer has spent buying
    from your business.

Given these 3 metrics, we can cluster the customers and find a suitable
"definition" based on the clusters they belong to. Since the dataset
we're using right now has about 5000 distinct customers, we identify
3 clusters for each metric.

## How we compute the clusters

We resort to a GaussianMixture algorithm. We can think of GaussianMixture
as generalized k-means clustering that incorporates information about
the covariance structure of the data as well as the centers of the clusters.
""".lstrip()

# Markdown template for the frequency clusters. It has six positional `{}`
# placeholders; `explain_cluster` fills them with the interleaved
# (min, max) values of clusters 1, 2 and 3, in that order.
FREQUENCY_CLUSTERS_EXPLAIN = """
The **frequency** denotes how frequently a customer has ordered.

There 3 available clusters for this metric:

- cluster 1: denotes a customer that purchases one or few times (range [{}, {}])
- cluster 2: these customer have a discrete amount of orders (range [{}, {}])
- cluster 3: these customer purchases lots of times (range [{}, {}])

-------
""".lstrip()

# Markdown template for the recency clusters. It has six positional `{}`
# placeholders; `explain_cluster` fills them with the interleaved
# (min, max) values of clusters 1, 2 and 3, in that order.
RECENCY_CLUSTERS_EXPLAIN = """
The **recency** refers to how recently a customer has bought;

There 3 available clusters for this metric:

- cluster 1: the last order of these client is long time ago (range [{}, {}])
- cluster 2: these are clients that purchases something not very recently (range [{}, {}])
- cluster 3: the last order of these client is a few days/weeks ago (range [{}, {}])

-------
""".lstrip()

# Markdown template for the revenue (monetary) clusters. It has six positional
# `{}` placeholders; `explain_cluster` fills them with the interleaved
# (min, max) values of clusters 1, 2 and 3, in that order.
MONETARY_CLUSTERS_EXPLAIN = """
The **revenue** refers to how much a customer has spent buying
from your business.

There 3 available clusters for this metric:

- cluster 1: these clients spent little money (range [{}, {}])
- cluster 2: these clients spent a considerable amount of money (range [{}, {}])
- cluster 3: these clients spent lots of money (range [{}, {}])

-------
""".lstrip()

# Maps each `<metric>_cluster` column name to its explanation template.
# `explain_cluster` looks templates up by these keys, so they must match the
# keys of the cluster-info dictionary it receives.
EXPLANATION_DICT = {
    "Frequency_cluster": FREQUENCY_CLUSTERS_EXPLAIN,
    "Recency_cluster": RECENCY_CLUSTERS_EXPLAIN,
    "Revenue_cluster": MONETARY_CLUSTERS_EXPLAIN,
}


def create_features(df: pd.DataFrame):
    """Build the per-customer RFM table from the transaction log.

    Args:
        df: Transactions with at least the columns ``CustomerID``,
            ``InvoiceDate`` (datetime-like, since day differences are
            taken from it) and ``Price``.

    Returns:
        A dataframe with one row per ``CustomerID`` and the columns
        ``Frequency`` (number of distinct invoice dates), ``Revenue``
        (sum of ``Price``) and ``Recency`` (whole days between the
        customer's last purchase and the most recent purchase found
        in ``df``).
    """
    # A single aggregation pass produces every raw ingredient, keyed by
    # customer, so the columns are aligned by construction.
    summary = (
        df.groupby("CustomerID")
        .agg(
            Frequency=("InvoiceDate", "nunique"),
            Revenue=("Price", "sum"),
            LastPurchaseDate=("InvoiceDate", "max"),
        )
        .reset_index()
    )

    # Recency is measured against the latest purchase in the data,
    # not against the current date.
    most_recent_purchase = summary["LastPurchaseDate"].max()
    summary["Recency"] = (most_recent_purchase - summary["LastPurchaseDate"]).dt.days

    return summary.drop(columns="LastPurchaseDate")


@st.cache
def cluster_clients(df: pd.DataFrame):
    """Build the RFM table and attach one ordered cluster label per metric.

    Each of the three metrics is clustered on its own with a
    three-component ``GaussianMixture`` (fixed ``random_state`` for
    reproducibility). The raw component labels are then renumbered by
    ``_order_cluster`` so that they follow the requested ordering of the
    component means.

    Args:
        df: The transaction dataframe accepted by ``create_features``.

    Returns:
        The RFM dataframe extended with the columns ``Revenue_cluster``,
        ``Frequency_cluster`` and ``Recency_cluster`` (added in that order).
    """
    # NOTE(review): newer Streamlit releases deprecate `st.cache` in favour of
    # `st.cache_data` -- confirm against the Streamlit version pinned here.
    rfm = create_features(df)

    # Revenue and frequency are numbered by increasing mean; recency is
    # numbered by decreasing mean.
    metric_orderings = (
        ("Revenue", "ascending"),
        ("Frequency", "ascending"),
        ("Recency", "descending"),
    )

    for metric, ordering in metric_orderings:
        mixture = GaussianMixture(n_components=3, random_state=42)
        raw_labels = mixture.fit_predict(rfm[[metric]])
        rfm[f"{metric}_cluster"] = _order_cluster(mixture, raw_labels, ordering)

    return rfm


def _order_cluster(cluster_model: GaussianMixture, clusters, order="ascending"):
    """Renumber raw cluster labels so they follow the order of the means.

    Args:
        cluster_model: Fitted mixture model; its ``means_`` rows are summed
            to get one sort key per component.
        clusters: Array of raw component labels (indices into ``means_``).
        order: ``"ascending"`` (default) numbers the component with the
            smallest key as 1; ``"descending"`` (case-insensitive) numbers
            the component with the largest key as 1.

    Returns:
        An array shaped like ``clusters`` holding the 1-based ordered labels.
    """
    sort_keys = cluster_model.means_.sum(axis=1)
    if order.lower() == "descending":
        # Negating the keys reverses the ranking without a second code path.
        sort_keys = -sort_keys

    # The argsort of an argsort is the rank of each component (0-based);
    # shift by one because cluster numbering starts from 1.
    rank_of_component = np.argsort(np.argsort(sort_keys)) + 1
    return rank_of_component[clusters]


def show_purhcase_history(user: int, df: pd.DataFrame):
    """Render a line chart of one customer's expenses over time.

    Args:
        user: The ``CustomerID`` whose purchases are plotted.
        df: Transactions with the columns ``CustomerID``, ``Price`` and
            ``InvoiceDate``.
    """
    belongs_to_user = df.CustomerID == user

    # Total spent per invoice date, as a two-column frame
    # (InvoiceDate, Expenses) ready for plotting.
    expenses = (
        df.loc[belongs_to_user, ["Price", "InvoiceDate"]]
        .groupby("InvoiceDate")["Price"]
        .sum()
        .rename("Expenses")
        .reset_index()
    )

    date_axis = alt.X("InvoiceDate", timeUnit="yearmonthdate", title="Date")
    chart = alt.Chart(expenses).mark_line(point=True)
    chart = chart.encode(x=date_axis, y="Expenses")
    chart = chart.properties(title="User expenses")

    st.altair_chart(chart)


def show_user_info(user: int, df_rfm: pd.DataFrame):
    """Write a short summary of one customer and return their clusters.

    The summary lists how many times the customer purchased, the total
    amount spent, the days since the last purchase, and the value of
    every ``*_cluster`` column.

    Args:
        user: The ``CustomerID`` to look up.
        df_rfm: RFM dataframe with the columns ``CustomerID``,
            ``Frequency``, ``Revenue``, ``Recency`` and the three
            ``*_cluster`` columns.

    Returns:
        A tuple ``(recency_cluster, frequency_cluster, revenue_cluster)``.
        When no row matches ``user`` a notice is written and
        ``(None, None, None)`` is returned, so callers can still unpack
        three values.
    """
    user_row = df_rfm[df_rfm["CustomerID"] == user]
    if user_row.empty:
        st.write(f"No user with id {user}")
        # Stop here: formatting an empty selection would print empty Series
        # instead of values and return meaningless clusters.
        return None, None, None

    cluster_columns = [column for column in user_row.columns if "_cluster" in column]

    output = [
        f"The user purchased **{user_row['Frequency'].squeeze()} times**.\n",
        f"She/he spent **{user_row['Revenue'].squeeze()} dollars** in total.\n",
        f"The last time she/he bought something was **{user_row['Recency'].squeeze()} days ago**.\n",
        "She/he belongs to the clusters: ",
    ]
    output.extend(
        f"- {cluster} = {user_row[cluster].squeeze()}" for cluster in cluster_columns
    )

    st.write("\n".join(output))

    return (
        user_row["Recency_cluster"].squeeze(),
        user_row["Frequency_cluster"].squeeze(),
        user_row["Revenue_cluster"].squeeze(),
    )


def explain_cluster(cluster_info):
    """Show an expandable section explaining the meaning of each cluster.

    Args:
        cluster_info: Mapping from a ``<metric>_cluster`` name (a key of
            ``EXPLANATION_DICT``) to a pair ``(mins, maxs)`` holding the
            per-cluster minimum and maximum values.
    """
    with st.expander("Show information about the clusters"):
        st.write(
            "**Note**: these values are valid for these dataset."
            "Different dataset will have different number of clusters"
            " and values"
        )
        for cluster_name, bounds in cluster_info.items():
            lows, highs = bounds[0], bounds[1]
            # Flatten the pairs into [min_1, max_1, min_2, max_2, ...],
            # the order in which the template placeholders appear.
            flattened = [limit for pair in zip(lows, highs) for limit in pair]
            template = EXPLANATION_DICT[cluster_name]
            st.write(template.format(*flattened))


def categorize_user(recency_cluster, frequency_cluster, monetary_cluster):
    """Write a few-word description of a customer based on their clusters.

    The three cluster values are concatenated (recency, frequency,
    monetary) into a score string such as ``"313"``. Exact scores are
    looked up first; otherwise the leading recency digit picks a generic
    description, and anything else is an "Average client".
    """
    score = f"{recency_cluster}{frequency_cluster}{monetary_cluster}"

    # Scores that have a dedicated description.
    exact_descriptions = {
        "111": "Tourist",
        "133": "Former lover",
        "123": "Former passionate client",
        "113": "Spent a lot, but never come back",
        "313": "Potential lover",
        "312": "Interesting new client",
        "311": "New customer",
        "333": "Gold client",
        "322": "Lovers",
    }

    if score in exact_descriptions:
        description = exact_descriptions[score]
    elif score.startswith("2"):
        # No exact entry starts with "2", so every such score lands here.
        description = "Losing interest"
    elif score.startswith("1"):
        description = "About to dump"
    else:
        description = "Average client"

    st.write(f"The customer can be described as: **{description}**")


def plot_rfm_distribution(
    df_rfm: pd.DataFrame, cluster_info: Dict[str, Tuple[List[int], List[int]]]
):
    """Plot one log-scale histogram per RFM metric with cluster boundaries.

    For each metric a dotted vertical line marks the maximum value of
    every cluster except one: the last cluster for ``Revenue`` and
    ``Frequency``, the first cluster for ``Recency`` (whose clusters are
    visited in reverse).

    Args:
        df_rfm: RFM dataframe with the columns ``Revenue``, ``Frequency``
            and ``Recency``.
        cluster_info: Mapping from ``<metric>_cluster`` to a pair
            ``(mins, maxs)``; only the maxima (second element) are used,
            indexed by position ``cluster_number - 1``.
    """
    for x, to_reverse in zip(("Revenue", "Frequency", "Recency"), (False, False, True)):
        fig = px.histogram(
            df_rfm,
            x=x,
            log_y=True,
            title=f"{x} metric",
        )

        # Per-cluster maxima of this metric.
        values = cluster_info[f"{x}_cluster"][1]

        if to_reverse:
            # Walk from the last cluster down to the second one,
            # leaving out the first cluster (position 0).
            loop_range = range(len(values) - 1, 0, -1)
        else:
            # Walk every cluster but the last one.
            loop_range = range(len(values) - 1)

        for n_cluster in loop_range:
            fig.add_vline(
                x=values[n_cluster],
                annotation_text=f"End of cluster {n_cluster+1}",
                line_dash="dot",
                annotation=dict(textangle=90, font_color="red"),
            )

        fig.update_layout(
            yaxis_title="Count (log scale)",
        )

        st.plotly_chart(fig)


def display_dataframe_heatmap(df_rfm: pd.DataFrame, cluster_info_dict):
    """Display a heatmap of how many clients fall into each cluster triple.

    Rows are indexed by (revenue cluster, frequency cluster), columns by
    recency cluster, and every cluster is labelled with its value range.
    The table is rendered through the pandas ``Styler`` HTML output.

    Args:
        df_rfm: RFM dataframe with ``CustomerID`` and the three
            ``*_cluster`` columns.
        cluster_info_dict: Mapping from ``<metric>_cluster`` to a pair
            ``(mins, maxs)`` indexed by ``cluster_number - 1``.
    """
    cluster_columns = ["Revenue_cluster", "Frequency_cluster", "Recency_cluster"]

    def make_labeller(column):
        """Return a function turning a cluster number into 'n: [min, max]'."""

        def to_label(cluster_number):
            position = cluster_number - 1
            lower = cluster_info_dict[column][0][position]
            upper = cluster_info_dict[column][1][position]
            return f"{cluster_number}: [{int(lower)}, {int(upper)}]"

        return to_label

    # Number of clients for each combination of the three clusters.
    per_combination = df_rfm.groupby(
        ["Recency_cluster", "Frequency_cluster", "Revenue_cluster"]
    )
    count = per_combination["CustomerID"].count().reset_index(name="Count")

    # Keep a single row per combination.
    count = count.drop_duplicates(cluster_columns)

    # Replace the bare cluster numbers with "number: [min, max]" labels so
    # the table headers are self-explanatory.
    for column in cluster_columns:
        count[column] = count[column].apply(make_labeller(column))

    # Counts become the cell values; the clusters become the axes.
    count = count.pivot(
        index=["Revenue_cluster", "Frequency_cluster"],
        columns="Recency_cluster",
        values="Count",
    )

    # CSS rules for the data cells, the index names and the other headers.
    data_cell_style = {
        "selector": "td",
        "props": "font-size:1.2em",
    }
    index_name_style = {
        "selector": ".index_name",
        "props": "font-style: italic; color: Black; font-weight:normal;font-size:1.2em;",
    }
    header_style = {
        "selector": "th:not(.index_name)",
        "props": "background-color: White; color: black; font-size:1.2em",
    }

    # The table is written as raw HTML rather than passed as a dataframe
    # because (per the original author) the Streamlit dataframe display
    # drops the multi-index.
    st.markdown("## Heatmap: how the client are distributed between clusters")
    styled = count.style.format(thousands=" ", precision=0, na_rep="0")
    styled = styled.set_table_styles([data_cell_style, index_name_style, header_style])
    styled = styled.background_gradient(cmap="coolwarm")
    st.write(styled.to_html(), unsafe_allow_html=True)


def main():
    """Build the whole clustering page, top to bottom.

    Loads the data, clusters the clients, then shows in order: the RFM
    dataset, per-cluster statistics, the metric histograms, the cluster
    heatmap and an interactive single-customer exploration.
    """
    st.sidebar.markdown(SIDEBAR_DESCRIPTION)

    df, _, _ = load_and_preprocess_data()
    df_rfm = cluster_clients(df)

    st.markdown(
        "# Dataset "
        "\nThis is the processed dataset with information about the clients, such as"
        " the RFM values and the clusters they belong to."
    )
    st.dataframe(df_rfm.style.format(formatter={"Revenue": "{:.2f}"}))

    # For each cluster column: (per-cluster minima, per-cluster maxima).
    cluster_info_dict = defaultdict(list)
    cluster_columns = [name for name in df_rfm.columns if "_cluster" in name]

    with st.expander("Show more details about the clusters"):
        for cluster in cluster_columns:
            st.write(cluster)
            # "Revenue_cluster" -> "Revenue": the metric the cluster was built on.
            metric = cluster.split("_")[0]
            stats = df_rfm.groupby(cluster)[metric].describe()
            stats = stats.reset_index(names="Cluster")
            cluster_info_dict[cluster] = (
                stats["min"].astype(int),
                stats["max"].astype(int),
            )
            st.dataframe(stats)

    st.markdown("## RFM metric distribution")

    plot_rfm_distribution(df_rfm, cluster_info_dict)

    display_dataframe_heatmap(df_rfm, cluster_info_dict)

    st.markdown("## Interactive exploration")

    filter_by_cluster = st.checkbox(
        "Filter client: only one client per cluster type",
        value=True,
    )

    if filter_by_cluster:
        # One representative customer for each combination of clusters.
        per_combination = df_rfm.groupby(
            ["Recency_cluster", "Frequency_cluster", "Revenue_cluster"]
        )
        client_to_select = per_combination["CustomerID"].first().values
    else:
        client_to_select = df["CustomerID"].unique()

    # Let the user pick the customer to investigate.
    user = st.selectbox(
        "Select a customer to show more information about him.",
        client_to_select,
    )

    show_purhcase_history(user, df)

    recency, frequency, revenue = show_user_info(user, df_rfm)

    categorize_user(recency, frequency, revenue)

    explain_cluster(cluster_info_dict)


# Entry point: the call is unconditional (there is no `__main__` guard), so the
# page is built whenever this file is executed or imported.
main()