Spaces:
Runtime error
Runtime error
File size: 1,467 Bytes
e618873 66cc316 e618873 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 |
import streamlit as st
import pandas as pd
# NOTE(review): `st.cache` is deprecated in recent Streamlit releases in favour
# of `st.cache_data`; confirm the pinned Streamlit version before switching.
@st.cache
def load_and_preprocess_data():
    """Load the Online Retail CSV and prepare it for downstream use.

    The raw file is read from ``Data/OnlineRetail.csv`` (latin-1 encoded),
    cleaned, and enriched with a few derived columns:

    * rows containing any NaN are dropped;
    * only rows with a strictly positive ``Quantity`` are kept;
    * ``InvoiceDate`` is parsed, floored to the day and shifted forward by
      ten years;
    * ``CustomerID`` is cast to ``int``;
    * ``Price`` is added as ``Quantity * UnitPrice``;
    * ``CustomerIndex`` / ``ProductIndex`` are added as integer codes giving
      the position of each ``CustomerID`` / ``StockCode`` in the sorted list
      of unique values.

    Returns:
        A tuple ``(df, user_idx, product_idx)`` where ``df`` is the processed
        DataFrame and ``user_idx`` / ``product_idx`` are the Series of
        integer codes that were also stored in ``df["CustomerIndex"]`` and
        ``df["ProductIndex"]``.
    """
    df = pd.read_csv(
        "Data/OnlineRetail.csv",
        encoding="latin-1",
    )

    # Remove rows with NaN values.
    df = df.dropna()

    # Use only positive quantities. This is not a robust approach, but it is
    # good enough to keep things simple.
    # The explicit copy makes the filtered frame independent of the original,
    # so the column assignments below do not trigger pandas'
    # SettingWithCopyWarning.
    df = df[df["Quantity"] > 0].copy()

    # Parse the date column, truncate it to the day and add 10 years
    # (the shift is only there for nicer visualisation).
    df["InvoiceDate"] = pd.to_datetime(df["InvoiceDate"]).dt.floor(
        "d"
    ) + pd.offsets.DateOffset(years=10)

    # Change customer id to int.
    df["CustomerID"] = df["CustomerID"].astype(int)

    # Add price column (total value of the invoice line).
    df["Price"] = df["Quantity"] * df["UnitPrice"]

    # Get unique entries in the dataset of users and products.
    users = df["CustomerID"].unique()
    products = df["StockCode"].unique()

    # Create an ordered categorical type for users and products. The
    # categories are sorted so that the resulting codes are reproducible.
    user_cat = pd.CategoricalDtype(categories=sorted(users), ordered=True)
    product_cat = pd.CategoricalDtype(categories=sorted(products), ordered=True)

    # Transform the columns and take the integer category codes.
    user_idx = df["CustomerID"].astype(user_cat).cat.codes
    product_idx = df["StockCode"].astype(product_cat).cat.codes

    # Add the categorical indexes to the dataframe.
    df["CustomerIndex"] = user_idx
    df["ProductIndex"] = product_idx

    return df, user_idx, product_idx
|