{
"cells": [
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"from sklearn.metrics.pairwise import cosine_similarity"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Movie metadata, keyed by movieId so rows can be looked up by movie id.\n",
"# NOTE(review): the loaded frame displays an 'Unnamed: 0' column, presumably an\n",
"# index written by an earlier export - confirm whether it should be dropped.\n",
"movies = pd.read_csv(\n",
"    '../data/reduced/movies_m10_rich_pre.csv',\n",
"    index_col='movieId',\n",
")\n",
"# One row per rating event (userId, movieId, rating, timestamp).\n",
"ratings = pd.read_csv('../data/reduced/ratings_m10.csv')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" userId | \n",
" movieId | \n",
" rating | \n",
" timestamp | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 4.0 | \n",
" 964982703 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" userId movieId rating timestamp\n",
"0 1 1 4.0 964982703"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Show the first ratings row as a quick check of the loaded columns.\n",
"ratings.head(1)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Unnamed: 0 | \n",
" tmdbId | \n",
" imdbId | \n",
" cast | \n",
" director | \n",
" keywords | \n",
" overview | \n",
" title | \n",
" genres | \n",
" year | \n",
"
\n",
" \n",
" movieId | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 1 | \n",
" 1 | \n",
" 862 | \n",
" 114709 | \n",
" ['Tom Hanks', 'Tim Allen', 'Don Rickles', 'Jim... | \n",
" John Lasseter | \n",
" ['jealousy', 'toy', 'boy', 'friendship', 'frie... | \n",
" Woody the cowboy is young Andy’s favorite to... | \n",
" Toy Story | \n",
" ['Adventure', 'Animation', 'Children', 'Comedy... | \n",
" 1995 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Unnamed: 0 tmdbId imdbId \\\n",
"movieId \n",
"1 1 862 114709 \n",
"\n",
" cast director \\\n",
"movieId \n",
"1 ['Tom Hanks', 'Tim Allen', 'Don Rickles', 'Jim... John Lasseter \n",
"\n",
" keywords \\\n",
"movieId \n",
"1 ['jealousy', 'toy', 'boy', 'friendship', 'frie... \n",
"\n",
" overview title \\\n",
"movieId \n",
"1 Woody the cowboy is young Andy’s favorite to... Toy Story \n",
"\n",
" genres year \n",
"movieId \n",
"1 ['Adventure', 'Animation', 'Children', 'Comedy... 1995 "
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Show the first movies row as a quick check of the loaded columns.\n",
"movies.head(1)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# Left disabled: when uncommented, restricts `ratings` to its first 200 rows.\n",
"#ratings = ratings.head(200)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"def train_test_column_split(df, group_column, split_column, y_label, train_size):\n",
"    \"\"\"Split ``df`` into train and test sets, group by group.\n",
"\n",
"    Within every distinct value of ``group_column`` the ``train_size``\n",
"    quantile of ``split_column`` is used as a cut-off: rows at or below the\n",
"    cut-off go to the training set, rows strictly above it go to the test\n",
"    set. The two sets are therefore disjoint.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    df : pandas.DataFrame\n",
"        Frame to split; it is not modified.\n",
"    group_column : str\n",
"        Column whose values define the groups.\n",
"    split_column : str\n",
"        Column whose per-group quantile defines the cut-off.\n",
"    y_label : str\n",
"        Column returned as the target; every other column is a feature.\n",
"    train_size : float\n",
"        Quantile in [0, 1] passed to ``quantile``.\n",
"\n",
"    Returns\n",
"    -------\n",
"    tuple\n",
"        ``(X_train, X_test, y_train, y_test)``, each ordered by the index\n",
"        of ``df``.\n",
"    \"\"\"\n",
"    # One cut-off per group, broadcast back onto the rows of that group.\n",
"    cutoffs = df.groupby(group_column)[split_column].quantile(train_size)\n",
"    row_cutoff = df[group_column].map(cutoffs)\n",
"\n",
"    # `le` and `gt` are complementary, so a row sitting exactly on the cut-off\n",
"    # is assigned to train only (using `ge` here would also copy it into test).\n",
"    # Rows with a missing group or split value compare False in both masks\n",
"    # and are left out of both sets.\n",
"    in_train = df[split_column].le(row_cutoff)\n",
"    in_test = df[split_column].gt(row_cutoff)\n",
"\n",
"    # Boolean selection keeps the original dtypes and avoids growing frames\n",
"    # with repeated concatenation.\n",
"    train = df.loc[in_train].sort_index(ascending=True)\n",
"    test = df.loc[in_test].sort_index(ascending=True)\n",
"\n",
"    X_labels = [c for c in df.columns if c != y_label]\n",
"\n",
"    return (train[X_labels], test[X_labels], train[y_label], test[y_label])"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# Split every user's ratings at the 0.9 quantile of that user's timestamps;\n",
"# 'rating' is the target, the remaining columns are the features.\n",
"X_train, X_test, y_train, y_test = train_test_column_split(\n",
"    ratings,\n",
"    group_column='userId',\n",
"    split_column='timestamp',\n",
"    y_label='rating',\n",
"    train_size=.9,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" userId | \n",
" movieId | \n",
" timestamp | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 964982703 | \n",
"
\n",
" \n",
" 1 | \n",
" 1 | \n",
" 3 | \n",
" 964981247 | \n",
"
\n",
" \n",
" 2 | \n",
" 1 | \n",
" 6 | \n",
" 964982224 | \n",
"
\n",
" \n",
" 4 | \n",
" 1 | \n",
" 50 | \n",
" 964982931 | \n",
"
\n",
" \n",
" 5 | \n",
" 1 | \n",
" 70 | \n",
" 964982400 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 81109 | \n",
" 610 | \n",
" 157296 | \n",
" 1493846563 | \n",
"
\n",
" \n",
" 81110 | \n",
" 610 | \n",
" 158238 | \n",
" 1479545219 | \n",
"
\n",
" \n",
" 81111 | \n",
" 610 | \n",
" 159093 | \n",
" 1493847704 | \n",
"
\n",
" \n",
" 81112 | \n",
" 610 | \n",
" 164179 | \n",
" 1493845631 | \n",
"
\n",
" \n",
" 81115 | \n",
" 610 | \n",
" 168252 | \n",
" 1493846352 | \n",
"
\n",
" \n",
"
\n",
"
72991 rows × 3 columns
\n",
"
"
],
"text/plain": [
" userId movieId timestamp\n",
"0 1 1 964982703\n",
"1 1 3 964981247\n",
"2 1 6 964982224\n",
"4 1 50 964982931\n",
"5 1 70 964982400\n",
"... ... ... ...\n",
"81109 610 157296 1493846563\n",
"81110 610 158238 1479545219\n",
"81111 610 159093 1493847704\n",
"81112 610 164179 1493845631\n",
"81115 610 168252 1493846352\n",
"\n",
"[72991 rows x 3 columns]"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Display the training features returned by the split.\n",
"X_train"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 4.0\n",
"1 4.0\n",
"2 4.0\n",
"4 5.0\n",
"5 3.0\n",
" ... \n",
"81109 4.0\n",
"81110 5.0\n",
"81111 3.0\n",
"81112 5.0\n",
"81115 5.0\n",
"Name: rating, Length: 72991, dtype: float64"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Display the training targets (the 'rating' column) returned by the split.\n",
"y_train"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" userId | \n",
" movieId | \n",
" timestamp | \n",
"
\n",
" \n",
" \n",
" \n",
" 3 | \n",
" 1 | \n",
" 47 | \n",
" 964983815 | \n",
"
\n",
" \n",
" 8 | \n",
" 1 | \n",
" 151 | \n",
" 964984041 | \n",
"
\n",
" \n",
" 9 | \n",
" 1 | \n",
" 157 | \n",
" 964984100 | \n",
"
\n",
" \n",
" 28 | \n",
" 1 | \n",
" 527 | \n",
" 964984002 | \n",
"
\n",
" \n",
" 31 | \n",
" 1 | \n",
" 553 | \n",
" 964984153 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 81075 | \n",
" 610 | \n",
" 115149 | \n",
" 1493849607 | \n",
"
\n",
" \n",
" 81076 | \n",
" 610 | \n",
" 115210 | \n",
" 1493849803 | \n",
"
\n",
" \n",
" 81105 | \n",
" 610 | \n",
" 142488 | \n",
" 1493849575 | \n",
"
\n",
" \n",
" 81113 | \n",
" 610 | \n",
" 166528 | \n",
" 1493879365 | \n",
"
\n",
" \n",
" 81114 | \n",
" 610 | \n",
" 168250 | \n",
" 1494273047 | \n",
"
\n",
" \n",
"
\n",
"
8621 rows × 3 columns
\n",
"
"
],
"text/plain": [
" userId movieId timestamp\n",
"3 1 47 964983815\n",
"8 1 151 964984041\n",
"9 1 157 964984100\n",
"28 1 527 964984002\n",
"31 1 553 964984153\n",
"... ... ... ...\n",
"81075 610 115149 1493849607\n",
"81076 610 115210 1493849803\n",
"81105 610 142488 1493849575\n",
"81113 610 166528 1493879365\n",
"81114 610 168250 1494273047\n",
"\n",
"[8621 rows x 3 columns]"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Display the test features returned by the split.\n",
"X_test"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"3 5.0\n",
"8 5.0\n",
"9 5.0\n",
"28 5.0\n",
"31 5.0\n",
" ... \n",
"81075 5.0\n",
"81076 4.0\n",
"81105 3.5\n",
"81113 4.0\n",
"81114 5.0\n",
"Name: rating, Length: 8621, dtype: float64"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Display the test targets (the 'rating' column) returned by the split.\n",
"y_test"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"# Persist the split: feature frames as CSV, target values as .npy arrays.\n",
"# The .npy files hold values only, so they line up with the CSV rows by position.\n",
"X_train.to_csv('../data/splitted/X_train_90.csv')\n",
"X_test.to_csv('../data/splitted/X_test_90.csv')\n",
"np.save('../data/splitted/y_train_90.npy', y_train.to_numpy())\n",
"np.save('../data/splitted/y_test_90.npy', y_test.to_numpy())"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}