Spaces:
Sleeping
Sleeping
initial commit
Browse files- Demo Data.ipynb +718 -0
- README.md +4 -7
- app.py +57 -0
- dataset/books.csv +0 -0
- dataset/ratings.csv +0 -0
- dataset/users.csv +0 -0
- preprocessing.py +79 -0
- processed/R.npy +3 -0
- processed/Y.npy +3 -0
- processed/book_id_map.json +1 -0
- processed/summary_book.csv +0 -0
- processed/user_id_map.json +1 -0
- recommend.py +55 -0
- requirements.txt +51 -0
- train.py +86 -0
- utils_c.py +40 -0
- weight/W.npy +3 -0
- weight/X.npy +3 -0
- weight/b.npy +3 -0
- weight/predicted.npy +3 -0
Demo Data.ipynb
ADDED
@@ -0,0 +1,718 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "markdown",
|
5 |
+
"id": "6c97a769",
|
6 |
+
"metadata": {},
|
7 |
+
"source": [
|
8 |
+
"# Overview data\n",
|
9 |
+
"\n",
|
10 |
+
"**Note: In this notebook, I assume the dataset is cleaned and ignore EDA.**"
|
11 |
+
]
|
12 |
+
},
|
13 |
+
{
|
14 |
+
"cell_type": "code",
|
15 |
+
"execution_count": 1,
|
16 |
+
"id": "a54afd58",
|
17 |
+
"metadata": {},
|
18 |
+
"outputs": [],
|
19 |
+
"source": [
|
20 |
+
"import pandas as pd\n",
|
21 |
+
"import numpy as np\n",
|
22 |
+
"import warnings\n",
|
23 |
+
"\n",
|
24 |
+
"warnings.filterwarnings(\"ignore\")"
|
25 |
+
]
|
26 |
+
},
|
27 |
+
{
|
28 |
+
"cell_type": "code",
|
29 |
+
"execution_count": 2,
|
30 |
+
"id": "cdb44c97",
|
31 |
+
"metadata": {},
|
32 |
+
"outputs": [
|
33 |
+
{
|
34 |
+
"data": {
|
35 |
+
"text/html": [
|
36 |
+
"<div>\n",
|
37 |
+
"<style scoped>\n",
|
38 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
39 |
+
" vertical-align: middle;\n",
|
40 |
+
" }\n",
|
41 |
+
"\n",
|
42 |
+
" .dataframe tbody tr th {\n",
|
43 |
+
" vertical-align: top;\n",
|
44 |
+
" }\n",
|
45 |
+
"\n",
|
46 |
+
" .dataframe thead th {\n",
|
47 |
+
" text-align: right;\n",
|
48 |
+
" }\n",
|
49 |
+
"</style>\n",
|
50 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
51 |
+
" <thead>\n",
|
52 |
+
" <tr style=\"text-align: right;\">\n",
|
53 |
+
" <th></th>\n",
|
54 |
+
" <th>User-ID</th>\n",
|
55 |
+
" <th>Location</th>\n",
|
56 |
+
" <th>Age</th>\n",
|
57 |
+
" </tr>\n",
|
58 |
+
" </thead>\n",
|
59 |
+
" <tbody>\n",
|
60 |
+
" <tr>\n",
|
61 |
+
" <th>0</th>\n",
|
62 |
+
" <td>1</td>\n",
|
63 |
+
" <td>nyc, new york, usa</td>\n",
|
64 |
+
" <td>NaN</td>\n",
|
65 |
+
" </tr>\n",
|
66 |
+
" <tr>\n",
|
67 |
+
" <th>1</th>\n",
|
68 |
+
" <td>2</td>\n",
|
69 |
+
" <td>stockton, california, usa</td>\n",
|
70 |
+
" <td>18.0</td>\n",
|
71 |
+
" </tr>\n",
|
72 |
+
" <tr>\n",
|
73 |
+
" <th>2</th>\n",
|
74 |
+
" <td>3</td>\n",
|
75 |
+
" <td>moscow, yukon territory, russia</td>\n",
|
76 |
+
" <td>NaN</td>\n",
|
77 |
+
" </tr>\n",
|
78 |
+
" <tr>\n",
|
79 |
+
" <th>3</th>\n",
|
80 |
+
" <td>4</td>\n",
|
81 |
+
" <td>porto, v.n.gaia, portugal</td>\n",
|
82 |
+
" <td>17.0</td>\n",
|
83 |
+
" </tr>\n",
|
84 |
+
" <tr>\n",
|
85 |
+
" <th>4</th>\n",
|
86 |
+
" <td>5</td>\n",
|
87 |
+
" <td>farnborough, hants, united kingdom</td>\n",
|
88 |
+
" <td>NaN</td>\n",
|
89 |
+
" </tr>\n",
|
90 |
+
" </tbody>\n",
|
91 |
+
"</table>\n",
|
92 |
+
"</div>"
|
93 |
+
],
|
94 |
+
"text/plain": [
|
95 |
+
" User-ID Location Age\n",
|
96 |
+
"0 1 nyc, new york, usa NaN\n",
|
97 |
+
"1 2 stockton, california, usa 18.0\n",
|
98 |
+
"2 3 moscow, yukon territory, russia NaN\n",
|
99 |
+
"3 4 porto, v.n.gaia, portugal 17.0\n",
|
100 |
+
"4 5 farnborough, hants, united kingdom NaN"
|
101 |
+
]
|
102 |
+
},
|
103 |
+
"execution_count": 2,
|
104 |
+
"metadata": {},
|
105 |
+
"output_type": "execute_result"
|
106 |
+
}
|
107 |
+
],
|
108 |
+
"source": [
|
109 |
+
"path = \"./dataset\"\n",
|
110 |
+
"\n",
|
111 |
+
"# user dataset\n",
|
112 |
+
"user_df = pd.read_csv(f\"{path}/users.csv\", delimiter=';', encoding='ISO-8859-1')\n",
|
113 |
+
"user_df.head()"
|
114 |
+
]
|
115 |
+
},
|
116 |
+
{
|
117 |
+
"cell_type": "code",
|
118 |
+
"execution_count": 3,
|
119 |
+
"id": "fe62dfa3",
|
120 |
+
"metadata": {},
|
121 |
+
"outputs": [
|
122 |
+
{
|
123 |
+
"name": "stderr",
|
124 |
+
"output_type": "stream",
|
125 |
+
"text": [
|
126 |
+
"b'Skipping line 6452: expected 8 fields, saw 9\\nSkipping line 43667: expected 8 fields, saw 10\\nSkipping line 51751: expected 8 fields, saw 9\\n'\n",
|
127 |
+
"b'Skipping line 92038: expected 8 fields, saw 9\\nSkipping line 104319: expected 8 fields, saw 9\\nSkipping line 121768: expected 8 fields, saw 9\\n'\n",
|
128 |
+
"b'Skipping line 144058: expected 8 fields, saw 9\\nSkipping line 150789: expected 8 fields, saw 9\\nSkipping line 157128: expected 8 fields, saw 9\\nSkipping line 180189: expected 8 fields, saw 9\\nSkipping line 185738: expected 8 fields, saw 9\\n'\n",
|
129 |
+
"b'Skipping line 209388: expected 8 fields, saw 9\\nSkipping line 220626: expected 8 fields, saw 9\\nSkipping line 227933: expected 8 fields, saw 11\\nSkipping line 228957: expected 8 fields, saw 10\\nSkipping line 245933: expected 8 fields, saw 9\\nSkipping line 251296: expected 8 fields, saw 9\\nSkipping line 259941: expected 8 fields, saw 9\\nSkipping line 261529: expected 8 fields, saw 9\\n'\n"
|
130 |
+
]
|
131 |
+
},
|
132 |
+
{
|
133 |
+
"data": {
|
134 |
+
"text/html": [
|
135 |
+
"<div>\n",
|
136 |
+
"<style scoped>\n",
|
137 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
138 |
+
" vertical-align: middle;\n",
|
139 |
+
" }\n",
|
140 |
+
"\n",
|
141 |
+
" .dataframe tbody tr th {\n",
|
142 |
+
" vertical-align: top;\n",
|
143 |
+
" }\n",
|
144 |
+
"\n",
|
145 |
+
" .dataframe thead th {\n",
|
146 |
+
" text-align: right;\n",
|
147 |
+
" }\n",
|
148 |
+
"</style>\n",
|
149 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
150 |
+
" <thead>\n",
|
151 |
+
" <tr style=\"text-align: right;\">\n",
|
152 |
+
" <th></th>\n",
|
153 |
+
" <th>ISBN</th>\n",
|
154 |
+
" <th>Book-Title</th>\n",
|
155 |
+
" <th>Book-Author</th>\n",
|
156 |
+
" <th>Year-Of-Publication</th>\n",
|
157 |
+
" <th>Publisher</th>\n",
|
158 |
+
" <th>Image-URL-S</th>\n",
|
159 |
+
" <th>Image-URL-M</th>\n",
|
160 |
+
" <th>Image-URL-L</th>\n",
|
161 |
+
" </tr>\n",
|
162 |
+
" </thead>\n",
|
163 |
+
" <tbody>\n",
|
164 |
+
" <tr>\n",
|
165 |
+
" <th>0</th>\n",
|
166 |
+
" <td>0195153448</td>\n",
|
167 |
+
" <td>Classical Mythology</td>\n",
|
168 |
+
" <td>Mark P. O. Morford</td>\n",
|
169 |
+
" <td>2002</td>\n",
|
170 |
+
" <td>Oxford University Press</td>\n",
|
171 |
+
" <td>http://images.amazon.com/images/P/0195153448.0...</td>\n",
|
172 |
+
" <td>http://images.amazon.com/images/P/0195153448.0...</td>\n",
|
173 |
+
" <td>http://images.amazon.com/images/P/0195153448.0...</td>\n",
|
174 |
+
" </tr>\n",
|
175 |
+
" <tr>\n",
|
176 |
+
" <th>1</th>\n",
|
177 |
+
" <td>0002005018</td>\n",
|
178 |
+
" <td>Clara Callan</td>\n",
|
179 |
+
" <td>Richard Bruce Wright</td>\n",
|
180 |
+
" <td>2001</td>\n",
|
181 |
+
" <td>HarperFlamingo Canada</td>\n",
|
182 |
+
" <td>http://images.amazon.com/images/P/0002005018.0...</td>\n",
|
183 |
+
" <td>http://images.amazon.com/images/P/0002005018.0...</td>\n",
|
184 |
+
" <td>http://images.amazon.com/images/P/0002005018.0...</td>\n",
|
185 |
+
" </tr>\n",
|
186 |
+
" <tr>\n",
|
187 |
+
" <th>2</th>\n",
|
188 |
+
" <td>0060973129</td>\n",
|
189 |
+
" <td>Decision in Normandy</td>\n",
|
190 |
+
" <td>Carlo D'Este</td>\n",
|
191 |
+
" <td>1991</td>\n",
|
192 |
+
" <td>HarperPerennial</td>\n",
|
193 |
+
" <td>http://images.amazon.com/images/P/0060973129.0...</td>\n",
|
194 |
+
" <td>http://images.amazon.com/images/P/0060973129.0...</td>\n",
|
195 |
+
" <td>http://images.amazon.com/images/P/0060973129.0...</td>\n",
|
196 |
+
" </tr>\n",
|
197 |
+
" <tr>\n",
|
198 |
+
" <th>3</th>\n",
|
199 |
+
" <td>0374157065</td>\n",
|
200 |
+
" <td>Flu: The Story of the Great Influenza Pandemic...</td>\n",
|
201 |
+
" <td>Gina Bari Kolata</td>\n",
|
202 |
+
" <td>1999</td>\n",
|
203 |
+
" <td>Farrar Straus Giroux</td>\n",
|
204 |
+
" <td>http://images.amazon.com/images/P/0374157065.0...</td>\n",
|
205 |
+
" <td>http://images.amazon.com/images/P/0374157065.0...</td>\n",
|
206 |
+
" <td>http://images.amazon.com/images/P/0374157065.0...</td>\n",
|
207 |
+
" </tr>\n",
|
208 |
+
" <tr>\n",
|
209 |
+
" <th>4</th>\n",
|
210 |
+
" <td>0393045218</td>\n",
|
211 |
+
" <td>The Mummies of Urumchi</td>\n",
|
212 |
+
" <td>E. J. W. Barber</td>\n",
|
213 |
+
" <td>1999</td>\n",
|
214 |
+
" <td>W. W. Norton &amp; Company</td>\n",
|
215 |
+
" <td>http://images.amazon.com/images/P/0393045218.0...</td>\n",
|
216 |
+
" <td>http://images.amazon.com/images/P/0393045218.0...</td>\n",
|
217 |
+
" <td>http://images.amazon.com/images/P/0393045218.0...</td>\n",
|
218 |
+
" </tr>\n",
|
219 |
+
" </tbody>\n",
|
220 |
+
"</table>\n",
|
221 |
+
"</div>"
|
222 |
+
],
|
223 |
+
"text/plain": [
|
224 |
+
" ISBN Book-Title \\\n",
|
225 |
+
"0 0195153448 Classical Mythology \n",
|
226 |
+
"1 0002005018 Clara Callan \n",
|
227 |
+
"2 0060973129 Decision in Normandy \n",
|
228 |
+
"3 0374157065 Flu: The Story of the Great Influenza Pandemic... \n",
|
229 |
+
"4 0393045218 The Mummies of Urumchi \n",
|
230 |
+
"\n",
|
231 |
+
" Book-Author Year-Of-Publication Publisher \\\n",
|
232 |
+
"0 Mark P. O. Morford 2002 Oxford University Press \n",
|
233 |
+
"1 Richard Bruce Wright 2001 HarperFlamingo Canada \n",
|
234 |
+
"2 Carlo D'Este 1991 HarperPerennial \n",
|
235 |
+
"3 Gina Bari Kolata 1999 Farrar Straus Giroux \n",
|
236 |
+
"4 E. J. W. Barber 1999 W. W. Norton & Company \n",
|
237 |
+
"\n",
|
238 |
+
" Image-URL-S \\\n",
|
239 |
+
"0 http://images.amazon.com/images/P/0195153448.0... \n",
|
240 |
+
"1 http://images.amazon.com/images/P/0002005018.0... \n",
|
241 |
+
"2 http://images.amazon.com/images/P/0060973129.0... \n",
|
242 |
+
"3 http://images.amazon.com/images/P/0374157065.0... \n",
|
243 |
+
"4 http://images.amazon.com/images/P/0393045218.0... \n",
|
244 |
+
"\n",
|
245 |
+
" Image-URL-M \\\n",
|
246 |
+
"0 http://images.amazon.com/images/P/0195153448.0... \n",
|
247 |
+
"1 http://images.amazon.com/images/P/0002005018.0... \n",
|
248 |
+
"2 http://images.amazon.com/images/P/0060973129.0... \n",
|
249 |
+
"3 http://images.amazon.com/images/P/0374157065.0... \n",
|
250 |
+
"4 http://images.amazon.com/images/P/0393045218.0... \n",
|
251 |
+
"\n",
|
252 |
+
" Image-URL-L \n",
|
253 |
+
"0 http://images.amazon.com/images/P/0195153448.0... \n",
|
254 |
+
"1 http://images.amazon.com/images/P/0002005018.0... \n",
|
255 |
+
"2 http://images.amazon.com/images/P/0060973129.0... \n",
|
256 |
+
"3 http://images.amazon.com/images/P/0374157065.0... \n",
|
257 |
+
"4 http://images.amazon.com/images/P/0393045218.0... "
|
258 |
+
]
|
259 |
+
},
|
260 |
+
"execution_count": 3,
|
261 |
+
"metadata": {},
|
262 |
+
"output_type": "execute_result"
|
263 |
+
}
|
264 |
+
],
|
265 |
+
"source": [
|
266 |
+
"# book dataset\n",
|
267 |
+
"book_df = pd.read_csv(f\"{path}/books.csv\", delimiter=';', encoding='ISO-8859-1', error_bad_lines=False)\n",
|
268 |
+
"book_df.head()"
|
269 |
+
]
|
270 |
+
},
|
271 |
+
{
|
272 |
+
"cell_type": "code",
|
273 |
+
"execution_count": 4,
|
274 |
+
"id": "d9fa4750",
|
275 |
+
"metadata": {},
|
276 |
+
"outputs": [
|
277 |
+
{
|
278 |
+
"data": {
|
279 |
+
"text/html": [
|
280 |
+
"<div>\n",
|
281 |
+
"<style scoped>\n",
|
282 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
283 |
+
" vertical-align: middle;\n",
|
284 |
+
" }\n",
|
285 |
+
"\n",
|
286 |
+
" .dataframe tbody tr th {\n",
|
287 |
+
" vertical-align: top;\n",
|
288 |
+
" }\n",
|
289 |
+
"\n",
|
290 |
+
" .dataframe thead th {\n",
|
291 |
+
" text-align: right;\n",
|
292 |
+
" }\n",
|
293 |
+
"</style>\n",
|
294 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
295 |
+
" <thead>\n",
|
296 |
+
" <tr style=\"text-align: right;\">\n",
|
297 |
+
" <th></th>\n",
|
298 |
+
" <th>User-ID</th>\n",
|
299 |
+
" <th>ISBN</th>\n",
|
300 |
+
" <th>Book-Rating</th>\n",
|
301 |
+
" </tr>\n",
|
302 |
+
" </thead>\n",
|
303 |
+
" <tbody>\n",
|
304 |
+
" <tr>\n",
|
305 |
+
" <th>0</th>\n",
|
306 |
+
" <td>276725</td>\n",
|
307 |
+
" <td>034545104X</td>\n",
|
308 |
+
" <td>0</td>\n",
|
309 |
+
" </tr>\n",
|
310 |
+
" <tr>\n",
|
311 |
+
" <th>1</th>\n",
|
312 |
+
" <td>276726</td>\n",
|
313 |
+
" <td>0155061224</td>\n",
|
314 |
+
" <td>5</td>\n",
|
315 |
+
" </tr>\n",
|
316 |
+
" <tr>\n",
|
317 |
+
" <th>2</th>\n",
|
318 |
+
" <td>276727</td>\n",
|
319 |
+
" <td>0446520802</td>\n",
|
320 |
+
" <td>0</td>\n",
|
321 |
+
" </tr>\n",
|
322 |
+
" <tr>\n",
|
323 |
+
" <th>3</th>\n",
|
324 |
+
" <td>276729</td>\n",
|
325 |
+
" <td>052165615X</td>\n",
|
326 |
+
" <td>3</td>\n",
|
327 |
+
" </tr>\n",
|
328 |
+
" <tr>\n",
|
329 |
+
" <th>4</th>\n",
|
330 |
+
" <td>276729</td>\n",
|
331 |
+
" <td>0521795028</td>\n",
|
332 |
+
" <td>6</td>\n",
|
333 |
+
" </tr>\n",
|
334 |
+
" <tr>\n",
|
335 |
+
" <th>5</th>\n",
|
336 |
+
" <td>276733</td>\n",
|
337 |
+
" <td>2080674722</td>\n",
|
338 |
+
" <td>0</td>\n",
|
339 |
+
" </tr>\n",
|
340 |
+
" <tr>\n",
|
341 |
+
" <th>6</th>\n",
|
342 |
+
" <td>276736</td>\n",
|
343 |
+
" <td>3257224281</td>\n",
|
344 |
+
" <td>8</td>\n",
|
345 |
+
" </tr>\n",
|
346 |
+
" <tr>\n",
|
347 |
+
" <th>7</th>\n",
|
348 |
+
" <td>276737</td>\n",
|
349 |
+
" <td>0600570967</td>\n",
|
350 |
+
" <td>6</td>\n",
|
351 |
+
" </tr>\n",
|
352 |
+
" <tr>\n",
|
353 |
+
" <th>8</th>\n",
|
354 |
+
" <td>276744</td>\n",
|
355 |
+
" <td>038550120X</td>\n",
|
356 |
+
" <td>7</td>\n",
|
357 |
+
" </tr>\n",
|
358 |
+
" <tr>\n",
|
359 |
+
" <th>9</th>\n",
|
360 |
+
" <td>276745</td>\n",
|
361 |
+
" <td>342310538</td>\n",
|
362 |
+
" <td>10</td>\n",
|
363 |
+
" </tr>\n",
|
364 |
+
" </tbody>\n",
|
365 |
+
"</table>\n",
|
366 |
+
"</div>"
|
367 |
+
],
|
368 |
+
"text/plain": [
|
369 |
+
" User-ID ISBN Book-Rating\n",
|
370 |
+
"0 276725 034545104X 0\n",
|
371 |
+
"1 276726 0155061224 5\n",
|
372 |
+
"2 276727 0446520802 0\n",
|
373 |
+
"3 276729 052165615X 3\n",
|
374 |
+
"4 276729 0521795028 6\n",
|
375 |
+
"5 276733 2080674722 0\n",
|
376 |
+
"6 276736 3257224281 8\n",
|
377 |
+
"7 276737 0600570967 6\n",
|
378 |
+
"8 276744 038550120X 7\n",
|
379 |
+
"9 276745 342310538 10"
|
380 |
+
]
|
381 |
+
},
|
382 |
+
"execution_count": 4,
|
383 |
+
"metadata": {},
|
384 |
+
"output_type": "execute_result"
|
385 |
+
}
|
386 |
+
],
|
387 |
+
"source": [
|
388 |
+
"# rating dataset\n",
|
389 |
+
"rating_df = pd.read_csv(f\"{path}/ratings.csv\", delimiter=';', encoding='ISO-8859-1')\n",
|
390 |
+
"rating_df.head(10)"
|
391 |
+
]
|
392 |
+
},
|
393 |
+
{
|
394 |
+
"cell_type": "code",
|
395 |
+
"execution_count": 5,
|
396 |
+
"id": "53c66ec4",
|
397 |
+
"metadata": {},
|
398 |
+
"outputs": [
|
399 |
+
{
|
400 |
+
"data": {
|
401 |
+
"text/plain": [
|
402 |
+
"Index(['User-ID', 'ISBN', 'Book-Rating'], dtype='object')"
|
403 |
+
]
|
404 |
+
},
|
405 |
+
"execution_count": 5,
|
406 |
+
"metadata": {},
|
407 |
+
"output_type": "execute_result"
|
408 |
+
}
|
409 |
+
],
|
410 |
+
"source": [
|
411 |
+
"rating_df.columns"
|
412 |
+
]
|
413 |
+
},
|
414 |
+
{
|
415 |
+
"cell_type": "code",
|
416 |
+
"execution_count": 6,
|
417 |
+
"id": "691767c0",
|
418 |
+
"metadata": {},
|
419 |
+
"outputs": [
|
420 |
+
{
|
421 |
+
"data": {
|
422 |
+
"text/html": [
|
423 |
+
"<div>\n",
|
424 |
+
"<style scoped>\n",
|
425 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
426 |
+
" vertical-align: middle;\n",
|
427 |
+
" }\n",
|
428 |
+
"\n",
|
429 |
+
" .dataframe tbody tr th {\n",
|
430 |
+
" vertical-align: top;\n",
|
431 |
+
" }\n",
|
432 |
+
"\n",
|
433 |
+
" .dataframe thead th {\n",
|
434 |
+
" text-align: right;\n",
|
435 |
+
" }\n",
|
436 |
+
"</style>\n",
|
437 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
438 |
+
" <thead>\n",
|
439 |
+
" <tr style=\"text-align: right;\">\n",
|
440 |
+
" <th></th>\n",
|
441 |
+
" <th>Mean-Rating</th>\n",
|
442 |
+
" <th>Num-Rating</th>\n",
|
443 |
+
" </tr>\n",
|
444 |
+
" <tr>\n",
|
445 |
+
" <th>ISBN</th>\n",
|
446 |
+
" <th></th>\n",
|
447 |
+
" <th></th>\n",
|
448 |
+
" </tr>\n",
|
449 |
+
" </thead>\n",
|
450 |
+
" <tbody>\n",
|
451 |
+
" <tr>\n",
|
452 |
+
" <th>0330299891</th>\n",
|
453 |
+
" <td>3.0</td>\n",
|
454 |
+
" <td>2</td>\n",
|
455 |
+
" </tr>\n",
|
456 |
+
" <tr>\n",
|
457 |
+
" <th>0375404120</th>\n",
|
458 |
+
" <td>1.5</td>\n",
|
459 |
+
" <td>2</td>\n",
|
460 |
+
" </tr>\n",
|
461 |
+
" <tr>\n",
|
462 |
+
" <th>0586045007</th>\n",
|
463 |
+
" <td>0.0</td>\n",
|
464 |
+
" <td>1</td>\n",
|
465 |
+
" </tr>\n",
|
466 |
+
" <tr>\n",
|
467 |
+
" <th>9022906116</th>\n",
|
468 |
+
" <td>3.5</td>\n",
|
469 |
+
" <td>2</td>\n",
|
470 |
+
" </tr>\n",
|
471 |
+
" <tr>\n",
|
472 |
+
" <th>9032803328</th>\n",
|
473 |
+
" <td>0.0</td>\n",
|
474 |
+
" <td>1</td>\n",
|
475 |
+
" </tr>\n",
|
476 |
+
" </tbody>\n",
|
477 |
+
"</table>\n",
|
478 |
+
"</div>"
|
479 |
+
],
|
480 |
+
"text/plain": [
|
481 |
+
" Mean-Rating Num-Rating\n",
|
482 |
+
"ISBN \n",
|
483 |
+
" 0330299891 3.0 2\n",
|
484 |
+
" 0375404120 1.5 2\n",
|
485 |
+
" 0586045007 0.0 1\n",
|
486 |
+
" 9022906116 3.5 2\n",
|
487 |
+
" 9032803328 0.0 1"
|
488 |
+
]
|
489 |
+
},
|
490 |
+
"execution_count": 6,
|
491 |
+
"metadata": {},
|
492 |
+
"output_type": "execute_result"
|
493 |
+
}
|
494 |
+
],
|
495 |
+
"source": [
|
496 |
+
"function = {\n",
|
497 |
+
" \"Book-Rating\": \"mean\",\n",
|
498 |
+
" \"User-ID\": \"count\"\n",
|
499 |
+
"}\n",
|
500 |
+
"\n",
|
501 |
+
"summary_rating = rating_df.groupby(\"ISBN\").agg(function, axis=0)\n",
|
502 |
+
"summary_rating = summary_rating.rename(columns={\"Book-Rating\": \"Mean-Rating\", \"User-ID\": \"Num-Rating\"})\n",
|
503 |
+
"summary_rating.head()"
|
504 |
+
]
|
505 |
+
},
|
506 |
+
{
|
507 |
+
"cell_type": "markdown",
|
508 |
+
"id": "3e20611a",
|
509 |
+
"metadata": {},
|
510 |
+
"source": [
|
511 |
+
"**Note:** In this repo, I only consider `book_df` and `rating_df`."
|
512 |
+
]
|
513 |
+
},
|
514 |
+
{
|
515 |
+
"cell_type": "code",
|
516 |
+
"execution_count": 7,
|
517 |
+
"id": "82e1b680",
|
518 |
+
"metadata": {},
|
519 |
+
"outputs": [
|
520 |
+
{
|
521 |
+
"data": {
|
522 |
+
"text/html": [
|
523 |
+
"<div>\n",
|
524 |
+
"<style scoped>\n",
|
525 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
526 |
+
" vertical-align: middle;\n",
|
527 |
+
" }\n",
|
528 |
+
"\n",
|
529 |
+
" .dataframe tbody tr th {\n",
|
530 |
+
" vertical-align: top;\n",
|
531 |
+
" }\n",
|
532 |
+
"\n",
|
533 |
+
" .dataframe thead th {\n",
|
534 |
+
" text-align: right;\n",
|
535 |
+
" }\n",
|
536 |
+
"</style>\n",
|
537 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
538 |
+
" <thead>\n",
|
539 |
+
" <tr style=\"text-align: right;\">\n",
|
540 |
+
" <th></th>\n",
|
541 |
+
" <th>ISBN</th>\n",
|
542 |
+
" <th>Book-Title</th>\n",
|
543 |
+
" <th>Book-Author</th>\n",
|
544 |
+
" <th>Year-Of-Publication</th>\n",
|
545 |
+
" <th>Publisher</th>\n",
|
546 |
+
" <th>Mean-Rating</th>\n",
|
547 |
+
" <th>Num-Rating</th>\n",
|
548 |
+
" </tr>\n",
|
549 |
+
" </thead>\n",
|
550 |
+
" <tbody>\n",
|
551 |
+
" <tr>\n",
|
552 |
+
" <th>0</th>\n",
|
553 |
+
" <td>0195153448</td>\n",
|
554 |
+
" <td>Classical Mythology</td>\n",
|
555 |
+
" <td>Mark P. O. Morford</td>\n",
|
556 |
+
" <td>2002</td>\n",
|
557 |
+
" <td>Oxford University Press</td>\n",
|
558 |
+
" <td>0.000000</td>\n",
|
559 |
+
" <td>1.0</td>\n",
|
560 |
+
" </tr>\n",
|
561 |
+
" <tr>\n",
|
562 |
+
" <th>1</th>\n",
|
563 |
+
" <td>0002005018</td>\n",
|
564 |
+
" <td>Clara Callan</td>\n",
|
565 |
+
" <td>Richard Bruce Wright</td>\n",
|
566 |
+
" <td>2001</td>\n",
|
567 |
+
" <td>HarperFlamingo Canada</td>\n",
|
568 |
+
" <td>4.928571</td>\n",
|
569 |
+
" <td>14.0</td>\n",
|
570 |
+
" </tr>\n",
|
571 |
+
" <tr>\n",
|
572 |
+
" <th>2</th>\n",
|
573 |
+
" <td>0060973129</td>\n",
|
574 |
+
" <td>Decision in Normandy</td>\n",
|
575 |
+
" <td>Carlo D'Este</td>\n",
|
576 |
+
" <td>1991</td>\n",
|
577 |
+
" <td>HarperPerennial</td>\n",
|
578 |
+
" <td>5.000000</td>\n",
|
579 |
+
" <td>3.0</td>\n",
|
580 |
+
" </tr>\n",
|
581 |
+
" <tr>\n",
|
582 |
+
" <th>3</th>\n",
|
583 |
+
" <td>0374157065</td>\n",
|
584 |
+
" <td>Flu: The Story of the Great Influenza Pandemic...</td>\n",
|
585 |
+
" <td>Gina Bari Kolata</td>\n",
|
586 |
+
" <td>1999</td>\n",
|
587 |
+
" <td>Farrar Straus Giroux</td>\n",
|
588 |
+
" <td>4.272727</td>\n",
|
589 |
+
" <td>11.0</td>\n",
|
590 |
+
" </tr>\n",
|
591 |
+
" <tr>\n",
|
592 |
+
" <th>4</th>\n",
|
593 |
+
" <td>0393045218</td>\n",
|
594 |
+
" <td>The Mummies of Urumchi</td>\n",
|
595 |
+
" <td>E. J. W. Barber</td>\n",
|
596 |
+
" <td>1999</td>\n",
|
597 |
+
" <td>W. W. Norton &amp; Company</td>\n",
|
598 |
+
" <td>0.000000</td>\n",
|
599 |
+
" <td>1.0</td>\n",
|
600 |
+
" </tr>\n",
|
601 |
+
" </tbody>\n",
|
602 |
+
"</table>\n",
|
603 |
+
"</div>"
|
604 |
+
],
|
605 |
+
"text/plain": [
|
606 |
+
" ISBN Book-Title \\\n",
|
607 |
+
"0 0195153448 Classical Mythology \n",
|
608 |
+
"1 0002005018 Clara Callan \n",
|
609 |
+
"2 0060973129 Decision in Normandy \n",
|
610 |
+
"3 0374157065 Flu: The Story of the Great Influenza Pandemic... \n",
|
611 |
+
"4 0393045218 The Mummies of Urumchi \n",
|
612 |
+
"\n",
|
613 |
+
" Book-Author Year-Of-Publication Publisher \\\n",
|
614 |
+
"0 Mark P. O. Morford 2002 Oxford University Press \n",
|
615 |
+
"1 Richard Bruce Wright 2001 HarperFlamingo Canada \n",
|
616 |
+
"2 Carlo D'Este 1991 HarperPerennial \n",
|
617 |
+
"3 Gina Bari Kolata 1999 Farrar Straus Giroux \n",
|
618 |
+
"4 E. J. W. Barber 1999 W. W. Norton & Company \n",
|
619 |
+
"\n",
|
620 |
+
" Mean-Rating Num-Rating \n",
|
621 |
+
"0 0.000000 1.0 \n",
|
622 |
+
"1 4.928571 14.0 \n",
|
623 |
+
"2 5.000000 3.0 \n",
|
624 |
+
"3 4.272727 11.0 \n",
|
625 |
+
"4 0.000000 1.0 "
|
626 |
+
]
|
627 |
+
},
|
628 |
+
"execution_count": 7,
|
629 |
+
"metadata": {},
|
630 |
+
"output_type": "execute_result"
|
631 |
+
}
|
632 |
+
],
|
633 |
+
"source": [
|
634 |
+
"df = book_df.merge(summary_rating, how=\"left\", left_on=\"ISBN\", right_on=\"ISBN\")\n",
|
635 |
+
"df.drop(columns=[\"Image-URL-S\", \"Image-URL-M\", \"Image-URL-L\"], inplace=True)\n",
|
636 |
+
"df.head()"
|
637 |
+
]
|
638 |
+
},
|
639 |
+
{
|
640 |
+
"cell_type": "code",
|
641 |
+
"execution_count": 9,
|
642 |
+
"id": "fb397a05",
|
643 |
+
"metadata": {},
|
644 |
+
"outputs": [
|
645 |
+
{
|
646 |
+
"data": {
|
647 |
+
"text/plain": [
|
648 |
+
"ISBN 0\n",
|
649 |
+
"Book-Title 0\n",
|
650 |
+
"Book-Author 1\n",
|
651 |
+
"Year-Of-Publication 0\n",
|
652 |
+
"Publisher 2\n",
|
653 |
+
"Mean-Rating 1209\n",
|
654 |
+
"Num-Rating 1209\n",
|
655 |
+
"dtype: int64"
|
656 |
+
]
|
657 |
+
},
|
658 |
+
"execution_count": 9,
|
659 |
+
"metadata": {},
|
660 |
+
"output_type": "execute_result"
|
661 |
+
}
|
662 |
+
],
|
663 |
+
"source": [
|
664 |
+
"df.isnull().sum()"
|
665 |
+
]
|
666 |
+
},
|
667 |
+
{
|
668 |
+
"cell_type": "code",
|
669 |
+
"execution_count": 10,
|
670 |
+
"id": "7c7139ed",
|
671 |
+
"metadata": {},
|
672 |
+
"outputs": [],
|
673 |
+
"source": [
|
674 |
+
"# Save\n",
|
675 |
+
"df.to_csv(f\"{path}/summary_book.csv\", index=False)"
|
676 |
+
]
|
677 |
+
}
|
678 |
+
],
|
679 |
+
"metadata": {
|
680 |
+
"kernelspec": {
|
681 |
+
"display_name": "Python 3",
|
682 |
+
"language": "python",
|
683 |
+
"name": "python3"
|
684 |
+
},
|
685 |
+
"language_info": {
|
686 |
+
"codemirror_mode": {
|
687 |
+
"name": "ipython",
|
688 |
+
"version": 3
|
689 |
+
},
|
690 |
+
"file_extension": ".py",
|
691 |
+
"mimetype": "text/x-python",
|
692 |
+
"name": "python",
|
693 |
+
"nbconvert_exporter": "python",
|
694 |
+
"pygments_lexer": "ipython3",
|
695 |
+
"version": "3.8.10"
|
696 |
+
},
|
697 |
+
"latex_envs": {
|
698 |
+
"LaTeX_envs_menu_present": true,
|
699 |
+
"autoclose": false,
|
700 |
+
"autocomplete": true,
|
701 |
+
"bibliofile": "biblio.bib",
|
702 |
+
"cite_by": "apalike",
|
703 |
+
"current_citInitial": 1,
|
704 |
+
"eqLabelWithNumbers": true,
|
705 |
+
"eqNumInitial": 1,
|
706 |
+
"hotkeys": {
|
707 |
+
"equation": "Ctrl-E",
|
708 |
+
"itemize": "Ctrl-I"
|
709 |
+
},
|
710 |
+
"labels_anchors": false,
|
711 |
+
"latex_user_defs": false,
|
712 |
+
"report_style_numbering": false,
|
713 |
+
"user_envs_cfg": false
|
714 |
+
}
|
715 |
+
},
|
716 |
+
"nbformat": 4,
|
717 |
+
"nbformat_minor": 5
|
718 |
+
}
|
README.md
CHANGED
@@ -1,12 +1,9 @@
|
|
1 |
---
|
2 |
title: Book Recommender System
|
3 |
-
emoji:
|
4 |
-
colorFrom:
|
5 |
-
colorTo:
|
6 |
sdk: streamlit
|
7 |
-
sdk_version: 1.21.0
|
8 |
app_file: app.py
|
9 |
pinned: false
|
10 |
-
---
|
11 |
-
|
12 |
-
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
1 |
---
|
2 |
title: Book Recommender System
|
3 |
+
emoji: 👀
|
4 |
+
colorFrom: purple
|
5 |
+
colorTo: purple
|
6 |
sdk: streamlit
|
|
|
7 |
app_file: app.py
|
8 |
pinned: false
|
9 |
+
---
|
|
|
|
app.py
ADDED
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
|
3 |
+
import streamlit as st
|
4 |
+
import numpy as np
|
5 |
+
import pandas as pd
|
6 |
+
|
7 |
+
|
8 |
+
# Parameters
|
9 |
+
data_dir = f'./processed'
|
10 |
+
weight_dir = f'./weight'
|
11 |
+
info_path = f'./processed/summary_book.csv'
|
12 |
+
num = 10
|
13 |
+
lb = 0
|
14 |
+
|
15 |
+
# Load R matrix from file
|
16 |
+
R = np.load(f'{data_dir}/R.npy', allow_pickle=True)
|
17 |
+
# Load prediction
|
18 |
+
prediction = np.load(f'{weight_dir}/predicted.npy', allow_pickle=True)
|
19 |
+
# Load dictionary from JSON file
|
20 |
+
with open(f'{data_dir}/user_id_map.json', 'r') as file:
|
21 |
+
user2id = json.load(file)
|
22 |
+
with open(f'{data_dir}/book_id_map.json', 'r') as file:
|
23 |
+
book2id = json.load(file)
|
24 |
+
|
25 |
+
|
26 |
+
# Define the input and output functions for Gradio
|
27 |
+
def recommend_books(user_id):
|
28 |
+
# Recommend
|
29 |
+
user_idx = user2id[str(user_id)]
|
30 |
+
predict = prediction[:, user_idx] # get prediction for user
|
31 |
+
predict_dict = {book: np.round(predict[idx], 2) for book, idx in book2id.items()}
|
32 |
+
# Load information about book
|
33 |
+
book_df = pd.read_csv(info_path)
|
34 |
+
book_df = book_df[book_df["Num-Rating"] > lb]
|
35 |
+
book_df['predict'] = book_df["ISBN"].map(predict_dict)
|
36 |
+
df = book_df.nlargest(num, "predict").reset_index(drop=True)
|
37 |
+
df["context"] = df.apply(
|
38 |
+
lambda book: f"{book['Book-Title']} ({book['Year-Of-Publication']}) - by {book['Book-Author']}", axis=1
|
39 |
+
)
|
40 |
+
|
41 |
+
return df['context'].values
|
42 |
+
|
43 |
+
st.title('Book Recommender System')
|
44 |
+
|
45 |
+
# Display dialogue box that contains content
|
46 |
+
user_id = st.selectbox(
|
47 |
+
'Enter your ID:',
|
48 |
+
user2id.keys()
|
49 |
+
)
|
50 |
+
|
51 |
+
# Setting a button
|
52 |
+
if st.button('Recommend'):
|
53 |
+
recommendations = recommend_books(user_id)
|
54 |
+
st.write('**_Your ID:_**', user_id)
|
55 |
+
st.write('**_Your top 10 recommendations:_**')
|
56 |
+
for num, i in enumerate(recommendations):
|
57 |
+
st.write(num + 1, ':', i)
|
dataset/books.csv
ADDED
Binary file (77.8 MB). View file
|
|
dataset/ratings.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
dataset/users.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
preprocessing.py
ADDED
@@ -0,0 +1,79 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
import yaml
|
3 |
+
import pandas as pd
|
4 |
+
import numpy as np
|
5 |
+
|
6 |
+
from pathlib import Path
|
7 |
+
from jsonargparse import ArgumentParser
|
8 |
+
|
9 |
+
|
10 |
+
def parse_args():
|
11 |
+
"""Parse command-line arguments."""
|
12 |
+
parser = ArgumentParser()
|
13 |
+
parser.add_argument("--rating_path", type=str, required=True, default="./dataset/ratings.csv")
|
14 |
+
parser.add_argument("--book_path", type=str, required=True, default="./dataset/books.csv")
|
15 |
+
parser.add_argument("--out_dir", type=str, required=True, default="./processed")
|
16 |
+
parser.add_argument("--limit", required=True, type=int, default=1000)
|
17 |
+
|
18 |
+
return vars(parser.parse_args())
|
19 |
+
|
20 |
+
|
21 |
+
def main(
    rating_path,
    book_path,
    out_dir,
    limit,
    **kwargs
):
    """Preprocess Book-Crossing ratings into training matrices and lookup tables.

    Reads up to ``limit`` rating rows, builds the dense rating matrix Y
    (books x users) and the indicator matrix R (1 where a rating exists),
    saves both as .npy files, writes ISBN/user-id -> matrix-index maps as
    JSON, and produces a per-book rating summary CSV joined with the book
    metadata.

    Args:
        rating_path (str): path to the ratings CSV (';'-separated).
        book_path (str): path to the books CSV (';'-separated).
        out_dir (str): directory where processed artifacts are written.
        limit (int): maximum number of rating rows to read.
        **kwargs: ignored extra CLI arguments.
    """
    data = pd.read_csv(rating_path, delimiter=';', nrows=limit, encoding='ISO-8859-1')

    # Build Y: rows are books (ISBN), columns are users; missing ratings -> 0.
    # NOTE(review): pivot raises if the same (ISBN, User-ID) pair appears twice.
    pivot = data.pivot(index='ISBN', columns='User-ID', values='Book-Rating')
    pivot = pivot.fillna(0)
    Y = pivot.values

    # R is the indicator matrix: 1 where a real rating exists, 0 elsewhere.
    R = np.where(Y != 0, 1, 0)

    # Create the output directory if needed. mkdir(exist_ok=True) still raises
    # if the path exists but is not a directory — clearer than the previous
    # assert, which is stripped under `python -O`.
    out_dir_path = Path(out_dir)
    out_dir_path.mkdir(parents=True, exist_ok=True)
    np.save(f'{out_dir_path}/Y.npy', Y)
    np.save(f'{out_dir_path}/R.npy', R)

    # Map each ISBN / user id to its row / column index in Y.
    # BUGFIX: the maps previously enumerated `data[...].unique()` (order of
    # first appearance), but `pivot` sorts its index/columns — so map indices
    # did not line up with Y's rows/columns. Enumerate the pivot axes instead.
    # Keys are stringified so the dicts are JSON-serializable.
    book_id_map = {str(book_id): i for i, book_id in enumerate(pivot.index)}
    user_id_map = {str(user_id): i for i, user_id in enumerate(pivot.columns)}

    # Save the lookup tables next to the matrices.
    with open(f'{out_dir_path}/book_id_map.json', 'w') as f:
        json.dump(book_id_map, f)
    with open(f'{out_dir_path}/user_id_map.json', 'w') as f:
        json.dump(user_id_map, f)

    # Per-book summary: mean rating and number of ratings, joined with the
    # book metadata. (`axis=0` is the agg default and was dropped.)
    function = {
        "Book-Rating": "mean",
        "User-ID": "count"
    }

    book_df = pd.read_csv(book_path, delimiter=';', encoding='ISO-8859-1', on_bad_lines='skip')
    summary_rating = data.groupby("ISBN").agg(function)
    summary_rating = summary_rating.rename(columns={"Book-Rating": "Mean-Rating", "User-ID": "Num-Rating"})
    df = book_df.merge(summary_rating, how="left", left_on="ISBN", right_on="ISBN")
    # Image URL columns are not needed downstream.
    df.drop(columns=["Image-URL-S", "Image-URL-M", "Image-URL-L"], inplace=True)
    df.to_csv(f"{out_dir_path}/summary_book.csv", index=False)
|
76 |
+
|
77 |
+
|
78 |
+
# Script entry point: parse CLI arguments and run the preprocessing pipeline.
if __name__ == "__main__":
    main(**parse_args())
|
processed/R.npy
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9e1255c1daea3561d8e326acb7271127549923abed46da5de0e092a8664b227f
|
3 |
+
size 1293760
|
processed/Y.npy
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1701a14103e9b7259e27b06c7eb9e0b71af75078e0eef6b2e4b6c163f281f7ee
|
3 |
+
size 1293760
|
processed/book_id_map.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"034545104X": 0, "0155061224": 1, "0446520802": 2, "052165615X": 3, "0521795028": 4, "2080674722": 5, "3257224281": 6, "0600570967": 7, "038550120X": 8, "342310538": 9, "0425115801": 10, "0449006522": 11, "0553561618": 12, "055356451X": 13, "0786013990": 14, "0786014512": 15, "0060517794": 16, "0451192001": 17, "0609801279": 18, "0671537458": 19, "0679776818": 20, "0943066433": 21, "1570231028": 22, "1885408226": 23, "0747558167": 24, "3442437407": 25, "033390804X": 26, "3596218098": 27, "0684867621": 28, "0451166892": 29, "8440682697": 30, "034544003X": 31, "0380000059": 32, "0380711524": 33, "0451167317": 34, "0451454952": 35, "0843920262": 36, "3404122879": 37, "3404182928": 38, "3404611306": 39, "342662429": 40, "3426690179": 41, "3442424216": 42, "3442425573": 43, "3453092007": 44, "3453157745": 45, "3453176944": 46, "3453185137": 47, "3453185323": 48, "3453213025": 49, "3453877241": 50, "3492226604": 51, "3517017442": 52, "3596125006": 53, "B0000BLD7X": 54, "N3453124715": 55, "9029716894": 56, "9057868059": 57, "0140279091": 58, "0553572369": 59, "0571058086": 60, "3499230933": 61, "3596151465": 62, "0099543818": 63, "3404147723": 64, "3423111321": 65, "3442136644": 66, "3492232000": 67, "8434811634": 68, "8484330478": 69, "8484332039": 70, "2864322102": 71, "8402065945": 72, "8423314901": 73, "842333533X": 74, "8427911769": 75, "8433914456": 76, "8437606322": 77, "8445072919": 78, "8466300821": 79, "847765011X": 80, "8478442588": 81, "8495368099": 82, "0345443683": 83, "043935806X": 84, "055310666X": 85, "0330332775": 86, "0330367358": 87, "0006379702": 88, "3423084049": 89, "3442131340": 90, "3446202102": 91, "3453073398": 92, "3453115783": 93, "3499134004": 94, "349915398X": 95, "3548603203": 96, "3764501383": 97, "3791535889": 98, "0061054143": 99, "0061054151": 100, "0061056774": 101, "0671021346": 102, "0671024108": 103, "1562827898": 104, "9726954835": 105, "0064405176": 106, "0439104769": 107, "0440498058": 108, "8807817144": 109, "8845915913": 110, 
"0395547032": 111, "0440414121": 112, "8879839993": 113, "8401328322": 114, "8401461189": 115, "8401471796": 116, "8423996565": 117, "8424130081": 118, "8426449476": 119, "8426449573": 120, "8478884831": 121, "8478885218": 122, "8478885463": 123, "8478886044": 124, "8495618052": 125, "0091830893": 126, "0586207414": 127, "0812571029": 128, "0671749609": 129, "0140062718": 130, "0140260498": 131, "0060096195": 132, "0141310340": 133, "0142302198": 134, "0156006065": 135, "0375821813": 136, "038076041X": 137, "0439087597": 138, "0439401399": 139, "0552546933": 140, "0689804458": 141, "0699854289": 142, "0786812508": 143, "0786817070": 144, "0805057706": 145, "1551925060": 146, "1573248533": 147, "000651118X": 148, "0385272324": 149, "2038701962": 150, "2070264564": 151, "2070334430": 152, "2070403734": 153, "2080680676": 154, "2232122263": 155, "2253044903": 156, "2253150711": 157, "2266076000": 158, "2277221678": 159, "2290321028": 160, "2842190009": 161, "0440225825": 162, "0316781266": 163, "0345446860": 164, "0671876244": 165, "3125785006": 166, "0380005239": 167, "1593080255": 168, "0330201700": 169, "0385729340": 170, "3809407536": 171, "0446364193": 172, "3257200552": 173, "3379015180": 174, "3404145909": 175, "3404148576": 176, "3404921178": 177, "3423071516": 178, "3423204885": 179, "3423205806": 180, "3426029553": 181, "3426622610": 182, "3426671298": 183, "344215121X": 184, "3442413508": 185, "3442422035": 186, "3442435773": 187, "3442437717": 188, "3442441080": 189, "3442442796": 190, "3442444020": 191, "3442446414": 192, "3442448530": 193, "3442449820": 194, "3453137442": 195, "3453870190": 196, "3455077331": 197, "347354034X": 198, "3492231322": 199, "349912176X": 200, "3499222213": 201, "3499228297": 202, "3499232529": 203, "3499233436": 204, "3499264528": 205, "3499433443": 206, "3506464078": 207, "3548602967": 208, "3551551677": 209, "3551551685": 210, "3551551693": 211, "3551551936": 212, "359621078X": 213, "3608932240": 214, "360893541X": 215, 
"3608935428": 216, "3608935436": 217, "374661922X": 218, "3770131495": 219, "3809024589": 220, "3821815191": 221, "3932069234": 222, "3423100424": 223, "3442096596": 224, "3442440777": 225, "3453009304": 226, "3453042905": 227, "3453061187": 228, "3453071174": 229, "3453127013": 230, "3453211014": 231, "3462026062": 232, "349926028X": 233, "3596122279": 234, "3596287200": 235, "3922524443": 236, "0449217264": 237, "0140621741": 238, "055321358X": 239, "8420457477": 240, "8448034023": 241, "0671034944": 242, "074931012X": 243, "0843946415": 244, "2010173929": 245, "20103389": 246, "2012003494": 247, "2013218826": 248, "2013220162": 249, "207033015X": 250, "2070332985": 251, "2203142278": 252, "2205040561": 253, "221096900X": 254, "2253005274": 255, "2723402983": 256, "2747002748": 257, "2800108584": 258, "2800134259": 259, "2800134267": 260, "2800135522": 261, "2800135565": 262, "2800135719": 263, "2800135727": 264, "2803616998": 265, "2907572458": 266, "8439598459": 267, "014014899X": 268, "0140252517": 269, "0140269967": 270, "0140328742": 271, "0140366830": 272, "0140367446": 273, "0425131378": 274, "0517642689": 275, "0600571165": 276, "077104450X": 277, "0789706032": 278, "0836218833": 279, "0836220889": 280, "088365721X": 281, "1559712252": 282, "1567616089": 283, "2890510328": 284, "3257227264": 285, "0141011904": 286, "342313075X": 287, "3423201509": 288, "3423206616": 289, "344242529": 290, "3446200452": 291, "349223903X": 292, "3499231603": 293, "3704320196": 294, "3886807843": 295, "0345423402": 296, "1569312435": 297, "1892213141": 298, "8530805461": 299, "3257218516": 300, "3404126343": 301, "3404131606": 302, "3404139178": 303, "3404143299": 304, "3404614542": 305, "340645724X": 306, "3426192543": 307, "3426702266": 308, "3442421357": 309, "3442443806": 310, "345309221X": 311, "3453108361": 312, "3453126912": 313, "3453130901": 314, "3453132262": 315, "3596129389": 316, "3596237874": 317, "3596247500": 318, "3809410357": 319, "0345249372": 320, 
"044020562X": 321, "9026935722": 322, "0425167097": 323, "042518109X": 324, "0425188221": 325, "8500010452": 326, "8501023450": 327, "8571648972": 328, "8588615126": 329, "8589885291": 330, "009975181X": 331, "00273755": 332, "014366020444": 333, "0688172377": 334, "8481305464": 335, "8496075850": 336, "8804321008": 337, "8804375299": 338, "8804407808": 339, "8804464895": 340, "8804510579": 341, "8804512652": 342, "8806144146": 343, "8806155873": 344, "8807700735": 345, "8807806746": 346, "8807810751": 347, "880781112X": 348, "8807812495": 349, "8807813157": 350, "8807816059": 351, "8817106100": 352, "8817112917": 353, "8817877028": 354, "8820024381": 355, "883041915X": 356, "8831760122": 357, "8833908488": 358, "8838910987": 359, "8842806978": 360, "8845219747": 361, "8845249689": 362, "8846200624": 363, "88741800047": 364, "8876846565": 365, "8877825200": 366, "8879285513": 367, "8879285645": 368, "8879832905": 369, "8881110288": 370, "888634712X": 371, "8887432252": 372, "8888424121": 373, "0449210197": 374, "8817151068": 375, "0151446474": 376, "8433967606": 377, "8496280012": 378, "899792145": 379, "B158991965": 380, "0064430227": 381, "0671723650": 382, "0812533550": 383, "8806162160": 384, "884590184X": 385, "0446613843": 386, "0140270272": 387, "0440241537": 388, "0099460343": 389, "0375901582": 390, "0439317746": 391, "0440228840": 392, "0738205737": 393, "1566911605": 394, "0060542845": 395, "0449148831": 396, "1551666308": 397, "8420430943": 398, "8420636282": 399, "8432205311": 400, "000225669X": 401, "0099549611": 402, "0701162767": 403, "1852422580": 404, "042513976X": 405, "0441008291": 406, "0006511929": 407, "002542730X": 408, "0060520507": 409, "0060930934": 410, "0060951303": 411, "0099414732": 412, "0140154078": 413, "0140327592": 414, "0140367616": 415, "0141181222": 416, "0151010633": 417, "0192834312": 418, "0194216748": 419, "0240514866": 420, "0316666343": 421, "0345391810": 422, "0375400699": 423, "0385504209": 424, "043527242X": 425, 
"0439284031": 426, "0439286182": 427, "0439286239": 428, "0449221148": 429, "0451527747": 430, "0486282112": 431, "0486424499": 432, "0553275283": 433, "0582530431": 434, "0590502123": 435, "0595132189": 436, "0613329740": 437, "063403541X": 438, "067172939X": 439, "0671729438": 440, "0671746502": 441, "0679721851": 442, "0679745580": 443, "0691000980": 444, "0747545111": 445, "0749931434": 446, "0754000117": 447, "0804106304": 448, "0812583566": 449, "0843128240": 450, "0879517344": 451, "0971880107": 452, "1400001625": 453, "1400002672": 454, "1853262404": 455, "2061007074": 456, "2266095536": 457, "2290309494": 458, "2742739351": 459, "3522149904": 460, "3775713328": 461, "8401009421": 462, "8401327199": 463, "8401462231": 464, "840149236X": 465, "8408011200": 466, "8420427462": 467, "8420432113": 468, "8420444367": 469, "8420789895": 470, "8422655500": 471, "8422657104": 472, "842265783X": 473, "8422693445": 474, "8423325105": 475, "8423662152": 476, "8423951537": 477, "8423970647": 478, "8423976645": 479, "8426109799": 480, "8427007450": 481, "8429712372": 482, "8429714936": 483, "842975295": 484, "8429753419": 485, "8432227706": 486, "8434830809": 487, "8434840391": 488, "8440627203": 489, "8440696833": 490, "8445071572": 491, "8447306194": 492, "8447312054": 493, "8449416078": 494, "8472453723": 495, "8473068971": 496, "8474104823": 497, "8474263123": 498, "8475071163": 499, "8475961290": 500, "8476722338": 501, "8478809783": 502, "8482180088": 503, "8483221306": 504, "8489163499": 505, "8495501090": 506, "8495501198": 507, "8496077152": 508, "8496077209": 509, "8496246140": 510, "8497931467": 511, "950491036X": 512, "9508521481": 513, "9871138016": 514, "0460010239": 515, "0749336145": 516, "1899344705": 517, "0439135494": 518, "059030271X": 519, "0590453661": 520, "0590470108": 521, "0590483404": 522, "0671021354": 523, "0671026283": 524, "0671027506": 525, "0671727109": 526, "0060964049": 527, "0380807343": 528, "0439064864": 529, "2.02.032126.2": 530, 
"2.264.03602.8": 531, "2020058863": 532, "2020062399": 533, "2020101653": 534, "202011528X": 535, "20202006935": 536, "20203119888": 537, "2020386666": 538, "2020564777": 539, "2020591944": 540, "2070365832": 541, "2070378411": 542, "2070386023": 543, "2070394956": 544, "2070404587": 545, "2070404706": 546, "2070404722": 547, "2070406962": 548, "2070408450": 549, "2070425770": 550, "2070725804": 551, "2080680692": 552, "2226070109": 553, "2226126570": 554, "2226135022": 555, "2253030570": 556, "2253049417": 557, "2253050407": 558, "2253055972": 559, "225307659X": 560, "2253171670": 561, "2253172367": 562, "2264010991": 563, "2264013257": 564, "2264018194": 565, "2264024674": 566, "226402593X": 567, "2264027568": 568, "2264029463": 569, "226403114": 570, "2264031158": 571, "2264033282": 572, "226403601X": 573, "2264036036": 574, "2266040820": 575, "2266047280": 576, "226604960": 577, "22660861003": 578, "2266096451": 579, "2266102028": 580, "2266102621": 581, "2266104535": 582, "2266105698": 583, "2266121367": 584, "2277302228": 585, "2290303488": 586, "2290308285": 587, "2290312924": 588, "2290315524": 589, "2290321559": 590, "2702424131": 591, "2742724028": 592, "2742724613": 593, "2742729038": 594, "2743602295": 595, "2841469824": 596, "2842611462": 597, "2868696627": 598, "2869304129": 599, "2869304560": 600, "2869304870": 601, "2869305583": 602, "2895400644": 603, "2907572121": 604, "290757213X": 605, "1566190096": 606, "0749317256": 607, "3453150538": 608, "3492224628": 609, "3492224768": 610, "3596147700": 611, "0060011939": 612, "0099283697": 613, "0140187758": 614, "0316154601": 615, "059035342X": 616, "0713628944": 617, "0752844040": 618, "1853260665": 619, "2070414256": 620, "2070419657": 621, "2253063339": 622, "2253137243": 623, "2253144452": 624, "2253152072": 625, "2264027134": 626, "226612269X": 627, "2290318329": 628, "2702400612": 629, "2702401694": 630, "2878580753": 631, "8408040383": 632, "8423310353": 633, "844140321X": 634, "8484601072": 635, 
"0330373269": 636, "8882461327": 637, "349202436X": 638, "8807813858": 639, "0140182551": 640, "0446310786": 641, "0886776783": 642, "3257008155": 643, "329300301X": 644, "3419528078": 645, "3423118709": 646, "3440054594": 647, "3442723078": 648, "3453034120": 649, "3480204015": 650, "3492238882": 651, "3499121808": 652, "3499153629": 653, "3502513333": 654, "3502517371": 655, "3502551685": 656, "3596119502": 657, "3596221234": 658, "3596850185": 659, "3772402542": 660, "3772420176": 661, "3794170180": 662, "3806852456": 663, "3808572612": 664, "3821812028": 665, "3922708072": 666, "7321578936": 667, "0312966970": 668, "680ISBN359623": 669, "0340818182": 670, "0061096261": 671, "0312169817": 672, "0312864590": 673, "0345348664": 674, "0345348672": 675, "0345354621": 676, "0345385764": 677, "0345435036": 678, "0385264356": 679, "0425104273": 680, "0425172546": 681, "0440218667": 682, "0440220602": 683, "0446522856": 684, "0451180054": 685, "0451198514": 686, "0515131520": 687, "0517093715": 688, "0553266306": 689, "0765342987": 690, "0812550153": 691, "0812550307": 692, "0843949163": 693, "0886773741": 694, "0886774802": 695, "0886774829": 696, "0886775426": 697, "0886775957": 698, "0886777178": 699, "1572971835": 700, "9722900684": 701, "3499221489": 702, "2422614189": 703, "0345425596": 704, "0590259970": 705, "0590260251": 706, "0590417827": 707, "0590426702": 708, "0590436422": 709, "0590436449": 710, "0590436457": 711, "0590436465": 712, "0590436481": 713, "059043649X": 714, "0590436503": 715, "0590442589": 716, "0590442988": 717, "0590442996": 718, "0590448234": 719, "0590448595": 720, "0590456458": 721, "0590456504": 722, "0590470485": 723, "0590483056": 724, "0060930187": 725, "0375760911": 726, "0689817851": 727, "0874869870": 728, "0061081450": 729, "0061083259": 730, "0061087017": 731, "0061094404": 732, "0312925689": 733, "0440295653": 734, "0671793489": 735, "0684845768": 736, "0701169176": 737, "0312244266": 738, "880701601X": 739, "8817134899": 740, 
"0099268345": 741, "3100970616": 742, "3257062354": 743, "3257228317": 744, "3423128879": 745, "3423202327": 746, "342677609X": 747, "3442054753": 748, "3442727073": 749, "3453132041": 750, "3478387507": 751, "3492045170": 752, "3492230814": 753, "3499101505": 754, "3545202461": 755, "3548359698": 756, "3550075359": 757, "3596154766": 758, "3596214629": 759, "0671011367": 760, "0618045996": 761, "3822858617": 762, "0864425589": 763, "8817860751": 764, "0452282101": 765, "0671025368": 766, "3257060580": 767, "0553581112": 768, "0805047379": 769, "0892964456": 770, "0373250223": 771, "0451097009": 772, "0451179994": 773, "9681500830": 774, "8432087653": 775, "0553140779": 776, "0425182150": 777, "1883473004": 778, "0061007129": 779, "0061000027": 780, "0812511816": 781, "0833531654": 782, "0880381736": 783, "0880381744": 784, "0722536283": 785, "0060505885": 786, "0061097101": 787, "0299164942": 788, "0312283709": 789, "0312983271": 790, "0380731851": 791, "0446605484": 792, "0446611212": 793, "0451188454": 794, "0451207955": 795, "0609804138": 796, "0671003755": 797, "067104754X": 798, "0743407067": 799, "074343627X": 800, "0786013230": 801, "0812509560": 802, "3426615355": 803, "3442435838": 804, "3442455707": 805, "0075536498": 806, "0099287692": 807, "0099845008": 808, "0330262130": 809, "0385720920": 810, "0393319296": 811, "0553262505": 812, "06514251": 813, "0805062971": 814, "3257228007": 815, "3442430496": 816, "3442446325": 817, "3453171500": 818, "3464371506": 819, "9513098648": 820, "0590108395": 821, "3442451353": 822, "8425330866": 823, "8481301213": 824, "0460905589": 825, "0816704627": 826, "1573229571": 827, "0060595183": 828, "9782922145441": 829, "2830207904": 830, "3499263998": 831, "0434009407": 832, "1841193887": 833, "0375700668": 834, "457871971": 835, "840149768X": 836, "8401499917": 837, "8402007287": 838, "8420600369": 839, "8420603066": 840, "8423918335": 841, "8426105084": 842, "8426429807": 843, "843223138X": 844, "8437608570": 845, 
"8440630921": 846, "8440630922": 847, "8470394126": 848, "8471662531": 849, "8472230082": 850, "8474541913": 851, "8478091351": 852, "3426193310": 853, "0064404773": 854, "0064407667": 855, "0552545228": 856, "006054094X": 857, "0375706038": 858, "081297106X": 859, "0843951826": 860, "1400032628": 861, "1585861553": 862, "0066210151": 863, "0385334141": 864, "0425191184": 865, "0671027343": 866, "0751503894": 867, "3714500799": 868, "0385503822": 869, "3548208975": 870, "0399138684": 871, "0425189864": 872, "0440236053": 873, "0440241073": 874, "0553586122": 875, "0099935708": 876, "0140118608": 877, "0552998249": 878, "3518408127": 879, "1586609726": 880, "0865472807": 881, "0312960344": 882, "055358068X": 883, "0792270142": 884, "0688174590": 885, "031286504X": 886, "0345342968": 887, "0375756981": 888, "0553382411": 889, "0671027662": 890, "0767903382": 891, "3785527195": 892, "1558744592": 893, "0060002484": 894, "0060094117": 895, "0312253397": 896, "0312331754": 897, "0312874243": 898, "0312979517": 899, "0316152196": 900, "0316154059": 901, "0316287555": 902, "034541389X": 903, "0345450175": 904, "0345452550": 905, "0375727981": 906, "0380975017": 907, "0385305389": 908, "0393045390": 909, "0394543289": 910, "0399135804": 911, "0399138188": 912, "0399141340": 913, "0399146466": 914, "0399146504": 915, "0399146687": 916, "0399147101": 917, "0399147144": 918, "0399147322": 919, "0399147624": 920, "0399148337": 921, "0399148450": 922, "0399148639": 923, "0399148728": 924, "0399149783": 925, "0399150811": 926, "0399150870": 927, "0399151451": 928, "0399151478": 929, "039915177X": 930, "0399151885": 931, "0425183181": 932, "0440111323": 933, "0440122147": 934, "0440221463": 935, "0441005470": 936, "0446519480": 937, "0449221512": 938, "055358295X": 939, "0670894184": 940, "0671024094": 941, "0679450408": 942, "0684801663": 943, "0684846608": 944, "0684871726": 945, "0743201604": 946, "074320607X": 947, "0743407377": 948, "0743486226": 949, "0804109990": 950, 
"0812571118": 951, "0969691319": 952, "8475251471": 953, "0380724987": 954, "0380726246": 955, "0380816059": 956, "0399139419": 957, "0425175405": 958, "0425182932": 959, "0440225701": 960, "0446602485": 961, "0446603406": 962, "0451091949": 963, "0553095439": 964, "0553227041": 965, "0553263226": 966, "0553295098": 967, "0553564994": 968, "067102423X": 969, "0671032658": 970, "0671653849": 971, "0722509049": 972, "0812500067": 973, "0812516001": 974, "0812568710": 975, "0843921609": 976, "0890875588": 977, "1557730091": 978, "1558172882": 979, "0425178102": 980, "0446609404": 981, "0446610038": 982, "0451178017": 983, "0553211056": 984, "0553285920": 985}
|
processed/summary_book.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
processed/user_id_map.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"276725": 0, "276726": 1, "276727": 2, "276729": 3, "276733": 4, "276736": 5, "276737": 6, "276744": 7, "276745": 8, "276746": 9, "276747": 10, "276748": 11, "276751": 12, "276754": 13, "276755": 14, "276760": 15, "276762": 16, "276765": 17, "276768": 18, "276772": 19, "276774": 20, "276780": 21, "276786": 22, "276788": 23, "276796": 24, "276798": 25, "276800": 26, "276803": 27, "276804": 28, "276806": 29, "276808": 30, "276811": 31, "276812": 32, "276813": 33, "276814": 34, "276817": 35, "276820": 36, "276822": 37, "276827": 38, "276828": 39, "276830": 40, "276832": 41, "276833": 42, "276835": 43, "276837": 44, "276838": 45, "276840": 46, "276842": 47, "276847": 48, "276848": 49, "276850": 50, "276852": 51, "276853": 52, "276854": 53, "276856": 54, "276857": 55, "276859": 56, "276861": 57, "276862": 58, "276863": 59, "276866": 60, "276869": 61, "276870": 62, "276872": 63, "276873": 64, "276875": 65, "276878": 66, "276879": 67, "276884": 68, "276887": 69, "276888": 70, "276889": 71, "276890": 72, "276896": 73, "276904": 74, "276905": 75, "276911": 76, "276912": 77, "276915": 78, "276916": 79, "276925": 80, "276927": 81, "276928": 82, "276929": 83, "276934": 84, "276936": 85, "276939": 86, "276943": 87, "276946": 88, "276949": 89, "276950": 90, "276953": 91, "276954": 92, "276957": 93, "276959": 94, "276963": 95, "276964": 96, "276965": 97, "276975": 98, "276981": 99, "276984": 100, "276986": 101, "276988": 102, "276989": 103, "276990": 104, "276992": 105, "276994": 106, "276997": 107, "276998": 108, "277002": 109, "277007": 110, "277009": 111, "277010": 112, "277012": 113, "277018": 114, "277019": 115, "277022": 116, "277023": 117, "277028": 118, "277031": 119, "277032": 120, "277035": 121, "277036": 122, "277040": 123, "277042": 124, "277048": 125, "277051": 126, "277052": 127, "277053": 128, "277056": 129, "277058": 130, "277064": 131, "277065": 132, "277072": 133, "277073": 134, "277074": 135, "277075": 136, "277079": 137, "277085": 138, "277087": 139, 
"277090": 140, "277094": 141, "277096": 142, "277102": 143, "277107": 144, "277109": 145, "277114": 146, "277116": 147, "277123": 148, "277124": 149, "277128": 150, "277129": 151, "277134": 152, "277135": 153, "277139": 154, "277142": 155, "277143": 156, "277149": 157, "277155": 158, "277157": 159, "277159": 160, "277165": 161, "277168": 162, "277170": 163}
|
recommend.py
ADDED
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
import numpy as np
|
3 |
+
import pandas as pd
|
4 |
+
|
5 |
+
from jsonargparse import ArgumentParser
|
6 |
+
|
7 |
+
|
8 |
+
def parse_args():
    """Parse command-line arguments for the recommender CLI.

    Returns:
        dict: argument name -> value, suitable for ``main(**parse_args())``.

    Note: the previous ``required=True`` flags were dropped — ``required``
    contradicts (and silently disables) the declared defaults, forcing the
    user to retype every default value on the command line.
    """
    parser = ArgumentParser()
    parser.add_argument("--data_dir", type=str, default="./processed")
    parser.add_argument("--weight_dir", type=str, default="./weight")
    parser.add_argument("--info_path", type=str, default="./processed/summary_book.csv")
    parser.add_argument("--user_id", default="276729")
    parser.add_argument("--num", type=int, default=10)
    parser.add_argument("--lb", type=int, default=0)

    return vars(parser.parse_args())
|
19 |
+
|
20 |
+
def main(
    data_dir,
    weight_dir,
    info_path,
    user_id,
    num,
    lb,
    **kwargs
):
    """Print (and return) the top-``num`` book recommendations for a user.

    Args:
        data_dir (str): directory with the user/book id-map JSON files.
        weight_dir (str): directory containing ``predicted.npy`` from train.py.
        info_path (str): path to the per-book summary CSV.
        user_id (str | int): id of the user to recommend for.
        num (int): number of books to recommend.
        lb (int): only consider books with more than ``lb`` ratings.
        **kwargs: ignored extra CLI arguments.

    Returns:
        pandas.DataFrame: recommended books, highest prediction first, with a
        human-readable ``context`` column.
    """
    # Predicted rating matrix (books x users) produced by train.py.
    # (The R matrix was previously loaded here too, but never used — removed.)
    prediction = np.load(f'{weight_dir}/predicted.npy', allow_pickle=True)
    # Id -> matrix-index lookup tables.
    with open(f'{data_dir}/user_id_map.json', 'r') as file:
        user2id = json.load(file)
    with open(f'{data_dir}/book_id_map.json', 'r') as file:
        book2id = json.load(file)

    # Predictions for this user, keyed by ISBN.
    user_idx = user2id[str(user_id)]
    predict = prediction[:, user_idx]
    predict_dict = {book: np.round(predict[idx], 2) for book, idx in book2id.items()}

    # Keep only books with enough ratings. .copy() avoids pandas'
    # SettingWithCopyWarning when assigning the new columns below.
    book_df = pd.read_csv(info_path)
    book_df = book_df[book_df["Num-Rating"] > lb].copy()
    book_df['predict'] = book_df["ISBN"].map(predict_dict)
    recommendations = book_df.nlargest(num, "predict").reset_index(drop=True)
    recommendations["context"] = recommendations.apply(
        lambda book: f"{book['Book-Title']} ({book['Year-Of-Publication']}) - by {book['Book-Author']}", axis=1
    )
    print(recommendations)
    return recommendations
|
52 |
+
|
53 |
+
|
54 |
+
# Script entry point: parse CLI arguments and print recommendations.
if __name__ == "__main__":
    main(**parse_args())
|
requirements.txt
ADDED
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
altair==4.2.2
|
2 |
+
attrs==23.1.0
|
3 |
+
backports.zoneinfo==0.2.1
|
4 |
+
blinker==1.6.2
|
5 |
+
cachetools==5.3.0
|
6 |
+
certifi==2023.5.7
|
7 |
+
charset-normalizer==3.1.0
|
8 |
+
click==8.1.3
|
9 |
+
decorator==5.1.1
|
10 |
+
entrypoints==0.4
|
11 |
+
gitdb==4.0.10
|
12 |
+
GitPython==3.1.31
|
13 |
+
idna==3.4
|
14 |
+
importlib-metadata==6.6.0
|
15 |
+
importlib-resources==5.12.0
|
16 |
+
Jinja2==3.1.2
|
17 |
+
jsonargparse==4.21.1
|
18 |
+
jsonschema==4.17.3
|
19 |
+
markdown-it-py==2.2.0
|
20 |
+
MarkupSafe==2.1.2
|
21 |
+
mdurl==0.1.2
|
22 |
+
numpy==1.24.3
|
23 |
+
packaging==23.1
|
24 |
+
pandas==2.0.1
|
25 |
+
Pillow==9.5.0
|
26 |
+
pkgutil-resolve-name==1.3.10
|
27 |
+
protobuf==3.20.3
|
28 |
+
pyarrow==12.0.0
|
29 |
+
pydeck==0.8.1b0
|
30 |
+
Pygments==2.15.1
|
31 |
+
Pympler==1.0.1
|
32 |
+
pyrsistent==0.19.3
|
33 |
+
python-dateutil==2.8.2
|
34 |
+
pytz==2023.3
|
35 |
+
PyYAML==6.0
|
36 |
+
requests==2.31.0
|
37 |
+
rich==13.3.5
|
38 |
+
six==1.16.0
|
39 |
+
smmap==5.0.0
|
40 |
+
streamlit==1.22.0
|
41 |
+
tenacity==8.2.2
|
42 |
+
toml==0.10.2
|
43 |
+
toolz==0.12.0
|
44 |
+
tornado==6.3.2
|
45 |
+
typing-extensions==4.6.2
|
46 |
+
tzdata==2023.3
|
47 |
+
tzlocal==5.0.1
|
48 |
+
urllib3==2.0.2
|
49 |
+
validators==0.20.0
|
50 |
+
watchdog==3.0.0
|
51 |
+
zipp==3.15.0
|
train.py
ADDED
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np
|
2 |
+
import tensorflow as tf
|
3 |
+
|
4 |
+
from tensorflow import keras
|
5 |
+
from pathlib import Path
|
6 |
+
from jsonargparse import ArgumentParser
|
7 |
+
from utils_c import normalize, cost_function
|
8 |
+
|
9 |
+
|
10 |
+
def parse_args():
    """Parse command-line arguments for training.

    Returns:
        dict: argument name -> value, suitable for ``main(**parse_args())``.

    Note: the previous ``required=True`` flags were dropped — ``required``
    contradicts (and silently disables) the declared defaults, forcing the
    user to retype every default value on the command line.
    """
    parser = ArgumentParser()
    parser.add_argument("--data_dir", type=str, default="./processed")
    parser.add_argument("--out_dir", type=str, default="./weight")
    parser.add_argument("--num_features", type=int, default=10)
    parser.add_argument("--num_iterators", type=int, default=200)
    parser.add_argument("--learning_rate", type=float, default=1e-1)
    parser.add_argument("--lambda_", type=float, default=2.0)
    parser.add_argument("--seed", type=int, default=1234)
    parser.add_argument("--freq", type=int, default=20)

    return vars(parser.parse_args())
|
23 |
+
|
24 |
+
def main(
    data_dir,
    out_dir,
    num_features,
    num_iterators,
    learning_rate,
    lambda_,
    seed,
    freq
):
    """Train the collaborative-filtering model with Adam and save the weights.

    Loads the preprocessed Y/R matrices, mean-normalizes Y, learns item
    features X, user weights W and user biases b by minimizing the
    regularized squared-error cost, then saves the parameters and the
    de-normalized prediction matrix to ``out_dir``.

    Args:
        data_dir (str): directory containing Y.npy and R.npy.
        out_dir (str): directory where weights/predictions are written.
        num_features (int): latent feature dimension.
        num_iterators (int): number of gradient-descent iterations.
        learning_rate (float): Adam learning rate.
        lambda_ (float): L2 regularization strength.
        seed (int): TensorFlow RNG seed for reproducible initialization.
        freq (int): log the training loss every ``freq`` iterations.
    """
    # Load the rated-indicator matrix R and the rating matrix Y.
    R = np.load(f'{data_dir}/R.npy', allow_pickle=True)
    Y = np.load(f'{data_dir}/Y.npy', allow_pickle=True)
    # Mean-center each book's observed ratings.
    Y_norm, Y_mean = normalize(Y, R)

    num_books, num_users = Y.shape
    tf.random.set_seed(seed)  # for reproducible initialization

    # Parameters are tf.Variables so GradientTape can track them.
    W = tf.Variable(tf.random.normal((num_users, num_features), dtype=tf.float64), name='W')
    X = tf.Variable(tf.random.normal((num_books, num_features), dtype=tf.float64), name='X')
    b = tf.Variable(tf.random.normal((1, num_users), dtype=tf.float64), name='b')

    # Instantiate an optimizer.
    optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
    # `step` rather than `iter`, which shadows the builtin.
    for step in range(num_iterators):
        # Record the operations used to compute the cost so TensorFlow can
        # differentiate through them.
        with tf.GradientTape() as tape:
            # Forward pass is included in the cost computation.
            cost_value = cost_function(X, W, b, Y_norm, R, lambda_)

        # Gradients of the cost w.r.t. the trainable variables.
        grads = tape.gradient(cost_value, [X, W, b])

        # One Adam update step.
        optimizer.apply_gradients(zip(grads, [X, W, b]))

        # Log periodically.
        if step % freq == 0:
            print(f"Training loss at iteration {step}: {cost_value:0.1f}")

    # Undo the mean-normalization to get predictions on the original scale.
    predict = np.matmul(X.numpy(), np.transpose(W.numpy())) + b.numpy()
    predict = predict + Y_mean

    # Create the output directory if needed. mkdir(exist_ok=True) still raises
    # if the path exists but is not a directory — clearer than the previous
    # assert, which is stripped under `python -O`.
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    # Save plain ndarrays (explicit .numpy()) rather than tf.Variables.
    np.save(f'{out_dir}/W.npy', W.numpy())
    np.save(f'{out_dir}/X.npy', X.numpy())
    np.save(f'{out_dir}/b.npy', b.numpy())
    np.save(f'{out_dir}/predicted.npy', predict)
|
83 |
+
|
84 |
+
|
85 |
+
# Script entry point: parse CLI arguments and run training.
if __name__ == "__main__":
    main(**parse_args())
|
utils_c.py
ADDED
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np
|
2 |
+
import tensorflow as tf
|
3 |
+
|
4 |
+
|
5 |
+
def normalize(Y, R):
    """Mean-center each book's observed ratings.

    Only entries with R(i, j) = 1 count toward a book's mean; after
    centering, an unrated entry's value of 0 corresponds to that book's
    mean rating.

    Args:
        Y (ndarray (num_books, num_users)): rating matrix.
        R (ndarray (num_books, num_users)): indicator of real ratings.

    Returns:
        tuple: ``(Y_norm, Y_mean)`` — the centered matrix and the
        (num_books, 1) column of per-book mean ratings.
    """
    rated_sum = np.sum(Y * R, axis=1, keepdims=True)
    # Epsilon keeps the division defined for books with no ratings at all.
    rated_count = np.sum(R, axis=1, keepdims=True) + 1e-12
    Y_mean = rated_sum / rated_count
    # Subtract the mean only where a real rating exists.
    Y_norm = Y - Y_mean * R
    return Y_norm, Y_mean
|
19 |
+
|
20 |
+
def cost_function(X, W, b, Y, R, lambda_):
    """Collaborative-filtering cost, vectorized with TensorFlow ops.

    Using tf operations keeps the function compatible with the custom
    GradientTape training loop.

    Args:
        X (ndarray (num_books, num_features)): matrix of item features
        W (ndarray (num_users, num_features)): matrix of user parameters
        b (ndarray (1, num_users)): vector of user biases
        Y (ndarray (num_books, num_users)): matrix of user ratings of books
        R (ndarray (num_books, num_users)): R(i, j) = 1 if the i-th book was rated by the j-th user
        lambda_ (float): regularization parameter (biases are not regularized)

    Returns:
        J (float): regularized squared-error cost over the observed ratings
    """
    # Prediction error, masked to rated entries only.
    errors = (tf.matmul(X, W, transpose_b=True) + b - Y) * R
    data_term = 0.5 * tf.reduce_sum(tf.square(errors))
    reg_term = (lambda_ / 2) * (tf.reduce_sum(tf.square(X)) + tf.reduce_sum(tf.square(W)))
    return data_term + reg_term
|
weight/W.npy
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8df3995b7b6243c4b68b3cecabb10414d982c1cf1baf4533e6c8b8fadd3dc751
|
3 |
+
size 13248
|
weight/X.npy
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d9f0e114c605d0b60edbf581ae35380ec3f1a1271f280d3f17c415a52828358f
|
3 |
+
size 79008
|
weight/b.npy
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d69013d3a83302293d5e37e4fbbb2dd026297f6499cd71d9501fc91adc0d817f
|
3 |
+
size 1440
|
weight/predicted.npy
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8d02e08da558c5ad31c1eb64d15a6227b570ef73c4d0597d5ab49a5aa7f0310f
|
3 |
+
size 1293760
|