{ "cells": [ { "cell_type": "code", "execution_count": 4, "id": "0b2455be", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "from sklearn.feature_extraction.text import TfidfVectorizer" ] }, { "cell_type": "code", "execution_count": 5, "id": "1a400539", "metadata": {}, "outputs": [], "source": [ "df = pd.read_csv(\"C:\\\\Users\\\\Immortal\\\\OneDrive\\\\Desktop\\\\aiml\\\\Datasets\\\\movies.csv\")" ] }, { "cell_type": "code", "execution_count": 6, "id": "ee8e5c57", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
indexbudgetgenreshomepageidkeywordsoriginal_languageoriginal_titleoverviewpopularity...runtimespoken_languagesstatustaglinetitlevote_averagevote_countcastcrewdirector
00237000000Action Adventure Fantasy Science Fictionhttp://www.avatarmovie.com/19995culture clash future space war space colony so...enAvatarIn the 22nd century, a paraplegic Marine is di...150.437577...162.0[{\"iso_639_1\": \"en\", \"name\": \"English\"}, {\"iso...ReleasedEnter the World of Pandora.Avatar7.211800Sam Worthington Zoe Saldana Sigourney Weaver S...[{'name': 'Stephen E. Rivkin', 'gender': 0, 'd...James Cameron
11300000000Adventure Fantasy Actionhttp://disney.go.com/disneypictures/pirates/285ocean drug abuse exotic island east india trad...enPirates of the Caribbean: At World's EndCaptain Barbossa, long believed to be dead, ha...139.082615...169.0[{\"iso_639_1\": \"en\", \"name\": \"English\"}]ReleasedAt the end of the world, the adventure begins.Pirates of the Caribbean: At World's End6.94500Johnny Depp Orlando Bloom Keira Knightley Stel...[{'name': 'Dariusz Wolski', 'gender': 2, 'depa...Gore Verbinski
22245000000Action Adventure Crimehttp://www.sonypictures.com/movies/spectre/206647spy based on novel secret agent sequel mi6enSpectreA cryptic message from Bond’s past sends him o...107.376788...148.0[{\"iso_639_1\": \"fr\", \"name\": \"Fran\\u00e7ais\"},...ReleasedA Plan No One EscapesSpectre6.34466Daniel Craig Christoph Waltz L\\u00e9a Seydoux ...[{'name': 'Thomas Newman', 'gender': 2, 'depar...Sam Mendes
33250000000Action Crime Drama Thrillerhttp://www.thedarkknightrises.com/49026dc comics crime fighter terrorist secret ident...enThe Dark Knight RisesFollowing the death of District Attorney Harve...112.312950...165.0[{\"iso_639_1\": \"en\", \"name\": \"English\"}]ReleasedThe Legend EndsThe Dark Knight Rises7.69106Christian Bale Michael Caine Gary Oldman Anne ...[{'name': 'Hans Zimmer', 'gender': 2, 'departm...Christopher Nolan
44260000000Action Adventure Science Fictionhttp://movies.disney.com/john-carter49529based on novel mars medallion space travel pri...enJohn CarterJohn Carter is a war-weary, former military ca...43.926995...132.0[{\"iso_639_1\": \"en\", \"name\": \"English\"}]ReleasedLost in our world, found in another.John Carter6.12124Taylor Kitsch Lynn Collins Samantha Morton Wil...[{'name': 'Andrew Stanton', 'gender': 2, 'depa...Andrew Stanton
\n", "

5 rows × 24 columns

\n", "
" ], "text/plain": [ " index budget genres \\\n", "0 0 237000000 Action Adventure Fantasy Science Fiction \n", "1 1 300000000 Adventure Fantasy Action \n", "2 2 245000000 Action Adventure Crime \n", "3 3 250000000 Action Crime Drama Thriller \n", "4 4 260000000 Action Adventure Science Fiction \n", "\n", " homepage id \\\n", "0 http://www.avatarmovie.com/ 19995 \n", "1 http://disney.go.com/disneypictures/pirates/ 285 \n", "2 http://www.sonypictures.com/movies/spectre/ 206647 \n", "3 http://www.thedarkknightrises.com/ 49026 \n", "4 http://movies.disney.com/john-carter 49529 \n", "\n", " keywords original_language \\\n", "0 culture clash future space war space colony so... en \n", "1 ocean drug abuse exotic island east india trad... en \n", "2 spy based on novel secret agent sequel mi6 en \n", "3 dc comics crime fighter terrorist secret ident... en \n", "4 based on novel mars medallion space travel pri... en \n", "\n", " original_title \\\n", "0 Avatar \n", "1 Pirates of the Caribbean: At World's End \n", "2 Spectre \n", "3 The Dark Knight Rises \n", "4 John Carter \n", "\n", " overview popularity ... runtime \\\n", "0 In the 22nd century, a paraplegic Marine is di... 150.437577 ... 162.0 \n", "1 Captain Barbossa, long believed to be dead, ha... 139.082615 ... 169.0 \n", "2 A cryptic message from Bond’s past sends him o... 107.376788 ... 148.0 \n", "3 Following the death of District Attorney Harve... 112.312950 ... 165.0 \n", "4 John Carter is a war-weary, former military ca... 43.926995 ... 132.0 \n", "\n", " spoken_languages status \\\n", "0 [{\"iso_639_1\": \"en\", \"name\": \"English\"}, {\"iso... Released \n", "1 [{\"iso_639_1\": \"en\", \"name\": \"English\"}] Released \n", "2 [{\"iso_639_1\": \"fr\", \"name\": \"Fran\\u00e7ais\"},... Released \n", "3 [{\"iso_639_1\": \"en\", \"name\": \"English\"}] Released \n", "4 [{\"iso_639_1\": \"en\", \"name\": \"English\"}] Released \n", "\n", " tagline \\\n", "0 Enter the World of Pandora. \n", "1 At the end of the world, the adventure begins. \n", "2 A Plan No One Escapes \n", "3 The Legend Ends \n", "4 Lost in our world, found in another. \n", "\n", " title vote_average vote_count \\\n", "0 Avatar 7.2 11800 \n", "1 Pirates of the Caribbean: At World's End 6.9 4500 \n", "2 Spectre 6.3 4466 \n", "3 The Dark Knight Rises 7.6 9106 \n", "4 John Carter 6.1 2124 \n", "\n", " cast \\\n", "0 Sam Worthington Zoe Saldana Sigourney Weaver S... \n", "1 Johnny Depp Orlando Bloom Keira Knightley Stel... \n", "2 Daniel Craig Christoph Waltz L\\u00e9a Seydoux ... \n", "3 Christian Bale Michael Caine Gary Oldman Anne ... \n", "4 Taylor Kitsch Lynn Collins Samantha Morton Wil... \n", "\n", " crew director \n", "0 [{'name': 'Stephen E. Rivkin', 'gender': 0, 'd... James Cameron \n", "1 [{'name': 'Dariusz Wolski', 'gender': 2, 'depa... Gore Verbinski \n", "2 [{'name': 'Thomas Newman', 'gender': 2, 'depar... Sam Mendes \n", "3 [{'name': 'Hans Zimmer', 'gender': 2, 'departm... Christopher Nolan \n", "4 [{'name': 'Andrew Stanton', 'gender': 2, 'depa... Andrew Stanton \n", "\n", "[5 rows x 24 columns]" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": 7, "id": "eccde832", "metadata": {}, "outputs": [], "source": [ "features = [\"keywords\", \"cast\", \"genres\", \"director\"]" ] }, { "cell_type": "code", "execution_count": 8, "id": "2be37c53", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 Avatar\n", "1 Pirates of the Caribbean: At World's End\n", "2 Spectre\n", "3 The Dark Knight Rises\n", "4 John Carter\n", " ... \n", "4798 El Mariachi\n", "4799 Newlyweds\n", "4800 Signed, Sealed, Delivered\n", "4801 Shanghai Calling\n", "4802 My Date with Drew\n", "Name: title, Length: 4803, dtype: object" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.title" ] }, { "cell_type": "code", "execution_count": null, "id": "03ce1dbc", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 9, "id": "2d9f793a", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "index 0\n", "budget 0\n", "genres 28\n", "homepage 3091\n", "id 0\n", "keywords 412\n", "original_language 0\n", "original_title 0\n", "overview 3\n", "popularity 0\n", "production_companies 0\n", "production_countries 0\n", "release_date 1\n", "revenue 0\n", "runtime 2\n", "spoken_languages 0\n", "status 0\n", "tagline 844\n", "title 0\n", "vote_average 0\n", "vote_count 0\n", "cast 43\n", "crew 0\n", "director 30\n", "dtype: int64" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.isna().sum()" ] }, { "cell_type": "code", "execution_count": 10, "id": "08029ce3", "metadata": {}, "outputs": [], "source": [ "for feature in features:\n", " df[feature] = df[feature].fillna('')" ] }, { "cell_type": "code", "execution_count": 11, "id": "2a186746", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "index 0\n", "budget 0\n", "genres 0\n", "homepage 3091\n", "id 0\n", "keywords 0\n", "original_language 0\n", "original_title 0\n", "overview 3\n", "popularity 0\n", "production_companies 0\n", "production_countries 0\n", "release_date 1\n", "revenue 0\n", "runtime 2\n", "spoken_languages 0\n", "status 0\n", "tagline 844\n", "title 0\n", "vote_average 0\n", "vote_count 0\n", "cast 0\n", "crew 0\n", "director 0\n", "dtype: int64" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.isna().sum()" ] }, { "cell_type": "code", "execution_count": 12, "id": "84bcac4f", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 4803 entries, 0 to 4802\n", "Data columns (total 24 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 index 4803 non-null int64 \n", " 1 budget 4803 non-null int64 \n", " 2 genres 4803 non-null object \n", " 3 homepage 1712 non-null object \n", " 4 id 4803 non-null int64 \n", " 5 keywords 4803 non-null object \n", " 6 original_language 4803 non-null object \n", " 7 original_title 4803 non-null object \n", " 8 overview 4800 non-null object \n", " 9 popularity 4803 non-null float64\n", " 10 production_companies 4803 non-null object \n", " 11 production_countries 4803 non-null object \n", " 12 release_date 4802 non-null object \n", " 13 revenue 4803 non-null int64 \n", " 14 runtime 4801 non-null float64\n", " 15 spoken_languages 4803 non-null object \n", " 16 status 4803 non-null object \n", " 17 tagline 3959 non-null object \n", " 18 title 4803 non-null object \n", " 19 vote_average 4803 non-null float64\n", " 20 vote_count 4803 non-null int64 \n", " 21 cast 4803 non-null object \n", " 22 crew 4803 non-null object \n", " 23 director 4803 non-null object \n", "dtypes: float64(3), int64(5), object(16)\n", "memory usage: 900.7+ KB\n" ] } ], "source": [ "df.info()" ] }, { "cell_type": "code", "execution_count": 13, "id": "13cb0f62", "metadata": {}, "outputs": [], "source": [ "def combined_features(row):\n", " return row['keywords']+\" \"+row['cast']+\" \"+row['genres']+\" \"+row['director']" ] }, { "cell_type": "code", "execution_count": 14, "id": "437baa14", "metadata": {}, "outputs": [], "source": [ "df[\"combined_features\"] = df.apply(combined_features, axis=1)" ] }, { "cell_type": "code", "execution_count": 15, "id": "79226699", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
indexbudgetgenreshomepageidkeywordsoriginal_languageoriginal_titleoverviewpopularity...spoken_languagesstatustaglinetitlevote_averagevote_countcastcrewdirectorcombined_features
47984798220000Action Crime ThrillerNaN9367united states\\u2013mexico barrier legs arms pa...esEl MariachiEl Mariachi just wants to play his guitar and ...14.269792...[{\"iso_639_1\": \"es\", \"name\": \"Espa\\u00f1ol\"}]ReleasedHe didn't come looking for trouble, but troubl...El Mariachi6.6238Carlos Gallardo Jaime de Hoyos Peter Marquardt...[{'name': 'Robert Rodriguez', 'gender': 0, 'de...Robert Rodriguezunited states\\u2013mexico barrier legs arms pa...
479947999000Comedy RomanceNaN72766enNewlywedsA newlywed couple's honeymoon is upended by th...0.642552...[]ReleasedA newlywed couple's honeymoon is upended by th...Newlyweds5.95Edward Burns Kerry Bish\\u00e9 Marsha Dietlein ...[{'name': 'Edward Burns', 'gender': 2, 'depart...Edward BurnsEdward Burns Kerry Bish\\u00e9 Marsha Dietlein...
480048000Comedy Drama Romance TV Moviehttp://www.hallmarkchannel.com/signedsealeddel...231617date love at first sight narration investigati...enSigned, Sealed, Delivered\"Signed, Sealed, Delivered\" introduces a dedic...1.444476...[{\"iso_639_1\": \"en\", \"name\": \"English\"}]ReleasedNaNSigned, Sealed, Delivered7.06Eric Mabius Kristin Booth Crystal Lowe Geoff G...[{'name': 'Carla Hetland', 'gender': 0, 'depar...Scott Smithdate love at first sight narration investigati...
480148010http://shanghaicalling.com/126186enShanghai CallingWhen ambitious New York attorney Sam is sent t...0.857008...[{\"iso_639_1\": \"en\", \"name\": \"English\"}]ReleasedA New Yorker in ShanghaiShanghai Calling5.77Daniel Henney Eliza Coupe Bill Paxton Alan Ruc...[{'name': 'Daniel Hsia', 'gender': 2, 'departm...Daniel HsiaDaniel Henney Eliza Coupe Bill Paxton Alan Ru...
480248020DocumentaryNaN25975obsession camcorder crush dream girlenMy Date with DrewEver since the second grade when he first saw ...1.929883...[{\"iso_639_1\": \"en\", \"name\": \"English\"}]ReleasedNaNMy Date with Drew6.316Drew Barrymore Brian Herzlinger Corey Feldman ...[{'name': 'Clark Peterson', 'gender': 2, 'depa...Brian Herzlingerobsession camcorder crush dream girl Drew Barr...
\n", "

5 rows × 25 columns

\n", "
" ], "text/plain": [ " index budget genres \\\n", "4798 4798 220000 Action Crime Thriller \n", "4799 4799 9000 Comedy Romance \n", "4800 4800 0 Comedy Drama Romance TV Movie \n", "4801 4801 0 \n", "4802 4802 0 Documentary \n", "\n", " homepage id \\\n", "4798 NaN 9367 \n", "4799 NaN 72766 \n", "4800 http://www.hallmarkchannel.com/signedsealeddel... 231617 \n", "4801 http://shanghaicalling.com/ 126186 \n", "4802 NaN 25975 \n", "\n", " keywords original_language \\\n", "4798 united states\\u2013mexico barrier legs arms pa... es \n", "4799 en \n", "4800 date love at first sight narration investigati... en \n", "4801 en \n", "4802 obsession camcorder crush dream girl en \n", "\n", " original_title \\\n", "4798 El Mariachi \n", "4799 Newlyweds \n", "4800 Signed, Sealed, Delivered \n", "4801 Shanghai Calling \n", "4802 My Date with Drew \n", "\n", " overview popularity ... \\\n", "4798 El Mariachi just wants to play his guitar and ... 14.269792 ... \n", "4799 A newlywed couple's honeymoon is upended by th... 0.642552 ... \n", "4800 \"Signed, Sealed, Delivered\" introduces a dedic... 1.444476 ... \n", "4801 When ambitious New York attorney Sam is sent t... 0.857008 ... \n", "4802 Ever since the second grade when he first saw ... 1.929883 ... \n", "\n", " spoken_languages status \\\n", "4798 [{\"iso_639_1\": \"es\", \"name\": \"Espa\\u00f1ol\"}] Released \n", "4799 [] Released \n", "4800 [{\"iso_639_1\": \"en\", \"name\": \"English\"}] Released \n", "4801 [{\"iso_639_1\": \"en\", \"name\": \"English\"}] Released \n", "4802 [{\"iso_639_1\": \"en\", \"name\": \"English\"}] Released \n", "\n", " tagline \\\n", "4798 He didn't come looking for trouble, but troubl... \n", "4799 A newlywed couple's honeymoon is upended by th... \n", "4800 NaN \n", "4801 A New Yorker in Shanghai \n", "4802 NaN \n", "\n", " title vote_average vote_count \\\n", "4798 El Mariachi 6.6 238 \n", "4799 Newlyweds 5.9 5 \n", "4800 Signed, Sealed, Delivered 7.0 6 \n", "4801 Shanghai Calling 5.7 7 \n", "4802 My Date with Drew 6.3 16 \n", "\n", " cast \\\n", "4798 Carlos Gallardo Jaime de Hoyos Peter Marquardt... \n", "4799 Edward Burns Kerry Bish\\u00e9 Marsha Dietlein ... \n", "4800 Eric Mabius Kristin Booth Crystal Lowe Geoff G... \n", "4801 Daniel Henney Eliza Coupe Bill Paxton Alan Ruc... \n", "4802 Drew Barrymore Brian Herzlinger Corey Feldman ... \n", "\n", " crew director \\\n", "4798 [{'name': 'Robert Rodriguez', 'gender': 0, 'de... Robert Rodriguez \n", "4799 [{'name': 'Edward Burns', 'gender': 2, 'depart... Edward Burns \n", "4800 [{'name': 'Carla Hetland', 'gender': 0, 'depar... Scott Smith \n", "4801 [{'name': 'Daniel Hsia', 'gender': 2, 'departm... Daniel Hsia \n", "4802 [{'name': 'Clark Peterson', 'gender': 2, 'depa... Brian Herzlinger \n", "\n", " combined_features \n", "4798 united states\\u2013mexico barrier legs arms pa... \n", "4799 Edward Burns Kerry Bish\\u00e9 Marsha Dietlein... \n", "4800 date love at first sight narration investigati... \n", "4801 Daniel Henney Eliza Coupe Bill Paxton Alan Ru... \n", "4802 obsession camcorder crush dream girl Drew Barr... \n", "\n", "[5 rows x 25 columns]" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.tail()" ] }, { "cell_type": "code", "execution_count": 16, "id": "59bdfd01", "metadata": {}, "outputs": [], "source": [ "Tfidf_vect = TfidfVectorizer()\n", "vector_matrix = Tfidf_vect.fit_transform(df[\"combined_features\"])" ] }, { "cell_type": "code", "execution_count": 17, "id": "2361e4a3", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "<4803x14845 sparse matrix of type ''\n", "\twith 97547 stored elements in Compressed Sparse Row format>" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "vector_matrix" ] }, { "cell_type": "code", "execution_count": 18, "id": "ba641108", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[0., 0., 0., ..., 0., 0., 0.],\n", " [0., 0., 0., ..., 0., 0., 0.],\n", " [0., 0., 0., ..., 0., 0., 0.],\n", " ...,\n", " [0., 0., 0., ..., 0., 0., 0.],\n", " [0., 0., 0., ..., 0., 0., 0.],\n", " [0., 0., 0., ..., 0., 0., 0.]])" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "vector_matrix.toarray()" ] }, { "cell_type": "code", "execution_count": 19, "id": "e1b80eb0", "metadata": {}, "outputs": [], "source": [ "from sklearn.metrics.pairwise import cosine_similarity" ] }, { "cell_type": "code", "execution_count": 20, "id": "ba839fc3", "metadata": {}, "outputs": [], "source": [ "cosine_sim = cosine_similarity(vector_matrix)" ] }, { "cell_type": "code", "execution_count": 21, "id": "4aef9fe1", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[1. , 0.02703888, 0.04479062, ..., 0. , 0. ,\n", " 0. ],\n", " [0.02703888, 1. , 0.01505199, ..., 0.01653831, 0. ,\n", " 0. ],\n", " [0.04479062, 0.01505199, 1. , ..., 0. , 0.06576044,\n", " 0. ],\n", " ...,\n", " [0. , 0.01653831, 0. , ..., 1. , 0. ,\n", " 0.02554794],\n", " [0. , 0. , 0.06576044, ..., 0. , 1. ,\n", " 0. ],\n", " [0. , 0. , 0. , ..., 0.02554794, 0. ,\n", " 1. ]])" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cosine_sim" ] }, { "cell_type": "code", "execution_count": 22, "id": "e5a9353a", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0123456789...4793479447954796479747984799480048014802
01.0000000.0270390.0447910.0078960.1140580.0718650.00.0411710.0248690.030587...0.0000000.0000000.0000000.0200070.0000000.0412000.0000000.0000000.000000.000000
10.0270391.0000000.0150520.0068080.0146990.1144750.00.0142610.0367820.026371...0.0000000.0000000.0000000.0000000.0000000.0050070.0000000.0165380.000000.000000
20.0447910.0150521.0000000.0629430.0754070.0441880.00.0884940.0412020.051567...0.0000000.0000000.0000000.0000000.0000000.0146660.0000000.0000000.065760.000000
30.0078960.0068080.0629431.0000000.0079610.0438770.00.0077230.0368500.112528...0.0155470.0085050.0285970.0381290.0137340.0305540.0000000.0037310.000000.000000
40.1140580.0146990.0754070.0079611.0000000.1514490.00.0753350.0100520.050359...0.0000000.0301530.0000000.0505530.0000000.0058550.0000000.0000000.000000.000000
..................................................................
47980.0412000.0050070.0146660.0305540.0058550.0057300.00.0186880.0000000.005664...0.0000000.0062560.0151590.0042990.0101011.0000000.0000000.0000000.000000.000000
47990.0000000.0000000.0000000.0000000.0000000.0000000.00.0000000.0000000.000000...0.0000000.0057610.0000000.0000000.0000000.0000001.0000000.0126050.000000.000000
48000.0000000.0165380.0000000.0037310.0000000.0189240.00.0000000.0000000.000000...0.0029870.0053710.0031000.0027180.0000000.0000000.0126051.0000000.000000.025548
48010.0000000.0000000.0657600.0000000.0000000.0000000.00.0000000.0656730.000000...0.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000001.000000.000000
48020.0000000.0000000.0000000.0000000.0000000.0000000.00.0000000.0000000.000000...0.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0255480.000001.000000
\n", "

4803 rows × 4803 columns

\n", "
" ], "text/plain": [ " 0 1 2 3 4 5 6 \\\n", "0 1.000000 0.027039 0.044791 0.007896 0.114058 0.071865 0.0 \n", "1 0.027039 1.000000 0.015052 0.006808 0.014699 0.114475 0.0 \n", "2 0.044791 0.015052 1.000000 0.062943 0.075407 0.044188 0.0 \n", "3 0.007896 0.006808 0.062943 1.000000 0.007961 0.043877 0.0 \n", "4 0.114058 0.014699 0.075407 0.007961 1.000000 0.151449 0.0 \n", "... ... ... ... ... ... ... ... \n", "4798 0.041200 0.005007 0.014666 0.030554 0.005855 0.005730 0.0 \n", "4799 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.0 \n", "4800 0.000000 0.016538 0.000000 0.003731 0.000000 0.018924 0.0 \n", "4801 0.000000 0.000000 0.065760 0.000000 0.000000 0.000000 0.0 \n", "4802 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.0 \n", "\n", " 7 8 9 ... 4793 4794 4795 \\\n", "0 0.041171 0.024869 0.030587 ... 0.000000 0.000000 0.000000 \n", "1 0.014261 0.036782 0.026371 ... 0.000000 0.000000 0.000000 \n", "2 0.088494 0.041202 0.051567 ... 0.000000 0.000000 0.000000 \n", "3 0.007723 0.036850 0.112528 ... 0.015547 0.008505 0.028597 \n", "4 0.075335 0.010052 0.050359 ... 0.000000 0.030153 0.000000 \n", "... ... ... ... ... ... ... ... \n", "4798 0.018688 0.000000 0.005664 ... 0.000000 0.006256 0.015159 \n", "4799 0.000000 0.000000 0.000000 ... 0.000000 0.005761 0.000000 \n", "4800 0.000000 0.000000 0.000000 ... 0.002987 0.005371 0.003100 \n", "4801 0.000000 0.065673 0.000000 ... 0.000000 0.000000 0.000000 \n", "4802 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 \n", "\n", " 4796 4797 4798 4799 4800 4801 4802 \n", "0 0.020007 0.000000 0.041200 0.000000 0.000000 0.00000 0.000000 \n", "1 0.000000 0.000000 0.005007 0.000000 0.016538 0.00000 0.000000 \n", "2 0.000000 0.000000 0.014666 0.000000 0.000000 0.06576 0.000000 \n", "3 0.038129 0.013734 0.030554 0.000000 0.003731 0.00000 0.000000 \n", "4 0.050553 0.000000 0.005855 0.000000 0.000000 0.00000 0.000000 \n", "... ... ... ... ... ... ... ... \n", "4798 0.004299 0.010101 1.000000 0.000000 0.000000 0.00000 0.000000 \n", "4799 0.000000 0.000000 0.000000 1.000000 0.012605 0.00000 0.000000 \n", "4800 0.002718 0.000000 0.000000 0.012605 1.000000 0.00000 0.025548 \n", "4801 0.000000 0.000000 0.000000 0.000000 0.000000 1.00000 0.000000 \n", "4802 0.000000 0.000000 0.000000 0.000000 0.025548 0.00000 1.000000 \n", "\n", "[4803 rows x 4803 columns]" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.DataFrame(cosine_sim)" ] }, { "cell_type": "code", "execution_count": 23, "id": "56252702", "metadata": {}, "outputs": [], "source": [ "#movie_user_like = \"Dead Poets Society\"\n", "movie_user_like = \"Batman\"\n", "def get_index_from_title(title):\n", " return df[df.title == title][\"index\"].values[0]" ] }, { "cell_type": "code", "execution_count": 24, "id": "9be834ce", "metadata": {}, "outputs": [], "source": [ "def check_movie(m_name):\n", " movie_index = get_index_from_title(m_name)\n", " similar_movies= list(enumerate(cosine_sim[movie_index]))\n", " sorted_similar_movies = sorted(similar_movies, key=lambda x:x[1], reverse=True)\n", " mv = get_suggestions(sorted_similar_movies)\n", " return mv" ] }, { "cell_type": "markdown", "id": "76ed784a", "metadata": {}, "source": [ "movie_index = get_index_from_title(movie_user_like)" ] }, { "cell_type": "markdown", "id": "f578515e", "metadata": {}, "source": [ "similar_movies= list(enumerate(cosine_sim[movie_index]))" ] }, { "cell_type": "markdown", "id": "7a65d60f", "metadata": {}, "source": [ "sorted_similar_movies = sorted(similar_movies, key=lambda x:x[1], reverse=True)" ] }, { "cell_type": "markdown", "id": "893b1095", "metadata": {}, "source": [ "sorted_similar_movies" ] }, { "cell_type": "code", "execution_count": 26, "id": "d9ffed79", "metadata": {}, "outputs": [], "source": [ "def get_title_from_index(index):\n", " return df[df.index == index][\"title\"].values[0]" ] }, { "cell_type": "markdown", "id": "cb175fc0", "metadata": {}, "source": [ "df[df.index == 0][\"title\"].values[0]" ] }, { "cell_type": "markdown", "id": "cc91a9e3", "metadata": {}, "source": [ "df.shape" ] }, { "cell_type": "code", "execution_count": 29, "id": "9a3dcea2", "metadata": {}, "outputs": [], "source": [ "def get_suggestions(sorted_similar_movies):\n", " i=0\n", " movies = \"\"\n", " for movie in sorted_similar_movies:\n", " t = get_title_from_index(movie[0])\n", " movies = movies + t +\"\\n\"\n", " \n", " i=i+1\n", " if i>15:\n", " print(movies)\n", " return movies" ] }, { "cell_type": "code", "execution_count": null, "id": "1d8a296f", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "id": "782192ef", "metadata": {}, "source": [ "get_suggestions()" ] }, { "cell_type": "code", "execution_count": 2, "id": "0333fa75", "metadata": {}, "outputs": [], "source": [ "import gradio as gr" ] }, { "cell_type": "code", "execution_count": 31, "id": "64a0399e", "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Running on local URL: http://127.0.0.1:7860/\n", "Running on public URL: https://45339.gradio.app\n", "\n", "This share link expires in 72 hours. For free permanent hosting, check out Spaces (https://huggingface.co./spaces)\n" ] }, { "data": { "text/html": [ "
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "(,\n", " 'http://127.0.0.1:7860/',\n", " 'https://45339.gradio.app')" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" }, { "name": "stdout", "output_type": "stream", "text": [ "Batman\n", "Batman Returns\n", "Batman & Robin\n", "The Dark Knight Rises\n", "Batman Begins\n", "The Dark Knight\n", "A History of Violence\n", "The Sentinel\n", "Superman\n", "Mars Attacks!\n", "Beetlejuice\n", "The Mask\n", "Man of Steel\n", "Jonah Hex\n", "Spider-Man 3\n", "Spider-Man 2\n", "\n" ] } ], "source": [ "movie = gr.Interface(fn=check_movie, inputs=\"text\", outputs=\"text\")\n", "\n", "movie.launch(share=True)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.7" }, "latex_envs": { "LaTeX_envs_menu_present": true, "autoclose": false, "autocomplete": true, "bibliofile": "biblio.bib", "cite_by": "apalike", "current_citInitial": 1, "eqLabelWithNumbers": true, "eqNumInitial": 1, "hotkeys": { "equation": "Ctrl-E", "itemize": "Ctrl-I" }, "labels_anchors": false, "latex_user_defs": false, "report_style_numbering": false, "user_envs_cfg": false } }, "nbformat": 4, "nbformat_minor": 5 }