"""Streamlit page that recommends recipes from a free-text ingredient list.

The page normalises the ingredients typed by the user, encodes them with a
pre-trained TF-IDF model loaded from disk, scores them against the stored
recipe encodings with cosine similarity and displays the best matches.
"""
import ast
import pickle
import re
import string
from collections import Counter

import nltk
import pandas as pd
import streamlit as st
import unidecode
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

nltk.download('wordnet')

# Measuring words dropped from every ingredient.  Tokens are single, lowercase,
# alphabetic words by the time they are compared, so the multi-word, uppercase
# and punctuated entries below can never match; they are kept for reference.
_MEASURES = frozenset([
    'teaspoon', 't', 'tsp', 'tablespoon', 'T', 'tbl.', 'tb', 'tbsp.',
    'fluid ounce', 'fl oz', 'gill', 'cup', 'c', 'pint', 'p', 'pt', 'fl pt',
    'quart', 'q', 'qt', 'fl qt', 'gallon', 'g', 'gal', 'ml', 'milliliter',
    'millilitre', 'cc', 'mL', 'l', 'liter', 'litre', 'L', 'dl', 'deciliter',
    'decilitre', 'dL', 'bulb', 'level', 'heaped', 'rounded', 'whole', 'pinch',
    'medium', 'slice', 'pound', 'lb', '#', 'ounce', 'oz', 'mg', 'milligram',
    'milligramme', 'g', 'gram', 'gramme', 'kg', 'kilogram', 'kilogramme', 'x',
    'of', 'mm', 'millimetre', 'millimeter', 'cm', 'centimeter', 'centimetre',
    'm', 'meter', 'metre', 'inch', 'in', 'milli', 'centi', 'deci', 'hecto',
    'kilo',
])

# Common descriptive words that carry no information about the ingredient.
# NOTE: 'other' and 'chopped' used to be fused into 'otherchopped' by a missing
# comma, so neither word was filtered.  If the pickled TF-IDF artefacts were
# built with that version of the list, rebuilding them keeps both sides in sync.
_WORDS_TO_REMOVE = frozenset([
    'fresh', 'oil', 'a', 'red', 'bunch', 'and', 'clove', 'or', 'leaf',
    'chilly', 'chillies', 'large', 'extra', 'sprig', 'ground', 'handful',
    'free', 'small', 'pepper', 'virgin', 'range', 'from', 'dried',
    'sustainable', 'black', 'peeled', 'higher', 'welfare', 'seed', 'for',
    'finely', 'freshly', 'sea', 'quality', 'white', 'ripe', 'few', 'piece',
    'source', 'to', 'organic', 'flat', 'smoked', 'ginger', 'sliced', 'green',
    'picked', 'the', 'stick', 'plain', 'plus', 'mixed', 'mint', 'bay', 'basil',
    'your', 'cumin', 'optional', 'fennel', 'serve', 'mustard', 'unsalted',
    'baby', 'paprika', 'fat', 'ask', 'natural', 'skin', 'roughly', 'into',
    'such', 'cut', 'good', 'brown', 'grated', 'trimmed', 'oregano', 'powder',
    'yellow', 'dusting', 'knob', 'frozen', 'on', 'deseeded', 'low', 'runny',
    'balsamic', 'cooked', 'streaky', 'nutmeg', 'sage', 'rasher', 'zest', 'pin',
    'groundnut', 'breadcrumb', 'turmeric', 'halved', 'grating', 'stalk',
    'light', 'tinned', 'dry', 'soft', 'rocket', 'bone', 'colour', 'washed',
    'skinless', 'leftover', 'splash', 'removed', 'dijon', 'thick', 'big',
    'hot', 'drained', 'sized', 'chestnut', 'watercress', 'fishmonger',
    'english', 'dill', 'caper', 'raw', 'worcestershire', 'flake', 'cider',
    'cayenne', 'tbsp', 'leg', 'pine', 'wild', 'if', 'fine', 'herb', 'almond',
    'shoulder', 'cube', 'dressing', 'with', 'chunk', 'spice', 'thumb', 'garam',
    'new', 'little', 'punnet', 'peppercorn', 'shelled', 'saffron', 'other',
    'chopped', 'salt', 'olive', 'taste', 'can', 'sauce', 'water', 'diced',
    'package', 'italian', 'shredded', 'divided', 'parsley', 'vinegar', 'all',
    'purpose', 'crushed', 'juice', 'more', 'coriander', 'bell', 'needed',
    'thinly', 'boneless', 'half', 'thyme', 'cubed', 'cinnamon', 'cilantro',
    'jar', 'seasoning', 'rosemary', 'extract', 'sweet', 'baking', 'beaten',
    'heavy', 'seeded', 'tin', 'vanilla', 'uncooked', 'crumb', 'style', 'thin',
    'nut', 'coarsely', 'spring', 'chili', 'cornstarch', 'strip', 'cardamom',
    'rinsed', 'honey', 'cherry', 'root', 'quartered', 'head', 'softened',
    'container', 'crumbled', 'frying', 'lean', 'cooking', 'roasted', 'warm',
    'whipping', 'thawed', 'corn', 'pitted', 'sun', 'kosher', 'bite', 'toasted',
    'lasagna', 'split', 'melted', 'degree', 'lengthwise', 'romano', 'packed',
    'pod', 'anchovy', 'rom', 'prepared', 'juiced', 'fluid', 'floret', 'room',
    'active', 'seasoned', 'mix', 'deveined', 'lightly', 'anise', 'thai',
    'size', 'unsweetened', 'torn', 'wedge', 'sour', 'basmati', 'marinara',
    'dark', 'temperature', 'garnish', 'bouillon', 'loaf', 'shell', 'reggiano',
    'canola', 'parmigiano', 'round', 'canned', 'ghee', 'crust', 'long',
    'broken', 'ketchup', 'bulk', 'cleaned', 'condensed', 'sherry', 'provolone',
    'cold', 'soda', 'cottage', 'spray', 'tamarind', 'pecorino', 'shortening',
    'part', 'bottle', 'sodium', 'cocoa', 'grain', 'french', 'roast', 'stem',
    'link', 'firm', 'asafoetida', 'mild', 'dash', 'boiling',
])

# Apostrophes are deleted ("baker's" -> "bakers"); every other punctuation
# character, hyphens included, becomes a space so that it separates words
# ("chicken,rice" and "extra-virgin" both yield two tokens).
_PUNCTUATION_TABLE = str.maketrans(
    {char: None if char == "'" else ' ' for char in string.punctuation}
)

_LEMMATIZER = WordNetLemmatizer()


def _as_ingredient_list(ingreds):
    """Return *ingreds* as a list of ingredient strings.

    Lists and tuples are used as they are.  Anything else is tried as a Python
    literal (e.g. the text "['2 eggs', 'milk']"); when it is not a literal
    list/tuple the raw value is treated as one single ingredient, which is the
    case for free text typed by a user.  Entries that are not strings are
    ignored.
    """
    if isinstance(ingreds, (list, tuple)):
        candidates = ingreds
    else:
        try:
            candidates = ast.literal_eval(ingreds)
        except (ValueError, TypeError, SyntaxError,
                MemoryError, RecursionError):
            # Not a Python literal: plain text such as "chicken, rice".
            candidates = None
        if not isinstance(candidates, (list, tuple)):
            candidates = [ingreds]
    return [entry for entry in candidates if isinstance(entry, str)]


def ingredient_parser(ingreds):
    """Reduce raw ingredient descriptions to their essential words.

    Args:
        ingreds: A list of ingredient strings, the textual representation of
            such a list, or free text describing one or more ingredients.

    Returns:
        A single space-separated string of lowercase, accent-free, lemmatized
        words with measuring words and common filler words removed.  The
        string is empty when nothing survives the filtering.
    """
    parsed = []
    for raw in _as_ingredient_list(ingreds):
        # str.translate returns a new string; the result must be used.
        words = raw.translate(_PUNCTUATION_TABLE).split()
        # Keep purely alphabetic words (drops quantities such as "200g").
        words = [word for word in words if word.isalpha()]
        # Lowercase, then strip accents.
        words = [unidecode.unidecode(word.lower()) for word in words]
        # Lemmatize so plural forms match the stop lists.
        words = [_LEMMATIZER.lemmatize(word) for word in words]
        words = [
            word for word in words
            if word not in _MEASURES and word not in _WORDS_TO_REMOVE
        ]
        if words:
            parsed.append(' '.join(words))
    return ' '.join(parsed)


def scorefunc(ingredients):
    """Score the given ingredients against every stored recipe encoding.

    Loads the TF-IDF encodings and the fitted TF-IDF model from
    'tfidf_encodings.pkl' and 'tfidf_model.pkl' in the working directory.

    Args:
        ingredients: Anything accepted by ``ingredient_parser``.

    Returns:
        A list with one ``cosine_similarity`` result per item obtained by
        iterating over the stored encodings, in the same order.
    """
    # NOTE: pickle can execute arbitrary code while loading; these files must
    # only ever be trusted, locally produced artefacts.
    with open('tfidf_encodings.pkl', 'rb') as f:
        tfidf_encodings = pickle.load(f)
    with open('tfidf_model.pkl', 'rb') as f:
        tfidf = pickle.load(f)

    # ingredient_parser copes with free text itself, so no blanket
    # try/except is needed here.
    ingredients_parsed = ingredient_parser(ingredients)
    ingredients_tfidf = tfidf.transform([ingredients_parsed])

    # Similarity between the query and each stored recipe encoding.
    return [
        cosine_similarity(ingredients_tfidf, encoding)
        for encoding in tfidf_encodings
    ]


def _score_value(score):
    """Return *score* as a plain float, unwrapping single-element arrays."""
    item = getattr(score, 'item', None)
    return float(item()) if callable(item) else float(score)


def get_recommendations(N, scores):
    """Build a table with the N best-scoring recipes.

    Args:
        N: Maximum number of recipes to return.
        scores: One similarity score per row of 'JO_parsed.csv', in row
            order (as produced by ``scorefunc``).

    Returns:
        A DataFrame with the columns 'Recipe', 'Ingredients', 'Score' and
        'Url', ordered from highest to lowest score.  'Score' holds the score
        formatted as text with three decimals.
    """
    df_recipes = pd.read_csv('JO_parsed.csv')

    # Convert once so sorting compares plain floats instead of arrays.
    values = [_score_value(score) for score in scores]
    top = sorted(range(len(values)), key=values.__getitem__, reverse=True)[:N]

    rows = [
        {
            'Recipe': df_recipes['recipe_name'].iloc[i],
            'Ingredients': df_recipes['ingredients'].iloc[i],
            'Score': "{:.3f}".format(values[i]),
            'Url': df_recipes['recipe_urls'].iloc[i],
        }
        for i in top
    ]
    return pd.DataFrame(
        rows, columns=['Recipe', 'Ingredients', 'Score', 'Url']
    )


def app():
    """Render the recommendation page: mode selector, input box, results."""
    st.markdown("*Recipe Recommendation System*")
    recipe_box = st.selectbox(
        "Display the top 5 recommendations or pick a particular recipe",
        ["Show the top picks", "Select a single recipe"],
    )
    N = 5 if recipe_box == "Show the top picks" else 1

    ing = st.text_input("Enter the ingredients you would like to cook with")
    if ing:
        scores = scorefunc(ing)
        rec = get_recommendations(N, scores)
        st.write("These are some recommendation(s) for you")
        st.write(rec.head(N))