import os
from typing import Dict, List

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import pairwise_distances

from utils.config import Config

# Read the dataset path from the application config.
dataset_path = Config.read('app', 'dataset')

# Ensure the dataset file exists before attempting to load it.
if not os.path.exists(dataset_path):
    raise FileNotFoundError(f"The dataset file at {dataset_path} was not found.")

# Load the dataset (a pickled pandas DataFrame).
data = pd.read_pickle(dataset_path)

# Ensure the dataset has the necessary columns.
required_columns = ['asin', 'title', 'brand', 'medium_image_url']
for col in required_columns:
    if col not in data.columns:
        raise ValueError(f"Missing required column: {col} in the dataset")

# Drop rows with missing titles so CountVectorizer does not fail on NaN values.
data = data.dropna(subset=['title'])

# Set up the vectorizer and build the bag-of-words matrix over all titles.
title_vectorizer = CountVectorizer()
title_features = title_vectorizer.fit_transform(data['title'])


def bag_of_words_model(query: str, num_results: int) -> List[Dict]:
    """Return the num_results items whose titles are closest to the query
    under cosine distance in the bag-of-words feature space."""
    # Transform the input query into the same feature space as the corpus.
    query_vec = title_vectorizer.transform([query])

    # Compute cosine distances between every title in the corpus and the query.
    pairwise_dist = pairwise_distances(title_features, query_vec, metric='cosine')

    # Indices of the closest matches, nearest first.
    indices = np.argsort(pairwise_dist.flatten())[:num_results]

    results = []
    for idx in indices:
        results.append({
            'asin': data['asin'].iloc[idx],
            'brand': data['brand'].iloc[idx],
            'title': data['title'].iloc[idx],
            'url': data['medium_image_url'].iloc[idx],
        })
    return results
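

# A minimal usage sketch: the query string and result count below are
# hypothetical and assume the configured dataset contains product titles
# matching the query. Run from the project root so that utils.config resolves.
if __name__ == "__main__":
    # Retrieve the 5 items whose titles are closest to the query.
    for rank, item in enumerate(bag_of_words_model("running shoes", num_results=5), start=1):
        print(f"{rank}. {item['brand']} - {item['title']} ({item['asin']})")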