import os
from typing import Dict, List

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import pairwise_distances

from utils.config import Config

# Read the dataset path from the application config.
dataset_path = Config.read('app', 'dataset')

# Ensure the dataset file exists before attempting to load it.
if not os.path.exists(dataset_path):
    raise FileNotFoundError(f"The dataset file at {dataset_path} was not found.")

# Load the dataset (a pickled pandas DataFrame).
data = pd.read_pickle(dataset_path)

# Ensure the dataset has the necessary columns.
required_columns = ['asin', 'title', 'brand', 'medium_image_url']
for col in required_columns:
    if col not in data.columns:
        raise ValueError(f"Missing required column: {col} in the dataset")

# Drop rows with missing titles so CountVectorizer does not fail on NaN values.
data = data.dropna(subset=['title'])

# Set up the vectorizer and build the bag-of-words matrix over all titles.
title_vectorizer = CountVectorizer()
title_features = title_vectorizer.fit_transform(data['title'])


def bag_of_words_model(query: str, num_results: int) -> List[Dict]:
    """Return the num_results items whose titles are closest to the query
    under cosine distance in the bag-of-words feature space."""
    # Transform the input query into the same feature space as the corpus.
    query_vec = title_vectorizer.transform([query])

    # Compute cosine distances between every title in the corpus and the query.
    pairwise_dist = pairwise_distances(title_features, query_vec, metric='cosine')

    # Indices of the closest matches, nearest first.
    indices = np.argsort(pairwise_dist.flatten())[:num_results]

    results = []
    for idx in indices:
        results.append({
            'asin': data['asin'].iloc[idx],
            'brand': data['brand'].iloc[idx],
            'title': data['title'].iloc[idx],
            'url': data['medium_image_url'].iloc[idx],
        })
    return results
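

# A minimal usage sketch: the query string and result count below are
# hypothetical and assume the configured dataset contains product titles
# matching the query. Run from the project root so that utils.config resolves.
if __name__ == "__main__":
    # Retrieve the 5 items whose titles are closest to the query.
    for rank, item in enumerate(bag_of_words_model("running shoes", num_results=5), start=1):
        print(f"{rank}. {item['brand']} - {item['title']} ({item['asin']})")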