|
import numpy as np |
|
import pandas as pd |
|
from sklearn.feature_extraction.text import TfidfVectorizer |
|
from sklearn.metrics.pairwise import pairwise_distances |
|
from typing import List, Dict |
|
from utils.config import Config |
|
import os |
|
|
|
|
|
|
|
# --- One-time setup: load the product catalog and fit the TF-IDF title model ---

# Resolve the dataset location from the application config.
dataset_path = Config.read('app', 'dataset')

# Fail fast with a clear error if the configured path does not exist.
if not os.path.exists(dataset_path):
    raise FileNotFoundError(f"The dataset file at {dataset_path} was not found.")

# NOTE(review): read_pickle can execute arbitrary code on untrusted files;
# assumes the dataset is trusted local data — confirm.
data = pd.read_pickle(dataset_path)

# Every column the recommender reads must be present; report the first one missing.
required_columns = ['asin', 'title', 'brand', 'medium_image_url']
missing = [column for column in required_columns if column not in data.columns]
if missing:
    raise ValueError(f"Missing required column: {missing[0]} in the dataset")

# Fit TF-IDF over all product titles (min_df=0.0 keeps every term in the vocabulary).
tfidf_title_vectorizer = TfidfVectorizer(min_df=0.0)
tfidf_title_features = tfidf_title_vectorizer.fit_transform(data['title'])
|
|
|
|
|
def tfidf_model(input_text: str, num_results: int) -> List[Dict]:
    """Return the products whose titles are most similar to *input_text*.

    Similarity is measured as pairwise distance between the query's TF-IDF
    vector and the pre-fitted TF-IDF vectors of all product titles
    (smaller distance = better match).

    Args:
        input_text: Free-text query to match against product titles.
        num_results: Maximum number of matches to return.

    Returns:
        A list of up to ``num_results`` dicts, ordered best-match first,
        each with keys ``asin``, ``brand``, ``title`` and ``url``
        (the product's medium image URL).
    """
    # Project the query into the TF-IDF space fitted on the catalog titles.
    query_vec = tfidf_title_vectorizer.transform([input_text])

    # Distance from every catalog title to the query (shape: n_titles x 1).
    pairwise_dist = pairwise_distances(tfidf_title_features, query_vec)

    # Positions of the num_results smallest distances, best first.
    indices = np.argsort(pairwise_dist.flatten())[:num_results]

    # Map positional indices back to the DataFrame's index labels.
    df_indices = data.index[indices]

    # Build one result record per match, preserving ranked order.
    return [
        {
            'asin': data['asin'].loc[idx],
            'brand': data['brand'].loc[idx],
            'title': data['title'].loc[idx],
            'url': data['medium_image_url'].loc[idx],
        }
        for idx in df_indices
    ]