import re

import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sentence_transformers import SentenceTransformer

# Requires one-time NLTK data downloads:
#   nltk.download('punkt'); nltk.download('stopwords'); nltk.download('wordnet')
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
model = SentenceTransformer('all-mpnet-base-v2')


def clean_text(text):
    # Lowercase so the character filter and stopword lookup behave consistently
    text = text.lower()
    # Remove everything except lowercase letters and whitespace (drops digits and punctuation)
    text = re.sub(r'[^a-z\s]', '', text)
    # Tokenize into individual words
    words = word_tokenize(text)
    # Remove stopwords and lemmatize the remaining tokens
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    # Join the words back into a single string
    return ' '.join(words)


def encode_and_normalize(text):
    # Embed the text, then scale to unit length so a dot product
    # between two vectors equals their cosine similarity
    vector = model.encode(text)
    return vector / np.linalg.norm(vector)


def extract_order_id_from_query(text):
    # Order IDs are "B-" followed by digits (e.g. B-12345). Run this on the
    # raw query: clean_text() would strip the hyphen and digits first.
    match = re.search(r'\bB-\d+\b', text)
    if match:
        return match.group(0)
    return None
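

# --- Minimal usage sketch. The sample query, the "B-48291" order ID, and the
# comparison sentence below are illustrative assumptions, not real data. ---
if __name__ == '__main__':
    query = "Where is my order B-48291? It hasn't arrived yet."

    # Extract the order ID from the raw query, before any cleaning
    order_id = extract_order_id_from_query(query)
    print(order_id)  # -> B-48291

    # Clean and embed the query; because both vectors are unit length,
    # cosine similarity reduces to a plain dot product
    query_vec = encode_and_normalize(clean_text(query))
    doc_vec = encode_and_normalize(clean_text("order delivery status and tracking"))
    similarity = float(np.dot(query_vec, doc_vec))
    print(f"cosine similarity: {similarity:.3f}")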