import re

import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sentence_transformers import SentenceTransformer

# One-time NLTK resource downloads (no-ops when already installed).
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)

# Shared resources: English stopword set, WordNet lemmatizer, and the
# sentence-embedding model used for semantic matching.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
model = SentenceTransformer('all-mpnet-base-v2')


def clean_text(text):
    """Lowercase, strip non-letters, then remove stopwords and lemmatize."""
    text = text.lower()
    # Keep only lowercase letters and whitespace.
    text = re.sub(r'[^a-z\s]', '', text)
    words = word_tokenize(text)
    # Drop stopwords and reduce each remaining token to its lemma.
    words = [lemmatizer.lemmatize(word)
             for word in words if word not in stop_words]
    cleaned_text = ' '.join(words)
    return cleaned_text
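
# Illustrative example (exact output depends on the installed NLTK data):
#   clean_text("The 3 orders were SHIPPED quickly!")
#   -> "order shipped quickly"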


def encode_and_normalize(text):
    """Embed text and scale the vector to unit length (L2 norm of 1)."""
    vector = model.encode(text)
    # Assumes a nonzero embedding, which holds for this model in practice.
    normalized_vector = vector / np.linalg.norm(vector)
    return normalized_vector
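
# With unit-length vectors, cosine similarity reduces to a dot product, e.g.
#   float(np.dot(encode_and_normalize(a), encode_and_normalize(b)))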


def extract_order_id_from_query(text):
    """Return the first order ID of the form 'B-<digits>', or None."""
    match = re.search(r'\bB-\d+\b', text)
    if match:
        return match.group(0)
    return None
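

# Minimal smoke test; the sample strings below are made up for illustration.
if __name__ == "__main__":
    query = "Where is order B-12345? It still hasn't SHIPPED!"
    print(clean_text(query))
    print(extract_order_id_from_query(query))
    # Cosine similarity between two normalized embeddings.
    sim = float(np.dot(encode_and_normalize("where is my package"),
                       encode_and_normalize("track my shipment status")))
    print(f"similarity: {sim:.3f}")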