fastlane/services/utils.py
import re
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sentence_transformers import SentenceTransformer

# Shared NLP resources, loaded once at module import time.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
model = SentenceTransformer('all-mpnet-base-v2')


def clean_text(text):
    """Lowercase, strip non-letter characters, remove stopwords, and lemmatize."""
    # Lowercase
    text = text.lower()
    # Remove special characters and digits
    text = re.sub(r'[^a-z\s]', '', text)
    # Tokenize
    words = word_tokenize(text)
    # Remove stopwords and lemmatize
    words = [lemmatizer.lemmatize(word)
             for word in words if word not in stop_words]
    # Join words back into a single string
    cleaned_text = ' '.join(words)
    return cleaned_text


def encode_and_normalize(text):
    """Encode text with the sentence-transformer model and L2-normalize the result."""
    vector = model.encode(text)
    normalized_vector = vector / np.linalg.norm(vector)
    return normalized_vector


def extract_order_id_from_query(text):
    """Return the first order ID of the form 'B-<digits>' found in the text, or None."""
    match = re.search(r'\bB-\d+\b', text)
    if match:
        return match.group(0)
    return None
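
For reference, a minimal usage sketch of these helpers. The import path, the nltk.download calls, and the sample query below are illustrative assumptions rather than part of the original file; the NLTK corpora must be available for clean_text to run.

# Illustrative usage sketch (assumes the module above is importable as
# fastlane.services.utils and that the NLTK data has been fetched beforehand,
# e.g. nltk.download('punkt'), nltk.download('stopwords'), nltk.download('wordnet')).
from fastlane.services.utils import clean_text, encode_and_normalize, extract_order_id_from_query

query = "Where is my order B-48213? It was supposed to arrive yesterday."

order_id = extract_order_id_from_query(query)   # -> "B-48213"
cleaned = clean_text(query)                     # roughly "order b supposed arrive yesterday"
vector = encode_and_normalize(cleaned)          # unit-length 768-dim vector from all-mpnet-base-v2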