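"""Helpers for cleaning text, producing unit-normalized sentence embeddings,
and extracting order IDs of the form 'B-<digits>' from user queries."""
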
import re
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sentence_transformers import SentenceTransformer
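
# NOTE: word_tokenize, the stopword list, and WordNetLemmatizer each rely on
# NLTK data packages that must be downloaded once per environment, e.g.
#   import nltk
#   nltk.download('punkt'); nltk.download('stopwords'); nltk.download('wordnet')
# Loading the SentenceTransformer below fetches model weights on first use.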

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
model = SentenceTransformer('all-mpnet-base-v2')


def clean_text(text):
    """Lowercase, strip non-letters, drop stopwords, and lemmatize."""
    # Lowercase
    text = text.lower()
    # Remove special characters and digits. Note: this also strips
    # apostrophes, so contractions like "hasn't" become "hasnt" and slip
    # past the stopword check below.
    text = re.sub(r'[^a-z\s]', '', text)
    # Tokenize
    words = word_tokenize(text)
    # Remove stopwords and lemmatize
    words = [lemmatizer.lemmatize(word)
             for word in words if word not in stop_words]
    # Join words back to a single string
    cleaned_text = ' '.join(words)
    return cleaned_text


def encode_and_normalize(text):
    """Embed text and scale the vector to unit length for cosine similarity."""
    vector = model.encode(text)
    # Unit-normalize so that a plain dot product equals cosine similarity.
    # (SentenceTransformer.encode can also do this via normalize_embeddings=True.)
    normalized_vector = vector / np.linalg.norm(vector)
    return normalized_vector


def extract_order_id_from_query(text):
    """Return the first order ID of the form 'B-<digits>' in text, or None."""
    # Run this on the raw query: clean_text() lowercases the text and strips
    # digits and hyphens, which would destroy the ID.
    match = re.search(r'\bB-\d+\b', text)
    if match:
        return match.group(0)
    return None
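

# --- Minimal usage sketch (illustrative, not part of the original module) ---
# Shows the intended flow and why unit-normalized vectors make a plain dot
# product act as cosine similarity. The sample queries are assumptions; it
# needs the NLTK data noted above and a working model download.
if __name__ == '__main__':
    query = "Where is my order B-12345? It hasn't arrived yet."
    print(extract_order_id_from_query(query))  # 'B-12345', taken from raw text

    a = encode_and_normalize(clean_text(query))
    b = encode_and_normalize(clean_text("My package never got delivered."))
    # Both embeddings have unit norm, so their dot product is the cosine
    # similarity; all-mpnet-base-v2 yields 768-dimensional vectors.
    print(a.shape, float(np.dot(a, b)))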