ariansyahdedy committed
Commit 99723c5 · 1 Parent(s): 4c47563

comment NLTK package

Files changed (1)
  1. app/search/bm25_search.py +39 -59
app/search/bm25_search.py CHANGED
@@ -1,46 +1,38 @@
-# bm25_search.py
 import asyncio
 from rank_bm25 import BM25Okapi
-import nltk
+# import nltk
 import string
 from typing import List, Set, Optional
-from nltk.corpus import stopwords
-from nltk.stem import WordNetLemmatizer
+# from nltk.corpus import stopwords
+# from nltk.stem import WordNetLemmatizer
 import os
 
-def download_nltk_resources():
-    """
-    Downloads required NLTK resources synchronously.
-    """
-    resources = ['punkt', 'stopwords', 'wordnet', 'omw-1.4']
-
-    nltk_data_path = "/tmp/nltk_data"  # Temporary directory for Hugging Face Spaces
-    os.makedirs(nltk_data_path, exist_ok=True)
-    nltk.data.path.append(nltk_data_path)
-
-    for resource in resources:
-        try:
-            nltk.download(resource, download_dir=nltk_data_path, quiet=True)
-        except Exception as e:
-            print(f"Error downloading {resource}: {str(e)}")
+# Commented out this function that downloads NLTK resources.
+# def download_nltk_resources():
+#     """
+#     Downloads required NLTK resources synchronously.
+#     """
+#     resources = ['punkt', 'stopwords', 'wordnet', 'omw-1.4']
+#     nltk_data_path = "/tmp/nltk_data"
+#     os.makedirs(nltk_data_path, exist_ok=True)
+#     nltk.data.path.append(nltk_data_path)
+#     for resource in resources:
+#         try:
+#             nltk.download(resource, download_dir=nltk_data_path, quiet=True)
+#         except Exception as e:
+#             print(f"Error downloading {resource}: {str(e)}")
 
 class BM25_search:
-    # Class variable to track if resources have been downloaded
     nltk_resources_downloaded = False
 
     def __init__(self, remove_stopwords: bool = True, perform_lemmatization: bool = False):
         """
         Initializes the BM25search.
-
-        Parameters:
-        - remove_stopwords (bool): Whether to remove stopwords during preprocessing.
-        - perform_lemmatization (bool): Whether to perform lemmatization on tokens.
-        """
-        # Ensure NLTK resources are downloaded only once
-        if not BM25_search.nltk_resources_downloaded:
-
-            download_nltk_resources()
-            BM25_search.nltk_resources_downloaded = True  # Mark as downloaded
+        """
+        # Commented out NLTK resource initialization
+        # if not BM25_search.nltk_resources_downloaded:
+        #     download_nltk_resources()
+        #     BM25_search.nltk_resources_downloaded = True
 
         self.documents: List[str] = []
         self.doc_ids: List[str] = []
@@ -48,20 +40,22 @@ class BM25_search:
         self.bm25: Optional[BM25Okapi] = None
         self.remove_stopwords = remove_stopwords
         self.perform_lemmatization = perform_lemmatization
-        self.stop_words: Set[str] = set(stopwords.words('english')) if remove_stopwords else set()
-        self.lemmatizer = WordNetLemmatizer() if perform_lemmatization else None
+        # Commented out NLTK-specific tools
+        # self.stop_words: Set[str] = set(stopwords.words('english')) if remove_stopwords else set()
+        # self.lemmatizer = WordNetLemmatizer() if perform_lemmatization else None
 
     def preprocess(self, text: str) -> List[str]:
         """
-        Preprocesses the input text by lowercasing, removing punctuation,
-        tokenizing, removing stopwords, and optionally lemmatizing.
+        Preprocesses the input text by lowercasing and removing punctuation.
+        NLTK-related tokenization, stopword removal, and lemmatization are commented out.
         """
         text = text.lower().translate(str.maketrans('', '', string.punctuation))
-        tokens = nltk.word_tokenize(text)
-        if self.remove_stopwords:
-            tokens = [token for token in tokens if token not in self.stop_words]
-        if self.perform_lemmatization and self.lemmatizer:
-            tokens = [self.lemmatizer.lemmatize(token) for token in tokens]
+        # tokens = nltk.word_tokenize(text)  # Commented out NLTK tokenization
+        tokens = text.split()  # Basic tokenization as a fallback
+        # if self.remove_stopwords:
+        #     tokens = [token for token in tokens if token not in self.stop_words]
+        # if self.perform_lemmatization and self.lemmatizer:
+        #     tokens = [self.lemmatizer.lemmatize(token) for token in tokens]
         return tokens
 
     def add_document(self, doc_id: str, new_doc: str) -> None:
@@ -69,11 +63,9 @@
         Adds a new document to the corpus and updates the BM25 index.
         """
         processed_tokens = self.preprocess(new_doc)
-
         self.documents.append(new_doc)
         self.doc_ids.append(doc_id)
         self.tokenized_docs.append(processed_tokens)
-        # Ensure update_bm25 is awaited if required in async context
         self.update_bm25()
         print(f"Added document ID: {doc_id}")
 
@@ -101,14 +93,12 @@
         else:
             print("No documents to initialize BM25.")
 
-
     def get_scores(self, query: str) -> List[float]:
         """
         Computes BM25 scores for all documents based on the given query.
         """
         processed_query = self.preprocess(query)
         print(f"Tokenized Query: {processed_query}")
-
         if self.bm25:
             return self.bm25.get_scores(processed_query)
         else:
@@ -123,9 +113,9 @@
         if self.bm25:
             return self.bm25.get_top_n(processed_query, self.documents, n)
         else:
-            print("initialized.")
+            print("BM25 is not initialized.")
             return []
-
+
     def clear_documents(self) -> None:
         """
         Clears all documents from the BM25 index.
@@ -133,18 +123,12 @@
         self.documents = []
         self.doc_ids = []
         self.tokenized_docs = []
-        self.bm25 = None  # Reset BM25 index
+        self.bm25 = None
         print("BM25 documents cleared and index reset.")
-
+
     def get_document(self, doc_id: str) -> str:
         """
         Retrieves a document by its document ID.
-
-        Parameters:
-        - doc_id (str): The ID of the document to retrieve.
-
-        Returns:
-        - str: The document text if found, otherwise an empty string.
        """
         try:
             index = self.doc_ids.index(doc_id)
@@ -153,13 +137,9 @@
             print(f"Document ID {doc_id} not found.")
             return ""
 
-
 async def initialize_bm25_search(remove_stopwords: bool = True, perform_lemmatization: bool = False) -> BM25_search:
     """
-    Initializes the BM25search with proper NLTK resource downloading.
+    Initializes the BM25search.
     """
-    loop = asyncio.get_running_loop()
-    await loop.run_in_executor(None, download_nltk_resources)
+    # Removed NLTK resource download from async context
     return BM25_search(remove_stopwords, perform_lemmatization)
-
-
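
For reference, a minimal usage sketch of the class as it stands after this commit. This is illustrative only: it assumes rank_bm25 is installed and the file is importable as app.search.bm25_search; the sample documents and query are made up.

import asyncio

from app.search.bm25_search import initialize_bm25_search

async def main() -> None:
    # Construction no longer downloads NLTK data; nothing blocks the event loop.
    bm25 = await initialize_bm25_search()

    # preprocess() now lowercases, strips punctuation, and splits on whitespace.
    print(bm25.preprocess("The Quick, Brown Fox!"))  # ['the', 'quick', 'brown', 'fox']

    # add_document() re-indexes the corpus via update_bm25() after each insert.
    bm25.add_document("doc1", "BM25 ranks documents by term frequency and length.")
    bm25.add_document("doc2", "Whitespace tokenization replaces NLTK here.")

    print(bm25.get_scores("term frequency"))  # doc1 should outscore doc2
    print(bm25.get_document("doc1"))

asyncio.run(main())

Note that remove_stopwords and perform_lemmatization are still accepted but now have no effect, since the NLTK-backed filtering they controlled is commented out.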