lihuigu committed · Commit a6a5155 · Parent(s): b6336ac

[feat] add example & singleton
configs/datasets.yaml CHANGED
@@ -4,7 +4,6 @@ DEFAULT:
   log_level: "DEBUG"
   log_dir: ./log
   embedding: ./assets/model/sentence-transformers/all-MiniLM-L6-v2
-  device: "cpu" # "cpu"
 
 ARTICLE:
   summarizing_prompt: ./assets/prompt/summarizing.xml

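Note: the `device` key removed here is superseded by runtime detection in the Python changes below; every former `config.DEFAULT.device` call site now derives the device the same way:

    import torch

    # device is now chosen at runtime instead of being read from the config
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
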
src/pages/button_interface.py CHANGED
@@ -7,6 +7,7 @@ from generator import IdeaGenerator
 class Backend(object):
     def __init__(self) -> None:
         CONFIG_PATH = "./configs/datasets.yaml"
+        EXAMPLE_PATH = "./assets/data/example.json"
         RETRIEVER_NAME = "SNKG"
         USE_INSPIRATION = True
         BRAINSTORM_MODE = "mode_c"
@@ -22,6 +23,16 @@ class Backend(object):
         self.idea_generator = IdeaGenerator(self.config, None)
         self.use_inspiration = USE_INSPIRATION
         self.brainstorm_mode = BRAINSTORM_MODE
+        self.examples = self.load_examples(EXAMPLE_PATH)
+
+    def load_examples(self, path):
+        try:
+            with open(path, "r") as f:
+                data = json.load(f)
+            return data
+        except (FileNotFoundError, json.JSONDecodeError) as e:
+            print(f"Error loading examples from {path}: {e}")
+            return []
 
     def background2brainstorm_callback(self, background, json_strs=None):
         if json_strs is not None: # only for DEBUG_MODE
@@ -99,12 +110,16 @@ class Backend(object):
         return final_ideas
 
     def get_demo_i(self, i):
-        return ("The application scope of large-scale language models such as GPT-4 and LLaMA "
-                "has rapidly expanded, demonstrating powerful capabilities in natural language processing "
-                "and multimodal tasks. However, as the size and complexity of the models increase, understanding "
-                "how they make decisions becomes increasingly difficult. Challenge: 1 The complexity of model "
-                "interpretation: The billions of parameters and nonlinear decision paths within large-scale language "
-                "models make it very difficult to track and interpret specific outputs. The existing interpretation "
-                "methods usually only provide a local perspective and are difficult to systematize. 2. Transparency "
-                "and Fairness: In specific scenarios, models may exhibit biased or discriminatory behavior. Ensuring "
-                "the transparency of these models, reducing bias, and providing credible explanations is one of the current challenges.")
+        if 0 <= i < len(self.examples):
+            return self.examples[i].get("background", "Background not found.")
+        else:
+            return "Example not found. Please select a valid index."
+        # return ("The application scope of large-scale language models such as GPT-4 and LLaMA "
+        # "has rapidly expanded, demonstrating powerful capabilities in natural language processing "
+        # "and multimodal tasks. However, as the size and complexity of the models increase, understanding "
+        # "how they make decisions becomes increasingly difficult. Challenge: 1 The complexity of model "
+        # "interpretation: The billions of parameters and nonlinear decision paths within large-scale language "
+        # "models make it very difficult to track and interpret specific outputs. The existing interpretation "
+        # "methods usually only provide a local perspective and are difficult to systematize. 2. Transparency "
+        # "and Fairness: In specific scenarios, models may exhibit biased or discriminatory behavior. Ensuring "
+        # "the transparency of these models, reducing bias, and providing credible explanations is one of the current challenges.")

src/pages/one_click_generation.py CHANGED
@@ -74,9 +74,11 @@ def genrate_mainpage(backend):
         st.session_state["use_demo_input"] = True
         st.session_state["demo_input"] = demo_input
 
-    cols = st.columns([2, 2])
-    cols[0].button("Example 1", on_click=get_demo_n, args=(1,), use_container_width=True, disabled=not st.session_state.get("enable_submmit", True))
-    cols[1].button("Example 2", on_click=get_demo_n, args=(2,), use_container_width=True, disabled=not st.session_state.get("enable_submmit", True))
+    cols = st.columns([1, 1, 1, 1])
+    cols[0].button("Example 1", on_click=get_demo_n, args=(0,), use_container_width=True, disabled=not st.session_state.get("enable_submmit", True))
+    cols[1].button("Example 2", on_click=get_demo_n, args=(1,), use_container_width=True, disabled=not st.session_state.get("enable_submmit", True))
+    cols[2].button("Example 3", on_click=get_demo_n, args=(2,), use_container_width=True, disabled=not st.session_state.get("enable_submmit", True))
+    cols[3].button("Example 4", on_click=get_demo_n, args=(3,), use_container_width=True, disabled=not st.session_state.get("enable_submmit", True))
 
 def check_intermediate_outputs(id="brainstorms"):
     msg = st.session_state["intermediate_output"].get(id, None)

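The buttons now pass zero-based indices (0–3), matching the list indexing in `Backend.get_demo_i`. The four near-identical calls could also be generated in a loop, mirroring the pattern `step_by_step_generation.py` already uses; a possible refactor, not part of this commit:

    cols = st.columns(4)
    for i, col in enumerate(cols):
        # label is 1-based for the UI, index is 0-based for the callback
        col.button(f"Example {i + 1}", on_click=get_demo_n, args=(i,),
                   use_container_width=True,
                   disabled=not st.session_state.get("enable_submmit", True))
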
src/pages/step_by_step_generation.py CHANGED
@@ -56,7 +56,7 @@ def genrate_mainpage(backend):
     background = st.session_state.get("background", "")
     background = st.text_area("Input your field background", background, placeholder="Input your field background", height=200, label_visibility="collapsed")
 
-    cols = st.columns(2)
+    cols = st.columns(4)
     def click_demo_i(i):
         st.session_state["background"] = backend.get_demo_i(i)
     for i, col in enumerate(cols):

src/paper_manager.py CHANGED
@@ -1,11 +1,11 @@
 import os
 import json
 import re
-from sentence_transformers import SentenceTransformer
 from tqdm import tqdm
+import torch
 from utils.paper_crawling import PaperCrawling
 from utils.paper_client import PaperClient
-from utils.hash import generate_hash_id
+from utils.hash import generate_hash_id, get_embedding_model
 from collections import defaultdict
 from utils.header import get_dir, ConfigReader
 from utils.llms_api import APIHelper
@@ -165,9 +165,8 @@ class PaperManager:
         self.data_type = "train"
         self.paper_client = PaperClient(config)
         self.paper_crawling = PaperCrawling(config, data_type=self.data_type)
-        self.embedding_model = SentenceTransformer(
-            model_name_or_path=get_dir(config.DEFAULT.embedding), device=self.config.DEFAULT.device
-        )
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        self.embedding_model = get_embedding_model(config)
         self.api_helper = APIHelper(config)
         self.retriever = Retriever(config)
         self.paper_id_map = defaultdict()

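Note: this commit makes `PaperClient.__init__` zero-argument (see `src/utils/paper_client.py` below), but the unchanged context line `self.paper_client = PaperClient(config)` above still passes `config`, which would raise `TypeError` under the new signature; presumably the call site should read:

    self.paper_client = PaperClient()  # singleton; no longer takes config
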
src/utils/hash.py CHANGED
@@ -1,18 +1,23 @@
 import re
 import os
 import hashlib
+import torch
 import struct
 from collections import Counter
 from huggingface_hub import hf_hub_download
+from sentence_transformers import SentenceTransformer
+from .header import get_dir
 
 ENV_CHECKED = False
 EMBEDDING_CHECKED = False
 
+
 def check_embedding():
     global EMBEDDING_CHECKED
     if not EMBEDDING_CHECKED:
         # Define the repository and files to download
         repo_id = "sentence-transformers/all-MiniLM-L6-v2" # "BAAI/bge-small-en-v1.5"
+        local_dir = f"./assets/model/{repo_id}"
         files_to_download = [
             "config.json",
             "pytorch_model.bin",
@@ -21,10 +26,18 @@ def check_embedding():
         ]
         # Download each file and save it to the /model/bge directory
        for file_name in files_to_download:
-            print("Checking for file: ", file_name)
-            hf_hub_download(repo_id=repo_id, filename=file_name, local_dir=f"./assets/model/{repo_id}")
+            if not os.path.exists(os.path.join(local_dir, file_name)):
+                print(
+                    f"file: {file_name} not exist in {local_dir}, try to download from huggingface ..."
+                )
+                hf_hub_download(
+                    repo_id=repo_id,
+                    filename=file_name,
+                    local_dir=local_dir,
+                )
         EMBEDDING_CHECKED = True
 
+
 def check_env():
     global ENV_CHECKED
     if not ENV_CHECKED:
@@ -43,6 +56,22 @@ def check_env():
         ENV_CHECKED = True
 
 
+class EmbeddingModel:
+    _instance = None
+
+    def __new__(cls, config):
+        if cls._instance is None:
+            cls._instance = super(EmbeddingModel, cls).__new__(cls)
+            cls._instance.embedding_model = SentenceTransformer(
+                model_name_or_path=get_dir(config.DEFAULT.embedding),
+                device="cuda" if torch.cuda.is_available() else "cpu",
+            )
+        return cls._instance
+
+def get_embedding_model(config):
+    return EmbeddingModel(config).embedding_model
+
+
 def generate_hash_id(input_string):
     if input_string is None:
         return None

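With the `EmbeddingModel` singleton, the `SentenceTransformer` is loaded once per process and shared by `paper_manager.py` and `paper_retriever.py`; whichever `config` reaches the first call wins, and the `config` argument of later calls is silently ignored. None of the `__new__`-based singletons in this commit take a lock, so concurrent first calls from multiple threads could still race. A minimal usage sketch, assuming a loaded `config`:

    from utils.hash import get_embedding_model

    model_a = get_embedding_model(config)  # first call: loads the SentenceTransformer
    model_b = get_embedding_model(config)  # later calls: return the cached model
    assert model_a is model_b
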
src/utils/llms_api.py CHANGED
@@ -864,7 +864,7 @@ class APIHelper(object):
     def transfer_form(self, idea: str):
         prompt_template_transfer = """
         ### Task Description:
-        I will give you some ideas, please standardize the output format of the ideas without simplifying or modifying their specific content. Note that the content of each idea includes everything about the idea。
+        I will give you some ideas, please standardize the output format of the ideas without changing any characters in their content. Note that the content of each idea includes everything about the idea。
 
         ### Specific Information:
         I will provide you with specific information now, please use them according to the instructions above:

src/utils/paper_client.py CHANGED
@@ -1,6 +1,7 @@
 import os
 import re
 import json
+import torch
 from tqdm import tqdm
 from neo4j import GraphDatabase
 from collections import defaultdict, deque
@@ -8,18 +9,26 @@ from py2neo import Graph, Node, Relationship
 from loguru import logger
 
 class PaperClient:
-    def __init__(self, config) -> None:
-        self.config = config
-        self.driver = self.get_neo4j_driver()
-        self.teb_model = None
+    _instance = None
+    _initialized = False
+
+    def __new__(cls, *args, **kwargs):
+        if cls._instance is None:
+            cls._instance = super(PaperClient, cls).__new__(cls)
+        return cls._instance
+
+    def __init__(self) -> None:
+        if not self._initialized:
+            self.driver = self.get_neo4j_driver()
+            self.teb_model = None
+            self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+            PaperClient._initialized = True
 
     def get_neo4j_driver(self):
-        # configuration info
         URI = os.environ["NEO4J_URL"]
         NEO4J_USERNAME = os.environ["NEO4J_USERNAME"]
         NEO4J_PASSWD = os.environ["NEO4J_PASSWD"]
         AUTH = (NEO4J_USERNAME, NEO4J_PASSWD)
-        # connect to the Neo4j database
         driver = GraphDatabase.driver(URI, auth=AUTH)
         return driver
 
@@ -274,7 +283,7 @@ class PaperClient:
         results = session.execute_write(lambda tx: tx.run(query).data())
         contexts = [result["title"] + result["context"] for result in results]
         paper_ids = [result["hash_id"] for result in results]
-        context_embeddings = embedding_model.encode(contexts, batch_size=512, convert_to_tensor=True, device=self.config.DEFAULT.device)
+        context_embeddings = embedding_model.encode(contexts, batch_size=512, convert_to_tensor=True, device=self.device)
         query = """
             MERGE (p:Paper {hash_id: $hash_id})
             ON CREATE SET p.abstract_embedding = $embedding
@@ -304,7 +313,7 @@ class PaperClient:
         results = session.execute_write(lambda tx: tx.run(query).data())
         contexts = [result["context"] for result in results]
         paper_ids = [result["hash_id"] for result in results]
-        context_embeddings = embedding_model.encode(contexts, batch_size=256, convert_to_tensor=True, device=self.config.DEFAULT.device)
+        context_embeddings = embedding_model.encode(contexts, batch_size=256, convert_to_tensor=True, device=self.device)
         query = """
             MERGE (p:Paper {hash_id: $hash_id})
             ON CREATE SET p.embedding = $embedding
@@ -334,7 +343,7 @@ class PaperClient:
         results = session.execute_write(lambda tx: tx.run(query).data())
         contexts = [result["context"] for result in results]
         paper_ids = [result["hash_id"] for result in results]
-        context_embeddings = embedding_model.encode(contexts, batch_size=256, convert_to_tensor=True, device=self.config.DEFAULT.device)
+        context_embeddings = embedding_model.encode(contexts, batch_size=256, convert_to_tensor=True, device=self.device)
         query = """
             MERGE (p:Paper {hash_id: $hash_id})
             ON CREATE SET p.contribution_embedding = $embedding
@@ -365,7 +374,7 @@ class PaperClient:
         results = session.execute_write(lambda tx: tx.run(query).data())
         contexts = [result["context"] for result in results]
         paper_ids = [result["hash_id"] for result in results]
-        context_embeddings = embedding_model.encode(contexts, batch_size=256, convert_to_tensor=True, device=self.config.DEFAULT.device)
+        context_embeddings = embedding_model.encode(contexts, batch_size=256, convert_to_tensor=True, device=self.device)
         query = """
             MERGE (p:Paper {hash_id: $hash_id})
             ON CREATE SET p.summary_embedding = $embedding
@@ -528,13 +537,13 @@ class PaperClient:
         NEO4J_PASSWD = os.environ["NEO4J_PASSWD"]
         AUTH = (NEO4J_USERNAME, NEO4J_PASSWD)
         graph = Graph(URI, auth=AUTH)
+        # create a dict to hold the data
+        data = {"nodes": [], "relationships": []}
         query = """
         MATCH (e:Entity)-[r:RELATED_TO]->(p:Paper)
         RETURN p, e, r
         """
         results = graph.run(query)
-        # create a dict to hold the data
-        data = {"nodes": [], "relationships": []}
         # process the query results
         for record in tqdm(results):
             paper_node = record["p"]
@@ -622,9 +631,6 @@ class PaperClient:
 
 
 if __name__ == "__main__":
-    from header import get_dir, ConfigReader
-    config_path = get_dir("./configs/datasets.yaml")
-    config = ConfigReader.load(config_path)
-    paper_client = PaperClient(config)
+    paper_client = PaperClient()
     # paper_client.neo4j_backup()
     paper_client.neo4j_import_data()

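Because Python runs `__init__` on every `PaperClient()` call even when `__new__` returns the cached instance, the `_initialized` flag is what prevents the Neo4j driver from being recreated. A standalone illustration of the pattern used here (not code from the commit):

    class Cached:
        _instance = None
        _initialized = False

        def __new__(cls, *args, **kwargs):
            if cls._instance is None:
                cls._instance = super().__new__(cls)
            return cls._instance

        def __init__(self):
            # runs on every Cached() call, so the one-time setup must be guarded
            if not self._initialized:
                print("one-time setup")
                Cached._initialized = True

    a, b = Cached(), Cached()
    assert a is b  # same object; "one-time setup" is printed once
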
src/utils/paper_retriever.py CHANGED
@@ -2,8 +2,6 @@ import torch
 import itertools
 import threading
 import numpy as np
-from sentence_transformers import SentenceTransformer
-from sklearn.feature_extraction.text import CountVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
 from collections import Counter, defaultdict
 from loguru import logger
@@ -11,7 +9,7 @@ from abc import ABCMeta, abstractmethod
 from .paper_client import PaperClient
 from .paper_crawling import PaperCrawling
 from .llms_api import APIHelper
-from .header import get_dir
+from .hash import get_embedding_model
 
 
 class UnionFind:
@@ -51,18 +49,26 @@ def can_merge(uf, similarity_matrix, i, j, threshold):
 
 
 class CoCite:
-    def __init__(self, config) -> None:
-        self.paper_client = PaperClient(config)
-        citemap = self.paper_client.build_citemap()
-        self.comap = defaultdict(
-            lambda: defaultdict(int)
-        )
-        for paper_id, cited_id in citemap.items():
-            for id0, id1 in itertools.combinations(cited_id, 2):
-                # ensure comap[id0][id1] == comap[id1][id0]
-                self.comap[id0][id1] += 1
-                self.comap[id1][id0] += 1
-        logger.debug("init co-cite map success")
+    _instance = None
+    _initialized = False
+
+    def __new__(cls, *args, **kwargs):
+        if cls._instance is None:
+            cls._instance = super(CoCite, cls).__new__(cls)
+        return cls._instance
+
+    def __init__(self) -> None:
+        if not self._initialized:
+            self.paper_client = PaperClient()
+            citemap = self.paper_client.build_citemap()
+            self.comap = defaultdict(lambda: defaultdict(int))
+            for paper_id, cited_id in citemap.items():
+                for id0, id1 in itertools.combinations(cited_id, 2):
+                    # ensure comap[id0][id1] == comap[id1][id0]
+                    self.comap[id0][id1] += 1
+                    self.comap[id1][id0] += 1
+            logger.debug("init co-cite map success")
+            CoCite._initialized = True
 
     def get_cocite_ids(self, id_, k=1):
         sorted_items = sorted(self.comap[id_].items(), key=lambda x: x[1], reverse=True)
@@ -82,14 +88,12 @@ class Retriever(object):
         self.config = config
         self.use_cocite = use_cocite
         self.use_cluster_to_filter = use_cluster_to_filter
-        self.paper_client = PaperClient(config)
-        self.cocite = CoCite(config)
+        self.paper_client = PaperClient()
+        self.cocite = CoCite()
         self.api_helper = APIHelper(config=config)
-        self.embedding_model = SentenceTransformer(
-            model_name_or_path=get_dir(config.DEFAULT.embedding), device=self.config.DEFAULT.device
-        )
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        self.embedding_model = get_embedding_model(config)
         self.paper_crawling = PaperCrawling(config=config)
-        self.vectorizer = CountVectorizer()
 
     @abstractmethod
     def retrieve(self, bg, entities, use_evaluate):
@@ -192,7 +196,7 @@ class Retriever(object):
         entities = self.api_helper.generate_entity_list(context)
         logger.debug("get entity from context: {}".format(entities))
         origin_vector = self.embedding_model.encode(
-            context, convert_to_tensor=True, device=self.config.DEFAULT.device
+            context, convert_to_tensor=True, device=self.device
         ).unsqueeze(0)
         related_contexts = [
             self.paper_client.get_paper_attribute(paper_id, type_name)
@@ -200,7 +204,10 @@ class Retriever(object):
         ]
         if len(related_contexts) > 0:
             context_embeddings = self.embedding_model.encode(
-                related_contexts, batch_size=512, convert_to_tensor=True, device=self.config.DEFAULT.device
+                related_contexts,
+                batch_size=512,
+                convert_to_tensor=True,
+                device=self.device,
             )
             score_1 = torch.nn.functional.cosine_similarity(
                 origin_vector, context_embeddings
@@ -208,7 +215,7 @@ class Retriever(object):
             score_1 = score_1.cpu().numpy()
             if self.config.RETRIEVE.need_normalize:
                 score_1 = score_1 / np.max(score_1)
-            # score_2 not enable
+            # score_2 not enable
             # if self.config.RETRIEVE.beta != 0:
             score_sn_dict = dict(zip(related_paper_id_list, score_1))
             score_en_dict = dict(zip(related_paper_id_list, score_2))
@@ -231,28 +238,33 @@ class Retriever(object):
                 else list(score_dict.keys())
             )
             return paper_id_list
-        else:
+        else:
             # clustering filter, ensure that each category the highest score save first
             paper_id_list = list(score_dict.keys())
             paper_embedding_list = [
-                self.paper_client.get_paper_attribute(paper_id, "embedding") for paper_id in paper_id_list
+                self.paper_client.get_paper_attribute(paper_id, "embedding")
+                for paper_id in paper_id_list
             ]
             paper_embedding = np.array(paper_embedding_list)
             paper_embedding_list = [
-                self.paper_client.get_paper_attribute(paper_id, "contribution_embedding") for paper_id in paper_id_list
+                self.paper_client.get_paper_attribute(
+                    paper_id, "contribution_embedding"
+                )
+                for paper_id in paper_id_list
            ]
             paper_contribution_embedding = np.array(paper_embedding_list)
             paper_embedding_list = [
-                self.paper_client.get_paper_attribute(paper_id, "summary_embedding") for paper_id in paper_id_list
+                self.paper_client.get_paper_attribute(paper_id, "summary_embedding")
+                for paper_id in paper_id_list
             ]
             paper_summary_embedding = np.array(paper_embedding_list)
             weight_embedding = self.config.RETRIEVE.s_bg
             weight_contribution = self.config.RETRIEVE.s_contribution
             weight_summary = self.config.RETRIEVE.s_summary
             paper_embedding = (
-                weight_embedding * paper_embedding +
-                weight_contribution * paper_contribution_embedding +
-                weight_summary * paper_summary_embedding
+                weight_embedding * paper_embedding
+                + weight_contribution * paper_contribution_embedding
+                + weight_summary * paper_summary_embedding
             )
             similarity_matrix = np.dot(paper_embedding, paper_embedding.T)
             related_labels = self.cluster_algorithm(paper_id_list, similarity_matrix)
@@ -542,9 +554,7 @@ class SNRetriever(Retriever):
         related_paper_id_list = retrieve_result["paper"]
         retrieve_paper_num = len(related_paper_id_list)
         _, _, score_all_dict = self.cal_related_score(
-            bg,
-            related_paper_id_list=related_paper_id_list,
-            entities=entities
+            bg, related_paper_id_list=related_paper_id_list, entities=entities
         )
         top_k_matrix = {}
         recall = 0
@@ -746,4 +756,4 @@ class SNKGRetriever(Retriever):
             "retrieve_paper_num": retrieve_paper_num,
             "label_num": label_num,
         }
-        return result
+        return result

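For reference, the reformatted weighted-embedding fusion in the clustering filter is a plain linear combination followed by a dot-product similarity matrix; a small NumPy sketch with hypothetical weights and shapes (the real weights come from `config.RETRIEVE.s_bg`, `s_contribution`, and `s_summary`):

    import numpy as np

    w_bg, w_contribution, w_summary = 0.5, 0.3, 0.2  # hypothetical stand-ins
    E_bg = np.random.rand(4, 384)            # one 384-d embedding per paper
    E_contribution = np.random.rand(4, 384)
    E_summary = np.random.rand(4, 384)

    E = w_bg * E_bg + w_contribution * E_contribution + w_summary * E_summary
    similarity_matrix = E @ E.T              # pairwise scores, as in the hunk above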