Germano Cavalcante committed
Commit: 1b8973e
Parent(s): af4d94e
Wiki Search: Updates

- Remove wiki
- Deduplicate code
- Rename docs to dev_docs
routers/embedding/__init__.py CHANGED

```diff
@@ -1,10 +1,12 @@
 # routers/embedding/__init__.py
 
 import os
+import re
 import sys
 import threading
 import torch
 from sentence_transformers import SentenceTransformer, util
+from typing import Dict, List, Tuple, Set, LiteralString
 
 
 class EmbeddingContext:
@@ -111,4 +113,114 @@ class EmbeddingContext:
         return tokens
 
 
+class SplitDocs:
+    def split_in_topics(self,
+                        filedir: LiteralString = None,
+                        *,
+                        pattern_filename=r'(?<!navigation)\.(md|rst)',
+                        pattern_content_sub=r'---\nhide:[\s\S]+?---\s*',
+                        patterns_titles=(
+                            r'^# (.+)', r'^## (.+)', r'^### (.+)'),
+                        ) -> List[Tuple[str, str]]:
+        def matches_pattern(filename):
+            return re.search(pattern_filename, filename) is not None
+
+        def split_patterns_recursive(patterns, text, index=-1):
+            sections = re.split(patterns[0], text, flags=re.MULTILINE)
+            for i, section in enumerate(sections):
+                if not section.strip():
+                    continue
+                is_match = bool(i & 1)
+                if is_match:
+                    yield (index, section)
+                elif len(patterns) > 1:
+                    for j, section_j in split_patterns_recursive(patterns[1:], section, index + 1):
+                        yield (j, section_j)
+                else:
+                    yield (-1, section)
+
+        for root, _, files in os.walk(filedir):
+            for name in files:
+                if not matches_pattern(name):
+                    continue
+
+                full_path = os.path.join(root, name)
+                with open(full_path, 'r', encoding='utf-8') as file:
+                    content = file.read()
+
+                if pattern_content_sub:
+                    content = re.sub(pattern_content_sub, '', content)
+
+                rel_path = full_path.replace(filedir, '').replace('\\', '/')
+
+                # Protect code parts
+                patterns = (r'(```[\s\S]+?```)', *patterns_titles)
+
+                last_titles = []
+                last_titles_index = []
+                content_accum = ''
+                for i, section in split_patterns_recursive(patterns, content):
+                    if i < 0:
+                        content_accum += section
+                        continue
+                    if content_accum:
+                        yield rel_path, last_titles, content_accum
+                        content_accum = ''
+                    if not last_titles_index or i > last_titles_index[-1]:
+                        last_titles_index.append(i)
+                        last_titles.append(section)
+                        continue
+                    while len(last_titles_index) > 1 and i < last_titles_index[-1]:
+                        last_titles_index.pop()
+                        last_titles.pop()
+                    # Replace
+                    last_titles_index[-1] = i
+                    last_titles[-1] = section
+                if content_accum or i != -1:
+                    yield rel_path, last_titles, content_accum
+
+    def reduce_text(_self, text):
+        text = re.sub(r'^\n+', '', text)  # Strip
+        text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
+        text = re.sub(r':\S*: ', '', text)  # Remove [:...:] patterns
+        text = re.sub(r'\s*\n+', '\n', text)
+        return text
+
+    def embedding_header(_self, rel_path, titles):
+        return f"{rel_path}\n# {' | '.join(titles)}\n\n"
+
+    def split_for_embedding(self,
+                            filedir: LiteralString = None,
+                            *,
+                            pattern_filename=r'(?<!navigation)\.(md|rst)',
+                            pattern_content_sub=r'---\nhide:[\s\S]+?---\s*',
+                            patterns_titles=(
+                                r'^# (.+)', r'^## (.+)', r'^### (.+)'),
+                            ):
+        tokenizer = EMBEDDING_CTX.model.tokenizer
+        max_tokens = EMBEDDING_CTX.model.max_seq_length
+        texts = []
+
+        for rel_path, titles, content in self.split_in_topics(
+                filedir, pattern_filename=pattern_filename, pattern_content_sub=pattern_content_sub, patterns_titles=patterns_titles):
+            header = self.embedding_header(rel_path, titles)
+            tokens_pre_len = len(tokenizer.tokenize(header))
+            tokens_so_far = tokens_pre_len
+            text_so_far = header
+            for part in self.reduce_text(content).splitlines():
+                part += '\n'
+                part_tokens_len = len(tokenizer.tokenize(part))
+                if tokens_so_far + part_tokens_len > max_tokens:
+                    texts.append(text_so_far)
+                    text_so_far = header
+                    tokens_so_far = tokens_pre_len
+                text_so_far += part
+                tokens_so_far += part_tokens_len
+
+            if tokens_so_far != tokens_pre_len:
+                texts.append(text_so_far)
+
+        return texts
+
+
 EMBEDDING_CTX = EmbeddingContext()
```
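The "Deduplicate code" point of this commit lands here: the topic-splitting and token-budgeted chunking that previously lived in `routers/tool_wiki_search.py` now sits in the reusable `SplitDocs` base class. A minimal usage sketch follows, assuming the fallback import style used by `tool_wiki_search.py`; the class name `SplitChangelog` and the directory path are illustrative, not part of the commit:

```python
from embedding import SplitDocs, EMBEDDING_CTX  # matches tool_wiki_search.py's fallback import

# Hypothetical subclass: only reduce_text is overridden, mirroring how
# SplitManual and SplitBlenderDocs specialize SplitDocs in this commit.
class SplitChangelog(SplitDocs):
    def reduce_text(_self, text):
        text = super().reduce_text(text)
        return text.replace('.md', '.html')  # illustrative post-processing

# split_in_topics() yields (rel_path, titles, content) per section;
# split_for_embedding() packs those sections into chunks that fit the
# embedding model's max_seq_length, each prefixed by a path/title header.
texts = SplitChangelog().split_for_embedding("/path/to/docs")  # path is an assumption
embeddings = EMBEDDING_CTX.encode(texts)
```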
routers/embedding/{embeddings_manual_wiki.pkl → embeddings_dev_docs.pkl} RENAMED

```diff
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:e94dbc62cda6258367836eaec82dfda7f35183b1debdc980541e2ceb22d52637
+size 15328541
```
routers/embedding/embeddings_manual.pkl ADDED

```diff
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b4f9759847d9fb0948eb2f550a3e5512b7a5bbdf8fd70118bb75fe670395ee7c
+size 22522382
```
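These LFS pointers line up with the cache template `_Data` gains in `routers/tool_wiki_search.py` below (`cache_path = "routers/embedding/embeddings_{}.pkl"`): one pickle per `Group` member. A small sketch of that lookup, assuming the working directory is the repository root:

```python
import os
import pickle

cache_path = "routers/embedding/embeddings_{}.pkl"
for name in ("dev_docs", "manual"):      # Group member names in this commit
    path = cache_path.format(name)       # embeddings_dev_docs.pkl, embeddings_manual.pkl
    if os.path.exists(path):
        with open(path, 'rb') as file:
            data = pickle.load(file)     # dict holding 'texts' and 'embeddings'
```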
routers/tool_wiki_search.py CHANGED

```diff
@@ -6,133 +6,43 @@ import pickle
 import re
 import torch
 from enum import Enum
-from
-from sentence_transformers import util
-from fastapi import APIRouter
+from fastapi import APIRouter, Query
 from fastapi.responses import PlainTextResponse
+from heapq import nlargest
+from sentence_transformers import util
+from typing import Dict, List, Tuple, Set, LiteralString
 
 try:
-    from .embedding import EMBEDDING_CTX
+    from .embedding import SplitDocs, EMBEDDING_CTX
     from .utils_gitea import gitea_wiki_page_get, gitea_wiki_pages_get
 except:
-    from embedding import EMBEDDING_CTX
+    from embedding import SplitDocs, EMBEDDING_CTX
     from utils_gitea import gitea_wiki_page_get, gitea_wiki_pages_get
 
-
+
+MANUAL_DIR = "D:/BlenderDev/blender-manual/manual"
 DOCS_DIR = "D:/BlenderDev/blender-developer-docs/docs"
 
 
 class Group(str, Enum):
-
-    wiki = "wiki"
+    dev_docs = "dev_docs"
+    # wiki = "wiki"
     manual = "manual"
-    all = "all"
-
-
-class Split:
-    filedir = None
-    filetype = '.md'
-
-    def __init__(self, filedir=None, filetype='.md'):
-        self.filedir = filedir
-        self.filetype = filetype
-
-    def split_in_topics(self) -> List[Tuple[str, str]]:
-        for root, _dirs, files in os.walk(self.filedir):
-            for name in files:
-                if not name.endswith(self.filetype) or name == 'navigation.md':
-                    continue
-
-                full_path = os.path.join(root, name)
-                with open(full_path, 'r', encoding='utf-8') as file:
-                    content = file.read()
-
-                prefix = full_path.replace(self.filedir, '')
-                prefix = re.sub(r'(index)?.md', '', prefix)
-                prefix = prefix.replace('\\', '/')
-
-                # Protect code parts
-                parts = ['']
-                is_first = True
-                is_in_code_block = False
-                for line in content.splitlines():
-                    if not line:
-                        continue
-                    line += '\n'
-                    is_in_code_block = is_in_code_block != line.strip().startswith('```')
-                    if not is_in_code_block and line.startswith('## '):
-                        if not is_first:
-                            parts.append(line)
-                            continue
-
-                    is_first = False
-                    parts[-1] += line
-
-                title_main = ''
-                for topic in parts:
-                    topic = topic.strip()
-                    if not topic or topic.startswith('---\nhide'):
-                        continue
-
-                    try:
-                        title, body = topic.split('\n', 1)
-                    except ValueError:
-                        # ignore non content
-                        continue
-
-                    if not title_main:
-                        title_main = title
-                    else:
-                        title = title_main + ' | ' + title
-
-                    yield (prefix + '\n' + title, body)
-
-    def reduce_text(_self, text):
-        text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
-        text = re.sub(r':\S*: ', '', text)  # Remove [:...:] patterns
-        text = re.sub(r'(index)?.md', '', text)  # Remove .md
-        return re.sub(r'(\s*\n\s*)+', '\n', text)
-
-    def split_for_embedding(self):
-        tokenizer = EMBEDDING_CTX.model.tokenizer
-        max_tokens = EMBEDDING_CTX.model.max_seq_length
-        texts = []
-
-        for prefix, content in self.split_in_topics():
-            prefix += '\n\n'
-            tokens_prefix_len = len(tokenizer.tokenize(prefix))
-            tokens_so_far = tokens_prefix_len
-            text_so_far = prefix
-            for part in self.reduce_text(content).splitlines():
-                part += '\n'
-                part_tokens_len = len(tokenizer.tokenize(part))
-                if tokens_so_far + part_tokens_len > max_tokens:
-                    texts.append(text_so_far)
-                    text_so_far = prefix
-                    tokens_so_far = tokens_prefix_len
-                text_so_far += part
-                tokens_so_far += part_tokens_len
-
-            if tokens_so_far != tokens_prefix_len:
-                texts.append(text_so_far)
-
-        return texts
 
 
 class _Data(dict):
-    cache_path = "routers/embedding/
+    cache_path = "routers/embedding/embeddings_{}.pkl"
 
     def __init__(self):
-
-
-
-
-
-
-        # Generate
+        for grp in list(Group):
+            cache_path = self.cache_path.format(grp.name)
+            if os.path.exists(cache_path):
+                with open(cache_path, 'rb') as file:
+                    self[grp.name] = pickle.load(file)
+                continue
 
-
-
+            # Generate
+            print("Embedding Texts for", grp.name)
             self[grp.name] = {}
 
             # Create a list to store the text files
@@ -146,166 +56,115 @@ class _Data(dict):
             self[grp]['texts'] = texts
             self[grp]['embeddings'] = EMBEDDING_CTX.encode(texts)
 
-
-
-
-
-
-            pickle.dump(dict(self), file, protocol=pickle.HIGHEST_PROTOCOL)
-
-    @classmethod
-    def parse_file_recursive(cls, filepath):
-        with open(filepath, 'r', encoding='utf-8') as file:
-            content = file.read()
-
-        parsed_data = {}
-
-        if filepath.endswith('index.rst'):
-            filedir = os.path.dirname(filepath)
-            parts = content.split(".. toctree::")
-            if len(parts) > 1:
-                parsed_data["toctree"] = {}
-                for part in parts[1:]:
-                    toctree_entries = part.splitlines()[1:]
-                    for entry in toctree_entries:
-                        entry = entry.strip()
-                        if not entry:
-                            continue
-
-                        if entry.startswith('/'):
-                            # relative path.
-                            continue
-
-                        if not entry.endswith('.rst'):
-                            continue
+            with open(cache_path, "wb") as file:
+                # Converting the embeddings to be CPU compatible, as the virtual machine in use currently only supports the CPU.
+                self[grp]['embeddings'] = self[grp]['embeddings'].to(
+                    torch.device('cpu'))
 
-
-                        filepath_iter = os.path.join(filedir, entry)
-                        parsed_data['toctree'][entry_name] = cls.parse_file_recursive(
-                            filepath_iter)
-
-        parsed_data['body'] = content
-
-        return parsed_data
+                pickle.dump(self[grp], file, protocol=pickle.HIGHEST_PROTOCOL)
 
     @classmethod
     def manual_get_texts_to_embed(cls):
-        class SplitManual(
-            def split_in_topics(_self):
-                def get_topics_recursive(page, path='/index.html'):
-                    # Remove patterns ".. word::" and ":word:"
-                    text = re.sub(
-                        r'\.\. [^\n]+\n+(?: {3,}[^\n]*\n)*|:\w+:', '', page['body'])
-
-                    # Regular expression to find titles and subtitles
-                    pattern = r'([\*|#|%]{3,}\n[^\n]+\n[\*|#|%]{3,}|(?:={3,}\n)?[^\n]+\n={3,}\n)'
-
-                    # Split text by found patterns
-                    sections = re.split(pattern, text)
-
-                    # Remove possible white spaces at the beginning and end of each section
-                    sections = [
-                        section for section in sections if section.strip()]
-
-                    # Separate sections into a dictionary
-                    topics = []
-                    current_title = ''
-                    current_topic = path
-
-                    for section in sections:
-                        if match := re.match(r'[\*|#|%]{3,}\n([^\n]+)\n[\*|#|%]{3,}', section):
-                            current_topic = current_title = f'{path}\n# {match.group(1)}:'
-                        elif match := re.match(r'(?:={3,}\n)?([^\n]+)\n={3,}\n', section):
-                            current_topic = f'{current_title} | {match.group(1)}'
-                        else:
-                            if current_topic == path:
-                                raise
-                            topics.append((current_topic, section))
-
-                    try:
-                        for key in page['toctree'].keys():
-                            page_child = page['toctree'][key]
-                            topics.extend(get_topics_recursive(
-                                page_child, path.replace('index', key)))
-                    except KeyError:
-                        pass
-
-                    return topics
-
-                manual = cls.parse_file_recursive(
-                    os.path.join(MANUAL_DIR, 'index.rst'))
-                manual['toctree']["copyright"] = cls.parse_file_recursive(
-                    os.path.join(MANUAL_DIR, 'copyright.rst'))
-
-                return get_topics_recursive(manual)
-
+        class SplitManual(SplitDocs):
             def reduce_text(_self, text):
                 # Remove repeated characters
-                text = re.sub(r'%{2,}', '', text)  # Title
-                text = re.sub(r'#{2,}', '', text)  # Title
-                text = re.sub(r'\*{3,}', '', text)  # Title
-                text = re.sub(r'={3,}', '', text)  # Topic
                 text = re.sub(r'\^{3,}', '', text)
                 text = re.sub(r'-{3,}', '', text)
 
-                text =
+                text = text.replace('.rst', '.html')
+                text = super().reduce_text(text)
                 return text
 
-
+            def embedding_header(self, rel_path, titles):
+                rel_path = rel_path.replace('.rst', '.html')
+                return super().embedding_header(rel_path, titles)
+
+        # Remove patterns ".. word::" and ":word:"
+        pattern_content_sub = r'\.\. [^\n]+\n+(?: {3,}[^\n]*\n)*|:\w+:'
+        patterns_titles = (
+            r'[\*#%]{3,}\n\s*(.+)\n[\*#%]{3,}', r'(?:[=+]{3,}\n)?\s*(.+)\n[=+]{3,}\n')
+
+        return SplitManual().split_for_embedding(
+            MANUAL_DIR,
+            pattern_content_sub=pattern_content_sub,
+            patterns_titles=patterns_titles,
+        )
 
     @staticmethod
     def wiki_get_texts_to_embed():
-        class SplitWiki(
-            def split_in_topics(_self
+        class SplitWiki(SplitDocs):
+            def split_in_topics(_self,
+                                filedir: LiteralString = None,
+                                *,
+                                pattern_filename=None,
+                                pattern_content_sub=None,
+                                patterns_titles=None):
                 owner = "blender"
                 repo = "blender"
                 pages = gitea_wiki_pages_get(owner, repo)
                 for page_name in pages:
                     page_name_title = page_name["title"]
                    page = gitea_wiki_page_get(owner, repo, page_name_title)
-
+                    rel_dir = f'/{owner}/{repo}/{page["sub_url"]}'
+                    titles = [page_name_title]
                     text = base64.b64decode(
                         page["content_base64"]).decode('utf-8')
-                    yield (
+                    yield (rel_dir, titles, text)
 
             def reduce_text(_self, text):
-
+                text = super().reduce_text(text)
+                text = text.replace('https://projects.blender.org', '')
+                return text
 
         return SplitWiki().split_for_embedding()
 
     @staticmethod
     def docs_get_texts_to_embed():
-
-
-
-
+        class SplitBlenderDocs(SplitDocs):
+            def reduce_text(_self, text):
+                text = super().reduce_text(text)
+                # Remove .md or index.md
+                text = re.sub(r'(index)?.md', '', text)
+                return text
 
+            def embedding_header(_self, rel_path, titles):
+                rel_path = re.sub(r'(index)?.md', '', rel_path)
+                return super().embedding_header(rel_path, titles)
+
+        return SplitBlenderDocs().split_for_embedding(DOCS_DIR)
+
+    def _sort_similarity(
+            self,
+            text_to_search: str,
+            groups: Set[Group] = Query(
+                default={Group.dev_docs, Group.manual}),
+            limit: int = 5) -> List[str]:
+        base_url: Dict[Group, str] = {
+            Group.dev_docs: "https://developer.blender.org/docs",
+            # Group.wiki: "https://projects.blender.org",
+            Group.manual: "https://docs.blender.org/manual/en/dev"
+        }
         query_emb = EMBEDDING_CTX.encode([text_to_search])
-
-        ret = {}
-
+        results: List[Tuple[float, str, Group]] = []
        for grp in groups:
-            if not
+            if grp not in self:
                 continue
 
-
+            search_results = util.semantic_search(
                 query_emb, self[grp]['embeddings'], top_k=limit, score_function=util.dot_score)
 
-
-
-
-
-
-
-
+            for score in search_results[0]:
+                corpus_id = score['corpus_id']
+                text = self[grp]['texts'][corpus_id]
+                results.append((score['score'], text, grp))
+
+        # Keep only the top `limit` results
+        top_results = nlargest(limit, results, key=lambda x: x[0])
 
-            texts
-            for
-                corpus_id = score['corpus_id']
-                text = texts[corpus_id]
-                result.append(text)
+        # Extract sorted texts with base URL
+        sorted_texts = [base_url[grp] + text for _, text, grp in top_results]
 
-        return
+        return sorted_texts
 
 
 G_data = _Data()
@@ -314,23 +173,12 @@ router = APIRouter()
 
 
 @router.get("/wiki_search", response_class=PlainTextResponse)
-def wiki_search(
-
-
-
-
-
-
-    if group is Group.all:
-        groups = {Group.docs, Group.wiki, Group.manual}
-    elif group is Group.wiki:
-        groups = {Group.docs, Group.wiki}
-    else:
-        groups = {group}
-
-    texts, group_best = G_data._sort_similarity(query, groups)
-
-    result = f'BASE_URL: {base_url[group_best]}\n'
+def wiki_search(
+        query: str = "",
+        groups: Set[Group] = Query(default={Group.dev_docs, Group.manual})
+) -> str:
+    texts = G_data._sort_similarity(query, groups)
+    result: str = ''
     for text in texts:
         result += f'\n---\n{text}'
     return result
@@ -339,5 +187,5 @@ def wiki_search(query: str = "", group: Group = Group.all) -> str:
 if __name__ == '__main__':
     tests = ["Set Snap Base", "Building the Manual",
              "Bisect Object", "Who are the Triagers", "4.3 Release Notes Motion Paths"]
-    result = wiki_search(tests[
+    result = wiki_search(tests[0], {Group.dev_docs, Group.manual})
     print(result)
```
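With the signature change, the single `group: Group = Group.all` parameter becomes a repeatable `groups` query parameter, so clients pick sources explicitly. A hedged example of calling the endpoint follows; the host and any router prefix are assumptions, while the `/wiki_search` path and the `query`/`groups` parameter names come from the diff above:

```python
import requests

# Base URL is an assumption; point it at wherever this FastAPI app is served.
resp = requests.get(
    "http://localhost:8000/wiki_search",
    params=[("query", "Set Snap Base"),
            ("groups", "dev_docs"),   # Set[Group] query params repeat once per value
            ("groups", "manual")],
)
print(resp.text)  # plain-text response; results are separated by '\n---\n'
```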