quincyqiang committed on
Commit
3c24b5a
·
1 Parent(s): 0e24559

feature@添加websearch

Browse files
README.md CHANGED
@@ -2,12 +2,15 @@
2
 
3
  > Chinese-LangChain:中文langchain项目,基于ChatGLM-6b+langchain实现本地化知识库检索与智能答案生成
4
 
 
 
5
  ## 🔥 效果演示
6
 
7
  ![](https://github.com/yanqiangmiffy/Chinese-LangChain/blob/master/images/web_demo.png)
8
 
9
  ## 🚀 特性
10
 
 
11
  - 🚀 2023/04/18 webui增加知识库选择功能
12
  - 🚀 2023/04/18 修复推理预测超时5s报错问题
13
  - 🎉 2023/04/17 支持多种文档上传与内容解析:pdf、docx,ppt等
@@ -29,7 +32,7 @@
29
  * [x] 支持检索结果与LLM生成结果对比
30
  * [ ] 支持检索生成结果与原始LLM生成结果对比
31
  * [ ] 检索结果过滤与排序
32
- * [ ] 互联网检索结果接入
33
  * [ ] 模型初始化有问题
34
  * [ ] 增加非LangChain策略
35
 
 
2
 
3
  > Chinese-LangChain:中文langchain项目,基于ChatGLM-6b+langchain实现本地化知识库检索与智能答案生成
4
 
5
+ 俗称:小必应,Q.Talk,强聊,QiangTalk
6
+
7
  ## 🔥 效果演示
8
 
9
  ![](https://github.com/yanqiangmiffy/Chinese-LangChain/blob/master/images/web_demo.png)
10
 
11
  ## 🚀 特性
12
 
13
+ - 🚀 2023/04/19 增加web search功能,需要确保网络畅通!
14
  - 🚀 2023/04/18 webui增加知识库选择功能
15
  - 🚀 2023/04/18 修复推理预测超时5s报错问题
16
  - 🎉 2023/04/17 支持多种文档上传与内容解析:pdf、docx,ppt等
 
32
  * [x] 支持检索结果与LLM生成结果对比
33
  * [ ] 支持检索生成结果与原始LLM生成结果对比
34
  * [ ] 检索结果过滤与排序
35
+ * [x] 互联网检索结果接入
36
  * [ ] 模型初始化有问题
37
  * [ ] 增加非LangChain策略
38
 
clc/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (310 Bytes). View file
 
clc/__pycache__/gpt_service.cpython-310.pyc ADDED
Binary file (1.96 kB). View file
 
clc/__pycache__/langchain_application.cpython-310.pyc ADDED
Binary file (3.21 kB). View file
 
clc/__pycache__/source_service.cpython-310.pyc ADDED
Binary file (2.37 kB). View file
 
clc/langchain_application.py CHANGED
@@ -37,13 +37,24 @@ class LangChainApplication(object):
37
  history_len=5,
38
  temperature=0.1,
39
  top_p=0.9,
 
 
40
  chat_history=[]):
41
- prompt_template = """基于以下已知信息,简洁和专业的来回答用户的问题。
42
- 如果无法从中得到答案,请说 "根据已知信息无法回答该问题""没有提供足够的相关信息",不允许在答案中添加编造成分,答案请使用中文。
43
- 已知内容:
44
- {context}
45
- 问题:
46
- {question}"""
 
 
 
 
 
 
 
 
 
47
  prompt = PromptTemplate(template=prompt_template,
48
  input_variables=["context", "question"])
49
  self.llm_service.history = chat_history[-history_len:] if history_len > 0 else []
@@ -54,7 +65,7 @@ class LangChainApplication(object):
54
  knowledge_chain = RetrievalQA.from_llm(
55
  llm=self.llm_service,
56
  retriever=self.source_service.vector_store.as_retriever(
57
- search_kwargs={"k": 4}),
58
  prompt=prompt)
59
  knowledge_chain.combine_documents_chain.document_prompt = PromptTemplate(
60
  input_variables=["page_content"], template="{page_content}")
 
37
  history_len=5,
38
  temperature=0.1,
39
  top_p=0.9,
40
+ top_k=4,
41
+ web_content='',
42
  chat_history=[]):
43
+ if web_content:
44
+ prompt_template = f"""基于以下已知信息,简洁和专业的来回答用户的问题。
45
+ 如果无法从中得到答案,请说 "根据已知信息无法回答该问题" 或 "没有提供足够的相关信息",不允许在答案中添加编造成分,答案请使用中文。
46
+ 已知网络检索内容:{web_content}""" + """
47
+ 已知内容:
48
+ {context}
49
+ 问题:
50
+ {question}"""
51
+ else:
52
+ prompt_template = """基于以下已知信息,简洁和专业的来回答用户的问题。
53
+ 如果无法从中得到答案,请说 "根据已知信息无法回答该问题" 或 "没有提供足够的相关信息",不允许在答案中添加编造成分,答案请使用中文。
54
+ 已知内容:
55
+ {context}
56
+ 问题:
57
+ {question}"""
58
  prompt = PromptTemplate(template=prompt_template,
59
  input_variables=["context", "question"])
60
  self.llm_service.history = chat_history[-history_len:] if history_len > 0 else []
 
65
  knowledge_chain = RetrievalQA.from_llm(
66
  llm=self.llm_service,
67
  retriever=self.source_service.vector_store.as_retriever(
68
+ search_kwargs={"k": top_k}),
69
  prompt=prompt)
70
  knowledge_chain.combine_documents_chain.document_prompt = PromptTemplate(
71
  input_variables=["page_content"], template="{page_content}")
clc/source_service.py CHANGED
@@ -12,6 +12,8 @@
12
 
13
  import os
14
 
 
 
15
  from langchain.document_loaders import UnstructuredFileLoader
16
  from langchain.embeddings.huggingface import HuggingFaceEmbeddings
17
  from langchain.vectorstores import FAISS
@@ -53,6 +55,18 @@ class SourceService(object):
53
  self.vector_store = FAISS.load_local(path, self.embeddings)
54
  return self.vector_store
55
 
 
 
 
 
 
 
 
 
 
 
 
 
56
  # if __name__ == '__main__':
57
  # config = LangChainCFG()
58
  # source_service = SourceService(config)
 
12
 
13
  import os
14
 
15
+ from duckduckgo_search import ddg
16
+ from duckduckgo_search.utils import SESSION
17
  from langchain.document_loaders import UnstructuredFileLoader
18
  from langchain.embeddings.huggingface import HuggingFaceEmbeddings
19
  from langchain.vectorstores import FAISS
 
55
  self.vector_store = FAISS.load_local(path, self.embeddings)
56
  return self.vector_store
57
 
58
+ def search_web(self, query):
59
+
60
+ SESSION.proxies = {
61
+ "http": f"socks5h://localhost:7890",
62
+ "https": f"socks5h://localhost:7890"
63
+ }
64
+ results = ddg(query)
65
+ web_content = ''
66
+ if results:
67
+ for result in results:
68
+ web_content += result['body']
69
+ return web_content
70
  # if __name__ == '__main__':
71
  # config = LangChainCFG()
72
  # source_service = SourceService(config)
main.py CHANGED
@@ -5,19 +5,19 @@ import gradio as gr
5
 
6
  from clc.langchain_application import LangChainApplication
7
 
8
- os.environ["CUDA_VISIBLE_DEVICES"] = '1'
9
 
10
 
11
  # 修改成自己的配置!!!
12
  class LangChainCFG:
13
- llm_model_name = '../../pretrained_models/chatglm-6b-int4-qe' # 本地模型文件 or huggingface远程仓库
14
- embedding_model_name = '../../pretrained_models/text2vec-large-chinese' # 检索模型文件 or huggingface远程仓库
15
  vector_store_path = './cache'
16
  docs_path = './docs'
17
  kg_vector_stores = {
18
- '中文维基百科': '/root/GoMall/Knowledge-ChatGLM/cache/zh_wikipedia',
19
- '大规模金融研报知识图谱': '/root/GoMall/Knowledge-ChatGLM/cache/financial_research_reports',
20
- '初始化知识库': '/root/GoMall/Knowledge-ChatGLM/cache',
21
  } # 可以替换成自己的知识库,如果没有需要设置为None
22
  # kg_vector_stores=None
23
 
@@ -62,24 +62,35 @@ def clear_session():
62
  def predict(input,
63
  large_language_model,
64
  embedding_model,
 
 
65
  history=None):
66
  # print(large_language_model, embedding_model)
67
  print(input)
68
  if history == None:
69
  history = []
 
 
 
 
 
70
  resp = application.get_knowledge_based_answer(
71
  query=input,
72
  history_len=1,
73
  temperature=0.1,
74
  top_p=0.9,
 
 
75
  chat_history=history
76
  )
77
  history.append((input, resp['result']))
78
  search_text = ''
79
  for idx, source in enumerate(resp['source_documents'][:4]):
80
- sep = f'----------【搜索结果{idx+1}:】---------------\n'
81
  search_text += f'{sep}\n{source.page_content}\n\n'
82
  print(search_text)
 
 
83
  return '', history, history, search_text
84
 
85
 
@@ -108,20 +119,22 @@ with block as demo:
108
 
109
  top_k = gr.Slider(1,
110
  20,
111
- value=2,
112
  step=1,
113
- label="向量匹配 top k",
114
  interactive=True)
115
  kg_name = gr.Radio(['中文维基百科',
116
  '大规模金融研报知识图谱',
117
  '初始化知识库'
118
  ],
119
  label="知识库",
120
- value='中文维基百科',
121
  interactive=True)
122
  set_kg_btn = gr.Button("重新加载知识库")
123
 
124
- file = gr.File(label="将文件上传到数据库",
 
 
125
  visible=True,
126
  file_types=['.txt', '.md', '.docx', '.pdf']
127
  )
@@ -149,7 +162,9 @@ with block as demo:
149
  send.click(predict,
150
  inputs=[
151
  message, large_language_model,
152
- embedding_model, state
 
 
153
  ],
154
  outputs=[message, chatbot, state, search])
155
 
@@ -163,7 +178,8 @@ with block as demo:
163
  message.submit(predict,
164
  inputs=[
165
  message, large_language_model,
166
- embedding_model, state
 
167
  ],
168
  outputs=[message, chatbot, state, search])
169
  gr.Markdown("""提醒:<br>
 
5
 
6
  from clc.langchain_application import LangChainApplication
7
 
8
+ os.environ["CUDA_VISIBLE_DEVICES"] = '0'
9
 
10
 
11
  # 修改成自己的配置!!!
12
  class LangChainCFG:
13
+ llm_model_name = 'THUDM/chatglm-6b-int4-qe' # 本地模型文件 or huggingface远程仓库
14
+ embedding_model_name = 'GanymedeNil/text2vec-large-chinese' # 检索模型文件 or huggingface远程仓库
15
  vector_store_path = './cache'
16
  docs_path = './docs'
17
  kg_vector_stores = {
18
+ '中文维基百科': './cache/zh_wikipedia',
19
+ '大规模金融研报知识图谱': '.cache/financial_research_reports',
20
+ '初始化知识库': '.cache',
21
  } # 可以替换成自己的知识库,如果没有需要设置为None
22
  # kg_vector_stores=None
23
 
 
62
  def predict(input,
63
  large_language_model,
64
  embedding_model,
65
+ top_k,
66
+ use_web,
67
  history=None):
68
  # print(large_language_model, embedding_model)
69
  print(input)
70
  if history == None:
71
  history = []
72
+
73
+ if use_web == '使用':
74
+ web_content = application.source_service.search_web(query=input)
75
+ else:
76
+ web_content = ''
77
  resp = application.get_knowledge_based_answer(
78
  query=input,
79
  history_len=1,
80
  temperature=0.1,
81
  top_p=0.9,
82
+ top_k=top_k,
83
+ web_content=web_content,
84
  chat_history=history
85
  )
86
  history.append((input, resp['result']))
87
  search_text = ''
88
  for idx, source in enumerate(resp['source_documents'][:4]):
89
+ sep = f'----------【搜索结果{idx + 1}:】---------------\n'
90
  search_text += f'{sep}\n{source.page_content}\n\n'
91
  print(search_text)
92
+ search_text += "----------【网络检索内容】-----------\n"
93
+ search_text += web_content
94
  return '', history, history, search_text
95
 
96
 
 
119
 
120
  top_k = gr.Slider(1,
121
  20,
122
+ value=4,
123
  step=1,
124
+ label="检索top-k文档",
125
  interactive=True)
126
  kg_name = gr.Radio(['中文维基百科',
127
  '大规模金融研报知识图谱',
128
  '初始化知识库'
129
  ],
130
  label="知识库",
131
+ value='初始化知识库',
132
  interactive=True)
133
  set_kg_btn = gr.Button("重新加载知识库")
134
 
135
+ use_web = gr.Radio(["使用", "不使用"], label="web search", info="是否使用网络搜索,使用时确保网络通常")
136
+
137
+ file = gr.File(label="将文件上传到知识库库,内容要尽量匹配",
138
  visible=True,
139
  file_types=['.txt', '.md', '.docx', '.pdf']
140
  )
 
162
  send.click(predict,
163
  inputs=[
164
  message, large_language_model,
165
+ embedding_model, top_k, use_web,
166
+
167
+ state
168
  ],
169
  outputs=[message, chatbot, state, search])
170
 
 
178
  message.submit(predict,
179
  inputs=[
180
  message, large_language_model,
181
+ embedding_model, top_k, use_web,
182
+ state
183
  ],
184
  outputs=[message, chatbot, state, search])
185
  gr.Markdown("""提醒:<br>
requirements.txt CHANGED
@@ -1,153 +1,7 @@
1
- aiofiles==23.1.0
2
- aiohttp==3.8.4
3
- aiosignal==1.3.1
4
- altair==4.2.2
5
- antlr4-python3-runtime==4.9.3
6
- anyio==3.6.2
7
- argilla==1.6.0
8
- async-timeout==4.0.2
9
- attrs==23.1.0
10
- backoff==2.2.1
11
- beautifulsoup4==4.12.2
12
- brotlipy==0.7.0
13
- cachetools==5.3.0
14
- cchardet==2.1.7
15
- certifi
16
- cffi
17
- chardet==5.1.0
18
- charset-normalizer==3.1.0
19
- click==8.1.3
20
- coloredlogs==15.0.1
21
- commonmark==0.9.1
22
- contourpy==1.0.7
23
- cpm-kernels==1.0.11
24
- cryptography
25
- cycler==0.11.0
26
- dataclasses-json==0.5.7
27
- Deprecated==1.2.13
28
- effdet==0.3.0
29
- entrypoints==0.4
30
- et-xmlfile==1.1.0
31
- faiss-gpu==1.7.2
32
- fastapi==0.95.1
33
- ffmpy==0.3.0
34
- filelock==3.11.0
35
- flatbuffers==23.3.3
36
- flit_core
37
- fonttools==4.39.3
38
- frozenlist==1.3.3
39
- fsspec==2023.4.0
40
- gmpy2
41
- gptcache==0.1.14
42
- gradio==3.27.0
43
- gradio_client==0.1.3
44
- greenlet==2.0.2
45
- h11==0.14.0
46
- httpcore==0.16.3
47
- httpx==0.23.3
48
- huggingface-hub==0.13.4
49
- humanfriendly==10.0
50
- icetk==0.0.7
51
- idna
52
- iopath==0.1.10
53
- Jinja2
54
- joblib==1.2.0
55
- jsonschema==4.17.3
56
- kiwisolver==1.4.4
57
- langchain==0.0.142
58
- layoutparser==0.3.4
59
- linkify-it-py==2.0.0
60
- lxml==4.9.2
61
- Markdown==3.4.3
62
- markdown-it-py==2.2.0
63
- MarkupSafe==2.1.2
64
- marshmallow==3.19.0
65
- marshmallow-enum==1.5.1
66
- matplotlib==3.7.1
67
- mdit-py-plugins==0.3.3
68
- mdurl==0.1.2
69
- mkl-fft==1.3.1
70
- mkl-random
71
- mkl-service==2.4.0
72
- monotonic==1.6
73
- mpmath==1.2.1
74
- msg-parser==1.2.0
75
- multidict==6.0.4
76
- mypy-extensions==1.0.0
77
- networkx
78
- nltk==3.8.1
79
- numexpr==2.8.4
80
- numpy
81
- olefile==0.46
82
- omegaconf==2.3.0
83
- onnxruntime==1.14.1
84
- openai==0.27.4
85
- openapi-schema-pydantic==1.2.4
86
- opencv-python==4.6.0.66
87
- openpyxl==3.1.2
88
- orjson==3.8.10
89
- packaging==23.1
90
- pandas==1.5.3
91
- pdf2image==1.16.3
92
- pdfminer.six==20221105
93
- pdfplumber==0.9.0
94
- Pillow==9.5.0
95
- portalocker==2.7.0
96
- protobuf==3.18.3
97
- pycocotools==2.0.6
98
- pycparser
99
- pydantic==1.10.7
100
- pydub==0.25.1
101
- Pygments==2.15.0
102
- pyOpenSSL
103
- pypandoc==1.11
104
- pyparsing==3.0.9
105
- pyrsistent==0.19.3
106
- PySocks
107
- pytesseract==0.3.10
108
- python-dateutil==2.8.2
109
- python-docx==0.8.11
110
- python-magic==0.4.27
111
- python-multipart==0.0.6
112
- python-pptx==0.6.21
113
- pytz==2023.3
114
- PyYAML==6.0
115
- regex==2023.3.23
116
- requests==2.28.2
117
- rfc3986==1.5.0
118
- rich==13.0.1
119
- scikit-learn==1.2.2
120
- scipy==1.10.1
121
- semantic-version==2.10.0
122
- sentence-transformers==2.2.2
123
- sentencepiece==0.1.98
124
- six
125
- sniffio==1.3.0
126
- soupsieve==2.4.1
127
- SQLAlchemy==1.4.47
128
- starlette==0.26.1
129
- sympy
130
- tenacity==8.2.2
131
- threadpoolctl==3.1.0
132
- timm==0.6.13
133
- tokenizers==0.13.3
134
- toolz==0.12.0
135
- torch==2.0.0
136
- torchaudio==2.0.0
137
- torchvision==0.15.0
138
- tqdm==4.65.0
139
- transformers==4.28.1
140
- triton==2.0.0
141
- typing-inspect==0.8.0
142
- typing_extensions==4.5.0
143
- tzdata==2023.3
144
- uc-micro-py==1.0.1
145
- unstructured==0.5.12
146
- unstructured-inference==0.3.2
147
- urllib3
148
- uvicorn==0.21.1
149
- Wand==0.6.11
150
- websockets==11.0.2
151
- wrapt==1.14.1
152
- XlsxWriter==3.1.0
153
- yarl==1.8.2
 
1
+ langchain
2
+ gradio
3
+ transformers
4
+ sentence_transformers
5
+ faiss-cpu
6
+ unstructured
7
+ duckduckgo_search
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
tests/test_duckduckgo_search.py CHANGED
@@ -2,9 +2,15 @@ from duckduckgo_search import ddg
2
  from duckduckgo_search.utils import SESSION
3
 
4
 
5
- # SESSION.proxies = {
6
- # "http": f"socks5h://localhost:7890",
7
- # "https": f"socks5h://localhost:7890"
8
- # }
9
  r = ddg("马保国")
10
- print(r)
 
 
 
 
 
 
 
2
  from duckduckgo_search.utils import SESSION
3
 
4
 
5
+ SESSION.proxies = {
6
+ "http": f"socks5h://localhost:7890",
7
+ "https": f"socks5h://localhost:7890"
8
+ }
9
  r = ddg("马保国")
10
+ print(r[:2])
11
+ """
12
+ [{'title': '马保国 - 维基百科,自由的百科全书', 'href': 'https://zh.wikipedia.org/wiki/%E9%A9%AC%E4%BF%9D%E5%9B%BD', 'body': '马保国(1951年 — ) ,男,籍贯 山东 临沂,出生及长大于河南,中国大陆太极拳师,自称"浑元形意太极门掌门人" 。 马保国因2017年约战mma格斗家徐晓冬首次出现
13
+ 大众视野中。 2020年5月,马保国在对阵民间武术爱好者王庆民的比赛中,30秒内被连续高速击倒三次,此事件成为了持续多日的社交 ...'}, {'title': '馬保國的主页 - 抖音', 'href': 'https://www.douyin.com/user/MS4wLjABAAAAW0E1ziOvxgUh3VVv5FE6xmoo3w5WtZalfphYZKj4mCg', 'body': '6.3万. #马马国教扛打功 最近有几个人模芳我动作,很危险啊,不可以的,朋友们不要受伤了。. 5.3万. #马保国直播带货榜第一 朋友们周末愉快,本周六早上湿点,我本人在此号进行第一次带货直播,活到老,学到老,越活越年轻。. 7.0万. #马保国击破红牛罐 昨天 ...'}]
14
+
15
+
16
+ """