MariaK committed on
Commit
9559672
·
verified ·
1 Parent(s): dd4ae81

Added embedding and chunking options

Browse files
Files changed (1) hide show
  1. app.py +63 -23
app.py CHANGED
@@ -10,9 +10,9 @@ with open('destination_connectors.json', 'r') as f:
10
  destination_connectors = json.load(f)
11
 
12
  def generate_documentation_link(source, destination):
13
- return f"[{source['source_connector']} documentation]({source['docs']}) | [{destination['destination_connector']} documentation]({destination['docs']})"
14
 
15
- def generate_code(source, destination, chunking, embedding):
16
  source_connector = source_connectors[source]
17
  destination_connector = destination_connectors[destination]
18
 
@@ -24,6 +24,41 @@ def generate_code(source, destination, chunking, embedding):
24
  ' ' + line
25
  for line in destination_connector['configs'].strip().split('\n'))
26
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  code = f'''
28
  import os
29
  from unstructured_ingest.v2.pipeline.pipeline import Pipeline
@@ -43,32 +78,37 @@ if __name__ == "__main__":
43
  api_key=os.getenv("UNSTRUCTURED_API_KEY"),
44
  partition_endpoint=os.getenv("UNSTRUCTURED_API_URL"),
45
  strategy="hi_res",
46
- ),
47
- {'chunker_config=ChunkerConfig(chunking_strategy="by_title"),' if chunking else '# Chunking is disabled'}
48
- {'embedder_config=EmbedderConfig(embedding_provider="' + embedding + '")' if embedding else '# Embedding is disabled'}
49
  {indented_destination_configs}
50
  ).run()
51
  '''
52
  doc_link = generate_documentation_link(source_connector, destination_connector)
53
  return code, doc_link
54
 
55
- demo = gr.Interface(
56
- fn=generate_code,
57
- inputs=[
58
- gr.Dropdown(list(source_connectors.keys()),
59
- label="Get unstructured documents from:"),
60
- gr.Dropdown(list(destination_connectors.keys()),
61
- label="Upload RAG-ready documents to:"),
62
- gr.Checkbox(label="Check to enable chunking"),
63
- gr.Dropdown(["langchain-openai", "langchain-huggingface"],
64
- label="Embedding provider:")
65
- ],
66
- outputs=[
67
- gr.Code(language="python", label="Generated Code"),
68
- gr.Markdown(label="Documentation Links")
69
- ],
70
- title="Unstructured-Ingest Code Generator",
71
- description="Generate code for the unstructured-ingest library based on your inputs.")
 
 
 
 
 
 
72
 
73
 
74
- demo.launch()
 
10
  destination_connectors = json.load(f)
11
 
def generate_documentation_link(source, destination):
    """Build a Markdown line linking to the documentation of both connectors.

    Args:
        source: Source connector entry. Its 'source_connector' value is used
            as the link label and its 'docs' value as the link target.
        destination: Destination connector entry. Its 'destination_connector'
            value is used as the link label and its 'docs' value as the link
            target.

    Returns:
        A Markdown string with the source link and the destination link
        separated by " | ".

    Raises:
        KeyError: If either entry lacks one of the keys read above.
    """
    source_link = (
        f"[{source['source_connector']} source connector documentation]"
        f"({source['docs']})"
    )
    destination_link = (
        f"[{destination['destination_connector']} destination connector documentation]"
        f"({destination['docs']})"
    )
    return f"{source_link} | {destination_link}"
14
 
15
+ def generate_code(source, destination, chunking_strategy, chunk_size, chunk_overlap, embedding):
16
  source_connector = source_connectors[source]
17
  destination_connector = destination_connectors[destination]
18
 
 
24
  ' ' + line
25
  for line in destination_connector['configs'].strip().split('\n'))
26
 
27
+ # Generate chunking configuration
28
+ chunking_config = '\n # Chunking step skipped\n'
29
+ if chunking_strategy != "None":
30
+ chunking_config = f'''
31
+ chunker_config=ChunkerConfig(
32
+ chunking_strategy="{chunking_strategy}",
33
+ chunk_size={chunk_size if chunk_size is not None else 1000},
34
+ chunk_overlap={chunk_overlap if chunk_overlap is not None else 20}
35
+ ),'''
36
+
37
+ # Generate embedding configuration
38
+ embedding_config = ' # Embedding step is skipped'
39
+ if embedding != "None":
40
+ if embedding == "langchain-huggingface":
41
+ embedding_config = f'''
42
+ embedder_config=EmbedderConfig(
43
+ embedding_provider="{embedding}",
44
+ embedding_model_name=os.getenv("EMBEDDING_MODEL_NAME"),
45
+ ),'''
46
+ elif embedding == "langchain-aws-bedrock":
47
+ embedding_config = f'''
48
+ embedder_config=EmbedderConfig(
49
+ embedding_provider="{embedding}",
50
+ embedding_model_name=os.getenv("EMBEDDING_MODEL_NAME"),
51
+ embedding_aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
52
+ embedding_aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
53
+ ),'''
54
+ else:
55
+ embedding_config = f'''
56
+ embedder_config=EmbedderConfig(
57
+ embedding_provider="{embedding}",
58
+ embedding_model_name=os.getenv("EMBEDDING_MODEL_NAME"),
59
+ embedding_api_key=os.getenv("EMBEDDING_PROVIDER_API_KEY"),
60
+ ),'''
61
+
62
  code = f'''
63
  import os
64
  from unstructured_ingest.v2.pipeline.pipeline import Pipeline
 
78
  api_key=os.getenv("UNSTRUCTURED_API_KEY"),
79
  partition_endpoint=os.getenv("UNSTRUCTURED_API_URL"),
80
  strategy="hi_res",
81
+ ),{chunking_config}
82
+ {embedding_config}
 
83
  {indented_destination_configs}
84
  ).run()
85
  '''
86
  doc_link = generate_documentation_link(source_connector, destination_connector)
87
  return code, doc_link
88
 
# Gradio UI: input controls in a narrow left column (scale=1), generated
# output in a wider right column (scale=2).
with gr.Blocks() as demo:
    gr.Markdown("Unstructured-Ingest Code Generator")
    gr.Markdown("Generate code for the unstructured-ingest library based on your inputs.")

    with gr.Row():
        with gr.Column(scale=1):
            source = gr.Dropdown(
                list(source_connectors.keys()),
                label="Get unstructured documents from:",
                value="S3",
            )
            destination = gr.Dropdown(
                list(destination_connectors.keys()),
                label="Upload RAG-ready documents to:",
                value="Local directory",
            )
            chunking_strategy = gr.Dropdown(
                ["None", "by_title", "basic", "by_page", "by_similarity"],
                label="Chunking strategy:",
                value="None",
            )
            chunk_size = gr.Number(value=1000, label="Chunk size (characters):", step=1)
            chunk_overlap = gr.Number(value=20, label="Chunk overlap (characters):", step=1)
            # Default to the "None" sentinel, like the chunking dropdown:
            # generate_code skips the embedding step only when it receives the
            # string "None", so an untouched dropdown must not hand it a
            # Python None (which would be rendered as a provider name).
            embedding = gr.Dropdown(
                [
                    "None",
                    "langchain-openai",
                    "langchain-huggingface",
                    "langchain-aws-bedrock",
                    "langchain-vertexai",
                    "langchain-voyageai",
                    "octoai",
                ],
                label="Embedding provider:",
                value="None",
            )
            submit_button = gr.Button("Generate Code")

        with gr.Column(scale=2):
            output_code = gr.Code(language="python", label="Generated Code")
            output_docs = gr.Markdown(label="Documentation Links")

    # Input order must match the positional parameters of generate_code;
    # its two return values fill the code panel and the docs panel.
    submit_button.click(
        fn=generate_code,
        inputs=[source, destination, chunking_strategy, chunk_size, chunk_overlap, embedding],
        outputs=[output_code, output_docs],
    )


demo.launch()