Spaces:

MariaK
/

unstructured-pipeline-builder

Running

App Files Community

MariaK committed on Oct 2, 2024

Commit

9559672

verified ·

1 Parent(s): dd4ae81

Added embedding and chunking options

Browse files

Files changed (1) hide show

app.py +63 -23

app.py CHANGED Viewed

@@ -10,9 +10,9 @@ with open('destination_connectors.json', 'r') as f:
     destination_connectors = json.load(f)
 def generate_documentation_link(source, destination):
-    return f"[{source['source_connector']} documentation]({source['docs']}) | [{destination['destination_connector']} documentation]({destination['docs']})"
-def generate_code(source, destination, chunking, embedding):
     source_connector = source_connectors[source]
     destination_connector = destination_connectors[destination]
@@ -24,6 +24,41 @@ def generate_code(source, destination, chunking, embedding):
         '        ' + line
         for line in destination_connector['configs'].strip().split('\n'))
     code = f'''
 import os
 from unstructured_ingest.v2.pipeline.pipeline import Pipeline
@@ -43,32 +78,37 @@ if __name__ == "__main__":
             api_key=os.getenv("UNSTRUCTURED_API_KEY"),
             partition_endpoint=os.getenv("UNSTRUCTURED_API_URL"),
             strategy="hi_res",
-        ),
-        {'chunker_config=ChunkerConfig(chunking_strategy="by_title"),' if chunking else '# Chunking is disabled'}
-        {'embedder_config=EmbedderConfig(embedding_provider="' + embedding + '")' if embedding else '# Embedding is disabled'}
 {indented_destination_configs}
     ).run()
 '''
     doc_link = generate_documentation_link(source_connector, destination_connector)
     return code, doc_link
-demo = gr.Interface(
-    fn=generate_code,
-    inputs=[
-        gr.Dropdown(list(source_connectors.keys()),
-                    label="Get unstructured documents from:"),
-        gr.Dropdown(list(destination_connectors.keys()),
-                    label="Upload RAG-ready documents to:"),
-        gr.Checkbox(label="Check to enable chunking"),
-        gr.Dropdown(["langchain-openai", "langchain-huggingface"],
-                    label="Embedding provider:")
-    ],
-    outputs=[
-        gr.Code(language="python", label="Generated Code"),
-        gr.Markdown(label="Documentation Links")
-    ],
-    title="Unstructured-Ingest Code Generator",
-    description="Generate code for the unstructured-ingest library based on your inputs.")
-demo.launch()

     destination_connectors = json.load(f)
 def generate_documentation_link(source, destination):
+    return f"[{source['source_connector']} source connector documentation]({source['docs']}) | [{destination['destination_connector']} destination connector documentation]({destination['docs']})"
+def generate_code(source, destination, chunking_strategy, chunk_size, chunk_overlap, embedding):
     source_connector = source_connectors[source]
     destination_connector = destination_connectors[destination]
         '        ' + line
         for line in destination_connector['configs'].strip().split('\n'))
+    # Generate chunking configuration
+    chunking_config = '\n        # Chunking step skipped\n'
+    if chunking_strategy != "None":
+        chunking_config = f'''
+        chunker_config=ChunkerConfig(
+            chunking_strategy="{chunking_strategy}",
+            chunk_size={chunk_size if chunk_size is not None else 1000},
+            chunk_overlap={chunk_overlap if chunk_overlap is not None else 20}
+        ),'''
+    # Generate embedding configuration
+    embedding_config = '        # Embedding step is skipped'
+    if embedding != "None":
+        if embedding == "langchain-huggingface":
+            embedding_config = f'''
+        embedder_config=EmbedderConfig(
+            embedding_provider="{embedding}",
+            embedding_model_name=os.getenv("EMBEDDING_MODEL_NAME"),
+        ),'''
+        elif embedding == "langchain-aws-bedrock":
+            embedding_config = f'''
+        embedder_config=EmbedderConfig(
+            embedding_provider="{embedding}",
+            embedding_model_name=os.getenv("EMBEDDING_MODEL_NAME"),
+            embedding_aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
+            embedding_aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
+        ),'''
+        else:
+            embedding_config = f'''
+        embedder_config=EmbedderConfig(
+            embedding_provider="{embedding}",
+            embedding_model_name=os.getenv("EMBEDDING_MODEL_NAME"),
+            embedding_api_key=os.getenv("EMBEDDING_PROVIDER_API_KEY"),
+        ),'''
     code = f'''
 import os
 from unstructured_ingest.v2.pipeline.pipeline import Pipeline
             api_key=os.getenv("UNSTRUCTURED_API_KEY"),
             partition_endpoint=os.getenv("UNSTRUCTURED_API_URL"),
             strategy="hi_res",
+        ),{chunking_config}
+{embedding_config}
 {indented_destination_configs}
     ).run()
 '''
     doc_link = generate_documentation_link(source_connector, destination_connector)
     return code, doc_link
+with gr.Blocks() as demo:
+    gr.Markdown("Unstructured-Ingest Code Generator")
+    gr.Markdown("Generate code for the unstructured-ingest library based on your inputs.")
+    with gr.Row():
+        with gr.Column(scale=1):
+            source = gr.Dropdown(list(source_connectors.keys()), label="Get unstructured documents from:", value="S3")
+            destination = gr.Dropdown(list(destination_connectors.keys()), label="Upload RAG-ready documents to:", value="Local directory")
+            chunking_strategy = gr.Dropdown(["None", "by_title", "basic", "by_page", "by_similarity"], label="Chunking strategy:", value="None")
+            chunk_size = gr.Number(value=1000, label="Chunk size (characters):", step=1)
+            chunk_overlap = gr.Number(value=20, label="Chunk overlap (characters):", step=1)
+            embedding = gr.Dropdown(["None", "langchain-openai", "langchain-huggingface", "langchain-aws-bedrock", "langchain-vertexai", "langchain-voyageai", "octoai"], label="Embedding provider:")
+            submit_button = gr.Button("Generate Code")
+        with gr.Column(scale=2):
+            output_code = gr.Code(language="python", label="Generated Code")
+            output_docs = gr.Markdown(label="Documentation Links")
+    submit_button.click(
+        fn=generate_code,
+        inputs=[source, destination, chunking_strategy, chunk_size, chunk_overlap, embedding],
+        outputs=[output_code, output_docs]
+    )
+demo.launch()