import gradio as gr
import json
import os
# Load source_connectors and destination_connectors from JSON files
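# Each connector entry is expected to provide a display name ('source_connector' /
# 'destination_connector'), a 'docs' URL, the connector's 'imports' lines, and a
# 'configs' block that is spliced into the generated pipeline. Illustrative shape only
# (the real entries live in the JSON files):
#   "S3": {"source_connector": "S3", "docs": "https://docs.unstructured.io/...",
#          "imports": "...", "configs": "..."}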
with open('source_connectors.json', 'r') as f:
    source_connectors = json.load(f)

with open('destination_connectors.json', 'r') as f:
    destination_connectors = json.load(f)

def generate_documentation_link(source, destination):
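    """Return a Markdown string linking to the documentation for both selected connectors."""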
return f"[{source['source_connector']} source connector documentation]({source['docs']}) | [{destination['destination_connector']} destination connector documentation]({destination['docs']})"
def generate_code(source, destination, chunking_strategy, chunk_size, chunk_overlap, embedding):
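    """Build an unstructured-ingest pipeline script for the selected source, destination,
    chunking strategy, and embedding provider, and return it with documentation links."""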
    source_connector = source_connectors[source]
    destination_connector = destination_connectors[destination]

    # Ensure proper indentation for source and destination configs
    indented_source_configs = '\n'.join(
        '        ' + line
        for line in source_connector['configs'].strip().split('\n'))
    indented_destination_configs = '\n'.join(
        '        ' + line
        for line in destination_connector['configs'].strip().split('\n'))
    # Generate chunking configuration
    chunking_config = '\n        # Chunking step skipped\n'
    if chunking_strategy != "None":
        chunking_config = f'''
        chunker_config=ChunkerConfig(
            chunking_strategy="{chunking_strategy}",
            chunk_max_characters={chunk_size if chunk_size is not None else 1000},
            chunk_overlap={chunk_overlap if chunk_overlap is not None else 20}
        ),'''
    # Generate embedding configuration
    embedding_config = '        # Embedding step is skipped'
    if embedding != "None":
        if embedding == "langchain-huggingface":
            embedding_config = f'''
        embedder_config=EmbedderConfig(
            embedding_provider="{embedding}",
            embedding_model_name=os.getenv("EMBEDDING_MODEL_NAME"),
        ),'''
        elif embedding == "langchain-aws-bedrock":
            embedding_config = f'''
        embedder_config=EmbedderConfig(
            embedding_provider="{embedding}",
            embedding_model_name=os.getenv("EMBEDDING_MODEL_NAME"),
            embedding_aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
            embedding_aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
        ),'''
        else:
            embedding_config = f'''
        embedder_config=EmbedderConfig(
            embedding_provider="{embedding}",
            embedding_model_name=os.getenv("EMBEDDING_MODEL_NAME"),
            embedding_api_key=os.getenv("EMBEDDING_PROVIDER_API_KEY"),
        ),'''
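
    # Assemble the full pipeline script. The generated code reads credentials and
    # endpoints (Unstructured API key/URL, embedding provider keys) from environment variables.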
    code = f'''
import os

from unstructured_ingest.v2.pipeline.pipeline import Pipeline
from unstructured_ingest.v2.interfaces import ProcessorConfig
from unstructured_ingest.v2.processes.partitioner import PartitionerConfig
{source_connector['imports']}
{destination_connector['imports']}
from unstructured_ingest.v2.processes.chunker import ChunkerConfig
from unstructured_ingest.v2.processes.embedder import EmbedderConfig

if __name__ == "__main__":
    Pipeline.from_configs(
        context=ProcessorConfig(),
{indented_source_configs}
        partitioner_config=PartitionerConfig(
            partition_by_api=True,
            api_key=os.getenv("UNSTRUCTURED_API_KEY"),
            partition_endpoint=os.getenv("UNSTRUCTURED_API_URL"),
            strategy="hi_res",
        ),{chunking_config}
{embedding_config}
{indented_destination_configs}
    ).run()
'''
    doc_link = generate_documentation_link(source_connector, destination_connector)
    return code, doc_link

with gr.Blocks() as demo:
gr.Markdown("# Unstructured-Ingest Code Generator")
gr.Markdown("Generate code for the unstructured-ingest library based on your inputs. Learn more about using Unstructured Serverless API in the [documentation](https://docs.unstructured.io/api-reference/ingest/overview).")
    with gr.Row():
        with gr.Column(scale=1):
            source = gr.Dropdown(list(source_connectors.keys()), label="Get unstructured documents from:", value="S3")
            destination = gr.Dropdown(list(destination_connectors.keys()), label="Upload RAG-ready documents to:", value="Local directory")
            chunking_strategy = gr.Dropdown(["None", "by_title", "basic", "by_page", "by_similarity"], label="Chunking strategy:", value="None")
            chunk_size = gr.Number(value=1000, label="Chunk size (characters):", step=1)
            chunk_overlap = gr.Number(value=20, label="Chunk overlap (characters):", step=1)
            embedding = gr.Dropdown(["None", "langchain-openai", "langchain-huggingface", "langchain-aws-bedrock", "langchain-vertexai", "langchain-voyageai", "octoai"], label="Embedding provider:")
            submit_button = gr.Button("Generate Code")

        with gr.Column(scale=2):
            output_code = gr.Code(language="python", label="Generated Code")
            output_docs = gr.Markdown(label="Documentation Links")
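
    # Regenerate the code and documentation links whenever the button is clicked.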
    submit_button.click(
        fn=generate_code,
        inputs=[source, destination, chunking_strategy, chunk_size, chunk_overlap, embedding],
        outputs=[output_code, output_docs]
    )

demo.launch()