|
import gradio as gr |
|
import json |
|
import os |
|
|
|
|
|
# Connector catalogs loaded once at import time. Their keys populate the
# source/destination dropdowns, and each entry supplies the 'imports',
# 'configs' and 'docs' strings used when generating code and doc links.
# The encoding is given explicitly so parsing does not depend on the
# platform's default locale encoding.
with open('source_connectors.json', 'r', encoding='utf-8') as f:
    source_connectors = json.load(f)

with open('destination_connectors.json', 'r', encoding='utf-8') as f:
    destination_connectors = json.load(f)
|
|
|
def generate_documentation_link(source, destination):
    """Return one Markdown line linking to both connectors' documentation.

    Args:
        source: Source connector entry; must provide the keys
            'source_connector' (display name) and 'docs' (URL).
        destination: Destination connector entry; must provide the keys
            'destination_connector' (display name) and 'docs' (URL).

    Returns:
        Two Markdown links, source first, separated by " | ".

    Raises:
        KeyError: If either entry lacks one of the keys listed above.
    """
    source_link = (
        f"[{source['source_connector']} source connector documentation]"
        f"({source['docs']})"
    )
    destination_link = (
        f"[{destination['destination_connector']} destination connector documentation]"
        f"({destination['docs']})"
    )
    return f"{source_link} | {destination_link}"
|
|
|
# Indentation of the keyword arguments inside the generated
# ``Pipeline.from_configs(`` call (which itself sits one level deep, under
# ``if __name__ == "__main__":``).
_CONFIG_INDENT = " " * 8


def _indent_connector_configs(configs):
    """Indent every line of a connector config snippet to call-argument level."""
    return "\n".join(_CONFIG_INDENT + line for line in configs.strip().split("\n"))


def _whole_number(value, default):
    """Return ``value`` as an int, or ``default`` when the field was left empty."""
    return default if value is None else int(value)


def _chunking_section(chunking_strategy, chunk_size, chunk_overlap):
    """Build the ``chunker_config=...`` argument, or a comment when skipped."""
    if not chunking_strategy or chunking_strategy == "None":
        # The trailing newline keeps a blank line between this note and the
        # embedding section that follows it in the template.
        return f"{_CONFIG_INDENT}# Chunking step skipped\n"

    inner = _CONFIG_INDENT + " " * 4
    return "\n".join([
        f"{_CONFIG_INDENT}chunker_config=ChunkerConfig(",
        f'{inner}chunking_strategy="{chunking_strategy}",',
        # Sizes are character counts, so they are emitted as whole numbers
        # even if the UI handed over a float such as 1000.0.
        f"{inner}chunk_max_characters={_whole_number(chunk_size, 1000)},",
        f"{inner}chunk_overlap={_whole_number(chunk_overlap, 20)}",
        f"{_CONFIG_INDENT}),",
    ])


def _embedding_section(embedding):
    """Build the ``embedder_config=...`` argument, or a comment when skipped."""
    if not embedding or embedding == "None":
        return f"{_CONFIG_INDENT}# Embedding step is skipped"

    arguments = [
        f'embedding_provider="{embedding}",',
        'embedding_model_name=os.getenv("EMBEDDING_MODEL_NAME"),',
    ]
    if embedding == "langchain-aws-bedrock":
        # Bedrock authenticates with an AWS key pair instead of a single API key.
        arguments.append('embedding_aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),')
        arguments.append('embedding_aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"),')
    elif embedding != "langchain-huggingface":
        # Every other provider gets an API key; the HuggingFace variant is
        # generated with the provider and model name only.
        arguments.append('embedding_api_key=os.getenv("EMBEDDING_PROVIDER_API_KEY"),')

    inner = _CONFIG_INDENT + " " * 4
    body = "\n".join(inner + argument for argument in arguments)
    return (
        f"{_CONFIG_INDENT}embedder_config=EmbedderConfig(\n"
        f"{body}\n"
        f"{_CONFIG_INDENT}),"
    )


def generate_code(source, destination, chunking_strategy, chunk_size, chunk_overlap, embedding):
    """Generate an unstructured-ingest pipeline script for the chosen options.

    Args:
        source: Key of the source connector in ``source_connectors``.
        destination: Key of the destination connector in
            ``destination_connectors``.
        chunking_strategy: Chunking strategy name. ``"None"`` or an empty
            selection (``None``) skips the chunking step.
        chunk_size: Maximum chunk size in characters; ``None`` falls back
            to 1000.
        chunk_overlap: Chunk overlap in characters; ``None`` falls back to 20.
        embedding: Embedding provider name. ``"None"`` or an empty selection
            (``None``) skips the embedding step.

    Returns:
        A ``(code, doc_link)`` tuple: the generated Python script and a
        Markdown line linking to both connectors' documentation.

    Raises:
        KeyError: If ``source`` or ``destination`` is not a known connector.
    """
    source_connector = source_connectors[source]
    destination_connector = destination_connectors[destination]

    source_imports = source_connector['imports']
    destination_imports = destination_connector['imports']
    source_configs = _indent_connector_configs(source_connector['configs'])
    destination_configs = _indent_connector_configs(destination_connector['configs'])
    chunking_config = _chunking_section(chunking_strategy, chunk_size, chunk_overlap)
    embedding_config = _embedding_section(embedding)

    # The template is deliberately flush-left: its text is the generated file,
    # so its indentation must be that of the output, not of this function.
    code = f'''
import os
from unstructured_ingest.v2.pipeline.pipeline import Pipeline
from unstructured_ingest.v2.interfaces import ProcessorConfig
from unstructured_ingest.v2.processes.partitioner import PartitionerConfig
{source_imports}
{destination_imports}
from unstructured_ingest.v2.processes.chunker import ChunkerConfig
from unstructured_ingest.v2.processes.embedder import EmbedderConfig

if __name__ == "__main__":
    Pipeline.from_configs(
        context=ProcessorConfig(),
{source_configs}
        partitioner_config=PartitionerConfig(
            partition_by_api=True,
            api_key=os.getenv("UNSTRUCTURED_API_KEY"),
            partition_endpoint=os.getenv("UNSTRUCTURED_API_URL"),
            strategy="hi_res",
        ),
{chunking_config}
{embedding_config}
{destination_configs}
    ).run()
'''
    doc_link = generate_documentation_link(source_connector, destination_connector)
    return code, doc_link
|
|
|
# Gradio UI: the left column collects the pipeline options, the right column
# shows the generated script and the connector documentation links.
with gr.Blocks() as demo:
    gr.Markdown("# Unstructured-Ingest Code Generator")
    gr.Markdown("Generate code for the unstructured-ingest library based on your inputs. Learn more about using Unstructured Serverless API in the [documentation](https://docs.unstructured.io/api-reference/ingest/overview).")

    with gr.Row():
        with gr.Column(scale=1):
            source = gr.Dropdown(
                list(source_connectors.keys()),
                label="Get unstructured documents from:",
                value="S3",
            )
            destination = gr.Dropdown(
                list(destination_connectors.keys()),
                label="Upload RAG-ready documents to:",
                value="Local directory",
            )
            chunking_strategy = gr.Dropdown(
                ["None", "by_title", "basic", "by_page", "by_similarity"],
                label="Chunking strategy:",
                value="None",
            )
            # precision=0 makes Gradio hand the callback whole numbers, which
            # is what a character count should be.
            chunk_size = gr.Number(
                value=1000, label="Chunk size (characters):", step=1, precision=0
            )
            chunk_overlap = gr.Number(
                value=20, label="Chunk overlap (characters):", step=1, precision=0
            )
            # Explicit default, matching the chunking dropdown, so the
            # callback receives the "None" sentinel rather than an empty
            # selection when the user never touches this field.
            embedding = gr.Dropdown(
                [
                    "None",
                    "langchain-openai",
                    "langchain-huggingface",
                    "langchain-aws-bedrock",
                    "langchain-vertexai",
                    "langchain-voyageai",
                    "octoai",
                ],
                label="Embedding provider:",
                value="None",
            )
            submit_button = gr.Button("Generate Code")

        with gr.Column(scale=2):
            output_code = gr.Code(language="python", label="Generated Code")
            output_docs = gr.Markdown(label="Documentation Links")

    # The input order must match generate_code's positional parameters.
    submit_button.click(
        fn=generate_code,
        inputs=[source, destination, chunking_strategy, chunk_size, chunk_overlap, embedding],
        outputs=[output_code, output_docs],
    )

demo.launch()