File size: 5,200 Bytes
641bde8
 
 
 
 
 
 
 
 
 
 
 
9559672
641bde8
9559672
641bde8
 
 
 
 
 
 
 
 
 
 
9559672
 
 
 
 
 
617b17a
9559672
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
641bde8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9559672
 
641bde8
 
 
 
 
 
9559672
d711063
 
9559672
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
641bde8
 
9559672
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
import gradio as gr
import json
import os

# Load connector metadata from JSON files shipped alongside this script.
# Each file maps a display name (e.g. "S3") to a dict whose keys
# ('imports', 'configs', 'docs', 'source_connector'/'destination_connector')
# are consumed by generate_code() and generate_documentation_link() below.
# Explicit encoding: JSON is UTF-8 by spec; never rely on the locale default.
with open('source_connectors.json', 'r', encoding='utf-8') as f:
    source_connectors = json.load(f)

with open('destination_connectors.json', 'r', encoding='utf-8') as f:
    destination_connectors = json.load(f)

def generate_documentation_link(source, destination):
    """Build a single markdown line linking both connectors' documentation pages."""
    source_part = f"[{source['source_connector']} source connector documentation]({source['docs']})"
    destination_part = f"[{destination['destination_connector']} destination connector documentation]({destination['docs']})"
    return source_part + " | " + destination_part

def generate_code(source, destination, chunking_strategy, chunk_size, chunk_overlap, embedding):
    """Assemble a runnable unstructured-ingest pipeline script as a string.

    Args:
        source: Key into the module-level ``source_connectors`` dict.
        destination: Key into the module-level ``destination_connectors`` dict.
        chunking_strategy: ``"None"`` (or ``None``) to skip chunking, otherwise
            a strategy name such as ``"by_title"``.
        chunk_size: Max characters per chunk; falls back to 1000 when ``None``.
        chunk_overlap: Overlap in characters; falls back to 20 when ``None``.
        embedding: ``"None"`` (or ``None``) to skip embedding, otherwise an
            embedding provider name.

    Returns:
        A ``(code, doc_link)`` tuple: the generated Python source and a
        markdown string linking both connectors' documentation.

    Raises:
        KeyError: If ``source`` or ``destination`` is not a known connector.
    """
    source_connector = source_connectors[source]
    destination_connector = destination_connectors[destination]

    # Indent each connector config line so it nests inside the generated
    # Pipeline.from_configs(...) call (8 spaces = two levels of 4).
    indented_source_configs = '\n'.join(
        '        ' + line
        for line in source_connector['configs'].strip().split('\n'))
    indented_destination_configs = '\n'.join(
        '        ' + line
        for line in destination_connector['configs'].strip().split('\n'))

    # Chunking step. A gr.Dropdown with no selection passes Python None, so
    # treat both None and the literal string "None" as "skip this step" —
    # otherwise we'd emit a ChunkerConfig with chunking_strategy="None".
    chunking_config = '\n        # Chunking step skipped\n'
    if chunking_strategy not in (None, "None"):
        chunking_config = f'''
        chunker_config=ChunkerConfig(
            chunking_strategy="{chunking_strategy}",
            chunk_max_characters={chunk_size if chunk_size is not None else 1000},
            chunk_overlap={chunk_overlap if chunk_overlap is not None else 20}
        ),'''

    # Embedding step. Credentials are referenced via os.getenv(...) in the
    # generated code so no secrets ever appear in the output itself.
    embedding_config = '        # Embedding step is skipped'
    if embedding not in (None, "None"):
        if embedding == "langchain-huggingface":
            # Hugging Face local models need no API key.
            embedding_config = f'''
        embedder_config=EmbedderConfig(
            embedding_provider="{embedding}",
            embedding_model_name=os.getenv("EMBEDDING_MODEL_NAME"),
        ),'''
        elif embedding == "langchain-aws-bedrock":
            # Bedrock authenticates with an AWS key pair instead of an API key.
            embedding_config = f'''
        embedder_config=EmbedderConfig(
            embedding_provider="{embedding}",
            embedding_model_name=os.getenv("EMBEDDING_MODEL_NAME"),
            embedding_aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
            embedding_aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
        ),'''
        else:
            # All remaining providers share a generic API-key env var.
            embedding_config = f'''
        embedder_config=EmbedderConfig(
            embedding_provider="{embedding}",
            embedding_model_name=os.getenv("EMBEDDING_MODEL_NAME"),
            embedding_api_key=os.getenv("EMBEDDING_PROVIDER_API_KEY"),
        ),'''

    code = f'''
import os
from unstructured_ingest.v2.pipeline.pipeline import Pipeline
from unstructured_ingest.v2.interfaces import ProcessorConfig
from unstructured_ingest.v2.processes.partitioner import PartitionerConfig
{source_connector['imports']}
{destination_connector['imports']}
from unstructured_ingest.v2.processes.chunker import ChunkerConfig
from unstructured_ingest.v2.processes.embedder import EmbedderConfig

if __name__ == "__main__":
    Pipeline.from_configs(
        context=ProcessorConfig(),
{indented_source_configs}
        partitioner_config=PartitionerConfig(
            partition_by_api=True,
            api_key=os.getenv("UNSTRUCTURED_API_KEY"),
            partition_endpoint=os.getenv("UNSTRUCTURED_API_URL"),
            strategy="hi_res",
        ),{chunking_config}
{embedding_config}
{indented_destination_configs}
    ).run()
'''
    doc_link = generate_documentation_link(source_connector, destination_connector)
    return code, doc_link

# --- Gradio UI: collect pipeline options and show the generated script ----
with gr.Blocks() as demo:
    gr.Markdown("# Unstructured-Ingest Code Generator")
    gr.Markdown("Generate code for the unstructured-ingest library based on your inputs. Learn more about using Unstructured Serverless API in the [documentation](https://docs.unstructured.io/api-reference/ingest/overview).")

    with gr.Row():
        with gr.Column(scale=1):
            # Input controls. Numeric defaults (1000 / 20) mirror the
            # fallbacks generate_code() applies when the fields are cleared.
            source = gr.Dropdown(list(source_connectors.keys()), label="Get unstructured documents from:", value="S3")
            destination = gr.Dropdown(list(destination_connectors.keys()), label="Upload RAG-ready documents to:", value="Local directory")
            chunking_strategy = gr.Dropdown(["None", "by_title", "basic", "by_page", "by_similarity"], label="Chunking strategy:", value="None")
            chunk_size = gr.Number(value=1000, label="Chunk size (characters):", step=1)
            chunk_overlap = gr.Number(value=20, label="Chunk overlap (characters):", step=1)
            # Default to "None" like chunking_strategy: without a value, an
            # untouched dropdown submits Python None, which generate_code
            # would otherwise turn into a provider named "None".
            embedding = gr.Dropdown(["None", "langchain-openai", "langchain-huggingface", "langchain-aws-bedrock", "langchain-vertexai", "langchain-voyageai", "octoai"], label="Embedding provider:", value="None")
            submit_button = gr.Button("Generate Code")

        with gr.Column(scale=2):
            # Outputs: the generated script and the connector doc links.
            output_code = gr.Code(language="python", label="Generated Code")
            output_docs = gr.Markdown(label="Documentation Links")

    submit_button.click(
        fn=generate_code,
        inputs=[source, destination, chunking_strategy, chunk_size, chunk_overlap, embedding],
        outputs=[output_code, output_docs]
    )


demo.launch()