Spaces:

MariaK
/

unstructured-pipeline-builder

Running

App Files Files Community

unstructured-pipeline-builder / app.py

MariaK

Update app.py

617b17a verified 4 months ago

raw

history blame contribute delete

5.2 kB

	import gradio as gr
	import json
	import os

	# Load source_connectors and destination_connectors from JSON files.
	# NOTE(review): both paths are relative, so they resolve against the current
	# working directory of the process that starts the app.
	def _load_connectors(path):
	    """Return the parsed JSON content of the file at *path*.

	    The file is opened explicitly as UTF-8 (the encoding JSON documents are
	    expected to use) instead of the platform default, so non-ASCII
	    characters decode the same way on every host.

	    Raises:
	        OSError: if the file cannot be opened.
	        json.JSONDecodeError: if the file is not valid JSON.
	    """
	    with open(path, 'r', encoding='utf-8') as f:
	        return json.load(f)


	source_connectors = _load_connectors('source_connectors.json')
	destination_connectors = _load_connectors('destination_connectors.json')

	def generate_documentation_link(source, destination):
	    """Return a Markdown line linking to both connectors' documentation.

	    Args:
	        source: Mapping with the keys ``'source_connector'`` (display name)
	            and ``'docs'`` (URL).
	        destination: Mapping with the keys ``'destination_connector'``
	            (display name) and ``'docs'`` (URL).

	    Returns:
	        Two Markdown links separated by a backslash-escaped pipe.

	    Raises:
	        KeyError: if either mapping lacks one of the keys listed above.
	    """
	    source_link = f"[{source['source_connector']} source connector documentation]({source['docs']})"
	    destination_link = (
	        f"[{destination['destination_connector']} destination connector documentation]"
	        f"({destination['docs']})"
	    )
	    # The backslash is doubled so the literal stays "\|" at runtime without
	    # relying on an invalid escape sequence, which newer Python versions warn about.
	    return f"{source_link} \\| {destination_link}"

	def _indent_lines(text, prefix):
	    """Strip surrounding whitespace from *text* and put *prefix* before each line."""
	    return '\n'.join(prefix + line for line in text.strip().split('\n'))


	def generate_code(source, destination, chunking_strategy, chunk_size, chunk_overlap, embedding):
	    """Build an unstructured-ingest pipeline script for the selected options.

	    Args:
	        source: Key into ``source_connectors``.
	        destination: Key into ``destination_connectors``.
	        chunking_strategy: Strategy name written into ``ChunkerConfig``;
	            ``"None"`` or ``None`` skips the chunking step.
	        chunk_size: Emitted as ``chunk_max_characters``; 1000 when ``None``.
	        chunk_overlap: Emitted as ``chunk_overlap``; 20 when ``None``.
	        embedding: Provider name written into ``EmbedderConfig``;
	            ``"None"`` or ``None`` skips the embedding step.

	    Returns:
	        A ``(code, doc_link)`` tuple: the generated script text and the
	        Markdown documentation line for the two connectors.

	    Raises:
	        KeyError: if ``source`` / ``destination`` is not a known connector or
	            a connector entry lacks the ``'imports'`` / ``'configs'`` keys.
	    """
	    source_connector = source_connectors[source]
	    destination_connector = destination_connectors[destination]

	    # Keyword arguments of Pipeline.from_configs(...) sit two levels deep in
	    # the generated script (inside the __main__ guard and inside the call);
	    # arguments of the nested *Config(...) calls sit one level deeper.
	    kwarg_indent = ' ' * 8
	    nested_indent = ' ' * 12

	    indented_source_configs = _indent_lines(source_connector['configs'], kwarg_indent)
	    indented_destination_configs = _indent_lines(destination_connector['configs'], kwarg_indent)

	    # A dropdown without a selection can deliver None rather than the "None"
	    # choice; both mean "skip this step" and must not be emitted as a value.
	    skipped = (None, "None")

	    # Generate chunking configuration
	    if chunking_strategy in skipped:
	        chunking_lines = [kwarg_indent + '# Chunking step skipped']
	    else:
	        max_characters = 1000 if chunk_size is None else chunk_size
	        overlap = 20 if chunk_overlap is None else chunk_overlap
	        chunking_lines = [
	            kwarg_indent + 'chunker_config=ChunkerConfig(',
	            nested_indent + f'chunking_strategy="{chunking_strategy}",',
	            nested_indent + f'chunk_max_characters={max_characters},',
	            nested_indent + f'chunk_overlap={overlap}',
	            kwarg_indent + '),',
	        ]

	    # Generate embedding configuration
	    if embedding in skipped:
	        embedding_lines = [kwarg_indent + '# Embedding step is skipped']
	    else:
	        embedding_lines = [
	            kwarg_indent + 'embedder_config=EmbedderConfig(',
	            nested_indent + f'embedding_provider="{embedding}",',
	            nested_indent + 'embedding_model_name=os.getenv("EMBEDDING_MODEL_NAME"),',
	        ]
	        if embedding == "langchain-aws-bedrock":
	            # Bedrock authenticates with an AWS key pair instead of an API key.
	            embedding_lines += [
	                nested_indent + 'embedding_aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),',
	                nested_indent + 'embedding_aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"),',
	            ]
	        elif embedding != "langchain-huggingface":
	            # Every provider other than Hugging Face and Bedrock gets an API key line.
	            embedding_lines.append(
	                nested_indent + 'embedding_api_key=os.getenv("EMBEDDING_PROVIDER_API_KEY"),'
	            )
	        embedding_lines.append(kwarg_indent + '),')

	    # The script is assembled from explicit lines so that its indentation does
	    # not depend on how this source file itself is indented.
	    script_lines = [
	        '',
	        'import os',
	        'from unstructured_ingest.v2.pipeline.pipeline import Pipeline',
	        'from unstructured_ingest.v2.interfaces import ProcessorConfig',
	        'from unstructured_ingest.v2.processes.partitioner import PartitionerConfig',
	        source_connector['imports'],
	        destination_connector['imports'],
	        'from unstructured_ingest.v2.processes.chunker import ChunkerConfig',
	        'from unstructured_ingest.v2.processes.embedder import EmbedderConfig',
	        '',
	        'if __name__ == "__main__":',
	        '    Pipeline.from_configs(',
	        kwarg_indent + 'context=ProcessorConfig(),',
	        indented_source_configs,
	        kwarg_indent + 'partitioner_config=PartitionerConfig(',
	        nested_indent + 'partition_by_api=True,',
	        nested_indent + 'api_key=os.getenv("UNSTRUCTURED_API_KEY"),',
	        nested_indent + 'partition_endpoint=os.getenv("UNSTRUCTURED_API_URL"),',
	        nested_indent + 'strategy="hi_res",',
	        kwarg_indent + '),',
	        *chunking_lines,
	        *embedding_lines,
	        indented_destination_configs,
	        '    ).run()',
	        '',
	    ]
	    code = '\n'.join(script_lines)

	    doc_link = generate_documentation_link(source_connector, destination_connector)
	    return code, doc_link

	# Build the single-page UI: option widgets on the left, generated output on the right.
	with gr.Blocks() as demo:
	    gr.Markdown("# Unstructured-Ingest Code Generator")
	    gr.Markdown("Generate code for the unstructured-ingest library based on your inputs. Learn more about using Unstructured Serverless API in the [documentation](https://docs.unstructured.io/api-reference/ingest/overview).")

	    with gr.Row():
	        with gr.Column(scale=1):
	            source = gr.Dropdown(list(source_connectors.keys()), label="Get unstructured documents from:", value="S3")
	            destination = gr.Dropdown(list(destination_connectors.keys()), label="Upload RAG-ready documents to:", value="Local directory")
	            chunking_strategy = gr.Dropdown(["None", "by_title", "basic", "by_page", "by_similarity"], label="Chunking strategy:", value="None")
	            # precision=0 asks Gradio to round to an integer, so the character
	            # counts are handed to the callback as ints rather than floats.
	            chunk_size = gr.Number(value=1000, label="Chunk size (characters):", step=1, precision=0)
	            chunk_overlap = gr.Number(value=20, label="Chunk overlap (characters):", step=1, precision=0)
	            # An explicit initial value keeps the callback from receiving None
	            # for an untouched dropdown (the implicit default differs between
	            # Gradio versions); "None" is the choice that skips embedding.
	            embedding = gr.Dropdown(["None", "langchain-openai", "langchain-huggingface", "langchain-aws-bedrock", "langchain-vertexai", "langchain-voyageai", "octoai"], label="Embedding provider:", value="None")
	            submit_button = gr.Button("Generate Code")

	        with gr.Column(scale=2):
	            output_code = gr.Code(language="python", label="Generated Code")
	            output_docs = gr.Markdown(label="Documentation Links")

	    # The input order must match the positional parameters of generate_code.
	    submit_button.click(
	        fn=generate_code,
	        inputs=[source, destination, chunking_strategy, chunk_size, chunk_overlap, embedding],
	        outputs=[output_code, output_docs],
	    )


	demo.launch()