text-to-sql-hub-datasets

Sleeping

File size: 3,604 Bytes

import json
import os
import urllib.parse

import gradio as gr
import requests
from gradio_huggingfacehub_search import HuggingfaceHubSearch
from huggingface_hub import InferenceClient

# Sample value provided by the search component. It is not referenced by the
# code visible in this file.
example = HuggingfaceHubSearch().example_value()

# Shared Inference API client bound to the Llama 3.1 70B Instruct model.
# The token is read from the HF_TOKEN environment variable at import time;
# if the variable is missing, the lookup raises KeyError before the UI is built.
client = InferenceClient(
    "meta-llama/Meta-Llama-3.1-70B-Instruct",
    token=os.environ["HF_TOKEN"],
)


def get_iframe(hub_repo_id, sql_query=None):
    """Build the HTML ``<iframe>`` snippet that embeds the Hub dataset viewer.

    Args:
        hub_repo_id: Dataset repository id, e.g. ``"user/dataset"``.
        sql_query: Optional SQL text. When given, the viewer URL enables the
            SQL console and carries the percent-encoded query in ``sql``.

    Returns:
        The iframe markup as a string, or an empty string when no repository
        id was supplied (so the caller renders nothing instead of a broken
        viewer).
    """
    if not hub_repo_id:
        return ""

    # Percent-encode the repo id (keeping the "user/name" slash) so characters
    # such as quotes or angle brackets cannot break out of the src attribute.
    repo_path = urllib.parse.quote(str(hub_repo_id), safe="/")
    url = f"https://huggingface.co/datasets/{repo_path}/embed/viewer"
    if sql_query:
        url += f"?sql_console=true&sql={urllib.parse.quote(sql_query)}"

    iframe = f"""
    <iframe
  src="{url}"
  frameborder="0"
  width="100%"
  height="800px"
></iframe>
"""
    return iframe


def get_column_info(hub_repo_id):
    """Fetch the feature (column) description of a Hub dataset as JSON text.

    Queries the datasets-server ``/info`` endpoint and serialises the
    ``features`` entry of the first configuration listed under
    ``dataset_info``.

    Args:
        hub_repo_id: Dataset repository id, e.g. ``"user/dataset"``.

    Returns:
        A JSON string describing the dataset features.

    Raises:
        gr.Error: If the request fails or the response does not contain the
            expected ``dataset_info`` structure.
    """
    try:
        # Let requests encode the repo id, and bound the wait so a stalled
        # server cannot hang the UI event.
        response = requests.get(
            "https://datasets-server.huggingface.co/info",
            params={"dataset": hub_repo_id},
            timeout=30,
        )
        payload = response.json()
        dataset_info = payload.get("dataset_info")
        if not dataset_info:
            # Prefer the server's own "error" message when it sent one.
            raise ValueError(
                payload.get("error") or "response contains no dataset_info"
            )
        first_config = next(iter(dataset_info.values()))
        features = json.dumps(first_config.get("features"))
    except Exception as e:
        # gr.Error must be raised (not merely constructed) to reach the user.
        raise gr.Error(f"Error getting column info: {e}") from e
    return features


def query_dataset(hub_repo_id, features, query):
    """Turn a natural-language request into SQL and embed the viewer running it.

    Sends the dataset features and the user's request to the chat model held
    by the module-level ``client``, then builds the dataset-viewer iframe for
    the SQL text the model returns.

    Args:
        hub_repo_id: Dataset repository id passed on to ``get_iframe``.
        features: Feature description of the dataset, inserted verbatim into
            the prompt.
        query: The user's natural-language request.

    Returns:
        A ``(sql, iframe_html)`` tuple: the model's reply and the viewer
        iframe built for it.
    """
    system_message = {
        "role": "system",
        "content": "You are a helpful assistant that returns a DuckDB SQL query based on the user's query and dataset features. Only return the SQL query, no other text.",
    }
    user_message = {
        "role": "user",
        "content": f"""table train
# Features
{features}

# Query
{query}
""",
    }

    completion = client.chat_completion(
        messages=[system_message, user_message],
        max_tokens=1000,
        stream=False,
    )
    # The model's reply is used as-is for both the code box and the viewer URL.
    generated_sql = completion.choices[0].message.content
    viewer_html = get_iframe(hub_repo_id, generated_sql)
    return generated_sql, viewer_html


# Layout and event wiring of the app: pick a dataset, preview it in the
# embedded viewer, then generate and run SQL from a natural-language request.
with gr.Blocks() as demo:
    gr.Markdown("""# 🐥 🦙 🤗 Text To Sql Hub Datasets 🐥 🦙 🤗

                This is a basic text to SQL tool that allows you to query datasets on Huggingface Hub.
                It is built with [DuckDB](https://duckdb.org/), [Huggingface's Inference API](https://huggingface.co/docs/api-inference/index), and [LLama 3.1 70B](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct).
                Also, it uses the [dataset-server API](https://redocly.github.io/redoc/?url=https://datasets-server.huggingface.co/openapi.json#operation/isValidDataset).
                """)
    with gr.Row():
        with gr.Column():
            # The component searches datasets, so the placeholder says so.
            search_in = HuggingfaceHubSearch(
                label="Search Huggingface Hub",
                placeholder="Search for datasets on Huggingface",
                search_type="dataset",
            )

            btn = gr.Button("Show Dataset")
    with gr.Row():
        search_out = gr.HTML(label="Search Results")
    with gr.Row():
        # Hidden holder for the features JSON that is later fed to the prompt.
        features = gr.Code(label="Features", language="json", visible=False)
    with gr.Row():
        query = gr.Textbox(label="Query", placeholder="Enter a query to generate SQL")
    with gr.Row():
        sql_out = gr.Code(label="SQL Query")
    with gr.Row():
        btn2 = gr.Button("Query Dataset")

    # Selecting a dataset first renders the viewer, then loads its features.
    gr.on(
        [btn.click, search_in.submit],
        fn=get_iframe,
        inputs=[search_in],
        outputs=[search_out],
    ).then(
        fn=get_column_info,
        inputs=[search_in],
        outputs=[features],
    )

    # Generate SQL for the request and reload the viewer with that query.
    btn2.click(
        fn=query_dataset,
        inputs=[search_in, features, query],
        outputs=[sql_out, search_out],
    )

# Launch the Gradio app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()