Spaces:
Sleeping
Sleeping
""" | |
TODOS: | |
- Improve prompts | |
- Improve model usage (Quantization?) | |
- Improve error handling | |
- Add more tests | |
- Improve response in a friendly way | |
""" | |
import gradio as gr | |
from gradio_huggingfacehub_search import HuggingfaceHubSearch | |
import duckdb | |
import pandas as pd | |
import requests | |
from outlines import prompt | |
from transformers import AutoTokenizer, AutoModelForCausalLM | |
import spaces | |
import json | |
import torch | |
import logging | |
BASE_DATASETS_SERVER_URL = "https://datasets-server.huggingface.co" | |
logger = logging.getLogger(__name__) | |
""" | |
Methods for generating potential questions and SQL queries | |
""" | |
device = "cuda" | |
gemma_model_id = "google/gemma-2b-it" | |
gemma_tokenizer = AutoTokenizer.from_pretrained(gemma_model_id) | |
gemma_model = AutoModelForCausalLM.from_pretrained( | |
gemma_model_id, | |
device_map="auto", | |
torch_dtype=torch.bfloat16 | |
) | |
def generate_potential_questions_with_gemma(prompt): | |
input_ids = gemma_tokenizer(prompt, return_tensors="pt").to(device) | |
outputs = gemma_model.generate(**input_ids, max_new_tokens=1024) | |
return gemma_tokenizer.decode(outputs[0], skip_special_tokens=True) | |
def prompt_for_questions(dataset, schema, first_rows): | |
""" | |
You are a data analyst tasked with exploring a dataset named {{ dataset }}. | |
Below is the dataset schema in SQL format along with a sample of 3 rows: | |
{{ schema }} | |
Sample rows: | |
{% for example in first_rows %} | |
{{ example}} | |
{% endfor %} | |
Your goal is to generate a list of 5 potential questions that a user might want | |
to ask about this dataset. Consider the information contained in the provided | |
columns and rows, and try to think of meaningful questions that could | |
provide insights or useful information. For each question, provide the SQL query | |
that would extract the relevant information from the dataset. | |
Ouput JSON format: | |
{ | |
"questions": [ | |
{"question": [Insert question here]", "sql_query": "[Insert SQL query here]"}, | |
{"question": [Insert question here]", "sql_query": "[Insert SQL query here]"}, | |
{"question": [Insert question here]", "sql_query": "[Insert SQL query here]"}, | |
{"question": [Insert question here]", "sql_query": "[Insert SQL query here]"}, | |
{"question": [Insert question here]", "sql_query": "[Insert SQL query here]"}, | |
] | |
} | |
Please ensure that each SQL query retrieves relevant information from the dataset to answer the corresponding question accurately. | |
Return only the JSON object, do not add extra information. | |
""" | |
""" | |
Methods for generating and SQL based on a user request | |
""" | |
mother_duckdb_model_id = "motherduckdb/DuckDB-NSQL-7B-v0.1" | |
mother_duck_tokenizer = AutoTokenizer.from_pretrained(mother_duckdb_model_id) | |
mother_duck_model = AutoModelForCausalLM.from_pretrained( | |
mother_duckdb_model_id, | |
device_map="auto", | |
torch_dtype=torch.bfloat16 | |
) | |
def generate_sql_with_mother_duck(prompt): | |
input_ids = mother_duck_tokenizer(prompt, return_tensors="pt").to(device).input_ids | |
generated_ids = mother_duck_model.generate(input_ids, max_length=1024) | |
return mother_duck_tokenizer.decode(generated_ids[0], skip_special_tokens=True) | |
def prompt_for_sql(ddl_create, query_input): | |
""" | |
### Instruction: | |
Your task is to generate valid duckdb SQL to answer the following question. | |
### Input: | |
Here is the database schema that the SQL query will run on: | |
{{ ddl_create }} | |
### Question: | |
{{ query_input }} | |
### Response (use duckdb shorthand if possible): | |
""" | |
""" | |
Datasets Viewer Methods | |
https://huggingface.co./docs/datasets-server/index | |
""" | |
def get_first_parquet(dataset: str): | |
resp = requests.get(f"{BASE_DATASETS_SERVER_URL}/parquet?dataset={dataset}") | |
return resp.json()["parquet_files"][0] | |
def get_dataset_schema(parquet_url: str): | |
con = duckdb.connect() | |
con.execute(f"CREATE TABLE data as SELECT * FROM '{parquet_url}' LIMIT 1;") | |
result = con.sql("SELECT sql FROM duckdb_tables() where table_name ='data';").df() | |
ddl_create = result.iloc[0,0] | |
con.close() | |
return ddl_create | |
def get_first_rows_as_df(dataset: str, config: str, split: str, limit:int): | |
resp = requests.get(f"{BASE_DATASETS_SERVER_URL}/first-rows?dataset={dataset}&config={config}&split={split}") | |
rows = resp.json()["rows"] | |
rows = [row['row'] for row in rows] | |
return pd.DataFrame.from_dict(rows).sample(frac = 1).head(limit) | |
""" | |
Main logic, to get the recommended queries | |
""" | |
def get_recommended_queries(dataset: str): | |
ddl_create, prompt = "", "" | |
try: | |
first_split = get_first_parquet(dataset) | |
df_first_rows = get_first_rows_as_df(dataset, first_split["config"], first_split["split"], 3) | |
first_parquet_url = first_split["url"] | |
logger.info(f"First parquet URL: {first_parquet_url}") | |
ddl_create = get_dataset_schema(first_parquet_url) | |
prompt = prompt_for_questions(dataset, ddl_create, df_first_rows.to_dict('records')) | |
txt_questions = generate_potential_questions_with_gemma(prompt).split("``json")[1].replace('\n', ' ').strip()[:-4] | |
data = json.loads(txt_questions) | |
questions = data["questions"] | |
potential_questions = [] | |
for question in questions: | |
try: | |
sql = question["sql_query"].replace("FROM data", f"FROM '{first_parquet_url}'") | |
result = duckdb.sql(sql).df() | |
potential_questions.append({"question": question["question"], "result": result, "sql_query": sql}) | |
continue | |
except Exception as err: | |
logger.error(f"Error in running SQL query: {question['sql_query']} {err}") | |
mother_duck_prompt = prompt_for_sql(ddl_create, question["question"]) | |
sql = generate_sql_with_mother_duck(mother_duck_prompt).split("### Response (use duckdb shorthand if possible):")[-1].strip() | |
sql = sql.replace("FROM data", f"FROM '{first_parquet_url}'") | |
try: | |
result = duckdb.sql(sql).df() | |
potential_questions.append({"question": question["question"], "result": result, "sql_query": sql}) | |
except: | |
pass | |
df_result = pd.DataFrame(potential_questions) | |
except Exception as err: | |
logger.error(f"Error in getting recommended queries: {err}") | |
return { | |
gr_txt_ddl: ddl_create, | |
gr_txt_prompt: prompt, | |
gr_df_result: pd.DataFrame([{"error": f"β {err=}"}]) | |
} | |
return { | |
gr_txt_ddl: ddl_create, | |
gr_txt_prompt: prompt, | |
gr_df_result: df_result | |
} | |
def preview_dataset(dataset: str): | |
try: | |
first_split = get_first_parquet(dataset) | |
df = get_first_rows_as_df(dataset, first_split["config"], first_split["split"], 4) | |
except Exception as err: | |
df = pd.DataFrame([{"Unable to preview dataset": f"β {err=}"}]) | |
return { | |
gr_df_first_rows: df | |
} | |
with gr.Blocks() as demo: | |
gr.Markdown("# π« Dataset Insights Explorer π«") | |
gr_dataset_name = HuggingfaceHubSearch( | |
label="Hub Dataset ID", | |
placeholder="Search for dataset id on Huggingface", | |
search_type="dataset", | |
value="jamescalam/world-cities-geo", | |
) | |
gr_preview_btn = gr.Button("Preview Dataset") | |
gr_df_first_rows = gr.DataFrame(datatype="markdown") | |
gr_recommend_btn = gr.Button("Show Insights") | |
gr_df_result = gr.DataFrame(datatype="markdown") | |
with gr.Accordion("Open for details", open=False): | |
gr_txt_ddl = gr.Textbox(label="Dataset as CREATE DDL", interactive= False) | |
gr_txt_prompt = gr.Textbox(label="Generated prompt to get recommended questions", interactive= False) | |
gr_preview_btn.click(preview_dataset, inputs=[gr_dataset_name], outputs=[gr_df_first_rows]) | |
gr_recommend_btn.click(get_recommended_queries, inputs=[gr_dataset_name], outputs=[gr_txt_ddl, gr_txt_prompt, gr_df_result]) | |
demo.launch() | |