|
import gradio as gr |
|
from huggingface_hub import InferenceClient |
|
import os |
|
import pandas as pd |
|
from typing import List, Dict, Tuple |
|
import json |
|
import io |
|
import traceback |
|
import csv |
|
from openai import OpenAI |
|
from functools import lru_cache |
|
from concurrent.futures import ThreadPoolExecutor |
|
import math |
|
|
|
|
|
# Global CSS injected into the Gradio app: hides the footer, fixes chatbot
# pane height/scrolling, input colours, upload-area height and description
# font sizes.
# NOTE(review): the Korean comments inside this CSS (and all Korean strings in
# this file) appear mojibake-encoded — the file's original encoding should be
# restored before further edits to the string contents.
css = """
footer {
visibility: hidden;
}
#chatbot-container, #chatbot-data-upload {
height: 700px;
overflow-y: scroll;
}
#chatbot-container .message, #chatbot-data-upload .message {
font-size: 14px;
}
/* ์๋ ฅ์ฐฝ ๋ฐฐ๊ฒฝ์ ๋ฐ ๊ธ์์ ๋ณ๊ฒฝ */
textarea, input[type="text"] {
background-color: #ffffff;
color: #000000;
}
/* ํ์ผ ์๋ก๋ ์์ญ ๋์ด ์กฐ์ */
#parquet-upload-area {
max-height: 150px;
overflow-y: auto;
}
/* ์ด๊ธฐ ์ค๋ช๊ธ์จ ํฌ๊ธฐ ์กฐ์ */
#initial-description {
font-size: 14px;
}
/* API Key ์๋ ฅ ์น์์คํ์ผ */
.api-key-section {
margin: 10px 0;
padding: 10px;
border: 1px solid #ddd;
border-radius: 5px;
}
.api-key-status {
margin-top: 5px;
font-weight: bold;
}
"""
|
|
|
|
|
# Hugging Face inference client for the Cohere Command-R+ model, authenticated
# via the HF_TOKEN environment variable.
# NOTE(review): `hf_client` is never referenced anywhere else in this file —
# all chat calls below go through the per-request OpenAI client. Confirm this
# is intentional (it may be dead code from an earlier backend).
hf_client = InferenceClient(
    "CohereForAI/c4ai-command-r-plus-08-2024", token=os.getenv("HF_TOKEN")
)
|
|
|
def load_code(filename: str) -> str:
    """Read a UTF-8 text file and return its contents.

    Args:
        filename: Path of the file to read.

    Returns:
        The file's contents, or a Korean error message when the file is
        missing or unreadable (errors are reported, never raised).
    """
    try:
        with open(filename, 'r', encoding='utf-8') as file:
            return file.read()
    except FileNotFoundError:
        # Bug fix: the original f-string contained no placeholder and always
        # printed the literal "(unknown)"; interpolate the actual filename.
        return f"{filename} ํ์ผ์ ์ฐพ์ ์ ์์ต๋๋ค."
    except Exception as e:
        return f"ํ์ผ์ ์ฝ๋ ์ค ์ค๋ฅ๊ฐ ๋ฐ์ํ์ต๋๋ค: {str(e)}"
|
|
|
def load_parquet(filename: str) -> str:
    """Load a Parquet file and return its first 10 rows as a markdown table.

    Args:
        filename: Path of the Parquet file.

    Returns:
        A markdown preview of the first 10 rows, or a Korean error message
        when the file is missing or unreadable (errors are reported, never
        raised).
    """
    try:
        df = pd.read_parquet(filename, engine='pyarrow')
        return df.head(10).to_markdown(index=False)
    except FileNotFoundError:
        # Bug fix: the original f-string contained no placeholder and always
        # printed the literal "(unknown)"; interpolate the actual filename.
        return f"{filename} ํ์ผ์ ์ฐพ์ ์ ์์ต๋๋ค."
    except Exception as e:
        return f"ํ์ผ์ ์ฝ๋ ์ค ์ค๋ฅ๊ฐ ๋ฐ์ํ์ต๋๋ค: {str(e)}"
|
|
|
def clean_response(text: str) -> str:
    """Remove duplicate sentences from *text*, case/whitespace-insensitively.

    Sentences are split on '.', stripped, deduplicated by their lowercased
    whitespace-normalised form, rejoined with '. ', and a trailing period is
    appended to any non-empty result.
    """
    seen_keys = set()
    kept = []
    for raw in text.split('.'):
        candidate = raw.strip()
        if not candidate:
            continue
        # Normalised comparison key: lowercase, collapsed internal whitespace.
        key = ' '.join(candidate.lower().split())
        if key in seen_keys:
            continue
        seen_keys.add(key)
        kept.append(candidate)

    result = '. '.join(kept)
    if result and not result.endswith('.'):
        result += '.'
    return result
|
|
|
def remove_duplicates(text: str) -> str:
    """Drop exact (case-sensitive) duplicate sentences and rejoin with '. '.

    Unlike clean_response, no case folding or whitespace normalisation is
    applied and no trailing period is appended.
    """
    seen = set()
    result = []
    for part in text.split('.'):
        part = part.strip()
        if part and part not in seen:
            seen.add(part)
            result.append(part)
    return '. '.join(result)
|
|
|
def upload_csv(file_path: str) -> Tuple[str, str]:
    """Validate an uploaded CSV, normalise it and convert it to Parquet.

    The CSV must contain the columns id, text, label and metadata. Duplicate
    rows are dropped, missing values filled with '', and column dtypes fixed
    before writing a snappy-compressed Parquet file next to the CWD.

    Returns:
        (status message, parquet filename); the filename is '' on failure.
    """
    required_columns = {'id', 'text', 'label', 'metadata'}
    try:
        frame = pd.read_csv(file_path, sep=',')

        missing = required_columns - set(frame.columns)
        if missing:
            return f"CSV ํ์ผ์ ๋ค์ ํ์ ์ปฌ๋ผ์ด ๋๋ฝ๋์์ต๋๋ค: {', '.join(missing)}", ""

        # Normalise: dedupe rows, blank out NaNs, pin dtypes.
        frame.drop_duplicates(inplace=True)
        frame.fillna('', inplace=True)
        frame = frame.astype({'id': 'int32', 'text': 'string', 'label': 'category', 'metadata': 'string'})

        out_name = os.path.splitext(os.path.basename(file_path))[0] + '.parquet'
        frame.to_parquet(out_name, engine='pyarrow', compression='snappy')
        return f"{out_name} ํ์ผ์ด ์ฑ๊ณต์ ์ผ๋ก ์๋ก๋๋๊ณ ๋ณํ๋์์ต๋๋ค.", out_name
    except Exception as e:
        return f"CSV ํ์ผ ์๋ก๋ ๋ฐ ๋ณํ ์ค ์ค๋ฅ๊ฐ ๋ฐ์ํ์ต๋๋ค: {str(e)}", ""
|
|
|
def upload_parquet(file_path: str) -> Tuple[str, str, str]:
    """Load a Parquet file and build a human-readable summary of it.

    Args:
        file_path: Path to the uploaded Parquet file.

    Returns:
        (status message, markdown summary, records-JSON). The last two are
        empty strings on failure. The JSON string is stored in Gradio state
        and later passed to `respond` as dataset context.
    """
    try:
        df = pd.read_parquet(file_path, engine='pyarrow')

        # Basic dataset facts reused when building the summary below.
        data_info = {
            "์ด ๋ ์ฝ๋ ์": len(df),
            "์ปฌ๋ผ ๋ชฉ๋ก": list(df.columns),
            "๋ฐ์ดํฐ ํ์": df.dtypes.to_dict(),
            "๊ฒฐ์ธก์น ์ ๋ณด": df.isnull().sum().to_dict()
        }

        # Markdown summary: dataset-level info first...
        summary = []
        summary.append(f"### ๋ฐ์ดํฐ์๊ธฐ๋ณธ ์ ๋ณด:")
        summary.append(f"- ์ด ๋ ์ฝ๋ ์: {data_info['์ด ๋ ์ฝ๋ ์']}")
        summary.append(f"- ์ปฌ๋ผ ๋ชฉ๋ก: {', '.join(data_info['์ปฌ๋ผ ๋ชฉ๋ก'])}")

        # ...then per-column stats: numeric columns get mean/min/max;
        # object/string columns get a distinct-value count plus a top-5
        # breakdown when there are fewer than 10 distinct values.
        summary.append("\n### ์ปฌ๋ผ๋ณ ์ ๋ณด:")
        for col in df.columns:
            if df[col].dtype in ['int64', 'float64']:
                stats = df[col].describe()
                summary.append(f"\n{col} (์์นํ):")
                summary.append(f"- ํ๊ท : {stats['mean']:.2f}")
                summary.append(f"- ์ต์: {stats['min']}")
                summary.append(f"- ์ต๋: {stats['max']}")
            elif df[col].dtype == 'object' or df[col].dtype == 'string':
                unique_count = df[col].nunique()
                summary.append(f"\n{col} (ํ์คํธ):")
                summary.append(f"- ๊ณ ์ ๊ฐ ์: {unique_count}")
                if unique_count < 10:
                    value_counts = df[col].value_counts().head(5)
                    summary.append("- ์์ 5๊ฐ ๊ฐ:")
                    for val, count in value_counts.items():
                        summary.append(f"  โข {val}: {count}๊ฐ")

        # First 10 rows rendered as a markdown table preview.
        preview = df.head(10).to_markdown(index=False)
        summary.append("\n### ๋ฐ์ดํฐ ๋ฏธ๋ฆฌ๋ณด๊ธฐ:")
        summary.append(preview)

        parquet_content = "\n".join(summary)
        # Full dataset serialised for the chat tab's state (not just 10 rows).
        parquet_json = df.to_json(orient='records', force_ascii=False)

        return "Parquet ํ์ผ์ด ์ฑ๊ณต์ ์ผ๋ก ์๋ก๋๋์์ต๋๋ค.", parquet_content, parquet_json
    except Exception as e:
        return f"Parquet ํ์ผ ์๋ก๋ ์ค ์ค๋ฅ๊ฐ ๋ฐ์ํ์ต๋๋ค: {str(e)}", "", ""
|
|
|
def text_to_parquet(text: str) -> Tuple[str, str, str]:
    """Parse CSV-like lines ("id,text,label,metadata") and save them as Parquet.

    Each non-empty input line is matched against an id/text/label/metadata
    pattern (the metadata field may contain commas); surrounding quotes are
    stripped from the three text fields. Unparseable lines are logged and
    skipped.

    Returns:
        (status message, markdown preview, parquet filename). Preview and
        filename are empty strings when nothing could be parsed or on error.
    """
    # Perf/style fix: the original re-ran `import re` and re-built the regex
    # inside the per-line loop; both are loop-invariant and hoisted here.
    import re
    line_pattern = re.compile(r'(\d+),([^,]+),([^,]+),(.+)')

    try:
        lines = [line.strip() for line in text.split('\n') if line.strip()]
        records = []

        for line in lines:
            try:
                match = line_pattern.match(line)
                if match:
                    id_val, text_val, label_val, metadata_val = match.groups()
                    records.append({
                        'id': int(id_val),
                        'text': text_val.strip().strip('"'),
                        'label': label_val.strip().strip('"'),
                        'metadata': metadata_val.strip().strip('"'),
                    })
            except Exception as e:
                # Best-effort: report the bad line and keep going.
                print(f"๋ผ์ธ ํ์ฑ ์ค๋ฅ: {line}\n{str(e)}")
                continue

        if not records:
            return "๋ณํํ ๋ฐ์ดํฐ๊ฐ ์์ต๋๋ค.", "", ""

        df = pd.DataFrame(records)
        df = df.astype({
            'id': 'int32',
            'text': 'string',
            'label': 'string',
            'metadata': 'string'
        })

        parquet_filename = 'text_to_parquet.parquet'
        df.to_parquet(parquet_filename, engine='pyarrow', compression='snappy')
        preview = df.to_markdown(index=False)

        return (
            f"{parquet_filename} ํ์ผ์ด ์ฑ๊ณต์ ์ผ๋ก ๋ณํ๋์์ต๋๋ค. ์ด {len(df)}๊ฐ์ ๋ ์ฝ๋๊ฐ ์ฒ๋ฆฌ๋์์ต๋๋ค.",
            preview,
            parquet_filename
        )

    except Exception as e:
        error_message = f"ํ์คํธ ๋ณํ ์ค ์ค๋ฅ๊ฐ ๋ฐ์ํ์ต๋๋ค: {str(e)}"
        print(f"{error_message}\n{traceback.format_exc()}")
        return error_message, "", ""
|
|
|
def respond(message: str, history: List[Dict[str, str]], system_message: str = "", max_tokens: int = 4000, temperature: float = 0.5, top_p: float = 0.9, parquet_data: str = None, api_key: str = None) -> str:
    """Stream an assistant reply for *message* via the OpenAI chat API.

    Despite the `-> str` annotation this is a generator: it yields
    progressively longer, duplicate-cleaned response strings so Gradio can
    stream partial output.

    Args:
        message: Current user message.
        history: Prior chat turns as {"role": ..., "content": ...} dicts.
        system_message: Accepted but never used below — the system prompt is
            the hard-coded `system_prefix` (NOTE(review): confirm intent).
        max_tokens / temperature / top_p: Sampling parameters forwarded to
            the API.
        parquet_data: JSON-serialised dataset records used as context, or
            None/empty when no dataset was uploaded.
        api_key: OpenAI API key; a warning is yielded and the generator
            returns when missing.
    """
    if not api_key:
        yield "โ ๏ธ API Key๊ฐ ์ค์ ๋์ง ์์์ต๋๋ค. ์๋น์ค ์ด์ฉ์ ์ํด API Key๋ฅผ ์๋ ฅํด์ฃผ์ธ์."
        return

    # A fresh client per call — the key is user-supplied at runtime.
    client = OpenAI(api_key=api_key)

    # Base system prompt (Korean): answer in Korean from the uploaded data,
    # concisely and without repeating earlier answers.
    system_prefix = """๋ฐ๋์ ํ๊ธ๋ก ๋ต๋ณํ  ๊ฒ. ๋๋ ์๋ก๋๋ ๋ฐ์ดํฐ๋ฅผ ๊ธฐ๋ฐ์ผ๋ก ์ง๋ฌธ์ ๋ต๋ณํ๋ ์ญํ ์ ํ๋ค.

์ฃผ์ ์ง์นจ:
1. ์ง๋ฌธ๊ณผ ์ง์  ๊ด๋ จ๋ ๋ด์ฉ๋ง ๊ฐ๋จ๋ช๋ฃํ๊ฒ ๋ต๋ณํ  ๊ฒ
2. ์ด์  ๋ต๋ณ๊ณผ ์ค๋ณต๋๋ ๋ด์ฉ์ ์ ์ธํ  ๊ฒ
3. ๋ถํ์ํ ์์๋ ๋ถ์ฐ ์ค๋ช์ ํ์ง ๋ง ๊ฒ
4. ๋์ผํ ๋ด์ฉ์ ๋ค๋ฅธ ํํ์ผ๋ก ๋ฐ๋ณตํ์ง ๋ง ๊ฒ
5. ํต์ฌ ์ ๋ณด๋ง ์ ๋ฌํ  ๊ฒ
"""

    # Append a pandas statistical summary of the uploaded dataset, if any;
    # a failure to parse it is logged but does not abort the chat.
    if parquet_data:
        try:
            df = pd.read_json(io.StringIO(parquet_data))
            data_summary = df.describe(include='all').to_string()
            system_prefix += f"\n\n๋ฐ์ดํฐ ์์ฝ:\n{data_summary}"
        except Exception as e:
            print(f"๋ฐ์ดํฐ ๋ก๋ ์ค๋ฅ: {str(e)}")

    # Only the last three turns are forwarded, keeping the prompt small.
    messages = [{"role": "system", "content": system_prefix}]
    recent_history = history[-3:] if history else []
    for chat in recent_history:
        messages.append({"role": chat["role"], "content": chat["content"]})

    messages.append({"role": "user", "content": message})

    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=messages,
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            stream=True
        )

        # Accumulate streamed deltas; each yield is the full cleaned text so
        # far, so the UI always shows a consistent snapshot.
        full_response = ""
        for chunk in response:
            if chunk.choices[0].delta.content:
                full_response += chunk.choices[0].delta.content
                yield clean_response(full_response)

    except Exception as e:
        error_message = f"์๋ต ์์ฑ ์ค ์ค๋ฅ ๋ฐ์: {str(e)}"
        print(f"{error_message}\n{traceback.format_exc()}")
        yield error_message
|
|
|
def preprocess_text_with_llm(input_text: str, api_key: str = None) -> str:
    """Convert raw text into the "id,text,label,metadata" CSV dataset format
    using an OpenAI chat model.

    Args:
        input_text: Raw text to preprocess.
        api_key: OpenAI API key; a Korean warning string is returned when
            absent.

    Returns:
        The cleaned CSV-formatted text on success, otherwise a Korean
        warning/error message (errors are reported, never raised).
    """
    if not api_key:
        return "โ ๏ธ API Key๊ฐ ์ค์ ๋์ง ์์์ต๋๋ค. ์๋น์ค ์ด์ฉ์ ์ํด API Key๋ฅผ ์๋ ฅํด์ฃผ์ธ์."

    client = OpenAI(api_key=api_key)

    # Korean system prompt describing the target CSV schema and label set.
    system_prompt = """๋ฐ๋์ ํ๊ธ(ํ๊ตญ์ด)๋ก ๋ต๋ณํ์์ค. ๋น์ ์ ๋ฐ์ดํฐ ์ ์ฒ๋ฆฌ ์ ๋ฌธ๊ฐ์๋๋ค. ์๋ ฅ๋ ํ์คํธ๋ฅผ CSV ๋ฐ์ดํฐ์ํ์์ผ๋ก ๋ณํํ์ธ์.

๊ท์น:
1. ์ถ๋ ฅ ํ์: id,text,label,metadata
2. id: 1๋ถํฐ ์์ํ๋ ์์ฐจ์  ๋ฒํธ
3. text: ์๋ฏธ ์๋ ๋จ์๋ก ๋ถ๋ฆฌ๋ ํ์คํธ
4. label: ํ์คํธ์ ์ฃผ์ ๋ ์นดํ๊ณ ๋ฆฌ๋ฅผ ์๋ ๊ธฐ์ค์ผ๋ก ์ ํํ๊ฒ ํ ๊ฐ๋ง ์ ํ
   - Historical_Figure (์ญ์ฌ์  ์ธ๋ฌผ)
   - Military_History (๊ตฐ์ฌ ์ญ์ฌ)
   - Technology (๊ธฐ์ )
   - Politics (์ ์น)
   - Culture (๋ฌธํ)
5. metadata: ๋ ์ง, ์ถ์ฒ ๋ฑ ์ถ๊ฐ ์ ๋ณด"""

    try:
        response = client.chat.completions.create(
            model="gpt-4-0125-preview",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": input_text}
            ],
            max_tokens=4000,
            temperature=0.1,
            stream=True
        )

        # Collect the full streamed completion before post-processing.
        full_response = ""
        for chunk in response:
            if chunk.choices[0].delta.content:
                full_response += chunk.choices[0].delta.content

        processed_text = clean_response(full_response)

        # Bug fix: csv.reader is lazy, so the original validation never
        # actually parsed anything and `except csv.Error` was unreachable.
        # Consuming the reader with list() forces parsing, making the
        # format check real. (Local re-imports of csv/StringIO dropped —
        # both modules are already imported at file scope.)
        try:
            list(csv.reader(io.StringIO(processed_text)))
            return processed_text
        except csv.Error:
            return "LLM์ด ์ฌ๋ฐ๋ฅธ CSV ํ์์ ์์ฑํ์ง ๋ชปํ์ต๋๋ค. ๋ค์ ์๋ํด์ฃผ์ธ์."

    except Exception as e:
        error_message = f"์ ์ฒ๋ฆฌ ์ค ์ค๋ฅ๊ฐ ๋ฐ์ํ์ต๋๋ค: {str(e)}"
        print(error_message)
        return error_message
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# UI layout. `demo` is launched at the bottom of the file.
# ---------------------------------------------------------------------------
with gr.Blocks(css=css) as demo:
    # Holds the user-supplied OpenAI API key for the whole session.
    api_key_state = gr.State("")

    gr.Markdown("# MyEzRAG: LLM์ด ๋๋ง์ ๋ฐ์ดํฐ๋ก ํ์ตํ ์ฝํ์ธ ์์ฑ/๋ต๋ณ", elem_id="initial-description")

    # API-key entry row: password textbox plus a "set" button.
    with gr.Row(elem_classes="api-key-section"):
        with gr.Column(scale=3):
            api_key_input = gr.Textbox(
                label="OpenAI API Key",
                placeholder="sk-...",
                type="password",
                show_label=True
            )
        with gr.Column(scale=1):
            api_key_button = gr.Button("API Key ์ค์ ", variant="primary")

    # Status line updated by set_api_key() below.
    api_key_status = gr.Markdown("โ ๏ธ API Key๊ฐ ์ค์ ๋์ง ์์์ต๋๋ค. ์๋น์ค ์ด์ฉ์ ์ํด API Key๋ฅผ ์๋ ฅํด์ฃผ์ธ์.", elem_classes="api-key-status")
|
|
|
|
|
def set_api_key(api_key: str): |
|
if not api_key.strip(): |
|
return "โ ๏ธ API Key๊ฐ ์ค์ ๋์ง ์์์ต๋๋ค. ์๋น์ค ์ด์ฉ์ ์ํด API Key๋ฅผ ์
๋ ฅํด์ฃผ์ธ์.", "" |
|
if not api_key.startswith("sk-"): |
|
return "โ ์ฌ๋ฐ๋ฅด์ง ์์ API Key ํ์์
๋๋ค. ๋ค์ ํ์ธํด์ฃผ์ธ์.", "" |
|
return "โ
API Key๊ฐ ์ฑ๊ณต์ ์ผ๋ก ์ค์ ๋์์ต๋๋ค.", api_key |
|
|
|
|
|
    # Wire the API-key button to validation and state storage.
    api_key_button.click(
        set_api_key,
        inputs=[api_key_input],
        outputs=[api_key_status, api_key_state]
    )

    gr.Markdown(
        "### '์ฌ์ฉ ๋ฐฉ๋ฒ' ํญ์ ํตํด ์์ธํ ์ด์ฉ ๋ฐฉ๋ฒ์ ์ฐธ๊ณ ํ์ธ์.\n"
        "### Tip) '์์ '๋ฅผ ํตํด ๋ค์ํ ํ์ฉ ๋ฐฉ๋ฒ์ ์ฒดํํ๊ณ  ์์ฉํด ๋ณด์ธ์, ๋ฐ์ดํฐ์์๋ก๋์ ๋ฏธ๋ฆฌ๋ณด๊ธฐ๋ 10๊ฑด๋ง ์ถ๋ ฅ",
        elem_id="initial-description"
    )

    # ---- Tab 1: chat against the uploaded dataset ----
    with gr.Tab("My ๋ฐ์ดํฐ์+LLM"):
        gr.Markdown("### LLM๊ณผ ๋ํํ๊ธฐ")
        chatbot_data_upload = gr.Chatbot(label="์ฑ๋ด", type="messages", elem_id="chatbot-data-upload")
        msg_data_upload = gr.Textbox(label="๋ฉ์์ง ์๋ ฅ", placeholder="์ฌ๊ธฐ์ ๋ฉ์์ง๋ฅผ ์๋ ฅํ์ธ์...")
        send_data_upload = gr.Button("์ ์ก")

        # Advanced sampling options, collapsed by default.
        with gr.Accordion("์์คํํ๋กฌํํธ ๋ฐ ์ต์์ค์ ", open=False):
            system_message = gr.Textbox(label="System Message", value="๋๋ AI ์กฐ์ธ์ ์ญํ ์ด๋ค.")
            max_tokens = gr.Slider(minimum=1, maximum=8000, value=1000, label="Max Tokens")
            temperature = gr.Slider(minimum=0, maximum=1, value=0.7, label="Temperature")
            top_p = gr.Slider(minimum=0, maximum=1, value=0.9, label="Top P")

        # JSON-serialised dataframe produced by the Parquet upload below.
        parquet_data_state = gr.State()
|
|
|
def handle_message_data_upload(message: str, history: List[Dict[str, str]], system_message: str, max_tokens: int, temperature: float, top_p: float, parquet_data: str, api_key: str): |
|
if not api_key: |
|
history = history or [] |
|
history.append({"role": "assistant", "content": "โ ๏ธ API Key๊ฐ ์ค์ ๋์ง ์์์ต๋๋ค. ์๋น์ค ์ด์ฉ์ ์ํด API Key๋ฅผ ์
๋ ฅํด์ฃผ์ธ์."}) |
|
yield history, "" |
|
return |
|
|
|
history = history or [] |
|
recent_questions = [chat['content'].strip().lower() for chat in history[-3:] if chat['role'] == 'user'] |
|
if message.strip().lower() in recent_questions: |
|
yield history + [{"role": "assistant", "content": "๋์ผํ ์ง๋ฌธ์ด ์ต๊ทผ์ ์์์ต๋๋ค. ๋ค๋ฅธ ์ง๋ฌธ์ ํด์ฃผ์ธ์."}], "" |
|
return |
|
|
|
try: |
|
history.append({"role": "user", "content": message}) |
|
response_gen = respond( |
|
message, |
|
history, |
|
system_message, |
|
max_tokens, |
|
temperature=0.3, |
|
top_p=top_p, |
|
parquet_data=parquet_data, |
|
api_key=api_key |
|
) |
|
|
|
partial_response = "" |
|
for partial in response_gen: |
|
partial_response = partial |
|
display_history = history + [{"role": "assistant", "content": partial_response}] |
|
yield display_history, "" |
|
|
|
history.append({"role": "assistant", "content": partial_response}) |
|
except Exception as e: |
|
response = f"์ค๋ฅ ๋ฐ์: {str(e)}" |
|
history.append({"role": "assistant", "content": response}) |
|
yield history, "" |
|
|
|
        # Send button → streaming chat handler.
        send_data_upload.click(
            handle_message_data_upload,
            inputs=[
                msg_data_upload,
                chatbot_data_upload,
                system_message,
                max_tokens,
                temperature,
                top_p,
                parquet_data_state,
                api_key_state,
            ],
            outputs=[chatbot_data_upload, msg_data_upload],
            queue=True
        )

        # Canned prompts the user can click into the message box.
        with gr.Accordion("์์ ", open=False):
            gr.Examples(
                examples=[
                    ["์๋ก๋๋ ๋ฐ์ดํฐ์์ ๋ํด ์์ฝ ์ค๋ชํ๋ผ."],
                    ["์๋ก๋๋ ๋ฐ์ดํฐ์ํ์ผ์ ํ์ต ๋ฐ์ดํฐ๋ก ํ์ฉํ์ฌ, ๋ณธ ์๋น์ค๋ฅผ SEO ์ต์ ํํ์ฌ ๋ธ๋ก๊ทธ ํฌ์คํธ(๊ฐ์, ๋ฐฐ๊ฒฝ ๋ฐ ํ์์ฑ, ๊ธฐ์กด ์ ์ฌ ์ ํ/์๋น์ค์ ๋น๊ตํ์ฌ ํน์ฅ์ , ํ์ฉ์ฒ, ๊ฐ์น, ๊ธฐ๋ํจ๊ณผ, ๊ฒฐ๋ก ์ ํฌํจ)๋ก 4000 ํ ํฐ ์ด์ ์์ฑํ๋ผ"],
                    ["์๋ก๋๋ ๋ฐ์ดํฐ์ํ์ผ์ ํ์ต ๋ฐ์ดํฐ๋ก ํ์ฉํ์ฌ, ์ฌ์ฉ ๋ฐฉ๋ฒ๊ณผ ์ฐจ๋ณ์ , ํน์ง, ๊ฐ์ ์ ์ค์ฌ์ผ๋ก 4000 ํ ํฐ ์ด์ ์ ํ๋ธ ์์ ์คํฌ๋ฆฝํธ ํํ๋ก ์์ฑํ๋ผ"],
                    ["์๋ก๋๋ ๋ฐ์ดํฐ์ํ์ผ์ ํ์ต ๋ฐ์ดํฐ๋ก ํ์ฉํ์ฌ, ์ ํ ์์ธ ํ์ด์ง ํ์์ ๋ด์ฉ์ 4000 ํ ํฐ ์ด์ ์์ธํ ์ค๋ชํ๋ผ"],
                    ["์๋ก๋๋ ๋ฐ์ดํฐ์ํ์ผ์ ํ์ต ๋ฐ์ดํฐ๋ก ํ์ฉํ์ฌ, FAQ 20๊ฑด์ ์์ธํ๊ฒ ์์ฑํ๋ผ. 4000ํ ํฐ ์ด์ ์ฌ์ฉํ๋ผ."],
                    ["์๋ก๋๋ ๋ฐ์ดํฐ์ํ์ผ์ ํ์ต ๋ฐ์ดํฐ๋ก ํ์ฉํ์ฌ, ํนํ ์ถ์์ ํ์ฉํ  ๊ธฐ์  ๋ฐ ๋น์ฆ๋์ค ๋ชจ๋ธ ์ธก๋ฉด์ ํฌํจํ์ฌ ํนํ ์ถ์์ ๊ตฌ์ฑ์ ๋ง๊ฒ ํ์ ์ ์ธ ์ฐฝ์ ๋ฐ๋ช๋ด์ฉ์ ์ค์ฌ์ผ๋ก 4000 ํ ํฐ ์ด์ ์์ฑํ๋ผ."],
                ],
                inputs=msg_data_upload,
                label="์์  ์ ํ",
            )

        # Parquet upload widgets feeding parquet_data_state above.
        gr.Markdown("### Parquet ํ์ผ ์๋ก๋")
        with gr.Row():
            with gr.Column():
                parquet_upload = gr.File(
                    label="Parquet ํ์ผ ์๋ก๋", type="filepath", elem_id="parquet-upload-area"
                )
                parquet_upload_button = gr.Button("์๋ก๋")
                parquet_upload_status = gr.Textbox(label="์๋ก๋ ์ํ", interactive=False)
                parquet_preview_chat = gr.Markdown(label="Parquet ํ์ผ ๋ฏธ๋ฆฌ๋ณด๊ธฐ")
|
|
|
def handle_parquet_upload(file_path: str): |
|
message, parquet_content, parquet_json = upload_parquet(file_path) |
|
if parquet_json: |
|
return message, parquet_content, parquet_json |
|
else: |
|
return message, "", "" |
|
|
|
        # Parquet upload → status text, markdown preview and chat-state JSON.
        parquet_upload_button.click(
            handle_parquet_upload,
            inputs=parquet_upload,
            outputs=[parquet_upload_status, parquet_preview_chat, parquet_data_state]
        )

    # ---- Tab 2: CSV → Parquet conversion ----
    with gr.Tab("CSV to My ๋ฐ์ดํฐ์"):
        gr.Markdown("### CSV ํ์ผ ์๋ก๋ ๋ฐ Parquet ๋ณํ")
        with gr.Row():
            with gr.Column():
                csv_file = gr.File(label="CSV ํ์ผ ์๋ก๋", type="filepath")
                upload_button = gr.Button("์๋ก๋ ๋ฐ ๋ณํ")
                upload_status = gr.Textbox(label="์๋ก๋ ์ํ", interactive=False)
                parquet_preview = gr.Markdown(label="Parquet ํ์ผ ๋ฏธ๋ฆฌ๋ณด๊ธฐ")
                download_button = gr.File(label="Parquet ํ์ผ ๋ค์ด๋ก๋", interactive=False)
|
|
|
def handle_csv_upload(file_path: str): |
|
message, parquet_filename = upload_csv(file_path) |
|
if parquet_filename: |
|
parquet_content = load_parquet(parquet_filename) |
|
return message, parquet_content, parquet_filename |
|
else: |
|
return message, "", None |
|
|
|
        # CSV upload → status text, markdown preview and downloadable file.
        upload_button.click(
            handle_csv_upload,
            inputs=csv_file,
            outputs=[upload_status, parquet_preview, download_button]
        )

    # ---- Tab 3: free text → Parquet conversion ----
    with gr.Tab("Text to My ๋ฐ์ดํฐ์"):
        gr.Markdown("### ํ์คํธ๋ฅผ ์๋ ฅํ๋ฉด CSV๋ก ๋ณํ ํ Parquet์ผ๋ก ์๋ ์ ํ๋ฉ๋๋ค.")
        with gr.Row():
            with gr.Column():
                text_input = gr.Textbox(
                    label="ํ์คํธ ์๋ ฅ (๊ฐ ํ์ `id,text,label,metadata` ํ์์ผ๋ก ์๋ ฅ)",
                    lines=10,
                    placeholder='์: 1,"์ด์์ ","์ฅ๊ตฐ","๊ฑฐ๋ถ์ "\n2,"์๊ท ","์ฅ๊ตฐ","๋ชจํจ"\n3,"์ ์กฐ","์","์๊ธฐ"\n4,"๋์ํ ๋ฏธ ํ๋ฐ์์","์","์นจ๋ต"'
                )
                convert_button = gr.Button("๋ณํ ๋ฐ ๋ค์ด๋ก๋")
                convert_status = gr.Textbox(label="๋ณํ ์ํ", interactive=False)
                parquet_preview_convert = gr.Markdown(label="Parquet ํ์ผ ๋ฏธ๋ฆฌ๋ณด๊ธฐ")
                download_parquet_convert = gr.File(label="Parquet ํ์ผ ๋ค์ด๋ก๋", interactive=False)
|
|
|
def handle_text_to_parquet(text: str): |
|
message, parquet_content, parquet_filename = text_to_parquet(text) |
|
if parquet_filename: |
|
return message, parquet_content, parquet_filename |
|
else: |
|
return message, "", None |
|
|
|
        # Text conversion → status text, markdown preview and downloadable file.
        convert_button.click(
            handle_text_to_parquet,
            inputs=text_input,
            outputs=[convert_status, parquet_preview_convert, download_parquet_convert]
        )

    # ---- Tab 4: LLM-assisted text preprocessing ----
    with gr.Tab("Text Preprocessing with LLM"):
        gr.Markdown("### ํ์คํธ๋ฅผ ์๋ ฅํ๋ฉด LLM์ด ๋ฐ์ดํฐ์ํ์์ ๋ง๊ฒ ์ ์ฒ๋ฆฌํ์ฌ ์ถ๋ ฅํฉ๋๋ค.")
        with gr.Row():
            with gr.Column():
                raw_text_input = gr.Textbox(
                    label="ํ์คํธ ์๋ ฅ",
                    lines=15,
                    placeholder="์ฌ๊ธฐ์ ์ ์ฒ๋ฆฌํ  ํ์คํธ๋ฅผ ์๋ ฅํ์ธ์..."
                )

                with gr.Row():
                    preprocess_button = gr.Button("์ ์ฒ๋ฆฌ ์คํ", variant="primary")
                    clear_button = gr.Button("์ด๊ธฐํ")

                preprocess_status = gr.Textbox(
                    label="์ ์ฒ๋ฆฌ ์ํ",
                    interactive=False,
                    value="๋๊ธฐ ์ค..."
                )

                processed_text_output = gr.Textbox(
                    label="์ ์ฒ๋ฆฌ๋ ๋ฐ์ดํฐ์์ถ๋ ฅ",
                    lines=15,
                    interactive=False
                )

                # Secondary step: turn the preprocessed CSV text into Parquet.
                convert_to_parquet_button = gr.Button("Parquet์ผ๋ก ๋ณํ")
                download_parquet = gr.File(label="๋ณํ๋ Parquet ํ์ผ ๋ค์ด๋ก๋")
|
|
|
def handle_text_preprocessing(input_text: str, api_key: str): |
|
if not api_key: |
|
yield "โ ๏ธ API Key๊ฐ ์ค์ ๋์ง ์์์ต๋๋ค.", "" |
|
return |
|
|
|
if not input_text.strip(): |
|
yield "์
๋ ฅ ํ
์คํธ๊ฐ ์์ต๋๋ค.", "" |
|
return |
|
|
|
try: |
|
yield "์ ์ฒ๋ฆฌ๋ฅผ ์์ํฉ๋๋ค...", "" |
|
processed_text = preprocess_text_with_llm(input_text, api_key) |
|
|
|
if processed_text: |
|
yield "์ ์ฒ๋ฆฌ๊ฐ ์๋ฃ๋์์ต๋๋ค.", processed_text |
|
else: |
|
yield "์ ์ฒ๋ฆฌ ๊ฒฐ๊ณผ๊ฐ ์์ต๋๋ค.", "" |
|
|
|
except Exception as e: |
|
yield f"์ฒ๋ฆฌ ์ค ์ค๋ฅ๊ฐ ๋ฐ์ํ์ต๋๋ค: {str(e)}", "" |
|
|
|
def clear_inputs(): |
|
return "", "๋๊ธฐ ์ค...", "" |
|
|
|
def convert_to_parquet_file(processed_text: str): |
|
if not processed_text.strip(): |
|
return "๋ณํํ ํ
์คํธ๊ฐ ์์ต๋๋ค.", None |
|
|
|
try: |
|
message, parquet_content, parquet_filename = text_to_parquet(processed_text) |
|
if parquet_filename: |
|
return message, parquet_filename |
|
return message, None |
|
except Exception as e: |
|
return f"Parquet ๋ณํ ์ค ์ค๋ฅ ๋ฐ์: {str(e)}", None |
|
|
|
        # Preprocess button → streaming status + result.
        preprocess_button.click(
            handle_text_preprocessing,
            inputs=[raw_text_input, api_key_state],
            outputs=[preprocess_status, processed_text_output],
            queue=True
        )

        # Reset all three widgets of this tab.
        clear_button.click(
            clear_inputs,
            outputs=[raw_text_input, preprocess_status, processed_text_output]
        )

        # Convert the preprocessed text to a downloadable Parquet file.
        convert_to_parquet_button.click(
            convert_to_parquet_file,
            inputs=[processed_text_output],
            outputs=[preprocess_status, download_parquet]
        )

        # Sample raw texts the user can preprocess with one click.
        with gr.Accordion("์์  ํ์คํธ", open=False):
            gr.Examples(
                examples=[
                    ["์ด์์ ์ ์กฐ์  ์ค๊ธฐ์ ๋ฌด์ ์ด๋ค. ๊ทธ๋ ์์ง์๋ ๋น์ ํด๊ตฐ์ ์ด๋์๋ค. ๊ฑฐ๋ถ์ ์ ๋ง๋ค์ด ์๊ตฐ๊ณผ ์ธ์ ๋ค."],
                    ["์ธ๊ณต์ง๋ฅ์ ์ปดํจํฐ ๊ณผํ์ ํ ๋ถ์ผ์ด๋ค. ๊ธฐ๊ณํ์ต์ ์ธ๊ณต์ง๋ฅ์ ํ์ ๋ถ์ผ์ด๋ค. ๋ฅ๋ฌ๋์ ๊ธฐ๊ณํ์ต์ ํ ๋ฐฉ๋ฒ์ด๋ค."]
                ],
                inputs=raw_text_input,
                label="์์  ์ ํ"
            )
|
|
|
|
|
    # ---- Tab 5: static usage guide (markdown only, no callbacks) ----
    with gr.Tab("๐ ์ฌ์ฉ ๋ฐฉ๋ฒ"):
        gr.Markdown("""
# MyEzRAG ์ฌ์ฉ ๊ฐ์ด๋

## ๐ API Key ์ค์ 
1. OpenAI API Key๋ฅผ ์๋จ ์๋ ฅ์ฐฝ์ ์๋ ฅ
2. 'API Key ์ค์ ' ๋ฒํผ ํด๋ฆญ
3. ์ค์  ์ฑ๊ณต ๋ฉ์์ง ํ์ธ

## 1๏ธโฃ My ๋ฐ์ดํฐ์+LLM ํญ
### ๊ธฐ๋ฅ
- ์๋ก๋๋ Parquet ๋ฐ์ดํฐ์์ ๊ธฐ๋ฐ์ผ๋ก LLM๊ณผ ๋ํ
- ๋ฐ์ดํฐ์์ ๋ด์ฉ์ ํ์ฉํ ์ฝํ์ธ ์์ฑ

### ์ฌ์ฉ ๋ฐฉ๋ฒ
1. Parquet ํ์ผ ์๋ก๋ ์น์์์ ๋ฐ์ดํฐ์ํ์ผ์ ์๋ก๋
2. ์ฑํ์ฐฝ์ ์ํ๋ ์ง๋ฌธ์ด๋ ์์ฒญ์ฌํญ ์๋ ฅ
3. ์์  ๋ฒํผ์ ํ์ฉํ์ฌ ๋ค์ํ ํ์ฉ ์ฌ๋ก ์ฒดํ

### ํ
- ์์คํํ๋กฌํํธ ์ค์ ์ผ๋ก ์๋ต ์คํ์ผ ์กฐ์  ๊ฐ๋ฅ
- ์์ธํ ์ง๋ฌธ์ผ์๋ก ๋ ์ ํํ ๋ต๋ณ ์ ๊ณต

---

## 2๏ธโฃ CSV to My ๋ฐ์ดํฐ์ํญ
### ๊ธฐ๋ฅ
- CSV ํ์ผ์ Parquet ํ์์ผ๋ก ๋ณํ
- ๋ฐ์ดํฐ ์ต์ ํ ๋ฐ ์ ์ 

### ์ฌ์ฉ ๋ฐฉ๋ฒ
1. CSV ํ์ผ ์ค๋น (ํ์ ์ปฌ๋ผ: id, text, label, metadata)
2. ํ์ผ ์๋ก๋ ํ '์๋ก๋ ๋ฐ ๋ณํ' ๋ฒํผ ํด๋ฆญ
3. ๋ณํ๋ Parquet ํ์ผ ๋ค์ด๋ก๋

### ์ฃผ์์ฌํญ
- CSV ํ์ผ์ ๋ฐ๋์ ํ์ ์ปฌ๋ผ์ ํฌํจํด์ผ ํจ
- ์ธ์ฝ๋ฉ์ UTF-8 ๊ถ์ฅ

---

## 3๏ธโฃ Text to My ๋ฐ์ดํฐ์ํญ
### ๊ธฐ๋ฅ
- ํ์คํธ ํ์์ ๋ฐ์ดํฐ๋ฅผ Parquet์ผ๋ก ๋ณํ
- ์๋ ๋ฐ์ดํฐ ์๋ ฅ ์ง์

### ์ฌ์ฉ ๋ฐฉ๋ฒ
1. ์ง์ ๋ ํ์์ผ๋ก ํ์คํธ ์๋ ฅ
```
1,"์ด์์ ","์ฅ๊ตฐ","๊ฑฐ๋ถ์ "
2,"์๊ท ","์ฅ๊ตฐ","๋ชจํจ"
```
2. '๋ณํ ๋ฐ ๋ค์ด๋ก๋' ๋ฒํผ ํด๋ฆญ
3. ๋ณํ๋ ํ์ผ ํ์ธ ๋ฐ ๋ค์ด๋ก๋

### ์๋ ฅ ํ์
- id: ์์ฐจ์  ๋ฒํธ
- text: ์ค์  ํ์คํธ ๋ด์ฉ
- label: ๋ถ๋ฅ ๋ผ๋ฒจ
- metadata: ๋ถ๊ฐ ์ ๋ณด

---

## 4๏ธโฃ Text Preprocessing with LLM ํญ
### ๊ธฐ๋ฅ
- LLM์ ํ์ฉํ ์๋ ํ์คํธ ์ ์ฒ๋ฆฌ
- ๊ตฌ์กฐํ๋ ๋ฐ์ดํฐ์์์ฑ

### ์ฌ์ฉ ๋ฐฉ๋ฒ
1. ์๋ฌธ ํ์คํธ ์๋ ฅ
2. '์ ์ฒ๋ฆฌ ์คํ' ๋ฒํผ ํด๋ฆญ
3. ๊ฒฐ๊ณผ ํ์ธ ํ ํ์์ Parquet ๋ณํ

### ํน์ง
- ์๋ ๋ ์ด๋ธ๋ง
- ๋ฌธ์ฅ ๋จ์ ๋ถ๋ฆฌ
- ์ค๋ณต ์ ๊ฑฐ
- ๋ฐ์ดํฐ ์ ๊ทํ

## ๐ก ์ผ๋ฐ์ ์ธ ํ
- API Key๋ ์์ ํ๊ฒ ๋ณด๊ดํ๊ณ  ์ฃผ๊ธฐ์ ์ผ๋ก ๊ฐฑ์ 
- ๊ฐ ํญ์ ์์ ๋ฅผ ์ฐธ๊ณ ํ์ฌ ์ฌ์ฉ๋ฒ ์ตํ๊ธฐ
- ๋ฐ์ดํฐ ํ์ง์ด ์ข์์๋ก ๋ ๋์ ๊ฒฐ๊ณผ ์ ๊ณต
- ์ค๋ฅ ๋ฐ์ ์ ์๋ ฅ ๋ฐ์ดํฐ ํ์ ํ์ธ
- ๋์ฉ๋ ์ฒ๋ฆฌ ์ ์ ์ ํ ์ฒญํฌ ํฌ๊ธฐ๋ก ๋ถํ  ์ฒ๋ฆฌ

## โ ๏ธ ์ฃผ์์ฌํญ
- API Key๋ฅผ ํ์ธ๊ณผ ๊ณต์ ํ์ง ์๊ธฐ
- ๋ฏผ๊ฐํ ๊ฐ์ธ์ ๋ณด ํฌํจํ์ง ์๊ธฐ
- ๋ฐ์ดํฐ ๋ฐฑ์๊ถ์ฅ
- ๋คํธ์ํฌ ์ํ ํ์ธ
- ๋ธ๋ผ์ฐ์  ์บ์ ์ฃผ๊ธฐ์  ์ ๋ฆฌ

## ๐ ๋ฌธ์  ํด๊ฒฐ
- API Key ์ค๋ฅ: ํค ํ์ ๋ฐ ์ ํจ์ฑ ํ์ธ
- ์ค๋ฅ ๋ฐ์ ์ ์๋ ฅ ๋ฐ์ดํฐ ํ์ ํ์ธ
- ํ์ผ ์๋ก๋ ์คํจ ์ ํ์ผ ํฌ๊ธฐ ๋ฐ ํ์ ํ์ธ
- ๋ณํ ์คํจ ์ ๋ฐ์ดํฐ ์ธ์ฝ๋ฉ ํ์ธ
- ์๋ต์ด ๋๋ฆด ๊ฒฝ์ฐ ๋ฐ์ดํฐ ํฌ๊ธฐ ์กฐ์ 
""")
|
|
|
    # Contact footer shown under all tabs.
    gr.Markdown("### [email protected]", elem_id="initial-description")

# Launch with a public Gradio share link when run as a script.
if __name__ == "__main__":
    demo.launch(share=True)
|
|