# myezrag / app.py
# ginipick's picture
# Update app.py
# d81053d verified
# raw
# history blame
# 28.2 kB
import csv
import io
import os
import traceback
from concurrent.futures import ThreadPoolExecutor
from functools import lru_cache
from typing import Dict, Iterator, List, Optional, Tuple

import gradio as gr
import nltk
import pandas as pd
from huggingface_hub import InferenceClient
from nltk.tokenize import sent_tokenize
from transformers import AutoTokenizer
# Download the NLTK "punkt" data at import time (presumably what sent_tokenize relies on).
# NOTE(review): newer NLTK releases reportedly look for "punkt_tab" instead - confirm
# against the installed NLTK version.
nltk.download('punkt')
# Configure the inference API client; the access token is read from the HF_TOKEN
# environment variable (None when it is not set).
hf_client = InferenceClient(
    "CohereForAI/c4ai-command-r-plus-08-2024", token=os.getenv("HF_TOKEN")
)
@lru_cache(maxsize=1)
def _load_chunk_tokenizer():
    """Load the model tokenizer once and reuse it for every chunk_text call."""
    return AutoTokenizer.from_pretrained("CohereForAI/c4ai-command-r-plus-08-2024")


def chunk_text(text: str, chunk_size: int = 500) -> List[str]:
    """Split text into sentence-aligned chunks of at most ``chunk_size`` tokens.

    Sentences come from ``sent_tokenize`` and are measured with the model
    tokenizer (special tokens excluded). Sentences are packed greedily; a new
    chunk starts as soon as adding the next sentence would exceed
    ``chunk_size``. A single sentence longer than ``chunk_size`` is not split
    and becomes an oversized chunk of its own.

    Args:
        text: Raw input text.
        chunk_size: Token budget per chunk.

    Returns:
        The chunks in input order, each made of its sentences joined by a
        single space. Empty when no sentences are found.
    """
    # Loading the tokenizer is expensive, so it is fetched from a cached
    # helper instead of being re-created on every call.
    tokenizer = _load_chunk_tokenizer()
    chunks: List[str] = []
    current_chunk: List[str] = []
    current_length = 0
    for sentence in sent_tokenize(text):
        sentence = sentence.strip()
        sentence_length = len(tokenizer.encode(sentence, add_special_tokens=False))
        if current_length + sentence_length > chunk_size:
            # Budget exceeded: flush what has been collected and start over
            # with the current sentence.
            if current_chunk:
                chunks.append(' '.join(current_chunk))
            current_chunk = [sentence]
            current_length = sentence_length
        else:
            current_chunk.append(sentence)
            current_length += sentence_length
    if current_chunk:
        chunks.append(' '.join(current_chunk))
    return chunks
# The remaining code is kept the same as before.
@lru_cache(maxsize=100)
def cached_preprocess(text: str) -> str:
    """Memoized front-end to preprocess_single_chunk.

    Results for up to 100 distinct input texts are kept by the LRU cache, so
    repeated texts are not processed again.
    """
    cleaned = preprocess_single_chunk(text)
    return cleaned
def preprocess_single_chunk(chunk: str) -> str:
    """Ask the model to convert one text chunk into CSV-dataset rows.

    Returns the stripped model output, or a fixed user-facing error message
    when the inference call fails (the failure is printed with its traceback).
    """
    system_prompt = """๋‹น์‹ ์€ ๋ฐ์ดํ„ฐ ์ „์ฒ˜๋ฆฌ ์ „๋ฌธ๊ฐ€์ž…๋‹ˆ๋‹ค. ์ž…๋ ฅ๋œ ํ…์ŠคํŠธ๋ฅผ CSV ๋ฐ์ดํ„ฐ์…‹ ํ˜•์‹์œผ๋กœ ๋น ๋ฅด๊ฒŒ ๋ณ€ํ™˜ํ•˜์„ธ์š”.
[๊ธฐ์กด ๊ทœ์น™ ๋™์ผ]"""
    request_text = f"{system_prompt}\n\n์ž…๋ ฅํ…์ŠคํŠธ:\n{chunk}\n\n์ถœ๋ ฅ:"
    # Non-streaming call with conservative sampling parameters.
    generation_options = {
        "max_new_tokens": 2000,  # cap on generated tokens
        "temperature": 0.1,      # more deterministic output
        "top_p": 0.5,            # more focused output
        "stream": False,         # streaming disabled
    }
    try:
        raw_output = hf_client.text_generation(prompt=request_text, **generation_options)
        return raw_output.strip()
    except Exception as e:
        print(f"์ฒญํฌ ์ฒ˜๋ฆฌ ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {str(e)}\n{traceback.format_exc()}")
        return "์ฒญํฌ ์ฒ˜๋ฆฌ ์ค‘ ์˜ค๋ฅ˜๊ฐ€ ๋ฐœ์ƒํ–ˆ์Šต๋‹ˆ๋‹ค. ๊ด€๋ฆฌ์ž์—๊ฒŒ ๋ฌธ์˜ํ•˜์„ธ์š”."
def load_code(filename: str) -> str:
    """Return the UTF-8 text of ``filename``.

    Never raises: a missing file yields a "not found" message that starts with
    the file name, and any other failure is printed with its traceback and
    yields a generic error message.
    """
    try:
        with open(filename, mode='r', encoding='utf-8') as source:
            contents = source.read()
    except FileNotFoundError:
        return f"{filename} ํŒŒ์ผ์„ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค."
    except Exception as e:
        print(f"ํŒŒ์ผ ์ฝ๊ธฐ ์˜ค๋ฅ˜: {str(e)}\n{traceback.format_exc()}")
        return "ํŒŒ์ผ์„ ์ฝ๋Š” ์ค‘ ์˜ค๋ฅ˜๊ฐ€ ๋ฐœ์ƒํ–ˆ์Šต๋‹ˆ๋‹ค. ๊ด€๋ฆฌ์ž์—๊ฒŒ ๋ฌธ์˜ํ•˜์„ธ์š”."
    return contents
def load_parquet(filename: str) -> str:
    """Render the first 10 rows of a Parquet file as a Markdown table.

    Never raises: a missing file yields a "not found" message that starts with
    the file name, and any other failure is printed with its traceback and
    yields a generic error message.
    """
    try:
        frame = pd.read_parquet(filename, engine='pyarrow')
        preview_rows = frame.head(10)
        return preview_rows.to_markdown(index=False)
    except FileNotFoundError:
        return f"{filename} ํŒŒ์ผ์„ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค."
    except Exception as e:
        print(f"Parquet ํŒŒ์ผ ๋กœ๋“œ ์˜ค๋ฅ˜: {str(e)}\n{traceback.format_exc()}")
        return "ํŒŒ์ผ์„ ์ฝ๋Š” ์ค‘ ์˜ค๋ฅ˜๊ฐ€ ๋ฐœ์ƒํ–ˆ์Šต๋‹ˆ๋‹ค. ๊ด€๋ฆฌ์ž์—๊ฒŒ ๋ฌธ์˜ํ•˜์„ธ์š”."
def respond(
    message: str,
    history: List[Dict[str, str]],
    system_message: str = "",
    max_tokens: int = 4000,
    temperature: float = 0.5,
    top_p: float = 0.9,
    parquet_data: Optional[str] = None,
) -> Iterator[str]:
    """Stream the model's reply to ``message``.

    This is a generator: each yielded value is the full response accumulated
    so far, so a caller can simply display the latest item.

    Args:
        message: The new user message; it is appended to the prompt after
            ``history``.
        history: Earlier turns as dicts with ``role`` and ``content`` keys.
            Any role other than ``'user'`` is rendered as an AI turn.
        system_message: System prompt used when no dataset is supplied; a
            built-in default is used when it is empty.
        max_tokens: Passed to the model as ``max_new_tokens``.
        temperature: Sampling temperature passed to the model.
        top_p: Nucleus-sampling value passed to the model.
        parquet_data: JSON text of the uploaded dataset. When given,
            ``system_message`` is ignored and a fixed data-analysis prompt
            plus a ``describe()`` summary of the data is used instead.

    Yields:
        The cumulative response text, or a single generic error message when
        inference fails.
    """
    # Choose the system prompt.
    if parquet_data:
        system_prefix = """๋ฐ˜๋“œ์‹œ ํ•œ๊ธ€๋กœ ๋‹ต๋ณ€ํ•  ๊ฒƒ. ๋„ˆ๋Š” ์—…๋กœ๋“œ๋œ ๋ฐ์ดํ„ฐ๋ฅผ ๊ธฐ๋ฐ˜์œผ๋กœ ์งˆ๋ฌธ์— ๋‹ต๋ณ€ํ•˜๋Š” ์—ญํ• ์„ ํ•œ๋‹ค. ๋ฐ์ดํ„ฐ๋ฅผ ๋ถ„์„ํ•˜์—ฌ ์‚ฌ์šฉ์ž์—๊ฒŒ ๋„์›€์ด ๋˜๋Š” ์ •๋ณด๋ฅผ ์ œ๊ณตํ•˜๋ผ. ๋ฐ์ดํ„ฐ๋ฅผ ํ™œ์šฉํ•˜์—ฌ ์ƒ์„ธํ•˜๊ณ  ์ •ํ™•ํ•œ ๋‹ต๋ณ€์„ ์ œ๊ณตํ•˜๋˜, ๋ฏผ๊ฐํ•œ ์ •๋ณด๋‚˜ ๊ฐœ์ธ ์ •๋ณด๋ฅผ ๋…ธ์ถœํ•˜์ง€ ๋งˆ๋ผ."""
        try:
            df = pd.read_json(io.StringIO(parquet_data))
            # Only a statistical summary is sent to the model, not the rows.
            data_summary = df.describe(include='all').to_string()
            system_prefix += f"\n\n์—…๋กœ๋“œ๋œ ๋ฐ์ดํ„ฐ์˜ ์š”์•ฝ ์ •๋ณด:\n{data_summary}"
        except Exception as e:
            print(f"๋ฐ์ดํ„ฐ ๋กœ๋“œ ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {str(e)}\n{traceback.format_exc()}")
            system_prefix += "\n\n๋ฐ์ดํ„ฐ๋ฅผ ๋กœ๋“œํ•˜๋Š” ์ค‘ ์˜ค๋ฅ˜๊ฐ€ ๋ฐœ์ƒํ–ˆ์Šต๋‹ˆ๋‹ค."
    else:
        system_prefix = system_message or "๋„ˆ๋Š” AI ์กฐ์–ธ์ž ์—ญํ• ์ด๋‹ค."
    # Assemble the prompt: system text, earlier turns, then the new message.
    prompt_parts = [system_prefix + "\n\n"]
    for chat in history:
        if chat['role'] == 'user':
            prompt_parts.append(f"์‚ฌ์šฉ์ž: {chat['content']}\n")
        else:
            prompt_parts.append(f"AI: {chat['content']}\n")
    prompt_parts.append(f"์‚ฌ์šฉ์ž: {message}\nAI:")
    prompt = "".join(prompt_parts)
    try:
        # Send the prompt and relay the streamed tokens.
        response = ""
        stream = hf_client.text_generation(
            prompt=prompt,
            max_new_tokens=max_tokens,
            stream=True,
            temperature=temperature,
            top_p=top_p,
        )
        for msg in stream:
            if msg:
                response += msg
                yield response
    except Exception as e:
        error_message = f"์ถ”๋ก  ์ค‘ ์˜ค๋ฅ˜๊ฐ€ ๋ฐœ์ƒํ–ˆ์Šต๋‹ˆ๋‹ค: {str(e)}"
        # Log the traceback as the other handlers in this file do.
        print(f"{error_message}\n{traceback.format_exc()}")
        yield "์ถ”๋ก  ์ค‘ ์˜ค๋ฅ˜๊ฐ€ ๋ฐœ์ƒํ–ˆ์Šต๋‹ˆ๋‹ค. ๊ด€๋ฆฌ์ž์—๊ฒŒ ๋ฌธ์˜ํ•˜์„ธ์š”."
def upload_csv(file_path: str) -> Tuple[str, str]:
    """Validate an uploaded CSV and convert it to a Parquet file.

    The CSV must contain the columns ``id``, ``text``, ``label`` and
    ``metadata``. Duplicate rows are dropped, missing values are replaced by
    empty strings, and the columns are cast to compact dtypes before the
    Parquet file is written to the current working directory under the CSV's
    base name.

    Args:
        file_path: Path of the uploaded CSV file.

    Returns:
        ``(status_message, parquet_filename)``. ``parquet_filename`` is an
        empty string when required columns are missing or conversion fails.
    """
    try:
        # Read the CSV file.
        df = pd.read_csv(file_path, sep=',')
        # Check that every required column is present.
        required_columns = {'id', 'text', 'label', 'metadata'}
        missing_columns = required_columns - set(df.columns)
        if missing_columns:
            # Sorted so the message is stable; set order is arbitrary.
            return f"CSV ํŒŒ์ผ์— ๋‹ค์Œ ํ•„์ˆ˜ ์ปฌ๋Ÿผ์ด ๋ˆ„๋ฝ๋˜์—ˆ์Šต๋‹ˆ๋‹ค: {', '.join(sorted(missing_columns))}", ""
        # Data cleansing.
        df.drop_duplicates(inplace=True)
        df.fillna('', inplace=True)
        # Compact dtypes.
        df = df.astype({'id': 'int32', 'text': 'string', 'label': 'category', 'metadata': 'string'})
        # Convert to a Parquet file.
        parquet_filename = os.path.splitext(os.path.basename(file_path))[0] + '.parquet'
        df.to_parquet(parquet_filename, engine='pyarrow', compression='snappy')
        return f"{parquet_filename} ํŒŒ์ผ์ด ์„ฑ๊ณต์ ์œผ๋กœ ์—…๋กœ๋“œ๋˜๊ณ  ๋ณ€ํ™˜๋˜์—ˆ์Šต๋‹ˆ๋‹ค.", parquet_filename
    except Exception as e:
        print(f"CSV ํŒŒ์ผ ์—…๋กœ๋“œ ๋ฐ ๋ณ€ํ™˜ ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {str(e)}\n{traceback.format_exc()}")
        return "CSV ํŒŒ์ผ ์—…๋กœ๋“œ ๋ฐ ๋ณ€ํ™˜ ์ค‘ ์˜ค๋ฅ˜๊ฐ€ ๋ฐœ์ƒํ–ˆ์Šต๋‹ˆ๋‹ค. ๊ด€๋ฆฌ์ž์—๊ฒŒ ๋ฌธ์˜ํ•˜์„ธ์š”.", ""
def upload_parquet(file_path: str) -> Tuple[str, str, str]:
    """Load an uploaded Parquet file for preview and for use in chat.

    Returns ``(status_message, markdown_preview, records_json)``: the preview
    is the first 10 rows as a Markdown table and the JSON is the whole frame
    in ``records`` orientation with non-ASCII characters left unescaped. On
    any failure the error is printed with its traceback and the last two
    items are empty strings.
    """
    try:
        frame = pd.read_parquet(file_path, engine='pyarrow')
        # Markdown preview of the first rows.
        preview_md = frame.head(10).to_markdown(index=False)
        # Whole frame serialized as a JSON string.
        records_json = frame.to_json(orient='records', force_ascii=False)
    except Exception as e:
        print(f"Parquet ํŒŒ์ผ ์—…๋กœ๋“œ ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {str(e)}\n{traceback.format_exc()}")
        return "Parquet ํŒŒ์ผ ์—…๋กœ๋“œ ์ค‘ ์˜ค๋ฅ˜๊ฐ€ ๋ฐœ์ƒํ–ˆ์Šต๋‹ˆ๋‹ค. ๊ด€๋ฆฌ์ž์—๊ฒŒ ๋ฌธ์˜ํ•˜์„ธ์š”.", "", ""
    return "Parquet ํŒŒ์ผ์ด ์„ฑ๊ณต์ ์œผ๋กœ ์—…๋กœ๋“œ๋˜์—ˆ์Šต๋‹ˆ๋‹ค.", preview_md, records_json
def _parse_dataset_rows(text: str) -> List[List[str]]:
    """Extract well-formed ``id,text,label,metadata`` rows from CSV-like text."""
    rows: List[List[str]] = []
    for raw_line in text.strip().split('\n'):
        # Skip blank lines.
        if not raw_line.strip():
            continue
        # Collapse doubled double-quotes before parsing (kept from the
        # original normalization step).
        normalized = raw_line.replace('""', '"')
        try:
            fields = next(csv.reader([normalized]))
        except (csv.Error, StopIteration):
            # Unparseable line: drop it and keep the rest.
            continue
        if len(fields) != 4:  # id, text, label, metadata
            continue
        if fields[0].strip().lower() == 'id':
            # A header row would otherwise break the integer cast of `id`
            # and fail the whole conversion.
            continue
        rows.append(fields)
    return rows


def text_to_parquet(text: str) -> Tuple[str, str, str]:
    """Convert ``id,text,label,metadata`` lines into a Parquet file.

    Each non-blank line is parsed as one CSV record; lines that do not yield
    exactly four fields, and a header row, are skipped. The parsed fields are
    loaded into the DataFrame directly rather than being re-serialized and
    parsed a second time, so embedded quotes and NA-looking text such as
    "NA" or "null" are preserved as written.

    Args:
        text: Raw text with one record per line.

    Returns:
        ``(status_message, markdown_preview, parquet_filename)``. The last
        two items are empty strings when conversion fails, including when no
        valid row is found.
    """
    try:
        rows = _parse_dataset_rows(text)
        if not rows:
            raise ValueError("no valid id,text,label,metadata rows found")
        df = pd.DataFrame(rows, columns=['id', 'text', 'label', 'metadata'])
        # Compact dtypes; to_numeric also accepts ids written like "1.0".
        df['id'] = pd.to_numeric(df['id']).astype('int32')
        df = df.astype({'text': 'string', 'label': 'string', 'metadata': 'string'})
        # Convert to a Parquet file.
        parquet_filename = 'text_to_parquet.parquet'
        df.to_parquet(parquet_filename, engine='pyarrow', compression='snappy')
        # Preview of the Parquet file content.
        parquet_content = load_parquet(parquet_filename)
        return f"{parquet_filename} ํŒŒ์ผ์ด ์„ฑ๊ณต์ ์œผ๋กœ ๋ณ€ํ™˜๋˜์—ˆ์Šต๋‹ˆ๋‹ค.", parquet_content, parquet_filename
    except Exception as e:
        error_message = f"ํ…์ŠคํŠธ ๋ณ€ํ™˜ ์ค‘ ์˜ค๋ฅ˜๊ฐ€ ๋ฐœ์ƒํ–ˆ์Šต๋‹ˆ๋‹ค: {str(e)}"
        print(f"{error_message}\n{traceback.format_exc()}")
        return "ํ…์ŠคํŠธ ๋ณ€ํ™˜ ์ค‘ ์˜ค๋ฅ˜๊ฐ€ ๋ฐœ์ƒํ–ˆ์Šต๋‹ˆ๋‹ค. ๊ด€๋ฆฌ์ž์—๊ฒŒ ๋ฌธ์˜ํ•˜์„ธ์š”.", "", ""
def _merge_chunk_outputs(chunk_outputs: List[str]) -> List[str]:
    """Merge per-chunk model outputs into de-duplicated, renumbered CSV lines."""
    merged: List[str] = []
    seen_contents = set()
    next_id = 1
    for chunk_output in chunk_outputs:
        # Keep only the text before the first EOS marker, if any.
        chunk_output = chunk_output.split("<EOS_TOKEN>")[0]
        for line in chunk_output.strip().split('\n'):
            line = line.strip()
            if not line or '์ถœ๋ ฅ:' in line:
                continue
            row_id, separator, content = line.partition(',')
            if not separator:
                # No comma: not a data row.
                continue
            if row_id.strip().lower() == 'id':
                # Header row echoed by the model.
                continue
            # Compare everything after the id: ids are reassigned below, so
            # the same row coming from two chunks carries different ids.
            if content in seen_contents:
                continue
            seen_contents.add(content)
            merged.append(f"{next_id},{content}")
            next_id += 1
    return merged


def preprocess_text_with_llm(input_text: str) -> str:
    """Turn free text into ``id,text,label,metadata`` CSV lines using the LLM.

    The text is split with ``chunk_text``, each chunk is sent to the model
    (up to three requests in parallel), and the outputs are merged: prompt
    echoes and header rows are dropped, duplicate rows are removed and ids
    are renumbered from 1.

    Args:
        input_text: Raw text to preprocess.

    Returns:
        The merged CSV text, or a user-facing message when the input is
        blank, the merged output is not readable as CSV, or processing fails.
    """
    if not input_text.strip():
        return "์ž…๋ ฅ ํ…์ŠคํŠธ๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค."
    system_prompt = """๋‹น์‹ ์€ ๋ฐ์ดํ„ฐ ์ „์ฒ˜๋ฆฌ ์ „๋ฌธ๊ฐ€์ž…๋‹ˆ๋‹ค. ์ž…๋ ฅ๋œ ํ…์ŠคํŠธ๋ฅผ CSV ๋ฐ์ดํ„ฐ์…‹ ํ˜•์‹์œผ๋กœ ๋ณ€ํ™˜ํ•˜์„ธ์š”.
๊ทœ์น™:
1. ์ถœ๋ ฅ ํ˜•์‹: id,text,label,metadata
2. id: 1๋ถ€ํ„ฐ ์‹œ์ž‘ํ•˜๋Š” ์ˆœ์ฐจ์  ๋ฒˆํ˜ธ
3. text: ์˜๋ฏธ ์žˆ๋Š” ๋‹จ์œ„๋กœ ๋ถ„๋ฆฌ๋œ ํ…์ŠคํŠธ
4. label: ํ…์ŠคํŠธ์˜ ์ฃผ์ œ๋‚˜ ์นดํ…Œ๊ณ ๋ฆฌ๋ฅผ ์•„๋ž˜ ๊ธฐ์ค€์œผ๋กœ ์ •ํ™•ํ•˜๊ฒŒ ํ•œ ๊ฐœ๋งŒ ์„ ํƒ
- Historical_Figure (์—ญ์‚ฌ์  ์ธ๋ฌผ)
- Military_History (๊ตฐ์‚ฌ ์—ญ์‚ฌ)
- Technology (๊ธฐ์ˆ )
- Politics (์ •์น˜)
- Culture (๋ฌธํ™”)
5. metadata: ๋‚ ์งœ, ์ถœ์ฒ˜ ๋“ฑ ์ถ”๊ฐ€ ์ •๋ณด
์ค‘์š”:
- ๋™์ผํ•œ ํ…์ŠคํŠธ๋ฅผ ๋ฐ˜๋ณตํ•ด์„œ ์ถœ๋ ฅํ•˜์ง€ ๋ง ๊ฒƒ
- ๊ฐ ํ…์ŠคํŠธ๋Š” ํ•œ ๋ฒˆ๋งŒ ์ฒ˜๋ฆฌํ•˜์—ฌ ๊ฐ€์žฅ ์ ํ•ฉํ•œ label์„ ์„ ํƒํ•  ๊ฒƒ
- ์ž…๋ ฅ ํ…์ŠคํŠธ๋ฅผ ์˜๋ฏธ ๋‹จ์œ„๋กœ ์ ์ ˆํžˆ ๋ถ„๋ฆฌํ•  ๊ฒƒ
์˜ˆ์‹œ:
1,"์ด์ˆœ์‹ ์€ ์กฐ์„  ์ค‘๊ธฐ์˜ ๋ฌด์‹ ์ด๋‹ค.","Historical_Figure","์กฐ์„ ์‹œ๋Œ€, ์œ„ํ‚ค๋ฐฑ๊ณผ"
์ฃผ์˜์‚ฌํ•ญ:
- text์— ์‰ผํ‘œ๊ฐ€ ์žˆ์œผ๋ฉด ํฐ๋”ฐ์˜ดํ‘œ๋กœ ๊ฐ์‹ธ๊ธฐ
- ํฐ๋”ฐ์˜ดํ‘œ๋Š” ๋ฐฑ์Šฌ๋ž˜์‹œ๋กœ ์ด์Šค์ผ€์ดํ”„ ์ฒ˜๋ฆฌ
- ๊ฐ ํ–‰์€ ์ƒˆ๋กœ์šด ์ค„๋กœ ๊ตฌ๋ถ„
- ๋ถˆํ•„์š”ํ•œ ๋ฐ˜๋ณต ์ถœ๋ ฅ ๊ธˆ์ง€"""
    try:
        # Split the text into chunks.
        chunks = chunk_text(input_text)
        # Process the chunks in parallel.
        with ThreadPoolExecutor(max_workers=3) as executor:
            futures = []
            for chunk in chunks:
                # Build the prompt for this chunk.
                chunk_prompt = f"{system_prompt}\n\n์ž…๋ ฅํ…์ŠคํŠธ:\n{chunk}\n\n์ถœ๋ ฅ:"
                futures.append(
                    executor.submit(
                        hf_client.text_generation,
                        prompt=chunk_prompt,
                        max_new_tokens=2000,
                        temperature=0.1,
                        top_p=0.5,
                        stream=False,
                    )
                )
            # Collect in submission order so rows keep the input order.
            processed_chunks = [future.result() for future in futures]
        # Merge the results and remove duplicates.
        processed_text = '\n'.join(_merge_chunk_outputs(processed_chunks))
        # Validate the CSV format. The reader is lazy, so it must be consumed
        # for any parse error to surface.
        try:
            for _ in csv.reader(io.StringIO(processed_text)):
                pass
        except csv.Error:
            return "LLM์ด ์˜ฌ๋ฐ”๋ฅธ CSV ํ˜•์‹์„ ์ƒ์„ฑํ•˜์ง€ ๋ชปํ–ˆ์Šต๋‹ˆ๋‹ค. ๋‹ค์‹œ ์‹œ๋„ํ•ด์ฃผ์„ธ์š”."
        return processed_text
    except Exception as e:
        error_message = f"์ „์ฒ˜๋ฆฌ ์ค‘ ์˜ค๋ฅ˜๊ฐ€ ๋ฐœ์ƒํ–ˆ์Šต๋‹ˆ๋‹ค: {str(e)}"
        # Log the traceback as the other handlers in this file do.
        print(f"{error_message}\n{traceback.format_exc()}")
        return "์ „์ฒ˜๋ฆฌ ์ค‘ ์˜ค๋ฅ˜๊ฐ€ ๋ฐœ์ƒํ–ˆ์Šต๋‹ˆ๋‹ค. ๊ด€๋ฆฌ์ž์—๊ฒŒ ๋ฌธ์˜ํ•˜์„ธ์š”."
# CSS settings: custom stylesheet passed to gr.Blocks(css=css) below.
css = """
footer {
visibility: hidden;
}
#chatbot-container, #chatbot-data-upload {
height: 700px;
overflow-y: scroll;
}
#chatbot-container .message, #chatbot-data-upload .message {
font-size: 14px;
}
/* ์ž…๋ ฅ์ฐฝ ๋ฐฐ๊ฒฝ์ƒ‰ ๋ฐ ๊ธ€์ž์ƒ‰ ๋ณ€๊ฒฝ */
textarea, input[type="text"] {
background-color: #ffffff; /* ํฐ์ƒ‰ ๋ฐฐ๊ฒฝ */
color: #000000; /* ๊ฒ€์ •์ƒ‰ ๊ธ€์ž */
}
/* ํŒŒ์ผ ์—…๋กœ๋“œ ์˜์—ญ ๋†’์ด ์กฐ์ ˆ */
#parquet-upload-area {
max-height: 150px;
overflow-y: auto;
}
/* ์ดˆ๊ธฐ ์„ค๋ช… ๊ธ€์”จ ํฌ๊ธฐ ์กฐ์ ˆ */
#initial-description {
font-size: 14px;
}
"""
# Gradio Blocks interface setup.
with gr.Blocks(css=css) as demo:
    gr.Markdown("# My RAG: LLM์ด ๋‚˜๋งŒ์˜ ๋ฐ์ดํ„ฐ๋กœ ํ•™์Šตํ•œ ์ฝ˜ํ…์ธ  ์ƒ์„ฑ/๋‹ต๋ณ€", elem_id="initial-description")
    gr.Markdown(
        "### 1) ๋‚˜๋งŒ์˜ ๋ฐ์ดํ„ฐ๋ฅผ ์ž…๋ ฅ ๋˜๋Š” CSV ์—…๋กœ๋“œ๋กœ Parquet ๋ฐ์ดํ„ฐ์…‹ ์ž๋™ ๋ณ€ํ™˜ 2) Parquet ๋ฐ์ดํ„ฐ์…‹์„ ์—…๋กœ๋“œํ•˜๋ฉด, LLM์ด ๋งž์ถค ํ•™์Šต ๋ฐ์ดํ„ฐ๋กœ ํ™œ์šฉํ•˜์—ฌ ์‘๋‹ต\n"
        "### Tip) '์˜ˆ์ œ'๋ฅผ ํ†ตํ•ด ๋‹ค์–‘ํ•œ ํ™œ์šฉ ๋ฐฉ๋ฒ•์„ ์ฒดํ—˜ํ•˜๊ณ  ์‘์šฉํ•ด ๋ณด์„ธ์š”, ๋ฐ์ดํ„ฐ์…‹ ์—…๋กœ๋“œ์‹œ ๋ฏธ๋ฆฌ๋ณด๊ธฐ๋Š” 10๊ฑด๋งŒ ์ถœ๋ ฅ",
        elem_id="initial-description"
    )
    # First tab: chat with the LLM over the uploaded dataset
    # (tab renamed to "My dataset+LLM").
    with gr.Tab("My ๋ฐ์ดํ„ฐ์…‹+LLM"):
        gr.Markdown("### LLM๊ณผ ๋Œ€ํ™”ํ•˜๊ธฐ")
        chatbot_data_upload = gr.Chatbot(label="์ฑ—๋ด‡", type="messages", elem_id="chatbot-data-upload")
        msg_data_upload = gr.Textbox(label="๋ฉ”์‹œ์ง€ ์ž…๋ ฅ", placeholder="์—ฌ๊ธฐ์— ๋ฉ”์‹œ์ง€๋ฅผ ์ž…๋ ฅํ•˜์„ธ์š”...")
        send_data_upload = gr.Button("์ „์†ก")
        # Collapsed panel holding the system prompt and sampling options.
        with gr.Accordion("์‹œ์Šคํ…œ ํ”„๋กฌํ”„ํŠธ ๋ฐ ์˜ต์…˜ ์„ค์ •", open=False):
            system_message = gr.Textbox(label="System Message", value="๋„ˆ๋Š” AI ์กฐ์–ธ์ž ์—ญํ• ์ด๋‹ค.")
            max_tokens = gr.Slider(minimum=1, maximum=8000, value=1000, label="Max Tokens")
            temperature = gr.Slider(minimum=0, maximum=1, value=0.7, label="Temperature")
            top_p = gr.Slider(minimum=0, maximum=1, value=0.9, label="Top P")
        # Holds the JSON text of the uploaded Parquet data between events.
        parquet_data_state = gr.State()
def handle_message_data_upload(
    message: str,
    history: List[Dict[str, str]],
    system_message: str,
    max_tokens: int,
    temperature: float,
    top_p: float,
    parquet_data: str
):
    """Stream a chat reply into the chatbot component.

    Generator event handler: every yielded pair is ``(chat_history, "")``,
    where the history ends with the assistant's partial reply and the empty
    string clears the message box.

    Args:
        message: Text the user just submitted.
        history: Current chatbot messages (``role``/``content`` dicts);
            ``None`` is treated as an empty conversation.
        system_message: System prompt from the options panel.
        max_tokens: Generation length limit.
        temperature: Sampling temperature.
        top_p: Nucleus-sampling value.
        parquet_data: JSON text of the uploaded dataset, if any.
    """
    history = history or []
    try:
        # Snapshot the earlier turns before adding the new message: respond()
        # appends `message` to the prompt itself, so giving it a history that
        # already ends with that message would send the question twice.
        previous_turns = list(history)
        # Add the user's message to the displayed history.
        history.append({"role": "user", "content": message})
        # Generate the response.
        response_gen = respond(
            message, previous_turns, system_message, max_tokens, temperature, top_p, parquet_data
        )
        partial_response = ""
        for partial_response in response_gen:
            # Show the conversation with the reply received so far.
            yield history + [{"role": "assistant", "content": partial_response}], ""
        # Record the final assistant reply in the history.
        history.append({"role": "assistant", "content": partial_response})
    except Exception as e:
        print(f"๋ฉ”์‹œ์ง€ ์ฒ˜๋ฆฌ ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {str(e)}\n{traceback.format_exc()}")
        response = "๋ฉ”์‹œ์ง€ ์ฒ˜๋ฆฌ ์ค‘ ์˜ค๋ฅ˜๊ฐ€ ๋ฐœ์ƒํ–ˆ์Šต๋‹ˆ๋‹ค. ๊ด€๋ฆฌ์ž์—๊ฒŒ ๋ฌธ์˜ํ•˜์„ธ์š”."
        history.append({"role": "assistant", "content": response})
        yield history, ""
# Run the chat handler when the send button is clicked.
send_data_upload.click(
    handle_message_data_upload,
    inputs=[
        msg_data_upload,
        chatbot_data_upload,
        system_message,
        max_tokens,
        temperature,
        top_p,
        parquet_data_state,  # pass the uploaded data through parquet_data_state
    ],
    outputs=[chatbot_data_upload, msg_data_upload],
    queue=True
)
# Add examples.
with gr.Accordion("์˜ˆ์ œ", open=False):
    gr.Examples(
        examples=[
            ["์—…๋กœ๋“œ๋œ ๋ฐ์ดํ„ฐ์…‹์— ๋Œ€ํ•ด ์š”์•ฝ ์„ค๋ช…ํ•˜๋ผ."],
            ["์—…๋กœ๋“œ๋œ ๋ฐ์ดํ„ฐ์…‹ ํŒŒ์ผ์„ ํ•™์Šต ๋ฐ์ดํ„ฐ๋กœ ํ™œ์šฉํ•˜์—ฌ, ๋ณธ ์„œ๋น„์Šค๋ฅผ SEO ์ตœ์ ํ™”ํ•˜์—ฌ ๋ธ”๋กœ๊ทธ ํฌ์ŠคํŠธ(๊ฐœ์š”, ๋ฐฐ๊ฒฝ ๋ฐ ํ•„์š”์„ฑ, ๊ธฐ์กด ์œ ์‚ฌ ์ œํ’ˆ/์„œ๋น„์Šค์™€ ๋น„๊ตํ•˜์—ฌ ํŠน์žฅ์ , ํ™œ์šฉ์ฒ˜, ๊ฐ€์น˜, ๊ธฐ๋Œ€ํšจ๊ณผ, ๊ฒฐ๋ก ์„ ํฌํ•จ)๋กœ 4000 ํ† ํฐ ์ด์ƒ ์ž‘์„ฑํ•˜๋ผ"],
            ["์—…๋กœ๋“œ๋œ ๋ฐ์ดํ„ฐ์…‹ ํŒŒ์ผ์„ ํ•™์Šต ๋ฐ์ดํ„ฐ๋กœ ํ™œ์šฉํ•˜์—ฌ, ์‚ฌ์šฉ ๋ฐฉ๋ฒ•๊ณผ ์ฐจ๋ณ„์ , ํŠน์ง•, ๊ฐ•์ ์„ ์ค‘์‹ฌ์œผ๋กœ 4000 ํ† ํฐ ์ด์ƒ ์œ ํŠœ๋ธŒ ์˜์ƒ ์Šคํฌ๋ฆฝํŠธ ํ˜•ํƒœ๋กœ ์ž‘์„ฑํ•˜๋ผ"],
            ["์—…๋กœ๋“œ๋œ ๋ฐ์ดํ„ฐ์…‹ ํŒŒ์ผ์„ ํ•™์Šต ๋ฐ์ดํ„ฐ๋กœ ํ™œ์šฉํ•˜์—ฌ, ์ œํ’ˆ ์ƒ์„ธ ํŽ˜์ด์ง€ ํ˜•์‹์˜ ๋‚ด์šฉ์„ 4000 ํ† ํฐ ์ด์ƒ ์ž์„ธํžˆ ์„ค๋ช…ํ•˜๋ผ"],
            ["์—…๋กœ๋“œ๋œ ๋ฐ์ดํ„ฐ์…‹ ํŒŒ์ผ์„ ํ•™์Šต ๋ฐ์ดํ„ฐ๋กœ ํ™œ์šฉํ•˜์—ฌ, FAQ 20๊ฑด์„ ์ƒ์„ธํ•˜๊ฒŒ ์ž‘์„ฑํ•˜๋ผ. 4000ํ† ํฐ ์ด์ƒ ์‚ฌ์šฉํ•˜๋ผ."],
            ["์—…๋กœ๋“œ๋œ ๋ฐ์ดํ„ฐ์…‹ ํŒŒ์ผ์„ ํ•™์Šต ๋ฐ์ดํ„ฐ๋กœ ํ™œ์šฉํ•˜์—ฌ, ํŠนํ—ˆ ์ถœ์›์— ํ™œ์šฉํ•  ๊ธฐ์ˆ  ๋ฐ ๋น„์ฆˆ๋‹ˆ์Šค ๋ชจ๋ธ ์ธก๋ฉด์„ ํฌํ•จํ•˜์—ฌ ํŠนํ—ˆ ์ถœ์›์„œ ๊ตฌ์„ฑ์— ๋งž๊ฒŒ ํ˜์‹ ์ ์ธ ์ฐฝ์˜ ๋ฐœ๋ช… ๋‚ด์šฉ์„ ์ค‘์‹ฌ์œผ๋กœ 4000 ํ† ํฐ ์ด์ƒ ์ž‘์„ฑํ•˜๋ผ."],
        ],
        inputs=msg_data_upload,
        label="์˜ˆ์ œ ์„ ํƒ",
    )
# Parquet file upload, moved to the bottom of the screen.
gr.Markdown("### Parquet ํŒŒ์ผ ์—…๋กœ๋“œ")
with gr.Row():
    with gr.Column():
        parquet_upload = gr.File(
            label="Parquet ํŒŒ์ผ ์—…๋กœ๋“œ", type="filepath", elem_id="parquet-upload-area"
        )
        parquet_upload_button = gr.Button("์—…๋กœ๋“œ")
        parquet_upload_status = gr.Textbox(label="์—…๋กœ๋“œ ์ƒํƒœ", interactive=False)
        parquet_preview_chat = gr.Markdown(label="Parquet ํŒŒ์ผ ๋ฏธ๋ฆฌ๋ณด๊ธฐ")
def handle_parquet_upload(file_path: str):
    """Load the uploaded Parquet file and return ``(status, preview, json)``.

    When no JSON payload comes back from upload_parquet, the preview and the
    stored data are both reset to empty strings.
    """
    status, preview_md, records_json = upload_parquet(file_path)
    if not records_json:
        return status, "", ""
    return status, preview_md, records_json
# Load the selected Parquet file: updates the status box and the preview, and
# stores the data in parquet_data_state for the chat handler.
parquet_upload_button.click(
    handle_parquet_upload,
    inputs=parquet_upload,
    outputs=[parquet_upload_status, parquet_preview_chat, parquet_data_state]
)
# Second tab: data conversion (tab renamed to "CSV to My dataset").
with gr.Tab("CSV to My ๋ฐ์ดํ„ฐ์…‹"):
    gr.Markdown("### CSV ํŒŒ์ผ ์—…๋กœ๋“œ ๋ฐ Parquet ๋ณ€ํ™˜")
    with gr.Row():
        with gr.Column():
            csv_file = gr.File(label="CSV ํŒŒ์ผ ์—…๋กœ๋“œ", type="filepath")
            upload_button = gr.Button("์—…๋กœ๋“œ ๋ฐ ๋ณ€ํ™˜")
            upload_status = gr.Textbox(label="์—…๋กœ๋“œ ์ƒํƒœ", interactive=False)
            parquet_preview = gr.Markdown(label="Parquet ํŒŒ์ผ ๋ฏธ๋ฆฌ๋ณด๊ธฐ")
            download_button = gr.File(label="Parquet ํŒŒ์ผ ๋‹ค์šด๋กœ๋“œ", interactive=False)
def handle_csv_upload(file_path: str):
    """Convert the uploaded CSV and return ``(status, preview, download_path)``.

    When upload_csv reports no output file, the preview is an empty string
    and the download slot is cleared with ``None``.
    """
    status, produced_file = upload_csv(file_path)
    if not produced_file:
        return status, "", None
    preview_md = load_parquet(produced_file)
    return status, preview_md, produced_file
# Convert the selected CSV: updates the status box, the preview and the
# download slot.
upload_button.click(
    handle_csv_upload,
    inputs=csv_file,
    outputs=[upload_status, parquet_preview, download_button]
)
# Third tab: text -> CSV -> Parquet conversion (tab renamed to "Text to My dataset").
with gr.Tab("Text to My ๋ฐ์ดํ„ฐ์…‹"):
    gr.Markdown("### ํ…์ŠคํŠธ๋ฅผ ์ž…๋ ฅํ•˜๋ฉด CSV๋กœ ๋ณ€ํ™˜ ํ›„ Parquet์œผ๋กœ ์ž๋™ ์ „ํ™˜๋ฉ๋‹ˆ๋‹ค.")
    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(
                label="ํ…์ŠคํŠธ ์ž…๋ ฅ (๊ฐ ํ–‰์€ `id,text,label,metadata` ํ˜•์‹์œผ๋กœ ์ž…๋ ฅ)",
                lines=10,
                placeholder='์˜ˆ: 1,"์ด์ˆœ์‹ ","์žฅ๊ตฐ","๊ฑฐ๋ถ์„ "\n2,"์›๊ท ","์žฅ๊ตฐ","๋ชจํ•จ"\n3,"์„ ์กฐ","์™•","์‹œ๊ธฐ"\n4,"๋„์š”ํ† ๋ฏธ ํžˆ๋ฐ์š”์‹œ","์™•","์นจ๋žต"'
            )
            convert_button = gr.Button("๋ณ€ํ™˜ ๋ฐ ๋‹ค์šด๋กœ๋“œ")
            convert_status = gr.Textbox(label="๋ณ€ํ™˜ ์ƒํƒœ", interactive=False)
            parquet_preview_convert = gr.Markdown(label="Parquet ํŒŒ์ผ ๋ฏธ๋ฆฌ๋ณด๊ธฐ")
            download_parquet_convert = gr.File(label="Parquet ํŒŒ์ผ ๋‹ค์šด๋กœ๋“œ", interactive=False)
def handle_text_to_parquet(text: str):
    """Convert typed rows to Parquet and return ``(status, preview, download_path)``.

    When text_to_parquet reports no output file, the preview is an empty
    string and the download slot is cleared with ``None``.
    """
    status, preview_md, produced_file = text_to_parquet(text)
    if not produced_file:
        return status, "", None
    return status, preview_md, produced_file
# Convert the typed rows: updates the status box, the preview and the
# download slot.
convert_button.click(
    handle_text_to_parquet,
    inputs=text_input,
    outputs=[convert_status, parquet_preview_convert, download_parquet_convert]
)
# Fourth tab: UI for LLM-based text preprocessing (revised).
with gr.Tab("Text Preprocessing with LLM"):
    gr.Markdown("### ํ…์ŠคํŠธ๋ฅผ ์ž…๋ ฅํ•˜๋ฉด LLM์ด ๋ฐ์ดํ„ฐ์…‹ ํ˜•์‹์— ๋งž๊ฒŒ ์ „์ฒ˜๋ฆฌํ•˜์—ฌ ์ถœ๋ ฅํ•ฉ๋‹ˆ๋‹ค.")
    with gr.Row():
        with gr.Column():
            raw_text_input = gr.Textbox(
                label="ํ…์ŠคํŠธ ์ž…๋ ฅ",
                lines=15,
                placeholder="์—ฌ๊ธฐ์— ์ „์ฒ˜๋ฆฌํ•  ํ…์ŠคํŠธ๋ฅผ ์ž…๋ ฅํ•˜์„ธ์š”..."
            )
            with gr.Row():
                preprocess_button = gr.Button("์ „์ฒ˜๋ฆฌ ์‹คํ–‰", variant="primary")
                clear_button = gr.Button("์ดˆ๊ธฐํ™”")
            preprocess_status = gr.Textbox(
                label="์ „์ฒ˜๋ฆฌ ์ƒํƒœ",
                interactive=False,
                value="๋Œ€๊ธฐ ์ค‘..."
            )
            processed_text_output = gr.Textbox(
                label="์ „์ฒ˜๋ฆฌ๋œ ๋ฐ์ดํ„ฐ์…‹ ์ถœ๋ ฅ",
                lines=15,
                interactive=False
            )
            # Parquet conversion and download section.
            convert_to_parquet_button = gr.Button("Parquet์œผ๋กœ ๋ณ€ํ™˜")
            download_parquet = gr.File(label="๋ณ€ํ™˜๋œ Parquet ํŒŒ์ผ ๋‹ค์šด๋กœ๋“œ")
def handle_text_preprocessing(input_text: str):
    """Run LLM preprocessing and stream ``(status, output)`` pairs.

    Blank input yields a single "no input" status. Otherwise a "started"
    status is yielded first, followed by either the finished text, a
    "no result" status, or a generic error status if an exception occurs.
    """
    if input_text.strip():
        try:
            # Tell the user work has begun before the slow model call.
            yield "์ „์ฒ˜๋ฆฌ๋ฅผ ์‹œ์ž‘ํ•ฉ๋‹ˆ๋‹ค...", ""
            cleaned_rows = preprocess_text_with_llm(input_text)
            if not cleaned_rows:
                yield "์ „์ฒ˜๋ฆฌ ๊ฒฐ๊ณผ๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค.", ""
            else:
                yield "์ „์ฒ˜๋ฆฌ๊ฐ€ ์™„๋ฃŒ๋˜์—ˆ์Šต๋‹ˆ๋‹ค.", cleaned_rows
        except Exception as e:
            print(f"์ „์ฒ˜๋ฆฌ ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {str(e)}\n{traceback.format_exc()}")
            yield "์ „์ฒ˜๋ฆฌ ์ค‘ ์˜ค๋ฅ˜๊ฐ€ ๋ฐœ์ƒํ–ˆ์Šต๋‹ˆ๋‹ค. ๊ด€๋ฆฌ์ž์—๊ฒŒ ๋ฌธ์˜ํ•˜์„ธ์š”.", ""
    else:
        yield "์ž…๋ ฅ ํ…์ŠคํŠธ๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค.", ""
def clear_inputs():
    """Return reset values for the input box, the status box and the output box."""
    input_reset = ""
    status_reset = "๋Œ€๊ธฐ ์ค‘..."
    output_reset = ""
    return input_reset, status_reset, output_reset
def convert_to_parquet_file(processed_text: str):
    """Convert the preprocessed text to Parquet and return ``(status, download_path)``.

    The download path is ``None`` when the text is blank, when
    text_to_parquet produces no file, or when an exception occurs.
    """
    if processed_text.strip():
        try:
            status, _preview, produced_file = text_to_parquet(processed_text)
            return status, (produced_file if produced_file else None)
        except Exception as e:
            print(f"Parquet ๋ณ€ํ™˜ ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {str(e)}\n{traceback.format_exc()}")
            return "Parquet ๋ณ€ํ™˜ ์ค‘ ์˜ค๋ฅ˜๊ฐ€ ๋ฐœ์ƒํ–ˆ์Šต๋‹ˆ๋‹ค. ๊ด€๋ฆฌ์ž์—๊ฒŒ ๋ฌธ์˜ํ•˜์„ธ์š”.", None
    return "๋ณ€ํ™˜ํ•  ํ…์ŠคํŠธ๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค.", None
# Wire up the event handlers.
# Preprocessing streams its status and result into the two text boxes.
preprocess_button.click(
    handle_text_preprocessing,
    inputs=[raw_text_input],
    outputs=[preprocess_status, processed_text_output],
    queue=True
)
# Reset the input, status and output boxes.
clear_button.click(
    clear_inputs,
    outputs=[raw_text_input, preprocess_status, processed_text_output]
)
# Convert the preprocessed output to Parquet; the status box is reused for the result message.
convert_to_parquet_button.click(
    convert_to_parquet_file,
    inputs=[processed_text_output],
    outputs=[preprocess_status, download_parquet]
)
# Add example texts.
with gr.Accordion("์˜ˆ์ œ ํ…์ŠคํŠธ", open=False):
    gr.Examples(
        examples=[
            ["์ด์ˆœ์‹ ์€ ์กฐ์„  ์ค‘๊ธฐ์˜ ๋ฌด์‹ ์ด๋‹ค. ๊ทธ๋Š” ์ž„์ง„์™œ๋ž€ ๋‹น์‹œ ํ•ด๊ตฐ์„ ์ด๋Œ์—ˆ๋‹ค. ๊ฑฐ๋ถ์„ ์„ ๋งŒ๋“ค์–ด ์™œ๊ตฐ๊ณผ ์‹ธ์› ๋‹ค."],
            ["์ธ๊ณต์ง€๋Šฅ์€ ์ปดํ“จํ„ฐ ๊ณผํ•™์˜ ํ•œ ๋ถ„์•ผ์ด๋‹ค. ๊ธฐ๊ณ„ํ•™์Šต์€ ์ธ๊ณต์ง€๋Šฅ์˜ ํ•˜์œ„ ๋ถ„์•ผ์ด๋‹ค. ๋”ฅ๋Ÿฌ๋‹์€ ๊ธฐ๊ณ„ํ•™์Šต์˜ ํ•œ ๋ฐฉ๋ฒ•์ด๋‹ค."]
        ],
        inputs=raw_text_input,
        label="์˜ˆ์ œ ์„ ํƒ"
    )
# Footer contact line.
gr.Markdown("### [email protected]", elem_id="initial-description")
# Script entry point: start the Gradio app only when the file is run directly.
if __name__ == "__main__":
    demo.launch(share=True)  # share=True asks Gradio for a public share link