ginipick commited on
Commit
f7728da
ยท
verified ยท
1 Parent(s): d5a415c

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +853 -0
app.py ADDED
@@ -0,0 +1,853 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from huggingface_hub import InferenceClient
3
+ import os
4
+ import pandas as pd
5
+ from typing import List, Dict, Tuple
6
+ import json
7
+ import io
8
+ import traceback
9
+ import csv
10
+ # HuggingFace ํด๋ผ์ด์–ธํŠธ ๋Œ€์‹  OpenAI ํด๋ผ์ด์–ธํŠธ ์‚ฌ์šฉ
11
+ from openai import OpenAI
12
+ import os
13
+
14
+ # ์ถ”๋ก  API ํด๋ผ์ด์–ธํŠธ ์„ค์ •
15
+ hf_client = InferenceClient(
16
+ "CohereForAI/c4ai-command-r-plus-08-2024", token=os.getenv("HF_TOKEN")
17
+ )
18
+
19
def load_code(filename: str) -> str:
    """Return the UTF-8 text content of *filename*.

    Errors are not raised: a missing file or any other read failure is
    reported through the returned (Korean) message instead.

    Args:
        filename: Path of the file to read.

    Returns:
        The file content, or an error message string.
    """
    try:
        with open(filename, 'r', encoding='utf-8') as file:
            return file.read()
    except FileNotFoundError:
        # "<filename>: file could not be found."
        return f"{filename} 파일을 찾을 수 없습니다."
    except Exception as e:
        # "An error occurred while reading the file: <details>"
        return f"파일을 읽는 중 오류가 발생했습니다: {str(e)}"
27
+
28
def load_parquet(filename: str) -> str:
    """Return a Markdown preview (first 10 rows) of a Parquet file.

    Errors are not raised: a missing file or any other failure is reported
    through the returned (Korean) message instead.

    Args:
        filename: Path of the Parquet file to read with the pyarrow engine.

    Returns:
        A Markdown table of the first 10 rows, or an error message string.
    """
    try:
        df = pd.read_parquet(filename, engine='pyarrow')
        return df.head(10).to_markdown(index=False)
    except FileNotFoundError:
        # "<filename>: file could not be found."
        return f"{filename} 파일을 찾을 수 없습니다."
    except Exception as e:
        # "An error occurred while reading the file: <details>"
        return f"파일을 읽는 중 오류가 발생했습니다: {str(e)}"
36
+
37
+
38
+ # OpenAI ํด๋ผ์ด์–ธํŠธ ์„ค์ •
39
+ client = OpenAI(api_key=os.getenv("OPEN_AI"))
40
+
41
+ # respond ํ•จ์ˆ˜ ์ˆ˜์ •
42
def respond(message: str, history: List[Dict[str, str]], system_message: str = "", max_tokens: int = 4000, temperature: float = 0.5, top_p: float = 0.9, parquet_data: str = None) -> str:
    """Stream a "gpt-4o-mini" chat completion for *message*.

    NOTE: this file defines ``respond`` a second time further down; at import
    time that later definition replaces this one.

    Args:
        message: The current user question.
        history: Earlier turns as ``{"role": ..., "content": ...}`` dicts;
            only the last three entries are sent to the model.
        system_message: Accepted for interface compatibility; this
            definition does not use it.
        max_tokens: Completion token limit.
        temperature: Sampling temperature.
        top_p: Nucleus sampling parameter.
        parquet_data: Optional JSON string of the uploaded dataset; when it
            parses, ``DataFrame.describe`` output is appended to the prompt.

    Yields:
        The cumulative response so far, passed through ``clean_response``.
        If the API call fails, a single error message is yielded instead.
    """
    # System prompt (Korean): answer in Korean, based on the uploaded data,
    # concisely and without repeating earlier content.
    system_prefix = """반드시 한글로 답변할 것. 너는 업로드된 데이터를 기반으로 질문에 답변하는 역할을 한다.

주요 지침:
1. 질문과 직접 관련된 내용만 간단명료하게 답변할 것
2. 이전 답변과 중복되는 내용은 제외할 것
3. 불필요한 예시나 부연 설명은 하지 말 것
4. 동일한 내용을 다른 표현으로 반복하지 말 것
5. 핵심 정보만 전달할 것
"""

    if parquet_data:
        try:
            df = pd.read_json(io.StringIO(parquet_data))
            data_summary = df.describe(include='all').to_string()
            # "Data summary:"
            system_prefix += f"\n\n데이터 요약:\n{data_summary}"
        except Exception as e:
            # Best effort: a broken dataset only loses the summary, not the chat.
            print(f"데이터 로드 오류: {str(e)}")

    # Build the conversation, keeping only the most recent context.
    messages = [{"role": "system", "content": system_prefix}]

    recent_history = history[-3:] if history else []
    for chat in recent_history:
        messages.append({"role": chat["role"], "content": chat["content"]})

    messages.append({"role": "user", "content": message})

    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=messages,
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            stream=True
        )

        full_response = ""
        for chunk in response:
            if chunk.choices[0].delta.content:
                full_response += chunk.choices[0].delta.content
                # Re-clean the whole accumulated text on every chunk.
                cleaned_response = clean_response(full_response)
                yield cleaned_response

    except Exception as e:
        # "Inference error: <details>"
        error_message = f"추론 오류: {str(e)}"
        print(error_message)
        yield error_message
95
+
96
def clean_response(text: str) -> str:
    """Remove repeated sentences from *text* and normalise the final period.

    Sentences are compared case-insensitively with runs of whitespace
    collapsed; the first occurrence of each sentence is kept, in order.

    A period only counts as a sentence boundary when it is followed by
    whitespace or the end of the text, so decimals ("3.14") and dotted
    tokens ("data.csv") are left intact instead of being split apart.

    Args:
        text: Raw response text.

    Returns:
        The de-duplicated sentences joined with ". ", ending with a period;
        an empty string when *text* contains no sentences.
    """
    import re  # local import: `re` is not imported at module level in this file

    # Split into sentences, dropping empty fragments.
    sentences = [
        s.strip()
        for s in re.split(r'\.(?=\s|$)', text)
        if s.strip()
    ]

    unique_sentences = []
    seen = set()

    for sentence in sentences:
        # Normalise for comparison only: lower-case, single spaces.
        normalized = ' '.join(sentence.lower().split())
        if normalized not in seen:
            seen.add(normalized)
            unique_sentences.append(sentence)

    cleaned_text = '. '.join(unique_sentences)
    if cleaned_text and not cleaned_text.endswith('.'):
        cleaned_text += '.'

    return cleaned_text
118
+
119
def remove_duplicates(text: str) -> str:
    """Drop repeated sentences from *text*.

    The text is split on ".", each piece is trimmed, empty pieces are
    discarded, and only the first occurrence of each exact piece is kept.
    The survivors are joined with ". " (no trailing period is added).
    """
    trimmed_pieces = (piece.strip() for piece in text.split('.'))
    # dict.fromkeys keeps first-seen order while discarding later repeats.
    first_occurrences = dict.fromkeys(piece for piece in trimmed_pieces if piece)
    return '. '.join(first_occurrences)
132
+
133
def upload_csv(file_path: str) -> Tuple[str, str]:
    """Convert a CSV file into a snappy-compressed Parquet file.

    The CSV must contain the columns ``id``, ``text``, ``label`` and
    ``metadata``. Duplicate rows are dropped, missing values are replaced
    with empty strings, and the columns are cast to int32 / string /
    category / string before writing.

    Args:
        file_path: Path of the CSV file.

    Returns:
        ``(status_message, parquet_filename)``. The Parquet file is written
        to the current working directory under the CSV's base name; on any
        failure the filename is an empty string.
    """
    try:
        df = pd.read_csv(file_path, sep=',')

        # Check the required columns; sorted so the message is deterministic
        # (iterating a set directly gives an arbitrary order).
        required_columns = {'id', 'text', 'label', 'metadata'}
        missing_columns = sorted(required_columns - set(df.columns))
        if missing_columns:
            # "The CSV file is missing the following required columns: ..."
            return f"CSV 파일에 다음 필수 컬럼이 누락되었습니다: {', '.join(missing_columns)}", ""

        # Data cleansing.
        df.drop_duplicates(inplace=True)
        df.fillna('', inplace=True)

        # Compact dtypes.
        df = df.astype({'id': 'int32', 'text': 'string', 'label': 'category', 'metadata': 'string'})

        # Write next to the working directory under the CSV's base name.
        parquet_filename = os.path.splitext(os.path.basename(file_path))[0] + '.parquet'
        df.to_parquet(parquet_filename, engine='pyarrow', compression='snappy')
        # "<file> was uploaded and converted successfully."
        return f"{parquet_filename} 파일이 성공적으로 업로드되고 변환되었습니다.", parquet_filename
    except Exception as e:
        # "An error occurred while uploading and converting the CSV file: ..."
        return f"CSV 파일 업로드 및 변환 중 오류가 발생했습니다: {str(e)}", ""
154
+
155
def upload_parquet(file_path: str) -> Tuple[str, str, str]:
    """Read a Parquet file and summarise it for display and for Q&A.

    Numeric columns are detected with pandas' dtype helpers rather than by
    comparing against "int64"/"float64", so the int32 ``id`` column written
    by this app's own converters is summarised too. Categorical columns are
    reported alongside text columns.

    Args:
        file_path: Path of the Parquet file.

    Returns:
        ``(status_message, markdown_summary, records_json)``. The summary
        holds dataset totals, per-column statistics and a 10-row preview;
        ``records_json`` is the whole frame as a JSON records string. On
        failure the last two items are empty strings.
    """
    try:
        df = pd.read_parquet(file_path, engine='pyarrow')

        summary = [
            "### 데이터셋 기본 정보:",
            f"- 총 레코드 수: {len(df)}",
            # Column labels are not guaranteed to be strings; str() keeps
            # join() from raising on e.g. integer labels.
            f"- 컬럼 목록: {', '.join(map(str, df.columns))}",
            "\n### 컬럼별 정보:",
        ]

        for col in df.columns:
            series = df[col]
            dtype = series.dtype
            is_number = (
                pd.api.types.is_numeric_dtype(dtype)
                # bool counts as numeric, but describe() has no mean/min/max for it
                and not pd.api.types.is_bool_dtype(dtype)
            )
            is_text = (
                pd.api.types.is_object_dtype(dtype)
                or pd.api.types.is_string_dtype(dtype)
                or isinstance(dtype, pd.CategoricalDtype)
            )
            if is_number:
                # Numeric column: mean / min / max.
                stats = series.describe()
                summary.append(f"\n{col} (수치형):")
                summary.append(f"- 평균: {stats['mean']:.2f}")
                summary.append(f"- 최소: {stats['min']}")
                summary.append(f"- 최대: {stats['max']}")
            elif is_text:
                # Text-like column: distinct count, plus top values when few.
                unique_count = series.nunique()
                summary.append(f"\n{col} (텍스트):")
                summary.append(f"- 고유값 수: {unique_count}")
                if unique_count < 10:  # only list values for low-cardinality columns
                    value_counts = series.value_counts().head(5)
                    summary.append("- 상위 5개 값:")
                    for val, count in value_counts.items():
                        summary.append(f" • {val}: {count}개")

        # Preview of the first rows.
        preview = df.head(10).to_markdown(index=False)
        summary.append("\n### 데이터 미리보기:")
        summary.append(preview)

        parquet_content = "\n".join(summary)

        # JSON copy of the frame, consumed by the chat Q&A.
        parquet_json = df.to_json(orient='records', force_ascii=False)

        # "The Parquet file was uploaded successfully."
        return "Parquet 파일이 성공적으로 업로드되었습니다.", parquet_content, parquet_json
    except Exception as e:
        # "An error occurred while uploading the Parquet file: ..."
        return f"Parquet 파일 업로드 중 오류가 발생했습니다: {str(e)}", "", ""
208
+
209
+
210
def respond(message: str, history: List[Dict[str, str]], system_message: str = "", max_tokens: int = 4000, temperature: float = 0.5, top_p: float = 0.9, parquet_data: str = None) -> str:
    """Stream a "gpt-4-0125-preview" chat completion for *message*.

    When *parquet_data* is given, the system prompt is built from the
    dataset (per-column statistics plus the first 20 rows) and
    *system_message* is not used; otherwise *system_message* (or a default
    advisor prompt) is the system prompt.

    Args:
        message: The current user question; it is appended after *history*.
        history: Earlier turns as ``{"role": ..., "content": ...}`` dicts;
            only the last three entries are sent to the model.
        system_message: System prompt used when no dataset is supplied.
        max_tokens: Completion token limit.
        temperature: Sampling temperature.
        top_p: Nucleus sampling parameter.
        parquet_data: Optional JSON records string of the uploaded dataset.

    Yields:
        The cumulative response so far, passed through ``clean_response``.
        On any failure a single error message is yielded instead (the
        traceback is printed).
    """
    try:
        if parquet_data:
            # Rebuild the DataFrame from its JSON string.
            df = pd.read_json(io.StringIO(parquet_data))

            # One description line per column.
            columns_info = []
            for col in df.columns:
                if df[col].dtype in ['int64', 'float64']:
                    col_type = "수치형"
                    stats = df[col].describe()
                    col_info = f"- {col} ({col_type}): 평균={stats['mean']:.2f}, 최소={stats['min']}, 최대={stats['max']}"
                else:
                    col_type = "텍스트"
                    unique_count = df[col].nunique()
                    col_info = f"- {col} ({col_type}): 고유값 {unique_count}개"
                columns_info.append(col_info)

            # chr(10) is "\n"; a backslash is not allowed inside an f-string
            # expression on older Python versions.
            data_context = f"""
현재 업로드된 데이터셋 정보:
- 총 {len(df)} 개의 레코드
- 컬럼 정보:
{chr(10).join(columns_info)}

샘플 데이터:
{df.head(20).to_string()}
"""
            # System prompt (Korean): answer in Korean, accurately and only
            # from the dataset, concisely, and mindful of data privacy.
            system_prompt = f"""당신은 업로드된 데이터셋을 분석하고 질문에 답변하는 AI 어시스턴트입니다.

주요 지침:
1. 반드시 한글로 답변할 것
2. 데이터셋의 실제 내용을 기반으로 정확하게 답변할 것
3. 데이터에 없는 내용은 추측하지 말 것
4. 답변은 간단명료하게 할 것
5. 데이터 프라이버시를 고려하여 답변할 것

데이터셋 구조 설명:
{chr(10).join(columns_info)}

참고할 데이터 샘플:
{data_context}
"""
        else:
            system_prompt = system_message or "너는 AI 조언자 역할이다."

        messages = [{"role": "system", "content": system_prompt}]

        # Add only the most recent turns.
        recent_history = history[-3:] if history else []
        for chat in recent_history:
            messages.append({"role": chat["role"], "content": chat["content"]})

        messages.append({"role": "user", "content": message})

        response = client.chat.completions.create(
            model="gpt-4-0125-preview",
            messages=messages,
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            stream=True
        )

        full_response = ""
        for chunk in response:
            if chunk.choices[0].delta.content:
                full_response += chunk.choices[0].delta.content
                yield clean_response(full_response)

    except Exception as e:
        # "An error occurred while generating the response: ..."
        error_message = f"응답 생성 중 오류 발생: {str(e)}"
        print(f"{error_message}\n{traceback.format_exc()}")
        yield error_message
285
+
286
def text_to_parquet(text: str) -> Tuple[str, str, str]:
    """Parse ``id,text,label,metadata`` lines and write them to Parquet.

    Each non-empty line is parsed with the ``csv`` module, so quoted fields
    may contain commas. A line is kept when it has at least four fields and
    its first field is a decimal number; other lines (for example a header
    row) are skipped. Fields beyond the fourth are folded back into
    ``metadata``, joined with commas.

    Args:
        text: Multi-line input, one record per line.

    Returns:
        ``(status_message, markdown_preview, parquet_filename)``. The file
        is always written as ``text_to_parquet.parquet`` in the current
        working directory. When nothing could be parsed, or on failure, the
        last two items are empty strings.
    """
    try:
        # Split the input into non-empty, trimmed lines.
        lines = [line.strip() for line in text.split('\n') if line.strip()]

        data = []

        for line in lines:
            try:
                # skipinitialspace tolerates `1, "a", "b", "c"` style input.
                fields = next(csv.reader([line], skipinitialspace=True))
            except csv.Error as e:
                # "Line parsing error: ..."
                print(f"라인 파싱 오류: {line}\n{str(e)}")
                continue

            id_val = fields[0].strip() if fields else ""
            if len(fields) < 4 or not id_val.isdecimal():
                continue  # header row or malformed line

            data.append({
                'id': int(id_val),
                'text': fields[1].strip(),
                'label': fields[2].strip(),
                'metadata': ','.join(fields[3:]).strip()
            })

        if not data:
            # "There is no data to convert."
            return "변환할 데이터가 없습니다.", "", ""

        df = pd.DataFrame(data)

        df = df.astype({
            'id': 'int32',
            'text': 'string',
            'label': 'string',
            'metadata': 'string'
        })

        parquet_filename = 'text_to_parquet.parquet'
        df.to_parquet(parquet_filename, engine='pyarrow', compression='snappy')

        preview = df.to_markdown(index=False)

        return (
            # "<file> was converted successfully. N records were processed."
            f"{parquet_filename} 파일이 성공적으로 변환되었습니다. 총 {len(df)}개의 레코드가 처리되었습니다.",
            preview,
            parquet_filename
        )

    except Exception as e:
        # "An error occurred during text conversion: ..."
        error_message = f"텍스트 변환 중 오류가 발생했습니다: {str(e)}"
        print(f"{error_message}\n{traceback.format_exc()}")
        return error_message, "", ""
350
+
351
+ # preprocess_text_with_llm ํ•จ์ˆ˜๋„ ์ˆ˜์ •
352
+ def preprocess_text_with_llm(input_text: str) -> str:
353
+ if not input_text.strip():
354
+ return "์ž…๋ ฅ ํ…์ŠคํŠธ๊ฐ€ ๋น„์–ด์žˆ์Šต๋‹ˆ๋‹ค."
355
+
356
+ system_prompt = """๋ฐ˜๋“œ์‹œ ํ•œ๊ธ€(ํ•œ๊ตญ์–ด)๋กœ ๋‹ต๋ณ€ํ•˜์‹œ์˜ค. ๋‹น์‹ ์€ ๋ฐ์ดํ„ฐ ์ „์ฒ˜๋ฆฌ ์ „๋ฌธ๊ฐ€์ž…๋‹ˆ๋‹ค. ์ž…๋ ฅ๋œ ํ…์ŠคํŠธ๋ฅผ CSV ๋ฐ์ดํ„ฐ์…‹ ํ˜•์‹์œผ๋กœ ๋ณ€ํ™˜ํ•˜์„ธ์š”.
357
+
358
+ ๊ทœ์น™:
359
+ 1. ์ถœ๋ ฅ ํ˜•์‹: id,text,label,metadata
360
+ 2. id: 1๋ถ€ํ„ฐ ์‹œ์ž‘ํ•˜๋Š” ์ˆœ์ฐจ์  ๋ฒˆํ˜ธ
361
+ 3. text: ์˜๋ฏธ ์žˆ๋Š” ๋‹จ์œ„๋กœ ๋ถ„๋ฆฌ๋œ ํ…์ŠคํŠธ
362
+ 4. label: ํ…์ŠคํŠธ์˜ ์ฃผ์ œ๋‚˜ ์นดํ…Œ๊ณ ๋ฆฌ๋ฅผ ์•„๋ž˜ ๊ธฐ์ค€์œผ๋กœ ์ •ํ™•ํ•˜๊ฒŒ ํ•œ ๊ฐœ๋งŒ ์„ ํƒ
363
+ - Historical_Figure (์—ญ์‚ฌ์  ์ธ๋ฌผ)
364
+ - Military_History (๊ตฐ์‚ฌ ์—ญ์‚ฌ)
365
+ - Technology (๊ธฐ์ˆ )
366
+ - Politics (์ •์น˜)
367
+ - Culture (๋ฌธํ™”)
368
+ 5. metadata: ๋‚ ์งœ, ์ถœ์ฒ˜ ๋“ฑ ์ถ”๊ฐ€ ์ •๋ณด"""
369
+
370
+ try:
371
+ response = client.chat.completions.create(
372
+ model="gpt-4-0125-preview",
373
+ messages=[
374
+ {"role": "system", "content": system_prompt},
375
+ {"role": "user", "content": input_text}
376
+ ],
377
+ max_tokens=4000,
378
+ temperature=0.1,
379
+ stream=True
380
+ )
381
+
382
+ full_response = ""
383
+ for chunk in response:
384
+ if chunk.choices[0].delta.content:
385
+ full_response += chunk.choices[0].delta.content
386
+
387
+ # ์‘๋‹ต ์ •์ œ
388
+ processed_text = clean_response(full_response)
389
+
390
+ # CSV ํ˜•์‹ ๊ฒ€์ฆ
391
+ try:
392
+ from io import StringIO
393
+ import csv
394
+ csv.reader(StringIO(processed_text))
395
+ return processed_text
396
+ except csv.Error:
397
+ return "LLM์ด ์˜ฌ๋ฐ”๋ฅธ CSV ํ˜•์‹์„ ์ƒ์„ฑํ•˜์ง€ ๋ชปํ–ˆ์Šต๋‹ˆ๋‹ค. ๋‹ค์‹œ ์‹œ๋„ํ•ด์ฃผ์„ธ์š”."
398
+
399
+ except Exception as e:
400
+ error_message = f"์ „์ฒ˜๋ฆฌ ์ค‘ ์˜ค๋ฅ˜๊ฐ€ ๋ฐœ์ƒํ–ˆ์Šต๋‹ˆ๋‹ค: {str(e)}"
401
+ print(error_message)
402
+ return error_message# preprocess_text_with_llm ํ•จ์ˆ˜๋„ ์ˆ˜์ •
403
+ def preprocess_text_with_llm(input_text: str) -> str:
404
+ if not input_text.strip():
405
+ return "์ž…๋ ฅ ํ…์ŠคํŠธ๊ฐ€ ๋น„์–ด์žˆ์Šต๋‹ˆ๋‹ค."
406
+
407
+ system_prompt = """๋ฐ˜๋“œ์‹œ ํ•œ๊ธ€(ํ•œ๊ตญ์–ด)๋กœ ๋‹ต๋ณ€ํ•˜์‹œ์˜ค. ๋‹น์‹ ์€ ๋ฐ์ดํ„ฐ ์ „์ฒ˜๋ฆฌ ์ „๋ฌธ๊ฐ€์ž…๋‹ˆ๋‹ค. ์ž…๋ ฅ๋œ ํ…์ŠคํŠธ๋ฅผ CSV ๋ฐ์ดํ„ฐ์…‹ ํ˜•์‹์œผ๋กœ ๋ณ€ํ™˜ํ•˜์„ธ์š”.
408
+
409
+ ๊ทœ์น™:
410
+ 1. ์ถœ๋ ฅ ํ˜•์‹: id,text,label,metadata
411
+ 2. id: 1๋ถ€ํ„ฐ ์‹œ์ž‘ํ•˜๋Š” ์ˆœ์ฐจ์  ๋ฒˆํ˜ธ
412
+ 3. text: ์˜๋ฏธ ์žˆ๋Š” ๋‹จ์œ„๋กœ ๋ถ„๋ฆฌ๋œ ํ…์ŠคํŠธ
413
+ 4. label: ํ…์ŠคํŠธ์˜ ์ฃผ์ œ๋‚˜ ์นดํ…Œ๊ณ ๋ฆฌ๋ฅผ ์•„๋ž˜ ๊ธฐ์ค€์œผ๋กœ ์ •ํ™•ํ•˜๊ฒŒ ํ•œ ๊ฐœ๋งŒ ์„ ํƒ
414
+ - Historical_Figure (์—ญ์‚ฌ์  ์ธ๋ฌผ)
415
+ - Military_History (๊ตฐ์‚ฌ ์—ญ์‚ฌ)
416
+ - Technology (๊ธฐ์ˆ )
417
+ - Politics (์ •์น˜)
418
+ - Culture (๋ฌธํ™”)
419
+ 5. metadata: ๋‚ ์งœ, ์ถœ์ฒ˜ ๋“ฑ ์ถ”๊ฐ€ ์ •๋ณด"""
420
+
421
+ try:
422
+ response = client.chat.completions.create(
423
+ model="gpt-4o-mini",
424
+ messages=[
425
+ {"role": "system", "content": system_prompt},
426
+ {"role": "user", "content": input_text}
427
+ ],
428
+ max_tokens=4000,
429
+ temperature=0.1,
430
+ stream=True
431
+ )
432
+
433
+ full_response = ""
434
+ for chunk in response:
435
+ if chunk.choices[0].delta.content:
436
+ full_response += chunk.choices[0].delta.content
437
+
438
+ # ์‘๋‹ต ์ •์ œ
439
+ processed_text = clean_response(full_response)
440
+
441
+ # CSV ํ˜•์‹ ๊ฒ€์ฆ
442
+ try:
443
+ from io import StringIO
444
+ import csv
445
+ csv.reader(StringIO(processed_text))
446
+ return processed_text
447
+ except csv.Error:
448
+ return "LLM์ด ์˜ฌ๋ฐ”๋ฅธ CSV ํ˜•์‹์„ ์ƒ์„ฑํ•˜์ง€ ๋ชปํ–ˆ์Šต๋‹ˆ๋‹ค. ๋‹ค์‹œ ์‹œ๋„ํ•ด์ฃผ์„ธ์š”."
449
+
450
+ except Exception as e:
451
+ error_message = f"์ „์ฒ˜๋ฆฌ ์ค‘ ์˜ค๋ฅ˜๊ฐ€ ๋ฐœ์ƒํ–ˆ์Šต๋‹ˆ๋‹ค: {str(e)}"
452
+ print(error_message)
453
+ return error_message
454
+
455
+ # CSS ์„ค์ •
456
+ css = """
457
+ footer {
458
+ visibility: hidden;
459
+ }
460
+ #chatbot-container, #chatbot-data-upload {
461
+ height: 700px;
462
+ overflow-y: scroll;
463
+ }
464
+ #chatbot-container .message, #chatbot-data-upload .message {
465
+ font-size: 14px;
466
+ }
467
+ /* ์ž…๋ ฅ์ฐฝ ๋ฐฐ๊ฒฝ์ƒ‰ ๋ฐ ๊ธ€์ž์ƒ‰ ๋ณ€๊ฒฝ */
468
+ textarea, input[type="text"] {
469
+ background-color: #ffffff; /* ํฐ์ƒ‰ ๋ฐฐ๊ฒฝ */
470
+ color: #000000; /* ๊ฒ€์ •์ƒ‰ ๊ธ€์ž */
471
+ }
472
+ /* ํŒŒ์ผ ์—…๋กœ๋“œ ์˜์—ญ ๋†’์ด ์กฐ์ ˆ */
473
+ #parquet-upload-area {
474
+ max-height: 150px;
475
+ overflow-y: auto;
476
+ }
477
+ /* ์ดˆ๊ธฐ ์„ค๋ช… ๊ธ€์”จ ํฌ๊ธฐ ์กฐ์ ˆ */
478
+ #initial-description {
479
+ font-size: 14px;
480
+ }
481
+ """
482
+
483
+ # Gradio Blocks ์ธํ„ฐํŽ˜์ด์Šค ์„ค์ •
484
+ with gr.Blocks(css=css) as demo:
485
+ gr.Markdown("# MyEzRAG: LLM์ด ๋‚˜๋งŒ์˜ ๋ฐ์ดํ„ฐ๋กœ ํ•™์Šตํ•œ ์ฝ˜ํ…์ธ  ์ƒ์„ฑ/๋‹ต๋ณ€", elem_id="initial-description")
486
+ gr.Markdown(
487
+ "### '์‚ฌ์šฉ ๋ฐฉ๋ฒ•' ํƒญ์„ ํ†ตํ•ด ์ž์„ธํ•œ ์ด์šฉ ๋ฐฉ๋ฒ•์„ ์ฐธ๊ณ ํ•˜์„ธ์š”.\n"
488
+ "### Tip) '์˜ˆ์ œ'๋ฅผ ํ†ตํ•ด ๋‹ค์–‘ํ•œ ํ™œ์šฉ ๋ฐฉ๋ฒ•์„ ์ฒดํ—˜ํ•˜๊ณ  ์‘์šฉํ•ด ๋ณด์„ธ์š”, ๋ฐ์ดํ„ฐ์…‹ ์—…๋กœ๋“œ์‹œ ๋ฏธ๋ฆฌ๋ณด๊ธฐ๋Š” 10๊ฑด๋งŒ ์ถœ๋ ฅ",
489
+ elem_id="initial-description"
490
+ )
491
+
492
+
493
+
494
+ # ์ฒซ ๋ฒˆ์งธ ํƒญ: ์ฑ—๋ด‡ ๋ฐ์ดํ„ฐ ์—…๋กœ๋“œ (ํƒญ ์ด๋ฆ„ ๋ณ€๊ฒฝ: "My ๋ฐ์ดํ„ฐ์…‹+LLM")
495
+ with gr.Tab("My ๋ฐ์ดํ„ฐ์…‹+LLM"):
496
+ gr.Markdown("### LLM๊ณผ ๋Œ€ํ™”ํ•˜๊ธฐ")
497
+ chatbot_data_upload = gr.Chatbot(label="์ฑ—๋ด‡", type="messages", elem_id="chatbot-data-upload")
498
+ msg_data_upload = gr.Textbox(label="๋ฉ”์‹œ์ง€ ์ž…๋ ฅ", placeholder="์—ฌ๊ธฐ์— ๋ฉ”์‹œ์ง€๋ฅผ ์ž…๋ ฅํ•˜์„ธ์š”...")
499
+ send_data_upload = gr.Button("์ „์†ก")
500
+
501
+ with gr.Accordion("์‹œ์Šคํ…œ ํ”„๋กฌํ”„ํŠธ ๋ฐ ์˜ต์…˜ ์„ค์ •", open=False):
502
+ system_message = gr.Textbox(label="System Message", value="๋„ˆ๋Š” AI ์กฐ์–ธ์ž ์—ญํ• ์ด๋‹ค.")
503
+ max_tokens = gr.Slider(minimum=1, maximum=8000, value=1000, label="Max Tokens")
504
+ temperature = gr.Slider(minimum=0, maximum=1, value=0.7, label="Temperature")
505
+ top_p = gr.Slider(minimum=0, maximum=1, value=0.9, label="Top P")
506
+
507
+ parquet_data_state = gr.State()
508
+
509
def handle_message_data_upload(message: str, history: List[Dict[str, str]], system_message: str, max_tokens: int, temperature: float, top_p: float, parquet_data: str):
    """Chat handler: stream the assistant reply into the chatbot history.

    Args:
        message: Text from the input box.
        history: Chatbot messages as ``{"role": ..., "content": ...}`` dicts
            (may be ``None``).
        system_message: System prompt from the options panel.
        max_tokens: Completion token limit.
        temperature: Slider value; not forwarded, a fixed 0.3 is used.
        top_p: Nucleus sampling parameter.
        parquet_data: JSON string of the uploaded dataset, if any.

    Yields:
        ``(history_to_display, "")`` pairs; the empty string clears the
        input box.
    """
    history = history or []

    # Reject a question that repeats one of the recent user messages.
    recent_questions = [chat['content'].strip().lower() for chat in history[-3:] if chat['role'] == 'user']
    if message.strip().lower() in recent_questions:
        # "The same question was asked recently. Please ask another one."
        yield history + [{"role": "assistant", "content": "동일한 질문이 최근에 있었습니다. 다른 질문을 해주세요."}], ""
        return

    try:
        # Snapshot the earlier turns BEFORE recording the new one: respond()
        # appends `message` itself, so handing it a history that already
        # contains the message would send the question to the model twice.
        prior_history = list(history)
        history.append({"role": "user", "content": message})
        response_gen = respond(
            message,
            prior_history,
            system_message,
            max_tokens,
            temperature=0.3,  # deliberately low temperature
            top_p=top_p,
            parquet_data=parquet_data
        )

        partial_response = ""
        for partial in response_gen:
            partial_response = partial
            display_history = history + [{"role": "assistant", "content": partial_response}]
            yield display_history, ""

        history.append({"role": "assistant", "content": partial_response})
    except Exception as e:
        # "Error occurred: ..."
        response = f"오류 발생: {str(e)}"
        history.append({"role": "assistant", "content": response})
        yield history, ""
541
+
542
+
543
+
544
+
545
+ send_data_upload.click(
546
+ handle_message_data_upload,
547
+ inputs=[
548
+ msg_data_upload,
549
+ chatbot_data_upload,
550
+ system_message,
551
+ max_tokens,
552
+ temperature,
553
+ top_p,
554
+ parquet_data_state, # parquet_data_state๋ฅผ ์‚ฌ์šฉํ•˜์—ฌ ์—…๋กœ๋“œ๋œ ๋ฐ์ดํ„ฐ๋ฅผ ์ „๋‹ฌ
555
+ ],
556
+ outputs=[chatbot_data_upload, msg_data_upload],
557
+ queue=True
558
+ )
559
+
560
+ # ์˜ˆ์ œ ์ถ”๊ฐ€
561
+ with gr.Accordion("์˜ˆ์ œ", open=False):
562
+ gr.Examples(
563
+ examples=[
564
+ ["์—…๋กœ๋“œ๋œ ๋ฐ์ดํ„ฐ์…‹์— ๋Œ€ํ•ด ์š”์•ฝ ์„ค๋ช…ํ•˜๋ผ."],
565
+ ["์—…๋กœ๋“œ๋œ ๋ฐ์ดํ„ฐ์…‹ ํŒŒ์ผ์„ ํ•™์Šต ๋ฐ์ดํ„ฐ๋กœ ํ™œ์šฉํ•˜์—ฌ, ๋ณธ ์„œ๋น„์Šค๋ฅผ SEO ์ตœ์ ํ™”ํ•˜์—ฌ ๋ธ”๋กœ๊ทธ ํฌ์ŠคํŠธ(๊ฐœ์š”, ๋ฐฐ๊ฒฝ ๋ฐ ํ•„์š”์„ฑ, ๊ธฐ์กด ์œ ์‚ฌ ์ œํ’ˆ/์„œ๋น„์Šค์™€ ๋น„๊ตํ•˜์—ฌ ํŠน์žฅ์ , ํ™œ์šฉ์ฒ˜, ๊ฐ€์น˜, ๊ธฐ๋Œ€ํšจ๊ณผ, ๊ฒฐ๋ก ์„ ํฌํ•จ)๋กœ 4000 ํ† ํฐ ์ด์ƒ ์ž‘์„ฑํ•˜๋ผ"],
566
+ ["์—…๋กœ๋“œ๋œ ๋ฐ์ดํ„ฐ์…‹ ํŒŒ์ผ์„ ํ•™์Šต ๋ฐ์ดํ„ฐ๋กœ ํ™œ์šฉํ•˜์—ฌ, ์‚ฌ์šฉ ๋ฐฉ๋ฒ•๊ณผ ์ฐจ๋ณ„์ , ํŠน์ง•, ๊ฐ•์ ์„ ์ค‘์‹ฌ์œผ๋กœ 4000 ํ† ํฐ ์ด์ƒ ์œ ํŠœ๋ธŒ ์˜์ƒ ์Šคํฌ๋ฆฝํŠธ ํ˜•ํƒœ๋กœ ์ž‘์„ฑํ•˜๋ผ"],
567
+ ["์—…๋กœ๋“œ๋œ ๋ฐ์ดํ„ฐ์…‹ ํŒŒ์ผ์„ ํ•™์Šต ๋ฐ์ดํ„ฐ๋กœ ํ™œ์šฉํ•˜์—ฌ, ์ œํ’ˆ ์ƒ์„ธ ํŽ˜์ด์ง€ ํ˜•์‹์˜ ๋‚ด์šฉ์„ 4000 ํ† ํฐ ์ด์ƒ ์ž์„ธํžˆ ์„ค๋ช…ํ•˜๋ผ"],
568
+ ["์—…๋กœ๋“œ๋œ ๋ฐ์ดํ„ฐ์…‹ ํŒŒ์ผ์„ ํ•™์Šต ๋ฐ์ดํ„ฐ๋กœ ํ™œ์šฉํ•˜์—ฌ, FAQ 20๊ฑด์„ ์ƒ์„ธํ•˜๊ฒŒ ์ž‘์„ฑํ•˜๋ผ. 4000ํ† ํฐ ์ด์ƒ ์‚ฌ์šฉํ•˜๋ผ."],
569
+ ["์—…๋กœ๋“œ๋œ ๋ฐ์ดํ„ฐ์…‹ ํŒŒ์ผ์„ ํ•™์Šต ๋ฐ์ดํ„ฐ๋กœ ํ™œ์šฉํ•˜์—ฌ, ํŠนํ—ˆ ์ถœ์›์— ํ™œ์šฉํ•  ๊ธฐ์ˆ  ๋ฐ ๋น„์ฆˆ๋‹ˆ์Šค ๋ชจ๋ธ ์ธก๋ฉด์„ ํฌํ•จํ•˜์—ฌ ํŠนํ—ˆ ์ถœ์›์„œ ๊ตฌ์„ฑ์— ๋งž๊ฒŒ ํ˜์‹ ์ ์ธ ์ฐฝ์˜ ๋ฐœ๋ช… ๋‚ด์šฉ์„ ์ค‘์‹ฌ์œผ๋กœ 4000 ํ† ํฐ ์ด์ƒ ์ž‘์„ฑํ•˜๋ผ."],
570
+ ],
571
+ inputs=msg_data_upload,
572
+ label="์˜ˆ์ œ ์„ ํƒ",
573
+ )
574
+
575
+ # Parquet ํŒŒ์ผ ์—…๋กœ๋“œ๋ฅผ ํ™”๋ฉด ํ•˜๋‹จ์œผ๋กœ ์ด๋™
576
+ gr.Markdown("### Parquet ํŒŒ์ผ ์—…๋กœ๋“œ")
577
+ with gr.Row():
578
+ with gr.Column():
579
+ parquet_upload = gr.File(
580
+ label="Parquet ํŒŒ์ผ ์—…๋กœ๋“œ", type="filepath", elem_id="parquet-upload-area"
581
+ )
582
+ parquet_upload_button = gr.Button("์—…๋กœ๋“œ")
583
+ parquet_upload_status = gr.Textbox(label="์—…๋กœ๋“œ ์ƒํƒœ", interactive=False)
584
+ parquet_preview_chat = gr.Markdown(label="Parquet ํŒŒ์ผ ๋ฏธ๋ฆฌ๋ณด๊ธฐ")
585
+
586
def handle_parquet_upload(file_path: str):
    """Upload handler: returns (status, Markdown preview, dataset JSON).

    The preview and JSON are blanked whenever ``upload_parquet`` produced no
    JSON payload.
    """
    status, preview_md, records_json = upload_parquet(file_path)
    if not records_json:
        return status, "", ""
    return status, preview_md, records_json
592
+
593
+ parquet_upload_button.click(
594
+ handle_parquet_upload,
595
+ inputs=parquet_upload,
596
+ outputs=[parquet_upload_status, parquet_preview_chat, parquet_data_state]
597
+ )
598
+
599
+ # ๋‘ ๋ฒˆ์งธ ํƒญ: ๋ฐ์ดํ„ฐ ๋ณ€ํ™˜ (ํƒญ ์ด๋ฆ„ ๋ณ€๊ฒฝ: "CSV to My ๋ฐ์ดํ„ฐ์…‹")
600
+ with gr.Tab("CSV to My ๋ฐ์ดํ„ฐ์…‹"):
601
+ gr.Markdown("### CSV ํŒŒ์ผ ์—…๋กœ๋“œ ๋ฐ Parquet ๋ณ€ํ™˜")
602
+ with gr.Row():
603
+ with gr.Column():
604
+ csv_file = gr.File(label="CSV ํŒŒ์ผ ์—…๋กœ๋“œ", type="filepath")
605
+ upload_button = gr.Button("์—…๋กœ๋“œ ๋ฐ ๋ณ€ํ™˜")
606
+ upload_status = gr.Textbox(label="์—…๋กœ๋“œ ์ƒํƒœ", interactive=False)
607
+ parquet_preview = gr.Markdown(label="Parquet ํŒŒ์ผ ๋ฏธ๋ฆฌ๋ณด๊ธฐ")
608
+ download_button = gr.File(label="Parquet ํŒŒ์ผ ๋‹ค์šด๋กœ๋“œ", interactive=False)
609
+
610
def handle_csv_upload(file_path: str):
    """CSV handler: returns (status, Markdown preview, Parquet path).

    When ``upload_csv`` reports no output file, the preview is empty and the
    download slot is cleared with ``None``.
    """
    status, converted_path = upload_csv(file_path)
    if not converted_path:
        return status, "", None
    preview_md = load_parquet(converted_path)
    return status, preview_md, converted_path
617
+
618
+ upload_button.click(
619
+ handle_csv_upload,
620
+ inputs=csv_file,
621
+ outputs=[upload_status, parquet_preview, download_button]
622
+ )
623
+
624
+ # ์„ธ ๋ฒˆ์งธ ํƒญ: ํ…์ŠคํŠธ to csv to parquet ๋ณ€ํ™˜ (ํƒญ ์ด๋ฆ„ ๋ณ€๊ฒฝ: "Text to My ๋ฐ์ดํ„ฐ์…‹")
625
+ with gr.Tab("Text to My ๋ฐ์ดํ„ฐ์…‹"):
626
+ gr.Markdown("### ํ…์ŠคํŠธ๋ฅผ ์ž…๋ ฅํ•˜๋ฉด CSV๋กœ ๋ณ€ํ™˜ ํ›„ Parquet์œผ๋กœ ์ž๋™ ์ „ํ™˜๋ฉ๋‹ˆ๋‹ค.")
627
+ with gr.Row():
628
+ with gr.Column():
629
+ text_input = gr.Textbox(
630
+ label="ํ…์ŠคํŠธ ์ž…๋ ฅ (๊ฐ ํ–‰์€ `id,text,label,metadata` ํ˜•์‹์œผ๋กœ ์ž…๋ ฅ)",
631
+ lines=10,
632
+ placeholder='์˜ˆ: 1,"์ด์ˆœ์‹ ","์žฅ๊ตฐ","๊ฑฐ๋ถ์„ "\n2,"์›๊ท ","์žฅ๊ตฐ","๋ชจํ•จ"\n3,"์„ ์กฐ","์™•","์‹œ๊ธฐ"\n4,"๋„์š”ํ† ๋ฏธ ํžˆ๋ฐ์š”์‹œ","์™•","์นจ๋žต"'
633
+ )
634
+ convert_button = gr.Button("๋ณ€ํ™˜ ๋ฐ ๋‹ค์šด๋กœ๋“œ")
635
+ convert_status = gr.Textbox(label="๋ณ€ํ™˜ ์ƒํƒœ", interactive=False)
636
+ parquet_preview_convert = gr.Markdown(label="Parquet ํŒŒ์ผ ๋ฏธ๋ฆฌ๋ณด๊ธฐ")
637
+ download_parquet_convert = gr.File(label="Parquet ํŒŒ์ผ ๋‹ค์šด๋กœ๋“œ", interactive=False)
638
+
639
def handle_text_to_parquet(text: str):
    """Text handler: returns (status, Markdown preview, Parquet path).

    When ``text_to_parquet`` reports no output file, the preview is empty
    and the download slot is cleared with ``None``.
    """
    status, preview_md, converted_path = text_to_parquet(text)
    if not converted_path:
        return status, "", None
    return status, preview_md, converted_path
645
+
646
+ convert_button.click(
647
+ handle_text_to_parquet,
648
+ inputs=text_input,
649
+ outputs=[convert_status, parquet_preview_convert, download_parquet_convert]
650
+ )
651
+
652
+ # ๋„ค๋ฒˆ์งธ ํƒญ์˜ UI ๋ถ€๋ถ„ ์ˆ˜์ •
653
+ with gr.Tab("Text Preprocessing with LLM"):
654
+ gr.Markdown("### ํ…์ŠคํŠธ๋ฅผ ์ž…๋ ฅํ•˜๋ฉด LLM์ด ๋ฐ์ดํ„ฐ์…‹ ํ˜•์‹์— ๋งž๊ฒŒ ์ „์ฒ˜๋ฆฌํ•˜์—ฌ ์ถœ๋ ฅํ•ฉ๋‹ˆ๋‹ค.")
655
+ with gr.Row():
656
+ with gr.Column():
657
+ raw_text_input = gr.Textbox(
658
+ label="ํ…์ŠคํŠธ ์ž…๋ ฅ",
659
+ lines=15,
660
+ placeholder="์—ฌ๊ธฐ์— ์ „์ฒ˜๋ฆฌํ•  ํ…์ŠคํŠธ๋ฅผ ์ž…๋ ฅํ•˜์„ธ์š”..."
661
+ )
662
+
663
+ with gr.Row():
664
+ preprocess_button = gr.Button("์ „์ฒ˜๋ฆฌ ์‹คํ–‰", variant="primary")
665
+ clear_button = gr.Button("์ดˆ๊ธฐํ™”")
666
+
667
+ preprocess_status = gr.Textbox(
668
+ label="์ „์ฒ˜๋ฆฌ ์ƒํƒœ",
669
+ interactive=False,
670
+ value="๋Œ€๊ธฐ ์ค‘..."
671
+ )
672
+
673
+ processed_text_output = gr.Textbox(
674
+ label="์ „์ฒ˜๋ฆฌ๋œ ๋ฐ์ดํ„ฐ์…‹ ์ถœ๋ ฅ",
675
+ lines=15,
676
+ interactive=False
677
+ )
678
+
679
+ # Parquet ๋ณ€ํ™˜ ๋ฐ ๋‹ค์šด๋กœ๋“œ ์„น์…˜
680
+ convert_to_parquet_button = gr.Button("Parquet์œผ๋กœ ๋ณ€ํ™˜")
681
+ download_parquet = gr.File(label="๋ณ€ํ™˜๋œ Parquet ํŒŒ์ผ ๋‹ค์šด๋กœ๋“œ")
682
+
683
+
684
+
685
+
686
def handle_text_preprocessing(input_text: str):
    """Preprocessing handler: yields (status message, processed text) pairs.

    This function is a generator (it contains ``yield``), so every outcome,
    including the empty-input message, must be yielded; a value passed to
    ``return`` inside a generator is never delivered to the consumer.
    """
    if not input_text or not input_text.strip():
        # "There is no input text."
        yield "입력 텍스트가 없습니다.", ""
        return

    try:
        # "Starting preprocessing..."
        preprocess_status_msg = "전처리를 시작합니다..."
        yield preprocess_status_msg, ""

        processed_text = preprocess_text_with_llm(input_text)

        if processed_text:
            # "Preprocessing is complete."
            preprocess_status_msg = "전처리가 완료되었습니다."
            yield preprocess_status_msg, processed_text
        else:
            # "There is no preprocessing result."
            preprocess_status_msg = "전처리 결과가 없습니다."
            yield preprocess_status_msg, ""

    except Exception as e:
        # "An error occurred during processing: ..."
        error_msg = f"처리 중 오류가 발생했습니다: {str(e)}"
        yield error_msg, ""
706
+
707
def clear_inputs():
    """Reset values for (raw text input, status box, processed output)."""
    # Status text: "Waiting..."
    return "", "대기 중...", ""
709
+
710
def convert_to_parquet_file(processed_text: str):
    """Convert the preprocessed CSV text to Parquet.

    Args:
        processed_text: CSV text from the preprocessing output box (may be
            empty or ``None``).

    Returns:
        ``(status_message, parquet_path_or_None)``.
    """
    if not processed_text or not processed_text.strip():
        # "There is no text to convert."
        return "변환할 텍스트가 없습니다.", None

    try:
        message, parquet_content, parquet_filename = text_to_parquet(processed_text)
        if parquet_filename:
            return message, parquet_filename
        return message, None
    except Exception as e:
        # "Error during Parquet conversion: ..." (the last word of this
        # message was stored as U+FFFD replacement characters).
        return f"Parquet 변환 중 오류 발생: {str(e)}", None
721
+
722
+ # ์ด๋ฒคํŠธ ํ•ธ๋“ค๋Ÿฌ ์—ฐ๊ฒฐ
723
+ preprocess_button.click(
724
+ handle_text_preprocessing,
725
+ inputs=[raw_text_input],
726
+ outputs=[preprocess_status, processed_text_output],
727
+ queue=True
728
+ )
729
+
730
+ clear_button.click(
731
+ clear_inputs,
732
+ outputs=[raw_text_input, preprocess_status, processed_text_output]
733
+ )
734
+
735
+ convert_to_parquet_button.click(
736
+ convert_to_parquet_file,
737
+ inputs=[processed_text_output],
738
+ outputs=[preprocess_status, download_parquet]
739
+ )
740
+
741
+ # ์˜ˆ์ œ ํ…์ŠคํŠธ ์ถ”๊ฐ€
742
+ with gr.Accordion("์˜ˆ์ œ ํ…์ŠคํŠธ", open=False):
743
+ gr.Examples(
744
+ examples=[
745
+ ["์ด์ˆœ์‹ ์€ ์กฐ์„  ์ค‘๊ธฐ์˜ ๋ฌด์‹ ์ด๋‹ค. ๊ทธ๋Š” ์ž„์ง„์™œ๋ž€ ๋‹น์‹œ ํ•ด๊ตฐ์„ ์ด๋Œ์—ˆ๋‹ค. ๊ฑฐ๋ถ์„ ์„ ๋งŒ๋“ค์–ด ์™œ๊ตฐ๊ณผ ์‹ธ์› ๋‹ค."],
746
+ ["์ธ๊ณต์ง€๋Šฅ์€ ์ปดํ“จํ„ฐ ๊ณผํ•™์˜ ํ•œ ๋ถ„์•ผ์ด๋‹ค. ๊ธฐ๊ณ„ํ•™์Šต์€ ์ธ๊ณต์ง€๋Šฅ์˜ ํ•˜์œ„ ๋ถ„์•ผ์ด๋‹ค. ๋”ฅ๋Ÿฌ๋‹์€ ๊ธฐ๊ณ„ํ•™์Šต์˜ ํ•œ ๋ฐฉ๋ฒ•์ด๋‹ค."]
747
+ ],
748
+ inputs=raw_text_input,
749
+ label="์˜ˆ์ œ ์„ ํƒ"
750
+ )
751
+
752
+ with gr.Tab("๐Ÿ“š ์‚ฌ์šฉ ๋ฐฉ๋ฒ•"):
753
+ gr.Markdown("""
754
+ # MyEzRAG ์‚ฌ์šฉ ๊ฐ€์ด๋“œ
755
+
756
+ ## 1๏ธโƒฃ My ๋ฐ์ดํ„ฐ์…‹+LLM ํƒญ
757
+ ![Tab1](https://your-image-url.com/tab1.png)
758
+ ### ๊ธฐ๋Šฅ
759
+ - ์—…๋กœ๋“œ๋œ Parquet ๋ฐ์ดํ„ฐ์…‹์„ ๊ธฐ๋ฐ˜์œผ๋กœ LLM๊ณผ ๋Œ€ํ™”
760
+ - ๋ฐ์ดํ„ฐ์…‹์˜ ๋‚ด์šฉ์„ ํ™œ์šฉํ•œ ์ฝ˜ํ…์ธ  ์ƒ์„ฑ
761
+
762
+ ### ์‚ฌ์šฉ ๋ฐฉ๋ฒ•
763
+ 1. Parquet ํŒŒ์ผ ์—…๋กœ๋“œ ์„น์…˜์—์„œ ๋ฐ์ดํ„ฐ์…‹ ํŒŒ์ผ์„ ์—…๋กœ๋“œ
764
+ 2. ์ฑ„ํŒ…์ฐฝ์— ์›ํ•˜๋Š” ์งˆ๋ฌธ์ด๋‚˜ ์š”์ฒญ์‚ฌํ•ญ ์ž…๋ ฅ
765
+ 3. ์˜ˆ์ œ ๋ฒ„ํŠผ์„ ํ™œ์šฉํ•˜์—ฌ ๋‹ค์–‘ํ•œ ํ™œ์šฉ ์‚ฌ๋ก€ ์ฒดํ—˜
766
+
767
+ ### ํŒ
768
+ - ์‹œ์Šคํ…œ ํ”„๋กฌํ”„ํŠธ ์„ค์ •์œผ๋กœ ์‘๋‹ต ์Šคํƒ€์ผ ์กฐ์ • ๊ฐ€๋Šฅ
769
+ - ์ƒ์„ธํ•œ ์งˆ๋ฌธ์ผ์ˆ˜๋ก ๋” ์ •ํ™•ํ•œ ๋‹ต๋ณ€ ์ œ๊ณต
770
+
771
+ ---
772
+
773
+ ## 2๏ธโƒฃ CSV to My ๋ฐ์ดํ„ฐ์…‹ ํƒญ
774
+ ![Tab2](https://your-image-url.com/tab2.png)
775
+ ### ๊ธฐ๋Šฅ
776
+ - CSV ํŒŒ์ผ์„ Parquet ํ˜•์‹์œผ๋กœ ๋ณ€ํ™˜
777
+ - ๋ฐ์ดํ„ฐ ์ตœ์ ํ™” ๋ฐ ์ •์ œ
778
+
779
+ ### ์‚ฌ์šฉ ๋ฐฉ๋ฒ•
780
+ 1. CSV ํŒŒ์ผ ์ค€๋น„ (ํ•„์ˆ˜ ์ปฌ๋Ÿผ: id, text, label, metadata)
781
+ 2. ํŒŒ์ผ ์—…๋กœ๋“œ ํ›„ '์—…๋กœ๋“œ ๋ฐ ๋ณ€ํ™˜' ๋ฒ„ํŠผ ํด๋ฆญ
782
+ 3. ๋ณ€ํ™˜๋œ Parquet ํŒŒ์ผ ๋‹ค์šด๋กœ๋“œ
783
+
784
+ ### ์ฃผ์˜์‚ฌํ•ญ
785
+ - CSV ํŒŒ์ผ์€ ๋ฐ˜๋“œ์‹œ ํ•„์ˆ˜ ์ปฌ๋Ÿผ์„ ํฌํ•จํ•ด์•ผ ํ•จ
786
+ - ์ธ์ฝ”๋”ฉ์€ UTF-8 ๊ถŒ์žฅ
787
+
788
+ ---
789
+
790
+ ## 3๏ธโƒฃ Text to My ๋ฐ์ดํ„ฐ์…‹ ํƒญ
791
+ ![Tab3](https://your-image-url.com/tab3.png)
792
+ ### ๊ธฐ๋Šฅ
793
+ - ํ…์ŠคํŠธ ํ˜•์‹์˜ ๋ฐ์ดํ„ฐ๋ฅผ Parquet์œผ๋กœ ๋ณ€ํ™˜
794
+ - ์ˆ˜๋™ ๋ฐ์ดํ„ฐ ์ž…๋ ฅ ์ง€์›
795
+
796
+ ### ์‚ฌ์šฉ ๋ฐฉ๋ฒ•
797
+ 1. ์ง€์ •๋œ ํ˜•์‹์œผ๋กœ ํ…์ŠคํŠธ ์ž…๋ ฅ
798
+ ```
799
+ 1,"์ด์ˆœ์‹ ","์žฅ๊ตฐ","๊ฑฐ๋ถ์„ "
800
+ 2,"์›๊ท ","์žฅ๊ตฐ","๋ชจํ•จ"
801
+ ```
802
+ 2. '๋ณ€ํ™˜ ๋ฐ ๋‹ค์šด๋กœ๋“œ' ๋ฒ„ํŠผ ํด๋ฆญ
803
+ 3. ๋ณ€ํ™˜๋œ ํŒŒ์ผ ํ™•์ธ ๋ฐ ๋‹ค์šด๋กœ๋“œ
804
+
805
+ ### ์ž…๋ ฅ ํ˜•์‹
806
+ - id: ์ˆœ์ฐจ์  ๋ฒˆํ˜ธ
807
+ - text: ์‹ค์ œ ํ…์ŠคํŠธ ๋‚ด์šฉ
808
+ - label: ๋ถ„๋ฅ˜ ๋ผ๋ฒจ
809
+ - metadata: ๋ถ€๊ฐ€ ์ •๋ณด
810
+
811
+ ---
812
+
813
+ ## 4๏ธโƒฃ Text Preprocessing with LLM ํƒญ
814
+ ![Tab4](https://your-image-url.com/tab4.png)
815
+ ### ๊ธฐ๋Šฅ
816
+ - LLM์„ ํ™œ์šฉํ•œ ์ž๋™ ํ…์ŠคํŠธ ์ „์ฒ˜๋ฆฌ
817
+ - ๊ตฌ์กฐํ™”๋œ ๋ฐ์ดํ„ฐ์…‹ ์ƒ์„ฑ
818
+
819
+ ### ์‚ฌ์šฉ ๋ฐฉ๋ฒ•
820
+ 1. ์›๋ฌธ ํ…์ŠคํŠธ ์ž…๋ ฅ
821
+ 2. '์ „์ฒ˜๋ฆฌ ์‹คํ–‰' ๋ฒ„ํŠผ ํด๋ฆญ
822
+ 3. ๊ฒฐ๊ณผ ํ™•์ธ ํ›„ ํ•„์š”์‹œ Parquet ๋ณ€ํ™˜
823
+
824
+ ### ํŠน์ง•
825
+ - ์ž๋™ ๋ ˆ์ด๋ธ”๋ง
826
+ - ๋ฌธ์žฅ ๋‹จ์œ„ ๋ถ„๋ฆฌ
827
+ - ์ค‘๋ณต ์ œ๊ฑฐ
828
+ - ๋ฐ์ดํ„ฐ ์ •๊ทœํ™”
829
+
830
+ ## ๐Ÿ’ก ์ผ๋ฐ˜์ ์ธ ํŒ
831
+ - ๊ฐ ํƒญ์˜ ์˜ˆ์ œ๋ฅผ ์ฐธ๊ณ ํ•˜์—ฌ ์‚ฌ์šฉ๋ฒ• ์ตํžˆ๊ธฐ
832
+ - ๋ฐ์ดํ„ฐ ํ’ˆ์งˆ์ด ์ข‹์„์ˆ˜๋ก ๋” ๋‚˜์€ ๊ฒฐ๊ณผ ์ œ๊ณต
833
+ - ์˜ค๋ฅ˜ ๋ฐœ์ƒ ์‹œ ์ž…๋ ฅ ๋ฐ์ดํ„ฐ ํ˜•์‹ ํ™•์ธ
834
+ - ๋Œ€์šฉ๋Ÿ‰ ์ฒ˜๋ฆฌ ์‹œ ์ ์ ˆํ•œ ์ฒญํฌ ํฌ๊ธฐ๋กœ ๋ถ„ํ•  ์ฒ˜๋ฆฌ
835
+
836
+ ## โš ๏ธ ์ฃผ์˜์‚ฌํ•ญ
837
+ - ๋ฏผ๊ฐํ•œ ๊ฐœ์ธ์ •๋ณด ํฌํ•จํ•˜์ง€ ์•Š๊ธฐ
838
+ - ๋ฐ์ดํ„ฐ ๋ฐฑ์—… ๊ถŒ์žฅ
839
+ - ๋„คํŠธ์›Œํฌ ์ƒํƒœ ํ™•์ธ
840
+ - ๋ธŒ๋ผ์šฐ์ € ์บ์‹œ ์ฃผ๊ธฐ์  ์ •๋ฆฌ
841
+
842
+ ## ๐Ÿ” ๋ฌธ์ œ ํ•ด๊ฒฐ
843
+ - ์˜ค๋ฅ˜ ๋ฐœ์ƒ ์‹œ ์ž…๋ ฅ ๋ฐ์ดํ„ฐ ํ˜•์‹ ํ™•์ธ
844
+ - ํŒŒ์ผ ์—…๋กœ๋“œ ์‹คํŒจ ์‹œ ํŒŒ์ผ ํฌ๊ธฐ ๋ฐ ํ˜•์‹ ํ™•์ธ
845
+ - ๋ณ€ํ™˜ ์‹คํŒจ ์‹œ ๋ฐ์ดํ„ฐ ์ธ์ฝ”๋”ฉ ํ™•์ธ
846
+ - ์‘๋‹ต์ด ๋Š๋ฆด ๊ฒฝ์šฐ ๋ฐ์ดํ„ฐ ํฌ๊ธฐ ์กฐ์ •
847
+ """)
848
+
849
+
850
+ gr.Markdown("### [email protected]", elem_id="initial-description")
851
+
852
+ if __name__ == "__main__":
853
+ demo.launch(share=True)