Update app.py
app.py
CHANGED
@@ -285,59 +285,63 @@ def respond(message: str, history: List[Dict[str, str]], system_message: str = "
 
 def text_to_parquet(text: str) -> Tuple[str, str, str]:
     try:
-        cleaned_lines = []
+        # Split the input text into lines
+        lines = [line.strip() for line in text.split('\n') if line.strip()]
+
+        # List that will hold the parsed records
+        data = []
 
         for line in lines:
-            # Skip empty lines
-            if not line.strip():
-                continue
-
-            # Normalize double quotes
-            line = line.replace('""', '"')  # handle duplicated double quotes
-
-            # Create a temporary StringIO object for CSV parsing
-            temp_buffer = StringIO(line)
             try:
-                # CSV
+                # Parse the CSV-style line with a regular expression
+                import re
+                pattern = r'(\d+),([^,]+),([^,]+),(.+)'
+                match = re.match(pattern, line)
+
+                if match:
+                    id_val, text_val, label_val, metadata_val = match.groups()
+
+                    # Strip surrounding quotes and whitespace
+                    text_val = text_val.strip().strip('"')
+                    label_val = label_val.strip().strip('"')
+                    metadata_val = metadata_val.strip().strip('"')
+
+                    data.append({
+                        'id': int(id_val),
+                        'text': text_val,
+                        'label': label_val,
+                        'metadata': metadata_val
+                    })
+            except Exception as e:
+                print(f"Line parsing error: {line}\n{str(e)}")
                 continue
-            finally:
-                temp_buffer.close()
 
+        if not data:
+            return "There is no data to convert.", "", ""
+
         # Create DataFrame
-        df = pd.read_csv(
-            StringIO(cleaned_csv),
-            sep=',',
-            quoting=csv.QUOTE_ALL,
-            escapechar='\\',
-            names=['id', 'text', 'label', 'metadata']
-        )
+        df = pd.DataFrame(data)
 
-        # Data types
-        df = df.astype({
+        # Set data types
+        df = df.astype({
+            'id': 'int32',
+            'text': 'string',
+            'label': 'string',
+            'metadata': 'string'
+        })
 
         # Convert to a Parquet file
         parquet_filename = 'text_to_parquet.parquet'
         df.to_parquet(parquet_filename, engine='pyarrow', compression='snappy')
 
-        return
+        # Generate a preview
+        preview = df.to_markdown(index=False)
+
+        return (
+            f"The file {parquet_filename} was converted successfully. A total of {len(df)} records were processed.",
+            preview,
+            parquet_filename
+        )
 
     except Exception as e:
         error_message = f"An error occurred while converting the text: {str(e)}"
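For reference, a minimal standalone sketch of the regex-based parsing path added above. The sample input lines are hypothetical, and it assumes pandas with pyarrow is installed (df.to_markdown additionally requires tabulate).

# Minimal sketch of the new parsing path; the sample lines below are made up.
import re
import pandas as pd

sample = '1,"hello world","greeting","lang=en"\n2,"good morning","greeting","lang=en"'

pattern = r'(\d+),([^,]+),([^,]+),(.+)'  # id, text, label, metadata
data = []
for line in (l.strip() for l in sample.split('\n') if l.strip()):
    match = re.match(pattern, line)
    if match:
        id_val, text_val, label_val, metadata_val = match.groups()
        data.append({
            'id': int(id_val),
            'text': text_val.strip().strip('"'),
            'label': label_val.strip().strip('"'),
            'metadata': metadata_val.strip().strip('"'),
        })

df = pd.DataFrame(data).astype(
    {'id': 'int32', 'text': 'string', 'label': 'string', 'metadata': 'string'}
)
df.to_parquet('text_to_parquet.parquet', engine='pyarrow', compression='snappy')
print(df.to_markdown(index=False))  # same preview string the function returns

Because the text and label groups use ([^,]+), commas inside those quoted fields are not supported; only the trailing metadata group can contain commas.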