ginipick committed on
Commit
b382c41
·
verified ·
1 Parent(s): df8954c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +42 -38
app.py CHANGED
@@ -285,59 +285,63 @@ def respond(message: str, history: List[Dict[str, str]], system_message: str = "
285
 
286
  def text_to_parquet(text: str) -> Tuple[str, str, str]:
287
  try:
288
- from io import StringIO
289
- import csv
290
 
291
- # μž…λ ₯ ν…μŠ€νŠΈ μ •μ œ
292
- lines = text.strip().split('\n')
293
- cleaned_lines = []
294
 
295
  for line in lines:
296
- # 빈 쀄 κ±΄λ„ˆλ›°κΈ°
297
- if not line.strip():
298
- continue
299
-
300
- # μŒλ”°μ˜΄ν‘œ μ •κ·œν™”
301
- line = line.replace('""', '"') # 쀑볡 μŒλ”°μ˜΄ν‘œ 처리
302
-
303
- # CSV νŒŒμ‹±μ„ μœ„ν•œ μž„μ‹œ StringIO 객체 생성
304
- temp_buffer = StringIO(line)
305
  try:
306
- # CSV 라인 νŒŒμ‹± μ‹œλ„
307
- reader = csv.reader(temp_buffer, quoting=csv.QUOTE_ALL)
308
- parsed_line = next(reader)
309
- if len(parsed_line) == 4: # id, text, label, metadata
310
- # 각 ν•„λ“œλ₯Ό 적절히 ν¬λ§·νŒ…
311
- formatted_line = f'{parsed_line[0]},"{parsed_line[1]}","{parsed_line[2]}","{parsed_line[3]}"'
312
- cleaned_lines.append(formatted_line)
313
- except:
 
 
 
 
 
 
 
 
 
 
 
 
 
314
  continue
315
- finally:
316
- temp_buffer.close()
317
 
318
- # μ •μ œλœ CSV 데이터 생성
319
- cleaned_csv = '\n'.join(cleaned_lines)
320
 
321
  # DataFrame 생성
322
- df = pd.read_csv(
323
- StringIO(cleaned_csv),
324
- sep=',',
325
- quoting=csv.QUOTE_ALL,
326
- escapechar='\\',
327
- names=['id', 'text', 'label', 'metadata']
328
- )
329
 
330
- # 데이터 μœ ν˜• μ΅œμ ν™”
331
- df = df.astype({'id': 'int32', 'text': 'string', 'label': 'string', 'metadata': 'string'})
 
 
 
 
 
332
 
333
  # Parquet 파일둜 λ³€ν™˜
334
  parquet_filename = 'text_to_parquet.parquet'
335
  df.to_parquet(parquet_filename, engine='pyarrow', compression='snappy')
336
 
337
- # Parquet 파일 λ‚΄μš© 미리보기
338
- parquet_content = load_parquet(parquet_filename)
339
 
340
- return f"{parquet_filename} 파일이 μ„±κ³΅μ μœΌλ‘œ λ³€ν™˜λ˜μ—ˆμŠ΅λ‹ˆλ‹€.", parquet_content, parquet_filename
 
 
 
 
341
 
342
  except Exception as e:
343
  error_message = f"ν…μŠ€νŠΈ λ³€ν™˜ 쀑 였λ₯˜κ°€ λ°œμƒν–ˆμŠ΅λ‹ˆλ‹€: {str(e)}"
 
285
 
286
  def text_to_parquet(text: str) -> Tuple[str, str, str]:
287
  try:
288
+ # μž…λ ₯ ν…μŠ€νŠΈλ₯Ό 쀄 λ‹¨μœ„λ‘œ 뢄리
289
+ lines = [line.strip() for line in text.split('\n') if line.strip()]
290
 
291
+ # 데이터λ₯Ό μ €μž₯ν•  리슀트
292
+ data = []
 
293
 
294
  for line in lines:
 
 
 
 
 
 
 
 
 
295
  try:
296
+ # μ •κ·œμ‹μ„ μ‚¬μš©ν•˜μ—¬ CSV ν˜•μ‹ νŒŒμ‹±
297
+ import re
298
+ pattern = r'(\d+),([^,]+),([^,]+),(.+)'
299
+ match = re.match(pattern, line)
300
+
301
+ if match:
302
+ id_val, text_val, label_val, metadata_val = match.groups()
303
+
304
+ # μŒλ”°μ˜΄ν‘œ 제거 및 μ •μ œ
305
+ text_val = text_val.strip().strip('"')
306
+ label_val = label_val.strip().strip('"')
307
+ metadata_val = metadata_val.strip().strip('"')
308
+
309
+ data.append({
310
+ 'id': int(id_val),
311
+ 'text': text_val,
312
+ 'label': label_val,
313
+ 'metadata': metadata_val
314
+ })
315
+ except Exception as e:
316
+ print(f"라인 νŒŒμ‹± 였λ₯˜: {line}\n{str(e)}")
317
  continue
 
 
318
 
319
+ if not data:
320
+ return "λ³€ν™˜ν•  데이터가 μ—†μŠ΅λ‹ˆλ‹€.", "", ""
321
 
322
  # DataFrame 생성
323
+ df = pd.DataFrame(data)
 
 
 
 
 
 
324
 
325
+ # 데이터 νƒ€μž… μ„€μ •
326
+ df = df.astype({
327
+ 'id': 'int32',
328
+ 'text': 'string',
329
+ 'label': 'string',
330
+ 'metadata': 'string'
331
+ })
332
 
333
  # Parquet 파일둜 λ³€ν™˜
334
  parquet_filename = 'text_to_parquet.parquet'
335
  df.to_parquet(parquet_filename, engine='pyarrow', compression='snappy')
336
 
337
+ # 미리보기 생성
338
+ preview = df.to_markdown(index=False)
339
 
340
+ return (
341
+ f"{parquet_filename} 파일이 μ„±κ³΅μ μœΌλ‘œ λ³€ν™˜λ˜μ—ˆμŠ΅λ‹ˆλ‹€. 총 {len(df)}개의 λ ˆμ½”λ“œκ°€ μ²˜λ¦¬λ˜μ—ˆμŠ΅λ‹ˆλ‹€.",
342
+ preview,
343
+ parquet_filename
344
+ )
345
 
346
  except Exception as e:
347
  error_message = f"ν…μŠ€νŠΈ λ³€ν™˜ 쀑 였λ₯˜κ°€ λ°œμƒν–ˆμŠ΅λ‹ˆλ‹€: {str(e)}"