myezrag

Running

App Files Files Community

ginipick committed on Oct 26, 2024

Commit

b382c41

verified ·

1 Parent(s): df8954c

Update app.py

Browse files

Files changed (1) hide show

app.py +42 -38

app.py CHANGED Viewed

@@ -285,59 +285,63 @@ def respond(message: str, history: List[Dict[str, str]], system_message: str = "
 def text_to_parquet(text: str) -> Tuple[str, str, str]:
     try:
-        from io import StringIO
-        import csv
-        # 입력 텍스트 정제
-        lines = text.strip().split('\n')
-        cleaned_lines = []
         for line in lines:
-            # 빈 줄 건너뛰기
-            if not line.strip():
-                continue
-            # 쌍따옴표 정규화
-            line = line.replace('""', '"')  # 중복 쌍따옴표 처리
-            # CSV 파싱을 위한 임시 StringIO 객체 생성
-            temp_buffer = StringIO(line)
             try:
-                # CSV 라인 파싱 시도
-                reader = csv.reader(temp_buffer, quoting=csv.QUOTE_ALL)
-                parsed_line = next(reader)
-                if len(parsed_line) == 4:  # id, text, label, metadata
-                    # 각 필드를 적절히 포맷팅
-                    formatted_line = f'{parsed_line[0]},"{parsed_line[1]}","{parsed_line[2]}","{parsed_line[3]}"'
-                    cleaned_lines.append(formatted_line)
-            except:
                 continue
-            finally:
-                temp_buffer.close()
-        # 정제된 CSV 데이터 생성
-        cleaned_csv = '\n'.join(cleaned_lines)
         # DataFrame 생성
-        df = pd.read_csv(
-            StringIO(cleaned_csv),
-            sep=',',
-            quoting=csv.QUOTE_ALL,
-            escapechar='\\',
-            names=['id', 'text', 'label', 'metadata']
-        )
-        # 데이터 유형 최적화
-        df = df.astype({'id': 'int32', 'text': 'string', 'label': 'string', 'metadata': 'string'})
         # Parquet 파일로 변환
         parquet_filename = 'text_to_parquet.parquet'
         df.to_parquet(parquet_filename, engine='pyarrow', compression='snappy')
-        # Parquet 파일 내용 미리보기
-        parquet_content = load_parquet(parquet_filename)
-        return f"{parquet_filename} 파일이 성공적으로 변환되었습니다.", parquet_content, parquet_filename
     except Exception as e:
         error_message = f"텍스트 변환 중 오류가 발생했습니다: {str(e)}"

 def text_to_parquet(text: str) -> Tuple[str, str, str]:
     try:
+        # 입력 텍스트를 줄 단위로 분리
+        lines = [line.strip() for line in text.split('\n') if line.strip()]
+        # 데이터를 저장할 리스트
+        data = []
         for line in lines:
             try:
+                # 정규식을 사용하여 CSV 형식 파싱
+                import re
+                pattern = r'(\d+),([^,]+),([^,]+),(.+)'
+                match = re.match(pattern, line)
+                if match:
+                    id_val, text_val, label_val, metadata_val = match.groups()
+                    # 쌍따옴표 제거 및 정제
+                    text_val = text_val.strip().strip('"')
+                    label_val = label_val.strip().strip('"')
+                    metadata_val = metadata_val.strip().strip('"')
+                    data.append({
+                        'id': int(id_val),
+                        'text': text_val,
+                        'label': label_val,
+                        'metadata': metadata_val
+                    })
+            except Exception as e:
+                print(f"라인 파싱 오류: {line}\n{str(e)}")
                 continue
+        if not data:
+            return "변환할 데이터가 없습니다.", "", ""
         # DataFrame 생성
+        df = pd.DataFrame(data)
+        # 데이터 타입 설정
+        df = df.astype({
+            'id': 'int32',
+            'text': 'string',
+            'label': 'string',
+            'metadata': 'string'
+        })
         # Parquet 파일로 변환
         parquet_filename = 'text_to_parquet.parquet'
         df.to_parquet(parquet_filename, engine='pyarrow', compression='snappy')
+        # 미리보기 생성
+        preview = df.to_markdown(index=False)
+        return (
+            f"{parquet_filename} 파일이 성공적으로 변환되었습니다. 총 {len(df)}개의 레코드가 처리되었습니다.",
+            preview,
+            parquet_filename
+        )
     except Exception as e:
         error_message = f"텍스트 변환 중 오류가 발생했습니다: {str(e)}"