myezrag

Running

App Files Files Community

ginipick committed on Oct 25, 2024

Commit

a958a41

verified ·

1 Parent(s): d81053d

Update app.py

Browse files

Files changed (1) hide show

app.py +76 -155

app.py CHANGED Viewed

@@ -3,80 +3,16 @@ from huggingface_hub import InferenceClient
 import os
 import pandas as pd
 from typing import List, Dict, Tuple
 import io
 import traceback
 import csv
-from functools import lru_cache
-from concurrent.futures import ThreadPoolExecutor
-import nltk
-from nltk.tokenize import sent_tokenize
-from transformers import AutoTokenizer
-# NLTK 데이터 다운로드
-nltk.download('punkt')
 # 추론 API 클라이언트 설정
 hf_client = InferenceClient(
     "CohereForAI/c4ai-command-r-plus-08-2024", token=os.getenv("HF_TOKEN")
 )
-def chunk_text(text: str, chunk_size: int = 500) -> List[str]:
-    """텍스트를 더 작은 청크로 분할"""
-    tokenizer = AutoTokenizer.from_pretrained("CohereForAI/c4ai-command-r-plus-08-2024")
-    sentences = sent_tokenize(text)
-    chunks = []
-    current_chunk = []
-    current_length = 0
-    for sentence in sentences:
-        sentence = sentence.strip()
-        tokenized_sentence = tokenizer.encode(sentence, add_special_tokens=False)
-        sentence_length = len(tokenized_sentence)
-        if current_length + sentence_length > chunk_size:
-            if current_chunk:
-                chunks.append(' '.join(current_chunk))
-            current_chunk = [sentence]
-            current_length = sentence_length
-        else:
-            current_chunk.append(sentence)
-            current_length += sentence_length
-    if current_chunk:
-        chunks.append(' '.join(current_chunk))
-    return chunks
-# 나머지 코드는 이전과 동일하게 유지
-@lru_cache(maxsize=100)
-def cached_preprocess(text: str) -> str:
-    """자주 사용되는 텍스트에 대한 전처리 결과를 캐싱"""
-    return preprocess_single_chunk(text)
-def preprocess_single_chunk(chunk: str) -> str:
-    """단일 청크에 대한 전처리 수행"""
-    system_prompt = """당신은 데이터 전처리 전문가입니다. 입력된 텍스트를 CSV 데이터셋 형식으로 빠르게 변환하세요.
-    [기존 규칙 동일]"""
-    full_prompt = f"{system_prompt}\n\n입력텍스트:\n{chunk}\n\n출력:"
-    try:
-        # 스트리밍 비활성화 및 파라미터 최적화
-        response = hf_client.text_generation(
-            prompt=full_prompt,
-            max_new_tokens=2000,  # 토큰 수 제한
-            temperature=0.1,      # 더 결정적인 출력
-            top_p=0.5,            # 더 집중된 출력
-            stream=False          # 스트리밍 비활성화
-        )
-        return response.strip()
-    except Exception as e:
-        print(f"청크 처리 중 오류 발생: {str(e)}\n{traceback.format_exc()}")
-        return "청크 처리 중 오류가 발생했습니다. 관리자에게 문의하세요."
 def load_code(filename: str) -> str:
     try:
         with open(filename, 'r', encoding='utf-8') as file:
@@ -84,8 +20,7 @@ def load_code(filename: str) -> str:
     except FileNotFoundError:
         return f"{filename} 파일을 찾을 수 없습니다."
     except Exception as e:
-        print(f"파일 읽기 오류: {str(e)}\n{traceback.format_exc()}")
-        return "파일을 읽는 중 오류가 발생했습니다. 관리자에게 문의하세요."
 def load_parquet(filename: str) -> str:
     try:
@@ -94,8 +29,7 @@ def load_parquet(filename: str) -> str:
     except FileNotFoundError:
         return f"{filename} 파일을 찾을 수 없습니다."
     except Exception as e:
-        print(f"Parquet 파일 로드 오류: {str(e)}\n{traceback.format_exc()}")
-        return "파일을 읽는 중 오류가 발생했습니다. 관리자에게 문의하세요."
 def respond(
     message: str,
@@ -144,9 +78,9 @@ def respond(
                 response += msg
                 yield response
     except Exception as e:
-        error_message = f"추론 중 오류가 발생했습니다: {str(e)}"
         print(error_message)
-        yield "추론 중 오류가 발생했습니다. 관리자에게 문의하세요."
 def upload_csv(file_path: str) -> Tuple[str, str]:
     try:
@@ -168,8 +102,7 @@ def upload_csv(file_path: str) -> Tuple[str, str]:
         df.to_parquet(parquet_filename, engine='pyarrow', compression='snappy')
         return f"{parquet_filename} 파일이 성공적으로 업로드되고 변환되었습니다.", parquet_filename
     except Exception as e:
-        print(f"CSV 파일 업로드 및 변환 중 오류 발생: {str(e)}\n{traceback.format_exc()}")
-        return "CSV 파일 업로드 및 변환 중 오류가 발생했습니다. 관리자에게 문의하세요.", ""
 def upload_parquet(file_path: str) -> Tuple[str, str, str]:
     try:
@@ -181,26 +114,25 @@ def upload_parquet(file_path: str) -> Tuple[str, str, str]:
         parquet_json = df.to_json(orient='records', force_ascii=False)
         return "Parquet 파일이 성공적으로 업로드되었습니다.", parquet_content, parquet_json
     except Exception as e:
-        print(f"Parquet 파일 업로드 중 오류 발생: {str(e)}\n{traceback.format_exc()}")
-        return "Parquet 파일 업로드 중 오류가 발생했습니다. 관리자에게 문의하세요.", "", ""
 def text_to_parquet(text: str) -> Tuple[str, str, str]:
     try:
         from io import StringIO
         import csv
         # 입력 텍스트 정제
         lines = text.strip().split('\n')
         cleaned_lines = []
         for line in lines:
             # 빈 줄 건너뛰기
             if not line.strip():
                 continue
             # 쌍따옴표 정규화
             line = line.replace('""', '"')  # 중복 쌍따옴표 처리
             # CSV 파싱을 위한 임시 StringIO 객체 생성
             temp_buffer = StringIO(line)
             try:
@@ -211,14 +143,14 @@ def text_to_parquet(text: str) -> Tuple[str, str, str]:
                     # 각 필드를 적절히 포맷팅
                     formatted_line = f'{parsed_line[0]},"{parsed_line[1]}","{parsed_line[2]}","{parsed_line[3]}"'
                     cleaned_lines.append(formatted_line)
-            except Exception as e:
                 continue
             finally:
                 temp_buffer.close()
         # 정제된 CSV 데이터 생성
         cleaned_csv = '\n'.join(cleaned_lines)
         # DataFrame 생성
         df = pd.read_csv(
             StringIO(cleaned_csv),
@@ -227,28 +159,28 @@ def text_to_parquet(text: str) -> Tuple[str, str, str]:
             escapechar='\\',
             names=['id', 'text', 'label', 'metadata']
         )
         # 데이터 유형 최적화
         df = df.astype({'id': 'int32', 'text': 'string', 'label': 'string', 'metadata': 'string'})
         # Parquet 파일로 변환
         parquet_filename = 'text_to_parquet.parquet'
         df.to_parquet(parquet_filename, engine='pyarrow', compression='snappy')
         # Parquet 파일 내용 미리보기
         parquet_content = load_parquet(parquet_filename)
         return f"{parquet_filename} 파일이 성공적으로 변환되었습니다.", parquet_content, parquet_filename
     except Exception as e:
         error_message = f"텍스트 변환 중 오류가 발생했습니다: {str(e)}"
         print(f"{error_message}\n{traceback.format_exc()}")
-        return "텍스트 변환 중 오류가 발생했습니다. 관리자에게 문의하세요.", "", ""
 def preprocess_text_with_llm(input_text: str) -> str:
     if not input_text.strip():
-        return "입력 텍스트가 없습니다."
     system_prompt = """당신은 데이터 전처리 전문가입니다. 입력된 텍스트를 CSV 데이터셋 형식으로 변환하세요.
 규칙:
@@ -277,53 +209,41 @@ def preprocess_text_with_llm(input_text: str) -> str:
 - 각 행은 새로운 줄로 구분
 - 불필요한 반복 출력 금지"""
-    try:
-        # 텍스트를 청크로 분할
-        chunks = chunk_text(input_text)
-        # 병렬 처리로 청크들을 처리
-        with ThreadPoolExecutor(max_workers=3) as executor:
-            futures = []
-            for chunk in chunks:
-                # 각 청크에 대한 프롬프트 생성
-                chunk_prompt = f"{system_prompt}\n\n입력텍스트:\n{chunk}\n\n출력:"
-                future = executor.submit(
-                    hf_client.text_generation,
-                    prompt=chunk_prompt,
-                    max_new_tokens=2000,
-                    temperature=0.1,
-                    top_p=0.5,
-                    stream=False
-                )
-                futures.append(future)
-            processed_chunks = [future.result() for future in futures]
-        # 결과 병합 및 중복 제거
-        all_lines = []
         seen_texts = set()
-        current_id = 1
-        for chunk_result in processed_chunks:
-            # EOS_TOKEN 처리
-            if "<EOS_TOKEN>" in chunk_result:
-                chunk_result = chunk_result.split("<EOS_TOKEN>")[0]
-            lines = chunk_result.strip().split('\n')
-            for line in lines:
-                line = line.strip()
-                if line and '출력:' not in line and line not in seen_texts:
-                    # ID 재할당
-                    parts = line.split(',', 1)
-                    if len(parts) > 1:
-                        new_line = f"{current_id},{parts[1]}"
-                        if new_line not in seen_texts:  # 추가적인 중복 검사
-                            all_lines.append(new_line)
-                            seen_texts.add(new_line)
-                            current_id += 1
-        processed_text = '\n'.join(all_lines)
         # CSV 형식 검증
         try:
             from io import StringIO
@@ -332,11 +252,11 @@ def preprocess_text_with_llm(input_text: str) -> str:
             return processed_text
         except csv.Error:
             return "LLM이 올바른 CSV 형식을 생성하지 못했습니다. 다시 시도해주세요."
     except Exception as e:
         error_message = f"전처리 중 오류가 발생했습니다: {str(e)}"
         print(error_message)
-        return "전처리 중 오류가 발생했습니다. 관리자에게 문의하세요."
 # CSS 설정
 css = """
@@ -375,6 +295,8 @@ with gr.Blocks(css=css) as demo:
         elem_id="initial-description"
     )
     # 첫 번째 탭: 챗봇 데이터 업로드 (탭 이름 변경: "My 데이터셋+LLM")
     with gr.Tab("My 데이터셋+LLM"):
         gr.Markdown("### LLM과 대화하기")
@@ -418,8 +340,7 @@ with gr.Blocks(css=css) as demo:
                 # 어시스턴트의 응답을 히스토리에 추가
                 history.append({"role": "assistant", "content": partial_response})
             except Exception as e:
-                print(f"메시지 처리 중 오류 발생: {str(e)}\n{traceback.format_exc()}")
-                response = "메시지 처리 중 오류가 발생했습니다. 관리자에게 문의하세요."
                 history.append({"role": "assistant", "content": response})
                 yield history, ""
@@ -540,48 +461,49 @@ with gr.Blocks(css=css) as demo:
                     lines=15,
                     placeholder="여기에 전처리할 텍스트를 입력하세요..."
                 )
                 with gr.Row():
                     preprocess_button = gr.Button("전처리 실행", variant="primary")
                     clear_button = gr.Button("초기화")
                 preprocess_status = gr.Textbox(
                     label="전처리 상태",
                     interactive=False,
                     value="대기 중..."
                 )
                 processed_text_output = gr.Textbox(
                     label="전처리된 데이터셋 출력",
                     lines=15,
                     interactive=False
                 )
                 # Parquet 변환 및 다운로드 섹션
                 convert_to_parquet_button = gr.Button("Parquet으로 변환")
                 download_parquet = gr.File(label="변환된 Parquet 파일 다운로드")
                 def handle_text_preprocessing(input_text: str):
                     if not input_text.strip():
-                        yield "입력 텍스트가 없습니다.", ""
-                        return
                     try:
                         preprocess_status_msg = "전처리를 시작합니다..."
                         yield preprocess_status_msg, ""
                         processed_text = preprocess_text_with_llm(input_text)
                         if processed_text:
                             preprocess_status_msg = "전처리가 완료되었습니다."
                             yield preprocess_status_msg, processed_text
                         else:
                             preprocess_status_msg = "전처리 결과가 없습니다."
                             yield preprocess_status_msg, ""
                     except Exception as e:
-                        error_msg = "전처리 중 오류가 발생했습니다. 관리자에게 문의하세요."
-                        print(f"전처리 중 오류 발생: {str(e)}\n{traceback.format_exc()}")
                         yield error_msg, ""
                 def clear_inputs():
@@ -590,15 +512,14 @@ with gr.Blocks(css=css) as demo:
                 def convert_to_parquet_file(processed_text: str):
                     if not processed_text.strip():
                         return "변환할 텍스트가 없습니다.", None
                     try:
                         message, parquet_content, parquet_filename = text_to_parquet(processed_text)
                         if parquet_filename:
                             return message, parquet_filename
                         return message, None
                     except Exception as e:
-                        print(f"Parquet 변환 중 오류 발생: {str(e)}\n{traceback.format_exc()}")
-                        return "Parquet 변환 중 오류가 발생했습니다. 관리자에게 문의하세요.", None
                 # 이벤트 핸들러 연결
                 preprocess_button.click(
@@ -633,5 +554,5 @@ with gr.Blocks(css=css) as demo:
     gr.Markdown("### [email protected]", elem_id="initial-description")
 if __name__ == "__main__":
-    demo.launch(share=True)  # 코드상의 오류나 개선이 필요한 사항을 추론하여 보고하라

 import os
 import pandas as pd
 from typing import List, Dict, Tuple
+import json
 import io
 import traceback
 import csv
 # 추론 API 클라이언트 설정
 hf_client = InferenceClient(
     "CohereForAI/c4ai-command-r-plus-08-2024", token=os.getenv("HF_TOKEN")
 )
 def load_code(filename: str) -> str:
     try:
         with open(filename, 'r', encoding='utf-8') as file:
     except FileNotFoundError:
         return f"{filename} 파일을 찾을 수 없습니다."
     except Exception as e:
+        return f"파일을 읽는 중 오류가 발생했습니다: {str(e)}"
 def load_parquet(filename: str) -> str:
     try:
     except FileNotFoundError:
         return f"{filename} 파일을 찾을 수 없습니다."
     except Exception as e:
+        return f"파일을 읽는 중 오류가 발생했습니다: {str(e)}"
 def respond(
     message: str,
                 response += msg
                 yield response
     except Exception as e:
+        error_message = f"추론 중 오류가 발생했습니다: {str(e)}\n{traceback.format_exc()}"
         print(error_message)
+        yield error_message
 def upload_csv(file_path: str) -> Tuple[str, str]:
     try:
         df.to_parquet(parquet_filename, engine='pyarrow', compression='snappy')
         return f"{parquet_filename} 파일이 성공적으로 업로드되고 변환되었습니다.", parquet_filename
     except Exception as e:
+        return f"CSV 파일 업로드 및 변환 중 오류가 발생했습니다: {str(e)}", ""
 def upload_parquet(file_path: str) -> Tuple[str, str, str]:
     try:
         parquet_json = df.to_json(orient='records', force_ascii=False)
         return "Parquet 파일이 성공적으로 업로드되었습니다.", parquet_content, parquet_json
     except Exception as e:
+        return f"Parquet 파일 업로드 중 오류가 발생했습니다: {str(e)}", "", ""
 def text_to_parquet(text: str) -> Tuple[str, str, str]:
     try:
         from io import StringIO
         import csv
         # 입력 텍스트 정제
         lines = text.strip().split('\n')
         cleaned_lines = []
         for line in lines:
             # 빈 줄 건너뛰기
             if not line.strip():
                 continue
             # 쌍따옴표 정규화
             line = line.replace('""', '"')  # 중복 쌍따옴표 처리
             # CSV 파싱을 위한 임시 StringIO 객체 생성
             temp_buffer = StringIO(line)
             try:
                     # 각 필드를 적절히 포맷팅
                     formatted_line = f'{parsed_line[0]},"{parsed_line[1]}","{parsed_line[2]}","{parsed_line[3]}"'
                     cleaned_lines.append(formatted_line)
+            except:
                 continue
             finally:
                 temp_buffer.close()
         # 정제된 CSV 데이터 생성
         cleaned_csv = '\n'.join(cleaned_lines)
         # DataFrame 생성
         df = pd.read_csv(
             StringIO(cleaned_csv),
             escapechar='\\',
             names=['id', 'text', 'label', 'metadata']
         )
         # 데이터 유형 최적화
         df = df.astype({'id': 'int32', 'text': 'string', 'label': 'string', 'metadata': 'string'})
         # Parquet 파일로 변환
         parquet_filename = 'text_to_parquet.parquet'
         df.to_parquet(parquet_filename, engine='pyarrow', compression='snappy')
         # Parquet 파일 내용 미리보기
         parquet_content = load_parquet(parquet_filename)
         return f"{parquet_filename} 파일이 성공적으로 변환되었습니다.", parquet_content, parquet_filename
     except Exception as e:
         error_message = f"텍스트 변환 중 오류가 발생했습니다: {str(e)}"
         print(f"{error_message}\n{traceback.format_exc()}")
+        return error_message, "", ""
 def preprocess_text_with_llm(input_text: str) -> str:
     if not input_text.strip():
+        return "입력 텍스트가 비어있습니다."
     system_prompt = """당신은 데이터 전처리 전문가입니다. 입력된 텍스트를 CSV 데이터셋 형식으로 변환하세요.
 규칙:
 - 각 행은 새로운 줄로 구분
 - 불필요한 반복 출력 금지"""
+    full_prompt = f"{system_prompt}\n\n입력텍스트:\n{input_text}\n\n출력:"
+    try:
+        response = ""
+        stream = hf_client.text_generation(
+            prompt=full_prompt,
+            max_new_tokens=4000,
+            temperature=0.1,  # 더 결정적인 출력을 위해 낮춤
+            top_p=0.9,
+            stream=True,
+        )
+        for msg in stream:
+            if msg:
+                response += msg
+        # <EOS_TOKEN> 이전까지만 추출하고 정제
+        if "<EOS_TOKEN>" in response:
+            processed_text = response.split("<EOS_TOKEN>")[0].strip()
+        else:
+            processed_text = response.strip()
+        # 중복 출력 제거
+        lines = processed_text.split('\n')
+        unique_lines = []
         seen_texts = set()
+        for line in lines:
+            line = line.strip()
+            if line and '출력:' not in line and line not in seen_texts:
+                unique_lines.append(line)
+                seen_texts.add(line)
+        processed_text = '\n'.join(unique_lines)
         # CSV 형식 검증
         try:
             from io import StringIO
             return processed_text
         except csv.Error:
             return "LLM이 올바른 CSV 형식을 생성하지 못했습니다. 다시 시도해주세요."
     except Exception as e:
         error_message = f"전처리 중 오류가 발생했습니다: {str(e)}"
         print(error_message)
+        return error_message
 # CSS 설정
 css = """
         elem_id="initial-description"
     )
     # 첫 번째 탭: 챗봇 데이터 업로드 (탭 이름 변경: "My 데이터셋+LLM")
     with gr.Tab("My 데이터셋+LLM"):
         gr.Markdown("### LLM과 대화하기")
                 # 어시스턴트의 응답을 히스토리에 추가
                 history.append({"role": "assistant", "content": partial_response})
             except Exception as e:
+                response = f"추론 중 오류가 발생했습니다: {str(e)}"
                 history.append({"role": "assistant", "content": response})
                 yield history, ""
                     lines=15,
                     placeholder="여기에 전처리할 텍스트를 입력하세요..."
                 )
                 with gr.Row():
                     preprocess_button = gr.Button("전처리 실행", variant="primary")
                     clear_button = gr.Button("초기화")
                 preprocess_status = gr.Textbox(
                     label="전처리 상태",
                     interactive=False,
                     value="대기 중..."
                 )
                 processed_text_output = gr.Textbox(
                     label="전처리된 데이터셋 출력",
                     lines=15,
                     interactive=False
                 )
                 # Parquet 변환 및 다운로드 섹션
                 convert_to_parquet_button = gr.Button("Parquet으로 변환")
                 download_parquet = gr.File(label="변환된 Parquet 파일 다운로드")
                 def handle_text_preprocessing(input_text: str):
                     """Stream (status message, processed text) pairs for a preprocessing run.

                     This is a generator: every result must be delivered with
                     ``yield``. A ``return <value>`` inside a generator only sets
                     ``StopIteration.value`` and is never seen by a consumer that
                     iterates the generator.

                     Args:
                         input_text: Raw text to preprocess. ``None`` or a
                             whitespace-only string is treated as empty input.

                     Yields:
                         Tuples of ``(status_message, processed_text)``.
                         ``processed_text`` is ``""`` for every status-only update.
                     """
                     # Guard clause: emit the message with `yield`, then stop.
                     # The previous `return "...", ""` produced no output at all.
                     if not input_text or not input_text.strip():
                         yield "입력 텍스트가 없습니다.", ""
                         return
                     try:
                         preprocess_status_msg = "전처리를 시작합니다..."
                         yield preprocess_status_msg, ""
                         processed_text = preprocess_text_with_llm(input_text)
                         if processed_text:
                             preprocess_status_msg = "전처리가 완료되었습니다."
                             yield preprocess_status_msg, processed_text
                         else:
                             preprocess_status_msg = "전처리 결과가 없습니다."
                             yield preprocess_status_msg, ""
                     except Exception as e:
                         # Report the failure as a status message instead of
                         # letting the exception escape the generator.
                         error_msg = f"처리 중 오류가 발생했습니다: {str(e)}"
                         yield error_msg, ""
                 def clear_inputs():
                 def convert_to_parquet_file(processed_text: str):
                     """Convert preprocessed CSV text to a Parquet file.

                     Args:
                         processed_text: CSV-formatted text to convert. ``None``
                             or a whitespace-only string is treated as empty input.

                     Returns:
                         A ``(message, parquet_filename)`` tuple. The filename is
                         ``None`` when there is nothing to convert, when
                         ``text_to_parquet`` reports a failure (empty filename),
                         or when it raises.
                     """
                     # Reject missing or blank input before calling .strip() on None.
                     if not processed_text or not processed_text.strip():
                         return "변환할 텍스트가 없습니다.", None
                     try:
                         # The middle element (content preview) is not needed here.
                         message, _, parquet_filename = text_to_parquet(processed_text)
                     except Exception as e:
                         return f"Parquet 변환 중 오류 발생: {str(e)}", None
                     # text_to_parquet signals failure with an empty filename;
                     # normalise that to None for the caller.
                     return message, parquet_filename or None
                 # 이벤트 핸들러 연결
                 preprocess_button.click(
     gr.Markdown("### [email protected]", elem_id="initial-description")
 if __name__ == "__main__":
+    demo.launch(share=True)