haeyeon committed on
Commit
b84b26e
·
1 Parent(s): 2eb709a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +34 -4
app.py CHANGED
@@ -25,11 +25,30 @@ def get_pdf_text(pdf_docs):
25
  pdf_doc = pdf_loader.load() # 텍스트를 추출합니다.
26
  return pdf_doc # 추출한 텍스트를 반환합니다.
27
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  # 과제
29
  # 아래 텍스트 추출 함수를 작성
 
 
 
 
 
 
 
30
 
31
- def get_text_file(docs):
32
- pass
33
 
34
 
35
  def get_csv_file(csv_docs):
@@ -41,9 +60,20 @@ def get_csv_file(csv_docs):
41
  csv_doc = loader.load()
42
  return csv_doc # 추출한 텍스트를 반환합니다.
43
 
44
- def get_json_file(docs):
45
- pass
46
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
 
48
  # 문서들을 처리하여 텍스트 청크로 나누는 함수입니다.
49
  def get_text_chunks(documents):
 
25
  pdf_doc = pdf_loader.load() # 텍스트를 추출합니다.
26
  return pdf_doc # 추출한 텍스트를 반환합니다.
27
 
28
+
29
+ # PDF 문서로부터 텍스트를 추출하는 함수입니다.
30
def get_pdf_text(pdf_docs):
    """Load an uploaded PDF and return what PyPDFLoader extracts from it.

    The upload is held in memory, while ``PyPDFLoader`` is given a file
    path, so the bytes are first written to a file inside a temporary
    directory.

    Args:
        pdf_docs: Uploaded file object exposing a ``name`` attribute and a
            ``getvalue()`` method that returns the raw PDF bytes.

    Returns:
        The result of ``PyPDFLoader(path).load()`` for the uploaded file.
    """
    # NOTE(review): the surrounding diff context suggests an identical
    # get_pdf_text definition sits directly above this one - confirm and
    # drop whichever copy is redundant.

    # The context manager removes the temporary directory deterministically
    # instead of relying on the object's finalizer.
    with tempfile.TemporaryDirectory() as temp_dir:
        # basename() keeps a client-supplied name from escaping temp_dir.
        temp_filepath = os.path.join(temp_dir, os.path.basename(pdf_docs.name))
        with open(temp_filepath, "wb") as f:
            f.write(pdf_docs.getvalue())
        pdf_loader = PyPDFLoader(temp_filepath)
        # Load while the temporary file still exists.
        pdf_doc = pdf_loader.load()
    return pdf_doc
38
+
39
+
40
  # 과제
41
  # 아래 텍스트 추출 함수를 작성
42
def get_text_file(text_docs):
    """Return the contents of an uploaded text file as a string.

    The upload's bytes are written to a file inside a temporary directory
    and read back in text mode as UTF-8.

    Args:
        text_docs: Uploaded file object exposing a ``name`` attribute and a
            ``getvalue()`` method that returns the raw file bytes.

    Returns:
        str: The decoded text of the upload.

    Raises:
        UnicodeDecodeError: If the uploaded bytes are not valid UTF-8.
    """
    # NOTE(review): the sibling loaders return the result of loader.load(),
    # while this returns a plain string - confirm callers expect that.
    with tempfile.TemporaryDirectory() as temp_dir:
        # basename() keeps a client-supplied name from escaping temp_dir.
        temp_filepath = os.path.join(temp_dir, os.path.basename(text_docs.name))

        # The upload only exists in memory, so it must be written out before
        # it can be read. Without this step open() raises FileNotFoundError.
        with open(temp_filepath, "wb") as f:
            f.write(text_docs.getvalue())

        # Read the file back as UTF-8 text.
        with open(temp_filepath, "r", encoding="utf-8") as f:
            text_content = f.read()

    return text_content
52
 
53
 
54
  def get_csv_file(csv_docs):
 
60
  csv_doc = loader.load()
61
  return csv_doc # 추출한 텍스트를 반환합니다.
62
 
 
 
63
 
64
def get_json_file(json_docs):
    """Load an uploaded JSON file through JSONLoader and return the result.

    The upload is written to a file inside a temporary directory, and a
    ``JSONLoader`` is then created for that path with
    ``jq_schema='.content'``, ``text_content=False`` and
    ``json_lines=True``.

    Args:
        json_docs: Uploaded file object exposing a ``name`` attribute and a
            ``getvalue()`` method that returns the raw file bytes.

    Returns:
        The result of ``JSONLoader(...).load()`` for the uploaded file.
    """
    # The context manager removes the temporary directory deterministically
    # instead of relying on the object's finalizer.
    with tempfile.TemporaryDirectory() as temp_dir:
        # basename() keeps a client-supplied name from escaping temp_dir.
        temp_filepath = os.path.join(temp_dir, os.path.basename(json_docs.name))
        # Write the uploaded JSON bytes to the temporary file.
        with open(temp_filepath, "wb") as f:
            f.write(json_docs.getvalue())
        loader = JSONLoader(
            file_path=temp_filepath,
            jq_schema='.content',
            text_content=False,
            json_lines=True,
        )
        # Use a separate local name rather than rebinding the parameter,
        # and load while the temporary file still exists.
        loaded_docs = loader.load()
    return loaded_docs
77
 
78
  # 문서들을 처리하여 텍스트 청크로 나누는 함수입니다.
79
  def get_text_chunks(documents):