"""Streamlit app: extract the text of an uploaded PDF, with an OCR fallback."""

import os
from pathlib import Path

import PyPDF2
import pytesseract
import streamlit as st
from pdf2image import convert_from_bytes, convert_from_path
from PIL import Image


def _pdf_to_images(pdf_file):
    """Render every page of *pdf_file* to an image for OCR.

    Filesystem paths are handed to ``convert_from_path``; raw bytes and
    binary file-like objects are rendered with ``convert_from_bytes``.
    """
    if isinstance(pdf_file, (str, os.PathLike)):
        return convert_from_path(pdf_file)
    if isinstance(pdf_file, (bytes, bytearray)):
        return convert_from_bytes(bytes(pdf_file))
    # File-like object: the PDF reader has already consumed the stream, so
    # rewind before reading the raw bytes.
    pdf_file.seek(0)
    return convert_from_bytes(pdf_file.read())


def pdf_to_text(pdf_file):
    """Return the text content of a PDF, using OCR when no text is embedded.

    Args:
        pdf_file: A filesystem path or a binary file-like object (for
            example the object returned by ``st.file_uploader``).

    Returns:
        The concatenated text of all pages. If the embedded text layer is
        empty or only whitespace, each page is rendered to an image and the
        concatenated ``pytesseract`` OCR output is returned instead.
    """
    reader = PyPDF2.PdfReader(pdf_file)
    # ``or ""`` guards against a page whose extraction yields a falsy value
    # (e.g. None) instead of a string.
    text = "".join(page.extract_text() or "" for page in reader.pages)

    # A whitespace-only text layer carries no content, so treat it as empty.
    if text.strip():
        return text

    return "".join(
        pytesseract.image_to_string(image) for image in _pdf_to_images(pdf_file)
    )


def main():
    """Render the upload form and offer the extracted text as a download."""
    st.title("PDF Text Extractor")
    st.write("Upload a PDF file to extract the text")

    pdf_file = st.file_uploader("Upload PDF file", type=["pdf"])
    if pdf_file is None:
        return

    text = pdf_to_text(pdf_file)

    # Swap only the final extension (any letter case) and drop any directory
    # component from the client-supplied name.
    txt_name = Path(pdf_file.name).with_suffix(".txt").name

    # Serve the text from memory as UTF-8 rather than writing a file named
    # after user input into the server's working directory.
    st.download_button(
        label="Download Extracted Text",
        data=text.encode("utf-8"),
        file_name=txt_name,
        mime="text/plain",
    )


if __name__ == "__main__":
    main()