Spaces:

sitwala
/

hyphen_problem

Sleeping

App Files Files Community

SitwalaM committed on Dec 18, 2024

Commit

f6b9e7f

1 Parent(s): 6ca6f28

first commit

Browse files

Files changed (1) hide show

app.py +66 -0

app.py ADDED Viewed

	@@ -0,0 +1,66 @@

# Streamlit app: upload or paste text, split it into paragraphs and
# sentences, show both side by side, and offer each list as a CSV download.
import streamlit as st
import pandas as pd
from dolma.core.utils import split_paragraphs, split_sentences


def _decode_upload(raw_bytes):
    """Decode uploaded bytes as UTF-8 text.

    Returns the decoded string, or None when the bytes are not valid UTF-8,
    so the caller can report the problem instead of crashing the app.
    """
    try:
        # "utf-8-sig" decodes plain UTF-8 and also drops a leading byte-order
        # mark, which would otherwise show up as a stray character at the
        # start of the first paragraph/sentence.
        return raw_bytes.decode("utf-8-sig")
    except UnicodeDecodeError:
        return None


def _show_texts(column, heading, item_label, texts):
    """Render each text in ``column`` under a numbered subheader."""
    with column:
        st.header(heading)
        for number, text in enumerate(texts, start=1):
            st.subheader(f"{item_label} {number}")
            st.write(text)


def _offer_csv_download(texts, column_name, label, file_name):
    """Add a download button serving ``texts`` as a one-column CSV file."""
    frame = pd.DataFrame(texts, columns=[column_name])
    st.download_button(
        label=label,
        data=frame.to_csv(index=False).encode('utf-8'),
        file_name=file_name,
        mime="text/csv",
    )


# Title of the Streamlit app
st.title('Text Splitter: Paragraphs and Sentences')

# An uploaded file takes precedence; the text area is only offered when no
# file has been uploaded.
uploaded_file = st.file_uploader("Upload a text file", type=["txt"])
if uploaded_file:
    # getvalue() returns the whole buffer regardless of the current read
    # position, unlike read().
    sample_text = _decode_upload(uploaded_file.getvalue())
    if sample_text is None:
        st.error(
            "The uploaded file could not be decoded as UTF-8. "
            "Please upload a UTF-8 encoded text file."
        )
        # Fall through to the "nothing to split" branch below.
        sample_text = ""
else:
    sample_text = st.text_area("Or paste your text below", height=300)

if sample_text:
    # Both splitters return items exposing the matched text as ``.text``;
    # extract it once and reuse it for display and for the CSV exports.
    paragraph_texts = [p.text for p in split_paragraphs(sample_text)]
    sentence_texts = [s.text for s in split_sentences(sample_text)]

    # Show number of paragraphs and sentences
    st.write(f"Number of paragraphs: {len(paragraph_texts)}")
    st.write(f"Number of sentences: {len(sentence_texts)}")

    # Paragraphs on the left, sentences on the right
    col1, col2 = st.columns(2)
    _show_texts(col1, "Paragraphs", "Paragraph", paragraph_texts)
    _show_texts(col2, "Sentences", "Sentence", sentence_texts)

    # Option to download the paragraphs and sentences as CSV files
    _offer_csv_download(
        paragraph_texts, "Paragraph", "Download Paragraphs as CSV", "paragraphs.csv"
    )
    _offer_csv_download(
        sentence_texts, "Sentence", "Download Sentences as CSV", "sentences.csv"
    )
else:
    st.write("Please upload a text file or paste your text to split it into paragraphs and sentences.")