oliviercaron committed
Commit 5af24b5 · verified · 1 Parent(s): 30f8888

Update app.py

Files changed (1):
  1. app.py +310 -343
app.py CHANGED
@@ -1,343 +1,310 @@
- import os
- import csv
- import streamlit as st
- import polars as pl
- from io import BytesIO, StringIO
- from gliner import GLiNER
- from gliner_file import run_ner
- import time
- import torch
- import platform
- from typing import List
- from streamlit_tags import st_tags  # Importing the st_tags component
-
- # Streamlit page configuration
- st.set_page_config(
-     page_title="GLiNER",
-     page_icon="🔥",
-     layout="wide",
-     initial_sidebar_state="expanded"
- )
-
- # Function to load data from the uploaded file
- @st.cache_data
- def load_data(file):
-     """
-     Loads an uploaded CSV or Excel file with resilient detection of delimiters and types.
-     """
-     with st.spinner("Loading data, please wait..."):
-         try:
-             _, file_ext = os.path.splitext(file.name)
-             if file_ext.lower() in [".xls", ".xlsx"]:
-                 return load_excel(file)
-             elif file_ext.lower() == ".csv":
-                 return load_csv(file)
-             else:
-                 raise ValueError("Unsupported file format. Please upload a CSV or Excel file.")
-         except Exception as e:
-             st.error("Error loading data:")
-             st.error(str(e))
-             return None
-
- def load_excel(file):
-     """
-     Loads an Excel file using `BytesIO` and `polars` for reduced latency.
-     """
-     try:
-         # Load the file into BytesIO for faster reading
-         file_bytes = BytesIO(file.read())
-
-         # Load the Excel file using `polars`
-         df = pl.read_excel(file_bytes, read_options={"ignore_errors": True})
-         return df
-     except Exception as e:
-         raise ValueError(f"Error reading the Excel file: {str(e)}")
-
- def load_csv(file):
-     """
-     Loads a CSV file by detecting the delimiter and using the quote character to handle internal delimiters.
-     """
-     try:
-         file.seek(0)  # Reset file pointer to ensure reading from the beginning
-         raw_data = file.read()
-
-         # Try decoding as UTF-8, else as Latin-1
-         try:
-             file_content = raw_data.decode('utf-8')
-         except UnicodeDecodeError:
-             try:
-                 file_content = raw_data.decode('latin1')
-             except UnicodeDecodeError:
-                 raise ValueError("Unable to decode the file. Ensure it is encoded in UTF-8 or Latin-1.")
-
-         # List of common delimiters
-         delimiters = [",", ";", "|", "\t", " "]
-
-         # Try each delimiter until one works
-         for delimiter in delimiters:
-             try:
-                 # Read CSV with current delimiter and handle quoted fields
-                 df = pl.read_csv(
-                     StringIO(file_content),
-                     separator=delimiter,
-                     quote_char='"',  # Handle internal delimiters with quotes
-                     try_parse_dates=True,
-                     ignore_errors=True,  # Ignore errors for invalid values
-                     truncate_ragged_lines=True
-                 )
-                 # Return the DataFrame if loading succeeds
-                 return df
-             except Exception:
-                 continue  # Move to the next delimiter in case of error
-
-         # If no delimiter worked
-         raise ValueError("Unable to load the file with common delimiters.")
-     except Exception as e:
-         raise ValueError(f"Error reading the CSV file: {str(e)}")
-
- # Function to load the GLiNER model
- @st.cache_resource
- def load_model():
-     """
-     Loads the GLiNER model into memory to avoid multiple reloads.
-     """
-     try:
-         gpu_available = torch.cuda.is_available()
-
-         with st.spinner("Loading the GLiNER model... Please wait."):
-             device = torch.device("cuda" if gpu_available else "cpu")
-             model = GLiNER.from_pretrained(
-                 "urchade/gliner_multi-v2.1"
-             ).to(device)
-             model.eval()
-
-         if gpu_available:
-             device_name = torch.cuda.get_device_name(0)
-             st.success(f"GPU detected: {device_name}. Model loaded on GPU.")
-         else:
-             cpu_name = platform.processor()
-             st.warning(f"No GPU detected. Using CPU: {cpu_name}")
-
-         return model
-     except Exception as e:
-         st.error("Error loading the model:")
-         st.error(str(e))
-         return None
-
- # Function to perform NER and update the user interface
- def perform_ner(filtered_df, selected_column, labels_list, threshold):
-     """
-     Executes named entity recognition (NER) on the filtered data.
-     """
-     try:
-         texts_to_analyze = filtered_df[selected_column].to_list()
-         total_rows = len(texts_to_analyze)
-         ner_results_list = []
-
-         # Initialize progress bar and text
-         progress_bar = st.progress(0)
-         progress_text = st.empty()
-         start_time = time.time()
-
-         # Process each row individually to keep progress updates responsive
-         for index, text in enumerate(texts_to_analyze, 1):
-             if st.session_state.stop_processing:
-                 progress_text.text("Processing stopped by user.")
-                 break
-
-             ner_results = run_ner(
-                 st.session_state.gliner_model,
-                 [text],
-                 labels_list,
-                 threshold=threshold
-             )
-             ner_results_list.append(ner_results)
-
-             # Update progress bar and text after each row
-             progress = index / total_rows
-             elapsed_time = time.time() - start_time
-             progress_bar.progress(progress)
-             progress_text.text(f"Progress: {index}/{total_rows} - {progress * 100:.0f}% (Elapsed time: {elapsed_time:.2f}s)")
-
-         # Add NER results to the DataFrame
-         for label in labels_list:
-             extracted_entities = []
-             for entities in ner_results_list:
-                 texts = [entity["text"] for entity in entities[0] if entity["label"] == label]
-                 concatenated_texts = ", ".join(texts) if texts else ""
-                 extracted_entities.append(concatenated_texts)
-             filtered_df = filtered_df.with_columns(pl.Series(name=label, values=extracted_entities))
-
-         end_time = time.time()
-         st.success(f"Processing completed in {end_time - start_time:.2f} seconds.")
-
-         return filtered_df
-     except Exception as e:
-         st.error(f"Error during NER processing: {str(e)}")
-         return filtered_df
-
- # Main function to run the Streamlit application
- def main():
-     st.title("Use NER with GliNER on your data file")
-     st.markdown("Prototype v0.1")
-
-     # User instructions
-     st.write("""
-     This application performs named entity recognition (NER) on your text data using GLiNER.
-
-     **Instructions:**
-     1. Upload a CSV or Excel file.
-     2. Select the column containing the text to analyze.
-     3. Filter the data if necessary.
-     4. Enter the NER labels you wish to detect.
-     5. Click "Start NER" to begin processing.
-     """)
-
-     # Initializing session state variables
-     if "stop_processing" not in st.session_state:
-         st.session_state.stop_processing = False
-     if "threshold" not in st.session_state:
-         st.session_state.threshold = 0.4
-     if "labels_list" not in st.session_state:
-         st.session_state.labels_list = []
-
-     # Load the model
-     st.session_state.gliner_model = load_model()
-     if st.session_state.gliner_model is None:
-         return
-
-     # File upload
-     uploaded_file = st.sidebar.file_uploader("Choose a file (CSV or Excel)")
-     if uploaded_file is None:
-         st.warning("Please upload a file to continue.")
-         return
-
-     # Loading data
-     df = load_data(uploaded_file)
-     if df is None:
-         return
-
-     # Column selection
-     selected_column = st.selectbox("Select the column containing the text:", df.columns)
-
-     # Data filtering
-     filter_text = st.text_input("Filter the column by text", "")
-     if filter_text:
-         filtered_df = df.filter(pl.col(selected_column).str.contains(f"(?i).*{filter_text}.*"))
-     else:
-         filtered_df = df
-
-     st.write("Filtered data preview:")
-
-     # Rows per page
-     rows_per_page = 100
-
-     # Calculate total rows and pages
-     total_rows = len(filtered_df)
-     total_pages = (total_rows - 1) // rows_per_page + 1
-
-     # Initialize current page in session_state
-     if "current_page" not in st.session_state:
-         st.session_state.current_page = 1
-
-     # Function to update page
-     def update_page(new_page):
-         st.session_state.current_page = new_page
-
-     # Pagination buttons
-     col1, col2, col3, col4, col5 = st.columns(5)
-
-     with col1:
-         first = st.button("⏮️ First")
-     with col2:
-         previous = st.button("⬅️ Previous")
-     with col3:
-         pass  # Page number display will be done after
-     with col4:
-         next = st.button("Next ➡️")
-     with col5:
-         last = st.button("Last ⏭️")
-
-     # Button clicks management
-     if first:
-         update_page(1)
-     elif previous:
-         if st.session_state.current_page > 1:
-             update_page(st.session_state.current_page - 1)
-     elif next:
-         if st.session_state.current_page < total_pages:
-             update_page(st.session_state.current_page + 1)
-     elif last:
-         update_page(total_pages)
-
-     # Now display the page number after updating
-     with col3:
-         st.markdown(f"Page **{st.session_state.current_page}** of **{total_pages}**")
-
-     # Calculate indices for pagination
-     start_idx = (st.session_state.current_page - 1) * rows_per_page
-     end_idx = min(start_idx + rows_per_page, total_rows)
-
-     # Check if the filtered DataFrame is empty
-     if not filtered_df.is_empty():
-         # Retrieve current page data
-         current_page_data = filtered_df.slice(start_idx, end_idx - start_idx)
-         st.write(f"Displaying {start_idx + 1} to {end_idx} of {total_rows} rows")
-         st.dataframe(current_page_data.to_pandas(), use_container_width=True)
-     else:
-         st.warning("The filtered DataFrame is empty. Please check your filters.")
-
-     # Confidence threshold slider
-     st.slider("Set confidence threshold", 0.0, 1.0, st.session_state.threshold, 0.01, key="threshold")
-
-     # Buttons to start and stop NER
-     col1, col2 = st.columns(2)
-     with col1:
-         start_button = st.button("Start NER")
-     with col2:
-         stop_button = st.button("Stop")
-
-     if start_button:
-         st.session_state.stop_processing = False
-
-         if not st.session_state.labels_list:
-             st.warning("Please enter labels for NER.")
-         else:
-             # Run NER
-             updated_df = perform_ner(filtered_df, selected_column, st.session_state.labels_list, st.session_state.threshold)
-             st.write("**NER Results:**")
-             st.dataframe(updated_df.to_pandas(), use_container_width=True)
-
-             # Function to convert DataFrame to Excel
-             def to_excel(df):
-                 output = BytesIO()
-                 df.write_excel(output)
-                 return output.getvalue()
-
-             # Function to convert DataFrame to CSV
-             def to_csv(df):
-                 return df.write_csv().encode('utf-8')
-
-             # Download buttons for results
-             download_col1, download_col2 = st.columns(2)
-             with download_col1:
-                 st.download_button(
-                     label="📥 Download as Excel",
-                     data=to_excel(updated_df),
-                     file_name="ner_results.xlsx",
-                     mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
-                 )
-             with download_col2:
-                 st.download_button(
-                     label="📥 Download as CSV",
-                     data=to_csv(updated_df),
-                     file_name="ner_results.csv",
-                     mime="text/csv",
-                 )
-
-     if stop_button:
-         st.session_state.stop_processing = True
-         st.warning("Processing stopped by user.")
-
- if __name__ == "__main__":
-     main()
+ import os
+ import csv
+ import streamlit as st
+ import polars as pl
+ from io import BytesIO, StringIO
+ from gliner import GLiNER
+ from gliner_file import run_ner
+ import time
+ import torch
+ import platform
+ from typing import List
+ from streamlit_tags import st_tags  # Importing the st_tags component for labels
+
+ # Streamlit page configuration
+ st.set_page_config(
+     page_title="GLiNER",
+     page_icon="🔥",
+     layout="wide",
+     initial_sidebar_state="expanded"
+ )
+
+ # Function to load data from the uploaded file
+ @st.cache_data
+ def load_data(file):
+     """
+     Loads an uploaded CSV or Excel file with resilient detection of delimiters and types.
+     """
+     with st.spinner("Loading data, please wait..."):
+         try:
+             _, file_ext = os.path.splitext(file.name)
+             if file_ext.lower() in [".xls", ".xlsx"]:
+                 return load_excel(file)
+             elif file_ext.lower() == ".csv":
+                 return load_csv(file)
+             else:
+                 raise ValueError("Unsupported file format. Please upload a CSV or Excel file.")
+         except Exception as e:
+             st.error("Error loading data:")
+             st.error(str(e))
+             return None
+
+ def load_excel(file):
+     """
+     Loads an Excel file using `BytesIO` and `polars` for reduced latency.
+     """
+     try:
+         file_bytes = BytesIO(file.read())
+         df = pl.read_excel(file_bytes, read_options={"ignore_errors": True})
+         return df
+     except Exception as e:
+         raise ValueError(f"Error reading the Excel file: {str(e)}")
+
+ def load_csv(file):
+     """
+     Loads a CSV file by detecting the delimiter and using the quote character to handle internal delimiters.
+     """
+     try:
+         file.seek(0)  # Reset file pointer to ensure reading from the beginning
+         raw_data = file.read()
+
+         try:
+             file_content = raw_data.decode('utf-8')
+         except UnicodeDecodeError:
+             try:
+                 file_content = raw_data.decode('latin1')
+             except UnicodeDecodeError:
+                 raise ValueError("Unable to decode the file. Ensure it is encoded in UTF-8 or Latin-1.")
+
+         delimiters = [",", ";", "|", "\t", " "]
+
+         for delimiter in delimiters:
+             try:
+                 df = pl.read_csv(
+                     StringIO(file_content),
+                     separator=delimiter,
+                     quote_char='"',
+                     try_parse_dates=True,
+                     ignore_errors=True,
+                     truncate_ragged_lines=True
+                 )
+                 return df
+             except Exception:
+                 continue
+
+         raise ValueError("Unable to load the file with common delimiters.")
+     except Exception as e:
+         raise ValueError(f"Error reading the CSV file: {str(e)}")
+
+ @st.cache_resource
+ def load_model():
+     """
+     Loads the GLiNER model into memory to avoid multiple reloads.
+     """
+     try:
+         gpu_available = torch.cuda.is_available()
+
+         with st.spinner("Loading the GLiNER model... Please wait."):
+             device = torch.device("cuda" if gpu_available else "cpu")
+             model = GLiNER.from_pretrained(
+                 "urchade/gliner_multi-v2.1"
+             ).to(device)
+             model.eval()
+
+         if gpu_available:
+             device_name = torch.cuda.get_device_name(0)
+             st.success(f"GPU detected: {device_name}. Model loaded on GPU.")
+         else:
+             cpu_name = platform.processor()
+             st.warning(f"No GPU detected. Using CPU: {cpu_name}")
+
+         return model
+     except Exception as e:
+         st.error("Error loading the model:")
+         st.error(str(e))
+         return None
+
+ def perform_ner(filtered_df, selected_column, labels_list, threshold):
+     """
+     Executes named entity recognition (NER) on the filtered data.
+     """
+     try:
+         texts_to_analyze = filtered_df[selected_column].to_list()
+         total_rows = len(texts_to_analyze)
+         ner_results_list = []
+
+         progress_bar = st.progress(0)
+         progress_text = st.empty()
+         start_time = time.time()
+
+         for index, text in enumerate(texts_to_analyze, 1):
+             if st.session_state.stop_processing:
+                 progress_text.text("Processing stopped by user.")
+                 break
+
+             ner_results = run_ner(
+                 st.session_state.gliner_model,
+                 [text],
+                 labels_list,
+                 threshold=threshold
+             )
+             ner_results_list.append(ner_results)
+
+             progress = index / total_rows
+             elapsed_time = time.time() - start_time
+             progress_bar.progress(progress)
+             progress_text.text(f"Progress: {index}/{total_rows} - {progress * 100:.0f}% (Elapsed time: {elapsed_time:.2f}s)")
+
+         for label in labels_list:
+             extracted_entities = []
+             for entities in ner_results_list:
+                 texts = [entity["text"] for entity in entities[0] if entity["label"] == label]
+                 concatenated_texts = ", ".join(texts) if texts else ""
+                 extracted_entities.append(concatenated_texts)
+             filtered_df = filtered_df.with_columns(pl.Series(name=label, values=extracted_entities))
+
+         end_time = time.time()
+         st.success(f"Processing completed in {end_time - start_time:.2f} seconds.")
+
+         return filtered_df
+     except Exception as e:
+         st.error(f"Error during NER processing: {str(e)}")
+         return filtered_df
+
+ def main():
+     st.title("Use NER with GliNER on your data file")
+     st.markdown("Prototype v0.1")
+
+     st.write("""
+     This application performs named entity recognition (NER) on your text data using GLiNER.
+
+     **Instructions:**
+     1. Upload a CSV or Excel file.
+     2. Select the column containing the text to analyze.
+     3. Filter the data if necessary.
+     4. Enter the NER labels you wish to detect.
+     5. Click "Start NER" to begin processing.
+     """)
+
+     if "stop_processing" not in st.session_state:
+         st.session_state.stop_processing = False
+     if "threshold" not in st.session_state:
+         st.session_state.threshold = 0.4
+     if "labels_list" not in st.session_state:
+         st.session_state.labels_list = []
+
+     st.session_state.gliner_model = load_model()
+     if st.session_state.gliner_model is None:
+         return
+
+     uploaded_file = st.sidebar.file_uploader("Choose a file (CSV or Excel)")
+     if uploaded_file is None:
+         st.warning("Please upload a file to continue.")
+         return
+
+     df = load_data(uploaded_file)
+     if df is None:
+         return
+
+     selected_column = st.selectbox("Select the column containing the text:", df.columns)
+
+     filter_text = st.text_input("Filter the column by text", "")
+     if filter_text:
+         filtered_df = df.filter(pl.col(selected_column).str.contains(f"(?i).*{filter_text}.*"))
+     else:
+         filtered_df = df
+
+     st.write("Filtered data preview:")
+
+     rows_per_page = 100
+     total_rows = len(filtered_df)
+     total_pages = (total_rows - 1) // rows_per_page + 1
+
+     if "current_page" not in st.session_state:
+         st.session_state.current_page = 1
+
+     def update_page(new_page):
+         st.session_state.current_page = new_page
+
+     col1, col2, col3, col4, col5 = st.columns(5)
+
+     with col1:
+         first = st.button("⏮️ First")
+     with col2:
+         previous = st.button("⬅️ Previous")
+     with col3:
+         pass
+     with col4:
+         next = st.button("Next ➡️")
+     with col5:
+         last = st.button("Last ⏭️")
+
+     if first:
+         update_page(1)
+     elif previous:
+         if st.session_state.current_page > 1:
+             update_page(st.session_state.current_page - 1)
+     elif next:
+         if st.session_state.current_page < total_pages:
+             update_page(st.session_state.current_page + 1)
+     elif last:
+         update_page(total_pages)
+
+     with col3:
+         st.markdown(f"Page **{st.session_state.current_page}** of **{total_pages}**")
+
+     start_idx = (st.session_state.current_page - 1) * rows_per_page
+     end_idx = min(start_idx + rows_per_page, total_rows)
+
+     if not filtered_df.is_empty():
+         current_page_data = filtered_df.slice(start_idx, end_idx - start_idx)
+         st.write(f"Displaying {start_idx + 1} to {end_idx} of {total_rows} rows")
+         st.dataframe(current_page_data.to_pandas(), use_container_width=True)
+     else:
+         st.warning("The filtered DataFrame is empty. Please check your filters.")
+
+     st.slider("Set confidence threshold", 0.0, 1.0, st.session_state.threshold, 0.01, key="threshold")
+
+     st.session_state.labels_list = st_tags(
+         label="Enter the NER labels to detect",
+         text="Add more labels as needed",
+         value=st.session_state.labels_list,
+         key="1"
+     )
+
+     col1, col2 = st.columns(2)
+     with col1:
+         start_button = st.button("Start NER")
+     with col2:
+         stop_button = st.button("Stop")
+
+     if start_button:
+         st.session_state.stop_processing = False
+
+         if not st.session_state.labels_list:
+             st.warning("Please enter labels for NER.")
+         else:
+             updated_df = perform_ner(filtered_df, selected_column, st.session_state.labels_list, st.session_state.threshold)
+             st.write("**NER Results:**")
+             st.dataframe(updated_df.to_pandas(), use_container_width=True)
+
+             def to_excel(df):
+                 output = BytesIO()
+                 df.write_excel(output)
+                 return output.getvalue()
+
+             def to_csv(df):
+                 return df.write_csv().encode('utf-8')
+
+             download_col1, download_col2 = st.columns(2)
+             with download_col1:
+                 st.download_button(
+                     label="📥 Download as Excel",
+                     data=to_excel(updated_df),
+                     file_name="ner_results.xlsx"#,
+                     #mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+                 )
+             with download_col2:
+                 st.download_button(
+                     label="📥 Download as CSV",
+                     data=to_csv(updated_df),
+                     file_name="ner_results.csv"#,
+                     #mime="text/csv",
+                 )
+
+     if stop_button:
+         st.session_state.stop_processing = True
+         st.warning("Processing stopped by user.")
+
+ if __name__ == "__main__":
+     main()