import re

import pandas as pd
import requests
import streamlit as st
from datasets import Dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

# Title and description
st.title("OSINT Tool 🏢")
st.markdown("""
This tool performs **Open Source Intelligence (OSINT)** analysis on GitHub repositories and fetches titles from URLs.
It also supports uploading datasets (CSV format) to fine-tune models such as **DistilBERT**.
""")

# Sidebar for navigation
st.sidebar.title("Navigation")
app_mode = st.sidebar.radio(
    "Choose the mode",
    ["GitHub Repository Analysis", "URL Title Fetcher", "Dataset Upload & Fine-Tuning"],
)

# GitHub Repository Analysis
if app_mode == "GitHub Repository Analysis":
    st.header("GitHub Repository Analysis")
    repo_owner = st.text_input("Enter GitHub Repository Owner", "huggingface")
    repo_name = st.text_input("Enter GitHub Repository Name", "transformers")

    if st.button("Analyze Repository"):
        if repo_owner and repo_name:
            try:
                response = requests.get(
                    f"https://api.github.com/repos/{repo_owner}/{repo_name}",
                    timeout=10,
                )
                data = response.json()

                if response.status_code == 200:
                    st.subheader("Repository Details")
                    st.write(f"**Name**: {data['name']}")
                    st.write(f"**Owner**: {data['owner']['login']}")
                    st.write(f"**Stars**: {data['stargazers_count']}")
                    st.write(f"**Forks**: {data['forks_count']}")
                    st.write(f"**Language**: {data['language']}")
                    st.write(f"**Description**: {data['description']}")
                else:
                    st.error(f"Error: {data.get('message', 'Something went wrong with the request')}")
            except Exception as e:
                st.error(f"Error occurred: {e}")
        else:
            st.warning("Please enter both repository owner and name.")

# URL Title Fetcher
elif app_mode == "URL Title Fetcher":
    st.header("URL Title Fetcher")
    url = st.text_input("Enter URL", "https://www.huggingface.co")

    if st.button("Fetch Title"):
        if url:
            try:
                response = requests.get(url, timeout=10)
                if response.status_code == 200:
                    # Try to extract the <title> tag from the HTML
                    match = re.search(r"<title>(.*?)</title>", response.text, re.IGNORECASE | re.DOTALL)
                    if match:
                        title = match.group(1).strip()
                        st.write(f"**Page Title**: {title}")
                    else:
                        st.warning("Title tag not found in the page")
                else:
                    st.error(f"Failed to retrieve the page. Status code: {response.status_code}")
            except Exception as e:
                st.error(f"Error occurred: {e}")
        else:
            st.warning("Please enter a valid URL.")
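# A minimal sketch of the CSV layout the fine-tuning mode below expects.
# The column names "text" and "label" are assumptions: "text" is what
# preprocess_function tokenizes, and "label" is the column the Hugging Face
# Trainer looks for by default; neither is validated by the app, and the
# sample rows are purely illustrative.
#
#   text,label
#   "Great library, actively maintained",1
#   "Abandoned repo, do not use",0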
# Dataset Upload & Fine-Tuning
elif app_mode == "Dataset Upload & Fine-Tuning":
    st.header("Dataset Upload & Fine-Tuning")
    uploaded_file = st.file_uploader("Upload a CSV file for fine-tuning", type="csv")

    if uploaded_file is not None:
        # Load the CSV into a pandas DataFrame
        df = pd.read_csv(uploaded_file)

        # Display dataset preview
        st.subheader("Dataset Preview")
        st.write(df.head())

        # Convert the DataFrame to a Hugging Face Dataset
        dataset = Dataset.from_pandas(df)

        model_name = st.selectbox("Select model for fine-tuning", ["distilbert-base-uncased"])

        if st.button("Fine-tune Model"):
            if model_name:
                try:
                    model = AutoModelForSequenceClassification.from_pretrained(model_name)
                    tokenizer = AutoTokenizer.from_pretrained(model_name)

                    # Tokenize the dataset; assumes the CSV has a "text" column
                    def preprocess_function(examples):
                        return tokenizer(examples["text"], truncation=True, padding=True)

                    tokenized_datasets = dataset.map(preprocess_function, batched=True)

                    # Fine-tuning setup using the Hugging Face Trainer
                    training_args = TrainingArguments(
                        output_dir="./results",
                        evaluation_strategy="epoch",  # renamed to eval_strategy in newer transformers releases
                        learning_rate=2e-5,
                        per_device_train_batch_size=16,
                        per_device_eval_batch_size=16,
                        num_train_epochs=3,
                        weight_decay=0.01,
                    )

                    # Note: the same data is used for training and evaluation,
                    # which is only suitable for a quick demo run. Passing the
                    # tokenizer lets the Trainer pad batches dynamically.
                    trainer = Trainer(
                        model=model,
                        args=training_args,
                        train_dataset=tokenized_datasets,
                        eval_dataset=tokenized_datasets,
                        tokenizer=tokenizer,
                    )

                    # Train the model
                    trainer.train()
                    st.success("Fine-tuning completed successfully!")
                except Exception as e:
                    st.error(f"Error during fine-tuning: {e}")
            else:
                st.warning("Please select a model for fine-tuning.")
    else:
        st.warning("Please upload a dataset.")
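# Usage sketch: assuming this script is saved as app.py (the filename is an
# assumption, not given in the source), it can be launched with:
#
#   pip install streamlit requests pandas transformers datasets torch
#   streamlit run app.py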