Create code.py
code.py
ADDED
@@ -0,0 +1,157 @@
import numpy as np
import pandas as pd
import torch
from transformers import XLNetTokenizer, XLNetForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef, confusion_matrix

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Path to your Excel file in Google Drive
file_path = '/content/drive/My Drive/filtered_data.xlsx'

# Read the Excel file into a pandas DataFrame
df = pd.read_excel(file_path)

# Selecting only the necessary columns
selected_columns = ['Short Description', 'Severity Label', 'Project']
new_df = df[selected_columns].copy()

# Exclude bug reports with 'normal' severity
filtered_df = new_df[new_df['Severity Label'] != 'normal']

# Define mapping for label conversion
severity_mapping = {
    'blocker': 'severe',
    'critical': 'severe',
    'major': 'severe',
    'trivial': 'non-severe',
    'minor': 'non-severe'
}

# Replace severity labels according to the mapping
filtered_df.loc[:, 'Severity Label'] = filtered_df['Severity Label'].map(severity_mapping)

# Mapping string labels to numeric representations
label_mapping = {'non-severe': 0, 'severe': 1}
filtered_df.loc[:, 'Severity Label'] = filtered_df['Severity Label'].map(label_mapping)


####

# Initialize XLNet tokenizer and model
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')


# Define parameters
max_len = 100  # Max sequence length
batch_size = 32
epochs = 5

# Initialize evaluation results dictionary
evaluation_results = {}

# Iterate through each unique project as the test set
for test_project in filtered_df['Project'].unique():

    # Reinitialize the model for each test project
    model = XLNetForSequenceClassification.from_pretrained('xlnet-base-cased', num_labels=2)  # num_labels=2 for binary classification

    # Select data for the current test project
    test_data = filtered_df[filtered_df['Project'] == test_project]
    train_data = filtered_df[filtered_df['Project'] != test_project]

    # Split train and test data
    train_texts = train_data['Short Description'].tolist()
    train_labels = train_data['Severity Label'].tolist()
    test_texts = test_data['Short Description'].tolist()
    test_labels = test_data['Severity Label'].tolist()

    # Tokenize train and test data
    train_encodings = tokenizer(train_texts, truncation=True, padding='max_length', max_length=max_len)
    test_encodings = tokenizer(test_texts, truncation=True, padding='max_length', max_length=max_len)

    # Create PyTorch datasets
    class CustomDataset(torch.utils.data.Dataset):
        def __init__(self, encodings, labels):
            self.encodings = encodings
            self.labels = labels

        def __getitem__(self, idx):
            item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
            item['labels'] = torch.tensor(self.labels[idx])
            return item

        def __len__(self):
            return len(self.labels)

    train_dataset = CustomDataset(train_encodings, train_labels)
    test_dataset = CustomDataset(test_encodings, test_labels)

    # Define training arguments
    training_args = TrainingArguments(
        output_dir='./results',                   # output directory
        num_train_epochs=epochs,                  # total number of training epochs
        per_device_train_batch_size=batch_size,   # batch size per device during training
        per_device_eval_batch_size=batch_size,    # batch size for evaluation
        warmup_steps=500,                         # number of warmup steps for learning rate scheduler
        weight_decay=0.01,                        # strength of weight decay
        logging_dir='./logs',                     # directory for storing logs
    )

    # Define trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
    )

    # Train the model
    trainer.train()

    # Save the model to Google Drive
    model_save_path = '/content/drive/My Drive/XLNet_model_project_{}.pt'.format(test_project)
    torch.save(model.state_dict(), model_save_path)
    print(f"Model saved to '{model_save_path}'")

    # Evaluate the model
    predictions = trainer.predict(test_dataset)
    preds = np.argmax(predictions.predictions, axis=1)

    # Calculate evaluation metrics
    accuracy = accuracy_score(test_labels, preds)
    precision = precision_score(test_labels, preds)
    recall = recall_score(test_labels, preds)
    f1 = f1_score(test_labels, preds)
    mcc = matthews_corrcoef(test_labels, preds)
    conf_matrix = confusion_matrix(test_labels, preds)

    # Store evaluation results for the current test project
    evaluation_results[test_project] = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1-score': f1,
        'MCC': mcc,
        'Confusion Matrix': conf_matrix
    }

# Print evaluation results for all test projects
for project, results in evaluation_results.items():
    print(f"Evaluation results for Test Project '{project}':")
    for metric, value in results.items():
        if metric != 'Confusion Matrix':
            print(f"{metric}: {value}")
        else:
            print(f"{metric}:")
            print(value)
    print("------------------------------")

# Convert evaluation results to a DataFrame
df_results = pd.DataFrame.from_dict(evaluation_results, orient='index')

# Save results to an Excel file
excel_file_name = '/content/drive/My Drive/evaluation_results_XLNet.xlsx'
df_results.to_excel(excel_file_name)
print(f"Results saved to '{excel_file_name}'")
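
Note (added for clarity, not part of the committed script): the training loop above is a leave-one-project-out evaluation, where each project in turn is held out as the test set and the model is trained on all remaining projects. A minimal self-contained sketch of that split, using toy data and hypothetical project names:

import pandas as pd

# Toy frame mirroring the columns used above; projects 'A', 'B', 'C' are hypothetical.
toy = pd.DataFrame({
    'Short Description': ['crash on save', 'typo in tooltip', 'data loss on sync', 'button misaligned'],
    'Severity Label':    [1, 0, 1, 0],
    'Project':           ['A', 'A', 'B', 'C'],
})

for test_project in toy['Project'].unique():
    test_data = toy[toy['Project'] == test_project]    # all reports from the held-out project
    train_data = toy[toy['Project'] != test_project]   # reports from every other project
    print(test_project, len(train_data), len(test_data))
# Prints: A 2 2, B 3 1, C 3 1 -> each project is scored by a model that never saw it in training.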
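
Note (usage sketch, not part of the committed script): one way a per-project checkpoint saved by the loop could be reloaded for inference. The project name 'Firefox' and the example description are hypothetical placeholders; the path simply mirrors the model_save_path pattern used above.

import torch
from transformers import XLNetTokenizer, XLNetForSequenceClassification

# Hypothetical checkpoint path; substitute a project name that actually exists in the data.
checkpoint_path = '/content/drive/My Drive/XLNet_model_project_Firefox.pt'

tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
model = XLNetForSequenceClassification.from_pretrained('xlnet-base-cased', num_labels=2)
model.load_state_dict(torch.load(checkpoint_path, map_location='cpu'))
model.eval()

# Classify a single (hypothetical) bug-report summary.
text = "Crash on startup when opening a new tab"
inputs = tokenizer(text, truncation=True, padding='max_length', max_length=100, return_tensors='pt')
with torch.no_grad():
    logits = model(**inputs).logits
label = logits.argmax(dim=-1).item()  # 0 = non-severe, 1 = severe, per label_mapping above
print(label)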