import numpy as np
import pandas as pd
import torch
from transformers import XLNetTokenizer, XLNetForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef, confusion_matrix
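# Note: reading .xlsx files with pandas typically requires openpyxl, and the (slow) XLNetTokenizer
# depends on sentencepiece; if either is missing from the Colab runtime, install it first, e.g.:
# !pip install transformers sentencepiece openpyxl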
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
# Path to your Excel file in Google Drive
file_path = '/content/drive/My Drive/filtered_data.xlsx'
# Read the Excel file into a pandas DataFrame
df = pd.read_excel(file_path)
# Selecting only the necessary columns
selected_columns = ['Short Description', 'Severity Label', 'Project']
new_df = df[selected_columns].copy()
# Exclude bug reports with 'normal' severity
filtered_df = new_df[new_df['Severity Label'] != 'normal']
# Define mapping for label conversion
severity_mapping = {
    'blocker': 'severe',
    'critical': 'severe',
    'major': 'severe',
    'trivial': 'non-severe',
    'minor': 'non-severe'
}
# Replace severity labels according to the mapping
filtered_df.loc[:, 'Severity Label'] = filtered_df['Severity Label'].map(severity_mapping)
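# Note: any severity value not covered by the mapping above would become NaN here; the filtered
# data is assumed to contain only the five non-'normal' labels listed in severity_mapping.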
# Mapping string labels to numeric representations
label_mapping = {'non-severe': 0, 'severe': 1}
filtered_df.loc[:, 'Severity Label'] = filtered_df['Severity Label'].map(label_mapping)
####
# Initialize XLNet tokenizer and model
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
# Define parameters
max_len = 100 # Max sequence length
batch_size = 32
epochs = 5
# Initialize evaluation results dictionary
evaluation_results = {}
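# Cross-project (leave-one-project-out) evaluation: each project is held out once as the test set
# while a fresh model is trained on the bug reports of all remaining projects.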
# Iterate through each unique project as the test set
for test_project in filtered_df['Project'].unique():
    # Reinitialize the model for each test project
    model = XLNetForSequenceClassification.from_pretrained('xlnet-base-cased', num_labels=2)  # Define num_labels for binary classification
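    # from_pretrained loads the pretrained XLNet weights and attaches a freshly initialized
    # binary classification head, so every fold starts from the same pretrained checkpoint.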
    # Select data for the current test project
    test_data = filtered_df[filtered_df['Project'] == test_project]
    train_data = filtered_df[filtered_df['Project'] != test_project]
    # Extract texts and labels for the train and test sets
    train_texts = train_data['Short Description'].tolist()
    train_labels = train_data['Severity Label'].tolist()
    test_texts = test_data['Short Description'].tolist()
    test_labels = test_data['Severity Label'].tolist()
    # Tokenize train and test data
    train_encodings = tokenizer(train_texts, truncation=True, padding='max_length', max_length=max_len)
    test_encodings = tokenizer(test_texts, truncation=True, padding='max_length', max_length=max_len)
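    # padding='max_length' pads every description to exactly max_len tokens and truncation=True
    # trims longer ones; 100 tokens is assumed to be ample for short bug-report summaries.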
    # Create PyTorch datasets
    class CustomDataset(torch.utils.data.Dataset):
        def __init__(self, encodings, labels):
            self.encodings = encodings
            self.labels = labels

        def __getitem__(self, idx):
            item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
            item['labels'] = torch.tensor(self.labels[idx])
            return item

        def __len__(self):
            return len(self.labels)

    train_dataset = CustomDataset(train_encodings, train_labels)
    test_dataset = CustomDataset(test_encodings, test_labels)
    # Define training arguments
    training_args = TrainingArguments(
        output_dir='./results',                   # output directory
        num_train_epochs=epochs,                  # total number of training epochs
        per_device_train_batch_size=batch_size,   # batch size per device during training
        per_device_eval_batch_size=batch_size,    # batch size for evaluation
        warmup_steps=500,                         # number of warmup steps for learning rate scheduler
        weight_decay=0.01,                        # strength of weight decay
        logging_dir='./logs',                     # directory for storing logs
    )
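
    # No eval_dataset or evaluation strategy is configured, so the Trainer below only trains;
    # the held-out project is evaluated once, via trainer.predict, after training completes.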
    # Define trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
    )

    # Train the model
    trainer.train()

    # Save the model to Google Drive
    model_save_path = '/content/drive/My Drive/XLNet_model_project_{}.pt'.format(test_project)
    torch.save(model.state_dict(), model_save_path)
    print(f"Model saved to '{model_save_path}'")
    # Evaluate the model
    predictions = trainer.predict(test_dataset)
    preds = np.argmax(predictions.predictions, axis=1)

    # Calculate evaluation metrics
    accuracy = accuracy_score(test_labels, preds)
    precision = precision_score(test_labels, preds)
    recall = recall_score(test_labels, preds)
    f1 = f1_score(test_labels, preds)
    mcc = matthews_corrcoef(test_labels, preds)
    conf_matrix = confusion_matrix(test_labels, preds)

    # Store evaluation results for the current test project
    evaluation_results[test_project] = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1-score': f1,
        'MCC': mcc,
        'Confusion Matrix': conf_matrix
    }
# Print evaluation results for all test projects
for project, results in evaluation_results.items():
print(f"Evaluation results for Test Project '{project}':")
for metric, value in results.items():
if metric != 'Confusion Matrix':
print(f"{metric}: {value}")
else:
print(f"{metric}:")
print(value)
print("------------------------------")
# Convert evaluation results to a DataFrame
df_results = pd.DataFrame.from_dict(evaluation_results, orient='index')
# Store each confusion matrix as text so the Excel writer does not reject ndarray cell values
df_results['Confusion Matrix'] = df_results['Confusion Matrix'].apply(np.array2string)
# Save results to an Excel file
excel_file_name = '/content/drive/My Drive/evaluation_results_XLNet.xlsx'
df_results.to_excel(excel_file_name)
print(f"Results saved to '{excel_file_name}'")