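# NOTE: the following is file-viewer page residue, not part of the script:
#   uploader avatar: "AliArshad's picture"; commit be8976e "Create code.py";
#   viewer links: raw / history / blame; file size: 5.6 kB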
import numpy as np
import pandas as pd
import torch
from transformers import XLNetTokenizer, XLNetForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef, confusion_matrix
# Mount Google Drive so the input spreadsheet (and the outputs written later
# in this script) are reachable under /content/drive.
from google.colab import drive

drive.mount('/content/drive', force_remount=True)

# Path to your Excel file in Google Drive
file_path = '/content/drive/My Drive/filtered_data.xlsx'

# Read the Excel file into a pandas DataFrame
df = pd.read_excel(file_path)

# Selecting only the necessary columns
selected_columns = ['Short Description', 'Severity Label', 'Project']
new_df = df[selected_columns].copy()

# Define mapping for label conversion: the fine-grained severities are
# collapsed into two classes ...
severity_mapping = {
    'blocker': 'severe',
    'critical': 'severe',
    'major': 'severe',
    'trivial': 'non-severe',
    'minor': 'non-severe',
}
# ... which are then encoded as the integer class ids used for training.
label_mapping = {'non-severe': 0, 'severe': 1}

# Exclude bug reports with 'normal' severity. The explicit .copy() makes
# filtered_df an independent frame, so the column assignments below never
# write into a slice of new_df (no SettingWithCopyWarning / silent no-op).
filtered_df = new_df[new_df['Severity Label'] != 'normal'].copy()

# Replace severity labels according to the two mappings. Any severity that is
# not a key of severity_mapping comes out of .map() as NaN.
filtered_df['Severity Label'] = (
    filtered_df['Severity Label'].map(severity_mapping).map(label_mapping)
)

# Drop rows that cannot be used: unmapped/missing severities (NaN after the
# mapping) and missing descriptions. Left in place, NaN labels would reach the
# model as float labels and NaN descriptions would break tokenization.
filtered_df = filtered_df.dropna(subset=['Short Description', 'Severity Label'])

# With the NaNs gone the labels can be stored as real integers, and the
# descriptions are forced to str so the tokenizer always receives text.
filtered_df['Severity Label'] = filtered_df['Severity Label'].astype(int)
filtered_df['Short Description'] = filtered_df['Short Description'].astype(str)
####
# Cross-project evaluation: for every project in the data, fine-tune a fresh
# XLNet classifier on all *other* projects and evaluate it on the held-out
# project. Per-project metrics are collected in `evaluation_results`.

# Initialize XLNet tokenizer (shared by all folds; the model is re-created
# inside the loop so that no fold starts from another fold's weights)
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')

# Define parameters
max_len = 100  # Max sequence length
batch_size = 32
epochs = 5

# Initialize evaluation results dictionary (project name -> dict of metrics)
evaluation_results = {}


class CustomDataset(torch.utils.data.Dataset):
    """Pair tokenizer output with labels in the form `Trainer` consumes.

    Defined once, outside the fold loop, instead of being re-created on
    every iteration.

    Args:
        encodings: Mapping from feature name to a per-example sequence, as
            returned by calling the tokenizer on a list of texts.
        labels: Sequence of class ids, one per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        # One example: every encoded feature plus its label, as tensors.
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)


# Iterate through each unique project as the test set
for test_project in filtered_df['Project'].unique():
    # Select data for the current test project
    test_data = filtered_df[filtered_df['Project'] == test_project]
    train_data = filtered_df[filtered_df['Project'] != test_project]

    # A fold needs data on both sides. The test side is empty when the project
    # value is NaN (NaN never compares equal); the train side is empty when
    # there is only one project. Skip instead of crashing in the tokenizer.
    if test_data.empty or train_data.empty:
        print(f"Skipping project '{test_project}': empty train or test split")
        continue

    # Reinitialize the model for each test project
    model = XLNetForSequenceClassification.from_pretrained(
        'xlnet-base-cased',
        num_labels=2,  # binary classification: non-severe (0) vs severe (1)
    )

    # Split train and test data into plain Python lists
    train_texts = train_data['Short Description'].tolist()
    train_labels = train_data['Severity Label'].tolist()
    test_texts = test_data['Short Description'].tolist()
    test_labels = test_data['Severity Label'].tolist()

    # Tokenize train and test data (fixed-length: truncate/pad to max_len)
    train_encodings = tokenizer(train_texts, truncation=True, padding='max_length', max_length=max_len)
    test_encodings = tokenizer(test_texts, truncation=True, padding='max_length', max_length=max_len)

    # Create PyTorch datasets
    train_dataset = CustomDataset(train_encodings, train_labels)
    test_dataset = CustomDataset(test_encodings, test_labels)

    # Define training arguments
    training_args = TrainingArguments(
        output_dir='./results',                   # output directory
        num_train_epochs=epochs,                  # total number of training epochs
        per_device_train_batch_size=batch_size,   # batch size per device during training
        per_device_eval_batch_size=batch_size,    # batch size for evaluation
        warmup_steps=500,                         # number of warmup steps for learning rate scheduler
        weight_decay=0.01,                        # strength of weight decay
        logging_dir='./logs',                     # directory for storing logs
    )

    # Define trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
    )

    # Train the model
    trainer.train()

    # Save the model to Google Drive. The project name becomes part of the
    # file name, so characters that could act as path separators or are
    # otherwise unsafe are replaced; ordinary names are left unchanged.
    safe_project = ''.join(
        ch if ch.isalnum() or ch in ' ._-' else '_' for ch in str(test_project)
    )
    model_save_path = f'/content/drive/My Drive/XLNet_model_project_{safe_project}.pt'
    torch.save(model.state_dict(), model_save_path)
    print(f"Model saved to '{model_save_path}'")

    # Evaluate the model: highest-scoring class per test example
    predictions = trainer.predict(test_dataset)
    preds = np.argmax(predictions.predictions, axis=1)

    # Calculate evaluation metrics. zero_division=0 yields the same value as
    # the default for undefined precision/recall/F1 but without the warning;
    # labels=[0, 1] keeps the confusion matrix 2x2 even when a fold happens to
    # contain (or predict) only one class.
    accuracy = accuracy_score(test_labels, preds)
    precision = precision_score(test_labels, preds, zero_division=0)
    recall = recall_score(test_labels, preds, zero_division=0)
    f1 = f1_score(test_labels, preds, zero_division=0)
    mcc = matthews_corrcoef(test_labels, preds)
    conf_matrix = confusion_matrix(test_labels, preds, labels=[0, 1])

    # Store evaluation results for the current test project
    evaluation_results[test_project] = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1-score': f1,
        'MCC': mcc,
        'Confusion Matrix': conf_matrix,
    }

    # Drop this fold's model/trainer before the next fold allocates a new
    # model; otherwise both generations are alive at once and peak
    # accelerator memory roughly doubles.
    del trainer, model
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
# Report the collected metrics on stdout, one section per held-out project.
for project_name, project_metrics in evaluation_results.items():
    print(f"Evaluation results for Test Project '{project_name}':")
    for metric_name, metric_value in project_metrics.items():
        if metric_name == 'Confusion Matrix':
            # The matrix spans several lines: label first, matrix underneath.
            print(f"{metric_name}:")
            print(metric_value)
            continue
        # Every other metric is a single number and fits on one line.
        print(f"{metric_name}: {metric_value}")
    print("------------------------------")

# Persist the same results to Google Drive as a spreadsheet: one row per
# project (the dict keys become the index), one column per metric.
excel_file_name = '/content/drive/My Drive/evaluation_results_XLNet.xlsx'
df_results = pd.DataFrame.from_dict(evaluation_results, orient='index')
df_results.to_excel(excel_file_name)
print(f"Results saved to '{excel_file_name}'")