{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [], "gpuType": "T4" }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" }, "accelerator": "GPU" }, "cells": [ { "cell_type": "code", "source": [ "import os\n", "import xml.etree.ElementTree as ET\n", "from zipfile import ZipFile\n", "from typing import Dict, Any, Set\n", "from collections import Counter\n", "import logging\n", "import json\n", "\n", "logging.basicConfig(\n", " level=logging.INFO,\n", " format='%(asctime)s - %(levelname)s - %(message)s'\n", ")\n", "\n", "class BugSeverityProcessor:\n", " # Comprehensive severity mappings\n", " SEVERITY_MAPPINGS = {\n", " # Mozilla (Firefox/Thunderbird) severities\n", " 'blocker': 'Severe',\n", " 'critical': 'Severe',\n", " 'major': 'Severe',\n", " 'normal': 'Non-Severe',\n", " 'minor': 'Non-Severe',\n", " 'trivial': 'Non-Severe',\n", " 'enhancement': 'Non-Severe',\n", "\n", " # Eclipse (JDT/PDE/Platform) severities\n", " 'blocking': 'Severe',\n", " 'critical_blocking': 'Severe',\n", " 'major_blocking': 'Severe',\n", " 'normal_blocking': 'Non-Severe',\n", " 'minor_blocking': 'Non-Severe',\n", " 'trivial_blocking': 'Non-Severe',\n", "\n", " # Bugzilla specific severities\n", " 's1': 'Severe',\n", " 's2': 'Severe',\n", " 's3': 'Non-Severe',\n", " 's4': 'Non-Severe',\n", " 's5': 'Non-Severe',\n", "\n", " # Core/CDT specific severities\n", " 'crash': 'Severe',\n", " 'severe': 'Severe',\n", " 'important': 'Severe',\n", " 'medium': 'Non-Severe',\n", " 'low': 'Non-Severe'\n", " }\n", "\n", " def __init__(self, base_dir: str):\n", " self.base_dir = base_dir\n", " self.projects = ['Bugzilla', 'CDT', 'Core', 'Firefox', 'JDT', 'PDE', 'Platform', 'Thunderbird']\n", " self.all_data: Dict[str, Dict[str, Any]] = {}\n", " self.raw_labels: Dict[str, Set[str]] = {project: set() for project in self.projects}\n", " self.label_stats: Dict[str, Counter] = {project: Counter() for project in self.projects}\n", "\n", " def convert_severity(self, severity: str) -> str:\n", " \"\"\"\n", " Convert severity labels to binary classification while preserving original label.\n", "\n", " Args:\n", " severity: The original severity label\n", "\n", " Returns:\n", " str: 'Severe' or 'Non-Severe'\n", " \"\"\"\n", " severity = severity.lower().strip()\n", " return self.SEVERITY_MAPPINGS.get(severity, 'Non-Severe')\n", "\n", " def process_xml_file(self, file_path: str, report_type: str, project: str) -> Dict[str, Any]:\n", " \"\"\"\n", " Process individual XML files.\n", "\n", " Args:\n", " file_path: Path to XML file\n", " report_type: Type of report ('desc' or 'severity')\n", " project: Project name for tracking raw labels\n", "\n", " Returns:\n", " Dict containing processed data\n", " \"\"\"\n", " try:\n", " tree = ET.parse(file_path)\n", " root = tree.getroot()\n", " data = {}\n", "\n", " for report in root.findall('.//report'):\n", " report_id = report.get('id')\n", " last_update = report.find('.//update[last()]')\n", "\n", " if last_update is not None:\n", " if report_type == 'desc':\n", " when_elem = last_update.find('when')\n", " what_elem = last_update.find('what')\n", "\n", " if when_elem is not None and what_elem is not None:\n", " data[report_id] = {\n", " 'when': when_elem.text,\n", " 'what': what_elem.text\n", " }\n", "\n", " elif report_type == 'severity':\n", " what_elem = last_update.find('what')\n", " if what_elem is not None and what_elem.text:\n", " raw_severity = what_elem.text.strip()\n", " # Store raw label\n", " self.raw_labels[project].add(raw_severity)\n", " # Update label statistics\n", " self.label_stats[project][raw_severity] += 1\n", "\n", " data[report_id] = {\n", " 'severity_mapped': self.convert_severity(raw_severity),\n", " 'severity_raw': raw_severity\n", " }\n", "\n", " return data\n", " except ET.ParseError as e:\n", " logging.error(f\"Error parsing XML file {file_path}: {str(e)}\")\n", " return {}\n", " except Exception as e:\n", " logging.error(f\"Unexpected error processing {file_path}: {str(e)}\")\n", " return {}\n", "\n", " def process_project(self, project: str) -> None:\n", " \"\"\"\n", " Process all XML files for a given project.\n", "\n", " Args:\n", " project: Project name\n", " \"\"\"\n", " project_folder = os.path.join(self.base_dir, project)\n", " if not os.path.exists(project_folder):\n", " logging.warning(f\"Project folder not found: {project_folder}\")\n", " return\n", "\n", " desc_path = os.path.join(project_folder, 'short_desc.xml')\n", " severity_path = os.path.join(project_folder, 'severity.xml')\n", "\n", " desc_data = self.process_xml_file(desc_path, 'desc', project)\n", " severity_data = self.process_xml_file(severity_path, 'severity', project)\n", "\n", " # Merge the data\n", " project_data = {}\n", " for report_id in desc_data:\n", " if report_id in severity_data:\n", " project_data[report_id] = {\n", " **desc_data[report_id],\n", " **severity_data[report_id]\n", " }\n", "\n", " self.all_data[project] = project_data\n", "\n", " def process_all_projects(self) -> None:\n", " \"\"\"Process all projects and analyze labels.\"\"\"\n", " for project in self.projects:\n", " logging.info(f\"Processing project: {project}\")\n", " self.process_project(project)\n", "\n", " def generate_label_report(self) -> None:\n", " \"\"\"Generate and print a comprehensive report of severity labels.\"\"\"\n", " print(\"\\n=== Raw Severity Labels Analysis ===\\n\")\n", "\n", " # All unique labels across all projects\n", " all_labels = set()\n", " for labels in self.raw_labels.values():\n", " all_labels.update(labels)\n", "\n", " print(f\"Total unique severity labels found: {len(all_labels)}\")\n", " print(\"\\nAll unique severity labels:\")\n", " for label in sorted(all_labels):\n", " mapped_value = self.SEVERITY_MAPPINGS.get(label.lower(), \"Non-Severe\")\n", " print(f\"- {label:<20} -> {mapped_value}\")\n", "\n", " print(\"\\nLabel distribution by project:\")\n", " for project in self.projects:\n", " if self.label_stats[project]:\n", " print(f\"\\n{project}:\")\n", " total = sum(self.label_stats[project].values())\n", " for label, count in self.label_stats[project].most_common():\n", " percentage = (count / total) * 100\n", " mapped_value = self.SEVERITY_MAPPINGS.get(label.lower(), \"Non-Severe\")\n", " print(f\" - {label:<20} : {count:>5} ({percentage:>6.2f}%) -> {mapped_value}\")\n", "\n", "def main():\n", " # Extract ZIP file\n", " zip_file_path = '/content/test1.zip'\n", " destination_directory = '/content/unzipped_data/'\n", "\n", " try:\n", " os.makedirs(destination_directory, exist_ok=True)\n", " with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:\n", " zip_ref.extractall(destination_directory)\n", " logging.info(f\"Successfully extracted ZIP file to {destination_directory}\")\n", " except Exception as e:\n", " logging.error(f\"Error extracting ZIP file: {str(e)}\")\n", " return\n", "\n", " # Process and analyze the bug reports\n", " processor = BugSeverityProcessor(destination_directory)\n", " processor.process_all_projects()\n", " processor.generate_label_report()\n", "\n", " # Print binary classification statistics\n", " print(\"\\n=== Binary Classification Statistics ===\\n\")\n", " for project, data in processor.all_data.items():\n", " severe_count = sum(1 for report in data.values()\n", " if report.get('severity_mapped') == 'Severe')\n", " total_count = len(data)\n", " if total_count > 0:\n", " severe_percentage = (severe_count / total_count) * 100\n", " print(f\"{project}: {severe_count}/{total_count} \"\n", " f\"({severe_percentage:.1f}%) classified as Severe\")\n", "\n", "if __name__ == \"__main__\":\n", " main()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Mjq4ZkQu-3gK", "outputId": "81331220-3ec6-40fc-f9cb-f5685a5e12d5" }, "execution_count": 14, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "\n", "=== Raw Severity Labels Analysis ===\n", "\n", "Total unique severity labels found: 6\n", "\n", "All unique severity labels:\n", "- blocker -> Severe\n", "- critical -> Severe\n", "- major -> Severe\n", "- minor -> Non-Severe\n", "- normal -> Non-Severe\n", "- trivial -> Non-Severe\n", "\n", "Label distribution by project:\n", "\n", "Bugzilla:\n", " - normal : 2478 ( 53.68%) -> Non-Severe\n", " - minor : 766 ( 16.59%) -> Non-Severe\n", " - major : 506 ( 10.96%) -> Severe\n", " - trivial : 415 ( 8.99%) -> Non-Severe\n", " - blocker : 275 ( 5.96%) -> Severe\n", " - critical : 176 ( 3.81%) -> Severe\n", "\n", "CDT:\n", " - normal : 4547 ( 80.62%) -> Non-Severe\n", " - major : 490 ( 8.69%) -> Severe\n", " - minor : 275 ( 4.88%) -> Non-Severe\n", " - critical : 166 ( 2.94%) -> Severe\n", " - trivial : 84 ( 1.49%) -> Non-Severe\n", " - blocker : 78 ( 1.38%) -> Severe\n", "\n", "Core:\n", " - normal : 56125 ( 75.55%) -> Non-Severe\n", " - critical : 10542 ( 14.19%) -> Severe\n", " - major : 4243 ( 5.71%) -> Severe\n", " - minor : 2072 ( 2.79%) -> Non-Severe\n", " - trivial : 859 ( 1.16%) -> Non-Severe\n", " - blocker : 451 ( 0.61%) -> Severe\n", "\n", "Firefox:\n", " - normal : 47635 ( 68.17%) -> Non-Severe\n", " - major : 9486 ( 13.57%) -> Severe\n", " - critical : 6603 ( 9.45%) -> Severe\n", " - minor : 4145 ( 5.93%) -> Non-Severe\n", " - trivial : 1777 ( 2.54%) -> Non-Severe\n", " - blocker : 233 ( 0.33%) -> Severe\n", "\n", "JDT:\n", " - normal : 8306 ( 76.81%) -> Non-Severe\n", " - major : 1000 ( 9.25%) -> Severe\n", " - minor : 781 ( 7.22%) -> Non-Severe\n", " - trivial : 359 ( 3.32%) -> Non-Severe\n", " - critical : 274 ( 2.53%) -> Severe\n", " - blocker : 94 ( 0.87%) -> Severe\n", "\n", "PDE:\n", " - normal : 4693 ( 82.99%) -> Non-Severe\n", " - major : 476 ( 8.42%) -> Severe\n", " - minor : 208 ( 3.68%) -> Non-Severe\n", " - critical : 117 ( 2.07%) -> Severe\n", " - trivial : 114 ( 2.02%) -> Non-Severe\n", " - blocker : 47 ( 0.83%) -> Severe\n", "\n", "Platform:\n", " - normal : 18891 ( 76.25%) -> Non-Severe\n", " - major : 2718 ( 10.97%) -> Severe\n", " - minor : 1088 ( 4.39%) -> Non-Severe\n", " - critical : 989 ( 3.99%) -> Severe\n", " - trivial : 674 ( 2.72%) -> Non-Severe\n", " - blocker : 415 ( 1.68%) -> Severe\n", "\n", "Thunderbird:\n", " - normal : 12429 ( 64.61%) -> Non-Severe\n", " - major : 2982 ( 15.50%) -> Severe\n", " - critical : 1894 ( 9.85%) -> Severe\n", " - minor : 1415 ( 7.36%) -> Non-Severe\n", " - trivial : 452 ( 2.35%) -> Non-Severe\n", " - blocker : 65 ( 0.34%) -> Severe\n", "\n", "=== Binary Classification Statistics ===\n", "\n", "Bugzilla: 957/4616 (20.7%) classified as Severe\n", "CDT: 734/5640 (13.0%) classified as Severe\n", "Core: 15236/74292 (20.5%) classified as Severe\n", "Firefox: 16322/69879 (23.4%) classified as Severe\n", "JDT: 1368/10814 (12.7%) classified as Severe\n", "PDE: 640/5655 (11.3%) classified as Severe\n", "Platform: 4122/24775 (16.6%) classified as Severe\n", "Thunderbird: 4941/19237 (25.7%) classified as Severe\n" ] } ] }, { "cell_type": "code", "source": [], "metadata": { "id": "ggs5AOVc-6Ww" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "import os\n", "import xml.etree.ElementTree as ET\n", "from zipfile import ZipFile\n", "from typing import Dict, Any, Set\n", "from collections import Counter\n", "import logging\n", "import json\n", "\n", "logging.basicConfig(\n", " level=logging.INFO,\n", " format='%(asctime)s - %(levelname)s - %(message)s'\n", ")\n", "\n", "class BugSeverityProcessor:\n", " # Comprehensive severity mappings\n", " SEVERITY_MAPPINGS = {\n", " # Mozilla (Firefox/Thunderbird) severities\n", " 'blocker': 'Severe',\n", " 'critical': 'Severe',\n", " 'major': 'Severe',\n", " 'normal': 'Non-Severe',\n", " 'minor': 'Non-Severe',\n", " 'trivial': 'Non-Severe',\n", " 'enhancement': 'Non-Severe',\n", "\n", " # Eclipse (JDT/PDE/Platform) severities\n", " 'blocking': 'Severe',\n", " 'critical_blocking': 'Severe',\n", " 'major_blocking': 'Severe',\n", " 'normal_blocking': 'Non-Severe',\n", " 'minor_blocking': 'Non-Severe',\n", " 'trivial_blocking': 'Non-Severe',\n", "\n", " # Bugzilla specific severities\n", " 's1': 'Severe',\n", " 's2': 'Severe',\n", " 's3': 'Non-Severe',\n", " 's4': 'Non-Severe',\n", " 's5': 'Non-Severe',\n", "\n", " # Core/CDT specific severities\n", " 'crash': 'Severe',\n", " 'severe': 'Severe',\n", " 'important': 'Severe',\n", " 'medium': 'Non-Severe',\n", " 'low': 'Non-Severe'\n", " }\n", "\n", " def __init__(self, base_dir: str):\n", " self.base_dir = base_dir\n", " self.projects = ['Bugzilla', 'CDT', 'Core', 'Firefox', 'JDT', 'PDE', 'Platform', 'Thunderbird']\n", " self.all_data: Dict[str, Dict[str, Any]] = {}\n", " self.raw_labels: Dict[str, Set[str]] = {project: set() for project in self.projects}\n", " self.label_stats: Dict[str, Counter] = {project: Counter() for project in self.projects}\n", " self.fixed_bugs: Dict[str, Set[str]] = {project: set() for project in self.projects}\n", "\n", " def convert_severity(self, severity: str) -> str:\n", " \"\"\"\n", " Convert severity labels to binary classification while preserving original label.\n", "\n", " Args:\n", " severity: The original severity label\n", "\n", " Returns:\n", " str: 'Severe' or 'Non-Severe'\n", " \"\"\"\n", " severity = severity.lower().strip()\n", " return self.SEVERITY_MAPPINGS.get(severity, 'Non-Severe')\n", "\n", " def process_xml_file(self, file_path: str, report_type: str, project: str) -> Dict[str, Any]:\n", " \"\"\"\n", " Process individual XML files.\n", "\n", " Args:\n", " file_path: Path to XML file\n", " report_type: Type of report ('desc', 'severity', or 'resolution')\n", " project: Project name for tracking raw labels\n", "\n", " Returns:\n", " Dict containing processed data\n", " \"\"\"\n", " try:\n", " tree = ET.parse(file_path)\n", " root = tree.getroot()\n", " data = {}\n", "\n", " for report in root.findall('.//report'):\n", " report_id = report.get('id')\n", " last_update = report.find('.//update[last()]')\n", "\n", " if last_update is not None:\n", " if report_type == 'desc':\n", " when_elem = last_update.find('when')\n", " what_elem = last_update.find('what')\n", "\n", " if when_elem is not None and what_elem is not None:\n", " data[report_id] = {\n", " 'when': when_elem.text,\n", " 'what': what_elem.text\n", " }\n", "\n", " elif report_type == 'severity':\n", " what_elem = last_update.find('what')\n", " if what_elem is not None and what_elem.text:\n", " raw_severity = what_elem.text.strip()\n", " self.raw_labels[project].add(raw_severity)\n", " self.label_stats[project][raw_severity] += 1\n", "\n", " data[report_id] = {\n", " 'severity_mapped': self.convert_severity(raw_severity),\n", " 'severity_raw': raw_severity\n", " }\n", "\n", " elif report_type == 'resolution':\n", " what_elem = last_update.find('what')\n", " if what_elem is not None and what_elem.text:\n", " resolution = what_elem.text.strip().upper()\n", " if resolution == 'FIXED':\n", " self.fixed_bugs[project].add(report_id)\n", " data[report_id] = {'resolution': 'FIXED'}\n", "\n", " return data\n", " except ET.ParseError as e:\n", " logging.error(f\"Error parsing XML file {file_path}: {str(e)}\")\n", " return {}\n", " except Exception as e:\n", " logging.error(f\"Unexpected error processing {file_path}: {str(e)}\")\n", " return {}\n", "\n", " def process_project(self, project: str) -> None:\n", " \"\"\"\n", " Process all XML files for a given project.\n", "\n", " Args:\n", " project: Project name\n", " \"\"\"\n", " project_folder = os.path.join(self.base_dir, project)\n", " if not os.path.exists(project_folder):\n", " logging.warning(f\"Project folder not found: {project_folder}\")\n", " return\n", "\n", " desc_path = os.path.join(project_folder, 'short_desc.xml')\n", " severity_path = os.path.join(project_folder, 'severity.xml')\n", " resolution_path = os.path.join(project_folder, 'resolution.xml')\n", "\n", " desc_data = self.process_xml_file(desc_path, 'desc', project)\n", " severity_data = self.process_xml_file(severity_path, 'severity', project)\n", " resolution_data = self.process_xml_file(resolution_path, 'resolution', project)\n", "\n", " # Merge the data only for FIXED bugs\n", " project_data = {}\n", " for report_id in self.fixed_bugs[project]:\n", " if report_id in desc_data and report_id in severity_data:\n", " project_data[report_id] = {\n", " **desc_data[report_id],\n", " **severity_data[report_id],\n", " **resolution_data[report_id]\n", " }\n", "\n", " self.all_data[project] = project_data\n", "\n", " def process_all_projects(self) -> None:\n", " \"\"\"Process all projects and analyze labels.\"\"\"\n", " for project in self.projects:\n", " logging.info(f\"Processing project: {project}\")\n", " self.process_project(project)\n", "\n", " def generate_label_report(self) -> None:\n", " \"\"\"Generate and print a comprehensive report of severity labels for FIXED bugs.\"\"\"\n", " print(\"\\n=== Raw Severity Labels Analysis (FIXED Bugs Only) ===\\n\")\n", "\n", " # All unique labels across all projects\n", " all_labels = set()\n", " for labels in self.raw_labels.values():\n", " all_labels.update(labels)\n", "\n", " print(f\"Total unique severity labels found: {len(all_labels)}\")\n", " print(\"\\nAll unique severity labels:\")\n", " for label in sorted(all_labels):\n", " mapped_value = self.SEVERITY_MAPPINGS.get(label.lower(), \"Non-Severe\")\n", " print(f\"- {label:<20} -> {mapped_value}\")\n", "\n", " print(\"\\nLabel distribution by project (FIXED bugs only):\")\n", " for project in self.projects:\n", " if self.label_stats[project]:\n", " print(f\"\\n{project}:\")\n", " fixed_bugs_count = len(self.all_data[project])\n", " if fixed_bugs_count > 0:\n", " severity_counter = Counter()\n", " for bug_data in self.all_data[project].values():\n", " severity_counter[bug_data['severity_raw']] += 1\n", "\n", " for label, count in severity_counter.most_common():\n", " percentage = (count / fixed_bugs_count) * 100\n", " mapped_value = self.SEVERITY_MAPPINGS.get(label.lower(), \"Non-Severe\")\n", " print(f\" - {label:<20} : {count:>5} ({percentage:>6.2f}%) -> {mapped_value}\")\n", "\n", "def main():\n", " # Extract ZIP file\n", " zip_file_path = '/content/test1.zip'\n", " destination_directory = '/content/unzipped_data/'\n", "\n", " try:\n", " os.makedirs(destination_directory, exist_ok=True)\n", " with ZipFile(zip_file_path, 'r') as zip_ref:\n", " zip_ref.extractall(destination_directory)\n", " logging.info(f\"Successfully extracted ZIP file to {destination_directory}\")\n", " except Exception as e:\n", " logging.error(f\"Error extracting ZIP file: {str(e)}\")\n", " return\n", "\n", " # Process and analyze the bug reports\n", " processor = BugSeverityProcessor(destination_directory)\n", " processor.process_all_projects()\n", " processor.generate_label_report()\n", "\n", " # Print binary classification statistics for FIXED bugs\n", " print(\"\\n=== Binary Classification Statistics (FIXED Bugs Only) ===\\n\")\n", " for project, data in processor.all_data.items():\n", " severe_count = sum(1 for report in data.values()\n", " if report.get('severity_mapped') == 'Severe')\n", " total_count = len(data)\n", " if total_count > 0:\n", " severe_percentage = (severe_count / total_count) * 100\n", " print(f\"{project}: {severe_count}/{total_count} \"\n", " f\"({severe_percentage:.1f}%) classified as Severe\")\n", " print(f\"Total FIXED bugs: {total_count}\")\n", "\n", "if __name__ == \"__main__\":\n", " main()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "5NpKeUE7-6bH", "outputId": "95de9cea-da3e-4e18-a8de-ffa2bd65103a" }, "execution_count": 15, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "\n", "=== Raw Severity Labels Analysis (FIXED Bugs Only) ===\n", "\n", "Total unique severity labels found: 6\n", "\n", "All unique severity labels:\n", "- blocker -> Severe\n", "- critical -> Severe\n", "- major -> Severe\n", "- minor -> Non-Severe\n", "- normal -> Non-Severe\n", "- trivial -> Non-Severe\n", "\n", "Label distribution by project (FIXED bugs only):\n", "\n", "Bugzilla:\n", " - normal : 1033 ( 42.48%) -> Non-Severe\n", " - minor : 492 ( 20.23%) -> Non-Severe\n", " - trivial : 289 ( 11.88%) -> Non-Severe\n", " - blocker : 265 ( 10.90%) -> Severe\n", " - major : 253 ( 10.40%) -> Severe\n", " - critical : 100 ( 4.11%) -> Severe\n", "\n", "CDT:\n", " - normal : 3539 ( 83.51%) -> Non-Severe\n", " - major : 303 ( 7.15%) -> Severe\n", " - minor : 194 ( 4.58%) -> Non-Severe\n", " - critical : 89 ( 2.10%) -> Severe\n", " - trivial : 70 ( 1.65%) -> Non-Severe\n", " - blocker : 43 ( 1.01%) -> Severe\n", "\n", "Core:\n", " - normal : 36960 ( 78.70%) -> Non-Severe\n", " - critical : 5834 ( 12.42%) -> Severe\n", " - major : 2043 ( 4.35%) -> Severe\n", " - minor : 1157 ( 2.46%) -> Non-Severe\n", " - trivial : 633 ( 1.35%) -> Non-Severe\n", " - blocker : 334 ( 0.71%) -> Severe\n", "\n", "Firefox:\n", " - normal : 9787 ( 83.21%) -> Non-Severe\n", " - major : 670 ( 5.70%) -> Severe\n", " - minor : 474 ( 4.03%) -> Non-Severe\n", " - trivial : 434 ( 3.69%) -> Non-Severe\n", " - critical : 275 ( 2.34%) -> Severe\n", " - blocker : 122 ( 1.04%) -> Severe\n", "\n", "JDT:\n", " - normal : 4508 ( 76.23%) -> Non-Severe\n", " - major : 547 ( 9.25%) -> Severe\n", " - minor : 400 ( 6.76%) -> Non-Severe\n", " - trivial : 294 ( 4.97%) -> Non-Severe\n", " - critical : 127 ( 2.15%) -> Severe\n", " - blocker : 38 ( 0.64%) -> Severe\n", "\n", "PDE:\n", " - normal : 3312 ( 83.47%) -> Non-Severe\n", " - major : 303 ( 7.64%) -> Severe\n", " - minor : 155 ( 3.91%) -> Non-Severe\n", " - trivial : 93 ( 2.34%) -> Non-Severe\n", " - critical : 81 ( 2.04%) -> Severe\n", " - blocker : 24 ( 0.60%) -> Severe\n", "\n", "Platform:\n", " - normal : 10793 ( 77.75%) -> Non-Severe\n", " - major : 1412 ( 10.17%) -> Severe\n", " - minor : 580 ( 4.18%) -> Non-Severe\n", " - trivial : 537 ( 3.87%) -> Non-Severe\n", " - critical : 418 ( 3.01%) -> Severe\n", " - blocker : 142 ( 1.02%) -> Severe\n", "\n", "Thunderbird:\n", " - normal : 2842 ( 77.27%) -> Non-Severe\n", " - major : 287 ( 7.80%) -> Severe\n", " - minor : 222 ( 6.04%) -> Non-Severe\n", " - trivial : 148 ( 4.02%) -> Non-Severe\n", " - critical : 145 ( 3.94%) -> Severe\n", " - blocker : 34 ( 0.92%) -> Severe\n", "\n", "=== Binary Classification Statistics (FIXED Bugs Only) ===\n", "\n", "Bugzilla: 618/2432 (25.4%) classified as Severe\n", "Total FIXED bugs: 2432\n", "CDT: 435/4238 (10.3%) classified as Severe\n", "Total FIXED bugs: 4238\n", "Core: 8211/46961 (17.5%) classified as Severe\n", "Total FIXED bugs: 46961\n", "Firefox: 1067/11762 (9.1%) classified as Severe\n", "Total FIXED bugs: 11762\n", "JDT: 712/5914 (12.0%) classified as Severe\n", "Total FIXED bugs: 5914\n", "PDE: 408/3968 (10.3%) classified as Severe\n", "Total FIXED bugs: 3968\n", "Platform: 1972/13882 (14.2%) classified as Severe\n", "Total FIXED bugs: 13882\n", "Thunderbird: 466/3678 (12.7%) classified as Severe\n", "Total FIXED bugs: 3678\n" ] } ] }, { "cell_type": "code", "source": [], "metadata": { "id": "W9ifMj3m-6e-" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "from zipfile import ZipFile\n", "with ZipFile('/content/test1.zip', 'r') as zip_ref:\n", " zip_ref.extractall('unzipped_data')\n" ], "metadata": { "id": "CRt8lednbbVi" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "# Mount Google Drive #Skip\n", "from google.colab import drive\n", "drive.mount('/content/drive')\n", "\n", "# Define the path to your ZIP file in Google Drive\n", "zip_file_path = '/content/test1.zip'\n", "\n", "# Destination directory where you want to extract the ZIP file\n", "destination_directory = '/content/unzipped_data/'\n", "\n", "# Import the necessary libraries\n", "import zipfile\n", "import os\n", "\n", "# Create the destination directory if it doesn't exist\n", "os.makedirs(destination_directory, exist_ok=True)\n", "\n", "# Extract the ZIP file\n", "with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:\n", " zip_ref.extractall(destination_directory)\n", "\n", "# Check the content of the destination directory\n", "os.listdir(destination_directory)\n", "\n", "\n", "\n", "\n", "import os\n", "import xml.etree.ElementTree as ET\n", "from zipfile import ZipFile\n", "\n", "# List of project names to process\n", "projects = ['Bugzilla', 'CDT', 'Core', 'Firefox', 'JDT', 'PDE', 'Platform', 'Thunderbird']\n", "\n", "# Dictionary to store the extracted data\n", "all_data = {}\n", "\n", "# Function to convert severity labels\n", "def convert_severity(severity):\n", " # Add your criteria to convert severity labels to \"Severe\" or \"Non-Severe\"\n", " if severity in ['critical', 'blocker', 'major']:\n", " return 'Severe'\n", " else:\n", " return 'Non-Severe'\n", "\n", "for project in projects:\n", " project_folder = f\"unzipped_data/{project}\"\n", "\n", " # Define the XML file names for each project\n", " xml_files = ['short_desc.xml', 'severity.xml']\n", "\n", " # Dictionary to store the extracted data for the current project\n", " project_data = {}\n", "\n", " for xml_file in xml_files:\n", " file_path = os.path.join(project_folder, xml_file)\n", "\n", " if os.path.exists(file_path):\n", " tree = ET.parse(file_path)\n", " root = tree.getroot()\n", "\n", " if xml_file == 'short_desc.xml':\n", " reports = root.findall('.//report')\n", " elif xml_file == 'severity.xml':\n", " severities = root.findall('.//report')\n", "\n", " for report in reports:\n", " report_id = report.get('id')\n", " last_update = report.find('.//update[last()]')\n", "\n", " if last_update is not None:\n", " when = last_update.find('when').text\n", " what = last_update.find('what').text\n", "\n", " if report_id not in project_data:\n", " project_data[report_id] = {}\n", " project_data[report_id]['when'] = when\n", " project_data[report_id]['what'] = what\n", "\n", " for severity in severities:\n", " report_id = severity.get('id')\n", " last_update = severity.find('.//update[last()]')\n", "\n", " if last_update is not None:\n", " last_what_element = last_update.find('what[last()]')\n", " if last_what_element is not None:\n", " severity_label = last_what_element.text\n", " converted_severity = convert_severity(severity_label)\n", " if report_id in project_data:\n", " project_data[report_id]['severity'] = converted_severity\n", "\n", " # Store the data for the current project\n", " all_data[project] = project_data\n", "\n", "# Now you have all the data for different projects in the 'all_data' dictionary\n", "# You can use this data to train and validate your model as needed\n", "\n" ], "metadata": { "id": "ZWgFoLCP0I_3", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "0c64f884-51a9-439d-8389-edd51d420c44" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount(\"/content/drive\", force_remount=True).\n" ] }, { "output_type": "execute_result", "data": { "text/plain": [ "['Platform', 'Bugzilla', 'Core', 'Firefox', 'JDT', 'Thunderbird', 'CDT', 'PDE']" ] }, "metadata": {}, "execution_count": 9 } ] }, { "cell_type": "code", "source": [ "import xml.etree.ElementTree as ET\n", "\n", "def extract_data_from_xml(xml_file):\n", " tree = ET.parse(xml_file)\n", " root = tree.getroot()\n", "\n", " severity = root.find('severity').text\n", " summary = root.find('summary').text\n", "\n", " return severity, summary\n" ], "metadata": { "id": "chBpeVfEbiD4" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "#Skip\n", "import random\n", "project_folders = ['Bugzilla', 'CDT', 'Core', 'Firefox', 'JDT', 'PDE', 'Platform', 'Thunderbird'] # List of project folder names\n", "\n", "# Shuffle the project list to ensure each project is tested exactly once\n", "random.shuffle(project_folders)\n", "\n", "# Number of iterations\n", "num_iterations = 8 # You can adjust this as needed\n", "\n", "for i in range(num_iterations):\n", " testing_project = project_folders[i]\n", " training_projects = project_folders[:i] + project_folders[i + 1:]\n", "\n", " print(f\"Iteration {i + 1}:\")\n", " print(f\"Training Projects: {training_projects}\")\n", " print(f\"Testing Project: {testing_project}\\n\")" ], "metadata": { "id": "VbO4HaiGdcp9", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "5821dd8b-fe0c-4459-a7ad-83623187ef56" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Iteration 1:\n", "Training Projects: ['PDE', 'CDT', 'Core', 'Thunderbird', 'JDT', 'Bugzilla', 'Platform']\n", "Testing Project: Firefox\n", "\n", "Iteration 2:\n", "Training Projects: ['Firefox', 'CDT', 'Core', 'Thunderbird', 'JDT', 'Bugzilla', 'Platform']\n", "Testing Project: PDE\n", "\n", "Iteration 3:\n", "Training Projects: ['Firefox', 'PDE', 'Core', 'Thunderbird', 'JDT', 'Bugzilla', 'Platform']\n", "Testing Project: CDT\n", "\n", "Iteration 4:\n", "Training Projects: ['Firefox', 'PDE', 'CDT', 'Thunderbird', 'JDT', 'Bugzilla', 'Platform']\n", "Testing Project: Core\n", "\n", "Iteration 5:\n", "Training Projects: ['Firefox', 'PDE', 'CDT', 'Core', 'JDT', 'Bugzilla', 'Platform']\n", "Testing Project: Thunderbird\n", "\n", "Iteration 6:\n", "Training Projects: ['Firefox', 'PDE', 'CDT', 'Core', 'Thunderbird', 'Bugzilla', 'Platform']\n", "Testing Project: JDT\n", "\n", "Iteration 7:\n", "Training Projects: ['Firefox', 'PDE', 'CDT', 'Core', 'Thunderbird', 'JDT', 'Platform']\n", "Testing Project: Bugzilla\n", "\n", "Iteration 8:\n", "Training Projects: ['Firefox', 'PDE', 'CDT', 'Core', 'Thunderbird', 'JDT', 'Bugzilla']\n", "Testing Project: Platform\n", "\n" ] } ] }, { "cell_type": "code", "source": [ "import os\n", "import xml.etree.ElementTree as ET\n", "\n", "# Define the project folder and XML file names\n", "project_folder = \"CDT\" # Replace with the project folder you want to process\n", "xml_files = ['assigned_to.xml', 'bug_status.xml', 'cc.xml', 'component.xml', 'short_desc.xml', 'severity.xml']\n", "\n", "# Dictionary to store the extracted data\n", "data = {}\n", "\n", "# Loop through the XML files and extract severity and short_description\n", "for xml_file in xml_files:\n", " file_path = os.path.join(project_folder, xml_file)\n", "\n", " if os.path.exists(file_path):\n", " tree = ET.parse(file_path)\n", " root = tree.getroot()\n", "\n", " severity_element = root.find('severity')\n", " short_desc_element = root.find('short_desc')\n", "\n", " severity = severity_element.text if severity_element is not None else \"N/A\"\n", " short_desc = short_desc_element.text if short_desc_element is not None else \"N/A\"\n", "\n", " data[xml_file] = {'severity': severity, 'short_description': short_desc}\n", "\n", "# Print the extracted data\n", "for file, content in data.items():\n", " print(f\"File: {file}\")\n", " print(f\"Severity: {content['severity']}\")\n", " print(f\"Short Description: {content['short_description']}\")\n", " print()\n" ], "metadata": { "id": "iz5022XQeqnl" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "import os\n", "import xml.etree.ElementTree as ET\n", "from zipfile import ZipFile\n", "\n", "# Extract the ZIP file\n", "with ZipFile('test1.zip', 'r') as zip_ref:\n", " zip_ref.extractall('unzipped_data')\n", "\n", "# Define the project folder and XML file names\n", "project_folder = \"unzipped_data/CDT\" # Replace with the project folder you want to process\n", "xml_files = ['assigned_to.xml', 'bug_status.xml', 'cc.xml', 'component.xml', 'short_desc.xml', 'severity.xml']\n", "\n", "# Dictionary to store the extracted data\n", "data = {}\n", "\n", "# Loop through the XML files and extract severity and short_description\n", "for xml_file in xml_files:\n", " file_path = os.path.join(project_folder, xml_file)\n", "\n", " if os.path.exists(file_path):\n", " tree = ET.parse(file_path)\n", " root = tree.getroot()\n", "\n", " severity_element = root.find('severity')\n", " short_desc_element = root.find('short_desc')\n", "\n", " severity = severity_element.text if severity_element is not None else \"N/A\"\n", " short_desc = short_desc_element.text if short_desc_element is not None else \"N/A\"\n", "\n", " data[xml_file] = {'severity': severity, 'short_description': short_desc}\n", "\n", "# Print the extracted data\n", "for file, content in data.items():\n", " print(f\"File: {file}\")\n", " print(f\"Severity: {content['severity']}\")\n", " print(f\"Short Description: {content['short_description']}\")\n", " print()\n" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "IOI_EzjTfQon", "outputId": "3817ecf4-3929-4cf9-950e-9fac87a4e696" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "File: assigned_to.xml\n", "Severity: N/A\n", "Short Description: N/A\n", "\n", "File: bug_status.xml\n", "Severity: N/A\n", "Short Description: N/A\n", "\n", "File: cc.xml\n", "Severity: N/A\n", "Short Description: N/A\n", "\n", "File: component.xml\n", "Severity: N/A\n", "Short Description: N/A\n", "\n", "File: short_desc.xml\n", "Severity: N/A\n", "Short Description: N/A\n", "\n", "File: severity.xml\n", "Severity: N/A\n", "Short Description: N/A\n", "\n" ] } ] }, { "cell_type": "code", "source": [ "import os\n", "import xml.etree.ElementTree as ET\n", "from zipfile import ZipFile\n", "\n", "# Extract the ZIP file\n", "with ZipFile('test1.zip', 'r') as zip_ref:\n", " zip_ref.extractall('unzipped_data')\n", "\n", "# Define the project folder and XML file names\n", "project_folder = \"unzipped_data/Bugzilla\" # Replace with the project folder you want to process\n", "xml_files = ['short_desc.xml'] # Use only the short_desc.xml file\n", "\n", "# Dictionary to store the extracted data\n", "data = {}\n", "\n", "# Loop through the XML files and extract the last tag for each report\n", "for xml_file in xml_files:\n", " file_path = os.path.join(project_folder, xml_file)\n", "\n", " if os.path.exists(file_path):\n", " tree = ET.parse(file_path)\n", " root = tree.getroot()\n", "\n", " # Find all elements\n", " reports = root.findall('.//report')\n", "\n", " for report in reports:\n", " report_id = report.get('id')\n", " last_update = report.find('.//update[last()]')\n", "\n", " if last_update is not None:\n", " when = last_update.find('when').text\n", " what = last_update.find('what').text\n", "\n", " data[report_id] = {'when': when, 'what': what}\n", "\n", "# Print the extracted data\n", "for report_id, content in data.items():\n", " print(f\"Report ID: {report_id}\")\n", " print(f\"When: {content['when']}\")\n", " print(f\"What: {content['what']}\")\n", " print()\n" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "8vSIedCRgci6", "outputId": "3da7b90c-2636-4dbf-f67d-f7cef8f042d5" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "\u001b[1;30;43mStreaming output truncated to the last 5000 lines.\u001b[0m\n", "Report ID: 536190\n", "When: 1261382350\n", "What: Migration from 3.0.4 to 3.4.4: Problem with user member of many groups (more than 500)\n", "\n", "Report ID: 536364\n", "When: 1261458281\n", "What: wrtw\n", "\n", "Report ID: 536553\n", "When: 1261549211\n", "What: Change case of default status and workflow names from UPPERCASE to InitialCaps\n", "\n", "Report ID: 536583\n", "When: 1261563117\n", "What: \"Features\" underlines on mouseover\n", "\n", "Report ID: 536589\n", "When: 1261564854\n", "What: No ability to manually set language\n", "\n", "Report ID: 537083\n", "When: 1262059084\n", "What: The attachment table is too narrow by default\n", "\n", "Report ID: 537111\n", "When: 1262075348\n", "What: help link is broken, excludes locale..\n", "\n", "Report ID: 537295\n", "When: 1262168751\n", "What: Patch to fix t/008filter.t error for template/en/default/reports/duplicates.rdf.tmpl\n", "\n", "Report ID: 537328\n", "When: 1262184848\n", "What: Clicking 'reply' shouldn't add anything to my history\n", "\n", "Report ID: 537746\n", "When: 1262597564\n", "What: Search Criteria Headers on Buglist Duplicate Unnecessarily\n", "\n", "Report ID: 537758\n", "When: 1262599464\n", "What: Failed to bind to the LDAP server. The error message was: No password, did you mean noauth or anonymous ?\n", "\n", "Report ID: 537765\n", "When: 1262600271\n", "What: TEST\n", "\n", "Report ID: 537766\n", "When: 1262601783\n", "What: Add contrib/console.pl\n", "\n", "Report ID: 537788\n", "When: 1262606429\n", "What: Attachment not working without description\n", "\n", "Report ID: 537834\n", "When: 1262616958\n", "What: Buglist results using atom ctype do not display users with empty real names\n", "\n", "Report ID: 537846\n", "When: 1262621547\n", "What: Disable the \"Reset\" checkbox for the maintainer parameter\n", "\n", "Report ID: 538039\n", "When: 1262702695\n", "What: Typo when deleting a custom field value\n", "\n", "Report ID: 538088\n", "When: 1262720187\n", "What: timezone NST considered illegal\n", "\n", "Report ID: 538134\n", "When: 1262746197\n", "What: \"File a new bug\" link not on buglist results even though bugs are all from one product\n", "\n", "Report ID: 538428\n", "When: 1262859089\n", "What: bugzilla.dtd is not valid\n", "\n", "Report ID: 538449\n", "When: 1262864963\n", "What: Allow specifying a default version for new bugs\n", "\n", "Report ID: 538551\n", "When: 1262913533\n", "What: Bugzilla\n", "\n", "Report ID: 538705\n", "When: 1262966320\n", "What: Bugzilla->dbh will become invalid in long jobqueue.pl runs\n", "\n", "Report ID: 538871\n", "When: 1263114323\n", "What: Comment line wrapping breaks autolinkification of internal links (e.g. bug xxxx, comment yy)\n", "\n", "Report ID: 538936\n", "When: 1263173566\n", "What: After migration from 3.0.2 to 3.4.4 in a query list there' s a lots of missing value for Hw (Plt) field.\n", "\n", "Report ID: 539018\n", "When: 1263202391\n", "What: cannot change my account password\n", "\n", "Report ID: 539059\n", "When: 1263210737\n", "What: Bugzilla history shows the old name for renamed flags\n", "\n", "Report ID: 539081\n", "When: 1263214429\n", "What: Integrate bug history into the bug's display page\n", "\n", "Report ID: 539229\n", "When: 1263280538\n", "What: Editing a search with resolution '---' does not remember that resolution\n", "\n", "Report ID: 539667\n", "When: 1263436891\n", "What: crash if I close the mail window while checking for new POP mail\n", "\n", "Report ID: 540027\n", "When: 1263555924\n", "What: Show_bug and search results result in blank page after upgrading to 3.4.4\n", "\n", "Report ID: 540391\n", "When: 1263776186\n", "What: bug\n", "\n", "Report ID: 540570\n", "When: 1263869259\n", "What: Results confined of last searched bugs while editing search after clicking on \"showing last searched results\"\n", "\n", "Report ID: 540811\n", "When: 1263956474\n", "What: Mails sent by bugzilla are truncated (last line is missing)\n", "\n", "Report ID: 541383\n", "When: 1264141136\n", "What: Search Information at top of search results is wrong when using Advanced Searching Using Boolean Charts\n", "\n", "Report ID: 541414\n", "When: 1264146631\n", "What: Bad french translation \"Totauxs\"\n", "\n", "Report ID: 541427\n", "When: 1264149940\n", "What: Blank page when trying to edit product group access control\n", "\n", "Report ID: 541553\n", "When: 1264173224\n", "What: [Oracle] During upgrade: ORA-04098: trigger 'BUGS_TIP.PRODUCTS_MILESTONEURL' is invalid and failed re-validation\n", "\n", "Report ID: 542208\n", "When: 1264481606\n", "What: Error im ChangeCol.cgi\n", "\n", "Report ID: 542532\n", "When: 1264579088\n", "What: checksetup.pl does not use webservergroup when supplied in a script if it is an empty string\n", "\n", "Report ID: 542671\n", "When: 1264604768\n", "What: attachment.cgi error on Windows server when temp folder is not c:\\temp\n", "\n", "Report ID: 542790\n", "When: 1264666113\n", "What: incorrect display of UTF-8 french charater\n", "\n", "Report ID: 542931\n", "When: 1264698990\n", "What: Bug in SOAP::Lite prevents WebService:XMLRPC logins from persisting\n", "\n", "Report ID: 542973\n", "When: 1264722387\n", "What: Version column in search result truncates data\n", "\n", "Report ID: 543102\n", "When: 1264769166\n", "What: column isactive in table for customfields - not used?\n", "\n", "Report ID: 543342\n", "When: 1264928835\n", "What: Release Notes for Bugzilla 3.4.5\n", "\n", "Report ID: 543343\n", "When: 1264928853\n", "What: Release Notes for Bugzilla 3.2.6\n", "\n", "Report ID: 543728\n", "When: 1265088198\n", "What: logging in with bugzìlla@gemal.dk produces \"There is already an account with the login name bugzìlla@gemal.dk.\"\n", "\n", "Report ID: 543986\n", "When: 1265176771\n", "What: False positive in 012throwables.t, 'object_does_not_exist' is still in use\n", "\n", "Report ID: 543987\n", "When: 1265177039\n", "What: bzr to CVS replication broken since we branched\n", "\n", "Report ID: 544008\n", "When: 1265183645\n", "What: 'Base class package \"Bugzilla::Field::ChoiceInterface\" is empty' thrown when running checksetup.pl\n", "\n", "Report ID: 544083\n", "When: 1265197567\n", "What: Add boxes to the 3.6 branch in Tinderbox\n", "\n", "Report ID: 544084\n", "When: 1265197628\n", "What: Add the 3.6 branch to the changelog page\n", "\n", "Report ID: 544114\n", "When: 1265205126\n", "What: Passwords are required to be under 16 characters\n", "\n", "Report ID: 544296\n", "When: 1265278845\n", "What: Bad title displayed when viewing the duplicates table (simple format)\n", "\n", "Report ID: 544506\n", "When: 1265360219\n", "What: Bugzilla->input_params is undefined when calling Bugzilla.version and other methods taking no argument\n", "\n", "Report ID: 544615\n", "When: 1265385539\n", "What: Bug.legal_values triggers an insecure dependency in Bugzilla::Field::get_legal_field_values()\n", "\n", "Report ID: 544662\n", "When: 1265437573\n", "What: Windows / Internet Explorer loses data when cc doesn' t match\n", "\n", "Report ID: 544751\n", "When: 1265520868\n", "What: question marks, plus and minus signs are not visible in UI to attachments; menus are too narrow\n", "\n", "Report ID: 544798\n", "When: 1265553186\n", "What: Using edit-multiple causes \"To reassign a bug, you must provide an address for the new assignee. \"\n", "\n", "Report ID: 544899\n", "When: 1265618842\n", "What: Clipping glitch in Assassin's creed II.\n", "\n", "Report ID: 544990\n", "When: 1265635040\n", "What: Allow directory names in page.cgi ids\n", "\n", "Report ID: 545029\n", "When: 1265641503\n", "What: When deleting a group, no warning is thrown about group inheritance\n", "\n", "Report ID: 545253\n", "When: 1265727038\n", "What: Do not display flags as editable when you cannot edit attachment attributes\n", "\n", "Report ID: 545260\n", "When: 1265729178\n", "What: Template hook per-language hook cache is wrong\n", "\n", "Report ID: 545277\n", "When: 1265733750\n", "What: Closed bugs are always FIXED, in the