yash1506 committed on
Commit
de69528
·
verified ·
1 Parent(s): 63602f6

Update llm_handler.py

Browse files
Files changed (1) hide show
  1. llm_handler.py +117 -117
llm_handler.py CHANGED
@@ -1,140 +1,140 @@
1
- from huggingface_hub import InferenceClient
2
  import pandas as pd
3
  import re
4
- from pathlib import Path
5
  import os
 
 
6
  from dotenv import load_dotenv
7
 
8
- class LLMHandler:
9
  def __init__(self):
10
  load_dotenv()
11
  self.client = InferenceClient(api_key=os.getenv("HF_TOK_KEY"))
12
  self.model = "Qwen/Qwen2.5-Coder-32B-Instruct"
13
- self.results_dir = Path("results")
14
- self.results_dir.mkdir(exist_ok=True)
15
-
16
- def analyze_data(self, df):
 
17
  """
18
- Analyze the dataset using the LLM model
19
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  # Generate knowledge prompt
21
- know_prompt = self._generate_knowledge_prompt(df)
22
-
23
- # Process each row
24
- results = []
25
- for idx, row in df.iterrows():
26
- result = self._process_row(row, know_prompt)
27
- results.append(result)
28
-
29
- result = self._process_row(row, know_prompt) # Process the row
30
- results.append(result)
31
-
32
- # Save results
33
- results_df = pd.DataFrame(results)
34
- results_path = self.results_dir / "PROBABILITY_OF_EACH_ROW_DDOS_AND_BENGNIN.csv"
35
- results_df.to_csv(results_path, index=False)
36
-
37
- return results_df
38
-
39
- def _generate_knowledge_prompt(self, df):
 
 
 
 
 
 
 
 
 
40
  """
41
- Generate the knowledge prompt for the LLM
42
  """
43
- df_without_label = df.drop(self._get_label_column(df), axis=1)
44
- stats = {
45
- 'max': df_without_label.max(),
46
- 'min': df_without_label.min(),
47
- 'median': df_without_label.median(),
48
- 'mean': df_without_label.mean(),
49
- 'variance': df_without_label.var()
50
- }
51
-
52
- prompt = ("Supposed that you are now an [[ HIGHLY EXPERIENCED NETWORK TRAFFIC DATA ANALYSIS EXPERT ]] . "
53
- "You need to help me analyze the data in the DDoS dataset and determine whether the data is [[ DDoS traffic ]] or [[ normal traffic ]] ."
54
- "Next, I will give you the maximum, minimum, median, mean, and variance of all the data under each label or columns present in the data set, which may help you make your judgment."
55
- "DO DEEP ANALYSIS ITS YOUR WORKS AND PRROIVDE ACCURATE ANSWERS ALONGWITH GIVEN TASK ::\n\n")
56
-
57
- for column in df_without_label.columns:
58
- prompt += (f"{column}: max={stats['max'][column]:.1f}, "
59
- f"min={stats['min'][column]:.1f}, "
60
- f"median={stats['median'][column]:.1f}, "
61
- f"mean={stats['mean'][column]:.1f}, "
62
- f"variance={stats['variance'][column]:.1f}\n")
63
-
64
  return prompt
65
-
66
- def _process_row(self, row, know_prompt):
67
  """
68
- Process a single row using the LLM
69
  """
70
- if isinstance(row, dict):
71
- row = pd.Series(row)
72
- # Prepare row data prompt
73
- row_prompt = self._generate_row_prompt(row)
74
-
75
- # Get LLM response
76
- messages = [
77
- {'role': 'user', 'content': know_prompt},
78
- {'role': 'user', 'content': row_prompt}
79
- ]
80
-
81
- completion = self.client.chat.completions.create(
82
- model=self.model,
83
- messages=messages,
84
- max_tokens=10000
85
  )
86
-
87
- # Extract probabilities
88
- response = completion.choices[0].message.content
89
- probabilities = self._extract_probabilities(response)
90
-
91
- return {
92
- 'index': row.name, # This should now work as row is a pandas.Series
93
- 'attack': probabilities[0],
94
- 'benign': probabilities[1],
95
- 'original': response
96
- }
97
-
98
- def _generate_row_prompt(self, row):
99
  """
100
- Generate prompt for a single row
101
  """
102
- return ("Next, I will give you a piece of data about network traffic information. You need to first tell me the probability of this data belonging to DDoS traffic data and normal traffic data directly and separately, and express the probability in the format of [0.xxx,0.xxx]. "
103
- "The first number is the probability of DDoS traffic like [[ 0.xxx ]] only where x are any numeral , and the second number is the probability of normal traffic like[ 0.xxx ]] only where x are any numeral . KEEP IN MIND : [ BOTH NUMBERS ARE DECIMALS BETWEEN 0 AND 1, AND THE SUM OF THE TWO NUMBERS IS 1 ] . "
104
- "CLEARLY NOTE THAT YOU HAVE TO PROVIDE ONLY PROBABLITY OF EACH DDOS AND BENGNIN GIVEN THAT TOATL PROBABILITY WHEN SUMMED MUST BE 1 . "
105
- "AND YES IT MUST BE LIKE 0<=P([DDOS,BENGNIN])<=1 BUT YOU HAVE TO PROVIDE PROBABILITY IN THIS FORM express the probability in the format of [0.xxx,0.xxx] ONLY ."
106
- "Let's think step by step and explain the reasons for your judgment Each POINTWISE . The characteristics of its network traffic data packets are"
107
- "for DDoS vs normal traffic in the format [0.xxx,0.xxx]. "
108
- "The data features are:\n" +
109
- "\n".join([f"{col}: {val}" for col, val in row.items()]))
110
-
111
- def _extract_probabilities(self, response):
 
 
 
 
 
 
 
 
 
 
 
112
  """
113
- Extract probabilities from LLM response
114
  """
115
  pattern = r'\[(.*?)\]'
116
- matches = re.findall(pattern, response)
117
-
118
- if matches and ',' in matches[0]:
119
- probs = matches[0].split(',')
120
- return [float(p) for p in probs]
121
- return [None, None]
122
-
123
- def get_chat_response(self, prompt):
124
- """
125
- Get response for chat interface
126
- """
127
- messages = [{'role': 'user', 'content': prompt}]
128
- completion = self.client.chat.completions.create(
129
- model=self.model,
130
- messages=messages,
131
- max_tokens=10000
132
- )
133
- return completion.choices[0].message.content
134
-
135
- def _get_label_column(self, df):
136
- """
137
- Get the label column name
138
- """
139
- potential_labels = [col for col in df.columns if ' Label' in col.lower()]
140
- return potential_labels[0] if potential_labels else None
 
 
1
  import pandas as pd
2
  import re
 
3
  import os
4
+ from pathlib import Path
5
+ from huggingface_hub import InferenceClient
6
  from dotenv import load_dotenv
7
 
8
class DDoSInference:
    """Row-by-row DDoS vs. benign classification of a traffic CSV via a hosted LLM."""

    def __init__(self):
        # Credentials (HF_TOK_KEY) come from a local .env file.
        load_dotenv()
        api_token = os.getenv("HF_TOK_KEY")
        self.client = InferenceClient(api_key=api_token)
        self.model = "Qwen/Qwen2.5-Coder-32B-Instruct"
        # Preprocessed input and the incrementally-written results file both
        # live under ~/.dataset.
        dataset_dir = Path("~/.dataset").expanduser()
        self.dataset_path = dataset_dir / "original.csv"
        self.results_path = dataset_dir / "PROBABILITY_OF_EACH_ROW_DDOS_AND_BENGNIN.csv"
        # Create the directory up front so per-row result writes never fail.
        self.results_path.parent.mkdir(parents=True, exist_ok=True)
16
+
17
+ def process_dataset(self):
18
  """
19
+ Processes the dataset row by row and performs inference using the LLM.
20
  """
21
+ if not self.dataset_path.exists():
22
+ raise FileNotFoundError("The preprocessed dataset file does not exist. Ensure it is generated using the processor.")
23
+
24
+ ddos_data = pd.read_csv(self.dataset_path)
25
+
26
+ label_column = " Label"
27
+ if label_column not in ddos_data.columns:
28
+ label_column = input("Enter the label column name in your dataset: ").strip()
29
+ if label_column not in ddos_data.columns:
30
+ raise ValueError(f"Label column '{label_column}' not found in the dataset.")
31
+
32
+ ddos_data_without_label = ddos_data.drop([label_column], axis=1)
33
+ stats = {
34
+ 'Max': ddos_data_without_label.max(),
35
+ 'Min': ddos_data_without_label.min(),
36
+ 'Median': ddos_data_without_label.median(),
37
+ 'Mean': ddos_data_without_label.mean(),
38
+ 'Variance': ddos_data_without_label.var()
39
+ }
40
+
41
  # Generate knowledge prompt
42
+ know_prompt = self.generate_knowledge_prompt(stats)
43
+
44
+ # Prepare results DataFrame
45
+ if self.results_path.exists():
46
+ predict_df = pd.read_csv(self.results_path)
47
+ else:
48
+ predict_df = pd.DataFrame(columns=["index", "attack", "benign", "original"])
49
+
50
+ start_index = predict_df.shape[0]
51
+ print(f"Starting inference from row {start_index}")
52
+
53
+ # Process rows for inference
54
+ for i in range(start_index, ddos_data.shape[0]):
55
+ row_prompt = self.generate_row_prompt(ddos_data.iloc[i])
56
+ probabilities = self.infer_row(know_prompt, row_prompt)
57
+ if probabilities:
58
+ predict_df.loc[i] = [i, *probabilities]
59
+ else:
60
+ predict_df.loc[i] = [i, "None", "None", "No valid response"]
61
+
62
+ # Save after each row for resilience
63
+ predict_df.to_csv(self.results_path, index=False)
64
+
65
+ print(f"Processed row {i}: {predict_df.loc[i].to_dict()}")
66
+
67
+ print("Inference complete. Results saved at:", self.results_path)
68
+
69
+ def generate_knowledge_prompt(self, stats):
70
  """
71
+ Generates the knowledge prompt based on dataset statistics.
72
  """
73
+ prompt = (
74
+ "Supposed that you are now an [[ HIGHLY EXPERIENCED NETWORK TRAFFIC DATA ANALYSIS EXPERT ]]. "
75
+ "You need to help me analyze the data in the DDoS dataset and determine whether the data is [[ DDoS traffic ]] or [[ normal traffic ]]. "
76
+ "Here are the maximum, minimum, median, mean, and variance of each column in the dataset to help your judgment:\n"
77
+ )
78
+
79
+ for col, values in stats.items():
80
+ prompt += f"{col}: max={values['Max']:.2f}, min={values['Min']:.2f}, median={values['Median']:.2f}, mean={values['Mean']:.2f}, variance={values['Variance']:.2f}\n"
81
+
 
 
 
 
 
 
 
 
 
 
 
 
82
  return prompt
83
+
84
+ def generate_row_prompt(self, row):
85
  """
86
+ Generates a row-specific prompt for the LLM.
87
  """
88
+ row_prompt = (
89
+ "Next, I will give you a piece of data about network traffic information. "
90
+ "You need to tell me the probability of this data being DDoS traffic or normal traffic. "
91
+ "Express the probability in the format [0.xxx, 0.xxx], where the first number represents DDoS probability and the second represents normal traffic probability. "
92
+ "Ensure that the sum of probabilities is exactly 1.\n"
 
 
 
 
 
 
 
 
 
 
93
  )
94
+
95
+ for col, val in row.items():
96
+ row_prompt += f"{col}: {val}, "
97
+
98
+ return row_prompt.strip(', ')
99
+
100
+ def infer_row(self, know_prompt, row_prompt):
 
 
 
 
 
 
101
  """
102
+ Performs inference for a single row using the LLM.
103
  """
104
+ try:
105
+ messages = [
106
+ {'role': 'user', 'content': know_prompt},
107
+ {'role': 'user', 'content': row_prompt}
108
+ ]
109
+
110
+ completion = self.client.chat.completions.create(
111
+ model=self.model,
112
+ messages=messages,
113
+ max_tokens=1000
114
+ )
115
+
116
+ response = completion.choices[0].message.content
117
+ probabilities = self.extract_probabilities(response)
118
+ return probabilities
119
+
120
+ except Exception as e:
121
+ print(f"Error during inference for row: {e}")
122
+ return None
123
+
124
+ def extract_probabilities(self, response):
125
  """
126
+ Extract probabilities from the LLM response using regex.
127
  """
128
  pattern = r'\[(.*?)\]'
129
+ match = re.search(pattern, response)
130
+
131
+ if match:
132
+ probs = match.group(1).split(',')
133
+ return [float(p.strip()) for p in probs if p.strip()]
134
+ return None
135
+
136
# Example usage: run the full dataset through the LLM when executed directly.
if __name__ == "__main__":
    inference = DDoSInference()
    inference.process_dataset()
    print("You can now interact with the model for mitigation steps or download the results.")