JenetGhumman committed on
Commit
8b796b7
·
verified ·
1 Parent(s): 69859cb

Update tasks/text.py

Browse files
Files changed (1) hide show
  1. tasks/text.py +31 -32
tasks/text.py CHANGED
@@ -1,26 +1,24 @@
1
  from fastapi import APIRouter
2
  from datetime import datetime
3
  from datasets import load_dataset
 
 
4
  from sklearn.metrics import accuracy_score
5
- import random
 
6
 
7
  from .utils.evaluation import TextEvaluationRequest
8
  from .utils.emissions import tracker, clean_emissions_data, get_space_info
9
 
10
  router = APIRouter()
11
 
12
- DESCRIPTION = "Random Baseline"
13
  ROUTE = "/text"
14
 
15
- @router.post(ROUTE, tags=["Text Task"],
16
- description=DESCRIPTION)
17
  async def evaluate_text(request: TextEvaluationRequest):
18
  """
19
- Evaluate text classification for climate disinformation detection.
20
-
21
- Current Model: Random Baseline
22
- - Makes random predictions from the label space (0-7)
23
- - Used as a baseline for comparison
24
  """
25
  # Get space info
26
  username, space_url = get_space_info()
@@ -43,34 +41,36 @@ async def evaluate_text(request: TextEvaluationRequest):
43
  # Convert string labels to integers
44
  dataset = dataset.map(lambda x: {"label": LABEL_MAPPING[x["label"]]})
45
 
46
- # Split dataset
47
- train_test = dataset["train"]
48
- test_dataset = dataset["test"]
49
-
 
 
 
 
50
  # Start tracking emissions
51
  tracker.start()
52
  tracker.start_task("inference")
53
 
54
- #--------------------------------------------------------------------------------------------
55
- # YOUR MODEL INFERENCE CODE HERE
56
- # Update the code below to replace the random baseline by your model inference within the inference pass where the energy consumption and emissions are tracked.
57
- #--------------------------------------------------------------------------------------------
58
-
59
- # Make random predictions (placeholder for actual model inference)
60
- true_labels = test_dataset["label"]
61
- predictions = [random.randint(0, 7) for _ in range(len(true_labels))]
62
 
63
- #--------------------------------------------------------------------------------------------
64
- # YOUR MODEL INFERENCE STOPS HERE
65
- #--------------------------------------------------------------------------------------------
66
 
67
-
68
  # Stop tracking emissions
69
  emissions_data = tracker.stop_task()
70
-
71
  # Calculate accuracy
72
- accuracy = accuracy_score(true_labels, predictions)
73
-
74
  # Prepare results dictionary
75
  results = {
76
  "username": username,
@@ -84,9 +84,8 @@ async def evaluate_text(request: TextEvaluationRequest):
84
  "api_route": ROUTE,
85
  "dataset_config": {
86
  "dataset_name": request.dataset_name,
87
- "test_size": request.test_size,
88
- "test_seed": request.test_seed
89
  }
90
  }
91
-
92
- return results
 
1
  from fastapi import APIRouter
2
  from datetime import datetime
3
  from datasets import load_dataset
4
+ from sklearn.feature_extraction.text import TfidfVectorizer
5
+ from sklearn.svm import SVC
6
  from sklearn.metrics import accuracy_score
7
+ from sklearn.model_selection import train_test_split
8
+ import numpy as np
9
 
10
  from .utils.evaluation import TextEvaluationRequest
11
  from .utils.emissions import tracker, clean_emissions_data, get_space_info
12
 
13
  router = APIRouter()
14
 
15
+ DESCRIPTION = "TF-IDF + SVM Classifier"
16
  ROUTE = "/text"
17
 
18
+ @router.post(ROUTE, tags=["Text Task"], description=DESCRIPTION)
 
19
  async def evaluate_text(request: TextEvaluationRequest):
20
  """
21
+ Evaluate text classification for climate disinformation detection using TF-IDF and SVM.
 
 
 
 
22
  """
23
  # Get space info
24
  username, space_url = get_space_info()
 
41
  # Convert string labels to integers
42
  dataset = dataset.map(lambda x: {"label": LABEL_MAPPING[x["label"]]})
43
 
44
+ # Split dataset into training and testing sets
45
+ train_data = dataset["train"]
46
+ test_data = dataset["test"]
47
+
48
+ # Extract text and labels
49
+ train_texts, train_labels = train_data["text"], train_data["label"]
50
+ test_texts, test_labels = test_data["text"], test_data["label"]
51
+
52
  # Start tracking emissions
53
  tracker.start()
54
  tracker.start_task("inference")
55
 
56
+ # TF-IDF Vectorization
57
+ vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 2), stop_words="english")
58
+ X_train = vectorizer.fit_transform(train_texts)
59
+ X_test = vectorizer.transform(test_texts)
60
+
61
+ # Train SVM Classifier
62
+ svm_model = SVC(kernel="linear", probability=True)
63
+ svm_model.fit(X_train, train_labels)
64
 
65
+ # Model Inference
66
+ predictions = svm_model.predict(X_test)
 
67
 
 
68
  # Stop tracking emissions
69
  emissions_data = tracker.stop_task()
70
+
71
  # Calculate accuracy
72
+ accuracy = accuracy_score(test_labels, predictions)
73
+
74
  # Prepare results dictionary
75
  results = {
76
  "username": username,
 
84
  "api_route": ROUTE,
85
  "dataset_config": {
86
  "dataset_name": request.dataset_name,
87
+ "test_size": len(test_data),
 
88
  }
89
  }
90
+
91
+ return results