nishan-chatterjee committed on
Commit
7ea9682
·
1 Parent(s): 6aa9d30

Add model and tokenizer files with Git LFS

Browse files
.gitattributes CHANGED
@@ -33,3 +33,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ config.json filter=lfs diff=lfs merge=lfs -text
37
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
38
+ special_tokens_map.json filter=lfs diff=lfs merge=lfs -text
39
+ sentencepiece.bpe.model filter=lfs diff=lfs merge=lfs -text
40
+ tokenizer_config.json filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,3 +1,52 @@
1
  ---
 
 
 
2
  license: gpl-3.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: multilingual-persuasion-detection-from-text
3
+ app_file: inference.py
4
+ pinned: false
5
  license: gpl-3.0
6
+ language:
7
+ - multilingual
8
+ tags:
9
+ - mbart-50
10
+ - text-classification
11
+ - multi-label-classification
12
+ - persuasion-detection
13
+ - meme-analysis
14
+ - social-media-analysis
15
+ - propaganda-detection
16
+ - hierarchical-classification
17
+ - multilingual
18
+ pipeline_tag: text-classification
19
+ inference: True
20
  ---
21
+
22
+ # Multilingual Persuasion Detection in Memes
23
+
24
+ Given only the “textual content” of a meme, the goal is to identify which of the 20 persuasion techniques, organized in a hierarchy, it uses. Selecting only the ancestor node of a technique gives only a partial reward. This is a hierarchical multi-label classification problem based on the [SemEval 2024 Task 4 Subtask 1 of "Multilingual Detection of Persuasion Techniques in Memes"](https://propaganda.math.unipd.it/semeval2024task4/index.html).
25
+
26
+ ### Hierarchy
27
+ <img src="images/persuasion_techniques_hierarchy_graph.png" width="622" height="350">
28
+
29
+ ### Input Example
30
+ - **Input:** "I HATE TRUMP\n\nMOST TERRORIST DO"
31
+ - **Outputs:**
32
+ - Child-only Label List: ['Name calling/Labeling', 'Loaded Language']
33
+ - Complete Hierarchical Label List: ['Ethos', 'Ad Hominem', 'Name calling/Labeling', 'Pathos', 'Loaded Language']
34
+
35
+ ## Training Hyperparameters
36
+ - Base Model: "facebook/mbart-large-50-many-to-many-mmt"
37
+ - Learning Rate: 5e-05
38
+ - Max Length: 256
39
+ - Batch Size: 64
40
+ - Epochs: 3
41
+ - Seed: 42
42
+
43
+ ## Model Statistics
44
+ The model obtained the following metrics on the Development Set as of March 31st, 2024:
45
+ - Hierarchical F1: 63.58%
46
+ - Hierarchical Precision: 58.3%
47
+ - Hierarchical Recall: 69.9%
48
+
49
+ ## Licensing
50
+ The model is available under the GNU General Public License v3.0 (GPL-3.0), which allows for free use, modification, and distribution under the same license. However, it is strictly for research purposes only and cannot be used for malicious activities, including but not limited to manipulation, targeted harassment, hate speech, deception, and discrimination.
51
+
52
+ The dataset is available on the [competition website](https://propaganda.math.unipd.it/semeval2024task4/). Users must accept an online agreement before downloading and using the data. This agreement stipulates that the data is for research purposes only and cannot be redistributed or used for malicious purposes as outlined above.
config.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:64a5e70a64d1755934c8946a9e79282de7761e3e04bb48599969bd4fdcea884b
3
+ size 2574
images/persuasion_techniques_hierarchy_graph.png ADDED
inference.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import numpy as np
3
+ import networkx as nx
4
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer
5
+
6
+ def _make_logits_consistent(x, R):
7
+ c_out = x.unsqueeze(1) + 10
8
+ c_out = c_out.expand(len(x), R.shape[1], R.shape[1])
9
+ R_batch = R.expand(len(x), R.shape[1], R.shape[1]).to(x.device)
10
+ final_out, _ = torch.max(R_batch * c_out, dim=2)
11
+ return final_out - 10
12
+
13
def initialize_model(model_dir="."):
    """Load the tokenizer and classifier and build the label hierarchy.

    Args:
        model_dir: Directory (or model identifier) passed to
            ``from_pretrained`` for both the tokenizer and the model.
            Defaults to the current working directory.

    Returns:
        A tuple ``(tokenizer, model, R, G, device)`` where

        * ``tokenizer`` and ``model`` are loaded from ``model_dir`` and the
          model has been moved to ``device``;
        * ``R`` is a float64 tensor of shape ``(1, N, N)`` (``N`` = number of
          nodes in ``G``) with ``R[0, a, d] == 1`` when ``d`` is ``a`` itself
          or one of its descendants in ``G``, and 0 otherwise;
        * ``G`` is the directed persuasion-technique hierarchy, rooted at
          ``"ROOT"``; row/column ``k`` of ``R`` corresponds to the ``k``-th
          node in ``G.nodes`` order;
        * ``device`` is CUDA when available, otherwise CPU.
    """
    # Parent -> child edges.  Some techniques have more than one parent
    # (e.g. "Whataboutism"), so this is a DAG rather than a tree.
    edges = [
        ("ROOT", "Logos"),
        ("Logos", "Repetition"),
        ("Logos", "Obfuscation, Intentional vagueness, Confusion"),
        ("Logos", "Reasoning"),
        ("Logos", "Justification"),
        ("Justification", "Slogans"),
        ("Justification", "Bandwagon"),
        ("Justification", "Appeal to authority"),
        ("Justification", "Flag-waving"),
        ("Justification", "Appeal to fear/prejudice"),
        ("Reasoning", "Simplification"),
        ("Simplification", "Causal Oversimplification"),
        ("Simplification", "Black-and-white Fallacy/Dictatorship"),
        ("Simplification", "Thought-terminating cliché"),
        ("Reasoning", "Distraction"),
        ("Distraction", "Misrepresentation of Someone's Position (Straw Man)"),
        ("Distraction", "Presenting Irrelevant Data (Red Herring)"),
        ("Distraction", "Whataboutism"),
        ("ROOT", "Ethos"),
        ("Ethos", "Appeal to authority"),
        ("Ethos", "Glittering generalities (Virtue)"),
        ("Ethos", "Bandwagon"),
        ("Ethos", "Ad Hominem"),
        ("Ethos", "Transfer"),
        ("Ad Hominem", "Doubt"),
        ("Ad Hominem", "Name calling/Labeling"),
        ("Ad Hominem", "Smears"),
        ("Ad Hominem", "Reductio ad hitlerum"),
        ("Ad Hominem", "Whataboutism"),
        ("ROOT", "Pathos"),
        ("Pathos", "Exaggeration/Minimisation"),
        ("Pathos", "Loaded Language"),
        ("Pathos", "Appeal to (Strong) Emotions"),
        ("Pathos", "Appeal to fear/prejudice"),
        ("Pathos", "Flag-waving"),
        ("Pathos", "Transfer"),
    ]
    G = nx.DiGraph()
    G.add_edges_from(edges)

    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    model = AutoModelForSequenceClassification.from_pretrained(model_dir)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Build the "self or descendant" matrix directly from G.  Node positions
    # follow G.nodes (insertion) order, which is also the order the caller
    # uses to map logit columns back to label names.
    node_index = {node: position for position, node in enumerate(G.nodes)}
    R = np.eye(len(node_index))
    for node, row in node_index.items():
        for descendant in nx.descendants(G, node):
            R[row, node_index[descendant]] = 1
    # Leading axis of size 1 so the matrix can be expanded over a batch.
    R = torch.tensor(R).unsqueeze(0)

    return tokenizer, model, R, G, device
49
+
50
def predict_persuasion_labels(text, tokenizer, model, R, G, device, max_length=128):
    """Predict the persuasion techniques used in a single piece of text.

    Args:
        text: The text to classify (a single string).
        tokenizer: Tokenizer matching ``model``.
        model: Sequence-classification model whose output has one logit per
            node of ``G``, in ``G.nodes`` order.
        R: Hierarchy mask forwarded to ``_make_logits_consistent``.
        G: Directed label hierarchy; nodes without successors are treated as
            leaf ("child-only") labels.
        device: Device the model lives on; inputs are moved there.
        max_length: Token length the input is padded/truncated to.
            NOTE(review): the default of 128 is the value this script has
            always used, but the model card lists a training max length of
            256 -- confirm which one is intended.

    Returns:
        A tuple ``(complete_predicted_hierarchy, child_only_labels)``: the
        names of all predicted nodes (in ``G.nodes`` order), and the subset
        of those that have no successors in ``G``.
    """
    # Calling the tokenizer directly is the supported replacement for the
    # deprecated ``encode_plus``.
    encoding = tokenizer(
        text,
        add_special_tokens=True,
        max_length=max_length,
        return_token_type_ids=False,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        return_tensors="pt",
    )

    with torch.no_grad():
        outputs = model(
            input_ids=encoding["input_ids"].to(device),
            attention_mask=encoding["attention_mask"].to(device),
        )

    logits = _make_logits_consistent(outputs.logits, R)
    # Never report column 0.  In the graph built by initialize_model the
    # first node is the artificial "ROOT".
    logits[:, 0] = -1.0

    # A node is predicted when its hierarchy-consistent logit is positive.
    is_predicted = (logits[0] > 0.0).cpu().tolist()
    complete_predicted_hierarchy = [
        node for node, predicted in zip(G.nodes, is_predicted) if predicted
    ]

    child_only_labels = [
        label
        for label in complete_predicted_hierarchy
        if G.out_degree(label) == 0
    ]

    return complete_predicted_hierarchy, child_only_labels
78
+
79
# Load the tokenizer, model, hierarchy mask, label graph and device once at
# import time so that every call to inference() reuses the same objects.
tokenizer, model, R, G, device = initialize_model()


def inference(text):
    """Classify ``text`` with the module-level model.

    Returns whatever ``predict_persuasion_labels`` returns: a tuple of the
    complete predicted hierarchy and the child-only labels.
    """
    return predict_persuasion_labels(
        text,
        tokenizer=tokenizer,
        model=model,
        R=R,
        G=G,
        device=device,
    )
83
+
84
if __name__ == "__main__":
    # Prompt for one text on stdin and print the prediction tuple.
    user_text = input("Enter the text: ")
    prediction = inference(user_text)
    print(prediction)
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f22069b784f1dbe6dcdcfd85c6941a4330a81cdfd5e3d4996246fd6a500a877c
3
+ size 2447904292
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ torch
2
+ numpy
3
+ networkx
4
+ transformers
5
+ tqdm
6
+ sentencepiece
7
+ protobuf
sentencepiece.bpe.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cfc8146abe2a0488e9e2a0c56de7952f7c11ab059eca145a0a727afce0db2865
3
+ size 5069051
special_tokens_map.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1daf98a95c01a96007f2ab65fed7b31641e363f2de94f82e06d948b7855ed21d
3
+ size 992
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:86f983b6563a9468794455498914bda0eaf9a60e5c9cd5a21669a24a625e490d
3
+ size 17109921
tokenizer_config.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c9aff9ce0b78ebc744dba8ac8ebf44d7e675d581e97971edb35b1203926d9586
3
+ size 10924