Upload 9 files
- README.md +6 -14
- app.py +33 -0
- pegasus_summery_model.zip +3 -0
- special_tokens_map.json +110 -0
- spiece.model +3 -0
- summerization.ipynb +368 -0
- text_summarization.py +52 -0
- tokenizer.json +0 -0
- tokenizer_config.json +117 -0
README.md
CHANGED
@@ -1,14 +1,6 @@
-
-
-
-
-
-
-metrics:
-- accuracy
-- bleu
-base_model:
-- facebook/bart-large-cnn
-pipeline_tag: summarization
-library_name: adapter-transformers
----
+# Text_summerization_pegasus
+
+The model is to be downloaded by the user if any fine-tuning or inference is needed.
+
+This is a Google Pegasus model; a Streamlit interface is used to present the summarized text for the articles.
+
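Since the README notes that the model itself has to be downloaded by the user, here is a minimal sketch of how the local directories that app.py loads from could be prepared; it mirrors the final cell of summerization.ipynb, and the checkpoint name and directory names are taken from the files in this commit rather than guaranteed beyond it:

```python
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Download the base Pegasus checkpoint once and save it under the
# directory names that app.py expects to load from.
tokenizer = AutoTokenizer.from_pretrained("google/pegasus-cnn_dailymail")
model = AutoModelForSeq2SeqLM.from_pretrained("google/pegasus-cnn_dailymail")

tokenizer.save_pretrained("tokenizer")
model.save_pretrained("pegasus_summery_model")
```

With those directories in place, `streamlit run app.py` should serve the interface.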
app.py
ADDED
@@ -0,0 +1,33 @@
import streamlit as st
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

st.write("""# Summarize your text""")

# Load the locally saved Pegasus tokenizer and model directories.
tokenizer = AutoTokenizer.from_pretrained("tokenizer")
model = AutoModelForSeq2SeqLM.from_pretrained("pegasus_summery_model")

text_input = st.text_area("Text to summarize")

if text_input:
    # Tokenize the input, truncating anything longer than the model's maximum input length.
    tokenized_text = tokenizer.encode_plus(
        str(text_input),
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )

    generated_tokens = model.generate(
        input_ids=tokenized_text['input_ids'],
        attention_mask=tokenized_text["attention_mask"],
        use_cache=True,
    )

    # Decode the generated ids back into summary text.
    pred = [tokenizer.decode(token_ids=ids, skip_special_tokens=True) for ids in generated_tokens]

    st.write("## Summarized Text")
    st.write(" ".join(pred))
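The `model.generate` call in app.py relies on the checkpoint's default generation settings. If explicit control over the summary is wanted, a hedged variant of the same call (the `num_beams` and `max_new_tokens` values below are illustrative, not taken from the original script):

```python
# Illustrative generation settings; both are standard generate() arguments.
generated_tokens = model.generate(
    input_ids=tokenized_text["input_ids"],
    attention_mask=tokenized_text["attention_mask"],
    num_beams=4,           # beam search instead of greedy decoding
    max_new_tokens=128,    # cap the length of the generated summary
    use_cache=True,
)
```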
pegasus_summery_model.zip
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ca66f2a69bb5a500522394ad90331c8df1848d58a5822bedb84db2c505c18473
size 1240
special_tokens_map.json
ADDED
@@ -0,0 +1,110 @@
{
  "additional_special_tokens": [
    "<mask_1>",
    "<unk_2>", "<unk_3>", "<unk_4>", "<unk_5>", "<unk_6>", "<unk_7>", "<unk_8>", "<unk_9>", "<unk_10>",
    "<unk_11>", "<unk_12>", "<unk_13>", "<unk_14>", "<unk_15>", "<unk_16>", "<unk_17>", "<unk_18>", "<unk_19>", "<unk_20>",
    "<unk_21>", "<unk_22>", "<unk_23>", "<unk_24>", "<unk_25>", "<unk_26>", "<unk_27>", "<unk_28>", "<unk_29>", "<unk_30>",
    "<unk_31>", "<unk_32>", "<unk_33>", "<unk_34>", "<unk_35>", "<unk_36>", "<unk_37>", "<unk_38>", "<unk_39>", "<unk_40>",
    "<unk_41>", "<unk_42>", "<unk_43>", "<unk_44>", "<unk_45>", "<unk_46>", "<unk_47>", "<unk_48>", "<unk_49>", "<unk_50>",
    "<unk_51>", "<unk_52>", "<unk_53>", "<unk_54>", "<unk_55>", "<unk_56>", "<unk_57>", "<unk_58>", "<unk_59>", "<unk_60>",
    "<unk_61>", "<unk_62>", "<unk_63>", "<unk_64>", "<unk_65>", "<unk_66>", "<unk_67>", "<unk_68>", "<unk_69>", "<unk_70>",
    "<unk_71>", "<unk_72>", "<unk_73>", "<unk_74>", "<unk_75>", "<unk_76>", "<unk_77>", "<unk_78>", "<unk_79>", "<unk_80>",
    "<unk_81>", "<unk_82>", "<unk_83>", "<unk_84>", "<unk_85>", "<unk_86>", "<unk_87>", "<unk_88>", "<unk_89>", "<unk_90>",
    "<unk_91>", "<unk_92>", "<unk_93>", "<unk_94>", "<unk_95>", "<unk_96>", "<unk_97>", "<unk_98>", "<unk_99>", "<unk_100>",
    "<unk_101>", "<unk_102>"
  ],
  "eos_token": "</s>",
  "mask_token": "<mask_2>",
  "pad_token": "<pad>",
  "unk_token": "<unk>"
}
spiece.model
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0015189ef36359283fec8b93cf6d9ce51bca37eb1101defc68a53b394913b96c
size 1912529
summerization.ipynb
ADDED
@@ -0,0 +1,368 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Token indices sequence length is longer than the specified maximum sequence length for this model (1106 > 512). Running this sequence through the model will result in indexing errors\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "'The US Supreme Court has suspended a legal battle between the US and the US administration over a spending dispute that could conceivably cause the Obama administration to lose millions of dollars in health insurance subsidies.'"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from transformers import AutoTokenizer, AutoModelForSeq2SeqLM\n",
    "tokenizer = AutoTokenizer.from_pretrained(\"minhtoan/t5-finetune-bbc-news\") \n",
    "model = AutoModelForSeq2SeqLM.from_pretrained(\"minhtoan/t5-finetune-bbc-news\")\n",
    "model.cuda()\n",
"src = \"summarize: The full cost of damage in Newton Stewart, one of the areas worst affected, is still being assessed.Repair work is ongoing in Hawick and many roads in Peeblesshire remain badly affected by standing water.Trains on the west coast mainline face disruption due to damage at the Lamington Viaduct.Many businesses and householders were affected by flooding in Newton Stewart after the River Cree overflowed into the town.First Minister Nicola Sturgeon visited the area to inspect the damage.The waters breached a retaining wall, flooding many commercial properties on Victoria Street - the main shopping thoroughfare.Jeanette Tate, who owns the Cinnamon Cafe which was badly affected, said she could not fault the multi-agency response once the flood hit.However, she said more preventative work could have been carried out to ensure the retaining wall did not fail.'It is difficult but I do think there is so much publicity for Dumfries and the Nith - and I totally appreciate that - but it is almost like we're neglected or forgotten,' she said.'That may not be true but it is perhaps my perspective over the last few days.'Why were you not ready to help us a bit more when the warning and the alarm alerts had gone out?'Meanwhile, a flood alert remains in place across the Borders because of the constant rain.Peebles was badly hit by problems, sparking calls to introduce more defences in the area.Scottish Borders Council has put a list on its website of the roads worst affected and drivers have been urged not to ignore closure signs.The Labour Party's deputy Scottish leader Alex Rowley was in Hawick on Monday to see the situation first hand.He said it was important to get the flood protection plan right but backed calls to speed up the process.'I was quite taken aback by the amount of damage that has been done,' he said.'Obviously it is heart-breaking for people who have been forced out of their homes and the impact on businesses.'He said it was important that 'immediate steps' were taken to protect the areas most vulnerable and a clear timetable put in place for flood prevention plans.Have you been affected by flooding in Dumfries and Galloway or the Borders? Tell us about your experience of the situation and how it was handled. Email us on [email protected] or [email protected].\"\n",
    "tokenized_text = tokenizer.encode(src, return_tensors=\"pt\").cuda()\n",
    "model.eval()\n",
    "summary_ids = model.generate(tokenized_text, max_length=150)\n",
    "output = tokenizer.decode(summary_ids[0], skip_special_tokens=True)\n",
    "output\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Sun May 28 16:12:32 2023 \n",
      "+-----------------------------------------------------------------------------+\n",
      "| NVIDIA-SMI 528.24 Driver Version: 528.24 CUDA Version: 12.0 |\n",
      "|-------------------------------+----------------------+----------------------+\n",
      "| GPU Name TCC/WDDM | Bus-Id Disp.A | Volatile Uncorr. ECC |\n",
      "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n",
      "| | | MIG M. |\n",
      "|===============================+======================+======================|\n",
      "| 0 NVIDIA GeForce ... WDDM | 00000000:01:00.0 Off | N/A |\n",
      "| N/A 52C P8 2W / 80W | 390MiB / 6144MiB | 0% Default |\n",
      "| | | N/A |\n",
      "+-------------------------------+----------------------+----------------------+\n",
      " \n",
      "+-----------------------------------------------------------------------------+\n",
      "| Processes: |\n",
      "| GPU GI CI PID Type Process name GPU Memory |\n",
      "| ID ID Usage |\n",
      "|=============================================================================|\n",
      "| 0 N/A N/A 10748 C ...thon\\\\Python311\\\\python.exe N/A |\n",
      "+-----------------------------------------------------------------------------+\n"
     ]
    }
   ],
   "source": [
    "!nvidia-smi"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "GPU is available\n",
      "PyTorch is using GPU\n"
     ]
    }
   ],
   "source": [
    "import torch\n",
    "\n",
    "# Check if GPU is available\n",
    "if torch.cuda.is_available():\n",
    "    print(\"GPU is available\")\n",
    "else:\n",
    "    print(\"GPU is not available\")\n",
    "\n",
    "# Check if PyTorch is using GPU\n",
    "if torch.cuda.current_device() != -1:\n",
    "    print(\"PyTorch is using GPU\")\n",
    "else:\n",
    "    print(\"PyTorch is using CPU\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "df1 = pd.read_csv(\"articles1.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
"'WASHINGTON — Congressional Republicans have a new fear when it comes to their health care lawsuit against the Obama administration: They might win. The incoming Trump administration could choose to no longer defend the executive branch against the suit, which challenges the administration’s authority to spend billions of dollars on health insurance subsidies for and Americans, handing House Republicans a big victory on issues. But a sudden loss of the disputed subsidies could conceivably cause the health care program to implode, leaving millions of people without access to health insurance before Republicans have prepared a replacement. That could lead to chaos in the insurance market and spur a political backlash just as Republicans gain full control of the government. To stave off that outcome, Republicans could find themselves in the awkward position of appropriating huge sums to temporarily prop up the Obama health care law, angering conservative voters who have been demanding an end to the law for years. In another twist, Donald J. Trump’s administration, worried about preserving executive branch prerogatives, could choose to fight its Republican allies in the House on some central questions in the dispute. Eager to avoid an ugly political pileup, Republicans on Capitol Hill and the Trump transition team are gaming out how to handle the lawsuit, which, after the election, has been put in limbo until at least late February by the United States Court of Appeals for the District of Columbia Circuit. They are not yet ready to divulge their strategy. “Given that this pending litigation involves the Obama administration and Congress, it would be inappropriate to comment,” said Phillip J. Blando, a spokesman for the Trump transition effort. “Upon taking office, the Trump administration will evaluate this case and all related aspects of the Affordable Care Act. ” In a potentially decision in 2015, Judge Rosemary M. Collyer ruled that House Republicans had the standing to sue the executive branch over a spending dispute and that the Obama administration had been distributing the health insurance subsidies, in violation of the Constitution, without approval from Congress. The Justice Department, confident that Judge Collyer’s decision would be reversed, quickly appealed, and the subsidies have remained in place during the appeal. In successfully seeking a temporary halt in the proceedings after Mr. Trump won, House Republicans last month told the court that they “and the ’s transition team currently are discussing potential options for resolution of this matter, to take effect after the ’s inauguration on Jan. 20, 2017. ” The suspension of the case, House lawyers said, will “provide the and his future administration time to consider whether to continue prosecuting or to otherwise resolve this appeal. ” Republican leadership officials in the House acknowledge the possibility of “cascading effects” if the payments, which have totaled an estimated $13 billion, are suddenly stopped. Insurers that receive the subsidies in exchange for paying costs such as deductibles and for eligible consumers could race to drop coverage since they would be losing money. Over all, the loss of the subsidies could destabilize the entire program and cause a lack of confidence that leads other insurers to seek a quick exit as well. 
Anticipating that the Trump administration might not be inclined to mount a vigorous fight against the House Republicans given the ’s dim view of the health care law, a team of lawyers this month sought to intervene in the case on behalf of two participants in the health care program. In their request, the lawyers predicted that a deal between House Republicans and the new administration to dismiss or settle the case “will produce devastating consequences for the individuals who receive these reductions, as well as for the nation’s health insurance and health care systems generally. ” No matter what happens, House Republicans say, they want to prevail on two overarching concepts: the congressional power of the purse, and the right of Congress to sue the executive branch if it violates the Constitution regarding that spending power. House Republicans contend that Congress never appropriated the money for the subsidies, as required by the Constitution. In the suit, which was initially championed by John A. Boehner, the House speaker at the time, and later in House committee reports, Republicans asserted that the administration, desperate for the funding, had required the Treasury Department to provide it despite widespread internal skepticism that the spending was proper. The White House said that the spending was a permanent part of the law passed in 2010, and that no annual appropriation was required — even though the administration initially sought one. Just as important to House Republicans, Judge Collyer found that Congress had the standing to sue the White House on this issue — a ruling that many legal experts said was flawed — and they want that precedent to be set to restore congressional leverage over the executive branch. But on spending power and standing, the Trump administration may come under pressure from advocates of presidential authority to fight the House no matter their shared views on health care, since those precedents could have broad repercussions. It is a complicated set of dynamics illustrating how a quick legal victory for the House in the Trump era might come with costs that Republicans never anticipated when they took on the Obama White House.'"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df1.content[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Requirement already satisfied: sentencepiece in c:\\users\\tigna\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (0.1.99)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n",
      "[notice] A new release of pip is available: 23.0.1 -> 23.1.2\n",
      "[notice] To update, run: python.exe -m pip install --upgrade pip\n"
     ]
    }
   ],
   "source": [
    "!pip install sentencepiece\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Requirement already satisfied: tokenizers in c:\\users\\tigna\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (0.13.3)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n",
      "[notice] A new release of pip is available: 23.0.1 -> 23.1.2\n",
      "[notice] To update, run: python.exe -m pip install --upgrade pip\n"
     ]
    }
   ],
   "source": [
    "!pip install tokenizers\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "< was unexpected at this time.\n"
     ]
    }
   ],
   "source": [
    "!python -m sentencepiece convert <path-to-slow-tokenizer> <path-to-fast-tokenizer>\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "ename": "SyntaxError",
     "evalue": "invalid syntax (3422649496.py, line 12)",
     "output_type": "error",
     "traceback": [
      "\u001b[1;36m Cell \u001b[1;32mIn[17], line 12\u001b[1;36m\u001b[0m\n\u001b[1;33m output_model=<google/pegasus-cnn_dailymail>,\u001b[0m\n\u001b[1;37m ^\u001b[0m\n\u001b[1;31mSyntaxError\u001b[0m\u001b[1;31m:\u001b[0m invalid syntax\n"
     ]
    }
   ],
   "source": [
    "import sentencepiece\n",
    "\n",
    "# Load the slow tokenizer.\n",
    "tokenizer = sentencepiece.SentencePieceProcessor()\n",
    "tokenizer.load(\"google/pegasus-cnn_dailymail\")\n",
    "\n",
    "# Convert the slow tokenizer to a fast one.\n",
    "fast_tokenizer = sentencepiece.SentencePieceProcessor()\n",
    "fast_tokenizer.train(\n",
    "    input=tokenizer.model_file,\n",
    "    vocab_size=tokenizer.vocab_size,\n",
    "    output_model=<google/pegasus-cnn_dailymail>,\n",
    ")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "c:\\Users\\tigna\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n",
      "Downloading pytorch_model.bin: 100%|██████████| 2.28G/2.28G [21:10<00:00, 1.79MB/s]\n",
      "c:\\Users\\tigna\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\huggingface_hub\\file_download.py:133: UserWarning: `huggingface_hub` cache-system uses symlinks by default to efficiently store duplicated files but your machine does not support them in C:\\Users\\tigna\\.cache\\huggingface\\hub. Caching files will still work but in a degraded version that might require more space on your disk. This warning can be disabled by setting the `HF_HUB_DISABLE_SYMLINKS_WARNING` environment variable. For more details, see https://huggingface.co/docs/huggingface_hub/how-to-cache#limitations.\n",
      "To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development\n",
      "  warnings.warn(message)\n",
      "Downloading (…)neration_config.json: 100%|██████████| 280/280 [00:00<00:00, 57.9kB/s]\n"
     ]
    }
   ],
   "source": [
    "from transformers import AutoTokenizer, AutoModelForSeq2SeqLM\n",
    "\n",
    "tokenizer = AutoTokenizer.from_pretrained(\"google/pegasus-cnn_dailymail\")\n",
    "\n",
    "model = AutoModelForSeq2SeqLM.from_pretrained(\"google/pegasus-cnn_dailymail\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Requirement already satisfied: streamlit in c:\\users\\tigna\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (1.22.0)\n",
      "Requirement already satisfied: altair<5,>=3.2.0 in c:\\users\\tigna\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from streamlit) (4.2.2)\n",
      "Requirement already satisfied: blinker>=1.0.0 in c:\\users\\tigna\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from streamlit) (1.6.2)\n",
      "Requirement already satisfied: cachetools>=4.0 in c:\\users\\tigna\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from streamlit) (5.3.0)\n",
      "Requirement already satisfied: click>=7.0 in c:\\users\\tigna\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from streamlit) (8.1.3)\n",
      "Requirement already satisfied: importlib-metadata>=1.4 in c:\\users\\tigna\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from streamlit) (6.6.0)\n",
      "Requirement already satisfied: numpy in c:\\users\\tigna\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from streamlit) (1.23.5)\n",
      "Requirement already satisfied: packaging>=14.1 in c:\\users\\tigna\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from streamlit) (23.0)\n",
      "Requirement already satisfied: pandas<3,>=0.25 in c:\\users\\tigna\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from streamlit) (1.5.3)\n",
      "Requirement already satisfied: pillow>=6.2.0 in c:\\users\\tigna\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from streamlit) (9.4.0)\n",
      "Requirement already satisfied: protobuf<4,>=3.12 in c:\\users\\tigna\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from streamlit) (3.20.3)\n",
      "Requirement already satisfied: pyarrow>=4.0 in c:\\users\\tigna\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from streamlit) (12.0.0)\n",
      "Requirement already satisfied: pympler>=0.9 in c:\\users\\tigna\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from streamlit) (1.0.1)\n",
      "Requirement already satisfied: python-dateutil in c:\\users\\tigna\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from streamlit) (2.8.2)\n",
      "Requirement already satisfied: requests>=2.4 in c:\\users\\tigna\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from streamlit) (2.28.2)\n",
      "Requirement already satisfied: rich>=10.11.0 in c:\\users\\tigna\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from streamlit) (13.3.5)\n",
      "Requirement already satisfied: tenacity<9,>=8.0.0 in c:\\users\\tigna\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from streamlit) (8.2.2)\n",
      "Requirement already satisfied: toml in c:\\users\\tigna\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from streamlit) (0.10.2)\n",
      "Requirement already satisfied: typing-extensions>=3.10.0.0 in c:\\users\\tigna\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from streamlit) (4.5.0)\n",
      "Requirement already satisfied: tzlocal>=1.1 in c:\\users\\tigna\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from streamlit) (4.3)\n",
      "Requirement already satisfied: validators>=0.2 in c:\\users\\tigna\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from streamlit) (0.20.0)\n",
      "Requirement already satisfied: gitpython!=3.1.19 in c:\\users\\tigna\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from streamlit) (3.1.31)\n",
      "Requirement already satisfied: pydeck>=0.1.dev5 in c:\\users\\tigna\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from streamlit) (0.8.1b0)\n",
      "Requirement already satisfied: tornado>=6.0.3 in c:\\users\\tigna\\appdata\\roaming\\python\\python311\\site-packages (from streamlit) (6.2)\n",
      "Requirement already satisfied: watchdog in c:\\users\\tigna\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from streamlit) (3.0.0)\n",
      "Requirement already satisfied: entrypoints in c:\\users\\tigna\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from altair<5,>=3.2.0->streamlit) (0.4)\n",
      "Requirement already satisfied: jinja2 in c:\\users\\tigna\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from altair<5,>=3.2.0->streamlit) (3.1.2)\n",
      "Requirement already satisfied: jsonschema>=3.0 in c:\\users\\tigna\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from altair<5,>=3.2.0->streamlit) (4.17.3)\n",
      "Requirement already satisfied: toolz in c:\\users\\tigna\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from altair<5,>=3.2.0->streamlit) (0.12.0)\n",
      "Requirement already satisfied: colorama in c:\\users\\tigna\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from click>=7.0->streamlit) (0.4.6)\n",
      "Requirement already satisfied: gitdb<5,>=4.0.1 in c:\\users\\tigna\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from gitpython!=3.1.19->streamlit) (4.0.10)\n",
      "Requirement already satisfied: zipp>=0.5 in c:\\users\\tigna\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from importlib-metadata>=1.4->streamlit) (3.15.0)\n",
      "Requirement already satisfied: pytz>=2020.1 in c:\\users\\tigna\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from pandas<3,>=0.25->streamlit) (2023.2)\n",
      "Requirement already satisfied: six>=1.5 in c:\\users\\tigna\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from python-dateutil->streamlit) (1.16.0)\n",
      "Requirement already satisfied: charset-normalizer<4,>=2 in c:\\users\\tigna\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from requests>=2.4->streamlit) (3.1.0)\n",
      "Requirement already satisfied: idna<4,>=2.5 in c:\\users\\tigna\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from requests>=2.4->streamlit) (3.4)\n",
      "Requirement already satisfied: urllib3<1.27,>=1.21.1 in c:\\users\\tigna\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from requests>=2.4->streamlit) (1.26.15)\n",
      "Requirement already satisfied: certifi>=2017.4.17 in c:\\users\\tigna\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from requests>=2.4->streamlit) (2022.12.7)\n",
      "Requirement already satisfied: markdown-it-py<3.0.0,>=2.2.0 in c:\\users\\tigna\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from rich>=10.11.0->streamlit) (2.2.0)\n",
      "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in c:\\users\\tigna\\appdata\\roaming\\python\\python311\\site-packages (from rich>=10.11.0->streamlit) (2.14.0)\n",
      "Requirement already satisfied: pytz-deprecation-shim in c:\\users\\tigna\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from tzlocal>=1.1->streamlit) (0.1.0.post0)\n",
      "Requirement already satisfied: tzdata in c:\\users\\tigna\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from tzlocal>=1.1->streamlit) (2023.3)\n",
      "Requirement already satisfied: decorator>=3.4.0 in c:\\users\\tigna\\appdata\\roaming\\python\\python311\\site-packages (from validators>=0.2->streamlit) (5.1.1)\n",
      "Requirement already satisfied: smmap<6,>=3.0.1 in c:\\users\\tigna\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from gitdb<5,>=4.0.1->gitpython!=3.1.19->streamlit) (5.0.0)\n",
      "Requirement already satisfied: MarkupSafe>=2.0 in c:\\users\\tigna\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from jinja2->altair<5,>=3.2.0->streamlit) (2.1.2)\n",
      "Requirement already satisfied: attrs>=17.4.0 in c:\\users\\tigna\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from jsonschema>=3.0->altair<5,>=3.2.0->streamlit) (22.2.0)\n",
      "Requirement already satisfied: pyrsistent!=0.17.0,!=0.17.1,!=0.17.2,>=0.14.0 in c:\\users\\tigna\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from jsonschema>=3.0->altair<5,>=3.2.0->streamlit) (0.19.3)\n",
      "Requirement already satisfied: mdurl~=0.1 in c:\\users\\tigna\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from markdown-it-py<3.0.0,>=2.2.0->rich>=10.11.0->streamlit) (0.1.2)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n",
      "[notice] A new release of pip is available: 23.0.1 -> 23.1.2\n",
      "[notice] To update, run: python.exe -m pip install --upgrade pip\n"
     ]
    }
   ],
   "source": [
    "!pip install streamlit"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenizer.save_pretrained('tokenizer')\n",
    "model.save_pretrained('pegasus_summery_model')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.1"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
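Both tokenizer-conversion cells in the notebook fail (a shell redirection error and a SyntaxError), and the conversion step is likely unnecessary: assuming the goal was a fast Pegasus tokenizer, transformers can usually return one directly. A minimal sketch:

```python
from transformers import AutoTokenizer

# use_fast=True (the default when a fast implementation exists) returns a
# PegasusTokenizerFast backed by the tokenizers library, so no manual
# sentencepiece conversion step is required.
fast_tokenizer = AutoTokenizer.from_pretrained("google/pegasus-cnn_dailymail", use_fast=True)
print(fast_tokenizer.is_fast)  # expected: True
```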
text_summarization.py
ADDED
@@ -0,0 +1,52 @@
import streamlit as st
from transformers import pipeline


@st.cache_resource
def load_summarizer():
    # Cache the summarization pipeline so it is only loaded once per session.
    model = pipeline("summarization", device=0)
    return model


def generate_chunks(inp_str):
    # Split the input into chunks of at most max_chunk words, breaking only at
    # sentence boundaries, so each chunk fits within the summarizer's input limit.
    max_chunk = 500
    inp_str = inp_str.replace('.', '.<eos>')
    inp_str = inp_str.replace('?', '?<eos>')
    inp_str = inp_str.replace('!', '!<eos>')

    sentences = inp_str.split('<eos>')
    current_chunk = 0
    chunks = []
    for sentence in sentences:
        if len(chunks) == current_chunk + 1:
            if len(chunks[current_chunk]) + len(sentence.split(' ')) <= max_chunk:
                chunks[current_chunk].extend(sentence.split(' '))
            else:
                current_chunk += 1
                chunks.append(sentence.split(' '))
        else:
            chunks.append(sentence.split(' '))

    for chunk_id in range(len(chunks)):
        chunks[chunk_id] = ' '.join(chunks[chunk_id])
    return chunks


summarizer = load_summarizer()
st.title("Summarize Text")
sentence = st.text_area('Please paste your article:', height=30)
button = st.button("Summarize")

max_len = st.sidebar.slider('Select max', 50, 500, step=10, value=150)
min_len = st.sidebar.slider('Select min', 10, 450, step=10, value=50)
do_sample = st.sidebar.checkbox("Do sample", value=False)

with st.spinner("Generating summary..."):
    if button and sentence:
        chunks = generate_chunks(sentence)
        res = summarizer(chunks,
                         max_length=max_len,
                         min_length=min_len,
                         do_sample=do_sample)
        text = ' '.join([summ['summary_text'] for summ in res])
        st.write(text)
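As written, `load_summarizer` pulls the pipeline's default summarization checkpoint rather than the Pegasus weights saved elsewhere in this commit. If reusing those local files is the intent, a hedged variant (the directory names are the ones used by the notebook and app.py; `device=0` still assumes a CUDA GPU):

```python
import streamlit as st
from transformers import pipeline


@st.cache_resource
def load_summarizer():
    # Point the pipeline at the locally saved Pegasus model and tokenizer
    # instead of the pipeline's default summarization checkpoint.
    return pipeline(
        "summarization",
        model="pegasus_summery_model",
        tokenizer="tokenizer",
        device=0,
    )
```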
tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
tokenizer_config.json
ADDED
@@ -0,0 +1,117 @@
{
  "additional_special_tokens": [
    "<mask_1>",
    "<unk_2>", "<unk_3>", "<unk_4>", "<unk_5>", "<unk_6>", "<unk_7>", "<unk_8>", "<unk_9>", "<unk_10>",
    "<unk_11>", "<unk_12>", "<unk_13>", "<unk_14>", "<unk_15>", "<unk_16>", "<unk_17>", "<unk_18>", "<unk_19>", "<unk_20>",
    "<unk_21>", "<unk_22>", "<unk_23>", "<unk_24>", "<unk_25>", "<unk_26>", "<unk_27>", "<unk_28>", "<unk_29>", "<unk_30>",
    "<unk_31>", "<unk_32>", "<unk_33>", "<unk_34>", "<unk_35>", "<unk_36>", "<unk_37>", "<unk_38>", "<unk_39>", "<unk_40>",
    "<unk_41>", "<unk_42>", "<unk_43>", "<unk_44>", "<unk_45>", "<unk_46>", "<unk_47>", "<unk_48>", "<unk_49>", "<unk_50>",
    "<unk_51>", "<unk_52>", "<unk_53>", "<unk_54>", "<unk_55>", "<unk_56>", "<unk_57>", "<unk_58>", "<unk_59>", "<unk_60>",
    "<unk_61>", "<unk_62>", "<unk_63>", "<unk_64>", "<unk_65>", "<unk_66>", "<unk_67>", "<unk_68>", "<unk_69>", "<unk_70>",
    "<unk_71>", "<unk_72>", "<unk_73>", "<unk_74>", "<unk_75>", "<unk_76>", "<unk_77>", "<unk_78>", "<unk_79>", "<unk_80>",
    "<unk_81>", "<unk_82>", "<unk_83>", "<unk_84>", "<unk_85>", "<unk_86>", "<unk_87>", "<unk_88>", "<unk_89>", "<unk_90>",
    "<unk_91>", "<unk_92>", "<unk_93>", "<unk_94>", "<unk_95>", "<unk_96>", "<unk_97>", "<unk_98>", "<unk_99>", "<unk_100>",
    "<unk_101>", "<unk_102>"
  ],
  "clean_up_tokenization_spaces": true,
  "eos_token": "</s>",
  "full_tokenizer_file": null,
  "mask_token": "<mask_2>",
  "mask_token_sent": "<mask_1>",
  "model_max_length": 1024,
  "offset": 103,
  "pad_token": "<pad>",
  "sp_model_kwargs": {},
  "tokenizer_class": "PegasusTokenizer",
  "unk_token": "<unk>"
}