Upload folder using huggingface_hub
- .gitattributes +1 -0
- 1_Pooling/config.json +10 -0
- README.md +423 -0
- config.json +37 -0
- config_sentence_transformers.json +10 -0
- model.safetensors +3 -0
- modules.json +20 -0
- sentence_bert_config.json +4 -0
- sentencepiece.bpe.model +3 -0
- special_tokens_map.json +51 -0
- tokenizer.json +3 -0
- tokenizer_config.json +63 -0
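This commit was produced by pushing a local folder to the Hub, as the commit title says. A minimal sketch of how such an upload can be done with `huggingface_hub` (the folder path and repo id below are placeholders, not taken from this commit):

```python
from huggingface_hub import HfApi

api = HfApi()  # uses the token from `huggingface-cli login` by default
api.upload_folder(
    folder_path="./my-finetuned-model",   # local directory containing the files listed above
    repo_id="your-username/your-model",   # placeholder repo id
    repo_type="model",
    commit_message="Upload folder using huggingface_hub",
)
```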
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
1_Pooling/config.json
ADDED
@@ -0,0 +1,10 @@
{
  "word_embedding_dimension": 1024,
  "pooling_mode_cls_token": false,
  "pooling_mode_mean_tokens": true,
  "pooling_mode_max_tokens": false,
  "pooling_mode_mean_sqrt_len_tokens": false,
  "pooling_mode_weightedmean_tokens": false,
  "pooling_mode_lasttoken": false,
  "include_prompt": true
}
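This pooling config enables only mean pooling over token embeddings. A minimal sketch of what masked mean pooling does, under the assumption that it mirrors the behaviour of sentence-transformers' Pooling module (not the library's exact code):

```python
import torch

def mean_pool(token_embeddings: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
    """Average each sequence's token embeddings, ignoring padding positions."""
    mask = attention_mask.unsqueeze(-1).float()      # (batch, seq_len, 1)
    summed = (token_embeddings * mask).sum(dim=1)    # (batch, 1024)
    counts = mask.sum(dim=1).clamp(min=1e-9)         # (batch, 1), avoid division by zero
    return summed / counts                           # one 1024-d vector per input
```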
README.md
ADDED
@@ -0,0 +1,423 @@
---
tags:
- sentence-transformers
- sentence-similarity
- feature-extraction
- generated_from_trainer
- dataset_size:21769
- loss:MultipleNegativesRankingLoss
base_model: am-azadi/bilingual-embedding-large_Fine_Tuned_1e
widget:
- source_sentence: 'GOOD NEWS! Eriksen, has already gone out to the hospital window, where he is under observation and looks optimistic after having suffered a cardiac arrest. '
  sentences:
  - Bolsonaro with the two assassins of Marielle Franco No, the men next to Jair Bolsonaro in this photo are not the ones accused of the murder of Marielle Franco
  - This photo shows Christian Eriksen waving from the window of the hospital where he was admitted after suffering cardiac arrest The photo of Eriksen waving from the window was taken months before his heart incident
  - Video of protests in the US during the COVID-19 pandemic This video has been circulating in reports about the funeral procession of military commanders in Iran in January 2020
- source_sentence: What a dirty game... "US postman arrested in canadian border with banknotes stolen in the trunk of the car". 91 Breaking911 5h U.S. Postal Worker Caught at Canadian Border With Stolen Ballots In Car Trunk - breaking911.com/u-s-postal-wor... 8218248 Claudia Wild IT 8206434 300 4:57 06 Nov 20 Twitter for iPhone 1,134 Retweets 113 Tweets with comment
  sentences:
  - Postman arrested with stolen bills at US-Canada border Only three blank bills were found in a US postal worker's car
  - Covid relief plan will cost every American $5,750 Misleading posts claim US covid relief plan costs every American $5,750
  - CDC informs that 10% of the swabs used for PCR testing were sent to LABORATORIES, being analyzed of GENETIC SEQUENCES We check the claim that PCR tests aim to sequence the DNA of patients with covid-19
- source_sentence: '. Northeast Always in Our Hearts! Advance Northeast!! . Brazilian Army through its Engineering Battalion finds a Huge Potable Water Well in Seridó - Caicó/RN, one of the most needy areas. This well will supply the homes of more than 3,000 people!! . It''s our President Bolsonaro ridding the Bravo People of the Northeast from the wounds of drought! . . . BRAZIL LOVED HOMELAND . . Friends and Followers of : Follow and Turn on our Notifications . . # pocket . '
  sentences:
  - Twitter suspended Elon Musk's Twitter account after he pulled out of deal Imposter Elon Musk Twitter account shared in false posts claiming he was 'suspended' over buyout row
  - The Brazilian Army found water in Caicó, Rio Grande do Norte, during the government of President Jair Bolsonaro. The recording of the drilling of an artesian well in Caicó, Rio Grande do Norte, has been circulating since 2015
  - A video was published today about Syrian refugees in Sweden being subjected to the separation of husbands, as well as the forcible removal of their children and the handing over of children to Christian families to change their religion. And to turn them into Christians, they will have two children Swedish police did not take Syrian children to hand over to Christian families
- source_sentence: what hp Álvaro Uribe Vélez ... 3pm ✓ The coastal people are the least intellectual of the country, that is why this region of Colombia is mired in poverty. They don't like to work either. that's why there is currently a level very high of misery in la guajira. With the democratic center we will change. The entire Caribbean coast must feel outraged by the statements of this individual. Now with more reasons, the coastal people should support Petro. The how.. see more
  sentences:
  - 'Covid-19: Omicron variant is transmitted by eye contact according to the WHO The coronavirus is transmitted by interaction with contaminated droplets, not by eye contact'
  - 5G causes suffocation in humans, affects the respiratory system There is no evidence that 5G technology affects the respiratory system and increases toxins in the body
  - Álvaro Uribe tweeted that the coastal people are the least intellectual population in Colombia There is no record of Uribe tweeting that the coast is the "least intellectual" region of Colombia
- source_sentence: 'The terrorists evaporated in seconds A very rare scene of the moment the Egyptian planes bombed the terrorist elements in Sinai Watch the video here NB Please all our followers on our page subscribe to our YouTube channel We will publish everything new on the ground Open the channel link '
  sentences:
  - Cars melt due to hot weather in Saudi Arabia No, these cars did not melt due to hot weather
  - Footage shows robbery in Sri Lanka Delhi crime footage falsely shared as 'Sri Lanka burglary'
  - A very rare scene of the moment the Egyptian planes bombed the terrorist elements in Sinai This picture is not of an Egyptian warplane, but of an Israeli plane
pipeline_tag: sentence-similarity
library_name: sentence-transformers
---

# SentenceTransformer based on am-azadi/bilingual-embedding-large_Fine_Tuned_1e

This is a [sentence-transformers](https://www.SBERT.net) model finetuned from [am-azadi/bilingual-embedding-large_Fine_Tuned_1e](https://huggingface.co/am-azadi/bilingual-embedding-large_Fine_Tuned_1e). It maps sentences & paragraphs to a 1024-dimensional dense vector space and can be used for semantic textual similarity, semantic search, paraphrase mining, text classification, clustering, and more.

## Model Details

### Model Description
- **Model Type:** Sentence Transformer
- **Base model:** [am-azadi/bilingual-embedding-large_Fine_Tuned_1e](https://huggingface.co/am-azadi/bilingual-embedding-large_Fine_Tuned_1e) <!-- at revision 9212ebc911617536aa06e4fe49c33d6f93ace38a -->
- **Maximum Sequence Length:** 512 tokens
- **Output Dimensionality:** 1024 dimensions
- **Similarity Function:** Cosine Similarity
<!-- - **Training Dataset:** Unknown -->
<!-- - **Language:** Unknown -->
<!-- - **License:** Unknown -->

### Model Sources

- **Documentation:** [Sentence Transformers Documentation](https://sbert.net)
- **Repository:** [Sentence Transformers on GitHub](https://github.com/UKPLab/sentence-transformers)
- **Hugging Face:** [Sentence Transformers on Hugging Face](https://huggingface.co/models?library=sentence-transformers)

### Full Model Architecture

```
SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BilingualModel
  (1): Pooling({'word_embedding_dimension': 1024, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)
```

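The same three-module stack can also be assembled by hand with the sentence-transformers `models` API. A hedged sketch (the repo id is a placeholder, and forwarding `trust_remote_code` through `model_args`/`config_args` is an assumption needed because the backbone ships custom `BilingualModel` code):

```python
from sentence_transformers import SentenceTransformer, models

word_embedding = models.Transformer(
    "your-username/your-model",                  # placeholder repo id
    max_seq_length=512,
    do_lower_case=False,
    model_args={"trust_remote_code": True},      # custom BilingualModel code lives in dangvantuan/bilingual_impl
    config_args={"trust_remote_code": True},
)
pooling = models.Pooling(word_embedding.get_word_embedding_dimension(), pooling_mode="mean")
normalize = models.Normalize()
model = SentenceTransformer(modules=[word_embedding, pooling, normalize])
```

In practice you would simply load the repository directly; this sketch only shows how the printed architecture maps onto the `Transformer`, `Pooling`, and `Normalize` modules.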
## Usage

### Direct Usage (Sentence Transformers)

First install the Sentence Transformers library:

```bash
pip install -U sentence-transformers
```

Then you can load this model and run inference.
```python
from sentence_transformers import SentenceTransformer

# Download from the 🤗 Hub
model = SentenceTransformer("sentence_transformers_model_id")
# Run inference
sentences = [
    'The terrorists evaporated in seconds A very rare scene of the moment the Egyptian planes bombed the terrorist elements in Sinai Watch the video here NB Please all our followers on our page subscribe to our YouTube channel We will publish everything new on the ground Open the channel link ',
    'A very rare scene of the moment the Egyptian planes bombed the terrorist elements in Sinai This picture is not of an Egyptian warplane, but of an Israeli plane',
    'Cars melt due to hot weather in Saudi Arabia No, these cars did not melt due to hot weather',
]
embeddings = model.encode(sentences)
print(embeddings.shape)
# [3, 1024]

# Get the similarity scores for the embeddings
similarities = model.similarity(embeddings, embeddings)
print(similarities.shape)
# [3, 3]
```

<!--
### Direct Usage (Transformers)

<details><summary>Click to see the direct usage in Transformers</summary>

</details>
-->

<!--
### Downstream Usage (Sentence Transformers)

You can finetune this model on your own dataset.

<details><summary>Click to expand</summary>

</details>
-->

<!--
### Out-of-Scope Use

*List how the model may foreseeably be misused and address what users ought not to do with the model.*
-->

<!--
## Bias, Risks and Limitations

*What are the known or foreseeable issues stemming from this model? You could also flag here known failure cases or weaknesses of the model.*
-->

<!--
### Recommendations

*What are recommendations with respect to the foreseeable issues? For example, filtering explicit content.*
-->

## Training Details

### Training Dataset

#### Unnamed Dataset

* Size: 21,769 training samples
* Columns: <code>sentence_0</code> and <code>sentence_1</code>
* Approximate statistics based on the first 1000 samples:
  |         | sentence_0                                                                           | sentence_1                                                                          |
  |:--------|:-------------------------------------------------------------------------------------|:------------------------------------------------------------------------------------|
  | type    | string                                                                               | string                                                                              |
  | details | <ul><li>min: 6 tokens</li><li>mean: 119.28 tokens</li><li>max: 512 tokens</li></ul> | <ul><li>min: 18 tokens</li><li>mean: 39.42 tokens</li><li>max: 98 tokens</li></ul> |
* Samples:
  | sentence_0 | sentence_1 |
  |:-----------|:-----------|
  | <code>HAPPENING NOW ; KENYA ELECTRIC BUS IS ON FIRE ALONG KAREN ROAD. </code> | <code>Electric bus catches fire in Nairobi Video shows a methane-powered bus that caught fire in Italy, not an electric bus in Kenya</code> |
  | <code> RUPTLY Viewed 51,670 times 8 hours Snorr On the way down Khao Pak Thong Chai, route 3-4, Sattahip - Korat, all of them would have died. pity Incident 27 Jun.</code> | <code>Video showing road accidents in Thailand? This is a video published in a news report about a car crash in Russia.</code> |
  | <code>The image that went around the world! This photo won the best of the decade award and led to the author to depression, the author narrated in his description; "Cheetahs chased a mother deer and her 2 babies, she offered herself so that her children could escape and in the photo looks like she watches her babies run to safety as she is about to be devoured" How many times have you stopped to think how many sacrifices your parents do for you. While you have fun, laugh and you enjoy life, they give theirs.</code> | <code>Cheetahs chased a mother deer and she volunteered so her children could escape Behind the picture: Cheetahs learned from their mother how to capture prey</code> |
* Loss: [<code>MultipleNegativesRankingLoss</code>](https://sbert.net/docs/package_reference/sentence_transformer/losses.html#multiplenegativesrankingloss) with these parameters:
  ```json
  {
      "scale": 20.0,
      "similarity_fct": "cos_sim"
  }
  ```

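For reference, a minimal sketch of how a comparable fine-tuning run could be set up with this loss and the non-default hyperparameters listed in the next section (the dataset rows and output directory are placeholders, not taken from this card):

```python
from datasets import Dataset
from sentence_transformers import (
    SentenceTransformer,
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments,
)
from sentence_transformers.losses import MultipleNegativesRankingLoss

model = SentenceTransformer("am-azadi/bilingual-embedding-large_Fine_Tuned_1e", trust_remote_code=True)

# Placeholder pairs; the real dataset has 21,769 (sentence_0, sentence_1) rows.
train_dataset = Dataset.from_dict({
    "sentence_0": ["claim text ..."],
    "sentence_1": ["matching fact-check title ..."],
})

loss = MultipleNegativesRankingLoss(model, scale=20.0)  # in-batch negatives, cosine similarity

args = SentenceTransformerTrainingArguments(
    output_dir="outputs",              # placeholder
    num_train_epochs=1,
    per_device_train_batch_size=2,
)
trainer = SentenceTransformerTrainer(model=model, args=args, train_dataset=train_dataset, loss=loss)
trainer.train()
```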
### Training Hyperparameters
#### Non-Default Hyperparameters

- `per_device_train_batch_size`: 2
- `per_device_eval_batch_size`: 2
- `num_train_epochs`: 1
- `multi_dataset_batch_sampler`: round_robin

#### All Hyperparameters
<details><summary>Click to expand</summary>

- `overwrite_output_dir`: False
- `do_predict`: False
- `eval_strategy`: no
- `prediction_loss_only`: True
- `per_device_train_batch_size`: 2
- `per_device_eval_batch_size`: 2
- `per_gpu_train_batch_size`: None
- `per_gpu_eval_batch_size`: None
- `gradient_accumulation_steps`: 1
- `eval_accumulation_steps`: None
- `torch_empty_cache_steps`: None
- `learning_rate`: 5e-05
- `weight_decay`: 0.0
- `adam_beta1`: 0.9
- `adam_beta2`: 0.999
- `adam_epsilon`: 1e-08
- `max_grad_norm`: 1
- `num_train_epochs`: 1
- `max_steps`: -1
- `lr_scheduler_type`: linear
- `lr_scheduler_kwargs`: {}
- `warmup_ratio`: 0.0
- `warmup_steps`: 0
- `log_level`: passive
- `log_level_replica`: warning
- `log_on_each_node`: True
- `logging_nan_inf_filter`: True
- `save_safetensors`: True
- `save_on_each_node`: False
- `save_only_model`: False
- `restore_callback_states_from_checkpoint`: False
- `no_cuda`: False
- `use_cpu`: False
- `use_mps_device`: False
- `seed`: 42
- `data_seed`: None
- `jit_mode_eval`: False
- `use_ipex`: False
- `bf16`: False
- `fp16`: False
- `fp16_opt_level`: O1
- `half_precision_backend`: auto
- `bf16_full_eval`: False
- `fp16_full_eval`: False
- `tf32`: None
- `local_rank`: 0
- `ddp_backend`: None
- `tpu_num_cores`: None
- `tpu_metrics_debug`: False
- `debug`: []
- `dataloader_drop_last`: False
- `dataloader_num_workers`: 0
- `dataloader_prefetch_factor`: None
- `past_index`: -1
- `disable_tqdm`: False
- `remove_unused_columns`: True
- `label_names`: None
- `load_best_model_at_end`: False
- `ignore_data_skip`: False
- `fsdp`: []
- `fsdp_min_num_params`: 0
- `fsdp_config`: {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}
- `fsdp_transformer_layer_cls_to_wrap`: None
- `accelerator_config`: {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}
- `deepspeed`: None
- `label_smoothing_factor`: 0.0
- `optim`: adamw_torch
- `optim_args`: None
- `adafactor`: False
- `group_by_length`: False
- `length_column_name`: length
- `ddp_find_unused_parameters`: None
- `ddp_bucket_cap_mb`: None
- `ddp_broadcast_buffers`: False
- `dataloader_pin_memory`: True
- `dataloader_persistent_workers`: False
- `skip_memory_metrics`: True
- `use_legacy_prediction_loop`: False
- `push_to_hub`: False
- `resume_from_checkpoint`: None
- `hub_model_id`: None
- `hub_strategy`: every_save
- `hub_private_repo`: None
- `hub_always_push`: False
- `gradient_checkpointing`: False
- `gradient_checkpointing_kwargs`: None
- `include_inputs_for_metrics`: False
- `include_for_metrics`: []
- `eval_do_concat_batches`: True
- `fp16_backend`: auto
- `push_to_hub_model_id`: None
- `push_to_hub_organization`: None
- `mp_parameters`:
- `auto_find_batch_size`: False
- `full_determinism`: False
- `torchdynamo`: None
- `ray_scope`: last
- `ddp_timeout`: 1800
- `torch_compile`: False
- `torch_compile_backend`: None
- `torch_compile_mode`: None
- `dispatch_batches`: None
- `split_batches`: None
- `include_tokens_per_second`: False
- `include_num_input_tokens_seen`: False
- `neftune_noise_alpha`: None
- `optim_target_modules`: None
- `batch_eval_metrics`: False
- `eval_on_start`: False
- `use_liger_kernel`: False
- `eval_use_gather_object`: False
- `average_tokens_across_devices`: False
- `prompts`: None
- `batch_sampler`: batch_sampler
- `multi_dataset_batch_sampler`: round_robin

</details>

### Training Logs
| Epoch  | Step  | Training Loss |
|:------:|:-----:|:-------------:|
| 0.0459 | 500   | 0.0135        |
| 0.0919 | 1000  | 0.024         |
| 0.1378 | 1500  | 0.0073        |
| 0.1837 | 2000  | 0.0103        |
| 0.2297 | 2500  | 0.0265        |
| 0.2756 | 3000  | 0.0209        |
| 0.3215 | 3500  | 0.0308        |
| 0.3675 | 4000  | 0.0301        |
| 0.4134 | 4500  | 0.0382        |
| 0.4593 | 5000  | 0.0164        |
| 0.5053 | 5500  | 0.0251        |
| 0.5512 | 6000  | 0.0141        |
| 0.5972 | 6500  | 0.0131        |
| 0.6431 | 7000  | 0.006         |
| 0.6890 | 7500  | 0.0261        |
| 0.7350 | 8000  | 0.0111        |
| 0.7809 | 8500  | 0.0089        |
| 0.8268 | 9000  | 0.0201        |
| 0.8728 | 9500  | 0.0175        |
| 0.9187 | 10000 | 0.0086        |
| 0.9646 | 10500 | 0.0049        |

### Framework Versions
- Python: 3.11.11
- Sentence Transformers: 3.4.1
- Transformers: 4.48.3
- PyTorch: 2.5.1+cu124
- Accelerate: 1.3.0
- Datasets: 3.3.2
- Tokenizers: 0.21.0

## Citation

### BibTeX

#### Sentence Transformers
```bibtex
@inproceedings{reimers-2019-sentence-bert,
    title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
    author = "Reimers, Nils and Gurevych, Iryna",
    booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
    month = "11",
    year = "2019",
    publisher = "Association for Computational Linguistics",
    url = "https://arxiv.org/abs/1908.10084",
}
```

#### MultipleNegativesRankingLoss
```bibtex
@misc{henderson2017efficient,
    title={Efficient Natural Language Response Suggestion for Smart Reply},
    author={Matthew Henderson and Rami Al-Rfou and Brian Strope and Yun-hsuan Sung and Laszlo Lukacs and Ruiqi Guo and Sanjiv Kumar and Balint Miklos and Ray Kurzweil},
    year={2017},
    eprint={1705.00652},
    archivePrefix={arXiv},
    primaryClass={cs.CL}
}
```

<!--
## Glossary

*Clearly define terms in order to be accessible across audiences.*
-->

<!--
## Model Card Authors

*Lists the people who create the model card, providing recognition and accountability for the detailed work that goes into its construction.*
-->

<!--
## Model Card Contact

*Provides a way for people who have updates to the Model Card, suggestions, or questions, to contact the Model Card authors.*
-->
config.json
ADDED
@@ -0,0 +1,37 @@
{
  "_name_or_path": "am-azadi/bilingual-embedding-large_Fine_Tuned_1e",
  "architectures": [
    "BilingualModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "auto_map": {
    "AutoConfig": "dangvantuan/bilingual_impl--config.BilingualConfig",
    "AutoModel": "dangvantuan/bilingual_impl--modeling.BilingualModel",
    "AutoModelForMaskedLM": "dangvantuan/bilingual_impl--modeling.BilingualForMaskedLM",
    "AutoModelForMultipleChoice": "dangvantuan/bilingual_impl--modeling.BilingualForMultipleChoice",
    "AutoModelForQuestionAnswering": "dangvantuan/bilingual_impl--modeling.BilingualForQuestionAnswering",
    "AutoModelForSequenceClassification": "dangvantuan/bilingual_impl--modeling.BilingualForSequenceClassification",
    "AutoModelForTokenClassification": "dangvantuan/bilingual_impl--modeling.BilingualForTokenClassification"
  },
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "bilingual",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.48.3",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 250002
}
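Because `auto_map` points at custom model code in `dangvantuan/bilingual_impl`, loading the backbone directly through `transformers` requires opting into remote code. A hedged sketch (the repo id is a placeholder for this repository once published):

```python
from transformers import AutoModel, AutoTokenizer

repo_id = "your-username/your-model"  # placeholder
tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModel.from_pretrained(repo_id, trust_remote_code=True)  # pulls BilingualModel from dangvantuan/bilingual_impl
```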
config_sentence_transformers.json
ADDED
@@ -0,0 +1,10 @@
{
  "__version__": {
    "sentence_transformers": "3.4.1",
    "transformers": "4.48.3",
    "pytorch": "2.5.1+cu124"
  },
  "prompts": {},
  "default_prompt_name": null,
  "similarity_fn_name": "cosine"
}
model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5de6cb5734383d6b4791e6a42a794683a3c3f361499f1f85093f6a47e2eeaeac
size 2239607176
modules.json
ADDED
@@ -0,0 +1,20 @@
[
  {
    "idx": 0,
    "name": "0",
    "path": "",
    "type": "sentence_transformers.models.Transformer"
  },
  {
    "idx": 1,
    "name": "1",
    "path": "1_Pooling",
    "type": "sentence_transformers.models.Pooling"
  },
  {
    "idx": 2,
    "name": "2",
    "path": "2_Normalize",
    "type": "sentence_transformers.models.Normalize"
  }
]
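The final `Normalize` module L2-normalizes every embedding, so cosine similarity reduces to a plain dot product. A small sketch illustrating this (the repo id and sentences are placeholders):

```python
import numpy as np
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("your-username/your-model", trust_remote_code=True)  # placeholder repo id
emb = model.encode(["first example sentence", "second example sentence"])

norms = np.linalg.norm(emb, axis=1)  # ~1.0 per row, thanks to the Normalize module
dot = emb[0] @ emb[1]                # equals cosine similarity for unit-length vectors
print(norms, dot)
```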
sentence_bert_config.json
ADDED
@@ -0,0 +1,4 @@
{
  "max_seq_length": 512,
  "do_lower_case": false
}
sentencepiece.bpe.model
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:cfc8146abe2a0488e9e2a0c56de7952f7c11ab059eca145a0a727afce0db2865
size 5069051
special_tokens_map.json
ADDED
@@ -0,0 +1,51 @@
{
  "bos_token": {
    "content": "<s>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "cls_token": {
    "content": "<s>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "eos_token": {
    "content": "</s>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "mask_token": {
    "content": "<mask>",
    "lstrip": true,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": {
    "content": "<pad>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "sep_token": {
    "content": "</s>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "unk_token": {
    "content": "<unk>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  }
}
tokenizer.json
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:883b037111086fd4dfebbbc9b7cee11e1517b5e0c0514879478661440f137085
size 17082987
tokenizer_config.json
ADDED
@@ -0,0 +1,63 @@
{
  "added_tokens_decoder": {
    "0": {
      "content": "<s>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "1": {
      "content": "<pad>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "2": {
      "content": "</s>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "3": {
      "content": "<unk>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "250001": {
      "content": "<mask>",
      "lstrip": true,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    }
  },
  "additional_special_tokens": [],
  "bos_token": "<s>",
  "clean_up_tokenization_spaces": true,
  "cls_token": "<s>",
  "eos_token": "</s>",
  "extra_special_tokens": {},
  "mask_token": "<mask>",
  "max_length": 512,
  "model_max_length": 512,
  "pad_to_multiple_of": null,
  "pad_token": "<pad>",
  "pad_token_type_id": 0,
  "padding_side": "right",
  "sep_token": "</s>",
  "stride": 0,
  "tokenizer_class": "XLMRobertaTokenizer",
  "truncation_side": "right",
  "truncation_strategy": "longest_first",
  "unk_token": "<unk>"
}
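The tokenizer is a standard XLM-R SentencePiece tokenizer with a 512-token limit. A quick sketch of inspecting it once the files are on the Hub (the repo id is a placeholder):

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("your-username/your-model")  # placeholder repo id
enc = tok("A short example sentence.", truncation=True, max_length=512)
print(tok.convert_ids_to_tokens(enc["input_ids"]))  # SentencePiece pieces wrapped in <s> ... </s>
```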