am-azadi committed (verified) · Commit a857baa · 1 parent: 797d7a4

Upload folder using huggingface_hub
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
1_Pooling/config.json ADDED
@@ -0,0 +1,10 @@
{
  "word_embedding_dimension": 1024,
  "pooling_mode_cls_token": false,
  "pooling_mode_mean_tokens": true,
  "pooling_mode_max_tokens": false,
  "pooling_mode_mean_sqrt_len_tokens": false,
  "pooling_mode_weightedmean_tokens": false,
  "pooling_mode_lasttoken": false,
  "include_prompt": true
}
README.md ADDED
@@ -0,0 +1,423 @@
---
tags:
- sentence-transformers
- sentence-similarity
- feature-extraction
- generated_from_trainer
- dataset_size:21769
- loss:MultipleNegativesRankingLoss
base_model: am-azadi/bilingual-embedding-large_Fine_Tuned_1e
widget:
- source_sentence: 'GOOD NEWS! Eriksen, has already gone out to the hospital window,
    where he is under observation and looks optimistic after having suffered a cardiac
    arrest. '
  sentences:
  - Bolsonaro with the two assassins of Marielle Franco No, the men next to Jair Bolsonaro
    in this photo are not the ones accused of the murder of Marielle Franco
  - This photo shows Christian Eriksen waving from the window of the hospital where
    he was admitted after suffering cardiac arrest The photo of Eriksen waving from
    the window was taken months before his heart incident
  - Video of protests in the US during the COVID-19 pandemic This video has been circulating
    in reports about the funeral procession of military commanders in Iran in January
    2020
- source_sentence: What a dirty game... "US postman arrested in canadian border with
    banknotes stolen in the trunk of the car". 91 Breaking911 5h U.S. Postal Worker
    Caught at Canadian Border With Stolen Ballots In Car Trunk - breaking911.com/u-s-postal-wor...
    8218248 Claudia Wild IT 8206434 300 4:57 06 Nov 20 Twitter for iPhone 1,134
    Retweets 113 Tweets with comment
  sentences:
  - Postman arrested with stolen bills at US-Canada border Only three blank bills
    were found in a US postal worker's car
  - Covid relief plan will cost every American $5,750 Misleading posts claim US covid
    relief plan costs every American $5,750
  - CDC informs that 10% of the swabs used for PCR testing were sent to LABORATORIES,
    being analyzed of GENETIC SEQUENCES We check the claim that PCR tests aim to sequence
    the DNA of patients with covid-19
- source_sentence: '. Northeast Always in Our Hearts! Advance Northeast!! . Brazilian
    Army through its Engineering Battalion finds a Huge Potable Water Well in Seridó
    - Caicó/RN, one of the most needy areas. This well will supply the homes of more
    than 3,000 people!! . It''s our President Bolsonaro ridding the Bravo People
    of the Northeast from the wounds of drought! . . . BRAZIL LOVED HOMELAND .
    . Friends and Followers of : Follow and Turn on our Notifications . . #
    pocket . '
  sentences:
  - Twitter suspended Elon Musk's Twitter account after he pulled out of deal Imposter
    Elon Musk Twitter account shared in false posts claiming he was 'suspended' over
    buyout row
  - The Brazilian Army found water in Caicó, Rio Grande do Norte, during the government
    of President Jair Bolsonaro. The recording of the drilling of an artesian well
    in Caicó, Rio Grande do Norte, has been circulating since 2015
  - A video was published today about Syrian refugees in Sweden being subjected to
    the separation of husbands, as well as the forcible removal of their children
    and the handing over of children to Christian families to change their religion.
    And to turn them into Christians, they will have two children Swedish police did
    not take Syrian children to hand over to Christian families
- source_sentence: what hp Álvaro Uribe Vélez ... 3pm ✓ The coastal people are the
    least intellectual of the country, that is why this region of Colombia is mired
    in poverty. They don't like to work either. that's why there is currently a level
    very high of misery in la guajira. With the democratic center we will change.
    The entire Caribbean coast must feel outraged by the statements of this individual.
    Now with more reasons, the coastal people should support Petro. The how.. see
    more
  sentences:
  - 'Covid-19: Omicron variant is transmitted by eye contact according to the WHO
    The coronavirus is transmitted by interaction with contaminated droplets, not
    by eye contact'
  - 5G causes suffocation in humans, affects the respiratory system There is no evidence
    that 5G technology affects the respiratory system and increases toxins in the
    body
  - Álvaro Uribe tweeted that the coastal people are the least intellectual population
    in Colombia There is no record of Uribe tweeting that the coast is the "least
    intellectual" region of Colombia
- source_sentence: 'The terrorists evaporated in seconds A very rare scene of the
    moment the Egyptian planes bombed the terrorist elements in Sinai Watch the video
    here NB Please all our followers on our page subscribe to our YouTube channel
    We will publish everything new on the ground Open the channel link '
  sentences:
  - Cars melt due to hot weather in Saudi Arabia No, these cars did not melt due to
    hot weather
  - Footage shows robbery in Sri Lanka Delhi crime footage falsely shared as 'Sri
    Lanka burglary'
  - A very rare scene of the moment the Egyptian planes bombed the terrorist elements
    in Sinai This picture is not of an Egyptian warplane, but of an Israeli plane
pipeline_tag: sentence-similarity
library_name: sentence-transformers
---

# SentenceTransformer based on am-azadi/bilingual-embedding-large_Fine_Tuned_1e

This is a [sentence-transformers](https://www.SBERT.net) model finetuned from [am-azadi/bilingual-embedding-large_Fine_Tuned_1e](https://huggingface.co/am-azadi/bilingual-embedding-large_Fine_Tuned_1e). It maps sentences & paragraphs to a 1024-dimensional dense vector space and can be used for semantic textual similarity, semantic search, paraphrase mining, text classification, clustering, and more.

## Model Details

### Model Description
- **Model Type:** Sentence Transformer
- **Base model:** [am-azadi/bilingual-embedding-large_Fine_Tuned_1e](https://huggingface.co/am-azadi/bilingual-embedding-large_Fine_Tuned_1e) <!-- at revision 9212ebc911617536aa06e4fe49c33d6f93ace38a -->
- **Maximum Sequence Length:** 512 tokens
- **Output Dimensionality:** 1024 dimensions
- **Similarity Function:** Cosine Similarity
<!-- - **Training Dataset:** Unknown -->
<!-- - **Language:** Unknown -->
<!-- - **License:** Unknown -->
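As a quick sanity check, these properties can be read directly off a loaded model. A minimal sketch (the model id is a placeholder for this repository, and `trust_remote_code=True` is assumed to be needed for the custom `BilingualModel` code):

```python
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("sentence_transformers_model_id", trust_remote_code=True)

print(model.max_seq_length)                      # 512
print(model.get_sentence_embedding_dimension())  # 1024
print(model.similarity_fn_name)                  # cosine
```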
### Model Sources

- **Documentation:** [Sentence Transformers Documentation](https://sbert.net)
- **Repository:** [Sentence Transformers on GitHub](https://github.com/UKPLab/sentence-transformers)
- **Hugging Face:** [Sentence Transformers on Hugging Face](https://huggingface.co/models?library=sentence-transformers)

### Full Model Architecture

```
SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BilingualModel
  (1): Pooling({'word_embedding_dimension': 1024, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)
```
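In other words, module (0) produces per-token embeddings, module (1) averages them over non-padding positions (`pooling_mode_mean_tokens`), and module (2) L2-normalizes the result so that cosine similarity reduces to a dot product. A rough sketch of the same computation in plain `transformers` and `torch` (the model id is a placeholder, and loading the custom `BilingualModel` code via `trust_remote_code` is an assumption):

```python
import torch
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("sentence_transformers_model_id")
encoder = AutoModel.from_pretrained("sentence_transformers_model_id", trust_remote_code=True)

batch = tokenizer(["An example sentence"], padding=True, truncation=True,
                  max_length=512, return_tensors="pt")
with torch.no_grad():
    token_embeddings = encoder(**batch).last_hidden_state  # (batch, seq_len, 1024)

# (1) Pooling: mean over non-padding tokens only
mask = batch["attention_mask"].unsqueeze(-1).float()
pooled = (token_embeddings * mask).sum(dim=1) / mask.sum(dim=1)

# (2) Normalize: unit-length vectors, so cosine similarity is a dot product
embeddings = torch.nn.functional.normalize(pooled, p=2, dim=1)  # (batch, 1024)
```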
## Usage

### Direct Usage (Sentence Transformers)

First install the Sentence Transformers library:

```bash
pip install -U sentence-transformers
```

Then you can load this model and run inference.

```python
from sentence_transformers import SentenceTransformer

# Download from the 🤗 Hub ("sentence_transformers_model_id" is a placeholder for this
# repository id; trust_remote_code=True is needed for the custom BilingualModel code)
model = SentenceTransformer("sentence_transformers_model_id", trust_remote_code=True)
# Run inference
sentences = [
    'The terrorists evaporated in seconds A very rare scene of the moment the Egyptian planes bombed the terrorist elements in Sinai Watch the video here NB Please all our followers on our page subscribe to our YouTube channel We will publish everything new on the ground Open the channel link ',
    'A very rare scene of the moment the Egyptian planes bombed the terrorist elements in Sinai This picture is not of an Egyptian warplane, but of an Israeli plane',
    'Cars melt due to hot weather in Saudi Arabia No, these cars did not melt due to hot weather',
]
embeddings = model.encode(sentences)
print(embeddings.shape)
# (3, 1024)

# Get the similarity scores for the embeddings
similarities = model.similarity(embeddings, embeddings)
print(similarities.shape)
# torch.Size([3, 3])
```
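The same embeddings also cover the retrieval-style use cases mentioned above, e.g. matching a claim against a corpus of fact-checks. A small illustrative sketch (the corpus and query are made up here, and the model id is again a placeholder):

```python
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("sentence_transformers_model_id", trust_remote_code=True)

corpus = [
    "This photo shows Christian Eriksen waving from the window of the hospital",
    "Video of protests in the US during the COVID-19 pandemic",
    "Cars melt due to hot weather in Saudi Arabia",
]
corpus_embeddings = model.encode(corpus, convert_to_tensor=True)

query_embedding = model.encode("Eriksen photographed at a hospital window",
                               convert_to_tensor=True)
# Rank the corpus by cosine similarity to the query
hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=2)[0]
for hit in hits:
    print(corpus[hit["corpus_id"]], hit["score"])
```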
<!--
### Direct Usage (Transformers)

<details><summary>Click to see the direct usage in Transformers</summary>

</details>
-->

<!--
### Downstream Usage (Sentence Transformers)

You can finetune this model on your own dataset.

<details><summary>Click to expand</summary>

</details>
-->

<!--
### Out-of-Scope Use

*List how the model may foreseeably be misused and address what users ought not to do with the model.*
-->

<!--
## Bias, Risks and Limitations

*What are the known or foreseeable issues stemming from this model? You could also flag here known failure cases or weaknesses of the model.*
-->

<!--
### Recommendations

*What are recommendations with respect to the foreseeable issues? For example, filtering explicit content.*
-->

## Training Details

### Training Dataset

#### Unnamed Dataset

* Size: 21,769 training samples
* Columns: <code>sentence_0</code> and <code>sentence_1</code>
* Approximate statistics based on the first 1000 samples:
  |         | sentence_0                                                                           | sentence_1                                                                          |
  |:--------|:-------------------------------------------------------------------------------------|:------------------------------------------------------------------------------------|
  | type    | string                                                                               | string                                                                              |
  | details | <ul><li>min: 6 tokens</li><li>mean: 119.28 tokens</li><li>max: 512 tokens</li></ul> | <ul><li>min: 18 tokens</li><li>mean: 39.42 tokens</li><li>max: 98 tokens</li></ul> |
* Samples:
  | sentence_0 | sentence_1 |
  |:-----------|:-----------|
  | <code>HAPPENING NOW ; KENYA ELECTRIC BUS IS ON FIRE ALONG KAREN ROAD. </code> | <code>Electric bus catches fire in Nairobi Video shows a methane-powered bus that caught fire in Italy, not an electric bus in Kenya</code> |
  | <code> RUPTLY Viewed 51,670 times 8 hours Snorr On the way down Khao Pak Thong Chai, route 3-4, Sattahip - Korat, all of them would have died. pity Incident 27 Jun.</code> | <code>Video showing road accidents in Thailand? This is a video published in a news report about a car crash in Russia.</code> |
  | <code>The image that went around the world! This photo won the best of the decade award and led to the author to depression, the author narrated in his description; "Cheetahs chased a mother deer and her 2 babies, she offered herself so that her children could escape and in the photo looks like she watches her babies run to safety as she is about to be devoured" How many times have you stopped to think how many sacrifices your parents do for you. While you have fun, laugh and you enjoy life, they give theirs.</code> | <code>Cheetahs chased a mother deer and she volunteered so her children could escape Behind the picture: Cheetahs learned from their mother how to capture prey</code> |
* Loss: [<code>MultipleNegativesRankingLoss</code>](https://sbert.net/docs/package_reference/sentence_transformer/losses.html#multiplenegativesrankingloss) with these parameters:
  ```json
  {
      "scale": 20.0,
      "similarity_fct": "cos_sim"
  }
  ```

### Training Hyperparameters
#### Non-Default Hyperparameters

- `per_device_train_batch_size`: 2
- `per_device_eval_batch_size`: 2
- `num_train_epochs`: 1
- `multi_dataset_batch_sampler`: round_robin
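Put together, training amounts to the standard sentence-transformers trainer loop over the `(sentence_0, sentence_1)` pairs with the loss and non-default values above. A minimal reconstruction sketch, not the exact training script (the two-example dataset is a stand-in for the real 21,769 pairs):

```python
from datasets import Dataset
from sentence_transformers import (SentenceTransformer, SentenceTransformerTrainer,
                                   SentenceTransformerTrainingArguments)
from sentence_transformers.losses import MultipleNegativesRankingLoss

model = SentenceTransformer("am-azadi/bilingual-embedding-large_Fine_Tuned_1e",
                            trust_remote_code=True)

# Stand-in for the real (claim, fact-check) pairs described above
train_dataset = Dataset.from_dict({
    "sentence_0": ["KENYA ELECTRIC BUS IS ON FIRE ALONG KAREN ROAD.",
                   "US postman arrested at the Canadian border with stolen ballots"],
    "sentence_1": ["Video shows a methane-powered bus that caught fire in Italy, not an electric bus in Kenya",
                   "Only three blank ballots were found in a US postal worker's car"],
})

# scale=20.0 and cosine similarity match the loss parameters listed above
loss = MultipleNegativesRankingLoss(model, scale=20.0)

args = SentenceTransformerTrainingArguments(
    output_dir="output",
    per_device_train_batch_size=2,  # with batch size 2, each pair sees one in-batch negative
    num_train_epochs=1,
)

trainer = SentenceTransformerTrainer(model=model, args=args,
                                     train_dataset=train_dataset, loss=loss)
trainer.train()
```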
#### All Hyperparameters
<details><summary>Click to expand</summary>

- `overwrite_output_dir`: False
- `do_predict`: False
- `eval_strategy`: no
- `prediction_loss_only`: True
- `per_device_train_batch_size`: 2
- `per_device_eval_batch_size`: 2
- `per_gpu_train_batch_size`: None
- `per_gpu_eval_batch_size`: None
- `gradient_accumulation_steps`: 1
- `eval_accumulation_steps`: None
- `torch_empty_cache_steps`: None
- `learning_rate`: 5e-05
- `weight_decay`: 0.0
- `adam_beta1`: 0.9
- `adam_beta2`: 0.999
- `adam_epsilon`: 1e-08
- `max_grad_norm`: 1
- `num_train_epochs`: 1
- `max_steps`: -1
- `lr_scheduler_type`: linear
- `lr_scheduler_kwargs`: {}
- `warmup_ratio`: 0.0
- `warmup_steps`: 0
- `log_level`: passive
- `log_level_replica`: warning
- `log_on_each_node`: True
- `logging_nan_inf_filter`: True
- `save_safetensors`: True
- `save_on_each_node`: False
- `save_only_model`: False
- `restore_callback_states_from_checkpoint`: False
- `no_cuda`: False
- `use_cpu`: False
- `use_mps_device`: False
- `seed`: 42
- `data_seed`: None
- `jit_mode_eval`: False
- `use_ipex`: False
- `bf16`: False
- `fp16`: False
- `fp16_opt_level`: O1
- `half_precision_backend`: auto
- `bf16_full_eval`: False
- `fp16_full_eval`: False
- `tf32`: None
- `local_rank`: 0
- `ddp_backend`: None
- `tpu_num_cores`: None
- `tpu_metrics_debug`: False
- `debug`: []
- `dataloader_drop_last`: False
- `dataloader_num_workers`: 0
- `dataloader_prefetch_factor`: None
- `past_index`: -1
- `disable_tqdm`: False
- `remove_unused_columns`: True
- `label_names`: None
- `load_best_model_at_end`: False
- `ignore_data_skip`: False
- `fsdp`: []
- `fsdp_min_num_params`: 0
- `fsdp_config`: {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}
- `fsdp_transformer_layer_cls_to_wrap`: None
- `accelerator_config`: {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}
- `deepspeed`: None
- `label_smoothing_factor`: 0.0
- `optim`: adamw_torch
- `optim_args`: None
- `adafactor`: False
- `group_by_length`: False
- `length_column_name`: length
- `ddp_find_unused_parameters`: None
- `ddp_bucket_cap_mb`: None
- `ddp_broadcast_buffers`: False
- `dataloader_pin_memory`: True
- `dataloader_persistent_workers`: False
- `skip_memory_metrics`: True
- `use_legacy_prediction_loop`: False
- `push_to_hub`: False
- `resume_from_checkpoint`: None
- `hub_model_id`: None
- `hub_strategy`: every_save
- `hub_private_repo`: None
- `hub_always_push`: False
- `gradient_checkpointing`: False
- `gradient_checkpointing_kwargs`: None
- `include_inputs_for_metrics`: False
- `include_for_metrics`: []
- `eval_do_concat_batches`: True
- `fp16_backend`: auto
- `push_to_hub_model_id`: None
- `push_to_hub_organization`: None
- `mp_parameters`: 
- `auto_find_batch_size`: False
- `full_determinism`: False
- `torchdynamo`: None
- `ray_scope`: last
- `ddp_timeout`: 1800
- `torch_compile`: False
- `torch_compile_backend`: None
- `torch_compile_mode`: None
- `dispatch_batches`: None
- `split_batches`: None
- `include_tokens_per_second`: False
- `include_num_input_tokens_seen`: False
- `neftune_noise_alpha`: None
- `optim_target_modules`: None
- `batch_eval_metrics`: False
- `eval_on_start`: False
- `use_liger_kernel`: False
- `eval_use_gather_object`: False
- `average_tokens_across_devices`: False
- `prompts`: None
- `batch_sampler`: batch_sampler
- `multi_dataset_batch_sampler`: round_robin

</details>

### Training Logs
| Epoch  | Step  | Training Loss |
|:------:|:-----:|:-------------:|
| 0.0459 | 500   | 0.0135        |
| 0.0919 | 1000  | 0.024         |
| 0.1378 | 1500  | 0.0073        |
| 0.1837 | 2000  | 0.0103        |
| 0.2297 | 2500  | 0.0265        |
| 0.2756 | 3000  | 0.0209        |
| 0.3215 | 3500  | 0.0308        |
| 0.3675 | 4000  | 0.0301        |
| 0.4134 | 4500  | 0.0382        |
| 0.4593 | 5000  | 0.0164        |
| 0.5053 | 5500  | 0.0251        |
| 0.5512 | 6000  | 0.0141        |
| 0.5972 | 6500  | 0.0131        |
| 0.6431 | 7000  | 0.006         |
| 0.6890 | 7500  | 0.0261        |
| 0.7350 | 8000  | 0.0111        |
| 0.7809 | 8500  | 0.0089        |
| 0.8268 | 9000  | 0.0201        |
| 0.8728 | 9500  | 0.0175        |
| 0.9187 | 10000 | 0.0086        |
| 0.9646 | 10500 | 0.0049        |

### Framework Versions
- Python: 3.11.11
- Sentence Transformers: 3.4.1
- Transformers: 4.48.3
- PyTorch: 2.5.1+cu124
- Accelerate: 1.3.0
- Datasets: 3.3.2
- Tokenizers: 0.21.0

## Citation

### BibTeX

#### Sentence Transformers
```bibtex
@inproceedings{reimers-2019-sentence-bert,
    title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
    author = "Reimers, Nils and Gurevych, Iryna",
    booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
    month = "11",
    year = "2019",
    publisher = "Association for Computational Linguistics",
    url = "https://arxiv.org/abs/1908.10084",
}
```

#### MultipleNegativesRankingLoss
```bibtex
@misc{henderson2017efficient,
    title={Efficient Natural Language Response Suggestion for Smart Reply},
    author={Matthew Henderson and Rami Al-Rfou and Brian Strope and Yun-hsuan Sung and Laszlo Lukacs and Ruiqi Guo and Sanjiv Kumar and Balint Miklos and Ray Kurzweil},
    year={2017},
    eprint={1705.00652},
    archivePrefix={arXiv},
    primaryClass={cs.CL}
}
```

<!--
## Glossary

*Clearly define terms in order to be accessible across audiences.*
-->

<!--
## Model Card Authors

*Lists the people who create the model card, providing recognition and accountability for the detailed work that goes into its construction.*
-->

<!--
## Model Card Contact

*Provides a way for people who have updates to the Model Card, suggestions, or questions, to contact the Model Card authors.*
-->
config.json ADDED
@@ -0,0 +1,37 @@
{
  "_name_or_path": "am-azadi/bilingual-embedding-large_Fine_Tuned_1e",
  "architectures": [
    "BilingualModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "auto_map": {
    "AutoConfig": "dangvantuan/bilingual_impl--config.BilingualConfig",
    "AutoModel": "dangvantuan/bilingual_impl--modeling.BilingualModel",
    "AutoModelForMaskedLM": "dangvantuan/bilingual_impl--modeling.BilingualForMaskedLM",
    "AutoModelForMultipleChoice": "dangvantuan/bilingual_impl--modeling.BilingualForMultipleChoice",
    "AutoModelForQuestionAnswering": "dangvantuan/bilingual_impl--modeling.BilingualForQuestionAnswering",
    "AutoModelForSequenceClassification": "dangvantuan/bilingual_impl--modeling.BilingualForSequenceClassification",
    "AutoModelForTokenClassification": "dangvantuan/bilingual_impl--modeling.BilingualForTokenClassification"
  },
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "bilingual",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.48.3",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 250002
}
config_sentence_transformers.json ADDED
@@ -0,0 +1,10 @@
{
  "__version__": {
    "sentence_transformers": "3.4.1",
    "transformers": "4.48.3",
    "pytorch": "2.5.1+cu124"
  },
  "prompts": {},
  "default_prompt_name": null,
  "similarity_fn_name": "cosine"
}
model.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5de6cb5734383d6b4791e6a42a794683a3c3f361499f1f85093f6a47e2eeaeac
size 2239607176
modules.json ADDED
@@ -0,0 +1,20 @@
[
  {
    "idx": 0,
    "name": "0",
    "path": "",
    "type": "sentence_transformers.models.Transformer"
  },
  {
    "idx": 1,
    "name": "1",
    "path": "1_Pooling",
    "type": "sentence_transformers.models.Pooling"
  },
  {
    "idx": 2,
    "name": "2",
    "path": "2_Normalize",
    "type": "sentence_transformers.models.Normalize"
  }
]
sentence_bert_config.json ADDED
@@ -0,0 +1,4 @@
{
  "max_seq_length": 512,
  "do_lower_case": false
}
sentencepiece.bpe.model ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:cfc8146abe2a0488e9e2a0c56de7952f7c11ab059eca145a0a727afce0db2865
size 5069051
special_tokens_map.json ADDED
@@ -0,0 +1,51 @@
{
  "bos_token": {
    "content": "<s>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "cls_token": {
    "content": "<s>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "eos_token": {
    "content": "</s>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "mask_token": {
    "content": "<mask>",
    "lstrip": true,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": {
    "content": "<pad>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "sep_token": {
    "content": "</s>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "unk_token": {
    "content": "<unk>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  }
}
tokenizer.json ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:883b037111086fd4dfebbbc9b7cee11e1517b5e0c0514879478661440f137085
size 17082987
tokenizer_config.json ADDED
@@ -0,0 +1,63 @@
{
  "added_tokens_decoder": {
    "0": {
      "content": "<s>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "1": {
      "content": "<pad>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "2": {
      "content": "</s>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "3": {
      "content": "<unk>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "250001": {
      "content": "<mask>",
      "lstrip": true,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    }
  },
  "additional_special_tokens": [],
  "bos_token": "<s>",
  "clean_up_tokenization_spaces": true,
  "cls_token": "<s>",
  "eos_token": "</s>",
  "extra_special_tokens": {},
  "mask_token": "<mask>",
  "max_length": 512,
  "model_max_length": 512,
  "pad_to_multiple_of": null,
  "pad_token": "<pad>",
  "pad_token_type_id": 0,
  "padding_side": "right",
  "sep_token": "</s>",
  "stride": 0,
  "tokenizer_class": "XLMRobertaTokenizer",
  "truncation_side": "right",
  "truncation_strategy": "longest_first",
  "unk_token": "<unk>"
}