upskyy committed on
Commit
ffcc0dc
1 Parent(s): 8774a48

Upload folder using huggingface_hub

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
1_Pooling/config.json ADDED
@@ -0,0 +1,10 @@
+ {
+   "word_embedding_dimension": 768,
+   "pooling_mode_cls_token": false,
+   "pooling_mode_mean_tokens": true,
+   "pooling_mode_max_tokens": false,
+   "pooling_mode_mean_sqrt_len_tokens": false,
+   "pooling_mode_weightedmean_tokens": false,
+   "pooling_mode_lasttoken": false,
+   "include_prompt": true
+ }
README.md CHANGED
@@ -1,3 +1,357 @@
- ---
- license: mit
- ---
+ ---
+ language:
+ - multilingual
+ - af
+ - am
+ - ar
+ - as
+ - az
+ - be
+ - bg
+ - bn
+ - br
+ - bs
+ - ca
+ - cs
+ - cy
+ - da
+ - de
+ - el
+ - en
+ - eo
+ - es
+ - et
+ - eu
+ - fa
+ - fi
+ - fr
+ - fy
+ - ga
+ - gd
+ - gl
+ - gu
+ - ha
+ - he
+ - hi
+ - hr
+ - hu
+ - hy
+ - id
+ - is
+ - it
+ - ja
+ - jv
+ - ka
+ - kk
+ - km
+ - kn
+ - ko
+ - ku
+ - ky
+ - la
+ - lo
+ - lt
+ - lv
+ - mg
+ - mk
+ - ml
+ - mn
+ - mr
+ - ms
+ - my
+ - ne
+ - nl
+ - 'no'
+ - om
+ - or
+ - pa
+ - pl
+ - ps
+ - pt
+ - ro
+ - ru
+ - sa
+ - sd
+ - si
+ - sk
+ - sl
+ - so
+ - sq
+ - sr
+ - su
+ - sv
+ - sw
+ - ta
+ - te
+ - th
+ - tl
+ - tr
+ - ug
+ - uk
+ - ur
+ - uz
+ - vi
+ - xh
+ - yi
+ - zh
+ license: mit
+ library_name: sentence-transformers
+ tags:
+ - korean
+ - sentence-transformers
+ - transformers
+ - multilingual
+ - sentence-similarity
+ - feature-extraction
+ base_model: intfloat/multilingual-e5-base
+ datasets: []
+ metrics:
+ - pearson_cosine
+ - spearman_cosine
+ - pearson_manhattan
+ - spearman_manhattan
+ - pearson_euclidean
+ - spearman_euclidean
+ - pearson_dot
+ - spearman_dot
+ - pearson_max
+ - spearman_max
+ widget:
+ - source_sentence: 이집트 군대가 형제애를 단속하다
+   sentences:
+   - 이집트의 군대가 무슬림 형제애를 단속하다
+   - 아르헨티나의 기예르모 코리아와 네덜란드의 마틴 버커크의 또 다른 준결승전도 매력적이다.
+   - 그것이 사실일 수도 있다고 생각하는 것은 재미있다.
+ - source_sentence: 오, 그리고 다시 결혼은 근본적인 인권이라고 주장한다.
+   sentences:
+   - 특히 결혼은 근본적인 인권이라고 말한 후에.
+   - 해변에 있는 흑인과 그의 개...
+   - 이란은 핵 프로그램이 평화적인 목적을 위한 것이라고 주장한다
+ - source_sentence: 담배 피우는 여자.
+   sentences:
+   - 이것은 내가 영국의 아서 안데르센 사업부의 파트너인 짐 와디아를 아서 안데르센 경영진이 선택한 것보다 래리 웨인바흐를 안데르센 월드와이드의
+     경영 파트너로 승계하기 위해 안데르센 컨설팅 사업부(현재의 엑센츄어라고 알려져 있음)의 전 관리 파트너인 조지 샤힌에 대한 지지를 표명했을
+     때 가장 명백했다.
+   - 한 여자가 물 한 잔을 마시고 있다.
+   - 한 여성이 담배를 피우면서 청구서를 지불하는 것을 압도했다.
+ - source_sentence: 루이 15세의 소수 민족인 프랑스의 리젠트인 필리프 도를레앙 시대에는 악명 높은 오르가즘의 현장이었다.
+   sentences:
+   - 필립 도린스는 루이 15세가 70대였을 때 섭정이었다.
+   - 행복한 어린 소년이 커다란 엘모 인형이 있는 의자에 앉아 있다.
+   - 필리프 도를레앙 시대에는 그곳에서 많은 유명한 오르가즘이 일어났다.
+ - source_sentence: 두 남자가 안에서 일하고 있다
+   sentences:
+   - 국립공원에서 가장 큰 마을인 케스윅의 인구는 매년 여름 등산객, 뱃사람, 관광객이 도착함에 따라 증가한다.
+   - 두 남자가 축구 경기를 보고 간식을 먹는다.
+   - 두 남자가 집에 타일을 깔았다.
+ pipeline_tag: sentence-similarity
+ model-index:
+ - name: upskyy/e5-base-korean
+   results:
+   - task:
+       type: semantic-similarity
+       name: Semantic Similarity
+     dataset:
+       name: sts dev
+       type: sts-dev
+     metrics:
+     - type: pearson_cosine
+       value: 0.8593935914692068
+       name: Pearson Cosine
+     - type: spearman_cosine
+       value: 0.8572594228080116
+       name: Spearman Cosine
+     - type: pearson_manhattan
+       value: 0.8217336375412545
+       name: Pearson Manhattan
+     - type: spearman_manhattan
+       value: 0.8280050978871264
+       name: Spearman Manhattan
+     - type: pearson_euclidean
+       value: 0.8208931119126335
+       name: Pearson Euclidean
+     - type: spearman_euclidean
+       value: 0.8277058727421436
+       name: Spearman Euclidean
+     - type: pearson_dot
+       value: 0.8187961699085111
+       name: Pearson Dot
+     - type: spearman_dot
+       value: 0.8236175658758088
+       name: Spearman Dot
+     - type: pearson_max
+       value: 0.8593935914692068
+       name: Pearson Max
+     - type: spearman_max
+       value: 0.8572594228080116
+       name: Spearman Max
+ ---
+ 
+ # SentenceTransformer based on intfloat/multilingual-e5-base
+ 
+ This model was fine-tuned on KorSTS and KorNLI from [intfloat/multilingual-e5-base](https://huggingface.co/intfloat/multilingual-e5-base). It maps sentences & paragraphs to a 768-dimensional dense vector space and can be used for semantic textual similarity, semantic search, paraphrase mining, text classification, clustering, and more.
+ 
+ ## Model Details
+ 
+ ### Model Description
+ - **Model Type:** Sentence Transformer
+ - **Base model:** [intfloat/multilingual-e5-base](https://huggingface.co/intfloat/multilingual-e5-base) <!-- at revision d13f1b27baf31030b7fd040960d60d909913633f -->
+ - **Maximum Sequence Length:** 512 tokens
+ - **Output Dimensionality:** 768 dimensions
+ - **Similarity Function:** Cosine Similarity
+ <!-- - **Training Dataset:** Unknown -->
+ <!-- - **Language:** Unknown -->
+ <!-- - **License:** Unknown -->
+ 
+ 
+ ### Full Model Architecture
+ 
+ ```
+ SentenceTransformer(
+   (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: XLMRobertaModel
+   (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
+ )
+ ```
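+ 
+ The same two-module pipeline can also be assembled by hand with `sentence_transformers.models`. The snippet below is a minimal sketch (not part of the released files; variable names are illustrative) showing how the printed architecture maps to code:
+ 
+ ```python
+ from sentence_transformers import SentenceTransformer, models
+ 
+ # Token-level encoder (XLM-RoBERTa backbone), truncating inputs at 512 tokens
+ word_embedding_model = models.Transformer("intfloat/multilingual-e5-base", max_seq_length=512)
+ 
+ # Mean pooling over token embeddings, mirroring 1_Pooling/config.json
+ pooling_model = models.Pooling(
+     word_embedding_model.get_word_embedding_dimension(),  # 768
+     pooling_mode="mean",
+ )
+ 
+ model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
+ ```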
+ 
+ 
+ ## Usage
+ 
+ ### Usage (Sentence-Transformers)
+ 
+ First install the Sentence Transformers library:
+ 
+ ```bash
+ pip install -U sentence-transformers
+ ```
+ 
+ Then you can load this model and run inference.
+ ```python
+ from sentence_transformers import SentenceTransformer
+ 
+ # Download from the 🤗 Hub
+ model = SentenceTransformer("upskyy/e5-base-korean")
+ 
+ # Run inference
+ sentences = [
+     '아이를 가진 엄마가 해변을 걷는다.',
+     '두 사람이 해변을 걷는다.',
+     '한 남자가 해변에서 개를 산책시킨다.',
+ ]
+ embeddings = model.encode(sentences)
+ print(embeddings.shape)
+ # [3, 768]
+ 
+ # Get the similarity scores for the embeddings
+ similarities = model.similarity(embeddings, embeddings)
+ print(similarities.shape)
+ # [3, 3]
+ ```
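+ 
+ Since all sentences are mapped into the same 768-dimensional space, the embeddings can also be used for semantic search. The following is a minimal sketch using `sentence_transformers.util.semantic_search`; the corpus and query sentences are illustrative examples only:
+ 
+ ```python
+ from sentence_transformers import SentenceTransformer, util
+ 
+ model = SentenceTransformer("upskyy/e5-base-korean")
+ 
+ corpus = [
+     "두 사람이 해변을 걷는다.",
+     "한 남자가 해변에서 개를 산책시킨다.",
+     "이란은 핵 프로그램이 평화적인 목적을 위한 것이라고 주장한다",
+ ]
+ corpus_embeddings = model.encode(corpus, convert_to_tensor=True)
+ query_embedding = model.encode("아이를 가진 엄마가 해변을 걷는다.", convert_to_tensor=True)
+ 
+ # Rank the corpus by cosine similarity to the query and keep the top 2 hits
+ hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=2)[0]
+ for hit in hits:
+     print(corpus[hit["corpus_id"]], hit["score"])
+ ```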
+ 
+ ### Usage (HuggingFace Transformers)
+ 
+ Without sentence-transformers, you can use the model like this: first, pass your input through the transformer model, then apply the right pooling operation on top of the contextualized word embeddings.
+ 
+ ```python
+ from transformers import AutoTokenizer, AutoModel
+ import torch
+ 
+ 
+ # Mean Pooling - take the attention mask into account for correct averaging
+ def mean_pooling(model_output, attention_mask):
+     token_embeddings = model_output[0]  # First element of model_output contains all token embeddings
+     input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+     return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
+ 
+ 
+ # Sentences we want sentence embeddings for
+ sentences = ["안녕하세요?", "한국어 문장 임베딩을 위한 버트 모델입니다."]
+ 
+ # Load model from HuggingFace Hub
+ tokenizer = AutoTokenizer.from_pretrained("upskyy/e5-base-korean")
+ model = AutoModel.from_pretrained("upskyy/e5-base-korean")
+ 
+ # Tokenize sentences
+ encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")
+ 
+ # Compute token embeddings
+ with torch.no_grad():
+     model_output = model(**encoded_input)
+ 
+ # Perform pooling. In this case, mean pooling.
+ sentence_embeddings = mean_pooling(model_output, encoded_input["attention_mask"])
+ 
+ print("Sentence embeddings:")
+ print(sentence_embeddings)
+ ```
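+ 
+ To compare these embeddings, a common follow-up is to L2-normalize them so that the dot product equals cosine similarity. A short sketch, assuming `sentence_embeddings` from the snippet above:
+ 
+ ```python
+ import torch.nn.functional as F
+ 
+ # L2-normalize each embedding; the dot product then equals the cosine similarity
+ normalized_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
+ cosine_similarities = normalized_embeddings @ normalized_embeddings.T
+ print(cosine_similarities)
+ ```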
+ 
+ 
+ ## Evaluation
+ 
+ ### Metrics
+ 
+ #### Semantic Similarity
+ * Dataset: `sts-dev`
+ * Evaluated with [<code>EmbeddingSimilarityEvaluator</code>](https://sbert.net/docs/package_reference/sentence_transformer/evaluation.html#sentence_transformers.evaluation.EmbeddingSimilarityEvaluator)
+ 
+ | Metric             | Value      |
+ | :----------------- | :--------- |
+ | pearson_cosine     | 0.8594     |
+ | spearman_cosine    | 0.8573     |
+ | pearson_manhattan  | 0.8217     |
+ | spearman_manhattan | 0.8280     |
+ | pearson_euclidean  | 0.8209     |
+ | spearman_euclidean | 0.8277     |
+ | pearson_dot        | 0.8188     |
+ | spearman_dot       | 0.8236     |
+ | **pearson_max**    | **0.8594** |
+ | **spearman_max**   | **0.8573** |
+ 
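+ The scores above come from the `EmbeddingSimilarityEvaluator` linked earlier. As a rough sketch of how such numbers can be reproduced, the sentence pairs and gold scores below are placeholders rather than the actual KorSTS dev split:
+ 
+ ```python
+ from sentence_transformers import SentenceTransformer
+ from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
+ 
+ model = SentenceTransformer("upskyy/e5-base-korean")
+ 
+ # Placeholder pairs and gold similarity scores (correlation metrics are scale-invariant)
+ sentences1 = ["두 남자가 안에서 일하고 있다", "담배 피우는 여자."]
+ sentences2 = ["두 남자가 집에 타일을 깔았다.", "한 여자가 물 한 잔을 마시고 있다."]
+ gold_scores = [0.6, 0.1]
+ 
+ evaluator = EmbeddingSimilarityEvaluator(sentences1, sentences2, gold_scores, name="sts-dev")
+ results = evaluator(model)  # Pearson/Spearman scores for cosine, Euclidean, etc.
+ print(results)
+ ```
+ 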
+ <!--
+ ## Bias, Risks and Limitations
+ 
+ *What are the known or foreseeable issues stemming from this model? You could also flag here known failure cases or weaknesses of the model.*
+ -->
+ 
+ <!--
+ ### Recommendations
+ 
+ *What are recommendations with respect to the foreseeable issues? For example, filtering explicit content.*
+ -->
+ 
+ ### Framework Versions
+ - Python: 3.10.13
+ - Sentence Transformers: 3.0.1
+ - Transformers: 4.42.4
+ - PyTorch: 2.3.0+cu121
+ - Accelerate: 0.30.1
+ - Datasets: 2.16.1
+ - Tokenizers: 0.19.1
+ 
+ ## Citation
+ 
+ ### BibTeX
+ 
+ #### Multilingual E5
+ ```bibtex
+ @article{wang2024multilingual,
+     title={Multilingual E5 Text Embeddings: A Technical Report},
+     author={Wang, Liang and Yang, Nan and Huang, Xiaolong and Yang, Linjun and Majumder, Rangan and Wei, Furu},
+     journal={arXiv preprint arXiv:2402.05672},
+     year={2024}
+ }
+ ```
+ 
+ #### Sentence Transformers
+ ```bibtex
+ @inproceedings{reimers-2019-sentence-bert,
+     title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
+     author = "Reimers, Nils and Gurevych, Iryna",
+     booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
+     month = "11",
+     year = "2019",
+     publisher = "Association for Computational Linguistics",
+     url = "https://arxiv.org/abs/1908.10084",
+ }
+ ```
config.json ADDED
@@ -0,0 +1,27 @@
+ {
+   "architectures": [
+     "XLMRobertaModel"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "bos_token_id": 0,
+   "classifier_dropout": null,
+   "eos_token_id": 2,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 768,
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "layer_norm_eps": 1e-05,
+   "max_position_embeddings": 514,
+   "model_type": "xlm-roberta",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "output_past": true,
+   "pad_token_id": 1,
+   "position_embedding_type": "absolute",
+   "torch_dtype": "float32",
+   "transformers_version": "4.42.4",
+   "type_vocab_size": 1,
+   "use_cache": true,
+   "vocab_size": 250002
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:427570b1987be1bdfacdf9425d41bc941cdbb935a079d4d7f692cbdb3af178e6
+ size 1112197096
modules.json ADDED
@@ -0,0 +1,14 @@
+ [
+   {
+     "idx": 0,
+     "name": "0",
+     "path": "",
+     "type": "sentence_transformers.models.Transformer"
+   },
+   {
+     "idx": 1,
+     "name": "1",
+     "path": "1_Pooling",
+     "type": "sentence_transformers.models.Pooling"
+   }
+ ]
sentence_bert_config.json ADDED
@@ -0,0 +1,4 @@
+ {
+   "max_seq_length": 512,
+   "do_lower_case": false
+ }
sentencepiece.bpe.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cfc8146abe2a0488e9e2a0c56de7952f7c11ab059eca145a0a727afce0db2865
+ size 5069051
special_tokens_map.json ADDED
@@ -0,0 +1,51 @@
+ {
+   "bos_token": {
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "cls_token": {
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "mask_token": {
+     "content": "<mask>",
+     "lstrip": true,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "<pad>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "sep_token": {
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:883b037111086fd4dfebbbc9b7cee11e1517b5e0c0514879478661440f137085
+ size 17082987
tokenizer_config.json ADDED
@@ -0,0 +1,54 @@
+ {
+   "added_tokens_decoder": {
+     "0": {
+       "content": "<s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "<pad>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "</s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "3": {
+       "content": "<unk>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "250001": {
+       "content": "<mask>",
+       "lstrip": true,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "bos_token": "<s>",
+   "clean_up_tokenization_spaces": true,
+   "cls_token": "<s>",
+   "eos_token": "</s>",
+   "mask_token": "<mask>",
+   "model_max_length": 512,
+   "pad_token": "<pad>",
+   "sep_token": "</s>",
+   "tokenizer_class": "XLMRobertaTokenizer",
+   "unk_token": "<unk>"
+ }