Upload 8 files

- README.md +93 -0
- config.json +27 -0
- model.safetensors +3 -0
- modeling_sparse.py +18 -0
- special_tokens_map.json +7 -0
- tokenization_sparse.py +112 -0
- tokenizer_config.json +10 -0
- vocab.txt +0 -0
README.md
ADDED
@@ -0,0 +1,93 @@
---
language:
- zh
base_model: junnyu/roformer_chinese_base
tags:
- transformers
---

## <u>INF</u> <u>W</u>ord-level <u>S</u>parse <u>E</u>mbedding v2 (INF-WSE-v2)

**INF-WSE-v2** is the latest version of the word-level sparse embedding model developed by [INF TECH](https://www.infly.cn/en).

Compared to [INF-WSE-v1](https://huggingface.co/infly/inf-wse-v1-base-zh), INF-WSE-v2 is further pretrained on the [Wudao](https://huggingface.co/datasets/p208p2002/wudao) corpus (starting from [roformer_chinese_base](https://huggingface.co/junnyu/roformer_chinese_base)) and introduces enhanced token rewriting capabilities. These advancements improve the model's ability to generate accurate, adaptable, and contextually relevant text embeddings, with a particular focus on Chinese language processing.

### Key Features:

- **Optimized for Retrieval**: INF-WSE-v2 is designed specifically for information retrieval. By leveraging sparse embeddings, the model ensures efficient matching between queries and documents, making it well suited for semantic search, ranking, and other retrieval scenarios where both speed and accuracy are essential.

- **Token Rewriting Capability**: A new token rewriting feature allows INF-WSE-v2 to dynamically modify tokens during the embedding process. This improves the model's ability to produce accurate and contextually relevant representations, especially when dealing with complex linguistic structures and nuances in Chinese text.

- **Sparse Representation for Efficiency**: Unlike traditional dense embeddings, in which every dimension carries a value, INF-WSE-v2 produces vocabulary-sized sparse embeddings in which most dimensions are zero and only the most significant dimensions keep non-zero weights. This reduces computational load while maintaining high accuracy for retrieval tasks (a short sketch of this follows the list).

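A quick way to see this sparsity in practice is to count the active dimensions of a single embedding. The following is a minimal sketch, not part of the official usage code; it assumes the `embeddings` tensor produced in the Usage section below (shape `[num_texts, vocab_size]`).

```python
# Minimal sketch: inspect how sparse one embedding actually is.
# Assumes `embeddings` is the [num_texts, vocab_size] tensor from the Usage example below.
doc_emb = embeddings[2]               # embedding of the first document
non_zero = int((doc_emb != 0).sum())  # number of active (non-zero) dimensions
total = doc_emb.numel()               # vocabulary size (50000 for this model)
print(f"{non_zero} / {total} dimensions are non-zero ({100 * non_zero / total:.2f}%)")
```
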
## Usage

### Transformers

#### Infer embeddings
```python
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM

queries = ['电脑一体机由什么构成?', '什么是掌上电脑?']
documents = [
    '电脑一体机,是由一台显示器、一个电脑键盘和一个鼠标组成的电脑。',
    '掌上电脑是一种运行在嵌入式操作系统和内嵌式应用软件之上的、小巧、轻便、易带、实用、价廉的手持式计算设备。',
]
input_texts = queries + documents

tokenizer = AutoTokenizer.from_pretrained("infly/inf-wse-v2-base-zh", trust_remote_code=True, use_fast=False)  # the fast tokenizer is not supported yet
model = AutoModelForMaskedLM.from_pretrained("infly/inf-wse-v2-base-zh", trust_remote_code=True)
model.eval()

max_length = 512

input_batch = tokenizer(input_texts, padding=True, max_length=max_length, truncation=True, return_tensors="pt")
with torch.no_grad():
    embeddings = model(input_batch['input_ids'], input_batch['attention_mask'], return_sparse=False)  # return_sparse=True returns a sparse tensor, otherwise a dense tensor

scores = embeddings[:2] @ embeddings[2:].T
print(scores.tolist())
# [[25.137710571289062, 9.891149520874023], [11.703001976013184, 30.97362518310547]]
```

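The `return_sparse` flag controls the output format: `False` returns a dense `[num_texts, vocab_size]` tensor, while `True` returns the same values as a sparse COO tensor. Below is a minimal sketch of the sparse path, reusing `model` and `input_batch` from the example above; the densified check is only for illustration.

```python
# Minimal sketch: request sparse COO tensors instead of dense ones.
# Reuses `model` and `input_batch` from the example above.
with torch.no_grad():
    sparse_emb = model(input_batch['input_ids'], input_batch['attention_mask'], return_sparse=True)

print(sparse_emb.is_sparse)  # True

# Densify only to verify the scores; the sparse form is what a sparse index would consume.
dense_emb = sparse_emb.to_dense()
print((dense_emb[:2] @ dense_emb[2:].T).tolist())  # matches the scores printed above
```
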
#### Convert embeddings to lexical weights
```python
from collections import OrderedDict

def convert_embeddings_to_weights(embeddings, tokenizer):
    values, indices = torch.sort(embeddings, dim=-1, descending=True)

    token2weight = []
    for i in range(embeddings.size(0)):
        token2weight.append(OrderedDict())

        non_zero_mask = values[i] != 0
        tokens = tokenizer.convert_ids_to_tokens(indices[i][non_zero_mask])
        weights = values[i][non_zero_mask].tolist()

        for token, weight in zip(tokens, weights):
            token2weight[i][token] = weight

    return token2weight

token2weight = convert_embeddings_to_weights(embeddings, tokenizer)
print(token2weight[1])
# OrderedDict([('掌上', 1.9666814804077148), ('电脑', 1.4205719232559204), ('掌中', 1.2688857316970825), ('全称', 1.2548470497131348), ('to', 1.041936993598938), ('台式机', 0.9435897469520569), ('编程语言', 0.8740423917770386), ('pad', 0.8506593108177185), ('手持', 0.835372269153595), ('point', 0.8245767951011658), ('计算机', 0.8100651502609253), ('叫法', 0.8098558187484741), ('手部', 0.7246338725090027), ('手机', 0.6195603013038635), ('micro', 0.5971686244010925), ('电子产品', 0.5647062063217163), ('软件', 0.561561107635498), ('手指', 0.494046688079834), ('technology', 0.47637590765953064), ('pen', 0.4651668071746826), ('virtual', 0.4590775668621063), ('掌心', 0.4538556635379791), ('智能', 0.40049654245376587), ('智慧', 0.3949573338031769), ('touch', 0.38361087441444397), ('指向', 0.3723030686378479), ('移动', 0.3585004508495331), ('事物', 0.34118232131004333), ('电子元件', 0.3282782733440399), ('笔记本', 0.3156297206878662), ('原名', 0.3028894364833832), ('鼠标', 0.28492796421051025), ('android', 0.25649091601371765), ('指', 0.1655425727367401), ('掌握', 0.16021089255809784), ('chi', 0.15045176446437836), ('前臂', 0.11981695145368576), ('book', 0.09273456782102585), ('手掌', 0.07757095992565155), ('按键', 0.06321503221988678), ('小型', 0.05425526574254036), ('一体机', 0.04848058149218559), ('my', 0.03250341862440109), ('psp', 0.01875465363264084), ('跨平台', 0.01767222210764885), ('电脑游戏', 0.005152992904186249)])
```

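Because each weight is simply the value of one vocabulary dimension, query-document relevance can be recovered from the lexical weights alone: it is the sum of `query_weight * document_weight` over the tokens the two share, which equals the dense dot product. A minimal sketch (assuming `token2weight` was computed from `embeddings`, i.e. two queries followed by two documents); the printed scores should reproduce the dot products from the first example.

```python
# Minimal sketch: score query/document pairs directly from lexical weights.
# Assumes `token2weight` holds weights for the two queries followed by the two documents.
def lexical_score(query_weights, doc_weights):
    # Dot product restricted to the tokens shared by query and document.
    return sum(weight * doc_weights[token]
               for token, weight in query_weights.items() if token in doc_weights)

for qi in range(2):
    for di in range(2, 4):
        print(f"query {qi} vs document {di - 2}: {lexical_score(token2weight[qi], token2weight[di]):.2f}")
```
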
## Evaluation

### C-MTEB Retrieval task

([Chinese Massive Text Embedding Benchmark](https://github.com/FlagOpen/FlagEmbedding/tree/master/C_MTEB))

Metric: nDCG@10

| Model Name | Max Length | Average | Cmedqa | Covid | Du | Ecom | Medical | MMarco | T2 | Video |
|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|
| [BM25-zh](https://github.com/castorini/pyserini) | - | 50.37 | 13.70 | **86.58** | 57.13 | 44.04 | 32.08 | 48.31 | 60.48 | 60.64 |
| [bge-m3-sparse](https://huggingface.co/BAAI/bge-m3) | 512 | 57.00 | 24.50 | 76.09 | 71.51 | 50.49 | 43.93 | 59.28 | 71.76 | 58.43 |
| [inf-wse-v1-base-zh](https://huggingface.co/infly/inf-wse-v1-base-zh) | 512 | 61.16 | 20.51 | 76.41 | 79.84 | 56.78 | 46.24 | 66.40 | 76.50 | 68.57 |
| **inf-wse-v2-base-zh** | 512 | **69.15** | **30.64** | 79.38 | **87.12** | **64.95** | **56.54** | **78.80** | **83.05** | **72.69** |

All results, except for BM25, are measured by building the sparse index via [Qdrant](https://github.com/qdrant/qdrant).
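
For reference, the sketch below shows one way such a sparse index could be built with `qdrant-client` (version >= 1.10 assumed). It reuses `embeddings` and `documents` from the Usage section; the collection name `docs` and vector name `text` are placeholders, and this is not the exact evaluation setup used above.

```python
# Minimal sketch: index the document embeddings as Qdrant sparse vectors and query them.
# Assumes qdrant-client >= 1.10; "docs"/"text" are placeholder names, not the evaluation setup.
import torch
from qdrant_client import QdrantClient, models

def to_sparse_vector(row):
    # Convert one [vocab_size] embedding row into Qdrant's indices/values format.
    idx = torch.nonzero(row, as_tuple=True)[0]
    return models.SparseVector(indices=idx.tolist(), values=row[idx].tolist())

client = QdrantClient(":memory:")
client.create_collection(
    collection_name="docs",
    vectors_config={},
    sparse_vectors_config={"text": models.SparseVectorParams()},
)
client.upsert(
    collection_name="docs",
    points=[
        models.PointStruct(id=i, vector={"text": to_sparse_vector(embeddings[2 + i])},
                           payload={"text": documents[i]})
        for i in range(len(documents))
    ],
)
hits = client.query_points(
    collection_name="docs",
    query=to_sparse_vector(embeddings[0]),  # first query: '电脑一体机由什么构成?'
    using="text",
    limit=2,
)
print(hits.points)
```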
config.json
ADDED
@@ -0,0 +1,27 @@
{
  "architectures": [
    "RoFormerForMaskedLM"
  ],
  "auto_map": {
    "AutoModelForMaskedLM": "modeling_sparse.RoFormerForSparseEmbeddingV2"
  },
  "attention_probs_dropout_prob": 0.1,
  "embedding_size": 768,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 1536,
  "model_type": "roformer",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "rotary_value": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.39.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 50000
}
model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:340da6777e4c33d9dc5faf9ebfc1f69b6a26d2772986695e28ba7b735505088e
size 248420568
modeling_sparse.py
ADDED
@@ -0,0 +1,18 @@
import torch
from transformers import RoFormerForMaskedLM


class RoFormerForSparseEmbeddingV2(RoFormerForMaskedLM):
    def forward(self, input_ids, attention_mask, return_sparse=False):
        logits = super().forward(input_ids, attention_mask)['logits']  # [B,L,V]
        token_mask = (1 - attention_mask.unsqueeze(-1)) * -1e4  # [B,L,1]
        token_mask[:, 0, :] = -1e4
        last_ind = torch.sum(attention_mask, -1, keepdim=True).unsqueeze(-1) - 1  # [B,1,1]
        token_mask = torch.scatter(token_mask, -2, last_ind, -1e4)
        logits = logits + token_mask
        emb = torch.log(1 + torch.max(torch.relu(logits), dim=-2).values)  # [B,V]

        if return_sparse:
            emb = emb.to_sparse()

        return emb
special_tokens_map.json
ADDED
@@ -0,0 +1,7 @@
{
  "cls_token": "[CLS]",
  "mask_token": "[MASK]",
  "pad_token": "[PAD]",
  "sep_token": "[SEP]",
  "unk_token": "[UNK]"
}
tokenization_sparse.py
ADDED
@@ -0,0 +1,112 @@
from transformers.models.roformer.tokenization_roformer import (WordpieceTokenizer, whitespace_tokenize,
                                                                RoFormerTokenizer)


# Copied from transformers.models.roformer.tokenization_roformer.BasicTokenizer._is_chinese_char
def _is_chinese_char(cp):
    """Checks whether CP is the codepoint of a CJK character."""
    # This defines a "chinese character" as anything in the CJK Unicode block:
    # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
    #
    # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
    # despite its name. The modern Korean Hangul alphabet is a different block,
    # as is Japanese Hiragana and Katakana. Those alphabets are used to write
    # space-separated words, so they are not treated specially and handled
    # like the all of the other languages.
    if (
        (cp >= 0x4E00 and cp <= 0x9FFF)
        or (cp >= 0x3400 and cp <= 0x4DBF)  #
        or (cp >= 0x20000 and cp <= 0x2A6DF)  #
        or (cp >= 0x2A700 and cp <= 0x2B73F)  #
        or (cp >= 0x2B740 and cp <= 0x2B81F)  #
        or (cp >= 0x2B820 and cp <= 0x2CEAF)  #
        or (cp >= 0xF900 and cp <= 0xFAFF)
        or (cp >= 0x2F800 and cp <= 0x2FA1F)  #
    ):  #
        return True

    return False


# Modified from transformers.models.roformer.tokenization_roformer.WordpieceTokenizer
class ChineseWordpieceTokenizer(WordpieceTokenizer):
    def tokenize(self, text):
        """
        Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
        tokenization using the given vocabulary.

        For example, `input = "unaffable"` wil return as output `["un", "##aff", "##able"]`.

        Args:
            text: A single token or whitespace separated tokens. This should have
                already been passed through *BasicTokenizer*.

        Returns:
            A list of wordpiece tokens.
        """

        output_tokens = []
        for token in whitespace_tokenize(text):
            chars = list(token)
            if len(chars) > self.max_input_chars_per_word:
                output_tokens.append(self.unk_token)
                continue

            is_bad = False
            start = 0
            sub_tokens = []
            while start < len(chars):
                end = len(chars)
                cur_substr = None
                while start < end:
                    substr = "".join(chars[start:end])
                    if start > 0 and not _is_chinese_char(ord(substr[0])):  # only add ## when not Chinese character
                        substr = "##" + substr
                    if substr in self.vocab:
                        cur_substr = substr
                        break
                    end -= 1
                if cur_substr is None:
                    is_bad = True
                    break
                sub_tokens.append(cur_substr)
                start = end

            if is_bad:
                output_tokens.append(self.unk_token)
            else:
                output_tokens.extend(sub_tokens)
        return output_tokens


class ChineseRoFormerTokenizer(RoFormerTokenizer):
    def __init__(
        self,
        vocab_file,
        do_lower_case=True,
        do_basic_tokenize=True,
        never_split=None,
        unk_token="[UNK]",
        sep_token="[SEP]",
        pad_token="[PAD]",
        cls_token="[CLS]",
        mask_token="[MASK]",
        tokenize_chinese_chars=False,
        strip_accents=None,
        **kwargs,
    ):
        super().__init__(
            vocab_file=vocab_file,
            do_lower_case=do_lower_case,
            do_basic_tokenize=do_basic_tokenize,
            never_split=never_split,
            unk_token=unk_token,
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            tokenize_chinese_chars=tokenize_chinese_chars,
            strip_accents=strip_accents,
            **kwargs,
        )
        self.wordpiece_tokenizer = ChineseWordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token))
tokenizer_config.json
ADDED
@@ -0,0 +1,10 @@
{
  "auto_map": {
    "AutoTokenizer": [
      "tokenization_sparse.ChineseRoFormerTokenizer",
      ""
    ]
  },
  "tokenizer_class": "ChineseRoFormerTokenizer",
  "tokenize_chinese_chars": false
}
vocab.txt
ADDED
The diff for this file is too large to render.
See raw diff