ZhiyuanChen committed: Update README.md

README.md
```yaml
library_name: multimolecule
pipeline_tag: fill-mask
mask_token: "<mask>"
widget:
- example_title: "HIV-1"
  text: "GGUC<mask>CUCUGGUUAGACCAGAUCUGAGCCU"
  output:
  - label: "G"
    score: 0.09252794831991196
  - label: "R"
    score: 0.09062391519546509
  - label: "A"
    score: 0.08875908702611923
  - label: "V"
    score: 0.07809742540121078
  - label: "S"
    score: 0.07325706630945206
- example_title: "microRNA-21"
  text: "UAGC<mask>UAUCAGACUGAUGUUGA"
  output:
```
- **Paper**: Multi-purpose RNA language modelling with motif-aware pretraining and type-guided fine-tuning
- **Developed by**: Ning Wang, Jiang Bian, Yuchen Li, Xuhong Li, Shahid Mumtaz, Linghe Kong, Haoyi Xiong
- **Model type**: [BERT](https://huggingface.co/google-bert/bert-base-uncased) - [ERNIE](https://huggingface.co/nghuyong/ernie-3.0-base-zh)
- **Original Repository**: [CatIIIIIIII/RNAErnie](https://github.com/CatIIIIIIII/RNAErnie)

## Usage
You can use this model directly with a pipeline for masked language modeling:

```python
>>> import multimolecule  # you must import multimolecule to register models
>>> from transformers import pipeline
>>> unmasker = pipeline("fill-mask", model="multimolecule/rnaernie")
>>> unmasker("gguc<mask>cucugguuagaccagaucugagccu")

[{'score': 0.09252794831991196,
  'token': 8,
  'token_str': 'G',
  'sequence': 'G G U C G C U C U G G U U A G A C C A G A U C U G A G C C U'},
 {'score': 0.09062391519546509,
  'token': 11,
  'token_str': 'R',
  'sequence': 'G G U C R C U C U G G U U A G A C C A G A U C U G A G C C U'},
 {'score': 0.08875908702611923,
  'token': 6,
  'token_str': 'A',
  'sequence': 'G G U C A C U C U G G U U A G A C C A G A U C U G A G C C U'},
 {'score': 0.07809742540121078,
  'token': 20,
  'token_str': 'V',
  'sequence': 'G G U C V C U C U G G U U A G A C C A G A U C U G A G C C U'},
 {'score': 0.07325706630945206,
  'token': 13,
  'token_str': 'S',
  'sequence': 'G G U C S C U C U G G U U A G A C C A G A U C U G A G C C U'}]
```
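By default the fill-mask pipeline returns the five highest-scoring candidates; the standard `transformers` pipeline accepts a `top_k` argument to change that. A minimal sketch (the argument comes from `transformers`, not from this model card):

```python
>>> unmasker("gguc<mask>cucugguuagaccagaucugagccu", top_k=2)  # keep only the two best candidates
```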
### Downstream Use

Here is how to use this model to get the features of a given sequence in PyTorch:

```python
from multimolecule import RnaTokenizer, RnaErnieModel


tokenizer = RnaTokenizer.from_pretrained("multimolecule/rnaernie")
model = RnaErnieModel.from_pretrained("multimolecule/rnaernie")

text = "UAGCUUAUCAGACUGAUGUUGA"
input = tokenizer(text, return_tensors="pt")

output = model(**input)
```
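`output` follows the standard Transformers output convention, so `output.last_hidden_state` holds one embedding per token. If you need a single fixed-size vector per sequence, one common recipe is mean pooling; this sketch is an illustration, not part of the original card:

```python
# Mean-pool token embeddings into one vector per sequence.
# Note: the average includes the special <cls>/<eos> tokens added by the tokenizer.
embedding = output.last_hidden_state.mean(dim=1)  # shape: (batch_size, hidden_size)
```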
#### Sequence Classification / Regression

Here is how to use this model as a backbone to fine-tune for a sequence-level task in PyTorch:

```python
import torch
from multimolecule import RnaTokenizer, RnaErnieForSequencePrediction


tokenizer = RnaTokenizer.from_pretrained("multimolecule/rnaernie")
model = RnaErnieForSequencePrediction.from_pretrained("multimolecule/rnaernie")

text = "UAGCUUAUCAGACUGAUGUUGA"
input = tokenizer(text, return_tensors="pt")
label = torch.tensor([1])

output = model(**input, labels=label)
```
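Passing `labels` makes the model return a loss, so the block above drops straight into a training loop. A minimal sketch of one optimization step, continuing from the block above (the optimizer and learning rate are illustrative choices, not from the original card):

```python
import torch

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)  # illustrative hyperparameters

optimizer.zero_grad()
output = model(**input, labels=label)
output.loss.backward()  # backpropagate through the prediction head and the backbone
optimizer.step()
```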
#### Token Classification / Regression

**Note**: This model is not fine-tuned for any specific task. You will need to fine-tune the model on a downstream task to use it for nucleotide classification or regression.

Here is how to use this model as a backbone to fine-tune for a nucleotide-level task in PyTorch:

```python
import torch
from multimolecule import RnaTokenizer, RnaErnieForTokenPrediction


tokenizer = RnaTokenizer.from_pretrained("multimolecule/rnaernie")
model = RnaErnieForTokenPrediction.from_pretrained("multimolecule/rnaernie")

text = "UAGCUUAUCAGACUGAUGUUGA"
input = tokenizer(text, return_tensors="pt")
label = torch.randint(2, (len(text), ))

output = model(**input, labels=label)
```
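Once fine-tuned, the same model runs inference without labels; `output.logits` then carries the raw per-nucleotide scores. A minimal sketch, continuing from the block above:

```python
import torch

with torch.no_grad():  # inference only, no gradients needed
    output = model(**input)
logits = output.logits  # raw per-nucleotide scores
```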
#### Contact Classification / Regression

Here is how to use this model as a backbone to fine-tune for a contact-level task in PyTorch:

```python
import torch
from multimolecule import RnaTokenizer, RnaErnieForContactPrediction


tokenizer = RnaTokenizer.from_pretrained("multimolecule/rnaernie")
model = RnaErnieForContactPrediction.from_pretrained("multimolecule/rnaernie")

text = "UAGCUUAUCAGACUGAUGUUGA"
input = tokenizer(text, return_tensors="pt")
label = torch.randint(2, (len(text), len(text)))

output = model(**input, labels=label)
```
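The random `label` above is only a placeholder: a real contact map is a symmetric 0/1 matrix over nucleotide pairs. A slightly more realistic placeholder, continuing from the block above (illustrative only):

```python
import torch

L = len(text)
upper = torch.triu(torch.randint(2, (L, L)))        # random upper triangle (incl. diagonal)
contacts = upper + torch.triu(upper, diagonal=1).T  # mirror it: i pairs with j iff j pairs with i

output = model(**input, labels=contacts)
```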