openbmb
/

MiniCPM-o-2_6

Model card Files Files and versions Community

wanderor committed 15 days ago

Commit

1ceb0cb

verified ·

1 Parent(s): 21e853e

Support macOS in MiniCPM-o 2.6

Browse files

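Decouples the model code from CUDA: hard-coded `.cuda()` / `.to("cuda")` device placements are replaced with `.to(self.device)`.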

Note: verified with torch 2.5.1. It does not work on macOS with torch 2.3.1 (the version specified in the requirements).

Files changed (1) hide show

modeling_minicpmo.py +6 -6

modeling_minicpmo.py CHANGED Viewed

@@ -184,7 +184,7 @@ class MiniCPMO(MiniCPMOPreTrainedModel):
             args=(),
             init={"class_path": "vocos.heads.ISTFTHead", "init_args": {"dim": 512, "n_fft": 1024, "hop_length": 256}},
         )
-        vocos = Vocos(feature_extractor, backbone, head).to("cuda").eval().to(torch.float32)
         vocos.load_state_dict(torch.load(ckpt_path, weights_only=True, mmap=True))
         return vocos
@@ -1185,7 +1185,7 @@ class MiniCPMO(MiniCPMOPreTrainedModel):
         terminators = [tokenizer.convert_tokens_to_ids(i) for i in self.terminators]
         generate_prompt = "<|im_end|>\n<|im_start|>assistant\n<|spk_bos|><|spk|><|spk_eos|><|tts_bos|>"
-        input_ids = tokenizer(generate_prompt, return_tensors="pt", add_special_tokens=False)["input_ids"].cuda()
         spk_start_idx = torch.where(input_ids[0] == tokenizer.spk_start_id)[0]
         spk_end_idx = torch.where(input_ids[0] == tokenizer.spk_end_id)[0]
@@ -1289,7 +1289,7 @@ class MiniCPMO(MiniCPMOPreTrainedModel):
         text = "[Stts]" + "[spk_emb]" * self.tts.num_spk_embs
         tts_input_ids = self.tts_processor.text_tokenizer(text, return_tensors="pt", add_special_tokens=False)[
             "input_ids"
-        ].cuda()
         return tts_input_ids
     def _build_streaming_mask(self, tts_tokens_len):
@@ -1320,7 +1320,7 @@ class MiniCPMO(MiniCPMOPreTrainedModel):
         gen_text = text.split("<|tts_eos|>")[0]
         tts_text, tts_token_lens = self.prepare_tts_text(gen_text)
         tts_inputs = self.tts_processor.text_tokenizer.encode(tts_text, add_special_tokens=False)
-        tts_input_ids = torch.Tensor(tts_inputs).unsqueeze(0).to("cuda", dtype=torch.long)
         streaming_tts_text_mask = self._build_streaming_mask(tts_token_lens).to(device=self.tts.device)
         logits_warpers, logits_processors = gen_logits(
@@ -1617,7 +1617,7 @@ class MiniCPMO(MiniCPMOPreTrainedModel):
                 tts_input_ids = self.tts_processor.text_tokenizer(
                     tts_text, return_tensors="pt", add_special_tokens=False
-                )["input_ids"].cuda()
                 text_input_ids = tts_input_ids[:, begin:end]
                 streaming_tts_text_mask = self._build_streaming_mask(tts_token_lens).to(device=self.tts.device)
                 position_ids = torch.arange(begin, end, dtype=torch.long, device=self.tts.device).unsqueeze(0)
@@ -1726,7 +1726,7 @@ class MiniCPMO(MiniCPMOPreTrainedModel):
             if end > begin:
                 tts_input_ids = self.tts_processor.text_tokenizer(
                     tts_text, return_tensors="pt", add_special_tokens=False
-                )["input_ids"].cuda()
                 text_input_ids = tts_input_ids[:, begin:end]
                 streaming_tts_text_mask = self._build_streaming_mask(tts_token_lens).to(device=self.tts.device)
                 position_ids = torch.arange(begin, end, dtype=torch.long, device=self.tts.device).unsqueeze(0)

             args=(),
             init={"class_path": "vocos.heads.ISTFTHead", "init_args": {"dim": 512, "n_fft": 1024, "hop_length": 256}},
         )
+        vocos = Vocos(feature_extractor, backbone, head).to(self.device).eval().to(torch.float32)
         vocos.load_state_dict(torch.load(ckpt_path, weights_only=True, mmap=True))
         return vocos
         terminators = [tokenizer.convert_tokens_to_ids(i) for i in self.terminators]
         generate_prompt = "<|im_end|>\n<|im_start|>assistant\n<|spk_bos|><|spk|><|spk_eos|><|tts_bos|>"
+        input_ids = tokenizer(generate_prompt, return_tensors="pt", add_special_tokens=False)["input_ids"].to(self.device)
         spk_start_idx = torch.where(input_ids[0] == tokenizer.spk_start_id)[0]
         spk_end_idx = torch.where(input_ids[0] == tokenizer.spk_end_id)[0]
         text = "[Stts]" + "[spk_emb]" * self.tts.num_spk_embs
         tts_input_ids = self.tts_processor.text_tokenizer(text, return_tensors="pt", add_special_tokens=False)[
             "input_ids"
+        ].to(self.device)
         return tts_input_ids
     def _build_streaming_mask(self, tts_tokens_len):
         gen_text = text.split("<|tts_eos|>")[0]
         tts_text, tts_token_lens = self.prepare_tts_text(gen_text)
         tts_inputs = self.tts_processor.text_tokenizer.encode(tts_text, add_special_tokens=False)
+        tts_input_ids = torch.Tensor(tts_inputs).unsqueeze(0).to(self.device, dtype=torch.long)
         streaming_tts_text_mask = self._build_streaming_mask(tts_token_lens).to(device=self.tts.device)
         logits_warpers, logits_processors = gen_logits(
                 tts_input_ids = self.tts_processor.text_tokenizer(
                     tts_text, return_tensors="pt", add_special_tokens=False
+                )["input_ids"].to(self.device)
                 text_input_ids = tts_input_ids[:, begin:end]
                 streaming_tts_text_mask = self._build_streaming_mask(tts_token_lens).to(device=self.tts.device)
                 position_ids = torch.arange(begin, end, dtype=torch.long, device=self.tts.device).unsqueeze(0)
             if end > begin:
                 tts_input_ids = self.tts_processor.text_tokenizer(
                     tts_text, return_tensors="pt", add_special_tokens=False
+                )["input_ids"].to(self.device)
                 text_input_ids = tts_input_ids[:, begin:end]
                 streaming_tts_text_mask = self._build_streaming_mask(tts_token_lens).to(device=self.tts.device)
                 position_ids = torch.arange(begin, end, dtype=torch.long, device=self.tts.device).unsqueeze(0)