"""Run a single text-generation smoke test against a Yuan MoE checkpoint.

The script loads the tokenizer and model stored under
``quantized_model_dir``, generates a continuation for one fixed prompt and
prints the decoded result to stdout.
"""
import os
import sys

# The parent directory must be on the module search path *before* the
# project-local import below is attempted; appending it afterwards (as the
# script previously did) cannot help that import resolve. `append` puts the
# directory at the lowest priority, so modules that were already importable
# keep resolving to the same files.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir)))

import torch  # noqa: E402
from transformers import LlamaTokenizer, TextGenerationPipeline, AutoModelForCausalLM  # noqa: E402
from yuan_moe_hf_model import YuanForCausalLM  # noqa: E402

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

quantized_model_dir = "/temp_data/LLM_test/MOE/Yuan2-M32-int4-hf"

# Load the tokenizer. BOS/EOS tokens are not added automatically to the prompt.
# NOTE(review): the empty `eos_token` looks suspicious (it may have lost its
# value at some point) -- confirm the intended EOS token against the
# checkpoint's tokenizer configuration.
tokenizer = LlamaTokenizer.from_pretrained(
    quantized_model_dir,
    add_eos_token=False,
    add_bos_token=False,
    eos_token='',
)

# Load the model weights in float16 from safetensors and move the model to
# the selected device.
model = YuanForCausalLM.from_pretrained(
    quantized_model_dir,
    trust_remote_code=True,
    use_safetensors=True,
    torch_dtype=torch.float16,
).to(device)

# Disabled debugging aid: re-initialise every parameter whose name does not
# contain "quantized" (or use another suitable initialisation scheme).
# for name, param in model.named_parameters():
#     if not "quantized" in name:
#         param.data.normal_(mean=0.0, std=0.02)

# Disabled: move the model to the GPU (already done by `.to(device)` above).
# model.to(device)

# Run inference: tokenize the prompt, generate up to 256 new tokens and
# decode the first (only) returned sequence.
input_text = "北京是中国的"
input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(device)
output_ids = model.generate(input_ids, max_new_tokens=256)
output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

print(output_text)