This model is for debugging. It is randomly initialized using the config from Qwen/Qwen2-Audio-7B-Instruct, but with a much smaller size.

Code:

import os
from typing import Dict

import requests
import torch
import transformers
from PIL import Image
from torchvision import io
from transformers import (AutoConfig, AutoModelForCausalLM, AutoProcessor,
                          AutoTokenizer, GenerationConfig,
                          Qwen2AudioForConditionalGeneration, pipeline,
                          set_seed)

# Reference checkpoint (source of config / processor / generation config)
# and the destination repo name used to build the local output directory.
model_id = "Qwen/Qwen2-Audio-7B-Instruct"
repo_id = "yujiepan/qwen2-audio-tiny-random"
save_path = f"/tmp/{repo_id}"

# Load the reference config, then overwrite the size-related fields of the
# audio and text sub-configs with tiny values.
config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
_size_overrides = (
    (config.audio_config, (
        ("encoder_layers", 2),
        ("encoder_attention_heads", 2),
        ("encoder_ffn_dim", 32),
        ("d_model", 16),
    )),
    (config.text_config, (
        ("num_hidden_layers", 2),
        ("intermediate_size", 32),
        ("hidden_size", 16),
        ("num_attention_heads", 2),
        ("num_key_value_heads", 1),
    )),
)
for _sub_config, _fields in _size_overrides:
    for _field_name, _field_value in _fields:
        setattr(_sub_config, _field_name, _field_value)

# Build the model from the shrunken config, cast to bf16, move it to the GPU
# and switch to eval mode.
model = Qwen2AudioForConditionalGeneration(config=config)
model = model.to(torch.bfloat16)
model = model.cuda()
model = model.eval()
model.generation_config = GenerationConfig.from_pretrained(
    model_id,
    trust_remote_code=True,
)

# Re-initialize every parameter uniformly in [-0.3, 0.3]. The seed is fixed
# and parameters are visited in name order so the result is reproducible.
set_seed(42)
with torch.no_grad():
    _params_by_name = dict(model.named_parameters())
    for _param_name in sorted(_params_by_name):
        torch.nn.init.uniform_(_params_by_name[_param_name], -0.3, 0.3)

# Save model and processor side by side, then list the output directory.
processor = AutoProcessor.from_pretrained(model_id)
model.save_pretrained(save_path)
processor.save_pretrained(save_path)
os.system(f"ls -alh {save_path}")


def try_inference():
    """Smoke-test the checkpoint saved at ``save_path``.

    Reloads the processor and model from ``save_path``, downloads the audio
    clips referenced in a fixed two-turn conversation, runs ``generate`` and
    prints the decoded continuation. Nothing is returned; the function only
    verifies that loading, preprocessing and generation run end to end.
    """
    from io import BytesIO
    from urllib.request import urlopen

    import librosa

    processor = AutoProcessor.from_pretrained(save_path)
    model = Qwen2AudioForConditionalGeneration.from_pretrained(
        save_path, device_map="auto")
    conversation = [
        {"role": "user", "content": [
            {"type": "audio", "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/guess_age_gender.wav"},
        ]},
        {"role": "assistant", "content": "Yes, the speaker is female and in her twenties."},
        {"role": "user", "content": [
            {"type": "audio", "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/translate_to_chinese.wav"},
        ]},
    ]
    text = processor.apply_chat_template(
        conversation, add_generation_prompt=True, tokenize=False)

    # Collect one waveform per audio element, resampled to the rate the
    # feature extractor expects. Plain-string message contents carry no audio.
    sampling_rate = processor.feature_extractor.sampling_rate
    audios = []
    for message in conversation:
        if not isinstance(message["content"], list):
            continue
        for ele in message["content"]:
            if ele["type"] != "audio":
                continue
            # Close the HTTP response deterministically instead of leaking it.
            with urlopen(ele["audio_url"]) as resp:
                payload = resp.read()
            waveform, _ = librosa.load(BytesIO(payload), sr=sampling_rate)
            audios.append(waveform)

    inputs = processor(text=text, audios=audios,
                       return_tensors="pt", padding=True)
    # Move *all* tensors to the model's device. Assigning
    # `inputs.input_ids = ...` only sets an attribute on the dict-like batch;
    # the mapping unpacked by `**inputs` would be left untouched, and the
    # attention mask / audio features would never be moved at all.
    inputs = inputs.to(model.device)
    prompt_length = inputs["input_ids"].size(1)

    # NOTE(review): max_length includes the prompt tokens; confirm 256 is
    # large enough for the prompt produced by the installed processor.
    generate_ids = model.generate(**inputs, max_length=256)
    # Drop the prompt so only newly generated tokens are decoded.
    generate_ids = generate_ids[:, prompt_length:]

    response = processor.batch_decode(
        generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
    print(response)


# Run the inference smoke test against the files just written to save_path.
try_inference()
Downloads last month
70
Safetensors
Model size
5.03M params
Tensor type
BF16
·
Inference Providers NEW
This model is not currently available via any of the supported Inference Providers.

Collection including yujiepan/qwen2-audio-tiny-random