"""Gradio demo comparing Reader LM with Markdownify for HTML-to-Markdown."""

import re

import gradio as gr
import spaces
from markdownify import markdownify
from transformers import AutoModelForCausalLM, AutoTokenizer

# Checkpoints offered in the model dropdown.
MODEL_IDS = ("jinaai/reader-lm-0.5b", "jinaai/reader-lm-1.5b")
DEFAULT_MODEL_ID = "jinaai/reader-lm-1.5b"

# Every checkpoint is loaded onto the GPU once, at import time, so that
# requests only pay for generation.
models = {
    model_id: AutoModelForCausalLM.from_pretrained(
        model_id, trust_remote_code=True
    ).eval().to("cuda")
    for model_id in MODEL_IDS
}

tokenizers = {
    model_id: AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    for model_id in MODEL_IDS
}

# Captures the assistant turn of a decoded completion. The closing marker is
# optional (``\Z``) so that a turn cut off by the ``max_new_tokens`` limit is
# still captured instead of producing no match.
_ASSISTANT_TURN = re.compile(
    r"<\|im_start\|>assistant(.*?)(?:<\|im_end\|>|\Z)", re.DOTALL
)
_END_OF_TURN = "<|im_end|>"


def _extract_assistant_response(completion):
    """Return the assistant text contained in a decoded completion.

    Falls back to everything before the first end-of-turn marker when the
    completion carries no assistant header, so the caller always gets a
    string rather than an ``IndexError``.
    """
    match = _ASSISTANT_TURN.search(completion)
    if match is not None:
        return match.group(1)
    return completion.partition(_END_OF_TURN)[0]


@spaces.GPU
def run_example(html_content, model_id=DEFAULT_MODEL_ID):
    """Convert HTML to Markdown with Reader LM and with Markdownify.

    Args:
        html_content: The HTML source to convert.
        model_id: Key into ``models`` / ``tokenizers`` selecting the
            Reader LM checkpoint.

    Returns:
        A ``(reader_lm_markdown, markdownify_markdown)`` tuple of strings.

    Raises:
        KeyError: If ``model_id`` is not one of the loaded checkpoints.
    """
    print("Start Model Processing")
    model = models[model_id]
    tokenizer = tokenizers[model_id]

    messages = [{"role": "user", "content": html_content}]
    input_text = tokenizer.apply_chat_template(messages, tokenize=False)
    inputs = tokenizer.encode(input_text, return_tensors="pt").to("cuda")

    # Greedy decoding: ``do_sample=False`` makes ``temperature`` irrelevant,
    # so it is not passed (passing it only triggers a warning).
    outputs = model.generate(
        inputs,
        max_new_tokens=1024,
        do_sample=False,
        repetition_penalty=1.08,
    )

    # Decode only the newly generated tokens so that chat markers appearing
    # inside the user's HTML cannot be mistaken for the model's answer.
    prompt_length = inputs.shape[-1]
    completion = tokenizer.decode(outputs[0][prompt_length:])
    assistant_response = _extract_assistant_response(completion)

    print("Start Markdownify Processing")
    markdownify_output = markdownify(html_content)
    return assistant_response, markdownify_output


css = """
  #output {
    height: 500px;
    overflow: auto;
    border: 1px solid #ccc;
  }
"""

# NOTE(review): this sample contains no HTML tags; presumably the markup was
# lost somewhere upstream -- confirm and restore the intended snippet.
example_html = """

My To Do List

Add
"""

with gr.Blocks(css=css) as demo:
    gr.Markdown("""
    # HTML-to-Markdown
    Try out model based HTML-to-Markdown with [Reader LM](https://huggingface.co/jinaai/reader-lm-1.5b) and rule based with [Markdownify](https://github.com/matthewwithanm/python-markdownify).
    """)
    with gr.Row():
        with gr.Column():
            model_selector = gr.Dropdown(
                choices=list(models.keys()),
                label="Model",
                value=DEFAULT_MODEL_ID,
            )
            html_content = gr.Textbox(label="HTML")
            submit_btn = gr.Button(value="Submit")
        with gr.Column():
            model_output_text = gr.Textbox(label="Reader LM Output")
            markdownify_output = gr.Textbox(label="Markdownify Output")

    # Only the HTML is supplied by the example, so cached example runs use
    # run_example's default model_id.
    gr.Examples(
        examples=[
            [example_html],
        ],
        inputs=[html_content],
        outputs=[model_output_text, markdownify_output],
        fn=run_example,
        cache_examples=True,
        label="Try examples",
    )

    submit_btn.click(
        run_example,
        [html_content, model_selector],
        [model_output_text, markdownify_output],
    )

demo.launch(debug=True)