---
pipeline_tag: text-generation
language:
- multilingual
inference: false
license: cc-by-nc-4.0
library_name: transformers
base_model: jinaai/ReaderLM-v2
tags:
- llama-cpp
- gguf-my-repo
---

# Svngoku/ReaderLM-v2-Q8_0-GGUF
This model was converted to GGUF format from [`jinaai/ReaderLM-v2`](https://huggingface.co./jinaai/ReaderLM-v2) using llama.cpp via ggml.ai's [GGUF-my-repo](https://huggingface.co./spaces/ggml-org/gguf-my-repo) space.
Refer to the [original model card](https://huggingface.co./jinaai/ReaderLM-v2) for more details on the model.

## Use with llama.cpp
Install llama.cpp through brew (works on Mac and Linux):

```bash
brew install llama.cpp
```

Invoke the llama.cpp server or the CLI.

### CLI:
```bash
llama-cli --hf-repo Svngoku/ReaderLM-v2-Q8_0-GGUF --hf-file readerlm-v2-q8_0.gguf -p "The meaning to life and the universe is"
```

### Server:
```bash
llama-server --hf-repo Svngoku/ReaderLM-v2-Q8_0-GGUF --hf-file readerlm-v2-q8_0.gguf -c 2048
```

Note: You can also use this checkpoint directly through the [usage steps](https://github.com/ggerganov/llama.cpp?tab=readme-ov-file#usage) listed in the llama.cpp repo.

Step 1: Clone llama.cpp from GitHub.
```
git clone https://github.com/ggerganov/llama.cpp
```

Step 2: Move into the llama.cpp folder and build it with the `LLAMA_CURL=1` flag along with any hardware-specific flags (e.g. `LLAMA_CUDA=1` for Nvidia GPUs on Linux).
```
cd llama.cpp && LLAMA_CURL=1 make
```

Step 3: Run inference through the main binary.
```
./llama-cli --hf-repo Svngoku/ReaderLM-v2-Q8_0-GGUF --hf-file readerlm-v2-q8_0.gguf -p "The meaning to life and the universe is"
```
or
```
./llama-server --hf-repo Svngoku/ReaderLM-v2-Q8_0-GGUF --hf-file readerlm-v2-q8_0.gguf -c 2048
```
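Once `llama-server` is running, you can send it chat-completion requests over its OpenAI-compatible HTTP API. The sketch below is a minimal example, not part of the original card: it assumes the server listens on the default port 8080, uses a made-up HTML snippet, and mirrors the HTML-to-Markdown instruction that ReaderLM-v2 expects (the same instruction used in the vLLM notebook further down). Adjust host, port, and content for your setup.

```py
# Minimal sketch: query a locally running llama-server over its OpenAI-compatible API.
# Assumes the server was started as shown above and listens on the default port 8080.
import requests

html = "<html><body><h1>Hello</h1><p>Convert me to Markdown.</p></body></html>"
prompt = f"Extract the main content from the given HTML and convert it to Markdown format.\n```html\n{html}\n```"

response = requests.post(
    "http://localhost:8080/v1/chat/completions",
    json={
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0,
        "max_tokens": 1024,
    },
    timeout=60,
)
print(response.json()["choices"][0]["message"]["content"])
```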
## VLLM Inference
The notebook-style script below downloads the Q8_0 GGUF file and the ReaderLM-v2 tokenizer, loads them with vLLM, fetches raw HTML through Jina Reader, and converts it either to Markdown or to structured JSON.

```py
# Original notebook: https://colab.research.google.com/drive/1hVqCTm6XLJmrOjkaIYLHXgOTg2ffnhue

!pip install vllm

model_name = 'Svngoku/ReaderLM-v2-Q8_0-GGUF'  # @param ["jinaai/ReaderLM-v2", "jinaai/reader-lm-1.5b", "Svngoku/ReaderLM-v2-Q8_0-GGUF"]
max_model_len = 256000  # @param {type:"integer"}

# Sampling parameters
top_k = 1  # @param {type:"integer"}
temperature = 0  # @param {type:"slider", min:0, max:1, step:0.1}
repetition_penalty = 1.05  # @param {type:"number"}
presence_penalty = 0.25  # @param {type:"slider", min:0, max:1, step:0.1}
max_tokens = 8192  # @param {type:"integer"}

from vllm import SamplingParams

sampling_params = SamplingParams(
    temperature=temperature,
    top_k=top_k,
    presence_penalty=presence_penalty,
    repetition_penalty=repetition_penalty,
    max_tokens=max_tokens,
)
print('sampling_params', sampling_params)

# Download the GGUF weights and the ReaderLM-v2 tokenizer.
!wget https://huggingface.co./Svngoku/ReaderLM-v2-Q8_0-GGUF/resolve/main/readerlm-v2-q8_0.gguf
!wget https://huggingface.co./jinaai/ReaderLM-v2/resolve/main/tokenizer.json

# Optionally serve the model behind vLLM's OpenAI-compatible server
# (note: this command blocks the cell; the rest of the notebook uses the offline LLM API instead).
!vllm serve /content/readerlm-v2-q8_0.gguf --tokenizer /content/tokenizer.json

from vllm import LLM

llm = LLM(
    model="/content/readerlm-v2-q8_0.gguf",
    max_model_len=max_model_len,
    tokenizer='jinaai/ReaderLM-v2'
)

# Helpers: fetch raw HTML via Jina Reader and display results.
import re
import requests
from IPython.display import display, Markdown

def display_header(text):
    display(Markdown(f'**{text}**'))

def display_rendered_md(text):
    # mimic "Reading mode" in Safari/Firefox
    display(Markdown(text))

def display_content(text):
    display(Markdown(text))

def get_html_content(url):
    # Fetch the raw HTML of a page through the Jina Reader proxy.
    api_url = f'https://r.jina.ai/{url}'
    headers = {'X-Return-Format': 'html'}
    try:
        response = requests.get(api_url, headers=headers, timeout=10)
        response.raise_for_status()
        return response.text
    except requests.exceptions.RequestException as e:
        return f"error: {str(e)}"

def create_prompt(text: str, tokenizer=None, instruction: str = None, schema: str = None) -> str:
    """
    Create a prompt for the model with optional instruction and JSON schema.

    Args:
        text (str): The input HTML text
        tokenizer: The tokenizer to use
        instruction (str, optional): Custom instruction for the model
        schema (str, optional): JSON schema for structured extraction

    Returns:
        str: The formatted prompt
    """
    if not tokenizer:
        tokenizer = llm.get_tokenizer()

    if not instruction:
        instruction = "Extract the main content from the given HTML and convert it to Markdown format."

    if schema:
        instruction = "Extract the specified information from a list of news threads and present it in a structured JSON format."
        prompt = f"{instruction}\n```html\n{text}\n```\nThe JSON schema is as follows:\n```json\n{schema}\n```"
    else:
        prompt = f"{instruction}\n```html\n{text}\n```"

    messages = [
        {
            "role": "user",
            "content": prompt,
        }
    ]

    return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
# Regex helpers for stripping noisy markup before prompting the model.

# Remove <script>...</script> blocks (and variations).
SCRIPT_PATTERN = r'<[ ]*script.*?\/[ ]*script[ ]*>'

# Remove <style>...</style> blocks (and variations).
STYLE_PATTERN = r'<[ ]*style.*?\/[ ]*style[ ]*>'

# Remove <meta ...> tags (and variations).
META_PATTERN = r'<[ ]*meta.*?>'

# Remove HTML comments <!-- ... --> (and variations).
COMMENT_PATTERN = r'<[ ]*!--.*?--[ ]*>'

# Remove <link ...> tags (and variations).
LINK_PATTERN = r'<[ ]*link.*?>'

# Replace <img> tags with base64-encoded sources.
BASE64_IMG_PATTERN = r'<img[^>]+src="data:image/[^;]+;base64,[^"]+"[^>]*>'

# Replace <svg>...</svg> contents with a placeholder.
SVG_PATTERN = r'(<svg[^>]*>)(.*?)(<\/svg>)'

def replace_svg(html: str, new_content: str = "this is a placeholder") -> str:
    return re.sub(
        SVG_PATTERN,
        lambda match: f"{match.group(1)}{new_content}{match.group(3)}",
        html,
        flags=re.DOTALL,
    )

def replace_base64_images(html: str, new_image_src: str = "#") -> str:
    return re.sub(BASE64_IMG_PATTERN, f'<img src="{new_image_src}"/>', html)

def has_base64_images(text: str) -> bool:
    base64_content_pattern = r'data:image/[^;]+;base64,[^"]+'
    return bool(re.search(base64_content_pattern, text, flags=re.DOTALL))

def has_svg_components(text: str) -> bool:
    return bool(re.search(SVG_PATTERN, text, flags=re.DOTALL))

def clean_html(html: str, clean_svg: bool = False, clean_base64: bool = False):
    html = re.sub(SCRIPT_PATTERN, '', html, flags=(re.IGNORECASE | re.MULTILINE | re.DOTALL))
    html = re.sub(STYLE_PATTERN, '', html, flags=(re.IGNORECASE | re.MULTILINE | re.DOTALL))
    html = re.sub(META_PATTERN, '', html, flags=(re.IGNORECASE | re.MULTILINE | re.DOTALL))
    html = re.sub(COMMENT_PATTERN, '', html, flags=(re.IGNORECASE | re.MULTILINE | re.DOTALL))
    html = re.sub(LINK_PATTERN, '', html, flags=(re.IGNORECASE | re.MULTILINE | re.DOTALL))

    if clean_svg:
        html = replace_svg(html)

    if clean_base64:
        html = replace_base64_images(html)

    return html

# Specify a URL as input
url = "https://news.ycombinator.com/"  # @param {type:"string"}

print(f'We will use Jina Reader to fetch the **raw HTML** from: {url}')

html = get_html_content(url)
html = clean_html(html, clean_svg=True, clean_base64=True)

# HTML to Markdown conversion.
prompt = create_prompt(html)
result = llm.generate(prompt, sampling_params=sampling_params)[0].outputs[0].text.strip()
print(result)

# Structured JSON extraction with a schema.
import json

schema = {
    "type": "object",
    "properties": {
        "title": {"type": "string", "description": "News thread title"},
        "url": {"type": "string", "description": "Thread URL"},
        "summary": {"type": "string", "description": "Article summary"},
        "keywords": {"type": "array", "items": {"type": "string"}, "description": "Descriptive keywords"},
        "author": {"type": "string", "description": "Thread author"},
        "comments": {"type": "integer", "description": "Comment count"}
    },
    "required": ["title", "url", "summary", "author", "comments"]
}

prompt = create_prompt(html, schema=json.dumps(schema, indent=2))
result = llm.generate(prompt, sampling_params=sampling_params)[0].outputs[0].text.strip()
print(result)

# Free GPU memory once inference is done.
from vllm.distributed.parallel_state import destroy_model_parallel, destroy_distributed_environment
import gc
import torch

destroy_model_parallel()
destroy_distributed_environment()
del llm.llm_engine.model_executor.driver_worker
del llm.llm_engine.model_executor
del llm
gc.collect()
torch.cuda.empty_cache()
print(f"cuda memory: {torch.cuda.memory_allocated() // 1024 // 1024}MB")
```
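If you use the `vllm serve` path from the notebook instead of the offline `LLM` API, the model can be queried over the same OpenAI-compatible protocol as the llama.cpp server example above. The sketch below is a minimal, non-authoritative example assuming the server runs locally on vLLM's default port 8000 and that the served model name defaults to the GGUF path passed to `vllm serve`; pass `--served-model-name` (and adjust the `model` field) if your setup differs.

```py
# Minimal sketch: query a local `vllm serve` instance through the openai client.
# Assumptions: default port 8000, no API key configured, served model name = GGUF path.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

html = "<html><body><h1>Hello</h1><p>Convert me to Markdown.</p></body></html>"
prompt = f"Extract the main content from the given HTML and convert it to Markdown format.\n```html\n{html}\n```"

completion = client.chat.completions.create(
    model="/content/readerlm-v2-q8_0.gguf",  # adjust if you set --served-model-name
    messages=[{"role": "user", "content": prompt}],
    temperature=0,
    max_tokens=1024,
)
print(completion.choices[0].message.content)
```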