|
---
pipeline_tag: text-generation
language:
- multilingual
inference: false
license: cc-by-nc-4.0
library_name: transformers
base_model: jinaai/ReaderLM-v2
tags:
- llama-cpp
- gguf-my-repo
---
|
|
|
# Svngoku/ReaderLM-v2-Q8_0-GGUF |
|
This model was converted to GGUF format from [`jinaai/ReaderLM-v2`](https://huggingface.co./jinaai/ReaderLM-v2) using llama.cpp via ggml.ai's [GGUF-my-repo](https://huggingface.co./spaces/ggml-org/gguf-my-repo) space.
|
Refer to the [original model card](https://huggingface.co./jinaai/ReaderLM-v2) for more details on the model. |
|
|
|
## Use with llama.cpp |
|
Install llama.cpp through Homebrew (works on macOS and Linux):
|
|
|
```bash
brew install llama.cpp
```
|
Invoke the llama.cpp server or the CLI. |
|
|
|
### CLI: |
|
```bash
llama-cli --hf-repo Svngoku/ReaderLM-v2-Q8_0-GGUF --hf-file readerlm-v2-q8_0.gguf -p "The meaning to life and the universe is"
```
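Since ReaderLM-v2 is trained for HTML-to-Markdown conversion rather than open-ended completion, you will usually want to pass an instruction plus the HTML to convert. A minimal sketch (the inline HTML is a made-up example):

```bash
llama-cli --hf-repo Svngoku/ReaderLM-v2-Q8_0-GGUF --hf-file readerlm-v2-q8_0.gguf \
  -p "Extract the main content from the given HTML and convert it to Markdown format: <html><body><h1>Hello</h1><p>World</p></body></html>"
```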
|
|
|
### Server: |
|
```bash
llama-server --hf-repo Svngoku/ReaderLM-v2-Q8_0-GGUF --hf-file readerlm-v2-q8_0.gguf -c 2048
```
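Once the server is up, you can send it requests over HTTP. A minimal sketch, assuming llama-server's default address (`127.0.0.1:8080`) and its OpenAI-compatible chat endpoint; the inline HTML is a made-up example:

```bash
curl -s http://127.0.0.1:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "messages": [
      {"role": "user", "content": "Extract the main content from the given HTML and convert it to Markdown format: <html><body><h1>Hello</h1><p>World</p></body></html>"}
    ],
    "temperature": 0
  }'
```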
|
|
|
Note: You can also use this checkpoint directly through the [usage steps](https://github.com/ggerganov/llama.cpp?tab=readme-ov-file#usage) listed in the llama.cpp repo.
|
|
|
Step 1: Clone llama.cpp from GitHub. |
|
```bash
git clone https://github.com/ggerganov/llama.cpp
```
|
|
|
Step 2: Move into the llama.cpp folder and build it with the `LLAMA_CURL=1` flag, along with any other hardware-specific flags (for example, `LLAMA_CUDA=1` for NVIDIA GPUs on Linux).
|
```bash
cd llama.cpp && LLAMA_CURL=1 make
```
|
|
|
Step 3: Run inference through the main binary. |
|
```bash
./llama-cli --hf-repo Svngoku/ReaderLM-v2-Q8_0-GGUF --hf-file readerlm-v2-q8_0.gguf -p "The meaning to life and the universe is"
```
|
or |
|
```bash
./llama-server --hf-repo Svngoku/ReaderLM-v2-Q8_0-GGUF --hf-file readerlm-v2-q8_0.gguf -c 2048
```
|
|
|
## vLLM Inference
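The notebook below (exported from Google Colab) downloads the GGUF file, loads it with vLLM, fetches a page's raw HTML through Jina Reader, cleans it, and converts it to Markdown or schema-guided JSON. Note that GGUF support in vLLM is experimental, so behavior may vary across versions.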
|
|
|
```py |
|
# -*- coding: utf-8 -*-
# Adapted from a Colab notebook:
# https://colab.research.google.com/drive/1hVqCTm6XLJmrOjkaIYLHXgOTg2ffnhue
|
|
|
!pip install vllm |
|
|
|
model_name = 'Svngoku/ReaderLM-v2-Q8_0-GGUF' # @param ["jinaai/ReaderLM-v2", "jinaai/reader-lm-1.5b", "Svngoku/ReaderLM-v2-Q8_0-GGUF"]
max_model_len = 256000 # @param {type:"integer"}
# @markdown ---
# @markdown ### SamplingParams:
top_k = 1 # @param {type:"integer"}
temperature = 0 # @param {type:"slider", min:0, max:1, step:0.1}
repetition_penalty = 1.05 # @param {type:"number"}
presence_penalty = 0.25 # @param {type:"slider", min:0, max:1, step:0.1}
max_tokens = 8192 # @param {type:"integer"}
# @markdown ---

from vllm import SamplingParams

sampling_params = SamplingParams(
    temperature=temperature,
    top_k=top_k,
    presence_penalty=presence_penalty,
    repetition_penalty=repetition_penalty,
    max_tokens=max_tokens,
)
print('sampling_params', sampling_params)
|
|
|
# Download the GGUF weights and the base model's tokenizer.
!wget https://huggingface.co./Svngoku/ReaderLM-v2-Q8_0-GGUF/resolve/main/readerlm-v2-q8_0.gguf
!wget https://huggingface.co./jinaai/ReaderLM-v2/resolve/main/tokenizer.json

# Alternatively, serve an OpenAI-compatible endpoint instead of using the
# in-process engine below (this command blocks the notebook cell):
# !vllm serve /content/readerlm-v2-q8_0.gguf --tokenizer /content/tokenizer.json

from vllm import LLM

# GGUF support in vLLM is experimental; the tokenizer is loaded from the
# original jinaai/ReaderLM-v2 repo.
llm = LLM(
    model="/content/readerlm-v2-q8_0.gguf",
    max_model_len=max_model_len,
    tokenizer='jinaai/ReaderLM-v2'
)
|
|
|
# @title ## Specify a URL as input{"run":"auto","vertical-output":true}

import re
import requests
from IPython.display import display, Markdown

def display_header(text):
    display(Markdown(f'**{text}**'))

def display_rendered_md(text):
    # mimics "Reading mode" in Safari/Firefox
    display(Markdown(text))

def display_content(text):
    display(Markdown(text))

def get_html_content(url):
    # Fetch the raw HTML of a page through the Jina Reader proxy.
    api_url = f'https://r.jina.ai/{url}'
    headers = {'X-Return-Format': 'html'}
    try:
        response = requests.get(api_url, headers=headers, timeout=10)
        response.raise_for_status()
        return response.text
    except requests.exceptions.RequestException as e:
        return f"error: {str(e)}"
|
|
|
|
|
def create_prompt(text: str, tokenizer=None, instruction: str = None, schema: str = None) -> str:
    """
    Create a prompt for the model with optional instruction and JSON schema.

    Args:
        text (str): The input HTML text
        tokenizer: The tokenizer to use
        instruction (str, optional): Custom instruction for the model
        schema (str, optional): JSON schema for structured extraction

    Returns:
        str: The formatted prompt
    """
    if not tokenizer:
        tokenizer = llm.get_tokenizer()

    if not instruction:
        instruction = "Extract the main content from the given HTML and convert it to Markdown format."

    if schema:
        instruction = 'Extract the specified information from a list of news threads and present it in a structured JSON format.'
        prompt = f"{instruction}\n```html\n{text}\n```\nThe JSON schema is as follows:```json{schema}```"
    else:
        prompt = f"{instruction}\n```html\n{text}\n```"

    messages = [{"role": "user", "content": prompt}]

    return tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
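# For example, create_prompt("<html><body><h1>Hi</h1></body></html>") wraps the
# HTML in the default instruction and applies the model's chat template.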
|
|
|
|
|
|
|
# (REMOVE <SCRIPT> to </script> and variations)
SCRIPT_PATTERN = r'<[ ]*script.*?\/[ ]*script[ ]*>'  # non-greedy match across the tag body

# (REMOVE HTML <STYLE> to </style> and variations)
STYLE_PATTERN = r'<[ ]*style.*?\/[ ]*style[ ]*>'

# (REMOVE HTML <META> to </meta> and variations)
META_PATTERN = r'<[ ]*meta.*?>'

# (REMOVE HTML COMMENTS <!-- to --> and variations)
COMMENT_PATTERN = r'<[ ]*!--.*?--[ ]*>'

# (REMOVE HTML LINK <LINK> to </link> and variations)
LINK_PATTERN = r'<[ ]*link.*?>'

# (REPLACE base64 images)
BASE64_IMG_PATTERN = r'<img[^>]+src="data:image/[^;]+;base64,[^"]+"[^>]*>'

# (REPLACE <svg> to </svg> and variations)
SVG_PATTERN = r'(<svg[^>]*>)(.*?)(<\/svg>)'
|
|
|
|
|
def replace_svg(html: str, new_content: str = "this is a placeholder") -> str:
    return re.sub(
        SVG_PATTERN,
        lambda match: f"{match.group(1)}{new_content}{match.group(3)}",
        html,
        flags=re.DOTALL,
    )

def replace_base64_images(html: str, new_image_src: str = "#") -> str:
    return re.sub(BASE64_IMG_PATTERN, f'<img src="{new_image_src}"/>', html)

def has_base64_images(text: str) -> bool:
    base64_content_pattern = r'data:image/[^;]+;base64,[^"]+'
    return bool(re.search(base64_content_pattern, text, flags=re.DOTALL))

def has_svg_components(text: str) -> bool:
    return bool(re.search(SVG_PATTERN, text, flags=re.DOTALL))
|
|
|
|
|
def clean_html(html: str, clean_svg: bool = False, clean_base64: bool = False):
    flags = re.IGNORECASE | re.MULTILINE | re.DOTALL
    html = re.sub(SCRIPT_PATTERN, '', html, flags=flags)
    html = re.sub(STYLE_PATTERN, '', html, flags=flags)
    html = re.sub(META_PATTERN, '', html, flags=flags)
    html = re.sub(COMMENT_PATTERN, '', html, flags=flags)
    html = re.sub(LINK_PATTERN, '', html, flags=flags)

    if clean_svg:
        html = replace_svg(html)

    if clean_base64:
        html = replace_base64_images(html)

    return html
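# For example, clean_html('<div>hi<script>track()</script></div>') -> '<div>hi</div>'.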
|
|
|
url = "https://news.ycombinator.com/" # @param {type:"string"}

print(f'We will use Jina Reader to fetch the **raw HTML** from: {url}')

html = get_html_content(url)
html = clean_html(html, clean_svg=True, clean_base64=True)

prompt = create_prompt(html)
result = llm.generate(prompt, sampling_params=sampling_params)[0].outputs[0].text.strip()
print(result)
|
|
|
import json

schema = {
    "type": "object",
    "properties": {
        "title": {"type": "string", "description": "News thread title"},
        "url": {"type": "string", "description": "Thread URL"},
        "summary": {"type": "string", "description": "Article summary"},
        "keywords": {"type": "array", "description": "Descriptive keywords"},
        "author": {"type": "string", "description": "Thread author"},
        "comments": {"type": "integer", "description": "Comment count"}
    },
    "required": ["title", "url", "summary", "keywords", "author", "comments"]
}

prompt = create_prompt(html, schema=json.dumps(schema, indent=2))
result = llm.generate(prompt, sampling_params=sampling_params)[0].outputs[0].text.strip()
print(result)
|
|
|
# Tear down the vLLM engine and free GPU memory.
from vllm.distributed.parallel_state import destroy_model_parallel, destroy_distributed_environment
import gc
import torch

destroy_model_parallel()
destroy_distributed_environment()
del llm.llm_engine.model_executor.driver_worker
del llm.llm_engine.model_executor
del llm
gc.collect()
torch.cuda.empty_cache()

print(f"cuda memory: {torch.cuda.memory_allocated() // 1024 // 1024}MB")
|
|
|
``` |
|
|