---
pipeline_tag: text-generation
language:
- multilingual
inference: false
license: cc-by-nc-4.0
library_name: transformers
base_model: jinaai/ReaderLM-v2
tags:
- llama-cpp
- gguf-my-repo
---
# Svngoku/ReaderLM-v2-Q8_0-GGUF
This model was converted to GGUF format from [`jinaai/ReaderLM-v2`](https://huggingface.co./jinaai/ReaderLM-v2) using llama.cpp via ggml.ai's [GGUF-my-repo](https://huggingface.co./spaces/ggml-org/gguf-my-repo) space.
Refer to the [original model card](https://huggingface.co./jinaai/ReaderLM-v2) for more details on the model.
## Use with llama.cpp
Install llama.cpp through Homebrew (works on macOS and Linux):
```bash
brew install llama.cpp
```
Invoke the llama.cpp server or the CLI.
### CLI:
```bash
llama-cli --hf-repo Svngoku/ReaderLM-v2-Q8_0-GGUF --hf-file readerlm-v2-q8_0.gguf -p "The meaning to life and the universe is"
```
### Server:
```bash
llama-server --hf-repo Svngoku/ReaderLM-v2-Q8_0-GGUF --hf-file readerlm-v2-q8_0.gguf -c 2048
```
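Once the server is up, you can send requests to its OpenAI-compatible chat completions endpoint. The snippet below is a minimal sketch (not part of the original card); it assumes the server is listening on the default `http://127.0.0.1:8080` and uses a small inline HTML snippet as the prompt:
```python
import requests

# Hypothetical example: one chat-completion request to the local llama-server.
html_snippet = "<html><body><h1>Hello</h1><p>ReaderLM-v2 converts HTML to Markdown.</p></body></html>"
payload = {
    "messages": [
        {
            "role": "user",
            "content": (
                "Extract the main content from the given HTML and convert it to Markdown format.\n"
                f"```html\n{html_snippet}\n```"
            ),
        }
    ],
    "temperature": 0,
    "max_tokens": 512,
}
response = requests.post("http://127.0.0.1:8080/v1/chat/completions", json=payload, timeout=120)
response.raise_for_status()
print(response.json()["choices"][0]["message"]["content"])
```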
Note: You can also use this checkpoint directly through the [usage steps](https://github.com/ggerganov/llama.cpp?tab=readme-ov-file#usage) listed in the llama.cpp repo.
Step 1: Clone llama.cpp from GitHub.
```bash
git clone https://github.com/ggerganov/llama.cpp
```
Step 2: Move into the llama.cpp folder and build it with the `LLAMA_CURL=1` flag, along with any other hardware-specific flags (for example, `LLAMA_CUDA=1` for NVIDIA GPUs on Linux).
```bash
cd llama.cpp && LLAMA_CURL=1 make
```
Step 3: Run inference through the main binary.
```bash
./llama-cli --hf-repo Svngoku/ReaderLM-v2-Q8_0-GGUF --hf-file readerlm-v2-q8_0.gguf -p "The meaning to life and the universe is"
```
or
```bash
./llama-server --hf-repo Svngoku/ReaderLM-v2-Q8_0-GGUF --hf-file readerlm-v2-q8_0.gguf -c 2048
```
## vLLM Inference
```py
!pip install vllm
model_name = 'Svngoku/ReaderLM-v2-Q8_0-GGUF' # @param ["jinaai/ReaderLM-v2", "jinaai/reader-lm-1.5b", "Svngoku/ReaderLM-v2-Q8_0-GGUF"]
max_model_len = 256000 # @param {type:"integer"}
# @markdown ---
# @markdown ### SamplingParams:
top_k = 1 # @param {type:"integer"}
temperature = 0 # @param {type:"slider", min:0, max:1, step:0.1}
repetition_penalty = 1.05 # @param {type:"number"}
presence_penalty = 0.25 # @param {type:"slider", min:0, max:1, step:0.1}
max_tokens = 8192 # @param {type:"integer"}
# @markdown ---
from vllm import SamplingParams
sampling_params = SamplingParams(temperature=temperature, top_k=top_k, presence_penalty=presence_penalty, repetition_penalty=repetition_penalty, max_tokens=max_tokens)
print('sampling_params', sampling_params)
!wget https://huggingface.co./Svngoku/ReaderLM-v2-Q8_0-GGUF/resolve/main/readerlm-v2-q8_0.gguf
!wget https://huggingface.co./jinaai/ReaderLM-v2/resolve/main/tokenizer.json
!vllm serve /content/readerlm-v2-q8_0.gguf --tokenizer /content/tokenizer.json
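# Note: `vllm serve` above launches a standalone OpenAI-compatible HTTP server
# (by default on port 8000) and blocks the notebook cell, so it is usually run
# in a separate terminal or process. The cells below take the other route and
# load the model in-process with `vllm.LLM`, so you only need one of the two.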
from vllm import LLM

llm = LLM(
    model="/content/readerlm-v2-q8_0.gguf",
    max_model_len=max_model_len,
    tokenizer='jinaai/ReaderLM-v2'
)
# @title ## Specify a URL as input{"run":"auto","vertical-output":true}
import re
import requests
from IPython.display import display, Markdown

def display_header(text):
    display(Markdown(f'**{text}**'))

def display_rendered_md(text):
    # mimics "Reading mode" in Safari/Firefox
    display(Markdown(text))

def display_content(text):
    display(Markdown(text))

def get_html_content(url):
    # Fetch the raw HTML of a page via the Jina Reader API
    api_url = f'https://r.jina.ai/{url}'
    headers = {'X-Return-Format': 'html'}
    try:
        response = requests.get(api_url, headers=headers, timeout=10)
        response.raise_for_status()
        return response.text
    except requests.exceptions.RequestException as e:
        return f"error: {str(e)}"
def create_prompt(text: str, tokenizer=None, instruction: str = None, schema: str = None) -> str:
    """
    Create a prompt for the model with optional instruction and JSON schema.

    Args:
        text (str): The input HTML text
        tokenizer: The tokenizer to use
        instruction (str, optional): Custom instruction for the model
        schema (str, optional): JSON schema for structured extraction

    Returns:
        str: The formatted prompt
    """
    if not tokenizer:
        tokenizer = llm.get_tokenizer()
    if not instruction:
        instruction = "Extract the main content from the given HTML and convert it to Markdown format."
    if schema:
        instruction = "Extract the specified information from a list of news threads and present it in a structured JSON format."
        prompt = f"{instruction}\n```html\n{text}\n```\nThe JSON schema is as follows:```json{schema}```"
    else:
        prompt = f"{instruction}\n```html\n{text}\n```"
    messages = [
        {
            "role": "user",
            "content": prompt,
        }
    ]
    return tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
# Regex patterns used by clean_html() below
# (applied with flags=re.IGNORECASE | re.MULTILINE | re.DOTALL).
SCRIPT_PATTERN = r'<[ ]*script.*?\/[ ]*script[ ]*>'  # <script> ... </script> and variations
STYLE_PATTERN = r'<[ ]*style.*?\/[ ]*style[ ]*>'     # <style> ... </style> and variations
META_PATTERN = r'<[ ]*meta.*?>'                      # <meta ...> tags and variations
COMMENT_PATTERN = r'<[ ]*!--.*?--[ ]*>'              # <!-- ... --> HTML comments
LINK_PATTERN = r'<[ ]*link.*?>'                      # <link ...> tags and variations
BASE64_IMG_PATTERN = r'<img[^>]+src="data:image/[^;]+;base64,[^"]+"[^>]*>'  # base64-encoded <img> tags
SVG_PATTERN = r'(<svg[^>]*>)(.*?)(<\/svg>)'          # <svg> ... </svg> blocks
def replace_svg(html: str, new_content: str = "this is a placeholder") -> str:
    return re.sub(
        SVG_PATTERN,
        lambda match: f"{match.group(1)}{new_content}{match.group(3)}",
        html,
        flags=re.DOTALL,
    )

def replace_base64_images(html: str, new_image_src: str = "#") -> str:
    return re.sub(BASE64_IMG_PATTERN, f'<img src="{new_image_src}"/>', html)

def has_base64_images(text: str) -> bool:
    base64_content_pattern = r'data:image/[^;]+;base64,[^"]+'
    return bool(re.search(base64_content_pattern, text, flags=re.DOTALL))

def has_svg_components(text: str) -> bool:
    return bool(re.search(SVG_PATTERN, text, flags=re.DOTALL))

def clean_html(html: str, clean_svg: bool = False, clean_base64: bool = False):
    html = re.sub(SCRIPT_PATTERN, '', html, flags=(re.IGNORECASE | re.MULTILINE | re.DOTALL))
    html = re.sub(STYLE_PATTERN, '', html, flags=(re.IGNORECASE | re.MULTILINE | re.DOTALL))
    html = re.sub(META_PATTERN, '', html, flags=(re.IGNORECASE | re.MULTILINE | re.DOTALL))
    html = re.sub(COMMENT_PATTERN, '', html, flags=(re.IGNORECASE | re.MULTILINE | re.DOTALL))
    html = re.sub(LINK_PATTERN, '', html, flags=(re.IGNORECASE | re.MULTILINE | re.DOTALL))
    if clean_svg:
        html = replace_svg(html)
    if clean_base64:
        html = replace_base64_images(html)
    return html
url = "https://news.ycombinator.com/" # @param {type:"string"}
print(f'We will use Jina Reader to fetch the **raw HTML** from: {url}')
html = get_html_content(url)
html = clean_html(html, clean_svg=True, clean_base64=True)
prompt = create_prompt(html)
result = llm.generate(prompt, sampling_params=sampling_params)[0].outputs[0].text.strip()
print(result)
import json

schema = {
    "type": "object",
    "properties": {
        "title": {"type": "string", "description": "News thread title"},
        "url": {"type": "string", "description": "Thread URL"},
        "summary": {"type": "string", "description": "Article summary"},
        "keywords": {"type": "array", "items": {"type": "string"}, "description": "Descriptive keywords"},
        "author": {"type": "string", "description": "Thread author"},
        "comments": {"type": "integer", "description": "Comment count"}
    },
    "required": ["title", "url", "summary", "keywords", "author", "comments"]
}
prompt = create_prompt(html, schema=json.dumps(schema, indent=2))
result = llm.generate(prompt, sampling_params=sampling_params)[0].outputs[0].text.strip()
print(result)
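# Illustrative extra (not in the original notebook): run the same pipeline on a
# local HTML string instead of a fetched URL, reusing the helpers defined above.
local_html = """
<html><body>
  <h1>Release notes</h1>
  <p>Version 2.0 adds streaming output and fixes several parser bugs.</p>
</body></html>
"""
local_prompt = create_prompt(clean_html(local_html, clean_svg=True, clean_base64=True))
local_result = llm.generate(local_prompt, sampling_params=sampling_params)[0].outputs[0].text.strip()
display_content(local_result)  # render the generated Markdown inline in the notebook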
# Tear down the vLLM engine and free GPU memory
from vllm.distributed.parallel_state import destroy_model_parallel, destroy_distributed_environment
import gc
import os
import torch
destroy_model_parallel()
destroy_distributed_environment()
del llm.llm_engine.model_executor.driver_worker
del llm.llm_engine.model_executor
del llm
gc.collect()
torch.cuda.empty_cache()
print(f"cuda memory: {torch.cuda.memory_allocated() // 1024 // 1024}MB")
```