import json
import uuid

import pandas as pd
import tiktoken


def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Count the tokens in `string` with the named tiktoken encoding,
    permitting <|endoftext|> in the text while rejecting every other
    special token."""
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(
        encoding.encode(
            string, disallowed_special=(encoding.special_tokens_set - {"<|endoftext|>"})
        )
    )
    return num_tokens
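# Quick illustrative check of the token counter (the expected value assumes
# the cl100k_base encoding used below):
# >>> num_tokens_from_string("Hello, world!", "cl100k_base")
# 4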
def remove_copyright_header(content: str) -> str:
    """Strip the copyright header from a post before export.

    Placeholder: the header format is dataset-specific, so the content
    is returned unchanged here.
    """
    return content
def convert_to_jsonl_with_conditions(df, encoding_name="cl100k_base"):
    """Convert the scraped posts in `df` into JSONL-ready records."""
    jsonl_data = []
    for _, row in df.iterrows():
        token_count = num_tokens_from_string(row["text"], encoding_name)

        # Skip documents outside the accepted token range.
        if token_count < 100 or token_count > 200_000:
            print(f"Skipping {row['title']} due to token count {token_count}")
            continue

        cleaned_content = remove_copyright_header(row["text"])

        entry = {
            "tokens": token_count,
            "doc_id": str(uuid.uuid4()),
            "name": row["title"],
            "url": row["tai_url"],
            # Flag whether the document is small enough to retrieve whole.
            "retrieve_doc": (token_count <= 8000),
            "source": "tai_blog",
            "content": cleaned_content,
        }
        jsonl_data.append(entry)
    return jsonl_data
# Load the scraped posts, filter and convert them, then write one JSON
# object per line.
data = pd.read_csv("data/tai.csv")

jsonl_data_with_conditions = convert_to_jsonl_with_conditions(data)

output_path = "data/tai_blog_data_conditions.jsonl"
with open(output_path, "w") as f:
    for entry in jsonl_data_with_conditions:
        f.write(json.dumps(entry) + "\n")
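# Optional sanity check (not part of the original pipeline): read the JSONL
# back with pandas and confirm the records respect the token bounds above.
check = pd.read_json(output_path, lines=True)
print(len(check), check["tokens"].min(), check["tokens"].max())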