# omarsol's picture
# Initial commit
# 5ddcfe5
import json
import uuid
import pandas as pd
import tiktoken
# Count the tokens tiktoken produces for a string under a named encoding
def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Return the number of tokens ``string`` encodes to.

    Args:
        string: Text to tokenize.
        encoding_name: Name handed to ``tiktoken.get_encoding``.

    Returns:
        Length of the token sequence produced by the encoding.
    """
    enc = tiktoken.get_encoding(encoding_name)
    # Every special token except "<|endoftext|>" is passed as disallowed.
    # NOTE(review): tiktoken presumably rejects text containing a disallowed
    # special token -- confirm against the tiktoken documentation.
    blocked_specials = enc.special_tokens_set - {"<|endoftext|>"}
    token_ids = enc.encode(string, disallowed_special=blocked_specials)
    return len(token_ids)
# Hook for cleaning or removing specific content, e.g., copyright headers
def remove_copyright_header(content: str) -> str:
    """Return ``content`` unchanged.

    Placeholder: no cleaning logic is implemented yet. Add any header
    stripping here.
    """
    cleaned = content
    return cleaned
# Function to convert DataFrame to JSONL format with token counting
def convert_to_jsonl_with_conditions(df, encoding_name="cl100k_base"):
    """Build JSONL-ready records from article rows, filtered by token count.

    Rows whose text is missing, or whose token count is below 100 or above
    200,000, are skipped and a message is printed for each.

    Args:
        df: DataFrame providing the "text", "title" and "tai_url" columns.
        encoding_name: Encoding name passed to ``num_tokens_from_string``.

    Returns:
        A list of dicts, one per kept row, with the keys "tokens", "doc_id",
        "name", "url", "retrieve_doc", "source" and "content".
    """
    jsonl_data = []
    for _, row in df.iterrows():
        text = row["text"]

        # pandas.read_csv loads empty cells as NaN (a float), which cannot
        # be tokenized; skip such rows instead of aborting the whole run.
        if not isinstance(text, str):
            print(f"Skipping {row['title']} due to missing text")
            continue

        token_count = num_tokens_from_string(text, encoding_name)

        # Skip entries based on token count conditions
        if token_count < 100 or token_count > 200_000:
            print(f"Skipping {row['title']} due to token count {token_count}")
            continue

        cleaned_content = remove_copyright_header(text)
        entry = {
            "tokens": token_count,  # Token count using tiktoken
            "doc_id": str(uuid.uuid4()),  # Fresh random UUID for each record
            "name": row["title"],
            "url": row["tai_url"],
            "retrieve_doc": (token_count <= 8000),  # True only up to 8000 tokens
            "source": "tai_blog",
            "content": cleaned_content,
        }
        jsonl_data.append(entry)
    return jsonl_data
# Read the source articles from CSV
data = pd.read_csv("data/tai.csv")

# Build the filtered, token-annotated records
jsonl_data_with_conditions = convert_to_jsonl_with_conditions(data)

# Write one JSON object per line; json.dumps takes care of escaping
output_path = "data/tai_blog_data_conditions.jsonl"
with open(output_path, "w") as f:
    f.writelines(json.dumps(entry) + "\n" for entry in jsonl_data_with_conditions)