File size: 2,025 Bytes
5ddcfe5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import json
import uuid

import pandas as pd
import tiktoken


# Count the tokens of a string with tiktoken
def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Return the number of tokens ``string`` encodes to.

    Args:
        string: Text to tokenize.
        encoding_name: Name of the tiktoken encoding to look up
            (passed to ``tiktoken.get_encoding``).

    Returns:
        The length of the token sequence produced by the encoding.
    """
    tokenizer = tiktoken.get_encoding(encoding_name)

    # Every special token of the encoding except "<|endoftext|>" is passed as
    # disallowed, so "<|endoftext|>" appearing in the text is not rejected.
    blocked_specials = tokenizer.special_tokens_set - {"<|endoftext|>"}

    token_ids = tokenizer.encode(string, disallowed_special=blocked_specials)
    return len(token_ids)


# Hook for cleaning content, e.g., stripping copyright headers
def remove_copyright_header(content: str) -> str:
    """Return ``content`` as-is.

    No cleaning is implemented yet: this is a placeholder where header
    removal (or any other clean-up) can be added later.

    Args:
        content: Raw document text.

    Returns:
        The same text, unchanged.
    """
    return content


# Function to convert DataFrame to JSONL format with token counting
def convert_to_jsonl_with_conditions(
    df,
    encoding_name="cl100k_base",
    *,
    min_tokens=100,
    max_tokens=200_000,
    retrieve_doc_max_tokens=8000,
):
    """Build JSONL-ready entries from the rows of ``df``.

    Each row must provide the columns ``"text"``, ``"title"`` and
    ``"tai_url"``. Rows are skipped (with a message printed to stdout) when
    their text is missing or when the token count falls outside
    ``[min_tokens, max_tokens]``.

    Args:
        df: DataFrame holding one document per row.
        encoding_name: tiktoken encoding name used for token counting.
        min_tokens: Rows with fewer tokens than this are skipped.
        max_tokens: Rows with more tokens than this are skipped.
        retrieve_doc_max_tokens: Entries with at most this many tokens get
            ``"retrieve_doc": True``; larger ones get ``False``.

    Returns:
        A list of dicts with the keys ``tokens``, ``doc_id``, ``name``,
        ``url``, ``retrieve_doc``, ``source`` and ``content``.
    """
    jsonl_data = []
    for _, row in df.iterrows():
        text = row["text"]

        # Empty CSV cells are loaded by pandas as NaN (a float), which the
        # tokenizer cannot encode; skip such rows instead of crashing.
        if not isinstance(text, str):
            print(f"Skipping {row['title']} due to missing text")
            continue

        # Clean first, then count, so "tokens" always describes the text
        # that is actually stored under "content".
        cleaned_content = remove_copyright_header(text)
        token_count = num_tokens_from_string(cleaned_content, encoding_name)

        # Skip entries based on token count conditions
        if token_count < min_tokens or token_count > max_tokens:
            print(f"Skipping {row['title']} due to token count {token_count}")
            continue

        entry = {
            "tokens": token_count,  # Token count using tiktoken
            "doc_id": str(uuid.uuid4()),  # Generate a unique UUID
            "name": row["title"],
            "url": row["tai_url"],
            "retrieve_doc": (token_count <= retrieve_doc_max_tokens),
            "source": "tai_blog",
            "content": cleaned_content,
        }
        jsonl_data.append(entry)
    return jsonl_data


# Read the source CSV into a DataFrame
data = pd.read_csv("data/tai.csv")

# Turn the rows into filtered, token-counted entries
jsonl_data_with_conditions = convert_to_jsonl_with_conditions(data)

# Write one JSON object per line; json.dumps takes care of proper escaping
output_path = "data/tai_blog_data_conditions.jsonl"
with open(output_path, "w") as f:
    f.writelines(json.dumps(entry) + "\n" for entry in jsonl_data_with_conditions)