File size: 1,290 Bytes
c5e57d6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
import os

from datasets import load_dataset

# Download the dataset from the Hugging Face Hub. The result is iterated with
# .items() below, i.e. it is treated as a mapping of split name -> split.
dataset = load_dataset("phyloforfun/HLT_MICH_Angiospermae_SLTPvC_v1-0_medium_OCR-C25-L25-E50-R05")

# Define the directory where you want to save the files
save_dir = "D:/Dropbox/VoucherVision/datasets/SLTPvC_v1-0_medium_OCR-C25-L25-E50-R05"

# Make sure the target directory exists before exporting, so a fresh machine
# does not fail on the first write after the (potentially slow) download.
# exist_ok=True keeps re-runs against an existing directory working.
os.makedirs(save_dir, exist_ok=True)

# Save each split as its own JSONL file, named after the split, in save_dir
for split, split_dataset in dataset.items():
    split_dataset.to_json(f"{save_dir}/SLTPvC_v1-0_medium_OCR-C25-L25-E50-R05-{split}.jsonl")

# NOTE: everything below is wrapped in a bare triple-quoted string, so it is
# never executed; it is kept only as a reference snippet.
# What the snippet would do if enabled: read the exported train JSONL line by
# line, and for each record write a new JSON object with
#   "input_text"  = record's 'instruction' + ' ' + record's 'input'
#   "target_text" = record's 'output'
# (missing keys default to ''), one object per output line.
# NOTE(review): the paths point at /mnt/data, not at save_dir used above, and
# the "convert to google" remark does not say which consumer this format is
# for -- confirm both before re-enabling.
'''import json # convert to google

# Load the JSONL file
input_file_path = '/mnt/data/SLTPvC_v1-0_medium_OCR-C25-L25-E50-R05-train.jsonl'
output_file_path = '/mnt/data/SLTPvC_v1-0_medium_OCR-C25-L25-E50-R05-train-converted.jsonl'

# Define the conversion function
def convert_record(record):
    return {
        "input_text": record.get('instruction', '') + ' ' + record.get('input', ''),
        "target_text": record.get('output', '')
    }

# Convert and save the new JSONL file
with open(input_file_path, 'r', encoding='utf-8') as infile, open(output_file_path, 'w', encoding='utf-8') as outfile:
    for line in infile:
        record = json.loads(line)
        converted_record = convert_record(record)
        outfile.write(json.dumps(converted_record) + '\n')

output_file_path'''