googcheng committed on
Commit
319b69b
1 Parent(s): 7091ec2

Upload data_conv.ipynb

Browse files
Files changed (1) hide show
  1. data_conv.ipynb +63 -0
data_conv.ipynb ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 15,
6
+ "id": "c9140a01-4f24-4dc2-8d8f-686f38dd5385",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "path = '/root/autodl-tmp/labeled-recipes/data/train-00000-of-00001-5dd0d415a357ff24.parquet'\n",
11
+ "output_file_name = '/root/autodl-tmp/data/train.jsonl'"
12
+ ]
13
+ },
14
+ {
15
+ "cell_type": "code",
16
+ "execution_count": 16,
17
+ "id": "24d9b3a4-81e1-44c4-a25b-38c2bda4fdac",
18
+ "metadata": {},
19
+ "outputs": [],
20
+ "source": [
21
+ "import pandas as pd\n",
22
+ "\n",
23
+ "# Read the Parquet file into a DataFrame\n",
24
+ "df = pd.read_parquet(path, engine='pyarrow')\n",
25
+ "\n",
26
+ "# Write the DataFrame as JSONL: one JSON object per row, one row per line\n",
27
+ "with open(output_file_name, 'w', encoding='utf-8') as f:\n",
28
+ "    for _, row in df.iterrows():\n",
29
+ "        json_row = row.to_json()\n",
30
+ "        f.write(json_row + '\\n')"
31
+ ]
32
+ },
33
+ {
34
+ "cell_type": "code",
35
+ "execution_count": null,
36
+ "id": "e4c72fcf-a59b-4a6e-be45-f2f66e28bf4a",
37
+ "metadata": {},
38
+ "outputs": [],
39
+ "source": []
40
+ }
41
+ ],
42
+ "metadata": {
43
+ "kernelspec": {
44
+ "display_name": "Python 3 (ipykernel)",
45
+ "language": "python",
46
+ "name": "python3"
47
+ },
48
+ "language_info": {
49
+ "codemirror_mode": {
50
+ "name": "ipython",
51
+ "version": 3
52
+ },
53
+ "file_extension": ".py",
54
+ "mimetype": "text/x-python",
55
+ "name": "python",
56
+ "nbconvert_exporter": "python",
57
+ "pygments_lexer": "ipython3",
58
+ "version": "3.10.8"
59
+ }
60
+ },
61
+ "nbformat": 4,
62
+ "nbformat_minor": 5
63
+ }