File size: 1,432 Bytes
bee398c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b1bf02b
bee398c
 
 
 
 
 
 
 
b1bf02b
bee398c
b1bf02b
bee398c
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
from datetime import datetime, date
from pathlib import Path
from zoneinfo import ZoneInfo
import pandas as pd


HERE = Path(__file__).parent
JST = ZoneInfo("Asia/Tokyo")


def parse_ja_date(dt: datetime) -> date:
    return dt.astimezone(JST).date()


def parse_csv(filename: str | Path) -> pd.DataFrame:
    df = pd.read_csv(filename,
                    header=None,
                    )
    df.columns = ["id", "day_import", "date_import", "length", "audio", "title_import"]
    df["datetime"] = pd.to_datetime(df["date_import"], format=" %d %b %Y %H:%M:%S %Z")
    df["date"] = df.loc[:, "datetime"].apply(lambda x: parse_ja_date(x))
    df["title"] = df.loc[:, "title_import"].astype("string")
    return df


def ignore_columns(df: pd.DataFrame, limit: str | None) -> pd.DataFrame:
    if limit is not None:
        df_out = df.loc[df.loc[:, "datetime"] > limit, ["id", "date", "length", "audio", "title"]]
    else:
        df_out = df.loc[:, ["id", "date", "length", "audio", "title"]]
    return df_out


def main(input: str | Path, output: str | Path, limit: str | None):
    df = parse_csv(input)
    df_out = ignore_columns(df, limit)
    df_out.to_parquet(output, index=False)


if __name__ == "__main__":
    input = HERE.parent / "data" / "episode-list-202501.csv"
    output = HERE.parent / "store" / "podcast-title-list-202301-202501.parquet"
    limit = "2023-01-01 00:00:00+9:00"
    main(input, output, limit)