File size: 1,409 Bytes
c30b770
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import os
import shutil
import logging
import pretty_errors
import huggingface_hub
from datasets import Dataset, load_dataset, disable_caching
import schedule
import time

# Caching is disabled so every scheduled run re-reads the source dataset
# instead of serving stale cached rows.
disable_caching()

# Module logger: INFO and above, emitted to the console.
logger = logging.getLogger("basic_logger")
logger.setLevel(logging.INFO)

console_handler = logging.StreamHandler()
console_handler.setLevel(logging.INFO)
formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
console_handler.setFormatter(formatter)
logger.addHandler(console_handler)

# Source dataset repo on the Hugging Face Hub and the local cache directory
# (relative to the working directory) used while downloading it.
DS_NAME = "amaye15/object-segmentation"
DATA_DIR = "data"


def get_data():
    """Stream the ``train`` split of the source dataset row by row.

    Re-downloads the dataset on every call (``force_redownload``) into the
    local cache directory and yields each example lazily, so the full split
    is never held in memory at once.

    Yields:
        dict: one example from the ``train`` split.
    """
    dataset = load_dataset(
        DS_NAME,
        cache_dir=os.path.join(os.getcwd(), DATA_DIR),
        streaming=True,
        download_mode="force_redownload",
    )
    # Delegate iteration directly to the streaming split.
    yield from dataset["train"]


def process_and_push_data():
    """Rebuild the local cache, materialize the dataset, and push it to the Hub.

    Deletes ``DATA_DIR`` so :func:`get_data` starts from a clean cache,
    collects the streamed rows into an in-memory :class:`Dataset`, and pushes
    the result to the ``amaye15/tmp`` repository on the Hugging Face Hub.

    Side effects: removes and recreates ``DATA_DIR`` under the current
    working directory; performs a network upload via ``push_to_hub``.
    """
    cache_path = os.path.join(os.getcwd(), DATA_DIR)

    # Wipe any previous download so the forced re-download is not mixed
    # with stale cache contents.
    if os.path.exists(cache_path):
        shutil.rmtree(cache_path)
    # makedirs with exist_ok is robust to the directory reappearing between
    # the rmtree above and this call.
    os.makedirs(cache_path, exist_ok=True)

    ds_processed = Dataset.from_generator(get_data)
    ds_processed.push_to_hub("amaye15/tmp")
    # The module logger was configured for exactly this progress message;
    # previously it was commented out and the run produced no log output.
    logger.info("Data processed and pushed to the hub.")


# Guard the entry point: without this, merely importing the module would
# register the job and block forever in the polling loop below.
if __name__ == "__main__":
    # Run the full pipeline once every minute.
    schedule.every(1).minute.do(process_and_push_data)

    # Poll once per second for due jobs; `schedule` runs them synchronously
    # on this thread.
    while True:
        schedule.run_pending()
        time.sleep(1)