import os
import shutil
import logging
from huggingface_hub import WebhooksServer, WebhookPayload
from datasets import Dataset, load_dataset, disable_caching

# Avoid reusing cached copies of generated datasets, so each webhook
# trigger rebuilds the dataset from the latest data on the Hub
disable_caching()

# Set up the logger
logger = logging.getLogger("basic_logger")
logger.setLevel(logging.INFO)

# Set up the console handler with a simple format
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.INFO)
formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
console_handler.setFormatter(formatter)
logger.addHandler(console_handler)

DS_NAME = "amaye15/object-segmentation"
DATA_DIR = "data"
TARGET_REPO = "amaye15/tmp"


def get_data():
    """
    Generator function to stream data from the dataset.
    """
    ds = load_dataset(
        DS_NAME,
        cache_dir=os.path.join(os.getcwd(), DATA_DIR),
        streaming=True,
        download_mode="force_redownload",
    )
    for row in ds["train"]:
        yield row


def process_and_push_data():
    """
    Function to process and push new data to the target repository.
    """
    p = os.path.join(os.getcwd(), DATA_DIR)

    # Start each run with a clean local data directory
    if os.path.exists(p):
        shutil.rmtree(p)

    os.mkdir(p)

    ds_processed = Dataset.from_generator(get_data)
    ds_processed.push_to_hub(TARGET_REPO)
    logger.info("Data processed and pushed to the hub.")


# Initialize the WebhooksServer
app = WebhooksServer(webhook_secret="my_secret_key")


@app.add_webhook("/trigger_processing")
async def trigger_processing(payload: WebhookPayload):
    """
    Webhook endpoint that triggers data processing when the dataset is updated.
    """
    if payload.repo.type == "dataset" and payload.event.action == "update":
        logger.info(f"Dataset {payload.repo.name} updated. Triggering processing.")
        process_and_push_data()
        return {"message": "Data processing triggered successfully."}
    else:
        logger.info(f"Ignored event: {payload.event.action} on {payload.repo.name}")
        return {"message": "Event ignored."}


# Start the webhook server
app.launch()
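
# Example (sketch): how the endpoint could be exercised locally once the
# server is running. The port, the /webhooks/<path> route prefix and the
# X-Webhook-Secret header name are assumptions based on WebhooksServer
# defaults; check the URL printed by app.launch() for the actual address.
#
#   curl -X POST http://localhost:7860/webhooks/trigger_processing \
#        -H "Content-Type: application/json" \
#        -H "X-Webhook-Secret: my_secret_key" \
#        -d @payload.json   # a JSON body matching the Hub's webhook payload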