amaye15 committed on
Commit
5150064
1 Parent(s): 60283f6
Files changed (2)
  1. app.py +22 -7
  2. dev.ipynb +123 -7
app.py CHANGED
@@ -7,6 +7,26 @@ from huggingface_hub import WebhooksServer, WebhookPayload
 from datasets import Dataset, load_dataset, disable_caching
 from fastapi import BackgroundTasks, Response, status
 
+import shutil
+from pathlib import Path
+
+
+def clear_huggingface_cache():
+    # Path to the Hugging Face cache directory
+    cache_dir = Path.home() / ".cache" / "huggingface" / "datasets"
+
+    # Remove the entire datasets directory
+    if cache_dir.exists() and cache_dir.is_dir():
+        shutil.rmtree(cache_dir)
+        print(f"Removed cache directory: {cache_dir}")
+    else:
+        print("Cache directory does not exist.")
+
+
+# Example usage
+clear_huggingface_cache()
+
+
 # Disable caching globally for Hugging Face datasets
 disable_caching()
 
@@ -37,13 +57,11 @@ def get_data():
     """
     ds = load_dataset(
         DS_NAME,
-        cache_dir=DATA_DIR,
         streaming=True,
-        download_mode="force_redownload",
     )
     for row in ds["train"]:
         yield row
-    gc.collect()
+    clear_huggingface_cache()
 
 
 def process_and_push_data():
@@ -53,16 +71,13 @@ def process_and_push_data():
     Removes existing data directory if it exists, recreates it, processes
     the dataset, and pushes the processed dataset to the hub.
     """
-    if DATA_DIR.exists():
-        shutil.rmtree(DATA_DIR)
-    DATA_DIR.mkdir(parents=True, exist_ok=True)
 
     # Process data using the generator and push it to the hub
     ds_processed = Dataset.from_generator(get_data)
     ds_processed.push_to_hub(TARGET_REPO)
 
     logger.info("Data processed and pushed to the hub.")
-    gc.collect()
+    clear_huggingface_cache()
 
 
 # Initialize the WebhooksServer with Gradio interface (if needed)
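
Note on the pattern above: the commit swaps the per-run `data` scratch directory and the `gc.collect()` calls for a wholesale delete of `~/.cache/huggingface/datasets` after each pass. A minimal standalone sketch of the same stream, rebuild, push, clean loop; `DS_NAME` and `TARGET_REPO` are placeholders here, not the Space's real repo IDs:

import shutil
from pathlib import Path

from datasets import Dataset, load_dataset

DS_NAME = "user/source-dataset"         # placeholder repo id
TARGET_REPO = "user/processed-dataset"  # placeholder repo id


def clear_datasets_cache():
    # Same idea as clear_huggingface_cache() in the diff: drop the whole
    # datasets cache so repeated runs don't accumulate Arrow files on disk.
    cache_dir = Path.home() / ".cache" / "huggingface" / "datasets"
    if cache_dir.is_dir():
        shutil.rmtree(cache_dir)


def get_data():
    # Streaming avoids downloading the full source dataset up front;
    # rows are yielded one at a time straight from the Hub.
    ds = load_dataset(DS_NAME, streaming=True)
    for row in ds["train"]:
        yield row


ds_processed = Dataset.from_generator(get_data)  # materializes rows locally
ds_processed.push_to_hub(TARGET_REPO)
clear_datasets_cache()

One caveat worth flagging: `~/.cache/huggingface/datasets` is shared machine-wide, so deleting it mid-run can break other processes reading from the same cache.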
dev.ipynb CHANGED
@@ -2,9 +2,52 @@
 "cells": [
  {
   "cell_type": "code",
-   "execution_count": 28,
+   "execution_count": 7,
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "4b4f99f9ac7940a894807b88d339f866",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating train split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "0ce6f3f8cc7f49f7a42d7b2219a12a7e",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading readme:   0%|          | 0.00/5.24k [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "3d596e0ae9594943905996935ef84329",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Resolving data files:   0%|          | 0/60 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
   "source": [
    "\n",
    "import os\n",
@@ -34,19 +77,92 @@
    "DATA_DIR = \"data\"\n",
    "p = os.path.join(os.getcwd(), DATA_DIR)\n",
    "\n",
-    "if os.path.exists(p):\n",
-    "    shutil.rmtree(p)\n",
+    "# if os.path.exists(p):\n",
+    "#     shutil.rmtree(p)\n",
    "\n",
    "\n",
-    "os.mkdir(p)\n",
+    "# os.mkdir(p)\n",
    "\n",
    "def get_data():\n",
-    "    ds = load_dataset(DS_NAME, cache_dir=p, streaming=True)\n",
+    "    ds = load_dataset(DS_NAME, streaming=True)\n",
    "    for row in ds[\"train\"]:\n",
    "        yield row\n",
    "\n",
+    "\n",
+    "\n",
+    "# def main():\n",
+    "#     ds_processed = Dataset.from_generator(get_data)\n",
+    "#     return\n",
+    "#     # ds_processed.push_to_hub(\"amaye15/tmp\")\n",
+    "\n",
+    "\n",
+    "# main()\n",
+    "# import gc\n",
+    "\n",
    "ds_processed = Dataset.from_generator(get_data)\n",
-    "# ds_processed.push_to_hub(\"amaye15/tmp\")"
+    "\n",
+    "# gc.collect()\n",
+    "# gc.c"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Dataset({\n",
+       "    features: ['image', 'masked_image', 'mask'],\n",
+       "    num_rows: 59\n",
+       "})"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "ds_processed"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Removed cache directory: /Users/andrewmayes/.cache/huggingface/datasets\n"
+     ]
+    }
+   ],
+   "source": [
+    "import shutil\n",
+    "from pathlib import Path\n",
+    "\n",
+    "# Path to the Hugging Face cache directory\n",
+    "cache_dir = Path.home() / \".cache\" / \"huggingface\" / \"datasets\"\n",
+    "\n",
+    "# Remove the entire datasets directory\n",
+    "if cache_dir.exists() and cache_dir.is_dir():\n",
+    "    shutil.rmtree(cache_dir)\n",
+    "    print(f\"Removed cache directory: {cache_dir}\")\n",
+    "else:\n",
+    "    print(\"Cache directory does not exist.\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "/Users/andrewmayes/.cache/huggingface/datasets"
    ]
   },
  {
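
If deleting the shared cache proves too blunt (the notebook output above shows it removing `/Users/andrewmayes/.cache/huggingface/datasets` wholesale), a gentler variant is to point `datasets` at a disposable cache via the documented `HF_DATASETS_CACHE` environment variable and remove only that directory. A sketch under that assumption, with a placeholder repo id:

import os
import shutil
import tempfile

# Must be set before `datasets` is imported for the override to take effect.
scratch = tempfile.mkdtemp(prefix="hf_datasets_cache_")
os.environ["HF_DATASETS_CACHE"] = scratch

from datasets import Dataset, load_dataset  # noqa: E402


def get_data():
    ds = load_dataset("user/source-dataset", streaming=True)  # placeholder
    for row in ds["train"]:
        yield row


ds_processed = Dataset.from_generator(get_data)
# ... push or inspect ds_processed here ...

shutil.rmtree(scratch, ignore_errors=True)  # only this run's cache is touched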