burtenshaw HF staff committed on
Commit
cdb761d
β€’
1 Parent(s): f92d1a9

Upload 5 files

Browse files
Files changed (4) hide show
  1. app.py +90 -66
  2. hub.py +32 -98
  3. project_config.json +1 -1
  4. seed_data.json +2 -26
app.py CHANGED
@@ -1,94 +1,118 @@
1
- import streamlit as st
2
 
3
- from defaults import (
4
- PROJECT_NAME,
5
- ARGILLA_SPACE_REPO_ID,
6
- DATASET_REPO_ID,
7
- ARGILLA_URL,
8
- PROJECT_SPACE_REPO_ID,
9
- DIBT_PARENT_APP_URL,
10
  )
11
- from utils import project_sidebar
12
 
13
- st.set_page_config("Domain Data Grower", page_icon="πŸ§‘β€πŸŒΎ")
14
 
15
- project_sidebar()
16
 
17
- if PROJECT_NAME == "DEFAULT_DOMAIN":
18
- st.warning(
19
- "Please set up the project configuration in the parent app before proceeding."
20
- )
21
- st.stop()
22
 
 
 
 
23
 
24
  st.header("πŸ§‘β€πŸŒΎ Domain Data Grower")
25
  st.divider()
26
 
27
- st.markdown(
28
- """
29
- ## 🌱 Create a dataset seed for aligning models to a specific domain
30
-
31
- This app helps you create a dataset seed for building diverse domain-specific datasets for aligning models.
32
- Alignment datasets are used to fine-tune models to a specific domain or task, but as yet, there's a shortage of diverse datasets for this purpose.
33
- """
34
- )
35
- st.markdown(
36
- """
37
- ## 🚜 How it works
38
-
39
- You can create a dataset seed by defining the domain expertise, perspectives, topics, and examples for your domain-specific dataset.
40
- The dataset seed is then used to generate synthetic data for training a language model.
41
-
42
- """
43
  )
44
- st.markdown(
45
- """
46
- ## πŸ—ΊοΈ The process
47
 
48
- ### Step 1: ~~Setup the project~~
 
 
49
 
50
- ~~Define the project details, including the project name, domain, and API credentials. Create Dataset Repo on the Hub.~~
51
- """
52
- )
53
- st.link_button("πŸš€ ~~Setup Project via the parent app~~", DIBT_PARENT_APP_URL)
54
 
55
  st.markdown(
56
- """
57
- ### Step 2: Describe the Domain
 
 
 
58
 
59
- Define the domain expertise, perspectives, topics, and examples for your domain-specific dataset.
60
- You can collaborate with domain experts to define the domain expertise and perspectives.
61
  """
62
  )
63
 
64
  st.page_link(
65
- "pages/2_πŸ‘©πŸΌβ€πŸ”¬ Describe Domain.py",
66
- label="Describe Domain",
67
- icon="πŸ‘©πŸΌβ€πŸ”¬",
68
  )
69
 
70
- st.markdown(
71
- """
72
- ### Step 3: Generate Synthetic Data
73
 
74
- Use distilabel to generate synthetic data for your domain-specific dataset.
75
- You can run the pipeline locally or in this space to generate synthetic data.
76
- """
77
- )
78
 
79
- st.page_link(
80
- "pages/3_🌱 Generate Dataset.py",
81
- label="Generate Dataset",
82
- icon="🌱",
83
- )
84
 
85
- st.markdown(
86
- """
87
- ### Step 4: Review the Dataset
88
 
89
- Use Argilla to review the generated synthetic data and provide feedback on the quality of the data.
 
 
 
90
 
 
 
 
91
 
92
- """
93
- )
94
- st.link_button("πŸ” Review the dataset in Argilla", ARGILLA_URL)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
 
3
+ from hub import (
4
+ setup_dataset_on_hub,
5
+ duplicate_space_on_hub,
6
+ add_project_config_to_space_repo,
 
 
 
7
  )
 
8
 
9
+ import streamlit as st
10
 
 
11
 
12
+ # Constants
13
+ # Written here to avoid defaults.py
14
+ DEFAULT_DOMAIN = "farming"
 
 
15
 
16
+ st.set_page_config(
17
+ "Domain Data Grower", page_icon="πŸ§‘β€πŸŒΎ", initial_sidebar_state="collapsed"
18
+ )
19
 
20
  st.header("πŸ§‘β€πŸŒΎ Domain Data Grower")
21
  st.divider()
22
 
23
+ st.sidebar.link_button(
24
+ "πŸ€— Get your Hub Token", "https://huggingface.co/settings/tokens"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  )
 
 
 
26
 
27
+ ################################################################################
28
+ # APP MARKDOWN
29
+ ################################################################################
30
 
31
+ st.header("🌱 Create a domain specific dataset")
 
 
 
32
 
33
  st.markdown(
34
+ """This space will set up your domain specific dataset project. It will
35
+ create the resources that you need to build a dataset. Those resources include:
36
+
37
+ - A dataset repository on the Hub
38
+ - Another space to define expert domain and run generation pipelines
39
 
40
+ For a complete overview of the project. Check out the README
 
41
  """
42
  )
43
 
44
  st.page_link(
45
+ "pages/πŸ§‘β€πŸŒΎ Domain Data Grower.py",
46
+ label="Domain Data Grower",
47
+ icon="πŸ§‘β€πŸŒΎ",
48
  )
49
 
50
+ ################################################################################
51
+ # CONFIGURATION
52
+ ################################################################################
53
 
54
+ st.subheader("🌾 Project Configuration")
 
 
 
55
 
56
+ project_name = st.text_input("Project Name", DEFAULT_DOMAIN)
57
+ hub_username = st.text_input("Hub Username", "argilla")
58
+ hub_token = st.text_input("Hub Token", type="password")
59
+ private_selector = st.checkbox("Private Space", value=False)
 
60
 
61
+ if st.button("πŸ€— Setup Project Resources"):
62
+ repo_id = f"{hub_username}/{project_name}"
 
63
 
64
+ setup_dataset_on_hub(
65
+ repo_id=repo_id,
66
+ hub_token=hub_token,
67
+ )
68
 
69
+ st.success(
70
+ f"Dataset seed created and pushed to the Hub. Check it out [here](https://huggingface.co/datasets/{hub_username}/{project_name}). Hold on the repo_id: {repo_id}, we will need it in the next steps."
71
+ )
72
 
73
+ space_name = f"{project_name}_config_space"
74
+
75
+ duplicate_space_on_hub(
76
+ source_repo="argilla/domain-specific-datasets-template",
77
+ target_repo=space_name,
78
+ hub_token=hub_token,
79
+ private=private_selector,
80
+ )
81
+
82
+ st.success(
83
+ f"Configuration Space created. Check it out [here](https://huggingface.co/spaces/{hub_username}/{space_name})."
84
+ )
85
+
86
+ argilla_name = f"{project_name}_argilla_space"
87
+
88
+ duplicate_space_on_hub(
89
+ source_repo="argilla/argilla-template-space",
90
+ target_repo=argilla_name,
91
+ hub_token=hub_token,
92
+ private=private_selector,
93
+ )
94
+
95
+ st.success(
96
+ f"Argilla Space created. Check it out [here](https://huggingface.co/spaces/{hub_username}/{argilla_name})."
97
+ )
98
+
99
+ seconds = 5
100
+
101
+ with st.spinner(f"Adding project configuration to spaces in {seconds} seconds"):
102
+ time.sleep(seconds)
103
+ add_project_config_to_space_repo(
104
+ dataset_repo_id=repo_id,
105
+ hub_token=hub_token,
106
+ project_name=project_name,
107
+ argilla_space_repo_id=f"{hub_username}/{argilla_name}",
108
+ project_space_repo_id=f"{hub_username}/{space_name}",
109
+ )
110
+
111
+ st.subheader("πŸ‘’ Next Steps")
112
+
113
+ st.write("Go to you project specific space!")
114
+
115
+ st.link_button(
116
+ "πŸ§‘β€πŸŒΎ Open Configuration Space",
117
+ f"https://huggingface.co/spaces/{hub_username}/{space_name}",
118
+ )
hub.py CHANGED
@@ -1,43 +1,10 @@
1
  import json
2
- from tempfile import mktemp
3
 
4
- import argilla as rg
5
- from huggingface_hub import HfApi
6
-
7
- from defaults import REMOTE_CODE_PATHS, SEED_DATA_PATH
8
 
9
 
10
  hf_api = HfApi()
11
 
12
- with open("DATASET_README_BASE.md") as f:
13
- DATASET_README_BASE = f.read()
14
-
15
-
16
- def create_readme(domain_seed_data, project_name, domain):
17
- # create a readme for the project that shows the domain and project name
18
- readme = DATASET_README_BASE
19
- readme += f"# {project_name}\n\n## Domain: {domain}"
20
- perspectives = domain_seed_data.get("perspectives")
21
- topics = domain_seed_data.get("topics")
22
- examples = domain_seed_data.get("examples")
23
- if perspectives:
24
- readme += "\n\n## Perspectives\n\n"
25
- for p in perspectives:
26
- readme += f"- {p}\n"
27
- if topics:
28
- readme += "\n\n## Topics\n\n"
29
- for t in topics:
30
- readme += f"- {t}\n"
31
- if examples:
32
- readme += "\n\n## Examples\n\n"
33
- for example in examples:
34
- readme += f"### {example['question']}\n\n{example['answer']}\n\n"
35
- temp_file = mktemp()
36
-
37
- with open(temp_file, "w") as f:
38
- f.write(readme)
39
- return temp_file
40
-
41
 
42
  def setup_dataset_on_hub(repo_id, hub_token):
43
  # create an empty dataset repo on the hub
@@ -45,85 +12,52 @@ def setup_dataset_on_hub(repo_id, hub_token):
45
  repo_id=repo_id,
46
  token=hub_token,
47
  repo_type="dataset",
48
- exist_ok=True,
49
  )
50
 
51
-
52
- def push_dataset_to_hub(
53
- domain_seed_data_path,
54
- project_name,
55
- domain,
56
- pipeline_path,
57
- hub_username,
58
- hub_token: str,
59
- ):
60
- repo_id = f"{hub_username}/{project_name}"
61
-
62
- setup_dataset_on_hub(repo_id=repo_id, hub_token=hub_token)
63
-
64
- # upload the seed data and readme to the hub
65
  hf_api.upload_file(
66
- path_or_fileobj=domain_seed_data_path,
67
  path_in_repo="seed_data.json",
68
- token=hub_token,
69
  repo_id=repo_id,
70
  repo_type="dataset",
 
71
  )
72
 
73
- # upload the readme to the hub
74
- domain_seed_data = json.load(open(domain_seed_data_path))
75
- hf_api.upload_file(
76
- path_or_fileobj=create_readme(
77
- domain_seed_data=domain_seed_data, project_name=project_name, domain=domain
78
- ),
79
- path_in_repo="README.md",
80
  token=hub_token,
81
- repo_id=repo_id,
82
- repo_type="dataset",
83
  )
84
 
85
 
86
- def push_pipeline_to_hub(
87
- pipeline_path,
88
- hub_username,
89
- hub_token: str,
90
  project_name,
 
 
91
  ):
92
- repo_id = f"{hub_username}/{project_name}"
93
-
94
- # upload the pipeline to the hub
95
- hf_api.upload_file(
96
- path_or_fileobj=pipeline_path,
97
- path_in_repo="pipeline.yaml",
98
- token=hub_token,
99
- repo_id=repo_id,
100
- repo_type="dataset",
101
- )
102
 
103
- for code_path in REMOTE_CODE_PATHS:
104
- hf_api.upload_file(
105
- path_or_fileobj=code_path,
106
- path_in_repo=code_path,
107
- token=hub_token,
108
- repo_id=repo_id,
109
- repo_type="dataset",
 
 
110
  )
111
 
112
- print(f"Dataset uploaded to {repo_id}")
113
-
114
-
115
- def pull_seed_data_from_repo(repo_id, hub_token):
116
- # pull the dataset repo from the hub
117
- hf_api.hf_hub_download(
118
- repo_id=repo_id, token=hub_token, repo_type="dataset", filename=SEED_DATA_PATH
119
  )
120
- return json.load(open(SEED_DATA_PATH))
121
-
122
-
123
- def push_argilla_dataset_to_hub(
124
- name: str, repo_id: str, url: str, api_key: str, workspace: str = "admin"
125
- ):
126
- rg.init(api_url=url, api_key=api_key)
127
- feedback_dataset = rg.FeedbackDataset.from_argilla(name=name, workspace=workspace)
128
- local_dataset = feedback_dataset.pull()
129
- local_dataset.push_to_huggingface(repo_id=repo_id)
 
1
  import json
 
2
 
3
+ from huggingface_hub import duplicate_space, HfApi
 
 
 
4
 
5
 
6
  hf_api = HfApi()
7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
9
def setup_dataset_on_hub(repo_id, hub_token):
    """Create the project's dataset repo on the Hub and push the seed data.

    Args:
        repo_id: Full dataset repo id, i.e. "<username>/<project_name>".
        hub_token: Hub access token used for both API calls.
    """
    # Create an empty dataset repo on the hub.
    hf_api.create_repo(
        repo_id=repo_id,
        token=hub_token,
        repo_type="dataset",
    )

    # Push the local seed data file into the freshly created repo.
    hf_api.upload_file(
        path_or_fileobj="seed_data.json",
        path_in_repo="seed_data.json",
        repo_id=repo_id,
        repo_type="dataset",
        token=hub_token,
    )
25
 
26
+
27
def duplicate_space_on_hub(source_repo, target_repo, hub_token, private=False):
    """Duplicate an existing Hub Space into the authenticated user's namespace.

    Args:
        source_repo: Id of the template Space to copy.
        target_repo: Id (or bare name) of the copy to create.
        hub_token: Hub access token; also determines the target namespace.
        private: Whether the duplicated Space should be private.
    """
    # exist_ok so re-running the setup does not fail on an already-created copy.
    copy_kwargs = {
        "from_id": source_repo,
        "to_id": target_repo,
        "token": hub_token,
        "private": private,
        "exist_ok": True,
    }
    duplicate_space(**copy_kwargs)
35
 
36
 
37
def add_project_config_to_space_repo(
    dataset_repo_id,
    hub_token,
    project_name,
    argilla_space_repo_id,
    project_space_repo_id,
):
    """Write project_config.json locally and upload it to the project Space.

    Args:
        dataset_repo_id: Dataset repo the project pushes data to.
        hub_token: Hub access token used for the upload.
        project_name: Human-readable project name.
        argilla_space_repo_id: Id of the duplicated Argilla Space.
        project_space_repo_id: Id of the duplicated configuration Space;
            also the upload target.
    """
    config = {
        "project_name": project_name,
        "argilla_space_repo_id": argilla_space_repo_id,
        "project_space_repo_id": project_space_repo_id,
        "dataset_repo_id": dataset_repo_id,
    }

    # Serialize the config next to the app, then push it to the Space repo.
    with open("project_config.json", "w") as f:
        f.write(json.dumps(config))

    hf_api.upload_file(
        path_or_fileobj="project_config.json",
        path_in_repo="project_config.json",
        token=hub_token,
        repo_id=project_space_repo_id,
        repo_type="space",
    )
 
 
 
 
 
 
 
 
 
 
project_config.json CHANGED
@@ -1 +1 @@
1
- {"project_name": "domain_test_4", "argilla_space_repo_id": "burtenshaw/domain_test_4_argilla_space", "project_space_repo_id": "burtenshaw/domain_test_4_config_space", "dataset_repo_id": "burtenshaw/domain_test_4"}
 
1
+ {"project_name": "farming", "argilla_space_repo_id": "ignacioct/farming_argilla_space", "project_space_repo_id": "ignacioct/farming_config_space", "dataset_repo_id": "ignacioct/farming"}
seed_data.json CHANGED
@@ -1,39 +1,15 @@
1
  {
2
  "domain": "farming",
3
  "perspectives": [
4
- "Family Farming",
5
- "Agribusiness",
6
- "Permaculture",
7
- "Agroforestery",
8
- "Conventional Farming"
9
  ],
10
  "topics": [
11
- "animal welfare",
12
- "economic growth",
13
- "land",
14
- "resources",
15
- "efficiency"
16
  ],
17
  "examples": [
18
  {
19
  "question": "Compare and contrast the environmental footprint of industrial and small-scale farming.",
20
  "answer": "Regenerative agriculture practices aim to restore soil health through methods that increase soil organic matter, enhance microbial activity, and improve soil structure. These practices include no-till farming, cover cropping, diverse crop rotations, and integrated livestock management. According to LaCanne and Lundgren (2018), soil health improves due to increased biodiversity and organic matter, enhancing its water retention and nutrient efficiency. Moreover, Jones (2012) in \"Soil carbon & organic farming\" reports that these practices significantly elevate biodiversity, both above and below the soil surface, promoting resilient ecosystems and agroecological balances."
21
- },
22
- {
23
- "question": "Compare the environmental footprint of small-scale, local farming versus large-scale, industrial agriculture.",
24
- "answer": "Industrial agriculture typically emphasizes high-output, monoculture farming reliant on synthetic fertilizers and pesticides, which, as Horrigan, Lawrence, and Walker (2002) argue, leads to greater greenhouse gas emissions, higher energy use, and more water consumption compared to small-scale farming. In contrast, small-scale farms often employ diverse cropping systems and lower chemical inputs, resulting in a smaller environmental footprint. Pimentel et al. (2005) note that small-scale farms tend to have higher yields per unit area when environmental and sustainability factors are integrated into farming practices."
25
- },
26
- {
27
- "question": "Analyze the economic implications of transitioning from conventional to organic farming.",
28
- "answer": "Transitioning from conventional to organic farming involves significant changes in farm management, input use, and market engagement. Crowder and Reganold (2015) present evidence that organic farms often yield smaller outputs initially but achieve higher profitability due to premium prices, lower input costs, and improved soil health over time. However, this transition requires upfront investments in knowledge and infrastructure, which can be economically challenging for some farmers, as noted by Seufert and Ramankutty (2017)."
29
- },
30
- {
31
- "question": "Analyze the social, economic and environnmental impacts of land consolidation vs small-scale farmers.",
32
- "answer": "Land consolidation has been associated with increased agricultural productivity but also with negative social and environmental impacts. Larger land holdings typically lead to monocultures, which reduce biodiversity and increase vulnerability to pests and diseases, as highlighted by Li et al. (2017). Economically, while consolidation can lead to economies of scale and potential gains in gross margins, it often displaces rural populations, exacerbating poverty and reducing local food diversity (Sutherland et al., 2015)."
33
- },
34
- {
35
- "question": "Investigate the relationship between land ownership patterns, agricultural productivity and environment sustainability. ",
36
- "answer": "Land ownership patterns critically influence agricultural productivity and sustainability. Secure land tenure supports investments in long-term improvements such as soil conservation and water management, which are pivotal for sustainable outcomes. Studies by Barrett et al. (2010) demonstrate that fragmented land ownership often results in inefficient resource use and higher transaction costs, which can detract from sustainability goals."
37
  }
38
  ],
39
  "domain_expert_prompt": "You will be asked about family farming and agribusiness related topics, from different perspectives.\n Your answer should be logical and supported by facts, don't fabricate arguments. \n Try to gather a diverse point of view taking into account current theories in agronomy, biology, economics, anthropology and ecology."
 
1
  {
2
  "domain": "farming",
3
  "perspectives": [
4
+ "Family Farming"
 
 
 
 
5
  ],
6
  "topics": [
7
+ "animal welfare"
 
 
 
 
8
  ],
9
  "examples": [
10
  {
11
  "question": "Compare and contrast the environmental footprint of industrial and small-scale farming.",
12
  "answer": "Regenerative agriculture practices aim to restore soil health through methods that increase soil organic matter, enhance microbial activity, and improve soil structure. These practices include no-till farming, cover cropping, diverse crop rotations, and integrated livestock management. According to LaCanne and Lundgren (2018), soil health improves due to increased biodiversity and organic matter, enhancing its water retention and nutrient efficiency. Moreover, Jones (2012) in \"Soil carbon & organic farming\" reports that these practices significantly elevate biodiversity, both above and below the soil surface, promoting resilient ecosystems and agroecological balances."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  }
14
  ],
15
  "domain_expert_prompt": "You will be asked about family farming and agribusiness related topics, from different perspectives.\n Your answer should be logical and supported by facts, don't fabricate arguments. \n Try to gather a diverse point of view taking into account current theories in agronomy, biology, economics, anthropology and ecology."