patrickvonplaten committed on
Commit
d27cdb4
·
1 Parent(s): d021c86
Files changed (2) hide show
  1. app.py +142 -0
  2. requirements.txt +2 -0
app.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from datetime import datetime
3
+ import json
4
+ from huggingface_hub import snapshot_download
5
+ from collections import defaultdict
6
+ import pandas as pd
7
+ import streamlit as st
8
+ from datetime import datetime, timedelta
9
+ import matplotlib.pyplot as plt
10
+
11
# Free-text input box. NOTE(review): the entered value is not read anywhere
# in the visible script - confirm whether it is still needed.
user_input = st.text_input("Enter your text here:")

# Dataset repositories that can be inspected; each repo is listed once so the
# select box does not show duplicate entries.
libraries = [
    "open-source-metrics/accelerate-dependents",
    "open-source-metrics/hub-docs-dependents",
    "open-source-metrics/huggingface_hub-dependents",
    "open-source-metrics/evaluate-dependents",
    "open-source-metrics/datasets-dependents",
    "open-source-metrics/pytorch-image-models-dependents",
    "open-source-metrics/tokenizers-dependents",
    "open-source-metrics/transformers-dependents",
    "open-source-metrics/diffusers-dependents",
    "open-source-metrics/gradio-dependents",
    "open-source-metrics/optimum-dependents",
]

# Pre-select the repo that used to be hard-coded below, so the page still
# opens on the same data as before.
default_library = "open-source-metrics/transformers-dependents"

option = st.selectbox(
    'Choose library',
    libraries,
    index=libraries.index(default_library),
)

# Download the dataset the user actually selected (previously the selection
# was ignored and the transformers dataset was always fetched).
cached_folder = snapshot_download(option, repo_type="dataset")

# Per-date counters filled in by load_json_files(); keys are date strings.
num_dependents = defaultdict(int)
num_stars_all_dependents = defaultdict(int)
37
+
38
def load_json_files(directory, dependents=None, stars=None, date_format='%Y_%m_%d'):
    """Collect per-date dependent and star counts from JSON files under *directory*.

    The directory tree is walked recursively. For every ``*.json`` file the
    last three path components (without the extension) are joined with ``_``
    to form the date key, e.g. ``.../2023/11/12.json`` -> ``"2023_11_12"``.
    Files whose key does not parse with *date_format* are skipped, because
    such keys would make the later date-based sorting fail.

    For each remaining file that contains both a ``"name"`` and a ``"stars"``
    entry, the number of names and the sum of the stars are stored under the
    date key.

    Args:
        directory: Root folder to scan.
        dependents: Mapping that receives ``date -> len(data["name"])``.
            Defaults to the module-level ``num_dependents``.
        stars: Mapping that receives ``date -> sum(data["stars"])``.
            Defaults to the module-level ``num_stars_all_dependents``.
        date_format: ``strptime`` format a date key must match.

    Returns:
        None. The target mappings are updated in place.
    """
    if dependents is None:
        dependents = num_dependents
    if stars is None:
        stars = num_stars_all_dependents

    for subdir, _dirs, files in os.walk(directory):
        for file in files:
            if not file.endswith('.json'):
                continue

            file_path = os.path.join(subdir, file)

            # Build the key from real path components instead of splitting on
            # "." and "/", so dots in folder names and non-"/" separators work.
            stem = os.path.splitext(file_path)[0]
            date = "_".join(os.path.normpath(stem).split(os.sep)[-3:])

            try:
                datetime.strptime(date, date_format)
            except ValueError:
                # Not a year/month/day file (e.g. a metadata JSON); ignore it.
                continue

            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            if "name" in data and "stars" in data:
                dependents[date] = len(data["name"])
                stars[date] = sum(data["stars"])
50
+
51
# Fill the per-date counters from the downloaded dataset snapshot.
load_json_files(cached_folder)
53
+
54
def sort_dict_by_date(d, date_format='%Y_%m_%d'):
    """Return a copy of *d* whose entries are ordered chronologically by key.

    Args:
        d: Mapping whose keys are date strings in *date_format*.
        date_format: ``strptime`` format of the keys.

    Returns:
        A ``defaultdict(int)`` with the same items, inserted oldest first.

    Raises:
        ValueError: If a key does not match *date_format*.
    """
    # Sort on the parsed date rather than on the raw string so the order is
    # correct even if the keys are not zero-padded.
    ordered_items = sorted(
        d.items(),
        key=lambda item: datetime.strptime(item[0], date_format),
    )
    return defaultdict(int, ordered_items)
59
+
60
def remove_incorrect_entries(data, date_format='%Y_%m_%d'):
    """Drop entries whose value is lower than the last value that was kept.

    Entries are visited in chronological order. The first entry is always
    kept; each later entry is kept only if its value is greater than or equal
    to the most recently kept value, so the result is non-decreasing over time.

    Args:
        data: Mapping of date string (in *date_format*) to a numeric value.
        date_format: ``strptime`` format of the keys.

    Returns:
        A ``defaultdict(int)`` with the surviving entries in chronological order.

    Raises:
        ValueError: If a key does not match *date_format*.
    """
    chronological = sorted(
        data.items(),
        key=lambda item: datetime.strptime(item[0], date_format),
    )

    corrected_data = defaultdict(int)
    last_kept = None  # value of the most recently accepted entry

    for date, value in chronological:
        if last_kept is not None and value < last_kept:
            # The value went down compared with the last accepted one: skip it.
            continue
        corrected_data[date] = value
        last_kept = value

    return corrected_data
77
+
78
def interpolate_missing_dates(data, date_format='%Y_%m_%d'):
    """Fill every missing day between the first and last date by linear interpolation.

    Each gap between two consecutive known dates is filled using only those
    two known values. All values in the result, known and interpolated, are
    converted with ``int()``, which truncates the fractional part.

    Args:
        data: Mapping of date string (in *date_format*) to a numeric value.
        date_format: ``strptime``/``strftime`` format of the keys.

    Returns:
        A ``defaultdict(int)`` keyed by date string with one entry per day
        from the earliest to the latest date, in chronological order. An
        empty input yields an empty ``defaultdict(int)``.

    Raises:
        ValueError: If a key does not match *date_format*.
    """
    if not data:
        # min()/max() on an empty mapping would raise; nothing to interpolate.
        return defaultdict(int)

    known = {datetime.strptime(date, date_format): value for date, value in data.items()}
    known_dates = sorted(known)

    filled = dict(known)
    one_day = timedelta(days=1)

    # Walk consecutive pairs of known dates and fill the days between them.
    for prev_date, next_date in zip(known_dates, known_dates[1:]):
        gap_days = (next_date - prev_date).days
        prev_value = known[prev_date]
        next_value = known[next_date]
        for offset in range(1, gap_days):
            # Multiply before dividing so exact integer results stay exact
            # and are not truncated one too low by int() below.
            filled[prev_date + offset * one_day] = (
                prev_value + (next_value - prev_value) * offset / gap_days
            )

    return defaultdict(
        int,
        {date.strftime(date_format): int(value) for date, value in sorted(filled.items())},
    )
110
+
111
# Without any loaded data there is nothing to clean, interpolate or plot.
if not num_dependents:
    st.warning("No dependents data found for the selected library.")
    st.stop()

# Drop entries whose value fell below the previously kept one.
num_dependents = remove_incorrect_entries(num_dependents)
num_stars_all_dependents = remove_incorrect_entries(num_stars_all_dependents)

# Fill in the days that have no entry.
num_dependents = interpolate_missing_dates(num_dependents)
num_stars_all_dependents = interpolate_missing_dates(num_stars_all_dependents)

# Order both series chronologically.
num_dependents = sort_dict_by_date(num_dependents)
num_stars_all_dependents = sort_dict_by_date(num_stars_all_dependents)

num_dependents_df = pd.DataFrame(list(num_dependents.items()), columns=['Date', 'Value'])
num_cum_stars_df = pd.DataFrame(list(num_stars_all_dependents.items()), columns=['Date', 'Value'])

num_dependents_df['Date'] = pd.to_datetime(num_dependents_df['Date'], format='%Y_%m_%d')
num_cum_stars_df['Date'] = pd.to_datetime(num_cum_stars_df['Date'], format='%Y_%m_%d')

# Re-index both frames on a daily frequency and interpolate any remaining gaps.
num_dependents_df.set_index('Date', inplace=True)
num_dependents_df = num_dependents_df.resample('D').asfreq()
num_dependents_df['Value'] = num_dependents_df['Value'].interpolate()

# NOTE(review): the stars frame is prepared but not plotted in the visible
# script - confirm whether a second chart is intended.
num_cum_stars_df.set_index('Date', inplace=True)
num_cum_stars_df = num_cum_stars_df.resample('D').asfreq()
num_cum_stars_df['Value'] = num_cum_stars_df['Value'].interpolate()

# Plotting: draw on an explicit Figure/Axes instead of pyplot's global state,
# and hand that Figure to Streamlit.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(num_dependents_df.index, num_dependents_df['Value'], marker='o')
ax.set_xlabel('Date')
ax.set_ylabel('Number of Dependents')
ax.set_title('Dependencies History')

# Display in Streamlit
st.pyplot(fig)
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ pandas
2
+ matplotlib