gaodrew committed on
Commit 67dafee
1 Parent(s): 1d4a24a

Create app.py

Files changed (1)
  1. app.py +291 -0
app.py ADDED
@@ -0,0 +1,291 @@
+ import streamlit as st
+ import numpy as np
+ import pandas as pd
+ import warnings
+ warnings.filterwarnings('ignore')
+ import math
+ import itertools
+ from scipy.cluster.hierarchy import linkage
+ from scipy.spatial.distance import squareform
+ from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
+ from sklearn.metrics.pairwise import cosine_distances, cosine_similarity
+ from sklearn.cluster import AgglomerativeClustering
+ from community import community_louvain  # python-louvain package
+ import networkx as nx
+ from wordcloud import WordCloud
+ import plotly.figure_factory as ff
+ import plotly.graph_objects as go
+
+
+ def create_dendrogram(X, labels):
+     # ff.create_dendrogram builds the linkage itself, so pass the raw
+     # observation matrix and a linkage function rather than a precomputed
+     # linkage matrix (passing one would treat linkage rows as data points).
+     fig = ff.create_dendrogram(X.toarray(), orientation='left', labels=labels,
+                                linkagefun=lambda d: linkage(d, "single"))
+     return fig
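+ # NOTE: create_dendrogram is not called below; the button handler builds its
+ # dendrogram inline. A call would look like this sketch (same inputs as there):
+ #   fig = create_dendrogram(X, df_extra_filtered['model_name'].tolist())
+ #   st.plotly_chart(fig)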
+
+ @st.cache_data
+ def load_data():
+     data = pd.read_csv("HuggingFaceLLMsWithParamsAndReadmeLinks.csv")
+     return data
+
+ # Use the cached loader so Streamlit reruns don't re-read the CSV from disk
+ df = load_data()
+ st.title("Constellation: An Atlas of 15,000 Large Language Models")
+ st.write("15,821 to be precise. Scraped from Hugging Face on July 18, 2023.")
+ st.write("Please cite: Gao, S., & Gao, A. K. (2023, July 19). On the Origin of LLMs: "
+          "An Evolutionary Tree and Graph for 15,821 Large Language Models. arXiv. "
+          "https://doi.org/10.48550/arXiv.2307.09793")
+ threshold = st.number_input("Enter the minimum number of downloads an LLM must have to be considered.", value=10000)
+ numClusters = st.number_input("Number of clusters to group into.", value=20, min_value=2, max_value=50)
+ wordClouds = st.checkbox("Show word clouds?")
+
+ def create_downloads_vs_likes_scatter(dataframe):
+     # Convert 'likes' column to numeric values
+     dataframe['likes'] = pd.to_numeric(dataframe['likes'], errors='coerce')
+
+     # Filter out the outlier point at 14M likes
+     dataframe_filtered = dataframe[dataframe['likes'] != 14000000]
+
+     fig = go.Figure()
+     fig.add_trace(go.Scatter(x=dataframe_filtered['downloads'], y=dataframe_filtered['likes'], mode='markers',
+                              marker=dict(color='blue', size=7, opacity=0.7),
+                              text=dataframe_filtered['model_name'],
+                              hovertemplate="Model Name: %{text}<br>Downloads: %{x}<br>Likes: %{y}<extra></extra>"))
+     fig.update_layout(title='Downloads vs Likes',
+                       xaxis_title='Downloads',
+                       yaxis_title='Likes')
+     # Optional axis clamps: xaxis_range=[0, 300000], yaxis_range=[0, 800]
+     return fig
+
+
+ if st.button("Run Clustering"):
+     df_filtered = df[df['downloads'] > threshold]
+     # copy() so the 'cluster' column added later doesn't trigger pandas'
+     # SettingWithCopyWarning on a slice view
+     df_extra_filtered = df_filtered.drop_duplicates(subset='model_name', keep='first').copy()
+
+     # Convert the model names into a matrix of TF-IDF features
+     vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(2, 8))
+     X = vectorizer.fit_transform(df_extra_filtered['model_name'].tolist()).toarray()
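+     # With character n-grams of length 2-8, names that share fragments such as
+     # "llama" or "13b" (illustrative examples) get overlapping features, so
+     # cosine distance places fine-tunes of the same base model close together.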
+
+     # Function to compute the pairwise cosine distances
+     def distfun(X):
+         return cosine_distances(X)
+
+     # Function to compute the linkage matrix; scipy's linkage expects a
+     # condensed distance vector, so convert the square matrix first
+     # (checks=False tolerates floating-point noise on the diagonal)
+     def linkagefun(dist_array):
+         return linkage(squareform(dist_array, checks=False), "single")
+
+     # Create dendrogram
+     fig = ff.create_dendrogram(X, orientation='bottom', labels=df_extra_filtered['model_name'].tolist(),
+                                distfun=distfun, linkagefun=linkagefun)
+     # fig.update_layout(width=800, height=500)
+     st.plotly_chart(fig, use_container_width=True)
+
+     # Group by cluster
+     # Convert the model names into a matrix of token counts
+     vectorizer = CountVectorizer(analyzer='char', ngram_range=(3, 6))
+     X = vectorizer.fit_transform(df_extra_filtered['model_name'])
+     # Use clustering to group model names, honoring the cluster count chosen in the UI
+     clustering = AgglomerativeClustering(n_clusters=numClusters).fit(X.toarray())
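+     # Note: scikit-learn's AgglomerativeClustering defaults to Ward linkage on
+     # Euclidean distance over these raw count vectors; a cosine-based variant
+     # would need linkage='average' plus a cosine metric (parameter name varies
+     # by scikit-learn version).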
+
+     # Add cluster labels to the filtered DataFrame
+     df_extra_filtered['cluster'] = clustering.labels_
+
+     # Count the number of models in each cluster
+     cluster_counts = df_extra_filtered['cluster'].value_counts()
+
+     # Create a bar chart
+     fig = go.Figure([go.Bar(x=cluster_counts.index, y=cluster_counts.values)])
+     fig.update_layout(title='Number of Models per Cluster', xaxis_title='Cluster', yaxis_title='Number of Models')
+     st.plotly_chart(fig)
+
+     # graphing!
+
+     # Convert the model names into a matrix of TF-IDF features
+     vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(2, 8))
+     X = vectorizer.fit_transform(df_extra_filtered['model_name'])
+
+     # Compute the pairwise cosine similarities
+     sim_matrix = cosine_similarity(X)
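+     # cosine_similarity returns a dense n x n array, so memory grows
+     # quadratically with the number of models clearing the download threshold.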
+
+     # Create a graph
+     G = nx.Graph()
+
+     # Add nodes to the graph
+     for i in range(len(df_extra_filtered)):
+         G.add_node(i, label=df_extra_filtered['model_name'].iloc[i])
+
+     # Add edges to the graph
+     for i in range(len(df_extra_filtered)):
+         for j in range(i+1, len(df_extra_filtered)):
+             # Connect two models if their name similarity clears the threshold
+             if sim_matrix[i, j] > 0.2:
+                 G.add_edge(i, j, weight=sim_matrix[i, j])
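+     # The 0.2 similarity cutoff is a tuning knob: raising it keeps only
+     # near-duplicate names connected; lowering it links looser name families.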
+
+     # Compute the layout positions
+     pos = nx.spring_layout(G)
+
+     # Detect communities with Louvain modularity maximization;
+     # best_partition returns a dict mapping node index -> community id
+     partition = community_louvain.best_partition(G)
+
+     # Compute a per-community layout: lay out each community separately,
+     # then spread the communities out on a grid
+     layouts = {}
+     for community in set(partition.values()):
+         nodes_in_community = [node for node, comm in partition.items() if comm == community]
+         subgraph = G.subgraph(nodes_in_community)
+         layouts[community] = nx.spring_layout(subgraph)
+
+     # Combine the layouts, spreading them out on a grid
+     grid_size = math.ceil(math.sqrt(len(layouts)))  # Size of the grid
+     grid = np.array(list(itertools.product(range(grid_size), repeat=2)))  # Coordinates for the grid
+     scale = 2  # Scale factor for spreading out the communities
+     offsets = dict(zip(layouts, grid*scale))  # Map communities to grid coordinates
+
+     combined_layout = {}
+     for community, layout in layouts.items():
+         for node, position in layout.items():
+             combined_layout[node] = position + offsets[community]
+
+     # Note: the traces below plot with the global spring layout `pos`;
+     # `combined_layout` is an alternative that separates communities visually
+
+     # Create a figure
+     fig = go.Figure()
+
+     # Prepare lists for node positions, labels, colors, ranks, downloads, likes, and params
+     x, y, labels, node_colors, ranks, downloads, likes, params = [], [], [], [], [], [], [], []
+
+     # Prepare the node attributes
+     for node, community in partition.items():
+         # Get model info
+         model_info = df_extra_filtered.iloc[node]
+
+         # Node position
+         x.append(pos[node][0])
+         y.append(pos[node][1])
+
+         # Node attributes
+         labels.append(model_info['model_name'])
+         node_colors.append(community)  # remember each node's community for coloring
+         ranks.append(model_info['rank'])
+         downloads.append(model_info['downloads'])
+         likes.append(model_info['likes'])
+         params.append(model_info['params_millions'] if pd.notnull(model_info['params_millions']) else 'N/A')
+
+     # Compute the centroid of each community for background coloring
+     centroids = dict()
+     community_sizes = dict()  # Sizes of each community
+     for community in set(partition.values()):
+         nodes_in_community = [node for node, comm in partition.items() if comm == community]
+         if len(nodes_in_community) > 1:  # Only consider communities with more than one node
+             centroid_x = np.mean([pos[node][0] for node in nodes_in_community])
+             centroid_y = np.mean([pos[node][1] for node in nodes_in_community])
+             centroids[community] = (centroid_x, centroid_y)
+             community_sizes[community] = len(nodes_in_community)
+
+     # Add background coloring for each community
+     for community, centroid in centroids.items():
+         fig.add_trace(go.Scatter(
+             x=[centroid[0]], y=[centroid[1]],
+             mode='markers',
+             marker=dict(
+                 size=community_sizes[community]*5,  # Scale marker size with community size
+                 color=community,
+                 opacity=0.1
+             ),
+             hoverinfo='none',
+             showlegend=False
+         ))
+
+     # Add nodes to the figure, colored by community
+     fig.add_trace(go.Scatter(
+         x=x, y=y,
+         mode='markers',
+         marker=dict(size=3, color=node_colors),
+         text=labels,
+         customdata=np.stack((ranks, downloads, likes, params), axis=-1),
+         hovertemplate=(
+             "Model Name: %{text}<br>"
+             "Rank: %{customdata[0]}<br>"
+             "Downloads: %{customdata[1]}<br>"
+             "Likes: %{customdata[2]}<br>"
+             "Params (millions): %{customdata[3]}"
+             "<extra></extra>"
+         )
+     ))
+
+     # Add edges to the figure; compute the max weight once instead of per edge
+     max_weight = np.max(list(nx.get_edge_attributes(G, 'weight').values()))
+     for edge in G.edges():
+         # Normalize edge weight to (0, 1] for line width
+         line_width = G.edges[edge]['weight'] / max_weight
+
+         fig.add_trace(go.Scatter(
+             x=[pos[edge[0]][0], pos[edge[1]][0]],
+             y=[pos[edge[0]][1], pos[edge[1]][1]],
+             mode='lines',
+             line=dict(width=line_width),
+             hoverinfo='none'
+         ))
+
+     # Set the figure layout
+     fig.update_layout(showlegend=False, hovermode='closest')
+
+     st.plotly_chart(fig)
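+     # Performance note: adding one Scatter trace per edge slows Plotly down on
+     # dense graphs; a common alternative (not used here) is a single line trace
+     # whose coordinate arrays separate segments with None values.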
+
+     # Calculate degree of each node
+     degrees = dict(G.degree())
+
+     # Sort nodes by degree in descending order and get top 20
+     top_20_models = sorted(degrees.items(), key=lambda x: x[1], reverse=True)[:20]
+
+     # Prepare data for display
+     models = [df_extra_filtered.iloc[node]['model_name'] for node, degree in top_20_models]
+     connection_counts = [degree for node, degree in top_20_models]
+
+     st.subheader("Top 20 Models by Number of Connections")
+     for model, count in zip(models, connection_counts):
+         st.write(f"{model}: {count} connections")
+
+
+     # Find the representative model for each community
+     representatives = dict()
+     for community in set(partition.values()):
+         nodes_in_community = [node for node, comm in partition.items() if comm == community]
+         # Select the node with the highest degree within the community as representative
+         representative = max(nodes_in_community, key=lambda node: degrees[node])
+         representatives[community] = df_extra_filtered.iloc[representative]['model_name']
+
+     # Prepare data for display
+     communities = list(representatives.keys())
+     # Default to size 1 for singleton communities skipped in the centroid pass
+     sizes = [community_sizes.get(comm, 1) for comm in communities]
+     rep_names = list(representatives.values())
+
+     # Create a DataFrame to hold the data
+     df_reps = pd.DataFrame({
+         'Community ID': communities,
+         'Size': sizes,
+         'Representative Model': rep_names
+     })
+
+     # Sort the DataFrame by community size in descending order
+     df_reps.sort_values(by='Size', ascending=False, inplace=True)
+
+     # Display in Streamlit
+     st.subheader("Representative for each community, sorted by community size.")
+     st.dataframe(df_reps)
+     if wordClouds:
+         groups = df_extra_filtered.groupby('cluster')
+
+         for name, group in groups:
+             # Join all model names in the cluster into a single string
+             text = ' '.join(group['model_name'])
+
+             # Generate a word cloud
+             wordcloud = WordCloud().generate(text)
+
+             # Convert the WordCloud to a PIL image for display
+             image = wordcloud.to_image()
+
+             # Display the word cloud
+             st.image(image, use_column_width=True)
+             st.write(f'Word Cloud for Cluster {name}')
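+         # WordCloud's default tokenizer splits on punctuation, so a hyphenated
+         # name like "llama-2-7b" (illustrative) contributes fragments such as
+         # "llama" and "7b" rather than the full name.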
+
+     scatter_plot = create_downloads_vs_likes_scatter(df_extra_filtered)
+     st.plotly_chart(scatter_plot, use_container_width=True)