import itertools
import math
import warnings

import networkx as nx
import numpy as np
import pandas as pd
import plotly.express as px  # used only for a qualitative color palette below
import plotly.figure_factory as ff
import plotly.graph_objects as go
import streamlit as st
from community import community_louvain
from scipy.cluster.hierarchy import linkage
from scipy.spatial.distance import squareform
from sklearn.cluster import AgglomerativeClustering
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_distances, cosine_similarity
from wordcloud import WordCloud

warnings.filterwarnings('ignore')
def create_dendrogram(X, labels):
    # Currently unused helper; the dendrogram below is built inline.
    # ff.create_dendrogram expects the observation matrix, not a precomputed
    # linkage, so pass X and supply the linkage function instead.
    fig = ff.create_dendrogram(
        X.toarray(),
        orientation='left',
        labels=labels,
        linkagefun=lambda d: linkage(d, "single"),
    )
    return fig
def load_data():
    data = pd.read_csv("HuggingFaceLLMsWithParamsAndReadmeLinks.csv")
    return data

df = load_data()
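# Caching the loader (a sketch, assuming Streamlit >= 1.18 where st.cache_data
# exists) would avoid re-reading the CSV on every widget interaction:
# @st.cache_data
# def load_data(): ...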
st.title("Constellation: An Atlas of 15,000 Large Language Models")
st.write("15,821 to be precise. Scraped from Hugging Face on July 18, 2023.")
st.write("Please cite: Gao, S., & Gao, A. K. (2023, July 19). On the Origin of LLMs: An Evolutionary Tree and Graph for 15,821 Large Language Models. arXiv. https://doi.org/10.48550/arXiv.2307.09793")
threshold = st.number_input("Enter the minimum number of downloads an LLM must have to be considered.", value=10000)
numClusters = st.number_input("Number of clusters to group into.", value=20, min_value=2, max_value=50)
wordClouds = st.checkbox("Show word clouds?")
def create_downloads_vs_likes_scatter(dataframe):
    # Work on a copy so the caller's DataFrame is not mutated
    dataframe = dataframe.copy()
    # Convert 'likes' column to numeric values
    dataframe['likes'] = pd.to_numeric(dataframe['likes'], errors='coerce')
    # Filter out the outlier point at 14M likes
    dataframe_filtered = dataframe[dataframe['likes'] != 14000000]
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=dataframe_filtered['downloads'], y=dataframe_filtered['likes'], mode='markers',
                             marker=dict(color='blue', size=7, opacity=0.7),
                             text=dataframe_filtered['model_name'],
                             hovertemplate="Model Name: %{text}<br>Downloads: %{x}<br>Likes: %{y}<extra></extra>"))
    fig.update_layout(title='Downloads vs Likes',
                      xaxis_title='Downloads',
                      # xaxis_range=[0, 300000],  # optional: cap the x-axis
                      yaxis_title='Likes')
                      # yaxis_range=[0, 800])  # optional: cap the y-axis
    return fig
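# Download and like counts are heavy-tailed, so a log-log view can be easier to
# read. A minimal sketch (not in the original) would add, before `return fig`:
# fig.update_xaxes(type="log")
# fig.update_yaxes(type="log")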
if st.button("Run Clustering"):
    df_filtered = df[df['downloads'] > threshold]
    df_extra_filtered = df_filtered.drop_duplicates(subset='model_name', keep='first')
    # Convert the model names into a matrix of TF-IDF features
    vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(2, 8))
    X = vectorizer.fit_transform(df_extra_filtered['model_name'].tolist()).toarray()
    # Function to compute the pairwise cosine distances in condensed form.
    # scipy's linkage treats a square 2-D array as raw observations, so the
    # distance matrix is condensed with squareform before linking.
    def distfun(X):
        return squareform(cosine_distances(X), checks=False)

    # Function to compute the linkage matrix
    def linkagefun(dist_array):
        return linkage(dist_array, "single")
    # Create dendrogram
    fig = ff.create_dendrogram(X, orientation='bottom', labels=df_extra_filtered['model_name'].tolist(), distfun=distfun, linkagefun=linkagefun)
    # fig.update_layout(width=800, height=500)
    st.plotly_chart(fig, use_container_width=True)
    # Group by cluster
    # Convert the model names into a matrix of token counts
    vectorizer = CountVectorizer(analyzer='char', ngram_range=(3, 6))
    X = vectorizer.fit_transform(df_extra_filtered['model_name'])
    # Use clustering to group model names, honoring the cluster count chosen above
    clustering = AgglomerativeClustering(n_clusters=numClusters).fit(X.toarray())
    # Add cluster labels to the filtered DataFrame
    df_extra_filtered['cluster'] = clustering.labels_
    # Count the number of models in each cluster
    cluster_counts = df_extra_filtered['cluster'].value_counts()
    # Create a bar chart
    fig = go.Figure([go.Bar(x=cluster_counts.index, y=cluster_counts.values)])
    fig.update_layout(title='Number of Models per Cluster', xaxis_title='Cluster', yaxis_title='Number of Models')
    st.plotly_chart(fig)
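    # A quick way to eyeball cluster contents (a sketch, not in the original):
    # for cid, grp in df_extra_filtered.groupby('cluster'):
    #     st.write(f"Cluster {cid}: " + ', '.join(grp['model_name'].head(3)))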
    # graphing!
    # Convert the model names into a matrix of TF-IDF features
    vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(2, 8))
    X = vectorizer.fit_transform(df_extra_filtered['model_name'])
    # Compute the pairwise cosine similarities
    sim_matrix = cosine_similarity(X)
    # Create a graph
    G = nx.Graph()
    # Add nodes to the graph
    for i in range(len(df_extra_filtered)):
        G.add_node(i, label=df_extra_filtered['model_name'].iloc[i])
    # Add edges between pairs whose similarity is above a certain threshold
    for i in range(len(df_extra_filtered)):
        for j in range(i + 1, len(df_extra_filtered)):
            if sim_matrix[i, j] > 0.2:
                G.add_edge(i, j, weight=sim_matrix[i, j])
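    # The double loop is O(n^2) in pure Python; an equivalent vectorized sketch
    # using numpy (same 0.2 threshold) would be:
    # iu, ju = np.nonzero(np.triu(sim_matrix > 0.2, k=1))
    # G.add_weighted_edges_from(
    #     (int(i), int(j), float(sim_matrix[i, j])) for i, j in zip(iu, ju))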
    # Compute the layout positions
    pos = nx.spring_layout(G)
    # Detect communities
    partition = community_louvain.best_partition(G)
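    # best_partition also accepts a resolution parameter (python-louvain API);
    # values above 1.0 tend to yield more, smaller communities, e.g.:
    # partition = community_louvain.best_partition(G, resolution=1.2)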
    # Compute a separate layout for each community
    layouts = {}
    for community in set(partition.values()):
        nodes_in_community = [node for node, comm in partition.items() if comm == community]
        subgraph = G.subgraph(nodes_in_community)
        layouts[community] = nx.spring_layout(subgraph)
    # Combine the layouts, spreading them out on a grid
    grid_size = math.ceil(math.sqrt(len(layouts)))  # Size of the grid
    grid = np.array(list(itertools.product(range(grid_size), repeat=2)))  # Coordinates for the grid
    scale = 2  # Scale factor for spreading out the communities
    offsets = dict(zip(layouts, grid * scale))  # Map communities to grid coordinates
    combined_layout = {}
    for community, layout in layouts.items():
        for node, position in layout.items():
            combined_layout[node] = position + offsets[community]
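    # Worked example: with 5 communities, grid_size = ceil(sqrt(5)) = 3 and
    # scale = 2, so the communities are offset to (0,0), (0,2), (0,4), (2,0),
    # and (2,2), spacing each local spring layout out on the grid.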
    # NOTE: the per-community layout above is computed but not used below; the
    # plot positions nodes with the global spring layout `pos` instead.
    # Create a figure
    fig = go.Figure()
    # Prepare lists for node positions, labels, ranks, downloads, likes, params, and colors
    x, y, labels, ranks, downloads, likes, params, node_colors = [], [], [], [], [], [], [], []
    # A qualitative palette, indexed modulo its length, gives each community a color
    palette = px.colors.qualitative.Alphabet
    # Prepare the node attributes
    for node, community in partition.items():
        # Get model info
        model_info = df_extra_filtered.iloc[node]
        # Node position
        x.append(pos[node][0])
        y.append(pos[node][1])
        # Node attributes
        labels.append(model_info['model_name'])
        ranks.append(model_info['rank'])
        downloads.append(model_info['downloads'])
        likes.append(model_info['likes'])
        params.append(model_info['params_millions'] if pd.notnull(model_info['params_millions']) else 'N/A')
        node_colors.append(palette[community % len(palette)])
    # Compute the centroid of each cluster for background coloring
    centroids = dict()
    community_sizes = dict()  # Sizes of each community
    for community in set(partition.values()):
        nodes_in_community = [node for node, comm in partition.items() if comm == community]
        if len(nodes_in_community) > 1:  # Only consider communities with more than one node
            centroid_x = np.mean([pos[node][0] for node in nodes_in_community])
            centroid_y = np.mean([pos[node][1] for node in nodes_in_community])
            centroids[community] = (centroid_x, centroid_y)
            community_sizes[community] = len(nodes_in_community)
    # Add background coloring for each cluster
    for community, centroid in centroids.items():
        fig.add_trace(go.Scatter(
            x=[centroid[0]], y=[centroid[1]],
            mode='markers',
            marker=dict(
                size=community_sizes[community] * 5,  # Scale marker size with community size
                color=palette[community % len(palette)],  # A bare community id is not a valid color
                opacity=0.1
            ),
            hoverinfo='none',
            showlegend=False
        ))
    # Add nodes to the figure, colored by community
    fig.add_trace(go.Scatter(
        x=x, y=y,
        mode='markers',
        marker=dict(size=3, color=node_colors),
        text=labels,
        customdata=np.stack((ranks, downloads, likes, params), axis=-1),
        hovertemplate=(
            "Model Name: %{text}<br>"
            "Rank: %{customdata[0]}<br>"
            "Downloads: %{customdata[1]}<br>"
            "Likes: %{customdata[2]}<br>"
            "Params (millions): %{customdata[3]}"
            "<extra></extra>"
        )
    ))
    # Add edges to the figure, normalizing widths by the maximum weight for visibility
    max_weight = max(nx.get_edge_attributes(G, 'weight').values(), default=1)
    for edge in G.edges():
        line_width = G.edges[edge]['weight'] / max_weight
        fig.add_trace(go.Scatter(
            x=[pos[edge[0]][0], pos[edge[1]][0]],
            y=[pos[edge[0]][1], pos[edge[1]][1]],
            mode='lines',
            line=dict(width=line_width),
            hoverinfo='none'
        ))
    # Set the figure layout
    fig.update_layout(showlegend=False, hovermode='closest')
    st.plotly_chart(fig)
    # Calculate degree of each node
    degrees = dict(G.degree())
    # Sort nodes by degree in descending order and get top 20
    top_20_models = sorted(degrees.items(), key=lambda x: x[1], reverse=True)[:20]
    # Prepare data for display
    models = [df_extra_filtered.iloc[node]['model_name'] for node, degree in top_20_models]
    connections = [degree for node, degree in top_20_models]
    st.subheader("Top 20 Models by Number of Connections")
    for model, n_connections in zip(models, connections):  # avoid shadowing the list
        st.write(f"{model}: {n_connections} connections")
    # Find the representative model for each community
    representatives = dict()
    for community in set(partition.values()):
        nodes_in_community = [node for node, comm in partition.items() if comm == community]
        # Select the node with the highest degree within the community as representative
        representative = max(nodes_in_community, key=lambda node: degrees[node])
        representatives[community] = df_extra_filtered.iloc[representative]['model_name']
    # Prepare data for display (fresh names avoid shadowing the dicts above)
    communities = list(representatives.keys())
    sizes = [community_sizes.get(comm, 1) for comm in communities]  # Default size 1 for single-node communities
    rep_names = list(representatives.values())
    # Create a DataFrame to hold the data
    df_reps = pd.DataFrame({
        'Community ID': communities,
        'Size': sizes,
        'Representative Model': rep_names
    })
    # Sort the DataFrame by community size in descending order
    df_reps.sort_values(by='Size', ascending=False, inplace=True)
    # Display in Streamlit
    st.subheader("Representative for each community, sorted by community size.")
    st.dataframe(df_reps)
    if wordClouds:
        groups = df_extra_filtered.groupby('cluster')
        for name, group in groups:
            # Join all model names in the cluster into a single string
            text = ' '.join(group['model_name'])
            # Generate a word cloud
            wordcloud = WordCloud().generate(text)
            # Convert WordCloud to Image
            image = wordcloud.to_image()
            # Display the word cloud
            st.image(image, use_column_width=True)
            st.write(f'Word Cloud for Cluster {name}')
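        # WordCloud accepts sizing options if the defaults look cramped (a sketch):
        # WordCloud(width=800, height=400, background_color='white').generate(text)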
    scatter_plot = create_downloads_vs_likes_scatter(df_extra_filtered)
    st.plotly_chart(scatter_plot, use_container_width=True)