import streamlit as st
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import math
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.feature_extraction.text import TfidfVectorizer
import itertools
import plotly.figure_factory as ff
from community import community_louvain
import networkx as nx
from sklearn.metrics.pairwise import cosine_distances
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import AgglomerativeClustering
from PIL import Image
from wordcloud import WordCloud
import plotly.graph_objects as go
def create_dendrogram(X, labels):
    # ff.create_dendrogram expects the raw observations and computes the
    # linkage itself, so pass the dense feature matrix and supply the
    # linkage method via linkagefun instead of precomputing Z.
    fig = ff.create_dendrogram(X.toarray(), orientation='left', labels=labels,
                               linkagefun=lambda d: linkage(d, "single"))
    return fig
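# Standalone helper; the "Run Clustering" flow below builds its dendrogram
# inline and additionally uses cosine distances rather than the default metric.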
@st.cache_data
def load_data():
data = pd.read_csv("HuggingFaceLLMsWithParamsAndReadmeLinks.csv")
return data
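# st.cache_data memoizes load_data across Streamlit reruns, so the CSV is
# parsed once and the cached DataFrame is reused on widget interactions.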
df = load_data()
st.title("Constellation: An Atlas of 15,000 Large Language Models")
st.write("15,821 to be precise. Scraped from Hugging Face on July 18, 2023.")
st.write("Please cite: Gao, S., & Gao, A. K. (2023, July 19). On the Origin of LLMs: An Evolutionary Tree and Graph for 15,821 Large Language Models. ArXiv.org; ArXiv. https://doi.org/10.48550/arXiv.2307.09793")
threshold = st.number_input("Enter the minimum number of downloads an LLM must have to be considered.", value=10000)
numClusters = st.number_input("Number of clusters to group into.", value=20, min_value=2, max_value=50)
wordClouds = st.checkbox("Show word clouds?")
def create_downloads_vs_likes_scatter(dataframe):
# Convert 'likes' column to numeric values
dataframe['likes'] = pd.to_numeric(dataframe['likes'], errors='coerce')
# Filter out the outlier point at 14M likes
dataframe_filtered = dataframe[dataframe['likes'] != 14000000]
fig = go.Figure()
    fig.add_trace(go.Scatter(x=dataframe_filtered['downloads'], y=dataframe_filtered['likes'], mode='markers',
                             marker=dict(color='blue', size=7, opacity=0.7),
                             text=dataframe_filtered['model_name'],
                             hovertemplate="Model Name: %{text}<br>Downloads: %{x}<br>Likes: %{y}"))
fig.update_layout(title='Downloads vs Likes',
xaxis_title='Downloads',
#xaxis_range=[0,300000],
yaxis_title='Likes')
#yaxis_range=[0, 800]) # Set custom y-axis range
return fig
if st.button("Run Clustering"):
df_filtered = df[df['downloads'] > threshold]
df_extra_filtered = df_filtered.drop_duplicates(subset='model_name', keep='first')
# Convert the model names into a matrix of TF-IDF features
vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(2, 8))
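    # With analyzer='char' and ngram_range=(2, 8), a name like "gpt2" is
    # decomposed into character n-grams ("gp", "pt", "t2", "gpt", "pt2", "gpt2"),
    # so models sharing naming substrings such as "llama" or "7b" end up
    # close together in TF-IDF space.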
X = vectorizer.fit_transform(df_extra_filtered['model_name'].tolist()).toarray()
# Function to compute the pairwise cosine distances
def distfun(X):
return cosine_distances(X)
# Function to compute the linkage matrix
def linkagefun(dist_array):
return linkage(dist_array, "single")
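    # Single linkage merges the two clusters separated by the smallest
    # minimum pairwise distance, which tends to chain families of similarly
    # named models (e.g. fine-tunes sharing a base-model prefix).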
# Create dendrogram
fig = ff.create_dendrogram(X, orientation='bottom', labels=df_extra_filtered['model_name'].tolist(), distfun=distfun, linkagefun=linkagefun)
#fig.update_layout(width=800, height=500)
st.plotly_chart(fig, use_container_width=True)
# Group by cluster
# Convert the model names into a matrix of token counts
vectorizer = CountVectorizer(analyzer='char', ngram_range=(3, 6))
X = vectorizer.fit_transform(df_extra_filtered['model_name'])
# Use clustering to group model names
    clustering = AgglomerativeClustering(n_clusters=numClusters).fit(X.toarray())  # Honor the user-selected cluster count
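    # Note: AgglomerativeClustering defaults to Ward linkage on Euclidean
    # distances, so these groups can differ from the cosine/single-linkage
    # dendrogram above.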
# Add cluster labels to the filtered DataFrame
df_extra_filtered['cluster'] = clustering.labels_
# Count the number of models in each cluster
cluster_counts = df_extra_filtered['cluster'].value_counts()
# Create a bar chart
fig = go.Figure([go.Bar(x=cluster_counts.index, y=cluster_counts.values)])
fig.update_layout(title='Number of Models per Cluster', xaxis_title='Cluster', yaxis_title='Number of Models')
st.plotly_chart(fig)
# graphing!
# Convert the model names into a matrix of TF-IDF features
vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(2, 8))
X = vectorizer.fit_transform(df_extra_filtered['model_name'])
# Compute the pairwise cosine similarities
sim_matrix = cosine_similarity(X)
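    # TF-IDF vectors are nonnegative, so cosine similarity falls in [0, 1];
    # 1.0 means two names share an identical n-gram profile.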
# Create a graph
G = nx.Graph()
# Add nodes to the graph
for i in range(len(df_extra_filtered)):
G.add_node(i, label=df_extra_filtered['model_name'].iloc[i])
# Add edges to the graph
for i in range(len(df_extra_filtered)):
for j in range(i+1, len(df_extra_filtered)):
# If the similarity is above a certain threshold
if sim_matrix[i, j] > 0.2:
G.add_edge(i, j, weight=sim_matrix[i, j])
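            # The 0.2 cutoff is a heuristic: lower values densify the graph
            # with spurious edges, higher values fragment it into isolates.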
# Compute the layout positions
pos = nx.spring_layout(G)
# Detect communities
partition = community_louvain.best_partition(G)
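    # best_partition greedily maximizes modularity and returns a dict
    # mapping each node id to its community id.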
    # Compute a separate spring layout for each detected community
layouts = {}
for community in set(partition.values()):
nodes_in_community = [node for node, comm in partition.items() if comm == community]
subgraph = G.subgraph(nodes_in_community)
layouts[community] = nx.spring_layout(subgraph)
# Combine the layouts, spreading them out on a grid
grid_size = math.ceil(math.sqrt(len(layouts))) # Size of the grid
grid = np.array(list(itertools.product(range(grid_size), repeat=2))) # Coordinates for the grid
scale = 2 # Scale factor for spreading out the communities
offsets = dict(zip(layouts, grid*scale)) # Map communities to grid coordinates
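    # spring_layout scales positions to roughly [-1, 1] on each axis, so
    # offsetting each community's layout by 2x its grid coordinate keeps
    # neighboring communities from overlapping.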
combined_layout = {}
for community, layout in layouts.items():
for node, position in layout.items():
combined_layout[node] = position + offsets[community]
    # NOTE: the plotting below positions nodes with the global spring layout
    # `pos`; `combined_layout` is an alternative that spreads communities out
    # on the grid computed above.
# Create a figure
fig = go.Figure()
    # Prepare lists for node positions, labels, ranks, downloads, likes, params, and community colors
    x, y, labels, ranks, downloads, likes, params, colors = [], [], [], [], [], [], [], []
# Prepare the node attributes
for node, community in partition.items():
# Get model info
model_info = df_extra_filtered.iloc[node]
# Node position
x.append(pos[node][0])
y.append(pos[node][1])
# Node attributes
labels.append(model_info['model_name'])
ranks.append(model_info['rank'])
downloads.append(model_info['downloads'])
likes.append(model_info['likes'])
        params.append(model_info['params_millions'] if pd.notnull(model_info['params_millions']) else 'N/A')
        # Record each node's community so nodes can be colored per community
        colors.append(community)
# Compute the centroid of each cluster for background coloring
centroids = dict()
community_sizes = dict() # Create a dict to store the sizes of each community
for community in set(partition.values()):
nodes_in_community = [node for node, comm in partition.items() if comm == community]
if len(nodes_in_community) > 1: # Only consider communities with more than one node
centroid_x = np.mean([pos[node][0] for node in nodes_in_community])
centroid_y = np.mean([pos[node][1] for node in nodes_in_community])
centroids[community] = (centroid_x, centroid_y)
community_sizes[community] = len(nodes_in_community)
# Add background coloring for each cluster
for community, centroid in centroids.items():
fig.add_trace(go.Scatter(
x=[centroid[0]], y=[centroid[1]],
mode='markers',
marker=dict(
size=community_sizes[community]*5, # Adjust size by multiplying the community size by a factor
color=community,
opacity=0.1
),
hoverinfo='none',
showlegend=False
))
# Add nodes to the figure
fig.add_trace(go.Scatter(
x=x, y=y,
mode='markers',
        marker=dict(size=3, color=colors),
text=labels,
customdata=np.stack((ranks, downloads, likes, params), axis=-1),
        hovertemplate=(
            "Model Name: %{text}<br>"
            "Rank: %{customdata[0]}<br>"
            "Downloads: %{customdata[1]}<br>"
            "Likes: %{customdata[2]}<br>"
            "Params (millions): %{customdata[3]}"
            "<extra></extra>"
        )
))
    # Add edges to the figure
    max_weight = max(nx.get_edge_attributes(G, 'weight').values())  # Compute once, outside the loop
    for edge in G.edges():
        # Normalize edge weight to (0, 1] for the line width
        line_width = G.edges[edge]['weight'] / max_weight
fig.add_trace(go.Scatter(
x=[pos[edge[0]][0], pos[edge[1]][0]],
y=[pos[edge[0]][1], pos[edge[1]][1]],
mode='lines',
            line=dict(width=line_width),  # Width is the normalized similarity weight
hoverinfo='none'
))
# Set the figure layout
fig.update_layout(showlegend=False, hovermode='closest')
st.plotly_chart(fig)
# Calculate degree of each node
degrees = dict(G.degree())
# Sort nodes by degree in descending order and get top 20
top_20_models = sorted(degrees.items(), key=lambda x: x[1], reverse=True)[:20]
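    # A node's degree counts how many other model names cleared the 0.2
    # similarity cutoff, so high-degree models anchor large naming families.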
# Prepare data for display
models = [df_extra_filtered.iloc[node]['model_name'] for node, degree in top_20_models]
connections = [degree for node, degree in top_20_models]
st.subheader("Top 20 Models by Number of Connections")
    for model, num_connections in zip(models, connections):
        st.write(f"{model}: {num_connections} connections")
# Find the representative model for each community
representatives = dict()
for community in set(partition.values()):
nodes_in_community = [node for node, comm in partition.items() if comm == community]
# Select the node with the highest degree within the community as representative
representative = max(nodes_in_community, key=lambda node: degrees[node])
representatives[community] = df_extra_filtered.iloc[representative]['model_name']
# Prepare data for display
communities = list(representatives.keys())
    sizes = [community_sizes.get(comm, 1) for comm in communities]  # Default size 1 for single-node communities excluded above
representatives = list(representatives.values())
# Create a DataFrame to hold the data
df_reps = pd.DataFrame({
'Community ID': communities,
        'Size': sizes,
'Representative Model': representatives
})
# Sort the DataFrame by community size in descending order
df_reps.sort_values(by='Size', ascending=False, inplace=True)
# Display in Streamlit
st.subheader("Representative for each community, sorted by community size.")
st.dataframe(df_reps)
if wordClouds:
groups = df_extra_filtered.groupby('cluster')
for name, group in groups:
# Join all model names in the cluster into a single string
text = ' '.join(group['model_name'])
# Generate a word cloud
wordcloud = WordCloud().generate(text)
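            # WordCloud.generate() tokenizes the concatenated names and sizes
            # each word by its frequency within the cluster.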
# Convert WordCloud to Image
image = wordcloud.to_image()
# Display the word cloud
st.image(image, use_column_width=True)
st.write(f'Word Cloud for Cluster {name}')
scatter_plot = create_downloads_vs_likes_scatter(df_extra_filtered)
st.plotly_chart(scatter_plot, use_container_width=True)