import itertools
import math
import warnings

import networkx as nx
import numpy as np
import pandas as pd
import plotly.figure_factory as ff
import plotly.graph_objects as go
import streamlit as st
from community import community_louvain
from scipy.cluster.hierarchy import linkage
from scipy.spatial.distance import squareform
from sklearn.cluster import AgglomerativeClustering
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_distances, cosine_similarity
from wordcloud import WordCloud

warnings.filterwarnings('ignore')


def create_dendrogram(X, labels):
    # ff.create_dendrogram expects the raw observation matrix and computes the
    # linkage itself; passing a precomputed linkage matrix would silently
    # cluster the wrong data, so the single-linkage choice is supplied via
    # linkagefun instead.
    return ff.create_dendrogram(
        X.toarray(),
        orientation='left',
        labels=labels,
        linkagefun=lambda dist: linkage(dist, "single"),
    )


@st.cache_data
def load_data():
    data = pd.read_csv("HuggingFaceLLMsWithParamsAndReadmeLinks.csv")
    return data


# Use the cached loader so the CSV is not re-read on every Streamlit rerun.
df = load_data()

st.title("Constellation: An Atlas of 15,000 Large Language Models")
st.write("15,821 to be precise. Scraped from Hugging Face on July 18, 2023.")
st.write(
    "Please cite: Gao, S., & Gao, A. K. (2023, July 19). On the Origin of LLMs: "
    "An Evolutionary Tree and Graph for 15,821 Large Language Models. arXiv. "
    "https://doi.org/10.48550/arXiv.2307.09793"
)

threshold = st.number_input(
    "Enter the minimum number of downloads an LLM must have to be considered.",
    value=10000,
)
numClusters = st.number_input(
    "Number of clusters to group into.", value=20, min_value=2, max_value=50
)
wordClouds = st.checkbox("Show word clouds?")


def create_downloads_vs_likes_scatter(dataframe):
    # Convert 'likes' column to numeric values
    dataframe['likes'] = pd.to_numeric(dataframe['likes'], errors='coerce')

    # Filter out the outlier point at 14M likes
    dataframe_filtered = dataframe[dataframe['likes'] != 14000000]

    fig = go.Figure()
    fig.add_trace(go.Scatter(
        x=dataframe_filtered['downloads'],
        y=dataframe_filtered['likes'],
        mode='markers',
        marker=dict(color='blue', size=7, opacity=0.7),
        text=dataframe_filtered['model_name'],
        hovertemplate="Model Name: %{text}<br>Downloads: %{x}<br>Likes: %{y}",
    ))
    fig.update_layout(
        title='Downloads vs Likes',
        xaxis_title='Downloads',
        # xaxis_range=[0, 300000],
        yaxis_title='Likes',
        # yaxis_range=[0, 800],  # Set custom y-axis range
    )
    return fig


if st.button("Run Clustering"):
    # Keep only sufficiently popular models and drop duplicate names.
    df_filtered = df[df['downloads'] > threshold]
    df_extra_filtered = df_filtered.drop_duplicates(subset='model_name', keep='first').copy()

    # Convert the model names into a matrix of TF-IDF features
    vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(2, 8))
    X = vectorizer.fit_transform(df_extra_filtered['model_name'].tolist()).toarray()

    # Function to compute the pairwise cosine distances. scipy's linkage()
    # expects a condensed (1-D) distance vector, so squareform() flattens the
    # square matrix returned by cosine_distances().
    def distfun(X):
        return squareform(cosine_distances(X), checks=False)

    # Function to compute the linkage matrix
    def linkagefun(dist_array):
        return linkage(dist_array, "single")

    # Create dendrogram
    fig = ff.create_dendrogram(
        X,
        orientation='bottom',
        labels=df_extra_filtered['model_name'].tolist(),
        distfun=distfun,
        linkagefun=linkagefun,
    )
    # fig.update_layout(width=800, height=500)
    st.plotly_chart(fig, use_container_width=True)

    # Group by cluster
    # Convert the model names into a matrix of token counts
    vectorizer = CountVectorizer(analyzer='char', ngram_range=(3, 6))
    X = vectorizer.fit_transform(df_extra_filtered['model_name'])

    # Use clustering to group model names, honouring the cluster-count widget
    # instead of a hard-coded 20.
    clustering = AgglomerativeClustering(n_clusters=int(numClusters)).fit(X.toarray())

    # Add cluster labels to the filtered DataFrame
    df_extra_filtered['cluster'] = clustering.labels_

    # Count the number of models in each cluster
    cluster_counts = df_extra_filtered['cluster'].value_counts()

    # Create a bar chart
    fig = go.Figure([go.Bar(x=cluster_counts.index, y=cluster_counts.values)])
    fig.update_layout(
        title='Number of Models per Cluster',
        xaxis_title='Cluster',
        yaxis_title='Number of Models',
    )
    st.plotly_chart(fig)

    # Graphing!
    # Convert the model names into a matrix of TF-IDF features
    vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(2, 8))
    X = vectorizer.fit_transform(df_extra_filtered['model_name'])

    # Compute the pairwise cosine similarities
    sim_matrix = cosine_similarity(X)

    # Create a graph
    G = nx.Graph()

    # Add nodes to the graph
    for i in range(len(df_extra_filtered)):
        G.add_node(i, label=df_extra_filtered['model_name'].iloc[i])

    # Add edges to the graph
    for i in range(len(df_extra_filtered)):
        for j in range(i + 1, len(df_extra_filtered)):
            # If the similarity is above a certain threshold
            if sim_matrix[i, j] > 0.2:
                G.add_edge(i, j, weight=sim_matrix[i, j])

    # Detect communities
    partition = community_louvain.best_partition(G)

    # Compute a spring layout for each community separately. (A single global
    # nx.spring_layout(G) would ignore the community structure; the
    # per-community layouts are combined below and used as the node positions.)
    layouts = {}
    for community in set(partition.values()):
        nodes_in_community = [node for node, comm in partition.items() if comm == community]
        subgraph = G.subgraph(nodes_in_community)
        layouts[community] = nx.spring_layout(subgraph)

    # Combine the layouts, spreading them out on a grid
    grid_size = math.ceil(math.sqrt(len(layouts)))  # Size of the grid
    grid = np.array(list(itertools.product(range(grid_size), repeat=2)))  # Coordinates for the grid
    scale = 2  # Scale factor for spreading out the communities
    offsets = dict(zip(layouts, grid * scale))  # Map communities to grid coordinates
    pos = {}
    for community, layout in layouts.items():
        for node, position in layout.items():
            pos[node] = position + offsets[community]
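    # A quick worked example of the offset arithmetic above (illustrative
    # numbers, not taken from the data): with 5 communities, grid_size =
    # ceil(sqrt(5)) = 3, and the communities map to grid cells (0,0), (0,1),
    # (0,2), (1,0), (1,1). Each node's final position is its within-community
    # spring coordinate (roughly in [-1, 1]) plus its cell coordinate times
    # scale = 2, so the clusters land in separate, non-overlapping tiles.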
    # Create a figure
    fig = go.Figure()

    # Prepare lists for node positions, labels, ranks, downloads, likes, and params
    x, y, labels, ranks, downloads, likes, params = [], [], [], [], [], [], []
    node_communities = []  # Community ID per node, used to colour the markers

    # Prepare the node attributes
    for node, community in partition.items():
        # Get model info
        model_info = df_extra_filtered.iloc[node]

        # Node position
        x.append(pos[node][0])
        y.append(pos[node][1])

        # Node attributes
        labels.append(model_info['model_name'])
        ranks.append(model_info['rank'])
        downloads.append(model_info['downloads'])
        likes.append(model_info['likes'])
        params.append(
            model_info['params_millions']
            if pd.notnull(model_info['params_millions'])
            else 'N/A'
        )
        node_communities.append(community)

    # Compute the centroid of each cluster for background coloring
    centroids = dict()
    community_sizes = dict()  # Store the sizes of each community
    for community in set(partition.values()):
        nodes_in_community = [node for node, comm in partition.items() if comm == community]
        if len(nodes_in_community) > 1:  # Only consider communities with more than one node
            centroid_x = np.mean([pos[node][0] for node in nodes_in_community])
            centroid_y = np.mean([pos[node][1] for node in nodes_in_community])
            centroids[community] = (centroid_x, centroid_y)
            community_sizes[community] = len(nodes_in_community)

    # Add background coloring for each cluster
    for community, centroid in centroids.items():
        fig.add_trace(go.Scatter(
            x=[centroid[0]],
            y=[centroid[1]],
            mode='markers',
            marker=dict(
                size=community_sizes[community] * 5,  # Scale marker size with community size
                color=community,
                opacity=0.1,
            ),
            hoverinfo='none',
            showlegend=False,
        ))

    # Add nodes to the figure, one colour value per node. The <extra></extra>
    # suffix suppresses the secondary hover box Plotly would otherwise append.
    fig.add_trace(go.Scatter(
        x=x,
        y=y,
        mode='markers',
        marker=dict(size=3, color=node_communities),
        text=labels,
        customdata=np.stack((ranks, downloads, likes, params), axis=-1),
        hovertemplate=(
            "Model Name: %{text}<br>"
            "Rank: %{customdata[0]}<br>"
            "Downloads: %{customdata[1]}<br>"
            "Likes: %{customdata[2]}<br>"
            "Params (millions): %{customdata[3]}"
            "<extra></extra>"
        ),
    ))

    # Add edges to the figure, normalising each weight by the maximum weight
    # so the line widths stay in a visible range.
    max_weight = np.max(list(nx.get_edge_attributes(G, 'weight').values()))
    for edge in G.edges():
        line_width = G.edges[edge]['weight'] / max_weight
        fig.add_trace(go.Scatter(
            x=[pos[edge[0]][0], pos[edge[1]][0]],
            y=[pos[edge[0]][1], pos[edge[1]][1]],
            mode='lines',
            line=dict(width=line_width),
            hoverinfo='none',
        ))

    # Set the figure layout
    fig.update_layout(showlegend=False, hovermode='closest')
    st.plotly_chart(fig)

    # Calculate degree of each node
    degrees = dict(G.degree())

    # Sort nodes by degree in descending order and get top 20
    top_20_models = sorted(degrees.items(), key=lambda x: x[1], reverse=True)[:20]

    # Prepare data for display
    models = [df_extra_filtered.iloc[node]['model_name'] for node, degree in top_20_models]
    connection_counts = [degree for node, degree in top_20_models]

    st.subheader("Top 20 Models by Number of Connections")
    for model, count in zip(models, connection_counts):
        st.write(f"{model}: {count} connections")

    # Find the representative model for each community
    representatives = dict()
    for community in set(partition.values()):
        nodes_in_community = [node for node, comm in partition.items() if comm == community]
        # Select the node with the highest degree within the community as representative
        representative = max(nodes_in_community, key=lambda node: degrees[node])
        representatives[community] = df_extra_filtered.iloc[representative]['model_name']

    # Prepare data for display
    communities = list(representatives.keys())
    sizes = [community_sizes.get(comm, 1) for comm in communities]  # Default size of 1 for singleton communities
    representative_names = list(representatives.values())

    # Create a DataFrame to hold the data
    df_reps = pd.DataFrame({
        'Community ID': communities,
        'Size': sizes,
        'Representative Model': representative_names,
    })

    # Sort the DataFrame by community size in descending order
    df_reps.sort_values(by='Size', ascending=False, inplace=True)

    # Display in Streamlit
    st.subheader("Representative for each community, sorted by community size.")
    st.dataframe(df_reps)

    if wordClouds:
        groups = df_extra_filtered.groupby('cluster')
        for name, group in groups:
            # Join all model names in the cluster into a single string
            text = ' '.join(group['model_name'])

            # Generate a word cloud
            wordcloud = WordCloud().generate(text)

            # Convert WordCloud to Image
            image = wordcloud.to_image()

            # Display the word cloud
            st.image(image, use_column_width=True)
            st.write(f'Word Cloud for Cluster {name}')

    scatter_plot = create_downloads_vs_likes_scatter(df_extra_filtered)
    st.plotly_chart(scatter_plot, use_container_width=True)