gaodrew committed on
Commit 67dafee
1 Parent(s): 1d4a24a

Create app.py

Files changed (1)
  1. app.py +291 -0
app.py ADDED
@@ -0,0 +1,291 @@
+ import streamlit as st
+ import numpy as np
+ import pandas as pd
+ import warnings
+ warnings.filterwarnings('ignore')
+ import math
+ import itertools
+ from scipy.cluster.hierarchy import linkage
+ from scipy.spatial.distance import squareform
+ from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
+ from sklearn.metrics.pairwise import cosine_distances, cosine_similarity
+ from sklearn.cluster import AgglomerativeClustering
+ from community import community_louvain  # python-louvain package
+ import networkx as nx
+ from wordcloud import WordCloud
+ import plotly.figure_factory as ff
+ import plotly.graph_objects as go
+
+
+ def create_dendrogram(X, labels):
+     # ff.create_dendrogram builds the linkage itself, so pass the raw
+     # observation matrix and a linkage function rather than a precomputed
+     # linkage matrix (passing one would treat linkage rows as data points).
+     fig = ff.create_dendrogram(X.toarray(), orientation='left', labels=labels,
+                                linkagefun=lambda d: linkage(d, "single"))
+     return fig
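+ # NOTE: create_dendrogram is not called below; the button handler builds its
+ # dendrogram inline. A call would look like this sketch (same inputs as there):
+ #   fig = create_dendrogram(X, df_extra_filtered['model_name'].tolist())
+ #   st.plotly_chart(fig)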
+
+ @st.cache_data
+ def load_data():
+     data = pd.read_csv("HuggingFaceLLMsWithParamsAndReadmeLinks.csv")
+     return data
+
+ # Use the cached loader so Streamlit reruns don't re-read the CSV from disk
+ df = load_data()
+ st.title("Constellation: An Atlas of 15,000 Large Language Models")
+ st.write("15,821 to be precise. Scraped from Hugging Face on July 18, 2023.")
+ st.write("Please cite: Gao, S., & Gao, A. K. (2023, July 19). On the Origin of LLMs: "
+          "An Evolutionary Tree and Graph for 15,821 Large Language Models. arXiv. "
+          "https://doi.org/10.48550/arXiv.2307.09793")
+ threshold = st.number_input("Enter the minimum number of downloads an LLM must have to be considered.", value=10000)
+ numClusters = st.number_input("Number of clusters to group into.", value=20, min_value=2, max_value=50)
+ wordClouds = st.checkbox("Show word clouds?")
+
+ def create_downloads_vs_likes_scatter(dataframe):
+     # Convert 'likes' column to numeric values
+     dataframe['likes'] = pd.to_numeric(dataframe['likes'], errors='coerce')
+
+     # Filter out the outlier point at 14M likes
+     dataframe_filtered = dataframe[dataframe['likes'] != 14000000]
+
+     fig = go.Figure()
+     fig.add_trace(go.Scatter(x=dataframe_filtered['downloads'], y=dataframe_filtered['likes'], mode='markers',
+                              marker=dict(color='blue', size=7, opacity=0.7),
+                              text=dataframe_filtered['model_name'],
+                              hovertemplate="Model Name: %{text}<br>Downloads: %{x}<br>Likes: %{y}<extra></extra>"))
+     fig.update_layout(title='Downloads vs Likes',
+                       xaxis_title='Downloads',
+                       yaxis_title='Likes')
+     # Optional axis clamps: xaxis_range=[0, 300000], yaxis_range=[0, 800]
+     return fig
+
+
+ if st.button("Run Clustering"):
+     df_filtered = df[df['downloads'] > threshold]
+     # copy() so the 'cluster' column added later doesn't trigger pandas'
+     # SettingWithCopyWarning on a slice view
+     df_extra_filtered = df_filtered.drop_duplicates(subset='model_name', keep='first').copy()
+
+     # Convert the model names into a matrix of TF-IDF features
+     vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(2, 8))
+     X = vectorizer.fit_transform(df_extra_filtered['model_name'].tolist()).toarray()
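+     # With character n-grams of length 2-8, names that share fragments such as
+     # "llama" or "13b" (illustrative examples) get overlapping features, so
+     # cosine distance places fine-tunes of the same base model close together.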
+
+     # Function to compute the pairwise cosine distances
+     def distfun(X):
+         return cosine_distances(X)
+
+     # Function to compute the linkage matrix; scipy's linkage expects a
+     # condensed distance vector, so convert the square matrix first
+     # (checks=False tolerates floating-point noise on the diagonal)
+     def linkagefun(dist_array):
+         return linkage(squareform(dist_array, checks=False), "single")
+
+     # Create dendrogram
+     fig = ff.create_dendrogram(X, orientation='bottom', labels=df_extra_filtered['model_name'].tolist(),
+                                distfun=distfun, linkagefun=linkagefun)
+     # fig.update_layout(width=800, height=500)
+     st.plotly_chart(fig, use_container_width=True)
+
+     # Group by cluster
+     # Convert the model names into a matrix of token counts
+     vectorizer = CountVectorizer(analyzer='char', ngram_range=(3, 6))
+     X = vectorizer.fit_transform(df_extra_filtered['model_name'])
+     # Use clustering to group model names, honoring the cluster count chosen in the UI
+     clustering = AgglomerativeClustering(n_clusters=numClusters).fit(X.toarray())
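+     # Note: scikit-learn's AgglomerativeClustering defaults to Ward linkage on
+     # Euclidean distance over these raw count vectors; a cosine-based variant
+     # would need linkage='average' plus a cosine metric (parameter name varies
+     # by scikit-learn version).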
+
+     # Add cluster labels to the filtered DataFrame
+     df_extra_filtered['cluster'] = clustering.labels_
+
+     # Count the number of models in each cluster
+     cluster_counts = df_extra_filtered['cluster'].value_counts()
+
+     # Create a bar chart
+     fig = go.Figure([go.Bar(x=cluster_counts.index, y=cluster_counts.values)])
+     fig.update_layout(title='Number of Models per Cluster', xaxis_title='Cluster', yaxis_title='Number of Models')
+     st.plotly_chart(fig)
+
+     # graphing!
+
+     # Convert the model names into a matrix of TF-IDF features
+     vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(2, 8))
+     X = vectorizer.fit_transform(df_extra_filtered['model_name'])
+
+     # Compute the pairwise cosine similarities
+     sim_matrix = cosine_similarity(X)
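+     # cosine_similarity returns a dense n x n array, so memory grows
+     # quadratically with the number of models clearing the download threshold.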
+
+     # Create a graph
+     G = nx.Graph()
+
+     # Add nodes to the graph
+     for i in range(len(df_extra_filtered)):
+         G.add_node(i, label=df_extra_filtered['model_name'].iloc[i])
+
+     # Add edges to the graph
+     for i in range(len(df_extra_filtered)):
+         for j in range(i+1, len(df_extra_filtered)):
+             # Connect two models if their name similarity clears the threshold
+             if sim_matrix[i, j] > 0.2:
+                 G.add_edge(i, j, weight=sim_matrix[i, j])
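+     # The 0.2 similarity cutoff is a tuning knob: raising it keeps only
+     # near-duplicate names connected; lowering it links looser name families.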
+
+     # Compute the layout positions
+     pos = nx.spring_layout(G)
+
+     # Detect communities with Louvain modularity maximization;
+     # best_partition returns a dict mapping node index -> community id
+     partition = community_louvain.best_partition(G)
+
+     # Compute a per-community layout: lay out each community separately,
+     # then spread the communities out on a grid
+     layouts = {}
+     for community in set(partition.values()):
+         nodes_in_community = [node for node, comm in partition.items() if comm == community]
+         subgraph = G.subgraph(nodes_in_community)
+         layouts[community] = nx.spring_layout(subgraph)
+
+     # Combine the layouts, spreading them out on a grid
+     grid_size = math.ceil(math.sqrt(len(layouts)))  # Size of the grid
+     grid = np.array(list(itertools.product(range(grid_size), repeat=2)))  # Coordinates for the grid
+     scale = 2  # Scale factor for spreading out the communities
+     offsets = dict(zip(layouts, grid*scale))  # Map communities to grid coordinates
+
+     combined_layout = {}
+     for community, layout in layouts.items():
+         for node, position in layout.items():
+             combined_layout[node] = position + offsets[community]
+
+     # Note: the traces below plot with the global spring layout `pos`;
+     # `combined_layout` is an alternative that separates communities visually
+
+     # Create a figure
+     fig = go.Figure()
+
+     # Prepare lists for node positions, labels, colors, ranks, downloads, likes, and params
+     x, y, labels, node_colors, ranks, downloads, likes, params = [], [], [], [], [], [], [], []
+
+     # Prepare the node attributes
+     for node, community in partition.items():
+         # Get model info
+         model_info = df_extra_filtered.iloc[node]
+
+         # Node position
+         x.append(pos[node][0])
+         y.append(pos[node][1])
+
+         # Node attributes
+         labels.append(model_info['model_name'])
+         node_colors.append(community)  # remember each node's community for coloring
+         ranks.append(model_info['rank'])
+         downloads.append(model_info['downloads'])
+         likes.append(model_info['likes'])
+         params.append(model_info['params_millions'] if pd.notnull(model_info['params_millions']) else 'N/A')
+
+     # Compute the centroid of each community for background coloring
+     centroids = dict()
+     community_sizes = dict()  # Sizes of each community
+     for community in set(partition.values()):
+         nodes_in_community = [node for node, comm in partition.items() if comm == community]
+         if len(nodes_in_community) > 1:  # Only consider communities with more than one node
+             centroid_x = np.mean([pos[node][0] for node in nodes_in_community])
+             centroid_y = np.mean([pos[node][1] for node in nodes_in_community])
+             centroids[community] = (centroid_x, centroid_y)
+             community_sizes[community] = len(nodes_in_community)
+
+     # Add background coloring for each community
+     for community, centroid in centroids.items():
+         fig.add_trace(go.Scatter(
+             x=[centroid[0]], y=[centroid[1]],
+             mode='markers',
+             marker=dict(
+                 size=community_sizes[community]*5,  # Scale marker size with community size
+                 color=community,
+                 opacity=0.1
+             ),
+             hoverinfo='none',
+             showlegend=False
+         ))
+
+     # Add nodes to the figure, colored by community
+     fig.add_trace(go.Scatter(
+         x=x, y=y,
+         mode='markers',
+         marker=dict(size=3, color=node_colors),
+         text=labels,
+         customdata=np.stack((ranks, downloads, likes, params), axis=-1),
+         hovertemplate=(
+             "Model Name: %{text}<br>"
+             "Rank: %{customdata[0]}<br>"
+             "Downloads: %{customdata[1]}<br>"
+             "Likes: %{customdata[2]}<br>"
+             "Params (millions): %{customdata[3]}"
+             "<extra></extra>"
+         )
+     ))
+
+     # Add edges to the figure; compute the max weight once instead of per edge
+     max_weight = np.max(list(nx.get_edge_attributes(G, 'weight').values()))
+     for edge in G.edges():
+         # Normalize edge weight to (0, 1] for line width
+         line_width = G.edges[edge]['weight'] / max_weight
+
+         fig.add_trace(go.Scatter(
+             x=[pos[edge[0]][0], pos[edge[1]][0]],
+             y=[pos[edge[0]][1], pos[edge[1]][1]],
+             mode='lines',
+             line=dict(width=line_width),
+             hoverinfo='none'
+         ))
+
+     # Set the figure layout
+     fig.update_layout(showlegend=False, hovermode='closest')
+
+     st.plotly_chart(fig)
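+     # Performance note: adding one Scatter trace per edge slows Plotly down on
+     # dense graphs; a common alternative (not used here) is a single line trace
+     # whose coordinate arrays separate segments with None values.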
+
+     # Calculate degree of each node
+     degrees = dict(G.degree())
+
+     # Sort nodes by degree in descending order and get top 20
+     top_20_models = sorted(degrees.items(), key=lambda x: x[1], reverse=True)[:20]
+
+     # Prepare data for display
+     models = [df_extra_filtered.iloc[node]['model_name'] for node, degree in top_20_models]
+     connection_counts = [degree for node, degree in top_20_models]
+
+     st.subheader("Top 20 Models by Number of Connections")
+     for model, count in zip(models, connection_counts):
+         st.write(f"{model}: {count} connections")
+
+
+     # Find the representative model for each community
+     representatives = dict()
+     for community in set(partition.values()):
+         nodes_in_community = [node for node, comm in partition.items() if comm == community]
+         # Select the node with the highest degree within the community as representative
+         representative = max(nodes_in_community, key=lambda node: degrees[node])
+         representatives[community] = df_extra_filtered.iloc[representative]['model_name']
+
+     # Prepare data for display
+     communities = list(representatives.keys())
+     # Default to size 1 for singleton communities skipped in the centroid pass
+     sizes = [community_sizes.get(comm, 1) for comm in communities]
+     rep_names = list(representatives.values())
+
+     # Create a DataFrame to hold the data
+     df_reps = pd.DataFrame({
+         'Community ID': communities,
+         'Size': sizes,
+         'Representative Model': rep_names
+     })
+
+     # Sort the DataFrame by community size in descending order
+     df_reps.sort_values(by='Size', ascending=False, inplace=True)
+
+     # Display in Streamlit
+     st.subheader("Representative for each community, sorted by community size.")
+     st.dataframe(df_reps)
+     if wordClouds:
+         groups = df_extra_filtered.groupby('cluster')
+
+         for name, group in groups:
+             # Join all model names in the cluster into a single string
+             text = ' '.join(group['model_name'])
+
+             # Generate a word cloud
+             wordcloud = WordCloud().generate(text)
+
+             # Convert the WordCloud to a PIL image for display
+             image = wordcloud.to_image()
+
+             # Display the word cloud
+             st.image(image, use_column_width=True)
+             st.write(f'Word Cloud for Cluster {name}')
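+         # WordCloud's default tokenizer splits on punctuation, so a hyphenated
+         # name like "llama-2-7b" (illustrative) contributes fragments such as
+         # "llama" and "7b" rather than the full name.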
+
+     scatter_plot = create_downloads_vs_likes_scatter(df_extra_filtered)
+     st.plotly_chart(scatter_plot, use_container_width=True)