In [68]:
# All libraries used in this notebook
from sklearn.preprocessing import normalize as sklearn_normalize
from scipy.sparse import isspmatrix, csc_matrix
import networkx as nx
import numpy as np
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import seaborn as sns
from wordcloud import WordCloud
import gzip
import time

%matplotlib inline

Loading the graph

In [69]:
# Load the graph edges
def load_edges(file_path):
    edges = []
    with gzip.open(file_path, 'rt') as f:  # Open in text mode
        for line in f:
            source, target = map(int, line.strip().split())
            edges.append((source, target))
    return edges

# Load the page names
def load_page_names(file_path):
    page_names = {}
    with gzip.open(file_path, 'rt') as f:  # Open in text mode
        for line in f:
            line = line.strip()
            if not line:  # Skip empty lines
                continue
            parts = line.split(' ', 1)
            if len(parts) == 2:  # Make sure the line splits into exactly two parts
                page_id, page_name = parts
                page_names[int(page_id)] = page_name
    return page_names

# Paths to the data files
base_path = "data/big_graph"
edges_file = f"{base_path}/wiki-topcats.txt.gz"
page_names_file = f"{base_path}/wiki-topcats-page-names.txt.gz"

# Load the data
edges = load_edges(edges_file)
page_names = load_page_names(page_names_file)

# Build the graph
G = nx.Graph()
G.add_edges_from(edges)  # Add the edges

# Attach the page names as node attributes
nx.set_node_attributes(G, page_names, "name")

# Basic information about the graph
print(f"Number of nodes: {G.number_of_nodes()}")
print(f"Number of edges: {G.number_of_edges()}")

# Visualize a small part of the graph
subgraph = G.subgraph(list(G.nodes)[:100])  # Take a subgraph with 100 nodes
pos = nx.spring_layout(subgraph, seed=42)  # Layout for the visualization
plt.figure(figsize=(12, 8))
nx.draw(
    subgraph,
    pos,
    with_labels=True,
    labels={node: G.nodes[node]["name"] for node in subgraph.nodes},  # Use the page names as labels
    node_size=50,
    font_size=8,
    edge_color="gray",
)
plt.title("Subgraph of the first 100 nodes")
plt.show()
Number of nodes: 1791489
Number of edges: 25447873
[Figure: spring-layout drawing of the subgraph of the first 100 nodes]
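
To sanity-check the assumed file formats (one "source target" pair per line in the edge file; "id page-name" per line in the page-names file), a small helper can preview the first lines of each gzipped file. This is an illustrative sketch only; the peek helper is not used anywhere else in the pipeline.

In [ ]:
import gzip
from itertools import islice

def peek(path, n=3):
    """Print the first n lines of a gzipped text file."""
    with gzip.open(path, "rt") as f:
        for line in islice(f, n):
            print(line.rstrip())

peek(edges_file)       # expected lines of the form "<source_id> <target_id>"
peek(page_names_file)  # expected lines of the form "<page_id> <page name>"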
In [70]:
# Build a larger subgraph for clustering
subgraph = G.subgraph(list(G.nodes)[:1000])  # Take a subgraph with 1000 nodes
In [ ]:
# Enumerate the nodes in the subgraph
subgraph_node_list = list(subgraph.nodes())
print(f"Number of nodes: {len(subgraph_node_list)}")
print(f"Node List: {subgraph_node_list}")

# number of edges in the subgraph
print(f"Number of edges: {subgraph.number_of_edges()}")


# Convert the graph to a dense adjacency matrix
adj_matrix = nx.adjacency_matrix(subgraph).toarray()

# Save the adjacency matrix to a file (optional)
np.save("adj_matrix.npy", adj_matrix)

# Print the shape of the adjacency matrix
print(f"Adjacency Matrix Shape: {adj_matrix.shape}")

with open("subgraph_edge_list.txt", 'w') as f:
        for edge in subgraph.edges():
            f.write(f"{edge[0]} {edge[1]} 1.0\n")

Helper functions

In [72]:
def normalize(M):
    return sklearn_normalize(M, norm="l1", axis=0)
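
A quick illustrative check that normalize makes the matrix column-stochastic (every column sums to 1), which is what MCL requires:

In [ ]:
# Minimal sketch: column sums should all equal 1 after L1-normalizing along axis 0
demo = np.array([[1.0, 2.0], [3.0, 4.0]])
print(normalize(demo).sum(axis=0))  # expected: [1. 1.]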
In [73]:
def get_clusters(matrix):
    """
    Retrieve the clusters from the matrix
    
    :param matrix: The matrix produced by the MCL algorithm
    :returns: A list of tuples where each tuple represents a cluster and
              contains the indices of the nodes belonging to the cluster
    """

    if not isspmatrix(matrix):
        # Convert dense matrix to sparse to use sparse-specific methods
        matrix = csc_matrix(matrix)


    # get the attractors - non-zero elements of the matrix diagonal
    attractors = matrix.diagonal().nonzero()[0]
    print("Attractors:", attractors)

    # somewhere to put the clusters
    clusters = set()

    # the nodes in the same row as each attractor form a cluster
    for attractor in attractors:
        cluster = tuple(matrix.getrow(attractor).nonzero()[1].tolist())
        clusters.add(cluster)

    return sorted(list(clusters))
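
A small illustration on a toy matrix (not produced by the pipeline): rows 0 and 2 below have non-zero diagonal entries, so they are attractors, and the non-zeros in each attractor row form one cluster.

In [ ]:
# Toy converged matrix with attractors at rows 0 and 2
# (get_clusters also prints the attractor indices)
toy = np.array([
    [1.0, 1.0, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.0],
    [0.0, 0.0, 1.0, 1.0],
    [0.0, 0.0, 0.0, 0.0],
])
print(get_clusters(toy))  # expected: [(0, 1), (2, 3)]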

Main algorithm
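
MCL alternates two operations on a column-stochastic matrix $M$ until it stops changing: expansion, $M \leftarrow M^2$, which spreads flow along random walks of the graph, and inflation, $M_{ij} \leftarrow M_{ij}^r / \sum_k M_{kj}^r$ (entry-wise power $r$ followed by column re-normalization), which strengthens intra-cluster flow and weakens inter-cluster flow. The implementation below detects convergence when the Frobenius norm $\lVert M_{t+1} - M_t \rVert_F$ falls below epsilon.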

In [74]:
def mcl(M, inflation, max_iter=10, epsilon=1e-4, verbose=0):
    """
    Perform Markov Clustering (MCL) on a graph given as a dense adjacency matrix.

    Args:
        M (array-like): Adjacency matrix of the graph.
        inflation (float): Inflation parameter for the MCL algorithm.
        max_iter (int): Maximum number of iterations to perform.
        epsilon (float): Convergence threshold for the Frobenius norm.
        verbose (int): Verbosity level (0 = silent, 1 = progress, 2 = also timings).

    Returns:
        tuple: The final matrix after MCL processing and the list of
               per-iteration running times.
    """
    print("Starting MCL algorithm") if verbose >= 1 else None
    # Into numpy
    M = np.array(M, dtype=float)
    # Initialization
    M = normalize(M)

    convergence = False
    previous_M = M
    i = 0
    total_time = 0  # To keep track of total time across all iterations
    iteration_times = []

    while i < max_iter and not convergence:
        print(f"Iteration {i + 1} started") if verbose >= 1 else None
        iteration_start_time = time.time()

        # Expansion
        expansion_start_time = time.time()
        M = np.matmul(M, M)
        expansion_time = time.time() - expansion_start_time
        print(f"Expansion took {expansion_time} seconds") if verbose == 2 else None

        # Inflation
        inflation_start_time = time.time()
        M = np.power(M, inflation)
        M = normalize(M)
        inflation_time = time.time() - inflation_start_time
        print(f"Inflation took {inflation_time} seconds") if verbose >= 2 else None

        # Check convergence using Frobenius norm
        diff_norm = np.linalg.norm(previous_M - M)
        print(f"Frobenius norm difference = {diff_norm}") if verbose >= 1 else None

        convergence = diff_norm < epsilon
        if convergence:
            print(f"Convergence achieved at iteration {i + 1}")

        previous_M = M
        iteration_end_time = time.time()
        iteration_time = iteration_end_time - iteration_start_time
        total_time += iteration_time
        print(f"Iteration {i + 1}: Total iteration time = {iteration_time} seconds") if verbose >= 1 else None
        iteration_times.append(iteration_time)

        i += 1

    # Log total time
    print("Total time for MCL processing: %.6f seconds", total_time) if verbose >= 1 else None

    # Log final matrix
    print("Final matrix:\n%s", M) if verbose >= 2 else None

    return M, iteration_times

Running the algorithm

In [ ]:
# Run the algorithm with different inflation values
inflation_values = [1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0]

my_final_matrices = {}
for inflation in inflation_values:
    print("Running MCL with inflation = %.2f", inflation)
    final_matrix, _ = mcl(adj_matrix, inflation=inflation, max_iter=200, epsilon=1e-7, verbose=2)
    my_final_matrices[inflation] = final_matrix

Extracting the clusters

In [ ]:
# Extract clusters for each inflation value
clusters = {}
for inflation, matrix in my_final_matrices.items():
    print("Extracting clusters for inflation =", inflation)
    clusters[inflation] = get_clusters(matrix)
In [77]:
# Save the clusters to files
for inflation, cluster_ids in clusters.items():
    with open(f"clusters_inf_{inflation}.txt", "w") as f:
        for cluster in cluster_ids:
            f.write(" ".join(map(str, cluster)) + "\n")

Modularity
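
The implementation below computes Newman's modularity summed over clusters: with $m$ the total number of edges of the graph, $L_c$ the number of edges inside cluster $c$, and $k_c$ the sum of the degrees of the nodes in $c$,

$$Q = \sum_c \left[ \frac{L_c}{m} - \left( \frac{k_c}{2m} \right)^2 \right]$$

Higher values indicate stronger community structure; $Q \approx 0$ means the clustering is no better than a random split.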

In [78]:
def global_modularity(clusters, G):
    m = G.number_of_edges()  #Total number of edges in the graph

    #Map the matrix indices (0...999) back to actual node ids
    clusters = [{subgraph_node_list[node] for node in node_set} for node_set in clusters]
    
    #Modularity initialized to 0
    Q = 0

    #Iterate over each cluster
    for cluster in clusters:

        cluster_nodes = list(cluster) #convert the set to a list

        #Subgraph containing only the nodes and edges of the cluster
        subgraph = G.subgraph(cluster_nodes)

        #Number of edges inside the cluster
        L_c = subgraph.number_of_edges()

        #Sum of the degrees of the nodes in the cluster (all the edges that are connected to the nodes in the cluster)
        k_c = sum(G.degree(n) for n in cluster_nodes)

        #Add the modularity for the cluster
        Q += (L_c / m) - (k_c / (2 * m)) ** 2

    #Return the global modularity
    return Q
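
As an optional cross-check (a sketch only: networkx's built-in modularity requires the clusters to form an exact partition of the nodes, which MCL's possibly overlapping clusters do not guarantee), the value can be compared against networkx's implementation:

In [ ]:
# Hedged sanity check against networkx's modularity; only valid when the
# clusters partition the subgraph's nodes exactly
from networkx.algorithms.community import modularity as nx_modularity

id_clusters = [{subgraph_node_list[i] for i in c} for c in clusters[1.5]]
if (set().union(*id_clusters) == set(subgraph.nodes)
        and sum(len(c) for c in id_clusters) == subgraph.number_of_nodes()):
    print("networkx modularity:", nx_modularity(subgraph, id_clusters))
else:
    print("Clusters overlap or do not cover every node; skipping the cross-check.")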
In [79]:
modularity = []

for inflation, cluster_list in clusters.items():
    modularity_value = global_modularity(cluster_list, subgraph)
    modularity.append((inflation, modularity_value, len(cluster_list)))

# Print the modularity values
for inflation, mod_value, cluster_size in modularity:
    print(f"Inflation: {inflation}, Modularity: {mod_value}, Number of clusters: {cluster_size}")
Inflation: 1.1, Modularity: 0.00021600602547791105, Number of clusters: 3
Inflation: 1.2, Modularity: 0.00021600602547791105, Number of clusters: 3
Inflation: 1.3, Modularity: 0.06857514320408123, Number of clusters: 8
Inflation: 1.4, Modularity: 0.08257000154633522, Number of clusters: 11
Inflation: 1.5, Modularity: 0.25996611135555325, Number of clusters: 21
Inflation: 1.6, Modularity: 0.2482154349273036, Number of clusters: 29
Inflation: 1.7, Modularity: 0.2441235529253749, Number of clusters: 37
Inflation: 1.8, Modularity: 0.25114909462824075, Number of clusters: 54
Inflation: 1.9, Modularity: 0.2378177433927788, Number of clusters: 73
Inflation: 2.0, Modularity: 0.2312409633705325, Number of clusters: 86
In [80]:
# Unpack the modularity list into separate lists for plotting
inflation_values, modularity_values, cluster_counts = zip(*modularity)

fig, ax1 = plt.subplots(figsize=(10, 6))

color = 'tab:blue'
ax1.set_xlabel('Inflation')
ax1.set_ylabel('Modularity', color=color)
ax1.plot(inflation_values, modularity_values, color=color)
ax1.tick_params(axis='y', labelcolor=color)

ax2 = ax1.twinx()  # instantiate a second axes that shares the same x-axis

color = 'tab:red'
ax2.set_ylabel('Number of Clusters', color=color)  # we already handled the x-label with ax1
ax2.plot(inflation_values, cluster_counts, color=color)
ax2.tick_params(axis='y', labelcolor=color)

fig.tight_layout()  # otherwise the right y-label is slightly clipped
plt.title("Modularity and Number of Clusters vs Inflation")
sns.despine(fig=fig, right=False, left=False)
plt.savefig(
    "Modularity_and_Number_of_Clusters_vs_Inflation.pdf",  # Nom du fichier
    format="pdf",             # Format PDF
    dpi=300,                  # Résolution haute (300 DPI standard)
    bbox_inches="tight",      # Ajuste les marges autour de la figure
    pad_inches=0.05           # Réduit l'espace ajouté autour de la figure
)
plt.show()
[Figure: Modularity and Number of Clusters vs Inflation — modularity in blue (left axis), cluster count in red (right axis)]

Notice that modularity peaks (≈ 0.26) when the inflation is set to 1.5.
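
The best value can also be selected programmatically from the modularity list (a small convenience snippet, not part of the original analysis):

In [ ]:
# Pick the (inflation, modularity, cluster count) triple with the highest modularity
best_inflation, best_mod, n_clusters = max(modularity, key=lambda t: t[1])
print(f"Best inflation: {best_inflation} (modularity = {best_mod:.4f}, {n_clusters} clusters)")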

Visualizing the clustering of the graph

In [81]:
def visualize_clusters_plotly(graph, clusters, node_list, title="Plotly-visualisation"):
    #Use Plotly to get an interactive graph
    pos = nx.spring_layout(graph, seed=42)  #Fixed seed so the node layout is reproducible
    edge_x = []
    edge_y = []

    #Draw the edges
    for edge in graph.edges():
        x0, y0 = pos[edge[0]]
        x1, y1 = pos[edge[1]]
        edge_x.extend([x0, x1, None])
        edge_y.extend([y0, y1, None])
    
    edge_trace = go.Scatter(
        x=edge_x, y=edge_y,
        line=dict(width=0.5, color="#888"),
        hoverinfo="none",
        mode="lines"
    )

    #Draw nodes
    node_traces = []
    colors = plt.get_cmap("hsv", len(clusters))  #One distinct color per cluster ("tab20" also works)
    
    #Draw the nodes, colored the same for each cluster
    for i, cluster in enumerate(clusters):
        print(f"Cluster {i + 1} has {len(cluster)} nodes")
        node_x = []
        node_y = []
        for node in cluster:
            x, y = pos[node_list[node]]
            node_x.append(x)
            node_y.append(y)
        
        node_trace = go.Scatter(
            x=node_x, y=node_y,
            mode="markers",
            marker=dict(
                size=10,
                color=f"rgba({int(colors(i)[0]*255)}, {int(colors(i)[1]*255)}, {int(colors(i)[2]*255)}, 0.8)",
                line_width=2
            ),
            name=f"Cluster {i + 1}"
        )
        node_traces.append(node_trace)
    
    #Create figure
    fig = go.Figure(data=[edge_trace] + node_traces,
                    layout=go.Layout(
                        title=dict(text=title, font=dict(size=16)),
                        showlegend=True,
                        hovermode="closest",
                        margin=dict(b=0, l=0, r=0, t=40),
                        xaxis=dict(showgrid=False, zeroline=False),
                        yaxis=dict(showgrid=False, zeroline=False)
                    ))
    fig.show(renderer="browser")
In [82]:
# Visualize the clusters using Plotly
visualize_clusters_plotly(subgraph, clusters[1.5], subgraph_node_list, title="MCL Clusters")
Cluster 1 has 1 nodes
Cluster 2 has 1 nodes
Cluster 3 has 589 nodes
Cluster 4 has 34 nodes
Cluster 5 has 297 nodes
Cluster 6 has 1 nodes
Cluster 7 has 1 nodes
Cluster 8 has 1 nodes
Cluster 9 has 5 nodes
Cluster 10 has 14 nodes
Cluster 11 has 1 nodes
Cluster 12 has 5 nodes
Cluster 13 has 44 nodes
Cluster 14 has 1 nodes
Cluster 15 has 1 nodes
Cluster 16 has 1 nodes
Cluster 17 has 3 nodes
Cluster 18 has 2 nodes
Cluster 19 has 2 nodes
Cluster 20 has 1 nodes
Cluster 21 has 1 nodes
In [1]:
# display the clusters image
from IPython.display import Image
Image(filename='clusters.png')
Out[1]:
[Image: clusters.png — Plotly visualization of the MCL clusters]

Generating word clouds

In [83]:
import re
def create_and_generate_wordclouds_from_graph(graph, clusters):
    """
    Creates a dictionary of clusters with their associated node names (from the graph) 
    and generates Word Clouds for each cluster.

    Parameters:
        graph (networkx.Graph): The graph with node names as attributes.
        clusters (list of sets): A list of clusters where each cluster is a set of node IDs.

    Returns:
        dictionary_clusters (dict): A dictionary where keys are cluster IDs and values are lists of node names.
    """
    # sort by length
    clusters = sorted(clusters, key=len, reverse=True)

    clusters = clusters[:7]  # Limit to the first 7 clusters
    # Step 1: Create the dictionary of clusters
    dictionary_clusters = {}
    for i, cluster in enumerate(clusters):
        cluster_list = [graph.nodes[subgraph_node_list[node]]["name"] for node in cluster if "name" in graph.nodes[subgraph_node_list[node]]]
        dictionary_clusters[i] = cluster_list

    # Step 2: Generate Word Clouds for each cluster
    for cluster_id, cluster_phrases in dictionary_clusters.items():
        # Combine all phrases from the cluster
        text = ' '.join(cluster_phrases).lower()
        #text = ' '.join(set(text.split()))  # Optionally remove duplicate words
        text = re.sub(r'[^\w\s]', '', text)  # Strip punctuation and other non-alphanumeric characters

        print(f"Cluster {cluster_id + 1}: {text[:100]}...")  # Preview the first 100 characters

        # Generate the Word Cloud
        wordcloud = WordCloud(max_words=6, background_color="white").generate(text)

        # Display the Word Cloud
        plt.figure()
        plt.imshow(wordcloud, interpolation="bilinear")
        plt.axis("off")
        plt.title(f"Word Cloud for Cluster {cluster_id + 1}")
        plt.show()

    return dictionary_clusters
In [84]:
create_and_generate_wordclouds_from_graph(subgraph, clusters[1.5])
Cluster 1: pinakion menallen township pennsylvania missouri route 117 jadwin missouri gladden missouri missouri...
[Word cloud for Cluster 1]
Cluster 2: missouri route 117 missouri route 119 missouri route 68 lecoma missouri missouri route 12 missouri r...
[Word cloud for Cluster 2]
Cluster 3: 13th century in literature codex gigas huon of bordeaux bertrand de barsuraube doon de mayence roger...
[Word cloud for Cluster 3]
Cluster 4: lyndonhochschildserre spectral sequence group mathematics normal subgroup zariskis main theorem fult...
[Word cloud for Cluster 4]
Cluster 5: cretien van campen multimodal integration scp visual perception simon baroncohen marcel proust cogni...
[Word cloud for Cluster 5]
Cluster 6: alghurair group saif ahmad al ghurair manufacturing electric power retail...
[Word cloud for Cluster 6]
Cluster 7: dick thoenen 1967 philadelphia phillies season innings pitched earned run average philadelphia phill...
[Word cloud for Cluster 7]
Out[84]: