In [68]:
from sklearn.preprocessing import normalize as sklearn_normalize
from scipy.sparse import isspmatrix, csc_matrix
import networkx as nx
import numpy as np
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import time
#List of all librairies to import
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np
from wordcloud import WordCloud
import plotly.graph_objects as go
%matplotlib inline
import gzip
import seaborn as sns
Loading the graph
In [69]:
# Charger les arêtes du graphe
def load_edges(file_path):
edges = []
with gzip.open(file_path, 'rt') as f: # Ouvrir en mode texte
for line in f:
source, target = map(int, line.strip().split())
edges.append((source, target))
return edges
# Charger les noms des pages
def load_page_names(file_path):
page_names = {}
with gzip.open(file_path, 'rt') as f: # Ouvrir en mode texte
for line in f:
line = line.strip()
if not line: # Ignorer les lignes vides
continue
parts = line.split(' ', 1)
if len(parts) == 2: # Vérifier que la ligne contient bien deux parties
page_id, page_name = parts
page_names[int(page_id)] = page_name
return page_names
# Chemins vers les fichiers
base_path = "data/big_graph"
edges_file = f"{base_path}/wiki-topcats.txt.gz"
page_names_file = f"{base_path}/wiki-topcats-page-names.txt.gz"
# Charger les données
edges = load_edges(edges_file)
page_names = load_page_names(page_names_file)
# Créer le graphe
G = nx.Graph()
G.add_edges_from(edges) # Ajouter les arêtes
# Ajouter les noms des pages comme attributs des nœuds
nx.set_node_attributes(G, page_names, "name")
# Informations sur le graphe
print(f"Nombre de nœuds : {G.number_of_nodes()}")
print(f"Nombre d'arêtes : {G.number_of_edges()}")
# Visualiser une sous-partie du graphe
subgraph = G.subgraph(list(G.nodes)[:100]) # Prenez un sous-graphe avec 100 nœuds
pos = nx.spring_layout(subgraph, seed=42) # Layout pour la visualisation
plt.figure(figsize=(12, 8))
nx.draw(
subgraph,
pos,
with_labels=True,
labels={node: G.nodes[node]["name"] for node in subgraph.nodes}, # Utiliser les noms des pages
node_size=50,
font_size=8,
edge_color="gray",
)
plt.title("Sous-graphe des 100 premiers nœuds")
plt.show()
Nombre de nœuds : 1791489 Nombre d'arêtes : 25447873
In [70]:
# Visualiser une sous-partie du graphe
subgraph = G.subgraph(list(G.nodes)[:1000]) # Prenez un sous-graphe avec 100 nœuds
In [ ]:
# enumerate the nodes in the graph
subgraph_node_list = list(subgraph.nodes())
print(f"Number of nodes: {len(subgraph_node_list)}")
print(f"Node List: {subgraph_node_list}")
# number of edges in the subgraph
print(f"Number of edges: {subgraph.number_of_edges()}")
# Convert the graph to a dense adjacency matrix
adj_matrix = nx.adjacency_matrix(subgraph).toarray()
# Save the adjacency matrix to a file (optional)
np.save("adj_matrix.npy", adj_matrix)
# Print the shape of the adjacency matrix
print(f"Adjacency Matrix Shape: {adj_matrix.shape}")
with open("subgraph_edge_list.txt", 'w') as f:
for edge in subgraph.edges():
f.write(f"{edge[0]} {edge[1]} 1.0\n")
Helper functions
In [72]:
def normalize(M):
return sklearn_normalize(M, norm="l1", axis=0)
In [73]:
def get_clusters(matrix):
"""
Retrieve the clusters from the matrix
:param matrix: The matrix produced by the MCL algorithm
:returns: A list of tuples where each tuple represents a cluster and
contains the indices of the nodes belonging to the cluster
"""
if not isspmatrix(matrix):
# Convert dense matrix to sparse to use sparse-specific methods
matrix = csc_matrix(matrix)
# get the attractors - non-zero elements of the matrix diagonal
attractors = matrix.diagonal().nonzero()[0]
print("Attractors:", attractors)
# somewhere to put the clusters
clusters = set()
# the nodes in the same row as each attractor form a cluster
for attractor in attractors:
cluster = tuple(matrix.getrow(attractor).nonzero()[1].tolist())
clusters.add(cluster)
return sorted(list(clusters))
Main algorithm
In [74]:
def mcl(M, inflation, max_iter=10, epsilon=1e-4, verbose=0):
"""
Perform Markov Clustering (MCL) on a graph represented by an edge list file.
Args:
file_path (str): Path to the file containing the edge list.
inflation (float): Inflation parameter for the MCL algorithm.
max_iter (int): Maximum number of iterations to perform.
epsilon (float): Convergence threshold for the Frobenius norm.
Returns:
numpy.ndarray: The final matrix after MCL processing.
"""
print("Starting MCL algorithm") if verbose >= 1 else None
# Into numpy
M = np.array(M, dtype=float)
# Initialization
M = normalize(M)
convergence = False
previous_M = M
i = 0
total_time = 0 # To keep track of total time across all iterations
iteration_times = []
while i < max_iter and not convergence:
print(f"Iteration {i + 1} started") if verbose >= 1 else None
iteration_start_time = time.time()
# Expansion
expansion_start_time = time.time()
M = np.matmul(M, M)
expansion_time = time.time() - expansion_start_time
print(f"Expansion took {expansion_time} seconds") if verbose == 2 else None
# Inflation
inflation_start_time = time.time()
M = np.power(M, inflation)
M = normalize(M)
inflation_time = time.time() - inflation_start_time
print(f"Inflation took {inflation_time} seconds") if verbose >= 2 else None
# Check convergence using Frobenius norm
diff_norm = np.linalg.norm(previous_M - M)
print(f"Frobenius norm difference = {diff_norm}") if verbose >= 1 else None
convergence = diff_norm < epsilon
if convergence:
print(f"Convergence achieved at iteration {i + 1}")
previous_M = M
iteration_end_time = time.time()
iteration_time = iteration_end_time - iteration_start_time
total_time += iteration_time
print(f"Iteration {i + 1}: Total iteration time = {iteration_time} seconds") if verbose >= 1 else None
iteration_times.append(iteration_time)
i += 1
# Log total time
print("Total time for MCL processing: %.6f seconds", total_time) if verbose >= 1 else None
# Log final matrix
print("Final matrix:\n%s", M) if verbose >= 2 else None
return M, iteration_times
Runnning the algorithm
In [ ]:
# running the algo with differen inflation values
inflation_values = [1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0]
my_final_matrices = {}
for inflation in inflation_values:
print("Running MCL with inflation = %.2f", inflation)
final_matrix, _ = mcl(adj_matrix, inflation=inflation, max_iter=200, epsilon=1e-7, verbose=2)
my_final_matrices[inflation] = final_matrix
Extracting the clusters
In [ ]:
# Extract clusters for each inflation value
clusters = {}
for inflation, matrix in my_final_matrices.items():
print("Extracting clusters for inflation =", inflation)
clusters[inflation] = get_clusters(matrix)
In [77]:
# Save the clusters to files
for inflation, cluster_ids in clusters.items():
with open(f"clusters_inf_{inflation}.txt", "w") as f:
for cluster in cluster_ids:
f.write(" ".join(map(str, cluster)) + "\n")
Modularity
In [78]:
def global_modularity(clusters, G):
m = G.number_of_edges() #Total number of edges in the graph
#Transform the list of sets from 0...999 into actual node ids
clusters = [{subgraph_node_list[node] for node in node_set} for node_set in clusters]
#Modularity initialized to 0
Q = 0
#Iterate over each cluster
for cluster in clusters:
cluster_nodes = list(cluster) #convert the set to a list
#Subgraph containing only the nodes and edges of the cluster
subgraph = G.subgraph(cluster_nodes)
#Number of edges inside the cluster
L_c = subgraph.number_of_edges()
#Sum of the degrees of the nodes in the cluster (all the edges that are connected to the nodes in the cluster)
k_c = sum(G.degree(n) for n in cluster_nodes)
#Add the modularity for the cluster
Q += (L_c / m) - (k_c / (2 * m)) ** 2
#Return the global modularity
return Q
In [79]:
modularity = []
for inflation, cluster_list in clusters.items():
modularity_value = global_modularity(cluster_list, subgraph)
modularity.append((inflation, modularity_value, len(cluster_list)))
# Print the modularity values
for inflation, mod_value, cluster_size in modularity:
print(f"Inflation: {inflation}, Modularity: {mod_value}, Number of clusters: {cluster_size}")
Inflation: 1.1, Modularity: 0.00021600602547791105, Number of clusters: 3 Inflation: 1.2, Modularity: 0.00021600602547791105, Number of clusters: 3 Inflation: 1.3, Modularity: 0.06857514320408123, Number of clusters: 8 Inflation: 1.4, Modularity: 0.08257000154633522, Number of clusters: 11 Inflation: 1.5, Modularity: 0.25996611135555325, Number of clusters: 21 Inflation: 1.6, Modularity: 0.2482154349273036, Number of clusters: 29 Inflation: 1.7, Modularity: 0.2441235529253749, Number of clusters: 37 Inflation: 1.8, Modularity: 0.25114909462824075, Number of clusters: 54 Inflation: 1.9, Modularity: 0.2378177433927788, Number of clusters: 73 Inflation: 2.0, Modularity: 0.2312409633705325, Number of clusters: 86
In [80]:
# Unpack the modularity list into separate lists for plotting
inflation_values, modularity_values, cluster_counts = zip(*modularity)
fig, ax1 = plt.subplots(figsize=(10, 6))
color = 'tab:blue'
ax1.set_xlabel('Inflation')
ax1.set_ylabel('Modularity', color=color)
ax1.plot(inflation_values, modularity_values, color=color)
ax1.tick_params(axis='y', labelcolor=color)
ax2 = ax1.twinx() # instantiate a second axes that shares the same x-axis
color = 'tab:red'
ax2.set_ylabel('Number of Clusters', color=color) # we already handled the x-label with ax1
ax2.plot(inflation_values, cluster_counts, color=color)
ax2.tick_params(axis='y', labelcolor=color)
fig.tight_layout() # otherwise the right y-label is slightly clipped
plt.title("Modularity and Number of Clusters vs Inflation")
sns.despine(fig=fig, right=False, left=False)
plt.savefig(
"Modularity_and_Number_of_Clusters_vs_Inflation.pdf", # Nom du fichier
format="pdf", # Format PDF
dpi=300, # Résolution haute (300 DPI standard)
bbox_inches="tight", # Ajuste les marges autour de la figure
pad_inches=0.05 # Réduit l'espace ajouté autour de la figure
)
plt.show()
Notice that we have the highest modularity when inflation is set to 1.5.
Vizualizing the clustering of the graph
In [81]:
def visualize_clusters_plotly(graph, clusters, node_list, title="Plotly-visualisation"):
#We use plotly now to get interactive graphs
pos = nx.spring_layout(graph, seed=42) # We want to generate a disposition of the nodes that is always the same
print(len(pos))
print(pos.keys())
edge_x = []
edge_y = []
#Draw the edges
for edge in graph.edges():
x0, y0 = pos[edge[0]]
x1, y1 = pos[edge[1]]
edge_x.extend([x0, x1, None])
edge_y.extend([y0, y1, None])
edge_trace = go.Scatter(
x=edge_x, y=edge_y,
line=dict(width=0.5, color="#888"),
hoverinfo="none",
mode="lines"
)
#Draw nodes
node_traces = []
colors = plt.cm.get_cmap("hsv", len(clusters))
#colors = plt.cm.get_cmap("tab20", len(clusters))
#Draw the nodes, colored the same for each cluster
for i, cluster in enumerate(clusters):
print(f"Cluster {i + 1} has {len(cluster)} nodes")
node_x = []
node_y = []
for node in cluster:
x, y = pos[subgraph_node_list[node]]
node_x.append(x)
node_y.append(y)
node_trace = go.Scatter(
x=node_x, y=node_y,
mode="markers",
marker=dict(
size=10,
color=f"rgba({int(colors(i)[0]*255)}, {int(colors(i)[1]*255)}, {int(colors(i)[2]*255)}, 0.8)",
line_width=2
),
name=f"Cluster {i + 1}"
)
node_traces.append(node_trace)
#Create figure
fig = go.Figure(data=[edge_trace] + node_traces,
layout=go.Layout(
title=title,
titlefont_size=16,
showlegend=True,
hovermode="closest",
margin=dict(b=0, l=0, r=0, t=40),
xaxis=dict(showgrid=False, zeroline=False),
yaxis=dict(showgrid=False, zeroline=False)
))
fig.show(renderer="browser")
In [82]:
# Visualize the clusters using Plotly
visualize_clusters_plotly(subgraph, clusters[1.5], subgraph_node_list, title="MCL Clusters")
Cluster 1 has 1 nodes Cluster 2 has 1 nodes Cluster 3 has 589 nodes Cluster 4 has 34 nodes Cluster 5 has 297 nodes Cluster 6 has 1 nodes Cluster 7 has 1 nodes Cluster 8 has 1 nodes Cluster 9 has 5 nodes Cluster 10 has 14 nodes Cluster 11 has 1 nodes Cluster 12 has 5 nodes Cluster 13 has 44 nodes Cluster 14 has 1 nodes Cluster 15 has 1 nodes Cluster 16 has 1 nodes Cluster 17 has 3 nodes Cluster 18 has 2 nodes Cluster 19 has 2 nodes Cluster 20 has 1 nodes Cluster 21 has 1 nodes
In [1]:
# display the clusters image
from IPython.display import Image
Image(filename='clusters.png')
Out[1]:
Generating wordclouds
In [83]:
import re
def create_and_generate_wordclouds_from_graph(graph, clusters):
"""
Creates a dictionary of clusters with their associated node names (from the graph)
and generates Word Clouds for each cluster.
Parameters:
graph (networkx.Graph): The graph with node names as attributes.
clusters (list of sets): A list of clusters where each cluster is a set of node IDs.
Returns:
dictionary_clusters (dict): A dictionary where keys are cluster IDs and values are lists of node names.
"""
# sort by length
clusters = sorted(clusters, key=len, reverse=True)
clusters = clusters[:7] # Limit to the first 7 clusters
# Step 1: Create the dictionary of clusters
dictionary_clusters = {}
for i, cluster in enumerate(clusters):
cluster_list = [graph.nodes[subgraph_node_list[node]]["name"] for node in cluster if "name" in graph.nodes[subgraph_node_list[node]]]
dictionary_clusters[i] = cluster_list
# Step 2: Generate Word Clouds for each cluster
for cluster_id, cluster_phrases in dictionary_clusters.items():
# Combine all phrases from the cluster
text = ' '.join(cluster_phrases).lower()
#text = ' '.join(set(text.split())) # Remove duplicates
text = re.sub(r'[^\w\s]', '', text) # Remove non-alphanumeric characters
text = text.lower()
print(f"Cluster {cluster_id + 1}: {text[:100]}...") # Preview the first 100 characters
# Generate the Word Cloud
wordcloud = WordCloud(max_words=6, background_color="white").generate(text)
# Display the Word Cloud
plt.figure()
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.title(f"Word Cloud for Cluster {cluster_id + 1}")
plt.show()
return dictionary_clusters
In [84]:
create_and_generate_wordclouds_from_graph(subgraph, clusters[1.5])
Cluster 1: pinakion menallen township pennsylvania missouri route 117 jadwin missouri gladden missouri missouri...
Cluster 2: missouri route 117 missouri route 119 missouri route 68 lecoma missouri missouri route 12 missouri r...
Cluster 3: 13th century in literature codex gigas huon of bordeaux bertrand de barsuraube doon de mayence roger...
Cluster 4: lyndonhochschildserre spectral sequence group mathematics normal subgroup zariskis main theorem fult...
Cluster 5: cretien van campen multimodal integration scp visual perception simon baroncohen marcel proust cogni...
Cluster 6: alghurair group saif ahmad al ghurair manufacturing electric power retail...
Cluster 7: dick thoenen 1967 philadelphia phillies season innings pitched earned run average philadelphia phill...
Out[84]: