# #!/usr/bin/env python # # coding: utf-8 # # # Network Analysis # # In[2]: import networkx as nx from networkx.drawing.nx_agraph import graphviz_layout import matplotlib as mpl mpl.use('Agg') import matplotlib.pyplot as plt import pandas as pd import numpy as np import dzcnapy_plotlib as dzcnapy import csv import math import collections as coll from networkx.algorithms import community as com import community as lou # Importazione dataset with open("dataset.csv") as infile: csv_reader = csv.reader(infile) G = nx.Graph(csv_reader) # # ## Nodes, Edges, Density # # In[8]: # # Nodi, Archi, Densità # numNodes = G.number_of_nodes() # numEdges = G.number_of_edges() # allEdges = int(numNodes * (numNodes-1) / 2) # density = nx.density(G) * 100 # # constants # N = numNodes # M = int(numEdges / numNodes) # print("#Nodes:", numNodes) # print("#Edges:", numEdges) # print("#Edges if graph was a full mesh:", allEdges) # print("Density: %.2f%%" % density) # # ## Detecting and removing self-loops # # In[3]: # ns = G.number_of_selfloops() # print("#Self-loops:", ns) # if ns > 0: # # removing self-loops # G.remove_edges_from(G.selfloop_edges()) # print("#Edges without self-loops:", G.number_of_edges()) # # ## Detecting and removing isolates # # In[4]: # ni = nx.number_of_isolates(G) # print("#isolates:", ni) # if ni > 0: # # remove isolates # G.remove_nodes_from(nx.isolates(G)) # print("#Nodes without isolates", G.number_of_nodes()) # # ## Degree # # ### Average, variance and standard deviation # # In[5]: # avgDeg = (2*G.number_of_edges())/(G.number_of_nodes()) # print("Average degree: %.2f" % avgDeg) # deg = [G.degree(n) for n in G.nodes] # var = np.var(deg) # devstd = math.sqrt(var) # print("Variance {:.2f}".format(var)) # print("Standard deviation {:.2f}".format(devstd)) # # ### Linear scale distribution # # In[6]: # # Degree distribution # degrees = sorted([d for n, d in G.degree()], reverse=True) # degreeCount = coll.Counter(degrees) # x, y = zip(*degreeCount.items()) # plt.figure() # plt.plot(x, y, 'go-') # plt.xlabel('Degree') # plt.ylabel('Frequency') # plt.title('Degree Distribution') # plt.title('Degree Distribution with linear scale') # plt.savefig('plots/LinScaleDegreeDistr.png') # plt.show() # # ### Logarithmic scale distribution # # In[7]: # plt.scatter(x, y, s=50, c="green") # plt.xlim(0.9, max(x)) # plt.ylim(0.9, max(y)) # plt.xscale('log') # plt.yscale('log') # plt.xlabel("Degree") # plt.ylabel("Frequency") # plt.title('Degree Distribution with logarithmic scale') # plt.savefig('plots/LogScaleDegreeDistr.png') # # ## Clustering coefficient # # In[8]: # trans= nx.transitivity(G)*100 # # fraction of triadic closures (closed triangles) found in the network # print("Transitivity coefficient of the network: %.2f%%" %trans) # # Clustering coefficient # acc = nx.average_clustering(G) # print ("Average clustering coefficient {:.2f}".format(acc)) # # ## Greatest Connected Component # # In[11]: # numCC = nx.number_connected_components(G) # gcc = max(nx.connected_component_subgraphs(G), key=len) # nodesgcc = gcc.nodes() # edgesgcc = gcc.edges() # nx.write_graphml(gcc, "graphs/GCC.graphml"); # print("Numero di componenti connesse:", numCC) # print("Numero nodi GCC:", len(nodesgcc)) # print("Numero archi GCC:", len(edgesgcc)) # print("Percentuale di nodi sul totale %.2f%%:" %(len(nodesgcc)/len(G.nodes())*100)) # print("Percentuale di archi sul totale %.2f%%:" %(len(edgesgcc)/len(G.edges())*100)) # print("Densità: {:.2f}".format(nx.density(gcc) * 100)) # print("Distanza media: {:.2f}".format(nx.average_shortest_path_length(gcc))) # print('linea 165') # # ### Distanze GCC # # In[13]: # if True: # distDict = {} # #i=1 # row = [] # for n in gcc.nodes(): # nodeDists = nx.single_source_shortest_path_length(gcc,n) # #if i%1000 == 0: # # print(i) # for d in nodeDists: # #if (int(d) in marked): # # continue # if nodeDists[d] in distDict: # distDict[nodeDists[d]] = distDict[nodeDists[d]] + 1 # else: # distDict[nodeDists[d]] = 1 # row.append(nodeDists[d]) # #i += 1 # distDict.pop(0) # print('linea 194') # plt.bar(distDict.keys(), distDict.values(), width=0.3, color='b') # plt.title("Distance Distribution for G") # plt.ylabel("Frequency") # plt.xlabel("Shortest Path Distance") # #plt.savefig('plots/DistDistributionGlobal.png') # plt.show() # print('linea 204') # # ### GCC Eccentricity - Diameter - Radius - Center - Periphery # # In[15]: # #Eccentricity # ecc = nx.eccentricity(gcc) # # Adding eccentricity data to gcc # for k in ecc.keys(): # gcc.node[k]['eccentricity'] = ecc.get(k) # # In[ ]: # diametergcc = nx.diameter(gcc, ecc) # radiusgcc = nx.radius(gcc, ecc) # centergcc = nx.center(gcc, e=ecc) # peripherygcc = nx.periphery(gcc, e=ecc) # print ("Diameter GCC:", diametergcc) # print ("Radius GCC", radiusgcc) # # In[ ]: # print('linea 231') # #Adding data to gcc # nx.set_node_attributes(gcc, 0, 'center') # nx.set_node_attributes(gcc, 0, 'periphery') # for v in range(len(centergcc)): # gcc.node[centergcc[v]]["center"] = 1 # for v in range(len(peripherygcc)): # gcc.node[peripherygcc[v]]["periphery"] = 1 # nx.write_graphml(gcc, "graphs/gccEcc.graphml"); # # ## Distanze # print('linea 248') # Distanza media su tutta la rete if True: distDict = {} #i=1 #marked = set() row = [] for n in G.nodes(): nodeDists = nx.single_source_shortest_path_length(G,n) #if i%1000 == 0: # print(i) for d in nodeDists: #if (int(d) in marked): # continue if nodeDists[d] in distDict: distDict[nodeDists[d]] = distDict[nodeDists[d]] + 1 else: distDict[nodeDists[d]] = 1 row.append(nodeDists[d]) #i += 1 #marked.add(int(n)) avgShortPathG = np.average(row) distDict.pop(0) print("Average Distance {:.2f}".format(avgShortPathG)) plt.bar(distDict.keys(), distDict.values(), width=0.3, color='b') plt.title("Distance Distribution for G") plt.ylabel("Frequency") plt.xlabel("Shortest Path Distance") plt.savefig('plots/DistDistributionGlobal.png') plt.show() # print('linea 285') # #print("Numero componenti connesse:", nx.number_connected_components(G)) # #print("Distanza media:", nx.average_shortest_path_length(G)) # # ## Degree correlation # # In[ ]: # # The following code fragment calculates the dictionary and separates the keys and values into # # two lists my_degree and their_degree: # npDict = nx.average_degree_connectivity(G) # plt.scatter(npDict.keys(), npDict.values(), s=50, c="b",) # plt.xscale('log') # plt.yscale('log') # plt.xlabel("k") # plt.ylabel("$k_{nn}(k)$") # plt.savefig('plots/Assortativity.png') # # ## Communities # # ### 4-Clique Communities # # In[5]: # print('linea 314') print(""" #Nodes: 37702 #Edges: 289004 #Edges if graph was a full mesh: 710701551 Density: 0.04% #Self-loops: 0 #isolates: 0 Average degree: 15.33 Variance 6526.21 Standard deviation 80.78 Transitivity coefficient of the network: 1.24% Average clustering coefficient 0.17 Numero di componenti connesse: 2 Numero nodi GCC: 37700 Numero archi GCC: 289003 Percentuale di nodi sul totale 99.99%: Percentuale di archi sul totale 100.00%: Densità: 0.04 Distanza media: 3.25 linea 165 linea 194 linea 204 Diameter GCC: 11 Radius GCC 6 linea 231 linea 248 Average Distance 3.25 linea 285 linea 314 """) # commK = com.k_clique_communities(G, 4) # NO k-cliques bcos muh memory # print("Clique computed") # lClique = 0 # for i,cl in enumerate(commK): # lClique += 1 # for n in cl: # G.node[n]["kClique"] = i+1 # print("Numero 4-Clique communities: ", lClique) # ### Modularity based communities (Louvain) # In[ ]: # print('linea 332') # part = lou.best_partition(G) # mod = lou.modularity(part,G) # print('linea 368') # part_as_seriesG = pd.Series(part) # print('linea 369') # part_as_seriesG.sort_values() # print('linea 370') # part_as_seriesG.value_counts() # print("Numero Louvain communities: ", part_as_seriesG.value_counts().size) # # In[ ]: # #Saving Communities Attribute # nx.set_node_attributes(G, 0, 'LvnG') # for k in part.keys(): # part[k]+= 1 # for i in part.keys(): # G.node[i]["LvnG"] = part.get(i) # nx.write_graphml(G, "graphs/GComm.graphml"); # # ## Centralities # # In[ ]: # print('linea 397') # dgr = nx.degree_centrality(G) # clo = nx.closeness_centrality(G) # har = nx.harmonic_centrality(G) # eig = nx.eigenvector_centrality(G) # bet = nx.betweenness_centrality(G) # pgr = nx.pagerank(G) # hits = nx.hits(G) # centralities = pd.concat( # [pd.Series(c) for c in (hits[1], eig, pgr, har, clo, hits[0], dgr, bet)], # axis=1) # centralities.columns = ("Authorities", "Eigenvector", "PageRank", # "Harmonic Closeness", "Closeness", "Hubs", # "Degree", "Betweenness") # centralities["Harmonic Closeness"] /= centralities.shape[0] # # Calculate the correlations for each pair of centralities # c_df = centralities.corr() # ll_triangle = np.tri(c_df.shape[0], k=-1) # c_df *= ll_triangle # c_series = c_df.stack().sort_values() # c_series.tail()