UniTO/anno3/avrc/assignments/coff/modello_github/assignment2.py
2024-10-29 09:11:05 +01:00

424 lines
9.3 KiB
Python
Executable file

# #!/usr/bin/env python
# # coding: utf-8
# # # Network Analysis
# # In[2]:
import networkx as nx
from networkx.drawing.nx_agraph import graphviz_layout
import matplotlib as mpl
mpl.use('Agg')
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import dzcnapy_plotlib as dzcnapy
import csv
import math
import collections as coll
from networkx.algorithms import community as com
import community as lou
# Load the dataset: every CSV row is consumed as one edge of the
# undirected graph G (networkx accepts an iterable of edge pairs).
with open("dataset.csv") as infile:
    reader = csv.reader(infile)
    # Build the graph while the file is still open: csv.reader is lazy.
    G = nx.Graph(reader)
# # ## Nodes, Edges, Density
# # In[8]:
# # Nodi, Archi, Densità
# numNodes = G.number_of_nodes()
# numEdges = G.number_of_edges()
# allEdges = int(numNodes * (numNodes-1) / 2)
# density = nx.density(G) * 100
# # constants
# N = numNodes
# M = int(numEdges / numNodes)
# print("#Nodes:", numNodes)
# print("#Edges:", numEdges)
# print("#Edges if graph was a full mesh:", allEdges)
# print("Density: %.2f%%" % density)
# # ## Detecting and removing self-loops
# # In[3]:
# ns = G.number_of_selfloops()
# print("#Self-loops:", ns)
# if ns > 0:
# # removing self-loops
# G.remove_edges_from(G.selfloop_edges())
# print("#Edges without self-loops:", G.number_of_edges())
# # ## Detecting and removing isolates
# # In[4]:
# ni = nx.number_of_isolates(G)
# print("#isolates:", ni)
# if ni > 0:
# # remove isolates
# G.remove_nodes_from(nx.isolates(G))
# print("#Nodes without isolates", G.number_of_nodes())
# # ## Degree
# # ### Average, variance and standard deviation
# # In[5]:
# avgDeg = (2*G.number_of_edges())/(G.number_of_nodes())
# print("Average degree: %.2f" % avgDeg)
# deg = [G.degree(n) for n in G.nodes]
# var = np.var(deg)
# devstd = math.sqrt(var)
# print("Variance {:.2f}".format(var))
# print("Standard deviation {:.2f}".format(devstd))
# # ### Linear scale distribution
# # In[6]:
# # Degree distribution
# degrees = sorted([d for n, d in G.degree()], reverse=True)
# degreeCount = coll.Counter(degrees)
# x, y = zip(*degreeCount.items())
# plt.figure()
# plt.plot(x, y, 'go-')
# plt.xlabel('Degree')
# plt.ylabel('Frequency')
# plt.title('Degree Distribution')
# plt.title('Degree Distribution with linear scale')
# plt.savefig('plots/LinScaleDegreeDistr.png')
# plt.show()
# # ### Logarithmic scale distribution
# # In[7]:
# plt.scatter(x, y, s=50, c="green")
# plt.xlim(0.9, max(x))
# plt.ylim(0.9, max(y))
# plt.xscale('log')
# plt.yscale('log')
# plt.xlabel("Degree")
# plt.ylabel("Frequency")
# plt.title('Degree Distribution with logarithmic scale')
# plt.savefig('plots/LogScaleDegreeDistr.png')
# # ## Clustering coefficient
# # In[8]:
# trans= nx.transitivity(G)*100
# # fraction of triadic closures (closed triangles) found in the network
# print("Transitivity coefficient of the network: %.2f%%" %trans)
# # Clustering coefficient
# acc = nx.average_clustering(G)
# print ("Average clustering coefficient {:.2f}".format(acc))
# # ## Greatest Connected Component
# # In[11]:
# numCC = nx.number_connected_components(G)
# gcc = max(nx.connected_component_subgraphs(G), key=len)
# nodesgcc = gcc.nodes()
# edgesgcc = gcc.edges()
# nx.write_graphml(gcc, "graphs/GCC.graphml");
# print("Numero di componenti connesse:", numCC)
# print("Numero nodi GCC:", len(nodesgcc))
# print("Numero archi GCC:", len(edgesgcc))
# print("Percentuale di nodi sul totale %.2f%%:" %(len(nodesgcc)/len(G.nodes())*100))
# print("Percentuale di archi sul totale %.2f%%:" %(len(edgesgcc)/len(G.edges())*100))
# print("Densità: {:.2f}".format(nx.density(gcc) * 100))
# print("Distanza media: {:.2f}".format(nx.average_shortest_path_length(gcc)))
# print('linea 165')
# # ### Distanze GCC
# # In[13]:
# if True:
# distDict = {}
# #i=1
# row = []
# for n in gcc.nodes():
# nodeDists = nx.single_source_shortest_path_length(gcc,n)
# #if i%1000 == 0:
# # print(i)
# for d in nodeDists:
# #if (int(d) in marked):
# # continue
# if nodeDists[d] in distDict:
# distDict[nodeDists[d]] = distDict[nodeDists[d]] + 1
# else:
# distDict[nodeDists[d]] = 1
# row.append(nodeDists[d])
# #i += 1
# distDict.pop(0)
# print('linea 194')
# plt.bar(distDict.keys(), distDict.values(), width=0.3, color='b')
# plt.title("Distance Distribution for G")
# plt.ylabel("Frequency")
# plt.xlabel("Shortest Path Distance")
# #plt.savefig('plots/DistDistributionGlobal.png')
# plt.show()
# print('linea 204')
# # ### GCC Eccentricity - Diameter - Radius - Center - Periphery
# # In[15]:
# #Eccentricity
# ecc = nx.eccentricity(gcc)
# # Adding eccentricity data to gcc
# for k in ecc.keys():
# gcc.node[k]['eccentricity'] = ecc.get(k)
# # In[ ]:
# diametergcc = nx.diameter(gcc, ecc)
# radiusgcc = nx.radius(gcc, ecc)
# centergcc = nx.center(gcc, e=ecc)
# peripherygcc = nx.periphery(gcc, e=ecc)
# print ("Diameter GCC:", diametergcc)
# print ("Radius GCC", radiusgcc)
# # In[ ]:
# print('linea 231')
# #Adding data to gcc
# nx.set_node_attributes(gcc, 0, 'center')
# nx.set_node_attributes(gcc, 0, 'periphery')
# for v in range(len(centergcc)):
# gcc.node[centergcc[v]]["center"] = 1
# for v in range(len(peripherygcc)):
# gcc.node[peripherygcc[v]]["periphery"] = 1
# nx.write_graphml(gcc, "graphs/gccEcc.graphml");
# # ## Distanze
# print('linea 248')
# Distance distribution and mean shortest-path length over the whole network.
# A single-source BFS is run from every node, so each unordered pair is
# counted twice; the ratio (the mean) is unaffected by the double count.
dist_counts = coll.Counter()  # shortest-path length -> number of (source, target) pairs
for source in G.nodes():
    # Lengths of the shortest paths from `source` to every reachable node.
    dist_counts.update(nx.single_source_shortest_path_length(G, source).values())
# Drop the self-distances (one 0 per node) so they neither bias the mean
# downward nor appear in the histogram.
dist_counts.pop(0, None)
total_pairs = sum(dist_counts.values())
# Weighted mean of the distances; guard against an empty/edgeless graph.
avgShortPathG = (
    sum(d * c for d, c in dist_counts.items()) / total_pairs if total_pairs else 0.0
)
print("Average Distance {:.2f}".format(avgShortPathG))
plt.bar(dist_counts.keys(), dist_counts.values(), width=0.3, color='b')
plt.title("Distance Distribution for G")
plt.ylabel("Frequency")
plt.xlabel("Shortest Path Distance")
plt.savefig('plots/DistDistributionGlobal.png')
plt.show()
# print('linea 285')
# #print("Numero componenti connesse:", nx.number_connected_components(G))
# #print("Distanza media:", nx.average_shortest_path_length(G))
# # ## Degree correlation
# # In[ ]:
# # The following code fragment calculates the dictionary and separates the keys and values into
# # two lists my_degree and their_degree:
# npDict = nx.average_degree_connectivity(G)
# plt.scatter(npDict.keys(), npDict.values(), s=50, c="b",)
# plt.xscale('log')
# plt.yscale('log')
# plt.xlabel("k")
# plt.ylabel("$k_{nn}(k)$")
# plt.savefig('plots/Assortativity.png')
# # ## Communities
# # ### 4-Clique Communities
# # In[5]:
# print('linea 314')
# Metrics pre-computed by the (now commented-out) analysis passes above,
# echoed verbatim so the script still reports them without re-running
# the expensive computations.
cached_report = """
#Nodes: 37702
#Edges: 289004
#Edges if graph was a full mesh: 710701551
Density: 0.04%
#Self-loops: 0
#isolates: 0
Average degree: 15.33
Variance 6526.21
Standard deviation 80.78
Transitivity coefficient of the network: 1.24%
Average clustering coefficient 0.17
Numero di componenti connesse: 2
Numero nodi GCC: 37700
Numero archi GCC: 289003
Percentuale di nodi sul totale 99.99%:
Percentuale di archi sul totale 100.00%:
Densità: 0.04
Distanza media: 3.25
linea 165
linea 194
linea 204
Diameter GCC: 11
Radius GCC 6
linea 231
linea 248
Average Distance 3.25
linea 285
linea 314
"""
print(cached_report)
# commK = com.k_clique_communities(G, 4)
# NO k-cliques bcos muh memory
# print("Clique computed")
# lClique = 0
# for i,cl in enumerate(commK):
# lClique += 1
# for n in cl:
# G.node[n]["kClique"] = i+1
# print("Numero 4-Clique communities: ", lClique)
# ### Modularity based communities (Louvain)
# In[ ]:
# print('linea 332')
# part = lou.best_partition(G)
# mod = lou.modularity(part,G)
# print('linea 368')
# part_as_seriesG = pd.Series(part)
# print('linea 369')
# part_as_seriesG.sort_values()
# print('linea 370')
# part_as_seriesG.value_counts()
# print("Numero Louvain communities: ", part_as_seriesG.value_counts().size)
# # In[ ]:
# #Saving Communities Attribute
# nx.set_node_attributes(G, 0, 'LvnG')
# for k in part.keys():
# part[k]+= 1
# for i in part.keys():
# G.node[i]["LvnG"] = part.get(i)
# nx.write_graphml(G, "graphs/GComm.graphml");
# # ## Centralities
# # In[ ]:
# print('linea 397')
# dgr = nx.degree_centrality(G)
# clo = nx.closeness_centrality(G)
# har = nx.harmonic_centrality(G)
# eig = nx.eigenvector_centrality(G)
# bet = nx.betweenness_centrality(G)
# pgr = nx.pagerank(G)
# hits = nx.hits(G)
# centralities = pd.concat(
# [pd.Series(c) for c in (hits[1], eig, pgr, har, clo, hits[0], dgr, bet)],
# axis=1)
# centralities.columns = ("Authorities", "Eigenvector", "PageRank",
# "Harmonic Closeness", "Closeness", "Hubs",
# "Degree", "Betweenness")
# centralities["Harmonic Closeness"] /= centralities.shape[0]
# # Calculate the correlations for each pair of centralities
# c_df = centralities.corr()
# ll_triangle = np.tri(c_df.shape[0], k=-1)
# c_df *= ll_triangle
# c_series = c_df.stack().sort_values()
# c_series.tail()