UniTO/anno3/avrc/assignments/coff/modello_github/assignment2.py

# #!/usr/bin/env python
# # coding: utf-8

# # # Network Analysis

# # In[2]:


import networkx as nx
from networkx.drawing.nx_agraph import graphviz_layout
import matplotlib as mpl
mpl.use('Agg')
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import dzcnapy_plotlib as dzcnapy
import csv
import math
import collections as coll
from networkx.algorithms import community as com
import community as lou

# Load the dataset: every non-empty CSV row is handed to networkx as one edge
# of an undirected graph (assumes two columns per row, source and target --
# TODO confirm against dataset.csv).
# newline="" is how the csv module documentation says files should be opened,
# so that the reader handles line endings itself.
with open("dataset.csv", newline="") as infile:
    csv_reader = csv.reader(infile)
    # csv.reader yields an empty list for a blank line; an empty row is not a
    # valid edge tuple, so such rows are skipped instead of aborting the load.
    # The generator is consumed by the Graph constructor while the file is
    # still open.
    G = nx.Graph(row for row in csv_reader if row)


# # ## Nodes, Edges, Density

# # In[8]:


# # Nodi, Archi, Densità
# numNodes = G.number_of_nodes()
# numEdges = G.number_of_edges()
# allEdges = int(numNodes * (numNodes-1) / 2)
# density = nx.density(G) * 100
    
# # constants
# N = numNodes
# M = int(numEdges / numNodes)

# print("#Nodes:", numNodes)
# print("#Edges:", numEdges)
# print("#Edges if graph was a full mesh:", allEdges)
# print("Density: %.2f%%" % density)


# # ## Detecting and removing self-loops

# # In[3]:


# ns = G.number_of_selfloops()
# print("#Self-loops:", ns)

# if ns > 0:
#     # removing self-loops
#     G.remove_edges_from(G.selfloop_edges())
#     print("#Edges without self-loops:", G.number_of_edges())


# # ## Detecting and removing isolates

# # In[4]:


# ni = nx.number_of_isolates(G)
# print("#isolates:", ni)

# if ni > 0:
#     # remove isolates
#     G.remove_nodes_from(nx.isolates(G))
#     print("#Nodes without isolates", G.number_of_nodes())


# # ## Degree

# # ### Average, variance and standard deviation

# # In[5]:


# avgDeg = (2*G.number_of_edges())/(G.number_of_nodes())
# print("Average degree: %.2f" % avgDeg)

# deg = [G.degree(n) for n in G.nodes]
# var = np.var(deg)
# devstd = math.sqrt(var)

# print("Variance {:.2f}".format(var))
# print("Standard deviation {:.2f}".format(devstd))


# # ### Linear scale distribution

# # In[6]:


# # Degree distribution
# degrees = sorted([d for n, d in G.degree()], reverse=True)
# degreeCount = coll.Counter(degrees)
# x, y = zip(*degreeCount.items())

# plt.figure()  
# plt.plot(x, y, 'go-') 
# plt.xlabel('Degree')
# plt.ylabel('Frequency')
# plt.title('Degree Distribution') 
# plt.title('Degree Distribution with linear scale')
# plt.savefig('plots/LinScaleDegreeDistr.png')
# plt.show()


# # ### Logarithmic scale distribution

# # In[7]:


# plt.scatter(x, y, s=50, c="green")
# plt.xlim(0.9, max(x))
# plt.ylim(0.9, max(y))
# plt.xscale('log')
# plt.yscale('log')
# plt.xlabel("Degree")
# plt.ylabel("Frequency")
# plt.title('Degree Distribution with logarithmic scale') 
# plt.savefig('plots/LogScaleDegreeDistr.png')


# # ## Clustering coefficient

# # In[8]:

# trans= nx.transitivity(G)*100
# # fraction of triadic closures (closed triangles) found in the network
# print("Transitivity coefficient of the network: %.2f%%" %trans)

# # Clustering coefficient
# acc = nx.average_clustering(G)
# print ("Average clustering coefficient {:.2f}".format(acc))


# # ## Greatest Connected Component 

# # In[11]:


# numCC = nx.number_connected_components(G)
# gcc = max(nx.connected_component_subgraphs(G), key=len)

# nodesgcc = gcc.nodes()
# edgesgcc = gcc.edges()
# nx.write_graphml(gcc, "graphs/GCC.graphml");

# print("Numero di componenti connesse:", numCC)
# print("Numero nodi GCC:", len(nodesgcc))
# print("Numero archi GCC:", len(edgesgcc))
# print("Percentuale di nodi sul totale %.2f%%:" %(len(nodesgcc)/len(G.nodes())*100))
# print("Percentuale di archi sul totale %.2f%%:" %(len(edgesgcc)/len(G.edges())*100))
# print("Densità: {:.2f}".format(nx.density(gcc) * 100))
# print("Distanza media: {:.2f}".format(nx.average_shortest_path_length(gcc)))
# print('linea 165')


# # ### Distanze GCC

# # In[13]:


# if True:
#     distDict = {}
#     #i=1
#     row = []
#     for n in gcc.nodes():
#         nodeDists = nx.single_source_shortest_path_length(gcc,n)
#         #if i%1000 == 0:
#         #    print(i)

#         for d in nodeDists:
#             #if (int(d) in marked):
#             #    continue
#             if nodeDists[d] in distDict:
#                 distDict[nodeDists[d]] = distDict[nodeDists[d]] + 1
#             else:
#                 distDict[nodeDists[d]] = 1
#             row.append(nodeDists[d])
#         #i += 1

#     distDict.pop(0)

#     print('linea 194')
#     plt.bar(distDict.keys(), distDict.values(), width=0.3, color='b')
#     plt.title("Distance Distribution for G")
#     plt.ylabel("Frequency")
#     plt.xlabel("Shortest Path Distance")
#     #plt.savefig('plots/DistDistributionGlobal.png')
#     plt.show()


# print('linea 204')
# # ### GCC Eccentricity - Diameter - Radius - Center - Periphery 

# # In[15]:


# #Eccentricity
# ecc = nx.eccentricity(gcc)

# # Adding eccentricity data to gcc
# for k in ecc.keys():
#     gcc.node[k]['eccentricity'] = ecc.get(k)


# # In[ ]:


# diametergcc = nx.diameter(gcc, ecc)
# radiusgcc = nx.radius(gcc, ecc)
# centergcc = nx.center(gcc, e=ecc)
# peripherygcc = nx.periphery(gcc, e=ecc)

# print ("Diameter GCC:", diametergcc)
# print ("Radius GCC", radiusgcc)


# # In[ ]:

# print('linea 231')

# #Adding data to gcc
# nx.set_node_attributes(gcc, 0, 'center')
# nx.set_node_attributes(gcc, 0, 'periphery')

# for v in range(len(centergcc)):
#     gcc.node[centergcc[v]]["center"] = 1

# for v in range(len(peripherygcc)):
#     gcc.node[peripherygcc[v]]["periphery"] = 1
    
# nx.write_graphml(gcc, "graphs/gccEcc.graphml");


# # ## Distanze
# print('linea 248')

# Average distance and shortest-path distance distribution over the whole network.
#
# One single-source shortest-path computation is run per node, and the number
# of ordered (source, target) pairs found at each hop distance is tallied.
# Only the histogram is kept: storing every individual distance would need
# one list entry per reachable pair.
distDict = coll.Counter()
for n in G.nodes():
    nodeDists = nx.single_source_shortest_path_length(G, n)
    distDict.update(nodeDists.values())

# Distance 0 is each node's distance to itself; drop it so it neither shows up
# in the plot nor drags the average down. The default keeps an empty graph
# from raising KeyError.
distDict.pop(0, None)

# Mean over pairs of distinct nodes that are connected by a path; targets that
# cannot be reached from a source are not reported and so do not contribute.
numPairs = sum(distDict.values())
if numPairs:
    avgShortPathG = sum(d * c for d, c in distDict.items()) / numPairs
else:
    # No pair of distinct connected nodes: the average is undefined.
    avgShortPathG = float("nan")

print("Average Distance {:.2f}".format(avgShortPathG))

# Plain sorted lists are passed to matplotlib rather than dict views.
distances = sorted(distDict)
plt.bar(distances, [distDict[d] for d in distances], width=0.3, color='b')
plt.title("Distance Distribution for G")
plt.ylabel("Frequency")
plt.xlabel("Shortest Path Distance")
plt.savefig('plots/DistDistributionGlobal.png')
plt.show()


# print('linea 285')
# #print("Numero componenti connesse:", nx.number_connected_components(G))
# #print("Distanza media:", nx.average_shortest_path_length(G))


# # ## Degree correlation

# # In[ ]:


# # The following code fragment computes, for each degree k, the average degree of the
# # neighbors of degree-k nodes (k_nn(k)) and plots it against k on log-log axes:

# npDict = nx.average_degree_connectivity(G)

# plt.scatter(npDict.keys(), npDict.values(), s=50, c="b",)
# plt.xscale('log')
# plt.yscale('log')
# plt.xlabel("k")
# plt.ylabel("$k_{nn}(k)$")
# plt.savefig('plots/Assortativity.png')


# # ## Communities

# # ### 4-Clique Communities

# # In[5]:

# print('linea 314')

# Hard-coded text that mirrors what the commented-out sections of this script
# print (presumably pasted from an earlier full run -- TODO confirm), replayed
# verbatim so that those sections do not have to be executed again.
_previous_run_output = """
#Nodes: 37702
#Edges: 289004
#Edges if graph was a full mesh: 710701551
Density: 0.04%
#Self-loops: 0
#isolates: 0
Average degree: 15.33
Variance 6526.21
Standard deviation 80.78
Transitivity coefficient of the network: 1.24%
Average clustering coefficient 0.17
Numero di componenti connesse: 2
Numero nodi GCC: 37700
Numero archi GCC: 289003
Percentuale di nodi sul totale 99.99%:
Percentuale di archi sul totale 100.00%:
Densità: 0.04
Distanza media: 3.25
linea 165
linea 194
linea 204
Diameter GCC: 11
Radius GCC 6
linea 231
linea 248
Average Distance 3.25
linea 285
linea 314
"""
print(_previous_run_output)

# commK  = com.k_clique_communities(G, 4)

# k-clique communities are skipped: the computation runs out of memory on this graph

# print("Clique computed")

# lClique = 0
# for i,cl in enumerate(commK):
#     lClique += 1
#     for n in cl:
#         G.node[n]["kClique"] = i+1
        
# print("Numero 4-Clique communities: ", lClique)


# ### Modularity based communities (Louvain)

# In[ ]:

# print('linea 332')

# part = lou.best_partition(G)
# mod = lou.modularity(part,G)

# print('linea 368')
# part_as_seriesG = pd.Series(part)
# print('linea 369')
# part_as_seriesG.sort_values()
# print('linea 370')
# part_as_seriesG.value_counts() 

# print("Numero Louvain communities: ", part_as_seriesG.value_counts().size)


# # In[ ]:


# #Saving Communities Attribute
# nx.set_node_attributes(G, 0, 'LvnG')
# for k in part.keys():
#     part[k]+= 1

# for i in part.keys():
#     G.node[i]["LvnG"] = part.get(i)

# nx.write_graphml(G, "graphs/GComm.graphml");


# # ## Centralities

# # In[ ]:


# print('linea 397')
# dgr = nx.degree_centrality(G)
# clo = nx.closeness_centrality(G)
# har = nx.harmonic_centrality(G)
# eig = nx.eigenvector_centrality(G)
# bet = nx.betweenness_centrality(G)
# pgr = nx.pagerank(G)
# hits = nx.hits(G)

# centralities = pd.concat(
#     [pd.Series(c) for c in (hits[1], eig, pgr, har, clo, hits[0], dgr, bet)],
#     axis=1)

# centralities.columns = ("Authorities", "Eigenvector", "PageRank",
#                         "Harmonic Closeness", "Closeness", "Hubs",
#                         "Degree", "Betweenness")
# centralities["Harmonic Closeness"] /= centralities.shape[0]

# # Calculate the correlations for each pair of centralities
# c_df = centralities.corr()
# ll_triangle = np.tri(c_df.shape[0], k=-1)
# c_df *= ll_triangle
# c_series = c_df.stack().sort_values()
# c_series.tail()
avrc :3 2019-12-05 16:45:57 +01:00			`# #!/usr/bin/env python`
			`# # coding: utf-8`

			`# # # Network Analysis`

			`# # In[2]:`


			`import networkx as nx`
			`from networkx.drawing.nx_agraph import graphviz_layout`
			`import matplotlib as mpl`
			`mpl.use('Agg')`
			`import matplotlib.pyplot as plt`
			`import pandas as pd`
			`import numpy as np`
			`import dzcnapy_plotlib as dzcnapy`
			`import csv`
			`import math`
			`import collections as coll`
			`from networkx.algorithms import community as com`
			`import community as lou`

			`# Importazione dataset`
			`with open("dataset.csv") as infile:`
			`csv_reader = csv.reader(infile)`
			`G = nx.Graph(csv_reader)`


			`# # ## Nodes, Edges, Density`

			`# # In[8]:`


			`# # Nodi, Archi, Densità`
			`# numNodes = G.number_of_nodes()`
			`# numEdges = G.number_of_edges()`
			`# allEdges = int(numNodes * (numNodes-1) / 2)`
			`# density = nx.density(G) * 100`

			`# # constants`
			`# N = numNodes`
			`# M = int(numEdges / numNodes)`

			`# print("#Nodes:", numNodes)`
			`# print("#Edges:", numEdges)`
			`# print("#Edges if graph was a full mesh:", allEdges)`
			`# print("Density: %.2f%%" % density)`


			`# # ## Detecting and removing self-loops`

			`# # In[3]:`


			`# ns = G.number_of_selfloops()`
			`# print("#Self-loops:", ns)`

			`# if ns > 0:`
			`# # removing self-loops`
			`# G.remove_edges_from(G.selfloop_edges())`
			`# print("#Edges without self-loops:", G.number_of_edges())`


			`# # ## Detecting and removing isolates`

			`# # In[4]:`


			`# ni = nx.number_of_isolates(G)`
			`# print("#isolates:", ni)`

			`# if ni > 0:`
			`# # remove isolates`
			`# G.remove_nodes_from(nx.isolates(G))`
			`# print("#Nodes without isolates", G.number_of_nodes())`


			`# # ## Degree`

			`# # ### Average, variance and standard deviation`

			`# # In[5]:`


			`# avgDeg = (2*G.number_of_edges())/(G.number_of_nodes())`
			`# print("Average degree: %.2f" % avgDeg)`

			`# deg = [G.degree(n) for n in G.nodes]`
			`# var = np.var(deg)`
			`# devstd = math.sqrt(var)`

			`# print("Variance {:.2f}".format(var))`
			`# print("Standard deviation {:.2f}".format(devstd))`


			`# # ### Linear scale distribution`

			`# # In[6]:`


			`# # Degree distribution`
			`# degrees = sorted([d for n, d in G.degree()], reverse=True)`
			`# degreeCount = coll.Counter(degrees)`
			`# x, y = zip(*degreeCount.items())`

			`# plt.figure()`
			`# plt.plot(x, y, 'go-')`
			`# plt.xlabel('Degree')`
			`# plt.ylabel('Frequency')`
			`# plt.title('Degree Distribution')`
			`# plt.title('Degree Distribution with linear scale')`
			`# plt.savefig('plots/LinScaleDegreeDistr.png')`
			`# plt.show()`


			`# # ### Logarithmic scale distribution`

			`# # In[7]:`


			`# plt.scatter(x, y, s=50, c="green")`
			`# plt.xlim(0.9, max(x))`
			`# plt.ylim(0.9, max(y))`
			`# plt.xscale('log')`
			`# plt.yscale('log')`
			`# plt.xlabel("Degree")`
			`# plt.ylabel("Frequency")`
			`# plt.title('Degree Distribution with logarithmic scale')`
			`# plt.savefig('plots/LogScaleDegreeDistr.png')`


			`# # ## Clustering coefficient`

			`# # In[8]:`

			`# trans= nx.transitivity(G)*100`
			`# # fraction of triadic closures (closed triangles) found in the network`
			`# print("Transitivity coefficient of the network: %.2f%%" %trans)`

			`# # Clustering coefficient`
			`# acc = nx.average_clustering(G)`
			`# print ("Average clustering coefficient {:.2f}".format(acc))`


			`# # ## Greatest Connected Component`

			`# # In[11]:`



			`# numCC = nx.number_connected_components(G)`
			`# gcc = max(nx.connected_component_subgraphs(G), key=len)`

			`# nodesgcc = gcc.nodes()`
			`# edgesgcc = gcc.edges()`
			`# nx.write_graphml(gcc, "graphs/GCC.graphml");`

			`# print("Numero di componenti connesse:", numCC)`
			`# print("Numero nodi GCC:", len(nodesgcc))`
			`# print("Numero archi GCC:", len(edgesgcc))`
			`# print("Percentuale di nodi sul totale %.2f%%:" %(len(nodesgcc)/len(G.nodes())*100))`
			`# print("Percentuale di archi sul totale %.2f%%:" %(len(edgesgcc)/len(G.edges())*100))`
			`# print("Densità: {:.2f}".format(nx.density(gcc) * 100))`
			`# print("Distanza media: {:.2f}".format(nx.average_shortest_path_length(gcc)))`
			`# print('linea 165')`


			`# # ### Distanze GCC`

			`# # In[13]:`



			`# if True:`
			`# distDict = {}`
			`# #i=1`
			`# row = []`
			`# for n in gcc.nodes():`
			`# nodeDists = nx.single_source_shortest_path_length(gcc,n)`
			`# #if i%1000 == 0:`
			`# # print(i)`

			`# for d in nodeDists:`
			`# #if (int(d) in marked):`
			`# # continue`
			`# if nodeDists[d] in distDict:`
			`# distDict[nodeDists[d]] = distDict[nodeDists[d]] + 1`
			`# else:`
			`# distDict[nodeDists[d]] = 1`
			`# row.append(nodeDists[d])`
			`# #i += 1`

			`# distDict.pop(0)`

			`# print('linea 194')`
			`# plt.bar(distDict.keys(), distDict.values(), width=0.3, color='b')`
			`# plt.title("Distance Distribution for G")`
			`# plt.ylabel("Frequency")`
			`# plt.xlabel("Shortest Path Distance")`
			`# #plt.savefig('plots/DistDistributionGlobal.png')`
			`# plt.show()`


			`# print('linea 204')`
			`# # ### GCC Eccentricity - Diameter - Radius - Center - Periphery`

			`# # In[15]:`


			`# #Eccentricity`
			`# ecc = nx.eccentricity(gcc)`

			`# # Adding eccentricity data to gcc`
			`# for k in ecc.keys():`
			`# gcc.node[k]['eccentricity'] = ecc.get(k)`


			`# # In[ ]:`


			`# diametergcc = nx.diameter(gcc, ecc)`
			`# radiusgcc = nx.radius(gcc, ecc)`
			`# centergcc = nx.center(gcc, e=ecc)`
			`# peripherygcc = nx.periphery(gcc, e=ecc)`

			`# print ("Diameter GCC:", diametergcc)`
			`# print ("Radius GCC", radiusgcc)`


			`# # In[ ]:`

			`# print('linea 231')`

			`# #Adding data to gcc`
			`# nx.set_node_attributes(gcc, 0, 'center')`
			`# nx.set_node_attributes(gcc, 0, 'periphery')`

			`# for v in range(len(centergcc)):`
			`# gcc.node[centergcc[v]]["center"] = 1`

			`# for v in range(len(peripherygcc)):`
			`# gcc.node[peripherygcc[v]]["periphery"] = 1`

			`# nx.write_graphml(gcc, "graphs/gccEcc.graphml");`


			`# # ## Distanze`
			`# print('linea 248')`

			`# Distanza media su tutta la rete`
			`if True:`
			`distDict = {}`
			`#i=1`
			`#marked = set()`
			`row = []`
			`for n in G.nodes():`
			`nodeDists = nx.single_source_shortest_path_length(G,n)`
			`#if i%1000 == 0:`
			`# print(i)`

			`for d in nodeDists:`
			`#if (int(d) in marked):`
			`# continue`
			`if nodeDists[d] in distDict:`
			`distDict[nodeDists[d]] = distDict[nodeDists[d]] + 1`
			`else:`
			`distDict[nodeDists[d]] = 1`
			`row.append(nodeDists[d])`
			`#i += 1`
			`#marked.add(int(n))`

			`avgShortPathG = np.average(row)`
			`distDict.pop(0)`

			`print("Average Distance {:.2f}".format(avgShortPathG))`

			`plt.bar(distDict.keys(), distDict.values(), width=0.3, color='b')`
			`plt.title("Distance Distribution for G")`
			`plt.ylabel("Frequency")`
			`plt.xlabel("Shortest Path Distance")`
			`plt.savefig('plots/DistDistributionGlobal.png')`
			`plt.show()`


			`# print('linea 285')`
			`# #print("Numero componenti connesse:", nx.number_connected_components(G))`
			`# #print("Distanza media:", nx.average_shortest_path_length(G))`


			`# # ## Degree correlation`

			`# # In[ ]:`


			`# # The following code fragment calculates the dictionary and separates the keys and values into`
			`# # two lists my_degree and their_degree:`

			`# npDict = nx.average_degree_connectivity(G)`

			`# plt.scatter(npDict.keys(), npDict.values(), s=50, c="b",)`
			`# plt.xscale('log')`
			`# plt.yscale('log')`
			`# plt.xlabel("k")`
			`# plt.ylabel("$k_{nn}(k)$")`
			`# plt.savefig('plots/Assortativity.png')`


			`# # ## Communities`

			`# # ### 4-Clique Communities`

			`# # In[5]:`

			`# print('linea 314')`

			`print("""`
			`#Nodes: 37702`
			`#Edges: 289004`
			`#Edges if graph was a full mesh: 710701551`
			`Density: 0.04%`
			`#Self-loops: 0`
			`#isolates: 0`
			`Average degree: 15.33`
			`Variance 6526.21`
			`Standard deviation 80.78`
			`Transitivity coefficient of the network: 1.24%`
			`Average clustering coefficient 0.17`
			`Numero di componenti connesse: 2`
			`Numero nodi GCC: 37700`
			`Numero archi GCC: 289003`
			`Percentuale di nodi sul totale 99.99%:`
			`Percentuale di archi sul totale 100.00%:`
			`Densità: 0.04`
			`Distanza media: 3.25`
			`linea 165`
			`linea 194`
			`linea 204`
			`Diameter GCC: 11`
			`Radius GCC 6`
			`linea 231`
			`linea 248`
			`Average Distance 3.25`
			`linea 285`
			`linea 314`
			`""")`

			`# commK = com.k_clique_communities(G, 4)`

			`# NO k-cliques bcos muh memory`

			`# print("Clique computed")`

			`# lClique = 0`
			`# for i,cl in enumerate(commK):`
			`# lClique += 1`
			`# for n in cl:`
			`# G.node[n]["kClique"] = i+1`

			`# print("Numero 4-Clique communities: ", lClique)`


			`# ### Modularity based communities (Louvain)`

			`# In[ ]:`

			`# print('linea 332')`

			`# part = lou.best_partition(G)`
			`# mod = lou.modularity(part,G)`

			`# print('linea 368')`
			`# part_as_seriesG = pd.Series(part)`
			`# print('linea 369')`
			`# part_as_seriesG.sort_values()`
			`# print('linea 370')`
			`# part_as_seriesG.value_counts()`

			`# print("Numero Louvain communities: ", part_as_seriesG.value_counts().size)`


			`# # In[ ]:`


			`# #Saving Communities Attribute`
			`# nx.set_node_attributes(G, 0, 'LvnG')`
			`# for k in part.keys():`
			`# part[k]+= 1`

			`# for i in part.keys():`
			`# G.node[i]["LvnG"] = part.get(i)`

			`# nx.write_graphml(G, "graphs/GComm.graphml");`


			`# # ## Centralities`

			`# # In[ ]:`


			`# print('linea 397')`
			`# dgr = nx.degree_centrality(G)`
			`# clo = nx.closeness_centrality(G)`
			`# har = nx.harmonic_centrality(G)`
			`# eig = nx.eigenvector_centrality(G)`
			`# bet = nx.betweenness_centrality(G)`
			`# pgr = nx.pagerank(G)`
			`# hits = nx.hits(G)`

			`# centralities = pd.concat(`
			`# [pd.Series(c) for c in (hits[1], eig, pgr, har, clo, hits[0], dgr, bet)],`
			`# axis=1)`

			`# centralities.columns = ("Authorities", "Eigenvector", "PageRank",`
			`# "Harmonic Closeness", "Closeness", "Hubs",`
			`# "Degree", "Betweenness")`
			`# centralities["Harmonic Closeness"] /= centralities.shape[0]`

			`# # Calculate the correlations for each pair of centralities`
			`# c_df = centralities.corr()`
			`# ll_triangle = np.tri(c_df.shape[0], k=-1)`
			`# c_df *= ll_triangle`
			`# c_series = c_df.stack().sort_values()`
			`# c_series.tail()`