# UniTO/anno3/avrc/assignments/dataviz/dataset/make.py
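"""Build the dataviz datasets from the scraped Facebook posts of
matteorenziufficiale and salviniofficial: per-year word lists and
counters, emoji counters, posting-hour histograms, daily post-count
trends and municipality heatmaps."""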
import json
import os
import calendar
from collections import Counter
from IPython import embed  # interactive shell, used to inspect failures
# Create the output directories (idempotent; the bare excepts used to
# swallow every error, exist_ok only ignores an existing directory).
for yr in range(2013, 2020):
    os.makedirs(f'avg_{yr}', exist_ok=True)
    os.makedirs(str(yr), exist_ok=True)
os.makedirs('all', exist_ok=True)
os.makedirs('trends', exist_ok=True)
def to_strtime(time, includeYear):
    # Render a date as 'day-Mon' or 'day-Mon-year'.
    if includeYear:
        return f'{time.day}-{calendar.month_name[time.month][:3]}-{time.year}'
    return f'{time.day}-{calendar.month_name[time.month][:3]}'
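# Example: to_strtime(date(2019, 11, 24), True) -> '24-Nov-2019';
# with includeYear=False it yields '24-Nov'.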
def compute_trend(p1, y1, p2, y2):
    # Write a TSV comparing the daily post counts of two (politician, year)
    # pairs, reading the per-year trend JSONs produced by write_words.
    from datetime import date, timedelta
    max_date = date(2019, 11, 24)  # last day covered by the scraped data
    if y1 == 'all':
        assert y2 == 'all'
        # Renzi's page only has data from 2018 onwards.
        start = date(2018, 1, 1) if 'renzi' in (p1, p2) else date(2013, 1, 1)
        end = max_date
    else:
        mmin = min(y1, y2)
        mmax = max(y1, y2)
        start = date(mmin, 1, 1)
        end = min(date(mmax, 12, 31), max_date)
    delta = end - start
    assert delta.days > 0, f'empty date range {start}..{end}'
    filename = f'trends/{y1}_{p1}_{y2}_{p2}_trends.tsv'
    with open(f'{y1}/{p1}_trend.json', 'r') as f:
        trend1 = json.load(f)
    with open(f'{y2}/{p2}_trend.json', 'r') as f:
        trend2 = json.load(f)
    with open(filename, 'w') as f:
        f.write(f'date\t{p1.capitalize()}-{str(y1)[2:]}\t{p2.capitalize()}-{str(y2)[2:]}\n')
        rows = 0
        for i in range(365):
            rows += 1
            day = start + timedelta(days=i)
            k = to_strtime(day, False)
            v1 = trend1.get(k, 0)
            v2 = trend2.get(k, 0)
            mth = calendar.month_name[day.month][:3]
            # 'all' rows get a dummy year prefix ('70') so all dates line
            # up within a single year when plotted.
            jday = f'70-{mth}-{day.day}' if y1 == 'all' else k
            f.write(f'{jday}\t{v1}\t{v2}\n')
    print('wrote:', rows, 'rows to', filename)
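# Example: compute_trend('renzi', 2018, 'salvini', 2018) reads
# 2018/renzi_trend.json and 2018/salvini_trend.json and writes
# trends/2018_renzi_2018_salvini_trends.tsv, one row per day of 2018.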
def compute_possible_trends():
    # Enumerate every meaningful (politician, year) pairing and build a
    # trend file for each, plus the all-time renzi vs salvini comparison.
    from itertools import repeat, combinations
    r = zip(repeat('renzi'), (2018, 2019))
    s = zip(repeat('salvini'), range(2013, 2020))
    t = tuple(r) + tuple(s)
    poss = list(combinations(t, 2))
    poss.append((('renzi', 'all'), ('salvini', 'all')))
    try:
        for p1, p2 in poss:
            compute_trend(*p1, *p2)
    except Exception:
        embed()  # drop into IPython to inspect the failure
        import sys
        sys.exit()
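# That is C(9, 2) = 36 pairings (2 Renzi years + 7 Salvini years) plus the
# all-time comparison: 37 trend files in total.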
def parse_politico(filename):
    # Load a politician's scraped posts (one JSON object per line), parse
    # the dates, strip unused keys and group the posts per year.
    from dateutil.parser import parse as dateparse
    politico = dict()
    with open(filename, 'r') as f:
        content = [json.loads(l) for l in f.readlines()]
    posts = []
    for j in content:  # parse dates and remove unused keys
        j['time'] = dateparse(j['time'])
        for k in ['text', 'shared_text', 'link', 'post_url', 'shares',
                  'comments', 'likes', 'image', 'post_id']:
            j.pop(k)
        posts.append(j)
    politico['all'] = posts
    # Per-year posts and average post length per year.
    years = set(p['time'].year for p in posts)
    for yr in years:
        yposts = [j for j in posts if j['time'].year == yr]
        politico[yr] = yposts
        avg = sum(len(p['post_text']) for p in yposts) // len(yposts)
        politico['avg_' + str(yr)] = avg
    print('Parsed', filename)
    return politico
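# Shape of the returned dict (values illustrative):
#   {'all': [post, ...], 2013: [post, ...], ..., 'avg_2013': <avg length>}
# where each post keeps its 'post_text' and the parsed 'time'.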
def calcolo_comuni(words):
    # Map every word that names an Italian municipality (or one of a few
    # European capitals of interest) to its coordinate record.
    with open('comuni.json', 'r') as f:
        comuni_content = f.read()
    with open('world_cities.json', 'r') as f:
        world_content = f.read()
    comuniCoord = dict()
    for j in json.loads(comuni_content):
        comuniCoord[j['name'].upper()] = j
    # Italian municipalities loaded; now the world cities (CA/US excluded
    # to avoid homonyms of European cities).
    mondo = dict()
    for j in json.loads(world_content):
        name = j['name'].upper()
        cc = j['country_code']
        if cc not in ['CA', 'US']:
            j['lat'], j['lng'] = j['latitude'], j['longitude']
            mondo[name] = j
    interesse = {'PARIGI': 'PARIS', 'MOSCA': 'MOSCOW', 'BERLINO': 'BERLIN',
                 'LONDRA': 'LONDON', 'BRUXELLES': 'BRUSSELS'}
    europa = {key: mondo[value] for key, value in interesse.items()}
    comuniCoord.update(europa)
    comuni = set(comuniCoord.keys())
    # Drop municipality names that are far more likely to be common words
    # or surnames than places (false positives).
    for falso_positivo in ('ALFANO', 'PAESE', 'FONDO', 'VITA', 'CARDINALE',
                           'PARENTI', 'AMATO', 'BELLA', 'LIBERI', 'BOMBA',
                           'POPOLI', 'MENTANA', 'MONTI', 'CALCI', 'ORA',
                           'DON', 'PREZZO', 'CALCIO', 'MACELLO', 'RUSSI',
                           'PORTE'):
        comuni.remove(falso_positivo)
    visitati = filter(lambda w: w.upper() in comuni, words)
    return {'locations': [comuniCoord[v.upper()] for v in visitati]}
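# Illustrative use: calcolo_comuni(['oggi', 'firenze', 'parigi']) should
# return {'locations': [...]} with the coordinate records for Firenze
# (from comuni.json) and Paris (mapped from 'PARIGI'), assuming both
# appear in the data files.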
def write_words(politico, prefix):
    # For every key of `politico` write the word list, the word and emoji
    # counters, the posting-hour histogram ('sleep'), the daily trend
    # counter and the municipalities heatmap; 'avg_<year>' keys only get
    # the average post length. `prefix` is the filename prefix.
    import string
    import emoji
    punctuations = string.punctuation + '“”’'  # typographic quotes too
    with open('italian_stopwords', 'r') as f:
        personal = [str(i) for i in range(50)] + [
            'tre', 'ps', 'fra', 'va', 'ce', 'due', 'co', 'qui', 'di', 'far',
            'sa', 'cè', 'quattro', 'cinque', 'sei', 'sette', 'otto', 'nove',
            'dieci', 'post', 'd', 'p', 's', 'de', 'ly']
        personal.remove('18')  # '18' stays out of the stopwords on purpose
        stopwords = set([w.strip() for w in f.readlines()] + personal)
    for k, v in politico.items():
        # k is 'all', a year, or 'avg_<year>'; v is the matching list of
        # posts or the average post length.
        words = []
        emojis = []
        sonno = dict()
        # Pre-build the month -> hour-of-day grid (month_name[0] is '',
        # which yields one unused bucket, as in the original layout).
        for month in calendar.month_name:
            sonno[month[:3]] = dict()
            for i in range(24):
                sonno[month[:3]][i] = 0
        filename_words = f'{k}/{prefix}_words'
        filename_counter = f'{k}/{prefix}_counter'
        filename_sleep = f'{k}/{prefix}_sleep'
        filename_emoji = f'{k}/{prefix}_emoji'
        filename_comuni = f'{k}/{prefix}_comuni.html'
        filename_trend = f'{k}/{prefix}_trend.json'
        filename_avg = f'{k}/{prefix}'
        # Average post length: write the single number and move on.
        if str(k).startswith('avg'):
            with open(filename_avg, 'w') as f:
                f.write(str(v))
            print('Wrote', filename_avg)
            continue

        def filter_word(w):
            return w not in stopwords and not w.isdigit() and w != '' and w[0] != ' '

        assert k in list(range(2013, 2020)) + ['all'], k
        trends = list()  # one 'day-Mon[-year]' entry per post
        for j in v:
            text = j['post_text'].replace('\n', ' ')
            for punct in punctuations:
                text = text.replace(punct, ' ').lower()
            words.extend(filter(filter_word, text.split(' ')))
            # emoji.UNICODE_EMOJI is the pre-1.0 emoji-package lookup dict.
            emojis.extend(filter(lambda w: w in emoji.UNICODE_EMOJI, text.split(' ')))
            time = j['time']
            sonno[calendar.month_name[time.month][:3]][time.hour] += 1
            trends.append(to_strtime(time, k == 'all'))
        print('Computed words, sleep and emoji counts')
        with open(filename_words, 'w') as f:
            f.writelines([w + '\n' for w in words])
        print('Wrote', filename_words)
        with open(filename_counter, 'w') as f:
            f.write(json.dumps(Counter(words)))
        print('Wrote', filename_counter)
        with open(filename_emoji, 'w') as f:
            f.write(json.dumps(Counter(emojis)))
        print('Wrote', filename_emoji)
        with open(filename_trend, 'w') as f:
            f.write(json.dumps(Counter(trends)))
        print('Wrote', filename_trend)
        with open(filename_sleep, 'w') as f:
            f.write(json.dumps(sonno))
        print('Wrote', filename_sleep)
        print('Computing municipality locations')
        comuni_visitati = calcolo_comuni(words)
        from make_heatmap import Generator as HeatMapGen
        generator = HeatMapGen()
        assert isinstance(comuni_visitati, dict)
        generator.run(comuni_visitati, filename_comuni)
        print()
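# Output layout, for politician prefix <p> and key <k> in {'all', <year>}:
#   <k>/<p>_words, <k>/<p>_counter, <k>/<p>_emoji, <k>/<p>_sleep,
#   <k>/<p>_trend.json, <k>/<p>_comuni.html, plus avg_<year>/<p>.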
if __name__ == '__main__':
    renzi = parse_politico('matteorenziufficiale.json')
    write_words(renzi, 'renzi')
    salvini = parse_politico('salviniofficial.json')
    write_words(salvini, 'salvini')
    compute_possible_trends()