#!/usr/bin/env python
# UniTO/anno3/avrc/assignments/dataviz/dataset/make.py
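"""Build the datasets for the dataviz assignment.

Parses the scraped Facebook posts of two politicians (one JSON object per
line in matteorenziufficiale.json and salviniofficial.json) and writes, per
year and overall: word lists and counters, emoji counters, posting-hour
histograms ('sleep'), daily posting trends, pairwise trend TSVs and a
municipality heatmap.
"""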
import json
import os
import calendar
from collections import Counter
from IPython import embed  # interactive shell, used below as a debug hook
# Output layout: one '<year>/' and one 'avg_<year>/' directory per year,
# plus 'all/' and 'trends/'.
for yr in range(2013, 2020):
    os.makedirs(f'avg_{yr}', exist_ok=True)
    os.makedirs(str(yr), exist_ok=True)
os.makedirs('all', exist_ok=True)
os.makedirs('trends', exist_ok=True)
def to_strtime(time, includeYear):
    """Format a date(time) as 'D-Mon' or, with includeYear, 'D-Mon-YYYY'."""
    if includeYear:
        return f'{time.day}-{calendar.month_name[time.month][:3]}-{time.year}'
    return f'{time.day}-{calendar.month_name[time.month][:3]}'
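# e.g. to_strtime(date(2019, 11, 24), True)  == '24-Nov-2019'
#      to_strtime(date(2019, 11, 24), False) == '24-Nov'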
def compute_trend(p1, y1, p2, y2):
    """Write a TSV comparing the daily post counts of (p1, y1) and (p2, y2);
    years are ints, or 'all' for the whole covered period."""
    from datetime import date, timedelta
    max_date = date(2019, 11, 24)  # last day covered by the scraped data
    if y1 == 'all':
        assert y2 == 'all'
        # Renzi's page dump only starts in 2018
        start = date(2018, 1, 1) if 'renzi' in (p1, p2) else date(2013, 1, 1)
        end = max_date
    else:
        start = date(min(y1, y2), 1, 1)
        end = min(date(max(y1, y2), 12, 31), max_date)
    delta = end - start
    assert delta.days > 0, embed()  # drop into IPython on an empty range
    filename = f'trends/{y1}_{p1}_{y2}_{p2}_trends.tsv'
    with open(f'{y1}/{p1}_trend.json', 'r') as f:
        trend1 = json.load(f)
    with open(f'{y2}/{p2}_trend.json', 'r') as f:
        trend2 = json.load(f)
    with open(filename, 'w') as f:
        f.write(f'date\t{p1.capitalize()}-{str(y1)[2:]}\t{p2.capitalize()}-{str(y2)[2:]}\n')
        cnt = 0
        for i in range(365):
            cnt += 1
            day = start + timedelta(days=i)
            k = to_strtime(day, False)
            v1 = trend1.get(k, 0)
            v2 = trend2.get(k, 0)
            mth = calendar.month_name[day.month][:3]
            # The dummy year 70 puts every 'all' date on one shared axis.
            jday = f'70-{mth}-{day.day}' if y1 == 'all' else k
            f.write(f'{jday}\t{v1}\t{v2}\n')
    print('Wrote', cnt, 'rows to', filename)
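# Example of the resulting TSV for compute_trend('renzi', 2018, 'salvini', 2013)
# (values illustrative):
#   date      Renzi-18    Salvini-13
#   1-Jan     3           0
#   2-Jan     1           2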
def compute_possible_trends():
    """Generate a trend TSV for every pair of (politician, year) datasets."""
    from itertools import repeat, combinations
    r = zip(repeat('renzi'), (2018, 2019))
    s = zip(repeat('salvini'), range(2013, 2020))
    t = tuple(r) + tuple(s)
    poss = list(combinations(t, 2))
    poss.append((('renzi', 'all'), ('salvini', 'all')))
    try:
        for p1, p2 in poss:
            compute_trend(*p1, *p2)
    except Exception:
        embed()  # inspect the failure interactively before bailing out
        import sys
        sys.exit(1)
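# Two Renzi years and seven Salvini years give C(9, 2) = 36 pairings; with
# the all-time Renzi/Salvini comparison that is 37 TSVs under trends/.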
def parse_politico(filename):
    """Parse a page dump (one JSON post per line) into a dict holding the
    full post list, the posts per year and the average post length per year."""
    from dateutil.parser import parse as dateparse
    politico = dict()
    with open(filename, 'r') as f:
        content = [json.loads(l) for l in f.readlines()]
    posts = []
    for j in content:  # parse dates and drop the keys we never use
        j['time'] = dateparse(j['time'])
        for k in ['text', 'shared_text', 'link', 'post_url', 'shares', 'comments', 'likes', 'image', 'post_id']:
            j.pop(k)
        posts.append(j)
    politico['all'] = posts
    # Per-year posts and average post length per year
    years = set(p['time'].year for p in posts)
    for yr in years:
        yposts = [j for j in posts if j['time'].year == yr]
        politico[yr] = yposts
        avg = int(sum(len(p['post_text']) for p in yposts) / len(yposts))
        politico['avg_' + str(yr)] = avg
    print('Parsed', filename)
    return politico
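# Shape of the returned dict (numbers illustrative):
#   {'all': [post, ...], 2019: [post, ...], ..., 'avg_2019': 127, ...}
# where each post keeps at least 'time' (a datetime) and 'post_text'.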
def calcolo_comuni(words):
    """Map words matching Italian municipality names (plus a few European
    capitals) to their coordinates, for the heatmap."""
    with open('comuni.json', 'r') as f:
        comuni_content = f.read()
    with open('world_cities.json', 'r') as f:
        world_content = f.read()
    comuniCoord = dict()
    for j in json.loads(comuni_content):
        comuniCoord[j['name'].upper()] = j
    # Italian comuni loaded; index world cities too, normalising their
    # coordinate keys to the lat/lng names used for the comuni. US and
    # Canadian cities are skipped to limit name collisions.
    mondo = dict()
    for j in json.loads(world_content):
        name = j['name'].upper()
        cc = j['country_code']
        if cc not in ['CA', 'US']:
            j['lat'], j['lng'] = j['latitude'], j['longitude']
            mondo[name] = j
    # European capitals of interest, keyed by their Italian names
    interesse = {'PARIGI': 'PARIS', 'MOSCA': 'MOSCOW', 'BERLINO': 'BERLIN', 'LONDRA': 'LONDON', 'BRUXELLES': 'BRUSSELS'}
    europa = {key: mondo[value] for key, value in interesse.items()}
    comuniCoord.update(europa)
    comuni = set(comuniCoord.keys())
    # These municipality names are also common Italian words or surnames;
    # drop them to avoid false positives.
    for ambiguo in ('ALFANO', 'PAESE', 'FONDO', 'VITA', 'CARDINALE',
                    'PARENTI', 'AMATO', 'BELLA', 'LIBERI', 'BOMBA',
                    'POPOLI', 'MENTANA', 'MONTI', 'CALCI', 'ORA', 'DON',
                    'PREZZO', 'CALCIO', 'MACELLO', 'RUSSI', 'PORTE'):
        comuni.remove(ambiguo)
    visitati = filter(lambda w: w.upper() in comuni, words)
    return {'locations': [comuniCoord[v.upper()] for v in visitati]}
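# The returned dict feeds make_heatmap's Generator; each location entry is
# the raw record from comuni.json / world_cities.json, which is expected to
# carry 'lat' and 'lng' keys (world cities get them copied above).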
def write_words(politico, prefix):
    """Write one set of output files per key of `politico` (one per year,
    one for 'all', one per 'avg_<year>'); `prefix` is the filename prefix."""
    import string
    import emoji
    # ASCII punctuation plus typographic characters that appear in the posts
    # (assumed here to be curly quotes and an ellipsis).
    punctuations = string.punctuation + '’“”…'
    # Extra stopwords: the numbers 0-49 (except '18', kept as a word of
    # interest), Italian number words and short scraping leftovers.
    personal = [str(i) for i in range(50)] + ['tre', 'ps', 'fra', 'va', 'ce', 'due', 'co', 'qui', 'di', 'far', 'sa', 'cè', 'quattro', 'cinque', 'sei', 'sette', 'otto', 'nove', 'dieci', 'post', 'd', 'p', 's', 'de', 'ly']
    personal.remove('18')
    with open('italian_stopwords', 'r') as f:
        stopwords = set([w.strip() for w in f.readlines()] + personal)
    for k, v in politico.items():
        # k is either 'all' (v = every post), a year (v = that year's posts)
        # or 'avg_<year>' (v = the average post length for that year).
        words = []
        emojis = []
        # Posting-hour histogram: month abbreviation -> hour of day -> count.
        sonno = dict()
        for month in calendar.month_name[1:]:  # entry 0 is the empty string
            sonno[month[:3]] = dict()
            for i in range(24):
                sonno[month[:3]][i] = 0
        filename_words = f'{k}/{prefix}_words'
        filename_counter = f'{k}/{prefix}_counter'
        filename_sleep = f'{k}/{prefix}_sleep'
        filename_emoji = f'{k}/{prefix}_emoji'
        filename_comuni = f'{k}/{prefix}_comuni.html'
        filename_trend = f'{k}/{prefix}_trend.json'
        filename_avg = f'{k}/{prefix}'
        # 'avg_<year>' keys only carry a number: write it and move on.
        if str(k).startswith('avg'):
            with open(filename_avg, 'w') as f:
                f.write(str(v))
            print('Wrote', filename_avg)
            continue
        def filter_word(w):
            return w not in stopwords and not w.isdigit() and w != '' and w[0] != ' '
        trends = list()  # one entry per post: the day it was published
        for j in v:
            text = j['post_text'].replace('\n', ' ').lower()
            for punct in punctuations:
                text = text.replace(punct, ' ')
            words.extend(filter(filter_word, text.split(' ')))
            # emoji.UNICODE_EMOJI is the all-emoji dict of the emoji
            # library before 1.0.
            emojis.extend(filter(lambda w: w in emoji.UNICODE_EMOJI, text.split(' ')))
            assert k in list(range(2013, 2020)) + ['all'], k
            time = j['time']
            sonno[calendar.month_name[time.month][:3]][time.hour] += 1
            trends.append(to_strtime(time, k == 'all'))
        print('Computed words, sleep and emoji counts')
        with open(filename_words, 'w') as f:
            f.writelines([w + '\n' for w in words])
        print('Wrote', filename_words)
        with open(filename_counter, 'w') as f:
            f.write(json.dumps(Counter(words)))
        print('Wrote', filename_counter)
        with open(filename_emoji, 'w') as f:
            f.write(json.dumps(Counter(emojis)))
        print('Wrote', filename_emoji)
        with open(filename_trend, 'w') as f:
            f.write(json.dumps(Counter(trends)))  # posts per day
        print('Wrote', filename_trend)
        with open(filename_sleep, 'w') as f:
            f.write(json.dumps(sonno))
        print('Wrote', filename_sleep)
        print('Computing municipalities')
        comuni_visitati = calcolo_comuni(words)
        from make_heatmap import Generator as HeatMapGen
        generator = HeatMapGen()
        assert isinstance(comuni_visitati, dict)
        generator.run(comuni_visitati, filename_comuni)
        print()
if __name__ == '__main__':
    # One scraped Facebook page dump per politician, one JSON post per line.
    renzi = parse_politico('matteorenziufficiale.json')
    write_words(renzi, 'renzi')
    salvini = parse_politico('salviniofficial.json')
    write_words(salvini, 'salvini')
    compute_possible_trends()
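# Expected inputs next to this script: the two page dumps above, plus
# comuni.json, world_cities.json, italian_stopwords and make_heatmap.py.
# All paths are relative, so run it from this directory (python make.py).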